From 0e769b22dad80e324206485579913ebfae8ca284 Mon Sep 17 00:00:00 2001
From: Packit Service
Date: Dec 09 2020 13:18:16 +0000
Subject: fio-3.19 base


---

diff --git a/.appveyor.yml b/.appveyor.yml
new file mode 100644
index 0000000..bf0978a
--- /dev/null
+++ b/.appveyor.yml
@@ -0,0 +1,32 @@
+clone_depth: 1 # NB: this stops FIO-VERSION-GEN making tag based versions
+
+environment:
+  CYG_MIRROR: http://cygwin.mirror.constant.com
+  CYG_ROOT: C:\cygwin64
+  MAKEFLAGS: -j 2
+  matrix:
+    - platform: x64
+      PACKAGE_ARCH: x86_64
+      CONFIGURE_OPTIONS:
+    - platform: x86
+      PACKAGE_ARCH: i686
+      CONFIGURE_OPTIONS: --build-32bit-win --target-win-ver=xp
+
+install:
+  - '%CYG_ROOT%\setup-x86_64.exe --quiet-mode --no-shortcuts --only-site --site "%CYG_MIRROR%" --packages "mingw64-%PACKAGE_ARCH%-zlib,mingw64-%PACKAGE_ARCH%-CUnit" > NUL'
+  - SET PATH=C:\Python38-x64;%CYG_ROOT%\bin;%PATH% # NB: Changed env variables persist to later sections
+  - python.exe -m pip install scipy
+
+build_script:
+  - 'bash.exe -lc "cd \"${APPVEYOR_BUILD_FOLDER}\" && ./configure --disable-native --extra-cflags=\"-Werror\" ${CONFIGURE_OPTIONS} && make.exe"'
+
+after_build:
+  - cd os\windows && dobuild.cmd %PLATFORM%
+
+test_script:
+  - 'bash.exe -lc "cd \"${APPVEYOR_BUILD_FOLDER}\" && file.exe fio.exe && make.exe test"'
+  - 'bash.exe -lc "cd \"${APPVEYOR_BUILD_FOLDER}\" && python.exe t/run-fio-tests.py --debug"'
+
+artifacts:
+  - path: os\windows\*.msi
+    name: msi
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..b84b0fd
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,31 @@
+*.d
+*.o
+*.exe
+/.depend
+/FIO-VERSION-FILE
+/config-host.h
+/config-host.mak
+/config.log
+/cscope.out
+/fio
+/gfio
+/t/axmap
+/t/fio-btrace2fio
+/t/fio-dedupe
+/t/fio-genzipf
+/t/fio-verify-state
+/t/gen-rand
+/t/ieee754
+t/io_uring
+/t/lfsr-test
+t/memlock
+t/read-to-pipe-async
+/t/stest
+/unittests/unittest
+y.tab.*
+lex.yy.c
+*.un~
+doc/output
+/tags
+/TAGS
+/t/zbd/test-zbd-support.log.*
diff --git a/.travis.yml b/.travis.yml
new file mode 100644
index 0000000..77c31b7
--- /dev/null
+++ b/.travis.yml
@@ -0,0 +1,56 @@
+language: c
+dist: bionic
+os:
+  - linux
+compiler:
+  - clang
+  - gcc
+env:
+  matrix:
+    - BUILD_ARCH="x86"
+    - BUILD_ARCH="x86_64"
+  global:
+    - MAKEFLAGS="-j 2"
+matrix:
+  include:
+    # Default xcode image
+    - os: osx
+      compiler: clang # Workaround travis setting CC=["clang", "gcc"]
+      env: BUILD_ARCH="x86_64"
+    # Latest xcode image (needs periodic updating)
+    - os: osx
+      compiler: clang
+      osx_image: xcode11.2
+      env: BUILD_ARCH="x86_64"
+  exclude:
+    - os: osx
+      compiler: gcc
+    - os: linux
+      compiler: clang
+      env: BUILD_ARCH="x86" # Only do the gcc x86 build to reduce clutter
+before_install:
+  - EXTRA_CFLAGS="-Werror"
+  - if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then
+      pkgs=(libaio-dev libcunit1 libcunit1-dev libgoogle-perftools4 libibverbs-dev libiscsi-dev libnuma-dev librbd-dev librdmacm-dev libz-dev);
+      if [[ "$BUILD_ARCH" == "x86" ]]; then
+        pkgs=("${pkgs[@]/%/:i386}");
+        pkgs+=(gcc-multilib python-scipy);
+        EXTRA_CFLAGS="${EXTRA_CFLAGS} -m32";
+      else
+        pkgs+=(glusterfs-common python-scipy);
+      fi;
+      sudo apt-get -qq update;
+      sudo apt-get install --no-install-recommends -qq -y "${pkgs[@]}";
+    fi;
+  - if [[ "$TRAVIS_OS_NAME" == "osx" ]]; then
+      brew update;
+      brew install cunit;
+      if [[ "$TRAVIS_OSX_IMAGE" == "xcode11.2" ]]; then
+        pip3 install scipy;
+      fi;
+      pip install scipy;
+    fi;
+script:
+  - ./configure --extra-cflags="${EXTRA_CFLAGS}" && make
+  - make test
+  - sudo python3 t/run-fio-tests.py --skip 6 1007 1008 
--debug diff --git a/COPYING b/COPYING new file mode 100644 index 0000000..d159169 --- /dev/null +++ b/COPYING @@ -0,0 +1,339 @@ + GNU GENERAL PUBLIC LICENSE + Version 2, June 1991 + + Copyright (C) 1989, 1991 Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + Everyone is permitted to copy and distribute verbatim copies + of this license document, but changing it is not allowed. + + Preamble + + The licenses for most software are designed to take away your +freedom to share and change it. By contrast, the GNU General Public +License is intended to guarantee your freedom to share and change free +software--to make sure the software is free for all its users. This +General Public License applies to most of the Free Software +Foundation's software and to any other program whose authors commit to +using it. (Some other Free Software Foundation software is covered by +the GNU Lesser General Public License instead.) You can apply it to +your programs, too. + + When we speak of free software, we are referring to freedom, not +price. Our General Public Licenses are designed to make sure that you +have the freedom to distribute copies of free software (and charge for +this service if you wish), that you receive source code or can get it +if you want it, that you can change the software or use pieces of it +in new free programs; and that you know you can do these things. + + To protect your rights, we need to make restrictions that forbid +anyone to deny you these rights or to ask you to surrender the rights. +These restrictions translate to certain responsibilities for you if you +distribute copies of the software, or if you modify it. + + For example, if you distribute copies of such a program, whether +gratis or for a fee, you must give the recipients all the rights that +you have. You must make sure that they, too, receive or can get the +source code. And you must show them these terms so they know their +rights. + + We protect your rights with two steps: (1) copyright the software, and +(2) offer you this license which gives you legal permission to copy, +distribute and/or modify the software. + + Also, for each author's protection and ours, we want to make certain +that everyone understands that there is no warranty for this free +software. If the software is modified by someone else and passed on, we +want its recipients to know that what they have is not the original, so +that any problems introduced by others will not reflect on the original +authors' reputations. + + Finally, any free program is threatened constantly by software +patents. We wish to avoid the danger that redistributors of a free +program will individually obtain patent licenses, in effect making the +program proprietary. To prevent this, we have made it clear that any +patent must be licensed for everyone's free use or not licensed at all. + + The precise terms and conditions for copying, distribution and +modification follow. + + GNU GENERAL PUBLIC LICENSE + TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION + + 0. This License applies to any program or other work which contains +a notice placed by the copyright holder saying it may be distributed +under the terms of this General Public License. 
The "Program", below, +refers to any such program or work, and a "work based on the Program" +means either the Program or any derivative work under copyright law: +that is to say, a work containing the Program or a portion of it, +either verbatim or with modifications and/or translated into another +language. (Hereinafter, translation is included without limitation in +the term "modification".) Each licensee is addressed as "you". + +Activities other than copying, distribution and modification are not +covered by this License; they are outside its scope. The act of +running the Program is not restricted, and the output from the Program +is covered only if its contents constitute a work based on the +Program (independent of having been made by running the Program). +Whether that is true depends on what the Program does. + + 1. You may copy and distribute verbatim copies of the Program's +source code as you receive it, in any medium, provided that you +conspicuously and appropriately publish on each copy an appropriate +copyright notice and disclaimer of warranty; keep intact all the +notices that refer to this License and to the absence of any warranty; +and give any other recipients of the Program a copy of this License +along with the Program. + +You may charge a fee for the physical act of transferring a copy, and +you may at your option offer warranty protection in exchange for a fee. + + 2. You may modify your copy or copies of the Program or any portion +of it, thus forming a work based on the Program, and copy and +distribute such modifications or work under the terms of Section 1 +above, provided that you also meet all of these conditions: + + a) You must cause the modified files to carry prominent notices + stating that you changed the files and the date of any change. + + b) You must cause any work that you distribute or publish, that in + whole or in part contains or is derived from the Program or any + part thereof, to be licensed as a whole at no charge to all third + parties under the terms of this License. + + c) If the modified program normally reads commands interactively + when run, you must cause it, when started running for such + interactive use in the most ordinary way, to print or display an + announcement including an appropriate copyright notice and a + notice that there is no warranty (or else, saying that you provide + a warranty) and that users may redistribute the program under + these conditions, and telling the user how to view a copy of this + License. (Exception: if the Program itself is interactive but + does not normally print such an announcement, your work based on + the Program is not required to print an announcement.) + +These requirements apply to the modified work as a whole. If +identifiable sections of that work are not derived from the Program, +and can be reasonably considered independent and separate works in +themselves, then this License, and its terms, do not apply to those +sections when you distribute them as separate works. But when you +distribute the same sections as part of a whole which is a work based +on the Program, the distribution of the whole must be on the terms of +this License, whose permissions for other licensees extend to the +entire whole, and thus to each and every part regardless of who wrote it. 
+ +Thus, it is not the intent of this section to claim rights or contest +your rights to work written entirely by you; rather, the intent is to +exercise the right to control the distribution of derivative or +collective works based on the Program. + +In addition, mere aggregation of another work not based on the Program +with the Program (or with a work based on the Program) on a volume of +a storage or distribution medium does not bring the other work under +the scope of this License. + + 3. You may copy and distribute the Program (or a work based on it, +under Section 2) in object code or executable form under the terms of +Sections 1 and 2 above provided that you also do one of the following: + + a) Accompany it with the complete corresponding machine-readable + source code, which must be distributed under the terms of Sections + 1 and 2 above on a medium customarily used for software interchange; or, + + b) Accompany it with a written offer, valid for at least three + years, to give any third party, for a charge no more than your + cost of physically performing source distribution, a complete + machine-readable copy of the corresponding source code, to be + distributed under the terms of Sections 1 and 2 above on a medium + customarily used for software interchange; or, + + c) Accompany it with the information you received as to the offer + to distribute corresponding source code. (This alternative is + allowed only for noncommercial distribution and only if you + received the program in object code or executable form with such + an offer, in accord with Subsection b above.) + +The source code for a work means the preferred form of the work for +making modifications to it. For an executable work, complete source +code means all the source code for all modules it contains, plus any +associated interface definition files, plus the scripts used to +control compilation and installation of the executable. However, as a +special exception, the source code distributed need not include +anything that is normally distributed (in either source or binary +form) with the major components (compiler, kernel, and so on) of the +operating system on which the executable runs, unless that component +itself accompanies the executable. + +If distribution of executable or object code is made by offering +access to copy from a designated place, then offering equivalent +access to copy the source code from the same place counts as +distribution of the source code, even though third parties are not +compelled to copy the source along with the object code. + + 4. You may not copy, modify, sublicense, or distribute the Program +except as expressly provided under this License. Any attempt +otherwise to copy, modify, sublicense or distribute the Program is +void, and will automatically terminate your rights under this License. +However, parties who have received copies, or rights, from you under +this License will not have their licenses terminated so long as such +parties remain in full compliance. + + 5. You are not required to accept this License, since you have not +signed it. However, nothing else grants you permission to modify or +distribute the Program or its derivative works. These actions are +prohibited by law if you do not accept this License. Therefore, by +modifying or distributing the Program (or any work based on the +Program), you indicate your acceptance of this License to do so, and +all its terms and conditions for copying, distributing or modifying +the Program or works based on it. + + 6. 
Each time you redistribute the Program (or any work based on the +Program), the recipient automatically receives a license from the +original licensor to copy, distribute or modify the Program subject to +these terms and conditions. You may not impose any further +restrictions on the recipients' exercise of the rights granted herein. +You are not responsible for enforcing compliance by third parties to +this License. + + 7. If, as a consequence of a court judgment or allegation of patent +infringement or for any other reason (not limited to patent issues), +conditions are imposed on you (whether by court order, agreement or +otherwise) that contradict the conditions of this License, they do not +excuse you from the conditions of this License. If you cannot +distribute so as to satisfy simultaneously your obligations under this +License and any other pertinent obligations, then as a consequence you +may not distribute the Program at all. For example, if a patent +license would not permit royalty-free redistribution of the Program by +all those who receive copies directly or indirectly through you, then +the only way you could satisfy both it and this License would be to +refrain entirely from distribution of the Program. + +If any portion of this section is held invalid or unenforceable under +any particular circumstance, the balance of the section is intended to +apply and the section as a whole is intended to apply in other +circumstances. + +It is not the purpose of this section to induce you to infringe any +patents or other property right claims or to contest validity of any +such claims; this section has the sole purpose of protecting the +integrity of the free software distribution system, which is +implemented by public license practices. Many people have made +generous contributions to the wide range of software distributed +through that system in reliance on consistent application of that +system; it is up to the author/donor to decide if he or she is willing +to distribute software through any other system and a licensee cannot +impose that choice. + +This section is intended to make thoroughly clear what is believed to +be a consequence of the rest of this License. + + 8. If the distribution and/or use of the Program is restricted in +certain countries either by patents or by copyrighted interfaces, the +original copyright holder who places the Program under this License +may add an explicit geographical distribution limitation excluding +those countries, so that distribution is permitted only in or among +countries not thus excluded. In such case, this License incorporates +the limitation as if written in the body of this License. + + 9. The Free Software Foundation may publish revised and/or new versions +of the General Public License from time to time. Such new versions will +be similar in spirit to the present version, but may differ in detail to +address new problems or concerns. + +Each version is given a distinguishing version number. If the Program +specifies a version number of this License which applies to it and "any +later version", you have the option of following the terms and conditions +either of that version or of any later version published by the Free +Software Foundation. If the Program does not specify a version number of +this License, you may choose any version ever published by the Free Software +Foundation. + + 10. 
If you wish to incorporate parts of the Program into other free
+programs whose distribution conditions are different, write to the author
+to ask for permission. For software which is copyrighted by the Free
+Software Foundation, write to the Free Software Foundation; we sometimes
+make exceptions for this. Our decision will be guided by the two goals
+of preserving the free status of all derivatives of our free software and
+of promoting the sharing and reuse of software generally.
+
+			    NO WARRANTY
+
+  11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY
+FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. EXCEPT WHEN
+OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES
+PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED
+OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS
+TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE
+PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING,
+REPAIR OR CORRECTION.
+
+  12. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING
+WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR
+REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES,
+INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING
+OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED
+TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY
+YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER
+PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE
+POSSIBILITY OF SUCH DAMAGES.
+
+		     END OF TERMS AND CONDITIONS
+
+	    How to Apply These Terms to Your New Programs
+
+  If you develop a new program, and you want it to be of the greatest
+possible use to the public, the best way to achieve this is to make it
+free software which everyone can redistribute and change under these terms.
+
+  To do so, attach the following notices to the program. It is safest
+to attach them to the start of each source file to most effectively
+convey the exclusion of warranty; and each file should have at least
+the "copyright" line and a pointer to where the full notice is found.
+
+    <one line to give the program's name and a brief idea of what it does.>
+    Copyright (C) <year>  <name of author>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+Also add information on how to contact you by electronic and paper mail.
+
+If the program is interactive, make it output a short notice like this
+when it starts in an interactive mode:
+
+    Gnomovision version 69, Copyright (C) year name of author
+    Gnomovision comes with ABSOLUTELY NO WARRANTY; for details type `show w'.
+    This is free software, and you are welcome to redistribute it
+    under certain conditions; type `show c' for details.
+
+The hypothetical commands `show w' and `show c' should show the appropriate
+parts of the General Public License. Of course, the commands you use may
+be called something other than `show w' and `show c'; they could even be
+mouse-clicks or menu items--whatever suits your program.
+
+You should also get your employer (if you work as a programmer) or your
+school, if any, to sign a "copyright disclaimer" for the program, if
+necessary. Here is a sample; alter the names:
+
+  Yoyodyne, Inc., hereby disclaims all copyright interest in the program
+  `Gnomovision' (which makes passes at compilers) written by James Hacker.
+
+  <signature of Ty Coon>, 1 April 1989
+  Ty Coon, President of Vice
+
+This General Public License does not permit incorporating your program into
+proprietary programs. If your program is a subroutine library, you may
+consider it more useful to permit linking proprietary applications with the
+library. If this is what you want to do, use the GNU Lesser General
+Public License instead of this License.
diff --git a/FIO-VERSION-GEN b/FIO-VERSION-GEN
new file mode 100755
index 0000000..3220aaa
--- /dev/null
+++ b/FIO-VERSION-GEN
@@ -0,0 +1,40 @@
+#!/bin/sh
+
+GVF=FIO-VERSION-FILE
+DEF_VER=fio-3.19
+
+LF='
+'
+
+# First see if there is a version file (included in release tarballs),
+# then try git-describe, then default.
+if test -f version
+then
+	VN=`cat version` || VN="$DEF_VER"
+elif test -d .git -o -f .git &&
+	VN=`git describe --match "fio-[0-9]*" --abbrev=4 HEAD 2>/dev/null` &&
+	case "$VN" in
+	*$LF*) (exit 1) ;;
+	fio-[0-9]*)
+		git update-index -q --refresh
+		test -z "`git diff-index --name-only HEAD --`" ||
+		VN="$VN-dirty" ;;
+	esac
+then
+	VN=$VN
+else
+	VN="$DEF_VER"
+fi
+
+VN=`expr "$VN" : v*'\(.*\)'`
+
+if test -r $GVF
+then
+	VC=`sed -e 's/^FIO_VERSION = //' <$GVF`
+else
+	VC=unset
+fi
+test "$VN" = "$VC" || {
+	echo >&2 "FIO_VERSION = $VN"
+	echo "FIO_VERSION = $VN" >$GVF
+}
diff --git a/GFIO-TODO b/GFIO-TODO
new file mode 100644
index 0000000..885ffcb
--- /dev/null
+++ b/GFIO-TODO
@@ -0,0 +1,52 @@
+In no particular order:
+
+- Ability to save job files. Probably in an extended gfio format,
+  so we can include options/settings outside of a fio job file.
+
+- End view improvements:
+
+  - Cleanup the layout
+  - Add ability to save the results
+  - Add ability to load end-results as well
+  - Add ability to request graphs of whatever graphing options
+    the fio job included.
+  - Add ability to graph completion latencies, percentiles, etc.
+
+- Add ability to edit job options:
+
+  - We need an options view after sending a job, that allows us to
+    visually see what was parsed, make changes, resubmit.
+
+  - Job options are already converted across the network and
+    are available in gfio_client->o for view/edit. We'll need
+    a FIO_NET_CMD_UPDATE_OPTIONS command to send them back,
+    and backend support for updating an existing set of options.
+
+- Add support for printing end results, graphs, etc.
+
+- Improve the auto-start backend functionality; it's quite buggy.
+
+- Ensure that it works on OSX and Windows. We'll need a bit of porting
+  work there.
+
+- Persistent store of preferences set. This will need a per-OS bit as well,
+  using gconf on Linux, registry on Windows, ?? on OSX.
+
+- Ensure that local errors go to our log, instead of being displayed on
+  the console.
+
+- Ensure that the whole connect/send/start button logic is sane. Right
+  now it works when you perform the right sequence, but if you connect
+  and disconnect, things can get confused.
We'll need to improve how
+  we store and send job files. Right now they are in ge->job_files[]
+  and are always emptied on send. Keep them around?
+
+- Commit rate display is not enabled.
+
+- Group status reporting is not enabled.
+
+- Split gfio.c a bit. Add a gfio/ subdirectory, and split it into
+  files based on functionality. It's already ~3000 lines long.
+
+- Attempt to ensure that we work with gtk 2.10 and newer. Right
+  now the required version is ~2.18 (not quite known).
diff --git a/HOWTO b/HOWTO
new file mode 100644
index 0000000..430c7b6
--- /dev/null
+++ b/HOWTO
@@ -0,0 +1,4262 @@
+How fio works
+-------------
+
+The first step in getting fio to simulate a desired I/O workload is writing a
+job file describing that specific setup. A job file may contain any number of
+threads and/or files -- the typical contents of the job file are a *global*
+section defining shared parameters, and one or more job sections describing the
+jobs involved. When run, fio parses this file and sets everything up as
+described. If we break down a job from top to bottom, it contains the following
+basic parameters:
+
+`I/O type`_
+
+		Defines the I/O pattern issued to the file(s). We may only be reading
+		sequentially from the file(s), or we may be writing randomly. Or even
+		mixing reads and writes, sequentially or randomly.
+		Should we be doing buffered I/O, or direct/raw I/O?
+
+`Block size`_
+
+		How large are the chunks we are issuing I/O in? This may be a single
+		value, or it may describe a range of block sizes.
+
+`I/O size`_
+
+		How much data are we going to be reading/writing?
+
+`I/O engine`_
+
+		How do we issue I/O? We could be memory mapping the file, we could be
+		using regular read/write, we could be using splice, async I/O, or even
+		SG (SCSI generic sg).
+
+`I/O depth`_
+
+		If the I/O engine is async, how large a queuing depth do we want to
+		maintain?
+
+
+`Target file/device`_
+
+		How many files are we spreading the workload over?
+
+`Threads, processes and job synchronization`_
+
+		How many threads or processes should we spread this workload over?
+
+The above are the basic parameters defined for a workload; in addition, there
+is a multitude of parameters that modify other aspects of how this job behaves.
+
+
+Command line options
+--------------------
+
+.. option:: --debug=type
+
+	Enable verbose tracing of `type` for various fio actions. May be ``all`` for
+	all types or individual types separated by a comma (e.g. ``--debug=file,mem``
+	will enable file and memory debugging). Currently, additional logging is
+	available for:
+
+	*process*
+			Dump info related to processes.
+	*file*
+			Dump info related to file actions.
+	*io*
+			Dump info related to I/O queuing.
+	*mem*
+			Dump info related to memory allocations.
+	*blktrace*
+			Dump info related to blktrace setup.
+	*verify*
+			Dump info related to I/O verification.
+	*all*
+			Enable all debug options.
+	*random*
+			Dump info related to random offset generation.
+	*parse*
+			Dump info related to option matching and parsing.
+	*diskutil*
+			Dump info related to disk utilization updates.
+	*job:x*
+			Dump info only related to job number x.
+	*mutex*
+			Dump info only related to mutex up/down ops.
+	*profile*
+			Dump info related to profile extensions.
+	*time*
+			Dump info related to internal time keeping.
+	*net*
+			Dump info related to networking connections.
+	*rate*
+			Dump info related to I/O rate switching.
+	*compress*
+			Dump info related to log compress/decompress.
+	*steadystate*
+			Dump info related to steadystate detection.
+	*helperthread*
+			Dump info related to the helper thread.
+	*zbd*
+			Dump info related to support for zoned block devices.
+	*?* or *help*
+			Show available debug options.
+
+.. option:: --parse-only
+
+	Parse options only, don't start any I/O.
+
+.. option:: --merge-blktrace-only
+
+	Merge blktraces only, don't start any I/O.
+
+.. option:: --output=filename
+
+	Write output to file `filename`.
+
+.. option:: --output-format=format
+
+	Set the reporting `format` to `normal`, `terse`, `json`, or `json+`. Multiple
+	formats can be selected, separated by a comma. `terse` is a CSV-based
+	format. `json+` is like `json`, except it adds a full dump of the latency
+	buckets.
+
+.. option:: --bandwidth-log
+
+	Generate aggregate bandwidth logs.
+
+.. option:: --minimal
+
+	Print statistics in a terse, semicolon-delimited format.
+
+.. option:: --append-terse
+
+	Print statistics in selected mode AND terse, semicolon-delimited format.
+	**Deprecated**, use :option:`--output-format` instead to select multiple
+	formats.
+
+.. option:: --terse-version=version
+
+	Set terse `version` output format (default 3; versions 2, 4, and 5 are
+	also accepted).
+
+.. option:: --version
+
+	Print version information and exit.
+
+.. option:: --help
+
+	Print a summary of the command line options and exit.
+
+.. option:: --cpuclock-test
+
+	Perform test and validation of internal CPU clock.
+
+.. option:: --crctest=[test]
+
+	Test the speed of the built-in checksumming functions. If no argument is
+	given, all of them are tested. Alternatively, a comma-separated list can
+	be passed, in which case the given ones are tested.
+
+.. option:: --cmdhelp=command
+
+	Print help information for `command`. May be ``all`` for all commands.
+
+.. option:: --enghelp=[ioengine[,command]]
+
+	List all commands defined by `ioengine`, or print help for `command`
+	defined by `ioengine`. If no `ioengine` is given, list all
+	available ioengines.
+
+.. option:: --showcmd=jobfile
+
+	Convert `jobfile` to a set of command-line options.
+
+.. option:: --readonly
+
+	Turn on safety read-only checks, preventing writes and trims. The
+	``--readonly`` option is an extra safety guard to prevent users from
+	accidentally starting a write or trim workload when that is not desired.
+	Fio will only modify the device under test if
+	`rw=write/randwrite/rw/randrw/trim/randtrim/trimwrite` is given. This
+	safety net can be used as an extra precaution.
+
+.. option:: --eta=when
+
+	Specifies when real-time ETA estimate should be printed. `when` may be
+	`always`, `never` or `auto`. `auto` is the default; it prints ETA
+	when requested if the output is a TTY. `always` disregards the output
+	type and prints ETA when requested. `never` never prints ETA.
+
+.. option:: --eta-interval=time
+
+	By default, fio requests client ETA status roughly every second. With
+	this option, the interval is configurable. Fio imposes a minimum
+	allowed time to avoid flooding the console; intervals of less than
+	250 msec are not supported.
+
+.. option:: --eta-newline=time
+
+	Force a new line for every `time` period passed. When the unit is omitted,
+	the value is interpreted in seconds.
+
+.. option:: --status-interval=time
+
+	Force a full status dump of cumulative (from job start) values at `time`
+	intervals. This option does *not* provide per-period measurements. So
+	values such as bandwidth are running averages. When the time unit is omitted,
+	`time` is interpreted in seconds.
Note that using this option with
+	``--output-format=json`` will yield output that technically isn't valid
+	json, since the output will be collated sets of valid json. It will need
+	to be split into valid sets of json after the run.
+
+.. option:: --section=name
+
+	Only run specified section `name` in job file. Multiple sections can be specified.
+	The ``--section`` option allows one to combine related jobs into one file.
+	E.g. one job file could define light, moderate, and heavy sections. Tell
+	fio to run only the "heavy" section by giving the ``--section=heavy``
+	command line option. One can also specify the "write" operations in one
+	section and the "verify" operation in another section. The ``--section`` option
+	only applies to job sections. The reserved *global* section is always
+	parsed and used.
+
+.. option:: --alloc-size=kb
+
+	Allocate additional internal smalloc pools of size `kb` in KiB. The
+	``--alloc-size`` option increases shared memory set aside for use by fio.
+	If running large jobs with randommap enabled, fio can run out of memory.
+	Smalloc is an internal allocator for shared structures from a fixed size
+	memory pool and can grow to 16 pools. The pool size defaults to 16MiB.
+
+	NOTE: While running, :file:`.fio_smalloc.*` backing store files are visible
+	in :file:`/tmp`.
+
+.. option:: --warnings-fatal
+
+	All fio parser warnings are fatal, causing fio to exit with an
+	error.
+
+.. option:: --max-jobs=nr
+
+	Set the maximum number of threads/processes to support to `nr`.
+	NOTE: On Linux, it may be necessary to increase the shared-memory
+	limit (:file:`/proc/sys/kernel/shmmax`) if fio runs into errors while
+	creating jobs.
+
+.. option:: --server=args
+
+	Start a backend server, with `args` specifying what to listen to.
+	See the `Client/Server`_ section.
+
+.. option:: --daemonize=pidfile
+
+	Background a fio server, writing the pid to the given `pidfile` file.
+
+.. option:: --client=hostname
+
+	Instead of running the jobs locally, send and run them on the given `hostname`
+	or set of `hostname`\s. See the `Client/Server`_ section.
+
+.. option:: --remote-config=file
+
+	Tell the fio server to load this local `file`.
+
+.. option:: --idle-prof=option
+
+	Report CPU idleness. `option` is one of the following:
+
+		**calibrate**
+			Run unit work calibration only and exit.
+
+		**system**
+			Show aggregate system idleness and unit work.
+
+		**percpu**
+			As **system** but also show per CPU idleness.
+
+.. option:: --inflate-log=log
+
+	Inflate and output compressed `log`.
+
+.. option:: --trigger-file=file
+
+	Execute trigger command when `file` exists.
+
+.. option:: --trigger-timeout=time
+
+	Execute trigger at this `time`.
+
+.. option:: --trigger=command
+
+	Set this `command` as local trigger.
+
+.. option:: --trigger-remote=command
+
+	Set this `command` as remote trigger.
+
+.. option:: --aux-path=path
+
+	Use the directory specified by `path` for generated state files instead
+	of the current working directory.
+
+Any parameters following the options will be assumed to be job files, unless
+they match a job file parameter. Multiple job files can be listed and each job
+file will be regarded as a separate group. Fio will :option:`stonewall`
+execution between each group.
+
+
+Job file format
+---------------
+
+As previously described, fio accepts one or more job files describing what it is
+supposed to do. The job file format is the classic ini file, where the names
+enclosed in [] brackets define the job name.
You are free to use any ASCII name
+you want, except *global*, which has special meaning. Following the job name is
+a sequence of zero or more parameters, one per line, that define the behavior of
+the job. If the first character in a line is a ';' or a '#', the entire line is
+discarded as a comment.
+
+A *global* section sets defaults for the jobs described in that file. A job may
+override a *global* section parameter, and a job file may even have several
+*global* sections if so desired. A job is only affected by a *global* section
+residing above it.
+
+The :option:`--cmdhelp` option also lists all options. If used with a `command`
+argument, :option:`--cmdhelp` will detail the given `command`.
+
+See the `examples/` directory for inspiration on how to write job files. Note
+that the copyright and license requirements currently apply to `examples/` files.
+
+So let's look at a really simple job file that defines two processes, each
+randomly reading from a 128MiB file:
+
+.. code-block:: ini
+
+	; -- start job file --
+	[global]
+	rw=randread
+	size=128m
+
+	[job1]
+
+	[job2]
+
+	; -- end job file --
+
+As you can see, the job file sections themselves are empty as all the described
+parameters are shared. As no :option:`filename` option is given, fio makes up a
+`filename` for each of the jobs as it sees fit. On the command line, this job
+would look as follows::
+
+$ fio --name=global --rw=randread --size=128m --name=job1 --name=job2
+
+
+Let's look at an example that has a number of processes writing randomly to
+files:
+
+.. code-block:: ini
+
+	; -- start job file --
+	[random-writers]
+	ioengine=libaio
+	iodepth=4
+	rw=randwrite
+	bs=32k
+	direct=0
+	size=64m
+	numjobs=4
+	; -- end job file --
+
+Here we have no *global* section, as we only have one job defined anyway. We
+want to use async I/O here, with a depth of 4 for each file. We also increase
+the buffer size used to 32KiB and define numjobs to 4 to fork 4 identical
+jobs. The result is 4 processes each randomly writing to their own 64MiB
+file. Instead of using the above job file, you could have given the parameters
+on the command line. For this case, you would specify::
+
+$ fio --name=random-writers --ioengine=libaio --iodepth=4 --rw=randwrite --bs=32k --direct=0 --size=64m --numjobs=4
+
+When fio is utilized as a basis of any reasonably large test suite, it might be
+desirable to share a set of standardized settings across multiple job files.
+Instead of copy/pasting such settings, any section may pull in an external
+:file:`filename.fio` file with the *include filename* directive, as in the
+following example::
+
+	; -- start job file including.fio --
+	[global]
+	filename=/tmp/test
+	filesize=1m
+	include glob-include.fio
+
+	[test]
+	rw=randread
+	bs=4k
+	time_based=1
+	runtime=10
+	include test-include.fio
+	; -- end job file including.fio --
+
+.. code-block:: ini
+
+	; -- start job file glob-include.fio --
+	thread=1
+	group_reporting=1
+	; -- end job file glob-include.fio --
+
+.. code-block:: ini
+
+	; -- start job file test-include.fio --
+	ioengine=libaio
+	iodepth=4
+	; -- end job file test-include.fio --
+
+Settings pulled into a section apply to that section only (except the *global*
+section). Include directives may be nested in that any included file may contain
+further include directive(s). Include files may not contain [] sections.
+
+
+Environment variables
+~~~~~~~~~~~~~~~~~~~~~
+
+Fio also supports environment variable expansion in job files.
Any sub-string of
+the form ``${VARNAME}`` as part of an option value (in other words, on the right
+of the '='), will be expanded to the value of the environment variable called
+`VARNAME`. If no such environment variable is defined, or `VARNAME` is the
+empty string, the empty string will be substituted.
+
+As an example, let's look at a sample fio invocation and job file::
+
+$ SIZE=64m NUMJOBS=4 fio jobfile.fio
+
+.. code-block:: ini
+
+	; -- start job file --
+	[random-writers]
+	rw=randwrite
+	size=${SIZE}
+	numjobs=${NUMJOBS}
+	; -- end job file --
+
+This will expand to the following equivalent job file at runtime:
+
+.. code-block:: ini
+
+	; -- start job file --
+	[random-writers]
+	rw=randwrite
+	size=64m
+	numjobs=4
+	; -- end job file --
+
+Fio ships with a few example job files; you can also look there for inspiration.
+
+Reserved keywords
+~~~~~~~~~~~~~~~~~
+
+Additionally, fio has a set of reserved keywords that will be replaced
+internally with the appropriate value. Those keywords are:
+
+**$pagesize**
+
+	The architecture page size of the running system.
+
+**$mb_memory**
+
+	Megabytes of total memory in the system.
+
+**$ncpus**
+
+	Number of online available CPUs.
+
+These can be used on the command line or in the job file, and will be
+automatically substituted with the current system values when the job is
+run. Simple math is also supported on these keywords, so you can perform actions
+like::
+
+	size=8*$mb_memory
+
+and get that properly expanded to 8 times the size of memory in the machine.
+
+
+Job file parameters
+-------------------
+
+This section describes in detail each parameter associated with a job. Some
+parameters take an option of a given type, such as an integer or a
+string. Anywhere a numeric value is required, an arithmetic expression may be
+used, provided it is surrounded by parentheses. Supported operators are:
+
+	- addition (+)
+	- subtraction (-)
+	- multiplication (*)
+	- division (/)
+	- modulus (%)
+	- exponentiation (^)
+
+For time values in expressions, units are microseconds by default. This is
+different from time values not in expressions (not enclosed in
+parentheses). The following types are used:
+
+
+Parameter types
+~~~~~~~~~~~~~~~
+
+**str**
+	String: a sequence of alphanumeric characters.
+
+**time**
+	Integer with possible time suffix. Without a unit, the value is interpreted
+	as seconds unless otherwise specified. Accepts a suffix of 'd' for days, 'h'
+	for hours, 'm' for minutes, 's' for seconds, 'ms' (or 'msec') for milliseconds
+	and 'us' (or 'usec') for microseconds. For example, use 10m for 10 minutes.
+
+.. _int:
+
+**int**
+	Integer. A whole number value, which may contain an integer prefix
+	and an integer suffix:
+
+	[*integer prefix*] **number** [*integer suffix*]
+
+	The optional *integer prefix* specifies the number's base. The default
+	is decimal. *0x* specifies hexadecimal.
+
+	The optional *integer suffix* specifies the number's units, and includes an
+	optional unit prefix and an optional unit. For quantities of data, the
+	default unit is bytes. For quantities of time, the default unit is seconds
+	unless otherwise specified.
+
+	With :option:`kb_base`\=1000, fio follows international standards for unit
+	prefixes.
To specify power-of-10 decimal values defined in the
+	International System of Units (SI):
+
+	* *K* -- means kilo (K) or 1000
+	* *M* -- means mega (M) or 1000**2
+	* *G* -- means giga (G) or 1000**3
+	* *T* -- means tera (T) or 1000**4
+	* *P* -- means peta (P) or 1000**5
+
+	To specify power-of-2 binary values defined in IEC 80000-13:
+
+	* *Ki* -- means kibi (Ki) or 1024
+	* *Mi* -- means mebi (Mi) or 1024**2
+	* *Gi* -- means gibi (Gi) or 1024**3
+	* *Ti* -- means tebi (Ti) or 1024**4
+	* *Pi* -- means pebi (Pi) or 1024**5
+
+	With :option:`kb_base`\=1024 (the default), the unit prefixes are opposite
+	from those specified in the SI and IEC 80000-13 standards to provide
+	compatibility with old scripts. For example, 4k means 4096.
+
+	For quantities of data, an optional unit of 'B' may be included
+	(e.g., 'kB' is the same as 'k').
+
+	The *integer suffix* is not case sensitive (e.g., m/mi mean mebi/mega,
+	not milli). 'b' and 'B' both mean byte, not bit.
+
+	Examples with :option:`kb_base`\=1000:
+
+	* *4 KiB*: 4096, 4096b, 4096B, 4ki, 4kib, 4kiB, 4Ki, 4KiB
+	* *1 MiB*: 1048576, 1mi, 1024ki
+	* *1 MB*: 1000000, 1m, 1000k
+	* *1 TiB*: 1099511627776, 1ti, 1024gi, 1048576mi
+	* *1 TB*: 1000000000000, 1t, 1000g, 1000000m
+
+	Examples with :option:`kb_base`\=1024 (default):
+
+	* *4 KiB*: 4096, 4096b, 4096B, 4k, 4kb, 4kB, 4K, 4KB
+	* *1 MiB*: 1048576, 1m, 1024k
+	* *1 MB*: 1000000, 1mi, 1000ki
+	* *1 TiB*: 1099511627776, 1t, 1024g, 1048576m
+	* *1 TB*: 1000000000000, 1ti, 1000gi, 1000000mi
+
+	To specify times (units are not case sensitive):
+
+	* *D* -- means days
+	* *H* -- means hours
+	* *M* -- means minutes
+	* *s* -- or sec means seconds (default)
+	* *ms* -- or *msec* means milliseconds
+	* *us* -- or *usec* means microseconds
+
+	If the option accepts an upper and lower range, use a colon ':' or
+	minus '-' to separate such values. See :ref:`irange <irange>`.
+	If the lower value specified happens to be larger than the upper value
+	the two values are swapped.
+
+.. _bool:
+
+**bool**
+	Boolean. Usually parsed as an integer; however, it is only defined for
+	true and false (1 and 0).
+
+.. _irange:
+
+**irange**
+	Integer range with suffix. Allows value range to be given, such as
+	1024-4096. A colon may also be used as the separator, e.g. 1k:4k. If the
+	option allows two sets of ranges, they can be specified with a ',' or '/'
+	delimiter: 1k-4k/8k-32k. Also see :ref:`int <int>`.
+
+**float_list**
+	A list of floating point numbers, separated by a ':' character.
+
+With the above in mind, here follows the complete list of fio job parameters.
+
+
+Units
+~~~~~
+
+.. option:: kb_base=int
+
+	Select the interpretation of unit prefixes in input parameters.
+
+	**1000**
+		Inputs comply with IEC 80000-13 and the International
+		System of Units (SI). Use:
+
+		- power-of-2 values with IEC prefixes (e.g., KiB)
+		- power-of-10 values with SI prefixes (e.g., kB)
+
+	**1024**
+		Compatibility mode (default). To avoid breaking old scripts:
+
+		- power-of-2 values with SI prefixes
+		- power-of-10 values with IEC prefixes
+
+	See :option:`bs` for more details on input parameters.
+
+	Outputs always use correct prefixes. Most outputs include both
+	side-by-side, like::
+
+		bw=2383.3kB/s (2327.4KiB/s)
+
+	If only one value is reported, then kb_base selects the one to use:
+
+	**1000** -- SI prefixes
+
+	**1024** -- IEC prefixes
+
+.. option:: unit_base=int
+
+	Base unit for reporting. Allowed values are:
+
+	**0**
+		Use auto-detection (default).
+	**8**
+		Byte based.
+	**1**
+		Bit based.
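+
+As a quick illustration (a hypothetical job, not one of fio's shipped
+examples, with a made-up job name), the following shows how integer and time
+suffixes are interpreted under the default :option:`kb_base`\=1024
+compatibility mode:
+
+.. code-block:: ini
+
+	; -- hypothetical example, default kb_base=1024 --
+	[suffix-demo]
+	rw=read
+	; bs: 4096 bytes (SI prefix, power-of-2 value in compatibility mode)
+	bs=4k
+	; size: 1048576 bytes
+	size=1m
+	; runtime: a time value, so 'm' means 10 minutes, not megabytes
+	runtime=10m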
+
+
+Job description
+~~~~~~~~~~~~~~~
+
+.. option:: name=str
+
+	ASCII name of the job. This may be used to override the name printed by fio
+	for this job. Otherwise the job name is used. On the command line this
+	parameter has the special purpose of also signaling the start of a new job.
+
+.. option:: description=str
+
+	Text description of the job. Doesn't do anything except dump this text
+	description when this job is run. It's not parsed.
+
+.. option:: loops=int
+
+	Run the specified number of iterations of this job. Used to repeat the same
+	workload a given number of times. Defaults to 1.
+
+.. option:: numjobs=int
+
+	Create the specified number of clones of this job. Each clone of the job
+	is spawned as an independent thread or process. May be used to set up a
+	larger number of threads/processes doing the same thing. Each thread is
+	reported separately; to see statistics for all clones as a whole, use
+	:option:`group_reporting` in conjunction with :option:`new_group`.
+	See :option:`--max-jobs`. Default: 1.
+
+
+Time related parameters
+~~~~~~~~~~~~~~~~~~~~~~~
+
+.. option:: runtime=time
+
+	Tell fio to terminate processing after the specified period of time. It
+	can be quite hard to determine for how long a specified job will run, so
+	this parameter is handy to cap the total runtime to a given time. When
+	the unit is omitted, the value is interpreted in seconds.
+
+.. option:: time_based
+
+	If set, fio will run for the duration of the :option:`runtime` specified
+	even if the file(s) are completely read or written. It will simply loop over
+	the same workload as many times as the :option:`runtime` allows.
+
+.. option:: startdelay=irange(time)
+
+	Delay the start of the job for the specified amount of time. Can be a single
+	value or a range. When given as a range, each thread will choose a value
+	randomly from within the range. Value is in seconds if a unit is omitted.
+
+.. option:: ramp_time=time
+
+	If set, fio will run the specified workload for this amount of time before
+	logging any performance numbers. Useful for letting performance settle
+	before logging results, thus minimizing the runtime required for stable
+	results. Note that the ``ramp_time`` is considered lead in time for a job,
+	thus it will increase the total runtime if a special timeout or
+	:option:`runtime` is specified. When the unit is omitted, the value is
+	given in seconds.
+
+.. option:: clocksource=str
+
+	Use the given clocksource as the base of timing. The supported options are:
+
+		**gettimeofday**
+			:manpage:`gettimeofday(2)`
+
+		**clock_gettime**
+			:manpage:`clock_gettime(2)`
+
+		**cpu**
+			Internal CPU clock source
+
+	cpu is the preferred clocksource if it is reliable, as it is very fast (and
+	fio is heavy on time calls). Fio will automatically use this clocksource if
+	it's supported and considered reliable on the system it is running on,
+	unless another clocksource is specifically set. For x86/x86-64 CPUs, this
+	means supporting an invariant TSC.
+
+.. option:: gtod_reduce=bool
+
+	Enable all of the :manpage:`gettimeofday(2)` reducing options
+	(:option:`disable_clat`, :option:`disable_slat`, :option:`disable_bw_measurement`) plus
+	reduce precision of the timeout somewhat to really shrink the
+	:manpage:`gettimeofday(2)` call count. With this option enabled, we only do
+	about 0.4% of the :manpage:`gettimeofday(2)` calls we would have done if all
+	time keeping was enabled.
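+
+	As a hypothetical sketch (not one of fio's shipped examples, with a
+	made-up job name), a pure throughput test that gives up the extended
+	latency and bandwidth statistics in exchange for fewer time calls
+	might look like:
+
+	.. code-block:: ini
+
+		; -- hypothetical example --
+		[throughput-only]
+		rw=read
+		bs=1m
+		size=1g
+		; drop clat/slat/bw measurements to cut gettimeofday() calls
+		gtod_reduce=1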
+
+.. option:: gtod_cpu=int
+
+	Sometimes it's cheaper to dedicate a single thread of execution to just
+	getting the current time. Fio (and databases, for instance) is very
+	intensive on :manpage:`gettimeofday(2)` calls. With this option, you can set
+	one CPU aside for doing nothing but logging current time to a shared memory
+	location. Then the other threads/processes that run I/O workloads need only
+	copy that segment, instead of entering the kernel with a
+	:manpage:`gettimeofday(2)` call. The CPU set aside for doing these time
+	calls will be excluded from other uses. Fio will manually clear it from the
+	CPU mask of other jobs.
+
+
+Target file/device
+~~~~~~~~~~~~~~~~~~
+
+.. option:: directory=str
+
+	Prefix filenames with this directory. Used to place files in a different
+	location than :file:`./`. You can specify a number of directories by
+	separating the names with a ':' character. These directories will be
+	distributed equally among the job clones created by :option:`numjobs`, as
+	long as they are using generated filenames. If specific `filename(s)` are
+	set, fio will use the first listed directory, thereby matching the
+	`filename` semantic (which generates a file for each clone if not
+	specified, but lets all clones use the same file if set).
+
+	See the :option:`filename` option for information on how to escape "``:``"
+	characters within the directory path itself.
+
+	Note: To control the directory fio will use for internal state files
+	use :option:`--aux-path`.
+
+.. option:: filename=str
+
+	Fio normally makes up a `filename` based on the job name, thread number, and
+	file number (see :option:`filename_format`). If you want to share files
+	between threads in a job or several
+	jobs with fixed file paths, specify a `filename` for each of them to override
+	the default. If the ioengine is file based, you can specify a number of files
+	by separating the names with a ':' character. So if you wanted a job to open
+	:file:`/dev/sda` and :file:`/dev/sdb` as the two working files, you would use
+	``filename=/dev/sda:/dev/sdb``. This also means that whenever this option is
+	specified, :option:`nrfiles` is ignored. The size of regular files specified
+	by this option will be :option:`size` divided by number of files unless an
+	explicit size is specified by :option:`filesize`.
+
+	Each colon in the wanted path must be escaped with a ``\``
+	character. For instance, if the path is :file:`/dev/dsk/foo@3,0:c` then you
+	would use ``filename=/dev/dsk/foo@3,0\:c`` and if the path is
+	:file:`F:\\filename` then you would use ``filename=F\:\filename``.
+
+	On Windows, disk devices are accessed as :file:`\\\\.\\PhysicalDrive0` for
+	the first device, :file:`\\\\.\\PhysicalDrive1` for the second etc.
+	Note: Windows and FreeBSD prevent write access to areas
+	of the disk containing in-use data (e.g. filesystems).
+
+	The filename "`-`" is a reserved name, meaning *stdin* or *stdout*. Which
+	of the two depends on the read/write direction set.
+
+.. option:: filename_format=str
+
+	If sharing multiple files between jobs, it is usually necessary to have fio
+	generate the exact names that you want. By default, fio will name a file
+	based on the default file format specification of
+	:file:`jobname.jobnumber.filenumber`. With this option, that can be
+	customized. Fio will recognize and replace the following keywords in this
+	string:
+
+	**$jobname**
+			The name of the worker thread or process.
+	**$jobnum**
+			The incremental number of the worker thread or process.
+	**$filenum**
+			The incremental number of the file for that worker thread or
+			process.
+
+	To have dependent jobs share a set of files, this option can be set to have
+	fio generate filenames that are shared between the two. For instance, if
+	:file:`testfiles.$filenum` is specified, file number 4 for any job will be
+	named :file:`testfiles.4`. The default of :file:`$jobname.$jobnum.$filenum`
+	will be used if no other format specifier is given.
+
+	If you specify a path then the directories will be created up to the
+	main directory for the file. So for example if you specify
+	``filename_format=a/b/c/$jobnum`` then the directories a/b/c will be
+	created before the file setup part of the job. If you specify
+	:option:`directory` then the path will be relative to that directory,
+	otherwise it is treated as the absolute path.
+
+.. option:: unique_filename=bool
+
+	To avoid collisions between networked clients, fio defaults to prefixing any
+	generated filenames (with a directory specified) with the source of the
+	client connecting. To disable this behavior, set this option to 0.
+
+.. option:: opendir=str
+
+	Recursively open any files below directory `str`.
+
+.. option:: lockfile=str
+
+	Fio defaults to not locking any files before it does I/O to them. If a file
+	or file descriptor is shared, fio can serialize I/O to that file to make the
+	end result consistent. This is useful for emulating real workloads that share
+	files. The lock modes are:
+
+		**none**
+			No locking. The default.
+		**exclusive**
+			Only one thread or process may do I/O at a time, excluding all
+			others.
+		**readwrite**
+			Read-write locking on the file. Many readers may
+			access the file at the same time, but writes get exclusive access.
+
+.. option:: nrfiles=int
+
+	Number of files to use for this job. Defaults to 1. The size of files
+	will be :option:`size` divided by this unless explicit size is specified by
+	:option:`filesize`. Files are created for each thread separately, and each
+	file will have a file number within its name by default, as explained in
+	the :option:`filename` section.
+
+
+.. option:: openfiles=int
+
+	Number of files to keep open at the same time. Defaults to the same as
+	:option:`nrfiles`, can be set smaller to limit the number of simultaneous
+	opens.
+
+.. option:: file_service_type=str
+
+	Defines how fio decides which file from a job to service next. The following
+	types are defined:
+
+		**random**
+			Choose a file at random.
+
+		**roundrobin**
+			Round robin over opened files. This is the default.
+
+		**sequential**
+			Finish one file before moving on to the next. Multiple files can
+			still be open depending on :option:`openfiles`.
+
+		**zipf**
+			Use a *Zipf* distribution to decide what file to access.
+
+		**pareto**
+			Use a *Pareto* distribution to decide what file to access.
+
+		**normal**
+			Use a *Gaussian* (normal) distribution to decide what file to
+			access.
+
+		**gauss**
+			Alias for normal.
+
+	For *random*, *roundrobin*, and *sequential*, a postfix can be appended to
+	tell fio how many I/Os to issue before switching to a new file. For example,
+	specifying ``file_service_type=random:8`` would cause fio to issue
+	8 I/Os before selecting a new file at random. For the non-uniform
+	distributions, a floating point postfix can be given to influence how the
+	distribution is skewed. See :option:`random_distribution` for a description
+	of how that would work.
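+
+	As a hypothetical sketch (not one of fio's shipped examples, with a
+	made-up job name), a job that spreads random reads over 16 files and
+	picks a new file at random after every 8 I/Os could be written as:
+
+	.. code-block:: ini
+
+		; -- hypothetical example --
+		[many-files]
+		rw=randread
+		size=64m
+		nrfiles=16
+		; switch to a randomly chosen file after every 8 I/Os
+		file_service_type=random:8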
+
+.. option:: ioscheduler=str
+
+	Attempt to switch the device hosting the file to the specified I/O scheduler
+	before running.
+
+.. option:: create_serialize=bool
+
+	If true, serialize the file creation for the jobs. This may be handy to
+	avoid interleaving of data files, which may greatly depend on the filesystem
+	used and even the number of processors in the system. Default: true.
+
+.. option:: create_fsync=bool
+
+	:manpage:`fsync(2)` the data file after creation. This is the default.
+
+.. option:: create_on_open=bool
+
+	If true, don't pre-create files but allow the job's open() to create a file
+	when it's time to do I/O. Default: false -- pre-create all necessary files
+	when the job starts.
+
+.. option:: create_only=bool
+
+	If true, fio will only run the setup phase of the job. If files need to be
+	laid out or updated on disk, only that will be done -- the actual job contents
+	are not executed. Default: false.
+
+.. option:: allow_file_create=bool
+
+	If true, fio is permitted to create files as part of its workload. If this
+	option is false, then fio will error out if
+	the files it needs to use don't already exist. Default: true.
+
+.. option:: allow_mounted_write=bool
+
+	If this isn't set, fio will abort jobs that are destructive (e.g. that write)
+	to what appears to be a mounted device or partition. This should help catch
+	cases of inadvertently destructive tests written without realizing that the
+	test will destroy data on the mounted file system. Note that some platforms
+	don't allow writing against a mounted device regardless of this option.
+	Default: false.
+
+.. option:: pre_read=bool
+
+	If this is given, files will be pre-read into memory before starting the
+	given I/O operation. This will also clear the :option:`invalidate` flag,
+	since it is pointless to pre-read and then drop the cache. This will only
+	work for I/O engines that are seek-able, since they allow you to read the
+	same data multiple times. Thus it will not work on non-seekable I/O engines
+	(e.g. network, splice). Default: false.
+
+.. option:: unlink=bool
+
+	Unlink the job files when done. Not the default, as repeated runs of that
+	job would then waste time recreating the file set again and again. Default:
+	false.
+
+.. option:: unlink_each_loop=bool
+
+	Unlink job files after each iteration or loop. Default: false.
+
+.. option:: zonemode=str
+
+	Accepted values are:
+
+		**none**
+			The :option:`zonerange`, :option:`zonesize` and
+			:option:`zoneskip` parameters are ignored.
+		**strided**
+			I/O happens in a single zone until
+			:option:`zonesize` bytes have been transferred.
+			After that number of bytes has been
+			transferred, processing of the next zone
+			starts.
+		**zbd**
+			Zoned block device mode. I/O happens
+			sequentially in each zone, even if random I/O
+			has been selected. Random I/O happens across
+			all zones instead of being restricted to a
+			single zone. The :option:`zoneskip` parameter
+			is ignored. :option:`zonerange` and
+			:option:`zonesize` must be identical.
+
+.. option:: zonerange=int
+
+	Size of a single zone. See also :option:`zonesize` and
+	:option:`zoneskip`.
+
+.. option:: zonesize=int
+
+	For :option:`zonemode` =strided, this is the number of bytes to
+	transfer before skipping :option:`zoneskip` bytes. If this parameter
+	is smaller than :option:`zonerange` then only a fraction of each zone
+	with :option:`zonerange` bytes will be accessed. If this parameter is
+	larger than :option:`zonerange` then each zone will be accessed
+	multiple times before skipping to the next zone.
   For :option:`zonemode` =zbd, this is the size of a single zone. The
   :option:`zonerange` parameter is ignored in this mode.

.. option:: zoneskip=int

   For :option:`zonemode` =strided, the number of bytes to skip after
   :option:`zonesize` bytes of data have been transferred. This parameter
   must be zero for :option:`zonemode` =zbd.

.. option:: read_beyond_wp=bool

   This parameter applies to :option:`zonemode` =zbd only.

   Zoned block devices are block devices that consist of multiple zones.
   Each zone has a type, e.g. conventional or sequential. A conventional
   zone can be written at any offset that is a multiple of the block
   size. Sequential zones must be written sequentially. The position at
   which a write must occur is called the write pointer. A zoned block
   device can be either drive managed, host managed or host aware. For
   host managed devices the host must ensure that writes happen
   sequentially. Fio recognizes host managed devices and serializes
   writes to sequential zones for these devices.

   If a read occurs in a sequential zone beyond the write pointer then
   the zoned block device will complete the read without reading any data
   from the storage medium. Since such reads lead to unrealistically high
   bandwidth and IOPS numbers, fio only reads beyond the write pointer if
   explicitly told to do so. Default: false.

.. option:: max_open_zones=int

   When running a random write test across an entire drive, many more
   zones will be open than in a typical application workload. Hence this
   option, which limits the number of open zones. The number of open
   zones is defined as the number of zones to which write commands are
   issued.

.. option:: zone_reset_threshold=float

   A number between zero and one that indicates the ratio of logical
   blocks with data to the total number of logical blocks in the test
   above which zones should be reset periodically.

.. option:: zone_reset_frequency=float

   A number between zero and one that indicates how often a zone reset
   should be issued if the zone reset threshold has been exceeded. A zone
   reset is submitted after each (1 / zone_reset_frequency) write
   requests. This and the previous parameter can be used to simulate
   garbage collection activity.


I/O type
~~~~~~~~

.. option:: direct=bool

   If value is true, use non-buffered I/O. This is usually O_DIRECT. Note that
   OpenBSD and ZFS on Solaris don't support direct I/O. On Windows the synchronous
   ioengines don't support direct I/O. Default: false.

.. option:: atomic=bool

   If value is true, attempt to use atomic direct I/O. Atomic writes are
   guaranteed to be stable once acknowledged by the operating system. Only
   Linux supports O_ATOMIC right now.

.. option:: buffered=bool

   If value is true, use buffered I/O. This is the opposite of the
   :option:`direct` option. Defaults to true.

.. option:: readwrite=str, rw=str

   Type of I/O pattern. Accepted values are:

   **read**
      Sequential reads.
   **write**
      Sequential writes.
   **trim**
      Sequential trims (Linux block devices and SCSI
      character devices only).
   **randread**
      Random reads.
   **randwrite**
      Random writes.
   **randtrim**
      Random trims (Linux block devices and SCSI
      character devices only).
   **rw,readwrite**
      Sequential mixed reads and writes.
   **randrw**
      Random mixed reads and writes.
   **trimwrite**
      Sequential trim+write sequences. Blocks will be trimmed first,
      then the same blocks will be written to.
   Fio defaults to read if the option is not specified. For the mixed I/O
   types, the default is to split them 50/50. For certain types of I/O the
   result may still be skewed a bit, since the speed may be different.

   It is possible to specify the number of I/Os to do before getting a new
   offset by appending ``:<nr>`` to the end of the string given. For a
   random read, it would look like ``rw=randread:8`` for passing in an offset
   modifier with a value of 8. If the suffix is used with a sequential I/O
   pattern, then the value specified will be **added** to the generated
   offset for each I/O, turning sequential I/O into sequential I/O with holes.
   For instance, using ``rw=write:4k`` will skip 4k for every write. Also see
   the :option:`rw_sequencer` option.

.. option:: rw_sequencer=str

   If an offset modifier is given by appending a number to the ``rw=``
   line, then this option controls how that number modifies the I/O offset
   being generated. Accepted values are:

   **sequential**
      Generate sequential offset.
   **identical**
      Generate the same offset.

   ``sequential`` is only useful for random I/O, where fio would normally
   generate a new random offset for every I/O. If you append e.g. 8 to randread,
   you would get a new random offset for every 8 I/Os. The result would be a
   seek for only every 8 I/Os, instead of for every I/O. Use ``rw=randread:8``
   to specify that. As sequential I/O is already sequential, setting
   ``sequential`` for that would not result in any differences. ``identical``
   behaves in a similar fashion, except it sends the same offset 8 times
   before generating a new offset.

.. option:: unified_rw_reporting=bool

   Fio normally reports statistics on a per data direction basis, meaning that
   reads, writes, and trims are accounted and reported separately. If this
   option is set fio sums the results and reports them as "mixed" instead.

.. option:: randrepeat=bool

   Seed the random number generator used for random I/O patterns in a
   predictable way so the pattern is repeatable across runs. Default: true.

.. option:: allrandrepeat=bool

   Seed all random number generators in a predictable way so results are
   repeatable across runs. Default: false.

.. option:: randseed=int

   Seed the random number generators based on this seed value, to be able to
   control what sequence of output is being generated. If not set, the random
   sequence depends on the :option:`randrepeat` setting.

.. option:: fallocate=str

   Whether pre-allocation is performed when laying down files.
   Accepted values are:

   **none**
      Do not pre-allocate space.

   **native**
      Use a platform's native pre-allocation call but fall back to
      **none** behavior if it fails/is not implemented.

   **posix**
      Pre-allocate via :manpage:`posix_fallocate(3)`.

   **keep**
      Pre-allocate via :manpage:`fallocate(2)` with
      FALLOC_FL_KEEP_SIZE set.

   **truncate**
      Extend file to final size via :manpage:`ftruncate(2)`
      instead of allocating.

   **0**
      Backward-compatible alias for **none**.

   **1**
      Backward-compatible alias for **posix**.

   May not be available on all supported platforms. **keep** is only available
   on Linux. If using ZFS on Solaris this cannot be set to **posix**
   because ZFS doesn't support pre-allocation. Default: **native** if any
   pre-allocation methods except **truncate** are available, **none** if not.

   Note that using **truncate** on Windows will interact surprisingly
   with non-sequential write patterns.
   When writing to a file that has been extended by setting the
   end-of-file information, Windows will backfill the unwritten portion
   of the file up to that offset with zeroes before issuing the new
   write. This means that a single small write to the end of an extended
   file will stall until the entire file has been filled with zeroes.

.. option:: fadvise_hint=str

   Use :manpage:`posix_fadvise(2)` to advise the kernel on what I/O patterns
   are likely to be issued. Accepted values are:

   **0**
      Backwards-compatible hint for "no hint".

   **1**
      Backwards-compatible hint for "advise with fio workload type". This
      uses **FADV_RANDOM** for a random workload, and **FADV_SEQUENTIAL**
      for a sequential workload.

   **sequential**
      Advise using **FADV_SEQUENTIAL**.

   **random**
      Advise using **FADV_RANDOM**.

.. option:: write_hint=str

   Use :manpage:`fcntl(2)` to advise the kernel what life time to expect
   from a write. Only supported on Linux, as of version 4.13. Accepted
   values are:

   **none**
      No particular life time associated with this file.

   **short**
      Data written to this file has a short life time.

   **medium**
      Data written to this file has a medium life time.

   **long**
      Data written to this file has a long life time.

   **extreme**
      Data written to this file has a very long life time.

   The values are all relative to each other, and no absolute meaning
   should be associated with them.

.. option:: offset=int

   Start I/O at the provided offset in the file, given as either a fixed size in
   bytes or a percentage. If a percentage is given, the generated offset will be
   aligned to the minimum ``blocksize`` or to the value of ``offset_align`` if
   provided. Data before the given offset will not be touched. This
   effectively caps the file size at `real_size - offset`. Can be combined with
   :option:`size` to constrain the start and end range of the I/O workload.
   A percentage can be specified by a number between 1 and 100 followed by '%',
   for example, ``offset=20%`` to specify 20%.

.. option:: offset_align=int

   If set to non-zero value, the byte offset generated by a percentage ``offset``
   is aligned upwards to this value. Defaults to 0 meaning that a percentage
   offset is aligned to the minimum block size.

.. option:: offset_increment=int

   If this is provided, then the real offset becomes `offset + offset_increment
   * thread_number`, where the thread number is a counter that starts at 0 and
   is incremented for each sub-job (i.e. when :option:`numjobs` option is
   specified). This option is useful if there are several jobs which are
   intended to operate on a file in parallel disjoint segments, with even
   spacing between the starting points. Percentages can be used for this option.
   If a percentage is given, the generated offset will be aligned to the minimum
   ``blocksize`` or to the value of ``offset_align`` if provided.

.. option:: number_ios=int

   Fio will normally perform I/Os until it has exhausted the size of the region
   set by :option:`size`, or if it exhausts the allocated time (or hits an error
   condition). With this setting, the range/size can be set independently of
   the number of I/Os to perform. When fio reaches this number, it will exit
   normally and report status. Note that this does not extend the amount of I/O
   that will be done, it will only stop fio if this condition is met before
   other end-of-job criteria.
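As a sketch of how the options above combine (the file name and sizes here are
hypothetical), four sub-jobs can write disjoint 1GiB segments of the same
file, each stopping after 1000 I/Os::

   [segmented-write]
   filename=/data/fio.tmp
   filesize=4g
   size=1g
   offset_increment=1g
   numjobs=4
   rw=write
   number_ios=1000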
.. option:: fsync=int

   If writing to a file, issue an :manpage:`fsync(2)` (or its equivalent) of
   the dirty data for every number of blocks given. For example, if you give 32
   as a parameter, fio will sync the file after every 32 writes issued. If fio is
   using non-buffered I/O, we may not sync the file. The exception is the sg
   I/O engine, which synchronizes the disk cache anyway. Defaults to 0, which
   means fio does not periodically issue and wait for a sync to complete. Also
   see :option:`end_fsync` and :option:`fsync_on_close`.

.. option:: fdatasync=int

   Like :option:`fsync` but uses :manpage:`fdatasync(2)` to only sync data and
   not metadata blocks. On Windows, FreeBSD, DragonFlyBSD or OSX there is no
   :manpage:`fdatasync(2)` so this falls back to using :manpage:`fsync(2)`.
   Defaults to 0, which means fio does not periodically issue and wait for a
   data-only sync to complete.

.. option:: write_barrier=int

   Make every `N-th` write a barrier write.

.. option:: sync_file_range=str:int

   Use :manpage:`sync_file_range(2)` for every `int` number of write
   operations. Fio will track the range of writes that have happened since the
   last :manpage:`sync_file_range(2)` call. `str` can currently be one or more of:

   **wait_before**
      SYNC_FILE_RANGE_WAIT_BEFORE
   **write**
      SYNC_FILE_RANGE_WRITE
   **wait_after**
      SYNC_FILE_RANGE_WAIT_AFTER

   So if you do ``sync_file_range=wait_before,write:8``, fio would use
   ``SYNC_FILE_RANGE_WAIT_BEFORE | SYNC_FILE_RANGE_WRITE`` for every 8
   writes. Also see the :manpage:`sync_file_range(2)` man page. This option is
   Linux specific.

.. option:: overwrite=bool

   If true, writes to a file will always overwrite existing data. If the file
   doesn't already exist, it will be created before the write phase begins. If
   the file exists and is large enough for the specified write phase, nothing
   will be done. Default: false.

.. option:: end_fsync=bool

   If true, :manpage:`fsync(2)` file contents when a write stage has completed.
   Default: false.

.. option:: fsync_on_close=bool

   If true, fio will :manpage:`fsync(2)` a dirty file on close. This differs
   from :option:`end_fsync` in that it will happen on every file close, not
   just at the end of the job. Default: false.

.. option:: rwmixread=int

   Percentage of a mixed workload that should be reads. Default: 50.

.. option:: rwmixwrite=int

   Percentage of a mixed workload that should be writes. If both
   :option:`rwmixread` and :option:`rwmixwrite` are given and the values do not
   add up to 100%, the latter of the two will be used to override the
   first. This may interfere with a given rate setting, if fio is asked to
   limit reads or writes to a certain rate. If that is the case, then the
   distribution may be skewed. Default: 50.

.. option:: random_distribution=str:float[,str:float][,str:float]

   By default, fio will use a completely uniform random distribution when asked
   to perform random I/O. Sometimes it is useful to skew the distribution in
   specific ways, ensuring that some parts of the data are hotter than others.
   Fio includes the following distribution models:

   **random**
      Uniform random distribution

   **zipf**
      Zipf distribution

   **pareto**
      Pareto distribution

   **normal**
      Normal (Gaussian) distribution

   **zoned**
      Zoned random distribution

   **zoned_abs**
      Zone absolute random distribution

   When using a **zipf** or **pareto** distribution, an input value is also
   needed to define the access pattern.
   For **zipf**, this is the `Zipf theta`. For **pareto**, it's the `Pareto
   power`. Fio includes a test program, :command:`fio-genzipf`, that can be
   used to visualize what the given input values will yield in terms of hit
   rates. If you wanted to use **zipf** with a `theta` of 1.2, you would use
   ``random_distribution=zipf:1.2`` as the option. If a non-uniform model is
   used, fio will disable use of the random map. For the **normal**
   distribution, a normal (Gaussian) deviation is supplied as a value between
   0 and 100.

   For a **zoned** distribution, fio supports specifying percentages of I/O
   access that should fall within what range of the file or device. For
   example, given a criteria of:

   * 60% of accesses should be to the first 10%
   * 30% of accesses should be to the next 20%
   * 8% of accesses should be to the next 30%
   * 2% of accesses should be to the next 40%

   we can define that through zoning of the random accesses. For the above
   example, the user would do::

      random_distribution=zoned:60/10:30/20:8/30:2/40

   A **zoned_abs** distribution works exactly like the **zoned**, except
   that it takes absolute sizes. For example, let's say you wanted to
   define access according to the following criteria:

   * 60% of accesses should be to the first 20G
   * 30% of accesses should be to the next 100G
   * 10% of accesses should be to the next 500G

   we can define an absolute zoning distribution with::

      random_distribution=zoned_abs:60/20G:30/100G:10/500G

   For both **zoned** and **zoned_abs**, fio supports defining up to
   256 separate zones.

   Similarly to how :option:`bssplit` works for setting ranges and
   percentages of block sizes, it's possible to specify separate zones for
   reads, writes, and trims. If just one set is given, it'll apply to all
   of them. This goes for both **zoned** and **zoned_abs** distributions.

.. option:: percentage_random=int[,int][,int]

   For a random workload, set how big a percentage should be random. This
   defaults to 100%, in which case the workload is fully random. It can be set
   anywhere from 0 to 100. Setting it to 0 would make the workload fully
   sequential. Any setting in between will result in a random mix of sequential
   and random I/O, at the given percentages. Comma-separated values may be
   specified for reads, writes, and trims as described in :option:`blocksize`.

.. option:: norandommap

   Normally fio will cover every block of the file when doing random I/O. If
   this option is given, fio will just get a new random offset without looking
   at past I/O history. This means that some blocks may not be read or written,
   and that some blocks may be read/written more than once. If this option is
   used with :option:`verify` and multiple blocksizes (via :option:`bsrange`),
   only intact blocks are verified, i.e., partially-overwritten blocks are
   ignored. With an async I/O engine and an I/O depth > 1, it is possible for
   the same block to be overwritten, which can cause verification errors. Either
   do not use norandommap in this case, or also use the lfsr random generator,
   as in the sketch below.

.. option:: softrandommap=bool

   See :option:`norandommap`. If fio runs with the random block map enabled and
   it fails to allocate the map, fio will continue without a random block map
   if this option is set. As coverage will not be as complete as with random
   maps, this option is disabled by default.
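As a hedged sketch of the :option:`norandommap` caveat above (the file name
here is hypothetical), pairing it with the lfsr generator keeps verification
safe at higher queue depths::

   [randwrite-verify]
   filename=/data/fio.tmp
   rw=randwrite
   ioengine=libaio
   iodepth=16
   direct=1
   norandommap
   random_generator=lfsr
   verify=crc32c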
.. option:: random_generator=str

   Fio supports the following engines for generating I/O offsets for random I/O:

   **tausworthe**
      Strong 2^88 cycle random number generator.
   **lfsr**
      Linear feedback shift register generator.
   **tausworthe64**
      Strong 64-bit 2^258 cycle random number generator.

   **tausworthe** is a strong random number generator, but it requires tracking
   on the side if we want to ensure that blocks are only read or written
   once. **lfsr** guarantees that we never generate the same offset twice, and
   it's also less computationally expensive. It's not a true random generator,
   but for I/O purposes it's typically good enough. **lfsr** only
   works with single block sizes, not with workloads that use multiple block
   sizes. If used with such a workload, fio may read or write some blocks
   multiple times. The default value is **tausworthe**, unless the required
   space exceeds 2^32 blocks. If it does, then **tausworthe64** is
   selected automatically.


Block size
~~~~~~~~~~

.. option:: blocksize=int[,int][,int], bs=int[,int][,int]

   The block size in bytes used for I/O units. Default: 4096. A single value
   applies to reads, writes, and trims. Comma-separated values may be
   specified for reads, writes, and trims. A value not terminated in a comma
   applies to subsequent types.

   Examples:

   **bs=256k**
      means 256k for reads, writes and trims.

   **bs=8k,32k**
      means 8k for reads, 32k for writes and trims.

   **bs=8k,32k,**
      means 8k for reads, 32k for writes, and default for trims.

   **bs=,8k**
      means default for reads, 8k for writes and trims.

   **bs=,8k,**
      means default for reads, 8k for writes, and default for trims.

.. option:: blocksize_range=irange[,irange][,irange], bsrange=irange[,irange][,irange]

   A range of block sizes in bytes for I/O units. The issued I/O unit will
   always be a multiple of the minimum size, unless
   :option:`blocksize_unaligned` is set.

   Comma-separated ranges may be specified for reads, writes, and trims as
   described in :option:`blocksize`.

   Example: ``bsrange=1k-4k,2k-8k``.

.. option:: bssplit=str[,str][,str]

   Sometimes you want even finer grained control of the block sizes
   issued, not just an even split between them. This option allows you to
   weight various block sizes, so that you are able to define a specific
   amount of block sizes issued. The format for this option is::

      bssplit=blocksize/percentage:blocksize/percentage

   for as many block sizes as needed. So if you want to define a workload
   that has 50% 64k blocks, 10% 4k blocks, and 40% 32k blocks, you would
   write::

      bssplit=4k/10:64k/50:32k/40

   Ordering does not matter. If the percentage is left blank, fio will
   fill in the remaining values evenly. So a bssplit option like this one::

      bssplit=4k/50:1k/:32k/

   would have 50% 4k ios, and 25% 1k and 32k ios. The percentages always
   have to add up to 100; if bssplit is given a range that adds up to more,
   it will error out.

   Comma-separated values may be specified for reads, writes, and trims as
   described in :option:`blocksize`.

   If you want a workload that has 50% 2k reads and 50% 4k reads, while
   having 90% 4k writes and 10% 8k writes, you would specify::

      bssplit=2k/50:4k/50,4k/90:8k/10

   Fio supports defining up to 64 different weights for each data
   direction.
.. option:: blocksize_unaligned, bs_unaligned

   If set, fio will issue I/O units with any size within
   :option:`blocksize_range`, not just multiples of the minimum size. This
   typically won't work with direct I/O, as that normally requires sector
   alignment.

.. option:: bs_is_seq_rand=bool

   If this option is set, fio will use the normal read,write blocksize settings
   as sequential,random blocksize settings instead. Any random read or write
   will use the WRITE blocksize settings, and any sequential read or write will
   use the READ blocksize settings.

.. option:: blockalign=int[,int][,int], ba=int[,int][,int]

   Boundary to which fio will align random I/O units. Default:
   :option:`blocksize`. Minimum alignment is typically 512b for using direct
   I/O, though it usually depends on the hardware block size. This option is
   mutually exclusive with using a random map for files, so it will turn off
   that option. Comma-separated values may be specified for reads, writes, and
   trims as described in :option:`blocksize`.


Buffers and memory
~~~~~~~~~~~~~~~~~~

.. option:: zero_buffers

   Initialize buffers with all zeros. Default: fill buffers with random data.

.. option:: refill_buffers

   If this option is given, fio will refill the I/O buffers on every
   submit. Only makes sense if :option:`zero_buffers` isn't specified,
   naturally. Defaults to being unset, i.e., the buffer is only filled at
   init time and the data in it is reused when possible, but if any of
   :option:`verify`, :option:`buffer_compress_percentage` or
   :option:`dedupe_percentage` are enabled then `refill_buffers` is also
   automatically enabled.

.. option:: scramble_buffers=bool

   If :option:`refill_buffers` is too costly and the target is using data
   deduplication, then setting this option will slightly modify the I/O buffer
   contents to defeat normal de-dupe attempts. This is not enough to defeat
   more clever block compression attempts, but it will stop naive dedupe of
   blocks. Default: true.

.. option:: buffer_compress_percentage=int

   If this is set, then fio will attempt to provide I/O buffer content
   (on WRITEs) that compresses to the specified level. Fio does this by
   providing a mix of random data followed by fixed pattern data. The
   fixed pattern is either zeros, or the pattern specified by
   :option:`buffer_pattern`. If the `buffer_pattern` option is used, it
   might skew the compression ratio slightly. Setting
   `buffer_compress_percentage` to a value other than 100 will also
   enable :option:`refill_buffers` in order to reduce the likelihood that
   adjacent blocks are so similar that they over compress when seen
   together. See :option:`buffer_compress_chunk` for how to set a finer or
   coarser granularity for the random/fixed data region. Defaults to unset
   i.e., buffer data will not adhere to any compression level.

.. option:: buffer_compress_chunk=int

   This setting allows fio to manage how big the random/fixed data region
   is when using :option:`buffer_compress_percentage`. When
   `buffer_compress_chunk` is set to some non-zero value smaller than the
   block size, fio can repeat the random/fixed region throughout the I/O
   buffer at the specified interval (which is particularly useful when
   bigger block sizes are used for a job). When set to 0, fio will use a
   chunk size that matches the block size, resulting in a single
   random/fixed region within the I/O buffer. Defaults to 512. When the
   unit is omitted, the value is interpreted in bytes.
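For instance, a hedged sketch of a write job whose buffers should compress to
roughly 50%, with the random/fixed pattern repeating every 4k inside each 64k
block (the file name is hypothetical)::

   [compressible-write]
   filename=/data/fio.tmp
   rw=write
   bs=64k
   buffer_compress_percentage=50
   buffer_compress_chunk=4k

..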
option:: buffer_pattern=str + + If set, fio will fill the I/O buffers with this pattern or with the contents + of a file. If not set, the contents of I/O buffers are defined by the other + options related to buffer contents. The setting can be any pattern of bytes, + and can be prefixed with 0x for hex values. It may also be a string, where + the string must then be wrapped with ``""``. Or it may also be a filename, + where the filename must be wrapped with ``''`` in which case the file is + opened and read. Note that not all the file contents will be read if that + would cause the buffers to overflow. So, for example:: + + buffer_pattern='filename' + + or:: + + buffer_pattern="abcd" + + or:: + + buffer_pattern=-12 + + or:: + + buffer_pattern=0xdeadface + + Also you can combine everything together in any order:: + + buffer_pattern=0xdeadface"abcd"-12'filename' + +.. option:: dedupe_percentage=int + + If set, fio will generate this percentage of identical buffers when + writing. These buffers will be naturally dedupable. The contents of the + buffers depend on what other buffer compression settings have been set. It's + possible to have the individual buffers either fully compressible, or not at + all -- this option only controls the distribution of unique buffers. Setting + this option will also enable :option:`refill_buffers` to prevent every buffer + being identical. + +.. option:: invalidate=bool + + Invalidate the buffer/page cache parts of the files to be used prior to + starting I/O if the platform and file type support it. Defaults to true. + This will be ignored if :option:`pre_read` is also specified for the + same job. + +.. option:: sync=bool + + Use synchronous I/O for buffered writes. For the majority of I/O engines, + this means using O_SYNC. Default: false. + +.. option:: iomem=str, mem=str + + Fio can use various types of memory as the I/O unit buffer. The allowed + values are: + + **malloc** + Use memory from :manpage:`malloc(3)` as the buffers. Default memory + type. + + **shm** + Use shared memory as the buffers. Allocated through + :manpage:`shmget(2)`. + + **shmhuge** + Same as shm, but use huge pages as backing. + + **mmap** + Use :manpage:`mmap(2)` to allocate buffers. May either be anonymous memory, or can + be file backed if a filename is given after the option. The format + is `mem=mmap:/path/to/file`. + + **mmaphuge** + Use a memory mapped huge file as the buffer backing. Append filename + after mmaphuge, ala `mem=mmaphuge:/hugetlbfs/file`. + + **mmapshared** + Same as mmap, but use a MMAP_SHARED mapping. + + **cudamalloc** + Use GPU memory as the buffers for GPUDirect RDMA benchmark. + The :option:`ioengine` must be `rdma`. + + The area allocated is a function of the maximum allowed bs size for the job, + multiplied by the I/O depth given. Note that for **shmhuge** and + **mmaphuge** to work, the system must have free huge pages allocated. This + can normally be checked and set by reading/writing + :file:`/proc/sys/vm/nr_hugepages` on a Linux system. Fio assumes a huge page + is 4MiB in size. So to calculate the number of huge pages you need for a + given job file, add up the I/O depth of all jobs (normally one unless + :option:`iodepth` is used) and multiply by the maximum bs set. Then divide + that number by the huge page size. You can see the size of the huge pages in + :file:`/proc/meminfo`. If no huge pages are allocated by having a non-zero + number in `nr_hugepages`, using **mmaphuge** or **shmhuge** will fail. Also + see :option:`hugepage-size`. 
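   For example, under the documented assumption of a 4MiB huge page size, a
   single job using ``mem=shmhuge`` with ``iodepth=16`` and a maximum
   ``bs=256k`` needs 16 * 256KiB = 4MiB of buffer space, i.e. at least one
   huge page::

      # one 4MiB huge page for the job above; systems with
      # 2MiB huge pages need proportionally more
      echo 1 > /proc/sys/vm/nr_hugepages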
   **mmaphuge** also needs to have hugetlbfs mounted, and the file location
   should point there. So if it's mounted in :file:`/huge`, you would use
   `mem=mmaphuge:/huge/somefile`.

.. option:: iomem_align=int, mem_align=int

   This indicates the memory alignment of the I/O memory buffers. Note that
   the given alignment is applied to the first I/O unit buffer; if using
   :option:`iodepth` the alignment of the following buffers is given by the
   :option:`bs` used. In other words, if using a :option:`bs` that is a
   multiple of the page size of the system, all buffers will be aligned to
   this value. If using a :option:`bs` that is not page aligned, the alignment
   of subsequent I/O memory buffers is the sum of the :option:`iomem_align` and
   :option:`bs` used.

.. option:: hugepage-size=int

   Defines the size of a huge page. Must at least be equal to the system
   setting, see :file:`/proc/meminfo`. Defaults to 4MiB. Should probably
   always be a multiple of megabytes, so using ``hugepage-size=Xm`` is the
   preferred way to set this to avoid setting a non-pow-2 bad value.

.. option:: lockmem=int

   Pin the specified amount of memory with :manpage:`mlock(2)`. Can be used to
   simulate a smaller amount of memory. The amount specified is per worker.


I/O size
~~~~~~~~

.. option:: size=int

   The total size of file I/O for each thread of this job. Fio will run until
   this many bytes have been transferred, unless runtime is limited by other
   options (such as :option:`runtime`, for instance, or increased/decreased by
   :option:`io_size`). Fio will divide this size between the available files
   determined by options such as :option:`nrfiles` and :option:`filename`,
   unless :option:`filesize` is specified by the job. If the result of division
   happens to be 0, the size is set to the physical size of the given files or
   devices if they exist.
   If this option is not specified, fio will use the full size of the given
   files or devices. If the files do not exist, size must be given. It is also
   possible to give size as a percentage between 1 and 100. If ``size=20%`` is
   given, fio will use 20% of the full size of the given files or devices.
   Can be combined with :option:`offset` to constrain the start and end range
   that I/O will be done within.

.. option:: io_size=int, io_limit=int

   Normally fio operates within the region set by :option:`size`, which means
   that the :option:`size` option sets both the region and size of I/O to be
   performed. Sometimes that is not what you want. With this option, it is
   possible to define just the amount of I/O that fio should do. For instance,
   if :option:`size` is set to 20GiB and :option:`io_size` is set to 5GiB, fio
   will perform I/O within the first 20GiB but exit when 5GiB have been
   done. The opposite is also possible -- if :option:`size` is set to 20GiB,
   and :option:`io_size` is set to 40GiB, then fio will do 40GiB of I/O within
   the 0..20GiB region.

.. option:: filesize=irange(int)

   Individual file sizes. May be a range, in which case fio will select sizes
   for files at random within the given range, limited to :option:`size` in
   total (if that is given). If not given, each created file is the same size.
   This option overrides :option:`size` in terms of file size, which means
   this value is used as a fixed size or possible range of each file.

.. option:: file_append=bool

   Perform I/O after the end of the file. Normally fio will operate within the
   size of a file.
If this option is set, then fio will append to the file + instead. This has identical behavior to setting :option:`offset` to the size + of a file. This option is ignored on non-regular files. + +.. option:: fill_device=bool, fill_fs=bool + + Sets size to something really large and waits for ENOSPC (no space left on + device) as the terminating condition. Only makes sense with sequential + write. For a read workload, the mount point will be filled first then I/O + started on the result. This option doesn't make sense if operating on a raw + device node, since the size of that is already known by the file system. + Additionally, writing beyond end-of-device will not return ENOSPC there. + + +I/O engine +~~~~~~~~~~ + +.. option:: ioengine=str + + Defines how the job issues I/O to the file. The following types are defined: + + **sync** + Basic :manpage:`read(2)` or :manpage:`write(2)` + I/O. :manpage:`lseek(2)` is used to position the I/O location. + See :option:`fsync` and :option:`fdatasync` for syncing write I/Os. + + **psync** + Basic :manpage:`pread(2)` or :manpage:`pwrite(2)` I/O. Default on + all supported operating systems except for Windows. + + **vsync** + Basic :manpage:`readv(2)` or :manpage:`writev(2)` I/O. Will emulate + queuing by coalescing adjacent I/Os into a single submission. + + **pvsync** + Basic :manpage:`preadv(2)` or :manpage:`pwritev(2)` I/O. + + **pvsync2** + Basic :manpage:`preadv2(2)` or :manpage:`pwritev2(2)` I/O. + + **io_uring** + Fast Linux native asynchronous I/O. Supports async IO + for both direct and buffered IO. + This engine defines engine specific options. + + **libaio** + Linux native asynchronous I/O. Note that Linux may only support + queued behavior with non-buffered I/O (set ``direct=1`` or + ``buffered=0``). + This engine defines engine specific options. + + **posixaio** + POSIX asynchronous I/O using :manpage:`aio_read(3)` and + :manpage:`aio_write(3)`. + + **solarisaio** + Solaris native asynchronous I/O. + + **windowsaio** + Windows native asynchronous I/O. Default on Windows. + + **mmap** + File is memory mapped with :manpage:`mmap(2)` and data copied + to/from using :manpage:`memcpy(3)`. + + **splice** + :manpage:`splice(2)` is used to transfer the data and + :manpage:`vmsplice(2)` to transfer data from user space to the + kernel. + + **sg** + SCSI generic sg v3 I/O. May either be synchronous using the SG_IO + ioctl, or if the target is an sg character device we use + :manpage:`read(2)` and :manpage:`write(2)` for asynchronous + I/O. Requires :option:`filename` option to specify either block or + character devices. This engine supports trim operations. + The sg engine includes engine specific options. + + **null** + Doesn't transfer any data, just pretends to. This is mainly used to + exercise fio itself and for debugging/testing purposes. + + **net** + Transfer over the network to given ``host:port``. Depending on the + :option:`protocol` used, the :option:`hostname`, :option:`port`, + :option:`listen` and :option:`filename` options are used to specify + what sort of connection to make, while the :option:`protocol` option + determines which protocol will be used. This engine defines engine + specific options. + + **netsplice** + Like **net**, but uses :manpage:`splice(2)` and + :manpage:`vmsplice(2)` to map data and send/receive. + This engine defines engine specific options. + + **cpuio** + Doesn't transfer any data, but burns CPU cycles according to the + :option:`cpuload` and :option:`cpuchunks` options. 
Setting + :option:`cpuload`\=85 will cause that job to do nothing but burn 85% + of the CPU. In case of SMP machines, use :option:`numjobs`\= + to get desired CPU usage, as the cpuload only loads a + single CPU at the desired rate. A job never finishes unless there is + at least one non-cpuio job. + + **guasi** + The GUASI I/O engine is the Generic Userspace Asynchronous Syscall + Interface approach to async I/O. See + + http://www.xmailserver.org/guasi-lib.html + + for more info on GUASI. + + **rdma** + The RDMA I/O engine supports both RDMA memory semantics + (RDMA_WRITE/RDMA_READ) and channel semantics (Send/Recv) for the + InfiniBand, RoCE and iWARP protocols. This engine defines engine + specific options. + + **falloc** + I/O engine that does regular fallocate to simulate data transfer as + fio ioengine. + + DDIR_READ + does fallocate(,mode = FALLOC_FL_KEEP_SIZE,). + + DDIR_WRITE + does fallocate(,mode = 0). + + DDIR_TRIM + does fallocate(,mode = FALLOC_FL_KEEP_SIZE|FALLOC_FL_PUNCH_HOLE). + + **ftruncate** + I/O engine that sends :manpage:`ftruncate(2)` operations in response + to write (DDIR_WRITE) events. Each ftruncate issued sets the file's + size to the current block offset. :option:`blocksize` is ignored. + + **e4defrag** + I/O engine that does regular EXT4_IOC_MOVE_EXT ioctls to simulate + defragment activity in request to DDIR_WRITE event. + + **rados** + I/O engine supporting direct access to Ceph Reliable Autonomic + Distributed Object Store (RADOS) via librados. This ioengine + defines engine specific options. + + **rbd** + I/O engine supporting direct access to Ceph Rados Block Devices + (RBD) via librbd without the need to use the kernel rbd driver. This + ioengine defines engine specific options. + + **http** + I/O engine supporting GET/PUT requests over HTTP(S) with libcurl to + a WebDAV or S3 endpoint. This ioengine defines engine specific options. + + This engine only supports direct IO of iodepth=1; you need to scale this + via numjobs. blocksize defines the size of the objects to be created. + + TRIM is translated to object deletion. + + **gfapi** + Using GlusterFS libgfapi sync interface to direct access to + GlusterFS volumes without having to go through FUSE. This ioengine + defines engine specific options. + + **gfapi_async** + Using GlusterFS libgfapi async interface to direct access to + GlusterFS volumes without having to go through FUSE. This ioengine + defines engine specific options. + + **libhdfs** + Read and write through Hadoop (HDFS). The :option:`filename` option + is used to specify host,port of the hdfs name-node to connect. This + engine interprets offsets a little differently. In HDFS, files once + created cannot be modified so random writes are not possible. To + imitate this the libhdfs engine expects a bunch of small files to be + created over HDFS and will randomly pick a file from them + based on the offset generated by fio backend (see the example + job file to create such files, use ``rw=write`` option). Please + note, it may be necessary to set environment variables to work + with HDFS/libhdfs properly. Each job uses its own connection to + HDFS. + + **mtd** + Read, write and erase an MTD character device (e.g., + :file:`/dev/mtd0`). Discards are treated as erases. Depending on the + underlying device type, the I/O may have to go in a certain pattern, + e.g., on NAND, writing sequentially to erase blocks and discarding + before overwriting. The `trimwrite` mode works well for this + constraint. 
   **pmemblk**
      Read and write using filesystem DAX to a file on a filesystem
      mounted with DAX on a persistent memory device through the PMDK
      libpmemblk library.

   **dev-dax**
      Read and write using device DAX to a persistent memory device (e.g.,
      /dev/dax0.0) through the PMDK libpmem library.

   **external**
      Prefix to specify loading an external I/O engine object file. Append
      the engine filename, e.g. ``ioengine=external:/tmp/foo.o`` to load
      ioengine :file:`foo.o` in :file:`/tmp`. The path can be either
      absolute or relative. See :file:`engines/skeleton_external.c` for
      details of writing an external I/O engine.

   **filecreate**
      Simply create the files and do no I/O to them. You still need to
      set `filesize` so that all the accounting still occurs, but no
      actual I/O will be done other than creating the file.

   **filestat**
      Simply do stat() and do no I/O to the file. You need to set `filesize`
      and `nrfiles` so that files will be created. This engine is meant to
      measure file lookup and metadata access.

   **libpmem**
      Read and write using mmap I/O to a file on a filesystem
      mounted with DAX on a persistent memory device through the PMDK
      libpmem library.

   **ime_psync**
      Synchronous read and write using DDN's Infinite Memory Engine (IME).
      This engine is very basic and issues calls to IME whenever an IO is
      queued.

   **ime_psyncv**
      Synchronous read and write using DDN's Infinite Memory Engine (IME).
      This engine uses iovecs and will try to stack as many I/Os as possible
      (if the I/Os are "contiguous" and the IO depth is not exceeded)
      before issuing a call to IME.

   **ime_aio**
      Asynchronous read and write using DDN's Infinite Memory Engine (IME).
      This engine will try to stack as many I/Os as possible by creating
      requests for IME. FIO will then decide when to commit these requests.

   **libiscsi**
      Read and write an iSCSI LUN with libiscsi.

   **nbd**
      Read and write a Network Block Device (NBD).

I/O engine specific parameters
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

In addition, there are some parameters which are only valid when a specific
:option:`ioengine` is in use. These are used identically to normal parameters,
with the caveat that when used on the command line, they must come after the
:option:`ioengine` that defines them is selected.

.. option:: cmdprio_percentage=int : [io_uring] [libaio]

   Set the percentage of I/O that will be issued with higher priority by setting
   the priority bit. Non-read I/O is likely unaffected by ``cmdprio_percentage``.
   This option cannot be used with the `prio` or `prioclass` options. For this
   option to set the priority bit properly, NCQ priority must be supported and
   enabled and the :option:`direct`\=1 option must be used.

.. option:: fixedbufs : [io_uring]

   If fio is asked to do direct IO, then Linux will map pages for each
   IO call, and release them when IO is done. If this option is set, the
   pages are pre-mapped before IO is started. This eliminates the need to
   map and release for each IO. This is more efficient, and reduces the
   IO latency as well.

.. option:: hipri : [io_uring]

   If this option is set, fio will attempt to use polled IO completions.
   Normal IO completions generate interrupts to signal the completion of
   IO, polled completions do not. Hence they require active reaping
   by the application. The benefits are more efficient IO for high IOPS
   scenarios, and lower latencies for low queue depth IO.
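To tie these io_uring options together, here is a hedged sketch of a polled,
pre-mapped random read job (the device path is hypothetical, and polled
completions also require a device set up for polling)::

   [uring-polled-read]
   filename=/dev/nvme0n1
   ioengine=io_uring
   direct=1
   rw=randread
   iodepth=32
   fixedbufs
   hipri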
.. option:: registerfiles : [io_uring]

   With this option, fio registers the set of files being used with the
   kernel. This avoids the overhead of managing file counts in the kernel,
   making the submission and completion part more lightweight. Required
   for the below :option:`sqthread_poll` option.

.. option:: sqthread_poll : [io_uring]

   Normally fio will submit IO by issuing a system call to notify the
   kernel of available items in the SQ ring. If this option is set, the
   act of submitting IO will be done by a polling thread in the kernel.
   This frees up cycles for fio, at the cost of using more CPU in the
   system.

.. option:: sqthread_poll_cpu : [io_uring]

   When :option:`sqthread_poll` is set, this option provides a way to
   define which CPU should be used for the polling thread.

.. option:: userspace_reap : [libaio]

   Normally, with the libaio engine in use, fio will use the
   :manpage:`io_getevents(2)` system call to reap newly returned events. With
   this flag turned on, the AIO ring will be read directly from user-space to
   reap events. The reaping mode is only enabled when polling for a minimum of
   0 events (e.g. when :option:`iodepth_batch_complete` `=0`).

.. option:: hipri : [pvsync2]

   Set RWF_HIPRI on I/O, indicating to the kernel that it's of higher priority
   than normal.

.. option:: hipri_percentage : [pvsync2]

   When hipri is set this determines the probability of a pvsync2 I/O being high
   priority. The default is 100%.

.. option:: cpuload=int : [cpuio]

   Attempt to use the specified percentage of CPU cycles. This is a mandatory
   option when using the cpuio I/O engine.

.. option:: cpuchunks=int : [cpuio]

   Split the load into cycles of the given time. In microseconds.

.. option:: exit_on_io_done=bool : [cpuio]

   Detect when I/O threads are done, then exit.

.. option:: namenode=str : [libhdfs]

   The hostname or IP address of an HDFS cluster namenode to contact.

.. option:: port=int

   [libhdfs]

      The listening port of the HDFS cluster namenode.

   [netsplice], [net]

      The TCP or UDP port to bind to or connect to. If this is used with
      :option:`numjobs` to spawn multiple instances of the same job type, then
      this will be the starting port number since fio will use a range of
      ports.

   [rdma]

      The port to use for RDMA-CM communication. This should be the same value
      on the client and the server side.

.. option:: hostname=str : [netsplice] [net] [rdma]

   The hostname or IP address to use for TCP, UDP or RDMA-CM based I/O. If the job
   is a TCP listener or UDP reader, the hostname is not used and must be omitted
   unless it is a valid UDP multicast address.

.. option:: interface=str : [netsplice] [net]

   The IP address of the network interface used to send or receive UDP
   multicast.

.. option:: ttl=int : [netsplice] [net]

   Time-to-live value for outgoing UDP multicast packets. Default: 1.

.. option:: nodelay=bool : [netsplice] [net]

   Set TCP_NODELAY on TCP connections.

.. option:: protocol=str, proto=str : [netsplice] [net]

   The network protocol to use. Accepted values are:

   **tcp**
      Transmission control protocol.
   **tcpv6**
      Transmission control protocol V6.
   **udp**
      User datagram protocol.
   **udpv6**
      User datagram protocol V6.
   **unix**
      UNIX domain socket.

   When the protocol is TCP or UDP, the port must also be given, as well as the
   hostname if the job is a TCP listener or UDP reader.
   For unix sockets, the normal :option:`filename` option should be used and
   the port is invalid.

.. option:: listen : [netsplice] [net]

   For TCP network connections, tell fio to listen for incoming connections
   rather than initiating an outgoing connection. The :option:`hostname` must
   be omitted if this option is used.

.. option:: pingpong : [netsplice] [net]

   Normally a network writer will just continue writing data, and a network
   reader will just consume packets. If ``pingpong=1`` is set, a writer will
   send its normal payload to the reader, then wait for the reader to send the
   same payload back. This allows fio to measure network latencies. The
   submission and completion latencies then measure local time spent sending or
   receiving, and the completion latency measures how long it took for the
   other end to receive and send back. For UDP multicast traffic
   ``pingpong=1`` should only be set for a single reader when multiple readers
   are listening to the same address.

.. option:: window_size : [netsplice] [net]

   Set the desired socket buffer size for the connection.

.. option:: mss : [netsplice] [net]

   Set the TCP maximum segment size (TCP_MAXSEG).

.. option:: donorname=str : [e4defrag]

   File will be used as a block donor (swap extents between files).

.. option:: inplace=int : [e4defrag]

   Configure donor file blocks allocation strategy:

   **0**
      Default. Preallocate donor's file on init.
   **1**
      Allocate space immediately inside defragment event, and free right
      after event.

.. option:: clustername=str : [rbd,rados]

   Specifies the name of the Ceph cluster.

.. option:: rbdname=str : [rbd]

   Specifies the name of the RBD.

.. option:: pool=str : [rbd,rados]

   Specifies the name of the Ceph pool containing RBD or RADOS data.

.. option:: clientname=str : [rbd,rados]

   Specifies the username (without the 'client.' prefix) used to access the
   Ceph cluster. If the *clustername* is specified, the *clientname* shall be
   the full *type.id* string. If no type. prefix is given, fio will add
   'client.' by default.

.. option:: busy_poll=bool : [rbd,rados]

   Poll store instead of waiting for completion. Usually this provides better
   throughput at the cost of higher (up to 100%) CPU utilization.

.. option:: skip_bad=bool : [mtd]

   Skip operations against known bad blocks.

.. option:: hdfsdirectory : [libhdfs]

   libhdfs will create chunks in this HDFS directory.

.. option:: chunk_size : [libhdfs]

   The size of the chunk to use for each file.

.. option:: verb=str : [rdma]

   The RDMA verb to use on this side of the RDMA ioengine connection. Valid
   values are write, read, send and recv. These correspond to the equivalent
   RDMA verbs (e.g. write = rdma_write etc.). Note that this only needs to be
   specified on the client side of the connection. See the examples folder.

.. option:: bindname=str : [rdma]

   The name to use to bind the local RDMA-CM connection to a local RDMA device.
   This could be a hostname or an IPv4 or IPv6 address. On the server side this
   will be passed into the rdma_bind_addr() function and on the client side it
   will be used in the rdma_resolve_addr() function. This can be useful when
   multiple paths exist between the client and the server or in certain loopback
   configurations.

.. option:: stat_type=str : [filestat]

   Specify stat system call type to measure lookup/getattr performance.
   Default is **stat** for :manpage:`stat(2)`.
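Pulling the Ceph options together, a hedged sketch of an rbd job (the cluster
client, pool and image names are hypothetical)::

   [rbd-randwrite]
   ioengine=rbd
   clientname=admin
   pool=rbd
   rbdname=fio_test
   rw=randwrite
   bs=4k

..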
option:: readfua=bool : [sg] + + With readfua option set to 1, read operations include + the force unit access (fua) flag. Default is 0. + +.. option:: writefua=bool : [sg] + + With writefua option set to 1, write operations include + the force unit access (fua) flag. Default is 0. + +.. option:: sg_write_mode=str : [sg] + + Specify the type of write commands to issue. This option can take three values: + + **write** + This is the default where write opcodes are issued as usual. + **verify** + Issue WRITE AND VERIFY commands. The BYTCHK bit is set to 0. This + directs the device to carry out a medium verification with no data + comparison. The writefua option is ignored with this selection. + **same** + Issue WRITE SAME commands. This transfers a single block to the device + and writes this same block of data to a contiguous sequence of LBAs + beginning at the specified offset. fio's block size parameter specifies + the amount of data written with each command. However, the amount of data + actually transferred to the device is equal to the device's block + (sector) size. For a device with 512 byte sectors, blocksize=8k will + write 16 sectors with each command. fio will still generate 8k of data + for each command but only the first 512 bytes will be used and + transferred to the device. The writefua option is ignored with this + selection. + +.. option:: http_host=str : [http] + + Hostname to connect to. For S3, this could be the bucket hostname. + Default is **localhost** + +.. option:: http_user=str : [http] + + Username for HTTP authentication. + +.. option:: http_pass=str : [http] + + Password for HTTP authentication. + +.. option:: https=str : [http] + + Enable HTTPS instead of http. *on* enables HTTPS; *insecure* + will enable HTTPS, but disable SSL peer verification (use with + caution!). Default is **off** + +.. option:: http_mode=str : [http] + + Which HTTP access mode to use: *webdav*, *swift*, or *s3*. + Default is **webdav** + +.. option:: http_s3_region=str : [http] + + The S3 region/zone string. + Default is **us-east-1** + +.. option:: http_s3_key=str : [http] + + The S3 secret key. + +.. option:: http_s3_keyid=str : [http] + + The S3 key/access id. + +.. option:: http_swift_auth_token=str : [http] + + The Swift auth token. See the example configuration file on how + to retrieve this. + +.. option:: http_verbose=int : [http] + + Enable verbose requests from libcurl. Useful for debugging. 1 + turns on verbose logging from libcurl, 2 additionally enables + HTTP IO tracing. Default is **0** + +.. option:: uri=str : [nbd] + + Specify the NBD URI of the server to test. The string + is a standard NBD URI + (see https://github.com/NetworkBlockDevice/nbd/tree/master/doc). + Example URIs: nbd://localhost:10809 + nbd+unix:///?socket=/tmp/socket + nbds://tlshost/exportname + +I/O depth +~~~~~~~~~ + +.. option:: iodepth=int + + Number of I/O units to keep in flight against the file. Note that + increasing *iodepth* beyond 1 will not affect synchronous ioengines (except + for small degrees when :option:`verify_async` is in use). Even async + engines may impose OS restrictions causing the desired depth not to be + achieved. This may happen on Linux when using libaio and not setting + :option:`direct`\=1, since buffered I/O is not async on that OS. Keep an + eye on the I/O depth distribution in the fio output to verify that the + achieved depth is as expected. Default: 1. + +.. option:: iodepth_batch_submit=int, iodepth_batch=int + + This defines how many pieces of I/O to submit at once. 
   It defaults to 1 which means that we submit each I/O as soon as it is
   available, but can be raised to submit bigger batches of I/O at a time. If
   it is set to 0 the :option:`iodepth` value will be used.

.. option:: iodepth_batch_complete_min=int, iodepth_batch_complete=int

   This defines how many pieces of I/O to retrieve at once. It defaults to 1
   which means that we'll ask for a minimum of 1 I/O in the retrieval process
   from the kernel. The I/O retrieval will go on until we hit the limit set by
   :option:`iodepth_low`. If this variable is set to 0, then fio will always
   check for completed events before queuing more I/O. This helps reduce I/O
   latency, at the cost of more retrieval system calls.

.. option:: iodepth_batch_complete_max=int

   This defines the maximum pieces of I/O to retrieve at once. This variable
   should be used along with the :option:`iodepth_batch_complete_min`\=int
   variable, specifying the range of min and max amount of I/O which should be
   retrieved. By default it is equal to the :option:`iodepth_batch_complete_min`
   value.

   Example #1::

      iodepth_batch_complete_min=1
      iodepth_batch_complete_max=<iodepth>

   which means that we will retrieve at least 1 I/O and up to the whole
   submitted queue depth. If no I/O has completed yet, we will wait.

   Example #2::

      iodepth_batch_complete_min=0
      iodepth_batch_complete_max=<iodepth>

   which means that we can retrieve up to the whole submitted queue depth, but
   if no I/O has completed yet, we will NOT wait and immediately exit the
   system call. In this example we simply do polling.

.. option:: iodepth_low=int

   The low water mark indicating when to start filling the queue
   again. Defaults to the same as :option:`iodepth`, meaning that fio will
   attempt to keep the queue full at all times. If :option:`iodepth` is set to
   e.g. 16 and *iodepth_low* is set to 4, then after fio has filled the queue of
   16 requests, it will let the depth drain down to 4 before starting to fill
   it again.

.. option:: serialize_overlap=bool

   Serialize in-flight I/Os that might otherwise cause or suffer from data races.
   When two or more I/Os are submitted simultaneously, there is no guarantee that
   the I/Os will be processed or completed in the submitted order. Further, if
   two or more of those I/Os are writes, any overlapping region between them can
   become indeterminate/undefined on certain storage. These issues can cause
   verification to fail erratically when at least one of the racing I/Os is
   changing data and the overlapping region has a non-zero size. Setting
   ``serialize_overlap`` tells fio to avoid provoking this behavior by explicitly
   serializing in-flight I/Os that have a non-zero overlap. Note that setting
   this option can reduce both performance and the :option:`iodepth` achieved.

   This option only applies to I/Os issued for a single job except when it is
   enabled along with :option:`io_submit_mode`\=offload. In offload mode, fio
   will check for overlap among all I/Os submitted by offload jobs with
   :option:`serialize_overlap` enabled.

   Default: false.

.. option:: io_submit_mode=str

   This option controls how fio submits the I/O to the I/O engine. The default
   is `inline`, which means that the fio job threads submit and reap I/O
   directly. If set to `offload`, the job threads will offload I/O submission
   to a dedicated pool of I/O threads.
   This requires some coordination and thus has a bit of extra overhead,
   especially for lower queue depth I/O where it can increase latencies. The
   benefit is that fio can manage submission rates independently of the device
   completion rates. This avoids skewed latency reporting if I/O gets backed up
   on the device side (the coordinated omission problem).


I/O rate
~~~~~~~~

.. option:: thinktime=time

   Stall the job for the specified period of time after an I/O has completed
   before issuing the next. May be used to simulate processing being done by an
   application. When the unit is omitted, the value is interpreted in
   microseconds. See :option:`thinktime_blocks` and :option:`thinktime_spin`.

.. option:: thinktime_spin=time

   Only valid if :option:`thinktime` is set - pretend to spend CPU time doing
   something with the data received, before falling back to sleeping for the
   rest of the period specified by :option:`thinktime`. When the unit is
   omitted, the value is interpreted in microseconds.

.. option:: thinktime_blocks=int

   Only valid if :option:`thinktime` is set - control how many blocks to issue,
   before waiting :option:`thinktime` usecs. If not set, defaults to 1 which will
   make fio wait :option:`thinktime` usecs after every block. This effectively
   makes any queue depth setting redundant, since no more than 1 I/O will be
   queued before we have to complete it and do our :option:`thinktime`. In other
   words, this setting effectively caps the queue depth if the latter is larger.

.. option:: rate=int[,int][,int]

   Cap the bandwidth used by this job. The number is in bytes/sec, the normal
   suffix rules apply. Comma-separated values may be specified for reads,
   writes, and trims as described in :option:`blocksize`.

   For example, using `rate=1m,500k` would limit reads to 1MiB/sec and writes to
   500KiB/sec. Capping only reads or writes can be done with `rate=,500k` or
   `rate=500k,` where the former will only limit writes (to 500KiB/sec) and the
   latter will only limit reads.

.. option:: rate_min=int[,int][,int]

   Tell fio to do whatever it can to maintain at least this bandwidth. Failing
   to meet this requirement will cause the job to exit. Comma-separated values
   may be specified for reads, writes, and trims as described in
   :option:`blocksize`.

.. option:: rate_iops=int[,int][,int]

   Cap the bandwidth to this number of IOPS. Basically the same as
   :option:`rate`, just specified independently of bandwidth. If the job is
   given a block size range instead of a fixed value, the smallest block size
   is used as the metric. Comma-separated values may be specified for reads,
   writes, and trims as described in :option:`blocksize`.

.. option:: rate_iops_min=int[,int][,int]

   If fio doesn't meet this rate of I/O, it will cause the job to exit.
   Comma-separated values may be specified for reads, writes, and trims as
   described in :option:`blocksize`.

.. option:: rate_process=str

   This option controls how fio manages rated I/O submissions. The default is
   `linear`, which submits I/O in a linear fashion with fixed delays between
   I/Os that get adjusted based on I/O completion rates. If this is set to
   `poisson`, fio will submit I/O based on a more real world random request
   flow, known as the Poisson process
   (https://en.wikipedia.org/wiki/Poisson_point_process). The lambda will be
   10^6 / IOPS for the given workload.
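As a worked example of the rate options, a hedged sketch of a mixed job
limited to 10000 read IOPS and 2000 write IOPS, submitted as a Poisson
process (the file name is hypothetical)::

   [rated-mixed]
   filename=/data/fio.tmp
   rw=randrw
   bs=4k
   ioengine=libaio
   iodepth=16
   direct=1
   rate_iops=10000,2000
   rate_process=poisson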
+.. option:: rate_ignore_thinktime=bool
+
+ By default, fio will attempt to catch up to the specified rate setting,
+ if any kind of thinktime setting was used. If this option is set, then
+ fio will ignore the thinktime and continue doing IO at the specified
+ rate, instead of entering a catch-up mode after thinktime is done.
+
+
+I/O latency
+~~~~~~~~~~~
+
+.. option:: latency_target=time
+
+ If set, fio will attempt to find the max performance point that the given
+ workload will run at while maintaining a latency below this target. When
+ the unit is omitted, the value is interpreted in microseconds. See
+ :option:`latency_window` and :option:`latency_percentile`.
+
+.. option:: latency_window=time
+
+ Used with :option:`latency_target` to specify the sample window that the job
+ is run at varying queue depths to test the performance. When the unit is
+ omitted, the value is interpreted in microseconds.
+
+.. option:: latency_percentile=float
+
+ The percentage of I/Os that must fall within the criteria specified by
+ :option:`latency_target` and :option:`latency_window`. If not set, this
+ defaults to 100.0, meaning that all I/Os must be equal to or below the value
+ set by :option:`latency_target`.
+
+.. option:: max_latency=time
+
+ If set, fio will exit the job with an ETIMEDOUT error if it exceeds this
+ maximum latency. When the unit is omitted, the value is interpreted in
+ microseconds.
+
+.. option:: rate_cycle=int
+
+ Average bandwidth for :option:`rate` and :option:`rate_min` over this number
+ of milliseconds. Defaults to 1000.
+
+
+I/O replay
+~~~~~~~~~~
+
+.. option:: write_iolog=str
+
+ Write the issued I/O patterns to the specified file. See
+ :option:`read_iolog`. Specify a separate file for each job, otherwise the
+ iologs will be interspersed and the file may be corrupt.
+
+.. option:: read_iolog=str
+
+ Open an iolog with the specified filename and replay the I/O patterns it
+ contains. This can be used to store a workload and replay it sometime
+ later. The iolog given may also be a blktrace binary file, which allows fio
+ to replay a workload captured by :command:`blktrace`. See
+ :manpage:`blktrace(8)` for how to capture such logging data. For blktrace
+ replay, the file needs to be turned into a blkparse binary data file first
+ (``blkparse -o /dev/null -d file_for_fio.bin``).
+ You can specify a number of files by separating the names with a ':'
+ character. See the :option:`filename` option for information on how to
+ escape ':' characters within the file names. These files will
+ be sequentially assigned to job clones created by :option:`numjobs`.
+
+.. option:: read_iolog_chunked=bool
+
+ Determines how the iolog is read. If false (the default), the entire
+ :option:`read_iolog` will be read at once. If set to true, input from the
+ iolog will be read gradually. This is useful when the iolog is very large,
+ or when it is being generated while fio runs.
+
+.. option:: merge_blktrace_file=str
+
+ When specified, rather than replaying the logs passed to :option:`read_iolog`,
+ the logs go through a merge phase which aggregates them into a single
+ blktrace. The resulting file is then passed on as the :option:`read_iolog`
+ parameter. The intention here is to make the order of events consistent.
+ This limits the influence of the scheduler compared to replaying multiple
+ blktraces via concurrent jobs.
+
+.. option:: merge_blktrace_scalars=float_list
+
+ This is a percentage based option that is index paired with the list of
+ files passed to :option:`read_iolog`.
When merging is performed, scale + the time of each event by the corresponding amount. For example, + ``--merge_blktrace_scalars="50:100"`` runs the first trace in halftime + and the second trace in realtime. This knob is separately tunable from + :option:`replay_time_scale` which scales the trace during runtime and + does not change the output of the merge unlike this option. + +.. option:: merge_blktrace_iters=float_list + + This is a whole number option that is index paired with the list of files + passed to :option:`read_iolog`. When merging is performed, run each trace + for the specified number of iterations. For example, + ``--merge_blktrace_iters="2:1"`` runs the first trace for two iterations + and the second trace for one iteration. + +.. option:: replay_no_stall=bool + + When replaying I/O with :option:`read_iolog` the default behavior is to + attempt to respect the timestamps within the log and replay them with the + appropriate delay between IOPS. By setting this variable fio will not + respect the timestamps and attempt to replay them as fast as possible while + still respecting ordering. The result is the same I/O pattern to a given + device, but different timings. + +.. option:: replay_time_scale=int + + When replaying I/O with :option:`read_iolog`, fio will honor the + original timing in the trace. With this option, it's possible to scale + the time. It's a percentage option, if set to 50 it means run at 50% + the original IO rate in the trace. If set to 200, run at twice the + original IO rate. Defaults to 100. + +.. option:: replay_redirect=str + + While replaying I/O patterns using :option:`read_iolog` the default behavior + is to replay the IOPS onto the major/minor device that each IOP was recorded + from. This is sometimes undesirable because on a different machine those + major/minor numbers can map to a different device. Changing hardware on the + same system can also result in a different major/minor mapping. + ``replay_redirect`` causes all I/Os to be replayed onto the single specified + device regardless of the device it was recorded + from. i.e. :option:`replay_redirect`\= :file:`/dev/sdc` would cause all I/O + in the blktrace or iolog to be replayed onto :file:`/dev/sdc`. This means + multiple devices will be replayed onto a single device, if the trace + contains multiple devices. If you want multiple devices to be replayed + concurrently to multiple redirected devices you must blkparse your trace + into separate traces and replay them with independent fio invocations. + Unfortunately this also breaks the strict time ordering between multiple + device accesses. + +.. option:: replay_align=int + + Force alignment of the byte offsets in a trace to this value. The value + must be a power of 2. + +.. option:: replay_scale=int + + Scale byte offsets down by this factor when replaying traces. Should most + likely use :option:`replay_align` as well. + +.. option:: replay_skip=str + + Sometimes it's useful to skip certain IO types in a replay trace. + This could be, for instance, eliminating the writes in the trace. + Or not replaying the trims/discards, if you are redirecting to + a device that doesn't support them. This option takes a comma + separated list of read, write, trim, sync. + + +Threads, processes and job synchronization +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. 
option:: thread + + Fio defaults to creating jobs by using fork, however if this option is + given, fio will create jobs by using POSIX Threads' function + :manpage:`pthread_create(3)` to create threads instead. + +.. option:: wait_for=str + + If set, the current job won't be started until all workers of the specified + waitee job are done. + + ``wait_for`` operates on the job name basis, so there are a few + limitations. First, the waitee must be defined prior to the waiter job + (meaning no forward references). Second, if a job is being referenced as a + waitee, it must have a unique name (no duplicate waitees). + +.. option:: nice=int + + Run the job with the given nice value. See man :manpage:`nice(2)`. + + On Windows, values less than -15 set the process class to "High"; -1 through + -15 set "Above Normal"; 1 through 15 "Below Normal"; and above 15 "Idle" + priority class. + +.. option:: prio=int + + Set the I/O priority value of this job. Linux limits us to a positive value + between 0 and 7, with 0 being the highest. See man + :manpage:`ionice(1)`. Refer to an appropriate manpage for other operating + systems since meaning of priority may differ. For per-command priority + setting, see I/O engine specific `cmdprio_percentage` and `hipri_percentage` + options. + +.. option:: prioclass=int + + Set the I/O priority class. See man :manpage:`ionice(1)`. For per-command + priority setting, see I/O engine specific `cmdprio_percentage` and + `hipri_percentage` options. + +.. option:: cpus_allowed=str + + Controls the same options as :option:`cpumask`, but accepts a textual + specification of the permitted CPUs instead and CPUs are indexed from 0. So + to use CPUs 0 and 5 you would specify ``cpus_allowed=0,5``. This option also + allows a range of CPUs to be specified -- say you wanted a binding to CPUs + 0, 5, and 8 to 15, you would set ``cpus_allowed=0,5,8-15``. + + On Windows, when ``cpus_allowed`` is unset only CPUs from fio's current + processor group will be used and affinity settings are inherited from the + system. An fio build configured to target Windows 7 makes options that set + CPUs processor group aware and values will set both the processor group + and a CPU from within that group. For example, on a system where processor + group 0 has 40 CPUs and processor group 1 has 32 CPUs, ``cpus_allowed`` + values between 0 and 39 will bind CPUs from processor group 0 and + ``cpus_allowed`` values between 40 and 71 will bind CPUs from processor + group 1. When using ``cpus_allowed_policy=shared`` all CPUs specified by a + single ``cpus_allowed`` option must be from the same processor group. For + Windows fio builds not built for Windows 7, CPUs will only be selected from + (and be relative to) whatever processor group fio happens to be running in + and CPUs from other processor groups cannot be used. + +.. option:: cpus_allowed_policy=str + + Set the policy of how fio distributes the CPUs specified by + :option:`cpus_allowed` or :option:`cpumask`. Two policies are supported: + + **shared** + All jobs will share the CPU set specified. + **split** + Each job will get a unique CPU from the CPU set. + + **shared** is the default behavior, if the option isn't specified. If + **split** is specified, then fio will assign one cpu per job. If not + enough CPUs are given for the jobs listed, then fio will roundrobin the CPUs + in the set. + +.. option:: cpumask=int + + Set the CPU affinity of this job. The parameter given is a bit mask of + allowed CPUs the job may run on. 
So if you want the allowed CPUs to be 1
+ and 5, you would pass the decimal value of (1 << 1 | 1 << 5), or 34. See man
+ :manpage:`sched_setaffinity(2)`. This may not work on all supported
+ operating systems or kernel versions. This option doesn't work well for a
+ higher CPU count than what you can store in an integer mask, so it can only
+ control cpus 1-32. For boxes with larger CPU counts, use
+ :option:`cpus_allowed`.
+
+.. option:: numa_cpu_nodes=str
+
+ Set this job running on specified NUMA nodes' CPUs. The arguments allow a
+ comma delimited list of cpu numbers, A-B ranges, or `all`. Note, to enable
+ NUMA options support, fio must be built on a system with libnuma-dev(el)
+ installed.
+
+.. option:: numa_mem_policy=str
+
+ Set this job's memory policy and corresponding NUMA nodes. Format of the
+ arguments::
+
+ <mode>[:<nodelist>]
+
+ ``mode`` is one of the following memory policies: ``default``, ``prefer``,
+ ``bind``, ``interleave`` or ``local``. For ``default`` and ``local`` memory
+ policies, no node needs to be specified. For ``prefer``, only one node is
+ allowed. For ``bind`` and ``interleave`` the ``nodelist`` may be as
+ follows: a comma delimited list of numbers, A-B ranges, or `all`.
+
+.. option:: cgroup=str
+
+ Add job to this control group. If it doesn't exist, it will be created. The
+ system must have a mounted cgroup blkio mount point for this to work. If
+ your system doesn't have it mounted, you can do so with::
+
+ # mount -t cgroup -o blkio none /cgroup
+
+.. option:: cgroup_weight=int
+
+ Set the weight of the cgroup to this value. See the documentation that comes
+ with the kernel, allowed values are in the range of 100..1000.
+
+.. option:: cgroup_nodelete=bool
+
+ Normally fio will delete the cgroups it has created after the job
+ completion. To override this behavior and to leave cgroups around after the
+ job completion, set ``cgroup_nodelete=1``. This can be useful if one wants
+ to inspect various cgroup files after job completion. Default: false.
+
+.. option:: flow_id=int
+
+ The ID of the flow. If not specified, it defaults to being a global
+ flow. See :option:`flow`.
+
+.. option:: flow=int
+
+ Weight in token-based flow control. If this value is used, then there is a
+ 'flow counter' which is used to regulate the proportion of activity between
+ two or more jobs. Fio attempts to keep this flow counter near zero. The
+ ``flow`` parameter stands for how much should be added to or subtracted from
+ the flow counter on each iteration of the main I/O loop. That is, if one job
+ has ``flow=8`` and another job has ``flow=-1``, then there will be a roughly
+ 1:8 ratio in how much one runs vs the other.
+
+.. option:: flow_watermark=int
+
+ The maximum value that the absolute value of the flow counter is allowed to
+ reach before the job must wait for a lower value of the counter.
+
+.. option:: flow_sleep=int
+
+ The period of time, in microseconds, to wait after the flow watermark has
+ been exceeded before retrying operations.
+
+.. option:: stonewall, wait_for_previous
+
+ Wait for preceding jobs in the job file to exit, before starting this
+ one. Can be used to insert serialization points in the job file. A
+ stonewall also implies starting a new reporting group, see
+ :option:`group_reporting`.
+
+.. option:: exitall
+
+ By default, fio will continue running all other jobs when one job finishes.
+ Sometimes this is not the desired action. Setting ``exitall`` will instead
+ make fio terminate all jobs in the same group, as soon as one job of that
+ group finishes.
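+
+ For instance, in this hedged sketch (the job layout and file name are
+ invented for illustration), ``reader`` and ``writer`` form a new group
+ after the stonewall; with ``exitall`` set, the whole group stops as soon
+ as ``reader`` completes its 60 second run, cutting ``writer`` short::
+
+ [global]
+ filename=/tmp/fio.exitall.test
+ size=256M
+ exitall
+
+ [precondition]
+ rw=write
+
+ [reader]
+ ; wait for the precondition phase, then start a new group
+ stonewall
+ rw=randread
+ time_based
+ runtime=60
+
+ [writer]
+ rw=randwrite
+ time_based
+ runtime=120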
+
+.. option:: exit_what
+
+ By default, fio will continue running all other jobs when one job finishes.
+ Sometimes this is not the desired action. Setting ``exitall`` will
+ instead make fio terminate all jobs in the same group. The option
+ ``exit_what`` allows one to control which jobs get terminated when ``exitall``
+ is enabled. The default is ``group`` and does not change the behaviour of
+ ``exitall``. The setting ``all`` terminates all jobs. The setting ``stonewall``
+ terminates all currently running jobs across all groups and continues execution
+ with the next stonewalled group.
+
+.. option:: exec_prerun=str
+
+ Before running this job, issue the command specified through
+ :manpage:`system(3)`. Output is redirected to a file called
+ :file:`jobname.prerun.txt`.
+
+.. option:: exec_postrun=str
+
+ After the job completes, issue the command specified through
+ :manpage:`system(3)`. Output is redirected to a file called
+ :file:`jobname.postrun.txt`.
+
+.. option:: uid=int
+
+ Instead of running as the invoking user, set the user ID to this value
+ before the thread/process does any work.
+
+.. option:: gid=int
+
+ Set group ID, see :option:`uid`.
+
+
+Verification
+~~~~~~~~~~~~
+
+.. option:: verify_only
+
+ Do not perform the specified workload, only verify that data still matches a
+ previous invocation of this workload. This option allows one to check data
+ multiple times at a later date without overwriting it. This option makes
+ sense only for workloads that write data, and does not support workloads
+ with the :option:`time_based` option set.
+
+.. option:: do_verify=bool
+
+ Run the verify phase after a write phase. Only valid if :option:`verify` is
+ set. Default: true.
+
+.. option:: verify=str
+
+ If writing to a file, fio can verify the file contents after each iteration
+ of the job. Each verification method also implies verification of a special
+ header, which is written to the beginning of each block. This header also
+ includes meta information, like the offset of the block, the block number,
+ the timestamp when the block was written, etc. :option:`verify` can be
+ combined with the :option:`verify_pattern` option. The allowed values are:
+
+ **md5**
+ Use an md5 sum of the data area and store it in the header of
+ each block.
+
+ **crc64**
+ Use an experimental crc64 sum of the data area and store it in the
+ header of each block.
+
+ **crc32c**
+ Use a crc32c sum of the data area and store it in the header of
+ each block. This will automatically use hardware acceleration
+ (e.g. SSE4.2 on an x86 or CRC crypto extensions on ARM64) but will
+ fall back to software crc32c if none is found. Generally the
+ fastest checksum fio supports when hardware accelerated.
+
+ **crc32c-intel**
+ Synonym for crc32c.
+
+ **crc32**
+ Use a crc32 sum of the data area and store it in the header of each
+ block.
+
+ **crc16**
+ Use a crc16 sum of the data area and store it in the header of each
+ block.
+
+ **crc7**
+ Use a crc7 sum of the data area and store it in the header of each
+ block.
+
+ **xxhash**
+ Use xxhash as the checksum function. Generally the fastest software
+ checksum that fio supports.
+
+ **sha512**
+ Use sha512 as the checksum function.
+
+ **sha256**
+ Use sha256 as the checksum function.
+
+ **sha1**
+ Use optimized sha1 as the checksum function.
+
+ **sha3-224**
+ Use optimized sha3-224 as the checksum function.
+
+ **sha3-256**
+ Use optimized sha3-256 as the checksum function.
+
+ **sha3-384**
+ Use optimized sha3-384 as the checksum function.
+
+ **sha3-512**
+ Use optimized sha3-512 as the checksum function.
+
+ **meta**
+ This option is deprecated, since now meta information is included in
+ the generic verification header and meta verification happens by
+ default. For detailed information see the description of the
+ :option:`verify` setting. This option is only kept for compatibility
+ with old configurations. Do not use it.
+
+ **pattern**
+ Verify a strict pattern. Normally fio includes a header with some
+ basic information and checksumming, but if this option is set, only
+ the specific pattern set with :option:`verify_pattern` is verified.
+
+ **null**
+ Only pretend to verify. Useful for testing internals with
+ :option:`ioengine`\=null, not for much else.
+
+ This option can be used for repeated burn-in tests of a system to make sure
+ that the written data is also correctly read back. If the data direction
+ given is a read or random read, fio will assume that it should verify a
+ previously written file. If the data direction includes any form of write,
+ the verify will be of the newly written data.
+
+ To avoid false verification errors, do not use the norandommap option when
+ verifying data with async I/O engines and I/O depths > 1. Or use the
+ norandommap and the lfsr random generator together to avoid writing to the
+ same offset with multiple outstanding I/Os.
+
+.. option:: verify_offset=int
+
+ Swap the verification header with data somewhere else in the block before
+ writing. It is swapped back before verifying.
+
+.. option:: verify_interval=int
+
+ Write the verification header at a finer granularity than the
+ :option:`blocksize`. It will be written for chunks the size of
+ ``verify_interval``. :option:`blocksize` should divide this evenly.
+
+.. option:: verify_pattern=str
+
+ If set, fio will fill the I/O buffers with this pattern. Fio defaults to
+ filling with totally random bytes, but sometimes it's interesting to fill
+ with a known pattern for I/O verification purposes. Depending on the width
+ of the pattern, fio will fill 1/2/3/4 bytes of the buffer at a time (it can
+ be either a decimal or a hex number). If larger than a 32-bit quantity,
+ ``verify_pattern`` has to be a hex number that starts with either "0x" or
+ "0X". Use with :option:`verify`. Also, ``verify_pattern`` supports the %o
+ format, which means that for each block the offset will be written and then
+ verified back, e.g.::
+
+ verify_pattern=%o
+
+ Or use a combination of everything::
+
+ verify_pattern=0xff%o"abcd"-12
+
+.. option:: verify_fatal=bool
+
+ Normally fio will keep checking the entire contents before quitting on a
+ block verification failure. If this option is set, fio will exit the job on
+ the first observed failure. Default: false.
+
+.. option:: verify_dump=bool
+
+ If set, dump the contents of both the original data block and the data block
+ we read off disk to files. This allows later analysis to inspect just what
+ kind of data corruption occurred. Off by default.
+
+.. option:: verify_async=int
+
+ Fio will normally verify I/O inline from the submitting thread. This option
+ takes an integer describing how many async offload threads to create for I/O
+ verification instead, causing fio to offload the duty of verifying I/O
+ contents to one or more separate threads. If using this offload option, even
+ sync I/O engines can benefit from using an :option:`iodepth` setting higher
+ than 1, as it allows them to have I/O in flight while verifies are running.
+ Defaults to 0 async threads, i.e. verification is not asynchronous.
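+
+ Putting a few of the verification options above together, a rough, hedged
+ sketch of a write-and-verify job (the file name and size are invented for
+ illustration) might look like::
+
+ [write-and-verify]
+ filename=/tmp/fio.verify.test
+ size=256M
+ rw=write
+ ; checksum each block with crc32c and stop on the first mismatch
+ verify=crc32c
+ do_verify=1
+ verify_fatal=1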
+
+.. option:: verify_async_cpus=str
+
+ Tell fio to set the given CPU affinity on the async I/O verification
+ threads. See :option:`cpus_allowed` for the format used.
+
+.. option:: verify_backlog=int
+
+ Fio will normally verify the written contents of a job that utilizes verify
+ once that job has completed. In other words, everything is written then
+ everything is read back and verified. You may want to verify continually
+ instead for a variety of reasons. Fio stores the meta data associated with
+ an I/O block in memory, so for large verify workloads, quite a bit of memory
+ would be used up holding this meta data. If this option is enabled, fio will
+ write only N blocks before verifying these blocks.
+
+.. option:: verify_backlog_batch=int
+
+ Control how many blocks fio will verify if :option:`verify_backlog` is
+ set. If not set, will default to the value of :option:`verify_backlog`
+ (meaning the entire queue is read back and verified). If
+ ``verify_backlog_batch`` is less than :option:`verify_backlog` then not all
+ blocks will be verified, if ``verify_backlog_batch`` is larger than
+ :option:`verify_backlog`, some blocks will be verified more than once.
+
+.. option:: verify_state_save=bool
+
+ When a job exits during the write phase of a verify workload, save its
+ current state. This allows fio to replay up until that point, if the verify
+ state is loaded for the verify read phase. The format of the filename is,
+ roughly::
+
+ <type>-<jobname>-<jobindex>-verify.state
+
+ <type> is "local" for a local run, "sock" for a client/server socket
+ connection, and "ip" (192.168.0.1, for instance) for a networked
+ client/server connection. Defaults to true.
+
+.. option:: verify_state_load=bool
+
+ If a verify termination trigger was used, fio stores the current write state
+ of each thread. This can be used at verification time so that fio knows how
+ far it should verify. Without this information, fio will run a full
+ verification pass, according to the settings in the job file used. Default:
+ false.
+
+.. option:: trim_percentage=int
+
+ Number of verify blocks to discard/trim.
+
+.. option:: trim_verify_zero=bool
+
+ Verify that trim/discarded blocks are returned as zeros.
+
+.. option:: trim_backlog=int
+
+ Trim after this number of blocks are written.
+
+.. option:: trim_backlog_batch=int
+
+ Trim this number of I/O blocks.
+
+.. option:: experimental_verify=bool
+
+ Enable experimental verification.
+
+Steady state
+~~~~~~~~~~~~
+
+.. option:: steadystate=str:float, ss=str:float
+
+ Define the criterion and limit for assessing steady state performance. The
+ first parameter designates the criterion whereas the second parameter sets
+ the threshold. When the criterion falls below the threshold for the
+ specified duration, the job will stop. For example, `iops_slope:0.1%` will
+ direct fio to terminate the job when the least squares regression slope
+ falls below 0.1% of the mean IOPS. If :option:`group_reporting` is enabled
+ this will apply to all jobs in the group. Below is the list of available
+ steady state assessment criteria. All assessments are carried out using only
+ data from the rolling collection window. Threshold limits can be expressed
+ as a fixed value or as a percentage of the mean in the collection window.
+
+ When using this feature, most jobs should include the :option:`time_based`
+ and :option:`runtime` options or the :option:`loops` option so that fio does not
+ stop running after it has covered the full size of the specified file(s) or device(s).
+
+ **iops**
+ Collect IOPS data. Stop the job if all individual IOPS measurements
+ are within the specified limit of the mean IOPS (e.g., ``iops:2``
+ means that all individual IOPS values must be within 2 of the mean,
+ whereas ``iops:0.2%`` means that all individual IOPS values must be
+ within 0.2% of the mean IOPS to terminate the job).
+
+ **iops_slope**
+ Collect IOPS data and calculate the least squares regression
+ slope. Stop the job if the slope falls below the specified limit.
+
+ **bw**
+ Collect bandwidth data. Stop the job if all individual bandwidth
+ measurements are within the specified limit of the mean bandwidth.
+
+ **bw_slope**
+ Collect bandwidth data and calculate the least squares regression
+ slope. Stop the job if the slope falls below the specified limit.
+
+.. option:: steadystate_duration=time, ss_dur=time
+
+ A rolling window of this duration will be used to judge whether steady state
+ has been reached. Data will be collected once per second. The default is 0
+ which disables steady state detection. When the unit is omitted, the
+ value is interpreted in seconds.
+
+.. option:: steadystate_ramp_time=time, ss_ramp=time
+
+ Allow the job to run for the specified duration before beginning data
+ collection for checking the steady state job termination criterion. The
+ default is 0. When the unit is omitted, the value is interpreted in seconds.
+
+
+Measurements and reporting
+~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. option:: per_job_logs=bool
+
+ If set, this generates bw/clat/iops logs with per-file private filenames. If
+ not set, jobs with identical names will share the log filename. Default:
+ true.
+
+.. option:: group_reporting
+
+ It may sometimes be interesting to display statistics for groups of jobs as
+ a whole instead of for each individual job. This is especially true if
+ :option:`numjobs` is used; looking at individual thread/process output
+ quickly becomes unwieldy. To see the final report per-group instead of
+ per-job, use :option:`group_reporting`. Jobs in a file will be part of the
+ same reporting group, unless separated by a :option:`stonewall`, or by
+ using :option:`new_group`.
+
+.. option:: new_group
+
+ Start a new reporting group. See: :option:`group_reporting`. If not given,
+ all jobs in a file will be part of the same reporting group, unless
+ separated by a :option:`stonewall`.
+
+.. option:: stats=bool
+
+ By default, fio collects and shows final output results for all jobs
+ that run. If this option is set to 0, then fio will ignore it in
+ the final stat output.
+
+.. option:: write_bw_log=str
+
+ If given, write a bandwidth log for this job. Can be used to store the
+ bandwidth of the jobs over their lifetime.
+
+ If no str argument is given, the default filename of
+ :file:`jobname_type.x.log` is used. Even when the argument is given, fio
+ will still append the type of log. So if one specifies::
+
+ write_bw_log=foo
+
+ the actual log name will be :file:`foo_bw.x.log` where `x` is the index
+ of the job (`1..N`, where `N` is the number of jobs). If
+ :option:`per_job_logs` is false, then the filename will not include the
+ `.x` job index.
+
+ The included :command:`fio_generate_plots` script uses :command:`gnuplot` to turn these
+ text files into nice graphs.
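+
+ As a hedged illustration (the names here are placeholders), a job that
+ produces a windowed bandwidth log might look like the following, where
+ :option:`log_avg_msec` (described below) averages entries over 1000 ms::
+
+ [bw-logged]
+ filename=/tmp/fio.bwlog.test
+ size=256M
+ rw=write
+ write_bw_log=bench
+ log_avg_msec=1000
+
+ With :option:`per_job_logs` at its default, this single job would produce
+ :file:`bench_bw.1.log`.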
+ See `Log File Formats`_ for how data is structured within the file.
+
+.. option:: write_lat_log=str
+
+ Same as :option:`write_bw_log`, except this option creates I/O
+ submission (e.g., :file:`name_slat.x.log`), completion (e.g.,
+ :file:`name_clat.x.log`), and total (e.g., :file:`name_lat.x.log`)
+ latency files instead. See :option:`write_bw_log` for details about
+ the filename format and `Log File Formats`_ for how data is structured
+ within the files.
+
+.. option:: write_hist_log=str
+
+ Same as :option:`write_bw_log` but writes an I/O completion latency
+ histogram file (e.g., :file:`name_hist.x.log`) instead. Note that this
+ file will be empty unless :option:`log_hist_msec` has also been set.
+ See :option:`write_bw_log` for details about the filename format and
+ `Log File Formats`_ for how data is structured within the file.
+
+.. option:: write_iops_log=str
+
+ Same as :option:`write_bw_log`, but writes an IOPS file (e.g.
+ :file:`name_iops.x.log`) instead. Because fio defaults to individual
+ I/O logging, the value entry in the IOPS log will be 1 unless windowed
+ logging (see :option:`log_avg_msec`) has been enabled. See
+ :option:`write_bw_log` for details about the filename format and `Log
+ File Formats`_ for how data is structured within the file.
+
+.. option:: log_avg_msec=int
+
+ By default, fio will log an entry in the iops, latency, or bw log for every
+ I/O that completes. When writing to the disk log, that can quickly grow to a
+ very large size. Setting this option makes fio average each log entry
+ over the specified period of time, reducing the resolution of the log. See
+ :option:`log_max_value` as well. Defaults to 0, logging all entries.
+ Also see `Log File Formats`_.
+
+.. option:: log_hist_msec=int
+
+ Same as :option:`log_avg_msec`, but logs entries for completion latency
+ histograms. Computing latency percentiles from averages of intervals using
+ :option:`log_avg_msec` is inaccurate. Setting this option makes fio log
+ histogram entries over the specified period of time, reducing log sizes for
+ high IOPS devices while retaining percentile accuracy. See
+ :option:`log_hist_coarseness` and :option:`write_hist_log` as well.
+ Defaults to 0, meaning histogram logging is disabled.
+
+.. option:: log_hist_coarseness=int
+
+ Integer ranging from 0 to 6, defining the coarseness of the resolution of
+ the histogram logs enabled with :option:`log_hist_msec`. For each increment
+ in coarseness, fio outputs half as many bins. Defaults to 0, for which
+ histogram logs contain 1216 latency bins. See :option:`write_hist_log`
+ and `Log File Formats`_.
+
+.. option:: log_max_value=bool
+
+ If :option:`log_avg_msec` is set, fio logs the average over that window. If
+ you instead want to log the maximum value, set this option to 1. Defaults to
+ 0, meaning that averaged values are logged.
+
+.. option:: log_offset=bool
+
+ If this is set, the iolog options will include the byte offset for the I/O
+ entry as well as the other data values. Defaults to 0 meaning that
+ offsets are not present in logs. Also see `Log File Formats`_.
+
+.. option:: log_compression=int
+
+ If this is set, fio will compress the I/O logs as it goes, to keep the
+ memory footprint lower. When a log reaches the specified size, that chunk is
+ removed and compressed in the background. Given that I/O logs are fairly
+ highly compressible, this yields a nice memory savings for longer runs.
The + downside is that the compression will consume some background CPU cycles, so + it may impact the run. This, however, is also true if the logging ends up + consuming most of the system memory. So pick your poison. The I/O logs are + saved normally at the end of a run, by decompressing the chunks and storing + them in the specified log file. This feature depends on the availability of + zlib. + +.. option:: log_compression_cpus=str + + Define the set of CPUs that are allowed to handle online log compression for + the I/O jobs. This can provide better isolation between performance + sensitive jobs, and background compression work. See + :option:`cpus_allowed` for the format used. + +.. option:: log_store_compressed=bool + + If set, fio will store the log files in a compressed format. They can be + decompressed with fio, using the :option:`--inflate-log` command line + parameter. The files will be stored with a :file:`.fz` suffix. + +.. option:: log_unix_epoch=bool + + If set, fio will log Unix timestamps to the log files produced by enabling + write_type_log for each log type, instead of the default zero-based + timestamps. + +.. option:: block_error_percentiles=bool + + If set, record errors in trim block-sized units from writes and trims and + output a histogram of how many trims it took to get to errors, and what kind + of error was encountered. + +.. option:: bwavgtime=int + + Average the calculated bandwidth over the given time. Value is specified in + milliseconds. If the job also does bandwidth logging through + :option:`write_bw_log`, then the minimum of this option and + :option:`log_avg_msec` will be used. Default: 500ms. + +.. option:: iopsavgtime=int + + Average the calculated IOPS over the given time. Value is specified in + milliseconds. If the job also does IOPS logging through + :option:`write_iops_log`, then the minimum of this option and + :option:`log_avg_msec` will be used. Default: 500ms. + +.. option:: disk_util=bool + + Generate disk utilization statistics, if the platform supports it. + Default: true. + +.. option:: disable_lat=bool + + Disable measurements of total latency numbers. Useful only for cutting back + the number of calls to :manpage:`gettimeofday(2)`, as that does impact + performance at really high IOPS rates. Note that to really get rid of a + large amount of these calls, this option must be used with + :option:`disable_slat` and :option:`disable_bw_measurement` as well. + +.. option:: disable_clat=bool + + Disable measurements of completion latency numbers. See + :option:`disable_lat`. + +.. option:: disable_slat=bool + + Disable measurements of submission latency numbers. See + :option:`disable_lat`. + +.. option:: disable_bw_measurement=bool, disable_bw=bool + + Disable measurements of throughput/bandwidth numbers. See + :option:`disable_lat`. + +.. option:: slat_percentiles=bool + + Report submission latency percentiles. Submission latency is not recorded + for synchronous ioengines. + +.. option:: clat_percentiles=bool + + Report completion latency percentiles. + +.. option:: lat_percentiles=bool + + Report total latency percentiles. Total latency is the sum of submission + latency and completion latency. + +.. option:: percentile_list=float_list + + Overwrite the default list of percentiles for latencies and the block error + histogram. Each number is a floating point number in the range (0,100], and + the maximum length of the list is 20. Use ``:`` to separate the numbers. 
For example, ``--percentile_list=99.5:99.9`` will cause fio to report the
+ latency durations below which 99.5% and 99.9% of the observed latencies fell,
+ respectively.
+
+.. option:: significant_figures=int
+
+ If using :option:`--output-format` of `normal`, set the significant
+ figures to this value. Higher values will yield more precise IOPS and
+ throughput units, while lower values will round. Requires a minimum
+ value of 1 and a maximum value of 10. Defaults to 4.
+
+
+Error handling
+~~~~~~~~~~~~~~
+
+.. option:: exitall_on_error
+
+ When one job finishes in error, terminate the rest. The default is to wait
+ for each job to finish.
+
+.. option:: continue_on_error=str
+
+ Normally fio will exit the job on the first observed failure. If this option
+ is set, fio will continue the job when there is a 'non-fatal error' (EIO or
+ EILSEQ) until the runtime is exceeded or the I/O size specified is
+ completed. If this option is used, there are two more stats that are
+ appended, the total error count and the first error. The error field given
+ in the stats is the first error that was hit during the run.
+
+ The allowed values are:
+
+ **none**
+ Exit on any I/O or verify errors.
+
+ **read**
+ Continue on read errors, exit on all others.
+
+ **write**
+ Continue on write errors, exit on all others.
+
+ **io**
+ Continue on any I/O error, exit on all others.
+
+ **verify**
+ Continue on verify errors, exit on all others.
+
+ **all**
+ Continue on all errors.
+
+ **0**
+ Backward-compatible alias for 'none'.
+
+ **1**
+ Backward-compatible alias for 'all'.
+
+.. option:: ignore_error=str
+
+ Sometimes you want to ignore some errors during a test, in which case you
+ can specify an error list for each error type instead of only being able to
+ ignore the default 'non-fatal error' using :option:`continue_on_error`. The
+ format is ``ignore_error=READ_ERR_LIST,WRITE_ERR_LIST,VERIFY_ERR_LIST``,
+ where the errors for a given error type are separated with ':'. An error may
+ be a symbol ('ENOSPC', 'ENOMEM') or an integer. Example::
+
+ ignore_error=EAGAIN,ENOSPC:122
+
+ This option will ignore EAGAIN from READ, and ENOSPC and 122 (EDQUOT) from
+ WRITE. This option works by overriding :option:`continue_on_error` with
+ the list of errors for each error type if any.
+
+.. option:: error_dump=bool
+
+ If set, dump every error even if it is non-fatal; true by default. If
+ disabled, only fatal errors will be dumped.
+
+Running predefined workloads
+----------------------------
+
+Fio includes predefined profiles that mimic the I/O workloads generated by
+other tools.
+
+.. option:: profile=str
+
+ The predefined workload to run. Current profiles are:
+
+ **tiobench**
+ Threaded I/O bench (tiotest/tiobench) like workload.
+
+ **act**
+ Aerospike Certification Tool (ACT) like workload.
+
+To view a profile's additional options use :option:`--cmdhelp` after specifying
+the profile. For example::
+
+ $ fio --profile=act --cmdhelp
+
+Act profile options
+~~~~~~~~~~~~~~~~~~~
+
+.. option:: device-names=str
+ :noindex:
+
+ Devices to use.
+
+.. option:: load=int
+ :noindex:
+
+ ACT load multiplier. Default: 1.
+
+.. option:: test-duration=time
+ :noindex:
+
+ How long the entire test takes to run. When the unit is omitted, the value
+ is given in seconds. Default: 24h.
+
+.. option:: threads-per-queue=int
+ :noindex:
+
+ Number of read I/O threads per device. Default: 8.
+
+.. option:: read-req-num-512-blocks=int
+ :noindex:
+
+ Number of 512B blocks to read at a time. Default: 3.
+
+.. 
option:: large-block-op-kbytes=int + :noindex: + + Size of large block ops in KiB (writes). Default: 131072. + +.. option:: prep + :noindex: + + Set to run ACT prep phase. + +Tiobench profile options +~~~~~~~~~~~~~~~~~~~~~~~~ + +.. option:: size=str + :noindex: + + Size in MiB. + +.. option:: block=int + :noindex: + + Block size in bytes. Default: 4096. + +.. option:: numruns=int + :noindex: + + Number of runs. + +.. option:: dir=str + :noindex: + + Test directory. + +.. option:: threads=int + :noindex: + + Number of threads. + +Interpreting the output +----------------------- + +.. + Example output was based on the following: + TZ=UTC fio --iodepth=8 --ioengine=null --size=100M --time_based \ + --rate=1256k --bs=14K --name=quick --runtime=1s --name=mixed \ + --runtime=2m --rw=rw + +Fio spits out a lot of output. While running, fio will display the status of the +jobs created. An example of that would be:: + + Jobs: 1 (f=1): [_(1),M(1)][24.8%][r=20.5MiB/s,w=23.5MiB/s][r=82,w=94 IOPS][eta 01m:31s] + +The characters inside the first set of square brackets denote the current status of +each thread. The first character is the first job defined in the job file, and so +forth. The possible values (in typical life cycle order) are: + ++------+-----+-----------------------------------------------------------+ +| Idle | Run | | ++======+=====+===========================================================+ +| P | | Thread setup, but not started. | ++------+-----+-----------------------------------------------------------+ +| C | | Thread created. | ++------+-----+-----------------------------------------------------------+ +| I | | Thread initialized, waiting or generating necessary data. | ++------+-----+-----------------------------------------------------------+ +| | p | Thread running pre-reading file(s). | ++------+-----+-----------------------------------------------------------+ +| | / | Thread is in ramp period. | ++------+-----+-----------------------------------------------------------+ +| | R | Running, doing sequential reads. | ++------+-----+-----------------------------------------------------------+ +| | r | Running, doing random reads. | ++------+-----+-----------------------------------------------------------+ +| | W | Running, doing sequential writes. | ++------+-----+-----------------------------------------------------------+ +| | w | Running, doing random writes. | ++------+-----+-----------------------------------------------------------+ +| | M | Running, doing mixed sequential reads/writes. | ++------+-----+-----------------------------------------------------------+ +| | m | Running, doing mixed random reads/writes. | ++------+-----+-----------------------------------------------------------+ +| | D | Running, doing sequential trims. | ++------+-----+-----------------------------------------------------------+ +| | d | Running, doing random trims. | ++------+-----+-----------------------------------------------------------+ +| | F | Running, currently waiting for :manpage:`fsync(2)`. | ++------+-----+-----------------------------------------------------------+ +| | V | Running, doing verification of written data. | ++------+-----+-----------------------------------------------------------+ +| f | | Thread finishing. | ++------+-----+-----------------------------------------------------------+ +| E | | Thread exited, not reaped by main thread yet. | ++------+-----+-----------------------------------------------------------+ +| _ | | Thread reaped. 
| ++------+-----+-----------------------------------------------------------+ +| X | | Thread reaped, exited with an error. | ++------+-----+-----------------------------------------------------------+ +| K | | Thread reaped, exited due to signal. | ++------+-----+-----------------------------------------------------------+ + +.. + Example output was based on the following: + TZ=UTC fio --iodepth=8 --ioengine=null --size=100M --runtime=58m \ + --time_based --rate=2512k --bs=256K --numjobs=10 \ + --name=readers --rw=read --name=writers --rw=write + +Fio will condense the thread string as not to take up more space on the command +line than needed. For instance, if you have 10 readers and 10 writers running, +the output would look like this:: + + Jobs: 20 (f=20): [R(10),W(10)][4.0%][r=20.5MiB/s,w=23.5MiB/s][r=82,w=94 IOPS][eta 57m:36s] + +Note that the status string is displayed in order, so it's possible to tell which of +the jobs are currently doing what. In the example above this means that jobs 1--10 +are readers and 11--20 are writers. + +The other values are fairly self explanatory -- number of threads currently +running and doing I/O, the number of currently open files (f=), the estimated +completion percentage, the rate of I/O since last check (read speed listed first, +then write speed and optionally trim speed) in terms of bandwidth and IOPS, +and time to completion for the current running group. It's impossible to estimate +runtime of the following groups (if any). + +.. + Example output was based on the following: + TZ=UTC fio --iodepth=16 --ioengine=posixaio --filename=/tmp/fiofile \ + --direct=1 --size=100M --time_based --runtime=50s --rate_iops=89 \ + --bs=7K --name=Client1 --rw=write + +When fio is done (or interrupted by :kbd:`Ctrl-C`), it will show the data for +each thread, group of threads, and disks in that order. For each overall thread (or +group) the output looks like:: + + Client1: (groupid=0, jobs=1): err= 0: pid=16109: Sat Jun 24 12:07:54 2017 + write: IOPS=88, BW=623KiB/s (638kB/s)(30.4MiB/50032msec) + slat (nsec): min=500, max=145500, avg=8318.00, stdev=4781.50 + clat (usec): min=170, max=78367, avg=4019.02, stdev=8293.31 + lat (usec): min=174, max=78375, avg=4027.34, stdev=8291.79 + clat percentiles (usec): + | 1.00th=[ 302], 5.00th=[ 326], 10.00th=[ 343], 20.00th=[ 363], + | 30.00th=[ 392], 40.00th=[ 404], 50.00th=[ 416], 60.00th=[ 445], + | 70.00th=[ 816], 80.00th=[ 6718], 90.00th=[12911], 95.00th=[21627], + | 99.00th=[43779], 99.50th=[51643], 99.90th=[68682], 99.95th=[72877], + | 99.99th=[78119] + bw ( KiB/s): min= 532, max= 686, per=0.10%, avg=622.87, stdev=24.82, samples= 100 + iops : min= 76, max= 98, avg=88.98, stdev= 3.54, samples= 100 + lat (usec) : 250=0.04%, 500=64.11%, 750=4.81%, 1000=2.79% + lat (msec) : 2=4.16%, 4=1.84%, 10=4.90%, 20=11.33%, 50=5.37% + lat (msec) : 100=0.65% + cpu : usr=0.27%, sys=0.18%, ctx=12072, majf=0, minf=21 + IO depths : 1=85.0%, 2=13.1%, 4=1.8%, 8=0.1%, 16=0.0%, 32=0.0%, >=64=0.0% + submit : 0=0.0%, 4=100.0%, 8=0.0%, 16=0.0%, 32=0.0%, 64=0.0%, >=64=0.0% + complete : 0=0.0%, 4=100.0%, 8=0.0%, 16=0.0%, 32=0.0%, 64=0.0%, >=64=0.0% + issued rwt: total=0,4450,0, short=0,0,0, dropped=0,0,0 + latency : target=0, window=0, percentile=100.00%, depth=8 + +The job name (or first job's name when using :option:`group_reporting`) is printed, +along with the group id, count of jobs being aggregated, last error id seen (which +is 0 when there are no errors), pid/tid of that thread and the time the job/group +completed. 
Below are the I/O statistics for each data direction performed (showing
+writes in the example above). In the order listed, they denote:
+
+**read/write/trim**
+ The string before the colon shows the I/O direction the statistics
+ are for. **IOPS** is the average I/Os performed per second. **BW**
+ is the average bandwidth rate shown as: value in power of 2 format
+ (value in power of 10 format). The last two values show: (**total
+ I/O performed** in power of 2 format / **runtime** of that thread).
+
+**slat**
+ Submission latency (**min** being the minimum, **max** being the
+ maximum, **avg** being the average, **stdev** being the standard
+ deviation). This is the time it took to submit the I/O. For
+ sync I/O this row is not displayed as the slat is really the
+ completion latency (since queue/complete is one operation there).
+ This value can be in nanoseconds, microseconds or milliseconds ---
+ fio will choose the most appropriate base and print that (in the
+ example above nanoseconds was the best scale). Note: in :option:`--minimal` mode
+ latencies are always expressed in microseconds.
+
+**clat**
+ Completion latency. Same names as slat, this denotes the time from
+ submission to completion of the I/O pieces. For sync I/O, clat will
+ usually be equal (or very close) to 0, as the time from submit to
+ complete is basically just CPU time (I/O has already been done, see slat
+ explanation).
+
+**lat**
+ Total latency. Same names as slat and clat, this denotes the time from
+ when fio created the I/O unit to completion of the I/O operation.
+
+**bw**
+ Bandwidth statistics based on samples. Same names as the xlat stats,
+ but also includes the number of samples taken (**samples**) and an
+ approximate percentage of total aggregate bandwidth this thread
+ received in its group (**per**). This last value is only really
+ useful if the threads in this group are on the same disk, since they
+ are then competing for disk access.
+
+**iops**
+ IOPS statistics based on samples. Same names as bw.
+
+**lat (nsec/usec/msec)**
+ The distribution of I/O completion latencies. This is the time from when
+ I/O leaves fio and when it gets completed. Unlike the separate
+ read/write/trim sections above, the data here and in the remaining
+ sections apply to all I/Os for the reporting group. 250=0.04% means that
+ 0.04% of the I/Os completed in under 250us. 500=64.11% means that 64.11%
+ of the I/Os required 250 to 499us for completion.
+
+**cpu**
+ CPU usage. User and system time, along with the number of context
+ switches this thread went through, and finally the number of major
+ and minor page faults. The CPU utilization numbers are averages for
+ the jobs in that reporting group, while the context and fault counters
+ are summed.
+
+**IO depths**
+ The distribution of I/O depths over the job lifetime. The numbers are
+ divided into powers of 2 and each entry covers depths from that value
+ up to those that are lower than the next entry -- e.g., 16= covers
+ depths from 16 to 31. Note that the range covered by a depth
+ distribution entry can be different to the range covered by the
+ equivalent submit/complete distribution entry.
+
+**IO submit**
+ How many pieces of I/O were submitted in a single submit call. Each
+ entry denotes that amount and below, until the previous entry -- e.g.,
+ 16=100% means that we submitted anywhere from 9 to 16 I/Os per submit
+ call.
Note that the range covered by a submit distribution entry can + be different to the range covered by the equivalent depth distribution + entry. + +**IO complete** + Like the above submit number, but for completions instead. + +**IO issued rwt** + The number of read/write/trim requests issued, and how many of them were + short or dropped. + +**IO latency** + These values are for :option:`latency_target` and related options. When + these options are engaged, this section describes the I/O depth required + to meet the specified latency target. + +.. + Example output was based on the following: + TZ=UTC fio --ioengine=null --iodepth=2 --size=100M --numjobs=2 \ + --rate_process=poisson --io_limit=32M --name=read --bs=128k \ + --rate=11M --name=write --rw=write --bs=2k --rate=700k + +After each client has been listed, the group statistics are printed. They +will look like this:: + + Run status group 0 (all jobs): + READ: bw=20.9MiB/s (21.9MB/s), 10.4MiB/s-10.8MiB/s (10.9MB/s-11.3MB/s), io=64.0MiB (67.1MB), run=2973-3069msec + WRITE: bw=1231KiB/s (1261kB/s), 616KiB/s-621KiB/s (630kB/s-636kB/s), io=64.0MiB (67.1MB), run=52747-53223msec + +For each data direction it prints: + +**bw** + Aggregate bandwidth of threads in this group followed by the + minimum and maximum bandwidth of all the threads in this group. + Values outside of brackets are power-of-2 format and those + within are the equivalent value in a power-of-10 format. +**io** + Aggregate I/O performed of all threads in this group. The + format is the same as bw. +**run** + The smallest and longest runtimes of the threads in this group. + +And finally, the disk statistics are printed. This is Linux specific. They will look like this:: + + Disk stats (read/write): + sda: ios=16398/16511, merge=30/162, ticks=6853/819634, in_queue=826487, util=100.00% + +Each value is printed for both reads and writes, with reads first. The +numbers denote: + +**ios** + Number of I/Os performed by all groups. +**merge** + Number of merges performed by the I/O scheduler. +**ticks** + Number of ticks we kept the disk busy. +**in_queue** + Total time spent in the disk queue. +**util** + The disk utilization. A value of 100% means we kept the disk + busy constantly, 50% would be a disk idling half of the time. + +It is also possible to get fio to dump the current output while it is running, +without terminating the job. To do that, send fio the **USR1** signal. You can +also get regularly timed dumps by using the :option:`--status-interval` +parameter, or by creating a file in :file:`/tmp` named +:file:`fio-dump-status`. If fio sees this file, it will unlink it and dump the +current output status. + + +Terse output +------------ + +For scripted usage where you typically want to generate tables or graphs of the +results, fio can output the results in a semicolon separated format. The format +is one long line of values, such as:: + + 2;card0;0;0;7139336;121836;60004;1;10109;27.932460;116.933948;220;126861;3495.446807;1085.368601;226;126864;3523.635629;1089.012448;24063;99944;50.275485%;59818.274627;5540.657370;7155060;122104;60004;1;8338;29.086342;117.839068;388;128077;5032.488518;1234.785715;391;128085;5061.839412;1236.909129;23436;100928;50.287926%;59964.832030;5644.844189;14.595833%;19.394167%;123706;0;7313;0.1%;0.1%;0.1%;0.1%;0.1%;0.1%;100.0%;0.00%;0.00%;0.00%;0.00%;0.00%;0.00%;0.01%;0.02%;0.05%;0.16%;6.04%;40.40%;52.68%;0.64%;0.01%;0.00%;0.01%;0.00%;0.00%;0.00%;0.00%;0.00% + A description of this job goes here. 
+ +The job description (if provided) follows on a second line for terse v2. +It appears on the same line for other terse versions. + +To enable terse output, use the :option:`--minimal` or +:option:`--output-format`\=terse command line options. The +first value is the version of the terse output format. If the output has to be +changed for some reason, this number will be incremented by 1 to signify that +change. + +Split up, the format is as follows (comments in brackets denote when a +field was introduced or whether it's specific to some terse version): + + :: + + terse version, fio version [v3], jobname, groupid, error + + READ status:: + + Total IO (KiB), bandwidth (KiB/sec), IOPS, runtime (msec) + Submission latency: min, max, mean, stdev (usec) + Completion latency: min, max, mean, stdev (usec) + Completion latency percentiles: 20 fields (see below) + Total latency: min, max, mean, stdev (usec) + Bw (KiB/s): min, max, aggregate percentage of total, mean, stdev, number of samples [v5] + IOPS [v5]: min, max, mean, stdev, number of samples + + WRITE status: + + :: + + Total IO (KiB), bandwidth (KiB/sec), IOPS, runtime (msec) + Submission latency: min, max, mean, stdev (usec) + Completion latency: min, max, mean, stdev (usec) + Completion latency percentiles: 20 fields (see below) + Total latency: min, max, mean, stdev (usec) + Bw (KiB/s): min, max, aggregate percentage of total, mean, stdev, number of samples [v5] + IOPS [v5]: min, max, mean, stdev, number of samples + + TRIM status [all but version 3]: + + Fields are similar to READ/WRITE status. + + CPU usage:: + + user, system, context switches, major faults, minor faults + + I/O depths:: + + <=1, 2, 4, 8, 16, 32, >=64 + + I/O latencies microseconds:: + + <=2, 4, 10, 20, 50, 100, 250, 500, 750, 1000 + + I/O latencies milliseconds:: + + <=2, 4, 10, 20, 50, 100, 250, 500, 750, 1000, 2000, >=2000 + + Disk utilization [v3]:: + + disk name, read ios, write ios, read merges, write merges, read ticks, write ticks, + time spent in queue, disk utilization percentage + + Additional Info (dependent on continue_on_error, default off):: + + total # errors, first error code + + Additional Info (dependent on description being set):: + + Text description + +Completion latency percentiles can be a grouping of up to 20 sets, so for the +terse output fio writes all of them. Each field will look like this:: + + 1.00%=6112 + +which is the Xth percentile, and the `usec` latency associated with it. + +For `Disk utilization`, all disks used by fio are shown. So for each disk there +will be a disk utilization section. 
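+
+As a hedged example of scripted use (the job file name here is a placeholder),
+the v3 field layout listed below can be sliced with standard tools; in that
+layout field 3 is the job name, field 7 the read bandwidth (KiB/sec) and
+field 8 the read IOPS::
+
+ $ fio --minimal jobfile.fio | awk -F';' '{print $3, $7, $8}'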
+ +Below is a single line containing short names for each of the fields in the +minimal output v3, separated by semicolons:: + + terse_version_3;fio_version;jobname;groupid;error;read_kb;read_bandwidth;read_iops;read_runtime_ms;read_slat_min;read_slat_max;read_slat_mean;read_slat_dev;read_clat_min;read_clat_max;read_clat_mean;read_clat_dev;read_clat_pct01;read_clat_pct02;read_clat_pct03;read_clat_pct04;read_clat_pct05;read_clat_pct06;read_clat_pct07;read_clat_pct08;read_clat_pct09;read_clat_pct10;read_clat_pct11;read_clat_pct12;read_clat_pct13;read_clat_pct14;read_clat_pct15;read_clat_pct16;read_clat_pct17;read_clat_pct18;read_clat_pct19;read_clat_pct20;read_tlat_min;read_lat_max;read_lat_mean;read_lat_dev;read_bw_min;read_bw_max;read_bw_agg_pct;read_bw_mean;read_bw_dev;write_kb;write_bandwidth;write_iops;write_runtime_ms;write_slat_min;write_slat_max;write_slat_mean;write_slat_dev;write_clat_min;write_clat_max;write_clat_mean;write_clat_dev;write_clat_pct01;write_clat_pct02;write_clat_pct03;write_clat_pct04;write_clat_pct05;write_clat_pct06;write_clat_pct07;write_clat_pct08;write_clat_pct09;write_clat_pct10;write_clat_pct11;write_clat_pct12;write_clat_pct13;write_clat_pct14;write_clat_pct15;write_clat_pct16;write_clat_pct17;write_clat_pct18;write_clat_pct19;write_clat_pct20;write_tlat_min;write_lat_max;write_lat_mean;write_lat_dev;write_bw_min;write_bw_max;write_bw_agg_pct;write_bw_mean;write_bw_dev;cpu_user;cpu_sys;cpu_csw;cpu_mjf;cpu_minf;iodepth_1;iodepth_2;iodepth_4;iodepth_8;iodepth_16;iodepth_32;iodepth_64;lat_2us;lat_4us;lat_10us;lat_20us;lat_50us;lat_100us;lat_250us;lat_500us;lat_750us;lat_1000us;lat_2ms;lat_4ms;lat_10ms;lat_20ms;lat_50ms;lat_100ms;lat_250ms;lat_500ms;lat_750ms;lat_1000ms;lat_2000ms;lat_over_2000ms;disk_name;disk_read_iops;disk_write_iops;disk_read_merges;disk_write_merges;disk_read_ticks;write_ticks;disk_queue_time;disk_util + +In client/server mode terse output differs from what appears when jobs are run +locally. Disk utilization data is omitted from the standard terse output and +for v3 and later appears on its own separate line at the end of each terse +reporting cycle. + + +JSON output +------------ + +The `json` output format is intended to be both human readable and convenient +for automated parsing. For the most part its sections mirror those of the +`normal` output. The `runtime` value is reported in msec and the `bw` value is +reported in 1024 bytes per second units. + + +JSON+ output +------------ + +The `json+` output format is identical to the `json` output format except that it +adds a full dump of the completion latency bins. Each `bins` object contains a +set of (key, value) pairs where keys are latency durations and values count how +many I/Os had completion latencies of the corresponding duration. For example, +consider: + + "bins" : { "87552" : 1, "89600" : 1, "94720" : 1, "96768" : 1, "97792" : 1, "99840" : 1, "100864" : 2, "103936" : 6, "104960" : 534, "105984" : 5995, "107008" : 7529, ... } + +This data indicates that one I/O required 87,552ns to complete, two I/Os required +100,864ns to complete, and 7529 I/Os required 107,008ns to complete. + +Also included with fio is a Python script `fio_jsonplus_clat2csv` that takes +json+ output and generates CSV-formatted latency data suitable for plotting. + +The latency durations actually represent the midpoints of latency intervals. +For details refer to :file:`stat.h`. + + +Trace file format +----------------- + +There are two trace file format that you can encounter. 
+
+
+Trace file format
+-----------------
+
+There are two trace file formats that you can encounter. The older (v1) format
+is unsupported since version 1.20-rc3 (March 2008). It will still be described
+below in case you get an old trace and want to understand it.
+
+In any case the trace is a simple text file with a single action per line.
+
+
+Trace file format v1
+~~~~~~~~~~~~~~~~~~~~
+
+Each line represents a single I/O action in the following format::
+
+    rw, offset, length
+
+where `rw=0/1` for read/write, and the `offset` and `length` entries are in
+bytes.
+
+This format is not supported in fio versions >= 1.20-rc3.
+
+
+Trace file format v2
+~~~~~~~~~~~~~~~~~~~~
+
+The second version of the trace file format was added in fio version 1.17. It
+allows access to more than one file per trace and has a bigger set of possible
+file actions.
+
+The first line of the trace file has to be::
+
+    fio version 2 iolog
+
+Following this can be lines in two different formats, which are described below.
+
+The file management format::
+
+    filename action
+
+The `filename` is given as an absolute path. The `action` can be one of these:
+
+**add**
+    Add the given `filename` to the trace.
+**open**
+    Open the file with the given `filename`. The `filename` has to have
+    been added with the **add** action before.
+**close**
+    Close the file with the given `filename`. The file has to have been
+    opened before.
+
+
+The file I/O action format::
+
+    filename action offset length
+
+The `filename` is given as an absolute path, and has to have been added and
+opened before it can be used with this format. The `offset` and `length` are
+given in bytes. The `action` can be one of these:
+
+**wait**
+    Wait for `offset` microseconds. Everything below 100 is discarded.
+    The time is relative to the previous `wait` statement.
+**read**
+    Read `length` bytes beginning from `offset`.
+**write**
+    Write `length` bytes beginning from `offset`.
+**sync**
+    :manpage:`fsync(2)` the file.
+**datasync**
+    :manpage:`fdatasync(2)` the file.
+**trim**
+    Trim the given file from the given `offset` for `length` bytes.
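+
+Putting the two formats together, a small but complete v2 trace might look
+like the following sketch; the filename and sizes are made up, and the
+``wait`` line uses the `offset` field for its microsecond delay (the `length`
+field is unused there)::
+
+    fio version 2 iolog
+    /tmp/testfile add
+    /tmp/testfile open
+    /tmp/testfile write 0 4096
+    /tmp/testfile wait 250 0
+    /tmp/testfile read 0 4096
+    /tmp/testfile close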
+
+
+I/O Replay - Merging Traces
+---------------------------
+
+Colocation is a common practice used to get the most out of a machine.
+Knowing which workloads play nicely with each other and which ones don't is
+a much harder task. While fio can replay workloads concurrently via multiple
+jobs, it leaves some variability up to the scheduler, making results harder to
+reproduce. Merging is a way to make the order of events consistent.
+
+Merging is integrated into I/O replay and done when a
+:option:`merge_blktrace_file` is specified. The list of files passed to
+:option:`read_iolog` goes through the merge process and a single merged file
+is written to the specified location. The output file is passed on as if it
+were the only file passed to :option:`read_iolog`. An example would look like::
+
+    $ fio --read_iolog="<file1>:<file2>" --merge_blktrace_file="<output_file>"
+
+Creating only the merged file can be done by passing the command line argument
+:option:`--merge-blktrace-only`.
+
+Scaling traces can be done to see the relative impact of any particular trace
+being slowed down or sped up. :option:`merge_blktrace_scalars` takes in a colon
+separated list of percentage scalars. It is index paired with the files passed
+to :option:`read_iolog`.
+
+With scaling, it may be desirable to match the running time of all traces.
+This can be done with :option:`merge_blktrace_iters`. It is index paired with
+:option:`read_iolog` just like :option:`merge_blktrace_scalars`.
+
+As an example, consider two traces, A and B, each 60s long. If we want to see
+the impact of trace A issuing IOs twice as fast and repeat trace A over the
+runtime of trace B, the following can be done::
+
+    $ fio --read_iolog="<trace_a>:<trace_b>" --merge_blktrace_file="<output_file>" --merge_blktrace_scalars="50:100" --merge_blktrace_iters="2:1"
+
+This runs trace A at 2x the speed twice for approximately the same runtime as
+a single run of trace B.
+
+
+CPU idleness profiling
+----------------------
+
+In some cases, we want to understand CPU overhead in a test. For example, we
+may test patches specifically to see whether they reduce CPU usage.
+Fio implements a balloon approach to create a thread per CPU that runs at idle
+priority, meaning that it only runs when nobody else needs the CPU.
+By measuring the amount of work completed by the thread, idleness of each CPU
+can be derived accordingly.
+
+A unit of work is defined as touching a full page of unsigned characters. The
+mean and standard deviation of the time to complete a unit of work are
+reported in the "unit work" section. Options can be chosen to report detailed
+percpu idleness or overall system idleness by aggregating percpu stats.
+
+
+Verification and triggers
+-------------------------
+
+When data verification is done, fio is usually run in one of two ways. The
+first is a normal write job of some sort with verify enabled. When the write
+phase has completed, fio switches to reads and verifies everything it wrote.
+The second model is running just the write phase, and then later on running
+the same job (but with reads instead of writes) to repeat the same I/O
+patterns and verify the contents. Both of these methods depend on the write
+phase being completed, as fio otherwise has no idea how much data was written.
+
+With verification triggers, fio supports dumping the current write state to
+local files. Then a subsequent read verify workload can load this state and know
+exactly where to stop. This is useful for testing cases where power is cut to a
+server in a managed fashion, for instance.
+
+A verification trigger consists of two things:
+
+1) Storing the write state of each job.
+2) Executing a trigger command.
+
+The write state is relatively small, on the order of hundreds of bytes to single
+kilobytes. It contains information on the number of completions done, the last X
+completions, etc.
+
+A trigger is invoked either through creation ('touch') of a specified file in
+the system, or through a timeout setting. If fio is run with
+:option:`--trigger-file`\= :file:`/tmp/trigger-file`, then it will continually
+check for the existence of :file:`/tmp/trigger-file`. When it sees this file, it
+will fire off the trigger (thus saving state, and executing the trigger
+command).
+
+For client/server runs, there's both a local and remote trigger. If fio is
+running as a server backend, it will send the job states back to the client for
+safe storage, then execute the remote trigger, if specified. If a local trigger
+is specified, the server will still send back the write state, but the client
+will then execute the trigger.
+
+Verification trigger example
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Let's say we want to run a powercut test on the remote Linux machine 'server'.
+Our write workload is in :file:`write-test.fio`. We want to cut power to 'server' at
+some point during the run, and we'll run this test from the safety of our local
+machine, 'localbox'.
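+
+For illustration, a minimal :file:`write-test.fio` for such a test might look
+like the sketch below; the device path is hypothetical, and the essential
+pieces are a write workload with a checksummed verify pattern that a later
+read pass can check::
+
+    ; write-test.fio -- hypothetical powercut write workload
+    [write-phase]
+    filename=/dev/sdX   ; illustrative device, substitute the real DUT
+    rw=write
+    bs=4k
+    direct=1
+    verify=crc32c       ; store checksums for the later read/verify pass
+    do_verify=0         ; this job only writes; verification comes later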
+On the server, we'll start the fio backend normally::
+
+    server# fio --server
+
+and on the client, we'll fire off the workload::
+
+    localbox$ fio --client=server --trigger-file=/tmp/my-trigger --trigger-remote="bash -c \"echo b > /proc/sysrq-trigger\""
+
+We set :file:`/tmp/my-trigger` as the trigger file, and we tell fio to execute::
+
+    echo b > /proc/sysrq-trigger
+
+on the server once it has received the trigger and sent us the write state. This
+will work, but it's not **really** cutting power to the server; it's merely
+abruptly rebooting it. If we have a remote way of cutting power to the server
+through IPMI or similar, we could do that through a local trigger command
+instead. Let's assume we have a script that does IPMI reboot of a given
+hostname, ipmi-reboot. On localbox, we could then have run fio with a local
+trigger instead::
+
+    localbox$ fio --client=server --trigger-file=/tmp/my-trigger --trigger="ipmi-reboot server"
+
+For this case, fio would wait for the server to send us the write state, then
+execute ``ipmi-reboot server`` when that happened.
+
+Loading verify state
+~~~~~~~~~~~~~~~~~~~~
+
+To load stored write state, a read verification job file must contain the
+:option:`verify_state_load` option. If that is set, fio will load the previously
+stored state. For a local fio run this is done by loading the files directly,
+and on a client/server run, the server backend will ask the client to send the
+files over and load them from there.
+
+
+Log File Formats
+----------------
+
+Fio supports a variety of log file formats, for logging latencies, bandwidth,
+and IOPS. The logs share a common format, which looks like this:
+
+    *time* (`msec`), *value*, *data direction*, *block size* (`bytes`),
+    *offset* (`bytes`)
+
+*Time* for the log entry is always in milliseconds. The *value* logged depends
+on the type of log; it will be one of the following:
+
+    **Latency log**
+        Value is latency in nsecs
+    **Bandwidth log**
+        Value is in KiB/sec
+    **IOPS log**
+        Value is IOPS
+
+*Data direction* is one of the following:
+
+    **0**
+        I/O is a READ
+    **1**
+        I/O is a WRITE
+    **2**
+        I/O is a TRIM
+
+The entry's *block size* is always in bytes. The *offset* is the position in bytes
+from the start of the file for that particular I/O. The logging of the offset can be
+toggled with :option:`log_offset`.
+
+Fio defaults to logging every individual I/O, but when windowed logging is set
+through :option:`log_avg_msec`, either the average (by default) or the maximum
+(when :option:`log_max_value` is set) *value* seen over the specified period of
+time is recorded. Each *data direction* seen within the window period will
+aggregate its values in a separate row. Further, when using windowed logging
+the *block size* and *offset* entries will always contain 0.
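+
+As a made-up illustration, a latency log written with :option:`log_offset`
+enabled could contain entries such as these, where the first is read as "2
+msec into the run, a 4096 byte READ at offset 0 completed in 229600 nsec"::
+
+    2, 229600, 0, 4096, 0
+    4, 241648, 0, 4096, 4096
+    6, 198712, 1, 4096, 8192
+
+The third entry's *data direction* of 1 marks it as a WRITE.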
+
+
+Client/Server
+-------------
+
+Normally fio is invoked as a stand-alone application on the machine where the
+I/O workload should be generated. However, the backend and frontend of fio can
+be run separately, i.e., the fio server can generate an I/O workload on the
+"Device Under Test" while being controlled by a client on another machine.
+
+Start the server on the machine which has access to the storage DUT::
+
+    $ fio --server=args
+
+where `args` defines what fio listens to. The arguments are of the form
+``type,hostname`` or ``IP,port``. *type* is either ``ip`` (or ip4) for TCP/IP
+v4, ``ip6`` for TCP/IP v6, or ``sock`` for a local unix domain socket.
+*hostname* is either a hostname or IP address, and *port* is the port to listen
+to (only valid for TCP/IP, not a local socket). Some examples:
+
+1) ``fio --server``
+
+   Start a fio server, listening on all interfaces on the default port (8765).
+
+2) ``fio --server=ip:hostname,4444``
+
+   Start a fio server, listening on IP belonging to hostname and on port 4444.
+
+3) ``fio --server=ip6:::1,4444``
+
+   Start a fio server, listening on IPv6 localhost ::1 and on port 4444.
+
+4) ``fio --server=,4444``
+
+   Start a fio server, listening on all interfaces on port 4444.
+
+5) ``fio --server=1.2.3.4``
+
+   Start a fio server, listening on IP 1.2.3.4 on the default port.
+
+6) ``fio --server=sock:/tmp/fio.sock``
+
+   Start a fio server, listening on the local socket :file:`/tmp/fio.sock`.
+
+Once a server is running, a "client" can connect to the fio server with::
+
+    fio <local-args> --client=<server> <remote-args> <job file(s)>
+
+where `local-args` are arguments for the client where it is running, `server`
+is the connect string, and `remote-args` and `job file(s)` are sent to the
+server. The `server` string follows the same format as it does on the server
+side, to allow IP/hostname/socket and port strings.
+
+Fio can connect to multiple servers this way::
+
+    fio --client=<server1> <job file(s)> --client=<server2> <job file(s)>
+
+If the job file is located on the fio server, then you can tell the server to
+load a local file as well. This is done by using :option:`--remote-config` ::
+
+    fio --client=server --remote-config /path/to/file.fio
+
+Then fio will open this local (to the server) job file instead of being passed
+one from the client.
+
+If you have many servers (for example, 100 VMs/containers), you can input a
+pathname of a file containing host IPs/names as the parameter value for the
+:option:`--client` option. For example, here is an example :file:`host.list`
+file containing 2 hostnames::
+
+    host1.your.dns.domain
+    host2.your.dns.domain
+
+The fio command would then be::
+
+    fio --client=host.list <job file(s)>
+
+In this mode, you cannot input server-specific parameters or job files -- all
+servers receive the same job file.
+
+To let ``fio --client`` runs use a shared filesystem from multiple hosts,
+``fio --client`` prepends the IP address of the server to the filename. For
+example, if fio is using the directory :file:`/mnt/nfs/fio` and is writing
+filename :file:`fileio.tmp`, with a :option:`--client` `hostfile` containing
+two hostnames ``h1`` and ``h2`` with IP addresses 192.168.10.120 and
+192.168.10.121, then fio will create two files::
+
+    /mnt/nfs/fio/192.168.10.120.fileio.tmp
+    /mnt/nfs/fio/192.168.10.121.fileio.tmp
+
+Terse output in client/server mode will differ slightly from what is produced
+when fio is run in stand-alone mode. See the terse output section for details.
diff --git a/MORAL-LICENSE b/MORAL-LICENSE
new file mode 100644
index 0000000..8ef3f26
--- /dev/null
+++ b/MORAL-LICENSE
@@ -0,0 +1,17 @@
+As specified by the COPYING file, fio is free software published under version
+2 of the GPL license. That covers the copying part of the license. When using
+fio, you are encouraged to uphold the following moral obligations:
+
+- If you publish results that are done using fio, it should be clearly stated
+  that fio was used. The specific version should also be listed.
+
+- If you develop features or bug fixes for fio, they should be sent upstream
+  for inclusion into the main repository. This isn't specific to fio; that
+  is a general rule for any open source project. It's just the Right Thing
+  to do.
Plus it means that you don't have to maintain the feature or change + internally. In the long run, this is saving you a lot of time. + +I would consider the above to fall under "common courtesy", but since +people tend to have differing opinions of that, it doesn't hurt to spell out +my expectations clearly. + diff --git a/Makefile b/Makefile new file mode 100644 index 0000000..9a5dea7 --- /dev/null +++ b/Makefile @@ -0,0 +1,564 @@ +ifeq ($(SRCDIR),) +SRCDIR := . +endif + +VPATH := $(SRCDIR) + +all: fio + +config-host.mak: configure + @if [ ! -e "$@" ]; then \ + echo "Running configure ..."; \ + ./configure; \ + else \ + echo "$@ is out-of-date, running configure"; \ + sed -n "/.*Configured with/s/[^:]*: //p" "$@" | sh; \ + fi + +ifneq ($(MAKECMDGOALS),clean) +include config-host.mak +endif + +DEBUGFLAGS = -DFIO_INC_DEBUG +CPPFLAGS= -D_LARGEFILE_SOURCE -D_FILE_OFFSET_BITS=64 -DFIO_INTERNAL $(DEBUGFLAGS) +OPTFLAGS= -g -ffast-math +CFLAGS = -std=gnu99 -Wwrite-strings -Wall -Wdeclaration-after-statement $(OPTFLAGS) $(EXTFLAGS) $(BUILD_CFLAGS) -I. -I$(SRCDIR) +LIBS += -lm $(EXTLIBS) +PROGS = fio +SCRIPTS = $(addprefix $(SRCDIR)/,tools/fio_generate_plots tools/plot/fio2gnuplot tools/genfio tools/fiologparser.py tools/hist/fiologparser_hist.py tools/fio_jsonplus_clat2csv) + +ifndef CONFIG_FIO_NO_OPT + CFLAGS += -O3 -U_FORTIFY_SOURCE -D_FORTIFY_SOURCE=2 +endif +ifdef CONFIG_BUILD_NATIVE + CFLAGS += -march=native +endif + +ifdef CONFIG_GFIO + PROGS += gfio +endif + +SOURCE := $(sort $(patsubst $(SRCDIR)/%,%,$(wildcard $(SRCDIR)/crc/*.c)) \ + $(patsubst $(SRCDIR)/%,%,$(wildcard $(SRCDIR)/lib/*.c))) \ + gettime.c ioengines.c init.c stat.c log.c time.c filesetup.c \ + eta.c verify.c memory.c io_u.c parse.c fio_sem.c rwlock.c \ + pshared.c options.c \ + smalloc.c filehash.c profile.c debug.c engines/cpu.c \ + engines/mmap.c engines/sync.c engines/null.c engines/net.c \ + engines/ftruncate.c engines/filecreate.c engines/filestat.c \ + server.c client.c iolog.c backend.c libfio.c flow.c cconv.c \ + gettime-thread.c helpers.c json.c idletime.c td_error.c \ + profiles/tiobench.c profiles/act.c io_u_queue.c filelock.c \ + workqueue.c rate-submit.c optgroup.c helper_thread.c \ + steadystate.c zone-dist.c + +ifdef CONFIG_LIBHDFS + HDFSFLAGS= -I $(JAVA_HOME)/include -I $(JAVA_HOME)/include/linux -I $(FIO_LIBHDFS_INCLUDE) + HDFSLIB= -Wl,-rpath $(JAVA_HOME)/jre/lib/$(FIO_HDFS_CPU)/server -L$(JAVA_HOME)/jre/lib/$(FIO_HDFS_CPU)/server $(FIO_LIBHDFS_LIB)/libhdfs.a -ljvm + CFLAGS += $(HDFSFLAGS) + SOURCE += engines/libhdfs.c +endif + +ifdef CONFIG_LIBISCSI + CFLAGS += $(LIBISCSI_CFLAGS) + LIBS += $(LIBISCSI_LIBS) + SOURCE += engines/libiscsi.c +endif + +ifdef CONFIG_LIBNBD + CFLAGS += $(LIBNBD_CFLAGS) + LIBS += $(LIBNBD_LIBS) + SOURCE += engines/nbd.c +endif + +ifdef CONFIG_64BIT + CFLAGS += -DBITS_PER_LONG=64 +endif +ifdef CONFIG_32BIT + CFLAGS += -DBITS_PER_LONG=32 +endif +ifdef CONFIG_LIBAIO + SOURCE += engines/libaio.c +endif +ifdef CONFIG_RDMA + SOURCE += engines/rdma.c +endif +ifdef CONFIG_POSIXAIO + SOURCE += engines/posixaio.c +endif +ifdef CONFIG_LINUX_FALLOCATE + SOURCE += engines/falloc.c +endif +ifdef CONFIG_LINUX_EXT4_MOVE_EXTENT + SOURCE += engines/e4defrag.c +endif +ifdef CONFIG_LINUX_SPLICE + SOURCE += engines/splice.c +endif +ifdef CONFIG_GUASI + SOURCE += engines/guasi.c +endif +ifdef CONFIG_SOLARISAIO + SOURCE += engines/solarisaio.c +endif +ifdef CONFIG_WINDOWSAIO + SOURCE += engines/windowsaio.c +endif +ifdef CONFIG_RADOS + SOURCE += engines/rados.c +endif +ifdef CONFIG_RBD + SOURCE += 
engines/rbd.c +endif +ifdef CONFIG_HTTP + SOURCE += engines/http.c +endif +SOURCE += oslib/asprintf.c +ifndef CONFIG_STRSEP + SOURCE += oslib/strsep.c +endif +ifndef CONFIG_STRCASESTR + SOURCE += oslib/strcasestr.c +endif +ifndef CONFIG_STRLCAT + SOURCE += oslib/strlcat.c +endif +ifndef CONFIG_HAVE_STRNDUP + SOURCE += oslib/strndup.c +endif +ifndef CONFIG_GETOPT_LONG_ONLY + SOURCE += oslib/getopt_long.c +endif +ifndef CONFIG_INET_ATON + SOURCE += oslib/inet_aton.c +endif +ifndef CONFIG_HAVE_STATX + SOURCE += oslib/statx.c +endif +ifdef CONFIG_GFAPI + SOURCE += engines/glusterfs.c + SOURCE += engines/glusterfs_sync.c + SOURCE += engines/glusterfs_async.c + ifdef CONFIG_GF_FADVISE + CFLAGS += "-DGFAPI_USE_FADVISE" + endif +endif +ifdef CONFIG_MTD + SOURCE += engines/mtd.c + SOURCE += oslib/libmtd.c + SOURCE += oslib/libmtd_legacy.c +endif +ifdef CONFIG_PMEMBLK + SOURCE += engines/pmemblk.c +endif +ifdef CONFIG_LINUX_DEVDAX + SOURCE += engines/dev-dax.c +endif +ifdef CONFIG_LIBPMEM + SOURCE += engines/libpmem.c +endif +ifdef CONFIG_IME + SOURCE += engines/ime.c +endif +ifdef CONFIG_LINUX_BLKZONED + SOURCE += zbd.c +endif + +ifeq ($(CONFIG_TARGET_OS), Linux) + SOURCE += diskutil.c fifo.c blktrace.c cgroup.c trim.c engines/sg.c \ + oslib/linux-dev-lookup.c engines/io_uring.c + LIBS += -lpthread -ldl + LDFLAGS += -rdynamic +endif +ifeq ($(CONFIG_TARGET_OS), Android) + SOURCE += diskutil.c fifo.c blktrace.c cgroup.c trim.c profiles/tiobench.c \ + oslib/linux-dev-lookup.c + LIBS += -ldl -llog + LDFLAGS += -rdynamic +endif +ifeq ($(CONFIG_TARGET_OS), SunOS) + LIBS += -lpthread -ldl + CPPFLAGS += -D__EXTENSIONS__ +endif +ifeq ($(CONFIG_TARGET_OS), FreeBSD) + SOURCE += trim.c + LIBS += -lpthread -lrt + LDFLAGS += -rdynamic +endif +ifeq ($(CONFIG_TARGET_OS), OpenBSD) + LIBS += -lpthread + LDFLAGS += -rdynamic +endif +ifeq ($(CONFIG_TARGET_OS), NetBSD) + LIBS += -lpthread -lrt + LDFLAGS += -rdynamic +endif +ifeq ($(CONFIG_TARGET_OS), DragonFly) + SOURCE += trim.c + LIBS += -lpthread -lrt + LDFLAGS += -rdynamic +endif +ifeq ($(CONFIG_TARGET_OS), AIX) + LIBS += -lpthread -ldl -lrt + CPPFLAGS += -D_LARGE_FILES -D__ppc__ + LDFLAGS += -L/opt/freeware/lib -Wl,-blibpath:/opt/freeware/lib:/usr/lib:/lib -Wl,-bmaxdata:0x80000000 +endif +ifeq ($(CONFIG_TARGET_OS), HP-UX) + LIBS += -lpthread -ldl -lrt + CFLAGS += -D_LARGEFILE64_SOURCE -D_XOPEN_SOURCE_EXTENDED +endif +ifeq ($(CONFIG_TARGET_OS), Darwin) + LIBS += -lpthread -ldl +endif +ifneq (,$(findstring CYGWIN,$(CONFIG_TARGET_OS))) + SOURCE += os/windows/cpu-affinity.c os/windows/posix.c + WINDOWS_OBJS = os/windows/cpu-affinity.o os/windows/posix.o lib/hweight.o + LIBS += -lpthread -lpsapi -lws2_32 -lssp + CFLAGS += -DPSAPI_VERSION=1 -Ios/windows/posix/include -Wno-format +endif + +OBJS := $(SOURCE:.c=.o) + +FIO_OBJS = $(OBJS) fio.o + +GFIO_OBJS = $(OBJS) gfio.o graph.o tickmarks.o ghelpers.o goptions.o gerror.o \ + gclient.o gcompat.o cairo_text_helpers.o printing.o + +ifdef CONFIG_ARITHMETIC +FIO_OBJS += lex.yy.o y.tab.o +GFIO_OBJS += lex.yy.o y.tab.o +endif + +-include $(OBJS:.o=.d) + +T_SMALLOC_OBJS = t/stest.o +T_SMALLOC_OBJS += gettime.o fio_sem.o pshared.o smalloc.o t/log.o t/debug.o \ + t/arch.o +T_SMALLOC_PROGS = t/stest + +T_IEEE_OBJS = t/ieee754.o +T_IEEE_OBJS += lib/ieee754.o +T_IEEE_PROGS = t/ieee754 + +T_ZIPF_OBS = t/genzipf.o +T_ZIPF_OBJS += t/log.o lib/ieee754.o lib/rand.o lib/pattern.o lib/zipf.o \ + lib/strntol.o lib/gauss.o t/genzipf.o oslib/strcasestr.o \ + oslib/strndup.o +T_ZIPF_PROGS = t/fio-genzipf + +T_AXMAP_OBJS = t/axmap.o +T_AXMAP_OBJS 
+= lib/lfsr.o lib/axmap.o +T_AXMAP_PROGS = t/axmap + +T_LFSR_TEST_OBJS = t/lfsr-test.o +T_LFSR_TEST_OBJS += lib/lfsr.o gettime.o fio_sem.o pshared.o \ + t/log.o t/debug.o t/arch.o +T_LFSR_TEST_PROGS = t/lfsr-test + +T_GEN_RAND_OBJS = t/gen-rand.o +T_GEN_RAND_OBJS += t/log.o t/debug.o lib/rand.o lib/pattern.o lib/strntol.o \ + oslib/strcasestr.o oslib/strndup.o +T_GEN_RAND_PROGS = t/gen-rand + +ifeq ($(CONFIG_TARGET_OS), Linux) +T_BTRACE_FIO_OBJS = t/btrace2fio.o +T_BTRACE_FIO_OBJS += fifo.o lib/flist_sort.o t/log.o oslib/linux-dev-lookup.o +T_BTRACE_FIO_PROGS = t/fio-btrace2fio +endif + +T_DEDUPE_OBJS = t/dedupe.o +T_DEDUPE_OBJS += lib/rbtree.o t/log.o fio_sem.o pshared.o smalloc.o gettime.o \ + crc/md5.o lib/memalign.o lib/bloom.o t/debug.o crc/xxhash.o \ + t/arch.o crc/murmur3.o crc/crc32c.o crc/crc32c-intel.o \ + crc/crc32c-arm64.o crc/fnv.o +T_DEDUPE_PROGS = t/fio-dedupe + +T_VS_OBJS = t/verify-state.o t/log.o crc/crc32c.o crc/crc32c-intel.o crc/crc32c-arm64.o t/debug.o +T_VS_PROGS = t/fio-verify-state + +T_PIPE_ASYNC_OBJS = t/read-to-pipe-async.o +T_PIPE_ASYNC_PROGS = t/read-to-pipe-async + +T_IOU_RING_OBJS = t/io_uring.o +T_IOU_RING_PROGS = t/io_uring + +T_MEMLOCK_OBJS = t/memlock.o +T_MEMLOCK_PROGS = t/memlock + +T_TT_OBJS = t/time-test.o +T_TT_PROGS = t/time-test + +T_OBJS = $(T_SMALLOC_OBJS) +T_OBJS += $(T_IEEE_OBJS) +T_OBJS += $(T_ZIPF_OBJS) +T_OBJS += $(T_AXMAP_OBJS) +T_OBJS += $(T_LFSR_TEST_OBJS) +T_OBJS += $(T_GEN_RAND_OBJS) +T_OBJS += $(T_BTRACE_FIO_OBJS) +T_OBJS += $(T_DEDUPE_OBJS) +T_OBJS += $(T_VS_OBJS) +T_OBJS += $(T_PIPE_ASYNC_OBJS) +T_OBJS += $(T_MEMLOCK_OBJS) +T_OBJS += $(T_TT_OBJS) +T_OBJS += $(T_IOU_RING_OBJS) + +ifneq (,$(findstring CYGWIN,$(CONFIG_TARGET_OS))) + T_DEDUPE_OBJS += $(WINDOWS_OBJS) + T_SMALLOC_OBJS += $(WINDOWS_OBJS) + T_LFSR_TEST_OBJS += $(WINDOWS_OBJS) +endif + +T_TEST_PROGS = $(T_SMALLOC_PROGS) +T_TEST_PROGS += $(T_IEEE_PROGS) +T_PROGS += $(T_ZIPF_PROGS) +T_TEST_PROGS += $(T_AXMAP_PROGS) +T_TEST_PROGS += $(T_LFSR_TEST_PROGS) +T_TEST_PROGS += $(T_GEN_RAND_PROGS) +T_PROGS += $(T_BTRACE_FIO_PROGS) +T_PROGS += $(T_DEDUPE_PROGS) +T_PROGS += $(T_VS_PROGS) +T_TEST_PROGS += $(T_MEMLOCK_PROGS) +ifdef CONFIG_PREAD +T_TEST_PROGS += $(T_PIPE_ASYNC_PROGS) +endif +ifneq (,$(findstring Linux,$(CONFIG_TARGET_OS))) +T_TEST_PROGS += $(T_IOU_RING_PROGS) +endif + +PROGS += $(T_PROGS) + +ifdef CONFIG_HAVE_CUNIT +UT_OBJS = unittests/unittest.o +UT_OBJS += unittests/lib/memalign.o +UT_OBJS += unittests/lib/strntol.o +UT_OBJS += unittests/oslib/strlcat.o +UT_OBJS += unittests/oslib/strndup.o +UT_OBJS += unittests/oslib/strcasestr.o +UT_OBJS += unittests/oslib/strsep.o +UT_TARGET_OBJS = lib/memalign.o +UT_TARGET_OBJS += lib/strntol.o +UT_TARGET_OBJS += oslib/strlcat.o +UT_TARGET_OBJS += oslib/strndup.o +UT_TARGET_OBJS += oslib/strcasestr.o +UT_TARGET_OBJS += oslib/strsep.o +UT_PROGS = unittests/unittest +else +UT_OBJS = +UT_TARGET_OBJS = +UT_PROGS = +endif + +ifneq ($(findstring $(MAKEFLAGS),s),s) +ifndef V + QUIET_CC = @echo ' ' CC $@; + QUIET_LINK = @echo ' ' LINK $@; + QUIET_DEP = @echo ' ' DEP $@; + QUIET_YACC = @echo ' ' YACC $@; + QUIET_LEX = @echo ' ' LEX $@; +endif +endif + +ifeq ($(CONFIG_TARGET_OS), SunOS) + INSTALL = ginstall +else + INSTALL = install +endif +prefix = $(INSTALL_PREFIX) +bindir = $(prefix)/bin + +ifeq ($(CONFIG_TARGET_OS), Darwin) +mandir = /usr/share/man +sharedir = /usr/share/fio +else +mandir = $(prefix)/man +sharedir = $(prefix)/share/fio +endif + +all: $(PROGS) $(T_TEST_PROGS) $(UT_PROGS) $(SCRIPTS) FORCE + +.PHONY: all install clean test 
+.PHONY: FORCE cscope + +FIO-VERSION-FILE: FORCE + @$(SHELL) $(SRCDIR)/FIO-VERSION-GEN +-include FIO-VERSION-FILE + +override CFLAGS += -DFIO_VERSION='"$(FIO_VERSION)"' + +%.o : %.c + @mkdir -p $(dir $@) + $(QUIET_CC)$(CC) -o $@ $(CFLAGS) $(CPPFLAGS) -c $< + @$(CC) -MM $(CFLAGS) $(CPPFLAGS) $(SRCDIR)/$*.c > $*.d + @mv -f $*.d $*.d.tmp + @sed -e 's|.*:|$*.o:|' < $*.d.tmp > $*.d + @if type -p fmt >/dev/null 2>&1; then \ + sed -e 's/.*://' -e 's/\\$$//' < $*.d.tmp | fmt -w 1 | \ + sed -e 's/^ *//' -e 's/$$/:/' >> $*.d; \ + else \ + sed -e 's/.*://' -e 's/\\$$//' < $*.d.tmp | \ + tr -cs "[:graph:]" "\n" | \ + sed -e 's/^ *//' -e '/^$$/ d' -e 's/$$/:/' >> $*.d; \ + fi + @rm -f $*.d.tmp + +ifdef CONFIG_ARITHMETIC +lex.yy.c: exp/expression-parser.l +ifdef CONFIG_LEX_USE_O + $(QUIET_LEX)$(LEX) -o $@ $< +else + $(QUIET_LEX)$(LEX) $< +endif + +lex.yy.o: lex.yy.c y.tab.h + $(QUIET_CC)$(CC) -o $@ $(CFLAGS) $(CPPFLAGS) -c $< + +y.tab.o: y.tab.c y.tab.h + $(QUIET_CC)$(CC) -o $@ $(CFLAGS) $(CPPFLAGS) -c $< + +y.tab.c: exp/expression-parser.y + $(QUIET_YACC)$(YACC) -o $@ -l -d -b y $< + +y.tab.h: y.tab.c + +lexer.h: lex.yy.c + +exp/test-expression-parser.o: exp/test-expression-parser.c + $(QUIET_CC)$(CC) -o $@ $(CFLAGS) $(CPPFLAGS) -c $< +exp/test-expression-parser: exp/test-expression-parser.o + $(QUIET_LINK)$(CC) $(LDFLAGS) $(CFLAGS) $< y.tab.o lex.yy.o -o $@ $(LIBS) + +parse.o: lex.yy.o y.tab.o +endif + +init.o: init.c FIO-VERSION-FILE + +gcompat.o: gcompat.c gcompat.h + $(QUIET_CC)$(CC) $(CFLAGS) $(GTK_CFLAGS) $(CPPFLAGS) -c $< + +goptions.o: goptions.c goptions.h + $(QUIET_CC)$(CC) $(CFLAGS) $(GTK_CFLAGS) $(CPPFLAGS) -c $< + +ghelpers.o: ghelpers.c ghelpers.h + $(QUIET_CC)$(CC) $(CFLAGS) $(GTK_CFLAGS) $(CPPFLAGS) -c $< + +gerror.o: gerror.c gerror.h + $(QUIET_CC)$(CC) $(CFLAGS) $(GTK_CFLAGS) $(CPPFLAGS) -c $< + +gclient.o: gclient.c gclient.h + $(QUIET_CC)$(CC) $(CFLAGS) $(GTK_CFLAGS) $(CPPFLAGS) -c $< + +gfio.o: gfio.c ghelpers.c + $(QUIET_CC)$(CC) $(CFLAGS) $(GTK_CFLAGS) $(CPPFLAGS) -c $< + +graph.o: graph.c graph.h + $(QUIET_CC)$(CC) $(CFLAGS) $(GTK_CFLAGS) $(CPPFLAGS) -c $< + +cairo_text_helpers.o: cairo_text_helpers.c cairo_text_helpers.h + $(QUIET_CC)$(CC) $(CFLAGS) $(GTK_CFLAGS) $(CPPFLAGS) -c $< + +printing.o: printing.c printing.h + $(QUIET_CC)$(CC) $(CFLAGS) $(GTK_CFLAGS) $(CPPFLAGS) -c $< + +t/io_uring.o: os/linux/io_uring.h +t/io_uring: $(T_IOU_RING_OBJS) + $(QUIET_LINK)$(CC) $(LDFLAGS) $(CFLAGS) -o $@ $(T_IOU_RING_OBJS) $(LIBS) + +t/read-to-pipe-async: $(T_PIPE_ASYNC_OBJS) + $(QUIET_LINK)$(CC) $(LDFLAGS) $(CFLAGS) -o $@ $(T_PIPE_ASYNC_OBJS) $(LIBS) + +t/memlock: $(T_MEMLOCK_OBJS) + $(QUIET_LINK)$(CC) $(LDFLAGS) $(CFLAGS) -o $@ $(T_MEMLOCK_OBJS) $(LIBS) + +t/stest: $(T_SMALLOC_OBJS) + $(QUIET_LINK)$(CC) $(LDFLAGS) $(CFLAGS) -o $@ $(T_SMALLOC_OBJS) $(LIBS) + +t/ieee754: $(T_IEEE_OBJS) + $(QUIET_LINK)$(CC) $(LDFLAGS) $(CFLAGS) -o $@ $(T_IEEE_OBJS) $(LIBS) + +fio: $(FIO_OBJS) + $(QUIET_LINK)$(CC) $(LDFLAGS) $(CFLAGS) -o $@ $(FIO_OBJS) $(LIBS) $(HDFSLIB) + +gfio: $(GFIO_OBJS) + $(QUIET_LINK)$(CC) $(filter-out -static, $(LDFLAGS)) -o gfio $(GFIO_OBJS) $(LIBS) $(GFIO_LIBS) $(GTK_LDFLAGS) $(HDFSLIB) + +t/fio-genzipf: $(T_ZIPF_OBJS) + $(QUIET_LINK)$(CC) $(LDFLAGS) $(CFLAGS) -o $@ $(T_ZIPF_OBJS) $(LIBS) + +t/axmap: $(T_AXMAP_OBJS) + $(QUIET_LINK)$(CC) $(LDFLAGS) $(CFLAGS) -o $@ $(T_AXMAP_OBJS) $(LIBS) + +t/lfsr-test: $(T_LFSR_TEST_OBJS) + $(QUIET_LINK)$(CC) $(LDFLAGS) $(CFLAGS) -o $@ $(T_LFSR_TEST_OBJS) $(LIBS) + +t/gen-rand: $(T_GEN_RAND_OBJS) + $(QUIET_LINK)$(CC) $(LDFLAGS) $(CFLAGS) -o $@ 
$(T_GEN_RAND_OBJS) $(LIBS) + +ifeq ($(CONFIG_TARGET_OS), Linux) +t/fio-btrace2fio: $(T_BTRACE_FIO_OBJS) + $(QUIET_LINK)$(CC) $(LDFLAGS) $(CFLAGS) -o $@ $(T_BTRACE_FIO_OBJS) $(LIBS) +endif + +t/fio-dedupe: $(T_DEDUPE_OBJS) + $(QUIET_LINK)$(CC) $(LDFLAGS) $(CFLAGS) -o $@ $(T_DEDUPE_OBJS) $(LIBS) + +t/fio-verify-state: $(T_VS_OBJS) + $(QUIET_LINK)$(CC) $(LDFLAGS) $(CFLAGS) -o $@ $(T_VS_OBJS) $(LIBS) + +t/time-test: $(T_TT_OBJS) + $(QUIET_LINK)$(CC) $(LDFLAGS) $(CFLAGS) -o $@ $(T_TT_OBJS) $(LIBS) + +ifdef CONFIG_HAVE_CUNIT +unittests/unittest: $(UT_OBJS) $(UT_TARGET_OBJS) + $(QUIET_LINK)$(CC) $(LDFLAGS) $(CFLAGS) -o $@ $(UT_OBJS) $(UT_TARGET_OBJS) -lcunit $(LIBS) +endif + +clean: FORCE + @rm -f .depend $(FIO_OBJS) $(GFIO_OBJS) $(OBJS) $(T_OBJS) $(UT_OBJS) $(PROGS) $(T_PROGS) $(T_TEST_PROGS) core.* core gfio unittests/unittest FIO-VERSION-FILE *.[do] lib/*.d oslib/*.[do] crc/*.d engines/*.[do] profiles/*.[do] t/*.[do] unittests/*.[do] unittests/*/*.[do] config-host.mak config-host.h y.tab.[ch] lex.yy.c exp/*.[do] lexer.h + @rm -f t/fio-btrace2fio t/io_uring t/read-to-pipe-async + @rm -rf doc/output + +distclean: clean FORCE + @rm -f cscope.out fio.pdf fio_generate_plots.pdf fio2gnuplot.pdf fiologparser_hist.pdf + +cscope: + @cscope -b -R + +tools/plot/fio2gnuplot.1: + @cat tools/plot/fio2gnuplot.manpage | txt2man -t fio2gnuplot > tools/plot/fio2gnuplot.1 + +doc: tools/plot/fio2gnuplot.1 + @man -t ./fio.1 | ps2pdf - fio.pdf + @man -t tools/fio_generate_plots.1 | ps2pdf - fio_generate_plots.pdf + @man -t tools/plot/fio2gnuplot.1 | ps2pdf - fio2gnuplot.pdf + @man -t tools/hist/fiologparser_hist.py.1 | ps2pdf - fiologparser_hist.pdf + +test: fio + ./fio --minimal --thread --exitall_on_error --runtime=1s --name=nulltest --ioengine=null --rw=randrw --iodepth=2 --norandommap --random_generator=tausworthe64 --size=16T --name=verifyfstest --filename=fiotestfile.tmp --unlink=1 --rw=write --verify=crc32c --verify_state_save=0 --size=16K + +fulltest: + sudo modprobe null_blk && \ + if [ ! -e /usr/include/libzbc/zbc.h ]; then \ + git clone https://github.com/hgst/libzbc && \ + (cd libzbc && \ + ./autogen.sh && \ + ./configure --prefix=/usr && \ + make -j && \ + sudo make install) \ + fi && \ + sudo t/zbd/run-tests-against-regular-nullb && \ + if [ -e /sys/module/null_blk/parameters/zoned ]; then \ + sudo t/zbd/run-tests-against-zoned-nullb; \ + fi + +install: $(PROGS) $(SCRIPTS) tools/plot/fio2gnuplot.1 FORCE + $(INSTALL) -m 755 -d $(DESTDIR)$(bindir) + $(INSTALL) $(PROGS) $(SCRIPTS) $(DESTDIR)$(bindir) + $(INSTALL) -m 755 -d $(DESTDIR)$(mandir)/man1 + $(INSTALL) -m 644 $(SRCDIR)/fio.1 $(DESTDIR)$(mandir)/man1 + $(INSTALL) -m 644 $(SRCDIR)/tools/fio_generate_plots.1 $(DESTDIR)$(mandir)/man1 + $(INSTALL) -m 644 $(SRCDIR)/tools/plot/fio2gnuplot.1 $(DESTDIR)$(mandir)/man1 + $(INSTALL) -m 644 $(SRCDIR)/tools/hist/fiologparser_hist.py.1 $(DESTDIR)$(mandir)/man1 + $(INSTALL) -m 755 -d $(DESTDIR)$(sharedir) + $(INSTALL) -m 644 $(SRCDIR)/tools/plot/*gpm $(DESTDIR)$(sharedir)/ + +.PHONY: test fulltest diff --git a/README b/README new file mode 100644 index 0000000..0f943bc --- /dev/null +++ b/README @@ -0,0 +1,283 @@ +Overview and history +-------------------- + +Fio was originally written to save me the hassle of writing special test case +programs when I wanted to test a specific workload, either for performance +reasons or to find/reproduce a bug. The process of writing such a test app can +be tiresome, especially if you have to do it often. 
Hence I needed a tool that
+would be able to simulate a given I/O workload without resorting to writing a
+tailored test case again and again.
+
+A test workload is difficult to define, though. There can be any number of
+processes or threads involved, and they can each be using their own way of
+generating I/O. You could have someone dirtying large amounts of memory in a
+memory mapped file, or maybe several threads issuing reads using asynchronous
+I/O. fio needed to be flexible enough to simulate both of these cases, and many
+more.
+
+Fio spawns a number of threads or processes doing a particular type of I/O
+action as specified by the user. fio takes a number of global parameters, each
+inherited by the thread unless parameters overriding that setting are given
+to it. The typical use of fio is to write a job file matching the I/O load
+one wants to simulate.
+
+
+Source
+------
+
+Fio resides in a git repo; the canonical place is:
+
+    git://git.kernel.dk/fio.git
+
+When inside a corporate firewall, the git:// URL sometimes does not work.
+If git:// does not work, use the http protocol instead:
+
+    http://git.kernel.dk/fio.git
+
+Snapshots are frequently generated, and the :file:`fio-git-*.tar.gz` tarballs
+include the git meta data as well. Other tarballs are archives of official
+fio releases. Snapshots can be downloaded from:
+
+    http://brick.kernel.dk/snaps/
+
+There are also two official mirrors. Both of these are automatically synced
+with the main repository when changes are pushed. If the main repo is down for
+some reason, either one of these is safe to use as a backup:
+
+    git://git.kernel.org/pub/scm/linux/kernel/git/axboe/fio.git
+
+    https://git.kernel.org/pub/scm/linux/kernel/git/axboe/fio.git
+
+or
+
+    git://github.com/axboe/fio.git
+
+    https://github.com/axboe/fio.git
+
+
+Mailing list
+------------
+
+The fio project mailing list is meant for anything related to fio including
+general discussion, bug reporting, questions, and development. For bug
+reporting, see REPORTING-BUGS.
+
+An automated mail detailing recent commits is sent to the list at most once a
+day. The list address is fio@vger.kernel.org; subscribe by sending an email
+to majordomo@vger.kernel.org with
+
+    subscribe fio
+
+in the body of the email. Archives can be found here:
+
+    http://www.spinics.net/lists/fio/
+
+and archives for the old list can be found here:
+
+    http://maillist.kernel.dk/fio-devel/
+
+
+Author
+------
+
+Fio was written by Jens Axboe <axboe@kernel.dk> to enable flexible testing of
+the Linux I/O subsystem and schedulers. He got tired of writing specific test
+applications to simulate a given workload, and found that the existing I/O
+benchmark/test tools out there weren't flexible enough to do what he wanted.
+
+Jens Axboe <axboe@kernel.dk> 20060905
+
+
+Binary packages
+---------------
+
+Debian:
+    Starting with Debian "Squeeze", fio packages are part of the official
+    Debian repository. http://packages.debian.org/search?keywords=fio .
+
+Ubuntu:
+    Starting with Ubuntu 10.04 LTS (aka "Lucid Lynx"), fio packages are part
+    of the Ubuntu "universe" repository.
+    http://packages.ubuntu.com/search?keywords=fio .
+
+Red Hat, Fedora, CentOS & Co:
+    Starting with Fedora 9/Extra Packages for Enterprise Linux 4, fio
+    packages are part of the Fedora/EPEL repositories.
+    https://apps.fedoraproject.org/packages/fio .
+
+Mandriva:
+    Mandriva has integrated fio into their package repository, so installing
+    on that distro should be as easy as typing ``urpmi fio``.
+ +Arch Linux: + An Arch Linux package is provided under the Community sub-repository: + https://www.archlinux.org/packages/?sort=&q=fio + +Solaris: + Packages for Solaris are available from OpenCSW. Install their pkgutil + tool (http://www.opencsw.org/get-it/pkgutil/) and then install fio via + ``pkgutil -i fio``. + +Windows: + Rebecca Cran has fio packages for Windows at + https://bsdio.com/fio/ . The latest builds for Windows can also + be grabbed from https://ci.appveyor.com/project/axboe/fio by clicking + the latest x86 or x64 build, then selecting the ARTIFACTS tab. + +BSDs: + Packages for BSDs may be available from their binary package repositories. + Look for a package "fio" using their binary package managers. + + +Building +-------- + +Just type:: + + $ ./configure + $ make + $ make install + +Note that GNU make is required. On BSDs it's available from devel/gmake within +ports directory; on Solaris it's in the SUNWgmake package. On platforms where +GNU make isn't the default, type ``gmake`` instead of ``make``. + +Configure will print the enabled options. Note that on Linux based platforms, +the libaio development packages must be installed to use the libaio +engine. Depending on distro, it is usually called libaio-devel or libaio-dev. + +For gfio, gtk 2.18 (or newer), associated glib threads, and cairo are required +to be installed. gfio isn't built automatically and can be enabled with a +``--enable-gfio`` option to configure. + +To build fio with a cross-compiler:: + + $ make clean + $ make CROSS_COMPILE=/path/to/toolchain/prefix + +Configure will attempt to determine the target platform automatically. + +It's possible to build fio for ESX as well, use the ``--esx`` switch to +configure. + + +Windows +~~~~~~~ + +On Windows, Cygwin (https://www.cygwin.com/) is required in order to build +fio. To create an MSI installer package install WiX from +https://wixtoolset.org and run :file:`dobuild.cmd` from the :file:`os/windows` +directory. + +How to compile fio on 64-bit Windows: + + 1. Install Cygwin (http://www.cygwin.com/). Install **make** and all + packages starting with **mingw64-x86_64**. Ensure + **mingw64-x86_64-zlib** are installed if you wish + to enable fio's log compression functionality. + 2. Open the Cygwin Terminal. + 3. Go to the fio directory (source files). + 4. Run ``make clean && make -j``. + +To build fio for 32-bit Windows, ensure the -i686 versions of the previously +mentioned -x86_64 packages are installed and run ``./configure +--build-32bit-win`` before ``make``. To build an fio that supports versions of +Windows below Windows 7/Windows Server 2008 R2 also add ``--target-win-ver=xp`` +to the end of the configure line that you run before doing ``make``. + +It's recommended that once built or installed, fio be run in a Command Prompt or +other 'native' console such as console2, since there are known to be display and +signal issues when running it under a Cygwin shell (see +https://github.com/mintty/mintty/issues/56 and +https://github.com/mintty/mintty/wiki/Tips#inputoutput-interaction-with-alien-programs +for details). + + +Documentation +~~~~~~~~~~~~~ + +Fio uses Sphinx_ to generate documentation from the reStructuredText_ files. +To build HTML formatted documentation run ``make -C doc html`` and direct your +browser to :file:`./doc/output/html/index.html`. To build manual page run +``make -C doc man`` and then ``man doc/output/man/fio.1``. To see what other +output formats are supported run ``make -C doc help``. + +.. 
_reStructuredText: http://www.sphinx-doc.org/rest.html +.. _Sphinx: http://www.sphinx-doc.org + + +Platforms +--------- + +Fio works on (at least) Linux, Solaris, AIX, HP-UX, OSX, NetBSD, OpenBSD, +Windows, FreeBSD, and DragonFly. Some features and/or options may only be +available on some of the platforms, typically because those features only apply +to that platform (like the solarisaio engine, or the splice engine on Linux). + +Some features are not available on FreeBSD/Solaris even if they could be +implemented, I'd be happy to take patches for that. An example of that is disk +utility statistics and (I think) huge page support, support for that does exist +in FreeBSD/Solaris. + +Fio uses pthread mutexes for signalling and locking and some platforms do not +support process shared pthread mutexes. As a result, on such platforms only +threads are supported. This could be fixed with sysv ipc locking or other +locking alternatives. + +Other \*BSD platforms are untested, but fio should work there almost out of the +box. Since I don't do test runs or even compiles on those platforms, your +mileage may vary. Sending me patches for other platforms is greatly +appreciated. There's a lot of value in having the same test/benchmark tool +available on all platforms. + +Note that POSIX aio is not enabled by default on AIX. Messages like these:: + + Symbol resolution failed for /usr/lib/libc.a(posix_aio.o) because: + Symbol _posix_kaio_rdwr (number 2) is not exported from dependent module /unix. + +indicate one needs to enable POSIX aio. Run the following commands as root:: + + # lsdev -C -l posix_aio0 + posix_aio0 Defined Posix Asynchronous I/O + # cfgmgr -l posix_aio0 + # lsdev -C -l posix_aio0 + posix_aio0 Available Posix Asynchronous I/O + +POSIX aio should work now. To make the change permanent:: + + # chdev -l posix_aio0 -P -a autoconfig='available' + posix_aio0 changed + + +Running fio +----------- + +Running fio is normally the easiest part - you just give it the job file +(or job files) as parameters:: + + $ fio [options] [jobfile] ... + +and it will start doing what the *jobfile* tells it to do. You can give more +than one job file on the command line, fio will serialize the running of those +files. Internally that is the same as using the :option:`stonewall` parameter +described in the parameter section. + +If the job file contains only one job, you may as well just give the parameters +on the command line. The command line parameters are identical to the job +parameters, with a few extra that control global parameters. For example, for +the job file parameter :option:`iodepth=2 `, the mirror command line +option would be :option:`--iodepth 2 ` or :option:`--iodepth=2 +`. You can also use the command line for giving more than one job +entry. For each :option:`--name ` option that fio sees, it will start a +new job with that name. Command line entries following a +:option:`--name ` entry will apply to that job, until there are no more +entries or a new :option:`--name ` entry is seen. This is similar to the +job file options, where each option applies to the current job until a new [] +job entry is seen. + +fio does not need to run as root, except if the files or devices specified in +the job section requires that. Some other options may also be restricted, such +as memory locking, I/O scheduler switching, and decreasing the nice value. + +If *jobfile* is specified as ``-``, the job file will be read from standard +input. 
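+
+As an illustrative sketch (job names and values here are arbitrary), the
+following job file and command line describe the same two-job run::
+
+    ; example.fio
+    [global]
+    rw=randread
+    size=128m
+
+    [job1]
+
+    [job2]
+
+which could equally be given as::
+
+    $ fio --name=global --rw=randread --size=128m --name=job1 --name=job2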
diff --git a/REPORTING-BUGS b/REPORTING-BUGS new file mode 100644 index 0000000..327b6ca --- /dev/null +++ b/REPORTING-BUGS @@ -0,0 +1,21 @@ +Reporting a bug +--------------- + +If you notice anything that seems like a fio bug, please do send email +to the list (fio@vger.kernel.org, see README) about it. If you are not +running the newest release of fio, upgrading first is recommended. + +When reporting a bug, you'll need to include: + +1) A description of what you think the bug is +2) Environment (Linux distro version, kernel version). This is mostly + needed if it's a build bug. +3) The output from fio --version. +4) How to reproduce. Please include a full list of the parameters + passed to fio and the job file used (if any). + +A bug report can't have too much information. Any time information that +is left out and has to be asked for will add to the turn-around time +of getting to the bottom of the issue, and an eventual fix. + +That's it! diff --git a/SERVER-TODO b/SERVER-TODO new file mode 100644 index 0000000..b988405 --- /dev/null +++ b/SERVER-TODO @@ -0,0 +1,2 @@ +- Collate ETA output from multiple connections into 1 +- If group_reporting is set, collate final output from multiple connections diff --git a/STEADYSTATE-TODO b/STEADYSTATE-TODO new file mode 100644 index 0000000..e4b146e --- /dev/null +++ b/STEADYSTATE-TODO @@ -0,0 +1,14 @@ +Known issues/TODO (for steady-state) + +- Allow user to specify the frequency of measurements + +- Better documentation for output + +- Report read, write, trim IOPS/BW separately + +- Semantics for the ring buffer ss->head are confusing. ss->head points + to the beginning of the buffer up through the point where the buffer + is filled for the first time. afterwards, when a new element is added, + ss->head is advanced to point to the second element in the buffer. if + steady state is attained upon adding a new element, ss->head is not + advanced so it actually does point to the head of the buffer. diff --git a/arch/arch-aarch64.h b/arch/arch-aarch64.h new file mode 100644 index 0000000..2a86cc5 --- /dev/null +++ b/arch/arch-aarch64.h @@ -0,0 +1,30 @@ +#ifndef ARCH_AARCH64_H +#define ARCH_AARCH64_H + +#include +#include +#include +#include + +#define FIO_ARCH (arch_aarch64) + +#define nop do { __asm__ __volatile__ ("yield"); } while (0) +#define read_barrier() do { __sync_synchronize(); } while (0) +#define write_barrier() do { __sync_synchronize(); } while (0) + +static inline int arch_ffz(unsigned long bitmask) +{ + unsigned long count, reversed_bits; + if (~bitmask == 0) /* ffz() in lib/ffz.h does this. 
*/ + return 63; + + __asm__ __volatile__ ("rbit %1, %2\n" + "clz %0, %1\n" : + "=r"(count), "=&r"(reversed_bits) : + "r"(~bitmask)); + return count; +} + +#define ARCH_HAVE_FFZ + +#endif diff --git a/arch/arch-alpha.h b/arch/arch-alpha.h new file mode 100644 index 0000000..9318e15 --- /dev/null +++ b/arch/arch-alpha.h @@ -0,0 +1,10 @@ +#ifndef ARCH_ALPHA_H +#define ARCH_ALPHA_H + +#define FIO_ARCH (arch_alpha) + +#define nop do { } while (0) +#define read_barrier() __asm__ __volatile__("mb": : :"memory") +#define write_barrier() __asm__ __volatile__("wmb": : :"memory") + +#endif diff --git a/arch/arch-arm.h b/arch/arch-arm.h new file mode 100644 index 0000000..b356712 --- /dev/null +++ b/arch/arch-arm.h @@ -0,0 +1,22 @@ +#ifndef ARCH_ARM_H +#define ARCH_ARM_H + +#define FIO_ARCH (arch_arm) + +#if defined (__ARM_ARCH_4__) || defined (__ARM_ARCH_4T__) \ + || defined (__ARM_ARCH_5__) || defined (__ARM_ARCH_5T__) || defined (__ARM_ARCH_5E__)\ + || defined (__ARM_ARCH_5TE__) || defined (__ARM_ARCH_5TEJ__) \ + || defined(__ARM_ARCH_6__) || defined(__ARM_ARCH_6J__) || defined(__ARM_ARCH_6Z__) || defined(__ARM_ARCH_6ZK__) \ + || defined(__ARM_ARCH_6KZ__) || defined(__ARM_ARCH_6K__) +#define nop __asm__ __volatile__("mov\tr0,r0\t@ nop\n\t") +#define read_barrier() __asm__ __volatile__ ("" : : : "memory") +#define write_barrier() __asm__ __volatile__ ("" : : : "memory") +#elif defined(__ARM_ARCH_7A__) || defined(__ARM_ARCH_7VE__) || defined(__ARM_ARCH_8A__) +#define nop __asm__ __volatile__ ("nop") +#define read_barrier() __sync_synchronize() +#define write_barrier() __sync_synchronize() +#else +#error "unsupported ARM architecture" +#endif + +#endif diff --git a/arch/arch-generic.h b/arch/arch-generic.h new file mode 100644 index 0000000..a0b71f8 --- /dev/null +++ b/arch/arch-generic.h @@ -0,0 +1,10 @@ +#ifndef ARCH_GENERIC_H +#define ARCH_GENERIC_H + +#define FIO_ARCH (arch_generic) + +#define nop do { } while (0) +#define read_barrier() __asm__ __volatile__("": : :"memory") +#define write_barrier() __asm__ __volatile__("": : :"memory") + +#endif diff --git a/arch/arch-hppa.h b/arch/arch-hppa.h new file mode 100644 index 0000000..eb4fc33 --- /dev/null +++ b/arch/arch-hppa.h @@ -0,0 +1,11 @@ +#ifndef ARCH_HPPA_H +#define ARCH_HPPA_H + +#define FIO_ARCH (arch_hppa) + +#define nop do { } while (0) + +#define read_barrier() __asm__ __volatile__ ("" : : : "memory") +#define write_barrier() __asm__ __volatile__ ("" : : : "memory") + +#endif diff --git a/arch/arch-ia64.h b/arch/arch-ia64.h new file mode 100644 index 0000000..ece3f7e --- /dev/null +++ b/arch/arch-ia64.h @@ -0,0 +1,41 @@ +#ifndef ARCH_IA64_H +#define ARCH_IA64_H + +#define FIO_ARCH (arch_ia64) + +#define nop asm volatile ("hint @pause" ::: "memory"); +#define read_barrier() asm volatile ("mf" ::: "memory") +#define write_barrier() asm volatile ("mf" ::: "memory") + +#define ia64_popcnt(x) \ +({ \ + unsigned long ia64_intri_res; \ + asm ("popcnt %0=%1" : "=r" (ia64_intri_res) : "r" (x)); \ + ia64_intri_res; \ +}) + +static inline unsigned long arch_ffz(unsigned long bitmask) +{ + return ia64_popcnt(bitmask & (~bitmask - 1)); +} + +static inline unsigned long long get_cpu_clock(void) +{ + unsigned long long ret; + + __asm__ __volatile__("mov %0=ar.itc" : "=r" (ret) : : "memory"); + return ret; +} + +#define ARCH_HAVE_INIT +extern bool tsc_reliable; +static inline int arch_init(char *envp[]) +{ + tsc_reliable = true; + return 0; +} + +#define ARCH_HAVE_FFZ +#define ARCH_HAVE_CPU_CLOCK + +#endif diff --git a/arch/arch-mips.h b/arch/arch-mips.h 
new file mode 100644 index 0000000..6f157fb --- /dev/null +++ b/arch/arch-mips.h @@ -0,0 +1,10 @@ +#ifndef ARCH_MIPS64_H +#define ARCH_MIPS64_H + +#define FIO_ARCH (arch_mips) + +#define read_barrier() __asm__ __volatile__("": : :"memory") +#define write_barrier() __asm__ __volatile__("": : :"memory") +#define nop __asm__ __volatile__("": : :"memory") + +#endif diff --git a/arch/arch-ppc.h b/arch/arch-ppc.h new file mode 100644 index 0000000..804d596 --- /dev/null +++ b/arch/arch-ppc.h @@ -0,0 +1,149 @@ +#ifndef ARCH_PPC_H +#define ARCH_PPC_H + +#include +#include +#include +#include + +#define FIO_ARCH (arch_ppc) + +#define nop do { } while (0) + +#ifdef __powerpc64__ +#define read_barrier() __asm__ __volatile__ ("lwsync" : : : "memory") +#else +#define read_barrier() __asm__ __volatile__ ("sync" : : : "memory") +#endif + +#define write_barrier() __asm__ __volatile__ ("sync" : : : "memory") + +#ifdef __powerpc64__ +#define PPC_CNTLZL "cntlzd" +#else +#define PPC_CNTLZL "cntlzw" +#endif + +static inline int __ilog2(unsigned long bitmask) +{ + int lz; + + asm (PPC_CNTLZL " %0,%1" : "=r" (lz) : "r" (bitmask)); + return BITS_PER_LONG - 1 - lz; +} + +static inline int arch_ffz(unsigned long bitmask) +{ + if ((bitmask = ~bitmask) == 0) + return BITS_PER_LONG; + return __ilog2(bitmask & -bitmask); +} + +static inline unsigned int mfspr(unsigned int reg) +{ + unsigned int val; + + asm volatile("mfspr %0,%1": "=r" (val) : "K" (reg)); + return val; +} + +#define SPRN_TBRL 0x10C /* Time Base Register Lower */ +#define SPRN_TBRU 0x10D /* Time Base Register Upper */ +#define SPRN_ATBL 0x20E /* Alternate Time Base Lower */ +#define SPRN_ATBU 0x20F /* Alternate Time Base Upper */ + +#ifdef __powerpc64__ +static inline unsigned long long get_cpu_clock(void) +{ + unsigned long long rval; + + asm volatile( + "90: mfspr %0, %1;\n" + " cmpwi %0,0;\n" + " beq- 90b;\n" + : "=r" (rval) + : "i" (SPRN_TBRL) + : "cr0"); + + return rval; +} +#else +static inline unsigned long long get_cpu_clock(void) +{ + unsigned int tbl, tbu0, tbu1; + unsigned long long ret; + + do { + if (arch_flags & ARCH_FLAG_1) { + tbu0 = mfspr(SPRN_ATBU); + tbl = mfspr(SPRN_ATBL); + tbu1 = mfspr(SPRN_ATBU); + } else { + tbu0 = mfspr(SPRN_TBRU); + tbl = mfspr(SPRN_TBRL); + tbu1 = mfspr(SPRN_TBRU); + } + } while (tbu0 != tbu1); + + ret = (((unsigned long long)tbu0) << 32) | tbl; + return ret; +} +#endif + +#if 0 +static void atb_child(void) +{ + arch_flags |= ARCH_FLAG_1; + get_cpu_clock(); + _exit(0); +} + +static void atb_clocktest(void) +{ + pid_t pid; + + pid = fork(); + if (!pid) + atb_child(); + else if (pid != -1) { + int status; + + pid = wait(&status); + if (pid == -1 || !WIFEXITED(status)) + arch_flags &= ~ARCH_FLAG_1; + else + arch_flags |= ARCH_FLAG_1; + } +} +#endif + +#define ARCH_HAVE_INIT +extern bool tsc_reliable; + +static inline int arch_init(char *envp[]) +{ +#if 0 + tsc_reliable = true; + atb_clocktest(); +#endif + return 0; +} + +#define ARCH_HAVE_FFZ + +/* + * We don't have it on all platforms, lets comment this out until we + * can handle it more intelligently. 
+ * + * #define ARCH_HAVE_CPU_CLOCK + */ + +/* + * Let's have it defined for ppc64 + */ + +#ifdef __powerpc64__ +#define ARCH_HAVE_CPU_CLOCK +#endif + +#endif diff --git a/arch/arch-s390.h b/arch/arch-s390.h new file mode 100644 index 0000000..6bf033b --- /dev/null +++ b/arch/arch-s390.h @@ -0,0 +1,38 @@ +#ifndef ARCH_S390_H +#define ARCH_S390_H + +#define FIO_ARCH (arch_s390) + +#define nop asm volatile("nop" : : : "memory") +#define read_barrier() asm volatile("bcr 15,0" : : : "memory") +#define write_barrier() asm volatile("bcr 15,0" : : : "memory") + +static inline unsigned long long get_cpu_clock(void) +{ + unsigned long long clk; + +#ifdef CONFIG_S390_Z196_FACILITIES + /* + * Fio needs monotonic (never lower), but not strict monotonic (never + * the same) so store clock fast is enough. + */ + __asm__ __volatile__("stckf %0" : "=Q" (clk) : : "cc"); +#else + __asm__ __volatile__("stck %0" : "=Q" (clk) : : "cc"); +#endif + return clk>>12; +} + +#define ARCH_CPU_CLOCK_CYCLES_PER_USEC 1 +#define ARCH_HAVE_CPU_CLOCK +#undef ARCH_CPU_CLOCK_WRAPS + +#define ARCH_HAVE_INIT +extern bool tsc_reliable; +static inline int arch_init(char *envp[]) +{ + tsc_reliable = true; + return 0; +} + +#endif diff --git a/arch/arch-sh.h b/arch/arch-sh.h new file mode 100644 index 0000000..58ff226 --- /dev/null +++ b/arch/arch-sh.h @@ -0,0 +1,49 @@ +/* Renesas SH (32bit) only */ + +#ifndef ARCH_SH_H +#define ARCH_SH_H + +#define FIO_ARCH (arch_sh) + +#define nop __asm__ __volatile__ ("nop": : :"memory") + +#define mb() \ + do { \ + if (arch_flags & ARCH_FLAG_1) \ + __asm__ __volatile__ ("synco": : :"memory"); \ + else \ + __asm__ __volatile__ (" " : : : "memory"); \ + } while (0) + +#define read_barrier() mb() +#define write_barrier() mb() + +#include +#include + +extern unsigned long arch_flags; + +#define CPU_HAS_LLSC 0x0040 + +static inline int arch_init(char *envp[]) +{ + Elf32_auxv_t *auxv; + + while (*envp++ != NULL) + ; + + for (auxv = (Elf32_auxv_t *) envp; auxv->a_type != AT_NULL; auxv++) { + if (auxv->a_type == AT_HWCAP) { + if (auxv->a_un.a_val & CPU_HAS_LLSC) { + arch_flags |= ARCH_FLAG_1; + break; + } + } + } + + return 0; +} + +#define ARCH_HAVE_INIT + +#endif diff --git a/arch/arch-sparc.h b/arch/arch-sparc.h new file mode 100644 index 0000000..f82a1f2 --- /dev/null +++ b/arch/arch-sparc.h @@ -0,0 +1,11 @@ +#ifndef ARCH_SPARC_H +#define ARCH_SPARC_H + +#define FIO_ARCH (arch_sparc) + +#define nop do { } while (0) + +#define read_barrier() __asm__ __volatile__ ("" : : : "memory") +#define write_barrier() __asm__ __volatile__ ("" : : : "memory") + +#endif diff --git a/arch/arch-sparc64.h b/arch/arch-sparc64.h new file mode 100644 index 0000000..80c697b --- /dev/null +++ b/arch/arch-sparc64.h @@ -0,0 +1,18 @@ +#ifndef ARCH_SPARC64_H +#define ARCH_SPARC64_H + +#define FIO_ARCH (arch_sparc64) + +#define nop do { } while (0) + +#define membar_safe(type) \ + do { __asm__ __volatile__("ba,pt %%xcc, 1f\n\t" \ + " membar " type "\n" \ + "1:\n" \ + : : : "memory"); \ + } while (0) + +#define read_barrier() membar_safe("#LoadLoad") +#define write_barrier() membar_safe("#StoreStore") + +#endif diff --git a/arch/arch-x86-common.h b/arch/arch-x86-common.h new file mode 100644 index 0000000..f32835c --- /dev/null +++ b/arch/arch-x86-common.h @@ -0,0 +1,77 @@ +#ifndef FIO_ARCH_X86_COMMON +#define FIO_ARCH_X86_COMMON + +#include + +static inline void cpuid(unsigned int op, + unsigned int *eax, unsigned int *ebx, + unsigned int *ecx, unsigned int *edx) +{ + *eax = op; + *ecx = 0; + do_cpuid(eax, ebx, ecx, edx); +} 
+
+#define ARCH_HAVE_INIT
+
+extern bool tsc_reliable;
+extern int arch_random;
+
+static inline void arch_init_intel(void)
+{
+	unsigned int eax, ebx, ecx = 0, edx;
+
+	/*
+	 * Check for TSC
+	 */
+	eax = 1;
+	do_cpuid(&eax, &ebx, &ecx, &edx);
+	if (!(edx & (1U << 4)))
+		return;
+
+	/*
+	 * Check for constant rate and synced (across cores) TSC
+	 */
+	eax = 0x80000007;
+	do_cpuid(&eax, &ebx, &ecx, &edx);
+	tsc_reliable = (edx & (1U << 8)) != 0;
+
+	/*
+	 * Check for RDRAND (CPUID leaf 1, ECX bit 30)
+	 */
+	eax = 0x1;
+	do_cpuid(&eax, &ebx, &ecx, &edx);
+	arch_random = (ecx & (1U << 30)) != 0;
+}
+
+static inline void arch_init_amd(void)
+{
+	unsigned int eax, ebx, ecx, edx;
+
+	cpuid(0x80000000, &eax, &ebx, &ecx, &edx);
+	if (eax < 0x80000007)
+		return;
+
+	cpuid(0x80000007, &eax, &ebx, &ecx, &edx);
+	tsc_reliable = (edx & (1U << 8)) != 0;
+}
+
+static inline void arch_init(char *envp[])
+{
+	unsigned int level;
+	char str[13];
+
+	arch_random = tsc_reliable = 0;
+
+	cpuid(0, &level, (unsigned int *) &str[0],
+		(unsigned int *) &str[8],
+		(unsigned int *) &str[4]);
+
+	str[12] = '\0';
+	if (!strcmp(str, "GenuineIntel"))
+		arch_init_intel();
+	else if (!strcmp(str, "AuthenticAMD") || !strcmp(str, "HygonGenuine"))
+		arch_init_amd();
+}
+
+#endif
diff --git a/arch/arch-x86.h b/arch/arch-x86.h
new file mode 100644
index 0000000..c6bcb54
--- /dev/null
+++ b/arch/arch-x86.h
@@ -0,0 +1,40 @@
+#ifndef ARCH_X86_H
+#define ARCH_X86_H
+
+static inline void do_cpuid(unsigned int *eax, unsigned int *ebx,
+			    unsigned int *ecx, unsigned int *edx)
+{
+	asm volatile("xchgl %%ebx, %1\ncpuid\nxchgl %%ebx, %1"
+		: "=a" (*eax), "=r" (*ebx), "=c" (*ecx), "=d" (*edx)
+		: "0" (*eax)
+		: "memory");
+}
+
+#include "arch-x86-common.h" /* IWYU pragma: export */
+
+#define FIO_ARCH	(arch_x86)
+
+#define FIO_HUGE_PAGE	4194304
+
+#define nop		__asm__ __volatile__("rep;nop": : :"memory")
+#define read_barrier()	__asm__ __volatile__("": : :"memory")
+#define write_barrier()	__asm__ __volatile__("": : :"memory")
+
+static inline unsigned long arch_ffz(unsigned long bitmask)
+{
+	__asm__("bsfl %1,%0" :"=r" (bitmask) :"r" (~bitmask));
+	return bitmask;
+}
+
+static inline unsigned long long get_cpu_clock(void)
+{
+	unsigned long long ret;
+
+	__asm__ __volatile__("rdtsc" : "=A" (ret));
+	return ret;
+}
+
+#define ARCH_HAVE_FFZ
+#define ARCH_HAVE_CPU_CLOCK
+
+#endif
diff --git a/arch/arch-x86_64.h b/arch/arch-x86_64.h
new file mode 100644
index 0000000..25850f9
--- /dev/null
+++ b/arch/arch-x86_64.h
@@ -0,0 +1,71 @@
+#ifndef ARCH_X86_64_H
+#define ARCH_X86_64_H
+
+static inline void do_cpuid(unsigned int *eax, unsigned int *ebx,
+			    unsigned int *ecx, unsigned int *edx)
+{
+	asm volatile("cpuid"
+		: "=a" (*eax), "=b" (*ebx), "=c" (*ecx), "=d" (*edx)
+		: "0" (*eax), "2" (*ecx)
+		: "memory");
+}
+
+#include "arch-x86-common.h" /* IWYU pragma: export */
+
+#define FIO_ARCH	(arch_x86_64)
+
+#define FIO_HUGE_PAGE	2097152
+
+#define nop		__asm__ __volatile__("rep;nop": : :"memory")
+#define read_barrier()	__asm__ __volatile__("":::"memory")
+#define write_barrier()	__asm__ __volatile__("":::"memory")
+
+static inline unsigned long arch_ffz(unsigned long bitmask)
+{
+	__asm__("bsf %1,%0" :"=r" (bitmask) :"r" (~bitmask));
+	return bitmask;
+}
+
+static inline unsigned long long get_cpu_clock(void)
+{
+	unsigned int lo, hi;
+
+	__asm__ __volatile__("rdtsc" : "=a" (lo), "=d" (hi));
+	return ((unsigned long long) hi << 32ULL) | lo;
+}
+
+#define ARCH_HAVE_FFZ
+#define ARCH_HAVE_SSE4_2
+#define ARCH_HAVE_CPU_CLOCK
+
+#define RDRAND_LONG	".byte 0x48,0x0f,0xc7,0xf0"
+#define RDSEED_LONG	".byte 0x48,0x0f,0xc7,0xf8"
+#define RDRAND_RETRY	100
+
+static inline int arch_rand_long(unsigned long *val)
+{
+	int ok;
+
+	asm volatile("1: " RDRAND_LONG "\n\t"
+		     "jc 2f\n\t"
+		     "decl %0\n\t"
+		     "jnz 1b\n\t"
+		     "2:"
+		     : "=r" (ok), "=a" (*val)
+		     : "0" (RDRAND_RETRY));
+
+	return ok;
+}
+
+static inline int arch_rand_seed(unsigned long *seed)
+{
+	unsigned char ok;
+
+	asm volatile(RDSEED_LONG "\n\t"
+		     "setc %0"
+		     : "=qm" (ok), "=a" (*seed));
+
+	/* nonzero means RDSEED succeeded, matching arch_rand_long() */
+	return ok;
+}
+
+#endif
diff --git a/arch/arch.h b/arch/arch.h
new file mode 100644
index 0000000..30c0d20
--- /dev/null
+++ b/arch/arch.h
@@ -0,0 +1,107 @@
+#ifndef ARCH_H
+#define ARCH_H
+
+#include "../lib/types.h"
+
+enum {
+	arch_x86_64 = 1,
+	arch_x86,
+	arch_ppc,
+	arch_ia64,
+	arch_s390,
+	arch_alpha,
+	arch_sparc,
+	arch_sparc64,
+	arch_arm,
+	arch_sh,
+	arch_hppa,
+	arch_mips,
+	arch_aarch64,
+
+	arch_generic,
+
+	arch_nr,
+};
+
+enum {
+	ARCH_FLAG_1	= 1 << 0,
+	ARCH_FLAG_2	= 1 << 1,
+	ARCH_FLAG_3	= 1 << 2,
+	ARCH_FLAG_4	= 1 << 3,
+};
+
+extern unsigned long arch_flags;
+
+#define ARCH_CPU_CLOCK_WRAPS
+
+/* IWYU pragma: begin_exports */
+#if defined(__i386__)
+#include "arch-x86.h"
+#elif defined(__x86_64__)
+#include "arch-x86_64.h"
+#elif defined(__powerpc__) || defined(__powerpc64__) || defined(__ppc__)
+#include "arch-ppc.h"
+#elif defined(__ia64__)
+#include "arch-ia64.h"
+#elif defined(__alpha__)
+#include "arch-alpha.h"
+#elif defined(__s390x__) || defined(__s390__)
+#include "arch-s390.h"
+#elif defined(__sparc__)
+#include "arch-sparc.h"
+#elif defined(__sparc64__)
+#include "arch-sparc64.h"
+#elif defined(__arm__)
+#include "arch-arm.h"
+#elif defined(__mips__) || defined(__mips64__)
+#include "arch-mips.h"
+#elif defined(__sh__)
+#include "arch-sh.h"
+#elif defined(__hppa__)
+#include "arch-hppa.h"
+#elif defined(__aarch64__)
+#include "arch-aarch64.h"
+#else
+#warning "Unknown architecture, attempting to use generic model."
+#include "arch-generic.h"
+#endif
+
+#include "../lib/ffz.h"
+/* IWYU pragma: end_exports */
+
+#ifndef ARCH_HAVE_INIT
+static inline int arch_init(char *envp[])
+{
+	return 0;
+}
+#endif
+
+#ifdef __alpha__
+/*
+ * alpha is the only exception, all other architectures
+ * have common numbers for new system calls.
+ */
+# ifndef __NR_io_uring_setup
+# define __NR_io_uring_setup		535
+# endif
+# ifndef __NR_io_uring_enter
+# define __NR_io_uring_enter		536
+# endif
+# ifndef __NR_io_uring_register
+# define __NR_io_uring_register	537
+# endif
+#else /* !__alpha__ */
+# ifndef __NR_io_uring_setup
+# define __NR_io_uring_setup		425
+# endif
+# ifndef __NR_io_uring_enter
+# define __NR_io_uring_enter		426
+# endif
+# ifndef __NR_io_uring_register
+# define __NR_io_uring_register	427
+# endif
+#endif
+
+#define ARCH_HAVE_IOURING
+
+#endif
diff --git a/backend.c b/backend.c
new file mode 100644
index 0000000..936203d
--- /dev/null
+++ b/backend.c
@@ -0,0 +1,2549 @@
+/*
+ * fio - the flexible io tester
+ *
+ * Copyright (C) 2005 Jens Axboe <axboe@suse.de>
+ * Copyright (C) 2006-2012 Jens Axboe <axboe@kernel.dk>
+ *
+ * The license below covers all files distributed with fio unless otherwise
+ * noted in the file itself.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ *
+ */
+#include <unistd.h>
+#include <string.h>
+#include <signal.h>
+#include <assert.h>
+#include <inttypes.h>
+#include <sys/stat.h>
+#include <sys/wait.h>
+#include <math.h>
+#include <pthread.h>
+
+#include "fio.h"
+#include "smalloc.h"
+#include "verify.h"
+#include "diskutil.h"
+#include "cgroup.h"
+#include "profile.h"
+#include "lib/rand.h"
+#include "lib/memalign.h"
+#include "server.h"
+#include "lib/getrusage.h"
+#include "idletime.h"
+#include "err.h"
+#include "workqueue.h"
+#include "lib/mountcheck.h"
+#include "rate-submit.h"
+#include "helper_thread.h"
+#include "pshared.h"
+#include "zone-dist.h"
+
+static struct fio_sem *startup_sem;
+static struct flist_head *cgroup_list;
+static struct cgroup_mnt *cgroup_mnt;
+static int exit_value;
+static volatile bool fio_abort;
+static unsigned int nr_process = 0;
+static unsigned int nr_thread = 0;
+
+struct io_log *agg_io_log[DDIR_RWDIR_CNT];
+
+int groupid = 0;
+unsigned int thread_number = 0;
+unsigned int stat_number = 0;
+int shm_id = 0;
+int temp_stall_ts;
+unsigned long done_secs = 0;
+pthread_mutex_t overlap_check = PTHREAD_MUTEX_INITIALIZER;
+
+#define JOB_START_TIMEOUT	(5 * 1000)
+
+static void sig_int(int sig)
+{
+	if (threads) {
+		if (is_backend)
+			fio_server_got_signal(sig);
+		else {
+			log_info("\nfio: terminating on signal %d\n", sig);
+			log_info_flush();
+			exit_value = 128;
+		}
+
+		fio_terminate_threads(TERMINATE_ALL, TERMINATE_ALL);
+	}
+}
+
+void sig_show_status(int sig)
+{
+	show_running_run_stats();
+}
+
+static void set_sig_handlers(void)
+{
+	struct sigaction act;
+
+	memset(&act, 0, sizeof(act));
+	act.sa_handler = sig_int;
+	act.sa_flags = SA_RESTART;
+	sigaction(SIGINT, &act, NULL);
+
+	memset(&act, 0, sizeof(act));
+	act.sa_handler = sig_int;
+	act.sa_flags = SA_RESTART;
+	sigaction(SIGTERM, &act, NULL);
+
+/* Windows uses SIGBREAK as a quit signal from other applications */
+#ifdef WIN32
+	memset(&act, 0, sizeof(act));
+	act.sa_handler = sig_int;
+	act.sa_flags = SA_RESTART;
+	sigaction(SIGBREAK, &act, NULL);
+#endif
+
+	memset(&act, 0, sizeof(act));
+	act.sa_handler = sig_show_status;
+	act.sa_flags = SA_RESTART;
+	sigaction(SIGUSR1, &act, NULL);
+
+	if (is_backend) {
+		memset(&act, 0, sizeof(act));
+		act.sa_handler = sig_int;
+		act.sa_flags = SA_RESTART;
+		sigaction(SIGPIPE, &act, NULL);
+	}
+}
+
+/*
+ * Check if we are above the minimum rate given. Returns true if we are
+ * not, i.e. the minimum was missed and the job should be terminated.
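+ *
+ * Worked example (numbers illustrative, not from a real run): with
+ * ratemin=1048576 (1 MB/s), a sample window of spent=2000 ms and
+ * 1572864 bytes moved since the last sample, the code below computes
+ *
+ *	rate = (1572864 * 1000) / 2000 = 786432 B/s
+ *
+ * which is below ratemin, so the check fails and we return true.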
+ */ +static bool __check_min_rate(struct thread_data *td, struct timespec *now, + enum fio_ddir ddir) +{ + unsigned long long bytes = 0; + unsigned long iops = 0; + unsigned long spent; + unsigned long rate; + unsigned int ratemin = 0; + unsigned int rate_iops = 0; + unsigned int rate_iops_min = 0; + + assert(ddir_rw(ddir)); + + if (!td->o.ratemin[ddir] && !td->o.rate_iops_min[ddir]) + return false; + + /* + * allow a 2 second settle period in the beginning + */ + if (mtime_since(&td->start, now) < 2000) + return false; + + iops += td->this_io_blocks[ddir]; + bytes += td->this_io_bytes[ddir]; + ratemin += td->o.ratemin[ddir]; + rate_iops += td->o.rate_iops[ddir]; + rate_iops_min += td->o.rate_iops_min[ddir]; + + /* + * if rate blocks is set, sample is running + */ + if (td->rate_bytes[ddir] || td->rate_blocks[ddir]) { + spent = mtime_since(&td->lastrate[ddir], now); + if (spent < td->o.ratecycle) + return false; + + if (td->o.rate[ddir] || td->o.ratemin[ddir]) { + /* + * check bandwidth specified rate + */ + if (bytes < td->rate_bytes[ddir]) { + log_err("%s: rate_min=%uB/s not met, only transferred %lluB\n", + td->o.name, ratemin, bytes); + return true; + } else { + if (spent) + rate = ((bytes - td->rate_bytes[ddir]) * 1000) / spent; + else + rate = 0; + + if (rate < ratemin || + bytes < td->rate_bytes[ddir]) { + log_err("%s: rate_min=%uB/s not met, got %luB/s\n", + td->o.name, ratemin, rate); + return true; + } + } + } else { + /* + * checks iops specified rate + */ + if (iops < rate_iops) { + log_err("%s: rate_iops_min=%u not met, only performed %lu IOs\n", + td->o.name, rate_iops, iops); + return true; + } else { + if (spent) + rate = ((iops - td->rate_blocks[ddir]) * 1000) / spent; + else + rate = 0; + + if (rate < rate_iops_min || + iops < td->rate_blocks[ddir]) { + log_err("%s: rate_iops_min=%u not met, got %lu IOPS\n", + td->o.name, rate_iops_min, rate); + return true; + } + } + } + } + + td->rate_bytes[ddir] = bytes; + td->rate_blocks[ddir] = iops; + memcpy(&td->lastrate[ddir], now, sizeof(*now)); + return false; +} + +static bool check_min_rate(struct thread_data *td, struct timespec *now) +{ + bool ret = false; + + if (td->bytes_done[DDIR_READ]) + ret |= __check_min_rate(td, now, DDIR_READ); + if (td->bytes_done[DDIR_WRITE]) + ret |= __check_min_rate(td, now, DDIR_WRITE); + if (td->bytes_done[DDIR_TRIM]) + ret |= __check_min_rate(td, now, DDIR_TRIM); + + return ret; +} + +/* + * When job exits, we can cancel the in-flight IO if we are using async + * io. Attempt to do so. + */ +static void cleanup_pending_aio(struct thread_data *td) +{ + int r; + + if (td->error) + return; + + /* + * get immediately available events, if any + */ + r = io_u_queued_complete(td, 0); + if (r < 0) + return; + + /* + * now cancel remaining active events + */ + if (td->io_ops->cancel) { + struct io_u *io_u; + int i; + + io_u_qiter(&td->io_u_all, io_u, i) { + if (io_u->flags & IO_U_F_FLIGHT) { + r = td->io_ops->cancel(td, io_u); + if (!r) + put_io_u(td, io_u); + } + } + } + + if (td->cur_depth) + r = io_u_queued_complete(td, td->cur_depth); +} + +/* + * Helper to handle the final sync of a file. Works just like the normal + * io path, just does everything sync. 
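+ *
+ * The queue/complete handshake below is the same one the regular IO
+ * path uses; as a minimal sketch (illustrative only, error handling
+ * omitted):
+ *
+ *	switch (td_io_queue(td, io_u)) {
+ *	case FIO_Q_COMPLETED:			// finished inline
+ *		io_u_sync_complete(td, io_u);	// reap synchronously
+ *		break;
+ *	case FIO_Q_QUEUED:			// accepted by the engine
+ *		td_io_commit(td);		// push it to the device
+ *		io_u_queued_complete(td, 1);	// wait for one completion
+ *		break;
+ *	case FIO_Q_BUSY:			// engine is full
+ *		td_io_commit(td);		// flush, then requeue
+ *		break;
+ *	}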
+ */ +static bool fio_io_sync(struct thread_data *td, struct fio_file *f) +{ + struct io_u *io_u = __get_io_u(td); + enum fio_q_status ret; + + if (!io_u) + return true; + + io_u->ddir = DDIR_SYNC; + io_u->file = f; + io_u_set(td, io_u, IO_U_F_NO_FILE_PUT); + + if (td_io_prep(td, io_u)) { + put_io_u(td, io_u); + return true; + } + +requeue: + ret = td_io_queue(td, io_u); + switch (ret) { + case FIO_Q_QUEUED: + td_io_commit(td); + if (io_u_queued_complete(td, 1) < 0) + return true; + break; + case FIO_Q_COMPLETED: + if (io_u->error) { + td_verror(td, io_u->error, "td_io_queue"); + return true; + } + + if (io_u_sync_complete(td, io_u) < 0) + return true; + break; + case FIO_Q_BUSY: + td_io_commit(td); + goto requeue; + } + + return false; +} + +static int fio_file_fsync(struct thread_data *td, struct fio_file *f) +{ + int ret, ret2; + + if (fio_file_open(f)) + return fio_io_sync(td, f); + + if (td_io_open_file(td, f)) + return 1; + + ret = fio_io_sync(td, f); + ret2 = 0; + if (fio_file_open(f)) + ret2 = td_io_close_file(td, f); + return (ret || ret2); +} + +static inline void __update_ts_cache(struct thread_data *td) +{ + fio_gettime(&td->ts_cache, NULL); +} + +static inline void update_ts_cache(struct thread_data *td) +{ + if ((++td->ts_cache_nr & td->ts_cache_mask) == td->ts_cache_mask) + __update_ts_cache(td); +} + +static inline bool runtime_exceeded(struct thread_data *td, struct timespec *t) +{ + if (in_ramp_time(td)) + return false; + if (!td->o.timeout) + return false; + if (utime_since(&td->epoch, t) >= td->o.timeout) + return true; + + return false; +} + +/* + * We need to update the runtime consistently in ms, but keep a running + * tally of the current elapsed time in microseconds for sub millisecond + * updates. + */ +static inline void update_runtime(struct thread_data *td, + unsigned long long *elapsed_us, + const enum fio_ddir ddir) +{ + if (ddir == DDIR_WRITE && td_write(td) && td->o.verify_only) + return; + + td->ts.runtime[ddir] -= (elapsed_us[ddir] + 999) / 1000; + elapsed_us[ddir] += utime_since_now(&td->start); + td->ts.runtime[ddir] += (elapsed_us[ddir] + 999) / 1000; +} + +static bool break_on_this_error(struct thread_data *td, enum fio_ddir ddir, + int *retptr) +{ + int ret = *retptr; + + if (ret < 0 || td->error) { + int err = td->error; + enum error_type_bit eb; + + if (ret < 0) + err = -ret; + + eb = td_error_type(ddir, err); + if (!(td->o.continue_on_error & (1 << eb))) + return true; + + if (td_non_fatal_error(td, eb, err)) { + /* + * Continue with the I/Os in case of + * a non fatal error. + */ + update_error_count(td, err); + td_clear_error(td); + *retptr = 0; + return false; + } else if (td->o.fill_device && err == ENOSPC) { + /* + * We expect to hit this error if + * fill_device option is set. + */ + td_clear_error(td); + fio_mark_td_terminate(td); + return true; + } else { + /* + * Stop the I/O in case of a fatal + * error. 
+ */ + update_error_count(td, err); + return true; + } + } + + return false; +} + +static void check_update_rusage(struct thread_data *td) +{ + if (td->update_rusage) { + td->update_rusage = 0; + update_rusage_stat(td); + fio_sem_up(td->rusage_sem); + } +} + +static int wait_for_completions(struct thread_data *td, struct timespec *time) +{ + const int full = queue_full(td); + int min_evts = 0; + int ret; + + if (td->flags & TD_F_REGROW_LOGS) + return io_u_quiesce(td); + + /* + * if the queue is full, we MUST reap at least 1 event + */ + min_evts = min(td->o.iodepth_batch_complete_min, td->cur_depth); + if ((full && !min_evts) || !td->o.iodepth_batch_complete_min) + min_evts = 1; + + if (time && __should_check_rate(td)) + fio_gettime(time, NULL); + + do { + ret = io_u_queued_complete(td, min_evts); + if (ret < 0) + break; + } while (full && (td->cur_depth > td->o.iodepth_low)); + + return ret; +} + +int io_queue_event(struct thread_data *td, struct io_u *io_u, int *ret, + enum fio_ddir ddir, uint64_t *bytes_issued, int from_verify, + struct timespec *comp_time) +{ + switch (*ret) { + case FIO_Q_COMPLETED: + if (io_u->error) { + *ret = -io_u->error; + clear_io_u(td, io_u); + } else if (io_u->resid) { + long long bytes = io_u->xfer_buflen - io_u->resid; + struct fio_file *f = io_u->file; + + if (bytes_issued) + *bytes_issued += bytes; + + if (!from_verify) + trim_io_piece(io_u); + + /* + * zero read, fail + */ + if (!bytes) { + if (!from_verify) + unlog_io_piece(td, io_u); + td_verror(td, EIO, "full resid"); + put_io_u(td, io_u); + break; + } + + io_u->xfer_buflen = io_u->resid; + io_u->xfer_buf += bytes; + io_u->offset += bytes; + + if (ddir_rw(io_u->ddir)) + td->ts.short_io_u[io_u->ddir]++; + + if (io_u->offset == f->real_file_size) + goto sync_done; + + requeue_io_u(td, &io_u); + } else { +sync_done: + if (comp_time && __should_check_rate(td)) + fio_gettime(comp_time, NULL); + + *ret = io_u_sync_complete(td, io_u); + if (*ret < 0) + break; + } + + if (td->flags & TD_F_REGROW_LOGS) + regrow_logs(td); + + /* + * when doing I/O (not when verifying), + * check for any errors that are to be ignored + */ + if (!from_verify) + break; + + return 0; + case FIO_Q_QUEUED: + /* + * if the engine doesn't have a commit hook, + * the io_u is really queued. if it does have such + * a hook, it has to call io_u_queued() itself. 
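+		 *
+		 * Engine-side sketch of that contract (illustrative only;
+		 * stage_io_u(), pop_staged() and submit_one() are
+		 * hypothetical helpers, not fio APIs):
+		 *
+		 *	static enum fio_q_status eng_queue(struct thread_data *td,
+		 *					   struct io_u *io_u)
+		 *	{
+		 *		stage_io_u(io_u);	// defer actual submission
+		 *		return FIO_Q_QUEUED;
+		 *	}
+		 *
+		 *	static int eng_commit(struct thread_data *td)
+		 *	{
+		 *		struct io_u *io_u;
+		 *
+		 *		while ((io_u = pop_staged()) != NULL) {
+		 *			submit_one(io_u);
+		 *			io_u_queued(td, io_u);	// engine does this itself
+		 *		}
+		 *		return 0;
+		 *	}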
+ */ + if (td->io_ops->commit == NULL) + io_u_queued(td, io_u); + if (bytes_issued) + *bytes_issued += io_u->xfer_buflen; + break; + case FIO_Q_BUSY: + if (!from_verify) + unlog_io_piece(td, io_u); + requeue_io_u(td, &io_u); + td_io_commit(td); + break; + default: + assert(*ret < 0); + td_verror(td, -(*ret), "td_io_queue"); + break; + } + + if (break_on_this_error(td, ddir, ret)) + return 1; + + return 0; +} + +static inline bool io_in_polling(struct thread_data *td) +{ + return !td->o.iodepth_batch_complete_min && + !td->o.iodepth_batch_complete_max; +} +/* + * Unlinks files from thread data fio_file structure + */ +static int unlink_all_files(struct thread_data *td) +{ + struct fio_file *f; + unsigned int i; + int ret = 0; + + for_each_file(td, f, i) { + if (f->filetype != FIO_TYPE_FILE) + continue; + ret = td_io_unlink_file(td, f); + if (ret) + break; + } + + if (ret) + td_verror(td, ret, "unlink_all_files"); + + return ret; +} + +/* + * Check if io_u will overlap an in-flight IO in the queue + */ +bool in_flight_overlap(struct io_u_queue *q, struct io_u *io_u) +{ + bool overlap; + struct io_u *check_io_u; + unsigned long long x1, x2, y1, y2; + int i; + + x1 = io_u->offset; + x2 = io_u->offset + io_u->buflen; + overlap = false; + io_u_qiter(q, check_io_u, i) { + if (check_io_u->flags & IO_U_F_FLIGHT) { + y1 = check_io_u->offset; + y2 = check_io_u->offset + check_io_u->buflen; + + if (x1 < y2 && y1 < x2) { + overlap = true; + dprint(FD_IO, "in-flight overlap: %llu/%llu, %llu/%llu\n", + x1, io_u->buflen, + y1, check_io_u->buflen); + break; + } + } + } + + return overlap; +} + +static enum fio_q_status io_u_submit(struct thread_data *td, struct io_u *io_u) +{ + /* + * Check for overlap if the user asked us to, and we have + * at least one IO in flight besides this one. + */ + if (td->o.serialize_overlap && td->cur_depth > 1 && + in_flight_overlap(&td->io_u_all, io_u)) + return FIO_Q_BUSY; + + return td_io_queue(td, io_u); +} + +/* + * The main verify engine. Runs over the writes we previously submitted, + * reads the blocks back in, and checks the crc/md5 of the data. + */ +static void do_verify(struct thread_data *td, uint64_t verify_bytes) +{ + struct fio_file *f; + struct io_u *io_u; + int ret, min_events; + unsigned int i; + + dprint(FD_VERIFY, "starting loop\n"); + + /* + * sync io first and invalidate cache, to make sure we really + * read from disk. + */ + for_each_file(td, f, i) { + if (!fio_file_open(f)) + continue; + if (fio_io_sync(td, f)) + break; + if (file_invalidate_cache(td, f)) + break; + } + + check_update_rusage(td); + + if (td->error) + return; + + /* + * verify_state needs to be reset before verification + * proceeds so that expected random seeds match actual + * random seeds in headers. The main loop will reset + * all random number generators if randrepeat is set. 
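+	 *
+	 * (Each written block's verify header records the rand_seed it was
+	 * generated from -- see how do_io() assigns io_u->rand_seed from
+	 * td->verify_state -- so the read-back side must start from the
+	 * same generator state to recompute the expected contents.)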
+ */ + if (!td->o.rand_repeatable) + td_fill_verify_state_seed(td); + + td_set_runstate(td, TD_VERIFYING); + + io_u = NULL; + while (!td->terminate) { + enum fio_ddir ddir; + int full; + + update_ts_cache(td); + check_update_rusage(td); + + if (runtime_exceeded(td, &td->ts_cache)) { + __update_ts_cache(td); + if (runtime_exceeded(td, &td->ts_cache)) { + fio_mark_td_terminate(td); + break; + } + } + + if (flow_threshold_exceeded(td)) + continue; + + if (!td->o.experimental_verify) { + io_u = __get_io_u(td); + if (!io_u) + break; + + if (get_next_verify(td, io_u)) { + put_io_u(td, io_u); + break; + } + + if (td_io_prep(td, io_u)) { + put_io_u(td, io_u); + break; + } + } else { + if (ddir_rw_sum(td->bytes_done) + td->o.rw_min_bs > verify_bytes) + break; + + while ((io_u = get_io_u(td)) != NULL) { + if (IS_ERR_OR_NULL(io_u)) { + io_u = NULL; + ret = FIO_Q_BUSY; + goto reap; + } + + /* + * We are only interested in the places where + * we wrote or trimmed IOs. Turn those into + * reads for verification purposes. + */ + if (io_u->ddir == DDIR_READ) { + /* + * Pretend we issued it for rwmix + * accounting + */ + td->io_issues[DDIR_READ]++; + put_io_u(td, io_u); + continue; + } else if (io_u->ddir == DDIR_TRIM) { + io_u->ddir = DDIR_READ; + io_u_set(td, io_u, IO_U_F_TRIMMED); + break; + } else if (io_u->ddir == DDIR_WRITE) { + io_u->ddir = DDIR_READ; + populate_verify_io_u(td, io_u); + break; + } else { + put_io_u(td, io_u); + continue; + } + } + + if (!io_u) + break; + } + + if (verify_state_should_stop(td, io_u)) { + put_io_u(td, io_u); + break; + } + + if (td->o.verify_async) + io_u->end_io = verify_io_u_async; + else + io_u->end_io = verify_io_u; + + ddir = io_u->ddir; + if (!td->o.disable_slat) + fio_gettime(&io_u->start_time, NULL); + + ret = io_u_submit(td, io_u); + + if (io_queue_event(td, io_u, &ret, ddir, NULL, 1, NULL)) + break; + + /* + * if we can queue more, do so. but check if there are + * completed io_u's first. Note that we can get BUSY even + * without IO queued, if the system is resource starved. 
+ */ +reap: + full = queue_full(td) || (ret == FIO_Q_BUSY && td->cur_depth); + if (full || io_in_polling(td)) + ret = wait_for_completions(td, NULL); + + if (ret < 0) + break; + } + + check_update_rusage(td); + + if (!td->error) { + min_events = td->cur_depth; + + if (min_events) + ret = io_u_queued_complete(td, min_events); + } else + cleanup_pending_aio(td); + + td_set_runstate(td, TD_RUNNING); + + dprint(FD_VERIFY, "exiting loop\n"); +} + +static bool exceeds_number_ios(struct thread_data *td) +{ + unsigned long long number_ios; + + if (!td->o.number_ios) + return false; + + number_ios = ddir_rw_sum(td->io_blocks); + number_ios += td->io_u_queued + td->io_u_in_flight; + + return number_ios >= (td->o.number_ios * td->loops); +} + +static bool io_bytes_exceeded(struct thread_data *td, uint64_t *this_bytes) +{ + unsigned long long bytes, limit; + + if (td_rw(td)) + bytes = this_bytes[DDIR_READ] + this_bytes[DDIR_WRITE]; + else if (td_write(td)) + bytes = this_bytes[DDIR_WRITE]; + else if (td_read(td)) + bytes = this_bytes[DDIR_READ]; + else + bytes = this_bytes[DDIR_TRIM]; + + if (td->o.io_size) + limit = td->o.io_size; + else + limit = td->o.size; + + limit *= td->loops; + return bytes >= limit || exceeds_number_ios(td); +} + +static bool io_issue_bytes_exceeded(struct thread_data *td) +{ + return io_bytes_exceeded(td, td->io_issue_bytes); +} + +static bool io_complete_bytes_exceeded(struct thread_data *td) +{ + return io_bytes_exceeded(td, td->this_io_bytes); +} + +/* + * used to calculate the next io time for rate control + * + */ +static long long usec_for_io(struct thread_data *td, enum fio_ddir ddir) +{ + uint64_t bps = td->rate_bps[ddir]; + + assert(!(td->flags & TD_F_CHILD)); + + if (td->o.rate_process == RATE_PROCESS_POISSON) { + uint64_t val, iops; + + iops = bps / td->o.bs[ddir]; + val = (int64_t) (1000000 / iops) * + -logf(__rand_0_1(&td->poisson_state[ddir])); + if (val) { + dprint(FD_RATE, "poisson rate iops=%llu, ddir=%d\n", + (unsigned long long) 1000000 / val, + ddir); + } + td->last_usec[ddir] += val; + return td->last_usec[ddir]; + } else if (bps) { + uint64_t bytes = td->rate_io_issue_bytes[ddir]; + uint64_t secs = bytes / bps; + uint64_t remainder = bytes % bps; + + return remainder * 1000000 / bps + secs * 1000000; + } + + return 0; +} + +static void handle_thinktime(struct thread_data *td, enum fio_ddir ddir) +{ + unsigned long long b; + uint64_t total; + int left; + + b = ddir_rw_sum(td->io_blocks); + if (b % td->o.thinktime_blocks) + return; + + io_u_quiesce(td); + + total = 0; + if (td->o.thinktime_spin) + total = usec_spin(td->o.thinktime_spin); + + left = td->o.thinktime - total; + if (left) + total += usec_sleep(td, left); + + /* + * If we're ignoring thinktime for the rate, add the number of bytes + * we would have done while sleeping, minus one block to ensure we + * start issuing immediately after the sleep. + */ + if (total && td->rate_bps[ddir] && td->o.rate_ign_think) { + uint64_t missed = (td->rate_bps[ddir] * total) / 1000000ULL; + uint64_t bs = td->o.min_bs[ddir]; + uint64_t usperop = bs * 1000000ULL / td->rate_bps[ddir]; + uint64_t over; + + if (usperop <= total) + over = bs; + else + over = (usperop - total) / usperop * -bs; + + td->rate_io_issue_bytes[ddir] += (missed - over); + /* adjust for rate_process=poisson */ + td->last_usec[ddir] += total; + } +} + +/* + * Main IO worker function. It retrieves io_u's to process and queues + * and reaps them, checking for rate and errors along the way. + * + * Returns number of bytes written and trimmed. 
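+ *
+ * (The written and trimmed byte counts are handed back through the
+ * bytes_done array rather than a return value.)
+ *
+ * A worked example of the linear branch of usec_for_io() above
+ * (numbers illustrative): at rate_bps=1048576 (1 MB/s), once
+ * rate_io_issue_bytes reaches 4194304 the next IO is due at
+ *
+ *	secs = 4194304 / 1048576 = 4, remainder = 0
+ *	=> 0 * 1000000 / 1048576 + 4 * 1000000 = 4000000 usec
+ *
+ * after the job started, i.e. pacing is based on absolute issue totals
+ * rather than on when earlier IOs happened to complete.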
+ */ +static void do_io(struct thread_data *td, uint64_t *bytes_done) +{ + unsigned int i; + int ret = 0; + uint64_t total_bytes, bytes_issued = 0; + + for (i = 0; i < DDIR_RWDIR_CNT; i++) + bytes_done[i] = td->bytes_done[i]; + + if (in_ramp_time(td)) + td_set_runstate(td, TD_RAMP); + else + td_set_runstate(td, TD_RUNNING); + + lat_target_init(td); + + total_bytes = td->o.size; + /* + * Allow random overwrite workloads to write up to io_size + * before starting verification phase as 'size' doesn't apply. + */ + if (td_write(td) && td_random(td) && td->o.norandommap) + total_bytes = max(total_bytes, (uint64_t) td->o.io_size); + /* + * If verify_backlog is enabled, we'll run the verify in this + * handler as well. For that case, we may need up to twice the + * amount of bytes. + */ + if (td->o.verify != VERIFY_NONE && + (td_write(td) && td->o.verify_backlog)) + total_bytes += td->o.size; + + /* In trimwrite mode, each byte is trimmed and then written, so + * allow total_bytes to be twice as big */ + if (td_trimwrite(td)) + total_bytes += td->total_io_size; + + while ((td->o.read_iolog_file && !flist_empty(&td->io_log_list)) || + (!flist_empty(&td->trim_list)) || !io_issue_bytes_exceeded(td) || + td->o.time_based) { + struct timespec comp_time; + struct io_u *io_u; + int full; + enum fio_ddir ddir; + + check_update_rusage(td); + + if (td->terminate || td->done) + break; + + update_ts_cache(td); + + if (runtime_exceeded(td, &td->ts_cache)) { + __update_ts_cache(td); + if (runtime_exceeded(td, &td->ts_cache)) { + fio_mark_td_terminate(td); + break; + } + } + + if (flow_threshold_exceeded(td)) + continue; + + /* + * Break if we exceeded the bytes. The exception is time + * based runs, but we still need to break out of the loop + * for those to run verification, if enabled. + * Jobs read from iolog do not use this stop condition. + */ + if (bytes_issued >= total_bytes && + !td->o.read_iolog_file && + (!td->o.time_based || + (td->o.time_based && td->o.verify != VERIFY_NONE))) + break; + + io_u = get_io_u(td); + if (IS_ERR_OR_NULL(io_u)) { + int err = PTR_ERR(io_u); + + io_u = NULL; + ddir = DDIR_INVAL; + if (err == -EBUSY) { + ret = FIO_Q_BUSY; + goto reap; + } + if (td->o.latency_target) + goto reap; + break; + } + + if (io_u->ddir == DDIR_WRITE && td->flags & TD_F_DO_VERIFY) + populate_verify_io_u(td, io_u); + + ddir = io_u->ddir; + + /* + * Add verification end_io handler if: + * - Asked to verify (!td_rw(td)) + * - Or the io_u is from our verify list (mixed write/ver) + */ + if (td->o.verify != VERIFY_NONE && io_u->ddir == DDIR_READ && + ((io_u->flags & IO_U_F_VER_LIST) || !td_rw(td))) { + + if (!td->o.verify_pattern_bytes) { + io_u->rand_seed = __rand(&td->verify_state); + if (sizeof(int) != sizeof(long *)) + io_u->rand_seed *= __rand(&td->verify_state); + } + + if (verify_state_should_stop(td, io_u)) { + put_io_u(td, io_u); + break; + } + + if (td->o.verify_async) + io_u->end_io = verify_io_u_async; + else + io_u->end_io = verify_io_u; + td_set_runstate(td, TD_VERIFYING); + } else if (in_ramp_time(td)) + td_set_runstate(td, TD_RAMP); + else + td_set_runstate(td, TD_RUNNING); + + /* + * Always log IO before it's issued, so we know the specific + * order of it. The logged unit will track when the IO has + * completed. 
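+		 *
+		 * (The converse happens in io_queue_event(): if the queue
+		 * attempt comes back FIO_Q_BUSY, the piece is removed again
+		 * via unlog_io_piece() before the io_u is requeued, so the
+		 * log stays in true issue order.)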
+ */ + if (td_write(td) && io_u->ddir == DDIR_WRITE && + td->o.do_verify && + td->o.verify != VERIFY_NONE && + !td->o.experimental_verify) + log_io_piece(td, io_u); + + if (td->o.io_submit_mode == IO_MODE_OFFLOAD) { + const unsigned long long blen = io_u->xfer_buflen; + const enum fio_ddir __ddir = acct_ddir(io_u); + + if (td->error) + break; + + workqueue_enqueue(&td->io_wq, &io_u->work); + ret = FIO_Q_QUEUED; + + if (ddir_rw(__ddir)) { + td->io_issues[__ddir]++; + td->io_issue_bytes[__ddir] += blen; + td->rate_io_issue_bytes[__ddir] += blen; + } + + if (should_check_rate(td)) + td->rate_next_io_time[__ddir] = usec_for_io(td, __ddir); + + } else { + ret = io_u_submit(td, io_u); + + if (should_check_rate(td)) + td->rate_next_io_time[ddir] = usec_for_io(td, ddir); + + if (io_queue_event(td, io_u, &ret, ddir, &bytes_issued, 0, &comp_time)) + break; + + /* + * See if we need to complete some commands. Note that + * we can get BUSY even without IO queued, if the + * system is resource starved. + */ +reap: + full = queue_full(td) || + (ret == FIO_Q_BUSY && td->cur_depth); + if (full || io_in_polling(td)) + ret = wait_for_completions(td, &comp_time); + } + if (ret < 0) + break; + if (!ddir_rw_sum(td->bytes_done) && + !td_ioengine_flagged(td, FIO_NOIO)) + continue; + + if (!in_ramp_time(td) && should_check_rate(td)) { + if (check_min_rate(td, &comp_time)) { + if (exitall_on_terminate || td->o.exitall_error) + fio_terminate_threads(td->groupid, td->o.exit_what); + td_verror(td, EIO, "check_min_rate"); + break; + } + } + if (!in_ramp_time(td) && td->o.latency_target) + lat_target_check(td); + + if (ddir_rw(ddir) && td->o.thinktime) + handle_thinktime(td, ddir); + } + + check_update_rusage(td); + + if (td->trim_entries) + log_err("fio: %lu trim entries leaked?\n", td->trim_entries); + + if (td->o.fill_device && td->error == ENOSPC) { + td->error = 0; + fio_mark_td_terminate(td); + } + if (!td->error) { + struct fio_file *f; + + if (td->o.io_submit_mode == IO_MODE_OFFLOAD) { + workqueue_flush(&td->io_wq); + i = 0; + } else + i = td->cur_depth; + + if (i) { + ret = io_u_queued_complete(td, i); + if (td->o.fill_device && td->error == ENOSPC) + td->error = 0; + } + + if (should_fsync(td) && (td->o.end_fsync || td->o.fsync_on_close)) { + td_set_runstate(td, TD_FSYNCING); + + for_each_file(td, f, i) { + if (!fio_file_fsync(td, f)) + continue; + + log_err("fio: end_fsync failed for file %s\n", + f->file_name); + } + } + } else + cleanup_pending_aio(td); + + /* + * stop job if we failed doing any IO + */ + if (!ddir_rw_sum(td->this_io_bytes)) + td->done = 1; + + for (i = 0; i < DDIR_RWDIR_CNT; i++) + bytes_done[i] = td->bytes_done[i] - bytes_done[i]; +} + +static void free_file_completion_logging(struct thread_data *td) +{ + struct fio_file *f; + unsigned int i; + + for_each_file(td, f, i) { + if (!f->last_write_comp) + break; + sfree(f->last_write_comp); + } +} + +static int init_file_completion_logging(struct thread_data *td, + unsigned int depth) +{ + struct fio_file *f; + unsigned int i; + + if (td->o.verify == VERIFY_NONE || !td->o.verify_state_save) + return 0; + + for_each_file(td, f, i) { + f->last_write_comp = scalloc(depth, sizeof(uint64_t)); + if (!f->last_write_comp) + goto cleanup; + } + + return 0; + +cleanup: + free_file_completion_logging(td); + log_err("fio: failed to alloc write comp data\n"); + return 1; +} + +static void cleanup_io_u(struct thread_data *td) +{ + struct io_u *io_u; + + while ((io_u = io_u_qpop(&td->io_u_freelist)) != NULL) { + + if (td->io_ops->io_u_free) + 
td->io_ops->io_u_free(td, io_u); + + fio_memfree(io_u, sizeof(*io_u), td_offload_overlap(td)); + } + + free_io_mem(td); + + io_u_rexit(&td->io_u_requeues); + io_u_qexit(&td->io_u_freelist, false); + io_u_qexit(&td->io_u_all, td_offload_overlap(td)); + + free_file_completion_logging(td); +} + +static int init_io_u(struct thread_data *td) +{ + struct io_u *io_u; + int cl_align, i, max_units; + int err; + + max_units = td->o.iodepth; + + err = 0; + err += !io_u_rinit(&td->io_u_requeues, td->o.iodepth); + err += !io_u_qinit(&td->io_u_freelist, td->o.iodepth, false); + err += !io_u_qinit(&td->io_u_all, td->o.iodepth, td_offload_overlap(td)); + + if (err) { + log_err("fio: failed setting up IO queues\n"); + return 1; + } + + cl_align = os_cache_line_size(); + + for (i = 0; i < max_units; i++) { + void *ptr; + + if (td->terminate) + return 1; + + ptr = fio_memalign(cl_align, sizeof(*io_u), td_offload_overlap(td)); + if (!ptr) { + log_err("fio: unable to allocate aligned memory\n"); + return 1; + } + + io_u = ptr; + memset(io_u, 0, sizeof(*io_u)); + INIT_FLIST_HEAD(&io_u->verify_list); + dprint(FD_MEM, "io_u alloc %p, index %u\n", io_u, i); + + io_u->index = i; + io_u->flags = IO_U_F_FREE; + io_u_qpush(&td->io_u_freelist, io_u); + + /* + * io_u never leaves this stack, used for iteration of all + * io_u buffers. + */ + io_u_qpush(&td->io_u_all, io_u); + + if (td->io_ops->io_u_init) { + int ret = td->io_ops->io_u_init(td, io_u); + + if (ret) { + log_err("fio: failed to init engine data: %d\n", ret); + return 1; + } + } + } + + init_io_u_buffers(td); + + if (init_file_completion_logging(td, max_units)) + return 1; + + return 0; +} + +int init_io_u_buffers(struct thread_data *td) +{ + struct io_u *io_u; + unsigned long long max_bs, min_write; + int i, max_units; + int data_xfer = 1; + char *p; + + max_units = td->o.iodepth; + max_bs = td_max_bs(td); + min_write = td->o.min_bs[DDIR_WRITE]; + td->orig_buffer_size = (unsigned long long) max_bs + * (unsigned long long) max_units; + + if (td_ioengine_flagged(td, FIO_NOIO) || !(td_read(td) || td_write(td))) + data_xfer = 0; + + /* + * if we may later need to do address alignment, then add any + * possible adjustment here so that we don't cause a buffer + * overflow later. this adjustment may be too much if we get + * lucky and the allocator gives us an aligned address. + */ + if (td->o.odirect || td->o.mem_align || td->o.oatomic || + td_ioengine_flagged(td, FIO_RAWIO)) + td->orig_buffer_size += page_mask + td->o.mem_align; + + if (td->o.mem_type == MEM_SHMHUGE || td->o.mem_type == MEM_MMAPHUGE) { + unsigned long long bs; + + bs = td->orig_buffer_size + td->o.hugepage_size - 1; + td->orig_buffer_size = bs & ~(td->o.hugepage_size - 1); + } + + if (td->orig_buffer_size != (size_t) td->orig_buffer_size) { + log_err("fio: IO memory too large. Reduce max_bs or iodepth\n"); + return 1; + } + + if (data_xfer && allocate_io_mem(td)) + return 1; + + if (td->o.odirect || td->o.mem_align || td->o.oatomic || + td_ioengine_flagged(td, FIO_RAWIO)) + p = PTR_ALIGN(td->orig_buffer, page_mask) + td->o.mem_align; + else + p = td->orig_buffer; + + for (i = 0; i < max_units; i++) { + io_u = td->io_u_all.io_us[i]; + dprint(FD_MEM, "io_u alloc %p, index %u\n", io_u, i); + + if (data_xfer) { + io_u->buf = p; + dprint(FD_MEM, "io_u %p, mem %p\n", io_u, io_u->buf); + + if (td_write(td)) + io_u_fill_buffer(td, io_u, min_write, max_bs); + if (td_write(td) && td->o.verify_pattern_bytes) { + /* + * Fill the buffer with the pattern if we are + * going to be doing writes. 
+ */ + fill_verify_pattern(td, io_u->buf, max_bs, io_u, 0, 0); + } + } + p += max_bs; + } + + return 0; +} + +/* + * This function is Linux specific. + * FIO_HAVE_IOSCHED_SWITCH enabled currently means it's Linux. + */ +static int switch_ioscheduler(struct thread_data *td) +{ +#ifdef FIO_HAVE_IOSCHED_SWITCH + char tmp[256], tmp2[128], *p; + FILE *f; + int ret; + + if (td_ioengine_flagged(td, FIO_DISKLESSIO)) + return 0; + + assert(td->files && td->files[0]); + sprintf(tmp, "%s/queue/scheduler", td->files[0]->du->sysfs_root); + + f = fopen(tmp, "r+"); + if (!f) { + if (errno == ENOENT) { + log_err("fio: os or kernel doesn't support IO scheduler" + " switching\n"); + return 0; + } + td_verror(td, errno, "fopen iosched"); + return 1; + } + + /* + * Set io scheduler. + */ + ret = fwrite(td->o.ioscheduler, strlen(td->o.ioscheduler), 1, f); + if (ferror(f) || ret != 1) { + td_verror(td, errno, "fwrite"); + fclose(f); + return 1; + } + + rewind(f); + + /* + * Read back and check that the selected scheduler is now the default. + */ + ret = fread(tmp, 1, sizeof(tmp) - 1, f); + if (ferror(f) || ret < 0) { + td_verror(td, errno, "fread"); + fclose(f); + return 1; + } + tmp[ret] = '\0'; + /* + * either a list of io schedulers or "none\n" is expected. Strip the + * trailing newline. + */ + p = tmp; + strsep(&p, "\n"); + + /* + * Write to "none" entry doesn't fail, so check the result here. + */ + if (!strcmp(tmp, "none")) { + log_err("fio: io scheduler is not tunable\n"); + fclose(f); + return 0; + } + + sprintf(tmp2, "[%s]", td->o.ioscheduler); + if (!strstr(tmp, tmp2)) { + log_err("fio: io scheduler %s not found\n", td->o.ioscheduler); + td_verror(td, EINVAL, "iosched_switch"); + fclose(f); + return 1; + } + + fclose(f); + return 0; +#else + return 0; +#endif +} + +static bool keep_running(struct thread_data *td) +{ + unsigned long long limit; + + if (td->done) + return false; + if (td->terminate) + return false; + if (td->o.time_based) + return true; + if (td->o.loops) { + td->o.loops--; + return true; + } + if (exceeds_number_ios(td)) + return false; + + if (td->o.io_size) + limit = td->o.io_size; + else + limit = td->o.size; + + if (limit != -1ULL && ddir_rw_sum(td->io_bytes) < limit) { + uint64_t diff; + + /* + * If the difference is less than the maximum IO size, we + * are done. + */ + diff = limit - ddir_rw_sum(td->io_bytes); + if (diff < td_max_bs(td)) + return false; + + if (fio_files_done(td) && !td->o.io_size) + return false; + + return true; + } + + return false; +} + +static int exec_string(struct thread_options *o, const char *string, const char *mode) +{ + size_t newlen = strlen(string) + strlen(o->name) + strlen(mode) + 13 + 1; + int ret; + char *str; + + str = malloc(newlen); + sprintf(str, "%s > %s.%s.txt 2>&1", string, o->name, mode); + + log_info("%s : Saving output of %s in %s.%s.txt\n",o->name, mode, o->name, mode); + ret = system(str); + if (ret == -1) + log_err("fio: exec of cmd <%s> failed\n", str); + + free(str); + return ret; +} + +/* + * Dry run to compute correct state of numberio for verification. 
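+ *
+ * The dry run walks the exact io_u sequence a real write pass would
+ * generate -- get_io_u(), issue accounting, log_io_piece() -- but
+ * completes each io_u immediately instead of submitting it, so the
+ * later verify pass sees the same numberio values the real writes
+ * would have produced.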
+ */ +static uint64_t do_dry_run(struct thread_data *td) +{ + td_set_runstate(td, TD_RUNNING); + + while ((td->o.read_iolog_file && !flist_empty(&td->io_log_list)) || + (!flist_empty(&td->trim_list)) || !io_complete_bytes_exceeded(td)) { + struct io_u *io_u; + int ret; + + if (td->terminate || td->done) + break; + + io_u = get_io_u(td); + if (IS_ERR_OR_NULL(io_u)) + break; + + io_u_set(td, io_u, IO_U_F_FLIGHT); + io_u->error = 0; + io_u->resid = 0; + if (ddir_rw(acct_ddir(io_u))) + td->io_issues[acct_ddir(io_u)]++; + if (ddir_rw(io_u->ddir)) { + io_u_mark_depth(td, 1); + td->ts.total_io_u[io_u->ddir]++; + } + + if (td_write(td) && io_u->ddir == DDIR_WRITE && + td->o.do_verify && + td->o.verify != VERIFY_NONE && + !td->o.experimental_verify) + log_io_piece(td, io_u); + + ret = io_u_sync_complete(td, io_u); + (void) ret; + } + + return td->bytes_done[DDIR_WRITE] + td->bytes_done[DDIR_TRIM]; +} + +struct fork_data { + struct thread_data *td; + struct sk_out *sk_out; +}; + +/* + * Entry point for the thread based jobs. The process based jobs end up + * here as well, after a little setup. + */ +static void *thread_main(void *data) +{ + struct fork_data *fd = data; + unsigned long long elapsed_us[DDIR_RWDIR_CNT] = { 0, }; + struct thread_data *td = fd->td; + struct thread_options *o = &td->o; + struct sk_out *sk_out = fd->sk_out; + uint64_t bytes_done[DDIR_RWDIR_CNT]; + int deadlock_loop_cnt; + bool clear_state; + int ret; + + sk_out_assign(sk_out); + free(fd); + + if (!o->use_thread) { + setsid(); + td->pid = getpid(); + } else + td->pid = gettid(); + + fio_local_clock_init(); + + dprint(FD_PROCESS, "jobs pid=%d started\n", (int) td->pid); + + if (is_backend) + fio_server_send_start(td); + + INIT_FLIST_HEAD(&td->io_log_list); + INIT_FLIST_HEAD(&td->io_hist_list); + INIT_FLIST_HEAD(&td->verify_list); + INIT_FLIST_HEAD(&td->trim_list); + td->io_hist_tree = RB_ROOT; + + ret = mutex_cond_init_pshared(&td->io_u_lock, &td->free_cond); + if (ret) { + td_verror(td, ret, "mutex_cond_init_pshared"); + goto err; + } + ret = cond_init_pshared(&td->verify_cond); + if (ret) { + td_verror(td, ret, "mutex_cond_pshared"); + goto err; + } + + td_set_runstate(td, TD_INITIALIZED); + dprint(FD_MUTEX, "up startup_sem\n"); + fio_sem_up(startup_sem); + dprint(FD_MUTEX, "wait on td->sem\n"); + fio_sem_down(td->sem); + dprint(FD_MUTEX, "done waiting on td->sem\n"); + + /* + * A new gid requires privilege, so we need to do this before setting + * the uid. + */ + if (o->gid != -1U && setgid(o->gid)) { + td_verror(td, errno, "setgid"); + goto err; + } + if (o->uid != -1U && setuid(o->uid)) { + td_verror(td, errno, "setuid"); + goto err; + } + + td_zone_gen_index(td); + + /* + * Do this early, we don't want the compress threads to be limited + * to the same CPUs as the IO workers. So do this before we set + * any potential CPU affinity + */ + if (iolog_compress_init(td, sk_out)) + goto err; + + /* + * If we have a gettimeofday() thread, make sure we exclude that + * thread from this job + */ + if (o->gtod_cpu) + fio_cpu_clear(&o->cpumask, o->gtod_cpu); + + /* + * Set affinity first, in case it has an impact on the memory + * allocations. 
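+	 *
+	 * On Linux, fio_setaffinity() below is essentially a thin wrapper
+	 * around sched_setaffinity(); pinning a job to one CPU boils down
+	 * to (illustrative sketch, cpu and pid are placeholders):
+	 *
+	 *	cpu_set_t set;
+	 *
+	 *	CPU_ZERO(&set);
+	 *	CPU_SET(cpu, &set);
+	 *	if (sched_setaffinity(pid, sizeof(set), &set) == -1)
+	 *		perror("sched_setaffinity");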
+ */ + if (fio_option_is_set(o, cpumask)) { + if (o->cpus_allowed_policy == FIO_CPUS_SPLIT) { + ret = fio_cpus_split(&o->cpumask, td->thread_number - 1); + if (!ret) { + log_err("fio: no CPUs set\n"); + log_err("fio: Try increasing number of available CPUs\n"); + td_verror(td, EINVAL, "cpus_split"); + goto err; + } + } + ret = fio_setaffinity(td->pid, o->cpumask); + if (ret == -1) { + td_verror(td, errno, "cpu_set_affinity"); + goto err; + } + } + +#ifdef CONFIG_LIBNUMA + /* numa node setup */ + if (fio_option_is_set(o, numa_cpunodes) || + fio_option_is_set(o, numa_memnodes)) { + struct bitmask *mask; + + if (numa_available() < 0) { + td_verror(td, errno, "Does not support NUMA API\n"); + goto err; + } + + if (fio_option_is_set(o, numa_cpunodes)) { + mask = numa_parse_nodestring(o->numa_cpunodes); + ret = numa_run_on_node_mask(mask); + numa_free_nodemask(mask); + if (ret == -1) { + td_verror(td, errno, \ + "numa_run_on_node_mask failed\n"); + goto err; + } + } + + if (fio_option_is_set(o, numa_memnodes)) { + mask = NULL; + if (o->numa_memnodes) + mask = numa_parse_nodestring(o->numa_memnodes); + + switch (o->numa_mem_mode) { + case MPOL_INTERLEAVE: + numa_set_interleave_mask(mask); + break; + case MPOL_BIND: + numa_set_membind(mask); + break; + case MPOL_LOCAL: + numa_set_localalloc(); + break; + case MPOL_PREFERRED: + numa_set_preferred(o->numa_mem_prefer_node); + break; + case MPOL_DEFAULT: + default: + break; + } + + if (mask) + numa_free_nodemask(mask); + + } + } +#endif + + if (fio_pin_memory(td)) + goto err; + + /* + * May alter parameters that init_io_u() will use, so we need to + * do this first. + */ + if (!init_iolog(td)) + goto err; + + if (td_io_init(td)) + goto err; + + if (init_io_u(td)) + goto err; + + if (td->io_ops->post_init && td->io_ops->post_init(td)) + goto err; + + if (o->verify_async && verify_async_init(td)) + goto err; + + if (fio_option_is_set(o, ioprio) || + fio_option_is_set(o, ioprio_class)) { + ret = ioprio_set(IOPRIO_WHO_PROCESS, 0, o->ioprio_class, o->ioprio); + if (ret == -1) { + td_verror(td, errno, "ioprio_set"); + goto err; + } + } + + if (o->cgroup && cgroup_setup(td, cgroup_list, &cgroup_mnt)) + goto err; + + errno = 0; + if (nice(o->nice) == -1 && errno != 0) { + td_verror(td, errno, "nice"); + goto err; + } + + if (o->ioscheduler && switch_ioscheduler(td)) + goto err; + + if (!o->create_serialize && setup_files(td)) + goto err; + + if (!init_random_map(td)) + goto err; + + if (o->exec_prerun && exec_string(o, o->exec_prerun, (const char *)"prerun")) + goto err; + + if (o->pre_read && !pre_read_files(td)) + goto err; + + fio_verify_init(td); + + if (rate_submit_init(td, sk_out)) + goto err; + + set_epoch_time(td, o->log_unix_epoch); + fio_getrusage(&td->ru_start); + memcpy(&td->bw_sample_time, &td->epoch, sizeof(td->epoch)); + memcpy(&td->iops_sample_time, &td->epoch, sizeof(td->epoch)); + memcpy(&td->ss.prev_time, &td->epoch, sizeof(td->epoch)); + + if (o->ratemin[DDIR_READ] || o->ratemin[DDIR_WRITE] || + o->ratemin[DDIR_TRIM]) { + memcpy(&td->lastrate[DDIR_READ], &td->bw_sample_time, + sizeof(td->bw_sample_time)); + memcpy(&td->lastrate[DDIR_WRITE], &td->bw_sample_time, + sizeof(td->bw_sample_time)); + memcpy(&td->lastrate[DDIR_TRIM], &td->bw_sample_time, + sizeof(td->bw_sample_time)); + } + + memset(bytes_done, 0, sizeof(bytes_done)); + clear_state = false; + + while (keep_running(td)) { + uint64_t verify_bytes; + + fio_gettime(&td->start, NULL); + memcpy(&td->ts_cache, &td->start, sizeof(td->start)); + + if (clear_state) { + clear_io_state(td, 
0); + + if (o->unlink_each_loop && unlink_all_files(td)) + break; + } + + prune_io_piece_log(td); + + if (td->o.verify_only && td_write(td)) + verify_bytes = do_dry_run(td); + else { + do_io(td, bytes_done); + + if (!ddir_rw_sum(bytes_done)) { + fio_mark_td_terminate(td); + verify_bytes = 0; + } else { + verify_bytes = bytes_done[DDIR_WRITE] + + bytes_done[DDIR_TRIM]; + } + } + + /* + * If we took too long to shut down, the main thread could + * already consider us reaped/exited. If that happens, break + * out and clean up. + */ + if (td->runstate >= TD_EXITED) + break; + + clear_state = true; + + /* + * Make sure we've successfully updated the rusage stats + * before waiting on the stat mutex. Otherwise we could have + * the stat thread holding stat mutex and waiting for + * the rusage_sem, which would never get upped because + * this thread is waiting for the stat mutex. + */ + deadlock_loop_cnt = 0; + do { + check_update_rusage(td); + if (!fio_sem_down_trylock(stat_sem)) + break; + usleep(1000); + if (deadlock_loop_cnt++ > 5000) { + log_err("fio seems to be stuck grabbing stat_sem, forcibly exiting\n"); + td->error = EDEADLK; + goto err; + } + } while (1); + + if (td_read(td) && td->io_bytes[DDIR_READ]) + update_runtime(td, elapsed_us, DDIR_READ); + if (td_write(td) && td->io_bytes[DDIR_WRITE]) + update_runtime(td, elapsed_us, DDIR_WRITE); + if (td_trim(td) && td->io_bytes[DDIR_TRIM]) + update_runtime(td, elapsed_us, DDIR_TRIM); + fio_gettime(&td->start, NULL); + fio_sem_up(stat_sem); + + if (td->error || td->terminate) + break; + + if (!o->do_verify || + o->verify == VERIFY_NONE || + td_ioengine_flagged(td, FIO_UNIDIR)) + continue; + + clear_io_state(td, 0); + + fio_gettime(&td->start, NULL); + + do_verify(td, verify_bytes); + + /* + * See comment further up for why this is done here. 
+ */ + check_update_rusage(td); + + fio_sem_down(stat_sem); + update_runtime(td, elapsed_us, DDIR_READ); + fio_gettime(&td->start, NULL); + fio_sem_up(stat_sem); + + if (td->error || td->terminate) + break; + } + + /* + * Acquire this lock if we were doing overlap checking in + * offload mode so that we don't clean up this job while + * another thread is checking its io_u's for overlap + */ + if (td_offload_overlap(td)) + pthread_mutex_lock(&overlap_check); + td_set_runstate(td, TD_FINISHING); + if (td_offload_overlap(td)) + pthread_mutex_unlock(&overlap_check); + + update_rusage_stat(td); + td->ts.total_run_time = mtime_since_now(&td->epoch); + td->ts.io_bytes[DDIR_READ] = td->io_bytes[DDIR_READ]; + td->ts.io_bytes[DDIR_WRITE] = td->io_bytes[DDIR_WRITE]; + td->ts.io_bytes[DDIR_TRIM] = td->io_bytes[DDIR_TRIM]; + + if (td->o.verify_state_save && !(td->flags & TD_F_VSTATE_SAVED) && + (td->o.verify != VERIFY_NONE && td_write(td))) + verify_save_state(td->thread_number); + + fio_unpin_memory(td); + + td_writeout_logs(td, true); + + iolog_compress_exit(td); + rate_submit_exit(td); + + if (o->exec_postrun) + exec_string(o, o->exec_postrun, (const char *)"postrun"); + + if (exitall_on_terminate || (o->exitall_error && td->error)) + fio_terminate_threads(td->groupid, td->o.exit_what); + +err: + if (td->error) + log_info("fio: pid=%d, err=%d/%s\n", (int) td->pid, td->error, + td->verror); + + if (o->verify_async) + verify_async_exit(td); + + close_and_free_files(td); + cleanup_io_u(td); + close_ioengine(td); + cgroup_shutdown(td, cgroup_mnt); + verify_free_state(td); + td_zone_free_index(td); + + if (fio_option_is_set(o, cpumask)) { + ret = fio_cpuset_exit(&o->cpumask); + if (ret) + td_verror(td, ret, "fio_cpuset_exit"); + } + + /* + * do this very late, it will log file closing as well + */ + if (o->write_iolog_file) + write_iolog_close(td); + if (td->io_log_rfile) + fclose(td->io_log_rfile); + + td_set_runstate(td, TD_EXITED); + + /* + * Do this last after setting our runstate to exited, so we + * know that the stat thread is signaled. + */ + check_update_rusage(td); + + sk_out_drop(); + return (void *) (uintptr_t) td->error; +} + +/* + * Run over the job map and reap the threads that have exited, if any. 
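+ *
+ * For process based jobs this uses the classic waitpid() reaping
+ * pattern; a minimal sketch (illustrative only):
+ *
+ *	int status, sig, err;
+ *	pid_t ret = waitpid(pid, &status, WNOHANG);	// don't block
+ *
+ *	if (ret == pid && WIFSIGNALED(status))
+ *		sig = WTERMSIG(status);		// killed by a signal
+ *	else if (ret == pid && WIFEXITED(status))
+ *		err = WEXITSTATUS(status);	// normal exit status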
+ */ +static void reap_threads(unsigned int *nr_running, uint64_t *t_rate, + uint64_t *m_rate) +{ + struct thread_data *td; + unsigned int cputhreads, realthreads, pending; + int i, status, ret; + + /* + * reap exited threads (TD_EXITED -> TD_REAPED) + */ + realthreads = pending = cputhreads = 0; + for_each_td(td, i) { + int flags = 0; + + if (!strcmp(td->o.ioengine, "cpuio")) + cputhreads++; + else + realthreads++; + + if (!td->pid) { + pending++; + continue; + } + if (td->runstate == TD_REAPED) + continue; + if (td->o.use_thread) { + if (td->runstate == TD_EXITED) { + td_set_runstate(td, TD_REAPED); + goto reaped; + } + continue; + } + + flags = WNOHANG; + if (td->runstate == TD_EXITED) + flags = 0; + + /* + * check if someone quit or got killed in an unusual way + */ + ret = waitpid(td->pid, &status, flags); + if (ret < 0) { + if (errno == ECHILD) { + log_err("fio: pid=%d disappeared %d\n", + (int) td->pid, td->runstate); + td->sig = ECHILD; + td_set_runstate(td, TD_REAPED); + goto reaped; + } + perror("waitpid"); + } else if (ret == td->pid) { + if (WIFSIGNALED(status)) { + int sig = WTERMSIG(status); + + if (sig != SIGTERM && sig != SIGUSR2) + log_err("fio: pid=%d, got signal=%d\n", + (int) td->pid, sig); + td->sig = sig; + td_set_runstate(td, TD_REAPED); + goto reaped; + } + if (WIFEXITED(status)) { + if (WEXITSTATUS(status) && !td->error) + td->error = WEXITSTATUS(status); + + td_set_runstate(td, TD_REAPED); + goto reaped; + } + } + + /* + * If the job is stuck, do a forceful timeout of it and + * move on. + */ + if (td->terminate && + td->runstate < TD_FSYNCING && + time_since_now(&td->terminate_time) >= FIO_REAP_TIMEOUT) { + log_err("fio: job '%s' (state=%d) hasn't exited in " + "%lu seconds, it appears to be stuck. Doing " + "forceful exit of this job.\n", + td->o.name, td->runstate, + (unsigned long) time_since_now(&td->terminate_time)); + td_set_runstate(td, TD_REAPED); + goto reaped; + } + + /* + * thread is not dead, continue + */ + pending++; + continue; +reaped: + (*nr_running)--; + (*m_rate) -= ddir_rw_sum(td->o.ratemin); + (*t_rate) -= ddir_rw_sum(td->o.rate); + if (!td->pid) + pending--; + + if (td->error) + exit_value++; + + done_secs += mtime_since_now(&td->epoch) / 1000; + profile_td_exit(td); + } + + if (*nr_running == cputhreads && !pending && realthreads) + fio_terminate_threads(TERMINATE_ALL, TERMINATE_ALL); +} + +static bool __check_trigger_file(void) +{ + struct stat sb; + + if (!trigger_file) + return false; + + if (stat(trigger_file, &sb)) + return false; + + if (unlink(trigger_file) < 0) + log_err("fio: failed to unlink %s: %s\n", trigger_file, + strerror(errno)); + + return true; +} + +static bool trigger_timedout(void) +{ + if (trigger_timeout) + if (time_since_genesis() >= trigger_timeout) { + trigger_timeout = 0; + return true; + } + + return false; +} + +void exec_trigger(const char *cmd) +{ + int ret; + + if (!cmd || cmd[0] == '\0') + return; + + ret = system(cmd); + if (ret == -1) + log_err("fio: failed executing %s trigger\n", cmd); +} + +void check_trigger_file(void) +{ + if (__check_trigger_file() || trigger_timedout()) { + if (nr_clients) + fio_clients_send_trigger(trigger_remote_cmd); + else { + verify_save_state(IO_LIST_ALL); + fio_terminate_threads(TERMINATE_ALL, TERMINATE_ALL); + exec_trigger(trigger_cmd); + } + } +} + +static int fio_verify_load_state(struct thread_data *td) +{ + int ret; + + if (!td->o.verify_state) + return 0; + + if (is_backend) { + void *data; + + ret = fio_server_get_verify_state(td->o.name, + td->thread_number - 1, 
&data); + if (!ret) + verify_assign_state(td, data); + } else { + char prefix[PATH_MAX]; + + if (aux_path) + sprintf(prefix, "%s%clocal", aux_path, + FIO_OS_PATH_SEPARATOR); + else + strcpy(prefix, "local"); + ret = verify_load_state(td, prefix); + } + + return ret; +} + +static void do_usleep(unsigned int usecs) +{ + check_for_running_stats(); + check_trigger_file(); + usleep(usecs); +} + +static bool check_mount_writes(struct thread_data *td) +{ + struct fio_file *f; + unsigned int i; + + if (!td_write(td) || td->o.allow_mounted_write) + return false; + + /* + * If FIO_HAVE_CHARDEV_SIZE is defined, it's likely that chrdevs + * are mkfs'd and mounted. + */ + for_each_file(td, f, i) { +#ifdef FIO_HAVE_CHARDEV_SIZE + if (f->filetype != FIO_TYPE_BLOCK && f->filetype != FIO_TYPE_CHAR) +#else + if (f->filetype != FIO_TYPE_BLOCK) +#endif + continue; + if (device_is_mounted(f->file_name)) + goto mounted; + } + + return false; +mounted: + log_err("fio: %s appears mounted, and 'allow_mounted_write' isn't set. Aborting.\n", f->file_name); + return true; +} + +static bool waitee_running(struct thread_data *me) +{ + const char *waitee = me->o.wait_for; + const char *self = me->o.name; + struct thread_data *td; + int i; + + if (!waitee) + return false; + + for_each_td(td, i) { + if (!strcmp(td->o.name, self) || strcmp(td->o.name, waitee)) + continue; + + if (td->runstate < TD_EXITED) { + dprint(FD_PROCESS, "%s fenced by %s(%s)\n", + self, td->o.name, + runstate_to_name(td->runstate)); + return true; + } + } + + dprint(FD_PROCESS, "%s: %s completed, can run\n", self, waitee); + return false; +} + +/* + * Main function for kicking off and reaping jobs, as needed. + */ +static void run_threads(struct sk_out *sk_out) +{ + struct thread_data *td; + unsigned int i, todo, nr_running, nr_started; + uint64_t m_rate, t_rate; + uint64_t spent; + + if (fio_gtod_offload && fio_start_gtod_thread()) + return; + + fio_idle_prof_init(); + + set_sig_handlers(); + + nr_thread = nr_process = 0; + for_each_td(td, i) { + if (check_mount_writes(td)) + return; + if (td->o.use_thread) + nr_thread++; + else + nr_process++; + } + + if (output_format & FIO_OUTPUT_NORMAL) { + struct buf_output out; + + buf_output_init(&out); + __log_buf(&out, "Starting "); + if (nr_thread) + __log_buf(&out, "%d thread%s", nr_thread, + nr_thread > 1 ? "s" : ""); + if (nr_process) { + if (nr_thread) + __log_buf(&out, " and "); + __log_buf(&out, "%d process%s", nr_process, + nr_process > 1 ? "es" : ""); + } + __log_buf(&out, "\n"); + log_info_buf(out.buf, out.buflen); + buf_output_free(&out); + } + + todo = thread_number; + nr_running = 0; + nr_started = 0; + m_rate = t_rate = 0; + + for_each_td(td, i) { + print_status_init(td->thread_number - 1); + + if (!td->o.create_serialize) + continue; + + if (fio_verify_load_state(td)) + goto reap; + + /* + * do file setup here so it happens sequentially, + * we don't want X number of threads getting their + * client data interspersed on disk + */ + if (setup_files(td)) { +reap: + exit_value++; + if (td->error) + log_err("fio: pid=%d, err=%d/%s\n", + (int) td->pid, td->error, td->verror); + td_set_runstate(td, TD_REAPED); + todo--; + } else { + struct fio_file *f; + unsigned int j; + + /* + * for sharing to work, each job must always open + * its own files. 
so close them, if we opened them + * for creation + */ + for_each_file(td, f, j) { + if (fio_file_open(f)) + td_io_close_file(td, f); + } + } + } + + /* start idle threads before io threads start to run */ + fio_idle_prof_start(); + + set_genesis_time(); + + while (todo) { + struct thread_data *map[REAL_MAX_JOBS]; + struct timespec this_start; + int this_jobs = 0, left; + struct fork_data *fd; + + /* + * create threads (TD_NOT_CREATED -> TD_CREATED) + */ + for_each_td(td, i) { + if (td->runstate != TD_NOT_CREATED) + continue; + + /* + * never got a chance to start, killed by other + * thread for some reason + */ + if (td->terminate) { + todo--; + continue; + } + + if (td->o.start_delay) { + spent = utime_since_genesis(); + + if (td->o.start_delay > spent) + continue; + } + + if (td->o.stonewall && (nr_started || nr_running)) { + dprint(FD_PROCESS, "%s: stonewall wait\n", + td->o.name); + break; + } + + if (waitee_running(td)) { + dprint(FD_PROCESS, "%s: waiting for %s\n", + td->o.name, td->o.wait_for); + continue; + } + + init_disk_util(td); + + td->rusage_sem = fio_sem_init(FIO_SEM_LOCKED); + td->update_rusage = 0; + + /* + * Set state to created. Thread will transition + * to TD_INITIALIZED when it's done setting up. + */ + td_set_runstate(td, TD_CREATED); + map[this_jobs++] = td; + nr_started++; + + fd = calloc(1, sizeof(*fd)); + fd->td = td; + fd->sk_out = sk_out; + + if (td->o.use_thread) { + int ret; + + dprint(FD_PROCESS, "will pthread_create\n"); + ret = pthread_create(&td->thread, NULL, + thread_main, fd); + if (ret) { + log_err("pthread_create: %s\n", + strerror(ret)); + free(fd); + nr_started--; + break; + } + fd = NULL; + ret = pthread_detach(td->thread); + if (ret) + log_err("pthread_detach: %s", + strerror(ret)); + } else { + pid_t pid; + dprint(FD_PROCESS, "will fork\n"); + pid = fork(); + if (!pid) { + int ret; + + ret = (int)(uintptr_t)thread_main(fd); + _exit(ret); + } else if (i == fio_debug_jobno) + *fio_debug_jobp = pid; + } + dprint(FD_MUTEX, "wait on startup_sem\n"); + if (fio_sem_down_timeout(startup_sem, 10000)) { + log_err("fio: job startup hung? exiting.\n"); + fio_terminate_threads(TERMINATE_ALL, TERMINATE_ALL); + fio_abort = true; + nr_started--; + free(fd); + break; + } + dprint(FD_MUTEX, "done waiting on startup_sem\n"); + } + + /* + * Wait for the started threads to transition to + * TD_INITIALIZED. + */ + fio_gettime(&this_start, NULL); + left = this_jobs; + while (left && !fio_abort) { + if (mtime_since_now(&this_start) > JOB_START_TIMEOUT) + break; + + do_usleep(100000); + + for (i = 0; i < this_jobs; i++) { + td = map[i]; + if (!td) + continue; + if (td->runstate == TD_INITIALIZED) { + map[i] = NULL; + left--; + } else if (td->runstate >= TD_EXITED) { + map[i] = NULL; + left--; + todo--; + nr_running++; /* work-around... */ + } + } + } + + if (left) { + log_err("fio: %d job%s failed to start\n", left, + left > 1 ? "s" : ""); + for (i = 0; i < this_jobs; i++) { + td = map[i]; + if (!td) + continue; + kill(td->pid, SIGTERM); + } + break; + } + + /* + * start created threads (TD_INITIALIZED -> TD_RUNNING). 
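+		 *
+		 * Overall job state flow, as driven from here and from
+		 * reap_threads():
+		 *
+		 *	TD_NOT_CREATED -> TD_CREATED -> TD_INITIALIZED
+		 *		-> TD_RAMP or TD_RUNNING (below)
+		 *		-> TD_VERIFYING / TD_FSYNCING / TD_FINISHING
+		 *		-> TD_EXITED -> TD_REAPED (reap_threads())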
+ */ + for_each_td(td, i) { + if (td->runstate != TD_INITIALIZED) + continue; + + if (in_ramp_time(td)) + td_set_runstate(td, TD_RAMP); + else + td_set_runstate(td, TD_RUNNING); + nr_running++; + nr_started--; + m_rate += ddir_rw_sum(td->o.ratemin); + t_rate += ddir_rw_sum(td->o.rate); + todo--; + fio_sem_up(td->sem); + } + + reap_threads(&nr_running, &t_rate, &m_rate); + + if (todo) + do_usleep(100000); + } + + while (nr_running) { + reap_threads(&nr_running, &t_rate, &m_rate); + do_usleep(10000); + } + + fio_idle_prof_stop(); + + update_io_ticks(); +} + +static void free_disk_util(void) +{ + disk_util_prune_entries(); + helper_thread_destroy(); +} + +int fio_backend(struct sk_out *sk_out) +{ + struct thread_data *td; + int i; + + if (exec_profile) { + if (load_profile(exec_profile)) + return 1; + free(exec_profile); + exec_profile = NULL; + } + if (!thread_number) + return 0; + + if (write_bw_log) { + struct log_params p = { + .log_type = IO_LOG_TYPE_BW, + }; + + setup_log(&agg_io_log[DDIR_READ], &p, "agg-read_bw.log"); + setup_log(&agg_io_log[DDIR_WRITE], &p, "agg-write_bw.log"); + setup_log(&agg_io_log[DDIR_TRIM], &p, "agg-trim_bw.log"); + } + + startup_sem = fio_sem_init(FIO_SEM_LOCKED); + if (!sk_out) + is_local_backend = true; + if (startup_sem == NULL) + return 1; + + set_genesis_time(); + stat_init(); + if (helper_thread_create(startup_sem, sk_out)) + log_err("fio: failed to create helper thread\n"); + + cgroup_list = smalloc(sizeof(*cgroup_list)); + if (cgroup_list) + INIT_FLIST_HEAD(cgroup_list); + + run_threads(sk_out); + + helper_thread_exit(); + + if (!fio_abort) { + __show_run_stats(); + if (write_bw_log) { + for (i = 0; i < DDIR_RWDIR_CNT; i++) { + struct io_log *log = agg_io_log[i]; + + flush_log(log, false); + free_log(log); + } + } + } + + for_each_td(td, i) { + steadystate_free(td); + fio_options_free(td); + if (td->rusage_sem) { + fio_sem_remove(td->rusage_sem); + td->rusage_sem = NULL; + } + fio_sem_remove(td->sem); + td->sem = NULL; + } + + free_disk_util(); + if (cgroup_list) { + cgroup_kill(cgroup_list); + sfree(cgroup_list); + } + + fio_sem_remove(startup_sem); + stat_exit(); + return exit_value; +} diff --git a/blktrace.c b/blktrace.c new file mode 100644 index 0000000..64a610a --- /dev/null +++ b/blktrace.c @@ -0,0 +1,801 @@ +/* + * blktrace support code for fio + */ +#include +#include +#include + +#include "flist.h" +#include "fio.h" +#include "blktrace.h" +#include "blktrace_api.h" +#include "oslib/linux-dev-lookup.h" + +#define TRACE_FIFO_SIZE 8192 + +/* + * fifo refill frontend, to avoid reading data in trace sized bites + */ +static int refill_fifo(struct thread_data *td, struct fifo *fifo, int fd) +{ + char buf[TRACE_FIFO_SIZE]; + unsigned int total; + int ret; + + total = sizeof(buf); + if (total > fifo_room(fifo)) + total = fifo_room(fifo); + + ret = read(fd, buf, total); + if (ret < 0) { + int read_err = errno; + + assert(read_err > 0); + td_verror(td, read_err, "read blktrace file"); + return -read_err; + } + + if (ret > 0) + ret = fifo_put(fifo, buf, ret); + + dprint(FD_BLKTRACE, "refill: filled %d bytes\n", ret); + return ret; +} + +/* + * Retrieve 'len' bytes from the fifo, refilling if necessary. + */ +static int trace_fifo_get(struct thread_data *td, struct fifo *fifo, int fd, + void *buf, unsigned int len) +{ + if (fifo_len(fifo) < len) { + int ret = refill_fifo(td, fifo, fd); + + if (ret < 0) + return ret; + } + + return fifo_get(fifo, buf, len); +} + +/* + * Just discard the pdu by seeking past it. 
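+ * No lseek() is actually needed for that: the payload is pulled
+ * through the fifo and dropped by passing a NULL destination
+ * buffer, i.e.
+ *
+ *	trace_fifo_get(td, fifo, fd, NULL, t->pdu_len);
+ *
+ * which leaves the stream positioned at the next blk_io_trace
+ * record.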
+ */ +static int discard_pdu(struct thread_data *td, struct fifo *fifo, int fd, + struct blk_io_trace *t) +{ + if (t->pdu_len == 0) + return 0; + + dprint(FD_BLKTRACE, "discard pdu len %u\n", t->pdu_len); + return trace_fifo_get(td, fifo, fd, NULL, t->pdu_len); +} + +/* + * Check if this is a blktrace binary data file. We read a single trace + * into memory and check for the magic signature. + */ +bool is_blktrace(const char *filename, int *need_swap) +{ + struct blk_io_trace t; + int fd, ret; + + fd = open(filename, O_RDONLY); + if (fd < 0) + return false; + + ret = read(fd, &t, sizeof(t)); + close(fd); + + if (ret < 0) { + perror("read blktrace"); + return false; + } else if (ret != sizeof(t)) { + log_err("fio: short read on blktrace file\n"); + return false; + } + + if ((t.magic & 0xffffff00) == BLK_IO_TRACE_MAGIC) { + *need_swap = 0; + return true; + } + + /* + * Maybe it needs to be endian swapped... + */ + t.magic = fio_swap32(t.magic); + if ((t.magic & 0xffffff00) == BLK_IO_TRACE_MAGIC) { + *need_swap = 1; + return true; + } + + return false; +} + +#define FMINORBITS 20 +#define FMINORMASK ((1U << FMINORBITS) - 1) +#define FMAJOR(dev) ((unsigned int) ((dev) >> FMINORBITS)) +#define FMINOR(dev) ((unsigned int) ((dev) & FMINORMASK)) + +static void trace_add_open_close_event(struct thread_data *td, int fileno, enum file_log_act action) +{ + struct io_piece *ipo; + + ipo = calloc(1, sizeof(*ipo)); + init_ipo(ipo); + + ipo->ddir = DDIR_INVAL; + ipo->fileno = fileno; + ipo->file_action = action; + flist_add_tail(&ipo->list, &td->io_log_list); +} + +static int trace_add_file(struct thread_data *td, __u32 device) +{ + static unsigned int last_maj, last_min, last_fileno; + unsigned int maj = FMAJOR(device); + unsigned int min = FMINOR(device); + struct fio_file *f; + char dev[256]; + unsigned int i; + + if (last_maj == maj && last_min == min) + return last_fileno; + + last_maj = maj; + last_min = min; + + /* + * check for this file in our list + */ + for_each_file(td, f, i) + if (f->major == maj && f->minor == min) { + last_fileno = f->fileno; + return last_fileno; + } + + strcpy(dev, "/dev"); + if (blktrace_lookup_device(td->o.replay_redirect, dev, maj, min)) { + int fileno; + + if (td->o.replay_redirect) + dprint(FD_BLKTRACE, "device lookup: %d/%d\n overridden" + " with: %s\n", maj, min, + td->o.replay_redirect); + else + dprint(FD_BLKTRACE, "device lookup: %d/%d\n", maj, min); + + dprint(FD_BLKTRACE, "add devices %s\n", dev); + fileno = add_file_exclusive(td, dev); + td->o.open_files++; + td->files[fileno]->major = maj; + td->files[fileno]->minor = min; + trace_add_open_close_event(td, fileno, FIO_LOG_OPEN_FILE); + last_fileno = fileno; + } + + return last_fileno; +} + +static void t_bytes_align(struct thread_options *o, struct blk_io_trace *t) +{ + if (!o->replay_align) + return; + + t->bytes = (t->bytes + o->replay_align - 1) & ~(o->replay_align - 1); +} + +/* + * Store blk_io_trace data in an ipo for later retrieval. 
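+ * Note the unit conversions: trace offsets arrive as 512-byte
+ * sectors and timestamps as nanoseconds, while an io_piece wants
+ * bytes and usecs. E.g. sector 2048, queued 2500000ns after the
+ * previous event, is stored as
+ *
+ *	offset = 2048 * 512 = 1048576 bytes (1 MiB)
+ *	delay  = 2500000 / 1000 = 2500 usec
+ *
+ * before any replay_scale / replay_align adjustment.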
+ */ +static void store_ipo(struct thread_data *td, unsigned long long offset, + unsigned int bytes, int rw, unsigned long long ttime, + int fileno) +{ + struct io_piece *ipo; + + ipo = calloc(1, sizeof(*ipo)); + init_ipo(ipo); + + ipo->offset = offset * 512; + if (td->o.replay_scale) + ipo->offset = ipo->offset / td->o.replay_scale; + ipo_bytes_align(td->o.replay_align, ipo); + ipo->len = bytes; + ipo->delay = ttime / 1000; + if (rw) + ipo->ddir = DDIR_WRITE; + else + ipo->ddir = DDIR_READ; + ipo->fileno = fileno; + + dprint(FD_BLKTRACE, "store ddir=%d, off=%llu, len=%lu, delay=%lu\n", + ipo->ddir, ipo->offset, + ipo->len, ipo->delay); + queue_io_piece(td, ipo); +} + +static void handle_trace_notify(struct blk_io_trace *t) +{ + switch (t->action) { + case BLK_TN_PROCESS: + dprint(FD_BLKTRACE, "got process notify: %x, %d\n", + t->action, t->pid); + break; + case BLK_TN_TIMESTAMP: + dprint(FD_BLKTRACE, "got timestamp notify: %x, %d\n", + t->action, t->pid); + break; + case BLK_TN_MESSAGE: + break; + default: + dprint(FD_BLKTRACE, "unknown trace act %x\n", t->action); + break; + } +} + +static void handle_trace_discard(struct thread_data *td, + struct blk_io_trace *t, + unsigned long long ttime, + unsigned long *ios, unsigned int *bs) +{ + struct io_piece *ipo; + int fileno; + + if (td->o.replay_skip & (1u << DDIR_TRIM)) + return; + + ipo = calloc(1, sizeof(*ipo)); + init_ipo(ipo); + fileno = trace_add_file(td, t->device); + + ios[DDIR_TRIM]++; + if (t->bytes > bs[DDIR_TRIM]) + bs[DDIR_TRIM] = t->bytes; + + td->o.size += t->bytes; + + INIT_FLIST_HEAD(&ipo->list); + + ipo->offset = t->sector * 512; + if (td->o.replay_scale) + ipo->offset = ipo->offset / td->o.replay_scale; + ipo_bytes_align(td->o.replay_align, ipo); + ipo->len = t->bytes; + ipo->delay = ttime / 1000; + ipo->ddir = DDIR_TRIM; + ipo->fileno = fileno; + + dprint(FD_BLKTRACE, "store discard, off=%llu, len=%lu, delay=%lu\n", + ipo->offset, ipo->len, + ipo->delay); + queue_io_piece(td, ipo); +} + +static void dump_trace(struct blk_io_trace *t) +{ + log_err("blktrace: ignoring zero byte trace: action=%x\n", t->action); +} + +static void handle_trace_fs(struct thread_data *td, struct blk_io_trace *t, + unsigned long long ttime, unsigned long *ios, + unsigned int *bs) +{ + int rw; + int fileno; + + fileno = trace_add_file(td, t->device); + + rw = (t->action & BLK_TC_ACT(BLK_TC_WRITE)) != 0; + + if (rw) { + if (td->o.replay_skip & (1u << DDIR_WRITE)) + return; + } else { + if (td->o.replay_skip & (1u << DDIR_READ)) + return; + } + + if (!t->bytes) { + if (!fio_did_warn(FIO_WARN_BTRACE_ZERO)) + dump_trace(t); + return; + } + + if (t->bytes > bs[rw]) + bs[rw] = t->bytes; + + ios[rw]++; + td->o.size += t->bytes; + store_ipo(td, t->sector, t->bytes, rw, ttime, fileno); +} + +static void handle_trace_flush(struct thread_data *td, struct blk_io_trace *t, + unsigned long long ttime, unsigned long *ios) +{ + struct io_piece *ipo; + int fileno; + + if (td->o.replay_skip & (1u << DDIR_SYNC)) + return; + + ipo = calloc(1, sizeof(*ipo)); + init_ipo(ipo); + fileno = trace_add_file(td, t->device); + + ipo->delay = ttime / 1000; + ipo->ddir = DDIR_SYNC; + ipo->fileno = fileno; + + ios[DDIR_SYNC]++; + dprint(FD_BLKTRACE, "store flush delay=%lu\n", ipo->delay); + queue_io_piece(td, ipo); +} + +/* + * We only care for queue traces, most of the others are side effects + * due to internal workings of the block layer. 
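+ * The replay delay is simply the gap between consecutive
+ * queue-trace timestamps, optionally rescaled:
+ *
+ *	delay = (t->time - last_ttime) * 100 / replay_time_scale;
+ *
+ * e.g. replay_time_scale=50 doubles every gap (half-speed
+ * replay) and 200 halves it; no_stall forces the delay to 0.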
+ */ +static void handle_trace(struct thread_data *td, struct blk_io_trace *t, + unsigned long *ios, unsigned int *bs) +{ + static unsigned long long last_ttime; + unsigned long long delay = 0; + + if ((t->action & 0xffff) != __BLK_TA_QUEUE) + return; + + if (!(t->action & BLK_TC_ACT(BLK_TC_NOTIFY))) { + if (!last_ttime || td->o.no_stall) + delay = 0; + else if (td->o.replay_time_scale == 100) + delay = t->time - last_ttime; + else { + double tmp = t->time - last_ttime; + double scale; + + scale = (double) 100.0 / (double) td->o.replay_time_scale; + tmp *= scale; + delay = tmp; + } + last_ttime = t->time; + } + + t_bytes_align(&td->o, t); + + if (t->action & BLK_TC_ACT(BLK_TC_NOTIFY)) + handle_trace_notify(t); + else if (t->action & BLK_TC_ACT(BLK_TC_DISCARD)) + handle_trace_discard(td, t, delay, ios, bs); + else if (t->action & BLK_TC_ACT(BLK_TC_FLUSH)) + handle_trace_flush(td, t, delay, ios); + else + handle_trace_fs(td, t, delay, ios, bs); +} + +static void byteswap_trace(struct blk_io_trace *t) +{ + t->magic = fio_swap32(t->magic); + t->sequence = fio_swap32(t->sequence); + t->time = fio_swap64(t->time); + t->sector = fio_swap64(t->sector); + t->bytes = fio_swap32(t->bytes); + t->action = fio_swap32(t->action); + t->pid = fio_swap32(t->pid); + t->device = fio_swap32(t->device); + t->cpu = fio_swap32(t->cpu); + t->error = fio_swap16(t->error); + t->pdu_len = fio_swap16(t->pdu_len); +} + +static bool t_is_write(struct blk_io_trace *t) +{ + return (t->action & BLK_TC_ACT(BLK_TC_WRITE | BLK_TC_DISCARD)) != 0; +} + +static enum fio_ddir t_get_ddir(struct blk_io_trace *t) +{ + if (t->action & BLK_TC_ACT(BLK_TC_READ)) + return DDIR_READ; + else if (t->action & BLK_TC_ACT(BLK_TC_WRITE)) + return DDIR_WRITE; + else if (t->action & BLK_TC_ACT(BLK_TC_DISCARD)) + return DDIR_TRIM; + + return DDIR_INVAL; +} + +static void depth_inc(struct blk_io_trace *t, int *depth) +{ + enum fio_ddir ddir; + + ddir = t_get_ddir(t); + if (ddir != DDIR_INVAL) + depth[ddir]++; +} + +static void depth_dec(struct blk_io_trace *t, int *depth) +{ + enum fio_ddir ddir; + + ddir = t_get_ddir(t); + if (ddir != DDIR_INVAL) + depth[ddir]--; +} + +static void depth_end(struct blk_io_trace *t, int *this_depth, int *depth) +{ + enum fio_ddir ddir = DDIR_INVAL; + + ddir = t_get_ddir(t); + if (ddir != DDIR_INVAL) { + depth[ddir] = max(depth[ddir], this_depth[ddir]); + this_depth[ddir] = 0; + } +} + +/* + * Load a blktrace file by reading all the blk_io_trace entries, and storing + * them as io_pieces like the fio text version would do. 
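+ * Every record is validated before use: the upper 24 bits of
+ * t.magic must equal BLK_IO_TRACE_MAGIC (0x65617400, the ASCII
+ * bytes "eat") and the low byte must match
+ * BLK_IO_TRACE_VERSION:
+ *
+ *	(t.magic & 0xffffff00) == BLK_IO_TRACE_MAGIC
+ *	(t.magic & 0xff) == BLK_IO_TRACE_VERSION
+ *
+ * Queue/merge/complete events are also counted to probe a
+ * usable iodepth for the replay.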
+ */ +bool load_blktrace(struct thread_data *td, const char *filename, int need_swap) +{ + struct blk_io_trace t; + unsigned long ios[DDIR_RWDIR_SYNC_CNT] = { }; + unsigned int rw_bs[DDIR_RWDIR_CNT] = { }; + unsigned long skipped_writes; + struct fifo *fifo; + int fd, i, old_state, max_depth; + struct fio_file *f; + int this_depth[DDIR_RWDIR_CNT] = { }; + int depth[DDIR_RWDIR_CNT] = { }; + + fd = open(filename, O_RDONLY); + if (fd < 0) { + td_verror(td, errno, "open blktrace file"); + return false; + } + + fifo = fifo_alloc(TRACE_FIFO_SIZE); + + old_state = td_bump_runstate(td, TD_SETTING_UP); + + td->o.size = 0; + skipped_writes = 0; + do { + int ret = trace_fifo_get(td, fifo, fd, &t, sizeof(t)); + + if (ret < 0) + goto err; + else if (!ret) + break; + else if (ret < (int) sizeof(t)) { + log_err("fio: short fifo get\n"); + break; + } + + if (need_swap) + byteswap_trace(&t); + + if ((t.magic & 0xffffff00) != BLK_IO_TRACE_MAGIC) { + log_err("fio: bad magic in blktrace data: %x\n", + t.magic); + goto err; + } + if ((t.magic & 0xff) != BLK_IO_TRACE_VERSION) { + log_err("fio: bad blktrace version %d\n", + t.magic & 0xff); + goto err; + } + ret = discard_pdu(td, fifo, fd, &t); + if (ret < 0) { + td_verror(td, -ret, "blktrace lseek"); + goto err; + } else if (t.pdu_len != ret) { + log_err("fio: discarded %d of %d\n", ret, t.pdu_len); + goto err; + } + if ((t.action & BLK_TC_ACT(BLK_TC_NOTIFY)) == 0) { + if ((t.action & 0xffff) == __BLK_TA_QUEUE) + depth_inc(&t, this_depth); + else if (((t.action & 0xffff) == __BLK_TA_BACKMERGE) || + ((t.action & 0xffff) == __BLK_TA_FRONTMERGE)) + depth_dec(&t, this_depth); + else if ((t.action & 0xffff) == __BLK_TA_COMPLETE) + depth_end(&t, this_depth, depth); + + if (t_is_write(&t) && read_only) { + skipped_writes++; + continue; + } + } + + handle_trace(td, &t, ios, rw_bs); + } while (1); + + for_each_file(td, f, i) + trace_add_open_close_event(td, f->fileno, FIO_LOG_CLOSE_FILE); + + fifo_free(fifo); + close(fd); + + td_restore_runstate(td, old_state); + + if (!td->files_index) { + log_err("fio: did not find replay device(s)\n"); + return false; + } + + /* + * For stacked devices, we don't always get a COMPLETE event so + * the depth grows to insane values. Limit it to something sane(r). + */ + max_depth = 0; + for (i = 0; i < DDIR_RWDIR_CNT; i++) { + if (depth[i] > 1024) + depth[i] = 1024; + else if (!depth[i] && ios[i]) + depth[i] = 1; + max_depth = max(depth[i], max_depth); + } + + if (skipped_writes) + log_err("fio: %s skips replay of %lu writes due to read-only\n", + td->o.name, skipped_writes); + + if (!ios[DDIR_READ] && !ios[DDIR_WRITE] && !ios[DDIR_TRIM] && + !ios[DDIR_SYNC]) { + log_err("fio: found no ios in blktrace data\n"); + return false; + } + + td->o.td_ddir = 0; + if (ios[DDIR_READ]) { + td->o.td_ddir |= TD_DDIR_READ; + td->o.max_bs[DDIR_READ] = rw_bs[DDIR_READ]; + } + if (ios[DDIR_WRITE]) { + td->o.td_ddir |= TD_DDIR_WRITE; + td->o.max_bs[DDIR_WRITE] = rw_bs[DDIR_WRITE]; + } + if (ios[DDIR_TRIM]) { + td->o.td_ddir |= TD_DDIR_TRIM; + td->o.max_bs[DDIR_TRIM] = rw_bs[DDIR_TRIM]; + } + + /* + * We need to do direct/raw ios to the device, to avoid getting + * read-ahead in our way. But only do so if the minimum block size + * is a multiple of 4k, otherwise we don't know if it's safe to do so. 
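+ * Since 4096 is a power of two, "multiple of 4k" reduces to
+ * checking that the low 12 bits are clear, e.g.
+ *
+ *	8192 & 4095 == 0	-> O_DIRECT is enabled
+ *	4608 & 4095 == 512	-> stays with buffered io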
+ */ + if (!fio_option_is_set(&td->o, odirect) && !(td_min_bs(td) & 4095)) + td->o.odirect = 1; + + /* + * If depth wasn't manually set, use probed depth + */ + if (!fio_option_is_set(&td->o, iodepth)) + td->o.iodepth = td->o.iodepth_low = max_depth; + + return true; +err: + close(fd); + fifo_free(fifo); + return false; +} + +static int init_merge_param_list(fio_fp64_t *vals, struct blktrace_cursor *bcs, + int nr_logs, int def, size_t off) +{ + int i = 0, len = 0; + + while (len < FIO_IO_U_LIST_MAX_LEN && vals[len].u.f != 0.0) + len++; + + if (len && len != nr_logs) + return len; + + for (i = 0; i < nr_logs; i++) { + int *val = (int *)((char *)&bcs[i] + off); + *val = def; + if (len) + *val = (int)vals[i].u.f; + } + + return 0; + +} + +static int find_earliest_io(struct blktrace_cursor *bcs, int nr_logs) +{ + __u64 time = ~(__u64)0; + int idx = 0, i; + + for (i = 0; i < nr_logs; i++) { + if (bcs[i].t.time < time) { + time = bcs[i].t.time; + idx = i; + } + } + + return idx; +} + +static void merge_finish_file(struct blktrace_cursor *bcs, int i, int *nr_logs) +{ + bcs[i].iter++; + if (bcs[i].iter < bcs[i].nr_iter) { + lseek(bcs[i].fd, 0, SEEK_SET); + return; + } + + *nr_logs -= 1; + + /* close file */ + fifo_free(bcs[i].fifo); + close(bcs[i].fd); + + /* keep active files contiguous */ + memmove(&bcs[i], &bcs[*nr_logs], sizeof(bcs[i])); +} + +static int read_trace(struct thread_data *td, struct blktrace_cursor *bc) +{ + int ret = 0; + struct blk_io_trace *t = &bc->t; + +read_skip: + /* read an io trace */ + ret = trace_fifo_get(td, bc->fifo, bc->fd, t, sizeof(*t)); + if (ret < 0) { + return ret; + } else if (!ret) { + if (!bc->length) + bc->length = bc->t.time; + return ret; + } else if (ret < (int) sizeof(*t)) { + log_err("fio: short fifo get\n"); + return -1; + } + + if (bc->swap) + byteswap_trace(t); + + /* skip over actions that fio does not care about */ + if ((t->action & 0xffff) != __BLK_TA_QUEUE || + t_get_ddir(t) == DDIR_INVAL) { + ret = discard_pdu(td, bc->fifo, bc->fd, t); + if (ret < 0) { + td_verror(td, -ret, "blktrace lseek"); + return ret; + } else if (t->pdu_len != ret) { + log_err("fio: discarded %d of %d\n", ret, + t->pdu_len); + return -1; + } + goto read_skip; + } + + t->time = (t->time + bc->iter * bc->length) * bc->scalar / 100; + + return ret; +} + +static int write_trace(FILE *fp, struct blk_io_trace *t) +{ + /* pdu is not used so just write out only the io trace */ + t->pdu_len = 0; + return fwrite((void *)t, sizeof(*t), 1, fp); +} + +int merge_blktrace_iologs(struct thread_data *td) +{ + int nr_logs = get_max_str_idx(td->o.read_iolog_file); + struct blktrace_cursor *bcs = malloc(sizeof(struct blktrace_cursor) * + nr_logs); + struct blktrace_cursor *bc; + FILE *merge_fp; + char *str, *ptr, *name, *merge_buf; + int i, ret; + + ret = init_merge_param_list(td->o.merge_blktrace_scalars, bcs, nr_logs, + 100, offsetof(struct blktrace_cursor, + scalar)); + if (ret) { + log_err("fio: merge_blktrace_scalars(%d) != nr_logs(%d)\n", + ret, nr_logs); + goto err_param; + } + + ret = init_merge_param_list(td->o.merge_blktrace_iters, bcs, nr_logs, + 1, offsetof(struct blktrace_cursor, + nr_iter)); + if (ret) { + log_err("fio: merge_blktrace_iters(%d) != nr_logs(%d)\n", + ret, nr_logs); + goto err_param; + } + + /* setup output file */ + merge_fp = fopen(td->o.merge_blktrace_file, "w"); + merge_buf = malloc(128 * 1024); + if (!merge_buf) + goto err_out_file; + ret = setvbuf(merge_fp, merge_buf, _IOFBF, 128 * 1024); + if (ret) + goto err_merge_buf; + + /* setup input files */ + str = 
ptr = strdup(td->o.read_iolog_file); + nr_logs = 0; + for (i = 0; (name = get_next_str(&ptr)) != NULL; i++) { + bcs[i].fd = open(name, O_RDONLY); + if (bcs[i].fd < 0) { + log_err("fio: could not open file: %s\n", name); + ret = bcs[i].fd; + free(str); + goto err_file; + } + bcs[i].fifo = fifo_alloc(TRACE_FIFO_SIZE); + nr_logs++; + + if (!is_blktrace(name, &bcs[i].swap)) { + log_err("fio: file is not a blktrace: %s\n", name); + free(str); + goto err_file; + } + + ret = read_trace(td, &bcs[i]); + if (ret < 0) { + free(str); + goto err_file; + } else if (!ret) { + merge_finish_file(bcs, i, &nr_logs); + i--; + } + } + free(str); + + /* merge files */ + while (nr_logs) { + i = find_earliest_io(bcs, nr_logs); + bc = &bcs[i]; + /* skip over the pdu */ + ret = discard_pdu(td, bc->fifo, bc->fd, &bc->t); + if (ret < 0) { + td_verror(td, -ret, "blktrace lseek"); + goto err_file; + } else if (bc->t.pdu_len != ret) { + log_err("fio: discarded %d of %d\n", ret, + bc->t.pdu_len); + goto err_file; + } + + ret = write_trace(merge_fp, &bc->t); + ret = read_trace(td, bc); + if (ret < 0) + goto err_file; + else if (!ret) + merge_finish_file(bcs, i, &nr_logs); + } + + /* set iolog file to read from the newly merged file */ + td->o.read_iolog_file = td->o.merge_blktrace_file; + ret = 0; + +err_file: + /* cleanup */ + for (i = 0; i < nr_logs; i++) { + fifo_free(bcs[i].fifo); + close(bcs[i].fd); + } +err_merge_buf: + free(merge_buf); +err_out_file: + fflush(merge_fp); + fclose(merge_fp); +err_param: + free(bcs); + + return ret; +} diff --git a/blktrace.h b/blktrace.h new file mode 100644 index 0000000..a0e82fa --- /dev/null +++ b/blktrace.h @@ -0,0 +1,45 @@ +#ifndef FIO_BLKTRACE_H +#define FIO_BLKTRACE_H + + +#ifdef FIO_HAVE_BLKTRACE + +#include + +#include "blktrace_api.h" + +struct blktrace_cursor { + struct fifo *fifo; // fifo queue for reading + int fd; // blktrace file + __u64 length; // length of trace + struct blk_io_trace t; // current io trace + int swap; // bitwise reverse required + int scalar; // scale percentage + int iter; // current iteration + int nr_iter; // number of iterations to run +}; + +bool is_blktrace(const char *, int *); +bool load_blktrace(struct thread_data *, const char *, int); +int merge_blktrace_iologs(struct thread_data *td); + +#else + +static inline bool is_blktrace(const char *fname, int *need_swap) +{ + return false; +} + +static inline bool load_blktrace(struct thread_data *td, const char *fname, + int need_swap) +{ + return false; +} + +static inline int merge_blktrace_iologs(struct thread_data *td) +{ + return false; +} + +#endif +#endif diff --git a/blktrace_api.h b/blktrace_api.h new file mode 100644 index 0000000..32ce1d8 --- /dev/null +++ b/blktrace_api.h @@ -0,0 +1,130 @@ +#ifndef BLKTRACEAPI_H +#define BLKTRACEAPI_H + +#include + +/* + * Trace categories + */ +enum { + BLK_TC_READ = 1 << 0, /* reads */ + BLK_TC_WRITE = 1 << 1, /* writes */ + BLK_TC_FLUSH = 1 << 2, /* flush */ + BLK_TC_SYNC = 1 << 3, /* sync */ + BLK_TC_QUEUE = 1 << 4, /* queueing/merging */ + BLK_TC_REQUEUE = 1 << 5, /* requeueing */ + BLK_TC_ISSUE = 1 << 6, /* issue */ + BLK_TC_COMPLETE = 1 << 7, /* completions */ + BLK_TC_FS = 1 << 8, /* fs requests */ + BLK_TC_PC = 1 << 9, /* pc requests */ + BLK_TC_NOTIFY = 1 << 10, /* special message */ + BLK_TC_AHEAD = 1 << 11, /* readahead */ + BLK_TC_META = 1 << 12, /* metadata */ + BLK_TC_DISCARD = 1 << 13, /* discard requests */ + BLK_TC_DRV_DATA = 1 << 14, /* binary per-driver data */ + + BLK_TC_END = 1 << 15, /* only 16-bits, reminder */ +}; + +#define 
BLK_TC_SHIFT (16) +#define BLK_TC_ACT(act) ((act) << BLK_TC_SHIFT) + +/* + * Basic trace actions + */ +enum { + __BLK_TA_QUEUE = 1, /* queued */ + __BLK_TA_BACKMERGE, /* back merged to existing rq */ + __BLK_TA_FRONTMERGE, /* front merge to existing rq */ + __BLK_TA_GETRQ, /* allocated new request */ + __BLK_TA_SLEEPRQ, /* sleeping on rq allocation */ + __BLK_TA_REQUEUE, /* request requeued */ + __BLK_TA_ISSUE, /* sent to driver */ + __BLK_TA_COMPLETE, /* completed by driver */ + __BLK_TA_PLUG, /* queue was plugged */ + __BLK_TA_UNPLUG_IO, /* queue was unplugged by io */ + __BLK_TA_UNPLUG_TIMER, /* queue was unplugged by timer */ + __BLK_TA_INSERT, /* insert request */ + __BLK_TA_SPLIT, /* bio was split */ + __BLK_TA_BOUNCE, /* bio was bounced */ + __BLK_TA_REMAP, /* bio was remapped */ + __BLK_TA_ABORT, /* request aborted */ + __BLK_TA_DRV_DATA, /* driver-specific binary data */ +}; + +/* + * Notify events. + */ +enum blktrace_notify { + __BLK_TN_PROCESS = 0, /* establish pid/name mapping */ + __BLK_TN_TIMESTAMP, /* include system clock */ + __BLK_TN_MESSAGE, /* Character string message */ +}; + +/* + * Trace actions in full. Additionally, read or write is masked + */ +#define BLK_TA_QUEUE (__BLK_TA_QUEUE | BLK_TC_ACT(BLK_TC_QUEUE)) +#define BLK_TA_BACKMERGE (__BLK_TA_BACKMERGE | BLK_TC_ACT(BLK_TC_QUEUE)) +#define BLK_TA_FRONTMERGE (__BLK_TA_FRONTMERGE | BLK_TC_ACT(BLK_TC_QUEUE)) +#define BLK_TA_GETRQ (__BLK_TA_GETRQ | BLK_TC_ACT(BLK_TC_QUEUE)) +#define BLK_TA_SLEEPRQ (__BLK_TA_SLEEPRQ | BLK_TC_ACT(BLK_TC_QUEUE)) +#define BLK_TA_REQUEUE (__BLK_TA_REQUEUE | BLK_TC_ACT(BLK_TC_REQUEUE)) +#define BLK_TA_ISSUE (__BLK_TA_ISSUE | BLK_TC_ACT(BLK_TC_ISSUE)) +#define BLK_TA_COMPLETE (__BLK_TA_COMPLETE| BLK_TC_ACT(BLK_TC_COMPLETE)) +#define BLK_TA_PLUG (__BLK_TA_PLUG | BLK_TC_ACT(BLK_TC_QUEUE)) +#define BLK_TA_UNPLUG_IO (__BLK_TA_UNPLUG_IO | BLK_TC_ACT(BLK_TC_QUEUE)) +#define BLK_TA_UNPLUG_TIMER (__BLK_TA_UNPLUG_TIMER | BLK_TC_ACT(BLK_TC_QUEUE)) +#define BLK_TA_INSERT (__BLK_TA_INSERT | BLK_TC_ACT(BLK_TC_QUEUE)) +#define BLK_TA_SPLIT (__BLK_TA_SPLIT) +#define BLK_TA_BOUNCE (__BLK_TA_BOUNCE) +#define BLK_TA_REMAP (__BLK_TA_REMAP | BLK_TC_ACT(BLK_TC_QUEUE)) +#define BLK_TA_DRV_DATA (__BLK_TA_DRV_DATA | BLK_TC_ACT(BLK_TC_DRV_DATA)) + +#define BLK_TN_PROCESS (__BLK_TN_PROCESS | BLK_TC_ACT(BLK_TC_NOTIFY)) +#define BLK_TN_TIMESTAMP (__BLK_TN_TIMESTAMP | BLK_TC_ACT(BLK_TC_NOTIFY)) +#define BLK_TN_MESSAGE (__BLK_TN_MESSAGE | BLK_TC_ACT(BLK_TC_NOTIFY)) + +#define BLK_IO_TRACE_MAGIC 0x65617400 +#define BLK_IO_TRACE_VERSION 0x07 + +/* + * The trace itself + */ +struct blk_io_trace { + __u32 magic; /* MAGIC << 8 | version */ + __u32 sequence; /* event number */ + __u64 time; /* in nanoseconds */ + __u64 sector; /* disk offset */ + __u32 bytes; /* transfer length */ + __u32 action; /* what happened */ + __u32 pid; /* who did it */ + __u32 device; /* device identifier (dev_t) */ + __u32 cpu; /* on what cpu did it happen */ + __u16 error; /* completion error */ + __u16 pdu_len; /* length of data after this trace */ +}; + +/* + * The remap event + */ +struct blk_io_trace_remap { + __u32 device; + __u32 device_from; + __u64 sector; +}; + +/* + * User setup structure passed with BLKSTARTTRACE + */ +struct blk_user_trace_setup { + char name[32]; /* output */ + __u16 act_mask; /* input */ + __u32 buf_size; /* input */ + __u32 buf_nr; /* input */ + __u64 start_lba; + __u64 end_lba; + __u32 pid; +}; + +#endif diff --git a/cairo_text_helpers.c b/cairo_text_helpers.c new file mode 100644 index 0000000..19fb8e0 --- 
/dev/null +++ b/cairo_text_helpers.c @@ -0,0 +1,85 @@ +#include +#include +#include + +static void draw_aligned_text(cairo_t *cr, const char *font, double x, double y, + double fontsize, const char *text, int alignment) +{ +#define CENTERED 0 +#define LEFT_JUSTIFIED 1 +#define RIGHT_JUSTIFIED 2 + + double factor, direction; + cairo_text_extents_t extents; + + switch (alignment) { + case CENTERED: + direction = -1.0; + factor = 0.5; + break; + case RIGHT_JUSTIFIED: + direction = -1.0; + factor = 1.0; + break; + case LEFT_JUSTIFIED: + default: + direction = 1.0; + factor = 0.0; + break; + } + cairo_select_font_face(cr, font, CAIRO_FONT_SLANT_NORMAL, CAIRO_FONT_WEIGHT_NORMAL); + + cairo_set_font_size(cr, fontsize); + cairo_text_extents(cr, text, &extents); + x = x + direction * (factor * extents.width + extents.x_bearing); + y = y - (extents.height / 2 + extents.y_bearing); + + cairo_move_to(cr, x, y); + cairo_show_text(cr, text); +} + +void draw_centered_text(cairo_t *cr, const char *font, double x, double y, + double fontsize, const char *text) +{ + draw_aligned_text(cr, font, x, y, fontsize, text, CENTERED); +} + +void draw_right_justified_text(cairo_t *cr, const char *font, + double x, double y, + double fontsize, const char *text) +{ + draw_aligned_text(cr, font, x, y, fontsize, text, RIGHT_JUSTIFIED); +} + +void draw_left_justified_text(cairo_t *cr, const char *font, + double x, double y, + double fontsize, const char *text) +{ + draw_aligned_text(cr, font, x, y, fontsize, text, LEFT_JUSTIFIED); +} + +void draw_vertical_centered_text(cairo_t *cr, const char *font, double x, + double y, double fontsize, + const char *text) +{ + double sx, sy; + cairo_text_extents_t extents; + + cairo_select_font_face(cr, font, CAIRO_FONT_SLANT_NORMAL, CAIRO_FONT_WEIGHT_NORMAL); + + cairo_set_font_size(cr, fontsize); + cairo_text_extents(cr, text, &extents); + sx = x; + sy = y; + y = y + (extents.width / 2.0 + extents.x_bearing); + x = x - (extents.height / 2.0 + extents.y_bearing); + + cairo_move_to(cr, x, y); + cairo_save(cr); + cairo_translate(cr, -sx, -sy); + cairo_rotate(cr, -90.0 * M_PI / 180.0); + cairo_translate(cr, sx, sy); + cairo_show_text(cr, text); + cairo_restore(cr); +} + diff --git a/cairo_text_helpers.h b/cairo_text_helpers.h new file mode 100644 index 0000000..014001a --- /dev/null +++ b/cairo_text_helpers.h @@ -0,0 +1,18 @@ +#ifndef CAIRO_TEXT_HELPERS_H +#define CAIRO_TEXT_HELPERS_H + +void draw_centered_text(cairo_t *cr, const char *font, double x, double y, + double fontsize, const char *text); + +void draw_right_justified_text(cairo_t *cr, const char *font, + double x, double y, + double fontsize, const char *text); + +void draw_left_justified_text(cairo_t *cr, const char *font, + double x, double y, + double fontsize, const char *text); + +void draw_vertical_centered_text(cairo_t *cr, const char *font, double x, + double y, double fontsize, + const char *text); +#endif diff --git a/cconv.c b/cconv.c new file mode 100644 index 0000000..48218dc --- /dev/null +++ b/cconv.c @@ -0,0 +1,617 @@ +#include + +#include "log.h" +#include "thread_options.h" + +static void string_to_cpu(char **dst, const uint8_t *src) +{ + const char *__src = (const char *) src; + + if (strlen(__src)) + *dst = strdup(__src); +} + +static void __string_to_net(uint8_t *dst, const char *src, size_t dst_size) +{ + if (src) + snprintf((char *) dst, dst_size, "%s", src); + else + dst[0] = '\0'; +} + +#define string_to_net(dst, src) __string_to_net((dst), (src), sizeof(dst)) + +static void 
free_thread_options_to_cpu(struct thread_options *o) +{ + int i; + + free(o->description); + free(o->name); + free(o->wait_for); + free(o->directory); + free(o->filename); + free(o->filename_format); + free(o->opendir); + free(o->ioengine); + free(o->mmapfile); + free(o->read_iolog_file); + free(o->write_iolog_file); + free(o->merge_blktrace_file); + free(o->bw_log_file); + free(o->lat_log_file); + free(o->iops_log_file); + free(o->hist_log_file); + free(o->replay_redirect); + free(o->exec_prerun); + free(o->exec_postrun); + free(o->ioscheduler); + free(o->profile); + free(o->cgroup); + + for (i = 0; i < DDIR_RWDIR_CNT; i++) { + free(o->bssplit[i]); + free(o->zone_split[i]); + } +} + +void convert_thread_options_to_cpu(struct thread_options *o, + struct thread_options_pack *top) +{ + int i, j; + + for (i = 0; i < NR_OPTS_SZ; i++) + o->set_options[i] = le64_to_cpu(top->set_options[i]); + + string_to_cpu(&o->description, top->description); + string_to_cpu(&o->name, top->name); + string_to_cpu(&o->wait_for, top->wait_for); + string_to_cpu(&o->directory, top->directory); + string_to_cpu(&o->filename, top->filename); + string_to_cpu(&o->filename_format, top->filename_format); + string_to_cpu(&o->opendir, top->opendir); + string_to_cpu(&o->ioengine, top->ioengine); + string_to_cpu(&o->mmapfile, top->mmapfile); + string_to_cpu(&o->read_iolog_file, top->read_iolog_file); + string_to_cpu(&o->write_iolog_file, top->write_iolog_file); + string_to_cpu(&o->merge_blktrace_file, top->merge_blktrace_file); + string_to_cpu(&o->bw_log_file, top->bw_log_file); + string_to_cpu(&o->lat_log_file, top->lat_log_file); + string_to_cpu(&o->iops_log_file, top->iops_log_file); + string_to_cpu(&o->hist_log_file, top->hist_log_file); + string_to_cpu(&o->replay_redirect, top->replay_redirect); + string_to_cpu(&o->exec_prerun, top->exec_prerun); + string_to_cpu(&o->exec_postrun, top->exec_postrun); + string_to_cpu(&o->ioscheduler, top->ioscheduler); + string_to_cpu(&o->profile, top->profile); + string_to_cpu(&o->cgroup, top->cgroup); + + o->allow_create = le32_to_cpu(top->allow_create); + o->allow_mounted_write = le32_to_cpu(top->allow_mounted_write); + o->td_ddir = le32_to_cpu(top->td_ddir); + o->rw_seq = le32_to_cpu(top->rw_seq); + o->kb_base = le32_to_cpu(top->kb_base); + o->unit_base = le32_to_cpu(top->unit_base); + o->ddir_seq_nr = le32_to_cpu(top->ddir_seq_nr); + o->ddir_seq_add = le64_to_cpu(top->ddir_seq_add); + o->iodepth = le32_to_cpu(top->iodepth); + o->iodepth_low = le32_to_cpu(top->iodepth_low); + o->iodepth_batch = le32_to_cpu(top->iodepth_batch); + o->iodepth_batch_complete_min = le32_to_cpu(top->iodepth_batch_complete_min); + o->iodepth_batch_complete_max = le32_to_cpu(top->iodepth_batch_complete_max); + o->serialize_overlap = le32_to_cpu(top->serialize_overlap); + o->size = le64_to_cpu(top->size); + o->io_size = le64_to_cpu(top->io_size); + o->size_percent = le32_to_cpu(top->size_percent); + o->fill_device = le32_to_cpu(top->fill_device); + o->file_append = le32_to_cpu(top->file_append); + o->file_size_low = le64_to_cpu(top->file_size_low); + o->file_size_high = le64_to_cpu(top->file_size_high); + o->start_offset = le64_to_cpu(top->start_offset); + o->start_offset_align = le64_to_cpu(top->start_offset_align); + o->start_offset_percent = le32_to_cpu(top->start_offset_percent); + + for (i = 0; i < DDIR_RWDIR_CNT; i++) { + o->bs[i] = le64_to_cpu(top->bs[i]); + o->ba[i] = le64_to_cpu(top->ba[i]); + o->min_bs[i] = le64_to_cpu(top->min_bs[i]); + o->max_bs[i] = le64_to_cpu(top->max_bs[i]); + o->bssplit_nr[i] = 
le32_to_cpu(top->bssplit_nr[i]); + + if (o->bssplit_nr[i]) { + o->bssplit[i] = malloc(o->bssplit_nr[i] * sizeof(struct bssplit)); + for (j = 0; j < o->bssplit_nr[i]; j++) { + o->bssplit[i][j].bs = le64_to_cpu(top->bssplit[i][j].bs); + o->bssplit[i][j].perc = le32_to_cpu(top->bssplit[i][j].perc); + } + } + + o->zone_split_nr[i] = le32_to_cpu(top->zone_split_nr[i]); + + if (o->zone_split_nr[i]) { + o->zone_split[i] = malloc(o->zone_split_nr[i] * sizeof(struct zone_split)); + for (j = 0; j < o->zone_split_nr[i]; j++) { + o->zone_split[i][j].access_perc = top->zone_split[i][j].access_perc; + o->zone_split[i][j].size_perc = top->zone_split[i][j].size_perc; + } + } + + o->rwmix[i] = le32_to_cpu(top->rwmix[i]); + o->rate[i] = le64_to_cpu(top->rate[i]); + o->ratemin[i] = le64_to_cpu(top->ratemin[i]); + o->rate_iops[i] = le32_to_cpu(top->rate_iops[i]); + o->rate_iops_min[i] = le32_to_cpu(top->rate_iops_min[i]); + + o->perc_rand[i] = le32_to_cpu(top->perc_rand[i]); + } + + o->ratecycle = le32_to_cpu(top->ratecycle); + o->io_submit_mode = le32_to_cpu(top->io_submit_mode); + o->unique_filename = le32_to_cpu(top->unique_filename); + o->nr_files = le32_to_cpu(top->nr_files); + o->open_files = le32_to_cpu(top->open_files); + o->file_lock_mode = le32_to_cpu(top->file_lock_mode); + o->odirect = le32_to_cpu(top->odirect); + o->oatomic = le32_to_cpu(top->oatomic); + o->invalidate_cache = le32_to_cpu(top->invalidate_cache); + o->create_serialize = le32_to_cpu(top->create_serialize); + o->create_fsync = le32_to_cpu(top->create_fsync); + o->create_on_open = le32_to_cpu(top->create_on_open); + o->create_only = le32_to_cpu(top->create_only); + o->end_fsync = le32_to_cpu(top->end_fsync); + o->pre_read = le32_to_cpu(top->pre_read); + o->sync_io = le32_to_cpu(top->sync_io); + o->write_hint = le32_to_cpu(top->write_hint); + o->verify = le32_to_cpu(top->verify); + o->do_verify = le32_to_cpu(top->do_verify); + o->experimental_verify = le32_to_cpu(top->experimental_verify); + o->verify_state = le32_to_cpu(top->verify_state); + o->verify_interval = le32_to_cpu(top->verify_interval); + o->verify_offset = le32_to_cpu(top->verify_offset); + + memcpy(o->verify_pattern, top->verify_pattern, MAX_PATTERN_SIZE); + memcpy(o->buffer_pattern, top->buffer_pattern, MAX_PATTERN_SIZE); + + o->verify_pattern_bytes = le32_to_cpu(top->verify_pattern_bytes); + o->verify_fatal = le32_to_cpu(top->verify_fatal); + o->verify_dump = le32_to_cpu(top->verify_dump); + o->verify_async = le32_to_cpu(top->verify_async); + o->verify_batch = le32_to_cpu(top->verify_batch); + o->use_thread = le32_to_cpu(top->use_thread); + o->unlink = le32_to_cpu(top->unlink); + o->unlink_each_loop = le32_to_cpu(top->unlink_each_loop); + o->do_disk_util = le32_to_cpu(top->do_disk_util); + o->override_sync = le32_to_cpu(top->override_sync); + o->rand_repeatable = le32_to_cpu(top->rand_repeatable); + o->allrand_repeatable = le32_to_cpu(top->allrand_repeatable); + o->rand_seed = le64_to_cpu(top->rand_seed); + o->log_avg_msec = le32_to_cpu(top->log_avg_msec); + o->log_hist_msec = le32_to_cpu(top->log_hist_msec); + o->log_hist_coarseness = le32_to_cpu(top->log_hist_coarseness); + o->log_max = le32_to_cpu(top->log_max); + o->log_offset = le32_to_cpu(top->log_offset); + o->log_gz = le32_to_cpu(top->log_gz); + o->log_gz_store = le32_to_cpu(top->log_gz_store); + o->log_unix_epoch = le32_to_cpu(top->log_unix_epoch); + o->norandommap = le32_to_cpu(top->norandommap); + o->softrandommap = le32_to_cpu(top->softrandommap); + o->bs_unaligned = le32_to_cpu(top->bs_unaligned); + 
o->fsync_on_close = le32_to_cpu(top->fsync_on_close); + o->bs_is_seq_rand = le32_to_cpu(top->bs_is_seq_rand); + o->random_distribution = le32_to_cpu(top->random_distribution); + o->exitall_error = le32_to_cpu(top->exitall_error); + o->zipf_theta.u.f = fio_uint64_to_double(le64_to_cpu(top->zipf_theta.u.i)); + o->pareto_h.u.f = fio_uint64_to_double(le64_to_cpu(top->pareto_h.u.i)); + o->gauss_dev.u.f = fio_uint64_to_double(le64_to_cpu(top->gauss_dev.u.i)); + o->random_generator = le32_to_cpu(top->random_generator); + o->hugepage_size = le32_to_cpu(top->hugepage_size); + o->rw_min_bs = le64_to_cpu(top->rw_min_bs); + o->thinktime = le32_to_cpu(top->thinktime); + o->thinktime_spin = le32_to_cpu(top->thinktime_spin); + o->thinktime_blocks = le32_to_cpu(top->thinktime_blocks); + o->fsync_blocks = le32_to_cpu(top->fsync_blocks); + o->fdatasync_blocks = le32_to_cpu(top->fdatasync_blocks); + o->barrier_blocks = le32_to_cpu(top->barrier_blocks); + + o->verify_backlog = le64_to_cpu(top->verify_backlog); + o->start_delay = le64_to_cpu(top->start_delay); + o->start_delay_high = le64_to_cpu(top->start_delay_high); + o->timeout = le64_to_cpu(top->timeout); + o->ramp_time = le64_to_cpu(top->ramp_time); + o->ss_dur = le64_to_cpu(top->ss_dur); + o->ss_ramp_time = le64_to_cpu(top->ss_ramp_time); + o->ss_state = le32_to_cpu(top->ss_state); + o->ss_limit.u.f = fio_uint64_to_double(le64_to_cpu(top->ss_limit.u.i)); + o->zone_range = le64_to_cpu(top->zone_range); + o->zone_size = le64_to_cpu(top->zone_size); + o->zone_skip = le64_to_cpu(top->zone_skip); + o->zone_mode = le32_to_cpu(top->zone_mode); + o->lockmem = le64_to_cpu(top->lockmem); + o->offset_increment_percent = le32_to_cpu(top->offset_increment_percent); + o->offset_increment = le64_to_cpu(top->offset_increment); + o->number_ios = le64_to_cpu(top->number_ios); + + o->overwrite = le32_to_cpu(top->overwrite); + o->bw_avg_time = le32_to_cpu(top->bw_avg_time); + o->iops_avg_time = le32_to_cpu(top->iops_avg_time); + o->loops = le32_to_cpu(top->loops); + o->mem_type = le32_to_cpu(top->mem_type); + o->mem_align = le32_to_cpu(top->mem_align); + o->exit_what = le16_to_cpu(top->exit_what); + o->stonewall = le16_to_cpu(top->stonewall); + o->new_group = le32_to_cpu(top->new_group); + o->numjobs = le32_to_cpu(top->numjobs); + o->cpus_allowed_policy = le32_to_cpu(top->cpus_allowed_policy); + o->gpu_dev_id = le32_to_cpu(top->gpu_dev_id); + o->iolog = le32_to_cpu(top->iolog); + o->rwmixcycle = le32_to_cpu(top->rwmixcycle); + o->nice = le32_to_cpu(top->nice); + o->ioprio = le32_to_cpu(top->ioprio); + o->ioprio_class = le32_to_cpu(top->ioprio_class); + o->file_service_type = le32_to_cpu(top->file_service_type); + o->group_reporting = le32_to_cpu(top->group_reporting); + o->stats = le32_to_cpu(top->stats); + o->fadvise_hint = le32_to_cpu(top->fadvise_hint); + o->fallocate_mode = le32_to_cpu(top->fallocate_mode); + o->zero_buffers = le32_to_cpu(top->zero_buffers); + o->refill_buffers = le32_to_cpu(top->refill_buffers); + o->scramble_buffers = le32_to_cpu(top->scramble_buffers); + o->buffer_pattern_bytes = le32_to_cpu(top->buffer_pattern_bytes); + o->time_based = le32_to_cpu(top->time_based); + o->disable_lat = le32_to_cpu(top->disable_lat); + o->disable_clat = le32_to_cpu(top->disable_clat); + o->disable_slat = le32_to_cpu(top->disable_slat); + o->disable_bw = le32_to_cpu(top->disable_bw); + o->unified_rw_rep = le32_to_cpu(top->unified_rw_rep); + o->gtod_reduce = le32_to_cpu(top->gtod_reduce); + o->gtod_cpu = le32_to_cpu(top->gtod_cpu); + o->clocksource = 
le32_to_cpu(top->clocksource); + o->no_stall = le32_to_cpu(top->no_stall); + o->trim_percentage = le32_to_cpu(top->trim_percentage); + o->trim_batch = le32_to_cpu(top->trim_batch); + o->trim_zero = le32_to_cpu(top->trim_zero); + o->clat_percentiles = le32_to_cpu(top->clat_percentiles); + o->lat_percentiles = le32_to_cpu(top->lat_percentiles); + o->slat_percentiles = le32_to_cpu(top->slat_percentiles); + o->percentile_precision = le32_to_cpu(top->percentile_precision); + o->sig_figs = le32_to_cpu(top->sig_figs); + o->continue_on_error = le32_to_cpu(top->continue_on_error); + o->cgroup_weight = le32_to_cpu(top->cgroup_weight); + o->cgroup_nodelete = le32_to_cpu(top->cgroup_nodelete); + o->uid = le32_to_cpu(top->uid); + o->gid = le32_to_cpu(top->gid); + o->flow_id = __le32_to_cpu(top->flow_id); + o->flow = __le32_to_cpu(top->flow); + o->flow_watermark = __le32_to_cpu(top->flow_watermark); + o->flow_sleep = le32_to_cpu(top->flow_sleep); + o->sync_file_range = le32_to_cpu(top->sync_file_range); + o->latency_target = le64_to_cpu(top->latency_target); + o->latency_window = le64_to_cpu(top->latency_window); + o->max_latency = le64_to_cpu(top->max_latency); + o->latency_percentile.u.f = fio_uint64_to_double(le64_to_cpu(top->latency_percentile.u.i)); + o->compress_percentage = le32_to_cpu(top->compress_percentage); + o->compress_chunk = le32_to_cpu(top->compress_chunk); + o->dedupe_percentage = le32_to_cpu(top->dedupe_percentage); + o->block_error_hist = le32_to_cpu(top->block_error_hist); + o->replay_align = le32_to_cpu(top->replay_align); + o->replay_scale = le32_to_cpu(top->replay_scale); + o->replay_time_scale = le32_to_cpu(top->replay_time_scale); + o->replay_skip = le32_to_cpu(top->replay_skip); + o->per_job_logs = le32_to_cpu(top->per_job_logs); + o->write_bw_log = le32_to_cpu(top->write_bw_log); + o->write_lat_log = le32_to_cpu(top->write_lat_log); + o->write_iops_log = le32_to_cpu(top->write_iops_log); + o->write_hist_log = le32_to_cpu(top->write_hist_log); + + o->trim_backlog = le64_to_cpu(top->trim_backlog); + o->rate_process = le32_to_cpu(top->rate_process); + o->rate_ign_think = le32_to_cpu(top->rate_ign_think); + + for (i = 0; i < FIO_IO_U_LIST_MAX_LEN; i++) + o->percentile_list[i].u.f = fio_uint64_to_double(le64_to_cpu(top->percentile_list[i].u.i)); + + for (i = 0; i < FIO_IO_U_LIST_MAX_LEN; i++) + o->merge_blktrace_scalars[i].u.f = fio_uint64_to_double(le64_to_cpu(top->merge_blktrace_scalars[i].u.i)); + + for (i = 0; i < FIO_IO_U_LIST_MAX_LEN; i++) + o->merge_blktrace_iters[i].u.f = fio_uint64_to_double(le64_to_cpu(top->merge_blktrace_iters[i].u.i)); +#if 0 + uint8_t cpumask[FIO_TOP_STR_MAX]; + uint8_t verify_cpumask[FIO_TOP_STR_MAX]; + uint8_t log_gz_cpumask[FIO_TOP_STR_MAX]; +#endif +} + +void convert_thread_options_to_net(struct thread_options_pack *top, + struct thread_options *o) +{ + int i, j; + + for (i = 0; i < NR_OPTS_SZ; i++) + top->set_options[i] = cpu_to_le64(o->set_options[i]); + + string_to_net(top->description, o->description); + string_to_net(top->name, o->name); + string_to_net(top->wait_for, o->wait_for); + string_to_net(top->directory, o->directory); + string_to_net(top->filename, o->filename); + string_to_net(top->filename_format, o->filename_format); + string_to_net(top->opendir, o->opendir); + string_to_net(top->ioengine, o->ioengine); + string_to_net(top->mmapfile, o->mmapfile); + string_to_net(top->read_iolog_file, o->read_iolog_file); + string_to_net(top->write_iolog_file, o->write_iolog_file); + string_to_net(top->merge_blktrace_file, 
o->merge_blktrace_file); + string_to_net(top->bw_log_file, o->bw_log_file); + string_to_net(top->lat_log_file, o->lat_log_file); + string_to_net(top->iops_log_file, o->iops_log_file); + string_to_net(top->hist_log_file, o->hist_log_file); + string_to_net(top->replay_redirect, o->replay_redirect); + string_to_net(top->exec_prerun, o->exec_prerun); + string_to_net(top->exec_postrun, o->exec_postrun); + string_to_net(top->ioscheduler, o->ioscheduler); + string_to_net(top->profile, o->profile); + string_to_net(top->cgroup, o->cgroup); + + top->allow_create = cpu_to_le32(o->allow_create); + top->allow_mounted_write = cpu_to_le32(o->allow_mounted_write); + top->td_ddir = cpu_to_le32(o->td_ddir); + top->rw_seq = cpu_to_le32(o->rw_seq); + top->kb_base = cpu_to_le32(o->kb_base); + top->unit_base = cpu_to_le32(o->unit_base); + top->ddir_seq_nr = cpu_to_le32(o->ddir_seq_nr); + top->iodepth = cpu_to_le32(o->iodepth); + top->iodepth_low = cpu_to_le32(o->iodepth_low); + top->iodepth_batch = cpu_to_le32(o->iodepth_batch); + top->iodepth_batch_complete_min = cpu_to_le32(o->iodepth_batch_complete_min); + top->iodepth_batch_complete_max = cpu_to_le32(o->iodepth_batch_complete_max); + top->serialize_overlap = cpu_to_le32(o->serialize_overlap); + top->size_percent = cpu_to_le32(o->size_percent); + top->fill_device = cpu_to_le32(o->fill_device); + top->file_append = cpu_to_le32(o->file_append); + top->ratecycle = cpu_to_le32(o->ratecycle); + top->io_submit_mode = cpu_to_le32(o->io_submit_mode); + top->nr_files = cpu_to_le32(o->nr_files); + top->unique_filename = cpu_to_le32(o->unique_filename); + top->open_files = cpu_to_le32(o->open_files); + top->file_lock_mode = cpu_to_le32(o->file_lock_mode); + top->odirect = cpu_to_le32(o->odirect); + top->oatomic = cpu_to_le32(o->oatomic); + top->invalidate_cache = cpu_to_le32(o->invalidate_cache); + top->create_serialize = cpu_to_le32(o->create_serialize); + top->create_fsync = cpu_to_le32(o->create_fsync); + top->create_on_open = cpu_to_le32(o->create_on_open); + top->create_only = cpu_to_le32(o->create_only); + top->end_fsync = cpu_to_le32(o->end_fsync); + top->pre_read = cpu_to_le32(o->pre_read); + top->sync_io = cpu_to_le32(o->sync_io); + top->write_hint = cpu_to_le32(o->write_hint); + top->verify = cpu_to_le32(o->verify); + top->do_verify = cpu_to_le32(o->do_verify); + top->experimental_verify = cpu_to_le32(o->experimental_verify); + top->verify_state = cpu_to_le32(o->verify_state); + top->verify_interval = cpu_to_le32(o->verify_interval); + top->verify_offset = cpu_to_le32(o->verify_offset); + top->verify_pattern_bytes = cpu_to_le32(o->verify_pattern_bytes); + top->verify_fatal = cpu_to_le32(o->verify_fatal); + top->verify_dump = cpu_to_le32(o->verify_dump); + top->verify_async = cpu_to_le32(o->verify_async); + top->verify_batch = cpu_to_le32(o->verify_batch); + top->use_thread = cpu_to_le32(o->use_thread); + top->unlink = cpu_to_le32(o->unlink); + top->unlink_each_loop = cpu_to_le32(o->unlink_each_loop); + top->do_disk_util = cpu_to_le32(o->do_disk_util); + top->override_sync = cpu_to_le32(o->override_sync); + top->rand_repeatable = cpu_to_le32(o->rand_repeatable); + top->allrand_repeatable = cpu_to_le32(o->allrand_repeatable); + top->rand_seed = __cpu_to_le64(o->rand_seed); + top->log_avg_msec = cpu_to_le32(o->log_avg_msec); + top->log_max = cpu_to_le32(o->log_max); + top->log_offset = cpu_to_le32(o->log_offset); + top->log_gz = cpu_to_le32(o->log_gz); + top->log_gz_store = cpu_to_le32(o->log_gz_store); + top->log_unix_epoch = cpu_to_le32(o->log_unix_epoch); + 
top->norandommap = cpu_to_le32(o->norandommap); + top->softrandommap = cpu_to_le32(o->softrandommap); + top->bs_unaligned = cpu_to_le32(o->bs_unaligned); + top->fsync_on_close = cpu_to_le32(o->fsync_on_close); + top->bs_is_seq_rand = cpu_to_le32(o->bs_is_seq_rand); + top->random_distribution = cpu_to_le32(o->random_distribution); + top->exitall_error = cpu_to_le32(o->exitall_error); + top->zipf_theta.u.i = __cpu_to_le64(fio_double_to_uint64(o->zipf_theta.u.f)); + top->pareto_h.u.i = __cpu_to_le64(fio_double_to_uint64(o->pareto_h.u.f)); + top->gauss_dev.u.i = __cpu_to_le64(fio_double_to_uint64(o->gauss_dev.u.f)); + top->random_generator = cpu_to_le32(o->random_generator); + top->hugepage_size = cpu_to_le32(o->hugepage_size); + top->rw_min_bs = __cpu_to_le64(o->rw_min_bs); + top->thinktime = cpu_to_le32(o->thinktime); + top->thinktime_spin = cpu_to_le32(o->thinktime_spin); + top->thinktime_blocks = cpu_to_le32(o->thinktime_blocks); + top->fsync_blocks = cpu_to_le32(o->fsync_blocks); + top->fdatasync_blocks = cpu_to_le32(o->fdatasync_blocks); + top->barrier_blocks = cpu_to_le32(o->barrier_blocks); + top->overwrite = cpu_to_le32(o->overwrite); + top->bw_avg_time = cpu_to_le32(o->bw_avg_time); + top->iops_avg_time = cpu_to_le32(o->iops_avg_time); + top->loops = cpu_to_le32(o->loops); + top->mem_type = cpu_to_le32(o->mem_type); + top->mem_align = cpu_to_le32(o->mem_align); + top->exit_what = cpu_to_le16(o->exit_what); + top->stonewall = cpu_to_le16(o->stonewall); + top->new_group = cpu_to_le32(o->new_group); + top->numjobs = cpu_to_le32(o->numjobs); + top->cpus_allowed_policy = cpu_to_le32(o->cpus_allowed_policy); + top->gpu_dev_id = cpu_to_le32(o->gpu_dev_id); + top->iolog = cpu_to_le32(o->iolog); + top->rwmixcycle = cpu_to_le32(o->rwmixcycle); + top->nice = cpu_to_le32(o->nice); + top->ioprio = cpu_to_le32(o->ioprio); + top->ioprio_class = cpu_to_le32(o->ioprio_class); + top->file_service_type = cpu_to_le32(o->file_service_type); + top->group_reporting = cpu_to_le32(o->group_reporting); + top->stats = cpu_to_le32(o->stats); + top->fadvise_hint = cpu_to_le32(o->fadvise_hint); + top->fallocate_mode = cpu_to_le32(o->fallocate_mode); + top->zero_buffers = cpu_to_le32(o->zero_buffers); + top->refill_buffers = cpu_to_le32(o->refill_buffers); + top->scramble_buffers = cpu_to_le32(o->scramble_buffers); + top->buffer_pattern_bytes = cpu_to_le32(o->buffer_pattern_bytes); + top->time_based = cpu_to_le32(o->time_based); + top->disable_lat = cpu_to_le32(o->disable_lat); + top->disable_clat = cpu_to_le32(o->disable_clat); + top->disable_slat = cpu_to_le32(o->disable_slat); + top->disable_bw = cpu_to_le32(o->disable_bw); + top->unified_rw_rep = cpu_to_le32(o->unified_rw_rep); + top->gtod_reduce = cpu_to_le32(o->gtod_reduce); + top->gtod_cpu = cpu_to_le32(o->gtod_cpu); + top->clocksource = cpu_to_le32(o->clocksource); + top->no_stall = cpu_to_le32(o->no_stall); + top->trim_percentage = cpu_to_le32(o->trim_percentage); + top->trim_batch = cpu_to_le32(o->trim_batch); + top->trim_zero = cpu_to_le32(o->trim_zero); + top->clat_percentiles = cpu_to_le32(o->clat_percentiles); + top->lat_percentiles = cpu_to_le32(o->lat_percentiles); + top->slat_percentiles = cpu_to_le32(o->slat_percentiles); + top->percentile_precision = cpu_to_le32(o->percentile_precision); + top->sig_figs = cpu_to_le32(o->sig_figs); + top->continue_on_error = cpu_to_le32(o->continue_on_error); + top->cgroup_weight = cpu_to_le32(o->cgroup_weight); + top->cgroup_nodelete = cpu_to_le32(o->cgroup_nodelete); + top->uid = cpu_to_le32(o->uid); + top->gid 
= cpu_to_le32(o->gid);
+ top->flow_id = __cpu_to_le32(o->flow_id);
+ top->flow = __cpu_to_le32(o->flow);
+ top->flow_watermark = __cpu_to_le32(o->flow_watermark);
+ top->flow_sleep = cpu_to_le32(o->flow_sleep);
+ top->sync_file_range = cpu_to_le32(o->sync_file_range);
+ top->latency_target = __cpu_to_le64(o->latency_target);
+ top->latency_window = __cpu_to_le64(o->latency_window);
+ top->max_latency = __cpu_to_le64(o->max_latency);
+ top->latency_percentile.u.i = __cpu_to_le64(fio_double_to_uint64(o->latency_percentile.u.f));
+ top->compress_percentage = cpu_to_le32(o->compress_percentage);
+ top->compress_chunk = cpu_to_le32(o->compress_chunk);
+ top->dedupe_percentage = cpu_to_le32(o->dedupe_percentage);
+ top->block_error_hist = cpu_to_le32(o->block_error_hist);
+ top->replay_align = cpu_to_le32(o->replay_align);
+ top->replay_scale = cpu_to_le32(o->replay_scale);
+ top->replay_time_scale = cpu_to_le32(o->replay_time_scale);
+ top->replay_skip = cpu_to_le32(o->replay_skip);
+ top->per_job_logs = cpu_to_le32(o->per_job_logs);
+ top->write_bw_log = cpu_to_le32(o->write_bw_log);
+ top->write_lat_log = cpu_to_le32(o->write_lat_log);
+ top->write_iops_log = cpu_to_le32(o->write_iops_log);
+ top->write_hist_log = cpu_to_le32(o->write_hist_log);
+
+ for (i = 0; i < DDIR_RWDIR_CNT; i++) {
+ top->bs[i] = __cpu_to_le64(o->bs[i]);
+ top->ba[i] = __cpu_to_le64(o->ba[i]);
+ top->min_bs[i] = __cpu_to_le64(o->min_bs[i]);
+ top->max_bs[i] = __cpu_to_le64(o->max_bs[i]);
+ top->bssplit_nr[i] = cpu_to_le32(o->bssplit_nr[i]);
+
+ if (o->bssplit_nr[i]) {
+ unsigned int bssplit_nr = o->bssplit_nr[i];
+
+ if (bssplit_nr > BSSPLIT_MAX) {
+ log_err("fio: BSSPLIT_MAX is too small\n");
+ bssplit_nr = BSSPLIT_MAX;
+ }
+ for (j = 0; j < bssplit_nr; j++) {
+ top->bssplit[i][j].bs = cpu_to_le64(o->bssplit[i][j].bs);
+ top->bssplit[i][j].perc = cpu_to_le32(o->bssplit[i][j].perc);
+ }
+ }
+
+ top->zone_split_nr[i] = cpu_to_le32(o->zone_split_nr[i]);
+
+ if (o->zone_split_nr[i]) {
+ unsigned int zone_split_nr = o->zone_split_nr[i];
+
+ if (zone_split_nr > ZONESPLIT_MAX) {
+ log_err("fio: ZONESPLIT_MAX is too small\n");
+ zone_split_nr = ZONESPLIT_MAX;
+ }
+ for (j = 0; j < zone_split_nr; j++) {
+ top->zone_split[i][j].access_perc = o->zone_split[i][j].access_perc;
+ top->zone_split[i][j].size_perc = o->zone_split[i][j].size_perc;
+ }
+ }
+
+ top->rwmix[i] = cpu_to_le32(o->rwmix[i]);
+ top->rate[i] = cpu_to_le64(o->rate[i]);
+ top->ratemin[i] = cpu_to_le64(o->ratemin[i]);
+ top->rate_iops[i] = cpu_to_le32(o->rate_iops[i]);
+ top->rate_iops_min[i] = cpu_to_le32(o->rate_iops_min[i]);
+
+ top->perc_rand[i] = cpu_to_le32(o->perc_rand[i]);
+ }
+
+ memcpy(top->verify_pattern, o->verify_pattern, MAX_PATTERN_SIZE);
+ memcpy(top->buffer_pattern, o->buffer_pattern, MAX_PATTERN_SIZE);
+
+ top->size = __cpu_to_le64(o->size);
+ top->io_size = __cpu_to_le64(o->io_size);
+ top->verify_backlog = __cpu_to_le64(o->verify_backlog);
+ top->start_delay = __cpu_to_le64(o->start_delay);
+ top->start_delay_high = __cpu_to_le64(o->start_delay_high);
+ top->timeout = __cpu_to_le64(o->timeout);
+ top->ramp_time = __cpu_to_le64(o->ramp_time);
+ top->ss_dur = __cpu_to_le64(o->ss_dur);
+ top->ss_ramp_time = __cpu_to_le64(o->ss_ramp_time);
+ top->ss_state = cpu_to_le32(o->ss_state);
+ top->ss_limit.u.i = __cpu_to_le64(fio_double_to_uint64(o->ss_limit.u.f));
+ top->zone_range = __cpu_to_le64(o->zone_range);
+ top->zone_size = __cpu_to_le64(o->zone_size);
+ top->zone_skip = __cpu_to_le64(o->zone_skip);
+ top->zone_mode = 
__cpu_to_le32(o->zone_mode); + top->lockmem = __cpu_to_le64(o->lockmem); + top->ddir_seq_add = __cpu_to_le64(o->ddir_seq_add); + top->file_size_low = __cpu_to_le64(o->file_size_low); + top->file_size_high = __cpu_to_le64(o->file_size_high); + top->start_offset = __cpu_to_le64(o->start_offset); + top->start_offset_align = __cpu_to_le64(o->start_offset_align); + top->start_offset_percent = __cpu_to_le32(o->start_offset_percent); + top->trim_backlog = __cpu_to_le64(o->trim_backlog); + top->offset_increment_percent = __cpu_to_le32(o->offset_increment_percent); + top->offset_increment = __cpu_to_le64(o->offset_increment); + top->number_ios = __cpu_to_le64(o->number_ios); + top->rate_process = cpu_to_le32(o->rate_process); + top->rate_ign_think = cpu_to_le32(o->rate_ign_think); + + for (i = 0; i < FIO_IO_U_LIST_MAX_LEN; i++) + top->percentile_list[i].u.i = __cpu_to_le64(fio_double_to_uint64(o->percentile_list[i].u.f)); + + for (i = 0; i < FIO_IO_U_LIST_MAX_LEN; i++) + top->merge_blktrace_scalars[i].u.i = __cpu_to_le64(fio_double_to_uint64(o->merge_blktrace_scalars[i].u.f)); + + for (i = 0; i < FIO_IO_U_LIST_MAX_LEN; i++) + top->merge_blktrace_iters[i].u.i = __cpu_to_le64(fio_double_to_uint64(o->merge_blktrace_iters[i].u.f)); +#if 0 + uint8_t cpumask[FIO_TOP_STR_MAX]; + uint8_t verify_cpumask[FIO_TOP_STR_MAX]; + uint8_t log_gz_cpumask[FIO_TOP_STR_MAX]; +#endif + +} + +/* + * Basic conversion test. We'd really need to fill in more of the options + * to have a thorough test. Even better, we should auto-generate the + * converter functions... + */ +int fio_test_cconv(struct thread_options *__o) +{ + struct thread_options o; + struct thread_options_pack top1, top2; + + memset(&top1, 0, sizeof(top1)); + memset(&top2, 0, sizeof(top2)); + + convert_thread_options_to_net(&top1, __o); + memset(&o, 0, sizeof(o)); + convert_thread_options_to_cpu(&o, &top1); + convert_thread_options_to_net(&top2, &o); + + free_thread_options_to_cpu(&o); + + return memcmp(&top1, &top2, sizeof(top1)); +} diff --git a/cgroup.c b/cgroup.c new file mode 100644 index 0000000..77e31a4 --- /dev/null +++ b/cgroup.c @@ -0,0 +1,237 @@ +/* + * Code related to setting up a blkio cgroup + */ +#include +#include +#include +#include +#include "fio.h" +#include "flist.h" +#include "cgroup.h" +#include "smalloc.h" + +static struct fio_sem *lock; + +struct cgroup_member { + struct flist_head list; + char *root; + unsigned int cgroup_nodelete; +}; + +static struct cgroup_mnt *find_cgroup_mnt(struct thread_data *td) +{ + struct cgroup_mnt *cgroup_mnt = NULL; + struct mntent *mnt, dummy; + char buf[256] = {0}; + FILE *f; + bool cgroup2 = false; + + f = setmntent("/proc/mounts", "r"); + if (!f) { + td_verror(td, errno, "setmntent /proc/mounts"); + return NULL; + } + + while ((mnt = getmntent_r(f, &dummy, buf, sizeof(buf))) != NULL) { + if (!strcmp(mnt->mnt_type, "cgroup") && + strstr(mnt->mnt_opts, "blkio")) + break; + if (!strcmp(mnt->mnt_type, "cgroup2")) { + cgroup2 = true; + break; + } + } + + if (mnt) { + cgroup_mnt = smalloc(sizeof(*cgroup_mnt)); + if (cgroup_mnt) { + cgroup_mnt->path = smalloc_strdup(mnt->mnt_dir); + if (!cgroup_mnt->path) { + sfree(cgroup_mnt); + log_err("fio: could not allocate memory\n"); + } else { + cgroup_mnt->cgroup2 = cgroup2; + } + } + } else { + log_err("fio: cgroup blkio does not appear to be mounted\n"); + } + + endmntent(f); + return cgroup_mnt; +} + +static void add_cgroup(struct thread_data *td, const char *name, + struct flist_head *clist) +{ + struct cgroup_member *cm; + + if (!lock) + return; + + cm = 
smalloc(sizeof(*cm)); + if (!cm) { +err: + log_err("fio: failed to allocate cgroup member\n"); + return; + } + + INIT_FLIST_HEAD(&cm->list); + cm->root = smalloc_strdup(name); + if (!cm->root) { + sfree(cm); + goto err; + } + if (td->o.cgroup_nodelete) + cm->cgroup_nodelete = 1; + fio_sem_down(lock); + flist_add_tail(&cm->list, clist); + fio_sem_up(lock); +} + +void cgroup_kill(struct flist_head *clist) +{ + struct flist_head *n, *tmp; + struct cgroup_member *cm; + + if (!lock) + return; + + fio_sem_down(lock); + + flist_for_each_safe(n, tmp, clist) { + cm = flist_entry(n, struct cgroup_member, list); + if (!cm->cgroup_nodelete) + rmdir(cm->root); + flist_del(&cm->list); + sfree(cm->root); + sfree(cm); + } + + fio_sem_up(lock); +} + +static char *get_cgroup_root(struct thread_data *td, struct cgroup_mnt *mnt) +{ + char *str = malloc(64); + + if (td->o.cgroup) + sprintf(str, "%s/%s", mnt->path, td->o.cgroup); + else + sprintf(str, "%s/%s", mnt->path, td->o.name); + + return str; +} + +static int write_int_to_file(struct thread_data *td, const char *path, + const char *filename, unsigned int val, + const char *onerr) +{ + char tmp[256]; + FILE *f; + + sprintf(tmp, "%s/%s", path, filename); + f = fopen(tmp, "w"); + if (!f) { + td_verror(td, errno, onerr); + return 1; + } + + fprintf(f, "%u", val); + fclose(f); + return 0; + +} + +static int cgroup_write_pid(struct thread_data *td, char *path, bool cgroup2) +{ + unsigned int val = td->pid; + + if (cgroup2) + return write_int_to_file(td, path, "cgroup.procs", + val, "cgroup write pid"); + return write_int_to_file(td, path, "tasks", val, "cgroup write pid"); +} + +/* + * Move pid to root class + */ +static int cgroup_del_pid(struct thread_data *td, struct cgroup_mnt *mnt) +{ + return cgroup_write_pid(td, mnt->path, mnt->cgroup2); +} + +int cgroup_setup(struct thread_data *td, struct flist_head *clist, struct cgroup_mnt **mnt) +{ + char *root; + + if (!clist) + return 1; + + if (!*mnt) { + *mnt = find_cgroup_mnt(td); + if (!*mnt) + return 1; + } + + /* + * Create container, if it doesn't exist + */ + root = get_cgroup_root(td, *mnt); + if (mkdir(root, 0755) < 0) { + int __e = errno; + + if (__e != EEXIST) { + td_verror(td, __e, "cgroup mkdir"); + log_err("fio: path %s\n", root); + goto err; + } + } else + add_cgroup(td, root, clist); + + if (td->o.cgroup_weight) { + if ((*mnt)->cgroup2) { + log_err("fio: cgroup weight doesn't work with cgroup2\n"); + goto err; + } + if (write_int_to_file(td, root, "blkio.weight", + td->o.cgroup_weight, + "cgroup open weight")) + goto err; + } + + if (!cgroup_write_pid(td, root, (*mnt)->cgroup2)) { + free(root); + return 0; + } + +err: + free(root); + return 1; +} + +void cgroup_shutdown(struct thread_data *td, struct cgroup_mnt *mnt) +{ + if (mnt == NULL) + return; + if (!td->o.cgroup_weight && !td->o.cgroup) + goto out; + + cgroup_del_pid(td, mnt); +out: + if (mnt->path) + sfree(mnt->path); + sfree(mnt); +} + +static void fio_init cgroup_init(void) +{ + lock = fio_sem_init(FIO_SEM_UNLOCKED); + if (!lock) + log_err("fio: failed to allocate cgroup lock\n"); +} + +static void fio_exit cgroup_exit(void) +{ + fio_sem_remove(lock); +} diff --git a/cgroup.h b/cgroup.h new file mode 100644 index 0000000..10313b7 --- /dev/null +++ b/cgroup.h @@ -0,0 +1,36 @@ +#ifndef FIO_CGROUP_H +#define FIO_CGROUP_H + +#ifdef FIO_HAVE_CGROUPS + +struct cgroup_mnt { + char *path; + bool cgroup2; +}; + +int cgroup_setup(struct thread_data *, struct flist_head *, struct cgroup_mnt **); +void cgroup_shutdown(struct thread_data *, struct
cgroup_mnt *); + +void cgroup_kill(struct flist_head *list); + +#else + +struct cgroup_mnt; + +static inline int cgroup_setup(struct thread_data *td, struct flist_head *list, + struct cgroup_mnt **mnt) +{ + td_verror(td, EINVAL, "cgroup_setup"); + return 1; +} + +static inline void cgroup_shutdown(struct thread_data *td, struct cgroup_mnt *mnt) +{ +} + +static inline void cgroup_kill(struct flist_head *list) +{ +} + +#endif +#endif diff --git a/client.c b/client.c new file mode 100644 index 0000000..b757559 --- /dev/null +++ b/client.c @@ -0,0 +1,2170 @@ +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <unistd.h> +#include <errno.h> +#include <fcntl.h> +#include <poll.h> +#include <signal.h> +#include <sys/types.h> +#include <sys/stat.h> +#include <sys/socket.h> +#include <sys/un.h> +#include <netinet/in.h> +#include <arpa/inet.h> +#ifdef CONFIG_ZLIB +#include <zlib.h> +#endif + +#include "fio.h" +#include "client.h" +#include "server.h" +#include "flist.h" +#include "hash.h" +#include "verify-state.h" + +static void handle_du(struct fio_client *client, struct fio_net_cmd *cmd); +static void handle_ts(struct fio_client *client, struct fio_net_cmd *cmd); +static void handle_gs(struct fio_client *client, struct fio_net_cmd *cmd); +static void handle_probe(struct fio_client *client, struct fio_net_cmd *cmd); +static void handle_text(struct fio_client *client, struct fio_net_cmd *cmd); +static void handle_stop(struct fio_client *client); +static void handle_start(struct fio_client *client, struct fio_net_cmd *cmd); + +static void convert_text(struct fio_net_cmd *cmd); +static void client_display_thread_status(struct jobs_eta *je); + +struct client_ops fio_client_ops = { + .text = handle_text, + .disk_util = handle_du, + .thread_status = handle_ts, + .group_stats = handle_gs, + .stop = handle_stop, + .start = handle_start, + .eta = client_display_thread_status, + .probe = handle_probe, + .eta_msec = FIO_CLIENT_DEF_ETA_MSEC, + .client_type = FIO_CLIENT_TYPE_CLI, +}; + +static struct timespec eta_ts; + +static FLIST_HEAD(client_list); +static FLIST_HEAD(eta_list); + +static FLIST_HEAD(arg_list); + +struct thread_stat client_ts; +struct group_run_stats client_gs; +int sum_stat_clients; + +static int sum_stat_nr; +static struct buf_output allclients; +static struct json_object *root = NULL; +static struct json_object *job_opt_object = NULL; +static struct json_array *clients_array = NULL; +static struct json_array *du_array = NULL; + +static int error_clients; + +#define FIO_CLIENT_HASH_BITS 7 +#define FIO_CLIENT_HASH_SZ (1 << FIO_CLIENT_HASH_BITS) +#define FIO_CLIENT_HASH_MASK (FIO_CLIENT_HASH_SZ - 1) +static struct flist_head client_hash[FIO_CLIENT_HASH_SZ]; + +static struct cmd_iolog_pdu *convert_iolog(struct fio_net_cmd *, bool *); + +static void fio_client_add_hash(struct fio_client *client) +{ + int bucket = hash_long(client->fd, FIO_CLIENT_HASH_BITS); + + bucket &= FIO_CLIENT_HASH_MASK; + flist_add(&client->hash_list, &client_hash[bucket]); +} + +static void fio_client_remove_hash(struct fio_client *client) +{ + if (!flist_empty(&client->hash_list)) + flist_del_init(&client->hash_list); +} + +static void fio_init fio_client_hash_init(void) +{ + int i; + + for (i = 0; i < FIO_CLIENT_HASH_SZ; i++) + INIT_FLIST_HEAD(&client_hash[i]); +} + +static int read_data(int fd, void *data, size_t size) +{ + ssize_t ret; + + while (size) { + ret = read(fd, data, size); + if (ret < 0) { + if (errno == EAGAIN || errno == EINTR) + continue; + break; + } else if (!ret) + break; + else { + data += ret; + size -= ret; + } + } + + if (size) + return EAGAIN; + + return 0; +} + +static int read_ini_data(int fd, void *data, size_t size) +{ + char *p = data; + int
ret = 0; + FILE *fp; + int dupfd; + + dupfd = dup(fd); + if (dupfd < 0) + return errno; + + fp = fdopen(dupfd, "r"); + if (!fp) { + ret = errno; + close(dupfd); + goto out; + } + + while (1) { + ssize_t len; + char buf[OPT_LEN_MAX+1], *sub; + + if (!fgets(buf, sizeof(buf), fp)) { + if (ferror(fp)) { + if (errno == EAGAIN || errno == EINTR) + continue; + ret = errno; + } + break; + } + + sub = fio_option_dup_subs(buf); + len = strlen(sub); + if (len + 1 > size) { + log_err("fio: no space left to read data\n"); + free(sub); + ret = ENOSPC; + break; + } + + memcpy(p, sub, len); + free(sub); + p += len; + *p = '\0'; + size -= len; + } + + fclose(fp); +out: + return ret; +} + +static void fio_client_json_init(void) +{ + char time_buf[32]; + time_t time_p; + + if (!(output_format & FIO_OUTPUT_JSON)) + return; + + time(&time_p); + os_ctime_r((const time_t *) &time_p, time_buf, sizeof(time_buf)); + time_buf[strlen(time_buf) - 1] = '\0'; + + root = json_create_object(); + json_object_add_value_string(root, "fio version", fio_version_string); + json_object_add_value_int(root, "timestamp", time_p); + json_object_add_value_string(root, "time", time_buf); + + job_opt_object = json_create_object(); + json_object_add_value_object(root, "global options", job_opt_object); + clients_array = json_create_array(); + json_object_add_value_array(root, "client_stats", clients_array); + du_array = json_create_array(); + json_object_add_value_array(root, "disk_util", du_array); +} + +static void fio_client_json_fini(void) +{ + struct buf_output out; + + if (!root) + return; + + buf_output_init(&out); + + __log_buf(&out, "\n"); + json_print_object(root, &out); + __log_buf(&out, "\n"); + log_info_buf(out.buf, out.buflen); + + buf_output_free(&out); + + json_free_object(root); + root = NULL; + job_opt_object = NULL; + clients_array = NULL; + du_array = NULL; +} + +static struct fio_client *find_client_by_fd(int fd) +{ + int bucket = hash_long(fd, FIO_CLIENT_HASH_BITS) & FIO_CLIENT_HASH_MASK; + struct fio_client *client; + struct flist_head *entry; + + flist_for_each(entry, &client_hash[bucket]) { + client = flist_entry(entry, struct fio_client, hash_list); + + if (client->fd == fd) { + client->refs++; + return client; + } + } + + return NULL; +} + +void fio_put_client(struct fio_client *client) +{ + if (--client->refs) + return; + + log_info_buf(client->buf.buf, client->buf.buflen); + buf_output_free(&client->buf); + + free(client->hostname); + if (client->argv) + free(client->argv); + if (client->name) + free(client->name); + while (client->nr_files) { + struct client_file *cf = &client->files[--client->nr_files]; + + free(cf->file); + } + if (client->files) + free(client->files); + if (client->opt_lists) + free(client->opt_lists); + + if (!client->did_stat) + sum_stat_clients--; + + if (client->error) + error_clients++; + + free(client); +} + +static int fio_client_dec_jobs_eta(struct client_eta *eta, client_eta_op eta_fn) +{ + if (!--eta->pending) { + eta_fn(&eta->eta); + free(eta); + return 0; + } + + return 1; +} + +static void fio_drain_client_text(struct fio_client *client) +{ + do { + struct fio_net_cmd *cmd; + + cmd = fio_net_recv_cmd(client->fd, false); + if (!cmd) + break; + + if (cmd->opcode == FIO_NET_CMD_TEXT) { + convert_text(cmd); + client->ops->text(client, cmd); + } + + free(cmd); + } while (1); +} + +static void remove_client(struct fio_client *client) +{ + assert(client->refs); + + dprint(FD_NET, "client: removed <%s>\n", client->hostname); + + fio_drain_client_text(client); + + if 
(!flist_empty(&client->list)) + flist_del_init(&client->list); + + fio_client_remove_hash(client); + + if (!flist_empty(&client->eta_list)) { + flist_del_init(&client->eta_list); + fio_client_dec_jobs_eta(client->eta_in_flight, client->ops->eta); + } + + close(client->fd); + client->fd = -1; + + if (client->ops->removed) + client->ops->removed(client); + + nr_clients--; + fio_put_client(client); +} + +struct fio_client *fio_get_client(struct fio_client *client) +{ + client->refs++; + return client; +} + +static void __fio_client_add_cmd_option(struct fio_client *client, + const char *opt) +{ + int index; + + index = client->argc++; + client->argv = realloc(client->argv, sizeof(char *) * client->argc); + client->argv[index] = strdup(opt); + dprint(FD_NET, "client: add cmd %d: %s\n", index, opt); +} + +void fio_client_add_cmd_option(void *cookie, const char *opt) +{ + struct fio_client *client = cookie; + struct flist_head *entry; + + if (!client || !opt) + return; + + __fio_client_add_cmd_option(client, opt); + + /* + * Duplicate arguments to shared client group + */ + flist_for_each(entry, &arg_list) { + client = flist_entry(entry, struct fio_client, arg_list); + + __fio_client_add_cmd_option(client, opt); + } +} + +static struct fio_client *get_new_client(void) +{ + struct fio_client *client; + + client = malloc(sizeof(*client)); + memset(client, 0, sizeof(*client)); + + INIT_FLIST_HEAD(&client->list); + INIT_FLIST_HEAD(&client->hash_list); + INIT_FLIST_HEAD(&client->arg_list); + INIT_FLIST_HEAD(&client->eta_list); + INIT_FLIST_HEAD(&client->cmd_list); + + buf_output_init(&client->buf); + + return client; +} + +struct fio_client *fio_client_add_explicit(struct client_ops *ops, + const char *hostname, int type, + int port) +{ + struct fio_client *client; + + client = get_new_client(); + + client->hostname = strdup(hostname); + + if (type == Fio_client_socket) + client->is_sock = true; + else { + int ipv6; + + ipv6 = type == Fio_client_ipv6; + if (fio_server_parse_host(hostname, ipv6, + &client->addr.sin_addr, + &client->addr6.sin6_addr)) + goto err; + + client->port = port; + } + + client->fd = -1; + client->ops = ops; + client->refs = 1; + client->type = ops->client_type; + + __fio_client_add_cmd_option(client, "fio"); + + flist_add(&client->list, &client_list); + nr_clients++; + dprint(FD_NET, "client: added <%s>\n", client->hostname); + return client; +err: + free(client); + return NULL; +} + +int fio_client_add_ini_file(void *cookie, const char *ini_file, bool remote) +{ + struct fio_client *client = cookie; + struct client_file *cf; + size_t new_size; + void *new_files; + + if (!client) + return 1; + + dprint(FD_NET, "client <%s>: add ini %s\n", client->hostname, ini_file); + + new_size = (client->nr_files + 1) * sizeof(struct client_file); + new_files = realloc(client->files, new_size); + if (!new_files) + return 1; + + client->files = new_files; + cf = &client->files[client->nr_files]; + cf->file = strdup(ini_file); + cf->remote = remote; + client->nr_files++; + return 0; +} + +int fio_client_add(struct client_ops *ops, const char *hostname, void **cookie) +{ + struct fio_client *existing = *cookie; + struct fio_client *client; + + if (existing) { + /* + * We always add our "exec" name as the option, hence 1 + * means empty. 
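+ * (argv[0] is always the "fio" exec name added via __fio_client_add_cmd_option(), so argc == 1 means no user options have been queued for this client yet.)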
+ */ + if (existing->argc == 1) + flist_add_tail(&existing->arg_list, &arg_list); + else { + while (!flist_empty(&arg_list)) + flist_del_init(arg_list.next); + } + } + + client = get_new_client(); + + if (fio_server_parse_string(hostname, &client->hostname, + &client->is_sock, &client->port, + &client->addr.sin_addr, + &client->addr6.sin6_addr, + &client->ipv6)) + return -1; + + client->fd = -1; + client->ops = ops; + client->refs = 1; + client->type = ops->client_type; + + __fio_client_add_cmd_option(client, "fio"); + + flist_add(&client->list, &client_list); + nr_clients++; + dprint(FD_NET, "client: added <%s>\n", client->hostname); + *cookie = client; + return 0; +} + +static const char *server_name(struct fio_client *client, char *buf, + size_t bufsize) +{ + const char *from; + + if (client->ipv6) + from = inet_ntop(AF_INET6, (struct sockaddr *) &client->addr6.sin6_addr, buf, bufsize); + else if (client->is_sock) + from = "sock"; + else + from = inet_ntop(AF_INET, (struct sockaddr *) &client->addr.sin_addr, buf, bufsize); + + return from; +} + +static void probe_client(struct fio_client *client) +{ + struct cmd_client_probe_pdu pdu; + const char *sname; + uint64_t tag; + char buf[64]; + + dprint(FD_NET, "client: send probe\n"); + +#ifdef CONFIG_ZLIB + pdu.flags = __le64_to_cpu(FIO_PROBE_FLAG_ZLIB); +#else + pdu.flags = 0; +#endif + + sname = server_name(client, buf, sizeof(buf)); + memset(pdu.server, 0, sizeof(pdu.server)); + snprintf((char *) pdu.server, sizeof(pdu.server), "%s", sname); + + fio_net_send_cmd(client->fd, FIO_NET_CMD_PROBE, &pdu, sizeof(pdu), &tag, &client->cmd_list); +} + +static int fio_client_connect_ip(struct fio_client *client) +{ + struct sockaddr *addr; + socklen_t socklen; + int fd, domain; + + if (client->ipv6) { + client->addr6.sin6_family = AF_INET6; + client->addr6.sin6_port = htons(client->port); + domain = AF_INET6; + addr = (struct sockaddr *) &client->addr6; + socklen = sizeof(client->addr6); + } else { + client->addr.sin_family = AF_INET; + client->addr.sin_port = htons(client->port); + domain = AF_INET; + addr = (struct sockaddr *) &client->addr; + socklen = sizeof(client->addr); + } + + fd = socket(domain, SOCK_STREAM, 0); + if (fd < 0) { + int ret = -errno; + + log_err("fio: socket: %s\n", strerror(errno)); + return ret; + } + + if (connect(fd, addr, socklen) < 0) { + int ret = -errno; + + log_err("fio: connect: %s\n", strerror(errno)); + log_err("fio: failed to connect to %s:%u\n", client->hostname, + client->port); + close(fd); + return ret; + } + + return fd; +} + +static int fio_client_connect_sock(struct fio_client *client) +{ + struct sockaddr_un *addr = &client->addr_un; + socklen_t len; + int fd; + + memset(addr, 0, sizeof(*addr)); + addr->sun_family = AF_UNIX; + snprintf(addr->sun_path, sizeof(addr->sun_path), "%s", + client->hostname); + + fd = socket(AF_UNIX, SOCK_STREAM, 0); + if (fd < 0) { + int ret = -errno; + + log_err("fio: socket: %s\n", strerror(errno)); + return ret; + } + + len = sizeof(addr->sun_family) + strlen(addr->sun_path) + 1; + if (connect(fd, (struct sockaddr *) addr, len) < 0) { + int ret = -errno; + + log_err("fio: connect: %s\n", strerror(errno)); + close(fd); + return ret; + } + + return fd; +} + +int fio_client_connect(struct fio_client *client) +{ + int fd; + + dprint(FD_NET, "client: connect to host %s\n", client->hostname); + + if (client->is_sock) + fd = fio_client_connect_sock(client); + else + fd = fio_client_connect_ip(client); + + dprint(FD_NET, "client: %s connected %d\n", client->hostname, fd); + + if (fd <
0) + return fd; + + client->fd = fd; + fio_client_add_hash(client); + client->state = Client_connected; + + probe_client(client); + return 0; +} + +int fio_client_terminate(struct fio_client *client) +{ + return fio_net_send_quit(client->fd); +} + +static void fio_clients_terminate(void) +{ + struct flist_head *entry; + struct fio_client *client; + + dprint(FD_NET, "client: terminate clients\n"); + + flist_for_each(entry, &client_list) { + client = flist_entry(entry, struct fio_client, list); + fio_client_terminate(client); + } +} + +static void sig_int(int sig) +{ + dprint(FD_NET, "client: got signal %d\n", sig); + fio_clients_terminate(); +} + +static void client_signal_handler(void) +{ + struct sigaction act; + + memset(&act, 0, sizeof(act)); + act.sa_handler = sig_int; + act.sa_flags = SA_RESTART; + sigaction(SIGINT, &act, NULL); + + memset(&act, 0, sizeof(act)); + act.sa_handler = sig_int; + act.sa_flags = SA_RESTART; + sigaction(SIGTERM, &act, NULL); + +/* Windows uses SIGBREAK as a quit signal from other applications */ +#ifdef WIN32 + memset(&act, 0, sizeof(act)); + act.sa_handler = sig_int; + act.sa_flags = SA_RESTART; + sigaction(SIGBREAK, &act, NULL); +#endif + + memset(&act, 0, sizeof(act)); + act.sa_handler = sig_show_status; + act.sa_flags = SA_RESTART; + sigaction(SIGUSR1, &act, NULL); +} + +static int send_client_cmd_line(struct fio_client *client) +{ + struct cmd_single_line_pdu *cslp; + struct cmd_line_pdu *clp; + unsigned long offset; + unsigned int *lens; + void *pdu; + size_t mem; + int i, ret; + + dprint(FD_NET, "client: send cmdline %d\n", client->argc); + + lens = malloc(client->argc * sizeof(unsigned int)); + + /* + * Find out how much mem we need + */ + for (i = 0, mem = 0; i < client->argc; i++) { + lens[i] = strlen(client->argv[i]) + 1; + mem += lens[i]; + } + + /* + * We need one cmd_line_pdu, and argc number of cmd_single_line_pdu + */ + mem += sizeof(*clp) + (client->argc * sizeof(*cslp)); + + pdu = malloc(mem); + clp = pdu; + offset = sizeof(*clp); + + for (i = 0; i < client->argc; i++) { + uint16_t arg_len = lens[i]; + + cslp = pdu + offset; + strcpy((char *) cslp->text, client->argv[i]); + cslp->len = cpu_to_le16(arg_len); + offset += sizeof(*cslp) + arg_len; + } + + free(lens); + clp->lines = cpu_to_le16(client->argc); + clp->client_type = __cpu_to_le16(client->type); + ret = fio_net_send_cmd(client->fd, FIO_NET_CMD_JOBLINE, pdu, mem, NULL, NULL); + free(pdu); + return ret; +} + +int fio_clients_connect(void) +{ + struct fio_client *client; + struct flist_head *entry, *tmp; + int ret; + +#ifdef WIN32 + WSADATA wsd; + WSAStartup(MAKEWORD(2, 2), &wsd); +#endif + + dprint(FD_NET, "client: connect all\n"); + + client_signal_handler(); + + flist_for_each_safe(entry, tmp, &client_list) { + client = flist_entry(entry, struct fio_client, list); + + ret = fio_client_connect(client); + if (ret) { + remove_client(client); + continue; + } + + if (client->argc > 1) + send_client_cmd_line(client); + } + + return !nr_clients; +} + +int fio_start_client(struct fio_client *client) +{ + dprint(FD_NET, "client: start %s\n", client->hostname); + return fio_net_send_simple_cmd(client->fd, FIO_NET_CMD_RUN, 0, NULL); +} + +int fio_start_all_clients(void) +{ + struct fio_client *client; + struct flist_head *entry, *tmp; + int ret; + + dprint(FD_NET, "client: start all\n"); + + fio_client_json_init(); + + flist_for_each_safe(entry, tmp, &client_list) { + client = flist_entry(entry, struct fio_client, list); + + ret = fio_start_client(client); + if (ret) { + remove_client(client); 
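+ /* + * The failed client has now been dropped from client_list, so the + * flist_empty() return below tells the caller whether any clients are + * left to run. A front end is expected to drive these entry points + * roughly like the following minimal sketch ("hostname" and "job.fio" + * are placeholders; error handling omitted): + * + * void *cookie = NULL; + * + * fio_client_add(&fio_client_ops, "hostname", &cookie); + * if (!fio_clients_connect()) { + * fio_clients_send_ini("job.fio"); + * fio_start_all_clients(); + * fio_handle_clients(&fio_client_ops); + * } + */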
+ continue; + } + } + + return flist_empty(&client_list); +} + +static int __fio_client_send_remote_ini(struct fio_client *client, + const char *filename) +{ + struct cmd_load_file_pdu *pdu; + size_t p_size; + int ret; + + dprint(FD_NET, "send remote ini %s to %s\n", filename, client->hostname); + + p_size = sizeof(*pdu) + strlen(filename) + 1; + pdu = malloc(p_size); + memset(pdu, 0, p_size); + pdu->name_len = strlen(filename); + strcpy((char *) pdu->file, filename); + pdu->client_type = cpu_to_le16((uint16_t) client->type); + + client->sent_job = true; + ret = fio_net_send_cmd(client->fd, FIO_NET_CMD_LOAD_FILE, pdu, p_size, NULL, NULL); + free(pdu); + return ret; +} + +/* + * Send file contents to server backend. We could use sendfile(), but to remain + * more portable let's just read/write the darn thing. + */ +static int __fio_client_send_local_ini(struct fio_client *client, + const char *filename) +{ + struct cmd_job_pdu *pdu; + size_t p_size; + struct stat sb; + char *p; + void *buf; + off_t len; + int fd, ret; + + dprint(FD_NET, "send ini %s to %s\n", filename, client->hostname); + + fd = open(filename, O_RDONLY); + if (fd < 0) { + ret = -errno; + log_err("fio: job file <%s> open: %s\n", filename, strerror(errno)); + return ret; + } + + if (fstat(fd, &sb) < 0) { + ret = -errno; + log_err("fio: job file stat: %s\n", strerror(errno)); + close(fd); + return ret; + } + + /* + * Add extra space for variable expansion, though this does not + * guarantee it will be enough. + */ + sb.st_size += OPT_LEN_MAX; + p_size = sb.st_size + sizeof(*pdu); + pdu = malloc(p_size); + buf = pdu->buf; + + len = sb.st_size; + p = buf; + if (read_ini_data(fd, p, len)) { + log_err("fio: failed reading job file %s\n", filename); + close(fd); + free(pdu); + return 1; + } + + pdu->buf_len = __cpu_to_le32(sb.st_size); + pdu->client_type = cpu_to_le32(client->type); + + client->sent_job = true; + ret = fio_net_send_cmd(client->fd, FIO_NET_CMD_JOB, pdu, p_size, NULL, NULL); + free(pdu); + close(fd); + return ret; +} + +int fio_client_send_ini(struct fio_client *client, const char *filename, + bool remote) +{ + int ret; + + if (!remote) + ret = __fio_client_send_local_ini(client, filename); + else + ret = __fio_client_send_remote_ini(client, filename); + + if (!ret) + client->sent_job = true; + + return ret; +} + +static int fio_client_send_cf(struct fio_client *client, + struct client_file *cf) +{ + return fio_client_send_ini(client, cf->file, cf->remote); +} + +int fio_clients_send_ini(const char *filename) +{ + struct fio_client *client; + struct flist_head *entry, *tmp; + + flist_for_each_safe(entry, tmp, &client_list) { + bool failed = false; + + client = flist_entry(entry, struct fio_client, list); + + if (client->nr_files) { + int i; + + for (i = 0; i < client->nr_files; i++) { + struct client_file *cf; + + cf = &client->files[i]; + + if (fio_client_send_cf(client, cf)) { + failed = true; + remove_client(client); + break; + } + } + } + if (client->sent_job || failed) + continue; + if (!filename || fio_client_send_ini(client, filename, 0)) + remove_client(client); + } + + return !nr_clients; +} + +int fio_client_update_options(struct fio_client *client, + struct thread_options *o, uint64_t *tag) +{ + struct cmd_add_job_pdu pdu; + + pdu.thread_number = cpu_to_le32(client->thread_number); + pdu.groupid = cpu_to_le32(client->groupid); + convert_thread_options_to_net(&pdu.top, o); + + return fio_net_send_cmd(client->fd, FIO_NET_CMD_UPDATE_JOB, &pdu, sizeof(pdu), tag, &client->cmd_list); +} + +static void convert_io_stat(struct io_stat *dst,
struct io_stat *src) +{ + dst->max_val = le64_to_cpu(src->max_val); + dst->min_val = le64_to_cpu(src->min_val); + dst->samples = le64_to_cpu(src->samples); + + /* + * Floats arrive as IEEE 754 encoded uint64_t, convert back to double + */ + dst->mean.u.f = fio_uint64_to_double(le64_to_cpu(dst->mean.u.i)); + dst->S.u.f = fio_uint64_to_double(le64_to_cpu(dst->S.u.i)); +} + +static void convert_ts(struct thread_stat *dst, struct thread_stat *src) +{ + int i, j, k; + + dst->error = le32_to_cpu(src->error); + dst->thread_number = le32_to_cpu(src->thread_number); + dst->groupid = le32_to_cpu(src->groupid); + dst->pid = le32_to_cpu(src->pid); + dst->members = le32_to_cpu(src->members); + dst->unified_rw_rep = le32_to_cpu(src->unified_rw_rep); + + for (i = 0; i < DDIR_RWDIR_CNT; i++) { + convert_io_stat(&dst->clat_stat[i], &src->clat_stat[i]); + convert_io_stat(&dst->slat_stat[i], &src->slat_stat[i]); + convert_io_stat(&dst->lat_stat[i], &src->lat_stat[i]); + convert_io_stat(&dst->bw_stat[i], &src->bw_stat[i]); + convert_io_stat(&dst->iops_stat[i], &src->iops_stat[i]); + } + convert_io_stat(&dst->sync_stat, &src->sync_stat); + + dst->usr_time = le64_to_cpu(src->usr_time); + dst->sys_time = le64_to_cpu(src->sys_time); + dst->ctx = le64_to_cpu(src->ctx); + dst->minf = le64_to_cpu(src->minf); + dst->majf = le64_to_cpu(src->majf); + dst->clat_percentiles = le32_to_cpu(src->clat_percentiles); + dst->lat_percentiles = le32_to_cpu(src->lat_percentiles); + dst->slat_percentiles = le32_to_cpu(src->slat_percentiles); + dst->percentile_precision = le64_to_cpu(src->percentile_precision); + + for (i = 0; i < FIO_IO_U_LIST_MAX_LEN; i++) { + fio_fp64_t *fps = &src->percentile_list[i]; + fio_fp64_t *fpd = &dst->percentile_list[i]; + + fpd->u.f = fio_uint64_to_double(le64_to_cpu(fps->u.i)); + } + + for (i = 0; i < FIO_IO_U_MAP_NR; i++) { + dst->io_u_map[i] = le64_to_cpu(src->io_u_map[i]); + dst->io_u_submit[i] = le64_to_cpu(src->io_u_submit[i]); + dst->io_u_complete[i] = le64_to_cpu(src->io_u_complete[i]); + } + + for (i = 0; i < FIO_IO_U_LAT_N_NR; i++) + dst->io_u_lat_n[i] = le64_to_cpu(src->io_u_lat_n[i]); + for (i = 0; i < FIO_IO_U_LAT_U_NR; i++) + dst->io_u_lat_u[i] = le64_to_cpu(src->io_u_lat_u[i]); + for (i = 0; i < FIO_IO_U_LAT_M_NR; i++) + dst->io_u_lat_m[i] = le64_to_cpu(src->io_u_lat_m[i]); + + for (i = 0; i < FIO_LAT_CNT; i++) + for (j = 0; j < DDIR_RWDIR_CNT; j++) + for (k = 0; k < FIO_IO_U_PLAT_NR; k++) + dst->io_u_plat[i][j][k] = le64_to_cpu(src->io_u_plat[i][j][k]); + + for (j = 0; j < FIO_IO_U_PLAT_NR; j++) + dst->io_u_sync_plat[j] = le64_to_cpu(src->io_u_sync_plat[j]); + + for (i = 0; i < DDIR_RWDIR_SYNC_CNT; i++) + dst->total_io_u[i] = le64_to_cpu(src->total_io_u[i]); + + for (i = 0; i < DDIR_RWDIR_CNT; i++) { + dst->short_io_u[i] = le64_to_cpu(src->short_io_u[i]); + dst->drop_io_u[i] = le64_to_cpu(src->drop_io_u[i]); + } + + dst->total_submit = le64_to_cpu(src->total_submit); + dst->total_complete = le64_to_cpu(src->total_complete); + dst->nr_zone_resets = le64_to_cpu(src->nr_zone_resets); + + for (i = 0; i < DDIR_RWDIR_CNT; i++) { + dst->io_bytes[i] = le64_to_cpu(src->io_bytes[i]); + dst->runtime[i] = le64_to_cpu(src->runtime[i]); + } + + dst->total_run_time = le64_to_cpu(src->total_run_time); + dst->continue_on_error = le16_to_cpu(src->continue_on_error); + dst->total_err_count = le64_to_cpu(src->total_err_count); + dst->first_error = le32_to_cpu(src->first_error); + dst->kb_base = le32_to_cpu(src->kb_base); + dst->unit_base = le32_to_cpu(src->unit_base); + + dst->sig_figs = 
le32_to_cpu(src->sig_figs); + + dst->latency_depth = le32_to_cpu(src->latency_depth); + dst->latency_target = le64_to_cpu(src->latency_target); + dst->latency_window = le64_to_cpu(src->latency_window); + dst->latency_percentile.u.f = fio_uint64_to_double(le64_to_cpu(src->latency_percentile.u.i)); + + dst->nr_block_infos = le64_to_cpu(src->nr_block_infos); + for (i = 0; i < dst->nr_block_infos; i++) + dst->block_infos[i] = le32_to_cpu(src->block_infos[i]); + for (i = 0; i < DDIR_RWDIR_CNT; i++) { + for (j = 0; j < FIO_IO_U_PLAT_NR; j++) { + dst->io_u_plat_high_prio[i][j] = le64_to_cpu(src->io_u_plat_high_prio[i][j]); + dst->io_u_plat_low_prio[i][j] = le64_to_cpu(src->io_u_plat_low_prio[i][j]); + } + convert_io_stat(&dst->clat_high_prio_stat[i], &src->clat_high_prio_stat[i]); + convert_io_stat(&dst->clat_low_prio_stat[i], &src->clat_low_prio_stat[i]); + } + + dst->ss_dur = le64_to_cpu(src->ss_dur); + dst->ss_state = le32_to_cpu(src->ss_state); + dst->ss_head = le32_to_cpu(src->ss_head); + dst->ss_limit.u.f = fio_uint64_to_double(le64_to_cpu(src->ss_limit.u.i)); + dst->ss_slope.u.f = fio_uint64_to_double(le64_to_cpu(src->ss_slope.u.i)); + dst->ss_deviation.u.f = fio_uint64_to_double(le64_to_cpu(src->ss_deviation.u.i)); + dst->ss_criterion.u.f = fio_uint64_to_double(le64_to_cpu(src->ss_criterion.u.i)); + + if (dst->ss_state & FIO_SS_DATA) { + for (i = 0; i < dst->ss_dur; i++ ) { + dst->ss_iops_data[i] = le64_to_cpu(src->ss_iops_data[i]); + dst->ss_bw_data[i] = le64_to_cpu(src->ss_bw_data[i]); + } + } + + dst->cachehit = le64_to_cpu(src->cachehit); + dst->cachemiss = le64_to_cpu(src->cachemiss); +} + +static void convert_gs(struct group_run_stats *dst, struct group_run_stats *src) +{ + int i; + + for (i = 0; i < DDIR_RWDIR_CNT; i++) { + dst->max_run[i] = le64_to_cpu(src->max_run[i]); + dst->min_run[i] = le64_to_cpu(src->min_run[i]); + dst->max_bw[i] = le64_to_cpu(src->max_bw[i]); + dst->min_bw[i] = le64_to_cpu(src->min_bw[i]); + dst->iobytes[i] = le64_to_cpu(src->iobytes[i]); + dst->agg[i] = le64_to_cpu(src->agg[i]); + } + + dst->kb_base = le32_to_cpu(src->kb_base); + dst->unit_base = le32_to_cpu(src->unit_base); + dst->sig_figs = le32_to_cpu(src->sig_figs); + dst->groupid = le32_to_cpu(src->groupid); + dst->unified_rw_rep = le32_to_cpu(src->unified_rw_rep); +} + +static void json_object_add_client_info(struct json_object *obj, + struct fio_client *client) +{ + const char *hostname = client->hostname ? 
client->hostname : ""; + + json_object_add_value_string(obj, "hostname", hostname); + json_object_add_value_int(obj, "port", client->port); +} + +static void handle_ts(struct fio_client *client, struct fio_net_cmd *cmd) +{ + struct cmd_ts_pdu *p = (struct cmd_ts_pdu *) cmd->payload; + struct flist_head *opt_list = NULL; + struct json_object *tsobj; + + if (client->opt_lists && p->ts.thread_number <= client->jobs) + opt_list = &client->opt_lists[p->ts.thread_number - 1]; + + tsobj = show_thread_status(&p->ts, &p->rs, opt_list, &client->buf); + client->did_stat = true; + if (tsobj) { + json_object_add_client_info(tsobj, client); + json_array_add_value_object(clients_array, tsobj); + } + + if (sum_stat_clients <= 1) + return; + + sum_thread_stats(&client_ts, &p->ts, sum_stat_nr == 1); + sum_group_stats(&client_gs, &p->rs); + + client_ts.members++; + client_ts.thread_number = p->ts.thread_number; + client_ts.groupid = p->ts.groupid; + client_ts.unified_rw_rep = p->ts.unified_rw_rep; + client_ts.sig_figs = p->ts.sig_figs; + + if (++sum_stat_nr == sum_stat_clients) { + strcpy(client_ts.name, "All clients"); + tsobj = show_thread_status(&client_ts, &client_gs, NULL, &allclients); + if (tsobj) { + json_object_add_client_info(tsobj, client); + json_array_add_value_object(clients_array, tsobj); + } + } +} + +static void handle_gs(struct fio_client *client, struct fio_net_cmd *cmd) +{ + struct group_run_stats *gs = (struct group_run_stats *) cmd->payload; + + if (output_format & FIO_OUTPUT_NORMAL) + show_group_stats(gs, &client->buf); +} + +static void handle_job_opt(struct fio_client *client, struct fio_net_cmd *cmd) +{ + struct cmd_job_option *pdu = (struct cmd_job_option *) cmd->payload; + struct print_option *p; + + if (!job_opt_object) + return; + + pdu->global = le16_to_cpu(pdu->global); + pdu->truncated = le16_to_cpu(pdu->truncated); + pdu->groupid = le32_to_cpu(pdu->groupid); + + p = malloc(sizeof(*p)); + p->name = strdup((char *) pdu->name); + if (pdu->value[0] != '\0') + p->value = strdup((char *) pdu->value); + else + p->value = NULL; + + if (pdu->global) { + const char *pos = ""; + + if (p->value) + pos = p->value; + + json_object_add_value_string(job_opt_object, p->name, pos); + } else if (client->opt_lists) { + struct flist_head *opt_list = &client->opt_lists[pdu->groupid]; + + flist_add_tail(&p->list, opt_list); + } else { + free(p->value); + free(p->name); + free(p); + } +} + +static void handle_text(struct fio_client *client, struct fio_net_cmd *cmd) +{ + struct cmd_text_pdu *pdu = (struct cmd_text_pdu *) cmd->payload; + const char *buf = (const char *) pdu->buf; + const char *name; + int fio_unused ret; + struct buf_output out; + + buf_output_init(&out); + + name = client->name ? 
client->name : client->hostname; + + if (!client->skip_newline && !(output_format & FIO_OUTPUT_TERSE)) + __log_buf(&out, "<%s> ", name); + __log_buf(&out, "%s", buf); + log_info_buf(out.buf, out.buflen); + buf_output_free(&out); + client->skip_newline = strchr(buf, '\n') == NULL; +} + +static void convert_agg(struct disk_util_agg *agg) +{ + int i; + + for (i = 0; i < 2; i++) { + agg->ios[i] = le64_to_cpu(agg->ios[i]); + agg->merges[i] = le64_to_cpu(agg->merges[i]); + agg->sectors[i] = le64_to_cpu(agg->sectors[i]); + agg->ticks[i] = le64_to_cpu(agg->ticks[i]); + } + + agg->io_ticks = le64_to_cpu(agg->io_ticks); + agg->time_in_queue = le64_to_cpu(agg->time_in_queue); + agg->slavecount = le32_to_cpu(agg->slavecount); + agg->max_util.u.f = fio_uint64_to_double(le64_to_cpu(agg->max_util.u.i)); +} + +static void convert_dus(struct disk_util_stat *dus) +{ + int i; + + for (i = 0; i < 2; i++) { + dus->s.ios[i] = le64_to_cpu(dus->s.ios[i]); + dus->s.merges[i] = le64_to_cpu(dus->s.merges[i]); + dus->s.sectors[i] = le64_to_cpu(dus->s.sectors[i]); + dus->s.ticks[i] = le64_to_cpu(dus->s.ticks[i]); + } + + dus->s.io_ticks = le64_to_cpu(dus->s.io_ticks); + dus->s.time_in_queue = le64_to_cpu(dus->s.time_in_queue); + dus->s.msec = le64_to_cpu(dus->s.msec); +} + +static void handle_du(struct fio_client *client, struct fio_net_cmd *cmd) +{ + struct cmd_du_pdu *du = (struct cmd_du_pdu *) cmd->payload; + + if (!client->disk_stats_shown) + client->disk_stats_shown = true; + + if (output_format & FIO_OUTPUT_JSON) { + struct json_object *duobj; + + json_array_add_disk_util(&du->dus, &du->agg, du_array); + duobj = json_array_last_value_object(du_array); + json_object_add_client_info(duobj, client); + } + if (output_format & FIO_OUTPUT_NORMAL) { + __log_buf(&client->buf, "\nDisk stats (read/write):\n"); + print_disk_util(&du->dus, &du->agg, 0, &client->buf); + } + if (output_format & FIO_OUTPUT_TERSE && terse_version >= 3) { + print_disk_util(&du->dus, &du->agg, 1, &client->buf); + __log_buf(&client->buf, "\n"); + } +} + +static void convert_jobs_eta(struct jobs_eta *je) +{ + int i; + + je->nr_running = le32_to_cpu(je->nr_running); + je->nr_ramp = le32_to_cpu(je->nr_ramp); + je->nr_pending = le32_to_cpu(je->nr_pending); + je->nr_setting_up = le32_to_cpu(je->nr_setting_up); + je->files_open = le32_to_cpu(je->files_open); + + for (i = 0; i < DDIR_RWDIR_CNT; i++) { + je->m_rate[i] = le64_to_cpu(je->m_rate[i]); + je->t_rate[i] = le64_to_cpu(je->t_rate[i]); + je->m_iops[i] = le32_to_cpu(je->m_iops[i]); + je->t_iops[i] = le32_to_cpu(je->t_iops[i]); + je->rate[i] = le64_to_cpu(je->rate[i]); + je->iops[i] = le32_to_cpu(je->iops[i]); + } + + je->elapsed_sec = le64_to_cpu(je->elapsed_sec); + je->eta_sec = le64_to_cpu(je->eta_sec); + je->nr_threads = le32_to_cpu(je->nr_threads); + je->is_pow2 = le32_to_cpu(je->is_pow2); + je->unit_base = le32_to_cpu(je->unit_base); + je->sig_figs = le32_to_cpu(je->sig_figs); +} + +void fio_client_sum_jobs_eta(struct jobs_eta *dst, struct jobs_eta *je) +{ + int i; + + dst->nr_running += je->nr_running; + dst->nr_ramp += je->nr_ramp; + dst->nr_pending += je->nr_pending; + dst->nr_setting_up += je->nr_setting_up; + dst->files_open += je->files_open; + + for (i = 0; i < DDIR_RWDIR_CNT; i++) { + dst->m_rate[i] += je->m_rate[i]; + dst->t_rate[i] += je->t_rate[i]; + dst->m_iops[i] += je->m_iops[i]; + dst->t_iops[i] += je->t_iops[i]; + dst->rate[i] += je->rate[i]; + dst->iops[i] += je->iops[i]; + } + + dst->elapsed_sec += je->elapsed_sec; + + if (je->eta_sec > dst->eta_sec) + dst->eta_sec = 
je->eta_sec; + + dst->nr_threads += je->nr_threads; + + /* + * This won't be correct for multiple strings, but at least it + * works for the basic cases. + */ + strcpy((char *) dst->run_str, (char *) je->run_str); +} + +static bool remove_reply_cmd(struct fio_client *client, struct fio_net_cmd *cmd) +{ + struct fio_net_cmd_reply *reply = NULL; + struct flist_head *entry; + + flist_for_each(entry, &client->cmd_list) { + reply = flist_entry(entry, struct fio_net_cmd_reply, list); + + if (cmd->tag == (uintptr_t) reply) + break; + + reply = NULL; + } + + if (!reply) { + log_err("fio: client: unable to find matching tag (%llx)\n", (unsigned long long) cmd->tag); + return false; + } + + flist_del(&reply->list); + cmd->tag = reply->saved_tag; + free(reply); + return true; +} + +int fio_client_wait_for_reply(struct fio_client *client, uint64_t tag) +{ + do { + struct fio_net_cmd_reply *reply = NULL; + struct flist_head *entry; + + flist_for_each(entry, &client->cmd_list) { + reply = flist_entry(entry, struct fio_net_cmd_reply, list); + + if (tag == (uintptr_t) reply) + break; + + reply = NULL; + } + + if (!reply) + break; + + usleep(1000); + } while (1); + + return 0; +} + +static void handle_eta(struct fio_client *client, struct fio_net_cmd *cmd) +{ + struct jobs_eta *je = (struct jobs_eta *) cmd->payload; + struct client_eta *eta = (struct client_eta *) (uintptr_t) cmd->tag; + + dprint(FD_NET, "client: got eta tag %p, %d\n", eta, eta->pending); + + assert(client->eta_in_flight == eta); + + client->eta_in_flight = NULL; + flist_del_init(&client->eta_list); + client->eta_timeouts = 0; + + if (client->ops->jobs_eta) + client->ops->jobs_eta(client, je); + + fio_client_sum_jobs_eta(&eta->eta, je); + fio_client_dec_jobs_eta(eta, client->ops->eta); +} + +static void client_flush_hist_samples(FILE *f, int hist_coarseness, void *samples, + uint64_t sample_size) +{ + struct io_sample *s; + int log_offset; + uint64_t i, j, nr_samples; + struct io_u_plat_entry *entry; + uint64_t *io_u_plat; + + int stride = 1 << hist_coarseness; + + if (!sample_size) + return; + + s = __get_sample(samples, 0, 0); + log_offset = (s->__ddir & LOG_OFFSET_SAMPLE_BIT) != 0; + + nr_samples = sample_size / __log_entry_sz(log_offset); + + for (i = 0; i < nr_samples; i++) { + + s = (struct io_sample *)((char *)__get_sample(samples, log_offset, i) + + i * sizeof(struct io_u_plat_entry)); + + entry = s->data.plat_entry; + io_u_plat = entry->io_u_plat; + + fprintf(f, "%lu, %u, %llu, ", (unsigned long) s->time, + io_sample_ddir(s), (unsigned long long) s->bs); + for (j = 0; j < FIO_IO_U_PLAT_NR - stride; j += stride) { + fprintf(f, "%llu, ", (unsigned long long)hist_sum(j, stride, io_u_plat, NULL)); + } + fprintf(f, "%llu\n", (unsigned long long) + hist_sum(FIO_IO_U_PLAT_NR - stride, stride, io_u_plat, NULL)); + + } +} + +static int fio_client_handle_iolog(struct fio_client *client, + struct fio_net_cmd *cmd) +{ + struct cmd_iolog_pdu *pdu = NULL; + bool store_direct; + char *log_pathname = NULL; + int ret = 0; + + pdu = convert_iolog(cmd, &store_direct); + if (!pdu) { + log_err("fio: failed converting IO log\n"); + ret = 1; + goto out; + } + + /* allocate buffer big enough for next sprintf() call */ + log_pathname = malloc(10 + strlen((char *)pdu->name) + + strlen(client->hostname)); + if (!log_pathname) { + log_err("fio: memory allocation of unique pathname failed\n"); + ret = -1; + goto out; + } + /* generate a unique pathname for the log file using hostname */ + sprintf(log_pathname, "%s.%s", pdu->name, client->hostname); + + if
(store_direct) { + ssize_t wrote; + size_t sz; + int fd; + + fd = open((const char *) log_pathname, + O_WRONLY | O_CREAT | O_TRUNC, 0644); + if (fd < 0) { + log_err("fio: open log %s: %s\n", + log_pathname, strerror(errno)); + ret = 1; + goto out; + } + + sz = cmd->pdu_len - sizeof(*pdu); + wrote = write(fd, pdu->samples, sz); + close(fd); + + if (wrote != sz) { + log_err("fio: short write on compressed log\n"); + ret = 1; + goto out; + } + + ret = 0; + } else { + FILE *f; + f = fopen((const char *) log_pathname, "w"); + if (!f) { + log_err("fio: fopen log %s: %s\n", + log_pathname, strerror(errno)); + ret = 1; + goto out; + } + + if (pdu->log_type == IO_LOG_TYPE_HIST) { + client_flush_hist_samples(f, pdu->log_hist_coarseness, pdu->samples, + pdu->nr_samples * sizeof(struct io_sample)); + } else { + flush_samples(f, pdu->samples, + pdu->nr_samples * sizeof(struct io_sample)); + } + fclose(f); + ret = 0; + } + +out: + if (pdu && pdu != (void *) cmd->payload) + free(pdu); + + if (log_pathname) + free(log_pathname); + + return ret; +} + +static void handle_probe(struct fio_client *client, struct fio_net_cmd *cmd) +{ + struct cmd_probe_reply_pdu *probe = (struct cmd_probe_reply_pdu *) cmd->payload; + const char *os, *arch; + char bit[16]; + + os = fio_get_os_string(probe->os); + if (!os) + os = "unknown"; + + arch = fio_get_arch_string(probe->arch); + if (!arch) + arch = "unknown"; + + sprintf(bit, "%d-bit", probe->bpp * 8); + probe->flags = le64_to_cpu(probe->flags); + + if (output_format & FIO_OUTPUT_NORMAL) { + log_info("hostname=%s, be=%u, %s, os=%s, arch=%s, fio=%s, flags=%lx\n", + probe->hostname, probe->bigendian, bit, os, arch, + probe->fio_version, (unsigned long) probe->flags); + } + + if (!client->name) + client->name = strdup((char *) probe->hostname); +} + +static void handle_start(struct fio_client *client, struct fio_net_cmd *cmd) +{ + struct cmd_start_pdu *pdu = (struct cmd_start_pdu *) cmd->payload; + + client->state = Client_started; + client->jobs = le32_to_cpu(pdu->jobs); + client->nr_stat = le32_to_cpu(pdu->stat_outputs); + + if (client->jobs) { + int i; + + if (client->opt_lists) + free(client->opt_lists); + + client->opt_lists = malloc(client->jobs * sizeof(struct flist_head)); + for (i = 0; i < client->jobs; i++) + INIT_FLIST_HEAD(&client->opt_lists[i]); + } + + sum_stat_clients += client->nr_stat; +} + +static void handle_stop(struct fio_client *client) +{ + if (client->error) + log_info("client <%s>: exited with error %d\n", client->hostname, client->error); +} + +static void convert_stop(struct fio_net_cmd *cmd) +{ + struct cmd_end_pdu *pdu = (struct cmd_end_pdu *) cmd->payload; + + pdu->error = le32_to_cpu(pdu->error); +} + +static void convert_text(struct fio_net_cmd *cmd) +{ + struct cmd_text_pdu *pdu = (struct cmd_text_pdu *) cmd->payload; + + pdu->level = le32_to_cpu(pdu->level); + pdu->buf_len = le32_to_cpu(pdu->buf_len); + pdu->log_sec = le64_to_cpu(pdu->log_sec); + pdu->log_usec = le64_to_cpu(pdu->log_usec); +} + +static struct cmd_iolog_pdu *convert_iolog_gz(struct fio_net_cmd *cmd, + struct cmd_iolog_pdu *pdu) +{ +#ifdef CONFIG_ZLIB + struct cmd_iolog_pdu *ret; + z_stream stream; + uint64_t nr_samples; + size_t total; + char *p; + + stream.zalloc = Z_NULL; + stream.zfree = Z_NULL; + stream.opaque = Z_NULL; + stream.avail_in = 0; + stream.next_in = Z_NULL; + + if (inflateInit(&stream) != Z_OK) + return NULL; + + /* + * Get header first, it's not compressed + */ + nr_samples = le64_to_cpu(pdu->nr_samples); + + if (pdu->log_type == IO_LOG_TYPE_HIST) + total =
nr_samples * (__log_entry_sz(le32_to_cpu(pdu->log_offset)) + + sizeof(struct io_u_plat_entry)); + else + total = nr_samples * __log_entry_sz(le32_to_cpu(pdu->log_offset)); + ret = malloc(total + sizeof(*pdu)); + ret->nr_samples = nr_samples; + + memcpy(ret, pdu, sizeof(*pdu)); + + p = (char *) ret + sizeof(*pdu); + + stream.avail_in = cmd->pdu_len - sizeof(*pdu); + stream.next_in = (void *)((char *) pdu + sizeof(*pdu)); + while (stream.avail_in) { + unsigned int this_chunk = 65536; + unsigned int this_len; + int err; + + if (this_chunk > total) + this_chunk = total; + + stream.avail_out = this_chunk; + stream.next_out = (void *)p; + err = inflate(&stream, Z_NO_FLUSH); + /* may be Z_OK, or Z_STREAM_END */ + if (err < 0) { + /* + * Z_STREAM_ERROR and Z_BUF_ERROR can safely be + * ignored */ + if (err == Z_STREAM_ERROR || err == Z_BUF_ERROR) + break; + log_err("fio: inflate error %d\n", err); + free(ret); + ret = NULL; + goto err; + } + + this_len = this_chunk - stream.avail_out; + p += this_len; + total -= this_len; + } + +err: + inflateEnd(&stream); + return ret; +#else + return NULL; +#endif +} + +/* + * This has been compressed on the server side, since it can be big. + * Uncompress here. + */ +static struct cmd_iolog_pdu *convert_iolog(struct fio_net_cmd *cmd, + bool *store_direct) +{ + struct cmd_iolog_pdu *pdu = (struct cmd_iolog_pdu *) cmd->payload; + struct cmd_iolog_pdu *ret; + uint64_t i; + int compressed; + void *samples; + + *store_direct = false; + + /* + * Convert if compressed and we support it. If it's not + * compressed, we need not do anything. + */ + compressed = le32_to_cpu(pdu->compressed); + if (compressed == XMIT_COMPRESSED) { +#ifndef CONFIG_ZLIB + log_err("fio: server sent compressed data by mistake\n"); + return NULL; +#endif + ret = convert_iolog_gz(cmd, pdu); + if (!ret) { + log_err("fio: failed decompressing log\n"); + return NULL; + } + } else if (compressed == STORE_COMPRESSED) { + *store_direct = true; + ret = pdu; + } else + ret = pdu; + + ret->nr_samples = le64_to_cpu(ret->nr_samples); + ret->thread_number = le32_to_cpu(ret->thread_number); + ret->log_type = le32_to_cpu(ret->log_type); + ret->compressed = le32_to_cpu(ret->compressed); + ret->log_offset = le32_to_cpu(ret->log_offset); + ret->log_hist_coarseness = le32_to_cpu(ret->log_hist_coarseness); + + if (*store_direct) + return ret; + + samples = &ret->samples[0]; + for (i = 0; i < ret->nr_samples; i++) { + struct io_sample *s; + + s = __get_sample(samples, ret->log_offset, i); + if (ret->log_type == IO_LOG_TYPE_HIST) + s = (struct io_sample *)((char *)s + sizeof(struct io_u_plat_entry) * i); + + s->time = le64_to_cpu(s->time); + s->data.val = le64_to_cpu(s->data.val); + s->__ddir = __le32_to_cpu(s->__ddir); + s->bs = le64_to_cpu(s->bs); + + if (ret->log_offset) { + struct io_sample_offset *so = (void *) s; + + so->offset = le64_to_cpu(so->offset); + } + + if (ret->log_type == IO_LOG_TYPE_HIST) { + s->data.plat_entry = (struct io_u_plat_entry *)(((char *)s) + sizeof(*s)); + s->data.plat_entry->list.next = NULL; + s->data.plat_entry->list.prev = NULL; + } + } + + return ret; +} + +static void sendfile_reply(int fd, struct cmd_sendfile_reply *rep, + size_t size, uint64_t tag) +{ + rep->error = cpu_to_le32(rep->error); + fio_net_send_cmd(fd, FIO_NET_CMD_SENDFILE, rep, size, &tag, NULL); +} + +static int fio_send_file(struct fio_client *client, struct cmd_sendfile *pdu, + uint64_t tag) +{ + struct cmd_sendfile_reply *rep; + struct stat sb; + size_t size; + int fd; + + size = sizeof(*rep); + rep = 
malloc(size); + + if (stat((char *)pdu->path, &sb) < 0) { +fail: + rep->error = errno; + sendfile_reply(client->fd, rep, size, tag); + free(rep); + return 1; + } + + size += sb.st_size; + rep = realloc(rep, size); + rep->size = cpu_to_le32((uint32_t) sb.st_size); + + fd = open((char *)pdu->path, O_RDONLY); + if (fd == -1 ) + goto fail; + + rep->error = read_data(fd, &rep->data, sb.st_size); + sendfile_reply(client->fd, rep, size, tag); + free(rep); + close(fd); + return 0; +} + +int fio_handle_client(struct fio_client *client) +{ + struct client_ops *ops = client->ops; + struct fio_net_cmd *cmd; + int size; + + dprint(FD_NET, "client: handle %s\n", client->hostname); + + cmd = fio_net_recv_cmd(client->fd, true); + if (!cmd) + return 0; + + dprint(FD_NET, "client: got cmd op %s from %s (pdu=%u)\n", + fio_server_op(cmd->opcode), client->hostname, cmd->pdu_len); + + client->last_cmd = cmd->opcode; + + switch (cmd->opcode) { + case FIO_NET_CMD_QUIT: + if (ops->quit) + ops->quit(client, cmd); + remove_client(client); + break; + case FIO_NET_CMD_TEXT: + convert_text(cmd); + ops->text(client, cmd); + break; + case FIO_NET_CMD_DU: { + struct cmd_du_pdu *du = (struct cmd_du_pdu *) cmd->payload; + + convert_dus(&du->dus); + convert_agg(&du->agg); + + ops->disk_util(client, cmd); + break; + } + case FIO_NET_CMD_TS: { + struct cmd_ts_pdu *p = (struct cmd_ts_pdu *) cmd->payload; + + dprint(FD_NET, "client: ts->ss_state = %u\n", (unsigned int) le32_to_cpu(p->ts.ss_state)); + if (le32_to_cpu(p->ts.ss_state) & FIO_SS_DATA) { + dprint(FD_NET, "client: received steadystate ring buffers\n"); + + size = le64_to_cpu(p->ts.ss_dur); + p->ts.ss_iops_data = (uint64_t *) ((struct cmd_ts_pdu *)cmd->payload + 1); + p->ts.ss_bw_data = p->ts.ss_iops_data + size; + } + + convert_ts(&p->ts, &p->ts); + convert_gs(&p->rs, &p->rs); + + ops->thread_status(client, cmd); + break; + } + case FIO_NET_CMD_GS: { + struct group_run_stats *gs = (struct group_run_stats *) cmd->payload; + + convert_gs(gs, gs); + + ops->group_stats(client, cmd); + break; + } + case FIO_NET_CMD_ETA: { + struct jobs_eta *je = (struct jobs_eta *) cmd->payload; + + if (!remove_reply_cmd(client, cmd)) + break; + convert_jobs_eta(je); + handle_eta(client, cmd); + break; + } + case FIO_NET_CMD_PROBE: + remove_reply_cmd(client, cmd); + ops->probe(client, cmd); + break; + case FIO_NET_CMD_SERVER_START: + client->state = Client_running; + if (ops->job_start) + ops->job_start(client, cmd); + break; + case FIO_NET_CMD_START: { + struct cmd_start_pdu *pdu = (struct cmd_start_pdu *) cmd->payload; + + pdu->jobs = le32_to_cpu(pdu->jobs); + ops->start(client, cmd); + break; + } + case FIO_NET_CMD_STOP: { + struct cmd_end_pdu *pdu = (struct cmd_end_pdu *) cmd->payload; + + convert_stop(cmd); + client->state = Client_stopped; + client->error = le32_to_cpu(pdu->error); + client->signal = le32_to_cpu(pdu->signal); + ops->stop(client); + break; + } + case FIO_NET_CMD_ADD_JOB: { + struct cmd_add_job_pdu *pdu = (struct cmd_add_job_pdu *) cmd->payload; + + client->thread_number = le32_to_cpu(pdu->thread_number); + client->groupid = le32_to_cpu(pdu->groupid); + + if (ops->add_job) + ops->add_job(client, cmd); + break; + } + case FIO_NET_CMD_IOLOG: + fio_client_handle_iolog(client, cmd); + break; + case FIO_NET_CMD_UPDATE_JOB: + ops->update_job(client, cmd); + remove_reply_cmd(client, cmd); + break; + case FIO_NET_CMD_VTRIGGER: { + struct all_io_list *pdu = (struct all_io_list *) cmd->payload; + char buf[128]; + int off = 0; + + if (aux_path) { + strcpy(buf, aux_path); + off = 
strlen(buf); + } + + __verify_save_state(pdu, server_name(client, &buf[off], sizeof(buf) - off)); + exec_trigger(trigger_cmd); + break; + } + case FIO_NET_CMD_SENDFILE: { + struct cmd_sendfile *pdu = (struct cmd_sendfile *) cmd->payload; + fio_send_file(client, pdu, cmd->tag); + break; + } + case FIO_NET_CMD_JOB_OPT: { + handle_job_opt(client, cmd); + break; + } + default: + log_err("fio: unknown client op: %s\n", fio_server_op(cmd->opcode)); + break; + } + + free(cmd); + return 1; +} + +int fio_clients_send_trigger(const char *cmd) +{ + struct flist_head *entry; + struct fio_client *client; + size_t slen; + + dprint(FD_NET, "client: send vtrigger: %s\n", cmd); + + if (!cmd) + slen = 0; + else + slen = strlen(cmd); + + flist_for_each(entry, &client_list) { + struct cmd_vtrigger_pdu *pdu; + + client = flist_entry(entry, struct fio_client, list); + + pdu = malloc(sizeof(*pdu) + slen); + pdu->len = cpu_to_le16((uint16_t) slen); + if (slen) + memcpy(pdu->cmd, cmd, slen); + fio_net_send_cmd(client->fd, FIO_NET_CMD_VTRIGGER, pdu, + sizeof(*pdu) + slen, NULL, NULL); + free(pdu); + } + + return 0; +} + +static void request_client_etas(struct client_ops *ops) +{ + struct fio_client *client; + struct flist_head *entry; + struct client_eta *eta; + int skipped = 0; + + if (eta_print == FIO_ETA_NEVER) + return; + + dprint(FD_NET, "client: request eta (%d)\n", nr_clients); + + eta = calloc(1, sizeof(*eta) + __THREAD_RUNSTR_SZ(REAL_MAX_JOBS)); + eta->pending = nr_clients; + + flist_for_each(entry, &client_list) { + client = flist_entry(entry, struct fio_client, list); + + if (!flist_empty(&client->eta_list)) { + skipped++; + continue; + } + if (client->state != Client_running) + continue; + + assert(!client->eta_in_flight); + flist_add_tail(&client->eta_list, &eta_list); + client->eta_in_flight = eta; + fio_net_send_simple_cmd(client->fd, FIO_NET_CMD_SEND_ETA, + (uintptr_t) eta, &client->cmd_list); + } + + while (skipped--) { + if (!fio_client_dec_jobs_eta(eta, ops->eta)) + break; + } + + dprint(FD_NET, "client: requested eta tag %p\n", eta); +} + +/* + * A single SEND_ETA timeout isn't fatal. Attempt to recover. + */ +static int handle_cmd_timeout(struct fio_client *client, + struct fio_net_cmd_reply *reply) +{ + uint16_t reply_opcode = reply->opcode; + + flist_del(&reply->list); + free(reply); + + if (reply_opcode != FIO_NET_CMD_SEND_ETA) + return 1; + + log_info("client <%s>: timeout on SEND_ETA\n", client->hostname); + + flist_del_init(&client->eta_list); + if (client->eta_in_flight) { + fio_client_dec_jobs_eta(client->eta_in_flight, client->ops->eta); + client->eta_in_flight = NULL; + } + + /* + * If we fail 5 in a row, give up... 
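+ * (handle_eta() clears eta_timeouts whenever a reply does arrive, so only consecutive SEND_ETA timeouts count toward this limit.)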
+ */ + if (client->eta_timeouts++ > 5) + return 1; + + return 0; +} + +static int client_check_cmd_timeout(struct fio_client *client, + struct timespec *now) +{ + struct fio_net_cmd_reply *reply; + struct flist_head *entry, *tmp; + int ret = 0; + + flist_for_each_safe(entry, tmp, &client->cmd_list) { + unsigned int op; + + reply = flist_entry(entry, struct fio_net_cmd_reply, list); + + if (mtime_since(&reply->ts, now) < FIO_NET_CLIENT_TIMEOUT) + continue; + + op = reply->opcode; + if (!handle_cmd_timeout(client, reply)) + continue; + + log_err("fio: client %s, timeout on cmd %s\n", client->hostname, + fio_server_op(op)); + ret = 1; + } + + return flist_empty(&client->cmd_list) && ret; +} + +static int fio_check_clients_timed_out(void) +{ + struct fio_client *client; + struct flist_head *entry, *tmp; + struct timespec ts; + int ret = 0; + + fio_gettime(&ts, NULL); + + flist_for_each_safe(entry, tmp, &client_list) { + client = flist_entry(entry, struct fio_client, list); + + if (flist_empty(&client->cmd_list)) + continue; + + if (!client_check_cmd_timeout(client, &ts)) + continue; + + if (client->ops->timed_out) + client->ops->timed_out(client); + else + log_err("fio: client %s timed out\n", client->hostname); + + if (client->last_cmd != FIO_NET_CMD_VTRIGGER) + client->error = ETIMEDOUT; + else + log_info("fio: ignoring timeout due to vtrigger\n"); + remove_client(client); + ret = 1; + } + + return ret; +} + +int fio_handle_clients(struct client_ops *ops) +{ + struct pollfd *pfds; + int i, ret = 0, retval = 0; + + fio_gettime(&eta_ts, NULL); + + pfds = malloc(nr_clients * sizeof(struct pollfd)); + + init_thread_stat(&client_ts); + init_group_run_stat(&client_gs); + + while (!exit_backend && nr_clients) { + struct flist_head *entry, *tmp; + struct fio_client *client; + + i = 0; + flist_for_each_safe(entry, tmp, &client_list) { + client = flist_entry(entry, struct fio_client, list); + + if (!client->sent_job && !client->ops->stay_connected && + flist_empty(&client->cmd_list)) { + remove_client(client); + continue; + } + + pfds[i].fd = client->fd; + pfds[i].events = POLLIN; + i++; + } + + if (!nr_clients) + break; + + assert(i == nr_clients); + + do { + struct timespec ts; + int timeout; + + fio_gettime(&ts, NULL); + if (eta_time_within_slack(mtime_since(&eta_ts, &ts))) { + request_client_etas(ops); + memcpy(&eta_ts, &ts, sizeof(ts)); + + if (fio_check_clients_timed_out()) + break; + } + + check_trigger_file(); + + timeout = min(100u, ops->eta_msec); + + ret = poll(pfds, nr_clients, timeout); + if (ret < 0) { + if (errno == EINTR) + continue; + log_err("fio: poll clients: %s\n", strerror(errno)); + break; + } else if (!ret) + continue; + } while (ret <= 0); + + for (i = 0; i < nr_clients; i++) { + if (!(pfds[i].revents & POLLIN)) + continue; + + client = find_client_by_fd(pfds[i].fd); + if (!client) { + log_err("fio: unknown client fd %ld\n", (long) pfds[i].fd); + continue; + } + if (!fio_handle_client(client)) { + log_info("client: host=%s disconnected\n", + client->hostname); + remove_client(client); + retval = 1; + } else if (client->error) + retval = 1; + fio_put_client(client); + } + } + + log_info_buf(allclients.buf, allclients.buflen); + buf_output_free(&allclients); + + fio_client_json_fini(); + + free(pfds); + return retval || error_clients; +} + +static void client_display_thread_status(struct jobs_eta *je) +{ + if (!(output_format & FIO_OUTPUT_JSON)) + display_thread_status(je); +} diff --git a/client.h b/client.h new file mode 100644 index 0000000..8033325 --- /dev/null +++ 
diff --git a/client.h b/client.h
new file mode 100644
index 0000000..8033325
--- /dev/null
+++ b/client.h
@@ -0,0 +1,155 @@
+#ifndef CLIENT_H
+#define CLIENT_H
+
+#include <sys/types.h>
+#include <sys/socket.h>
+#include <sys/un.h>
+
+#include "lib/types.h"
+#include "stat.h"
+
+struct fio_net_cmd;
+
+enum {
+	Client_created		= 0,
+	Client_connected	= 1,
+	Client_started		= 2,
+	Client_running		= 3,
+	Client_stopped		= 4,
+	Client_exited		= 5,
+};
+
+struct client_file {
+	char *file;
+	bool remote;
+};
+
+struct fio_client {
+	struct flist_head list;
+	struct flist_head hash_list;
+	struct flist_head arg_list;
+	union {
+		struct sockaddr_in addr;
+		struct sockaddr_in6 addr6;
+		struct sockaddr_un addr_un;
+	};
+	char *hostname;
+	int port;
+	int fd;
+	unsigned int refs;
+	unsigned int last_cmd;
+
+	char *name;
+
+	struct flist_head *opt_lists;
+
+	int state;
+
+	bool skip_newline;
+	bool is_sock;
+	bool disk_stats_shown;
+	unsigned int jobs;
+	unsigned int nr_stat;
+	int error;
+	int signal;
+	int ipv6;
+	bool sent_job;
+	bool did_stat;
+	uint32_t type;
+
+	uint32_t thread_number;
+	uint32_t groupid;
+
+	struct flist_head eta_list;
+	struct client_eta *eta_in_flight;
+	unsigned int eta_timeouts;
+
+	struct flist_head cmd_list;
+
+	uint16_t argc;
+	char **argv;
+
+	struct client_ops *ops;
+	void *client_data;
+
+	struct client_file *files;
+	unsigned int nr_files;
+
+	struct buf_output buf;
+};
+
+typedef void (client_cmd_op)(struct fio_client *, struct fio_net_cmd *);
+typedef void (client_op)(struct fio_client *);
+typedef void (client_eta_op)(struct jobs_eta *je);
+typedef void (client_timed_out_op)(struct fio_client *);
+typedef void (client_jobs_eta_op)(struct fio_client *client, struct jobs_eta *je);
+
+extern struct client_ops fio_client_ops;
+
+struct client_ops {
+	client_cmd_op		*text;
+	client_cmd_op		*disk_util;
+	client_cmd_op		*thread_status;
+	client_cmd_op		*group_stats;
+	client_jobs_eta_op	*jobs_eta;
+	client_eta_op		*eta;
+	client_cmd_op		*probe;
+	client_cmd_op		*quit;
+	client_cmd_op		*add_job;
+	client_cmd_op		*update_job;
+	client_timed_out_op	*timed_out;
+	client_op		*stop;
+	client_cmd_op		*start;
+	client_cmd_op		*job_start;
+	client_timed_out_op	*removed;
+
+	unsigned int eta_msec;
+	int stay_connected;
+	uint32_t client_type;
+};
+
+struct client_eta {
+	unsigned int pending;
+	struct jobs_eta eta;
+};
+
+extern int fio_handle_client(struct fio_client *);
+extern void fio_client_sum_jobs_eta(struct jobs_eta *dst, struct jobs_eta *je);
+
+enum {
+	Fio_client_ipv4 = 1,
+	Fio_client_ipv6,
+	Fio_client_socket,
+};
+
+extern int fio_client_connect(struct fio_client *);
+extern int fio_clients_connect(void);
+extern int fio_start_client(struct fio_client *);
+extern int fio_start_all_clients(void);
+extern int fio_clients_send_ini(const char *);
+extern int fio_client_send_ini(struct fio_client *, const char *, bool);
+extern int fio_handle_clients(struct client_ops *);
+extern int fio_client_add(struct client_ops *, const char *, void **);
+extern struct fio_client *fio_client_add_explicit(struct client_ops *, const char *, int, int);
+extern void fio_client_add_cmd_option(void *, const char *);
+extern int fio_client_add_ini_file(void *, const char *, bool);
+extern int fio_client_terminate(struct fio_client *);
+extern struct fio_client *fio_get_client(struct fio_client *);
+extern void fio_put_client(struct fio_client *);
+extern int fio_client_update_options(struct fio_client *, struct thread_options *, uint64_t *);
+extern int fio_client_wait_for_reply(struct fio_client *, uint64_t);
+extern int fio_clients_send_trigger(const char *);
+
+#define FIO_CLIENT_DEF_ETA_MSEC		900
+
+enum {
+	FIO_CLIENT_TYPE_CLI	= 1,
+	FIO_CLIENT_TYPE_GUI	= 2,
+};
+
+extern int sum_stat_clients;
+extern struct thread_stat client_ts;
+extern struct group_run_stats client_gs;
+
+#endif
+
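The client_ops vtable declared above is how fio decouples its networking core from the front end: the CLI and the gtk GUI each hand their own ops table to fio_handle_clients(). A sketch of how a front end might wire one up; the handler bodies are hypothetical, and compiling it requires the rest of the fio tree's headers:

#include "client.h"

/* Hypothetical handlers; real ones render the payloads somewhere useful. */
static void my_text(struct fio_client *client, struct fio_net_cmd *cmd)
{
	/* display a text payload received from this client */
}

static void my_eta(struct jobs_eta *je)
{
	/* display the aggregated ETA state */
}

static struct client_ops my_client_ops = {
	.text		= my_text,
	.eta		= my_eta,
	.eta_msec	= FIO_CLIENT_DEF_ETA_MSEC,
	.client_type	= FIO_CLIENT_TYPE_CLI,
};

Callbacks left unset stay NULL and are checked before use, as fio_check_clients_timed_out() does with ops->timed_out above.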
diff --git a/compiler/compiler-gcc4.h b/compiler/compiler-gcc4.h
new file mode 100644
index 0000000..e8701cf
--- /dev/null
+++ b/compiler/compiler-gcc4.h
@@ -0,0 +1,17 @@
+#ifndef FIO_COMPILER_GCC4_H
+#define FIO_COMPILER_GCC4_H
+
+#ifndef __must_check
+#define __must_check	__attribute__((warn_unused_result))
+#endif
+
+#define GCC_VERSION (__GNUC__ * 10000		\
+			+ __GNUC_MINOR__ * 100	\
+			+ __GNUC_PATCHLEVEL__)
+
+#if GCC_VERSION >= 40300
+#define __compiletime_warning(message)	__attribute__((warning(message)))
+#define __compiletime_error(message)	__attribute__((error(message)))
+#endif
+
+#endif
diff --git a/compiler/compiler.h b/compiler/compiler.h
new file mode 100644
index 0000000..ddfbcc1
--- /dev/null
+++ b/compiler/compiler.h
@@ -0,0 +1,76 @@
+#ifndef FIO_COMPILER_H
+#define FIO_COMPILER_H
+
+/* IWYU pragma: begin_exports */
+#if __GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 1)
+#include "compiler-gcc4.h"
+#else
+#error Compiler too old, need at least gcc 4.1.0
+#endif
+/* IWYU pragma: end_exports */
+
+#ifndef __must_check
+#define __must_check
+#endif
+
+/*
+ * Mark unused variables passed to ops functions as unused, to silence gcc
+ */
+#define fio_unused	__attribute__((__unused__))
+#define fio_init	__attribute__((constructor))
+#define fio_exit	__attribute__((destructor))
+
+#define fio_unlikely(x)	__builtin_expect(!!(x), 0)
+
+/*
+ * Check at compile time that something is of a particular type.
+ * Always evaluates to 1 so you may use it easily in comparisons.
+ */
+#define typecheck(type,x) \
+({	type __dummy; \
+	__typeof__(x) __dummy2; \
+	(void)(&__dummy == &__dummy2); \
+	1; \
+})
+
+
+#if defined(CONFIG_STATIC_ASSERT)
+#define compiletime_assert(condition, msg) _Static_assert(condition, msg)
+
+#elif !defined(CONFIG_DISABLE_OPTIMIZATIONS)
+
+#ifndef __compiletime_error
+#define __compiletime_error(message)
+#endif
+
+#ifndef __compiletime_error_fallback
+#define __compiletime_error_fallback(condition)	do { } while (0)
+#endif
+
+#define __compiletime_assert(condition, msg, prefix, suffix)	\
+	do {							\
+		int __cond = !(condition);			\
+		extern void prefix ## suffix(void) __compiletime_error(msg); \
+		if (__cond)					\
+			prefix ## suffix();			\
+		__compiletime_error_fallback(__cond);		\
+	} while (0)
+
+#define _compiletime_assert(condition, msg, prefix, suffix) \
+	__compiletime_assert(condition, msg, prefix, suffix)
+
+#define compiletime_assert(condition, msg) \
+	_compiletime_assert(condition, msg, __compiletime_assert_, __LINE__)
+
+#else
+
+#define compiletime_assert(condition, msg)	do { } while (0)
+
+#endif
+
+#ifdef FIO_INTERNAL
+#define ARRAY_SIZE(x)    (sizeof((x)) / (sizeof((x)[0])))
+#define FIELD_SIZE(s, f) (sizeof(((__typeof__(s))0)->f))
+#endif
+
+#endif
diff --git a/configure b/configure
new file mode 100755
index 0000000..5de86ca
--- /dev/null
+++ b/configure
@@ -0,0 +1,2941 @@
+#!/bin/sh
+#
+# Fio configure script. Heavily influenced by the manual qemu configure
+# script. Sad this is easier than autoconf and enemies.
+#
+
+# set temporary file name
+if test ! -z "$TMPDIR" ; then
+  TMPDIR1="${TMPDIR}"
+elif test !
-z "$TEMPDIR" ; then + TMPDIR1="${TEMPDIR}" +else + TMPDIR1="/tmp" +fi + +TMPC="${TMPDIR1}/fio-conf-${RANDOM}-$$-${RANDOM}.c" +TMPC2="${TMPDIR1}/fio-conf-${RANDOM}-$$-${RANDOM}-2.c" +TMPO="${TMPDIR1}/fio-conf-${RANDOM}-$$-${RANDOM}.o" +TMPE="${TMPDIR1}/fio-conf-${RANDOM}-$$-${RANDOM}.exe" + +# NB: do not call "exit" in the trap handler; this is buggy with some shells; +# see <1285349658-3122-1-git-send-email-loic.minier@linaro.org> +trap "rm -f $TMPC $TMPC2 $TMPO $TMPE" EXIT INT QUIT TERM + +rm -rf config.log + +config_host_mak="config-host.mak" +config_host_h="config-host.h" + +rm -rf $config_host_mak +rm -rf $config_host_h + +fatal() { + echo $@ + echo "Configure failed, check config.log and/or the above output" + rm -rf $config_host_mak + rm -rf $config_host_h + exit 1 +} + +# Print result for each configuration test +print_config() { + printf "%-30s%s\n" "$1" "$2" +} + +# Default CFLAGS +CFLAGS="-D_GNU_SOURCE -include config-host.h" +BUILD_CFLAGS="" + +# Print a helpful header at the top of config.log +echo "# FIO configure log $(date)" >> config.log +printf "# Configured with:" >> config.log +printf " '%s'" "$0" "$@" >> config.log +echo >> config.log +echo "#" >> config.log + +# Print configure header at the top of $config_host_h +echo "/*" > $config_host_h +echo " * Automatically generated by configure - do not modify" >> $config_host_h +printf " * Configured with:" >> $config_host_h +printf " * '%s'" "$0" "$@" >> $config_host_h +echo "" >> $config_host_h +echo " */" >> $config_host_h + +do_cc() { + # Run the compiler, capturing its output to the log. + echo $cc "$@" >> config.log + $cc "$@" >> config.log 2>&1 || return $? + # Test passed. If this is an --enable-werror build, rerun + # the test with -Werror and bail out if it fails. This + # makes warning-generating-errors in configure test code + # obvious to developers. + if test "$werror" != "yes"; then + return 0 + fi + # Don't bother rerunning the compile if we were already using -Werror + case "$*" in + *-Werror*) + return 0 + ;; + esac + echo $cc -Werror "$@" >> config.log + $cc -Werror "$@" >> config.log 2>&1 && return $? + echo "ERROR: configure test passed without -Werror but failed with -Werror." + echo "This is probably a bug in the configure script. The failing command" + echo "will be at the bottom of config.log." + fatal "You can run configure with --disable-werror to bypass this check." +} + +compile_object() { + do_cc $CFLAGS -Werror-implicit-function-declaration -c -o $TMPO $TMPC +} + +compile_prog() { + local_cflags="$1" + local_ldflags="$2 $LIBS" + echo "Compiling test case $3" >> config.log + do_cc $CFLAGS -Werror-implicit-function-declaration $local_cflags -o $TMPE $TMPC $LDFLAGS $local_ldflags +} + +feature_not_found() { + feature=$1 + packages=$2 + + echo "ERROR" + echo "ERROR: User requested feature $feature" + if test ! 
-z "$packages" ; then + echo "ERROR: That feature needs $packages installed" + fi + echo "ERROR: configure was not able to find it" + fatal "ERROR" +} + +has() { + type "$1" >/dev/null 2>&1 +} + +check_define() { + cat > $TMPC <> $config_host_mak + echo "#define $1" >> $config_host_h +} + +targetos="" +cpu="" + +# default options +show_help="no" +exit_val=0 +gfio_check="no" +libhdfs="no" +pmemblk="no" +devdax="no" +pmem="no" +disable_lex="" +disable_pmem="no" +disable_native="no" +march_set="no" +libiscsi="no" +libnbd="no" +libaio_uring="no" +prefix=/usr/local + +# parse options +for opt do + optarg=`expr "x$opt" : 'x[^=]*=\(.*\)'` + case "$opt" in + --prefix=*) prefix="$optarg" + ;; + --cpu=*) cpu="$optarg" + ;; + # esx is cross compiled and cannot be detect through simple uname calls + --esx) + esx="yes" + ;; + --cc=*) CC="$optarg" + ;; + --extra-cflags=*) CFLAGS="$CFLAGS $optarg" + ;; + --build-32bit-win) build_32bit_win="yes" + ;; + --target-win-ver=*) target_win_ver="$optarg" + ;; + --build-static) build_static="yes" + ;; + --enable-gfio) gfio_check="yes" + ;; + --disable-numa) disable_numa="yes" + ;; + --disable-rdma) disable_rdma="yes" + ;; + --disable-rados) disable_rados="yes" + ;; + --disable-rbd) disable_rbd="yes" + ;; + --disable-http) disable_http="yes" + ;; + --disable-gfapi) disable_gfapi="yes" + ;; + --enable-libhdfs) libhdfs="yes" + ;; + --disable-lex) disable_lex="yes" + ;; + --enable-lex) disable_lex="no" + ;; + --disable-shm) no_shm="yes" + ;; + --disable-optimizations) disable_opt="yes" + ;; + --disable-pmem) disable_pmem="yes" + ;; + --enable-cuda) enable_cuda="yes" + ;; + --disable-native) disable_native="yes" + ;; + --with-ime=*) ime_path="$optarg" + ;; + --enable-libiscsi) libiscsi="yes" + ;; + --enable-libnbd) libnbd="yes" + ;; + --disable-tcmalloc) disable_tcmalloc="yes" + ;; + --enable-libaio-uring) libaio_uring="yes" + ;; + --help) + show_help="yes" + ;; + *) + echo "Bad option $opt" + show_help="yes" + exit_val=1 + esac +done + +if test "$show_help" = "yes" ; then + echo "--prefix= Use this directory as installation prefix" + echo "--cpu= Specify target CPU if auto-detect fails" + echo "--cc= Specify compiler to use" + echo "--extra-cflags= Specify extra CFLAGS to pass to compiler" + echo "--build-32bit-win Enable 32-bit build on Windows" + echo "--target-win-ver= Minimum version of Windows to target (XP or 7)" + echo "--build-static Build a static fio" + echo "--esx Configure build options for esx" + echo "--enable-gfio Enable building of gtk gfio" + echo "--disable-numa Disable libnuma even if found" + echo "--disable-rdma Disable RDMA support even if found" + echo "--disable-rados Disable Rados support even if found" + echo "--disable-rbd Disable Rados Block Device even if found" + echo "--disable-http Disable HTTP support even if found" + echo "--disable-gfapi Disable gfapi" + echo "--enable-libhdfs Enable hdfs support" + echo "--disable-lex Disable use of lex/yacc for math" + echo "--disable-pmem Disable pmem based engines even if found" + echo "--enable-lex Enable use of lex/yacc for math" + echo "--disable-shm Disable SHM support" + echo "--disable-optimizations Don't enable compiler optimizations" + echo "--enable-cuda Enable GPUDirect RDMA support" + echo "--disable-native Don't build for native host" + echo "--with-ime= Install path for DDN's Infinite Memory Engine" + echo "--enable-libiscsi Enable iscsi support" + echo "--enable-libnbd Enable libnbd (NBD engine) support" + echo "--disable-tcmalloc Disable tcmalloc support" + echo 
"--enable-libaio-uring Enable libaio emulated over io_uring" + exit $exit_val +fi + +cross_prefix=${cross_prefix-${CROSS_COMPILE}} +# Preferred compiler (can be overriden later after we know the platform): +# ${CC} (if set) +# ${cross_prefix}gcc (if cross-prefix specified) +# gcc if available +# clang if available +if test -z "${CC}${cross_prefix}"; then + if has gcc; then + cc=gcc + elif has clang; then + cc=clang + fi +else + cc="${CC-${cross_prefix}gcc}" +fi + +if check_define __ANDROID__ ; then + targetos="Android" +elif check_define __linux__ ; then + targetos="Linux" +elif check_define __OpenBSD__ ; then + targetos='OpenBSD' +elif check_define __NetBSD__ ; then + targetos='NetBSD' +elif check_define __sun__ ; then + targetos='SunOS' + CFLAGS="$CFLAGS -D_REENTRANT" +elif check_define _WIN32 ; then + targetos='CYGWIN' +else + targetos=`uname -s` +fi + +echo "# Automatically generated by configure - do not modify" > $config_host_mak +printf "# Configured with:" >> $config_host_mak +printf " '%s'" "$0" "$@" >> $config_host_mak +echo >> $config_host_mak +echo "CONFIG_TARGET_OS=$targetos" >> $config_host_mak + +if test "$no_shm" = "yes" ; then + output_sym "CONFIG_NO_SHM" +fi + +if test "$disable_opt" = "yes" ; then + output_sym "CONFIG_FIO_NO_OPT" +fi + +# Some host OSes need non-standard checks for which CPU to use. +# Note that these checks are broken for cross-compilation: if you're +# cross-compiling to one of these OSes then you'll need to specify +# the correct CPU with the --cpu option. +case $targetos in +AIX|OpenBSD|NetBSD) + # Unless explicitly enabled, turn off lex. + # OpenBSD will hit syntax error when enabled. + if test -z "$disable_lex" ; then + disable_lex="yes" + else + force_no_lex_o="yes" + fi + ;; +FreeBSD) + CFLAGS="$CFLAGS -I/usr/local/include" + LDFLAGS="$LDFLAGS -L/usr/local/lib" + ;; +Darwin) + # on Leopard most of the system is 32-bit, so we have to ask the kernel if + # we can run 64-bit userspace code. + # If the user didn't specify a CPU explicitly and the kernel says this is + # 64 bit hw, then assume x86_64. Otherwise fall through to the usual + # detection code. + if test -z "$cpu" && test "$(sysctl -n hw.optional.x86_64)" = "1"; then + cpu="x86_64" + fi + # Error at compile time linking of weak/partial symbols if possible... +cat > $TMPC < $TMPC </dev/null || cross_compile="yes" +else + fatal "compile test failed" +fi + +########################################## +# check endianness +if test "$bigendian" != "yes" ; then + bigendian="no" +fi +if test "$cross_compile" = "no" ; then + cat > $TMPC < +int main(void) +{ + volatile uint32_t i=0x01234567; + return (*((uint8_t*)(&i))) == 0x67; +} +EOF + if compile_prog "" "" "endian"; then + $TMPE && bigendian="yes" + fi +else + # If we're cross compiling, try our best to work it out and rely on the + # run-time check to fail if we get it wrong. + cat > $TMPC < +int main(void) +{ +#if __BYTE_ORDER != __BIG_ENDIAN +# error "Unknown endianness" +#endif +} +EOF + compile_prog "" "" "endian" && bigendian="yes" + check_define "__ARMEB__" && bigendian="yes" + check_define "__MIPSEB__" && bigendian="yes" +fi + + +print_config "Operating system" "$targetos" +print_config "CPU" "$cpu" +print_config "Big endian" "$bigendian" +if test ! 
-z "$target_win_ver"; then + print_config "Target Windows version" "$target_win_ver" +fi +print_config "Compiler" "$cc" +print_config "Cross compile" "$cross_compile" +echo + +########################################## +# See if we need to build a static build +if test "$build_static" = "yes" ; then + CFLAGS="$CFLAGS -ffunction-sections -fdata-sections" + LDFLAGS="$LDFLAGS -static -Wl,--gc-sections" +else + build_static="no" +fi +print_config "Static build" "$build_static" + +########################################## +# check for wordsize +wordsize="0" +cat > $TMPC < +#define BUILD_BUG_ON(condition) ((void)sizeof(char[1 - 2*!!(condition)])) +int main(void) +{ + BUILD_BUG_ON(sizeof(long)*CHAR_BIT != WORDSIZE); + return 0; +} +EOF +if compile_prog "-DWORDSIZE=32" "" "wordsize"; then + wordsize="32" +elif compile_prog "-DWORDSIZE=64" "" "wordsize"; then + wordsize="64" +else + fatal "Unknown wordsize" +fi +print_config "Wordsize" "$wordsize" + +########################################## +# zlib probe +if test "$zlib" != "yes" ; then + zlib="no" +fi +cat > $TMPC < +int main(void) +{ + z_stream stream; + if (inflateInit(&stream) != Z_OK) + return 1; + return 0; +} +EOF +if compile_prog "" "-lz" "zlib" ; then + zlib=yes + LIBS="-lz $LIBS" +fi +print_config "zlib" "$zlib" + +########################################## +# linux-aio probe +if test "$libaio" != "yes" ; then + libaio="no" +fi +if test "$esx" != "yes" ; then + cat > $TMPC < +#include +int main(void) +{ + io_setup(0, NULL); + return 0; +} +EOF + if test "$libaio_uring" = "yes" && compile_prog "" "-luring" "libaio io_uring" ; then + libaio=yes + LIBS="-luring $LIBS" + elif compile_prog "" "-laio" "libaio" ; then + libaio=yes + libaio_uring=no + LIBS="-laio $LIBS" + else + if test "$libaio" = "yes" ; then + feature_not_found "linux AIO" "libaio-dev or libaio-devel" + fi + libaio=no + libaio_uring=no + fi +fi +print_config "Linux AIO support" "$libaio" +print_config "Linux AIO over io_uring" "$libaio_uring" + +########################################## +# posix aio probe +if test "$posix_aio" != "yes" ; then + posix_aio="no" +fi +if test "$posix_aio_lrt" != "yes" ; then + posix_aio_lrt="no" +fi +cat > $TMPC < +int main(void) +{ + struct aiocb cb; + aio_read(&cb); + return 0; +} +EOF +if compile_prog "" "" "posixaio" ; then + posix_aio="yes" +elif compile_prog "" "-lrt" "posixaio -lrt"; then + posix_aio="yes" + posix_aio_lrt="yes" + LIBS="-lrt $LIBS" +fi +print_config "POSIX AIO support" "$posix_aio" +print_config "POSIX AIO support needs -lrt" "$posix_aio_lrt" + +########################################## +# posix aio fsync probe +if test "$posix_aio_fsync" != "yes" ; then + posix_aio_fsync="no" +fi +if test "$posix_aio" = "yes" ; then + cat > $TMPC < +#include +int main(void) +{ + struct aiocb cb; + return aio_fsync(O_SYNC, &cb); + return 0; +} +EOF + if compile_prog "" "$LIBS" "posix_aio_fsync" ; then + posix_aio_fsync=yes + fi +fi +print_config "POSIX AIO fsync" "$posix_aio_fsync" + +########################################## +# POSIX pshared attribute probe +if test "$posix_pshared" != "yes" ; then + posix_pshared="no" +fi +cat > $TMPC < +int main(void) +{ +#if defined(_POSIX_THREAD_PROCESS_SHARED) && ((_POSIX_THREAD_PROCESS_SHARED + 0) > 0) +# if defined(__CYGWIN__) +# error "_POSIX_THREAD_PROCESS_SHARED is buggy on Cygwin" +# elif defined(__APPLE__) +# include +# include +# if TARGET_OS_MAC && MAC_OS_X_VERSION_MIN_REQUIRED < 1070 +# error "_POSIX_THREAD_PROCESS_SHARED is buggy/unsupported prior to OSX 10.7" +# endif +# endif +#else 
+# error "_POSIX_THREAD_PROCESS_SHARED is unsupported" +#endif + return 0; +} +EOF +if compile_prog "" "$LIBS" "posix_pshared" ; then + posix_pshared=yes +fi +print_config "POSIX pshared support" "$posix_pshared" + +########################################## +# POSIX pthread_condattr_setclock() probe +if test "$pthread_condattr_setclock" != "yes" ; then + pthread_condattr_setclock="no" +fi +cat > $TMPC < +int main(void) +{ + pthread_condattr_t condattr; + pthread_condattr_setclock(&condattr, CLOCK_MONOTONIC); + return 0; +} +EOF +if compile_prog "" "$LIBS" "pthread_condattr_setclock" ; then + pthread_condattr_setclock=yes +elif compile_prog "" "$LIBS -lpthread" "pthread_condattr_setclock" ; then + pthread_condattr_setclock=yes + LIBS="$LIBS -lpthread" +fi +print_config "pthread_condattr_setclock()" "$pthread_condattr_setclock" + +########################################## +# pthread_sigmask() probe +if test "$pthread_sigmask" != "yes" ; then + pthread_sigmask="no" +fi +cat > $TMPC < /* NULL */ +#include /* pthread_sigmask() */ +int main(void) +{ + return pthread_sigmask(0, NULL, NULL); +} +EOF +if compile_prog "" "$LIBS" "pthread_sigmask" ; then + pthread_sigmask=yes +elif compile_prog "" "$LIBS -lpthread" "pthread_sigmask" ; then + pthread_sigmask=yes + LIBS="$LIBS -lpthread" +fi +print_config "pthread_sigmask()" "$pthread_sigmask" + +########################################## +# solaris aio probe +if test "$solaris_aio" != "yes" ; then + solaris_aio="no" +fi +cat > $TMPC < +#include +#include +int main(void) +{ + aio_result_t res; + return aioread(0, NULL, 0, 0, SEEK_SET, &res); + return 0; +} +EOF +if compile_prog "" "-laio" "solarisaio" ; then + solaris_aio=yes + LIBS="-laio $LIBS" +fi +print_config "Solaris AIO support" "$solaris_aio" + +########################################## +# __sync_fetch_and_add test +if test "$sfaa" != "yes" ; then + sfaa="no" +fi +cat > $TMPC << EOF +#include +static int sfaa(uint64_t *ptr) +{ + return __sync_fetch_and_add(ptr, 0); +} + +int main(int argc, char **argv) +{ + uint64_t val = 42; + sfaa(&val); + return val; +} +EOF +if compile_prog "" "" "__sync_fetch_and_add()" ; then + sfaa="yes" +fi +print_config "__sync_fetch_and_add" "$sfaa" + +########################################## +# __sync_synchronize() test +if test "$sync_sync" != "yes" ; then + sync_sync="no" +fi +cat > $TMPC << EOF +#include + +int main(int argc, char **argv) +{ + __sync_synchronize(); + return 0; +} +EOF +if compile_prog "" "" "__sync_synchronize()" ; then + sync_sync="yes" +fi +print_config "__sync_synchronize" "$sync_sync" + +########################################## +# __sync_val_compare_and_swap() test +if test "$cmp_swap" != "yes" ; then + cmp_swap="no" +fi +cat > $TMPC << EOF +#include + +int main(int argc, char **argv) +{ + int x = 0; + return __sync_val_compare_and_swap(&x, 1, 2); +} +EOF +if compile_prog "" "" "__sync_val_compare_and_swap()" ; then + cmp_swap="yes" +fi +print_config "__sync_val_compare_and_swap" "$cmp_swap" + +########################################## +# libverbs probe +if test "$libverbs" != "yes" ; then + libverbs="no" +fi +cat > $TMPC << EOF +#include +int main(int argc, char **argv) +{ + struct ibv_pd *pd = ibv_alloc_pd(NULL); + return pd != NULL; +} +EOF +if test "$disable_rdma" != "yes" && compile_prog "" "-libverbs" "libverbs" ; then + libverbs="yes" + LIBS="-libverbs $LIBS" +fi +print_config "libverbs" "$libverbs" + +########################################## +# rdmacm probe +if test "$rdmacm" != "yes" ; then + rdmacm="no" +fi +cat > $TMPC << 
EOF +#include +#include +int main(int argc, char **argv) +{ + rdma_destroy_qp(NULL); + return 0; +} +EOF +if test "$disable_rdma" != "yes" && compile_prog "" "-lrdmacm" "rdma"; then + rdmacm="yes" + LIBS="-lrdmacm $LIBS" +fi +print_config "rdmacm" "$rdmacm" + +########################################## +# asprintf() and vasprintf() probes +if test "$have_asprintf" != "yes" ; then + have_asprintf="no" +fi +cat > $TMPC << EOF +#include + +int main(int argc, char **argv) +{ + return asprintf(NULL, "%s", "str") == 0; +} +EOF +if compile_prog "" "" "have_asprintf"; then + have_asprintf="yes" +fi +print_config "asprintf()" "$have_asprintf" + +if test "$have_vasprintf" != "yes" ; then + have_vasprintf="no" +fi +cat > $TMPC << EOF +#include + +int main(int argc, char **argv) +{ + return vasprintf(NULL, "%s", NULL) == 0; +} +EOF +if compile_prog "" "" "have_vasprintf"; then + have_vasprintf="yes" +fi +print_config "vasprintf()" "$have_vasprintf" + +########################################## +# Linux fallocate probe +if test "$linux_fallocate" != "yes" ; then + linux_fallocate="no" +fi +cat > $TMPC << EOF +#include +#include +#include +int main(int argc, char **argv) +{ + int r = fallocate(0, FALLOC_FL_KEEP_SIZE, 0, 1024); + return r; +} +EOF +if compile_prog "" "" "linux_fallocate"; then + linux_fallocate="yes" +fi +print_config "Linux fallocate" "$linux_fallocate" + +########################################## +# POSIX fadvise probe +if test "$posix_fadvise" != "yes" ; then + posix_fadvise="no" +fi +cat > $TMPC << EOF +#include +#include +int main(int argc, char **argv) +{ + int r = posix_fadvise(0, 0, 0, POSIX_FADV_NORMAL); + return r; +} +EOF +if compile_prog "" "" "posix_fadvise"; then + posix_fadvise="yes" +fi +print_config "POSIX fadvise" "$posix_fadvise" + +########################################## +# POSIX fallocate probe +if test "$posix_fallocate" != "yes" ; then + posix_fallocate="no" +fi +cat > $TMPC << EOF +#include +#include +int main(int argc, char **argv) +{ + int r = posix_fallocate(0, 0, 1024); + return r; +} +EOF +if compile_prog "" "" "posix_fallocate"; then + posix_fallocate="yes" +fi +print_config "POSIX fallocate" "$posix_fallocate" + +########################################## +# sched_set/getaffinity 2 or 3 argument test +if test "$linux_2arg_affinity" != "yes" ; then + linux_2arg_affinity="no" +fi +if test "$linux_3arg_affinity" != "yes" ; then + linux_3arg_affinity="no" +fi +cat > $TMPC << EOF +#include +int main(int argc, char **argv) +{ + cpu_set_t mask; + return sched_setaffinity(0, sizeof(mask), &mask); +} +EOF +if compile_prog "" "" "sched_setaffinity(,,)"; then + linux_3arg_affinity="yes" +else + cat > $TMPC << EOF +#include +int main(int argc, char **argv) +{ + cpu_set_t mask; + return sched_setaffinity(0, &mask); +} +EOF + if compile_prog "" "" "sched_setaffinity(,)"; then + linux_2arg_affinity="yes" + fi +fi +print_config "sched_setaffinity(3 arg)" "$linux_3arg_affinity" +print_config "sched_setaffinity(2 arg)" "$linux_2arg_affinity" + +########################################## +# clock_gettime probe +if test "$clock_gettime" != "yes" ; then + clock_gettime="no" +fi +cat > $TMPC << EOF +#include +#include +int main(int argc, char **argv) +{ + return clock_gettime(0, NULL); +} +EOF +if compile_prog "" "" "clock_gettime"; then + clock_gettime="yes" +elif compile_prog "" "-lrt" "clock_gettime"; then + clock_gettime="yes" + LIBS="-lrt $LIBS" +fi +print_config "clock_gettime" "$clock_gettime" + +########################################## +# CLOCK_MONOTONIC probe +if 
test "$clock_monotonic" != "yes" ; then + clock_monotonic="no" +fi +if test "$clock_gettime" = "yes" ; then + cat > $TMPC << EOF +#include +#include +int main(int argc, char **argv) +{ + return clock_gettime(CLOCK_MONOTONIC, NULL); +} +EOF + if compile_prog "" "$LIBS" "clock monotonic"; then + clock_monotonic="yes" + fi +fi +print_config "CLOCK_MONOTONIC" "$clock_monotonic" + +########################################## +# CLOCK_MONOTONIC_RAW probe +if test "$clock_monotonic_raw" != "yes" ; then + clock_monotonic_raw="no" +fi +if test "$clock_gettime" = "yes" ; then + cat > $TMPC << EOF +#include +#include +int main(int argc, char **argv) +{ + return clock_gettime(CLOCK_MONOTONIC_RAW, NULL); +} +EOF + if compile_prog "" "$LIBS" "clock monotonic"; then + clock_monotonic_raw="yes" + fi +fi +print_config "CLOCK_MONOTONIC_RAW" "$clock_monotonic_raw" + +########################################## +# CLOCK_MONOTONIC_PRECISE probe +if test "$clock_monotonic_precise" != "yes" ; then + clock_monotonic_precise="no" +fi +if test "$clock_gettime" = "yes" ; then + cat > $TMPC << EOF +#include +#include +int main(int argc, char **argv) +{ + return clock_gettime(CLOCK_MONOTONIC_PRECISE, NULL); +} +EOF + if compile_prog "" "$LIBS" "clock monotonic precise"; then + clock_monotonic_precise="yes" + fi +fi +print_config "CLOCK_MONOTONIC_PRECISE" "$clock_monotonic_precise" + +########################################## +# clockid_t probe +if test "$clockid_t" != "yes" ; then + clockid_t="no" +fi +cat > $TMPC << EOF +#include +#include +int main(int argc, char **argv) +{ + volatile clockid_t cid; + memset((void*)&cid, 0, sizeof(cid)); + return 0; +} +EOF +if compile_prog "" "$LIBS" "clockid_t"; then + clockid_t="yes" +fi +print_config "clockid_t" "$clockid_t" + +########################################## +# gettimeofday() probe +if test "$gettimeofday" != "yes" ; then + gettimeofday="no" +fi +cat > $TMPC << EOF +#include +#include +int main(int argc, char **argv) +{ + struct timeval tv; + return gettimeofday(&tv, NULL); +} +EOF +if compile_prog "" "" "gettimeofday"; then + gettimeofday="yes" +fi +print_config "gettimeofday" "$gettimeofday" + +########################################## +# fdatasync() probe +if test "$fdatasync" != "yes" ; then + fdatasync="no" +fi +cat > $TMPC << EOF +#include +#include +int main(int argc, char **argv) +{ + return fdatasync(0); +} +EOF +if compile_prog "" "" "fdatasync"; then + fdatasync="yes" +fi +print_config "fdatasync" "$fdatasync" + +########################################## +# pipe() probe +if test "$pipe" != "yes" ; then + pipe="no" +fi +cat > $TMPC << EOF +#include +int main(int argc, char **argv) +{ + int fd[2]; + return pipe(fd); +} +EOF +if compile_prog "" "" "pipe"; then + pipe="yes" +fi +print_config "pipe()" "$pipe" + +########################################## +# pipe2() probe +if test "$pipe2" != "yes" ; then + pipe2="no" +fi +cat > $TMPC << EOF +#include +int main(int argc, char **argv) +{ + int fd[2]; + return pipe2(fd, 0); +} +EOF +if compile_prog "" "" "pipe2"; then + pipe2="yes" +fi +print_config "pipe2()" "$pipe2" + +########################################## +# pread() probe +if test "$pread" != "yes" ; then + pread="no" +fi +cat > $TMPC << EOF +#include +int main(int argc, char **argv) +{ + return pread(0, NULL, 0, 0); +} +EOF +if compile_prog "" "" "pread"; then + pread="yes" +fi +print_config "pread()" "$pread" + +########################################## +# sync_file_range() probe +if test "$sync_file_range" != "yes" ; then + sync_file_range="no" +fi 
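Every probe in this script follows the same recipe: write a tiny C program to $TMPC, attempt to compile (and link) it, and record yes/no with print_config. For reference, this is roughly the test program the sync_file_range() probe just below feeds to the compiler; the include lines were mangled in this copy of the patch, so the header names here are a best guess at the usual set (the -D_GNU_SOURCE in the default CFLAGS exposes the SYNC_FILE_RANGE_* flags):

#include <stdio.h>
#include <unistd.h>
#include <fcntl.h>
#include <linux/fs.h>
int main(int argc, char **argv)
{
	unsigned int flags = SYNC_FILE_RANGE_WAIT_BEFORE | SYNC_FILE_RANGE_WRITE |
				SYNC_FILE_RANGE_WAIT_AFTER;
	return sync_file_range(0, 0, 0, flags);
}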
+cat > $TMPC << EOF +#include +#include +#include +#include +int main(int argc, char **argv) +{ + unsigned int flags = SYNC_FILE_RANGE_WAIT_BEFORE | SYNC_FILE_RANGE_WRITE | + SYNC_FILE_RANGE_WAIT_AFTER; + return sync_file_range(0, 0, 0, flags); +} +EOF +if compile_prog "" "" "sync_file_range"; then + sync_file_range="yes" +fi +print_config "sync_file_range" "$sync_file_range" + +########################################## +# ext4 move extent probe +if test "$ext4_me" != "yes" ; then + ext4_me="no" +fi +cat > $TMPC << EOF +#include +#include +int main(int argc, char **argv) +{ + struct move_extent me; + return ioctl(0, EXT4_IOC_MOVE_EXT, &me); +} +EOF +if compile_prog "" "" "ext4 move extent" ; then + ext4_me="yes" +elif test $targetos = "Linux" ; then + # On Linux, just default to it on and let it error at runtime if we really + # don't have it. None of my updated systems have it defined, but it does + # work. Takes a while to bubble back. + ext4_me="yes" +fi +print_config "EXT4 move extent" "$ext4_me" + +########################################## +# splice probe +if test "$linux_splice" != "yes" ; then + linux_splice="no" +fi +cat > $TMPC << EOF +#include +#include +int main(int argc, char **argv) +{ + return splice(0, NULL, 0, NULL, 0, SPLICE_F_NONBLOCK); +} +EOF +if compile_prog "" "" "linux splice"; then + linux_splice="yes" +fi +print_config "Linux splice(2)" "$linux_splice" + +########################################## +# GUASI probe +if test "$guasi" != "yes" ; then + guasi="no" +fi +cat > $TMPC << EOF +#include +#include +int main(int argc, char **argv) +{ + guasi_t ctx = guasi_create(0, 0, 0); + return 0; +} +EOF +if compile_prog "" "" "guasi"; then + guasi="yes" +fi +print_config "GUASI" "$guasi" + +########################################## +# libnuma probe +if test "$libnuma" != "yes" ; then + libnuma="no" +fi +cat > $TMPC << EOF +#include +int main(int argc, char **argv) +{ + return numa_available(); +} +EOF +if test "$disable_numa" != "yes" && compile_prog "" "-lnuma" "libnuma"; then + libnuma="yes" + LIBS="-lnuma $LIBS" +fi +print_config "libnuma" "$libnuma" + +########################################## +# libnuma 2.x version API, initialize with "no" only if $libnuma is set to "yes" +if test "$libnuma" = "yes" ; then +libnuma_v2="no" +cat > $TMPC << EOF +#include +int main(int argc, char **argv) +{ + struct bitmask *mask = numa_parse_nodestring(NULL); + return mask->size == 0; +} +EOF +if compile_prog "" "" "libnuma api"; then + libnuma_v2="yes" +fi +print_config "libnuma v2" "$libnuma_v2" +fi + +########################################## +# strsep() probe +if test "$strsep" != "yes" ; then + strsep="no" +fi +cat > $TMPC << EOF +#include +int main(int argc, char **argv) +{ + static char *string = "This is a string"; + strsep(&string, "needle"); + return 0; +} +EOF +if compile_prog "" "" "strsep"; then + strsep="yes" +fi +print_config "strsep" "$strsep" + +########################################## +# strcasestr() probe +if test "$strcasestr" != "yes" ; then + strcasestr="no" +fi +cat > $TMPC << EOF +#include +int main(int argc, char **argv) +{ + return strcasestr(argv[0], argv[1]) != NULL; +} +EOF +if compile_prog "" "" "strcasestr"; then + strcasestr="yes" +fi +print_config "strcasestr" "$strcasestr" + +########################################## +# strlcat() probe +if test "$strlcat" != "yes" ; then + strlcat="no" +fi +cat > $TMPC << EOF +#include +int main(int argc, char **argv) +{ + static char dst[64]; + static char *string = "This is a string"; + memset(dst, 0, 
sizeof(dst)); + strlcat(dst, string, sizeof(dst)); + return 0; +} +EOF +if compile_prog "" "" "strlcat"; then + strlcat="yes" +fi +print_config "strlcat" "$strlcat" + +########################################## +# getopt_long_only() probe +if test "$getopt_long_only" != "yes" ; then + getopt_long_only="no" +fi +cat > $TMPC << EOF +#include +#include +#include +int main(int argc, char **argv) +{ + int c = getopt_long_only(argc, argv, "", NULL, NULL); + return c; +} +EOF +if compile_prog "" "" "getopt_long_only"; then + getopt_long_only="yes" +fi +print_config "getopt_long_only()" "$getopt_long_only" + +########################################## +# inet_aton() probe +if test "$inet_aton" != "yes" ; then + inet_aton="no" +fi +cat > $TMPC << EOF +#ifdef _WIN32 +#include +#else +#include +#include +#endif +#include +int main(int argc, char **argv) +{ + struct in_addr in; + return inet_aton(NULL, &in); +} +EOF +if compile_prog "" "" "inet_aton"; then + inet_aton="yes" +fi +print_config "inet_aton" "$inet_aton" + +########################################## +# socklen_t probe +if test "$socklen_t" != "yes" ; then + socklen_t="no" +fi +cat > $TMPC << EOF +#ifdef _WIN32 +#include +#include +#else +#include +#endif +int main(int argc, char **argv) +{ + socklen_t len = 0; + return len; +} +EOF +if compile_prog "" "" "socklen_t"; then + socklen_t="yes" +fi +print_config "socklen_t" "$socklen_t" + +########################################## +# Whether or not __thread is supported for TLS +if test "$tls_thread" != "yes" ; then + tls_thread="no" +fi +cat > $TMPC << EOF +#include +static __thread int ret; +int main(int argc, char **argv) +{ + return ret; +} +EOF +if compile_prog "" "" "__thread"; then + tls_thread="yes" +fi +print_config "__thread" "$tls_thread" + +########################################## +# Check if we have required gtk/glib support for gfio +if test "$gfio" != "yes" ; then + gfio="no" +fi +if test "$gfio_check" = "yes" ; then + cat > $TMPC << EOF +#include +#include +#include +int main(void) +{ + gdk_threads_enter(); + gdk_threads_leave(); + + return GTK_CHECK_VERSION(2, 18, 0) ? 0 : 1; /* 0 on success */ +} +EOF +GTK_CFLAGS=$(${cross_prefix}pkg-config --cflags gtk+-2.0 gthread-2.0) +ORG_LDFLAGS=$LDFLAGS +LDFLAGS=$(echo $LDFLAGS | sed s/"-static"//g) +if test "$?" != "0" ; then + echo "configure: gtk and gthread not found" + exit 1 +fi +GTK_LIBS=$(${cross_prefix}pkg-config --libs gtk+-2.0 gthread-2.0) +if test "$?" != "0" ; then + echo "configure: gtk and gthread not found" + exit 1 +fi +if ! 
${cross_prefix}pkg-config --atleast-version 2.18.0 gtk+-2.0; then + echo "GTK found, but need version 2.18 or higher" + gfio="no" +else + if compile_prog "$GTK_CFLAGS" "$GTK_LIBS" "gfio" ; then + gfio="yes" + GFIO_LIBS="$LIBS $GTK_LIBS" + CFLAGS="$CFLAGS $GTK_CFLAGS" + else + echo "Please install gtk and gdk libraries" + gfio="no" + fi +fi +LDFLAGS=$ORG_LDFLAGS +fi + +if test "$gfio_check" = "yes" ; then + print_config "gtk 2.18 or higher" "$gfio" +fi + +########################################## +# Check whether we have getrusage(RUSAGE_THREAD) +if test "$rusage_thread" != "yes" ; then + rusage_thread="no" +fi +cat > $TMPC << EOF +#include +#include +int main(int argc, char **argv) +{ + struct rusage ru; + getrusage(RUSAGE_THREAD, &ru); + return 0; +} +EOF +if compile_prog "" "" "RUSAGE_THREAD"; then + rusage_thread="yes" +fi +print_config "RUSAGE_THREAD" "$rusage_thread" + +########################################## +# Check whether we have SCHED_IDLE +if test "$sched_idle" != "yes" ; then + sched_idle="no" +fi +cat > $TMPC << EOF +#include +int main(int argc, char **argv) +{ + struct sched_param p; + return sched_setscheduler(0, SCHED_IDLE, &p); +} +EOF +if compile_prog "" "" "SCHED_IDLE"; then + sched_idle="yes" +fi +print_config "SCHED_IDLE" "$sched_idle" + +########################################## +# Check whether we have TCP_NODELAY +if test "$tcp_nodelay" != "yes" ; then + tcp_nodelay="no" +fi +cat > $TMPC << EOF +#ifdef _WIN32 +#include +#else +#include +#include +#include +#include +#endif +int main(int argc, char **argv) +{ + return getsockopt(0, 0, TCP_NODELAY, NULL, NULL); +} +EOF +if compile_prog "" "" "TCP_NODELAY"; then + tcp_nodelay="yes" +elif compile_prog "" "-lws2_32" "TCP_NODELAY"; then + tcp_nodelay="yes" + LIBS="$LIBS -lws2_32" +fi +print_config "TCP_NODELAY" "$tcp_nodelay" + +########################################## +# Check whether we have SO_SNDBUF +if test "$window_size" != "yes" ; then + window_size="no" +fi +cat > $TMPC << EOF +#ifdef _WIN32 +#include +#else +#include +#include +#include +#include +#endif +int main(int argc, char **argv) +{ + setsockopt(0, SOL_SOCKET, SO_SNDBUF, NULL, 0); + setsockopt(0, SOL_SOCKET, SO_RCVBUF, NULL, 0); +} +EOF +if compile_prog "" "" "SO_SNDBUF"; then + window_size="yes" +elif compile_prog "" "-lws2_32" "SO_SNDBUF"; then + window_size="yes" + LIBS="$LIBS -lws2_32" +fi +print_config "Net engine window_size" "$window_size" + +########################################## +# Check whether we have TCP_MAXSEG +if test "$mss" != "yes" ; then + mss="no" +fi +cat > $TMPC << EOF +#ifdef _WIN32 +#include +#else +#include +#include +#include +#include +#include +#include +#endif +int main(int argc, char **argv) +{ + return setsockopt(0, IPPROTO_TCP, TCP_MAXSEG, NULL, 0); +} +EOF +if compile_prog "" "" "TCP_MAXSEG"; then + mss="yes" +elif compile_prog "" "-lws2_32" "TCP_MAXSEG"; then + mss="yes" + LIBS="$LIBS -lws2_32" +fi +print_config "TCP_MAXSEG" "$mss" + +########################################## +# Check whether we have RLIMIT_MEMLOCK +if test "$rlimit_memlock" != "yes" ; then + rlimit_memlock="no" +fi +cat > $TMPC << EOF +#include +#include +int main(int argc, char **argv) +{ + struct rlimit rl; + return getrlimit(RLIMIT_MEMLOCK, &rl); +} +EOF +if compile_prog "" "" "RLIMIT_MEMLOCK"; then + rlimit_memlock="yes" +fi +print_config "RLIMIT_MEMLOCK" "$rlimit_memlock" + +########################################## +# Check whether we have pwritev/preadv +if test "$pwritev" != "yes" ; then + pwritev="no" +fi +cat > $TMPC << EOF +#include 
+#include +int main(int argc, char **argv) +{ + return pwritev(0, NULL, 1, 0) + preadv(0, NULL, 1, 0); +} +EOF +if compile_prog "" "" "pwritev"; then + pwritev="yes" +fi +print_config "pwritev/preadv" "$pwritev" + +########################################## +# Check whether we have pwritev2/preadv2 +if test "$pwritev2" != "yes" ; then + pwritev2="no" +fi +cat > $TMPC << EOF +#include +#include +int main(int argc, char **argv) +{ + return pwritev2(0, NULL, 1, 0, 0) + preadv2(0, NULL, 1, 0, 0); +} +EOF +if compile_prog "" "" "pwritev2"; then + pwritev2="yes" +fi +print_config "pwritev2/preadv2" "$pwritev2" + +########################################## +# Check whether we have the required functions for ipv6 +if test "$ipv6" != "yes" ; then + ipv6="no" +fi +cat > $TMPC << EOF +#ifdef _WIN32 +#include +#include +#else +#include +#include +#include +#include +#endif +#include +int main(int argc, char **argv) +{ + struct addrinfo hints; + struct in6_addr addr; + int ret; + + ret = getaddrinfo(NULL, NULL, &hints, NULL); + freeaddrinfo(NULL); + printf("%s\n", gai_strerror(ret)); + addr = in6addr_any; + return 0; +} +EOF +if compile_prog "" "" "ipv6"; then + ipv6="yes" +fi +print_config "IPv6 helpers" "$ipv6" + +########################################## +# check for http +if test "$http" != "yes" ; then + http="no" +fi +# check for openssl >= 1.1.0, which uses an opaque HMAC_CTX pointer +cat > $TMPC << EOF +#include +#include + +int main(int argc, char **argv) +{ + CURL *curl; + HMAC_CTX *ctx; + + curl = curl_easy_init(); + curl_easy_cleanup(curl); + + ctx = HMAC_CTX_new(); + HMAC_CTX_reset(ctx); + HMAC_CTX_free(ctx); + return 0; +} +EOF +# openssl < 1.1.0 uses the HMAC_CTX type directly +cat > $TMPC2 << EOF +#include +#include + +int main(int argc, char **argv) +{ + CURL *curl; + HMAC_CTX ctx; + + curl = curl_easy_init(); + curl_easy_cleanup(curl); + + HMAC_CTX_init(&ctx); + HMAC_CTX_cleanup(&ctx); + return 0; +} +EOF +if test "$disable_http" != "yes"; then + HTTP_LIBS="-lcurl -lssl -lcrypto" + if compile_prog "" "$HTTP_LIBS" "curl-new-ssl"; then + output_sym "CONFIG_HAVE_OPAQUE_HMAC_CTX" + http="yes" + LIBS="$HTTP_LIBS $LIBS" + elif mv $TMPC2 $TMPC && compile_prog "" "$HTTP_LIBS" "curl-old-ssl"; then + http="yes" + LIBS="$HTTP_LIBS $LIBS" + fi +fi +print_config "http engine" "$http" + +########################################## +# check for rados +if test "$rados" != "yes" ; then + rados="no" +fi +cat > $TMPC << EOF +#include + +int main(int argc, char **argv) +{ + rados_t cluster; + rados_ioctx_t io_ctx; + const char cluster_name[] = "ceph"; + const char user_name[] = "client.admin"; + const char pool[] = "rados"; + + /* The rados_create2 signature required was only introduced in ceph 0.65 */ + rados_create2(&cluster, cluster_name, user_name, 0); + rados_ioctx_create(cluster, pool, &io_ctx); + + return 0; +} +EOF +if test "$disable_rados" != "yes" && compile_prog "" "-lrados" "rados"; then + LIBS="-lrados $LIBS" + rados="yes" +fi +print_config "Rados engine" "$rados" + +########################################## +# check for rbd +if test "$rbd" != "yes" ; then + rbd="no" +fi +cat > $TMPC << EOF +#include + +int main(int argc, char **argv) +{ + rados_t cluster; + rados_ioctx_t io_ctx; + const char cluster_name[] = "ceph"; + const char user_name[] = "client.admin"; + const char pool[] = "rbd"; + int major, minor, extra; + + rbd_version(&major, &minor, &extra); + /* The rados_create2 signature required was only introduced in ceph 0.65 */ + rados_create2(&cluster, cluster_name, user_name, 0); + 
rados_ioctx_create(cluster, pool, &io_ctx); + + return 0; +} +EOF +if test "$disable_rbd" != "yes" && compile_prog "" "-lrbd -lrados" "rbd"; then + LIBS="-lrbd -lrados $LIBS" + rbd="yes" +fi +print_config "Rados Block Device engine" "$rbd" + +########################################## +# check for rbd_poll +if test "$rbd_poll" != "yes" ; then + rbd_poll="no" +fi +if test "$rbd" = "yes"; then +cat > $TMPC << EOF +#include +#include + +int main(int argc, char **argv) +{ + rbd_image_t image; + rbd_completion_t comp; + + int fd = eventfd(0, EFD_NONBLOCK); + rbd_set_image_notification(image, fd, EVENT_TYPE_EVENTFD); + rbd_poll_io_events(image, comp, 1); + + return 0; +} +EOF +if compile_prog "" "-lrbd -lrados" "rbd"; then + rbd_poll="yes" +fi +print_config "rbd_poll" "$rbd_poll" +fi + +########################################## +# check for rbd_invalidate_cache() +if test "$rbd_inval" != "yes" ; then + rbd_inval="no" +fi +if test "$rbd" = "yes"; then +cat > $TMPC << EOF +#include + +int main(int argc, char **argv) +{ + rbd_image_t image; + + return rbd_invalidate_cache(image); +} +EOF +if compile_prog "" "-lrbd -lrados" "rbd"; then + rbd_inval="yes" +fi +print_config "rbd_invalidate_cache" "$rbd_inval" +fi + +########################################## +# Check whether we have setvbuf +if test "$setvbuf" != "yes" ; then + setvbuf="no" +fi +cat > $TMPC << EOF +#include +int main(int argc, char **argv) +{ + FILE *f = NULL; + char buf[80]; + setvbuf(f, buf, _IOFBF, sizeof(buf)); + return 0; +} +EOF +if compile_prog "" "" "setvbuf"; then + setvbuf="yes" +fi +print_config "setvbuf" "$setvbuf" + +########################################## +# check for gfapi +if test "$gfapi" != "yes" ; then + gfapi="no" +fi +cat > $TMPC << EOF +#include + +int main(int argc, char **argv) +{ + glfs_t *g = glfs_new("foo"); + + return 0; +} +EOF +if test "$disable_gfapi" != "yes" && compile_prog "" "-lgfapi -lglusterfs" "gfapi"; then + LIBS="-lgfapi -lglusterfs $LIBS" + gfapi="yes" +fi +print_config "Gluster API engine" "$gfapi" + +########################################## +# check for gfapi fadvise support, initialize with "no" only if $gfapi is set to "yes" +if test "$gfapi" = "yes" ; then +gf_fadvise="no" +cat > $TMPC << EOF +#include + +int main(int argc, char **argv) +{ + struct glfs_fd *fd; + int ret = glfs_fadvise(fd, 0, 0, 1); + + return 0; +} +EOF +if compile_prog "" "-lgfapi -lglusterfs" "gfapi"; then + gf_fadvise="yes" +fi +print_config "Gluster API use fadvise" "$gf_fadvise" +fi + +########################################## +# check for newer gfapi +if test "$gfapi" = "yes" ; then +gf_new="no" +cat > $TMPC << EOF +#include + +int main(int argc, char **argv) +{ + return glfs_fsync(NULL, NULL, NULL) && glfs_ftruncate(NULL, 0, NULL, NULL); +} +EOF +if compile_prog "" "-lgfapi -lglusterfs" "gf new api"; then + gf_new="yes" +fi +print_config "Gluster new API" "$gf_new" +fi + +########################################## +# check for gfapi trim support +if test "$gf_trim" != "yes" ; then + gf_trim="no" +fi +if test "$gfapi" = "yes" ; then +cat > $TMPC << EOF +#include + +int main(int argc, char **argv) +{ + return glfs_discard_async(NULL, 0, 0); +} +EOF +if compile_prog "" "-lgfapi -lglusterfs" "gf trim"; then + gf_trim="yes" +fi +print_config "Gluster API trim support" "$gf_trim" +fi + +########################################## +# Check if we support stckf on s390 +if test "$s390_z196_facilities" != "yes" ; then + s390_z196_facilities="no" +fi +cat > $TMPC << EOF +#define STFLE_BITS_Z196 45 /* various z196 
facilities ... */ +int main(int argc, char **argv) +{ + /* We want just 1 double word to be returned. */ + register unsigned long reg0 asm("0") = 0; + unsigned long stfle_bits; + asm volatile(".machine push" "\n\t" + ".machine \"z9-109\"" "\n\t" + "stfle %0" "\n\t" + ".machine pop" "\n" + : "=QS" (stfle_bits), "+d" (reg0) + : : "cc"); + + if ((stfle_bits & (1UL << (63 - STFLE_BITS_Z196))) != 0) + return 0; + else + return -1; +} +EOF +if compile_prog "" "" "s390_z196_facilities"; then + $TMPE + if [ $? -eq 0 ]; then + s390_z196_facilities="yes" + fi +fi +print_config "s390_z196_facilities" "$s390_z196_facilities" + +########################################## +# Check if we have required environment variables configured for libhdfs +if test "$libhdfs" = "yes" ; then + hdfs_conf_error=0 + if test "$JAVA_HOME" = "" ; then + echo "configure: JAVA_HOME should be defined to jdk/jvm path" + hdfs_conf_error=1 + fi + if test "$FIO_LIBHDFS_INCLUDE" = "" ; then + echo "configure: FIO_LIBHDFS_INCLUDE should be defined to libhdfs inlude path" + hdfs_conf_error=1 + fi + if test "$FIO_LIBHDFS_LIB" = "" ; then + echo "configure: FIO_LIBHDFS_LIB should be defined to libhdfs library path" + hdfs_conf_error=1 + fi + if test "$hdfs_conf_error" = "1" ; then + exit 1 + fi + FIO_HDFS_CPU=$cpu + if test "$FIO_HDFS_CPU" = "x86_64" ; then + FIO_HDFS_CPU="amd64" + fi +fi +print_config "HDFS engine" "$libhdfs" + +########################################## +# Check whether we have MTD +if test "$mtd" != "yes" ; then + mtd="no" +fi +cat > $TMPC << EOF +#include +#include +#include +int main(int argc, char **argv) +{ + struct mtd_write_req ops; + struct mtd_info_user info; + memset(&ops, 0, sizeof(ops)); + info.type = MTD_MLCNANDFLASH; + return ioctl(0, MEMGETINFO, &info); +} +EOF +if compile_prog "" "" "mtd"; then + mtd="yes" +fi +print_config "MTD" "$mtd" + +########################################## +# Check whether we have libpmem +if test "$libpmem" != "yes" ; then + libpmem="no" +fi +cat > $TMPC << EOF +#include +int main(int argc, char **argv) +{ + int rc; + rc = pmem_is_pmem(0, 0); + return 0; +} +EOF +if compile_prog "" "-lpmem" "libpmem"; then + libpmem="yes" + LIBS="-lpmem $LIBS" +fi +print_config "libpmem" "$libpmem" + +########################################## +# Check whether we have libpmemblk +# libpmem is a prerequisite +if test "$libpmemblk" != "yes" ; then + libpmemblk="no" +fi +if test "$libpmem" = "yes"; then + cat > $TMPC << EOF +#include +int main(int argc, char **argv) +{ + PMEMblkpool *pbp; + pbp = pmemblk_open("", 0); + return 0; +} +EOF + if compile_prog "" "-lpmemblk" "libpmemblk"; then + libpmemblk="yes" + LIBS="-lpmemblk $LIBS" + fi +fi +print_config "libpmemblk" "$libpmemblk" + +# Choose the ioengines +if test "$libpmem" = "yes" && test "$disable_pmem" = "no"; then + pmem="yes" + devdax="yes" + if test "$libpmemblk" = "yes"; then + pmemblk="yes" + fi +fi + +########################################## +# Report whether pmemblk engine is enabled +print_config "PMDK pmemblk engine" "$pmemblk" + +########################################## +# Report whether dev-dax engine is enabled +print_config "PMDK dev-dax engine" "$devdax" + +########################################## +# Report whether libpmem engine is enabled +print_config "PMDK libpmem engine" "$pmem" + +########################################## +# Check whether we support DDN's IME +if test "$libime" != "yes" ; then + libime="no" +fi +cat > $TMPC << EOF +#include +int main(int argc, char **argv) +{ + int rc; + ime_native_init(); + rc 
= ime_native_finalize(); + return 0; +} +EOF +if compile_prog "-I${ime_path}/include" "-L${ime_path}/lib -lim_client" "libime"; then + libime="yes" + CFLAGS="-I${ime_path}/include $CFLAGS" + LDFLAGS="-Wl,-rpath ${ime_path}/lib -L${ime_path}/lib $LDFLAGS" + LIBS="-lim_client $LIBS" +fi +print_config "DDN's Infinite Memory Engine" "$libime" + +########################################## +# Check if we have required environment variables configured for libiscsi +if test "$libiscsi" = "yes" ; then + if $(pkg-config --atleast-version=1.9.0 libiscsi); then + libiscsi="yes" + libiscsi_cflags=$(pkg-config --cflags libiscsi) + libiscsi_libs=$(pkg-config --libs libiscsi) + else + if test "$libiscsi" = "yes" ; then + echo "libiscsi" "Install libiscsi >= 1.9.0" + fi + libiscsi="no" + fi +fi +print_config "iscsi engine" "$libiscsi" + +########################################## +# Check if we have libnbd (for NBD support). +minimum_libnbd=0.9.8 +if test "$libnbd" = "yes" ; then + if $(pkg-config --atleast-version=$minimum_libnbd libnbd); then + libnbd="yes" + libnbd_cflags=$(pkg-config --cflags libnbd) + libnbd_libs=$(pkg-config --libs libnbd) + else + if test "$libnbd" = "yes" ; then + echo "libnbd" "Install libnbd >= $minimum_libnbd" + fi + libnbd="no" + fi +fi +print_config "NBD engine" "$libnbd" + +########################################## +# Check if we have lex/yacc available +yacc="no" +yacc_is_bison="no" +lex="no" +arith="no" +if test "$disable_lex" = "no" || test -z "$disable_lex" ; then +if test "$targetos" != "SunOS" ; then +LEX=$(which lex 2> /dev/null) +if test -x "$LEX" ; then + lex="yes" +fi +YACC=$(which bison 2> /dev/null) +if test -x "$YACC" ; then + yacc="yes" + yacc_is_bison="yes" +else + YACC=$(which yacc 2> /dev/null) + if test -x "$YACC" ; then + yacc="yes" + fi +fi +if test "$yacc" = "yes" && test "$lex" = "yes" ; then + arith="yes" +fi + +if test "$arith" = "yes" ; then +cat > $TMPC << EOF +extern int yywrap(void); + +int main(int argc, char **argv) +{ + yywrap(); + return 0; +} +EOF +if compile_prog "" "-ll" "lex"; then + LIBS="-ll $LIBS" +else + arith="no" +fi +fi +fi +fi + +# Check if lex fails using -o +if test "$arith" = "yes" ; then +if test "$force_no_lex_o" = "yes" ; then + lex_use_o="no" +else +$LEX -o lex.yy.c exp/expression-parser.l 2> /dev/null +if test "$?" = "0" ; then + lex_use_o="yes" +else + lex_use_o="no" +fi +fi +fi + +print_config "lex/yacc for arithmetic" "$arith" + +########################################## +# Check whether we have setmntent/getmntent +if test "$getmntent" != "yes" ; then + getmntent="no" +fi +cat > $TMPC << EOF +#include +#include +int main(int argc, char **argv) +{ + FILE *mtab = setmntent(NULL, "r"); + struct mntent *mnt = getmntent(mtab); + endmntent(mtab); + return 0; +} +EOF +if compile_prog "" "" "getmntent"; then + getmntent="yes" +fi +print_config "getmntent" "$getmntent" + +########################################## +# Check whether we have getmntinfo +# These are originally added for BSDs, but may also work +# on other operating systems with getmntinfo(3). + +# getmntinfo(3) for FreeBSD/DragonFlyBSD/OpenBSD. +# Note that NetBSD needs -Werror to catch warning as error. +if test "$getmntinfo" != "yes" ; then + getmntinfo="no" +fi +cat > $TMPC << EOF +#include +#include +#include +int main(int argc, char **argv) +{ + struct statfs *st; + return getmntinfo(&st, MNT_NOWAIT); +} +EOF +if compile_prog "-Werror" "" "getmntinfo"; then + getmntinfo="yes" +fi +print_config "getmntinfo" "$getmntinfo" + +# getmntinfo(3) for NetBSD. 
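The setmntent/getmntent probe above only checks that the interface compiles; for context, this is what actually using it looks like. A minimal sketch that walks the mount table and prints each entry; the /etc/mtab path and the glibc behaviour are assumptions:

#include <mntent.h>
#include <stdio.h>

int main(void)
{
	FILE *mtab = setmntent("/etc/mtab", "r");
	struct mntent *mnt;

	if (!mtab)
		return 1;
	while ((mnt = getmntent(mtab)) != NULL)
		printf("%s on %s type %s\n", mnt->mnt_fsname,
		       mnt->mnt_dir, mnt->mnt_type);
	endmntent(mtab);
	return 0;
}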
+if test "$getmntinfo_statvfs" != "yes" ; then + getmntinfo_statvfs="no" +fi +cat > $TMPC << EOF +#include +#include +int main(int argc, char **argv) +{ + struct statvfs *st; + return getmntinfo(&st, MNT_NOWAIT); +} +EOF +# Skip the test if the one with statfs arg is detected. +if test "$getmntinfo" != "yes" && compile_prog "-Werror" "" "getmntinfo_statvfs"; then + getmntinfo_statvfs="yes" + print_config "getmntinfo_statvfs" "$getmntinfo_statvfs" +fi + +########################################## +# Check whether we have _Static_assert +if test "$static_assert" != "yes" ; then + static_assert="no" +fi +cat > $TMPC << EOF +#include +#include +#include + +struct foo { + int a, b; +}; + +int main(int argc, char **argv) +{ + _Static_assert(offsetof(struct foo, a) == 0 , "Check"); + return 0 ; +} +EOF +if compile_prog "" "" "static_assert"; then + static_assert="yes" +fi +print_config "Static Assert" "$static_assert" + +########################################## +# Check whether we have bool / stdbool.h +if test "$have_bool" != "yes" ; then + have_bool="no" +fi +cat > $TMPC << EOF +#include +int main(int argc, char **argv) +{ + bool var = true; + return var != false; +} +EOF +if compile_prog "" "" "bool"; then + have_bool="yes" +fi +print_config "bool" "$have_bool" + +########################################## +# Check whether we have strndup() +strndup="no" +cat > $TMPC << EOF +#include +#include +int main(int argc, char **argv) +{ + char *res = strndup("test string", 8); + + free(res); + return 0; +} +EOF +if compile_prog "" "" "strndup"; then + strndup="yes" +fi +print_config "strndup" "$strndup" + +########################################## +# probe +# Note: presence of implies that is +# also available but not the other way around. +if test "$valgrind_dev" != "yes" ; then + valgrind_dev="no" +fi +cat > $TMPC << EOF +#include +int main(int argc, char **argv) +{ + return 0; +} +EOF +if compile_prog "" "" "valgrind_dev"; then + valgrind_dev="yes" +fi +print_config "Valgrind headers" "$valgrind_dev" + +########################################## +# probe +if test "$linux_blkzoned" != "yes" ; then + linux_blkzoned="no" +fi +cat > $TMPC << EOF +#include +int main(int argc, char **argv) +{ + return 0; +} +EOF +if compile_prog "" "" "linux_blkzoned"; then + linux_blkzoned="yes" +fi +print_config "Zoned block device support" "$linux_blkzoned" + +########################################## +# check march=armv8-a+crc+crypto +if test "$march_armv8_a_crc_crypto" != "yes" ; then + march_armv8_a_crc_crypto="no" +fi +if test "$cpu" = "arm64" ; then + cat > $TMPC < +#include +#include + +int main(void) +{ + /* Can we also do a runtime probe? 
*/
+#if __linux__
+	return getauxval(AT_HWCAP);
+#else
+# error "Don't know how to do runtime probe for ARM CRC32c"
+#endif
+}
+EOF
+  if compile_prog "-march=armv8-a+crc+crypto" "" "ARM CRC32c"; then
+    march_armv8_a_crc_crypto="yes"
+    CFLAGS="$CFLAGS -march=armv8-a+crc+crypto"
+    march_set="yes"
+  fi
+fi
+print_config "march_armv8_a_crc_crypto" "$march_armv8_a_crc_crypto"
+
+##########################################
+# cuda probe
+if test "$cuda" != "yes" ; then
+  cuda="no"
+fi
+cat > $TMPC << EOF
+#include <cuda.h>
+int main(int argc, char **argv)
+{
+	return cuInit(0);
+}
+EOF
+if test "$enable_cuda" = "yes" && compile_prog "" "-lcuda" "cuda"; then
+  cuda="yes"
+  LIBS="-lcuda $LIBS"
+fi
+print_config "cuda" "$cuda"
+
+##########################################
+# check for cc -march=native
+build_native="no"
+cat > $TMPC << EOF
+int main(int argc, char **argv)
+{
+	return 0;
+}
+EOF
+if test "$disable_native" = "no" && test "$disable_opt" != "yes" && \
+	compile_prog "-march=native" "" "march=native"; then
+  build_native="yes"
+fi
+print_config "Build march=native" "$build_native"
+
+##########################################
+# check for -lcunit
+if test "$cunit" != "yes" ; then
+  cunit="no"
+fi
+cat > $TMPC << EOF
+#include <CUnit/CUnit.h>
+#include <CUnit/Basic.h>
+int main(void)
+{
+	if (CU_initialize_registry() != CUE_SUCCESS)
+		return CU_get_error();
+	CU_basic_set_mode(CU_BRM_VERBOSE);
+	CU_basic_run_tests();
+	CU_cleanup_registry();
+	return CU_get_error();
+}
+EOF
+if compile_prog "" "-lcunit" "CUnit"; then
+  cunit="yes"
+fi
+print_config "CUnit" "$cunit"
+
+##########################################
+# check for __kernel_rwf_t
+__kernel_rwf_t="no"
+cat > $TMPC << EOF
+#include <linux/fs.h>
+int main(int argc, char **argv)
+{
+	__kernel_rwf_t x;
+	x = 0;
+	return x;
+}
+EOF
+if compile_prog "" "" "__kernel_rwf_t"; then
+  __kernel_rwf_t="yes"
+fi
+print_config "__kernel_rwf_t" "$__kernel_rwf_t"
+
+##########################################
+# check if gcc has -Wimplicit-fallthrough
+fallthrough="no"
+cat > $TMPC << EOF
+int main(int argc, char **argv)
+{
+	return 0;
+}
+EOF
+if compile_prog "-Wimplicit-fallthrough" "" "-Wimplicit-fallthrough"; then
+  fallthrough="yes"
+fi
+print_config "-Wimplicit-fallthrough" "$fallthrough"
+
+##########################################
+# check for MADV_HUGEPAGE support
+if test "$thp" != "yes" ; then
+  thp="no"
+fi
+if test "$esx" != "yes" ; then
+  cat > $TMPC << EOF
+#include <sys/mman.h>
+int main(void)
+{
+	return madvise(0, 0x1000, MADV_HUGEPAGE);
+}
+EOF
+  if compile_prog "" "" "thp" ; then
+    thp=yes
+  else
+    if test "$thp" = "yes" ; then
+      feature_not_found "Transparent Huge Page" ""
+    fi
+    thp=no
+  fi
+fi
+print_config "MADV_HUGEPAGE" "$thp"
+
+##########################################
+# check for gettid()
+gettid="no"
+cat > $TMPC << EOF
+#include <unistd.h>
+int main(int argc, char **argv)
+{
+	return gettid();
+}
+EOF
+if compile_prog "" "" "gettid"; then
+  gettid="yes"
+fi
+print_config "gettid" "$gettid"
+
+##########################################
+# check for statx(2) support by libc
+statx="no"
+cat > $TMPC << EOF
+#include <unistd.h>
+#include <sys/stat.h>
+
+int main(int argc, char **argv)
+{
+	struct statx st;
+	return statx(-1, *argv, 0, 0, &st);
+}
+EOF
+if compile_prog "" "" "statx"; then
+  statx="yes"
+fi
+print_config "statx(2)/libc" "$statx"
+
+##########################################
+# check for statx(2) support by kernel
+statx_syscall="no"
+cat > $TMPC << EOF
+#include <unistd.h>
+#include <sys/syscall.h>
+#include <sys/stat.h>
+#include <linux/stat.h>
+
+static int _statx(int dfd, const char *pathname, int flags, unsigned int mask,
+		  struct statx *buffer)
+{
+	return 
syscall(__NR_statx, dfd, pathname, flags, mask, buffer); +} + +int main(int argc, char **argv) +{ + struct statx st; + return _statx(-1, *argv, 0, 0, &st); +} +EOF +if compile_prog "" "" "statx_syscall"; then + statx_syscall="yes" +fi +print_config "statx(2)/syscall" "$statx_syscall" + +############################################################################# + +if test "$wordsize" = "64" ; then + output_sym "CONFIG_64BIT" +elif test "$wordsize" = "32" ; then + output_sym "CONFIG_32BIT" +else + fatal "Unknown wordsize!" +fi +if test "$bigendian" = "yes" ; then + output_sym "CONFIG_BIG_ENDIAN" +else + output_sym "CONFIG_LITTLE_ENDIAN" +fi +if test "$zlib" = "yes" ; then + output_sym "CONFIG_ZLIB" +fi +if test "$libaio" = "yes" ; then + output_sym "CONFIG_LIBAIO" + if test "$libaio_uring" = "yes" ; then + output_sym "CONFIG_LIBAIO_URING" + fi +fi +if test "$posix_aio" = "yes" ; then + output_sym "CONFIG_POSIXAIO" +fi +if test "$posix_aio_fsync" = "yes" ; then + output_sym "CONFIG_POSIXAIO_FSYNC" +fi +if test "$posix_pshared" = "yes" ; then + output_sym "CONFIG_PSHARED" +fi +if test "$pthread_condattr_setclock" = "yes" ; then + output_sym "CONFIG_PTHREAD_CONDATTR_SETCLOCK" +fi +if test "$pthread_sigmask" = "yes" ; then + output_sym "CONFIG_PTHREAD_SIGMASK" +fi +if test "$have_asprintf" = "yes" ; then + output_sym "CONFIG_HAVE_ASPRINTF" +fi +if test "$have_vasprintf" = "yes" ; then + output_sym "CONFIG_HAVE_VASPRINTF" +fi +if test "$linux_fallocate" = "yes" ; then + output_sym "CONFIG_LINUX_FALLOCATE" +fi +if test "$posix_fallocate" = "yes" ; then + output_sym "CONFIG_POSIX_FALLOCATE" +fi +if test "$fdatasync" = "yes" ; then + output_sym "CONFIG_FDATASYNC" +fi +if test "$pipe" = "yes" ; then + output_sym "CONFIG_PIPE" +fi +if test "$pipe2" = "yes" ; then + output_sym "CONFIG_PIPE2" +fi +if test "$pread" = "yes" ; then + output_sym "CONFIG_PREAD" +fi +if test "$sync_file_range" = "yes" ; then + output_sym "CONFIG_SYNC_FILE_RANGE" +fi +if test "$sfaa" = "yes" ; then + output_sym "CONFIG_SFAA" +fi +if test "$sync_sync" = "yes" ; then + output_sym "CONFIG_SYNC_SYNC" +fi +if test "$cmp_swap" = "yes" ; then + output_sym "CONFIG_CMP_SWAP" +fi +if test "$libverbs" = "yes" -a "$rdmacm" = "yes" ; then + output_sym "CONFIG_RDMA" +fi +if test "$clock_gettime" = "yes" ; then + output_sym "CONFIG_CLOCK_GETTIME" +fi +if test "$clock_monotonic" = "yes" ; then + output_sym "CONFIG_CLOCK_MONOTONIC" +fi +if test "$clock_monotonic_raw" = "yes" ; then + output_sym "CONFIG_CLOCK_MONOTONIC_RAW" +fi +if test "$clock_monotonic_precise" = "yes" ; then + output_sym "CONFIG_CLOCK_MONOTONIC_PRECISE" +fi +if test "$clockid_t" = "yes"; then + output_sym "CONFIG_CLOCKID_T" +fi +if test "$gettimeofday" = "yes" ; then + output_sym "CONFIG_GETTIMEOFDAY" +fi +if test "$posix_fadvise" = "yes" ; then + output_sym "CONFIG_POSIX_FADVISE" +fi +if test "$linux_3arg_affinity" = "yes" ; then + output_sym "CONFIG_3ARG_AFFINITY" +elif test "$linux_2arg_affinity" = "yes" ; then + output_sym "CONFIG_2ARG_AFFINITY" +fi +if test "$strsep" = "yes" ; then + output_sym "CONFIG_STRSEP" +fi +if test "$strcasestr" = "yes" ; then + output_sym "CONFIG_STRCASESTR" +fi +if test "$strlcat" = "yes" ; then + output_sym "CONFIG_STRLCAT" +fi +if test "$getopt_long_only" = "yes" ; then + output_sym "CONFIG_GETOPT_LONG_ONLY" +fi +if test "$inet_aton" = "yes" ; then + output_sym "CONFIG_INET_ATON" +fi +if test "$socklen_t" = "yes" ; then + output_sym "CONFIG_SOCKLEN_T" +fi +if test "$ext4_me" = "yes" ; then + output_sym "CONFIG_LINUX_EXT4_MOVE_EXTENT" +fi 
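+# (output_sym, defined earlier in this script, records each probe result
+# twice: "SYMBOL=y" goes into $config_host_mak for make, and
+# "#define SYMBOL" goes into the generated config header for the compiler.)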
+if test "$linux_splice" = "yes" ; then + output_sym "CONFIG_LINUX_SPLICE" +fi +if test "$guasi" = "yes" ; then + output_sym "CONFIG_GUASI" +fi +if test "$libnuma_v2" = "yes" ; then + output_sym "CONFIG_LIBNUMA" +fi +if test "$solaris_aio" = "yes" ; then + output_sym "CONFIG_SOLARISAIO" +fi +if test "$tls_thread" = "yes" ; then + output_sym "CONFIG_TLS_THREAD" +fi +if test "$rusage_thread" = "yes" ; then + output_sym "CONFIG_RUSAGE_THREAD" +fi +if test "$gfio" = "yes" ; then + output_sym "CONFIG_GFIO" +fi +if test "$esx" = "yes" ; then + output_sym "CONFIG_ESX" + output_sym "CONFIG_NO_SHM" +fi +if test "$sched_idle" = "yes" ; then + output_sym "CONFIG_SCHED_IDLE" +fi +if test "$tcp_nodelay" = "yes" ; then + output_sym "CONFIG_TCP_NODELAY" +fi +if test "$window_size" = "yes" ; then + output_sym "CONFIG_NET_WINDOWSIZE" +fi +if test "$mss" = "yes" ; then + output_sym "CONFIG_NET_MSS" +fi +if test "$rlimit_memlock" = "yes" ; then + output_sym "CONFIG_RLIMIT_MEMLOCK" +fi +if test "$pwritev" = "yes" ; then + output_sym "CONFIG_PWRITEV" +fi +if test "$pwritev2" = "yes" ; then + output_sym "CONFIG_PWRITEV2" +fi +if test "$ipv6" = "yes" ; then + output_sym "CONFIG_IPV6" +fi +if test "$http" = "yes" ; then + output_sym "CONFIG_HTTP" +fi +if test "$rados" = "yes" ; then + output_sym "CONFIG_RADOS" +fi +if test "$rbd" = "yes" ; then + output_sym "CONFIG_RBD" +fi +if test "$rbd_poll" = "yes" ; then + output_sym "CONFIG_RBD_POLL" +fi +if test "$rbd_inval" = "yes" ; then + output_sym "CONFIG_RBD_INVAL" +fi +if test "$setvbuf" = "yes" ; then + output_sym "CONFIG_SETVBUF" +fi +if test "$s390_z196_facilities" = "yes" ; then + output_sym "CONFIG_S390_Z196_FACILITIES" + CFLAGS="$CFLAGS -march=z9-109" + march_set="yes" +fi +if test "$gfapi" = "yes" ; then + output_sym "CONFIG_GFAPI" +fi +if test "$gf_fadvise" = "yes" ; then + output_sym "CONFIG_GF_FADVISE" +fi +if test "$gf_trim" = "yes" ; then + output_sym "CONFIG_GF_TRIM" +fi +if test "$gf_new" = "yes" ; then + output_sym "CONFIG_GF_NEW_API" +fi +if test "$libhdfs" = "yes" ; then + output_sym "CONFIG_LIBHDFS" + echo "FIO_HDFS_CPU=$FIO_HDFS_CPU" >> $config_host_mak + echo "JAVA_HOME=$JAVA_HOME" >> $config_host_mak + echo "FIO_LIBHDFS_INCLUDE=$FIO_LIBHDFS_INCLUDE" >> $config_host_mak + echo "FIO_LIBHDFS_LIB=$FIO_LIBHDFS_LIB" >> $config_host_mak +fi +if test "$mtd" = "yes" ; then + output_sym "CONFIG_MTD" +fi +if test "$pmemblk" = "yes" ; then + output_sym "CONFIG_PMEMBLK" +fi +if test "$devdax" = "yes" ; then + output_sym "CONFIG_LINUX_DEVDAX" +fi +if test "$pmem" = "yes" ; then + output_sym "CONFIG_LIBPMEM" +fi +if test "$libime" = "yes" ; then + output_sym "CONFIG_IME" +fi +if test "$arith" = "yes" ; then + output_sym "CONFIG_ARITHMETIC" + if test "$yacc_is_bison" = "yes" ; then + echo "YACC=$YACC -y" >> $config_host_mak + else + echo "YACC=$YACC" >> $config_host_mak + fi + if test "$lex_use_o" = "yes" ; then + echo "CONFIG_LEX_USE_O=y" >> $config_host_mak + fi +fi +if test "$getmntent" = "yes" ; then + output_sym "CONFIG_GETMNTENT" +fi +if test "$getmntinfo" = "yes" ; then + output_sym "CONFIG_GETMNTINFO" +fi +if test "$getmntinfo_statvfs" = "yes" ; then + output_sym "CONFIG_GETMNTINFO_STATVFS" +fi +if test "$static_assert" = "yes" ; then + output_sym "CONFIG_STATIC_ASSERT" +fi +if test "$have_bool" = "yes" ; then + output_sym "CONFIG_HAVE_BOOL" +fi +if test "$strndup" = "yes" ; then + output_sym "CONFIG_HAVE_STRNDUP" +fi +if test "$disable_opt" = "yes" ; then + output_sym "CONFIG_DISABLE_OPTIMIZATIONS" +fi +if test "$valgrind_dev" = "yes"; then + 
output_sym "CONFIG_VALGRIND_DEV" +fi +if test "$linux_blkzoned" = "yes" ; then + output_sym "CONFIG_LINUX_BLKZONED" +fi +if test "$zlib" = "no" ; then + echo "Consider installing zlib-dev (zlib-devel, some fio features depend on it." + if test "$build_static" = "yes"; then + echo "Note that some distros have separate packages for static libraries." + fi +fi +if test "$march_armv8_a_crc_crypto" = "yes" ; then + output_sym "ARCH_HAVE_CRC_CRYPTO" +fi +if test "$cuda" = "yes" ; then + output_sym "CONFIG_CUDA" +fi +if test "$march_set" = "no" && test "$build_native" = "yes" ; then + output_sym "CONFIG_BUILD_NATIVE" +fi +if test "$cunit" = "yes" ; then + output_sym "CONFIG_HAVE_CUNIT" +fi +if test "$__kernel_rwf_t" = "yes"; then + output_sym "CONFIG_HAVE_KERNEL_RWF_T" +fi +if test "$gettid" = "yes"; then + output_sym "CONFIG_HAVE_GETTID" +fi +if test "$statx" = "yes"; then + output_sym "CONFIG_HAVE_STATX" +fi +if test "$statx_syscall" = "yes"; then + output_sym "CONFIG_HAVE_STATX_SYSCALL" +fi +if test "$fallthrough" = "yes"; then + CFLAGS="$CFLAGS -Wimplicit-fallthrough" +fi +if test "$thp" = "yes" ; then + output_sym "CONFIG_HAVE_THP" +fi +if test "$libiscsi" = "yes" ; then + output_sym "CONFIG_LIBISCSI" + echo "CONFIG_LIBISCSI=m" >> $config_host_mak + echo "LIBISCSI_CFLAGS=$libiscsi_cflags" >> $config_host_mak + echo "LIBISCSI_LIBS=$libiscsi_libs" >> $config_host_mak +fi +if test "$libnbd" = "yes" ; then + output_sym "CONFIG_LIBNBD" + echo "CONFIG_LIBNBD=m" >> $config_host_mak + echo "LIBNBD_CFLAGS=$libnbd_cflags" >> $config_host_mak + echo "LIBNBD_LIBS=$libnbd_libs" >> $config_host_mak +fi +cat > $TMPC << EOF +int main(int argc, char **argv) +{ + return 0; +} +EOF +if test "$disable_tcmalloc" != "yes" && compile_prog "" "-ltcmalloc" "tcmalloc"; then + LIBS="-ltcmalloc $LIBS" + tcmalloc="yes" +else + tcmalloc="no" +fi +print_config "TCMalloc support" "$tcmalloc" + +echo "LIBS+=$LIBS" >> $config_host_mak +echo "GFIO_LIBS+=$GFIO_LIBS" >> $config_host_mak +echo "CFLAGS+=$CFLAGS" >> $config_host_mak +echo "LDFLAGS+=$LDFLAGS" >> $config_host_mak +echo "CC=$cc" >> $config_host_mak +echo "BUILD_CFLAGS=$BUILD_CFLAGS $CFLAGS" >> $config_host_mak +echo "INSTALL_PREFIX=$prefix" >> $config_host_mak + +if [ `dirname $0` != "." -a ! -e Makefile ]; then + cat > Makefile < + * + * This source code is licensed under the GNU General Public License, + * Version 2. See the file COPYING for more details. + */ + +#ifndef __CRC16_H +#define __CRC16_H + +extern unsigned short const crc16_table[256]; + +extern unsigned short fio_crc16(const void *buffer, unsigned int len); + +static inline unsigned short crc16_byte(unsigned short crc, + const unsigned char data) +{ + return (crc >> 8) ^ crc16_table[(crc ^ data) & 0xff]; +} + +#endif /* __CRC16_H */ diff --git a/crc/crc32.c b/crc/crc32.c new file mode 100644 index 0000000..e35f5d9 --- /dev/null +++ b/crc/crc32.c @@ -0,0 +1,84 @@ +/* crc32 -- calculate and POSIX.2 checksum + Copyright (C) 92, 1995-1999 Free Software Foundation, Inc. + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2, or (at your option) + any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. 
+ + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software Foundation, + Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. */ + +#include "crc32.h" + +static const uint32_t crctab[256] = { + 0x0, + 0x04C11DB7, 0x09823B6E, 0x0D4326D9, 0x130476DC, 0x17C56B6B, + 0x1A864DB2, 0x1E475005, 0x2608EDB8, 0x22C9F00F, 0x2F8AD6D6, + 0x2B4BCB61, 0x350C9B64, 0x31CD86D3, 0x3C8EA00A, 0x384FBDBD, + 0x4C11DB70, 0x48D0C6C7, 0x4593E01E, 0x4152FDA9, 0x5F15ADAC, + 0x5BD4B01B, 0x569796C2, 0x52568B75, 0x6A1936C8, 0x6ED82B7F, + 0x639B0DA6, 0x675A1011, 0x791D4014, 0x7DDC5DA3, 0x709F7B7A, + 0x745E66CD, 0x9823B6E0, 0x9CE2AB57, 0x91A18D8E, 0x95609039, + 0x8B27C03C, 0x8FE6DD8B, 0x82A5FB52, 0x8664E6E5, 0xBE2B5B58, + 0xBAEA46EF, 0xB7A96036, 0xB3687D81, 0xAD2F2D84, 0xA9EE3033, + 0xA4AD16EA, 0xA06C0B5D, 0xD4326D90, 0xD0F37027, 0xDDB056FE, + 0xD9714B49, 0xC7361B4C, 0xC3F706FB, 0xCEB42022, 0xCA753D95, + 0xF23A8028, 0xF6FB9D9F, 0xFBB8BB46, 0xFF79A6F1, 0xE13EF6F4, + 0xE5FFEB43, 0xE8BCCD9A, 0xEC7DD02D, 0x34867077, 0x30476DC0, + 0x3D044B19, 0x39C556AE, 0x278206AB, 0x23431B1C, 0x2E003DC5, + 0x2AC12072, 0x128E9DCF, 0x164F8078, 0x1B0CA6A1, 0x1FCDBB16, + 0x018AEB13, 0x054BF6A4, 0x0808D07D, 0x0CC9CDCA, 0x7897AB07, + 0x7C56B6B0, 0x71159069, 0x75D48DDE, 0x6B93DDDB, 0x6F52C06C, + 0x6211E6B5, 0x66D0FB02, 0x5E9F46BF, 0x5A5E5B08, 0x571D7DD1, + 0x53DC6066, 0x4D9B3063, 0x495A2DD4, 0x44190B0D, 0x40D816BA, + 0xACA5C697, 0xA864DB20, 0xA527FDF9, 0xA1E6E04E, 0xBFA1B04B, + 0xBB60ADFC, 0xB6238B25, 0xB2E29692, 0x8AAD2B2F, 0x8E6C3698, + 0x832F1041, 0x87EE0DF6, 0x99A95DF3, 0x9D684044, 0x902B669D, + 0x94EA7B2A, 0xE0B41DE7, 0xE4750050, 0xE9362689, 0xEDF73B3E, + 0xF3B06B3B, 0xF771768C, 0xFA325055, 0xFEF34DE2, 0xC6BCF05F, + 0xC27DEDE8, 0xCF3ECB31, 0xCBFFD686, 0xD5B88683, 0xD1799B34, + 0xDC3ABDED, 0xD8FBA05A, 0x690CE0EE, 0x6DCDFD59, 0x608EDB80, + 0x644FC637, 0x7A089632, 0x7EC98B85, 0x738AAD5C, 0x774BB0EB, + 0x4F040D56, 0x4BC510E1, 0x46863638, 0x42472B8F, 0x5C007B8A, + 0x58C1663D, 0x558240E4, 0x51435D53, 0x251D3B9E, 0x21DC2629, + 0x2C9F00F0, 0x285E1D47, 0x36194D42, 0x32D850F5, 0x3F9B762C, + 0x3B5A6B9B, 0x0315D626, 0x07D4CB91, 0x0A97ED48, 0x0E56F0FF, + 0x1011A0FA, 0x14D0BD4D, 0x19939B94, 0x1D528623, 0xF12F560E, + 0xF5EE4BB9, 0xF8AD6D60, 0xFC6C70D7, 0xE22B20D2, 0xE6EA3D65, + 0xEBA91BBC, 0xEF68060B, 0xD727BBB6, 0xD3E6A601, 0xDEA580D8, + 0xDA649D6F, 0xC423CD6A, 0xC0E2D0DD, 0xCDA1F604, 0xC960EBB3, + 0xBD3E8D7E, 0xB9FF90C9, 0xB4BCB610, 0xB07DABA7, 0xAE3AFBA2, + 0xAAFBE615, 0xA7B8C0CC, 0xA379DD7B, 0x9B3660C6, 0x9FF77D71, + 0x92B45BA8, 0x9675461F, 0x8832161A, 0x8CF30BAD, 0x81B02D74, + 0x857130C3, 0x5D8A9099, 0x594B8D2E, 0x5408ABF7, 0x50C9B640, + 0x4E8EE645, 0x4A4FFBF2, 0x470CDD2B, 0x43CDC09C, 0x7B827D21, + 0x7F436096, 0x7200464F, 0x76C15BF8, 0x68860BFD, 0x6C47164A, + 0x61043093, 0x65C52D24, 0x119B4BE9, 0x155A565E, 0x18197087, + 0x1CD86D30, 0x029F3D35, 0x065E2082, 0x0B1D065B, 0x0FDC1BEC, + 0x3793A651, 0x3352BBE6, 0x3E119D3F, 0x3AD08088, 0x2497D08D, + 0x2056CD3A, 0x2D15EBE3, 0x29D4F654, 0xC5A92679, 0xC1683BCE, + 0xCC2B1D17, 0xC8EA00A0, 0xD6AD50A5, 0xD26C4D12, 0xDF2F6BCB, + 0xDBEE767C, 0xE3A1CBC1, 0xE760D676, 0xEA23F0AF, 0xEEE2ED18, + 0xF0A5BD1D, 0xF464A0AA, 0xF9278673, 0xFDE69BC4, 0x89B8FD09, + 0x8D79E0BE, 0x803AC667, 0x84FBDBD0, 0x9ABC8BD5, 0x9E7D9662, + 0x933EB0BB, 0x97FFAD0C, 0xAFB010B1, 0xAB710D06, 0xA6322BDF, + 0xA2F33668, 0xBCB4666D, 0xB8757BDA, 0xB5365D03, 0xB1F740B4 +}; + +uint32_t fio_crc32(const void *buffer, unsigned long length) +{ + const unsigned char *cp = (const 
unsigned char *) buffer;
+	uint32_t crc = 0;
+
+	while (length--)
+		crc = (crc << 8) ^ crctab[((crc >> 24) ^ *(cp++)) & 0xFF];
+
+	return crc;
+}
diff --git a/crc/crc32.h b/crc/crc32.h
new file mode 100644
index 0000000..6378e81
--- /dev/null
+++ b/crc/crc32.h
@@ -0,0 +1,25 @@
+/* crc32 -- calculate the POSIX.2 checksum
+   Copyright (C) 92, 1995-1999 Free Software Foundation, Inc.
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 2, or (at your option)
+   any later version.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, write to the Free Software Foundation,
+   Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. */
+
+#ifndef CRC32_H
+#define CRC32_H
+
+#include <inttypes.h>
+
+extern uint32_t fio_crc32(const void * const, unsigned long);
+
+#endif
diff --git a/crc/crc32c-arm64.c b/crc/crc32c-arm64.c
new file mode 100644
index 0000000..11bfe5d
--- /dev/null
+++ b/crc/crc32c-arm64.c
@@ -0,0 +1,107 @@
+#include "crc32c.h"
+#include "../os/os.h"
+
+bool crc32c_arm64_available = false;
+
+#ifdef ARCH_HAVE_CRC_CRYPTO
+
+#define CRC32C3X8(ITR) \
+	crc1 = __crc32cd(crc1, *((const uint64_t *)data + 42*1 + (ITR)));\
+	crc2 = __crc32cd(crc2, *((const uint64_t *)data + 42*2 + (ITR)));\
+	crc0 = __crc32cd(crc0, *((const uint64_t *)data + 42*0 + (ITR)));
+
+#define CRC32C7X3X8(ITR) do {\
+	CRC32C3X8((ITR)*7+0) \
+	CRC32C3X8((ITR)*7+1) \
+	CRC32C3X8((ITR)*7+2) \
+	CRC32C3X8((ITR)*7+3) \
+	CRC32C3X8((ITR)*7+4) \
+	CRC32C3X8((ITR)*7+5) \
+	CRC32C3X8((ITR)*7+6) \
+	} while(0)
+
+#include <arm_acle.h>
+#include <arm_neon.h>
+
+static bool crc32c_probed;
+
+/*
+ * Function to calculate reflected crc with PMULL Instruction
+ * crc done "by 3" for fixed input block size of 1024 bytes
+ */
+uint32_t crc32c_arm64(unsigned char const *data, unsigned long length)
+{
+	signed long len = length;
+	uint32_t crc = ~0;
+	uint32_t crc0, crc1, crc2;
+
+	/* Load two consts: K1 and K2 */
+	const poly64_t k1 = 0xe417f38a, k2 = 0x8f158014;
+	uint64_t t0, t1;
+
+	while ((len -= 1024) >= 0) {
+		/* Do first 8 bytes here for better pipelining */
+		crc0 = __crc32cd(crc, *(const uint64_t *)data);
+		crc1 = 0;
+		crc2 = 0;
+		data += sizeof(uint64_t);
+
+		/* Process block inline
+		   Process crc0 last to avoid dependency with above */
+		CRC32C7X3X8(0);
+		CRC32C7X3X8(1);
+		CRC32C7X3X8(2);
+		CRC32C7X3X8(3);
+		CRC32C7X3X8(4);
+		CRC32C7X3X8(5);
+
+		data += 42*3*sizeof(uint64_t);
+
+		/* Merge crc0 and crc1 into crc2
+		   crc1 multiply by K2
+		   crc0 multiply by K1 */
+
+		t1 = (uint64_t)vmull_p64(crc1, k2);
+		t0 = (uint64_t)vmull_p64(crc0, k1);
+		crc = __crc32cd(crc2, *(const uint64_t *)data);
+		crc1 = __crc32cd(0, t1);
+		crc ^= crc1;
+		crc0 = __crc32cd(0, t0);
+		crc ^= crc0;
+
+		data += sizeof(uint64_t);
+	}
+
+	if (!(len += 1024))
+		return crc;
+
+	while ((len -= sizeof(uint64_t)) >= 0) {
+		crc = __crc32cd(crc, *(const uint64_t *)data);
+		data += sizeof(uint64_t);
+	}
+
+	/* The following is more efficient than the straight loop */
+	if (len & sizeof(uint32_t)) {
+		crc = __crc32cw(crc, *(const uint32_t *)data);
+		data += sizeof(uint32_t);
+	}
+	if (len & sizeof(uint16_t)) {
+		crc = __crc32ch(crc, *(const uint16_t *)data);
+		data += 
sizeof(uint16_t);
+	}
+	if (len & sizeof(uint8_t)) {
+		crc = __crc32cb(crc, *(const uint8_t *)data);
+	}
+
+	return crc;
+}
+
+void crc32c_arm64_probe(void)
+{
+	if (!crc32c_probed) {
+		crc32c_arm64_available = os_cpu_has(CPU_ARM64_CRC32C);
+		crc32c_probed = true;
+	}
+}
+
+#endif /* ARCH_HAVE_CRC_CRYPTO */
diff --git a/crc/crc32c-intel.c b/crc/crc32c-intel.c
new file mode 100644
index 0000000..6e810a2
--- /dev/null
+++ b/crc/crc32c-intel.c
@@ -0,0 +1,87 @@
+#include "crc32c.h"
+
+/*
+ * Based on a posting to lkml by Austin Zhang
+ *
+ * Using hardware provided CRC32 instruction to accelerate the CRC32 calculation.
+ * CRC32C polynomial:0x1EDC6F41(BE)/0x82F63B78(LE)
+ * CRC32 is a new instruction in Intel SSE4.2, the reference can be found at:
+ * http://www.intel.com/products/processor/manuals/
+ * Intel(R) 64 and IA-32 Architectures Software Developer's Manual
+ * Volume 2A: Instruction Set Reference, A-M
+ */
+
+bool crc32c_intel_available = false;
+
+#ifdef ARCH_HAVE_SSE4_2
+
+#if BITS_PER_LONG == 64
+#define REX_PRE "0x48, "
+#define SCALE_F 8
+#else
+#define REX_PRE
+#define SCALE_F 4
+#endif
+
+static bool crc32c_probed;
+
+static uint32_t crc32c_intel_le_hw_byte(uint32_t crc, unsigned char const *data,
+					unsigned long length)
+{
+	while (length--) {
+		__asm__ __volatile__(
+			".byte 0xf2, 0xf, 0x38, 0xf0, 0xf1"
+			:"=S"(crc)
+			:"0"(crc), "c"(*data)
+		);
+		data++;
+	}
+
+	return crc;
+}
+
+/*
+ * Steps through buffer one byte at a time, calculates reflected
+ * crc using table.
+ */
+uint32_t crc32c_intel(unsigned char const *data, unsigned long length)
+{
+	unsigned int iquotient = length / SCALE_F;
+	unsigned int iremainder = length % SCALE_F;
+#if BITS_PER_LONG == 64
+	uint64_t *ptmp = (uint64_t *) data;
+#else
+	uint32_t *ptmp = (uint32_t *) data;
+#endif
+	uint32_t crc = ~0;
+
+	while (iquotient--) {
+		__asm__ __volatile__(
+			".byte 0xf2, " REX_PRE "0xf, 0x38, 0xf1, 0xf1;"
+			:"=S"(crc)
+			:"0"(crc), "c"(*ptmp)
+		);
+		ptmp++;
+	}
+
+	if (iremainder)
+		crc = crc32c_intel_le_hw_byte(crc, (unsigned char *)ptmp,
+				 iremainder);
+
+	return crc;
+}
+
+void crc32c_intel_probe(void)
+{
+	if (!crc32c_probed) {
+		unsigned int eax, ebx, ecx = 0, edx;
+
+		eax = 1;
+
+		do_cpuid(&eax, &ebx, &ecx, &edx);
+		crc32c_intel_available = (ecx & (1 << 20)) != 0;
+		crc32c_probed = true;
+	}
+}
+
+#endif /* ARCH_HAVE_SSE4_2 */
diff --git a/crc/crc32c.c b/crc/crc32c.c
new file mode 100644
index 0000000..34944ae
--- /dev/null
+++ b/crc/crc32c.c
@@ -0,0 +1,124 @@
+/*
+ * CRC32C
+ * @Article{castagnoli-crc,
+ * author = { Guy Castagnoli and Stefan Braeuer and Martin Herrman},
+ * title = {{Optimization of Cyclic Redundancy-Check Codes with 24
+ * and 32 Parity Bits}},
+ * journal = IEEE Transactions on Communication,
+ * year = {1993},
+ * volume = {41},
+ * number = {6},
+ * pages = {},
+ * month = {June},
+ *}
+ * Used by the iSCSI driver, possibly others, and derived from
+ * the iscsi-crc.c module of the linux-iscsi driver at
+ * http://linux-iscsi.sourceforge.net.
+ *
+ * Following the example of lib/crc32, this function is intended to be
+ * flexible and useful for all users. Modules that currently have their
+ * own crc32c, but hopefully may be able to use this one are:
+ *  net/sctp (please add all your doco to here if you change to
+ *            use this one!)
+ *
+ *
+ * Copyright (c) 2004 Cisco Systems, Inc. 
+ * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the Free + * Software Foundation; either version 2 of the License, or (at your option) + * any later version. + * + */ +#include "crc32c.h" + +/* + * This is the CRC-32C table + * Generated with: + * width = 32 bits + * poly = 0x1EDC6F41 + * reflect input bytes = true + * reflect output bytes = true + */ + +static const uint32_t crc32c_table[256] = { + 0x00000000L, 0xF26B8303L, 0xE13B70F7L, 0x1350F3F4L, + 0xC79A971FL, 0x35F1141CL, 0x26A1E7E8L, 0xD4CA64EBL, + 0x8AD958CFL, 0x78B2DBCCL, 0x6BE22838L, 0x9989AB3BL, + 0x4D43CFD0L, 0xBF284CD3L, 0xAC78BF27L, 0x5E133C24L, + 0x105EC76FL, 0xE235446CL, 0xF165B798L, 0x030E349BL, + 0xD7C45070L, 0x25AFD373L, 0x36FF2087L, 0xC494A384L, + 0x9A879FA0L, 0x68EC1CA3L, 0x7BBCEF57L, 0x89D76C54L, + 0x5D1D08BFL, 0xAF768BBCL, 0xBC267848L, 0x4E4DFB4BL, + 0x20BD8EDEL, 0xD2D60DDDL, 0xC186FE29L, 0x33ED7D2AL, + 0xE72719C1L, 0x154C9AC2L, 0x061C6936L, 0xF477EA35L, + 0xAA64D611L, 0x580F5512L, 0x4B5FA6E6L, 0xB93425E5L, + 0x6DFE410EL, 0x9F95C20DL, 0x8CC531F9L, 0x7EAEB2FAL, + 0x30E349B1L, 0xC288CAB2L, 0xD1D83946L, 0x23B3BA45L, + 0xF779DEAEL, 0x05125DADL, 0x1642AE59L, 0xE4292D5AL, + 0xBA3A117EL, 0x4851927DL, 0x5B016189L, 0xA96AE28AL, + 0x7DA08661L, 0x8FCB0562L, 0x9C9BF696L, 0x6EF07595L, + 0x417B1DBCL, 0xB3109EBFL, 0xA0406D4BL, 0x522BEE48L, + 0x86E18AA3L, 0x748A09A0L, 0x67DAFA54L, 0x95B17957L, + 0xCBA24573L, 0x39C9C670L, 0x2A993584L, 0xD8F2B687L, + 0x0C38D26CL, 0xFE53516FL, 0xED03A29BL, 0x1F682198L, + 0x5125DAD3L, 0xA34E59D0L, 0xB01EAA24L, 0x42752927L, + 0x96BF4DCCL, 0x64D4CECFL, 0x77843D3BL, 0x85EFBE38L, + 0xDBFC821CL, 0x2997011FL, 0x3AC7F2EBL, 0xC8AC71E8L, + 0x1C661503L, 0xEE0D9600L, 0xFD5D65F4L, 0x0F36E6F7L, + 0x61C69362L, 0x93AD1061L, 0x80FDE395L, 0x72966096L, + 0xA65C047DL, 0x5437877EL, 0x4767748AL, 0xB50CF789L, + 0xEB1FCBADL, 0x197448AEL, 0x0A24BB5AL, 0xF84F3859L, + 0x2C855CB2L, 0xDEEEDFB1L, 0xCDBE2C45L, 0x3FD5AF46L, + 0x7198540DL, 0x83F3D70EL, 0x90A324FAL, 0x62C8A7F9L, + 0xB602C312L, 0x44694011L, 0x5739B3E5L, 0xA55230E6L, + 0xFB410CC2L, 0x092A8FC1L, 0x1A7A7C35L, 0xE811FF36L, + 0x3CDB9BDDL, 0xCEB018DEL, 0xDDE0EB2AL, 0x2F8B6829L, + 0x82F63B78L, 0x709DB87BL, 0x63CD4B8FL, 0x91A6C88CL, + 0x456CAC67L, 0xB7072F64L, 0xA457DC90L, 0x563C5F93L, + 0x082F63B7L, 0xFA44E0B4L, 0xE9141340L, 0x1B7F9043L, + 0xCFB5F4A8L, 0x3DDE77ABL, 0x2E8E845FL, 0xDCE5075CL, + 0x92A8FC17L, 0x60C37F14L, 0x73938CE0L, 0x81F80FE3L, + 0x55326B08L, 0xA759E80BL, 0xB4091BFFL, 0x466298FCL, + 0x1871A4D8L, 0xEA1A27DBL, 0xF94AD42FL, 0x0B21572CL, + 0xDFEB33C7L, 0x2D80B0C4L, 0x3ED04330L, 0xCCBBC033L, + 0xA24BB5A6L, 0x502036A5L, 0x4370C551L, 0xB11B4652L, + 0x65D122B9L, 0x97BAA1BAL, 0x84EA524EL, 0x7681D14DL, + 0x2892ED69L, 0xDAF96E6AL, 0xC9A99D9EL, 0x3BC21E9DL, + 0xEF087A76L, 0x1D63F975L, 0x0E330A81L, 0xFC588982L, + 0xB21572C9L, 0x407EF1CAL, 0x532E023EL, 0xA145813DL, + 0x758FE5D6L, 0x87E466D5L, 0x94B49521L, 0x66DF1622L, + 0x38CC2A06L, 0xCAA7A905L, 0xD9F75AF1L, 0x2B9CD9F2L, + 0xFF56BD19L, 0x0D3D3E1AL, 0x1E6DCDEEL, 0xEC064EEDL, + 0xC38D26C4L, 0x31E6A5C7L, 0x22B65633L, 0xD0DDD530L, + 0x0417B1DBL, 0xF67C32D8L, 0xE52CC12CL, 0x1747422FL, + 0x49547E0BL, 0xBB3FFD08L, 0xA86F0EFCL, 0x5A048DFFL, + 0x8ECEE914L, 0x7CA56A17L, 0x6FF599E3L, 0x9D9E1AE0L, + 0xD3D3E1ABL, 0x21B862A8L, 0x32E8915CL, 0xC083125FL, + 0x144976B4L, 0xE622F5B7L, 0xF5720643L, 0x07198540L, + 0x590AB964L, 0xAB613A67L, 0xB831C993L, 0x4A5A4A90L, + 0x9E902E7BL, 0x6CFBAD78L, 0x7FAB5E8CL, 0x8DC0DD8FL, + 0xE330A81AL, 0x115B2B19L, 
0x020BD8EDL, 0xF0605BEEL,
+	0x24AA3F05L, 0xD6C1BC06L, 0xC5914FF2L, 0x37FACCF1L,
+	0x69E9F0D5L, 0x9B8273D6L, 0x88D28022L, 0x7AB90321L,
+	0xAE7367CAL, 0x5C18E4C9L, 0x4F48173DL, 0xBD23943EL,
+	0xF36E6F75L, 0x0105EC76L, 0x12551F82L, 0xE03E9C81L,
+	0x34F4F86AL, 0xC69F7B69L, 0xD5CF889DL, 0x27A40B9EL,
+	0x79B737BAL, 0x8BDCB4B9L, 0x988C474DL, 0x6AE7C44EL,
+	0xBE2DA0A5L, 0x4C4623A6L, 0x5F16D052L, 0xAD7D5351L
+};
+
+/*
+ * Steps through buffer one byte at a time, calculates reflected
+ * crc using table.
+ */
+
+uint32_t crc32c_sw(unsigned char const *data, unsigned long length)
+{
+	uint32_t crc = ~0;
+
+	while (length--)
+		crc = crc32c_table[(crc ^ *data++) & 0xFFL] ^ (crc >> 8);
+
+	return crc;
+}
diff --git a/crc/crc32c.h b/crc/crc32c.h
new file mode 100644
index 0000000..18f1161
--- /dev/null
+++ b/crc/crc32c.h
@@ -0,0 +1,61 @@
+/* crc32c -- calculate the POSIX.2 checksum
+   Copyright (C) 92, 1995-1999 Free Software Foundation, Inc.
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 2, or (at your option)
+   any later version.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, write to the Free Software Foundation,
+   Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. */
+
+#ifndef CRC32C_H
+#define CRC32C_H
+
+#include <inttypes.h>
+
+#include "../arch/arch.h"
+#include "../lib/types.h"
+
+extern uint32_t crc32c_sw(unsigned char const *, unsigned long);
+extern bool crc32c_arm64_available;
+extern bool crc32c_intel_available;
+
+#ifdef ARCH_HAVE_CRC_CRYPTO
+extern uint32_t crc32c_arm64(unsigned char const *, unsigned long);
+extern void crc32c_arm64_probe(void);
+#else
+#define crc32c_arm64 crc32c_sw
+static inline void crc32c_arm64_probe(void)
+{
+}
+#endif /* ARCH_HAVE_CRC_CRYPTO */
+
+#ifdef ARCH_HAVE_SSE4_2
+extern uint32_t crc32c_intel(unsigned char const *, unsigned long);
+extern void crc32c_intel_probe(void);
+#else
+#define crc32c_intel crc32c_sw
+static inline void crc32c_intel_probe(void)
+{
+}
+#endif /* ARCH_HAVE_SSE4_2 */
+
+static inline uint32_t fio_crc32c(unsigned char const *buf, unsigned long len)
+{
+	if (crc32c_arm64_available)
+		return crc32c_arm64(buf, len);
+
+	if (crc32c_intel_available)
+		return crc32c_intel(buf, len);
+
+	return crc32c_sw(buf, len);
+}
+
+#endif
diff --git a/crc/crc64.c b/crc/crc64.c
new file mode 100644
index 0000000..bf24a97
--- /dev/null
+++ b/crc/crc64.c
@@ -0,0 +1,104 @@
+#include "crc64.h"
+
+/*
+ * poly 0x95AC9329AC4BC9B5ULL and init 0xFFFFFFFFFFFFFFFFULL
+ */
+static const unsigned long long crctab64[256] = {
+	0x0000000000000000ULL, 0x7ad870c830358979ULL, 0xf5b0e190606b12f2ULL,
+	0x8f689158505e9b8bULL, 0xc038e5739841b68fULL, 0xbae095bba8743ff6ULL,
+	0x358804e3f82aa47dULL, 0x4f50742bc81f2d04ULL, 0xab28ecb46814fe75ULL,
+	0xd1f09c7c5821770cULL, 0x5e980d24087fec87ULL, 0x24407dec384a65feULL,
+	0x6b1009c7f05548faULL, 0x11c8790fc060c183ULL, 0x9ea0e857903e5a08ULL,
+	0xe478989fa00bd371ULL, 0x7d08ff3b88be6f81ULL, 0x07d08ff3b88be6f8ULL,
+	0x88b81eabe8d57d73ULL, 0xf2606e63d8e0f40aULL, 0xbd301a4810ffd90eULL,
+	0xc7e86a8020ca5077ULL, 0x4880fbd87094cbfcULL, 0x32588b1040a14285ULL,
+	0xd620138fe0aa91f4ULL, 0xacf86347d09f188dULL, 
0x2390f21f80c18306ULL, + 0x594882d7b0f40a7fULL, 0x1618f6fc78eb277bULL, 0x6cc0863448deae02ULL, + 0xe3a8176c18803589ULL, 0x997067a428b5bcf0ULL, 0xfa11fe77117cdf02ULL, + 0x80c98ebf2149567bULL, 0x0fa11fe77117cdf0ULL, 0x75796f2f41224489ULL, + 0x3a291b04893d698dULL, 0x40f16bccb908e0f4ULL, 0xcf99fa94e9567b7fULL, + 0xb5418a5cd963f206ULL, 0x513912c379682177ULL, 0x2be1620b495da80eULL, + 0xa489f35319033385ULL, 0xde51839b2936bafcULL, 0x9101f7b0e12997f8ULL, + 0xebd98778d11c1e81ULL, 0x64b116208142850aULL, 0x1e6966e8b1770c73ULL, + 0x8719014c99c2b083ULL, 0xfdc17184a9f739faULL, 0x72a9e0dcf9a9a271ULL, + 0x08719014c99c2b08ULL, 0x4721e43f0183060cULL, 0x3df994f731b68f75ULL, + 0xb29105af61e814feULL, 0xc849756751dd9d87ULL, 0x2c31edf8f1d64ef6ULL, + 0x56e99d30c1e3c78fULL, 0xd9810c6891bd5c04ULL, 0xa3597ca0a188d57dULL, + 0xec09088b6997f879ULL, 0x96d1784359a27100ULL, 0x19b9e91b09fcea8bULL, + 0x636199d339c963f2ULL, 0xdf7adabd7a6e2d6fULL, 0xa5a2aa754a5ba416ULL, + 0x2aca3b2d1a053f9dULL, 0x50124be52a30b6e4ULL, 0x1f423fcee22f9be0ULL, + 0x659a4f06d21a1299ULL, 0xeaf2de5e82448912ULL, 0x902aae96b271006bULL, + 0x74523609127ad31aULL, 0x0e8a46c1224f5a63ULL, 0x81e2d7997211c1e8ULL, + 0xfb3aa75142244891ULL, 0xb46ad37a8a3b6595ULL, 0xceb2a3b2ba0eececULL, + 0x41da32eaea507767ULL, 0x3b024222da65fe1eULL, 0xa2722586f2d042eeULL, + 0xd8aa554ec2e5cb97ULL, 0x57c2c41692bb501cULL, 0x2d1ab4dea28ed965ULL, + 0x624ac0f56a91f461ULL, 0x1892b03d5aa47d18ULL, 0x97fa21650afae693ULL, + 0xed2251ad3acf6feaULL, 0x095ac9329ac4bc9bULL, 0x7382b9faaaf135e2ULL, + 0xfcea28a2faafae69ULL, 0x8632586aca9a2710ULL, 0xc9622c4102850a14ULL, + 0xb3ba5c8932b0836dULL, 0x3cd2cdd162ee18e6ULL, 0x460abd1952db919fULL, + 0x256b24ca6b12f26dULL, 0x5fb354025b277b14ULL, 0xd0dbc55a0b79e09fULL, + 0xaa03b5923b4c69e6ULL, 0xe553c1b9f35344e2ULL, 0x9f8bb171c366cd9bULL, + 0x10e3202993385610ULL, 0x6a3b50e1a30ddf69ULL, 0x8e43c87e03060c18ULL, + 0xf49bb8b633338561ULL, 0x7bf329ee636d1eeaULL, 0x012b592653589793ULL, + 0x4e7b2d0d9b47ba97ULL, 0x34a35dc5ab7233eeULL, 0xbbcbcc9dfb2ca865ULL, + 0xc113bc55cb19211cULL, 0x5863dbf1e3ac9decULL, 0x22bbab39d3991495ULL, + 0xadd33a6183c78f1eULL, 0xd70b4aa9b3f20667ULL, 0x985b3e827bed2b63ULL, + 0xe2834e4a4bd8a21aULL, 0x6debdf121b863991ULL, 0x1733afda2bb3b0e8ULL, + 0xf34b37458bb86399ULL, 0x8993478dbb8deae0ULL, 0x06fbd6d5ebd3716bULL, + 0x7c23a61ddbe6f812ULL, 0x3373d23613f9d516ULL, 0x49aba2fe23cc5c6fULL, + 0xc6c333a67392c7e4ULL, 0xbc1b436e43a74e9dULL, 0x95ac9329ac4bc9b5ULL, + 0xef74e3e19c7e40ccULL, 0x601c72b9cc20db47ULL, 0x1ac40271fc15523eULL, + 0x5594765a340a7f3aULL, 0x2f4c0692043ff643ULL, 0xa02497ca54616dc8ULL, + 0xdafce7026454e4b1ULL, 0x3e847f9dc45f37c0ULL, 0x445c0f55f46abeb9ULL, + 0xcb349e0da4342532ULL, 0xb1eceec59401ac4bULL, 0xfebc9aee5c1e814fULL, + 0x8464ea266c2b0836ULL, 0x0b0c7b7e3c7593bdULL, 0x71d40bb60c401ac4ULL, + 0xe8a46c1224f5a634ULL, 0x927c1cda14c02f4dULL, 0x1d148d82449eb4c6ULL, + 0x67ccfd4a74ab3dbfULL, 0x289c8961bcb410bbULL, 0x5244f9a98c8199c2ULL, + 0xdd2c68f1dcdf0249ULL, 0xa7f41839ecea8b30ULL, 0x438c80a64ce15841ULL, + 0x3954f06e7cd4d138ULL, 0xb63c61362c8a4ab3ULL, 0xcce411fe1cbfc3caULL, + 0x83b465d5d4a0eeceULL, 0xf96c151de49567b7ULL, 0x76048445b4cbfc3cULL, + 0x0cdcf48d84fe7545ULL, 0x6fbd6d5ebd3716b7ULL, 0x15651d968d029fceULL, + 0x9a0d8ccedd5c0445ULL, 0xe0d5fc06ed698d3cULL, 0xaf85882d2576a038ULL, + 0xd55df8e515432941ULL, 0x5a3569bd451db2caULL, 0x20ed197575283bb3ULL, + 0xc49581ead523e8c2ULL, 0xbe4df122e51661bbULL, 0x3125607ab548fa30ULL, + 0x4bfd10b2857d7349ULL, 0x04ad64994d625e4dULL, 0x7e7514517d57d734ULL, + 0xf11d85092d094cbfULL, 0x8bc5f5c11d3cc5c6ULL, 
0x12b5926535897936ULL, + 0x686de2ad05bcf04fULL, 0xe70573f555e26bc4ULL, 0x9ddd033d65d7e2bdULL, + 0xd28d7716adc8cfb9ULL, 0xa85507de9dfd46c0ULL, 0x273d9686cda3dd4bULL, + 0x5de5e64efd965432ULL, 0xb99d7ed15d9d8743ULL, 0xc3450e196da80e3aULL, + 0x4c2d9f413df695b1ULL, 0x36f5ef890dc31cc8ULL, 0x79a59ba2c5dc31ccULL, + 0x037deb6af5e9b8b5ULL, 0x8c157a32a5b7233eULL, 0xf6cd0afa9582aa47ULL, + 0x4ad64994d625e4daULL, 0x300e395ce6106da3ULL, 0xbf66a804b64ef628ULL, + 0xc5bed8cc867b7f51ULL, 0x8aeeace74e645255ULL, 0xf036dc2f7e51db2cULL, + 0x7f5e4d772e0f40a7ULL, 0x05863dbf1e3ac9deULL, 0xe1fea520be311aafULL, + 0x9b26d5e88e0493d6ULL, 0x144e44b0de5a085dULL, 0x6e963478ee6f8124ULL, + 0x21c640532670ac20ULL, 0x5b1e309b16452559ULL, 0xd476a1c3461bbed2ULL, + 0xaeaed10b762e37abULL, 0x37deb6af5e9b8b5bULL, 0x4d06c6676eae0222ULL, + 0xc26e573f3ef099a9ULL, 0xb8b627f70ec510d0ULL, 0xf7e653dcc6da3dd4ULL, + 0x8d3e2314f6efb4adULL, 0x0256b24ca6b12f26ULL, 0x788ec2849684a65fULL, + 0x9cf65a1b368f752eULL, 0xe62e2ad306bafc57ULL, 0x6946bb8b56e467dcULL, + 0x139ecb4366d1eea5ULL, 0x5ccebf68aecec3a1ULL, 0x2616cfa09efb4ad8ULL, + 0xa97e5ef8cea5d153ULL, 0xd3a62e30fe90582aULL, 0xb0c7b7e3c7593bd8ULL, + 0xca1fc72bf76cb2a1ULL, 0x45775673a732292aULL, 0x3faf26bb9707a053ULL, + 0x70ff52905f188d57ULL, 0x0a2722586f2d042eULL, 0x854fb3003f739fa5ULL, + 0xff97c3c80f4616dcULL, 0x1bef5b57af4dc5adULL, 0x61372b9f9f784cd4ULL, + 0xee5fbac7cf26d75fULL, 0x9487ca0fff135e26ULL, 0xdbd7be24370c7322ULL, + 0xa10fceec0739fa5bULL, 0x2e675fb4576761d0ULL, 0x54bf2f7c6752e8a9ULL, + 0xcdcf48d84fe75459ULL, 0xb71738107fd2dd20ULL, 0x387fa9482f8c46abULL, + 0x42a7d9801fb9cfd2ULL, 0x0df7adabd7a6e2d6ULL, 0x772fdd63e7936bafULL, + 0xf8474c3bb7cdf024ULL, 0x829f3cf387f8795dULL, 0x66e7a46c27f3aa2cULL, + 0x1c3fd4a417c62355ULL, 0x935745fc4798b8deULL, 0xe98f353477ad31a7ULL, + 0xa6df411fbfb21ca3ULL, 0xdc0731d78f8795daULL, 0x536fa08fdfd90e51ULL, + 0x29b7d047efec8728ULL +}; + +unsigned long long fio_crc64(const unsigned char *buffer, unsigned long length) +{ + unsigned long long crc = 0; + + while (length--) + crc = crctab64[(crc ^ *(buffer++)) & 0xff] ^ (crc >> 8); + + return crc; +} + diff --git a/crc/crc64.h b/crc/crc64.h new file mode 100644 index 0000000..fe9cad3 --- /dev/null +++ b/crc/crc64.h @@ -0,0 +1,6 @@ +#ifndef CRC64_H +#define CRC64_H + +unsigned long long fio_crc64(const unsigned char *, unsigned long); + +#endif diff --git a/crc/crc7.c b/crc/crc7.c new file mode 100644 index 0000000..bf7fd1c --- /dev/null +++ b/crc/crc7.c @@ -0,0 +1,53 @@ +/* + * crc7.c + * + * This source code is licensed under the GNU General Public License, + * Version 2. See the file COPYING for more details. 
+ */ + +#include "crc7.h" + +/* Table for CRC-7 (polynomial x^7 + x^3 + 1) */ +const unsigned char crc7_syndrome_table[256] = { + 0x00, 0x09, 0x12, 0x1b, 0x24, 0x2d, 0x36, 0x3f, + 0x48, 0x41, 0x5a, 0x53, 0x6c, 0x65, 0x7e, 0x77, + 0x19, 0x10, 0x0b, 0x02, 0x3d, 0x34, 0x2f, 0x26, + 0x51, 0x58, 0x43, 0x4a, 0x75, 0x7c, 0x67, 0x6e, + 0x32, 0x3b, 0x20, 0x29, 0x16, 0x1f, 0x04, 0x0d, + 0x7a, 0x73, 0x68, 0x61, 0x5e, 0x57, 0x4c, 0x45, + 0x2b, 0x22, 0x39, 0x30, 0x0f, 0x06, 0x1d, 0x14, + 0x63, 0x6a, 0x71, 0x78, 0x47, 0x4e, 0x55, 0x5c, + 0x64, 0x6d, 0x76, 0x7f, 0x40, 0x49, 0x52, 0x5b, + 0x2c, 0x25, 0x3e, 0x37, 0x08, 0x01, 0x1a, 0x13, + 0x7d, 0x74, 0x6f, 0x66, 0x59, 0x50, 0x4b, 0x42, + 0x35, 0x3c, 0x27, 0x2e, 0x11, 0x18, 0x03, 0x0a, + 0x56, 0x5f, 0x44, 0x4d, 0x72, 0x7b, 0x60, 0x69, + 0x1e, 0x17, 0x0c, 0x05, 0x3a, 0x33, 0x28, 0x21, + 0x4f, 0x46, 0x5d, 0x54, 0x6b, 0x62, 0x79, 0x70, + 0x07, 0x0e, 0x15, 0x1c, 0x23, 0x2a, 0x31, 0x38, + 0x41, 0x48, 0x53, 0x5a, 0x65, 0x6c, 0x77, 0x7e, + 0x09, 0x00, 0x1b, 0x12, 0x2d, 0x24, 0x3f, 0x36, + 0x58, 0x51, 0x4a, 0x43, 0x7c, 0x75, 0x6e, 0x67, + 0x10, 0x19, 0x02, 0x0b, 0x34, 0x3d, 0x26, 0x2f, + 0x73, 0x7a, 0x61, 0x68, 0x57, 0x5e, 0x45, 0x4c, + 0x3b, 0x32, 0x29, 0x20, 0x1f, 0x16, 0x0d, 0x04, + 0x6a, 0x63, 0x78, 0x71, 0x4e, 0x47, 0x5c, 0x55, + 0x22, 0x2b, 0x30, 0x39, 0x06, 0x0f, 0x14, 0x1d, + 0x25, 0x2c, 0x37, 0x3e, 0x01, 0x08, 0x13, 0x1a, + 0x6d, 0x64, 0x7f, 0x76, 0x49, 0x40, 0x5b, 0x52, + 0x3c, 0x35, 0x2e, 0x27, 0x18, 0x11, 0x0a, 0x03, + 0x74, 0x7d, 0x66, 0x6f, 0x50, 0x59, 0x42, 0x4b, + 0x17, 0x1e, 0x05, 0x0c, 0x33, 0x3a, 0x21, 0x28, + 0x5f, 0x56, 0x4d, 0x44, 0x7b, 0x72, 0x69, 0x60, + 0x0e, 0x07, 0x1c, 0x15, 0x2a, 0x23, 0x38, 0x31, + 0x46, 0x4f, 0x54, 0x5d, 0x62, 0x6b, 0x70, 0x79 +}; + +unsigned char fio_crc7(const unsigned char *buffer, unsigned int len) +{ + unsigned char crc = 0; + + while (len--) + crc = crc7_byte(crc, *buffer++); + return crc; +} diff --git a/crc/crc7.h b/crc/crc7.h new file mode 100644 index 0000000..5d5d188 --- /dev/null +++ b/crc/crc7.h @@ -0,0 +1,13 @@ +#ifndef CRC7_H +#define CRC7_H + +extern const unsigned char crc7_syndrome_table[256]; + +static inline unsigned char crc7_byte(unsigned char crc, unsigned char data) +{ + return crc7_syndrome_table[(crc << 1) ^ data]; +} + +extern unsigned char fio_crc7(const unsigned char *buffer, unsigned int len); + +#endif diff --git a/crc/fnv.c b/crc/fnv.c new file mode 100644 index 0000000..4cd0650 --- /dev/null +++ b/crc/fnv.c @@ -0,0 +1,34 @@ +#include "fnv.h" + +#define FNV_PRIME 0x100000001b3ULL + +/* + * 64-bit fnv, but don't require 64-bit multiples of data. Use bytes + * for the last unaligned chunk. 
+ */
+uint64_t fnv(const void *buf, uint32_t len, uint64_t hval)
+{
+	const uint64_t *ptr = buf;
+
+	while (len) {
+		hval *= FNV_PRIME;
+		if (len >= sizeof(uint64_t)) {
+			hval ^= (uint64_t) *ptr++;
+			len -= sizeof(uint64_t);
+			continue;
+		} else {
+			const uint8_t *ptr8 = (const uint8_t *) ptr;
+			uint64_t val = 0;
+			int i;
+
+			for (i = 0; i < len; i++) {
+				val <<= 8;
+				val |= (uint8_t) *ptr8++;
+			}
+			hval ^= val;
+			break;
+		}
+	}
+
+	return hval;
+}
diff --git a/crc/fnv.h b/crc/fnv.h
new file mode 100644
index 0000000..ef2b77b
--- /dev/null
+++ b/crc/fnv.h
@@ -0,0 +1,8 @@
+#ifndef FIO_FNV_H
+#define FIO_FNV_H
+
+#include <inttypes.h>
+
+uint64_t fnv(const void *, uint32_t, uint64_t);
+
+#endif
diff --git a/crc/md5.c b/crc/md5.c
new file mode 100644
index 0000000..ade4f69
--- /dev/null
+++ b/crc/md5.c
@@ -0,0 +1,146 @@
+/*
+ * Shamelessly lifted from the 2.6 kernel (crypto/md5.c)
+ */
+#include <string.h>
+#include "md5.h"
+
+static void md5_transform(uint32_t *hash, uint32_t const *in)
+{
+	uint32_t a, b, c, d;
+
+	a = hash[0];
+	b = hash[1];
+	c = hash[2];
+	d = hash[3];
+
+	MD5STEP(F1, a, b, c, d, in[0] + 0xd76aa478, 7);
+	MD5STEP(F1, d, a, b, c, in[1] + 0xe8c7b756, 12);
+	MD5STEP(F1, c, d, a, b, in[2] + 0x242070db, 17);
+	MD5STEP(F1, b, c, d, a, in[3] + 0xc1bdceee, 22);
+	MD5STEP(F1, a, b, c, d, in[4] + 0xf57c0faf, 7);
+	MD5STEP(F1, d, a, b, c, in[5] + 0x4787c62a, 12);
+	MD5STEP(F1, c, d, a, b, in[6] + 0xa8304613, 17);
+	MD5STEP(F1, b, c, d, a, in[7] + 0xfd469501, 22);
+	MD5STEP(F1, a, b, c, d, in[8] + 0x698098d8, 7);
+	MD5STEP(F1, d, a, b, c, in[9] + 0x8b44f7af, 12);
+	MD5STEP(F1, c, d, a, b, in[10] + 0xffff5bb1, 17);
+	MD5STEP(F1, b, c, d, a, in[11] + 0x895cd7be, 22);
+	MD5STEP(F1, a, b, c, d, in[12] + 0x6b901122, 7);
+	MD5STEP(F1, d, a, b, c, in[13] + 0xfd987193, 12);
+	MD5STEP(F1, c, d, a, b, in[14] + 0xa679438e, 17);
+	MD5STEP(F1, b, c, d, a, in[15] + 0x49b40821, 22);
+
+	MD5STEP(F2, a, b, c, d, in[1] + 0xf61e2562, 5);
+	MD5STEP(F2, d, a, b, c, in[6] + 0xc040b340, 9);
+	MD5STEP(F2, c, d, a, b, in[11] + 0x265e5a51, 14);
+	MD5STEP(F2, b, c, d, a, in[0] + 0xe9b6c7aa, 20);
+	MD5STEP(F2, a, b, c, d, in[5] + 0xd62f105d, 5);
+	MD5STEP(F2, d, a, b, c, in[10] + 0x02441453, 9);
+	MD5STEP(F2, c, d, a, b, in[15] + 0xd8a1e681, 14);
+	MD5STEP(F2, b, c, d, a, in[4] + 0xe7d3fbc8, 20);
+	MD5STEP(F2, a, b, c, d, in[9] + 0x21e1cde6, 5);
+	MD5STEP(F2, d, a, b, c, in[14] + 0xc33707d6, 9);
+	MD5STEP(F2, c, d, a, b, in[3] + 0xf4d50d87, 14);
+	MD5STEP(F2, b, c, d, a, in[8] + 0x455a14ed, 20);
+	MD5STEP(F2, a, b, c, d, in[13] + 0xa9e3e905, 5);
+	MD5STEP(F2, d, a, b, c, in[2] + 0xfcefa3f8, 9);
+	MD5STEP(F2, c, d, a, b, in[7] + 0x676f02d9, 14);
+	MD5STEP(F2, b, c, d, a, in[12] + 0x8d2a4c8a, 20);
+
+	MD5STEP(F3, a, b, c, d, in[5] + 0xfffa3942, 4);
+	MD5STEP(F3, d, a, b, c, in[8] + 0x8771f681, 11);
+	MD5STEP(F3, c, d, a, b, in[11] + 0x6d9d6122, 16);
+	MD5STEP(F3, b, c, d, a, in[14] + 0xfde5380c, 23);
+	MD5STEP(F3, a, b, c, d, in[1] + 0xa4beea44, 4);
+	MD5STEP(F3, d, a, b, c, in[4] + 0x4bdecfa9, 11);
+	MD5STEP(F3, c, d, a, b, in[7] + 0xf6bb4b60, 16);
+	MD5STEP(F3, b, c, d, a, in[10] + 0xbebfbc70, 23);
+	MD5STEP(F3, a, b, c, d, in[13] + 0x289b7ec6, 4);
+	MD5STEP(F3, d, a, b, c, in[0] + 0xeaa127fa, 11);
+	MD5STEP(F3, c, d, a, b, in[3] + 0xd4ef3085, 16);
+	MD5STEP(F3, b, c, d, a, in[6] + 0x04881d05, 23);
+	MD5STEP(F3, a, b, c, d, in[9] + 0xd9d4d039, 4);
+	MD5STEP(F3, d, a, b, c, in[12] + 0xe6db99e5, 11);
+	MD5STEP(F3, c, d, a, b, in[15] + 0x1fa27cf8, 16);
+	MD5STEP(F3, b, c, d, a, in[2] + 0xc4ac5665, 23);
+
+	MD5STEP(F4, a, b, c, d, in[0] 
+ 0xf4292244, 6);
+	MD5STEP(F4, d, a, b, c, in[7] + 0x432aff97, 10);
+	MD5STEP(F4, c, d, a, b, in[14] + 0xab9423a7, 15);
+	MD5STEP(F4, b, c, d, a, in[5] + 0xfc93a039, 21);
+	MD5STEP(F4, a, b, c, d, in[12] + 0x655b59c3, 6);
+	MD5STEP(F4, d, a, b, c, in[3] + 0x8f0ccc92, 10);
+	MD5STEP(F4, c, d, a, b, in[10] + 0xffeff47d, 15);
+	MD5STEP(F4, b, c, d, a, in[1] + 0x85845dd1, 21);
+	MD5STEP(F4, a, b, c, d, in[8] + 0x6fa87e4f, 6);
+	MD5STEP(F4, d, a, b, c, in[15] + 0xfe2ce6e0, 10);
+	MD5STEP(F4, c, d, a, b, in[6] + 0xa3014314, 15);
+	MD5STEP(F4, b, c, d, a, in[13] + 0x4e0811a1, 21);
+	MD5STEP(F4, a, b, c, d, in[4] + 0xf7537e82, 6);
+	MD5STEP(F4, d, a, b, c, in[11] + 0xbd3af235, 10);
+	MD5STEP(F4, c, d, a, b, in[2] + 0x2ad7d2bb, 15);
+	MD5STEP(F4, b, c, d, a, in[9] + 0xeb86d391, 21);
+
+	hash[0] += a;
+	hash[1] += b;
+	hash[2] += c;
+	hash[3] += d;
+}
+
+void fio_md5_init(struct fio_md5_ctx *mctx)
+{
+	mctx->hash[0] = 0x67452301;
+	mctx->hash[1] = 0xefcdab89;
+	mctx->hash[2] = 0x98badcfe;
+	mctx->hash[3] = 0x10325476;
+}
+
+void fio_md5_update(struct fio_md5_ctx *mctx, const uint8_t *data,
+		    unsigned int len)
+{
+	const uint32_t avail = sizeof(mctx->block) - (mctx->byte_count & 0x3f);
+
+	mctx->byte_count += len;
+
+	if (avail > len) {
+		memcpy((char *)mctx->block + (sizeof(mctx->block) - avail),
+		       data, len);
+		return;
+	}
+
+	memcpy((char *)mctx->block + (sizeof(mctx->block) - avail),
+	       data, avail);
+
+	md5_transform(mctx->hash, mctx->block);
+	data += avail;
+	len -= avail;
+
+	while (len >= sizeof(mctx->block)) {
+		memcpy(mctx->block, data, sizeof(mctx->block));
+		md5_transform(mctx->hash, mctx->block);
+		data += sizeof(mctx->block);
+		len -= sizeof(mctx->block);
+	}
+
+	memcpy(mctx->block, data, len);
+}
+
+void fio_md5_final(struct fio_md5_ctx *mctx)
+{
+	const unsigned int offset = mctx->byte_count & 0x3f;
+	char *p = (char *)mctx->block + offset;
+	int padding = 56 - (offset + 1);
+
+	*p++ = 0x80;
+	if (padding < 0) {
+		memset(p, 0x00, padding + sizeof (uint64_t));
+		md5_transform(mctx->hash, mctx->block);
+		p = (char *)mctx->block;
+		padding = 56;
+	}
+
+	memset(p, 0, padding);
+	mctx->block[14] = mctx->byte_count << 3;
+	mctx->block[15] = mctx->byte_count >> 29;
+	md5_transform(mctx->hash, mctx->block);
+}
diff --git a/crc/md5.h b/crc/md5.h
new file mode 100644
index 0000000..54e350c
--- /dev/null
+++ b/crc/md5.h
@@ -0,0 +1,29 @@
+#ifndef MD5_H
+#define MD5_H
+
+#include <inttypes.h>
+
+#define MD5_DIGEST_SIZE		16
+#define MD5_HMAC_BLOCK_SIZE	64
+#define MD5_BLOCK_WORDS		16
+#define MD5_HASH_WORDS		4
+
+#define F1(x, y, z)	(z ^ (x & (y ^ z)))
+#define F2(x, y, z)	F1(z, x, y)
+#define F3(x, y, z)	(x ^ y ^ z)
+#define F4(x, y, z)	(y ^ (x | ~z))
+
+#define MD5STEP(f, w, x, y, z, in, s) \
+	(w += f(x, y, z) + in, w = (w<<s | w>>(32-s)) + x)
+
+struct fio_md5_ctx {
+	uint32_t *hash;
+	uint32_t block[MD5_BLOCK_WORDS];
+	uint64_t byte_count;
+};
+
+extern void fio_md5_update(struct fio_md5_ctx *, const uint8_t *, unsigned int);
+extern void fio_md5_final(struct fio_md5_ctx *);
+extern void fio_md5_init(struct fio_md5_ctx *);
+
+#endif
diff --git a/crc/murmur3.c b/crc/murmur3.c
new file mode 100644
index 0000000..f4f2f2c
--- /dev/null
+++ b/crc/murmur3.c
@@ -0,0 +1,70 @@
+#include "murmur3.h"
+
+static inline uint32_t rotl32(uint32_t x, int8_t r)
+{
+	return (x << r) | (x >> (32 - r));
+}
+
+//-----------------------------------------------------------------------------
+// Finalization mix - force all bits of a hash block to avalanche
+
+static inline uint32_t fmix32(uint32_t h)
+{
+	h ^= h >> 16;
+	h *= 0x85ebca6b; 
+	h ^= h >> 13;
+	h *= 0xc2b2ae35;
+	h ^= h >> 16;
+
+	return h;
+}
+
+static uint32_t murmur3_tail(const uint8_t *data, const int nblocks,
+			     uint32_t len, const uint32_t c1,
+			     const uint32_t c2, uint32_t h1)
+{
+	const uint8_t *tail = (const uint8_t *)(data + nblocks * 4);
+
+	uint32_t k1 = 0;
+	switch (len & 3) {
+	case 3:
+		k1 ^= tail[2] << 16;
+		/* fall through */
+	case 2:
+		k1 ^= tail[1] << 8;
+		/* fall through */
+	case 1:
+		k1 ^= tail[0];
+		k1 *= c1;
+		k1 = rotl32(k1, 15);
+		k1 *= c2;
+		h1 ^= k1;
+	};
+
+	return fmix32(h1 ^ len);
+}
+
+uint32_t murmurhash3(const void *key, uint32_t len, uint32_t seed)
+{
+	const uint8_t *data = (const uint8_t *)key;
+	const int nblocks = len / 4;
+	uint32_t h1 = seed;
+	const uint32_t c1 = 0xcc9e2d51;
+	const uint32_t c2 = 0x1b873593;
+	const uint32_t *blocks = (const uint32_t *)(data + nblocks * 4);
+	int i;
+
+	for (i = -nblocks; i; i++) {
+		uint32_t k1 = blocks[i];
+
+		k1 *= c1;
+		k1 = rotl32(k1, 15);
+		k1 *= c2;
+
+		h1 ^= k1;
+		h1 = rotl32(h1, 13);
+		h1 = h1 * 5 + 0xe6546b64;
+	}
+
+	return murmur3_tail(data, nblocks, len, c1, c2, h1);
+}
diff --git a/crc/murmur3.h b/crc/murmur3.h
new file mode 100644
index 0000000..89f6500
--- /dev/null
+++ b/crc/murmur3.h
@@ -0,0 +1,8 @@
+#ifndef FIO_MURMUR3_H
+#define FIO_MURMUR3_H
+
+#include <inttypes.h>
+
+uint32_t murmurhash3(const void *key, uint32_t len, uint32_t seed);
+
+#endif
diff --git a/crc/sha1.c b/crc/sha1.c
new file mode 100644
index 0000000..8d64c8e
--- /dev/null
+++ b/crc/sha1.c
@@ -0,0 +1,216 @@
+/*
+ * Based on the Mozilla SHA1 (see mozilla-sha1/sha1.c),
+ * optimized to do word accesses rather than byte accesses,
+ * and to avoid unnecessary copies into the context array.
+ */
+
+#include <string.h>
+#include <arpa/inet.h>
+
+#include "sha1.h"
+
+/* Hash one 64-byte block of data */
+static void blk_SHA1Block(struct fio_sha1_ctx *ctx, const unsigned int *data);
+
+void fio_sha1_init(struct fio_sha1_ctx *ctx)
+{
+	ctx->size = 0;
+
+	/* Initialize H with the magic constants (see FIPS180 for constants)
+	 */
+	ctx->H[0] = 0x67452301;
+	ctx->H[1] = 0xefcdab89;
+	ctx->H[2] = 0x98badcfe;
+	ctx->H[3] = 0x10325476;
+	ctx->H[4] = 0xc3d2e1f0;
+}
+
+void fio_sha1_update(struct fio_sha1_ctx *ctx, const void *data,
+		     unsigned long len)
+{
+	int lenW = ctx->size & 63;
+
+	ctx->size += len;
+
+	/* Read the data into W and process blocks as they get full
+	 */
+	if (lenW) {
+		int left = 64 - lenW;
+		if (len < left)
+			left = len;
+		memcpy(lenW + (char *)ctx->W, data, left);
+		lenW = (lenW + left) & 63;
+		len -= left;
+		data += left;
+		if (lenW)
+			return;
+		blk_SHA1Block(ctx, ctx->W);
+	}
+	while (len >= 64) {
+		blk_SHA1Block(ctx, data);
+		data += 64;
+		len -= 64;
+	}
+	if (len)
+		memcpy(ctx->W, data, len);
+}
+
+void fio_sha1_final(struct fio_sha1_ctx *ctx)
+{
+	static const unsigned char pad[64] = { 0x80 };
+	unsigned int padlen[2];
+	int i;
+
+	/* Pad with a binary 1 (ie 0x80), then zeroes, then length
+	 */
+	padlen[0] = htonl(ctx->size >> 29);
+	padlen[1] = htonl(ctx->size << 3);
+
+	i = ctx->size & 63;
+	fio_sha1_update(ctx, pad, 1+ (63 & (55 - i)));
+	fio_sha1_update(ctx, padlen, 8);
+}
+
+#if defined(__i386__) || defined(__x86_64__)
+
+#define SHA_ASM(op, x, n) ({ unsigned int __res; __asm__(op " %1,%0":"=r" (__res):"i" (n), "0" (x)); __res; })
+#define SHA_ROL(x,n)	SHA_ASM("rol", x, n)
+#define SHA_ROR(x,n)	SHA_ASM("ror", x, n)
+
+#else
+
+#define SHA_ROT(X,l,r)	(((X) << (l)) | ((X) >> (r)))
+#define SHA_ROL(X,n)	SHA_ROT(X,n,32-(n))
+#define SHA_ROR(X,n)	SHA_ROT(X,32-(n),n)
+
+#endif
+
+/* This "rolls" over the 512-bit array */
+#define 
W(x) (array[(x)&15])
+#define setW(x, val) (*(volatile unsigned int *)&W(x) = (val))
+
+/*
+ * Where do we get the source from? The first 16 iterations get it from
+ * the input data, the next mix it from the 512-bit array.
+ */
+#define SHA_SRC(t) htonl(data[t])
+#define SHA_MIX(t) SHA_ROL(W(t+13) ^ W(t+8) ^ W(t+2) ^ W(t), 1)
+
+#define SHA_ROUND(t, input, fn, constant, A, B, C, D, E) do { \
+	unsigned int TEMP = input(t); setW(t, TEMP); \
+	E += TEMP + SHA_ROL(A,5) + (fn) + (constant); \
+	B = SHA_ROR(B, 2); } while (0)
+
+#define T_0_15(t, A, B, C, D, E)  SHA_ROUND(t, SHA_SRC, (((C^D)&B)^D) , 0x5a827999, A, B, C, D, E )
+#define T_16_19(t, A, B, C, D, E) SHA_ROUND(t, SHA_MIX, (((C^D)&B)^D) , 0x5a827999, A, B, C, D, E )
+#define T_20_39(t, A, B, C, D, E) SHA_ROUND(t, SHA_MIX, (B^C^D) , 0x6ed9eba1, A, B, C, D, E )
+#define T_40_59(t, A, B, C, D, E) SHA_ROUND(t, SHA_MIX, ((B&C)+(D&(B^C))) , 0x8f1bbcdc, A, B, C, D, E )
+#define T_60_79(t, A, B, C, D, E) SHA_ROUND(t, SHA_MIX, (B^C^D) , 0xca62c1d6, A, B, C, D, E )
+
+static void blk_SHA1Block(struct fio_sha1_ctx *ctx, const unsigned int *data)
+{
+	unsigned int A,B,C,D,E;
+	unsigned int array[16];
+
+	A = ctx->H[0];
+	B = ctx->H[1];
+	C = ctx->H[2];
+	D = ctx->H[3];
+	E = ctx->H[4];
+
+	/* Round 1 - iterations 0-15 take their input from 'data' */
+	T_0_15( 0, A, B, C, D, E);
+	T_0_15( 1, E, A, B, C, D);
+	T_0_15( 2, D, E, A, B, C);
+	T_0_15( 3, C, D, E, A, B);
+	T_0_15( 4, B, C, D, E, A);
+	T_0_15( 5, A, B, C, D, E);
+	T_0_15( 6, E, A, B, C, D);
+	T_0_15( 7, D, E, A, B, C);
+	T_0_15( 8, C, D, E, A, B);
+	T_0_15( 9, B, C, D, E, A);
+	T_0_15(10, A, B, C, D, E);
+	T_0_15(11, E, A, B, C, D);
+	T_0_15(12, D, E, A, B, C);
+	T_0_15(13, C, D, E, A, B);
+	T_0_15(14, B, C, D, E, A);
+	T_0_15(15, A, B, C, D, E);
+
+	/* Round 1 - tail. 
Input from 512-bit mixing array */
+	T_16_19(16, E, A, B, C, D);
+	T_16_19(17, D, E, A, B, C);
+	T_16_19(18, C, D, E, A, B);
+	T_16_19(19, B, C, D, E, A);
+
+	/* Round 2 */
+	T_20_39(20, A, B, C, D, E);
+	T_20_39(21, E, A, B, C, D);
+	T_20_39(22, D, E, A, B, C);
+	T_20_39(23, C, D, E, A, B);
+	T_20_39(24, B, C, D, E, A);
+	T_20_39(25, A, B, C, D, E);
+	T_20_39(26, E, A, B, C, D);
+	T_20_39(27, D, E, A, B, C);
+	T_20_39(28, C, D, E, A, B);
+	T_20_39(29, B, C, D, E, A);
+	T_20_39(30, A, B, C, D, E);
+	T_20_39(31, E, A, B, C, D);
+	T_20_39(32, D, E, A, B, C);
+	T_20_39(33, C, D, E, A, B);
+	T_20_39(34, B, C, D, E, A);
+	T_20_39(35, A, B, C, D, E);
+	T_20_39(36, E, A, B, C, D);
+	T_20_39(37, D, E, A, B, C);
+	T_20_39(38, C, D, E, A, B);
+	T_20_39(39, B, C, D, E, A);
+
+	/* Round 3 */
+	T_40_59(40, A, B, C, D, E);
+	T_40_59(41, E, A, B, C, D);
+	T_40_59(42, D, E, A, B, C);
+	T_40_59(43, C, D, E, A, B);
+	T_40_59(44, B, C, D, E, A);
+	T_40_59(45, A, B, C, D, E);
+	T_40_59(46, E, A, B, C, D);
+	T_40_59(47, D, E, A, B, C);
+	T_40_59(48, C, D, E, A, B);
+	T_40_59(49, B, C, D, E, A);
+	T_40_59(50, A, B, C, D, E);
+	T_40_59(51, E, A, B, C, D);
+	T_40_59(52, D, E, A, B, C);
+	T_40_59(53, C, D, E, A, B);
+	T_40_59(54, B, C, D, E, A);
+	T_40_59(55, A, B, C, D, E);
+	T_40_59(56, E, A, B, C, D);
+	T_40_59(57, D, E, A, B, C);
+	T_40_59(58, C, D, E, A, B);
+	T_40_59(59, B, C, D, E, A);
+
+	/* Round 4 */
+	T_60_79(60, A, B, C, D, E);
+	T_60_79(61, E, A, B, C, D);
+	T_60_79(62, D, E, A, B, C);
+	T_60_79(63, C, D, E, A, B);
+	T_60_79(64, B, C, D, E, A);
+	T_60_79(65, A, B, C, D, E);
+	T_60_79(66, E, A, B, C, D);
+	T_60_79(67, D, E, A, B, C);
+	T_60_79(68, C, D, E, A, B);
+	T_60_79(69, B, C, D, E, A);
+	T_60_79(70, A, B, C, D, E);
+	T_60_79(71, E, A, B, C, D);
+	T_60_79(72, D, E, A, B, C);
+	T_60_79(73, C, D, E, A, B);
+	T_60_79(74, B, C, D, E, A);
+	T_60_79(75, A, B, C, D, E);
+	T_60_79(76, E, A, B, C, D);
+	T_60_79(77, D, E, A, B, C);
+	T_60_79(78, C, D, E, A, B);
+	T_60_79(79, B, C, D, E, A);
+
+	ctx->H[0] += A;
+	ctx->H[1] += B;
+	ctx->H[2] += C;
+	ctx->H[3] += D;
+	ctx->H[4] += E;
+}
diff --git a/crc/sha1.h b/crc/sha1.h
new file mode 100644
index 0000000..416199b
--- /dev/null
+++ b/crc/sha1.h
@@ -0,0 +1,22 @@
+#ifndef FIO_SHA1
+#define FIO_SHA1
+
+#include <inttypes.h>
+
+/*
+ * Based on the Mozilla SHA1 (see mozilla-sha1/sha1.h),
+ * optimized to do word accesses rather than byte accesses,
+ * and to avoid unnecessary copies into the context array.
+ */
+
+struct fio_sha1_ctx {
+	uint32_t *H;
+	unsigned int W[16];
+	unsigned long long size;
+};
+
+void fio_sha1_init(struct fio_sha1_ctx *);
+void fio_sha1_update(struct fio_sha1_ctx *, const void *dataIn, unsigned long len);
+void fio_sha1_final(struct fio_sha1_ctx *);
+
+#endif
diff --git a/crc/sha256.c b/crc/sha256.c
new file mode 100644
index 0000000..2b39c42
--- /dev/null
+++ b/crc/sha256.c
@@ -0,0 +1,292 @@
+/*
+ * Cryptographic API.
+ *
+ * SHA-256, as specified in
+ * http://csrc.nist.gov/cryptval/shs/sha256-384-512.pdf
+ *
+ * SHA-256 code by Jean-Luc Cooke .
+ *
+ * Copyright (c) Jean-Luc Cooke
+ * Copyright (c) Andrew McDonald
+ * Copyright (c) 2002 James Morris
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option)
+ * any later version. 
+ *
+ */
+#include <string.h>
+
+#include "../lib/bswap.h"
+#include "sha256.h"
+
+#define SHA256_DIGEST_SIZE	32
+#define SHA256_HMAC_BLOCK_SIZE	64
+
+static inline uint32_t Ch(uint32_t x, uint32_t y, uint32_t z)
+{
+	return z ^ (x & (y ^ z));
+}
+
+static inline uint32_t Maj(uint32_t x, uint32_t y, uint32_t z)
+{
+	return (x & y) | (z & (x | y));
+}
+
+#define e0(x)	(ror32(x, 2) ^ ror32(x,13) ^ ror32(x,22))
+#define e1(x)	(ror32(x, 6) ^ ror32(x,11) ^ ror32(x,25))
+#define s0(x)	(ror32(x, 7) ^ ror32(x,18) ^ (x >> 3))
+#define s1(x)	(ror32(x,17) ^ ror32(x,19) ^ (x >> 10))
+
+#define H0	0x6a09e667
+#define H1	0xbb67ae85
+#define H2	0x3c6ef372
+#define H3	0xa54ff53a
+#define H4	0x510e527f
+#define H5	0x9b05688c
+#define H6	0x1f83d9ab
+#define H7	0x5be0cd19
+
+static inline uint32_t ror32(uint32_t word, unsigned int shift)
+{
+	return (word >> shift) | (word << (32 - shift));
+}
+
+static inline void LOAD_OP(int I, uint32_t *W, const uint8_t *input)
+{
+	W[I] = __be32_to_cpu(((uint32_t *)(input))[I]);
+}
+
+static inline void BLEND_OP(int I, uint32_t *W)
+{
+	W[I] = s1(W[I-2]) + W[I-7] + s0(W[I-15]) + W[I-16];
+}
+
+static void sha256_transform(uint32_t *state, const uint8_t *input)
+{
+	uint32_t a, b, c, d, e, f, g, h, t1, t2;
+	uint32_t W[64];
+	int i;
+
+	/* load the input */
+	for (i = 0; i < 16; i++)
+		LOAD_OP(i, W, input);
+
+	/* now blend */
+	for (i = 16; i < 64; i++)
+		BLEND_OP(i, W);
+
+	/* load the state into our registers */
+	a=state[0];  b=state[1];  c=state[2];  d=state[3];
+	e=state[4];  f=state[5];  g=state[6];  h=state[7];
+
+	/* now iterate */
+	t1 = h + e1(e) + Ch(e,f,g) + 0x428a2f98 + W[ 0];
+	t2 = e0(a) + Maj(a,b,c); d+=t1; h=t1+t2;
+	t1 = g + e1(d) + Ch(d,e,f) + 0x71374491 + W[ 1];
+	t2 = e0(h) + Maj(h,a,b); c+=t1; g=t1+t2;
+	t1 = f + e1(c) + Ch(c,d,e) + 0xb5c0fbcf + W[ 2];
+	t2 = e0(g) + Maj(g,h,a); b+=t1; f=t1+t2;
+	t1 = e + e1(b) + Ch(b,c,d) + 0xe9b5dba5 + W[ 3];
+	t2 = e0(f) + Maj(f,g,h); a+=t1; e=t1+t2;
+	t1 = d + e1(a) + Ch(a,b,c) + 0x3956c25b + W[ 4];
+	t2 = e0(e) + Maj(e,f,g); h+=t1; d=t1+t2;
+	t1 = c + e1(h) + Ch(h,a,b) + 0x59f111f1 + W[ 5];
+	t2 = e0(d) + Maj(d,e,f); g+=t1; c=t1+t2;
+	t1 = b + e1(g) + Ch(g,h,a) + 0x923f82a4 + W[ 6];
+	t2 = e0(c) + Maj(c,d,e); f+=t1; b=t1+t2;
+	t1 = a + e1(f) + Ch(f,g,h) + 0xab1c5ed5 + W[ 7];
+	t2 = e0(b) + Maj(b,c,d); e+=t1; a=t1+t2;
+
+	t1 = h + e1(e) + Ch(e,f,g) + 0xd807aa98 + W[ 8];
+	t2 = e0(a) + Maj(a,b,c); d+=t1; h=t1+t2;
+	t1 = g + e1(d) + Ch(d,e,f) + 0x12835b01 + W[ 9];
+	t2 = e0(h) + Maj(h,a,b); c+=t1; g=t1+t2;
+	t1 = f + e1(c) + Ch(c,d,e) + 0x243185be + W[10];
+	t2 = e0(g) + Maj(g,h,a); b+=t1; f=t1+t2;
+	t1 = e + e1(b) + Ch(b,c,d) + 0x550c7dc3 + W[11];
+	t2 = e0(f) + Maj(f,g,h); a+=t1; e=t1+t2;
+	t1 = d + e1(a) + Ch(a,b,c) + 0x72be5d74 + W[12];
+	t2 = e0(e) + Maj(e,f,g); h+=t1; d=t1+t2;
+	t1 = c + e1(h) + Ch(h,a,b) + 0x80deb1fe + W[13];
+	t2 = e0(d) + Maj(d,e,f); g+=t1; c=t1+t2;
+	t1 = b + e1(g) + Ch(g,h,a) + 0x9bdc06a7 + W[14];
+	t2 = e0(c) + Maj(c,d,e); f+=t1; b=t1+t2;
+	t1 = a + e1(f) + Ch(f,g,h) + 0xc19bf174 + W[15];
+	t2 = e0(b) + Maj(b,c,d); e+=t1; a=t1+t2;
+
+	t1 = h + e1(e) + Ch(e,f,g) + 0xe49b69c1 + W[16];
+	t2 = e0(a) + Maj(a,b,c); d+=t1; h=t1+t2;
+	t1 = g + e1(d) + Ch(d,e,f) + 0xefbe4786 + W[17];
+	t2 = e0(h) + Maj(h,a,b); c+=t1; g=t1+t2;
+	t1 = f + e1(c) + Ch(c,d,e) + 0x0fc19dc6 + W[18];
+	t2 = e0(g) + Maj(g,h,a); b+=t1; f=t1+t2;
+	t1 = e + e1(b) + Ch(b,c,d) + 0x240ca1cc + W[19];
+	t2 = e0(f) + Maj(f,g,h); a+=t1; e=t1+t2;
+	t1 = d + e1(a) + Ch(a,b,c) + 0x2de92c6f + W[20];
+	t2 = e0(e) + Maj(e,f,g); h+=t1; d=t1+t2;
+	t1 = c 
+ e1(h) + Ch(h,a,b) + 0x4a7484aa + W[21]; + t2 = e0(d) + Maj(d,e,f); g+=t1; c=t1+t2; + t1 = b + e1(g) + Ch(g,h,a) + 0x5cb0a9dc + W[22]; + t2 = e0(c) + Maj(c,d,e); f+=t1; b=t1+t2; + t1 = a + e1(f) + Ch(f,g,h) + 0x76f988da + W[23]; + t2 = e0(b) + Maj(b,c,d); e+=t1; a=t1+t2; + + t1 = h + e1(e) + Ch(e,f,g) + 0x983e5152 + W[24]; + t2 = e0(a) + Maj(a,b,c); d+=t1; h=t1+t2; + t1 = g + e1(d) + Ch(d,e,f) + 0xa831c66d + W[25]; + t2 = e0(h) + Maj(h,a,b); c+=t1; g=t1+t2; + t1 = f + e1(c) + Ch(c,d,e) + 0xb00327c8 + W[26]; + t2 = e0(g) + Maj(g,h,a); b+=t1; f=t1+t2; + t1 = e + e1(b) + Ch(b,c,d) + 0xbf597fc7 + W[27]; + t2 = e0(f) + Maj(f,g,h); a+=t1; e=t1+t2; + t1 = d + e1(a) + Ch(a,b,c) + 0xc6e00bf3 + W[28]; + t2 = e0(e) + Maj(e,f,g); h+=t1; d=t1+t2; + t1 = c + e1(h) + Ch(h,a,b) + 0xd5a79147 + W[29]; + t2 = e0(d) + Maj(d,e,f); g+=t1; c=t1+t2; + t1 = b + e1(g) + Ch(g,h,a) + 0x06ca6351 + W[30]; + t2 = e0(c) + Maj(c,d,e); f+=t1; b=t1+t2; + t1 = a + e1(f) + Ch(f,g,h) + 0x14292967 + W[31]; + t2 = e0(b) + Maj(b,c,d); e+=t1; a=t1+t2; + + t1 = h + e1(e) + Ch(e,f,g) + 0x27b70a85 + W[32]; + t2 = e0(a) + Maj(a,b,c); d+=t1; h=t1+t2; + t1 = g + e1(d) + Ch(d,e,f) + 0x2e1b2138 + W[33]; + t2 = e0(h) + Maj(h,a,b); c+=t1; g=t1+t2; + t1 = f + e1(c) + Ch(c,d,e) + 0x4d2c6dfc + W[34]; + t2 = e0(g) + Maj(g,h,a); b+=t1; f=t1+t2; + t1 = e + e1(b) + Ch(b,c,d) + 0x53380d13 + W[35]; + t2 = e0(f) + Maj(f,g,h); a+=t1; e=t1+t2; + t1 = d + e1(a) + Ch(a,b,c) + 0x650a7354 + W[36]; + t2 = e0(e) + Maj(e,f,g); h+=t1; d=t1+t2; + t1 = c + e1(h) + Ch(h,a,b) + 0x766a0abb + W[37]; + t2 = e0(d) + Maj(d,e,f); g+=t1; c=t1+t2; + t1 = b + e1(g) + Ch(g,h,a) + 0x81c2c92e + W[38]; + t2 = e0(c) + Maj(c,d,e); f+=t1; b=t1+t2; + t1 = a + e1(f) + Ch(f,g,h) + 0x92722c85 + W[39]; + t2 = e0(b) + Maj(b,c,d); e+=t1; a=t1+t2; + + t1 = h + e1(e) + Ch(e,f,g) + 0xa2bfe8a1 + W[40]; + t2 = e0(a) + Maj(a,b,c); d+=t1; h=t1+t2; + t1 = g + e1(d) + Ch(d,e,f) + 0xa81a664b + W[41]; + t2 = e0(h) + Maj(h,a,b); c+=t1; g=t1+t2; + t1 = f + e1(c) + Ch(c,d,e) + 0xc24b8b70 + W[42]; + t2 = e0(g) + Maj(g,h,a); b+=t1; f=t1+t2; + t1 = e + e1(b) + Ch(b,c,d) + 0xc76c51a3 + W[43]; + t2 = e0(f) + Maj(f,g,h); a+=t1; e=t1+t2; + t1 = d + e1(a) + Ch(a,b,c) + 0xd192e819 + W[44]; + t2 = e0(e) + Maj(e,f,g); h+=t1; d=t1+t2; + t1 = c + e1(h) + Ch(h,a,b) + 0xd6990624 + W[45]; + t2 = e0(d) + Maj(d,e,f); g+=t1; c=t1+t2; + t1 = b + e1(g) + Ch(g,h,a) + 0xf40e3585 + W[46]; + t2 = e0(c) + Maj(c,d,e); f+=t1; b=t1+t2; + t1 = a + e1(f) + Ch(f,g,h) + 0x106aa070 + W[47]; + t2 = e0(b) + Maj(b,c,d); e+=t1; a=t1+t2; + + t1 = h + e1(e) + Ch(e,f,g) + 0x19a4c116 + W[48]; + t2 = e0(a) + Maj(a,b,c); d+=t1; h=t1+t2; + t1 = g + e1(d) + Ch(d,e,f) + 0x1e376c08 + W[49]; + t2 = e0(h) + Maj(h,a,b); c+=t1; g=t1+t2; + t1 = f + e1(c) + Ch(c,d,e) + 0x2748774c + W[50]; + t2 = e0(g) + Maj(g,h,a); b+=t1; f=t1+t2; + t1 = e + e1(b) + Ch(b,c,d) + 0x34b0bcb5 + W[51]; + t2 = e0(f) + Maj(f,g,h); a+=t1; e=t1+t2; + t1 = d + e1(a) + Ch(a,b,c) + 0x391c0cb3 + W[52]; + t2 = e0(e) + Maj(e,f,g); h+=t1; d=t1+t2; + t1 = c + e1(h) + Ch(h,a,b) + 0x4ed8aa4a + W[53]; + t2 = e0(d) + Maj(d,e,f); g+=t1; c=t1+t2; + t1 = b + e1(g) + Ch(g,h,a) + 0x5b9cca4f + W[54]; + t2 = e0(c) + Maj(c,d,e); f+=t1; b=t1+t2; + t1 = a + e1(f) + Ch(f,g,h) + 0x682e6ff3 + W[55]; + t2 = e0(b) + Maj(b,c,d); e+=t1; a=t1+t2; + + t1 = h + e1(e) + Ch(e,f,g) + 0x748f82ee + W[56]; + t2 = e0(a) + Maj(a,b,c); d+=t1; h=t1+t2; + t1 = g + e1(d) + Ch(d,e,f) + 0x78a5636f + W[57]; + t2 = e0(h) + Maj(h,a,b); c+=t1; g=t1+t2; + t1 = f + e1(c) + Ch(c,d,e) + 0x84c87814 + W[58]; + t2 = e0(g) + 
Maj(g,h,a); b+=t1; f=t1+t2; + t1 = e + e1(b) + Ch(b,c,d) + 0x8cc70208 + W[59]; + t2 = e0(f) + Maj(f,g,h); a+=t1; e=t1+t2; + t1 = d + e1(a) + Ch(a,b,c) + 0x90befffa + W[60]; + t2 = e0(e) + Maj(e,f,g); h+=t1; d=t1+t2; + t1 = c + e1(h) + Ch(h,a,b) + 0xa4506ceb + W[61]; + t2 = e0(d) + Maj(d,e,f); g+=t1; c=t1+t2; + t1 = b + e1(g) + Ch(g,h,a) + 0xbef9a3f7 + W[62]; + t2 = e0(c) + Maj(c,d,e); f+=t1; b=t1+t2; + t1 = a + e1(f) + Ch(f,g,h) + 0xc67178f2 + W[63]; + t2 = e0(b) + Maj(b,c,d); e+=t1; a=t1+t2; + + state[0] += a; state[1] += b; state[2] += c; state[3] += d; + state[4] += e; state[5] += f; state[6] += g; state[7] += h; + + /* clear any sensitive info... */ + a = b = c = d = e = f = g = h = t1 = t2 = 0; + memset(W, 0, 64 * sizeof(uint32_t)); +} + +void fio_sha256_init(struct fio_sha256_ctx *sctx) +{ + sctx->state[0] = H0; + sctx->state[1] = H1; + sctx->state[2] = H2; + sctx->state[3] = H3; + sctx->state[4] = H4; + sctx->state[5] = H5; + sctx->state[6] = H6; + sctx->state[7] = H7; + sctx->count = 0; +} + +void fio_sha256_update(struct fio_sha256_ctx *sctx, const uint8_t *data, + unsigned int len) +{ + unsigned int partial, done; + const uint8_t *src; + + partial = sctx->count & 0x3f; + sctx->count += len; + done = 0; + src = data; + + if ((partial + len) > 63) { + if (partial) { + done = -partial; + memcpy(sctx->buf + partial, data, done + 64); + src = sctx->buf; + } + + do { + sha256_transform(sctx->state, src); + done += 64; + src = data + done; + } while (done + 63 < len); + + partial = 0; + } + memcpy(sctx->buf + partial, src, len - done); +} + +void fio_sha256_final(struct fio_sha256_ctx *sctx) +{ + uint64_t bits; + unsigned int index, pad_len; + int i; + static const uint8_t padding[64] = { 0x80, }; + + /* Save number of bits */ + bits = (uint64_t) sctx->count << 3; + + /* Pad out to 56 mod 64. */ + index = sctx->count & 0x3f; + pad_len = (index < 56) ? (56 - index) : ((64+56) - index); + fio_sha256_update(sctx, padding, pad_len); + + /* Append length (before padding) */ + fio_sha256_update(sctx, (const uint8_t *)&bits, sizeof(bits)); + + /* Store state in digest */ + for (i = 0; i < 8; i++) + sctx->buf[i] = sctx->state[i]; +} diff --git a/crc/sha256.h b/crc/sha256.h new file mode 100644 index 0000000..b904c7d --- /dev/null +++ b/crc/sha256.h @@ -0,0 +1,19 @@ +#ifndef FIO_SHA256_H +#define FIO_SHA256_H + +#include + +#define SHA256_DIGEST_SIZE 32 +#define SHA256_BLOCK_SIZE 64 + +struct fio_sha256_ctx { + uint32_t count; + uint32_t state[SHA256_DIGEST_SIZE / 4]; + uint8_t *buf; +}; + +void fio_sha256_init(struct fio_sha256_ctx *); +void fio_sha256_update(struct fio_sha256_ctx *, const uint8_t *, unsigned int); +void fio_sha256_final(struct fio_sha256_ctx *); + +#endif diff --git a/crc/sha3.c b/crc/sha3.c new file mode 100644 index 0000000..c136550 --- /dev/null +++ b/crc/sha3.c @@ -0,0 +1,172 @@ +/* + * Cryptographic API. + * + * SHA-3, as specified in + * http://nvlpubs.nist.gov/nistpubs/FIPS/NIST.FIPS.202.pdf + * + * SHA-3 code by Jeff Garzik + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the Free + * Software Foundation; either version 2 of the License, or (at your option)• + * any later version. 
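Usage note (not part of the patch): this API leaves all buffer management to the caller, who must point ctx.buf at 64 bytes of storage for the block buffer before init; fio_sha256_final() then folds the state words into that buffer rather than serializing a big-endian digest. A minimal sketch, mirroring t_sha256() in crc/test.c later in this patch (the function name here is hypothetical):

    #include "crc/sha256.h"

    /* Sketch: caller owns the 64-byte work/output area. */
    void example_sha256(const uint8_t *data, unsigned int len)
    {
        uint8_t out[64];
        struct fio_sha256_ctx ctx = { .buf = out };

        fio_sha256_init(&ctx);
        fio_sha256_update(&ctx, data, len);
        fio_sha256_final(&ctx);	/* state words land in out[] */
    }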
diff --git a/crc/sha3.c b/crc/sha3.c
new file mode 100644
index 0000000..c136550
--- /dev/null
+++ b/crc/sha3.c
@@ -0,0 +1,172 @@
+/*
+ * Cryptographic API.
+ *
+ * SHA-3, as specified in
+ * http://nvlpubs.nist.gov/nistpubs/FIPS/NIST.FIPS.202.pdf
+ *
+ * SHA-3 code by Jeff Garzik
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option)
+ * any later version.
+ *
+ */
+#include <string.h>
+
+#include "../os/os.h"
+
+#include "sha3.h"
+
+#define KECCAK_ROUNDS 24
+
+#define ROTL64(x, y) (((x) << (y)) | ((x) >> (64 - (y))))
+
+static const uint64_t keccakf_rndc[24] = {
+	0x0000000000000001ULL, 0x0000000000008082ULL, 0x800000000000808aULL,
+	0x8000000080008000ULL, 0x000000000000808bULL, 0x0000000080000001ULL,
+	0x8000000080008081ULL, 0x8000000000008009ULL, 0x000000000000008aULL,
+	0x0000000000000088ULL, 0x0000000080008009ULL, 0x000000008000000aULL,
+	0x000000008000808bULL, 0x800000000000008bULL, 0x8000000000008089ULL,
+	0x8000000000008003ULL, 0x8000000000008002ULL, 0x8000000000000080ULL,
+	0x000000000000800aULL, 0x800000008000000aULL, 0x8000000080008081ULL,
+	0x8000000000008080ULL, 0x0000000080000001ULL, 0x8000000080008008ULL
+};
+
+static const int keccakf_rotc[24] = {
+	1, 3, 6, 10, 15, 21, 28, 36, 45, 55, 2, 14,
+	27, 41, 56, 8, 25, 43, 62, 18, 39, 61, 20, 44
+};
+
+static const int keccakf_piln[24] = {
+	10, 7, 11, 17, 18, 3, 5, 16, 8, 21, 24, 4,
+	15, 23, 19, 13, 12, 2, 20, 14, 22, 9, 6, 1
+};
+
+/* update the state with given number of rounds */
+
+static void keccakf(uint64_t st[25])
+{
+	int i, j, round;
+	uint64_t t, bc[5];
+
+	for (round = 0; round < KECCAK_ROUNDS; round++) {
+
+		/* Theta */
+		for (i = 0; i < 5; i++)
+			bc[i] = st[i] ^ st[i + 5] ^ st[i + 10] ^ st[i + 15]
+				^ st[i + 20];
+
+		for (i = 0; i < 5; i++) {
+			t = bc[(i + 4) % 5] ^ ROTL64(bc[(i + 1) % 5], 1);
+			for (j = 0; j < 25; j += 5)
+				st[j + i] ^= t;
+		}
+
+		/* Rho Pi */
+		t = st[1];
+		for (i = 0; i < 24; i++) {
+			j = keccakf_piln[i];
+			bc[0] = st[j];
+			st[j] = ROTL64(t, keccakf_rotc[i]);
+			t = bc[0];
+		}
+
+		/* Chi */
+		for (j = 0; j < 25; j += 5) {
+			for (i = 0; i < 5; i++)
+				bc[i] = st[j + i];
+			for (i = 0; i < 5; i++)
+				st[j + i] ^= (~bc[(i + 1) % 5]) &
+					     bc[(i + 2) % 5];
+		}
+
+		/* Iota */
+		st[0] ^= keccakf_rndc[round];
+	}
+}
+
+static void fio_sha3_init(struct fio_sha3_ctx *sctx, unsigned int digest_sz)
+{
+	memset(sctx->st, 0, sizeof(sctx->st));
+	sctx->md_len = digest_sz;
+	sctx->rsiz = 200 - 2 * digest_sz;
+	sctx->rsizw = sctx->rsiz / 8;
+	sctx->partial = 0;
+	memset(sctx->buf, 0, sizeof(sctx->buf));
+}
+
+void fio_sha3_224_init(struct fio_sha3_ctx *sctx)
+{
+	fio_sha3_init(sctx, SHA3_224_DIGEST_SIZE);
+}
+
+void fio_sha3_256_init(struct fio_sha3_ctx *sctx)
+{
+	fio_sha3_init(sctx, SHA3_256_DIGEST_SIZE);
+}
+
+void fio_sha3_384_init(struct fio_sha3_ctx *sctx)
+{
+	fio_sha3_init(sctx, SHA3_384_DIGEST_SIZE);
+}
+
+void fio_sha3_512_init(struct fio_sha3_ctx *sctx)
+{
+	fio_sha3_init(sctx, SHA3_512_DIGEST_SIZE);
+}
+
+int fio_sha3_update(struct fio_sha3_ctx *sctx, const uint8_t *data,
+		    unsigned int len)
+{
+	unsigned int done;
+	const uint8_t *src;
+
+	done = 0;
+	src = data;
+
+	if ((sctx->partial + len) > (sctx->rsiz - 1)) {
+		if (sctx->partial) {
+			done = -sctx->partial;
+			memcpy(sctx->buf + sctx->partial, data,
+			       done + sctx->rsiz);
+			src = sctx->buf;
+		}
+
+		do {
+			unsigned int i;
+
+			for (i = 0; i < sctx->rsizw; i++)
+				sctx->st[i] ^= ((uint64_t *) src)[i];
+			keccakf(sctx->st);
+
+			done += sctx->rsiz;
+			src = data + done;
+		} while (done + (sctx->rsiz - 1) < len);
+
+		sctx->partial = 0;
+	}
+	memcpy(sctx->buf + sctx->partial, src, len - done);
+	sctx->partial += (len - done);
+
+	return 0;
+}
+
+void fio_sha3_final(struct fio_sha3_ctx *sctx)
+{
+	unsigned int i, inlen = sctx->partial;
+
+	sctx->buf[inlen++] = 0x06;
+	memset(sctx->buf + inlen, 0, sctx->rsiz - inlen);
+	sctx->buf[sctx->rsiz - 1] |= 0x80;
+
+	for (i = 0; i < sctx->rsizw; i++)
+		sctx->st[i] ^= ((uint64_t *) sctx->buf)[i];
+
+	keccakf(sctx->st);
+
+	for (i = 0; i < sctx->rsizw; i++)
+		sctx->st[i] = cpu_to_le64(sctx->st[i]);
+
+	memcpy(sctx->sha, sctx->st, sctx->md_len);
+}
diff --git a/crc/sha3.h b/crc/sha3.h
new file mode 100644
index 0000000..9f1970a
--- /dev/null
+++ b/crc/sha3.h
@@ -0,0 +1,42 @@
+/*
+ * Common values for SHA-3 algorithms
+ */
+#ifndef __CRYPTO_SHA3_H__
+#define __CRYPTO_SHA3_H__
+
+#include <inttypes.h>
+
+#define SHA3_224_DIGEST_SIZE (224 / 8)
+#define SHA3_224_BLOCK_SIZE (200 - 2 * SHA3_224_DIGEST_SIZE)
+
+#define SHA3_256_DIGEST_SIZE (256 / 8)
+#define SHA3_256_BLOCK_SIZE (200 - 2 * SHA3_256_DIGEST_SIZE)
+
+#define SHA3_384_DIGEST_SIZE (384 / 8)
+#define SHA3_384_BLOCK_SIZE (200 - 2 * SHA3_384_DIGEST_SIZE)
+
+#define SHA3_512_DIGEST_SIZE (512 / 8)
+#define SHA3_512_BLOCK_SIZE (200 - 2 * SHA3_512_DIGEST_SIZE)
+
+struct fio_sha3_ctx {
+	uint64_t st[25];
+	unsigned int md_len;
+	unsigned int rsiz;
+	unsigned int rsizw;
+
+	unsigned int partial;
+	uint8_t buf[SHA3_224_BLOCK_SIZE];
+
+	uint8_t *sha;
+};
+
+void fio_sha3_224_init(struct fio_sha3_ctx *sctx);
+void fio_sha3_256_init(struct fio_sha3_ctx *sctx);
+void fio_sha3_384_init(struct fio_sha3_ctx *sctx);
+void fio_sha3_512_init(struct fio_sha3_ctx *sctx);
+
+int fio_sha3_update(struct fio_sha3_ctx *sctx, const uint8_t *data,
+		    unsigned int len);
+void fio_sha3_final(struct fio_sha3_ctx *sctx);
+
+#endif
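The block sizes above follow the FIPS 202 sponge construction: the rate is what the capacity leaves over, rsiz = 200 - 2 * digest size bytes, so SHA3-256 absorbs 200 - 64 = 136 bytes per keccakf() permutation. A minimal sketch of driving it, mirroring t_sha3_256() in crc/test.c later in this patch (the wrapper name is hypothetical):

    #include "crc/sha3.h"

    /* Caller owns the digest buffer; fio_sha3_final() pads
     * (0x06 ... 0x80), permutes, and copies md_len bytes into it. */
    void example_sha3_256(const uint8_t *data, unsigned int len,
                          uint8_t out[SHA3_256_DIGEST_SIZE])
    {
        struct fio_sha3_ctx ctx = { .sha = out };

        fio_sha3_256_init(&ctx);	/* rsiz = 200 - 2*32 = 136 */
        fio_sha3_update(&ctx, data, len);
        fio_sha3_final(&ctx);
    }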
diff --git a/crc/sha512.c b/crc/sha512.c
new file mode 100644
index 0000000..f599cdc
--- /dev/null
+++ b/crc/sha512.c
@@ -0,0 +1,197 @@
+/* SHA-512 code by Jean-Luc Cooke
+ *
+ * Copyright (c) Jean-Luc Cooke
+ * Copyright (c) Andrew McDonald
+ * Copyright (c) 2003 Kyle McMartin
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; either version 2, or (at your option) any
+ * later version.
+ *
+ */
+
+#include <string.h>
+
+#include "../lib/bswap.h"
+#include "sha512.h"
+
+#define SHA384_DIGEST_SIZE 48
+#define SHA512_DIGEST_SIZE 64
+#define SHA384_HMAC_BLOCK_SIZE 128
+#define SHA512_HMAC_BLOCK_SIZE 128
+
+static inline uint64_t Ch(uint64_t x, uint64_t y, uint64_t z)
+{
+	return z ^ (x & (y ^ z));
+}
+
+static inline uint64_t Maj(uint64_t x, uint64_t y, uint64_t z)
+{
+	return (x & y) | (z & (x | y));
+}
+
+static inline uint64_t RORuint64_t(uint64_t x, uint64_t y)
+{
+	return (x >> y) | (x << (64 - y));
+}
+
+static const uint64_t sha512_K[80] = {
+	0x428a2f98d728ae22ULL, 0x7137449123ef65cdULL, 0xb5c0fbcfec4d3b2fULL,
+	0xe9b5dba58189dbbcULL, 0x3956c25bf348b538ULL, 0x59f111f1b605d019ULL,
+	0x923f82a4af194f9bULL, 0xab1c5ed5da6d8118ULL, 0xd807aa98a3030242ULL,
+	0x12835b0145706fbeULL, 0x243185be4ee4b28cULL, 0x550c7dc3d5ffb4e2ULL,
+	0x72be5d74f27b896fULL, 0x80deb1fe3b1696b1ULL, 0x9bdc06a725c71235ULL,
+	0xc19bf174cf692694ULL, 0xe49b69c19ef14ad2ULL, 0xefbe4786384f25e3ULL,
+	0x0fc19dc68b8cd5b5ULL, 0x240ca1cc77ac9c65ULL, 0x2de92c6f592b0275ULL,
+	0x4a7484aa6ea6e483ULL, 0x5cb0a9dcbd41fbd4ULL, 0x76f988da831153b5ULL,
+	0x983e5152ee66dfabULL, 0xa831c66d2db43210ULL, 0xb00327c898fb213fULL,
+	0xbf597fc7beef0ee4ULL, 0xc6e00bf33da88fc2ULL, 0xd5a79147930aa725ULL,
+	0x06ca6351e003826fULL, 0x142929670a0e6e70ULL, 0x27b70a8546d22ffcULL,
+	0x2e1b21385c26c926ULL, 0x4d2c6dfc5ac42aedULL, 0x53380d139d95b3dfULL,
+	0x650a73548baf63deULL, 0x766a0abb3c77b2a8ULL, 0x81c2c92e47edaee6ULL,
+	0x92722c851482353bULL, 0xa2bfe8a14cf10364ULL, 0xa81a664bbc423001ULL,
+	0xc24b8b70d0f89791ULL, 0xc76c51a30654be30ULL, 0xd192e819d6ef5218ULL,
+	0xd69906245565a910ULL, 0xf40e35855771202aULL, 0x106aa07032bbd1b8ULL,
+	0x19a4c116b8d2d0c8ULL, 0x1e376c085141ab53ULL, 0x2748774cdf8eeb99ULL,
+	0x34b0bcb5e19b48a8ULL, 0x391c0cb3c5c95a63ULL, 0x4ed8aa4ae3418acbULL,
+	0x5b9cca4f7763e373ULL, 0x682e6ff3d6b2b8a3ULL, 0x748f82ee5defb2fcULL,
+	0x78a5636f43172f60ULL, 0x84c87814a1f0ab72ULL, 0x8cc702081a6439ecULL,
+	0x90befffa23631e28ULL, 0xa4506cebde82bde9ULL, 0xbef9a3f7b2c67915ULL,
+	0xc67178f2e372532bULL, 0xca273eceea26619cULL, 0xd186b8c721c0c207ULL,
+	0xeada7dd6cde0eb1eULL, 0xf57d4f7fee6ed178ULL, 0x06f067aa72176fbaULL,
+	0x0a637dc5a2c898a6ULL, 0x113f9804bef90daeULL, 0x1b710b35131c471bULL,
+	0x28db77f523047d84ULL, 0x32caab7b40c72493ULL, 0x3c9ebe0a15c9bebcULL,
+	0x431d67c49c100d4cULL, 0x4cc5d4becb3e42b6ULL, 0x597f299cfc657e2aULL,
+	0x5fcb6fab3ad6faecULL, 0x6c44198c4a475817ULL,
+};
+
+#define e0(x) (RORuint64_t(x,28) ^ RORuint64_t(x,34) ^ RORuint64_t(x,39))
+#define e1(x) (RORuint64_t(x,14) ^ RORuint64_t(x,18) ^ RORuint64_t(x,41))
+#define s0(x) (RORuint64_t(x, 1) ^ RORuint64_t(x, 8) ^ (x >> 7))
+#define s1(x) (RORuint64_t(x,19) ^ RORuint64_t(x,61) ^ (x >> 6))
+
+/* H* initial state for SHA-512 */
+#define H0 0x6a09e667f3bcc908ULL
+#define H1 0xbb67ae8584caa73bULL
+#define H2 0x3c6ef372fe94f82bULL
+#define H3 0xa54ff53a5f1d36f1ULL
+#define H4 0x510e527fade682d1ULL
+#define H5 0x9b05688c2b3e6c1fULL
+#define H6 0x1f83d9abfb41bd6bULL
+#define H7 0x5be0cd19137e2179ULL
+
+/* H'* initial state for SHA-384 */
+#define HP0 0xcbbb9d5dc1059ed8ULL
+#define HP1 0x629a292a367cd507ULL
+#define HP2 0x9159015a3070dd17ULL
+#define HP3 0x152fecd8f70e5939ULL
+#define HP4 0x67332667ffc00b31ULL
+#define HP5 0x8eb44a8768581511ULL
+#define HP6 0xdb0c2e0d64f98fa7ULL
+#define HP7 0x47b5481dbefa4fa4ULL
+
+static inline void LOAD_OP(int I, uint64_t *W, const uint8_t *input)
+{
+	W[I] = __be64_to_cpu( ((uint64_t *)(input))[I] );
+}
+
+static inline void BLEND_OP(int I, uint64_t *W)
+{
+	W[I] = s1(W[I-2]) + W[I-7] + s0(W[I-15]) + W[I-16];
+}
+
+static void sha512_transform(uint64_t *state, uint64_t *W, const uint8_t *input)
+{
+	uint64_t a, b, c, d, e, f, g, h, t1, t2;
+
+	int i;
+
+	/* load the input */
+	for (i = 0; i < 16; i++)
+		LOAD_OP(i, W, input);
+
+	for (i = 16; i < 80; i++)
+		BLEND_OP(i, W);
+
+	/* load the state into our registers */
+	a=state[0]; b=state[1]; c=state[2]; d=state[3];
+	e=state[4]; f=state[5]; g=state[6]; h=state[7];
+
+	/* now iterate */
+	for (i=0; i<80; i+=8) {
+		t1 = h + e1(e) + Ch(e,f,g) + sha512_K[i  ] + W[i  ];
+		t2 = e0(a) + Maj(a,b,c); d+=t1; h=t1+t2;
+		t1 = g + e1(d) + Ch(d,e,f) + sha512_K[i+1] + W[i+1];
+		t2 = e0(h) + Maj(h,a,b); c+=t1; g=t1+t2;
+		t1 = f + e1(c) + Ch(c,d,e) + sha512_K[i+2] + W[i+2];
+		t2 = e0(g) + Maj(g,h,a); b+=t1; f=t1+t2;
+		t1 = e + e1(b) + Ch(b,c,d) + sha512_K[i+3] + W[i+3];
+		t2 = e0(f) + Maj(f,g,h); a+=t1; e=t1+t2;
+		t1 = d + e1(a) + Ch(a,b,c) + sha512_K[i+4] + W[i+4];
+		t2 = e0(e) + Maj(e,f,g); h+=t1; d=t1+t2;
+		t1 = c + e1(h) + Ch(h,a,b) + sha512_K[i+5] + W[i+5];
+		t2 = e0(d) + Maj(d,e,f); g+=t1; c=t1+t2;
+		t1 = b + e1(g) + Ch(g,h,a) + sha512_K[i+6] + W[i+6];
+		t2 = e0(c) + Maj(c,d,e); f+=t1; b=t1+t2;
+		t1 = a + e1(f) + Ch(f,g,h) + sha512_K[i+7] + W[i+7];
+		t2 = e0(b) + Maj(b,c,d); e+=t1; a=t1+t2;
+	}
+
+	state[0] += a; state[1] += b; state[2] += c; state[3] += d;
+	state[4] += e; state[5] += f; state[6] += g; state[7] += h;
+
+	/* erase our data */
+	a = b = c = d = e = f = g = h = t1 = t2 = 0;
+}
+
+void fio_sha512_init(struct fio_sha512_ctx *sctx)
+{
+	sctx->state[0] = H0;
+	sctx->state[1] = H1;
+	sctx->state[2] = H2;
+	sctx->state[3] = H3;
+	sctx->state[4] = H4;
+	sctx->state[5] = H5;
+	sctx->state[6] = H6;
+	sctx->state[7] = H7;
+	sctx->count[0] = sctx->count[1] = sctx->count[2] = sctx->count[3] = 0;
+}
+
+void fio_sha512_update(struct fio_sha512_ctx *sctx, const uint8_t *data,
+		       unsigned int len)
+{
+	unsigned int i, idx, part_len;
+
+	/* Compute number of bytes mod 128 */
+	idx = (unsigned int)((sctx->count[0] >> 3) & 0x7F);
+
+	/* Update number of bits */
+	if ((sctx->count[0] += (len << 3)) < (len << 3)) {
+		if ((sctx->count[1] += 1) < 1)
+			if ((sctx->count[2] += 1) < 1)
+				sctx->count[3]++;
+		sctx->count[1] += (len >> 29);
+	}
+
+	part_len = 128 - idx;
+
+	/* Transform as many times as possible. */
+	if (len >= part_len) {
+		memcpy(&sctx->buf[idx], data, part_len);
+		sha512_transform(sctx->state, sctx->W, sctx->buf);
+
+		for (i = part_len; i + 127 < len; i+=128)
+			sha512_transform(sctx->state, sctx->W, &data[i]);
+
+		idx = 0;
+	} else {
+		i = 0;
+	}
+
+	/* Buffer remaining input */
+	memcpy(&sctx->buf[idx], &data[i], len - i);
+
+	/* erase our data */
+	memset(sctx->W, 0, sizeof(sctx->W));
+}
diff --git a/crc/sha512.h b/crc/sha512.h
new file mode 100644
index 0000000..5adf627
--- /dev/null
+++ b/crc/sha512.h
@@ -0,0 +1,16 @@
+#ifndef FIO_SHA512_H
+#define FIO_SHA512_H
+
+#include <inttypes.h>
+
+struct fio_sha512_ctx {
+	uint64_t state[8];
+	uint32_t count[4];
+	uint8_t *buf;
+	uint64_t W[80];
+};
+
+void fio_sha512_init(struct fio_sha512_ctx *);
+void fio_sha512_update(struct fio_sha512_ctx *, const uint8_t *, unsigned int);
+
+#endif
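Note that the header declares no fio_sha512_final(): this port carries only init/update, so after the last update the running digest is left in ctx.state[] (crc/test.c below times updates alone). A minimal sketch of the calls as t_sha512() makes them (hypothetical wrapper name):

    #include "crc/sha512.h"

    /* ctx.buf must point at 128 bytes of caller storage for the
     * block buffer; there is no final step in this port. */
    void example_sha512(const uint8_t *data, unsigned int len)
    {
        uint8_t blk[128];
        struct fio_sha512_ctx ctx = { .buf = blk };

        fio_sha512_init(&ctx);
        fio_sha512_update(&ctx, data, len);
    }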
diff --git a/crc/test.c b/crc/test.c
new file mode 100644
index 0000000..b57f07a
--- /dev/null
+++ b/crc/test.c
@@ -0,0 +1,432 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <inttypes.h>
+
+#include "../gettime.h"
+#include "../fio_time.h"
+#include "../lib/rand.h"
+#include "../os/os.h"
+
+#include "../crc/md5.h"
+#include "../crc/crc64.h"
+#include "../crc/crc32.h"
+#include "../crc/crc32c.h"
+#include "../crc/crc16.h"
+#include "../crc/crc7.h"
+#include "../crc/sha1.h"
+#include "../crc/sha256.h"
+#include "../crc/sha512.h"
+#include "../crc/sha3.h"
+#include "../crc/xxhash.h"
+#include "../crc/murmur3.h"
+#include "../crc/fnv.h"
+#include "../hash.h"
+
+#include "test.h"
+
+#define CHUNK 131072U
+#define NR_CHUNKS 2048U
+
+struct test_type {
+	const char *name;
+	unsigned int mask;
+	void (*fn)(struct test_type *, void *, size_t);
+	uint32_t output;
+};
+
+enum {
+	T_MD5 = 1U << 0,
+	T_CRC64 = 1U << 1,
+	T_CRC32 = 1U << 2,
+	T_CRC32C = 1U << 3,
+	T_CRC16 = 1U << 4,
+	T_CRC7 = 1U << 5,
+	T_SHA1 = 1U << 6,
+	T_SHA256 = 1U << 7,
+	T_SHA512 = 1U << 8,
+	T_XXHASH = 1U << 9,
+	T_MURMUR3 = 1U << 10,
+	T_JHASH = 1U << 11,
+	T_FNV = 1U << 12,
+	T_SHA3_224 = 1U << 13,
+	T_SHA3_256 = 1U << 14,
+	T_SHA3_384 = 1U << 15,
+	T_SHA3_512 = 1U << 16,
+};
+
+static void t_md5(struct test_type *t, void *buf, size_t size)
+{
+	uint32_t digest[4];
+	struct fio_md5_ctx ctx = { .hash = digest };
+	int i;
+
+	fio_md5_init(&ctx);
+
+	for (i = 0; i < NR_CHUNKS; i++) {
+		fio_md5_update(&ctx, buf, size);
+		fio_md5_final(&ctx);
+	}
+}
+
+static void t_crc64(struct test_type *t, void *buf, size_t size)
+{
+	int i;
+
+	for (i = 0; i < NR_CHUNKS; i++)
+		t->output += fio_crc64(buf, size);
+}
+
+static void t_crc32(struct test_type *t, void *buf, size_t size)
+{
+	int i;
+
+	for (i = 0; i < NR_CHUNKS; i++)
+		t->output += fio_crc32(buf, size);
+}
+
+static void t_crc32c(struct test_type *t, void *buf, size_t size)
+{
+	int i;
+
+	for (i = 0; i < NR_CHUNKS; i++)
+		t->output += fio_crc32c(buf, size);
+}
+
+static void t_crc16(struct test_type *t, void *buf, size_t size)
+{
+	int i;
+
+	for (i = 0; i < NR_CHUNKS; i++)
+		t->output += fio_crc16(buf, size);
+}
+
+static void t_crc7(struct test_type *t, void *buf, size_t size)
+{
+	int i;
+
+	for (i = 0; i < NR_CHUNKS; i++)
+		t->output += fio_crc7(buf, size);
+}
+
+static void t_sha1(struct test_type *t, void *buf, size_t size)
+{
+	uint32_t sha[5];
+	struct fio_sha1_ctx ctx = { .H = sha };
+	int i;
+
+	fio_sha1_init(&ctx);
+
+	for (i = 0; i < NR_CHUNKS; i++) {
+		fio_sha1_update(&ctx, buf, size);
+		fio_sha1_final(&ctx);
+	}
+}
+
+static void t_sha256(struct test_type *t, void *buf, size_t size)
+{
+	uint8_t sha[64];
+	struct fio_sha256_ctx ctx = { .buf = sha };
+	int i;
+
+	fio_sha256_init(&ctx);
+
+	for (i = 0; i < NR_CHUNKS; i++) {
+		fio_sha256_update(&ctx, buf, size);
+		fio_sha256_final(&ctx);
+	}
+}
+
+static void t_sha512(struct test_type *t, void *buf, size_t size)
+{
+	uint8_t sha[128];
+	struct fio_sha512_ctx ctx = { .buf = sha };
+	int i;
+
+	fio_sha512_init(&ctx);
+
+	for (i = 0; i < NR_CHUNKS; i++)
+		fio_sha512_update(&ctx, buf, size);
+}
+
+static void t_sha3_224(struct test_type *t, void *buf, size_t size)
+{
+	uint8_t sha[SHA3_224_DIGEST_SIZE];
+	struct fio_sha3_ctx ctx = { .sha = sha };
+	int i;
+
+	fio_sha3_224_init(&ctx);
+
+	for (i = 0; i < NR_CHUNKS; i++) {
+		fio_sha3_update(&ctx, buf, size);
+		fio_sha3_final(&ctx);
+	}
+}
+
+static void t_sha3_256(struct test_type *t, void *buf, size_t size)
+{
+	uint8_t sha[SHA3_256_DIGEST_SIZE];
+	struct fio_sha3_ctx ctx = { .sha = sha };
+	int i;
+
+	fio_sha3_256_init(&ctx);
+
+	for (i = 0; i < NR_CHUNKS; i++) {
+		fio_sha3_update(&ctx, buf, size);
+		fio_sha3_final(&ctx);
+	}
+}
+
+static void t_sha3_384(struct test_type *t, void *buf, size_t size)
+{
+	uint8_t sha[SHA3_384_DIGEST_SIZE];
+	struct fio_sha3_ctx ctx = { .sha = sha };
+	int i;
+
+	fio_sha3_384_init(&ctx);
+
+	for (i = 0; i < NR_CHUNKS; i++) {
+		fio_sha3_update(&ctx, buf, size);
+		fio_sha3_final(&ctx);
+	}
+}
+
+static void t_sha3_512(struct test_type *t, void *buf, size_t size)
+{
+	uint8_t sha[SHA3_512_DIGEST_SIZE];
+	struct fio_sha3_ctx ctx = { .sha = sha };
+	int i;
+
+	fio_sha3_512_init(&ctx);
+
+	for (i = 0; i < NR_CHUNKS; i++) {
+		fio_sha3_update(&ctx, buf, size);
+		fio_sha3_final(&ctx);
+	}
+}
+
+static void t_murmur3(struct test_type *t, void *buf, size_t size)
+{
+	int i;
+
+	for (i = 0; i < NR_CHUNKS; i++)
+		t->output += murmurhash3(buf, size, 0x8989);
+}
+
+static void t_jhash(struct test_type *t, void *buf, size_t size)
+{
+	int i;
+
+	for (i = 0; i < NR_CHUNKS; i++)
+		t->output += jhash(buf, size, 0x8989);
+}
+
+static void t_fnv(struct test_type *t, void *buf, size_t size)
+{
+	int i;
+
+	for (i = 0; i < NR_CHUNKS; i++)
+		t->output += fnv(buf, size, 0x8989);
+}
+
+static void t_xxhash(struct test_type *t, void *buf, size_t size)
+{
+	void *state;
+	int i;
+
+	state = XXH32_init(0x8989);
+
+	for (i = 0; i < NR_CHUNKS; i++)
+		XXH32_update(state, buf, size);
+
+	t->output = XXH32_digest(state);
+}
+
+static struct test_type t[] = {
+	{
+		.name = "md5",
+		.mask = T_MD5,
+		.fn = t_md5,
+	},
+	{
+		.name = "crc64",
+		.mask = T_CRC64,
+		.fn = t_crc64,
+	},
+	{
+		.name = "crc32",
+		.mask = T_CRC32,
+		.fn = t_crc32,
+	},
+	{
+		.name = "crc32c",
+		.mask = T_CRC32C,
+		.fn = t_crc32c,
+	},
+	{
+		.name = "crc16",
+		.mask = T_CRC16,
+		.fn = t_crc16,
+	},
+	{
+		.name = "crc7",
+		.mask = T_CRC7,
+		.fn = t_crc7,
+	},
+	{
+		.name = "sha1",
+		.mask = T_SHA1,
+		.fn = t_sha1,
+	},
+	{
+		.name = "sha256",
+		.mask = T_SHA256,
+		.fn = t_sha256,
+	},
+	{
+		.name = "sha512",
+		.mask = T_SHA512,
+		.fn = t_sha512,
+	},
+	{
+		.name = "xxhash",
+		.mask = T_XXHASH,
+		.fn = t_xxhash,
+	},
+	{
+		.name = "murmur3",
+		.mask = T_MURMUR3,
+		.fn = t_murmur3,
+	},
+	{
+		.name = "jhash",
+		.mask = T_JHASH,
+		.fn = t_jhash,
+	},
+	{
+		.name = "fnv",
+		.mask = T_FNV,
+		.fn = t_fnv,
+	},
+	{
+		.name = "sha3-224",
+		.mask = T_SHA3_224,
+		.fn = t_sha3_224,
+	},
+	{
+		.name = "sha3-256",
+		.mask = T_SHA3_256,
+		.fn = t_sha3_256,
+	},
+	{
+		.name = "sha3-384",
+		.mask = T_SHA3_384,
+		.fn = t_sha3_384,
+	},
+	{
+		.name = "sha3-512",
+		.mask = T_SHA3_512,
+		.fn = t_sha3_512,
+	},
+	{
+		.name = NULL,
+	},
+};
+
+static unsigned int get_test_mask(const char *type)
+{
+	char *ostr, *str = strdup(type);
+	unsigned int mask;
+	char *name;
+	int i;
+
+	ostr = str;
+	mask = 0;
+	while ((name = strsep(&str, ",")) != NULL) {
+		for (i = 0; t[i].name; i++) {
+			if (!strcmp(t[i].name, name)) {
+				mask |= t[i].mask;
+				break;
+			}
+		}
+	}
+
+	free(ostr);
+	return mask;
+}
+
+static int list_types(void)
+{
+	int i;
+
+	for (i = 0; t[i].name; i++)
+		printf("%s\n", t[i].name);
+
+	return 1;
+}
+
+int fio_crctest(const char *type)
+{
+	unsigned int test_mask = 0;
+	uint64_t mb = CHUNK * NR_CHUNKS;
+	struct frand_state state;
+	int i, first = 1;
+	void *buf;
+
+	crc32c_arm64_probe();
+	crc32c_intel_probe();
+
+	if (!type)
+		test_mask = ~0U;
+	else if (!strcmp(type, "help") || !strcmp(type, "list"))
+		return list_types();
+	else
+		test_mask = get_test_mask(type);
+
+	if (!test_mask) {
+		fprintf(stderr, "fio: unknown hash `%s`. Available:\n", type);
+		return list_types();
+	}
+
+	buf = malloc(CHUNK);
+	init_rand_seed(&state, 0x8989, 0);
+	fill_random_buf(&state, buf, CHUNK);
+
+	for (i = 0; t[i].name; i++) {
+		struct timespec ts;
+		double mb_sec;
+		uint64_t usec;
+		char pre[3];
+
+		if (!(t[i].mask & test_mask))
+			continue;
+
+		/*
+		 * For first run, make sure CPUs are spun up and that
+		 * we've touched the data.
+		 */
+		if (first) {
+			usec_spin(100000);
+			t[i].fn(&t[i], buf, CHUNK);
+		}
+
+		fio_gettime(&ts, NULL);
+		t[i].fn(&t[i], buf, CHUNK);
+		usec = utime_since_now(&ts);
+
+		if (usec) {
+			mb_sec = (double) mb / (double) usec;
+			mb_sec /= (1.024 * 1.024);
+			if (strlen(t[i].name) >= 7)
+				sprintf(pre, "\t");
+			else
+				sprintf(pre, "\t\t");
+			printf("%s:%s%8.2f MiB/sec\n", t[i].name, pre, mb_sec);
+		} else
+			printf("%s:inf MiB/sec\n", t[i].name);
+		first = 0;
+	}
+
+	free(buf);
+	return 0;
+}
diff --git a/crc/test.h b/crc/test.h
new file mode 100644
index 0000000..2b52d6a
--- /dev/null
+++ b/crc/test.h
@@ -0,0 +1,6 @@
+#ifndef FIO_CRC_TEST_H
+#define FIO_CRC_TEST_H
+
+int fio_crctest(const char *type);
+
+#endif
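Usage note (an assumption, not stated in the patch): fio exposes this micro-benchmark through its command line as the --crctest option; with no argument every hash is timed, and "help" or "list" prints the available names. For example:

    fio --crctest
    fio --crctest=md5,crc32c,sha256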
diff --git a/crc/xxhash.c b/crc/xxhash.c
new file mode 100644
index 0000000..4736c52
--- /dev/null
+++ b/crc/xxhash.c
@@ -0,0 +1,421 @@
+/*
+xxHash - Fast Hash algorithm
+Copyright (C) 2012-2014, Yann Collet.
+BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php)
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+* Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+* Redistributions in binary form must reproduce the above
+copyright notice, this list of conditions and the following disclaimer
+in the documentation and/or other materials provided with the
+distribution.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+You can contact the author at :
+- xxHash source repository : http://code.google.com/p/xxhash/
+*/
+
+
+//**************************************
+// Tuning parameters
+//**************************************
+// Unaligned memory access is automatically enabled for "common" CPUs, such as x86.
+// For other CPUs, the compiler will be more cautious, and insert extra code to ensure aligned access is respected.
+// If you know your target CPU supports unaligned memory access, you may want to force this option manually to improve performance.
+// You can also enable this parameter if you know your input data will always be aligned (boundaries of 4, for uint32_t).
+#if defined(__ARM_FEATURE_UNALIGNED) || defined(__i386) || defined(_M_IX86) || defined(__x86_64__) || defined(_M_X64)
+# define XXH_USE_UNALIGNED_ACCESS 1
+#endif
+
+// XXH_ACCEPT_NULL_INPUT_POINTER :
+// If the input pointer is a null pointer, xxHash default behavior is to trigger a memory access error, since it is a bad pointer.
+// When this option is enabled, xxHash output for null input pointers will be the same as a null-length input.
+// This option has a very small performance cost (only measurable on small inputs).
+// By default, this option is disabled. To enable it, uncomment below define :
+//#define XXH_ACCEPT_NULL_INPUT_POINTER 1
+
+// XXH_FORCE_NATIVE_FORMAT :
+// By default, the xxHash library provides endian-independent hash values, based on little-endian convention.
+// Results are therefore identical for little-endian and big-endian CPUs.
+// This comes at a performance cost for big-endian CPUs, since some swapping is required to emulate little-endian format.
+// Should endian-independence be of no importance for your application, you may set the #define below to 1.
+// It will improve speed for big-endian CPUs.
+// This option has no impact on little-endian CPUs.
+#define XXH_FORCE_NATIVE_FORMAT 0
+
+
+//**************************************
+// Includes & Memory related functions
+//**************************************
+#include "xxhash.h"
+#include <stdlib.h>
+#include <string.h>
+
+
+#if defined(__GNUC__) && !defined(XXH_USE_UNALIGNED_ACCESS)
+# define _PACKED __attribute__ ((packed))
+#else
+# define _PACKED
+#endif
+
+#if !defined(XXH_USE_UNALIGNED_ACCESS) && !defined(__GNUC__)
+# ifdef __IBMC__
+#  pragma pack(1)
+# else
+#  pragma pack(push, 1)
+# endif
+#endif
+
+typedef struct _uint32_t_S { uint32_t v; } _PACKED uint32_t_S;
+
+#if !defined(XXH_USE_UNALIGNED_ACCESS) && !defined(__GNUC__)
+# pragma pack(pop)
+#endif
+
+#define A32(x) (((uint32_t_S *)(x))->v)
+
+
+//***************************************
+// Compiler-specific Functions and Macros
+//***************************************
+#define GCC_VERSION (__GNUC__ * 100 + __GNUC_MINOR__)
+
+// Note : although _rotl exists for minGW (GCC under windows), performance seems poor
+#if defined(_MSC_VER)
+# define XXH_rotl32(x,r) _rotl(x,r)
+#else
+# define XXH_rotl32(x,r) ((x << r) | (x >> (32 - r)))
+#endif
+
+#if defined(_MSC_VER) // Visual Studio
+# define XXH_swap32 _byteswap_ulong
+#elif GCC_VERSION >= 403
+# define XXH_swap32 __builtin_bswap32
+#else
+static inline uint32_t XXH_swap32 (uint32_t x)
+{
+	return ((x << 24) & 0xff000000 ) |
+	       ((x <<  8) & 0x00ff0000 ) |
+	       ((x >>  8) & 0x0000ff00 ) |
+	       ((x >> 24) & 0x000000ff );
+}
+#endif
+
+
+//**************************************
+// Constants
+//**************************************
+#define PRIME32_1 2654435761U
+#define PRIME32_2 2246822519U
+#define PRIME32_3 3266489917U
+#define PRIME32_4  668265263U
+#define PRIME32_5  374761393U
+
+
+//**************************************
+// Architecture Macros
+//**************************************
+typedef enum { XXH_bigEndian=0, XXH_littleEndian=1 } XXH_endianess;
+#ifndef XXH_CPU_LITTLE_ENDIAN // It is possible to define XXH_CPU_LITTLE_ENDIAN externally, for example using a compiler switch
+static const int one = 1;
+# define XXH_CPU_LITTLE_ENDIAN (*(char*)(&one))
+#endif
+
+
+//**************************************
+// Macros
+//**************************************
+#define XXH_STATIC_ASSERT(c) { enum { XXH_static_assert = 1/(!!(c)) }; } // use only *after* variable declarations
+
+
+//****************************
+// Memory reads
+//****************************
+typedef enum { XXH_aligned, XXH_unaligned } XXH_alignment;
+
+static uint32_t XXH_readLE32_align(const uint32_t* ptr, XXH_endianess endian, XXH_alignment align)
+{
+	if (align==XXH_unaligned)
+		return endian==XXH_littleEndian ? A32(ptr) : XXH_swap32(A32(ptr));
+	else
+		return endian==XXH_littleEndian ? *ptr : XXH_swap32(*ptr);
+}
+
+static uint32_t XXH_readLE32(const uint32_t* ptr, XXH_endianess endian) { return XXH_readLE32_align(ptr, endian, XXH_unaligned); }
+
+
+//****************************
+// Simple Hash Functions
+//****************************
+static uint32_t XXH32_endian_align(const void* input, int len, uint32_t seed, XXH_endianess endian, XXH_alignment align)
+{
+	const uint8_t *p = (const uint8_t *)input;
+	const uint8_t * const bEnd = p + len;
+	uint32_t h32;
+
+#ifdef XXH_ACCEPT_NULL_INPUT_POINTER
+	if (p==NULL) { len=0; p=(const uint8_t *)(size_t)16; }
+#endif
+
+	if (len>=16)
+	{
+		const uint8_t * const limit = bEnd - 16;
+		uint32_t v1 = seed + PRIME32_1 + PRIME32_2;
+		uint32_t v2 = seed + PRIME32_2;
+		uint32_t v3 = seed + 0;
+		uint32_t v4 = seed - PRIME32_1;
+
+		do
+		{
+			v1 += XXH_readLE32_align((const uint32_t*)p, endian, align) * PRIME32_2; v1 = XXH_rotl32(v1, 13); v1 *= PRIME32_1; p+=4;
+			v2 += XXH_readLE32_align((const uint32_t*)p, endian, align) * PRIME32_2; v2 = XXH_rotl32(v2, 13); v2 *= PRIME32_1; p+=4;
+			v3 += XXH_readLE32_align((const uint32_t*)p, endian, align) * PRIME32_2; v3 = XXH_rotl32(v3, 13); v3 *= PRIME32_1; p+=4;
+			v4 += XXH_readLE32_align((const uint32_t*)p, endian, align) * PRIME32_2; v4 = XXH_rotl32(v4, 13); v4 *= PRIME32_1; p+=4;
+		} while (p<=limit);
+
+		h32 = XXH_rotl32(v1, 1) + XXH_rotl32(v2, 7) + XXH_rotl32(v3, 12) + XXH_rotl32(v4, 18);
+	}
+	else
+	{
+		h32 = seed + PRIME32_5;
+	}
+
+	h32 += (uint32_t) len;
+
+	while (p<=bEnd-4)
+	{
+		h32 += XXH_readLE32_align((const uint32_t*)p, endian, align) * PRIME32_3;
+		h32 = XXH_rotl32(h32, 17) * PRIME32_4 ;
+		p+=4;
+	}
+
+	while (p<bEnd)
+	{
+		h32 += (*p) * PRIME32_5;
+		h32 = XXH_rotl32(h32, 11) * PRIME32_1 ;
+		p++;
+	}
+
+	h32 ^= h32 >> 15;
+	h32 *= PRIME32_2;
+	h32 ^= h32 >> 13;
+	h32 *= PRIME32_3;
+	h32 ^= h32 >> 16;
+
+	return h32;
+}
+
+
+uint32_t XXH32(const void* input, uint32_t len, uint32_t seed)
+{
+#if 0
+	// Simple version, good for code maintenance, but unfortunately slow for small inputs
+	void* state = XXH32_init(seed);
+	XXH32_update(state, input, len);
+	return XXH32_digest(state);
+#else
+	XXH_endianess endian_detected = (XXH_endianess)XXH_CPU_LITTLE_ENDIAN;
+
+# if !defined(XXH_USE_UNALIGNED_ACCESS)
+	if ((((size_t)input) & 3) == 0) // Input is aligned, let's leverage the speed advantage
+	{
+		if ((endian_detected==XXH_littleEndian) || XXH_FORCE_NATIVE_FORMAT)
+			return XXH32_endian_align(input, len, seed, XXH_littleEndian, XXH_aligned);
+		else
+			return XXH32_endian_align(input, len, seed, XXH_bigEndian, XXH_aligned);
+	}
+# endif
+
+	if ((endian_detected==XXH_littleEndian) || XXH_FORCE_NATIVE_FORMAT)
+		return XXH32_endian_align(input, len, seed, XXH_littleEndian, XXH_unaligned);
+	else
+		return XXH32_endian_align(input, len, seed, XXH_bigEndian, XXH_unaligned);
+#endif
+}
+
+
+//****************************
+// Advanced Hash Functions
+//****************************
+
+int XXH32_sizeofState(void)
+{
+	XXH_STATIC_ASSERT(XXH32_SIZEOFSTATE >= sizeof(struct XXH_state32_t)); // A compilation error here means XXH32_SIZEOFSTATE is not large enough
+	return sizeof(struct XXH_state32_t);
+}
+
+
+XXH_errorcode XXH32_resetState(void* state_in, uint32_t seed)
+{
+	struct XXH_state32_t * state = (struct XXH_state32_t *) state_in;
+	state->seed = seed;
+	state->v1 = seed + PRIME32_1 + PRIME32_2;
+	state->v2 = seed + PRIME32_2;
+	state->v3 = seed + 0;
+	state->v4 = seed - PRIME32_1;
+	state->total_len = 0;
+	state->memsize = 0;
+	return XXH_OK;
+}
+
+
+void* XXH32_init (uint32_t seed)
+{
+	void *state = malloc (sizeof(struct XXH_state32_t));
+	XXH32_resetState(state, seed);
+	return state;
+}
+
+
+static XXH_errorcode XXH32_update_endian (void* state_in, const void* input, int len, XXH_endianess endian)
+{
+	struct XXH_state32_t * state = (struct XXH_state32_t *) state_in;
+	const uint8_t *p = (const uint8_t *)input;
+	const uint8_t * const bEnd = p + len;
+
+#ifdef XXH_ACCEPT_NULL_INPUT_POINTER
+	if (input==NULL) return XXH_ERROR;
+#endif
+
+	state->total_len += len;
+
+	if (state->memsize + len < 16) // fill in tmp buffer
+	{
+		memcpy(state->memory + state->memsize, input, len);
+		state->memsize += len;
+		return XXH_OK;
+	}
+
+	if (state->memsize) // some data left from previous update
+	{
+		memcpy(state->memory + state->memsize, input, 16-state->memsize);
+		{
+			const uint32_t* p32 = (const uint32_t*)state->memory;
+			state->v1 += XXH_readLE32(p32, endian) * PRIME32_2; state->v1 = XXH_rotl32(state->v1, 13); state->v1 *= PRIME32_1; p32++;
+			state->v2 += XXH_readLE32(p32, endian) * PRIME32_2; state->v2 = XXH_rotl32(state->v2, 13); state->v2 *= PRIME32_1; p32++;
+			state->v3 += XXH_readLE32(p32, endian) * PRIME32_2; state->v3 = XXH_rotl32(state->v3, 13); state->v3 *= PRIME32_1; p32++;
+			state->v4 += XXH_readLE32(p32, endian) * PRIME32_2; state->v4 = XXH_rotl32(state->v4, 13); state->v4 *= PRIME32_1; p32++;
+		}
+		p += 16-state->memsize;
+		state->memsize = 0;
+	}
+
+	if (p <= bEnd-16)
+	{
+		const uint8_t * const limit = bEnd - 16;
+		uint32_t v1 = state->v1;
+		uint32_t v2 = state->v2;
+		uint32_t v3 = state->v3;
+		uint32_t v4 = state->v4;
+
+		do
+		{
+			v1 += XXH_readLE32((const uint32_t*)p, endian) * PRIME32_2; v1 = XXH_rotl32(v1, 13); v1 *= PRIME32_1; p+=4;
+			v2 += XXH_readLE32((const uint32_t*)p, endian) * PRIME32_2; v2 = XXH_rotl32(v2, 13); v2 *= PRIME32_1; p+=4;
+			v3 += XXH_readLE32((const uint32_t*)p, endian) * PRIME32_2; v3 = XXH_rotl32(v3, 13); v3 *= PRIME32_1; p+=4;
+			v4 += XXH_readLE32((const uint32_t*)p, endian) * PRIME32_2; v4 = XXH_rotl32(v4, 13); v4 *= PRIME32_1; p+=4;
+		} while (p<=limit);
+
+		state->v1 = v1;
+		state->v2 = v2;
+		state->v3 = v3;
+		state->v4 = v4;
+	}
+
+	if (p < bEnd)
+	{
+		memcpy(state->memory, p, bEnd-p);
+		state->memsize = (int)(bEnd-p);
+	}
+
+	return XXH_OK;
+}
+
+XXH_errorcode XXH32_update (void* state_in, const void* input, int len)
+{
+	XXH_endianess endian_detected = (XXH_endianess)XXH_CPU_LITTLE_ENDIAN;
+
+	if ((endian_detected==XXH_littleEndian) || XXH_FORCE_NATIVE_FORMAT)
+		return XXH32_update_endian(state_in, input, len, XXH_littleEndian);
+	else
+		return XXH32_update_endian(state_in, input, len, XXH_bigEndian);
+}
+
+
+
+static uint32_t XXH32_intermediateDigest_endian (void* state_in, XXH_endianess endian)
+{
+	struct XXH_state32_t * state = (struct XXH_state32_t *) state_in;
+	const uint8_t *p = (const uint8_t *)state->memory;
+	uint8_t * bEnd = (uint8_t *)state->memory + state->memsize;
+	uint32_t h32;
+
+	if (state->total_len >= 16)
+	{
+		h32 = XXH_rotl32(state->v1, 1) + XXH_rotl32(state->v2, 7) + XXH_rotl32(state->v3, 12) + XXH_rotl32(state->v4, 18);
+	}
+	else
+	{
+		h32 = state->seed + PRIME32_5;
+	}
+
+	h32 += (uint32_t) state->total_len;
+
+	while (p<=bEnd-4)
+	{
+		h32 += XXH_readLE32((const uint32_t*)p, endian) * PRIME32_3;
+		h32 = XXH_rotl32(h32, 17) * PRIME32_4;
+		p+=4;
+	}
+
+	while (p<bEnd)
+	{
+		h32 += (*p) * PRIME32_5;
+		h32 = XXH_rotl32(h32, 11) * PRIME32_1;
+		p++;
+	}
+
+	h32 ^= h32 >> 15;
+	h32 *= PRIME32_2;
+	h32 ^= h32 >> 13;
+	h32 *= PRIME32_3;
+	h32 ^= h32 >> 16;
+
+	return h32;
+}
+
+
+uint32_t XXH32_intermediateDigest (void* state_in)
+{
+	XXH_endianess endian_detected = (XXH_endianess)XXH_CPU_LITTLE_ENDIAN;
+
+	if ((endian_detected==XXH_littleEndian) || XXH_FORCE_NATIVE_FORMAT)
+		return XXH32_intermediateDigest_endian(state_in, XXH_littleEndian);
+	else
+		return XXH32_intermediateDigest_endian(state_in, XXH_bigEndian);
+}
+
+
+uint32_t XXH32_digest (void* state_in)
+{
+	uint32_t h32 = XXH32_intermediateDigest(state_in);
+
+	free(state_in);
+
+	return h32;
+}
diff --git a/crc/xxhash.h b/crc/xxhash.h
new file mode 100644
index 0000000..934c555
--- /dev/null
+++ b/crc/xxhash.h
@@ -0,0 +1,177 @@
+/*
+   xxHash - Fast Hash algorithm
+   Header File
+   Copyright (C) 2012-2014, Yann Collet.
+   BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php)
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+
+   * Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+   * Redistributions in binary form must reproduce the above
+   copyright notice, this list of conditions and the following disclaimer
+   in the documentation and/or other materials provided with the
+   distribution.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+   You can contact the author at :
+   - xxHash source repository : http://code.google.com/p/xxhash/
+*/
+
+/* Notice extracted from xxHash homepage :
+
+xxHash is an extremely fast Hash algorithm, running at RAM speed limits.
+It also successfully passes all tests from the SMHasher suite.
+
+Comparison (single thread, Windows Seven 32 bits, using SMHasher on a Core 2 Duo @3GHz)
+
+Name            Speed       Q.Score   Author
+xxHash          5.4 GB/s     10
+CrapWow         3.2 GB/s      2       Andrew
+MurmurHash 3a   2.7 GB/s     10       Austin Appleby
+SpookyHash      2.0 GB/s     10       Bob Jenkins
+SBox            1.4 GB/s      9       Bret Mulvey
+Lookup3         1.2 GB/s      9       Bob Jenkins
+SuperFastHash   1.2 GB/s      1       Paul Hsieh
+CityHash64      1.05 GB/s    10       Pike & Alakuijala
+FNV             0.55 GB/s     5       Fowler, Noll, Vo
+CRC32           0.43 GB/s     9
+MD5-32          0.33 GB/s    10       Ronald L. Rivest
+SHA1-32         0.28 GB/s    10
+
+Q.Score is a measure of quality of the hash function.
+It depends on successfully passing SMHasher test set.
+10 is a perfect score.
+*/
+
+#pragma once
+
+#if defined (__cplusplus)
+extern "C" {
+#endif
+
+#include <inttypes.h>
+
+struct XXH_state32_t
+{
+	uint64_t total_len;
+	uint32_t seed;
+	uint32_t v1;
+	uint32_t v2;
+	uint32_t v3;
+	uint32_t v4;
+	int memsize;
+	char memory[16];
+};
+
+//****************************
+// Type
+//****************************
+typedef enum { XXH_OK=0, XXH_ERROR } XXH_errorcode;
+
+
+
+//****************************
+// Simple Hash Functions
+//****************************
+
+uint32_t XXH32 (const void* input, uint32_t len, uint32_t seed);
+
+/*
+XXH32() :
+	Calculate the 32-bit hash of the sequence of length "len" stored at memory address "input".
+	The memory between input & input+len must be valid (allocated and read-accessible).
+	"seed" can be used to alter the result predictably.
+	This function successfully passes all SMHasher tests.
+	Speed on Core 2 Duo @ 3 GHz (single thread, SMHasher benchmark) : 5.4 GB/s
+	Note that "len" is type "int", which means it is limited to 2^31-1.
+	If your data is larger, use the advanced functions below.
+*/
+
+
+
+//****************************
+// Advanced Hash Functions
+//****************************
+
+void* XXH32_init (uint32_t seed);
+XXH_errorcode XXH32_update (void* state, const void* input, int len);
+uint32_t XXH32_digest (void* state);
+
+/*
+These functions calculate the xxhash of an input provided in several small packets,
+as opposed to an input provided as a single block.
+
+It must be started with :
+void* XXH32_init()
+The function returns a pointer which holds the state of calculation.
+
+This pointer must be provided as "void* state" parameter for XXH32_update().
+XXH32_update() can be called as many times as necessary.
+The user must provide a valid (allocated) input.
+The function returns an error code, with 0 meaning OK, and any other value meaning there is an error.
+Note that "len" is type "int", which means it is limited to 2^31-1.
+If your data is larger, it is recommended to chunk your data into blocks
+of size for example 2^30 (1GB) to avoid any "int" overflow issue.
+
+Finally, you can end the calculation anytime, by using XXH32_digest().
+This function returns the final 32-bit hash.
+You must provide the same "void* state" parameter created by XXH32_init().
+Memory will be freed by XXH32_digest().
+*/
+
+
+int XXH32_sizeofState(void);
+XXH_errorcode XXH32_resetState(void* state, uint32_t seed);
+
+#define XXH32_SIZEOFSTATE 48
+typedef struct { long long ll[(XXH32_SIZEOFSTATE+(sizeof(long long)-1))/sizeof(long long)]; } XXH32_stateSpace_t;
+/*
+These functions allow user applications to make their own allocation for the state.
+
+XXH32_sizeofState() is used to know how much space must be allocated for the xxHash 32-bit state.
+Note that the state must be aligned to access 'long long' fields. Memory must be allocated and referenced by a pointer.
+This pointer must then be provided as 'state' into XXH32_resetState(), which initializes the state.
+
+For static allocation purposes (such as allocation on stack, or freestanding systems without malloc()),
+use the structure XXH32_stateSpace_t, which will ensure that memory space is large enough and correctly aligned to access 'long long' fields.
+*/
+
+
+uint32_t XXH32_intermediateDigest (void* state);
+/*
+This function does the same as XXH32_digest(), generating a 32-bit hash,
+but preserves the memory context.
+This way, it becomes possible to generate intermediate hashes, and then continue feeding data with XXH32_update().
+To free the memory context, use XXH32_digest(), or free().
+*/
+
+
+
+//****************************
+// Deprecated function names
+//****************************
+// The following translations are provided to ease code transition
+// You are encouraged to no longer use these function names
+#define XXH32_feed XXH32_update
+#define XXH32_result XXH32_digest
+#define XXH32_getIntermediateResult XXH32_intermediateDigest
+
+
+
+#if defined (__cplusplus)
+}
+#endif
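Putting the streaming comments above into code, a minimal sketch (the seed value is the one crc/test.c uses; the function name is hypothetical):

    #include <stdio.h>
    #include "crc/xxhash.h"

    /* XXH32_init() malloc()s the state, XXH32_update() absorbs chunks,
     * and XXH32_digest() returns the hash and frees the state. */
    void example_xxh32(const void *a, int alen, const void *b, int blen)
    {
        void *state = XXH32_init(0x8989);
        uint32_t h;

        XXH32_update(state, a, alen);
        XXH32_update(state, b, blen);
        h = XXH32_digest(state);	/* state is freed here */
        printf("xxh32: 0x%08x\n", h);
    }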
diff --git a/debug.c b/debug.c
new file mode 100644
index 0000000..d1e2987
--- /dev/null
+++ b/debug.c
@@ -0,0 +1,18 @@
+#include <assert.h>
+#include <stdarg.h>
+
+#include "debug.h"
+#include "log.h"
+
+#ifdef FIO_INC_DEBUG
+void __dprint(int type, const char *str, ...)
+{
+	va_list args;
+
+	assert(type < FD_DEBUG_MAX);
+
+	va_start(args, str);
+	log_prevalist(type, str, args);
+	va_end(args);
+}
+#endif
diff --git a/debug.h b/debug.h
new file mode 100644
index 0000000..51b18de
--- /dev/null
+++ b/debug.h
@@ -0,0 +1,76 @@
+#ifndef FIO_DEBUG_H
+#define FIO_DEBUG_H
+
+#include "lib/types.h"
+
+enum {
+	FD_PROCESS = 0,
+	FD_FILE,
+	FD_IO,
+	FD_MEM,
+	FD_BLKTRACE,
+	FD_VERIFY,
+	FD_RANDOM,
+	FD_PARSE,
+	FD_DISKUTIL,
+	FD_JOB,
+	FD_MUTEX,
+	FD_PROFILE,
+	FD_TIME,
+	FD_NET,
+	FD_RATE,
+	FD_COMPRESS,
+	FD_STEADYSTATE,
+	FD_HELPERTHREAD,
+	FD_ZBD,
+	FD_DEBUG_MAX,
+};
+
+extern unsigned int fio_debug_jobno, *fio_debug_jobp, *fio_warned;
+
+static inline bool fio_did_warn(unsigned int mask)
+{
+	if (*fio_warned & mask)
+		return true;
+
+	*fio_warned |= mask;
+	return false;
+}
+
+enum {
+	FIO_WARN_ROOT_FLUSH = 1,
+	FIO_WARN_VERIFY_BUF = 2,
+	FIO_WARN_ZONED_BUG = 4,
+	FIO_WARN_IOLOG_DROP = 8,
+	FIO_WARN_FADVISE = 16,
+	FIO_WARN_BTRACE_ZERO = 32,
+};
+
+#ifdef FIO_INC_DEBUG
+struct debug_level {
+	const char *name;
+	const char *help;
+	unsigned long shift;
+	unsigned int jobno;
+};
+extern const struct debug_level debug_levels[];
+
+extern unsigned long fio_debug;
+
+void __dprint(int type, const char *str, ...) __attribute__((format (printf, 2, 3)));
+
+#define dprint(type, str, args...)			\
+	do {						\
+		if (((1 << type) & fio_debug) == 0)	\
+			break;				\
+		__dprint((type), (str), ##args);	\
+	} while (0)
+
+#else
+
+static inline void dprint(int type, const char *str, ...)
+{
+}
+#endif
+
+#endif
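How the macro is meant to be used: a call site names an FD_* channel and a printf-style format; with FIO_INC_DEBUG unset the call compiles away entirely, and at runtime it fires only when the channel's bit is set in fio_debug (fio sets those bits from --debug=..., e.g. --debug=io,file). A hypothetical call site:

    #include "debug.h"

    static void open_file_traced(const char *name)
    {
        /* emitted only when the FD_FILE bit is set in fio_debug */
        dprint(FD_FILE, "opening %s\n", name);
    }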
%d\n", du->path, ret == 1); + dus->s.sectors[0] = sectors[0]; + dus->s.sectors[1] = sectors[1]; + return ret != 11; +} + +static void update_io_tick_disk(struct disk_util *du) +{ + struct disk_util_stat __dus, *dus, *ldus; + struct timespec t; + + if (!du->users) + return; + if (get_io_ticks(du, &__dus)) + return; + + dus = &du->dus; + ldus = &du->last_dus; + + dus->s.sectors[0] += (__dus.s.sectors[0] - ldus->s.sectors[0]); + dus->s.sectors[1] += (__dus.s.sectors[1] - ldus->s.sectors[1]); + dus->s.ios[0] += (__dus.s.ios[0] - ldus->s.ios[0]); + dus->s.ios[1] += (__dus.s.ios[1] - ldus->s.ios[1]); + dus->s.merges[0] += (__dus.s.merges[0] - ldus->s.merges[0]); + dus->s.merges[1] += (__dus.s.merges[1] - ldus->s.merges[1]); + dus->s.ticks[0] += (__dus.s.ticks[0] - ldus->s.ticks[0]); + dus->s.ticks[1] += (__dus.s.ticks[1] - ldus->s.ticks[1]); + dus->s.io_ticks += (__dus.s.io_ticks - ldus->s.io_ticks); + dus->s.time_in_queue += (__dus.s.time_in_queue - ldus->s.time_in_queue); + + fio_gettime(&t, NULL); + dus->s.msec += mtime_since(&du->time, &t); + memcpy(&du->time, &t, sizeof(t)); + memcpy(&ldus->s, &__dus.s, sizeof(__dus.s)); +} + +int update_io_ticks(void) +{ + struct flist_head *entry; + struct disk_util *du; + int ret = 0; + + dprint(FD_DISKUTIL, "update io ticks\n"); + + fio_sem_down(disk_util_sem); + + if (!helper_should_exit()) { + flist_for_each(entry, &disk_list) { + du = flist_entry(entry, struct disk_util, list); + update_io_tick_disk(du); + } + } else + ret = 1; + + fio_sem_up(disk_util_sem); + return ret; +} + +static struct disk_util *disk_util_exists(int major, int minor) +{ + struct flist_head *entry; + struct disk_util *du; + + fio_sem_down(disk_util_sem); + + flist_for_each(entry, &disk_list) { + du = flist_entry(entry, struct disk_util, list); + + if (major == du->major && minor == du->minor) { + fio_sem_up(disk_util_sem); + return du; + } + } + + fio_sem_up(disk_util_sem); + return NULL; +} + +static int get_device_numbers(char *file_name, int *maj, int *min) +{ + struct stat st; + int majdev, mindev; + char tempname[PATH_MAX], *p; + + if (!lstat(file_name, &st)) { + if (S_ISBLK(st.st_mode)) { + majdev = major(st.st_rdev); + mindev = minor(st.st_rdev); + } else if (S_ISCHR(st.st_mode)) { + majdev = major(st.st_rdev); + mindev = minor(st.st_rdev); + if (fio_lookup_raw(st.st_rdev, &majdev, &mindev)) + return -1; + } else if (S_ISFIFO(st.st_mode)) + return -1; + else { + majdev = major(st.st_dev); + mindev = minor(st.st_dev); + } + } else { + /* + * must be a file, open "." 
in that path + */ + snprintf(tempname, ARRAY_SIZE(tempname), "%s", file_name); + p = dirname(tempname); + if (stat(p, &st)) { + perror("disk util stat"); + return -1; + } + + majdev = major(st.st_dev); + mindev = minor(st.st_dev); + } + + *min = mindev; + *maj = majdev; + + return 0; +} + +static int read_block_dev_entry(char *path, int *maj, int *min) +{ + char line[256], *p; + FILE *f; + + f = fopen(path, "r"); + if (!f) { + perror("open path"); + return 1; + } + + p = fgets(line, sizeof(line), f); + fclose(f); + + if (!p) + return 1; + + if (sscanf(p, "%u:%u", maj, min) != 2) + return 1; + + return 0; +} + +static void find_add_disk_slaves(struct thread_data *td, char *path, + struct disk_util *masterdu) +{ + DIR *dirhandle = NULL; + struct dirent *dirent = NULL; + char slavesdir[PATH_MAX], temppath[PATH_MAX], slavepath[PATH_MAX]; + struct disk_util *slavedu = NULL; + int majdev, mindev; + ssize_t linklen; + + sprintf(slavesdir, "%s/%s", path, "slaves"); + dirhandle = opendir(slavesdir); + if (!dirhandle) + return; + + while ((dirent = readdir(dirhandle)) != NULL) { + if (!strcmp(dirent->d_name, ".") || + !strcmp(dirent->d_name, "..")) + continue; + + nowarn_snprintf(temppath, sizeof(temppath), "%s/%s", slavesdir, + dirent->d_name); + /* Can we always assume that the slaves device entries + * are links to the real directories for the slave + * devices? + */ + linklen = readlink(temppath, slavepath, PATH_MAX - 1); + if (linklen < 0) { + perror("readlink() for slave device."); + closedir(dirhandle); + return; + } + slavepath[linklen] = '\0'; + + nowarn_snprintf(temppath, sizeof(temppath), "%s/%s/dev", + slavesdir, slavepath); + if (access(temppath, F_OK) != 0) + nowarn_snprintf(temppath, sizeof(temppath), + "%s/%s/device/dev", slavesdir, + slavepath); + if (read_block_dev_entry(temppath, &majdev, &mindev)) { + perror("Error getting slave device numbers"); + closedir(dirhandle); + return; + } + + /* + * See if this maj,min already exists + */ + slavedu = disk_util_exists(majdev, mindev); + if (slavedu) + continue; + + nowarn_snprintf(temppath, sizeof(temppath), "%s/%s", slavesdir, + slavepath); + __init_per_file_disk_util(td, majdev, mindev, temppath); + slavedu = disk_util_exists(majdev, mindev); + + /* Should probably use an assert here. slavedu should + * always be present at this point. 
*/ + if (slavedu) { + slavedu->users++; + flist_add_tail(&slavedu->slavelist, &masterdu->slaves); + } + } + + closedir(dirhandle); +} + +static struct disk_util *disk_util_add(struct thread_data *td, int majdev, + int mindev, char *path) +{ + struct disk_util *du, *__du; + struct flist_head *entry; + int l; + + dprint(FD_DISKUTIL, "add maj/min %d/%d: %s\n", majdev, mindev, path); + + du = smalloc(sizeof(*du)); + if (!du) + return NULL; + + DRD_IGNORE_VAR(du->users); + memset(du, 0, sizeof(*du)); + INIT_FLIST_HEAD(&du->list); + l = snprintf(du->path, sizeof(du->path), "%s/stat", path); + if (l < 0 || l >= sizeof(du->path)) { + log_err("constructed path \"%.100s[...]/stat\" larger than buffer (%zu bytes)\n", + path, sizeof(du->path) - 1); + sfree(du); + return NULL; + } + snprintf((char *) du->dus.name, ARRAY_SIZE(du->dus.name), "%s", + basename(path)); + du->sysfs_root = strdup(path); + du->major = majdev; + du->minor = mindev; + INIT_FLIST_HEAD(&du->slavelist); + INIT_FLIST_HEAD(&du->slaves); + du->lock = fio_sem_init(FIO_SEM_UNLOCKED); + du->users = 0; + + fio_sem_down(disk_util_sem); + + flist_for_each(entry, &disk_list) { + __du = flist_entry(entry, struct disk_util, list); + + dprint(FD_DISKUTIL, "found %s in list\n", __du->dus.name); + + if (!strcmp((char *) du->dus.name, (char *) __du->dus.name)) { + disk_util_free(du); + fio_sem_up(disk_util_sem); + return __du; + } + } + + dprint(FD_DISKUTIL, "add %s to list\n", du->dus.name); + + fio_gettime(&du->time, NULL); + get_io_ticks(du, &du->last_dus); + + flist_add_tail(&du->list, &disk_list); + fio_sem_up(disk_util_sem); + + find_add_disk_slaves(td, path, du); + return du; +} + +static int check_dev_match(int majdev, int mindev, char *path) +{ + int major, minor; + + if (read_block_dev_entry(path, &major, &minor)) + return 1; + + if (majdev == major && mindev == minor) + return 0; + + return 1; +} + +static int find_block_dir(int majdev, int mindev, char *path, int link_ok) +{ + struct dirent *dir; + struct stat st; + int found = 0; + DIR *D; + + D = opendir(path); + if (!D) + return 0; + + while ((dir = readdir(D)) != NULL) { + char full_path[257]; + + if (!strcmp(dir->d_name, ".") || !strcmp(dir->d_name, "..")) + continue; + + sprintf(full_path, "%s/%s", path, dir->d_name); + + if (!strcmp(dir->d_name, "dev")) { + if (!check_dev_match(majdev, mindev, full_path)) { + found = 1; + break; + } + } + + if (link_ok) { + if (stat(full_path, &st) == -1) { + perror("stat"); + break; + } + } else { + if (lstat(full_path, &st) == -1) { + perror("stat"); + break; + } + } + + if (!S_ISDIR(st.st_mode) || S_ISLNK(st.st_mode)) + continue; + + found = find_block_dir(majdev, mindev, full_path, 0); + if (found) { + strcpy(path, full_path); + break; + } + } + + closedir(D); + return found; +} + +static struct disk_util *__init_per_file_disk_util(struct thread_data *td, + int majdev, int mindev, + char *path) +{ + struct stat st; + char tmp[PATH_MAX]; + char *p; + + /* + * If there's a ../queue/ directory there, we are inside a partition. + * Check if that is the case and jump back. For loop/md/dm etc we + * are already in the right spot. 
+ */
+ sprintf(tmp, "%s/../queue", path);
+ if (!stat(tmp, &st)) {
+ p = dirname(path);
+ sprintf(tmp, "%s/queue", p);
+ if (stat(tmp, &st)) {
+ log_err("unknown sysfs layout\n");
+ return NULL;
+ }
+ snprintf(tmp, ARRAY_SIZE(tmp), "%s", p);
+ sprintf(path, "%s", tmp);
+ }
+
+ return disk_util_add(td, majdev, mindev, path);
+}
+
+static struct disk_util *init_per_file_disk_util(struct thread_data *td,
+ char *filename)
+{
+
+ char foo[PATH_MAX];
+ struct disk_util *du;
+ int mindev, majdev;
+
+ if (get_device_numbers(filename, &majdev, &mindev))
+ return NULL;
+
+ dprint(FD_DISKUTIL, "%s belongs to maj/min %d/%d\n", filename, majdev,
+ mindev);
+
+ du = disk_util_exists(majdev, mindev);
+ if (du)
+ return du;
+
+ /*
+ * for an fs without a device, we will repeatedly stat through
+ * sysfs which can take oodles of time for thousands of files. so
+ * cache the last lookup and compare with that before going through
+ * everything again.
+ */
+ if (mindev == last_mindev && majdev == last_majdev)
+ return last_du;
+
+ last_mindev = mindev;
+ last_majdev = majdev;
+
+ sprintf(foo, "/sys/block");
+ if (!find_block_dir(majdev, mindev, foo, 1))
+ return NULL;
+
+ return __init_per_file_disk_util(td, majdev, mindev, foo);
+}
+
+static struct disk_util *__init_disk_util(struct thread_data *td,
+ struct fio_file *f)
+{
+ return init_per_file_disk_util(td, f->file_name);
+}
+
+void init_disk_util(struct thread_data *td)
+{
+ struct fio_file *f;
+ unsigned int i;
+
+ if (!td->o.do_disk_util ||
+ td_ioengine_flagged(td, FIO_DISKLESSIO | FIO_NODISKUTIL))
+ return;
+
+ for_each_file(td, f, i)
+ f->du = __init_disk_util(td, f);
+}
+
+void disk_util_prune_entries(void)
+{
+ fio_sem_down(disk_util_sem);
+
+ while (!flist_empty(&disk_list)) {
+ struct disk_util *du;
+
+ du = flist_first_entry(&disk_list, struct disk_util, list);
+ flist_del(&du->list);
+ disk_util_free(du);
+ }
+
+ last_majdev = last_mindev = -1;
+ fio_sem_up(disk_util_sem);
+ fio_sem_remove(disk_util_sem);
+}
+
+void setup_disk_util(void)
+{
+ disk_util_sem = fio_sem_init(FIO_SEM_UNLOCKED);
+}
diff --git a/diskutil.h b/diskutil.h
new file mode 100644
index 0000000..83bcbf8
--- /dev/null
+++ b/diskutil.h
@@ -0,0 +1,123 @@
+#ifndef FIO_DISKUTIL_H
+#define FIO_DISKUTIL_H
+#define FIO_DU_NAME_SZ 64
+
+#include "helper_thread.h"
+#include "fio_sem.h"
+
+struct disk_util_stats {
+ uint64_t ios[2];
+ uint64_t merges[2];
+ uint64_t sectors[2];
+ uint64_t ticks[2];
+ uint64_t io_ticks;
+ uint64_t time_in_queue;
+ uint64_t msec;
+};
+
+/*
+ * Disk utils as read in /sys/block/<dev>/stat
+ */
+struct disk_util_stat {
+ uint8_t name[FIO_DU_NAME_SZ];
+ struct disk_util_stats s;
+};
+
+struct disk_util_agg {
+ uint64_t ios[2];
+ uint64_t merges[2];
+ uint64_t sectors[2];
+ uint64_t ticks[2];
+ uint64_t io_ticks;
+ uint64_t time_in_queue;
+ uint32_t slavecount;
+ uint32_t pad;
+ fio_fp64_t max_util;
+};
+
+/*
+ * Per-device disk util management
+ */
+struct disk_util {
+ struct flist_head list;
+ /* If this disk is a slave, hook it into the master's
+ * list using this head.
+ */
+ struct flist_head slavelist;
+
+ char *sysfs_root;
+ char path[PATH_MAX];
+ int major, minor;
+
+ struct disk_util_stat dus;
+ struct disk_util_stat last_dus;
+
+ struct disk_util_agg agg;
+
+ /* For software raids, this entry maintains pointers to the
+ * entries for the slave devices. The disk_util entries for
+ * the slave devices should primarily be maintained through
+ * the disk_list list, i.e. for memory allocation and
+ * de-allocation, etc.
Whereas this list should be used only + * for aggregating a software RAID's disk util figures. + */ + struct flist_head slaves; + + struct timespec time; + + struct fio_sem *lock; + unsigned long users; +}; + +static inline void disk_util_mod(struct disk_util *du, int val) +{ + if (du) { + struct flist_head *n; + + fio_sem_down(du->lock); + du->users += val; + + flist_for_each(n, &du->slavelist) { + struct disk_util *slave; + + slave = flist_entry(n, struct disk_util, slavelist); + slave->users += val; + } + fio_sem_up(du->lock); + } +} +static inline void disk_util_inc(struct disk_util *du) +{ + disk_util_mod(du, 1); +} + +static inline void disk_util_dec(struct disk_util *du) +{ + disk_util_mod(du, -1); +} + +#define DISK_UTIL_MSEC (250) + +extern struct flist_head disk_list; + +/* + * disk util stuff + */ +#ifdef FIO_HAVE_DISK_UTIL +extern void init_disk_util(struct thread_data *); +extern int update_io_ticks(void); +extern void setup_disk_util(void); +extern void disk_util_prune_entries(void); +#else +/* keep this as a function to avoid a warning in handle_du() */ +#define disk_util_prune_entries() +#define init_disk_util(td) +#define setup_disk_util() + +static inline int update_io_ticks(void) +{ + return helper_should_exit(); +} +#endif + +#endif diff --git a/doc/Makefile b/doc/Makefile new file mode 100644 index 0000000..3b979f9 --- /dev/null +++ b/doc/Makefile @@ -0,0 +1,225 @@ +# Makefile for Sphinx documentation +# + +# You can set these variables from the command line. +SPHINXOPTS = +SPHINXBUILD = sphinx-build +PAPER = +BUILDDIR = output + +# Internal variables. +PAPEROPT_a4 = -D latex_paper_size=a4 +PAPEROPT_letter = -D latex_paper_size=letter +ALLSPHINXOPTS = -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) . +# the i18n builder cannot share the environment and doctrees with the others +I18NSPHINXOPTS = $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) . 
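+
+# Example usage (assumes sphinx-build is installed and on PATH): e.g.
+# `make html SPHINXOPTS="-W"` builds the HTML docs with warnings
+# treated as errors.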
+
+.PHONY: help
+help:
+ @echo "Please use \`make <target>' where <target> is one of"
+ @echo " html to make standalone HTML files"
+ @echo " dirhtml to make HTML files named index.html in directories"
+ @echo " singlehtml to make a single large HTML file"
+ @echo " pickle to make pickle files"
+ @echo " json to make JSON files"
+ @echo " htmlhelp to make HTML files and a HTML help project"
+ @echo " qthelp to make HTML files and a qthelp project"
+ @echo " applehelp to make an Apple Help Book"
+ @echo " devhelp to make HTML files and a Devhelp project"
+ @echo " epub to make an epub"
+ @echo " epub3 to make an epub3"
+ @echo " latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter"
+ @echo " latexpdf to make LaTeX files and run them through pdflatex"
+ @echo " latexpdfja to make LaTeX files and run them through platex/dvipdfmx"
+ @echo " text to make text files"
+ @echo " man to make manual pages"
+ @echo " texinfo to make Texinfo files"
+ @echo " info to make Texinfo files and run them through makeinfo"
+ @echo " gettext to make PO message catalogs"
+ @echo " changes to make an overview of all changed/added/deprecated items"
+ @echo " xml to make Docutils-native XML files"
+ @echo " pseudoxml to make pseudoxml-XML files for display purposes"
+ @echo " linkcheck to check all external links for integrity"
+ @echo " doctest to run all doctests embedded in the documentation (if enabled)"
+ @echo " coverage to run coverage check of the documentation (if enabled)"
+ @echo " dummy to check syntax errors of document sources"
+
+.PHONY: clean
+clean:
+ rm -rf $(BUILDDIR)/*
+
+.PHONY: html
+html:
+ $(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html
+ @echo
+ @echo "Build finished. The HTML pages are in $(BUILDDIR)/html."
+
+.PHONY: dirhtml
+dirhtml:
+ $(SPHINXBUILD) -b dirhtml $(ALLSPHINXOPTS) $(BUILDDIR)/dirhtml
+ @echo
+ @echo "Build finished. The HTML pages are in $(BUILDDIR)/dirhtml."
+
+.PHONY: singlehtml
+singlehtml:
+ $(SPHINXBUILD) -b singlehtml $(ALLSPHINXOPTS) $(BUILDDIR)/singlehtml
+ @echo
+ @echo "Build finished. The HTML page is in $(BUILDDIR)/singlehtml."
+
+.PHONY: pickle
+pickle:
+ $(SPHINXBUILD) -b pickle $(ALLSPHINXOPTS) $(BUILDDIR)/pickle
+ @echo
+ @echo "Build finished; now you can process the pickle files."
+
+.PHONY: json
+json:
+ $(SPHINXBUILD) -b json $(ALLSPHINXOPTS) $(BUILDDIR)/json
+ @echo
+ @echo "Build finished; now you can process the JSON files."
+
+.PHONY: htmlhelp
+htmlhelp:
+ $(SPHINXBUILD) -b htmlhelp $(ALLSPHINXOPTS) $(BUILDDIR)/htmlhelp
+ @echo
+ @echo "Build finished; now you can run HTML Help Workshop with the" \
+ ".hhp project file in $(BUILDDIR)/htmlhelp."
+
+.PHONY: qthelp
+qthelp:
+ $(SPHINXBUILD) -b qthelp $(ALLSPHINXOPTS) $(BUILDDIR)/qthelp
+ @echo
+ @echo "Build finished; now you can run "qcollectiongenerator" with the" \
+ ".qhcp project file in $(BUILDDIR)/qthelp, like this:"
+ @echo "# qcollectiongenerator $(BUILDDIR)/qthelp/fio.qhcp"
+ @echo "To view the help file:"
+ @echo "# assistant -collectionFile $(BUILDDIR)/qthelp/fio.qhc"
+
+.PHONY: applehelp
+applehelp:
+ $(SPHINXBUILD) -b applehelp $(ALLSPHINXOPTS) $(BUILDDIR)/applehelp
+ @echo
+ @echo "Build finished. The help book is in $(BUILDDIR)/applehelp."
+ @echo "N.B. You won't be able to view it unless you put it in" \
+ "~/Library/Documentation/Help or install it in your application" \
+ "bundle."
+
+.PHONY: devhelp
+devhelp:
+ $(SPHINXBUILD) -b devhelp $(ALLSPHINXOPTS) $(BUILDDIR)/devhelp
+ @echo
+ @echo "Build finished."
+ @echo "To view the help file:" + @echo "# mkdir -p $$HOME/.local/share/devhelp/fio" + @echo "# ln -s $(BUILDDIR)/devhelp $$HOME/.local/share/devhelp/fio" + @echo "# devhelp" + +.PHONY: epub +epub: + $(SPHINXBUILD) -b epub $(ALLSPHINXOPTS) $(BUILDDIR)/epub + @echo + @echo "Build finished. The epub file is in $(BUILDDIR)/epub." + +.PHONY: epub3 +epub3: + $(SPHINXBUILD) -b epub3 $(ALLSPHINXOPTS) $(BUILDDIR)/epub3 + @echo + @echo "Build finished. The epub3 file is in $(BUILDDIR)/epub3." + +.PHONY: latex +latex: + $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex + @echo + @echo "Build finished; the LaTeX files are in $(BUILDDIR)/latex." + @echo "Run \`make' in that directory to run these through (pdf)latex" \ + "(use \`make latexpdf' here to do that automatically)." + +.PHONY: latexpdf +latexpdf: + $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex + @echo "Running LaTeX files through pdflatex..." + $(MAKE) -C $(BUILDDIR)/latex all-pdf + @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex." + +.PHONY: latexpdfja +latexpdfja: + $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex + @echo "Running LaTeX files through platex and dvipdfmx..." + $(MAKE) -C $(BUILDDIR)/latex all-pdf-ja + @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex." + +.PHONY: text +text: + $(SPHINXBUILD) -b text $(ALLSPHINXOPTS) $(BUILDDIR)/text + @echo + @echo "Build finished. The text files are in $(BUILDDIR)/text." + +.PHONY: man +man: + $(SPHINXBUILD) -b man $(ALLSPHINXOPTS) $(BUILDDIR)/man + @echo + @echo "Build finished. The manual pages are in $(BUILDDIR)/man." + +.PHONY: texinfo +texinfo: + $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo + @echo + @echo "Build finished. The Texinfo files are in $(BUILDDIR)/texinfo." + @echo "Run \`make' in that directory to run these through makeinfo" \ + "(use \`make info' here to do that automatically)." + +.PHONY: info +info: + $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo + @echo "Running Texinfo files through makeinfo..." + make -C $(BUILDDIR)/texinfo info + @echo "makeinfo finished; the Info files are in $(BUILDDIR)/texinfo." + +.PHONY: gettext +gettext: + $(SPHINXBUILD) -b gettext $(I18NSPHINXOPTS) $(BUILDDIR)/locale + @echo + @echo "Build finished. The message catalogs are in $(BUILDDIR)/locale." + +.PHONY: changes +changes: + $(SPHINXBUILD) -b changes $(ALLSPHINXOPTS) $(BUILDDIR)/changes + @echo + @echo "The overview file is in $(BUILDDIR)/changes." + +.PHONY: linkcheck +linkcheck: + $(SPHINXBUILD) -b linkcheck $(ALLSPHINXOPTS) $(BUILDDIR)/linkcheck + @echo + @echo "Link check complete; look for any errors in the above output " \ + "or in $(BUILDDIR)/linkcheck/output.txt." + +.PHONY: doctest +doctest: + $(SPHINXBUILD) -b doctest $(ALLSPHINXOPTS) $(BUILDDIR)/doctest + @echo "Testing of doctests in the sources finished, look at the " \ + "results in $(BUILDDIR)/doctest/output.txt." + +.PHONY: coverage +coverage: + $(SPHINXBUILD) -b coverage $(ALLSPHINXOPTS) $(BUILDDIR)/coverage + @echo "Testing of coverage in the sources finished, look at the " \ + "results in $(BUILDDIR)/coverage/python.txt." + +.PHONY: xml +xml: + $(SPHINXBUILD) -b xml $(ALLSPHINXOPTS) $(BUILDDIR)/xml + @echo + @echo "Build finished. The XML files are in $(BUILDDIR)/xml." + +.PHONY: pseudoxml +pseudoxml: + $(SPHINXBUILD) -b pseudoxml $(ALLSPHINXOPTS) $(BUILDDIR)/pseudoxml + @echo + @echo "Build finished. The pseudo-XML files are in $(BUILDDIR)/pseudoxml." 
+ +.PHONY: dummy +dummy: + $(SPHINXBUILD) -b dummy $(ALLSPHINXOPTS) $(BUILDDIR)/dummy + @echo + @echo "Build finished. Dummy builder generates no files." diff --git a/doc/conf.py b/doc/conf.py new file mode 100644 index 0000000..10b72ec --- /dev/null +++ b/doc/conf.py @@ -0,0 +1,362 @@ +# -*- coding: utf-8 -*- +# +# fio documentation build configuration file, created by +# sphinx-quickstart on Mon Nov 14 13:56:30 2016. +# +# This file is execfile()d with the current directory set to its +# containing dir. +# +# Note that not all possible configuration values are present in this +# autogenerated file. +# +# All configuration values have a default; values that are commented out +# serve to show the default. + +# If extensions (or modules to document with autodoc) are in another directory, +# add these directories to sys.path here. If the directory is relative to the +# documentation root, use os.path.abspath to make it absolute, like shown here. +# +# import os +# import sys +# sys.path.insert(0, os.path.abspath('.')) + +# -- General configuration ------------------------------------------------ + +from __future__ import absolute_import +from __future__ import print_function + +# If your documentation needs a minimal Sphinx version, state it here. +# +# needs_sphinx = '1.0' + +# Add any Sphinx extension module names here, as strings. They can be +# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom +# ones. +extensions = [] + +# Add any paths that contain templates here, relative to this directory. +templates_path = ['_templates'] + +# The suffix(es) of source filenames. +# You can specify multiple suffix as a list of string: +# +# source_suffix = ['.rst', '.md'] +source_suffix = '.rst' + +# The encoding of source files. +# +# source_encoding = 'utf-8-sig' + +# The master toctree document. +master_doc = 'index' + +# General information about the project. +project = 'fio' +copyright = '2017, Jens Axboe ' +author = 'Jens Axboe ' + +# The version info for the project you're documenting, acts as replacement for +# |version| and |release|, also used in various other places throughout the +# built documents. +# + +# The short X.Y version. +# version = '1' +# The full version, including alpha/beta/rc tags. +# release = '1' + +def fio_version(): + + from os.path import exists, dirname, join + wsroot = dirname(dirname(__file__)) + version_file = join(wsroot, "FIO-VERSION-FILE") + if not exists(version_file): + version_gen = join(wsroot, "FIO-VERSION-GEN") + from subprocess import call + rc = call(version_gen, shell=True, cwd=wsroot) + if rc: + print("Couldn't generate version file. rc=%r" % rc) + return "Unknown", "Unknown" + + vsl = open(version_file).read().strip().split('-') + version = vsl[1] + release = '-'.join(vsl[1:]) + return version, release + +version, release = fio_version() + +# The language for content autogenerated by Sphinx. Refer to documentation +# for a list of supported languages. +# +# This is also used if you do content translation via gettext catalogs. +# Usually you set "language" from the command line for these cases. +language = None + +# There are two options for replacing |today|: either, you set today to some +# non-false value, then it is used: +# +# today = '' +# +# Else, today_fmt is used as the format for a strftime call. +# +# today_fmt = '%B %d, %Y' + +# List of patterns, relative to source directory, that match files and +# directories to ignore when looking for source files. 
+# These patterns also affect html_static_path and html_extra_path
+exclude_patterns = ['output', 'Thumbs.db', '.DS_Store', 'fio_examples.rst']
+
+# The reST default role (used for this markup: `text`) to use for all
+# documents.
+#
+# default_role = None
+
+# If true, '()' will be appended to :func: etc. cross-reference text.
+#
+# add_function_parentheses = True
+
+# If true, the current module name will be prepended to all description
+# unit titles (such as .. function::).
+#
+# add_module_names = True
+
+# If true, sectionauthor and moduleauthor directives will be shown in the
+# output. They are ignored by default.
+#
+# show_authors = False
+
+# The name of the Pygments (syntax highlighting) style to use.
+pygments_style = 'sphinx'
+
+# A list of ignored prefixes for module index sorting.
+# modindex_common_prefix = []
+
+# If true, keep warnings as "system message" paragraphs in the built documents.
+# keep_warnings = False
+
+# If true, `todo` and `todoList` produce output, else they produce nothing.
+todo_include_todos = False
+
+
+# -- Options for HTML output ----------------------------------------------
+
+# The theme to use for HTML and HTML Help pages. See the documentation for
+# a list of builtin themes.
+#
+html_theme = 'alabaster'
+
+# Theme options are theme-specific and customize the look and feel of a theme
+# further. For a list of options available for each theme, see the
+# documentation.
+#
+# html_theme_options = {}
+
+# Add any paths that contain custom themes here, relative to this directory.
+# html_theme_path = []
+
+# The name for this set of Sphinx documents.
+# "<project> v<release> documentation" by default.
+#
+# html_title = 'fio v1'
+
+# A shorter title for the navigation bar. Default is the same as html_title.
+#
+# html_short_title = None
+
+# The name of an image file (relative to this directory) to place at the top
+# of the sidebar.
+#
+# html_logo = None
+
+# The name of an image file (relative to this directory) to use as a favicon of
+# the docs. This file should be a Windows icon file (.ico) being 16x16 or 32x32
+# pixels large.
+#
+# html_favicon = None
+
+# Add any paths that contain custom static files (such as style sheets) here,
+# relative to this directory. They are copied after the builtin static files,
+# so a file named "default.css" will overwrite the builtin "default.css".
+# html_static_path = ['_static']
+
+# Add any extra paths that contain custom files (such as robots.txt or
+# .htaccess) here, relative to this directory. These files are copied
+# directly to the root of the documentation.
+#
+# html_extra_path = []
+
+# If not None, a 'Last updated on:' timestamp is inserted at every page
+# bottom, using the given strftime format.
+# The empty string is equivalent to '%b %d, %Y'.
+#
+# html_last_updated_fmt = None
+
+# If true, SmartyPants will be used to convert quotes and dashes to
+# typographically correct entities.
+#
+# html_use_smartypants = True
+
+# Custom sidebar templates, maps document names to template names.
+#
+# html_sidebars = {}
+
+# Additional templates that should be rendered to pages, maps page names to
+# template names.
+#
+# html_additional_pages = {}
+
+# If false, no module index is generated.
+#
+# html_domain_indices = True
+
+# If false, no index is generated.
+#
+# html_use_index = True
+
+# If true, the index is split into individual pages for each letter.
+#
+# html_split_index = False
+
+# If true, links to the reST sources are added to the pages.
+# +# html_show_sourcelink = True + +# If true, "Created using Sphinx" is shown in the HTML footer. Default is True. +# +# html_show_sphinx = True + +# If true, "(C) Copyright ..." is shown in the HTML footer. Default is True. +# +# html_show_copyright = True + +# If true, an OpenSearch description file will be output, and all pages will +# contain a tag referring to it. The value of this option must be the +# base URL from which the finished HTML is served. +# +# html_use_opensearch = '' + +# This is the file name suffix for HTML files (e.g. ".xhtml"). +# html_file_suffix = None + +# Language to be used for generating the HTML full-text search index. +# Sphinx supports the following languages: +# 'da', 'de', 'en', 'es', 'fi', 'fr', 'h', 'it', 'ja' +# 'nl', 'no', 'pt', 'ro', 'r', 'sv', 'tr', 'zh' +# +# html_search_language = 'en' + +# A dictionary with options for the search language support, empty by default. +# 'ja' uses this config value. +# 'zh' user can custom change `jieba` dictionary path. +# +# html_search_options = {'type': 'default'} + +# The name of a javascript file (relative to the configuration directory) that +# implements a search results scorer. If empty, the default will be used. +# +# html_search_scorer = 'scorer.js' + +# Output file base name for HTML help builder. +htmlhelp_basename = 'fiodoc' + +# -- Options for LaTeX output --------------------------------------------- + +latex_elements = { + # The paper size ('letterpaper' or 'a4paper'). + # + # 'papersize': 'letterpaper', + + # The font size ('10pt', '11pt' or '12pt'). + # + # 'pointsize': '10pt', + + # Additional stuff for the LaTeX preamble. + # + # 'preamble': '', + + # Latex figure (float) alignment + # + # 'figure_align': 'htbp', +} + +# Grouping the document tree into LaTeX files. List of tuples +# (source start file, target name, title, +# author, documentclass [howto, manual, or own class]). +latex_documents = [ + (master_doc, 'fio.tex', 'fio Documentation', + 'a', 'manual'), +] + +# The name of an image file (relative to this directory) to place at the top of +# the title page. +# +# latex_logo = None + +# For "manual" documents, if this is true, then toplevel headings are parts, +# not chapters. +# +# latex_use_parts = False + +# If true, show page references after internal links. +# +# latex_show_pagerefs = False + +# If true, show URL addresses after external links. +# +# latex_show_urls = False + +# Documents to append as an appendix to all manuals. +# +# latex_appendices = [] + +# It false, will not define \strong, \code, itleref, \crossref ... but only +# \sphinxstrong, ..., \sphinxtitleref, ... To help avoid clash with user added +# packages. +# +# latex_keep_old_macro_names = True + +# If false, no module index is generated. +# +# latex_domain_indices = True + + +# -- Options for manual page output --------------------------------------- + +# One entry per manual page. List of tuples +# (source start file, name, description, authors, manual section). +man_pages = [ + ('fio_man', 'fio', 'flexible I/O tester', + [author], 1) +] + +# If true, show URL addresses after external links. +# +# man_show_urls = False + + +# -- Options for Texinfo output ------------------------------------------- + +# Grouping the document tree into Texinfo files. 
List of tuples +# (source start file, target name, title, author, +# dir menu entry, description, category) +texinfo_documents = [ + (master_doc, 'fio', 'fio Documentation', + author, 'fio', 'One line description of project.', + 'Miscellaneous'), +] + +# Documents to append as an appendix to all manuals. +# +# texinfo_appendices = [] + +# If false, no module index is generated. +# +# texinfo_domain_indices = True + +# How to display URL addresses: 'footnote', 'no', or 'inline'. +# +# texinfo_show_urls = 'footnote' + +# If true, do not generate a @detailmenu in the "Top" node's menu. +# +# texinfo_no_detailmenu = False diff --git a/doc/fio-histo-log-pctiles.pdf b/doc/fio-histo-log-pctiles.pdf new file mode 100644 index 0000000..069ab99 Binary files /dev/null and b/doc/fio-histo-log-pctiles.pdf differ diff --git a/doc/fio_doc.rst b/doc/fio_doc.rst new file mode 100644 index 0000000..b5987b5 --- /dev/null +++ b/doc/fio_doc.rst @@ -0,0 +1,51 @@ +fio - Flexible I/O tester rev. |version| +======================================== + + +.. include:: ../README + + +.. include:: ../HOWTO + + + +Examples +======== + +.. include:: fio_examples.rst + + + +TODO +==== + + +GFIO TODO +--------- + +.. include:: ../GFIO-TODO + + +Server TODO +----------- + +.. include:: ../SERVER-TODO + + +Steady State TODO +----------------- + +.. include:: ../STEADYSTATE-TODO + + + +Moral License +============= + +.. include:: ../MORAL-LICENSE + + +License +======= + +.. literalinclude:: ../COPYING diff --git a/doc/fio_examples.rst b/doc/fio_examples.rst new file mode 100644 index 0000000..cff1f39 --- /dev/null +++ b/doc/fio_examples.rst @@ -0,0 +1,72 @@ +Some job file examples. + + +Poisson request flow +-------------------- + +.. only:: builder_html + +:download:`Download poisson-rate-submission.fio <../examples/poisson-rate-submission.fio>` + +.. literalinclude:: ../examples/poisson-rate-submission.fio + :language: ini + +Latency profile +--------------- + +.. only:: builder_html + +:download:`Download latency-profile.fio <../examples/latency-profile.fio>` + +.. literalinclude:: ../examples/latency-profile.fio + :language: ini + +Read 4 files with aio at different depths +----------------------------------------- + +.. only:: builder_html + +:download:`Download aio-read.fio <../examples/aio-read.fio>` + +.. literalinclude:: ../examples/aio-read.fio + :language: ini + +Read backwards in a file +------------------------ + +.. only:: builder_html + +:download:`Download backwards-read.fio <../examples/backwards-read.fio>` + +.. literalinclude:: ../examples/backwards-read.fio + :language: ini + +Basic verification +------------------ + +.. only:: builder_html + +:download:`Download basic-verify.fio <../examples/basic-verify.fio>` + +.. literalinclude:: ../examples/basic-verify.fio + :language: ini + +Fixed rate submission +--------------------- + +.. only:: builder_html + +:download:`Download fixed-rate-submission.fio <../examples/fixed-rate-submission.fio>` + +.. literalinclude:: ../examples/fixed-rate-submission.fio + :language: ini + +Butterfly seek pattern +----------------------- + +.. only:: builder_html + +:download:`Download butterfly.fio <../examples/butterfly.fio>` + +.. literalinclude:: ../examples/butterfly.fio + :language: ini diff --git a/doc/fio_man.rst b/doc/fio_man.rst new file mode 100644 index 0000000..c6a6438 --- /dev/null +++ b/doc/fio_man.rst @@ -0,0 +1,12 @@ +:orphan: + +Fio Manpage +=========== + +(rev. |release|) + + +.. include:: ../README + + +.. 
include:: ../HOWTO
diff --git a/doc/index.rst b/doc/index.rst
new file mode 100644
index 0000000..b98d997
--- /dev/null
+++ b/doc/index.rst
@@ -0,0 +1,25 @@
+.. FIO documentation master file, created by
+ sphinx-quickstart on Thu Mar 20 16:24:25 2015.
+ You can adapt this file completely to your liking, but it should at least
+ contain the root `toctree` directive.
+
+Welcome to FIO's documentation!
+===============================
+
+**Version:** |release|
+
+Contents:
+
+.. toctree::
+ :maxdepth: 3
+ :numbered:
+
+ fio - Flexible I/O tester |version|
+
+
+Indices and tables
+==================
+
+* :ref:`genindex`
+* :ref:`search`
+
diff --git a/doc/make.bat b/doc/make.bat
new file mode 100644
index 0000000..71fa19c
--- /dev/null
+++ b/doc/make.bat
@@ -0,0 +1,281 @@
+@ECHO OFF
+
+REM Command file for Sphinx documentation
+
+if "%SPHINXBUILD%" == "" (
+ set SPHINXBUILD=sphinx-build
+)
+set BUILDDIR=_build
+set ALLSPHINXOPTS=-d %BUILDDIR%/doctrees %SPHINXOPTS% .
+set I18NSPHINXOPTS=%SPHINXOPTS% .
+if NOT "%PAPER%" == "" (
+ set ALLSPHINXOPTS=-D latex_paper_size=%PAPER% %ALLSPHINXOPTS%
+ set I18NSPHINXOPTS=-D latex_paper_size=%PAPER% %I18NSPHINXOPTS%
+)
+
+if "%1" == "" goto help
+
+if "%1" == "help" (
+ :help
+ echo.Please use `make ^<target^>` where ^<target^> is one of
+ echo. html to make standalone HTML files
+ echo. dirhtml to make HTML files named index.html in directories
+ echo. singlehtml to make a single large HTML file
+ echo. pickle to make pickle files
+ echo. json to make JSON files
+ echo. htmlhelp to make HTML files and a HTML help project
+ echo. qthelp to make HTML files and a qthelp project
+ echo. devhelp to make HTML files and a Devhelp project
+ echo. epub to make an epub
+ echo. epub3 to make an epub3
+ echo. latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter
+ echo. text to make text files
+ echo. man to make manual pages
+ echo. texinfo to make Texinfo files
+ echo. gettext to make PO message catalogs
+ echo. changes to make an overview of all changed/added/deprecated items
+ echo. xml to make Docutils-native XML files
+ echo. pseudoxml to make pseudoxml-XML files for display purposes
+ echo. linkcheck to check all external links for integrity
+ echo. doctest to run all doctests embedded in the documentation if enabled
+ echo. coverage to run coverage check of the documentation if enabled
+ echo. dummy to check syntax errors of document sources
+ goto end
+)
+
+if "%1" == "clean" (
+ for /d %%i in (%BUILDDIR%\*) do rmdir /q /s %%i
+ del /q /s %BUILDDIR%\*
+ goto end
+)
+
+
+REM Check if sphinx-build is available and fallback to Python version if any
+%SPHINXBUILD% 1>NUL 2>NUL
+if errorlevel 9009 goto sphinx_python
+goto sphinx_ok
+
+:sphinx_python
+
+set SPHINXBUILD=python -m sphinx.__init__
+%SPHINXBUILD% 2> nul
+if errorlevel 9009 (
+ echo.
+ echo.The 'sphinx-build' command was not found. Make sure you have Sphinx
+ echo.installed, then set the SPHINXBUILD environment variable to point
+ echo.to the full path of the 'sphinx-build' executable. Alternatively you
+ echo.may add the Sphinx directory to PATH.
+ echo.
+ echo.If you don't have Sphinx installed, grab it from
+ echo.http://sphinx-doc.org/
+ exit /b 1
+)
+
+:sphinx_ok
+
+
+if "%1" == "html" (
+ %SPHINXBUILD% -b html %ALLSPHINXOPTS% %BUILDDIR%/html
+ if errorlevel 1 exit /b 1
+ echo.
+ echo.Build finished. The HTML pages are in %BUILDDIR%/html.
+ goto end
+)
+
+if "%1" == "dirhtml" (
+ %SPHINXBUILD% -b dirhtml %ALLSPHINXOPTS% %BUILDDIR%/dirhtml
+ if errorlevel 1 exit /b 1
+ echo.
+ echo.Build finished. The HTML pages are in %BUILDDIR%/dirhtml.
+ goto end
+)
+
+if "%1" == "singlehtml" (
+ %SPHINXBUILD% -b singlehtml %ALLSPHINXOPTS% %BUILDDIR%/singlehtml
+ if errorlevel 1 exit /b 1
+ echo.
+ echo.Build finished. The HTML pages are in %BUILDDIR%/singlehtml.
+ goto end
+)
+
+if "%1" == "pickle" (
+ %SPHINXBUILD% -b pickle %ALLSPHINXOPTS% %BUILDDIR%/pickle
+ if errorlevel 1 exit /b 1
+ echo.
+ echo.Build finished; now you can process the pickle files.
+ goto end
+)
+
+if "%1" == "json" (
+ %SPHINXBUILD% -b json %ALLSPHINXOPTS% %BUILDDIR%/json
+ if errorlevel 1 exit /b 1
+ echo.
+ echo.Build finished; now you can process the JSON files.
+ goto end
+)
+
+if "%1" == "htmlhelp" (
+ %SPHINXBUILD% -b htmlhelp %ALLSPHINXOPTS% %BUILDDIR%/htmlhelp
+ if errorlevel 1 exit /b 1
+ echo.
+ echo.Build finished; now you can run HTML Help Workshop with the ^
+.hhp project file in %BUILDDIR%/htmlhelp.
+ goto end
+)
+
+if "%1" == "qthelp" (
+ %SPHINXBUILD% -b qthelp %ALLSPHINXOPTS% %BUILDDIR%/qthelp
+ if errorlevel 1 exit /b 1
+ echo.
+ echo.Build finished; now you can run "qcollectiongenerator" with the ^
+.qhcp project file in %BUILDDIR%/qthelp, like this:
+ echo.^> qcollectiongenerator %BUILDDIR%\qthelp\fio.qhcp
+ echo.To view the help file:
+ echo.^> assistant -collectionFile %BUILDDIR%\qthelp\fio.qhc
+ goto end
+)
+
+if "%1" == "devhelp" (
+ %SPHINXBUILD% -b devhelp %ALLSPHINXOPTS% %BUILDDIR%/devhelp
+ if errorlevel 1 exit /b 1
+ echo.
+ echo.Build finished.
+ goto end
+)
+
+if "%1" == "epub" (
+ %SPHINXBUILD% -b epub %ALLSPHINXOPTS% %BUILDDIR%/epub
+ if errorlevel 1 exit /b 1
+ echo.
+ echo.Build finished. The epub file is in %BUILDDIR%/epub.
+ goto end
+)
+
+if "%1" == "epub3" (
+ %SPHINXBUILD% -b epub3 %ALLSPHINXOPTS% %BUILDDIR%/epub3
+ if errorlevel 1 exit /b 1
+ echo.
+ echo.Build finished. The epub3 file is in %BUILDDIR%/epub3.
+ goto end
+)
+
+if "%1" == "latex" (
+ %SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex
+ if errorlevel 1 exit /b 1
+ echo.
+ echo.Build finished; the LaTeX files are in %BUILDDIR%/latex.
+ goto end
+)
+
+if "%1" == "latexpdf" (
+ %SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex
+ cd %BUILDDIR%/latex
+ make all-pdf
+ cd %~dp0
+ echo.
+ echo.Build finished; the PDF files are in %BUILDDIR%/latex.
+ goto end
+)
+
+if "%1" == "latexpdfja" (
+ %SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex
+ cd %BUILDDIR%/latex
+ make all-pdf-ja
+ cd %~dp0
+ echo.
+ echo.Build finished; the PDF files are in %BUILDDIR%/latex.
+ goto end
+)
+
+if "%1" == "text" (
+ %SPHINXBUILD% -b text %ALLSPHINXOPTS% %BUILDDIR%/text
+ if errorlevel 1 exit /b 1
+ echo.
+ echo.Build finished. The text files are in %BUILDDIR%/text.
+ goto end
+)
+
+if "%1" == "man" (
+ %SPHINXBUILD% -b man %ALLSPHINXOPTS% %BUILDDIR%/man
+ if errorlevel 1 exit /b 1
+ echo.
+ echo.Build finished. The manual pages are in %BUILDDIR%/man.
+ goto end
+)
+
+if "%1" == "texinfo" (
+ %SPHINXBUILD% -b texinfo %ALLSPHINXOPTS% %BUILDDIR%/texinfo
+ if errorlevel 1 exit /b 1
+ echo.
+ echo.Build finished. The Texinfo files are in %BUILDDIR%/texinfo.
+ goto end
+)
+
+if "%1" == "gettext" (
+ %SPHINXBUILD% -b gettext %I18NSPHINXOPTS% %BUILDDIR%/locale
+ if errorlevel 1 exit /b 1
+ echo.
+ echo.Build finished. The message catalogs are in %BUILDDIR%/locale.
+ goto end
+)
+
+if "%1" == "changes" (
+ %SPHINXBUILD% -b changes %ALLSPHINXOPTS% %BUILDDIR%/changes
+ if errorlevel 1 exit /b 1
+ echo.
+ echo.The overview file is in %BUILDDIR%/changes.
+ goto end +) + +if "%1" == "linkcheck" ( + %SPHINXBUILD% -b linkcheck %ALLSPHINXOPTS% %BUILDDIR%/linkcheck + if errorlevel 1 exit /b 1 + echo. + echo.Link check complete; look for any errors in the above output ^ +or in %BUILDDIR%/linkcheck/output.txt. + goto end +) + +if "%1" == "doctest" ( + %SPHINXBUILD% -b doctest %ALLSPHINXOPTS% %BUILDDIR%/doctest + if errorlevel 1 exit /b 1 + echo. + echo.Testing of doctests in the sources finished, look at the ^ +results in %BUILDDIR%/doctest/output.txt. + goto end +) + +if "%1" == "coverage" ( + %SPHINXBUILD% -b coverage %ALLSPHINXOPTS% %BUILDDIR%/coverage + if errorlevel 1 exit /b 1 + echo. + echo.Testing of coverage in the sources finished, look at the ^ +results in %BUILDDIR%/coverage/python.txt. + goto end +) + +if "%1" == "xml" ( + %SPHINXBUILD% -b xml %ALLSPHINXOPTS% %BUILDDIR%/xml + if errorlevel 1 exit /b 1 + echo. + echo.Build finished. The XML files are in %BUILDDIR%/xml. + goto end +) + +if "%1" == "pseudoxml" ( + %SPHINXBUILD% -b pseudoxml %ALLSPHINXOPTS% %BUILDDIR%/pseudoxml + if errorlevel 1 exit /b 1 + echo. + echo.Build finished. The pseudo-XML files are in %BUILDDIR%/pseudoxml. + goto end +) + +if "%1" == "dummy" ( + %SPHINXBUILD% -b dummy %ALLSPHINXOPTS% %BUILDDIR%/dummy + if errorlevel 1 exit /b 1 + echo. + echo.Build finished. Dummy builder generates no files. + goto end +) + +:end diff --git a/engines/cpu.c b/engines/cpu.c new file mode 100644 index 0000000..4d572b4 --- /dev/null +++ b/engines/cpu.c @@ -0,0 +1,123 @@ +/* + * CPU engine + * + * Doesn't transfer any data, merely burns CPU cycles according to + * the settings. + * + */ +#include "../fio.h" +#include "../optgroup.h" + +struct cpu_options { + void *pad; + unsigned int cpuload; + unsigned int cpucycle; + unsigned int exit_io_done; +}; + +static struct fio_option options[] = { + { + .name = "cpuload", + .lname = "CPU load", + .type = FIO_OPT_INT, + .off1 = offsetof(struct cpu_options, cpuload), + .help = "Use this percentage of CPU", + .category = FIO_OPT_C_ENGINE, + .group = FIO_OPT_G_INVALID, + }, + { + .name = "cpuchunks", + .lname = "CPU chunk", + .type = FIO_OPT_INT, + .off1 = offsetof(struct cpu_options, cpucycle), + .help = "Length of the CPU burn cycles (usecs)", + .def = "50000", + .parent = "cpuload", + .hide = 1, + .category = FIO_OPT_C_ENGINE, + .group = FIO_OPT_G_INVALID, + }, + { + .name = "exit_on_io_done", + .lname = "Exit when IO threads are done", + .type = FIO_OPT_BOOL, + .off1 = offsetof(struct cpu_options, exit_io_done), + .help = "Exit when IO threads finish", + .def = "0", + .category = FIO_OPT_C_ENGINE, + .group = FIO_OPT_G_INVALID, + }, + { + .name = NULL, + }, +}; + + +static enum fio_q_status fio_cpuio_queue(struct thread_data *td, + struct io_u fio_unused *io_u) +{ + struct cpu_options *co = td->eo; + + if (co->exit_io_done && !fio_running_or_pending_io_threads()) { + td->done = 1; + return FIO_Q_BUSY; + } + + usec_spin(co->cpucycle); + return FIO_Q_COMPLETED; +} + +static int fio_cpuio_init(struct thread_data *td) +{ + struct thread_options *o = &td->o; + struct cpu_options *co = td->eo; + + if (!co->cpuload) { + td_vmsg(td, EINVAL, "cpu thread needs rate (cpuload=)","cpuio"); + return 1; + } + + if (co->cpuload > 100) + co->cpuload = 100; + + /* + * set thinktime_sleep and thinktime_spin appropriately + */ + o->thinktime_blocks = 1; + o->thinktime_spin = 0; + o->thinktime = ((unsigned long long) co->cpucycle * (100 - co->cpuload)) / co->cpuload; + + o->nr_files = o->open_files = 1; + + log_info("%s: ioengine=%s, cpuload=%u, 
cpucycle=%u\n",
+ td->o.name, td->io_ops->name, co->cpuload, co->cpucycle);
+
+ return 0;
+}
+
+static int fio_cpuio_open(struct thread_data fio_unused *td,
+ struct fio_file fio_unused *f)
+{
+ return 0;
+}
+
+static struct ioengine_ops ioengine = {
+ .name = "cpuio",
+ .version = FIO_IOOPS_VERSION,
+ .queue = fio_cpuio_queue,
+ .init = fio_cpuio_init,
+ .open_file = fio_cpuio_open,
+ .flags = FIO_SYNCIO | FIO_DISKLESSIO | FIO_NOIO,
+ .options = options,
+ .option_struct_size = sizeof(struct cpu_options),
+};
+
+static void fio_init fio_cpuio_register(void)
+{
+ register_ioengine(&ioengine);
+}
+
+static void fio_exit fio_cpuio_unregister(void)
+{
+ unregister_ioengine(&ioengine);
+}
diff --git a/engines/dev-dax.c b/engines/dev-dax.c
new file mode 100644
index 0000000..422ea63
--- /dev/null
+++ b/engines/dev-dax.c
@@ -0,0 +1,351 @@
+/*
+ * device DAX engine
+ *
+ * IO engine that reads/writes from files by doing memcpy to/from
+ * a memory mapped region of a DAX enabled device.
+ *
+ * Copyright (C) 2016 Intel Corp
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License,
+ * version 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ */
+
+/*
+ * device dax engine
+ * IO engine that accesses a DAX device directly to read and write data
+ *
+ * To use:
+ * ioengine=dev-dax
+ *
+ * Other relevant settings:
+ * iodepth=1
+ * direct=0 REQUIRED
+ * filename=/dev/daxN.N
+ * bs=2m
+ *
+ * direct should be left at 0. Using dev-dax implies that memory access
+ * is direct. However, dev-dax does not support the O_DIRECT flag by
+ * design since it is not necessary.
+ *
+ * At a minimum, bs should adhere to the device dax alignment.
+ *
+ * libpmem.so
+ * By default, the dev-dax engine will let the system find the libpmem.so
+ * that it uses. You can use an alternative libpmem by setting the
+ * FIO_PMEM_LIB environment variable to the full path to the desired
+ * libpmem.so.
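+ *
+ * An illustrative job along the lines described above (the device name
+ * and values are examples, not defaults):
+ *
+ *   [dev-dax-write]
+ *   ioengine=dev-dax
+ *   rw=write
+ *   filename=/dev/dax0.0
+ *   bs=2m
+ *   iodepth=1
+ *   direct=0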
+ */
+
+#include <sys/mman.h>
+#include <sys/stat.h>
+#include <sys/sysmacros.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <string.h>
+#include <libpmem.h>
+
+#include "../fio.h"
+#include "../verify.h"
+
+/*
+ * Limits us to 1GiB of mapped files in total to model after
+ * mmap engine behavior
+ */
+#define MMAP_TOTAL_SZ (1 * 1024 * 1024 * 1024UL)
+
+struct fio_devdax_data {
+ void *devdax_ptr;
+ size_t devdax_sz;
+ off_t devdax_off;
+};
+
+static int fio_devdax_file(struct thread_data *td, struct fio_file *f,
+ size_t length, off_t off)
+{
+ struct fio_devdax_data *fdd = FILE_ENG_DATA(f);
+ int flags = 0;
+
+ if (td_rw(td))
+ flags = PROT_READ | PROT_WRITE;
+ else if (td_write(td)) {
+ flags = PROT_WRITE;
+
+ if (td->o.verify != VERIFY_NONE)
+ flags |= PROT_READ;
+ } else
+ flags = PROT_READ;
+
+ fdd->devdax_ptr = mmap(NULL, length, flags, MAP_SHARED, f->fd, off);
+ if (fdd->devdax_ptr == MAP_FAILED) {
+ fdd->devdax_ptr = NULL;
+ td_verror(td, errno, "mmap");
+ }
+
+ if (td->error && fdd->devdax_ptr)
+ munmap(fdd->devdax_ptr, length);
+
+ return td->error;
+}
+
+/*
+ * Just mmap an appropriate portion, we cannot mmap the full extent
+ */
+static int fio_devdax_prep_limited(struct thread_data *td, struct io_u *io_u)
+{
+ struct fio_file *f = io_u->file;
+ struct fio_devdax_data *fdd = FILE_ENG_DATA(f);
+
+ if (io_u->buflen > f->real_file_size) {
+ log_err("dev-dax: bs too big for dev-dax engine\n");
+ return EIO;
+ }
+
+ fdd->devdax_sz = min(MMAP_TOTAL_SZ, f->real_file_size);
+ if (fdd->devdax_sz > f->io_size)
+ fdd->devdax_sz = f->io_size;
+
+ fdd->devdax_off = io_u->offset;
+
+ return fio_devdax_file(td, f, fdd->devdax_sz, fdd->devdax_off);
+}
+
+/*
+ * Attempt to mmap the entire file
+ */
+static int fio_devdax_prep_full(struct thread_data *td, struct io_u *io_u)
+{
+ struct fio_file *f = io_u->file;
+ struct fio_devdax_data *fdd = FILE_ENG_DATA(f);
+ int ret;
+
+ if (fio_file_partial_mmap(f))
+ return EINVAL;
+
+ if (io_u->offset != (size_t) io_u->offset ||
+ f->io_size != (size_t) f->io_size) {
+ fio_file_set_partial_mmap(f);
+ return EINVAL;
+ }
+
+ fdd->devdax_sz = f->io_size;
+ fdd->devdax_off = 0;
+
+ ret = fio_devdax_file(td, f, fdd->devdax_sz, fdd->devdax_off);
+ if (ret)
+ fio_file_set_partial_mmap(f);
+
+ return ret;
+}
+
+static int fio_devdax_prep(struct thread_data *td, struct io_u *io_u)
+{
+ struct fio_file *f = io_u->file;
+ struct fio_devdax_data *fdd = FILE_ENG_DATA(f);
+ int ret;
+
+ /*
+ * It fits within existing mapping, use it
+ */
+ if (io_u->offset >= fdd->devdax_off &&
+ io_u->offset + io_u->buflen <= fdd->devdax_off + fdd->devdax_sz)
+ goto done;
+
+ /*
+ * unmap any existing mapping
+ */
+ if (fdd->devdax_ptr) {
+ if (munmap(fdd->devdax_ptr, fdd->devdax_sz) < 0)
+ return errno;
+ fdd->devdax_ptr = NULL;
+ }
+
+ if (fio_devdax_prep_full(td, io_u)) {
+ td_clear_error(td);
+ ret = fio_devdax_prep_limited(td, io_u);
+ if (ret)
+ return ret;
+ }
+
+done:
+ io_u->mmap_data = fdd->devdax_ptr + io_u->offset - fdd->devdax_off -
+ f->file_offset;
+ return 0;
+}
+
+static enum fio_q_status fio_devdax_queue(struct thread_data *td,
+ struct io_u *io_u)
+{
+ fio_ro_check(td, io_u);
+ io_u->error = 0;
+
+ switch (io_u->ddir) {
+ case DDIR_READ:
+ memcpy(io_u->xfer_buf, io_u->mmap_data, io_u->xfer_buflen);
+ break;
+ case DDIR_WRITE:
+ pmem_memcpy_persist(io_u->mmap_data, io_u->xfer_buf,
+ io_u->xfer_buflen);
+ break;
+ case DDIR_SYNC:
+ case DDIR_DATASYNC:
+ case DDIR_SYNC_FILE_RANGE:
+ break;
+ default:
+ io_u->error = EINVAL;
+ break;
+ }
+
+ return FIO_Q_COMPLETED;
+}
+
+static int fio_devdax_init(struct thread_data *td)
+{
+ struct thread_options *o = &td->o;
+
+ if ((o->rw_min_bs & page_mask) &&
+ (o->fsync_blocks || o->fdatasync_blocks)) {
+ log_err("dev-dax: mmap options dictate a minimum block size of %llu bytes\n",
+ (unsigned long long) page_size);
+ return 1;
+ }
+
+ return 0;
+}
+
+static int fio_devdax_open_file(struct thread_data *td, struct fio_file *f)
+{
+ struct fio_devdax_data *fdd;
+ int ret;
+
+ ret = generic_open_file(td, f);
+ if (ret)
+ return ret;
+
+ fdd = calloc(1, sizeof(*fdd));
+ if (!fdd) {
+ int fio_unused __ret;
+ __ret = generic_close_file(td, f);
+ return 1;
+ }
+
+ FILE_SET_ENG_DATA(f, fdd);
+
+ return 0;
+}
+
+static int fio_devdax_close_file(struct thread_data *td, struct fio_file *f)
+{
+ struct fio_devdax_data *fdd = FILE_ENG_DATA(f);
+
+ FILE_SET_ENG_DATA(f, NULL);
+ free(fdd);
+ fio_file_clear_partial_mmap(f);
+
+ return generic_close_file(td, f);
+}
+
+static int
+fio_devdax_get_file_size(struct thread_data *td, struct fio_file *f)
+{
+ char spath[PATH_MAX];
+ char npath[PATH_MAX];
+ char *rpath, *basename;
+ FILE *sfile;
+ uint64_t size;
+ struct stat st;
+ int rc;
+
+ if (fio_file_size_known(f))
+ return 0;
+
+ if (f->filetype != FIO_TYPE_CHAR)
+ return -EINVAL;
+
+ rc = stat(f->file_name, &st);
+ if (rc < 0) {
+ log_err("%s: failed to stat file %s (%s)\n",
+ td->o.name, f->file_name, strerror(errno));
+ return -errno;
+ }
+
+ snprintf(spath, PATH_MAX, "/sys/dev/char/%d:%d/subsystem",
+ major(st.st_rdev), minor(st.st_rdev));
+
+ rpath = realpath(spath, npath);
+ if (!rpath) {
+ log_err("%s: realpath on %s failed (%s)\n",
+ td->o.name, spath, strerror(errno));
+ return -errno;
+ }
+
+ /* check if DAX device */
+ basename = strrchr(rpath, '/');
+ if (!basename || strcmp("dax", basename+1)) {
+ log_err("%s: %s not a DAX device!\n",
+ td->o.name, f->file_name);
+ }
+
+ snprintf(spath, PATH_MAX, "/sys/dev/char/%d:%d/size",
+ major(st.st_rdev), minor(st.st_rdev));
+
+ sfile = fopen(spath, "r");
+ if (!sfile) {
+ log_err("%s: fopen on %s failed (%s)\n",
+ td->o.name, spath, strerror(errno));
+ return 1;
+ }
+
+ rc = fscanf(sfile, "%lu", &size);
+ if (rc < 0) {
+ log_err("%s: fscanf on %s failed (%s)\n",
+ td->o.name, spath, strerror(errno));
+ fclose(sfile);
+ return 1;
+ }
+
+ f->real_file_size = size;
+
+ fclose(sfile);
+
+ if (f->file_offset > f->real_file_size) {
+ log_err("%s: offset extends end (%llu > %llu)\n", td->o.name,
+ (unsigned long long) f->file_offset,
+ (unsigned long long) f->real_file_size);
+ return 1;
+ }
+
+ fio_file_set_size_known(f);
+ return 0;
+}
+
+static struct ioengine_ops ioengine = {
+ .name = "dev-dax",
+ .version = FIO_IOOPS_VERSION,
+ .init = fio_devdax_init,
+ .prep = fio_devdax_prep,
+ .queue = fio_devdax_queue,
+ .open_file = fio_devdax_open_file,
+ .close_file = fio_devdax_close_file,
+ .get_file_size = fio_devdax_get_file_size,
+ .flags = FIO_SYNCIO | FIO_DISKLESSIO | FIO_NOEXTEND | FIO_NODISKUTIL,
+};
+
+static void fio_init fio_devdax_register(void)
+{
+ register_ioengine(&ioengine);
+}
+
+static void fio_exit fio_devdax_unregister(void)
+{
+ unregister_ioengine(&ioengine);
+}
diff --git a/engines/e4defrag.c b/engines/e4defrag.c
new file mode 100644
index 0000000..8f71d02
--- /dev/null
+++ b/engines/e4defrag.c
@@ -0,0 +1,215 @@
+/*
+ * ioe_e4defrag: ioengine for git://git.kernel.dk/fio.git
+ *
+ * IO engine that does regular EXT4_IOC_MOVE_EXT ioctls to simulate
+ * defragment activity
+ *
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <errno.h>
+#include <sys/ioctl.h>
+
+#include "../fio.h"
+#include "../optgroup.h"
+
+#ifndef EXT4_IOC_MOVE_EXT
+#define EXT4_IOC_MOVE_EXT _IOWR('f', 15, struct move_extent)
+struct move_extent {
+ __u32 reserved; /* should be zero */
+ __u32 donor_fd; /* donor file descriptor */
+ __u64 orig_start; /* logical start offset in block for orig */
+ __u64 donor_start; /* logical start offset in block for donor */
+ __u64 len; /* block length to be moved */
+ __u64 moved_len; /* moved block length */
+};
+#endif
+
+struct e4defrag_data {
+ int donor_fd;
+ int bsz;
+};
+
+struct e4defrag_options {
+ void *pad;
+ unsigned int inplace;
+ char *donor_name;
+};
+
+static struct fio_option options[] = {
+ {
+ .name = "donorname",
+ .lname = "Donor Name",
+ .type = FIO_OPT_STR_STORE,
+ .off1 = offsetof(struct e4defrag_options, donor_name),
+ .help = "File used as a block donor",
+ .category = FIO_OPT_C_ENGINE,
+ .group = FIO_OPT_G_E4DEFRAG,
+ },
+ {
+ .name = "inplace",
+ .lname = "In Place",
+ .type = FIO_OPT_INT,
+ .off1 = offsetof(struct e4defrag_options, inplace),
+ .minval = 0,
+ .maxval = 1,
+ .help = "Alloc and free space inside defrag event",
+ .category = FIO_OPT_C_ENGINE,
+ .group = FIO_OPT_G_E4DEFRAG,
+ },
+ {
+ .name = NULL,
+ },
+};
+
+static int fio_e4defrag_init(struct thread_data *td)
+{
+ int r, len = 0;
+ struct e4defrag_options *o = td->eo;
+ struct e4defrag_data *ed;
+ struct stat stub;
+ char donor_name[PATH_MAX];
+
+ if (!strlen(o->donor_name)) {
+ log_err("'donorname' option is required\n");
+ return 1;
+ }
+
+ ed = malloc(sizeof(*ed));
+ if (!ed) {
+ td_verror(td, ENOMEM, "io_queue_init");
+ return 1;
+ }
+ memset(ed, 0, sizeof(*ed));
+
+ if (td->o.directory)
+ len = sprintf(donor_name, "%s/", td->o.directory);
+ sprintf(donor_name + len, "%s", o->donor_name);
+
+ ed->donor_fd = open(donor_name, O_CREAT|O_WRONLY, 0644);
+ if (ed->donor_fd < 0) {
+ td_verror(td, errno, "io_queue_init");
+ log_err("Can't open donor file %s err:%d\n", donor_name, ed->donor_fd);
+ free(ed);
+ return 1;
+ }
+
+ if (!o->inplace) {
+ long long __len = td->o.file_size_high - td->o.start_offset;
+ r = fallocate(ed->donor_fd, 0, td->o.start_offset, __len);
+ if (r)
+ goto err;
+ }
+ r = fstat(ed->donor_fd, &stub);
+ if (r)
+ goto err;
+
+ ed->bsz = stub.st_blksize;
+ td->io_ops_data = ed;
+ return 0;
+err:
+ td_verror(td, errno, "io_queue_init");
+ close(ed->donor_fd);
+ free(ed);
+ return 1;
+}
+
+static void fio_e4defrag_cleanup(struct thread_data *td)
+{
+ struct e4defrag_data *ed = td->io_ops_data;
+ if (ed) {
+ if (ed->donor_fd >= 0)
+ close(ed->donor_fd);
+ free(ed);
+ }
+}
+
+
+static enum fio_q_status fio_e4defrag_queue(struct thread_data *td,
+ struct io_u *io_u)
+{
+
+ int ret;
+ unsigned long long len;
+ struct move_extent me;
+ struct fio_file *f = io_u->file;
+ struct e4defrag_data *ed = td->io_ops_data;
+ struct e4defrag_options *o = td->eo;
+
+ fio_ro_check(td, io_u);
+
+ /* Theoretically defragmentation should not change data, but it
+ * changes data layout.
So this function handles only DDIR_WRITE
+ * in order to satisfy the strict read-only access pattern
+ */
+ if (io_u->ddir != DDIR_WRITE) {
+ io_u->error = EINVAL;
+ return FIO_Q_COMPLETED;
+ }
+
+ if (o->inplace) {
+ ret = fallocate(ed->donor_fd, 0, io_u->offset, io_u->xfer_buflen);
+ if (ret)
+ goto out;
+ }
+
+ memset(&me, 0, sizeof(me));
+ me.donor_fd = ed->donor_fd;
+ me.orig_start = io_u->offset / ed->bsz;
+ me.donor_start = me.orig_start;
+ len = (io_u->offset + io_u->xfer_buflen + ed->bsz -1);
+ me.len = len / ed->bsz - me.orig_start;
+
+ ret = ioctl(f->fd, EXT4_IOC_MOVE_EXT, &me);
+ len = me.moved_len * ed->bsz;
+
+ if (len > io_u->xfer_buflen)
+ len = io_u->xfer_buflen;
+
+ if (len != io_u->xfer_buflen) {
+ if (len) {
+ io_u->resid = io_u->xfer_buflen - len;
+ io_u->error = 0;
+ } else {
+ /* access beyond i_size */
+ io_u->error = EINVAL;
+ }
+ }
+ if (ret)
+ io_u->error = errno;
+
+ if (o->inplace)
+ ret = ftruncate(ed->donor_fd, 0);
+out:
+ if (ret && !io_u->error)
+ io_u->error = errno;
+
+ return FIO_Q_COMPLETED;
+}
+
+static struct ioengine_ops ioengine = {
+ .name = "e4defrag",
+ .version = FIO_IOOPS_VERSION,
+ .init = fio_e4defrag_init,
+ .queue = fio_e4defrag_queue,
+ .open_file = generic_open_file,
+ .close_file = generic_close_file,
+ .get_file_size = generic_get_file_size,
+ .flags = FIO_SYNCIO,
+ .cleanup = fio_e4defrag_cleanup,
+ .options = options,
+ .option_struct_size = sizeof(struct e4defrag_options),
+
+};
+
+static void fio_init fio_syncio_register(void)
+{
+ register_ioengine(&ioengine);
+}
+
+static void fio_exit fio_syncio_unregister(void)
+{
+ unregister_ioengine(&ioengine);
+}
diff --git a/engines/falloc.c b/engines/falloc.c
new file mode 100644
index 0000000..6382569
--- /dev/null
+++ b/engines/falloc.c
@@ -0,0 +1,112 @@
+/*
+ * falloc: ioengine for git://git.kernel.dk/fio.git
+ *
+ * IO engine that does regular fallocate to simulate data transfer
+ * as fio ioengine.
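+ * The mapping from fio data direction to fallocate mode is: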
+ * DDIR_READ does fallocate(,mode = FALLOC_FL_KEEP_SIZE,)
+ * DDIR_WRITE does fallocate(,mode = 0) : fallocate with size extension
+ * DDIR_TRIM does fallocate(,mode = FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE)
+ *
+ */
+#include <stdio.h>
+#include <stdlib.h>
+#include <fcntl.h>
+
+#include "../fio.h"
+#include "../filehash.h"
+
+/*
+ * generic_open_file() is not appropriate here because it does not allow us
+ * to perform TRIM on a file
+ */
+static int open_file(struct thread_data *td, struct fio_file *f)
+{
+ int from_hash = 0;
+
+ dprint(FD_FILE, "fd open %s\n", f->file_name);
+
+ if (f->filetype != FIO_TYPE_FILE) {
+ log_err("fio: only files are supported by fallocate\n");
+ return 1;
+ }
+ if (!strcmp(f->file_name, "-")) {
+ log_err("fio: can't read/write to stdin/out\n");
+ return 1;
+ }
+
+open_again:
+ from_hash = file_lookup_open(f, O_CREAT|O_RDWR);
+
+ if (f->fd == -1) {
+ char buf[FIO_VERROR_SIZE];
+ int e = errno;
+
+ snprintf(buf, sizeof(buf), "open(%s)", f->file_name);
+ td_verror(td, e, buf);
+ }
+
+ if (!from_hash && f->fd != -1) {
+ if (add_file_hash(f)) {
+ int fio_unused ret;
+
+ /*
+ * OK to ignore, we haven't done anything with it
+ */
+ ret = generic_close_file(td, f);
+ goto open_again;
+ }
+ }
+
+ return 0;
+}
+
+#ifndef FALLOC_FL_KEEP_SIZE
+#define FALLOC_FL_KEEP_SIZE 0x01 /* default is extend size */
+#endif
+#ifndef FALLOC_FL_PUNCH_HOLE
+#define FALLOC_FL_PUNCH_HOLE 0x02 /* de-allocates range */
+#endif
+
+static enum fio_q_status fio_fallocate_queue(struct thread_data *td,
+ struct io_u *io_u)
+{
+ struct fio_file *f = io_u->file;
+ int ret;
+ int flags = 0;
+
+ fio_ro_check(td, io_u);
+
+ if (io_u->ddir == DDIR_READ)
+ flags = FALLOC_FL_KEEP_SIZE;
+ else if (io_u->ddir == DDIR_WRITE)
+ flags = 0;
+ else if (io_u->ddir == DDIR_TRIM)
+ flags = FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE;
+
+ ret = fallocate(f->fd, flags, io_u->offset, io_u->xfer_buflen);
+
+ if (ret)
+ io_u->error = errno;
+
+ return FIO_Q_COMPLETED;
+}
+
+static struct ioengine_ops ioengine = {
+ .name = "falloc",
+ .version = FIO_IOOPS_VERSION,
+ .queue = fio_fallocate_queue,
+ .open_file = open_file,
+ .close_file = generic_close_file,
+ .get_file_size = generic_get_file_size,
+ .flags = FIO_SYNCIO
+};
+
+static void fio_init fio_syncio_register(void)
+{
+ register_ioengine(&ioengine);
+}
+
+static void fio_exit fio_syncio_unregister(void)
+{
+ unregister_ioengine(&ioengine);
+}
diff --git a/engines/filecreate.c b/engines/filecreate.c
new file mode 100644
index 0000000..5fec854
--- /dev/null
+++ b/engines/filecreate.c
@@ -0,0 +1,118 @@
+/*
+ * filecreate engine
+ *
+ * IO engine that doesn't do any IO, just creates files and tracks the latency
+ * of the file creation.
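+ *
+ * A sketch of a job using this engine (all values illustrative, not
+ * defaults):
+ *
+ *   [create-files]
+ *   ioengine=filecreate
+ *   nrfiles=1000
+ *   filesize=4k
+ *   openfiles=1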
+ */
+#include <stdlib.h>
+#include <fcntl.h>
+#include <errno.h>
+
+#include "../fio.h"
+
+struct fc_data {
+ enum fio_ddir stat_ddir;
+};
+
+static int open_file(struct thread_data *td, struct fio_file *f)
+{
+ struct timespec start;
+ int do_lat = !td->o.disable_lat;
+
+ dprint(FD_FILE, "fd open %s\n", f->file_name);
+
+ if (f->filetype != FIO_TYPE_FILE) {
+ log_err("fio: only files are supported\n");
+ return 1;
+ }
+ if (!strcmp(f->file_name, "-")) {
+ log_err("fio: can't read/write to stdin/out\n");
+ return 1;
+ }
+
+ if (do_lat)
+ fio_gettime(&start, NULL);
+
+ f->fd = open(f->file_name, O_CREAT|O_RDWR, 0600);
+
+ if (f->fd == -1) {
+ char buf[FIO_VERROR_SIZE];
+ int e = errno;
+
+ snprintf(buf, sizeof(buf), "open(%s)", f->file_name);
+ td_verror(td, e, buf);
+ return 1;
+ }
+
+ if (do_lat) {
+ struct fc_data *data = td->io_ops_data;
+ uint64_t nsec;
+
+ nsec = ntime_since_now(&start);
+ add_clat_sample(td, data->stat_ddir, nsec, 0, 0, 0);
+ }
+
+ return 0;
+}
+
+static enum fio_q_status queue_io(struct thread_data *td,
+ struct io_u fio_unused *io_u)
+{
+ return FIO_Q_COMPLETED;
+}
+
+/*
+ * Ensure that we at least have a block size worth of IO to do for each
+ * file. If the job file has td->o.size < nr_files * block_size, then
+ * fio won't do anything.
+ */
+static int get_file_size(struct thread_data *td, struct fio_file *f)
+{
+ f->real_file_size = td_min_bs(td);
+ return 0;
+}
+
+static int init(struct thread_data *td)
+{
+ struct fc_data *data;
+
+ data = calloc(1, sizeof(*data));
+
+ if (td_read(td))
+ data->stat_ddir = DDIR_READ;
+ else if (td_write(td))
+ data->stat_ddir = DDIR_WRITE;
+
+ td->io_ops_data = data;
+ return 0;
+}
+
+static void cleanup(struct thread_data *td)
+{
+ struct fc_data *data = td->io_ops_data;
+
+ free(data);
+}
+
+static struct ioengine_ops ioengine = {
+ .name = "filecreate",
+ .version = FIO_IOOPS_VERSION,
+ .init = init,
+ .cleanup = cleanup,
+ .queue = queue_io,
+ .get_file_size = get_file_size,
+ .open_file = open_file,
+ .close_file = generic_close_file,
+ .flags = FIO_DISKLESSIO | FIO_SYNCIO | FIO_FAKEIO |
+ FIO_NOSTATS | FIO_NOFILEHASH,
+};
+
+static void fio_init fio_filecreate_register(void)
+{
+ register_ioengine(&ioengine);
+}
+
+static void fio_exit fio_filecreate_unregister(void)
+{
+ unregister_ioengine(&ioengine);
+}
diff --git a/engines/filestat.c b/engines/filestat.c
new file mode 100644
index 0000000..405f028
--- /dev/null
+++ b/engines/filestat.c
@@ -0,0 +1,190 @@
+/*
+ * filestat engine
+ *
+ * IO engine that doesn't do any IO, just stats files and tracks the latency
+ * of the file stat.
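+ *
+ * A sketch of a job using this engine (values illustrative); the
+ * stat_type option below selects which system call is timed:
+ *
+ *   [stat-files]
+ *   ioengine=filestat
+ *   stat_type=lstat
+ *   nrfiles=1000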
+ */
+#include <stdlib.h>
+#include <fcntl.h>
+#include <errno.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <unistd.h>
+#include <string.h>
+#include "../fio.h"
+#include "../optgroup.h"
+#include "../oslib/statx.h"
+
+struct fc_data {
+ enum fio_ddir stat_ddir;
+};
+
+struct filestat_options {
+ void *pad;
+ unsigned int stat_type;
+};
+
+enum {
+ FIO_FILESTAT_STAT = 1,
+ FIO_FILESTAT_LSTAT = 2,
+ FIO_FILESTAT_STATX = 3,
+};
+
+static struct fio_option options[] = {
+ {
+ .name = "stat_type",
+ .lname = "stat_type",
+ .type = FIO_OPT_STR,
+ .off1 = offsetof(struct filestat_options, stat_type),
+ .help = "Specify stat system call type to measure lookup/getattr performance",
+ .def = "stat",
+ .posval = {
+ { .ival = "stat",
+ .oval = FIO_FILESTAT_STAT,
+ .help = "Use stat(2)",
+ },
+ { .ival = "lstat",
+ .oval = FIO_FILESTAT_LSTAT,
+ .help = "Use lstat(2)",
+ },
+ { .ival = "statx",
+ .oval = FIO_FILESTAT_STATX,
+ .help = "Use statx(2) if exists",
+ },
+ },
+ .category = FIO_OPT_C_ENGINE,
+ .group = FIO_OPT_G_FILESTAT,
+ },
+ {
+ .name = NULL,
+ },
+};
+
+static int stat_file(struct thread_data *td, struct fio_file *f)
+{
+ struct filestat_options *o = td->eo;
+ struct timespec start;
+ int do_lat = !td->o.disable_lat;
+ struct stat statbuf;
+#ifndef WIN32
+ struct statx statxbuf;
+ char *abspath;
+#endif
+ int ret;
+
+ dprint(FD_FILE, "fd stat %s\n", f->file_name);
+
+ if (f->filetype != FIO_TYPE_FILE) {
+ log_err("fio: only files are supported\n");
+ return 1;
+ }
+ if (!strcmp(f->file_name, "-")) {
+ log_err("fio: can't read/write to stdin/out\n");
+ return 1;
+ }
+
+ if (do_lat)
+ fio_gettime(&start, NULL);
+
+ switch (o->stat_type){
+ case FIO_FILESTAT_STAT:
+ ret = stat(f->file_name, &statbuf);
+ break;
+ case FIO_FILESTAT_LSTAT:
+ ret = lstat(f->file_name, &statbuf);
+ break;
+ case FIO_FILESTAT_STATX:
+#ifndef WIN32
+ abspath = realpath(f->file_name, NULL);
+ if (abspath) {
+ ret = statx(-1, abspath, 0, STATX_ALL, &statxbuf);
+ free(abspath);
+ } else
+ ret = -1;
+#else
+ ret = -1;
+#endif
+ break;
+ default:
+ ret = -1;
+ break;
+ }
+
+ if (ret == -1) {
+ char buf[FIO_VERROR_SIZE];
+ int e = errno;
+
+ snprintf(buf, sizeof(buf), "stat(%s) type=%u", f->file_name,
+ o->stat_type);
+ td_verror(td, e, buf);
+ return 1;
+ }
+
+ if (do_lat) {
+ struct fc_data *data = td->io_ops_data;
+ uint64_t nsec;
+
+ nsec = ntime_since_now(&start);
+ add_clat_sample(td, data->stat_ddir, nsec, 0, 0, 0);
+ }
+
+ return 0;
+}
+
+static enum fio_q_status queue_io(struct thread_data *td, struct io_u fio_unused *io_u)
+{
+ return FIO_Q_COMPLETED;
+}
+
+static int init(struct thread_data *td)
+{
+ struct fc_data *data;
+
+ data = calloc(1, sizeof(*data));
+
+ if (td_read(td))
+ data->stat_ddir = DDIR_READ;
+ else if (td_write(td))
+ data->stat_ddir = DDIR_WRITE;
+
+ td->io_ops_data = data;
+ return 0;
+}
+
+static void cleanup(struct thread_data *td)
+{
+ struct fc_data *data = td->io_ops_data;
+
+ free(data);
+}
+
+static int stat_invalidate(struct thread_data *td, struct fio_file *f)
+{
+ /* do nothing because the file is never opened */
+ return 0;
+}
+
+static struct ioengine_ops ioengine = {
+ .name = "filestat",
+ .version = FIO_IOOPS_VERSION,
+ .init = init,
+ .cleanup = cleanup,
+ .queue = queue_io,
+ .invalidate = stat_invalidate,
+ .get_file_size = generic_get_file_size,
+ .open_file = stat_file,
+ .flags = FIO_SYNCIO | FIO_FAKEIO |
+ FIO_NOSTATS | FIO_NOFILEHASH,
+ .options = options,
+ .option_struct_size = sizeof(struct filestat_options),
+};
+
+static void fio_init fio_filestat_register(void)
+{
+ register_ioengine(&ioengine);
+}
+
+static void fio_exit fio_filestat_unregister(void)
diff --git a/engines/ftruncate.c b/engines/ftruncate.c
new file mode 100644
index 0000000..c7ad038
--- /dev/null
+++ b/engines/ftruncate.c
@@ -0,0 +1,52 @@
+/*
+ * ftruncate: ioengine for git://git.kernel.dk/fio.git
+ *
+ * IO engine that does regular truncates to simulate data transfer
+ * as a fio ioengine.
+ * DDIR_WRITE does ftruncate
+ *
+ */
+#include <errno.h>
+#include <unistd.h>
+
+#include "../fio.h"
+
+static enum fio_q_status fio_ftruncate_queue(struct thread_data *td,
+					     struct io_u *io_u)
+{
+	struct fio_file *f = io_u->file;
+	int ret;
+
+	fio_ro_check(td, io_u);
+
+	if (io_u->ddir != DDIR_WRITE) {
+		io_u->error = EINVAL;
+		return FIO_Q_COMPLETED;
+	}
+
+	ret = ftruncate(f->fd, io_u->offset);
+	if (ret)
+		io_u->error = errno;
+
+	return FIO_Q_COMPLETED;
+}
+
+static struct ioengine_ops ioengine = {
+	.name = "ftruncate",
+	.version = FIO_IOOPS_VERSION,
+	.queue = fio_ftruncate_queue,
+	.open_file = generic_open_file,
+	.close_file = generic_close_file,
+	.get_file_size = generic_get_file_size,
+	.flags = FIO_SYNCIO | FIO_FAKEIO
+};
+
+static void fio_init fio_syncio_register(void)
+{
+	register_ioengine(&ioengine);
+}
+
+static void fio_exit fio_syncio_unregister(void)
+{
+	unregister_ioengine(&ioengine);
+}
diff --git a/engines/gfapi.h b/engines/gfapi.h
new file mode 100644
index 0000000..e4cdbcb
--- /dev/null
+++ b/engines/gfapi.h
@@ -0,0 +1,23 @@
+#include <glusterfs/api/glfs.h>
+#include "../fio.h"
+
+struct gf_options {
+	void *pad;
+	char *gf_vol;
+	char *gf_brick;
+	int gf_single_instance;
+};
+
+struct gf_data {
+	glfs_t *fs;
+	glfs_fd_t *fd;
+	struct io_u **aio_events;
+};
+
+extern struct fio_option gfapi_options[];
+extern int fio_gf_setup(struct thread_data *td);
+extern void fio_gf_cleanup(struct thread_data *td);
+extern int fio_gf_get_file_size(struct thread_data *td, struct fio_file *f);
+extern int fio_gf_open_file(struct thread_data *td, struct fio_file *f);
+extern int fio_gf_close_file(struct thread_data *td, struct fio_file *f);
+extern int fio_gf_unlink_file(struct thread_data *td, struct fio_file *f);
diff --git a/engines/glusterfs.c b/engines/glusterfs.c
new file mode 100644
index 0000000..f2b84a2
--- /dev/null
+++ b/engines/glusterfs.c
@@ -0,0 +1,435 @@
+/*
+ * glusterfs engine
+ *
+ * common GlusterFS gfapi interface
+ *
+ */
+
+#include "gfapi.h"
+#include "../optgroup.h"
+
+struct fio_option gfapi_options[] = {
+	{
+		.name = "volume",
+		.lname = "Glusterfs volume",
+		.type = FIO_OPT_STR_STORE,
+		.help = "Name of the Glusterfs volume",
+		.off1 = offsetof(struct gf_options, gf_vol),
+		.category = FIO_OPT_C_ENGINE,
+		.group = FIO_OPT_G_GFAPI,
+	},
+	{
+		.name = "brick",
+		.lname = "Glusterfs brick name",
+		.type = FIO_OPT_STR_STORE,
+		.help = "Name of the Glusterfs brick to connect",
+		.off1 = offsetof(struct gf_options, gf_brick),
+		.category = FIO_OPT_C_ENGINE,
+		.group = FIO_OPT_G_GFAPI,
+	},
+	{
+		.name = "single-instance",
+		.lname = "Single glusterfs instance",
+		.type = FIO_OPT_BOOL,
+		.help = "Only one glusterfs instance",
+		.off1 = offsetof(struct gf_options, gf_single_instance),
+		.category = FIO_OPT_C_ENGINE,
+		.group = FIO_OPT_G_GFAPI,
+	},
+	{
+		.name = NULL,
+	},
+};
+
+struct glfs_info {
+	struct flist_head list;
+	char *volume;
+	char *brick;
+	glfs_t *fs;
+	int refcount;
+};
+
+static pthread_mutex_t glfs_lock = PTHREAD_MUTEX_INITIALIZER;
+static FLIST_HEAD(glfs_list_head);
+
+static glfs_t *fio_gf_new_fs(char *volume, char *brick)
+{
+	int r = 0;
+	glfs_t *fs;
+	struct stat sb = { 0, };
+
+	fs =
glfs_new(volume); + if (!fs) { + log_err("glfs_new failed.\n"); + goto out; + } + glfs_set_logging(fs, "/tmp/fio_gfapi.log", 7); + /* default to tcp */ + r = glfs_set_volfile_server(fs, "tcp", brick, 0); + if (r) { + log_err("glfs_set_volfile_server failed.\n"); + goto out; + } + r = glfs_init(fs); + if (r) { + log_err("glfs_init failed. Is glusterd running on brick?\n"); + goto out; + } + sleep(2); + r = glfs_lstat(fs, ".", &sb); + if (r) { + log_err("glfs_lstat failed.\n"); + goto out; + } + +out: + if (r) { + glfs_fini(fs); + fs = NULL; + } + return fs; +} + +static glfs_t *fio_gf_get_glfs(struct gf_options *opt, + char *volume, char *brick) +{ + struct glfs_info *glfs = NULL; + struct glfs_info *tmp; + struct flist_head *entry; + + if (!opt->gf_single_instance) + return fio_gf_new_fs(volume, brick); + + pthread_mutex_lock (&glfs_lock); + + flist_for_each(entry, &glfs_list_head) { + tmp = flist_entry(entry, struct glfs_info, list); + if (!strcmp(volume, tmp->volume) && + !strcmp(brick, tmp->brick)) { + glfs = tmp; + break; + } + } + + if (glfs) { + glfs->refcount++; + } else { + glfs = malloc(sizeof(*glfs)); + if (!glfs) + goto out; + INIT_FLIST_HEAD(&glfs->list); + glfs->refcount = 0; + glfs->volume = strdup(volume); + glfs->brick = strdup(brick); + glfs->fs = fio_gf_new_fs(volume, brick); + if (!glfs->fs) { + free(glfs); + glfs = NULL; + goto out; + } + + flist_add_tail(&glfs->list, &glfs_list_head); + glfs->refcount = 1; + } + +out: + pthread_mutex_unlock (&glfs_lock); + + if (glfs) + return glfs->fs; + return NULL; +} + +static void fio_gf_put_glfs(struct gf_options *opt, glfs_t *fs) +{ + struct glfs_info *glfs = NULL; + struct glfs_info *tmp; + struct flist_head *entry; + + if (!opt->gf_single_instance) { + glfs_fini(fs); + return; + } + + pthread_mutex_lock (&glfs_lock); + + flist_for_each(entry, &glfs_list_head) { + tmp = flist_entry(entry, struct glfs_info, list); + if (tmp->fs == fs) { + glfs = tmp; + break; + } + } + + if (!glfs) { + log_err("glfs not found to fini.\n"); + } else { + glfs->refcount--; + + if (glfs->refcount == 0) { + glfs_fini(glfs->fs); + free(glfs->volume); + free(glfs->brick); + flist_del(&glfs->list); + } + } + + pthread_mutex_unlock (&glfs_lock); +} + +int fio_gf_setup(struct thread_data *td) +{ + struct gf_data *g = NULL; + struct gf_options *opt = td->eo; + + dprint(FD_IO, "fio setup\n"); + + if (td->io_ops_data) + return 0; + + g = malloc(sizeof(struct gf_data)); + if (!g) { + log_err("malloc failed.\n"); + return -ENOMEM; + } + g->fd = NULL; + g->aio_events = NULL; + + g->fs = fio_gf_get_glfs(opt, opt->gf_vol, opt->gf_brick); + if (!g->fs) + goto cleanup; + + dprint(FD_FILE, "fio setup %p\n", g->fs); + td->io_ops_data = g; + return 0; +cleanup: + free(g); + td->io_ops_data = NULL; + return -EIO; +} + +void fio_gf_cleanup(struct thread_data *td) +{ + struct gf_data *g = td->io_ops_data; + + if (g) { + if (g->aio_events) + free(g->aio_events); + if (g->fd) + glfs_close(g->fd); + if (g->fs) + fio_gf_put_glfs(td->eo, g->fs); + free(g); + td->io_ops_data = NULL; + } +} + +int fio_gf_get_file_size(struct thread_data *td, struct fio_file *f) +{ + struct stat buf; + int ret; + struct gf_data *g = td->io_ops_data; + + dprint(FD_FILE, "get file size %s\n", f->file_name); + + if (!g || !g->fs) { + return 0; + } + if (fio_file_size_known(f)) + return 0; + + ret = glfs_lstat(g->fs, f->file_name, &buf); + if (ret < 0) { + log_err("glfs_lstat failed.\n"); + return ret; + } + + f->real_file_size = buf.st_size; + fio_file_set_size_known(f); + + return 0; + +} + +int 
fio_gf_open_file(struct thread_data *td, struct fio_file *f) +{ + + int flags = 0; + int ret = 0; + struct gf_data *g = td->io_ops_data; + struct stat sb = { 0, }; + + if (td_write(td)) { + if (!read_only) + flags = O_RDWR; + } else if (td_read(td)) { + if (!read_only) + flags = O_RDWR; + else + flags = O_RDONLY; + } + + if (td->o.odirect) + flags |= OS_O_DIRECT; + if (td->o.sync_io) + flags |= O_SYNC; + + dprint(FD_FILE, "fio file %s open mode %s td rw %s\n", f->file_name, + flags & O_RDONLY ? "ro" : "rw", td_read(td) ? "read" : "write"); + g->fd = glfs_creat(g->fs, f->file_name, flags, 0644); + if (!g->fd) { + ret = errno; + log_err("glfs_creat failed.\n"); + return ret; + } + /* file for read doesn't exist or shorter than required, create/extend it */ + if (td_read(td)) { + if (glfs_lstat(g->fs, f->file_name, &sb) + || sb.st_size < f->real_file_size) { + dprint(FD_FILE, "fio extend file %s from %jd to %" PRIu64 "\n", + f->file_name, (intmax_t) sb.st_size, f->real_file_size); +#if defined(CONFIG_GF_NEW_API) + ret = glfs_ftruncate(g->fd, f->real_file_size, NULL, NULL); +#else + ret = glfs_ftruncate(g->fd, f->real_file_size); +#endif + if (ret) { + log_err("failed fio extend file %s to %" PRIu64 "\n", + f->file_name, f->real_file_size); + } else { + unsigned long long left; + unsigned int bs; + char *b; + int r; + + /* fill the file, copied from extend_file */ + b = malloc(td->o.max_bs[DDIR_WRITE]); + + left = f->real_file_size; + while (left && !td->terminate) { + bs = td->o.max_bs[DDIR_WRITE]; + if (bs > left) + bs = left; + + fill_io_buffer(td, b, bs, bs); + + r = glfs_write(g->fd, b, bs, 0); + dprint(FD_IO, + "fio write %d of %" PRIu64 " file %s\n", + r, f->real_file_size, + f->file_name); + + if (r > 0) { + left -= r; + continue; + } else { + if (r < 0) { + int __e = errno; + + if (__e == ENOSPC) { + if (td->o. 
+ fill_device) + break; + log_info + ("fio: ENOSPC on laying out " + "file, stopping\n"); + break; + } + td_verror(td, errno, + "write"); + } else + td_verror(td, EIO, + "write"); + + break; + } + } + + if (b) + free(b); + glfs_lseek(g->fd, 0, SEEK_SET); + + if (td->terminate && td->o.unlink) { + dprint(FD_FILE, "terminate unlink %s\n", + f->file_name); + glfs_unlink(g->fs, f->file_name); + } else if (td->o.create_fsync) { +#if defined(CONFIG_GF_NEW_API) + if (glfs_fsync(g->fd, NULL, NULL) < 0) { +#else + if (glfs_fsync(g->fd) < 0) { +#endif + dprint(FD_FILE, + "failed to sync, close %s\n", + f->file_name); + td_verror(td, errno, "fsync"); + glfs_close(g->fd); + g->fd = NULL; + return 1; + } + } + } + } + } +#if defined(GFAPI_USE_FADVISE) + { + int r = 0; + if (td_random(td)) { + r = glfs_fadvise(g->fd, 0, f->real_file_size, + POSIX_FADV_RANDOM); + } else { + r = glfs_fadvise(g->fd, 0, f->real_file_size, + POSIX_FADV_SEQUENTIAL); + } + if (r) { + dprint(FD_FILE, "fio %p fadvise %s status %d\n", g->fs, + f->file_name, r); + } + } +#endif + dprint(FD_FILE, "fio %p created %s\n", g->fs, f->file_name); + f->fd = -1; + f->shadow_fd = -1; + td->o.open_files ++; + return ret; +} + +int fio_gf_close_file(struct thread_data *td, struct fio_file *f) +{ + int ret = 0; + struct gf_data *g = td->io_ops_data; + + dprint(FD_FILE, "fd close %s\n", f->file_name); + + if (g) { + if (g->fd && glfs_close(g->fd) < 0) + ret = errno; + g->fd = NULL; + } + + return ret; +} + +int fio_gf_unlink_file(struct thread_data *td, struct fio_file *f) +{ + int ret = 0; + struct gf_data *g = td->io_ops_data; + + dprint(FD_FILE, "fd unlink %s\n", f->file_name); + + if (g) { + if (g->fd && glfs_close(g->fd) < 0) + ret = errno; + + glfs_unlink(g->fs, f->file_name); + + if (g->fs) + glfs_fini(g->fs); + + g->fd = NULL; + free(g); + } + td->io_ops_data = NULL; + + return ret; +} diff --git a/engines/glusterfs_async.c b/engines/glusterfs_async.c new file mode 100644 index 0000000..0392ad6 --- /dev/null +++ b/engines/glusterfs_async.c @@ -0,0 +1,193 @@ +/* + * glusterfs engine + * + * IO engine using Glusterfs's gfapi async interface + * + */ +#include "gfapi.h" +#define NOT_YET 1 +struct fio_gf_iou { + struct io_u *io_u; + int io_complete; +}; + +static struct io_u *fio_gf_event(struct thread_data *td, int event) +{ + struct gf_data *gf_data = td->io_ops_data; + + dprint(FD_IO, "%s\n", __FUNCTION__); + return gf_data->aio_events[event]; +} + +static int fio_gf_getevents(struct thread_data *td, unsigned int min, + unsigned int max, const struct timespec *t) +{ + struct gf_data *g = td->io_ops_data; + unsigned int events = 0; + struct io_u *io_u; + int i; + + dprint(FD_IO, "%s\n", __FUNCTION__); + do { + io_u_qiter(&td->io_u_all, io_u, i) { + struct fio_gf_iou *io; + + if (!(io_u->flags & IO_U_F_FLIGHT)) + continue; + + io = io_u->engine_data; + if (io->io_complete) { + io->io_complete = 0; + g->aio_events[events] = io_u; + events++; + + if (events >= max) + break; + } + + } + if (events < min) + usleep(100); + else + break; + + } while (1); + + return events; +} + +static void fio_gf_io_u_free(struct thread_data *td, struct io_u *io_u) +{ + struct fio_gf_iou *io = io_u->engine_data; + + if (io) { + if (io->io_complete) + log_err("incomplete IO found.\n"); + io_u->engine_data = NULL; + free(io); + } +} + +static int fio_gf_io_u_init(struct thread_data *td, struct io_u *io_u) +{ + struct fio_gf_iou *io; + dprint(FD_FILE, "%s\n", __FUNCTION__); + + io = malloc(sizeof(struct fio_gf_iou)); + if (!io) { + td_verror(td, errno, "malloc"); 
+ return 1; + } + io->io_complete = 0; + io->io_u = io_u; + io_u->engine_data = io; + return 0; +} + +#if defined(CONFIG_GF_NEW_API) +static void gf_async_cb(glfs_fd_t * fd, ssize_t ret, struct glfs_stat *prestat, + struct glfs_stat *poststat, void *data) +#else +static void gf_async_cb(glfs_fd_t * fd, ssize_t ret, void *data) +#endif +{ + struct io_u *io_u = data; + struct fio_gf_iou *iou = io_u->engine_data; + + dprint(FD_IO, "%s ret %zd\n", __FUNCTION__, ret); + iou->io_complete = 1; +} + +static enum fio_q_status fio_gf_async_queue(struct thread_data fio_unused * td, + struct io_u *io_u) +{ + struct gf_data *g = td->io_ops_data; + int r; + + dprint(FD_IO, "%s op %s\n", __FUNCTION__, io_ddir_name(io_u->ddir)); + + fio_ro_check(td, io_u); + + if (io_u->ddir == DDIR_READ) + r = glfs_pread_async(g->fd, io_u->xfer_buf, io_u->xfer_buflen, + io_u->offset, 0, gf_async_cb, io_u); + else if (io_u->ddir == DDIR_WRITE) + r = glfs_pwrite_async(g->fd, io_u->xfer_buf, io_u->xfer_buflen, + io_u->offset, 0, gf_async_cb, io_u); +#if defined(CONFIG_GF_TRIM) + else if (io_u->ddir == DDIR_TRIM) + r = glfs_discard_async(g->fd, io_u->offset, io_u->xfer_buflen, + gf_async_cb, io_u); +#endif + else if (io_u->ddir == DDIR_DATASYNC) + r = glfs_fdatasync_async(g->fd, gf_async_cb, io_u); + else if (io_u->ddir == DDIR_SYNC) + r = glfs_fsync_async(g->fd, gf_async_cb, io_u); + else + r = EINVAL; + + if (r) { + log_err("glfs queue failed.\n"); + io_u->error = r; + goto failed; + } + return FIO_Q_QUEUED; + +failed: + io_u->error = r; + td_verror(td, io_u->error, "xfer"); + return FIO_Q_COMPLETED; +} + +static int fio_gf_async_setup(struct thread_data *td) +{ + struct gf_data *g; + int r; + +#if defined(NOT_YET) + log_err("the async interface is still very experimental...\n"); +#endif + r = fio_gf_setup(td); + if (r) + return r; + + td->o.use_thread = 1; + g = td->io_ops_data; + g->aio_events = calloc(td->o.iodepth, sizeof(struct io_u *)); + if (!g->aio_events) { + r = -ENOMEM; + fio_gf_cleanup(td); + return r; + } + + return r; +} + +static struct ioengine_ops ioengine = { + .name = "gfapi_async", + .version = FIO_IOOPS_VERSION, + .init = fio_gf_async_setup, + .cleanup = fio_gf_cleanup, + .queue = fio_gf_async_queue, + .open_file = fio_gf_open_file, + .close_file = fio_gf_close_file, + .unlink_file = fio_gf_unlink_file, + .get_file_size = fio_gf_get_file_size, + .getevents = fio_gf_getevents, + .event = fio_gf_event, + .io_u_init = fio_gf_io_u_init, + .io_u_free = fio_gf_io_u_free, + .options = gfapi_options, + .option_struct_size = sizeof(struct gf_options), + .flags = FIO_DISKLESSIO, +}; + +static void fio_init fio_gf_register(void) +{ + register_ioengine(&ioengine); +} + +static void fio_exit fio_gf_unregister(void) +{ + unregister_ioengine(&ioengine); +} diff --git a/engines/glusterfs_sync.c b/engines/glusterfs_sync.c new file mode 100644 index 0000000..de73261 --- /dev/null +++ b/engines/glusterfs_sync.c @@ -0,0 +1,107 @@ +/* + * glusterfs engine + * + * IO engine using Glusterfs's gfapi sync interface + * + */ + +#include "gfapi.h" + +#define LAST_POS(f) ((f)->engine_pos) +static int fio_gf_prep(struct thread_data *td, struct io_u *io_u) +{ + struct fio_file *f = io_u->file; + struct gf_data *g = td->io_ops_data; + + dprint(FD_FILE, "fio prep\n"); + + if (!ddir_rw(io_u->ddir)) + return 0; + + if (LAST_POS(f) != -1ULL && LAST_POS(f) == io_u->offset) + return 0; + + if (glfs_lseek(g->fd, io_u->offset, SEEK_SET) < 0) { + td_verror(td, errno, "lseek"); + return 1; + } + + return 0; +} + +static enum fio_q_status 
fio_gf_queue(struct thread_data *td, struct io_u *io_u) +{ + struct gf_data *g = td->io_ops_data; + int ret = 0; + + dprint(FD_FILE, "fio queue len %llu\n", io_u->xfer_buflen); + fio_ro_check(td, io_u); + + if (io_u->ddir == DDIR_READ) + ret = glfs_read(g->fd, io_u->xfer_buf, io_u->xfer_buflen, 0); + else if (io_u->ddir == DDIR_WRITE) + ret = glfs_write(g->fd, io_u->xfer_buf, io_u->xfer_buflen, 0); + else if (io_u->ddir == DDIR_SYNC) +#if defined(CONFIG_GF_NEW_API) + ret = glfs_fsync(g->fd, NULL, NULL); +#else + ret = glfs_fsync(g->fd); +#endif + else if (io_u->ddir == DDIR_DATASYNC) +#if defined(CONFIG_GF_NEW_API) + ret = glfs_fdatasync(g->fd, NULL, NULL); +#else + ret = glfs_fdatasync(g->fd); +#endif + else { + log_err("unsupported operation.\n"); + io_u->error = EINVAL; + return FIO_Q_COMPLETED; + } + dprint(FD_FILE, "fio len %llu ret %d\n", io_u->xfer_buflen, ret); + if (io_u->file && ret >= 0 && ddir_rw(io_u->ddir)) + LAST_POS(io_u->file) = io_u->offset + ret; + + if (ret != (int)io_u->xfer_buflen) { + if (ret >= 0) { + io_u->resid = io_u->xfer_buflen - ret; + io_u->error = 0; + return FIO_Q_COMPLETED; + } else + io_u->error = errno; + } + + if (io_u->error) { + log_err("IO failed.\n"); + td_verror(td, io_u->error, "xfer"); + } + + return FIO_Q_COMPLETED; + +} + +static struct ioengine_ops ioengine = { + .name = "gfapi", + .version = FIO_IOOPS_VERSION, + .init = fio_gf_setup, + .cleanup = fio_gf_cleanup, + .prep = fio_gf_prep, + .queue = fio_gf_queue, + .open_file = fio_gf_open_file, + .close_file = fio_gf_close_file, + .unlink_file = fio_gf_unlink_file, + .get_file_size = fio_gf_get_file_size, + .options = gfapi_options, + .option_struct_size = sizeof(struct gf_options), + .flags = FIO_SYNCIO | FIO_DISKLESSIO, +}; + +static void fio_init fio_gf_register(void) +{ + register_ioengine(&ioengine); +} + +static void fio_exit fio_gf_unregister(void) +{ + unregister_ioengine(&ioengine); +} diff --git a/engines/guasi.c b/engines/guasi.c new file mode 100644 index 0000000..cb26802 --- /dev/null +++ b/engines/guasi.c @@ -0,0 +1,270 @@ +/* + * guasi engine + * + * IO engine using the GUASI library. + * + * Before running make. 
You'll need the GUASI lib as well:
+ *
+ * http://www.xmailserver.org/guasi-lib.html
+ *
+ */
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <errno.h>
+#include <assert.h>
+
+#include "../fio.h"
+
+#define GFIO_MIN_THREADS 32
+#ifndef GFIO_MAX_THREADS
+#define GFIO_MAX_THREADS 2000
+#endif
+
+#include <guasi.h>
+#include <guasi_syscalls.h>
+
+#ifdef GFIO_DEBUG
+#define GDBG_PRINT(a) printf a
+#else
+#define GDBG_PRINT(a) (void) 0
+#endif
+
+struct guasi_data {
+	guasi_t hctx;
+	int max_reqs;
+	guasi_req_t *reqs;
+	struct io_u **io_us;
+	int queued_nr;
+	int reqs_nr;
+};
+
+static int fio_guasi_prep(struct thread_data fio_unused *td, struct io_u *io_u)
+{
+
+	GDBG_PRINT(("fio_guasi_prep(%p)\n", io_u));
+	io_u->greq = NULL;
+
+	return 0;
+}
+
+static struct io_u *fio_guasi_event(struct thread_data *td, int event)
+{
+	struct guasi_data *ld = td->io_ops_data;
+	struct io_u *io_u;
+	struct guasi_reqinfo rinf;
+
+	GDBG_PRINT(("fio_guasi_event(%d)\n", event));
+	if (guasi_req_info(ld->reqs[event], &rinf) < 0) {
+		log_err("guasi_req_info(%d) FAILED!\n", event);
+		return NULL;
+	}
+	io_u = rinf.asid;
+	io_u->error = EINPROGRESS;
+	GDBG_PRINT(("fio_guasi_event(%d) -> %p\n", event, io_u));
+	if (rinf.status == GUASI_STATUS_COMPLETE) {
+		io_u->error = rinf.result;
+		if (io_u->ddir == DDIR_READ ||
+		    io_u->ddir == DDIR_WRITE) {
+			io_u->error = 0;
+			if (rinf.result != (long) io_u->xfer_buflen) {
+				if (rinf.result >= 0)
+					io_u->resid = io_u->xfer_buflen - rinf.result;
+				else
+					io_u->error = rinf.error;
+			}
+		}
+	}
+
+	return io_u;
+}
+
+static int fio_guasi_getevents(struct thread_data *td, unsigned int min,
+			       unsigned int max, const struct timespec *t)
+{
+	struct guasi_data *ld = td->io_ops_data;
+	int n, r;
+	long timeo = -1;
+
+	GDBG_PRINT(("fio_guasi_getevents(%d, %d)\n", min, max));
+	if (min > ld->max_reqs)
+		min = ld->max_reqs;
+	if (max > ld->max_reqs)
+		max = ld->max_reqs;
+	if (t)
+		timeo = t->tv_sec * 1000L + t->tv_nsec / 1000000L;
+	for (n = 0; n < ld->reqs_nr; n++)
+		guasi_req_free(ld->reqs[n]);
+	n = 0;
+	do {
+		r = guasi_fetch(ld->hctx, ld->reqs + n, min - n,
+				max - n, timeo);
+		if (r < 0) {
+			log_err("guasi_fetch() FAILED! (%d)\n", r);
+			break;
+		}
+		n += r;
+		if (n >= min)
+			break;
+	} while (1);
+	ld->reqs_nr = n;
+	GDBG_PRINT(("fio_guasi_getevents() -> %d\n", n));
+
+	return n;
+}
+
+static enum fio_q_status fio_guasi_queue(struct thread_data *td,
+					 struct io_u *io_u)
+{
+	struct guasi_data *ld = td->io_ops_data;
+
+	fio_ro_check(td, io_u);
+
+	GDBG_PRINT(("fio_guasi_queue(%p)\n", io_u));
+	if (ld->queued_nr == (int) td->o.iodepth)
+		return FIO_Q_BUSY;
+
+	ld->io_us[ld->queued_nr] = io_u;
+	ld->queued_nr++;
+	return FIO_Q_QUEUED;
+}
+
+static void fio_guasi_queued(struct thread_data *td, struct io_u **io_us, int nr)
+{
+	int i;
+	struct io_u *io_u;
+	struct timespec now;
+
+	if (!fio_fill_issue_time(td))
+		return;
+
+	io_u_mark_submit(td, nr);
+	fio_gettime(&now, NULL);
+	for (i = 0; i < nr; i++) {
+		io_u = io_us[i];
+		memcpy(&io_u->issue_time, &now, sizeof(now));
+		io_u_queued(td, io_u);
+	}
+}
+
+static int fio_guasi_commit(struct thread_data *td)
+{
+	struct guasi_data *ld = td->io_ops_data;
+	int i;
+	struct io_u *io_u;
+	struct fio_file *f;
+
+	GDBG_PRINT(("fio_guasi_commit(%d)\n", ld->queued_nr));
+	for (i = 0; i < ld->queued_nr; i++) {
+		io_u = ld->io_us[i];
+		GDBG_PRINT(("fio_guasi_commit(%d) --> %p\n", i, io_u));
+		f = io_u->file;
+		io_u->greq = NULL;
+		if (io_u->ddir == DDIR_READ)
+			io_u->greq = guasi__pread(ld->hctx, ld, io_u, 0,
+					f->fd, io_u->xfer_buf, io_u->xfer_buflen,
+					io_u->offset);
+		else if (io_u->ddir == DDIR_WRITE)
+			io_u->greq = guasi__pwrite(ld->hctx, ld, io_u, 0,
+					f->fd, io_u->xfer_buf, io_u->xfer_buflen,
+					io_u->offset);
+		else if (ddir_sync(io_u->ddir))
+			io_u->greq = guasi__fsync(ld->hctx, ld, io_u, 0, f->fd);
+		else {
+			log_err("fio_guasi_commit() FAILED: unknown request %d\n",
+				io_u->ddir);
+		}
+		if (io_u->greq == NULL) {
+			log_err("fio_guasi_commit() FAILED: submit failed (%s)\n",
+				strerror(errno));
+			return -1;
+		}
+	}
+	fio_guasi_queued(td, ld->io_us, i);
+	ld->queued_nr = 0;
+	GDBG_PRINT(("fio_guasi_commit() -> %d\n", i));
+
+	return 0;
+}
+
+static int fio_guasi_cancel(struct thread_data fio_unused *td,
+			    struct io_u *io_u)
+{
+	GDBG_PRINT(("fio_guasi_cancel(%p) req=%p\n", io_u, io_u->greq));
+	if (io_u->greq != NULL)
+		guasi_req_cancel(io_u->greq);
+
+	return 0;
+}
+
+static void fio_guasi_cleanup(struct thread_data *td)
+{
+	struct guasi_data *ld = td->io_ops_data;
+	int n;
+
+	GDBG_PRINT(("fio_guasi_cleanup(%p)\n", ld));
+	if (ld) {
+		for (n = 0; n < ld->reqs_nr; n++)
+			guasi_req_free(ld->reqs[n]);
+		guasi_free(ld->hctx);
+		free(ld->reqs);
+		free(ld->io_us);
+		free(ld);
+	}
+	GDBG_PRINT(("fio_guasi_cleanup(%p) DONE\n", ld));
+}
+
+static int fio_guasi_init(struct thread_data *td)
+{
+	int maxthr;
+	struct guasi_data *ld = malloc(sizeof(*ld));
+
+	GDBG_PRINT(("fio_guasi_init(): depth=%d\n", td->o.iodepth));
+	memset(ld, 0, sizeof(*ld));
+	maxthr = td->o.iodepth > GFIO_MIN_THREADS ?
td->o.iodepth: GFIO_MIN_THREADS; + if (maxthr > GFIO_MAX_THREADS) + maxthr = GFIO_MAX_THREADS; + if ((ld->hctx = guasi_create(GFIO_MIN_THREADS, maxthr, 1)) == NULL) { + td_verror(td, errno, "guasi_create"); + free(ld); + return 1; + } + ld->max_reqs = td->o.iodepth; + ld->reqs = malloc(ld->max_reqs * sizeof(guasi_req_t)); + ld->io_us = malloc(ld->max_reqs * sizeof(struct io_u *)); + memset(ld->io_us, 0, ld->max_reqs * sizeof(struct io_u *)); + ld->queued_nr = 0; + ld->reqs_nr = 0; + + td->io_ops_data = ld; + GDBG_PRINT(("fio_guasi_init(): depth=%d -> %p\n", td->o.iodepth, ld)); + + return 0; +} + +static struct ioengine_ops ioengine = { + .name = "guasi", + .version = FIO_IOOPS_VERSION, + .init = fio_guasi_init, + .prep = fio_guasi_prep, + .queue = fio_guasi_queue, + .commit = fio_guasi_commit, + .cancel = fio_guasi_cancel, + .getevents = fio_guasi_getevents, + .event = fio_guasi_event, + .cleanup = fio_guasi_cleanup, + .open_file = generic_open_file, + .close_file = generic_close_file, + .get_file_size = generic_get_file_size, +}; + +static void fio_init fio_guasi_register(void) +{ + register_ioengine(&ioengine); +} + +static void fio_exit fio_guasi_unregister(void) +{ + unregister_ioengine(&ioengine); +} + diff --git a/engines/http.c b/engines/http.c new file mode 100644 index 0000000..275fcab --- /dev/null +++ b/engines/http.c @@ -0,0 +1,665 @@ +/* + * HTTP GET/PUT IO engine + * + * IO engine to perform HTTP(S) GET/PUT requests via libcurl-easy. + * + * Copyright (C) 2018 SUSE LLC + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License, + * version 2 as published by the Free Software Foundation.. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the Free + * Software Foundation, Inc., 51 Franklin Street, Fifth Floor, + * Boston, MA 02110-1301, USA. 
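+ *
+ * Illustrative usage, not part of the original patch: with the options
+ * defined below, a WebDAV smoke test could be run as
+ *
+ *   fio --name=http-test --ioengine=http --http_host=localhost \
+ *       --http_mode=webdav --size=4k --rw=randwrite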
+ */
+
+#include <pthread.h>
+#include <time.h>
+#include <curl/curl.h>
+#include <openssl/hmac.h>
+#include <openssl/md5.h>
+#include <openssl/sha.h>
+#include "fio.h"
+#include "../optgroup.h"
+
+
+enum {
+	FIO_HTTP_WEBDAV = 0,
+	FIO_HTTP_S3 = 1,
+	FIO_HTTP_SWIFT = 2,
+
+	FIO_HTTPS_OFF = 0,
+	FIO_HTTPS_ON = 1,
+	FIO_HTTPS_INSECURE = 2,
+};
+
+struct http_data {
+	CURL *curl;
+};
+
+struct http_options {
+	void *pad;
+	unsigned int https;
+	char *host;
+	char *user;
+	char *pass;
+	char *s3_key;
+	char *s3_keyid;
+	char *s3_region;
+	char *swift_auth_token;
+	int verbose;
+	unsigned int mode;
+};
+
+struct http_curl_stream {
+	char *buf;
+	size_t pos;
+	size_t max;
+};
+
+static struct fio_option options[] = {
+	{
+		.name = "https",
+		.lname = "https",
+		.type = FIO_OPT_STR,
+		.help = "Enable https",
+		.off1 = offsetof(struct http_options, https),
+		.def = "off",
+		.posval = {
+			{ .ival = "off",
+			  .oval = FIO_HTTPS_OFF,
+			  .help = "No HTTPS",
+			},
+			{ .ival = "on",
+			  .oval = FIO_HTTPS_ON,
+			  .help = "Enable HTTPS",
+			},
+			{ .ival = "insecure",
+			  .oval = FIO_HTTPS_INSECURE,
+			  .help = "Enable HTTPS, disable peer verification",
+			},
+		},
+		.category = FIO_OPT_C_ENGINE,
+		.group = FIO_OPT_G_HTTP,
+	},
+	{
+		.name = "http_host",
+		.lname = "http_host",
+		.type = FIO_OPT_STR_STORE,
+		.help = "Hostname (S3 bucket)",
+		.off1 = offsetof(struct http_options, host),
+		.def = "localhost",
+		.category = FIO_OPT_C_ENGINE,
+		.group = FIO_OPT_G_HTTP,
+	},
+	{
+		.name = "http_user",
+		.lname = "http_user",
+		.type = FIO_OPT_STR_STORE,
+		.help = "HTTP user name",
+		.off1 = offsetof(struct http_options, user),
+		.category = FIO_OPT_C_ENGINE,
+		.group = FIO_OPT_G_HTTP,
+	},
+	{
+		.name = "http_pass",
+		.lname = "http_pass",
+		.type = FIO_OPT_STR_STORE,
+		.help = "HTTP password",
+		.off1 = offsetof(struct http_options, pass),
+		.category = FIO_OPT_C_ENGINE,
+		.group = FIO_OPT_G_HTTP,
+	},
+	{
+		.name = "http_s3_key",
+		.lname = "S3 secret key",
+		.type = FIO_OPT_STR_STORE,
+		.help = "S3 secret key",
+		.off1 = offsetof(struct http_options, s3_key),
+		.def = "",
+		.category = FIO_OPT_C_ENGINE,
+		.group = FIO_OPT_G_HTTP,
+	},
+	{
+		.name = "http_s3_keyid",
+		.lname = "S3 key id",
+		.type = FIO_OPT_STR_STORE,
+		.help = "S3 key id",
+		.off1 = offsetof(struct http_options, s3_keyid),
+		.def = "",
+		.category = FIO_OPT_C_ENGINE,
+		.group = FIO_OPT_G_HTTP,
+	},
+	{
+		.name = "http_swift_auth_token",
+		.lname = "Swift auth token",
+		.type = FIO_OPT_STR_STORE,
+		.help = "OpenStack Swift auth token",
+		.off1 = offsetof(struct http_options, swift_auth_token),
+		.def = "",
+		.category = FIO_OPT_C_ENGINE,
+		.group = FIO_OPT_G_HTTP,
+	},
+	{
+		.name = "http_s3_region",
+		.lname = "S3 region",
+		.type = FIO_OPT_STR_STORE,
+		.help = "S3 region",
+		.off1 = offsetof(struct http_options, s3_region),
+		.def = "us-east-1",
+		.category = FIO_OPT_C_ENGINE,
+		.group = FIO_OPT_G_HTTP,
+	},
+	{
+		.name = "http_mode",
+		.lname = "Request mode to use",
+		.type = FIO_OPT_STR,
+		.help = "Whether to use WebDAV, Swift, or S3",
+		.off1 = offsetof(struct http_options, mode),
+		.def = "webdav",
+		.posval = {
+			{ .ival = "webdav",
+			  .oval = FIO_HTTP_WEBDAV,
+			  .help = "WebDAV server",
+			},
+			{ .ival = "s3",
+			  .oval = FIO_HTTP_S3,
+			  .help = "S3 storage backend",
+			},
+			{ .ival = "swift",
+			  .oval = FIO_HTTP_SWIFT,
+			  .help = "OpenStack Swift storage",
+			},
+		},
+		.category = FIO_OPT_C_ENGINE,
+		.group = FIO_OPT_G_HTTP,
+	},
+	{
+		.name = "http_verbose",
+		.lname = "HTTP verbosity level",
+		.type = FIO_OPT_INT,
+		.help = "increase http engine verbosity",
+		.off1 = offsetof(struct http_options, verbose),
+		.def = "0",
+		.category =
FIO_OPT_C_ENGINE, + .group = FIO_OPT_G_HTTP, + }, + { + .name = NULL, + }, +}; + +static char *_aws_uriencode(const char *uri) +{ + size_t bufsize = 1024; + char *r = malloc(bufsize); + char c; + int i, n; + const char *hex = "0123456789ABCDEF"; + + if (!r) { + log_err("malloc failed\n"); + return NULL; + } + + n = 0; + for (i = 0; (c = uri[i]); i++) { + if (n > bufsize-5) { + log_err("encoding the URL failed\n"); + return NULL; + } + + if ( (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z') + || (c >= '0' && c <= '9') || c == '_' || c == '-' + || c == '~' || c == '.' || c == '/') + r[n++] = c; + else { + r[n++] = '%'; + r[n++] = hex[(c >> 4 ) & 0xF]; + r[n++] = hex[c & 0xF]; + } + } + r[n++] = 0; + return r; +} + +static char *_conv_hex(const unsigned char *p, size_t len) +{ + char *r; + int i,n; + const char *hex = "0123456789abcdef"; + r = malloc(len * 2 + 1); + n = 0; + for (i = 0; i < len; i++) { + r[n++] = hex[(p[i] >> 4 ) & 0xF]; + r[n++] = hex[p[i] & 0xF]; + } + r[n] = 0; + + return r; +} + +static char *_gen_hex_sha256(const char *p, size_t len) +{ + unsigned char hash[SHA256_DIGEST_LENGTH]; + + SHA256((unsigned char*)p, len, hash); + return _conv_hex(hash, SHA256_DIGEST_LENGTH); +} + +static char *_gen_hex_md5(const char *p, size_t len) +{ + unsigned char hash[MD5_DIGEST_LENGTH]; + + MD5((unsigned char*)p, len, hash); + return _conv_hex(hash, MD5_DIGEST_LENGTH); +} + +static void _hmac(unsigned char *md, void *key, int key_len, char *data) { +#ifndef CONFIG_HAVE_OPAQUE_HMAC_CTX + HMAC_CTX _ctx; +#endif + HMAC_CTX *ctx; + unsigned int hmac_len; + +#ifdef CONFIG_HAVE_OPAQUE_HMAC_CTX + ctx = HMAC_CTX_new(); +#else + ctx = &_ctx; + /* work-around crash in certain versions of libssl */ + HMAC_CTX_init(ctx); +#endif + HMAC_Init_ex(ctx, key, key_len, EVP_sha256(), NULL); + HMAC_Update(ctx, (unsigned char*)data, strlen(data)); + HMAC_Final(ctx, md, &hmac_len); +#ifdef CONFIG_HAVE_OPAQUE_HMAC_CTX + HMAC_CTX_free(ctx); +#else + HMAC_CTX_cleanup(ctx); +#endif +} + +static int _curl_trace(CURL *handle, curl_infotype type, + char *data, size_t size, + void *userp) +{ + const char *text; + (void)handle; /* prevent compiler warning */ + (void)userp; + + switch (type) { + case CURLINFO_TEXT: + fprintf(stderr, "== Info: %s", data); + /* fall through */ + default: + case CURLINFO_SSL_DATA_OUT: + /* fall through */ + case CURLINFO_SSL_DATA_IN: + return 0; + + case CURLINFO_HEADER_OUT: + text = "=> Send header"; + break; + case CURLINFO_DATA_OUT: + text = "=> Send data"; + break; + case CURLINFO_HEADER_IN: + text = "<= Recv header"; + break; + case CURLINFO_DATA_IN: + text = "<= Recv data"; + break; + } + + log_info("%s: %s", text, data); + return 0; +} + +/* https://docs.aws.amazon.com/AmazonS3/latest/API/sig-v4-header-based-auth.html + * https://docs.aws.amazon.com/AmazonS3/latest/API/sig-v4-authenticating-requests.html#signing-request-intro + */ +static void _add_aws_auth_header(CURL *curl, struct curl_slist *slist, struct http_options *o, + int op, const char *uri, char *buf, size_t len) +{ + char date_short[16]; + char date_iso[32]; + char method[8]; + char dkey[128]; + char creq[512]; + char sts[256]; + char s[512]; + char *uri_encoded = NULL; + char *dsha = NULL; + char *csha = NULL; + char *signature = NULL; + const char *service = "s3"; + const char *aws = "aws4_request"; + unsigned char md[SHA256_DIGEST_LENGTH]; + + time_t t = time(NULL); + struct tm *gtm = gmtime(&t); + + strftime (date_short, sizeof(date_short), "%Y%m%d", gtm); + strftime (date_iso, sizeof(date_iso), "%Y%m%dT%H%M%SZ", 
gtm);
+	uri_encoded = _aws_uriencode(uri);
+
+	if (op == DDIR_WRITE) {
+		dsha = _gen_hex_sha256(buf, len);
+		sprintf(method, "PUT");
+	} else {
+		/* DDIR_READ && DDIR_TRIM supply an empty body */
+		if (op == DDIR_READ)
+			sprintf(method, "GET");
+		else
+			sprintf(method, "DELETE");
+		dsha = _gen_hex_sha256("", 0);
+	}
+
+	/* Create the canonical request first */
+	snprintf(creq, sizeof(creq),
+		"%s\n"
+		"%s\n"
+		"\n"
+		"host:%s\n"
+		"x-amz-content-sha256:%s\n"
+		"x-amz-date:%s\n"
+		"\n"
+		"host;x-amz-content-sha256;x-amz-date\n"
+		"%s"
+		, method
+		, uri_encoded, o->host, dsha, date_iso, dsha);
+
+	csha = _gen_hex_sha256(creq, strlen(creq));
+	snprintf(sts, sizeof(sts), "AWS4-HMAC-SHA256\n%s\n%s/%s/%s/%s\n%s",
+		date_iso, date_short, o->s3_region, service, aws, csha);
+
+	snprintf((char *)dkey, sizeof(dkey), "AWS4%s", o->s3_key);
+	_hmac(md, dkey, strlen(dkey), date_short);
+	_hmac(md, md, SHA256_DIGEST_LENGTH, o->s3_region);
+	_hmac(md, md, SHA256_DIGEST_LENGTH, (char*) service);
+	_hmac(md, md, SHA256_DIGEST_LENGTH, (char*) aws);
+	_hmac(md, md, SHA256_DIGEST_LENGTH, sts);
+
+	signature = _conv_hex(md, SHA256_DIGEST_LENGTH);
+
+	/* Suppress automatic Accept: header */
+	slist = curl_slist_append(slist, "Accept:");
+
+	snprintf(s, sizeof(s), "x-amz-content-sha256: %s", dsha);
+	slist = curl_slist_append(slist, s);
+
+	snprintf(s, sizeof(s), "x-amz-date: %s", date_iso);
+	slist = curl_slist_append(slist, s);
+
+	snprintf(s, sizeof(s), "Authorization: AWS4-HMAC-SHA256 Credential=%s/%s/%s/s3/aws4_request,"
+		"SignedHeaders=host;x-amz-content-sha256;x-amz-date,Signature=%s",
+		o->s3_keyid, date_short, o->s3_region, signature);
+	slist = curl_slist_append(slist, s);
+
+	curl_easy_setopt(curl, CURLOPT_HTTPHEADER, slist);
+
+	free(uri_encoded);
+	free(csha);
+	free(dsha);
+	free(signature);
+}
+
+static void _add_swift_header(CURL *curl, struct curl_slist *slist, struct http_options *o,
+		int op, const char *uri, char *buf, size_t len)
+{
+	char *dsha = NULL;
+	char s[512];
+
+	if (op == DDIR_WRITE) {
+		dsha = _gen_hex_md5(buf, len);
+	}
+	/* Suppress automatic Accept: header */
+	slist = curl_slist_append(slist, "Accept:");
+
+	snprintf(s, sizeof(s), "etag: %s", dsha);
+	slist = curl_slist_append(slist, s);
+
+	snprintf(s, sizeof(s), "x-auth-token: %s", o->swift_auth_token);
+	slist = curl_slist_append(slist, s);
+
+	curl_easy_setopt(curl, CURLOPT_HTTPHEADER, slist);
+
+	free(dsha);
+}
+
+static void fio_http_cleanup(struct thread_data *td)
+{
+	struct http_data *http = td->io_ops_data;
+
+	if (http) {
+		curl_easy_cleanup(http->curl);
+		free(http);
+	}
+}
+
+static size_t _http_read(void *ptr, size_t size, size_t nmemb, void *stream)
+{
+	struct http_curl_stream *state = stream;
+	size_t len = size * nmemb;
+	/* We're retrieving; nothing is supposed to be read locally */
+	if (!stream)
+		return 0;
+	if (len+state->pos > state->max)
+		len = state->max - state->pos;
+	memcpy(ptr, &state->buf[state->pos], len);
+	state->pos += len;
+	return len;
+}
+
+static size_t _http_write(void *ptr, size_t size, size_t nmemb, void *stream)
+{
+	struct http_curl_stream *state = stream;
+	/* We're just discarding the returned body after a PUT */
+	if (!stream)
+		return nmemb;
+	if (size != 1)
+		return CURLE_WRITE_ERROR;
+	if (nmemb + state->pos > state->max)
+		return CURLE_WRITE_ERROR;
+	memcpy(&state->buf[state->pos], ptr, nmemb);
+	state->pos += nmemb;
+	return nmemb;
+}
+
+static int _http_seek(void *stream, curl_off_t offset, int origin)
+{
+	struct http_curl_stream *state = stream;
+	if (offset < state->max
&& origin == SEEK_SET) { + state->pos = offset; + return CURL_SEEKFUNC_OK; + } else + return CURL_SEEKFUNC_FAIL; +} + +static enum fio_q_status fio_http_queue(struct thread_data *td, + struct io_u *io_u) +{ + struct http_data *http = td->io_ops_data; + struct http_options *o = td->eo; + struct http_curl_stream _curl_stream; + struct curl_slist *slist = NULL; + char object[512]; + char url[1024]; + long status; + CURLcode res; + int r = -1; + + fio_ro_check(td, io_u); + memset(&_curl_stream, 0, sizeof(_curl_stream)); + snprintf(object, sizeof(object), "%s_%llu_%llu", td->files[0]->file_name, + io_u->offset, io_u->xfer_buflen); + if (o->https == FIO_HTTPS_OFF) + snprintf(url, sizeof(url), "http://%s%s", o->host, object); + else + snprintf(url, sizeof(url), "https://%s%s", o->host, object); + curl_easy_setopt(http->curl, CURLOPT_URL, url); + _curl_stream.buf = io_u->xfer_buf; + _curl_stream.max = io_u->xfer_buflen; + curl_easy_setopt(http->curl, CURLOPT_SEEKDATA, &_curl_stream); + curl_easy_setopt(http->curl, CURLOPT_INFILESIZE_LARGE, (curl_off_t)io_u->xfer_buflen); + + if (o->mode == FIO_HTTP_S3) + _add_aws_auth_header(http->curl, slist, o, io_u->ddir, object, + io_u->xfer_buf, io_u->xfer_buflen); + else if (o->mode == FIO_HTTP_SWIFT) + _add_swift_header(http->curl, slist, o, io_u->ddir, object, + io_u->xfer_buf, io_u->xfer_buflen); + + if (io_u->ddir == DDIR_WRITE) { + curl_easy_setopt(http->curl, CURLOPT_READDATA, &_curl_stream); + curl_easy_setopt(http->curl, CURLOPT_WRITEDATA, NULL); + curl_easy_setopt(http->curl, CURLOPT_UPLOAD, 1L); + res = curl_easy_perform(http->curl); + if (res == CURLE_OK) { + curl_easy_getinfo(http->curl, CURLINFO_RESPONSE_CODE, &status); + if (status == 100 || (status >= 200 && status <= 204)) + goto out; + log_err("DDIR_WRITE failed with HTTP status code %ld\n", status); + goto err; + } + } else if (io_u->ddir == DDIR_READ) { + curl_easy_setopt(http->curl, CURLOPT_READDATA, NULL); + curl_easy_setopt(http->curl, CURLOPT_WRITEDATA, &_curl_stream); + curl_easy_setopt(http->curl, CURLOPT_HTTPGET, 1L); + res = curl_easy_perform(http->curl); + if (res == CURLE_OK) { + curl_easy_getinfo(http->curl, CURLINFO_RESPONSE_CODE, &status); + if (status == 200) + goto out; + else if (status == 404) { + /* Object doesn't exist. 
Pretend we read + * zeroes */ + memset(io_u->xfer_buf, 0, io_u->xfer_buflen); + goto out; + } + log_err("DDIR_READ failed with HTTP status code %ld\n", status); + } + goto err; + } else if (io_u->ddir == DDIR_TRIM) { + curl_easy_setopt(http->curl, CURLOPT_HTTPGET, 1L); + curl_easy_setopt(http->curl, CURLOPT_CUSTOMREQUEST, "DELETE"); + curl_easy_setopt(http->curl, CURLOPT_INFILESIZE_LARGE, (curl_off_t)0); + curl_easy_setopt(http->curl, CURLOPT_READDATA, NULL); + curl_easy_setopt(http->curl, CURLOPT_WRITEDATA, NULL); + res = curl_easy_perform(http->curl); + if (res == CURLE_OK) { + curl_easy_getinfo(http->curl, CURLINFO_RESPONSE_CODE, &status); + if (status == 200 || status == 202 || status == 204 || status == 404) + goto out; + log_err("DDIR_TRIM failed with HTTP status code %ld\n", status); + } + goto err; + } + + log_err("WARNING: Only DDIR_READ/DDIR_WRITE/DDIR_TRIM are supported!\n"); + +err: + io_u->error = r; + td_verror(td, io_u->error, "transfer"); +out: + curl_slist_free_all(slist); + return FIO_Q_COMPLETED; +} + +static struct io_u *fio_http_event(struct thread_data *td, int event) +{ + /* sync IO engine - never any outstanding events */ + return NULL; +} + +int fio_http_getevents(struct thread_data *td, unsigned int min, + unsigned int max, const struct timespec *t) +{ + /* sync IO engine - never any outstanding events */ + return 0; +} + +static int fio_http_setup(struct thread_data *td) +{ + struct http_data *http = NULL; + struct http_options *o = td->eo; + + /* allocate engine specific structure to deal with libhttp. */ + http = calloc(1, sizeof(*http)); + if (!http) { + log_err("calloc failed.\n"); + goto cleanup; + } + + http->curl = curl_easy_init(); + if (o->verbose) + curl_easy_setopt(http->curl, CURLOPT_VERBOSE, 1L); + if (o->verbose > 1) + curl_easy_setopt(http->curl, CURLOPT_DEBUGFUNCTION, &_curl_trace); + curl_easy_setopt(http->curl, CURLOPT_NOPROGRESS, 1L); + curl_easy_setopt(http->curl, CURLOPT_FOLLOWLOCATION, 1L); + curl_easy_setopt(http->curl, CURLOPT_PROTOCOLS, CURLPROTO_HTTP|CURLPROTO_HTTPS); + if (o->https == FIO_HTTPS_INSECURE) { + curl_easy_setopt(http->curl, CURLOPT_SSL_VERIFYPEER, 0L); + curl_easy_setopt(http->curl, CURLOPT_SSL_VERIFYHOST, 0L); + } + curl_easy_setopt(http->curl, CURLOPT_READFUNCTION, _http_read); + curl_easy_setopt(http->curl, CURLOPT_WRITEFUNCTION, _http_write); + curl_easy_setopt(http->curl, CURLOPT_SEEKFUNCTION, &_http_seek); + if (o->user && o->pass) { + curl_easy_setopt(http->curl, CURLOPT_USERNAME, o->user); + curl_easy_setopt(http->curl, CURLOPT_PASSWORD, o->pass); + curl_easy_setopt(http->curl, CURLOPT_HTTPAUTH, CURLAUTH_ANY); + } + + td->io_ops_data = http; + + /* Force single process mode. 
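+	 * Setting use_thread makes fio run the jobs as threads of a single
+	 * process rather than as forked children; assumed rationale: the
+	 * libcurl handles and state set up here are not fork-safe to share.
+	 */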
+	td->o.use_thread = 1;
+
+	return 0;
+cleanup:
+	fio_http_cleanup(td);
+	return 1;
+}
+
+static int fio_http_open(struct thread_data *td, struct fio_file *f)
+{
+	return 0;
+}
+static int fio_http_invalidate(struct thread_data *td, struct fio_file *f)
+{
+	return 0;
+}
+
+static struct ioengine_ops ioengine = {
+	.name = "http",
+	.version = FIO_IOOPS_VERSION,
+	.flags = FIO_DISKLESSIO | FIO_SYNCIO,
+	.setup = fio_http_setup,
+	.queue = fio_http_queue,
+	.getevents = fio_http_getevents,
+	.event = fio_http_event,
+	.cleanup = fio_http_cleanup,
+	.open_file = fio_http_open,
+	.invalidate = fio_http_invalidate,
+	.options = options,
+	.option_struct_size = sizeof(struct http_options),
+};
+
+static void fio_init fio_http_register(void)
+{
+	register_ioengine(&ioengine);
+}
+
+static void fio_exit fio_http_unregister(void)
+{
+	unregister_ioengine(&ioengine);
+}
diff --git a/engines/ime.c b/engines/ime.c
new file mode 100644
index 0000000..4298402
--- /dev/null
+++ b/engines/ime.c
@@ -0,0 +1,899 @@
+/*
+ * FIO engines for DDN's Infinite Memory Engine.
+ * This file defines 3 engines: ime_psync, ime_psyncv, and ime_aio
+ *
+ * Copyright (C) 2018 DataDirect Networks. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License,
+ * version 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ */
+
+/*
+ * Some details about the new engines are given below:
+ *
+ *
+ * ime_psync:
+ * Most basic engine that issues calls to ime_native whenever an IO is queued.
+ *
+ * ime_psyncv:
+ * This engine tries to queue the IOs (by creating iovecs) if asked by FIO (via
+ * iodepth_batch). It refuses to queue when the iovecs can't be appended, and
+ * waits for FIO to issue a commit. After a call to commit and get_events, new
+ * IOs can be queued.
+ *
+ * ime_aio:
+ * This engine tries to queue the IOs (by creating iovecs) if asked by FIO (via
+ * iodepth_batch). When the iovecs can't be appended to the current request, a
+ * new request for IME is created. These requests will be issued to IME when
+ * commit is called. Contrary to ime_psyncv, there can be several requests at
+ * once. We don't need to wait for a request to terminate before creating a new
+ * one.
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <errno.h>
+#include <linux/limits.h>
+#include <ime_native.h>
+
+#include "../fio.h"
+
+
+/**************************************************************
+ * Types and constants definitions
+ *
+ **************************************************************/
+
+/* define constants for async IOs */
+#define FIO_IME_IN_PROGRESS -1
+#define FIO_IME_REQ_ERROR -2
+
+/* This flag is used when some jobs were created using threads. In that
+   case, IME can't be finalized in the engine-specific cleanup function,
+   because other threads might still use IME. Instead, IME is finalized
+   in the destructor (see fio_ime_unregister), only when the flag
+   fio_ime_is_initialized is true (which means at least one thread has
+   initialized IME). */
+static bool fio_ime_is_initialized = false;
+
+struct imesio_req {
+	int fd;				/* File descriptor */
+	enum fio_ddir ddir;		/* Type of IO (read or write) */
+	off_t offset;			/* File offset */
+};
+struct imeaio_req {
+	struct ime_aiocb iocb;		/* IME aio request */
+	ssize_t status;			/* Status of the IME request */
+	enum fio_ddir ddir;		/* Type of IO (read or write) */
+	pthread_cond_t cond_endio;	/* Condition var to notify FIO */
+	pthread_mutex_t status_mutex;	/* Mutex for cond_endio */
+};
+
+/* This structure will be used for 2 engines: ime_psyncv and ime_aio */
+struct ime_data {
+	union {
+		struct imeaio_req *aioreqs;	/* array of aio requests */
+		struct imesio_req *sioreq;	/* pointer to the only syncio request */
+	};
+	struct iovec *iovecs;		/* array of queued iovecs */
+	struct io_u **io_us;		/* array of queued io_u pointers */
+	struct io_u **event_io_us;	/* array of the events retrieved after get_events */
+	unsigned int queued;		/* iovecs/io_us in the queue */
+	unsigned int events;		/* number of committed iovecs/io_us */
+
+	/* variables used to implement a "ring" queue */
+	unsigned int depth;		/* max entries in the queue */
+	unsigned int head;		/* index used to append */
+	unsigned int tail;		/* index used to pop */
+	unsigned int cur_commit;	/* index of the first uncommitted req */
+
+	/* offset used by the last iovec (used to check if the iovecs can be appended) */
+	unsigned long long last_offset;
+
+	/* The variables below are used for aio only */
+	struct imeaio_req *last_req;	/* last request awaiting committing */
+};
+
+
+/**************************************************************
+ * Private functions for queueing/unqueueing
+ *
+ **************************************************************/
+
+static void fio_ime_queue_incr (struct ime_data *ime_d)
+{
+	ime_d->head = (ime_d->head + 1) % ime_d->depth;
+	ime_d->queued++;
+}
+
+static void fio_ime_queue_red (struct ime_data *ime_d)
+{
+	ime_d->tail = (ime_d->tail + 1) % ime_d->depth;
+	ime_d->queued--;
+	ime_d->events--;
+}
+
+static void fio_ime_queue_commit (struct ime_data *ime_d, int iovcnt)
+{
+	ime_d->cur_commit = (ime_d->cur_commit + iovcnt) % ime_d->depth;
+	ime_d->events += iovcnt;
+}
+
+static void fio_ime_queue_reset (struct ime_data *ime_d)
+{
+	ime_d->head = 0;
+	ime_d->tail = 0;
+	ime_d->cur_commit = 0;
+	ime_d->queued = 0;
+	ime_d->events = 0;
+}
+
+/**************************************************************
+ * General IME functions
+ * (needed for both sync and async IOs)
+ **************************************************************/
+
+static char *fio_set_ime_filename(char* filename)
+{
+	static __thread char ime_filename[PATH_MAX];
+	int ret;
+
+	ret = snprintf(ime_filename, PATH_MAX, "%s%s", DEFAULT_IME_FILE_PREFIX, filename);
+	if (ret < PATH_MAX)
+		return ime_filename;
+
+	return NULL;
+}
+
+static int fio_ime_get_file_size(struct thread_data *td, struct fio_file *f)
+{
+	struct stat buf;
+	int ret;
+	char *ime_filename;
+
+	dprint(FD_FILE, "get file size %s\n", f->file_name);
+
+	ime_filename = fio_set_ime_filename(f->file_name);
+	if (ime_filename == NULL)
+		return 1;
+	ret = ime_native_stat(ime_filename, &buf);
+	if (ret == -1) {
+		td_verror(td, errno, "fstat");
+		return 1;
+	}
+
+	f->real_file_size = buf.st_size;
+	return 0;
+}
+
+/* This function mimics the generic_file_open function, but issues
+   IME native calls instead of POSIX calls.
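+
+   Illustrative example: with "filename=testfile" in the job file, the
+   engine opens DEFAULT_IME_FILE_PREFIX "testfile" as built by
+   fio_set_ime_filename() above; the actual prefix value is assumed to
+   come from the IME native headers.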
+ */
+static int fio_ime_open_file(struct thread_data *td, struct fio_file *f)
+{
+	int flags = 0;
+	int ret;
+	uint64_t desired_fs;
+	char *ime_filename;
+
+	dprint(FD_FILE, "fd open %s\n", f->file_name);
+
+	if (td_trim(td)) {
+		td_verror(td, EINVAL, "IME does not support TRIM operation");
+		return 1;
+	}
+
+	if (td->o.oatomic) {
+		td_verror(td, EINVAL, "IME does not support atomic IO");
+		return 1;
+	}
+	if (td->o.odirect)
+		flags |= O_DIRECT;
+	if (td->o.sync_io)
+		flags |= O_SYNC;
+	if (td->o.create_on_open && td->o.allow_create)
+		flags |= O_CREAT;
+
+	if (td_write(td)) {
+		if (!read_only)
+			flags |= O_RDWR;
+
+		if (td->o.allow_create)
+			flags |= O_CREAT;
+	} else if (td_read(td)) {
+		flags |= O_RDONLY;
+	} else {
+		/* We should never get here. */
+		td_verror(td, EINVAL, "Unsupported open mode");
+		return 1;
+	}
+
+	ime_filename = fio_set_ime_filename(f->file_name);
+	if (ime_filename == NULL)
+		return 1;
+	f->fd = ime_native_open(ime_filename, flags, 0600);
+	if (f->fd == -1) {
+		char buf[FIO_VERROR_SIZE];
+		int __e = errno;
+
+		snprintf(buf, sizeof(buf), "open(%s)", f->file_name);
+		td_verror(td, __e, buf);
+		return 1;
+	}
+
+	/* Now we need to make sure the real file size is sufficient for FIO
+	   to do its things. This is normally done before the file open function
+	   is called, but because FIO would use POSIX calls, we need to do it
+	   ourselves */
+	ret = fio_ime_get_file_size(td, f);
+	if (ret < 0) {
+		ime_native_close(f->fd);
+		td_verror(td, errno, "ime_get_file_size");
+		return 1;
+	}
+
+	desired_fs = f->io_size + f->file_offset;
+	if (td_write(td)) {
+		dprint(FD_FILE, "Laying out file %s%s\n",
+			DEFAULT_IME_FILE_PREFIX, f->file_name);
+		if (!td->o.create_on_open &&
+				f->real_file_size < desired_fs &&
+				ime_native_ftruncate(f->fd, desired_fs) < 0) {
+			ime_native_close(f->fd);
+			td_verror(td, errno, "ime_native_ftruncate");
+			return 1;
+		}
+		if (f->real_file_size < desired_fs)
+			f->real_file_size = desired_fs;
+	} else if (td_read(td) && f->real_file_size < desired_fs) {
+		ime_native_close(f->fd);
+		log_err("error: can't read %lu bytes from file with "
+			"%lu bytes\n", desired_fs, f->real_file_size);
+		return 1;
+	}
+
+	return 0;
+}
+
+static int fio_ime_close_file(struct thread_data fio_unused *td, struct fio_file *f)
+{
+	int ret = 0;
+
+	dprint(FD_FILE, "fd close %s\n", f->file_name);
+
+	if (ime_native_close(f->fd) < 0)
+		ret = errno;
+
+	f->fd = -1;
+	return ret;
+}
+
+static int fio_ime_unlink_file(struct thread_data *td, struct fio_file *f)
+{
+	char *ime_filename = fio_set_ime_filename(f->file_name);
+	int ret;
+
+	if (ime_filename == NULL)
+		return 1;
+
+	ret = unlink(ime_filename);
+	return ret < 0 ? errno : 0;
+}
+
+static struct io_u *fio_ime_event(struct thread_data *td, int event)
+{
+	struct ime_data *ime_d = td->io_ops_data;
+
+	return ime_d->event_io_us[event];
+}
+
+/* Setup function used to replace get_file_sizes when setting up the
+   files. Instead we will set real_file_size to 0 for each file. This
+   way we can avoid calling ime_native_init before the forks are created. */
+static int fio_ime_setup(struct thread_data *td)
+{
+	struct fio_file *f;
+	unsigned int i;
+
+	for_each_file(td, f, i) {
+		dprint(FD_FILE, "setup: set file size to 0 for %p/%d/%s\n",
+			f, i, f->file_name);
+		f->real_file_size = 0;
+	}
+
+	return 0;
+}
+
+static int fio_ime_engine_init(struct thread_data *td)
+{
+	struct fio_file *f;
+	unsigned int i;
+
+	dprint(FD_IO, "ime engine init\n");
+	if (fio_ime_is_initialized && !td->o.use_thread) {
+		log_err("Warning: something might go wrong.
Not all threads/forks were" + " created before the FIO jobs were initialized.\n"); + } + + ime_native_init(); + fio_ime_is_initialized = true; + + /* We have to temporarily set real_file_size so that + FIO can initialize properly. It will be corrected + on file open. */ + for_each_file(td, f, i) + f->real_file_size = f->io_size + f->file_offset; + + return 0; +} + +static void fio_ime_engine_finalize(struct thread_data *td) +{ + /* Only finalize IME when using forks */ + if (!td->o.use_thread) { + if (ime_native_finalize() < 0) + log_err("error in ime_native_finalize\n"); + fio_ime_is_initialized = false; + } +} + + +/************************************************************** + * Private functions for blocking IOs + * (without iovecs) + **************************************************************/ + +/* Notice: this function comes from the sync engine */ +/* It is used by the commit function to return a proper code and fill + some attributes in the io_u used for the IO. */ +static int fio_ime_psync_end(struct thread_data *td, struct io_u *io_u, ssize_t ret) +{ + if (ret != (ssize_t) io_u->xfer_buflen) { + if (ret >= 0) { + io_u->resid = io_u->xfer_buflen - ret; + io_u->error = 0; + return FIO_Q_COMPLETED; + } else + io_u->error = errno; + } + + if (io_u->error) { + io_u_log_error(td, io_u); + td_verror(td, io_u->error, "xfer"); + } + + return FIO_Q_COMPLETED; +} + +static enum fio_q_status fio_ime_psync_queue(struct thread_data *td, + struct io_u *io_u) +{ + struct fio_file *f = io_u->file; + ssize_t ret; + + fio_ro_check(td, io_u); + + if (io_u->ddir == DDIR_READ) + ret = ime_native_pread(f->fd, io_u->xfer_buf, io_u->xfer_buflen, io_u->offset); + else if (io_u->ddir == DDIR_WRITE) + ret = ime_native_pwrite(f->fd, io_u->xfer_buf, io_u->xfer_buflen, io_u->offset); + else if (io_u->ddir == DDIR_SYNC) + ret = ime_native_fsync(f->fd); + else { + ret = io_u->xfer_buflen; + io_u->error = EINVAL; + } + + return fio_ime_psync_end(td, io_u, ret); +} + + +/************************************************************** + * Private functions for blocking IOs + * (with iovecs) + **************************************************************/ + +static bool fio_ime_psyncv_can_queue(struct ime_data *ime_d, struct io_u *io_u) +{ + /* We can only queue if: + - There are no queued iovecs + - Or if there is at least one: + - There must be no event waiting for retrieval + - The offsets must be contiguous + - The ddir and fd must be the same */ + return (ime_d->queued == 0 || ( + ime_d->events == 0 && + ime_d->last_offset == io_u->offset && + ime_d->sioreq->ddir == io_u->ddir && + ime_d->sioreq->fd == io_u->file->fd)); +} + +/* Before using this function, we should have already + ensured that the queue is not full */ +static void fio_ime_psyncv_enqueue(struct ime_data *ime_d, struct io_u *io_u) +{ + struct imesio_req *ioreq = ime_d->sioreq; + struct iovec *iov = &ime_d->iovecs[ime_d->head]; + + iov->iov_base = io_u->xfer_buf; + iov->iov_len = io_u->xfer_buflen; + + if (ime_d->queued == 0) { + ioreq->offset = io_u->offset; + ioreq->ddir = io_u->ddir; + ioreq->fd = io_u->file->fd; + } + + ime_d->io_us[ime_d->head] = io_u; + ime_d->last_offset = io_u->offset + io_u->xfer_buflen; + fio_ime_queue_incr(ime_d); +} + +/* Tries to queue an IO. It will fail if the IO can't be appended to the + current request or if the current request has been committed but not + yet retrieved by get_events. 
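+
+   A typical batched flow is then (illustrative):
+     queue(io_u A) -> FIO_Q_QUEUED    (iovec appended)
+     queue(io_u B) -> FIO_Q_QUEUED    (contiguous offset, same fd/ddir)
+     commit()      -> one ime_native_preadv/pwritev call covering A and B
+     getevents()   -> returns A and B and resets the queue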
*/ +static enum fio_q_status fio_ime_psyncv_queue(struct thread_data *td, + struct io_u *io_u) +{ + struct ime_data *ime_d = td->io_ops_data; + + fio_ro_check(td, io_u); + + if (ime_d->queued == ime_d->depth) + return FIO_Q_BUSY; + + if (io_u->ddir == DDIR_READ || io_u->ddir == DDIR_WRITE) { + if (!fio_ime_psyncv_can_queue(ime_d, io_u)) + return FIO_Q_BUSY; + + dprint(FD_IO, "queue: ddir=%d at %u commit=%u queued=%u events=%u\n", + io_u->ddir, ime_d->head, ime_d->cur_commit, + ime_d->queued, ime_d->events); + fio_ime_psyncv_enqueue(ime_d, io_u); + return FIO_Q_QUEUED; + } else if (io_u->ddir == DDIR_SYNC) { + if (ime_native_fsync(io_u->file->fd) < 0) { + io_u->error = errno; + td_verror(td, io_u->error, "fsync"); + } + return FIO_Q_COMPLETED; + } else { + io_u->error = EINVAL; + td_verror(td, io_u->error, "wrong ddir"); + return FIO_Q_COMPLETED; + } +} + +/* Notice: this function comes from the sync engine */ +/* It is used by the commit function to return a proper code and fill + some attributes in the io_us appended to the current request. */ +static int fio_ime_psyncv_end(struct thread_data *td, ssize_t bytes) +{ + struct ime_data *ime_d = td->io_ops_data; + struct io_u *io_u; + unsigned int i; + int err = errno; + + for (i = 0; i < ime_d->queued; i++) { + io_u = ime_d->io_us[i]; + + if (bytes == -1) + io_u->error = err; + else { + unsigned int this_io; + + this_io = bytes; + if (this_io > io_u->xfer_buflen) + this_io = io_u->xfer_buflen; + + io_u->resid = io_u->xfer_buflen - this_io; + io_u->error = 0; + bytes -= this_io; + } + } + + if (bytes == -1) { + td_verror(td, err, "xfer psyncv"); + return -err; + } + + return 0; +} + +/* Commits the current request by calling ime_native (with one or several + iovecs). After this commit, the corresponding events (one per iovec) + can be retrieved by get_events. 
+ */
+static int fio_ime_psyncv_commit(struct thread_data *td)
+{
+	struct ime_data *ime_d = td->io_ops_data;
+	struct imesio_req *ioreq;
+	int ret = 0;
+
+	/* Exit if there are no (new) events to commit
+	   or if the previously committed events haven't been retrieved yet */
+	if (!ime_d->queued || ime_d->events)
+		return 0;
+
+	ioreq = ime_d->sioreq;
+	ime_d->events = ime_d->queued;
+	if (ioreq->ddir == DDIR_READ)
+		ret = ime_native_preadv(ioreq->fd, ime_d->iovecs, ime_d->queued, ioreq->offset);
+	else
+		ret = ime_native_pwritev(ioreq->fd, ime_d->iovecs, ime_d->queued, ioreq->offset);
+
+	dprint(FD_IO, "committed %d iovecs\n", ime_d->queued);
+
+	return fio_ime_psyncv_end(td, ret);
+}
+
+static int fio_ime_psyncv_getevents(struct thread_data *td, unsigned int min,
+				    unsigned int max, const struct timespec *t)
+{
+	struct ime_data *ime_d = td->io_ops_data;
+	struct io_u *io_u;
+	int events = 0;
+	unsigned int count;
+
+	if (ime_d->events) {
+		for (count = 0; count < ime_d->events; count++) {
+			io_u = ime_d->io_us[count];
+			ime_d->event_io_us[events] = io_u;
+			events++;
+		}
+		fio_ime_queue_reset(ime_d);
+	}
+
+	dprint(FD_IO, "getevents(%u,%u) ret=%d queued=%u events=%u\n",
+		min, max, events, ime_d->queued, ime_d->events);
+	return events;
+}
+
+static int fio_ime_psyncv_init(struct thread_data *td)
+{
+	struct ime_data *ime_d;
+
+	if (fio_ime_engine_init(td) < 0)
+		return 1;
+
+	ime_d = calloc(1, sizeof(*ime_d));
+
+	ime_d->sioreq = malloc(sizeof(struct imesio_req));
+	ime_d->iovecs = malloc(td->o.iodepth * sizeof(struct iovec));
+	ime_d->io_us = malloc(2 * td->o.iodepth * sizeof(struct io_u *));
+	ime_d->event_io_us = ime_d->io_us + td->o.iodepth;
+
+	ime_d->depth = td->o.iodepth;
+
+	td->io_ops_data = ime_d;
+	return 0;
+}
+
+static void fio_ime_psyncv_clean(struct thread_data *td)
+{
+	struct ime_data *ime_d = td->io_ops_data;
+
+	if (ime_d) {
+		free(ime_d->sioreq);
+		free(ime_d->iovecs);
+		free(ime_d->io_us);
+		free(ime_d);
+		td->io_ops_data = NULL;
+	}
+
+	fio_ime_engine_finalize(td);
+}
+
+
+/**************************************************************
+ * Private functions for non-blocking IOs
+ *
+ **************************************************************/
+
+void fio_ime_aio_complete_cb (struct ime_aiocb *aiocb, int err,
+			      ssize_t bytes)
+{
+	struct imeaio_req *ioreq = (struct imeaio_req *) aiocb->user_context;
+
+	pthread_mutex_lock(&ioreq->status_mutex);
+	ioreq->status = err == 0 ? bytes : FIO_IME_REQ_ERROR;
+	pthread_mutex_unlock(&ioreq->status_mutex);
+
+	pthread_cond_signal(&ioreq->cond_endio);
+}
+
+static bool fio_ime_aio_can_queue (struct ime_data *ime_d, struct io_u *io_u)
+{
+	/* So far we can queue in any case.
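+	   Unlike the psyncv variant there is no single pending request:
+	   fio_ime_aio_enqueue() either extends the last ime_aiocb or
+	   starts a new one, so the only hard limit is the queue depth,
+	   which fio_ime_aio_queue() checks before calling this helper.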
*/ + return true; +} +static bool fio_ime_aio_can_append (struct ime_data *ime_d, struct io_u *io_u) +{ + /* We can only append if: + - The iovecs will be contiguous in the array + - There is already a queued iovec + - The offsets are contiguous + - The ddir and fs are the same */ + return (ime_d->head != 0 && + ime_d->queued - ime_d->events > 0 && + ime_d->last_offset == io_u->offset && + ime_d->last_req->ddir == io_u->ddir && + ime_d->last_req->iocb.fd == io_u->file->fd); +} + +/* Before using this function, we should have already + ensured that the queue is not full */ +static void fio_ime_aio_enqueue(struct ime_data *ime_d, struct io_u *io_u) +{ + struct imeaio_req *ioreq = &ime_d->aioreqs[ime_d->head]; + struct ime_aiocb *iocb = &ioreq->iocb; + struct iovec *iov = &ime_d->iovecs[ime_d->head]; + + iov->iov_base = io_u->xfer_buf; + iov->iov_len = io_u->xfer_buflen; + + if (fio_ime_aio_can_append(ime_d, io_u)) + ime_d->last_req->iocb.iovcnt++; + else { + ioreq->status = FIO_IME_IN_PROGRESS; + ioreq->ddir = io_u->ddir; + ime_d->last_req = ioreq; + + iocb->complete_cb = &fio_ime_aio_complete_cb; + iocb->fd = io_u->file->fd; + iocb->file_offset = io_u->offset; + iocb->iov = iov; + iocb->iovcnt = 1; + iocb->flags = 0; + iocb->user_context = (intptr_t) ioreq; + } + + ime_d->io_us[ime_d->head] = io_u; + ime_d->last_offset = io_u->offset + io_u->xfer_buflen; + fio_ime_queue_incr(ime_d); +} + +/* Tries to queue an IO. It will create a new request if the IO can't be + appended to the current request. It will fail if the queue can't contain + any more io_u/iovec. In this case, commit and then get_events need to be + called. */ +static enum fio_q_status fio_ime_aio_queue(struct thread_data *td, + struct io_u *io_u) +{ + struct ime_data *ime_d = td->io_ops_data; + + fio_ro_check(td, io_u); + + dprint(FD_IO, "queue: ddir=%d at %u commit=%u queued=%u events=%u\n", + io_u->ddir, ime_d->head, ime_d->cur_commit, + ime_d->queued, ime_d->events); + + if (ime_d->queued == ime_d->depth) + return FIO_Q_BUSY; + + if (io_u->ddir == DDIR_READ || io_u->ddir == DDIR_WRITE) { + if (!fio_ime_aio_can_queue(ime_d, io_u)) + return FIO_Q_BUSY; + + fio_ime_aio_enqueue(ime_d, io_u); + return FIO_Q_QUEUED; + } else if (io_u->ddir == DDIR_SYNC) { + if (ime_native_fsync(io_u->file->fd) < 0) { + io_u->error = errno; + td_verror(td, io_u->error, "fsync"); + } + return FIO_Q_COMPLETED; + } else { + io_u->error = EINVAL; + td_verror(td, io_u->error, "wrong ddir"); + return FIO_Q_COMPLETED; + } +} + +static int fio_ime_aio_commit(struct thread_data *td) +{ + struct ime_data *ime_d = td->io_ops_data; + struct imeaio_req *ioreq; + int ret = 0; + + /* Loop while there are events to commit */ + while (ime_d->queued - ime_d->events) { + ioreq = &ime_d->aioreqs[ime_d->cur_commit]; + if (ioreq->ddir == DDIR_READ) + ret = ime_native_aio_read(&ioreq->iocb); + else + ret = ime_native_aio_write(&ioreq->iocb); + + fio_ime_queue_commit(ime_d, ioreq->iocb.iovcnt); + + /* fio needs a negative error code */ + if (ret < 0) { + ioreq->status = FIO_IME_REQ_ERROR; + return -errno; + } + + io_u_mark_submit(td, ioreq->iocb.iovcnt); + dprint(FD_IO, "committed %d iovecs commit=%u queued=%u events=%u\n", + ioreq->iocb.iovcnt, ime_d->cur_commit, + ime_d->queued, ime_d->events); + } + + return 0; +} + +static int fio_ime_aio_getevents(struct thread_data *td, unsigned int min, + unsigned int max, const struct timespec *t) +{ + struct ime_data *ime_d = td->io_ops_data; + struct imeaio_req *ioreq; + struct io_u *io_u; + int events = 0; + unsigned int count; + 
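+	/* Completions are consumed from the tail of the ring, one aiocb (and
+	 * all of its iovecs) at a time; a request still marked
+	 * FIO_IME_IN_PROGRESS is waited for on its condition variable
+	 * instead of being busy-polled. */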
ssize_t bytes; + + while (ime_d->events) { + ioreq = &ime_d->aioreqs[ime_d->tail]; + + /* Break if we already got events, and if we will + exceed max if we append the next events */ + if (events && events + ioreq->iocb.iovcnt > max) + break; + + if (ioreq->status != FIO_IME_IN_PROGRESS) { + + bytes = ioreq->status; + for (count = 0; count < ioreq->iocb.iovcnt; count++) { + io_u = ime_d->io_us[ime_d->tail]; + ime_d->event_io_us[events] = io_u; + events++; + fio_ime_queue_red(ime_d); + + if (ioreq->status == FIO_IME_REQ_ERROR) + io_u->error = EIO; + else { + io_u->resid = bytes > io_u->xfer_buflen ? + 0 : io_u->xfer_buflen - bytes; + io_u->error = 0; + bytes -= io_u->xfer_buflen - io_u->resid; + } + } + } else { + pthread_mutex_lock(&ioreq->status_mutex); + while (ioreq->status == FIO_IME_IN_PROGRESS) + pthread_cond_wait(&ioreq->cond_endio, &ioreq->status_mutex); + pthread_mutex_unlock(&ioreq->status_mutex); + } + + } + + dprint(FD_IO, "getevents(%u,%u) ret=%d queued=%u events=%u\n", min, max, + events, ime_d->queued, ime_d->events); + return events; +} + +static int fio_ime_aio_init(struct thread_data *td) +{ + struct ime_data *ime_d; + struct imeaio_req *ioreq; + unsigned int i; + + if (fio_ime_engine_init(td) < 0) + return 1; + + ime_d = calloc(1, sizeof(*ime_d)); + + ime_d->aioreqs = malloc(td->o.iodepth * sizeof(struct imeaio_req)); + ime_d->iovecs = malloc(td->o.iodepth * sizeof(struct iovec)); + ime_d->io_us = malloc(2 * td->o.iodepth * sizeof(struct io_u *)); + ime_d->event_io_us = ime_d->io_us + td->o.iodepth; + + ime_d->depth = td->o.iodepth; + for (i = 0; i < ime_d->depth; i++) { + ioreq = &ime_d->aioreqs[i]; + pthread_cond_init(&ioreq->cond_endio, NULL); + pthread_mutex_init(&ioreq->status_mutex, NULL); + } + + td->io_ops_data = ime_d; + return 0; +} + +static void fio_ime_aio_clean(struct thread_data *td) +{ + struct ime_data *ime_d = td->io_ops_data; + struct imeaio_req *ioreq; + unsigned int i; + + if (ime_d) { + for (i = 0; i < ime_d->depth; i++) { + ioreq = &ime_d->aioreqs[i]; + pthread_cond_destroy(&ioreq->cond_endio); + pthread_mutex_destroy(&ioreq->status_mutex); + } + free(ime_d->aioreqs); + free(ime_d->iovecs); + free(ime_d->io_us); + free(ime_d); + td->io_ops_data = NULL; + } + + fio_ime_engine_finalize(td); +} + + +/************************************************************** + * IO engines definitions + * + **************************************************************/ + +/* The FIO_DISKLESSIO flag used for these engines is necessary to prevent + FIO from using POSIX calls. See fio_ime_open_file for more details. 
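+
+   A minimal job file exercising one of these engines could look like
+   the sketch below (illustrative; the directory is hypothetical and
+   must live on an IME mount):
+
+     [ime-write]
+     ioengine=ime_psyncv
+     rw=write
+     bs=128k
+     iodepth=16
+     size=1g
+     directory=/ime/fio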
*/ + +static struct ioengine_ops ioengine_prw = { + .name = "ime_psync", + .version = FIO_IOOPS_VERSION, + .setup = fio_ime_setup, + .init = fio_ime_engine_init, + .cleanup = fio_ime_engine_finalize, + .queue = fio_ime_psync_queue, + .open_file = fio_ime_open_file, + .close_file = fio_ime_close_file, + .get_file_size = fio_ime_get_file_size, + .unlink_file = fio_ime_unlink_file, + .flags = FIO_SYNCIO | FIO_DISKLESSIO, +}; + +static struct ioengine_ops ioengine_pvrw = { + .name = "ime_psyncv", + .version = FIO_IOOPS_VERSION, + .setup = fio_ime_setup, + .init = fio_ime_psyncv_init, + .cleanup = fio_ime_psyncv_clean, + .queue = fio_ime_psyncv_queue, + .commit = fio_ime_psyncv_commit, + .getevents = fio_ime_psyncv_getevents, + .event = fio_ime_event, + .open_file = fio_ime_open_file, + .close_file = fio_ime_close_file, + .get_file_size = fio_ime_get_file_size, + .unlink_file = fio_ime_unlink_file, + .flags = FIO_SYNCIO | FIO_DISKLESSIO, +}; + +static struct ioengine_ops ioengine_aio = { + .name = "ime_aio", + .version = FIO_IOOPS_VERSION, + .setup = fio_ime_setup, + .init = fio_ime_aio_init, + .cleanup = fio_ime_aio_clean, + .queue = fio_ime_aio_queue, + .commit = fio_ime_aio_commit, + .getevents = fio_ime_aio_getevents, + .event = fio_ime_event, + .open_file = fio_ime_open_file, + .close_file = fio_ime_close_file, + .get_file_size = fio_ime_get_file_size, + .unlink_file = fio_ime_unlink_file, + .flags = FIO_DISKLESSIO, +}; + +static void fio_init fio_ime_register(void) +{ + register_ioengine(&ioengine_prw); + register_ioengine(&ioengine_pvrw); + register_ioengine(&ioengine_aio); +} + +static void fio_exit fio_ime_unregister(void) +{ + unregister_ioengine(&ioengine_prw); + unregister_ioengine(&ioengine_pvrw); + unregister_ioengine(&ioengine_aio); + + if (fio_ime_is_initialized && ime_native_finalize() < 0) + log_err("Warning: IME did not finalize properly\n"); +} diff --git a/engines/io_uring.c b/engines/io_uring.c new file mode 100644 index 0000000..ac57af8 --- /dev/null +++ b/engines/io_uring.c @@ -0,0 +1,760 @@ +/* + * io_uring engine + * + * IO engine using the new native Linux aio io_uring interface. 
See: + * + * http://git.kernel.dk/cgit/linux-block/log/?h=io_uring + * + */ +#include +#include +#include +#include +#include + +#include "../fio.h" +#include "../lib/pow2.h" +#include "../optgroup.h" +#include "../lib/memalign.h" +#include "../lib/fls.h" + +#ifdef ARCH_HAVE_IOURING + +#include "../lib/types.h" +#include "../os/linux/io_uring.h" + +struct io_sq_ring { + unsigned *head; + unsigned *tail; + unsigned *ring_mask; + unsigned *ring_entries; + unsigned *flags; + unsigned *array; +}; + +struct io_cq_ring { + unsigned *head; + unsigned *tail; + unsigned *ring_mask; + unsigned *ring_entries; + struct io_uring_cqe *cqes; +}; + +struct ioring_mmap { + void *ptr; + size_t len; +}; + +struct ioring_data { + int ring_fd; + + struct io_u **io_u_index; + + int *fds; + + struct io_sq_ring sq_ring; + struct io_uring_sqe *sqes; + struct iovec *iovecs; + unsigned sq_ring_mask; + + struct io_cq_ring cq_ring; + unsigned cq_ring_mask; + + int queued; + int cq_ring_off; + unsigned iodepth; + bool ioprio_class_set; + bool ioprio_set; + + struct ioring_mmap mmap[3]; +}; + +struct ioring_options { + void *pad; + unsigned int hipri; + unsigned int cmdprio_percentage; + unsigned int fixedbufs; + unsigned int registerfiles; + unsigned int sqpoll_thread; + unsigned int sqpoll_set; + unsigned int sqpoll_cpu; + unsigned int nonvectored; + unsigned int uncached; +}; + +static const int ddir_to_op[2][2] = { + { IORING_OP_READV, IORING_OP_READ }, + { IORING_OP_WRITEV, IORING_OP_WRITE } +}; + +static const int fixed_ddir_to_op[2] = { + IORING_OP_READ_FIXED, + IORING_OP_WRITE_FIXED +}; + +static int fio_ioring_sqpoll_cb(void *data, unsigned long long *val) +{ + struct ioring_options *o = data; + + o->sqpoll_cpu = *val; + o->sqpoll_set = 1; + return 0; +} + +static struct fio_option options[] = { + { + .name = "hipri", + .lname = "High Priority", + .type = FIO_OPT_STR_SET, + .off1 = offsetof(struct ioring_options, hipri), + .help = "Use polled IO completions", + .category = FIO_OPT_C_ENGINE, + .group = FIO_OPT_G_IOURING, + }, +#ifdef FIO_HAVE_IOPRIO_CLASS + { + .name = "cmdprio_percentage", + .lname = "high priority percentage", + .type = FIO_OPT_INT, + .off1 = offsetof(struct ioring_options, cmdprio_percentage), + .minval = 1, + .maxval = 100, + .help = "Send high priority I/O this percentage of the time", + .category = FIO_OPT_C_ENGINE, + .group = FIO_OPT_G_IOURING, + }, +#else + { + .name = "cmdprio_percentage", + .lname = "high priority percentage", + .type = FIO_OPT_UNSUPPORTED, + .help = "Your platform does not support I/O priority classes", + }, +#endif + { + .name = "fixedbufs", + .lname = "Fixed (pre-mapped) IO buffers", + .type = FIO_OPT_STR_SET, + .off1 = offsetof(struct ioring_options, fixedbufs), + .help = "Pre map IO buffers", + .category = FIO_OPT_C_ENGINE, + .group = FIO_OPT_G_IOURING, + }, + { + .name = "registerfiles", + .lname = "Register file set", + .type = FIO_OPT_STR_SET, + .off1 = offsetof(struct ioring_options, registerfiles), + .help = "Pre-open/register files", + .category = FIO_OPT_C_ENGINE, + .group = FIO_OPT_G_IOURING, + }, + { + .name = "sqthread_poll", + .lname = "Kernel SQ thread polling", + .type = FIO_OPT_INT, + .off1 = offsetof(struct ioring_options, sqpoll_thread), + .help = "Offload submission/completion to kernel thread", + .category = FIO_OPT_C_ENGINE, + .group = FIO_OPT_G_IOURING, + }, + { + .name = "sqthread_poll_cpu", + .lname = "SQ Thread Poll CPU", + .type = FIO_OPT_INT, + .cb = fio_ioring_sqpoll_cb, + .help = "What CPU to run SQ thread polling on", + .category = 
FIO_OPT_C_ENGINE, + .group = FIO_OPT_G_IOURING, + }, + { + .name = "nonvectored", + .lname = "Non-vectored", + .type = FIO_OPT_INT, + .off1 = offsetof(struct ioring_options, nonvectored), + .help = "Use non-vectored read/write commands", + .category = FIO_OPT_C_ENGINE, + .group = FIO_OPT_G_IOURING, + }, + { + .name = "uncached", + .lname = "Uncached", + .type = FIO_OPT_INT, + .off1 = offsetof(struct ioring_options, uncached), + .help = "Use RWF_UNCACHED for buffered read/writes", + .category = FIO_OPT_C_ENGINE, + .group = FIO_OPT_G_IOURING, + }, + { + .name = NULL, + }, +}; + +static int io_uring_enter(struct ioring_data *ld, unsigned int to_submit, + unsigned int min_complete, unsigned int flags) +{ + return syscall(__NR_io_uring_enter, ld->ring_fd, to_submit, + min_complete, flags, NULL, 0); +} + +static int fio_ioring_prep(struct thread_data *td, struct io_u *io_u) +{ + struct ioring_data *ld = td->io_ops_data; + struct ioring_options *o = td->eo; + struct fio_file *f = io_u->file; + struct io_uring_sqe *sqe; + + sqe = &ld->sqes[io_u->index]; + + /* zero out fields not used in this submission */ + memset(sqe, 0, sizeof(*sqe)); + + if (o->registerfiles) { + sqe->fd = f->engine_pos; + sqe->flags = IOSQE_FIXED_FILE; + } else { + sqe->fd = f->fd; + } + + if (io_u->ddir == DDIR_READ || io_u->ddir == DDIR_WRITE) { + if (o->fixedbufs) { + sqe->opcode = fixed_ddir_to_op[io_u->ddir]; + sqe->addr = (unsigned long) io_u->xfer_buf; + sqe->len = io_u->xfer_buflen; + sqe->buf_index = io_u->index; + } else { + sqe->opcode = ddir_to_op[io_u->ddir][!!o->nonvectored]; + if (o->nonvectored) { + sqe->addr = (unsigned long) + ld->iovecs[io_u->index].iov_base; + sqe->len = ld->iovecs[io_u->index].iov_len; + } else { + sqe->addr = (unsigned long) &ld->iovecs[io_u->index]; + sqe->len = 1; + } + } + if (!td->o.odirect && o->uncached) + sqe->rw_flags = RWF_UNCACHED; + if (ld->ioprio_class_set) + sqe->ioprio = td->o.ioprio_class << 13; + if (ld->ioprio_set) + sqe->ioprio |= td->o.ioprio; + sqe->off = io_u->offset; + } else if (ddir_sync(io_u->ddir)) { + if (io_u->ddir == DDIR_SYNC_FILE_RANGE) { + sqe->off = f->first_write; + sqe->len = f->last_write - f->first_write; + sqe->sync_range_flags = td->o.sync_file_range; + sqe->opcode = IORING_OP_SYNC_FILE_RANGE; + } else { + if (io_u->ddir == DDIR_DATASYNC) + sqe->fsync_flags |= IORING_FSYNC_DATASYNC; + sqe->opcode = IORING_OP_FSYNC; + } + } + + sqe->user_data = (unsigned long) io_u; + return 0; +} + +static struct io_u *fio_ioring_event(struct thread_data *td, int event) +{ + struct ioring_data *ld = td->io_ops_data; + struct io_uring_cqe *cqe; + struct io_u *io_u; + unsigned index; + + index = (event + ld->cq_ring_off) & ld->cq_ring_mask; + + cqe = &ld->cq_ring.cqes[index]; + io_u = (struct io_u *) (uintptr_t) cqe->user_data; + + if (cqe->res != io_u->xfer_buflen) { + if (cqe->res > io_u->xfer_buflen) + io_u->error = -cqe->res; + else + io_u->resid = io_u->xfer_buflen - cqe->res; + } else + io_u->error = 0; + + return io_u; +} + +static int fio_ioring_cqring_reap(struct thread_data *td, unsigned int events, + unsigned int max) +{ + struct ioring_data *ld = td->io_ops_data; + struct io_cq_ring *ring = &ld->cq_ring; + unsigned head, reaped = 0; + + head = *ring->head; + do { + read_barrier(); + if (head == *ring->tail) + break; + reaped++; + head++; + } while (reaped + events < max); + + *ring->head = head; + write_barrier(); + return reaped; +} + +static int fio_ioring_getevents(struct thread_data *td, unsigned int min, + unsigned int max, const struct timespec *t) 
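+/*
+ * Reap completions: first drain whatever already sits in the CQ ring;
+ * only when nothing was reaped and the kernel SQ-poll thread is not in
+ * use do we call io_uring_enter(IORING_ENTER_GETEVENTS) to wait for at
+ * least 'actual_min' further completions.
+ */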
+{ + struct ioring_data *ld = td->io_ops_data; + unsigned actual_min = td->o.iodepth_batch_complete_min == 0 ? 0 : min; + struct ioring_options *o = td->eo; + struct io_cq_ring *ring = &ld->cq_ring; + unsigned events = 0; + int r; + + ld->cq_ring_off = *ring->head; + do { + r = fio_ioring_cqring_reap(td, events, max); + if (r) { + events += r; + if (actual_min != 0) + actual_min -= r; + continue; + } + + if (!o->sqpoll_thread) { + r = io_uring_enter(ld, 0, actual_min, + IORING_ENTER_GETEVENTS); + if (r < 0) { + if (errno == EAGAIN || errno == EINTR) + continue; + td_verror(td, errno, "io_uring_enter"); + break; + } + } + } while (events < min); + + return r < 0 ? r : events; +} + +static void fio_ioring_prio_prep(struct thread_data *td, struct io_u *io_u) +{ + struct ioring_options *o = td->eo; + struct ioring_data *ld = td->io_ops_data; + if (rand_between(&td->prio_state, 0, 99) < o->cmdprio_percentage) { + ld->sqes[io_u->index].ioprio = IOPRIO_CLASS_RT << IOPRIO_CLASS_SHIFT; + io_u->flags |= IO_U_F_PRIORITY; + } + return; +} + +static enum fio_q_status fio_ioring_queue(struct thread_data *td, + struct io_u *io_u) +{ + struct ioring_data *ld = td->io_ops_data; + struct io_sq_ring *ring = &ld->sq_ring; + struct ioring_options *o = td->eo; + unsigned tail, next_tail; + + fio_ro_check(td, io_u); + + if (ld->queued == ld->iodepth) + return FIO_Q_BUSY; + + if (io_u->ddir == DDIR_TRIM) { + if (ld->queued) + return FIO_Q_BUSY; + + do_io_u_trim(td, io_u); + io_u_mark_submit(td, 1); + io_u_mark_complete(td, 1); + return FIO_Q_COMPLETED; + } + + tail = *ring->tail; + next_tail = tail + 1; + read_barrier(); + if (next_tail == *ring->head) + return FIO_Q_BUSY; + + if (o->cmdprio_percentage) + fio_ioring_prio_prep(td, io_u); + ring->array[tail & ld->sq_ring_mask] = io_u->index; + *ring->tail = next_tail; + write_barrier(); + + ld->queued++; + return FIO_Q_QUEUED; +} + +static void fio_ioring_queued(struct thread_data *td, int start, int nr) +{ + struct ioring_data *ld = td->io_ops_data; + struct timespec now; + + if (!fio_fill_issue_time(td)) + return; + + fio_gettime(&now, NULL); + + while (nr--) { + struct io_sq_ring *ring = &ld->sq_ring; + int index = ring->array[start & ld->sq_ring_mask]; + struct io_u *io_u = ld->io_u_index[index]; + + memcpy(&io_u->issue_time, &now, sizeof(now)); + io_u_queued(td, io_u); + + start++; + } +} + +static int fio_ioring_commit(struct thread_data *td) +{ + struct ioring_data *ld = td->io_ops_data; + struct ioring_options *o = td->eo; + int ret; + + if (!ld->queued) + return 0; + + /* + * Kernel side does submission. just need to check if the ring is + * flagged as needing a kick, if so, call io_uring_enter(). This + * only happens if we've been idle too long. 
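+	 *
+	 * (The kernel sets IORING_SQ_NEED_WAKEUP in the SQ ring flags once
+	 * its poll thread has gone to sleep; IORING_ENTER_SQ_WAKEUP wakes
+	 * it so the freshly queued SQEs are picked up.)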
+ */ + if (o->sqpoll_thread) { + struct io_sq_ring *ring = &ld->sq_ring; + + read_barrier(); + if (*ring->flags & IORING_SQ_NEED_WAKEUP) + io_uring_enter(ld, ld->queued, 0, + IORING_ENTER_SQ_WAKEUP); + ld->queued = 0; + return 0; + } + + do { + unsigned start = *ld->sq_ring.head; + long nr = ld->queued; + + ret = io_uring_enter(ld, nr, 0, IORING_ENTER_GETEVENTS); + if (ret > 0) { + fio_ioring_queued(td, start, ret); + io_u_mark_submit(td, ret); + + ld->queued -= ret; + ret = 0; + } else if (!ret) { + io_u_mark_submit(td, ret); + continue; + } else { + if (errno == EAGAIN || errno == EINTR) { + ret = fio_ioring_cqring_reap(td, 0, ld->queued); + if (ret) + continue; + /* Shouldn't happen */ + usleep(1); + continue; + } + td_verror(td, errno, "io_uring_enter submit"); + break; + } + } while (ld->queued); + + return ret; +} + +static void fio_ioring_unmap(struct ioring_data *ld) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(ld->mmap); i++) + munmap(ld->mmap[i].ptr, ld->mmap[i].len); + close(ld->ring_fd); +} + +static void fio_ioring_cleanup(struct thread_data *td) +{ + struct ioring_data *ld = td->io_ops_data; + + if (ld) { + if (!(td->flags & TD_F_CHILD)) + fio_ioring_unmap(ld); + + free(ld->io_u_index); + free(ld->iovecs); + free(ld->fds); + free(ld); + } +} + +static int fio_ioring_mmap(struct ioring_data *ld, struct io_uring_params *p) +{ + struct io_sq_ring *sring = &ld->sq_ring; + struct io_cq_ring *cring = &ld->cq_ring; + void *ptr; + + ld->mmap[0].len = p->sq_off.array + p->sq_entries * sizeof(__u32); + ptr = mmap(0, ld->mmap[0].len, PROT_READ | PROT_WRITE, + MAP_SHARED | MAP_POPULATE, ld->ring_fd, + IORING_OFF_SQ_RING); + ld->mmap[0].ptr = ptr; + sring->head = ptr + p->sq_off.head; + sring->tail = ptr + p->sq_off.tail; + sring->ring_mask = ptr + p->sq_off.ring_mask; + sring->ring_entries = ptr + p->sq_off.ring_entries; + sring->flags = ptr + p->sq_off.flags; + sring->array = ptr + p->sq_off.array; + ld->sq_ring_mask = *sring->ring_mask; + + ld->mmap[1].len = p->sq_entries * sizeof(struct io_uring_sqe); + ld->sqes = mmap(0, ld->mmap[1].len, PROT_READ | PROT_WRITE, + MAP_SHARED | MAP_POPULATE, ld->ring_fd, + IORING_OFF_SQES); + ld->mmap[1].ptr = ld->sqes; + + ld->mmap[2].len = p->cq_off.cqes + + p->cq_entries * sizeof(struct io_uring_cqe); + ptr = mmap(0, ld->mmap[2].len, PROT_READ | PROT_WRITE, + MAP_SHARED | MAP_POPULATE, ld->ring_fd, + IORING_OFF_CQ_RING); + ld->mmap[2].ptr = ptr; + cring->head = ptr + p->cq_off.head; + cring->tail = ptr + p->cq_off.tail; + cring->ring_mask = ptr + p->cq_off.ring_mask; + cring->ring_entries = ptr + p->cq_off.ring_entries; + cring->cqes = ptr + p->cq_off.cqes; + ld->cq_ring_mask = *cring->ring_mask; + return 0; +} + +static int fio_ioring_queue_init(struct thread_data *td) +{ + struct ioring_data *ld = td->io_ops_data; + struct ioring_options *o = td->eo; + int depth = td->o.iodepth; + struct io_uring_params p; + int ret; + + memset(&p, 0, sizeof(p)); + + if (o->hipri) + p.flags |= IORING_SETUP_IOPOLL; + if (o->sqpoll_thread) { + p.flags |= IORING_SETUP_SQPOLL; + if (o->sqpoll_set) { + p.flags |= IORING_SETUP_SQ_AFF; + p.sq_thread_cpu = o->sqpoll_cpu; + } + } + + ret = syscall(__NR_io_uring_setup, depth, &p); + if (ret < 0) + return ret; + + ld->ring_fd = ret; + + if (o->fixedbufs) { + struct rlimit rlim = { + .rlim_cur = RLIM_INFINITY, + .rlim_max = RLIM_INFINITY, + }; + + if (setrlimit(RLIMIT_MEMLOCK, &rlim) < 0) + return -1; + + ret = syscall(__NR_io_uring_register, ld->ring_fd, + IORING_REGISTER_BUFFERS, ld->iovecs, depth); + if (ret < 0) + return 
ret; + } + + return fio_ioring_mmap(ld, &p); +} + +static int fio_ioring_register_files(struct thread_data *td) +{ + struct ioring_data *ld = td->io_ops_data; + struct fio_file *f; + unsigned int i; + int ret; + + ld->fds = calloc(td->o.nr_files, sizeof(int)); + + for_each_file(td, f, i) { + ret = generic_open_file(td, f); + if (ret) + goto err; + ld->fds[i] = f->fd; + f->engine_pos = i; + } + + ret = syscall(__NR_io_uring_register, ld->ring_fd, + IORING_REGISTER_FILES, ld->fds, td->o.nr_files); + if (ret) { +err: + free(ld->fds); + ld->fds = NULL; + } + + /* + * Pretend the file is closed again, and really close it if we hit + * an error. + */ + for_each_file(td, f, i) { + if (ret) { + int fio_unused ret2; + ret2 = generic_close_file(td, f); + } else + f->fd = -1; + } + + return ret; +} + +static int fio_ioring_post_init(struct thread_data *td) +{ + struct ioring_data *ld = td->io_ops_data; + struct ioring_options *o = td->eo; + struct io_u *io_u; + int err, i; + + for (i = 0; i < td->o.iodepth; i++) { + struct iovec *iov = &ld->iovecs[i]; + + io_u = ld->io_u_index[i]; + iov->iov_base = io_u->buf; + iov->iov_len = td_max_bs(td); + } + + err = fio_ioring_queue_init(td); + if (err) { + td_verror(td, errno, "io_queue_init"); + return 1; + } + + if (o->registerfiles) { + err = fio_ioring_register_files(td); + if (err) { + td_verror(td, errno, "ioring_register_files"); + return 1; + } + } + + return 0; +} + +static unsigned roundup_pow2(unsigned depth) +{ + return 1UL << __fls(depth - 1); +} + +static int fio_ioring_init(struct thread_data *td) +{ + struct ioring_options *o = td->eo; + struct ioring_data *ld; + struct thread_options *to = &td->o; + + /* sqthread submission requires registered files */ + if (o->sqpoll_thread) + o->registerfiles = 1; + + if (o->registerfiles && td->o.nr_files != td->o.open_files) { + log_err("fio: io_uring registered files require nr_files to " + "be identical to open_files\n"); + return 1; + } + + ld = calloc(1, sizeof(*ld)); + + /* ring depth must be a power-of-2 */ + ld->iodepth = td->o.iodepth; + td->o.iodepth = roundup_pow2(td->o.iodepth); + + /* io_u index */ + ld->io_u_index = calloc(td->o.iodepth, sizeof(struct io_u *)); + ld->iovecs = calloc(td->o.iodepth, sizeof(struct iovec)); + + td->io_ops_data = ld; + + /* + * Check for option conflicts + */ + if ((fio_option_is_set(to, ioprio) || fio_option_is_set(to, ioprio_class)) && + o->cmdprio_percentage != 0) { + log_err("%s: cmdprio_percentage option and mutually exclusive " + "prio or prioclass option is set, exiting\n", to->name); + td_verror(td, EINVAL, "fio_io_uring_init"); + return 1; + } + + if (fio_option_is_set(&td->o, ioprio_class)) + ld->ioprio_class_set = true; + if (fio_option_is_set(&td->o, ioprio)) + ld->ioprio_set = true; + + return 0; +} + +static int fio_ioring_io_u_init(struct thread_data *td, struct io_u *io_u) +{ + struct ioring_data *ld = td->io_ops_data; + + ld->io_u_index[io_u->index] = io_u; + return 0; +} + +static int fio_ioring_open_file(struct thread_data *td, struct fio_file *f) +{ + struct ioring_data *ld = td->io_ops_data; + struct ioring_options *o = td->eo; + + if (!ld || !o->registerfiles) + return generic_open_file(td, f); + + f->fd = ld->fds[f->engine_pos]; + return 0; +} + +static int fio_ioring_close_file(struct thread_data *td, struct fio_file *f) +{ + struct ioring_data *ld = td->io_ops_data; + struct ioring_options *o = td->eo; + + if (!ld || !o->registerfiles) + return generic_close_file(td, f); + + f->fd = -1; + return 0; +} + +static struct ioengine_ops ioengine = 
{ + .name = "io_uring", + .version = FIO_IOOPS_VERSION, + .flags = FIO_ASYNCIO_SYNC_TRIM, + .init = fio_ioring_init, + .post_init = fio_ioring_post_init, + .io_u_init = fio_ioring_io_u_init, + .prep = fio_ioring_prep, + .queue = fio_ioring_queue, + .commit = fio_ioring_commit, + .getevents = fio_ioring_getevents, + .event = fio_ioring_event, + .cleanup = fio_ioring_cleanup, + .open_file = fio_ioring_open_file, + .close_file = fio_ioring_close_file, + .get_file_size = generic_get_file_size, + .options = options, + .option_struct_size = sizeof(struct ioring_options), +}; + +static void fio_init fio_ioring_register(void) +{ + register_ioengine(&ioengine); +} + +static void fio_exit fio_ioring_unregister(void) +{ + unregister_ioengine(&ioengine); +} +#endif diff --git a/engines/libaio.c b/engines/libaio.c new file mode 100644 index 0000000..299798a --- /dev/null +++ b/engines/libaio.c @@ -0,0 +1,456 @@ +/* + * libaio engine + * + * IO engine using the Linux native aio interface. + * + */ +#include +#include +#include +#include +#include +#include + +#include "../fio.h" +#include "../lib/pow2.h" +#include "../optgroup.h" +#include "../lib/memalign.h" + +/* Should be defined in newest aio_abi.h */ +#ifndef IOCB_FLAG_IOPRIO +#define IOCB_FLAG_IOPRIO (1 << 1) +#endif + +static int fio_libaio_commit(struct thread_data *td); +static int fio_libaio_init(struct thread_data *td); + +struct libaio_data { + io_context_t aio_ctx; + struct io_event *aio_events; + struct iocb **iocbs; + struct io_u **io_us; + + struct io_u **io_u_index; + + /* + * Basic ring buffer. 'head' is incremented in _queue(), and + * 'tail' is incremented in _commit(). We keep 'queued' so + * that we know if the ring is full or empty, when + * 'head' == 'tail'. 'entries' is the ring size, and + * 'is_pow2' is just an optimization to use AND instead of + * modulus to get the remainder on ring increment. 
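+	 *
+	 * For example, with entries = 8 (a power of two) an index at 6
+	 * advanced by 3 becomes (6 + 3) & 7 == 1, matching (6 + 3) % 8
+	 * without the division.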
+ */ + int is_pow2; + unsigned int entries; + unsigned int queued; + unsigned int head; + unsigned int tail; +}; + +struct libaio_options { + void *pad; + unsigned int userspace_reap; + unsigned int cmdprio_percentage; +}; + +static struct fio_option options[] = { + { + .name = "userspace_reap", + .lname = "Libaio userspace reaping", + .type = FIO_OPT_STR_SET, + .off1 = offsetof(struct libaio_options, userspace_reap), + .help = "Use alternative user-space reap implementation", + .category = FIO_OPT_C_ENGINE, + .group = FIO_OPT_G_LIBAIO, + }, +#ifdef FIO_HAVE_IOPRIO_CLASS + { + .name = "cmdprio_percentage", + .lname = "high priority percentage", + .type = FIO_OPT_INT, + .off1 = offsetof(struct libaio_options, cmdprio_percentage), + .minval = 1, + .maxval = 100, + .help = "Send high priority I/O this percentage of the time", + .category = FIO_OPT_C_ENGINE, + .group = FIO_OPT_G_LIBAIO, + }, +#else + { + .name = "cmdprio_percentage", + .lname = "high priority percentage", + .type = FIO_OPT_UNSUPPORTED, + .help = "Your platform does not support I/O priority classes", + }, +#endif + { + .name = NULL, + }, +}; + +static inline void ring_inc(struct libaio_data *ld, unsigned int *val, + unsigned int add) +{ + if (ld->is_pow2) + *val = (*val + add) & (ld->entries - 1); + else + *val = (*val + add) % ld->entries; +} + +static int fio_libaio_prep(struct thread_data fio_unused *td, struct io_u *io_u) +{ + struct fio_file *f = io_u->file; + struct iocb *iocb = &io_u->iocb; + + if (io_u->ddir == DDIR_READ) { + io_prep_pread(iocb, f->fd, io_u->xfer_buf, io_u->xfer_buflen, io_u->offset); + } else if (io_u->ddir == DDIR_WRITE) { + io_prep_pwrite(iocb, f->fd, io_u->xfer_buf, io_u->xfer_buflen, io_u->offset); + } else if (ddir_sync(io_u->ddir)) + io_prep_fsync(iocb, f->fd); + + return 0; +} + +static void fio_libaio_prio_prep(struct thread_data *td, struct io_u *io_u) +{ + struct libaio_options *o = td->eo; + if (rand_between(&td->prio_state, 0, 99) < o->cmdprio_percentage) { + io_u->iocb.aio_reqprio = IOPRIO_CLASS_RT << IOPRIO_CLASS_SHIFT; + io_u->iocb.u.c.flags |= IOCB_FLAG_IOPRIO; + io_u->flags |= IO_U_F_PRIORITY; + } + return; +} + +static struct io_u *fio_libaio_event(struct thread_data *td, int event) +{ + struct libaio_data *ld = td->io_ops_data; + struct io_event *ev; + struct io_u *io_u; + + ev = ld->aio_events + event; + io_u = container_of(ev->obj, struct io_u, iocb); + + if (ev->res != io_u->xfer_buflen) { + if (ev->res > io_u->xfer_buflen) + io_u->error = -ev->res; + else + io_u->resid = io_u->xfer_buflen - ev->res; + } else + io_u->error = 0; + + return io_u; +} + +struct aio_ring { + unsigned id; /** kernel internal index number */ + unsigned nr; /** number of io_events */ + unsigned head; + unsigned tail; + + unsigned magic; + unsigned compat_features; + unsigned incompat_features; + unsigned header_length; /** size of aio_ring */ + + struct io_event events[0]; +}; + +#define AIO_RING_MAGIC 0xa10a10a1 + +static int user_io_getevents(io_context_t aio_ctx, unsigned int max, + struct io_event *events) +{ + long i = 0; + unsigned head; + struct aio_ring *ring = (struct aio_ring*) aio_ctx; + + while (i < max) { + head = ring->head; + + if (head == ring->tail) { + /* There are no more completions */ + break; + } else { + /* There is another completion to reap */ + events[i] = ring->events[head]; + read_barrier(); + ring->head = (head + 1) % ring->nr; + i++; + } + } + + return i; +} + +static int fio_libaio_getevents(struct thread_data *td, unsigned int min, + unsigned int max, const struct timespec 
*t) +{ + struct libaio_data *ld = td->io_ops_data; + struct libaio_options *o = td->eo; + unsigned actual_min = td->o.iodepth_batch_complete_min == 0 ? 0 : min; + struct timespec __lt, *lt = NULL; + int r, events = 0; + + if (t) { + __lt = *t; + lt = &__lt; + } + + do { + if (o->userspace_reap == 1 + && actual_min == 0 + && ((struct aio_ring *)(ld->aio_ctx))->magic + == AIO_RING_MAGIC) { + r = user_io_getevents(ld->aio_ctx, max, + ld->aio_events + events); + } else { + r = io_getevents(ld->aio_ctx, actual_min, + max, ld->aio_events + events, lt); + } + if (r > 0) + events += r; + else if ((min && r == 0) || r == -EAGAIN) { + fio_libaio_commit(td); + if (actual_min) + usleep(10); + } else if (r != -EINTR) + break; + } while (events < min); + + return r < 0 ? r : events; +} + +static enum fio_q_status fio_libaio_queue(struct thread_data *td, + struct io_u *io_u) +{ + struct libaio_data *ld = td->io_ops_data; + struct libaio_options *o = td->eo; + + fio_ro_check(td, io_u); + + if (ld->queued == td->o.iodepth) + return FIO_Q_BUSY; + + /* + * fsync is tricky, since it can fail and we need to do it + * serialized with other io. the reason is that linux doesn't + * support aio fsync yet. So return busy for the case where we + * have pending io, to let fio complete those first. + */ + if (ddir_sync(io_u->ddir)) { + if (ld->queued) + return FIO_Q_BUSY; + + do_io_u_sync(td, io_u); + return FIO_Q_COMPLETED; + } + + if (io_u->ddir == DDIR_TRIM) { + if (ld->queued) + return FIO_Q_BUSY; + + do_io_u_trim(td, io_u); + io_u_mark_submit(td, 1); + io_u_mark_complete(td, 1); + return FIO_Q_COMPLETED; + } + + if (o->cmdprio_percentage) + fio_libaio_prio_prep(td, io_u); + + ld->iocbs[ld->head] = &io_u->iocb; + ld->io_us[ld->head] = io_u; + ring_inc(ld, &ld->head, 1); + ld->queued++; + return FIO_Q_QUEUED; +} + +static void fio_libaio_queued(struct thread_data *td, struct io_u **io_us, + unsigned int nr) +{ + struct timespec now; + unsigned int i; + + if (!fio_fill_issue_time(td)) + return; + + fio_gettime(&now, NULL); + + for (i = 0; i < nr; i++) { + struct io_u *io_u = io_us[i]; + + memcpy(&io_u->issue_time, &now, sizeof(now)); + io_u_queued(td, io_u); + } +} + +static int fio_libaio_commit(struct thread_data *td) +{ + struct libaio_data *ld = td->io_ops_data; + struct iocb **iocbs; + struct io_u **io_us; + struct timespec ts; + int ret, wait_start = 0; + + if (!ld->queued) + return 0; + + do { + long nr = ld->queued; + + nr = min((unsigned int) nr, ld->entries - ld->tail); + io_us = ld->io_us + ld->tail; + iocbs = ld->iocbs + ld->tail; + + ret = io_submit(ld->aio_ctx, nr, iocbs); + if (ret > 0) { + fio_libaio_queued(td, io_us, ret); + io_u_mark_submit(td, ret); + + ld->queued -= ret; + ring_inc(ld, &ld->tail, ret); + ret = 0; + wait_start = 0; + } else if (ret == -EINTR || !ret) { + if (!ret) + io_u_mark_submit(td, ret); + wait_start = 0; + continue; + } else if (ret == -EAGAIN) { + /* + * If we get EAGAIN, we should break out without + * error and let the upper layer reap some + * events for us. If we have no queued IO, we + * must loop here. If we loop for more than 30s, + * just error out, something must be buggy in the + * IO path. + */ + if (ld->queued) { + ret = 0; + break; + } + if (!wait_start) { + fio_gettime(&ts, NULL); + wait_start = 1; + } else if (mtime_since_now(&ts) > 30000) { + log_err("fio: aio appears to be stalled, giving up\n"); + break; + } + usleep(1); + continue; + } else if (ret == -ENOMEM) { + /* + * If we get -ENOMEM, reap events if we can. 
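+			 * (Returning 0 while IO is still queued lets the
+			 * upper layer reap via getevents() and then retry
+			 * the commit.)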
If + * we cannot, treat it as a fatal event since there's + * nothing we can do about it. + */ + if (ld->queued) + ret = 0; + break; + } else + break; + } while (ld->queued); + + return ret; +} + +static int fio_libaio_cancel(struct thread_data *td, struct io_u *io_u) +{ + struct libaio_data *ld = td->io_ops_data; + + return io_cancel(ld->aio_ctx, &io_u->iocb, ld->aio_events); +} + +static void fio_libaio_cleanup(struct thread_data *td) +{ + struct libaio_data *ld = td->io_ops_data; + + if (ld) { + /* + * Work-around to avoid huge RCU stalls at exit time. If we + * don't do this here, then it'll be torn down by exit_aio(). + * But for that case we can parallellize the freeing, thus + * speeding it up a lot. + */ + if (!(td->flags & TD_F_CHILD)) + io_destroy(ld->aio_ctx); + free(ld->aio_events); + free(ld->iocbs); + free(ld->io_us); + free(ld); + } +} + +static int fio_libaio_post_init(struct thread_data *td) +{ + struct libaio_data *ld = td->io_ops_data; + int err; + + err = io_queue_init(td->o.iodepth, &ld->aio_ctx); + if (err) { + td_verror(td, -err, "io_queue_init"); + return 1; + } + + return 0; +} + +static int fio_libaio_init(struct thread_data *td) +{ + struct libaio_data *ld; + struct thread_options *to = &td->o; + struct libaio_options *o = td->eo; + + ld = calloc(1, sizeof(*ld)); + + ld->entries = td->o.iodepth; + ld->is_pow2 = is_power_of_2(ld->entries); + ld->aio_events = calloc(ld->entries, sizeof(struct io_event)); + ld->iocbs = calloc(ld->entries, sizeof(struct iocb *)); + ld->io_us = calloc(ld->entries, sizeof(struct io_u *)); + + td->io_ops_data = ld; + /* + * Check for option conflicts + */ + if ((fio_option_is_set(to, ioprio) || fio_option_is_set(to, ioprio_class)) && + o->cmdprio_percentage != 0) { + log_err("%s: cmdprio_percentage option and mutually exclusive " + "prio or prioclass option is set, exiting\n", to->name); + td_verror(td, EINVAL, "fio_libaio_init"); + return 1; + } + return 0; +} + +static struct ioengine_ops ioengine = { + .name = "libaio", + .version = FIO_IOOPS_VERSION, + .flags = FIO_ASYNCIO_SYNC_TRIM, + .init = fio_libaio_init, + .post_init = fio_libaio_post_init, + .prep = fio_libaio_prep, + .queue = fio_libaio_queue, + .commit = fio_libaio_commit, + .cancel = fio_libaio_cancel, + .getevents = fio_libaio_getevents, + .event = fio_libaio_event, + .cleanup = fio_libaio_cleanup, + .open_file = generic_open_file, + .close_file = generic_close_file, + .get_file_size = generic_get_file_size, + .options = options, + .option_struct_size = sizeof(struct libaio_options), +}; + +static void fio_init fio_libaio_register(void) +{ + register_ioengine(&ioengine); +} + +static void fio_exit fio_libaio_unregister(void) +{ + unregister_ioengine(&ioengine); +} diff --git a/engines/libhdfs.c b/engines/libhdfs.c new file mode 100644 index 0000000..c57fcea --- /dev/null +++ b/engines/libhdfs.c @@ -0,0 +1,421 @@ +/* + * libhdfs engine + * + * this engine helps perform read/write operations on hdfs cluster using + * libhdfs. hdfs does not support modification of data once file is created. + * + * so to mimic that create many files of small size (e.g 256k), and this + * engine select a file based on the offset generated by fio. + * + * thus, random reads and writes can also be achieved with this logic. 
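+ *
+ * for example (illustrative): with the default chunck_size of 1048576,
+ * an IO at offset 4718592 lands in chunk 4718592 / 1048576 = 4, i.e. in
+ * the file "<file_name>_4", at offset 4718592 % 1048576 = 524288 within
+ * that chunk.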
+ * + */ + +#include +#include + +#include "../fio.h" +#include "../optgroup.h" + +#define CHUNCK_NAME_LENGTH_MAX 80 +#define CHUNCK_CREATION_BUFFER_SIZE 65536 + +struct hdfsio_data { + hdfsFS fs; + hdfsFile fp; + uint64_t curr_file_id; +}; + +struct hdfsio_options { + void *pad; /* needed because offset can't be 0 for a option defined used offsetof */ + char *host; + char *directory; + unsigned int port; + unsigned int chunck_size; + unsigned int single_instance; + unsigned int use_direct; +}; + +static struct fio_option options[] = { + { + .name = "namenode", + .lname = "hfds namenode", + .type = FIO_OPT_STR_STORE, + .off1 = offsetof(struct hdfsio_options, host), + .def = "localhost", + .help = "Namenode of the HDFS cluster", + .category = FIO_OPT_C_ENGINE, + .group = FIO_OPT_G_HDFS, + }, + { + .name = "hostname", + .lname = "hfds namenode", + .type = FIO_OPT_STR_STORE, + .off1 = offsetof(struct hdfsio_options, host), + .def = "localhost", + .help = "Namenode of the HDFS cluster", + .category = FIO_OPT_C_ENGINE, + .group = FIO_OPT_G_HDFS, + }, + { + .name = "port", + .lname = "hdfs namenode port", + .type = FIO_OPT_INT, + .off1 = offsetof(struct hdfsio_options, port), + .def = "9000", + .minval = 1, + .maxval = 65535, + .help = "Port used by the HDFS cluster namenode", + .category = FIO_OPT_C_ENGINE, + .group = FIO_OPT_G_HDFS, + }, + { + .name = "hdfsdirectory", + .lname = "hfds directory", + .type = FIO_OPT_STR_STORE, + .off1 = offsetof(struct hdfsio_options, directory), + .def = "/", + .help = "The HDFS directory where fio will create chunks", + .category = FIO_OPT_C_ENGINE, + .group = FIO_OPT_G_HDFS, + }, + { + .name = "chunk_size", + .alias = "chunck_size", + .lname = "Chunk size", + .type = FIO_OPT_INT, + .off1 = offsetof(struct hdfsio_options, chunck_size), + .def = "1048576", + .help = "Size of individual chunk", + .category = FIO_OPT_C_ENGINE, + .group = FIO_OPT_G_HDFS, + }, + { + .name = "single_instance", + .lname = "Single Instance", + .type = FIO_OPT_BOOL, + .off1 = offsetof(struct hdfsio_options, single_instance), + .def = "1", + .help = "Use a single instance", + .category = FIO_OPT_C_ENGINE, + .group = FIO_OPT_G_HDFS, + }, + { + .name = "hdfs_use_direct", + .lname = "HDFS Use Direct", + .type = FIO_OPT_BOOL, + .off1 = offsetof(struct hdfsio_options, use_direct), + .def = "0", + .help = "Use readDirect instead of hdfsRead", + .category = FIO_OPT_C_ENGINE, + .group = FIO_OPT_G_HDFS, + }, + { + .name = NULL, + }, +}; + + +static int get_chunck_name(char *dest, char *file_name, uint64_t chunk_id) { + return snprintf(dest, CHUNCK_NAME_LENGTH_MAX, "%s_%lu", file_name, chunk_id); +} + +static int fio_hdfsio_prep(struct thread_data *td, struct io_u *io_u) +{ + struct hdfsio_options *options = td->eo; + struct hdfsio_data *hd = td->io_ops_data; + unsigned long f_id; + char fname[CHUNCK_NAME_LENGTH_MAX]; + int open_flags; + + /* find out file id based on the offset generated by fio */ + f_id = floor(io_u->offset / options-> chunck_size); + + if (f_id == hd->curr_file_id) { + /* file is already open */ + return 0; + } + + if (hd->curr_file_id != -1) { + if ( hdfsCloseFile(hd->fs, hd->fp) == -1) { + log_err("hdfs: unable to close file: %s\n", strerror(errno)); + return errno; + } + hd->curr_file_id = -1; + } + + if (io_u->ddir == DDIR_READ || io_u->ddir == DDIR_SYNC) { + open_flags = O_RDONLY; + } else if (io_u->ddir == DDIR_WRITE) { + open_flags = O_WRONLY; + } else { + log_err("hdfs: Invalid I/O Operation\n"); + return 0; + } + + get_chunck_name(fname, io_u->file->file_name, f_id); 
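+	/* the chunk backing this offset is not currently open: open it with
+	 * the access mode chosen above, passing chunck_size as the HDFS
+	 * block size used when a chunk is first created */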
+ hd->fp = hdfsOpenFile(hd->fs, fname, open_flags, 0, 0, + options->chunck_size); + if(hd->fp == NULL) { + log_err("hdfs: unable to open file: %s: %d\n", fname, strerror(errno)); + return errno; + } + hd->curr_file_id = f_id; + + return 0; +} + +static enum fio_q_status fio_hdfsio_queue(struct thread_data *td, + struct io_u *io_u) +{ + struct hdfsio_data *hd = td->io_ops_data; + struct hdfsio_options *options = td->eo; + int ret; + unsigned long offset; + + offset = io_u->offset % options->chunck_size; + + if( (io_u->ddir == DDIR_READ || io_u->ddir == DDIR_WRITE) && + hdfsTell(hd->fs, hd->fp) != offset && hdfsSeek(hd->fs, hd->fp, offset) != 0 ) { + log_err("hdfs: seek failed: %s, are you doing random write smaller than chunk size ?\n", strerror(errno)); + io_u->error = errno; + return FIO_Q_COMPLETED; + }; + + // do the IO + if (io_u->ddir == DDIR_READ) { + if (options->use_direct) { + ret = readDirect(hd->fs, hd->fp, io_u->xfer_buf, io_u->xfer_buflen); + } else { + ret = hdfsRead(hd->fs, hd->fp, io_u->xfer_buf, io_u->xfer_buflen); + } + } else if (io_u->ddir == DDIR_WRITE) { + ret = hdfsWrite(hd->fs, hd->fp, io_u->xfer_buf, + io_u->xfer_buflen); + } else if (io_u->ddir == DDIR_SYNC) { + ret = hdfsFlush(hd->fs, hd->fp); + } else { + log_err("hdfs: Invalid I/O Operation: %d\n", io_u->ddir); + ret = EINVAL; + } + + // Check if the IO went fine, or is incomplete + if (ret != (int)io_u->xfer_buflen) { + if (ret >= 0) { + io_u->resid = io_u->xfer_buflen - ret; + io_u->error = 0; + return FIO_Q_COMPLETED; + } else { + io_u->error = errno; + } + } + + if (io_u->error) + td_verror(td, io_u->error, "xfer"); + + return FIO_Q_COMPLETED; +} + +int fio_hdfsio_open_file(struct thread_data *td, struct fio_file *f) +{ + if (td->o.odirect) { + td->error = EINVAL; + return 0; + } + + return 0; +} + +int fio_hdfsio_close_file(struct thread_data *td, struct fio_file *f) +{ + struct hdfsio_data *hd = td->io_ops_data; + + if (hd->curr_file_id != -1) { + if ( hdfsCloseFile(hd->fs, hd->fp) == -1) { + log_err("hdfs: unable to close file: %s\n", strerror(errno)); + return errno; + } + hd->curr_file_id = -1; + } + return 0; +} + +static int fio_hdfsio_init(struct thread_data *td) +{ + struct hdfsio_options *options = td->eo; + struct hdfsio_data *hd = td->io_ops_data; + struct fio_file *f; + uint64_t j,k; + int i, failure = 0; + uint8_t buffer[CHUNCK_CREATION_BUFFER_SIZE]; + uint64_t bytes_left; + char fname[CHUNCK_NAME_LENGTH_MAX]; + hdfsFile fp; + hdfsFileInfo *fi; + tOffset fi_size; + + for_each_file(td, f, i) { + k = 0; + for(j=0; j < f->real_file_size; j += options->chunck_size) { + get_chunck_name(fname, f->file_name, k++); + fi = hdfsGetPathInfo(hd->fs, fname); + fi_size = fi ? 
fi->mSize : 0; + // fill exist and is big enough, nothing to do + if( fi && fi_size >= options->chunck_size) { + continue; + } + fp = hdfsOpenFile(hd->fs, fname, O_WRONLY, 0, 0, + options->chunck_size); + if(fp == NULL) { + failure = errno; + log_err("hdfs: unable to prepare file chunk %s: %s\n", fname, strerror(errno)); + break; + } + bytes_left = options->chunck_size; + memset(buffer, 0, CHUNCK_CREATION_BUFFER_SIZE); + while( bytes_left > CHUNCK_CREATION_BUFFER_SIZE) { + if( hdfsWrite(hd->fs, fp, buffer, CHUNCK_CREATION_BUFFER_SIZE) + != CHUNCK_CREATION_BUFFER_SIZE) { + failure = errno; + log_err("hdfs: unable to prepare file chunk %s: %s\n", fname, strerror(errno)); + break; + }; + bytes_left -= CHUNCK_CREATION_BUFFER_SIZE; + } + if(bytes_left > 0) { + if( hdfsWrite(hd->fs, fp, buffer, bytes_left) + != bytes_left) { + failure = errno; + break; + }; + } + if( hdfsCloseFile(hd->fs, fp) != 0) { + failure = errno; + log_err("hdfs: unable to prepare file chunk %s: %s\n", fname, strerror(errno)); + break; + } + } + if(failure) { + break; + } + } + + if( !failure ) { + fio_file_set_size_known(f); + } + + return failure; +} + +static int fio_hdfsio_setup(struct thread_data *td) +{ + struct hdfsio_data *hd; + struct fio_file *f; + int i; + uint64_t file_size, total_file_size; + + if (!td->io_ops_data) { + hd = malloc(sizeof(*hd)); + memset(hd, 0, sizeof(*hd)); + + hd->curr_file_id = -1; + + td->io_ops_data = hd; + } + + total_file_size = 0; + file_size = 0; + + for_each_file(td, f, i) { + if(!td->o.file_size_low) { + file_size = floor(td->o.size / td->o.nr_files); + total_file_size += file_size; + } + else if (td->o.file_size_low == td->o.file_size_high) + file_size = td->o.file_size_low; + else { + file_size = get_rand_file_size(td); + } + f->real_file_size = file_size; + } + /* If the size doesn't divide nicely with the chunk size, + * make the last files bigger. + * Used only if filesize was not explicitly given + */ + if (!td->o.file_size_low && total_file_size < td->o.size) { + f->real_file_size += (td->o.size - total_file_size); + } + + return 0; +} + +static int fio_hdfsio_io_u_init(struct thread_data *td, struct io_u *io_u) +{ + struct hdfsio_data *hd = td->io_ops_data; + struct hdfsio_options *options = td->eo; + int failure; + struct hdfsBuilder *bld; + + if (options->host == NULL || options->port == 0) { + log_err("hdfs: server not defined\n"); + return EINVAL; + } + + bld = hdfsNewBuilder(); + if (!bld) { + failure = errno; + log_err("hdfs: unable to allocate connect builder\n"); + return failure; + } + hdfsBuilderSetNameNode(bld, options->host); + hdfsBuilderSetNameNodePort(bld, options->port); + if(! 
options->single_instance) { + hdfsBuilderSetForceNewInstance(bld); + } + hd->fs = hdfsBuilderConnect(bld); + + /* hdfsSetWorkingDirectory succeed on non-existent directory */ + if (hdfsExists(hd->fs, options->directory) < 0 || hdfsSetWorkingDirectory(hd->fs, options->directory) < 0) { + failure = errno; + log_err("hdfs: invalid working directory %s: %s\n", options->directory, strerror(errno)); + return failure; + } + + return 0; +} + +static void fio_hdfsio_io_u_free(struct thread_data *td, struct io_u *io_u) +{ + struct hdfsio_data *hd = td->io_ops_data; + + if (hd->fs && hdfsDisconnect(hd->fs) < 0) { + log_err("hdfs: disconnect failed: %d\n", errno); + } +} + +static struct ioengine_ops ioengine_hdfs = { + .name = "libhdfs", + .version = FIO_IOOPS_VERSION, + .flags = FIO_SYNCIO | FIO_DISKLESSIO | FIO_NODISKUTIL, + .setup = fio_hdfsio_setup, + .init = fio_hdfsio_init, + .prep = fio_hdfsio_prep, + .queue = fio_hdfsio_queue, + .open_file = fio_hdfsio_open_file, + .close_file = fio_hdfsio_close_file, + .io_u_init = fio_hdfsio_io_u_init, + .io_u_free = fio_hdfsio_io_u_free, + .option_struct_size = sizeof(struct hdfsio_options), + .options = options, +}; + + +static void fio_init fio_hdfsio_register(void) +{ + register_ioengine(&ioengine_hdfs); +} + +static void fio_exit fio_hdfsio_unregister(void) +{ + unregister_ioengine(&ioengine_hdfs); +} diff --git a/engines/libiscsi.c b/engines/libiscsi.c new file mode 100644 index 0000000..58667fb --- /dev/null +++ b/engines/libiscsi.c @@ -0,0 +1,411 @@ +/* + * libiscsi engine + * + * this engine read/write iscsi lun with libiscsi. + */ + + +#include "../fio.h" +#include "../optgroup.h" + +#include +#include +#include +#include + +struct iscsi_lun; +struct iscsi_info; + +struct iscsi_task { + struct scsi_task *scsi_task; + struct iscsi_lun *iscsi_lun; + struct io_u *io_u; +}; + +struct iscsi_lun { + struct iscsi_info *iscsi_info; + struct iscsi_context *iscsi; + struct iscsi_url *url; + int block_size; + uint64_t num_blocks; +}; + +struct iscsi_info { + struct iscsi_lun **luns; + int nr_luns; + struct pollfd *pfds; + struct iscsi_task **complete_events; + int nr_events; +}; + +struct iscsi_options { + void *pad; + char *initiator; +}; + +static struct fio_option options[] = { + { + .name = "initiator", + .lname = "initiator", + .type = FIO_OPT_STR_STORE, + .off1 = offsetof(struct iscsi_options, initiator), + .def = "iqn.2019-04.org.fio:fio", + .help = "initiator name", + .category = FIO_OPT_C_ENGINE, + .group = FIO_OPT_G_ISCSI, + }, + + { + .name = NULL, + }, +}; + +static int fio_iscsi_setup_lun(struct iscsi_info *iscsi_info, + char *initiator, struct fio_file *f, int i) +{ + struct iscsi_lun *iscsi_lun = NULL; + struct scsi_task *task = NULL; + struct scsi_readcapacity16 *rc16 = NULL; + int ret = 0; + + iscsi_lun = malloc(sizeof(struct iscsi_lun)); + memset(iscsi_lun, 0, sizeof(struct iscsi_lun)); + + iscsi_lun->iscsi_info = iscsi_info; + + iscsi_lun->url = iscsi_parse_full_url(NULL, f->file_name); + if (iscsi_lun->url == NULL) { + log_err("iscsi: failed to parse url: %s\n", f->file_name); + ret = EINVAL; + goto out; + } + + iscsi_lun->iscsi = iscsi_create_context(initiator); + if (iscsi_lun->iscsi == NULL) { + log_err("iscsi: failed to create iscsi context.\n"); + ret = 1; + goto out; + } + + if (iscsi_set_targetname(iscsi_lun->iscsi, iscsi_lun->url->target)) { + log_err("iscsi: failed to set target name.\n"); + ret = EINVAL; + goto out; + } + + if (iscsi_set_session_type(iscsi_lun->iscsi, ISCSI_SESSION_NORMAL) != 0) { + log_err("iscsi: failed to set 
session type.\n"); + ret = EINVAL; + goto out; + } + + if (iscsi_set_header_digest(iscsi_lun->iscsi, + ISCSI_HEADER_DIGEST_NONE_CRC32C) != 0) { + log_err("iscsi: failed to set header digest.\n"); + ret = EINVAL; + goto out; + } + + if (iscsi_full_connect_sync(iscsi_lun->iscsi, + iscsi_lun->url->portal, + iscsi_lun->url->lun)) { + log_err("sicsi: failed to connect to LUN : %s\n", + iscsi_get_error(iscsi_lun->iscsi)); + ret = EINVAL; + goto out; + } + + task = iscsi_readcapacity16_sync(iscsi_lun->iscsi, iscsi_lun->url->lun); + if (task == NULL || task->status != SCSI_STATUS_GOOD) { + log_err("iscsi: failed to send readcapacity command: %s\n", + iscsi_get_error(iscsi_lun->iscsi)); + ret = EINVAL; + goto out; + } + + rc16 = scsi_datain_unmarshall(task); + if (rc16 == NULL) { + log_err("iscsi: failed to unmarshal readcapacity16 data.\n"); + ret = EINVAL; + goto out; + } + + iscsi_lun->block_size = rc16->block_length; + iscsi_lun->num_blocks = rc16->returned_lba + 1; + + scsi_free_scsi_task(task); + task = NULL; + + f->real_file_size = iscsi_lun->num_blocks * iscsi_lun->block_size; + f->engine_data = iscsi_lun; + + iscsi_info->luns[i] = iscsi_lun; + iscsi_info->pfds[i].fd = iscsi_get_fd(iscsi_lun->iscsi); + +out: + if (task) { + scsi_free_scsi_task(task); + } + + if (ret && iscsi_lun) { + if (iscsi_lun->iscsi != NULL) { + if (iscsi_is_logged_in(iscsi_lun->iscsi)) { + iscsi_logout_sync(iscsi_lun->iscsi); + } + iscsi_destroy_context(iscsi_lun->iscsi); + } + free(iscsi_lun); + } + + return ret; +} + +static int fio_iscsi_setup(struct thread_data *td) +{ + struct iscsi_options *options = td->eo; + struct iscsi_info *iscsi_info = NULL; + int ret = 0; + struct fio_file *f; + int i; + + iscsi_info = malloc(sizeof(struct iscsi_info)); + iscsi_info->nr_luns = td->o.nr_files; + iscsi_info->luns = calloc(iscsi_info->nr_luns, sizeof(struct iscsi_lun*)); + iscsi_info->pfds = calloc(iscsi_info->nr_luns, sizeof(struct pollfd)); + + iscsi_info->nr_events = 0; + iscsi_info->complete_events = calloc(td->o.iodepth, sizeof(struct iscsi_task*)); + + td->io_ops_data = iscsi_info; + + for_each_file(td, f, i) { + ret = fio_iscsi_setup_lun(iscsi_info, options->initiator, f, i); + if (ret < 0) break; + } + + return ret; +} + +static int fio_iscsi_init(struct thread_data *td) { + return 0; +} + +static void fio_iscsi_cleanup_lun(struct iscsi_lun *iscsi_lun) { + if (iscsi_lun->iscsi != NULL) { + if (iscsi_is_logged_in(iscsi_lun->iscsi)) { + iscsi_logout_sync(iscsi_lun->iscsi); + } + iscsi_destroy_context(iscsi_lun->iscsi); + } + free(iscsi_lun); +} + +static void fio_iscsi_cleanup(struct thread_data *td) +{ + struct iscsi_info *iscsi_info = td->io_ops_data; + + for (int i = 0; i < iscsi_info->nr_luns; i++) { + if (iscsi_info->luns[i]) { + fio_iscsi_cleanup_lun(iscsi_info->luns[i]); + iscsi_info->luns[i] = NULL; + } + } + + free(iscsi_info->luns); + free(iscsi_info->pfds); + free(iscsi_info->complete_events); + free(iscsi_info); +} + +static int fio_iscsi_prep(struct thread_data *td, struct io_u *io_u) +{ + return 0; +} + +static int fio_iscsi_open_file(struct thread_data *td, struct fio_file *f) +{ + return 0; +} + +static int fio_iscsi_close_file(struct thread_data *td, struct fio_file *f) +{ + return 0; +} + +static void iscsi_cb(struct iscsi_context *iscsi, int status, + void *command_data, void *private_data) +{ + struct iscsi_task *iscsi_task = (struct iscsi_task*)private_data; + struct iscsi_lun *iscsi_lun = iscsi_task->iscsi_lun; + struct iscsi_info *iscsi_info = iscsi_lun->iscsi_info; + struct io_u *io_u = 
iscsi_task->io_u; + + if (status == SCSI_STATUS_GOOD) { + io_u->error = 0; + } else { + log_err("iscsi: request failed with error %s.\n", + iscsi_get_error(iscsi_lun->iscsi)); + + io_u->error = 1; + io_u->resid = io_u->xfer_buflen; + } + + iscsi_info->complete_events[iscsi_info->nr_events] = iscsi_task; + iscsi_info->nr_events++; +} + +static enum fio_q_status fio_iscsi_queue(struct thread_data *td, + struct io_u *io_u) +{ + struct iscsi_lun *iscsi_lun = io_u->file->engine_data; + struct scsi_task *scsi_task = NULL; + struct iscsi_task *iscsi_task = malloc(sizeof(struct iscsi_task)); + int ret = -1; + + if (io_u->ddir == DDIR_READ || io_u->ddir == DDIR_WRITE) { + if (io_u->offset % iscsi_lun->block_size != 0) { + log_err("iscsi: offset is not align to block size.\n"); + ret = -1; + goto out; + } + + if (io_u->xfer_buflen % iscsi_lun->block_size != 0) { + log_err("iscsi: buflen is not align to block size.\n"); + ret = -1; + goto out; + } + } + + if (io_u->ddir == DDIR_READ) { + scsi_task = scsi_cdb_read16(io_u->offset / iscsi_lun->block_size, + io_u->xfer_buflen, + iscsi_lun->block_size, + 0, 0, 0, 0, 0); + ret = scsi_task_add_data_in_buffer(scsi_task, io_u->xfer_buflen, + io_u->xfer_buf); + if (ret < 0) { + log_err("iscsi: failed to add data in buffer.\n"); + goto out; + } + } else if (io_u->ddir == DDIR_WRITE) { + scsi_task = scsi_cdb_write16(io_u->offset / iscsi_lun->block_size, + io_u->xfer_buflen, + iscsi_lun->block_size, + 0, 0, 0, 0, 0); + ret = scsi_task_add_data_out_buffer(scsi_task, io_u->xfer_buflen, + io_u->xfer_buf); + if (ret < 0) { + log_err("iscsi: failed to add data out buffer.\n"); + goto out; + } + } else if (ddir_sync(io_u->ddir)) { + scsi_task = scsi_cdb_synchronizecache16( + 0, iscsi_lun->num_blocks * iscsi_lun->block_size, 0, 0); + } else { + log_err("iscsi: invalid I/O operation: %d\n", io_u->ddir); + ret = EINVAL; + goto out; + } + + iscsi_task->scsi_task = scsi_task; + iscsi_task->iscsi_lun = iscsi_lun; + iscsi_task->io_u = io_u; + + ret = iscsi_scsi_command_async(iscsi_lun->iscsi, iscsi_lun->url->lun, + scsi_task, iscsi_cb, NULL, iscsi_task); + if (ret < 0) { + log_err("iscsi: failed to send scsi command.\n"); + goto out; + } + + return FIO_Q_QUEUED; + +out: + if (iscsi_task) { + free(iscsi_task); + } + + if (scsi_task) { + scsi_free_scsi_task(scsi_task); + } + + if (ret) { + io_u->error = ret; + } + return FIO_Q_COMPLETED; +} + +static int fio_iscsi_getevents(struct thread_data *td, unsigned int min, + unsigned int max, const struct timespec *t) +{ + struct iscsi_info *iscsi_info = td->io_ops_data; + int ret = 0; + + iscsi_info->nr_events = 0; + + while (iscsi_info->nr_events < min) { + for (int i = 0; i < iscsi_info->nr_luns; i++) { + int events = iscsi_which_events(iscsi_info->luns[i]->iscsi); + iscsi_info->pfds[i].events = events; + } + + ret = poll(iscsi_info->pfds, iscsi_info->nr_luns, -1); + if (ret < 0) { + if (errno == EINTR || errno == EAGAIN) { + continue; + } + log_err("iscsi: failed to poll events: %s.\n", + strerror(errno)); + break; + } + + for (int i = 0; i < iscsi_info->nr_luns; i++) { + ret = iscsi_service(iscsi_info->luns[i]->iscsi, + iscsi_info->pfds[i].revents); + assert(ret >= 0); + } + } + + return ret < 0 ? 
ret : iscsi_info->nr_events; +} + +static struct io_u *fio_iscsi_event(struct thread_data *td, int event) +{ + struct iscsi_info *iscsi_info = (struct iscsi_info*)td->io_ops_data; + struct iscsi_task *iscsi_task = iscsi_info->complete_events[event]; + struct io_u *io_u = iscsi_task->io_u; + + iscsi_info->complete_events[event] = NULL; + + scsi_free_scsi_task(iscsi_task->scsi_task); + free(iscsi_task); + + return io_u; +} + +static struct ioengine_ops ioengine_iscsi = { + .name = "libiscsi", + .version = FIO_IOOPS_VERSION, + .flags = FIO_SYNCIO | FIO_DISKLESSIO | FIO_NODISKUTIL, + .setup = fio_iscsi_setup, + .init = fio_iscsi_init, + .prep = fio_iscsi_prep, + .queue = fio_iscsi_queue, + .getevents = fio_iscsi_getevents, + .event = fio_iscsi_event, + .cleanup = fio_iscsi_cleanup, + .open_file = fio_iscsi_open_file, + .close_file = fio_iscsi_close_file, + .option_struct_size = sizeof(struct iscsi_options), + .options = options, +}; + +static void fio_init fio_iscsi_register(void) +{ + register_ioengine(&ioengine_iscsi); +} + +static void fio_exit fio_iscsi_unregister(void) +{ + unregister_ioengine(&ioengine_iscsi); +} diff --git a/engines/libpmem.c b/engines/libpmem.c new file mode 100644 index 0000000..99c7b50 --- /dev/null +++ b/engines/libpmem.c @@ -0,0 +1,592 @@ +/* + * libpmem: IO engine that uses PMDK libpmem to read and write data + * + * Copyright (C) 2017 Nippon Telegraph and Telephone Corporation. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License, + * version 2 as published by the Free Software Foundation.. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + */ + +/* + * libpmem engine + * + * IO engine that uses libpmem to read and write data + * + * To use: + * ioengine=libpmem + * + * Other relevant settings: + * iodepth=1 + * direct=1 + * directory=/mnt/pmem0/ + * bs=4k + * + * direct=1 means that pmem_drain() is executed for each write operation. + * In contrast, direct=0 means that pmem_drain() is not executed. + * + * The pmem device must have a DAX-capable filesystem and be mounted + * with DAX enabled. directory must point to a mount point of DAX FS. + * + * Example: + * mkfs.xfs /dev/pmem0 + * mkdir /mnt/pmem0 + * mount -o dax /dev/pmem0 /mnt/pmem0 + * + * + * See examples/libpmem.fio for more. + * + * + * libpmem.so + * By default, the libpmem engine will let the system find the libpmem.so + * that it uses. You can use an alternative libpmem by setting the + * FIO_PMEM_LIB environment variable to the full path to the desired + * libpmem.so. 
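+ *
+ * e.g. (illustrative; the path is hypothetical):
+ *
+ *   FIO_PMEM_LIB=/usr/local/lib/libpmem.so fio examples/libpmem.fio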
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include <stdbool.h>
+#include <errno.h>
+#include <assert.h>
+#include <unistd.h>
+#include <sys/mman.h>
+#include <sys/stat.h>
+#include <libpmem.h>
+
+#include "../fio.h"
+#include "../verify.h"
+
+/*
+ * Limits us to 1GiB of mapped files in total to model after
+ * libpmem engine behavior
+ */
+#define MMAP_TOTAL_SZ (1 * 1024 * 1024 * 1024UL)
+
+struct fio_libpmem_data {
+	void *libpmem_ptr;
+	size_t libpmem_sz;
+	off_t libpmem_off;
+};
+
+#define MEGABYTE ((uintptr_t)1 << 20)
+#define GIGABYTE ((uintptr_t)1 << 30)
+#define PROCMAXLEN 2048 /* maximum expected line length in /proc files */
+#define roundup(x, y) ((((x) + ((y) - 1)) / (y)) * (y))
+
+static bool Mmap_no_random;
+static void *Mmap_hint;
+static unsigned long long Mmap_align;
+
+/*
+ * util_map_hint_align -- choose the desired mapping alignment
+ *
+ * Use 2MB/1GB page alignment only if the mapping length is at least
+ * twice as big as the page size.
+ */
+static inline size_t util_map_hint_align(size_t len, size_t req_align)
+{
+	size_t align = Mmap_align;
+
+	dprint(FD_IO, "DEBUG util_map_hint_align\n");
+
+	if (req_align)
+		align = req_align;
+	else if (len >= 2 * GIGABYTE)
+		align = GIGABYTE;
+	else if (len >= 4 * MEGABYTE)
+		align = 2 * MEGABYTE;
+
+	dprint(FD_IO, "align=%d\n", (int)align);
+	return align;
+}
+
+#ifdef __FreeBSD__
+static const char *sscanf_os = "%p %p";
+#define MAP_NORESERVE 0
+#define OS_MAPFILE "/proc/curproc/map"
+#else
+static const char *sscanf_os = "%p-%p";
+#define OS_MAPFILE "/proc/self/maps"
+#endif
+
+/*
+ * util_map_hint_unused -- use /proc to determine a hint address for mmap()
+ *
+ * This is a helper function for util_map_hint().
+ * It opens up /proc/self/maps and looks for the first unused address
+ * in the process address space that is:
+ * - greater than or equal to the 'minaddr' argument,
+ * - large enough to hold a range of the given length,
+ * - aligned to the specified unit.
+ *
+ * Asking for an aligned address like this will allow the DAX code to use
+ * large mappings. It is not an error if mmap() ignores the hint and chooses
+ * a different address.
+ */
+static char *util_map_hint_unused(void *minaddr, size_t len, size_t align)
+{
+	char *lo = NULL;	/* beginning of current range in maps file */
+	char *hi = NULL;	/* end of current range in maps file */
+	char *raddr = minaddr;	/* ignore regions below 'minaddr' */
+
+#ifdef WIN32
+	MEMORY_BASIC_INFORMATION mi;
+#else
+	FILE *fp;
+	char line[PROCMAXLEN];	/* for fgets() */
+#endif
+
+	dprint(FD_IO, "DEBUG util_map_hint_unused\n");
+	assert(align > 0);
+
+	if (raddr == NULL)
+		raddr += page_size;
+
+	raddr = (char *)roundup((uintptr_t)raddr, align);
+
+#ifdef WIN32
+	while ((uintptr_t)raddr < UINTPTR_MAX - len) {
+		size_t ret = VirtualQuery(raddr, &mi, sizeof(mi));
+		if (ret == 0) {
+			ERR("VirtualQuery %p", raddr);
+			return MAP_FAILED;
+		}
+		dprint(FD_IO, "addr %p len %zu state %d",
+			mi.BaseAddress, mi.RegionSize, mi.State);
+
+		if ((mi.State != MEM_FREE) || (mi.RegionSize < len)) {
+			raddr = (char *)mi.BaseAddress + mi.RegionSize;
+			raddr = (char *)roundup((uintptr_t)raddr, align);
+			dprint(FD_IO, "nearest aligned addr %p", raddr);
+		} else {
+			dprint(FD_IO, "unused region of size %zu found at %p",
+				mi.RegionSize, mi.BaseAddress);
+			return mi.BaseAddress;
+		}
+	}
+
+	dprint(FD_IO, "end of address space reached");
+	return MAP_FAILED;
+#else
+	fp = fopen(OS_MAPFILE, "r");
+	if (!fp) {
+		log_err("!%s\n", OS_MAPFILE);
+		return MAP_FAILED;
+	}
+
+	while (fgets(line, PROCMAXLEN, fp) != NULL) {
+		/* check for range line */
+		if (sscanf(line, sscanf_os, &lo, &hi) == 2) {
+			dprint(FD_IO, "%p-%p\n", lo, hi);
+			if (lo > raddr) {
+				if ((uintptr_t)(lo - raddr) >= len) {
+					dprint(FD_IO, "unused region of size "
+							"%zu found at %p\n",
+							lo - raddr, raddr);
+					break;
+				} else {
+					dprint(FD_IO, "region is too small: "
+							"%zu < %zu\n",
+							lo - raddr, len);
+				}
+			}
+
+			if (hi > raddr) {
+				raddr = (char *)roundup((uintptr_t)hi, align);
+				dprint(FD_IO, "nearest aligned addr %p\n",
+						raddr);
+			}
+
+			if (raddr == 0) {
+				dprint(FD_IO, "end of address space reached\n");
+				break;
+			}
+		}
+	}
+
+	/*
+	 * Check for the case when this is the last unused range in the
+	 * address space, but it is not large enough. (very unlikely)
+	 */
+	if ((raddr != NULL) && (UINTPTR_MAX - (uintptr_t)raddr < len)) {
+		dprint(FD_IO, "end of address space reached");
+		raddr = MAP_FAILED;
+	}
+
+	fclose(fp);
+
+	dprint(FD_IO, "returning %p", raddr);
+	return raddr;
+#endif
+}
+
+/*
+ * util_map_hint -- determine hint address for mmap()
+ *
+ * If the PMEM_MMAP_HINT environment variable is not set, we let the system
+ * pick the randomized mapping address. Otherwise, a user-defined hint
+ * address is used.
+ *
+ * Windows Environment:
+ * XXX - Windows doesn't support large DAX pages yet, so there is
+ * no point in aligning for the same.
+ *
+ * Except for Windows Environment:
+ * ASLR in the 64-bit Linux kernel uses 28 bits of randomness for mmap
+ * (bit positions 12-39), which means the base mapping address is randomized
+ * within the [0..1024GB] range, with 4KB granularity. Assuming an additional
+ * 1GB alignment, this results in 1024 possible locations.
+ *
+ * Configuring the hint address via the PMEM_MMAP_HINT environment variable
+ * disables address randomization. In such a case, the function will search
+ * for the first unused, properly aligned region of the given size, above
+ * the specified address.
+ */
+static char *util_map_hint(size_t len, size_t req_align)
+{
+	char *addr;
+	size_t align = 0;
+	char *e = NULL;
+
+	dprint(FD_IO, "DEBUG util_map_hint\n");
+	dprint(FD_IO, "len %zu req_align %zu\n", len, req_align);
+
+	/* choose the desired alignment based on the requested length */
+	align = util_map_hint_align(len, req_align);
+
+	e = getenv("PMEM_MMAP_HINT");
+	if (e) {
+		char *endp;
+		unsigned long long val = 0;
+
+		errno = 0;
+
+		val = strtoull(e, &endp, 16);
+		if (errno || endp == e) {
+			dprint(FD_IO, "Invalid PMEM_MMAP_HINT\n");
+		} else {
+			Mmap_hint = (void *)val;
+			Mmap_no_random = true;
+			dprint(FD_IO, "PMEM_MMAP_HINT set to %p\n", Mmap_hint);
+		}
+	}
+
+	if (Mmap_no_random) {
+		dprint(FD_IO, "user-defined hint %p\n", (void *)Mmap_hint);
+		addr = util_map_hint_unused((void *)Mmap_hint, len, align);
+	} else {
+		/*
+		 * Create a dummy mapping to find an unused region of the
+		 * given size. Request an increased size for later address
+		 * alignment.
+		 *
+		 * Windows Environment:
+		 * Use the MAP_NORESERVE flag to only reserve the range of
+		 * pages rather than commit. We don't want the pages to be
+		 * actually backed by the operating system paging file, as
+		 * the swap file is usually too small to handle terabyte
+		 * pools.
+		 *
+		 * Except for Windows Environment:
+		 * Use MAP_PRIVATE with read-only access to simulate
+		 * zero cost for overcommit accounting. Note: the
+		 * MAP_NORESERVE flag is ignored if overcommit is disabled
+		 * (mode 2).
+		 */
+#ifndef WIN32
+		addr = mmap(NULL, len + align, PROT_READ,
+				MAP_PRIVATE|MAP_ANONYMOUS, -1, 0);
+#else
+		addr = mmap(NULL, len + align, PROT_READ,
+				MAP_PRIVATE|MAP_ANONYMOUS|MAP_NORESERVE, -1, 0);
+#endif
+		if (addr != MAP_FAILED) {
+			dprint(FD_IO, "system choice %p\n", addr);
+			munmap(addr, len + align);
+			addr = (char *)roundup((uintptr_t)addr, align);
+		}
+	}
+
+	dprint(FD_IO, "hint %p\n", addr);
+
+	return addr;
+}
+
+/*
+ * This is the mmap execution function
+ */
+static int fio_libpmem_file(struct thread_data *td, struct fio_file *f,
+			    size_t length, off_t off)
+{
+	struct fio_libpmem_data *fdd = FILE_ENG_DATA(f);
+	int flags = 0;
+	void *addr = NULL;
+
+	dprint(FD_IO, "DEBUG fio_libpmem_file\n");
+
+	if (td_rw(td))
+		flags = PROT_READ | PROT_WRITE;
+	else if (td_write(td)) {
+		flags = PROT_WRITE;
+
+		if (td->o.verify != VERIFY_NONE)
+			flags |= PROT_READ;
+	} else
+		flags = PROT_READ;
+
+	dprint(FD_IO, "f->file_name = %s td->o.verify = %d\n", f->file_name,
+		td->o.verify);
+	dprint(FD_IO, "length = %ld flags = %d f->fd = %d off = %ld\n",
+		length, flags, f->fd, off);
+
+	addr = util_map_hint(length, 0);
+
+	fdd->libpmem_ptr = mmap(addr, length, flags, MAP_SHARED, f->fd, off);
+	if (fdd->libpmem_ptr == MAP_FAILED) {
+		fdd->libpmem_ptr = NULL;
+		td_verror(td, errno, "mmap");
+	}
+
+	if (td->error && fdd->libpmem_ptr)
+		munmap(fdd->libpmem_ptr, length);
+
+	return td->error;
+}
+
+/*
+ * XXX Just mmap an appropriate portion, we cannot mmap the full extent
+ */
+static int fio_libpmem_prep_limited(struct thread_data *td, struct io_u *io_u)
+{
+	struct fio_file *f = io_u->file;
+	struct fio_libpmem_data *fdd = FILE_ENG_DATA(f);
+
+	dprint(FD_IO, "DEBUG fio_libpmem_prep_limited\n");
+
+	if (io_u->buflen > f->real_file_size) {
+		log_err("libpmem: bs too big for libpmem engine\n");
+		return EIO;
+	}
+
+	fdd->libpmem_sz = min(MMAP_TOTAL_SZ, f->real_file_size);
+	if (fdd->libpmem_sz > f->io_size)
+		fdd->libpmem_sz = f->io_size;
+
+	fdd->libpmem_off = io_u->offset;
+
+	return fio_libpmem_file(td, f, fdd->libpmem_sz, fdd->libpmem_off);
+}
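
util_map_hint() above uses a dummy anonymous mapping to discover a free address range, immediately unmaps it, and rounds the address up to the chosen alignment; only the address survives. A standalone Linux sketch of that trick, under the assumption of a fixed 2MB alignment (sizes hypothetical):

#include <stdio.h>
#include <stdint.h>
#include <sys/mman.h>

#define ALIGN_2M (2UL * 1024 * 1024)
#define ROUNDUP(x, y) ((((x) + ((y) - 1)) / (y)) * (y))

int main(void)
{
	size_t len = 16UL * 1024 * 1024;

	/* Reserve len + align so an aligned start always fits inside. */
	void *probe = mmap(NULL, len + ALIGN_2M, PROT_READ,
			   MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	if (probe == MAP_FAILED) {
		perror("mmap");
		return 1;
	}

	/* Release it immediately; only the address is interesting. */
	munmap(probe, len + ALIGN_2M);

	void *hint = (void *)ROUNDUP((uintptr_t)probe, ALIGN_2M);
	printf("probe=%p hint=%p\n", probe, hint);
	return 0;
}
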
+/*
+ * Attempt to mmap the entire file
+ */
+static int fio_libpmem_prep_full(struct thread_data *td, struct io_u *io_u)
+{
+	struct fio_file *f = io_u->file;
+	struct fio_libpmem_data *fdd = FILE_ENG_DATA(f);
+	int ret;
+
+	dprint(FD_IO, "DEBUG fio_libpmem_prep_full\n");
+
+	if (fio_file_partial_mmap(f))
+		return EINVAL;
+
+	dprint(FD_IO, " f->io_size %ld : io_u->offset %lld\n",
+		f->io_size, io_u->offset);
+
+	if (io_u->offset != (size_t) io_u->offset ||
+	    f->io_size != (size_t) f->io_size) {
+		fio_file_set_partial_mmap(f);
+		return EINVAL;
+	}
+	fdd->libpmem_sz = f->io_size;
+	fdd->libpmem_off = 0;
+
+	ret = fio_libpmem_file(td, f, fdd->libpmem_sz, fdd->libpmem_off);
+	if (ret)
+		fio_file_set_partial_mmap(f);
+
+	return ret;
+}
+
+static int fio_libpmem_prep(struct thread_data *td, struct io_u *io_u)
+{
+	struct fio_file *f = io_u->file;
+	struct fio_libpmem_data *fdd = FILE_ENG_DATA(f);
+	int ret;
+
+	dprint(FD_IO, "DEBUG fio_libpmem_prep\n");
+	/*
+	 * If it fits within the existing mapping, use it
+	 */
+	dprint(FD_IO, " io_u->offset %llu : fdd->libpmem_off %llu : "
+		"io_u->buflen %llu : fdd->libpmem_sz %llu\n",
+		io_u->offset, (unsigned long long) fdd->libpmem_off,
+		io_u->buflen, (unsigned long long) fdd->libpmem_sz);
+
+	if (io_u->offset >= fdd->libpmem_off &&
+	    (io_u->offset + io_u->buflen <=
+	     fdd->libpmem_off + fdd->libpmem_sz))
+		goto done;
+
+	/*
+	 * unmap any existing mapping
+	 */
+	if (fdd->libpmem_ptr) {
+		dprint(FD_IO, "munmap\n");
+		if (munmap(fdd->libpmem_ptr, fdd->libpmem_sz) < 0)
+			return errno;
+		fdd->libpmem_ptr = NULL;
+	}
+
+	if (fio_libpmem_prep_full(td, io_u)) {
+		td_clear_error(td);
+		ret = fio_libpmem_prep_limited(td, io_u);
+		if (ret)
+			return ret;
+	}
+
+done:
+	io_u->mmap_data = fdd->libpmem_ptr + io_u->offset - fdd->libpmem_off
+				- f->file_offset;
+	return 0;
+}
+
+static enum fio_q_status fio_libpmem_queue(struct thread_data *td,
+					   struct io_u *io_u)
+{
+	fio_ro_check(td, io_u);
+	io_u->error = 0;
+
+	dprint(FD_IO, "DEBUG fio_libpmem_queue\n");
+
+	switch (io_u->ddir) {
+	case DDIR_READ:
+		memcpy(io_u->xfer_buf, io_u->mmap_data, io_u->xfer_buflen);
+		break;
+	case DDIR_WRITE:
+		dprint(FD_IO, "DEBUG mmap_data=%p, xfer_buf=%p\n",
+			io_u->mmap_data, io_u->xfer_buf);
+		dprint(FD_IO, "td->o.odirect %d\n", td->o.odirect);
+		if (td->o.odirect) {
+			pmem_memcpy_persist(io_u->mmap_data,
+					io_u->xfer_buf,
+					io_u->xfer_buflen);
+		} else {
+			pmem_memcpy_nodrain(io_u->mmap_data,
+					io_u->xfer_buf,
+					io_u->xfer_buflen);
+		}
+		break;
+	case DDIR_SYNC:
+	case DDIR_DATASYNC:
+	case DDIR_SYNC_FILE_RANGE:
+		break;
+	default:
+		io_u->error = EINVAL;
+		break;
+	}
+
+	return FIO_Q_COMPLETED;
+}
+
+static int fio_libpmem_init(struct thread_data *td)
+{
+	struct thread_options *o = &td->o;
+
+	dprint(FD_IO, "o->rw_min_bs %llu, o->fsync_blocks %d, o->fdatasync_blocks %d\n",
+		o->rw_min_bs, o->fsync_blocks, o->fdatasync_blocks);
+	dprint(FD_IO, "DEBUG fio_libpmem_init\n");
+
+	if ((o->rw_min_bs & page_mask) &&
+	    (o->fsync_blocks || o->fdatasync_blocks)) {
+		log_err("libpmem: mmap options dictate a minimum block size of "
+			"%llu bytes\n", (unsigned long long) page_size);
+		return 1;
+	}
+	return 0;
+}
+static int fio_libpmem_open_file(struct thread_data *td, struct fio_file *f)
+{
+	struct fio_libpmem_data *fdd;
+	int ret;
+
+	dprint(FD_IO, "DEBUG fio_libpmem_open_file\n");
+	dprint(FD_IO, "f->io_size=%ld\n", f->io_size);
+	dprint(FD_IO, "td->o.size=%lld\n", td->o.size);
+	dprint(FD_IO, "td->o.iodepth=%d\n", td->o.iodepth);
+	dprint(FD_IO, "td->o.iodepth_batch=%d\n", td->o.iodepth_batch);
+
+	ret = generic_open_file(td, f);
+	if (ret)
+		return ret;
+
+	fdd = calloc(1, sizeof(*fdd));
+	if (!fdd) {
+		int fio_unused __ret;
+		__ret = generic_close_file(td, f);
+		return 1;
+	}
+
+	FILE_SET_ENG_DATA(f, fdd);
+
+	return 0;
+}
+
+static int fio_libpmem_close_file(struct thread_data *td, struct fio_file *f)
+{
+	struct fio_libpmem_data *fdd = FILE_ENG_DATA(f);
+
+	dprint(FD_IO, "DEBUG fio_libpmem_close_file\n");
+	dprint(FD_IO, "td->o.odirect %d\n", td->o.odirect);
+
+	if (!td->o.odirect) {
+		dprint(FD_IO, "pmem_drain\n");
+		pmem_drain();
+	}
+
+	FILE_SET_ENG_DATA(f, NULL);
+	free(fdd);
+	fio_file_clear_partial_mmap(f);
+
+	return generic_close_file(td, f);
+}
+
+static struct ioengine_ops ioengine = {
+	.name = "libpmem",
+	.version = FIO_IOOPS_VERSION,
+	.init = fio_libpmem_init,
+	.prep = fio_libpmem_prep,
+	.queue = fio_libpmem_queue,
+	.open_file = fio_libpmem_open_file,
+	.close_file = fio_libpmem_close_file,
+	.get_file_size = generic_get_file_size,
+	.flags = FIO_SYNCIO | FIO_NOEXTEND,
+};
+
+static void fio_init fio_libpmem_register(void)
+{
+#ifndef WIN32
+	Mmap_align = page_size;
+#else
+	if (Mmap_align == 0) {
+		SYSTEM_INFO si;
+
+		GetSystemInfo(&si);
+		Mmap_align = si.dwAllocationGranularity;
+	}
+#endif
+
+	register_ioengine(&ioengine);
+}
+
+static void fio_exit fio_libpmem_unregister(void)
+{
+	unregister_ioengine(&ioengine);
+}
diff --git a/engines/mmap.c b/engines/mmap.c
new file mode 100644
index 0000000..55ba1ab
--- /dev/null
+++ b/engines/mmap.c
@@ -0,0 +1,338 @@
+/*
+ * mmap engine
+ *
+ * IO engine that reads/writes from files by doing memcpy to/from
+ * a memory mapped region of the file.
+ *
+ */
+#include <stdio.h>
+#include <stdlib.h>
+#include <errno.h>
+#include <sys/mman.h>
+
+#include "../fio.h"
+#include "../optgroup.h"
+#include "../verify.h"
+
+/*
+ * Limits us to 1GiB of mapped files in total
+ */
+#define MMAP_TOTAL_SZ (1 * 1024 * 1024 * 1024UL)
+
+static unsigned long mmap_map_size;
+
+struct fio_mmap_data {
+	void *mmap_ptr;
+	size_t mmap_sz;
+	off_t mmap_off;
+};
+
+#ifdef CONFIG_HAVE_THP
+struct mmap_options {
+	void *pad;
+	unsigned int thp;
+};
+
+static struct fio_option options[] = {
+	{
+		.name = "thp",
+		.lname = "Transparent Huge Pages",
+		.type = FIO_OPT_INT,
+		.off1 = offsetof(struct mmap_options, thp),
+		.help = "Memory Advise Huge Page",
+		.category = FIO_OPT_C_ENGINE,
+		.group = FIO_OPT_G_MMAP,
+	},
+	{
+		.name = NULL,
+	},
+};
+#endif
+
+static bool fio_madvise_file(struct thread_data *td, struct fio_file *f,
+			     size_t length)
+{
+	struct fio_mmap_data *fmd = FILE_ENG_DATA(f);
+#ifdef CONFIG_HAVE_THP
+	struct mmap_options *o = td->eo;
+
+	/* Ignore errors on this optional advisory */
+	if (o->thp)
+		madvise(fmd->mmap_ptr, length, MADV_HUGEPAGE);
+#endif
+
+	if (!td->o.fadvise_hint)
+		return true;
+
+	if (!td_random(td)) {
+		if (posix_madvise(fmd->mmap_ptr, length, POSIX_MADV_SEQUENTIAL) < 0) {
+			td_verror(td, errno, "madvise");
+			return false;
+		}
+	} else {
+		if (posix_madvise(fmd->mmap_ptr, length, POSIX_MADV_RANDOM) < 0) {
+			td_verror(td, errno, "madvise");
+			return false;
+		}
+	}
+
+	return true;
+}
+
+#ifdef CONFIG_HAVE_THP
+static int fio_mmap_get_shared(struct thread_data *td)
+{
+	struct mmap_options *o = td->eo;
+
+	if (o->thp)
+		return MAP_PRIVATE;
+	return MAP_SHARED;
+}
+#else
+static int fio_mmap_get_shared(struct thread_data *td)
+{
+	return MAP_SHARED;
+}
+#endif
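
The queue path of this engine (fio_mmapio_queue() below) turns reads and writes into plain memcpy() calls against the mapping, with msync() standing in for a sync. A standalone sketch of that access pattern, assuming a small scratch file at a hypothetical path:

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>
#include <unistd.h>

int main(void)
{
	const char *path = "/tmp/mmap-demo";	/* hypothetical test file */
	size_t len = 4096;
	int fd = open(path, O_RDWR | O_CREAT, 0644);

	if (fd < 0 || ftruncate(fd, len) < 0) {
		perror("open/ftruncate");
		return 1;
	}

	char *map = mmap(NULL, len, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
	if (map == MAP_FAILED) {
		perror("mmap");
		return 1;
	}

	/* "write": copy the I/O buffer into the mapping */
	memcpy(map, "hello", 5);
	/* "sync": flush dirty pages, as the engine does for sync ddirs */
	msync(map, len, MS_SYNC);

	/* "read": copy back out of the mapping */
	char buf[6] = { 0 };
	memcpy(buf, map, 5);
	printf("%s\n", buf);

	munmap(map, len);
	close(fd);
	return 0;
}
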
+static int fio_mmap_file(struct thread_data *td, struct fio_file *f,
+			 size_t length, off_t off)
+{
+	struct fio_mmap_data *fmd = FILE_ENG_DATA(f);
+	int flags = 0, shared = fio_mmap_get_shared(td);
+
+	if (td_rw(td) && !td->o.verify_only)
+		flags = PROT_READ | PROT_WRITE;
+	else if (td_write(td) && !td->o.verify_only) {
+		flags = PROT_WRITE;
+
+		if (td->o.verify != VERIFY_NONE)
+			flags |= PROT_READ;
+	} else
+		flags = PROT_READ;
+
+	fmd->mmap_ptr = mmap(NULL, length, flags, shared, f->fd, off);
+	if (fmd->mmap_ptr == MAP_FAILED) {
+		fmd->mmap_ptr = NULL;
+		td_verror(td, errno, "mmap");
+		goto err;
+	}
+
+	if (!fio_madvise_file(td, f, length))
+		goto err;
+
+	if (posix_madvise(fmd->mmap_ptr, length, POSIX_MADV_DONTNEED) < 0) {
+		td_verror(td, errno, "madvise");
+		goto err;
+	}
+
+#ifdef FIO_MADV_FREE
+	if (f->filetype == FIO_TYPE_BLOCK)
+		(void) posix_madvise(fmd->mmap_ptr, fmd->mmap_sz, FIO_MADV_FREE);
+#endif
+
+err:
+	if (td->error && fmd->mmap_ptr)
+		munmap(fmd->mmap_ptr, length);
+
+	return td->error;
+}
+
+/*
+ * Just mmap an appropriate portion, we cannot mmap the full extent
+ */
+static int fio_mmapio_prep_limited(struct thread_data *td, struct io_u *io_u)
+{
+	struct fio_file *f = io_u->file;
+	struct fio_mmap_data *fmd = FILE_ENG_DATA(f);
+
+	if (io_u->buflen > mmap_map_size) {
+		log_err("fio: bs too big for mmap engine\n");
+		return EIO;
+	}
+
+	fmd->mmap_sz = mmap_map_size;
+	if (fmd->mmap_sz > f->io_size)
+		fmd->mmap_sz = f->io_size;
+
+	fmd->mmap_off = io_u->offset;
+
+	return fio_mmap_file(td, f, fmd->mmap_sz, fmd->mmap_off);
+}
+
+/*
+ * Attempt to mmap the entire file
+ */
+static int fio_mmapio_prep_full(struct thread_data *td, struct io_u *io_u)
+{
+	struct fio_file *f = io_u->file;
+	struct fio_mmap_data *fmd = FILE_ENG_DATA(f);
+	int ret;
+
+	if (fio_file_partial_mmap(f))
+		return EINVAL;
+	if (io_u->offset != (size_t) io_u->offset ||
+	    f->io_size != (size_t) f->io_size) {
+		fio_file_set_partial_mmap(f);
+		return EINVAL;
+	}
+
+	fmd->mmap_sz = f->io_size;
+	fmd->mmap_off = 0;
+
+	ret = fio_mmap_file(td, f, fmd->mmap_sz, fmd->mmap_off);
+	if (ret)
+		fio_file_set_partial_mmap(f);
+
+	return ret;
+}
+
+static int fio_mmapio_prep(struct thread_data *td, struct io_u *io_u)
+{
+	struct fio_file *f = io_u->file;
+	struct fio_mmap_data *fmd = FILE_ENG_DATA(f);
+	int ret;
+
+	/*
+	 * If it fits within the existing mapping, use it
+	 */
+	if (io_u->offset >= fmd->mmap_off &&
+	    io_u->offset + io_u->buflen <= fmd->mmap_off + fmd->mmap_sz)
+		goto done;
+
+	/*
+	 * unmap any existing mapping
+	 */
+	if (fmd->mmap_ptr) {
+		if (munmap(fmd->mmap_ptr, fmd->mmap_sz) < 0)
+			return errno;
+		fmd->mmap_ptr = NULL;
+	}
+
+	if (fio_mmapio_prep_full(td, io_u)) {
+		td_clear_error(td);
+		ret = fio_mmapio_prep_limited(td, io_u);
+		if (ret)
+			return ret;
+	}
+
+done:
+	io_u->mmap_data = fmd->mmap_ptr + io_u->offset - fmd->mmap_off -
+				f->file_offset;
+	return 0;
+}
+
+static enum fio_q_status fio_mmapio_queue(struct thread_data *td,
+					  struct io_u *io_u)
+{
+	struct fio_file *f = io_u->file;
+	struct fio_mmap_data *fmd = FILE_ENG_DATA(f);
+
+	fio_ro_check(td, io_u);
+
+	if (io_u->ddir == DDIR_READ)
+		memcpy(io_u->xfer_buf, io_u->mmap_data, io_u->xfer_buflen);
+	else if (io_u->ddir == DDIR_WRITE)
+		memcpy(io_u->mmap_data, io_u->xfer_buf, io_u->xfer_buflen);
+	else if (ddir_sync(io_u->ddir)) {
+		if (msync(fmd->mmap_ptr, fmd->mmap_sz, MS_SYNC)) {
+			io_u->error = errno;
+			td_verror(td, io_u->error, "msync");
+		}
+	} else if (io_u->ddir == DDIR_TRIM) {
+		int ret = do_io_u_trim(td, io_u);
+
+		if (!ret)
+			td_verror(td, io_u->error, "trim");
+	}
+
+	/*
+	 * not really direct, but should drop the pages from the cache
+	 */
+	if (td->o.odirect && ddir_rw(io_u->ddir)) {
+		if (msync(io_u->mmap_data, io_u->xfer_buflen,
MS_SYNC) < 0) { + io_u->error = errno; + td_verror(td, io_u->error, "msync"); + } + if (posix_madvise(io_u->mmap_data, io_u->xfer_buflen, POSIX_MADV_DONTNEED) < 0) { + io_u->error = errno; + td_verror(td, io_u->error, "madvise"); + } + } + + return FIO_Q_COMPLETED; +} + +static int fio_mmapio_init(struct thread_data *td) +{ + struct thread_options *o = &td->o; + + if ((o->rw_min_bs & page_mask) && + (o->odirect || o->fsync_blocks || o->fdatasync_blocks)) { + log_err("fio: mmap options dictate a minimum block size of " + "%llu bytes\n", (unsigned long long) page_size); + return 1; + } + + mmap_map_size = MMAP_TOTAL_SZ / o->nr_files; + return 0; +} + +static int fio_mmapio_open_file(struct thread_data *td, struct fio_file *f) +{ + struct fio_mmap_data *fmd; + int ret; + + ret = generic_open_file(td, f); + if (ret) + return ret; + + fmd = calloc(1, sizeof(*fmd)); + if (!fmd) { + int fio_unused __ret; + __ret = generic_close_file(td, f); + return 1; + } + + FILE_SET_ENG_DATA(f, fmd); + return 0; +} + +static int fio_mmapio_close_file(struct thread_data *td, struct fio_file *f) +{ + struct fio_mmap_data *fmd = FILE_ENG_DATA(f); + + FILE_SET_ENG_DATA(f, NULL); + free(fmd); + fio_file_clear_partial_mmap(f); + + return generic_close_file(td, f); +} + +static struct ioengine_ops ioengine = { + .name = "mmap", + .version = FIO_IOOPS_VERSION, + .init = fio_mmapio_init, + .prep = fio_mmapio_prep, + .queue = fio_mmapio_queue, + .open_file = fio_mmapio_open_file, + .close_file = fio_mmapio_close_file, + .get_file_size = generic_get_file_size, + .flags = FIO_SYNCIO | FIO_NOEXTEND, +#ifdef CONFIG_HAVE_THP + .options = options, + .option_struct_size = sizeof(struct mmap_options), +#endif +}; + +static void fio_init fio_mmapio_register(void) +{ + register_ioengine(&ioengine); +} + +static void fio_exit fio_mmapio_unregister(void) +{ + unregister_ioengine(&ioengine); +} diff --git a/engines/mtd.c b/engines/mtd.c new file mode 100644 index 0000000..b9f4316 --- /dev/null +++ b/engines/mtd.c @@ -0,0 +1,233 @@ +/* + * MTD engine + * + * IO engine that reads/writes from MTD character devices. 
+ *
+ */
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <sys/ioctl.h>
+#include <mtd/mtd-user.h>
+
+#include "../fio.h"
+#include "../optgroup.h"
+#include "../oslib/libmtd.h"
+
+static libmtd_t desc;
+
+struct fio_mtd_data {
+	struct mtd_dev_info info;
+};
+
+struct fio_mtd_options {
+	void *pad; /* avoid off1 == 0 */
+	unsigned int skip_bad;
+};
+
+static struct fio_option options[] = {
+	{
+		.name = "skip_bad",
+		.lname = "Skip operations against bad blocks",
+		.type = FIO_OPT_BOOL,
+		.off1 = offsetof(struct fio_mtd_options, skip_bad),
+		.help = "Skip operations against known bad blocks.",
+		.hide = 1,
+		.def = "0",
+		.category = FIO_OPT_C_ENGINE,
+		.group = FIO_OPT_G_MTD,
+	},
+	{
+		.name = NULL,
+	},
+};
+
+static int fio_mtd_maybe_mark_bad(struct thread_data *td,
+				  struct fio_mtd_data *fmd,
+				  struct io_u *io_u, int eb)
+{
+	int ret;
+
+	if (errno == EIO) {
+		ret = mtd_mark_bad(&fmd->info, io_u->file->fd, eb);
+		if (ret != 0) {
+			io_u->error = errno;
+			td_verror(td, errno, "mtd_mark_bad");
+			return -1;
+		}
+	}
+	return 0;
+}
+
+static int fio_mtd_is_bad(struct thread_data *td,
+			  struct fio_mtd_data *fmd,
+			  struct io_u *io_u, int eb)
+{
+	int ret = mtd_is_bad(&fmd->info, io_u->file->fd, eb);
+
+	if (ret == -1) {
+		io_u->error = errno;
+		td_verror(td, errno, "mtd_is_bad");
+	} else if (ret == 1)
+		io_u->error = EIO;	/* Silent failure--don't flood stderr */
+	return ret;
+}
+
+static enum fio_q_status fio_mtd_queue(struct thread_data *td,
+				       struct io_u *io_u)
+{
+	struct fio_file *f = io_u->file;
+	struct fio_mtd_data *fmd = FILE_ENG_DATA(f);
+	struct fio_mtd_options *o = td->eo;
+	int local_offs = 0;
+	int ret;
+
+	fio_ro_check(td, io_u);
+
+	/*
+	 * Errors tend to pertain to particular erase blocks, so divide up
+	 * I/O to erase block size.
+	 * If an error is encountered, log it and keep going on to the next
+	 * block, because the error probably just pertains to that block.
+	 * TODO(dehrenberg): Divide up reads and writes into page-sized
+	 * operations to get more fine-grained information about errors.
+	 */
+	while (local_offs < io_u->buflen) {
+		int eb = (io_u->offset + local_offs) / fmd->info.eb_size;
+		int eb_offs = (io_u->offset + local_offs) % fmd->info.eb_size;
+		/* The length is the smaller of the length remaining in the
+		 * buffer and the distance to the end of the erase block */
+		int len = min((int)io_u->buflen - local_offs,
+			      (int)fmd->info.eb_size - eb_offs);
+		char *buf = ((char *)io_u->buf) + local_offs;
+
+		if (o->skip_bad) {
+			ret = fio_mtd_is_bad(td, fmd, io_u, eb);
+			if (ret == -1)
+				break;
+			else if (ret == 1)
+				goto next;
+		}
+		if (io_u->ddir == DDIR_READ) {
+			ret = mtd_read(&fmd->info, f->fd, eb, eb_offs, buf, len);
+			if (ret != 0) {
+				io_u->error = errno;
+				td_verror(td, errno, "mtd_read");
+				if (fio_mtd_maybe_mark_bad(td, fmd, io_u, eb))
+					break;
+			}
+		} else if (io_u->ddir == DDIR_WRITE) {
+			ret = mtd_write(desc, &fmd->info, f->fd, eb,
+					eb_offs, buf, len, NULL, 0, 0);
+			if (ret != 0) {
+				io_u->error = errno;
+				td_verror(td, errno, "mtd_write");
+				if (fio_mtd_maybe_mark_bad(td, fmd, io_u, eb))
+					break;
+			}
+		} else if (io_u->ddir == DDIR_TRIM) {
+			if (eb_offs != 0 || len != fmd->info.eb_size) {
+				io_u->error = EINVAL;
+				td_verror(td, EINVAL,
+					  "trim on MTD must be erase block-aligned");
+			}
+			ret = mtd_erase(desc, &fmd->info, f->fd, eb);
+			if (ret != 0) {
+				io_u->error = errno;
+				td_verror(td, errno, "mtd_erase");
+				if (fio_mtd_maybe_mark_bad(td, fmd, io_u, eb))
+					break;
+			}
+		} else {
+			io_u->error = ENOTSUP;
+			td_verror(td, io_u->error, "operation not supported on mtd");
+		}
+
+next:
+		local_offs += len;
+	}
+
+	return FIO_Q_COMPLETED;
+}
+
+static int fio_mtd_open_file(struct thread_data *td, struct fio_file *f)
+{
+	struct fio_mtd_data *fmd;
+	int ret;
+
+	ret = generic_open_file(td, f);
+	if (ret)
+		return ret;
+
+	fmd = calloc(1, sizeof(*fmd));
+	if (!fmd)
+		goto err_close;
+
+	ret = mtd_get_dev_info(desc, f->file_name, &fmd->info);
+	if (ret != 0) {
+		td_verror(td, errno, "mtd_get_dev_info");
+		goto err_free;
+	}
+
+	FILE_SET_ENG_DATA(f, fmd);
+	return 0;
+
+err_free:
+	free(fmd);
+err_close:
+	{
+		int fio_unused __ret;
+		__ret = generic_close_file(td, f);
+		return 1;
+	}
+}
+
+static int fio_mtd_close_file(struct thread_data *td, struct fio_file *f)
+{
+	struct fio_mtd_data *fmd = FILE_ENG_DATA(f);
+
+	FILE_SET_ENG_DATA(f, NULL);
+	free(fmd);
+
+	return generic_close_file(td, f);
+}
+
+static int fio_mtd_get_file_size(struct thread_data *td, struct fio_file *f)
+{
+	struct mtd_dev_info info;
+
+	int ret = mtd_get_dev_info(desc, f->file_name, &info);
+	if (ret != 0) {
+		td_verror(td, errno, "mtd_get_dev_info");
+		return errno;
+	}
+	f->real_file_size = info.size;
+
+	return 0;
+}
+
+static struct ioengine_ops ioengine = {
+	.name = "mtd",
+	.version = FIO_IOOPS_VERSION,
+	.queue = fio_mtd_queue,
+	.open_file = fio_mtd_open_file,
+	.close_file = fio_mtd_close_file,
+	.get_file_size = fio_mtd_get_file_size,
+	.flags = FIO_SYNCIO | FIO_NOEXTEND,
+	.options = options,
+	.option_struct_size = sizeof(struct fio_mtd_options),
+};
+
+static void fio_init fio_mtd_register(void)
+{
+	desc = libmtd_open();
+	register_ioengine(&ioengine);
+}
+
+static void fio_exit fio_mtd_unregister(void)
+{
+	unregister_ioengine(&ioengine);
+	libmtd_close(desc);
+	desc = NULL;
+}
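
The loop in fio_mtd_queue() walks the buffer in erase-block-sized pieces so that an error can be attributed to a single block. A standalone sketch of that split arithmetic, with hypothetical sizes (a 128 KiB erase block and an I/O that straddles three blocks):

#include <stdio.h>
#include <stdint.h>

static int min_int(int a, int b) { return a < b ? a : b; }

int main(void)
{
	int eb_size = 128 * 1024;	/* erase block size, hypothetical */
	uint64_t offset = 100 * 1024;	/* I/O byte offset */
	int buflen = 200 * 1024;	/* I/O byte length */
	int local_offs = 0;

	while (local_offs < buflen) {
		int eb = (offset + local_offs) / eb_size;
		int eb_offs = (offset + local_offs) % eb_size;
		/* shorter of: bytes left in buffer, bytes to block end */
		int len = min_int(buflen - local_offs, eb_size - eb_offs);

		printf("erase block %d, offset %d, len %d\n", eb, eb_offs, len);
		local_offs += len;
	}
	return 0;
}
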
diff --git a/engines/nbd.c b/engines/nbd.c
new file mode 100644
index 0000000..5323792
--- /dev/null
+++ b/engines/nbd.c
@@ -0,0 +1,359 @@
+/*
+ * NBD engine
+ *
+ * IO engine that talks to an NBD server.
+ *
+ * Copyright (C) 2019 Red Hat Inc.
+ * Written by Richard W.M. Jones
+ *
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include <errno.h>
+
+#include <libnbd.h>
+
+#include "../fio.h"
+#include "../optgroup.h"
+
+/* Actually this differs across servers, but for nbdkit ... */
+#define NBD_MAX_REQUEST_SIZE (64 * 1024 * 1024)
+
+/* Storage for the NBD handle. */
+struct nbd_data {
+	struct nbd_handle *nbd;
+	int debug;
+
+	/* The list of completed io_u structs. */
+	struct io_u **completed;
+	size_t nr_completed;
+};
+
+/* Options. */
+struct nbd_options {
+	void *padding;
+	char *uri;
+};
+
+static struct fio_option options[] = {
+	{
+		.name = "uri",
+		.lname = "NBD URI",
+		.help = "Name of NBD URI",
+		.category = FIO_OPT_C_ENGINE,
+		.group = FIO_OPT_G_NBD,
+		.type = FIO_OPT_STR_STORE,
+		.off1 = offsetof(struct nbd_options, uri),
+	},
+	{
+		.name = NULL,
+	},
+};
+
+/* Allocates nbd_data. */
+static int nbd_setup(struct thread_data *td)
+{
+	struct nbd_data *nbd_data;
+	struct nbd_options *o = td->eo;
+	struct fio_file *f;
+	int r;
+	int64_t size;
+
+	nbd_data = calloc(1, sizeof(*nbd_data));
+	if (!nbd_data) {
+		td_verror(td, errno, "calloc");
+		return 1;
+	}
+	td->io_ops_data = nbd_data;
+
+	/* Pretend to deal with files. See engines/rbd.c */
+	if (!td->files_index) {
+		add_file(td, "nbd", 0, 0);
+		td->o.nr_files = td->o.nr_files ? : 1;
+		td->o.open_files++;
+	}
+	f = td->files[0];
+
+	nbd_data->nbd = nbd_create();
+	if (!nbd_data->nbd) {
+		log_err("fio: nbd_create: %s\n", nbd_get_error());
+		return 1;
+	}
+
+	/* Get the debug flag which can be set through LIBNBD_DEBUG=1. */
+	nbd_data->debug = nbd_get_debug(nbd_data->nbd);
+
+	/* Connect synchronously here so we can check for the size and
+	 * in future other properties of the server.
+	 */
+	if (!o->uri) {
+		log_err("fio: nbd: uri parameter was not specified\n");
+		return 1;
+	}
+	r = nbd_connect_uri(nbd_data->nbd, o->uri);
+	if (r == -1) {
+		log_err("fio: nbd_connect_uri: %s\n", nbd_get_error());
+		return 1;
+	}
+	size = nbd_get_size(nbd_data->nbd);
+	if (size == -1) {
+		log_err("fio: nbd_get_size: %s\n", nbd_get_error());
+		return 1;
+	}
+
+	f->real_file_size = size;
+
+	nbd_close(nbd_data->nbd);
+	nbd_data->nbd = NULL;
+
+	return 0;
+}
+
+/* Closes socket and frees nbd_data -- the opposite of nbd_setup. */
+static void nbd_cleanup(struct thread_data *td)
+{
+	struct nbd_data *nbd_data = td->io_ops_data;
+
+	if (nbd_data) {
+		if (nbd_data->nbd)
+			nbd_close(nbd_data->nbd);
+		free(nbd_data);
+	}
+}
+
+/* Connect to the server from each thread. */
+static int nbd_init(struct thread_data *td)
+{
+	struct nbd_options *o = td->eo;
+	struct nbd_data *nbd_data = td->io_ops_data;
+	int r;
+
+	if (!o->uri) {
+		log_err("fio: nbd: uri parameter was not specified\n");
+		return 1;
+	}
+
+	nbd_data->nbd = nbd_create();
+	if (!nbd_data->nbd) {
+		log_err("fio: nbd_create: %s\n", nbd_get_error());
+		return 1;
+	}
+	/* This is actually a synchronous connect and handshake. */
+	r = nbd_connect_uri(nbd_data->nbd, o->uri);
+	if (r == -1) {
+		log_err("fio: nbd_connect_uri: %s\n", nbd_get_error());
+		return 1;
+	}
+
+	log_info("fio: connected to NBD server\n");
+	return 0;
+}
+
+/* A command in flight has been completed. */
+static int cmd_completed(void *vp, int *error)
+{
+	struct io_u *io_u;
+	struct nbd_data *nbd_data;
+	struct io_u **completed;
+
+	io_u = vp;
+	nbd_data = io_u->engine_data;
+
+	if (nbd_data->debug)
+		log_info("fio: nbd: command completed\n");
+
+	if (*error != 0)
+		io_u->error = *error;
+	else
+		io_u->error = 0;
+
+	/* Add this completion to the list so it can be picked up
+	 * later by ->event.
+	 */
+	completed = realloc(nbd_data->completed,
+			    sizeof(struct io_u *) *
+			    (nbd_data->nr_completed+1));
+	if (completed == NULL) {
+		io_u->error = errno;
+		return 0;
+	}
+
+	nbd_data->completed = completed;
+	nbd_data->completed[nbd_data->nr_completed] = io_u;
+	nbd_data->nr_completed++;
+
+	return 0;
+}
+
+/* Begin read or write request. */
+static enum fio_q_status nbd_queue(struct thread_data *td,
+				   struct io_u *io_u)
+{
+	struct nbd_data *nbd_data = td->io_ops_data;
+	nbd_completion_callback completion = { .callback = cmd_completed,
+					       .user_data = io_u };
+	int r;
+
+	fio_ro_check(td, io_u);
+
+	io_u->engine_data = nbd_data;
+
+	if (io_u->ddir == DDIR_WRITE || io_u->ddir == DDIR_READ)
+		assert(io_u->xfer_buflen <= NBD_MAX_REQUEST_SIZE);
+
+	switch (io_u->ddir) {
+	case DDIR_READ:
+		r = nbd_aio_pread(nbd_data->nbd,
+				  io_u->xfer_buf, io_u->xfer_buflen,
+				  io_u->offset, completion, 0);
+		break;
+	case DDIR_WRITE:
+		r = nbd_aio_pwrite(nbd_data->nbd,
+				   io_u->xfer_buf, io_u->xfer_buflen,
+				   io_u->offset, completion, 0);
+		break;
+	case DDIR_TRIM:
+		r = nbd_aio_trim(nbd_data->nbd, io_u->xfer_buflen,
+				 io_u->offset, completion, 0);
+		break;
+	case DDIR_SYNC:
+		/* XXX We could probably also handle
+		 * DDIR_SYNC_FILE_RANGE with a bit of effort.
+		 */
+		r = nbd_aio_flush(nbd_data->nbd, completion, 0);
+		break;
+	default:
+		io_u->error = EINVAL;
+		return FIO_Q_COMPLETED;
+	}
+
+	if (r == -1) {
+		/* errno is optional information on the libnbd error path;
+		 * if it's 0, set it to a default value
+		 */
+		io_u->error = nbd_get_errno();
+		if (io_u->error == 0)
+			io_u->error = EIO;
+		return FIO_Q_COMPLETED;
+	}
+
+	if (nbd_data->debug)
+		log_info("fio: nbd: command issued\n");
+	io_u->error = 0;
+	return FIO_Q_QUEUED;
+}
+
+static unsigned retire_commands(struct nbd_handle *nbd)
+{
+	int64_t cookie;
+	unsigned r = 0;
+
+	while ((cookie = nbd_aio_peek_command_completed(nbd)) > 0) {
+		/* Ignore the return value. cmd_completed has already
+		 * checked for an error and set io_u->error. We only
+		 * have to call this to retire the command.
+		 */
+		nbd_aio_command_completed(nbd, cookie);
+		r++;
+	}
+
+	if (nbd_get_debug(nbd))
+		log_info("fio: nbd: %u commands retired\n", r);
+	return r;
+}
+
+static int nbd_getevents(struct thread_data *td, unsigned int min,
+			 unsigned int max, const struct timespec *t)
+{
+	struct nbd_data *nbd_data = td->io_ops_data;
+	int r;
+	unsigned events = 0;
+	int timeout;
+
+	/* XXX This handling of the timeout is wrong: in the worst case the
+	 * loop waits for (number of iterations * timeout) in total.
+	 */
+	timeout = !t ? -1 : t->tv_sec * 1000 + t->tv_nsec / 1000000;
+
+	while (events < min) {
+		r = nbd_poll(nbd_data->nbd, timeout);
+		if (r == -1) {
+			/* error in poll */
+			log_err("fio: nbd_poll: %s\n", nbd_get_error());
+			return -1;
+		}
+		else {
+			/* poll made progress */
+			events += retire_commands(nbd_data->nbd);
+		}
+	}
+
+	return events;
+}
+
+static struct io_u *nbd_event(struct thread_data *td, int event)
+{
+	struct nbd_data *nbd_data = td->io_ops_data;
+
+	if (nbd_data->nr_completed == 0)
+		return NULL;
+
+	/* XXX We ignore the event number and assume fio calls us
+	 * exactly once for [0..nr_events-1].
+	 */
+	nbd_data->nr_completed--;
+	return nbd_data->completed[nbd_data->nr_completed];
+}
+
+static int nbd_io_u_init(struct thread_data *td, struct io_u *io_u)
+{
+	io_u->engine_data = NULL;
+	return 0;
+}
+
+static void nbd_io_u_free(struct thread_data *td, struct io_u *io_u)
+{
+	/* Nothing needs to be done. */
+}
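
The queue/getevents pair above follows libnbd's async model: submit with an nbd_aio_*() call plus a completion callback, pump the connection with nbd_poll(), then retire finished cookies. A minimal standalone sketch of the same flow, using only calls that appear in the engine (the URI is a placeholder):

#include <stdio.h>
#include <stdlib.h>
#include <libnbd.h>

static int done;

static int on_complete(void *vp, int *error)
{
	done = 1;
	return 0;	/* returning 0 leaves the command to be retired below */
}

int main(void)
{
	struct nbd_handle *nbd = nbd_create();
	char buf[512];
	int64_t cookie;

	if (!nbd || nbd_connect_uri(nbd, "nbd://localhost") == -1) {
		fprintf(stderr, "%s\n", nbd_get_error());
		exit(EXIT_FAILURE);
	}

	nbd_completion_callback cb = { .callback = on_complete };
	if (nbd_aio_pread(nbd, buf, sizeof(buf), 0, cb, 0) == -1) {
		fprintf(stderr, "%s\n", nbd_get_error());
		exit(EXIT_FAILURE);
	}

	while (!done)
		nbd_poll(nbd, -1);	/* run the connection state machine */

	/* Retire the finished command, as retire_commands() does above. */
	while ((cookie = nbd_aio_peek_command_completed(nbd)) > 0)
		nbd_aio_command_completed(nbd, cookie);

	nbd_close(nbd);
	return 0;
}
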
*/ +} + +static int nbd_open_file(struct thread_data *td, struct fio_file *f) +{ + return 0; +} + +static int nbd_invalidate(struct thread_data *td, struct fio_file *f) +{ + return 0; +} + +static struct ioengine_ops ioengine = { + .name = "nbd", + .version = FIO_IOOPS_VERSION, + .options = options, + .option_struct_size = sizeof(struct nbd_options), + .flags = FIO_DISKLESSIO | FIO_NOEXTEND, + + .setup = nbd_setup, + .init = nbd_init, + .cleanup = nbd_cleanup, + .queue = nbd_queue, + .getevents = nbd_getevents, + .event = nbd_event, + .io_u_init = nbd_io_u_init, + .io_u_free = nbd_io_u_free, + + .open_file = nbd_open_file, + .invalidate = nbd_invalidate, +}; + +static void fio_init fio_nbd_register(void) +{ + register_ioengine(&ioengine); +} + +static void fio_exit fio_nbd_unregister(void) +{ + unregister_ioengine(&ioengine); +} diff --git a/engines/net.c b/engines/net.c new file mode 100644 index 0000000..91f2577 --- /dev/null +++ b/engines/net.c @@ -0,0 +1,1466 @@ +/* + * net engine + * + * IO engine that reads/writes to/from sockets. + * + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "../fio.h" +#include "../verify.h" +#include "../optgroup.h" + +struct netio_data { + int listenfd; + int use_splice; + int seq_off; + int pipes[2]; + struct sockaddr_in addr; + struct sockaddr_in6 addr6; + struct sockaddr_un addr_un; + uint64_t udp_send_seq; + uint64_t udp_recv_seq; +}; + +struct netio_options { + struct thread_data *td; + unsigned int port; + unsigned int proto; + unsigned int listen; + unsigned int pingpong; + unsigned int nodelay; + unsigned int ttl; + unsigned int window_size; + unsigned int mss; + char *intfc; +}; + +struct udp_close_msg { + uint32_t magic; + uint32_t cmd; +}; + +struct udp_seq { + uint64_t magic; + uint64_t seq; + uint64_t bs; +}; + +enum { + FIO_LINK_CLOSE = 0x89, + FIO_LINK_OPEN_CLOSE_MAGIC = 0x6c696e6b, + FIO_LINK_OPEN = 0x98, + FIO_UDP_SEQ_MAGIC = 0x657375716e556563ULL, + + FIO_TYPE_TCP = 1, + FIO_TYPE_UDP = 2, + FIO_TYPE_UNIX = 3, + FIO_TYPE_TCP_V6 = 4, + FIO_TYPE_UDP_V6 = 5, +}; + +static int str_hostname_cb(void *data, const char *input); +static struct fio_option options[] = { + { + .name = "hostname", + .lname = "net engine hostname", + .type = FIO_OPT_STR_STORE, + .cb = str_hostname_cb, + .help = "Hostname for net IO engine", + .category = FIO_OPT_C_ENGINE, + .group = FIO_OPT_G_NETIO, + }, + { + .name = "port", + .lname = "net engine port", + .type = FIO_OPT_INT, + .off1 = offsetof(struct netio_options, port), + .minval = 1, + .maxval = 65535, + .help = "Port to use for TCP or UDP net connections", + .category = FIO_OPT_C_ENGINE, + .group = FIO_OPT_G_NETIO, + }, + { + .name = "protocol", + .lname = "net engine protocol", + .alias = "proto", + .type = FIO_OPT_STR, + .off1 = offsetof(struct netio_options, proto), + .help = "Network protocol to use", + .def = "tcp", + .posval = { + { .ival = "tcp", + .oval = FIO_TYPE_TCP, + .help = "Transmission Control Protocol", + }, +#ifdef CONFIG_IPV6 + { .ival = "tcpv6", + .oval = FIO_TYPE_TCP_V6, + .help = "Transmission Control Protocol V6", + }, +#endif + { .ival = "udp", + .oval = FIO_TYPE_UDP, + .help = "User Datagram Protocol", + }, +#ifdef CONFIG_IPV6 + { .ival = "udpv6", + .oval = FIO_TYPE_UDP_V6, + .help = "User Datagram Protocol V6", + }, +#endif + { .ival = "unix", + .oval = FIO_TYPE_UNIX, + .help = "UNIX domain socket", + }, + }, + .category = FIO_OPT_C_ENGINE, + .group = FIO_OPT_G_NETIO, + }, +#ifdef 
+#ifdef CONFIG_TCP_NODELAY
+	{
+		.name = "nodelay",
+		.lname = "No Delay",
+		.type = FIO_OPT_BOOL,
+		.off1 = offsetof(struct netio_options, nodelay),
+		.help = "Use TCP_NODELAY on TCP connections",
+		.category = FIO_OPT_C_ENGINE,
+		.group = FIO_OPT_G_NETIO,
+	},
+#endif
+	{
+		.name = "listen",
+		.lname = "net engine listen",
+		.type = FIO_OPT_STR_SET,
+		.off1 = offsetof(struct netio_options, listen),
+		.help = "Listen for incoming TCP connections",
+		.category = FIO_OPT_C_ENGINE,
+		.group = FIO_OPT_G_NETIO,
+	},
+	{
+		.name = "pingpong",
+		.lname = "Ping Pong",
+		.type = FIO_OPT_STR_SET,
+		.off1 = offsetof(struct netio_options, pingpong),
+		.help = "Ping-pong IO requests",
+		.category = FIO_OPT_C_ENGINE,
+		.group = FIO_OPT_G_NETIO,
+	},
+	{
+		.name = "interface",
+		.lname = "net engine interface",
+		.type = FIO_OPT_STR_STORE,
+		.off1 = offsetof(struct netio_options, intfc),
+		.help = "Network interface to use",
+		.category = FIO_OPT_C_ENGINE,
+		.group = FIO_OPT_G_NETIO,
+	},
+	{
+		.name = "ttl",
+		.lname = "net engine multicast ttl",
+		.type = FIO_OPT_INT,
+		.off1 = offsetof(struct netio_options, ttl),
+		.def = "1",
+		.minval = 0,
+		.help = "Time-to-live value for outgoing UDP multicast packets",
+		.category = FIO_OPT_C_ENGINE,
+		.group = FIO_OPT_G_NETIO,
+	},
+#ifdef CONFIG_NET_WINDOWSIZE
+	{
+		.name = "window_size",
+		.lname = "Window Size",
+		.type = FIO_OPT_INT,
+		.off1 = offsetof(struct netio_options, window_size),
+		.minval = 0,
+		.help = "Set socket buffer window size",
+		.category = FIO_OPT_C_ENGINE,
+		.group = FIO_OPT_G_NETIO,
+	},
+#endif
+#ifdef CONFIG_NET_MSS
+	{
+		.name = "mss",
+		.lname = "Maximum segment size",
+		.type = FIO_OPT_INT,
+		.off1 = offsetof(struct netio_options, mss),
+		.minval = 0,
+		.help = "Set TCP maximum segment size",
+		.category = FIO_OPT_C_ENGINE,
+		.group = FIO_OPT_G_NETIO,
+	},
+#endif
+	{
+		.name = NULL,
+	},
+};
+
+static inline int is_udp(struct netio_options *o)
+{
+	return o->proto == FIO_TYPE_UDP || o->proto == FIO_TYPE_UDP_V6;
+}
+
+static inline int is_tcp(struct netio_options *o)
+{
+	return o->proto == FIO_TYPE_TCP || o->proto == FIO_TYPE_TCP_V6;
+}
+
+static inline int is_ipv6(struct netio_options *o)
+{
+	return o->proto == FIO_TYPE_UDP_V6 || o->proto == FIO_TYPE_TCP_V6;
+}
+
+static int set_window_size(struct thread_data *td, int fd)
+{
+#ifdef CONFIG_NET_WINDOWSIZE
+	struct netio_options *o = td->eo;
+	unsigned int wss;
+	int snd, rcv, ret;
+
+	if (!o->window_size)
+		return 0;
+
+	rcv = o->listen || o->pingpong;
+	snd = !o->listen || o->pingpong;
+	wss = o->window_size;
+	ret = 0;
+
+	if (rcv) {
+		ret = setsockopt(fd, SOL_SOCKET, SO_RCVBUF, (void *) &wss,
+				 sizeof(wss));
+		if (ret < 0)
+			td_verror(td, errno, "rcvbuf window size");
+	}
+	if (snd && !ret) {
+		ret = setsockopt(fd, SOL_SOCKET, SO_SNDBUF, (void *) &wss,
+				 sizeof(wss));
+		if (ret < 0)
+			td_verror(td, errno, "sndbuf window size");
+	}
+
+	return ret;
+#else
+	td_verror(td, -EINVAL, "setsockopt window size");
+	return -1;
+#endif
+}
+
+static int set_mss(struct thread_data *td, int fd)
+{
+#ifdef CONFIG_NET_MSS
+	struct netio_options *o = td->eo;
+	unsigned int mss;
+	int ret;
+
+	if (!o->mss || !is_tcp(o))
+		return 0;
+
+	mss = o->mss;
+	ret = setsockopt(fd, IPPROTO_TCP, TCP_MAXSEG, (void *) &mss,
+			 sizeof(mss));
+	if (ret < 0)
+		td_verror(td, errno, "setsockopt TCP_MAXSEG");
+
+	return ret;
+#else
+	td_verror(td, -EINVAL, "setsockopt TCP_MAXSEG");
+	return -1;
+#endif
+}
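
The netsplice variant further down avoids copying through user space by staging data in a pipe: vmsplice() maps the user buffer's pages into the pipe, and splice() moves pipe pages to the socket. A minimal Linux-only sketch of the transmit side under those assumptions (partial transfers and error paths are trimmed; the socket fd is supplied by the caller):

#define _GNU_SOURCE
#include <fcntl.h>
#include <sys/uio.h>
#include <unistd.h>

/* Move 'len' bytes of 'buf' to 'sockfd' via a pipe, without a memcpy. */
static ssize_t splice_send(int sockfd, void *buf, size_t len)
{
	int pipes[2];
	struct iovec iov = { .iov_base = buf, .iov_len = len };
	ssize_t in, out;

	if (pipe(pipes) < 0)
		return -1;

	/* Map the user pages into the pipe... */
	in = vmsplice(pipes[1], &iov, 1, 0);
	/* ...then move them from the pipe to the socket. */
	out = in > 0 ? splice(pipes[0], NULL, sockfd, NULL, in, 0) : in;

	close(pipes[0]);
	close(pipes[1]);
	return out;
}
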
+/*
+ * Return -1 on error, or the (positive) number of events seen
+ */
+static int poll_wait(struct thread_data *td, int fd, short events)
+{
+	struct pollfd pfd;
+	int ret;
+
+	while (!td->terminate) {
+		pfd.fd = fd;
+		pfd.events = events;
+		ret = poll(&pfd, 1, -1);
+		if (ret < 0) {
+			if (errno == EINTR)
+				break;
+
+			td_verror(td, errno, "poll");
+			return -1;
+		} else if (!ret)
+			continue;
+
+		break;
+	}
+
+	if (pfd.revents & events)
+		return 1;
+
+	return -1;
+}
+
+static int fio_netio_is_multicast(const char *mcaddr)
+{
+	in_addr_t addr = inet_network(mcaddr);
+	if (addr == -1)
+		return 0;
+
+	if (inet_network("224.0.0.0") <= addr &&
+	    inet_network("239.255.255.255") >= addr)
+		return 1;
+
+	return 0;
+}
+
+static int fio_netio_prep(struct thread_data *td, struct io_u *io_u)
+{
+	struct netio_options *o = td->eo;
+
+	/*
+	 * Make sure we don't see spurious reads to a receiver, and vice versa
+	 */
+	if (is_tcp(o))
+		return 0;
+
+	if ((o->listen && io_u->ddir == DDIR_WRITE) ||
+	    (!o->listen && io_u->ddir == DDIR_READ)) {
+		td_verror(td, EINVAL, "bad direction");
+		return 1;
+	}
+
+	return 0;
+}
+
+#ifdef CONFIG_LINUX_SPLICE
+static int splice_io_u(int fdin, int fdout, unsigned int len)
+{
+	int bytes = 0;
+
+	while (len) {
+		int ret = splice(fdin, NULL, fdout, NULL, len, 0);
+
+		if (ret < 0) {
+			if (!bytes)
+				bytes = ret;
+
+			break;
+		} else if (!ret)
+			break;
+
+		bytes += ret;
+		len -= ret;
+	}
+
+	return bytes;
+}
+
+/*
+ * Receive bytes from a socket and fill them into the internal pipe
+ */
+static int splice_in(struct thread_data *td, struct io_u *io_u)
+{
+	struct netio_data *nd = td->io_ops_data;
+
+	return splice_io_u(io_u->file->fd, nd->pipes[1], io_u->xfer_buflen);
+}
+
+/*
+ * Transmit 'len' bytes from the internal pipe
+ */
+static int splice_out(struct thread_data *td, struct io_u *io_u,
+		      unsigned int len)
+{
+	struct netio_data *nd = td->io_ops_data;
+
+	return splice_io_u(nd->pipes[0], io_u->file->fd, len);
+}
+
+static int vmsplice_io_u(struct io_u *io_u, int fd, unsigned int len)
+{
+	struct iovec iov = {
+		.iov_base = io_u->xfer_buf,
+		.iov_len = len,
+	};
+	int bytes = 0;
+
+	while (iov.iov_len) {
+		int ret = vmsplice(fd, &iov, 1, SPLICE_F_MOVE);
+
+		if (ret < 0) {
+			if (!bytes)
+				bytes = ret;
+			break;
+		} else if (!ret)
+			break;
+
+		iov.iov_len -= ret;
+		iov.iov_base += ret;
+		bytes += ret;
+	}
+
+	return bytes;
+}
+
+/*
+ * vmsplice() pipe to io_u buffer
+ */
+static int vmsplice_io_u_out(struct thread_data *td, struct io_u *io_u,
+			     unsigned int len)
+{
+	struct netio_data *nd = td->io_ops_data;
+
+	return vmsplice_io_u(io_u, nd->pipes[0], len);
+}
+
+/*
+ * vmsplice() io_u to pipe
+ */
+static int vmsplice_io_u_in(struct thread_data *td, struct io_u *io_u)
+{
+	struct netio_data *nd = td->io_ops_data;
+
+	return vmsplice_io_u(io_u, nd->pipes[1], io_u->xfer_buflen);
+}
+
+/*
+ * splice receive - transfer socket data into a pipe using splice, then map
+ * that pipe data into the io_u using vmsplice.
+ */
+static int fio_netio_splice_in(struct thread_data *td, struct io_u *io_u)
+{
+	int ret;
+
+	ret = splice_in(td, io_u);
+	if (ret > 0)
+		return vmsplice_io_u_out(td, io_u, ret);
+
+	return ret;
+}
+
+/*
+ * splice transmit - map data from the io_u into a pipe by using vmsplice,
+ * then transfer that pipe to a socket using splice.
+ */ +static int fio_netio_splice_out(struct thread_data *td, struct io_u *io_u) +{ + int ret; + + ret = vmsplice_io_u_in(td, io_u); + if (ret > 0) + return splice_out(td, io_u, ret); + + return ret; +} +#else +static int fio_netio_splice_in(struct thread_data *td, struct io_u *io_u) +{ + errno = EOPNOTSUPP; + return -1; +} + +static int fio_netio_splice_out(struct thread_data *td, struct io_u *io_u) +{ + errno = EOPNOTSUPP; + return -1; +} +#endif + +static void store_udp_seq(struct netio_data *nd, struct io_u *io_u) +{ + struct udp_seq *us; + + if (io_u->xfer_buflen < sizeof(*us)) + return; + + us = io_u->xfer_buf + io_u->xfer_buflen - sizeof(*us); + us->magic = cpu_to_le64((uint64_t) FIO_UDP_SEQ_MAGIC); + us->bs = cpu_to_le64((uint64_t) io_u->xfer_buflen); + us->seq = cpu_to_le64(nd->udp_send_seq++); +} + +static void verify_udp_seq(struct thread_data *td, struct netio_data *nd, + struct io_u *io_u) +{ + struct udp_seq *us; + uint64_t seq; + + if (io_u->xfer_buflen < sizeof(*us)) + return; + + if (nd->seq_off) + return; + + us = io_u->xfer_buf + io_u->xfer_buflen - sizeof(*us); + if (le64_to_cpu(us->magic) != FIO_UDP_SEQ_MAGIC) + return; + if (le64_to_cpu(us->bs) != io_u->xfer_buflen) { + nd->seq_off = 1; + return; + } + + seq = le64_to_cpu(us->seq); + + if (seq != nd->udp_recv_seq) + td->ts.drop_io_u[io_u->ddir] += seq - nd->udp_recv_seq; + + nd->udp_recv_seq = seq + 1; +} + +static int fio_netio_send(struct thread_data *td, struct io_u *io_u) +{ + struct netio_data *nd = td->io_ops_data; + struct netio_options *o = td->eo; + int ret, flags = 0; + + do { + if (is_udp(o)) { + const struct sockaddr *to; + socklen_t len; + + if (is_ipv6(o)) { + to = (struct sockaddr *) &nd->addr6; + len = sizeof(nd->addr6); + } else { + to = (struct sockaddr *) &nd->addr; + len = sizeof(nd->addr); + } + + if (td->o.verify == VERIFY_NONE) + store_udp_seq(nd, io_u); + + ret = sendto(io_u->file->fd, io_u->xfer_buf, + io_u->xfer_buflen, flags, to, len); + } else { + /* + * if we are going to write more, set MSG_MORE + */ +#ifdef MSG_MORE + if ((td->this_io_bytes[DDIR_WRITE] + io_u->xfer_buflen < + td->o.size) && !o->pingpong) + flags |= MSG_MORE; +#endif + ret = send(io_u->file->fd, io_u->xfer_buf, + io_u->xfer_buflen, flags); + } + if (ret > 0) + break; + + ret = poll_wait(td, io_u->file->fd, POLLOUT); + if (ret <= 0) + break; + } while (1); + + return ret; +} + +static int is_close_msg(struct io_u *io_u, int len) +{ + struct udp_close_msg *msg; + + if (len != sizeof(struct udp_close_msg)) + return 0; + + msg = io_u->xfer_buf; + if (le32_to_cpu(msg->magic) != FIO_LINK_OPEN_CLOSE_MAGIC) + return 0; + if (le32_to_cpu(msg->cmd) != FIO_LINK_CLOSE) + return 0; + + return 1; +} + +static int fio_netio_recv(struct thread_data *td, struct io_u *io_u) +{ + struct netio_data *nd = td->io_ops_data; + struct netio_options *o = td->eo; + int ret, flags = 0; + + do { + if (is_udp(o)) { + struct sockaddr *from; + socklen_t l, *len = &l; + + if (o->listen) { + if (!is_ipv6(o)) { + from = (struct sockaddr *) &nd->addr; + *len = sizeof(nd->addr); + } else { + from = (struct sockaddr *) &nd->addr6; + *len = sizeof(nd->addr6); + } + } else { + from = NULL; + len = NULL; + } + + ret = recvfrom(io_u->file->fd, io_u->xfer_buf, + io_u->xfer_buflen, flags, from, len); + + if (is_close_msg(io_u, ret)) { + td->done = 1; + return 0; + } + } else { + ret = recv(io_u->file->fd, io_u->xfer_buf, + io_u->xfer_buflen, flags); + + if (is_close_msg(io_u, ret)) { + td->done = 1; + return 0; + } + } + if (ret > 0) + break; + else if (!ret && 
(flags & MSG_WAITALL)) + break; + + ret = poll_wait(td, io_u->file->fd, POLLIN); + if (ret <= 0) + break; + flags |= MSG_WAITALL; + } while (1); + + if (is_udp(o) && td->o.verify == VERIFY_NONE) + verify_udp_seq(td, nd, io_u); + + return ret; +} + +static enum fio_q_status __fio_netio_queue(struct thread_data *td, + struct io_u *io_u, + enum fio_ddir ddir) +{ + struct netio_data *nd = td->io_ops_data; + struct netio_options *o = td->eo; + int ret; + + if (ddir == DDIR_WRITE) { + if (!nd->use_splice || is_udp(o) || + o->proto == FIO_TYPE_UNIX) + ret = fio_netio_send(td, io_u); + else + ret = fio_netio_splice_out(td, io_u); + } else if (ddir == DDIR_READ) { + if (!nd->use_splice || is_udp(o) || + o->proto == FIO_TYPE_UNIX) + ret = fio_netio_recv(td, io_u); + else + ret = fio_netio_splice_in(td, io_u); + } else + ret = 0; /* must be a SYNC */ + + if (ret != (int) io_u->xfer_buflen) { + if (ret > 0) { + io_u->resid = io_u->xfer_buflen - ret; + io_u->error = 0; + return FIO_Q_COMPLETED; + } else if (!ret) + return FIO_Q_BUSY; + else { + int err = errno; + + if (ddir == DDIR_WRITE && err == EMSGSIZE) + return FIO_Q_BUSY; + + io_u->error = err; + } + } + + if (io_u->error) + td_verror(td, io_u->error, "xfer"); + + return FIO_Q_COMPLETED; +} + +static enum fio_q_status fio_netio_queue(struct thread_data *td, + struct io_u *io_u) +{ + struct netio_options *o = td->eo; + int ret; + + fio_ro_check(td, io_u); + + ret = __fio_netio_queue(td, io_u, io_u->ddir); + if (!o->pingpong || ret != FIO_Q_COMPLETED) + return ret; + + /* + * For ping-pong mode, receive or send reply as needed + */ + if (td_read(td) && io_u->ddir == DDIR_READ) + ret = __fio_netio_queue(td, io_u, DDIR_WRITE); + else if (td_write(td) && io_u->ddir == DDIR_WRITE) + ret = __fio_netio_queue(td, io_u, DDIR_READ); + + return ret; +} + +static int fio_netio_connect(struct thread_data *td, struct fio_file *f) +{ + struct netio_data *nd = td->io_ops_data; + struct netio_options *o = td->eo; + int type, domain; + + if (o->proto == FIO_TYPE_TCP) { + domain = AF_INET; + type = SOCK_STREAM; + } else if (o->proto == FIO_TYPE_TCP_V6) { + domain = AF_INET6; + type = SOCK_STREAM; + } else if (o->proto == FIO_TYPE_UDP) { + domain = AF_INET; + type = SOCK_DGRAM; + } else if (o->proto == FIO_TYPE_UDP_V6) { + domain = AF_INET6; + type = SOCK_DGRAM; + } else if (o->proto == FIO_TYPE_UNIX) { + domain = AF_UNIX; + type = SOCK_STREAM; + } else { + log_err("fio: bad network type %d\n", o->proto); + f->fd = -1; + return 1; + } + + f->fd = socket(domain, type, 0); + if (f->fd < 0) { + td_verror(td, errno, "socket"); + return 1; + } + +#ifdef CONFIG_TCP_NODELAY + if (o->nodelay && is_tcp(o)) { + int optval = 1; + + if (setsockopt(f->fd, IPPROTO_TCP, TCP_NODELAY, (void *) &optval, sizeof(int)) < 0) { + log_err("fio: cannot set TCP_NODELAY option on socket (%s), disable with 'nodelay=0'\n", strerror(errno)); + return 1; + } + } +#endif + + if (set_window_size(td, f->fd)) { + close(f->fd); + return 1; + } + if (set_mss(td, f->fd)) { + close(f->fd); + return 1; + } + + if (is_udp(o)) { + if (!fio_netio_is_multicast(td->o.filename)) + return 0; + if (is_ipv6(o)) { + log_err("fio: multicast not supported on IPv6\n"); + close(f->fd); + return 1; + } + + if (o->intfc) { + struct in_addr interface_addr; + + if (inet_aton(o->intfc, &interface_addr) == 0) { + log_err("fio: interface not valid interface IP\n"); + close(f->fd); + return 1; + } + if (setsockopt(f->fd, IPPROTO_IP, IP_MULTICAST_IF, (const char*)&interface_addr, sizeof(interface_addr)) < 0) { + td_verror(td, 
errno, "setsockopt IP_MULTICAST_IF"); + close(f->fd); + return 1; + } + } + if (setsockopt(f->fd, IPPROTO_IP, IP_MULTICAST_TTL, (const char*)&o->ttl, sizeof(o->ttl)) < 0) { + td_verror(td, errno, "setsockopt IP_MULTICAST_TTL"); + close(f->fd); + return 1; + } + return 0; + } else if (o->proto == FIO_TYPE_TCP) { + socklen_t len = sizeof(nd->addr); + + if (connect(f->fd, (struct sockaddr *) &nd->addr, len) < 0) { + td_verror(td, errno, "connect"); + close(f->fd); + return 1; + } + } else if (o->proto == FIO_TYPE_TCP_V6) { + socklen_t len = sizeof(nd->addr6); + + if (connect(f->fd, (struct sockaddr *) &nd->addr6, len) < 0) { + td_verror(td, errno, "connect"); + close(f->fd); + return 1; + } + + } else { + struct sockaddr_un *addr = &nd->addr_un; + socklen_t len; + + len = sizeof(addr->sun_family) + strlen(addr->sun_path) + 1; + + if (connect(f->fd, (struct sockaddr *) addr, len) < 0) { + td_verror(td, errno, "connect"); + close(f->fd); + return 1; + } + } + + return 0; +} + +static int fio_netio_accept(struct thread_data *td, struct fio_file *f) +{ + struct netio_data *nd = td->io_ops_data; + struct netio_options *o = td->eo; + socklen_t socklen; + int state; + + if (is_udp(o)) { + f->fd = nd->listenfd; + return 0; + } + + state = td->runstate; + td_set_runstate(td, TD_SETTING_UP); + + log_info("fio: waiting for connection\n"); + + if (poll_wait(td, nd->listenfd, POLLIN) < 0) + goto err; + + if (o->proto == FIO_TYPE_TCP) { + socklen = sizeof(nd->addr); + f->fd = accept(nd->listenfd, (struct sockaddr *) &nd->addr, &socklen); + } else { + socklen = sizeof(nd->addr6); + f->fd = accept(nd->listenfd, (struct sockaddr *) &nd->addr6, &socklen); + } + + if (f->fd < 0) { + td_verror(td, errno, "accept"); + goto err; + } + +#ifdef CONFIG_TCP_NODELAY + if (o->nodelay && is_tcp(o)) { + int optval = 1; + + if (setsockopt(f->fd, IPPROTO_TCP, TCP_NODELAY, (void *) &optval, sizeof(int)) < 0) { + log_err("fio: cannot set TCP_NODELAY option on socket (%s), disable with 'nodelay=0'\n", strerror(errno)); + return 1; + } + } +#endif + + reset_all_stats(td); + td_set_runstate(td, state); + return 0; +err: + td_set_runstate(td, state); + return 1; +} + +static void fio_netio_send_close(struct thread_data *td, struct fio_file *f) +{ + struct netio_data *nd = td->io_ops_data; + struct netio_options *o = td->eo; + struct udp_close_msg msg; + struct sockaddr *to; + socklen_t len; + int ret; + + if (is_ipv6(o)) { + to = (struct sockaddr *) &nd->addr6; + len = sizeof(nd->addr6); + } else { + to = (struct sockaddr *) &nd->addr; + len = sizeof(nd->addr); + } + + msg.magic = cpu_to_le32((uint32_t) FIO_LINK_OPEN_CLOSE_MAGIC); + msg.cmd = cpu_to_le32((uint32_t) FIO_LINK_CLOSE); + + ret = sendto(f->fd, (void *) &msg, sizeof(msg), MSG_WAITALL, to, len); + if (ret < 0) + td_verror(td, errno, "sendto udp link close"); +} + +static int fio_netio_close_file(struct thread_data *td, struct fio_file *f) +{ + /* + * Notify the receiver that we are closing down the link + */ + fio_netio_send_close(td, f); + + return generic_close_file(td, f); +} + +static int fio_netio_udp_recv_open(struct thread_data *td, struct fio_file *f) +{ + struct netio_data *nd = td->io_ops_data; + struct netio_options *o = td->eo; + struct udp_close_msg msg; + struct sockaddr *to; + socklen_t len; + int ret; + + if (is_ipv6(o)) { + len = sizeof(nd->addr6); + to = (struct sockaddr *) &nd->addr6; + } else { + len = sizeof(nd->addr); + to = (struct sockaddr *) &nd->addr; + } + + ret = recvfrom(f->fd, (void *) &msg, sizeof(msg), MSG_WAITALL, to, &len); + if (ret < 
0) { + td_verror(td, errno, "recvfrom udp link open"); + return ret; + } + + if (ntohl(msg.magic) != FIO_LINK_OPEN_CLOSE_MAGIC || + ntohl(msg.cmd) != FIO_LINK_OPEN) { + log_err("fio: bad udp open magic %x/%x\n", ntohl(msg.magic), + ntohl(msg.cmd)); + return -1; + } + + fio_gettime(&td->start, NULL); + return 0; +} + +static int fio_netio_send_open(struct thread_data *td, struct fio_file *f) +{ + struct netio_data *nd = td->io_ops_data; + struct netio_options *o = td->eo; + struct udp_close_msg msg; + struct sockaddr *to; + socklen_t len; + int ret; + + if (is_ipv6(o)) { + len = sizeof(nd->addr6); + to = (struct sockaddr *) &nd->addr6; + } else { + len = sizeof(nd->addr); + to = (struct sockaddr *) &nd->addr; + } + + msg.magic = htonl(FIO_LINK_OPEN_CLOSE_MAGIC); + msg.cmd = htonl(FIO_LINK_OPEN); + + ret = sendto(f->fd, (void *) &msg, sizeof(msg), MSG_WAITALL, to, len); + if (ret < 0) { + td_verror(td, errno, "sendto udp link open"); + return ret; + } + + return 0; +} + +static int fio_netio_open_file(struct thread_data *td, struct fio_file *f) +{ + int ret; + struct netio_options *o = td->eo; + + if (o->listen) + ret = fio_netio_accept(td, f); + else + ret = fio_netio_connect(td, f); + + if (ret) { + f->fd = -1; + return ret; + } + + if (is_udp(o)) { + if (td_write(td)) + ret = fio_netio_send_open(td, f); + else { + int state; + + state = td->runstate; + td_set_runstate(td, TD_SETTING_UP); + ret = fio_netio_udp_recv_open(td, f); + td_set_runstate(td, state); + } + } + + if (ret) + fio_netio_close_file(td, f); + + return ret; +} + +static int fio_fill_addr(struct thread_data *td, const char *host, int af, + void *dst, struct addrinfo **res) +{ + struct netio_options *o = td->eo; + struct addrinfo hints; + int ret; + + if (inet_pton(af, host, dst)) + return 0; + + memset(&hints, 0, sizeof(hints)); + + if (is_tcp(o)) + hints.ai_socktype = SOCK_STREAM; + else + hints.ai_socktype = SOCK_DGRAM; + + if (is_ipv6(o)) + hints.ai_family = AF_INET6; + else + hints.ai_family = AF_INET; + + ret = getaddrinfo(host, NULL, &hints, res); + if (ret) { + int e = EINVAL; + char str[128]; + + if (ret == EAI_SYSTEM) + e = errno; + + snprintf(str, sizeof(str), "getaddrinfo: %s", gai_strerror(ret)); + td_verror(td, e, str); + return 1; + } + + return 0; +} + +static int fio_netio_setup_connect_inet(struct thread_data *td, + const char *host, unsigned short port) +{ + struct netio_data *nd = td->io_ops_data; + struct netio_options *o = td->eo; + struct addrinfo *res = NULL; + void *dst, *src; + int af, len; + + if (!host) { + log_err("fio: connect with no host to connect to.\n"); + if (td_read(td)) + log_err("fio: did you forget to set 'listen'?\n"); + + td_verror(td, EINVAL, "no hostname= set"); + return 1; + } + + nd->addr.sin_family = AF_INET; + nd->addr.sin_port = htons(port); + nd->addr6.sin6_family = AF_INET6; + nd->addr6.sin6_port = htons(port); + + if (is_ipv6(o)) { + af = AF_INET6; + dst = &nd->addr6.sin6_addr; + } else { + af = AF_INET; + dst = &nd->addr.sin_addr; + } + + if (fio_fill_addr(td, host, af, dst, &res)) + return 1; + + if (!res) + return 0; + + if (is_ipv6(o)) { + len = sizeof(nd->addr6.sin6_addr); + src = &((struct sockaddr_in6 *) res->ai_addr)->sin6_addr; + } else { + len = sizeof(nd->addr.sin_addr); + src = &((struct sockaddr_in *) res->ai_addr)->sin_addr; + } + + memcpy(dst, src, len); + freeaddrinfo(res); + return 0; +} + +static int fio_netio_setup_connect_unix(struct thread_data *td, + const char *path) +{ + struct netio_data *nd = td->io_ops_data; + struct sockaddr_un *soun = 
&nd->addr_un; + + soun->sun_family = AF_UNIX; + snprintf(soun->sun_path, sizeof(soun->sun_path), "%s", path); + return 0; +} + +static int fio_netio_setup_connect(struct thread_data *td) +{ + struct netio_options *o = td->eo; + + if (is_udp(o) || is_tcp(o)) + return fio_netio_setup_connect_inet(td, td->o.filename,o->port); + else + return fio_netio_setup_connect_unix(td, td->o.filename); +} + +static int fio_netio_setup_listen_unix(struct thread_data *td, const char *path) +{ + struct netio_data *nd = td->io_ops_data; + struct sockaddr_un *addr = &nd->addr_un; + mode_t mode; + int len, fd; + + fd = socket(AF_UNIX, SOCK_STREAM, 0); + if (fd < 0) { + log_err("fio: socket: %s\n", strerror(errno)); + return -1; + } + + mode = umask(000); + + addr->sun_family = AF_UNIX; + snprintf(addr->sun_path, sizeof(addr->sun_path), "%s", path); + unlink(path); + + len = sizeof(addr->sun_family) + strlen(path) + 1; + + if (bind(fd, (struct sockaddr *) addr, len) < 0) { + log_err("fio: bind: %s\n", strerror(errno)); + close(fd); + return -1; + } + + umask(mode); + nd->listenfd = fd; + return 0; +} + +static int fio_netio_setup_listen_inet(struct thread_data *td, short port) +{ + struct netio_data *nd = td->io_ops_data; + struct netio_options *o = td->eo; + struct ip_mreq mr; + struct sockaddr_in sin; + struct sockaddr *saddr; + int fd, opt, type, domain; + socklen_t len; + + memset(&sin, 0, sizeof(sin)); + + if (o->proto == FIO_TYPE_TCP) { + type = SOCK_STREAM; + domain = AF_INET; + } else if (o->proto == FIO_TYPE_TCP_V6) { + type = SOCK_STREAM; + domain = AF_INET6; + } else if (o->proto == FIO_TYPE_UDP) { + type = SOCK_DGRAM; + domain = AF_INET; + } else if (o->proto == FIO_TYPE_UDP_V6) { + type = SOCK_DGRAM; + domain = AF_INET6; + } else { + log_err("fio: unknown proto %d\n", o->proto); + return 1; + } + + fd = socket(domain, type, 0); + if (fd < 0) { + td_verror(td, errno, "socket"); + return 1; + } + + opt = 1; + if (setsockopt(fd, SOL_SOCKET, SO_REUSEADDR, (void *) &opt, sizeof(opt)) < 0) { + td_verror(td, errno, "setsockopt"); + close(fd); + return 1; + } +#ifdef SO_REUSEPORT + if (setsockopt(fd, SOL_SOCKET, SO_REUSEPORT, (void *) &opt, sizeof(opt)) < 0) { + td_verror(td, errno, "setsockopt"); + close(fd); + return 1; + } +#endif + + if (set_window_size(td, fd)) { + close(fd); + return 1; + } + if (set_mss(td, fd)) { + close(fd); + return 1; + } + + if (td->o.filename) { + if (!is_udp(o) || !fio_netio_is_multicast(td->o.filename)) { + log_err("fio: hostname not valid for non-multicast inbound network IO\n"); + close(fd); + return 1; + } + if (is_ipv6(o)) { + log_err("fio: IPv6 not supported for multicast network IO\n"); + close(fd); + return 1; + } + + inet_aton(td->o.filename, &sin.sin_addr); + + mr.imr_multiaddr = sin.sin_addr; + if (o->intfc) { + if (inet_aton(o->intfc, &mr.imr_interface) == 0) { + log_err("fio: interface not valid interface IP\n"); + close(fd); + return 1; + } + } else { + mr.imr_interface.s_addr = htonl(INADDR_ANY); + } + + if (setsockopt(fd, IPPROTO_IP, IP_ADD_MEMBERSHIP, (const char*)&mr, sizeof(mr)) < 0) { + td_verror(td, errno, "setsockopt IP_ADD_MEMBERSHIP"); + close(fd); + return 1; + } + } + + if (!is_ipv6(o)) { + saddr = (struct sockaddr *) &nd->addr; + len = sizeof(nd->addr); + + nd->addr.sin_family = AF_INET; + nd->addr.sin_addr.s_addr = sin.sin_addr.s_addr ? 
sin.sin_addr.s_addr : htonl(INADDR_ANY); + nd->addr.sin_port = htons(port); + } else { + saddr = (struct sockaddr *) &nd->addr6; + len = sizeof(nd->addr6); + + nd->addr6.sin6_family = AF_INET6; + nd->addr6.sin6_addr = in6addr_any; + nd->addr6.sin6_port = htons(port); + } + + if (bind(fd, saddr, len) < 0) { + close(fd); + td_verror(td, errno, "bind"); + return 1; + } + + nd->listenfd = fd; + return 0; +} + +static int fio_netio_setup_listen(struct thread_data *td) +{ + struct netio_data *nd = td->io_ops_data; + struct netio_options *o = td->eo; + int ret; + + if (is_udp(o) || is_tcp(o)) + ret = fio_netio_setup_listen_inet(td, o->port); + else + ret = fio_netio_setup_listen_unix(td, td->o.filename); + + if (ret) + return ret; + if (is_udp(o)) + return 0; + + if (listen(nd->listenfd, 10) < 0) { + td_verror(td, errno, "listen"); + nd->listenfd = -1; + return 1; + } + + return 0; +} + +static int fio_netio_init(struct thread_data *td) +{ + struct netio_options *o = td->eo; + int ret; + +#ifdef WIN32 + WSADATA wsd; + WSAStartup(MAKEWORD(2,2), &wsd); +#endif + + if (td_random(td)) { + log_err("fio: network IO can't be random\n"); + return 1; + } + + if (o->proto == FIO_TYPE_UNIX && o->port) { + log_err("fio: network IO port not valid with unix socket\n"); + return 1; + } else if (o->proto != FIO_TYPE_UNIX && !o->port) { + log_err("fio: network IO requires port for tcp or udp\n"); + return 1; + } + + o->port += td->subjob_number; + + if (!is_tcp(o)) { + if (o->listen) { + log_err("fio: listen only valid for TCP proto IO\n"); + return 1; + } + if (td_rw(td)) { + log_err("fio: datagram network connections must be" + " read OR write\n"); + return 1; + } + if (o->proto == FIO_TYPE_UNIX && !td->o.filename) { + log_err("fio: UNIX sockets need host/filename\n"); + return 1; + } + o->listen = td_read(td); + } + + if (o->listen) + ret = fio_netio_setup_listen(td); + else + ret = fio_netio_setup_connect(td); + + return ret; +} + +static void fio_netio_cleanup(struct thread_data *td) +{ + struct netio_data *nd = td->io_ops_data; + + if (nd) { + if (nd->listenfd != -1) + close(nd->listenfd); + if (nd->pipes[0] != -1) + close(nd->pipes[0]); + if (nd->pipes[1] != -1) + close(nd->pipes[1]); + + free(nd); + } +} + +static int fio_netio_setup(struct thread_data *td) +{ + struct netio_data *nd; + + if (!td->files_index) { + add_file(td, td->o.filename ?: "net", 0, 0); + td->o.nr_files = td->o.nr_files ?: 1; + td->o.open_files++; + } + + if (!td->io_ops_data) { + nd = malloc(sizeof(*nd)); + + memset(nd, 0, sizeof(*nd)); + nd->listenfd = -1; + nd->pipes[0] = nd->pipes[1] = -1; + td->io_ops_data = nd; + } + + return 0; +} + +static void fio_netio_terminate(struct thread_data *td) +{ + kill(td->pid, SIGTERM); +} + +#ifdef CONFIG_LINUX_SPLICE +static int fio_netio_setup_splice(struct thread_data *td) +{ + struct netio_data *nd; + + fio_netio_setup(td); + + nd = td->io_ops_data; + if (nd) { + if (pipe(nd->pipes) < 0) + return 1; + + nd->use_splice = 1; + return 0; + } + + return 1; +} + +static struct ioengine_ops ioengine_splice = { + .name = "netsplice", + .version = FIO_IOOPS_VERSION, + .prep = fio_netio_prep, + .queue = fio_netio_queue, + .setup = fio_netio_setup_splice, + .init = fio_netio_init, + .cleanup = fio_netio_cleanup, + .open_file = fio_netio_open_file, + .close_file = fio_netio_close_file, + .terminate = fio_netio_terminate, + .options = options, + .option_struct_size = sizeof(struct netio_options), + .flags = FIO_SYNCIO | FIO_DISKLESSIO | FIO_UNIDIR | + FIO_PIPEIO, +}; +#endif + +static struct 
ioengine_ops ioengine_rw = {
+	.name			= "net",
+	.version		= FIO_IOOPS_VERSION,
+	.prep			= fio_netio_prep,
+	.queue			= fio_netio_queue,
+	.setup			= fio_netio_setup,
+	.init			= fio_netio_init,
+	.cleanup		= fio_netio_cleanup,
+	.open_file		= fio_netio_open_file,
+	.close_file		= fio_netio_close_file,
+	.terminate		= fio_netio_terminate,
+	.options		= options,
+	.option_struct_size	= sizeof(struct netio_options),
+	.flags			= FIO_SYNCIO | FIO_DISKLESSIO | FIO_UNIDIR |
+				  FIO_PIPEIO | FIO_BIT_BASED,
+};
+
+static int str_hostname_cb(void *data, const char *input)
+{
+	struct netio_options *o = data;
+
+	if (o->td->o.filename)
+		free(o->td->o.filename);
+	o->td->o.filename = strdup(input);
+	return 0;
+}
+
+static void fio_init fio_netio_register(void)
+{
+	register_ioengine(&ioengine_rw);
+#ifdef CONFIG_LINUX_SPLICE
+	register_ioengine(&ioengine_splice);
+#endif
+}
+
+static void fio_exit fio_netio_unregister(void)
+{
+	unregister_ioengine(&ioengine_rw);
+#ifdef CONFIG_LINUX_SPLICE
+	unregister_ioengine(&ioengine_splice);
+#endif
+}
diff --git a/engines/null.c b/engines/null.c
new file mode 100644
index 0000000..4cc0102
--- /dev/null
+++ b/engines/null.c
@@ -0,0 +1,276 @@
+/*
+ * null engine
+ *
+ * IO engine that doesn't do any real IO transfers, it just pretends to.
+ * The main purpose is to test fio itself.
+ *
+ * It also can act as an external C++ engine - compiled with:
+ *
+ * g++ -O2 -g -shared -rdynamic -fPIC -o cpp_null null.c -DFIO_EXTERNAL_ENGINE
+ *
+ * to test it execute:
+ *
+ * LD_LIBRARY_PATH=./engines ./fio examples/cpp_null.fio
+ *
+ */
+#include <stdlib.h>
+#include <assert.h>
+
+#include "../fio.h"
+
+struct null_data {
+	struct io_u **io_us;
+	int queued;
+	int events;
+};
+
+static struct io_u *null_event(struct null_data *nd, int event)
+{
+	return nd->io_us[event];
+}
+
+static int null_getevents(struct null_data *nd, unsigned int min_events,
+			  unsigned int fio_unused max,
+			  const struct timespec fio_unused *t)
+{
+	int ret = 0;
+
+	if (min_events) {
+		ret = nd->events;
+		nd->events = 0;
+	}
+
+	return ret;
+}
+
+static int null_commit(struct thread_data *td, struct null_data *nd)
+{
+	if (!nd->events) {
+#ifndef FIO_EXTERNAL_ENGINE
+		io_u_mark_submit(td, nd->queued);
+#endif
+		nd->events = nd->queued;
+		nd->queued = 0;
+	}
+
+	return 0;
+}
+
+static enum fio_q_status null_queue(struct thread_data *td,
+				    struct null_data *nd, struct io_u *io_u)
+{
+	fio_ro_check(td, io_u);
+
+	if (td->io_ops->flags & FIO_SYNCIO)
+		return FIO_Q_COMPLETED;
+	if (nd->events)
+		return FIO_Q_BUSY;
+
+	nd->io_us[nd->queued++] = io_u;
+	return FIO_Q_QUEUED;
+}
+
+static int null_open(struct null_data fio_unused *nd,
+		     struct fio_file fio_unused *f)
+{
+	return 0;
+}
+
+static void null_cleanup(struct null_data *nd)
+{
+	if (nd) {
+		free(nd->io_us);
+		free(nd);
+	}
+}
+
+static struct null_data *null_init(struct thread_data *td)
+{
+	struct null_data *nd = (struct null_data *) malloc(sizeof(*nd));
+
+	memset(nd, 0, sizeof(*nd));
+
+	if (td->o.iodepth != 1) {
+		nd->io_us = (struct io_u **) malloc(td->o.iodepth * sizeof(struct io_u *));
+		memset(nd->io_us, 0, td->o.iodepth * sizeof(struct io_u *));
+	} else
+		td->io_ops->flags |= FIO_SYNCIO;
+
+	return nd;
+}
+
+#ifndef __cplusplus
+
+static struct io_u *fio_null_event(struct thread_data *td, int event)
+{
+	return null_event(td->io_ops_data, event);
+}
+
+static int fio_null_getevents(struct thread_data *td, unsigned int min_events,
+			      unsigned int max, const struct timespec *t)
+{
+	struct null_data *nd = td->io_ops_data;
+	return null_getevents(nd, min_events, max, t);
+}
+
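+
+/*
+ * The wrappers in this section just forward into the shared helpers
+ * above. The async flow they expose is: null_queue() parks each io_u
+ * in nd->io_us, null_commit() turns everything queued into pending
+ * events, and null_getevents()/null_event() hand the completed io_us
+ * back to fio. No real I/O is ever issued, so the engine measures
+ * fio's own bookkeeping overhead.
+ */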
+static int fio_null_commit(struct thread_data *td)
+{
+	return null_commit(td, td->io_ops_data);
+}
+
+static enum fio_q_status fio_null_queue(struct thread_data *td,
+					struct io_u *io_u)
+{
+	return null_queue(td, td->io_ops_data, io_u);
+}
+
+static int fio_null_open(struct thread_data *td, struct fio_file *f)
+{
+	return null_open(td->io_ops_data, f);
+}
+
+static void fio_null_cleanup(struct thread_data *td)
+{
+	null_cleanup(td->io_ops_data);
+}
+
+static int fio_null_init(struct thread_data *td)
+{
+	td->io_ops_data = null_init(td);
+	assert(td->io_ops_data);
+	return 0;
+}
+
+static struct ioengine_ops ioengine = {
+	.name		= "null",
+	.version	= FIO_IOOPS_VERSION,
+	.queue		= fio_null_queue,
+	.commit		= fio_null_commit,
+	.getevents	= fio_null_getevents,
+	.event		= fio_null_event,
+	.init		= fio_null_init,
+	.cleanup	= fio_null_cleanup,
+	.open_file	= fio_null_open,
+	.flags		= FIO_DISKLESSIO | FIO_FAKEIO,
+};
+
+static void fio_init fio_null_register(void)
+{
+	register_ioengine(&ioengine);
+}
+
+static void fio_exit fio_null_unregister(void)
+{
+	unregister_ioengine(&ioengine);
+}
+
+#else
+
+#ifdef FIO_EXTERNAL_ENGINE
+
+struct NullData {
+	NullData(struct thread_data *td)
+	{
+		impl_ = null_init(td);
+		assert(impl_);
+	}
+
+	~NullData()
+	{
+		null_cleanup(impl_);
+	}
+
+	static NullData *get(struct thread_data *td)
+	{
+		return reinterpret_cast<NullData *>(td->io_ops_data);
+	}
+
+	io_u *fio_null_event(struct thread_data *, int event)
+	{
+		return null_event(impl_, event);
+	}
+
+	int fio_null_getevents(struct thread_data *, unsigned int min_events,
+			       unsigned int max, const struct timespec *t)
+	{
+		return null_getevents(impl_, min_events, max, t);
+	}
+
+	int fio_null_commit(struct thread_data *td)
+	{
+		return null_commit(td, impl_);
+	}
+
+	int fio_null_queue(struct thread_data *td, struct io_u *io_u)
+	{
+		return null_queue(td, impl_, io_u);
+	}
+
+	int fio_null_open(struct thread_data *, struct fio_file *f)
+	{
+		return null_open(impl_, f);
+	}
+
+private:
+	struct null_data *impl_;
+};
+
+extern "C" {
+
+static struct io_u *fio_null_event(struct thread_data *td, int event)
+{
+	return NullData::get(td)->fio_null_event(td, event);
+}
+
+static int fio_null_getevents(struct thread_data *td, unsigned int min_events,
+			      unsigned int max, const struct timespec *t)
+{
+	return NullData::get(td)->fio_null_getevents(td, min_events, max, t);
+}
+
+static int fio_null_commit(struct thread_data *td)
+{
+	return NullData::get(td)->fio_null_commit(td);
+}
+
+static int fio_null_queue(struct thread_data *td, struct io_u *io_u)
+{
+	return NullData::get(td)->fio_null_queue(td, io_u);
+}
+
+static int fio_null_open(struct thread_data *td, struct fio_file *f)
+{
+	return NullData::get(td)->fio_null_open(td, f);
+}
+
+static int fio_null_init(struct thread_data *td)
+{
+	td->io_ops_data = new NullData(td);
+	return 0;
+}
+
+static void fio_null_cleanup(struct thread_data *td)
+{
+	delete NullData::get(td);
+}
+
+static struct ioengine_ops ioengine;
+void get_ioengine(struct ioengine_ops **ioengine_ptr)
+{
+	*ioengine_ptr = &ioengine;
+
+	ioengine.name           = "cpp_null";
+	ioengine.version        = FIO_IOOPS_VERSION;
+	ioengine.queue          = fio_null_queue;
+	ioengine.commit         = fio_null_commit;
+	ioengine.getevents      = fio_null_getevents;
+	ioengine.event          = fio_null_event;
+	ioengine.init           = fio_null_init;
+	ioengine.cleanup        = fio_null_cleanup;
+	ioengine.open_file      = fio_null_open;
+	ioengine.flags          = FIO_DISKLESSIO | FIO_FAKEIO;
+}
+}
+#endif /* FIO_EXTERNAL_ENGINE */
+
+#endif /* __cplusplus */
diff --git a/engines/pmemblk.c b/engines/pmemblk.c
new file mode 100644
index 0000000..730f4d7
--- /dev/null
+++ b/engines/pmemblk.c
@@ -0,0 +1,448 @@
+/*
+ * pmemblk: IO engine that uses PMDK libpmemblk to read and write data
+ *
+ * Copyright (C) 2016 Hewlett Packard Enterprise Development LP
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License,
+ * version 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the Free
+ * Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
+ * Boston, MA 02110-1301, USA.
+ */
+
+/*
+ * pmemblk engine
+ *
+ * IO engine that uses libpmemblk to read and write data
+ *
+ * To use:
+ *   ioengine=pmemblk
+ *
+ * Other relevant settings:
+ *   thread=1   REQUIRED
+ *   iodepth=1
+ *   direct=1
+ *   unlink=1
+ *   filename=/mnt/pmem0/fiotestfile,BSIZE,FSIZEMiB
+ *
+ * thread must be set to 1 for pmemblk as multiple processes cannot
+ * open the same block pool file.
+ *
+ * iodepth should be set to 1 as pmemblk is always synchronous.
+ * Use numjobs to scale up.
+ *
+ * direct=1 is implied as pmemblk is always direct. A warning message
+ * is printed if this is not specified.
+ *
+ * unlink=1 removes the block pool file after testing, and is optional.
+ *
+ * The pmem device must have a DAX-capable filesystem and be mounted
+ * with DAX enabled. filename must point to a file on that filesystem.
+ *
+ * Example:
+ *   mkfs.xfs /dev/pmem0
+ *   mkdir /mnt/pmem0
+ *   mount -o dax /dev/pmem0 /mnt/pmem0
+ *
+ * When specifying the filename, if the block pool file does not already
+ * exist, then the pmemblk engine creates the pool file if you specify
+ * the block and file sizes. BSIZE is the block size in bytes.
+ * FSIZEMiB is the pool file size in MiB.
+ *
+ * See examples/pmemblk.fio for more.
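+ *
+ * As a concrete illustration (hypothetical values, not taken from
+ * examples/pmemblk.fio), a minimal job could look like:
+ *
+ *   [pmemblk-write]
+ *   ioengine=pmemblk
+ *   thread=1
+ *   iodepth=1
+ *   rw=write
+ *   bs=4k
+ *   filename=/mnt/pmem0/fiotestfile,4096,1024
+ *
+ * i.e. 4096-byte blocks in a 1024 MiB pool, with bs kept equal to
+ * BSIZE so every transfer is block aligned.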
+ *
+ */
+
+#include <sys/uio.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <unistd.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <libpmem.h>
+#include <libpmemblk.h>
+
+#include "../fio.h"
+
+/*
+ * libpmemblk
+ */
+typedef struct fio_pmemblk_file *fio_pmemblk_file_t;
+
+struct fio_pmemblk_file {
+	fio_pmemblk_file_t pmb_next;
+	char *pmb_filename;
+	uint64_t pmb_refcnt;
+	PMEMblkpool *pmb_pool;
+	size_t pmb_bsize;
+	size_t pmb_nblocks;
+};
+
+static fio_pmemblk_file_t Cache;
+
+static pthread_mutex_t CacheLock = PTHREAD_MUTEX_INITIALIZER;
+
+#define PMB_CREATE (0x0001)	/* should create file */
+
+fio_pmemblk_file_t fio_pmemblk_cache_lookup(const char *filename)
+{
+	fio_pmemblk_file_t i;
+
+	for (i = Cache; i != NULL; i = i->pmb_next)
+		if (!strcmp(filename, i->pmb_filename))
+			return i;
+
+	return NULL;
+}
+
+static void fio_pmemblk_cache_insert(fio_pmemblk_file_t pmb)
+{
+	pmb->pmb_next = Cache;
+	Cache = pmb;
+}
+
+static void fio_pmemblk_cache_remove(fio_pmemblk_file_t pmb)
+{
+	fio_pmemblk_file_t i;
+
+	if (pmb == Cache) {
+		Cache = Cache->pmb_next;
+		pmb->pmb_next = NULL;
+		return;
+	}
+
+	for (i = Cache; i != NULL; i = i->pmb_next)
+		if (pmb == i->pmb_next) {
+			i->pmb_next = i->pmb_next->pmb_next;
+			pmb->pmb_next = NULL;
+			return;
+		}
+}
+
+/*
+ * to control block size and gross file size at the libpmemblk
+ * level, we allow the block size and file size to be appended
+ * to the file name:
+ *
+ *   path[,bsize,fsizemib]
+ *
+ * note that we do not use the fio option "filesize" to dictate
+ * the file size because we can only give libpmemblk the gross
+ * file size, which is different from the net or usable file
+ * size (which is probably what fio wants).
+ *
+ * the final path without the parameters is returned in ppath.
+ * the block size and file size are returned in pbsize and fsize.
+ *
+ * note that the user specifies the file size in MiB, but
+ * we return bytes from here.
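+ *
+ * for example (hypothetical values): "/mnt/pmem0/fio.pool,4096,1024"
+ * yields the path "/mnt/pmem0/fio.pool", a block size of 4096 bytes
+ * and a file size of 1024 MiB (returned as 1073741824 bytes), while a
+ * plain "/mnt/pmem0/fio.pool" leaves both sizes at zero.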
+ */ +static void pmb_parse_path(const char *pathspec, char **ppath, uint64_t *pbsize, + uint64_t *pfsize) +{ + char *path; + char *s; + uint64_t bsize; + uint64_t fsizemib; + + path = strdup(pathspec); + if (!path) { + *ppath = NULL; + return; + } + + /* extract sizes, if given */ + s = strrchr(path, ','); + if (s && (fsizemib = strtoull(s + 1, NULL, 10))) { + *s = 0; + s = strrchr(path, ','); + if (s && (bsize = strtoull(s + 1, NULL, 10))) { + *s = 0; + *ppath = path; + *pbsize = bsize; + *pfsize = fsizemib << 20; + return; + } + } + + /* size specs not found */ + strcpy(path, pathspec); + *ppath = path; + *pbsize = 0; + *pfsize = 0; +} + +static fio_pmemblk_file_t pmb_open(const char *pathspec, int flags) +{ + fio_pmemblk_file_t pmb; + char *path = NULL; + uint64_t bsize = 0; + uint64_t fsize = 0; + + pmb_parse_path(pathspec, &path, &bsize, &fsize); + if (!path) + return NULL; + + pthread_mutex_lock(&CacheLock); + + pmb = fio_pmemblk_cache_lookup(path); + if (!pmb) { + pmb = malloc(sizeof(*pmb)); + if (!pmb) + goto error; + + /* try opening existing first, create it if needed */ + pmb->pmb_pool = pmemblk_open(path, bsize); + if (!pmb->pmb_pool && (errno == ENOENT) && + (flags & PMB_CREATE) && (0 < fsize) && (0 < bsize)) { + pmb->pmb_pool = + pmemblk_create(path, bsize, fsize, 0644); + } + if (!pmb->pmb_pool) { + log_err("pmemblk: unable to open pmemblk pool file %s (%s)\n", + path, strerror(errno)); + goto error; + } + + pmb->pmb_filename = path; + pmb->pmb_next = NULL; + pmb->pmb_refcnt = 0; + pmb->pmb_bsize = pmemblk_bsize(pmb->pmb_pool); + pmb->pmb_nblocks = pmemblk_nblock(pmb->pmb_pool); + + fio_pmemblk_cache_insert(pmb); + } + + pmb->pmb_refcnt += 1; + + pthread_mutex_unlock(&CacheLock); + + free(path); + + return pmb; + +error: + if (pmb) { + if (pmb->pmb_pool) + pmemblk_close(pmb->pmb_pool); + pmb->pmb_pool = NULL; + pmb->pmb_filename = NULL; + free(pmb); + } + if (path) + free(path); + + pthread_mutex_unlock(&CacheLock); + return NULL; +} + +static void pmb_close(fio_pmemblk_file_t pmb, const bool keep) +{ + pthread_mutex_lock(&CacheLock); + + pmb->pmb_refcnt--; + + if (!keep && !pmb->pmb_refcnt) { + pmemblk_close(pmb->pmb_pool); + pmb->pmb_pool = NULL; + free(pmb->pmb_filename); + pmb->pmb_filename = NULL; + fio_pmemblk_cache_remove(pmb); + free(pmb); + } + + pthread_mutex_unlock(&CacheLock); +} + +static int pmb_get_flags(struct thread_data *td, uint64_t *pflags) +{ + static int thread_warned = 0; + static int odirect_warned = 0; + + uint64_t flags = 0; + + if (!td->o.use_thread) { + if (!thread_warned) { + thread_warned = 1; + log_err("pmemblk: must set thread=1 for pmemblk engine\n"); + } + return 1; + } + + if (!td->o.odirect && !odirect_warned) { + odirect_warned = 1; + log_info("pmemblk: direct == 0, but pmemblk is always direct\n"); + } + + if (td->o.allow_create) + flags |= PMB_CREATE; + + (*pflags) = flags; + return 0; +} + +static int fio_pmemblk_open_file(struct thread_data *td, struct fio_file *f) +{ + uint64_t flags = 0; + fio_pmemblk_file_t pmb; + + if (pmb_get_flags(td, &flags)) + return 1; + + pmb = pmb_open(f->file_name, flags); + if (!pmb) + return 1; + + FILE_SET_ENG_DATA(f, pmb); + return 0; +} + +static int fio_pmemblk_close_file(struct thread_data fio_unused *td, + struct fio_file *f) +{ + fio_pmemblk_file_t pmb = FILE_ENG_DATA(f); + + if (pmb) + pmb_close(pmb, false); + + FILE_SET_ENG_DATA(f, NULL); + return 0; +} + +static int fio_pmemblk_get_file_size(struct thread_data *td, struct fio_file *f) +{ + uint64_t flags = 0; + fio_pmemblk_file_t pmb = 
FILE_ENG_DATA(f);
+
+	if (fio_file_size_known(f))
+		return 0;
+
+	if (!pmb) {
+		if (pmb_get_flags(td, &flags))
+			return 1;
+		pmb = pmb_open(f->file_name, flags);
+		if (!pmb)
+			return 1;
+	}
+
+	f->real_file_size = pmb->pmb_bsize * pmb->pmb_nblocks;
+
+	fio_file_set_size_known(f);
+
+	if (!FILE_ENG_DATA(f))
+		pmb_close(pmb, true);
+
+	return 0;
+}
+
+static enum fio_q_status fio_pmemblk_queue(struct thread_data *td,
+					   struct io_u *io_u)
+{
+	struct fio_file *f = io_u->file;
+	fio_pmemblk_file_t pmb = FILE_ENG_DATA(f);
+
+	unsigned long long off;
+	unsigned long len;
+	void *buf;
+
+	fio_ro_check(td, io_u);
+
+	switch (io_u->ddir) {
+	case DDIR_READ:
+	case DDIR_WRITE:
+		off = io_u->offset;
+		len = io_u->xfer_buflen;
+
+		io_u->error = EINVAL;
+		if (off % pmb->pmb_bsize)
+			break;
+		if (len % pmb->pmb_bsize)
+			break;
+		if ((off + len) / pmb->pmb_bsize > pmb->pmb_nblocks)
+			break;
+
+		io_u->error = 0;
+		buf = io_u->xfer_buf;
+		off /= pmb->pmb_bsize;
+		len /= pmb->pmb_bsize;
+		while (0 < len) {
+			if (io_u->ddir == DDIR_READ) {
+				if (0 != pmemblk_read(pmb->pmb_pool, buf, off)) {
+					io_u->error = errno;
+					break;
+				}
+			} else if (0 != pmemblk_write(pmb->pmb_pool, buf, off)) {
+				io_u->error = errno;
+				break;
+			}
+			buf += pmb->pmb_bsize;
+			off++;
+			len--;
+		}
+		off *= pmb->pmb_bsize;
+		len *= pmb->pmb_bsize;
+		io_u->resid = io_u->xfer_buflen - (off - io_u->offset);
+		break;
+	case DDIR_SYNC:
+	case DDIR_DATASYNC:
+	case DDIR_SYNC_FILE_RANGE:
+		/* we're always sync'd */
+		io_u->error = 0;
+		break;
+	default:
+		io_u->error = EINVAL;
+		break;
+	}
+
+	return FIO_Q_COMPLETED;
+}
+
+static int fio_pmemblk_unlink_file(struct thread_data *td, struct fio_file *f)
+{
+	char *path = NULL;
+	uint64_t bsize = 0;
+	uint64_t fsize = 0;
+
+	/*
+	 * we need our own unlink in case the user has specified
+	 * the block and file sizes in the path name. we parse
+	 * the file_name to determine the file name we actually used.
+	 */
+
+	pmb_parse_path(f->file_name, &path, &bsize, &fsize);
+	if (!path)
+		return ENOENT;
+
+	unlink(path);
+	free(path);
+	return 0;
+}
+
+static struct ioengine_ops ioengine = {
+	.name		= "pmemblk",
+	.version	= FIO_IOOPS_VERSION,
+	.queue		= fio_pmemblk_queue,
+	.open_file	= fio_pmemblk_open_file,
+	.close_file	= fio_pmemblk_close_file,
+	.get_file_size	= fio_pmemblk_get_file_size,
+	.unlink_file	= fio_pmemblk_unlink_file,
+	.flags		= FIO_SYNCIO | FIO_DISKLESSIO | FIO_NOEXTEND | FIO_NODISKUTIL,
+};
+
+static void fio_init fio_pmemblk_register(void)
+{
+	register_ioengine(&ioengine);
+}
+
+static void fio_exit fio_pmemblk_unregister(void)
+{
+	unregister_ioengine(&ioengine);
+}
diff --git a/engines/posixaio.c b/engines/posixaio.c
new file mode 100644
index 0000000..82c6aa6
--- /dev/null
+++ b/engines/posixaio.c
@@ -0,0 +1,267 @@
+/*
+ * posixaio engine
+ *
+ * IO engine that uses the posix defined aio interface.
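+ *
+ * Usage sketch (illustrative, not from the fio examples): selecting
+ * ioengine=posixaio with iodepth greater than 1 exercises the
+ * aio_read()/aio_write() submission paths and the aio_suspend()
+ * reaping loop implemented below.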
+ *
+ */
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <errno.h>
+#include <fcntl.h>
+
+#include "../fio.h"
+
+struct posixaio_data {
+	struct io_u **aio_events;
+	unsigned int queued;
+};
+
+static int fill_timespec(struct timespec *ts)
+{
+#ifdef CONFIG_CLOCK_GETTIME
+#ifdef CONFIG_CLOCK_MONOTONIC
+	clockid_t clk = CLOCK_MONOTONIC;
+#else
+	clockid_t clk = CLOCK_REALTIME;
+#endif
+	if (!clock_gettime(clk, ts))
+		return 0;
+
+	perror("clock_gettime");
+	return 1;
+#else
+	struct timeval tv;
+
+	gettimeofday(&tv, NULL);
+	ts->tv_sec = tv.tv_sec;
+	ts->tv_nsec = tv.tv_usec * 1000;
+	return 0;
+#endif
+}
+
+static unsigned long long ts_utime_since_now(struct timespec *t)
+{
+	long long sec, nsec;
+	struct timespec now;
+
+	if (fill_timespec(&now))
+		return 0;
+
+	sec = now.tv_sec - t->tv_sec;
+	nsec = now.tv_nsec - t->tv_nsec;
+	if (sec > 0 && nsec < 0) {
+		sec--;
+		nsec += 1000000000;
+	}
+
+	sec *= 1000000;
+	nsec /= 1000;
+	return sec + nsec;
+}
+
+static int fio_posixaio_cancel(struct thread_data fio_unused *td,
+			       struct io_u *io_u)
+{
+	struct fio_file *f = io_u->file;
+	int r = aio_cancel(f->fd, &io_u->aiocb);
+
+	if (r == AIO_ALLDONE || r == AIO_CANCELED)
+		return 0;
+
+	return 1;
+}
+
+static int fio_posixaio_prep(struct thread_data fio_unused *td,
+			     struct io_u *io_u)
+{
+	os_aiocb_t *aiocb = &io_u->aiocb;
+	struct fio_file *f = io_u->file;
+
+	aiocb->aio_fildes = f->fd;
+	aiocb->aio_buf = io_u->xfer_buf;
+	aiocb->aio_nbytes = io_u->xfer_buflen;
+	aiocb->aio_offset = io_u->offset;
+	aiocb->aio_sigevent.sigev_notify = SIGEV_NONE;
+
+	io_u->seen = 0;
+	return 0;
+}
+
+#define SUSPEND_ENTRIES	8
+
+static int fio_posixaio_getevents(struct thread_data *td, unsigned int min,
+				  unsigned int max, const struct timespec *t)
+{
+	struct posixaio_data *pd = td->io_ops_data;
+	os_aiocb_t *suspend_list[SUSPEND_ENTRIES];
+	struct timespec start;
+	int have_timeout = 0;
+	int suspend_entries;
+	struct io_u *io_u;
+	unsigned int r;
+	int i;
+
+	if (t && !fill_timespec(&start))
+		have_timeout = 1;
+	else
+		memset(&start, 0, sizeof(start));
+
+	r = 0;
+restart:
+	memset(suspend_list, 0, sizeof(suspend_list));
+	suspend_entries = 0;
+	io_u_qiter(&td->io_u_all, io_u, i) {
+		int err;
+
+		if (io_u->seen || !(io_u->flags & IO_U_F_FLIGHT))
+			continue;
+
+		err = aio_error(&io_u->aiocb);
+		if (err == EINPROGRESS) {
+			if (suspend_entries < SUSPEND_ENTRIES) {
+				suspend_list[suspend_entries] = &io_u->aiocb;
+				suspend_entries++;
+			}
+			continue;
+		}
+
+		io_u->seen = 1;
+		pd->queued--;
+		pd->aio_events[r++] = io_u;
+
+		if (err == ECANCELED)
+			io_u->resid = io_u->xfer_buflen;
+		else if (!err) {
+			ssize_t retval = aio_return(&io_u->aiocb);
+
+			io_u->resid = io_u->xfer_buflen - retval;
+		} else
+			io_u->error = err;
+	}
+
+	if (r >= min)
+		return r;
+
+	if (have_timeout) {
+		unsigned long long usec;
+
+		usec = (t->tv_sec * 1000000) + (t->tv_nsec / 1000);
+		if (ts_utime_since_now(&start) > usec)
+			return r;
+	}
+
+	/*
+	 * must have some in-flight, wait for at least one
+	 */
+	aio_suspend((const os_aiocb_t * const *)suspend_list,
+			suspend_entries, t);
+	goto restart;
+}
+
+static struct io_u *fio_posixaio_event(struct thread_data *td, int event)
+{
+	struct posixaio_data *pd = td->io_ops_data;
+
+	return pd->aio_events[event];
+}
+
+static enum fio_q_status fio_posixaio_queue(struct thread_data *td,
+					    struct io_u *io_u)
+{
+	struct posixaio_data *pd = td->io_ops_data;
+	os_aiocb_t *aiocb = &io_u->aiocb;
+	int ret;
+
+	fio_ro_check(td, io_u);
+
+	if (io_u->ddir == DDIR_READ)
+		ret = aio_read(aiocb);
+	else if (io_u->ddir == DDIR_WRITE)
+		ret =
aio_write(aiocb); + else if (io_u->ddir == DDIR_TRIM) { + if (pd->queued) + return FIO_Q_BUSY; + + do_io_u_trim(td, io_u); + return FIO_Q_COMPLETED; + } else { +#ifdef CONFIG_POSIXAIO_FSYNC + ret = aio_fsync(O_SYNC, aiocb); +#else + if (pd->queued) + return FIO_Q_BUSY; + + do_io_u_sync(td, io_u); + return FIO_Q_COMPLETED; +#endif + } + + if (ret) { + int aio_err = errno; + + /* + * At least OSX has a very low limit on the number of pending + * IOs, so if it returns EAGAIN, we are out of resources + * to queue more. Just return FIO_Q_BUSY to naturally + * drop off at this depth. + */ + if (aio_err == EAGAIN) + return FIO_Q_BUSY; + + io_u->error = aio_err; + td_verror(td, io_u->error, "xfer"); + return FIO_Q_COMPLETED; + } + + pd->queued++; + return FIO_Q_QUEUED; +} + +static void fio_posixaio_cleanup(struct thread_data *td) +{ + struct posixaio_data *pd = td->io_ops_data; + + if (pd) { + free(pd->aio_events); + free(pd); + } +} + +static int fio_posixaio_init(struct thread_data *td) +{ + struct posixaio_data *pd = malloc(sizeof(*pd)); + + memset(pd, 0, sizeof(*pd)); + pd->aio_events = malloc(td->o.iodepth * sizeof(struct io_u *)); + memset(pd->aio_events, 0, td->o.iodepth * sizeof(struct io_u *)); + + td->io_ops_data = pd; + return 0; +} + +static struct ioengine_ops ioengine = { + .name = "posixaio", + .version = FIO_IOOPS_VERSION, + .flags = FIO_ASYNCIO_SYNC_TRIM, + .init = fio_posixaio_init, + .prep = fio_posixaio_prep, + .queue = fio_posixaio_queue, + .cancel = fio_posixaio_cancel, + .getevents = fio_posixaio_getevents, + .event = fio_posixaio_event, + .cleanup = fio_posixaio_cleanup, + .open_file = generic_open_file, + .close_file = generic_close_file, + .get_file_size = generic_get_file_size, +}; + +static void fio_init fio_posixaio_register(void) +{ + register_ioengine(&ioengine); +} + +static void fio_exit fio_posixaio_unregister(void) +{ + unregister_ioengine(&ioengine); +} diff --git a/engines/rados.c b/engines/rados.c new file mode 100644 index 0000000..30fcebb --- /dev/null +++ b/engines/rados.c @@ -0,0 +1,461 @@ +/* + * Ceph Rados engine + * + * IO engine using Ceph's RADOS interface to test low-level performance of + * Ceph OSDs. 
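+ *
+ * Example invocation (illustrative values; assumes a reachable Ceph
+ * cluster and an existing pool):
+ *
+ *   fio --name=radostest --ioengine=rados --clientname=admin \
+ *       --pool=mypool --rw=write --bs=4m --size=256m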
+ *
+ */
+
+#include <rados/librados.h>
+#include <pthread.h>
+#include "fio.h"
+#include "../optgroup.h"
+
+struct rados_data {
+	rados_t cluster;
+	rados_ioctx_t io_ctx;
+	struct io_u **aio_events;
+	bool connected;
+	pthread_mutex_t completed_lock;
+	pthread_cond_t completed_more_io;
+	struct flist_head completed_operations;
+};
+
+struct fio_rados_iou {
+	struct flist_head list;
+	struct thread_data *td;
+	struct io_u *io_u;
+	rados_completion_t completion;
+	rados_write_op_t write_op;
+};
+
+/* fio configuration options read from the job file */
+struct rados_options {
+	void *pad;
+	char *cluster_name;
+	char *pool_name;
+	char *client_name;
+	int busy_poll;
+};
+
+static struct fio_option options[] = {
+	{
+		.name     = "clustername",
+		.lname    = "ceph cluster name",
+		.type     = FIO_OPT_STR_STORE,
+		.help     = "Cluster name for ceph",
+		.off1     = offsetof(struct rados_options, cluster_name),
+		.category = FIO_OPT_C_ENGINE,
+		.group    = FIO_OPT_G_RBD,
+	},
+	{
+		.name     = "pool",
+		.lname    = "pool name to use",
+		.type     = FIO_OPT_STR_STORE,
+		.help     = "Ceph pool name to benchmark against",
+		.off1     = offsetof(struct rados_options, pool_name),
+		.category = FIO_OPT_C_ENGINE,
+		.group    = FIO_OPT_G_RBD,
+	},
+	{
+		.name     = "clientname",
+		.lname    = "rados engine clientname",
+		.type     = FIO_OPT_STR_STORE,
+		.help     = "Name of the ceph client to access RADOS engine",
+		.off1     = offsetof(struct rados_options, client_name),
+		.category = FIO_OPT_C_ENGINE,
+		.group    = FIO_OPT_G_RBD,
+	},
+	{
+		.name     = "busy_poll",
+		.lname    = "busy poll mode",
+		.type     = FIO_OPT_BOOL,
+		.help     = "Busy poll for completions instead of sleeping",
+		.off1     = offsetof(struct rados_options, busy_poll),
+		.def      = "0",
+		.category = FIO_OPT_C_ENGINE,
+		.group    = FIO_OPT_G_RBD,
+	},
+	{
+		.name = NULL,
+	},
+};
+
+static int _fio_setup_rados_data(struct thread_data *td,
+				 struct rados_data **rados_data_ptr)
+{
+	struct rados_data *rados;
+
+	if (td->io_ops_data)
+		return 0;
+
+	rados = calloc(1, sizeof(struct rados_data));
+	if (!rados)
+		goto failed;
+
+	rados->connected = false;
+
+	rados->aio_events = calloc(td->o.iodepth, sizeof(struct io_u *));
+	if (!rados->aio_events)
+		goto failed;
+	pthread_mutex_init(&rados->completed_lock, NULL);
+	pthread_cond_init(&rados->completed_more_io, NULL);
+	INIT_FLIST_HEAD(&rados->completed_operations);
+	*rados_data_ptr = rados;
+	return 0;
+
+failed:
+	if (rados) {
+		if (rados->aio_events)
+			free(rados->aio_events);
+		free(rados);
+	}
+	return 1;
+}
+
+static void _fio_rados_rm_objects(struct thread_data *td, struct rados_data *rados)
+{
+	size_t i;
+	for (i = 0; i < td->o.nr_files; i++) {
+		struct fio_file *f = td->files[i];
+		rados_remove(rados->io_ctx, f->file_name);
+	}
+}
+
+static int _fio_rados_connect(struct thread_data *td)
+{
+	struct rados_data *rados = td->io_ops_data;
+	struct rados_options *o = td->eo;
+	int r;
+	const uint64_t file_size =
+		td->o.size / (td->o.nr_files ? td->o.nr_files : 1u);
+	struct fio_file *f;
+	uint32_t i;
+
+	if (o->cluster_name) {
+		char *client_name = NULL;
+
+		/*
+		 * If we specify a cluster name, the rados_create2
+		 * will not assume 'client.'. The
name is considered + * as a full type.id namestr + */ + if (o->client_name) { + if (!index(o->client_name, '.')) { + client_name = calloc(1, strlen("client.") + + strlen(o->client_name) + 1); + strcat(client_name, "client."); + strcat(client_name, o->client_name); + } else { + client_name = o->client_name; + } + } + + r = rados_create2(&rados->cluster, o->cluster_name, + client_name, 0); + + if (client_name && !index(o->client_name, '.')) + free(client_name); + } else + r = rados_create(&rados->cluster, o->client_name); + + if (o->pool_name == NULL) { + log_err("rados pool name must be provided.\n"); + goto failed_early; + } + + if (r < 0) { + log_err("rados_create failed.\n"); + goto failed_early; + } + + r = rados_conf_read_file(rados->cluster, NULL); + if (r < 0) { + log_err("rados_conf_read_file failed.\n"); + goto failed_early; + } + + r = rados_connect(rados->cluster); + if (r < 0) { + log_err("rados_connect failed.\n"); + goto failed_early; + } + + r = rados_ioctx_create(rados->cluster, o->pool_name, &rados->io_ctx); + if (r < 0) { + log_err("rados_ioctx_create failed.\n"); + goto failed_shutdown; + } + + for (i = 0; i < td->o.nr_files; i++) { + f = td->files[i]; + f->real_file_size = file_size; + r = rados_write(rados->io_ctx, f->file_name, "", 0, 0); + if (r < 0) { + goto failed_obj_create; + } + } + return 0; + +failed_obj_create: + _fio_rados_rm_objects(td, rados); + rados_ioctx_destroy(rados->io_ctx); + rados->io_ctx = NULL; +failed_shutdown: + rados_shutdown(rados->cluster); + rados->cluster = NULL; +failed_early: + return 1; +} + +static void _fio_rados_disconnect(struct rados_data *rados) +{ + if (!rados) + return; + + if (rados->io_ctx) { + rados_ioctx_destroy(rados->io_ctx); + rados->io_ctx = NULL; + } + + if (rados->cluster) { + rados_shutdown(rados->cluster); + rados->cluster = NULL; + } +} + +static void fio_rados_cleanup(struct thread_data *td) +{ + struct rados_data *rados = td->io_ops_data; + + if (rados) { + _fio_rados_rm_objects(td, rados); + _fio_rados_disconnect(rados); + free(rados->aio_events); + free(rados); + } +} + +static void complete_callback(rados_completion_t cb, void *arg) +{ + struct fio_rados_iou *fri = (struct fio_rados_iou *)arg; + struct rados_data *rados = fri->td->io_ops_data; + assert(fri->completion); + assert(rados_aio_is_complete(fri->completion)); + pthread_mutex_lock(&rados->completed_lock); + flist_add_tail(&fri->list, &rados->completed_operations); + pthread_mutex_unlock(&rados->completed_lock); + pthread_cond_signal(&rados->completed_more_io); +} + +static enum fio_q_status fio_rados_queue(struct thread_data *td, + struct io_u *io_u) +{ + struct rados_data *rados = td->io_ops_data; + struct fio_rados_iou *fri = io_u->engine_data; + char *object = io_u->file->file_name; + int r = -1; + + fio_ro_check(td, io_u); + + if (io_u->ddir == DDIR_WRITE) { + r = rados_aio_create_completion(fri, complete_callback, + NULL, &fri->completion); + if (r < 0) { + log_err("rados_aio_create_completion failed.\n"); + goto failed; + } + + r = rados_aio_write(rados->io_ctx, object, fri->completion, + io_u->xfer_buf, io_u->xfer_buflen, io_u->offset); + if (r < 0) { + log_err("rados_write failed.\n"); + goto failed_comp; + } + return FIO_Q_QUEUED; + } else if (io_u->ddir == DDIR_READ) { + r = rados_aio_create_completion(fri, complete_callback, + NULL, &fri->completion); + if (r < 0) { + log_err("rados_aio_create_completion failed.\n"); + goto failed; + } + r = rados_aio_read(rados->io_ctx, object, fri->completion, + io_u->xfer_buf, io_u->xfer_buflen, 
io_u->offset); + if (r < 0) { + log_err("rados_aio_read failed.\n"); + goto failed_comp; + } + return FIO_Q_QUEUED; + } else if (io_u->ddir == DDIR_TRIM) { + r = rados_aio_create_completion(fri, complete_callback, + NULL , &fri->completion); + if (r < 0) { + log_err("rados_aio_create_completion failed.\n"); + goto failed; + } + fri->write_op = rados_create_write_op(); + if (fri->write_op == NULL) { + log_err("rados_create_write_op failed.\n"); + goto failed_comp; + } + rados_write_op_zero(fri->write_op, io_u->offset, + io_u->xfer_buflen); + r = rados_aio_write_op_operate(fri->write_op, rados->io_ctx, + fri->completion, object, NULL, 0); + if (r < 0) { + log_err("rados_aio_write_op_operate failed.\n"); + goto failed_write_op; + } + return FIO_Q_QUEUED; + } + + log_err("WARNING: Only DDIR_READ, DDIR_WRITE and DDIR_TRIM are supported!"); + +failed_write_op: + rados_release_write_op(fri->write_op); +failed_comp: + rados_aio_release(fri->completion); +failed: + io_u->error = -r; + td_verror(td, io_u->error, "xfer"); + return FIO_Q_COMPLETED; +} + +static struct io_u *fio_rados_event(struct thread_data *td, int event) +{ + struct rados_data *rados = td->io_ops_data; + return rados->aio_events[event]; +} + +int fio_rados_getevents(struct thread_data *td, unsigned int min, + unsigned int max, const struct timespec *t) +{ + struct rados_data *rados = td->io_ops_data; + unsigned int events = 0; + struct fio_rados_iou *fri; + + pthread_mutex_lock(&rados->completed_lock); + while (events < min) { + while (flist_empty(&rados->completed_operations)) { + pthread_cond_wait(&rados->completed_more_io, &rados->completed_lock); + } + assert(!flist_empty(&rados->completed_operations)); + + fri = flist_first_entry(&rados->completed_operations, struct fio_rados_iou, list); + assert(fri->completion); + assert(rados_aio_is_complete(fri->completion)); + if (fri->write_op != NULL) { + rados_release_write_op(fri->write_op); + fri->write_op = NULL; + } + rados_aio_release(fri->completion); + fri->completion = NULL; + + rados->aio_events[events] = fri->io_u; + events ++; + flist_del(&fri->list); + if (events >= max) break; + } + pthread_mutex_unlock(&rados->completed_lock); + return events; +} + +static int fio_rados_setup(struct thread_data *td) +{ + struct rados_data *rados = NULL; + int r; + /* allocate engine specific structure to deal with librados. */ + r = _fio_setup_rados_data(td, &rados); + if (r) { + log_err("fio_setup_rados_data failed.\n"); + goto cleanup; + } + td->io_ops_data = rados; + + /* Force single process mode. + */ + td->o.use_thread = 1; + + /* connect in the main thread to determine to determine + * the size of the given RADOS block device. And disconnect + * later on. + */ + r = _fio_rados_connect(td); + if (r) { + log_err("fio_rados_connect failed.\n"); + goto cleanup; + } + rados->connected = true; + + return 0; +cleanup: + fio_rados_cleanup(td); + return r; +} + +/* open/invalidate are noops. 
we set the FIO_DISKLESSIO flag in ioengine_ops to + prevent fio from creating the files +*/ +static int fio_rados_open(struct thread_data *td, struct fio_file *f) +{ + return 0; +} +static int fio_rados_invalidate(struct thread_data *td, struct fio_file *f) +{ + return 0; +} + +static void fio_rados_io_u_free(struct thread_data *td, struct io_u *io_u) +{ + struct fio_rados_iou *fri = io_u->engine_data; + + if (fri) { + io_u->engine_data = NULL; + fri->td = NULL; + if (fri->completion) + rados_aio_release(fri->completion); + if (fri->write_op) + rados_release_write_op(fri->write_op); + free(fri); + } +} + +static int fio_rados_io_u_init(struct thread_data *td, struct io_u *io_u) +{ + struct fio_rados_iou *fri; + fri = calloc(1, sizeof(*fri)); + fri->io_u = io_u; + fri->td = td; + INIT_FLIST_HEAD(&fri->list); + io_u->engine_data = fri; + return 0; +} + +/* ioengine_ops for get_ioengine() */ +static struct ioengine_ops ioengine = { + .name = "rados", + .version = FIO_IOOPS_VERSION, + .flags = FIO_DISKLESSIO, + .setup = fio_rados_setup, + .queue = fio_rados_queue, + .getevents = fio_rados_getevents, + .event = fio_rados_event, + .cleanup = fio_rados_cleanup, + .open_file = fio_rados_open, + .invalidate = fio_rados_invalidate, + .options = options, + .io_u_init = fio_rados_io_u_init, + .io_u_free = fio_rados_io_u_free, + .option_struct_size = sizeof(struct rados_options), +}; + +static void fio_init fio_rados_register(void) +{ + register_ioengine(&ioengine); +} + +static void fio_exit fio_rados_unregister(void) +{ + unregister_ioengine(&ioengine); +} diff --git a/engines/rbd.c b/engines/rbd.c new file mode 100644 index 0000000..7d4d3fa --- /dev/null +++ b/engines/rbd.c @@ -0,0 +1,688 @@ +/* + * rbd engine + * + * IO engine using Ceph's librbd to test RADOS Block Devices. 
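+ *
+ * Example invocation (illustrative values; assumes an RBD image
+ * created beforehand, e.g. with "rbd create --size 1024 rbd/fio_test"):
+ *
+ *   fio --name=rbdtest --ioengine=rbd --clientname=admin \
+ *       --pool=rbd --rbdname=fio_test --rw=randwrite --bs=4k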
+ *
+ */
+
+#include <rbd/librbd.h>
+
+#include "../fio.h"
+#include "../optgroup.h"
+
+#ifdef CONFIG_RBD_POLL
+/* add for poll */
+#include <poll.h>
+#include <sys/eventfd.h>
+#endif
+
+struct fio_rbd_iou {
+	struct io_u *io_u;
+	rbd_completion_t completion;
+	int io_seen;
+	int io_complete;
+};
+
+struct rbd_data {
+	rados_t cluster;
+	rados_ioctx_t io_ctx;
+	rbd_image_t image;
+	struct io_u **aio_events;
+	struct io_u **sort_events;
+	int fd; /* add for poll */
+	bool connected;
+};
+
+struct rbd_options {
+	void *pad;
+	char *cluster_name;
+	char *rbd_name;
+	char *pool_name;
+	char *client_name;
+	int busy_poll;
+};
+
+static struct fio_option options[] = {
+	{
+		.name     = "clustername",
+		.lname    = "ceph cluster name",
+		.type     = FIO_OPT_STR_STORE,
+		.help     = "Cluster name for ceph",
+		.off1     = offsetof(struct rbd_options, cluster_name),
+		.category = FIO_OPT_C_ENGINE,
+		.group    = FIO_OPT_G_RBD,
+	},
+	{
+		.name     = "rbdname",
+		.lname    = "rbd engine rbdname",
+		.type     = FIO_OPT_STR_STORE,
+		.help     = "RBD name for RBD engine",
+		.off1     = offsetof(struct rbd_options, rbd_name),
+		.category = FIO_OPT_C_ENGINE,
+		.group    = FIO_OPT_G_RBD,
+	},
+	{
+		.name     = "pool",
+		.lname    = "rbd engine pool",
+		.type     = FIO_OPT_STR_STORE,
+		.help     = "Name of the pool hosting the RBD for the RBD engine",
+		.off1     = offsetof(struct rbd_options, pool_name),
+		.category = FIO_OPT_C_ENGINE,
+		.group    = FIO_OPT_G_RBD,
+	},
+	{
+		.name     = "clientname",
+		.lname    = "rbd engine clientname",
+		.type     = FIO_OPT_STR_STORE,
+		.help     = "Name of the ceph client to access the RBD for the RBD engine",
+		.off1     = offsetof(struct rbd_options, client_name),
+		.category = FIO_OPT_C_ENGINE,
+		.group    = FIO_OPT_G_RBD,
+	},
+	{
+		.name     = "busy_poll",
+		.lname    = "Busy poll",
+		.type     = FIO_OPT_BOOL,
+		.help     = "Busy poll for completions instead of sleeping",
+		.off1     = offsetof(struct rbd_options, busy_poll),
+		.def      = "0",
+		.category = FIO_OPT_C_ENGINE,
+		.group    = FIO_OPT_G_RBD,
+	},
+	{
+		.name = NULL,
+	},
+};
+
+static int _fio_setup_rbd_data(struct thread_data *td,
+			       struct rbd_data **rbd_data_ptr)
+{
+	struct rbd_data *rbd;
+
+	if (td->io_ops_data)
+		return 0;
+
+	rbd = calloc(1, sizeof(struct rbd_data));
+	if (!rbd)
+		goto failed;
+
+	rbd->connected = false;
+
+	/* add for poll, init fd: -1 */
+	rbd->fd = -1;
+
+	rbd->aio_events = calloc(td->o.iodepth, sizeof(struct io_u *));
+	if (!rbd->aio_events)
+		goto failed;
+
+	rbd->sort_events = calloc(td->o.iodepth, sizeof(struct io_u *));
+	if (!rbd->sort_events)
+		goto failed;
+
+	*rbd_data_ptr = rbd;
+	return 0;
+
+failed:
+	if (rbd) {
+		if (rbd->aio_events)
+			free(rbd->aio_events);
+		if (rbd->sort_events)
+			free(rbd->sort_events);
+		free(rbd);
+	}
+	return 1;
+
+}
+
+#ifdef CONFIG_RBD_POLL
+static bool _fio_rbd_setup_poll(struct rbd_data *rbd)
+{
+	int r;
+
+	/* add for rbd poll */
+	rbd->fd = eventfd(0, EFD_SEMAPHORE);
+	if (rbd->fd < 0) {
+		log_err("eventfd failed.\n");
+		return false;
+	}
+
+	r = rbd_set_image_notification(rbd->image, rbd->fd, EVENT_TYPE_EVENTFD);
+	if (r < 0) {
+		log_err("rbd_set_image_notification failed.\n");
+		close(rbd->fd);
+		rbd->fd = -1;
+		return false;
+	}
+
+	return true;
+}
+#else
+static bool _fio_rbd_setup_poll(struct rbd_data *rbd)
+{
+	return true;
+}
+#endif
+
+static int _fio_rbd_connect(struct thread_data *td)
+{
+	struct rbd_data *rbd = td->io_ops_data;
+	struct rbd_options *o = td->eo;
+	int r;
+
+	if (o->cluster_name) {
+		char *client_name = NULL;
+
+		/*
+		 * If we specify a cluster name, the rados_create2
+		 * will not assume 'client.'. The
name is considered + * as a full type.id namestr + */ + if (o->client_name) { + if (!index(o->client_name, '.')) { + client_name = calloc(1, strlen("client.") + + strlen(o->client_name) + 1); + strcat(client_name, "client."); + strcat(client_name, o->client_name); + } else { + client_name = o->client_name; + } + } + + r = rados_create2(&rbd->cluster, o->cluster_name, + client_name, 0); + + if (client_name && !index(o->client_name, '.')) + free(client_name); + } else + r = rados_create(&rbd->cluster, o->client_name); + + if (r < 0) { + log_err("rados_create failed.\n"); + goto failed_early; + } + + r = rados_conf_read_file(rbd->cluster, NULL); + if (r < 0) { + log_err("rados_conf_read_file failed.\n"); + goto failed_early; + } + + r = rados_connect(rbd->cluster); + if (r < 0) { + log_err("rados_connect failed.\n"); + goto failed_shutdown; + } + + r = rados_ioctx_create(rbd->cluster, o->pool_name, &rbd->io_ctx); + if (r < 0) { + log_err("rados_ioctx_create failed.\n"); + goto failed_shutdown; + } + + r = rbd_open(rbd->io_ctx, o->rbd_name, &rbd->image, NULL /*snap */ ); + if (r < 0) { + log_err("rbd_open failed.\n"); + goto failed_open; + } + + if (!_fio_rbd_setup_poll(rbd)) + goto failed_poll; + + return 0; + +failed_poll: + rbd_close(rbd->image); + rbd->image = NULL; +failed_open: + rados_ioctx_destroy(rbd->io_ctx); + rbd->io_ctx = NULL; +failed_shutdown: + rados_shutdown(rbd->cluster); + rbd->cluster = NULL; +failed_early: + return 1; +} + +static void _fio_rbd_disconnect(struct rbd_data *rbd) +{ + if (!rbd) + return; + + /* close eventfd */ + if (rbd->fd != -1) { + close(rbd->fd); + rbd->fd = -1; + } + + /* shutdown everything */ + if (rbd->image) { + rbd_close(rbd->image); + rbd->image = NULL; + } + + if (rbd->io_ctx) { + rados_ioctx_destroy(rbd->io_ctx); + rbd->io_ctx = NULL; + } + + if (rbd->cluster) { + rados_shutdown(rbd->cluster); + rbd->cluster = NULL; + } +} + +static void _fio_rbd_finish_aiocb(rbd_completion_t comp, void *data) +{ + struct fio_rbd_iou *fri = data; + struct io_u *io_u = fri->io_u; + ssize_t ret; + + /* + * Looks like return value is 0 for success, or < 0 for + * a specific error. So we have to assume that it can't do + * partial completions. 
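+	 * In other words, the code below only distinguishes "fully done"
+	 * from "failed": on error the whole transfer is counted as
+	 * residual, never a partial byte count.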
+ */ + ret = rbd_aio_get_return_value(fri->completion); + if (ret < 0) { + io_u->error = -ret; + io_u->resid = io_u->xfer_buflen; + } else + io_u->error = 0; + + fri->io_complete = 1; +} + +static struct io_u *fio_rbd_event(struct thread_data *td, int event) +{ + struct rbd_data *rbd = td->io_ops_data; + + return rbd->aio_events[event]; +} + +static inline int fri_check_complete(struct rbd_data *rbd, struct io_u *io_u, + unsigned int *events) +{ + struct fio_rbd_iou *fri = io_u->engine_data; + + if (fri->io_complete) { + fri->io_seen = 1; + rbd->aio_events[*events] = io_u; + (*events)++; + + rbd_aio_release(fri->completion); + return 1; + } + + return 0; +} + +#ifndef CONFIG_RBD_POLL +static inline int rbd_io_u_seen(struct io_u *io_u) +{ + struct fio_rbd_iou *fri = io_u->engine_data; + + return fri->io_seen; +} +#endif + +static void rbd_io_u_wait_complete(struct io_u *io_u) +{ + struct fio_rbd_iou *fri = io_u->engine_data; + + rbd_aio_wait_for_complete(fri->completion); +} + +static int rbd_io_u_cmp(const void *p1, const void *p2) +{ + const struct io_u **a = (const struct io_u **) p1; + const struct io_u **b = (const struct io_u **) p2; + uint64_t at, bt; + + at = utime_since_now(&(*a)->start_time); + bt = utime_since_now(&(*b)->start_time); + + if (at < bt) + return -1; + else if (at == bt) + return 0; + else + return 1; +} + +static int rbd_iter_events(struct thread_data *td, unsigned int *events, + unsigned int min_evts, int wait) +{ + struct rbd_data *rbd = td->io_ops_data; + unsigned int this_events = 0; + struct io_u *io_u; + int i, sidx = 0; + +#ifdef CONFIG_RBD_POLL + int ret = 0; + int event_num = 0; + struct fio_rbd_iou *fri = NULL; + rbd_completion_t comps[min_evts]; + uint64_t counter; + bool completed; + + struct pollfd pfd; + pfd.fd = rbd->fd; + pfd.events = POLLIN; + + ret = poll(&pfd, 1, wait ? -1 : 0); + if (ret <= 0) + return 0; + if (!(pfd.revents & POLLIN)) + return 0; + + event_num = rbd_poll_io_events(rbd->image, comps, min_evts); + + for (i = 0; i < event_num; i++) { + fri = rbd_aio_get_arg(comps[i]); + io_u = fri->io_u; + + /* best effort to decrement the semaphore */ + ret = read(rbd->fd, &counter, sizeof(counter)); + if (ret <= 0) + log_err("rbd_iter_events failed to decrement semaphore.\n"); + + completed = fri_check_complete(rbd, io_u, events); + assert(completed); + + this_events++; + } +#else + io_u_qiter(&td->io_u_all, io_u, i) { + if (!(io_u->flags & IO_U_F_FLIGHT)) + continue; + if (rbd_io_u_seen(io_u)) + continue; + + if (fri_check_complete(rbd, io_u, events)) + this_events++; + else if (wait) + rbd->sort_events[sidx++] = io_u; + } +#endif + + if (!wait || !sidx) + return this_events; + + /* + * Sort events, oldest issue first, then wait on as many as we + * need in order of age. If we have enough events, stop waiting, + * and just check if any of the older ones are done. + */ + if (sidx > 1) + qsort(rbd->sort_events, sidx, sizeof(struct io_u *), rbd_io_u_cmp); + + for (i = 0; i < sidx; i++) { + io_u = rbd->sort_events[i]; + + if (fri_check_complete(rbd, io_u, events)) { + this_events++; + continue; + } + + /* + * Stop waiting when we have enough, but continue checking + * all pending IOs if they are complete. 
+ */ + if (*events >= min_evts) + continue; + + rbd_io_u_wait_complete(io_u); + + if (fri_check_complete(rbd, io_u, events)) + this_events++; + } + + return this_events; +} + +static int fio_rbd_getevents(struct thread_data *td, unsigned int min, + unsigned int max, const struct timespec *t) +{ + unsigned int this_events, events = 0; + struct rbd_options *o = td->eo; + int wait = 0; + + do { + this_events = rbd_iter_events(td, &events, min, wait); + + if (events >= min) + break; + if (this_events) + continue; + + if (!o->busy_poll) + wait = 1; + else + nop; + } while (1); + + return events; +} + +static enum fio_q_status fio_rbd_queue(struct thread_data *td, + struct io_u *io_u) +{ + struct rbd_data *rbd = td->io_ops_data; + struct fio_rbd_iou *fri = io_u->engine_data; + int r = -1; + + fio_ro_check(td, io_u); + + fri->io_seen = 0; + fri->io_complete = 0; + + r = rbd_aio_create_completion(fri, _fio_rbd_finish_aiocb, + &fri->completion); + if (r < 0) { + log_err("rbd_aio_create_completion failed.\n"); + goto failed; + } + + if (io_u->ddir == DDIR_WRITE) { + r = rbd_aio_write(rbd->image, io_u->offset, io_u->xfer_buflen, + io_u->xfer_buf, fri->completion); + if (r < 0) { + log_err("rbd_aio_write failed.\n"); + goto failed_comp; + } + + } else if (io_u->ddir == DDIR_READ) { + r = rbd_aio_read(rbd->image, io_u->offset, io_u->xfer_buflen, + io_u->xfer_buf, fri->completion); + + if (r < 0) { + log_err("rbd_aio_read failed.\n"); + goto failed_comp; + } + } else if (io_u->ddir == DDIR_TRIM) { + r = rbd_aio_discard(rbd->image, io_u->offset, + io_u->xfer_buflen, fri->completion); + if (r < 0) { + log_err("rbd_aio_discard failed.\n"); + goto failed_comp; + } + } else if (io_u->ddir == DDIR_SYNC) { + r = rbd_aio_flush(rbd->image, fri->completion); + if (r < 0) { + log_err("rbd_flush failed.\n"); + goto failed_comp; + } + } else { + dprint(FD_IO, "%s: Warning: unhandled ddir: %d\n", __func__, + io_u->ddir); + r = -EINVAL; + goto failed_comp; + } + + return FIO_Q_QUEUED; +failed_comp: + rbd_aio_release(fri->completion); +failed: + io_u->error = -r; + td_verror(td, io_u->error, "xfer"); + return FIO_Q_COMPLETED; +} + +static int fio_rbd_init(struct thread_data *td) +{ + int r; + struct rbd_data *rbd = td->io_ops_data; + + if (rbd->connected) + return 0; + + r = _fio_rbd_connect(td); + if (r) { + log_err("fio_rbd_connect failed, return code: %d .\n", r); + goto failed; + } + + return 0; + +failed: + return 1; +} + +static void fio_rbd_cleanup(struct thread_data *td) +{ + struct rbd_data *rbd = td->io_ops_data; + + if (rbd) { + _fio_rbd_disconnect(rbd); + free(rbd->aio_events); + free(rbd->sort_events); + free(rbd); + } +} + +static int fio_rbd_setup(struct thread_data *td) +{ + rbd_image_info_t info; + struct fio_file *f; + struct rbd_data *rbd = NULL; + int r; + + /* allocate engine specific structure to deal with librbd. */ + r = _fio_setup_rbd_data(td, &rbd); + if (r) { + log_err("fio_setup_rbd_data failed.\n"); + goto cleanup; + } + td->io_ops_data = rbd; + + /* librbd does not allow us to run first in the main thread and later + * in a fork child. It needs to be the same process context all the + * time. + */ + td->o.use_thread = 1; + + /* connect in the main thread to determine to determine + * the size of the given RADOS block device. And disconnect + * later on. 
+ */ + r = _fio_rbd_connect(td); + if (r) { + log_err("fio_rbd_connect failed.\n"); + goto cleanup; + } + rbd->connected = true; + + /* get size of the RADOS block device */ + r = rbd_stat(rbd->image, &info, sizeof(info)); + if (r < 0) { + log_err("rbd_status failed.\n"); + goto cleanup; + } else if (info.size == 0) { + log_err("image size should be larger than zero.\n"); + r = -EINVAL; + goto cleanup; + } + + dprint(FD_IO, "rbd-engine: image size: %" PRIu64 "\n", info.size); + + /* taken from "net" engine. Pretend we deal with files, + * even if we do not have any ideas about files. + * The size of the RBD is set instead of a artificial file. + */ + if (!td->files_index) { + add_file(td, td->o.filename ? : "rbd", 0, 0); + td->o.nr_files = td->o.nr_files ? : 1; + td->o.open_files++; + } + f = td->files[0]; + f->real_file_size = info.size; + + return 0; + +cleanup: + fio_rbd_cleanup(td); + return r; +} + +static int fio_rbd_open(struct thread_data *td, struct fio_file *f) +{ + return 0; +} + +static int fio_rbd_invalidate(struct thread_data *td, struct fio_file *f) +{ +#if defined(CONFIG_RBD_INVAL) + struct rbd_data *rbd = td->io_ops_data; + + return rbd_invalidate_cache(rbd->image); +#else + return 0; +#endif +} + +static void fio_rbd_io_u_free(struct thread_data *td, struct io_u *io_u) +{ + struct fio_rbd_iou *fri = io_u->engine_data; + + if (fri) { + io_u->engine_data = NULL; + free(fri); + } +} + +static int fio_rbd_io_u_init(struct thread_data *td, struct io_u *io_u) +{ + struct fio_rbd_iou *fri; + + fri = calloc(1, sizeof(*fri)); + fri->io_u = io_u; + io_u->engine_data = fri; + return 0; +} + +static struct ioengine_ops ioengine = { + .name = "rbd", + .version = FIO_IOOPS_VERSION, + .setup = fio_rbd_setup, + .init = fio_rbd_init, + .queue = fio_rbd_queue, + .getevents = fio_rbd_getevents, + .event = fio_rbd_event, + .cleanup = fio_rbd_cleanup, + .open_file = fio_rbd_open, + .invalidate = fio_rbd_invalidate, + .options = options, + .io_u_init = fio_rbd_io_u_init, + .io_u_free = fio_rbd_io_u_free, + .option_struct_size = sizeof(struct rbd_options), +}; + +static void fio_init fio_rbd_register(void) +{ + register_ioengine(&ioengine); +} + +static void fio_exit fio_rbd_unregister(void) +{ + unregister_ioengine(&ioengine); +} diff --git a/engines/rdma.c b/engines/rdma.c new file mode 100644 index 0000000..2569a8e --- /dev/null +++ b/engines/rdma.c @@ -0,0 +1,1412 @@ +/* + * RDMA I/O engine + * + * RDMA I/O engine based on the IB verbs and RDMA/CM user space libraries. + * Supports both RDMA memory semantics and channel semantics + * for the InfiniBand, RoCE and iWARP protocols. + * + * You will need the Linux RDMA software installed, either + * from your Linux distributor or directly from openfabrics.org: + * + * http://www.openfabrics.org/downloads/OFED/ + * + * Exchanging steps of RDMA ioengine control messages: + * 1. client side sends test mode (RDMA_WRITE/RDMA_READ/SEND) + * to server side. + * 2. server side parses test mode, and sends back confirmation + * to client side. In RDMA WRITE/READ test, this confirmation + * includes memory information, such as rkey, address. + * 3. client side initiates test loop. + * 4. In RDMA WRITE/READ test, client side sends a completion + * notification to server side. Server side updates its + * td->done as true. 
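+ *
+ * A typical pairing (illustrative, hypothetical values): both ends run
+ * the rdma ioengine with matching port= and verb= settings, and the
+ * connecting side additionally sets hostname= to reach its peer, e.g.
+ *
+ *   fio --name=rdma-peer --ioengine=rdma --hostname=192.168.1.10 \
+ *       --port=12345 --verb=write --rw=write --bs=1m --size=1g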
+ *
+ */
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <errno.h>
+#include <assert.h>
+#include <netinet/in.h>
+#include <arpa/inet.h>
+#include <netdb.h>
+#include <sys/poll.h>
+#include <sys/types.h>
+#include <sys/socket.h>
+#include <sys/time.h>
+#include <sys/resource.h>
+
+#include <pthread.h>
+#include <inttypes.h>
+
+#include "../fio.h"
+#include "../hash.h"
+#include "../optgroup.h"
+
+#include <rdma/rdma_cma.h>
+
+#define FIO_RDMA_MAX_IO_DEPTH    512
+
+enum rdma_io_mode {
+	FIO_RDMA_UNKNOWN = 0,
+	FIO_RDMA_MEM_WRITE,
+	FIO_RDMA_MEM_READ,
+	FIO_RDMA_CHA_SEND,
+	FIO_RDMA_CHA_RECV
+};
+
+struct rdmaio_options {
+	struct thread_data *td;
+	unsigned int port;
+	enum rdma_io_mode verb;
+	char *bindname;
+};
+
+static int str_hostname_cb(void *data, const char *input)
+{
+	struct rdmaio_options *o = data;
+
+	if (o->td->o.filename)
+		free(o->td->o.filename);
+	o->td->o.filename = strdup(input);
+	return 0;
+}
+
+static struct fio_option options[] = {
+	{
+		.name	= "hostname",
+		.lname	= "rdma engine hostname",
+		.type	= FIO_OPT_STR_STORE,
+		.cb	= str_hostname_cb,
+		.help	= "Hostname for RDMA IO engine",
+		.category = FIO_OPT_C_ENGINE,
+		.group	= FIO_OPT_G_RDMA,
+	},
+	{
+		.name	= "bindname",
+		.lname	= "rdma engine bindname",
+		.type	= FIO_OPT_STR_STORE,
+		.off1	= offsetof(struct rdmaio_options, bindname),
+		.help	= "Bind for RDMA IO engine",
+		.def	= "",
+		.category = FIO_OPT_C_ENGINE,
+		.group	= FIO_OPT_G_RDMA,
+	},
+	{
+		.name	= "port",
+		.lname	= "rdma engine port",
+		.type	= FIO_OPT_INT,
+		.off1	= offsetof(struct rdmaio_options, port),
+		.minval	= 1,
+		.maxval	= 65535,
+		.help	= "Port to use for RDMA connections",
+		.category = FIO_OPT_C_ENGINE,
+		.group	= FIO_OPT_G_RDMA,
+	},
+	{
+		.name	= "verb",
+		.lname	= "RDMA engine verb",
+		.alias	= "proto",
+		.type	= FIO_OPT_STR,
+		.off1	= offsetof(struct rdmaio_options, verb),
+		.help	= "RDMA engine verb",
+		.def	= "write",
+		.posval = {
+			  { .ival = "write",
+			    .oval = FIO_RDMA_MEM_WRITE,
+			    .help = "Memory Write",
+			  },
+			  { .ival = "read",
+			    .oval = FIO_RDMA_MEM_READ,
+			    .help = "Memory Read",
+			  },
+			  { .ival = "send",
+			    .oval = FIO_RDMA_CHA_SEND,
+			    .help = "Posted Send",
+			  },
+			  { .ival = "recv",
+			    .oval = FIO_RDMA_CHA_RECV,
+			    .help = "Posted Receive",
+			  },
+		},
+		.category = FIO_OPT_C_ENGINE,
+		.group	= FIO_OPT_G_RDMA,
+	},
+	{
+		.name	= NULL,
+	},
+};
+
+struct remote_u {
+	uint64_t buf;
+	uint32_t rkey;
+	uint32_t size;
+};
+
+struct rdma_info_blk {
+	uint32_t mode;		/* channel semantic or memory semantic */
+	uint32_t nr;		/* client: io depth
+				   server: number of records for memory semantic
+				 */
+	uint32_t max_bs;	/* maximum block size */
+	struct remote_u rmt_us[FIO_RDMA_MAX_IO_DEPTH];
+};
+
+struct rdma_io_u_data {
+	uint64_t wr_id;
+	struct ibv_send_wr sq_wr;
+	struct ibv_recv_wr rq_wr;
+	struct ibv_sge rdma_sgl;
+};
+
+struct rdmaio_data {
+	int is_client;
+	enum rdma_io_mode rdma_protocol;
+	char host[64];
+	struct sockaddr_in addr;
+
+	struct ibv_recv_wr rq_wr;
+	struct ibv_sge recv_sgl;
+	struct rdma_info_blk recv_buf;
+	struct ibv_mr *recv_mr;
+
+	struct ibv_send_wr sq_wr;
+	struct ibv_sge send_sgl;
+	struct rdma_info_blk send_buf;
+	struct ibv_mr *send_mr;
+
+	struct ibv_comp_channel *channel;
+	struct ibv_cq *cq;
+	struct ibv_pd *pd;
+	struct ibv_qp *qp;
+
+	pthread_t cmthread;
+	struct rdma_event_channel *cm_channel;
+	struct rdma_cm_id *cm_id;
+	struct rdma_cm_id *child_cm_id;
+
+	int cq_event_num;
+
+	struct remote_u *rmt_us;
+	int rmt_nr;
+	struct io_u **io_us_queued;
+	int io_u_queued_nr;
+	struct io_u **io_us_flight;
+	int io_u_flight_nr;
+	struct io_u **io_us_completed;
+	int io_u_completed_nr;
+
+	struct frand_state rand_state;
+};
+
+static int client_recv(struct thread_data *td, struct ibv_wc *wc)
+{
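+	/*
+	 * Hedged sketch of the decode path below ('peer_bs' is a name
+	 * invented for this note; the code inlines the expression):
+	 *
+	 *   max_bs  = max(td->o.max_bs[DDIR_READ], td->o.max_bs[DDIR_WRITE]);
+	 *   peer_bs = ntohl(rd->recv_buf.max_bs);
+	 *   if (max_bs > peer_bs)
+	 *           return 1;  // local block size exceeds what the peer prepared
+	 */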
struct rdmaio_data *rd = td->io_ops_data; + unsigned int max_bs; + + if (wc->byte_len != sizeof(rd->recv_buf)) { + log_err("Received bogus data, size %d\n", wc->byte_len); + return 1; + } + + max_bs = max(td->o.max_bs[DDIR_READ], td->o.max_bs[DDIR_WRITE]); + if (max_bs > ntohl(rd->recv_buf.max_bs)) { + log_err("fio: Server's block size (%d) must be greater than or " + "equal to the client's block size (%d)!\n", + ntohl(rd->recv_buf.max_bs), max_bs); + return 1; + } + + /* store mr info for MEMORY semantic */ + if ((rd->rdma_protocol == FIO_RDMA_MEM_WRITE) || + (rd->rdma_protocol == FIO_RDMA_MEM_READ)) { + /* struct flist_head *entry; */ + int i = 0; + + rd->rmt_nr = ntohl(rd->recv_buf.nr); + + for (i = 0; i < rd->rmt_nr; i++) { + rd->rmt_us[i].buf = be64_to_cpu(rd->recv_buf.rmt_us[i].buf); + rd->rmt_us[i].rkey = ntohl(rd->recv_buf.rmt_us[i].rkey); + rd->rmt_us[i].size = ntohl(rd->recv_buf.rmt_us[i].size); + + dprint(FD_IO, + "fio: Received rkey %x addr %" PRIx64 + " len %d from peer\n", rd->rmt_us[i].rkey, + rd->rmt_us[i].buf, rd->rmt_us[i].size); + } + } + + return 0; +} + +static int server_recv(struct thread_data *td, struct ibv_wc *wc) +{ + struct rdmaio_data *rd = td->io_ops_data; + unsigned int max_bs; + + if (wc->wr_id == FIO_RDMA_MAX_IO_DEPTH) { + rd->rdma_protocol = ntohl(rd->recv_buf.mode); + + /* CHANNEL semantic, do nothing */ + if (rd->rdma_protocol == FIO_RDMA_CHA_SEND) + rd->rdma_protocol = FIO_RDMA_CHA_RECV; + + max_bs = max(td->o.max_bs[DDIR_READ], td->o.max_bs[DDIR_WRITE]); + if (max_bs < ntohl(rd->recv_buf.max_bs)) { + log_err("fio: Server's block size (%d) must be greater than or " + "equal to the client's block size (%d)!\n", + ntohl(rd->recv_buf.max_bs), max_bs); + return 1; + } + + } + + return 0; +} + +static int cq_event_handler(struct thread_data *td, enum ibv_wc_opcode opcode) +{ + struct rdmaio_data *rd = td->io_ops_data; + struct ibv_wc wc; + struct rdma_io_u_data *r_io_u_d; + int ret; + int compevnum = 0; + int i; + + while ((ret = ibv_poll_cq(rd->cq, 1, &wc)) == 1) { + ret = 0; + compevnum++; + + if (wc.status) { + log_err("fio: cq completion status %d(%s)\n", + wc.status, ibv_wc_status_str(wc.status)); + return -1; + } + + switch (wc.opcode) { + + case IBV_WC_RECV: + if (rd->is_client == 1) + ret = client_recv(td, &wc); + else + ret = server_recv(td, &wc); + + if (ret) + return -1; + + if (wc.wr_id == FIO_RDMA_MAX_IO_DEPTH) + break; + + for (i = 0; i < rd->io_u_flight_nr; i++) { + r_io_u_d = rd->io_us_flight[i]->engine_data; + + if (wc.wr_id == r_io_u_d->rq_wr.wr_id) { + rd->io_us_flight[i]->resid = + rd->io_us_flight[i]->buflen + - wc.byte_len; + + rd->io_us_flight[i]->error = 0; + + rd->io_us_completed[rd-> + io_u_completed_nr] + = rd->io_us_flight[i]; + rd->io_u_completed_nr++; + break; + } + } + if (i == rd->io_u_flight_nr) + log_err("fio: recv wr %" PRId64 " not found\n", + wc.wr_id); + else { + /* put the last one into middle of the list */ + rd->io_us_flight[i] = + rd->io_us_flight[rd->io_u_flight_nr - 1]; + rd->io_u_flight_nr--; + } + + break; + + case IBV_WC_SEND: + case IBV_WC_RDMA_WRITE: + case IBV_WC_RDMA_READ: + if (wc.wr_id == FIO_RDMA_MAX_IO_DEPTH) + break; + + for (i = 0; i < rd->io_u_flight_nr; i++) { + r_io_u_d = rd->io_us_flight[i]->engine_data; + + if (wc.wr_id == r_io_u_d->sq_wr.wr_id) { + rd->io_us_completed[rd-> + io_u_completed_nr] + = rd->io_us_flight[i]; + rd->io_u_completed_nr++; + break; + } + } + if (i == rd->io_u_flight_nr) + log_err("fio: send wr %" PRId64 " not found\n", + wc.wr_id); + else { + /* put the last one into middle 
of the list */ + rd->io_us_flight[i] = + rd->io_us_flight[rd->io_u_flight_nr - 1]; + rd->io_u_flight_nr--; + } + + break; + + default: + log_info("fio: unknown completion event %d\n", + wc.opcode); + return -1; + } + rd->cq_event_num++; + } + + if (ret) { + log_err("fio: poll error %d\n", ret); + return 1; + } + + return compevnum; +} + +/* + * Return -1 for error and 'nr events' for a positive number + * of events + */ +static int rdma_poll_wait(struct thread_data *td, enum ibv_wc_opcode opcode) +{ + struct rdmaio_data *rd = td->io_ops_data; + struct ibv_cq *ev_cq; + void *ev_ctx; + int ret; + + if (rd->cq_event_num > 0) { /* previous left */ + rd->cq_event_num--; + return 0; + } + +again: + if (ibv_get_cq_event(rd->channel, &ev_cq, &ev_ctx) != 0) { + log_err("fio: Failed to get cq event!\n"); + return -1; + } + if (ev_cq != rd->cq) { + log_err("fio: Unknown CQ!\n"); + return -1; + } + if (ibv_req_notify_cq(rd->cq, 0) != 0) { + log_err("fio: Failed to set notify!\n"); + return -1; + } + + ret = cq_event_handler(td, opcode); + if (ret == 0) + goto again; + + ibv_ack_cq_events(rd->cq, ret); + + rd->cq_event_num--; + + return ret; +} + +static int fio_rdmaio_setup_qp(struct thread_data *td) +{ + struct rdmaio_data *rd = td->io_ops_data; + struct ibv_qp_init_attr init_attr; + int qp_depth = td->o.iodepth * 2; /* 2 times of io depth */ + + if (rd->is_client == 0) + rd->pd = ibv_alloc_pd(rd->child_cm_id->verbs); + else + rd->pd = ibv_alloc_pd(rd->cm_id->verbs); + + if (rd->pd == NULL) { + log_err("fio: ibv_alloc_pd fail: %m\n"); + return 1; + } + + if (rd->is_client == 0) + rd->channel = ibv_create_comp_channel(rd->child_cm_id->verbs); + else + rd->channel = ibv_create_comp_channel(rd->cm_id->verbs); + if (rd->channel == NULL) { + log_err("fio: ibv_create_comp_channel fail: %m\n"); + goto err1; + } + + if (qp_depth < 16) + qp_depth = 16; + + if (rd->is_client == 0) + rd->cq = ibv_create_cq(rd->child_cm_id->verbs, + qp_depth, rd, rd->channel, 0); + else + rd->cq = ibv_create_cq(rd->cm_id->verbs, + qp_depth, rd, rd->channel, 0); + if (rd->cq == NULL) { + log_err("fio: ibv_create_cq failed: %m\n"); + goto err2; + } + + if (ibv_req_notify_cq(rd->cq, 0) != 0) { + log_err("fio: ibv_req_notify_cq failed: %m\n"); + goto err3; + } + + /* create queue pair */ + memset(&init_attr, 0, sizeof(init_attr)); + init_attr.cap.max_send_wr = qp_depth; + init_attr.cap.max_recv_wr = qp_depth; + init_attr.cap.max_recv_sge = 1; + init_attr.cap.max_send_sge = 1; + init_attr.qp_type = IBV_QPT_RC; + init_attr.send_cq = rd->cq; + init_attr.recv_cq = rd->cq; + + if (rd->is_client == 0) { + if (rdma_create_qp(rd->child_cm_id, rd->pd, &init_attr) != 0) { + log_err("fio: rdma_create_qp failed: %m\n"); + goto err3; + } + rd->qp = rd->child_cm_id->qp; + } else { + if (rdma_create_qp(rd->cm_id, rd->pd, &init_attr) != 0) { + log_err("fio: rdma_create_qp failed: %m\n"); + goto err3; + } + rd->qp = rd->cm_id->qp; + } + + return 0; + +err3: + ibv_destroy_cq(rd->cq); +err2: + ibv_destroy_comp_channel(rd->channel); +err1: + ibv_dealloc_pd(rd->pd); + + return 1; +} + +static int fio_rdmaio_setup_control_msg_buffers(struct thread_data *td) +{ + struct rdmaio_data *rd = td->io_ops_data; + + rd->recv_mr = ibv_reg_mr(rd->pd, &rd->recv_buf, sizeof(rd->recv_buf), + IBV_ACCESS_LOCAL_WRITE); + if (rd->recv_mr == NULL) { + log_err("fio: recv_buf reg_mr failed: %m\n"); + return 1; + } + + rd->send_mr = ibv_reg_mr(rd->pd, &rd->send_buf, sizeof(rd->send_buf), + 0); + if (rd->send_mr == NULL) { + log_err("fio: send_buf reg_mr failed: %m\n"); + 
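+		/*
+		 * Hedged note: on this partial-failure path the previously
+		 * registered recv MR must be torn down before returning, e.g.
+		 *
+		 *   ibv_dereg_mr(rd->recv_mr);  // undo the first registration
+		 *   return 1;
+		 *
+		 * which is exactly what the next two lines do. Data buffers
+		 * get wider access flags later, in fio_rdmaio_init().
+		 */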
ibv_dereg_mr(rd->recv_mr); + return 1; + } + + /* setup work request */ + /* recv wq */ + rd->recv_sgl.addr = (uint64_t) (unsigned long)&rd->recv_buf; + rd->recv_sgl.length = sizeof(rd->recv_buf); + rd->recv_sgl.lkey = rd->recv_mr->lkey; + rd->rq_wr.sg_list = &rd->recv_sgl; + rd->rq_wr.num_sge = 1; + rd->rq_wr.wr_id = FIO_RDMA_MAX_IO_DEPTH; + + /* send wq */ + rd->send_sgl.addr = (uint64_t) (unsigned long)&rd->send_buf; + rd->send_sgl.length = sizeof(rd->send_buf); + rd->send_sgl.lkey = rd->send_mr->lkey; + + rd->sq_wr.opcode = IBV_WR_SEND; + rd->sq_wr.send_flags = IBV_SEND_SIGNALED; + rd->sq_wr.sg_list = &rd->send_sgl; + rd->sq_wr.num_sge = 1; + rd->sq_wr.wr_id = FIO_RDMA_MAX_IO_DEPTH; + + return 0; +} + +static int get_next_channel_event(struct thread_data *td, + struct rdma_event_channel *channel, + enum rdma_cm_event_type wait_event) +{ + struct rdmaio_data *rd = td->io_ops_data; + struct rdma_cm_event *event; + int ret; + + ret = rdma_get_cm_event(channel, &event); + if (ret) { + log_err("fio: rdma_get_cm_event: %d\n", ret); + return 1; + } + + if (event->event != wait_event) { + log_err("fio: event is %s instead of %s\n", + rdma_event_str(event->event), + rdma_event_str(wait_event)); + return 1; + } + + switch (event->event) { + case RDMA_CM_EVENT_CONNECT_REQUEST: + rd->child_cm_id = event->id; + break; + default: + break; + } + + rdma_ack_cm_event(event); + + return 0; +} + +static int fio_rdmaio_prep(struct thread_data *td, struct io_u *io_u) +{ + struct rdmaio_data *rd = td->io_ops_data; + struct rdma_io_u_data *r_io_u_d; + + r_io_u_d = io_u->engine_data; + + switch (rd->rdma_protocol) { + case FIO_RDMA_MEM_WRITE: + case FIO_RDMA_MEM_READ: + r_io_u_d->rdma_sgl.addr = (uint64_t) (unsigned long)io_u->buf; + r_io_u_d->rdma_sgl.lkey = io_u->mr->lkey; + r_io_u_d->sq_wr.wr_id = r_io_u_d->wr_id; + r_io_u_d->sq_wr.send_flags = IBV_SEND_SIGNALED; + r_io_u_d->sq_wr.sg_list = &r_io_u_d->rdma_sgl; + r_io_u_d->sq_wr.num_sge = 1; + break; + case FIO_RDMA_CHA_SEND: + r_io_u_d->rdma_sgl.addr = (uint64_t) (unsigned long)io_u->buf; + r_io_u_d->rdma_sgl.lkey = io_u->mr->lkey; + r_io_u_d->rdma_sgl.length = io_u->buflen; + r_io_u_d->sq_wr.wr_id = r_io_u_d->wr_id; + r_io_u_d->sq_wr.opcode = IBV_WR_SEND; + r_io_u_d->sq_wr.send_flags = IBV_SEND_SIGNALED; + r_io_u_d->sq_wr.sg_list = &r_io_u_d->rdma_sgl; + r_io_u_d->sq_wr.num_sge = 1; + break; + case FIO_RDMA_CHA_RECV: + r_io_u_d->rdma_sgl.addr = (uint64_t) (unsigned long)io_u->buf; + r_io_u_d->rdma_sgl.lkey = io_u->mr->lkey; + r_io_u_d->rdma_sgl.length = io_u->buflen; + r_io_u_d->rq_wr.wr_id = r_io_u_d->wr_id; + r_io_u_d->rq_wr.sg_list = &r_io_u_d->rdma_sgl; + r_io_u_d->rq_wr.num_sge = 1; + break; + default: + log_err("fio: unknown rdma protocol - %d\n", rd->rdma_protocol); + break; + } + + return 0; +} + +static struct io_u *fio_rdmaio_event(struct thread_data *td, int event) +{ + struct rdmaio_data *rd = td->io_ops_data; + struct io_u *io_u; + int i; + + io_u = rd->io_us_completed[0]; + for (i = 0; i < rd->io_u_completed_nr - 1; i++) + rd->io_us_completed[i] = rd->io_us_completed[i + 1]; + + rd->io_u_completed_nr--; + + dprint_io_u(io_u, "fio_rdmaio_event"); + + return io_u; +} + +static int fio_rdmaio_getevents(struct thread_data *td, unsigned int min, + unsigned int max, const struct timespec *t) +{ + struct rdmaio_data *rd = td->io_ops_data; + enum ibv_wc_opcode comp_opcode; + struct ibv_cq *ev_cq; + void *ev_ctx; + int ret, r = 0; + comp_opcode = IBV_WC_RDMA_WRITE; + + switch (rd->rdma_protocol) { + case FIO_RDMA_MEM_WRITE: + comp_opcode = 
IBV_WC_RDMA_WRITE; + break; + case FIO_RDMA_MEM_READ: + comp_opcode = IBV_WC_RDMA_READ; + break; + case FIO_RDMA_CHA_SEND: + comp_opcode = IBV_WC_SEND; + break; + case FIO_RDMA_CHA_RECV: + comp_opcode = IBV_WC_RECV; + break; + default: + log_err("fio: unknown rdma protocol - %d\n", rd->rdma_protocol); + break; + } + + if (rd->cq_event_num > 0) { /* previous left */ + rd->cq_event_num--; + return 0; + } + +again: + if (ibv_get_cq_event(rd->channel, &ev_cq, &ev_ctx) != 0) { + log_err("fio: Failed to get cq event!\n"); + return -1; + } + if (ev_cq != rd->cq) { + log_err("fio: Unknown CQ!\n"); + return -1; + } + if (ibv_req_notify_cq(rd->cq, 0) != 0) { + log_err("fio: Failed to set notify!\n"); + return -1; + } + + ret = cq_event_handler(td, comp_opcode); + if (ret < 1) + goto again; + + ibv_ack_cq_events(rd->cq, ret); + + r += ret; + if (r < min) + goto again; + + rd->cq_event_num -= r; + + return r; +} + +static int fio_rdmaio_send(struct thread_data *td, struct io_u **io_us, + unsigned int nr) +{ + struct rdmaio_data *rd = td->io_ops_data; + struct ibv_send_wr *bad_wr; +#if 0 + enum ibv_wc_opcode comp_opcode; + comp_opcode = IBV_WC_RDMA_WRITE; +#endif + int i; + long index; + struct rdma_io_u_data *r_io_u_d; + + r_io_u_d = NULL; + + for (i = 0; i < nr; i++) { + /* RDMA_WRITE or RDMA_READ */ + switch (rd->rdma_protocol) { + case FIO_RDMA_MEM_WRITE: + /* compose work request */ + r_io_u_d = io_us[i]->engine_data; + index = __rand(&rd->rand_state) % rd->rmt_nr; + r_io_u_d->sq_wr.opcode = IBV_WR_RDMA_WRITE; + r_io_u_d->sq_wr.wr.rdma.rkey = rd->rmt_us[index].rkey; + r_io_u_d->sq_wr.wr.rdma.remote_addr = \ + rd->rmt_us[index].buf; + r_io_u_d->sq_wr.sg_list->length = io_us[i]->buflen; + break; + case FIO_RDMA_MEM_READ: + /* compose work request */ + r_io_u_d = io_us[i]->engine_data; + index = __rand(&rd->rand_state) % rd->rmt_nr; + r_io_u_d->sq_wr.opcode = IBV_WR_RDMA_READ; + r_io_u_d->sq_wr.wr.rdma.rkey = rd->rmt_us[index].rkey; + r_io_u_d->sq_wr.wr.rdma.remote_addr = \ + rd->rmt_us[index].buf; + r_io_u_d->sq_wr.sg_list->length = io_us[i]->buflen; + break; + case FIO_RDMA_CHA_SEND: + r_io_u_d = io_us[i]->engine_data; + r_io_u_d->sq_wr.opcode = IBV_WR_SEND; + r_io_u_d->sq_wr.send_flags = IBV_SEND_SIGNALED; + break; + default: + log_err("fio: unknown rdma protocol - %d\n", + rd->rdma_protocol); + break; + } + + if (ibv_post_send(rd->qp, &r_io_u_d->sq_wr, &bad_wr) != 0) { + log_err("fio: ibv_post_send fail: %m\n"); + return -1; + } + + dprint_io_u(io_us[i], "fio_rdmaio_send"); + } + + /* wait for completion + rdma_poll_wait(td, comp_opcode); */ + + return i; +} + +static int fio_rdmaio_recv(struct thread_data *td, struct io_u **io_us, + unsigned int nr) +{ + struct rdmaio_data *rd = td->io_ops_data; + struct ibv_recv_wr *bad_wr; + struct rdma_io_u_data *r_io_u_d; + int i; + + i = 0; + if (rd->rdma_protocol == FIO_RDMA_CHA_RECV) { + /* post io_u into recv queue */ + for (i = 0; i < nr; i++) { + r_io_u_d = io_us[i]->engine_data; + if (ibv_post_recv(rd->qp, &r_io_u_d->rq_wr, &bad_wr) != + 0) { + log_err("fio: ibv_post_recv fail: %m\n"); + return 1; + } + } + } else if ((rd->rdma_protocol == FIO_RDMA_MEM_READ) + || (rd->rdma_protocol == FIO_RDMA_MEM_WRITE)) { + /* re-post the rq_wr */ + if (ibv_post_recv(rd->qp, &rd->rq_wr, &bad_wr) != 0) { + log_err("fio: ibv_post_recv fail: %m\n"); + return 1; + } + + rdma_poll_wait(td, IBV_WC_RECV); + + dprint(FD_IO, "fio: recv FINISH message\n"); + td->done = 1; + return 0; + } + + return i; +} + +static enum fio_q_status fio_rdmaio_queue(struct thread_data *td, + 
struct io_u *io_u) +{ + struct rdmaio_data *rd = td->io_ops_data; + + fio_ro_check(td, io_u); + + if (rd->io_u_queued_nr == (int)td->o.iodepth) + return FIO_Q_BUSY; + + rd->io_us_queued[rd->io_u_queued_nr] = io_u; + rd->io_u_queued_nr++; + + dprint_io_u(io_u, "fio_rdmaio_queue"); + + return FIO_Q_QUEUED; +} + +static void fio_rdmaio_queued(struct thread_data *td, struct io_u **io_us, + unsigned int nr) +{ + struct rdmaio_data *rd = td->io_ops_data; + struct timespec now; + unsigned int i; + + if (!fio_fill_issue_time(td)) + return; + + fio_gettime(&now, NULL); + + for (i = 0; i < nr; i++) { + struct io_u *io_u = io_us[i]; + + /* queued -> flight */ + rd->io_us_flight[rd->io_u_flight_nr] = io_u; + rd->io_u_flight_nr++; + + memcpy(&io_u->issue_time, &now, sizeof(now)); + io_u_queued(td, io_u); + } +} + +static int fio_rdmaio_commit(struct thread_data *td) +{ + struct rdmaio_data *rd = td->io_ops_data; + struct io_u **io_us; + int ret; + + if (!rd->io_us_queued) + return 0; + + io_us = rd->io_us_queued; + do { + /* RDMA_WRITE or RDMA_READ */ + if (rd->is_client) + ret = fio_rdmaio_send(td, io_us, rd->io_u_queued_nr); + else if (!rd->is_client) + ret = fio_rdmaio_recv(td, io_us, rd->io_u_queued_nr); + else + ret = 0; /* must be a SYNC */ + + if (ret > 0) { + fio_rdmaio_queued(td, io_us, ret); + io_u_mark_submit(td, ret); + rd->io_u_queued_nr -= ret; + io_us += ret; + ret = 0; + } else + break; + } while (rd->io_u_queued_nr); + + return ret; +} + +static int fio_rdmaio_connect(struct thread_data *td, struct fio_file *f) +{ + struct rdmaio_data *rd = td->io_ops_data; + struct rdma_conn_param conn_param; + struct ibv_send_wr *bad_wr; + + memset(&conn_param, 0, sizeof(conn_param)); + conn_param.responder_resources = 1; + conn_param.initiator_depth = 1; + conn_param.retry_count = 10; + + if (rdma_connect(rd->cm_id, &conn_param) != 0) { + log_err("fio: rdma_connect fail: %m\n"); + return 1; + } + + if (get_next_channel_event + (td, rd->cm_channel, RDMA_CM_EVENT_ESTABLISHED) != 0) { + log_err("fio: wait for RDMA_CM_EVENT_ESTABLISHED\n"); + return 1; + } + + /* send task request */ + rd->send_buf.mode = htonl(rd->rdma_protocol); + rd->send_buf.nr = htonl(td->o.iodepth); + + if (ibv_post_send(rd->qp, &rd->sq_wr, &bad_wr) != 0) { + log_err("fio: ibv_post_send fail: %m\n"); + return 1; + } + + if (rdma_poll_wait(td, IBV_WC_SEND) < 0) + return 1; + + /* wait for remote MR info from server side */ + if (rdma_poll_wait(td, IBV_WC_RECV) < 0) + return 1; + + /* In SEND/RECV test, it's a good practice to setup the iodepth of + * of the RECV side deeper than that of the SEND side to + * avoid RNR (receiver not ready) error. The + * SEND side may send so many unsolicited message before + * RECV side commits sufficient recv buffers into recv queue. + * This may lead to RNR error. Here, SEND side pauses for a while + * during which RECV side commits sufficient recv buffers. 
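+ *
+ * A hedged sizing example: if the SEND side runs iodepth=16, giving the
+ * RECV side iodepth=32 keeps receives posted ahead of incoming sends and
+ * avoids RNR retries; the usleep() below is only a blunt fallback for
+ * the same race.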
+ */ + usleep(500000); + + return 0; +} + +static int fio_rdmaio_accept(struct thread_data *td, struct fio_file *f) +{ + struct rdmaio_data *rd = td->io_ops_data; + struct rdma_conn_param conn_param; + struct ibv_send_wr *bad_wr; + int ret = 0; + + /* rdma_accept() - then wait for accept success */ + memset(&conn_param, 0, sizeof(conn_param)); + conn_param.responder_resources = 1; + conn_param.initiator_depth = 1; + + if (rdma_accept(rd->child_cm_id, &conn_param) != 0) { + log_err("fio: rdma_accept: %m\n"); + return 1; + } + + if (get_next_channel_event + (td, rd->cm_channel, RDMA_CM_EVENT_ESTABLISHED) != 0) { + log_err("fio: wait for RDMA_CM_EVENT_ESTABLISHED\n"); + return 1; + } + + /* wait for request */ + ret = rdma_poll_wait(td, IBV_WC_RECV) < 0; + + if (ibv_post_send(rd->qp, &rd->sq_wr, &bad_wr) != 0) { + log_err("fio: ibv_post_send fail: %m\n"); + return 1; + } + + if (rdma_poll_wait(td, IBV_WC_SEND) < 0) + return 1; + + return ret; +} + +static int fio_rdmaio_open_file(struct thread_data *td, struct fio_file *f) +{ + if (td_read(td)) + return fio_rdmaio_accept(td, f); + else + return fio_rdmaio_connect(td, f); +} + +static int fio_rdmaio_close_file(struct thread_data *td, struct fio_file *f) +{ + struct rdmaio_data *rd = td->io_ops_data; + struct ibv_send_wr *bad_wr; + + /* unregister rdma buffer */ + + /* + * Client sends notification to the server side + */ + /* refer to: http://linux.die.net/man/7/rdma_cm */ + if ((rd->is_client == 1) && ((rd->rdma_protocol == FIO_RDMA_MEM_WRITE) + || (rd->rdma_protocol == + FIO_RDMA_MEM_READ))) { + if (ibv_post_send(rd->qp, &rd->sq_wr, &bad_wr) != 0) { + log_err("fio: ibv_post_send fail: %m\n"); + return 1; + } + + dprint(FD_IO, "fio: close information sent success\n"); + rdma_poll_wait(td, IBV_WC_SEND); + } + + if (rd->is_client == 1) + rdma_disconnect(rd->cm_id); + else { + rdma_disconnect(rd->child_cm_id); +#if 0 + rdma_disconnect(rd->cm_id); +#endif + } + +#if 0 + if (get_next_channel_event(td, rd->cm_channel, RDMA_CM_EVENT_DISCONNECTED) != 0) { + log_err("fio: wait for RDMA_CM_EVENT_DISCONNECTED\n"); + return 1; + } +#endif + + ibv_destroy_cq(rd->cq); + ibv_destroy_qp(rd->qp); + + if (rd->is_client == 1) + rdma_destroy_id(rd->cm_id); + else { + rdma_destroy_id(rd->child_cm_id); + rdma_destroy_id(rd->cm_id); + } + + ibv_destroy_comp_channel(rd->channel); + ibv_dealloc_pd(rd->pd); + + return 0; +} + +static int aton(struct thread_data *td, const char *host, + struct sockaddr_in *addr) +{ + if (inet_aton(host, &addr->sin_addr) != 1) { + struct hostent *hent; + + hent = gethostbyname(host); + if (!hent) { + td_verror(td, errno, "gethostbyname"); + return 1; + } + + memcpy(&addr->sin_addr, hent->h_addr, 4); + } + return 0; +} + +static int fio_rdmaio_setup_connect(struct thread_data *td, const char *host, + unsigned short port) +{ + struct rdmaio_data *rd = td->io_ops_data; + struct rdmaio_options *o = td->eo; + struct sockaddr_storage addrb; + struct ibv_recv_wr *bad_wr; + int err; + + rd->addr.sin_family = AF_INET; + rd->addr.sin_port = htons(port); + + err = aton(td, host, &rd->addr); + if (err) + return err; + + /* resolve route */ + if (strcmp(o->bindname, "") != 0) { + addrb.ss_family = AF_INET; + err = aton(td, o->bindname, (struct sockaddr_in *)&addrb); + if (err) + return err; + err = rdma_resolve_addr(rd->cm_id, (struct sockaddr *)&addrb, + (struct sockaddr *)&rd->addr, 2000); + + } else { + err = rdma_resolve_addr(rd->cm_id, NULL, + (struct sockaddr *)&rd->addr, 2000); + } + + if (err != 0) { + log_err("fio: rdma_resolve_addr: %d\n", 
err); + return 1; + } + + err = get_next_channel_event(td, rd->cm_channel, RDMA_CM_EVENT_ADDR_RESOLVED); + if (err != 0) { + log_err("fio: get_next_channel_event: %d\n", err); + return 1; + } + + /* resolve route */ + err = rdma_resolve_route(rd->cm_id, 2000); + if (err != 0) { + log_err("fio: rdma_resolve_route: %d\n", err); + return 1; + } + + err = get_next_channel_event(td, rd->cm_channel, RDMA_CM_EVENT_ROUTE_RESOLVED); + if (err != 0) { + log_err("fio: get_next_channel_event: %d\n", err); + return 1; + } + + /* create qp and buffer */ + if (fio_rdmaio_setup_qp(td) != 0) + return 1; + + if (fio_rdmaio_setup_control_msg_buffers(td) != 0) + return 1; + + /* post recv buf */ + err = ibv_post_recv(rd->qp, &rd->rq_wr, &bad_wr); + if (err != 0) { + log_err("fio: ibv_post_recv fail: %d\n", err); + return 1; + } + + return 0; +} + +static int fio_rdmaio_setup_listen(struct thread_data *td, short port) +{ + struct rdmaio_data *rd = td->io_ops_data; + struct rdmaio_options *o = td->eo; + struct ibv_recv_wr *bad_wr; + int state = td->runstate; + + td_set_runstate(td, TD_SETTING_UP); + + rd->addr.sin_family = AF_INET; + rd->addr.sin_port = htons(port); + + if (strcmp(o->bindname, "") == 0) + rd->addr.sin_addr.s_addr = htonl(INADDR_ANY); + else + rd->addr.sin_addr.s_addr = htonl(*o->bindname); + + /* rdma_listen */ + if (rdma_bind_addr(rd->cm_id, (struct sockaddr *)&rd->addr) != 0) { + log_err("fio: rdma_bind_addr fail: %m\n"); + return 1; + } + + if (rdma_listen(rd->cm_id, 3) != 0) { + log_err("fio: rdma_listen fail: %m\n"); + return 1; + } + + log_info("fio: waiting for connection\n"); + + /* wait for CONNECT_REQUEST */ + if (get_next_channel_event + (td, rd->cm_channel, RDMA_CM_EVENT_CONNECT_REQUEST) != 0) { + log_err("fio: wait for RDMA_CM_EVENT_CONNECT_REQUEST\n"); + return 1; + } + + if (fio_rdmaio_setup_qp(td) != 0) + return 1; + + if (fio_rdmaio_setup_control_msg_buffers(td) != 0) + return 1; + + /* post recv buf */ + if (ibv_post_recv(rd->qp, &rd->rq_wr, &bad_wr) != 0) { + log_err("fio: ibv_post_recv fail: %m\n"); + return 1; + } + + td_set_runstate(td, state); + return 0; +} + +static int check_set_rlimits(struct thread_data *td) +{ +#ifdef CONFIG_RLIMIT_MEMLOCK + struct rlimit rl; + + /* check RLIMIT_MEMLOCK */ + if (getrlimit(RLIMIT_MEMLOCK, &rl) != 0) { + log_err("fio: getrlimit fail: %d(%s)\n", + errno, strerror(errno)); + return 1; + } + + /* soft limit */ + if ((rl.rlim_cur != RLIM_INFINITY) + && (rl.rlim_cur < td->orig_buffer_size)) { + log_err("fio: soft RLIMIT_MEMLOCK is: %" PRId64 "\n", + rl.rlim_cur); + log_err("fio: total block size is: %zd\n", + td->orig_buffer_size); + /* try to set larger RLIMIT_MEMLOCK */ + rl.rlim_cur = rl.rlim_max; + if (setrlimit(RLIMIT_MEMLOCK, &rl) != 0) { + log_err("fio: setrlimit fail: %d(%s)\n", + errno, strerror(errno)); + log_err("fio: you may try enlarge MEMLOCK by root\n"); + log_err("# ulimit -l unlimited\n"); + return 1; + } + } +#endif + + return 0; +} + +static int compat_options(struct thread_data *td) +{ + // The original RDMA engine had an ugly / seperator + // on the filename for it's options. This function + // retains backwards compatibility with it. Note we do not + // support setting the bindname option is this legacy mode. 
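+	// Illustrative (hedged) legacy forms this parser accepts:
+	//   filename=10.0.0.1/8999            port 8999, default verb (write)
+	//   filename=10.0.0.1/8999/rdma_read  port 8999, memory-read verb
+	// The code below splits on '/' exactly as sketched above.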
+ + struct rdmaio_options *o = td->eo; + char *modep, *portp; + char *filename = td->o.filename; + + if (!filename) + return 0; + + portp = strchr(filename, '/'); + if (portp == NULL) + return 0; + + *portp = '\0'; + portp++; + + o->port = strtol(portp, NULL, 10); + if (!o->port || o->port > 65535) + goto bad_host; + + modep = strchr(portp, '/'); + if (modep != NULL) { + *modep = '\0'; + modep++; + } + + if (modep) { + if (!strncmp("rdma_write", modep, strlen(modep)) || + !strncmp("RDMA_WRITE", modep, strlen(modep))) + o->verb = FIO_RDMA_MEM_WRITE; + else if (!strncmp("rdma_read", modep, strlen(modep)) || + !strncmp("RDMA_READ", modep, strlen(modep))) + o->verb = FIO_RDMA_MEM_READ; + else if (!strncmp("send", modep, strlen(modep)) || + !strncmp("SEND", modep, strlen(modep))) + o->verb = FIO_RDMA_CHA_SEND; + else + goto bad_host; + } else + o->verb = FIO_RDMA_MEM_WRITE; + + + return 0; + +bad_host: + log_err("fio: bad rdma host/port/protocol: %s\n", td->o.filename); + return 1; +} + +static int fio_rdmaio_init(struct thread_data *td) +{ + struct rdmaio_data *rd = td->io_ops_data; + struct rdmaio_options *o = td->eo; + unsigned int max_bs; + int ret, i; + + if (td_rw(td)) { + log_err("fio: rdma connections must be read OR write\n"); + return 1; + } + if (td_random(td)) { + log_err("fio: RDMA network IO can't be random\n"); + return 1; + } + + if (compat_options(td)) + return 1; + + if (!o->port) { + log_err("fio: no port has been specified which is required " + "for the rdma engine\n"); + return 1; + } + + if (check_set_rlimits(td)) + return 1; + + rd->rdma_protocol = o->verb; + rd->cq_event_num = 0; + + rd->cm_channel = rdma_create_event_channel(); + if (!rd->cm_channel) { + log_err("fio: rdma_create_event_channel fail: %m\n"); + return 1; + } + + ret = rdma_create_id(rd->cm_channel, &rd->cm_id, rd, RDMA_PS_TCP); + if (ret) { + log_err("fio: rdma_create_id fail: %m\n"); + return 1; + } + + if ((rd->rdma_protocol == FIO_RDMA_MEM_WRITE) || + (rd->rdma_protocol == FIO_RDMA_MEM_READ)) { + rd->rmt_us = + malloc(FIO_RDMA_MAX_IO_DEPTH * sizeof(struct remote_u)); + memset(rd->rmt_us, 0, + FIO_RDMA_MAX_IO_DEPTH * sizeof(struct remote_u)); + rd->rmt_nr = 0; + } + + rd->io_us_queued = malloc(td->o.iodepth * sizeof(struct io_u *)); + memset(rd->io_us_queued, 0, td->o.iodepth * sizeof(struct io_u *)); + rd->io_u_queued_nr = 0; + + rd->io_us_flight = malloc(td->o.iodepth * sizeof(struct io_u *)); + memset(rd->io_us_flight, 0, td->o.iodepth * sizeof(struct io_u *)); + rd->io_u_flight_nr = 0; + + rd->io_us_completed = malloc(td->o.iodepth * sizeof(struct io_u *)); + memset(rd->io_us_completed, 0, td->o.iodepth * sizeof(struct io_u *)); + rd->io_u_completed_nr = 0; + + if (td_read(td)) { /* READ as the server */ + rd->is_client = 0; + td->flags |= TD_F_NO_PROGRESS; + /* server rd->rdma_buf_len will be setup after got request */ + ret = fio_rdmaio_setup_listen(td, o->port); + } else { /* WRITE as the client */ + rd->is_client = 1; + ret = fio_rdmaio_setup_connect(td, td->o.filename, o->port); + } + + max_bs = max(td->o.max_bs[DDIR_READ], td->o.max_bs[DDIR_WRITE]); + rd->send_buf.max_bs = htonl(max_bs); + + /* register each io_u in the free list */ + for (i = 0; i < td->io_u_freelist.nr; i++) { + struct io_u *io_u = td->io_u_freelist.io_us[i]; + + io_u->engine_data = malloc(sizeof(struct rdma_io_u_data)); + memset(io_u->engine_data, 0, sizeof(struct rdma_io_u_data)); + ((struct rdma_io_u_data *)io_u->engine_data)->wr_id = i; + + io_u->mr = ibv_reg_mr(rd->pd, io_u->buf, max_bs, + IBV_ACCESS_LOCAL_WRITE | + 
IBV_ACCESS_REMOTE_READ | + IBV_ACCESS_REMOTE_WRITE); + if (io_u->mr == NULL) { + log_err("fio: ibv_reg_mr io_u failed: %m\n"); + return 1; + } + + rd->send_buf.rmt_us[i].buf = + cpu_to_be64((uint64_t) (unsigned long)io_u->buf); + rd->send_buf.rmt_us[i].rkey = htonl(io_u->mr->rkey); + rd->send_buf.rmt_us[i].size = htonl(max_bs); + +#if 0 + log_info("fio: Send rkey %x addr %" PRIx64 " len %d to client\n", io_u->mr->rkey, io_u->buf, max_bs); */ +#endif + } + + rd->send_buf.nr = htonl(i); + + return ret; +} + +static void fio_rdmaio_cleanup(struct thread_data *td) +{ + struct rdmaio_data *rd = td->io_ops_data; + + if (rd) + free(rd); +} + +static int fio_rdmaio_setup(struct thread_data *td) +{ + struct rdmaio_data *rd; + + if (!td->files_index) { + add_file(td, td->o.filename ?: "rdma", 0, 0); + td->o.nr_files = td->o.nr_files ?: 1; + td->o.open_files++; + } + + if (!td->io_ops_data) { + rd = malloc(sizeof(*rd)); + + memset(rd, 0, sizeof(*rd)); + init_rand_seed(&rd->rand_state, (unsigned int) GOLDEN_RATIO_PRIME, 0); + td->io_ops_data = rd; + } + + return 0; +} + +static struct ioengine_ops ioengine_rw = { + .name = "rdma", + .version = FIO_IOOPS_VERSION, + .setup = fio_rdmaio_setup, + .init = fio_rdmaio_init, + .prep = fio_rdmaio_prep, + .queue = fio_rdmaio_queue, + .commit = fio_rdmaio_commit, + .getevents = fio_rdmaio_getevents, + .event = fio_rdmaio_event, + .cleanup = fio_rdmaio_cleanup, + .open_file = fio_rdmaio_open_file, + .close_file = fio_rdmaio_close_file, + .flags = FIO_DISKLESSIO | FIO_UNIDIR | FIO_PIPEIO, + .options = options, + .option_struct_size = sizeof(struct rdmaio_options), +}; + +static void fio_init fio_rdmaio_register(void) +{ + register_ioengine(&ioengine_rw); +} + +static void fio_exit fio_rdmaio_unregister(void) +{ + unregister_ioengine(&ioengine_rw); +} diff --git a/engines/sg.c b/engines/sg.c new file mode 100644 index 0000000..a1a6de4 --- /dev/null +++ b/engines/sg.c @@ -0,0 +1,1281 @@ +/* + * sg engine + * + * IO engine that uses the Linux SG v3 interface to talk to SCSI devices + * + * This ioengine can operate in two modes: + * sync with block devices (/dev/sdX) or + * with character devices (/dev/sgY) with direct=1 or sync=1 + * async with character devices with direct=0 and sync=0 + * + * What value does queue() return for the different cases? + * queue() return value + * In sync mode: + * /dev/sdX RWT FIO_Q_COMPLETED + * /dev/sgY RWT FIO_Q_COMPLETED + * with direct=1 or sync=1 + * + * In async mode: + * /dev/sgY RWT FIO_Q_QUEUED + * direct=0 and sync=0 + * + * Because FIO_SYNCIO is set for this ioengine td_io_queue() will fill in + * issue_time *before* each IO is sent to queue() + * + * Where are the IO counting functions called for the different cases? 
+ *
+ * In sync mode:
+ *   /dev/sdX (commit==NULL)
+ *     RWT
+ *       io_u_mark_depth() called in td_io_queue()
+ *       io_u_mark_submit/complete() called in td_io_queue()
+ *       issue_time set in td_io_queue()
+ *
+ *   /dev/sgY with direct=1 or sync=1 (commit does nothing)
+ *     RWT
+ *       io_u_mark_depth() called in td_io_queue()
+ *       io_u_mark_submit/complete() called in queue()
+ *       issue_time set in td_io_queue()
+ *
+ * In async mode:
+ *   /dev/sgY with direct=0 and sync=0
+ *     RW: read and write operations are submitted in queue()
+ *       io_u_mark_depth() called in td_io_commit()
+ *       io_u_mark_submit() called in queue()
+ *       issue_time set in td_io_queue()
+ *     T: trim operations are queued in queue() and submitted in commit()
+ *       io_u_mark_depth() called in td_io_commit()
+ *       io_u_mark_submit() called in commit()
+ *       issue_time set in commit()
+ *
+ */
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <errno.h>
+#include <poll.h>
+
+#include "../fio.h"
+#include "../optgroup.h"
+
+#ifdef FIO_HAVE_SGIO
+
+enum {
+	FIO_SG_WRITE		= 1,
+	FIO_SG_WRITE_VERIFY	= 2,
+	FIO_SG_WRITE_SAME	= 3
+};
+
+struct sg_options {
+	void *pad;
+	unsigned int readfua;
+	unsigned int writefua;
+	unsigned int write_mode;
+};
+
+static struct fio_option options[] = {
+	{
+		.name	= "readfua",
+		.lname	= "sg engine read fua flag support",
+		.type	= FIO_OPT_BOOL,
+		.off1	= offsetof(struct sg_options, readfua),
+		.help	= "Set FUA flag (force unit access) for all Read operations",
+		.def	= "0",
+		.category = FIO_OPT_C_ENGINE,
+		.group	= FIO_OPT_G_SG,
+	},
+	{
+		.name	= "writefua",
+		.lname	= "sg engine write fua flag support",
+		.type	= FIO_OPT_BOOL,
+		.off1	= offsetof(struct sg_options, writefua),
+		.help	= "Set FUA flag (force unit access) for all Write operations",
+		.def	= "0",
+		.category = FIO_OPT_C_ENGINE,
+		.group	= FIO_OPT_G_SG,
+	},
+	{
+		.name	= "sg_write_mode",
+		.lname	= "specify sg write mode",
+		.type	= FIO_OPT_STR,
+		.off1	= offsetof(struct sg_options, write_mode),
+		.help	= "Specify SCSI WRITE mode",
+		.def	= "write",
+		.posval = {
+			  { .ival = "write",
+			    .oval = FIO_SG_WRITE,
+			    .help = "Issue standard SCSI WRITE commands",
+			  },
+			  { .ival = "verify",
+			    .oval = FIO_SG_WRITE_VERIFY,
+			    .help = "Issue SCSI WRITE AND VERIFY commands",
+			  },
+			  { .ival = "same",
+			    .oval = FIO_SG_WRITE_SAME,
+			    .help = "Issue SCSI WRITE SAME commands",
+			  },
+		},
+		.category = FIO_OPT_C_ENGINE,
+		.group	= FIO_OPT_G_SG,
+	},
+	{
+		.name	= NULL,
+	},
+};
+
+#define MAX_10B_LBA  0xFFFFFFFFULL
+#define SCSI_TIMEOUT_MS 30000   // 30 second timeout; currently no method to override
+#define MAX_SB 64               // sense block maximum return size
+/*
+#define FIO_SGIO_DEBUG
+*/
+
+struct sgio_cmd {
+	unsigned char cdb[16];      // enhanced from 10 to support 16 byte commands
+	unsigned char sb[MAX_SB];   // add sense block to commands
+	int nr;
+};
+
+struct sgio_trim {
+	uint8_t *unmap_param;
+	unsigned int unmap_range_count;
+	struct io_u **trim_io_us;
+};
+
+struct sgio_data {
+	struct sgio_cmd *cmds;
+	struct io_u **events;
+	struct pollfd *pfds;
+	int *fd_flags;
+	void *sgbuf;
+	unsigned int bs;
+	int type_checked;
+	struct sgio_trim **trim_queues;
+	int current_queue;
+#ifdef FIO_SGIO_DEBUG
+	unsigned int *trim_queue_map;
+#endif
+};
+
+static inline uint32_t sgio_get_be32(uint8_t *buf)
+{
+	return be32_to_cpu(*((uint32_t *) buf));
+}
+
+static inline uint64_t sgio_get_be64(uint8_t *buf)
+{
+	return be64_to_cpu(*((uint64_t *) buf));
+}
+
+static inline void sgio_set_be16(uint16_t val, uint8_t *buf)
+{
+	uint16_t t = cpu_to_be16(val);
+
+	memcpy(buf, &t, sizeof(uint16_t));
+}
+
+static inline void
sgio_set_be32(uint32_t val, uint8_t *buf) +{ + uint32_t t = cpu_to_be32(val); + + memcpy(buf, &t, sizeof(uint32_t)); +} + +static inline void sgio_set_be64(uint64_t val, uint8_t *buf) +{ + uint64_t t = cpu_to_be64(val); + + memcpy(buf, &t, sizeof(uint64_t)); +} + +static inline bool sgio_unbuffered(struct thread_data *td) +{ + return (td->o.odirect || td->o.sync_io); +} + +static void sgio_hdr_init(struct sgio_data *sd, struct sg_io_hdr *hdr, + struct io_u *io_u, int fs) +{ + struct sgio_cmd *sc = &sd->cmds[io_u->index]; + + memset(hdr, 0, sizeof(*hdr)); + memset(sc->cdb, 0, sizeof(sc->cdb)); + + hdr->interface_id = 'S'; + hdr->cmdp = sc->cdb; + hdr->cmd_len = sizeof(sc->cdb); + hdr->sbp = sc->sb; + hdr->mx_sb_len = sizeof(sc->sb); + hdr->pack_id = io_u->index; + hdr->usr_ptr = io_u; + hdr->timeout = SCSI_TIMEOUT_MS; + + if (fs) { + hdr->dxferp = io_u->xfer_buf; + hdr->dxfer_len = io_u->xfer_buflen; + } +} + +static int pollin_events(struct pollfd *pfds, int fds) +{ + int i; + + for (i = 0; i < fds; i++) + if (pfds[i].revents & POLLIN) + return 1; + + return 0; +} + +static int sg_fd_read(int fd, void *data, size_t size) +{ + int err = 0; + + while (size) { + ssize_t ret; + + ret = read(fd, data, size); + if (ret < 0) { + if (errno == EAGAIN || errno == EINTR) + continue; + err = errno; + break; + } else if (!ret) + break; + else { + data += ret; + size -= ret; + } + } + + if (err) + return err; + if (size) + return EAGAIN; + + return 0; +} + +static int fio_sgio_getevents(struct thread_data *td, unsigned int min, + unsigned int max, + const struct timespec fio_unused *t) +{ + struct sgio_data *sd = td->io_ops_data; + int left = max, eventNum, ret, r = 0, trims = 0; + void *buf = sd->sgbuf; + unsigned int i, j, events; + struct fio_file *f; + struct io_u *io_u; + + /* + * Fill in the file descriptors + */ + for_each_file(td, f, i) { + /* + * don't block for min events == 0 + */ + if (!min) + sd->fd_flags[i] = fio_set_fd_nonblocking(f->fd, "sg"); + else + sd->fd_flags[i] = -1; + + sd->pfds[i].fd = f->fd; + sd->pfds[i].events = POLLIN; + } + + /* + ** There are two counters here: + ** - number of SCSI commands completed + ** - number of io_us completed + ** + ** These are the same with reads and writes, but + ** could differ with trim/unmap commands because + ** a single unmap can include multiple io_us + */ + + while (left > 0) { + char *p; + + dprint(FD_IO, "sgio_getevents: sd %p: min=%d, max=%d, left=%d\n", sd, min, max, left); + + do { + if (!min) + break; + + ret = poll(sd->pfds, td->o.nr_files, -1); + if (ret < 0) { + if (!r) + r = -errno; + td_verror(td, errno, "poll"); + break; + } else if (!ret) + continue; + + if (pollin_events(sd->pfds, td->o.nr_files)) + break; + } while (1); + + if (r < 0) + break; + +re_read: + p = buf; + events = 0; + for_each_file(td, f, i) { + for (eventNum = 0; eventNum < left; eventNum++) { + ret = sg_fd_read(f->fd, p, sizeof(struct sg_io_hdr)); + dprint(FD_IO, "sgio_getevents: sg_fd_read ret: %d\n", ret); + if (ret) { + r = -ret; + td_verror(td, r, "sg_read"); + break; + } + io_u = ((struct sg_io_hdr *)p)->usr_ptr; + if (io_u->ddir == DDIR_TRIM) { + events += sd->trim_queues[io_u->index]->unmap_range_count; + eventNum += sd->trim_queues[io_u->index]->unmap_range_count - 1; + } else + events++; + + p += sizeof(struct sg_io_hdr); + dprint(FD_IO, "sgio_getevents: events: %d, eventNum: %d, left: %d\n", events, eventNum, left); + } + } + + if (r < 0 && !events) + break; + if (!events) { + usleep(1000); + goto re_read; + } + + left -= events; + r += events; + + 
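+		/*
+		 * Hedged walk-through of the accounting: one completed UNMAP
+		 * hdr may represent several io_us. E.g. with 4 trim ranges
+		 * batched into a single UNMAP, sg_fd_read() returns one
+		 * sg_io_hdr, but 'events' was advanced by unmap_range_count
+		 * (4), and the loop below fans that hdr out to all 4 io_us.
+		 */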
for (i = 0; i < events; i++) { + struct sg_io_hdr *hdr = (struct sg_io_hdr *) buf + i; + sd->events[i + trims] = hdr->usr_ptr; + io_u = (struct io_u *)(hdr->usr_ptr); + + if (hdr->info & SG_INFO_CHECK) { + /* record if an io error occurred, ignore resid */ + memcpy(&io_u->hdr, hdr, sizeof(struct sg_io_hdr)); + sd->events[i + trims]->error = EIO; + } + + if (io_u->ddir == DDIR_TRIM) { + struct sgio_trim *st = sd->trim_queues[io_u->index]; +#ifdef FIO_SGIO_DEBUG + assert(st->trim_io_us[0] == io_u); + assert(sd->trim_queue_map[io_u->index] == io_u->index); + dprint(FD_IO, "sgio_getevents: reaping %d io_us from trim queue %d\n", st->unmap_range_count, io_u->index); + dprint(FD_IO, "sgio_getevents: reaped io_u %d and stored in events[%d]\n", io_u->index, i+trims); +#endif + for (j = 1; j < st->unmap_range_count; j++) { + ++trims; + sd->events[i + trims] = st->trim_io_us[j]; +#ifdef FIO_SGIO_DEBUG + dprint(FD_IO, "sgio_getevents: reaped io_u %d and stored in events[%d]\n", st->trim_io_us[j]->index, i+trims); + assert(sd->trim_queue_map[st->trim_io_us[j]->index] == io_u->index); +#endif + if (hdr->info & SG_INFO_CHECK) { + /* record if an io error occurred, ignore resid */ + memcpy(&st->trim_io_us[j]->hdr, hdr, sizeof(struct sg_io_hdr)); + sd->events[i + trims]->error = EIO; + } + } + events -= st->unmap_range_count - 1; + st->unmap_range_count = 0; + } + } + } + + if (!min) { + for_each_file(td, f, i) { + if (sd->fd_flags[i] == -1) + continue; + + if (fcntl(f->fd, F_SETFL, sd->fd_flags[i]) < 0) + log_err("fio: sg failed to restore fcntl flags: %s\n", strerror(errno)); + } + } + + return r; +} + +static enum fio_q_status fio_sgio_ioctl_doio(struct thread_data *td, + struct fio_file *f, + struct io_u *io_u) +{ + struct sgio_data *sd = td->io_ops_data; + struct sg_io_hdr *hdr = &io_u->hdr; + int ret; + + sd->events[0] = io_u; + + ret = ioctl(f->fd, SG_IO, hdr); + if (ret < 0) + return ret; + + /* record if an io error occurred */ + if (hdr->info & SG_INFO_CHECK) + io_u->error = EIO; + + return FIO_Q_COMPLETED; +} + +static enum fio_q_status fio_sgio_rw_doio(struct thread_data *td, + struct fio_file *f, + struct io_u *io_u, int do_sync) +{ + struct sg_io_hdr *hdr = &io_u->hdr; + int ret; + + ret = write(f->fd, hdr, sizeof(*hdr)); + if (ret < 0) + return ret; + + if (do_sync) { + /* + * We can't just read back the first command that completes + * and assume it's the one we need, it could be any command + * that is inflight. 
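+	 * E.g. (hedged): with two writes in flight, the read() below may hand
+	 * back the other command first; hdr->usr_ptr identifies which io_u
+	 * actually completed, and io_u_sync_complete() retires the stranger
+	 * before looping again.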
+ */ + do { + struct io_u *__io_u; + + ret = read(f->fd, hdr, sizeof(*hdr)); + if (ret < 0) + return ret; + + __io_u = hdr->usr_ptr; + + /* record if an io error occurred */ + if (hdr->info & SG_INFO_CHECK) + __io_u->error = EIO; + + if (__io_u == io_u) + break; + + if (io_u_sync_complete(td, __io_u)) { + ret = -1; + break; + } + } while (1); + + return FIO_Q_COMPLETED; + } + + return FIO_Q_QUEUED; +} + +static enum fio_q_status fio_sgio_doio(struct thread_data *td, + struct io_u *io_u, int do_sync) +{ + struct fio_file *f = io_u->file; + enum fio_q_status ret; + + if (f->filetype == FIO_TYPE_BLOCK) { + ret = fio_sgio_ioctl_doio(td, f, io_u); + if (io_u->error) + td_verror(td, io_u->error, __func__); + } else { + ret = fio_sgio_rw_doio(td, f, io_u, do_sync); + if (io_u->error && do_sync) + td_verror(td, io_u->error, __func__); + } + + return ret; +} + +static void fio_sgio_rw_lba(struct sg_io_hdr *hdr, unsigned long long lba, + unsigned long long nr_blocks) +{ + if (lba < MAX_10B_LBA) { + sgio_set_be32((uint32_t) lba, &hdr->cmdp[2]); + sgio_set_be16((uint16_t) nr_blocks, &hdr->cmdp[7]); + } else { + sgio_set_be64(lba, &hdr->cmdp[2]); + sgio_set_be32((uint32_t) nr_blocks, &hdr->cmdp[10]); + } + + return; +} + +static int fio_sgio_prep(struct thread_data *td, struct io_u *io_u) +{ + struct sg_io_hdr *hdr = &io_u->hdr; + struct sg_options *o = td->eo; + struct sgio_data *sd = td->io_ops_data; + unsigned long long nr_blocks, lba; + int offset; + + if (io_u->xfer_buflen & (sd->bs - 1)) { + log_err("read/write not sector aligned\n"); + return EINVAL; + } + + nr_blocks = io_u->xfer_buflen / sd->bs; + lba = io_u->offset / sd->bs; + + if (io_u->ddir == DDIR_READ) { + sgio_hdr_init(sd, hdr, io_u, 1); + + hdr->dxfer_direction = SG_DXFER_FROM_DEV; + if (lba < MAX_10B_LBA) + hdr->cmdp[0] = 0x28; // read(10) + else + hdr->cmdp[0] = 0x88; // read(16) + + if (o->readfua) + hdr->cmdp[1] |= 0x08; + + fio_sgio_rw_lba(hdr, lba, nr_blocks); + + } else if (io_u->ddir == DDIR_WRITE) { + sgio_hdr_init(sd, hdr, io_u, 1); + + hdr->dxfer_direction = SG_DXFER_TO_DEV; + switch(o->write_mode) { + case FIO_SG_WRITE: + if (lba < MAX_10B_LBA) + hdr->cmdp[0] = 0x2a; // write(10) + else + hdr->cmdp[0] = 0x8a; // write(16) + if (o->writefua) + hdr->cmdp[1] |= 0x08; + break; + case FIO_SG_WRITE_VERIFY: + if (lba < MAX_10B_LBA) + hdr->cmdp[0] = 0x2e; // write and verify(10) + else + hdr->cmdp[0] = 0x8e; // write and verify(16) + break; + // BYTCHK is disabled by virtue of the memset in sgio_hdr_init + case FIO_SG_WRITE_SAME: + hdr->dxfer_len = sd->bs; + if (lba < MAX_10B_LBA) + hdr->cmdp[0] = 0x41; // write same(10) + else + hdr->cmdp[0] = 0x93; // write same(16) + break; + }; + + fio_sgio_rw_lba(hdr, lba, nr_blocks); + + } else if (io_u->ddir == DDIR_TRIM) { + struct sgio_trim *st; + + if (sd->current_queue == -1) { + sgio_hdr_init(sd, hdr, io_u, 0); + + hdr->cmd_len = 10; + hdr->dxfer_direction = SG_DXFER_TO_DEV; + hdr->cmdp[0] = 0x42; // unmap + sd->current_queue = io_u->index; + st = sd->trim_queues[sd->current_queue]; + hdr->dxferp = st->unmap_param; +#ifdef FIO_SGIO_DEBUG + assert(sd->trim_queues[io_u->index]->unmap_range_count == 0); + dprint(FD_IO, "sg: creating new queue based on io_u %d\n", io_u->index); +#endif + } + else + st = sd->trim_queues[sd->current_queue]; + + dprint(FD_IO, "sg: adding io_u %d to trim queue %d\n", io_u->index, sd->current_queue); + st->trim_io_us[st->unmap_range_count] = io_u; +#ifdef FIO_SGIO_DEBUG + sd->trim_queue_map[io_u->index] = sd->current_queue; +#endif + + offset = 8 + 16 * 
st->unmap_range_count; + sgio_set_be64(lba, &st->unmap_param[offset]); + sgio_set_be32((uint32_t) nr_blocks, &st->unmap_param[offset + 8]); + + st->unmap_range_count++; + + } else if (ddir_sync(io_u->ddir)) { + sgio_hdr_init(sd, hdr, io_u, 0); + hdr->dxfer_direction = SG_DXFER_NONE; + if (lba < MAX_10B_LBA) + hdr->cmdp[0] = 0x35; // synccache(10) + else + hdr->cmdp[0] = 0x91; // synccache(16) + } else + assert(0); + + return 0; +} + +static void fio_sgio_unmap_setup(struct sg_io_hdr *hdr, struct sgio_trim *st) +{ + uint16_t cnt = st->unmap_range_count * 16; + + hdr->dxfer_len = cnt + 8; + sgio_set_be16(cnt + 8, &hdr->cmdp[7]); + sgio_set_be16(cnt + 6, st->unmap_param); + sgio_set_be16(cnt, &st->unmap_param[2]); + + return; +} + +static enum fio_q_status fio_sgio_queue(struct thread_data *td, + struct io_u *io_u) +{ + struct sg_io_hdr *hdr = &io_u->hdr; + struct sgio_data *sd = td->io_ops_data; + int ret, do_sync = 0; + + fio_ro_check(td, io_u); + + if (sgio_unbuffered(td) || ddir_sync(io_u->ddir)) + do_sync = 1; + + if (io_u->ddir == DDIR_TRIM) { + if (do_sync || io_u->file->filetype == FIO_TYPE_BLOCK) { + struct sgio_trim *st = sd->trim_queues[sd->current_queue]; + + /* finish cdb setup for unmap because we are + ** doing unmap commands synchronously */ +#ifdef FIO_SGIO_DEBUG + assert(st->unmap_range_count == 1); + assert(io_u == st->trim_io_us[0]); +#endif + hdr = &io_u->hdr; + + fio_sgio_unmap_setup(hdr, st); + + st->unmap_range_count = 0; + sd->current_queue = -1; + } else + /* queue up trim ranges and submit in commit() */ + return FIO_Q_QUEUED; + } + + ret = fio_sgio_doio(td, io_u, do_sync); + + if (ret < 0) + io_u->error = errno; + else if (hdr->status) { + io_u->resid = hdr->resid; + io_u->error = EIO; + } else if (td->io_ops->commit != NULL) { + if (do_sync && !ddir_sync(io_u->ddir)) { + io_u_mark_submit(td, 1); + io_u_mark_complete(td, 1); + } else if (io_u->ddir == DDIR_READ || io_u->ddir == DDIR_WRITE) { + io_u_mark_submit(td, 1); + io_u_queued(td, io_u); + } + } + + if (io_u->error) { + td_verror(td, io_u->error, "xfer"); + return FIO_Q_COMPLETED; + } + + return ret; +} + +static int fio_sgio_commit(struct thread_data *td) +{ + struct sgio_data *sd = td->io_ops_data; + struct sgio_trim *st; + struct io_u *io_u; + struct sg_io_hdr *hdr; + struct timespec now; + unsigned int i; + int ret; + + if (sd->current_queue == -1) + return 0; + + st = sd->trim_queues[sd->current_queue]; + io_u = st->trim_io_us[0]; + hdr = &io_u->hdr; + + fio_sgio_unmap_setup(hdr, st); + + sd->current_queue = -1; + + ret = fio_sgio_rw_doio(td, io_u->file, io_u, 0); + + if (ret < 0 || hdr->status) { + int error; + + if (ret < 0) + error = errno; + else { + error = EIO; + ret = -EIO; + } + + for (i = 0; i < st->unmap_range_count; i++) { + st->trim_io_us[i]->error = error; + clear_io_u(td, st->trim_io_us[i]); + if (hdr->status) + st->trim_io_us[i]->resid = hdr->resid; + } + + td_verror(td, error, "xfer"); + return ret; + } + + if (fio_fill_issue_time(td)) { + fio_gettime(&now, NULL); + for (i = 0; i < st->unmap_range_count; i++) { + memcpy(&st->trim_io_us[i]->issue_time, &now, sizeof(now)); + io_u_queued(td, io_u); + } + } + io_u_mark_submit(td, st->unmap_range_count); + + return 0; +} + +static struct io_u *fio_sgio_event(struct thread_data *td, int event) +{ + struct sgio_data *sd = td->io_ops_data; + + return sd->events[event]; +} + +static int fio_sgio_read_capacity(struct thread_data *td, unsigned int *bs, + unsigned long long *max_lba) +{ + /* + * need to do read capacity operation w/o benefit of sd or 
+ * io_u structures, which are not initialized until later. + */ + struct sg_io_hdr hdr; + unsigned long long hlba; + unsigned int blksz = 0; + unsigned char cmd[16]; + unsigned char sb[64]; + unsigned char buf[32]; // read capacity return + int ret; + int fd = -1; + + struct fio_file *f = td->files[0]; + + /* open file independent of rest of application */ + fd = open(f->file_name, O_RDONLY); + if (fd < 0) + return -errno; + + memset(&hdr, 0, sizeof(hdr)); + memset(cmd, 0, sizeof(cmd)); + memset(sb, 0, sizeof(sb)); + memset(buf, 0, sizeof(buf)); + + /* First let's try a 10 byte read capacity. */ + hdr.interface_id = 'S'; + hdr.cmdp = cmd; + hdr.cmd_len = 10; + hdr.sbp = sb; + hdr.mx_sb_len = sizeof(sb); + hdr.timeout = SCSI_TIMEOUT_MS; + hdr.cmdp[0] = 0x25; // Read Capacity(10) + hdr.dxfer_direction = SG_DXFER_FROM_DEV; + hdr.dxferp = buf; + hdr.dxfer_len = sizeof(buf); + + ret = ioctl(fd, SG_IO, &hdr); + if (ret < 0) { + close(fd); + return ret; + } + + if (hdr.info & SG_INFO_CHECK) { + /* RCAP(10) might be unsupported by device. Force RCAP(16) */ + hlba = MAX_10B_LBA; + } else { + blksz = sgio_get_be32(&buf[4]); + hlba = sgio_get_be32(buf); + } + + /* + * If max lba masked by MAX_10B_LBA equals MAX_10B_LBA, + * then need to retry with 16 byte Read Capacity command. + */ + if (hlba == MAX_10B_LBA) { + hdr.cmd_len = 16; + hdr.cmdp[0] = 0x9e; // service action + hdr.cmdp[1] = 0x10; // Read Capacity(16) + sgio_set_be32(sizeof(buf), &hdr.cmdp[10]); + + hdr.dxfer_direction = SG_DXFER_FROM_DEV; + hdr.dxferp = buf; + hdr.dxfer_len = sizeof(buf); + + ret = ioctl(fd, SG_IO, &hdr); + if (ret < 0) { + close(fd); + return ret; + } + + /* record if an io error occurred */ + if (hdr.info & SG_INFO_CHECK) + td_verror(td, EIO, "fio_sgio_read_capacity"); + + blksz = sgio_get_be32(&buf[8]); + hlba = sgio_get_be64(buf); + } + + if (blksz) { + *bs = blksz; + *max_lba = hlba; + ret = 0; + } else { + ret = EIO; + } + + close(fd); + return ret; +} + +static void fio_sgio_cleanup(struct thread_data *td) +{ + struct sgio_data *sd = td->io_ops_data; + int i; + + if (sd) { + free(sd->events); + free(sd->cmds); + free(sd->fd_flags); + free(sd->pfds); + free(sd->sgbuf); +#ifdef FIO_SGIO_DEBUG + free(sd->trim_queue_map); +#endif + + for (i = 0; i < td->o.iodepth; i++) { + free(sd->trim_queues[i]->unmap_param); + free(sd->trim_queues[i]->trim_io_us); + free(sd->trim_queues[i]); + } + + free(sd->trim_queues); + free(sd); + } +} + +static int fio_sgio_init(struct thread_data *td) +{ + struct sgio_data *sd; + struct sgio_trim *st; + int i; + + sd = calloc(1, sizeof(*sd)); + sd->cmds = calloc(td->o.iodepth, sizeof(struct sgio_cmd)); + sd->sgbuf = calloc(td->o.iodepth, sizeof(struct sg_io_hdr)); + sd->events = calloc(td->o.iodepth, sizeof(struct io_u *)); + sd->pfds = calloc(td->o.nr_files, sizeof(struct pollfd)); + sd->fd_flags = calloc(td->o.nr_files, sizeof(int)); + sd->type_checked = 0; + + sd->trim_queues = calloc(td->o.iodepth, sizeof(struct sgio_trim *)); + sd->current_queue = -1; +#ifdef FIO_SGIO_DEBUG + sd->trim_queue_map = calloc(td->o.iodepth, sizeof(int)); +#endif + for (i = 0; i < td->o.iodepth; i++) { + sd->trim_queues[i] = calloc(1, sizeof(struct sgio_trim)); + st = sd->trim_queues[i]; + st->unmap_param = calloc(td->o.iodepth + 1, sizeof(char[16])); + st->unmap_range_count = 0; + st->trim_io_us = calloc(td->o.iodepth, sizeof(struct io_u *)); + } + + td->io_ops_data = sd; + + /* + * we want to do it, regardless of whether odirect is set or not + */ + td->o.override_sync = 1; + return 0; +} + +static int 
fio_sgio_type_check(struct thread_data *td, struct fio_file *f) +{ + struct sgio_data *sd = td->io_ops_data; + unsigned int bs = 0; + unsigned long long max_lba = 0; + + if (f->filetype == FIO_TYPE_BLOCK) { + if (ioctl(f->fd, BLKSSZGET, &bs) < 0) { + td_verror(td, errno, "ioctl"); + return 1; + } + } else if (f->filetype == FIO_TYPE_CHAR) { + int version, ret; + + if (ioctl(f->fd, SG_GET_VERSION_NUM, &version) < 0) { + td_verror(td, errno, "ioctl"); + return 1; + } + + ret = fio_sgio_read_capacity(td, &bs, &max_lba); + if (ret) { + td_verror(td, td->error, "fio_sgio_read_capacity"); + log_err("ioengine sg unable to read capacity successfully\n"); + return 1; + } + } else { + td_verror(td, EINVAL, "wrong file type"); + log_err("ioengine sg only works on block or character devices\n"); + return 1; + } + + sd->bs = bs; + // Determine size of commands needed based on max_lba + if (max_lba >= MAX_10B_LBA) { + dprint(FD_IO, "sgio_type_check: using 16 byte read/write " + "commands for lba above 0x%016llx/0x%016llx\n", + MAX_10B_LBA, max_lba); + } + + if (f->filetype == FIO_TYPE_BLOCK) { + td->io_ops->getevents = NULL; + td->io_ops->event = NULL; + td->io_ops->commit = NULL; + /* + ** Setting these functions to null may cause problems + ** with filename=/dev/sda:/dev/sg0 since we are only + ** considering a single file + */ + } + sd->type_checked = 1; + + return 0; +} + +static int fio_sgio_open(struct thread_data *td, struct fio_file *f) +{ + struct sgio_data *sd = td->io_ops_data; + int ret; + + ret = generic_open_file(td, f); + if (ret) + return ret; + + if (sd && !sd->type_checked && fio_sgio_type_check(td, f)) { + ret = generic_close_file(td, f); + return 1; + } + + return 0; +} + +/* + * Build an error string with details about the driver, host or scsi + * error contained in the sg header Caller will use as necessary. + */ +static char *fio_sgio_errdetails(struct io_u *io_u) +{ + struct sg_io_hdr *hdr = &io_u->hdr; +#define MAXERRDETAIL 1024 +#define MAXMSGCHUNK 128 + char *msg, msgchunk[MAXMSGCHUNK]; + int i; + + msg = calloc(1, MAXERRDETAIL); + strcpy(msg, ""); + + /* + * can't seem to find sg_err.h, so I'll just echo the define values + * so others can search on internet to find clearer clues of meaning. 
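+ * E.g. (values mirrored from the switches below): host_status 0x01 decodes
+ * to SG_ERR_DID_NO_CONNECT, and a driver_status low nibble of 0x08 decodes
+ * to SG_ERR_DRIVER_SENSE.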
+ */ + if (hdr->info & SG_INFO_CHECK) { + if (hdr->host_status) { + snprintf(msgchunk, MAXMSGCHUNK, "SG Host Status: 0x%02x; ", hdr->host_status); + strlcat(msg, msgchunk, MAXERRDETAIL); + switch (hdr->host_status) { + case 0x01: + strlcat(msg, "SG_ERR_DID_NO_CONNECT", MAXERRDETAIL); + break; + case 0x02: + strlcat(msg, "SG_ERR_DID_BUS_BUSY", MAXERRDETAIL); + break; + case 0x03: + strlcat(msg, "SG_ERR_DID_TIME_OUT", MAXERRDETAIL); + break; + case 0x04: + strlcat(msg, "SG_ERR_DID_BAD_TARGET", MAXERRDETAIL); + break; + case 0x05: + strlcat(msg, "SG_ERR_DID_ABORT", MAXERRDETAIL); + break; + case 0x06: + strlcat(msg, "SG_ERR_DID_PARITY", MAXERRDETAIL); + break; + case 0x07: + strlcat(msg, "SG_ERR_DID_ERROR (internal error)", MAXERRDETAIL); + break; + case 0x08: + strlcat(msg, "SG_ERR_DID_RESET", MAXERRDETAIL); + break; + case 0x09: + strlcat(msg, "SG_ERR_DID_BAD_INTR (unexpected)", MAXERRDETAIL); + break; + case 0x0a: + strlcat(msg, "SG_ERR_DID_PASSTHROUGH", MAXERRDETAIL); + break; + case 0x0b: + strlcat(msg, "SG_ERR_DID_SOFT_ERROR (driver retry?)", MAXERRDETAIL); + break; + case 0x0c: + strlcat(msg, "SG_ERR_DID_IMM_RETRY", MAXERRDETAIL); + break; + case 0x0d: + strlcat(msg, "SG_ERR_DID_REQUEUE", MAXERRDETAIL); + break; + case 0x0e: + strlcat(msg, "SG_ERR_DID_TRANSPORT_DISRUPTED", MAXERRDETAIL); + break; + case 0x0f: + strlcat(msg, "SG_ERR_DID_TRANSPORT_FAILFAST", MAXERRDETAIL); + break; + case 0x10: + strlcat(msg, "SG_ERR_DID_TARGET_FAILURE", MAXERRDETAIL); + break; + case 0x11: + strlcat(msg, "SG_ERR_DID_NEXUS_FAILURE", MAXERRDETAIL); + break; + case 0x12: + strlcat(msg, "SG_ERR_DID_ALLOC_FAILURE", MAXERRDETAIL); + break; + case 0x13: + strlcat(msg, "SG_ERR_DID_MEDIUM_ERROR", MAXERRDETAIL); + break; + default: + strlcat(msg, "Unknown", MAXERRDETAIL); + break; + } + strlcat(msg, ". ", MAXERRDETAIL); + } + if (hdr->driver_status) { + snprintf(msgchunk, MAXMSGCHUNK, "SG Driver Status: 0x%02x; ", hdr->driver_status); + strlcat(msg, msgchunk, MAXERRDETAIL); + switch (hdr->driver_status & 0x0F) { + case 0x01: + strlcat(msg, "SG_ERR_DRIVER_BUSY", MAXERRDETAIL); + break; + case 0x02: + strlcat(msg, "SG_ERR_DRIVER_SOFT", MAXERRDETAIL); + break; + case 0x03: + strlcat(msg, "SG_ERR_DRIVER_MEDIA", MAXERRDETAIL); + break; + case 0x04: + strlcat(msg, "SG_ERR_DRIVER_ERROR", MAXERRDETAIL); + break; + case 0x05: + strlcat(msg, "SG_ERR_DRIVER_INVALID", MAXERRDETAIL); + break; + case 0x06: + strlcat(msg, "SG_ERR_DRIVER_TIMEOUT", MAXERRDETAIL); + break; + case 0x07: + strlcat(msg, "SG_ERR_DRIVER_HARD", MAXERRDETAIL); + break; + case 0x08: + strlcat(msg, "SG_ERR_DRIVER_SENSE", MAXERRDETAIL); + break; + default: + strlcat(msg, "Unknown", MAXERRDETAIL); + break; + } + strlcat(msg, "; ", MAXERRDETAIL); + switch (hdr->driver_status & 0xF0) { + case 0x10: + strlcat(msg, "SG_ERR_SUGGEST_RETRY", MAXERRDETAIL); + break; + case 0x20: + strlcat(msg, "SG_ERR_SUGGEST_ABORT", MAXERRDETAIL); + break; + case 0x30: + strlcat(msg, "SG_ERR_SUGGEST_REMAP", MAXERRDETAIL); + break; + case 0x40: + strlcat(msg, "SG_ERR_SUGGEST_DIE", MAXERRDETAIL); + break; + case 0x80: + strlcat(msg, "SG_ERR_SUGGEST_SENSE", MAXERRDETAIL); + break; + } + strlcat(msg, ". 
", MAXERRDETAIL); + } + if (hdr->status) { + snprintf(msgchunk, MAXMSGCHUNK, "SG SCSI Status: 0x%02x; ", hdr->status); + strlcat(msg, msgchunk, MAXERRDETAIL); + // SCSI 3 status codes + switch (hdr->status) { + case 0x02: + strlcat(msg, "CHECK_CONDITION", MAXERRDETAIL); + break; + case 0x04: + strlcat(msg, "CONDITION_MET", MAXERRDETAIL); + break; + case 0x08: + strlcat(msg, "BUSY", MAXERRDETAIL); + break; + case 0x10: + strlcat(msg, "INTERMEDIATE", MAXERRDETAIL); + break; + case 0x14: + strlcat(msg, "INTERMEDIATE_CONDITION_MET", MAXERRDETAIL); + break; + case 0x18: + strlcat(msg, "RESERVATION_CONFLICT", MAXERRDETAIL); + break; + case 0x22: + strlcat(msg, "COMMAND_TERMINATED", MAXERRDETAIL); + break; + case 0x28: + strlcat(msg, "TASK_SET_FULL", MAXERRDETAIL); + break; + case 0x30: + strlcat(msg, "ACA_ACTIVE", MAXERRDETAIL); + break; + case 0x40: + strlcat(msg, "TASK_ABORTED", MAXERRDETAIL); + break; + default: + strlcat(msg, "Unknown", MAXERRDETAIL); + break; + } + strlcat(msg, ". ", MAXERRDETAIL); + } + if (hdr->sb_len_wr) { + snprintf(msgchunk, MAXMSGCHUNK, "Sense Data (%d bytes):", hdr->sb_len_wr); + strlcat(msg, msgchunk, MAXERRDETAIL); + for (i = 0; i < hdr->sb_len_wr; i++) { + snprintf(msgchunk, MAXMSGCHUNK, " %02x", hdr->sbp[i]); + strlcat(msg, msgchunk, MAXERRDETAIL); + } + strlcat(msg, ". ", MAXERRDETAIL); + } + if (hdr->resid != 0) { + snprintf(msgchunk, MAXMSGCHUNK, "SG Driver: %d bytes out of %d not transferred. ", hdr->resid, hdr->dxfer_len); + strlcat(msg, msgchunk, MAXERRDETAIL); + } + if (hdr->cmdp) { + strlcat(msg, "cdb:", MAXERRDETAIL); + for (i = 0; i < hdr->cmd_len; i++) { + snprintf(msgchunk, MAXMSGCHUNK, " %02x", hdr->cmdp[i]); + strlcat(msg, msgchunk, MAXERRDETAIL); + } + strlcat(msg, ". ", MAXERRDETAIL); + if (io_u->ddir == DDIR_TRIM) { + unsigned char *param_list = hdr->dxferp; + strlcat(msg, "dxferp:", MAXERRDETAIL); + for (i = 0; i < hdr->dxfer_len; i++) { + snprintf(msgchunk, MAXMSGCHUNK, " %02x", param_list[i]); + strlcat(msg, msgchunk, MAXERRDETAIL); + } + strlcat(msg, ". ", MAXERRDETAIL); + } + } + } + + if (!(hdr->info & SG_INFO_CHECK) && !strlen(msg)) + snprintf(msg, MAXERRDETAIL, "%s", + "SG Driver did not report a Host, Driver or Device check"); + + return msg; +} + +/* + * get max file size from read capacity. + */ +static int fio_sgio_get_file_size(struct thread_data *td, struct fio_file *f) +{ + /* + * get_file_size is being called even before sgio_init is + * called, so none of the sg_io structures are + * initialized in the thread_data yet. So we need to do the + * ReadCapacity without any of those helpers. One of the effects + * is that ReadCapacity may get called 4 times on each open: + * readcap(10) followed by readcap(16) if needed - just to get + * the file size after the init occurs - it will be called + * again when "type_check" is called during structure + * initialization I'm not sure how to prevent this little + * inefficiency. 
+ */ + unsigned int bs = 0; + unsigned long long max_lba = 0; + int ret; + + if (fio_file_size_known(f)) + return 0; + + if (f->filetype != FIO_TYPE_BLOCK && f->filetype != FIO_TYPE_CHAR) { + td_verror(td, EINVAL, "wrong file type"); + log_err("ioengine sg only works on block or character devices\n"); + return 1; + } + + ret = fio_sgio_read_capacity(td, &bs, &max_lba); + if (ret) { + td_verror(td, td->error, "fio_sgio_read_capacity"); + log_err("ioengine sg unable to successfully execute read capacity to get block size and maximum lba\n"); + return 1; + } + + f->real_file_size = (max_lba + 1) * bs; + fio_file_set_size_known(f); + return 0; +} + + +static struct ioengine_ops ioengine = { + .name = "sg", + .version = FIO_IOOPS_VERSION, + .init = fio_sgio_init, + .prep = fio_sgio_prep, + .queue = fio_sgio_queue, + .commit = fio_sgio_commit, + .getevents = fio_sgio_getevents, + .errdetails = fio_sgio_errdetails, + .event = fio_sgio_event, + .cleanup = fio_sgio_cleanup, + .open_file = fio_sgio_open, + .close_file = generic_close_file, + .get_file_size = fio_sgio_get_file_size, + .flags = FIO_SYNCIO | FIO_RAWIO, + .options = options, + .option_struct_size = sizeof(struct sg_options) +}; + +#else /* FIO_HAVE_SGIO */ + +/* + * When we have a proper configure system in place, we simply won't build + * and install this io engine. For now install a crippled version that + * just complains and fails to load. + */ +static int fio_sgio_init(struct thread_data fio_unused *td) +{ + log_err("fio: ioengine sg not available\n"); + return 1; +} + +static struct ioengine_ops ioengine = { + .name = "sg", + .version = FIO_IOOPS_VERSION, + .init = fio_sgio_init, +}; + +#endif + +static void fio_init fio_sgio_register(void) +{ + register_ioengine(&ioengine); +} + +static void fio_exit fio_sgio_unregister(void) +{ + unregister_ioengine(&ioengine); +} diff --git a/engines/skeleton_external.c b/engines/skeleton_external.c new file mode 100644 index 0000000..1b6625b --- /dev/null +++ b/engines/skeleton_external.c @@ -0,0 +1,174 @@ +/* + * Skeleton for a sample external io engine + * + * Should be compiled with: + * + * gcc -Wall -O2 -g -D_GNU_SOURCE -include ../config-host.h -shared -rdynamic -fPIC -o skeleton_external.o skeleton_external.c + * (also requires -D_GNU_SOURCE -DCONFIG_STRSEP on Linux) + * + */ +#include <stdio.h> +#include <stdlib.h> +#include <unistd.h> +#include <errno.h> +#include <assert.h> + +#include "../fio.h" +#include "../optgroup.h" + +/* + * The core of the module is identical to the ones included with fio, + * read those. You cannot use register_ioengine() and unregister_ioengine() + * for external modules; they are instead resolved through dlsym() + */ + +/* + * The io engine can define its own options within the io engine source. + * The option member must not be at offset 0, due to the way fio parses + * the given option. Just add a padding pointer unless the io engine has + * something usable. + */ +struct fio_skeleton_options { + void *pad; /* avoid ->off1 of fio_option becoming 0 */ + unsigned int dummy; +}; + +static struct fio_option options[] = { + { + .name = "dummy", + .lname = "ldummy", + .type = FIO_OPT_STR_SET, + .off1 = offsetof(struct fio_skeleton_options, dummy), + .help = "Set dummy", + .category = FIO_OPT_C_ENGINE, /* always use this */ + .group = FIO_OPT_G_INVALID, /* this can be different */ + }, + { + .name = NULL, + }, +}; + +/* + * The ->event() hook is called to match an event number with an io_u.
+ * After the core has called ->getevents() and it has returned eg 3, + * the ->event() hook must return the 3 events that have completed for + * subsequent calls to ->event() with [0-2]. Required. + */ +static struct io_u *fio_skeleton_event(struct thread_data *td, int event) +{ + return NULL; +} + +/* + * The ->getevents() hook is used to reap completion events from an async + * io engine. It returns the number of completed events since the last call, + * which may then be retrieved by calling the ->event() hook with the event + * numbers. Required. + */ +static int fio_skeleton_getevents(struct thread_data *td, unsigned int min, + unsigned int max, const struct timespec *t) +{ + return 0; +} + +/* + * The ->cancel() hook attempts to cancel the io_u. Only relevant for + * async io engines, and need not be supported. + */ +static int fio_skeleton_cancel(struct thread_data *td, struct io_u *io_u) +{ + return 0; +} + +/* + * The ->queue() hook is responsible for initiating io on the io_u + * being passed in. If the io engine is a synchronous one, io may complete + * before ->queue() returns. Required. + * + * The io engine must transfer in the direction noted by io_u->ddir + * to the buffer pointed to by io_u->xfer_buf for as many bytes as + * io_u->xfer_buflen. Residual data count may be set in io_u->resid + * for a short read/write. + */ +static enum fio_q_status fio_skeleton_queue(struct thread_data *td, + struct io_u *io_u) +{ + /* + * Double sanity check to catch errant write on a readonly setup + */ + fio_ro_check(td, io_u); + + /* + * Could return FIO_Q_QUEUED for a queued request, + * FIO_Q_COMPLETED for a completed request, and FIO_Q_BUSY + * if we could queue no more at this point (you'd have to + * define ->commit() to handle that). + */ + return FIO_Q_COMPLETED; +} + +/* + * The ->prep() function is called for each io_u prior to being submitted + * with ->queue(). This hook allows the io engine to perform any + * preparatory actions on the io_u, before being submitted. Not required. + */ +static int fio_skeleton_prep(struct thread_data *td, struct io_u *io_u) +{ + return 0; +} + +/* + * The init function is called once per thread/process, and should set up + * any structures that this io engine requires to keep track of io. Not + * required. + */ +static int fio_skeleton_init(struct thread_data *td) +{ + return 0; +} + +/* + * This is paired with the ->init() function and is called when a thread is + * done doing io. Should tear down anything set up by the ->init() function. + * Not required. + */ +static void fio_skeleton_cleanup(struct thread_data *td) +{ +} + +/* + * Hook for opening the given file. Unless the engine has special + * needs, it usually just provides generic_open_file() as the handler. + */ +static int fio_skeleton_open(struct thread_data *td, struct fio_file *f) +{ + return generic_open_file(td, f); +} + +/* + * Hook for closing a file. See fio_skeleton_open(). + */ +static int fio_skeleton_close(struct thread_data *td, struct fio_file *f) +{ + return generic_close_file(td, f); +} + +/* + * Note that the structure is exported, so that fio can get it via + * dlsym(..., "ioengine"); for (and only for) external engines.
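+ * + * For example, an engine built as described at the top of this file + * could be selected at runtime with the "external:" prefix, along the + * lines of: + * + * fio --name=ext-test --ioengine=external:./skeleton_external.o --size=4m --rw=read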
+ */ +struct ioengine_ops ioengine = { + .name = "engine_name", + .version = FIO_IOOPS_VERSION, + .init = fio_skeleton_init, + .prep = fio_skeleton_prep, + .queue = fio_skeleton_queue, + .cancel = fio_skeleton_cancel, + .getevents = fio_skeleton_getevents, + .event = fio_skeleton_event, + .cleanup = fio_skeleton_cleanup, + .open_file = fio_skeleton_open, + .close_file = fio_skeleton_close, + .options = options, + .option_struct_size = sizeof(struct fio_skeleton_options), +}; diff --git a/engines/solarisaio.c b/engines/solarisaio.c new file mode 100644 index 0000000..21e9593 --- /dev/null +++ b/engines/solarisaio.c @@ -0,0 +1,234 @@ +/* + * Native Solaris async IO engine + * + */ +#include <stdio.h> +#include <stdlib.h> +#include <unistd.h> +#include <errno.h> +#include <assert.h> + +#include "../fio.h" + +#include <sys/asynch.h> + +struct solarisaio_data { + struct io_u **aio_events; + unsigned int aio_pending; + unsigned int nr; + unsigned int max_depth; +}; + +static int fio_solarisaio_cancel(struct thread_data fio_unused *td, + struct io_u *io_u) +{ + return aiocancel(&io_u->resultp); +} + +static int fio_solarisaio_prep(struct thread_data fio_unused *td, + struct io_u *io_u) +{ + struct solarisaio_data *sd = td->io_ops_data; + + io_u->resultp.aio_return = AIO_INPROGRESS; + io_u->engine_data = sd; + return 0; +} + +static void wait_for_event(struct timeval *tv) +{ + struct solarisaio_data *sd; + struct io_u *io_u; + aio_result_t *res; + + res = aiowait(tv); + if (res == (aio_result_t *) -1) { + int err = errno; + + if (err != EINVAL) { + log_err("fio: solarisaio got %d in aiowait\n", err); + exit(err); + } + return; + } else if (!res) + return; + + io_u = container_of(res, struct io_u, resultp); + sd = io_u->engine_data; + + if (io_u->resultp.aio_return >= 0) { + io_u->resid = io_u->xfer_buflen - io_u->resultp.aio_return; + io_u->error = 0; + } else + io_u->error = io_u->resultp.aio_errno; + + /* + * For SIGIO, we need a write barrier between the two, so that + * the ->aio_pending store is seen after the ->aio_events store + */ + sd->aio_events[sd->aio_pending] = io_u; + write_barrier(); + sd->aio_pending++; + sd->nr--; +} + +static int fio_solarisaio_getevents(struct thread_data *td, unsigned int min, + unsigned int max, const struct timespec *t) +{ + struct solarisaio_data *sd = td->io_ops_data; + struct timeval tv; + int ret; + + if (!min || !t) { + tv.tv_sec = 0; + tv.tv_usec = 0; + } else { + tv.tv_sec = t->tv_sec; + tv.tv_usec = t->tv_nsec / 1000; + } + + while (sd->aio_pending < min) + wait_for_event(&tv); + + /* + * should be OK without locking, as int operations should be atomic + */ + ret = sd->aio_pending; + sd->aio_pending -= ret; + return ret; +} + +static struct io_u *fio_solarisaio_event(struct thread_data *td, int event) +{ + struct solarisaio_data *sd = td->io_ops_data; + + return sd->aio_events[event]; +} + +static enum fio_q_status fio_solarisaio_queue(struct thread_data fio_unused *td, + struct io_u *io_u) +{ + struct solarisaio_data *sd = td->io_ops_data; + struct fio_file *f = io_u->file; + off_t off; + int ret; + + fio_ro_check(td, io_u); + + if (io_u->ddir == DDIR_SYNC) { + if (sd->nr) + return FIO_Q_BUSY; + if (fsync(f->fd) < 0) + io_u->error = errno; + + return FIO_Q_COMPLETED; + } + + if (io_u->ddir == DDIR_DATASYNC) { + if (sd->nr) + return FIO_Q_BUSY; + if (fdatasync(f->fd) < 0) + io_u->error = errno; + + return FIO_Q_COMPLETED; + } + + if (sd->nr == sd->max_depth) + return FIO_Q_BUSY; + + off = io_u->offset; + if (io_u->ddir == DDIR_READ) + ret = aioread(f->fd, io_u->xfer_buf, io_u->xfer_buflen, off, + SEEK_SET,
&io_u->resultp); + else + ret = aiowrite(f->fd, io_u->xfer_buf, io_u->xfer_buflen, off, + SEEK_SET, &io_u->resultp); + if (ret) { + io_u->error = errno; + td_verror(td, io_u->error, "xfer"); + return FIO_Q_COMPLETED; + } + + sd->nr++; + return FIO_Q_QUEUED; +} + +static void fio_solarisaio_cleanup(struct thread_data *td) +{ + struct solarisaio_data *sd = td->io_ops_data; + + if (sd) { + free(sd->aio_events); + free(sd); + } +} + +/* + * Set USE_SIGNAL_COMPLETIONS to use SIGIO as completion events. + */ +#ifdef USE_SIGNAL_COMPLETIONS +static void fio_solarisaio_sigio(int sig) +{ + wait_for_event(NULL); +} + +static void fio_solarisaio_init_sigio(void) +{ + struct sigaction act; + + memset(&act, 0, sizeof(act)); + act.sa_handler = fio_solarisaio_sigio; + act.sa_flags = SA_RESTART; + sigaction(SIGIO, &act, NULL); +} +#endif + +static int fio_solarisaio_init(struct thread_data *td) +{ + struct solarisaio_data *sd = malloc(sizeof(*sd)); + unsigned int max_depth; + + max_depth = td->o.iodepth; + if (max_depth > MAXASYNCHIO) { + max_depth = MAXASYNCHIO; + log_info("fio: lower depth to %d due to OS constraints\n", + max_depth); + } + + memset(sd, 0, sizeof(*sd)); + sd->aio_events = malloc(max_depth * sizeof(struct io_u *)); + memset(sd->aio_events, 0, max_depth * sizeof(struct io_u *)); + sd->max_depth = max_depth; + +#ifdef USE_SIGNAL_COMPLETIONS + fio_solarisaio_init_sigio(); +#endif + + td->io_ops_data = sd; + return 0; +} + +static struct ioengine_ops ioengine = { + .name = "solarisaio", + .version = FIO_IOOPS_VERSION, + .init = fio_solarisaio_init, + .prep = fio_solarisaio_prep, + .queue = fio_solarisaio_queue, + .cancel = fio_solarisaio_cancel, + .getevents = fio_solarisaio_getevents, + .event = fio_solarisaio_event, + .cleanup = fio_solarisaio_cleanup, + .open_file = generic_open_file, + .close_file = generic_close_file, + .get_file_size = generic_get_file_size, +}; + +static void fio_init fio_solarisaio_register(void) +{ + register_ioengine(&ioengine); +} + +static void fio_exit fio_solarisaio_unregister(void) +{ + unregister_ioengine(&ioengine); +} diff --git a/engines/splice.c b/engines/splice.c new file mode 100644 index 0000000..6fc36bb --- /dev/null +++ b/engines/splice.c @@ -0,0 +1,305 @@ +/* + * splice engine + * + * IO engine that transfers data by doing splices to/from pipes and + * the files. + * + */ +#include <stdio.h> +#include <stdlib.h> +#include <unistd.h> +#include <errno.h> +#include <poll.h> +#include <sys/mman.h> + +#include "../fio.h" + +struct spliceio_data { + int pipe[2]; + int vmsplice_to_user; + int vmsplice_to_user_map; +}; + +/* + * vmsplice didn't always support splicing to user space; this is the old + * variant of getting that job done. It doesn't make a lot of sense, but it + * uses splice to move data from the source into a pipe.
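+ * + * Roughly, each chunk takes a two-step copy, with the pipe acting as + * an in-kernel bounce buffer: + * + * file --splice()--> pipe --read()--> io_u->xfer_buf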
+ */ +static int fio_splice_read_old(struct thread_data *td, struct io_u *io_u) +{ + struct spliceio_data *sd = td->io_ops_data; + struct fio_file *f = io_u->file; + int ret, ret2, buflen; + off_t offset; + char *p; + + offset = io_u->offset; + buflen = io_u->xfer_buflen; + p = io_u->xfer_buf; + while (buflen) { + int this_len = buflen; + + if (this_len > SPLICE_DEF_SIZE) + this_len = SPLICE_DEF_SIZE; + + ret = splice(f->fd, &offset, sd->pipe[1], NULL, this_len, SPLICE_F_MORE); + if (ret < 0) { + if (errno == ENODATA || errno == EAGAIN) + continue; + + return -errno; + } + + buflen -= ret; + + while (ret) { + ret2 = read(sd->pipe[0], p, ret); + if (ret2 < 0) + return -errno; + + ret -= ret2; + p += ret2; + } + } + + return io_u->xfer_buflen; +} + +/* + * We can now vmsplice into userspace, so do the transfer by splicing into + * a pipe and vmsplicing that into userspace. + */ +static int fio_splice_read(struct thread_data *td, struct io_u *io_u) +{ + struct spliceio_data *sd = td->io_ops_data; + struct fio_file *f = io_u->file; + struct iovec iov; + int ret, buflen, mmap_len; + off_t offset; + void *map; + char *p; + + ret = 0; + offset = io_u->offset; + mmap_len = buflen = io_u->xfer_buflen; + + if (sd->vmsplice_to_user_map) { + map = mmap(io_u->xfer_buf, buflen, PROT_READ, MAP_PRIVATE|OS_MAP_ANON, 0, 0); + if (map == MAP_FAILED) { + td_verror(td, errno, "mmap io_u"); + return -1; + } + + p = map; + } else { + map = NULL; + p = io_u->xfer_buf; + } + + while (buflen) { + int this_len = buflen; + int flags = 0; + + if (this_len > SPLICE_DEF_SIZE) { + this_len = SPLICE_DEF_SIZE; + flags = SPLICE_F_MORE; + } + + ret = splice(f->fd, &offset, sd->pipe[1], NULL, this_len, flags); + if (ret < 0) { + if (errno == ENODATA || errno == EAGAIN) + continue; + + td_verror(td, errno, "splice-from-fd"); + break; + } + + buflen -= ret; + iov.iov_base = p; + iov.iov_len = ret; + + while (iov.iov_len) { + ret = vmsplice(sd->pipe[0], &iov, 1, SPLICE_F_MOVE); + if (ret < 0) { + if (errno == EFAULT && + sd->vmsplice_to_user_map) { + sd->vmsplice_to_user_map = 0; + munmap(map, mmap_len); + map = NULL; + p = io_u->xfer_buf; + iov.iov_base = p; + continue; + } + if (errno == EBADF) { + ret = -EBADF; + break; + } + td_verror(td, errno, "vmsplice"); + break; + } else if (!ret) { + td_verror(td, ENODATA, "vmsplice"); + ret = -1; + break; + } + + iov.iov_len -= ret; + iov.iov_base += ret; + p += ret; + } + if (ret < 0) + break; + } + + if (sd->vmsplice_to_user_map && munmap(map, mmap_len) < 0) { + td_verror(td, errno, "munmap io_u"); + return -1; + } + if (ret < 0) + return ret; + + return io_u->xfer_buflen; +} + +/* + * For splice writing, we can vmsplice our data buffer directly into a + * pipe and then splice that to a file.
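+ * + * This mirrors the read side, again with the pipe in the middle: + * + * io_u->xfer_buf --vmsplice()--> pipe --splice()--> file + * + * The poll(POLLOUT) below simply waits for pipe space before each + * vmsplice.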
+ */ +static int fio_splice_write(struct thread_data *td, struct io_u *io_u) +{ + struct spliceio_data *sd = td->io_ops_data; + struct iovec iov = { + .iov_base = io_u->xfer_buf, + .iov_len = io_u->xfer_buflen, + }; + struct pollfd pfd = { .fd = sd->pipe[1], .events = POLLOUT, }; + struct fio_file *f = io_u->file; + off_t off = io_u->offset; + int ret, ret2; + + while (iov.iov_len) { + if (poll(&pfd, 1, -1) < 0) + return errno; + + ret = vmsplice(sd->pipe[1], &iov, 1, SPLICE_F_NONBLOCK); + if (ret < 0) + return -errno; + + iov.iov_len -= ret; + iov.iov_base += ret; + + while (ret) { + ret2 = splice(sd->pipe[0], NULL, f->fd, &off, ret, 0); + if (ret2 < 0) + return -errno; + + ret -= ret2; + } + } + + return io_u->xfer_buflen; +} + +static enum fio_q_status fio_spliceio_queue(struct thread_data *td, + struct io_u *io_u) +{ + struct spliceio_data *sd = td->io_ops_data; + int ret = 0; + + fio_ro_check(td, io_u); + + if (io_u->ddir == DDIR_READ) { + if (sd->vmsplice_to_user) { + ret = fio_splice_read(td, io_u); + /* + * This kernel doesn't support vmsplice to user + * space. Reset the vmsplice_to_user flag, so that + * we retry below and don't hit this path again. + */ + if (ret == -EBADF) + sd->vmsplice_to_user = 0; + } + if (!sd->vmsplice_to_user) + ret = fio_splice_read_old(td, io_u); + } else if (io_u->ddir == DDIR_WRITE) + ret = fio_splice_write(td, io_u); + else if (io_u->ddir == DDIR_TRIM) + ret = do_io_u_trim(td, io_u); + else + ret = do_io_u_sync(td, io_u); + + if (ret != (int) io_u->xfer_buflen) { + if (ret >= 0) { + io_u->resid = io_u->xfer_buflen - ret; + io_u->error = 0; + return FIO_Q_COMPLETED; + } else + io_u->error = errno; + } + + if (io_u->error) { + td_verror(td, io_u->error, "xfer"); + if (io_u->error == EINVAL) + log_err("fio: looks like splice doesn't work on this" + " file system\n"); + } + + return FIO_Q_COMPLETED; +} + +static void fio_spliceio_cleanup(struct thread_data *td) +{ + struct spliceio_data *sd = td->io_ops_data; + + if (sd) { + close(sd->pipe[0]); + close(sd->pipe[1]); + free(sd); + } +} + +static int fio_spliceio_init(struct thread_data *td) +{ + struct spliceio_data *sd = malloc(sizeof(*sd)); + + if (pipe(sd->pipe) < 0) { + td_verror(td, errno, "pipe"); + free(sd); + return 1; + } + + /* + * Assume this works; we'll reset it if it doesn't + */ + sd->vmsplice_to_user = 1; + + /* + * Works with "real" vmsplice to user, eg mapping pages directly. + * Reset if we fail. + */ + sd->vmsplice_to_user_map = 1; + + td->io_ops_data = sd; + return 0; +} + +static struct ioengine_ops ioengine = { + .name = "splice", + .version = FIO_IOOPS_VERSION, + .init = fio_spliceio_init, + .queue = fio_spliceio_queue, + .cleanup = fio_spliceio_cleanup, + .open_file = generic_open_file, + .close_file = generic_close_file, + .get_file_size = generic_get_file_size, + .flags = FIO_SYNCIO | FIO_PIPEIO, +}; + +static void fio_init fio_spliceio_register(void) +{ + register_ioengine(&ioengine); +} + +static void fio_exit fio_spliceio_unregister(void) +{ + unregister_ioengine(&ioengine); +} diff --git a/engines/sync.c b/engines/sync.c new file mode 100644 index 0000000..65fd210 --- /dev/null +++ b/engines/sync.c @@ -0,0 +1,505 @@ +/* + * sync/psync engine + * + * IO engines that transfer data with regular read(2)/write(2) plus + * lseek(2) (sync), or with pread(2)/pwrite(2) (psync).
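+ * + * A minimal job using one of these engines might look something like: + * + * [seq-read] + * ioengine=psync + * rw=read + * size=1g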
+ * + */ +#include <stdio.h> +#include <stdlib.h> +#include <unistd.h> +#include <sys/uio.h> +#include <errno.h> + +#include "../fio.h" +#include "../optgroup.h" +#include "../lib/rand.h" + +/* + * Sync engine uses engine_data to store last offset + */ +#define LAST_POS(f) ((f)->engine_pos) + +struct syncio_data { + struct iovec *iovecs; + struct io_u **io_us; + unsigned int queued; + unsigned int events; + unsigned long queued_bytes; + + unsigned long long last_offset; + struct fio_file *last_file; + enum fio_ddir last_ddir; + + struct frand_state rand_state; +}; + +#ifdef FIO_HAVE_PWRITEV2 +struct psyncv2_options { + void *pad; + unsigned int hipri; + unsigned int hipri_percentage; + unsigned int uncached; +}; + +static struct fio_option options[] = { + { + .name = "hipri", + .lname = "RWF_HIPRI", + .type = FIO_OPT_STR_SET, + .off1 = offsetof(struct psyncv2_options, hipri), + .help = "Set RWF_HIPRI for pwritev2/preadv2", + .category = FIO_OPT_C_ENGINE, + .group = FIO_OPT_G_INVALID, + }, + { + .name = "hipri_percentage", + .lname = "RWF_HIPRI_PERCENTAGE", + .type = FIO_OPT_INT, + .off1 = offsetof(struct psyncv2_options, hipri_percentage), + .minval = 0, + .maxval = 100, + .def = "100", + .help = "Probabilistically set RWF_HIPRI for pwritev2/preadv2", + .category = FIO_OPT_C_ENGINE, + .group = FIO_OPT_G_INVALID, + }, + { + .name = "uncached", + .lname = "Uncached", + .type = FIO_OPT_INT, + .off1 = offsetof(struct psyncv2_options, uncached), + .help = "Use RWF_UNCACHED for buffered read/writes", + .category = FIO_OPT_C_ENGINE, + .group = FIO_OPT_G_INVALID, + }, + { + .name = NULL, + }, +}; +#endif + +static int fio_syncio_prep(struct thread_data *td, struct io_u *io_u) +{ + struct fio_file *f = io_u->file; + + if (!ddir_rw(io_u->ddir)) + return 0; + + if (LAST_POS(f) != -1ULL && LAST_POS(f) == io_u->offset) + return 0; + + if (lseek(f->fd, io_u->offset, SEEK_SET) == -1) { + td_verror(td, errno, "lseek"); + return 1; + } + + return 0; +} + +static int fio_io_end(struct thread_data *td, struct io_u *io_u, int ret) +{ + if (io_u->file && ret >= 0 && ddir_rw(io_u->ddir)) + LAST_POS(io_u->file) = io_u->offset + ret; + + if (ret != (int) io_u->xfer_buflen) { + if (ret >= 0) { + io_u->resid = io_u->xfer_buflen - ret; + io_u->error = 0; + return FIO_Q_COMPLETED; + } else + io_u->error = errno; + } + + if (io_u->error) { + io_u_log_error(td, io_u); + td_verror(td, io_u->error, "xfer"); + } + + return FIO_Q_COMPLETED; +} + +#ifdef CONFIG_PWRITEV +static enum fio_q_status fio_pvsyncio_queue(struct thread_data *td, + struct io_u *io_u) +{ + struct syncio_data *sd = td->io_ops_data; + struct iovec *iov = &sd->iovecs[0]; + struct fio_file *f = io_u->file; + int ret; + + fio_ro_check(td, io_u); + + iov->iov_base = io_u->xfer_buf; + iov->iov_len = io_u->xfer_buflen; + + if (io_u->ddir == DDIR_READ) + ret = preadv(f->fd, iov, 1, io_u->offset); + else if (io_u->ddir == DDIR_WRITE) + ret = pwritev(f->fd, iov, 1, io_u->offset); + else if (io_u->ddir == DDIR_TRIM) { + do_io_u_trim(td, io_u); + return FIO_Q_COMPLETED; + } else + ret = do_io_u_sync(td, io_u); + + return fio_io_end(td, io_u, ret); +} +#endif + +#ifdef FIO_HAVE_PWRITEV2 +static enum fio_q_status fio_pvsyncio2_queue(struct thread_data *td, + struct io_u *io_u) +{ + struct syncio_data *sd = td->io_ops_data; + struct psyncv2_options *o = td->eo; + struct iovec *iov = &sd->iovecs[0]; + struct fio_file *f = io_u->file; + int ret, flags = 0; + + fio_ro_check(td, io_u); + + if (o->hipri && + (rand_between(&sd->rand_state, 1, 100) <= o->hipri_percentage)) + flags |= RWF_HIPRI; + if (!td->o.odirect &&
o->uncached) + flags |= RWF_UNCACHED; + + iov->iov_base = io_u->xfer_buf; + iov->iov_len = io_u->xfer_buflen; + + if (io_u->ddir == DDIR_READ) + ret = preadv2(f->fd, iov, 1, io_u->offset, flags); + else if (io_u->ddir == DDIR_WRITE) + ret = pwritev2(f->fd, iov, 1, io_u->offset, flags); + else if (io_u->ddir == DDIR_TRIM) { + do_io_u_trim(td, io_u); + return FIO_Q_COMPLETED; + } else + ret = do_io_u_sync(td, io_u); + + return fio_io_end(td, io_u, ret); +} +#endif + +static enum fio_q_status fio_psyncio_queue(struct thread_data *td, + struct io_u *io_u) +{ + struct fio_file *f = io_u->file; + int ret; + + fio_ro_check(td, io_u); + + if (io_u->ddir == DDIR_READ) + ret = pread(f->fd, io_u->xfer_buf, io_u->xfer_buflen, io_u->offset); + else if (io_u->ddir == DDIR_WRITE) + ret = pwrite(f->fd, io_u->xfer_buf, io_u->xfer_buflen, io_u->offset); + else if (io_u->ddir == DDIR_TRIM) { + do_io_u_trim(td, io_u); + return FIO_Q_COMPLETED; + } else + ret = do_io_u_sync(td, io_u); + + return fio_io_end(td, io_u, ret); +} + +static enum fio_q_status fio_syncio_queue(struct thread_data *td, + struct io_u *io_u) +{ + struct fio_file *f = io_u->file; + int ret; + + fio_ro_check(td, io_u); + + if (io_u->ddir == DDIR_READ) + ret = read(f->fd, io_u->xfer_buf, io_u->xfer_buflen); + else if (io_u->ddir == DDIR_WRITE) + ret = write(f->fd, io_u->xfer_buf, io_u->xfer_buflen); + else if (io_u->ddir == DDIR_TRIM) { + do_io_u_trim(td, io_u); + return FIO_Q_COMPLETED; + } else + ret = do_io_u_sync(td, io_u); + + return fio_io_end(td, io_u, ret); +} + +static int fio_vsyncio_getevents(struct thread_data *td, unsigned int min, + unsigned int max, + const struct timespec fio_unused *t) +{ + struct syncio_data *sd = td->io_ops_data; + int ret; + + if (min) { + ret = sd->events; + sd->events = 0; + } else + ret = 0; + + dprint(FD_IO, "vsyncio_getevents: min=%d,max=%d: %d\n", min, max, ret); + return ret; +} + +static struct io_u *fio_vsyncio_event(struct thread_data *td, int event) +{ + struct syncio_data *sd = td->io_ops_data; + + return sd->io_us[event]; +} + +static int fio_vsyncio_append(struct thread_data *td, struct io_u *io_u) +{ + struct syncio_data *sd = td->io_ops_data; + + if (ddir_sync(io_u->ddir)) + return 0; + + if (io_u->offset == sd->last_offset && io_u->file == sd->last_file && + io_u->ddir == sd->last_ddir) + return 1; + + return 0; +} + +static void fio_vsyncio_set_iov(struct syncio_data *sd, struct io_u *io_u, + int idx) +{ + sd->io_us[idx] = io_u; + sd->iovecs[idx].iov_base = io_u->xfer_buf; + sd->iovecs[idx].iov_len = io_u->xfer_buflen; + sd->last_offset = io_u->offset + io_u->xfer_buflen; + sd->last_file = io_u->file; + sd->last_ddir = io_u->ddir; + sd->queued_bytes += io_u->xfer_buflen; + sd->queued++; +} + +static enum fio_q_status fio_vsyncio_queue(struct thread_data *td, + struct io_u *io_u) +{ + struct syncio_data *sd = td->io_ops_data; + + fio_ro_check(td, io_u); + + if (!fio_vsyncio_append(td, io_u)) { + dprint(FD_IO, "vsyncio_queue: no append (%d)\n", sd->queued); + /* + * If we can't append and have stuff queued, tell fio to + * commit those first and then retry this io + */ + if (sd->queued) + return FIO_Q_BUSY; + if (ddir_sync(io_u->ddir)) { + int ret = do_io_u_sync(td, io_u); + + return fio_io_end(td, io_u, ret); + } + + sd->queued = 0; + sd->queued_bytes = 0; + fio_vsyncio_set_iov(sd, io_u, 0); + } else { + if (sd->queued == td->o.iodepth) { + dprint(FD_IO, "vsyncio_queue: max depth %d\n", sd->queued); + return FIO_Q_BUSY; + } + + dprint(FD_IO, "vsyncio_queue: append\n"); + 
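/* + * The io_u continues the batch gathered so far (same file and + * direction, next sequential offset), so add it to the iovec + * array; fio_vsyncio_commit() later submits the whole batch + * with a single readv()/writev(). + */ +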
fio_vsyncio_set_iov(sd, io_u, sd->queued); + } + + dprint(FD_IO, "vsyncio_queue: depth now %d\n", sd->queued); + return FIO_Q_QUEUED; +} + +/* + * Check that we transferred all bytes, or saw an error, etc + */ +static int fio_vsyncio_end(struct thread_data *td, ssize_t bytes) +{ + struct syncio_data *sd = td->io_ops_data; + struct io_u *io_u; + unsigned int i; + int err; + + /* + * transferred everything, perfect + */ + if (bytes == sd->queued_bytes) + return 0; + + err = errno; + for (i = 0; i < sd->queued; i++) { + io_u = sd->io_us[i]; + + if (bytes == -1) { + io_u->error = err; + } else { + unsigned int this_io; + + this_io = bytes; + if (this_io > io_u->xfer_buflen) + this_io = io_u->xfer_buflen; + + io_u->resid = io_u->xfer_buflen - this_io; + io_u->error = 0; + bytes -= this_io; + } + } + + if (bytes == -1) { + td_verror(td, err, "xfer vsync"); + return -err; + } + + return 0; +} + +static int fio_vsyncio_commit(struct thread_data *td) +{ + struct syncio_data *sd = td->io_ops_data; + struct fio_file *f; + ssize_t ret; + + if (!sd->queued) + return 0; + + io_u_mark_submit(td, sd->queued); + f = sd->last_file; + + if (lseek(f->fd, sd->io_us[0]->offset, SEEK_SET) == -1) { + int err = -errno; + + td_verror(td, errno, "lseek"); + return err; + } + + if (sd->last_ddir == DDIR_READ) + ret = readv(f->fd, sd->iovecs, sd->queued); + else + ret = writev(f->fd, sd->iovecs, sd->queued); + + dprint(FD_IO, "vsyncio_commit: %d\n", (int) ret); + sd->events = sd->queued; + sd->queued = 0; + return fio_vsyncio_end(td, ret); +} + +static int fio_vsyncio_init(struct thread_data *td) +{ + struct syncio_data *sd; + + sd = malloc(sizeof(*sd)); + memset(sd, 0, sizeof(*sd)); + sd->last_offset = -1ULL; + sd->iovecs = malloc(td->o.iodepth * sizeof(struct iovec)); + sd->io_us = malloc(td->o.iodepth * sizeof(struct io_u *)); + init_rand(&sd->rand_state, 0); + + td->io_ops_data = sd; + return 0; +} + +static void fio_vsyncio_cleanup(struct thread_data *td) +{ + struct syncio_data *sd = td->io_ops_data; + + if (sd) { + free(sd->iovecs); + free(sd->io_us); + free(sd); + } +} + +static struct ioengine_ops ioengine_rw = { + .name = "sync", + .version = FIO_IOOPS_VERSION, + .prep = fio_syncio_prep, + .queue = fio_syncio_queue, + .open_file = generic_open_file, + .close_file = generic_close_file, + .get_file_size = generic_get_file_size, + .flags = FIO_SYNCIO, +}; + +static struct ioengine_ops ioengine_prw = { + .name = "psync", + .version = FIO_IOOPS_VERSION, + .queue = fio_psyncio_queue, + .open_file = generic_open_file, + .close_file = generic_close_file, + .get_file_size = generic_get_file_size, + .flags = FIO_SYNCIO, +}; + +static struct ioengine_ops ioengine_vrw = { + .name = "vsync", + .version = FIO_IOOPS_VERSION, + .init = fio_vsyncio_init, + .cleanup = fio_vsyncio_cleanup, + .queue = fio_vsyncio_queue, + .commit = fio_vsyncio_commit, + .event = fio_vsyncio_event, + .getevents = fio_vsyncio_getevents, + .open_file = generic_open_file, + .close_file = generic_close_file, + .get_file_size = generic_get_file_size, + .flags = FIO_SYNCIO, +}; + +#ifdef CONFIG_PWRITEV +static struct ioengine_ops ioengine_pvrw = { + .name = "pvsync", + .version = FIO_IOOPS_VERSION, + .init = fio_vsyncio_init, + .cleanup = fio_vsyncio_cleanup, + .queue = fio_pvsyncio_queue, + .open_file = generic_open_file, + .close_file = generic_close_file, + .get_file_size = generic_get_file_size, + .flags = FIO_SYNCIO, +}; +#endif + +#ifdef FIO_HAVE_PWRITEV2 +static struct ioengine_ops ioengine_pvrw2 = { + .name = "pvsync2", + .version = 
FIO_IOOPS_VERSION, + .init = fio_vsyncio_init, + .cleanup = fio_vsyncio_cleanup, + .queue = fio_pvsyncio2_queue, + .open_file = generic_open_file, + .close_file = generic_close_file, + .get_file_size = generic_get_file_size, + .flags = FIO_SYNCIO, + .options = options, + .option_struct_size = sizeof(struct psyncv2_options), +}; +#endif + +static void fio_init fio_syncio_register(void) +{ + register_ioengine(&ioengine_rw); + register_ioengine(&ioengine_prw); + register_ioengine(&ioengine_vrw); +#ifdef CONFIG_PWRITEV + register_ioengine(&ioengine_pvrw); +#endif +#ifdef FIO_HAVE_PWRITEV2 + register_ioengine(&ioengine_pvrw2); +#endif +} + +static void fio_exit fio_syncio_unregister(void) +{ + unregister_ioengine(&ioengine_rw); + unregister_ioengine(&ioengine_prw); + unregister_ioengine(&ioengine_vrw); +#ifdef CONFIG_PWRITEV + unregister_ioengine(&ioengine_pvrw); +#endif +#ifdef FIO_HAVE_PWRITEV2 + unregister_ioengine(&ioengine_pvrw2); +#endif +} diff --git a/engines/windowsaio.c b/engines/windowsaio.c new file mode 100644 index 0000000..13d7f19 --- /dev/null +++ b/engines/windowsaio.c @@ -0,0 +1,498 @@ +/* + * windowsaio engine + * + * IO engine using Windows IO Completion Ports. + */ + +#include <windows.h> +#include <winioctl.h> +#include <stdio.h> +#include <stdlib.h> +#include <errno.h> + +#include "../fio.h" + +typedef BOOL (WINAPI *CANCELIOEX)(HANDLE hFile, LPOVERLAPPED lpOverlapped); + +int geterrno_from_win_error(DWORD code, int deferrno); + +struct fio_overlapped { + OVERLAPPED o; + struct io_u *io_u; + BOOL io_complete; +}; + +struct windowsaio_data { + struct io_u **aio_events; + HANDLE iocp; + HANDLE iothread; + HANDLE iocomplete_event; + BOOL iothread_running; +}; + +struct thread_ctx { + HANDLE iocp; + struct windowsaio_data *wd; +}; + +static DWORD WINAPI IoCompletionRoutine(LPVOID lpParameter); + +static int fio_windowsaio_init(struct thread_data *td) +{ + struct windowsaio_data *wd; + int rc = 0; + + wd = calloc(1, sizeof(struct windowsaio_data)); + if (wd == NULL) { + log_err("windowsaio: failed to allocate memory for engine data\n"); + rc = 1; + } + + if (!rc) { + wd->aio_events = malloc(td->o.iodepth * sizeof(struct io_u*)); + if (wd->aio_events == NULL) { + log_err("windowsaio: failed to allocate memory for aio events list\n"); + rc = 1; + } + } + + if (!rc) { + /* Create an auto-reset event */ + wd->iocomplete_event = CreateEvent(NULL, FALSE, FALSE, NULL); + if (wd->iocomplete_event == NULL) { + log_err("windowsaio: failed to create io complete event handle\n"); + rc = 1; + } + } + + if (rc) { + if (wd != NULL) { + if (wd->aio_events != NULL) + free(wd->aio_events); + + free(wd); + } + } + + td->io_ops_data = wd; + + if (!rc) { + struct thread_ctx *ctx; + struct windowsaio_data *wd; + HANDLE hFile; + + hFile = CreateIoCompletionPort(INVALID_HANDLE_VALUE, NULL, 0, 0); + if (hFile == INVALID_HANDLE_VALUE) { + log_err("windowsaio: failed to create io completion port\n"); + rc = 1; + } + + wd = td->io_ops_data; + wd->iothread_running = TRUE; + wd->iocp = hFile; + + if (!rc) + ctx = malloc(sizeof(struct thread_ctx)); + + if (!rc && ctx == NULL) { + log_err("windowsaio: failed to allocate memory for thread context structure\n"); + CloseHandle(hFile); + rc = 1; + } + + if (!rc) { + DWORD threadid; + + ctx->iocp = hFile; + ctx->wd = wd; + wd->iothread = CreateThread(NULL, 0, IoCompletionRoutine, ctx, 0, &threadid); + + if (wd->iothread != NULL) + fio_setaffinity(threadid, td->o.cpumask); + else + log_err("windowsaio: failed to create io completion thread\n"); + } + + if (rc || wd->iothread == NULL) + rc = 1; + } + + return rc; +} +
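+/* + * Completion flow, in brief: fio_windowsaio_open_file() associates each + * file HANDLE with the completion port created above; completions for + * queued ReadFile()/WriteFile() calls are reaped by IoCompletionRoutine() + * (running in the thread spawned above), which marks the matching io_u + * complete and signals iocomplete_event so fio_windowsaio_getevents() + * can collect it. + */ +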
+static void fio_windowsaio_cleanup(struct thread_data *td) +{ + struct windowsaio_data *wd; + + wd = td->io_ops_data; + + if (wd != NULL) { + wd->iothread_running = FALSE; + WaitForSingleObject(wd->iothread, INFINITE); + + CloseHandle(wd->iothread); + CloseHandle(wd->iocomplete_event); + + free(wd->aio_events); + free(wd); + + td->io_ops_data = NULL; + } +} + +static int windowsaio_invalidate_cache(struct fio_file *f) +{ + DWORD error; + DWORD isharemode = (FILE_SHARE_DELETE | FILE_SHARE_READ | + FILE_SHARE_WRITE); + HANDLE ihFile; + int rc = 0; + + /* + * Encourage Windows to drop cached parts of a file by temporarily + * opening it for non-buffered access. Note: this only works if + * nothing else on the system has the file open. + */ + dprint(FD_IO, "windowsaio: attempting to invalidate cache for %s\n", + f->file_name); + ihFile = CreateFile(f->file_name, 0, isharemode, NULL, OPEN_EXISTING, + FILE_FLAG_NO_BUFFERING, NULL); + + if (ihFile != INVALID_HANDLE_VALUE) { + if (!CloseHandle(ihFile)) { + error = GetLastError(); + log_info("windowsaio: invalidation fd close %s " + "failed: error %d\n", f->file_name, error); + rc = 1; + } + } else { + error = GetLastError(); + if (error != ERROR_FILE_NOT_FOUND) { + log_info("windowsaio: cache invalidation of %s failed: " + "error %d\n", f->file_name, error); + rc = 1; + } + } + + return rc; +} + +static int fio_windowsaio_open_file(struct thread_data *td, struct fio_file *f) +{ + int rc = 0; + DWORD flags = FILE_FLAG_POSIX_SEMANTICS | FILE_FLAG_OVERLAPPED; + DWORD sharemode = FILE_SHARE_READ | FILE_SHARE_WRITE; + DWORD openmode = OPEN_ALWAYS; + DWORD access; + + dprint(FD_FILE, "fd open %s\n", f->file_name); + + if (f->filetype == FIO_TYPE_PIPE) { + log_err("windowsaio: pipes are not supported\n"); + return 1; + } + + if (!strcmp(f->file_name, "-")) { + log_err("windowsaio: can't read/write to stdin/out\n"); + return 1; + } + + if (td->o.odirect) + flags |= FILE_FLAG_NO_BUFFERING; + if (td->o.sync_io) + flags |= FILE_FLAG_WRITE_THROUGH; + + /* + * Inform Windows whether we're going to be doing sequential or + * random IO so it can tune the Cache Manager + */ + switch (td->o.fadvise_hint) { + case F_ADV_TYPE: + if (td_random(td)) + flags |= FILE_FLAG_RANDOM_ACCESS; + else + flags |= FILE_FLAG_SEQUENTIAL_SCAN; + break; + case F_ADV_RANDOM: + flags |= FILE_FLAG_RANDOM_ACCESS; + break; + case F_ADV_SEQUENTIAL: + flags |= FILE_FLAG_SEQUENTIAL_SCAN; + break; + case F_ADV_NONE: + break; + default: + log_err("fio: unknown fadvise type %d\n", td->o.fadvise_hint); + } + + if (!td_write(td) || read_only) + access = GENERIC_READ; + else + access = (GENERIC_READ | GENERIC_WRITE); + + if (td->o.create_on_open) + openmode = OPEN_ALWAYS; + else + openmode = OPEN_EXISTING; + + /* If we're going to use direct I/O, Windows will try to invalidate + * its cache at that point so there's no need to do it here */ + if (td->o.invalidate_cache && !td->o.odirect) + windowsaio_invalidate_cache(f); + + f->hFile = CreateFile(f->file_name, access, sharemode, + NULL, openmode, flags, NULL); + + if (f->hFile == INVALID_HANDLE_VALUE) { + log_err("windowsaio: failed to open file \"%s\"\n", f->file_name); + rc = 1; + } + + /* Only set up the completion port and thread if we're not just + * querying the device size */ + if (!rc && td->io_ops_data != NULL) { + struct windowsaio_data *wd; + + wd = td->io_ops_data; + + if (CreateIoCompletionPort(f->hFile, wd->iocp, 0, 0) == NULL) { + log_err("windowsaio: failed to create io completion port\n"); + rc = 1; +
} + } + + return rc; +} + +static int fio_windowsaio_close_file(struct thread_data fio_unused *td, struct fio_file *f) +{ + int rc = 0; + + dprint(FD_FILE, "fd close %s\n", f->file_name); + + if (f->hFile != INVALID_HANDLE_VALUE) { + if (!CloseHandle(f->hFile)) { + log_info("windowsaio: failed to close file handle for \"%s\"\n", f->file_name); + rc = 1; + } + } + + f->hFile = INVALID_HANDLE_VALUE; + return rc; +} + +static BOOL timeout_expired(DWORD start_count, DWORD end_count) +{ + BOOL expired = FALSE; + DWORD current_time; + + current_time = GetTickCount(); + + if ((end_count > start_count) && current_time >= end_count) + expired = TRUE; + else if (current_time < start_count && current_time > end_count) + expired = TRUE; + + return expired; +} + +static struct io_u* fio_windowsaio_event(struct thread_data *td, int event) +{ + struct windowsaio_data *wd = td->io_ops_data; + return wd->aio_events[event]; +} + +static int fio_windowsaio_getevents(struct thread_data *td, unsigned int min, + unsigned int max, + const struct timespec *t) +{ + struct windowsaio_data *wd = td->io_ops_data; + unsigned int dequeued = 0; + struct io_u *io_u; + int i; + struct fio_overlapped *fov; + DWORD start_count = 0; + DWORD end_count = 0; + DWORD status; + DWORD mswait = 250; + + if (t != NULL) { + mswait = (t->tv_sec * 1000) + (t->tv_nsec / 1000000); + start_count = GetTickCount(); + end_count = start_count + (t->tv_sec * 1000) + (t->tv_nsec / 1000000); + } + + do { + io_u_qiter(&td->io_u_all, io_u, i) { + if (!(io_u->flags & IO_U_F_FLIGHT)) + continue; + + fov = (struct fio_overlapped*)io_u->engine_data; + + if (fov->io_complete) { + fov->io_complete = FALSE; + wd->aio_events[dequeued] = io_u; + dequeued++; + } + + } + if (dequeued >= min) + break; + + if (dequeued < min) { + status = WaitForSingleObject(wd->iocomplete_event, mswait); + if (status != WAIT_OBJECT_0 && dequeued >= min) + break; + } + + if (dequeued >= min || + (t != NULL && timeout_expired(start_count, end_count))) + break; + } while (1); + + return dequeued; +} + +static enum fio_q_status fio_windowsaio_queue(struct thread_data *td, + struct io_u *io_u) +{ + struct fio_overlapped *o = io_u->engine_data; + LPOVERLAPPED lpOvl = &o->o; + BOOL success = FALSE; + int rc = FIO_Q_COMPLETED; + + fio_ro_check(td, io_u); + + lpOvl->Internal = 0; + lpOvl->InternalHigh = 0; + lpOvl->Offset = io_u->offset & 0xFFFFFFFF; + lpOvl->OffsetHigh = io_u->offset >> 32; + + switch (io_u->ddir) { + case DDIR_WRITE: + success = WriteFile(io_u->file->hFile, io_u->xfer_buf, + io_u->xfer_buflen, NULL, lpOvl); + break; + case DDIR_READ: + success = ReadFile(io_u->file->hFile, io_u->xfer_buf, + io_u->xfer_buflen, NULL, lpOvl); + break; + case DDIR_SYNC: + case DDIR_DATASYNC: + case DDIR_SYNC_FILE_RANGE: + success = FlushFileBuffers(io_u->file->hFile); + if (!success) { + log_err("windowsaio: failed to flush file buffers\n"); + io_u->error = win_to_posix_error(GetLastError()); + } + + return FIO_Q_COMPLETED; + case DDIR_TRIM: + log_err("windowsaio: manual TRIM isn't supported on Windows\n"); + io_u->error = 1; + io_u->resid = io_u->xfer_buflen; + return FIO_Q_COMPLETED; + default: + assert(0); + break; + } + + if (success || GetLastError() == ERROR_IO_PENDING) + rc = FIO_Q_QUEUED; + else { + io_u->error = win_to_posix_error(GetLastError()); + io_u->resid = io_u->xfer_buflen; + } + + return rc; +} + +/* Runs as a thread and waits for queued IO to complete */ +static DWORD WINAPI IoCompletionRoutine(LPVOID lpParameter) +{ + OVERLAPPED *ovl; + struct fio_overlapped *fov; + 
struct io_u *io_u; + struct windowsaio_data *wd; + struct thread_ctx *ctx; + ULONG_PTR ulKey = 0; + DWORD bytes; + + ctx = (struct thread_ctx*)lpParameter; + wd = ctx->wd; + + do { + BOOL ret; + + ret = GetQueuedCompletionStatus(ctx->iocp, &bytes, &ulKey, + &ovl, 250); + if (!ret && ovl == NULL) + continue; + + fov = CONTAINING_RECORD(ovl, struct fio_overlapped, o); + io_u = fov->io_u; + + if (ovl->Internal == ERROR_SUCCESS) { + io_u->resid = io_u->xfer_buflen - ovl->InternalHigh; + io_u->error = 0; + } else { + io_u->resid = io_u->xfer_buflen; + io_u->error = win_to_posix_error(GetLastError()); + } + + fov->io_complete = TRUE; + SetEvent(wd->iocomplete_event); + } while (ctx->wd->iothread_running); + + CloseHandle(ctx->iocp); + free(ctx); + return 0; +} + +static void fio_windowsaio_io_u_free(struct thread_data *td, struct io_u *io_u) +{ + struct fio_overlapped *o = io_u->engine_data; + + if (o) { + io_u->engine_data = NULL; + free(o); + } +} + +static int fio_windowsaio_io_u_init(struct thread_data *td, struct io_u *io_u) +{ + struct fio_overlapped *o; + + o = malloc(sizeof(*o)); + o->io_complete = FALSE; + o->io_u = io_u; + o->o.hEvent = NULL; + io_u->engine_data = o; + return 0; +} + +static struct ioengine_ops ioengine = { + .name = "windowsaio", + .version = FIO_IOOPS_VERSION, + .init = fio_windowsaio_init, + .queue = fio_windowsaio_queue, + .getevents = fio_windowsaio_getevents, + .event = fio_windowsaio_event, + .cleanup = fio_windowsaio_cleanup, + .open_file = fio_windowsaio_open_file, + .close_file = fio_windowsaio_close_file, + .get_file_size = generic_get_file_size, + .io_u_init = fio_windowsaio_io_u_init, + .io_u_free = fio_windowsaio_io_u_free, +}; + +static void fio_init fio_windowsaio_register(void) +{ + register_ioengine(&ioengine); +} + +static void fio_exit fio_windowsaio_unregister(void) +{ + unregister_ioengine(&ioengine); +} diff --git a/err.h b/err.h new file mode 100644 index 0000000..0765f1b --- /dev/null +++ b/err.h @@ -0,0 +1,44 @@ +#ifndef FIO_ERR_H +#define FIO_ERR_H + +/* + * Kernel pointers have redundant information, so we can use a + * scheme where we can return either an error code or a dentry + * pointer with the same return value. + * + * This should be a per-architecture thing, to allow different + * error and pointer decisions. 
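+ * + * Typical usage follows the kernel pattern; as a small sketch (get_foo() + * here is hypothetical): + * + * struct foo *f = get_foo(); + * if (IS_ERR(f)) + * return PTR_ERR(f); + * + * i.e. the callee returns either a valid pointer or ERR_PTR(-errno).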
+ */ +#define MAX_ERRNO 4095 + +#define IS_ERR_VALUE(x) ((x) >= (uintptr_t)-MAX_ERRNO) + +static inline void *ERR_PTR(uintptr_t error) +{ + return (void *) error; +} + +static inline uintptr_t PTR_ERR(const void *ptr) +{ + return (uintptr_t) ptr; +} + +static inline uintptr_t IS_ERR(const void *ptr) +{ + return IS_ERR_VALUE((uintptr_t)ptr); +} + +static inline uintptr_t IS_ERR_OR_NULL(const void *ptr) +{ + return !ptr || IS_ERR_VALUE((uintptr_t)ptr); +} + +static inline int PTR_ERR_OR_ZERO(const void *ptr) +{ + if (IS_ERR(ptr)) + return PTR_ERR(ptr); + else + return 0; +} + +#endif diff --git a/eta.c b/eta.c new file mode 100644 index 0000000..13f61ba --- /dev/null +++ b/eta.c @@ -0,0 +1,743 @@ +/* + * Status and ETA code + */ +#include <unistd.h> +#include <string.h> +#ifdef CONFIG_VALGRIND_DEV +#include <valgrind/drd.h> +#else +#define DRD_IGNORE_VAR(x) do { } while (0) +#endif + +#include "fio.h" +#include "lib/pow2.h" + +static char __run_str[REAL_MAX_JOBS + 1]; +static char run_str[__THREAD_RUNSTR_SZ(REAL_MAX_JOBS) + 1]; + +static void update_condensed_str(char *rstr, char *run_str_condensed) +{ + if (*rstr) { + while (*rstr) { + int nr = 1; + + *run_str_condensed++ = *rstr++; + while (*(rstr - 1) == *rstr) { + rstr++; + nr++; + } + run_str_condensed += sprintf(run_str_condensed, "(%u),", nr); + } + run_str_condensed--; + } + *run_str_condensed = '\0'; +} + +/* + * Sets the status of the 'td' in the printed status map. + */ +static void check_str_update(struct thread_data *td) +{ + char c = __run_str[td->thread_number - 1]; + + switch (td->runstate) { + case TD_REAPED: + if (td->error) + c = 'X'; + else if (td->sig) + c = 'K'; + else + c = '_'; + break; + case TD_EXITED: + c = 'E'; + break; + case TD_RAMP: + c = '/'; + break; + case TD_RUNNING: + if (td_rw(td)) { + if (td_random(td)) { + if (td->o.rwmix[DDIR_READ] == 100) + c = 'r'; + else if (td->o.rwmix[DDIR_WRITE] == 100) + c = 'w'; + else + c = 'm'; + } else { + if (td->o.rwmix[DDIR_READ] == 100) + c = 'R'; + else if (td->o.rwmix[DDIR_WRITE] == 100) + c = 'W'; + else + c = 'M'; + } + } else if (td_read(td)) { + if (td_random(td)) + c = 'r'; + else + c = 'R'; + } else if (td_write(td)) { + if (td_random(td)) + c = 'w'; + else + c = 'W'; + } else { + if (td_random(td)) + c = 'd'; + else + c = 'D'; + } + break; + case TD_PRE_READING: + c = 'p'; + break; + case TD_VERIFYING: + c = 'V'; + break; + case TD_FSYNCING: + c = 'F'; + break; + case TD_FINISHING: + c = 'f'; + break; + case TD_CREATED: + c = 'C'; + break; + case TD_INITIALIZED: + case TD_SETTING_UP: + c = 'I'; + break; + case TD_NOT_CREATED: + c = 'P'; + break; + default: + log_err("state %d\n", td->runstate); + } + + __run_str[td->thread_number - 1] = c; + update_condensed_str(__run_str, run_str); +} + +/* + * Convert seconds to a printable string. + */ +void eta_to_str(char *str, unsigned long eta_sec) +{ + unsigned int d, h, m, s; + int disp_hour = 0; + + if (eta_sec == -1) { + sprintf(str, "--"); + return; + } + + s = eta_sec % 60; + eta_sec /= 60; + m = eta_sec % 60; + eta_sec /= 60; + h = eta_sec % 24; + eta_sec /= 24; + d = eta_sec; + + if (d) { + disp_hour = 1; + str += sprintf(str, "%02ud:", d); + } + + if (h || disp_hour) + str += sprintf(str, "%02uh:", h); + + str += sprintf(str, "%02um:", m); + sprintf(str, "%02us", s); +} + +/* + * Best effort calculation of the estimated pending runtime of a job.
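+ * + * The core estimate, once some io has been done, is + * + * eta = elapsed * (1.0 / perc) - elapsed, perc = bytes_done / bytes_total + * + * with the adjustments below for fill_device, zone skipping, verify + * phases and time_based jobs.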
+ */ +static unsigned long thread_eta(struct thread_data *td) +{ + unsigned long long bytes_total, bytes_done; + unsigned long eta_sec = 0; + unsigned long elapsed; + uint64_t timeout; + + elapsed = (mtime_since_now(&td->epoch) + 999) / 1000; + timeout = td->o.timeout / 1000000UL; + + bytes_total = td->total_io_size; + + if (td->flags & TD_F_NO_PROGRESS) + return -1; + + if (td->o.fill_device && td->o.size == -1ULL) { + if (!td->fill_device_size || td->fill_device_size == -1ULL) + return 0; + + bytes_total = td->fill_device_size; + } + + /* + * If io_size is set, bytes_total is an exact value that does not need + * adjustment. + */ + if (td->o.zone_size && td->o.zone_skip && bytes_total && + !fio_option_is_set(&td->o, io_size)) { + unsigned int nr_zones; + uint64_t zone_bytes; + + /* + * Calculate the upper bound of the number of zones that will + * be processed, including skipped bytes between zones. If this + * is larger than total_io_size (e.g. when --io_size or --size + * specify a small value), use the lower bound to avoid + * adjustments to a negative value that would result in a very + * large bytes_total and an incorrect eta. + */ + zone_bytes = td->o.zone_size + td->o.zone_skip; + nr_zones = (bytes_total + zone_bytes - 1) / zone_bytes; + if (bytes_total < nr_zones * td->o.zone_skip) + nr_zones = bytes_total / zone_bytes; + bytes_total -= nr_zones * td->o.zone_skip; + } + + /* + * if writing and verifying afterwards, bytes_total will be twice the + * size. In a mixed workload, verify phase will be the size of the + * first stage writes. + */ + if (td->o.do_verify && td->o.verify && td_write(td)) { + if (td_rw(td)) { + unsigned int perc = 50; + + if (td->o.rwmix[DDIR_WRITE]) + perc = td->o.rwmix[DDIR_WRITE]; + + bytes_total += (bytes_total * perc) / 100; + } else + bytes_total <<= 1; + } + + if (td->runstate == TD_RUNNING || td->runstate == TD_VERIFYING) { + double perc, perc_t; + + bytes_done = ddir_rw_sum(td->io_bytes); + + if (bytes_total) { + perc = (double) bytes_done / (double) bytes_total; + if (perc > 1.0) + perc = 1.0; + } else + perc = 0.0; + + if (td->o.time_based) { + if (timeout) { + perc_t = (double) elapsed / (double) timeout; + if (perc_t < perc) + perc = perc_t; + } else { + /* + * Will never hit, we can't have time_based + * without a timeout set. + */ + perc = 0.0; + } + } + + if (perc == 0.0) { + eta_sec = timeout; + } else { + eta_sec = (unsigned long) (elapsed * (1.0 / perc)) - elapsed; + } + + if (td->o.timeout && + eta_sec > (timeout + done_secs - elapsed)) + eta_sec = timeout + done_secs - elapsed; + } else if (td->runstate == TD_NOT_CREATED || td->runstate == TD_CREATED + || td->runstate == TD_INITIALIZED + || td->runstate == TD_SETTING_UP + || td->runstate == TD_RAMP + || td->runstate == TD_PRE_READING) { + int64_t t_eta = 0, r_eta = 0; + unsigned long long rate_bytes; + + /* + * We can only guess - assume it'll run the full timeout + * if given, otherwise assume it'll run at the specified rate. 
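+ * + * e.g. with no timeout set, a job with 1 GiB left at rate=10M works + * out to r_eta = 1073741824 / 10485760, i.e. roughly 102 seconds, + * plus any configured start delay.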
+ */ + if (td->o.timeout) { + uint64_t __timeout = td->o.timeout; + uint64_t start_delay = td->o.start_delay; + uint64_t ramp_time = td->o.ramp_time; + + t_eta = __timeout + start_delay; + if (!td->ramp_time_over) { + t_eta += ramp_time; + } + t_eta /= 1000000ULL; + + if ((td->runstate == TD_RAMP) && in_ramp_time(td)) { + unsigned long ramp_left; + + ramp_left = mtime_since_now(&td->epoch); + ramp_left = (ramp_left + 999) / 1000; + if (ramp_left <= t_eta) + t_eta -= ramp_left; + } + } + rate_bytes = 0; + if (td_read(td)) + rate_bytes = td->o.rate[DDIR_READ]; + if (td_write(td)) + rate_bytes += td->o.rate[DDIR_WRITE]; + if (td_trim(td)) + rate_bytes += td->o.rate[DDIR_TRIM]; + + if (rate_bytes) { + r_eta = bytes_total / rate_bytes; + r_eta += (td->o.start_delay / 1000000ULL); + } + + if (r_eta && t_eta) + eta_sec = min(r_eta, t_eta); + else if (r_eta) + eta_sec = r_eta; + else if (t_eta) + eta_sec = t_eta; + else + eta_sec = 0; + } else { + /* + * thread is already done or waiting for fsync + */ + eta_sec = 0; + } + + return eta_sec; +} + +static void calc_rate(int unified_rw_rep, unsigned long mtime, + unsigned long long *io_bytes, + unsigned long long *prev_io_bytes, uint64_t *rate) +{ + int i; + + for (i = 0; i < DDIR_RWDIR_CNT; i++) { + unsigned long long diff, this_rate; + + diff = io_bytes[i] - prev_io_bytes[i]; + if (mtime) + this_rate = ((1000 * diff) / mtime) / 1024; /* KiB/s */ + else + this_rate = 0; + + if (unified_rw_rep) { + rate[i] = 0; + rate[0] += this_rate; + } else + rate[i] = this_rate; + + prev_io_bytes[i] = io_bytes[i]; + } +} + +static void calc_iops(int unified_rw_rep, unsigned long mtime, + unsigned long long *io_iops, + unsigned long long *prev_io_iops, unsigned int *iops) +{ + int i; + + for (i = 0; i < DDIR_RWDIR_CNT; i++) { + unsigned long long diff, this_iops; + + diff = io_iops[i] - prev_io_iops[i]; + if (mtime) + this_iops = (diff * 1000) / mtime; + else + this_iops = 0; + + if (unified_rw_rep) { + iops[i] = 0; + iops[0] += this_iops; + } else + iops[i] = this_iops; + + prev_io_iops[i] = io_iops[i]; + } +} + +/* + * Allow a little slack - if we're within 95% of the time, allow ETA. + */ +bool eta_time_within_slack(unsigned int time) +{ + return time > ((eta_interval_msec * 95) / 100); +} + +/* + * Print status of the jobs we know about. This includes rate estimates, + * ETA, thread state, etc. 
+ */ +bool calc_thread_status(struct jobs_eta *je, int force) +{ + struct thread_data *td; + int i, unified_rw_rep; + uint64_t rate_time, disp_time, bw_avg_time, *eta_secs; + unsigned long long io_bytes[DDIR_RWDIR_CNT]; + unsigned long long io_iops[DDIR_RWDIR_CNT]; + struct timespec now; + + static unsigned long long rate_io_bytes[DDIR_RWDIR_CNT]; + static unsigned long long disp_io_bytes[DDIR_RWDIR_CNT]; + static unsigned long long disp_io_iops[DDIR_RWDIR_CNT]; + static struct timespec rate_prev_time, disp_prev_time; + + if (!force) { + if (!(output_format & FIO_OUTPUT_NORMAL) && + f_out == stdout) + return false; + if (temp_stall_ts || eta_print == FIO_ETA_NEVER) + return false; + + if (!isatty(STDOUT_FILENO) && (eta_print != FIO_ETA_ALWAYS)) + return false; + } + + if (!ddir_rw_sum(rate_io_bytes)) + fill_start_time(&rate_prev_time); + if (!ddir_rw_sum(disp_io_bytes)) + fill_start_time(&disp_prev_time); + + eta_secs = malloc(thread_number * sizeof(uint64_t)); + memset(eta_secs, 0, thread_number * sizeof(uint64_t)); + + je->elapsed_sec = (mtime_since_genesis() + 999) / 1000; + + io_bytes[DDIR_READ] = io_bytes[DDIR_WRITE] = io_bytes[DDIR_TRIM] = 0; + io_iops[DDIR_READ] = io_iops[DDIR_WRITE] = io_iops[DDIR_TRIM] = 0; + bw_avg_time = ULONG_MAX; + unified_rw_rep = 0; + for_each_td(td, i) { + unified_rw_rep += td->o.unified_rw_rep; + if (is_power_of_2(td->o.kb_base)) + je->is_pow2 = 1; + je->unit_base = td->o.unit_base; + if (td->o.bw_avg_time < bw_avg_time) + bw_avg_time = td->o.bw_avg_time; + if (td->runstate == TD_RUNNING || td->runstate == TD_VERIFYING + || td->runstate == TD_FSYNCING + || td->runstate == TD_PRE_READING + || td->runstate == TD_FINISHING) { + je->nr_running++; + if (td_read(td)) { + je->t_rate[0] += td->o.rate[DDIR_READ]; + je->t_iops[0] += td->o.rate_iops[DDIR_READ]; + je->m_rate[0] += td->o.ratemin[DDIR_READ]; + je->m_iops[0] += td->o.rate_iops_min[DDIR_READ]; + } + if (td_write(td)) { + je->t_rate[1] += td->o.rate[DDIR_WRITE]; + je->t_iops[1] += td->o.rate_iops[DDIR_WRITE]; + je->m_rate[1] += td->o.ratemin[DDIR_WRITE]; + je->m_iops[1] += td->o.rate_iops_min[DDIR_WRITE]; + } + if (td_trim(td)) { + je->t_rate[2] += td->o.rate[DDIR_TRIM]; + je->t_iops[2] += td->o.rate_iops[DDIR_TRIM]; + je->m_rate[2] += td->o.ratemin[DDIR_TRIM]; + je->m_iops[2] += td->o.rate_iops_min[DDIR_TRIM]; + } + + je->files_open += td->nr_open_files; + } else if (td->runstate == TD_RAMP) { + je->nr_running++; + je->nr_ramp++; + } else if (td->runstate == TD_SETTING_UP) + je->nr_setting_up++; + else if (td->runstate < TD_RUNNING) + je->nr_pending++; + + if (je->elapsed_sec >= 3) + eta_secs[i] = thread_eta(td); + else + eta_secs[i] = INT_MAX; + + check_str_update(td); + + if (td->runstate > TD_SETTING_UP) { + int ddir; + + for (ddir = 0; ddir < DDIR_RWDIR_CNT; ddir++) { + if (unified_rw_rep) { + io_bytes[0] += td->io_bytes[ddir]; + io_iops[0] += td->io_blocks[ddir]; + } else { + io_bytes[ddir] += td->io_bytes[ddir]; + io_iops[ddir] += td->io_blocks[ddir]; + } + } + } + } + + if (exitall_on_terminate) { + je->eta_sec = INT_MAX; + for_each_td(td, i) { + if (eta_secs[i] < je->eta_sec) + je->eta_sec = eta_secs[i]; + } + } else { + unsigned long eta_stone = 0; + + je->eta_sec = 0; + for_each_td(td, i) { + if ((td->runstate == TD_NOT_CREATED) && td->o.stonewall) + eta_stone += eta_secs[i]; + else { + if (eta_secs[i] > je->eta_sec) + je->eta_sec = eta_secs[i]; + } + } + je->eta_sec += eta_stone; + } + + free(eta_secs); + + fio_gettime(&now, NULL); + rate_time = mtime_since(&rate_prev_time, &now); + + if 
(write_bw_log && rate_time > bw_avg_time && !in_ramp_time(td)) { + calc_rate(unified_rw_rep, rate_time, io_bytes, rate_io_bytes, + je->rate); + memcpy(&rate_prev_time, &now, sizeof(now)); + add_agg_sample(sample_val(je->rate[DDIR_READ]), DDIR_READ, 0, 0); + add_agg_sample(sample_val(je->rate[DDIR_WRITE]), DDIR_WRITE, 0, 0); + add_agg_sample(sample_val(je->rate[DDIR_TRIM]), DDIR_TRIM, 0, 0); + } + + disp_time = mtime_since(&disp_prev_time, &now); + + if (!force && !eta_time_within_slack(disp_time)) + return false; + + calc_rate(unified_rw_rep, disp_time, io_bytes, disp_io_bytes, je->rate); + calc_iops(unified_rw_rep, disp_time, io_iops, disp_io_iops, je->iops); + + memcpy(&disp_prev_time, &now, sizeof(now)); + + if (!force && !je->nr_running && !je->nr_pending) + return false; + + je->nr_threads = thread_number; + update_condensed_str(__run_str, run_str); + memcpy(je->run_str, run_str, strlen(run_str)); + return true; +} + +static int gen_eta_str(struct jobs_eta *je, char *p, size_t left, + char **rate_str, char **iops_str) +{ + bool has_r = je->rate[DDIR_READ] || je->iops[DDIR_READ]; + bool has_w = je->rate[DDIR_WRITE] || je->iops[DDIR_WRITE]; + bool has_t = je->rate[DDIR_TRIM] || je->iops[DDIR_TRIM]; + int l = 0; + + if (!has_r && !has_w && !has_t) + return 0; + + if (has_r) { + l += snprintf(p + l, left - l, "[r=%s", rate_str[DDIR_READ]); + if (!has_w) + l += snprintf(p + l, left - l, "]"); + } + if (has_w) { + if (has_r) + l += snprintf(p + l, left - l, ","); + else + l += snprintf(p + l, left - l, "["); + l += snprintf(p + l, left - l, "w=%s", rate_str[DDIR_WRITE]); + if (!has_t) + l += snprintf(p + l, left - l, "]"); + } + if (has_t) { + if (has_r || has_w) + l += snprintf(p + l, left - l, ","); + else if (!has_r && !has_w) + l += snprintf(p + l, left - l, "["); + l += snprintf(p + l, left - l, "t=%s]", rate_str[DDIR_TRIM]); + } + if (has_r) { + l += snprintf(p + l, left - l, "[r=%s", iops_str[DDIR_READ]); + if (!has_w) + l += snprintf(p + l, left - l, " IOPS]"); + } + if (has_w) { + if (has_r) + l += snprintf(p + l, left - l, ","); + else + l += snprintf(p + l, left - l, "["); + l += snprintf(p + l, left - l, "w=%s", iops_str[DDIR_WRITE]); + if (!has_t) + l += snprintf(p + l, left - l, " IOPS]"); + } + if (has_t) { + if (has_r || has_w) + l += snprintf(p + l, left - l, ","); + else if (!has_r && !has_w) + l += snprintf(p + l, left - l, "["); + l += snprintf(p + l, left - l, "t=%s IOPS]", iops_str[DDIR_TRIM]); + } + + return l; +} + +void display_thread_status(struct jobs_eta *je) +{ + static struct timespec disp_eta_new_line; + static int eta_new_line_init, eta_new_line_pending; + static int linelen_last; + static int eta_good; + char output[__THREAD_RUNSTR_SZ(REAL_MAX_JOBS) + 512], *p = output; + char eta_str[128]; + double perc = 0.0; + + if (je->eta_sec != INT_MAX && je->elapsed_sec) { + perc = (double) je->elapsed_sec / (double) (je->elapsed_sec + je->eta_sec); + eta_to_str(eta_str, je->eta_sec); + } + + if (eta_new_line_pending) { + eta_new_line_pending = 0; + linelen_last = 0; + p += sprintf(p, "\n"); + } + + p += sprintf(p, "Jobs: %d (f=%d)", je->nr_running, je->files_open); + + /* rate limits, if any */ + if (je->m_rate[0] || je->m_rate[1] || je->m_rate[2] || + je->t_rate[0] || je->t_rate[1] || je->t_rate[2]) { + char *tr, *mr; + + mr = num2str(je->m_rate[0] + je->m_rate[1] + je->m_rate[2], + je->sig_figs, 0, je->is_pow2, N2S_BYTEPERSEC); + tr = num2str(je->t_rate[0] + je->t_rate[1] + je->t_rate[2], + je->sig_figs, 0, je->is_pow2, N2S_BYTEPERSEC); + + p += sprintf(p, ", 
%s-%s", mr, tr); + free(tr); + free(mr); + } else if (je->m_iops[0] || je->m_iops[1] || je->m_iops[2] || + je->t_iops[0] || je->t_iops[1] || je->t_iops[2]) { + p += sprintf(p, ", %d-%d IOPS", + je->m_iops[0] + je->m_iops[1] + je->m_iops[2], + je->t_iops[0] + je->t_iops[1] + je->t_iops[2]); + } + + /* current run string, % done, bandwidth, iops, eta */ + if (je->eta_sec != INT_MAX && je->nr_running) { + char perc_str[32]; + char *iops_str[DDIR_RWDIR_CNT]; + char *rate_str[DDIR_RWDIR_CNT]; + size_t left; + int l; + int ddir; + int linelen; + + if ((!je->eta_sec && !eta_good) || je->nr_ramp == je->nr_running || + je->eta_sec == -1) + strcpy(perc_str, "-.-%"); + else { + double mult = 100.0; + + if (je->nr_setting_up && je->nr_running) + mult *= (1.0 - (double) je->nr_setting_up / (double) je->nr_running); + + eta_good = 1; + perc *= mult; + sprintf(perc_str, "%3.1f%%", perc); + } + + for (ddir = 0; ddir < DDIR_RWDIR_CNT; ddir++) { + rate_str[ddir] = num2str(je->rate[ddir], 4, + 1024, je->is_pow2, je->unit_base); + iops_str[ddir] = num2str(je->iops[ddir], 4, 1, 0, N2S_NONE); + } + + left = sizeof(output) - (p - output) - 1; + l = snprintf(p, left, ": [%s][%s]", je->run_str, perc_str); + l += gen_eta_str(je, p + l, left - l, rate_str, iops_str); + l += snprintf(p + l, left - l, "[eta %s]", eta_str); + + /* If truncation occurred adjust l so p is on the null */ + if (l >= left) + l = left - 1; + p += l; + linelen = p - output; + if (l >= 0 && linelen < linelen_last) + p += sprintf(p, "%*s", linelen_last - linelen, ""); + linelen_last = linelen; + + for (ddir = 0; ddir < DDIR_RWDIR_CNT; ddir++) { + free(rate_str[ddir]); + free(iops_str[ddir]); + } + } + sprintf(p, "\r"); + + printf("%s", output); + + if (!eta_new_line_init) { + fio_gettime(&disp_eta_new_line, NULL); + eta_new_line_init = 1; + } else if (eta_new_line && mtime_since_now(&disp_eta_new_line) > eta_new_line) { + fio_gettime(&disp_eta_new_line, NULL); + eta_new_line_pending = 1; + } + + fflush(stdout); +} + +struct jobs_eta *get_jobs_eta(bool force, size_t *size) +{ + struct jobs_eta *je; + + if (!thread_number) + return NULL; + + *size = sizeof(*je) + THREAD_RUNSTR_SZ + 8; + je = malloc(*size); + if (!je) + return NULL; + memset(je, 0, *size); + + if (!calc_thread_status(je, force)) { + free(je); + return NULL; + } + + *size = sizeof(*je) + strlen((char *) je->run_str) + 1; + return je; +} + +void print_thread_status(void) +{ + struct jobs_eta *je; + size_t size; + + je = get_jobs_eta(false, &size); + if (je) + display_thread_status(je); + + free(je); +} + +void print_status_init(int thr_number) +{ + struct jobs_eta_packed jep; + + compiletime_assert(sizeof(struct jobs_eta) == sizeof(jep), "jobs_eta"); + + DRD_IGNORE_VAR(__run_str); + __run_str[thr_number] = 'P'; + update_condensed_str(__run_str, run_str); +} diff --git a/examples/1mbs_clients.fio b/examples/1mbs_clients.fio new file mode 100644 index 0000000..505cd87 --- /dev/null +++ b/examples/1mbs_clients.fio @@ -0,0 +1,104 @@ +; Keep adding 1024kb/s reading clients at 4 seconds +[global] +size=32m +rw=read +directory=tmp +rate=1250 +ratemin=1024 + +[file1] +startdelay=0 + +[file2] +startdelay=4 + +[file3] +startdelay=8 + +[file4] +startdelay=12 + +[file5] +startdelay=16 + +[file6] +startdelay=20 + +[file7] +startdelay=24 + +[file8] +startdelay=28 + +[file9] +startdelay=32 + +[file10] +startdelay=36 + +[file11] +startdelay=40 + +[file12] +startdelay=44 + +[file13] +startdelay=48 + +[file14] +startdelay=52 + +[file15] +startdelay=56 + +[file16] +startdelay=60 + +[file17] 
+startdelay=64
+
+[file18]
+startdelay=68
+
+[file19]
+startdelay=72
+
+[file20]
+startdelay=76
+
+[file21]
+startdelay=80
+
+[file22]
+startdelay=84
+
+[file23]
+startdelay=88
+
+[file24]
+startdelay=92
+
+[file25]
+startdelay=96
+
+[file26]
+startdelay=100
+
+[file27]
+startdelay=104
+
+[file28]
+startdelay=108
+
+[file29]
+startdelay=112
+
+[file30]
+startdelay=116
+
+[file31]
+startdelay=120
+
+[file32]
+startdelay=124
+
diff --git a/examples/aio-read.fio b/examples/aio-read.fio
new file mode 100644
index 0000000..173a4e1
--- /dev/null
+++ b/examples/aio-read.fio
@@ -0,0 +1,20 @@
+; Read 4 files with aio at different depths
+[global]
+ioengine=libaio
+buffered=0
+rw=randread
+bs=128k
+size=512m
+directory=/data1
+
+[file1]
+iodepth=4
+
+[file2]
+iodepth=32
+
+[file3]
+iodepth=8
+
+[file4]
+iodepth=16
diff --git a/examples/backwards-read.fio b/examples/backwards-read.fio
new file mode 100644
index 0000000..0fe35a2
--- /dev/null
+++ b/examples/backwards-read.fio
@@ -0,0 +1,8 @@
+# Demonstrates how to read backwards in a file.
+
+[backwards-read]
+bs=4k
+# seek -8k back for every IO
+rw=read:-8k
+filename=128m
+size=128m
diff --git a/examples/basic-verify.fio b/examples/basic-verify.fio
new file mode 100644
index 0000000..7871aeb
--- /dev/null
+++ b/examples/basic-verify.fio
@@ -0,0 +1,12 @@
+# The most basic form of data verification. Write the device randomly
+# in 4K chunks, then read it back and verify the contents.
+[write-and-verify]
+rw=randwrite
+bs=4k
+direct=1
+ioengine=libaio
+iodepth=16
+verify=crc32c
+# Use /dev/XXX. For running this on a file instead, remove the filename
+# option and add a size=32G (or whatever file size you want) instead.
+filename=/dev/XXX
diff --git a/examples/butterfly.fio b/examples/butterfly.fio
new file mode 100644
index 0000000..42d253d
--- /dev/null
+++ b/examples/butterfly.fio
@@ -0,0 +1,19 @@
+# Perform a butterfly/funnel seek pattern. This won't always alternate ends on
+# every I/O but it will get close.
+
+[global]
+filename=/tmp/testfile
+bs=4k
+direct=1
+
+[forward]
+rw=read
+flow=2
+# Uncomment the size= and offset= lines to prevent each direction going past
+# the middle of the file
+#size=50%
+
+[backward]
+rw=read:-8k
+flow=-2
+#offset=50%
diff --git a/examples/cpp_null.fio b/examples/cpp_null.fio
new file mode 100644
index 0000000..436ed90
--- /dev/null
+++ b/examples/cpp_null.fio
@@ -0,0 +1,10 @@
+[global]
+bs=4k
+gtod_reduce=1
+
+[null]
+ioengine=cpp_null
+size=100g
+rw=randread
+norandommap
+time_based=0
diff --git a/examples/cpuio.fio b/examples/cpuio.fio
new file mode 100644
index 0000000..577e072
--- /dev/null
+++ b/examples/cpuio.fio
@@ -0,0 +1,8 @@
+[global]
+ioengine=cpuio
+time_based
+runtime=10
+
+[burn50percent]
+cpuload=50
+
diff --git a/examples/cross-stripe-verify.fio b/examples/cross-stripe-verify.fio
new file mode 100644
index 0000000..68664ed
--- /dev/null
+++ b/examples/cross-stripe-verify.fio
@@ -0,0 +1,25 @@
+# Example of how to split a drive up into sections, manually, and perform
+# verify from a bunch of jobs. This example is special in that it assumes
+# the drive is around 30 * 124G in size, so with the below settings, we'll
+# cover most of the drive. It's also special in that it doesn't write
+# everything; it just writes 16k at a specific boundary, for every 128k.
+# This is done to exercise the split path for Intel NVMe devices, most of
+# which have a 128k stripe size and require IOs to be split if they cross
+# the stripe boundary.
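+#
+# How the numbers below line up: bs=16k with rw=write:112k means each 16k
+# write is followed by a 112k hole, i.e. exactly one write per 128k stripe.
+# With offset=120k, each IO covers 120k-136k relative to the start of its
+# stripe and therefore straddles a 128k boundary. offset_increment=124g
+# gives each of the 30 jobs its own slice of the drive.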
+#
+[global]
+bs=16k
+direct=1
+rw=write:112k
+verify=crc32c
+filename=/dev/nvme0n1
+verify_backlog=1
+offset_increment=124g
+io_size=120g
+offset=120k
+group_reporting=1
+verify_dump=1
+loops=2
+
+[write-verify]
+numjobs=30
diff --git a/examples/dev-dax.fio b/examples/dev-dax.fio
new file mode 100644
index 0000000..d9f430e
--- /dev/null
+++ b/examples/dev-dax.fio
@@ -0,0 +1,45 @@
+[global]
+bs=2m
+ioengine=dev-dax
+norandommap
+time_based=1
+runtime=30
+group_reporting
+disable_lat=1
+disable_slat=1
+disable_clat=1
+clat_percentiles=0
+cpus_allowed_policy=split
+
+# For the dev-dax engine:
+#
+# IOs always complete immediately
+# IOs are always direct
+#
+iodepth=1
+direct=0
+thread=1
+numjobs=16
+#
+# The dev-dax engine does IO to DAX devices, which are special character
+# devices exported by the kernel (e.g. /dev/dax0.0). The device is
+# opened normally and then the region is accessible via mmap. We do
+# not use the O_DIRECT flag because the device is naturally direct
+# access; the O_DIRECT flag will result in failure. The engine
+# accesses the underlying NVDIMM directly once the mmapping is set up.
+#
+# Check the alignment requirement of your DAX device. Currently the default
+# should be 2M. Blocksize (bs) should meet the alignment requirement.
+#
+# An example of creating a dev dax device node from pmem:
+# ndctl create-namespace --reconfig=namespace0.0 --mode=dax --force
+#
+filename=/dev/dax0.0
+
+[dev-dax-write]
+rw=randwrite
+stonewall
+
+[dev-dax-read]
+rw=randread
+stonewall
diff --git a/examples/disk-zone-profile.fio b/examples/disk-zone-profile.fio
new file mode 100644
index 0000000..96e5669
--- /dev/null
+++ b/examples/disk-zone-profile.fio
@@ -0,0 +1,14 @@
+; Read the disk in zones of 256m, skipping ahead 2g after each zone;
+; plotting the bandwidth log afterwards should give a nice picture of
+; the zoning of this drive
+
+[global]
+bs=64k
+direct=1
+rw=read
+ioengine=libaio
+iodepth=2
+zonesize=256m
+zoneskip=2g
+write_bw_log
+
+[/dev/sdb]
diff --git a/examples/e4defrag.fio b/examples/e4defrag.fio
new file mode 100644
index 0000000..cb94e85
--- /dev/null
+++ b/examples/e4defrag.fio
@@ -0,0 +1,41 @@
+[global]
+ioengine=e4defrag
+directory=/scratch
+nrfiles=1
+filesize=100M
+size=100M
+bs=32k
+#group_reporting
+
+[isolated-e4defrag]
+# It is important to disable buffered io
+buffered=0
+donorname=file.def
+filename=file1
+inplace=0
+rw=write
+
+# Run e4defrag and aio-dio workers in parallel
+[e4defrag]
+stonewall
+time_based=30
+runtime=30
+ioengine=e4defrag
+buffered=0
+donorname=file.def
+filename=file1
+inplace=0
+rw=write
+
+[random-aio-32k]
+ioengine=libaio
+runtime=30
+verify=md5
+direct=1
+bs=64k
+iodepth=128
+filename=file1
+rw=randrw
+numjobs=4
+
+
diff --git a/examples/e4defrag2.fio b/examples/e4defrag2.fio
new file mode 100644
index 0000000..c648599
--- /dev/null
+++ b/examples/e4defrag2.fio
@@ -0,0 +1,88 @@
+#################################################
+# Hardcoded defragmentation patterns
+# Please be careful: this can trigger a kernel panic
+#################################################
+[global]
+ioengine=e4defrag
+group_reporting
+directory=/scratch
+nrfiles=1
+filesize=100M
+size=100M
+donorname=file.def
+bs=32k
+
+###########
+# Run several defragmentation threads for different files, but
+# use a shared donor file
+[parallel-e4defrag]
+buffered=0
+inplace=0
+rw=write
+numjobs=4
+
+########
+# Run two defragmentation threads; each thread uses the other's
+# file as its donor file
+
+[e4defrag-1]
+stonewall
+inplace=0
+rw=write
+donorname=e4defrag-2
+
+[e4defrag-2]
+inplace=0
+rw=write
+donorname=e4defrag-1
+
+###########
+# Run random defragmentation activity
+[e4defrag-fuzzer-4k]
+stonewall
+inplace=1
+bs=4k
+rw=randwrite
+filename=file
+donorname=file.def
+
+########
+# Run random e4defrag and various aio workers in parallel
+[e4defrag-fuzzer-4k]
+stonewall
+continue_on_error=all
+inplace=1
+bs=4k
+donorname=file3.def
+filename=file3
+time_based=30
+rw=randwrite
+
+[buffered-aio-32k]
+continue_on_error=none
+verify=md5
+buffered=1
+ioengine=libaio
+iodepth=128
+bs=32k
+filename=file3
+rw=randrw
+runtime=30
+time_based=30
+numjobs=4
+
+[direct-aio-32k]
+continue_on_error=none
+verify=md5
+buffered=0
+direct=1
+ioengine=libaio
+iodepth=128
+bs=32k
+filename=file3
+rw=randrw
+runtime=30
+time_based=30
+numjobs=4
+
+
diff --git a/examples/enospc-pressure.fio b/examples/enospc-pressure.fio
new file mode 100644
index 0000000..ca9d8f7
--- /dev/null
+++ b/examples/enospc-pressure.fio
@@ -0,0 +1,51 @@
+#
+# Test for a race condition between DIO writes and punch_hole.
+# If the race exists, DIO may rewrite a punched block after
+# it was allocated to another file; we will catch that
+# by verifying block contents.
+#
+[global]
+ioengine=libaio
+directory=/scratch
+# File size is reasonably huge to provoke ENOSPC
+filesize=128G
+size=999G
+iodepth=128
+
+# Expect write failure due to ENOSPC, skip error dump
+continue_on_error=write
+ignore_error=,ENOSPC
+error_dump=0
+fallocate=none
+exitall
+
+# Two threads (dio and punch_hole) operate on a single file, 'raicer'.
+# We do not care about data content here.
+[dio-raicer]
+bs=128k
+direct=1
+buffered=0
+rw=randwrite
+runtime=100
+filename=raicer
+time_based
+
+[punch_hole-raicer]
+bs=4k
+rw=randtrim
+filename=raicer
+
+# Verifier thread continuously writes to newly allocated blocks
+# and verifies the written content.
+[aio-dio-verifier]
+create_on_open=1
+verify=crc32c-intel
+verify_fatal=1
+verify_dump=1
+verify_backlog=1024
+verify_async=4
+direct=1
+# block size should be equal to the fs block size to prevent short writes
+bs=4k
+rw=randrw
+filename=aio-dio-verifier
diff --git a/examples/exitwhat.fio b/examples/exitwhat.fio
new file mode 100644
index 0000000..a1099f0
--- /dev/null
+++ b/examples/exitwhat.fio
@@ -0,0 +1,56 @@
+# We want to run fast1 as long as slow1 is running, but also have a cumulative
+# report of fast1 (group_reporting=1/new_group=1). exitall=1 would not cause
+# fast1 to stop after slow1 is done. Setting exit_what=stonewall will cause
+# all jobs up until the next stonewall=1 setting to be stopped when job slow1
+# finishes.
+# Skipping forward to slow2/fast2 in this example: slow2 has exit_what=all set,
+# which means all jobs will be cancelled when slow2 finishes. In particular,
+# runsnever will never run.
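+#
+# A rough timeline, assuming thinktime dominates (fio's thinktime is in
+# microseconds): slow1 issues number_ios=1000 IOs with thinktime=2000, so it
+# runs for a little over two seconds before exit_what=stonewall stops fast1;
+# slow2 then does the same before exit_what=all cancels whatever remains.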
+
+[global]
+filename=/tmp/test
+filesize=1G
+blocksize=4096
+group_reporting=1
+exitall=1
+
+[slow1]
+rw=r
+numjobs=1
+ioengine=sync
+new_group=1
+thinktime=2000
+number_ios=1000
+exit_what=stonewall
+
+[fast1]
+new_group=1
+rw=randrw
+numjobs=3
+ioengine=libaio
+iodepth=32
+rate=300,300,300
+
+[slow2]
+stonewall=1
+rw=w
+numjobs=1
+ioengine=sync
+new_group=1
+thinktime=2000
+number_ios=1000
+exit_what=all
+
+[fast2]
+rw=randrw
+numjobs=3
+ioengine=libaio
+iodepth=32
+rate=300,300,300
+
+[runsnever]
+rw=randrw
+numjobs=3
+ioengine=libaio
+iodepth=32
+rate=300,300,300
diff --git a/examples/falloc.fio b/examples/falloc.fio
new file mode 100644
index 0000000..fa30731
--- /dev/null
+++ b/examples/falloc.fio
@@ -0,0 +1,54 @@
+[global]
+ioengine=falloc
+iodepth=1
+direct=0
+buffered=0
+directory=/scratch
+nrfiles=1
+size=100M
+filesize=100M
+group_reporting
+
+
+# Run falloc and punch_hole threads in parallel
+# After this activity the file will be highly fragmented
+[falloc-fuzzer]
+stonewall
+runtime=10
+time_based=10
+bssplit=4k/10:64k/50:32k/40
+rw=randwrite
+numjobs=1
+filename=fragmented_file
+
+[punch hole-fuzzer]
+bs=4k
+runtime=10
+time_based=10
+rw=randtrim
+numjobs=2
+filename=fragmented_file
+
+## Measure IO performance on the fragmented file
+[sequential aio-dio write]
+stonewall
+ioengine=libaio
+numjobs=1
+iodepth=128
+buffered=0
+direct=1
+rw=write
+bs=64k
+filename=fragmented_file
+
+[sequential buffered read]
+stonewall
+ioengine=sync
+numjobs=1
+iodepth=1
+buffered=1
+direct=0
+rw=read
+bs=64k
+filename=fragmented_file
+
diff --git a/examples/filecreate-ioengine.fio b/examples/filecreate-ioengine.fio
new file mode 100644
index 0000000..ec7caad
--- /dev/null
+++ b/examples/filecreate-ioengine.fio
@@ -0,0 +1,35 @@
+# Example filecreate job
+#
+# create_on_open is needed so that the open happens during the run and not the
+# setup.
+#
+# openfiles needs to be set so that you do not exceed the maximum allowed open
+# files.
+#
+# filesize needs to be set to a non-zero value so fio will actually run, but the
+# IO will not really be done and the write latency numbers will only reflect the
+# open times.
+[global]
+create_on_open=1
+nrfiles=31250
+ioengine=filecreate
+fallocate=none
+filesize=4k
+openfiles=1
+
+[t0]
+[t1]
+[t2]
+[t3]
+[t4]
+[t5]
+[t6]
+[t7]
+[t8]
+[t9]
+[t10]
+[t11]
+[t12]
+[t13]
+[t14]
+[t15]
diff --git a/examples/filestat-ioengine.fio b/examples/filestat-ioengine.fio
new file mode 100644
index 0000000..932fced
--- /dev/null
+++ b/examples/filestat-ioengine.fio
@@ -0,0 +1,19 @@
+# Example filestat job
+
+# The 'filestat' engine only does 'stat(filename)'; the file will not be open()ed.
+# 'filesize' must be set; files will then be created at the setup stage.
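+#
+# With the settings below, the setup stage lays out 200 4k files for each of
+# the six sections (t0..t5); the latency fio then reports is effectively pure
+# stat() time, since no data is ever read or written.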
+ +[global] +ioengine=filestat +numjobs=1 +filesize=4k +nrfiles=200 +thread + +[t0] +[t1] +[t2] +[t3] +[t4] +[t5] + diff --git a/examples/fio-rand-RW.fio b/examples/fio-rand-RW.fio new file mode 100644 index 0000000..0df0bc1 --- /dev/null +++ b/examples/fio-rand-RW.fio @@ -0,0 +1,18 @@ +; fio-rand-RW.job for fiotest + +[global] +name=fio-rand-RW +filename=fio-rand-RW +rw=randrw +rwmixread=60 +rwmixwrite=40 +bs=4K +direct=0 +numjobs=4 +time_based=1 +runtime=900 + +[file1] +size=10G +ioengine=libaio +iodepth=16 diff --git a/examples/fio-rand-read.fio b/examples/fio-rand-read.fio new file mode 100644 index 0000000..bc15466 --- /dev/null +++ b/examples/fio-rand-read.fio @@ -0,0 +1,16 @@ +; fio-rand-read.job for fiotest + +[global] +name=fio-rand-read +filename=fio-rand-read +rw=randread +bs=4K +direct=0 +numjobs=1 +time_based=1 +runtime=900 + +[file1] +size=10G +ioengine=libaio +iodepth=16 diff --git a/examples/fio-rand-write.fio b/examples/fio-rand-write.fio new file mode 100644 index 0000000..bd1b73a --- /dev/null +++ b/examples/fio-rand-write.fio @@ -0,0 +1,16 @@ +; fio-rand-write.job for fiotest + +[global] +name=fio-rand-write +filename=fio-rand-write +rw=randwrite +bs=4K +direct=0 +numjobs=4 +time_based=1 +runtime=900 + +[file1] +size=10G +ioengine=libaio +iodepth=16 diff --git a/examples/fio-seq-RW.fio b/examples/fio-seq-RW.fio new file mode 100644 index 0000000..8f7090f --- /dev/null +++ b/examples/fio-seq-RW.fio @@ -0,0 +1,18 @@ +; fio-seq-RW.job for fiotest + +[global] +name=fio-seq-RW +filename=fio-seq-RW +rw=rw +rwmixread=60 +rwmixwrite=40 +bs=256K +direct=0 +numjobs=4 +time_based=1 +runtime=900 + +[file1] +size=10G +ioengine=libaio +iodepth=16 diff --git a/examples/fio-seq-read.fio b/examples/fio-seq-read.fio new file mode 100644 index 0000000..28de93c --- /dev/null +++ b/examples/fio-seq-read.fio @@ -0,0 +1,14 @@ +[global] +name=fio-seq-reads +filename=fio-seq-reads +rw=read +bs=256K +direct=1 +numjobs=1 +time_based=1 +runtime=900 + +[file1] +size=10G +ioengine=libaio +iodepth=16 diff --git a/examples/fio-seq-write.fio b/examples/fio-seq-write.fio new file mode 100644 index 0000000..b291a15 --- /dev/null +++ b/examples/fio-seq-write.fio @@ -0,0 +1,16 @@ +; fio-seq-write.job for fiotest + +[global] +name=fio-seq-write +filename=fio-seq-write +rw=write +bs=256K +direct=0 +numjobs=1 +time_based=1 +runtime=900 + +[file1] +size=10G +ioengine=libaio +iodepth=16 diff --git a/examples/fixed-rate-submission.fio b/examples/fixed-rate-submission.fio new file mode 100644 index 0000000..076a868 --- /dev/null +++ b/examples/fixed-rate-submission.fio @@ -0,0 +1,10 @@ +[fixed-rate-submit] +size=128m +rw=read +ioengine=libaio +iodepth=32 +direct=1 +# by setting the submit mode to offload, we can guarantee a fixed rate of +# submission regardless of what the device completion rate is. +io_submit_mode=offload +rate_iops=1000 diff --git a/examples/flow.fio b/examples/flow.fio new file mode 100644 index 0000000..4b078cf --- /dev/null +++ b/examples/flow.fio @@ -0,0 +1,25 @@ +# Example usage of flows. The below will have roughly a 1:8 difference +# between job2 and job1. 
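+#
+# Roughly how this works: every job1 IO adds flow=1 to a shared counter and
+# every job2 IO subtracts 8 from it. fio tries to keep the counter within
+# flow_watermark=100 of zero, sleeping flow_sleep=1000 microseconds when a
+# job overshoots, so job1 ends up issuing about 8 IOs for each job2 IO.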
+[global]
+norandommap
+thread
+time_based
+runtime=30
+direct=1
+ioengine=libaio
+iodepth=256
+size=100g
+bs=8k
+filename=/tmp/testfile
+flow_watermark=100
+flow_sleep=1000
+
+[job2]
+numjobs=1
+rw=write
+flow=-8
+
+[job1]
+numjobs=1
+rw=randread
+flow=1
diff --git a/examples/fsx.fio b/examples/fsx.fio
new file mode 100644
index 0000000..6b48c6f
--- /dev/null
+++ b/examples/fsx.fio
@@ -0,0 +1,12 @@
+; This job file works pretty much like running fsx-linux
+; with -r 4096 -w 4096 -Z -N 500000
+[file]
+ioengine=libaio
+iodepth=1
+rw=randrw
+size=256k
+bs=4k
+norandommap
+direct=1
+loops=500000
+rwmixcycle=40
diff --git a/examples/ftruncate.fio b/examples/ftruncate.fio
new file mode 100644
index 0000000..a6ef457
--- /dev/null
+++ b/examples/ftruncate.fio
@@ -0,0 +1,27 @@
+# Example ftruncate engine jobs
+
+[global]
+ioengine=ftruncate
+directory=/scratch
+size=102404k ; 100Mb+4k
+stonewall
+filename=truncate
+runtime=10s
+time_based
+direct=1
+#
+# The bs option is a stub here: truncation is performed at the current block
+# offset and the blocksize value is ignored
+bs=4k
+
+# truncate the file to 4Kbytes, then repeatedly grow the file back to just over
+# its original size using subsequent truncates
+[grow-truncate]
+rw=write
+
+# Repeatedly change a file to a random size between 0Kbytes and 100Mb
+# using truncates
+[rand-truncate]
+rw=randwrite
+norandommap
+
diff --git a/examples/gfapi.fio b/examples/gfapi.fio
new file mode 100644
index 0000000..ccc8123
--- /dev/null
+++ b/examples/gfapi.fio
@@ -0,0 +1,16 @@
+# Test opening a file from multiple jobs.
+# Originally authored by Castor Fu
+[global]
+ioengine=gfapi
+volume=vol
+brick=localhost
+create_on_open=1
+rw=write
+
+[reopen_file_test]
+nrfiles=4
+filesize=16k
+size=64k
+openfiles=2
+rw=write
+filename_format=reopen_test.$filenum
diff --git a/examples/gpudirect-rdmaio-client.fio b/examples/gpudirect-rdmaio-client.fio
new file mode 100644
index 0000000..1e24624
--- /dev/null
+++ b/examples/gpudirect-rdmaio-client.fio
@@ -0,0 +1,15 @@
+# Example gpudirect rdma client job
+[global]
+ioengine=rdma
+hostname=[hostname]
+port=[port]
+verb=[read/write/send/recv]
+mem=cudamalloc
+gpu_dev_id=0
+bs=1m
+size=100g
+
+[sender]
+rw=write
+iodepth=1
+iodepth_batch_complete=1
diff --git a/examples/gpudirect-rdmaio-server.fio b/examples/gpudirect-rdmaio-server.fio
new file mode 100644
index 0000000..5fc4950
--- /dev/null
+++ b/examples/gpudirect-rdmaio-server.fio
@@ -0,0 +1,12 @@
+# Example rdma server job
+[global]
+ioengine=rdma
+port=[port]
+mem=cudamalloc
+gpu_dev_id=0
+bs=1m
+size=100g
+
+[receiver]
+rw=read
+iodepth=16
diff --git a/examples/http-s3.fio b/examples/http-s3.fio
new file mode 100644
index 0000000..2dcae36
--- /dev/null
+++ b/examples/http-s3.fio
@@ -0,0 +1,34 @@
+# Example test for the HTTP engine's S3 support against Amazon AWS.
+# Obviously, you have to adjust the S3 credentials; for this example,
+# they're passed in via the environment.
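+#
+# A typical invocation might look like this (the credential values are
+# placeholders):
+#
+#   S3_ID=<your-key-id> S3_KEY=<your-secret-key> fio examples/http-s3.fio
+#
+# fio expands ${S3_KEY} and ${S3_ID} below from the environment.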
+#
+
+[global]
+ioengine=http
+name=test
+direct=1
+filename=/larsmb-fio-test/object
+http_verbose=0
+https=on
+http_mode=s3
+http_s3_key=${S3_KEY}
+http_s3_keyid=${S3_ID}
+http_host=s3.eu-central-1.amazonaws.com
+http_s3_region=eu-central-1
+group_reporting
+
+# With verify, this both writes and reads the object
+[create]
+rw=write
+bs=4k
+size=64k
+io_size=4k
+verify=sha256
+
+[trim]
+stonewall
+rw=trim
+bs=4k
+size=64k
+io_size=4k
+
diff --git a/examples/http-swift.fio b/examples/http-swift.fio
new file mode 100644
index 0000000..b591adb
--- /dev/null
+++ b/examples/http-swift.fio
@@ -0,0 +1,32 @@
+[global]
+ioengine=http
+rw=randwrite
+name=test
+direct=1
+http_verbose=0
+http_mode=swift
+https=on
+# This is the hostname and port portion of the public access link for
+# the container:
+http_host=swift.srv.openstack.local:8081
+filename_format=/swift/v1/fio-test/bucket.$jobnum
+group_reporting
+bs=64k
+size=1M
+# Currently, fio cannot yet generate the Swift Auth-Token itself.
+# You need to set this prior to running fio via
+# eval $(openstack token issue -f shell --prefix SWIFT_) ; export SWIFT_id
+http_swift_auth_token=${SWIFT_id}
+
+[create]
+numjobs=1
+rw=randwrite
+io_size=256k
+verify=sha256
+
+# This will delete all created objects again
+[trim]
+stonewall
+numjobs=1
+rw=trim
+io_size=64k
diff --git a/examples/http-webdav.fio b/examples/http-webdav.fio
new file mode 100644
index 0000000..2d1ca73
--- /dev/null
+++ b/examples/http-webdav.fio
@@ -0,0 +1,26 @@
+[global]
+ioengine=http
+rw=randwrite
+name=test
+direct=1
+http_verbose=0
+http_mode=webdav
+https=off
+http_host=localhost
+filename_format=/dav/bucket.$jobnum
+group_reporting
+bs=64k
+size=1M
+
+[create]
+numjobs=16
+rw=randwrite
+io_size=10M
+verify=sha256
+
+# This will delete all created objects again
+[trim]
+stonewall
+numjobs=16
+rw=trim
+io_size=1M
diff --git a/examples/ime.fio b/examples/ime.fio
new file mode 100644
index 0000000..e97fd1d
--- /dev/null
+++ b/examples/ime.fio
@@ -0,0 +1,51 @@
+# This jobfile performs basic write+read operations using
+# DDN's Infinite Memory Engine.
+
+[global]
+
+# Use as many jobs as possible to maximize performance
+numjobs=8
+
+# The filename should be uniform so that "read" jobs can read what
+# the "write" jobs have written.
+filename_format=fio-test-ime.$jobnum.$filenum
+
+size=25g
+bs=128k
+
+# These settings are useful for the asynchronous ime_aio engine:
+# by setting the io depth to twice the size of a "batch", we can
+# queue IOs while other IOs are "in-flight".
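+# With iodepth=32 and iodepth_batch=16, a new batch of 16 IOs can be
+# prepared while the previous batch of 16 is still in flight.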
+iodepth=32 +iodepth_batch=16 +iodepth_batch_complete=16 + +[write-psync] +stonewall +rw=write +ioengine=ime_psync + +[read-psync] +stonewall +rw=read +ioengine=ime_psync + +[write-psyncv] +stonewall +rw=write +ioengine=ime_psyncv + +[read-psyncv] +stonewall +rw=read +ioengine=ime_psyncv + +[write-aio] +stonewall +rw=write +ioengine=ime_aio + +[read-aio] +stonewall +rw=read +ioengine=ime_aio \ No newline at end of file diff --git a/examples/iometer-file-access-server.fio b/examples/iometer-file-access-server.fio new file mode 100644 index 0000000..291bace --- /dev/null +++ b/examples/iometer-file-access-server.fio @@ -0,0 +1,18 @@ +# This job file tries to mimic the Intel IOMeter File Server Access Pattern +[global] +description=Emulation of Intel IOmeter File Server Access Pattern + +[iometer] +bssplit=512/10:1k/5:2k/5:4k/60:8k/2:16k/4:32k/4:64k/10 +rw=randrw +rwmixread=80 +direct=1 +size=4g +ioengine=libaio +# IOMeter defines the server loads as the following: +# iodepth=1 Linear +# iodepth=4 Very Light +# iodepth=8 Light +# iodepth=64 Moderate +# iodepth=256 Heavy +iodepth=64 diff --git a/examples/jesd219.fio b/examples/jesd219.fio new file mode 100644 index 0000000..24f16f7 --- /dev/null +++ b/examples/jesd219.fio @@ -0,0 +1,20 @@ +# Sample implementation of the JESD219 workload for SSD endurance +# testing. It uses a specific distribution of block sizes and +# read/write mix, as well as a specific distribution of where on +# the device the IO accesses will land. Based on posting from +# Jeff Furlong +[JESD219] +ioengine=libaio +direct=1 +rw=randrw +norandommap +randrepeat=0 +rwmixread=40 +rwmixwrite=60 +iodepth=256 +numjobs=4 +bssplit=512/4:1024/1:1536/1:2048/1:2560/1:3072/1:3584/1:4k/67:8k/10:16k/7:32k/3:64k/3 +blockalign=4k +random_distribution=zoned:50/5:30/15:20/80 +filename=/dev/nvme0n1 +group_reporting=1 diff --git a/examples/latency-profile.fio b/examples/latency-profile.fio new file mode 100644 index 0000000..f5cd844 --- /dev/null +++ b/examples/latency-profile.fio @@ -0,0 +1,21 @@ +# Test job that demonstrates how to use the latency target +# profiling. Fio will find the queue depth between 1..128 +# that fits within the latency constraints of this 4k random +# read workload. 
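+#
+# In other words: over each 5 second window, fio checks whether 99.9% of
+# completions came in under 500ms, and settles on the highest queue depth
+# (up to iodepth=128) that still meets that constraint.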
+
+[global]
+bs=4k
+rw=randread
+random_generator=lfsr
+direct=1
+ioengine=libaio
+iodepth=128
+# Set max acceptable latency to 500msec
+latency_target=500000
+# profile over a 5s window
+latency_window=5000000
+# 99.9% of IOs must be below the target
+latency_percentile=99.9
+
+[device]
+filename=/dev/sda
diff --git a/examples/libhdfs.fio b/examples/libhdfs.fio
new file mode 100644
index 0000000..d5c0ba6
--- /dev/null
+++ b/examples/libhdfs.fio
@@ -0,0 +1,8 @@
+[global]
+runtime=300
+
+[hdfs]
+filename=dfs-perftest-base.dfs-perftest-base,9000
+ioengine=libhdfs
+rw=read
+bs=256k
diff --git a/examples/libiscsi.fio b/examples/libiscsi.fio
new file mode 100644
index 0000000..565604d
--- /dev/null
+++ b/examples/libiscsi.fio
@@ -0,0 +1,3 @@
+[iscsi]
+ioengine=libiscsi
+filename=iscsi\://127.0.0.1/iqn.2016-02.com.fio\:system\:fio/1
diff --git a/examples/libpmem.fio b/examples/libpmem.fio
new file mode 100644
index 0000000..d44fcfa
--- /dev/null
+++ b/examples/libpmem.fio
@@ -0,0 +1,73 @@
+[global]
+bs=4k
+size=8g
+ioengine=libpmem
+norandommap
+time_based=1
+group_reporting
+invalidate=1
+disable_lat=1
+disable_slat=1
+disable_clat=1
+clat_percentiles=0
+
+iodepth=1
+iodepth_batch=1
+thread=1
+numjobs=1
+
+#
+# With 'scramble_buffers=1', the source buffer
+# is rewritten with a random value on every write operation.
+#
+# With 'scramble_buffers=0', the source buffer isn't
+# rewritten, so it is likely to stay in the CPU cache, which
+# can make performance look artificially high.
+#
+scramble_buffers=0
+
+#
+# direct=0:
+# Using pmem_memcpy_nodrain() for write operations
+#
+# direct=1:
+# Using pmem_memcpy_persist() for write operations
+#
+direct=0
+
+#
+# Settings for the fio process's CPU node and memory node
+#
+numa_cpu_nodes=0
+numa_mem_policy=bind:0
+
+#
+# split means that each job will get a unique CPU from the CPU set
+#
+cpus_allowed_policy=split
+
+#
+# The libpmem engine does IO to files in a DAX-mounted filesystem.
+# The filesystem should be created on an NVDIMM (e.g /dev/pmem0)
+# and then mounted with the '-o dax' option. Note that the engine
+# accesses the underlying NVDIMM directly, bypassing the kernel block
+# layer, so the usual filesystem/disk performance monitoring tools such
+# as iostat will not provide useful data.
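+#
+# For example, such a filesystem could be created and mounted with
+# something like the following (a sketch; the device and mount point
+# may differ on your system):
+#
+#   mkfs.ext4 /dev/pmem0
+#   mount -o dax /dev/pmem0 /mnt/pmem0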
+# +directory=/mnt/pmem0 + +[libpmem-seqwrite] +rw=write +stonewall + +#[libpmem-seqread] +#rw=read +#stonewall + +#[libpmem-randwrite] +#rw=randwrite +#stonewall + +#[libpmem-randread] +#rw=randread +#stonewall diff --git a/examples/mtd.fio b/examples/mtd.fio new file mode 100644 index 0000000..e5dcea4 --- /dev/null +++ b/examples/mtd.fio @@ -0,0 +1,21 @@ +[global] +gtod_reduce=1 +filename=/dev/mtd0 +ioengine=mtd +ignore_error=,EIO +blocksize=512,512,16384 +skip_bad=1 + +[write] +stonewall +rw=trim + +[write] +stonewall +rw=write + +[write] +stonewall +block_error_percentiles=1 +rw=trimwrite +loops=4 diff --git a/examples/nbd.fio b/examples/nbd.fio new file mode 100644 index 0000000..6900ebe --- /dev/null +++ b/examples/nbd.fio @@ -0,0 +1,35 @@ +# To use fio to test nbdkit: +# +# nbdkit -U - memory size=256M --run 'export unixsocket; fio examples/nbd.fio' +# +# To use fio to test qemu-nbd: +# +# rm -f /tmp/disk.img /tmp/socket +# truncate -s 256M /tmp/disk.img +# export unixsocket=/tmp/socket +# qemu-nbd -t -k $unixsocket -f raw /tmp/disk.img & +# fio examples/nbd.fio +# killall qemu-nbd + +[global] +ioengine=nbd +uri=nbd+unix:///?socket=${unixsocket} +# Starting from nbdkit 1.14 the following will work: +#uri=${uri} +rw=randrw +time_based +runtime=60 +group_reporting +iodepth=64 + +[job0] +offset=0 + +[job1] +offset=64m + +[job2] +offset=128m + +[job3] +offset=192m diff --git a/examples/netio.fio b/examples/netio.fio new file mode 100644 index 0000000..0c5c77c --- /dev/null +++ b/examples/netio.fio @@ -0,0 +1,20 @@ +# Example network job, just defines two clients that send/recv data +[global] +ioengine=net +#Use hostname=/tmp.fio.sock for local unix domain sockets +port=8888 +#Use =udp for UDP, =unix for local unix domain socket +protocol=tcp +bs=4k +size=100g +#set the below option to enable end-to-end data integrity tests +#verify=md5 + +[receiver] +listen +rw=read + +[sender] +hostname=localhost +startdelay=1 +rw=write diff --git a/examples/netio_multicast.fio b/examples/netio_multicast.fio new file mode 100644 index 0000000..f7d9d26 --- /dev/null +++ b/examples/netio_multicast.fio @@ -0,0 +1,23 @@ +# netio UDP multicast example. Writers and readers can be run on separate hosts. 
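+#
+# For a two-host run, you might start the readers on one machine and the
+# writer on the other, e.g. (a sketch using fio's --section option):
+#
+#   host A: fio --section=pingpong_reader examples/netio_multicast.fio
+#   host B: fio --section=pingpong_writer examples/netio_multicast.fio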
+[global] +ioengine=net +protocol=udp +bs=64 +size=100m +# Set interface IP to send/receive traffic through specific network interface +#interface=10.8.16.22 +port=10000 +hostname=239.0.0.0 +ttl=1 + +[pingpong_reader] +pingpong=1 +rw=read + +[normal_reader] +rw=read + +[pingpong_writer] +startdelay=1 +pingpong=1 +rw=write diff --git a/examples/null.fio b/examples/null.fio new file mode 100644 index 0000000..9d2f3e0 --- /dev/null +++ b/examples/null.fio @@ -0,0 +1,10 @@ +[global] +bs=4k +gtod_reduce=1 + +[null] +ioengine=null +size=100g +rw=randread +norandommap +time_based=0 diff --git a/examples/numa.fio b/examples/numa.fio new file mode 100644 index 0000000..b81964f --- /dev/null +++ b/examples/numa.fio @@ -0,0 +1,21 @@ +; setup numa policy for each thread +; 'numactl --show' to determine the maximum numa nodes +[global] +ioengine=libaio +buffered=0 +rw=randread +bs=512K +iodepth=16 +size=512m +filename=/dev/sdb1 + +; Fix memory blocks (512K * 16) in numa node 0 +[job1] +numa_cpu_nodes=0 +numa_mem_policy=bind:0 + +; Interleave memory blocks (512K * 16) in numa node 0 and 1 +[job2] +numa_cpu_nodes=0-1 +numa_mem_policy=interleave:0-1 + diff --git a/examples/pmemblk.fio b/examples/pmemblk.fio new file mode 100644 index 0000000..2d5ecfc --- /dev/null +++ b/examples/pmemblk.fio @@ -0,0 +1,71 @@ +[global] +bs=1m +ioengine=pmemblk +norandommap +time_based=1 +runtime=30 +group_reporting +disable_lat=1 +disable_slat=1 +disable_clat=1 +clat_percentiles=0 +cpus_allowed_policy=split + +# For the pmemblk engine: +# +# IOs always complete immediately +# IOs are always direct +# Must use threads +# +iodepth=1 +direct=1 +thread=1 +numjobs=16 +# +# Unlink can be used to remove the files when done, but if you are +# using serial runs with stonewall, and you want the files to be created +# only once and unlinked only at the very end, then put the unlink=1 +# in the last group. This is the method demonstrated here. +# +# Note that if you have a read-only group and if the files will be +# newly created, then all of the data will read back as zero and the +# read will be optimized, yielding performance that is different from +# that of reading non-zero blocks (or unoptimized zero blocks). +# +unlink=0 +# +# The pmemblk engine does IO to files in a DAX-mounted filesystem. +# The filesystem should be created on an NVDIMM (e.g /dev/pmem0) +# and then mounted with the '-o dax' option. Note that the engine +# accesses the underlying NVDIMM directly, bypassing the kernel block +# layer, so the usual filesystem/disk performance monitoring tools such +# as iostat will not provide useful data. +# +# Here we specify a test file on each of two NVDIMMs. The first +# number after the file name is the block size in bytes (4096 bytes +# in this example). The second number is the size of the file to +# create in MiB (1 GiB in this example); note that the actual usable +# space available to fio will be less than this as libpmemblk requires +# some space for metadata. +# +# Currently, the minimum block size is 512 bytes and the minimum file +# size is about 17 MiB (these are libpmemblk requirements). +# +# While both files in this example have the same block size and file +# size, this is not required. 
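+#
+# As a further illustration, the smallest file libpmemblk would accept per
+# the limits above would be spelled roughly like this (hypothetical path
+# and name):
+#
+#   filename=/pmem0/fio-min,512,17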
+#
+filename=/pmem0/fio-test,4096,1024
+filename=/pmem1/fio-test,4096,1024
+
+[pmemblk-write]
+rw=randwrite
+stonewall
+
+[pmemblk-read]
+rw=randread
+stonewall
+#
+# We're done, so unlink the file:
+#
+unlink=1
+
diff --git a/examples/poisson-rate-submission.fio b/examples/poisson-rate-submission.fio
new file mode 100644
index 0000000..4bb28f2
--- /dev/null
+++ b/examples/poisson-rate-submission.fio
@@ -0,0 +1,14 @@
+[poisson-rate-submit]
+size=128m
+rw=randread
+ioengine=libaio
+iodepth=32
+direct=1
+# by setting the submit mode to offload, we can guarantee a fixed rate of
+# submission regardless of what the device completion rate is.
+io_submit_mode=offload
+rate_iops=50
+# Real-world random request flow follows a Poisson process. To give better
+# insight into the latency distribution, we simulate the request flow as a
+# Poisson process.
+rate_process=poisson
diff --git a/examples/rados.fio b/examples/rados.fio
new file mode 100644
index 0000000..035cbff
--- /dev/null
+++ b/examples/rados.fio
@@ -0,0 +1,24 @@
+######################################################################
+# Example test for the RADOS engine.
+#
+# Runs a 4k random write test against RADOS via librados
+#
+# NOTE: Make sure you either have a Ceph pool named 'rados' or change
+# the pool parameter.
+######################################################################
+[global]
+#logging
+#write_iops_log=write_iops_log
+#write_bw_log=write_bw_log
+#write_lat_log=write_lat_log
+ioengine=rados
+clientname=admin
+pool=rados
+busy_poll=0
+rw=randwrite
+bs=4k
+
+[rbd_iodepth32]
+iodepth=32
+size=128m
+nrfiles=32
diff --git a/examples/rand-zones.fio b/examples/rand-zones.fio
new file mode 100644
index 0000000..169137d
--- /dev/null
+++ b/examples/rand-zones.fio
@@ -0,0 +1,26 @@
+# Sample job file demonstrating how to use zoned random distributions
+# to have skewed random accesses. This example has 50% of the accesses
+# to the first 5% of the file (50/5), 30% to the next 15% (30/15), and
+# finally 20% of the IO will end up in the remaining 80%.
+[zones]
+size=2g
+direct=1
+bs=4k
+rw=randread
+norandommap
+random_distribution=zoned:50/5:30/15:20/80
+
+# It's also possible to use zoned_abs to specify absolute sizes. For
+# instance, if you do:
+#
+# random_distribution=zoned_abs:50/10G:30/100G:20/500G
+#
+# Then 50% of the accesses will be to the first 10G of the drive, 30%
+# will be to the next 100G, and 20% will be to the next 500G.
+
+# The above applies to all of reads/writes/trims. If we wanted to do
+# something different for writes, let's say 50% for the first 10%
+# and 50% for the remaining 90%, we could do it by adding a new section
+# after a comma.
+
+# random_distribution=zoned:50/5:30/15:20/80,50/10:50/90
diff --git a/examples/rbd.fio b/examples/rbd.fio
new file mode 100644
index 0000000..c6901f4
--- /dev/null
+++ b/examples/rbd.fio
@@ -0,0 +1,22 @@
+######################################################################
+# Example test for the RBD engine.
+#
+# Runs a 4k random write test against an RBD via librbd
+#
+# NOTE: Make sure you either have an RBD named 'fio_test' or change
+# the rbdname parameter.
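+#
+# If needed, the test image can be created first with something like
+# (a sketch): rbd create fio_test --size 2048 --pool rbd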
+######################################################################
+[global]
+#logging
+#write_iops_log=write_iops_log
+#write_bw_log=write_bw_log
+#write_lat_log=write_lat_log
+ioengine=rbd
+clientname=admin
+pool=rbd
+rbdname=fio_test
+rw=randwrite
+bs=4k
+
+[rbd_iodepth32]
+iodepth=32
diff --git a/examples/rdmaio-client.fio b/examples/rdmaio-client.fio
new file mode 100644
index 0000000..286aa21
--- /dev/null
+++ b/examples/rdmaio-client.fio
@@ -0,0 +1,13 @@
+# Example rdma client job
+[global]
+ioengine=rdma
+hostname=[hostname]
+port=[port]
+verb=[read/write/send/recv]
+bs=1m
+size=100g
+
+[sender]
+rw=write
+iodepth=1
+iodepth_batch_complete=1
diff --git a/examples/rdmaio-server.fio b/examples/rdmaio-server.fio
new file mode 100644
index 0000000..ee30856
--- /dev/null
+++ b/examples/rdmaio-server.fio
@@ -0,0 +1,10 @@
+# Example rdma server job
+[global]
+ioengine=rdma
+port=[port]
+bs=1m
+size=100g
+
+[receiver]
+rw=read
+iodepth=16
diff --git a/examples/ssd-steadystate.fio b/examples/ssd-steadystate.fio
new file mode 100644
index 0000000..9b91467
--- /dev/null
+++ b/examples/ssd-steadystate.fio
@@ -0,0 +1,36 @@
+# Get a decent idea about the steady state performance of an SSD.
+#
+# First we sequentially write the drive. Then we completely
+# overwrite the device again, this time randomly at 4K. The former gives
+# us a good idea of the ideal write performance; you should see a flat
+# graph of steady write performance. The latter we would expect to start
+# out at approximately the same rate as the sequential fill, but at some
+# point hit a write cliff and settle into steady state. The steady state
+# latency numbers also give a good idea of what kind of latencies to
+# expect when the device is pushed to steady state, as opposed to the
+# peak benchmark-like numbers that are usually reported.
+#
+# Note that this is a DESTRUCTIVE test. It operates on the device itself.
+# It's not destructive in the sense that it will ruin the device, but
+# whatever data you have on there will be gone.
+#
+[global]
+ioengine=libaio
+direct=1
+group_reporting
+filename=/dev/fioa
+
+[sequential-fill]
+description=Sequential fill phase
+rw=write
+iodepth=16
+bs=1M
+
+[random-write-steady]
+stonewall
+description=Random write steady state phase
+rw=randwrite
+bs=4K
+iodepth=32
+numjobs=4
+write_bw_log=fioa-steady-state
diff --git a/examples/ssd-test.fio b/examples/ssd-test.fio
new file mode 100644
index 0000000..2b6a590
--- /dev/null
+++ b/examples/ssd-test.fio
@@ -0,0 +1,37 @@
+# Do some important numbers on SSD drives, to gauge what kind of
+# performance you might get out of them.
+#
+# Sequential read and write speeds are tested; these are expected to be
+# high. Random reads should also be fast; random writes are where crap
+# drives are usually separated from the good drives.
+#
+# This uses a queue depth of 4. New SATA SSDs will support up to 32
+# in-flight commands, so it may also be interesting to increase the queue
+# depth and compare. Note that most real-life usage will not see that
+# large of a queue depth, so 4 is more representative of normal use.
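+#
+# To compare, you could copy this file and raise iodepth (say, to 32) to
+# see how much the drive gains from a deeper queue.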
+# +[global] +bs=4k +ioengine=libaio +iodepth=4 +size=10g +direct=1 +runtime=60 +directory=/mount-point-of-ssd +filename=ssd.test.file + +[seq-read] +rw=read +stonewall + +[rand-read] +rw=randread +stonewall + +[seq-write] +rw=write +stonewall + +[rand-write] +rw=randwrite +stonewall diff --git a/examples/steadystate.fio b/examples/steadystate.fio new file mode 100644 index 0000000..26fb808 --- /dev/null +++ b/examples/steadystate.fio @@ -0,0 +1,45 @@ +# +# Example job file for steady state job termination +# Use --output-format=json for detailed information +# +# For Windows, change the file names +# + +[global] +threads=1 +group_reporting=1 +time_based +size=128m + +[ss-write] +filename=/dev/null +rw=write +bs=128k +numjobs=4 +runtime=5m +ss=iops:10% +ss_dur=30s +ss_ramp=10s +# +# Begin ss detection 10s after job starts +# Terminate job when largest deviation from mean IOPS is 10% +# Use a rolling 30s window for deviations +# + + +[ss-read] +new_group +stonewall +filename=/dev/zero +rw=randread +bs=4k +numjobs=4 +runtime=5m +ss=bw_slope:1% +ss_dur=10s +ss_ramp=5s +# +# Begin ss detection 5s after job starts +# Terminate job when bandwidth slope is less than 1% of avg bw +# Use a rolling 10s window for bw measurements +# diff --git a/examples/surface-scan.fio b/examples/surface-scan.fio new file mode 100644 index 0000000..dc3373a --- /dev/null +++ b/examples/surface-scan.fio @@ -0,0 +1,23 @@ +; writes 512 byte verification blocks until the disk is full, +; then verifies written data +[global] +thread=1 +bs=64k +direct=1 +ioengine=sync +verify=meta +verify_pattern=0xaa555aa5 +verify_interval=512 + +[write-phase] +filename=datafile.tmp ; or use a full disk, for example /dev/sda +rw=write +fill_device=1 +do_verify=0 + +[verify-phase] +stonewall +create_serialize=0 +filename=datafile.tmp +rw=read +do_verify=1 diff --git a/examples/tiobench-example.fio b/examples/tiobench-example.fio new file mode 100644 index 0000000..5a4493e --- /dev/null +++ b/examples/tiobench-example.fio @@ -0,0 +1,24 @@ +; tiobench like setup, add more fX files between the stonewalls to +; create more threads + +[global] +direct=1 +size=512m +bsrange=4k-4k +timeout=60 +numjobs=4 ; 4 simultaneous threads for each job + +[f1] +rw=write + +[f2] +stonewall +rw=randwrite + +[f3] +stonewall +rw=read + +[f4] +stonewall +rw=randread diff --git a/examples/waitfor.fio b/examples/waitfor.fio new file mode 100644 index 0000000..95fad00 --- /dev/null +++ b/examples/waitfor.fio @@ -0,0 +1,35 @@ +[global] +threads=1 +group_reporting=1 +filename=/tmp/data +filesize=128m + +[writers] +rw=write +bs=128k +numjobs=4 +runtime=10 + +[readers] +new_group +wait_for=writers +rw=randread +bs=4k +numjobs=4 +runtime=10 + +[writers2] +new_group +wait_for=readers +rw=randwrite +bs=4k +numjobs=4 +runtime=10 + +[readers2] +new_group +wait_for=writers2 +rw=randread +bs=4k +numjobs=4 +runtime=10 diff --git a/examples/zipf.fio b/examples/zipf.fio new file mode 100644 index 0000000..fcfa38d --- /dev/null +++ b/examples/zipf.fio @@ -0,0 +1,10 @@ +# Example job file for using a zipf distribution instead +# of a purely random workload where each block is read +# or written once. +[job] +ioengine=null +rw=randread +norandommap +size=1280m +bs=4k +random_distribution=zipf:0.5 diff --git a/exp/expression-parser.l b/exp/expression-parser.l new file mode 100644 index 0000000..692c6cc --- /dev/null +++ b/exp/expression-parser.l @@ -0,0 +1,184 @@ +%{ + +/* + * (C) Copyright 2014, Stephen M. Cameron. 
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ *
+ */
+
+#include <stdio.h>
+#include <string.h>
+#include "y.tab.h"
+
+#define YYSTYPE PARSER_VALUE_TYPE
+
+extern int lexer_input(char *buffer, unsigned int *nbytes, int buffersize);
+
+#undef YY_INPUT
+#define YY_INPUT(buffer, bytes_read, bytes_requested) \
+({ \
+	int __ret; \
+	unsigned int __bread = bytes_read; \
+	__ret = lexer_input((buffer), &__bread, (bytes_requested)); \
+	bytes_read = __bread; \
+	__ret; \
+})
+
+extern int yyerror(long long *result, double *dresult,
+		int *has_error, int *units_specified, const char *msg);
+
+static void __attribute__((unused)) yyunput(int c, char *buf_ptr);
+static int __attribute__((unused)) input(void);
+
+/* set by parser -- this is another thing which makes the parser thread-unsafe :(. */
+int lexer_value_is_time = 0; /* for determining if "m" suffix means mega- or minutes */
+
+#define set_suffix_value(yylval, i_val, d_val, has_d_val) \
+	(yylval).v.dval = (d_val); \
+	(yylval).v.ival = (i_val); \
+	(yylval).v.has_dval = (has_d_val); \
+	(yylval).v.has_error = 0;
+
+%}
+
+%%
+
+
+[kK]|[kK][bB]	{
+			set_suffix_value(yylval, 1024, 1024.0, 0);
+			return SUFFIX;
+		}
+[Mm][bB]	{
+			set_suffix_value(yylval, 1024 * 1024, 1024.0 * 1024.0, 0);
+			return SUFFIX;
+		}
+[mM][sS]	{
+			set_suffix_value(yylval, 1000, 1000.0, 1);
+			return SUFFIX;
+		}
+[uU][sS]	{
+			set_suffix_value(yylval, 1, 1.0, 1);
+			return SUFFIX;
+		}
+[gG]|[Gg][Bb]	{
+			set_suffix_value(yylval, 1024LL * 1024 * 1024, 1024.0 * 1024.0 * 1024, 0);
+			return SUFFIX;
+		}
+[tT]|[tT][bB]	{
+			set_suffix_value(yylval, 1024LL * 1024 * 1024 * 1024,
+					1024.0 * 1024.0 * 1024.0 * 1024.0, 0);
+			return SUFFIX;
+		}
+[pP]|[pP][bB]	{
+			set_suffix_value(yylval, 1024LL * 1024 * 1024 * 1024 * 1024,
+					1024.0 * 1024.0 * 1024.0 * 1024.0 * 1024.0, 0);
+			return SUFFIX;
+		}
+[kK][iI][Bb]	{
+			set_suffix_value(yylval, 1000LL, 1000.0, 0);
+			return SUFFIX;
+		}
+[mM][Ii][bB]	{
+			set_suffix_value(yylval, 1000000LL, 1000000.0 , 0);
+			return SUFFIX;
+		}
+[gG][iI][Bb]	{
+			set_suffix_value(yylval, 1000000000LL, 1000000000.0 , 0);
+			return SUFFIX;
+		}
+[pP][iI][Bb]	{
+			set_suffix_value(yylval, 1000000000000LL, 1000000000000.0 , 0);
+			return SUFFIX;
+		}
+[sS]		{
+			set_suffix_value(yylval, 1000000LL, 1000000.0 , 0);
+			return SUFFIX;
+		}
+[mM]		{
+			if (!lexer_value_is_time) {
+				set_suffix_value(yylval, 1024 * 1024, 1024.0 * 1024.0, 0);
+			} else {
+				set_suffix_value(yylval, 60LL * 1000000LL, 60.0 * 1000000.0, 0);
+			}
+			return SUFFIX;
+		}
+[dD]		{
+			set_suffix_value(yylval, 60LL * 60LL * 24LL * 1000000LL,
+					60.0 * 60.0 * 24.0 * 1000000.0, 0);
+			return SUFFIX;
+		}
+[hH]		{
+			set_suffix_value(yylval, 60LL * 60LL * 1000000LL,
+					60.0 * 60.0 * 1000000.0, 0);
+			return SUFFIX;
+		}
+[ \t]		; /* ignore whitespace */
+[#:,].*		; /* ignore comments, and everything after colons and commas */
+[0-9]*[.][0-9]+|[0-9]*[.]?[0-9]+[eE][-+]*[0-9]+ {
+			int rc;
+			double dval;
+
+			rc = sscanf(yytext, "%lf", &dval);
+
if (rc == 1) { + yylval.v.dval = dval; + yylval.v.ival = (long long) dval; + yylval.v.has_dval = 1; + yylval.v.has_error = 0; + return NUMBER; + } else { + yyerror(0, 0, 0, 0, "bad number\n"); + yylval.v.has_error = 1; + return NUMBER; + } + } +0x[0-9a-fA-F]+ { + int rc, intval; + rc = sscanf(yytext, "%x", &intval); + if (rc == 1) { + yylval.v.ival = intval; + yylval.v.dval = (double) intval; + yylval.v.has_dval = 0; + yylval.v.has_error = 0; + return NUMBER; + } else { + yyerror(0, 0, 0, 0, "bad number\n"); + yylval.v.has_error = 1; + return NUMBER; + } + } +[0-9]+ { + int rc, intval; + rc = sscanf(yytext, "%d", &intval); + if (rc == 1) { + yylval.v.ival = intval; + yylval.v.dval = (double) intval; + yylval.v.has_dval = 0; + yylval.v.has_error = 0; + return NUMBER; + } else { + yyerror(0, 0, 0, 0, "bad number\n"); + yylval.v.has_error = 1; + return NUMBER; + } + } +\n return 0; +[+-/*()^%] return yytext[0]; + +. { + yylval.v.has_error = 1; + return NUMBER; + } +%% + diff --git a/exp/expression-parser.y b/exp/expression-parser.y new file mode 100644 index 0000000..8619025 --- /dev/null +++ b/exp/expression-parser.y @@ -0,0 +1,247 @@ +%{ + +/* + * (C) Copyright 2014, Stephen M. Cameron. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. 
+ *
+ */
+
+#include <stdio.h>
+#include <string.h>
+#include <math.h>
+
+struct parser_value_type {
+	double dval;
+	long long ival;
+	int has_dval;
+	int has_error;
+};
+
+typedef union valtype {
+	struct parser_value_type v;
+} PARSER_VALUE_TYPE;
+
+#define YYSTYPE PARSER_VALUE_TYPE
+
+int yyerror(__attribute__((unused)) long long *result,
+	__attribute__((unused)) double *dresult,
+	__attribute__((unused)) int *has_error,
+	__attribute__((unused)) int *units_specified,
+	__attribute__((unused)) const char *msg);
+
+extern int yylex(void);
+extern void yyrestart(FILE *file);
+extern int lexer_value_is_time;
+
+%}
+
+%union valtype {
+	struct parser_value_type {
+		double dval;
+		long long ival;
+		int has_dval;
+		int has_error;
+	} v;
+};
+
+%token <v> NUMBER
+%token <v> BYE
+%token <v> SUFFIX
+%left '-' '+'
+%right SUFFIX
+%left '*' '/'
+%right '^'
+%left '%'
+%nonassoc UMINUS
+%parse-param { long long *result }
+%parse-param { double *dresult }
+%parse-param { int *has_error }
+%parse-param { int *units_specified }
+
+%type <v> expression
+%%
+
+top_level:	expression {
+			*result = $1.ival;
+			*dresult = $1.dval;
+			*has_error = $1.has_error;
+		}
+		| expression error {
+			*result = $1.ival;
+			*dresult = $1.dval;
+			*has_error = 1;
+		}
+expression:	expression '+' expression {
+			if (!$1.has_dval && !$3.has_dval)
+				$$.ival = $1.ival + $3.ival;
+			else
+				$$.ival = (long long) ($1.dval + $3.dval);
+			$$.dval = $1.dval + $3.dval;
+			$$.has_error = $1.has_error || $3.has_error;
+		}
+	| expression '-' expression {
+			if (!$1.has_dval && !$3.has_dval)
+				$$.ival = $1.ival - $3.ival;
+			else
+				$$.ival = (long long) ($1.dval - $3.dval);
+			$$.dval = $1.dval - $3.dval;
+			$$.has_error = $1.has_error || $3.has_error;
+		}
+	| expression '*' expression {
+			if (!$1.has_dval && !$3.has_dval)
+				$$.ival = $1.ival * $3.ival;
+			else
+				$$.ival = (long long) ($1.dval * $3.dval);
+			$$.dval = $1.dval * $3.dval;
+			$$.has_error = $1.has_error || $3.has_error;
+		}
+	| expression '/' expression {
+			if ($3.ival == 0)
+				yyerror(0, 0, 0, 0, "divide by zero");
+			else
+				$$.ival = $1.ival / $3.ival;
+			if ($3.dval < 1e-20 && $3.dval > -1e-20)
+				yyerror(0, 0, 0, 0, "divide by zero");
+			else
+				$$.dval = $1.dval / $3.dval;
+			if ($3.has_dval || $1.has_dval)
+				$$.ival = (long long) $$.dval;
+			$$.has_error = $1.has_error || $3.has_error;
+		}
+	| '-' expression %prec UMINUS {
+			$$.ival = -$2.ival;
+			$$.dval = -$2.dval;
+			$$.has_error = $2.has_error;
+		}
+	| '(' expression ')' { $$ = $2; }
+	| expression SUFFIX {
+			if (!$1.has_dval && !$2.has_dval)
+				$$.ival = $1.ival * $2.ival;
+			else
+				$$.ival = (long long) $1.dval * $2.dval;
+			if ($1.has_dval || $2.has_dval)
+				$$.dval = $1.dval * $2.dval;
+			else
+				$$.dval = $1.ival * $2.ival;
+			$$.has_error = $1.has_error || $2.has_error;
+			*units_specified = 1;
+		}
+	| expression '%' expression {
+			if ($1.has_dval || $3.has_dval)
+				yyerror(0, 0, 0, 0, "modulo on floats");
+			if ($3.ival == 0)
+				yyerror(0, 0, 0, 0, "divide by zero");
+			else {
+				$$.ival = $1.ival % $3.ival;
+				$$.dval = $$.ival;
+			}
+			$$.has_error = $1.has_error || $3.has_error;
+		}
+	| expression '^' expression {
+			$$.has_error = $1.has_error || $3.has_error;
+			if (!$1.has_dval && !$3.has_dval) {
+				int i;
+
+				if ($3.ival == 0) {
+					$$.ival = 1;
+				} else if ($3.ival > 0) {
+					long long tmp = $1.ival;
+					$$.ival = 1.0;
+					for (i = 0; i < $3.ival; i++)
+						$$.ival *= tmp;
+				} else {
+					/* integers, 2^-3, ok, we now have doubles */
+					double tmp;
+					if ($1.ival == 0 && $3.ival == 0) {
+						tmp = 1.0;
+						$$.has_error = 1;
+					} else {
+						double x = (double) $1.ival;
+						double y = (double) $3.ival;
+						tmp =
pow(x, y);
+					}
+					$$.ival = (long long) tmp;
+				}
+				$$.dval = pow($1.dval, $3.dval);
+			} else {
+				$$.dval = pow($1.dval, $3.dval);
+				$$.ival = (long long) $$.dval;
+			}
+		}
+	| NUMBER { $$ = $1; };
+%%
+#include <string.h>
+
+/* Urgh. yacc and lex are kind of horrible. This is not thread safe, obviously. */
+static int lexer_read_offset = 0;
+static char lexer_input_buffer[1000];
+
+int lexer_input(char* buffer, unsigned int *bytes_read, int bytes_requested)
+{
+	int bytes_left = strlen(lexer_input_buffer) - lexer_read_offset;
+
+	if (bytes_requested > bytes_left )
+		bytes_requested = bytes_left;
+	memcpy(buffer, &lexer_input_buffer[lexer_read_offset], bytes_requested);
+	*bytes_read = bytes_requested;
+	lexer_read_offset += bytes_requested;
+	return 0;
+}
+
+static void setup_to_parse_string(const char *string)
+{
+	unsigned int len;
+
+	len = sizeof(lexer_input_buffer) - 3;
+	if (len > strlen(string))
+		len = strlen(string);
+
+	strncpy(lexer_input_buffer, string, len);
+	lexer_input_buffer[len] = '\0';
+	lexer_input_buffer[len + 1] = '\0'; /* lex/yacc want string double null terminated! */
+	lexer_read_offset = 0;
+}
+
+int evaluate_arithmetic_expression(const char *buffer, long long *ival, double *dval,
+					double implied_units, int is_time)
+{
+	int rc, units_specified = 0, has_error = 0;
+
+	lexer_value_is_time = is_time;
+	setup_to_parse_string(buffer);
+	rc = yyparse(ival, dval, &has_error, &units_specified);
+	yyrestart(NULL);
+	if (rc || has_error) {
+		*ival = 0;
+		*dval = 0;
+		has_error = 1;
+	}
+	if (!units_specified) {
+		*ival = (int) ((double) *ival * implied_units);
+		*dval = *dval * implied_units;
+	}
+	return has_error;
+}
+
+int yyerror(__attribute__((unused)) long long *result,
+	__attribute__((unused)) double *dresult,
+	__attribute__((unused)) int *has_error,
+	__attribute__((unused)) int *units_specified,
+	__attribute__((unused)) const char *msg)
+{
+	/* We do not need to do anything here. */
+	return 0;
+}
+
diff --git a/exp/test-expression-parser.c b/exp/test-expression-parser.c
new file mode 100644
index 0000000..e22f24d
--- /dev/null
+++ b/exp/test-expression-parser.c
@@ -0,0 +1,54 @@
+/*
+ * (C) Copyright 2014, Stephen M. Cameron.
+ *
+ * The license below covers all files distributed with fio unless otherwise
+ * noted in the file itself.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
diff --git a/exp/test-expression-parser.c b/exp/test-expression-parser.c
new file mode 100644
index 0000000..e22f24d
--- /dev/null
+++ b/exp/test-expression-parser.c
@@ -0,0 +1,54 @@
+/*
+ * (C) Copyright 2014, Stephen M. Cameron.
+ *
+ * The license below covers all files distributed with fio unless otherwise
+ * noted in the file itself.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ *
+ */
+
+#include <stdio.h>
+#include <string.h>
+
+#include "../y.tab.h"
+
+extern int evaluate_arithmetic_expression(const char *buffer, long long *ival,
+					double *dval, double implied_units, int is_time);
+
+int main(int argc, char *argv[])
+{
+	int rc, bye = 0;
+	long long result;
+	double dresult;
+	char buffer[100];
+
+	do {
+		if (fgets(buffer, 90, stdin) == NULL)
+			break;
+		rc = strlen(buffer);
+		if (rc > 0 && buffer[rc - 1] == '\n')
+			buffer[rc - 1] = '\0';
+		rc = evaluate_arithmetic_expression(buffer, &result, &dresult, 1.0, 0);
+		if (!rc) {
+			printf("%lld (%20.20lf)\n", result, dresult);
+		} else {
+			fprintf(stderr, "Syntax error\n");
+			result = 0;
+			dresult = 0;
+		}
+	} while (!bye);
+	return 0;
+}
+
diff --git a/fifo.c b/fifo.c
new file mode 100644
index 0000000..ac0d215
--- /dev/null
+++ b/fifo.c
@@ -0,0 +1,98 @@
+/*
+ * A simple kernel FIFO implementation.
+ *
+ * Copyright (C) 2004 Stelian Pop
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ *
+ */
+
+#include <stdlib.h>
+#include <string.h>
+#include <assert.h>
+
+#include "fifo.h"
+#include "minmax.h"
+
+struct fifo *fifo_alloc(unsigned int size)
+{
+	struct fifo *fifo;
+
+	fifo = malloc(sizeof(struct fifo));
+	if (!fifo)
+		return NULL;
+
+	fifo->buffer = malloc(size);
+	fifo->size = size;
+	fifo->in = fifo->out = 0;
+
+	return fifo;
+}
+
+void fifo_free(struct fifo *fifo)
+{
+	free(fifo->buffer);
+	free(fifo);
+}
+
+unsigned int fifo_put(struct fifo *fifo, void *buffer, unsigned int len)
+{
+	unsigned int l;
+
+	len = min(len, fifo_room(fifo));
+
+	/* first put the data starting from fifo->in to buffer end */
+	l = min(len, fifo->size - (fifo->in & (fifo->size - 1)));
+	memcpy(fifo->buffer + (fifo->in & (fifo->size - 1)), buffer, l);
+
+	/* then put the rest (if any) at the beginning of the buffer */
+	memcpy(fifo->buffer, buffer + l, len - l);
+
+	/*
+	 * Ensure that we add the bytes to the fifo -before-
+	 * we update the fifo->in index.
+	 */
+
+	fifo->in += len;
+
+	return len;
+}
+
+unsigned int fifo_get(struct fifo *fifo, void *buf, unsigned int len)
+{
+	len = min(len, fifo->in - fifo->out);
+
+	if (buf) {
+		unsigned int l;
+
+		/*
+		 * first get the data from fifo->out until the end of the buffer
+		 */
+		l = min(len, fifo->size - (fifo->out & (fifo->size - 1)));
+		memcpy(buf, fifo->buffer + (fifo->out & (fifo->size - 1)), l);
+
+		/*
+		 * then get the rest (if any) from the beginning of the buffer
+		 */
+		memcpy(buf + l, fifo->buffer, len - l);
+	}
+
+	fifo->out += len;
+
+	if (fifo->in == fifo->out)
+		fifo->in = fifo->out = 0;
+
+	return len;
+}
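Note that fifo_put() and fifo_get() both index with in & (size - 1), so this implementation only works when the allocated size is a power of two; fifo_alloc() does not verify that, it is the caller's contract. The unsigned in/out counters are also allowed to wrap, since only their difference matters. A minimal round-trip sketch under those assumptions (fifo_smoke_test() is illustrative, not part of fio):

	#include <assert.h>
	#include <string.h>
	#include "fifo.h"

	static void fifo_smoke_test(void)
	{
		struct fifo *f = fifo_alloc(4096);	/* must be a power of two */
		char in[] = "hello", out[sizeof(in)];

		/* put and get report how many bytes they actually moved */
		assert(fifo_put(f, in, sizeof(in)) == sizeof(in));
		assert(fifo_get(f, out, sizeof(out)) == sizeof(out));
		assert(!memcmp(in, out, sizeof(in)));
		fifo_free(f);
	}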
diff --git a/fifo.h b/fifo.h
new file mode 100644
index 0000000..61cc5a8
--- /dev/null
+++ b/fifo.h
@@ -0,0 +1,46 @@
+#ifndef FIO_FIFO_H
+#define FIO_FIFO_H
+/*
+ * A simple FIFO implementation.
+ *
+ * Copyright (C) 2004 Stelian Pop
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ *
+ */
+
+struct fifo {
+	unsigned char *buffer;	/* the buffer holding the data */
+	unsigned int size;	/* the size of the allocated buffer */
+	unsigned int in;	/* data is added at offset (in % size) */
+	unsigned int out;	/* data is extracted from off. (out % size) */
+};
+
+struct fifo *fifo_alloc(unsigned int);
+unsigned int fifo_put(struct fifo *, void *, unsigned int);
+unsigned int fifo_get(struct fifo *, void *, unsigned int);
+void fifo_free(struct fifo *);
+
+static inline unsigned int fifo_len(struct fifo *fifo)
+{
+	return fifo->in - fifo->out;
+}
+
+static inline unsigned int fifo_room(struct fifo *fifo)
+{
+	return fifo->size - fifo->in + fifo->out;
+}
+
+#endif
diff --git a/file.h b/file.h
new file mode 100644
index 0000000..ae0e6fc
--- /dev/null
+++ b/file.h
@@ -0,0 +1,230 @@
+#ifndef FIO_FILE_H
+#define FIO_FILE_H
+
+#include <string.h>
+#include "compiler/compiler.h"
+#include "io_ddir.h"
+#include "flist.h"
+#include "lib/zipf.h"
+#include "lib/axmap.h"
+#include "lib/lfsr.h"
+#include "lib/gauss.h"
+
+/* Forward declarations */
+struct zoned_block_device_info;
+
+/*
+ * The type of object we are working on
+ */
+enum fio_filetype {
+	FIO_TYPE_FILE = 1,		/* plain file */
+	FIO_TYPE_BLOCK,			/* block device */
+	FIO_TYPE_CHAR,			/* character device */
+	FIO_TYPE_PIPE,			/* pipe */
+};
+
+enum fio_file_flags {
+	FIO_FILE_open		= 1 << 0,	/* file is open */
+	FIO_FILE_closing	= 1 << 1,	/* file being closed */
+	FIO_FILE_extend		= 1 << 2,	/* needs extend */
+	FIO_FILE_done		= 1 << 3,	/* io completed to this file */
+	FIO_FILE_size_known	= 1 << 4,	/* size has been set */
+	FIO_FILE_hashed		= 1 << 5,	/* file is on hash */
+	FIO_FILE_partial_mmap	= 1 << 6,	/* can't do full mmap */
+	FIO_FILE_axmap		= 1 << 7,	/* uses axmap */
+	FIO_FILE_lfsr		= 1 << 8,	/* lfsr is used */
+};
+
+enum file_lock_mode {
+	FILE_LOCK_NONE,
+	FILE_LOCK_EXCLUSIVE,
+	FILE_LOCK_READWRITE,
+};
+
+/*
+ * How fio chooses what file to service next. Choice of uniformly random, or
+ * some skewed random variants, or just sequentially go through them, or
+ * round robin.
+ */
+enum {
+	FIO_FSERVICE_RANDOM		= 1,
+	FIO_FSERVICE_RR			= 2,
+	FIO_FSERVICE_SEQ		= 3,
+	__FIO_FSERVICE_NONUNIFORM	= 0x100,
+	FIO_FSERVICE_ZIPF		= __FIO_FSERVICE_NONUNIFORM | 4,
+	FIO_FSERVICE_PARETO		= __FIO_FSERVICE_NONUNIFORM | 5,
+	FIO_FSERVICE_GAUSS		= __FIO_FSERVICE_NONUNIFORM | 6,
+
+	FIO_FSERVICE_SHIFT		= 10,
+};
+
+/*
+ * No pre-allocation when laying down files, or call posix_fallocate(), or
+ * call fallocate() with FALLOC_FL_KEEP_SIZE set.
+ */ +enum fio_fallocate_mode { + FIO_FALLOCATE_NONE = 1, + FIO_FALLOCATE_POSIX = 2, + FIO_FALLOCATE_KEEP_SIZE = 3, + FIO_FALLOCATE_NATIVE = 4, + FIO_FALLOCATE_TRUNCATE = 5, +}; + +/* + * Each thread_data structure has a number of files associated with it, + * this structure holds state information for a single file. + */ +struct fio_file { + struct flist_head hash_list; + enum fio_filetype filetype; + + int fd; + int shadow_fd; +#ifdef WIN32 + HANDLE hFile; + HANDLE ioCP; +#endif + + /* + * filename and possible memory mapping + */ + unsigned int major, minor; + int fileno; + char *file_name; + + /* + * size of the file, offset into file, and io size from that offset + * (be aware io_size is different from thread_options::io_size) + */ + uint64_t real_file_size; + uint64_t file_offset; + uint64_t io_size; + + /* + * Zoned block device information. See also zonemode=zbd. + */ + struct zoned_block_device_info *zbd_info; + + /* + * Track last end and last start of IO for a given data direction + */ + uint64_t last_pos[DDIR_RWDIR_CNT]; + uint64_t last_start[DDIR_RWDIR_CNT]; + + uint64_t first_write; + uint64_t last_write; + + /* + * Tracks the last iodepth number of completed writes, if data + * verification is enabled + */ + uint64_t *last_write_comp; + unsigned int last_write_idx; + + /* + * For use by the io engine for offset or private data storage + */ + union { + uint64_t engine_pos; + void *engine_data; + }; + + /* + * if io is protected by a semaphore, this is set + */ + union { + struct fio_sem *lock; + struct fio_rwlock *rwlock; + }; + + /* + * block map or LFSR for random io + */ + union { + struct axmap *io_axmap; + struct fio_lfsr lfsr; + }; + + /* + * Used for zipf random distribution + */ + union { + struct zipf_state zipf; + struct gauss_state gauss; + }; + + int references; + enum fio_file_flags flags; + + struct disk_util *du; +}; + +#define FILE_ENG_DATA(f) ((f)->engine_data) +#define FILE_SET_ENG_DATA(f, data) ((f)->engine_data = (data)) + +#define FILE_FLAG_FNS(name) \ +static inline void fio_file_set_##name(struct fio_file *f) \ +{ \ + (f)->flags = (enum fio_file_flags) ((f)->flags | FIO_FILE_##name); \ +} \ +static inline void fio_file_clear_##name(struct fio_file *f) \ +{ \ + (f)->flags = (enum fio_file_flags) ((f)->flags & ~FIO_FILE_##name); \ +} \ +static inline int fio_file_##name(struct fio_file *f) \ +{ \ + return ((f)->flags & FIO_FILE_##name) != 0; \ +} + +FILE_FLAG_FNS(open); +FILE_FLAG_FNS(closing); +FILE_FLAG_FNS(extend); +FILE_FLAG_FNS(done); +FILE_FLAG_FNS(size_known); +FILE_FLAG_FNS(hashed); +FILE_FLAG_FNS(partial_mmap); +FILE_FLAG_FNS(axmap); +FILE_FLAG_FNS(lfsr); +#undef FILE_FLAG_FNS + +/* + * File setup/shutdown + */ +struct thread_data; +extern void close_files(struct thread_data *); +extern void close_and_free_files(struct thread_data *); +extern uint64_t get_start_offset(struct thread_data *, struct fio_file *); +extern int __must_check setup_files(struct thread_data *); +extern int __must_check file_invalidate_cache(struct thread_data *, struct fio_file *); +#ifdef __cplusplus +extern "C" { +#endif +extern int __must_check generic_open_file(struct thread_data *, struct fio_file *); +extern int __must_check generic_close_file(struct thread_data *, struct fio_file *); +extern int __must_check generic_get_file_size(struct thread_data *, struct fio_file *); +#ifdef __cplusplus +} +#endif +extern int __must_check file_lookup_open(struct fio_file *f, int flags); +extern bool __must_check pre_read_files(struct thread_data *); +extern unsigned long long 
get_rand_file_size(struct thread_data *td);
+extern int add_file(struct thread_data *, const char *, int, int);
+extern int add_file_exclusive(struct thread_data *, const char *);
+extern void get_file(struct fio_file *);
+extern int __must_check put_file(struct thread_data *, struct fio_file *);
+extern void put_file_log(struct thread_data *, struct fio_file *);
+extern void lock_file(struct thread_data *, struct fio_file *, enum fio_ddir);
+extern void unlock_file(struct thread_data *, struct fio_file *);
+extern void unlock_file_all(struct thread_data *, struct fio_file *);
+extern int add_dir_files(struct thread_data *, const char *);
+extern bool init_random_map(struct thread_data *);
+extern void dup_files(struct thread_data *, struct thread_data *);
+extern int get_fileno(struct thread_data *, const char *);
+extern void free_release_files(struct thread_data *);
+extern void filesetup_mem_free(void);
+extern void fio_file_reset(struct thread_data *, struct fio_file *);
+extern bool fio_files_done(struct thread_data *);
+extern bool exists_and_not_regfile(const char *);
+extern int fio_set_directio(struct thread_data *, struct fio_file *);
+
+#endif
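The FILE_FLAG_FNS() macro above stamps out a set/clear/test helper triple per flag, and the rest of fio uses those helpers instead of open-coded bit twiddling. A small sketch of how the generated functions compose (mark_file_ready() is illustrative, not part of fio):

	#include "file.h"

	/* Mark a file open and, once its size is valid, remember that too
	 * so later setup passes can skip the stat()/ioctl() probing. */
	static void mark_file_ready(struct fio_file *f)
	{
		fio_file_set_open(f);
		if (!fio_file_size_known(f) && f->real_file_size != -1ULL)
			fio_file_set_size_known(f);
	}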
diff --git a/filehash.c b/filehash.c
new file mode 100644
index 0000000..b55ab73
--- /dev/null
+++ b/filehash.c
@@ -0,0 +1,139 @@
+#include <string.h>
+#include <assert.h>
+
+#include "fio.h"
+#include "flist.h"
+#include "hash.h"
+#include "filehash.h"
+#include "smalloc.h"
+#include "lib/bloom.h"
+
+#define HASH_BUCKETS	512
+#define HASH_MASK	(HASH_BUCKETS - 1)
+
+#define BLOOM_SIZE	16*1024*1024
+
+static unsigned int file_hash_size = HASH_BUCKETS * sizeof(struct flist_head);
+
+static struct flist_head *file_hash;
+static struct fio_sem *hash_lock;
+static struct bloom *file_bloom;
+
+static unsigned short hash(const char *name)
+{
+	return jhash(name, strlen(name), 0) & HASH_MASK;
+}
+
+void fio_file_hash_lock(void)
+{
+	if (hash_lock)
+		fio_sem_down(hash_lock);
+}
+
+void fio_file_hash_unlock(void)
+{
+	if (hash_lock)
+		fio_sem_up(hash_lock);
+}
+
+void remove_file_hash(struct fio_file *f)
+{
+	fio_sem_down(hash_lock);
+
+	if (fio_file_hashed(f)) {
+		assert(!flist_empty(&f->hash_list));
+		flist_del_init(&f->hash_list);
+		fio_file_clear_hashed(f);
+	}
+
+	fio_sem_up(hash_lock);
+}
+
+static struct fio_file *__lookup_file_hash(const char *name)
+{
+	struct flist_head *bucket = &file_hash[hash(name)];
+	struct flist_head *n;
+
+	flist_for_each(n, bucket) {
+		struct fio_file *f = flist_entry(n, struct fio_file, hash_list);
+
+		if (!f->file_name)
+			continue;
+
+		if (!strcmp(f->file_name, name)) {
+			assert(f->fd != -1);
+			return f;
+		}
+	}
+
+	return NULL;
+}
+
+struct fio_file *lookup_file_hash(const char *name)
+{
+	struct fio_file *f;
+
+	fio_sem_down(hash_lock);
+	f = __lookup_file_hash(name);
+	fio_sem_up(hash_lock);
+	return f;
+}
+
+struct fio_file *add_file_hash(struct fio_file *f)
+{
+	struct fio_file *alias;
+
+	if (fio_file_hashed(f))
+		return NULL;
+
+	INIT_FLIST_HEAD(&f->hash_list);
+
+	fio_sem_down(hash_lock);
+
+	alias = __lookup_file_hash(f->file_name);
+	if (!alias) {
+		fio_file_set_hashed(f);
+		flist_add_tail(&f->hash_list, &file_hash[hash(f->file_name)]);
+	}
+
+	fio_sem_up(hash_lock);
+	return alias;
+}
+
+bool file_bloom_exists(const char *fname, bool set)
+{
+	return bloom_string(file_bloom, fname, strlen(fname), set);
+}
+
+void file_hash_exit(void)
+{
+	unsigned int i, has_entries = 0;
+
+	fio_sem_down(hash_lock);
+	for (i = 0; i < HASH_BUCKETS; i++)
+		has_entries += !flist_empty(&file_hash[i]);
+	fio_sem_up(hash_lock);
+
+	if (has_entries)
+		log_err("fio: file hash not empty on exit\n");
+
+	sfree(file_hash);
+	file_hash = NULL;
+	fio_sem_remove(hash_lock);
+	hash_lock = NULL;
+	bloom_free(file_bloom);
+	file_bloom = NULL;
+}
+
+void file_hash_init(void)
+{
+	unsigned int i;
+
+	file_hash = smalloc(file_hash_size);
+
+	for (i = 0; i < HASH_BUCKETS; i++)
+		INIT_FLIST_HEAD(&file_hash[i]);
+
+	hash_lock = fio_sem_init(FIO_SEM_UNLOCKED);
+	file_bloom = bloom_new(BLOOM_SIZE);
+}
diff --git a/filehash.h b/filehash.h
new file mode 100644
index 0000000..5fecc3b
--- /dev/null
+++ b/filehash.h
@@ -0,0 +1,15 @@
+#ifndef FIO_FILE_HASH_H
+#define FIO_FILE_HASH_H
+
+#include "lib/types.h"
+
+extern void file_hash_init(void);
+extern void file_hash_exit(void);
+extern struct fio_file *lookup_file_hash(const char *);
+extern struct fio_file *add_file_hash(struct fio_file *);
+extern void remove_file_hash(struct fio_file *);
+extern void fio_file_hash_lock(void);
+extern void fio_file_hash_unlock(void);
+extern bool file_bloom_exists(const char *, bool);
+
+#endif
diff --git a/filelock.c b/filelock.c
new file mode 100644
index 0000000..7e92f63
--- /dev/null
+++ b/filelock.c
@@ -0,0 +1,246 @@
+/*
+ * Really simple exclusive file locking based on filename.
+ * No hash indexing, just a list, so only works well for < 100 files or
+ * so. But that's more than what fio needs, so should be fine.
+ */
+#include <string.h>
+#include <unistd.h>
+#include <assert.h>
+#include <inttypes.h>
+
+#include "flist.h"
+#include "filelock.h"
+#include "smalloc.h"
+#include "fio_sem.h"
+#include "hash.h"
+#include "log.h"
+
+struct fio_filelock {
+	uint32_t hash;
+	struct fio_sem lock;
+	struct flist_head list;
+	unsigned int references;
+};
+
+#define MAX_FILELOCKS	1024
+
+static struct filelock_data {
+	struct flist_head list;
+	struct fio_sem lock;
+
+	struct flist_head free_list;
+	struct fio_filelock ffs[MAX_FILELOCKS];
+} *fld;
+
+static void put_filelock(struct fio_filelock *ff)
+{
+	flist_add(&ff->list, &fld->free_list);
+}
+
+static struct fio_filelock *__get_filelock(void)
+{
+	struct fio_filelock *ff;
+
+	if (flist_empty(&fld->free_list))
+		return NULL;
+
+	ff = flist_first_entry(&fld->free_list, struct fio_filelock, list);
+	flist_del_init(&ff->list);
+	return ff;
+}
+
+static struct fio_filelock *get_filelock(int trylock, int *retry)
+{
+	struct fio_filelock *ff;
+
+	do {
+		ff = __get_filelock();
+		if (ff || trylock)
+			break;
+
+		fio_sem_up(&fld->lock);
+		usleep(1000);
+		fio_sem_down(&fld->lock);
+		*retry = 1;
+	} while (1);
+
+	return ff;
+}
+
+int fio_filelock_init(void)
+{
+	int i;
+
+	fld = smalloc(sizeof(*fld));
+	if (!fld)
+		return 1;
+
+	INIT_FLIST_HEAD(&fld->list);
+	INIT_FLIST_HEAD(&fld->free_list);
+
+	if (__fio_sem_init(&fld->lock, FIO_SEM_UNLOCKED))
+		goto err;
+
+	for (i = 0; i < MAX_FILELOCKS; i++) {
+		struct fio_filelock *ff = &fld->ffs[i];
+
+		if (__fio_sem_init(&ff->lock, FIO_SEM_UNLOCKED))
+			goto err;
+		flist_add_tail(&ff->list, &fld->free_list);
+	}
+
+	return 0;
+err:
+	fio_filelock_exit();
+	return 1;
+}
+
+void fio_filelock_exit(void)
+{
+	if (!fld)
+		return;
+
+	assert(flist_empty(&fld->list));
+	__fio_sem_remove(&fld->lock);
+
+	while (!flist_empty(&fld->free_list)) {
+		struct fio_filelock *ff;
+
+		ff = flist_first_entry(&fld->free_list, struct fio_filelock, list);
+
+		flist_del_init(&ff->list);
+		__fio_sem_remove(&ff->lock);
+	}
+
+	sfree(fld);
+	fld = NULL;
+}
+
+static struct fio_filelock *fio_hash_find(uint32_t hash)
+{
+	struct flist_head *entry;
+	struct fio_filelock *ff;
+
+	flist_for_each(entry, &fld->list) {
+		ff = flist_entry(entry, struct fio_filelock, list);
+		if (ff->hash == hash)
+			return ff;
+	}
+
+	return NULL;
+}
+
+static struct fio_filelock *fio_hash_get(uint32_t hash, int trylock)
+{
+	struct fio_filelock *ff;
+
+	ff = fio_hash_find(hash);
+	if (!ff) {
+		int retry = 0;
+
+		ff = get_filelock(trylock, &retry);
+		if (!ff)
+			return NULL;
+
+		/*
+		 * If we dropped the main lock, re-lookup the hash in case
+		 * someone else added it meanwhile. If it's now there,
+		 * just return that.
+		 */
+		if (retry) {
+			struct fio_filelock *__ff;
+
+			__ff = fio_hash_find(hash);
+			if (__ff) {
+				put_filelock(ff);
+				return __ff;
+			}
+		}
+
+		ff->hash = hash;
+		ff->references = 0;
+		flist_add(&ff->list, &fld->list);
+	}
+
+	return ff;
+}
+
+static bool __fio_lock_file(const char *fname, int trylock)
+{
+	struct fio_filelock *ff;
+	uint32_t hash;
+
+	hash = jhash(fname, strlen(fname), 0);
+
+	fio_sem_down(&fld->lock);
+	ff = fio_hash_get(hash, trylock);
+	if (ff)
+		ff->references++;
+	fio_sem_up(&fld->lock);
+
+	if (!ff) {
+		assert(!trylock);
+		return true;
+	}
+
+	if (!trylock) {
+		fio_sem_down(&ff->lock);
+		return false;
+	}
+
+	if (!fio_sem_down_trylock(&ff->lock))
+		return false;
+
+	fio_sem_down(&fld->lock);
+
+	/*
+	 * If we raced and the only reference to the lock is us, we can
+	 * grab it
+	 */
+	if (ff->references != 1) {
+		ff->references--;
+		ff = NULL;
+	}
+
+	fio_sem_up(&fld->lock);
+
+	if (ff) {
+		fio_sem_down(&ff->lock);
+		return false;
+	}
+
+	return true;
+}
+
+bool fio_trylock_file(const char *fname)
+{
+	return __fio_lock_file(fname, 1);
+}
+
+void fio_lock_file(const char *fname)
+{
+	__fio_lock_file(fname, 0);
+}
+
+void fio_unlock_file(const char *fname)
+{
+	struct fio_filelock *ff;
+	uint32_t hash;
+
+	hash = jhash(fname, strlen(fname), 0);
+
+	fio_sem_down(&fld->lock);
+
+	ff = fio_hash_find(hash);
+	if (ff) {
+		int refs = --ff->references;
+		fio_sem_up(&ff->lock);
+		if (!refs) {
+			flist_del_init(&ff->list);
+			put_filelock(ff);
+		}
+	} else
+		log_err("fio: file not found for unlocking\n");
+
+	fio_sem_up(&fld->lock);
+}
diff --git a/filelock.h b/filelock.h
new file mode 100644
index 0000000..4551bb0
--- /dev/null
+++ b/filelock.h
@@ -0,0 +1,13 @@
+#ifndef FIO_LOCK_FILE_H
+#define FIO_LOCK_FILE_H
+
+#include "lib/types.h"
+
+extern void fio_lock_file(const char *);
+extern bool fio_trylock_file(const char *);
+extern void fio_unlock_file(const char *);
+
+extern int fio_filelock_init(void);
+extern void fio_filelock_exit(void);
+
+#endif
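One subtlety in filelock.c above: __fio_lock_file() returns false once the semaphore is held (the blocking path always ends in fio_sem_down() followed by return false), so success from fio_trylock_file() is the negated return value. A sketch of a caller serializing on a shared scratch file ("state.tmp" and update_scratch_file() are illustrative only):

	#include "filelock.h"

	static void update_scratch_file(void)
	{
		/* false means the lock was acquired without blocking */
		if (!fio_trylock_file("state.tmp")) {
			/* ... exclusive access to state.tmp ... */
			fio_unlock_file("state.tmp");
		}
	}

Because the lock is keyed on the jhash of the name, two spellings of the same path ("./state.tmp" vs "state.tmp") take different locks.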
diff --git a/filesetup.c b/filesetup.c
new file mode 100644
index 0000000..8a4091f
--- /dev/null
+++ b/filesetup.c
@@ -0,0 +1,2006 @@
+#include <unistd.h>
+#include <fcntl.h>
+#include <string.h>
+#include <assert.h>
+#include <dirent.h>
+#include <libgen.h>
+#include <sys/stat.h>
+
+#include "fio.h"
+#include "smalloc.h"
+#include "filehash.h"
+#include "options.h"
+#include "os/os.h"
+#include "hash.h"
+#include "lib/axmap.h"
+#include "rwlock.h"
+#include "zbd.h"
+
+#ifdef CONFIG_LINUX_FALLOCATE
+#include <linux/falloc.h>
+#endif
+
+static FLIST_HEAD(filename_list);
+
+/*
+ * List entry for filename_list
+ */
+struct file_name {
+	struct flist_head list;
+	char *filename;
+};
+
+static inline void clear_error(struct thread_data *td)
+{
+	td->error = 0;
+	td->verror[0] = '\0';
+}
+
+static int native_fallocate(struct thread_data *td, struct fio_file *f)
+{
+	bool success;
+
+	success = fio_fallocate(f, 0, f->real_file_size);
+	dprint(FD_FILE, "native fallocate of file %s size %llu was "
+			"%ssuccessful\n", f->file_name,
+			(unsigned long long) f->real_file_size,
+			!success ? 
"un": ""); + + if (success) + return false; + + if (errno == ENOSYS) + dprint(FD_FILE, "native fallocate is not implemented\n"); + + return true; +} + +static void fallocate_file(struct thread_data *td, struct fio_file *f) +{ + if (td->o.fill_device) + return; + + switch (td->o.fallocate_mode) { + case FIO_FALLOCATE_NATIVE: + native_fallocate(td, f); + break; + case FIO_FALLOCATE_NONE: + break; +#ifdef CONFIG_POSIX_FALLOCATE + case FIO_FALLOCATE_POSIX: { + int r; + + dprint(FD_FILE, "posix_fallocate file %s size %llu\n", + f->file_name, + (unsigned long long) f->real_file_size); + + r = posix_fallocate(f->fd, 0, f->real_file_size); + if (r > 0) + log_err("fio: posix_fallocate fails: %s\n", strerror(r)); + break; + } +#endif /* CONFIG_POSIX_FALLOCATE */ +#ifdef CONFIG_LINUX_FALLOCATE + case FIO_FALLOCATE_KEEP_SIZE: { + int r; + + dprint(FD_FILE, "fallocate(FALLOC_FL_KEEP_SIZE) " + "file %s size %llu\n", f->file_name, + (unsigned long long) f->real_file_size); + + r = fallocate(f->fd, FALLOC_FL_KEEP_SIZE, 0, f->real_file_size); + if (r != 0) + td_verror(td, errno, "fallocate"); + + break; + } +#endif /* CONFIG_LINUX_FALLOCATE */ + case FIO_FALLOCATE_TRUNCATE: { + int r; + + dprint(FD_FILE, "ftruncate file %s size %llu\n", + f->file_name, + (unsigned long long) f->real_file_size); + r = ftruncate(f->fd, f->real_file_size); + if (r != 0) + td_verror(td, errno, "ftruncate"); + + break; + } + default: + log_err("fio: unknown fallocate mode: %d\n", td->o.fallocate_mode); + assert(0); + } +} + +/* + * Leaves f->fd open on success, caller must close + */ +static int extend_file(struct thread_data *td, struct fio_file *f) +{ + int new_layout = 0, unlink_file = 0, flags; + unsigned long long left; + unsigned long long bs; + char *b = NULL; + + if (read_only) { + log_err("fio: refusing extend of file due to read-only\n"); + return 0; + } + + /* + * check if we need to lay the file out complete again. fio + * does that for operations involving reads, or for writes + * where overwrite is set + */ + if (td_read(td) || + (td_write(td) && td->o.overwrite && !td->o.file_append) || + (td_write(td) && td_ioengine_flagged(td, FIO_NOEXTEND))) + new_layout = 1; + if (td_write(td) && !td->o.overwrite && !td->o.file_append) + unlink_file = 1; + + if (unlink_file || new_layout) { + int ret; + + dprint(FD_FILE, "layout unlink %s\n", f->file_name); + + ret = td_io_unlink_file(td, f); + if (ret != 0 && ret != ENOENT) { + td_verror(td, errno, "unlink"); + return 1; + } + } + + flags = O_WRONLY; + if (td->o.allow_create) + flags |= O_CREAT; + if (new_layout) + flags |= O_TRUNC; + +#ifdef WIN32 + flags |= _O_BINARY; +#endif + + dprint(FD_FILE, "open file %s, flags %x\n", f->file_name, flags); + f->fd = open(f->file_name, flags, 0644); + if (f->fd < 0) { + int err = errno; + + if (err == ENOENT && !td->o.allow_create) + log_err("fio: file creation disallowed by " + "allow_file_create=0\n"); + else + td_verror(td, err, "open"); + return 1; + } + + fallocate_file(td, f); + + /* + * If our jobs don't require regular files initially, we're done. 
+ */ + if (!new_layout) + goto done; + + /* + * The size will be -1ULL when fill_device is used, so don't truncate + * or fallocate this file, just write it + */ + if (!td->o.fill_device) { + dprint(FD_FILE, "truncate file %s, size %llu\n", f->file_name, + (unsigned long long) f->real_file_size); + if (ftruncate(f->fd, f->real_file_size) == -1) { + if (errno != EFBIG) { + td_verror(td, errno, "ftruncate"); + goto err; + } + } + } + + left = f->real_file_size; + bs = td->o.max_bs[DDIR_WRITE]; + if (bs > left) + bs = left; + + b = malloc(bs); + if (!b) { + td_verror(td, errno, "malloc"); + goto err; + } + + while (left && !td->terminate) { + ssize_t r; + + if (bs > left) + bs = left; + + fill_io_buffer(td, b, bs, bs); + + r = write(f->fd, b, bs); + + if (r > 0) { + left -= r; + continue; + } else { + if (r < 0) { + int __e = errno; + + if (__e == ENOSPC) { + if (td->o.fill_device) + break; + log_info("fio: ENOSPC on laying out " + "file, stopping\n"); + break; + } + td_verror(td, errno, "write"); + } else + td_verror(td, EIO, "write"); + + break; + } + } + + if (td->terminate) { + dprint(FD_FILE, "terminate unlink %s\n", f->file_name); + td_io_unlink_file(td, f); + } else if (td->o.create_fsync) { + if (fsync(f->fd) < 0) { + td_verror(td, errno, "fsync"); + goto err; + } + } + if (td->o.fill_device && !td_write(td)) { + fio_file_clear_size_known(f); + if (td_io_get_file_size(td, f)) + goto err; + if (f->io_size > f->real_file_size) + f->io_size = f->real_file_size; + } + + free(b); +done: + return 0; +err: + close(f->fd); + f->fd = -1; + if (b) + free(b); + return 1; +} + +static bool pre_read_file(struct thread_data *td, struct fio_file *f) +{ + int r, did_open = 0, old_runstate; + unsigned long long left; + unsigned long long bs; + bool ret = true; + char *b; + + if (td_ioengine_flagged(td, FIO_PIPEIO) || + td_ioengine_flagged(td, FIO_NOIO)) + return true; + + if (f->filetype == FIO_TYPE_CHAR) + return true; + + if (!fio_file_open(f)) { + if (td->io_ops->open_file(td, f)) { + log_err("fio: cannot pre-read, failed to open file\n"); + return false; + } + did_open = 1; + } + + old_runstate = td_bump_runstate(td, TD_PRE_READING); + + left = f->io_size; + bs = td->o.max_bs[DDIR_READ]; + if (bs > left) + bs = left; + + b = malloc(bs); + if (!b) { + td_verror(td, errno, "malloc"); + ret = false; + goto error; + } + memset(b, 0, bs); + + if (lseek(f->fd, f->file_offset, SEEK_SET) < 0) { + td_verror(td, errno, "lseek"); + log_err("fio: failed to lseek pre-read file\n"); + ret = false; + goto error; + } + + while (left && !td->terminate) { + if (bs > left) + bs = left; + + r = read(f->fd, b, bs); + + if (r == (int) bs) { + left -= bs; + continue; + } else { + td_verror(td, EIO, "pre_read"); + break; + } + } + +error: + td_restore_runstate(td, old_runstate); + + if (did_open) + td->io_ops->close_file(td, f); + + free(b); + return ret; +} + +unsigned long long get_rand_file_size(struct thread_data *td) +{ + unsigned long long ret, sized; + uint64_t frand_max; + uint64_t r; + + frand_max = rand_max(&td->file_size_state); + r = __rand(&td->file_size_state); + sized = td->o.file_size_high - td->o.file_size_low; + ret = (unsigned long long) ((double) sized * (r / (frand_max + 1.0))); + ret += td->o.file_size_low; + ret -= (ret % td->o.rw_min_bs); + return ret; +} + +static int file_size(struct thread_data *td, struct fio_file *f) +{ + struct stat st; + + if (stat(f->file_name, &st) == -1) { + td_verror(td, errno, "fstat"); + return 1; + } + + f->real_file_size = st.st_size; + return 0; +} + +static int 
bdev_size(struct thread_data *td, struct fio_file *f) +{ + unsigned long long bytes = 0; + int r; + + if (td->io_ops->open_file(td, f)) { + log_err("fio: failed opening blockdev %s for size check\n", + f->file_name); + return 1; + } + + r = blockdev_size(f, &bytes); + if (r) { + td_verror(td, r, "blockdev_size"); + goto err; + } + + if (!bytes) { + log_err("%s: zero sized block device?\n", f->file_name); + goto err; + } + + f->real_file_size = bytes; + td->io_ops->close_file(td, f); + return 0; +err: + td->io_ops->close_file(td, f); + return 1; +} + +static int char_size(struct thread_data *td, struct fio_file *f) +{ +#ifdef FIO_HAVE_CHARDEV_SIZE + unsigned long long bytes = 0; + int r; + + if (td->io_ops->open_file(td, f)) { + log_err("fio: failed opening chardev %s for size check\n", + f->file_name); + return 1; + } + + r = chardev_size(f, &bytes); + if (r) { + td_verror(td, r, "chardev_size"); + goto err; + } + + if (!bytes) { + log_err("%s: zero sized char device?\n", f->file_name); + goto err; + } + + f->real_file_size = bytes; + td->io_ops->close_file(td, f); + return 0; +err: + td->io_ops->close_file(td, f); + return 1; +#else + f->real_file_size = -1ULL; + return 0; +#endif +} + +static int get_file_size(struct thread_data *td, struct fio_file *f) +{ + int ret = 0; + + if (fio_file_size_known(f)) + return 0; + + if (f->filetype == FIO_TYPE_FILE) + ret = file_size(td, f); + else if (f->filetype == FIO_TYPE_BLOCK) + ret = bdev_size(td, f); + else if (f->filetype == FIO_TYPE_CHAR) + ret = char_size(td, f); + else { + f->real_file_size = -1; + log_info("%s: failed to get file size of %s\n", td->o.name, + f->file_name); + return 1; /* avoid offset extends end error message */ + } + + /* + * Leave ->real_file_size with 0 since it could be expectation + * of initial setup for regular files. + */ + if (ret) + return ret; + + /* + * ->file_offset normally hasn't been initialized yet, so this + * is basically always false unless ->real_file_size is -1, but + * if ->real_file_size is -1 this message doesn't make sense. + * As a result, this message is basically useless. + */ + if (f->file_offset > f->real_file_size) { + log_err("%s: offset extends end (%llu > %llu)\n", td->o.name, + (unsigned long long) f->file_offset, + (unsigned long long) f->real_file_size); + return 1; + } + + fio_file_set_size_known(f); + return 0; +} + +static int __file_invalidate_cache(struct thread_data *td, struct fio_file *f, + unsigned long long off, + unsigned long long len) +{ + int errval = 0, ret = 0; + +#ifdef CONFIG_ESX + return 0; +#endif + + if (len == -1ULL) + len = f->io_size; + if (off == -1ULL) + off = f->file_offset; + + if (len == -1ULL || off == -1ULL) + return 0; + + if (td->io_ops->invalidate) { + dprint(FD_IO, "invalidate %s cache %s\n", td->io_ops->name, + f->file_name); + ret = td->io_ops->invalidate(td, f); + if (ret < 0) + errval = -ret; + } else if (td_ioengine_flagged(td, FIO_DISKLESSIO)) { + dprint(FD_IO, "invalidate not supported by ioengine %s\n", + td->io_ops->name); + } else if (f->filetype == FIO_TYPE_FILE) { + dprint(FD_IO, "declare unneeded cache %s: %llu/%llu\n", + f->file_name, off, len); + ret = posix_fadvise(f->fd, off, len, POSIX_FADV_DONTNEED); + if (ret) + errval = ret; + } else if (f->filetype == FIO_TYPE_BLOCK) { + int retry_count = 0; + + dprint(FD_IO, "drop page cache %s\n", f->file_name); + ret = blockdev_invalidate_cache(f); + while (ret < 0 && errno == EAGAIN && retry_count++ < 25) { + /* + * Linux multipath devices reject ioctl while + * the maps are being updated. 
That window can + * last tens of milliseconds; we'll try up to + * a quarter of a second. + */ + usleep(10000); + ret = blockdev_invalidate_cache(f); + } + if (ret < 0 && errno == EACCES && geteuid()) { + if (!fio_did_warn(FIO_WARN_ROOT_FLUSH)) { + log_err("fio: only root may flush block " + "devices. Cache flush bypassed!\n"); + } + } + if (ret < 0) + errval = errno; + } else if (f->filetype == FIO_TYPE_CHAR || + f->filetype == FIO_TYPE_PIPE) { + dprint(FD_IO, "invalidate not supported %s\n", f->file_name); + } + + /* + * Cache flushing isn't a fatal condition, and we know it will + * happen on some platforms where we don't have the proper + * function to flush eg block device caches. So just warn and + * continue on our way. + */ + if (errval) + log_info("fio: cache invalidation of %s failed: %s\n", + f->file_name, strerror(errval)); + + return 0; + +} + +int file_invalidate_cache(struct thread_data *td, struct fio_file *f) +{ + if (!fio_file_open(f)) + return 0; + + return __file_invalidate_cache(td, f, -1ULL, -1ULL); +} + +int generic_close_file(struct thread_data fio_unused *td, struct fio_file *f) +{ + int ret = 0; + + dprint(FD_FILE, "fd close %s\n", f->file_name); + + remove_file_hash(f); + + if (close(f->fd) < 0) + ret = errno; + + f->fd = -1; + + if (f->shadow_fd != -1) { + close(f->shadow_fd); + f->shadow_fd = -1; + } + + f->engine_pos = 0; + return ret; +} + +int file_lookup_open(struct fio_file *f, int flags) +{ + struct fio_file *__f; + int from_hash; + + __f = lookup_file_hash(f->file_name); + if (__f) { + dprint(FD_FILE, "found file in hash %s\n", f->file_name); + f->lock = __f->lock; + from_hash = 1; + } else { + dprint(FD_FILE, "file not found in hash %s\n", f->file_name); + from_hash = 0; + } + +#ifdef WIN32 + flags |= _O_BINARY; +#endif + + f->fd = open(f->file_name, flags, 0600); + return from_hash; +} + +static int file_close_shadow_fds(struct thread_data *td) +{ + struct fio_file *f; + int num_closed = 0; + unsigned int i; + + for_each_file(td, f, i) { + if (f->shadow_fd == -1) + continue; + + close(f->shadow_fd); + f->shadow_fd = -1; + num_closed++; + } + + return num_closed; +} + +int generic_open_file(struct thread_data *td, struct fio_file *f) +{ + int is_std = 0; + int flags = 0; + int from_hash = 0; + + dprint(FD_FILE, "fd open %s\n", f->file_name); + + if (!strcmp(f->file_name, "-")) { + if (td_rw(td)) { + log_err("fio: can't read/write to stdin/out\n"); + return 1; + } + is_std = 1; + + /* + * move output logging to stderr, if we are writing to stdout + */ + if (td_write(td)) + f_out = stderr; + } + + if (td_trim(td)) + goto skip_flags; + if (td->o.odirect) + flags |= OS_O_DIRECT; + if (td->o.oatomic) { + if (!FIO_O_ATOMIC) { + td_verror(td, EINVAL, "OS does not support atomic IO"); + return 1; + } + flags |= OS_O_DIRECT | FIO_O_ATOMIC; + } + if (td->o.sync_io) + flags |= O_SYNC; + if (td->o.create_on_open && td->o.allow_create) + flags |= O_CREAT; +skip_flags: + if (f->filetype != FIO_TYPE_FILE) + flags |= FIO_O_NOATIME; + +open_again: + if (td_write(td)) { + if (!read_only) + flags |= O_RDWR; + + if (f->filetype == FIO_TYPE_FILE && td->o.allow_create) + flags |= O_CREAT; + + if (is_std) + f->fd = dup(STDOUT_FILENO); + else + from_hash = file_lookup_open(f, flags); + } else if (td_read(td)) { + if (f->filetype == FIO_TYPE_CHAR && !read_only) + flags |= O_RDWR; + else + flags |= O_RDONLY; + + if (is_std) + f->fd = dup(STDIN_FILENO); + else + from_hash = file_lookup_open(f, flags); + } else if (td_trim(td)) { + assert(!td_rw(td)); /* should have matched above 
*/ + if (!read_only) + flags |= O_RDWR; + from_hash = file_lookup_open(f, flags); + } + + if (f->fd == -1) { + char buf[FIO_VERROR_SIZE]; + int __e = errno; + + if (__e == EPERM && (flags & FIO_O_NOATIME)) { + flags &= ~FIO_O_NOATIME; + goto open_again; + } + if (__e == EMFILE && file_close_shadow_fds(td)) + goto open_again; + + snprintf(buf, sizeof(buf), "open(%s)", f->file_name); + + if (__e == EINVAL && (flags & OS_O_DIRECT)) { + log_err("fio: looks like your file system does not " \ + "support direct=1/buffered=0\n"); + } + + td_verror(td, __e, buf); + return 1; + } + + if (!from_hash && f->fd != -1) { + if (add_file_hash(f)) { + int fio_unused ret; + + /* + * Stash away descriptor for later close. This is to + * work-around a "feature" on Linux, where a close of + * an fd that has been opened for write will trigger + * udev to call blkid to check partitions, fs id, etc. + * That pollutes the device cache, which can slow down + * unbuffered accesses. + */ + if (f->shadow_fd == -1) + f->shadow_fd = f->fd; + else { + /* + * OK to ignore, we haven't done anything + * with it + */ + ret = generic_close_file(td, f); + } + goto open_again; + } + } + + return 0; +} + +/* + * This function i.e. get_file_size() is the default .get_file_size + * implementation of majority of I/O engines. + */ +int generic_get_file_size(struct thread_data *td, struct fio_file *f) +{ + return get_file_size(td, f); +} + +/* + * open/close all files, so that ->real_file_size gets set + */ +static int get_file_sizes(struct thread_data *td) +{ + struct fio_file *f; + unsigned int i; + int err = 0; + + for_each_file(td, f, i) { + dprint(FD_FILE, "get file size for %p/%d/%s\n", f, i, + f->file_name); + + if (td_io_get_file_size(td, f)) { + if (td->error != ENOENT) { + log_err("%s\n", td->verror); + err = 1; + break; + } + clear_error(td); + } + + /* + * There are corner cases where we end up with -1 for + * ->real_file_size due to unsupported file type, etc. + * We then just set to size option value divided by number + * of files, similar to the way file ->io_size is set. + * stat(2) failure doesn't set ->real_file_size to -1. + */ + if (f->real_file_size == -1ULL && td->o.size) + f->real_file_size = td->o.size / td->o.nr_files; + } + + return err; +} + +struct fio_mount { + struct flist_head list; + const char *base; + char __base[256]; + unsigned int key; +}; + +/* + * Get free number of bytes for each file on each unique mount. 
+ */ +static unsigned long long get_fs_free_counts(struct thread_data *td) +{ + struct flist_head *n, *tmp; + unsigned long long ret = 0; + struct fio_mount *fm; + FLIST_HEAD(list); + struct fio_file *f; + unsigned int i; + + for_each_file(td, f, i) { + struct stat sb; + char buf[256]; + + if (f->filetype == FIO_TYPE_BLOCK || f->filetype == FIO_TYPE_CHAR) { + if (f->real_file_size != -1ULL) + ret += f->real_file_size; + continue; + } else if (f->filetype != FIO_TYPE_FILE) + continue; + + snprintf(buf, ARRAY_SIZE(buf), "%s", f->file_name); + + if (stat(buf, &sb) < 0) { + if (errno != ENOENT) + break; + strcpy(buf, "."); + if (stat(buf, &sb) < 0) + break; + } + + fm = NULL; + flist_for_each(n, &list) { + fm = flist_entry(n, struct fio_mount, list); + if (fm->key == sb.st_dev) + break; + + fm = NULL; + } + + if (fm) + continue; + + fm = calloc(1, sizeof(*fm)); + snprintf(fm->__base, ARRAY_SIZE(fm->__base), "%s", buf); + fm->base = basename(fm->__base); + fm->key = sb.st_dev; + flist_add(&fm->list, &list); + } + + flist_for_each_safe(n, tmp, &list) { + unsigned long long sz; + + fm = flist_entry(n, struct fio_mount, list); + flist_del(&fm->list); + + sz = get_fs_free_size(fm->base); + if (sz && sz != -1ULL) + ret += sz; + + free(fm); + } + + return ret; +} + +uint64_t get_start_offset(struct thread_data *td, struct fio_file *f) +{ + bool align = false; + struct thread_options *o = &td->o; + unsigned long long align_bs; + unsigned long long offset; + unsigned long long increment; + + if (o->file_append && f->filetype == FIO_TYPE_FILE) + return f->real_file_size; + + if (o->offset_increment_percent) { + assert(!o->offset_increment); + increment = o->offset_increment_percent * f->real_file_size / 100; + align = true; + } else + increment = o->offset_increment; + + if (o->start_offset_percent > 0) { + /* calculate the raw offset */ + offset = (f->real_file_size * o->start_offset_percent / 100) + + (td->subjob_number * increment); + + align = true; + } else { + /* start_offset_percent not set */ + offset = o->start_offset + + td->subjob_number * increment; + } + + if (align) { + /* + * if offset_align is provided, use it + */ + if (fio_option_is_set(o, start_offset_align)) { + align_bs = o->start_offset_align; + } else { + /* else take the minimum block size */ + align_bs = td_min_bs(td); + } + + /* + * block align the offset at the next available boundary at + * ceiling(offset / align_bs) * align_bs + */ + offset = (offset / align_bs + (offset % align_bs != 0)) * align_bs; + } + + return offset; +} + +/* + * Find longest path component that exists and return its length + */ +int longest_existing_path(char *path) { + char buf[PATH_MAX]; + bool done; + char *buf_pos; + int offset; +#ifdef WIN32 + DWORD dwAttr; +#else + struct stat sb; +#endif + + sprintf(buf, "%s", path); + done = false; + while (!done) { + buf_pos = strrchr(buf, FIO_OS_PATH_SEPARATOR); + if (!buf_pos) { + done = true; + offset = 0; + break; + } + + *(buf_pos + 1) = '\0'; + +#ifdef WIN32 + dwAttr = GetFileAttributesA(buf); + if (dwAttr != INVALID_FILE_ATTRIBUTES) { + done = true; + } +#else + if (stat(buf, &sb) == 0) + done = true; +#endif + if (done) + offset = buf_pos - buf; + else + *buf_pos = '\0'; + } + + return offset; +} + +static bool create_work_dirs(struct thread_data *td, const char *fname) +{ + char path[PATH_MAX]; + char *start, *end; + int offset; + + snprintf(path, PATH_MAX, "%s", fname); + start = path; + + offset = longest_existing_path(path); + end = start + offset; + while ((end = strchr(end, 
FIO_OS_PATH_SEPARATOR)) != NULL) { + if (end == start) { + end++; + continue; + } + *end = '\0'; + errno = 0; + if (fio_mkdir(path, 0700) && errno != EEXIST) { + log_err("fio: failed to create dir (%s): %s\n", + start, strerror(errno)); + return false; + } + *end = FIO_OS_PATH_SEPARATOR; + end++; + } + td->flags |= TD_F_DIRS_CREATED; + return true; +} + +/* + * Open the files and setup files sizes, creating files if necessary. + */ +int setup_files(struct thread_data *td) +{ + unsigned long long total_size, extend_size; + struct thread_options *o = &td->o; + struct fio_file *f; + unsigned int i, nr_fs_extra = 0; + int err = 0, need_extend; + int old_state; + const unsigned long long bs = td_min_bs(td); + uint64_t fs = 0; + + dprint(FD_FILE, "setup files\n"); + + old_state = td_bump_runstate(td, TD_SETTING_UP); + + for_each_file(td, f, i) { + if (!td_ioengine_flagged(td, FIO_DISKLESSIO) && + strchr(f->file_name, FIO_OS_PATH_SEPARATOR) && + !(td->flags & TD_F_DIRS_CREATED) && + !create_work_dirs(td, f->file_name)) + goto err_out; + } + + /* + * Find out physical size of files or devices for this thread, + * before we determine I/O size and range of our targets. + * If ioengine defines a setup() method, it's responsible for + * opening the files and setting f->real_file_size to indicate + * the valid range for that file. + */ + if (td->io_ops->setup) + err = td->io_ops->setup(td); + else + err = get_file_sizes(td); + + if (err) + goto err_out; + + if (o->read_iolog_file) + goto done; + + /* + * check sizes. if the files/devices do not exist and the size + * isn't passed to fio, abort. + */ + total_size = 0; + for_each_file(td, f, i) { + f->fileno = i; + if (f->real_file_size == -1ULL) + total_size = -1ULL; + else + total_size += f->real_file_size; + } + + if (o->fill_device) + td->fill_device_size = get_fs_free_counts(td); + + /* + * device/file sizes are zero and no size given, punt + */ + if ((!total_size || total_size == -1ULL) && !o->size && + !td_ioengine_flagged(td, FIO_NOIO) && !o->fill_device && + !(o->nr_files && (o->file_size_low || o->file_size_high))) { + log_err("%s: you need to specify size=\n", o->name); + td_verror(td, EINVAL, "total_file_size"); + goto err_out; + } + + /* + * Calculate per-file size and potential extra size for the + * first files, if needed (i.e. if we don't have a fixed size). + */ + if (!o->file_size_low && o->nr_files) { + uint64_t all_fs; + + fs = o->size / o->nr_files; + all_fs = fs * o->nr_files; + + if (all_fs < o->size) + nr_fs_extra = (o->size - all_fs) / bs; + } + + /* + * now file sizes are known, so we can set ->io_size. if size= is + * not given, ->io_size is just equal to ->real_file_size. if size + * is given, ->io_size is size / nr_files. + */ + extend_size = total_size = 0; + need_extend = 0; + for_each_file(td, f, i) { + f->file_offset = get_start_offset(td, f); + + /* + * Update ->io_size depending on options specified. + * ->file_size_low being 0 means filesize option isn't set. + * Non zero ->file_size_low equals ->file_size_high means + * filesize option is set in a fixed size format. + * Non zero ->file_size_low not equals ->file_size_high means + * filesize option is set in a range format. + */ + if (!o->file_size_low) { + /* + * no file size or range given, file size is equal to + * total size divided by number of files. If the size + * doesn't divide nicely with the min blocksize, + * make the first files bigger. 
+			 */
+			f->io_size = fs;
+
+			if (nr_fs_extra) {
+				nr_fs_extra--;
+				f->io_size += bs;
+			}
+
+			/*
+			 * We normally don't come here for regular files, but
+			 * if the result is 0 for a regular file, set it to the
+			 * real file size. This could be size of the existing
+			 * one if it already exists, but otherwise will be set
+			 * to 0. A new file won't be created because
+			 * ->io_size + ->file_offset equals ->real_file_size.
+			 */
+			if (!f->io_size) {
+				if (f->file_offset > f->real_file_size)
+					goto err_offset;
+				f->io_size = f->real_file_size - f->file_offset;
+				if (!f->io_size)
+					log_info("fio: file %s may be ignored\n",
+						f->file_name);
+			}
+		} else if (f->real_file_size < o->file_size_low ||
+			   f->real_file_size > o->file_size_high) {
+			if (f->file_offset > o->file_size_low)
+				goto err_offset;
+			/*
+			 * file size given. if it's fixed, use that. if it's a
+			 * range, generate a random size in-between.
+			 */
+			if (o->file_size_low == o->file_size_high)
+				f->io_size = o->file_size_low - f->file_offset;
+			else {
+				f->io_size = get_rand_file_size(td)
+						- f->file_offset;
+			}
+		} else
+			f->io_size = f->real_file_size - f->file_offset;
+
+		if (f->io_size == -1ULL)
+			total_size = -1ULL;
+		else {
+			if (o->size_percent && o->size_percent != 100) {
+				uint64_t file_size;
+
+				file_size = f->io_size + f->file_offset;
+				f->io_size = (file_size *
+					      o->size_percent) / 100;
+				if (f->io_size > (file_size - f->file_offset))
+					f->io_size = file_size - f->file_offset;
+
+				f->io_size -= (f->io_size % td_min_bs(td));
+			}
+			total_size += f->io_size;
+		}
+
+		if (f->filetype == FIO_TYPE_FILE &&
+		    (f->io_size + f->file_offset) > f->real_file_size) {
+			if (!td_ioengine_flagged(td, FIO_DISKLESSIO) &&
+			    !o->create_on_open) {
+				need_extend++;
+				extend_size += (f->io_size + f->file_offset);
+				fio_file_set_extend(f);
+			} else if (!td_ioengine_flagged(td, FIO_DISKLESSIO) ||
+				   (td_ioengine_flagged(td, FIO_DISKLESSIO) &&
+				    td_ioengine_flagged(td, FIO_FAKEIO)))
+				f->real_file_size = f->io_size + f->file_offset;
+		}
+	}
+
+	if (td->o.block_error_hist) {
+		int len;
+
+		assert(td->o.nr_files == 1);	/* checked in fixup_options */
+		f = td->files[0];
+		len = f->io_size / td->o.bs[DDIR_TRIM];
+		if (len > MAX_NR_BLOCK_INFOS || len <= 0) {
+			log_err("fio: cannot calculate block histogram with "
+				"%d trim blocks, maximum %d\n",
+				len, MAX_NR_BLOCK_INFOS);
+			td_verror(td, EINVAL, "block_error_hist");
+			goto err_out;
+		}
+
+		td->ts.nr_block_infos = len;
+		for (i = 0; i < len; i++)
+			td->ts.block_infos[i] =
+				BLOCK_INFO(0, BLOCK_STATE_UNINIT);
+	} else
+		td->ts.nr_block_infos = 0;
+
+	if (!o->size || (total_size && o->size > total_size))
+		o->size = total_size;
+
+	if (o->size < td_min_bs(td)) {
+		log_err("fio: blocksize too large for data set\n");
+		goto err_out;
+	}
+
+	/*
+	 * See if we need to extend some files, typically needed when our
+	 * target regular files don't exist yet, but our jobs require them
+	 * initially due to read I/Os.
+	 */
+	if (need_extend) {
+		temp_stall_ts = 1;
+		if (output_format & FIO_OUTPUT_NORMAL) {
+			log_info("%s: Laying out IO file%s (%u file%s / %s%lluMiB)\n",
+				 o->name,
+				 need_extend > 1 ? "s" : "",
+				 need_extend,
+				 need_extend > 1 ? "s" : "",
+				 need_extend > 1 ? "total " : "",
+				 extend_size >> 20);
+		}
+
+		for_each_file(td, f, i) {
+			unsigned long long old_len = -1ULL, extend_len = -1ULL;
+
+			if (!fio_file_extend(f))
+				continue;
+
+			assert(f->filetype == FIO_TYPE_FILE);
+			fio_file_clear_extend(f);
+			if (!o->fill_device) {
+				old_len = f->real_file_size;
+				extend_len = f->io_size + f->file_offset -
+						old_len;
+			}
+			f->real_file_size = (f->io_size + f->file_offset);
+			err = extend_file(td, f);
+			if (err)
+				break;
+
+			err = __file_invalidate_cache(td, f, old_len,
+								extend_len);
+
+			/*
+			 * Shut up static checker
+			 */
+			if (f->fd != -1)
+				close(f->fd);
+
+			f->fd = -1;
+			if (err)
+				break;
+		}
+		temp_stall_ts = 0;
+	}
+
+	if (err)
+		goto err_out;
+
+	/*
+	 * iolog already set the total io size, if we read back
+	 * stored entries.
+	 */
+	if (!o->read_iolog_file) {
+		if (o->io_size)
+			td->total_io_size = o->io_size * o->loops;
+		else
+			td->total_io_size = o->size * o->loops;
+	}
+
+done:
+	if (o->create_only)
+		td->done = 1;
+
+	td_restore_runstate(td, old_state);
+
+	if (td->o.zone_mode == ZONE_MODE_ZBD) {
+		err = zbd_init(td);
+		if (err)
+			goto err_out;
+	}
+	return 0;
+
+err_offset:
+	log_err("%s: you need to specify valid offset=\n", o->name);
+err_out:
+	td_restore_runstate(td, old_state);
+	return 1;
+}
+
+bool pre_read_files(struct thread_data *td)
+{
+	struct fio_file *f;
+	unsigned int i;
+
+	dprint(FD_FILE, "pre_read files\n");
+
+	for_each_file(td, f, i) {
+		if (!pre_read_file(td, f))
+			return false;
+	}
+
+	return true;
+}
+
+static void __init_rand_distribution(struct thread_data *td, struct fio_file *f)
+{
+	unsigned int range_size, seed;
+	uint64_t nranges;
+	uint64_t fsize;
+
+	range_size = min(td->o.min_bs[DDIR_READ], td->o.min_bs[DDIR_WRITE]);
+	fsize = min(f->real_file_size, f->io_size);
+
+	nranges = (fsize + range_size - 1ULL) / range_size;
+
+	seed = jhash(f->file_name, strlen(f->file_name), 0) * td->thread_number;
+	if (!td->o.rand_repeatable)
+		seed = td->rand_seeds[4];
+
+	if (td->o.random_distribution == FIO_RAND_DIST_ZIPF)
+		zipf_init(&f->zipf, nranges, td->o.zipf_theta.u.f, seed);
+	else if (td->o.random_distribution == FIO_RAND_DIST_PARETO)
+		pareto_init(&f->zipf, nranges, td->o.pareto_h.u.f, seed);
+	else if (td->o.random_distribution == FIO_RAND_DIST_GAUSS)
+		gauss_init(&f->gauss, nranges, td->o.gauss_dev.u.f, seed);
+}
+
+static bool init_rand_distribution(struct thread_data *td)
+{
+	struct fio_file *f;
+	unsigned int i;
+	int state;
+
+	if (td->o.random_distribution == FIO_RAND_DIST_RANDOM ||
+	    td->o.random_distribution == FIO_RAND_DIST_ZONED ||
+	    td->o.random_distribution == FIO_RAND_DIST_ZONED_ABS)
+		return false;
+
+	state = td_bump_runstate(td, TD_SETTING_UP);
+
+	for_each_file(td, f, i)
+		__init_rand_distribution(td, f);
+
+	td_restore_runstate(td, state);
+	return true;
+}
+
+/*
+ * Check if the number of blocks exceeds the randomness capability of
+ * the selected generator. Tausworthe is 32-bit, the others are fully
+ * 64-bit capable. With a 4KiB minimum block size, for example, the
+ * 32-bit limit of 2^32 blocks is reached at 2^32 * 4KiB = 16TiB.
+ */
+static int check_rand_gen_limits(struct thread_data *td, struct fio_file *f,
+				 uint64_t blocks)
+{
+	if (blocks <= FRAND32_MAX)
+		return 0;
+	if (td->o.random_generator != FIO_RAND_GEN_TAUSWORTHE)
+		return 0;
+
+	/*
+	 * If the user hasn't specified a random generator, switch
+	 * to tausworthe64 with informational warning. If the user did
+	 * specify one, just warn.
+	 */
+	log_info("fio: file %s exceeds 32-bit tausworthe random generator.\n",
+		f->file_name);
+
+	if (!fio_option_is_set(&td->o, random_generator)) {
+		log_info("fio: Switching to tausworthe64. 
Use the "
+			"random_generator= option to get rid of this "
+			"warning.\n");
+		td->o.random_generator = FIO_RAND_GEN_TAUSWORTHE64;
+		return 0;
+	}
+
+	/*
+	 * Keep this message informational only, to avoid breaking scripts.
+	 */
+	log_info("fio: Use the random_generator= option to switch to lfsr or "
+		"tausworthe64.\n");
+	return 0;
+}
+
+bool init_random_map(struct thread_data *td)
+{
+	unsigned long long blocks;
+	struct fio_file *f;
+	unsigned int i;
+
+	if (init_rand_distribution(td))
+		return true;
+	if (!td_random(td))
+		return true;
+
+	for_each_file(td, f, i) {
+		uint64_t fsize = min(f->real_file_size, f->io_size);
+
+		if (td->o.zone_mode == ZONE_MODE_STRIDED)
+			fsize = td->o.zone_range;
+
+		blocks = fsize / (unsigned long long) td->o.rw_min_bs;
+
+		if (check_rand_gen_limits(td, f, blocks))
+			return false;
+
+		if (td->o.random_generator == FIO_RAND_GEN_LFSR) {
+			uint64_t seed;
+
+			seed = td->rand_seeds[FIO_RAND_BLOCK_OFF];
+
+			if (!lfsr_init(&f->lfsr, blocks, seed, 0)) {
+				fio_file_set_lfsr(f);
+				continue;
+			} else {
+				log_err("fio: failed initializing LFSR\n");
+				return false;
+			}
+		} else if (!td->o.norandommap) {
+			f->io_axmap = axmap_new(blocks);
+			if (f->io_axmap) {
+				fio_file_set_axmap(f);
+				continue;
+			}
+		} else if (td->o.norandommap)
+			continue;
+
+		if (!td->o.softrandommap) {
+			log_err("fio: failed allocating random map. If running"
+				" a large number of jobs, try the 'norandommap'"
+				" option or set 'softrandommap'. Or give"
+				" a larger --alloc-size to fio.\n");
+			return false;
+		}
+
+		log_info("fio: file %s failed allocating random map. Running "
+			 "job without.\n", f->file_name);
+	}
+
+	return true;
+}
+
+void close_files(struct thread_data *td)
+{
+	struct fio_file *f;
+	unsigned int i;
+
+	for_each_file(td, f, i) {
+		if (fio_file_open(f))
+			td_io_close_file(td, f);
+	}
+}
+
+void close_and_free_files(struct thread_data *td)
+{
+	struct fio_file *f;
+	unsigned int i;
+	bool use_free = td_ioengine_flagged(td, FIO_NOFILEHASH);
+
+	dprint(FD_FILE, "close files\n");
+
+	for_each_file(td, f, i) {
+		if (td->o.unlink && f->filetype == FIO_TYPE_FILE) {
+			dprint(FD_FILE, "free unlink %s\n", f->file_name);
+			td_io_unlink_file(td, f);
+		}
+
+		if (fio_file_open(f))
+			td_io_close_file(td, f);
+
+		remove_file_hash(f);
+
+		if (td->o.unlink && f->filetype == FIO_TYPE_FILE) {
+			dprint(FD_FILE, "free unlink %s\n", f->file_name);
+			td_io_unlink_file(td, f);
+		}
+
+		zbd_free_zone_info(f);
+
+		if (use_free)
+			free(f->file_name);
+		else
+			sfree(f->file_name);
+		f->file_name = NULL;
+		if (fio_file_axmap(f)) {
+			axmap_free(f->io_axmap);
+			f->io_axmap = NULL;
+		}
+		if (use_free)
+			free(f);
+		else
+			sfree(f);
+	}
+
+	td->o.filename = NULL;
+	free(td->files);
+	free(td->file_locks);
+	td->files_index = 0;
+	td->files = NULL;
+	td->file_locks = NULL;
+	td->o.file_lock_mode = FILE_LOCK_NONE;
+	td->o.nr_files = 0;
+}
+
+static void get_file_type(struct fio_file *f)
+{
+	struct stat sb;
+
+	if (!strcmp(f->file_name, "-"))
+		f->filetype = FIO_TYPE_PIPE;
+	else
+		f->filetype = FIO_TYPE_FILE;
+
+#ifdef WIN32
+	/* \\.\ is the device namespace in Windows, where every file is
+	 * a block device */
+	if (strncmp(f->file_name, "\\\\.\\", 4) == 0)
+		f->filetype = FIO_TYPE_BLOCK;
+#endif
+
+	if (!stat(f->file_name, &sb)) {
+		if (S_ISBLK(sb.st_mode))
+			f->filetype = FIO_TYPE_BLOCK;
+		else if (S_ISCHR(sb.st_mode))
+			f->filetype = FIO_TYPE_CHAR;
+		else if (S_ISFIFO(sb.st_mode))
+			f->filetype = FIO_TYPE_PIPE;
+	}
+}
+
+static bool __is_already_allocated(const char *fname, bool set)
+{
+	struct flist_head *entry;
+	bool 
ret; + + ret = file_bloom_exists(fname, set); + if (!ret) + return ret; + + flist_for_each(entry, &filename_list) { + struct file_name *fn; + + fn = flist_entry(entry, struct file_name, list); + + if (!strcmp(fn->filename, fname)) + return true; + } + + return false; +} + +static bool is_already_allocated(const char *fname) +{ + bool ret; + + fio_file_hash_lock(); + ret = __is_already_allocated(fname, false); + fio_file_hash_unlock(); + + return ret; +} + +static void set_already_allocated(const char *fname) +{ + struct file_name *fn; + + fn = malloc(sizeof(struct file_name)); + fn->filename = strdup(fname); + + fio_file_hash_lock(); + if (!__is_already_allocated(fname, true)) { + flist_add_tail(&fn->list, &filename_list); + fn = NULL; + } + fio_file_hash_unlock(); + + if (fn) { + free(fn->filename); + free(fn); + } +} + +static void free_already_allocated(void) +{ + struct flist_head *entry, *tmp; + struct file_name *fn; + + if (flist_empty(&filename_list)) + return; + + fio_file_hash_lock(); + flist_for_each_safe(entry, tmp, &filename_list) { + fn = flist_entry(entry, struct file_name, list); + free(fn->filename); + flist_del(&fn->list); + free(fn); + } + + fio_file_hash_unlock(); +} + +static struct fio_file *alloc_new_file(struct thread_data *td) +{ + struct fio_file *f; + + if (td_ioengine_flagged(td, FIO_NOFILEHASH)) + f = calloc(1, sizeof(*f)); + else + f = scalloc(1, sizeof(*f)); + if (!f) { + assert(0); + return NULL; + } + + f->fd = -1; + f->shadow_fd = -1; + fio_file_reset(td, f); + return f; +} + +bool exists_and_not_regfile(const char *filename) +{ + struct stat sb; + + if (lstat(filename, &sb) == -1) + return false; + +#ifndef WIN32 /* NOT Windows */ + if (S_ISREG(sb.st_mode)) + return false; +#else + /* \\.\ is the device namespace in Windows, where every file + * is a device node */ + if (S_ISREG(sb.st_mode) && strncmp(filename, "\\\\.\\", 4) != 0) + return false; +#endif + + return true; +} + +int add_file(struct thread_data *td, const char *fname, int numjob, int inc) +{ + int cur_files = td->files_index; + char file_name[PATH_MAX]; + struct fio_file *f; + int len = 0; + + dprint(FD_FILE, "add file %s\n", fname); + + if (td->o.directory) + len = set_name_idx(file_name, PATH_MAX, td->o.directory, numjob, + td->o.unique_filename); + + sprintf(file_name + len, "%s", fname); + + /* clean cloned siblings using existing files */ + if (numjob && is_already_allocated(file_name) && + !exists_and_not_regfile(fname)) + return 0; + + f = alloc_new_file(td); + + if (td->files_size <= td->files_index) { + unsigned int new_size = td->o.nr_files + 1; + + dprint(FD_FILE, "resize file array to %d files\n", new_size); + + td->files = realloc(td->files, new_size * sizeof(f)); + if (td->files == NULL) { + log_err("fio: realloc OOM\n"); + assert(0); + } + if (td->o.file_lock_mode != FILE_LOCK_NONE) { + td->file_locks = realloc(td->file_locks, new_size); + if (!td->file_locks) { + log_err("fio: realloc OOM\n"); + assert(0); + } + td->file_locks[cur_files] = FILE_LOCK_NONE; + } + td->files_size = new_size; + } + td->files[cur_files] = f; + f->fileno = cur_files; + + /* + * init function, io engine may not be loaded yet + */ + if (td->io_ops && td_ioengine_flagged(td, FIO_DISKLESSIO)) + f->real_file_size = -1ULL; + + if (td_ioengine_flagged(td, FIO_NOFILEHASH)) + f->file_name = strdup(file_name); + else + f->file_name = smalloc_strdup(file_name); + + /* can't handle smalloc failure from here */ + assert(f->file_name); + + get_file_type(f); + + switch (td->o.file_lock_mode) { + case 
FILE_LOCK_NONE: + break; + case FILE_LOCK_READWRITE: + f->rwlock = fio_rwlock_init(); + break; + case FILE_LOCK_EXCLUSIVE: + f->lock = fio_sem_init(FIO_SEM_UNLOCKED); + break; + default: + log_err("fio: unknown lock mode: %d\n", td->o.file_lock_mode); + assert(0); + } + + td->files_index++; + + if (td->o.numjobs > 1) + set_already_allocated(file_name); + + if (inc) + td->o.nr_files++; + + dprint(FD_FILE, "file %p \"%s\" added at %d\n", f, f->file_name, + cur_files); + + return cur_files; +} + +int add_file_exclusive(struct thread_data *td, const char *fname) +{ + struct fio_file *f; + unsigned int i; + + for_each_file(td, f, i) { + if (!strcmp(f->file_name, fname)) + return i; + } + + return add_file(td, fname, 0, 1); +} + +void get_file(struct fio_file *f) +{ + dprint(FD_FILE, "get file %s, ref=%d\n", f->file_name, f->references); + assert(fio_file_open(f)); + f->references++; +} + +int put_file(struct thread_data *td, struct fio_file *f) +{ + int f_ret = 0, ret = 0; + + dprint(FD_FILE, "put file %s, ref=%d\n", f->file_name, f->references); + + if (!fio_file_open(f)) { + assert(f->fd == -1); + return 0; + } + + assert(f->references); + if (--f->references) + return 0; + + disk_util_dec(f->du); + + if (td->o.file_lock_mode != FILE_LOCK_NONE) + unlock_file_all(td, f); + + if (should_fsync(td) && td->o.fsync_on_close) { + f_ret = fsync(f->fd); + if (f_ret < 0) + f_ret = errno; + } + + if (td->io_ops->close_file) + ret = td->io_ops->close_file(td, f); + + if (!ret) + ret = f_ret; + + td->nr_open_files--; + fio_file_clear_closing(f); + fio_file_clear_open(f); + assert(f->fd == -1); + return ret; +} + +void lock_file(struct thread_data *td, struct fio_file *f, enum fio_ddir ddir) +{ + if (!f->lock || td->o.file_lock_mode == FILE_LOCK_NONE) + return; + + if (td->o.file_lock_mode == FILE_LOCK_READWRITE) { + if (ddir == DDIR_READ) + fio_rwlock_read(f->rwlock); + else + fio_rwlock_write(f->rwlock); + } else if (td->o.file_lock_mode == FILE_LOCK_EXCLUSIVE) + fio_sem_down(f->lock); + + td->file_locks[f->fileno] = td->o.file_lock_mode; +} + +void unlock_file(struct thread_data *td, struct fio_file *f) +{ + if (!f->lock || td->o.file_lock_mode == FILE_LOCK_NONE) + return; + + if (td->o.file_lock_mode == FILE_LOCK_READWRITE) + fio_rwlock_unlock(f->rwlock); + else if (td->o.file_lock_mode == FILE_LOCK_EXCLUSIVE) + fio_sem_up(f->lock); + + td->file_locks[f->fileno] = FILE_LOCK_NONE; +} + +void unlock_file_all(struct thread_data *td, struct fio_file *f) +{ + if (td->o.file_lock_mode == FILE_LOCK_NONE || !td->file_locks) + return; + if (td->file_locks[f->fileno] != FILE_LOCK_NONE) + unlock_file(td, f); +} + +static bool recurse_dir(struct thread_data *td, const char *dirname) +{ + struct dirent *dir; + bool ret = false; + DIR *D; + + D = opendir(dirname); + if (!D) { + char buf[FIO_VERROR_SIZE]; + + snprintf(buf, FIO_VERROR_SIZE, "opendir(%s)", dirname); + td_verror(td, errno, buf); + return true; + } + + while ((dir = readdir(D)) != NULL) { + char full_path[PATH_MAX]; + struct stat sb; + + if (!strcmp(dir->d_name, ".") || !strcmp(dir->d_name, "..")) + continue; + + sprintf(full_path, "%s%c%s", dirname, FIO_OS_PATH_SEPARATOR, dir->d_name); + + if (lstat(full_path, &sb) == -1) { + if (errno != ENOENT) { + td_verror(td, errno, "stat"); + ret = true; + break; + } + } + + if (S_ISREG(sb.st_mode)) { + add_file(td, full_path, 0, 1); + continue; + } + if (!S_ISDIR(sb.st_mode)) + continue; + + ret = recurse_dir(td, full_path); + if (ret) + break; + } + + closedir(D); + return ret; +} + +int add_dir_files(struct 
thread_data *td, const char *path) +{ + int ret = recurse_dir(td, path); + + if (!ret) + log_info("fio: opendir added %d files\n", td->o.nr_files); + + return ret; +} + +void dup_files(struct thread_data *td, struct thread_data *org) +{ + struct fio_file *f; + unsigned int i; + + dprint(FD_FILE, "dup files: %d\n", org->files_index); + + if (!org->files) + return; + + td->files = malloc(org->files_index * sizeof(f)); + + if (td->o.file_lock_mode != FILE_LOCK_NONE) + td->file_locks = malloc(org->files_index); + + for_each_file(org, f, i) { + struct fio_file *__f; + + __f = alloc_new_file(td); + + if (f->file_name) { + if (td_ioengine_flagged(td, FIO_NOFILEHASH)) + __f->file_name = strdup(f->file_name); + else + __f->file_name = smalloc_strdup(f->file_name); + + /* can't handle smalloc failure from here */ + assert(__f->file_name); + __f->filetype = f->filetype; + } + + if (td->o.file_lock_mode == FILE_LOCK_EXCLUSIVE) + __f->lock = f->lock; + else if (td->o.file_lock_mode == FILE_LOCK_READWRITE) + __f->rwlock = f->rwlock; + + td->files[i] = __f; + } +} + +/* + * Returns the index that matches the filename, or -1 if not there + */ +int get_fileno(struct thread_data *td, const char *fname) +{ + struct fio_file *f; + unsigned int i; + + for_each_file(td, f, i) + if (!strcmp(f->file_name, fname)) + return i; + + return -1; +} + +/* + * For log usage, where we add/open/close files automatically + */ +void free_release_files(struct thread_data *td) +{ + close_files(td); + td->o.nr_files = 0; + td->o.open_files = 0; + td->files_index = 0; +} + +void fio_file_reset(struct thread_data *td, struct fio_file *f) +{ + int i; + + for (i = 0; i < DDIR_RWDIR_CNT; i++) { + f->last_pos[i] = f->file_offset; + f->last_start[i] = -1ULL; + } + + if (fio_file_axmap(f)) + axmap_reset(f->io_axmap); + else if (fio_file_lfsr(f)) + lfsr_reset(&f->lfsr, td->rand_seeds[FIO_RAND_BLOCK_OFF]); + + zbd_file_reset(td, f); +} + +bool fio_files_done(struct thread_data *td) +{ + struct fio_file *f; + unsigned int i; + + for_each_file(td, f, i) + if (!fio_file_done(f)) + return false; + + return true; +} + +/* free memory used in initialization phase only */ +void filesetup_mem_free(void) +{ + free_already_allocated(); +} + +/* + * This function is for platforms which support direct I/O but not O_DIRECT. + */ +int fio_set_directio(struct thread_data *td, struct fio_file *f) +{ +#ifdef FIO_OS_DIRECTIO + int ret = fio_set_odirect(f); + + if (ret) { + td_verror(td, ret, "fio_set_directio"); +#if defined(__sun__) + if (ret == ENOTTY) { /* ENOTTY suggests RAW device or ZFS */ + log_err("fio: doing directIO to RAW devices or ZFS not supported\n"); + } else { + log_err("fio: the file system does not seem to support direct IO\n"); + } +#else + log_err("fio: the file system does not seem to support direct IO\n"); +#endif + return -1; + } + + return 0; +#else + log_err("fio: direct IO is not supported on this host operating system\n"); + return -1; +#endif +} diff --git a/fio.1 b/fio.1 new file mode 100644 index 0000000..1db12c2 --- /dev/null +++ b/fio.1 @@ -0,0 +1,4002 @@ +.TH fio 1 "August 2017" "User Manual" +.SH NAME +fio \- flexible I/O tester +.SH SYNOPSIS +.B fio +[\fIoptions\fR] [\fIjobfile\fR]... +.SH DESCRIPTION +.B fio +is a tool that will spawn a number of threads or processes doing a +particular type of I/O action as specified by the user. +The typical use of fio is to write a job file matching the I/O load +one wants to simulate. 
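+.P
+As a quick illustration (a minimal sketch; the job name and path below are
+only examples), a job file that performs 4 KiB random reads against a single
+1 GiB file could look like:
+.RS
+.P
+.PD 0
+[randread\-example]
+.P
+rw=randread
+.P
+bs=4k
+.P
+size=1g
+.P
+filename=/tmp/fio.test
+.PD
+.RE
+.P
+Saving the above as a job file and running `fio jobfile.fio' spawns a single
+worker that issues random 4 KiB reads until 1 GiB has been transferred.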
+.SH OPTIONS
+.TP
+.BI \-\-debug \fR=\fPtype
+Enable verbose tracing \fItype\fR of various fio actions. May be `all' for all \fItype\fRs
+or individual types separated by a comma (e.g. `\-\-debug=file,mem' will enable
+file and memory debugging). `help' will list all available tracing options.
+.TP
+.BI \-\-parse\-only
+Parse options only, don't start any I/O.
+.TP
+.BI \-\-merge\-blktrace\-only
+Merge blktraces only, don't start any I/O.
+.TP
+.BI \-\-output \fR=\fPfilename
+Write output to \fIfilename\fR.
+.TP
+.BI \-\-output\-format \fR=\fPformat
+Set the reporting \fIformat\fR to `normal', `terse', `json', or
+`json+'. Multiple formats can be selected, separated by a comma. `terse'
+is a CSV\-based format. `json+' is like `json', except it adds a full
+dump of the latency buckets.
+.TP
+.BI \-\-bandwidth\-log
+Generate aggregate bandwidth logs.
+.TP
+.BI \-\-minimal
+Print statistics in a terse, semicolon\-delimited format.
+.TP
+.BI \-\-append\-terse
+Print statistics in selected mode AND terse, semicolon\-delimited format.
+\fBDeprecated\fR, use \fB\-\-output\-format\fR instead to select multiple formats.
+.TP
+.BI \-\-terse\-version \fR=\fPversion
+Set terse \fIversion\fR output format (default `3'; `2', `4' and `5' are
+also accepted).
+.TP
+.BI \-\-version
+Print version information and exit.
+.TP
+.BI \-\-help
+Print a summary of the command line options and exit.
+.TP
+.BI \-\-cpuclock\-test
+Perform test and validation of internal CPU clock.
+.TP
+.BI \-\-crctest \fR=\fP[test]
+Test the speed of the built\-in checksumming functions. If no argument is given,
+all of them are tested. Alternatively, a comma separated list can be passed, in which
+case the given ones are tested.
+.TP
+.BI \-\-cmdhelp \fR=\fPcommand
+Print help information for \fIcommand\fR. May be `all' for all commands.
+.TP
+.BI \-\-enghelp \fR=\fP[ioengine[,command]]
+List all commands defined by \fIioengine\fR, or print help for \fIcommand\fR
+defined by \fIioengine\fR. If no \fIioengine\fR is given, list all
+available ioengines.
+.TP
+.BI \-\-showcmd \fR=\fPjobfile
+Convert \fIjobfile\fR to a set of command\-line options.
+.TP
+.BI \-\-readonly
+Turn on safety read\-only checks, preventing writes and trims. The \fB\-\-readonly\fR
+option is an extra safety guard to prevent users from accidentally starting
+a write or trim workload when that is not desired. Fio will only modify the
+device under test if `rw=write/randwrite/rw/randrw/trim/randtrim/trimwrite'
+is given. This safety net can be used as an extra precaution.
+.TP
+.BI \-\-eta \fR=\fPwhen
+Specifies when real\-time ETA estimate should be printed. \fIwhen\fR may
+be `always', `never' or `auto'. `auto' is the default; it prints ETA when
+requested if the output is a TTY. `always' disregards the output type, and
+prints ETA when requested. `never' never prints ETA.
+.TP
+.BI \-\-eta\-interval \fR=\fPtime
+By default, fio requests client ETA status roughly every second. With this
+option, the interval is configurable. Fio imposes a minimum allowed time to
+avoid flooding the console; intervals shorter than 250 msec are not supported.
+.TP
+.BI \-\-eta\-newline \fR=\fPtime
+Force a new line for every \fItime\fR period passed. When the unit is omitted,
+the value is interpreted in seconds.
+.TP
+.BI \-\-status\-interval \fR=\fPtime
+Force a full status dump of cumulative (from job start) values at \fItime\fR
+intervals. This option does *not* provide per-period measurements. So
+values such as bandwidth are running averages.
+When the time unit is omitted,
+\fItime\fR is interpreted in seconds. Note that using this option with
+`\-\-output-format=json' will yield output that technically isn't valid json,
+since the output will be collated sets of valid json. It will need to be split
+into valid sets of json after the run.
+.TP
+.BI \-\-section \fR=\fPname
+Only run specified section \fIname\fR in job file. Multiple sections can be specified.
+The \fB\-\-section\fR option allows one to combine related jobs into one file.
+E.g. one job file could define light, moderate, and heavy sections. Tell
+fio to run only the "heavy" section by giving the `\-\-section=heavy'
+command line option. One can also specify the "write" operations in one
+section and "verify" operations in another section. The \fB\-\-section\fR option
+only applies to job sections. The reserved *global* section is always
+parsed and used.
+.TP
+.BI \-\-alloc\-size \fR=\fPkb
+Allocate additional internal smalloc pools of size \fIkb\fR in KiB. The
+\fB\-\-alloc\-size\fR option increases shared memory set aside for use by fio.
+If running large jobs with randommap enabled, fio can run out of memory.
+Smalloc is an internal allocator for shared structures from a fixed size
+memory pool and can grow to 16 pools. The pool size defaults to 16MiB.
+NOTE: While running, `.fio_smalloc.*' backing store files are visible
+in `/tmp'.
+.TP
+.BI \-\-warnings\-fatal
+All fio parser warnings are fatal, causing fio to exit with an error.
+.TP
+.BI \-\-max\-jobs \fR=\fPnr
+Set the maximum number of threads/processes to support to \fInr\fR.
+NOTE: On Linux, it may be necessary to increase the shared-memory limit
+(`/proc/sys/kernel/shmmax') if fio runs into errors while creating jobs.
+.TP
+.BI \-\-server \fR=\fPargs
+Start a backend server, with \fIargs\fR specifying what to listen to.
+See \fBCLIENT/SERVER\fR section.
+.TP
+.BI \-\-daemonize \fR=\fPpidfile
+Background a fio server, writing the pid to the given \fIpidfile\fR.
+.TP
+.BI \-\-client \fR=\fPhostname
+Instead of running the jobs locally, send and run them on the given \fIhostname\fR
+or set of \fIhostname\fRs. See \fBCLIENT/SERVER\fR section.
+.TP
+.BI \-\-remote\-config \fR=\fPfile
+Tell fio server to load this local \fIfile\fR.
+.TP
+.BI \-\-idle\-prof \fR=\fPoption
+Report CPU idleness. \fIoption\fR is one of the following:
+.RS
+.RS
+.TP
+.B calibrate
+Run unit work calibration only and exit.
+.TP
+.B system
+Show aggregate system idleness and unit work.
+.TP
+.B percpu
+As \fBsystem\fR but also show per CPU idleness.
+.RE
+.RE
+.TP
+.BI \-\-inflate\-log \fR=\fPlog
+Inflate and output compressed \fIlog\fR.
+.TP
+.BI \-\-trigger\-file \fR=\fPfile
+Execute trigger command when \fIfile\fR exists.
+.TP
+.BI \-\-trigger\-timeout \fR=\fPtime
+Execute trigger at this \fItime\fR.
+.TP
+.BI \-\-trigger \fR=\fPcommand
+Set this \fIcommand\fR as local trigger.
+.TP
+.BI \-\-trigger\-remote \fR=\fPcommand
+Set this \fIcommand\fR as remote trigger.
+.TP
+.BI \-\-aux\-path \fR=\fPpath
+Use the directory specified by \fIpath\fP for generated state files instead
+of the current working directory.
+.SH "JOB FILE FORMAT"
+Any parameters following the options will be assumed to be job files, unless
+they match a job file parameter. Multiple job files can be listed and each job
+file will be regarded as a separate group. Fio will \fBstonewall\fR execution
+between each group.
+
+Fio accepts one or more job files describing what it is
+supposed to do.
+The job file format is the classic ini file, where the names
+enclosed in [] brackets define the job name. You are free to use any ASCII name
+you want, except *global*, which has special meaning. Following the job name is
+a sequence of zero or more parameters, one per line, that define the behavior of
+the job. If the first character in a line is a ';' or a '#', the entire line is
+discarded as a comment.
+
+A *global* section sets defaults for the jobs described in that file. A job may
+override a *global* section parameter, and a job file may even have several
+*global* sections if so desired. A job is only affected by a *global* section
+residing above it.
+
+The \fB\-\-cmdhelp\fR option also lists all options. If used with a \fIcommand\fR
+argument, \fB\-\-cmdhelp\fR will detail the given \fIcommand\fR.
+
+See the `examples/' directory for inspiration on how to write job files. Note
+that the copyright and license requirements currently apply to
+`examples/' files.
+
+Note that the maximum length of a line in the job file is 8192 bytes.
+.SH "JOB FILE PARAMETERS"
+Some parameters take an option of a given type, such as an integer or a
+string. Anywhere a numeric value is required, an arithmetic expression may be
+used, provided it is surrounded by parentheses. Supported operators are:
+.RS
+.P
+.B addition (+)
+.P
+.B subtraction (\-)
+.P
+.B multiplication (*)
+.P
+.B division (/)
+.P
+.B modulus (%)
+.P
+.B exponentiation (^)
+.RE
+.P
+For time values in expressions, units are microseconds by default. This is
+different from time values not in expressions (not enclosed in
+parentheses).
+.SH "PARAMETER TYPES"
+The following parameter types are used.
+.TP
+.I str
+String. A sequence of alphanumeric characters.
+.TP
+.I time
+Integer with possible time suffix. Without a unit, the value is interpreted as
+seconds unless otherwise specified. Accepts a suffix of 'd' for days, 'h' for
+hours, 'm' for minutes, 's' for seconds, 'ms' (or 'msec') for milliseconds and 'us'
+(or 'usec') for microseconds. For example, use 10m for 10 minutes.
+.TP
+.I int
+Integer. A whole number value, which may contain an integer prefix
+and an integer suffix.
+.RS
+.RS
+.P
+[*integer prefix*] **number** [*integer suffix*]
+.RE
+.P
+The optional *integer prefix* specifies the number's base. The default
+is decimal. *0x* specifies hexadecimal.
+.P
+The optional *integer suffix* specifies the number's units, and includes an
+optional unit prefix and an optional unit. For quantities of data, the
+default unit is bytes. For quantities of time, the default unit is seconds
+unless otherwise specified.
+.P
+With `kb_base=1000', fio follows international standards for unit
+prefixes. To specify power-of-10 decimal values defined in the
+International System of Units (SI):
+.RS
+.P
+.PD 0
+K means kilo (K) or 1000
+.P
+M means mega (M) or 1000**2
+.P
+G means giga (G) or 1000**3
+.P
+T means tera (T) or 1000**4
+.P
+P means peta (P) or 1000**5
+.PD
+.RE
+.P
+To specify power-of-2 binary values defined in IEC 80000-13:
+.RS
+.P
+.PD 0
+Ki means kibi (Ki) or 1024
+.P
+Mi means mebi (Mi) or 1024**2
+.P
+Gi means gibi (Gi) or 1024**3
+.P
+Ti means tebi (Ti) or 1024**4
+.P
+Pi means pebi (Pi) or 1024**5
+.PD
+.RE
+.P
+With `kb_base=1024' (the default), the unit prefixes are opposite
+from those specified in the SI and IEC 80000-13 standards to provide
+compatibility with old scripts. For example, 4k means 4096.
+.P
+For quantities of data, an optional unit of 'B' may be included
+(e.g., 'kB' is the same as 'k').
+.P
+The *integer suffix* is not case sensitive (e.g., m/mi mean mebi/mega,
+not milli). 'b' and 'B' both mean byte, not bit.
+.P
+Examples with `kb_base=1000':
+.RS
+.P
+.PD 0
+4 KiB: 4096, 4096b, 4096B, 4ki, 4kib, 4kiB, 4Ki, 4KiB
+.P
+1 MiB: 1048576, 1mi, 1024ki
+.P
+1 MB: 1000000, 1m, 1000k
+.P
+1 GiB: 1073741824, 1gi, 1024mi, 1048576ki
+.P
+1 GB: 1000000000, 1g, 1000m, 1000000k
+.PD
+.RE
+.P
+Examples with `kb_base=1024' (default):
+.RS
+.P
+.PD 0
+4 KiB: 4096, 4096b, 4096B, 4k, 4kb, 4kB, 4K, 4KB
+.P
+1 MiB: 1048576, 1m, 1024k
+.P
+1 MB: 1000000, 1mi, 1000ki
+.P
+1 GiB: 1073741824, 1g, 1024m, 1048576k
+.P
+1 GB: 1000000000, 1gi, 1000mi, 1000000ki
+.PD
+.RE
+.P
+To specify times (units are not case sensitive):
+.RS
+.P
+.PD 0
+D means days
+.P
+H means hours
+.P
+M means minutes
+.P
+s or sec means seconds (default)
+.P
+ms or msec means milliseconds
+.P
+us or usec means microseconds
+.PD
+.RE
+.P
+If the option accepts an upper and lower range, use a colon ':' or
+minus '\-' to separate such values. See \fIirange\fR parameter type.
+If the lower value specified happens to be larger than the upper value,
+the two values are swapped.
+.RE
+.TP
+.I bool
+Boolean. Usually parsed as an integer; only defined for
+true and false (1 and 0).
+.TP
+.I irange
+Integer range with suffix. Allows value range to be given, such as
+1024\-4096. A colon may also be used as the separator, e.g. 1k:4k. If the
+option allows two sets of ranges, they can be specified with a ',' or '/'
+delimiter: 1k\-4k/8k\-32k. Also see \fIint\fR parameter type.
+.TP
+.I float_list
+A list of floating point numbers, separated by a ':' character.
+.SH "JOB PARAMETERS"
+With the above in mind, here follows the complete list of fio job parameters.
+.SS "Units"
+.TP
+.BI kb_base \fR=\fPint
+Select the interpretation of unit prefixes in input parameters.
+.RS
+.RS
+.TP
+.B 1000
+Inputs comply with IEC 80000-13 and the International
+System of Units (SI). Use:
+.RS
+.P
+.PD 0
+\- power-of-2 values with IEC prefixes (e.g., KiB)
+.P
+\- power-of-10 values with SI prefixes (e.g., kB)
+.PD
+.RE
+.TP
+.B 1024
+Compatibility mode (default). To avoid breaking old scripts:
+.P
+.RS
+.PD 0
+\- power-of-2 values with SI prefixes
+.P
+\- power-of-10 values with IEC prefixes
+.PD
+.RE
+.RE
+.P
+See \fBbs\fR for more details on input parameters.
+.P
+Outputs always use correct prefixes. Most outputs include both
+side-by-side, like:
+.P
+.RS
+bw=2383.3kB/s (2327.4KiB/s)
+.RE
+.P
+If only one value is reported, then kb_base selects the one to use:
+.P
+.RS
+.PD 0
+1000 \-\- SI prefixes
+.P
+1024 \-\- IEC prefixes
+.PD
+.RE
+.RE
+.TP
+.BI unit_base \fR=\fPint
+Base unit for reporting. Allowed values are:
+.RS
+.RS
+.TP
+.B 0
+Use auto-detection (default).
+.TP
+.B 8
+Byte based.
+.TP
+.B 1
+Bit based.
+.RE
+.RE
+.SS "Job description"
+.TP
+.BI name \fR=\fPstr
+ASCII name of the job. This may be used to override the name printed by fio
+for this job. Otherwise the job name is used. On the command line this
+parameter has the special purpose of also signaling the start of a new job.
+.TP
+.BI description \fR=\fPstr
+Text description of the job. Doesn't do anything except dump this text
+description when this job is run. It's not parsed.
+.TP
+.BI loops \fR=\fPint
+Run the specified number of iterations of this job. Used to repeat the same
+workload a given number of times. Defaults to 1.
+.TP
+.BI numjobs \fR=\fPint
+Create the specified number of clones of this job. Each clone of the job
+is spawned as an independent thread or process.
+May be used to set up a
+larger number of threads/processes doing the same thing. Each thread is
+reported separately; to see statistics for all clones as a whole, use
+\fBgroup_reporting\fR in conjunction with \fBnew_group\fR.
+See \fB\-\-max\-jobs\fR. Default: 1.
+.SS "Time related parameters"
+.TP
+.BI runtime \fR=\fPtime
+Tell fio to terminate processing after the specified period of time. It
+can be quite hard to determine for how long a specified job will run, so
+this parameter is handy to cap the total runtime to a given time. When
+the unit is omitted, the value is interpreted in seconds.
+.TP
+.BI time_based
+If set, fio will run for the duration of the \fBruntime\fR specified
+even if the file(s) are completely read or written. It will simply loop over
+the same workload as many times as the \fBruntime\fR allows.
+.TP
+.BI startdelay \fR=\fPirange(int)
+Delay the start of job for the specified amount of time. Can be a single
+value or a range. When given as a range, each thread will choose a value
+randomly from within the range. Value is in seconds if a unit is omitted.
+.TP
+.BI ramp_time \fR=\fPtime
+If set, fio will run the specified workload for this amount of time before
+logging any performance numbers. Useful for letting performance settle
+before logging results, thus minimizing the runtime required for stable
+results. Note that the \fBramp_time\fR is considered lead in time for a job,
+thus it will increase the total runtime if a special timeout or
+\fBruntime\fR is specified. When the unit is omitted, the value is
+given in seconds.
+.TP
+.BI clocksource \fR=\fPstr
+Use the given clocksource as the base of timing. The supported options are:
+.RS
+.RS
+.TP
+.B gettimeofday
+\fBgettimeofday\fR\|(2)
+.TP
+.B clock_gettime
+\fBclock_gettime\fR\|(2)
+.TP
+.B cpu
+Internal CPU clock source
+.RE
+.P
+\fBcpu\fR is the preferred clocksource if it is reliable, as it is very fast (and
+fio is heavy on time calls). Fio will automatically use this clocksource if
+it's supported and considered reliable on the system it is running on,
+unless another clocksource is specifically set. For x86/x86\-64 CPUs, this
+means supporting TSC Invariant.
+.RE
+.TP
+.BI gtod_reduce \fR=\fPbool
+Enable all of the \fBgettimeofday\fR\|(2) reducing options
+(\fBdisable_clat\fR, \fBdisable_slat\fR, \fBdisable_bw_measurement\fR) plus
+reduce precision of the timeout somewhat to really shrink the
+\fBgettimeofday\fR\|(2) call count. With this option enabled, we only do
+about 0.4% of the \fBgettimeofday\fR\|(2) calls we would have done if all
+time keeping was enabled.
+.TP
+.BI gtod_cpu \fR=\fPint
+Sometimes it's cheaper to dedicate a single thread of execution to just
+getting the current time. Fio (and databases, for instance) is very
+intensive on \fBgettimeofday\fR\|(2) calls. With this option, you can set
+one CPU aside for doing nothing but logging current time to a shared memory
+location. Then the other threads/processes that run I/O workloads need only
+copy that segment, instead of entering the kernel with a
+\fBgettimeofday\fR\|(2) call. The CPU set aside for doing these time
+calls will be excluded from other uses. Fio will manually clear it from the
+CPU mask of other jobs.
+.SS "Target file/device"
+.TP
+.BI directory \fR=\fPstr
+Prefix \fBfilename\fRs with this directory. Used to place files in a different
+location than `./'. You can specify a number of directories by
+separating the names with a ':' character.
+These directories will be distributed equally among the job clones created
+by \fBnumjobs\fR, as long as the clones are using generated filenames.
+If specific \fBfilename\fR(s) are
+set fio will use the first listed directory, and thereby matching the
+\fBfilename\fR semantic (which generates a file for each clone if not
+specified, but lets all clones use the same file if set).
+.RS
+.P
+See the \fBfilename\fR option for information on how to escape ':'
+characters within the directory path itself.
+.P
+Note: To control the directory fio will use for internal state files
+use \fB\-\-aux\-path\fR.
+.RE
+.TP
+.BI filename \fR=\fPstr
+Fio normally makes up a \fBfilename\fR based on the job name, thread number, and
+file number (see \fBfilename_format\fR). If you want to share files
+between threads in a job or several
+jobs with fixed file paths, specify a \fBfilename\fR for each of them to override
+the default. If the ioengine is file based, you can specify a number of files
+by separating the names with a ':' character. So if you wanted a job to open
+`/dev/sda' and `/dev/sdb' as the two working files, you would use
+`filename=/dev/sda:/dev/sdb'. This also means that whenever this option is
+specified, \fBnrfiles\fR is ignored. The size of regular files specified
+by this option will be \fBsize\fR divided by number of files unless an
+explicit size is specified by \fBfilesize\fR.
+.RS
+.P
+Each colon in the wanted path must be escaped with a '\\'
+character. For instance, if the path is `/dev/dsk/foo@3,0:c' then you
+would use `filename=/dev/dsk/foo@3,0\\:c' and if the path is
+`F:\\filename' then you would use `filename=F\\:\\filename'.
+.P
+On Windows, disk devices are accessed as `\\\\.\\PhysicalDrive0' for
+the first device, `\\\\.\\PhysicalDrive1' for the second etc.
+Note: Windows and FreeBSD prevent write access to areas
+of the disk containing in-use data (e.g. filesystems).
+.P
+The filename `\-' is a reserved name, meaning *stdin* or *stdout*. Which
+of the two depends on the read/write direction set.
+.RE
+.TP
+.BI filename_format \fR=\fPstr
+If sharing multiple files between jobs, it is usually necessary to have fio
+generate the exact names that you want. By default, fio will name a file
+based on the default file format specification of
+`jobname.jobnumber.filenumber'. With this option, that can be
+customized. Fio will recognize and replace the following keywords in this
+string:
+.RS
+.RS
+.TP
+.B $jobname
+The name of the worker thread or process.
+.TP
+.B $jobnum
+The incremental number of the worker thread or process.
+.TP
+.B $filenum
+The incremental number of the file for that worker thread or process.
+.RE
+.P
+To have dependent jobs share a set of files, this option can be set to have
+fio generate filenames that are shared between the two. For instance, if
+`testfiles.$filenum' is specified, file number 4 for any job will be
+named `testfiles.4'. The default of `$jobname.$jobnum.$filenum'
+will be used if no other format specifier is given.
+.P
+If you specify a path then the directories will be created up to the main
+directory for the file. So for example if you specify `a/b/c/$jobnum` then the
+directories a/b/c will be created before the file setup part of the job. If you
+specify \fBdirectory\fR then the path will be relative to that directory,
+otherwise it is treated as the absolute path.
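+.P
+As a further illustration (a sketch; the job name here is arbitrary), a job
+named "seqwrite" using `filename_format=$jobname.$filenum' with `nrfiles=2'
+would generate the files `seqwrite.0' and `seqwrite.1'.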
+.RE
+.TP
+.BI unique_filename \fR=\fPbool
+To avoid collisions between networked clients, fio defaults to prefixing any
+generated filenames (with a directory specified) with the source of the
+client connecting. To disable this behavior, set this option to 0.
+.TP
+.BI opendir \fR=\fPstr
+Recursively open any files below directory \fIstr\fR.
+.TP
+.BI lockfile \fR=\fPstr
+Fio defaults to not locking any files before it does I/O to them. If a file
+or file descriptor is shared, fio can serialize I/O to that file to make the
+end result consistent. This is useful for emulating real workloads that
+share files. The lock modes are:
+.RS
+.RS
+.TP
+.B none
+No locking. The default.
+.TP
+.B exclusive
+Only one thread or process may do I/O at a time, excluding all others.
+.TP
+.B readwrite
+Read\-write locking on the file. Many readers may
+access the file at the same time, but writes get exclusive access.
+.RE
+.RE
+.TP
+.BI nrfiles \fR=\fPint
+Number of files to use for this job. Defaults to 1. The size of files
+will be \fBsize\fR divided by this unless explicit size is specified by
+\fBfilesize\fR. Files are created for each thread separately, and each
+file will have a file number within its name by default, as explained in
+\fBfilename\fR section.
+.TP
+.BI openfiles \fR=\fPint
+Number of files to keep open at the same time. Defaults to the same as
+\fBnrfiles\fR, can be set smaller to limit the number of simultaneous
+opens.
+.TP
+.BI file_service_type \fR=\fPstr
+Defines how fio decides which file from a job to service next. The following
+types are defined:
+.RS
+.RS
+.TP
+.B random
+Choose a file at random.
+.TP
+.B roundrobin
+Round robin over opened files. This is the default.
+.TP
+.B sequential
+Finish one file before moving on to the next. Multiple files can
+still be open depending on \fBopenfiles\fR.
+.TP
+.B zipf
+Use a Zipf distribution to decide what file to access.
+.TP
+.B pareto
+Use a Pareto distribution to decide what file to access.
+.TP
+.B normal
+Use a Gaussian (normal) distribution to decide what file to access.
+.TP
+.B gauss
+Alias for normal.
+.RE
+.P
+For \fBrandom\fR, \fBroundrobin\fR, and \fBsequential\fR, a postfix can be appended to
+tell fio how many I/Os to issue before switching to a new file. For example,
+specifying `file_service_type=random:8' would cause fio to issue
+8 I/Os before selecting a new file at random. For the non-uniform
+distributions, a floating point postfix can be given to influence how the
+distribution is skewed. See \fBrandom_distribution\fR for a description
+of how that would work.
+.RE
+.TP
+.BI ioscheduler \fR=\fPstr
+Attempt to switch the device hosting the file to the specified I/O scheduler
+before running.
+.TP
+.BI create_serialize \fR=\fPbool
+If true, serialize the file creation for the jobs. This may be handy to
+avoid interleaving of data files, which may greatly depend on the filesystem
+used and even the number of processors in the system. Default: true.
+.TP
+.BI create_fsync \fR=\fPbool
+\fBfsync\fR\|(2) the data file after creation. This is the default.
+.TP
+.BI create_on_open \fR=\fPbool
+If true, don't pre-create files but allow the job's open() to create a file
+when it's time to do I/O. Default: false \-\- pre-create all necessary files
+when the job starts.
+.TP
+.BI create_only \fR=\fPbool
+If true, fio will only run the setup phase of the job. If files need to be
+laid out or updated on disk, only that will be done \-\- the actual job contents
+are not executed. Default: false.
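+.P
+For instance (an illustrative sketch; the job file name is hypothetical),
+the layout phase can be separated from the measured run, since job
+parameters may also be passed on the command line:
+.RS
+.P
+.PD 0
+$ fio \-\-create_only=1 job.fio
+.P
+$ fio job.fio
+.PD
+.RE
+.P
+The first invocation only creates and lays out the file set; the second
+then runs the actual workload against files that already exist.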
+.TP
+.BI allow_file_create \fR=\fPbool
+If true, fio is permitted to create files as part of its workload. If this
+option is false, then fio will error out if
+the files it needs to use don't already exist. Default: true.
+.TP
+.BI allow_mounted_write \fR=\fPbool
+If this isn't set, fio will abort jobs that are destructive (e.g. that write)
+to what appears to be a mounted device or partition. This should help catch
+cases where a destructive test is created inadvertently, without realizing
+that it will destroy data on the mounted file system. Note that some
+platforms don't allow writing against a mounted device regardless of this
+option. Default: false.
+.TP
+.BI pre_read \fR=\fPbool
+If this is given, files will be pre-read into memory before starting the
+given I/O operation. This will also clear the \fBinvalidate\fR flag,
+since it is pointless to pre-read and then drop the cache. This will only
+work for I/O engines that are seek-able, since they allow you to read the
+same data multiple times. Thus it will not work on non-seekable I/O engines
+(e.g. network, splice). Default: false.
+.TP
+.BI unlink \fR=\fPbool
+Unlink the job files when done. Not the default, as repeated runs of that
+job would then waste time recreating the file set again and again. Default:
+false.
+.TP
+.BI unlink_each_loop \fR=\fPbool
+Unlink job files after each iteration or loop. Default: false.
+.TP
+.BI zonemode \fR=\fPstr
+Accepted values are:
+.RS
+.RS
+.TP
+.B none
+The \fBzonerange\fR, \fBzonesize\fR and \fBzoneskip\fR parameters are ignored.
+.TP
+.B strided
+I/O happens in a single zone until \fBzonesize\fR bytes have been transferred.
+After that number of bytes has been transferred, processing of the next zone
+starts.
+.TP
+.B zbd
+Zoned block device mode. I/O happens sequentially in each zone, even if random
+I/O has been selected. Random I/O happens across all zones instead of being
+restricted to a single zone.
+.RE
+.RE
+.TP
+.BI zonerange \fR=\fPint
+For \fBzonemode\fR=strided, this is the size of a single zone. See also
+\fBzonesize\fR and \fBzoneskip\fR.
+
+For \fBzonemode\fR=zbd, this parameter is ignored.
+.TP
+.BI zonesize \fR=\fPint
+For \fBzonemode\fR=strided, this is the number of bytes to transfer before
+skipping \fBzoneskip\fR bytes. If this parameter is smaller than
+\fBzonerange\fR then only a fraction of each zone with \fBzonerange\fR bytes
+will be accessed. If this parameter is larger than \fBzonerange\fR then each
+zone will be accessed multiple times before skipping to the next zone.
+
+For \fBzonemode\fR=zbd, this is the size of a single zone. The
+\fBzonerange\fR parameter is ignored in this mode. For a job accessing a
+zoned block device, the specified \fBzonesize\fR must be 0 or equal to the
+device zone size. For a regular block device or file, the specified
+\fBzonesize\fR must be at least 512B.
+.TP
+.BI zoneskip \fR=\fPint
+For \fBzonemode\fR=strided, the number of bytes to skip after \fBzonesize\fR
+bytes of data have been transferred.
+
+For \fBzonemode\fR=zbd, the \fBzonesize\fR aligned number of bytes to skip
+once a zone is fully written (write workloads) or all written data in the
+zone has been read (read workloads). This parameter is valid only for
+sequential workloads and ignored for random workloads. For read workloads,
+see also \fBread_beyond_wp\fR.
+
+.TP
+.BI read_beyond_wp \fR=\fPbool
+This parameter applies to \fBzonemode=zbd\fR only.
+
+Zoned block devices are block devices that consist of multiple zones. Each
+zone has a type, e.g. conventional or sequential.
+A conventional zone can be
+written at any offset that is a multiple of the block size. Sequential zones
+must be written sequentially. The position at which a write must occur is
+called the write pointer. A zoned block device can be either host managed or
+host aware. For host managed devices the host must ensure that writes happen
+sequentially. Fio recognizes host managed devices and serializes writes to
+sequential zones for these devices.
+
+If a read occurs in a sequential zone beyond the write pointer then the zoned
+block device will complete the read without reading any data from the storage
+medium. Since such reads lead to unrealistically high bandwidth and IOPS
+numbers, fio only reads beyond the write pointer if explicitly told to do
+so. Default: false.
+.TP
+.BI max_open_zones \fR=\fPint
+When running a random write test across an entire drive many more zones will be
+open than in a typical application workload. Hence this option, which allows
+one to limit the number of open zones. The number of open zones is
+defined as the number of zones to which write commands are issued.
+.TP
+.BI zone_reset_threshold \fR=\fPfloat
+A number between zero and one that indicates the ratio of logical blocks with
+data to the total number of logical blocks in the test above which zones
+should be reset periodically.
+.TP
+.BI zone_reset_frequency \fR=\fPfloat
+A number between zero and one that indicates how often a zone reset should be
+issued if the zone reset threshold has been exceeded. A zone reset is
+submitted after each (1 / zone_reset_frequency) write requests. This and the
+previous parameter can be used to simulate garbage collection activity.
+
+.SS "I/O type"
+.TP
+.BI direct \fR=\fPbool
+If value is true, use non-buffered I/O. This is usually O_DIRECT. Note that
+OpenBSD and ZFS on Solaris don't support direct I/O. On Windows the synchronous
+ioengines don't support direct I/O. Default: false.
+.TP
+.BI atomic \fR=\fPbool
+If value is true, attempt to use atomic direct I/O. Atomic writes are
+guaranteed to be stable once acknowledged by the operating system. Only
+Linux supports O_ATOMIC right now.
+.TP
+.BI buffered \fR=\fPbool
+If value is true, use buffered I/O. This is the opposite of the
+\fBdirect\fR option. Defaults to true.
+.TP
+.BI readwrite \fR=\fPstr "\fR,\fP rw" \fR=\fPstr
+Type of I/O pattern. Accepted values are:
+.RS
+.RS
+.TP
+.B read
+Sequential reads.
+.TP
+.B write
+Sequential writes.
+.TP
+.B trim
+Sequential trims (Linux block devices and SCSI character devices only).
+.TP
+.B randread
+Random reads.
+.TP
+.B randwrite
+Random writes.
+.TP
+.B randtrim
+Random trims (Linux block devices and SCSI character devices only).
+.TP
+.B rw,readwrite
+Sequential mixed reads and writes.
+.TP
+.B randrw
+Random mixed reads and writes.
+.TP
+.B trimwrite
+Sequential trim+write sequences. Blocks will be trimmed first,
+then the same blocks will be written to.
+.RE
+.P
+Fio defaults to read if the option is not specified. For the mixed I/O
+types, the default is to split them 50/50. For certain types of I/O the
+result may still be skewed a bit, since the speed may be different.
+.P
+It is possible to specify the number of I/Os to do before getting a new
+offset by appending `:<nr>' to the end of the string given. For a
+random read, it would look like `rw=randread:8' for passing in an offset
+modifier with a value of 8.
+If the suffix is used with a sequential I/O
+pattern, then the `<nr>' value specified will be added to the generated
+offset for each I/O, turning sequential I/O into sequential I/O with holes.
+For instance, using `rw=write:4k' will skip 4k for every write. Also see
+the \fBrw_sequencer\fR option.
+.RE
+.TP
+.BI rw_sequencer \fR=\fPstr
+If an offset modifier is given by appending a number to the `rw=\fIstr\fR'
+line, then this option controls how that number modifies the I/O offset
+being generated. Accepted values are:
+.RS
+.RS
+.TP
+.B sequential
+Generate sequential offset.
+.TP
+.B identical
+Generate the same offset.
+.RE
+.P
+\fBsequential\fR is only useful for random I/O, where fio would normally
+generate a new random offset for every I/O. If you append e.g. 8 to randread,
+you would get a new random offset for every 8 I/Os. The result would be a
+seek for only every 8 I/Os, instead of for every I/O. Use `rw=randread:8'
+to specify that. As sequential I/O is already sequential, setting
+\fBsequential\fR for that would not result in any differences. \fBidentical\fR
+behaves in a similar fashion, except it sends the same offset 8 times
+before generating a new offset.
+.RE
+.TP
+.BI unified_rw_reporting \fR=\fPbool
+Fio normally reports statistics on a per data direction basis, meaning that
+reads, writes, and trims are accounted and reported separately. If this
+option is set, fio sums the results and reports them as "mixed" instead.
+.TP
+.BI randrepeat \fR=\fPbool
+Seed the random number generator used for random I/O patterns in a
+predictable way so the pattern is repeatable across runs. Default: true.
+.TP
+.BI allrandrepeat \fR=\fPbool
+Seed all random number generators in a predictable way so results are
+repeatable across runs. Default: false.
+.TP
+.BI randseed \fR=\fPint
+Seed the random number generators based on this seed value, to be able to
+control what sequence of output is being generated. If not set, the random
+sequence depends on the \fBrandrepeat\fR setting.
+.TP
+.BI fallocate \fR=\fPstr
+Whether pre-allocation is performed when laying down files.
+Accepted values are:
+.RS
+.RS
+.TP
+.B none
+Do not pre-allocate space.
+.TP
+.B native
+Use a platform's native pre-allocation call but fall back to
+\fBnone\fR behavior if it fails/is not implemented.
+.TP
+.B posix
+Pre-allocate via \fBposix_fallocate\fR\|(3).
+.TP
+.B keep
+Pre-allocate via \fBfallocate\fR\|(2) with
+FALLOC_FL_KEEP_SIZE set.
+.TP
+.B truncate
+Extend file to final size using \fBftruncate\fR\|(2)
+instead of allocating.
+.TP
+.B 0
+Backward-compatible alias for \fBnone\fR.
+.TP
+.B 1
+Backward-compatible alias for \fBposix\fR.
+.RE
+.P
+May not be available on all supported platforms. \fBkeep\fR is only available
+on Linux. If using ZFS on Solaris this cannot be set to \fBposix\fR
+because ZFS doesn't support pre-allocation. Default: \fBnative\fR if any
+pre-allocation methods except \fBtruncate\fR are available, \fBnone\fR if not.
+.P
+Note that using \fBtruncate\fR on Windows will interact surprisingly
+with non-sequential write patterns. When writing to a file that has
+been extended by setting the end-of-file information, Windows will
+backfill the unwritten portion of the file up to that offset with
+zeroes before issuing the new write. This means that a single small
+write to the end of an extended file will stall until the entire
+file has been filled with zeroes.
+.RE
+.TP
+.BI fadvise_hint \fR=\fPstr
+Use \fBposix_fadvise\fR\|(2) or \fBposix_madvise\fR\|(2) to advise the kernel
+what I/O patterns are likely to be issued. Accepted values are:
+.RS
+.RS
+.TP
+.B 0
+Backwards compatible hint for "no hint".
+.TP
+.B 1
+Backwards compatible hint for "advise with fio workload type". This
+uses FADV_RANDOM for a random workload, and FADV_SEQUENTIAL
+for a sequential workload.
+.TP
+.B sequential
+Advise using FADV_SEQUENTIAL.
+.TP
+.B random
+Advise using FADV_RANDOM.
+.RE
+.RE
+.TP
+.BI write_hint \fR=\fPstr
+Use \fBfcntl\fR\|(2) to advise the kernel what life time to expect
+from a write. Only supported on Linux, as of version 4.13. Accepted
+values are:
+.RS
+.RS
+.TP
+.B none
+No particular life time associated with this file.
+.TP
+.B short
+Data written to this file has a short life time.
+.TP
+.B medium
+Data written to this file has a medium life time.
+.TP
+.B long
+Data written to this file has a long life time.
+.TP
+.B extreme
+Data written to this file has a very long life time.
+.RE
+.P
+The values are all relative to each other, and no absolute meaning
+should be associated with them.
+.RE
+.TP
+.BI offset \fR=\fPint
+Start I/O at the provided offset in the file, given as either a fixed size in
+bytes or a percentage. If a percentage is given, the generated offset will be
+aligned to the minimum \fBblocksize\fR or to the value of \fBoffset_align\fR if
+provided. Data before the given offset will not be touched. This
+effectively caps the file size at `real_size \- offset'. Can be combined with
+\fBsize\fR to constrain the start and end range of the I/O workload.
+A percentage can be specified by a number between 1 and 100 followed by '%',
+for example, `offset=20%' to specify 20%.
+.TP
+.BI offset_align \fR=\fPint
+If set to a non-zero value, the byte offset generated by a percentage \fBoffset\fR
+is aligned upwards to this value. Defaults to 0 meaning that a percentage
+offset is aligned to the minimum block size.
+.TP
+.BI offset_increment \fR=\fPint
+If this is provided, then the real offset becomes `\fBoffset\fR + \fBoffset_increment\fR
+* thread_number', where the thread number is a counter that starts at 0 and
+is incremented for each sub-job (i.e. when \fBnumjobs\fR option is
+specified). This option is useful if there are several jobs which are
+intended to operate on a file in parallel disjoint segments, with even
+spacing between the starting points. Percentages can be used for this option.
+If a percentage is given, the generated offset will be aligned to the minimum
+\fBblocksize\fR or to the value of \fBoffset_align\fR if provided.
+.TP
+.BI number_ios \fR=\fPint
+Fio will normally perform I/Os until it has exhausted the size of the region
+set by \fBsize\fR, or if it exhausts the allocated time (or hits an error
+condition). With this setting, the range/size can be set independently of
+the number of I/Os to perform. When fio reaches this number, it will exit
+normally and report status. Note that this does not extend the amount of I/O
+that will be done, it will only stop fio if this condition is met before
+other end-of-job criteria.
+.TP
+.BI fsync \fR=\fPint
+If writing to a file, issue an \fBfsync\fR\|(2) (or its equivalent) of
+the dirty data for every number of blocks given. For example, if you give 32
+as a parameter, fio will sync the file after every 32 writes issued. If fio is
+using non-buffered I/O, we may not sync the file. The exception is the sg
+I/O engine, which synchronizes the disk cache anyway.
+Defaults to 0, which
+means fio does not periodically issue and wait for a sync to complete. Also
+see \fBend_fsync\fR and \fBfsync_on_close\fR.
+.TP
+.BI fdatasync \fR=\fPint
+Like \fBfsync\fR but uses \fBfdatasync\fR\|(2) to only sync data and
+not metadata blocks. On Windows, FreeBSD, DragonFlyBSD or OSX there is no
+\fBfdatasync\fR\|(2) so this falls back to using \fBfsync\fR\|(2).
+Defaults to 0, which means fio does not periodically issue and wait for a
+data-only sync to complete.
+.TP
+.BI write_barrier \fR=\fPint
+Make every N\-th write a barrier write.
+.TP
+.BI sync_file_range \fR=\fPstr:int
+Use \fBsync_file_range\fR\|(2) for every \fIint\fR number of write
+operations. Fio will track the range of writes that have happened since the last
+\fBsync_file_range\fR\|(2) call. \fIstr\fR can currently be one or more of:
+.RS
+.RS
+.TP
+.B wait_before
+SYNC_FILE_RANGE_WAIT_BEFORE
+.TP
+.B write
+SYNC_FILE_RANGE_WRITE
+.TP
+.B wait_after
+SYNC_FILE_RANGE_WAIT_AFTER
+.RE
+.P
+So if you do `sync_file_range=wait_before,write:8', fio would use
+`SYNC_FILE_RANGE_WAIT_BEFORE | SYNC_FILE_RANGE_WRITE' for every 8
+writes. Also see the \fBsync_file_range\fR\|(2) man page. This option is
+Linux specific.
+.RE
+.TP
+.BI overwrite \fR=\fPbool
+If true, writes to a file will always overwrite existing data. If the file
+doesn't already exist, it will be created before the write phase begins. If
+the file exists and is large enough for the specified write phase, nothing
+will be done. Default: false.
+.TP
+.BI end_fsync \fR=\fPbool
+If true, \fBfsync\fR\|(2) file contents when a write stage has completed.
+Default: false.
+.TP
+.BI fsync_on_close \fR=\fPbool
+If true, fio will \fBfsync\fR\|(2) a dirty file on close. This differs
+from \fBend_fsync\fR in that it will happen on every file close, not
+just at the end of the job. Default: false.
+.TP
+.BI rwmixread \fR=\fPint
+Percentage of a mixed workload that should be reads. Default: 50.
+.TP
+.BI rwmixwrite \fR=\fPint
+Percentage of a mixed workload that should be writes. If both
+\fBrwmixread\fR and \fBrwmixwrite\fR are given and the values do not
+add up to 100%, the latter of the two will be used to override the
+first. This may interfere with a given rate setting, if fio is asked to
+limit reads or writes to a certain rate. If that is the case, then the
+distribution may be skewed. Default: 50.
+.TP
+.BI random_distribution \fR=\fPstr:float[,str:float][,str:float]
+By default, fio will use a completely uniform random distribution when asked
+to perform random I/O. Sometimes it is useful to skew the distribution in
+specific ways, ensuring that some parts of the data are hotter than others.
+fio includes the following distribution models:
+.RS
+.RS
+.TP
+.B random
+Uniform random distribution
+.TP
+.B zipf
+Zipf distribution
+.TP
+.B pareto
+Pareto distribution
+.TP
+.B normal
+Normal (Gaussian) distribution
+.TP
+.B zoned
+Zoned random distribution
+.TP
+.B zoned_abs
+Zoned absolute random distribution
+.RE
+.P
+When using a \fBzipf\fR or \fBpareto\fR distribution, an input value is also
+needed to define the access pattern. For \fBzipf\fR, this is the `Zipf theta'.
+For \fBpareto\fR, it's the `Pareto power'. Fio includes a test
+program, \fBfio\-genzipf\fR, that can be used to visualize what the given input
+values will yield in terms of hit rates. If you wanted to use \fBzipf\fR with
+a `theta' of 1.2, you would use `random_distribution=zipf:1.2' as the
+option. If a non\-uniform model is used, fio will disable use of the random
+map.
+For the \fBnormal\fR distribution, a normal (Gaussian) deviation is
+supplied as a value between 0 and 100.
+.P
+For a \fBzoned\fR distribution, fio supports specifying percentages of I/O
+access that should fall within what range of the file or device. For
+example, given the following criteria:
+.RS
+.P
+.PD 0
+60% of accesses should be to the first 10%
+.P
+30% of accesses should be to the next 20%
+.P
+8% of accesses should be to the next 30%
+.P
+2% of accesses should be to the next 40%
+.PD
+.RE
+.P
+we can define that through zoning of the random accesses. For the above
+example, the user would do:
+.RS
+.P
+random_distribution=zoned:60/10:30/20:8/30:2/40
+.RE
+.P
+A \fBzoned_abs\fR distribution works exactly like \fBzoned\fR, except that
+it takes absolute sizes. For example, let's say you wanted to define access
+according to the following criteria:
+.RS
+.P
+.PD 0
+60% of accesses should be to the first 20G
+.P
+30% of accesses should be to the next 100G
+.P
+10% of accesses should be to the next 500G
+.PD
+.RE
+.P
+we can define an absolute zoning distribution with:
+.RS
+.P
+random_distribution=zoned_abs:60/20G:30/100G:10/500G
+.RE
+.P
+For both \fBzoned\fR and \fBzoned_abs\fR, fio supports defining up to 256
+separate zones.
+.P
+Zone specifications work similarly to how \fBbssplit\fR sets ranges and
+percentages of block sizes. Like \fBbssplit\fR, it's possible to specify
+separate zones for reads, writes, and trims. If just one set is given, it'll
+apply to all of them.
+.RE
+.TP
+.BI percentage_random \fR=\fPint[,int][,int]
+For a random workload, set how big a percentage should be random. This
+defaults to 100%, in which case the workload is fully random. It can be set
+from anywhere from 0 to 100. Setting it to 0 would make the workload fully
+sequential. Any setting in between will result in a random mix of sequential
+and random I/O, at the given percentages. Comma-separated values may be
+specified for reads, writes, and trims as described in \fBblocksize\fR.
+.TP
+.BI norandommap
+Normally fio will cover every block of the file when doing random I/O. If
+this option is given, fio will just get a new random offset without looking
+at past I/O history. This means that some blocks may not be read or written,
+and that some blocks may be read/written more than once. If this option is
+used with \fBverify\fR and multiple blocksizes (via \fBbsrange\fR),
+only intact blocks are verified, i.e., partially-overwritten blocks are
+ignored. With an async I/O engine and an I/O depth > 1, it is possible for
+the same block to be overwritten, which can cause verification errors. Either
+do not use norandommap in this case, or also use the lfsr random generator.
+.TP
+.BI softrandommap \fR=\fPbool
+See \fBnorandommap\fR. If fio runs with the random block map enabled and
+it fails to allocate the map, setting this option will let the job continue
+without a random block map. As coverage will not be as complete as with
+random maps, this option is disabled by default.
+.TP
+.BI random_generator \fR=\fPstr
+Fio supports the following engines for generating I/O offsets for random I/O:
+.RS
+.RS
+.TP
+.B tausworthe
+Strong 2^88 cycle random number generator.
+.TP
+.B lfsr
+Linear feedback shift register generator.
+.TP
+.B tausworthe64
+Strong 64\-bit 2^258 cycle random number generator.
+.RE
+.P
+\fBtausworthe\fR is a strong random number generator, but it requires tracking
+on the side if we want to ensure that blocks are only read or written
+once.
+\fBlfsr\fR guarantees that we never generate the same offset twice, and
+it's also less computationally expensive. It's not a true random generator,
+but for I/O purposes it's typically good enough. \fBlfsr\fR only
+works with single block sizes, not with workloads that use multiple block
+sizes. If used with such a workload, fio may read or write some blocks
+multiple times. The default value is \fBtausworthe\fR, unless the required
+space exceeds 2^32 blocks. If it does, then \fBtausworthe64\fR is
+selected automatically.
+.RE
+.SS "Block size"
+.TP
+.BI blocksize \fR=\fPint[,int][,int] "\fR,\fB bs" \fR=\fPint[,int][,int]
+The block size in bytes used for I/O units. Default: 4096. A single value
+applies to reads, writes, and trims. Comma-separated values may be
+specified for reads, writes, and trims. A value not terminated in a comma
+applies to subsequent types. Examples:
+.RS
+.RS
+.P
+.PD 0
+bs=256k means 256k for reads, writes and trims.
+.P
+bs=8k,32k means 8k for reads, 32k for writes and trims.
+.P
+bs=8k,32k, means 8k for reads, 32k for writes, and default for trims.
+.P
+bs=,8k means default for reads, 8k for writes and trims.
+.P
+bs=,8k, means default for reads, 8k for writes, and default for trims.
+.PD
+.RE
+.RE
+.TP
+.BI blocksize_range \fR=\fPirange[,irange][,irange] "\fR,\fB bsrange" \fR=\fPirange[,irange][,irange]
+A range of block sizes in bytes for I/O units. The issued I/O unit will
+always be a multiple of the minimum size, unless
+\fBblocksize_unaligned\fR is set.
+Comma-separated ranges may be specified for reads, writes, and trims as
+described in \fBblocksize\fR. Example:
+.RS
+.RS
+.P
+bsrange=1k\-4k,2k\-8k
+.RE
+.RE
+.TP
+.BI bssplit \fR=\fPstr[,str][,str]
+Sometimes you want even finer grained control of the block sizes issued, not
+just an even split between them. This option allows you to weight various
+block sizes, so that you are able to define a specific mix of block sizes to
+issue. The format for this option is:
+.RS
+.RS
+.P
+bssplit=blocksize/percentage:blocksize/percentage
+.RE
+.P
+for as many block sizes as needed. So if you want to define a workload that
+has 50% 64k blocks, 10% 4k blocks, and 40% 32k blocks, you would write:
+.RS
+.P
+bssplit=4k/10:64k/50:32k/40
+.RE
+.P
+Ordering does not matter. If the percentage is left blank, fio will fill in
+the remaining values evenly. So a bssplit option like this one:
+.RS
+.P
+bssplit=4k/50:1k/:32k/
+.RE
+.P
+would have 50% 4k ios, and 25% 1k and 32k ios. The percentages always add up
+to 100; if bssplit is given a range that adds up to more, it will error out.
+.P
+Comma-separated values may be specified for reads, writes, and trims as
+described in \fBblocksize\fR.
+.P
+If you want a workload that has 50% 2k reads and 50% 4k reads, while having
+90% 4k writes and 10% 8k writes, you would specify:
+.RS
+.P
+bssplit=2k/50:4k/50,4k/90:8k/10
+.RE
+.P
+Fio supports defining up to 64 different weights for each data direction.
+.RE
+.TP
+.BI blocksize_unaligned "\fR,\fB bs_unaligned"
+If set, fio will issue I/O units with any size within
+\fBblocksize_range\fR, not just multiples of the minimum size. This
+typically won't work with direct I/O, as that normally requires sector
+alignment.
+.TP
+.BI bs_is_seq_rand \fR=\fPbool
+If this option is set, fio will use the normal read,write blocksize settings
+as sequential,random blocksize settings instead. Any random read or write
+will use the WRITE blocksize settings, and any sequential read or write will
+use the READ blocksize settings.
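+.P
+For example (an illustrative sketch of the semantics above), with:
+.RS
+.P
+.PD 0
+rw=rw
+.P
+percentage_random=50
+.P
+bs=4k,64k
+.P
+bs_is_seq_rand=1
+.PD
+.RE
+.P
+the random half of the I/O would use the 64k (WRITE) setting, while the
+sequential half would use the 4k (READ) setting.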
+.TP
+.BI blockalign \fR=\fPint[,int][,int] "\fR,\fB ba" \fR=\fPint[,int][,int]
+Boundary to which fio will align random I/O units. Default:
+\fBblocksize\fR. Minimum alignment is typically 512b for using direct
+I/O, though it usually depends on the hardware block size. This option is
+mutually exclusive with using a random map for files, so it will turn off
+that option. Comma-separated values may be specified for reads, writes, and
+trims as described in \fBblocksize\fR.
+.SS "Buffers and memory"
+.TP
+.BI zero_buffers
+Initialize buffers with all zeros. Default: fill buffers with random data.
+.TP
+.BI refill_buffers
+If this option is given, fio will refill the I/O buffers on every
+submit. The default is to only fill it at init time and reuse that
+data. Only makes sense if zero_buffers isn't specified, naturally. If data
+verification is enabled, \fBrefill_buffers\fR is also automatically enabled.
+.TP
+.BI scramble_buffers \fR=\fPbool
+If \fBrefill_buffers\fR is too costly and the target is using data
+deduplication, then setting this option will slightly modify the I/O buffer
+contents to defeat normal de-dupe attempts. This is not enough to defeat
+more clever block compression attempts, but it will stop naive dedupe of
+blocks. Default: true.
+.TP
+.BI buffer_compress_percentage \fR=\fPint
+If this is set, then fio will attempt to provide I/O buffer content
+(on WRITEs) that compresses to the specified level. Fio does this by
+providing a mix of random data followed by fixed pattern data. The
+fixed pattern is either zeros, or the pattern specified by
+\fBbuffer_pattern\fR. If the \fBbuffer_pattern\fR option is used, it
+might skew the compression ratio slightly. Setting
+\fBbuffer_compress_percentage\fR to a value other than 100 will also
+enable \fBrefill_buffers\fR in order to reduce the likelihood that
+adjacent blocks are so similar that they over compress when seen
+together. See \fBbuffer_compress_chunk\fR for how to set a finer or
+coarser granularity of the random/fixed data regions. Defaults to unset,
+i.e., buffer data will not adhere to any compression level.
+.TP
+.BI buffer_compress_chunk \fR=\fPint
+This setting allows fio to manage how big the random/fixed data region
+is when using \fBbuffer_compress_percentage\fR. When
+\fBbuffer_compress_chunk\fR is set to some non-zero value smaller than the
+block size, fio can repeat the random/fixed region throughout the I/O
+buffer at the specified interval (which is particularly useful when
+bigger block sizes are used for a job). When set to 0, fio will use a
+chunk size that matches the block size resulting in a single
+random/fixed region within the I/O buffer. Defaults to 512. When the
+unit is omitted, the value is interpreted in bytes.
+.TP
+.BI buffer_pattern \fR=\fPstr
+If set, fio will fill the I/O buffers with this pattern or with the contents
+of a file. If not set, the contents of I/O buffers are defined by the other
+options related to buffer contents. The setting can be any pattern of bytes,
+and can be prefixed with 0x for hex values. It may also be a string, where
+the string must then be wrapped with "". Or it may also be a filename,
+where the filename must be wrapped with '' in which case the file is
+opened and read. Note that not all the file contents will be read if that
+would cause the buffers to overflow.
+So, for example:
+.RS
+.RS
+.P
+.PD 0
+buffer_pattern='filename'
+.P
+or:
+.P
+buffer_pattern="abcd"
+.P
+or:
+.P
+buffer_pattern=\-12
+.P
+or:
+.P
+buffer_pattern=0xdeadface
+.PD
+.RE
+.P
+Also you can combine everything together in any order:
+.RS
+.P
+buffer_pattern=0xdeadface"abcd"\-12'filename'
+.RE
+.RE
+.TP
+.BI dedupe_percentage \fR=\fPint
+If set, fio will generate this percentage of identical buffers when
+writing. These buffers will be naturally dedupable. The contents of the
+buffers depend on what other buffer compression settings have been set. It's
+possible to have the individual buffers either fully compressible, or not at
+all \-\- this option only controls the distribution of unique buffers. Setting
+this option will also enable \fBrefill_buffers\fR to prevent every buffer
+being identical.
+.TP
+.BI invalidate \fR=\fPbool
+Invalidate the buffer/page cache parts of the files to be used prior to
+starting I/O if the platform and file type support it. Defaults to true.
+This will be ignored if \fBpre_read\fR is also specified for the
+same job.
+.TP
+.BI sync \fR=\fPbool
+Use synchronous I/O for buffered writes. For the majority of I/O engines,
+this means using O_SYNC. Default: false.
+.TP
+.BI iomem \fR=\fPstr "\fR,\fP mem" \fR=\fPstr
+Fio can use various types of memory as the I/O unit buffer. The allowed
+values are:
+.RS
+.RS
+.TP
+.B malloc
+Use memory from \fBmalloc\fR\|(3) as the buffers. Default memory type.
+.TP
+.B shm
+Use shared memory as the buffers. Allocated through \fBshmget\fR\|(2).
+.TP
+.B shmhuge
+Same as \fBshm\fR, but use huge pages as backing.
+.TP
+.B mmap
+Use \fBmmap\fR\|(2) to allocate buffers. May either be anonymous memory, or can
+be file backed if a filename is given after the option. The format
+is `mem=mmap:/path/to/file'.
+.TP
+.B mmaphuge
+Use a memory mapped huge file as the buffer backing. Append filename
+after mmaphuge, ala `mem=mmaphuge:/hugetlbfs/file'.
+.TP
+.B mmapshared
+Same as \fBmmap\fR, but use a MAP_SHARED mapping.
+.TP
+.B cudamalloc
+Use GPU memory as the buffers for GPUDirect RDMA benchmark.
+The \fBioengine\fR must be \fBrdma\fR.
+.RE
+.P
+The area allocated is a function of the maximum allowed bs size for the job,
+multiplied by the I/O depth given. Note that for \fBshmhuge\fR and
+\fBmmaphuge\fR to work, the system must have free huge pages allocated. This
+can normally be checked and set by reading/writing
+`/proc/sys/vm/nr_hugepages' on a Linux system. Fio assumes a huge page
+is 4MiB in size. So to calculate the number of huge pages you need for a
+given job file, add up the I/O depth of all jobs (normally one unless
+\fBiodepth\fR is used) and multiply by the maximum bs set. Then divide
+that number by the huge page size. You can see the size of the huge pages in
+`/proc/meminfo'. If no huge pages have been allocated by writing a non-zero
+number to `nr_hugepages', using \fBmmaphuge\fR or \fBshmhuge\fR will fail. Also
+see \fBhugepage\-size\fR.
+.P
+\fBmmaphuge\fR also needs to have hugetlbfs mounted and the file location
+should point there. So if it's mounted in `/huge', you would use
+`mem=mmaphuge:/huge/somefile'.
+.RE
+.TP
+.BI iomem_align \fR=\fPint "\fR,\fP mem_align" \fR=\fPint
+This indicates the memory alignment of the I/O memory buffers. Note that
+the given alignment is applied to the first I/O unit buffer; if using
+\fBiodepth\fR, the alignment of the following buffers is given by the
+\fBbs\fR used.
+In other words, if using a \fBbs\fR that is a
+multiple of the page size in the system, all buffers will be aligned to
+this value. If using a \fBbs\fR that is not page aligned, the alignment
+of subsequent I/O memory buffers is the sum of the \fBiomem_align\fR and
+\fBbs\fR used.
+.TP
+.BI hugepage\-size \fR=\fPint
+Defines the size of a huge page. Must be at least equal to the system
+setting, see `/proc/meminfo'. Defaults to 4MiB. Should probably
+always be a multiple of megabytes, so using `hugepage\-size=Xm' is the
+preferred way to set this to avoid setting a bad non-pow-2 value.
+.TP
+.BI lockmem \fR=\fPint
+Pin the specified amount of memory with \fBmlock\fR\|(2). Can be used to
+simulate a smaller amount of memory. The amount specified is per worker.
+.SS "I/O size"
+.TP
+.BI size \fR=\fPint
+The total size of file I/O for each thread of this job. Fio will run until
+this many bytes have been transferred, unless runtime is limited by other options
+(such as \fBruntime\fR, for instance, or increased/decreased by \fBio_size\fR).
+Fio will divide this size between the available files determined by options
+such as \fBnrfiles\fR and \fBfilename\fR, unless \fBfilesize\fR is
+specified by the job. If the result of division happens to be 0, the size is
+set to the physical size of the given files or devices if they exist.
+If this option is not specified, fio will use the full size of the given
+files or devices. If the files do not exist, size must be given. It is also
+possible to give size as a percentage between 1 and 100. If `size=20%' is
+given, fio will use 20% of the full size of the given files or devices.
+Can be combined with \fBoffset\fR to constrain the start and end range
+that I/O will be done within.
+.TP
+.BI io_size \fR=\fPint "\fR,\fB io_limit" \fR=\fPint
+Normally fio operates within the region set by \fBsize\fR, which means
+that the \fBsize\fR option sets both the region and size of I/O to be
+performed. Sometimes that is not what you want. With this option, it is
+possible to define just the amount of I/O that fio should do. For instance,
+if \fBsize\fR is set to 20GiB and \fBio_size\fR is set to 5GiB, fio
+will perform I/O within the first 20GiB but exit when 5GiB have been
+done. The opposite is also possible \-\- if \fBsize\fR is set to 20GiB,
+and \fBio_size\fR is set to 40GiB, then fio will do 40GiB of I/O within
+the 0..20GiB region.
+.TP
+.BI filesize \fR=\fPirange(int)
+Individual file sizes. May be a range, in which case fio will select sizes
+for files at random within the given range and limited to \fBsize\fR in
+total (if that is given). If not given, each created file is the same size.
+This option overrides \fBsize\fR in terms of file size, which means
+this value is used as a fixed size or possible range of each file.
+.TP
+.BI file_append \fR=\fPbool
+Perform I/O after the end of the file. Normally fio will operate within the
+size of a file. If this option is set, then fio will append to the file
+instead. This has identical behavior to setting \fBoffset\fR to the size
+of a file. This option is ignored on non-regular files.
+.TP
+.BI fill_device \fR=\fPbool "\fR,\fB fill_fs" \fR=\fPbool
+Sets size to something really large and waits for ENOSPC (no space left on
+device) as the terminating condition. Only makes sense with sequential
+write. For a read workload, the mount point will be filled first then I/O
+started on the result.
+This option doesn't make sense if operating on a raw
+device node, since the size of that is already known by the file system.
+Additionally, writing beyond end-of-device will not return ENOSPC there.
+.SS "I/O engine"
+.TP
+.BI ioengine \fR=\fPstr
+Defines how the job issues I/O to the file. The following types are defined:
+.RS
+.RS
+.TP
+.B sync
+Basic \fBread\fR\|(2) or \fBwrite\fR\|(2)
+I/O. \fBlseek\fR\|(2) is used to position the I/O location.
+See \fBfsync\fR and \fBfdatasync\fR for syncing write I/Os.
+.TP
+.B psync
+Basic \fBpread\fR\|(2) or \fBpwrite\fR\|(2) I/O. Default on
+all supported operating systems except for Windows.
+.TP
+.B vsync
+Basic \fBreadv\fR\|(2) or \fBwritev\fR\|(2) I/O. Will emulate
+queuing by coalescing adjacent I/Os into a single submission.
+.TP
+.B pvsync
+Basic \fBpreadv\fR\|(2) or \fBpwritev\fR\|(2) I/O.
+.TP
+.B pvsync2
+Basic \fBpreadv2\fR\|(2) or \fBpwritev2\fR\|(2) I/O.
+.TP
+.B libaio
+Linux native asynchronous I/O. Note that Linux may only support
+queued behavior with non-buffered I/O (set `direct=1' or
+`buffered=0').
+This engine defines engine specific options.
+.TP
+.B posixaio
+POSIX asynchronous I/O using \fBaio_read\fR\|(3) and
+\fBaio_write\fR\|(3).
+.TP
+.B solarisaio
+Solaris native asynchronous I/O.
+.TP
+.B windowsaio
+Windows native asynchronous I/O. Default on Windows.
+.TP
+.B mmap
+File is memory mapped with \fBmmap\fR\|(2) and data copied
+to/from using \fBmemcpy\fR\|(3).
+.TP
+.B splice
+\fBsplice\fR\|(2) is used to transfer the data and
+\fBvmsplice\fR\|(2) to transfer data from user space to the
+kernel.
+.TP
+.B sg
+SCSI generic sg v3 I/O. May either be synchronous using the SG_IO
+ioctl, or if the target is an sg character device we use
+\fBread\fR\|(2) and \fBwrite\fR\|(2) for asynchronous
+I/O. Requires the \fBfilename\fR option to specify either block or
+character devices. This engine supports trim operations. The
+sg engine includes engine specific options.
+.TP
+.B null
+Doesn't transfer any data, just pretends to. This is mainly used to
+exercise fio itself and for debugging/testing purposes.
+.TP
+.B net
+Transfer over the network to given `host:port'. Depending on the
+\fBprotocol\fR used, the \fBhostname\fR, \fBport\fR,
+\fBlisten\fR and \fBfilename\fR options are used to specify
+what sort of connection to make, while the \fBprotocol\fR option
+determines which protocol will be used. This engine defines engine
+specific options.
+.TP
+.B netsplice
+Like \fBnet\fR, but uses \fBsplice\fR\|(2) and
+\fBvmsplice\fR\|(2) to map data and send/receive.
+This engine defines engine specific options.
+.TP
+.B cpuio
+Doesn't transfer any data, but burns CPU cycles according to the
+\fBcpuload\fR and \fBcpuchunks\fR options. Setting
+\fBcpuload\fR=85 will cause that job to do nothing but burn 85%
+of the CPU. In case of SMP machines, use `numjobs=<nr_of_cpus>'
+to get the desired CPU usage, as the cpuload only loads a
+single CPU at the desired rate. A job never finishes unless there is
+at least one non-cpuio job.
+.TP
+.B guasi
+The GUASI I/O engine is the Generic Userspace Asynchronous Syscall
+Interface approach to async I/O. See \fIhttp://www.xmailserver.org/guasi-lib.html\fR
+for more info on GUASI.
+.TP
+.B rdma
+The RDMA I/O engine supports both RDMA memory semantics
+(RDMA_WRITE/RDMA_READ) and channel semantics (Send/Recv) for the
+InfiniBand, RoCE and iWARP protocols. This engine defines engine
+specific options.
+.TP
+.B falloc
+I/O engine that does regular fallocate calls to simulate data transfer as
+a fio ioengine.
+.RS
+.P
+.PD 0
+DDIR_READ does fallocate(,mode = FALLOC_FL_KEEP_SIZE,).
+.P
+DDIR_WRITE does fallocate(,mode = 0).
+.P
+DDIR_TRIM does fallocate(,mode = FALLOC_FL_KEEP_SIZE|FALLOC_FL_PUNCH_HOLE).
+.PD
+.RE
+.TP
+.B ftruncate
+I/O engine that sends \fBftruncate\fR\|(2) operations in response
+to write (DDIR_WRITE) events. Each ftruncate issued sets the file's
+size to the current block offset. \fBblocksize\fR is ignored.
+.TP
+.B e4defrag
+I/O engine that does regular EXT4_IOC_MOVE_EXT ioctls to simulate
+defragment activity in response to a DDIR_WRITE event.
+.TP
+.B rados
+I/O engine supporting direct access to Ceph Reliable Autonomic Distributed
+Object Store (RADOS) via librados. This ioengine defines engine specific
+options.
+.TP
+.B rbd
+I/O engine supporting direct access to Ceph Rados Block Devices
+(RBD) via librbd without the need to use the kernel rbd driver. This
+ioengine defines engine specific options.
+.TP
+.B http
+I/O engine supporting GET/PUT requests over HTTP(S) with libcurl to
+a WebDAV or S3 endpoint. This ioengine defines engine specific options.
+
+This engine only supports direct I/O with iodepth=1; you need to scale this
+via numjobs. blocksize defines the size of the objects to be created.
+
+TRIM is translated to object deletion.
+.TP
+.B gfapi
+Use the GlusterFS libgfapi sync interface to access GlusterFS
+volumes directly without having to go through FUSE. This ioengine
+defines engine specific options.
+.TP
+.B gfapi_async
+Use the GlusterFS libgfapi async interface to access GlusterFS
+volumes directly without having to go through FUSE. This ioengine
+defines engine specific options.
+.TP
+.B libhdfs
+Read and write through Hadoop (HDFS). The \fBfilename\fR option
+is used to specify the host,port of the HDFS name\-node to connect to. This
+engine interprets offsets a little differently. In HDFS, files once
+created cannot be modified, so random writes are not possible. To
+imitate this, the libhdfs engine expects a bunch of small files to be
+created over HDFS and will randomly pick a file from them
+based on the offset generated by the fio backend (see the example
+job file to create such files, use the `rw=write' option). Please
+note, it may be necessary to set environment variables to work
+with HDFS/libhdfs properly. Each job uses its own connection to
+HDFS.
+.TP
+.B mtd
+Read, write and erase an MTD character device (e.g.,
+`/dev/mtd0'). Discards are treated as erases. Depending on the
+underlying device type, the I/O may have to go in a certain pattern,
+e.g., on NAND, writing sequentially to erase blocks and discarding
+before overwriting. The \fBtrimwrite\fR mode works well for this
+constraint.
+.TP
+.B pmemblk
+Read and write using filesystem DAX to a file on a filesystem
+mounted with DAX on a persistent memory device through the PMDK
+libpmemblk library.
+.TP
+.B dev\-dax
+Read and write using device DAX to a persistent memory device (e.g.,
+/dev/dax0.0) through the PMDK libpmem library.
+.TP
+.B external
+Prefix to specify loading an external I/O engine object file. Append
+the engine filename, e.g. `ioengine=external:/tmp/foo.o' to load
+ioengine `foo.o' in `/tmp'. The path can be either
+absolute or relative. See `engines/skeleton_external.c' in the fio source for
+details of writing an external I/O engine.
+.TP
+.B filecreate
+Simply create the files and do no I/O to them. You still need to set
+\fBfilesize\fR so that all the accounting still occurs, but no actual I/O will be
+done other than creating the file.
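+.RS
+.P
+As a brief illustrative sketch (the directory path and counts are
+hypothetical), a job that only exercises file creation might look like:
+.RS
+.P
+.PD 0
+[create\-files]
+.P
+ioengine=filecreate
+.P
+directory=/tmp/fio\-create\-test
+.P
+filesize=4k
+.P
+nrfiles=1000
+.PD
+.RE
+.RE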
+.TP
+.B filestat
+Simply do stat() and do no I/O to the file. You need to set \fBfilesize\fR
+and \fBnrfiles\fR so that files will be created.
+This engine is meant to measure file lookup and metadata access.
+.TP
+.B libpmem
+Read and write using mmap I/O to a file on a filesystem
+mounted with DAX on a persistent memory device through the PMDK
+libpmem library.
+.TP
+.B ime_psync
+Synchronous read and write using DDN's Infinite Memory Engine (IME). This
+engine is very basic and issues calls to IME whenever an IO is queued.
+.TP
+.B ime_psyncv
+Synchronous read and write using DDN's Infinite Memory Engine (IME). This
+engine uses iovecs and will try to stack as many IOs as possible (if the IOs
+are "contiguous" and the IO depth is not exceeded) before issuing a call to IME.
+.TP
+.B ime_aio
+Asynchronous read and write using DDN's Infinite Memory Engine (IME). This
+engine will try to stack as many IOs as possible by creating requests for IME.
+FIO will then decide when to commit these requests.
+.TP
+.B libiscsi
+Read and write an iSCSI lun with libiscsi.
+.TP
+.B nbd
+Synchronous read and write of a Network Block Device (NBD).
+.SS "I/O engine specific parameters"
+In addition, there are some parameters which are only valid when a specific
+\fBioengine\fR is in use. These are used identically to normal parameters,
+with the caveat that when used on the command line, they must come after the
+\fBioengine\fR that defines them is selected.
+.TP
+.BI (io_uring,libaio)cmdprio_percentage \fR=\fPint
+Set the percentage of I/O that will be issued with higher priority by setting
+the priority bit. Non-read I/O is likely unaffected by \fBcmdprio_percentage\fR.
+This option cannot be used with the `prio' or `prioclass' options. For this
+option to set the priority bit properly, NCQ priority must be supported and
+enabled, and the `direct=1' option must be used.
+.TP
+.BI (io_uring)fixedbufs
+If fio is asked to do direct IO, then Linux will map pages for each IO call, and
+release them when IO is done. If this option is set, the pages are pre-mapped
+before IO is started. This eliminates the need to map and release for each IO.
+This is more efficient, and reduces the IO latency as well.
+.TP
+.BI (io_uring)hipri
+If this option is set, fio will attempt to use polled IO completions. Normal IO
+completions generate interrupts to signal the completion of IO, polled
+completions do not. Hence they require active reaping by the application.
+The benefits are more efficient IO for high IOPS scenarios, and lower latencies
+for low queue depth IO.
+.TP
+.BI (io_uring)registerfiles
+With this option, fio registers the set of files being used with the kernel.
+This avoids the overhead of managing file counts in the kernel, making the
+submission and completion part more lightweight. Required for the
+\fBsqthread_poll\fR option below.
+.TP
+.BI (io_uring)sqthread_poll
+Normally fio will submit IO by issuing a system call to notify the kernel of
+available items in the SQ ring. If this option is set, the act of submitting IO
+will be done by a polling thread in the kernel. This frees up cycles for fio, at
+the cost of using more CPU in the system.
+.TP
+.BI (io_uring)sqthread_poll_cpu
+When `sqthread_poll' is set, this option provides a way to define which CPU
+should be used for the polling thread.
+.TP
+.BI (libaio)userspace_reap
+Normally, with the libaio engine in use, fio will use the
+\fBio_getevents\fR\|(3) system call to reap newly returned events.
+With this flag turned on, the AIO ring will be read directly from user-space
+to reap events. The reaping mode is only enabled when polling for a minimum of
+0 events (e.g. when `iodepth_batch_complete=0').
+.TP
+.BI (pvsync2)hipri
+Set RWF_HIPRI on I/O, indicating to the kernel that it's of higher priority
+than normal.
+.TP
+.BI (pvsync2)hipri_percentage
+When hipri is set, this determines the probability of a pvsync2 I/O being high
+priority. The default is 100%.
+.TP
+.BI (cpuio)cpuload \fR=\fPint
+Attempt to use the specified percentage of CPU cycles. This is a mandatory
+option when using the cpuio I/O engine.
+.TP
+.BI (cpuio)cpuchunks \fR=\fPint
+Split the load into cycles of the given time, in microseconds.
+.TP
+.BI (cpuio)exit_on_io_done \fR=\fPbool
+Detect when I/O threads are done, then exit.
+.TP
+.BI (libhdfs)namenode \fR=\fPstr
+The hostname or IP address of an HDFS cluster namenode to contact.
+.TP
+.BI (libhdfs)port
+The listening port of the HDFS cluster namenode.
+.TP
+.BI (netsplice,net)port
+The TCP or UDP port to bind to or connect to. If this is used with
+\fBnumjobs\fR to spawn multiple instances of the same job type, then
+this will be the starting port number, since fio will use a range of
+ports.
+.TP
+.BI (rdma)port
+The port to use for RDMA-CM communication. This should be the same
+value on the client and the server side.
+.TP
+.BI (netsplice,net,rdma)hostname \fR=\fPstr
+The hostname or IP address to use for TCP, UDP or RDMA-CM based I/O.
+If the job is a TCP listener or UDP reader, the hostname is not used
+and must be omitted unless it is a valid UDP multicast address.
+.TP
+.BI (netsplice,net)interface \fR=\fPstr
+The IP address of the network interface used to send or receive UDP
+multicast.
+.TP
+.BI (netsplice,net)ttl \fR=\fPint
+Time\-to\-live value for outgoing UDP multicast packets. Default: 1.
+.TP
+.BI (netsplice,net)nodelay \fR=\fPbool
+Set TCP_NODELAY on TCP connections.
+.TP
+.BI (netsplice,net)protocol \fR=\fPstr "\fR,\fP proto" \fR=\fPstr
+The network protocol to use. Accepted values are:
+.RS
+.RS
+.TP
+.B tcp
+Transmission control protocol.
+.TP
+.B tcpv6
+Transmission control protocol V6.
+.TP
+.B udp
+User datagram protocol.
+.TP
+.B udpv6
+User datagram protocol V6.
+.TP
+.B unix
+UNIX domain socket.
+.RE
+.P
+When the protocol is TCP or UDP, the port must also be given, as well as the
+hostname if the job is a TCP listener or UDP reader. For unix sockets, the
+normal \fBfilename\fR option should be used and the port is invalid.
+.RE
+.TP
+.BI (netsplice,net)listen
+For TCP network connections, tell fio to listen for incoming connections
+rather than initiating an outgoing connection. The \fBhostname\fR must
+be omitted if this option is used.
+.TP
+.BI (netsplice,net)pingpong
+Normally a network writer will just continue writing data, and a network
+reader will just consume packets. If `pingpong=1' is set, a writer will
+send its normal payload to the reader, then wait for the reader to send the
+same payload back. This allows fio to measure network latencies. The
+submission and completion latencies then measure local time spent sending or
+receiving, and the completion latency measures how long it took for the
+other end to receive and send back. For UDP multicast traffic
+`pingpong=1' should only be set for a single reader when multiple readers
+are listening to the same address.
+.TP
+.BI (netsplice,net)window_size \fR=\fPint
+Set the desired socket buffer size for the connection.
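+.RS
+.P
+As an illustrative sketch combining several of the net options above
+(address, port and sizes are placeholders, not recommendations), a TCP
+ping\-pong pair for latency measurement might be started as:
+.RS
+.P
+.PD 0
+server: fio \-\-name=pong \-\-ioengine=net \-\-port=8765 \-\-listen \-\-rw=read \-\-bs=4k \-\-size=100m \-\-pingpong=1
+.P
+client: fio \-\-name=ping \-\-ioengine=net \-\-hostname=192.168.0.2 \-\-port=8765 \-\-rw=write \-\-bs=4k \-\-size=100m \-\-pingpong=1
+.PD
+.RE
+.RE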
+.TP
+.BI (netsplice,net)mss \fR=\fPint
+Set the TCP maximum segment size (TCP_MAXSEG).
+.TP
+.BI (e4defrag)donorname \fR=\fPstr
+File will be used as a block donor (swap extents between files).
+.TP
+.BI (e4defrag)inplace \fR=\fPint
+Configure donor file block allocation strategy:
+.RS
+.RS
+.TP
+.B 0
+Default. Preallocate donor's file on init.
+.TP
+.B 1
+Allocate space immediately inside the defragment event, and free it right
+after the event.
+.RE
+.RE
+.TP
+.BI (rbd,rados)clustername \fR=\fPstr
+Specifies the name of the Ceph cluster.
+.TP
+.BI (rbd)rbdname \fR=\fPstr
+Specifies the name of the RBD.
+.TP
+.BI (rbd,rados)pool \fR=\fPstr
+Specifies the name of the Ceph pool containing RBD or RADOS data.
+.TP
+.BI (rbd,rados)clientname \fR=\fPstr
+Specifies the username (without the 'client.' prefix) used to access the
+Ceph cluster. If the \fBclustername\fR is specified, the \fBclientname\fR shall be
+the full *type.id* string. If no type. prefix is given, fio will add 'client.'
+by default.
+.TP
+.BI (rbd,rados)busy_poll \fR=\fPbool
+Poll store instead of waiting for completion. Usually this provides better
+throughput at the cost of higher (up to 100%) CPU utilization.
+.TP
+.BI (http)http_host \fR=\fPstr
+Hostname to connect to. For S3, this could be the bucket name. Default
+is \fBlocalhost\fR.
+.TP
+.BI (http)http_user \fR=\fPstr
+Username for HTTP authentication.
+.TP
+.BI (http)http_pass \fR=\fPstr
+Password for HTTP authentication.
+.TP
+.BI (http)https \fR=\fPstr
+Whether to use HTTPS instead of plain HTTP. \fRon\fP enables HTTPS;
+\fRinsecure\fP will enable HTTPS, but disable SSL peer verification (use
+with caution!). Default is \fBoff\fR.
+.TP
+.BI (http)http_mode \fR=\fPstr
+Which HTTP access mode to use: webdav, swift, or s3. Default is
+\fBwebdav\fR.
+.TP
+.BI (http)http_s3_region \fR=\fPstr
+The S3 region/zone to include in the request. Default is \fBus-east-1\fR.
+.TP
+.BI (http)http_s3_key \fR=\fPstr
+The S3 secret key.
+.TP
+.BI (http)http_s3_keyid \fR=\fPstr
+The S3 key/access id.
+.TP
+.BI (http)http_swift_auth_token \fR=\fPstr
+The Swift auth token. See the example configuration file on how to
+retrieve this.
+.TP
+.BI (http)http_verbose \fR=\fPint
+Enable verbose requests from libcurl. Useful for debugging. 1 turns on
+verbose logging from libcurl, 2 additionally enables HTTP IO tracing.
+Default is \fB0\fR.
+.TP
+.BI (mtd)skip_bad \fR=\fPbool
+Skip operations against known bad blocks.
+.TP
+.BI (libhdfs)hdfsdirectory
+libhdfs will create chunks in this HDFS directory.
+.TP
+.BI (libhdfs)chunk_size
+The size of the chunk to use for each file.
+.TP
+.BI (rdma)verb \fR=\fPstr
+The RDMA verb to use on this side of the RDMA ioengine
+connection. Valid values are write, read, send and recv. These
+correspond to the equivalent RDMA verbs (e.g. write = rdma_write
+etc.). Note that this only needs to be specified on the client side of
+the connection. See the examples folder.
+.TP
+.BI (rdma)bindname \fR=\fPstr
+The name to use to bind the local RDMA-CM connection to a local RDMA
+device. This could be a hostname or an IPv4 or IPv6 address. On the
+server side this will be passed into the rdma_bind_addr() function and
+on the client side it will be used in the rdma_resolve_addr()
+function. This can be useful when multiple paths exist between the
+client and the server or in certain loopback configurations.
+.TP
+.BI (filestat)stat_type \fR=\fPstr
+Specify the stat system call type to measure lookup/getattr performance.
+Default is \fBstat\fR for \fBstat\fR\|(2).
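+.RS
+.P
+For instance, a minimal sketch of a metadata lookup benchmark (values are
+hypothetical, and it is assumed here that the engine accepts `lstat' as a
+stat_type):
+.RS
+.P
+.PD 0
+[statbench]
+.P
+ioengine=filestat
+.P
+stat_type=lstat
+.P
+filesize=4k
+.P
+nrfiles=10000
+.PD
+.RE
+.RE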
+.TP
+.BI (sg)readfua \fR=\fPbool
+With the readfua option set to 1, read operations include the force
+unit access (fua) flag. Default: 0.
+.TP
+.BI (sg)writefua \fR=\fPbool
+With the writefua option set to 1, write operations include the force
+unit access (fua) flag. Default: 0.
+.TP
+.BI (sg)sg_write_mode \fR=\fPstr
+Specify the type of write commands to issue. This option can take three
+values:
+.RS
+.RS
+.TP
+.B write (default)
+Write opcodes are issued as usual.
+.TP
+.B verify
+Issue WRITE AND VERIFY commands. The BYTCHK bit is set to 0. This
+directs the device to carry out a medium verification with no data
+comparison. The writefua option is ignored with this selection.
+.TP
+.B same
+Issue WRITE SAME commands. This transfers a single block to the device
+and writes this same block of data to a contiguous sequence of LBAs
+beginning at the specified offset. fio's block size parameter
+specifies the amount of data written with each command. However, the
+amount of data actually transferred to the device is equal to the
+device's block (sector) size. For a device with 512 byte sectors,
+blocksize=8k will write 16 sectors with each command. fio will still
+generate 8k of data for each command but only the first 512 bytes will
+be used and transferred to the device. The writefua option is ignored
+with this selection.
+.RE
+.RE
+.TP
+.BI (nbd)uri \fR=\fPstr
+Specify the NBD URI of the server to test.
+The string is a standard NBD URI (see
+\fIhttps://github.com/NetworkBlockDevice/nbd/tree/master/doc\fR).
+Example URIs:
+.RS
+.RS
+.TP
+\fInbd://localhost:10809\fR
+.TP
+\fInbd+unix:///?socket=/tmp/socket\fR
+.TP
+\fInbds://tlshost/exportname\fR
+.RE
+.RE
+.SS "I/O depth"
+.TP
+.BI iodepth \fR=\fPint
+Number of I/O units to keep in flight against the file. Note that
+increasing \fBiodepth\fR beyond 1 will not affect synchronous ioengines (except
+for small degrees when \fBverify_async\fR is in use). Even async
+engines may impose OS restrictions causing the desired depth not to be
+achieved. This may happen on Linux when using libaio and not setting
+`direct=1', since buffered I/O is not async on that OS. Keep an
+eye on the I/O depth distribution in the fio output to verify that the
+achieved depth is as expected. Default: 1.
+.TP
+.BI iodepth_batch_submit \fR=\fPint "\fR,\fP iodepth_batch" \fR=\fPint
+This defines how many pieces of I/O to submit at once. It defaults to 1
+which means that we submit each I/O as soon as it is available, but can be
+raised to submit bigger batches of I/O at a time. If it is set to 0 the
+\fBiodepth\fR value will be used.
+.TP
+.BI iodepth_batch_complete_min \fR=\fPint "\fR,\fP iodepth_batch_complete" \fR=\fPint
+This defines how many pieces of I/O to retrieve at once. It defaults to 1
+which means that we'll ask for a minimum of 1 I/O in the retrieval process
+from the kernel. The I/O retrieval will go on until we hit the limit set by
+\fBiodepth_low\fR. If this variable is set to 0, then fio will always
+check for completed events before queuing more I/O. This helps reduce I/O
+latency, at the cost of more retrieval system calls.
+.TP
+.BI iodepth_batch_complete_max \fR=\fPint
+This defines the maximum pieces of I/O to retrieve at once. This variable should
+be used along with the \fBiodepth_batch_complete_min\fR=\fIint\fR variable,
+specifying the range of min and max amount of I/O which should be
+retrieved. By default it is equal to the \fBiodepth_batch_complete_min\fR
+value.
+Example #1:
+.RS
+.RS
+.P
+.PD 0
+iodepth_batch_complete_min=1
+.P
+iodepth_batch_complete_max=<iodepth>
+.PD
+.RE
+.P
+which means that we will retrieve at least 1 I/O and up to the whole
+submitted queue depth. If no I/O has completed yet, we will wait.
+Example #2:
+.RS
+.P
+.PD 0
+iodepth_batch_complete_min=0
+.P
+iodepth_batch_complete_max=<iodepth>
+.PD
+.RE
+.P
+which means that we can retrieve up to the whole submitted queue depth, but
+if no I/O has completed yet, we will NOT wait and will immediately exit
+the system call. In this example we simply do polling.
+.RE
+.TP
+.BI iodepth_low \fR=\fPint
+The low water mark indicating when to start filling the queue
+again. Defaults to the same as \fBiodepth\fR, meaning that fio will
+attempt to keep the queue full at all times. If \fBiodepth\fR is set to
+e.g. 16 and \fBiodepth_low\fR is set to 4, then after fio has filled the queue of
+16 requests, it will let the depth drain down to 4 before starting to fill
+it again.
+.TP
+.BI serialize_overlap \fR=\fPbool
+Serialize in-flight I/Os that might otherwise cause or suffer from data races.
+When two or more I/Os are submitted simultaneously, there is no guarantee that
+the I/Os will be processed or completed in the submitted order. Further, if
+two or more of those I/Os are writes, any overlapping region between them can
+become indeterminate/undefined on certain storage. These issues can cause
+verification to fail erratically when at least one of the racing I/Os is
+changing data and the overlapping region has a non-zero size. Setting
+\fBserialize_overlap\fR tells fio to avoid provoking this behavior by explicitly
+serializing in-flight I/Os that have a non-zero overlap. Note that setting
+this option can reduce both performance and the \fBiodepth\fR achieved.
+.RS
+.P
+This option only applies to I/Os issued for a single job except when it is
+enabled along with \fBio_submit_mode\fR=offload. In offload mode, fio
+will check for overlap among all I/Os submitted by offload jobs with \fBserialize_overlap\fR
+enabled.
+.P
+Default: false.
+.RE
+.TP
+.BI io_submit_mode \fR=\fPstr
+This option controls how fio submits the I/O to the I/O engine. The default
+is `inline', which means that the fio job threads submit and reap I/O
+directly. If set to `offload', the job threads will offload I/O submission
+to a dedicated pool of I/O threads. This requires some coordination and thus
+has a bit of extra overhead, especially for lower queue depth I/O where it
+can increase latencies. The benefit is that fio can manage submission rates
+independently of the device completion rates. This avoids skewed latency
+reporting if I/O gets backed up on the device side (the coordinated omission
+problem).
+.SS "I/O rate"
+.TP
+.BI thinktime \fR=\fPtime
+Stall the job for the specified period of time after an I/O has completed before issuing the
+next. May be used to simulate processing being done by an application.
+When the unit is omitted, the value is interpreted in microseconds. See
+\fBthinktime_blocks\fR and \fBthinktime_spin\fR.
+.TP
+.BI thinktime_spin \fR=\fPtime
+Only valid if \fBthinktime\fR is set - pretend to spend CPU time doing
+something with the data received, before falling back to sleeping for the
+rest of the period specified by \fBthinktime\fR. When the unit is
+omitted, the value is interpreted in microseconds.
+.TP
+.BI thinktime_blocks \fR=\fPint
+Only valid if \fBthinktime\fR is set - control how many blocks to issue
+before waiting \fBthinktime\fR usecs.
+If not set, defaults to 1, which will make
+fio wait \fBthinktime\fR usecs after every block. This effectively makes any
+queue depth setting redundant, since no more than 1 I/O will be queued
+before we have to complete it and do our \fBthinktime\fR. In other words, this
+setting effectively caps the queue depth if the latter is larger.
+.TP
+.BI rate \fR=\fPint[,int][,int]
+Cap the bandwidth used by this job. The number is in bytes/sec, the normal
+suffix rules apply. Comma-separated values may be specified for reads,
+writes, and trims as described in \fBblocksize\fR.
+.RS
+.P
+For example, using `rate=1m,500k' would limit reads to 1MiB/sec and writes to
+500KiB/sec. Capping only reads or writes can be done with `rate=,500k' or
+`rate=500k,' where the former will only limit writes (to 500KiB/sec) and the
+latter will only limit reads.
+.RE
+.TP
+.BI rate_min \fR=\fPint[,int][,int]
+Tell fio to do whatever it can to maintain at least this bandwidth. Failing
+to meet this requirement will cause the job to exit. Comma-separated values
+may be specified for reads, writes, and trims as described in
+\fBblocksize\fR.
+.TP
+.BI rate_iops \fR=\fPint[,int][,int]
+Cap the bandwidth to this number of IOPS. Basically the same as
+\fBrate\fR, just specified independently of bandwidth. If the job is
+given a block size range instead of a fixed value, the smallest block size
+is used as the metric. Comma-separated values may be specified for reads,
+writes, and trims as described in \fBblocksize\fR.
+.TP
+.BI rate_iops_min \fR=\fPint[,int][,int]
+If fio doesn't meet this rate of I/O, it will cause the job to exit.
+Comma-separated values may be specified for reads, writes, and trims as
+described in \fBblocksize\fR.
+.TP
+.BI rate_process \fR=\fPstr
+This option controls how fio manages rated I/O submissions. The default is
+`linear', which submits I/O in a linear fashion with fixed delays between
+I/Os that get adjusted based on I/O completion rates. If this is set to
+`poisson', fio will submit I/O based on a more real world random request
+flow, known as the Poisson process
+(\fIhttps://en.wikipedia.org/wiki/Poisson_point_process\fR). The lambda will be
+10^6 / IOPS for the given workload.
+.TP
+.BI rate_ignore_thinktime \fR=\fPbool
+By default, fio will attempt to catch up to the specified rate setting, if any
+kind of thinktime setting was used. If this option is set, then fio will
+ignore the thinktime and continue doing IO at the specified rate, instead of
+entering a catch-up mode after thinktime is done.
+.SS "I/O latency"
+.TP
+.BI latency_target \fR=\fPtime
+If set, fio will attempt to find the max performance point that the given
+workload will run at while maintaining a latency below this target. When
+the unit is omitted, the value is interpreted in microseconds. See
+\fBlatency_window\fR and \fBlatency_percentile\fR.
+.TP
+.BI latency_window \fR=\fPtime
+Used with \fBlatency_target\fR to specify the sample window that the job
+is run at varying queue depths to test the performance. When the unit is
+omitted, the value is interpreted in microseconds.
+.TP
+.BI latency_percentile \fR=\fPfloat
+The percentage of I/Os that must fall within the criteria specified by
+\fBlatency_target\fR and \fBlatency_window\fR. If not set, this
+defaults to 100.0, meaning that all I/Os must be equal to or below the value
+set by \fBlatency_target\fR.
+.TP
+.BI max_latency \fR=\fPtime
+If set, fio will exit the job with an ETIMEDOUT error if it exceeds this
+maximum latency. 
When the unit is omitted, the value is interpreted in +microseconds. +.TP +.BI rate_cycle \fR=\fPint +Average bandwidth for \fBrate\fR and \fBrate_min\fR over this number +of milliseconds. Defaults to 1000. +.SS "I/O replay" +.TP +.BI write_iolog \fR=\fPstr +Write the issued I/O patterns to the specified file. See +\fBread_iolog\fR. Specify a separate file for each job, otherwise the +iologs will be interspersed and the file may be corrupt. +.TP +.BI read_iolog \fR=\fPstr +Open an iolog with the specified filename and replay the I/O patterns it +contains. This can be used to store a workload and replay it sometime +later. The iolog given may also be a blktrace binary file, which allows fio +to replay a workload captured by blktrace. See +\fBblktrace\fR\|(8) for how to capture such logging data. For blktrace +replay, the file needs to be turned into a blkparse binary data file first +(`blkparse \-o /dev/null \-d file_for_fio.bin'). +You can specify a number of files by separating the names with a ':' character. +See the \fBfilename\fR option for information on how to escape ':' +characters within the file names. These files will be sequentially assigned to +job clones created by \fBnumjobs\fR. +.TP +.BI read_iolog_chunked \fR=\fPbool +Determines how iolog is read. If false (default) entire \fBread_iolog\fR will +be read at once. If selected true, input from iolog will be read gradually. +Useful when iolog is very large, or it is generated. +.TP +.BI merge_blktrace_file \fR=\fPstr +When specified, rather than replaying the logs passed to \fBread_iolog\fR, +the logs go through a merge phase which aggregates them into a single blktrace. +The resulting file is then passed on as the \fBread_iolog\fR parameter. The +intention here is to make the order of events consistent. This limits the +influence of the scheduler compared to replaying multiple blktraces via +concurrent jobs. +.TP +.BI merge_blktrace_scalars \fR=\fPfloat_list +This is a percentage based option that is index paired with the list of files +passed to \fBread_iolog\fR. When merging is performed, scale the time of each +event by the corresponding amount. For example, +`\-\-merge_blktrace_scalars="50:100"' runs the first trace in halftime and the +second trace in realtime. This knob is separately tunable from +\fBreplay_time_scale\fR which scales the trace during runtime and will not +change the output of the merge unlike this option. +.TP +.BI merge_blktrace_iters \fR=\fPfloat_list +This is a whole number option that is index paired with the list of files +passed to \fBread_iolog\fR. When merging is performed, run each trace for +the specified number of iterations. For example, +`\-\-merge_blktrace_iters="2:1"' runs the first trace for two iterations +and the second trace for one iteration. +.TP +.BI replay_no_stall \fR=\fPbool +When replaying I/O with \fBread_iolog\fR the default behavior is to +attempt to respect the timestamps within the log and replay them with the +appropriate delay between IOPS. By setting this variable fio will not +respect the timestamps and attempt to replay them as fast as possible while +still respecting ordering. The result is the same I/O pattern to a given +device, but different timings. +.TP +.BI replay_time_scale \fR=\fPint +When replaying I/O with \fBread_iolog\fR, fio will honor the original timing +in the trace. With this option, it's possible to scale the time. It's a +percentage option, if set to 50 it means run at 50% the original IO rate in +the trace. 
If set to 200, run at twice the original IO rate. Defaults to 100.
+.TP
+.BI replay_redirect \fR=\fPstr
+While replaying I/O patterns using \fBread_iolog\fR the default behavior
+is to replay the I/Os onto the major/minor device that each I/O was recorded
+from. This is sometimes undesirable because on a different machine those
+major/minor numbers can map to a different device. Changing hardware on the
+same system can also result in a different major/minor mapping.
+\fBreplay_redirect\fR causes all I/Os to be replayed onto the single specified
+device regardless of the device they were recorded
+from, i.e. `replay_redirect=/dev/sdc' would cause all I/O
+in the blktrace or iolog to be replayed onto `/dev/sdc'. This means
+multiple devices will be replayed onto a single device, if the trace
+contains multiple devices. If you want multiple devices to be replayed
+concurrently to multiple redirected devices you must blkparse your trace
+into separate traces and replay them with independent fio invocations.
+Unfortunately this also breaks the strict time ordering between multiple
+device accesses.
+.TP
+.BI replay_align \fR=\fPint
+Force alignment of the byte offsets in a trace to this value. The value
+must be a power of 2.
+.TP
+.BI replay_scale \fR=\fPint
+Scale byte offsets down by this factor when replaying traces. Should most
+likely use \fBreplay_align\fR as well.
+.TP
+.BI replay_skip \fR=\fPstr
+Sometimes it's useful to skip certain IO types in a replay trace. This could
+be, for instance, eliminating the writes in the trace. Or not replaying the
+trims/discards, if you are redirecting to a device that doesn't support them.
+This option takes a comma separated list of read, write, trim, sync.
+.SS "Threads, processes and job synchronization"
+.TP
+.BI thread
+Fio defaults to creating jobs by using fork; however, if this option is
+given, fio will create jobs by using POSIX Threads' function
+\fBpthread_create\fR\|(3) to create threads instead.
+.TP
+.BI wait_for \fR=\fPstr
+If set, the current job won't be started until all workers of the specified
+waitee job are done.
+.\" ignore blank line here from HOWTO as it looks normal without it
+\fBwait_for\fR operates on a job name basis, so there are a few
+limitations. First, the waitee must be defined prior to the waiter job
+(meaning no forward references). Second, if a job is being referenced as a
+waitee, it must have a unique name (no duplicate waitees).
+.TP
+.BI nice \fR=\fPint
+Run the job with the given nice value. See man \fBnice\fR\|(2).
+.\" ignore blank line here from HOWTO as it looks normal without it
+On Windows, values less than \-15 set the process class to "High"; \-1 through
+\-15 set "Above Normal"; 1 through 15 "Below Normal"; and above 15 "Idle"
+priority class.
+.TP
+.BI prio \fR=\fPint
+Set the I/O priority value of this job. Linux limits us to a positive value
+between 0 and 7, with 0 being the highest. See man
+\fBionice\fR\|(1). Refer to an appropriate manpage for other operating
+systems since the meaning of priority may differ. For per-command priority
+setting, see the I/O engine specific `cmdprio_percentage' and
+`hipri_percentage' options.
+.TP
+.BI prioclass \fR=\fPint
+Set the I/O priority class. See man \fBionice\fR\|(1). For per-command
+priority setting, see the I/O engine specific `cmdprio_percentage' and
+`hipri_percentage' options.
+.TP
+.BI cpus_allowed \fR=\fPstr
+Controls the same options as \fBcpumask\fR, but accepts a textual
+specification of the permitted CPUs instead, and CPUs are indexed from 0.
+So to use CPUs 0 and 5 you would specify `cpus_allowed=0,5'. This option also
+allows a range of CPUs to be specified \-\- say you wanted a binding to CPUs
+0, 5, and 8 to 15, you would set `cpus_allowed=0,5,8\-15'.
+.RS
+.P
+On Windows, when `cpus_allowed' is unset only CPUs from fio's current
+processor group will be used and affinity settings are inherited from the
+system. An fio build configured to target Windows 7 makes options that set
+CPUs processor group aware and values will set both the processor group
+and a CPU from within that group. For example, on a system where processor
+group 0 has 40 CPUs and processor group 1 has 32 CPUs, `cpus_allowed'
+values between 0 and 39 will bind CPUs from processor group 0 and
+`cpus_allowed' values between 40 and 71 will bind CPUs from processor
+group 1. When using `cpus_allowed_policy=shared' all CPUs specified by a
+single `cpus_allowed' option must be from the same processor group. For
+Windows fio builds not built for Windows 7, CPUs will only be selected from
+(and be relative to) whatever processor group fio happens to be running in
+and CPUs from other processor groups cannot be used.
+.RE
+.TP
+.BI cpus_allowed_policy \fR=\fPstr
+Set the policy of how fio distributes the CPUs specified by
+\fBcpus_allowed\fR or \fBcpumask\fR. Two policies are supported:
+.RS
+.RS
+.TP
+.B shared
+All jobs will share the CPU set specified.
+.TP
+.B split
+Each job will get a unique CPU from the CPU set.
+.RE
+.P
+\fBshared\fR is the default behavior, if the option isn't specified. If
+\fBsplit\fR is specified, then fio will assign one cpu per job. If not
+enough CPUs are given for the jobs listed, then fio will round-robin the CPUs
+in the set.
+.RE
+.TP
+.BI cpumask \fR=\fPint
+Set the CPU affinity of this job. The parameter given is a bit mask of
+allowed CPUs the job may run on. So if you want the allowed CPUs to be 1
+and 5, you would pass the decimal value of (1 << 1 | 1 << 5), or 34. See man
+\fBsched_setaffinity\fR\|(2). This may not work on all supported
+operating systems or kernel versions. This option doesn't work well for a
+higher CPU count than what you can store in an integer mask, so it can only
+control cpus 1\-32. For boxes with larger CPU counts, use
+\fBcpus_allowed\fR.
+.TP
+.BI numa_cpu_nodes \fR=\fPstr
+Set this job running on specified NUMA nodes' CPUs. The arguments allow a
+comma delimited list of cpu numbers, A\-B ranges, or `all'. Note, to enable
+NUMA options support, fio must be built on a system with libnuma\-dev(el)
+installed.
+.TP
+.BI numa_mem_policy \fR=\fPstr
+Set this job's memory policy and corresponding NUMA nodes. Format of the
+arguments:
+.RS
+.RS
+.P
+<mode>[:<nodelist>]
+.RE
+.P
+`mode' is one of the following memory policies: `default', `prefer',
+`bind', `interleave' or `local'. For `default' and `local' memory
+policies, no node needs to be specified. For `prefer', only one node is
+allowed. For `bind' and `interleave' the `nodelist' may be as
+follows: a comma delimited list of numbers, A\-B ranges, or `all'.
+.RE
+.TP
+.BI cgroup \fR=\fPstr
+Add job to this control group. If it doesn't exist, it will be created. The
+system must have a mounted cgroup blkio mount point for this to work. If
+your system doesn't have it mounted, you can do so with:
+.RS
+.RS
+.P
+# mount \-t cgroup \-o blkio none /cgroup
+.RE
+.RE
+.TP
+.BI cgroup_weight \fR=\fPint
+Set the weight of the cgroup to this value. See the documentation that comes
+with the kernel, allowed values are in the range of 100..1000.
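+.RS
+.P
+As an illustrative sketch (cgroup names and weights are arbitrary), two
+jobs could be weighted against each other like so:
+.RS
+.P
+.PD 0
+[fast]
+.P
+cgroup=fio\-fast
+.P
+cgroup_weight=800
+.P
+[slow]
+.P
+cgroup=fio\-slow
+.P
+cgroup_weight=200
+.PD
+.RE
+.RE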
+.TP
+.BI cgroup_nodelete \fR=\fPbool
+Normally fio will delete the cgroups it has created after the job
+completion. To override this behavior and to leave cgroups around after the
+job completion, set `cgroup_nodelete=1'. This can be useful if one wants
+to inspect various cgroup files after job completion. Default: false.
+.TP
+.BI flow_id \fR=\fPint
+The ID of the flow. If not specified, it defaults to being a global
+flow. See \fBflow\fR.
+.TP
+.BI flow \fR=\fPint
+Weight in token-based flow control. If this value is used, then there is
+a 'flow counter' which is used to regulate the proportion of activity between
+two or more jobs. Fio attempts to keep this flow counter near zero. The
+\fBflow\fR parameter stands for how much should be added to or subtracted from
+the flow counter on each iteration of the main I/O loop. That is, if one job has
+`flow=8' and another job has `flow=\-1', then there will be a roughly 1:8
+ratio in how much one runs vs the other.
+.TP
+.BI flow_watermark \fR=\fPint
+The maximum value that the absolute value of the flow counter is allowed to
+reach before the job must wait for a lower value of the counter.
+.TP
+.BI flow_sleep \fR=\fPint
+The period of time, in microseconds, to wait after the flow watermark has
+been exceeded before retrying operations.
+.TP
+.BI stonewall "\fR,\fB wait_for_previous"
+Wait for preceding jobs in the job file to exit, before starting this
+one. Can be used to insert serialization points in the job file. A stonewall
+also implies starting a new reporting group, see
+\fBgroup_reporting\fR.
+.TP
+.BI exitall
+By default, fio will continue running all other jobs when one job finishes.
+Sometimes this is not the desired action. Setting \fBexitall\fR will instead
+make fio terminate all jobs in the same group, as soon as one job of that
+group finishes.
+.TP
+.BI exit_what
+By default, fio will continue running all other jobs when one job finishes.
+Sometimes this is not the desired action. Setting \fBexitall\fR will instead
+make fio terminate all jobs in the same group. The option \fBexit_what\fR
+allows one to control which jobs get terminated when \fBexitall\fR is enabled.
+The default is \fBgroup\fR and does not change the behavior of \fBexitall\fR.
+The setting \fBall\fR terminates all jobs. The setting \fBstonewall\fR terminates
+all currently running jobs across all groups and continues execution with the
+next stonewalled group.
+.TP
+.BI exec_prerun \fR=\fPstr
+Before running this job, issue the command specified through
+\fBsystem\fR\|(3). Output is redirected to a file called `jobname.prerun.txt'.
+.TP
+.BI exec_postrun \fR=\fPstr
+After the job completes, issue the command specified through
+\fBsystem\fR\|(3). Output is redirected to a file called `jobname.postrun.txt'.
+.TP
+.BI uid \fR=\fPint
+Instead of running as the invoking user, set the user ID to this value
+before the thread/process does any work.
+.TP
+.BI gid \fR=\fPint
+Set group ID, see \fBuid\fR.
+.SS "Verification"
+.TP
+.BI verify_only
+Do not perform the specified workload, only verify that data still matches a
+previous invocation of this workload. This option allows one to check data
+multiple times at a later date without overwriting it. This option makes sense
+only for workloads that write data, and does not support workloads with the
+\fBtime_based\fR option set.
+.TP
+.BI do_verify \fR=\fPbool
+Run the verify phase after a write phase. Only valid if \fBverify\fR is
+set. Default: true.
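+.RS
+.P
+A minimal sketch of a write\-then\-verify job (size and checksum choice are
+illustrative only):
+.RS
+.P
+.PD 0
+[write\-and\-verify]
+.P
+rw=randwrite
+.P
+bs=4k
+.P
+size=256m
+.P
+verify=crc32c
+.P
+do_verify=1
+.PD
+.RE
+.P
+With `do_verify=1' (the default), fio reads the data back after the write
+phase completes and checks each block against its stored checksum.
+.RE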
+.TP
+.BI verify \fR=\fPstr
+If writing to a file, fio can verify the file contents after each iteration
+of the job. Each verification method also implies verification of a special
+header, which is written to the beginning of each block. This header also
+includes meta information, like the offset of the block, the block number, the
+timestamp when the block was written, etc. \fBverify\fR can be combined with
+the \fBverify_pattern\fR option. The allowed values are:
+.RS
+.RS
+.TP
+.B md5
+Use an md5 sum of the data area and store it in the header of
+each block.
+.TP
+.B crc64
+Use an experimental crc64 sum of the data area and store it in the
+header of each block.
+.TP
+.B crc32c
+Use a crc32c sum of the data area and store it in the header of
+each block. This will automatically use hardware acceleration
+(e.g. SSE4.2 on an x86 or CRC crypto extensions on ARM64) but will
+fall back to software crc32c if none is found. Generally the
+fastest checksum fio supports when hardware accelerated.
+.TP
+.B crc32c\-intel
+Synonym for crc32c.
+.TP
+.B crc32
+Use a crc32 sum of the data area and store it in the header of each
+block.
+.TP
+.B crc16
+Use a crc16 sum of the data area and store it in the header of each
+block.
+.TP
+.B crc7
+Use a crc7 sum of the data area and store it in the header of each
+block.
+.TP
+.B xxhash
+Use xxhash as the checksum function. Generally the fastest software
+checksum that fio supports.
+.TP
+.B sha512
+Use sha512 as the checksum function.
+.TP
+.B sha256
+Use sha256 as the checksum function.
+.TP
+.B sha1
+Use optimized sha1 as the checksum function.
+.TP
+.B sha3\-224
+Use optimized sha3\-224 as the checksum function.
+.TP
+.B sha3\-256
+Use optimized sha3\-256 as the checksum function.
+.TP
+.B sha3\-384
+Use optimized sha3\-384 as the checksum function.
+.TP
+.B sha3\-512
+Use optimized sha3\-512 as the checksum function.
+.TP
+.B meta
+This option is deprecated, since now meta information is included in a
+generic verification header and meta verification happens by
+default. For detailed information see the description of the
+\fBverify\fR setting. This option is kept for compatibility
+with old configurations. Do not use it.
+.TP
+.B pattern
+Verify a strict pattern. Normally fio includes a header with some
+basic information and checksumming, but if this option is set, only
+the specific pattern set with \fBverify_pattern\fR is verified.
+.TP
+.B null
+Only pretend to verify. Useful for testing internals with
+`ioengine=null', not for much else.
+.RE
+.P
+This option can be used for repeated burn\-in tests of a system to make sure
+that the written data is also correctly read back. If the data direction
+given is a read or random read, fio will assume that it should verify a
+previously written file. If the data direction includes any form of write,
+the verify will be of the newly written data.
+.P
+To avoid false verification errors, do not use the norandommap option when
+verifying data with async I/O engines and I/O depths > 1. Or use the
+norandommap and the lfsr random generator together to avoid writing to the
+same offset with multiple outstanding I/Os.
+.RE
+.TP
+.BI verify_offset \fR=\fPint
+Swap the verification header with data somewhere else in the block before
+writing. It is swapped back before verifying.
+.TP
+.BI verify_interval \fR=\fPint
+Write the verification header at a finer granularity than the
+\fBblocksize\fR. It will be written for chunks the size of
+\fBverify_interval\fR. \fBblocksize\fR should divide this evenly.
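+.RS
+.P
+For example (a sketch; the values are arbitrary), with:
+.RS
+.P
+.PD 0
+bs=64k
+.P
+verify=md5
+.P
+verify_interval=4k
+.PD
+.RE
+.P
+each 64k block carries a verification header every 4k, so a corruption can
+be narrowed down to a 4k region instead of a whole 64k block.
+.RE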
+.TP
+.BI verify_pattern \fR=\fPstr
+If set, fio will fill the I/O buffers with this pattern. Fio defaults to
+filling with totally random bytes, but sometimes it's interesting to fill
+with a known pattern for I/O verification purposes. Depending on the width
+of the pattern, fio will fill 1/2/3/4 bytes of the buffer at a time (it can
+be either a decimal or a hex number). If larger than a 32\-bit quantity,
+\fBverify_pattern\fR has to be a hex number that starts with either "0x" or
+"0X". Use with \fBverify\fR. Also, \fBverify_pattern\fR supports the %o
+format, which means that for each block the offset will be written and then
+verified back, e.g.:
+.RS
+.RS
+.P
+verify_pattern=%o
+.RE
+.P
+Or use a combination of everything:
+.RS
+.P
+verify_pattern=0xff%o"abcd"\-12
+.RE
+.RE
+.TP
+.BI verify_fatal \fR=\fPbool
+Normally fio will keep checking the entire contents before quitting on a
+block verification failure. If this option is set, fio will exit the job on
+the first observed failure. Default: false.
+.TP
+.BI verify_dump \fR=\fPbool
+If set, dump the contents of both the original data block and the data block
+we read off disk to files. This allows later analysis to inspect just what
+kind of data corruption occurred. Off by default.
+.TP
+.BI verify_async \fR=\fPint
+Fio will normally verify I/O inline from the submitting thread. This option
+takes an integer describing how many async offload threads to create for I/O
+verification instead, causing fio to offload the duty of verifying I/O
+contents to one or more separate threads. If using this offload option, even
+sync I/O engines can benefit from using an \fBiodepth\fR setting higher
+than 1, as it allows them to have I/O in flight while verifies are running.
+Defaults to 0 async threads, i.e. verification is not asynchronous.
+.TP
+.BI verify_async_cpus \fR=\fPstr
+Tell fio to set the given CPU affinity on the async I/O verification
+threads. See \fBcpus_allowed\fR for the format used.
+.TP
+.BI verify_backlog \fR=\fPint
+Fio will normally verify the written contents of a job that utilizes verify
+once that job has completed. In other words, everything is written then
+everything is read back and verified. You may want to verify continually
+instead for a variety of reasons. Fio stores the meta data associated with
+an I/O block in memory, so for large verify workloads, quite a bit of memory
+would be used up holding this meta data. If this option is enabled, fio will
+write only N blocks before verifying these blocks.
+.TP
+.BI verify_backlog_batch \fR=\fPint
+Control how many blocks fio will verify if \fBverify_backlog\fR is
+set. If not set, will default to the value of \fBverify_backlog\fR
+(meaning the entire queue is read back and verified). If
+\fBverify_backlog_batch\fR is less than \fBverify_backlog\fR then not all
+blocks will be verified, if \fBverify_backlog_batch\fR is larger than
+\fBverify_backlog\fR, some blocks will be verified more than once.
+.TP
+.BI verify_state_save \fR=\fPbool
+When a job exits during the write phase of a verify workload, save its
+current state. This allows fio to replay up until that point, if the verify
+state is loaded for the verify read phase. The format of the filename is,
+roughly:
+.RS
+.RS
+.P
+<type>\-<jobname>\-<jobindex>\-verify.state
+.RE
+.P
+<type> is "local" for a local run, "sock" for a client/server socket
+connection, and "ip" (192.168.0.1, for instance) for a networked
+client/server connection. Defaults to true.
+.RE
+.TP
+.BI verify_state_load \fR=\fPbool
+If a verify termination trigger was used, fio stores the current write state
+of each thread. This can be used at verification time so that fio knows how
+far it should verify. Without this information, fio will run a full
+verification pass, according to the settings in the job file used. Default:
+false.
+.TP
+.BI trim_percentage \fR=\fPint
+Number of verify blocks to discard/trim.
+.TP
+.BI trim_verify_zero \fR=\fPbool
+Verify that trim/discarded blocks are returned as zeros.
+.TP
+.BI trim_backlog \fR=\fPint
+Trim after this number of blocks are written.
+.TP
+.BI trim_backlog_batch \fR=\fPint
+Trim this number of I/O blocks.
+.TP
+.BI experimental_verify \fR=\fPbool
+Enable experimental verification.
+.SS "Steady state"
+.TP
+.BI steadystate \fR=\fPstr:float "\fR,\fP ss" \fR=\fPstr:float
+Define the criterion and limit for assessing steady state performance. The
+first parameter designates the criterion whereas the second parameter sets
+the threshold. When the criterion falls below the threshold for the
+specified duration, the job will stop. For example, `iops_slope:0.1%' will
+direct fio to terminate the job when the least squares regression slope
+falls below 0.1% of the mean IOPS. If \fBgroup_reporting\fR is enabled
+this will apply to all jobs in the group. Below is the list of available
+steady state assessment criteria. All assessments are carried out using only
+data from the rolling collection window. Threshold limits can be expressed
+as a fixed value or as a percentage of the mean in the collection window.
+.RS
+.P
+When using this feature, most jobs should include the \fBtime_based\fR
+and \fBruntime\fR options, or the \fBloops\fR option, so that fio does not
+stop running after it has covered the full size of the specified file(s)
+or device(s).
+.RS
+.RS
+.TP
+.B iops
+Collect IOPS data. Stop the job if all individual IOPS measurements
+are within the specified limit of the mean IOPS (e.g., `iops:2'
+means that all individual IOPS values must be within 2 of the mean,
+whereas `iops:0.2%' means that all individual IOPS values must be
+within 0.2% of the mean IOPS to terminate the job).
+.TP
+.B iops_slope
+Collect IOPS data and calculate the least squares regression
+slope. Stop the job if the slope falls below the specified limit.
+.TP
+.B bw
+Collect bandwidth data. Stop the job if all individual bandwidth
+measurements are within the specified limit of the mean bandwidth.
+.TP
+.B bw_slope
+Collect bandwidth data and calculate the least squares regression
+slope. Stop the job if the slope falls below the specified limit.
+.RE
+.RE
+.TP
+.BI steadystate_duration \fR=\fPtime "\fR,\fP ss_dur" \fR=\fPtime
+A rolling window of this duration will be used to judge whether steady state
+has been reached. Data will be collected once per second. The default is 0,
+which disables steady state detection. When the unit is omitted, the
+value is interpreted in seconds.
+.TP
+.BI steadystate_ramp_time \fR=\fPtime "\fR,\fP ss_ramp" \fR=\fPtime
+Allow the job to run for the specified duration before beginning data
+collection for checking the steady state job termination criterion. The
+default is 0. When the unit is omitted, the value is interpreted in seconds.
+.SS "Measurements and reporting"
+.TP
+.BI per_job_logs \fR=\fPbool
+If set, this generates bw/clat/iops logs with per-file private filenames. If
+not set, jobs with identical names will share the log filename. Default:
+true.
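+.RS
+.P
+As an illustrative sketch (names are arbitrary), the following fragment
+gives each job clone its own bandwidth log (see \fBwrite_bw_log\fR below):
+.RS
+.P
+.PD 0
+[logged]
+.P
+numjobs=2
+.P
+per_job_logs=1
+.P
+write_bw_log=foo
+.PD
+.RE
+.P
+This produces `foo_bw.1.log' and `foo_bw.2.log'; with `per_job_logs=0' the
+clones would instead share a single `foo_bw.log'.
+.RE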
+.TP
+.BI group_reporting
+It may sometimes be interesting to display statistics for groups of jobs as
+a whole instead of for each individual job. This is especially true if
+\fBnumjobs\fR is used; looking at individual thread/process output
+quickly becomes unwieldy. To see the final report per-group instead of
+per-job, use \fBgroup_reporting\fR. Jobs in a file will be part of the
+same reporting group, unless separated by a \fBstonewall\fR, or by
+using \fBnew_group\fR.
+.TP
+.BI new_group
+Start a new reporting group. See: \fBgroup_reporting\fR. If not given,
+all jobs in a file will be part of the same reporting group, unless
+separated by a \fBstonewall\fR.
+.TP
+.BI stats \fR=\fPbool
+By default, fio collects and shows final output results for all jobs
+that run. If this option is set to 0, then fio will ignore the job in
+the final stat output.
+.TP
+.BI write_bw_log \fR=\fPstr
+If given, write a bandwidth log for this job. Can be used to store the
+bandwidth data of the jobs over their lifetime.
+.RS
+.P
+If no str argument is given, the default filename of
+`jobname_type.x.log' is used. Even when the argument is given, fio
+will still append the type of log. So if one specifies:
+.RS
+.P
+write_bw_log=foo
+.RE
+.P
+The actual log name will be `foo_bw.x.log' where `x' is the index
+of the job (1..N, where N is the number of jobs). If
+\fBper_job_logs\fR is false, then the filename will not include the
+`.x' job index.
+.P
+The included \fBfio_generate_plots\fR script uses gnuplot to turn these
+text files into nice graphs. See the \fBLOG FILE FORMATS\fR section for how data is
+structured within the file.
+.RE
+.TP
+.BI write_lat_log \fR=\fPstr
+Same as \fBwrite_bw_log\fR, except this option creates I/O
+submission (e.g., `name_slat.x.log'), completion (e.g.,
+`name_clat.x.log'), and total (e.g., `name_lat.x.log') latency
+files instead. See \fBwrite_bw_log\fR for details about the
+filename format and the \fBLOG FILE FORMATS\fR section for how data is structured
+within the files.
+.TP
+.BI write_hist_log \fR=\fPstr
+Same as \fBwrite_bw_log\fR but writes an I/O completion latency
+histogram file (e.g., `name_hist.x.log') instead. Note that this
+file will be empty unless \fBlog_hist_msec\fR has also been set.
+See \fBwrite_bw_log\fR for details about the filename format and
+the \fBLOG FILE FORMATS\fR section for how data is structured
+within the file.
+.TP
+.BI write_iops_log \fR=\fPstr
+Same as \fBwrite_bw_log\fR, but writes an IOPS file (e.g.
+`name_iops.x.log') instead. Because fio defaults to individual
+I/O logging, the value entry in the IOPS log will be 1 unless windowed
+logging (see \fBlog_avg_msec\fR) has been enabled. See
+\fBwrite_bw_log\fR for details about the filename format and \fBLOG
+FILE FORMATS\fR for how data is structured within the file.
+.TP
+.BI log_avg_msec \fR=\fPint
+By default, fio will log an entry in the iops, latency, or bw log for every
+I/O that completes. When writing to the disk log, that can quickly grow to a
+very large size. Setting this option makes fio average each log entry
+over the specified period of time, reducing the resolution of the log. See
+\fBlog_max_value\fR as well. Defaults to 0, logging all entries.
+Also see \fBLOG FILE FORMATS\fR section.
+.TP
+.BI log_hist_msec \fR=\fPint
+Same as \fBlog_avg_msec\fR, but logs entries for completion latency
+histograms. Computing latency percentiles from averages of intervals using
+\fBlog_avg_msec\fR is inaccurate.
Setting this option makes fio log +histogram entries over the specified period of time, reducing log sizes for +high IOPS devices while retaining percentile accuracy. See +\fBlog_hist_coarseness\fR and \fBwrite_hist_log\fR as well. +Defaults to 0, meaning histogram logging is disabled. +.TP +.BI log_hist_coarseness \fR=\fPint +Integer ranging from 0 to 6, defining the coarseness of the resolution of +the histogram logs enabled with \fBlog_hist_msec\fR. For each increment +in coarseness, fio outputs half as many bins. Defaults to 0, for which +histogram logs contain 1216 latency bins. See \fBLOG FILE FORMATS\fR section. +.TP +.BI log_max_value \fR=\fPbool +If \fBlog_avg_msec\fR is set, fio logs the average over that window. If +you instead want to log the maximum value, set this option to 1. Defaults to +0, meaning that averaged values are logged. +.TP +.BI log_offset \fR=\fPbool +If this is set, the iolog options will include the byte offset for the I/O +entry as well as the other data values. Defaults to 0 meaning that +offsets are not present in logs. Also see \fBLOG FILE FORMATS\fR section. +.TP +.BI log_compression \fR=\fPint +If this is set, fio will compress the I/O logs as it goes, to keep the +memory footprint lower. When a log reaches the specified size, that chunk is +removed and compressed in the background. Given that I/O logs are fairly +highly compressible, this yields a nice memory savings for longer runs. The +downside is that the compression will consume some background CPU cycles, so +it may impact the run. This, however, is also true if the logging ends up +consuming most of the system memory. So pick your poison. The I/O logs are +saved normally at the end of a run, by decompressing the chunks and storing +them in the specified log file. This feature depends on the availability of +zlib. +.TP +.BI log_compression_cpus \fR=\fPstr +Define the set of CPUs that are allowed to handle online log compression for +the I/O jobs. This can provide better isolation between performance +sensitive jobs, and background compression work. See \fBcpus_allowed\fR for +the format used. +.TP +.BI log_store_compressed \fR=\fPbool +If set, fio will store the log files in a compressed format. They can be +decompressed with fio, using the \fB\-\-inflate\-log\fR command line +parameter. The files will be stored with a `.fz' suffix. +.TP +.BI log_unix_epoch \fR=\fPbool +If set, fio will log Unix timestamps to the log files produced by enabling +write_type_log for each log type, instead of the default zero-based +timestamps. +.TP +.BI block_error_percentiles \fR=\fPbool +If set, record errors in trim block-sized units from writes and trims and +output a histogram of how many trims it took to get to errors, and what kind +of error was encountered. +.TP +.BI bwavgtime \fR=\fPint +Average the calculated bandwidth over the given time. Value is specified in +milliseconds. If the job also does bandwidth logging through +\fBwrite_bw_log\fR, then the minimum of this option and +\fBlog_avg_msec\fR will be used. Default: 500ms. +.TP +.BI iopsavgtime \fR=\fPint +Average the calculated IOPS over the given time. Value is specified in +milliseconds. If the job also does IOPS logging through +\fBwrite_iops_log\fR, then the minimum of this option and +\fBlog_avg_msec\fR will be used. Default: 500ms. +.TP +.BI disk_util \fR=\fPbool +Generate disk utilization statistics, if the platform supports it. +Default: true. +.TP +.BI disable_lat \fR=\fPbool +Disable measurements of total latency numbers. 
Useful only for cutting back
+the number of calls to \fBgettimeofday\fR\|(2), as that does impact
+performance at really high IOPS rates. Note that to really get rid of a
+large amount of these calls, this option must be used with
+\fBdisable_slat\fR and \fBdisable_bw_measurement\fR as well.
+.TP
+.BI disable_clat \fR=\fPbool
+Disable measurements of completion latency numbers. See
+\fBdisable_lat\fR.
+.TP
+.BI disable_slat \fR=\fPbool
+Disable measurements of submission latency numbers. See
+\fBdisable_lat\fR.
+.TP
+.BI disable_bw_measurement \fR=\fPbool "\fR,\fP disable_bw" \fR=\fPbool
+Disable measurements of throughput/bandwidth numbers. See
+\fBdisable_lat\fR.
+.TP
+.BI slat_percentiles \fR=\fPbool
+Report submission latency percentiles. Submission latency is not recorded
+for synchronous ioengines.
+.TP
+.BI clat_percentiles \fR=\fPbool
+Report completion latency percentiles.
+.TP
+.BI lat_percentiles \fR=\fPbool
+Report total latency percentiles. Total latency is the sum of submission
+latency and completion latency.
+.TP
+.BI percentile_list \fR=\fPfloat_list
+Overwrite the default list of percentiles for latencies and the
+block error histogram. Each number is a floating point number in the range
+(0,100], and the maximum length of the list is 20. Use ':' to separate the
+numbers. For example, `\-\-percentile_list=99.5:99.9' will cause fio to
+report the latency durations below which 99.5% and 99.9% of the observed
+latencies fell, respectively.
+.TP
+.BI significant_figures \fR=\fPint
+If using \fB\-\-output\-format\fR of `normal', set the significant figures
+to this value. Higher values will yield more precise IOPS and throughput
+units, while lower values will round. Requires a minimum value of 1 and a
+maximum value of 10. Defaults to 4.
+.SS "Error handling"
+.TP
+.BI exitall_on_error
+When one job finishes in error, terminate the rest. The default is to wait
+for each job to finish.
+.TP
+.BI continue_on_error \fR=\fPstr
+Normally fio will exit the job on the first observed failure. If this option
+is set, fio will continue the job when there is a 'non-fatal error' (EIO or
+EILSEQ) until the runtime is exceeded or the I/O size specified is
+completed. If this option is used, there are two more stats that are
+appended, the total error count and the first error. The error field given
+in the stats is the first error that was hit during the run.
+The allowed values are:
+.RS
+.RS
+.TP
+.B none
+Exit on any I/O or verify errors.
+.TP
+.B read
+Continue on read errors, exit on all others.
+.TP
+.B write
+Continue on write errors, exit on all others.
+.TP
+.B io
+Continue on any I/O error, exit on all others.
+.TP
+.B verify
+Continue on verify errors, exit on all others.
+.TP
+.B all
+Continue on all errors.
+.TP
+.B 0
+Backward-compatible alias for 'none'.
+.TP
+.B 1
+Backward-compatible alias for 'all'.
+.RE
+.RE
+.TP
+.BI ignore_error \fR=\fPstr
+Sometimes you want to ignore some errors during a test; in that case you can
+specify an error list for each error type, instead of only being able to
+ignore the default 'non-fatal error' using \fBcontinue_on_error\fR. The
+option takes the form
+`ignore_error=READ_ERR_LIST,WRITE_ERR_LIST,VERIFY_ERR_LIST'; the errors for
+a given error type are separated with ':'. An error may be a symbol
+('ENOSPC', 'ENOMEM') or an integer. Example:
+.RS
+.RS
+.P
+ignore_error=EAGAIN,ENOSPC:122
+.RE
+.P
+This option will ignore EAGAIN from READ, and ENOSPC and 122 (EDQUOT) from
+WRITE. This option works by overriding \fBcontinue_on_error\fR with
+the list of errors for each error type if any.
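+.P
+If fewer than three lists are given, only the listed error types are
+affected; for instance, a hypothetical job that should tolerate only
+read\-side EAGAIN and EINTR could presumably give just the first list:
+.RS
+.P
+ignore_error=EAGAIN:EINTR
+.RE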
+.RE
+.TP
+.BI error_dump \fR=\fPbool
+If set, dump every error, even if it is non-fatal; true by default. If
+disabled, only fatal errors will be dumped.
+.SS "Running predefined workloads"
+Fio includes predefined profiles that mimic the I/O workloads generated by
+other tools.
+.TP
+.BI profile \fR=\fPstr
+The predefined workload to run. Current profiles are:
+.RS
+.RS
+.TP
+.B tiobench
+Threaded I/O bench (tiotest/tiobench) like workload.
+.TP
+.B act
+Aerospike Certification Tool (ACT) like workload.
+.RE
+.RE
+.P
+To view a profile's additional options use \fB\-\-cmdhelp\fR after specifying
+the profile. For example:
+.RS
+.TP
+$ fio \-\-profile=act \-\-cmdhelp
+.RE
+.SS "Act profile options"
+.TP
+.BI device\-names \fR=\fPstr
+Devices to use.
+.TP
+.BI load \fR=\fPint
+ACT load multiplier. Default: 1.
+.TP
+.BI test\-duration\fR=\fPtime
+How long the entire test takes to run. When the unit is omitted, the value
+is given in seconds. Default: 24h.
+.TP
+.BI threads\-per\-queue\fR=\fPint
+Number of read I/O threads per device. Default: 8.
+.TP
+.BI read\-req\-num\-512\-blocks\fR=\fPint
+Number of 512B blocks to read at a time. Default: 3.
+.TP
+.BI large\-block\-op\-kbytes\fR=\fPint
+Size of large block ops in KiB (writes). Default: 131072.
+.TP
+.BI prep
+Set to run ACT prep phase.
+.SS "Tiobench profile options"
+.TP
+.BI size\fR=\fPstr
+Size in MiB.
+.TP
+.BI block\fR=\fPint
+Block size in bytes. Default: 4096.
+.TP
+.BI numruns\fR=\fPint
+Number of runs.
+.TP
+.BI dir\fR=\fPstr
+Test directory.
+.TP
+.BI threads\fR=\fPint
+Number of threads.
+.SH OUTPUT
+Fio spits out a lot of output. While running, fio will display the status of the
+jobs created. An example of that would be:
+.P
+.nf
+	Jobs: 1 (f=1): [_(1),M(1)][24.8%][r=20.5MiB/s,w=23.5MiB/s][r=82,w=94 IOPS][eta 01m:31s]
+.fi
+.P
+The characters inside the first set of square brackets denote the current status of
+each thread. The first character is the first job defined in the job file, and so
+forth. The possible values (in typical life cycle order) are:
+.RS
+.TP
+.PD 0
+.B P
+Thread setup, but not started.
+.TP
+.B C
+Thread created.
+.TP
+.B I
+Thread initialized, waiting or generating necessary data.
+.TP
+.B p
+Thread running pre-reading file(s).
+.TP
+.B /
+Thread is in ramp period.
+.TP
+.B R
+Running, doing sequential reads.
+.TP
+.B r
+Running, doing random reads.
+.TP
+.B W
+Running, doing sequential writes.
+.TP
+.B w
+Running, doing random writes.
+.TP
+.B M
+Running, doing mixed sequential reads/writes.
+.TP
+.B m
+Running, doing mixed random reads/writes.
+.TP
+.B D
+Running, doing sequential trims.
+.TP
+.B d
+Running, doing random trims.
+.TP
+.B F
+Running, currently waiting for \fBfsync\fR\|(2).
+.TP
+.B V
+Running, doing verification of written data.
+.TP
+.B f
+Thread finishing.
+.TP
+.B E
+Thread exited, not reaped by main thread yet.
+.TP
+.B \-
+Thread reaped.
+.TP
+.B X
+Thread reaped, exited with an error.
+.TP
+.B K
+Thread reaped, exited due to signal.
+.PD
+.RE
+.P
+Fio will condense the thread string so as not to take up more space on the command
+line than needed. For instance, if you have 10 readers and 10 writers running,
+the output would look like this:
+.P
+.nf
+	Jobs: 20 (f=20): [R(10),W(10)][4.0%][r=20.5MiB/s,w=23.5MiB/s][r=82,w=94 IOPS][eta 57m:36s]
+.fi
+.P
+Note that the status string is displayed in order, so it's possible to tell which of
+the jobs are currently doing what. In the example above this means that jobs 1\-\-10
+are readers and 11\-\-20 are writers.
+.P +The other values are fairly self explanatory \-\- number of threads currently +running and doing I/O, the number of currently open files (f=), the estimated +completion percentage, the rate of I/O since last check (read speed listed first, +then write speed and optionally trim speed) in terms of bandwidth and IOPS, +and time to completion for the current running group. It's impossible to estimate +runtime of the following groups (if any). +.P +When fio is done (or interrupted by Ctrl\-C), it will show the data for +each thread, group of threads, and disks in that order. For each overall thread (or +group) the output looks like: +.P +.nf + Client1: (groupid=0, jobs=1): err= 0: pid=16109: Sat Jun 24 12:07:54 2017 + write: IOPS=88, BW=623KiB/s (638kB/s)(30.4MiB/50032msec) + slat (nsec): min=500, max=145500, avg=8318.00, stdev=4781.50 + clat (usec): min=170, max=78367, avg=4019.02, stdev=8293.31 + lat (usec): min=174, max=78375, avg=4027.34, stdev=8291.79 + clat percentiles (usec): + | 1.00th=[ 302], 5.00th=[ 326], 10.00th=[ 343], 20.00th=[ 363], + | 30.00th=[ 392], 40.00th=[ 404], 50.00th=[ 416], 60.00th=[ 445], + | 70.00th=[ 816], 80.00th=[ 6718], 90.00th=[12911], 95.00th=[21627], + | 99.00th=[43779], 99.50th=[51643], 99.90th=[68682], 99.95th=[72877], + | 99.99th=[78119] + bw ( KiB/s): min= 532, max= 686, per=0.10%, avg=622.87, stdev=24.82, samples= 100 + iops : min= 76, max= 98, avg=88.98, stdev= 3.54, samples= 100 + lat (usec) : 250=0.04%, 500=64.11%, 750=4.81%, 1000=2.79% + lat (msec) : 2=4.16%, 4=1.84%, 10=4.90%, 20=11.33%, 50=5.37% + lat (msec) : 100=0.65% + cpu : usr=0.27%, sys=0.18%, ctx=12072, majf=0, minf=21 + IO depths : 1=85.0%, 2=13.1%, 4=1.8%, 8=0.1%, 16=0.0%, 32=0.0%, >=64=0.0% + submit : 0=0.0%, 4=100.0%, 8=0.0%, 16=0.0%, 32=0.0%, 64=0.0%, >=64=0.0% + complete : 0=0.0%, 4=100.0%, 8=0.0%, 16=0.0%, 32=0.0%, 64=0.0%, >=64=0.0% + issued rwt: total=0,4450,0, short=0,0,0, dropped=0,0,0 + latency : target=0, window=0, percentile=100.00%, depth=8 +.fi +.P +The job name (or first job's name when using \fBgroup_reporting\fR) is printed, +along with the group id, count of jobs being aggregated, last error id seen (which +is 0 when there are no errors), pid/tid of that thread and the time the job/group +completed. Below are the I/O statistics for each data direction performed (showing +writes in the example above). In the order listed, they denote: +.RS +.TP +.B read/write/trim +The string before the colon shows the I/O direction the statistics +are for. \fIIOPS\fR is the average I/Os performed per second. \fIBW\fR +is the average bandwidth rate shown as: value in power of 2 format +(value in power of 10 format). The last two values show: (total +I/O performed in power of 2 format / \fIruntime\fR of that thread). +.TP +.B slat +Submission latency (\fImin\fR being the minimum, \fImax\fR being the +maximum, \fIavg\fR being the average, \fIstdev\fR being the standard +deviation). This is the time it took to submit the I/O. For +sync I/O this row is not displayed as the slat is really the +completion latency (since queue/complete is one operation there). +This value can be in nanoseconds, microseconds or milliseconds \-\-\- +fio will choose the most appropriate base and print that (in the +example above nanoseconds was the best scale). Note: in \fB\-\-minimal\fR mode +latencies are always expressed in microseconds. +.TP +.B clat +Completion latency. Same names as slat, this denotes the time from +submission to completion of the I/O pieces. 
For sync I/O, clat will
+usually be equal (or very close) to 0, as the time from submit to
+complete is basically just CPU time (I/O has already been done, see slat
+explanation).
+.TP
+.B lat
+Total latency. Same names as slat and clat, this denotes the time from
+when fio created the I/O unit to completion of the I/O operation.
+.TP
+.B bw
+Bandwidth statistics based on samples. Same names as the xlat stats,
+but also includes the number of samples taken (\fIsamples\fR) and an
+approximate percentage of total aggregate bandwidth this thread
+received in its group (\fIper\fR). This last value is only really
+useful if the threads in this group are on the same disk, since they
+are then competing for disk access.
+.TP
+.B iops
+IOPS statistics based on samples. Same names as \fBbw\fR.
+.TP
+.B lat (nsec/usec/msec)
+The distribution of I/O completion latencies. This is the time from when
+I/O leaves fio and when it gets completed. Unlike the separate
+read/write/trim sections above, the data here and in the remaining
+sections apply to all I/Os for the reporting group. 250=0.04% means that
+0.04% of the I/Os completed in under 250us. 500=64.11% means that 64.11%
+of the I/Os required 250 to 499us for completion.
+.TP
+.B cpu
+CPU usage. User and system time, along with the number of context
+switches this thread went through, and finally the number of major and
+minor page faults. The CPU utilization numbers are averages for the
+jobs in that reporting group, while the context and fault counters are
+summed.
+.TP
+.B IO depths
+The distribution of I/O depths over the job lifetime. The numbers are
+divided into powers of 2 and each entry covers depths from that value
+up to those that are lower than the next entry \-\- e.g., 16= covers
+depths from 16 to 31. Note that the range covered by a depth
+distribution entry can be different to the range covered by the
+equivalent \fBsubmit\fR/\fBcomplete\fR distribution entry.
+.TP
+.B IO submit
+How many pieces of I/O were submitted in a single submit call. Each
+entry denotes that amount and below, until the previous entry \-\- e.g.,
+16=100% means that we submitted anywhere from 9 to 16 I/Os per submit
+call. Note that the range covered by a \fBsubmit\fR distribution entry can
+be different to the range covered by the equivalent depth distribution
+entry.
+.TP
+.B IO complete
+Like the above \fBsubmit\fR number, but for completions instead.
+.TP
+.B IO issued rwt
+The number of \fBread/write/trim\fR requests issued, and how many of them were
+short or dropped.
+.TP
+.B IO latency
+These values are for \fBlatency_target\fR and related options. When
+these options are engaged, this section describes the I/O depth required
+to meet the specified latency target.
+.RE
+.P
+After each client has been listed, the group statistics are printed. They
+will look like this:
+.P
+.nf
+ Run status group 0 (all jobs):
+    READ: bw=20.9MiB/s (21.9MB/s), 10.4MiB/s\-10.8MiB/s (10.9MB/s\-11.3MB/s), io=64.0MiB (67.1MB), run=2973\-3069msec
+   WRITE: bw=1231KiB/s (1261kB/s), 616KiB/s\-621KiB/s (630kB/s\-636kB/s), io=64.0MiB (67.1MB), run=52747\-53223msec
+.fi
+.P
+For each data direction it prints:
+.RS
+.TP
+.B bw
+Aggregate bandwidth of threads in this group followed by the
+minimum and maximum bandwidth of all the threads in this group.
+Values outside of brackets are power-of-2 format and those
+within are the equivalent value in a power-of-10 format.
+.TP
+.B io
+Aggregate I/O performed by all threads in this group.
The +format is the same as \fBbw\fR. +.TP +.B run +The smallest and longest runtimes of the threads in this group. +.RE +.P +And finally, the disk statistics are printed. This is Linux specific. +They will look like this: +.P +.nf + Disk stats (read/write): + sda: ios=16398/16511, merge=30/162, ticks=6853/819634, in_queue=826487, util=100.00% +.fi +.P +Each value is printed for both reads and writes, with reads first. The +numbers denote: +.RS +.TP +.B ios +Number of I/Os performed by all groups. +.TP +.B merge +Number of merges performed by the I/O scheduler. +.TP +.B ticks +Number of ticks we kept the disk busy. +.TP +.B in_queue +Total time spent in the disk queue. +.TP +.B util +The disk utilization. A value of 100% means we kept the disk +busy constantly, 50% would be a disk idling half of the time. +.RE +.P +It is also possible to get fio to dump the current output while it is running, +without terminating the job. To do that, send fio the USR1 signal. You can +also get regularly timed dumps by using the \fB\-\-status\-interval\fR +parameter, or by creating a file in `/tmp' named +`fio\-dump\-status'. If fio sees this file, it will unlink it and dump the +current output status. +.SH TERSE OUTPUT +For scripted usage where you typically want to generate tables or graphs of the +results, fio can output the results in a semicolon separated format. The format +is one long line of values, such as: +.P +.nf + 2;card0;0;0;7139336;121836;60004;1;10109;27.932460;116.933948;220;126861;3495.446807;1085.368601;226;126864;3523.635629;1089.012448;24063;99944;50.275485%;59818.274627;5540.657370;7155060;122104;60004;1;8338;29.086342;117.839068;388;128077;5032.488518;1234.785715;391;128085;5061.839412;1236.909129;23436;100928;50.287926%;59964.832030;5644.844189;14.595833%;19.394167%;123706;0;7313;0.1%;0.1%;0.1%;0.1%;0.1%;0.1%;100.0%;0.00%;0.00%;0.00%;0.00%;0.00%;0.00%;0.01%;0.02%;0.05%;0.16%;6.04%;40.40%;52.68%;0.64%;0.01%;0.00%;0.01%;0.00%;0.00%;0.00%;0.00%;0.00% + A description of this job goes here. +.fi +.P +The job description (if provided) follows on a second line for terse v2. +It appears on the same line for other terse versions. +.P +To enable terse output, use the \fB\-\-minimal\fR or +`\-\-output\-format=terse' command line options. The +first value is the version of the terse output format. If the output has to be +changed for some reason, this number will be incremented by 1 to signify that +change. 
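+.P
+For example, assuming a hypothetical job file `job.fio', either of the
+following invocations will emit one such semicolon-separated record per job:
+.RS
+.P
+.PD 0
+$ fio \-\-minimal job.fio
+.P
+$ fio \-\-output\-format=terse job.fio
+.PD
+.RE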
+.P +Split up, the format is as follows (comments in brackets denote when a +field was introduced or whether it's specific to some terse version): +.P +.nf + terse version, fio version [v3], jobname, groupid, error +.fi +.RS +.P +.B +READ status: +.RE +.P +.nf + Total IO (KiB), bandwidth (KiB/sec), IOPS, runtime (msec) + Submission latency: min, max, mean, stdev (usec) + Completion latency: min, max, mean, stdev (usec) + Completion latency percentiles: 20 fields (see below) + Total latency: min, max, mean, stdev (usec) + Bw (KiB/s): min, max, aggregate percentage of total, mean, stdev, number of samples [v5] + IOPS [v5]: min, max, mean, stdev, number of samples +.fi +.RS +.P +.B +WRITE status: +.RE +.P +.nf + Total IO (KiB), bandwidth (KiB/sec), IOPS, runtime (msec) + Submission latency: min, max, mean, stdev (usec) + Completion latency: min, max, mean, stdev (usec) + Completion latency percentiles: 20 fields (see below) + Total latency: min, max, mean, stdev (usec) + Bw (KiB/s): min, max, aggregate percentage of total, mean, stdev, number of samples [v5] + IOPS [v5]: min, max, mean, stdev, number of samples +.fi +.RS +.P +.B +TRIM status [all but version 3]: +.RE +.P +.nf + Fields are similar to \fBREAD/WRITE\fR status. +.fi +.RS +.P +.B +CPU usage: +.RE +.P +.nf + user, system, context switches, major faults, minor faults +.fi +.RS +.P +.B +I/O depths: +.RE +.P +.nf + <=1, 2, 4, 8, 16, 32, >=64 +.fi +.RS +.P +.B +I/O latencies microseconds: +.RE +.P +.nf + <=2, 4, 10, 20, 50, 100, 250, 500, 750, 1000 +.fi +.RS +.P +.B +I/O latencies milliseconds: +.RE +.P +.nf + <=2, 4, 10, 20, 50, 100, 250, 500, 750, 1000, 2000, >=2000 +.fi +.RS +.P +.B +Disk utilization [v3]: +.RE +.P +.nf + disk name, read ios, write ios, read merges, write merges, read ticks, write ticks, time spent in queue, disk utilization percentage +.fi +.RS +.P +.B +Additional Info (dependent on continue_on_error, default off): +.RE +.P +.nf + total # errors, first error code +.fi +.RS +.P +.B +Additional Info (dependent on description being set): +.RE +.P +.nf + Text description +.fi +.P +Completion latency percentiles can be a grouping of up to 20 sets, so for the +terse output fio writes all of them. Each field will look like this: +.P +.nf + 1.00%=6112 +.fi +.P +which is the Xth percentile, and the `usec' latency associated with it. +.P +For \fBDisk utilization\fR, all disks used by fio are shown. So for each disk there +will be a disk utilization section. 
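+.P
+Because the fields are plain semicolon-separated text, standard tools can
+slice them. As a sketch (hypothetical `results.txt' holding v3 terse
+lines), the read bandwidth \-\- field 7 in the v3 layout listed below \-\-
+could be extracted with:
+.RS
+.P
+$ cut \-d ';' \-f 7 results.txt
+.RE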
+.P
+Below is a single line containing short names for each of the fields in the
+minimal output v3, separated by semicolons:
+.P
+.nf
+ terse_version_3;fio_version;jobname;groupid;error;read_kb;read_bandwidth;read_iops;read_runtime_ms;read_slat_min;read_slat_max;read_slat_mean;read_slat_dev;read_clat_min;read_clat_max;read_clat_mean;read_clat_dev;read_clat_pct01;read_clat_pct02;read_clat_pct03;read_clat_pct04;read_clat_pct05;read_clat_pct06;read_clat_pct07;read_clat_pct08;read_clat_pct09;read_clat_pct10;read_clat_pct11;read_clat_pct12;read_clat_pct13;read_clat_pct14;read_clat_pct15;read_clat_pct16;read_clat_pct17;read_clat_pct18;read_clat_pct19;read_clat_pct20;read_tlat_min;read_lat_max;read_lat_mean;read_lat_dev;read_bw_min;read_bw_max;read_bw_agg_pct;read_bw_mean;read_bw_dev;write_kb;write_bandwidth;write_iops;write_runtime_ms;write_slat_min;write_slat_max;write_slat_mean;write_slat_dev;write_clat_min;write_clat_max;write_clat_mean;write_clat_dev;write_clat_pct01;write_clat_pct02;write_clat_pct03;write_clat_pct04;write_clat_pct05;write_clat_pct06;write_clat_pct07;write_clat_pct08;write_clat_pct09;write_clat_pct10;write_clat_pct11;write_clat_pct12;write_clat_pct13;write_clat_pct14;write_clat_pct15;write_clat_pct16;write_clat_pct17;write_clat_pct18;write_clat_pct19;write_clat_pct20;write_tlat_min;write_lat_max;write_lat_mean;write_lat_dev;write_bw_min;write_bw_max;write_bw_agg_pct;write_bw_mean;write_bw_dev;cpu_user;cpu_sys;cpu_csw;cpu_mjf;cpu_minf;iodepth_1;iodepth_2;iodepth_4;iodepth_8;iodepth_16;iodepth_32;iodepth_64;lat_2us;lat_4us;lat_10us;lat_20us;lat_50us;lat_100us;lat_250us;lat_500us;lat_750us;lat_1000us;lat_2ms;lat_4ms;lat_10ms;lat_20ms;lat_50ms;lat_100ms;lat_250ms;lat_500ms;lat_750ms;lat_1000ms;lat_2000ms;lat_over_2000ms;disk_name;disk_read_iops;disk_write_iops;disk_read_merges;disk_write_merges;disk_read_ticks;write_ticks;disk_queue_time;disk_util
+.fi
+.P
+In client/server mode terse output differs from what appears when jobs are run
+locally. Disk utilization data is omitted from the standard terse output and
+for v3 and later appears on its own separate line at the end of each terse
+reporting cycle.
+.SH JSON OUTPUT
+The \fBjson\fR output format is intended to be both human readable and convenient
+for automated parsing. For the most part its sections mirror those of the
+\fBnormal\fR output. The \fBruntime\fR value is reported in msec and the \fBbw\fR value is
+reported in 1024 bytes per second units.
+.SH JSON+ OUTPUT
+The \fBjson+\fR output format is identical to the \fBjson\fR output format except that it
+adds a full dump of the completion latency bins. Each \fBbins\fR object contains a
+set of (key, value) pairs where keys are latency durations and values count how
+many I/Os had completion latencies of the corresponding duration. For example,
+consider:
+.RS
+.P
+"bins" : { "87552" : 1, "89600" : 1, "94720" : 1, "96768" : 1, "97792" : 1, "99840" : 1, "100864" : 2, "103936" : 6, "104960" : 534, "105984" : 5995, "107008" : 7529, ... }
+.RE
+.P
+This data indicates that one I/O required 87,552ns to complete, two I/Os required
+100,864ns to complete, and 7529 I/Os required 107,008ns to complete.
+.P
+Also included with fio is a Python script \fBfio_jsonplus_clat2csv\fR that takes
+json+ output and generates CSV-formatted latency data suitable for plotting.
+.P
+The latency durations actually represent the midpoints of latency intervals.
+For details refer to `stat.h' in the fio source.
+.SH TRACE FILE FORMAT
+There are two trace file formats that you can encounter.
The older (v1) format is
+unsupported since version 1.20\-rc3 (March 2008). It will still be described
+below in case you get an old trace and want to understand it.
+.P
+In any case the trace is a simple text file with a single action per line.
+.TP
+.B Trace file format v1
+Each line represents a single I/O action in the following format:
+.RS
+.RS
+.P
+rw, offset, length
+.RE
+.P
+where `rw=0/1' for read/write, and the `offset' and `length' entries are in bytes.
+.P
+This format is not supported in fio versions >= 1.20\-rc3.
+.RE
+.TP
+.B Trace file format v2
+The second version of the trace file format was added in fio version 1.17. It
+allows access to more than one file per trace and has a bigger set of possible
+file actions.
+.RS
+.P
+The first line of the trace file has to be:
+.RS
+.P
+"fio version 2 iolog"
+.RE
+.P
+Following this can be lines in two different formats, which are described below.
+.P
+.B
+The file management format:
+.RS
+filename action
+.P
+The `filename' is given as an absolute path. The `action' can be one of these:
+.RS
+.TP
+.B add
+Add the given `filename' to the trace.
+.TP
+.B open
+Open the file with the given `filename'. The `filename' has to have
+been added with the \fBadd\fR action before.
+.TP
+.B close
+Close the file with the given `filename'. The file has to have been
+\fBopen\fRed before.
+.RE
+.RE
+.P
+.B
+The file I/O action format:
+.RS
+filename action offset length
+.P
+The `filename' is given as an absolute path, and has to have been \fBadd\fRed and
+\fBopen\fRed before it can be used with this format. The `offset' and `length' are
+given in bytes. The `action' can be one of these:
+.RS
+.TP
+.B wait
+Wait for `offset' microseconds. Everything below 100 is discarded.
+The time is relative to the previous `wait' statement.
+.TP
+.B read
+Read `length' bytes beginning from `offset'.
+.TP
+.B write
+Write `length' bytes beginning from `offset'.
+.TP
+.B sync
+\fBfsync\fR\|(2) the file.
+.TP
+.B datasync
+\fBfdatasync\fR\|(2) the file.
+.TP
+.B trim
+Trim the given file from the given `offset' for `length' bytes.
+.RE
+.RE
+.SH I/O REPLAY \- MERGING TRACES
+Colocation is a common practice used to get the most out of a machine.
+Knowing which workloads play nicely with each other and which ones don't is
+a much harder task. While fio can replay workloads concurrently via multiple
+jobs, it leaves some variability up to the scheduler, making results harder to
+reproduce. Merging is a way to make the order of events consistent.
+.P
+Merging is integrated into I/O replay and done when a \fBmerge_blktrace_file\fR
+is specified. The list of files passed to \fBread_iolog\fR goes through the merge
+process, and a single merged file is stored to the specified file. The output file is
+passed on as if it were the only file passed to \fBread_iolog\fR. An example would
+look like:
+.RS
+.P
+$ fio \-\-read_iolog="<file1>:<file2>" \-\-merge_blktrace_file="<output_file>"
+.RE
+.P
+Creating only the merged file can be done by passing the command line argument
+\fBmerge\-blktrace\-only\fR.
+.P
+Scaling traces can be done to see the relative impact of any particular trace
+being slowed down or sped up. \fBmerge_blktrace_scalars\fR takes in a colon-separated
+list of percentage scalars. It is index paired with the files passed
+to \fBread_iolog\fR.
+.P
+With scaling, it may be desirable to match the running time of all traces.
+This can be done with \fBmerge_blktrace_iters\fR. It is index paired with
+\fBread_iolog\fR just like \fBmerge_blktrace_scalars\fR.
+.P
+As an example, consider two traces, A and B, each 60s long. If we want to see
+the impact of trace A issuing I/Os twice as fast while repeating trace A over the
+runtime of trace B, the following can be done:
+.RS
+.P
+$ fio \-\-read_iolog="<trace_a>:<trace_b>" \-\-merge_blktrace_file="<output_file>" \-\-merge_blktrace_scalars="50:100" \-\-merge_blktrace_iters="2:1"
+.RE
+.P
+This runs trace A at 2x the speed twice for approximately the same runtime as
+a single run of trace B.
+.SH CPU IDLENESS PROFILING
+In some cases, we want to understand CPU overhead in a test. For example, we
+test patches to see whether they reduce CPU usage.
+Fio implements a balloon approach, creating a thread per CPU that runs at idle
+priority, meaning that it only runs when nobody else needs the CPU.
+By measuring the amount of work completed by the thread, the idleness of each CPU
+can be derived accordingly.
+.P
+A unit of work is defined as touching a full page of unsigned characters. The mean and
+standard deviation of the time to complete a unit of work are reported in the "unit work"
+section. Options can be chosen to report detailed percpu idleness or overall
+system idleness by aggregating percpu stats.
+.SH VERIFICATION AND TRIGGERS
+When data verification is done, fio is usually run in one of two ways. The first
+is a normal write job of some sort with verify enabled. When the write phase has
+completed, fio switches to reads and verifies everything it wrote. The second
+model is running just the write phase, and then later on running the same job
+(but with reads instead of writes) to repeat the same I/O patterns and verify
+the contents. Both of these methods depend on the write phase being completed,
+as fio otherwise has no idea how much data was written.
+.P
+With verification triggers, fio supports dumping the current write state to
+local files. Then a subsequent read verify workload can load this state and know
+exactly where to stop. This is useful for testing cases where power is cut to a
+server in a managed fashion, for instance.
+.P
+A verification trigger consists of two things:
+.RS
+.P
+1) Storing the write state of each job.
+.P
+2) Executing a trigger command.
+.RE
+.P
+The write state is relatively small, on the order of hundreds of bytes to single
+kilobytes. It contains information on the number of completions done, the last X
+completions, etc.
+.P
+A trigger is invoked either through creation ('touch') of a specified file in
+the system, or through a timeout setting. If fio is run with
+`\-\-trigger\-file=/tmp/trigger\-file', then it will continually
+check for the existence of `/tmp/trigger\-file'. When it sees this file, it
+will fire off the trigger (thus saving state, and executing the trigger
+command).
+.P
+For client/server runs, there's both a local and remote trigger. If fio is
+running as a server backend, it will send the job states back to the client for
+safe storage, then execute the remote trigger, if specified. If a local trigger
+is specified, the server will still send back the write state, but the client
+will then execute the trigger.
+.RE
+.P
+.B Verification trigger example
+.RS
+Let's say we want to run a powercut test on the remote Linux machine 'server'.
+Our write workload is in `write\-test.fio'. We want to cut power to 'server' at
+some point during the run, and we'll run this test from the safety of our local
+machine, 'localbox'.
On the server, we'll start the fio backend normally:
+.RS
+.P
+server# fio \-\-server
+.RE
+.P
+and on the client, we'll fire off the workload:
+.RS
+.P
+localbox$ fio \-\-client=server \-\-trigger\-file=/tmp/my\-trigger \-\-trigger\-remote="bash \-c "echo b > /proc/sysrq\-trigger""
+.RE
+.P
+We set `/tmp/my\-trigger' as the trigger file, and we tell fio to execute:
+.RS
+.P
+echo b > /proc/sysrq\-trigger
+.RE
+.P
+on the server once it has received the trigger and sent us the write state. This
+will work, but it's not really cutting power to the server, it's merely
+abruptly rebooting it. If we have a remote way of cutting power to the server
+through IPMI or similar, we could do that through a local trigger command
+instead. Let's assume we have a script, ipmi\-reboot, that does an IPMI reboot
+of a given hostname. On localbox, we could then run fio with a local trigger
+instead:
+.RS
+.P
+localbox$ fio \-\-client=server \-\-trigger\-file=/tmp/my\-trigger \-\-trigger="ipmi\-reboot server"
+.RE
+.P
+For this case, fio would wait for the server to send us the write state, then
+execute `ipmi\-reboot server' when that happened.
+.RE
+.P
+.B Loading verify state
+.RS
+To load stored write state, a read verification job file must contain the
+\fBverify_state_load\fR option. If that is set, fio will load the previously
+stored state. For a local fio run this is done by loading the files directly,
+and on a client/server run, the server backend will ask the client to send the
+files over and load them from there.
+.RE
+.SH LOG FILE FORMATS
+Fio supports a variety of log file formats, for logging latencies, bandwidth,
+and IOPS. The logs share a common format, which looks like this:
+.RS
+.P
+time (msec), value, data direction, block size (bytes), offset (bytes)
+.RE
+.P
+`Time' for the log entry is always in milliseconds. The `value' logged depends
+on the type of log; it will be one of the following:
+.RS
+.TP
+.B Latency log
+Value is latency in nsecs
+.TP
+.B Bandwidth log
+Value is in KiB/sec
+.TP
+.B IOPS log
+Value is IOPS
+.RE
+.P
+`Data direction' is one of the following:
+.RS
+.TP
+.B 0
+I/O is a READ
+.TP
+.B 1
+I/O is a WRITE
+.TP
+.B 2
+I/O is a TRIM
+.RE
+.P
+The entry's `block size' is always in bytes. The `offset' is the position in bytes
+from the start of the file for that particular I/O. The logging of the offset can be
+toggled with \fBlog_offset\fR.
+.P
+Fio defaults to logging every individual I/O but when windowed logging is set
+through \fBlog_avg_msec\fR, either the average (by default) or the maximum
+(\fBlog_max_value\fR is set) `value' seen over the specified period of time
+is recorded. Each `data direction' seen within the window period will aggregate
+its values in a separate row. Further, when using windowed logging the `block
+size' and `offset' entries will always contain 0.
+.SH CLIENT / SERVER
+Normally fio is invoked as a stand-alone application on the machine where the
+I/O workload should be generated. However, the backend and frontend of fio can
+be run separately, i.e. the fio server can generate an I/O workload on the "Device
+Under Test" while being controlled by a client on another machine.
+.P
+Start the server on the machine which has access to the storage DUT:
+.RS
+.P
+$ fio \-\-server=args
+.RE
+.P
+where `args' defines what fio listens to. The arguments are of the form
+`type,hostname' or `IP,port'. `type' is either `ip' (or ip4) for TCP/IP
+v4, `ip6' for TCP/IP v6, or `sock' for a local unix domain socket.
+`hostname' is either a hostname or IP address, and `port' is the port to listen
+to (only valid for TCP/IP, not a local socket). Some examples:
+.RS
+.TP
+1) \fBfio \-\-server\fR
+Start a fio server, listening on all interfaces on the default port (8765).
+.TP
+2) \fBfio \-\-server=ip:hostname,4444\fR
+Start a fio server, listening on IP belonging to hostname and on port 4444.
+.TP
+3) \fBfio \-\-server=ip6:::1,4444\fR
+Start a fio server, listening on IPv6 localhost ::1 and on port 4444.
+.TP
+4) \fBfio \-\-server=,4444\fR
+Start a fio server, listening on all interfaces on port 4444.
+.TP
+5) \fBfio \-\-server=1.2.3.4\fR
+Start a fio server, listening on IP 1.2.3.4 on the default port.
+.TP
+6) \fBfio \-\-server=sock:/tmp/fio.sock\fR
+Start a fio server, listening on the local socket `/tmp/fio.sock'.
+.RE
+.P
+Once a server is running, a "client" can connect to the fio server with:
+.RS
+.P
+$ fio <local\-args> \-\-client=<server> <remote\-args> <job file(s)>
+.RE
+.P
+where `local\-args' are arguments for the client where it is running, `server'
+is the connect string, and `remote\-args' and `job file(s)' are sent to the
+server. The `server' string follows the same format as it does on the server
+side, to allow IP/hostname/socket and port strings.
+.P
+Fio can connect to multiple servers this way:
+.RS
+.P
+$ fio \-\-client=<server1> <job file(s)> \-\-client=<server2> <job file(s)>
+.RE
+.P
+If the job file is located on the fio server, then you can tell the server to
+load a local file as well. This is done by using \fB\-\-remote\-config\fR:
+.RS
+.P
+$ fio \-\-client=server \-\-remote\-config /path/to/file.fio
+.RE
+.P
+Then fio will open this local (to the server) job file instead of being passed
+one from the client.
+.P
+If you have many servers (example: 100 VMs/containers), you can input a pathname
+of a file containing host IPs/names as the parameter value for the
+\fB\-\-client\fR option. For example, here is an example `host.list'
+file containing 2 hostnames:
+.RS
+.P
+.PD 0
+host1.your.dns.domain
+.P
+host2.your.dns.domain
+.PD
+.RE
+.P
+The fio command would then be:
+.RS
+.P
+$ fio \-\-client=host.list
+.RE
+.P
+In this mode, you cannot input server-specific parameters or job files \-\- all
+servers receive the same job file.
+.P
+In order to let `fio \-\-client' runs from multiple hosts use a shared
+filesystem, `fio \-\-client' now prepends the IP address of the server to the
+filename. For example, if fio is using the directory `/mnt/nfs/fio' and is
+writing filename `fileio.tmp', with a \fB\-\-client\fR `hostfile'
+containing two hostnames `h1' and `h2' with IP addresses 192.168.10.120 and
+192.168.10.121, then fio will create two files:
+.RS
+.P
+.PD 0
+/mnt/nfs/fio/192.168.10.120.fileio.tmp
+.P
+/mnt/nfs/fio/192.168.10.121.fileio.tmp
+.PD
+.RE
+.P
+Terse output in client/server mode will differ slightly from what is produced
+when fio is run in stand-alone mode. See the terse output section for details.
+.SH AUTHORS
+.B fio
+was written by Jens Axboe <axboe@kernel.dk>.
+.br
+This man page was written by Aaron Carroll <aaronc@cse.unsw.edu.au> based
+on documentation by Jens Axboe.
+.br
+This man page was rewritten by Tomohiro Kusumi <tkusumi@tuxera.com> based
+on documentation by Jens Axboe.
+.SH "REPORTING BUGS"
+Report bugs to the \fBfio\fR mailing list <fio@vger.kernel.org>.
+.br
+See \fBREPORTING\-BUGS\fR.
+.P
+\fBREPORTING\-BUGS\fR: \fIhttp://git.kernel.dk/cgit/fio/plain/REPORTING\-BUGS\fR
+.SH "SEE ALSO"
+For further documentation see \fBHOWTO\fR and \fBREADME\fR.
+.br
+Sample jobfiles are available in the `examples/' directory.
+.br
+These are typically located under `/usr/share/doc/fio'.
+.P +\fBHOWTO\fR: \fIhttp://git.kernel.dk/cgit/fio/plain/HOWTO\fR +.br +\fBREADME\fR: \fIhttp://git.kernel.dk/cgit/fio/plain/README\fR diff --git a/fio.c b/fio.c new file mode 100644 index 0000000..f19db1b --- /dev/null +++ b/fio.c @@ -0,0 +1,67 @@ +/* + * fio - the flexible io tester + * + * Copyright (C) 2005 Jens Axboe + * Copyright (C) 2006-2012 Jens Axboe + * + * The license below covers all files distributed with fio unless otherwise + * noted in the file itself. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + * + */ +#include "fio.h" + +int main(int argc, char *argv[], char *envp[]) +{ + int ret = 1; + + compiletime_assert(TD_NR <= TD_ENG_FLAG_SHIFT, "TD_ENG_FLAG_SHIFT"); + + if (initialize_fio(envp)) + return 1; + +#if !defined(CONFIG_GETTIMEOFDAY) && !defined(CONFIG_CLOCK_GETTIME) +#error "No available clock source!" +#endif + + if (fio_server_create_sk_key()) + goto done; + + if (parse_options(argc, argv)) + goto done_key; + + /* + * line buffer stdout to avoid output lines from multiple + * threads getting mixed + */ + setvbuf(stdout, NULL, _IOLBF, 0); + + fio_time_init(); + + if (nr_clients) { + set_genesis_time(); + + if (fio_start_all_clients()) + goto done_key; + ret = fio_handle_clients(&fio_client_ops); + } else + ret = fio_backend(NULL); + +done_key: + fio_server_destroy_sk_key(); +done: + deinitialize_fio(); + return ret; +} diff --git a/fio.h b/fio.h new file mode 100644 index 0000000..2a9eef4 --- /dev/null +++ b/fio.h @@ -0,0 +1,873 @@ +#ifndef FIO_H +#define FIO_H + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "compiler/compiler.h" +#include "thread_options.h" +#include "flist.h" +#include "fifo.h" +#include "arch/arch.h" +#include "os/os.h" +#include "log.h" +#include "debug.h" +#include "file.h" +#include "io_ddir.h" +#include "ioengines.h" +#include "iolog.h" +#include "helpers.h" +#include "minmax.h" +#include "options.h" +#include "profile.h" +#include "fio_time.h" +#include "gettime.h" +#include "oslib/getopt.h" +#include "lib/rand.h" +#include "lib/rbtree.h" +#include "lib/num2str.h" +#include "lib/memalign.h" +#include "smalloc.h" +#include "client.h" +#include "server.h" +#include "stat.h" +#include "flow.h" +#include "io_u.h" +#include "io_u_queue.h" +#include "workqueue.h" +#include "steadystate.h" +#include "lib/nowarn_snprintf.h" + +#ifdef CONFIG_SOLARISAIO +#include +#endif + +#ifdef CONFIG_LIBNUMA +#include +#include + +/* + * "local" is pseudo-policy + */ +#ifndef MPOL_LOCAL +#define MPOL_LOCAL 4 +#endif +#endif + +#ifdef CONFIG_CUDA +#include +#endif + +struct fio_sem; + +/* + * offset generator types + */ +enum { + RW_SEQ_SEQ = 0, + RW_SEQ_IDENT, +}; + +enum { + __TD_F_VER_BACKLOG = 0, + __TD_F_TRIM_BACKLOG, + __TD_F_READ_IOLOG, + __TD_F_REFILL_BUFFERS, + __TD_F_SCRAMBLE_BUFFERS, + __TD_F_DO_VERIFY, + __TD_F_PROFILE_OPS, + __TD_F_COMPRESS, + 
__TD_F_COMPRESS_LOG, + __TD_F_VSTATE_SAVED, + __TD_F_NEED_LOCK, + __TD_F_CHILD, + __TD_F_NO_PROGRESS, + __TD_F_REGROW_LOGS, + __TD_F_MMAP_KEEP, + __TD_F_DIRS_CREATED, + __TD_F_CHECK_RATE, + __TD_F_LAST, /* not a real bit, keep last */ +}; + +enum { + TD_F_VER_BACKLOG = 1U << __TD_F_VER_BACKLOG, + TD_F_TRIM_BACKLOG = 1U << __TD_F_TRIM_BACKLOG, + TD_F_READ_IOLOG = 1U << __TD_F_READ_IOLOG, + TD_F_REFILL_BUFFERS = 1U << __TD_F_REFILL_BUFFERS, + TD_F_SCRAMBLE_BUFFERS = 1U << __TD_F_SCRAMBLE_BUFFERS, + TD_F_DO_VERIFY = 1U << __TD_F_DO_VERIFY, + TD_F_PROFILE_OPS = 1U << __TD_F_PROFILE_OPS, + TD_F_COMPRESS = 1U << __TD_F_COMPRESS, + TD_F_COMPRESS_LOG = 1U << __TD_F_COMPRESS_LOG, + TD_F_VSTATE_SAVED = 1U << __TD_F_VSTATE_SAVED, + TD_F_NEED_LOCK = 1U << __TD_F_NEED_LOCK, + TD_F_CHILD = 1U << __TD_F_CHILD, + TD_F_NO_PROGRESS = 1U << __TD_F_NO_PROGRESS, + TD_F_REGROW_LOGS = 1U << __TD_F_REGROW_LOGS, + TD_F_MMAP_KEEP = 1U << __TD_F_MMAP_KEEP, + TD_F_DIRS_CREATED = 1U << __TD_F_DIRS_CREATED, + TD_F_CHECK_RATE = 1U << __TD_F_CHECK_RATE, +}; + +enum { + FIO_RAND_BS_OFF = 0, + FIO_RAND_BS1_OFF, + FIO_RAND_BS2_OFF, + FIO_RAND_VER_OFF, + FIO_RAND_MIX_OFF, + FIO_RAND_FILE_OFF, + FIO_RAND_BLOCK_OFF, + FIO_RAND_FILE_SIZE_OFF, + FIO_RAND_TRIM_OFF, + FIO_RAND_BUF_OFF, + FIO_RAND_SEQ_RAND_READ_OFF, + FIO_RAND_SEQ_RAND_WRITE_OFF, + FIO_RAND_SEQ_RAND_TRIM_OFF, + FIO_RAND_START_DELAY, + FIO_DEDUPE_OFF, + FIO_RAND_POISSON_OFF, + FIO_RAND_ZONE_OFF, + FIO_RAND_POISSON2_OFF, + FIO_RAND_POISSON3_OFF, + FIO_RAND_PRIO_CMDS, + FIO_RAND_NR_OFFS, +}; + +enum { + IO_MODE_INLINE = 0, + IO_MODE_OFFLOAD = 1, + + RATE_PROCESS_LINEAR = 0, + RATE_PROCESS_POISSON = 1, +}; + +enum { + F_ADV_NONE = 0, + F_ADV_TYPE, + F_ADV_RANDOM, + F_ADV_SEQUENTIAL, +}; + +/* + * Per-thread/process specific data. Only used for the network client + * for now. + */ +void sk_out_assign(struct sk_out *); +void sk_out_drop(void); + +struct zone_split_index { + uint8_t size_perc; + uint8_t size_perc_prev; + uint64_t size; + uint64_t size_prev; +}; + +#define FIO_MAX_OPEN_ZBD_ZONES 128 + +/* + * This describes a single thread/process executing a fio job. 
+ */ +struct thread_data { + struct flist_head opt_list; + unsigned long flags; + struct thread_options o; + void *eo; + pthread_t thread; + unsigned int thread_number; + unsigned int subjob_number; + unsigned int groupid; + struct thread_stat ts __attribute__ ((aligned(8))); + + int client_type; + + struct io_log *slat_log; + struct io_log *clat_log; + struct io_log *clat_hist_log; + struct io_log *lat_log; + struct io_log *bw_log; + struct io_log *iops_log; + + struct workqueue log_compress_wq; + + struct thread_data *parent; + + uint64_t stat_io_bytes[DDIR_RWDIR_CNT]; + struct timespec bw_sample_time; + + uint64_t stat_io_blocks[DDIR_RWDIR_CNT]; + struct timespec iops_sample_time; + + volatile int update_rusage; + struct fio_sem *rusage_sem; + struct rusage ru_start; + struct rusage ru_end; + + struct fio_file **files; + unsigned char *file_locks; + unsigned int files_size; + unsigned int files_index; + unsigned int nr_open_files; + unsigned int nr_done_files; + union { + unsigned int next_file; + struct frand_state next_file_state; + }; + union { + struct zipf_state next_file_zipf; + struct gauss_state next_file_gauss; + }; + union { + double zipf_theta; + double pareto_h; + double gauss_dev; + }; + int error; + int sig; + int done; + int stop_io; + pid_t pid; + char *orig_buffer; + size_t orig_buffer_size; + volatile int runstate; + volatile bool terminate; + bool last_was_sync; + enum fio_ddir last_ddir; + + int mmapfd; + + void *iolog_buf; + FILE *iolog_f; + + uint64_t rand_seeds[FIO_RAND_NR_OFFS]; + + struct frand_state bsrange_state[DDIR_RWDIR_CNT]; + struct frand_state verify_state; + struct frand_state trim_state; + struct frand_state delay_state; + + struct frand_state buf_state; + struct frand_state buf_state_prev; + struct frand_state dedupe_state; + struct frand_state zone_state; + struct frand_state prio_state; + + struct zone_split_index **zone_state_index; + + unsigned int verify_batch; + unsigned int trim_batch; + + struct thread_io_list *vstate; + + int shm_id; + + /* + * IO engine hooks, contains everything needed to submit an io_u + * to any of the available IO engines. + */ + struct ioengine_ops *io_ops; + int io_ops_init; + + /* + * IO engine private data and dlhandle. 
+ */ + void *io_ops_data; + void *io_ops_dlhandle; + + /* + * Queue depth of io_u's that fio MIGHT do + */ + unsigned int cur_depth; + + /* + * io_u's about to be committed + */ + unsigned int io_u_queued; + + /* + * io_u's submitted but not completed yet + */ + unsigned int io_u_in_flight; + + /* + * List of free and busy io_u's + */ + struct io_u_ring io_u_requeues; + struct io_u_queue io_u_freelist; + struct io_u_queue io_u_all; + pthread_mutex_t io_u_lock; + pthread_cond_t free_cond; + + /* + * async verify offload + */ + struct flist_head verify_list; + pthread_t *verify_threads; + unsigned int nr_verify_threads; + pthread_cond_t verify_cond; + int verify_thread_exit; + + /* + * Rate state + */ + uint64_t rate_bps[DDIR_RWDIR_CNT]; + uint64_t rate_next_io_time[DDIR_RWDIR_CNT]; + unsigned long rate_bytes[DDIR_RWDIR_CNT]; + unsigned long rate_blocks[DDIR_RWDIR_CNT]; + unsigned long long rate_io_issue_bytes[DDIR_RWDIR_CNT]; + struct timespec lastrate[DDIR_RWDIR_CNT]; + int64_t last_usec[DDIR_RWDIR_CNT]; + struct frand_state poisson_state[DDIR_RWDIR_CNT]; + + /* + * Enforced rate submission/completion workqueue + */ + struct workqueue io_wq; + + uint64_t total_io_size; + uint64_t fill_device_size; + + /* + * Issue side + */ + uint64_t io_issues[DDIR_RWDIR_CNT]; + uint64_t io_issue_bytes[DDIR_RWDIR_CNT]; + uint64_t loops; + + /* + * Completions + */ + uint64_t io_blocks[DDIR_RWDIR_CNT]; + uint64_t this_io_blocks[DDIR_RWDIR_CNT]; + uint64_t io_bytes[DDIR_RWDIR_CNT]; + uint64_t this_io_bytes[DDIR_RWDIR_CNT]; + uint64_t io_skip_bytes; + uint64_t zone_bytes; + struct fio_sem *sem; + uint64_t bytes_done[DDIR_RWDIR_CNT]; + + /* + * State for random io, a bitmap of blocks done vs not done + */ + struct frand_state random_state; + + struct timespec start; /* start of this loop */ + struct timespec epoch; /* time job was started */ + unsigned long long unix_epoch; /* Time job was started, unix epoch based. */ + struct timespec last_issue; + long time_offset; + struct timespec ts_cache; + struct timespec terminate_time; + unsigned int ts_cache_nr; + unsigned int ts_cache_mask; + bool ramp_time_over; + + /* + * Time since last latency_window was started + */ + struct timespec latency_ts; + unsigned int latency_qd; + unsigned int latency_qd_high; + unsigned int latency_qd_low; + unsigned int latency_failed; + uint64_t latency_ios; + int latency_end_run; + + /* + * read/write mixed workload state + */ + struct frand_state rwmix_state; + unsigned long rwmix_issues; + enum fio_ddir rwmix_ddir; + unsigned int ddir_seq_nr; + + /* + * rand/seq mixed workload state + */ + struct frand_state seq_rand_state[DDIR_RWDIR_CNT]; + + /* + * IO history logs for verification. We use a tree for sorting, + * if we are overwriting. Otherwise just use a fifo. 
+ */ + struct rb_root io_hist_tree; + struct flist_head io_hist_list; + unsigned long io_hist_len; + + /* + * For IO replaying + */ + struct flist_head io_log_list; + FILE *io_log_rfile; + unsigned int io_log_current; + unsigned int io_log_checkmark; + unsigned int io_log_highmark; + struct timespec io_log_highmark_time; + + /* + * For tracking/handling discards + */ + struct flist_head trim_list; + unsigned long trim_entries; + + /* + * for fileservice, how often to switch to a new file + */ + unsigned int file_service_nr; + unsigned int file_service_left; + struct fio_file *file_service_file; + + unsigned int sync_file_range_nr; + + /* + * For generating file sizes + */ + struct frand_state file_size_state; + + /* + * Error counts + */ + unsigned int total_err_count; + int first_error; + + struct fio_flow *flow; + + /* + * Can be overloaded by profiles + */ + struct prof_io_ops prof_io_ops; + void *prof_data; + + void *pinned_mem; + + struct steadystate_data ss; + + char verror[FIO_VERROR_SIZE]; + +#ifdef CONFIG_CUDA + /* + * for GPU memory management + */ + int gpu_dev_cnt; + int gpu_dev_id; + CUdevice cu_dev; + CUcontext cu_ctx; + CUdeviceptr dev_mem_ptr; +#endif + +}; + +/* + * when should interactive ETA output be generated + */ +enum { + FIO_ETA_AUTO, + FIO_ETA_ALWAYS, + FIO_ETA_NEVER, +}; + +#define __td_verror(td, err, msg, func) \ + do { \ + unsigned int ____e = (err); \ + if ((td)->error) \ + break; \ + (td)->error = ____e; \ + if (!(td)->first_error) \ + nowarn_snprintf(td->verror, sizeof(td->verror), \ + "file:%s:%d, func=%s, error=%s", \ + __FILE__, __LINE__, (func), (msg)); \ + } while (0) + + +#define td_clear_error(td) do { \ + (td)->error = 0; \ + if ((td)->parent) \ + (td)->parent->error = 0; \ +} while (0) + +#define td_verror(td, err, func) do { \ + __td_verror((td), (err), strerror((err)), (func)); \ + if ((td)->parent) \ + __td_verror((td)->parent, (err), strerror((err)), (func)); \ +} while (0) + +#define td_vmsg(td, err, msg, func) do { \ + __td_verror((td), (err), (msg), (func)); \ + if ((td)->parent) \ + __td_verror((td)->parent, (err), (msg), (func)); \ +} while (0) + +#define __fio_stringify_1(x) #x +#define __fio_stringify(x) __fio_stringify_1(x) + +extern bool exitall_on_terminate; +extern unsigned int thread_number; +extern unsigned int stat_number; +extern int shm_id; +extern int groupid; +extern int output_format; +extern int append_terse_output; +extern int temp_stall_ts; +extern uintptr_t page_mask, page_size; +extern bool read_only; +extern int eta_print; +extern int eta_new_line; +extern unsigned int eta_interval_msec; +extern unsigned long done_secs; +extern int fio_gtod_offload; +extern int fio_gtod_cpu; +extern enum fio_cs fio_clock_source; +extern int fio_clock_source_set; +extern int warnings_fatal; +extern int terse_version; +extern bool is_backend; +extern bool is_local_backend; +extern int nr_clients; +extern bool log_syslog; +extern int status_interval; +extern const char fio_version_string[]; +extern char *trigger_file; +extern char *trigger_cmd; +extern char *trigger_remote_cmd; +extern long long trigger_timeout; +extern char *aux_path; + +extern struct thread_data *threads; + +static inline bool is_running_backend(void) +{ + return is_backend || is_local_backend; +} + +extern bool eta_time_within_slack(unsigned int time); + +static inline void fio_ro_check(const struct thread_data *td, struct io_u *io_u) +{ + assert(!(io_u->ddir == DDIR_WRITE && !td_write(td)) && + !(io_u->ddir == DDIR_TRIM && !td_trim(td))); +} + +#define REAL_MAX_JOBS 
4096
+
+static inline bool should_fsync(struct thread_data *td)
+{
+	if (td->last_was_sync)
+		return false;
+	if (td_write(td) || td->o.override_sync)
+		return true;
+
+	return false;
+}
+
+/*
+ * Init/option functions
+ */
+extern int __must_check fio_init_options(void);
+extern int __must_check parse_options(int, char **);
+extern int parse_jobs_ini(char *, int, int, int);
+extern int parse_cmd_line(int, char **, int);
+extern int fio_backend(struct sk_out *);
+extern void reset_fio_state(void);
+extern void clear_io_state(struct thread_data *, int);
+extern int fio_options_parse(struct thread_data *, char **, int);
+extern void fio_keywords_init(void);
+extern void fio_keywords_exit(void);
+extern int fio_cmd_option_parse(struct thread_data *, const char *, char *);
+extern int fio_cmd_ioengine_option_parse(struct thread_data *, const char *, char *);
+extern void fio_fill_default_options(struct thread_data *);
+extern int fio_show_option_help(const char *);
+extern void fio_options_set_ioengine_opts(struct option *long_options, struct thread_data *td);
+extern void fio_options_dup_and_init(struct option *);
+extern char *fio_option_dup_subs(const char *);
+extern void fio_options_mem_dupe(struct thread_data *);
+extern void td_fill_rand_seeds(struct thread_data *);
+extern void td_fill_verify_state_seed(struct thread_data *);
+extern void add_job_opts(const char **, int);
+extern int ioengine_load(struct thread_data *);
+extern bool parse_dryrun(void);
+extern int fio_running_or_pending_io_threads(void);
+extern int fio_set_fd_nonblocking(int, const char *);
+extern void sig_show_status(int sig);
+extern struct thread_data *get_global_options(void);
+
+extern uintptr_t page_mask;
+extern uintptr_t page_size;
+extern int initialize_fio(char *envp[]);
+extern void deinitialize_fio(void);
+
+#define FIO_GETOPT_JOB		0x89000000
+#define FIO_GETOPT_IOENGINE	0x98000000
+#define FIO_NR_OPTIONS		(FIO_MAX_OPTS + 128)
+
+/*
+ * ETA/status stuff
+ */
+extern void print_thread_status(void);
+extern void print_status_init(int);
+extern char *fio_uint_to_kmg(unsigned int val);
+
+/*
+ * Thread life cycle. Once a thread has a runstate beyond TD_INITIALIZED, it
+ * will never go back again. It may cycle between running/verifying/fsyncing.
+ * Once the thread reaches TD_EXITED, it is just waiting for the core to
+ * reap it.
+ */
+enum {
+	TD_NOT_CREATED = 0,
+	TD_CREATED,
+	TD_INITIALIZED,
+	TD_RAMP,
+	TD_SETTING_UP,
+	TD_RUNNING,
+	TD_PRE_READING,
+	TD_VERIFYING,
+	TD_FSYNCING,
+	TD_FINISHING,
+	TD_EXITED,
+	TD_REAPED,
+	TD_LAST,
+	TD_NR,
+};
+
+#define TD_ENG_FLAG_SHIFT	17
+#define TD_ENG_FLAG_MASK	((1U << 17) - 1)
+
+static inline void td_set_ioengine_flags(struct thread_data *td)
+{
+	td->flags = (~(TD_ENG_FLAG_MASK << TD_ENG_FLAG_SHIFT) & td->flags) |
+		    (td->io_ops->flags << TD_ENG_FLAG_SHIFT);
+}
+
+static inline bool td_ioengine_flagged(struct thread_data *td,
+				       enum fio_ioengine_flags flags)
+{
+	return ((td->flags >> TD_ENG_FLAG_SHIFT) & flags) != 0;
+}
+
+extern void td_set_runstate(struct thread_data *, int);
+extern int td_bump_runstate(struct thread_data *, int);
+extern void td_restore_runstate(struct thread_data *, int);
+extern const char *runstate_to_name(int runstate);
+
+/*
+ * Allow 300 seconds (FIO_REAP_TIMEOUT) for a job to quit on its own,
+ * otherwise reap with a vengeance.
+ */ +#define FIO_REAP_TIMEOUT 300 + +enum { + TERMINATE_NONE = 0, + TERMINATE_GROUP = 1, + TERMINATE_STONEWALL = 2, + TERMINATE_ALL = -1, +}; + +extern void fio_terminate_threads(unsigned int, unsigned int); +extern void fio_mark_td_terminate(struct thread_data *); + +/* + * Memory helpers + */ +extern int __must_check fio_pin_memory(struct thread_data *); +extern void fio_unpin_memory(struct thread_data *); +extern int __must_check allocate_io_mem(struct thread_data *); +extern void free_io_mem(struct thread_data *); +extern void free_threads_shm(void); + +#ifdef FIO_INTERNAL +#define PTR_ALIGN(ptr, mask) \ + (char *) (((uintptr_t) (ptr) + (mask)) & ~(mask)) +#endif + +/* + * Reset stats after ramp time completes + */ +extern void reset_all_stats(struct thread_data *); + +extern int io_queue_event(struct thread_data *td, struct io_u *io_u, int *ret, + enum fio_ddir ddir, uint64_t *bytes_issued, int from_verify, + struct timespec *comp_time); + +/* + * Latency target helpers + */ +extern void lat_target_check(struct thread_data *); +extern void lat_target_init(struct thread_data *); +extern void lat_target_reset(struct thread_data *); + +/* + * Iterates all threads/processes within all the defined jobs + */ +#define for_each_td(td, i) \ + for ((i) = 0, (td) = &threads[0]; (i) < (int) thread_number; (i)++, (td)++) +#define for_each_file(td, f, i) \ + if ((td)->files_index) \ + for ((i) = 0, (f) = (td)->files[0]; \ + (i) < (td)->o.nr_files && ((f) = (td)->files[i]) != NULL; \ + (i)++) + +static inline bool fio_fill_issue_time(struct thread_data *td) +{ + if (td->o.read_iolog_file || + !td->o.disable_clat || !td->o.disable_slat || !td->o.disable_bw) + return true; + + return false; +} + +static inline bool option_check_rate(struct thread_data *td, enum fio_ddir ddir) +{ + struct thread_options *o = &td->o; + + /* + * If some rate setting was given, we need to check it + */ + if (o->rate[ddir] || o->ratemin[ddir] || o->rate_iops[ddir] || + o->rate_iops_min[ddir]) + return true; + + return false; +} + +static inline bool __should_check_rate(struct thread_data *td) +{ + return (td->flags & TD_F_CHECK_RATE) != 0; +} + +static inline bool should_check_rate(struct thread_data *td) +{ + if (!__should_check_rate(td)) + return false; + + return ddir_rw_sum(td->bytes_done) != 0; +} + +static inline unsigned long long td_max_bs(struct thread_data *td) +{ + unsigned long long max_bs; + + max_bs = max(td->o.max_bs[DDIR_READ], td->o.max_bs[DDIR_WRITE]); + return max(td->o.max_bs[DDIR_TRIM], max_bs); +} + +static inline unsigned long long td_min_bs(struct thread_data *td) +{ + unsigned long long min_bs; + + min_bs = min(td->o.min_bs[DDIR_READ], td->o.min_bs[DDIR_WRITE]); + return min(td->o.min_bs[DDIR_TRIM], min_bs); +} + +static inline bool td_async_processing(struct thread_data *td) +{ + return (td->flags & TD_F_NEED_LOCK) != 0; +} + +static inline bool td_offload_overlap(struct thread_data *td) +{ + return td->o.serialize_overlap && td->o.io_submit_mode == IO_MODE_OFFLOAD; +} + +/* + * We currently only need to do locking if we have verifier threads + * accessing our internal structures too + */ +static inline void __td_io_u_lock(struct thread_data *td) +{ + pthread_mutex_lock(&td->io_u_lock); +} + +static inline void __td_io_u_unlock(struct thread_data *td) +{ + pthread_mutex_unlock(&td->io_u_lock); +} + +static inline void td_io_u_free_notify(struct thread_data *td) +{ + if (td_async_processing(td)) + pthread_cond_signal(&td->free_cond); +} + +static inline void td_flags_clear(struct thread_data *td, 
unsigned int *flags, + unsigned int value) +{ + if (!td_async_processing(td)) + *flags &= ~value; + else + __sync_fetch_and_and(flags, ~value); +} + +static inline void td_flags_set(struct thread_data *td, unsigned int *flags, + unsigned int value) +{ + if (!td_async_processing(td)) + *flags |= value; + else + __sync_fetch_and_or(flags, value); +} + +extern const char *fio_get_arch_string(int); +extern const char *fio_get_os_string(int); + +enum { + __FIO_OUTPUT_TERSE = 0, + __FIO_OUTPUT_JSON = 1, + __FIO_OUTPUT_NORMAL = 2, + __FIO_OUTPUT_JSON_PLUS = 3, + FIO_OUTPUT_NR = 4, + + FIO_OUTPUT_TERSE = 1U << __FIO_OUTPUT_TERSE, + FIO_OUTPUT_JSON = 1U << __FIO_OUTPUT_JSON, + FIO_OUTPUT_NORMAL = 1U << __FIO_OUTPUT_NORMAL, + FIO_OUTPUT_JSON_PLUS = 1U << __FIO_OUTPUT_JSON_PLUS, +}; + +enum { + FIO_RAND_DIST_RANDOM = 0, + FIO_RAND_DIST_ZIPF, + FIO_RAND_DIST_PARETO, + FIO_RAND_DIST_GAUSS, + FIO_RAND_DIST_ZONED, + FIO_RAND_DIST_ZONED_ABS, +}; + +#define FIO_DEF_ZIPF 1.1 +#define FIO_DEF_PARETO 0.2 + +enum { + FIO_RAND_GEN_TAUSWORTHE = 0, + FIO_RAND_GEN_LFSR, + FIO_RAND_GEN_TAUSWORTHE64, +}; + +enum { + FIO_CPUS_SHARED = 0, + FIO_CPUS_SPLIT, +}; + +extern void exec_trigger(const char *); +extern void check_trigger_file(void); + +extern bool in_flight_overlap(struct io_u_queue *q, struct io_u *io_u); +extern pthread_mutex_t overlap_check; + +static inline void *fio_memalign(size_t alignment, size_t size, bool shared) +{ + return __fio_memalign(alignment, size, shared ? smalloc : malloc); +} + +static inline void fio_memfree(void *ptr, size_t size, bool shared) +{ + return __fio_memfree(ptr, size, shared ? sfree : free); +} + +#endif diff --git a/fio_sem.c b/fio_sem.c new file mode 100644 index 0000000..c34d8bf --- /dev/null +++ b/fio_sem.c @@ -0,0 +1,181 @@ +#include +#include +#include +#include +#ifdef CONFIG_VALGRIND_DEV +#include +#else +#define RUNNING_ON_VALGRIND 0 +#endif + +#include "fio_sem.h" +#include "pshared.h" +#include "os/os.h" +#include "fio_time.h" +#include "gettime.h" + +void __fio_sem_remove(struct fio_sem *sem) +{ + assert(sem->magic == FIO_SEM_MAGIC); + pthread_mutex_destroy(&sem->lock); + pthread_cond_destroy(&sem->cond); + + /* + * When not running on Valgrind, ensure any subsequent attempt to grab + * this semaphore will fail with an assert, instead of just silently + * hanging. When running on Valgrind, let Valgrind detect + * use-after-free. + */ + if (!RUNNING_ON_VALGRIND) + memset(sem, 0, sizeof(*sem)); +} + +void fio_sem_remove(struct fio_sem *sem) +{ + __fio_sem_remove(sem); + munmap((void *) sem, sizeof(*sem)); +} + +int __fio_sem_init(struct fio_sem *sem, int value) +{ + int ret; + + sem->value = value; + /* Initialize .waiters explicitly for Valgrind. 
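+	 * The other members are assigned below in any case; .waiters would
+	 * otherwise not be written until the first waiter shows up.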
+	 */
+	sem->waiters = 0;
+	sem->magic = FIO_SEM_MAGIC;
+
+	ret = mutex_cond_init_pshared(&sem->lock, &sem->cond);
+	if (ret)
+		return ret;
+
+	return 0;
+}
+
+struct fio_sem *fio_sem_init(int value)
+{
+	struct fio_sem *sem = NULL;
+
+	sem = (void *) mmap(NULL, sizeof(struct fio_sem),
+				PROT_READ | PROT_WRITE,
+				OS_MAP_ANON | MAP_SHARED, -1, 0);
+	if (sem == MAP_FAILED) {
+		perror("mmap semaphore");
+		return NULL;
+	}
+
+	if (!__fio_sem_init(sem, value))
+		return sem;
+
+	fio_sem_remove(sem);
+	return NULL;
+}
+
+static bool sem_timed_out(struct timespec *t, unsigned int msecs)
+{
+	struct timeval tv;
+	struct timespec now;
+
+	gettimeofday(&tv, NULL);
+	now.tv_sec = tv.tv_sec;
+	now.tv_nsec = tv.tv_usec * 1000;
+
+	return mtime_since(t, &now) >= msecs;
+}
+
+int fio_sem_down_timeout(struct fio_sem *sem, unsigned int msecs)
+{
+	struct timespec base;
+	struct timespec t;
+	int ret = 0;
+
+	assert(sem->magic == FIO_SEM_MAGIC);
+
+#ifdef CONFIG_PTHREAD_CONDATTR_SETCLOCK
+	clock_gettime(CLOCK_MONOTONIC, &t);
+#else
+	clock_gettime(CLOCK_REALTIME, &t);
+#endif
+
+	base = t;
+
+	t.tv_sec += msecs / 1000;
+	t.tv_nsec += ((msecs * 1000000ULL) % 1000000000);
+	if (t.tv_nsec >= 1000000000) {
+		t.tv_nsec -= 1000000000;
+		t.tv_sec++;
+	}
+
+	pthread_mutex_lock(&sem->lock);
+
+	sem->waiters++;
+	while (!sem->value && !ret) {
+		/*
+		 * Some platforms (FreeBSD 9?) seem to time out way too
+		 * early, so double check.
+		 */
+		ret = pthread_cond_timedwait(&sem->cond, &sem->lock, &t);
+		if (ret == ETIMEDOUT && !sem_timed_out(&base, msecs))
+			ret = 0;
+	}
+	sem->waiters--;
+
+	if (!ret) {
+		sem->value--;
+		pthread_mutex_unlock(&sem->lock);
+		return 0;
+	}
+
+	pthread_mutex_unlock(&sem->lock);
+	return ret;
+}
+
+bool fio_sem_down_trylock(struct fio_sem *sem)
+{
+	bool ret = true;
+
+	assert(sem->magic == FIO_SEM_MAGIC);
+
+	pthread_mutex_lock(&sem->lock);
+	if (sem->value) {
+		sem->value--;
+		ret = false;
+	}
+	pthread_mutex_unlock(&sem->lock);
+
+	return ret;
+}
+
+void fio_sem_down(struct fio_sem *sem)
+{
+	assert(sem->magic == FIO_SEM_MAGIC);
+
+	pthread_mutex_lock(&sem->lock);
+
+	while (!sem->value) {
+		sem->waiters++;
+		pthread_cond_wait(&sem->cond, &sem->lock);
+		sem->waiters--;
+	}
+
+	sem->value--;
+	pthread_mutex_unlock(&sem->lock);
+}
+
+void fio_sem_up(struct fio_sem *sem)
+{
+	int do_wake = 0;
+
+	assert(sem->magic == FIO_SEM_MAGIC);
+
+	pthread_mutex_lock(&sem->lock);
+	read_barrier();
+	if (!sem->value && sem->waiters)
+		do_wake = 1;
+	sem->value++;
+
+	if (do_wake)
+		pthread_cond_signal(&sem->cond);
+
+	pthread_mutex_unlock(&sem->lock);
+}
diff --git a/fio_sem.h b/fio_sem.h
new file mode 100644
index 0000000..a796ddd
--- /dev/null
+++ b/fio_sem.h
@@ -0,0 +1,31 @@
+#ifndef FIO_SEM_H
+#define FIO_SEM_H
+
+#include <pthread.h>
+#include "lib/types.h"
+
+#define FIO_SEM_MAGIC	0x4d555445U
+
+struct fio_sem {
+	pthread_mutex_t lock;
+	pthread_cond_t cond;
+	int value;
+	int waiters;
+	int magic;
+};
+
+enum {
+	FIO_SEM_LOCKED	= 0,
+	FIO_SEM_UNLOCKED	= 1,
+};
+
+extern int __fio_sem_init(struct fio_sem *, int);
+extern struct fio_sem *fio_sem_init(int);
+extern void __fio_sem_remove(struct fio_sem *);
+extern void fio_sem_remove(struct fio_sem *);
+extern void fio_sem_up(struct fio_sem *);
+extern void fio_sem_down(struct fio_sem *);
+extern bool fio_sem_down_trylock(struct fio_sem *);
+extern int fio_sem_down_timeout(struct fio_sem *, unsigned int);
+
+#endif
diff --git a/fio_time.h b/fio_time.h
new file mode 100644
index 0000000..c00f8e7
--- /dev/null
+++ b/fio_time.h
@@ -0,0 +1,33 @@
+#ifndef FIO_TIME_H
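+/*
+ * Naming convention for the helpers below: the ntime_*(), utime_*() and
+ * mtime_*() variants return nanoseconds, microseconds and milliseconds,
+ * respectively. The *_since_now() forms measure against the current time,
+ * the *_since_genesis() forms against the reference point recorded by
+ * set_genesis_time().
+ */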
+#define FIO_TIME_H + +#include +/* IWYU pragma: begin_exports */ +#include +#include +/* IWYU pragma: end_exports */ +#include "lib/types.h" + +struct thread_data; +extern uint64_t ntime_since(const struct timespec *, const struct timespec *); +extern uint64_t ntime_since_now(const struct timespec *); +extern uint64_t utime_since(const struct timespec *, const struct timespec *); +extern uint64_t utime_since_now(const struct timespec *); +extern uint64_t mtime_since(const struct timespec *, const struct timespec *); +extern uint64_t mtime_since_now(const struct timespec *); +extern uint64_t mtime_since_tv(const struct timeval *, const struct timeval *); +extern uint64_t time_since_now(const struct timespec *); +extern uint64_t time_since_genesis(void); +extern uint64_t mtime_since_genesis(void); +extern uint64_t utime_since_genesis(void); +extern uint64_t usec_spin(unsigned int); +extern uint64_t usec_sleep(struct thread_data *, unsigned long); +extern void fill_start_time(struct timespec *); +extern void set_genesis_time(void); +extern bool ramp_time_over(struct thread_data *); +extern bool in_ramp_time(struct thread_data *); +extern void fio_time_init(void); +extern void timespec_add_msec(struct timespec *, unsigned int); +extern void set_epoch_time(struct thread_data *, int); + +#endif diff --git a/flist.h b/flist.h new file mode 100644 index 0000000..5437cd8 --- /dev/null +++ b/flist.h @@ -0,0 +1,198 @@ +#ifndef _LINUX_FLIST_H +#define _LINUX_FLIST_H + +#include +#include + +#define container_of(ptr, type, member) ({ \ + const __typeof__( ((type *)0)->member ) *__mptr = (ptr); \ + (type *)( (char *)__mptr - offsetof(type,member) );}) + +/* + * Simple doubly linked list implementation. + * + * Some of the internal functions ("__xxx") are useful when + * manipulating whole lists rather than single entries, as + * sometimes we already know the next/prev entries and we can + * generate better code by using them directly rather than + * using the generic single-entry routines. + */ + +struct flist_head { + struct flist_head *next, *prev; +}; + +#define FLIST_HEAD_INIT(name) { &(name), &(name) } + +#define FLIST_HEAD(name) \ + struct flist_head name = FLIST_HEAD_INIT(name) + +#define INIT_FLIST_HEAD(ptr) do { \ + (ptr)->next = (ptr); (ptr)->prev = (ptr); \ +} while (0) + +/* + * Insert a new entry between two known consecutive entries. + * + * This is only for internal list manipulation where we know + * the prev/next entries already! + */ +static inline void __flist_add(struct flist_head *new_entry, + struct flist_head *prev, + struct flist_head *next) +{ + next->prev = new_entry; + new_entry->next = next; + new_entry->prev = prev; + prev->next = new_entry; +} + +/** + * flist_add - add a new entry + * @new_entry: new entry to be added + * @head: list head to add it after + * + * Insert a new entry after the specified head. + * This is good for implementing stacks. + */ +static inline void flist_add(struct flist_head *new_entry, + struct flist_head *head) +{ + __flist_add(new_entry, head, head->next); +} + +static inline void flist_add_tail(struct flist_head *new_entry, + struct flist_head *head) +{ + __flist_add(new_entry, head->prev, head); +} + +/* + * Delete a list entry by making the prev/next entries + * point to each other. + * + * This is only for internal list manipulation where we know + * the prev/next entries already! 
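+ * The removed entry itself is left untouched here; the public
+ * flist_del() and flist_del_init() below take care of NULLing out or
+ * reinitialising its pointers.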
+ */ +static inline void __flist_del(struct flist_head *prev, + struct flist_head * next) +{ + next->prev = prev; + prev->next = next; +} + +/** + * flist_del - deletes entry from list. + * @entry: the element to delete from the list. + * Note: flist_empty on entry does not return true after this, the entry is + * in an undefined state. + */ +static inline void flist_del(struct flist_head *entry) +{ + __flist_del(entry->prev, entry->next); + entry->next = NULL; + entry->prev = NULL; +} + +/** + * flist_del_init - deletes entry from list and reinitialize it. + * @entry: the element to delete from the list. + */ +static inline void flist_del_init(struct flist_head *entry) +{ + __flist_del(entry->prev, entry->next); + INIT_FLIST_HEAD(entry); +} + +/** + * flist_empty - tests whether a list is empty + * @head: the list to test. + */ +static inline int flist_empty(const struct flist_head *head) +{ + return head->next == head; +} + +static inline void __flist_splice(const struct flist_head *list, + struct flist_head *prev, + struct flist_head *next) +{ + struct flist_head *first = list->next; + struct flist_head *last = list->prev; + + first->prev = prev; + prev->next = first; + + last->next = next; + next->prev = last; +} + +static inline void flist_splice(const struct flist_head *list, + struct flist_head *head) +{ + if (!flist_empty(list)) + __flist_splice(list, head, head->next); +} + +static inline void flist_splice_tail(struct flist_head *list, + struct flist_head *head) +{ + if (!flist_empty(list)) + __flist_splice(list, head->prev, head); +} + +static inline void flist_splice_tail_init(struct flist_head *list, + struct flist_head *head) +{ + if (!flist_empty(list)) { + __flist_splice(list, head->prev, head); + INIT_FLIST_HEAD(list); + } +} + +static inline void flist_splice_init(struct flist_head *list, + struct flist_head *head) +{ + if (!flist_empty(list)) { + __flist_splice(list, head, head->next); + INIT_FLIST_HEAD(list); + } +} + +/** + * flist_entry - get the struct for this entry + * @ptr: the &struct flist_head pointer. + * @type: the type of the struct this is embedded in. + * @member: the name of the flist_struct within the struct. + */ +#define flist_entry(ptr, type, member) \ + container_of(ptr, type, member) + +#define flist_first_entry(ptr, type, member) \ + flist_entry((ptr)->next, type, member) + +#define flist_last_entry(ptr, type, member) \ + flist_entry((ptr)->prev, type, member) + +/** + * flist_for_each - iterate over a list + * @pos: the &struct flist_head to use as a loop counter. + * @head: the head for your list. + */ +#define flist_for_each(pos, head) \ + for (pos = (head)->next; pos != (head); pos = pos->next) + +/** + * flist_for_each_safe - iterate over a list safe against removal of list entry + * @pos: the &struct flist_head to use as a loop counter. + * @n: another &struct flist_head to use as temporary storage + * @head: the head for your list. 
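+ *
+ * Minimal usage sketch (assuming a caller-defined struct item with an
+ * embedded flist_head member named "list"), draining a list while
+ * freeing every entry:
+ *
+ *	struct flist_head *pos, *n;
+ *
+ *	flist_for_each_safe(pos, n, &head) {
+ *		struct item *it = flist_entry(pos, struct item, list);
+ *
+ *		flist_del(pos);
+ *		free(it);
+ *	}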
+ */ +#define flist_for_each_safe(pos, n, head) \ + for (pos = (head)->next, n = pos->next; pos != (head); \ + pos = n, n = pos->next) + +extern void flist_sort(void *priv, struct flist_head *head, + int (*cmp)(void *priv, struct flist_head *a, struct flist_head *b)); + +#endif diff --git a/flow.c b/flow.c new file mode 100644 index 0000000..a8dbfb9 --- /dev/null +++ b/flow.c @@ -0,0 +1,134 @@ +#include "fio.h" +#include "fio_sem.h" +#include "smalloc.h" +#include "flist.h" + +struct fio_flow { + unsigned int refs; + struct flist_head list; + unsigned int id; + long long int flow_counter; +}; + +static struct flist_head *flow_list; +static struct fio_sem *flow_lock; + +int flow_threshold_exceeded(struct thread_data *td) +{ + struct fio_flow *flow = td->flow; + long long flow_counter; + + if (!flow) + return 0; + + if (td->o.flow > 0) + flow_counter = flow->flow_counter; + else + flow_counter = -flow->flow_counter; + + if (flow_counter > td->o.flow_watermark) { + if (td->o.flow_sleep) { + io_u_quiesce(td); + usleep(td->o.flow_sleep); + } + + return 1; + } + + /* No synchronization needed because it doesn't + * matter if the flow count is slightly inaccurate */ + flow->flow_counter += td->o.flow; + return 0; +} + +static struct fio_flow *flow_get(unsigned int id) +{ + struct fio_flow *flow = NULL; + struct flist_head *n; + + if (!flow_lock) + return NULL; + + fio_sem_down(flow_lock); + + flist_for_each(n, flow_list) { + flow = flist_entry(n, struct fio_flow, list); + if (flow->id == id) + break; + + flow = NULL; + } + + if (!flow) { + flow = smalloc(sizeof(*flow)); + if (!flow) { + fio_sem_up(flow_lock); + return NULL; + } + flow->refs = 0; + INIT_FLIST_HEAD(&flow->list); + flow->id = id; + flow->flow_counter = 0; + + flist_add_tail(&flow->list, flow_list); + } + + flow->refs++; + fio_sem_up(flow_lock); + return flow; +} + +static void flow_put(struct fio_flow *flow) +{ + if (!flow_lock) + return; + + fio_sem_down(flow_lock); + + if (!--flow->refs) { + flist_del(&flow->list); + sfree(flow); + } + + fio_sem_up(flow_lock); +} + +void flow_init_job(struct thread_data *td) +{ + if (td->o.flow) + td->flow = flow_get(td->o.flow_id); +} + +void flow_exit_job(struct thread_data *td) +{ + if (td->flow) { + flow_put(td->flow); + td->flow = NULL; + } +} + +void flow_init(void) +{ + flow_list = smalloc(sizeof(*flow_list)); + if (!flow_list) { + log_err("fio: smalloc pool exhausted\n"); + return; + } + + flow_lock = fio_sem_init(FIO_SEM_UNLOCKED); + if (!flow_lock) { + log_err("fio: failed to allocate flow lock\n"); + sfree(flow_list); + return; + } + + INIT_FLIST_HEAD(flow_list); +} + +void flow_exit(void) +{ + if (flow_lock) + fio_sem_remove(flow_lock); + if (flow_list) + sfree(flow_list); +} diff --git a/flow.h b/flow.h new file mode 100644 index 0000000..c0a45c3 --- /dev/null +++ b/flow.h @@ -0,0 +1,11 @@ +#ifndef FIO_FLOW_H +#define FIO_FLOW_H + +int flow_threshold_exceeded(struct thread_data *td); +void flow_init_job(struct thread_data *td); +void flow_exit_job(struct thread_data *td); + +void flow_exit(void); +void flow_init(void); + +#endif diff --git a/gclient.c b/gclient.c new file mode 100644 index 0000000..fe83382 --- /dev/null +++ b/gclient.c @@ -0,0 +1,1461 @@ +#include +#include + +#include +#include +#include + +#include "fio.h" +#include "gfio.h" +#include "ghelpers.h" +#include "goptions.h" +#include "gerror.h" +#include "graph.h" +#include "gclient.h" +#include "printing.h" +#include "lib/pow2.h" + +static void gfio_display_ts(struct fio_client *client, struct thread_stat *ts, + struct 
group_run_stats *rs); + +static gboolean results_window_delete(GtkWidget *w, gpointer data) +{ + struct gui_entry *ge = (struct gui_entry *) data; + + gtk_widget_destroy(w); + ge->results_window = NULL; + ge->results_notebook = NULL; + return TRUE; +} + +static void results_close(GtkWidget *w, gpointer *data) +{ + struct gui_entry *ge = (struct gui_entry *) data; + + gtk_widget_destroy(ge->results_window); +} + +static void results_print(GtkWidget *w, gpointer *data) +{ + struct gui_entry *ge = (struct gui_entry *) data; + + gfio_print_results(ge); +} + +static GtkActionEntry results_menu_items[] = { + { "FileMenuAction", GTK_STOCK_FILE, "File", NULL, NULL, NULL}, + { "GraphMenuAction", GTK_STOCK_FILE, "Graph", NULL, NULL, NULL}, + { "PrintFile", GTK_STOCK_PRINT, "Print", "P", NULL, G_CALLBACK(results_print) }, + { "CloseFile", GTK_STOCK_CLOSE, "Close", "W", NULL, G_CALLBACK(results_close) }, +}; +static gint results_nmenu_items = ARRAY_SIZE(results_menu_items); + +static const gchar *results_ui_string = " \ + \ + \ + \ + \ + \ + \ + \ + \ + \ + \ +"; + +static GtkWidget *get_results_menubar(GtkWidget *window, struct gui_entry *ge) +{ + GtkActionGroup *action_group; + GtkWidget *widget; + GError *error = 0; + + ge->results_uimanager = gtk_ui_manager_new(); + + action_group = gtk_action_group_new("ResultsMenu"); + gtk_action_group_add_actions(action_group, results_menu_items, results_nmenu_items, ge); + + gtk_ui_manager_insert_action_group(ge->results_uimanager, action_group, 0); + gtk_ui_manager_add_ui_from_string(GTK_UI_MANAGER(ge->results_uimanager), results_ui_string, -1, &error); + + gtk_window_add_accel_group(GTK_WINDOW(window), gtk_ui_manager_get_accel_group(ge->results_uimanager)); + + widget = gtk_ui_manager_get_widget(ge->results_uimanager, "/MainMenu"); + return widget; +} + +static GtkWidget *get_results_window(struct gui_entry *ge) +{ + GtkWidget *win, *notebook, *vbox; + + if (ge->results_window) + return ge->results_notebook; + + win = gtk_window_new(GTK_WINDOW_TOPLEVEL); + gtk_window_set_title(GTK_WINDOW(win), "Results"); + gtk_window_set_default_size(GTK_WINDOW(win), 1024, 768); + g_signal_connect(win, "delete-event", G_CALLBACK(results_window_delete), ge); + g_signal_connect(win, "destroy", G_CALLBACK(results_window_delete), ge); + + vbox = gtk_vbox_new(FALSE, 0); + gtk_container_add(GTK_CONTAINER(win), vbox); + + ge->results_menu = get_results_menubar(win, ge); + gtk_box_pack_start(GTK_BOX(vbox), ge->results_menu, FALSE, FALSE, 0); + + notebook = gtk_notebook_new(); + gtk_notebook_set_scrollable(GTK_NOTEBOOK(notebook), 1); + gtk_notebook_popup_enable(GTK_NOTEBOOK(notebook)); + gtk_container_add(GTK_CONTAINER(vbox), notebook); + + ge->results_window = win; + ge->results_notebook = notebook; + return ge->results_notebook; +} + +static void gfio_text_op(struct fio_client *client, struct fio_net_cmd *cmd) +{ + struct cmd_text_pdu *p = (struct cmd_text_pdu *) cmd->payload; + struct gfio_client *gc = client->client_data; + struct gui_entry *ge = gc->ge; + struct gui *ui = ge->ui; + GtkTreeIter iter; + struct tm *tm; + time_t sec; + char tmp[64], timebuf[96]; + + sec = p->log_sec; + tm = localtime(&sec); + strftime(tmp, sizeof(tmp), "%Y-%m-%d %H:%M:%S", tm); + sprintf(timebuf, "%s.%03ld", tmp, (long) p->log_usec / 1000); + + gdk_threads_enter(); + + gtk_list_store_append(ui->log_model, &iter); + gtk_list_store_set(ui->log_model, &iter, 0, timebuf, -1); + gtk_list_store_set(ui->log_model, &iter, 1, client->hostname, -1); + gtk_list_store_set(ui->log_model, &iter, 2, 
log_get_level(p->level), -1); + gtk_list_store_set(ui->log_model, &iter, 3, p->buf, -1); + + if (p->level == FIO_LOG_ERR) + gfio_view_log(ui); + + gdk_threads_leave(); +} + +static void disk_util_destroy(GtkWidget *w, gpointer data) +{ + struct gui_entry *ge = (struct gui_entry *) data; + + ge->disk_util_vbox = NULL; + gtk_widget_destroy(w); +} + +static GtkWidget *gfio_disk_util_get_vbox(struct gui_entry *ge) +{ + GtkWidget *vbox, *box, *scroll, *res_notebook; + + if (ge->disk_util_vbox) + return ge->disk_util_vbox; + + scroll = get_scrolled_window(5); + vbox = gtk_vbox_new(FALSE, 3); + box = gtk_hbox_new(FALSE, 0); + gtk_box_pack_start(GTK_BOX(vbox), box, FALSE, FALSE, 5); + + gtk_scrolled_window_add_with_viewport(GTK_SCROLLED_WINDOW(scroll), vbox); + res_notebook = get_results_window(ge); + + gtk_notebook_append_page(GTK_NOTEBOOK(res_notebook), scroll, gtk_label_new("Disk utilization")); + ge->disk_util_vbox = box; + g_signal_connect(vbox, "destroy", G_CALLBACK(disk_util_destroy), ge); + + return ge->disk_util_vbox; +} + +static int __gfio_disk_util_show(GtkWidget *res_notebook, + struct gfio_client *gc, struct cmd_du_pdu *p) +{ + GtkWidget *box, *frame, *entry, *vbox, *util_vbox; + struct gui_entry *ge = gc->ge; + double util; + char tmp[16]; + + util_vbox = gfio_disk_util_get_vbox(ge); + + vbox = gtk_vbox_new(FALSE, 3); + gtk_container_add(GTK_CONTAINER(util_vbox), vbox); + + frame = gtk_frame_new((char *) p->dus.name); + gtk_box_pack_start(GTK_BOX(vbox), frame, FALSE, FALSE, 2); + + box = gtk_vbox_new(FALSE, 3); + gtk_container_add(GTK_CONTAINER(frame), box); + + frame = gtk_frame_new("Read"); + gtk_box_pack_start(GTK_BOX(box), frame, FALSE, FALSE, 2); + vbox = gtk_hbox_new(TRUE, 3); + gtk_container_add(GTK_CONTAINER(frame), vbox); + entry = new_info_entry_in_frame(vbox, "IOs"); + entry_set_int_value(entry, p->dus.s.ios[0]); + entry = new_info_entry_in_frame(vbox, "Merges"); + entry_set_int_value(entry, p->dus.s.merges[0]); + entry = new_info_entry_in_frame(vbox, "Sectors"); + entry_set_int_value(entry, p->dus.s.sectors[0]); + entry = new_info_entry_in_frame(vbox, "Ticks"); + entry_set_int_value(entry, p->dus.s.ticks[0]); + + frame = gtk_frame_new("Write"); + gtk_box_pack_start(GTK_BOX(box), frame, FALSE, FALSE, 2); + vbox = gtk_hbox_new(TRUE, 3); + gtk_container_add(GTK_CONTAINER(frame), vbox); + entry = new_info_entry_in_frame(vbox, "IOs"); + entry_set_int_value(entry, p->dus.s.ios[1]); + entry = new_info_entry_in_frame(vbox, "Merges"); + entry_set_int_value(entry, p->dus.s.merges[1]); + entry = new_info_entry_in_frame(vbox, "Sectors"); + entry_set_int_value(entry, p->dus.s.sectors[1]); + entry = new_info_entry_in_frame(vbox, "Ticks"); + entry_set_int_value(entry, p->dus.s.ticks[1]); + + frame = gtk_frame_new("Shared"); + gtk_box_pack_start(GTK_BOX(box), frame, FALSE, FALSE, 2); + vbox = gtk_hbox_new(TRUE, 3); + gtk_container_add(GTK_CONTAINER(frame), vbox); + entry = new_info_entry_in_frame(vbox, "IO ticks"); + entry_set_int_value(entry, p->dus.s.io_ticks); + entry = new_info_entry_in_frame(vbox, "Time in queue"); + entry_set_int_value(entry, p->dus.s.time_in_queue); + + util = 0.0; + if (p->dus.s.msec) + util = (double) 100 * p->dus.s.io_ticks / (double) p->dus.s.msec; + if (util > 100.0) + util = 100.0; + + sprintf(tmp, "%3.2f%%", util); + entry = new_info_entry_in_frame(vbox, "Disk utilization"); + gtk_entry_set_text(GTK_ENTRY(entry), tmp); + + gtk_widget_show_all(ge->results_window); + return 0; +} + +static int gfio_disk_util_show(struct gfio_client *gc) +{ + struct gui_entry 
*ge = gc->ge; + GtkWidget *res_notebook; + int i; + + if (!gc->nr_du) + return 1; + + res_notebook = get_results_window(ge); + + for (i = 0; i < gc->nr_du; i++) { + struct cmd_du_pdu *p = &gc->du[i]; + + __gfio_disk_util_show(res_notebook, gc, p); + } + + gtk_widget_show_all(ge->results_window); + return 0; +} + +static void gfio_disk_util_op(struct fio_client *client, struct fio_net_cmd *cmd) +{ + struct cmd_du_pdu *p = (struct cmd_du_pdu *) cmd->payload; + struct gfio_client *gc = client->client_data; + struct gui_entry *ge = gc->ge; + unsigned int nr = gc->nr_du; + + gc->du = realloc(gc->du, (nr + 1) * sizeof(struct cmd_du_pdu)); + memcpy(&gc->du[nr], p, sizeof(*p)); + gc->nr_du++; + + gdk_threads_enter(); + if (ge->results_window) + __gfio_disk_util_show(ge->results_notebook, gc, p); + else + gfio_disk_util_show(gc); + gdk_threads_leave(); +} + +static int sum_stat_nr; + +static void gfio_thread_status_op(struct fio_client *client, + struct fio_net_cmd *cmd) +{ + struct cmd_ts_pdu *p = (struct cmd_ts_pdu *) cmd->payload; + + gfio_display_ts(client, &p->ts, &p->rs); + + if (sum_stat_clients == 1) + return; + + sum_thread_stats(&client_ts, &p->ts, sum_stat_nr == 1); + sum_group_stats(&client_gs, &p->rs); + + client_ts.members++; + client_ts.thread_number = p->ts.thread_number; + client_ts.groupid = p->ts.groupid; + client_ts.sig_figs = p->ts.sig_figs; + + if (++sum_stat_nr == sum_stat_clients) { + strcpy(client_ts.name, "All clients"); + gfio_display_ts(client, &client_ts, &client_gs); + } +} + +static void gfio_group_stats_op(struct fio_client *client, + struct fio_net_cmd *cmd) +{ + /* We're ignoring group stats for now */ +} + +static void gfio_update_thread_status(struct gui_entry *ge, + char *status_message, double perc) +{ + static char message[100]; + const char *m = message; + + snprintf(message, sizeof(message), "%s", status_message); + gtk_progress_bar_set_text(GTK_PROGRESS_BAR(ge->thread_status_pb), m); + gtk_progress_bar_set_fraction(GTK_PROGRESS_BAR(ge->thread_status_pb), perc / 100.0); + gtk_widget_queue_draw(ge->ui->window); +} + +static void gfio_update_thread_status_all(struct gui *ui, char *status_message, + double perc) +{ + static char message[100]; + const char *m = message; + + snprintf(message, sizeof(message), "%s", status_message); + gtk_progress_bar_set_text(GTK_PROGRESS_BAR(ui->thread_status_pb), m); + gtk_progress_bar_set_fraction(GTK_PROGRESS_BAR(ui->thread_status_pb), perc / 100.0); + gtk_widget_queue_draw(ui->window); +} + +/* + * Client specific ETA + */ +static void gfio_update_client_eta(struct fio_client *client, struct jobs_eta *je) +{ + struct gfio_client *gc = client->client_data; + struct gui_entry *ge = gc->ge; + static int eta_good; + char eta_str[128]; + char output[256]; + char tmp[32]; + double perc = 0.0; + int i2p = 0; + + gdk_threads_enter(); + + eta_str[0] = '\0'; + output[0] = '\0'; + + if (je->eta_sec != INT_MAX && je->elapsed_sec) { + perc = (double) je->elapsed_sec / (double) (je->elapsed_sec + je->eta_sec); + eta_to_str(eta_str, je->eta_sec); + } + + sprintf(tmp, "%u", je->nr_running); + gtk_entry_set_text(GTK_ENTRY(ge->eta.jobs), tmp); + sprintf(tmp, "%u", je->files_open); + gtk_entry_set_text(GTK_ENTRY(ge->eta.files), tmp); + + if (je->eta_sec != INT_MAX && je->nr_running) { + char *iops_str[DDIR_RWDIR_CNT]; + char *rate_str[DDIR_RWDIR_CNT]; + char *rate_alt[DDIR_RWDIR_CNT]; + char tmp[128]; + int i; + + if ((!je->eta_sec && !eta_good) || je->nr_ramp == je->nr_running) + strcpy(output, "-.-% done"); + else { + eta_good = 1; + perc *= 
100.0; + sprintf(output, "%3.1f%% done", perc); + } + + iops_str[0] = num2str(je->iops[0], je->sig_figs, 1, 0, N2S_PERSEC); + iops_str[1] = num2str(je->iops[1], je->sig_figs, 1, 0, N2S_PERSEC); + iops_str[2] = num2str(je->iops[2], je->sig_figs, 1, 0, N2S_PERSEC); + + rate_str[0] = num2str(je->rate[0], je->sig_figs, 10, i2p, N2S_BYTEPERSEC); + rate_alt[0] = num2str(je->rate[0], je->sig_figs, 10, !i2p, N2S_BYTEPERSEC); + snprintf(tmp, sizeof(tmp), "%s (%s)", rate_str[0], rate_alt[0]); + gtk_entry_set_text(GTK_ENTRY(ge->eta.read_bw), tmp); + gtk_entry_set_text(GTK_ENTRY(ge->eta.read_iops), iops_str[0]); + + rate_str[1] = num2str(je->rate[1], je->sig_figs, 10, i2p, N2S_BYTEPERSEC); + rate_alt[1] = num2str(je->rate[1], je->sig_figs, 10, !i2p, N2S_BYTEPERSEC); + snprintf(tmp, sizeof(tmp), "%s (%s)", rate_str[1], rate_alt[1]); + gtk_entry_set_text(GTK_ENTRY(ge->eta.write_bw), tmp); + gtk_entry_set_text(GTK_ENTRY(ge->eta.write_iops), iops_str[1]); + + rate_str[2] = num2str(je->rate[2], je->sig_figs, 10, i2p, N2S_BYTEPERSEC); + rate_alt[2] = num2str(je->rate[2], je->sig_figs, 10, !i2p, N2S_BYTEPERSEC); + snprintf(tmp, sizeof(tmp), "%s (%s)", rate_str[2], rate_alt[2]); + gtk_entry_set_text(GTK_ENTRY(ge->eta.trim_bw), tmp); + gtk_entry_set_text(GTK_ENTRY(ge->eta.trim_iops), iops_str[2]); + + graph_add_xy_data(ge->graphs.iops_graph, ge->graphs.read_iops, je->elapsed_sec, je->iops[0], iops_str[0]); + graph_add_xy_data(ge->graphs.iops_graph, ge->graphs.write_iops, je->elapsed_sec, je->iops[1], iops_str[1]); + graph_add_xy_data(ge->graphs.iops_graph, ge->graphs.trim_iops, je->elapsed_sec, je->iops[2], iops_str[2]); + graph_add_xy_data(ge->graphs.bandwidth_graph, ge->graphs.read_bw, je->elapsed_sec, je->rate[0], rate_str[0]); + graph_add_xy_data(ge->graphs.bandwidth_graph, ge->graphs.write_bw, je->elapsed_sec, je->rate[1], rate_str[1]); + graph_add_xy_data(ge->graphs.bandwidth_graph, ge->graphs.trim_bw, je->elapsed_sec, je->rate[2], rate_str[2]); + + for (i = 0; i < DDIR_RWDIR_CNT; i++) { + free(rate_str[i]); + free(rate_alt[i]); + free(iops_str[i]); + } + } + + if (eta_str[0]) { + char *dst = output + strlen(output); + + sprintf(dst, " - %s", eta_str); + } + + gfio_update_thread_status(ge, output, perc); + gdk_threads_leave(); +} + +/* + * Update ETA in main window for all clients + */ +static void gfio_update_all_eta(struct jobs_eta *je) +{ + struct gui *ui = &main_ui; + static int eta_good; + char eta_str[128]; + char output[256]; + double perc = 0.0; + int i, i2p = 0; + + gdk_threads_enter(); + + eta_str[0] = '\0'; + output[0] = '\0'; + + if (je->eta_sec != INT_MAX && je->elapsed_sec) { + perc = (double) je->elapsed_sec / (double) (je->elapsed_sec + je->eta_sec); + eta_to_str(eta_str, je->eta_sec); + } + + entry_set_int_value(ui->eta.jobs, je->nr_running); + + if (je->eta_sec != INT_MAX && je->nr_running) { + char *iops_str[DDIR_RWDIR_CNT]; + char *rate_str[DDIR_RWDIR_CNT]; + char *rate_alt[DDIR_RWDIR_CNT]; + char tmp[128]; + + if ((!je->eta_sec && !eta_good) || je->nr_ramp == je->nr_running) + strcpy(output, "-.-% done"); + else { + eta_good = 1; + perc *= 100.0; + sprintf(output, "%3.1f%% done", perc); + } + + iops_str[0] = num2str(je->iops[0], je->sig_figs, 1, 0, N2S_PERSEC); + iops_str[1] = num2str(je->iops[1], je->sig_figs, 1, 0, N2S_PERSEC); + iops_str[2] = num2str(je->iops[2], je->sig_figs, 1, 0, N2S_PERSEC); + + rate_str[0] = num2str(je->rate[0], je->sig_figs, 10, i2p, N2S_BYTEPERSEC); + rate_alt[0] = num2str(je->rate[0], je->sig_figs, 10, !i2p, N2S_BYTEPERSEC); + snprintf(tmp, sizeof(tmp), 
"%s (%s)", rate_str[0], rate_alt[0]); + gtk_entry_set_text(GTK_ENTRY(ui->eta.read_bw), tmp); + gtk_entry_set_text(GTK_ENTRY(ui->eta.read_iops), iops_str[0]); + + rate_str[1] = num2str(je->rate[1], je->sig_figs, 10, i2p, N2S_BYTEPERSEC); + rate_alt[1] = num2str(je->rate[1], je->sig_figs, 10, !i2p, N2S_BYTEPERSEC); + snprintf(tmp, sizeof(tmp), "%s (%s)", rate_str[1], rate_alt[1]); + gtk_entry_set_text(GTK_ENTRY(ui->eta.write_bw), tmp); + gtk_entry_set_text(GTK_ENTRY(ui->eta.write_iops), iops_str[1]); + + rate_str[2] = num2str(je->rate[2], je->sig_figs, 10, i2p, N2S_BYTEPERSEC); + rate_alt[2] = num2str(je->rate[2], je->sig_figs, 10, !i2p, N2S_BYTEPERSEC); + snprintf(tmp, sizeof(tmp), "%s (%s)", rate_str[2], rate_alt[2]); + gtk_entry_set_text(GTK_ENTRY(ui->eta.trim_bw), tmp); + gtk_entry_set_text(GTK_ENTRY(ui->eta.trim_iops), iops_str[2]); + + graph_add_xy_data(ui->graphs.iops_graph, ui->graphs.read_iops, je->elapsed_sec, je->iops[0], iops_str[0]); + graph_add_xy_data(ui->graphs.iops_graph, ui->graphs.write_iops, je->elapsed_sec, je->iops[1], iops_str[1]); + graph_add_xy_data(ui->graphs.iops_graph, ui->graphs.trim_iops, je->elapsed_sec, je->iops[2], iops_str[2]); + graph_add_xy_data(ui->graphs.bandwidth_graph, ui->graphs.read_bw, je->elapsed_sec, je->rate[0], rate_str[0]); + graph_add_xy_data(ui->graphs.bandwidth_graph, ui->graphs.write_bw, je->elapsed_sec, je->rate[1], rate_str[1]); + graph_add_xy_data(ui->graphs.bandwidth_graph, ui->graphs.trim_bw, je->elapsed_sec, je->rate[2], rate_str[2]); + + for (i = 0; i < DDIR_RWDIR_CNT; i++) { + free(rate_str[i]); + free(rate_alt[i]); + free(iops_str[i]); + } + } + + if (eta_str[0]) { + char *dst = output + strlen(output); + + sprintf(dst, " - %s", eta_str); + } + + gfio_update_thread_status_all(ui, output, perc); + gdk_threads_leave(); +} + +static void gfio_probe_op(struct fio_client *client, struct fio_net_cmd *cmd) +{ + struct cmd_probe_reply_pdu *probe = (struct cmd_probe_reply_pdu *) cmd->payload; + struct gfio_client *gc = client->client_data; + struct gui_entry *ge = gc->ge; + const char *os, *arch; + + os = fio_get_os_string(probe->os); + if (!os) + os = "unknown"; + + arch = fio_get_arch_string(probe->arch); + if (!arch) + os = "unknown"; + + if (!client->name) + client->name = strdup((char *) probe->hostname); + + gc->client_cpus = le32_to_cpu(probe->cpus); + gc->client_flags = le64_to_cpu(probe->flags); + + gdk_threads_enter(); + + gtk_label_set_text(GTK_LABEL(ge->probe.hostname), (char *) probe->hostname); + gtk_label_set_text(GTK_LABEL(ge->probe.os), os); + gtk_label_set_text(GTK_LABEL(ge->probe.arch), arch); + gtk_label_set_text(GTK_LABEL(ge->probe.fio_ver), (char *) probe->fio_version); + + gfio_set_state(ge, GE_STATE_CONNECTED); + + gdk_threads_leave(); +} + +static void gfio_quit_op(struct fio_client *client, struct fio_net_cmd *cmd) +{ + struct gfio_client *gc = client->client_data; + + gdk_threads_enter(); + gfio_set_state(gc->ge, GE_STATE_NEW); + gdk_threads_leave(); +} + +static struct thread_options *gfio_client_add_job(struct gfio_client *gc, + struct thread_options_pack *top) +{ + struct gfio_client_options *gco; + + gco = calloc(1, sizeof(*gco)); + convert_thread_options_to_cpu(&gco->o, top); + INIT_FLIST_HEAD(&gco->list); + flist_add_tail(&gco->list, &gc->o_list); + gc->o_list_nr = 1; + return &gco->o; +} + +static void gfio_add_job_op(struct fio_client *client, struct fio_net_cmd *cmd) +{ + struct cmd_add_job_pdu *p = (struct cmd_add_job_pdu *) cmd->payload; + struct gfio_client *gc = client->client_data; + struct gui_entry 
*ge = gc->ge; + struct thread_options *o; + char *c1, *c2, *c3, *c4; + char tmp[80]; + int i2p; + + p->thread_number = le32_to_cpu(p->thread_number); + p->groupid = le32_to_cpu(p->groupid); + o = gfio_client_add_job(gc, &p->top); + + gdk_threads_enter(); + + gtk_combo_box_text_append_text(GTK_COMBO_BOX_TEXT(ge->eta.names), (gchar *) o->name); + gtk_combo_box_set_active(GTK_COMBO_BOX(ge->eta.names), 0); + + sprintf(tmp, "%s %s", o->odirect ? "direct" : "buffered", ddir_str(o->td_ddir)); + multitext_add_entry(&ge->eta.iotype, tmp); + + i2p = is_power_of_2(o->kb_base); + c1 = num2str(o->min_bs[DDIR_READ], o->sig_figs, 1, i2p, N2S_BYTE); + c2 = num2str(o->max_bs[DDIR_READ], o->sig_figs, 1, i2p, N2S_BYTE); + c3 = num2str(o->min_bs[DDIR_WRITE], o->sig_figs, 1, i2p, N2S_BYTE); + c4 = num2str(o->max_bs[DDIR_WRITE], o->sig_figs, 1, i2p, N2S_BYTE); + + sprintf(tmp, "%s-%s,%s-%s", c1, c2, c3, c4); + free(c1); + free(c2); + free(c3); + free(c4); + multitext_add_entry(&ge->eta.bs, tmp); + + multitext_add_entry(&ge->eta.ioengine, (const char *) o->ioengine); + + sprintf(tmp, "%u", o->iodepth); + multitext_add_entry(&ge->eta.iodepth, tmp); + + multitext_set_entry(&ge->eta.iotype, 0); + multitext_set_entry(&ge->eta.bs, 0); + multitext_set_entry(&ge->eta.ioengine, 0); + multitext_set_entry(&ge->eta.iodepth, 0); + + gfio_set_state(ge, GE_STATE_JOB_SENT); + + gdk_threads_leave(); +} + +static void gfio_update_job_op(struct fio_client *client, + struct fio_net_cmd *cmd) +{ + uint32_t *pdu_error = (uint32_t *) cmd->payload; + struct gfio_client *gc = client->client_data; + + gc->update_job_status = le32_to_cpu(*pdu_error); + gc->update_job_done = 1; +} + +static void gfio_client_timed_out(struct fio_client *client) +{ + struct gfio_client *gc = client->client_data; + char buf[256]; + + gdk_threads_enter(); + + gfio_set_state(gc->ge, GE_STATE_NEW); + clear_ge_ui_info(gc->ge); + + sprintf(buf, "Client %s: timeout talking to server.\n", client->hostname); + gfio_report_info(gc->ge->ui, "Network timeout", buf); + + gdk_threads_leave(); +} + +static void gfio_client_stop(struct fio_client *client) +{ + struct gfio_client *gc = client->client_data; + + gdk_threads_enter(); + + gfio_set_state(gc->ge, GE_STATE_JOB_DONE); + + if (gc->err_entry) + entry_set_int_value(gc->err_entry, client->error); + + gdk_threads_leave(); +} + +static void gfio_client_start(struct fio_client *client, struct fio_net_cmd *cmd) +{ + struct gfio_client *gc = client->client_data; + + gdk_threads_enter(); + gfio_set_state(gc->ge, GE_STATE_JOB_STARTED); + gdk_threads_leave(); +} + +static void gfio_client_job_start(struct fio_client *client, struct fio_net_cmd *cmd) +{ + struct gfio_client *gc = client->client_data; + + gdk_threads_enter(); + gfio_set_state(gc->ge, GE_STATE_JOB_RUNNING); + gdk_threads_leave(); +} + +static void gfio_add_total_depths_tree(GtkListStore *model, + struct thread_stat *ts, unsigned int len) +{ + double io_u_dist[FIO_IO_U_MAP_NR]; + GtkTreeIter iter; + /* Bits 1-6, and 8 */ + const int add_mask = 0x17e; + int i, j; + + stat_calc_dist(ts->io_u_map, ddir_rw_sum(ts->total_io_u), io_u_dist); + + gtk_list_store_append(model, &iter); + + gtk_list_store_set(model, &iter, 0, "Total", -1); + + for (i = 1, j = 0; i < len; i++) { + char fbuf[32]; + + if (!(add_mask & (1UL << (i - 1)))) + sprintf(fbuf, "0.0%%"); + else { + sprintf(fbuf, "%3.1f%%", io_u_dist[j]); + j++; + } + + gtk_list_store_set(model, &iter, i, fbuf, -1); + } + +} + +static void gfio_add_end_results(struct gfio_client *gc, struct thread_stat *ts, + struct 
group_run_stats *rs) +{ + unsigned int nr = gc->nr_results; + + gc->results = realloc(gc->results, (nr + 1) * sizeof(struct end_results)); + memcpy(&gc->results[nr].ts, ts, sizeof(*ts)); + memcpy(&gc->results[nr].gs, rs, sizeof(*rs)); + gc->nr_results++; +} + +static void gfio_add_sc_depths_tree(GtkListStore *model, + struct thread_stat *ts, unsigned int len, + int submit) +{ + double io_u_dist[FIO_IO_U_MAP_NR]; + GtkTreeIter iter; + /* Bits 0, and 3-8 */ + const int add_mask = 0x1f9; + int i, j; + + if (submit) + stat_calc_dist(ts->io_u_submit, ts->total_submit, io_u_dist); + else + stat_calc_dist(ts->io_u_complete, ts->total_complete, io_u_dist); + + gtk_list_store_append(model, &iter); + + gtk_list_store_set(model, &iter, 0, submit ? "Submit" : "Complete", -1); + + for (i = 1, j = 0; i < len; i++) { + char fbuf[32]; + + if (!(add_mask & (1UL << (i - 1)))) + sprintf(fbuf, "0.0%%"); + else { + sprintf(fbuf, "%3.1f%%", io_u_dist[j]); + j++; + } + + gtk_list_store_set(model, &iter, i, fbuf, -1); + } + +} + +static void gfio_show_io_depths(GtkWidget *vbox, struct thread_stat *ts) +{ + GtkWidget *frame, *box, *tree_view = NULL; + GtkTreeSelection *selection; + GtkListStore *model; + int i; + const char *labels[] = { "Depth", "0", "1", "2", "4", "8", "16", "32", "64", ">= 64" }; + const int nr_labels = ARRAY_SIZE(labels); + GType types[nr_labels]; + + frame = gtk_frame_new("IO depths"); + gtk_box_pack_start(GTK_BOX(vbox), frame, FALSE, FALSE, 5); + + box = gtk_hbox_new(FALSE, 3); + gtk_container_add(GTK_CONTAINER(frame), box); + + for (i = 0; i < nr_labels; i++) + types[i] = G_TYPE_STRING; + + model = gtk_list_store_newv(nr_labels, types); + + tree_view = gtk_tree_view_new_with_model(GTK_TREE_MODEL(model)); + gtk_widget_set_can_focus(tree_view, FALSE); + + g_object_set(G_OBJECT(tree_view), "headers-visible", TRUE, + "enable-grid-lines", GTK_TREE_VIEW_GRID_LINES_BOTH, NULL); + + selection = gtk_tree_view_get_selection(GTK_TREE_VIEW(tree_view)); + gtk_tree_selection_set_mode(GTK_TREE_SELECTION(selection), GTK_SELECTION_BROWSE); + + for (i = 0; i < nr_labels; i++) + tree_view_column(tree_view, i, labels[i], ALIGN_RIGHT | UNSORTABLE); + + gfio_add_total_depths_tree(model, ts, nr_labels); + gfio_add_sc_depths_tree(model, ts, nr_labels, 1); + gfio_add_sc_depths_tree(model, ts, nr_labels, 0); + + gtk_box_pack_start(GTK_BOX(box), tree_view, TRUE, TRUE, 3); +} + +static void gfio_show_cpu_usage(GtkWidget *vbox, struct thread_stat *ts) +{ + GtkWidget *box, *frame, *entry; + double usr_cpu, sys_cpu; + unsigned long runtime; + char tmp[32]; + + runtime = ts->total_run_time; + if (runtime) { + double runt = (double) runtime; + + usr_cpu = (double) ts->usr_time * 100 / runt; + sys_cpu = (double) ts->sys_time * 100 / runt; + } else { + usr_cpu = 0; + sys_cpu = 0; + } + + frame = gtk_frame_new("OS resources"); + gtk_box_pack_start(GTK_BOX(vbox), frame, FALSE, FALSE, 5); + + box = gtk_hbox_new(FALSE, 3); + gtk_container_add(GTK_CONTAINER(frame), box); + + entry = new_info_entry_in_frame(box, "User CPU"); + sprintf(tmp, "%3.2f%%", usr_cpu); + gtk_entry_set_text(GTK_ENTRY(entry), tmp); + entry = new_info_entry_in_frame(box, "System CPU"); + sprintf(tmp, "%3.2f%%", sys_cpu); + gtk_entry_set_text(GTK_ENTRY(entry), tmp); + entry = new_info_entry_in_frame(box, "Context switches"); + entry_set_int_value(entry, ts->ctx); + entry = new_info_entry_in_frame(box, "Major faults"); + entry_set_int_value(entry, ts->majf); + entry = new_info_entry_in_frame(box, "Minor faults"); + entry_set_int_value(entry, ts->minf); +} + 
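+/*
+ * Worked example for the CPU accounting above, with made-up numbers:
+ * usr_time = 750, sys_time = 230 and total_run_time = 1000 (all in the
+ * same time unit) yield "75.00%" user and "23.00%" system CPU.
+ */
+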
+static GtkWidget *gfio_output_lat_buckets(double *lat, const char **labels,
+					  int num)
+{
+	GtkWidget *tree_view;
+	GtkTreeSelection *selection;
+	GtkListStore *model;
+	GtkTreeIter iter;
+	GType *types;
+	int i;
+
+	types = malloc(num * sizeof(GType));
+
+	for (i = 0; i < num; i++)
+		types[i] = G_TYPE_STRING;
+
+	model = gtk_list_store_newv(num, types);
+	free(types);
+	types = NULL;
+
+	tree_view = gtk_tree_view_new_with_model(GTK_TREE_MODEL(model));
+	gtk_widget_set_can_focus(tree_view, FALSE);
+
+	g_object_set(G_OBJECT(tree_view), "headers-visible", TRUE,
+		"enable-grid-lines", GTK_TREE_VIEW_GRID_LINES_BOTH, NULL);
+
+	selection = gtk_tree_view_get_selection(GTK_TREE_VIEW(tree_view));
+	gtk_tree_selection_set_mode(GTK_TREE_SELECTION(selection), GTK_SELECTION_BROWSE);
+
+	for (i = 0; i < num; i++)
+		tree_view_column(tree_view, i, labels[i], ALIGN_RIGHT | UNSORTABLE);
+
+	gtk_list_store_append(model, &iter);
+
+	for (i = 0; i < num; i++) {
+		char fbuf[32];
+
+		if (lat[i] <= 0.0)
+			sprintf(fbuf, "0.00");
+		else
+			sprintf(fbuf, "%3.2f%%", lat[i]);
+
+		gtk_list_store_set(model, &iter, i, fbuf, -1);
+	}
+
+	return tree_view;
+}
+
+static struct graph *setup_lat_bucket_graph(const char *title, double *lat,
+					    const char **labels,
+					    unsigned int len,
+					    double xdim, double ydim)
+{
+	struct graph *g;
+	int i;
+
+	g = graph_new(xdim, ydim, gfio_graph_font);
+	graph_title(g, title);
+	graph_x_title(g, "Buckets");
+	graph_y_title(g, "Percent");
+
+	for (i = 0; i < len; i++) {
+		graph_label_t l;
+
+		l = graph_add_label(g, labels[i]);
+		graph_add_data(g, l, lat[i]);
+	}
+
+	return g;
+}
+
+static int on_expose_lat_drawing_area(GtkWidget *w, GdkEvent *event, gpointer p)
+{
+	struct graph *g = p;
+	cairo_t *cr;
+
+	cr = gdk_cairo_create(gtk_widget_get_window(w));
+#if 0
+	if (graph_has_tooltips(g)) {
+		g_object_set(w, "has-tooltip", TRUE, NULL);
+		g_signal_connect(w, "query-tooltip", G_CALLBACK(clat_graph_tooltip), g);
+	}
+#endif
+	cairo_set_source_rgb(cr, 0, 0, 0);
+	bar_graph_draw(g, cr);
+	cairo_destroy(cr);
+
+	return FALSE;
+}
+
+static gint on_config_lat_drawing_area(GtkWidget *w, GdkEventConfigure *event,
+				       gpointer data)
+{
+	guint width = gtk_widget_get_allocated_width(w);
+	guint height = gtk_widget_get_allocated_height(w);
+	struct graph *g = data;
+
+	graph_set_size(g, width, height);
+	graph_set_position(g, 0, 0);
+	return TRUE;
+}
+
+static void gfio_show_latency_buckets(struct gfio_client *gc, GtkWidget *vbox,
+				      struct thread_stat *ts)
+{
+	double io_u_lat[FIO_IO_U_LAT_N_NR + FIO_IO_U_LAT_U_NR + FIO_IO_U_LAT_M_NR];
+	const char *ranges[] = { "2ns", "4ns", "10ns", "20ns", "50ns", "100ns",
+				 "250ns", "500ns", "750ns", "1000ns", "2us",
+				 "4us", "10us", "20us", "50us", "100us",
+				 "250us", "500us", "750us", "1ms", "2ms",
+				 "4ms", "10ms", "20ms", "50ms", "100ms",
+				 "250ms", "500ms", "750ms", "1s", "2s", ">= 2s" };
+	int start, end, i;
+	const int total = FIO_IO_U_LAT_N_NR + FIO_IO_U_LAT_U_NR + FIO_IO_U_LAT_M_NR;
+	GtkWidget *frame, *tree_view, *hbox, *completion_vbox, *drawing_area;
+	struct gui_entry *ge = gc->ge;
+
+	stat_calc_lat_n(ts, io_u_lat);
+	stat_calc_lat_u(ts, &io_u_lat[FIO_IO_U_LAT_N_NR]);
+	stat_calc_lat_m(ts, &io_u_lat[FIO_IO_U_LAT_N_NR + FIO_IO_U_LAT_U_NR]);
+
+	/*
+	 * Find out which is the first bucket with entries, and which is
+	 * the last.
+	 */
+	start = end = -1U;
+	for (i = 0; i < total; i++) {
+		if (io_u_lat[i] == 0.00)
+			continue;
+
+		if (start == -1U)
+			start = i;
+		end = i;
+	}
+
+	/*
+	 * No entries...
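+	 * start is still -1U if every bucket was 0.00%, in which case
+	 * there is nothing to plot and the frame is skipped entirely.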
+ */ + if (start == -1U) + return; + + tree_view = gfio_output_lat_buckets(&io_u_lat[start], &ranges[start], end - start + 1); + ge->lat_bucket_graph = setup_lat_bucket_graph("Latency buckets", &io_u_lat[start], &ranges[start], end - start + 1, 700.0, 300.0); + + frame = gtk_frame_new("Latency buckets"); + gtk_box_pack_start(GTK_BOX(vbox), frame, FALSE, FALSE, 5); + + completion_vbox = gtk_vbox_new(FALSE, 3); + gtk_container_add(GTK_CONTAINER(frame), completion_vbox); + hbox = gtk_hbox_new(FALSE, 3); + gtk_container_add(GTK_CONTAINER(completion_vbox), hbox); + + drawing_area = gtk_drawing_area_new(); + gtk_widget_set_size_request(GTK_WIDGET(drawing_area), 700, 300); + gtk_widget_modify_bg(drawing_area, GTK_STATE_NORMAL, &gfio_color_white); + gtk_container_add(GTK_CONTAINER(completion_vbox), drawing_area); + g_signal_connect(G_OBJECT(drawing_area), GFIO_DRAW_EVENT, G_CALLBACK(on_expose_lat_drawing_area), ge->lat_bucket_graph); + g_signal_connect(G_OBJECT(drawing_area), "configure_event", G_CALLBACK(on_config_lat_drawing_area), ge->lat_bucket_graph); + + gtk_box_pack_start(GTK_BOX(hbox), tree_view, TRUE, TRUE, 3); +} + +static void gfio_show_lat(GtkWidget *vbox, const char *name, unsigned long long min, + unsigned long long max, double mean, double dev) +{ + const char *base = "(nsec)"; + GtkWidget *hbox, *label, *frame; + char *minp, *maxp; + char tmp[64]; + + if (nsec_to_msec(&min, &max, &mean, &dev)) + base = "(msec)"; + else if (nsec_to_usec(&min, &max, &mean, &dev)) + base = "(usec)"; + + minp = num2str(min, 6, 1, 0, N2S_NONE); + maxp = num2str(max, 6, 1, 0, N2S_NONE); + + sprintf(tmp, "%s %s", name, base); + frame = gtk_frame_new(tmp); + gtk_box_pack_start(GTK_BOX(vbox), frame, FALSE, FALSE, 5); + + hbox = gtk_hbox_new(FALSE, 3); + gtk_container_add(GTK_CONTAINER(frame), hbox); + + label = new_info_label_in_frame(hbox, "Minimum"); + gtk_label_set_text(GTK_LABEL(label), minp); + label = new_info_label_in_frame(hbox, "Maximum"); + gtk_label_set_text(GTK_LABEL(label), maxp); + label = new_info_label_in_frame(hbox, "Average"); + sprintf(tmp, "%5.02f", mean); + gtk_label_set_text(GTK_LABEL(label), tmp); + label = new_info_label_in_frame(hbox, "Standard deviation"); + sprintf(tmp, "%5.02f", dev); + gtk_label_set_text(GTK_LABEL(label), tmp); + + free(minp); + free(maxp); +} + +static GtkWidget *gfio_output_clat_percentiles(unsigned long long *ovals, + fio_fp64_t *plist, + unsigned int len, + const char *base, + unsigned int scale) +{ + GType types[FIO_IO_U_LIST_MAX_LEN]; + GtkWidget *tree_view; + GtkTreeSelection *selection; + GtkListStore *model; + GtkTreeIter iter; + int i, j; + + for (i = 0; i < len; i++) + types[i] = G_TYPE_ULONG; + + model = gtk_list_store_newv(len, types); + + tree_view = gtk_tree_view_new_with_model(GTK_TREE_MODEL(model)); + gtk_widget_set_can_focus(tree_view, FALSE); + + g_object_set(G_OBJECT(tree_view), "headers-visible", TRUE, + "enable-grid-lines", GTK_TREE_VIEW_GRID_LINES_BOTH, NULL); + + selection = gtk_tree_view_get_selection(GTK_TREE_VIEW(tree_view)); + gtk_tree_selection_set_mode(GTK_TREE_SELECTION(selection), GTK_SELECTION_BROWSE); + + for (i = 0; i < len; i++) { + char fbuf[8]; + + sprintf(fbuf, "%2.2f%%", plist[i].u.f); + tree_view_column(tree_view, i, fbuf, ALIGN_RIGHT | UNSORTABLE); + } + + gtk_list_store_append(model, &iter); + + for (i = 0; i < len; i++) { + for (j = 0; j < scale; j++) + ovals[i] = (ovals[i] + 999) / 1000; + gtk_list_store_set(model, &iter, i, (unsigned long) ovals[i], -1); + } + + return tree_view; +} + +static struct graph 
*setup_clat_graph(char *title, unsigned long long *ovals, + fio_fp64_t *plist, + unsigned int len, + double xdim, double ydim) +{ + struct graph *g; + int i; + + g = graph_new(xdim, ydim, gfio_graph_font); + graph_title(g, title); + graph_x_title(g, "Percentile"); + graph_y_title(g, "Time"); + + for (i = 0; i < len; i++) { + graph_label_t l; + char fbuf[8]; + + sprintf(fbuf, "%2.2f%%", plist[i].u.f); + l = graph_add_label(g, fbuf); + graph_add_data(g, l, (double) ovals[i]); + } + + return g; +} + +static void gfio_show_clat_percentiles(struct gfio_client *gc, + GtkWidget *vbox, struct thread_stat *ts, + int ddir, uint64_t *io_u_plat, + unsigned long long nr, const char *type) +{ + fio_fp64_t *plist = ts->percentile_list; + unsigned int len, scale_down; + unsigned long long *ovals, minv, maxv; + const char *base; + GtkWidget *tree_view, *frame, *hbox, *drawing_area, *completion_vbox; + struct gui_entry *ge = gc->ge; + char tmp[64]; + + len = calc_clat_percentiles(io_u_plat, nr, plist, &ovals, &maxv, &minv); + if (!len) + goto out; + + /* + * We default to nsecs, but if the value range is such that we + * should scale down to usecs or msecs, do that. + */ + if (minv > 2000000 && maxv > 99999999ULL) { + scale_down = 2; + base = "msec"; + } else if (minv > 2000 && maxv > 99999) { + scale_down = 1; + base = "usec"; + } else { + scale_down = 0; + base = "nsec"; + } + + sprintf(tmp, "%s latency percentiles (%s)", type, base); + + tree_view = gfio_output_clat_percentiles(ovals, plist, len, base, scale_down); + ge->clat_graph = setup_clat_graph(tmp, ovals, plist, len, 700.0, 300.0); + + frame = gtk_frame_new(tmp); + gtk_box_pack_start(GTK_BOX(vbox), frame, FALSE, FALSE, 5); + + completion_vbox = gtk_vbox_new(FALSE, 3); + gtk_container_add(GTK_CONTAINER(frame), completion_vbox); + hbox = gtk_hbox_new(FALSE, 3); + gtk_container_add(GTK_CONTAINER(completion_vbox), hbox); + drawing_area = gtk_drawing_area_new(); + gtk_widget_set_size_request(GTK_WIDGET(drawing_area), 700, 300); + gtk_widget_modify_bg(drawing_area, GTK_STATE_NORMAL, &gfio_color_white); + gtk_container_add(GTK_CONTAINER(completion_vbox), drawing_area); + g_signal_connect(G_OBJECT(drawing_area), GFIO_DRAW_EVENT, G_CALLBACK(on_expose_lat_drawing_area), ge->clat_graph); + g_signal_connect(G_OBJECT(drawing_area), "configure_event", G_CALLBACK(on_config_lat_drawing_area), ge->clat_graph); + + gtk_box_pack_start(GTK_BOX(hbox), tree_view, TRUE, TRUE, 3); +out: + if (ovals) + free(ovals); +} + +#define GFIO_CLAT 1 +#define GFIO_SLAT 2 +#define GFIO_LAT 4 +#define GFIO_HILAT 8 +#define GFIO_LOLAT 16 + +static void gfio_show_ddir_status(struct gfio_client *gc, GtkWidget *mbox, + struct group_run_stats *rs, + struct thread_stat *ts, int ddir) +{ + const char *ddir_label[3] = { "Read", "Write", "Trim" }; + const char *hilat, *lolat; + GtkWidget *frame, *label, *box, *vbox, *main_vbox; + unsigned long long min[5], max[5]; + unsigned long runt; + unsigned long long bw, iops; + unsigned int flags = 0; + double mean[5], dev[5]; + char *io_p, *io_palt, *bw_p, *bw_palt, *iops_p; + char tmp[128]; + int i2p; + + if (!ts->runtime[ddir]) + return; + + i2p = is_power_of_2(rs->kb_base); + runt = ts->runtime[ddir]; + + bw = (1000 * ts->io_bytes[ddir]) / runt; + + iops = (1000 * (uint64_t)ts->total_io_u[ddir]) / runt; + iops_p = num2str(iops, ts->sig_figs, 1, 0, N2S_PERSEC); + + box = gtk_hbox_new(FALSE, 3); + gtk_box_pack_start(GTK_BOX(mbox), box, TRUE, FALSE, 3); + + frame = gtk_frame_new(ddir_label[ddir]); + gtk_box_pack_start(GTK_BOX(box), frame, TRUE, TRUE, 
5); + + main_vbox = gtk_vbox_new(FALSE, 3); + gtk_container_add(GTK_CONTAINER(frame), main_vbox); + + box = gtk_hbox_new(FALSE, 3); + gtk_box_pack_start(GTK_BOX(main_vbox), box, TRUE, FALSE, 3); + + label = new_info_label_in_frame(box, "IO"); + io_p = num2str(ts->io_bytes[ddir], ts->sig_figs, 1, i2p, N2S_BYTE); + io_palt = num2str(ts->io_bytes[ddir], ts->sig_figs, 1, !i2p, N2S_BYTE); + snprintf(tmp, sizeof(tmp), "%s (%s)", io_p, io_palt); + gtk_label_set_text(GTK_LABEL(label), tmp); + + label = new_info_label_in_frame(box, "Bandwidth"); + bw_p = num2str(bw, ts->sig_figs, 1, i2p, ts->unit_base); + bw_palt = num2str(bw, ts->sig_figs, 1, !i2p, ts->unit_base); + snprintf(tmp, sizeof(tmp), "%s (%s)", bw_p, bw_palt); + gtk_label_set_text(GTK_LABEL(label), tmp); + + label = new_info_label_in_frame(box, "IOPS"); + gtk_label_set_text(GTK_LABEL(label), iops_p); + label = new_info_label_in_frame(box, "Runtime (msec)"); + label_set_int_value(label, ts->runtime[ddir]); + + if (calc_lat(&ts->bw_stat[ddir], &min[0], &max[0], &mean[0], &dev[0])) { + double p_of_agg = 100.0; + const char *bw_str = "KiB/s"; + char tmp[32]; + + if (rs->agg[ddir]) { + p_of_agg = mean[0] * 100 / (double) rs->agg[ddir]; + if (p_of_agg > 100.0) + p_of_agg = 100.0; + } + + if (mean[0] > 1073741824.9) { + min[0] /= 1048576.0; + max[0] /= 1048576.0; + mean[0] /= 1048576.0; + dev[0] /= 1048576.0; + bw_str = "GiB/s"; + } + + if (mean[0] > 1047575.9) { + min[0] /= 1024.0; + max[0] /= 1024.0; + mean[0] /= 1024.0; + dev[0] /= 1024.0; + bw_str = "MiB/s"; + } + sprintf(tmp, "Bandwidth (%s)", bw_str); + frame = gtk_frame_new(tmp); + gtk_box_pack_start(GTK_BOX(main_vbox), frame, FALSE, FALSE, 5); + + box = gtk_hbox_new(FALSE, 3); + gtk_container_add(GTK_CONTAINER(frame), box); + + label = new_info_label_in_frame(box, "Minimum"); + label_set_int_value(label, min[0]); + label = new_info_label_in_frame(box, "Maximum"); + label_set_int_value(label, max[0]); + label = new_info_label_in_frame(box, "Percentage of jobs"); + sprintf(tmp, "%3.2f%%", p_of_agg); + gtk_label_set_text(GTK_LABEL(label), tmp); + label = new_info_label_in_frame(box, "Average"); + sprintf(tmp, "%5.02f", mean[0]); + gtk_label_set_text(GTK_LABEL(label), tmp); + label = new_info_label_in_frame(box, "Standard deviation"); + sprintf(tmp, "%5.02f", dev[0]); + gtk_label_set_text(GTK_LABEL(label), tmp); + } + + if (calc_lat(&ts->slat_stat[ddir], &min[0], &max[0], &mean[0], &dev[0])) + flags |= GFIO_SLAT; + if (calc_lat(&ts->clat_stat[ddir], &min[1], &max[1], &mean[1], &dev[1])) + flags |= GFIO_CLAT; + if (calc_lat(&ts->lat_stat[ddir], &min[2], &max[2], &mean[2], &dev[2])) + flags |= GFIO_LAT; + if (calc_lat(&ts->clat_high_prio_stat[ddir], &min[3], &max[3], &mean[3], &dev[3])) { + flags |= GFIO_HILAT; + if (calc_lat(&ts->clat_low_prio_stat[ddir], &min[4], &max[4], &mean[4], &dev[4])) + flags |= GFIO_LOLAT; + /* we only want to print low priority statistics if other IOs were + * submitted with the priority bit set + */ + } + + if (flags) { + frame = gtk_frame_new("Latency"); + gtk_box_pack_start(GTK_BOX(main_vbox), frame, FALSE, FALSE, 5); + + vbox = gtk_vbox_new(FALSE, 3); + gtk_container_add(GTK_CONTAINER(frame), vbox); + + if (ts->lat_percentiles) { + hilat = "High priority total latency"; + lolat = "Low priority total latency"; + } else { + hilat = "High priority completion latency"; + lolat = "Low priority completion latency"; + } + + if (flags & GFIO_SLAT) + gfio_show_lat(vbox, "Submission latency", min[0], max[0], mean[0], dev[0]); + if (flags & GFIO_CLAT) + gfio_show_lat(vbox, 
"Completion latency", min[1], max[1], mean[1], dev[1]); + if (flags & GFIO_LAT) + gfio_show_lat(vbox, "Total latency", min[2], max[2], mean[2], dev[2]); + if (flags & GFIO_HILAT) + gfio_show_lat(vbox, hilat, min[3], max[3], mean[3], dev[3]); + if (flags & GFIO_LOLAT) + gfio_show_lat(vbox, lolat, min[4], max[4], mean[4], dev[4]); + } + + if (ts->slat_percentiles && flags & GFIO_SLAT) + gfio_show_clat_percentiles(gc, main_vbox, ts, ddir, + ts->io_u_plat[FIO_SLAT][ddir], + ts->slat_stat[ddir].samples, + "Submission"); + if (ts->clat_percentiles && flags & GFIO_CLAT) { + gfio_show_clat_percentiles(gc, main_vbox, ts, ddir, + ts->io_u_plat[FIO_CLAT][ddir], + ts->clat_stat[ddir].samples, + "Completion"); + if (!ts->lat_percentiles) { + if (flags & GFIO_HILAT) + gfio_show_clat_percentiles(gc, main_vbox, ts, ddir, + ts->io_u_plat_high_prio[ddir], + ts->clat_high_prio_stat[ddir].samples, + "High priority completion"); + if (flags & GFIO_LOLAT) + gfio_show_clat_percentiles(gc, main_vbox, ts, ddir, + ts->io_u_plat_low_prio[ddir], + ts->clat_low_prio_stat[ddir].samples, + "Low priority completion"); + } + } + if (ts->lat_percentiles && flags & GFIO_LAT) { + gfio_show_clat_percentiles(gc, main_vbox, ts, ddir, + ts->io_u_plat[FIO_LAT][ddir], + ts->lat_stat[ddir].samples, + "Total"); + if (flags & GFIO_HILAT) + gfio_show_clat_percentiles(gc, main_vbox, ts, ddir, + ts->io_u_plat_high_prio[ddir], + ts->clat_high_prio_stat[ddir].samples, + "High priority total"); + if (flags & GFIO_LOLAT) + gfio_show_clat_percentiles(gc, main_vbox, ts, ddir, + ts->io_u_plat_low_prio[ddir], + ts->clat_low_prio_stat[ddir].samples, + "Low priority total"); + } + + free(io_p); + free(bw_p); + free(io_palt); + free(bw_palt); + free(iops_p); +} + +static void __gfio_display_end_results(GtkWidget *win, struct gfio_client *gc, + struct thread_stat *ts, + struct group_run_stats *rs) +{ + GtkWidget *box, *vbox, *entry, *scroll; + int i; + + scroll = gtk_scrolled_window_new(NULL, NULL); + gtk_container_set_border_width(GTK_CONTAINER(scroll), 5); + gtk_scrolled_window_set_policy(GTK_SCROLLED_WINDOW(scroll), GTK_POLICY_AUTOMATIC, GTK_POLICY_AUTOMATIC); + + vbox = gtk_vbox_new(FALSE, 3); + + box = gtk_hbox_new(FALSE, 0); + gtk_box_pack_start(GTK_BOX(vbox), box, TRUE, FALSE, 5); + + gtk_scrolled_window_add_with_viewport(GTK_SCROLLED_WINDOW(scroll), vbox); + + gtk_notebook_append_page(GTK_NOTEBOOK(win), scroll, gtk_label_new(ts->name)); + + entry = new_info_entry_in_frame(box, "Name"); + gtk_entry_set_text(GTK_ENTRY(entry), ts->name); + if (strlen(ts->description)) { + entry = new_info_entry_in_frame(box, "Description"); + gtk_entry_set_text(GTK_ENTRY(entry), ts->description); + } + entry = new_info_entry_in_frame(box, "Group ID"); + entry_set_int_value(entry, ts->groupid); + entry = new_info_entry_in_frame(box, "Jobs"); + entry_set_int_value(entry, ts->members); + gc->err_entry = entry = new_info_entry_in_frame(box, "Error"); + entry_set_int_value(entry, ts->error); + entry = new_info_entry_in_frame(box, "PID"); + entry_set_int_value(entry, ts->pid); + + for (i = 0; i < DDIR_RWDIR_CNT; i++) { + if (ts->io_bytes[i]) + gfio_show_ddir_status(gc, vbox, rs, ts, i); + } + + gfio_show_latency_buckets(gc, vbox, ts); + gfio_show_cpu_usage(vbox, ts); + gfio_show_io_depths(vbox, ts); +} + +void gfio_display_end_results(struct gfio_client *gc) +{ + struct gui_entry *ge = gc->ge; + GtkWidget *res_notebook; + int i; + + res_notebook = get_results_window(ge); + + for (i = 0; i < gc->nr_results; i++) { + struct end_results *e = &gc->results[i]; + + 
__gfio_display_end_results(res_notebook, gc, &e->ts, &e->gs);
+	}
+
+	if (gfio_disk_util_show(gc))
+		gtk_widget_show_all(ge->results_window);
+}
+
+static void gfio_display_ts(struct fio_client *client, struct thread_stat *ts,
+			    struct group_run_stats *rs)
+{
+	struct gfio_client *gc = client->client_data;
+	struct gui_entry *ge = gc->ge;
+
+	gfio_add_end_results(gc, ts, rs);
+
+	gdk_threads_enter();
+	if (ge->results_window)
+		__gfio_display_end_results(ge->results_notebook, gc, ts, rs);
+	else
+		gfio_display_end_results(gc);
+	gdk_threads_leave();
+}
+
+static void gfio_client_removed(struct fio_client *client)
+{
+	struct gfio_client *gc = client->client_data;
+
+	assert(gc->client == client);
+	fio_put_client(gc->client);
+	gc->client = NULL;
+}
+
+struct client_ops gfio_client_ops = {
+	.text = gfio_text_op,
+	.disk_util = gfio_disk_util_op,
+	.thread_status = gfio_thread_status_op,
+	.group_stats = gfio_group_stats_op,
+	.jobs_eta = gfio_update_client_eta,
+	.eta = gfio_update_all_eta,
+	.probe = gfio_probe_op,
+	.quit = gfio_quit_op,
+	.add_job = gfio_add_job_op,
+	.update_job = gfio_update_job_op,
+	.timed_out = gfio_client_timed_out,
+	.stop = gfio_client_stop,
+	.start = gfio_client_start,
+	.job_start = gfio_client_job_start,
+	.removed = gfio_client_removed,
+	.eta_msec = FIO_CLIENT_DEF_ETA_MSEC,
+	.stay_connected = 1,
+	.client_type = FIO_CLIENT_TYPE_GUI,
+};
diff --git a/gclient.h b/gclient.h
new file mode 100644
index 0000000..4038365
--- /dev/null
+++ b/gclient.h
@@ -0,0 +1,18 @@
+#ifndef GFIO_CLIENT_H
+#define GFIO_CLIENT_H
+
+extern struct client_ops gfio_client_ops;
+
+extern void gfio_display_end_results(struct gfio_client *);
+
+#define GFIO_READ_R 0.13
+#define GFIO_READ_G 0.54
+#define GFIO_READ_B 0.13
+#define GFIO_WRITE_R 1.00
+#define GFIO_WRITE_G 0.00
+#define GFIO_WRITE_B 0.00
+#define GFIO_TRIM_R 0.24
+#define GFIO_TRIM_G 0.18
+#define GFIO_TRIM_B 0.52
+
+#endif
diff --git a/gcompat.c b/gcompat.c
new file mode 100644
index 0000000..5944df0
--- /dev/null
+++ b/gcompat.c
@@ -0,0 +1,59 @@
+#include <gtk/gtk.h>
+
+#include "gcompat.h"
+
+#if GTK_MAJOR_VERSION <= 2 && GTK_MINOR_VERSION < 24
+
+GtkWidget *gtk_combo_box_text_new(void)
+{
+	return gtk_combo_box_new();
+}
+
+void gtk_combo_box_text_append_text(GtkComboBoxText *combo_box,
+				    const gchar *text)
+{
+	gtk_combo_box_append_text(GTK_COMBO_BOX(combo_box), text);
+}
+
+void gtk_combo_box_text_insert_text(GtkComboBoxText *combo_box, gint position,
+				    const gchar *text)
+{
+	gtk_combo_box_insert_text(GTK_COMBO_BOX(combo_box), position, text);
+}
+
+void gtk_combo_box_text_prepend_text(GtkComboBoxText *combo_box,
+				     const gchar *text)
+{
+	gtk_combo_box_prepend_text(GTK_COMBO_BOX(combo_box), text);
+}
+
+gchar *gtk_combo_box_text_get_active_text(GtkComboBoxText *combo_box)
+{
+	return gtk_combo_box_get_active_text(GTK_COMBO_BOX(combo_box));
+}
+
+#endif
+
+#if GTK_MAJOR_VERSION < 3
+
+guint gtk_widget_get_allocated_width(GtkWidget *w)
+{
+	return w->allocation.width;
+}
+
+guint gtk_widget_get_allocated_height(GtkWidget *w)
+{
+	return w->allocation.height;
+}
+
+#endif
+
+#if GTK_MAJOR_VERSION <= 2 && GTK_MINOR_VERSION < 18
+void gtk_widget_set_can_focus(GtkWidget *widget, gboolean can_focus)
+{
+	if (can_focus)
+		GTK_WIDGET_SET_FLAGS(widget, GTK_CAN_FOCUS);
+	else
+		GTK_WIDGET_UNSET_FLAGS(widget, GTK_CAN_FOCUS);
+}
+#endif
diff --git a/gcompat.h b/gcompat.h
new file mode 100644
index 0000000..e0f7444
--- /dev/null
+++ b/gcompat.h
@@ -0,0 +1,46 @@
+#ifndef GFIO_GTK_COMPAT
+#define GFIO_GTK_COMPAT
+
+#include <gtk/gtk.h>
+
+#if 
GTK_MAJOR_VERSION <= 2 && GTK_MINOR_VERSION < 24 +struct GtkComboBoxText; +typedef GtkComboBox GtkComboBoxText; +GtkWidget *gtk_combo_box_text_new(void); +GtkWidget *gtk_combo_box_text_new_with_entry(void); +void gtk_combo_box_text_append_text(GtkComboBoxText *combo_box, const gchar *text); +void gtk_combo_box_text_insert_text(GtkComboBoxText *combo_box, gint position, const gchar *text); +void gtk_combo_box_text_prepend_text(GtkComboBoxText *combo_box, const gchar *text); +void gtk_combo_box_text_remove(GtkComboBoxText *combo_box, gint position); +gchar *gtk_combo_box_text_get_active_text(GtkComboBoxText *combo_box); + +#define GTK_COMBO_BOX_TEXT GTK_COMBO_BOX +#endif /* GTK_MAJOR_VERSION <= 2 && GTK_MINOR_VERSION < 24 */ + +#if GTK_MAJOR_VERSION <= 2 && GTK_MINOR_VERSION < 14 +static inline GtkWidget *gtk_dialog_get_content_area(GtkDialog *dialog) +{ + return dialog->vbox; +} +static inline GdkWindow *gtk_widget_get_window(GtkWidget *w) +{ + return w->window; +} +#endif + +#if GTK_MAJOR_VERSION < 3 +guint gtk_widget_get_allocated_width(GtkWidget *w); +guint gtk_widget_get_allocated_height(GtkWidget *w); +#endif + +#if GTK_MAJOR_VERSION == 3 +#define GFIO_DRAW_EVENT "draw" +#elif GTK_MAJOR_VERSION == 2 +#define GFIO_DRAW_EVENT "expose_event" +#endif + +#if GTK_MAJOR_VERSION <= 2 && GTK_MINOR_VERSION < 18 +void gtk_widget_set_can_focus(GtkWidget *widget, gboolean can_focus); +#endif + +#endif diff --git a/gerror.c b/gerror.c new file mode 100644 index 0000000..1ebcb27 --- /dev/null +++ b/gerror.c @@ -0,0 +1,76 @@ +#include +#include +#include +#include + +#include + +#include "gfio.h" +#include "gerror.h" + +static void on_info_bar_response(GtkWidget *widget, gint response, + gpointer data) +{ + struct gui *ui = (struct gui *) data; + + if (response == GTK_RESPONSE_OK) { + gtk_widget_destroy(widget); + ui->error_info_bar = NULL; + } +} + +static void report_error(struct gui_entry *ge, GError *error) +{ + struct gui *ui = ge->ui; + + if (ui->error_info_bar == NULL) { + GtkWidget *container; + + ui->error_info_bar = gtk_info_bar_new_with_buttons(GTK_STOCK_OK, + GTK_RESPONSE_OK, NULL); + g_signal_connect(ui->error_info_bar, "response", G_CALLBACK(on_info_bar_response), ui); + gtk_info_bar_set_message_type(GTK_INFO_BAR(ui->error_info_bar), + GTK_MESSAGE_ERROR); + + ui->error_label = gtk_label_new(error->message); + container = gtk_info_bar_get_content_area(GTK_INFO_BAR(ui->error_info_bar)); + gtk_container_add(GTK_CONTAINER(container), ui->error_label); + + gtk_box_pack_start(GTK_BOX(ui->vbox), ui->error_info_bar, FALSE, FALSE, 0); + gtk_widget_show_all(ui->vbox); + } else { + char buffer[256]; + snprintf(buffer, sizeof(buffer), "Failed to open file."); + gtk_label_set_text(GTK_LABEL(ui->error_label), buffer); + } +} + +void gfio_report_error(struct gui_entry *ge, const char *format, ...) 
+{ + va_list args; + GError *error; + + va_start(args, format); + error = g_error_new_valist(g_quark_from_string("fio"), 1, format, args); + va_end(args); + + report_error(ge, error); + g_error_free(error); +} + +void gfio_report_info(struct gui *ui, const char *title, const char *message) +{ + GtkWidget *dialog, *content, *label; + + dialog = gtk_dialog_new_with_buttons(title, GTK_WINDOW(ui->window), + GTK_DIALOG_MODAL | GTK_DIALOG_DESTROY_WITH_PARENT, + GTK_STOCK_OK, GTK_RESPONSE_OK, NULL); + + content = gtk_dialog_get_content_area(GTK_DIALOG(dialog)); + label = gtk_label_new(message); + gtk_container_add(GTK_CONTAINER(content), label); + gtk_widget_show_all(dialog); + gtk_dialog_set_default_response(GTK_DIALOG(dialog), GTK_RESPONSE_ACCEPT); + gtk_dialog_run(GTK_DIALOG(dialog)); + gtk_widget_destroy(dialog); +} diff --git a/gerror.h b/gerror.h new file mode 100644 index 0000000..3767f92 --- /dev/null +++ b/gerror.h @@ -0,0 +1,7 @@ +#ifndef GFIO_ERROR_H +#define GFIO_ERROR_H + +extern void gfio_report_error(struct gui_entry *ge, const char *format, ...); +extern void gfio_report_info(struct gui *ui, const char *title, const char *message); + +#endif diff --git a/gettime-thread.c b/gettime-thread.c new file mode 100644 index 0000000..0a2cc6c --- /dev/null +++ b/gettime-thread.c @@ -0,0 +1,104 @@ +#include +#include + +#include "fio.h" +#include "smalloc.h" + +struct timespec *fio_ts = NULL; +int fio_gtod_offload = 0; +static pthread_t gtod_thread; +static os_cpu_mask_t fio_gtod_cpumask; + +void fio_gtod_init(void) +{ + if (fio_ts) + return; + + fio_ts = smalloc(sizeof(*fio_ts)); +} + +static void fio_gtod_update(void) +{ + if (fio_ts) { + struct timeval __tv; + + gettimeofday(&__tv, NULL); + fio_ts->tv_sec = __tv.tv_sec; + write_barrier(); + fio_ts->tv_nsec = __tv.tv_usec * 1000; + write_barrier(); + } +} + +struct gtod_cpu_data { + struct fio_sem *sem; + unsigned int cpu; +}; + +static void *gtod_thread_main(void *data) +{ + struct fio_sem *sem = data; + int ret; + + ret = fio_setaffinity(gettid(), fio_gtod_cpumask); + + fio_sem_up(sem); + + if (ret == -1) { + log_err("gtod: setaffinity failed\n"); + return NULL; + } + + /* + * As long as we have jobs around, update the clock. It would be nice + * to have some way of NOT hammering that CPU with gettimeofday(), + * but I'm not sure what to use outside of a simple CPU nop to relax + * it - we don't want to lose precision. 
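+	 * Readers pick the updated time up locklessly via fio_gettime_offload().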
+	 */
+	while (threads) {
+		fio_gtod_update();
+		nop;
+	}
+
+	return NULL;
+}
+
+int fio_start_gtod_thread(void)
+{
+	struct fio_sem *sem;
+	pthread_attr_t attr;
+	int ret;
+
+	sem = fio_sem_init(FIO_SEM_LOCKED);
+	if (!sem)
+		return 1;
+
+	pthread_attr_init(&attr);
+	pthread_attr_setstacksize(&attr, 2 * PTHREAD_STACK_MIN);
+	ret = pthread_create(&gtod_thread, &attr, gtod_thread_main, sem);
+	pthread_attr_destroy(&attr);
+	if (ret) {
+		log_err("Can't create gtod thread: %s\n", strerror(ret));
+		goto err;
+	}
+
+	ret = pthread_detach(gtod_thread);
+	if (ret) {
+		log_err("Can't detach gtod thread: %s\n", strerror(ret));
+		goto err;
+	}
+
+	dprint(FD_MUTEX, "wait on startup_sem\n");
+	fio_sem_down(sem);
+	dprint(FD_MUTEX, "done waiting on startup_sem\n");
+err:
+	fio_sem_remove(sem);
+	return ret;
+}
+
+void fio_gtod_set_cpu(unsigned int cpu)
+{
+#ifdef FIO_HAVE_CPU_AFFINITY
+	fio_cpu_set(&fio_gtod_cpumask, cpu);
+#endif
+}
diff --git a/gettime.c b/gettime.c
new file mode 100644
index 0000000..c3a4966
--- /dev/null
+++ b/gettime.c
@@ -0,0 +1,777 @@
+/*
+ * Clock functions
+ */
+
+#include <math.h>
+
+#include "fio.h"
+#include "os/os.h"
+
+#if defined(ARCH_HAVE_CPU_CLOCK)
+#ifndef ARCH_CPU_CLOCK_CYCLES_PER_USEC
+static unsigned long long cycles_per_msec;
+static unsigned long long cycles_start;
+static unsigned long long clock_mult;
+static unsigned long long max_cycles_mask;
+static unsigned long long nsecs_for_max_cycles;
+static unsigned int clock_shift;
+static unsigned int max_cycles_shift;
+#define MAX_CLOCK_SEC 60*60
+#endif
+#ifdef ARCH_CPU_CLOCK_WRAPS
+static unsigned int cycles_wrap;
+#endif
+#endif
+bool tsc_reliable = false;
+
+struct tv_valid {
+	int warned;
+};
+#ifdef ARCH_HAVE_CPU_CLOCK
+#ifdef CONFIG_TLS_THREAD
+static __thread struct tv_valid static_tv_valid;
+#else
+static pthread_key_t tv_tls_key;
+#endif
+#endif
+
+enum fio_cs fio_clock_source = FIO_PREFERRED_CLOCK_SOURCE;
+int fio_clock_source_set = 0;
+static enum fio_cs fio_clock_source_inited = CS_INVAL;
+
+#ifdef FIO_DEBUG_TIME
+
+#define HASH_BITS 8
+#define HASH_SIZE (1 << HASH_BITS)
+
+static struct flist_head hash[HASH_SIZE];
+static int gtod_inited;
+
+struct gtod_log {
+	struct flist_head list;
+	void *caller;
+	unsigned long calls;
+};
+
+static struct gtod_log *find_hash(void *caller)
+{
+	unsigned long h = hash_ptr(caller, HASH_BITS);
+	struct flist_head *entry;
+
+	flist_for_each(entry, &hash[h]) {
+		struct gtod_log *log = flist_entry(entry, struct gtod_log,
+								list);
+
+		if (log->caller == caller)
+			return log;
+	}
+
+	return NULL;
+}
+
+static void inc_caller(void *caller)
+{
+	struct gtod_log *log = find_hash(caller);
+
+	if (!log) {
+		unsigned long h;
+
+		log = malloc(sizeof(*log));
+		INIT_FLIST_HEAD(&log->list);
+		log->caller = caller;
+		log->calls = 0;
+
+		h = hash_ptr(caller, HASH_BITS);
+		flist_add_tail(&log->list, &hash[h]);
+	}
+
+	log->calls++;
+}
+
+static void gtod_log_caller(void *caller)
+{
+	if (gtod_inited)
+		inc_caller(caller);
+}
+
+static void fio_exit fio_dump_gtod(void)
+{
+	unsigned long total_calls = 0;
+	int i;
+
+	for (i = 0; i < HASH_SIZE; i++) {
+		struct flist_head *entry;
+		struct gtod_log *log;
+
+		flist_for_each(entry, &hash[i]) {
+			log = flist_entry(entry, struct gtod_log, list);
+
+			printf("function %p, calls %lu\n", log->caller,
+							log->calls);
+			total_calls += log->calls;
+		}
+	}
+
+	printf("Total %lu gettimeofday\n", total_calls);
+}
+
+static void fio_init gtod_init(void)
+{
+	int i;
+
+	for (i = 0; i < HASH_SIZE; i++)
+		INIT_FLIST_HEAD(&hash[i]);
+
+	gtod_inited = 1;
+}
+
+#endif /* 
FIO_DEBUG_TIME */ + +#ifdef CONFIG_CLOCK_GETTIME +static int fill_clock_gettime(struct timespec *ts) +{ +#if defined(CONFIG_CLOCK_MONOTONIC_RAW) + return clock_gettime(CLOCK_MONOTONIC_RAW, ts); +#elif defined(CONFIG_CLOCK_MONOTONIC) + return clock_gettime(CLOCK_MONOTONIC, ts); +#else + return clock_gettime(CLOCK_REALTIME, ts); +#endif +} +#endif + +static void __fio_gettime(struct timespec *tp) +{ + switch (fio_clock_source) { +#ifdef CONFIG_GETTIMEOFDAY + case CS_GTOD: { + struct timeval tv; + gettimeofday(&tv, NULL); + + tp->tv_sec = tv.tv_sec; + tp->tv_nsec = tv.tv_usec * 1000; + break; + } +#endif +#ifdef CONFIG_CLOCK_GETTIME + case CS_CGETTIME: { + if (fill_clock_gettime(tp) < 0) { + log_err("fio: clock_gettime fails\n"); + assert(0); + } + break; + } +#endif +#ifdef ARCH_HAVE_CPU_CLOCK + case CS_CPUCLOCK: { + uint64_t nsecs, t, multiples; + struct tv_valid *tv; + +#ifdef CONFIG_TLS_THREAD + tv = &static_tv_valid; +#else + tv = pthread_getspecific(tv_tls_key); +#endif + + t = get_cpu_clock(); +#ifdef ARCH_CPU_CLOCK_WRAPS + if (t < cycles_start && !cycles_wrap) + cycles_wrap = 1; + else if (cycles_wrap && t >= cycles_start && !tv->warned) { + log_err("fio: double CPU clock wrap\n"); + tv->warned = 1; + } +#endif +#ifdef ARCH_CPU_CLOCK_CYCLES_PER_USEC + nsecs = t / ARCH_CPU_CLOCK_CYCLES_PER_USEC * 1000; +#else + t -= cycles_start; + multiples = t >> max_cycles_shift; + nsecs = multiples * nsecs_for_max_cycles; + nsecs += ((t & max_cycles_mask) * clock_mult) >> clock_shift; +#endif + tp->tv_sec = nsecs / 1000000000ULL; + tp->tv_nsec = nsecs % 1000000000ULL; + break; + } +#endif + default: + log_err("fio: invalid clock source %d\n", fio_clock_source); + break; + } +} + +#ifdef FIO_DEBUG_TIME +void fio_gettime(struct timespec *tp, void *caller) +#else +void fio_gettime(struct timespec *tp, void fio_unused *caller) +#endif +{ +#ifdef FIO_DEBUG_TIME + if (!caller) + caller = __builtin_return_address(0); + + gtod_log_caller(caller); +#endif + if (fio_unlikely(fio_gettime_offload(tp))) + return; + + __fio_gettime(tp); +} + +#if defined(ARCH_HAVE_CPU_CLOCK) && !defined(ARCH_CPU_CLOCK_CYCLES_PER_USEC) +static unsigned long get_cycles_per_msec(void) +{ + struct timespec s, e; + uint64_t c_s, c_e; + enum fio_cs old_cs = fio_clock_source; + uint64_t elapsed; + +#ifdef CONFIG_CLOCK_GETTIME + fio_clock_source = CS_CGETTIME; +#else + fio_clock_source = CS_GTOD; +#endif + __fio_gettime(&s); + + c_s = get_cpu_clock(); + do { + __fio_gettime(&e); + c_e = get_cpu_clock(); + + elapsed = ntime_since(&s, &e); + if (elapsed >= 1280000) + break; + } while (1); + + fio_clock_source = old_cs; + return (c_e - c_s) * 1000000 / elapsed; +} + +#define NR_TIME_ITERS 50 + +static int calibrate_cpu_clock(void) +{ + double delta, mean, S; + uint64_t minc, maxc, avg, cycles[NR_TIME_ITERS]; + int i, samples, sft = 0; + unsigned long long tmp, max_ticks, max_mult; + + cycles[0] = get_cycles_per_msec(); + S = delta = mean = 0.0; + for (i = 0; i < NR_TIME_ITERS; i++) { + cycles[i] = get_cycles_per_msec(); + delta = cycles[i] - mean; + if (delta) { + mean += delta / (i + 1.0); + S += delta * (cycles[i] - mean); + } + } + + /* + * The most common platform clock breakage is returning zero + * indefinitely. Check for that and return failure. 
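+	 * A zero first and last sample is treated as that breakage.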
+	 */
+	if (!cycles[0] && !cycles[NR_TIME_ITERS - 1])
+		return 1;
+
+	S = sqrt(S / (NR_TIME_ITERS - 1.0));
+
+	minc = -1ULL;
+	maxc = samples = avg = 0;
+	for (i = 0; i < NR_TIME_ITERS; i++) {
+		double this = cycles[i];
+
+		minc = min(cycles[i], minc);
+		maxc = max(cycles[i], maxc);
+
+		if ((fmax(this, mean) - fmin(this, mean)) > S)
+			continue;
+		samples++;
+		avg += this;
+	}
+
+	S /= (double) NR_TIME_ITERS;
+
+	for (i = 0; i < NR_TIME_ITERS; i++)
+		dprint(FD_TIME, "cycles[%d]=%llu\n", i, (unsigned long long) cycles[i]);
+
+	avg /= samples;
+	cycles_per_msec = avg;
+	dprint(FD_TIME, "min=%llu, max=%llu, mean=%f, S=%f, N=%d\n",
+			(unsigned long long) minc,
+			(unsigned long long) maxc, mean, S, NR_TIME_ITERS);
+	dprint(FD_TIME, "trimmed mean=%llu, N=%d\n", (unsigned long long) avg, samples);
+
+	max_ticks = MAX_CLOCK_SEC * cycles_per_msec * 1000ULL;
+	max_mult = ULLONG_MAX / max_ticks;
+	dprint(FD_TIME, "\n\nmax_ticks=%llu, __builtin_clzll=%d, "
+			"max_mult=%llu\n", max_ticks,
+			__builtin_clzll(max_ticks), max_mult);
+
+	/*
+	 * Find the largest shift count that will produce
+	 * a multiplier that does not exceed max_mult
+	 */
+	tmp = max_mult * cycles_per_msec / 1000000;
+	while (tmp > 1) {
+		tmp >>= 1;
+		sft++;
+		dprint(FD_TIME, "tmp=%llu, sft=%u\n", tmp, sft);
+	}
+
+	clock_shift = sft;
+	clock_mult = (1ULL << sft) * 1000000 / cycles_per_msec;
+	dprint(FD_TIME, "clock_shift=%u, clock_mult=%llu\n", clock_shift,
+							clock_mult);
+
+	/*
+	 * Find the greatest power of 2 clock ticks that is less than the
+	 * ticks in MAX_CLOCK_SEC
+	 */
+	max_cycles_shift = max_cycles_mask = 0;
+	tmp = MAX_CLOCK_SEC * 1000ULL * cycles_per_msec;
+	dprint(FD_TIME, "tmp=%llu, max_cycles_shift=%u\n", tmp,
+							max_cycles_shift);
+	while (tmp > 1) {
+		tmp >>= 1;
+		max_cycles_shift++;
+		dprint(FD_TIME, "tmp=%llu, max_cycles_shift=%u\n", tmp, max_cycles_shift);
+	}
+	/*
+	 * If we use (1ULL << max_cycles_shift) * 1000 / cycles_per_msec
+	 * here, we will have a discontinuity every
+	 * (1ULL << max_cycles_shift) cycles
+	 */
+	nsecs_for_max_cycles = ((1ULL << max_cycles_shift) * clock_mult)
+					>> clock_shift;
+
+	/* Use a bitmask to calculate ticks % (1ULL << max_cycles_shift) */
+	for (tmp = 0; tmp < max_cycles_shift; tmp++)
+		max_cycles_mask |= 1ULL << tmp;
+
+	dprint(FD_TIME, "max_cycles_shift=%u, 2^max_cycles_shift=%llu, "
+			"nsecs_for_max_cycles=%llu, "
+			"max_cycles_mask=%016llx\n",
+			max_cycles_shift, (1ULL << max_cycles_shift),
+			nsecs_for_max_cycles, max_cycles_mask);
+
+	cycles_start = get_cpu_clock();
+	dprint(FD_TIME, "cycles_start=%llu\n", cycles_start);
+	return 0;
+}
+#else
+static int calibrate_cpu_clock(void)
+{
+#ifdef ARCH_CPU_CLOCK_CYCLES_PER_USEC
+	return 0;
+#else
+	return 1;
+#endif
+}
+#endif // ARCH_HAVE_CPU_CLOCK
+
+#if defined(ARCH_HAVE_CPU_CLOCK) && !defined(CONFIG_TLS_THREAD)
+void fio_local_clock_init(void)
+{
+	struct tv_valid *t;
+
+	t = calloc(1, sizeof(*t));
+	if (pthread_setspecific(tv_tls_key, t)) {
+		log_err("fio: can't set TLS key\n");
+		assert(0);
+	}
+}
+
+static void kill_tv_tls_key(void *data)
+{
+	free(data);
+}
+#else
+void fio_local_clock_init(void)
+{
+}
+#endif
+
+void fio_clock_init(void)
+{
+	if (fio_clock_source == fio_clock_source_inited)
+		return;
+
+#if defined(ARCH_HAVE_CPU_CLOCK) && !defined(CONFIG_TLS_THREAD)
+	if (pthread_key_create(&tv_tls_key, kill_tv_tls_key))
+		log_err("fio: can't create TLS key\n");
+#endif
+
+	fio_clock_source_inited = fio_clock_source;
+
+	if (calibrate_cpu_clock())
+		tsc_reliable = false;
+
+	/*
+	 * If the arch sets tsc_reliable != 0, then it 
must be good enough + * to use as THE clock source. For x86 CPUs, this means the TSC + * runs at a constant rate and is synced across CPU cores. + */ + if (tsc_reliable) { + if (!fio_clock_source_set && !fio_monotonic_clocktest(0)) + fio_clock_source = CS_CPUCLOCK; + } else if (fio_clock_source == CS_CPUCLOCK) + log_info("fio: clocksource=cpu may not be reliable\n"); + dprint(FD_TIME, "gettime: clocksource=%d\n", (int) fio_clock_source); +} + +uint64_t ntime_since(const struct timespec *s, const struct timespec *e) +{ + int64_t sec, nsec; + + sec = e->tv_sec - s->tv_sec; + nsec = e->tv_nsec - s->tv_nsec; + if (sec > 0 && nsec < 0) { + sec--; + nsec += 1000000000LL; + } + + /* + * time warp bug on some kernels? + */ + if (sec < 0 || (sec == 0 && nsec < 0)) + return 0; + + return nsec + (sec * 1000000000LL); +} + +uint64_t ntime_since_now(const struct timespec *s) +{ + struct timespec now; + + fio_gettime(&now, NULL); + return ntime_since(s, &now); +} + +uint64_t utime_since(const struct timespec *s, const struct timespec *e) +{ + int64_t sec, usec; + + sec = e->tv_sec - s->tv_sec; + usec = (e->tv_nsec - s->tv_nsec) / 1000; + if (sec > 0 && usec < 0) { + sec--; + usec += 1000000; + } + + /* + * time warp bug on some kernels? + */ + if (sec < 0 || (sec == 0 && usec < 0)) + return 0; + + return usec + (sec * 1000000); +} + +uint64_t utime_since_now(const struct timespec *s) +{ + struct timespec t; +#ifdef FIO_DEBUG_TIME + void *p = __builtin_return_address(0); + + fio_gettime(&t, p); +#else + fio_gettime(&t, NULL); +#endif + + return utime_since(s, &t); +} + +uint64_t mtime_since_tv(const struct timeval *s, const struct timeval *e) +{ + int64_t sec, usec; + + sec = e->tv_sec - s->tv_sec; + usec = (e->tv_usec - s->tv_usec); + if (sec > 0 && usec < 0) { + sec--; + usec += 1000000; + } + + if (sec < 0 || (sec == 0 && usec < 0)) + return 0; + + sec *= 1000; + usec /= 1000; + return sec + usec; +} + +uint64_t mtime_since_now(const struct timespec *s) +{ + struct timespec t; +#ifdef FIO_DEBUG_TIME + void *p = __builtin_return_address(0); + + fio_gettime(&t, p); +#else + fio_gettime(&t, NULL); +#endif + + return mtime_since(s, &t); +} + +uint64_t mtime_since(const struct timespec *s, const struct timespec *e) +{ + int64_t sec, usec; + + sec = e->tv_sec - s->tv_sec; + usec = (e->tv_nsec - s->tv_nsec) / 1000; + if (sec > 0 && usec < 0) { + sec--; + usec += 1000000; + } + + if (sec < 0 || (sec == 0 && usec < 0)) + return 0; + + sec *= 1000; + usec /= 1000; + return sec + usec; +} + +uint64_t time_since_now(const struct timespec *s) +{ + return mtime_since_now(s) / 1000; +} + +#if defined(FIO_HAVE_CPU_AFFINITY) && defined(ARCH_HAVE_CPU_CLOCK) && \ + defined(CONFIG_SYNC_SYNC) && defined(CONFIG_CMP_SWAP) + +#define CLOCK_ENTRIES_DEBUG 100000 +#define CLOCK_ENTRIES_TEST 1000 + +struct clock_entry { + uint32_t seq; + uint32_t cpu; + uint64_t tsc; +}; + +struct clock_thread { + pthread_t thread; + int cpu; + int debug; + struct fio_sem lock; + unsigned long nr_entries; + uint32_t *seq; + struct clock_entry *entries; +}; + +static inline uint32_t atomic32_compare_and_swap(uint32_t *ptr, uint32_t old, + uint32_t new) +{ + return __sync_val_compare_and_swap(ptr, old, new); +} + +static void *clock_thread_fn(void *data) +{ + struct clock_thread *t = data; + struct clock_entry *c; + os_cpu_mask_t cpu_mask; + unsigned long long first; + int i; + + if (fio_cpuset_init(&cpu_mask)) { + int __err = errno; + + log_err("clock cpuset init failed: %s\n", strerror(__err)); + goto err_out; + } + + fio_cpu_set(&cpu_mask, 
t->cpu); + + if (fio_setaffinity(gettid(), cpu_mask) == -1) { + int __err = errno; + + log_err("clock setaffinity failed: %s\n", strerror(__err)); + goto err; + } + + fio_sem_down(&t->lock); + + first = get_cpu_clock(); + c = &t->entries[0]; + for (i = 0; i < t->nr_entries; i++, c++) { + uint32_t seq; + uint64_t tsc; + + c->cpu = t->cpu; + do { + seq = *t->seq; + if (seq == UINT_MAX) + break; + __sync_synchronize(); + tsc = get_cpu_clock(); + } while (seq != atomic32_compare_and_swap(t->seq, seq, seq + 1)); + + if (seq == UINT_MAX) + break; + + c->seq = seq; + c->tsc = tsc; + } + + if (t->debug) { + unsigned long long clocks; + + clocks = t->entries[i - 1].tsc - t->entries[0].tsc; + log_info("cs: cpu%3d: %llu clocks seen, first %llu\n", t->cpu, + clocks, first); + } + + /* + * The most common platform clock breakage is returning zero + * indefinitely. Check for that and return failure. + */ + if (i > 1 && !t->entries[i - 1].tsc && !t->entries[0].tsc) + goto err; + + fio_cpuset_exit(&cpu_mask); + return NULL; +err: + fio_cpuset_exit(&cpu_mask); +err_out: + return (void *) 1; +} + +static int clock_cmp(const void *p1, const void *p2) +{ + const struct clock_entry *c1 = p1; + const struct clock_entry *c2 = p2; + + if (c1->seq == c2->seq) + log_err("cs: bug in atomic sequence!\n"); + + return c1->seq - c2->seq; +} + +int fio_monotonic_clocktest(int debug) +{ + struct clock_thread *cthreads; + unsigned int nr_cpus = cpus_online(); + struct clock_entry *entries; + unsigned long nr_entries, tentries, failed = 0; + struct clock_entry *prev, *this; + uint32_t seq = 0; + unsigned int i; + + if (debug) { + log_info("cs: reliable_tsc: %s\n", tsc_reliable ? "yes" : "no"); + +#ifdef FIO_INC_DEBUG + fio_debug |= 1U << FD_TIME; +#endif + nr_entries = CLOCK_ENTRIES_DEBUG; + } else + nr_entries = CLOCK_ENTRIES_TEST; + + calibrate_cpu_clock(); + + if (debug) { +#ifdef FIO_INC_DEBUG + fio_debug &= ~(1U << FD_TIME); +#endif + } + + cthreads = malloc(nr_cpus * sizeof(struct clock_thread)); + tentries = nr_entries * nr_cpus; + entries = malloc(tentries * sizeof(struct clock_entry)); + + if (debug) + log_info("cs: Testing %u CPUs\n", nr_cpus); + + for (i = 0; i < nr_cpus; i++) { + struct clock_thread *t = &cthreads[i]; + + t->cpu = i; + t->debug = debug; + t->seq = &seq; + t->nr_entries = nr_entries; + t->entries = &entries[i * nr_entries]; + __fio_sem_init(&t->lock, FIO_SEM_LOCKED); + if (pthread_create(&t->thread, NULL, clock_thread_fn, t)) { + failed++; + nr_cpus = i; + break; + } + } + + for (i = 0; i < nr_cpus; i++) { + struct clock_thread *t = &cthreads[i]; + + fio_sem_up(&t->lock); + } + + for (i = 0; i < nr_cpus; i++) { + struct clock_thread *t = &cthreads[i]; + void *ret; + + pthread_join(t->thread, &ret); + if (ret) + failed++; + __fio_sem_remove(&t->lock); + } + free(cthreads); + + if (failed) { + if (debug) + log_err("Clocksource test: %lu threads failed\n", failed); + goto err; + } + + qsort(entries, tentries, sizeof(struct clock_entry), clock_cmp); + + /* silence silly gcc */ + prev = NULL; + for (failed = i = 0; i < tentries; i++) { + this = &entries[i]; + + if (!i) { + prev = this; + continue; + } + + if (prev->tsc > this->tsc) { + uint64_t diff = prev->tsc - this->tsc; + + if (!debug) { + failed++; + break; + } + + log_info("cs: CPU clock mismatch (diff=%llu):\n", + (unsigned long long) diff); + log_info("\t CPU%3u: TSC=%llu, SEQ=%u\n", prev->cpu, (unsigned long long) prev->tsc, prev->seq); + log_info("\t CPU%3u: TSC=%llu, SEQ=%u\n", this->cpu, (unsigned long long) this->tsc, this->seq); + 
failed++; + } + + prev = this; + } + + if (debug) { + if (failed) + log_info("cs: Failed: %lu\n", failed); + else + log_info("cs: Pass!\n"); + } +err: + free(entries); + return !!failed; +} + +#else /* defined(FIO_HAVE_CPU_AFFINITY) && defined(ARCH_HAVE_CPU_CLOCK) */ + +int fio_monotonic_clocktest(int debug) +{ + if (debug) + log_info("cs: current platform does not support CPU clocks\n"); + return 1; +} + +#endif diff --git a/gettime.h b/gettime.h new file mode 100644 index 0000000..f92ee8c --- /dev/null +++ b/gettime.h @@ -0,0 +1,45 @@ +#ifndef FIO_GETTIME_H +#define FIO_GETTIME_H + +#include + +#include "arch/arch.h" + +/* + * Clock sources + */ +enum fio_cs { + CS_GTOD = 1, + CS_CGETTIME, + CS_CPUCLOCK, + CS_INVAL, +}; + +extern void fio_gettime(struct timespec *, void *); +extern void fio_gtod_init(void); +extern void fio_clock_init(void); +extern int fio_start_gtod_thread(void); +extern int fio_monotonic_clocktest(int debug); +extern void fio_local_clock_init(void); + +extern struct timespec *fio_ts; + +static inline int fio_gettime_offload(struct timespec *ts) +{ + time_t last_sec; + + if (!fio_ts) + return 0; + + do { + read_barrier(); + last_sec = ts->tv_sec = fio_ts->tv_sec; + ts->tv_nsec = fio_ts->tv_nsec; + } while (fio_ts->tv_sec != last_sec); + + return 1; +} + +extern void fio_gtod_set_cpu(unsigned int cpu); + +#endif diff --git a/gfio.c b/gfio.c new file mode 100644 index 0000000..2805396 --- /dev/null +++ b/gfio.c @@ -0,0 +1,1756 @@ +/* + * gfio - gui front end for fio - the flexible io tester + * + * Copyright (C) 2012 Stephen M. Cameron + * Copyright (C) 2012 Jens Axboe + * + * The license below covers all files distributed with fio unless otherwise + * noted in the file itself. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. 
+ * + */ +#include +#include +#include +#include + +#include +#include +#include + +#include "fio.h" +#include "gfio.h" +#include "ghelpers.h" +#include "goptions.h" +#include "gerror.h" +#include "gclient.h" +#include "graph.h" + +static bool gfio_server_running; +static unsigned int gfio_graph_limit = 100; + +GdkColor gfio_color_white; +GdkColor gfio_color_lightyellow; +const char *gfio_graph_font = GRAPH_DEFAULT_FONT; + +typedef void (*clickfunction)(GtkWidget *widget, gpointer data); + +static void connect_clicked(GtkWidget *widget, gpointer data); +static void start_job_clicked(GtkWidget *widget, gpointer data); +static void send_clicked(GtkWidget *widget, gpointer data); + +static struct button_spec { + const char *buttontext; + clickfunction f; + const char *tooltiptext[2]; + const int start_sensitive; +} buttonspeclist[] = { + { + .buttontext = "Connect", + .f = connect_clicked, + .tooltiptext = { "Disconnect from host", "Connect to host" }, + .start_sensitive = 1, + }, + { + .buttontext = "Send", + .f = send_clicked, + .tooltiptext = { "Send job description to host", NULL }, + .start_sensitive = 0, + }, + { + .buttontext = "Start Job", + .f = start_job_clicked, + .tooltiptext = { "Start the current job on the server", NULL }, + .start_sensitive = 0, + }, +}; + +static void setup_iops_graph(struct gfio_graphs *gg) +{ + struct graph *g; + + g = graph_new(DRAWING_AREA_XDIM / 2.0, DRAWING_AREA_YDIM, gfio_graph_font); + graph_title(g, "IOPS (IOs/sec)"); + graph_x_title(g, "Time (secs)"); + gg->read_iops = graph_add_label(g, "Read IOPS"); + gg->write_iops = graph_add_label(g, "Write IOPS"); + gg->trim_iops = graph_add_label(g, "Trim IOPS"); + graph_set_color(g, gg->read_iops, GFIO_READ_R, GFIO_READ_G, GFIO_READ_B); + graph_set_color(g, gg->write_iops, GFIO_WRITE_R, GFIO_WRITE_G, GFIO_WRITE_B); + graph_set_color(g, gg->trim_iops, GFIO_TRIM_R, GFIO_TRIM_G, GFIO_TRIM_B); + line_graph_set_data_count_limit(g, gfio_graph_limit); + graph_add_extra_space(g, 0.0, 0.0, 0.0, 0.0); + graph_set_graph_all_zeroes(g, 0); + gg->iops_graph = g; +} + +static void setup_bandwidth_graph(struct gfio_graphs *gg) +{ + struct graph *g; + + g = graph_new(DRAWING_AREA_XDIM / 2.0, DRAWING_AREA_YDIM, gfio_graph_font); + graph_title(g, "Bandwidth (bytes/sec)"); + graph_x_title(g, "Time (secs)"); + gg->read_bw = graph_add_label(g, "Read Bandwidth"); + gg->write_bw = graph_add_label(g, "Write Bandwidth"); + gg->trim_bw = graph_add_label(g, "Trim Bandwidth"); + graph_set_color(g, gg->read_bw, GFIO_READ_R, GFIO_READ_G, GFIO_READ_B); + graph_set_color(g, gg->write_bw, GFIO_WRITE_R, GFIO_WRITE_G, GFIO_WRITE_B); + graph_set_color(g, gg->trim_bw, GFIO_TRIM_R, GFIO_TRIM_G, GFIO_TRIM_B); + graph_set_base_offset(g, 1); + line_graph_set_data_count_limit(g, 100); + graph_add_extra_space(g, 0.0, 0.0, 0.0, 0.0); + graph_set_graph_all_zeroes(g, 0); + gg->bandwidth_graph = g; +} + +static void setup_graphs(struct gfio_graphs *g) +{ + setup_iops_graph(g); + setup_bandwidth_graph(g); +} + +void clear_ge_ui_info(struct gui_entry *ge) +{ + gtk_label_set_text(GTK_LABEL(ge->probe.hostname), ""); + gtk_label_set_text(GTK_LABEL(ge->probe.os), ""); + gtk_label_set_text(GTK_LABEL(ge->probe.arch), ""); + gtk_label_set_text(GTK_LABEL(ge->probe.fio_ver), ""); +#if 0 + /* should we empty it... 
*/ + gtk_entry_set_text(GTK_ENTRY(ge->eta.name), ""); +#endif + multitext_update_entry(&ge->eta.iotype, 0, ""); + multitext_update_entry(&ge->eta.bs, 0, ""); + multitext_update_entry(&ge->eta.ioengine, 0, ""); + multitext_update_entry(&ge->eta.iodepth, 0, ""); + gtk_entry_set_text(GTK_ENTRY(ge->eta.jobs), ""); + gtk_entry_set_text(GTK_ENTRY(ge->eta.files), ""); + gtk_entry_set_text(GTK_ENTRY(ge->eta.read_bw), ""); + gtk_entry_set_text(GTK_ENTRY(ge->eta.read_iops), ""); + gtk_entry_set_text(GTK_ENTRY(ge->eta.write_bw), ""); + gtk_entry_set_text(GTK_ENTRY(ge->eta.write_iops), ""); +} + +static void set_menu_entry_text(struct gui *ui, const char *path, + const char *text) +{ + GtkWidget *w; + + w = gtk_ui_manager_get_widget(ui->uimanager, path); + if (w) + gtk_menu_item_set_label(GTK_MENU_ITEM(w), text); + else + fprintf(stderr, "gfio: can't find path %s\n", path); +} + + +static void set_menu_entry_visible(struct gui *ui, const char *path, int show) +{ + GtkWidget *w; + + w = gtk_ui_manager_get_widget(ui->uimanager, path); + if (w) + gtk_widget_set_sensitive(w, show); + else + fprintf(stderr, "gfio: can't find path %s\n", path); +} + +static void set_job_menu_visible(struct gui *ui, int visible) +{ + set_menu_entry_visible(ui, "/MainMenu/JobMenu", visible); +} + +static void set_view_results_visible(struct gui *ui, int visible) +{ + set_menu_entry_visible(ui, "/MainMenu/ViewMenu/Results", visible); +} + +static const char *get_button_tooltip(struct button_spec *s, int sensitive) +{ + if (s->tooltiptext[sensitive]) + return s->tooltiptext[sensitive]; + + return s->tooltiptext[0]; +} + +static GtkWidget *add_button(GtkWidget *buttonbox, + struct button_spec *buttonspec, gpointer data) +{ + GtkWidget *button = gtk_button_new_with_label(buttonspec->buttontext); + gboolean sens = buttonspec->start_sensitive; + + g_signal_connect(button, "clicked", G_CALLBACK(buttonspec->f), data); + gtk_box_pack_start(GTK_BOX(buttonbox), button, FALSE, FALSE, 3); + + sens = buttonspec->start_sensitive; + gtk_widget_set_tooltip_text(button, get_button_tooltip(buttonspec, sens)); + gtk_widget_set_sensitive(button, sens); + + return button; +} + +static void add_buttons(struct gui_entry *ge, struct button_spec *buttonlist, + int nbuttons) +{ + int i; + + for (i = 0; i < nbuttons; i++) + ge->button[i] = add_button(ge->buttonbox, &buttonlist[i], ge); +} + +/* + * Update sensitivity of job buttons and job menu items, based on the + * state of the client. 
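+ * Each GE_STATE_* value maps to one fixed set of sensitivities below.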
+ */ +static void update_button_states(struct gui *ui, struct gui_entry *ge) +{ + unsigned int connect_state, send_state, start_state, edit_state; + const char *connect_str = NULL; + + switch (ge->state) { + default: + gfio_report_error(ge, "Bad client state: %u\n", ge->state); + /* fall through to new state */ + case GE_STATE_NEW: + connect_state = 1; + edit_state = 1; + connect_str = "Connect"; + send_state = 0; + start_state = 0; + break; + case GE_STATE_CONNECTED: + connect_state = 1; + edit_state = 1; + connect_str = "Disconnect"; + send_state = 1; + start_state = 0; + break; + case GE_STATE_JOB_SENT: + connect_state = 1; + edit_state = 1; + connect_str = "Disconnect"; + send_state = 0; + start_state = 1; + break; + case GE_STATE_JOB_STARTED: + connect_state = 1; + edit_state = 1; + connect_str = "Disconnect"; + send_state = 0; + start_state = 1; + break; + case GE_STATE_JOB_RUNNING: + connect_state = 1; + edit_state = 0; + connect_str = "Disconnect"; + send_state = 0; + start_state = 0; + break; + case GE_STATE_JOB_DONE: + connect_state = 1; + edit_state = 0; + connect_str = "Connect"; + send_state = 0; + start_state = 0; + break; + } + + gtk_widget_set_sensitive(ge->button[GFIO_BUTTON_CONNECT], connect_state); + gtk_widget_set_sensitive(ge->button[GFIO_BUTTON_SEND], send_state); + gtk_widget_set_sensitive(ge->button[GFIO_BUTTON_START], start_state); + gtk_button_set_label(GTK_BUTTON(ge->button[GFIO_BUTTON_CONNECT]), connect_str); + gtk_widget_set_tooltip_text(ge->button[GFIO_BUTTON_CONNECT], get_button_tooltip(&buttonspeclist[GFIO_BUTTON_CONNECT], connect_state)); + + set_menu_entry_visible(ui, "/MainMenu/JobMenu/Connect", connect_state); + set_menu_entry_text(ui, "/MainMenu/JobMenu/Connect", connect_str); + + set_menu_entry_visible(ui, "/MainMenu/JobMenu/Edit job", edit_state); + set_menu_entry_visible(ui, "/MainMenu/JobMenu/Send job", send_state); + set_menu_entry_visible(ui, "/MainMenu/JobMenu/Start job", start_state); + + if (ge->client && ge->client->nr_results) + set_view_results_visible(ui, 1); + else + set_view_results_visible(ui, 0); +} + +void gfio_set_state(struct gui_entry *ge, unsigned int state) +{ + ge->state = state; + update_button_states(ge->ui, ge); +} + +static void gfio_ui_setup_log(struct gui *ui) +{ + GtkTreeSelection *selection; + GtkListStore *model; + GtkWidget *tree_view; + + model = gtk_list_store_new(4, G_TYPE_STRING, G_TYPE_STRING, G_TYPE_STRING, G_TYPE_STRING); + + tree_view = gtk_tree_view_new_with_model(GTK_TREE_MODEL(model)); + gtk_widget_set_can_focus(tree_view, FALSE); + + selection = gtk_tree_view_get_selection(GTK_TREE_VIEW(tree_view)); + gtk_tree_selection_set_mode(GTK_TREE_SELECTION(selection), GTK_SELECTION_BROWSE); + g_object_set(G_OBJECT(tree_view), "headers-visible", TRUE, + "enable-grid-lines", GTK_TREE_VIEW_GRID_LINES_BOTH, NULL); + + tree_view_column(tree_view, 0, "Time", ALIGN_RIGHT | UNSORTABLE); + tree_view_column(tree_view, 1, "Host", ALIGN_RIGHT | UNSORTABLE); + tree_view_column(tree_view, 2, "Level", ALIGN_RIGHT | UNSORTABLE); + tree_view_column(tree_view, 3, "Text", ALIGN_LEFT | UNSORTABLE); + + ui->log_model = model; + ui->log_tree = tree_view; +} + +static gint on_config_drawing_area(GtkWidget *w, GdkEventConfigure *event, + gpointer data) +{ + guint width = gtk_widget_get_allocated_width(w); + guint height = gtk_widget_get_allocated_height(w); + struct gfio_graphs *g = data; + + graph_set_size(g->iops_graph, width / 2.0, height); + graph_set_position(g->iops_graph, width / 2.0, 0.0); + graph_set_size(g->bandwidth_graph, width 
/ 2.0, height); + graph_set_position(g->bandwidth_graph, 0, 0); + return TRUE; +} + +static void draw_graph(struct graph *g, cairo_t *cr) +{ + line_graph_draw(g, cr); + cairo_stroke(cr); +} + +static gboolean graph_tooltip(GtkWidget *w, gint x, gint y, + gboolean keyboard_mode, GtkTooltip *tooltip, + gpointer data) +{ + struct gfio_graphs *g = data; + const char *text = NULL; + + if (graph_contains_xy(g->iops_graph, x, y)) + text = graph_find_tooltip(g->iops_graph, x, y); + else if (graph_contains_xy(g->bandwidth_graph, x, y)) + text = graph_find_tooltip(g->bandwidth_graph, x, y); + + if (text) { + gtk_tooltip_set_text(tooltip, text); + return TRUE; + } + + return FALSE; +} + +static int on_expose_drawing_area(GtkWidget *w, GdkEvent *event, gpointer p) +{ + struct gfio_graphs *g = p; + cairo_t *cr; + + cr = gdk_cairo_create(gtk_widget_get_window(w)); + + if (graph_has_tooltips(g->iops_graph) || + graph_has_tooltips(g->bandwidth_graph)) { + g_object_set(w, "has-tooltip", TRUE, NULL); + g_signal_connect(w, "query-tooltip", G_CALLBACK(graph_tooltip), g); + } + + cairo_set_source_rgb(cr, 0, 0, 0); + draw_graph(g->iops_graph, cr); + draw_graph(g->bandwidth_graph, cr); + cairo_destroy(cr); + + return FALSE; +} + +/* + * FIXME: need more handling here + */ +static void ge_destroy(struct gui_entry *ge) +{ + struct gfio_client *gc = ge->client; + + if (gc) { + if (gc->client) { + if (ge->state >= GE_STATE_CONNECTED) + fio_client_terminate(gc->client); + + fio_put_client(gc->client); + } + free(gc); + } + + g_hash_table_remove(ge->ui->ge_hash, &ge->page_num); + + free(ge->job_file); + free(ge->host); + free(ge); +} + +static void ge_widget_destroy(GtkWidget *w, gpointer data) +{ + struct gui_entry *ge = (struct gui_entry *) data; + + ge_destroy(ge); +} + +static void gfio_quit(struct gui *ui) +{ + gtk_main_quit(); +} + +static void quit_clicked(__attribute__((unused)) GtkWidget *widget, + gpointer data) +{ + struct gui *ui = (struct gui *) data; + + gfio_quit(ui); +} + +static void *job_thread(void *arg) +{ + struct gui *ui = arg; + + ui->handler_running = 1; + fio_handle_clients(&gfio_client_ops); + ui->handler_running = 0; + return NULL; +} + +static int send_job_file(struct gui_entry *ge) +{ + struct gfio_client *gc = ge->client; + int ret = 0; + + /* + * Prune old options, we are expecting the return options + * when the job file is parsed remotely and returned to us. 
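+	 * The stale gfio_client_options entries are freed below.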
+ */ + while (!flist_empty(&gc->o_list)) { + struct gfio_client_options *gco; + + gco = flist_first_entry(&gc->o_list, struct gfio_client_options, list); + flist_del(&gco->list); + free(gco); + } + + ret = fio_client_send_ini(gc->client, ge->job_file, false); + if (!ret) + return 0; + + gfio_report_error(ge, "Failed to send file %s: %s\n", ge->job_file, strerror(-ret)); + return 1; +} + +static void *server_thread(void *arg) +{ + fio_server_create_sk_key(); + is_backend = true; + gfio_server_running = true; + fio_start_server(NULL); + gfio_server_running = false; + fio_server_destroy_sk_key(); + return NULL; +} + +static void gfio_start_server(struct gui *ui) +{ + if (!gfio_server_running) { + gfio_server_running = true; + pthread_create(&ui->server_t, NULL, server_thread, NULL); + pthread_detach(ui->server_t); + } +} + +static void start_job_clicked(__attribute__((unused)) GtkWidget *widget, + gpointer data) +{ + struct gui_entry *ge = data; + struct gfio_client *gc = ge->client; + + if (gc) + fio_start_client(gc->client); +} + +static void file_open(GtkWidget *w, gpointer data); + +struct connection_widgets +{ + GtkWidget *hentry; + GtkWidget *combo; + GtkWidget *button; +}; + +static void hostname_cb(GtkEntry *entry, gpointer data) +{ + struct connection_widgets *cw = data; + int uses_net = 0, is_localhost = 0; + const gchar *text; + gchar *ctext; + + /* + * Check whether to display the 'auto start backend' box + * or not. Show it if we are a localhost and using network, + * or using a socket. + */ + ctext = gtk_combo_box_text_get_active_text(GTK_COMBO_BOX_TEXT(cw->combo)); + if (!ctext || !strncmp(ctext, "IPv4", 4) || !strncmp(ctext, "IPv6", 4)) + uses_net = 1; + g_free(ctext); + + if (uses_net) { + text = gtk_entry_get_text(GTK_ENTRY(cw->hentry)); + if (!strcmp(text, "127.0.0.1") || !strcmp(text, "localhost") || + !strcmp(text, "::1") || !strcmp(text, "ip6-localhost") || + !strcmp(text, "ip6-loopback")) + is_localhost = 1; + } + + if (!uses_net || is_localhost) { + gtk_toggle_button_set_active(GTK_TOGGLE_BUTTON(cw->button), 1); + gtk_widget_set_sensitive(cw->button, 1); + } else { + gtk_toggle_button_set_active(GTK_TOGGLE_BUTTON(cw->button), 0); + gtk_widget_set_sensitive(cw->button, 0); + } +} + +static int get_connection_details(struct gui_entry *ge) +{ + GtkWidget *dialog, *box, *vbox, *hbox, *frame, *pentry; + struct connection_widgets cw; + struct gui *ui = ge->ui; + char *typeentry; + + if (ge->host) + return 0; + + dialog = gtk_dialog_new_with_buttons("Connection details", + GTK_WINDOW(ui->window), + GTK_DIALOG_DESTROY_WITH_PARENT, + GTK_STOCK_OK, GTK_RESPONSE_ACCEPT, + GTK_STOCK_CANCEL, GTK_RESPONSE_REJECT, NULL); + + frame = gtk_frame_new("Hostname / socket name"); + vbox = gtk_dialog_get_content_area(GTK_DIALOG(dialog)); + gtk_box_pack_start(GTK_BOX(vbox), frame, FALSE, FALSE, 5); + + box = gtk_vbox_new(FALSE, 6); + gtk_container_add(GTK_CONTAINER(frame), box); + + hbox = gtk_hbox_new(TRUE, 10); + gtk_box_pack_start(GTK_BOX(box), hbox, FALSE, FALSE, 0); + cw.hentry = gtk_entry_new(); + gtk_entry_set_text(GTK_ENTRY(cw.hentry), "localhost"); + gtk_box_pack_start(GTK_BOX(hbox), cw.hentry, TRUE, TRUE, 0); + + frame = gtk_frame_new("Port"); + gtk_box_pack_start(GTK_BOX(vbox), frame, FALSE, FALSE, 5); + box = gtk_vbox_new(FALSE, 10); + gtk_container_add(GTK_CONTAINER(frame), box); + + hbox = gtk_hbox_new(TRUE, 4); + gtk_box_pack_start(GTK_BOX(box), hbox, FALSE, FALSE, 0); + pentry = create_spinbutton(hbox, 1, 65535, FIO_NET_PORT); + + frame = gtk_frame_new("Type"); + 
gtk_box_pack_start(GTK_BOX(vbox), frame, FALSE, FALSE, 5); + box = gtk_vbox_new(FALSE, 10); + gtk_container_add(GTK_CONTAINER(frame), box); + + hbox = gtk_hbox_new(TRUE, 4); + gtk_box_pack_start(GTK_BOX(box), hbox, FALSE, FALSE, 0); + + cw.combo = gtk_combo_box_text_new(); + gtk_combo_box_text_append_text(GTK_COMBO_BOX_TEXT(cw.combo), "IPv4"); + gtk_combo_box_text_append_text(GTK_COMBO_BOX_TEXT(cw.combo), "IPv6"); + gtk_combo_box_text_append_text(GTK_COMBO_BOX_TEXT(cw.combo), "local socket"); + gtk_combo_box_set_active(GTK_COMBO_BOX(cw.combo), 0); + + gtk_container_add(GTK_CONTAINER(hbox), cw.combo); + + frame = gtk_frame_new("Options"); + gtk_box_pack_start(GTK_BOX(vbox), frame, FALSE, FALSE, 5); + box = gtk_vbox_new(FALSE, 10); + gtk_container_add(GTK_CONTAINER(frame), box); + + hbox = gtk_hbox_new(TRUE, 4); + gtk_box_pack_start(GTK_BOX(box), hbox, FALSE, FALSE, 0); + + cw.button = gtk_check_button_new_with_label("Auto-spawn fio backend"); + gtk_toggle_button_set_active(GTK_TOGGLE_BUTTON(cw.button), 1); + gtk_widget_set_tooltip_text(cw.button, "When running fio locally, it is necessary to have the backend running on the same system. If this is checked, gfio will start the backend automatically for you if it isn't already running."); + gtk_box_pack_start(GTK_BOX(hbox), cw.button, FALSE, FALSE, 6); + + /* + * Connect edit signal, so we can show/not-show the auto start button + */ + g_signal_connect(G_OBJECT(cw.hentry), "changed", G_CALLBACK(hostname_cb), &cw); + g_signal_connect(G_OBJECT(cw.combo), "changed", G_CALLBACK(hostname_cb), &cw); + + gtk_widget_show_all(dialog); + + if (gtk_dialog_run(GTK_DIALOG(dialog)) != GTK_RESPONSE_ACCEPT) { + gtk_widget_destroy(dialog); + return 1; + } + + ge->host = strdup(gtk_entry_get_text(GTK_ENTRY(cw.hentry))); + ge->port = gtk_spin_button_get_value_as_int(GTK_SPIN_BUTTON(pentry)); + + typeentry = gtk_combo_box_text_get_active_text(GTK_COMBO_BOX_TEXT(cw.combo)); + if (!typeentry || !strncmp(typeentry, "IPv4", 4)) + ge->type = Fio_client_ipv4; + else if (!strncmp(typeentry, "IPv6", 4)) + ge->type = Fio_client_ipv6; + else + ge->type = Fio_client_socket; + g_free(typeentry); + + ge->server_start = gtk_toggle_button_get_active(GTK_TOGGLE_BUTTON(cw.button)); + + gtk_widget_destroy(dialog); + return 0; +} + +static void gfio_set_client(struct gfio_client *gc, struct fio_client *client) +{ + gc->client = fio_get_client(client); + client->client_data = gc; +} + +static void gfio_client_added(struct gui_entry *ge, struct fio_client *client) +{ + struct gfio_client_options *gco; + struct gfio_client *gc; + + gc = calloc(1, sizeof(*gc)); + INIT_FLIST_HEAD(&gc->o_list); + gc->ge = ge; + ge->client = gc; + gfio_set_client(gc, client); + + /* + * Just add a default set of options, need to consider how best + * to handle this + */ + gco = calloc(1, sizeof(*gco)); + INIT_FLIST_HEAD(&gco->list); + options_default_fill(&gco->o); + flist_add_tail(&gco->list, &gc->o_list); + gc->o_list_nr++; +} + +static void gfio_clear_graph_data(struct gfio_graphs *g) +{ + graph_clear_values(g->iops_graph); + graph_clear_values(g->bandwidth_graph); +} + +static void connect_clicked(GtkWidget *widget, gpointer data) +{ + struct gui_entry *ge = data; + struct gfio_client *gc = ge->client; + + if (ge->state == GE_STATE_NEW) { + int ret; + + if (!ge->job_file) + file_open(widget, ge->ui); + if (!ge->job_file) + return; + + gc = ge->client; + + if (!gc->client) { + struct fio_client *client; + + if (get_connection_details(ge)) { + gfio_report_error(ge, "Failed to get connection details\n"); 
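+				/* still GE_STATE_NEW, so the user can simply retry */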
+ return; + } + + client = fio_client_add_explicit(&gfio_client_ops, ge->host, ge->type, ge->port); + if (!client) { + gfio_report_error(ge, "Failed to add client %s\n", ge->host); + free(ge->host); + ge->host = NULL; + return; + } + gfio_set_client(gc, client); + } + + gtk_progress_bar_set_text(GTK_PROGRESS_BAR(ge->thread_status_pb), "No jobs running"); + gtk_progress_bar_set_fraction(GTK_PROGRESS_BAR(ge->thread_status_pb), 0.0); + ret = fio_client_connect(gc->client); + if (!ret) { + if (!ge->ui->handler_running) + pthread_create(&ge->ui->t, NULL, job_thread, ge->ui); + gfio_set_state(ge, GE_STATE_CONNECTED); + gfio_clear_graph_data(&ge->graphs); + } else { + gfio_report_error(ge, "Failed to connect to %s: %s\n", ge->client->client->hostname, strerror(-ret)); + } + } else { + fio_client_terminate(gc->client); + gfio_set_state(ge, GE_STATE_NEW); + clear_ge_ui_info(ge); + } +} + +static void send_clicked(GtkWidget *widget, gpointer data) +{ + struct gui_entry *ge = data; + + if (send_job_file(ge)) + gtk_widget_set_sensitive(ge->button[GFIO_BUTTON_START], 1); +} + +static GtkWidget *new_client_page(struct gui_entry *ge); + +static struct gui_entry *alloc_new_gui_entry(struct gui *ui) +{ + struct gui_entry *ge; + + ge = malloc(sizeof(*ge)); + memset(ge, 0, sizeof(*ge)); + ge->state = GE_STATE_NEW; + ge->ui = ui; + return ge; +} + +static struct gui_entry *get_new_ge_with_tab(struct gui *ui, const char *name) +{ + struct gui_entry *ge; + + ge = alloc_new_gui_entry(ui); + + ge->vbox = new_client_page(ge); + g_signal_connect(ge->vbox, "destroy", G_CALLBACK(ge_widget_destroy), ge); + + ge->page_label = gtk_label_new(name); + ge->page_num = gtk_notebook_append_page(GTK_NOTEBOOK(ui->notebook), ge->vbox, ge->page_label); + + g_hash_table_insert(ui->ge_hash, &ge->page_num, ge); + + gtk_widget_show_all(ui->window); + return ge; +} + +static void file_new(GtkWidget *w, gpointer data) +{ + struct gui *ui = (struct gui *) data; + struct gui_entry *ge; + + ge = get_new_ge_with_tab(ui, "Untitled"); + gtk_notebook_set_current_page(GTK_NOTEBOOK(ui->notebook), ge->page_num); +} + +/* + * Return the 'ge' corresponding to the tab. If the active tab is the + * main tab, open a new tab. + */ +static struct gui_entry *get_ge_from_page(struct gui *ui, gint cur_page, + int *created) +{ + if (!cur_page) { + if (created) + *created = 1; + return get_new_ge_with_tab(ui, "Untitled"); + } + + if (created) + *created = 0; + + return g_hash_table_lookup(ui->ge_hash, &cur_page); +} + +static struct gui_entry *get_ge_from_cur_tab(struct gui *ui) +{ + gint cur_page; + + /* + * Main tab is tab 0, so any current page other than 0 holds + * a ge entry. 
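+	 * ui->ge_hash maps the page number back to its gui_entry.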
+ */ + cur_page = gtk_notebook_get_current_page(GTK_NOTEBOOK(ui->notebook)); + if (cur_page) + return get_ge_from_page(ui, cur_page, NULL); + + return NULL; +} + +static void file_close(GtkWidget *w, gpointer data) +{ + struct gui *ui = (struct gui *) data; + struct gui_entry *ge; + + /* + * Can't close the main tab + */ + ge = get_ge_from_cur_tab(ui); + if (ge) { + gtk_widget_destroy(ge->vbox); + return; + } + + if (g_hash_table_size(ui->ge_hash)) { + gfio_report_info(ui, "Error", "The main page view cannot be closed\n"); + return; + } + + gfio_quit(ui); +} + +static void file_add_recent(struct gui *ui, const gchar *uri) +{ + GtkRecentData grd; + + memset(&grd, 0, sizeof(grd)); + grd.display_name = strdup("gfio"); + grd.description = strdup("Fio job file"); + grd.mime_type = strdup(GFIO_MIME); + grd.app_name = strdup(g_get_application_name()); + grd.app_exec = strdup("gfio %f/%u"); + + gtk_recent_manager_add_full(ui->recentmanager, uri, &grd); +} + +static gchar *get_filename_from_uri(const gchar *uri) +{ + if (strncmp(uri, "file://", 7)) + return strdup(uri); + + return strdup(uri + 7); +} + +static int do_file_open(struct gui_entry *ge, const gchar *uri) +{ + struct fio_client *client; + + assert(!ge->job_file); + + ge->job_file = get_filename_from_uri(uri); + + client = fio_client_add_explicit(&gfio_client_ops, ge->host, ge->type, ge->port); + if (client) { + char *label = strdup(uri); + + basename(label); + gtk_label_set_text(GTK_LABEL(ge->page_label), basename(label)); + free(label); + + gfio_client_added(ge, client); + file_add_recent(ge->ui, uri); + return 0; + } + + gfio_report_error(ge, "Failed to add client %s\n", ge->host); + free(ge->host); + ge->host = NULL; + free(ge->job_file); + ge->job_file = NULL; + return 1; +} + +static int do_file_open_with_tab(struct gui *ui, const gchar *uri) +{ + struct gui_entry *ge; + gint cur_page; + int ret, ge_is_new = 0; + + /* + * Creates new tab if current tab is the main window, or the + * current tab already has a client. 
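+	 * If anything below fails, a tab created here is destroyed
+	 * again, so a failed open does not leave an empty "Untitled"
+	 * page behind.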
+ */ + cur_page = gtk_notebook_get_current_page(GTK_NOTEBOOK(ui->notebook)); + ge = get_ge_from_page(ui, cur_page, &ge_is_new); + if (ge->client) { + ge = get_new_ge_with_tab(ui, "Untitled"); + ge_is_new = 1; + } + + gtk_notebook_set_current_page(GTK_NOTEBOOK(ui->notebook), ge->page_num); + + if (get_connection_details(ge)) { + if (ge_is_new) + gtk_widget_destroy(ge->vbox); + + return 1; + } + + ret = do_file_open(ge, uri); + + if (!ret) { + if (ge->server_start) + gfio_start_server(ui); + } else { + if (ge_is_new) + gtk_widget_destroy(ge->vbox); + } + + return ret; +} + +static void recent_open(GtkAction *action, gpointer data) +{ + struct gui *ui = (struct gui *) data; + GtkRecentInfo *info; + const gchar *uri; + + info = g_object_get_data(G_OBJECT(action), "gtk-recent-info"); + uri = gtk_recent_info_get_uri(info); + + do_file_open_with_tab(ui, uri); +} + +static void file_open(GtkWidget *w, gpointer data) +{ + struct gui *ui = data; + GtkWidget *dialog; + GtkFileFilter *filter; + gchar *filename; + + dialog = gtk_file_chooser_dialog_new("Open File", + GTK_WINDOW(ui->window), + GTK_FILE_CHOOSER_ACTION_OPEN, + GTK_STOCK_CANCEL, GTK_RESPONSE_CANCEL, + GTK_STOCK_OPEN, GTK_RESPONSE_ACCEPT, + NULL); + gtk_file_chooser_set_select_multiple(GTK_FILE_CHOOSER(dialog), FALSE); + + filter = gtk_file_filter_new(); + gtk_file_filter_add_pattern(filter, "*.fio"); + gtk_file_filter_add_pattern(filter, "*.job"); + gtk_file_filter_add_pattern(filter, "*.ini"); + gtk_file_filter_add_mime_type(filter, GFIO_MIME); + gtk_file_filter_set_name(filter, "Fio job file"); + gtk_file_chooser_set_filter(GTK_FILE_CHOOSER(dialog), filter); + + if (gtk_dialog_run(GTK_DIALOG(dialog)) != GTK_RESPONSE_ACCEPT) { + gtk_widget_destroy(dialog); + return; + } + + filename = gtk_file_chooser_get_filename(GTK_FILE_CHOOSER(dialog)); + + gtk_widget_destroy(dialog); + + do_file_open_with_tab(ui, filename); + g_free(filename); +} + +static void file_save(GtkWidget *w, gpointer data) +{ + struct gui *ui = data; + GtkWidget *dialog; + + dialog = gtk_file_chooser_dialog_new("Save File", + GTK_WINDOW(ui->window), + GTK_FILE_CHOOSER_ACTION_SAVE, + GTK_STOCK_CANCEL, GTK_RESPONSE_CANCEL, + GTK_STOCK_SAVE, GTK_RESPONSE_ACCEPT, + NULL); + + gtk_file_chooser_set_do_overwrite_confirmation(GTK_FILE_CHOOSER(dialog), TRUE); + gtk_file_chooser_set_current_name(GTK_FILE_CHOOSER(dialog), "Untitled document"); + + if (gtk_dialog_run(GTK_DIALOG(dialog)) == GTK_RESPONSE_ACCEPT) { + char *filename; + + filename = gtk_file_chooser_get_filename(GTK_FILE_CHOOSER(dialog)); + // save_job_file(filename); + g_free(filename); + } + gtk_widget_destroy(dialog); +} + +static void view_log_destroy(GtkWidget *w, gpointer data) +{ + struct gui *ui = (struct gui *) data; + + g_object_ref(G_OBJECT(ui->log_tree)); + gtk_container_remove(GTK_CONTAINER(w), ui->log_tree); + gtk_widget_destroy(w); + ui->log_view = NULL; +} + +void gfio_view_log(struct gui *ui) +{ + GtkWidget *win, *scroll, *vbox, *box; + + if (ui->log_view) + return; + + ui->log_view = win = gtk_window_new(GTK_WINDOW_TOPLEVEL); + gtk_window_set_title(GTK_WINDOW(win), "Log"); + gtk_window_set_default_size(GTK_WINDOW(win), 700, 500); + + scroll = gtk_scrolled_window_new(NULL, NULL); + + gtk_container_set_border_width(GTK_CONTAINER(scroll), 5); + + gtk_scrolled_window_set_policy(GTK_SCROLLED_WINDOW(scroll), GTK_POLICY_AUTOMATIC, GTK_POLICY_AUTOMATIC); + + box = gtk_hbox_new(TRUE, 0); + gtk_box_pack_start(GTK_BOX(box), ui->log_tree, TRUE, TRUE, 0); + g_signal_connect(box, "destroy", G_CALLBACK(view_log_destroy), 
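+			/* user data: lets view_log_destroy() detach ui->log_tree */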
ui); + gtk_scrolled_window_add_with_viewport(GTK_SCROLLED_WINDOW(scroll), box); + + vbox = gtk_vbox_new(TRUE, 5); + gtk_box_pack_start(GTK_BOX(vbox), scroll, TRUE, TRUE, 0); + + gtk_container_add(GTK_CONTAINER(win), vbox); + gtk_widget_show_all(win); +} + +static void view_log(GtkWidget *w, gpointer data) +{ + struct gui *ui = (struct gui *) data; + + gfio_view_log(ui); +} + +static void connect_job_entry(GtkWidget *w, gpointer data) +{ + struct gui *ui = (struct gui *) data; + struct gui_entry *ge; + + ge = get_ge_from_cur_tab(ui); + if (ge) + connect_clicked(w, ge); +} + +static void send_job_entry(GtkWidget *w, gpointer data) +{ + struct gui *ui = (struct gui *) data; + struct gui_entry *ge; + + ge = get_ge_from_cur_tab(ui); + if (ge) + send_clicked(w, ge); +} + +static void edit_job_entry(GtkWidget *w, gpointer data) +{ + struct gui *ui = (struct gui *) data; + struct gui_entry *ge; + + ge = get_ge_from_cur_tab(ui); + if (ge && ge->client) + gopt_get_options_window(ui->window, ge->client); +} + +static void start_job_entry(GtkWidget *w, gpointer data) +{ + struct gui *ui = (struct gui *) data; + struct gui_entry *ge; + + ge = get_ge_from_cur_tab(ui); + if (ge) + start_job_clicked(w, ge); +} + +static void view_results(GtkWidget *w, gpointer data) +{ + struct gui *ui = (struct gui *) data; + struct gfio_client *gc; + struct gui_entry *ge; + + ge = get_ge_from_cur_tab(ui); + if (!ge) + return; + + if (ge->results_window) + return; + + gc = ge->client; + if (gc && gc->nr_results) + gfio_display_end_results(gc); +} + +static void __update_graph_settings(struct gfio_graphs *g) +{ + line_graph_set_data_count_limit(g->iops_graph, gfio_graph_limit); + graph_set_font(g->iops_graph, gfio_graph_font); + line_graph_set_data_count_limit(g->bandwidth_graph, gfio_graph_limit); + graph_set_font(g->bandwidth_graph, gfio_graph_font); +} + +static void ge_update_settings_fn(gpointer key, gpointer value, gpointer data) +{ + struct gui_entry *ge = (struct gui_entry *) value; + GdkEvent *ev; + + __update_graph_settings(&ge->graphs); + + ev = gdk_event_new(GDK_EXPOSE); + g_signal_emit_by_name(G_OBJECT(ge->graphs.drawing_area), GFIO_DRAW_EVENT, GTK_WIDGET(ge->graphs.drawing_area), ev, &ge->graphs); + gdk_event_free(ev); +} + +static void update_graph_limits(void) +{ + struct gui *ui = &main_ui; + GdkEvent *ev; + + __update_graph_settings(&ui->graphs); + + ev = gdk_event_new(GDK_EXPOSE); + g_signal_emit_by_name(G_OBJECT(ui->graphs.drawing_area), GFIO_DRAW_EVENT, GTK_WIDGET(ui->graphs.drawing_area), ev, &ui->graphs); + gdk_event_free(ev); + + g_hash_table_foreach(ui->ge_hash, ge_update_settings_fn, NULL); +} + +static void preferences(GtkWidget *w, gpointer data) +{ + GtkWidget *dialog, *frame, *box, **buttons, *vbox, *font; + GtkWidget *hbox, *spin, *entry, *spin_int; + struct gui *ui = (struct gui *) data; + int i; + + dialog = gtk_dialog_new_with_buttons("Preferences", + GTK_WINDOW(ui->window), + GTK_DIALOG_DESTROY_WITH_PARENT, + GTK_STOCK_OK, GTK_RESPONSE_ACCEPT, + GTK_STOCK_CANCEL, GTK_RESPONSE_REJECT, + NULL); + + frame = gtk_frame_new("Graphing"); + vbox = gtk_dialog_get_content_area(GTK_DIALOG(dialog)); + gtk_box_pack_start(GTK_BOX(vbox), frame, FALSE, FALSE, 5); + vbox = gtk_vbox_new(FALSE, 6); + gtk_container_add(GTK_CONTAINER(frame), vbox); + + hbox = gtk_hbox_new(FALSE, 5); + gtk_box_pack_start(GTK_BOX(vbox), hbox, FALSE, FALSE, 5); + entry = gtk_label_new("Font face to use for graph labels"); + gtk_box_pack_start(GTK_BOX(hbox), entry, TRUE, TRUE, 5); + + font = 
gtk_font_button_new_with_font(gfio_graph_font);
+	gtk_box_pack_start(GTK_BOX(hbox), font, FALSE, FALSE, 5);
+
+	box = gtk_vbox_new(FALSE, 6);
+	gtk_box_pack_start(GTK_BOX(vbox), box, FALSE, FALSE, 5);
+
+	hbox = gtk_hbox_new(FALSE, 5);
+	gtk_box_pack_start(GTK_BOX(box), hbox, TRUE, TRUE, 5);
+	entry = gtk_label_new("Maximum number of data points in graph (seconds)");
+	gtk_box_pack_start(GTK_BOX(hbox), entry, FALSE, FALSE, 5);
+
+	spin = create_spinbutton(hbox, 10, 1000000, gfio_graph_limit);
+
+	box = gtk_vbox_new(FALSE, 6);
+	gtk_box_pack_start(GTK_BOX(vbox), box, FALSE, FALSE, 5);
+
+	hbox = gtk_hbox_new(FALSE, 5);
+	gtk_box_pack_start(GTK_BOX(box), hbox, TRUE, TRUE, 5);
+	entry = gtk_label_new("Client ETA request interval (msec)");
+	gtk_box_pack_start(GTK_BOX(hbox), entry, FALSE, FALSE, 5);
+
+	spin_int = create_spinbutton(hbox, 100, 100000, gfio_client_ops.eta_msec);
+	frame = gtk_frame_new("Debug logging");
+	vbox = gtk_dialog_get_content_area(GTK_DIALOG(dialog));
+	gtk_box_pack_start(GTK_BOX(vbox), frame, FALSE, FALSE, 5);
+	vbox = gtk_vbox_new(FALSE, 6);
+	gtk_container_add(GTK_CONTAINER(frame), vbox);
+
+	box = gtk_hbox_new(FALSE, 6);
+	gtk_container_add(GTK_CONTAINER(vbox), box);
+
+	buttons = malloc(sizeof(GtkWidget *) * FD_DEBUG_MAX);
+
+	for (i = 0; i < FD_DEBUG_MAX; i++) {
+		if (i == 7) {
+			box = gtk_hbox_new(FALSE, 6);
+			gtk_container_add(GTK_CONTAINER(vbox), box);
+		}
+
+		buttons[i] = gtk_check_button_new_with_label(debug_levels[i].name);
+		gtk_widget_set_tooltip_text(buttons[i], debug_levels[i].help);
+		gtk_box_pack_start(GTK_BOX(box), buttons[i], FALSE, FALSE, 6);
+	}
+
+	gtk_widget_show_all(dialog);
+
+	if (gtk_dialog_run(GTK_DIALOG(dialog)) != GTK_RESPONSE_ACCEPT) {
+		gtk_widget_destroy(dialog);
+		return;
+	}
+
+	for (i = 0; i < FD_DEBUG_MAX; i++) {
+		int set;
+
+		set = gtk_toggle_button_get_active(GTK_TOGGLE_BUTTON(buttons[i]));
+		if (set)
+			fio_debug |= (1UL << i);
+	}
+
+	gfio_graph_font = strdup(gtk_font_button_get_font_name(GTK_FONT_BUTTON(font)));
+	gfio_graph_limit = gtk_spin_button_get_value_as_int(GTK_SPIN_BUTTON(spin));
+	update_graph_limits();
+	gfio_client_ops.eta_msec = gtk_spin_button_get_value_as_int(GTK_SPIN_BUTTON(spin_int));
+
+	gtk_widget_destroy(dialog);
+}
+
+static void about_dialog(GtkWidget *w, gpointer data)
+{
+	const char *authors[] = {
+		"Jens Axboe <axboe@kernel.dk>",
+		"Stephen Cameron <stephenmcameron@gmail.com>",
+		NULL
+	};
+	const char *license[] = {
+		"Fio is free software; you can redistribute it and/or modify "
+		"it under the terms of the GNU General Public License as published by "
+		"the Free Software Foundation; either version 2 of the License, or "
+		"(at your option) any later version.\n",
+		"Fio is distributed in the hope that it will be useful, "
+		"but WITHOUT ANY WARRANTY; without even the implied warranty of "
+		"MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the "
See the " + "GNU General Public License for more details.\n", + "You should have received a copy of the GNU General Public License " + "along with Fio; if not, write to the Free Software Foundation, Inc., " + "51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA\n" + }; + char *license_trans; + + license_trans = g_strconcat(license[0], "\n", license[1], "\n", + license[2], "\n", NULL); + + gtk_show_about_dialog(NULL, + "program-name", "gfio", + "comments", "Gtk2 UI for fio", + "license", license_trans, + "website", "http://git.kernel.dk/cgit/fio/", + "authors", authors, + "version", fio_version_string, + "copyright", "© 2012-2017 Jens Axboe ", + "logo-icon-name", "fio", + /* Must be last: */ + "wrap-license", TRUE, + NULL); + + g_free(license_trans); +} + +static GtkActionEntry menu_items[] = { + { "FileMenuAction", GTK_STOCK_FILE, "File", NULL, NULL, NULL}, + { "ViewMenuAction", GTK_STOCK_FILE, "View", NULL, NULL, NULL}, + { "JobMenuAction", GTK_STOCK_FILE, "Job", NULL, NULL, NULL}, + { "HelpMenuAction", GTK_STOCK_HELP, "Help", NULL, NULL, NULL}, + { "NewFile", GTK_STOCK_NEW, "New", "N", NULL, G_CALLBACK(file_new) }, + { "CloseFile", GTK_STOCK_CLOSE, "Close", "W", NULL, G_CALLBACK(file_close) }, + { "OpenFile", GTK_STOCK_OPEN, NULL, "O", NULL, G_CALLBACK(file_open) }, + { "SaveFile", GTK_STOCK_SAVE, NULL, "S", NULL, G_CALLBACK(file_save) }, + { "Preferences", GTK_STOCK_PREFERENCES, NULL, "p", NULL, G_CALLBACK(preferences) }, + { "ViewLog", NULL, "Log", "l", NULL, G_CALLBACK(view_log) }, + { "ViewResults", NULL, "Results", "R", NULL, G_CALLBACK(view_results) }, + { "ConnectJob", NULL, "Connect", "D", NULL, G_CALLBACK(connect_job_entry) }, + { "EditJob", NULL, "Edit job", "E", NULL, G_CALLBACK(edit_job_entry) }, + { "SendJob", NULL, "Send job", "X", NULL, G_CALLBACK(send_job_entry) }, + { "StartJob", NULL, "Start job", "L", NULL, G_CALLBACK(start_job_entry) }, + { "Quit", GTK_STOCK_QUIT, NULL, "Q", NULL, G_CALLBACK(quit_clicked) }, + { "About", GTK_STOCK_ABOUT, NULL, NULL, NULL, G_CALLBACK(about_dialog) }, +}; +static gint nmenu_items = ARRAY_SIZE(menu_items); + +static const gchar *ui_string = " \ + \ + \ + \ + \ + \ + \ + \ + \ + \ + \ + \ + \ + \ + \ + \ + \ + \ + \ + \ + \ + \ + \ + \ + \ + \ + \ + \ + \ + \ + \ + \ + \ + \ +"; + +static GtkWidget *get_menubar_menu(GtkWidget *window, GtkUIManager *ui_manager, + struct gui *ui) +{ + GtkActionGroup *action_group; + GError *error = 0; + + action_group = gtk_action_group_new("Menu"); + gtk_action_group_add_actions(action_group, menu_items, nmenu_items, ui); + + gtk_ui_manager_insert_action_group(ui_manager, action_group, 0); + gtk_ui_manager_add_ui_from_string(GTK_UI_MANAGER(ui_manager), ui_string, -1, &error); + + gtk_window_add_accel_group(GTK_WINDOW(window), gtk_ui_manager_get_accel_group(ui_manager)); + + return gtk_ui_manager_get_widget(ui_manager, "/MainMenu"); +} + +void gfio_ui_setup(GtkSettings *settings, GtkWidget *menubar, + GtkWidget *vbox, GtkUIManager *ui_manager) +{ + gtk_box_pack_start(GTK_BOX(vbox), menubar, FALSE, FALSE, 0); +} + +static void combo_entry_changed(GtkComboBox *box, gpointer data) +{ + struct gui_entry *ge = (struct gui_entry *) data; + gint index; + + index = gtk_combo_box_get_active(box); + + multitext_set_entry(&ge->eta.iotype, index); + multitext_set_entry(&ge->eta.bs, index); + multitext_set_entry(&ge->eta.ioengine, index); + multitext_set_entry(&ge->eta.iodepth, index); +} + +static void combo_entry_destroy(GtkWidget *widget, gpointer data) +{ + struct gui_entry *ge = (struct gui_entry *) data; + 
+ multitext_free(&ge->eta.iotype); + multitext_free(&ge->eta.bs); + multitext_free(&ge->eta.ioengine); + multitext_free(&ge->eta.iodepth); +} + +static GtkWidget *new_client_page(struct gui_entry *ge) +{ + GtkWidget *main_vbox, *probe, *probe_frame, *probe_box; + GtkWidget *scrolled_window, *bottom_align, *top_align, *top_vbox; + + main_vbox = gtk_vbox_new(FALSE, 3); + + top_align = gtk_alignment_new(0, 0, 1, 0); + top_vbox = gtk_vbox_new(FALSE, 3); + gtk_container_add(GTK_CONTAINER(top_align), top_vbox); + gtk_box_pack_start(GTK_BOX(main_vbox), top_align, FALSE, FALSE, 0); + + probe = gtk_frame_new("Job"); + gtk_box_pack_start(GTK_BOX(main_vbox), probe, FALSE, FALSE, 3); + probe_frame = gtk_vbox_new(FALSE, 3); + gtk_container_add(GTK_CONTAINER(probe), probe_frame); + + probe_box = gtk_hbox_new(FALSE, 3); + gtk_box_pack_start(GTK_BOX(probe_frame), probe_box, FALSE, FALSE, 3); + ge->probe.hostname = new_info_label_in_frame(probe_box, "Host"); + ge->probe.os = new_info_label_in_frame(probe_box, "OS"); + ge->probe.arch = new_info_label_in_frame(probe_box, "Architecture"); + ge->probe.fio_ver = new_info_label_in_frame(probe_box, "Fio version"); + + probe_box = gtk_hbox_new(FALSE, 3); + gtk_box_pack_start(GTK_BOX(probe_frame), probe_box, FALSE, FALSE, 3); + + ge->eta.names = new_combo_entry_in_frame(probe_box, "Jobs"); + g_signal_connect(ge->eta.names, "changed", G_CALLBACK(combo_entry_changed), ge); + g_signal_connect(ge->eta.names, "destroy", G_CALLBACK(combo_entry_destroy), ge); + ge->eta.iotype.entry = new_info_entry_in_frame(probe_box, "IO"); + ge->eta.bs.entry = new_info_entry_in_frame(probe_box, "Blocksize (Read/Write/Trim)"); + ge->eta.ioengine.entry = new_info_entry_in_frame(probe_box, "IO Engine"); + ge->eta.iodepth.entry = new_info_entry_in_frame(probe_box, "IO Depth"); + ge->eta.jobs = new_info_entry_in_frame(probe_box, "Jobs"); + ge->eta.files = new_info_entry_in_frame(probe_box, "Open files"); + + probe_box = gtk_hbox_new(FALSE, 3); + gtk_box_pack_start(GTK_BOX(probe_frame), probe_box, FALSE, FALSE, 3); + ge->eta.read_bw = new_info_entry_in_frame_rgb(probe_box, "Read BW", GFIO_READ_R, GFIO_READ_G, GFIO_READ_B); + ge->eta.read_iops = new_info_entry_in_frame_rgb(probe_box, "Read IOPS", GFIO_READ_R, GFIO_READ_G, GFIO_READ_B); + ge->eta.write_bw = new_info_entry_in_frame_rgb(probe_box, "Write BW", GFIO_WRITE_R, GFIO_WRITE_G, GFIO_WRITE_B); + ge->eta.write_iops = new_info_entry_in_frame_rgb(probe_box, "Write IOPS", GFIO_WRITE_R, GFIO_WRITE_G, GFIO_WRITE_B); + ge->eta.trim_bw = new_info_entry_in_frame_rgb(probe_box, "Trim BW", GFIO_TRIM_R, GFIO_TRIM_G, GFIO_TRIM_B); + ge->eta.trim_iops = new_info_entry_in_frame_rgb(probe_box, "Trim IOPS", GFIO_TRIM_R, GFIO_TRIM_G, GFIO_TRIM_B); + + /* + * Only add this if we have a commit rate + */ +#if 0 + probe_box = gtk_hbox_new(FALSE, 3); + gtk_box_pack_start(GTK_BOX(probe_frame), probe_box, TRUE, FALSE, 3); + + ge->eta.cr_bw = new_info_label_in_frame(probe_box, "Commit BW"); + ge->eta.cr_iops = new_info_label_in_frame(probe_box, "Commit IOPS"); + + ge->eta.cw_bw = new_info_label_in_frame(probe_box, "Commit BW"); + ge->eta.cw_iops = new_info_label_in_frame(probe_box, "Commit IOPS"); +#endif + + /* + * Set up a drawing area and IOPS and bandwidth graphs + */ + ge->graphs.drawing_area = gtk_drawing_area_new(); + gtk_widget_set_size_request(GTK_WIDGET(ge->graphs.drawing_area), + DRAWING_AREA_XDIM, DRAWING_AREA_YDIM); + gtk_widget_modify_bg(ge->graphs.drawing_area, GTK_STATE_NORMAL, &gfio_color_lightyellow); + 
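+	/*
+	 * Redraw hooks: GFIO_DRAW_EVENT comes from gcompat.h and presumably
+	 * maps to the gtk2 "expose_event" (gtk3 "draw") signal name.
+	 */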
g_signal_connect(G_OBJECT(ge->graphs.drawing_area), GFIO_DRAW_EVENT, + G_CALLBACK(on_expose_drawing_area), &ge->graphs); + g_signal_connect(G_OBJECT(ge->graphs.drawing_area), "configure_event", + G_CALLBACK(on_config_drawing_area), &ge->graphs); + scrolled_window = gtk_scrolled_window_new(NULL, NULL); + gtk_scrolled_window_set_policy(GTK_SCROLLED_WINDOW(scrolled_window), + GTK_POLICY_AUTOMATIC, GTK_POLICY_AUTOMATIC); + gtk_scrolled_window_add_with_viewport(GTK_SCROLLED_WINDOW(scrolled_window), + ge->graphs.drawing_area); + gtk_box_pack_start(GTK_BOX(main_vbox), scrolled_window, TRUE, TRUE, 0); + + setup_graphs(&ge->graphs); + + /* + * Set up alignments for widgets at the bottom of ui, + * align bottom left, expand horizontally but not vertically + */ + bottom_align = gtk_alignment_new(0, 1, 1, 0); + ge->buttonbox = gtk_hbox_new(FALSE, 0); + gtk_container_add(GTK_CONTAINER(bottom_align), ge->buttonbox); + gtk_box_pack_start(GTK_BOX(main_vbox), bottom_align, FALSE, FALSE, 0); + + add_buttons(ge, buttonspeclist, ARRAY_SIZE(buttonspeclist)); + + /* + * Set up thread status progress bar + */ + ge->thread_status_pb = gtk_progress_bar_new(); + gtk_progress_bar_set_fraction(GTK_PROGRESS_BAR(ge->thread_status_pb), 0.0); + gtk_progress_bar_set_text(GTK_PROGRESS_BAR(ge->thread_status_pb), "No connections"); + gtk_container_add(GTK_CONTAINER(ge->buttonbox), ge->thread_status_pb); + + + return main_vbox; +} + +static GtkWidget *new_main_page(struct gui *ui) +{ + GtkWidget *main_vbox, *probe, *probe_frame, *probe_box; + GtkWidget *scrolled_window, *bottom_align, *top_align, *top_vbox; + + main_vbox = gtk_vbox_new(FALSE, 3); + + /* + * Set up alignments for widgets at the top of ui, + * align top left, expand horizontally but not vertically + */ + top_align = gtk_alignment_new(0, 0, 1, 0); + top_vbox = gtk_vbox_new(FALSE, 0); + gtk_container_add(GTK_CONTAINER(top_align), top_vbox); + gtk_box_pack_start(GTK_BOX(main_vbox), top_align, FALSE, FALSE, 0); + + probe = gtk_frame_new("Run statistics"); + gtk_box_pack_start(GTK_BOX(main_vbox), probe, FALSE, FALSE, 3); + probe_frame = gtk_vbox_new(FALSE, 3); + gtk_container_add(GTK_CONTAINER(probe), probe_frame); + + probe_box = gtk_hbox_new(FALSE, 3); + gtk_box_pack_start(GTK_BOX(probe_frame), probe_box, FALSE, FALSE, 3); + ui->eta.jobs = new_info_entry_in_frame(probe_box, "Running"); + ui->eta.read_bw = new_info_entry_in_frame_rgb(probe_box, "Read BW", GFIO_READ_R, GFIO_READ_G, GFIO_READ_B); + ui->eta.read_iops = new_info_entry_in_frame_rgb(probe_box, "IOPS", GFIO_READ_R, GFIO_READ_G, GFIO_READ_B); + ui->eta.write_bw = new_info_entry_in_frame_rgb(probe_box, "Write BW", GFIO_WRITE_R, GFIO_WRITE_G, GFIO_WRITE_B); + ui->eta.write_iops = new_info_entry_in_frame_rgb(probe_box, "IOPS", GFIO_WRITE_R, GFIO_WRITE_G, GFIO_WRITE_B); + ui->eta.trim_bw = new_info_entry_in_frame_rgb(probe_box, "Trim BW", GFIO_TRIM_R, GFIO_TRIM_G, GFIO_TRIM_B); + ui->eta.trim_iops = new_info_entry_in_frame_rgb(probe_box, "IOPS", GFIO_TRIM_R, GFIO_TRIM_G, GFIO_TRIM_B); + + /* + * Only add this if we have a commit rate + */ +#if 0 + probe_box = gtk_hbox_new(FALSE, 3); + gtk_box_pack_start(GTK_BOX(probe_frame), probe_box, TRUE, FALSE, 3); + + ui->eta.cr_bw = new_info_label_in_frame(probe_box, "Commit BW"); + ui->eta.cr_iops = new_info_label_in_frame(probe_box, "Commit IOPS"); + + ui->eta.cw_bw = new_info_label_in_frame(probe_box, "Commit BW"); + ui->eta.cw_iops = new_info_label_in_frame(probe_box, "Commit IOPS"); +#endif + + /* + * Set up a drawing area and IOPS and bandwidth graphs + */ + 
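+	/* Same graph setup as the per-job pages built in new_client_page() */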
ui->graphs.drawing_area = gtk_drawing_area_new(); + gtk_widget_set_size_request(GTK_WIDGET(ui->graphs.drawing_area), + DRAWING_AREA_XDIM, DRAWING_AREA_YDIM); + gtk_widget_modify_bg(ui->graphs.drawing_area, GTK_STATE_NORMAL, &gfio_color_lightyellow); + g_signal_connect(G_OBJECT(ui->graphs.drawing_area), GFIO_DRAW_EVENT, + G_CALLBACK(on_expose_drawing_area), &ui->graphs); + g_signal_connect(G_OBJECT(ui->graphs.drawing_area), "configure_event", + G_CALLBACK(on_config_drawing_area), &ui->graphs); + scrolled_window = gtk_scrolled_window_new(NULL, NULL); + gtk_scrolled_window_set_policy(GTK_SCROLLED_WINDOW(scrolled_window), + GTK_POLICY_AUTOMATIC, GTK_POLICY_AUTOMATIC); + gtk_scrolled_window_add_with_viewport(GTK_SCROLLED_WINDOW(scrolled_window), + ui->graphs.drawing_area); + gtk_box_pack_start(GTK_BOX(main_vbox), scrolled_window, + TRUE, TRUE, 0); + + setup_graphs(&ui->graphs); + + /* + * Set up alignments for widgets at the bottom of ui, + * align bottom left, expand horizontally but not vertically + */ + bottom_align = gtk_alignment_new(0, 1, 1, 0); + ui->buttonbox = gtk_hbox_new(FALSE, 0); + gtk_container_add(GTK_CONTAINER(bottom_align), ui->buttonbox); + gtk_box_pack_start(GTK_BOX(main_vbox), bottom_align, FALSE, FALSE, 0); + + /* + * Set up thread status progress bar + */ + ui->thread_status_pb = gtk_progress_bar_new(); + gtk_progress_bar_set_fraction(GTK_PROGRESS_BAR(ui->thread_status_pb), 0.0); + gtk_progress_bar_set_text(GTK_PROGRESS_BAR(ui->thread_status_pb), "No connections"); + gtk_container_add(GTK_CONTAINER(ui->buttonbox), ui->thread_status_pb); + + return main_vbox; +} + +static gboolean notebook_switch_page(GtkNotebook *notebook, GtkWidget *widget, + guint page, gpointer data) + +{ + struct gui *ui = (struct gui *) data; + struct gui_entry *ge; + + if (!page) { + set_job_menu_visible(ui, 0); + set_view_results_visible(ui, 0); + return TRUE; + } + + set_job_menu_visible(ui, 1); + ge = get_ge_from_page(ui, page, NULL); + if (ge) + update_button_states(ui, ge); + + return TRUE; +} + +static gint compare_recent_items(GtkRecentInfo *a, GtkRecentInfo *b) +{ + time_t time_a = gtk_recent_info_get_visited(a); + time_t time_b = gtk_recent_info_get_visited(b); + + return time_b - time_a; +} + +static void add_recent_file_items(struct gui *ui) +{ + const gchar *gfio = g_get_application_name(); + GList *items, *item; + int i = 0; + + if (ui->recent_ui_id) { + gtk_ui_manager_remove_ui(ui->uimanager, ui->recent_ui_id); + gtk_ui_manager_ensure_update(ui->uimanager); + } + ui->recent_ui_id = gtk_ui_manager_new_merge_id(ui->uimanager); + + if (ui->actiongroup) { + gtk_ui_manager_remove_action_group(ui->uimanager, ui->actiongroup); + g_object_unref(ui->actiongroup); + } + ui->actiongroup = gtk_action_group_new("RecentFileActions"); + + gtk_ui_manager_insert_action_group(ui->uimanager, ui->actiongroup, -1); + + items = gtk_recent_manager_get_items(ui->recentmanager); + items = g_list_sort(items, (GCompareFunc) compare_recent_items); + + for (item = items; item && item->data; item = g_list_next(item)) { + GtkRecentInfo *info = (GtkRecentInfo *) item->data; + gchar *action_name; + const gchar *label; + GtkAction *action; + + if (!gtk_recent_info_has_application(info, gfio)) + continue; + + /* + * We only support local files for now + */ + if (!gtk_recent_info_is_local(info) || !gtk_recent_info_exists(info)) + continue; + + action_name = g_strdup_printf("RecentFile%u", i++); + label = gtk_recent_info_get_display_name(info); + + action = g_object_new(GTK_TYPE_ACTION, + "name", action_name, + "label", 
label, NULL); + + g_object_set_data_full(G_OBJECT(action), "gtk-recent-info", + gtk_recent_info_ref(info), + (GDestroyNotify) gtk_recent_info_unref); + + + g_signal_connect(action, "activate", G_CALLBACK(recent_open), ui); + + gtk_action_group_add_action(ui->actiongroup, action); + g_object_unref(action); + + gtk_ui_manager_add_ui(ui->uimanager, ui->recent_ui_id, + "/MainMenu/FileMenu/FileRecentFiles", + label, action_name, + GTK_UI_MANAGER_MENUITEM, FALSE); + + g_free(action_name); + + if (i == 8) + break; + } + + g_list_foreach(items, (GFunc) gtk_recent_info_unref, NULL); + g_list_free(items); +} + +static void drag_and_drop_received(GtkWidget *widget, GdkDragContext *ctx, + gint x, gint y, GtkSelectionData *seldata, + guint info, guint time, gpointer *data) +{ + struct gui *ui = (struct gui *) data; + gchar **uris; + GtkWidget *source; + + source = gtk_drag_get_source_widget(ctx); + if (source && widget == gtk_widget_get_toplevel(source)) { + gtk_drag_finish(ctx, FALSE, FALSE, time); + return; + } + + uris = gtk_selection_data_get_uris(seldata); + if (!uris) { + gtk_drag_finish(ctx, FALSE, FALSE, time); + return; + } + + if (uris[0]) + do_file_open_with_tab(ui, uris[0]); + + gtk_drag_finish(ctx, TRUE, FALSE, time); + g_strfreev(uris); +} + +static void init_ui(int *argc, char **argv[], struct gui *ui) +{ + GtkSettings *settings; + GtkWidget *vbox; + + /* Magical g*thread incantation, you just need this thread stuff. + * Without it, the update that happens in gfio_update_thread_status + * doesn't really happen in a timely fashion, you need expose events + */ +#if !GLIB_CHECK_VERSION(2, 31, 0) + if (!g_thread_supported()) + g_thread_init(NULL); +#endif + + gdk_threads_init(); + + gtk_init(argc, argv); + settings = gtk_settings_get_default(); + gtk_settings_set_long_property(settings, "gtk_tooltip_timeout", 10, "gfio setting"); +#if !GLIB_CHECK_VERSION(2, 36, 0) + g_type_init(); +#endif + gdk_color_parse("#fffff4", &gfio_color_lightyellow); + gdk_color_parse("white", &gfio_color_white); + + ui->window = gtk_window_new(GTK_WINDOW_TOPLEVEL); + gtk_window_set_title(GTK_WINDOW(ui->window), "fio"); + gtk_window_set_default_size(GTK_WINDOW(ui->window), 1024, 768); + + g_signal_connect(ui->window, "delete-event", G_CALLBACK(quit_clicked), ui); + g_signal_connect(ui->window, "destroy", G_CALLBACK(quit_clicked), ui); + + ui->vbox = gtk_vbox_new(FALSE, 0); + gtk_container_add(GTK_CONTAINER(ui->window), ui->vbox); + + ui->uimanager = gtk_ui_manager_new(); + ui->menu = get_menubar_menu(ui->window, ui->uimanager, ui); + gfio_ui_setup(settings, ui->menu, ui->vbox, ui->uimanager); + + ui->recentmanager = gtk_recent_manager_get_default(); + add_recent_file_items(ui); + + ui->notebook = gtk_notebook_new(); + g_signal_connect(ui->notebook, "switch-page", G_CALLBACK(notebook_switch_page), ui); + gtk_notebook_set_scrollable(GTK_NOTEBOOK(ui->notebook), 1); + gtk_notebook_popup_enable(GTK_NOTEBOOK(ui->notebook)); + gtk_container_add(GTK_CONTAINER(ui->vbox), ui->notebook); + + vbox = new_main_page(ui); + gtk_drag_dest_set(GTK_WIDGET(ui->window), GTK_DEST_DEFAULT_ALL, NULL, 1, GDK_ACTION_COPY); + gtk_drag_dest_add_uri_targets(GTK_WIDGET(ui->window)); + g_signal_connect(ui->window, "drag-data-received", G_CALLBACK(drag_and_drop_received), ui); + + gtk_notebook_append_page(GTK_NOTEBOOK(ui->notebook), vbox, gtk_label_new("Main")); + + gfio_ui_setup_log(ui); + + gtk_widget_show_all(ui->window); +} + +int main(int argc, char *argv[], char *envp[]) +{ + if (initialize_fio(envp)) + return 1; + if (fio_init_options()) + 
+		return 1;
+
+	gopt_init();
+
+	memset(&main_ui, 0, sizeof(main_ui));
+	main_ui.ge_hash = g_hash_table_new(g_int_hash, g_int_equal);
+
+	init_ui(&argc, &argv, &main_ui);
+
+	gdk_threads_enter();
+	gtk_main();
+	gdk_threads_leave();
+
+	g_hash_table_destroy(main_ui.ge_hash);
+
+	gopt_exit();
+	return 0;
+}
diff --git a/gfio.h b/gfio.h
new file mode 100644
index 0000000..aa14e3c
--- /dev/null
+++ b/gfio.h
@@ -0,0 +1,179 @@
+#ifndef GFIO_H
+#define GFIO_H
+
+#include <gtk/gtk.h>
+
+#include "gcompat.h"
+#include "stat.h"
+#include "thread_options.h"
+#include "ghelpers.h"
+#include "graph.h"
+
+struct probe_widget {
+	GtkWidget *hostname;
+	GtkWidget *os;
+	GtkWidget *arch;
+	GtkWidget *fio_ver;
+};
+
+struct eta_widget {
+	GtkWidget *names;
+	struct multitext_widget iotype;
+	struct multitext_widget bs;
+	struct multitext_widget ioengine;
+	struct multitext_widget iodepth;
+	GtkWidget *jobs;
+	GtkWidget *files;
+	GtkWidget *read_bw;
+	GtkWidget *read_iops;
+	GtkWidget *cr_bw;
+	GtkWidget *cr_iops;
+	GtkWidget *write_bw;
+	GtkWidget *write_iops;
+	GtkWidget *cw_bw;
+	GtkWidget *cw_iops;
+	GtkWidget *trim_bw;
+	GtkWidget *trim_iops;
+};
+
+struct gfio_graphs {
+#define DRAWING_AREA_XDIM 1000
+#define DRAWING_AREA_YDIM 400
+	GtkWidget *drawing_area;
+	struct graph *iops_graph;
+	graph_label_t read_iops;
+	graph_label_t write_iops;
+	graph_label_t trim_iops;
+	struct graph *bandwidth_graph;
+	graph_label_t read_bw;
+	graph_label_t write_bw;
+	graph_label_t trim_bw;
+};
+
+/*
+ * Main window widgets and data
+ */
+struct gui {
+	GtkUIManager *uimanager;
+	GtkRecentManager *recentmanager;
+	GtkActionGroup *actiongroup;
+	guint recent_ui_id;
+	GtkWidget *menu;
+	GtkWidget *window;
+	GtkWidget *vbox;
+	GtkWidget *thread_status_pb;
+	GtkWidget *buttonbox;
+	GtkWidget *notebook;
+	GtkWidget *error_info_bar;
+	GtkWidget *error_label;
+	GtkListStore *log_model;
+	GtkWidget *log_tree;
+	GtkWidget *log_view;
+	struct gfio_graphs graphs;
+	struct probe_widget probe;
+	struct eta_widget eta;
+	pthread_t server_t;
+
+	pthread_t t;
+	int handler_running;
+
+	GHashTable *ge_hash;
+} main_ui;
+
+enum {
+	GE_STATE_NEW = 1,
+	GE_STATE_CONNECTED,
+	GE_STATE_JOB_SENT,
+	GE_STATE_JOB_STARTED,
+	GE_STATE_JOB_RUNNING,
+	GE_STATE_JOB_DONE,
+};
+
+enum {
+	GFIO_BUTTON_CONNECT = 0,
+	GFIO_BUTTON_SEND,
+	GFIO_BUTTON_START,
+	GFIO_BUTTON_NR,
+};
+
+/*
+ * Notebook entry
+ */
+struct gui_entry {
+	struct gui *ui;
+
+	GtkWidget *vbox;
+	GtkWidget *job_notebook;
+	GtkWidget *thread_status_pb;
+	GtkWidget *buttonbox;
+	GtkWidget *button[GFIO_BUTTON_NR];
+	GtkWidget *notebook;
+	GtkWidget *error_info_bar;
+	GtkWidget *error_label;
+	GtkWidget *results_window;
+	GtkWidget *results_notebook;
+	GtkUIManager *results_uimanager;
+	GtkWidget *results_menu;
+	GtkWidget *disk_util_vbox;
+	GtkListStore *log_model;
+	GtkWidget *log_tree;
+	GtkWidget *log_view;
+	struct gfio_graphs graphs;
+	struct probe_widget probe;
+	struct eta_widget eta;
+	GtkWidget *page_label;
+	gint page_num;
+	unsigned int state;
+
+	struct graph *clat_graph;
+	struct graph *lat_bucket_graph;
+
+	struct gfio_client *client;
+	char *job_file;
+	char *host;
+	int port;
+	int type;
+	int server_start;
+};
+
+struct end_results {
+	struct group_run_stats gs;
+	struct thread_stat ts;
+};
+
+struct gfio_client_options {
+	struct flist_head list;
+	struct thread_options o;
+};
+
+struct gfio_client {
+	struct gui_entry *ge;
+	struct fio_client *client;
+	GtkWidget *err_entry;
+	uint32_t client_cpus;
+	uint64_t client_flags;
+
+	struct flist_head o_list;
+	unsigned int o_list_nr;
+
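+	/*
+	 * End-of-run statistics received from the backend; nr_results
+	 * entries, shown via gfio_display_end_results().
+	 */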
struct end_results *results; + unsigned int nr_results; + + uint32_t update_job_status; + volatile uint32_t update_job_done; + + struct cmd_du_pdu *du; + unsigned int nr_du; +}; + +#define GFIO_MIME "text/fio" + +extern void gfio_view_log(struct gui *ui); +extern void gfio_set_state(struct gui_entry *ge, unsigned int state); +extern void clear_ge_ui_info(struct gui_entry *ge); + +extern const char *gfio_graph_font; +extern GdkColor gfio_color_white; +extern GdkColor gfio_color_lightyellow; + +#endif diff --git a/ghelpers.c b/ghelpers.c new file mode 100644 index 0000000..7acf588 --- /dev/null +++ b/ghelpers.c @@ -0,0 +1,201 @@ +#include +#include +#include + +#include "gcompat.h" +#include "ghelpers.h" + +GtkWidget *new_combo_entry_in_frame(GtkWidget *box, const char *label) +{ + GtkWidget *entry, *frame; + + frame = gtk_frame_new(label); + entry = gtk_combo_box_text_new(); + gtk_box_pack_start(GTK_BOX(box), frame, TRUE, TRUE, 3); + gtk_container_add(GTK_CONTAINER(frame), entry); + + return entry; +} + +GtkWidget *new_info_entry_in_frame(GtkWidget *box, const char *label) +{ + GtkWidget *entry, *frame; + + frame = gtk_frame_new(label); + entry = gtk_entry_new(); + gtk_editable_set_editable(GTK_EDITABLE(entry), 0); + gtk_box_pack_start(GTK_BOX(box), frame, TRUE, TRUE, 3); + gtk_container_add(GTK_CONTAINER(frame), entry); + + return entry; +} + +static void fill_color_from_rgb(GdkColor *c, gfloat r, gfloat g, gfloat b) +{ + gint R, G, B; + gchar tmp[8]; + + memset(c, 0, sizeof(*c)); + R = r * 255; + G = g * 255; + B = b * 255; + snprintf(tmp, sizeof(tmp), "#%02x%02x%02x", R, G, B); + gdk_color_parse(tmp, c); +} + +GtkWidget *new_info_entry_in_frame_rgb(GtkWidget *box, const char *label, + gfloat r, gfloat g, gfloat b) +{ + GtkWidget *entry; + GdkColor c; + + entry = new_info_entry_in_frame(box, label); + fill_color_from_rgb(&c, r, g, b); + gtk_widget_modify_text(entry, GTK_STATE_NORMAL, &c); + return entry; +} + +GtkWidget *new_info_label_in_frame(GtkWidget *box, const char *label) +{ + GtkWidget *label_widget; + GtkWidget *frame; + + frame = gtk_frame_new(label); + label_widget = gtk_label_new(NULL); + gtk_box_pack_start(GTK_BOX(box), frame, TRUE, TRUE, 3); + gtk_container_add(GTK_CONTAINER(frame), label_widget); + + return label_widget; +} + +GtkWidget *create_spinbutton(GtkWidget *hbox, double min, double max, double defval) +{ + GtkWidget *button, *box; + + box = gtk_hbox_new(FALSE, 3); + gtk_container_add(GTK_CONTAINER(hbox), box); + + button = gtk_spin_button_new_with_range(min, max, 1.0); + gtk_box_pack_start(GTK_BOX(box), button, TRUE, TRUE, 0); + + gtk_spin_button_set_update_policy(GTK_SPIN_BUTTON(button), GTK_UPDATE_IF_VALID); + gtk_spin_button_set_value(GTK_SPIN_BUTTON(button), defval); + + return button; +} + +void label_set_int_value(GtkWidget *entry, unsigned int val) +{ + char tmp[80]; + + sprintf(tmp, "%u", val); + gtk_label_set_text(GTK_LABEL(entry), tmp); +} + +void entry_set_int_value(GtkWidget *entry, unsigned int val) +{ + char tmp[80]; + + sprintf(tmp, "%u", val); + gtk_entry_set_text(GTK_ENTRY(entry), tmp); +} + +GtkTreeViewColumn *tree_view_column(GtkWidget *tree_view, int index, const char *title, unsigned int flags) +{ + GtkCellRenderer *renderer; + GtkTreeViewColumn *col; + double xalign = 0.0; /* left as default */ + PangoAlignment align; + gboolean visible; + + align = (flags & ALIGN_LEFT) ? PANGO_ALIGN_LEFT : + (flags & ALIGN_RIGHT) ? 
PANGO_ALIGN_RIGHT : + PANGO_ALIGN_CENTER; + visible = !(flags & INVISIBLE); + + renderer = gtk_cell_renderer_text_new(); + col = gtk_tree_view_column_new(); + + gtk_tree_view_column_set_title(col, title); + if (!(flags & UNSORTABLE)) + gtk_tree_view_column_set_sort_column_id(col, index); + gtk_tree_view_column_set_resizable(col, TRUE); + gtk_tree_view_column_pack_start(col, renderer, TRUE); + gtk_tree_view_column_set_expand(col, TRUE); + gtk_tree_view_column_add_attribute(col, renderer, "text", index); + g_object_set(G_OBJECT(renderer), "alignment", align, NULL); + switch (align) { + case PANGO_ALIGN_LEFT: + xalign = 0.0; + break; + case PANGO_ALIGN_CENTER: + xalign = 0.5; + break; + case PANGO_ALIGN_RIGHT: + xalign = 1.0; + break; + } + gtk_cell_renderer_set_alignment(GTK_CELL_RENDERER(renderer), xalign, 0.5); + gtk_tree_view_column_set_visible(col, visible); + gtk_tree_view_append_column(GTK_TREE_VIEW(tree_view), col); + return col; +} + +void multitext_add_entry(struct multitext_widget *mt, const char *text) +{ + mt->text = realloc(mt->text, (mt->max_text + 1) * sizeof(char *)); + mt->text[mt->max_text] = strdup(text); + mt->max_text++; +} + +void multitext_set_entry(struct multitext_widget *mt, unsigned int index) +{ + if (index >= mt->max_text) + return; + if (!mt->text || !mt->text[index]) + return; + + mt->cur_text = index; + gtk_entry_set_text(GTK_ENTRY(mt->entry), mt->text[index]); +} + +void multitext_update_entry(struct multitext_widget *mt, unsigned int index, + const char *text) +{ + if (!mt->text) + return; + + if (mt->text[index]) + free(mt->text[index]); + + mt->text[index] = strdup(text); + if (mt->cur_text == index) + gtk_entry_set_text(GTK_ENTRY(mt->entry), mt->text[index]); +} + +void multitext_free(struct multitext_widget *mt) +{ + int i; + + gtk_entry_set_text(GTK_ENTRY(mt->entry), ""); + + for (i = 0; i < mt->max_text; i++) { + if (mt->text[i]) + free(mt->text[i]); + } + + free(mt->text); + mt->cur_text = -1; + mt->max_text = 0; +} + +GtkWidget *get_scrolled_window(gint border_width) +{ + GtkWidget *scroll; + + scroll = gtk_scrolled_window_new(NULL, NULL); + gtk_container_set_border_width(GTK_CONTAINER(scroll), border_width); + gtk_scrolled_window_set_policy(GTK_SCROLLED_WINDOW(scroll), GTK_POLICY_AUTOMATIC, GTK_POLICY_AUTOMATIC); + + return scroll; +} diff --git a/ghelpers.h b/ghelpers.h new file mode 100644 index 0000000..39a994b --- /dev/null +++ b/ghelpers.h @@ -0,0 +1,35 @@ +#ifndef GFIO_HELPERS_H +#define GFIO_HELPERS_H + +GtkWidget *new_combo_entry_in_frame(GtkWidget *box, const char *label); +GtkWidget *new_info_entry_in_frame(GtkWidget *box, const char *label); +GtkWidget *new_info_label_in_frame(GtkWidget *box, const char *label); +GtkWidget *new_info_entry_in_frame_rgb(GtkWidget *box, const char *label, + gfloat r, gfloat g, gfloat b); +GtkWidget *create_spinbutton(GtkWidget *hbox, double min, double max, double defval); +void label_set_int_value(GtkWidget *entry, unsigned int val); +void entry_set_int_value(GtkWidget *entry, unsigned int val); + +GtkWidget *get_scrolled_window(gint border_width); + +struct multitext_widget { + GtkWidget *entry; + char **text; + unsigned int cur_text; + unsigned int max_text; +}; + +void multitext_add_entry(struct multitext_widget *mt, const char *text); +void multitext_set_entry(struct multitext_widget *mt, unsigned int index); +void multitext_update_entry(struct multitext_widget *mt, unsigned int index, + const char *text); +void multitext_free(struct multitext_widget *mt); + +#define ALIGN_LEFT 1 +#define ALIGN_RIGHT 2 
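+/*
+ * Flags for tree_view_column(); OR them together as needed. A
+ * hypothetical right-aligned column that should not be sortable:
+ *
+ *	tree_view_column(view, 0, "IOPS", ALIGN_RIGHT | UNSORTABLE);
+ */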
+#define INVISIBLE	4
+#define UNSORTABLE	8
+
+GtkTreeViewColumn *tree_view_column(GtkWidget *tree_view, int index, const char *title, unsigned int flags);
+
+#endif
diff --git a/goptions.c b/goptions.c
new file mode 100644
index 0000000..f44254b
--- /dev/null
+++ b/goptions.c
@@ -0,0 +1,1639 @@
+#include <locale.h>
+#include <malloc.h>
+#include <string.h>
+
+#include <glib.h>
+#include <cairo.h>
+#include <gtk/gtk.h>
+
+#include "fio.h"
+#include "gfio.h"
+#include "ghelpers.h"
+#include "gerror.h"
+#include "parse.h"
+#include "optgroup.h"
+
+struct gopt {
+	GtkWidget *box;
+	unsigned int opt_index;
+	unsigned int opt_type;
+	gulong sig_handler;
+	struct gopt_job_view *gjv;
+	struct flist_head changed_list;
+};
+
+struct gopt_combo {
+	struct gopt gopt;
+	GtkWidget *combo;
+};
+
+struct gopt_int {
+	struct gopt gopt;
+	unsigned long long lastval;
+	GtkWidget *spin;
+};
+
+struct gopt_bool {
+	struct gopt gopt;
+	GtkWidget *check;
+};
+
+struct gopt_str {
+	struct gopt gopt;
+	GtkWidget *entry;
+};
+
+struct gopt_str_val {
+	struct gopt gopt;
+	GtkWidget *spin;
+	GtkWidget *combo;
+	unsigned int maxindex;
+};
+
+#define GOPT_RANGE_SPIN	4
+
+struct gopt_range {
+	struct gopt gopt;
+	GtkWidget *spins[GOPT_RANGE_SPIN];
+};
+
+struct gopt_str_multi {
+	struct gopt gopt;
+	GtkWidget *checks[PARSE_MAX_VP];
+};
+
+enum {
+	GOPT_COMBO_INT = 1,
+	GOPT_COMBO_STR,
+	GOPT_INT,
+	GOPT_BOOL,
+	GOPT_STR,
+	GOPT_STR_VAL,
+	GOPT_RANGE,
+	GOPT_STR_MULTI,
+};
+
+struct gopt_frame_widget {
+	GtkWidget *vbox[2];
+	unsigned int nr;
+};
+
+struct gopt_job_view {
+	struct gopt_frame_widget g_widgets[__FIO_OPT_G_NR];
+	GtkWidget *vboxes[__FIO_OPT_C_NR];
+	struct gopt *gopts[FIO_MAX_OPTS];
+	GtkWidget *dialog;
+	GtkWidget *job_combo;
+	struct gfio_client *client;
+	struct flist_head changed_list;
+	struct thread_options *o;
+	int in_job_switch;
+};
+
+static GNode *gopt_dep_tree;
+
+static GtkWidget *gopt_get_group_frame(struct gopt_job_view *gjv,
+				       GtkWidget *box, uint64_t groupmask)
+{
+	uint64_t mask, group;
+	const struct opt_group *og;
+	GtkWidget *frame, *hbox;
+	struct gopt_frame_widget *gfw;
+
+	if (!groupmask)
+		return 0;
+
+	mask = groupmask;
+	og = opt_group_cat_from_mask(&mask);
+	if (!og)
+		return NULL;
+
+	group = ffz64(~groupmask);
+	gfw = &gjv->g_widgets[group];
+	if (!gfw->vbox[0]) {
+		frame = gtk_frame_new(og->name);
+		gtk_box_pack_start(GTK_BOX(box), frame, FALSE, FALSE, 3);
+		hbox = gtk_hbox_new(FALSE, 0);
+		gtk_container_add(GTK_CONTAINER(frame), hbox);
+		gfw->vbox[0] = gtk_vbox_new(TRUE, 5);
+		gfw->vbox[1] = gtk_vbox_new(TRUE, 5);
+		gtk_box_pack_start(GTK_BOX(hbox), gfw->vbox[0], TRUE, TRUE, 5);
+		gtk_box_pack_start(GTK_BOX(hbox), gfw->vbox[1], TRUE, TRUE, 5);
+	}
+
+	hbox = gtk_hbox_new(FALSE, 3);
+	gtk_box_pack_start(GTK_BOX(gfw->vbox[gfw->nr++ & 1]), hbox, FALSE, FALSE, 5);
+	return hbox;
+}
+
+/*
+ * Mark children as invisible, if needed.
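+ * Options with ->hide_on_set invert the sense: their children are
+ * greyed out when the parent is set instead of when it is clear.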
+ */ +static void gopt_set_children_visible(struct gopt_job_view *gjv, + struct fio_option *parent, + gboolean visible) +{ + GNode *child, *node; + + if (parent->hide_on_set) + visible = !visible; + + node = g_node_find(gopt_dep_tree, G_IN_ORDER, G_TRAVERSE_ALL, parent); + child = g_node_first_child(node); + while (child) { + struct fio_option *o = child->data; + struct gopt *g = o->gui_data; + GtkWidget *widget = g->box; + + /* + * Recurse into child, if it also has children + */ + if (g_node_n_children(child)) + gopt_set_children_visible(gjv, o, visible); + + gtk_widget_set_sensitive(widget, visible); + child = g_node_next_sibling(child); + } +} + +static void gopt_mark_index(struct gopt_job_view *gjv, struct gopt *gopt, + unsigned int idx, int type) +{ + INIT_FLIST_HEAD(&gopt->changed_list); + + assert(!gjv->gopts[idx]); + gopt->opt_index = idx; + gopt->opt_type = type; + gopt->gjv = gjv; + gjv->gopts[idx] = gopt; +} + +static void gopt_dialog_update_apply_button(struct gopt_job_view *gjv) +{ + GtkDialog *dialog = GTK_DIALOG(gjv->dialog); + gboolean set; + + set = !flist_empty(&gjv->changed_list); + gtk_dialog_set_response_sensitive(dialog, GTK_RESPONSE_APPLY, set); + + if (set) { + gtk_widget_set_sensitive(gjv->job_combo, 0); + gtk_widget_set_tooltip_text(gjv->job_combo, "Apply option changes before switching to a new job"); + } else { + gtk_widget_set_sensitive(gjv->job_combo, 1); + gtk_widget_set_tooltip_text(gjv->job_combo, "Change current job"); + } +} + +static void gopt_changed(struct gopt *gopt) +{ + struct gopt_job_view *gjv = gopt->gjv; + + if (gjv->in_job_switch) + return; + + /* + * Add to changed list. This also prevents the option from being + * freed when the widget is destroyed. + */ + if (flist_empty(&gopt->changed_list)) { + flist_add_tail(&gopt->changed_list, &gjv->changed_list); + gopt_dialog_update_apply_button(gjv); + } +} + +static void gopt_str_changed(GtkEntry *entry, gpointer data) +{ + struct gopt_str *s = (struct gopt_str *) data; + struct fio_option *o = &fio_options[s->gopt.opt_index]; + const gchar *text; + int set; + + gopt_changed(&s->gopt); + + text = gtk_entry_get_text(GTK_ENTRY(s->entry)); + set = strcmp(text, "") != 0; + + gopt_set_children_visible(s->gopt.gjv, o, set); +} + +static void gopt_str_destroy(GtkWidget *w, gpointer data) +{ + struct gopt_str *s = (struct gopt_str *) data; + + free(s); + gtk_widget_destroy(w); +} + +static void gopt_str_store_set_val(struct gopt_str *s, const char *text) +{ + if (text) + gtk_entry_set_text(GTK_ENTRY(s->entry), text); +} + +static struct gopt *gopt_new_str_store(struct gopt_job_view *gjv, + struct fio_option *o, const char *text, + unsigned int idx) +{ + struct gopt_str *s; + GtkWidget *label; + + s = calloc(1, sizeof(*s)); + + s->gopt.box = gtk_hbox_new(FALSE, 3); + if (!o->lname) + label = gtk_label_new(o->name); + else + label = gtk_label_new(o->lname); + + s->entry = gtk_entry_new(); + gopt_mark_index(gjv, &s->gopt, idx, GOPT_STR); + gtk_editable_set_editable(GTK_EDITABLE(s->entry), 1); + + if (text) + gopt_str_store_set_val(s, text); + else if (o->def) + gopt_str_store_set_val(s, o->def); + + s->gopt.sig_handler = g_signal_connect(G_OBJECT(s->entry), "changed", G_CALLBACK(gopt_str_changed), s); + g_signal_connect(G_OBJECT(s->entry), "destroy", G_CALLBACK(gopt_str_destroy), s); + + gtk_box_pack_start(GTK_BOX(s->gopt.box), s->entry, FALSE, FALSE, 0); + gtk_box_pack_start(GTK_BOX(s->gopt.box), label, FALSE, FALSE, 0); + return &s->gopt; +} + +static void gopt_combo_changed(GtkComboBox *box, gpointer data) 
+{ + struct gopt_combo *c = (struct gopt_combo *) data; + struct fio_option *o = &fio_options[c->gopt.opt_index]; + unsigned int index; + + gopt_changed(&c->gopt); + + index = gtk_combo_box_get_active(GTK_COMBO_BOX(c->combo)); + + gopt_set_children_visible(c->gopt.gjv, o, index); +} + +static void gopt_combo_destroy(GtkWidget *w, gpointer data) +{ + struct gopt_combo *c = (struct gopt_combo *) data; + + free(c); + gtk_widget_destroy(w); +} + +static struct gopt_combo *__gopt_new_combo(struct gopt_job_view *gjv, + struct fio_option *o, + unsigned int idx, int type) +{ + struct gopt_combo *c; + GtkWidget *label; + + c = calloc(1, sizeof(*c)); + + c->gopt.box = gtk_hbox_new(FALSE, 3); + if (!o->lname) + label = gtk_label_new(o->name); + else + label = gtk_label_new(o->lname); + + c->combo = gtk_combo_box_text_new(); + gopt_mark_index(gjv, &c->gopt, idx, type); + g_signal_connect(G_OBJECT(c->combo), "destroy", G_CALLBACK(gopt_combo_destroy), c); + + gtk_box_pack_start(GTK_BOX(c->gopt.box), c->combo, FALSE, FALSE, 0); + gtk_box_pack_start(GTK_BOX(c->gopt.box), label, FALSE, FALSE, 0); + + return c; +} + +static void gopt_combo_str_set_val(struct gopt_combo *c, const char *text) +{ + struct fio_option *o = &fio_options[c->gopt.opt_index]; + struct value_pair *vp; + int i; + + i = 0; + vp = &o->posval[0]; + while (vp->ival) { + if (!strcmp(vp->ival, text)) { + gtk_combo_box_set_active(GTK_COMBO_BOX(c->combo), i); + break; + } + vp++; + i++; + } +} + +static struct gopt *gopt_new_combo_str(struct gopt_job_view *gjv, + struct fio_option *o, const char *text, + unsigned int idx) +{ + struct gopt_combo *c; + struct value_pair *vp; + int i, active = 0; + + c = __gopt_new_combo(gjv, o, idx, GOPT_COMBO_STR); + + i = 0; + vp = &o->posval[0]; + while (vp->ival) { + gtk_combo_box_text_append_text(GTK_COMBO_BOX_TEXT(c->combo), vp->ival); + if (o->def && !strcmp(vp->ival, o->def)) + active = i; + vp++; + i++; + } + + gtk_combo_box_set_active(GTK_COMBO_BOX(c->combo), active); + if (text) + gopt_combo_str_set_val(c, text); + c->gopt.sig_handler = g_signal_connect(G_OBJECT(c->combo), "changed", G_CALLBACK(gopt_combo_changed), c); + return &c->gopt; +} + +static void gopt_combo_int_set_val(struct gopt_combo *c, unsigned int ip) +{ + struct fio_option *o = &fio_options[c->gopt.opt_index]; + struct value_pair *vp; + int i; + + i = 0; + vp = &o->posval[0]; + while (vp->ival) { + if (vp->oval == ip) { + gtk_combo_box_set_active(GTK_COMBO_BOX(c->combo), i); + break; + } + vp++; + i++; + } +} + +static struct gopt *gopt_new_combo_int(struct gopt_job_view *gjv, + struct fio_option *o, unsigned int *ip, + unsigned int idx) +{ + struct gopt_combo *c; + struct value_pair *vp; + int i, active = 0; + + c = __gopt_new_combo(gjv, o, idx, GOPT_COMBO_INT); + + i = 0; + vp = &o->posval[0]; + while (vp->ival) { + gtk_combo_box_text_append_text(GTK_COMBO_BOX_TEXT(c->combo), vp->ival); + if (ip && vp->oval == *ip) + active = i; + vp++; + i++; + } + + gtk_combo_box_set_active(GTK_COMBO_BOX(c->combo), active); + if (ip) + gopt_combo_int_set_val(c, *ip); + c->gopt.sig_handler = g_signal_connect(G_OBJECT(c->combo), "changed", G_CALLBACK(gopt_combo_changed), c); + return &c->gopt; +} + +static void gopt_str_multi_toggled(GtkToggleButton *button, gpointer data) +{ + struct gopt_str_multi *m = (struct gopt_str_multi *) data; + + gopt_changed(&m->gopt); +} + +static void gopt_str_multi_destroy(GtkWidget *w, gpointer data) +{ + struct gopt_str_multi *m = (struct gopt_str_multi *) data; + + free(m); + gtk_widget_destroy(w); +} + +static void 
gopt_str_multi_set_val(struct gopt_str_multi *m, int val) +{ +} + +static struct gopt *gopt_new_str_multi(struct gopt_job_view *gjv, + struct fio_option *o, unsigned int idx) +{ + struct gopt_str_multi *m; + struct value_pair *vp; + GtkWidget *frame, *hbox; + int i; + + m = calloc(1, sizeof(*m)); + m->gopt.box = gtk_hbox_new(FALSE, 3); + gopt_mark_index(gjv, &m->gopt, idx, GOPT_STR_MULTI); + + if (!o->lname) + frame = gtk_frame_new(o->name); + else + frame = gtk_frame_new(o->lname); + gtk_box_pack_start(GTK_BOX(m->gopt.box), frame, FALSE, FALSE, 3); + + hbox = gtk_hbox_new(FALSE, 3); + gtk_container_add(GTK_CONTAINER(frame), hbox); + + i = 0; + vp = &o->posval[0]; + while (vp->ival) { + m->checks[i] = gtk_check_button_new_with_label(vp->ival); + gtk_widget_set_tooltip_text(m->checks[i], vp->help); + gtk_box_pack_start(GTK_BOX(hbox), m->checks[i], FALSE, FALSE, 3); + g_signal_connect(G_OBJECT(m->checks[i]), "toggled", G_CALLBACK(gopt_str_multi_toggled), m); + vp++; + i++; + } + + gopt_str_multi_set_val(m, 0); + g_signal_connect(G_OBJECT(m->gopt.box), "destroy", G_CALLBACK(gopt_str_multi_destroy), m); + return &m->gopt; +} + +static void gopt_int_changed(GtkSpinButton *spin, gpointer data) +{ + struct gopt_int *i = (struct gopt_int *) data; + struct fio_option *o = &fio_options[i->gopt.opt_index]; + GtkAdjustment *adj; + int value, delta; + + gopt_changed(&i->gopt); + + adj = gtk_spin_button_get_adjustment(spin); + value = gtk_adjustment_get_value(adj); + delta = value - i->lastval; + i->lastval = value; + + if (o->inv_opt) { + struct gopt *b_inv = o->inv_opt->gui_data; + struct gopt_int *i_inv = container_of(b_inv, struct gopt_int, gopt); + int cur_val; + + assert(o->type == o->inv_opt->type); + + cur_val = gtk_spin_button_get_value(GTK_SPIN_BUTTON(i_inv->spin)); + cur_val -= delta; + g_signal_handler_block(G_OBJECT(i_inv->spin), i_inv->gopt.sig_handler); + gtk_spin_button_set_value(GTK_SPIN_BUTTON(i_inv->spin), cur_val); + g_signal_handler_unblock(G_OBJECT(i_inv->spin), i_inv->gopt.sig_handler); + } +} + +static void gopt_int_destroy(GtkWidget *w, gpointer data) +{ + struct gopt_int *i = (struct gopt_int *) data; + + free(i); + gtk_widget_destroy(w); +} + +static void gopt_int_set_val(struct gopt_int *i, unsigned long long p) +{ + gtk_spin_button_set_value(GTK_SPIN_BUTTON(i->spin), p); + i->lastval = p; +} + +static struct gopt_int *__gopt_new_int(struct gopt_job_view *gjv, + struct fio_option *o, + unsigned long long *p, unsigned int idx) +{ + unsigned long long defval; + struct gopt_int *i; + guint maxval, interval; + GtkWidget *label; + + i = calloc(1, sizeof(*i)); + i->gopt.box = gtk_hbox_new(FALSE, 3); + if (!o->lname) + label = gtk_label_new(o->name); + else + label = gtk_label_new(o->lname); + + maxval = o->maxval; + if (!maxval) + maxval = UINT_MAX; + + defval = 0; + if (p) + defval = *p; + else if (o->def) { + long long val; + + check_str_bytes(o->def, &val, o); + defval = val; + } + + interval = 1.0; + if (o->interval) + interval = o->interval; + + i->spin = gtk_spin_button_new_with_range(o->minval, maxval, interval); + gopt_mark_index(gjv, &i->gopt, idx, GOPT_INT); + gtk_spin_button_set_update_policy(GTK_SPIN_BUTTON(i->spin), GTK_UPDATE_IF_VALID); + if (p) + gopt_int_set_val(i, *p); + else + gopt_int_set_val(i, defval); + i->gopt.sig_handler = g_signal_connect(G_OBJECT(i->spin), "value-changed", G_CALLBACK(gopt_int_changed), i); + g_signal_connect(G_OBJECT(i->spin), "destroy", G_CALLBACK(gopt_int_destroy), i); + + gtk_box_pack_start(GTK_BOX(i->gopt.box), i->spin, FALSE, FALSE, 
0); + gtk_box_pack_start(GTK_BOX(i->gopt.box), label, FALSE, FALSE, 0); + + return i; +} + +static struct gopt *gopt_new_int(struct gopt_job_view *gjv, + struct fio_option *o, unsigned int *ip, + unsigned int idx) +{ + unsigned long long ullp; + struct gopt_int *i; + + if (ip) { + ullp = *ip; + i = __gopt_new_int(gjv, o, &ullp, idx); + } else + i = __gopt_new_int(gjv, o, NULL, idx); + + return &i->gopt; +} + +static struct gopt *gopt_new_ullong(struct gopt_job_view *gjv, + struct fio_option *o, unsigned long long *p, + unsigned int idx) +{ + struct gopt_int *i; + + i = __gopt_new_int(gjv, o, p, idx); + return &i->gopt; +} + +static void gopt_bool_toggled(GtkToggleButton *button, gpointer data) +{ + struct gopt_bool *b = (struct gopt_bool *) data; + struct fio_option *o = &fio_options[b->gopt.opt_index]; + gboolean set; + + gopt_changed(&b->gopt); + + set = gtk_toggle_button_get_active(GTK_TOGGLE_BUTTON(b->check)); + + if (o->inv_opt) { + struct gopt *g_inv = o->inv_opt->gui_data; + struct gopt_bool *b_inv = container_of(g_inv, struct gopt_bool, gopt); + + assert(o->type == o->inv_opt->type); + + g_signal_handler_block(G_OBJECT(b_inv->check), b_inv->gopt.sig_handler); + gtk_toggle_button_set_active(GTK_TOGGLE_BUTTON(b_inv->check), !set); + g_signal_handler_unblock(G_OBJECT(b_inv->check), b_inv->gopt.sig_handler); + } + + gopt_set_children_visible(b->gopt.gjv, o, set); +} + +static void gopt_bool_destroy(GtkWidget *w, gpointer data) +{ + struct gopt_bool *b = (struct gopt_bool *) data; + + free(b); + gtk_widget_destroy(w); +} + +static void gopt_bool_set_val(struct gopt_bool *b, unsigned int val) +{ + gtk_toggle_button_set_active(GTK_TOGGLE_BUTTON(b->check), val); +} + +static struct gopt *gopt_new_bool(struct gopt_job_view *gjv, + struct fio_option *o, unsigned int *val, + unsigned int idx) +{ + struct gopt_bool *b; + GtkWidget *label; + int defstate = 0; + + b = calloc(1, sizeof(*b)); + b->gopt.box = gtk_hbox_new(FALSE, 3); + if (!o->lname) + label = gtk_label_new(o->name); + else + label = gtk_label_new(o->lname); + + b->check = gtk_check_button_new(); + gopt_mark_index(gjv, &b->gopt, idx, GOPT_BOOL); + if (o->def && !strcmp(o->def, "1")) + defstate = 1; + + if (o->neg) + defstate = !defstate; + + if (val) + gopt_bool_set_val(b, *val); + else + gtk_toggle_button_set_active(GTK_TOGGLE_BUTTON(b->check), defstate); + b->gopt.sig_handler = g_signal_connect(G_OBJECT(b->check), "toggled", G_CALLBACK(gopt_bool_toggled), b); + g_signal_connect(G_OBJECT(b->check), "destroy", G_CALLBACK(gopt_bool_destroy), b); + + gtk_box_pack_start(GTK_BOX(b->gopt.box), b->check, FALSE, FALSE, 0); + gtk_box_pack_start(GTK_BOX(b->gopt.box), label, FALSE, FALSE, 0); + return &b->gopt; +} + +/* + * These are paired 0/1 and 2/3. 0/2 are min values, 1/3 are max values. + * If the max is made smaller than min, adjust min down. + * If the min is made larger than max, adjust the max. 
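+ * Example: with a min/max pair in spins[0]/spins[1], raising spins[0]
+ * past spins[1] pushes spins[1] up, and lowering spins[1] below
+ * spins[0] drags spins[0] down with it.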
+ */ +static void range_value_changed(GtkSpinButton *spin, gpointer data) +{ + struct gopt_range *r = (struct gopt_range *) data; + int changed = -1, i; + gint val, mval; + + gopt_changed(&r->gopt); + + for (i = 0; i < GOPT_RANGE_SPIN; i++) { + if (GTK_SPIN_BUTTON(r->spins[i]) == spin) { + changed = i; + break; + } + } + + assert(changed != -1); + + /* + * Min changed + */ + if (changed == 0 || changed == 2) { + GtkWidget *mspin = r->spins[changed + 1]; + + val = gtk_spin_button_get_value_as_int(GTK_SPIN_BUTTON(r->spins[changed])); + mval = gtk_spin_button_get_value_as_int(GTK_SPIN_BUTTON(mspin)); + if (val > mval) + gtk_spin_button_set_value(GTK_SPIN_BUTTON(mspin), val); + } else { + GtkWidget *mspin = r->spins[changed - 1]; + + val = gtk_spin_button_get_value_as_int(GTK_SPIN_BUTTON(r->spins[changed])); + mval = gtk_spin_button_get_value_as_int(GTK_SPIN_BUTTON(mspin)); + if (val < mval) + gtk_spin_button_set_value(GTK_SPIN_BUTTON(mspin), val); + } +} + +static void gopt_range_destroy(GtkWidget *w, gpointer data) +{ + struct gopt_range *r = (struct gopt_range *) data; + + free(r); + gtk_widget_destroy(w); +} + +static void gopt_int_range_set_val(struct gopt_range *r, unsigned int *vals) +{ + int i; + + for (i = 0; i < GOPT_RANGE_SPIN; i++) + gtk_spin_button_set_value(GTK_SPIN_BUTTON(r->spins[i]), vals[i]); +} + +static struct gopt *gopt_new_int_range(struct gopt_job_view *gjv, + struct fio_option *o, unsigned int **ip, + unsigned int idx) +{ + struct gopt_range *r; + GtkWidget *label; + guint interval; + unsigned int defvals[GOPT_RANGE_SPIN]; + gint maxval; + int i; + + r = calloc(1, sizeof(*r)); + r->gopt.box = gtk_hbox_new(FALSE, 3); + gopt_mark_index(gjv, &r->gopt, idx, GOPT_RANGE); + if (!o->lname) + label = gtk_label_new(o->name); + else + label = gtk_label_new(o->lname); + + maxval = o->maxval; + if (!maxval) + maxval = INT_MAX; + + memset(defvals, 0, sizeof(defvals)); + if (o->def) { + long long val; + + check_str_bytes(o->def, &val, o); + for (i = 0; i < GOPT_RANGE_SPIN; i++) + defvals[i] = val; + } + + interval = 1.0; + if (o->interval) + interval = o->interval; + + for (i = 0; i < GOPT_RANGE_SPIN; i++) { + r->spins[i] = gtk_spin_button_new_with_range(o->minval, maxval, interval); + gtk_spin_button_set_update_policy(GTK_SPIN_BUTTON(r->spins[i]), GTK_UPDATE_IF_VALID); + gtk_box_pack_start(GTK_BOX(r->gopt.box), r->spins[i], FALSE, FALSE, 0); + } + + if (ip) + gopt_int_range_set_val(r, *ip); + else + gopt_int_range_set_val(r, defvals); + + for (i = 0; i < GOPT_RANGE_SPIN; i++) + g_signal_connect(G_OBJECT(r->spins[i]), "value-changed", G_CALLBACK(range_value_changed), r); + + gtk_box_pack_start(GTK_BOX(r->gopt.box), label, FALSE, FALSE, 0); + g_signal_connect(G_OBJECT(r->gopt.box), "destroy", G_CALLBACK(gopt_range_destroy), r); + return &r->gopt; +} + +static void gopt_str_val_destroy(GtkWidget *w, gpointer data) +{ + struct gopt_str_val *g = (struct gopt_str_val *) data; + + free(g); + gtk_widget_destroy(w); +} + +static void gopt_str_val_spin_wrapped(GtkSpinButton *spin, gpointer data) +{ + struct gopt_str_val *g = (struct gopt_str_val *) data; + unsigned int val; + GtkAdjustment *adj; + gint index; + + adj = gtk_spin_button_get_adjustment(spin); + val = gtk_adjustment_get_value(adj); + + /* + * Can't rely on exact value, as fast changes increment >= 1 + */ + if (!val) { + index = gtk_combo_box_get_active(GTK_COMBO_BOX(g->combo)); + if (index + 1 <= g->maxindex) { + val = 1; + gtk_combo_box_set_active(GTK_COMBO_BOX(g->combo), ++index); + } else + val = 1023; + 
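+		/*
+		 * Wrapping up from 1023 carries into the unit combo (e.g.
+		 * KiB -> MiB); with no larger unit left, clamp back to 1023.
+		 */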
gtk_spin_button_set_value(spin, val); + } else { + index = gtk_combo_box_get_active(GTK_COMBO_BOX(g->combo)); + if (index) { + gtk_combo_box_set_active(GTK_COMBO_BOX(g->combo), --index); + gtk_spin_button_set_value(spin, 1023); + } else + gtk_spin_button_set_value(spin, 0); + } +} + +static void gopt_str_val_changed(GtkSpinButton *spin, gpointer data) +{ + struct gopt_str_val *g = (struct gopt_str_val *) data; + + gopt_changed(&g->gopt); +} + +static void gopt_str_val_set_val(struct gopt_str_val *g, unsigned long long val) +{ + int i = 0; + + do { + if (!val || (val % 1024)) + break; + + i++; + val /= 1024; + } while (1); + + gtk_spin_button_set_value(GTK_SPIN_BUTTON(g->spin), val); + gtk_combo_box_set_active(GTK_COMBO_BOX(g->combo), i); +} + +static struct gopt *gopt_new_str_val(struct gopt_job_view *gjv, + struct fio_option *o, + unsigned long long *p, unsigned int idx) +{ + struct gopt_str_val *g; + const gchar *postfix[] = { "B", "KiB", "MiB", "GiB", "TiB", "PiB", "" }; + GtkWidget *label; + int i; + + g = calloc(1, sizeof(*g)); + g->gopt.box = gtk_hbox_new(FALSE, 3); + if (!o->lname) + label = gtk_label_new(o->name); + else + label = gtk_label_new(o->lname); + gopt_mark_index(gjv, &g->gopt, idx, GOPT_STR_VAL); + + g->spin = gtk_spin_button_new_with_range(0.0, 1023.0, 1.0); + gtk_spin_button_set_update_policy(GTK_SPIN_BUTTON(g->spin), GTK_UPDATE_IF_VALID); + gtk_spin_button_set_value(GTK_SPIN_BUTTON(g->spin), 0); + gtk_spin_button_set_wrap(GTK_SPIN_BUTTON(g->spin), 1); + gtk_box_pack_start(GTK_BOX(g->gopt.box), g->spin, FALSE, FALSE, 0); + g_signal_connect(G_OBJECT(g->spin), "wrapped", G_CALLBACK(gopt_str_val_spin_wrapped), g); + g_signal_connect(G_OBJECT(g->spin), "changed", G_CALLBACK(gopt_str_val_changed), g); + + g->combo = gtk_combo_box_text_new(); + i = 0; + while (strlen(postfix[i])) { + gtk_combo_box_text_append_text(GTK_COMBO_BOX_TEXT(g->combo), postfix[i]); + i++; + } + g->maxindex = i - 1; + gtk_combo_box_set_active(GTK_COMBO_BOX(g->combo), 0); + gtk_box_pack_start(GTK_BOX(g->gopt.box), g->combo, FALSE, FALSE, 0); + gtk_box_pack_start(GTK_BOX(g->gopt.box), label, FALSE, FALSE, 3); + + if (p) + gopt_str_val_set_val(g, *p); + + g_signal_connect(G_OBJECT(g->combo), "changed", G_CALLBACK(gopt_str_val_changed), g); + + g_signal_connect(G_OBJECT(g->gopt.box), "destroy", G_CALLBACK(gopt_str_val_destroy), g); + return &g->gopt; +} + +static void gopt_set_option(struct gopt_job_view *gjv, struct fio_option *o, + struct gopt *gopt, struct thread_options *to) +{ + switch (o->type) { + case FIO_OPT_STR_VAL: { + unsigned long long *ullp = NULL; + struct gopt_str_val *g; + + if (o->off1) + ullp = td_var(to, o, o->off1); + + g = container_of(gopt, struct gopt_str_val, gopt); + if (ullp) + gopt_str_val_set_val(g, *ullp); + break; + } + case FIO_OPT_STR_VAL_TIME: { + unsigned long long *ullp = NULL; + struct gopt_int *i; + + if (o->off1) + ullp = td_var(to, o, o->off1); + + i = container_of(gopt, struct gopt_int, gopt); + if (ullp) + gopt_int_set_val(i, *ullp); + break; + } + case FIO_OPT_INT: + if (o->posval[0].ival) { + unsigned int *ip = NULL; + struct gopt_combo *c; + + if (o->off1) + ip = td_var(to, o, o->off1); + + c = container_of(gopt, struct gopt_combo, gopt); + if (ip) + gopt_combo_int_set_val(c, *ip); + } else { + unsigned int *ip = NULL; + struct gopt_int *i; + + if (o->off1) + ip = td_var(to, o, o->off1); + + i = container_of(gopt, struct gopt_int, gopt); + if (ip) + gopt_int_set_val(i, *ip); + } + break; + case FIO_OPT_STR_SET: + case FIO_OPT_BOOL: { + unsigned int *ip = NULL;
+ struct gopt_bool *b; + + if (o->off1) + ip = td_var(to, o, o->off1); + + b = container_of(gopt, struct gopt_bool, gopt); + if (ip) + gopt_bool_set_val(b, *ip); + break; + } + case FIO_OPT_STR: { + if (o->posval[0].ival) { + unsigned int *ip = NULL; + struct gopt_combo *c; + + if (o->off1) + ip = td_var(to, o, o->off1); + + c = container_of(gopt, struct gopt_combo, gopt); + if (ip) + gopt_combo_int_set_val(c, *ip); + } else { + struct gopt_str *s; + char *text = NULL; + + if (o->off1) { + char **p = td_var(to, o, o->off1); + + text = *p; + } + + s = container_of(gopt, struct gopt_str, gopt); + gopt_str_store_set_val(s, text); + } + + break; + } + case FIO_OPT_STR_STORE: { + struct gopt_combo *c; + char *text = NULL; + + if (o->off1) { + char **p = td_var(to, o, o->off1); + text = *p; + } + + if (!o->posval[0].ival) { + struct gopt_str *s; + + s = container_of(gopt, struct gopt_str, gopt); + gopt_str_store_set_val(s, text); + break; + } + + c = container_of(gopt, struct gopt_combo, gopt); + if (text) + gopt_combo_str_set_val(c, text); + break; + } + case FIO_OPT_STR_MULTI: + /* HANDLE ME */ + break; + case FIO_OPT_RANGE: { + struct gopt_range *r; + unsigned int *ip[4] = { td_var(to, o, o->off1), + td_var(to, o, o->off2), + td_var(to, o, o->off3), + td_var(to, o, o->off4) }; + + r = container_of(gopt, struct gopt_range, gopt); + gopt_int_range_set_val(r, *ip); + break; + } + /* still need to handle this one */ + case FIO_OPT_FLOAT_LIST: + break; + case FIO_OPT_DEPRECATED: + break; + default: + printf("ignore type %u\n", o->type); + break; + } +} + +static void gopt_add_option(struct gopt_job_view *gjv, GtkWidget *hbox, + struct fio_option *o, unsigned int opt_index, + struct thread_options *to) +{ + struct gopt *go = NULL; + + switch (o->type) { + case FIO_OPT_STR_VAL: { + unsigned long long *ullp = NULL; + + if (o->off1) + ullp = td_var(to, o, o->off1); + + go = gopt_new_str_val(gjv, o, ullp, opt_index); + break; + } + case FIO_OPT_STR_VAL_TIME: { + unsigned long long *ullp = NULL; + + if (o->off1) + ullp = td_var(to, o, o->off1); + + go = gopt_new_ullong(gjv, o, ullp, opt_index); + break; + } + case FIO_OPT_INT: + if (o->posval[0].ival) { + unsigned int *ip = NULL; + + if (o->off1) + ip = td_var(to, o, o->off1); + + go = gopt_new_combo_int(gjv, o, ip, opt_index); + } else { + unsigned int *ip = NULL; + + if (o->off1) + ip = td_var(to, o, o->off1); + + go = gopt_new_int(gjv, o, ip, opt_index); + } + break; + case FIO_OPT_STR_SET: + case FIO_OPT_BOOL: { + unsigned int *ip = NULL; + + if (o->off1) + ip = td_var(to, o, o->off1); + + go = gopt_new_bool(gjv, o, ip, opt_index); + break; + } + case FIO_OPT_STR: { + if (o->posval[0].ival) { + unsigned int *ip = NULL; + + if (o->off1) + ip = td_var(to, o, o->off1); + + go = gopt_new_combo_int(gjv, o, ip, opt_index); + } else { + /* TODO: usually ->cb, or unsigned int pointer */ + go = gopt_new_str_store(gjv, o, NULL, opt_index); + } + + break; + } + case FIO_OPT_STR_STORE: { + char *text = NULL; + + if (o->off1) { + char **p = td_var(to, o, o->off1); + text = *p; + } + + if (!o->posval[0].ival) { + go = gopt_new_str_store(gjv, o, text, opt_index); + break; + } + + go = gopt_new_combo_str(gjv, o, text, opt_index); + break; + } + case FIO_OPT_STR_MULTI: + go = gopt_new_str_multi(gjv, o, opt_index); + break; + case FIO_OPT_RANGE: { + unsigned int *ip[4] = { td_var(to, o, o->off1), + td_var(to, o, o->off2), + td_var(to, o, o->off3), + td_var(to, o, o->off4) }; + + go = gopt_new_int_range(gjv, o, ip, opt_index); + break; + } + /* still need to handle 
this one */ + case FIO_OPT_FLOAT_LIST: + break; + case FIO_OPT_DEPRECATED: + break; + default: + printf("ignore type %u\n", o->type); + break; + } + + if (go) { + GtkWidget *dest; + + if (o->help) + gtk_widget_set_tooltip_text(go->box, o->help); + + o->gui_data = go; + + dest = gopt_get_group_frame(gjv, hbox, o->group); + if (!dest) + gtk_box_pack_start(GTK_BOX(hbox), go->box, FALSE, FALSE, 5); + else + gtk_box_pack_start(GTK_BOX(dest), go->box, FALSE, FALSE, 5); + } +} + +static void gopt_add_options(struct gopt_job_view *gjv, + struct thread_options *to) +{ + GtkWidget *hbox = NULL; + int i; + + /* + * First add all options + */ + for (i = 0; fio_options[i].name; i++) { + struct fio_option *o = &fio_options[i]; + uint64_t mask = o->category; + const struct opt_group *og; + + while ((og = opt_group_from_mask(&mask)) != NULL) { + GtkWidget *vbox = gjv->vboxes[ffz64(~og->mask)]; + + hbox = gtk_hbox_new(FALSE, 3); + gtk_box_pack_start(GTK_BOX(vbox), hbox, FALSE, FALSE, 5); + gopt_add_option(gjv, hbox, o, i, to); + } + } +} + +static void gopt_set_options(struct gopt_job_view *gjv, + struct thread_options *to) +{ + int i; + + for (i = 0; fio_options[i].name; i++) { + struct fio_option *o = &fio_options[i]; + struct gopt *gopt = gjv->gopts[i]; + + gopt_set_option(gjv, o, gopt, to); + } +} + +static GtkWidget *gopt_add_tab(GtkWidget *notebook, const char *name) +{ + GtkWidget *box, *vbox, *scroll; + + scroll = gtk_scrolled_window_new(NULL, NULL); + gtk_container_set_border_width(GTK_CONTAINER(scroll), 5); + gtk_scrolled_window_set_policy(GTK_SCROLLED_WINDOW(scroll), GTK_POLICY_AUTOMATIC, GTK_POLICY_AUTOMATIC); + + vbox = gtk_vbox_new(FALSE, 3); + box = gtk_hbox_new(FALSE, 0); + gtk_box_pack_start(GTK_BOX(vbox), box, FALSE, FALSE, 5); + gtk_scrolled_window_add_with_viewport(GTK_SCROLLED_WINDOW(scroll), vbox); + gtk_notebook_append_page(GTK_NOTEBOOK(notebook), scroll, gtk_label_new(name)); + return vbox; +} + +static GtkWidget *gopt_add_group_tab(GtkWidget *notebook, + const struct opt_group *og) +{ + return gopt_add_tab(notebook, og->name); +} + +static void gopt_add_group_tabs(GtkWidget *notebook, struct gopt_job_view *gjv) +{ + const struct opt_group *og; + unsigned int i; + + i = 0; + do { + uint64_t mask = (1ULL << i); + + og = opt_group_from_mask(&mask); + if (!og) + break; + gjv->vboxes[i] = gopt_add_group_tab(notebook, og); + i++; + } while (1); +} + +static void gopt_handle_str_multi_changed(struct gopt_job_view *gjv, + struct gopt_str_multi *m, + struct fio_option *o) +{ + unsigned int *ip = td_var(gjv->o, o, o->off1); + struct value_pair *vp; + gboolean set; + guint val = 0; + int i; + + i = 0; + vp = &o->posval[0]; + while (vp->ival) { + if (!m->checks[i]) + break; + set = gtk_toggle_button_get_active(GTK_TOGGLE_BUTTON(m->checks[i])); + if (set) { + if (vp->orval) + val |= vp->oval; + else + val = vp->oval; + } + i++; + vp++; + } + + if (o->off1) + *ip = val; +} + +static void gopt_handle_range_changed(struct gopt_job_view *gjv, + struct gopt_range *r, + struct fio_option *o) +{ + unsigned int *ip[4] = { td_var(gjv->o, o, o->off1), + td_var(gjv->o, o, o->off2), + td_var(gjv->o, o, o->off3), + td_var(gjv->o, o, o->off4) }; + gint val; + int i; + + for (i = 0; i < GOPT_RANGE_SPIN; i++) { + val = gtk_spin_button_get_value_as_int(GTK_SPIN_BUTTON(r->spins[i])); + *ip[i] = val; + } +} + +static void gopt_handle_str_val_changed(struct gopt_job_view *gjv, + struct gopt_str_val *s, + struct fio_option *o) +{ + unsigned long long *ullp = td_var(gjv->o, o, o->off1); + GtkAdjustment *adj; + gint 
index; + + if (!ullp) + return; + + /* + * Numerical value + */ + adj = gtk_spin_button_get_adjustment(GTK_SPIN_BUTTON(s->spin)); + *ullp = gtk_adjustment_get_value(adj); + + /* + * Multiplier + */ + index = gtk_combo_box_get_active(GTK_COMBO_BOX(s->combo)); + while (index--) + *ullp *= 1024ULL; +} + +static void gopt_handle_str_changed(struct gopt_job_view *gjv, + struct gopt_str *s, struct fio_option *o) +{ + char **p = td_var(gjv->o, o, o->off1); + + if (*p) + free(*p); + + *p = strdup(gtk_entry_get_text(GTK_ENTRY(s->entry))); +} + +static void gopt_handle_bool_changed(struct gopt_job_view *gjv, + struct gopt_bool *b, struct fio_option *o) +{ + unsigned int *ip = td_var(gjv->o, o, o->off1); + gboolean set; + + set = gtk_toggle_button_get_active(GTK_TOGGLE_BUTTON(b->check)); + *ip = set; +} + +static void gopt_handle_int_changed(struct gopt_job_view *gjv, + struct gopt_int *i, struct fio_option *o) +{ + unsigned int *ip = td_var(gjv->o, o, o->off1); + GtkAdjustment *adj; + guint val; + + adj = gtk_spin_button_get_adjustment(GTK_SPIN_BUTTON(i->spin)); + val = gtk_adjustment_get_value(adj); + *ip = val; +} + +static void gopt_handle_combo_str_changed(struct gopt_job_view *gjv, + struct gopt_combo *c, + struct fio_option *o) +{ + char **p = td_var(gjv->o, o, o->off1); + + if (*p) + free(*p); + + *p = strdup(gtk_combo_box_text_get_active_text(GTK_COMBO_BOX_TEXT(c->combo))); +} + +static void gopt_handle_combo_int_changed(struct gopt_job_view *gjv, + struct gopt_combo *c, + struct fio_option *o) +{ + unsigned int *ip = td_var(gjv->o, o, o->off1); + gint index; + + index = gtk_combo_box_get_active(GTK_COMBO_BOX(c->combo)); + *ip = o->posval[index].oval; +} + +static void gopt_handle_changed(struct gopt *gopt) +{ + struct fio_option *o = &fio_options[gopt->opt_index]; + struct gopt_job_view *gjv = gopt->gjv; + + switch (gopt->opt_type) { + case GOPT_COMBO_INT: { + struct gopt_combo *c; + + c = container_of(gopt, struct gopt_combo, gopt); + gopt_handle_combo_int_changed(gjv, c, o); + break; + } + case GOPT_COMBO_STR: { + struct gopt_combo *c; + + c = container_of(gopt, struct gopt_combo, gopt); + gopt_handle_combo_str_changed(gjv, c, o); + break; + } + case GOPT_INT: { + struct gopt_int *i; + + i = container_of(gopt, struct gopt_int, gopt); + gopt_handle_int_changed(gjv, i, o); + break; + } + case GOPT_BOOL: { + struct gopt_bool *b; + + b = container_of(gopt, struct gopt_bool, gopt); + gopt_handle_bool_changed(gjv, b, o); + break; + } + case GOPT_STR: { + struct gopt_str *s; + + s = container_of(gopt, struct gopt_str, gopt); + gopt_handle_str_changed(gjv, s, o); + break; + } + case GOPT_STR_VAL: { + struct gopt_str_val *s; + + s = container_of(gopt, struct gopt_str_val, gopt); + gopt_handle_str_val_changed(gjv, s, o); + break; + } + case GOPT_RANGE: { + struct gopt_range *r; + + r = container_of(gopt, struct gopt_range, gopt); + gopt_handle_range_changed(gjv, r, o); + break; + } + case GOPT_STR_MULTI: { + struct gopt_str_multi *m; + + m = container_of(gopt, struct gopt_str_multi, gopt); + gopt_handle_str_multi_changed(gjv, m, o); + break; + } + default: + log_err("gfio: bad option type: %d\n", gopt->opt_type); + break; + } +} + +static void gopt_report_update_status(struct gopt_job_view *gjv) +{ + struct gfio_client *gc = gjv->client; + char tmp[80]; + + sprintf(tmp, "\nCompleted with error: %d\n", gc->update_job_status); + gfio_report_info(gc->ge->ui, "Update job", tmp); +} + +static int gopt_handle_changed_options(struct gopt_job_view *gjv) +{ + struct gfio_client *gc = gjv->client; + struct 
flist_head *entry; + uint64_t waitid = 0; + struct gopt *gopt; + int ret; + + flist_for_each(entry, &gjv->changed_list) { + gopt = flist_entry(entry, struct gopt, changed_list); + gopt_handle_changed(gopt); + } + + gc->update_job_status = 0; + gc->update_job_done = 0; + + ret = fio_client_update_options(gc->client, gjv->o, &waitid); + if (ret) + goto done; + + ret = fio_client_wait_for_reply(gc->client, waitid); + if (ret) + goto done; + + assert(gc->update_job_done); + if (gc->update_job_status) + goto done; + + while (!flist_empty(&gjv->changed_list)) { + gopt = flist_first_entry(&gjv->changed_list, struct gopt, changed_list); + flist_del_init(&gopt->changed_list); + } + +done: + gopt_dialog_update_apply_button(gjv); + return ret; +} + +static gint gopt_dialog_cancel(gint response) +{ + switch (response) { + case GTK_RESPONSE_NONE: + case GTK_RESPONSE_REJECT: + case GTK_RESPONSE_DELETE_EVENT: + case GTK_RESPONSE_CANCEL: + case GTK_RESPONSE_NO: + return 1; + default: + return 0; + } +} + +static gint gopt_dialog_done(gint response) +{ + switch (response) { + case GTK_RESPONSE_ACCEPT: + case GTK_RESPONSE_OK: + case GTK_RESPONSE_YES: + return 1; + default: + return 0; + } +} + +static void gopt_handle_option_dialog(struct gopt_job_view *gjv) +{ + gint response; + + do { + response = gtk_dialog_run(GTK_DIALOG(gjv->dialog)); + + if (gopt_dialog_cancel(response) || + gopt_dialog_done(response)) + break; + + /* + * Apply + */ + gopt_handle_changed_options(gjv); + gopt_report_update_status(gjv); + } while (1); + + if (gopt_dialog_cancel(response)) + return; + + gopt_handle_changed_options(gjv); +} + +static void gopt_job_changed(GtkComboBox *box, gpointer data) +{ + struct gopt_job_view *gjv = (struct gopt_job_view *) data; + struct gfio_client_options *gco = NULL; + struct gfio_client *gc = gjv->client; + struct flist_head *entry; + gchar *job; + + /* + * The switch act should be sensitized appropriately, so that we + * never get here with modified options. 
+ */ + if (!flist_empty(&gjv->changed_list)) { + gfio_report_info(gc->ge->ui, "Internal Error", "Modified options on job switch.\nThat should not be possible!\n"); + return; + } + + job = gtk_combo_box_text_get_active_text(GTK_COMBO_BOX_TEXT(gjv->job_combo)); + flist_for_each(entry, &gc->o_list) { + const char *name; + + gco = flist_entry(entry, struct gfio_client_options, list); + name = gco->o.name; + if (!name || !strlen(name)) + name = "Default job"; + + if (!strcmp(name, job)) + break; + + gco = NULL; + } + + if (!gco) { + gfio_report_info(gc->ge->ui, "Internal Error", "Could not find job description.\nThat should not be possible!\n"); + return; + } + + gjv->in_job_switch = 1; + gopt_set_options(gjv, &gco->o); + gjv->in_job_switch = 0; +} + +void gopt_get_options_window(GtkWidget *window, struct gfio_client *gc) +{ + GtkWidget *dialog, *notebook, *vbox, *topvbox, *combo; + struct gfio_client_options *gco; + struct flist_head *entry; + struct gopt_job_view *gjv; + + dialog = gtk_dialog_new_with_buttons("Fio options", + GTK_WINDOW(window), GTK_DIALOG_DESTROY_WITH_PARENT, + GTK_STOCK_OK, GTK_RESPONSE_ACCEPT, + GTK_STOCK_APPLY, GTK_RESPONSE_APPLY, + GTK_STOCK_CANCEL, GTK_RESPONSE_REJECT, NULL); + + combo = gtk_combo_box_text_new(); + flist_for_each(entry, &gc->o_list) { + struct thread_options *o; + const char *name; + + gco = flist_entry(entry, struct gfio_client_options, list); + o = &gco->o; + name = o->name; + if (!name || !strlen(name)) + name = "Default job"; + + gtk_combo_box_text_append_text(GTK_COMBO_BOX_TEXT(combo), name); + } + gtk_combo_box_set_active(GTK_COMBO_BOX(combo), 0); + + gtk_widget_set_size_request(GTK_WIDGET(dialog), 1024, 768); + + topvbox = gtk_dialog_get_content_area(GTK_DIALOG(dialog)); + gtk_box_pack_start(GTK_BOX(topvbox), combo, FALSE, FALSE, 5); + + vbox = gtk_vbox_new(TRUE, 5); + gtk_box_pack_start(GTK_BOX(topvbox), vbox, TRUE, TRUE, 5); + + notebook = gtk_notebook_new(); + gtk_notebook_set_scrollable(GTK_NOTEBOOK(notebook), 1); + gtk_notebook_popup_enable(GTK_NOTEBOOK(notebook)); + gtk_box_pack_start(GTK_BOX(vbox), notebook, TRUE, TRUE, 5); + + gjv = calloc(1, sizeof(*gjv)); + INIT_FLIST_HEAD(&gjv->changed_list); + gco = flist_first_entry(&gc->o_list, struct gfio_client_options, list); + gjv->o = &gco->o; + gjv->dialog = dialog; + gjv->client = gc; + gjv->job_combo = combo; + gopt_add_group_tabs(notebook, gjv); + gopt_add_options(gjv, &gco->o); + gopt_dialog_update_apply_button(gjv); + + g_signal_connect(G_OBJECT(combo), "changed", G_CALLBACK(gopt_job_changed), gjv); + + gtk_widget_show_all(dialog); + + gopt_handle_option_dialog(gjv); + + gtk_widget_destroy(dialog); + free(gjv); +} + +/* + * Build n-ary option dependency tree + */ +void gopt_init(void) +{ + int i; + + gopt_dep_tree = g_node_new(NULL); + + for (i = 0; fio_options[i].name; i++) { + struct fio_option *o = &fio_options[i]; + GNode *node, *nparent; + + /* + * Insert node with either the root parent, or an + * option parent. 
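+ * For example, an option that declares a .parent (such as a verify
+ * sub-option hanging off "verify") is inserted beneath that parent's
+ * node rather than the root; the exact pairs come from fio_options[].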
+ */ + node = g_node_new(o); + nparent = gopt_dep_tree; + if (o->parent) { + struct fio_option *parent; + + parent = fio_option_find(o->parent); + nparent = g_node_find(gopt_dep_tree, G_IN_ORDER, G_TRAVERSE_ALL, parent); + if (!nparent) { + log_err("fio: did not find parent %s for opt %s\n", o->parent, o->name); + nparent = gopt_dep_tree; + } + } + + g_node_insert(nparent, -1, node); + } +} + +void gopt_exit(void) +{ + g_node_destroy(gopt_dep_tree); + gopt_dep_tree = NULL; +} diff --git a/goptions.h b/goptions.h new file mode 100644 index 0000000..a225a8d --- /dev/null +++ b/goptions.h @@ -0,0 +1,8 @@ +#ifndef GFIO_OPTIONS_H +#define GFIO_OPTIONS_H + +void gopt_get_options_window(GtkWidget *window, struct gfio_client *gc); +void gopt_init(void); +void gopt_exit(void); + +#endif diff --git a/graph.c b/graph.c new file mode 100644 index 0000000..7a17417 --- /dev/null +++ b/graph.c @@ -0,0 +1,1033 @@ +/* + * gfio - gui front end for fio - the flexible io tester + * + * Copyright (C) 2012 Stephen M. Cameron + * + * The license below covers all files distributed with fio unless otherwise + * noted in the file itself. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
 + * + */ +#include <string.h> +#include <malloc.h> +#include <math.h> +#include <assert.h> +#include <stdlib.h> + +#include <cairo.h> +#include <gtk/gtk.h> + +#include "tickmarks.h" +#include "graph.h" +#include "flist.h" +#include "lib/prio_tree.h" +#include "cairo_text_helpers.h" + +/* + * Allowable difference to show tooltip + */ +#define TOOLTIP_DELTA 0.08 + +struct xyvalue { + double x, y; +}; + +enum { + GV_F_ON_PRIO = 1, + GV_F_PRIO_SKIP = 2, +}; + +struct graph_value { + struct flist_head list; + struct prio_tree_node node; + struct flist_head alias; + unsigned int flags; + char *tooltip; + void *value; +}; + +struct graph_label { + struct flist_head list; + char *label; + struct flist_head value_list; + struct prio_tree_root prio_tree; + double r, g, b; + int hide; + int value_count; + struct graph *parent; +}; + +struct tick_value { + unsigned int offset; + double value; +}; + +struct graph { + char *title; + char *xtitle; + char *ytitle; + unsigned int xdim, ydim; + double xoffset, yoffset; + struct flist_head label_list; + int per_label_limit; + const char *font; + graph_axis_unit_change_callback x_axis_unit_change_callback; + graph_axis_unit_change_callback y_axis_unit_change_callback; + unsigned int base_offset; + unsigned int dont_graph_all_zeroes; + double left_extra; + double right_extra; + double top_extra; + double bottom_extra; + + double xtick_zero; + double xtick_delta; + double xtick_zero_val; + double xtick_one_val; + double ytick_zero; + double ytick_delta; + double ytick_zero_val; + double ytick_one_val; +}; + +void graph_set_size(struct graph *g, unsigned int xdim, unsigned int ydim) +{ + g->xdim = xdim; + g->ydim = ydim; +} + +void graph_set_position(struct graph *g, double xoffset, double yoffset) +{ + g->xoffset = xoffset; + g->yoffset = yoffset; +} + +struct graph *graph_new(unsigned int xdim, unsigned int ydim, const char *font) +{ + struct graph *g; + + g = calloc(1, sizeof(*g)); + INIT_FLIST_HEAD(&g->label_list); + graph_set_size(g, xdim, ydim); + g->per_label_limit = -1; + g->font = font; + if (!g->font) + g->font = GRAPH_DEFAULT_FONT; + return g; +} + +void graph_set_font(struct graph *g, const char *font) +{ + g->font = font; +} + +void graph_x_axis_unit_change_notify(struct graph *g, graph_axis_unit_change_callback f) +{ + g->x_axis_unit_change_callback = f; +} + +void graph_y_axis_unit_change_notify(struct graph *g, graph_axis_unit_change_callback f) +{ + g->y_axis_unit_change_callback = f; +} + +static int count_labels(struct graph *g) +{ + struct flist_head *entry; + int count = 0; + + flist_for_each(entry, &g->label_list) + count++; + + return count; +} + +static int count_values(struct graph_label *l) +{ + struct flist_head *entry; + int count = 0; + + flist_for_each(entry, &l->value_list) + count++; + + return count; +} + +typedef double (*double_comparator)(double a, double b); + +static double mindouble(double a, double b) +{ + return a < b ? a : b; +} + +static double maxdouble(double a, double b) +{ + return a < b ?
b : a; +} + +static double find_double_values(struct graph_label *l, double_comparator cmp) +{ + struct flist_head *entry; + double answer = 0.0, tmp; + int first = 1; + + if (flist_empty(&l->value_list)) + return 0.0; + + flist_for_each(entry, &l->value_list) { + struct graph_value *i; + + i = flist_entry(entry, struct graph_value, list); + tmp = *(double *) i->value; + if (first) { + answer = tmp; + first = 0; + } else { + answer = cmp(answer, tmp); + } + } + return answer; +} + +static double find_double_data(struct graph *g, double_comparator cmp) +{ + struct flist_head *entry; + struct graph_label *i; + int first = 1; + double answer, tmp; + + if (flist_empty(&g->label_list)) + return 0.0; + + flist_for_each(entry, &g->label_list) { + i = flist_entry(entry, struct graph_label, list); + tmp = find_double_values(i, cmp); + if (first) { + answer = tmp; + first = 0; + } else { + answer = cmp(tmp, answer); + } + } + return answer; +} + +static double find_min_data(struct graph *g) +{ + return find_double_data(g, mindouble); +} + +static double find_max_data(struct graph *g) +{ + return find_double_data(g, maxdouble); +} + +static void draw_bars(struct graph *bg, cairo_t *cr, struct graph_label *lb, + double label_offset, double bar_width, + double mindata, double maxdata) +{ + struct flist_head *entry; + double x1, y1, x2, y2; + int bar_num = 0; + double domain, range, v; + + domain = (maxdata - mindata); + range = (double) bg->ydim * 0.80; /* FIXME */ + cairo_stroke(cr); + flist_for_each(entry, &lb->value_list) { + struct graph_value *i; + + i = flist_entry(entry, struct graph_value, list); + + x1 = label_offset + (double) bar_num * bar_width + (bar_width * 0.05); + x2 = x1 + bar_width * 0.90; + y2 = bg->ydim * 0.90; + v = *(double *) i->value; + y1 = y2 - (((v - mindata) / domain) * range); + cairo_move_to(cr, x1, y1); + cairo_line_to(cr, x1, y2); + cairo_line_to(cr, x2, y2); + cairo_line_to(cr, x2, y1); + cairo_close_path(cr); + cairo_fill(cr); + cairo_stroke(cr); + bar_num++; + } +} + +static void graph_draw_common(struct graph *g, cairo_t *cr, double *x1, + double *y1, double *x2, double *y2) +{ + const double shade_col[3][3] = { { 0.55, 0.54, 0.54 }, + { 0.80, 0.78, 0.78 }, + { 0.93, 0.91, 0.91 } }; + int i; + + *x1 = 0.10 * g->xdim; + *x2 = 0.95 * g->xdim; + *y1 = 0.10 * g->ydim; + *y2 = 0.90 * g->ydim; + + /* + * Add shade + */ + cairo_set_line_width(cr, 1.0); + for (i = 0; i < 3; i++) { + float offset = i + 1.0; + + cairo_set_source_rgb(cr, shade_col[i][0], shade_col[i][1], shade_col[i][2]); + cairo_move_to(cr, offset + *x1, *y1 - offset); + cairo_line_to(cr, *x2 + offset, *y1 - offset); + cairo_line_to(cr, *x2 + offset, *y2 - offset); + cairo_stroke(cr); + } + + cairo_set_source_rgb(cr, 0, 0, 0); + cairo_set_line_width(cr, 1.2); + + cairo_move_to(cr, *x1, *y1); + cairo_line_to(cr, *x1, *y2); + cairo_line_to(cr, *x2, *y2); + cairo_line_to(cr, *x2, *y1); + cairo_line_to(cr, *x1, *y1); + cairo_stroke(cr); + + draw_centered_text(cr, g->font, g->xdim / 2, g->ydim / 20, 20.0, g->title); + draw_centered_text(cr, g->font, g->xdim / 2, g->ydim * 0.97, 14.0, g->xtitle); + draw_vertical_centered_text(cr, g->font, g->xdim * 0.02, g->ydim / 2, 14.0, g->ytitle); + cairo_stroke(cr); +} + +static void graph_draw_x_ticks(struct graph *g, cairo_t *cr, + double x1, double y1, double x2, double y2, + double minx, double maxx, int nticks, int add_tm_text) +{ + struct tickmark *tm; + double tx; + int i, power_of_ten; + static double dash[] = { 1.0, 2.0 }; + + nticks = calc_tickmarks(minx, maxx, 
nticks, &tm, &power_of_ten, + g->x_axis_unit_change_callback == NULL, g->base_offset); + if (g->x_axis_unit_change_callback) + g->x_axis_unit_change_callback(g, power_of_ten); + + for (i = 0; i < nticks; i++) { + tx = (((tm[i].value) - minx) / (maxx - minx)) * (x2 - x1) + x1; + + /* + * Update tick delta + */ + if (!i) { + g->xtick_zero = tx; + g->xtick_zero_val = tm[0].value; + } else if (i == 1) { + g->xtick_delta = (tm[1].value - tm[0].value) / (tx - g->xtick_zero); + g->xtick_one_val = tm[1].value; + } + + /* really tx < x1 || tx > x2, but protect against rounding */ + if (x1 - tx > 0.01 || tx - x2 > 0.01) + continue; + + /* Draw tick mark */ + cairo_set_line_width(cr, 1.0); + cairo_move_to(cr, tx, y2); + cairo_line_to(cr, tx, y2 + (y2 - y1) * 0.03); + cairo_stroke(cr); + + /* draw grid lines */ + cairo_save(cr); + cairo_set_dash(cr, dash, 2, 0.66); + cairo_set_line_width(cr, 0.33); + cairo_move_to(cr, tx, y1); + cairo_line_to(cr, tx, y2); + cairo_stroke(cr); + cairo_restore(cr); + + if (!add_tm_text) + continue; + + /* draw tickmark label */ + draw_centered_text(cr, g->font, tx, y2 * 1.04, 12.0, tm[i].string); + cairo_stroke(cr); + } +} + +static double graph_draw_y_ticks(struct graph *g, cairo_t *cr, + double x1, double y1, double x2, double y2, + double miny, double maxy, int nticks, int add_tm_text) +{ + struct tickmark *tm; + double ty; + int i, power_of_ten; + static double dash[] = { 1.0, 2.0 }; + + nticks = calc_tickmarks(miny, maxy, nticks, &tm, &power_of_ten, + g->y_axis_unit_change_callback == NULL, g->base_offset); + if (g->y_axis_unit_change_callback) + g->y_axis_unit_change_callback(g, power_of_ten); + + /* + * Use highest tickmark as top of graph, not highest value. Otherwise + * it's impossible to see what the max value is, if the graph is + * fairly flat. + */ + maxy = tm[nticks - 1].value; + + for (i = 0; i < nticks; i++) { + ty = y2 - (((tm[i].value) - miny) / (maxy - miny)) * (y2 - y1); + + /* + * Update tick delta + */ + if (!i) { + g->ytick_zero = ty; + g->ytick_zero_val = tm[0].value; + } else if (i == 1) { + g->ytick_delta = (tm[1].value - tm[0].value) / (ty - g->ytick_zero); + g->ytick_one_val = tm[1].value; + } + + /* really ty < y1 || ty > y2, but protect against rounding */ + if (y1 - ty > 0.01 || ty - y2 > 0.01) + continue; + + /* draw tick mark */ + cairo_move_to(cr, x1, ty); + cairo_line_to(cr, x1 - (x2 - x1) * 0.02, ty); + cairo_stroke(cr); + + /* draw grid lines */ + cairo_save(cr); + cairo_set_dash(cr, dash, 2, 0.66); + cairo_set_line_width(cr, 0.33); + cairo_move_to(cr, x1, ty); + cairo_line_to(cr, x2, ty); + cairo_stroke(cr); + cairo_restore(cr); + + if (!add_tm_text) + continue; + + /* draw tickmark label */ + draw_right_justified_text(cr, g->font, x1 - (x2 - x1) * 0.025, ty, 12.0, tm[i].string); + cairo_stroke(cr); + } + + /* + * Return new max to use + */ + return maxy; +} + +void bar_graph_draw(struct graph *bg, cairo_t *cr) +{ + double x1, y1, x2, y2; + double space_per_label, bar_width; + double label_offset, mindata, maxdata; + int i, nlabels; + struct graph_label *lb; + struct flist_head *entry; + + cairo_save(cr); + cairo_translate(cr, bg->xoffset, bg->yoffset); + graph_draw_common(bg, cr, &x1, &y1, &x2, &y2); + + nlabels = count_labels(bg); + space_per_label = (x2 - x1) / (double) nlabels; + + /* + * Start bars at 0 unless we have negative values, otherwise we + * present a skewed picture comparing label X and X+1.
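+ * E.g. values of 100 and 110 with a floor of 100 would render as an
+ * empty bar next to a full-height one; flooring at 0 keeps the bar
+ * heights visually proportional.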
+ */ + mindata = find_min_data(bg); + if (mindata > 0) + mindata = 0; + + maxdata = find_max_data(bg); + + if (fabs(maxdata - mindata) < 1e-20) { + draw_centered_text(cr, bg->font, + x1 + (x2 - x1) / 2.0, + y1 + (y2 - y1) / 2.0, 20.0, "No good data"); + return; + } + + maxdata = graph_draw_y_ticks(bg, cr, x1, y1, x2, y2, mindata, maxdata, 10, 1); + i = 0; + flist_for_each(entry, &bg->label_list) { + int nvalues; + + lb = flist_entry(entry, struct graph_label, list); + nvalues = count_values(lb); + bar_width = (space_per_label - space_per_label * 0.2) / (double) nvalues; + label_offset = bg->xdim * 0.1 + space_per_label * (double) i + space_per_label * 0.1; + draw_bars(bg, cr, lb, label_offset, bar_width, mindata, maxdata); + // draw_centered_text(cr, label_offset + (bar_width / 2.0 + bar_width * 0.1), bg->ydim * 0.93, + draw_centered_text(cr, bg->font, x1 + space_per_label * (i + 0.5), bg->ydim * 0.93, + 12.0, lb->label); + i++; + } + cairo_stroke(cr); + cairo_restore(cr); +} + +typedef double (*xy_value_extractor)(struct graph_value *v); + +static double getx(struct graph_value *v) +{ + struct xyvalue *xy = v->value; + return xy->x; +} + +static double gety(struct graph_value *v) +{ + struct xyvalue *xy = v->value; + return xy->y; +} + +static double find_xy_value(struct graph *g, xy_value_extractor getvalue, double_comparator cmp) +{ + double tmp, answer = 0.0; + struct graph_label *i; + struct graph_value *j; + struct flist_head *jentry, *entry; + int first = 1; + + flist_for_each(entry, &g->label_list) { + i = flist_entry(entry, struct graph_label, list); + + flist_for_each(jentry, &i->value_list) { + j = flist_entry(jentry, struct graph_value, list); + tmp = getvalue(j); + if (first) { + first = 0; + answer = tmp; + } + answer = cmp(tmp, answer); + } + } + + return answer; +} + +void line_graph_draw(struct graph *g, cairo_t *cr) +{ + double x1, y1, x2, y2; + double minx, miny, maxx, maxy, gminx, gminy, gmaxx, gmaxy; + double tx, ty, top_extra, bottom_extra, left_extra, right_extra; + struct graph_label *i; + struct graph_value *j; + int good_data = 1, first = 1; + struct flist_head *entry, *lentry; + + cairo_save(cr); + cairo_translate(cr, g->xoffset, g->yoffset); + graph_draw_common(g, cr, &x1, &y1, &x2, &y2); + + minx = find_xy_value(g, getx, mindouble); + maxx = find_xy_value(g, getx, maxdouble); + miny = find_xy_value(g, gety, mindouble); + + /* + * Start graphs at zero, unless we have a value below. Otherwise + * it's hard to visually compare the read and write graph, since + * the lowest valued one will be the floor of the graph view. 
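+ * E.g. with a shared floor of 450, a series hovering around 500 would
+ * hug the bottom of the view despite being large in absolute terms.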
+ */ + if (miny > 0) + miny = 0; + + maxy = find_xy_value(g, gety, maxdouble); + + if (fabs(maxx - minx) < 1e-20 || fabs(maxy - miny) < 1e-20) { + good_data = 0; + minx = 0.0; + miny = 0.0; + maxx = 10.0; + maxy = 100.0; + } + + top_extra = 0.0; + bottom_extra = 0.0; + left_extra = 0.0; + right_extra = 0.0; + + if (g->top_extra > 0.001) + top_extra = fabs(maxy - miny) * g->top_extra; + if (g->bottom_extra > 0.001) + bottom_extra = fabs(maxy - miny) * g->bottom_extra; + if (g->left_extra > 0.001) + left_extra = fabs(maxx - minx) * g->left_extra; + if (g->right_extra > 0.001) + right_extra = fabs(maxx - minx) * g->right_extra; + + gminx = minx - left_extra; + gmaxx = maxx + right_extra; + gminy = miny - bottom_extra; + gmaxy = maxy + top_extra; + + graph_draw_x_ticks(g, cr, x1, y1, x2, y2, gminx, gmaxx, 10, good_data); + gmaxy = graph_draw_y_ticks(g, cr, x1, y1, x2, y2, gminy, gmaxy, 10, good_data); + + if (!good_data) + goto skip_data; + + cairo_set_line_width(cr, 1.5); + cairo_set_line_join(cr, CAIRO_LINE_JOIN_ROUND); + + flist_for_each(lentry, &g->label_list) { + i = flist_entry(lentry, struct graph_label, list); + first = 1; + if (i->hide || i->r < 0) /* invisible data */ + continue; + + cairo_set_source_rgb(cr, i->r, i->g, i->b); + flist_for_each(entry, &i->value_list) { + j = flist_entry(entry, struct graph_value, list); + tx = ((getx(j) - gminx) / (gmaxx - gminx)) * (x2 - x1) + x1; + ty = y2 - ((gety(j) - gminy) / (gmaxy - gminy)) * (y2 - y1); + if (first) { + cairo_move_to(cr, tx, ty); + first = 0; + } else + cairo_line_to(cr, tx, ty); + } + cairo_stroke(cr); + } + +skip_data: + cairo_restore(cr); +} + +static void setstring(char **str, const char *value) +{ + free(*str); + *str = strdup(value); +} + +void graph_title(struct graph *bg, const char *title) +{ + setstring(&bg->title, title); +} + +void graph_x_title(struct graph *bg, const char *title) +{ + setstring(&bg->xtitle, title); +} + +void graph_y_title(struct graph *bg, const char *title) +{ + setstring(&bg->ytitle, title); +} + +static struct graph_label *graph_find_label(struct graph *bg, + const char *label) +{ + struct flist_head *entry; + struct graph_label *i; + + flist_for_each(entry, &bg->label_list) { + i = flist_entry(entry, struct graph_label, list); + + if (strcmp(label, i->label) == 0) + return i; + } + + return NULL; +} + +graph_label_t graph_add_label(struct graph *bg, const char *label) +{ + struct graph_label *i; + + i = graph_find_label(bg, label); + if (i) + return i; /* already present. 
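+ Labels are keyed by name, so a repeated call returns the existing handle.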
*/ + i = calloc(1, sizeof(*i)); + INIT_FLIST_HEAD(&i->value_list); + i->parent = bg; + setstring(&i->label, label); + flist_add_tail(&i->list, &bg->label_list); + INIT_PRIO_TREE_ROOT(&i->prio_tree); + return i; +} + +static void __graph_value_drop(struct graph_label *l, struct graph_value *v) +{ + flist_del_init(&v->list); + if (v->tooltip) + free(v->tooltip); + free(v->value); + free(v); + l->value_count--; +} + +static void graph_value_drop(struct graph_label *l, struct graph_value *v) +{ + if (v->flags & GV_F_PRIO_SKIP) { + __graph_value_drop(l, v); + return; + } + + /* + * Find head, the guy that's on the prio tree + */ + while (!(v->flags & GV_F_ON_PRIO)) { + assert(!flist_empty(&v->alias)); + v = flist_first_entry(&v->alias, struct graph_value, alias); + } + + prio_tree_remove(&l->prio_tree, &v->node); + + /* + * Free aliases + */ + while (!flist_empty(&v->alias)) { + struct graph_value *a; + + a = flist_first_entry(&v->alias, struct graph_value, alias); + flist_del_init(&a->alias); + + __graph_value_drop(l, a); + } + + __graph_value_drop(l, v); +} + +static void graph_label_add_value(struct graph_label *i, void *value, + const char *tooltip) +{ + struct graph *g = i->parent; + struct graph_value *x; + + x = malloc(sizeof(*x)); + memset(x, 0, sizeof(*x)); + INIT_FLIST_HEAD(&x->alias); + INIT_FLIST_HEAD(&x->list); + flist_add_tail(&x->list, &i->value_list); + i->value_count++; + x->value = value; + + if (tooltip) { + double xval = getx(x); + double minx = xval - (g->xtick_one_val * TOOLTIP_DELTA); + double maxx = xval + (g->xtick_one_val * TOOLTIP_DELTA); + struct prio_tree_node *ret; + + /* + * use msec to avoid dropping too much precision when + * storing as an integer. + */ + minx = minx * 1000.0; + maxx = maxx * 1000.0; + + INIT_PRIO_TREE_NODE(&x->node); + x->node.start = minx; + x->node.last = maxx; + x->tooltip = strdup(tooltip); + if (x->node.last == x->node.start) { + x->node.last += fabs(g->xtick_delta); + if (x->node.last == x->node.start) + x->node.last++; + } + + /* + * If ret != &x->node, we have an alias. Since the values + * should be identical, we can drop it + */ + ret = prio_tree_insert(&i->prio_tree, &x->node); + if (ret != &x->node) { + struct graph_value *alias; + + alias = container_of(ret, struct graph_value, node); + flist_add_tail(&x->alias, &alias->alias); + } else + x->flags = GV_F_ON_PRIO; + } else + x->flags = GV_F_PRIO_SKIP; + + if (g->per_label_limit != -1 && + i->value_count > g->per_label_limit) { + int to_drop = 1; + + /* + * If the limit was dynamically reduced, making us more + * than 1 entry ahead after adding this one, drop two + * entries. This will make us (eventually) reach the + * specified limit. + */ + if (i->value_count - g->per_label_limit >= 2) + to_drop = 2; + + while (to_drop-- && !flist_empty(&i->value_list)) { + x = flist_first_entry(&i->value_list, struct graph_value, list); + graph_value_drop(i, x); + + /* + * If we have aliases, we could drop > 1 above. 
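+ * Each alias freed by graph_value_drop() also decrements value_count,
+ * hence the recheck below.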
+ */ + if (i->value_count <= g->per_label_limit) + break; + } + } +} + +int graph_add_data(struct graph *bg, graph_label_t label, const double value) +{ + struct graph_label *i = label; + double *d; + + d = malloc(sizeof(*d)); + *d = value; + + graph_label_add_value(i, d, NULL); + return 0; +} + +static int graph_nonzero_y(struct graph_label *l) +{ + struct flist_head *entry; + + flist_for_each(entry, &l->value_list) { + struct graph_value *v; + + v = flist_entry(entry, struct graph_value, list); + if (gety(v) != 0.0) + return 1; + } + + return 0; +} + +int graph_add_xy_data(struct graph *bg, graph_label_t label, + const double x, const double y, const char *tooltip) +{ + struct graph_label *i = label; + struct xyvalue *xy; + + if (bg->dont_graph_all_zeroes && y == 0.0 && !graph_nonzero_y(i)) + i->hide = 1; + else + i->hide = 0; + + xy = malloc(sizeof(*xy)); + xy->x = x; + xy->y = y; + + graph_label_add_value(i, xy, tooltip); + return 0; +} + +static void graph_free_values(struct graph_label *l) +{ + struct graph_value *i; + + while (!flist_empty(&l->value_list)) { + i = flist_first_entry(&l->value_list, struct graph_value, list); + graph_value_drop(l, i); + } +} + +static void graph_free_labels(struct graph *g) +{ + struct graph_label *i; + + while (!flist_empty(&g->label_list)) { + i = flist_first_entry(&g->label_list, struct graph_label, list); + flist_del(&i->list); + graph_free_values(i); + free(i); + } +} + +void graph_clear_values(struct graph *g) +{ + struct flist_head *node; + struct graph_label *i; + + flist_for_each(node, &g->label_list) { + i = flist_entry(node, struct graph_label, list); + graph_free_values(i); + } +} + +void graph_set_color(struct graph *gr, graph_label_t label, double red, + double green, double blue) +{ + struct graph_label *i = label; + double r, g, b; + + if (red < 0.0) { /* invisible color */ + r = -1.0; + g = -1.0; + b = -1.0; + } else { + r = fabs(red); + g = fabs(green); + b = fabs(blue); + + if (r > 1.0) + r = 1.0; + if (g > 1.0) + g = 1.0; + if (b > 1.0) + b = 1.0; + } + + i->r = r; + i->g = g; + i->b = b; +} + +void graph_free(struct graph *bg) +{ + free(bg->title); + free(bg->xtitle); + free(bg->ytitle); + graph_free_labels(bg); +} + +/* For each line in the line graph, up to per_label_limit segments may + * be added. After that, adding more data to the end of the line + * causes data to drop off of the front of the line. + */ +void line_graph_set_data_count_limit(struct graph *g, int per_label_limit) +{ + g->per_label_limit = per_label_limit; +} + +void graph_add_extra_space(struct graph *g, double left_percent, + double right_percent, double top_percent, + double bottom_percent) +{ + g->left_extra = left_percent; + g->right_extra = right_percent; + g->top_extra = top_percent; + g->bottom_extra = bottom_percent; +} + +/* + * Normally values are logged in a base unit of 0, but for other purposes + * it makes more sense to log in higher unit. For instance for bandwidth + * purposes, you may want to log in KB/sec (or MB/sec) rather than bytes/sec. 
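+ * E.g. a base offset of 1 shifts the tick labels up one unit group, so
+ * a caller feeding values already in KB gets ticks labelled accordingly
+ * (exactly how the offset maps to unit prefixes is decided by
+ * calc_tickmarks() in tickmarks.c).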
 + */ +void graph_set_base_offset(struct graph *g, unsigned int base_offset) +{ + g->base_offset = base_offset; +} + +int graph_has_tooltips(struct graph *g) +{ + struct flist_head *entry; + struct graph_label *i; + + flist_for_each(entry, &g->label_list) { + i = flist_entry(entry, struct graph_label, list); + + if (!prio_tree_empty(&i->prio_tree)) + return 1; + } + + return 0; +} + +int graph_contains_xy(struct graph *g, int x, int y) +{ + int first_x = g->xoffset; + int last_x = g->xoffset + g->xdim; + int first_y = g->yoffset; + int last_y = g->yoffset + g->ydim; + + return (x >= first_x && x <= last_x) && (y >= first_y && y <= last_y); +} + +const char *graph_find_tooltip(struct graph *g, int ix, int iy) +{ + double x = ix, y = iy; + struct prio_tree_iter iter; + struct prio_tree_node *n; + struct graph_value *best = NULL; + struct flist_head *entry; + double best_delta; + double maxy, miny; + + x -= g->xoffset; + y -= g->yoffset; + + x = g->xtick_zero_val + ((x - g->xtick_zero) * g->xtick_delta); + y = g->ytick_zero_val + ((y - g->ytick_zero) * g->ytick_delta); + + x = x * 1000.0; + maxy = y + (g->ytick_one_val * TOOLTIP_DELTA); + miny = y - (g->ytick_one_val * TOOLTIP_DELTA); + best_delta = UINT_MAX; + flist_for_each(entry, &g->label_list) { + struct graph_label *i; + + i = flist_entry(entry, struct graph_label, list); + if (i->hide) + continue; + + INIT_PRIO_TREE_ITER(&iter); + prio_tree_iter_init(&iter, &i->prio_tree, x, x); + + n = prio_tree_next(&iter); + if (!n) + continue; + + do { + struct graph_value *v, *rootv; + double yval, ydiff; + + v = container_of(n, struct graph_value, node); + rootv = v; + do { + yval = gety(v); + ydiff = fabs(yval - y); + + /* + * zero delta, or within the match criteria: break + */ + if (ydiff < best_delta) { + best_delta = ydiff; + if (!best_delta || + (yval >= miny && yval <= maxy)) { + best = v; + break; + } + } + if (!flist_empty(&v->alias)) + v = flist_first_entry(&v->alias, struct graph_value, alias); + } while (v != rootv); + } while ((n = prio_tree_next(&iter)) != NULL); + + /* + * If we got matches in one label, don't check others. + */ + if (best) + break; + } + + if (best) + return best->tooltip; + + return NULL; +} + +void graph_set_graph_all_zeroes(struct graph *g, unsigned int set) +{ + g->dont_graph_all_zeroes = !set; +} diff --git a/graph.h b/graph.h new file mode 100644 index 0000000..078e50c --- /dev/null +++ b/graph.h @@ -0,0 +1,99 @@ +#ifndef GRAPH_H +#define GRAPH_H + +struct graph; +struct graph_label; + +typedef struct graph_label * graph_label_t; + +#define GRAPH_DEFAULT_FONT "Sans 12" + +struct graph *graph_new(unsigned int xdim, unsigned int ydim, const char *font); +/* graph_new() Returns a new graph structure of the given dimensions and font */ +void graph_set_size(struct graph *g, unsigned int xdim, unsigned int ydim); +/* graph_set_size() Changes the size of a graph to the given dimensions. */ +void graph_set_position(struct graph *g, double xoffset, double yoffset); +/* graph_set_position() sets the x- and y-offset to translate the graph */ +void bar_graph_draw(struct graph *g, cairo_t *cr); +/* bar_graph_draw() draws the given graph as a bar graph */ +void line_graph_draw(struct graph *g, cairo_t *cr); +/* line_graph_draw draws the given graph as a line graph */ +void line_graph_set_data_count_limit(struct graph *g, int per_label_limit); +/* line_graph_set_data_count_limit() limits the amount of data which can + * be added to a line graph.
Once the limit is reached, the oldest data + * is discarded as new data is added + */ +void graph_set_font(struct graph *g, const char *font); +void graph_title(struct graph *g, const char *title); +/* graph_title() sets the main title of the graph to the given string */ +void graph_x_title(struct graph *g, const char *title); +/* graph_x_title() sets the title of the x axis to the given string */ +void graph_y_title(struct graph *g, const char *title); +/* graph_y_title() sets the title of the y axis to the given string */ +graph_label_t graph_add_label(struct graph *g, const char *label); +/* graph_add_label() adds a new "stream" of data to be graphed. + * For line charts, each label is a separate line on the graph. + * For bar charts, each label is a grouping of columns on the x-axis + * For example: + * + * | * | ** + * | * xxxxxxxx | ** + * | *** x | ** ** + * | *x **** | ** ** ** + * | xxxx* ***** | ** xx ** xx ** + * | x ** | ** xx ** xx ** xx + * | x | ** xx ** xx ** xx + * ----------------------- ------------------------- + * A B C + * + * For a line graph, the 'x's For a bar graph, + * would be on one "label", and 'A', 'B', and 'C' + * the '*'s would be on another are the labels. + * label. + */ + +int graph_add_data(struct graph *g, graph_label_t label, const double value); +/* graph_add_data() is used to add data to the labels of a bar graph */ +int graph_add_xy_data(struct graph *g, graph_label_t label, + const double x, const double y, const char *tooltip); +/* graph_add_xy_data is used to add data to the labels of a line graph */ + +void graph_set_color(struct graph *g, graph_label_t label, + double red, double green, double blue); +#define INVISIBLE_COLOR (-1.0) +/* graph_set_color is used to set the color used to plot the data in + * a line graph. INVISIBLE_COLOR can be used to plot the data invisibly. + * Invisible data will have the same effect on the scaling of the axes + * as visible data. + */ + +void graph_free(struct graph *bg); +/* free a graph allocated by graph_new() */ + +typedef void (*graph_axis_unit_change_callback)(struct graph *g, int power_of_ten); +void graph_x_axis_unit_change_notify(struct graph *g, graph_axis_unit_change_callback f); +void graph_y_axis_unit_change_notify(struct graph *g, graph_axis_unit_change_callback f); +/* The labels used on the x and y axes may be shortened. You can register for callbacks + * so that you can know how the labels are shortened, typically used to adjust the axis + * titles to display the proper units. The power_of_ten parameter indicates what power + * of ten the labels have been divided by (9, 6, 3, or 0, corresponding to billions, + * millions, thousands and ones). + */ + +void graph_add_extra_space(struct graph *g, double left_percent, double right_percent, + double top_percent, double bottom_percent); +/* graph_add_extra_space() adds extra space to edges of the graph + * so that the data doesn't go to the very edges.
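+ * + * A minimal usage sketch (hypothetical sizes and data, error handling + * omitted; cr is an existing cairo_t), built only from the calls + * documented above: + * + *	struct graph *g = graph_new(640, 480, NULL); + *	graph_label_t read = graph_add_label(g, "read"); + * + *	graph_set_color(g, read, 0.0, 0.0, 1.0); + *	graph_add_xy_data(g, read, 1.0, 100.0, NULL); + *	graph_add_xy_data(g, read, 2.0, 120.0, NULL); + *	line_graph_draw(g, cr); + *	graph_free(g);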
 + */ + +extern int graph_has_tooltips(struct graph *g); +extern const char *graph_find_tooltip(struct graph *g, int x, int y); +extern int graph_contains_xy(struct graph *p, int x, int y); + +extern void graph_set_base_offset(struct graph *g, unsigned int base_offset); +extern void graph_set_graph_all_zeroes(struct graph *g, unsigned int set); + +extern void graph_clear_values(struct graph *g); + +#endif + diff --git a/hash.h b/hash.h new file mode 100644 index 0000000..66dd3d6 --- /dev/null +++ b/hash.h @@ -0,0 +1,164 @@ +#ifndef _LINUX_HASH_H +#define _LINUX_HASH_H + +#include <inttypes.h> +#include "arch/arch.h" + +/* Fast hashing routine for a long. + (C) 2002 William Lee Irwin III, IBM */ + +/* + * Knuth recommends primes in approximately golden ratio to the maximum + * integer representable by a machine word for multiplicative hashing. + * Chuck Lever verified the effectiveness of this technique: + * http://www.citi.umich.edu/techreports/reports/citi-tr-00-1.pdf + * + * These primes are chosen to be bit-sparse, that is operations on + * them can use shifts and additions instead of multiplications for + * machines where multiplications are slow. + */ + +#if BITS_PER_LONG == 32 +/* 2^31 + 2^29 - 2^25 + 2^22 - 2^19 - 2^16 + 1 */ +#define GOLDEN_RATIO_PRIME 0x9e370001UL +#elif BITS_PER_LONG == 64 +/* 2^63 + 2^61 - 2^57 + 2^54 - 2^51 - 2^18 + 1 */ +#define GOLDEN_RATIO_PRIME 0x9e37fffffffc0001UL +#else +#error Define GOLDEN_RATIO_PRIME for your wordsize. +#endif + +/* + * The above primes are actively bad for hashing, since they are + * too sparse. The 32-bit one is mostly ok, the 64-bit one causes + * real problems. Besides, the "prime" part is pointless for the + * multiplicative hash. + * + * Although a random odd number will do, it turns out that the golden + * ratio phi = (sqrt(5)-1)/2, or its negative, has particularly nice + * properties. + * + * These are the negative, (1 - phi) = (phi^2) = (3 - sqrt(5))/2. + * (See Knuth vol 3, section 6.4, exercise 9.) + */ +#define GOLDEN_RATIO_32 0x61C88647 +#define GOLDEN_RATIO_64 0x61C8864680B583EBull + +static inline unsigned long __hash_long(uint64_t val) +{ + uint64_t hash = val; + +#if BITS_PER_LONG == 64 + hash *= GOLDEN_RATIO_64; +#else + /* Sigh, gcc can't optimise this alone like it does for 32 bits. */ + uint64_t n = hash; + n <<= 18; + hash -= n; + n <<= 33; + hash -= n; + n <<= 3; + hash += n; + n <<= 3; + hash -= n; + n <<= 4; + hash += n; + n <<= 2; + hash += n; +#endif + + return hash; +} + +static inline unsigned long hash_long(unsigned long val, unsigned int bits) +{ + /* High bits are more random, so use them. */ + return __hash_long(val) >> (BITS_PER_LONG - bits); +} + +static inline uint64_t __hash_u64(uint64_t val) +{ + return val * GOLDEN_RATIO_64; +} + +static inline unsigned long hash_ptr(void *ptr, unsigned int bits) +{ + return hash_long((uintptr_t)ptr, bits); +} + +/* + * Bob Jenkins jhash + */ + +#define JHASH_INITVAL GOLDEN_RATIO_32 + +static inline uint32_t rol32(uint32_t word, uint32_t shift) +{ + return (word << shift) | (word >> (32 - shift)); +} + +/* __jhash_mix -- mix 3 32-bit values reversibly.
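+ Every step is an invertible add/xor/rotate, so no input entropy is lost while mixing.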
 */ +#define __jhash_mix(a, b, c) \ +{ \ + a -= c; a ^= rol32(c, 4); c += b; \ + b -= a; b ^= rol32(a, 6); a += c; \ + c -= b; c ^= rol32(b, 8); b += a; \ + a -= c; a ^= rol32(c, 16); c += b; \ + b -= a; b ^= rol32(a, 19); a += c; \ + c -= b; c ^= rol32(b, 4); b += a; \ +} + +/* __jhash_final - final mixing of 3 32-bit values (a,b,c) into c */ +#define __jhash_final(a, b, c) \ +{ \ + c ^= b; c -= rol32(b, 14); \ + a ^= c; a -= rol32(c, 11); \ + b ^= a; b -= rol32(a, 25); \ + c ^= b; c -= rol32(b, 16); \ + a ^= c; a -= rol32(c, 4); \ + b ^= a; b -= rol32(a, 14); \ + c ^= b; c -= rol32(b, 24); \ +} + +static inline uint32_t jhash(const void *key, uint32_t length, uint32_t initval) +{ + const uint8_t *k = key; + uint32_t a, b, c; + + /* Set up the internal state */ + a = b = c = JHASH_INITVAL + length + initval; + + /* All but the last block: affect some 32 bits of (a,b,c) */ + while (length > 12) { + a += *k; + b += *(k + 4); + c += *(k + 8); + __jhash_mix(a, b, c); + length -= 12; + k += 12; + } + + /* Last block: affect all 32 bits of (c) */ + /* All the case statements fall through */ + switch (length) { + case 12: c += (uint32_t) k[11] << 24; /* fall through */ + case 11: c += (uint32_t) k[10] << 16; /* fall through */ + case 10: c += (uint32_t) k[9] << 8; /* fall through */ + case 9: c += k[8]; /* fall through */ + case 8: b += (uint32_t) k[7] << 24; /* fall through */ + case 7: b += (uint32_t) k[6] << 16; /* fall through */ + case 6: b += (uint32_t) k[5] << 8; /* fall through */ + case 5: b += k[4]; /* fall through */ + case 4: a += (uint32_t) k[3] << 24; /* fall through */ + case 3: a += (uint32_t) k[2] << 16; /* fall through */ + case 2: a += (uint32_t) k[1] << 8; /* fall through */ + case 1: a += k[0]; + __jhash_final(a, b, c); + case 0: /* Nothing left to add */ + break; + } + + return c; +} + +#endif /* _LINUX_HASH_H */ diff --git a/helper_thread.c b/helper_thread.c new file mode 100644 index 0000000..7874985 --- /dev/null +++ b/helper_thread.c @@ -0,0 +1,331 @@ +#include <signal.h> +#ifdef CONFIG_VALGRIND_DEV +#include <valgrind/drd.h> +#else +#define DRD_IGNORE_VAR(x) do { } while (0) +#endif + +#include "fio.h" +#include "smalloc.h" +#include "helper_thread.h" +#include "steadystate.h" +#include "pshared.h" + +enum action { + A_EXIT = 1, + A_RESET = 2, + A_DO_STAT = 3, +}; + +static struct helper_data { + volatile int exit; + int pipe[2]; /* 0: read end; 1: write end. */ + struct sk_out *sk_out; + pthread_t thread; + struct fio_sem *startup_sem; +} *helper_data; + +void helper_thread_destroy(void) +{ + if (!helper_data) + return; + + close(helper_data->pipe[0]); + close(helper_data->pipe[1]); + sfree(helper_data); +} + +#ifdef _WIN32 +static void sock_init(void) +{ + WSADATA wsaData; + int res; + + /* It is allowed to call WSAStartup() more than once.
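+ * Per the WinSock contract each successful call should eventually be
+ * balanced by WSACleanup(); none is issued here, so the socket state
+ * simply lives for the remainder of the process.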
*/ + res = WSAStartup(MAKEWORD(2, 2), &wsaData); + assert(res == 0); +} + +static int make_nonblocking(int fd) +{ + unsigned long arg = 1; + + return ioctlsocket(fd, FIONBIO, &arg); +} + +static int write_to_pipe(int fd, const void *buf, size_t len) +{ + return send(fd, buf, len, 0); +} + +static int read_from_pipe(int fd, void *buf, size_t len) +{ + return recv(fd, buf, len, 0); +} +#else +static void sock_init(void) +{ +} + +static int make_nonblocking(int fd) +{ + return fcntl(fd, F_SETFL, O_NONBLOCK); +} + +static int write_to_pipe(int fd, const void *buf, size_t len) +{ + return write(fd, buf, len); +} + +static int read_from_pipe(int fd, void *buf, size_t len) +{ + return read(fd, buf, len); +} +#endif + +static void submit_action(enum action a) +{ + const char data = a; + int ret; + + if (!helper_data) + return; + + ret = write_to_pipe(helper_data->pipe[1], &data, sizeof(data)); + assert(ret == 1); +} + +void helper_reset(void) +{ + submit_action(A_RESET); +} + +/* + * May be invoked in signal handler context and hence must only call functions + * that are async-signal-safe. See also + * https://pubs.opengroup.org/onlinepubs/9699919799/functions/V2_chap02.html#tag_15_04_03. + */ +void helper_do_stat(void) +{ + submit_action(A_DO_STAT); +} + +bool helper_should_exit(void) +{ + if (!helper_data) + return true; + + return helper_data->exit; +} + +void helper_thread_exit(void) +{ + if (!helper_data) + return; + + helper_data->exit = 1; + submit_action(A_EXIT); + pthread_join(helper_data->thread, NULL); +} + +static void *helper_thread_main(void *data) +{ + struct helper_data *hd = data; + unsigned int msec_to_next_event, next_log, next_ss = STEADYSTATE_MSEC; + struct timespec ts, last_du, last_ss; + char action; + int ret = 0; + + sk_out_assign(hd->sk_out); + +#ifdef HAVE_PTHREAD_SIGMASK + { + sigset_t sigmask; + + /* Let another thread handle signals. 
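+ * pthread_sigmask() with a NULL set only queries the current mask;
+ * the SIG_BLOCK call below then blocks that same set in this thread.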
*/ + ret = pthread_sigmask(SIG_UNBLOCK, NULL, &sigmask); + assert(ret == 0); + ret = pthread_sigmask(SIG_BLOCK, &sigmask, NULL); + assert(ret == 0); + } +#endif + +#ifdef CONFIG_PTHREAD_CONDATTR_SETCLOCK + clock_gettime(CLOCK_MONOTONIC, &ts); +#else + clock_gettime(CLOCK_REALTIME, &ts); +#endif + memcpy(&last_du, &ts, sizeof(ts)); + memcpy(&last_ss, &ts, sizeof(ts)); + + fio_sem_up(hd->startup_sem); + + msec_to_next_event = DISK_UTIL_MSEC; + while (!ret && !hd->exit) { + uint64_t since_du, since_ss = 0; + struct timeval timeout = { + .tv_sec = DISK_UTIL_MSEC / 1000, + .tv_usec = (DISK_UTIL_MSEC % 1000) * 1000, + }; + fd_set rfds, efds; + + timespec_add_msec(&ts, msec_to_next_event); + + if (read_from_pipe(hd->pipe[0], &action, sizeof(action)) < 0) { + FD_ZERO(&rfds); + FD_SET(hd->pipe[0], &rfds); + FD_ZERO(&efds); + FD_SET(hd->pipe[0], &efds); + ret = select(1, &rfds, NULL, &efds, &timeout); + if (ret < 0) + log_err("fio: select() call in helper thread failed: %s", + strerror(errno)); + if (read_from_pipe(hd->pipe[0], &action, sizeof(action)) < + 0) + action = 0; + } + +#ifdef CONFIG_PTHREAD_CONDATTR_SETCLOCK + clock_gettime(CLOCK_MONOTONIC, &ts); +#else + clock_gettime(CLOCK_REALTIME, &ts); +#endif + + if (action == A_RESET) { + last_du = ts; + last_ss = ts; + } + + since_du = mtime_since(&last_du, &ts); + if (since_du >= DISK_UTIL_MSEC || DISK_UTIL_MSEC - since_du < 10) { + ret = update_io_ticks(); + timespec_add_msec(&last_du, DISK_UTIL_MSEC); + msec_to_next_event = DISK_UTIL_MSEC; + if (since_du >= DISK_UTIL_MSEC) + msec_to_next_event -= (since_du - DISK_UTIL_MSEC); + } else + msec_to_next_event = DISK_UTIL_MSEC - since_du; + + if (action == A_DO_STAT) + __show_running_run_stats(); + + next_log = calc_log_samples(); + if (!next_log) + next_log = DISK_UTIL_MSEC; + + if (steadystate_enabled) { + since_ss = mtime_since(&last_ss, &ts); + if (since_ss >= STEADYSTATE_MSEC || STEADYSTATE_MSEC - since_ss < 10) { + steadystate_check(); + timespec_add_msec(&last_ss, since_ss); + if (since_ss > STEADYSTATE_MSEC) + next_ss = STEADYSTATE_MSEC - (since_ss - STEADYSTATE_MSEC); + else + next_ss = STEADYSTATE_MSEC; + } else + next_ss = STEADYSTATE_MSEC - since_ss; + } + + msec_to_next_event = min(min(next_log, msec_to_next_event), next_ss); + dprint(FD_HELPERTHREAD, "since_ss: %llu, next_ss: %u, next_log: %u, msec_to_next_event: %u\n", (unsigned long long)since_ss, next_ss, next_log, msec_to_next_event); + + if (!is_backend) + print_thread_status(); + } + + fio_writeout_logs(false); + + sk_out_drop(); + return NULL; +} + +/* + * Connect two sockets to each other to emulate the pipe() system call on Windows. 
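+ * The listener binds 127.0.0.1 with sin_port left at zero, so the
+ * kernel assigns an ephemeral port that getsockname() reports back for
+ * the connect() call; the accepted socket then replaces the listening fd.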
+ */
+int pipe_over_loopback(int fd[2])
+{
+	struct sockaddr_in addr = { .sin_family = AF_INET };
+	socklen_t len = sizeof(addr);
+	int res;
+
+	addr.sin_addr.s_addr = htonl(INADDR_LOOPBACK);
+
+	sock_init();
+
+	fd[0] = socket(AF_INET, SOCK_STREAM, 0);
+	if (fd[0] < 0)
+		goto err;
+	fd[1] = socket(AF_INET, SOCK_STREAM, 0);
+	if (fd[1] < 0)
+		goto close_fd_0;
+	res = bind(fd[0], (struct sockaddr *)&addr, len);
+	if (res < 0)
+		goto close_fd_1;
+	res = getsockname(fd[0], (struct sockaddr *)&addr, &len);
+	if (res < 0)
+		goto close_fd_1;
+	res = listen(fd[0], 1);
+	if (res < 0)
+		goto close_fd_1;
+	res = connect(fd[1], (struct sockaddr *)&addr, len);
+	if (res < 0)
+		goto close_fd_1;
+	res = accept(fd[0], NULL, NULL);
+	if (res < 0)
+		goto close_fd_1;
+	close(fd[0]);
+	fd[0] = res;
+	return 0;
+
+close_fd_1:
+	close(fd[1]);
+
+close_fd_0:
+	close(fd[0]);
+
+err:
+	return -1;
+}
+
+int helper_thread_create(struct fio_sem *startup_sem, struct sk_out *sk_out)
+{
+	struct helper_data *hd;
+	int ret;
+
+	hd = scalloc(1, sizeof(*hd));
+
+	setup_disk_util();
+	steadystate_setup();
+
+	hd->sk_out = sk_out;
+
+#if defined(CONFIG_PIPE2)
+	ret = pipe2(hd->pipe, O_CLOEXEC);
+#elif defined(CONFIG_PIPE)
+	ret = pipe(hd->pipe);
+#else
+	ret = pipe_over_loopback(hd->pipe);
+#endif
+	if (ret)
+		return 1;
+
+	ret = make_nonblocking(hd->pipe[0]);
+	assert(ret >= 0);
+
+	hd->startup_sem = startup_sem;
+
+	DRD_IGNORE_VAR(helper_data);
+
+	ret = pthread_create(&hd->thread, NULL, helper_thread_main, hd);
+	if (ret) {
+		log_err("Can't create helper thread: %s\n", strerror(ret));
+		return 1;
+	}
+
+	helper_data = hd;
+
+	dprint(FD_MUTEX, "wait on startup_sem\n");
+	fio_sem_down(startup_sem);
+	dprint(FD_MUTEX, "done waiting on startup_sem\n");
+	return 0;
+}
diff --git a/helper_thread.h b/helper_thread.h
new file mode 100644
index 0000000..d7df6c4
--- /dev/null
+++ b/helper_thread.h
@@ -0,0 +1,11 @@
+#ifndef FIO_HELPER_THREAD_H
+#define FIO_HELPER_THREAD_H
+
+extern void helper_reset(void);
+extern void helper_do_stat(void);
+extern bool helper_should_exit(void);
+extern void helper_thread_destroy(void);
+extern void helper_thread_exit(void);
+extern int helper_thread_create(struct fio_sem *, struct sk_out *);
+
+#endif
diff --git a/helpers.c b/helpers.c
new file mode 100644
index 0000000..ab9d706
--- /dev/null
+++ b/helpers.c
@@ -0,0 +1,34 @@
+#include <errno.h>
+
+#include "helpers.h"
+
+#ifndef CONFIG_LINUX_FALLOCATE
+int fallocate(int fd, int mode, off_t offset, off_t len)
+{
+	errno = ENOSYS;
+	return -1;
+}
+#endif
+
+#ifndef CONFIG_POSIX_FALLOCATE
+int posix_fallocate(int fd, off_t offset, off_t len)
+{
+	return 0;
+}
+#endif
+
+#ifndef CONFIG_SYNC_FILE_RANGE
+int sync_file_range(int fd, uint64_t offset, uint64_t nbytes,
+		    unsigned int flags)
+{
+	errno = ENOSYS;
+	return -1;
+}
+#endif
+
+#ifndef CONFIG_POSIX_FADVISE
+int posix_fadvise(int fd, off_t offset, off_t len, int advice)
+{
+	return 0;
+}
+#endif
diff --git a/helpers.h b/helpers.h
new file mode 100644
index 0000000..4ec0f05
--- /dev/null
+++ b/helpers.h
@@ -0,0 +1,16 @@
+#ifndef FIO_HELPERS_H
+#define FIO_HELPERS_H
+
+#include <sys/types.h>
+
+#include "os/os.h"
+
+extern int fallocate(int fd, int mode, off_t offset, off_t len);
+extern int posix_fallocate(int fd, off_t offset, off_t len);
+#ifndef CONFIG_SYNC_FILE_RANGE
+extern int sync_file_range(int fd, uint64_t offset, uint64_t nbytes,
+			   unsigned int flags);
+#endif
+extern int posix_fadvise(int fd, off_t offset, off_t len, int advice);
+
+#endif /* FIO_HELPERS_H */
diff --git a/idletime.c b/idletime.c
new file mode 100644
index 0000000..fc1df8e
--- /dev/null
+++ b/idletime.c
@@ -0,0 +1,510 @@
+#include <math.h>
+#include "fio.h"
+#include "json.h"
+#include "idletime.h"
+
+static volatile struct idle_prof_common ipc;
+
+/*
+ * Get the time needed to complete a unit of work on a particular cpu.
+ * The minimum time over CALIBRATE_RUNS runs is returned.
+ */
+static double calibrate_unit(unsigned char *data)
+{
+	unsigned long t, i, j, k;
+	struct timespec tps;
+	double tunit = 0.0;
+
+	for (i = 0; i < CALIBRATE_RUNS; i++) {
+
+		fio_gettime(&tps, NULL);
+		/* scale for less variance */
+		for (j = 0; j < CALIBRATE_SCALE; j++) {
+			/* unit of work */
+			for (k = 0; k < page_size; k++) {
+				data[(k + j) % page_size] = k % 256;
+				/*
+				 * we won't see STOP here. this is to match
+				 * the same statement in the profiling loop.
+				 */
+				if (ipc.status == IDLE_PROF_STATUS_PROF_STOP)
+					return 0.0;
+			}
+		}
+
+		t = utime_since_now(&tps);
+		if (!t)
+			continue;
+
+		/* get the minimum time to complete CALIBRATE_SCALE units */
+		if ((i == 0) || ((double)t < tunit))
+			tunit = (double)t;
+	}
+
+	return tunit / CALIBRATE_SCALE;
+}
+
+static void free_cpu_affinity(struct idle_prof_thread *ipt)
+{
+#if defined(FIO_HAVE_CPU_AFFINITY)
+	fio_cpuset_exit(&ipt->cpu_mask);
+#endif
+}
+
+static int set_cpu_affinity(struct idle_prof_thread *ipt)
+{
+#if defined(FIO_HAVE_CPU_AFFINITY)
+	if (fio_cpuset_init(&ipt->cpu_mask)) {
+		log_err("fio: cpuset init failed\n");
+		return -1;
+	}
+
+	fio_cpu_set(&ipt->cpu_mask, ipt->cpu);
+
+	if (fio_setaffinity(gettid(), ipt->cpu_mask)) {
+		log_err("fio: fio_setaffinity failed\n");
+		fio_cpuset_exit(&ipt->cpu_mask);
+		return -1;
+	}
+
+	return 0;
+#else
+	log_err("fio: fio_setaffinity not supported\n");
+	return -1;
+#endif
+}
+
+static void *idle_prof_thread_fn(void *data)
+{
+	int retval;
+	unsigned long j, k;
+	struct idle_prof_thread *ipt = data;
+
+	/* wait until all threads are spawned */
+	pthread_mutex_lock(&ipt->init_lock);
+
+	/* exit if any other thread failed to start */
+	if (ipc.status == IDLE_PROF_STATUS_ABORT) {
+		pthread_mutex_unlock(&ipt->init_lock);
+		return NULL;
+	}
+
+	retval = set_cpu_affinity(ipt);
+	if (retval == -1) {
+		ipt->state = TD_EXITED;
+		pthread_mutex_unlock(&ipt->init_lock);
+		return NULL;
+	}
+
+	ipt->cali_time = calibrate_unit(ipt->data);
+
+	/* delay setting the IDLE class until now, for better calibration accuracy */
+#if defined(CONFIG_SCHED_IDLE)
+	if ((retval = fio_set_sched_idle()))
+		log_err("fio: fio_set_sched_idle failed\n");
+#else
+	retval = -1;
+	log_err("fio: fio_set_sched_idle not supported\n");
+#endif
+	if (retval == -1) {
+		ipt->state = TD_EXITED;
+		pthread_mutex_unlock(&ipt->init_lock);
+		goto do_exit;
+	}
+
+	ipt->state = TD_INITIALIZED;
+
+	/* signal the main thread that calibration is done */
+	pthread_cond_signal(&ipt->cond);
+	pthread_mutex_unlock(&ipt->init_lock);
+
+	/* wait for the other threads' calibration to finish */
+	pthread_mutex_lock(&ipt->start_lock);
+
+	/* exit if other threads failed to initialize */
+	if (ipc.status == IDLE_PROF_STATUS_ABORT) {
+		pthread_mutex_unlock(&ipt->start_lock);
+		goto do_exit;
+	}
+
+	/* exit if we are doing calibration only */
+	if (ipc.status == IDLE_PROF_STATUS_CALI_STOP) {
+		pthread_mutex_unlock(&ipt->start_lock);
+		goto do_exit;
+	}
+
+	fio_gettime(&ipt->tps, NULL);
+	ipt->state = TD_RUNNING;
+
+	j = 0;
+	while (1) {
+		for (k = 0; k < page_size; k++) {
+			ipt->data[(k + j) % page_size] = k % 256;
+			if (ipc.status == IDLE_PROF_STATUS_PROF_STOP) {
+				fio_gettime(&ipt->tpe, NULL);
+				goto idle_prof_done;
+			}
+		}
+		j++;
+	}
+
+idle_prof_done:
+	ipt->loops = j + (double) k / page_size;
+	ipt->state = TD_EXITED;
+	pthread_mutex_unlock(&ipt->start_lock);
+
+do_exit:
+	free_cpu_affinity(ipt);
+	return NULL;
+}
+
+/* calculate the mean and standard deviation of the time to complete a unit of work */
+static void calibration_stats(void)
+{
+	int i;
+	double sum = 0.0, var = 0.0;
+	struct idle_prof_thread *ipt;
+
+	for (i = 0; i < ipc.nr_cpus; i++) {
+		ipt = &ipc.ipts[i];
+		sum += ipt->cali_time;
+	}
+
+	ipc.cali_mean = sum/ipc.nr_cpus;
+
+	for (i = 0; i < ipc.nr_cpus; i++) {
+		ipt = &ipc.ipts[i];
+		var += pow(ipt->cali_time-ipc.cali_mean, 2);
+	}
+
+	ipc.cali_stddev = sqrt(var/(ipc.nr_cpus-1));
+}
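fio_idle_prof_init() below gates each worker thread with two mutexes: init_lock releases it into calibration, start_lock into profiling. A simplified single-thread sketch of that handshake; the real code holds one lock pair per CPU and also uses a condition variable to learn when calibration completes.

/* single-thread sketch of the two-mutex handshake used below */
#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t init_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_mutex_t start_lock = PTHREAD_MUTEX_INITIALIZER;

static void *worker(void *arg)
{
	(void) arg;

	pthread_mutex_lock(&init_lock);		/* barrier 1: wait for spawner */
	pthread_mutex_unlock(&init_lock);
	printf("calibrating\n");

	pthread_mutex_lock(&start_lock);	/* barrier 2: wait for start */
	pthread_mutex_unlock(&start_lock);
	printf("profiling\n");
	return NULL;
}

int main(void)
{
	pthread_t t;

	/* take both locks before the worker exists, so it blocks at each */
	pthread_mutex_lock(&init_lock);
	pthread_mutex_lock(&start_lock);
	pthread_create(&t, NULL, worker, NULL);

	pthread_mutex_unlock(&init_lock);	/* release into calibration */
	pthread_mutex_unlock(&start_lock);	/* release into profiling */
	pthread_join(t, NULL);
	return 0;
}
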
+
+void fio_idle_prof_init(void)
+{
+	int i, ret;
+	struct timespec ts;
+	pthread_attr_t tattr;
+	pthread_condattr_t cattr;
+	struct idle_prof_thread *ipt;
+
+	ipc.nr_cpus = cpus_online();
+	ipc.status = IDLE_PROF_STATUS_OK;
+
+	if (ipc.opt == IDLE_PROF_OPT_NONE)
+		return;
+
+	ret = pthread_condattr_init(&cattr);
+	assert(ret == 0);
+#ifdef CONFIG_PTHREAD_CONDATTR_SETCLOCK
+	ret = pthread_condattr_setclock(&cattr, CLOCK_MONOTONIC);
+	assert(ret == 0);
+#endif
+
+	if ((ret = pthread_attr_init(&tattr))) {
+		log_err("fio: pthread_attr_init %s\n", strerror(ret));
+		return;
+	}
+	if ((ret = pthread_attr_setscope(&tattr, PTHREAD_SCOPE_SYSTEM))) {
+		log_err("fio: pthread_attr_setscope %s\n", strerror(ret));
+		return;
+	}
+
+	ipc.ipts = malloc(ipc.nr_cpus * sizeof(struct idle_prof_thread));
+	if (!ipc.ipts) {
+		log_err("fio: malloc failed\n");
+		return;
+	}
+
+	ipc.buf = malloc(ipc.nr_cpus * page_size);
+	if (!ipc.buf) {
+		log_err("fio: malloc failed\n");
+		free(ipc.ipts);
+		return;
+	}
+
+	/*
+	 * profiling aborts on any single thread failure since the
+	 * result won't be accurate if any cpu is not used.
+	 */
+	for (i = 0; i < ipc.nr_cpus; i++) {
+		ipt = &ipc.ipts[i];
+
+		ipt->cpu = i;
+		ipt->state = TD_NOT_CREATED;
+		ipt->data = (unsigned char *)(ipc.buf + page_size * i);
+
+		if ((ret = pthread_mutex_init(&ipt->init_lock, NULL))) {
+			ipc.status = IDLE_PROF_STATUS_ABORT;
+			log_err("fio: pthread_mutex_init %s\n", strerror(ret));
+			break;
+		}
+
+		if ((ret = pthread_mutex_init(&ipt->start_lock, NULL))) {
+			ipc.status = IDLE_PROF_STATUS_ABORT;
+			log_err("fio: pthread_mutex_init %s\n", strerror(ret));
+			break;
+		}
+
+		if ((ret = pthread_cond_init(&ipt->cond, &cattr))) {
+			ipc.status = IDLE_PROF_STATUS_ABORT;
+			log_err("fio: pthread_cond_init %s\n", strerror(ret));
+			break;
+		}
+
+		/* make sure all threads are spawned before they start */
+		pthread_mutex_lock(&ipt->init_lock);
+
+		/* make sure all threads finish init before profiling starts */
+		pthread_mutex_lock(&ipt->start_lock);
+
+		if ((ret = pthread_create(&ipt->thread, &tattr, idle_prof_thread_fn, ipt))) {
+			ipc.status = IDLE_PROF_STATUS_ABORT;
+			log_err("fio: pthread_create %s\n", strerror(ret));
+			break;
+		} else
+			ipt->state = TD_CREATED;
+
+		if ((ret = pthread_detach(ipt->thread))) {
+			/* log error and let the thread spin */
+			log_err("fio: pthread_detach %s\n", strerror(ret));
+		}
+	}
+
+	/*
+	 * let the good threads continue, so they can exit cleanly if
+	 * errors occurred in other threads earlier.
+	 */
+	for (i = 0; i < ipc.nr_cpus; i++) {
+		ipt = &ipc.ipts[i];
+		pthread_mutex_unlock(&ipt->init_lock);
+	}
+
+	if (ipc.status == IDLE_PROF_STATUS_ABORT)
+		return;
+
+	/* wait for calibration to finish */
+	for (i = 0; i < ipc.nr_cpus; i++) {
+		ipt = &ipc.ipts[i];
+		pthread_mutex_lock(&ipt->init_lock);
+		while ((ipt->state != TD_EXITED) &&
+		       (ipt->state != TD_INITIALIZED)) {
+#ifdef CONFIG_PTHREAD_CONDATTR_SETCLOCK
+			clock_gettime(CLOCK_MONOTONIC, &ts);
+#else
+			clock_gettime(CLOCK_REALTIME, &ts);
+#endif
+			ts.tv_sec += 1;
+			pthread_cond_timedwait(&ipt->cond, &ipt->init_lock, &ts);
+		}
+		pthread_mutex_unlock(&ipt->init_lock);
+
+		/*
+		 * a thread that failed to initialize will abort the other
+		 * threads later, after fio_idle_prof_start.
+		 */
+		if (ipt->state == TD_EXITED)
+			ipc.status = IDLE_PROF_STATUS_ABORT;
+	}
+
+	if (ipc.status != IDLE_PROF_STATUS_ABORT)
+		calibration_stats();
+	else
+		ipc.cali_mean = ipc.cali_stddev = 0.0;
+
+	if (ipc.opt == IDLE_PROF_OPT_CALI)
+		ipc.status = IDLE_PROF_STATUS_CALI_STOP;
+}
+
+void fio_idle_prof_start(void)
+{
+	int i;
+	struct idle_prof_thread *ipt;
+
+	if (ipc.opt == IDLE_PROF_OPT_NONE)
+		return;
+
+	/* unlock regardless of whether abort is set or not */
+	for (i = 0; i < ipc.nr_cpus; i++) {
+		ipt = &ipc.ipts[i];
+		pthread_mutex_unlock(&ipt->start_lock);
+	}
+}
+
+void fio_idle_prof_stop(void)
+{
+	int i;
+	uint64_t runt;
+	struct timespec ts;
+	struct idle_prof_thread *ipt;
+
+	if (ipc.opt == IDLE_PROF_OPT_NONE)
+		return;
+
+	if (ipc.opt == IDLE_PROF_OPT_CALI)
+		return;
+
+	ipc.status = IDLE_PROF_STATUS_PROF_STOP;
+
+	/* wait for all threads to exit from profiling */
+	for (i = 0; i < ipc.nr_cpus; i++) {
+		ipt = &ipc.ipts[i];
+		pthread_mutex_lock(&ipt->start_lock);
+		while ((ipt->state != TD_EXITED) &&
+		       (ipt->state != TD_NOT_CREATED)) {
+			fio_gettime(&ts, NULL);
+			ts.tv_sec += 1;
+			/* timed wait in case a signal is not received */
+			pthread_cond_timedwait(&ipt->cond, &ipt->start_lock, &ts);
+		}
+		pthread_mutex_unlock(&ipt->start_lock);
+
+		/* calculate idleness */
+		if (ipc.cali_mean != 0.0) {
+			runt = utime_since(&ipt->tps, &ipt->tpe);
+			if (runt)
+				ipt->idleness = ipt->loops * ipc.cali_mean / runt;
+			else
+				ipt->idleness = 0.0;
+		} else
+			ipt->idleness = 0.0;
+	}
+
+	/*
+	 * memory allocations are freed via explicit fio_idle_prof_cleanup
+	 * after profiling stats are collected by apps.
+	 */
+}
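Concretely, the idleness figure computed in fio_idle_prof_stop() above is loops * cali_mean / runt. With illustrative numbers: if calibration measured cali_mean = 0.5 us per unit of work, a thread completed loops = 1,200,000 units, and the profiling window lasted runt = 2,000,000 us, then idleness = 1,200,000 * 0.5 / 2,000,000 = 0.3, which fio_idle_prof_cpu_stat() below reports as 30% idle.
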
+
+/*
+ * Return the system idle percentage when cpu is -1;
+ * return a single cpu's idle percentage otherwise.
+ */
+static double fio_idle_prof_cpu_stat(int cpu)
+{
+	int i, nr_cpus = ipc.nr_cpus;
+	struct idle_prof_thread *ipt;
+	double p = 0.0;
+
+	if (ipc.opt == IDLE_PROF_OPT_NONE)
+		return 0.0;
+
+	if ((cpu >= nr_cpus) || (cpu < -1)) {
+		log_err("fio: idle profiling invalid cpu index\n");
+		return 0.0;
+	}
+
+	if (cpu == -1) {
+		for (i = 0; i < nr_cpus; i++) {
+			ipt = &ipc.ipts[i];
+			p += ipt->idleness;
+		}
+		p /= nr_cpus;
+	} else {
+		ipt = &ipc.ipts[cpu];
+		p = ipt->idleness;
+	}
+
+	return p * 100.0;
+}
+
+void fio_idle_prof_cleanup(void)
+{
+	if (ipc.ipts) {
+		free(ipc.ipts);
+		ipc.ipts = NULL;
+	}
+
+	if (ipc.buf) {
+		free(ipc.buf);
+		ipc.buf = NULL;
+	}
+}
+
+int fio_idle_prof_parse_opt(const char *args)
+{
+	ipc.opt = IDLE_PROF_OPT_NONE; /* default */
+
+	if (!args) {
+		log_err("fio: empty idle-prof option string\n");
+		return -1;
+	}
+
+#if defined(FIO_HAVE_CPU_AFFINITY) && defined(CONFIG_SCHED_IDLE)
+	if (strcmp("calibrate", args) == 0) {
+		ipc.opt = IDLE_PROF_OPT_CALI;
+		fio_idle_prof_init();
+		fio_idle_prof_start();
+		fio_idle_prof_stop();
+		show_idle_prof_stats(FIO_OUTPUT_NORMAL, NULL, NULL);
+		return 1;
+	} else if (strcmp("system", args) == 0) {
+		ipc.opt = IDLE_PROF_OPT_SYSTEM;
+		return 0;
+	} else if (strcmp("percpu", args) == 0) {
+		ipc.opt = IDLE_PROF_OPT_PERCPU;
+		return 0;
+	} else {
+		log_err("fio: incorrect idle-prof option: %s\n", args);
+		return -1;
+	}
+#else
+	log_err("fio: idle-prof not supported on this platform\n");
+	return -1;
+#endif
+}
+
+void show_idle_prof_stats(int output, struct json_object *parent,
+			  struct buf_output *out)
+{
+	int i, nr_cpus = ipc.nr_cpus;
+	struct json_object *tmp;
+	char s[MAX_CPU_STR_LEN];
+
+	if (output == FIO_OUTPUT_NORMAL) {
+		if (ipc.opt > IDLE_PROF_OPT_CALI)
+			log_buf(out, "\nCPU idleness:\n");
+		else if (ipc.opt == IDLE_PROF_OPT_CALI)
+			log_buf(out, "CPU idleness:\n");
+
+		if (ipc.opt >= IDLE_PROF_OPT_SYSTEM)
+			log_buf(out, "  system: %3.2f%%\n", fio_idle_prof_cpu_stat(-1));
+
+		if (ipc.opt == IDLE_PROF_OPT_PERCPU) {
+			log_buf(out, "  percpu: %3.2f%%", fio_idle_prof_cpu_stat(0));
+			for (i = 1; i < nr_cpus; i++)
+				log_buf(out, ", %3.2f%%", fio_idle_prof_cpu_stat(i));
+			log_buf(out, "\n");
+		}
+
+		if (ipc.opt >= IDLE_PROF_OPT_CALI) {
+			log_buf(out, "  unit work: mean=%3.2fus,", ipc.cali_mean);
+			log_buf(out, " stddev=%3.2f\n", ipc.cali_stddev);
+		}
+
+		return;
+	}
+
+	if ((ipc.opt != IDLE_PROF_OPT_NONE) && (output & FIO_OUTPUT_JSON)) {
+		if (!parent)
+			return;
+
+		tmp = json_create_object();
+		if (!tmp)
+			return;
+
+		json_object_add_value_object(parent, "cpu_idleness", tmp);
+		json_object_add_value_float(tmp, "system", fio_idle_prof_cpu_stat(-1));
+
+		if (ipc.opt == IDLE_PROF_OPT_PERCPU) {
+			for (i = 0; i < nr_cpus; i++) {
+				snprintf(s, MAX_CPU_STR_LEN, "cpu-%d", i);
+				json_object_add_value_float(tmp, s, fio_idle_prof_cpu_stat(i));
+			}
+		}
+
+		json_object_add_value_float(tmp, "unit_mean", ipc.cali_mean);
+		json_object_add_value_float(tmp, "unit_stddev", ipc.cali_stddev);
+	}
+}
diff --git a/idletime.h b/idletime.h
new file mode 100644
index 0000000..91ca95f
--- /dev/null
+++ b/idletime.h
@@ -0,0 +1,63 @@
+#ifndef FIO_IDLETIME_H
+#define FIO_IDLETIME_H
+
+#include <math.h>
+#include <pthread.h>
+#include "os/os.h"
+
+#define CALIBRATE_RUNS 10
+#define CALIBRATE_SCALE 1000
+#define MAX_CPU_STR_LEN 32
+
+enum {
+	IDLE_PROF_OPT_NONE,
+	IDLE_PROF_OPT_CALI,	/* calibration only */
+	IDLE_PROF_OPT_SYSTEM,
+	IDLE_PROF_OPT_PERCPU
+};
+
+enum {
+	IDLE_PROF_STATUS_OK,
+	IDLE_PROF_STATUS_CALI_STOP,
+	IDLE_PROF_STATUS_PROF_STOP,
+	IDLE_PROF_STATUS_ABORT
+};
+
+struct
idle_prof_thread { + pthread_t thread; + int cpu; + int state; + struct timespec tps; + struct timespec tpe; + double cali_time; /* microseconds to finish a unit work */ + double loops; + double idleness; + unsigned char *data; /* bytes to be touched */ + pthread_cond_t cond; + pthread_mutex_t init_lock; + pthread_mutex_t start_lock; + + os_cpu_mask_t cpu_mask; +}; + +struct idle_prof_common { + struct idle_prof_thread *ipts; + int nr_cpus; + int status; + int opt; + double cali_mean; + double cali_stddev; + void *buf; /* single data allocation for all threads */ +}; + +extern int fio_idle_prof_parse_opt(const char *); + +extern void fio_idle_prof_init(void); +extern void fio_idle_prof_start(void); +extern void fio_idle_prof_stop(void); + +extern void show_idle_prof_stats(int, struct json_object *, struct buf_output *); + +extern void fio_idle_prof_cleanup(void); + +#endif diff --git a/init.c b/init.c new file mode 100644 index 0000000..b531533 --- /dev/null +++ b/init.c @@ -0,0 +1,3014 @@ +/* + * This file contains job initialization and setup functions. + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#ifdef CONFIG_VALGRIND_DEV +#include +#else +#define DRD_IGNORE_VAR(x) do { } while (0) +#endif + +#include "fio.h" +#ifndef FIO_NO_HAVE_SHM_H +#include +#endif + +#include "parse.h" +#include "smalloc.h" +#include "filehash.h" +#include "verify.h" +#include "profile.h" +#include "server.h" +#include "idletime.h" +#include "filelock.h" +#include "steadystate.h" +#include "blktrace.h" + +#include "oslib/asprintf.h" +#include "oslib/getopt.h" +#include "oslib/strcasestr.h" + +#include "crc/test.h" +#include "lib/pow2.h" +#include "lib/memcpy.h" + +const char fio_version_string[] = FIO_VERSION; + +#define FIO_RANDSEED (0xb1899bedUL) + +static char **ini_file; +static int max_jobs = FIO_MAX_JOBS; +static bool dump_cmdline; +static bool parse_only; +static bool merge_blktrace_only; + +static struct thread_data def_thread; +struct thread_data *threads = NULL; +static char **job_sections; +static int nr_job_sections; + +bool exitall_on_terminate = false; +int output_format = FIO_OUTPUT_NORMAL; +int eta_print = FIO_ETA_AUTO; +unsigned int eta_interval_msec = 1000; +int eta_new_line = 0; +FILE *f_out = NULL; +FILE *f_err = NULL; +char *exec_profile = NULL; +int warnings_fatal = 0; +int terse_version = 3; +bool is_backend = false; +bool is_local_backend = false; +int nr_clients = 0; +bool log_syslog = false; + +bool write_bw_log = false; +bool read_only = false; +int status_interval = 0; + +char *trigger_file = NULL; +long long trigger_timeout = 0; +char *trigger_cmd = NULL; +char *trigger_remote_cmd = NULL; + +char *aux_path = NULL; + +static int prev_group_jobs; + +unsigned long fio_debug = 0; +unsigned int fio_debug_jobno = -1; +unsigned int *fio_debug_jobp = NULL; +unsigned int *fio_warned = NULL; + +static char cmd_optstr[256]; +static bool did_arg; + +#define FIO_CLIENT_FLAG (1 << 16) + +/* + * Command line options. These will contain the above, plus a few + * extra that only pertain to fio itself and not jobs. 
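One detail worth calling out in the option table below: FIO_CLIENT_FLAG is OR'ed into the short-option value for options that should also be forwarded to remote fio servers in client mode. Since the flag sits well above the ASCII range, the original option character can be recovered with a mask. A small sketch; the dispatch shown is illustrative, not fio's exact code.

/* sketch of the FIO_CLIENT_FLAG encoding used in the table below */
#include <stdio.h>

#define FIO_CLIENT_FLAG	(1 << 16)

int main(void)
{
	int val = 'o' | FIO_CLIENT_FLAG;	/* as in the "output" entry */

	if (val & FIO_CLIENT_FLAG)	/* forward this option to clients */
		printf("client option, short opt '%c'\n", val & 0xff);
	return 0;
}
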
+ */ +static struct option l_opts[FIO_NR_OPTIONS] = { + { + .name = (char *) "output", + .has_arg = required_argument, + .val = 'o' | FIO_CLIENT_FLAG, + }, + { + .name = (char *) "latency-log", + .has_arg = required_argument, + .val = 'l' | FIO_CLIENT_FLAG, + }, + { + .name = (char *) "bandwidth-log", + .has_arg = no_argument, + .val = 'b' | FIO_CLIENT_FLAG, + }, + { + .name = (char *) "minimal", + .has_arg = no_argument, + .val = 'm' | FIO_CLIENT_FLAG, + }, + { + .name = (char *) "output-format", + .has_arg = required_argument, + .val = 'F' | FIO_CLIENT_FLAG, + }, + { + .name = (char *) "append-terse", + .has_arg = optional_argument, + .val = 'f', + }, + { + .name = (char *) "version", + .has_arg = no_argument, + .val = 'v' | FIO_CLIENT_FLAG, + }, + { + .name = (char *) "help", + .has_arg = no_argument, + .val = 'h' | FIO_CLIENT_FLAG, + }, + { + .name = (char *) "cmdhelp", + .has_arg = optional_argument, + .val = 'c' | FIO_CLIENT_FLAG, + }, + { + .name = (char *) "enghelp", + .has_arg = optional_argument, + .val = 'i' | FIO_CLIENT_FLAG, + }, + { + .name = (char *) "showcmd", + .has_arg = no_argument, + .val = 's' | FIO_CLIENT_FLAG, + }, + { + .name = (char *) "readonly", + .has_arg = no_argument, + .val = 'r' | FIO_CLIENT_FLAG, + }, + { + .name = (char *) "eta", + .has_arg = required_argument, + .val = 'e' | FIO_CLIENT_FLAG, + }, + { + .name = (char *) "eta-interval", + .has_arg = required_argument, + .val = 'O' | FIO_CLIENT_FLAG, + }, + { + .name = (char *) "eta-newline", + .has_arg = required_argument, + .val = 'E' | FIO_CLIENT_FLAG, + }, + { + .name = (char *) "debug", + .has_arg = required_argument, + .val = 'd' | FIO_CLIENT_FLAG, + }, + { + .name = (char *) "parse-only", + .has_arg = no_argument, + .val = 'P' | FIO_CLIENT_FLAG, + }, + { + .name = (char *) "section", + .has_arg = required_argument, + .val = 'x' | FIO_CLIENT_FLAG, + }, +#ifdef CONFIG_ZLIB + { + .name = (char *) "inflate-log", + .has_arg = required_argument, + .val = 'X' | FIO_CLIENT_FLAG, + }, +#endif + { + .name = (char *) "alloc-size", + .has_arg = required_argument, + .val = 'a' | FIO_CLIENT_FLAG, + }, + { + .name = (char *) "profile", + .has_arg = required_argument, + .val = 'p' | FIO_CLIENT_FLAG, + }, + { + .name = (char *) "warnings-fatal", + .has_arg = no_argument, + .val = 'w' | FIO_CLIENT_FLAG, + }, + { + .name = (char *) "max-jobs", + .has_arg = required_argument, + .val = 'j' | FIO_CLIENT_FLAG, + }, + { + .name = (char *) "terse-version", + .has_arg = required_argument, + .val = 'V' | FIO_CLIENT_FLAG, + }, + { + .name = (char *) "server", + .has_arg = optional_argument, + .val = 'S', + }, + { .name = (char *) "daemonize", + .has_arg = required_argument, + .val = 'D', + }, + { + .name = (char *) "client", + .has_arg = required_argument, + .val = 'C', + }, + { + .name = (char *) "remote-config", + .has_arg = required_argument, + .val = 'R', + }, + { + .name = (char *) "cpuclock-test", + .has_arg = no_argument, + .val = 'T', + }, + { + .name = (char *) "crctest", + .has_arg = optional_argument, + .val = 'G', + }, + { + .name = (char *) "memcpytest", + .has_arg = optional_argument, + .val = 'M', + }, + { + .name = (char *) "idle-prof", + .has_arg = required_argument, + .val = 'I', + }, + { + .name = (char *) "status-interval", + .has_arg = required_argument, + .val = 'L' | FIO_CLIENT_FLAG, + }, + { + .name = (char *) "trigger-file", + .has_arg = required_argument, + .val = 'W', + }, + { + .name = (char *) "trigger-timeout", + .has_arg = required_argument, + .val = 'B', + }, + { + .name = (char *) "trigger", + 
.has_arg = required_argument, + .val = 'H', + }, + { + .name = (char *) "trigger-remote", + .has_arg = required_argument, + .val = 'J', + }, + { + .name = (char *) "aux-path", + .has_arg = required_argument, + .val = 'K', + }, + { + .name = (char *) "merge-blktrace-only", + .has_arg = no_argument, + .val = 'A' | FIO_CLIENT_FLAG, + }, + { + .name = NULL, + }, +}; + +void free_threads_shm(void) +{ + if (threads) { + void *tp = threads; +#ifndef CONFIG_NO_SHM + struct shmid_ds sbuf; + + threads = NULL; + shmdt(tp); + shmctl(shm_id, IPC_RMID, &sbuf); + shm_id = -1; +#else + threads = NULL; + free(tp); +#endif + } +} + +static void free_shm(void) +{ + if (threads) { + flow_exit(); + fio_debug_jobp = NULL; + fio_warned = NULL; + free_threads_shm(); + } + + free(trigger_file); + free(trigger_cmd); + free(trigger_remote_cmd); + trigger_file = trigger_cmd = trigger_remote_cmd = NULL; + + options_free(fio_options, &def_thread.o); + fio_filelock_exit(); + file_hash_exit(); + scleanup(); +} + +/* + * The thread area is shared between the main process and the job + * threads/processes. So setup a shared memory segment that will hold + * all the job info. We use the end of the region for keeping track of + * open files across jobs, for file sharing. + */ +static int setup_thread_area(void) +{ + int i; + + if (threads) + return 0; + + /* + * 1024 is too much on some machines, scale max_jobs if + * we get a failure that looks like too large a shm segment + */ + do { + size_t size = max_jobs * sizeof(struct thread_data); + + size += 2 * sizeof(unsigned int); + +#ifndef CONFIG_NO_SHM + shm_id = shmget(0, size, IPC_CREAT | 0600); + if (shm_id != -1) + break; + if (errno != EINVAL && errno != ENOMEM && errno != ENOSPC) { + perror("shmget"); + break; + } +#else + threads = malloc(size); + if (threads) + break; +#endif + + max_jobs >>= 1; + } while (max_jobs); + +#ifndef CONFIG_NO_SHM + if (shm_id == -1) + return 1; + + threads = shmat(shm_id, NULL, 0); + if (threads == (void *) -1) { + perror("shmat"); + return 1; + } + if (shm_attach_to_open_removed()) + shmctl(shm_id, IPC_RMID, NULL); +#endif + + memset(threads, 0, max_jobs * sizeof(struct thread_data)); + for (i = 0; i < max_jobs; i++) + DRD_IGNORE_VAR(threads[i]); + fio_debug_jobp = (unsigned int *)(threads + max_jobs); + *fio_debug_jobp = -1; + fio_warned = fio_debug_jobp + 1; + *fio_warned = 0; + + flow_init(); + + return 0; +} + +static void dump_print_option(struct print_option *p) +{ + const char *delim; + + if (!strcmp("description", p->name)) + delim = "\""; + else + delim = ""; + + log_info("--%s%s", p->name, p->value ? 
"" : " "); + if (p->value) + log_info("=%s%s%s ", delim, p->value, delim); +} + +static void dump_opt_list(struct thread_data *td) +{ + struct flist_head *entry; + struct print_option *p; + + if (flist_empty(&td->opt_list)) + return; + + flist_for_each(entry, &td->opt_list) { + p = flist_entry(entry, struct print_option, list); + dump_print_option(p); + } +} + +static void fio_dump_options_free(struct thread_data *td) +{ + while (!flist_empty(&td->opt_list)) { + struct print_option *p; + + p = flist_first_entry(&td->opt_list, struct print_option, list); + flist_del_init(&p->list); + free(p->name); + free(p->value); + free(p); + } +} + +static void copy_opt_list(struct thread_data *dst, struct thread_data *src) +{ + struct flist_head *entry; + + if (flist_empty(&src->opt_list)) + return; + + flist_for_each(entry, &src->opt_list) { + struct print_option *srcp, *dstp; + + srcp = flist_entry(entry, struct print_option, list); + dstp = malloc(sizeof(*dstp)); + dstp->name = strdup(srcp->name); + if (srcp->value) + dstp->value = strdup(srcp->value); + else + dstp->value = NULL; + flist_add_tail(&dstp->list, &dst->opt_list); + } +} + +/* + * Return a free job structure. + */ +static struct thread_data *get_new_job(bool global, struct thread_data *parent, + bool preserve_eo, const char *jobname) +{ + struct thread_data *td; + + if (global) + return &def_thread; + if (setup_thread_area()) { + log_err("error: failed to setup shm segment\n"); + return NULL; + } + if (thread_number >= max_jobs) { + log_err("error: maximum number of jobs (%d) reached.\n", + max_jobs); + return NULL; + } + + td = &threads[thread_number++]; + *td = *parent; + + INIT_FLIST_HEAD(&td->opt_list); + if (parent != &def_thread) + copy_opt_list(td, parent); + + td->io_ops = NULL; + td->io_ops_init = 0; + if (!preserve_eo) + td->eo = NULL; + + td->o.uid = td->o.gid = -1U; + + dup_files(td, parent); + fio_options_mem_dupe(td); + + profile_add_hooks(td); + + td->thread_number = thread_number; + td->subjob_number = 0; + + if (jobname) + td->o.name = strdup(jobname); + + if (!parent->o.group_reporting || parent == &def_thread) + stat_number++; + + return td; +} + +static void put_job(struct thread_data *td) +{ + if (td == &def_thread) + return; + + profile_td_exit(td); + flow_exit_job(td); + + if (td->error) + log_info("fio: %s\n", td->verror); + + fio_options_free(td); + fio_dump_options_free(td); + if (td->io_ops) + free_ioengine(td); + + if (td->o.name) + free(td->o.name); + + memset(&threads[td->thread_number - 1], 0, sizeof(*td)); + thread_number--; +} + +static int __setup_rate(struct thread_data *td, enum fio_ddir ddir) +{ + unsigned long long bs = td->o.min_bs[ddir]; + + assert(ddir_rw(ddir)); + + if (td->o.rate[ddir]) + td->rate_bps[ddir] = td->o.rate[ddir]; + else + td->rate_bps[ddir] = (uint64_t) td->o.rate_iops[ddir] * bs; + + if (!td->rate_bps[ddir]) { + log_err("rate lower than supported\n"); + return -1; + } + + td->rate_next_io_time[ddir] = 0; + td->rate_io_issue_bytes[ddir] = 0; + td->last_usec[ddir] = 0; + return 0; +} + +static int setup_rate(struct thread_data *td) +{ + int ret = 0; + + if (td->o.rate[DDIR_READ] || td->o.rate_iops[DDIR_READ]) + ret = __setup_rate(td, DDIR_READ); + if (td->o.rate[DDIR_WRITE] || td->o.rate_iops[DDIR_WRITE]) + ret |= __setup_rate(td, DDIR_WRITE); + if (td->o.rate[DDIR_TRIM] || td->o.rate_iops[DDIR_TRIM]) + ret |= __setup_rate(td, DDIR_TRIM); + + return ret; +} + +static int fixed_block_size(struct thread_options *o) +{ + return o->min_bs[DDIR_READ] == o->max_bs[DDIR_READ] && + 
o->min_bs[DDIR_WRITE] == o->max_bs[DDIR_WRITE] && + o->min_bs[DDIR_TRIM] == o->max_bs[DDIR_TRIM] && + o->min_bs[DDIR_READ] == o->min_bs[DDIR_WRITE] && + o->min_bs[DDIR_READ] == o->min_bs[DDIR_TRIM]; +} + +/* + * <3 Johannes + */ +static unsigned int gcd(unsigned int m, unsigned int n) +{ + if (!n) + return m; + + return gcd(n, m % n); +} + +/* + * Lazy way of fixing up options that depend on each other. We could also + * define option callback handlers, but this is easier. + */ +static int fixup_options(struct thread_data *td) +{ + struct thread_options *o = &td->o; + int ret = 0; + + if (read_only && (td_write(td) || td_trim(td))) { + log_err("fio: trim and write operations are not allowed" + " with the --readonly parameter.\n"); + ret |= 1; + } + +#ifndef CONFIG_PSHARED + if (!o->use_thread) { + log_info("fio: this platform does not support process shared" + " mutexes, forcing use of threads. Use the 'thread'" + " option to get rid of this warning.\n"); + o->use_thread = 1; + ret |= warnings_fatal; + } +#endif + + if (o->write_iolog_file && o->read_iolog_file) { + log_err("fio: read iolog overrides write_iolog\n"); + free(o->write_iolog_file); + o->write_iolog_file = NULL; + ret |= warnings_fatal; + } + + if (o->zone_mode == ZONE_MODE_NONE && o->zone_size) { + log_err("fio: --zonemode=none and --zonesize are not compatible.\n"); + ret |= 1; + } + + if (o->zone_mode == ZONE_MODE_STRIDED && !o->zone_size) { + log_err("fio: --zonesize must be specified when using --zonemode=strided.\n"); + ret |= 1; + } + + if (o->zone_mode == ZONE_MODE_NOT_SPECIFIED) { + if (o->zone_size) + o->zone_mode = ZONE_MODE_STRIDED; + else + o->zone_mode = ZONE_MODE_NONE; + } + + /* + * Strided zone mode only really works with 1 file. + */ + if (o->zone_mode == ZONE_MODE_STRIDED && o->open_files > 1) + o->zone_mode = ZONE_MODE_NONE; + + /* + * If zone_range isn't specified, backward compatibility dictates it + * should be made equal to zone_size. + */ + if (o->zone_mode == ZONE_MODE_STRIDED && !o->zone_range) + o->zone_range = o->zone_size; + + /* + * Reads can do overwrites, we always need to pre-create the file + */ + if (td_read(td)) + o->overwrite = 1; + + if (!o->min_bs[DDIR_READ]) + o->min_bs[DDIR_READ] = o->bs[DDIR_READ]; + if (!o->max_bs[DDIR_READ]) + o->max_bs[DDIR_READ] = o->bs[DDIR_READ]; + if (!o->min_bs[DDIR_WRITE]) + o->min_bs[DDIR_WRITE] = o->bs[DDIR_WRITE]; + if (!o->max_bs[DDIR_WRITE]) + o->max_bs[DDIR_WRITE] = o->bs[DDIR_WRITE]; + if (!o->min_bs[DDIR_TRIM]) + o->min_bs[DDIR_TRIM] = o->bs[DDIR_TRIM]; + if (!o->max_bs[DDIR_TRIM]) + o->max_bs[DDIR_TRIM] = o->bs[DDIR_TRIM]; + + o->rw_min_bs = min(o->min_bs[DDIR_READ], o->min_bs[DDIR_WRITE]); + o->rw_min_bs = min(o->min_bs[DDIR_TRIM], o->rw_min_bs); + + /* + * For random IO, allow blockalign offset other than min_bs. 
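The tiny recursive gcd() above exists mostly so that fixup_options() can later pick a verify_interval that divides both the minimum and maximum write block size. A quick worked example:

/* worked example of the Euclidean gcd() helper above */
#include <stdio.h>

static unsigned int gcd(unsigned int m, unsigned int n)
{
	return n ? gcd(n, m % n) : m;
}

int main(void)
{
	/* e.g. a 4k-20k write block size range: gcd(4096, 20480) = 4096 */
	printf("%u\n", gcd(4096, 20480));
	return 0;
}
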
+ */ + if (!o->ba[DDIR_READ] || !td_random(td)) + o->ba[DDIR_READ] = o->min_bs[DDIR_READ]; + if (!o->ba[DDIR_WRITE] || !td_random(td)) + o->ba[DDIR_WRITE] = o->min_bs[DDIR_WRITE]; + if (!o->ba[DDIR_TRIM] || !td_random(td)) + o->ba[DDIR_TRIM] = o->min_bs[DDIR_TRIM]; + + if ((o->ba[DDIR_READ] != o->min_bs[DDIR_READ] || + o->ba[DDIR_WRITE] != o->min_bs[DDIR_WRITE] || + o->ba[DDIR_TRIM] != o->min_bs[DDIR_TRIM]) && + !o->norandommap) { + log_err("fio: Any use of blockalign= turns off randommap\n"); + o->norandommap = 1; + ret |= warnings_fatal; + } + + if (!o->file_size_high) + o->file_size_high = o->file_size_low; + + if (o->start_delay_high) { + if (!o->start_delay_orig) + o->start_delay_orig = o->start_delay; + o->start_delay = rand_between(&td->delay_state, + o->start_delay_orig, + o->start_delay_high); + } + + if (o->norandommap && o->verify != VERIFY_NONE + && !fixed_block_size(o)) { + log_err("fio: norandommap given for variable block sizes, " + "verify limited\n"); + ret |= warnings_fatal; + } + if (o->bs_unaligned && (o->odirect || td_ioengine_flagged(td, FIO_RAWIO))) + log_err("fio: bs_unaligned may not work with raw io\n"); + + /* + * thinktime_spin must be less than thinktime + */ + if (o->thinktime_spin > o->thinktime) + o->thinktime_spin = o->thinktime; + + /* + * The low water mark cannot be bigger than the iodepth + */ + if (o->iodepth_low > o->iodepth || !o->iodepth_low) + o->iodepth_low = o->iodepth; + + /* + * If batch number isn't set, default to the same as iodepth + */ + if (o->iodepth_batch > o->iodepth || !o->iodepth_batch) + o->iodepth_batch = o->iodepth; + + /* + * If max batch complete number isn't set or set incorrectly, + * default to the same as iodepth_batch_complete_min + */ + if (o->iodepth_batch_complete_min > o->iodepth_batch_complete_max) + o->iodepth_batch_complete_max = o->iodepth_batch_complete_min; + + /* + * There's no need to check for in-flight overlapping IOs if the job + * isn't changing data or the maximum iodepth is guaranteed to be 1 + * when we are not in offload mode + */ + if (o->serialize_overlap && !(td->flags & TD_F_READ_IOLOG) && + (!(td_write(td) || td_trim(td)) || o->iodepth == 1) && + o->io_submit_mode != IO_MODE_OFFLOAD) + o->serialize_overlap = 0; + + if (o->nr_files > td->files_index) + o->nr_files = td->files_index; + + if (o->open_files > o->nr_files || !o->open_files) + o->open_files = o->nr_files; + + if (((o->rate[DDIR_READ] + o->rate[DDIR_WRITE] + o->rate[DDIR_TRIM]) && + (o->rate_iops[DDIR_READ] + o->rate_iops[DDIR_WRITE] + o->rate_iops[DDIR_TRIM])) || + ((o->ratemin[DDIR_READ] + o->ratemin[DDIR_WRITE] + o->ratemin[DDIR_TRIM]) && + (o->rate_iops_min[DDIR_READ] + o->rate_iops_min[DDIR_WRITE] + o->rate_iops_min[DDIR_TRIM]))) { + log_err("fio: rate and rate_iops are mutually exclusive\n"); + ret |= 1; + } + if ((o->rate[DDIR_READ] && (o->rate[DDIR_READ] < o->ratemin[DDIR_READ])) || + (o->rate[DDIR_WRITE] && (o->rate[DDIR_WRITE] < o->ratemin[DDIR_WRITE])) || + (o->rate[DDIR_TRIM] && (o->rate[DDIR_TRIM] < o->ratemin[DDIR_TRIM])) || + (o->rate_iops[DDIR_READ] && (o->rate_iops[DDIR_READ] < o->rate_iops_min[DDIR_READ])) || + (o->rate_iops[DDIR_WRITE] && (o->rate_iops[DDIR_WRITE] < o->rate_iops_min[DDIR_WRITE])) || + (o->rate_iops[DDIR_TRIM] && (o->rate_iops[DDIR_TRIM] < o->rate_iops_min[DDIR_TRIM]))) { + log_err("fio: minimum rate exceeds rate\n"); + ret |= 1; + } + + if (!o->timeout && o->time_based) { + log_err("fio: time_based requires a runtime/timeout setting\n"); + o->time_based = 0; + ret |= warnings_fatal; + } + + if 
(o->fill_device && !o->size) + o->size = -1ULL; + + if (o->verify != VERIFY_NONE) { + if (td_write(td) && o->do_verify && o->numjobs > 1 && + (o->filename || + !(o->unique_filename && + strstr(o->filename_format, "$jobname") && + strstr(o->filename_format, "$jobnum") && + strstr(o->filename_format, "$filenum")))) { + log_info("fio: multiple writers may overwrite blocks " + "that belong to other jobs. This can cause " + "verification failures.\n"); + ret |= warnings_fatal; + } + + /* + * Warn if verification is requested but no verification of any + * kind can be started due to time constraints + */ + if (td_write(td) && o->do_verify && o->timeout && + o->time_based && !td_read(td) && !o->verify_backlog) { + log_info("fio: verification read phase will never " + "start because write phase uses all of " + "runtime\n"); + ret |= warnings_fatal; + } + + if (!fio_option_is_set(o, refill_buffers)) + o->refill_buffers = 1; + + if (o->max_bs[DDIR_WRITE] != o->min_bs[DDIR_WRITE] && + !o->verify_interval) + o->verify_interval = o->min_bs[DDIR_WRITE]; + + /* + * Verify interval must be smaller or equal to the + * write size. + */ + if (o->verify_interval > o->min_bs[DDIR_WRITE]) + o->verify_interval = o->min_bs[DDIR_WRITE]; + else if (td_read(td) && o->verify_interval > o->min_bs[DDIR_READ]) + o->verify_interval = o->min_bs[DDIR_READ]; + + /* + * Verify interval must be a factor of both min and max + * write size + */ + if (!o->verify_interval || + (o->min_bs[DDIR_WRITE] % o->verify_interval) || + (o->max_bs[DDIR_WRITE] % o->verify_interval)) + o->verify_interval = gcd(o->min_bs[DDIR_WRITE], + o->max_bs[DDIR_WRITE]); + } + + if (o->pre_read) { + if (o->invalidate_cache) + o->invalidate_cache = 0; + if (td_ioengine_flagged(td, FIO_PIPEIO)) { + log_info("fio: cannot pre-read files with an IO engine" + " that isn't seekable. Pre-read disabled.\n"); + ret |= warnings_fatal; + } + } + + if (o->unit_base == N2S_NONE) { + if (td_ioengine_flagged(td, FIO_BIT_BASED)) + o->unit_base = N2S_BITPERSEC; + else + o->unit_base = N2S_BYTEPERSEC; + } + +#ifndef CONFIG_FDATASYNC + if (o->fdatasync_blocks) { + log_info("fio: this platform does not support fdatasync()" + " falling back to using fsync(). Use the 'fsync'" + " option instead of 'fdatasync' to get rid of" + " this warning\n"); + o->fsync_blocks = o->fdatasync_blocks; + o->fdatasync_blocks = 0; + ret |= warnings_fatal; + } +#endif + +#ifdef WIN32 + /* + * Windows doesn't support O_DIRECT or O_SYNC with the _open interface, + * so fail if we're passed those flags + */ + if (td_ioengine_flagged(td, FIO_SYNCIO) && (o->odirect || o->sync_io)) { + log_err("fio: Windows does not support direct or non-buffered io with" + " the synchronous ioengines. Use the 'windowsaio' ioengine" + " with 'direct=1' and 'iodepth=1' instead.\n"); + ret |= 1; + } +#endif + + /* + * For fully compressible data, just zero them at init time. + * It's faster than repeatedly filling it. For non-zero + * compression, we should have refill_buffers set. Set it, unless + * the job file already changed it. 
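The policy in the comment above reduces to a three-way branch; a minimal sketch with the option fields reduced to plain locals (illustrative, not fio's structures):

/* sketch of the buffer-fill policy described above: fully compressible
 * data degenerates to zeroed buffers; partial compression needs buffers
 * refilled on every write so the requested ratio holds */
#include <stdbool.h>
#include <stdio.h>

int main(void)
{
	unsigned int compress_percentage = 100;
	bool zero_buffers = false, refill_buffers = false;

	if (compress_percentage == 100) {
		zero_buffers = true;		/* all-zero data compresses fully */
		compress_percentage = 0;	/* nothing left to randomize */
	} else if (compress_percentage) {
		refill_buffers = true;		/* keep the ratio per write */
	}

	printf("zero=%d refill=%d\n", zero_buffers, refill_buffers);
	return 0;
}
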
+ */ + if (o->compress_percentage) { + if (o->compress_percentage == 100) { + o->zero_buffers = 1; + o->compress_percentage = 0; + } else if (!fio_option_is_set(o, refill_buffers)) { + o->refill_buffers = 1; + td->flags |= TD_F_REFILL_BUFFERS; + } + } + + /* + * Using a non-uniform random distribution excludes usage of + * a random map + */ + if (o->random_distribution != FIO_RAND_DIST_RANDOM) + o->norandommap = 1; + + /* + * If size is set but less than the min block size, complain + */ + if (o->size && o->size < td_min_bs(td)) { + log_err("fio: size too small, must not be less than minimum block size: %llu < %llu\n", + (unsigned long long) o->size, td_min_bs(td)); + ret |= 1; + } + + /* + * O_ATOMIC implies O_DIRECT + */ + if (o->oatomic) + o->odirect = 1; + + /* + * If randseed is set, that overrides randrepeat + */ + if (fio_option_is_set(o, rand_seed)) + o->rand_repeatable = 0; + + if (td_ioengine_flagged(td, FIO_NOEXTEND) && o->file_append) { + log_err("fio: can't append/extent with IO engine %s\n", td->io_ops->name); + ret |= 1; + } + + if (fio_option_is_set(o, gtod_cpu)) { + fio_gtod_init(); + fio_gtod_set_cpu(o->gtod_cpu); + fio_gtod_offload = 1; + } + + td->loops = o->loops; + if (!td->loops) + td->loops = 1; + + if (o->block_error_hist && o->nr_files != 1) { + log_err("fio: block error histogram only available " + "with a single file per job, but %d files " + "provided\n", o->nr_files); + ret |= 1; + } + + if (o->disable_lat) + o->lat_percentiles = 0; + if (o->disable_clat) + o->clat_percentiles = 0; + if (o->disable_slat) + o->slat_percentiles = 0; + + /* + * Fix these up to be nsec internally + */ + o->max_latency *= 1000ULL; + o->latency_target *= 1000ULL; + o->latency_window *= 1000ULL; + + return ret; +} + +static void init_rand_file_service(struct thread_data *td) +{ + unsigned long nranges = td->o.nr_files << FIO_FSERVICE_SHIFT; + const unsigned int seed = td->rand_seeds[FIO_RAND_FILE_OFF]; + + if (td->o.file_service_type == FIO_FSERVICE_ZIPF) { + zipf_init(&td->next_file_zipf, nranges, td->zipf_theta, seed); + zipf_disable_hash(&td->next_file_zipf); + } else if (td->o.file_service_type == FIO_FSERVICE_PARETO) { + pareto_init(&td->next_file_zipf, nranges, td->pareto_h, seed); + zipf_disable_hash(&td->next_file_zipf); + } else if (td->o.file_service_type == FIO_FSERVICE_GAUSS) { + gauss_init(&td->next_file_gauss, nranges, td->gauss_dev, seed); + gauss_disable_hash(&td->next_file_gauss); + } +} + +void td_fill_verify_state_seed(struct thread_data *td) +{ + bool use64; + + if (td->o.random_generator == FIO_RAND_GEN_TAUSWORTHE64) + use64 = true; + else + use64 = false; + + init_rand_seed(&td->verify_state, td->rand_seeds[FIO_RAND_VER_OFF], + use64); +} + +static void td_fill_rand_seeds_internal(struct thread_data *td, bool use64) +{ + unsigned int read_seed = td->rand_seeds[FIO_RAND_BS_OFF]; + unsigned int write_seed = td->rand_seeds[FIO_RAND_BS1_OFF]; + unsigned int trim_seed = td->rand_seeds[FIO_RAND_BS2_OFF]; + int i; + + /* + * trimwrite is special in that we need to generate the same + * offsets to get the "write after trim" effect. If we are + * using bssplit to set buffer length distributions, ensure that + * we seed the trim and write generators identically. Ditto for + * verify, read and writes must have the same seed, if we are doing + * read verify. 
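The reason the seed sharing below works is simply that two generators started from the same seed replay the same sequence, so the trim pass and the write pass see matching values. A toy LCG stands in for fio's Tausworthe generator here:

/* sketch: identical seeds give identical sequences; toy LCG, not fio's PRNG */
#include <stdio.h>

static unsigned int lcg_next(unsigned int *state)
{
	*state = *state * 1103515245u + 12345u;	/* classic LCG step */
	return *state >> 16;
}

int main(void)
{
	unsigned int trim_state = 0xb1899bed;	/* trim_seed = write_seed */
	unsigned int write_state = 0xb1899bed;
	int i;

	for (i = 0; i < 3; i++)
		printf("%u == %u\n", lcg_next(&trim_state),
		       lcg_next(&write_state));
	return 0;
}
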
+ */ + if (td->o.verify != VERIFY_NONE) + write_seed = read_seed; + if (td_trimwrite(td)) + trim_seed = write_seed; + init_rand_seed(&td->bsrange_state[DDIR_READ], read_seed, use64); + init_rand_seed(&td->bsrange_state[DDIR_WRITE], write_seed, use64); + init_rand_seed(&td->bsrange_state[DDIR_TRIM], trim_seed, use64); + + td_fill_verify_state_seed(td); + init_rand_seed(&td->rwmix_state, td->rand_seeds[FIO_RAND_MIX_OFF], false); + + if (td->o.file_service_type == FIO_FSERVICE_RANDOM) + init_rand_seed(&td->next_file_state, td->rand_seeds[FIO_RAND_FILE_OFF], use64); + else if (td->o.file_service_type & __FIO_FSERVICE_NONUNIFORM) + init_rand_file_service(td); + + init_rand_seed(&td->file_size_state, td->rand_seeds[FIO_RAND_FILE_SIZE_OFF], use64); + init_rand_seed(&td->trim_state, td->rand_seeds[FIO_RAND_TRIM_OFF], use64); + init_rand_seed(&td->delay_state, td->rand_seeds[FIO_RAND_START_DELAY], use64); + init_rand_seed(&td->poisson_state[0], td->rand_seeds[FIO_RAND_POISSON_OFF], 0); + init_rand_seed(&td->poisson_state[1], td->rand_seeds[FIO_RAND_POISSON2_OFF], 0); + init_rand_seed(&td->poisson_state[2], td->rand_seeds[FIO_RAND_POISSON3_OFF], 0); + init_rand_seed(&td->dedupe_state, td->rand_seeds[FIO_DEDUPE_OFF], false); + init_rand_seed(&td->zone_state, td->rand_seeds[FIO_RAND_ZONE_OFF], false); + init_rand_seed(&td->prio_state, td->rand_seeds[FIO_RAND_PRIO_CMDS], false); + + if (!td_random(td)) + return; + + if (td->o.rand_repeatable) + td->rand_seeds[FIO_RAND_BLOCK_OFF] = FIO_RANDSEED * td->thread_number; + + init_rand_seed(&td->random_state, td->rand_seeds[FIO_RAND_BLOCK_OFF], use64); + + for (i = 0; i < DDIR_RWDIR_CNT; i++) { + struct frand_state *s = &td->seq_rand_state[i]; + + init_rand_seed(s, td->rand_seeds[FIO_RAND_SEQ_RAND_READ_OFF], false); + } +} + +void td_fill_rand_seeds(struct thread_data *td) +{ + bool use64; + + if (td->o.allrand_repeatable) { + unsigned int i; + + for (i = 0; i < FIO_RAND_NR_OFFS; i++) + td->rand_seeds[i] = FIO_RANDSEED * td->thread_number + + i; + } + + if (td->o.random_generator == FIO_RAND_GEN_TAUSWORTHE64) + use64 = true; + else + use64 = false; + + td_fill_rand_seeds_internal(td, use64); + + init_rand_seed(&td->buf_state, td->rand_seeds[FIO_RAND_BUF_OFF], use64); + frand_copy(&td->buf_state_prev, &td->buf_state); +} + +/* + * Initializes the ioengine configured for a job, if it has not been done so + * already. + */ +int ioengine_load(struct thread_data *td) +{ + if (!td->o.ioengine) { + log_err("fio: internal fault, no IO engine specified\n"); + return 1; + } + + if (td->io_ops) { + struct ioengine_ops *ops; + void *dlhandle; + + /* An engine is loaded, but the requested ioengine + * may have changed. + */ + if (!strcmp(td->io_ops->name, td->o.ioengine)) { + /* The right engine is already loaded */ + return 0; + } + + /* + * Name of file and engine may be different, load ops + * for this name and see if they match. If they do, then + * the engine is unchanged. + */ + dlhandle = td->io_ops_dlhandle; + ops = load_ioengine(td); + if (ops == td->io_ops && dlhandle == td->io_ops_dlhandle) { + if (dlhandle) + dlclose(dlhandle); + return 0; + } + + if (dlhandle && dlhandle != td->io_ops_dlhandle) + dlclose(dlhandle); + + /* Unload the old engine. */ + free_ioengine(td); + } + + td->io_ops = load_ioengine(td); + if (!td->io_ops) { + log_err("fio: failed to load engine\n"); + return 1; + } + + if (td->io_ops->option_struct_size && td->io_ops->options) { + /* + * In cases where td->eo is set, clone it for a child thread. 
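Stepping back to td_fill_rand_seeds() above: with allrand_repeatable set, every seed is derived as FIO_RANDSEED * thread_number + slot, which is deterministic across runs yet distinct per job. A sketch; the constant is fio's, but the slot count here is made up:

/* sketch of the repeatable per-job seed derivation used above */
#include <stdio.h>

#define FIO_RANDSEED	(0xb1899bedUL)
#define NR_SEED_SLOTS	6	/* stand-in for FIO_RAND_NR_OFFS */

int main(void)
{
	unsigned long seeds[NR_SEED_SLOTS];
	int thread_number = 2, i;	/* fio thread numbers are 1-based */

	for (i = 0; i < NR_SEED_SLOTS; i++) {
		seeds[i] = FIO_RANDSEED * thread_number + i;
		printf("seed[%d] = %#lx\n", i, seeds[i]);
	}
	return 0;
}
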
+ * This requires that the parent thread has the same ioengine, + * but that requirement must be enforced by the code which + * cloned the thread. + */ + void *origeo = td->eo; + /* + * Otherwise use the default thread options. + */ + if (!origeo && td != &def_thread && def_thread.eo && + def_thread.io_ops->options == td->io_ops->options) + origeo = def_thread.eo; + + options_init(td->io_ops->options); + td->eo = malloc(td->io_ops->option_struct_size); + /* + * Use the default thread as an option template if this uses the + * same options structure and there are non-default options + * used. + */ + if (origeo) { + memcpy(td->eo, origeo, td->io_ops->option_struct_size); + options_mem_dupe(td->io_ops->options, td->eo); + } else { + memset(td->eo, 0, td->io_ops->option_struct_size); + fill_default_options(td->eo, td->io_ops->options); + } + *(struct thread_data **)td->eo = td; + } + + if (td->o.odirect) + td->io_ops->flags |= FIO_RAWIO; + + td_set_ioengine_flags(td); + return 0; +} + +static void init_flags(struct thread_data *td) +{ + struct thread_options *o = &td->o; + int i; + + if (o->verify_backlog) + td->flags |= TD_F_VER_BACKLOG; + if (o->trim_backlog) + td->flags |= TD_F_TRIM_BACKLOG; + if (o->read_iolog_file) + td->flags |= TD_F_READ_IOLOG; + if (o->refill_buffers) + td->flags |= TD_F_REFILL_BUFFERS; + /* + * Always scramble buffers if asked to + */ + if (o->scramble_buffers && fio_option_is_set(o, scramble_buffers)) + td->flags |= TD_F_SCRAMBLE_BUFFERS; + /* + * But also scramble buffers, unless we were explicitly asked + * to zero them. + */ + if (o->scramble_buffers && !(o->zero_buffers && + fio_option_is_set(o, zero_buffers))) + td->flags |= TD_F_SCRAMBLE_BUFFERS; + if (o->verify != VERIFY_NONE) + td->flags |= TD_F_DO_VERIFY; + + if (o->verify_async || o->io_submit_mode == IO_MODE_OFFLOAD) + td->flags |= TD_F_NEED_LOCK; + + if (o->mem_type == MEM_CUDA_MALLOC) + td->flags &= ~TD_F_SCRAMBLE_BUFFERS; + + for (i = 0; i < DDIR_RWDIR_CNT; i++) { + if (option_check_rate(td, i)) { + td->flags |= TD_F_CHECK_RATE; + break; + } + } +} + +static int setup_random_seeds(struct thread_data *td) +{ + uint64_t seed; + unsigned int i; + + if (!td->o.rand_repeatable && !fio_option_is_set(&td->o, rand_seed)) { + int ret = init_random_seeds(td->rand_seeds, sizeof(td->rand_seeds)); + if (!ret) + td_fill_rand_seeds(td); + return ret; + } + + seed = td->o.rand_seed; + for (i = 0; i < 4; i++) + seed *= 0x9e370001UL; + + for (i = 0; i < FIO_RAND_NR_OFFS; i++) { + td->rand_seeds[i] = seed * td->thread_number + i; + seed *= 0x9e370001UL; + } + + td_fill_rand_seeds(td); + return 0; +} + +enum { + FPRE_NONE = 0, + FPRE_JOBNAME, + FPRE_JOBNUM, + FPRE_FILENUM +}; + +static struct fpre_keyword { + const char *keyword; + size_t strlen; + int key; +} fpre_keywords[] = { + { .keyword = "$jobname", .key = FPRE_JOBNAME, }, + { .keyword = "$jobnum", .key = FPRE_JOBNUM, }, + { .keyword = "$filenum", .key = FPRE_FILENUM, }, + { .keyword = NULL, }, + }; + +static char *make_filename(char *buf, size_t buf_size,struct thread_options *o, + const char *jobname, int jobnum, int filenum) +{ + struct fpre_keyword *f; + char copy[PATH_MAX]; + size_t dst_left = PATH_MAX - 1; + + if (!o->filename_format || !strlen(o->filename_format)) { + sprintf(buf, "%s.%d.%d", jobname, jobnum, filenum); + return buf; + } + + for (f = &fpre_keywords[0]; f->keyword; f++) + f->strlen = strlen(f->keyword); + + snprintf(buf, buf_size, "%s", o->filename_format); + + memset(copy, 0, sizeof(copy)); + for (f = &fpre_keywords[0]; f->keyword; f++) { + do 
{ + size_t pre_len, post_start = 0; + char *str, *dst = copy; + + str = strcasestr(buf, f->keyword); + if (!str) + break; + + pre_len = str - buf; + if (strlen(str) != f->strlen) + post_start = pre_len + f->strlen; + + if (pre_len) { + strncpy(dst, buf, pre_len); + dst += pre_len; + dst_left -= pre_len; + } + + switch (f->key) { + case FPRE_JOBNAME: { + int ret; + + ret = snprintf(dst, dst_left, "%s", jobname); + if (ret < 0) + break; + else if (ret > dst_left) { + log_err("fio: truncated filename\n"); + dst += dst_left; + dst_left = 0; + } else { + dst += ret; + dst_left -= ret; + } + break; + } + case FPRE_JOBNUM: { + int ret; + + ret = snprintf(dst, dst_left, "%d", jobnum); + if (ret < 0) + break; + else if (ret > dst_left) { + log_err("fio: truncated filename\n"); + dst += dst_left; + dst_left = 0; + } else { + dst += ret; + dst_left -= ret; + } + break; + } + case FPRE_FILENUM: { + int ret; + + ret = snprintf(dst, dst_left, "%d", filenum); + if (ret < 0) + break; + else if (ret > dst_left) { + log_err("fio: truncated filename\n"); + dst += dst_left; + dst_left = 0; + } else { + dst += ret; + dst_left -= ret; + } + break; + } + default: + assert(0); + break; + } + + if (post_start) + strncpy(dst, buf + post_start, dst_left); + + snprintf(buf, buf_size, "%s", copy); + } while (1); + } + + return buf; +} + +bool parse_dryrun(void) +{ + return dump_cmdline || parse_only; +} + +static void gen_log_name(char *name, size_t size, const char *logtype, + const char *logname, unsigned int num, + const char *suf, int per_job) +{ + if (per_job) + snprintf(name, size, "%s_%s.%d.%s", logname, logtype, num, suf); + else + snprintf(name, size, "%s_%s.%s", logname, logtype, suf); +} + +static int check_waitees(char *waitee) +{ + struct thread_data *td; + int i, ret = 0; + + for_each_td(td, i) { + if (td->subjob_number) + continue; + + ret += !strcmp(td->o.name, waitee); + } + + return ret; +} + +static bool wait_for_ok(const char *jobname, struct thread_options *o) +{ + int nw; + + if (!o->wait_for) + return true; + + if (!strcmp(jobname, o->wait_for)) { + log_err("%s: a job cannot wait for itself (wait_for=%s).\n", + jobname, o->wait_for); + return false; + } + + if (!(nw = check_waitees(o->wait_for))) { + log_err("%s: waitee job %s unknown.\n", jobname, o->wait_for); + return false; + } + + if (nw > 1) { + log_err("%s: multiple waitees %s found,\n" + "please avoid duplicates when using wait_for option.\n", + jobname, o->wait_for); + return false; + } + + return true; +} + +/* + * Treat an empty log file name the same as a one not given + */ +static const char *make_log_name(const char *logname, const char *jobname) +{ + if (logname && strcmp(logname, "")) + return logname; + + return jobname; +} + +/* + * Adds a job to the list of things todo. Sanitizes the various options + * to make sure we don't have conflicts, and initializes various + * members of td. 
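make_filename() above expands $jobname, $jobnum and $filenum inside filename_format; with the default-style format the result matches the sprintf fallback shown at the top of that function. A trivial sketch that ignores the overlap and truncation handling the real code does:

/* worked example of the $keyword expansion performed by make_filename() */
#include <stdio.h>

int main(void)
{
	const char *jobname = "randwrite";
	int jobnum = 2, filenum = 0;
	char buf[128];

	/* filename_format=$jobname.$jobnum.$filenum expands to: */
	snprintf(buf, sizeof(buf), "%s.%d.%d", jobname, jobnum, filenum);
	printf("%s\n", buf);	/* -> randwrite.2.0 */
	return 0;
}
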
+ */ +static int add_job(struct thread_data *td, const char *jobname, int job_add_num, + int recursed, int client_type) +{ + unsigned int i; + char fname[PATH_MAX + 1]; + int numjobs, file_alloced; + struct thread_options *o = &td->o; + char logname[PATH_MAX + 32]; + + /* + * the def_thread is just for options, it's not a real job + */ + if (td == &def_thread) + return 0; + + init_flags(td); + + /* + * if we are just dumping the output command line, don't add the job + */ + if (parse_dryrun()) { + put_job(td); + return 0; + } + + td->client_type = client_type; + + if (profile_td_init(td)) + goto err; + + if (ioengine_load(td)) + goto err; + + file_alloced = 0; + if (!o->filename && !td->files_index && !o->read_iolog_file) { + file_alloced = 1; + + if (o->nr_files == 1 && exists_and_not_regfile(jobname)) + add_file(td, jobname, job_add_num, 0); + else { + for (i = 0; i < o->nr_files; i++) + add_file(td, make_filename(fname, sizeof(fname), o, jobname, job_add_num, i), job_add_num, 0); + } + } + + if (setup_random_seeds(td)) { + td_verror(td, errno, "setup_random_seeds"); + goto err; + } + + if (fixup_options(td)) + goto err; + + /* + * Belongs to fixup_options, but o->name is not necessarily set as yet + */ + if (!wait_for_ok(jobname, o)) + goto err; + + flow_init_job(td); + + /* + * IO engines only need this for option callbacks, and the address may + * change in subprocesses. + */ + if (td->eo) + *(struct thread_data **)td->eo = NULL; + + if (td_ioengine_flagged(td, FIO_DISKLESSIO)) { + struct fio_file *f; + + for_each_file(td, f, i) + f->real_file_size = -1ULL; + } + + td->sem = fio_sem_init(FIO_SEM_LOCKED); + + td->ts.clat_percentiles = o->clat_percentiles; + td->ts.lat_percentiles = o->lat_percentiles; + td->ts.slat_percentiles = o->slat_percentiles; + td->ts.percentile_precision = o->percentile_precision; + memcpy(td->ts.percentile_list, o->percentile_list, sizeof(o->percentile_list)); + td->ts.sig_figs = o->sig_figs; + + for (i = 0; i < DDIR_RWDIR_CNT; i++) { + td->ts.clat_stat[i].min_val = ULONG_MAX; + td->ts.slat_stat[i].min_val = ULONG_MAX; + td->ts.lat_stat[i].min_val = ULONG_MAX; + td->ts.bw_stat[i].min_val = ULONG_MAX; + td->ts.iops_stat[i].min_val = ULONG_MAX; + td->ts.clat_high_prio_stat[i].min_val = ULONG_MAX; + td->ts.clat_low_prio_stat[i].min_val = ULONG_MAX; + } + td->ts.sync_stat.min_val = ULONG_MAX; + td->ddir_seq_nr = o->ddir_seq_nr; + + if ((o->stonewall || o->new_group) && prev_group_jobs) { + prev_group_jobs = 0; + groupid++; + if (groupid == INT_MAX) { + log_err("fio: too many groups defined\n"); + goto err; + } + } + + td->groupid = groupid; + prev_group_jobs++; + + if (setup_rate(td)) + goto err; + + if (o->write_lat_log) { + struct log_params p = { + .td = td, + .avg_msec = o->log_avg_msec, + .hist_msec = o->log_hist_msec, + .hist_coarseness = o->log_hist_coarseness, + .log_type = IO_LOG_TYPE_LAT, + .log_offset = o->log_offset, + .log_gz = o->log_gz, + .log_gz_store = o->log_gz_store, + }; + const char *pre = make_log_name(o->lat_log_file, o->name); + const char *suf; + + if (p.log_gz_store) + suf = "log.fz"; + else + suf = "log"; + + gen_log_name(logname, sizeof(logname), "lat", pre, + td->thread_number, suf, o->per_job_logs); + setup_log(&td->lat_log, &p, logname); + + gen_log_name(logname, sizeof(logname), "slat", pre, + td->thread_number, suf, o->per_job_logs); + setup_log(&td->slat_log, &p, logname); + + gen_log_name(logname, sizeof(logname), "clat", pre, + td->thread_number, suf, o->per_job_logs); + setup_log(&td->clat_log, &p, logname); + + } + + if 
(o->write_hist_log) { + struct log_params p = { + .td = td, + .avg_msec = o->log_avg_msec, + .hist_msec = o->log_hist_msec, + .hist_coarseness = o->log_hist_coarseness, + .log_type = IO_LOG_TYPE_HIST, + .log_offset = o->log_offset, + .log_gz = o->log_gz, + .log_gz_store = o->log_gz_store, + }; + const char *pre = make_log_name(o->hist_log_file, o->name); + const char *suf; + +#ifndef CONFIG_ZLIB + if (td->client_type) { + log_err("fio: --write_hist_log requires zlib in client/server mode\n"); + goto err; + } +#endif + + if (p.log_gz_store) + suf = "log.fz"; + else + suf = "log"; + + gen_log_name(logname, sizeof(logname), "clat_hist", pre, + td->thread_number, suf, o->per_job_logs); + setup_log(&td->clat_hist_log, &p, logname); + } + + if (o->write_bw_log) { + struct log_params p = { + .td = td, + .avg_msec = o->log_avg_msec, + .hist_msec = o->log_hist_msec, + .hist_coarseness = o->log_hist_coarseness, + .log_type = IO_LOG_TYPE_BW, + .log_offset = o->log_offset, + .log_gz = o->log_gz, + .log_gz_store = o->log_gz_store, + }; + const char *pre = make_log_name(o->bw_log_file, o->name); + const char *suf; + + if (fio_option_is_set(o, bw_avg_time)) + p.avg_msec = min(o->log_avg_msec, o->bw_avg_time); + else + o->bw_avg_time = p.avg_msec; + + p.hist_msec = o->log_hist_msec; + p.hist_coarseness = o->log_hist_coarseness; + + if (p.log_gz_store) + suf = "log.fz"; + else + suf = "log"; + + gen_log_name(logname, sizeof(logname), "bw", pre, + td->thread_number, suf, o->per_job_logs); + setup_log(&td->bw_log, &p, logname); + } + if (o->write_iops_log) { + struct log_params p = { + .td = td, + .avg_msec = o->log_avg_msec, + .hist_msec = o->log_hist_msec, + .hist_coarseness = o->log_hist_coarseness, + .log_type = IO_LOG_TYPE_IOPS, + .log_offset = o->log_offset, + .log_gz = o->log_gz, + .log_gz_store = o->log_gz_store, + }; + const char *pre = make_log_name(o->iops_log_file, o->name); + const char *suf; + + if (fio_option_is_set(o, iops_avg_time)) + p.avg_msec = min(o->log_avg_msec, o->iops_avg_time); + else + o->iops_avg_time = p.avg_msec; + + p.hist_msec = o->log_hist_msec; + p.hist_coarseness = o->log_hist_coarseness; + + if (p.log_gz_store) + suf = "log.fz"; + else + suf = "log"; + + gen_log_name(logname, sizeof(logname), "iops", pre, + td->thread_number, suf, o->per_job_logs); + setup_log(&td->iops_log, &p, logname); + } + + if (!o->name) + o->name = strdup(jobname); + + if (output_format & FIO_OUTPUT_NORMAL) { + if (!job_add_num) { + if (is_backend && !recursed) + fio_server_send_add_job(td); + + if (!td_ioengine_flagged(td, FIO_NOIO)) { + char *c1, *c2, *c3, *c4; + char *c5 = NULL, *c6 = NULL; + int i2p = is_power_of_2(o->kb_base); + struct buf_output out; + + c1 = num2str(o->min_bs[DDIR_READ], o->sig_figs, 1, i2p, N2S_BYTE); + c2 = num2str(o->max_bs[DDIR_READ], o->sig_figs, 1, i2p, N2S_BYTE); + c3 = num2str(o->min_bs[DDIR_WRITE], o->sig_figs, 1, i2p, N2S_BYTE); + c4 = num2str(o->max_bs[DDIR_WRITE], o->sig_figs, 1, i2p, N2S_BYTE); + + if (!o->bs_is_seq_rand) { + c5 = num2str(o->min_bs[DDIR_TRIM], o->sig_figs, 1, i2p, N2S_BYTE); + c6 = num2str(o->max_bs[DDIR_TRIM], o->sig_figs, 1, i2p, N2S_BYTE); + } + + buf_output_init(&out); + __log_buf(&out, "%s: (g=%d): rw=%s, ", td->o.name, + td->groupid, + ddir_str(o->td_ddir)); + + if (o->bs_is_seq_rand) + __log_buf(&out, "bs=(R) %s-%s, (W) %s-%s, bs_is_seq_rand, ", + c1, c2, c3, c4); + else + __log_buf(&out, "bs=(R) %s-%s, (W) %s-%s, (T) %s-%s, ", + c1, c2, c3, c4, c5, c6); + + __log_buf(&out, "ioengine=%s, iodepth=%u\n", + td->io_ops->name, o->iodepth); + 
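Each log enabled above gets its file name from gen_log_name(); a short illustration of the two naming schemes it implements, using a made-up log name:

/* illustration of gen_log_name(): per-job names embed the thread number */
#include <stdio.h>

int main(void)
{
	char name[64];

	/* per_job_logs: <logname>_<type>.<thread_number>.<suffix> */
	snprintf(name, sizeof(name), "%s_%s.%d.%s", "mylog", "bw", 3, "log");
	printf("%s\n", name);	/* mylog_bw.3.log */

	/* aggregate: <logname>_<type>.<suffix> */
	snprintf(name, sizeof(name), "%s_%s.%s", "mylog", "bw", "log");
	printf("%s\n", name);	/* mylog_bw.log */
	return 0;
}
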
log_info_buf(out.buf, out.buflen); + buf_output_free(&out); + + free(c1); + free(c2); + free(c3); + free(c4); + free(c5); + free(c6); + } + } else if (job_add_num == 1) + log_info("...\n"); + } + + if (td_steadystate_init(td)) + goto err; + + if (o->merge_blktrace_file && !merge_blktrace_iologs(td)) + goto err; + + if (merge_blktrace_only) { + put_job(td); + return 0; + } + + /* + * recurse add identical jobs, clear numjobs and stonewall options + * as they don't apply to sub-jobs + */ + numjobs = o->numjobs; + while (--numjobs) { + struct thread_data *td_new = get_new_job(false, td, true, jobname); + + if (!td_new) + goto err; + + td_new->o.numjobs = 1; + td_new->o.stonewall = 0; + td_new->o.new_group = 0; + td_new->subjob_number = numjobs; + td_new->o.ss_dur = o->ss_dur * 1000000l; + td_new->o.ss_limit = o->ss_limit; + + if (file_alloced) { + if (td_new->files) { + struct fio_file *f; + for_each_file(td_new, f, i) { + if (f->file_name) + sfree(f->file_name); + sfree(f); + } + free(td_new->files); + td_new->files = NULL; + } + td_new->files_index = 0; + td_new->files_size = 0; + if (td_new->o.filename) { + free(td_new->o.filename); + td_new->o.filename = NULL; + } + } + + if (add_job(td_new, jobname, numjobs, 1, client_type)) + goto err; + } + + return 0; +err: + put_job(td); + return -1; +} + +/* + * Parse as if 'o' was a command line + */ +void add_job_opts(const char **o, int client_type) +{ + struct thread_data *td, *td_parent; + int i, in_global = 1; + char jobname[32]; + + i = 0; + td_parent = td = NULL; + while (o[i]) { + if (!strncmp(o[i], "name", 4)) { + in_global = 0; + if (td) + add_job(td, jobname, 0, 0, client_type); + td = NULL; + sprintf(jobname, "%s", o[i] + 5); + } + if (in_global && !td_parent) + td_parent = get_new_job(true, &def_thread, false, jobname); + else if (!in_global && !td) { + if (!td_parent) + td_parent = &def_thread; + td = get_new_job(false, td_parent, false, jobname); + } + if (in_global) + fio_options_parse(td_parent, (char **) &o[i], 1); + else + fio_options_parse(td, (char **) &o[i], 1); + i++; + } + + if (td) + add_job(td, jobname, 0, 0, client_type); +} + +static int skip_this_section(const char *name) +{ + int i; + + if (!nr_job_sections) + return 0; + if (!strncmp(name, "global", 6)) + return 0; + + for (i = 0; i < nr_job_sections; i++) + if (!strcmp(job_sections[i], name)) + return 0; + + return 1; +} + +static int is_empty_or_comment(char *line) +{ + unsigned int i; + + for (i = 0; i < strlen(line); i++) { + if (line[i] == ';') + return 1; + if (line[i] == '#') + return 1; + if (!isspace((int) line[i]) && !iscntrl((int) line[i])) + return 0; + } + + return 1; +} + +/* + * This is our [ini] type file parser. 
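+ * Sections are introduced by a "[name]" line; the option lines that
+ * follow belong to that section until the next one begins. Lines whose
+ * first non-blank character is ';' or '#' are treated as comments, and
+ * an "include filename" line pulls in another file (included files may
+ * themselves include further files, but may not open new sections).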
+ */ +static int __parse_jobs_ini(struct thread_data *td, + char *file, int is_buf, int stonewall_flag, int type, + int nested, char *name, char ***popts, int *aopts, int *nopts) +{ + bool global = false; + char *string; + FILE *f; + char *p; + int ret = 0, stonewall; + int first_sect = 1; + int skip_fgets = 0; + int inside_skip = 0; + char **opts; + int i, alloc_opts, num_opts; + + dprint(FD_PARSE, "Parsing ini file %s\n", file); + assert(td || !nested); + + if (is_buf) + f = NULL; + else { + if (!strcmp(file, "-")) + f = stdin; + else + f = fopen(file, "r"); + + if (!f) { + int __err = errno; + + log_err("fio: unable to open '%s' job file\n", file); + if (td) + td_verror(td, __err, "job file open"); + return 1; + } + } + + string = malloc(OPT_LEN_MAX); + + /* + * it's really 256 + small bit, 280 should suffice + */ + if (!nested) { + name = malloc(280); + memset(name, 0, 280); + } + + opts = NULL; + if (nested && popts) { + opts = *popts; + alloc_opts = *aopts; + num_opts = *nopts; + } + + if (!opts) { + alloc_opts = 8; + opts = malloc(sizeof(char *) * alloc_opts); + num_opts = 0; + } + + stonewall = stonewall_flag; + do { + /* + * if skip_fgets is set, we already have loaded a line we + * haven't handled. + */ + if (!skip_fgets) { + if (is_buf) + p = strsep(&file, "\n"); + else + p = fgets(string, OPT_LEN_MAX, f); + if (!p) + break; + } + + skip_fgets = 0; + strip_blank_front(&p); + strip_blank_end(p); + + dprint(FD_PARSE, "%s\n", p); + if (is_empty_or_comment(p)) + continue; + + if (!nested) { + if (sscanf(p, "[%255[^\n]]", name) != 1) { + if (inside_skip) + continue; + + log_err("fio: option <%s> outside of " + "[] job section\n", p); + ret = 1; + break; + } + + name[strlen(name) - 1] = '\0'; + + if (skip_this_section(name)) { + inside_skip = 1; + continue; + } else + inside_skip = 0; + + dprint(FD_PARSE, "Parsing section [%s]\n", name); + + global = !strncmp(name, "global", 6); + + if (dump_cmdline) { + if (first_sect) + log_info("fio "); + if (!global) + log_info("--name=%s ", name); + first_sect = 0; + } + + td = get_new_job(global, &def_thread, false, name); + if (!td) { + ret = 1; + break; + } + + /* + * Separate multiple job files by a stonewall + */ + if (!global && stonewall) { + td->o.stonewall = stonewall; + stonewall = 0; + } + + num_opts = 0; + memset(opts, 0, alloc_opts * sizeof(char *)); + } + else + skip_fgets = 1; + + while (1) { + if (!skip_fgets) { + if (is_buf) + p = strsep(&file, "\n"); + else + p = fgets(string, OPT_LEN_MAX, f); + if (!p) + break; + dprint(FD_PARSE, "%s", p); + } + else + skip_fgets = 0; + + if (is_empty_or_comment(p)) + continue; + + strip_blank_front(&p); + + /* + * new section, break out and make sure we don't + * fgets() a new line at the top. + */ + if (p[0] == '[') { + if (nested) { + log_err("No new sections in included files\n"); + ret = 1; + goto out; + } + + skip_fgets = 1; + break; + } + + strip_blank_end(p); + + if (!strncmp(p, "include", strlen("include"))) { + char *filename = p + strlen("include") + 1, + *ts, *full_fn = NULL; + + /* + * Allow for the include filename + * specification to be relative. 
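+				 * If the filename isn't reachable as given,
+				 * it is resolved against the directory of
+				 * the job file being parsed: a file
+				 * "jobs/test.fio" containing "include
+				 * extra.ini" makes the parser try
+				 * "jobs/extra.ini".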
+				 */
+				if (access(filename, F_OK) &&
+				    (ts = strrchr(file, '/'))) {
+					if (asprintf(&full_fn, "%.*s%s",
+						 (int)(ts - file + 1), file,
+						 filename) < 0) {
+						ret = ENOMEM;
+						break;
+					}
+					filename = full_fn;
+				}
+
+				ret = __parse_jobs_ini(td, filename, is_buf,
+							stonewall_flag, type, 1,
+							name, &opts,
+							&alloc_opts, &num_opts);
+
+				if (ret) {
+					log_err("Error %d while parsing "
+						"include file %s\n",
+						ret, filename);
+				}
+
+				if (full_fn)
+					free(full_fn);
+
+				if (ret)
+					break;
+
+				continue;
+			}
+
+			if (num_opts == alloc_opts) {
+				alloc_opts <<= 1;
+				opts = realloc(opts,
+						alloc_opts * sizeof(char *));
+			}
+
+			opts[num_opts] = strdup(p);
+			num_opts++;
+		}
+
+		if (nested) {
+			*popts = opts;
+			*aopts = alloc_opts;
+			*nopts = num_opts;
+			goto out;
+		}
+
+		ret = fio_options_parse(td, opts, num_opts);
+		if (!ret) {
+			if (dump_cmdline)
+				dump_opt_list(td);
+
+			ret = add_job(td, name, 0, 0, type);
+		} else {
+			log_err("fio: job %s dropped\n", name);
+			put_job(td);
+		}
+
+		for (i = 0; i < num_opts; i++)
+			free(opts[i]);
+		num_opts = 0;
+	} while (!ret);
+
+	if (dump_cmdline)
+		log_info("\n");
+
+	i = 0;
+	while (i < nr_job_sections) {
+		free(job_sections[i]);
+		i++;
+	}
+
+	free(opts);
+out:
+	free(string);
+	if (!nested)
+		free(name);
+	if (!is_buf && f != stdin)
+		fclose(f);
+	return ret;
+}
+
+int parse_jobs_ini(char *file, int is_buf, int stonewall_flag, int type)
+{
+	return __parse_jobs_ini(NULL, file, is_buf, stonewall_flag, type,
+			0, NULL, NULL, NULL, NULL);
+}
+
+static int fill_def_thread(void)
+{
+	memset(&def_thread, 0, sizeof(def_thread));
+	INIT_FLIST_HEAD(&def_thread.opt_list);
+
+	fio_getaffinity(getpid(), &def_thread.o.cpumask);
+	def_thread.o.error_dump = 1;
+
+	/*
+	 * fill default options
+	 */
+	fio_fill_default_options(&def_thread);
+	return 0;
+}
+
+static void show_debug_categories(void)
+{
+#ifdef FIO_INC_DEBUG
+	const struct debug_level *dl = &debug_levels[0];
+	int curlen, first = 1;
+
+	curlen = 0;
+	while (dl->name) {
+		int has_next = (dl + 1)->name != NULL;
+
+		if (first || curlen + strlen(dl->name) >= 80) {
+			if (!first) {
+				printf("\n");
+				curlen = 0;
+			}
+			curlen += printf("\t\t\t%s", dl->name);
+			curlen += 3 * (8 - 1);
+			if (has_next)
+				curlen += printf(",");
+		} else {
+			curlen += printf("%s", dl->name);
+			if (has_next)
+				curlen += printf(",");
+		}
+		dl++;
+		first = 0;
+	}
+	printf("\n");
+#endif
+}
+
+/*
+ * Following options aren't printed by usage().
+ * --append-terse - Equivalent to --output-format=terse, see f6a7df53.
+ * --latency-log - Deprecated option.
+ */
+static void usage(const char *name)
+{
+	printf("%s\n", fio_version_string);
+	printf("%s [options] [job options] <job file(s)>\n", name);
+	printf(" --debug=options\tEnable debug logging. 
May be one/more of:\n"); + show_debug_categories(); + printf(" --parse-only\t\tParse options only, don't start any IO\n"); + printf(" --merge-blktrace-only\tMerge blktraces only, don't start any IO\n"); + printf(" --output\t\tWrite output to file\n"); + printf(" --bandwidth-log\tGenerate aggregate bandwidth logs\n"); + printf(" --minimal\t\tMinimal (terse) output\n"); + printf(" --output-format=type\tOutput format (terse,json,json+,normal)\n"); + printf(" --terse-version=type\tSet terse version output format" + " (default 3, or 2 or 4)\n"); + printf(" --version\t\tPrint version info and exit\n"); + printf(" --help\t\tPrint this page\n"); + printf(" --cpuclock-test\tPerform test/validation of CPU clock\n"); + printf(" --crctest=[type]\tTest speed of checksum functions\n"); + printf(" --cmdhelp=cmd\t\tPrint command help, \"all\" for all of" + " them\n"); + printf(" --enghelp=engine\tPrint ioengine help, or list" + " available ioengines\n"); + printf(" --enghelp=engine,cmd\tPrint help for an ioengine" + " cmd\n"); + printf(" --showcmd\t\tTurn a job file into command line options\n"); + printf(" --eta=when\t\tWhen ETA estimate should be printed\n"); + printf(" \t\tMay be \"always\", \"never\" or \"auto\"\n"); + printf(" --eta-newline=t\tForce a new line for every 't'"); + printf(" period passed\n"); + printf(" --status-interval=t\tForce full status dump every"); + printf(" 't' period passed\n"); + printf(" --readonly\t\tTurn on safety read-only checks, preventing" + " writes\n"); + printf(" --section=name\tOnly run specified section in job file," + " multiple sections can be specified\n"); + printf(" --alloc-size=kb\tSet smalloc pool to this size in kb" + " (def 16384)\n"); + printf(" --warnings-fatal\tFio parser warnings are fatal\n"); + printf(" --max-jobs=nr\t\tMaximum number of threads/processes to support\n"); + printf(" --server=args\t\tStart a backend fio server\n"); + printf(" --daemonize=pidfile\tBackground fio server, write pid to file\n"); + printf(" --client=hostname\tTalk to remote backend(s) fio server at hostname\n"); + printf(" --remote-config=file\tTell fio server to load this local job file\n"); + printf(" --idle-prof=option\tReport cpu idleness on a system or percpu basis\n" + "\t\t\t(option=system,percpu) or run unit work\n" + "\t\t\tcalibration only (option=calibrate)\n"); +#ifdef CONFIG_ZLIB + printf(" --inflate-log=log\tInflate and output compressed log\n"); +#endif + printf(" --trigger-file=file\tExecute trigger cmd when file exists\n"); + printf(" --trigger-timeout=t\tExecute trigger at this time\n"); + printf(" --trigger=cmd\t\tSet this command as local trigger\n"); + printf(" --trigger-remote=cmd\tSet this command as remote trigger\n"); + printf(" --aux-path=path\tUse this path for fio state generated files\n"); + printf("\nFio was written by Jens Axboe \n"); +} + +#ifdef FIO_INC_DEBUG +const struct debug_level debug_levels[] = { + { .name = "process", + .help = "Process creation/exit logging", + .shift = FD_PROCESS, + }, + { .name = "file", + .help = "File related action logging", + .shift = FD_FILE, + }, + { .name = "io", + .help = "IO and IO engine action logging (offsets, queue, completions, etc)", + .shift = FD_IO, + }, + { .name = "mem", + .help = "Memory allocation/freeing logging", + .shift = FD_MEM, + }, + { .name = "blktrace", + .help = "blktrace action logging", + .shift = FD_BLKTRACE, + }, + { .name = "verify", + .help = "IO verification action logging", + .shift = FD_VERIFY, + }, + { .name = "random", + .help = "Random generation logging", + .shift = 
FD_RANDOM, + }, + { .name = "parse", + .help = "Parser logging", + .shift = FD_PARSE, + }, + { .name = "diskutil", + .help = "Disk utility logging actions", + .shift = FD_DISKUTIL, + }, + { .name = "job", + .help = "Logging related to creating/destroying jobs", + .shift = FD_JOB, + }, + { .name = "mutex", + .help = "Mutex logging", + .shift = FD_MUTEX + }, + { .name = "profile", + .help = "Logging related to profiles", + .shift = FD_PROFILE, + }, + { .name = "time", + .help = "Logging related to time keeping functions", + .shift = FD_TIME, + }, + { .name = "net", + .help = "Network logging", + .shift = FD_NET, + }, + { .name = "rate", + .help = "Rate logging", + .shift = FD_RATE, + }, + { .name = "compress", + .help = "Log compression logging", + .shift = FD_COMPRESS, + }, + { .name = "steadystate", + .help = "Steady state detection logging", + .shift = FD_STEADYSTATE, + }, + { .name = "helperthread", + .help = "Helper thread logging", + .shift = FD_HELPERTHREAD, + }, + { .name = "zbd", + .help = "Zoned Block Device logging", + .shift = FD_ZBD, + }, + { .name = NULL, }, +}; + +static int set_debug(const char *string) +{ + const struct debug_level *dl; + char *p = (char *) string; + char *opt; + int i; + + if (!string) + return 0; + + if (!strcmp(string, "?") || !strcmp(string, "help")) { + log_info("fio: dumping debug options:"); + for (i = 0; debug_levels[i].name; i++) { + dl = &debug_levels[i]; + log_info("%s,", dl->name); + } + log_info("all\n"); + return 1; + } + + while ((opt = strsep(&p, ",")) != NULL) { + int found = 0; + + if (!strncmp(opt, "all", 3)) { + log_info("fio: set all debug options\n"); + fio_debug = ~0UL; + continue; + } + + for (i = 0; debug_levels[i].name; i++) { + dl = &debug_levels[i]; + found = !strncmp(opt, dl->name, strlen(dl->name)); + if (!found) + continue; + + if (dl->shift == FD_JOB) { + opt = strchr(opt, ':'); + if (!opt) { + log_err("fio: missing job number\n"); + break; + } + opt++; + fio_debug_jobno = atoi(opt); + log_info("fio: set debug jobno %d\n", + fio_debug_jobno); + } else { + log_info("fio: set debug option %s\n", opt); + fio_debug |= (1UL << dl->shift); + } + break; + } + + if (!found) + log_err("fio: debug mask %s not found\n", opt); + } + return 0; +} +#else +static int set_debug(const char *string) +{ + log_err("fio: debug tracing not included in build\n"); + return 1; +} +#endif + +static void fio_options_fill_optstring(void) +{ + char *ostr = cmd_optstr; + int i, c; + + c = i = 0; + while (l_opts[i].name) { + ostr[c++] = l_opts[i].val; + if (l_opts[i].has_arg == required_argument) + ostr[c++] = ':'; + else if (l_opts[i].has_arg == optional_argument) { + ostr[c++] = ':'; + ostr[c++] = ':'; + } + i++; + } + ostr[c] = '\0'; +} + +static int client_flag_set(char c) +{ + int i; + + i = 0; + while (l_opts[i].name) { + int val = l_opts[i].val; + + if (c == (val & 0xff)) + return (val & FIO_CLIENT_FLAG); + + i++; + } + + return 0; +} + +static void parse_cmd_client(void *client, char *opt) +{ + fio_client_add_cmd_option(client, opt); +} + +static void show_closest_option(const char *name) +{ + int best_option, best_distance; + int i, distance; + + while (*name == '-') + name++; + + best_option = -1; + best_distance = INT_MAX; + i = 0; + while (l_opts[i].name) { + distance = string_distance(name, l_opts[i].name); + if (distance < best_distance) { + best_distance = distance; + best_option = i; + } + i++; + } + + if (best_option != -1 && string_distance_ok(name, best_distance)) + log_err("Did you mean %s?\n", l_opts[best_option].name); +} + +static int 
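+/*
+ * Splits a comma-separated --output-format value such as "json+,normal".
+ * "minimal", "terse" and "csv" are synonyms for terse output, and "json+"
+ * selects JSON output extended with the complete latency buckets.
+ */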
parse_output_format(const char *optarg) +{ + char *p, *orig, *opt; + int ret = 0; + + p = orig = strdup(optarg); + + output_format = 0; + + while ((opt = strsep(&p, ",")) != NULL) { + if (!strcmp(opt, "minimal") || + !strcmp(opt, "terse") || + !strcmp(opt, "csv")) + output_format |= FIO_OUTPUT_TERSE; + else if (!strcmp(opt, "json")) + output_format |= FIO_OUTPUT_JSON; + else if (!strcmp(opt, "json+")) + output_format |= (FIO_OUTPUT_JSON | FIO_OUTPUT_JSON_PLUS); + else if (!strcmp(opt, "normal")) + output_format |= FIO_OUTPUT_NORMAL; + else { + log_err("fio: invalid output format %s\n", opt); + ret = 1; + break; + } + } + + free(orig); + return ret; +} + +int parse_cmd_line(int argc, char *argv[], int client_type) +{ + struct thread_data *td = NULL; + int c, ini_idx = 0, lidx, ret = 0, do_exit = 0, exit_val = 0; + char *ostr = cmd_optstr; + char *pid_file = NULL; + void *cur_client = NULL; + bool backend = false; + + /* + * Reset optind handling, since we may call this multiple times + * for the backend. + */ + optind = 1; + + while ((c = getopt_long_only(argc, argv, ostr, l_opts, &lidx)) != -1) { + if ((c & FIO_CLIENT_FLAG) || client_flag_set(c)) { + parse_cmd_client(cur_client, argv[optind - 1]); + c &= ~FIO_CLIENT_FLAG; + } + + switch (c) { + case 'a': + smalloc_pool_size = atoi(optarg); + smalloc_pool_size <<= 10; + sinit(); + break; + case 'l': + log_err("fio: --latency-log is deprecated. Use per-job latency log options.\n"); + do_exit++; + exit_val = 1; + break; + case 'b': + write_bw_log = true; + break; + case 'o': { + FILE *tmp; + + if (f_out && f_out != stdout) + fclose(f_out); + + tmp = fopen(optarg, "w+"); + if (!tmp) { + log_err("fio: output file open error: %s\n", strerror(errno)); + exit_val = 1; + do_exit++; + break; + } + f_err = f_out = tmp; + break; + } + case 'm': + output_format = FIO_OUTPUT_TERSE; + break; + case 'F': + if (parse_output_format(optarg)) { + log_err("fio: failed parsing output-format\n"); + exit_val = 1; + do_exit++; + break; + } + break; + case 'f': + output_format |= FIO_OUTPUT_TERSE; + break; + case 'h': + did_arg = true; + if (!cur_client) { + usage(argv[0]); + do_exit++; + } + break; + case 'c': + did_arg = true; + if (!cur_client) { + fio_show_option_help(optarg); + do_exit++; + } + break; + case 'i': + did_arg = true; + if (!cur_client) { + fio_show_ioengine_help(optarg); + do_exit++; + } + break; + case 's': + did_arg = true; + dump_cmdline = true; + break; + case 'r': + read_only = 1; + break; + case 'v': + did_arg = true; + if (!cur_client) { + log_info("%s\n", fio_version_string); + do_exit++; + } + break; + case 'V': + terse_version = atoi(optarg); + if (!(terse_version >= 2 && terse_version <= 5)) { + log_err("fio: bad terse version format\n"); + exit_val = 1; + do_exit++; + } + break; + case 'e': + if (!strcmp("always", optarg)) + eta_print = FIO_ETA_ALWAYS; + else if (!strcmp("never", optarg)) + eta_print = FIO_ETA_NEVER; + break; + case 'E': { + long long t = 0; + + if (check_str_time(optarg, &t, 1)) { + log_err("fio: failed parsing eta time %s\n", optarg); + exit_val = 1; + do_exit++; + break; + } + eta_new_line = t / 1000; + if (!eta_new_line) { + log_err("fio: eta new line time too short\n"); + exit_val = 1; + do_exit++; + } + break; + } + case 'O': { + long long t = 0; + + if (check_str_time(optarg, &t, 1)) { + log_err("fio: failed parsing eta interval %s\n", optarg); + exit_val = 1; + do_exit++; + break; + } + eta_interval_msec = t / 1000; + if (eta_interval_msec < DISK_UTIL_MSEC) { + log_err("fio: eta interval time too short (%umsec 
min)\n", DISK_UTIL_MSEC); + exit_val = 1; + do_exit++; + } + break; + } + case 'd': + if (set_debug(optarg)) + do_exit++; + break; + case 'P': + did_arg = true; + parse_only = true; + break; + case 'x': { + size_t new_size; + + if (!strcmp(optarg, "global")) { + log_err("fio: can't use global as only " + "section\n"); + do_exit++; + exit_val = 1; + break; + } + new_size = (nr_job_sections + 1) * sizeof(char *); + job_sections = realloc(job_sections, new_size); + job_sections[nr_job_sections] = strdup(optarg); + nr_job_sections++; + break; + } +#ifdef CONFIG_ZLIB + case 'X': + exit_val = iolog_file_inflate(optarg); + did_arg = true; + do_exit++; + break; +#endif + case 'p': + did_arg = true; + if (exec_profile) + free(exec_profile); + exec_profile = strdup(optarg); + break; + case FIO_GETOPT_JOB: { + const char *opt = l_opts[lidx].name; + char *val = optarg; + + if (!strncmp(opt, "name", 4) && td) { + ret = add_job(td, td->o.name ?: "fio", 0, 0, client_type); + if (ret) + goto out_free; + td = NULL; + did_arg = true; + } + if (!td) { + int is_section = !strncmp(opt, "name", 4); + int global = 0; + + if (!is_section || !strncmp(val, "global", 6)) + global = 1; + + if (is_section && skip_this_section(val)) + continue; + + td = get_new_job(global, &def_thread, true, NULL); + if (!td || ioengine_load(td)) { + if (td) { + put_job(td); + td = NULL; + } + do_exit++; + exit_val = 1; + break; + } + fio_options_set_ioengine_opts(l_opts, td); + } + + if ((!val || !strlen(val)) && + l_opts[lidx].has_arg == required_argument) { + log_err("fio: option %s requires an argument\n", opt); + ret = 1; + } else + ret = fio_cmd_option_parse(td, opt, val); + + if (ret) { + if (td) { + put_job(td); + td = NULL; + } + do_exit++; + exit_val = 1; + } + + if (!ret && !strcmp(opt, "ioengine")) { + if (ioengine_load(td)) { + put_job(td); + td = NULL; + do_exit++; + exit_val = 1; + break; + } + fio_options_set_ioengine_opts(l_opts, td); + } + break; + } + case FIO_GETOPT_IOENGINE: { + const char *opt = l_opts[lidx].name; + char *val = optarg; + + if (!td) + break; + + ret = fio_cmd_ioengine_option_parse(td, opt, val); + break; + } + case 'w': + warnings_fatal = 1; + break; + case 'j': + max_jobs = atoi(optarg); + if (!max_jobs || max_jobs > REAL_MAX_JOBS) { + log_err("fio: invalid max jobs: %d\n", max_jobs); + do_exit++; + exit_val = 1; + } + break; + case 'S': + did_arg = true; +#ifndef CONFIG_NO_SHM + if (nr_clients) { + log_err("fio: can't be both client and server\n"); + do_exit++; + exit_val = 1; + break; + } + if (optarg) + fio_server_set_arg(optarg); + is_backend = true; + backend = true; +#else + log_err("fio: client/server requires SHM support\n"); + do_exit++; + exit_val = 1; +#endif + break; + case 'D': + if (pid_file) + free(pid_file); + pid_file = strdup(optarg); + break; + case 'I': + if ((ret = fio_idle_prof_parse_opt(optarg))) { + /* exit on error and calibration only */ + did_arg = true; + do_exit++; + if (ret == -1) + exit_val = 1; + } + break; + case 'C': + did_arg = true; + if (is_backend) { + log_err("fio: can't be both client and server\n"); + do_exit++; + exit_val = 1; + break; + } + /* if --client parameter contains a pathname */ + if (0 == access(optarg, R_OK)) { + /* file contains a list of host addrs or names */ + char hostaddr[PATH_MAX] = {0}; + char formatstr[8]; + FILE * hostf = fopen(optarg, "r"); + if (!hostf) { + log_err("fio: could not open client list file %s for read\n", optarg); + do_exit++; + exit_val = 1; + break; + } + sprintf(formatstr, "%%%ds", PATH_MAX - 1); + /* + * read at 
most PATH_MAX-1 chars from each + * record in this file + */ + while (fscanf(hostf, formatstr, hostaddr) == 1) { + /* expect EVERY host in file to be valid */ + if (fio_client_add(&fio_client_ops, hostaddr, &cur_client)) { + log_err("fio: failed adding client %s from file %s\n", hostaddr, optarg); + do_exit++; + exit_val = 1; + break; + } + } + fclose(hostf); + break; /* no possibility of job file for "this client only" */ + } + if (fio_client_add(&fio_client_ops, optarg, &cur_client)) { + log_err("fio: failed adding client %s\n", optarg); + do_exit++; + exit_val = 1; + break; + } + /* + * If the next argument exists and isn't an option, + * assume it's a job file for this client only. + */ + while (optind < argc) { + if (!strncmp(argv[optind], "--", 2) || + !strncmp(argv[optind], "-", 1)) + break; + + if (fio_client_add_ini_file(cur_client, argv[optind], false)) + break; + optind++; + } + break; + case 'R': + did_arg = true; + if (fio_client_add_ini_file(cur_client, optarg, true)) { + do_exit++; + exit_val = 1; + } + break; + case 'T': + did_arg = true; + do_exit++; + exit_val = fio_monotonic_clocktest(1); + break; + case 'G': + did_arg = true; + do_exit++; + exit_val = fio_crctest(optarg); + break; + case 'M': + did_arg = true; + do_exit++; + exit_val = fio_memcpy_test(optarg); + break; + case 'L': { + long long val; + + if (check_str_time(optarg, &val, 1)) { + log_err("fio: failed parsing time %s\n", optarg); + do_exit++; + exit_val = 1; + break; + } + if (val < 1000) { + log_err("fio: status interval too small\n"); + do_exit++; + exit_val = 1; + } + status_interval = val / 1000; + break; + } + case 'W': + if (trigger_file) + free(trigger_file); + trigger_file = strdup(optarg); + break; + case 'H': + if (trigger_cmd) + free(trigger_cmd); + trigger_cmd = strdup(optarg); + break; + case 'J': + if (trigger_remote_cmd) + free(trigger_remote_cmd); + trigger_remote_cmd = strdup(optarg); + break; + case 'K': + if (aux_path) + free(aux_path); + aux_path = strdup(optarg); + break; + case 'B': + if (check_str_time(optarg, &trigger_timeout, 1)) { + log_err("fio: failed parsing time %s\n", optarg); + do_exit++; + exit_val = 1; + } + trigger_timeout /= 1000000; + break; + + case 'A': + did_arg = true; + merge_blktrace_only = true; + break; + case '?': + log_err("%s: unrecognized option '%s'\n", argv[0], + argv[optind - 1]); + show_closest_option(argv[optind - 1]); + /* fall through */ + default: + do_exit++; + exit_val = 1; + break; + } + if (do_exit) + break; + } + + if (do_exit && !(is_backend || nr_clients)) + exit(exit_val); + + if (nr_clients && fio_clients_connect()) + exit(1); + + if (is_backend && backend) + return fio_start_server(pid_file); + else if (pid_file) + free(pid_file); + + if (td) { + if (!ret) { + ret = add_job(td, td->o.name ?: "fio", 0, 0, client_type); + if (ret) + exit(1); + } + } + + while (!ret && optind < argc) { + ini_idx++; + ini_file = realloc(ini_file, ini_idx * sizeof(char *)); + ini_file[ini_idx - 1] = strdup(argv[optind]); + optind++; + } + +out_free: + return ini_idx; +} + +int fio_init_options(void) +{ + f_out = stdout; + f_err = stderr; + + fio_options_fill_optstring(); + fio_options_dup_and_init(l_opts); + + atexit(free_shm); + + if (fill_def_thread()) + return 1; + + return 0; +} + +extern int fio_check_options(struct thread_options *); + +int parse_options(int argc, char *argv[]) +{ + const int type = FIO_CLIENT_TYPE_CLI; + int job_files, i; + + if (fio_init_options()) + return 1; + if (fio_test_cconv(&def_thread.o)) + log_err("fio: failed internal cconv 
test\n"); + + job_files = parse_cmd_line(argc, argv, type); + + if (job_files > 0) { + for (i = 0; i < job_files; i++) { + if (i && fill_def_thread()) + return 1; + if (nr_clients) { + if (fio_clients_send_ini(ini_file[i])) + return 1; + free(ini_file[i]); + } else if (!is_backend) { + if (parse_jobs_ini(ini_file[i], 0, i, type)) + return 1; + free(ini_file[i]); + } + } + } else if (nr_clients) { + if (fill_def_thread()) + return 1; + if (fio_clients_send_ini(NULL)) + return 1; + } + + free(ini_file); + fio_options_free(&def_thread); + filesetup_mem_free(); + + if (!thread_number) { + if (parse_dryrun()) + return 0; + if (exec_profile) + return 0; + if (is_backend || nr_clients) + return 0; + if (did_arg) + return 0; + + log_err("No job(s) defined\n\n"); + usage(argv[0]); + return 1; + } + + if (output_format & FIO_OUTPUT_NORMAL) + log_info("%s\n", fio_version_string); + + return 0; +} + +void options_default_fill(struct thread_options *o) +{ + memcpy(o, &def_thread.o, sizeof(*o)); +} + +struct thread_data *get_global_options(void) +{ + return &def_thread; +} diff --git a/io_ddir.h b/io_ddir.h new file mode 100644 index 0000000..deaa8b5 --- /dev/null +++ b/io_ddir.h @@ -0,0 +1,76 @@ +#ifndef FIO_DDIR_H +#define FIO_DDIR_H + +enum fio_ddir { + DDIR_READ = 0, + DDIR_WRITE = 1, + DDIR_TRIM = 2, + DDIR_SYNC = 3, + DDIR_DATASYNC, + DDIR_SYNC_FILE_RANGE, + DDIR_WAIT, + DDIR_LAST, + DDIR_INVAL = -1, + + DDIR_RWDIR_CNT = 3, + DDIR_RWDIR_SYNC_CNT = 4, +}; + +static inline const char *io_ddir_name(enum fio_ddir ddir) +{ + static const char *name[] = { "read", "write", "trim", "sync", + "datasync", "sync_file_range", + "wait", }; + + if (ddir < DDIR_LAST) + return name[ddir]; + + return "invalid"; +} + +enum td_ddir { + TD_DDIR_READ = 1 << 0, + TD_DDIR_WRITE = 1 << 1, + TD_DDIR_RAND = 1 << 2, + TD_DDIR_TRIM = 1 << 3, + TD_DDIR_RW = TD_DDIR_READ | TD_DDIR_WRITE, + TD_DDIR_RANDREAD = TD_DDIR_READ | TD_DDIR_RAND, + TD_DDIR_RANDWRITE = TD_DDIR_WRITE | TD_DDIR_RAND, + TD_DDIR_RANDRW = TD_DDIR_RW | TD_DDIR_RAND, + TD_DDIR_RANDTRIM = TD_DDIR_TRIM | TD_DDIR_RAND, + TD_DDIR_TRIMWRITE = TD_DDIR_TRIM | TD_DDIR_WRITE, +}; + +#define td_read(td) ((td)->o.td_ddir & TD_DDIR_READ) +#define td_write(td) ((td)->o.td_ddir & TD_DDIR_WRITE) +#define td_trim(td) ((td)->o.td_ddir & TD_DDIR_TRIM) +#define td_rw(td) (((td)->o.td_ddir & TD_DDIR_RW) == TD_DDIR_RW) +#define td_random(td) ((td)->o.td_ddir & TD_DDIR_RAND) +#define file_randommap(td, f) (!(td)->o.norandommap && fio_file_axmap((f))) +#define td_trimwrite(td) (((td)->o.td_ddir & TD_DDIR_TRIMWRITE) \ + == TD_DDIR_TRIMWRITE) + +static inline int ddir_sync(enum fio_ddir ddir) +{ + return ddir == DDIR_SYNC || ddir == DDIR_DATASYNC || + ddir == DDIR_SYNC_FILE_RANGE; +} + +static inline int ddir_rw(enum fio_ddir ddir) +{ + return ddir == DDIR_READ || ddir == DDIR_WRITE || ddir == DDIR_TRIM; +} + +static inline const char *ddir_str(enum td_ddir ddir) +{ + static const char *__str[] = { NULL, "read", "write", "rw", "rand", + "randread", "randwrite", "randrw", + "trim", NULL, "trimwrite", NULL, "randtrim" }; + + return __str[ddir]; +} + +#define ddir_rw_sum(arr) \ + ((arr)[DDIR_READ] + (arr)[DDIR_WRITE] + (arr)[DDIR_TRIM]) + +#endif diff --git a/io_u.c b/io_u.c new file mode 100644 index 0000000..bcb893c --- /dev/null +++ b/io_u.c @@ -0,0 +1,2238 @@ +#include +#include +#include + +#include "fio.h" +#include "verify.h" +#include "trim.h" +#include "lib/rand.h" +#include "lib/axmap.h" +#include "err.h" +#include "lib/pow2.h" +#include "minmax.h" +#include "zbd.h" + +struct 
io_completion_data { + int nr; /* input */ + + int error; /* output */ + uint64_t bytes_done[DDIR_RWDIR_CNT]; /* output */ + struct timespec time; /* output */ +}; + +/* + * The ->io_axmap contains a map of blocks we have or have not done io + * to yet. Used to make sure we cover the entire range in a fair fashion. + */ +static bool random_map_free(struct fio_file *f, const uint64_t block) +{ + return !axmap_isset(f->io_axmap, block); +} + +/* + * Mark a given offset as used in the map. + */ +static uint64_t mark_random_map(struct thread_data *td, struct io_u *io_u, + uint64_t offset, uint64_t buflen) +{ + unsigned long long min_bs = td->o.min_bs[io_u->ddir]; + struct fio_file *f = io_u->file; + unsigned long long nr_blocks; + uint64_t block; + + block = (offset - f->file_offset) / (uint64_t) min_bs; + nr_blocks = (buflen + min_bs - 1) / min_bs; + assert(nr_blocks > 0); + + if (!(io_u->flags & IO_U_F_BUSY_OK)) { + nr_blocks = axmap_set_nr(f->io_axmap, block, nr_blocks); + assert(nr_blocks > 0); + } + + if ((nr_blocks * min_bs) < buflen) + buflen = nr_blocks * min_bs; + + return buflen; +} + +static uint64_t last_block(struct thread_data *td, struct fio_file *f, + enum fio_ddir ddir) +{ + uint64_t max_blocks; + uint64_t max_size; + + assert(ddir_rw(ddir)); + + /* + * Hmm, should we make sure that ->io_size <= ->real_file_size? + * -> not for now since there is code assuming it could go either. + */ + max_size = f->io_size; + if (max_size > f->real_file_size) + max_size = f->real_file_size; + + if (td->o.zone_mode == ZONE_MODE_STRIDED && td->o.zone_range) + max_size = td->o.zone_range; + + if (td->o.min_bs[ddir] > td->o.ba[ddir]) + max_size -= td->o.min_bs[ddir] - td->o.ba[ddir]; + + max_blocks = max_size / (uint64_t) td->o.ba[ddir]; + if (!max_blocks) + return 0; + + return max_blocks; +} + +static int __get_next_rand_offset(struct thread_data *td, struct fio_file *f, + enum fio_ddir ddir, uint64_t *b, + uint64_t lastb) +{ + uint64_t r; + + if (td->o.random_generator == FIO_RAND_GEN_TAUSWORTHE || + td->o.random_generator == FIO_RAND_GEN_TAUSWORTHE64) { + + r = __rand(&td->random_state); + + dprint(FD_RANDOM, "off rand %llu\n", (unsigned long long) r); + + *b = lastb * (r / (rand_max(&td->random_state) + 1.0)); + } else { + uint64_t off = 0; + + assert(fio_file_lfsr(f)); + + if (lfsr_next(&f->lfsr, &off)) + return 1; + + *b = off; + } + + /* + * if we are not maintaining a random map, we are done. 
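+	 * (Either norandommap is set or the file has no axmap, so the
+	 * block just generated can be used without a free-block check.)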
+ */ + if (!file_randommap(td, f)) + goto ret; + + /* + * calculate map offset and check if it's free + */ + if (random_map_free(f, *b)) + goto ret; + + dprint(FD_RANDOM, "get_next_rand_offset: offset %llu busy\n", + (unsigned long long) *b); + + *b = axmap_next_free(f->io_axmap, *b); + if (*b == (uint64_t) -1ULL) + return 1; +ret: + return 0; +} + +static int __get_next_rand_offset_zipf(struct thread_data *td, + struct fio_file *f, enum fio_ddir ddir, + uint64_t *b) +{ + *b = zipf_next(&f->zipf); + return 0; +} + +static int __get_next_rand_offset_pareto(struct thread_data *td, + struct fio_file *f, enum fio_ddir ddir, + uint64_t *b) +{ + *b = pareto_next(&f->zipf); + return 0; +} + +static int __get_next_rand_offset_gauss(struct thread_data *td, + struct fio_file *f, enum fio_ddir ddir, + uint64_t *b) +{ + *b = gauss_next(&f->gauss); + return 0; +} + +static int __get_next_rand_offset_zoned_abs(struct thread_data *td, + struct fio_file *f, + enum fio_ddir ddir, uint64_t *b) +{ + struct zone_split_index *zsi; + uint64_t lastb, send, stotal; + unsigned int v; + + lastb = last_block(td, f, ddir); + if (!lastb) + return 1; + + if (!td->o.zone_split_nr[ddir]) { +bail: + return __get_next_rand_offset(td, f, ddir, b, lastb); + } + + /* + * Generate a value, v, between 1 and 100, both inclusive + */ + v = rand_between(&td->zone_state, 1, 100); + + /* + * Find our generated table. 'send' is the end block of this zone, + * 'stotal' is our start offset. + */ + zsi = &td->zone_state_index[ddir][v - 1]; + stotal = zsi->size_prev / td->o.ba[ddir]; + send = zsi->size / td->o.ba[ddir]; + + /* + * Should never happen + */ + if (send == -1U) { + if (!fio_did_warn(FIO_WARN_ZONED_BUG)) + log_err("fio: bug in zoned generation\n"); + goto bail; + } else if (send > lastb) { + /* + * This happens if the user specifies ranges that exceed + * the file/device size. We can't handle that gracefully, + * so error and exit. + */ + log_err("fio: zoned_abs sizes exceed file size\n"); + return 1; + } + + /* + * Generate index from 0..send-stotal + */ + if (__get_next_rand_offset(td, f, ddir, b, send - stotal) == 1) + return 1; + + *b += stotal; + return 0; +} + +static int __get_next_rand_offset_zoned(struct thread_data *td, + struct fio_file *f, enum fio_ddir ddir, + uint64_t *b) +{ + unsigned int v, send, stotal; + uint64_t offset, lastb; + struct zone_split_index *zsi; + + lastb = last_block(td, f, ddir); + if (!lastb) + return 1; + + if (!td->o.zone_split_nr[ddir]) { +bail: + return __get_next_rand_offset(td, f, ddir, b, lastb); + } + + /* + * Generate a value, v, between 1 and 100, both inclusive + */ + v = rand_between(&td->zone_state, 1, 100); + + zsi = &td->zone_state_index[ddir][v - 1]; + stotal = zsi->size_perc_prev; + send = zsi->size_perc; + + /* + * Should never happen + */ + if (send == -1U) { + if (!fio_did_warn(FIO_WARN_ZONED_BUG)) + log_err("fio: bug in zoned generation\n"); + goto bail; + } + + /* + * 'send' is some percentage below or equal to 100 that + * marks the end of the current IO range. 'stotal' marks + * the start, in percent. 
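+	 * For example, stotal=10 and send=40 confine this IO to the
+	 * 10%-40% slice of the file: a block is drawn from the first
+	 * 30% of the range and then shifted up by the 10% start offset.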
+ */ + if (stotal) + offset = stotal * lastb / 100ULL; + else + offset = 0; + + lastb = lastb * (send - stotal) / 100ULL; + + /* + * Generate index from 0..send-of-lastb + */ + if (__get_next_rand_offset(td, f, ddir, b, lastb) == 1) + return 1; + + /* + * Add our start offset, if any + */ + if (offset) + *b += offset; + + return 0; +} + +static int get_next_rand_offset(struct thread_data *td, struct fio_file *f, + enum fio_ddir ddir, uint64_t *b) +{ + if (td->o.random_distribution == FIO_RAND_DIST_RANDOM) { + uint64_t lastb; + + lastb = last_block(td, f, ddir); + if (!lastb) + return 1; + + return __get_next_rand_offset(td, f, ddir, b, lastb); + } else if (td->o.random_distribution == FIO_RAND_DIST_ZIPF) + return __get_next_rand_offset_zipf(td, f, ddir, b); + else if (td->o.random_distribution == FIO_RAND_DIST_PARETO) + return __get_next_rand_offset_pareto(td, f, ddir, b); + else if (td->o.random_distribution == FIO_RAND_DIST_GAUSS) + return __get_next_rand_offset_gauss(td, f, ddir, b); + else if (td->o.random_distribution == FIO_RAND_DIST_ZONED) + return __get_next_rand_offset_zoned(td, f, ddir, b); + else if (td->o.random_distribution == FIO_RAND_DIST_ZONED_ABS) + return __get_next_rand_offset_zoned_abs(td, f, ddir, b); + + log_err("fio: unknown random distribution: %d\n", td->o.random_distribution); + return 1; +} + +static bool should_do_random(struct thread_data *td, enum fio_ddir ddir) +{ + unsigned int v; + + if (td->o.perc_rand[ddir] == 100) + return true; + + v = rand_between(&td->seq_rand_state[ddir], 1, 100); + + return v <= td->o.perc_rand[ddir]; +} + +static void loop_cache_invalidate(struct thread_data *td, struct fio_file *f) +{ + struct thread_options *o = &td->o; + + if (o->invalidate_cache && !o->odirect) { + int fio_unused ret; + + ret = file_invalidate_cache(td, f); + } +} + +static int get_next_rand_block(struct thread_data *td, struct fio_file *f, + enum fio_ddir ddir, uint64_t *b) +{ + if (!get_next_rand_offset(td, f, ddir, b)) + return 0; + + if (td->o.time_based || + (td->o.file_service_type & __FIO_FSERVICE_NONUNIFORM)) { + fio_file_reset(td, f); + loop_cache_invalidate(td, f); + if (!get_next_rand_offset(td, f, ddir, b)) + return 0; + } + + dprint(FD_IO, "%s: rand offset failed, last=%llu, size=%llu\n", + f->file_name, (unsigned long long) f->last_pos[ddir], + (unsigned long long) f->real_file_size); + return 1; +} + +static int get_next_seq_offset(struct thread_data *td, struct fio_file *f, + enum fio_ddir ddir, uint64_t *offset) +{ + struct thread_options *o = &td->o; + + assert(ddir_rw(ddir)); + + /* + * If we reach the end for a time based run, reset us back to 0 + * and invalidate the cache, if we need to. + */ + if (f->last_pos[ddir] >= f->io_size + get_start_offset(td, f) && + o->time_based) { + f->last_pos[ddir] = f->file_offset; + loop_cache_invalidate(td, f); + } + + if (f->last_pos[ddir] < f->real_file_size) { + uint64_t pos; + + /* + * Only rewind if we already hit the end + */ + if (f->last_pos[ddir] == f->file_offset && + f->file_offset && o->ddir_seq_add < 0) { + if (f->real_file_size > f->io_size) + f->last_pos[ddir] = f->io_size; + else + f->last_pos[ddir] = f->real_file_size; + } + + pos = f->last_pos[ddir] - f->file_offset; + if (pos && o->ddir_seq_add) { + pos += o->ddir_seq_add; + + /* + * If we reach beyond the end of the file + * with holed IO, wrap around to the + * beginning again. If we're doing backwards IO, + * wrap to the end. 
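+			 * (ddir_seq_add is negative for backwards IO, so
+			 * adding it below lands the position just short of
+			 * the end rather than exactly on it.)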
+ */ + if (pos >= f->real_file_size) { + if (o->ddir_seq_add > 0) + pos = f->file_offset; + else { + if (f->real_file_size > f->io_size) + pos = f->io_size; + else + pos = f->real_file_size; + + pos += o->ddir_seq_add; + } + } + } + + *offset = pos; + return 0; + } + + return 1; +} + +static int get_next_block(struct thread_data *td, struct io_u *io_u, + enum fio_ddir ddir, int rw_seq, + bool *is_random) +{ + struct fio_file *f = io_u->file; + uint64_t b, offset; + int ret; + + assert(ddir_rw(ddir)); + + b = offset = -1ULL; + + if (rw_seq) { + if (td_random(td)) { + if (should_do_random(td, ddir)) { + ret = get_next_rand_block(td, f, ddir, &b); + *is_random = true; + } else { + *is_random = false; + io_u_set(td, io_u, IO_U_F_BUSY_OK); + ret = get_next_seq_offset(td, f, ddir, &offset); + if (ret) + ret = get_next_rand_block(td, f, ddir, &b); + } + } else { + *is_random = false; + ret = get_next_seq_offset(td, f, ddir, &offset); + } + } else { + io_u_set(td, io_u, IO_U_F_BUSY_OK); + *is_random = false; + + if (td->o.rw_seq == RW_SEQ_SEQ) { + ret = get_next_seq_offset(td, f, ddir, &offset); + if (ret) { + ret = get_next_rand_block(td, f, ddir, &b); + *is_random = false; + } + } else if (td->o.rw_seq == RW_SEQ_IDENT) { + if (f->last_start[ddir] != -1ULL) + offset = f->last_start[ddir] - f->file_offset; + else + offset = 0; + ret = 0; + } else { + log_err("fio: unknown rw_seq=%d\n", td->o.rw_seq); + ret = 1; + } + } + + if (!ret) { + if (offset != -1ULL) + io_u->offset = offset; + else if (b != -1ULL) + io_u->offset = b * td->o.ba[ddir]; + else { + log_err("fio: bug in offset generation: offset=%llu, b=%llu\n", (unsigned long long) offset, (unsigned long long) b); + ret = 1; + } + } + + return ret; +} + +/* + * For random io, generate a random new block and see if it's used. Repeat + * until we find a free one. For sequential io, just return the end of + * the last io issued. + */ +static int get_next_offset(struct thread_data *td, struct io_u *io_u, + bool *is_random) +{ + struct fio_file *f = io_u->file; + enum fio_ddir ddir = io_u->ddir; + int rw_seq_hit = 0; + + assert(ddir_rw(ddir)); + + if (td->o.ddir_seq_nr && !--td->ddir_seq_nr) { + rw_seq_hit = 1; + td->ddir_seq_nr = td->o.ddir_seq_nr; + } + + if (get_next_block(td, io_u, ddir, rw_seq_hit, is_random)) + return 1; + + if (io_u->offset >= f->io_size) { + dprint(FD_IO, "get_next_offset: offset %llu >= io_size %llu\n", + (unsigned long long) io_u->offset, + (unsigned long long) f->io_size); + return 1; + } + + io_u->offset += f->file_offset; + if (io_u->offset >= f->real_file_size) { + dprint(FD_IO, "get_next_offset: offset %llu >= size %llu\n", + (unsigned long long) io_u->offset, + (unsigned long long) f->real_file_size); + return 1; + } + + return 0; +} + +static inline bool io_u_fits(struct thread_data *td, struct io_u *io_u, + unsigned long long buflen) +{ + struct fio_file *f = io_u->file; + + return io_u->offset + buflen <= f->io_size + get_start_offset(td, f); +} + +static unsigned long long get_next_buflen(struct thread_data *td, struct io_u *io_u, + bool is_random) +{ + int ddir = io_u->ddir; + unsigned long long buflen = 0; + unsigned long long minbs, maxbs; + uint64_t frand_max, r; + bool power_2; + + assert(ddir_rw(ddir)); + + if (td->o.bs_is_seq_rand) + ddir = is_random ? 
DDIR_WRITE : DDIR_READ; + + minbs = td->o.min_bs[ddir]; + maxbs = td->o.max_bs[ddir]; + + if (minbs == maxbs) + return minbs; + + /* + * If we can't satisfy the min block size from here, then fail + */ + if (!io_u_fits(td, io_u, minbs)) + return 0; + + frand_max = rand_max(&td->bsrange_state[ddir]); + do { + r = __rand(&td->bsrange_state[ddir]); + + if (!td->o.bssplit_nr[ddir]) { + buflen = minbs + (unsigned long long) ((double) maxbs * + (r / (frand_max + 1.0))); + } else { + long long perc = 0; + unsigned int i; + + for (i = 0; i < td->o.bssplit_nr[ddir]; i++) { + struct bssplit *bsp = &td->o.bssplit[ddir][i]; + + if (!bsp->perc) + continue; + buflen = bsp->bs; + perc += bsp->perc; + if ((r / perc <= frand_max / 100ULL) && + io_u_fits(td, io_u, buflen)) + break; + } + } + + power_2 = is_power_of_2(minbs); + if (!td->o.bs_unaligned && power_2) + buflen &= ~(minbs - 1); + else if (!td->o.bs_unaligned && !power_2) + buflen -= buflen % minbs; + if (buflen > maxbs) + buflen = maxbs; + } while (!io_u_fits(td, io_u, buflen)); + + return buflen; +} + +static void set_rwmix_bytes(struct thread_data *td) +{ + unsigned int diff; + + /* + * we do time or byte based switch. this is needed because + * buffered writes may issue a lot quicker than they complete, + * whereas reads do not. + */ + diff = td->o.rwmix[td->rwmix_ddir ^ 1]; + td->rwmix_issues = (td->io_issues[td->rwmix_ddir] * diff) / 100; +} + +static inline enum fio_ddir get_rand_ddir(struct thread_data *td) +{ + unsigned int v; + + v = rand_between(&td->rwmix_state, 1, 100); + + if (v <= td->o.rwmix[DDIR_READ]) + return DDIR_READ; + + return DDIR_WRITE; +} + +int io_u_quiesce(struct thread_data *td) +{ + int ret = 0, completed = 0; + + /* + * We are going to sleep, ensure that we flush anything pending as + * not to skew our latency numbers. + * + * Changed to only monitor 'in flight' requests here instead of the + * td->cur_depth, b/c td->cur_depth does not accurately represent + * io's that have been actually submitted to an async engine, + * and cur_depth is meaningless for sync engines. + */ + if (td->io_u_queued || td->cur_depth) + td_io_commit(td); + + while (td->io_u_in_flight) { + ret = io_u_queued_complete(td, 1); + if (ret > 0) + completed += ret; + else if (ret < 0) + break; + } + + if (td->flags & TD_F_REGROW_LOGS) + regrow_logs(td); + + if (completed) + return completed; + + return ret; +} + +static enum fio_ddir rate_ddir(struct thread_data *td, enum fio_ddir ddir) +{ + enum fio_ddir odir = ddir ^ 1; + uint64_t usec; + uint64_t now; + + assert(ddir_rw(ddir)); + now = utime_since_now(&td->epoch); + + /* + * if rate_next_io_time is in the past, need to catch up to rate + */ + if (td->rate_next_io_time[ddir] <= now) + return ddir; + + /* + * We are ahead of rate in this direction. See if we + * should switch. + */ + if (td_rw(td) && td->o.rwmix[odir]) { + /* + * Other direction is behind rate, switch + */ + if (td->rate_next_io_time[odir] <= now) + return odir; + + /* + * Both directions are ahead of rate. sleep the min, + * switch if necessary + */ + if (td->rate_next_io_time[ddir] <= + td->rate_next_io_time[odir]) { + usec = td->rate_next_io_time[ddir] - now; + } else { + usec = td->rate_next_io_time[odir] - now; + ddir = odir; + } + } else + usec = td->rate_next_io_time[ddir] - now; + + if (td->o.io_submit_mode == IO_MODE_INLINE) + io_u_quiesce(td); + + usec_sleep(td, usec); + return ddir; +} + +/* + * Return the data direction for the next io_u. 
If the job is a + * mixed read/write workload, check the rwmix cycle and switch if + * necessary. + */ +static enum fio_ddir get_rw_ddir(struct thread_data *td) +{ + enum fio_ddir ddir; + + /* + * See if it's time to fsync/fdatasync/sync_file_range first, + * and if not then move on to check regular I/Os. + */ + if (should_fsync(td)) { + if (td->o.fsync_blocks && td->io_issues[DDIR_WRITE] && + !(td->io_issues[DDIR_WRITE] % td->o.fsync_blocks)) + return DDIR_SYNC; + + if (td->o.fdatasync_blocks && td->io_issues[DDIR_WRITE] && + !(td->io_issues[DDIR_WRITE] % td->o.fdatasync_blocks)) + return DDIR_DATASYNC; + + if (td->sync_file_range_nr && td->io_issues[DDIR_WRITE] && + !(td->io_issues[DDIR_WRITE] % td->sync_file_range_nr)) + return DDIR_SYNC_FILE_RANGE; + } + + if (td_rw(td)) { + /* + * Check if it's time to seed a new data direction. + */ + if (td->io_issues[td->rwmix_ddir] >= td->rwmix_issues) { + /* + * Put a top limit on how many bytes we do for + * one data direction, to avoid overflowing the + * ranges too much + */ + ddir = get_rand_ddir(td); + + if (ddir != td->rwmix_ddir) + set_rwmix_bytes(td); + + td->rwmix_ddir = ddir; + } + ddir = td->rwmix_ddir; + } else if (td_read(td)) + ddir = DDIR_READ; + else if (td_write(td)) + ddir = DDIR_WRITE; + else if (td_trim(td)) + ddir = DDIR_TRIM; + else + ddir = DDIR_INVAL; + + td->rwmix_ddir = rate_ddir(td, ddir); + return td->rwmix_ddir; +} + +static void set_rw_ddir(struct thread_data *td, struct io_u *io_u) +{ + enum fio_ddir ddir = get_rw_ddir(td); + + if (td_trimwrite(td)) { + struct fio_file *f = io_u->file; + if (f->last_pos[DDIR_WRITE] == f->last_pos[DDIR_TRIM]) + ddir = DDIR_TRIM; + else + ddir = DDIR_WRITE; + } + + io_u->ddir = io_u->acct_ddir = ddir; + + if (io_u->ddir == DDIR_WRITE && td_ioengine_flagged(td, FIO_BARRIER) && + td->o.barrier_blocks && + !(td->io_issues[DDIR_WRITE] % td->o.barrier_blocks) && + td->io_issues[DDIR_WRITE]) + io_u_set(td, io_u, IO_U_F_BARRIER); +} + +void put_file_log(struct thread_data *td, struct fio_file *f) +{ + unsigned int ret = put_file(td, f); + + if (ret) + td_verror(td, ret, "file close"); +} + +void put_io_u(struct thread_data *td, struct io_u *io_u) +{ + const bool needs_lock = td_async_processing(td); + + zbd_put_io_u(io_u); + + if (td->parent) + td = td->parent; + + if (needs_lock) + __td_io_u_lock(td); + + if (io_u->file && !(io_u->flags & IO_U_F_NO_FILE_PUT)) + put_file_log(td, io_u->file); + + io_u->file = NULL; + io_u_set(td, io_u, IO_U_F_FREE); + + if (io_u->flags & IO_U_F_IN_CUR_DEPTH) { + td->cur_depth--; + assert(!(td->flags & TD_F_CHILD)); + } + io_u_qpush(&td->io_u_freelist, io_u); + td_io_u_free_notify(td); + + if (needs_lock) + __td_io_u_unlock(td); +} + +void clear_io_u(struct thread_data *td, struct io_u *io_u) +{ + io_u_clear(td, io_u, IO_U_F_FLIGHT); + put_io_u(td, io_u); +} + +void requeue_io_u(struct thread_data *td, struct io_u **io_u) +{ + const bool needs_lock = td_async_processing(td); + struct io_u *__io_u = *io_u; + enum fio_ddir ddir = acct_ddir(__io_u); + + dprint(FD_IO, "requeue %p\n", __io_u); + + if (td->parent) + td = td->parent; + + if (needs_lock) + __td_io_u_lock(td); + + io_u_set(td, __io_u, IO_U_F_FREE); + if ((__io_u->flags & IO_U_F_FLIGHT) && ddir_rw(ddir)) + td->io_issues[ddir]--; + + io_u_clear(td, __io_u, IO_U_F_FLIGHT); + if (__io_u->flags & IO_U_F_IN_CUR_DEPTH) { + td->cur_depth--; + assert(!(td->flags & TD_F_CHILD)); + } + + io_u_rpush(&td->io_u_requeues, __io_u); + td_io_u_free_notify(td); + + if (needs_lock) + __td_io_u_unlock(td); + + *io_u = 
NULL; +} + +static void setup_strided_zone_mode(struct thread_data *td, struct io_u *io_u) +{ + struct fio_file *f = io_u->file; + + assert(td->o.zone_mode == ZONE_MODE_STRIDED); + assert(td->o.zone_size); + assert(td->o.zone_range); + + /* + * See if it's time to switch to a new zone + */ + if (td->zone_bytes >= td->o.zone_size) { + td->zone_bytes = 0; + f->file_offset += td->o.zone_range + td->o.zone_skip; + + /* + * Wrap from the beginning, if we exceed the file size + */ + if (f->file_offset >= f->real_file_size) + f->file_offset = get_start_offset(td, f); + + f->last_pos[io_u->ddir] = f->file_offset; + td->io_skip_bytes += td->o.zone_skip; + } + + /* + * If zone_size > zone_range, then maintain the same zone until + * zone_bytes >= zone_size. + */ + if (f->last_pos[io_u->ddir] >= (f->file_offset + td->o.zone_range)) { + dprint(FD_IO, "io_u maintain zone offset=%" PRIu64 "/last_pos=%" PRIu64 "\n", + f->file_offset, f->last_pos[io_u->ddir]); + f->last_pos[io_u->ddir] = f->file_offset; + } + + /* + * For random: if 'norandommap' is not set and zone_size > zone_range, + * map needs to be reset as it's done with zone_range everytime. + */ + if ((td->zone_bytes % td->o.zone_range) == 0) + fio_file_reset(td, f); +} + +static int fill_io_u(struct thread_data *td, struct io_u *io_u) +{ + bool is_random; + uint64_t offset; + enum io_u_action ret; + + if (td_ioengine_flagged(td, FIO_NOIO)) + goto out; + + set_rw_ddir(td, io_u); + + /* + * fsync() or fdatasync() or trim etc, we are done + */ + if (!ddir_rw(io_u->ddir)) + goto out; + + if (td->o.zone_mode == ZONE_MODE_STRIDED) + setup_strided_zone_mode(td, io_u); + else if (td->o.zone_mode == ZONE_MODE_ZBD) + setup_zbd_zone_mode(td, io_u); + + /* + * No log, let the seq/rand engine retrieve the next buflen and + * position. + */ + if (get_next_offset(td, io_u, &is_random)) { + dprint(FD_IO, "io_u %p, failed getting offset\n", io_u); + return 1; + } + + io_u->buflen = get_next_buflen(td, io_u, is_random); + if (!io_u->buflen) { + dprint(FD_IO, "io_u %p, failed getting buflen\n", io_u); + return 1; + } + + offset = io_u->offset; + if (td->o.zone_mode == ZONE_MODE_ZBD) { + ret = zbd_adjust_block(td, io_u); + if (ret == io_u_eof) + return 1; + } + + if (io_u->offset + io_u->buflen > io_u->file->real_file_size) { + dprint(FD_IO, "io_u %p, off=0x%llx + len=0x%llx exceeds file size=0x%llx\n", + io_u, + (unsigned long long) io_u->offset, io_u->buflen, + (unsigned long long) io_u->file->real_file_size); + return 1; + } + + /* + * mark entry before potentially trimming io_u + */ + if (td_random(td) && file_randommap(td, io_u->file)) + io_u->buflen = mark_random_map(td, io_u, offset, io_u->buflen); + +out: + dprint_io_u(io_u, "fill"); + td->zone_bytes += io_u->buflen; + return 0; +} + +static void __io_u_mark_map(uint64_t *map, unsigned int nr) +{ + int idx = 0; + + switch (nr) { + default: + idx = 6; + break; + case 33 ... 64: + idx = 5; + break; + case 17 ... 32: + idx = 4; + break; + case 9 ... 16: + idx = 3; + break; + case 5 ... 8: + idx = 2; + break; + case 1 ... 4: + idx = 1; + case 0: + break; + } + + map[idx]++; +} + +void io_u_mark_submit(struct thread_data *td, unsigned int nr) +{ + __io_u_mark_map(td->ts.io_u_submit, nr); + td->ts.total_submit++; +} + +void io_u_mark_complete(struct thread_data *td, unsigned int nr) +{ + __io_u_mark_map(td->ts.io_u_complete, nr); + td->ts.total_complete++; +} + +void io_u_mark_depth(struct thread_data *td, unsigned int nr) +{ + int idx = 0; + + switch (td->cur_depth) { + default: + idx = 6; + break; + case 32 ... 
63: + idx = 5; + break; + case 16 ... 31: + idx = 4; + break; + case 8 ... 15: + idx = 3; + break; + case 4 ... 7: + idx = 2; + break; + case 2 ... 3: + idx = 1; + case 1: + break; + } + + td->ts.io_u_map[idx] += nr; +} + +static void io_u_mark_lat_nsec(struct thread_data *td, unsigned long long nsec) +{ + int idx = 0; + + assert(nsec < 1000); + + switch (nsec) { + case 750 ... 999: + idx = 9; + break; + case 500 ... 749: + idx = 8; + break; + case 250 ... 499: + idx = 7; + break; + case 100 ... 249: + idx = 6; + break; + case 50 ... 99: + idx = 5; + break; + case 20 ... 49: + idx = 4; + break; + case 10 ... 19: + idx = 3; + break; + case 4 ... 9: + idx = 2; + break; + case 2 ... 3: + idx = 1; + case 0 ... 1: + break; + } + + assert(idx < FIO_IO_U_LAT_N_NR); + td->ts.io_u_lat_n[idx]++; +} + +static void io_u_mark_lat_usec(struct thread_data *td, unsigned long long usec) +{ + int idx = 0; + + assert(usec < 1000 && usec >= 1); + + switch (usec) { + case 750 ... 999: + idx = 9; + break; + case 500 ... 749: + idx = 8; + break; + case 250 ... 499: + idx = 7; + break; + case 100 ... 249: + idx = 6; + break; + case 50 ... 99: + idx = 5; + break; + case 20 ... 49: + idx = 4; + break; + case 10 ... 19: + idx = 3; + break; + case 4 ... 9: + idx = 2; + break; + case 2 ... 3: + idx = 1; + case 0 ... 1: + break; + } + + assert(idx < FIO_IO_U_LAT_U_NR); + td->ts.io_u_lat_u[idx]++; +} + +static void io_u_mark_lat_msec(struct thread_data *td, unsigned long long msec) +{ + int idx = 0; + + assert(msec >= 1); + + switch (msec) { + default: + idx = 11; + break; + case 1000 ... 1999: + idx = 10; + break; + case 750 ... 999: + idx = 9; + break; + case 500 ... 749: + idx = 8; + break; + case 250 ... 499: + idx = 7; + break; + case 100 ... 249: + idx = 6; + break; + case 50 ... 99: + idx = 5; + break; + case 20 ... 49: + idx = 4; + break; + case 10 ... 19: + idx = 3; + break; + case 4 ... 9: + idx = 2; + break; + case 2 ... 3: + idx = 1; + case 0 ... 
1: + break; + } + + assert(idx < FIO_IO_U_LAT_M_NR); + td->ts.io_u_lat_m[idx]++; +} + +static void io_u_mark_latency(struct thread_data *td, unsigned long long nsec) +{ + if (nsec < 1000) + io_u_mark_lat_nsec(td, nsec); + else if (nsec < 1000000) + io_u_mark_lat_usec(td, nsec / 1000); + else + io_u_mark_lat_msec(td, nsec / 1000000); +} + +static unsigned int __get_next_fileno_rand(struct thread_data *td) +{ + unsigned long fileno; + + if (td->o.file_service_type == FIO_FSERVICE_RANDOM) { + uint64_t frand_max = rand_max(&td->next_file_state); + unsigned long r; + + r = __rand(&td->next_file_state); + return (unsigned int) ((double) td->o.nr_files + * (r / (frand_max + 1.0))); + } + + if (td->o.file_service_type == FIO_FSERVICE_ZIPF) + fileno = zipf_next(&td->next_file_zipf); + else if (td->o.file_service_type == FIO_FSERVICE_PARETO) + fileno = pareto_next(&td->next_file_zipf); + else if (td->o.file_service_type == FIO_FSERVICE_GAUSS) + fileno = gauss_next(&td->next_file_gauss); + else { + log_err("fio: bad file service type: %d\n", td->o.file_service_type); + assert(0); + return 0; + } + + return fileno >> FIO_FSERVICE_SHIFT; +} + +/* + * Get next file to service by choosing one at random + */ +static struct fio_file *get_next_file_rand(struct thread_data *td, + enum fio_file_flags goodf, + enum fio_file_flags badf) +{ + struct fio_file *f; + int fno; + + do { + int opened = 0; + + fno = __get_next_fileno_rand(td); + + f = td->files[fno]; + if (fio_file_done(f)) + continue; + + if (!fio_file_open(f)) { + int err; + + if (td->nr_open_files >= td->o.open_files) + return ERR_PTR(-EBUSY); + + err = td_io_open_file(td, f); + if (err) + continue; + opened = 1; + } + + if ((!goodf || (f->flags & goodf)) && !(f->flags & badf)) { + dprint(FD_FILE, "get_next_file_rand: %p\n", f); + return f; + } + if (opened) + td_io_close_file(td, f); + } while (1); +} + +/* + * Get next file to service by doing round robin between all available ones + */ +static struct fio_file *get_next_file_rr(struct thread_data *td, int goodf, + int badf) +{ + unsigned int old_next_file = td->next_file; + struct fio_file *f; + + do { + int opened = 0; + + f = td->files[td->next_file]; + + td->next_file++; + if (td->next_file >= td->o.nr_files) + td->next_file = 0; + + dprint(FD_FILE, "trying file %s %x\n", f->file_name, f->flags); + if (fio_file_done(f)) { + f = NULL; + continue; + } + + if (!fio_file_open(f)) { + int err; + + if (td->nr_open_files >= td->o.open_files) + return ERR_PTR(-EBUSY); + + err = td_io_open_file(td, f); + if (err) { + dprint(FD_FILE, "error %d on open of %s\n", + err, f->file_name); + f = NULL; + continue; + } + opened = 1; + } + + dprint(FD_FILE, "goodf=%x, badf=%x, ff=%x\n", goodf, badf, + f->flags); + if ((!goodf || (f->flags & goodf)) && !(f->flags & badf)) + break; + + if (opened) + td_io_close_file(td, f); + + f = NULL; + } while (td->next_file != old_next_file); + + dprint(FD_FILE, "get_next_file_rr: %p\n", f); + return f; +} + +static struct fio_file *__get_next_file(struct thread_data *td) +{ + struct fio_file *f; + + assert(td->o.nr_files <= td->files_index); + + if (td->nr_done_files >= td->o.nr_files) { + dprint(FD_FILE, "get_next_file: nr_open=%d, nr_done=%d," + " nr_files=%d\n", td->nr_open_files, + td->nr_done_files, + td->o.nr_files); + return NULL; + } + + f = td->file_service_file; + if (f && fio_file_open(f) && !fio_file_closing(f)) { + if (td->o.file_service_type == FIO_FSERVICE_SEQ) + goto out; + if (td->file_service_left--) + goto out; + } + + if (td->o.file_service_type == 
FIO_FSERVICE_RR || + td->o.file_service_type == FIO_FSERVICE_SEQ) + f = get_next_file_rr(td, FIO_FILE_open, FIO_FILE_closing); + else + f = get_next_file_rand(td, FIO_FILE_open, FIO_FILE_closing); + + if (IS_ERR(f)) + return f; + + td->file_service_file = f; + td->file_service_left = td->file_service_nr - 1; +out: + if (f) + dprint(FD_FILE, "get_next_file: %p [%s]\n", f, f->file_name); + else + dprint(FD_FILE, "get_next_file: NULL\n"); + return f; +} + +static struct fio_file *get_next_file(struct thread_data *td) +{ + return __get_next_file(td); +} + +static long set_io_u_file(struct thread_data *td, struct io_u *io_u) +{ + struct fio_file *f; + + do { + f = get_next_file(td); + if (IS_ERR_OR_NULL(f)) + return PTR_ERR(f); + + io_u->file = f; + get_file(f); + + if (!fill_io_u(td, io_u)) + break; + + zbd_put_io_u(io_u); + + put_file_log(td, f); + td_io_close_file(td, f); + io_u->file = NULL; + if (td->o.file_service_type & __FIO_FSERVICE_NONUNIFORM) + fio_file_reset(td, f); + else { + fio_file_set_done(f); + td->nr_done_files++; + dprint(FD_FILE, "%s: is done (%d of %d)\n", f->file_name, + td->nr_done_files, td->o.nr_files); + } + } while (1); + + return 0; +} + +static void lat_fatal(struct thread_data *td, struct io_completion_data *icd, + unsigned long long tnsec, unsigned long long max_nsec) +{ + if (!td->error) + log_err("fio: latency of %llu nsec exceeds specified max (%llu nsec)\n", tnsec, max_nsec); + td_verror(td, ETIMEDOUT, "max latency exceeded"); + icd->error = ETIMEDOUT; +} + +static void lat_new_cycle(struct thread_data *td) +{ + fio_gettime(&td->latency_ts, NULL); + td->latency_ios = ddir_rw_sum(td->io_blocks); + td->latency_failed = 0; +} + +/* + * We had an IO outside the latency target. Reduce the queue depth. If we + * are at QD=1, then it's time to give up. + */ +static bool __lat_target_failed(struct thread_data *td) +{ + if (td->latency_qd == 1) + return true; + + td->latency_qd_high = td->latency_qd; + + if (td->latency_qd == td->latency_qd_low) + td->latency_qd_low--; + + td->latency_qd = (td->latency_qd + td->latency_qd_low) / 2; + + dprint(FD_RATE, "Ramped down: %d %d %d\n", td->latency_qd_low, td->latency_qd, td->latency_qd_high); + + /* + * When we ramp QD down, quiesce existing IO to prevent + * a storm of ramp downs due to pending higher depth. + */ + io_u_quiesce(td); + lat_new_cycle(td); + return false; +} + +static bool lat_target_failed(struct thread_data *td) +{ + if (td->o.latency_percentile.u.f == 100.0) + return __lat_target_failed(td); + + td->latency_failed++; + return false; +} + +void lat_target_init(struct thread_data *td) +{ + td->latency_end_run = 0; + + if (td->o.latency_target) { + dprint(FD_RATE, "Latency target=%llu\n", td->o.latency_target); + fio_gettime(&td->latency_ts, NULL); + td->latency_qd = 1; + td->latency_qd_high = td->o.iodepth; + td->latency_qd_low = 1; + td->latency_ios = ddir_rw_sum(td->io_blocks); + } else + td->latency_qd = td->o.iodepth; +} + +void lat_target_reset(struct thread_data *td) +{ + if (!td->latency_end_run) + lat_target_init(td); +} + +static void lat_target_success(struct thread_data *td) +{ + const unsigned int qd = td->latency_qd; + struct thread_options *o = &td->o; + + td->latency_qd_low = td->latency_qd; + + /* + * If we haven't failed yet, we double up to a failing value instead + * of bisecting from highest possible queue depth. If we have set + * a limit other than td->o.iodepth, bisect between that. 
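+	 *
+	 * For example, with iodepth=32 the depth ramps 1 -> 2 -> 4 -> ...
+	 * while windows keep meeting the target; once a failed window has
+	 * pulled latency_qd_high below the iodepth limit, later successes
+	 * bisect between the last good and bad depths instead of doubling.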
+ */ + if (td->latency_qd_high != o->iodepth) + td->latency_qd = (td->latency_qd + td->latency_qd_high) / 2; + else + td->latency_qd *= 2; + + if (td->latency_qd > o->iodepth) + td->latency_qd = o->iodepth; + + dprint(FD_RATE, "Ramped up: %d %d %d\n", td->latency_qd_low, td->latency_qd, td->latency_qd_high); + + /* + * Same as last one, we are done. Let it run a latency cycle, so + * we get only the results from the targeted depth. + */ + if (td->latency_qd == qd) { + if (td->latency_end_run) { + dprint(FD_RATE, "We are done\n"); + td->done = 1; + } else { + dprint(FD_RATE, "Quiesce and final run\n"); + io_u_quiesce(td); + td->latency_end_run = 1; + reset_all_stats(td); + reset_io_stats(td); + } + } + + lat_new_cycle(td); +} + +/* + * Check if we can bump the queue depth + */ +void lat_target_check(struct thread_data *td) +{ + uint64_t usec_window; + uint64_t ios; + double success_ios; + + usec_window = utime_since_now(&td->latency_ts); + if (usec_window < td->o.latency_window) + return; + + ios = ddir_rw_sum(td->io_blocks) - td->latency_ios; + success_ios = (double) (ios - td->latency_failed) / (double) ios; + success_ios *= 100.0; + + dprint(FD_RATE, "Success rate: %.2f%% (target %.2f%%)\n", success_ios, td->o.latency_percentile.u.f); + + if (success_ios >= td->o.latency_percentile.u.f) + lat_target_success(td); + else + __lat_target_failed(td); +} + +/* + * If latency target is enabled, we might be ramping up or down and not + * using the full queue depth available. + */ +bool queue_full(const struct thread_data *td) +{ + const int qempty = io_u_qempty(&td->io_u_freelist); + + if (qempty) + return true; + if (!td->o.latency_target) + return false; + + return td->cur_depth >= td->latency_qd; +} + +struct io_u *__get_io_u(struct thread_data *td) +{ + const bool needs_lock = td_async_processing(td); + struct io_u *io_u = NULL; + int ret; + + if (td->stop_io) + return NULL; + + if (needs_lock) + __td_io_u_lock(td); + +again: + if (!io_u_rempty(&td->io_u_requeues)) + io_u = io_u_rpop(&td->io_u_requeues); + else if (!queue_full(td)) { + io_u = io_u_qpop(&td->io_u_freelist); + + io_u->file = NULL; + io_u->buflen = 0; + io_u->resid = 0; + io_u->end_io = NULL; + } + + if (io_u) { + assert(io_u->flags & IO_U_F_FREE); + io_u_clear(td, io_u, IO_U_F_FREE | IO_U_F_NO_FILE_PUT | + IO_U_F_TRIMMED | IO_U_F_BARRIER | + IO_U_F_VER_LIST | IO_U_F_PRIORITY); + + io_u->error = 0; + io_u->acct_ddir = -1; + td->cur_depth++; + assert(!(td->flags & TD_F_CHILD)); + io_u_set(td, io_u, IO_U_F_IN_CUR_DEPTH); + io_u->ipo = NULL; + } else if (td_async_processing(td)) { + /* + * We ran out, wait for async verify threads to finish and + * return one + */ + assert(!(td->flags & TD_F_CHILD)); + ret = pthread_cond_wait(&td->free_cond, &td->io_u_lock); + assert(ret == 0); + if (!td->error) + goto again; + } + + if (needs_lock) + __td_io_u_unlock(td); + + return io_u; +} + +static bool check_get_trim(struct thread_data *td, struct io_u *io_u) +{ + if (!(td->flags & TD_F_TRIM_BACKLOG)) + return false; + if (!td->trim_entries) + return false; + + if (td->trim_batch) { + td->trim_batch--; + if (get_next_trim(td, io_u)) + return true; + } else if (!(td->io_hist_len % td->o.trim_backlog) && + td->last_ddir != DDIR_READ) { + td->trim_batch = td->o.trim_batch; + if (!td->trim_batch) + td->trim_batch = td->o.trim_backlog; + if (get_next_trim(td, io_u)) + return true; + } + + return false; +} + +static bool check_get_verify(struct thread_data *td, struct io_u *io_u) +{ + if (!(td->flags & TD_F_VER_BACKLOG)) + return false; + + if 
(td->io_hist_len) { + int get_verify = 0; + + if (td->verify_batch) + get_verify = 1; + else if (!(td->io_hist_len % td->o.verify_backlog) && + td->last_ddir != DDIR_READ) { + td->verify_batch = td->o.verify_batch; + if (!td->verify_batch) + td->verify_batch = td->o.verify_backlog; + get_verify = 1; + } + + if (get_verify && !get_next_verify(td, io_u)) { + td->verify_batch--; + return true; + } + } + + return false; +} + +/* + * Fill offset and start time into the buffer content, to prevent too + * easy compressible data for simple de-dupe attempts. Do this for every + * 512b block in the range, since that should be the smallest block size + * we can expect from a device. + */ +static void small_content_scramble(struct io_u *io_u) +{ + unsigned long long i, nr_blocks = io_u->buflen >> 9; + unsigned int offset; + uint64_t boffset, *iptr; + char *p; + + if (!nr_blocks) + return; + + p = io_u->xfer_buf; + boffset = io_u->offset; + + if (io_u->buf_filled_len) + io_u->buf_filled_len = 0; + + /* + * Generate random index between 0..7. We do chunks of 512b, if + * we assume a cacheline is 64 bytes, then we have 8 of those. + * Scramble content within the blocks in the same cacheline to + * speed things up. + */ + offset = (io_u->start_time.tv_nsec ^ boffset) & 7; + + for (i = 0; i < nr_blocks; i++) { + /* + * Fill offset into start of cacheline, time into end + * of cacheline + */ + iptr = (void *) p + (offset << 6); + *iptr = boffset; + + iptr = (void *) p + 64 - 2 * sizeof(uint64_t); + iptr[0] = io_u->start_time.tv_sec; + iptr[1] = io_u->start_time.tv_nsec; + + p += 512; + boffset += 512; + } +} + +/* + * Return an io_u to be processed. Gets a buflen and offset, sets direction, + * etc. The returned io_u is fully ready to be prepped, populated and submitted. + */ +struct io_u *get_io_u(struct thread_data *td) +{ + struct fio_file *f; + struct io_u *io_u; + int do_scramble = 0; + long ret = 0; + + io_u = __get_io_u(td); + if (!io_u) { + dprint(FD_IO, "__get_io_u failed\n"); + return NULL; + } + + if (check_get_verify(td, io_u)) + goto out; + if (check_get_trim(td, io_u)) + goto out; + + /* + * from a requeue, io_u already setup + */ + if (io_u->file) + goto out; + + /* + * If using an iolog, grab next piece if any available. + */ + if (td->flags & TD_F_READ_IOLOG) { + if (read_iolog_get(td, io_u)) + goto err_put; + } else if (set_io_u_file(td, io_u)) { + ret = -EBUSY; + dprint(FD_IO, "io_u %p, setting file failed\n", io_u); + goto err_put; + } + + f = io_u->file; + if (!f) { + dprint(FD_IO, "io_u %p, setting file failed\n", io_u); + goto err_put; + } + + assert(fio_file_open(f)); + + if (ddir_rw(io_u->ddir)) { + if (!io_u->buflen && !td_ioengine_flagged(td, FIO_NOIO)) { + dprint(FD_IO, "get_io_u: zero buflen on %p\n", io_u); + goto err_put; + } + + f->last_start[io_u->ddir] = io_u->offset; + f->last_pos[io_u->ddir] = io_u->offset + io_u->buflen; + + if (io_u->ddir == DDIR_WRITE) { + if (td->flags & TD_F_REFILL_BUFFERS) { + io_u_fill_buffer(td, io_u, + td->o.min_bs[DDIR_WRITE], + io_u->buflen); + } else if ((td->flags & TD_F_SCRAMBLE_BUFFERS) && + !(td->flags & TD_F_COMPRESS) && + !(td->flags & TD_F_DO_VERIFY)) + do_scramble = 1; + } else if (io_u->ddir == DDIR_READ) { + /* + * Reset the buf_filled parameters so next time if the + * buffer is used for writes it is refilled. + */ + io_u->buf_filled_len = 0; + } + } + + /* + * Set io data pointers. 
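+	 * xfer_buf/xfer_buflen start out mirroring buf/buflen; they can
+	 * diverge later on short transfers, when residual counts are
+	 * carried in io_u->resid.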
+ */ + io_u->xfer_buf = io_u->buf; + io_u->xfer_buflen = io_u->buflen; + +out: + assert(io_u->file); + if (!td_io_prep(td, io_u)) { + if (!td->o.disable_lat) + fio_gettime(&io_u->start_time, NULL); + + if (do_scramble) + small_content_scramble(io_u); + + return io_u; + } +err_put: + dprint(FD_IO, "get_io_u failed\n"); + put_io_u(td, io_u); + return ERR_PTR(ret); +} + +static void __io_u_log_error(struct thread_data *td, struct io_u *io_u) +{ + enum error_type_bit eb = td_error_type(io_u->ddir, io_u->error); + + if (td_non_fatal_error(td, eb, io_u->error) && !td->o.error_dump) + return; + + log_err("fio: io_u error%s%s: %s: %s offset=%llu, buflen=%llu\n", + io_u->file ? " on file " : "", + io_u->file ? io_u->file->file_name : "", + strerror(io_u->error), + io_ddir_name(io_u->ddir), + io_u->offset, io_u->xfer_buflen); + + if (td->io_ops->errdetails) { + char *err = td->io_ops->errdetails(io_u); + + log_err("fio: %s\n", err); + free(err); + } + + if (!td->error) + td_verror(td, io_u->error, "io_u error"); +} + +void io_u_log_error(struct thread_data *td, struct io_u *io_u) +{ + __io_u_log_error(td, io_u); + if (td->parent) + __io_u_log_error(td->parent, io_u); +} + +static inline bool gtod_reduce(struct thread_data *td) +{ + return (td->o.disable_clat && td->o.disable_slat && td->o.disable_bw) + || td->o.gtod_reduce; +} + +static void trim_block_info(struct thread_data *td, struct io_u *io_u) +{ + uint32_t *info = io_u_block_info(td, io_u); + + if (BLOCK_INFO_STATE(*info) >= BLOCK_STATE_TRIM_FAILURE) + return; + + *info = BLOCK_INFO(BLOCK_STATE_TRIMMED, BLOCK_INFO_TRIMS(*info) + 1); +} + +static void account_io_completion(struct thread_data *td, struct io_u *io_u, + struct io_completion_data *icd, + const enum fio_ddir idx, unsigned int bytes) +{ + const int no_reduce = !gtod_reduce(td); + unsigned long long llnsec = 0; + + if (td->parent) + td = td->parent; + + if (!td->o.stats || td_ioengine_flagged(td, FIO_NOSTATS)) + return; + + if (no_reduce) + llnsec = ntime_since(&io_u->issue_time, &icd->time); + + if (!td->o.disable_lat) { + unsigned long long tnsec; + + tnsec = ntime_since(&io_u->start_time, &icd->time); + add_lat_sample(td, idx, tnsec, bytes, io_u->offset, io_u_is_prio(io_u)); + + if (td->flags & TD_F_PROFILE_OPS) { + struct prof_io_ops *ops = &td->prof_io_ops; + + if (ops->io_u_lat) + icd->error = ops->io_u_lat(td, tnsec); + } + + if (td->o.max_latency && tnsec > td->o.max_latency) + lat_fatal(td, icd, tnsec, td->o.max_latency); + if (td->o.latency_target && tnsec > td->o.latency_target) { + if (lat_target_failed(td)) + lat_fatal(td, icd, tnsec, td->o.latency_target); + } + } + + if (ddir_rw(idx)) { + if (!td->o.disable_clat) { + add_clat_sample(td, idx, llnsec, bytes, io_u->offset, io_u_is_prio(io_u)); + io_u_mark_latency(td, llnsec); + } + + if (!td->o.disable_bw && per_unit_log(td->bw_log)) + add_bw_sample(td, io_u, bytes, llnsec); + + if (no_reduce && per_unit_log(td->iops_log)) + add_iops_sample(td, io_u, bytes); + } else if (ddir_sync(idx) && !td->o.disable_clat) + add_sync_clat_sample(&td->ts, llnsec); + + if (td->ts.nr_block_infos && io_u->ddir == DDIR_TRIM) + trim_block_info(td, io_u); +} + +static void file_log_write_comp(const struct thread_data *td, struct fio_file *f, + uint64_t offset, unsigned int bytes) +{ + int idx; + + if (!f) + return; + + if (f->first_write == -1ULL || offset < f->first_write) + f->first_write = offset; + if (f->last_write == -1ULL || ((offset + bytes) > f->last_write)) + f->last_write = offset + bytes; + + if (!f->last_write_comp) + return; + + 
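+	/*
+	 * last_write_comp[] is a ring of the most recently completed write
+	 * offsets, sized to the iodepth; wrap the index once it fills up.
+	 */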
idx = f->last_write_idx++; + f->last_write_comp[idx] = offset; + if (f->last_write_idx == td->o.iodepth) + f->last_write_idx = 0; +} + +static bool should_account(struct thread_data *td) +{ + return ramp_time_over(td) && (td->runstate == TD_RUNNING || + td->runstate == TD_VERIFYING); +} + +static void io_completed(struct thread_data *td, struct io_u **io_u_ptr, + struct io_completion_data *icd) +{ + struct io_u *io_u = *io_u_ptr; + enum fio_ddir ddir = io_u->ddir; + struct fio_file *f = io_u->file; + + dprint_io_u(io_u, "complete"); + + assert(io_u->flags & IO_U_F_FLIGHT); + io_u_clear(td, io_u, IO_U_F_FLIGHT | IO_U_F_BUSY_OK); + + /* + * Mark IO ok to verify + */ + if (io_u->ipo) { + /* + * Remove errored entry from the verification list + */ + if (io_u->error) + unlog_io_piece(td, io_u); + else { + io_u->ipo->flags &= ~IP_F_IN_FLIGHT; + write_barrier(); + } + } + + if (ddir_sync(ddir)) { + td->last_was_sync = true; + if (f) { + f->first_write = -1ULL; + f->last_write = -1ULL; + } + if (should_account(td)) + account_io_completion(td, io_u, icd, ddir, io_u->buflen); + return; + } + + td->last_was_sync = false; + td->last_ddir = ddir; + + if (!io_u->error && ddir_rw(ddir)) { + unsigned long long bytes = io_u->buflen - io_u->resid; + int ret; + + td->io_blocks[ddir]++; + td->io_bytes[ddir] += bytes; + + if (!(io_u->flags & IO_U_F_VER_LIST)) { + td->this_io_blocks[ddir]++; + td->this_io_bytes[ddir] += bytes; + } + + if (ddir == DDIR_WRITE) + file_log_write_comp(td, f, io_u->offset, bytes); + + if (should_account(td)) + account_io_completion(td, io_u, icd, ddir, bytes); + + icd->bytes_done[ddir] += bytes; + + if (io_u->end_io) { + ret = io_u->end_io(td, io_u_ptr); + io_u = *io_u_ptr; + if (ret && !icd->error) + icd->error = ret; + } + } else if (io_u->error) { + icd->error = io_u->error; + io_u_log_error(td, io_u); + } + if (icd->error) { + enum error_type_bit eb = td_error_type(ddir, icd->error); + + if (!td_non_fatal_error(td, eb, icd->error)) + return; + + /* + * If there is a non_fatal error, then add to the error count + * and clear all the errors. + */ + update_error_count(td, icd->error); + td_clear_error(td); + icd->error = 0; + if (io_u) + io_u->error = 0; + } +} + +static void init_icd(struct thread_data *td, struct io_completion_data *icd, + int nr) +{ + int ddir; + + if (!gtod_reduce(td)) + fio_gettime(&icd->time, NULL); + + icd->nr = nr; + + icd->error = 0; + for (ddir = 0; ddir < DDIR_RWDIR_CNT; ddir++) + icd->bytes_done[ddir] = 0; +} + +static void ios_completed(struct thread_data *td, + struct io_completion_data *icd) +{ + struct io_u *io_u; + int i; + + for (i = 0; i < icd->nr; i++) { + io_u = td->io_ops->event(td, i); + + io_completed(td, &io_u, icd); + + if (io_u) + put_io_u(td, io_u); + } +} + +/* + * Complete a single io_u for the sync engines. + */ +int io_u_sync_complete(struct thread_data *td, struct io_u *io_u) +{ + struct io_completion_data icd; + int ddir; + + init_icd(td, &icd, 1); + io_completed(td, &io_u, &icd); + + if (io_u) + put_io_u(td, io_u); + + if (icd.error) { + td_verror(td, icd.error, "io_u_sync_complete"); + return -1; + } + + for (ddir = 0; ddir < DDIR_RWDIR_CNT; ddir++) + td->bytes_done[ddir] += icd.bytes_done[ddir]; + + return 0; +} + +/* + * Called to complete min_events number of io for the async engines. 
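+ * A min_evts of 0 means "poll": a zeroed timespec is passed down so
+ * td_io_getevents() returns without blocking. Otherwise min_evts is
+ * first capped at the current queue depth.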
+ */ +int io_u_queued_complete(struct thread_data *td, int min_evts) +{ + struct io_completion_data icd; + struct timespec *tvp = NULL; + int ret, ddir; + struct timespec ts = { .tv_sec = 0, .tv_nsec = 0, }; + + dprint(FD_IO, "io_u_queued_complete: min=%d\n", min_evts); + + if (!min_evts) + tvp = &ts; + else if (min_evts > td->cur_depth) + min_evts = td->cur_depth; + + /* No worries, td_io_getevents fixes min and max if they are + * set incorrectly */ + ret = td_io_getevents(td, min_evts, td->o.iodepth_batch_complete_max, tvp); + if (ret < 0) { + td_verror(td, -ret, "td_io_getevents"); + return ret; + } else if (!ret) + return ret; + + init_icd(td, &icd, ret); + ios_completed(td, &icd); + if (icd.error) { + td_verror(td, icd.error, "io_u_queued_complete"); + return -1; + } + + for (ddir = 0; ddir < DDIR_RWDIR_CNT; ddir++) + td->bytes_done[ddir] += icd.bytes_done[ddir]; + + return ret; +} + +/* + * Call when io_u is really queued, to update the submission latency. + */ +void io_u_queued(struct thread_data *td, struct io_u *io_u) +{ + if (!td->o.disable_slat && ramp_time_over(td) && td->o.stats) { + unsigned long slat_time; + + slat_time = ntime_since(&io_u->start_time, &io_u->issue_time); + + if (td->parent) + td = td->parent; + + add_slat_sample(td, io_u->ddir, slat_time, io_u->xfer_buflen, + io_u->offset, io_u_is_prio(io_u)); + } +} + +/* + * See if we should reuse the last seed, if dedupe is enabled + */ +static struct frand_state *get_buf_state(struct thread_data *td) +{ + unsigned int v; + + if (!td->o.dedupe_percentage) + return &td->buf_state; + else if (td->o.dedupe_percentage == 100) { + frand_copy(&td->buf_state_prev, &td->buf_state); + return &td->buf_state; + } + + v = rand_between(&td->dedupe_state, 1, 100); + + if (v <= td->o.dedupe_percentage) + return &td->buf_state_prev; + + return &td->buf_state; +} + +static void save_buf_state(struct thread_data *td, struct frand_state *rs) +{ + if (td->o.dedupe_percentage == 100) + frand_copy(rs, &td->buf_state_prev); + else if (rs == &td->buf_state) + frand_copy(&td->buf_state_prev, rs); +} + +void fill_io_buffer(struct thread_data *td, void *buf, unsigned long long min_write, + unsigned long long max_bs) +{ + struct thread_options *o = &td->o; + + if (o->mem_type == MEM_CUDA_MALLOC) + return; + + if (o->compress_percentage || o->dedupe_percentage) { + unsigned int perc = td->o.compress_percentage; + struct frand_state *rs; + unsigned long long left = max_bs; + unsigned long long this_write; + + do { + rs = get_buf_state(td); + + min_write = min(min_write, left); + + if (perc) { + this_write = min_not_zero(min_write, + (unsigned long long) td->o.compress_chunk); + + fill_random_buf_percentage(rs, buf, perc, + this_write, this_write, + o->buffer_pattern, + o->buffer_pattern_bytes); + } else { + fill_random_buf(rs, buf, min_write); + this_write = min_write; + } + + buf += this_write; + left -= this_write; + save_buf_state(td, rs); + } while (left); + } else if (o->buffer_pattern_bytes) + fill_buffer_pattern(td, buf, max_bs); + else if (o->zero_buffers) + memset(buf, 0, max_bs); + else + fill_random_buf(get_buf_state(td), buf, max_bs); +} + +/* + * "randomly" fill the buffer contents + */ +void io_u_fill_buffer(struct thread_data *td, struct io_u *io_u, + unsigned long long min_write, unsigned long long max_bs) +{ + io_u->buf_filled_len = 0; + fill_io_buffer(td, io_u->buf, min_write, max_bs); +} + +static int do_sync_file_range(const struct thread_data *td, + struct fio_file *f) +{ + uint64_t offset, nbytes; + + offset = f->first_write; + 
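+	/*
+	 * Sync only the byte range actually dirtied since the last sync;
+	 * f->first_write/f->last_write are reset once a sync completes.
+	 */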
nbytes = f->last_write - f->first_write;
+
+	if (!nbytes)
+		return 0;
+
+	return sync_file_range(f->fd, offset, nbytes, td->o.sync_file_range);
+}
+
+int do_io_u_sync(const struct thread_data *td, struct io_u *io_u)
+{
+	int ret;
+
+	if (io_u->ddir == DDIR_SYNC) {
+		ret = fsync(io_u->file->fd);
+	} else if (io_u->ddir == DDIR_DATASYNC) {
+#ifdef CONFIG_FDATASYNC
+		ret = fdatasync(io_u->file->fd);
+#else
+		ret = io_u->xfer_buflen;
+		io_u->error = EINVAL;
+#endif
+	} else if (io_u->ddir == DDIR_SYNC_FILE_RANGE)
+		ret = do_sync_file_range(td, io_u->file);
+	else {
+		ret = io_u->xfer_buflen;
+		io_u->error = EINVAL;
+	}
+
+	if (ret < 0)
+		io_u->error = errno;
+
+	return ret;
+}
+
+int do_io_u_trim(const struct thread_data *td, struct io_u *io_u)
+{
+#ifndef FIO_HAVE_TRIM
+	io_u->error = EINVAL;
+	return 0;
+#else
+	struct fio_file *f = io_u->file;
+	int ret;
+
+	ret = os_trim(f, io_u->offset, io_u->xfer_buflen);
+	if (!ret)
+		return io_u->xfer_buflen;
+
+	io_u->error = ret;
+	return 0;
+#endif
+}
diff --git a/io_u.h b/io_u.h
new file mode 100644
index 0000000..0f63cdd
--- /dev/null
+++ b/io_u.h
@@ -0,0 +1,200 @@
+#ifndef FIO_IO_U
+#define FIO_IO_U
+
+#include "compiler/compiler.h"
+#include "os/os.h"
+#include "io_ddir.h"
+#include "debug.h"
+#include "file.h"
+#include "workqueue.h"
+
+#ifdef CONFIG_LIBAIO
+#include <libaio.h>
+#endif
+#ifdef CONFIG_GUASI
+#include <guasi.h>
+#endif
+
+enum {
+	IO_U_F_FREE		= 1 << 0,
+	IO_U_F_FLIGHT		= 1 << 1,
+	IO_U_F_NO_FILE_PUT	= 1 << 2,
+	IO_U_F_IN_CUR_DEPTH	= 1 << 3,
+	IO_U_F_BUSY_OK		= 1 << 4,
+	IO_U_F_TRIMMED		= 1 << 5,
+	IO_U_F_BARRIER		= 1 << 6,
+	IO_U_F_VER_LIST		= 1 << 7,
+	IO_U_F_PRIORITY		= 1 << 8,
+};
+
+/*
+ * The io unit
+ */
+struct io_u {
+	struct timespec start_time;
+	struct timespec issue_time;
+
+	struct fio_file *file;
+	unsigned int flags;
+	enum fio_ddir ddir;
+
+	/*
+	 * For replay workloads, we may want to account as a different
+	 * IO type than what is being submitted.
+	 */
+	enum fio_ddir acct_ddir;
+
+	/*
+	 * Write generation
+	 */
+	unsigned short numberio;
+
+	/*
+	 * Allocated/set buffer and length
+	 */
+	unsigned long long buflen;
+	unsigned long long offset;
+	void *buf;
+
+	/*
+	 * Initial seed for generating the buffer contents
+	 */
+	uint64_t rand_seed;
+
+	/*
+	 * IO engine state, may be different from above when we get
+	 * partial transfers / residual data counts
+	 */
+	void *xfer_buf;
+	unsigned long long xfer_buflen;
+
+	/*
+	 * Parameter related to pre-filled buffers and
+	 * their size to handle variable block sizes.
+	 */
+	unsigned long long buf_filled_len;
+
+	struct io_piece *ipo;
+
+	unsigned long long resid;
+	unsigned int error;
+
+	/*
+	 * io engine private data
+	 */
+	union {
+		unsigned int index;
+		unsigned int seen;
+		void *engine_data;
+	};
+
+	union {
+		struct flist_head verify_list;
+		struct workqueue_work work;
+	};
+
+#ifdef CONFIG_LINUX_BLKZONED
+	/*
+	 * ZBD mode zbd_queue_io callback: called after engine->queue operation
+	 * to advance a zone write pointer and eventually unlock the I/O zone.
+	 * @q indicates the I/O queue status (busy, queued or completed).
+	 * @success == true means that the I/O operation has been queued or
+	 * completed successfully.
+	 */
+	void (*zbd_queue_io)(struct io_u *, int q, bool success);
+
+	/*
+	 * ZBD mode zbd_put_io callback: called after completion of an I/O
+	 * or commit of an async I/O to unlock the I/O target zone.
+	 */
+	void (*zbd_put_io)(const struct io_u *);
+#endif
+
+	/*
+	 * Callback for io completion
+	 */
+	int (*end_io)(struct thread_data *, struct io_u **);
+
+	union {
+#ifdef CONFIG_LIBAIO
+		struct iocb iocb;
+#endif
+#ifdef CONFIG_POSIXAIO
+		os_aiocb_t aiocb;
+#endif
+#ifdef FIO_HAVE_SGIO
+		struct sg_io_hdr hdr;
+#endif
+#ifdef CONFIG_GUASI
+		guasi_req_t greq;
+#endif
+#ifdef CONFIG_SOLARISAIO
+		aio_result_t resultp;
+#endif
+#ifdef CONFIG_RDMA
+		struct ibv_mr *mr;
+#endif
+		void *mmap_data;
+	};
+};
+
+/*
+ * io unit handling
+ */
+extern struct io_u *__get_io_u(struct thread_data *);
+extern struct io_u *get_io_u(struct thread_data *);
+extern void put_io_u(struct thread_data *, struct io_u *);
+extern void clear_io_u(struct thread_data *, struct io_u *);
+extern void requeue_io_u(struct thread_data *, struct io_u **);
+extern int __must_check io_u_sync_complete(struct thread_data *, struct io_u *);
+extern int __must_check io_u_queued_complete(struct thread_data *, int);
+extern void io_u_queued(struct thread_data *, struct io_u *);
+extern int io_u_quiesce(struct thread_data *);
+extern void io_u_log_error(struct thread_data *, struct io_u *);
+extern void io_u_mark_depth(struct thread_data *, unsigned int);
+extern void fill_io_buffer(struct thread_data *, void *, unsigned long long, unsigned long long);
+extern void io_u_fill_buffer(struct thread_data *td, struct io_u *, unsigned long long, unsigned long long);
+void io_u_mark_complete(struct thread_data *, unsigned int);
+void io_u_mark_submit(struct thread_data *, unsigned int);
+bool queue_full(const struct thread_data *);
+
+int do_io_u_sync(const struct thread_data *, struct io_u *);
+int do_io_u_trim(const struct thread_data *, struct io_u *);
+
+#ifdef FIO_INC_DEBUG
+static inline void dprint_io_u(struct io_u *io_u, const char *p)
+{
+	struct fio_file *f = io_u->file;
+
+	if (f)
+		dprint(FD_IO, "%s: io_u %p: off=0x%llx,len=0x%llx,ddir=%d,file=%s\n",
+				p, io_u,
+				(unsigned long long) io_u->offset,
+				io_u->buflen, io_u->ddir,
+				f->file_name);
+	else
+		dprint(FD_IO, "%s: io_u %p: off=0x%llx,len=0x%llx,ddir=%d\n",
+				p, io_u,
+				(unsigned long long) io_u->offset,
+				io_u->buflen, io_u->ddir);
+}
+#else
+#define dprint_io_u(io_u, p)
+#endif
+
+static inline enum fio_ddir acct_ddir(struct io_u *io_u)
+{
+	if (io_u->acct_ddir != -1)
+		return io_u->acct_ddir;
+
+	return io_u->ddir;
+}
+
+#define io_u_clear(td, io_u, val)	\
+	td_flags_clear((td), &(io_u->flags), (val))
+#define io_u_set(td, io_u, val)		\
+	td_flags_set((td), &(io_u)->flags, (val))
+#define io_u_is_prio(io_u)	\
+	(io_u->flags & (unsigned int) IO_U_F_PRIORITY) != 0
+
+#endif
diff --git a/io_u_queue.c b/io_u_queue.c
new file mode 100644
index 0000000..41f98bc
--- /dev/null
+++ b/io_u_queue.c
@@ -0,0 +1,53 @@
+#include <stdlib.h>
+#include <string.h>
+#include "io_u_queue.h"
+#include "smalloc.h"
+
+bool io_u_qinit(struct io_u_queue *q, unsigned int nr, bool shared)
+{
+	if (shared)
+		q->io_us = smalloc(nr * sizeof(struct io_u *));
+	else
+		q->io_us = calloc(nr, sizeof(struct io_u *));
+
+	if (!q->io_us)
+		return false;
+
+	q->nr = 0;
+	q->max = nr;
+	return true;
+}
+
+void io_u_qexit(struct io_u_queue *q, bool shared)
+{
+	if (shared)
+		sfree(q->io_us);
+	else
+		free(q->io_us);
+}
+
+bool io_u_rinit(struct io_u_ring *ring, unsigned int nr)
+{
+	ring->max = nr + 1;
+	if (ring->max & (ring->max - 1)) {
+		ring->max--;
+		ring->max |= ring->max >> 1;
+		ring->max |= ring->max >> 2;
+		ring->max |= ring->max >> 4;
+		ring->max |= ring->max >> 8;
+		ring->max |= ring->max >> 16;
+		ring->max++;
+	}
+
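+	/*
+	 * ring->max is now a power of two, so head and tail can wrap with
+	 * a cheap mask in io_u_rpush()/io_u_rpop() below.
+	 */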
+	ring->ring = calloc(ring->max, sizeof(struct io_u *));
+	if (!ring->ring)
+		return false;
+
+	ring->head = ring->tail = 0;
+	return true;
+}
+
+void io_u_rexit(struct io_u_ring *ring)
+{
+	free(ring->ring);
+}
diff --git a/io_u_queue.h b/io_u_queue.h
new file mode 100644
index 0000000..87de894
--- /dev/null
+++ b/io_u_queue.h
@@ -0,0 +1,89 @@
+#ifndef FIO_IO_U_QUEUE
+#define FIO_IO_U_QUEUE
+
+#include <assert.h>
+#include <stddef.h>
+
+#include "lib/types.h"
+
+struct io_u;
+
+struct io_u_queue {
+	struct io_u **io_us;
+	unsigned int nr;
+	unsigned int max;
+};
+
+static inline struct io_u *io_u_qpop(struct io_u_queue *q)
+{
+	if (q->nr) {
+		const unsigned int next = --q->nr;
+		struct io_u *io_u = q->io_us[next];
+
+		q->io_us[next] = NULL;
+		return io_u;
+	}
+
+	return NULL;
+}
+
+static inline void io_u_qpush(struct io_u_queue *q, struct io_u *io_u)
+{
+	if (q->nr < q->max) {
+		q->io_us[q->nr++] = io_u;
+		return;
+	}
+
+	assert(0);
+}
+
+static inline int io_u_qempty(const struct io_u_queue *q)
+{
+	return !q->nr;
+}
+
+#define io_u_qiter(q, io_u, i)	\
+	for (i = 0; i < (q)->nr && (io_u = (q)->io_us[i]); i++)
+
+bool io_u_qinit(struct io_u_queue *q, unsigned int nr, bool shared);
+void io_u_qexit(struct io_u_queue *q, bool shared);
+
+struct io_u_ring {
+	unsigned int head;
+	unsigned int tail;
+	unsigned int max;
+	struct io_u **ring;
+};
+
+bool io_u_rinit(struct io_u_ring *ring, unsigned int nr);
+void io_u_rexit(struct io_u_ring *ring);
+
+static inline void io_u_rpush(struct io_u_ring *r, struct io_u *io_u)
+{
+	if (r->head + 1 != r->tail) {
+		r->ring[r->head] = io_u;
+		r->head = (r->head + 1) & (r->max - 1);
+		return;
+	}
+
+	assert(0);
+}
+
+static inline struct io_u *io_u_rpop(struct io_u_ring *r)
+{
+	if (r->head != r->tail) {
+		struct io_u *io_u = r->ring[r->tail];
+
+		r->tail = (r->tail + 1) & (r->max - 1);
+		return io_u;
+	}
+
+	return NULL;
+}
+
+static inline int io_u_rempty(struct io_u_ring *ring)
+{
+	return ring->head == ring->tail;
+}
+
+#endif
diff --git a/ioengines.c b/ioengines.c
new file mode 100644
index 0000000..2c7a0df
--- /dev/null
+++ b/ioengines.c
@@ -0,0 +1,637 @@
+/*
+ * The io parts of the fio tool, includes workers for sync and mmap'ed
+ * io, as well as both posix and linux libaio support.
+ *
+ * sync io is implemented on top of aio.
+ *
+ * This is not really specific to fio, if the get_io_u/put_io_u functions
+ * and structures were pulled into this as well it would be a perfectly
+ * generic io engine that could be used for other projects.
+ *
+ */
+#include <unistd.h>
+#include <string.h>
+#include <stdlib.h>
+#include <dlfcn.h>
+#include <fcntl.h>
+#include <assert.h>
+
+#include "fio.h"
+#include "diskutil.h"
+#include "zbd.h"
+
+static FLIST_HEAD(engine_list);
+
+static bool check_engine_ops(struct ioengine_ops *ops)
+{
+	if (ops->version != FIO_IOOPS_VERSION) {
+		log_err("bad ioops version %d (want %d)\n", ops->version,
+			FIO_IOOPS_VERSION);
+		return true;
+	}
+
+	if (!ops->queue) {
+		log_err("%s: no queue handler\n", ops->name);
+		return true;
+	}
+
+	/*
+	 * sync engines only need a ->queue()
+	 */
+	if (ops->flags & FIO_SYNCIO)
+		return false;
+
+	if (!ops->event || !ops->getevents) {
+		log_err("%s: no event/getevents handler\n", ops->name);
+		return true;
+	}
+
+	return false;
+}
+
+void unregister_ioengine(struct ioengine_ops *ops)
+{
+	dprint(FD_IO, "ioengine %s unregistered\n", ops->name);
+	flist_del_init(&ops->list);
+}
+
+void register_ioengine(struct ioengine_ops *ops)
+{
+	dprint(FD_IO, "ioengine %s registered\n", ops->name);
+	flist_add_tail(&ops->list, &engine_list);
+}
+
+static struct ioengine_ops *find_ioengine(const char *name)
+{
+	struct ioengine_ops *ops;
+	struct flist_head *entry;
+
+	flist_for_each(entry, &engine_list) {
+		ops = flist_entry(entry, struct ioengine_ops, list);
+		if (!strcmp(name, ops->name))
+			return ops;
+	}
+
+	return NULL;
+}
+
+static struct ioengine_ops *dlopen_ioengine(struct thread_data *td,
+					    const char *engine_lib)
+{
+	struct ioengine_ops *ops;
+	void *dlhandle;
+
+	dprint(FD_IO, "dload engine %s\n", engine_lib);
+
+	dlerror();
+	dlhandle = dlopen(engine_lib, RTLD_LAZY);
+	if (!dlhandle) {
+		td_vmsg(td, -1, dlerror(), "dlopen");
+		return NULL;
+	}
+
+	/*
+	 * Unlike the included modules, external engines should have a
+	 * non-static ioengine structure that we can reference.
+	 */
+	ops = dlsym(dlhandle, engine_lib);
+	if (!ops)
+		ops = dlsym(dlhandle, "ioengine");
+
+	/*
+	 * For some external engines (like C++ ones) it is not that trivial
+	 * to provide a non-static ioengine structure that we can reference.
+	 * Instead we call a method which allocates the required ioengine
+	 * structure.
+	 */
+	if (!ops) {
+		get_ioengine_t get_ioengine = dlsym(dlhandle, "get_ioengine");
+
+		if (get_ioengine)
+			get_ioengine(&ops);
+	}
+
+	if (!ops) {
+		td_vmsg(td, -1, dlerror(), "dlsym");
+		dlclose(dlhandle);
+		return NULL;
+	}
+
+	td->io_ops_dlhandle = dlhandle;
+	return ops;
+}
+
+static struct ioengine_ops *__load_ioengine(const char *engine)
+{
+	/*
+	 * linux libaio has alias names, so convert to what we want
+	 */
+	if (!strncmp(engine, "linuxaio", 8)) {
+		dprint(FD_IO, "converting ioengine name: %s -> libaio\n",
+		       engine);
+		engine = "libaio";
+	}
+
+	dprint(FD_IO, "load ioengine %s\n", engine);
+	return find_ioengine(engine);
+}
+
+struct ioengine_ops *load_ioengine(struct thread_data *td)
+{
+	struct ioengine_ops *ops = NULL;
+	const char *name;
+
+	/*
+	 * Use ->ioengine_so_path if an external ioengine path is specified.
+	 * In this case, ->ioengine is "external" which also means the prefix
+	 * for external ioengines "external:" is properly used.
+	 */
+	name = td->o.ioengine_so_path ?: td->o.ioengine;
+
+	/*
+	 * Try to load ->ioengine first, and if failed try to dlopen(3) either
+	 * ->ioengine or ->ioengine_so_path. This is redundant for an external
+	 * ioengine with prefix, and also leaves the possibility of unexpected
+	 * behavior (e.g. if the "external" ioengine exists), but we do this
+	 * so as not to break job files not using the prefix.
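+	 *
+	 * For example, ioengine=libaio resolves straight from the registered
+	 * engine list, while an external engine given as
+	 * ioengine=external:/path/to/engine.so is picked up via dlopen(3).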
+ */ + ops = __load_ioengine(td->o.ioengine); + if (!ops) + ops = dlopen_ioengine(td, name); + + /* + * If ops is NULL, we failed to load ->ioengine, and also failed to + * dlopen(3) either ->ioengine or ->ioengine_so_path as a path. + */ + if (!ops) { + log_err("fio: engine %s not loadable\n", name); + return NULL; + } + + /* + * Check that the required methods are there. + */ + if (check_engine_ops(ops)) + return NULL; + + return ops; +} + +/* + * For cleaning up an ioengine which never made it to init(). + */ +void free_ioengine(struct thread_data *td) +{ + dprint(FD_IO, "free ioengine %s\n", td->io_ops->name); + + if (td->eo && td->io_ops->options) { + options_free(td->io_ops->options, td->eo); + free(td->eo); + td->eo = NULL; + } + + if (td->io_ops_dlhandle) { + dlclose(td->io_ops_dlhandle); + td->io_ops_dlhandle = NULL; + } + + td->io_ops = NULL; +} + +void close_ioengine(struct thread_data *td) +{ + dprint(FD_IO, "close ioengine %s\n", td->io_ops->name); + + if (td->io_ops->cleanup) { + td->io_ops->cleanup(td); + td->io_ops_data = NULL; + } + + free_ioengine(td); +} + +int td_io_prep(struct thread_data *td, struct io_u *io_u) +{ + dprint_io_u(io_u, "prep"); + fio_ro_check(td, io_u); + + lock_file(td, io_u->file, io_u->ddir); + + if (td->io_ops->prep) { + int ret = td->io_ops->prep(td, io_u); + + dprint(FD_IO, "prep: io_u %p: ret=%d\n", io_u, ret); + + if (ret) + unlock_file(td, io_u->file); + return ret; + } + + return 0; +} + +int td_io_getevents(struct thread_data *td, unsigned int min, unsigned int max, + const struct timespec *t) +{ + int r = 0; + + /* + * For ioengine=rdma one side operation RDMA_WRITE or RDMA_READ, + * server side gets a message from the client + * side that the task is finished, and + * td->done is set to 1 after td_io_commit(). In this case, + * there is no need to reap complete event in server side. + */ + if (td->done) + return 0; + + if (min > 0 && td->io_ops->commit) { + r = td->io_ops->commit(td); + if (r < 0) + goto out; + } + if (max > td->cur_depth) + max = td->cur_depth; + if (min > max) + max = min; + + r = 0; + if (max && td->io_ops->getevents) + r = td->io_ops->getevents(td, min, max, t); +out: + if (r >= 0) { + /* + * Reflect that our submitted requests were retrieved with + * whatever OS async calls are in the underlying engine. + */ + td->io_u_in_flight -= r; + io_u_mark_complete(td, r); + } else + td_verror(td, r, "get_events"); + + dprint(FD_IO, "getevents: %d\n", r); + return r; +} + +enum fio_q_status td_io_queue(struct thread_data *td, struct io_u *io_u) +{ + const enum fio_ddir ddir = acct_ddir(io_u); + unsigned long long buflen = io_u->xfer_buflen; + enum fio_q_status ret; + + dprint_io_u(io_u, "queue"); + fio_ro_check(td, io_u); + + assert((io_u->flags & IO_U_F_FLIGHT) == 0); + io_u_set(td, io_u, IO_U_F_FLIGHT); + + /* + * If overlap checking was enabled in offload mode we + * can release this lock that was acquired when we + * started the overlap check because the IO_U_F_FLIGHT + * flag is now set + */ + if (td_offload_overlap(td)) + pthread_mutex_unlock(&overlap_check); + + assert(fio_file_open(io_u->file)); + + /* + * If using a write iolog, store this entry. 
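+	 * (log_io_u() emits one "filename ddir offset buflen" line per
+	 * entry; see iolog.c.)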
+ */ + log_io_u(td, io_u); + + io_u->error = 0; + io_u->resid = 0; + + if (td_ioengine_flagged(td, FIO_SYNCIO) || + (td_ioengine_flagged(td, FIO_ASYNCIO_SYNC_TRIM) && + io_u->ddir == DDIR_TRIM)) { + if (fio_fill_issue_time(td)) + fio_gettime(&io_u->issue_time, NULL); + + /* + * only used for iolog + */ + if (td->o.read_iolog_file) + memcpy(&td->last_issue, &io_u->issue_time, + sizeof(io_u->issue_time)); + } + + + if (ddir_rw(ddir)) { + if (!(io_u->flags & IO_U_F_VER_LIST)) { + td->io_issues[ddir]++; + td->io_issue_bytes[ddir] += buflen; + } + td->rate_io_issue_bytes[ddir] += buflen; + } + + ret = td->io_ops->queue(td, io_u); + zbd_queue_io_u(io_u, ret); + + unlock_file(td, io_u->file); + + if (ret == FIO_Q_BUSY && ddir_rw(ddir)) { + td->io_issues[ddir]--; + td->io_issue_bytes[ddir] -= buflen; + td->rate_io_issue_bytes[ddir] -= buflen; + io_u_clear(td, io_u, IO_U_F_FLIGHT); + } + + /* + * If an error was seen and the io engine didn't propagate it + * back to 'td', do so. + */ + if (io_u->error && !td->error) + td_verror(td, io_u->error, "td_io_queue"); + + /* + * Add warning for O_DIRECT so that users have an easier time + * spotting potentially bad alignment. If this triggers for the first + * IO, then it's likely an alignment problem or because the host fs + * does not support O_DIRECT + */ + if (io_u->error == EINVAL && td->io_issues[io_u->ddir & 1] == 1 && + td->o.odirect) { + + log_info("fio: first direct IO errored. File system may not " + "support direct IO, or iomem_align= is bad, or " + "invalid block size. Try setting direct=0.\n"); + } + + if (zbd_unaligned_write(io_u->error) && + td->io_issues[io_u->ddir & 1] == 1 && + td->o.zone_mode != ZONE_MODE_ZBD) { + log_info("fio: first I/O failed. If %s is a zoned block device, consider --zonemode=zbd\n", + io_u->file->file_name); + } + + if (!td->io_ops->commit) { + io_u_mark_submit(td, 1); + io_u_mark_complete(td, 1); + zbd_put_io_u(io_u); + } + + if (ret == FIO_Q_COMPLETED) { + if (ddir_rw(io_u->ddir) || + (ddir_sync(io_u->ddir) && td->runstate != TD_FSYNCING)) { + io_u_mark_depth(td, 1); + td->ts.total_io_u[io_u->ddir]++; + } + } else if (ret == FIO_Q_QUEUED) { + td->io_u_queued++; + + if (ddir_rw(io_u->ddir) || + (ddir_sync(io_u->ddir) && td->runstate != TD_FSYNCING)) + td->ts.total_io_u[io_u->ddir]++; + + if (td->io_u_queued >= td->o.iodepth_batch) + td_io_commit(td); + } + + if (!td_ioengine_flagged(td, FIO_SYNCIO) && + (!td_ioengine_flagged(td, FIO_ASYNCIO_SYNC_TRIM) || + io_u->ddir != DDIR_TRIM)) { + if (fio_fill_issue_time(td)) + fio_gettime(&io_u->issue_time, NULL); + + /* + * only used for iolog + */ + if (td->o.read_iolog_file) + memcpy(&td->last_issue, &io_u->issue_time, + sizeof(io_u->issue_time)); + } + + return ret; +} + +int td_io_init(struct thread_data *td) +{ + int ret = 0; + + if (td->io_ops->init) { + ret = td->io_ops->init(td); + if (ret) + log_err("fio: io engine %s init failed.%s\n", + td->io_ops->name, + td->o.iodepth > 1 ? + " Perhaps try reducing io depth?" : ""); + else + td->io_ops_init = 1; + if (!td->error) + td->error = ret; + } + + return ret; +} + +void td_io_commit(struct thread_data *td) +{ + int ret; + + dprint(FD_IO, "calling ->commit(), depth %d\n", td->cur_depth); + + if (!td->cur_depth || !td->io_u_queued) + return; + + io_u_mark_depth(td, td->io_u_queued); + + if (td->io_ops->commit) { + ret = td->io_ops->commit(td); + if (ret) + td_verror(td, -ret, "io commit"); + } + + /* + * Reflect that events were submitted as async IO requests. 
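+	 * The in-flight count is dropped again in td_io_getevents() as
+	 * completions are reaped.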
+ */ + td->io_u_in_flight += td->io_u_queued; + td->io_u_queued = 0; +} + +int td_io_open_file(struct thread_data *td, struct fio_file *f) +{ + if (fio_file_closing(f)) { + /* + * Open translates to undo closing. + */ + fio_file_clear_closing(f); + get_file(f); + return 0; + } + assert(!fio_file_open(f)); + assert(f->fd == -1); + assert(td->io_ops->open_file); + + if (td->io_ops->open_file(td, f)) { + if (td->error == EINVAL && td->o.odirect) + log_err("fio: destination does not support O_DIRECT\n"); + if (td->error == EMFILE) { + log_err("fio: try reducing/setting openfiles (failed" + " at %u of %u)\n", td->nr_open_files, + td->o.nr_files); + } + + assert(f->fd == -1); + assert(!fio_file_open(f)); + return 1; + } + + fio_file_reset(td, f); + fio_file_set_open(f); + fio_file_clear_closing(f); + disk_util_inc(f->du); + + td->nr_open_files++; + get_file(f); + + if (f->filetype == FIO_TYPE_PIPE) { + if (td_random(td)) { + log_err("fio: can't seek on pipes (no random io)\n"); + goto err; + } + } + + if (td_ioengine_flagged(td, FIO_DISKLESSIO)) + goto done; + + if (td->o.invalidate_cache && file_invalidate_cache(td, f)) + goto err; + + if (td->o.fadvise_hint != F_ADV_NONE && + (f->filetype == FIO_TYPE_BLOCK || f->filetype == FIO_TYPE_FILE)) { + int flags; + + if (td->o.fadvise_hint == F_ADV_TYPE) { + if (td_random(td)) + flags = POSIX_FADV_RANDOM; + else + flags = POSIX_FADV_SEQUENTIAL; + } else if (td->o.fadvise_hint == F_ADV_RANDOM) + flags = POSIX_FADV_RANDOM; + else if (td->o.fadvise_hint == F_ADV_SEQUENTIAL) + flags = POSIX_FADV_SEQUENTIAL; + else { + log_err("fio: unknown fadvise type %d\n", + td->o.fadvise_hint); + flags = POSIX_FADV_NORMAL; + } + + if (posix_fadvise(f->fd, f->file_offset, f->io_size, flags) < 0) { + if (!fio_did_warn(FIO_WARN_FADVISE)) + log_err("fio: fadvise hint failed\n"); + } + } +#ifdef FIO_HAVE_WRITE_HINT + if (fio_option_is_set(&td->o, write_hint) && + (f->filetype == FIO_TYPE_BLOCK || f->filetype == FIO_TYPE_FILE)) { + uint64_t hint = td->o.write_hint; + int cmd; + + /* + * For direct IO, we just need/want to set the hint on + * the file descriptor. For buffered IO, we need to set + * it on the inode. 
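+	 * F_SET_FILE_RW_HINT applies the hint to the open file description,
+	 * while F_SET_RW_HINT applies it to the inode itself.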
+ */ + if (td->o.odirect) + cmd = F_SET_FILE_RW_HINT; + else + cmd = F_SET_RW_HINT; + + if (fcntl(f->fd, cmd, &hint) < 0) { + td_verror(td, errno, "fcntl write hint"); + goto err; + } + } +#endif + + if (td->o.odirect && !OS_O_DIRECT && fio_set_directio(td, f)) + goto err; + +done: + log_file(td, f, FIO_LOG_OPEN_FILE); + return 0; +err: + disk_util_dec(f->du); + if (td->io_ops->close_file) + td->io_ops->close_file(td, f); + return 1; +} + +int td_io_close_file(struct thread_data *td, struct fio_file *f) +{ + if (!fio_file_closing(f)) + log_file(td, f, FIO_LOG_CLOSE_FILE); + + /* + * mark as closing, do real close when last io on it has completed + */ + fio_file_set_closing(f); + + return put_file(td, f); +} + +int td_io_unlink_file(struct thread_data *td, struct fio_file *f) +{ + if (td->io_ops->unlink_file) + return td->io_ops->unlink_file(td, f); + else { + int ret; + + ret = unlink(f->file_name); + if (ret < 0) + return errno; + + return 0; + } +} + +int td_io_get_file_size(struct thread_data *td, struct fio_file *f) +{ + if (!td->io_ops->get_file_size) + return 0; + + return td->io_ops->get_file_size(td, f); +} + +int fio_show_ioengine_help(const char *engine) +{ + struct flist_head *entry; + struct thread_data td; + struct ioengine_ops *io_ops; + char *sep; + int ret = 1; + + if (!engine || !*engine) { + log_info("Available IO engines:\n"); + flist_for_each(entry, &engine_list) { + io_ops = flist_entry(entry, struct ioengine_ops, list); + log_info("\t%s\n", io_ops->name); + } + return 0; + } + sep = strchr(engine, ','); + if (sep) { + *sep = 0; + sep++; + } + + memset(&td, 0, sizeof(struct thread_data)); + td.o.ioengine = (char *)engine; + io_ops = load_ioengine(&td); + + if (!io_ops) { + log_info("IO engine %s not found\n", engine); + return 1; + } + + if (io_ops->options) + ret = show_cmd_help(io_ops->options, sep); + else + log_info("IO engine %s has no options\n", io_ops->name); + + free_ioengine(&td); + return ret; +} diff --git a/ioengines.h b/ioengines.h new file mode 100644 index 0000000..01a9b58 --- /dev/null +++ b/ioengines.h @@ -0,0 +1,96 @@ +#ifndef FIO_IOENGINE_H +#define FIO_IOENGINE_H + +#include + +#include "compiler/compiler.h" +#include "flist.h" +#include "io_u.h" + +#define FIO_IOOPS_VERSION 25 + +/* + * io_ops->queue() return values + */ +enum fio_q_status { + FIO_Q_COMPLETED = 0, /* completed sync */ + FIO_Q_QUEUED = 1, /* queued, will complete async */ + FIO_Q_BUSY = 2, /* no more room, call ->commit() */ +}; + +struct ioengine_ops { + struct flist_head list; + const char *name; + int version; + int flags; + int (*setup)(struct thread_data *); + int (*init)(struct thread_data *); + int (*post_init)(struct thread_data *); + int (*prep)(struct thread_data *, struct io_u *); + enum fio_q_status (*queue)(struct thread_data *, struct io_u *); + int (*commit)(struct thread_data *); + int (*getevents)(struct thread_data *, unsigned int, unsigned int, const struct timespec *); + struct io_u *(*event)(struct thread_data *, int); + char *(*errdetails)(struct io_u *); + int (*cancel)(struct thread_data *, struct io_u *); + void (*cleanup)(struct thread_data *); + int (*open_file)(struct thread_data *, struct fio_file *); + int (*close_file)(struct thread_data *, struct fio_file *); + int (*invalidate)(struct thread_data *, struct fio_file *); + int (*unlink_file)(struct thread_data *, struct fio_file *); + int (*get_file_size)(struct thread_data *, struct fio_file *); + void (*terminate)(struct thread_data *); + int (*iomem_alloc)(struct thread_data *, size_t); + void 
(*iomem_free)(struct thread_data *);
+	int (*io_u_init)(struct thread_data *, struct io_u *);
+	void (*io_u_free)(struct thread_data *, struct io_u *);
+	int option_struct_size;
+	struct fio_option *options;
+};
+
+enum fio_ioengine_flags {
+	FIO_SYNCIO	= 1 << 0,	/* io engine has synchronous ->queue */
+	FIO_RAWIO	= 1 << 1,	/* some sort of direct/raw io */
+	FIO_DISKLESSIO	= 1 << 2,	/* no disk involved */
+	FIO_NOEXTEND	= 1 << 3,	/* engine can't extend file */
+	FIO_NODISKUTIL	= 1 << 4,	/* diskutil can't handle filename */
+	FIO_UNIDIR	= 1 << 5,	/* engine is uni-directional */
+	FIO_NOIO	= 1 << 6,	/* thread does only pseudo IO */
+	FIO_PIPEIO	= 1 << 7,	/* input/output not seekable */
+	FIO_BARRIER	= 1 << 8,	/* engine supports barriers */
+	FIO_MEMALIGN	= 1 << 9,	/* engine wants aligned memory */
+	FIO_BIT_BASED	= 1 << 10,	/* engine uses a bit base (e.g. uses Kbit as opposed to KB) */
+	FIO_FAKEIO	= 1 << 11,	/* engine pretends to do IO */
+	FIO_NOSTATS	= 1 << 12,	/* don't do IO stats */
+	FIO_NOFILEHASH	= 1 << 13,	/* doesn't hash the files for lookup later. */
+	FIO_ASYNCIO_SYNC_TRIM
+			= 1 << 14	/* io engine has async ->queue except for trim */
+};
+
+/*
+ * External engine defined symbol to fill in the engine ops structure
+ */
+typedef void (*get_ioengine_t)(struct ioengine_ops **);
+
+/*
+ * io engine entry points
+ */
+extern int __must_check td_io_init(struct thread_data *);
+extern int __must_check td_io_prep(struct thread_data *, struct io_u *);
+extern enum fio_q_status __must_check td_io_queue(struct thread_data *, struct io_u *);
+extern int __must_check td_io_getevents(struct thread_data *, unsigned int, unsigned int, const struct timespec *);
+extern void td_io_commit(struct thread_data *);
+extern int __must_check td_io_open_file(struct thread_data *, struct fio_file *);
+extern int td_io_close_file(struct thread_data *, struct fio_file *);
+extern int td_io_unlink_file(struct thread_data *, struct fio_file *);
+extern int __must_check td_io_get_file_size(struct thread_data *, struct fio_file *);
+
+extern struct ioengine_ops *load_ioengine(struct thread_data *);
+extern void register_ioengine(struct ioengine_ops *);
+extern void unregister_ioengine(struct ioengine_ops *);
+extern void free_ioengine(struct thread_data *);
+extern void close_ioengine(struct thread_data *);
+
+extern int fio_show_ioengine_help(const char *engine);
+
+#endif
diff --git a/iolog.c b/iolog.c
new file mode 100644
index 0000000..917a446
--- /dev/null
+++ b/iolog.c
@@ -0,0 +1,1743 @@
+/*
+ * Code related to writing an iolog of what a thread is doing, and to
+ * later read that back and replay
+ */
+#include <stdio.h>
+#include <stdlib.h>
+#include <assert.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <unistd.h>
+#ifdef CONFIG_ZLIB
+#include <zlib.h>
+#endif
+
+#include "flist.h"
+#include "fio.h"
+#include "trim.h"
+#include "filelock.h"
+#include "smalloc.h"
+#include "blktrace.h"
+#include "pshared.h"
+
+#include <netinet/in.h>
+#include <netinet/tcp.h>
+#include <arpa/inet.h>
+#include <sys/stat.h>
+#include <sys/socket.h>
+#include <sys/un.h>
+
+static int iolog_flush(struct io_log *log);
+
+static const char iolog_ver2[] = "fio version 2 iolog";
+
+void queue_io_piece(struct thread_data *td, struct io_piece *ipo)
+{
+	flist_add_tail(&ipo->list, &td->io_log_list);
+	td->total_io_size += ipo->len;
+}
+
+void log_io_u(const struct thread_data *td, const struct io_u *io_u)
+{
+	if (!td->o.write_iolog_file)
+		return;
+
+	fprintf(td->iolog_f, "%s %s %llu %llu\n", io_u->file->file_name,
+						io_ddir_name(io_u->ddir),
+						io_u->offset, io_u->buflen);
+}
+
+void log_file(struct thread_data *td, struct fio_file *f,
+	      enum file_log_act what)
+{
+	const char
*act[] = { "add", "open", "close" }; + + assert(what < 3); + + if (!td->o.write_iolog_file) + return; + + + /* + * this happens on the pre-open/close done before the job starts + */ + if (!td->iolog_f) + return; + + fprintf(td->iolog_f, "%s %s\n", f->file_name, act[what]); +} + +static void iolog_delay(struct thread_data *td, unsigned long delay) +{ + uint64_t usec = utime_since_now(&td->last_issue); + unsigned long orig_delay = delay; + uint64_t this_delay; + struct timespec ts; + + if (delay < td->time_offset) { + td->time_offset = 0; + return; + } + + delay -= td->time_offset; + if (delay < usec) + return; + + delay -= usec; + + fio_gettime(&ts, NULL); + while (delay && !td->terminate) { + this_delay = delay; + if (this_delay > 500000) + this_delay = 500000; + + usec_sleep(td, this_delay); + delay -= this_delay; + } + + usec = utime_since_now(&ts); + if (usec > orig_delay) + td->time_offset = usec - orig_delay; + else + td->time_offset = 0; +} + +static int ipo_special(struct thread_data *td, struct io_piece *ipo) +{ + struct fio_file *f; + int ret; + + /* + * Not a special ipo + */ + if (ipo->ddir != DDIR_INVAL) + return 0; + + f = td->files[ipo->fileno]; + + switch (ipo->file_action) { + case FIO_LOG_OPEN_FILE: + if (td->o.replay_redirect && fio_file_open(f)) { + dprint(FD_FILE, "iolog: ignoring re-open of file %s\n", + f->file_name); + break; + } + ret = td_io_open_file(td, f); + if (!ret) + break; + td_verror(td, ret, "iolog open file"); + return -1; + case FIO_LOG_CLOSE_FILE: + td_io_close_file(td, f); + break; + case FIO_LOG_UNLINK_FILE: + td_io_unlink_file(td, f); + break; + default: + log_err("fio: bad file action %d\n", ipo->file_action); + break; + } + + return 1; +} + +static bool read_iolog2(struct thread_data *td); + +int read_iolog_get(struct thread_data *td, struct io_u *io_u) +{ + struct io_piece *ipo; + unsigned long elapsed; + + while (!flist_empty(&td->io_log_list)) { + int ret; + if (td->o.read_iolog_chunked) { + if (td->io_log_checkmark == td->io_log_current) { + if (!read_iolog2(td)) + return 1; + } + td->io_log_current--; + } + ipo = flist_first_entry(&td->io_log_list, struct io_piece, list); + flist_del(&ipo->list); + remove_trim_entry(td, ipo); + + ret = ipo_special(td, ipo); + if (ret < 0) { + free(ipo); + break; + } else if (ret > 0) { + free(ipo); + continue; + } + + io_u->ddir = ipo->ddir; + if (ipo->ddir != DDIR_WAIT) { + io_u->offset = ipo->offset; + io_u->buflen = ipo->len; + io_u->file = td->files[ipo->fileno]; + get_file(io_u->file); + dprint(FD_IO, "iolog: get %llu/%llu/%s\n", io_u->offset, + io_u->buflen, io_u->file->file_name); + if (ipo->delay) + iolog_delay(td, ipo->delay); + } else { + elapsed = mtime_since_genesis(); + if (ipo->delay > elapsed) + usec_sleep(td, (ipo->delay - elapsed) * 1000); + } + + free(ipo); + + if (io_u->ddir != DDIR_WAIT) + return 0; + } + + td->done = 1; + return 1; +} + +void prune_io_piece_log(struct thread_data *td) +{ + struct io_piece *ipo; + struct fio_rb_node *n; + + while ((n = rb_first(&td->io_hist_tree)) != NULL) { + ipo = rb_entry(n, struct io_piece, rb_node); + rb_erase(n, &td->io_hist_tree); + remove_trim_entry(td, ipo); + td->io_hist_len--; + free(ipo); + } + + while (!flist_empty(&td->io_hist_list)) { + ipo = flist_first_entry(&td->io_hist_list, struct io_piece, list); + flist_del(&ipo->list); + remove_trim_entry(td, ipo); + td->io_hist_len--; + free(ipo); + } +} + +/* + * log a successful write, so we can unwind the log for verify + */ +void log_io_piece(struct thread_data *td, struct io_u *io_u) +{ + struct 
fio_rb_node **p, *parent; + struct io_piece *ipo, *__ipo; + + ipo = calloc(1, sizeof(struct io_piece)); + init_ipo(ipo); + ipo->file = io_u->file; + ipo->offset = io_u->offset; + ipo->len = io_u->buflen; + ipo->numberio = io_u->numberio; + ipo->flags = IP_F_IN_FLIGHT; + + io_u->ipo = ipo; + + if (io_u_should_trim(td, io_u)) { + flist_add_tail(&ipo->trim_list, &td->trim_list); + td->trim_entries++; + } + + /* + * Only sort writes if we don't have a random map in which case we need + * to check for duplicate blocks and drop the old one, which we rely on + * the rb insert/lookup for handling. + */ + if (file_randommap(td, ipo->file)) { + INIT_FLIST_HEAD(&ipo->list); + flist_add_tail(&ipo->list, &td->io_hist_list); + ipo->flags |= IP_F_ONLIST; + td->io_hist_len++; + return; + } + + RB_CLEAR_NODE(&ipo->rb_node); + + /* + * Sort the entry into the verification list + */ +restart: + p = &td->io_hist_tree.rb_node; + parent = NULL; + while (*p) { + int overlap = 0; + parent = *p; + + __ipo = rb_entry(parent, struct io_piece, rb_node); + if (ipo->file < __ipo->file) + p = &(*p)->rb_left; + else if (ipo->file > __ipo->file) + p = &(*p)->rb_right; + else if (ipo->offset < __ipo->offset) { + p = &(*p)->rb_left; + overlap = ipo->offset + ipo->len > __ipo->offset; + } + else if (ipo->offset > __ipo->offset) { + p = &(*p)->rb_right; + overlap = __ipo->offset + __ipo->len > ipo->offset; + } + else + overlap = 1; + + if (overlap) { + dprint(FD_IO, "iolog: overlap %llu/%lu, %llu/%lu\n", + __ipo->offset, __ipo->len, + ipo->offset, ipo->len); + td->io_hist_len--; + rb_erase(parent, &td->io_hist_tree); + remove_trim_entry(td, __ipo); + if (!(__ipo->flags & IP_F_IN_FLIGHT)) + free(__ipo); + goto restart; + } + } + + rb_link_node(&ipo->rb_node, parent, p); + rb_insert_color(&ipo->rb_node, &td->io_hist_tree); + ipo->flags |= IP_F_ONRB; + td->io_hist_len++; +} + +void unlog_io_piece(struct thread_data *td, struct io_u *io_u) +{ + struct io_piece *ipo = io_u->ipo; + + if (td->ts.nr_block_infos) { + uint32_t *info = io_u_block_info(td, io_u); + if (BLOCK_INFO_STATE(*info) < BLOCK_STATE_TRIM_FAILURE) { + if (io_u->ddir == DDIR_TRIM) + *info = BLOCK_INFO_SET_STATE(*info, + BLOCK_STATE_TRIM_FAILURE); + else if (io_u->ddir == DDIR_WRITE) + *info = BLOCK_INFO_SET_STATE(*info, + BLOCK_STATE_WRITE_FAILURE); + } + } + + if (!ipo) + return; + + if (ipo->flags & IP_F_ONRB) + rb_erase(&ipo->rb_node, &td->io_hist_tree); + else if (ipo->flags & IP_F_ONLIST) + flist_del(&ipo->list); + + free(ipo); + io_u->ipo = NULL; + td->io_hist_len--; +} + +void trim_io_piece(const struct io_u *io_u) +{ + struct io_piece *ipo = io_u->ipo; + + if (!ipo) + return; + + ipo->len = io_u->xfer_buflen - io_u->resid; +} + +void write_iolog_close(struct thread_data *td) +{ + fflush(td->iolog_f); + fclose(td->iolog_f); + free(td->iolog_buf); + td->iolog_f = NULL; + td->iolog_buf = NULL; +} + +static int64_t iolog_items_to_fetch(struct thread_data *td) +{ + struct timespec now; + uint64_t elapsed; + uint64_t for_1s; + int64_t items_to_fetch; + + if (!td->io_log_highmark) + return 10; + + + fio_gettime(&now, NULL); + elapsed = ntime_since(&td->io_log_highmark_time, &now); + if (elapsed) { + for_1s = (td->io_log_highmark - td->io_log_current) * 1000000000 / elapsed; + items_to_fetch = for_1s - td->io_log_current; + if (items_to_fetch < 0) + items_to_fetch = 0; + } else + items_to_fetch = 0; + + td->io_log_highmark = td->io_log_current + items_to_fetch; + td->io_log_checkmark = (td->io_log_highmark + 1) / 2; + fio_gettime(&td->io_log_highmark_time, NULL); + 
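+	/*
+	 * Aim to queue roughly one second's worth of entries, extrapolated
+	 * from how quickly the log has been consumed since the last refill;
+	 * the checkmark is the point at which the next refill triggers.
+	 */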
+ return items_to_fetch; +} + +/* + * Read version 2 iolog data. It is enhanced to include per-file logging, + * syncs, etc. + */ +static bool read_iolog2(struct thread_data *td) +{ + unsigned long long offset; + unsigned int bytes; + int reads, writes, waits, fileno = 0, file_action = 0; /* stupid gcc */ + char *rfname, *fname, *act; + char *str, *p; + enum fio_ddir rw; + bool realloc = false; + int64_t items_to_fetch = 0; + + if (td->o.read_iolog_chunked) { + items_to_fetch = iolog_items_to_fetch(td); + if (!items_to_fetch) + return true; + } + + /* + * Read in the read iolog and store it, reuse the infrastructure + * for doing verifications. + */ + str = malloc(4096); + rfname = fname = malloc(256+16); + act = malloc(256+16); + + reads = writes = waits = 0; + while ((p = fgets(str, 4096, td->io_log_rfile)) != NULL) { + struct io_piece *ipo; + int r; + + r = sscanf(p, "%256s %256s %llu %u", rfname, act, &offset, + &bytes); + + if (td->o.replay_redirect) + fname = td->o.replay_redirect; + + if (r == 4) { + /* + * Check action first + */ + if (!strcmp(act, "wait")) + rw = DDIR_WAIT; + else if (!strcmp(act, "read")) + rw = DDIR_READ; + else if (!strcmp(act, "write")) + rw = DDIR_WRITE; + else if (!strcmp(act, "sync")) + rw = DDIR_SYNC; + else if (!strcmp(act, "datasync")) + rw = DDIR_DATASYNC; + else if (!strcmp(act, "trim")) + rw = DDIR_TRIM; + else { + log_err("fio: bad iolog file action: %s\n", + act); + continue; + } + fileno = get_fileno(td, fname); + } else if (r == 2) { + rw = DDIR_INVAL; + if (!strcmp(act, "add")) { + if (td->o.replay_redirect && + get_fileno(td, fname) != -1) { + dprint(FD_FILE, "iolog: ignoring" + " re-add of file %s\n", fname); + } else { + fileno = add_file(td, fname, td->subjob_number, 1); + file_action = FIO_LOG_ADD_FILE; + } + continue; + } else if (!strcmp(act, "open")) { + fileno = get_fileno(td, fname); + file_action = FIO_LOG_OPEN_FILE; + } else if (!strcmp(act, "close")) { + fileno = get_fileno(td, fname); + file_action = FIO_LOG_CLOSE_FILE; + } else { + log_err("fio: bad iolog file action: %s\n", + act); + continue; + } + } else { + log_err("bad iolog2: %s\n", p); + continue; + } + + if (rw == DDIR_READ) + reads++; + else if (rw == DDIR_WRITE) { + /* + * Don't add a write for ro mode + */ + if (read_only) + continue; + writes++; + } else if (rw == DDIR_WAIT) { + if (td->o.no_stall) + continue; + waits++; + } else if (rw == DDIR_INVAL) { + } else if (!ddir_sync(rw)) { + log_err("bad ddir: %d\n", rw); + continue; + } + + /* + * Make note of file + */ + ipo = calloc(1, sizeof(*ipo)); + init_ipo(ipo); + ipo->ddir = rw; + if (rw == DDIR_WAIT) { + ipo->delay = offset; + } else { + if (td->o.replay_scale) + ipo->offset = offset / td->o.replay_scale; + else + ipo->offset = offset; + ipo_bytes_align(td->o.replay_align, ipo); + + ipo->len = bytes; + if (rw != DDIR_INVAL && bytes > td->o.max_bs[rw]) { + realloc = true; + td->o.max_bs[rw] = bytes; + } + ipo->fileno = fileno; + ipo->file_action = file_action; + td->o.size += bytes; + } + + queue_io_piece(td, ipo); + + if (td->o.read_iolog_chunked) { + td->io_log_current++; + items_to_fetch--; + if (items_to_fetch == 0) + break; + } + } + + free(str); + free(act); + free(rfname); + + if (td->o.read_iolog_chunked) { + td->io_log_highmark = td->io_log_current; + td->io_log_checkmark = (td->io_log_highmark + 1) / 2; + fio_gettime(&td->io_log_highmark_time, NULL); + } + + if (writes && read_only) { + log_err("fio: <%s> skips replay of %d writes due to" + " read-only\n", td->o.name, writes); + writes = 0; + } + + if 
(td->o.read_iolog_chunked) {
+		if (td->io_log_current == 0) {
+			return false;
+		}
+		td->o.td_ddir = TD_DDIR_RW;
+		if (realloc && td->orig_buffer) {
+			io_u_quiesce(td);
+			free_io_mem(td);
+			init_io_u_buffers(td);
+		}
+		return true;
+	}
+
+	if (!reads && !writes && !waits)
+		return false;
+	else if (reads && !writes)
+		td->o.td_ddir = TD_DDIR_READ;
+	else if (!reads && writes)
+		td->o.td_ddir = TD_DDIR_WRITE;
+	else
+		td->o.td_ddir = TD_DDIR_RW;
+
+	return true;
+}
+
+static bool is_socket(const char *path)
+{
+	struct stat buf;
+	int r;
+
+	r = stat(path, &buf);
+	if (r == -1)
+		return false;
+
+	return S_ISSOCK(buf.st_mode);
+}
+
+static int open_socket(const char *path)
+{
+	struct sockaddr_un addr;
+	int ret, fd;
+
+	fd = socket(AF_UNIX, SOCK_STREAM, 0);
+	if (fd < 0)
+		return fd;
+
+	addr.sun_family = AF_UNIX;
+	if (snprintf(addr.sun_path, sizeof(addr.sun_path), "%s", path) >=
+	    sizeof(addr.sun_path)) {
+		log_err("%s: path name %s is too long for a Unix socket\n",
+			__func__, path);
+		close(fd);
+		return -1;
+	}
+
+	ret = connect(fd, (const struct sockaddr *)&addr, strlen(path) + sizeof(addr.sun_family));
+	if (!ret)
+		return fd;
+
+	close(fd);
+	return -1;
+}
+
+/*
+ * open iolog, check version, and call appropriate parser
+ */
+static bool init_iolog_read(struct thread_data *td)
+{
+	char buffer[256], *p, *fname;
+	FILE *f = NULL;
+
+	fname = get_name_by_idx(td->o.read_iolog_file, td->subjob_number);
+	dprint(FD_IO, "iolog: name=%s\n", fname);
+
+	if (is_socket(fname)) {
+		int fd;
+
+		fd = open_socket(fname);
+		if (fd >= 0)
+			f = fdopen(fd, "r");
+	} else
+		f = fopen(fname, "r");
+
+	free(fname);
+
+	if (!f) {
+		perror("fopen read iolog");
+		return false;
+	}
+
+	p = fgets(buffer, sizeof(buffer), f);
+	if (!p) {
+		td_verror(td, errno, "iolog read");
+		log_err("fio: unable to read iolog\n");
+		fclose(f);
+		return false;
+	}
+
+	/*
+	 * version 2 of the iolog stores a specific string as the
+	 * first line, check for that
+	 */
+	if (!strncmp(iolog_ver2, buffer, strlen(iolog_ver2))) {
+		free_release_files(td);
+		td->io_log_rfile = f;
+		return read_iolog2(td);
+	}
+
+	log_err("fio: iolog version 1 is no longer supported\n");
+	fclose(f);
+	return false;
+}
+
+/*
+ * Set up a log for storing io patterns.
+ */
+static bool init_iolog_write(struct thread_data *td)
+{
+	struct fio_file *ff;
+	FILE *f;
+	unsigned int i;
+
+	f = fopen(td->o.write_iolog_file, "a");
+	if (!f) {
+		perror("fopen write iolog");
+		return false;
+	}
+
+	/*
+	 * That's it for writing, setup a log buffer and we're done.
+	 */
+	td->iolog_f = f;
+	td->iolog_buf = malloc(8192);
+	setvbuf(f, td->iolog_buf, _IOFBF, 8192);
+
+	/*
+	 * write our version line
+	 */
+	if (fprintf(f, "%s\n", iolog_ver2) < 0) {
+		perror("iolog init");
+		return false;
+	}
+
+	/*
+	 * add all known files
+	 */
+	for_each_file(td, ff, i)
+		log_file(td, ff, FIO_LOG_ADD_FILE);
+
+	return true;
+}
+
+bool init_iolog(struct thread_data *td)
+{
+	bool ret;
+
+	if (td->o.read_iolog_file) {
+		int need_swap;
+
+		/*
+		 * Check if it's a blktrace file and load that if possible.
+		 * Otherwise assume it's a normal log file and load that.
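+		 *
+		 * A version 2 iolog is plain text with one action per
+		 * line; an illustrative sample (names and sizes are made
+		 * up):
+		 *
+		 *	/dev/sdb add
+		 *	/dev/sdb open
+		 *	/dev/sdb read 0 4096
+		 *	/dev/sdb write 4096 4096
+		 *	/dev/sdb close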
+		 */
+		if (is_blktrace(td->o.read_iolog_file, &need_swap))
+			ret = load_blktrace(td, td->o.read_iolog_file, need_swap);
+		else
+			ret = init_iolog_read(td);
+	} else if (td->o.write_iolog_file)
+		ret = init_iolog_write(td);
+	else
+		ret = true;
+
+	if (!ret)
+		td_verror(td, EINVAL, "failed initializing iolog");
+
+	return ret;
+}
+
+void setup_log(struct io_log **log, struct log_params *p,
+	       const char *filename)
+{
+	struct io_log *l;
+	int i;
+	struct io_u_plat_entry *entry;
+	struct flist_head *list;
+
+	l = scalloc(1, sizeof(*l));
+	INIT_FLIST_HEAD(&l->io_logs);
+	l->log_type = p->log_type;
+	l->log_offset = p->log_offset;
+	l->log_gz = p->log_gz;
+	l->log_gz_store = p->log_gz_store;
+	l->avg_msec = p->avg_msec;
+	l->hist_msec = p->hist_msec;
+	l->hist_coarseness = p->hist_coarseness;
+	l->filename = strdup(filename);
+	l->td = p->td;
+
+	/* Initialize histogram lists for each r/w direction,
+	 * with initial io_u_plat of all zeros:
+	 */
+	for (i = 0; i < DDIR_RWDIR_CNT; i++) {
+		list = &l->hist_window[i].list;
+		INIT_FLIST_HEAD(list);
+		entry = calloc(1, sizeof(struct io_u_plat_entry));
+		flist_add(&entry->list, list);
+	}
+
+	if (l->td && l->td->o.io_submit_mode != IO_MODE_OFFLOAD) {
+		struct io_logs *__p;
+
+		__p = calloc(1, sizeof(*l->pending));
+		__p->max_samples = DEF_LOG_ENTRIES;
+		__p->log = calloc(__p->max_samples, log_entry_sz(l));
+		l->pending = __p;
+	}
+
+	if (l->log_offset)
+		l->log_ddir_mask = LOG_OFFSET_SAMPLE_BIT;
+
+	INIT_FLIST_HEAD(&l->chunk_list);
+
+	if (l->log_gz && !p->td)
+		l->log_gz = 0;
+	else if (l->log_gz || l->log_gz_store) {
+		mutex_init_pshared(&l->chunk_lock);
+		mutex_init_pshared(&l->deferred_free_lock);
+		p->td->flags |= TD_F_COMPRESS_LOG;
+	}
+
+	*log = l;
+}
+
+#ifdef CONFIG_SETVBUF
+static void *set_file_buffer(FILE *f)
+{
+	size_t size = 1048576;
+	void *buf;
+
+	buf = malloc(size);
+	setvbuf(f, buf, _IOFBF, size);
+	return buf;
+}
+
+static void clear_file_buffer(void *buf)
+{
+	free(buf);
+}
+#else
+static void *set_file_buffer(FILE *f)
+{
+	return NULL;
+}
+
+static void clear_file_buffer(void *buf)
+{
+}
+#endif
+
+void free_log(struct io_log *log)
+{
+	while (!flist_empty(&log->io_logs)) {
+		struct io_logs *cur_log;
+
+		cur_log = flist_first_entry(&log->io_logs, struct io_logs, list);
+		flist_del_init(&cur_log->list);
+		free(cur_log->log);
+		sfree(cur_log);
+	}
+
+	if (log->pending) {
+		free(log->pending->log);
+		free(log->pending);
+		log->pending = NULL;
+	}
+
+	free(log->filename);
+	sfree(log);
+}
+
+uint64_t hist_sum(int j, int stride, uint64_t *io_u_plat,
+		  uint64_t *io_u_plat_last)
+{
+	uint64_t sum;
+	int k;
+
+	if (io_u_plat_last) {
+		for (k = sum = 0; k < stride; k++)
+			sum += io_u_plat[j + k] - io_u_plat_last[j + k];
+	} else {
+		for (k = sum = 0; k < stride; k++)
+			sum += io_u_plat[j + k];
+	}
+
+	return sum;
+}
+
+static void flush_hist_samples(FILE *f, int hist_coarseness, void *samples,
+			       uint64_t sample_size)
+{
+	struct io_sample *s;
+	int log_offset;
+	uint64_t i, j, nr_samples;
+	struct io_u_plat_entry *entry, *entry_before;
+	uint64_t *io_u_plat;
+	uint64_t *io_u_plat_before;
+
+	int stride = 1 << hist_coarseness;
+
+	if (!sample_size)
+		return;
+
+	s = __get_sample(samples, 0, 0);
+	log_offset = (s->__ddir & LOG_OFFSET_SAMPLE_BIT) != 0;
+
+	nr_samples = sample_size / __log_entry_sz(log_offset);
+
+	for (i = 0; i < nr_samples; i++) {
+		s = __get_sample(samples, log_offset, i);
+
+		entry = s->data.plat_entry;
+		io_u_plat = entry->io_u_plat;
+
+		entry_before = flist_first_entry(&entry->list, struct
io_u_plat_entry, list); + io_u_plat_before = entry_before->io_u_plat; + + fprintf(f, "%lu, %u, %llu, ", (unsigned long) s->time, + io_sample_ddir(s), (unsigned long long) s->bs); + for (j = 0; j < FIO_IO_U_PLAT_NR - stride; j += stride) { + fprintf(f, "%llu, ", (unsigned long long) + hist_sum(j, stride, io_u_plat, io_u_plat_before)); + } + fprintf(f, "%llu\n", (unsigned long long) + hist_sum(FIO_IO_U_PLAT_NR - stride, stride, io_u_plat, + io_u_plat_before)); + + flist_del(&entry_before->list); + free(entry_before); + } +} + +void flush_samples(FILE *f, void *samples, uint64_t sample_size) +{ + struct io_sample *s; + int log_offset; + uint64_t i, nr_samples; + + if (!sample_size) + return; + + s = __get_sample(samples, 0, 0); + log_offset = (s->__ddir & LOG_OFFSET_SAMPLE_BIT) != 0; + + nr_samples = sample_size / __log_entry_sz(log_offset); + + for (i = 0; i < nr_samples; i++) { + s = __get_sample(samples, log_offset, i); + + if (!log_offset) { + fprintf(f, "%lu, %" PRId64 ", %u, %llu, %u\n", + (unsigned long) s->time, + s->data.val, + io_sample_ddir(s), (unsigned long long) s->bs, s->priority_bit); + } else { + struct io_sample_offset *so = (void *) s; + + fprintf(f, "%lu, %" PRId64 ", %u, %llu, %llu, %u\n", + (unsigned long) s->time, + s->data.val, + io_sample_ddir(s), (unsigned long long) s->bs, + (unsigned long long) so->offset, s->priority_bit); + } + } +} + +#ifdef CONFIG_ZLIB + +struct iolog_flush_data { + struct workqueue_work work; + struct io_log *log; + void *samples; + uint32_t nr_samples; + bool free; +}; + +#define GZ_CHUNK 131072 + +static struct iolog_compress *get_new_chunk(unsigned int seq) +{ + struct iolog_compress *c; + + c = malloc(sizeof(*c)); + INIT_FLIST_HEAD(&c->list); + c->buf = malloc(GZ_CHUNK); + c->len = 0; + c->seq = seq; + return c; +} + +static void free_chunk(struct iolog_compress *ic) +{ + free(ic->buf); + free(ic); +} + +static int z_stream_init(z_stream *stream, int gz_hdr) +{ + int wbits = 15; + + memset(stream, 0, sizeof(*stream)); + stream->zalloc = Z_NULL; + stream->zfree = Z_NULL; + stream->opaque = Z_NULL; + stream->next_in = Z_NULL; + + /* + * zlib magic - add 32 for auto-detection of gz header or not, + * if we decide to store files in a gzip friendly format. + */ + if (gz_hdr) + wbits += 32; + + if (inflateInit2(stream, wbits) != Z_OK) + return 1; + + return 0; +} + +struct inflate_chunk_iter { + unsigned int seq; + int err; + void *buf; + size_t buf_size; + size_t buf_used; + size_t chunk_sz; +}; + +static void finish_chunk(z_stream *stream, FILE *f, + struct inflate_chunk_iter *iter) +{ + int ret; + + ret = inflateEnd(stream); + if (ret != Z_OK) + log_err("fio: failed to end log inflation seq %d (%d)\n", + iter->seq, ret); + + flush_samples(f, iter->buf, iter->buf_used); + free(iter->buf); + iter->buf = NULL; + iter->buf_size = iter->buf_used = 0; +} + +/* + * Iterative chunk inflation. Handles cases where we cross into a new + * sequence, doing flush finish of previous chunk if needed. 
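+ *
+ * Returns the number of input bytes consumed from ic->buf, so that the
+ * caller can advance across chunk boundaries.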
+ */ +static size_t inflate_chunk(struct iolog_compress *ic, int gz_hdr, FILE *f, + z_stream *stream, struct inflate_chunk_iter *iter) +{ + size_t ret; + + dprint(FD_COMPRESS, "inflate chunk size=%lu, seq=%u\n", + (unsigned long) ic->len, ic->seq); + + if (ic->seq != iter->seq) { + if (iter->seq) + finish_chunk(stream, f, iter); + + z_stream_init(stream, gz_hdr); + iter->seq = ic->seq; + } + + stream->avail_in = ic->len; + stream->next_in = ic->buf; + + if (!iter->buf_size) { + iter->buf_size = iter->chunk_sz; + iter->buf = malloc(iter->buf_size); + } + + while (stream->avail_in) { + size_t this_out = iter->buf_size - iter->buf_used; + int err; + + stream->avail_out = this_out; + stream->next_out = iter->buf + iter->buf_used; + + err = inflate(stream, Z_NO_FLUSH); + if (err < 0) { + log_err("fio: failed inflating log: %d\n", err); + iter->err = err; + break; + } + + iter->buf_used += this_out - stream->avail_out; + + if (!stream->avail_out) { + iter->buf_size += iter->chunk_sz; + iter->buf = realloc(iter->buf, iter->buf_size); + continue; + } + + if (err == Z_STREAM_END) + break; + } + + ret = (void *) stream->next_in - ic->buf; + + dprint(FD_COMPRESS, "inflated to size=%lu\n", (unsigned long) iter->buf_size); + + return ret; +} + +/* + * Inflate stored compressed chunks, or write them directly to the log + * file if so instructed. + */ +static int inflate_gz_chunks(struct io_log *log, FILE *f) +{ + struct inflate_chunk_iter iter = { .chunk_sz = log->log_gz, }; + z_stream stream; + + while (!flist_empty(&log->chunk_list)) { + struct iolog_compress *ic; + + ic = flist_first_entry(&log->chunk_list, struct iolog_compress, list); + flist_del(&ic->list); + + if (log->log_gz_store) { + size_t ret; + + dprint(FD_COMPRESS, "log write chunk size=%lu, " + "seq=%u\n", (unsigned long) ic->len, ic->seq); + + ret = fwrite(ic->buf, ic->len, 1, f); + if (ret != 1 || ferror(f)) { + iter.err = errno; + log_err("fio: error writing compressed log\n"); + } + } else + inflate_chunk(ic, log->log_gz_store, f, &stream, &iter); + + free_chunk(ic); + } + + if (iter.seq) { + finish_chunk(&stream, f, &iter); + free(iter.buf); + } + + return iter.err; +} + +/* + * Open compressed log file and decompress the stored chunks and + * write them to stdout. The chunks are stored sequentially in the + * file, so we iterate over them and do them one-by-one. + */ +int iolog_file_inflate(const char *file) +{ + struct inflate_chunk_iter iter = { .chunk_sz = 64 * 1024 * 1024, }; + struct iolog_compress ic; + z_stream stream; + struct stat sb; + size_t ret; + size_t total; + void *buf; + FILE *f; + + f = fopen(file, "r"); + if (!f) { + perror("fopen"); + return 1; + } + + if (stat(file, &sb) < 0) { + fclose(f); + perror("stat"); + return 1; + } + + ic.buf = buf = malloc(sb.st_size); + ic.len = sb.st_size; + ic.seq = 1; + + ret = fread(ic.buf, ic.len, 1, f); + if (ret == 0 && ferror(f)) { + perror("fread"); + fclose(f); + free(buf); + return 1; + } else if (ferror(f) || (!feof(f) && ret != 1)) { + log_err("fio: short read on reading log\n"); + fclose(f); + free(buf); + return 1; + } + + fclose(f); + + /* + * Each chunk will return Z_STREAM_END. We don't know how many + * chunks are in the file, so we just keep looping and incrementing + * the sequence number until we have consumed the whole compressed + * file. 
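+	 * Each inflate_chunk() call reports how many input bytes it
+	 * consumed; the loop below advances ic.buf by that amount and
+	 * bumps ic.seq so every chunk's gzip stream starts with fresh
+	 * inflate state.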
+ */ + total = ic.len; + do { + size_t iret; + + iret = inflate_chunk(&ic, 1, stdout, &stream, &iter); + total -= iret; + if (!total) + break; + if (iter.err) + break; + + ic.seq++; + ic.len -= iret; + ic.buf += iret; + } while (1); + + if (iter.seq) { + finish_chunk(&stream, stdout, &iter); + free(iter.buf); + } + + free(buf); + return iter.err; +} + +#else + +static int inflate_gz_chunks(struct io_log *log, FILE *f) +{ + return 0; +} + +int iolog_file_inflate(const char *file) +{ + log_err("fio: log inflation not possible without zlib\n"); + return 1; +} + +#endif + +void flush_log(struct io_log *log, bool do_append) +{ + void *buf; + FILE *f; + + if (!do_append) + f = fopen(log->filename, "w"); + else + f = fopen(log->filename, "a"); + if (!f) { + perror("fopen log"); + return; + } + + buf = set_file_buffer(f); + + inflate_gz_chunks(log, f); + + while (!flist_empty(&log->io_logs)) { + struct io_logs *cur_log; + + cur_log = flist_first_entry(&log->io_logs, struct io_logs, list); + flist_del_init(&cur_log->list); + + if (log->td && log == log->td->clat_hist_log) + flush_hist_samples(f, log->hist_coarseness, cur_log->log, + log_sample_sz(log, cur_log)); + else + flush_samples(f, cur_log->log, log_sample_sz(log, cur_log)); + + sfree(cur_log); + } + + fclose(f); + clear_file_buffer(buf); +} + +static int finish_log(struct thread_data *td, struct io_log *log, int trylock) +{ + if (td->flags & TD_F_COMPRESS_LOG) + iolog_flush(log); + + if (trylock) { + if (fio_trylock_file(log->filename)) + return 1; + } else + fio_lock_file(log->filename); + + if (td->client_type == FIO_CLIENT_TYPE_GUI || is_backend) + fio_send_iolog(td, log, log->filename); + else + flush_log(log, !td->o.per_job_logs); + + fio_unlock_file(log->filename); + free_log(log); + return 0; +} + +size_t log_chunk_sizes(struct io_log *log) +{ + struct flist_head *entry; + size_t ret; + + if (flist_empty(&log->chunk_list)) + return 0; + + ret = 0; + pthread_mutex_lock(&log->chunk_lock); + flist_for_each(entry, &log->chunk_list) { + struct iolog_compress *c; + + c = flist_entry(entry, struct iolog_compress, list); + ret += c->len; + } + pthread_mutex_unlock(&log->chunk_lock); + return ret; +} + +#ifdef CONFIG_ZLIB + +static void iolog_put_deferred(struct io_log *log, void *ptr) +{ + if (!ptr) + return; + + pthread_mutex_lock(&log->deferred_free_lock); + if (log->deferred < IOLOG_MAX_DEFER) { + log->deferred_items[log->deferred] = ptr; + log->deferred++; + } else if (!fio_did_warn(FIO_WARN_IOLOG_DROP)) + log_err("fio: had to drop log entry free\n"); + pthread_mutex_unlock(&log->deferred_free_lock); +} + +static void iolog_free_deferred(struct io_log *log) +{ + int i; + + if (!log->deferred) + return; + + pthread_mutex_lock(&log->deferred_free_lock); + + for (i = 0; i < log->deferred; i++) { + free(log->deferred_items[i]); + log->deferred_items[i] = NULL; + } + + log->deferred = 0; + pthread_mutex_unlock(&log->deferred_free_lock); +} + +static int gz_work(struct iolog_flush_data *data) +{ + struct iolog_compress *c = NULL; + struct flist_head list; + unsigned int seq; + z_stream stream; + size_t total = 0; + int ret; + + INIT_FLIST_HEAD(&list); + + memset(&stream, 0, sizeof(stream)); + stream.zalloc = Z_NULL; + stream.zfree = Z_NULL; + stream.opaque = Z_NULL; + + ret = deflateInit(&stream, Z_DEFAULT_COMPRESSION); + if (ret != Z_OK) { + log_err("fio: failed to init gz stream\n"); + goto err; + } + + seq = ++data->log->chunk_seq; + + stream.next_in = (void *) data->samples; + stream.avail_in = data->nr_samples * log_entry_sz(data->log); + 
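+	/*
+	 * Deflate into fixed GZ_CHUNK sized buffers on a local list first;
+	 * the list is only spliced onto the log's chunk_list (under
+	 * chunk_lock) once the whole log has been compressed, so readers
+	 * never observe a half-built sequence.
+	 */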
+	dprint(FD_COMPRESS, "deflate input size=%lu, seq=%u, log=%s\n",
+		(unsigned long) stream.avail_in, seq,
+		data->log->filename);
+	do {
+		if (c)
+			dprint(FD_COMPRESS, "seq=%d, chunk=%lu\n", seq,
+				(unsigned long) c->len);
+		c = get_new_chunk(seq);
+		stream.avail_out = GZ_CHUNK;
+		stream.next_out = c->buf;
+		ret = deflate(&stream, Z_NO_FLUSH);
+		if (ret < 0) {
+			log_err("fio: deflate log (%d)\n", ret);
+			free_chunk(c);
+			goto err;
+		}
+
+		c->len = GZ_CHUNK - stream.avail_out;
+		flist_add_tail(&c->list, &list);
+		total += c->len;
+	} while (stream.avail_in);
+
+	stream.next_out = c->buf + c->len;
+	stream.avail_out = GZ_CHUNK - c->len;
+
+	ret = deflate(&stream, Z_FINISH);
+	if (ret < 0) {
+		/*
+		 * Z_BUF_ERROR is special, it just means we need more
+		 * output space. We'll handle that below. Treat any other
+		 * error as fatal.
+		 */
+		if (ret != Z_BUF_ERROR) {
+			log_err("fio: deflate log (%d)\n", ret);
+			flist_del(&c->list);
+			free_chunk(c);
+			goto err;
+		}
+	}
+
+	total -= c->len;
+	c->len = GZ_CHUNK - stream.avail_out;
+	total += c->len;
+	dprint(FD_COMPRESS, "seq=%d, chunk=%lu\n", seq, (unsigned long) c->len);
+
+	if (ret != Z_STREAM_END) {
+		do {
+			c = get_new_chunk(seq);
+			stream.avail_out = GZ_CHUNK;
+			stream.next_out = c->buf;
+			ret = deflate(&stream, Z_FINISH);
+			c->len = GZ_CHUNK - stream.avail_out;
+			total += c->len;
+			flist_add_tail(&c->list, &list);
+			dprint(FD_COMPRESS, "seq=%d, chunk=%lu\n", seq,
+				(unsigned long) c->len);
+		} while (ret != Z_STREAM_END);
+	}
+
+	dprint(FD_COMPRESS, "deflated to size=%lu\n", (unsigned long) total);
+
+	ret = deflateEnd(&stream);
+	if (ret != Z_OK)
+		log_err("fio: deflateEnd %d\n", ret);
+
+	iolog_put_deferred(data->log, data->samples);
+
+	if (!flist_empty(&list)) {
+		pthread_mutex_lock(&data->log->chunk_lock);
+		flist_splice_tail(&list, &data->log->chunk_list);
+		pthread_mutex_unlock(&data->log->chunk_lock);
+	}
+
+	ret = 0;
+done:
+	if (data->free)
+		sfree(data);
+	return ret;
+err:
+	while (!flist_empty(&list)) {
+		c = flist_first_entry(&list, struct iolog_compress, list);
+		flist_del(&c->list);
+		free_chunk(c);
+	}
+	ret = 1;
+	goto done;
+}
+
+/*
+ * Invoked from our compress helper thread, when logging would have exceeded
+ * the specified memory limitation. Compresses the previously stored
+ * entries.
+ */
+static int gz_work_async(struct submit_worker *sw, struct workqueue_work *work)
+{
+	return gz_work(container_of(work, struct iolog_flush_data, work));
+}
+
+static int gz_init_worker(struct submit_worker *sw)
+{
+	struct thread_data *td = sw->wq->td;
+
+	if (!fio_option_is_set(&td->o, log_gz_cpumask))
+		return 0;
+
+	if (fio_setaffinity(gettid(), td->o.log_gz_cpumask) == -1) {
+		log_err("gz: failed to set CPU affinity\n");
+		return 1;
+	}
+
+	return 0;
+}
+
+static struct workqueue_ops log_compress_wq_ops = {
+	.fn		= gz_work_async,
+	.init_worker_fn	= gz_init_worker,
+	.nice		= 1,
+};
+
+int iolog_compress_init(struct thread_data *td, struct sk_out *sk_out)
+{
+	if (!(td->flags & TD_F_COMPRESS_LOG))
+		return 0;
+
+	workqueue_init(td, &td->log_compress_wq, &log_compress_wq_ops, 1, sk_out);
+	return 0;
+}
+
+void iolog_compress_exit(struct thread_data *td)
+{
+	if (!(td->flags & TD_F_COMPRESS_LOG))
+		return;
+
+	workqueue_exit(&td->log_compress_wq);
+}
+
+/*
+ * Queue work item to compress the existing log entries. We reset the
+ * current log to a small size, and reference the existing log in the
+ * data that we queue for compression. Once compression has been done,
+ * this old log is freed.
If called with finish == true, will not return + * until the log compression has completed, and will flush all previous + * logs too + */ +static int iolog_flush(struct io_log *log) +{ + struct iolog_flush_data *data; + + data = malloc(sizeof(*data)); + if (!data) + return 1; + + data->log = log; + data->free = false; + + while (!flist_empty(&log->io_logs)) { + struct io_logs *cur_log; + + cur_log = flist_first_entry(&log->io_logs, struct io_logs, list); + flist_del_init(&cur_log->list); + + data->samples = cur_log->log; + data->nr_samples = cur_log->nr_samples; + + sfree(cur_log); + + gz_work(data); + } + + free(data); + return 0; +} + +int iolog_cur_flush(struct io_log *log, struct io_logs *cur_log) +{ + struct iolog_flush_data *data; + + data = smalloc(sizeof(*data)); + if (!data) + return 1; + + data->log = log; + + data->samples = cur_log->log; + data->nr_samples = cur_log->nr_samples; + data->free = true; + + cur_log->nr_samples = cur_log->max_samples = 0; + cur_log->log = NULL; + + workqueue_enqueue(&log->td->log_compress_wq, &data->work); + + iolog_free_deferred(log); + + return 0; +} +#else + +static int iolog_flush(struct io_log *log) +{ + return 1; +} + +int iolog_cur_flush(struct io_log *log, struct io_logs *cur_log) +{ + return 1; +} + +int iolog_compress_init(struct thread_data *td, struct sk_out *sk_out) +{ + return 0; +} + +void iolog_compress_exit(struct thread_data *td) +{ +} + +#endif + +struct io_logs *iolog_cur_log(struct io_log *log) +{ + if (flist_empty(&log->io_logs)) + return NULL; + + return flist_last_entry(&log->io_logs, struct io_logs, list); +} + +uint64_t iolog_nr_samples(struct io_log *iolog) +{ + struct flist_head *entry; + uint64_t ret = 0; + + flist_for_each(entry, &iolog->io_logs) { + struct io_logs *cur_log; + + cur_log = flist_entry(entry, struct io_logs, list); + ret += cur_log->nr_samples; + } + + return ret; +} + +static int __write_log(struct thread_data *td, struct io_log *log, int try) +{ + if (log) + return finish_log(td, log, try); + + return 0; +} + +static int write_iops_log(struct thread_data *td, int try, bool unit_log) +{ + int ret; + + if (per_unit_log(td->iops_log) != unit_log) + return 0; + + ret = __write_log(td, td->iops_log, try); + if (!ret) + td->iops_log = NULL; + + return ret; +} + +static int write_slat_log(struct thread_data *td, int try, bool unit_log) +{ + int ret; + + if (!unit_log) + return 0; + + ret = __write_log(td, td->slat_log, try); + if (!ret) + td->slat_log = NULL; + + return ret; +} + +static int write_clat_log(struct thread_data *td, int try, bool unit_log) +{ + int ret; + + if (!unit_log) + return 0; + + ret = __write_log(td, td->clat_log, try); + if (!ret) + td->clat_log = NULL; + + return ret; +} + +static int write_clat_hist_log(struct thread_data *td, int try, bool unit_log) +{ + int ret; + + if (!unit_log) + return 0; + + ret = __write_log(td, td->clat_hist_log, try); + if (!ret) + td->clat_hist_log = NULL; + + return ret; +} + +static int write_lat_log(struct thread_data *td, int try, bool unit_log) +{ + int ret; + + if (!unit_log) + return 0; + + ret = __write_log(td, td->lat_log, try); + if (!ret) + td->lat_log = NULL; + + return ret; +} + +static int write_bandw_log(struct thread_data *td, int try, bool unit_log) +{ + int ret; + + if (per_unit_log(td->bw_log) != unit_log) + return 0; + + ret = __write_log(td, td->bw_log, try); + if (!ret) + td->bw_log = NULL; + + return ret; +} + +enum { + BW_LOG_MASK = 1, + LAT_LOG_MASK = 2, + SLAT_LOG_MASK = 4, + CLAT_LOG_MASK = 8, + IOPS_LOG_MASK = 16, + 
CLAT_HIST_LOG_MASK = 32, + + ALL_LOG_NR = 6, +}; + +struct log_type { + unsigned int mask; + int (*fn)(struct thread_data *, int, bool); +}; + +static struct log_type log_types[] = { + { + .mask = BW_LOG_MASK, + .fn = write_bandw_log, + }, + { + .mask = LAT_LOG_MASK, + .fn = write_lat_log, + }, + { + .mask = SLAT_LOG_MASK, + .fn = write_slat_log, + }, + { + .mask = CLAT_LOG_MASK, + .fn = write_clat_log, + }, + { + .mask = IOPS_LOG_MASK, + .fn = write_iops_log, + }, + { + .mask = CLAT_HIST_LOG_MASK, + .fn = write_clat_hist_log, + } +}; + +void td_writeout_logs(struct thread_data *td, bool unit_logs) +{ + unsigned int log_mask = 0; + unsigned int log_left = ALL_LOG_NR; + int old_state, i; + + old_state = td_bump_runstate(td, TD_FINISHING); + + finalize_logs(td, unit_logs); + + while (log_left) { + int prev_log_left = log_left; + + for (i = 0; i < ALL_LOG_NR && log_left; i++) { + struct log_type *lt = &log_types[i]; + int ret; + + if (!(log_mask & lt->mask)) { + ret = lt->fn(td, log_left != 1, unit_logs); + if (!ret) { + log_left--; + log_mask |= lt->mask; + } + } + } + + if (prev_log_left == log_left) + usleep(5000); + } + + td_restore_runstate(td, old_state); +} + +void fio_writeout_logs(bool unit_logs) +{ + struct thread_data *td; + int i; + + for_each_td(td, i) + td_writeout_logs(td, unit_logs); +} diff --git a/iolog.h b/iolog.h new file mode 100644 index 0000000..981081f --- /dev/null +++ b/iolog.h @@ -0,0 +1,312 @@ +#ifndef FIO_IOLOG_H +#define FIO_IOLOG_H + +#include + +#include "lib/rbtree.h" +#include "lib/ieee754.h" +#include "flist.h" +#include "ioengines.h" + +/* + * Use for maintaining statistics + */ +struct io_stat { + uint64_t max_val; + uint64_t min_val; + uint64_t samples; + + fio_fp64_t mean; + fio_fp64_t S; +}; + +struct io_hist { + uint64_t samples; + unsigned long hist_last; + struct flist_head list; +}; + + +union io_sample_data { + uint64_t val; + struct io_u_plat_entry *plat_entry; +}; + +#define sample_val(value) ((union io_sample_data) { .val = value }) +#define sample_plat(plat) ((union io_sample_data) { .plat_entry = plat }) + +/* + * A single data sample + */ +struct io_sample { + uint64_t time; + union io_sample_data data; + uint32_t __ddir; + uint8_t priority_bit; + uint64_t bs; +}; + +struct io_sample_offset { + struct io_sample s; + uint64_t offset; +}; + +enum { + IO_LOG_TYPE_LAT = 1, + IO_LOG_TYPE_CLAT, + IO_LOG_TYPE_SLAT, + IO_LOG_TYPE_BW, + IO_LOG_TYPE_IOPS, + IO_LOG_TYPE_HIST, +}; + +#define DEF_LOG_ENTRIES 1024 +#define MAX_LOG_ENTRIES (1024 * DEF_LOG_ENTRIES) + +struct io_logs { + struct flist_head list; + uint64_t nr_samples; + uint64_t max_samples; + void *log; +}; + +/* + * Dynamically growing data sample log + */ +struct io_log { + /* + * Entries already logged + */ + struct flist_head io_logs; + uint32_t cur_log_max; + + /* + * When the current log runs out of space, store events here until + * we have a chance to regrow + */ + struct io_logs *pending; + + unsigned int log_ddir_mask; + + char *filename; + + struct thread_data *td; + + unsigned int log_type; + + /* + * If we fail extending the log, stop collecting more entries. + */ + bool disabled; + + /* + * Log offsets + */ + unsigned int log_offset; + + /* + * Max size of log entries before a chunk is compressed + */ + unsigned int log_gz; + + /* + * Don't deflate for storing, just store the compressed bits + */ + unsigned int log_gz_store; + + /* + * Windowed average, for logging single entries average over some + * period of time. 
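+	 * (avg_msec below corresponds to the log_avg_msec job option.)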
+ */ + struct io_stat avg_window[DDIR_RWDIR_CNT]; + unsigned long avg_msec; + unsigned long avg_last[DDIR_RWDIR_CNT]; + + /* + * Windowed latency histograms, for keeping track of when we need to + * save a copy of the histogram every approximately hist_msec + * milliseconds. + */ + struct io_hist hist_window[DDIR_RWDIR_CNT]; + unsigned long hist_msec; + unsigned int hist_coarseness; + + pthread_mutex_t chunk_lock; + unsigned int chunk_seq; + struct flist_head chunk_list; + + pthread_mutex_t deferred_free_lock; +#define IOLOG_MAX_DEFER 8 + void *deferred_items[IOLOG_MAX_DEFER]; + unsigned int deferred; +}; + +/* + * If the upper bit is set, then we have the offset as well + */ +#define LOG_OFFSET_SAMPLE_BIT 0x80000000U +#define io_sample_ddir(io) ((io)->__ddir & ~LOG_OFFSET_SAMPLE_BIT) + +static inline void io_sample_set_ddir(struct io_log *log, + struct io_sample *io, + enum fio_ddir ddir) +{ + io->__ddir = ddir | log->log_ddir_mask; +} + +static inline size_t __log_entry_sz(int log_offset) +{ + if (log_offset) + return sizeof(struct io_sample_offset); + else + return sizeof(struct io_sample); +} + +static inline size_t log_entry_sz(struct io_log *log) +{ + return __log_entry_sz(log->log_offset); +} + +static inline size_t log_sample_sz(struct io_log *log, struct io_logs *cur_log) +{ + return cur_log->nr_samples * log_entry_sz(log); +} + +static inline struct io_sample *__get_sample(void *samples, int log_offset, + uint64_t sample) +{ + uint64_t sample_offset = sample * __log_entry_sz(log_offset); + return (struct io_sample *) ((char *) samples + sample_offset); +} + +struct io_logs *iolog_cur_log(struct io_log *); +uint64_t iolog_nr_samples(struct io_log *); +void regrow_logs(struct thread_data *); + +static inline struct io_sample *get_sample(struct io_log *iolog, + struct io_logs *cur_log, + uint64_t sample) +{ + return __get_sample(cur_log->log, iolog->log_offset, sample); +} + +enum { + IP_F_ONRB = 1, + IP_F_ONLIST = 2, + IP_F_TRIMMED = 4, + IP_F_IN_FLIGHT = 8, +}; + +/* + * When logging io actions, this matches a single sent io_u + */ +struct io_piece { + union { + struct fio_rb_node rb_node; + struct flist_head list; + }; + struct flist_head trim_list; + union { + int fileno; + struct fio_file *file; + }; + unsigned long long offset; + unsigned short numberio; + unsigned long len; + unsigned int flags; + enum fio_ddir ddir; + union { + unsigned long delay; + unsigned int file_action; + }; +}; + +/* + * Log exports + */ +enum file_log_act { + FIO_LOG_ADD_FILE, + FIO_LOG_OPEN_FILE, + FIO_LOG_CLOSE_FILE, + FIO_LOG_UNLINK_FILE, +}; + +struct io_u; +extern int __must_check read_iolog_get(struct thread_data *, struct io_u *); +extern void log_io_u(const struct thread_data *, const struct io_u *); +extern void log_file(struct thread_data *, struct fio_file *, enum file_log_act); +extern bool __must_check init_iolog(struct thread_data *td); +extern void log_io_piece(struct thread_data *, struct io_u *); +extern void unlog_io_piece(struct thread_data *, struct io_u *); +extern void trim_io_piece(const struct io_u *); +extern void queue_io_piece(struct thread_data *, struct io_piece *); +extern void prune_io_piece_log(struct thread_data *); +extern void write_iolog_close(struct thread_data *); +extern int iolog_compress_init(struct thread_data *, struct sk_out *); +extern void iolog_compress_exit(struct thread_data *); +extern size_t log_chunk_sizes(struct io_log *); +extern int init_io_u_buffers(struct thread_data *); + +#ifdef CONFIG_ZLIB +extern int iolog_file_inflate(const char *); 
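+/* when zlib is not available, iolog.c provides a stub that just logs an error and fails */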
+#endif
+
+/*
+ * Logging
+ */
+struct log_params {
+	struct thread_data *td;
+	unsigned long avg_msec;
+	unsigned long hist_msec;
+	int hist_coarseness;
+	int log_type;
+	int log_offset;
+	int log_gz;
+	int log_gz_store;
+	int log_compress;
+};
+
+static inline bool per_unit_log(struct io_log *log)
+{
+	return log && (!log->avg_msec || log->log_gz || log->log_gz_store);
+}
+
+static inline bool inline_log(struct io_log *log)
+{
+	return log->log_type == IO_LOG_TYPE_LAT ||
+		log->log_type == IO_LOG_TYPE_CLAT ||
+		log->log_type == IO_LOG_TYPE_SLAT;
+}
+
+static inline void ipo_bytes_align(unsigned int replay_align, struct io_piece *ipo)
+{
+	if (!replay_align)
+		return;
+
+	ipo->offset &= ~(replay_align - (uint64_t)1);
+}
+
+extern void finalize_logs(struct thread_data *td, bool);
+extern void setup_log(struct io_log **, struct log_params *, const char *);
+extern void flush_log(struct io_log *, bool);
+extern void flush_samples(FILE *, void *, uint64_t);
+extern uint64_t hist_sum(int, int, uint64_t *, uint64_t *);
+extern void free_log(struct io_log *);
+extern void fio_writeout_logs(bool);
+extern void td_writeout_logs(struct thread_data *, bool);
+extern int iolog_cur_flush(struct io_log *, struct io_logs *);
+
+static inline void init_ipo(struct io_piece *ipo)
+{
+	INIT_FLIST_HEAD(&ipo->list);
+	INIT_FLIST_HEAD(&ipo->trim_list);
+}
+
+struct iolog_compress {
+	struct flist_head list;
+	void *buf;
+	size_t len;
+	unsigned int seq;
+};
+
+#endif
diff --git a/json.c b/json.c
new file mode 100644
index 0000000..cd3d5d7
--- /dev/null
+++ b/json.c
@@ -0,0 +1,369 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <errno.h>
+#include "json.h"
+#include "log.h"
+
+struct json_object *json_create_object(void)
+{
+	return calloc(1, sizeof(struct json_object));
+}
+
+struct json_array *json_create_array(void)
+{
+	return calloc(1, sizeof(struct json_array));
+}
+
+static struct json_pair *json_create_pair(const char *name, struct json_value *value)
+{
+	struct json_pair *pair = malloc(sizeof(struct json_pair));
+	if (pair) {
+		pair->name = strdup(name);
+		pair->value = value;
+
+		value->parent_type = JSON_PARENT_TYPE_PAIR;
+		value->parent_pair = pair;
+	}
+	return pair;
+}
+
+static struct json_value *json_create_value_int(long long number)
+{
+	struct json_value *value = malloc(sizeof(struct json_value));
+
+	if (value) {
+		value->type = JSON_TYPE_INTEGER;
+		value->integer_number = number;
+	}
+	return value;
+}
+
+static struct json_value *json_create_value_float(double number)
+{
+	struct json_value *value = malloc(sizeof(struct json_value));
+
+	if (value) {
+		value->type = JSON_TYPE_FLOAT;
+		value->float_number = number;
+	}
+	return value;
+}
+
+static char *strdup_escape(const char *str)
+{
+	const char *input = str;
+	char *p, *ret;
+	int escapes;
+
+	if (!strlen(str))
+		return NULL;
+
+	escapes = 0;
+	while ((input = strpbrk(input, "\\\"")) != NULL) {
+		escapes++;
+		input++;
+	}
+
+	p = ret = malloc(strlen(str) + escapes + 1);
+	while (*str) {
+		if (*str == '\\' || *str == '\"')
+			*p++ = '\\';
+		*p++ = *str++;
+	}
+	*p = '\0';
+
+	return ret;
+}
+
+/*
+ * Valid JSON strings must escape '"' and '\' with a preceding '\'
+ */
+static struct json_value *json_create_value_string(const char *str)
+{
+	struct json_value *value = malloc(sizeof(struct json_value));
+
+	if (value) {
+		value->type = JSON_TYPE_STRING;
+		value->string = strdup_escape(str);
+		if (!value->string) {
+			free(value);
+			value = NULL;
+		}
+	}
+	return value;
+}
+
+static struct json_value *json_create_value_object(struct json_object *obj)
+{
+
struct json_value *value = malloc(sizeof(struct json_value)); + + if (value) { + value->type = JSON_TYPE_OBJECT; + value->object = obj; + obj->parent = value; + } + return value; +} + +static struct json_value *json_create_value_array(struct json_array *array) +{ + struct json_value *value = malloc(sizeof(struct json_value)); + + if (value) { + value->type = JSON_TYPE_ARRAY; + value->array = array; + array->parent = value; + } + return value; +} + +static void json_free_pair(struct json_pair *pair); +static void json_free_value(struct json_value *value); + +void json_free_object(struct json_object *obj) +{ + int i; + + for (i = 0; i < obj->pair_cnt; i++) + json_free_pair(obj->pairs[i]); + free(obj->pairs); + free(obj); +} + +static void json_free_array(struct json_array *array) +{ + int i; + + for (i = 0; i < array->value_cnt; i++) + json_free_value(array->values[i]); + free(array->values); + free(array); +} + +static void json_free_pair(struct json_pair *pair) +{ + json_free_value(pair->value); + free(pair->name); + free(pair); +} + +static void json_free_value(struct json_value *value) +{ + switch (value->type) { + case JSON_TYPE_STRING: + free(value->string); + break; + case JSON_TYPE_OBJECT: + json_free_object(value->object); + break; + case JSON_TYPE_ARRAY: + json_free_array(value->array); + break; + } + free(value); +} + +static int json_array_add_value(struct json_array *array, struct json_value *value) +{ + struct json_value **values = realloc(array->values, + sizeof(struct json_value *) * (array->value_cnt + 1)); + + if (!values) + return ENOMEM; + values[array->value_cnt] = value; + array->value_cnt++; + array->values = values; + + value->parent_type = JSON_PARENT_TYPE_ARRAY; + value->parent_array = array; + return 0; +} + +static int json_object_add_pair(struct json_object *obj, struct json_pair *pair) +{ + struct json_pair **pairs = realloc(obj->pairs, + sizeof(struct json_pair *) * (obj->pair_cnt + 1)); + if (!pairs) + return ENOMEM; + pairs[obj->pair_cnt] = pair; + obj->pair_cnt++; + obj->pairs = pairs; + + pair->parent = obj; + return 0; +} + +int json_object_add_value_type(struct json_object *obj, const char *name, + const struct json_value *arg) +{ + struct json_value *value; + struct json_pair *pair; + int ret; + + switch (arg->type) { + case JSON_TYPE_STRING: + value = json_create_value_string(arg->string); + break; + case JSON_TYPE_INTEGER: + value = json_create_value_int(arg->integer_number); + break; + case JSON_TYPE_FLOAT: + value = json_create_value_float(arg->float_number); + break; + case JSON_TYPE_OBJECT: + value = json_create_value_object(arg->object); + break; + default: + case JSON_TYPE_ARRAY: + value = json_create_value_array(arg->array); + break; + } + + if (!value) + return ENOMEM; + + pair = json_create_pair(name, value); + if (!pair) { + json_free_value(value); + return ENOMEM; + } + ret = json_object_add_pair(obj, pair); + if (ret) { + json_free_pair(pair); + return ENOMEM; + } + return 0; +} + +int json_array_add_value_type(struct json_array *array, + const struct json_value *arg) +{ + struct json_value *value; + int ret; + + switch (arg->type) { + case JSON_TYPE_STRING: + value = json_create_value_string(arg->string); + break; + case JSON_TYPE_INTEGER: + value = json_create_value_int(arg->integer_number); + break; + case JSON_TYPE_FLOAT: + value = json_create_value_float(arg->float_number); + break; + case JSON_TYPE_OBJECT: + value = json_create_value_object(arg->object); + break; + default: + case JSON_TYPE_ARRAY: + value = 
json_create_value_array(arg->array); + break; + } + + if (!value) + return ENOMEM; + + ret = json_array_add_value(array, value); + if (ret) { + json_free_value(value); + return ENOMEM; + } + return 0; +} + +static int json_value_level(struct json_value *value); +static int json_pair_level(struct json_pair *pair); +static int json_array_level(struct json_array *array); +static int json_object_level(struct json_object *object) +{ + if (object->parent == NULL) + return 0; + return json_value_level(object->parent); +} + +static int json_pair_level(struct json_pair *pair) +{ + return json_object_level(pair->parent) + 1; +} + +static int json_array_level(struct json_array *array) +{ + return json_value_level(array->parent); +} + +static int json_value_level(struct json_value *value) +{ + if (value->parent_type == JSON_PARENT_TYPE_PAIR) + return json_pair_level(value->parent_pair); + else + return json_array_level(value->parent_array) + 1; +} + +static void json_print_level(int level, struct buf_output *out) +{ + while (level-- > 0) + log_buf(out, " "); +} + +static void json_print_pair(struct json_pair *pair, struct buf_output *); +static void json_print_value(struct json_value *value, struct buf_output *); + +void json_print_object(struct json_object *obj, struct buf_output *out) +{ + int i; + + log_buf(out, "{\n"); + for (i = 0; i < obj->pair_cnt; i++) { + if (i > 0) + log_buf(out, ",\n"); + json_print_pair(obj->pairs[i], out); + } + log_buf(out, "\n"); + json_print_level(json_object_level(obj), out); + log_buf(out, "}"); +} + +static void json_print_pair(struct json_pair *pair, struct buf_output *out) +{ + json_print_level(json_pair_level(pair), out); + log_buf(out, "\"%s\" : ", pair->name); + json_print_value(pair->value, out); +} + +static void json_print_array(struct json_array *array, struct buf_output *out) +{ + int i; + + log_buf(out, "[\n"); + for (i = 0; i < array->value_cnt; i++) { + if (i > 0) + log_buf(out, ",\n"); + json_print_level(json_value_level(array->values[i]), out); + json_print_value(array->values[i], out); + } + log_buf(out, "\n"); + json_print_level(json_array_level(array), out); + log_buf(out, "]"); +} + +static void json_print_value(struct json_value *value, struct buf_output *out) +{ + switch (value->type) { + case JSON_TYPE_STRING: + log_buf(out, "\"%s\"", value->string); + break; + case JSON_TYPE_INTEGER: + log_buf(out, "%lld", value->integer_number); + break; + case JSON_TYPE_FLOAT: + log_buf(out, "%f", value->float_number); + break; + case JSON_TYPE_OBJECT: + json_print_object(value->object, out); + break; + case JSON_TYPE_ARRAY: + json_print_array(value->array, out); + break; + } +} diff --git a/json.h b/json.h new file mode 100644 index 0000000..09c2f18 --- /dev/null +++ b/json.h @@ -0,0 +1,175 @@ +#ifndef __JSON__H +#define __JSON__H + +#include "lib/output_buffer.h" + +#define JSON_TYPE_STRING 0 +#define JSON_TYPE_INTEGER 1 +#define JSON_TYPE_FLOAT 2 +#define JSON_TYPE_OBJECT 3 +#define JSON_TYPE_ARRAY 4 +#define JSON_PARENT_TYPE_PAIR 0 +#define JSON_PARENT_TYPE_ARRAY 1 +struct json_value { + int type; + union { + long long integer_number; + double float_number; + char *string; + struct json_object *object; + struct json_array *array; + }; + int parent_type; + union { + struct json_pair *parent_pair; + struct json_array *parent_array; + }; +}; + +struct json_array { + struct json_value **values; + int value_cnt; + struct json_value *parent; +}; + +struct json_object { + struct json_pair **pairs; + int pair_cnt; + struct json_value *parent; +}; + +struct 
json_pair { + char *name; + struct json_value *value; + struct json_object *parent; +}; + +struct json_object *json_create_object(void); +struct json_array *json_create_array(void); + +void json_free_object(struct json_object *obj); + +int json_object_add_value_type(struct json_object *obj, const char *name, + const struct json_value *val); + +static inline int json_object_add_value_int(struct json_object *obj, + const char *name, long long val) +{ + struct json_value arg = { + .type = JSON_TYPE_INTEGER, + .integer_number = val, + }; + + return json_object_add_value_type(obj, name, &arg); +} + +static inline int json_object_add_value_float(struct json_object *obj, + const char *name, double val) +{ + struct json_value arg = { + .type = JSON_TYPE_FLOAT, + .float_number = val, + }; + + return json_object_add_value_type(obj, name, &arg); +} + +static inline int json_object_add_value_string(struct json_object *obj, + const char *name, + const char *val) +{ + struct json_value arg = { + .type = JSON_TYPE_STRING, + .string = (char *)val, + }; + + return json_object_add_value_type(obj, name, &arg); +} + +static inline int json_object_add_value_object(struct json_object *obj, + const char *name, + struct json_object *val) +{ + struct json_value arg = { + .type = JSON_TYPE_OBJECT, + .object = val, + }; + + return json_object_add_value_type(obj, name, &arg); +} + +static inline int json_object_add_value_array(struct json_object *obj, + const char *name, + struct json_array *val) +{ + struct json_value arg = { + .type = JSON_TYPE_ARRAY, + .array = val, + }; + + return json_object_add_value_type(obj, name, &arg); +} + +int json_array_add_value_type(struct json_array *array, + const struct json_value *val); + +static inline int json_array_add_value_int(struct json_array *obj, + long long val) +{ + struct json_value arg = { + .type = JSON_TYPE_INTEGER, + .integer_number = val, + }; + + return json_array_add_value_type(obj, &arg); +} + +static inline int json_array_add_value_float(struct json_array *obj, + double val) +{ + struct json_value arg = { + .type = JSON_TYPE_FLOAT, + .float_number = val, + }; + + return json_array_add_value_type(obj, &arg); +} + +static inline int json_array_add_value_string(struct json_array *obj, + const char *val) +{ + struct json_value arg = { + .type = JSON_TYPE_STRING, + .string = (char *)val, + }; + + return json_array_add_value_type(obj, &arg); +} + +static inline int json_array_add_value_object(struct json_array *obj, + struct json_object *val) +{ + struct json_value arg = { + .type = JSON_TYPE_OBJECT, + .object = val, + }; + + return json_array_add_value_type(obj, &arg); +} + +static inline int json_array_add_value_array(struct json_array *obj, + struct json_array *val) +{ + struct json_value arg = { + .type = JSON_TYPE_ARRAY, + .array = val, + }; + + return json_array_add_value_type(obj, &arg); +} + +#define json_array_last_value_object(obj) \ + (obj->values[obj->value_cnt - 1]->object) + +void json_print_object(struct json_object *obj, struct buf_output *out); +#endif diff --git a/lib/axmap.c b/lib/axmap.c new file mode 100644 index 0000000..27301bd --- /dev/null +++ b/lib/axmap.c @@ -0,0 +1,454 @@ +/* + * Bitmap of bitmaps, where each layer is number-of-bits-per-word smaller than + * the previous. Hence an 'axmap', since we axe each previous layer into a + * much smaller piece. I swear, that is why it's named like that. It has + * nothing to do with anything remotely narcissistic. + * + * A set bit at layer N indicates a full word at layer N-1, and so forth. 
As + * the bitmap becomes progressively more full, checking for existence + * becomes cheaper (since fewer layers are walked, making it a lot more + * cache friendly) and locating the next free space likewise. + * + * Axmaps get pretty close to optimal (1 bit per block) space usage, since + * layers quickly diminish in size. Doing the size math is straight forward, + * since we have log64(blocks) layers of maps. For 20000 blocks, overhead + * is roughly 1.9%, or 1.019 bits per block. The number quickly converges + * towards 1.0158, or 1.58% of overhead. + */ +#include +#include +#include +#include + +#include "../arch/arch.h" +#include "axmap.h" +#include "../minmax.h" + +#if BITS_PER_LONG == 64 +#define UNIT_SHIFT 6 +#elif BITS_PER_LONG == 32 +#define UNIT_SHIFT 5 +#else +#error "Number of arch bits unknown" +#endif + +#define BLOCKS_PER_UNIT (1U << UNIT_SHIFT) +#define BLOCKS_PER_UNIT_MASK (BLOCKS_PER_UNIT - 1) + +static const unsigned long bit_masks[] = { + 0x0000000000000000, 0x0000000000000001, 0x0000000000000003, 0x0000000000000007, + 0x000000000000000f, 0x000000000000001f, 0x000000000000003f, 0x000000000000007f, + 0x00000000000000ff, 0x00000000000001ff, 0x00000000000003ff, 0x00000000000007ff, + 0x0000000000000fff, 0x0000000000001fff, 0x0000000000003fff, 0x0000000000007fff, + 0x000000000000ffff, 0x000000000001ffff, 0x000000000003ffff, 0x000000000007ffff, + 0x00000000000fffff, 0x00000000001fffff, 0x00000000003fffff, 0x00000000007fffff, + 0x0000000000ffffff, 0x0000000001ffffff, 0x0000000003ffffff, 0x0000000007ffffff, + 0x000000000fffffff, 0x000000001fffffff, 0x000000003fffffff, 0x000000007fffffff, + 0x00000000ffffffff, +#if BITS_PER_LONG == 64 + 0x00000001ffffffff, 0x00000003ffffffff, 0x00000007ffffffff, 0x0000000fffffffff, + 0x0000001fffffffff, 0x0000003fffffffff, 0x0000007fffffffff, 0x000000ffffffffff, + 0x000001ffffffffff, 0x000003ffffffffff, 0x000007ffffffffff, 0x00000fffffffffff, + 0x00001fffffffffff, 0x00003fffffffffff, 0x00007fffffffffff, 0x0000ffffffffffff, + 0x0001ffffffffffff, 0x0003ffffffffffff, 0x0007ffffffffffff, 0x000fffffffffffff, + 0x001fffffffffffff, 0x003fffffffffffff, 0x007fffffffffffff, 0x00ffffffffffffff, + 0x01ffffffffffffff, 0x03ffffffffffffff, 0x07ffffffffffffff, 0x0fffffffffffffff, + 0x1fffffffffffffff, 0x3fffffffffffffff, 0x7fffffffffffffff, 0xffffffffffffffff +#endif +}; + +/** + * struct axmap_level - a bitmap used to implement struct axmap + * @level: Level index. Each map has at least one level with index zero. The + * higher the level index, the fewer bits a struct axmap_level contains. + * @map_size: Number of elements of the @map array. + * @map: A bitmap with @map_size elements. + */ +struct axmap_level { + int level; + unsigned long map_size; + unsigned long *map; +}; + +/** + * struct axmap - a set that can store numbers 0 .. @nr_bits - 1 + * @nr_level: Number of elements of the @levels array. + * @levels: struct axmap_level array in which lower levels contain more bits + * than higher levels. + * @nr_bits: One more than the highest value stored in the set. 
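+ *
+ * Sizing example (assuming BITS_PER_LONG == 64): axmap_new(100000)
+ * allocates three levels of 1563, 25 and 1 words, each level 64 times
+ * smaller than the one below it, rounded up.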
+ */ +struct axmap { + unsigned int nr_levels; + struct axmap_level *levels; + uint64_t nr_bits; +}; + +/* Remove all elements from the @axmap set */ +void axmap_reset(struct axmap *axmap) +{ + int i; + + for (i = 0; i < axmap->nr_levels; i++) { + struct axmap_level *al = &axmap->levels[i]; + + memset(al->map, 0, al->map_size * sizeof(unsigned long)); + } +} + +void axmap_free(struct axmap *axmap) +{ + unsigned int i; + + if (!axmap) + return; + + for (i = 0; i < axmap->nr_levels; i++) + free(axmap->levels[i].map); + + free(axmap->levels); + free(axmap); +} + +/* Allocate memory for a set that can store the numbers 0 .. @nr_bits - 1. */ +struct axmap *axmap_new(uint64_t nr_bits) +{ + struct axmap *axmap; + unsigned int i, levels; + + axmap = malloc(sizeof(*axmap)); + if (!axmap) + return NULL; + + levels = 1; + i = (nr_bits + BLOCKS_PER_UNIT - 1) >> UNIT_SHIFT; + while (i > 1) { + i = (i + BLOCKS_PER_UNIT - 1) >> UNIT_SHIFT; + levels++; + } + + axmap->nr_levels = levels; + axmap->levels = calloc(axmap->nr_levels, sizeof(struct axmap_level)); + if (!axmap->levels) + goto free_axmap; + axmap->nr_bits = nr_bits; + + for (i = 0; i < axmap->nr_levels; i++) { + struct axmap_level *al = &axmap->levels[i]; + + nr_bits = (nr_bits + BLOCKS_PER_UNIT - 1) >> UNIT_SHIFT; + + al->level = i; + al->map_size = nr_bits; + al->map = malloc(al->map_size * sizeof(unsigned long)); + if (!al->map) + goto free_levels; + + } + + axmap_reset(axmap); + return axmap; + +free_levels: + for (i = 0; i < axmap->nr_levels; i++) + free(axmap->levels[i].map); + + free(axmap->levels); + +free_axmap: + free(axmap); + return NULL; +} + +/* + * Call @func for each level, starting at level zero, until a level is found + * for which @func returns true. Return false if none of the @func calls + * returns true. + */ +static bool axmap_handler(struct axmap *axmap, uint64_t bit_nr, + bool (*func)(struct axmap_level *, uint64_t, unsigned int, + void *), void *data) +{ + struct axmap_level *al; + uint64_t index = bit_nr; + int i; + + for (i = 0; i < axmap->nr_levels; i++) { + unsigned long offset = index >> UNIT_SHIFT; + unsigned int bit = index & BLOCKS_PER_UNIT_MASK; + + al = &axmap->levels[i]; + + if (func(al, offset, bit, data)) + return true; + + if (index) + index >>= UNIT_SHIFT; + } + + return false; +} + +/* + * Call @func for each level, starting at the highest level, until a level is + * found for which @func returns true. Return false if none of the @func calls + * returns true. + */ +static bool axmap_handler_topdown(struct axmap *axmap, uint64_t bit_nr, + bool (*func)(struct axmap_level *, uint64_t, unsigned int, void *)) +{ + int i; + + for (i = axmap->nr_levels - 1; i >= 0; i--) { + uint64_t index = bit_nr >> (UNIT_SHIFT * i); + unsigned long offset = index >> UNIT_SHIFT; + unsigned int bit = index & BLOCKS_PER_UNIT_MASK; + + if (func(&axmap->levels[i], offset, bit, NULL)) + return true; + } + + return false; +} + +struct axmap_set_data { + unsigned int nr_bits; + unsigned int set_bits; +}; + +/* + * Set at most @__data->nr_bits bits in @al at offset @offset. Do not exceed + * the boundary of the element at offset @offset. Return the number of bits + * that have been set in @__data->set_bits if @al->level == 0. 
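+ *
+ * Example: a request for 8 bits starting at bit 4 in a word where bit 9
+ * is already set only sets bits 4..8; ffz(~overlap) locates the first
+ * conflicting bit and nr_bits is trimmed to the leading free run.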
+ */ +static bool axmap_set_fn(struct axmap_level *al, uint64_t offset, + unsigned int bit, void *__data) +{ + struct axmap_set_data *data = __data; + unsigned long mask, overlap; + unsigned int nr_bits; + + nr_bits = min(data->nr_bits, BLOCKS_PER_UNIT - bit); + + mask = bit_masks[nr_bits] << bit; + + /* + * Mask off any potential overlap, only sets contig regions + */ + overlap = al->map[offset] & mask; + if (overlap == mask) { + data->set_bits = 0; + return true; + } + + if (overlap) { + nr_bits = ffz(~overlap) - bit; + if (!nr_bits) + return true; + mask = bit_masks[nr_bits] << bit; + } + + assert(mask); + assert(!(al->map[offset] & mask)); + al->map[offset] |= mask; + + if (!al->level) + data->set_bits = nr_bits; + + /* For the next level */ + data->nr_bits = 1; + + return al->map[offset] != -1UL; +} + +/* + * Set up to @data->nr_bits starting from @bit_nr in @axmap. Start at + * @bit_nr. If that bit has not yet been set then set it and continue until + * either @data->nr_bits have been set or a 1 bit is found. Store the number + * of bits that have been set in @data->set_bits. It is guaranteed that all + * bits that have been requested to set fit in the same unsigned long word of + * level 0 of @axmap. + */ +static void __axmap_set(struct axmap *axmap, uint64_t bit_nr, + struct axmap_set_data *data) +{ + unsigned int nr_bits = data->nr_bits; + + if (bit_nr > axmap->nr_bits) + return; + else if (bit_nr + nr_bits > axmap->nr_bits) + nr_bits = axmap->nr_bits - bit_nr; + + assert(nr_bits <= BLOCKS_PER_UNIT); + + axmap_handler(axmap, bit_nr, axmap_set_fn, data); +} + +void axmap_set(struct axmap *axmap, uint64_t bit_nr) +{ + struct axmap_set_data data = { .nr_bits = 1, }; + + __axmap_set(axmap, bit_nr, &data); +} + +/* + * Set up to @nr_bits starting from @bit in @axmap. Start at @bit. If that + * bit has not yet been set then set it and continue until either @nr_bits + * have been set or a 1 bit is found. Return the number of bits that have been + * set. + */ +unsigned int axmap_set_nr(struct axmap *axmap, uint64_t bit_nr, + unsigned int nr_bits) +{ + unsigned int set_bits = 0; + + do { + struct axmap_set_data data = { .nr_bits = nr_bits, }; + unsigned int max_bits, this_set; + + max_bits = BLOCKS_PER_UNIT - (bit_nr & BLOCKS_PER_UNIT_MASK); + if (nr_bits > max_bits) + data.nr_bits = max_bits; + + this_set = data.nr_bits; + __axmap_set(axmap, bit_nr, &data); + set_bits += data.set_bits; + if (data.set_bits != this_set) + break; + + nr_bits -= data.set_bits; + bit_nr += data.set_bits; + } while (nr_bits); + + return set_bits; +} + +static bool axmap_isset_fn(struct axmap_level *al, uint64_t offset, + unsigned int bit, void *unused) +{ + return (al->map[offset] & (1ULL << bit)) != 0; +} + +bool axmap_isset(struct axmap *axmap, uint64_t bit_nr) +{ + if (bit_nr <= axmap->nr_bits) + return axmap_handler_topdown(axmap, bit_nr, axmap_isset_fn); + + return false; +} + +/* + * Find the first free bit that is at least as large as bit_nr. Return + * -1 if no free bit is found before the end of the map. + */ +static uint64_t axmap_find_first_free(struct axmap *axmap, uint64_t bit_nr) +{ + int i; + unsigned long temp; + unsigned int bit; + uint64_t offset, base_index, index; + struct axmap_level *al; + + index = 0; + for (i = axmap->nr_levels - 1; i >= 0; i--) { + al = &axmap->levels[i]; + + /* Shift previously calculated index for next level */ + index <<= UNIT_SHIFT; + + /* + * Start from an index that's at least as large as the + * originally passed in bit number. 
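+		 * E.g. for bit_nr = 200 with 64-bit words, level 1 starts
+		 * scanning at index 200 >> 6 = 3; whatever index it finds
+		 * is shifted left by UNIT_SHIFT again and raised to at
+		 * least 200 before level 0 is scanned.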
+ */ + base_index = bit_nr >> (UNIT_SHIFT * i); + if (index < base_index) + index = base_index; + + /* Get the offset and bit for this level */ + offset = index >> UNIT_SHIFT; + bit = index & BLOCKS_PER_UNIT_MASK; + + /* + * If the previous level had unused bits in its last + * word, the offset could be bigger than the map at + * this level. That means no free bits exist before the + * end of the map, so return -1. + */ + if (offset >= al->map_size) + return -1ULL; + + /* Check the first word starting with the specific bit */ + temp = ~bit_masks[bit] & ~al->map[offset]; + if (temp) + goto found; + + /* + * No free bit in the first word, so iterate + * looking for a word with one or more free bits. + */ + for (offset++; offset < al->map_size; offset++) { + temp = ~al->map[offset]; + if (temp) + goto found; + } + + /* Did not find a free bit */ + return -1ULL; + +found: + /* Compute the index of the free bit just found */ + index = (offset << UNIT_SHIFT) + ffz(~temp); + } + + /* If found an unused bit in the last word of level 0, return -1 */ + if (index >= axmap->nr_bits) + return -1ULL; + + return index; +} + +/* + * 'bit_nr' is already set. Find the next free bit after this one. + * Return -1 if no free bits found. + */ +uint64_t axmap_next_free(struct axmap *axmap, uint64_t bit_nr) +{ + uint64_t ret; + uint64_t next_bit = bit_nr + 1; + unsigned long temp; + uint64_t offset; + unsigned int bit; + + if (bit_nr >= axmap->nr_bits) + return -1ULL; + + /* If at the end of the map, wrap-around */ + if (next_bit == axmap->nr_bits) + next_bit = 0; + + offset = next_bit >> UNIT_SHIFT; + bit = next_bit & BLOCKS_PER_UNIT_MASK; + + /* + * As an optimization, do a quick check for a free bit + * in the current word at level 0. If not found, do + * a topdown search. + */ + temp = ~bit_masks[bit] & ~axmap->levels[0].map[offset]; + if (temp) { + ret = (offset << UNIT_SHIFT) + ffz(~temp); + + /* Might have found an unused bit at level 0 */ + if (ret >= axmap->nr_bits) + ret = -1ULL; + } else + ret = axmap_find_first_free(axmap, next_bit); + + /* + * If there are no free bits starting at next_bit and going + * to the end of the map, wrap around by searching again + * starting at bit 0. 
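+	 * Because of the wrap, the bit returned can be numerically
+	 * smaller than bit_nr.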
+ */ + if (ret == -1ULL && next_bit != 0) + ret = axmap_find_first_free(axmap, 0); + return ret; +} diff --git a/lib/axmap.h b/lib/axmap.h new file mode 100644 index 0000000..aa59768 --- /dev/null +++ b/lib/axmap.h @@ -0,0 +1,17 @@ +#ifndef FIO_BITMAP_H +#define FIO_BITMAP_H + +#include +#include "types.h" + +struct axmap; +struct axmap *axmap_new(uint64_t nr_bits); +void axmap_free(struct axmap *bm); + +void axmap_set(struct axmap *axmap, uint64_t bit_nr); +unsigned int axmap_set_nr(struct axmap *axmap, uint64_t bit_nr, unsigned int nr_bits); +bool axmap_isset(struct axmap *axmap, uint64_t bit_nr); +uint64_t axmap_next_free(struct axmap *axmap, uint64_t bit_nr); +void axmap_reset(struct axmap *axmap); + +#endif diff --git a/lib/bloom.c b/lib/bloom.c new file mode 100644 index 0000000..f4f9b6b --- /dev/null +++ b/lib/bloom.c @@ -0,0 +1,123 @@ +#include + +#include "bloom.h" +#include "../hash.h" +#include "../crc/xxhash.h" +#include "../crc/murmur3.h" +#include "../crc/crc32c.h" +#include "../crc/fnv.h" + +struct bloom { + uint64_t nentries; + + uint32_t *map; +}; + +#define BITS_PER_INDEX (sizeof(uint32_t) * 8) +#define BITS_INDEX_MASK (BITS_PER_INDEX - 1) + +struct bloom_hash { + unsigned int seed; + uint32_t (*fn)(const void *, uint32_t, uint32_t); +}; + +static uint32_t bloom_crc32c(const void *buf, uint32_t len, uint32_t seed) +{ + return fio_crc32c(buf, len); +} + +static uint32_t bloom_fnv(const void *buf, uint32_t len, uint32_t seed) +{ + return fnv(buf, len, seed); +} + +#define BLOOM_SEED 0x8989 + +static struct bloom_hash hashes[] = { + { + .seed = BLOOM_SEED, + .fn = jhash, + }, + { + .seed = BLOOM_SEED, + .fn = XXH32, + }, + { + .seed = BLOOM_SEED, + .fn = murmurhash3, + }, + { + .seed = BLOOM_SEED, + .fn = bloom_crc32c, + }, + { + .seed = BLOOM_SEED, + .fn = bloom_fnv, + }, +}; + +#define N_HASHES 5 + +struct bloom *bloom_new(uint64_t entries) +{ + struct bloom *b; + size_t no_uints; + + crc32c_arm64_probe(); + crc32c_intel_probe(); + + b = malloc(sizeof(*b)); + b->nentries = entries; + no_uints = (entries + BITS_PER_INDEX - 1) / BITS_PER_INDEX; + b->map = calloc(no_uints, sizeof(uint32_t)); + if (!b->map) { + free(b); + return NULL; + } + + return b; +} + +void bloom_free(struct bloom *b) +{ + free(b->map); + free(b); +} + +static bool __bloom_check(struct bloom *b, const void *data, unsigned int len, + bool set) +{ + uint32_t hash[N_HASHES]; + int i, was_set; + + for (i = 0; i < N_HASHES; i++) { + hash[i] = hashes[i].fn(data, len, hashes[i].seed); + hash[i] = hash[i] % b->nentries; + } + + was_set = 0; + for (i = 0; i < N_HASHES; i++) { + const unsigned int index = hash[i] / BITS_PER_INDEX; + const unsigned int bit = hash[i] & BITS_INDEX_MASK; + + if (b->map[index] & (1U << bit)) + was_set++; + else if (set) + b->map[index] |= 1U << bit; + else + break; + } + + return was_set == N_HASHES; +} + +bool bloom_set(struct bloom *b, uint32_t *data, unsigned int nwords) +{ + return __bloom_check(b, data, nwords * sizeof(uint32_t), true); +} + +bool bloom_string(struct bloom *b, const char *data, unsigned int len, + bool set) +{ + return __bloom_check(b, data, len, set); +} diff --git a/lib/bloom.h b/lib/bloom.h new file mode 100644 index 0000000..141ead9 --- /dev/null +++ b/lib/bloom.h @@ -0,0 +1,14 @@ +#ifndef FIO_BLOOM_H +#define FIO_BLOOM_H + +#include +#include "../lib/types.h" + +struct bloom; + +struct bloom *bloom_new(uint64_t entries); +void bloom_free(struct bloom *b); +bool bloom_set(struct bloom *b, uint32_t *data, unsigned int nwords); +bool bloom_string(struct bloom 
*b, const char *data, unsigned int len, bool); + +#endif diff --git a/lib/bswap.h b/lib/bswap.h new file mode 100644 index 0000000..1fe5194 --- /dev/null +++ b/lib/bswap.h @@ -0,0 +1,46 @@ +#ifndef FIO_BSWAP_H +#define FIO_BSWAP_H + +#include + +#ifdef CONFIG_LITTLE_ENDIAN +static inline uint32_t __be32_to_cpu(uint32_t val) +{ + uint32_t c1, c2, c3, c4; + + c1 = (val >> 24) & 0xff; + c2 = (val >> 16) & 0xff; + c3 = (val >> 8) & 0xff; + c4 = val & 0xff; + + return c1 | c2 << 8 | c3 << 16 | c4 << 24; +} + +static inline uint64_t __be64_to_cpu(uint64_t val) +{ + uint64_t c1, c2, c3, c4, c5, c6, c7, c8; + + c1 = (val >> 56) & 0xff; + c2 = (val >> 48) & 0xff; + c3 = (val >> 40) & 0xff; + c4 = (val >> 32) & 0xff; + c5 = (val >> 24) & 0xff; + c6 = (val >> 16) & 0xff; + c7 = (val >> 8) & 0xff; + c8 = val & 0xff; + + return c1 | c2 << 8 | c3 << 16 | c4 << 24 | c5 << 32 | c6 << 40 | c7 << 48 | c8 << 56; +} +#else +static inline uint64_t __be64_to_cpu(uint64_t val) +{ + return val; +} + +static inline uint32_t __be32_to_cpu(uint32_t val) +{ + return val; +} +#endif + +#endif diff --git a/lib/ffz.h b/lib/ffz.h new file mode 100644 index 0000000..16c9ae9 --- /dev/null +++ b/lib/ffz.h @@ -0,0 +1,52 @@ +#ifndef FIO_FFZ_H +#define FIO_FFZ_H + +#include + +static inline int ffs64(uint64_t word) +{ + int r = 0; + + if ((word & 0xffffffff) == 0) { + r += 32; + word >>= 32; + } + if (!(word & 0xffff)) { + word >>= 16; + r += 16; + } + if (!(word & 0xff)) { + word >>= 8; + r += 8; + } + if (!(word & 0xf)) { + word >>= 4; + r += 4; + } + if (!(word & 3)) { + word >>= 2; + r += 2; + } + if (!(word & 1)) + r += 1; + + return r; +} + +#ifndef ARCH_HAVE_FFZ + +static inline int ffz(unsigned long bitmask) +{ + return ffs64(~bitmask); +} + +#else +#define ffz(bitmask) arch_ffz(bitmask) +#endif + +static inline int ffz64(uint64_t bitmask) +{ + return ffs64(~bitmask); +} + +#endif diff --git a/lib/flist_sort.c b/lib/flist_sort.c new file mode 100644 index 0000000..1c91cc4 --- /dev/null +++ b/lib/flist_sort.c @@ -0,0 +1,140 @@ +#include +#include +#include "../flist.h" +#include "../log.h" + +#define MAX_LIST_LENGTH_BITS 20 + +/* + * Returns a list organized in an intermediate format suited + * to chaining of merge() calls: null-terminated, no reserved or + * sentinel head node, "prev" links not maintained. + */ +static struct flist_head *merge(void *priv, + int (*cmp)(void *priv, struct flist_head *a, + struct flist_head *b), + struct flist_head *a, struct flist_head *b) +{ + struct flist_head head, *tail = &head; + + while (a && b) { + /* if equal, take 'a' -- important for sort stability */ + if ((*cmp)(priv, a, b) <= 0) { + tail->next = a; + a = a->next; + } else { + tail->next = b; + b = b->next; + } + tail = tail->next; + } + tail->next = a?:b; + return head.next; +} + +/* + * Combine final list merge with restoration of standard doubly-linked + * list structure. This approach duplicates code from merge(), but + * runs faster than the tidier alternatives of either a separate final + * prev-link restoration pass, or maintaining the prev links + * throughout. 
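+ *
+ * As a sketch: merge() above hands back chains like 1->2->3->4->NULL
+ * whose ->prev pointers are stale; this pass walks the merged chain,
+ * repairs each ->prev, and closes the result back into the circular
+ * doubly-linked flist rooted at @head.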
+ */ +static void merge_and_restore_back_links(void *priv, + int (*cmp)(void *priv, struct flist_head *a, + struct flist_head *b), + struct flist_head *head, + struct flist_head *a, struct flist_head *b) +{ + struct flist_head *tail = head; + + while (a && b) { + /* if equal, take 'a' -- important for sort stability */ + if ((*cmp)(priv, a, b) <= 0) { + tail->next = a; + a->prev = tail; + a = a->next; + } else { + tail->next = b; + b->prev = tail; + b = b->next; + } + tail = tail->next; + } + tail->next = a ? : b; + + do { + /* + * In worst cases this loop may run many iterations. + * Continue callbacks to the client even though no + * element comparison is needed, so the client's cmp() + * routine can invoke cond_resched() periodically. + */ + (*cmp)(priv, tail->next, tail->next); + + tail->next->prev = tail; + tail = tail->next; + } while (tail->next); + + tail->next = head; + head->prev = tail; +} + +/** + * list_sort - sort a list + * @priv: private data, opaque to list_sort(), passed to @cmp + * @head: the list to sort + * @cmp: the elements comparison function + * + * This function implements "merge sort", which has O(nlog(n)) + * complexity. + * + * The comparison function @cmp must return a negative value if @a + * should sort before @b, and a positive value if @a should sort after + * @b. If @a and @b are equivalent, and their original relative + * ordering is to be preserved, @cmp must return 0. + */ +void flist_sort(void *priv, struct flist_head *head, + int (*cmp)(void *priv, struct flist_head *a, + struct flist_head *b)) +{ + struct flist_head *part[MAX_LIST_LENGTH_BITS+1]; /* sorted partial lists + -- last slot is a sentinel */ + int lev; /* index into part[] */ + int max_lev = 0; + struct flist_head *list; + + if (flist_empty(head)) + return; + + memset(part, 0, sizeof(part)); + + head->prev->next = NULL; + list = head->next; + + while (list) { + struct flist_head *cur = list; + list = list->next; + cur->next = NULL; + + for (lev = 0; part[lev]; lev++) { + cur = merge(priv, cmp, part[lev], cur); + part[lev] = NULL; + } + if (lev > max_lev) { + if (lev >= MAX_LIST_LENGTH_BITS) { + log_err("fio: list passed to" + " list_sort() too long for" + " efficiency\n"); + lev--; + } + max_lev = lev; + } + part[lev] = cur; + } + + for (lev = 0; lev < max_lev; lev++) + if (part[lev]) + list = merge(priv, cmp, part[lev], list); + + merge_and_restore_back_links(priv, cmp, head, part[max_lev], list); +} diff --git a/lib/fls.h b/lib/fls.h new file mode 100644 index 0000000..dc7ecd0 --- /dev/null +++ b/lib/fls.h @@ -0,0 +1,41 @@ +#ifndef _ASM_GENERIC_BITOPS_FLS_H_ +#define _ASM_GENERIC_BITOPS_FLS_H_ + +/** + * fls - find last (most-significant) bit set + * @x: the word to search + * + * This is defined the same way as ffs. + * Note fls(0) = 0, fls(1) = 1, fls(0x80000000) = 32. 
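+ *
+ * For example, __fls(0x10) returns 5: bit 4 is the highest set bit,
+ * and the result is 1-based.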
+ */ + +static inline int __fls(int x) +{ + int r = 32; + + if (!x) + return 0; + if (!(x & 0xffff0000u)) { + x <<= 16; + r -= 16; + } + if (!(x & 0xff000000u)) { + x <<= 8; + r -= 8; + } + if (!(x & 0xf0000000u)) { + x <<= 4; + r -= 4; + } + if (!(x & 0xc0000000u)) { + x <<= 2; + r -= 2; + } + if (!(x & 0x80000000u)) { + x <<= 1; + r -= 1; + } + return r; +} + +#endif /* _ASM_GENERIC_BITOPS_FLS_H_ */ diff --git a/lib/gauss.c b/lib/gauss.c new file mode 100644 index 0000000..1d24e18 --- /dev/null +++ b/lib/gauss.c @@ -0,0 +1,63 @@ +#include +#include +#include "../hash.h" +#include "gauss.h" + +#define GAUSS_ITERS 12 + +static int gauss_dev(struct gauss_state *gs) +{ + unsigned int r; + int vr; + + if (!gs->stddev) + return 0; + + r = __rand(&gs->r); + vr = gs->stddev * (r / (FRAND32_MAX + 1.0)); + + return vr - gs->stddev / 2; +} + +unsigned long long gauss_next(struct gauss_state *gs) +{ + unsigned long long sum = 0; + int i; + + for (i = 0; i < GAUSS_ITERS; i++) + sum += __rand(&gs->r) % (gs->nranges + 1); + + sum = (sum + GAUSS_ITERS - 1) / GAUSS_ITERS; + + if (gs->stddev) { + int dev = gauss_dev(gs); + + while (dev + sum >= gs->nranges) + dev /= 2; + sum += dev; + } + + if (!gs->disable_hash) + sum = __hash_u64(sum); + + return sum % gs->nranges; +} + +void gauss_init(struct gauss_state *gs, unsigned long nranges, double dev, + unsigned int seed) +{ + memset(gs, 0, sizeof(*gs)); + init_rand_seed(&gs->r, seed, 0); + gs->nranges = nranges; + + if (dev != 0.0) { + gs->stddev = ceil((double) (nranges * 100.0) / dev); + if (gs->stddev > nranges / 2) + gs->stddev = nranges / 2; + } +} + +void gauss_disable_hash(struct gauss_state *gs) +{ + gs->disable_hash = true; +} diff --git a/lib/gauss.h b/lib/gauss.h new file mode 100644 index 0000000..478aa14 --- /dev/null +++ b/lib/gauss.h @@ -0,0 +1,19 @@ +#ifndef FIO_GAUSS_H +#define FIO_GAUSS_H + +#include +#include "rand.h" + +struct gauss_state { + struct frand_state r; + uint64_t nranges; + unsigned int stddev; + bool disable_hash; +}; + +void gauss_init(struct gauss_state *gs, unsigned long nranges, double dev, + unsigned int seed); +unsigned long long gauss_next(struct gauss_state *gs); +void gauss_disable_hash(struct gauss_state *gs); + +#endif diff --git a/lib/getrusage.c b/lib/getrusage.c new file mode 100644 index 0000000..96dcf6d --- /dev/null +++ b/lib/getrusage.c @@ -0,0 +1,14 @@ +#include +#include "getrusage.h" + +int fio_getrusage(struct rusage *ru) +{ +#ifdef CONFIG_RUSAGE_THREAD + if (!getrusage(RUSAGE_THREAD, ru)) + return 0; + if (errno != EINVAL) + return -1; + /* Fall through to RUSAGE_SELF */ +#endif + return getrusage(RUSAGE_SELF, ru); +} diff --git a/lib/getrusage.h b/lib/getrusage.h new file mode 100644 index 0000000..49e6427 --- /dev/null +++ b/lib/getrusage.h @@ -0,0 +1,9 @@ +#ifndef FIO_GETRUSAGE_H +#define FIO_GETRUSAGE_H + +#include +#include + +extern int fio_getrusage(struct rusage *ru); + +#endif diff --git a/lib/hweight.c b/lib/hweight.c new file mode 100644 index 0000000..2c819d6 --- /dev/null +++ b/lib/hweight.c @@ -0,0 +1,33 @@ +#include "hweight.h" + +unsigned int hweight8(uint8_t w) +{ + unsigned int res = w - ((w >> 1) & 0x55); + + res = (res & 0x33) + ((res >> 2) & 0x33); + return (res + (res >> 4)) & 0x0F; +} + +unsigned int hweight32(uint32_t w) +{ + unsigned int res = w - ((w >> 1) & 0x55555555); + + res = (res & 0x33333333) + ((res >> 2) & 0x33333333); + res = (res + (res >> 4)) & 0x0F0F0F0F; + res = res + (res >> 8); + return (res + (res >> 16)) & 0x000000FF; +} + +unsigned int hweight64(uint64_t w) +{ 
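+	/*
+	 * Parallel (SWAR) population count: each step sums adjacent 1-,
+	 * 2- and 4-bit groups in place, then folds the byte sums; e.g.
+	 * hweight64(0xff00ff00ff00ff00ULL) returns 32.
+	 */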
+#if BITS_PER_LONG == 32 + return hweight32((unsigned int)(w >> 32)) + hweight32((unsigned int)w); +#else + uint64_t res = w - ((w >> 1) & 0x5555555555555555ULL); + res = (res & 0x3333333333333333ULL) + ((res >> 2) & 0x3333333333333333ULL); + res = (res + (res >> 4)) & 0x0F0F0F0F0F0F0F0FULL; + res = res + (res >> 8); + res = res + (res >> 16); + return (res + (res >> 32)) & 0x00000000000000FFULL; +#endif +} diff --git a/lib/hweight.h b/lib/hweight.h new file mode 100644 index 0000000..68861dd --- /dev/null +++ b/lib/hweight.h @@ -0,0 +1,10 @@ +#ifndef FIO_HWEIGHT_H +#define FIO_HWEIGHT_H + +#include + +unsigned int hweight8(uint8_t w); +unsigned int hweight32(uint32_t w); +unsigned int hweight64(uint64_t w); + +#endif diff --git a/lib/ieee754.c b/lib/ieee754.c new file mode 100644 index 0000000..2154065 --- /dev/null +++ b/lib/ieee754.c @@ -0,0 +1,83 @@ +/* + * Shamelessly lifted from Beej's Guide to Network Programming, found here: + * + * http://beej.us/guide/bgnet/output/html/singlepage/bgnet.html#serialization + * + * Below code was granted to the public domain. + */ +#include "ieee754.h" + +uint64_t pack754(long double f, unsigned bits, unsigned expbits) +{ + long double fnorm; + int shift; + long long sign, exp, significand; + unsigned significandbits = bits - expbits - 1; // -1 for sign bit + + // get this special case out of the way + if (f == 0.0) + return 0; + + // check sign and begin normalization + if (f < 0) { + sign = 1; + fnorm = -f; + } else { + sign = 0; + fnorm = f; + } + + // get the normalized form of f and track the exponent + shift = 0; + while (fnorm >= 2.0) { + fnorm /= 2.0; + shift++; + } + while (fnorm < 1.0) { + fnorm *= 2.0; + shift--; + } + fnorm = fnorm - 1.0; + + // calculate the binary form (non-float) of the significand data + significand = fnorm * ((1LL << significandbits) + 0.5f); + + // get the biased exponent + exp = shift + ((1 << (expbits - 1)) - 1); // shift + bias + + // return the final answer + return (sign << (bits - 1)) | (exp << (bits-expbits - 1)) | significand; +} + +long double unpack754(uint64_t i, unsigned bits, unsigned expbits) +{ + long double result; + long long shift; + unsigned bias; + unsigned significandbits = bits - expbits - 1; // -1 for sign bit + + if (i == 0) + return 0.0; + + // pull the significand + result = (i & ((1LL << significandbits) - 1)); // mask + result /= (1LL << significandbits); // convert back to float + result += 1.0f; // add the one back on + + // deal with the exponent + bias = (1 << (expbits - 1)) - 1; + shift = ((i >> significandbits) & ((1LL << expbits) - 1)) - bias; + while (shift > 0) { + result *= 2.0; + shift--; + } + while (shift < 0) { + result /= 2.0; + shift++; + } + + // sign it + result *= (i >> (bits - 1)) & 1 ? 
-1.0 : 1.0; + + return result; +} diff --git a/lib/ieee754.h b/lib/ieee754.h new file mode 100644 index 0000000..5af9518 --- /dev/null +++ b/lib/ieee754.h @@ -0,0 +1,20 @@ +#ifndef FIO_IEEE754_H +#define FIO_IEEE754_H + +#include + +extern uint64_t pack754(long double f, unsigned bits, unsigned expbits); +extern long double unpack754(uint64_t i, unsigned bits, unsigned expbits); + +#define fio_double_to_uint64(val) pack754((val), 64, 11) +#define fio_uint64_to_double(val) unpack754((val), 64, 11) + +typedef struct fio_fp64 { + union { + uint64_t i; + double f; + uint8_t filler[16]; + } u; +} fio_fp64_t; + +#endif diff --git a/lib/lfsr.c b/lib/lfsr.c new file mode 100644 index 0000000..1ef6ebb --- /dev/null +++ b/lib/lfsr.c @@ -0,0 +1,269 @@ +#include + +#include "lfsr.h" +#include "../compiler/compiler.h" + +/* + * LFSR taps retrieved from: + * http://home1.gte.net/res0658s/electronics/LFSRtaps.html + * + * The memory overhead of the following tap table should be relatively small, + * no more than 400 bytes. + */ +static uint8_t lfsr_taps[64][FIO_MAX_TAPS] = +{ + {0}, {0}, {0}, //LFSRs with less that 3-bits cannot exist + {3, 2}, //Tap position for 3-bit LFSR + {4, 3}, //Tap position for 4-bit LFSR + {5, 3}, //Tap position for 5-bit LFSR + {6, 5}, //Tap position for 6-bit LFSR + {7, 6}, //Tap position for 7-bit LFSR + {8, 6, 5 ,4}, //Tap position for 8-bit LFSR + {9, 5}, //Tap position for 9-bit LFSR + {10, 7}, //Tap position for 10-bit LFSR + {11, 9}, //Tap position for 11-bit LFSR + {12, 6, 4, 1}, //Tap position for 12-bit LFSR + {13, 4, 3, 1}, //Tap position for 13-bit LFSR + {14, 5, 3, 1}, //Tap position for 14-bit LFSR + {15, 14}, //Tap position for 15-bit LFSR + {16, 15, 13, 4}, //Tap position for 16-bit LFSR + {17, 14}, //Tap position for 17-bit LFSR + {18, 11}, //Tap position for 18-bit LFSR + {19, 6, 2, 1}, //Tap position for 19-bit LFSR + {20, 17}, //Tap position for 20-bit LFSR + {21, 19}, //Tap position for 21-bit LFSR + {22, 21}, //Tap position for 22-bit LFSR + {23, 18}, //Tap position for 23-bit LFSR + {24, 23, 22, 17}, //Tap position for 24-bit LFSR + {25, 22}, //Tap position for 25-bit LFSR + {26, 6, 2, 1}, //Tap position for 26-bit LFSR + {27, 5, 2, 1}, //Tap position for 27-bit LFSR + {28, 25}, //Tap position for 28-bit LFSR + {29, 27}, //Tap position for 29-bit LFSR + {30, 6, 4, 1}, //Tap position for 30-bit LFSR + {31, 28}, //Tap position for 31-bit LFSR + {32, 31, 29, 1}, //Tap position for 32-bit LFSR + {33, 20}, //Tap position for 33-bit LFSR + {34, 27, 2, 1}, //Tap position for 34-bit LFSR + {35, 33}, //Tap position for 35-bit LFSR + {36, 25}, //Tap position for 36-bit LFSR + {37, 5, 4, 3, 2, 1}, //Tap position for 37-bit LFSR + {38, 6, 5, 1}, //Tap position for 38-bit LFSR + {39, 35}, //Tap position for 39-bit LFSR + {40, 38, 21, 19}, //Tap position for 40-bit LFSR + {41, 38}, //Tap position for 41-bit LFSR + {42, 41, 20, 19}, //Tap position for 42-bit LFSR + {43, 42, 38, 37}, //Tap position for 43-bit LFSR + {44, 43, 18, 17}, //Tap position for 44-bit LFSR + {45, 44, 42, 41}, //Tap position for 45-bit LFSR + {46, 45, 26, 25}, //Tap position for 46-bit LFSR + {47, 42}, //Tap position for 47-bit LFSR + {48, 47, 21, 20}, //Tap position for 48-bit LFSR + {49, 40}, //Tap position for 49-bit LFSR + {50, 49, 24, 23}, //Tap position for 50-bit LFSR + {51, 50, 36, 35}, //Tap position for 51-bit LFSR + {52, 49}, //Tap position for 52-bit LFSR + {53, 52, 38, 37}, //Tap position for 53-bit LFSR + {54, 53, 18, 17}, //Tap position for 54-bit LFSR + {55, 31}, //Tap position for 
55-bit LFSR + {56, 55, 35, 34}, //Tap position for 56-bit LFSR + {57, 50}, //Tap position for 57-bit LFSR + {58, 39}, //Tap position for 58-bit LFSR + {59, 58, 38, 37}, //Tap position for 59-bit LFSR + {60, 59}, //Tap position for 60-bit LFSR + {61, 60, 46, 45}, //Tap position for 61-bit LFSR + {62, 61, 6, 5}, //Tap position for 62-bit LFSR + {63, 62}, //Tap position for 63-bit LFSR +}; + +#define __LFSR_NEXT(__fl, __v) \ + __v = ((__v >> 1) | __fl->cached_bit) ^ \ + (((__v & 1ULL) - 1ULL) & __fl->xormask); + +static inline void __lfsr_next(struct fio_lfsr *fl, unsigned int spin) +{ + /* + * This should be O(1) since most compilers will create a jump table for + * this switch. + */ + switch (spin) { + case 15: __LFSR_NEXT(fl, fl->last_val); + /* fall through */ + case 14: __LFSR_NEXT(fl, fl->last_val); + /* fall through */ + case 13: __LFSR_NEXT(fl, fl->last_val); + /* fall through */ + case 12: __LFSR_NEXT(fl, fl->last_val); + /* fall through */ + case 11: __LFSR_NEXT(fl, fl->last_val); + /* fall through */ + case 10: __LFSR_NEXT(fl, fl->last_val); + /* fall through */ + case 9: __LFSR_NEXT(fl, fl->last_val); + /* fall through */ + case 8: __LFSR_NEXT(fl, fl->last_val); + /* fall through */ + case 7: __LFSR_NEXT(fl, fl->last_val); + /* fall through */ + case 6: __LFSR_NEXT(fl, fl->last_val); + /* fall through */ + case 5: __LFSR_NEXT(fl, fl->last_val); + /* fall through */ + case 4: __LFSR_NEXT(fl, fl->last_val); + /* fall through */ + case 3: __LFSR_NEXT(fl, fl->last_val); + /* fall through */ + case 2: __LFSR_NEXT(fl, fl->last_val); + /* fall through */ + case 1: __LFSR_NEXT(fl, fl->last_val); + /* fall through */ + case 0: __LFSR_NEXT(fl, fl->last_val); + /* fall through */ + default: break; + } +} + +/* + * lfsr_next does the following: + * + * a. Return if the number of max values has been exceeded. + * b. Check if we have a spin value that produces a repeating subsequence. + * This is previously calculated in `prepare_spin` and cycle_length should + * be > 0. If we do have such a spin: + * + * i. Decrement the calculated cycle. + * ii. If it reaches zero, add "+1" to the spin and reset the cycle_length + * (we have it cached in the struct fio_lfsr) + * + * In either case, continue with the calculation of the next value. + * c. Check if the calculated value exceeds the desirable range. In this case, + * go back to b, else return. + */ +int lfsr_next(struct fio_lfsr *fl, uint64_t *off) +{ + if (fl->num_vals++ > fl->max_val) + return 1; + + do { + if (fl->cycle_length && !--fl->cycle_length) { + __lfsr_next(fl, fl->spin + 1); + fl->cycle_length = fl->cached_cycle_length; + } else + __lfsr_next(fl, fl->spin); + } while (fio_unlikely(fl->last_val > fl->max_val)); + + *off = fl->last_val; + return 0; +} + +static uint64_t lfsr_create_xormask(uint8_t *taps) +{ + int i; + uint64_t xormask = 0; + + for(i = 0; i < FIO_MAX_TAPS && taps[i] != 0; i++) + xormask |= 1ULL << (taps[i] - 1); + + return xormask; +} + +static uint8_t *find_lfsr(uint64_t size) +{ + int i; + + /* + * For an LFSR, there is always a prohibited state (all ones). + * Thus, if we need to find the proper LFSR for our size, we must + * take that into account. + */ + for (i = 3; i < 64; i++) + if ((1ULL << i) > size) + return lfsr_taps[i]; + + return NULL; +} + +/* + * It is well-known that all maximal n-bit LFSRs will start repeating + * themselves after their 2^n iteration. The introduction of spins however, is + * possible to create a repetition of a sub-sequence before we hit that mark. 
+ * This happens if: + * + * [1]: ((2^n - 1) * i) % (spin + 1) == 0, + * where "n" is LFSR's bits and "i" any number within the range [1,spin] + * + * It is important to know beforehand if a spin can cause a repetition of a + * sub-sequence (cycle) and its length. However, calculating (2^n - 1) * i may + * produce a buffer overflow for "n" close to 64, so we expand the above to: + * + * [2]: (2^n - 1) -> (x * (spin + 1) + y), where x >= 0 and 0 <= y <= spin + * + * Thus, [1] is equivalent to (y * i) % (spin + 1) == 0; + * Also, the cycle's length will be (x * i) + (y * i) / (spin + 1) + */ +static int prepare_spin(struct fio_lfsr *fl, unsigned int spin) +{ + uint64_t max = (fl->cached_bit << 1) - 1; + uint64_t x, y; + int i; + + if (spin > 15) + return 1; + + x = max / (spin + 1); + y = max % (spin + 1); + fl->cycle_length = 0; /* No cycle occurs, other than the expected */ + fl->spin = spin; + + for (i = 1; i <= spin; i++) { + if ((y * i) % (spin + 1) == 0) { + fl->cycle_length = (x * i) + (y * i) / (spin + 1); + break; + } + } + fl->cached_cycle_length = fl->cycle_length; + + /* + * Increment cycle length for the first time only since the stored value + * will not be printed otherwise. + */ + fl->cycle_length++; + + return 0; +} + +int lfsr_reset(struct fio_lfsr *fl, uint64_t seed) +{ + uint64_t bitmask = (fl->cached_bit << 1) - 1; + + fl->num_vals = 0; + fl->last_val = seed & bitmask; + + /* All-ones state is illegal for XNOR LFSRs */ + if (fl->last_val == bitmask) + return 1; + + return 0; +} + +int lfsr_init(struct fio_lfsr *fl, uint64_t nums, uint64_t seed, + unsigned int spin) +{ + uint8_t *taps; + + taps = find_lfsr(nums); + if (!taps) + return 1; + + fl->max_val = nums - 1; + fl->xormask = lfsr_create_xormask(taps); + fl->cached_bit = 1ULL << (taps[0] - 1); + + if (prepare_spin(fl, spin)) + return 1; + + if (lfsr_reset(fl, seed)) + return 1; + + return 0; +} diff --git a/lib/lfsr.h b/lib/lfsr.h new file mode 100644 index 0000000..95bc07f --- /dev/null +++ b/lib/lfsr.h @@ -0,0 +1,30 @@ +#ifndef FIO_LFSR_H +#define FIO_LFSR_H + +#include + +#define FIO_MAX_TAPS 6 + +struct lfsr_taps { + unsigned int length; + unsigned int taps[FIO_MAX_TAPS]; +}; + + +struct fio_lfsr { + uint64_t xormask; + uint64_t last_val; + uint64_t cached_bit; + uint64_t max_val; + uint64_t num_vals; + uint64_t cycle_length; + uint64_t cached_cycle_length; + unsigned int spin; +}; + +int lfsr_next(struct fio_lfsr *fl, uint64_t *off); +int lfsr_init(struct fio_lfsr *fl, uint64_t size, + uint64_t seed, unsigned int spin); +int lfsr_reset(struct fio_lfsr *fl, uint64_t seed); + +#endif diff --git a/lib/memalign.c b/lib/memalign.c new file mode 100644 index 0000000..214a66f --- /dev/null +++ b/lib/memalign.c @@ -0,0 +1,36 @@ +#include +#include + +#include "memalign.h" +#include "smalloc.h" + +#define PTR_ALIGN(ptr, mask) \ + (char *)((uintptr_t)((ptr) + (mask)) & ~(mask)) + +struct align_footer { + unsigned int offset; +}; + +void *__fio_memalign(size_t alignment, size_t size, malloc_fn fn) +{ + struct align_footer *f; + void *ptr, *ret = NULL; + + assert(!(alignment & (alignment - 1))); + + ptr = fn(size + alignment + sizeof(*f) - 1); + if (ptr) { + ret = PTR_ALIGN(ptr, alignment - 1); + f = ret + size; + f->offset = (uintptr_t) ret - (uintptr_t) ptr; + } + + return ret; +} + +void __fio_memfree(void *ptr, size_t size, free_fn fn) +{ + struct align_footer *f = ptr + size; + + fn(ptr - f->offset); +} diff --git a/lib/memalign.h b/lib/memalign.h new file mode 100644 index 0000000..815e3aa --- /dev/null +++ 
b/lib/memalign.h @@ -0,0 +1,13 @@ +#ifndef FIO_MEMALIGN_H +#define FIO_MEMALIGN_H + +#include +#include + +typedef void* (*malloc_fn)(size_t); +typedef void (*free_fn)(void*); + +extern void *__fio_memalign(size_t alignment, size_t size, malloc_fn fn); +extern void __fio_memfree(void *ptr, size_t size, free_fn fn); + +#endif diff --git a/lib/memcpy.c b/lib/memcpy.c new file mode 100644 index 0000000..a552134 --- /dev/null +++ b/lib/memcpy.c @@ -0,0 +1,290 @@ +#include +#include +#include +#include + +#include "memcpy.h" +#include "rand.h" +#include "../fio_time.h" +#include "../gettime.h" +#include "../os/os.h" + +#define BUF_SIZE 32 * 1024 * 1024ULL + +#define NR_ITERS 64 + +struct memcpy_test { + const char *name; + void *src; + void *dst; + size_t size; +}; + +static struct memcpy_test tests[] = { + { + .name = "8 bytes", + .size = 8, + }, + { + .name = "16 bytes", + .size = 16, + }, + { + .name = "96 bytes", + .size = 96, + }, + { + .name = "128 bytes", + .size = 128, + }, + { + .name = "256 bytes", + .size = 256, + }, + { + .name = "512 bytes", + .size = 512, + }, + { + .name = "2048 bytes", + .size = 2048, + }, + { + .name = "8192 bytes", + .size = 8192, + }, + { + .name = "131072 bytes", + .size = 131072, + }, + { + .name = "262144 bytes", + .size = 262144, + }, + { + .name = "524288 bytes", + .size = 524288, + }, + { + .name = NULL, + }, +}; + +struct memcpy_type { + const char *name; + unsigned int mask; + void (*fn)(struct memcpy_test *); +}; + +enum { + T_MEMCPY = 1U << 0, + T_MEMMOVE = 1U << 1, + T_SIMPLE = 1U << 2, + T_HYBRID = 1U << 3, +}; + +#define do_test(test, fn) do { \ + size_t left, this; \ + void *src, *dst; \ + int i; \ + \ + for (i = 0; i < NR_ITERS; i++) { \ + left = BUF_SIZE; \ + src = test->src; \ + dst = test->dst; \ + while (left) { \ + this = test->size; \ + if (this > left) \ + this = left; \ + (fn)(dst, src, this); \ + left -= this; \ + src += this; \ + dst += this; \ + } \ + } \ +} while (0) + +static void t_memcpy(struct memcpy_test *test) +{ + do_test(test, memcpy); +} + +static void t_memmove(struct memcpy_test *test) +{ + do_test(test, memmove); +} + +static void simple_memcpy(void *dst, void const *src, size_t len) +{ + char *d = dst; + const char *s = src; + + while (len--) + *d++ = *s++; +} + +static void t_simple(struct memcpy_test *test) +{ + do_test(test, simple_memcpy); +} + +static void t_hybrid(struct memcpy_test *test) +{ + if (test->size >= 64) + do_test(test, simple_memcpy); + else + do_test(test, memcpy); +} + +static struct memcpy_type t[] = { + { + .name = "memcpy", + .mask = T_MEMCPY, + .fn = t_memcpy, + }, + { + .name = "memmove", + .mask = T_MEMMOVE, + .fn = t_memmove, + }, + { + .name = "simple", + .mask = T_SIMPLE, + .fn = t_simple, + }, + { + .name = "hybrid", + .mask = T_HYBRID, + .fn = t_hybrid, + }, + { + .name = NULL, + }, +}; + +static unsigned int get_test_mask(const char *type) +{ + char *ostr, *str = strdup(type); + unsigned int mask; + char *name; + int i; + + ostr = str; + mask = 0; + while ((name = strsep(&str, ",")) != NULL) { + for (i = 0; t[i].name; i++) { + if (!strcmp(t[i].name, name)) { + mask |= t[i].mask; + break; + } + } + } + + free(ostr); + return mask; +} + +static int list_types(void) +{ + int i; + + for (i = 0; t[i].name; i++) + printf("%s\n", t[i].name); + + return 1; +} + +static int setup_tests(void) +{ + struct memcpy_test *test; + struct frand_state state; + void *src, *dst; + int i; + + if (!tests[0].name) + return 0; + + src = malloc(BUF_SIZE); + dst = malloc(BUF_SIZE); + if (!src || !dst) { + 
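+		/*
+		 * free(NULL) is a no-op, so whichever of the two
+		 * allocations failed can be unwound unconditionally.
+		 */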
free(src); + free(dst); + return 1; + } + + init_rand_seed(&state, 0x8989, 0); + fill_random_buf(&state, src, BUF_SIZE); + + for (i = 0; tests[i].name; i++) { + test = &tests[i]; + test->src = src; + test->dst = dst; + } + + return 0; +} + +static void free_tests(void) +{ + free(tests[0].src); + free(tests[0].dst); +} + +int fio_memcpy_test(const char *type) +{ + unsigned int test_mask = 0; + int j, i; + + if (!type) + test_mask = ~0U; + else if (!strcmp(type, "help") || !strcmp(type, "list")) + return list_types(); + else + test_mask = get_test_mask(type); + + if (!test_mask) { + fprintf(stderr, "fio: unknown hash `%s`. Available:\n", type); + return list_types(); + } + + if (setup_tests()) { + fprintf(stderr, "setting up mem regions failed\n"); + return 1; + } + + for (i = 0; t[i].name; i++) { + struct timespec ts; + double mb_sec; + uint64_t usec; + + if (!(t[i].mask & test_mask)) + continue; + + /* + * For first run, make sure CPUs are spun up and that + * we've touched the data. + */ + usec_spin(100000); + t[i].fn(&tests[0]); + + printf("%s\n", t[i].name); + + for (j = 0; tests[j].name; j++) { + fio_gettime(&ts, NULL); + t[i].fn(&tests[j]); + usec = utime_since_now(&ts); + + if (usec) { + unsigned long long mb = NR_ITERS * BUF_SIZE; + + mb_sec = (double) mb / (double) usec; + mb_sec /= (1.024 * 1.024); + printf("\t%s:\t%8.2f MiB/sec\n", tests[j].name, mb_sec); + } else + printf("\t%s:inf MiB/sec\n", tests[j].name); + } + } + + free_tests(); + return 0; +} diff --git a/lib/memcpy.h b/lib/memcpy.h new file mode 100644 index 0000000..f61a4a0 --- /dev/null +++ b/lib/memcpy.h @@ -0,0 +1,6 @@ +#ifndef FIO_MEMCPY_H +#define FIO_MEMCPY_H + +int fio_memcpy_test(const char *type); + +#endif diff --git a/lib/mountcheck.c b/lib/mountcheck.c new file mode 100644 index 0000000..2fb6fe7 --- /dev/null +++ b/lib/mountcheck.c @@ -0,0 +1,85 @@ +#include +#include + +#ifdef CONFIG_GETMNTENT +#include + +#include "mountcheck.h" + +#define MTAB "/etc/mtab" + +int device_is_mounted(const char *dev) +{ + FILE *mtab; + struct mntent *mnt; + int ret = 0; + + mtab = setmntent(MTAB, "r"); + if (!mtab) + return 0; + + while ((mnt = getmntent(mtab)) != NULL) { + if (!mnt->mnt_fsname) + continue; + if (!strcmp(mnt->mnt_fsname, dev)) { + ret = 1; + break; + } + } + + endmntent(mtab); + return ret; +} + +#elif defined(CONFIG_GETMNTINFO) +/* for most BSDs */ +#include +#include + +int device_is_mounted(const char *dev) +{ + struct statfs *st; + int i, ret; + + ret = getmntinfo(&st, MNT_NOWAIT); + if (ret <= 0) + return 0; + + for (i = 0; i < ret; i++) { + if (!strcmp(st[i].f_mntfromname, dev)) + return 1; + } + + return 0; +} + +#elif defined(CONFIG_GETMNTINFO_STATVFS) +/* for NetBSD */ +#include + +int device_is_mounted(const char *dev) +{ + struct statvfs *st; + int i, ret; + + ret = getmntinfo(&st, MNT_NOWAIT); + if (ret <= 0) + return 0; + + for (i = 0; i < ret; i++) { + if (!strcmp(st[i].f_mntfromname, dev)) + return 1; + } + + return 0; +} + +#else +/* others */ + +int device_is_mounted(const char *dev) +{ + return 0; +} + +#endif diff --git a/lib/mountcheck.h b/lib/mountcheck.h new file mode 100644 index 0000000..14ec45a --- /dev/null +++ b/lib/mountcheck.h @@ -0,0 +1,6 @@ +#ifndef FIO_MOUNT_CHECK_H +#define FIO_MOUNT_CHECK_H + +extern int device_is_mounted(const char *); + +#endif diff --git a/lib/nowarn_snprintf.h b/lib/nowarn_snprintf.h new file mode 100644 index 0000000..81a6d10 --- /dev/null +++ b/lib/nowarn_snprintf.h @@ -0,0 +1,27 @@ +#ifndef _NOWARN_SNPRINTF_H_ +#define _NOWARN_SNPRINTF_H_ + +#include 
+#include + +static inline int nowarn_snprintf(char *str, size_t size, const char *format, + ...) +{ + va_list args; + int res; + + va_start(args, format); +#if __GNUC__ -0 >= 8 +#pragma GCC diagnostic push "-Wformat-truncation" +#pragma GCC diagnostic ignored "-Wformat-truncation" +#endif + res = vsnprintf(str, size, format, args); +#if __GNUC__ -0 >= 8 +#pragma GCC diagnostic pop "-Wformat-truncation" +#endif + va_end(args); + + return res; +} + +#endif diff --git a/lib/num2str.c b/lib/num2str.c new file mode 100644 index 0000000..1abe22f --- /dev/null +++ b/lib/num2str.c @@ -0,0 +1,120 @@ +#include +#include +#include +#include + +#include "../compiler/compiler.h" +#include "num2str.h" + +#define ARRAY_SIZE(x) (sizeof((x)) / (sizeof((x)[0]))) + +/** + * num2str() - Cheesy number->string conversion, complete with carry rounding error. + * @num: quantity (e.g., number of blocks, bytes or bits) + * @maxlen: max number of digits in the output string (not counting prefix and units, but counting .) + * @base: multiplier for num (e.g., if num represents Ki, use 1024) + * @pow2: select unit prefix - 0=power-of-10 decimal SI, nonzero=power-of-2 binary IEC + * @units: select units - N2S_* constants defined in num2str.h + * @returns a malloc'd buffer containing "number[][]" + */ +char *num2str(uint64_t num, int maxlen, int base, int pow2, enum n2s_unit units) +{ + const char *sistr[] = { "", "k", "M", "G", "T", "P" }; + const char *iecstr[] = { "", "Ki", "Mi", "Gi", "Ti", "Pi" }; + const char **unitprefix; + static const char *const unitstr[] = { + [N2S_NONE] = "", + [N2S_PERSEC] = "/s", + [N2S_BYTE] = "B", + [N2S_BIT] = "bit", + [N2S_BYTEPERSEC]= "B/s", + [N2S_BITPERSEC] = "bit/s" + }; + const unsigned int thousand = pow2 ? 1024 : 1000; + unsigned int modulo; + int post_index, carry = 0; + char tmp[32], fmt[32]; + char *buf; + + compiletime_assert(sizeof(sistr) == sizeof(iecstr), "unit prefix arrays must be identical sizes"); + assert(units < ARRAY_SIZE(unitstr)); + + buf = malloc(128); + if (!buf) + return NULL; + + if (pow2) + unitprefix = iecstr; + else + unitprefix = sistr; + + for (post_index = 0; base > 1; post_index++) + base /= thousand; + + switch (units) { + case N2S_NONE: + break; + case N2S_PERSEC: + break; + case N2S_BYTE: + break; + case N2S_BIT: + num *= 8; + break; + case N2S_BYTEPERSEC: + break; + case N2S_BITPERSEC: + num *= 8; + break; + } + + /* + * Divide by K/Ki until string length of num <= maxlen. + */ + modulo = -1U; + while (post_index < ARRAY_SIZE(sistr)) { + sprintf(tmp, "%llu", (unsigned long long) num); + if (strlen(tmp) <= maxlen) + break; + + modulo = num % thousand; + num /= thousand; + carry = modulo >= thousand / 2; + post_index++; + } + + /* + * If no modulo, then we're done. + */ + if (modulo == -1U) { +done: + if (post_index >= ARRAY_SIZE(sistr)) + post_index = 0; + + sprintf(buf, "%llu%s%s", (unsigned long long) num, + unitprefix[post_index], unitstr[units]); + return buf; + } + + /* + * If no room for decimals, then we're done. + */ + sprintf(tmp, "%llu", (unsigned long long) num); + if ((int)(maxlen - strlen(tmp)) <= 1) { + if (carry) + num++; + goto done; + } + + /* + * Fill in everything and return the result. 
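+	 * For example (illustrative arguments, not from the fio docs):
+	 * num2str(1536, 3, 1, 1, N2S_BYTE) yields "1.5KiB", while the
+	 * same call with pow2 == 0 divides by 1000 and yields "1.5kB".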
+ */ + assert(maxlen - strlen(tmp) - 1 > 0); + assert(modulo < thousand); + sprintf(fmt, "%%.%df", (int)(maxlen - strlen(tmp) - 1)); + sprintf(tmp, fmt, (double)modulo / (double)thousand); + + sprintf(buf, "%llu.%s%s%s", (unsigned long long) num, &tmp[2], + unitprefix[post_index], unitstr[units]); + return buf; +} diff --git a/lib/num2str.h b/lib/num2str.h new file mode 100644 index 0000000..797288b --- /dev/null +++ b/lib/num2str.h @@ -0,0 +1,17 @@ +#ifndef FIO_NUM2STR_H +#define FIO_NUM2STR_H + +#include + +enum n2s_unit { + N2S_NONE = 0, + N2S_PERSEC = 1, + N2S_BYTE = 2, + N2S_BIT = 3, + N2S_BYTEPERSEC = 4, + N2S_BITPERSEC = 5, +}; + +extern char *num2str(uint64_t, int, int, int, enum n2s_unit); + +#endif diff --git a/lib/output_buffer.c b/lib/output_buffer.c new file mode 100644 index 0000000..beb8a14 --- /dev/null +++ b/lib/output_buffer.c @@ -0,0 +1,41 @@ +#include +#include + +#include "output_buffer.h" +#include "../minmax.h" + +#define BUF_INC 1024 + +void buf_output_init(struct buf_output *out) +{ + out->max_buflen = 0; + out->buflen = 0; + out->buf = NULL; +} + +void buf_output_free(struct buf_output *out) +{ + free(out->buf); + buf_output_init(out); +} + +size_t buf_output_add(struct buf_output *out, const char *buf, size_t len) +{ + if (out->max_buflen - out->buflen < len) { + size_t need = len - (out->max_buflen - out->buflen); + size_t old_max = out->max_buflen; + + need = max((size_t) BUF_INC, need); + out->max_buflen += need; + out->buf = realloc(out->buf, out->max_buflen); + + old_max = max(old_max, out->buflen + len); + if (old_max + need > out->max_buflen) + need = out->max_buflen - old_max; + memset(&out->buf[old_max], 0, need); + } + + memcpy(&out->buf[out->buflen], buf, len); + out->buflen += len; + return len; +} diff --git a/lib/output_buffer.h b/lib/output_buffer.h new file mode 100644 index 0000000..389ed5b --- /dev/null +++ b/lib/output_buffer.h @@ -0,0 +1,16 @@ +#ifndef FIO_OUTPUT_BUFFER_H +#define FIO_OUTPUT_BUFFER_H + +#include + +struct buf_output { + char *buf; + size_t buflen; + size_t max_buflen; +}; + +void buf_output_init(struct buf_output *out); +void buf_output_free(struct buf_output *out); +size_t buf_output_add(struct buf_output *out, const char *buf, size_t len); + +#endif diff --git a/lib/pattern.c b/lib/pattern.c new file mode 100644 index 0000000..2024f2e --- /dev/null +++ b/lib/pattern.c @@ -0,0 +1,536 @@ +#include +#include +#include +#include +#include +#include +#include +#include + +#include "strntol.h" +#include "pattern.h" +#include "../minmax.h" +#include "../oslib/strcasestr.h" +#include "../oslib/strndup.h" + +/** + * parse_file() - parses binary file to fill buffer + * @beg - string input, extract filename from this + * @out - output buffer where parsed number should be put + * @out_len - length of the output buffer + * @filled - pointer where number of bytes successfully + * parsed will be put + * + * Returns the end pointer where parsing has been stopped. + * In case of parsing error or lack of bytes in output buffer + * NULL will be returned. 
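+ *
+ * For example (hypothetical file name): the fragment 'payload.bin'
+ * reads up to @out_len bytes of that file into @out and resumes
+ * parsing right after the closing quote.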
+ */ +static const char *parse_file(const char *beg, char *out, + unsigned int out_len, + unsigned int *filled) +{ + const char *end; + char *file; + int fd; + ssize_t count; + + if (!out_len) + goto err_out; + + assert(*beg == '\''); + beg++; + end = strchr(beg, '\''); + if (!end) + goto err_out; + + file = strndup(beg, end - beg); + if (file == NULL) + goto err_out; + + fd = open(file, O_RDONLY); + if (fd < 0) + goto err_free_out; + + count = read(fd, out, out_len); + if (count == -1) + goto err_free_close_out; + + *filled = count; + close(fd); + free(file); + + /* Catch up quote */ + return end + 1; + +err_free_close_out: + close(fd); +err_free_out: + free(file); +err_out: + return NULL; + +} + +/** + * parse_string() - parses string in double quotes, like "abc" + * @beg - string input + * @out - output buffer where parsed number should be put + * @out_len - length of the output buffer + * @filled - pointer where number of bytes successfully + * parsed will be put + * + * Returns the end pointer where parsing has been stopped. + * In case of parsing error or lack of bytes in output buffer + * NULL will be returned. + */ +static const char *parse_string(const char *beg, char *out, + unsigned int out_len, + unsigned int *filled) +{ + const char *end; + + if (!out_len) + return NULL; + + assert(*beg == '"'); + beg++; + end = strchr(beg, '"'); + if (!end) + return NULL; + if (end - beg > out_len) + return NULL; + + memcpy(out, beg, end - beg); + *filled = end - beg; + + /* Catch up quote */ + return end + 1; +} + +/** + * parse_number() - parses numbers + * @beg - string input + * @out - output buffer where parsed number should be put + * @out_len - length of the output buffer + * @filled - pointer where number of bytes successfully + * parsed will be put + * + * Supports decimals in the range [INT_MIN, INT_MAX] and + * hexidecimals of any size, which should be started with + * prefix 0x or 0X. + * + * Returns the end pointer where parsing has been stopped. + * In case of parsing error or lack of bytes in output buffer + * NULL will be returned. + */ +static const char *parse_number(const char *beg, char *out, + unsigned int out_len, + unsigned int *filled) +{ + const char *end; + unsigned int val; + long lval; + int num, i; + + if (!out_len) + return NULL; + + num = 0; + sscanf(beg, "0%*[xX]%*[0-9a-fA-F]%n", &num); + if (num == 0) { + /* Here we are trying to parse decimal */ + + char *_end; + + /* Looking ahead */ + _end = strcasestr(beg, "0x"); + if (_end) + num = _end - beg; + if (num) + lval = strntol(beg, num, &_end, 10); + else + lval = strtol(beg, &_end, 10); + if (beg == _end || lval > INT_MAX || lval < INT_MIN) + return NULL; + end = _end; + i = 0; + if (!lval) { + num = 0; + out[i] = 0x00; + i = 1; + } else { + val = (unsigned int)lval; + for (; val && out_len; out_len--, i++, val >>= 8) + out[i] = val & 0xff; + if (val) + return NULL; + } + } else { + assert(num > 2); + + /* Catch up 0x prefix */ + num -= 2; + beg += 2; + + /* Look back, handle this combined string: 0xff0x14 */ + if (beg[num] && !strncasecmp(&beg[num - 1], "0x", 2)) + num--; + + end = beg + num; + + for (i = 0; num && out_len; + out_len--, i++, num -= 2, beg += 2) { + const char *fmt; + + fmt = (num & 1 ? 
"%1hhx" : "%2hhx"); + sscanf(beg, fmt, &out[i]); + if (num & 1) { + num++; + beg--; + } + } + if (num) + return NULL; + } + + *filled = i; + return end; + +} + +/** + * parse_format() - parses formats, like %o, etc + * @in - string input + * @out - output buffer where space for format should be reserved + * @parsed - number of bytes which were already parsed so far + * @out_len - length of the output buffer + * @fmt_desc - format descritor array, what we expect to find + * @fmt_desc_sz - size of the format descritor array + * @fmt - format array, the output + * @fmt_sz - size of format array + * + * This function tries to find formats, e.g.: + * %o - offset of the block + * + * In case of successfull parsing it fills the format param + * with proper offset and the size of the expected value, which + * should be pasted into buffer using the format 'func' callback. + * + * Returns the end pointer where parsing has been stopped. + * In case of parsing error or lack of bytes in output buffer + * NULL will be returned. + */ +static const char *parse_format(const char *in, char *out, unsigned int parsed, + unsigned int out_len, unsigned int *filled, + const struct pattern_fmt_desc *fmt_desc, + unsigned int fmt_desc_sz, + struct pattern_fmt *fmt, unsigned int fmt_sz) +{ + int i; + struct pattern_fmt *f = NULL; + unsigned int len = 0; + + if (!out_len || !fmt_desc || !fmt_desc_sz || !fmt || !fmt_sz) + return NULL; + + assert(*in == '%'); + + for (i = 0; i < fmt_desc_sz; i++) { + const struct pattern_fmt_desc *desc; + + desc = &fmt_desc[i]; + len = strlen(desc->fmt); + if (0 == strncmp(in, desc->fmt, len)) { + fmt->desc = desc; + fmt->off = parsed; + f = fmt; + break; + } + } + + if (!f) + return NULL; + if (f->desc->len > out_len) + return NULL; + + memset(out, '\0', f->desc->len); + *filled = f->desc->len; + + return in + len; +} + +/** + * parse_and_fill_pattern() - Parses combined input, which consists of strings, + * numbers and pattern formats. + * @in - string input + * @in_len - size of the input string + * @out - output buffer where parsed result will be put + * @out_len - lengths of the output buffer + * @fmt_desc - array of pattern format descriptors [input] + * @fmt_desc_sz - size of the format descriptor array + * @fmt - array of pattern formats [output] + * @fmt_sz - pointer where the size of pattern formats array stored [input], + * after successfull parsing this pointer will contain the number + * of parsed formats if any [output]. + * + * strings: + * bytes sequence in double quotes, e.g. "123". + * NOTE: there is no way to escape quote, so "123\"abc" does not work. + * + * numbers: + * hexidecimal - sequence of hex bytes starting from 0x or 0X prefix, + * e.g. 0xff12ceff1100ff + * decimal - decimal number in range [INT_MIN, INT_MAX] + * + * formats: + * %o - offset of block, reserved 8 bytes. + * + * Explicit examples of combined string: + * #1 #2 #3 #4 + * in="abcd" in=-1024 in=66 in=0xFF0X1 + * out=61 62 63 64 out=00 fc ff ff out=42 out=ff 01 + * + * #5 #6 + * in=%o in="123"0xFFeeCC + * out=00 00 00 00 00 00 00 00 out=31 32 33 ff ec cc + * + * #7 + * in=-100xab"1"%o"2" + * out=f6 ff ff ff ab 31 00 00 00 00 00 00 00 00 32 + * + * #9 + * in=%o0xdeadbeef%o + * out=00 00 00 00 00 00 00 00 de ad be ef 00 00 00 00 00 00 00 00 + * + * #10 + * in=0xfefefefefefefefefefefefefefefefefefefefefe + * out=fe fe fe fe fe fe fe fe fe fe fe fe fe fe fe fe fe fe fe fe fe + * + * Returns number of bytes filled or err < 0 in case of failure. 
+ */ +int parse_and_fill_pattern(const char *in, unsigned int in_len, + char *out, unsigned int out_len, + const struct pattern_fmt_desc *fmt_desc, + unsigned int fmt_desc_sz, + struct pattern_fmt *fmt, + unsigned int *fmt_sz_out) +{ + const char *beg, *end, *out_beg = out; + unsigned int total = 0, fmt_rem = 0; + + if (!in || !in_len || !out || !out_len) + return -EINVAL; + if (fmt_sz_out) + fmt_rem = *fmt_sz_out; + + beg = in; + do { + unsigned int filled; + int parsed_fmt; + + filled = 0; + parsed_fmt = 0; + + switch (*beg) { + case '\'': + end = parse_file(beg, out, out_len, &filled); + break; + case '"': + end = parse_string(beg, out, out_len, &filled); + break; + case '%': + end = parse_format(beg, out, out - out_beg, out_len, + &filled, fmt_desc, fmt_desc_sz, + fmt, fmt_rem); + parsed_fmt = 1; + break; + default: + end = parse_number(beg, out, out_len, &filled); + break; + } + + if (!end) + return -EINVAL; + + if (parsed_fmt) { + assert(fmt_rem); + fmt_rem--; + fmt++; + } + + assert(end - beg <= in_len); + in_len -= end - beg; + beg = end; + + assert(filled); + assert(filled <= out_len); + out_len -= filled; + out += filled; + total += filled; + + } while (in_len); + + if (fmt_sz_out) + *fmt_sz_out -= fmt_rem; + return total; +} + +/** + * dup_pattern() - Duplicates part of the pattern all over the buffer. + * + * Returns 0 in case of success or errno < 0 in case of failure. + */ +static int dup_pattern(char *out, unsigned int out_len, unsigned int pattern_len) +{ + unsigned int left, len, off; + + if (out_len <= pattern_len) + /* Normal case */ + return 0; + + off = pattern_len; + left = (out_len - off); + len = min(left, off); + + /* Duplicate leftover */ + while (left) { + memcpy(out + off, out, len); + left -= len; + off <<= 1; + len = min(left, off); + } + + return 0; +} + +/** + * cpy_pattern() - Copies pattern to the buffer. + * + * Function copies pattern along the whole buffer. + * + * Returns 0 in case of success or errno < 0 in case of failure. + */ +int cpy_pattern(const char *pattern, unsigned int pattern_len, + char *out, unsigned int out_len) +{ + unsigned int len; + + if (!pattern || !pattern_len || !out || !out_len) + return -EINVAL; + + /* Copy pattern */ + len = min(pattern_len, out_len); + memcpy(out, pattern, len); + + /* Spread filled chunk all over the buffer */ + return dup_pattern(out, out_len, pattern_len); +} + +/** + * cmp_pattern() - Compares pattern and buffer. + * + * For the sake of performance this function avoids any loops. + * Firstly it tries to compare the buffer itself, checking that + * buffer consists of repeating patterns along the buffer size. + * + * If the difference is not found then the function tries to compare + * buffer and pattern. + * + * Returns 0 in case of success or errno < 0 in case of failure. + */ +int cmp_pattern(const char *pattern, unsigned int pattern_size, + unsigned int off, const char *buf, unsigned int len) +{ + int rc; + unsigned int size; + + /* Find the difference in buffer */ + if (len > pattern_size) { + rc = memcmp(buf, buf + pattern_size, len - pattern_size); + if (rc) + return -EILSEQ; + } + /* Compare second part of the pattern with buffer */ + if (off) { + size = min(len, pattern_size - off); + rc = memcmp(buf, pattern + off, size); + if (rc) + return -EILSEQ; + buf += size; + len -= size; + } + /* Compare first part of the pattern or the whole pattern + * with buffer */ + if (len) { + size = min(len, (off ? 
off : pattern_size)); + rc = memcmp(buf, pattern, size); + if (rc) + return -EILSEQ; + } + + return 0; +} + +/** + * paste_format_inplace() - Pastes parsed formats to the pattern. + * + * This function pastes formats to the pattern. If @fmt_sz is 0 + * function does nothing and pattern buffer is left untouched. + * + * Returns 0 in case of success or errno < 0 in case of failure. + */ +int paste_format_inplace(char *pattern, unsigned int pattern_len, + struct pattern_fmt *fmt, unsigned int fmt_sz, + void *priv) +{ + int i, rc; + unsigned int len; + + if (!pattern || !pattern_len || !fmt) + return -EINVAL; + + /* Paste formats for first pattern chunk */ + for (i = 0; i < fmt_sz; i++) { + struct pattern_fmt *f; + + f = &fmt[i]; + if (pattern_len <= f->off) + break; + len = min(pattern_len - f->off, f->desc->len); + rc = f->desc->paste(pattern + f->off, len, priv); + if (rc) + return rc; + } + + return 0; +} + +/** + * paste_format() - Pastes parsed formats to the buffer. + * + * This function copies pattern to the buffer, pastes format + * into it and then duplicates pattern all over the buffer size. + * + * Returns 0 in case of success or errno < 0 in case of failure. + */ +int paste_format(const char *pattern, unsigned int pattern_len, + struct pattern_fmt *fmt, unsigned int fmt_sz, + char *out, unsigned int out_len, void *priv) +{ + int rc; + unsigned int len; + + if (!pattern || !pattern_len || !out || !out_len) + return -EINVAL; + + /* Copy pattern */ + len = min(pattern_len, out_len); + memcpy(out, pattern, len); + + rc = paste_format_inplace(out, len, fmt, fmt_sz, priv); + if (rc) + return rc; + + /* Spread filled chunk all over the buffer */ + return dup_pattern(out, out_len, pattern_len); +} diff --git a/lib/pattern.h b/lib/pattern.h new file mode 100644 index 0000000..2d655ad --- /dev/null +++ b/lib/pattern.h @@ -0,0 +1,45 @@ +#ifndef FIO_PARSE_PATTERN_H +#define FIO_PARSE_PATTERN_H + +/** + * Pattern format description. The input for 'parse_pattern'. + * Describes format with its name and callback, which should + * be called to paste something inside the buffer. + */ +struct pattern_fmt_desc { + const char *fmt; + unsigned int len; + int (*paste)(char *buf, unsigned int len, void *priv); +}; + +/** + * Pattern format. The output of 'parse_pattern'. + * Describes the exact position inside the xbuffer. 
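+ *
+ * Typical flow (sketch): parse_and_fill_pattern() records one
+ * pattern_fmt per %-format found in the input; paste_format() later
+ * stamps the live value (e.g. the block offset for "%o") into every
+ * repetition of the pattern at that offset.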
+ */ +struct pattern_fmt { + unsigned int off; + const struct pattern_fmt_desc *desc; +}; + +int parse_and_fill_pattern(const char *in, unsigned int in_len, + char *out, unsigned int out_len, + const struct pattern_fmt_desc *fmt_desc, + unsigned int fmt_desc_sz, + struct pattern_fmt *fmt, + unsigned int *fmt_sz_out); + +int paste_format_inplace(char *pattern, unsigned int pattern_len, + struct pattern_fmt *fmt, unsigned int fmt_sz, + void *priv); + +int paste_format(const char *pattern, unsigned int pattern_len, + struct pattern_fmt *fmt, unsigned int fmt_sz, + char *out, unsigned int out_len, void *priv); + +int cpy_pattern(const char *pattern, unsigned int pattern_len, + char *out, unsigned int out_len); + +int cmp_pattern(const char *pattern, unsigned int pattern_size, + unsigned int off, const char *buf, unsigned int len); + +#endif diff --git a/lib/pow2.h b/lib/pow2.h new file mode 100644 index 0000000..2cbca1a --- /dev/null +++ b/lib/pow2.h @@ -0,0 +1,12 @@ +#ifndef FIO_POW2_H +#define FIO_POW2_H + +#include +#include "types.h" + +static inline bool is_power_of_2(uint64_t val) +{ + return (val != 0 && ((val & (val - 1)) == 0)); +} + +#endif diff --git a/lib/prio_tree.c b/lib/prio_tree.c new file mode 100644 index 0000000..d8e1b89 --- /dev/null +++ b/lib/prio_tree.c @@ -0,0 +1,469 @@ +/* + * lib/prio_tree.c - priority search tree + * + * Copyright (C) 2004, Rajesh Venkatasubramanian + * + * This file is released under the GPL v2. + * + * Based on the radix priority search tree proposed by Edward M. McCreight + * SIAM Journal of Computing, vol. 14, no.2, pages 257-276, May 1985 + * + * 02Feb2004 Initial version + */ + +#include +#include +#include + +#include "../compiler/compiler.h" +#include "prio_tree.h" + +#define ARRAY_SIZE(x) (sizeof((x)) / (sizeof((x)[0]))) + +/* + * A clever mix of heap and radix trees forms a radix priority search tree (PST) + * which is useful for storing intervals, e.g, we can consider a vma as a closed + * interval of file pages [offset_begin, offset_end], and store all vmas that + * map a file in a PST. Then, using the PST, we can answer a stabbing query, + * i.e., selecting a set of stored intervals (vmas) that overlap with (map) a + * given input interval X (a set of consecutive file pages), in "O(log n + m)" + * time where 'log n' is the height of the PST, and 'm' is the number of stored + * intervals (vmas) that overlap (map) with the input interval X (the set of + * consecutive file pages). + * + * In our implementation, we store closed intervals of the form [radix_index, + * heap_index]. We assume that always radix_index <= heap_index. McCreight's PST + * is designed for storing intervals with unique radix indices, i.e., each + * interval have different radix_index. However, this limitation can be easily + * overcome by using the size, i.e., heap_index - radix_index, as part of the + * index, so we index the tree using [(radix_index,size), heap_index]. + * + * When the above-mentioned indexing scheme is used, theoretically, in a 32 bit + * machine, the maximum height of a PST can be 64. We can use a balanced version + * of the priority search tree to optimize the tree height, but the balanced + * tree proposed by McCreight is too complex and memory-hungry for our purpose. 
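+ *
+ * For example (illustrative intervals): with [2,5] and [3,3] stored,
+ * a stabbing query for [3,6] reports both nodes, since each stored
+ * interval overlaps the query range.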
+ */ + +static void get_index(const struct prio_tree_node *node, + unsigned long *radix, unsigned long *heap) +{ + *radix = node->start; + *heap = node->last; +} + +static unsigned long index_bits_to_maxindex[BITS_PER_LONG]; + +static void fio_init prio_tree_init(void) +{ + unsigned int i; + + for (i = 0; i < ARRAY_SIZE(index_bits_to_maxindex) - 1; i++) + index_bits_to_maxindex[i] = (1UL << (i + 1)) - 1; + index_bits_to_maxindex[ARRAY_SIZE(index_bits_to_maxindex) - 1] = ~0UL; +} + +/* + * Maximum heap_index that can be stored in a PST with index_bits bits + */ +static inline unsigned long prio_tree_maxindex(unsigned int bits) +{ + return index_bits_to_maxindex[bits - 1]; +} + +/* + * Extend a priority search tree so that it can store a node with heap_index + * max_heap_index. In the worst case, this algorithm takes O((log n)^2). + * However, this function is used rarely and the common case performance is + * not bad. + */ +static struct prio_tree_node *prio_tree_expand(struct prio_tree_root *root, + struct prio_tree_node *node, unsigned long max_heap_index) +{ + struct prio_tree_node *first = NULL, *prev, *last = NULL; + + if (max_heap_index > prio_tree_maxindex(root->index_bits)) + root->index_bits++; + + while (max_heap_index > prio_tree_maxindex(root->index_bits)) { + root->index_bits++; + + if (prio_tree_empty(root)) + continue; + + if (first == NULL) { + first = root->prio_tree_node; + prio_tree_remove(root, root->prio_tree_node); + INIT_PRIO_TREE_NODE(first); + last = first; + } else { + prev = last; + last = root->prio_tree_node; + prio_tree_remove(root, root->prio_tree_node); + INIT_PRIO_TREE_NODE(last); + prev->left = last; + last->parent = prev; + } + } + + INIT_PRIO_TREE_NODE(node); + + if (first) { + node->left = first; + first->parent = node; + } else + last = node; + + if (!prio_tree_empty(root)) { + last->left = root->prio_tree_node; + last->left->parent = last; + } + + root->prio_tree_node = node; + return node; +} + +/* + * Replace a prio_tree_node with a new node and return the old node + */ +struct prio_tree_node *prio_tree_replace(struct prio_tree_root *root, + struct prio_tree_node *old, struct prio_tree_node *node) +{ + INIT_PRIO_TREE_NODE(node); + + if (prio_tree_root(old)) { + assert(root->prio_tree_node == old); + /* + * We can reduce root->index_bits here. However, it is complex + * and does not help much to improve performance (IMO). + */ + node->parent = node; + root->prio_tree_node = node; + } else { + node->parent = old->parent; + if (old->parent->left == old) + old->parent->left = node; + else + old->parent->right = node; + } + + if (!prio_tree_left_empty(old)) { + node->left = old->left; + old->left->parent = node; + } + + if (!prio_tree_right_empty(old)) { + node->right = old->right; + old->right->parent = node; + } + + return old; +} + +/* + * Insert a prio_tree_node @node into a radix priority search tree @root. The + * algorithm typically takes O(log n) time where 'log n' is the number of bits + * required to represent the maximum heap_index. In the worst case, the algo + * can take O((log n)^2) - check prio_tree_expand. + * + * If a prior node with same radix_index and heap_index is already found in + * the tree, then returns the address of the prior node. Otherwise, inserts + * @node into the tree and returns @node. 
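+ *
+ * A typical caller (sketch) sets node->start and node->last before
+ * the call, and treats a return value different from @node as "an
+ * equal interval was already stored".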
+ */ +struct prio_tree_node *prio_tree_insert(struct prio_tree_root *root, + struct prio_tree_node *node) +{ + struct prio_tree_node *cur, *res = node; + unsigned long radix_index, heap_index; + unsigned long r_index, h_index, index, mask; + int size_flag = 0; + + get_index(node, &radix_index, &heap_index); + + if (prio_tree_empty(root) || + heap_index > prio_tree_maxindex(root->index_bits)) + return prio_tree_expand(root, node, heap_index); + + cur = root->prio_tree_node; + mask = 1UL << (root->index_bits - 1); + + while (mask) { + get_index(cur, &r_index, &h_index); + + if (r_index == radix_index && h_index == heap_index) + return cur; + + if (h_index < heap_index || + (h_index == heap_index && r_index > radix_index)) { + struct prio_tree_node *tmp = node; + node = prio_tree_replace(root, cur, node); + cur = tmp; + /* swap indices */ + index = r_index; + r_index = radix_index; + radix_index = index; + index = h_index; + h_index = heap_index; + heap_index = index; + } + + if (size_flag) + index = heap_index - radix_index; + else + index = radix_index; + + if (index & mask) { + if (prio_tree_right_empty(cur)) { + INIT_PRIO_TREE_NODE(node); + cur->right = node; + node->parent = cur; + return res; + } else + cur = cur->right; + } else { + if (prio_tree_left_empty(cur)) { + INIT_PRIO_TREE_NODE(node); + cur->left = node; + node->parent = cur; + return res; + } else + cur = cur->left; + } + + mask >>= 1; + + if (!mask) { + mask = 1UL << (BITS_PER_LONG - 1); + size_flag = 1; + } + } + /* Should not reach here */ + assert(0); + return NULL; +} + +/* + * Remove a prio_tree_node @node from a radix priority search tree @root. The + * algorithm takes O(log n) time where 'log n' is the number of bits required + * to represent the maximum heap_index. + */ +void prio_tree_remove(struct prio_tree_root *root, struct prio_tree_node *node) +{ + struct prio_tree_node *cur; + unsigned long r_index, h_index_right, h_index_left; + + cur = node; + + while (!prio_tree_left_empty(cur) || !prio_tree_right_empty(cur)) { + if (!prio_tree_left_empty(cur)) + get_index(cur->left, &r_index, &h_index_left); + else { + cur = cur->right; + continue; + } + + if (!prio_tree_right_empty(cur)) + get_index(cur->right, &r_index, &h_index_right); + else { + cur = cur->left; + continue; + } + + /* both h_index_left and h_index_right cannot be 0 */ + if (h_index_left >= h_index_right) + cur = cur->left; + else + cur = cur->right; + } + + if (prio_tree_root(cur)) { + assert(root->prio_tree_node == cur); + INIT_PRIO_TREE_ROOT(root); + return; + } + + if (cur->parent->right == cur) + cur->parent->right = cur->parent; + else + cur->parent->left = cur->parent; + + while (cur != node) + cur = prio_tree_replace(root, cur->parent, cur); +} + +/* + * Following functions help to enumerate all prio_tree_nodes in the tree that + * overlap with the input interval X [radix_index, heap_index]. The enumeration + * takes O(log n + m) time where 'log n' is the height of the tree (which is + * proportional to # of bits required to represent the maximum heap_index) and + * 'm' is the number of prio_tree_nodes that overlap the interval X. 
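+ *
+ * The intended usage is (sketch; handle_overlap() stands in for the
+ * caller's code):
+ *
+ *	struct prio_tree_iter iter;
+ *	struct prio_tree_node *n;
+ *
+ *	prio_tree_iter_init(&iter, root, r_index, h_index);
+ *	while ((n = prio_tree_next(&iter)) != NULL)
+ *		handle_overlap(n);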
+ */ + +static struct prio_tree_node *prio_tree_left(struct prio_tree_iter *iter, + unsigned long *r_index, unsigned long *h_index) +{ + if (prio_tree_left_empty(iter->cur)) + return NULL; + + get_index(iter->cur->left, r_index, h_index); + + if (iter->r_index <= *h_index) { + iter->cur = iter->cur->left; + iter->mask >>= 1; + if (iter->mask) { + if (iter->size_level) + iter->size_level++; + } else { + if (iter->size_level) { + assert(prio_tree_left_empty(iter->cur)); + assert(prio_tree_right_empty(iter->cur)); + iter->size_level++; + iter->mask = ULONG_MAX; + } else { + iter->size_level = 1; + iter->mask = 1UL << (BITS_PER_LONG - 1); + } + } + return iter->cur; + } + + return NULL; +} + +static struct prio_tree_node *prio_tree_right(struct prio_tree_iter *iter, + unsigned long *r_index, unsigned long *h_index) +{ + unsigned long value; + + if (prio_tree_right_empty(iter->cur)) + return NULL; + + if (iter->size_level) + value = iter->value; + else + value = iter->value | iter->mask; + + if (iter->h_index < value) + return NULL; + + get_index(iter->cur->right, r_index, h_index); + + if (iter->r_index <= *h_index) { + iter->cur = iter->cur->right; + iter->mask >>= 1; + iter->value = value; + if (iter->mask) { + if (iter->size_level) + iter->size_level++; + } else { + if (iter->size_level) { + assert(prio_tree_left_empty(iter->cur)); + assert(prio_tree_right_empty(iter->cur)); + iter->size_level++; + iter->mask = ULONG_MAX; + } else { + iter->size_level = 1; + iter->mask = 1UL << (BITS_PER_LONG - 1); + } + } + return iter->cur; + } + + return NULL; +} + +static struct prio_tree_node *prio_tree_parent(struct prio_tree_iter *iter) +{ + iter->cur = iter->cur->parent; + if (iter->mask == ULONG_MAX) + iter->mask = 1UL; + else if (iter->size_level == 1) + iter->mask = 1UL; + else + iter->mask <<= 1; + if (iter->size_level) + iter->size_level--; + if (!iter->size_level && (iter->value & iter->mask)) + iter->value ^= iter->mask; + return iter->cur; +} + +static inline int overlap(struct prio_tree_iter *iter, + unsigned long r_index, unsigned long h_index) +{ + return iter->h_index >= r_index && iter->r_index <= h_index; +} + +/* + * prio_tree_first: + * + * Get the first prio_tree_node that overlaps with the interval [radix_index, + * heap_index]. Note that always radix_index <= heap_index. We do a pre-order + * traversal of the tree. 
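+ *
+ * For example, a query [5, 9] overlaps a node [9, 12] (9 >= 9 and
+ * 5 <= 12 both hold), but not a node [10, 12], since 9 >= 10 fails.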
+ */ +static struct prio_tree_node *prio_tree_first(struct prio_tree_iter *iter) +{ + struct prio_tree_root *root; + unsigned long r_index, h_index; + + INIT_PRIO_TREE_ITER(iter); + + root = iter->root; + if (prio_tree_empty(root)) + return NULL; + + get_index(root->prio_tree_node, &r_index, &h_index); + + if (iter->r_index > h_index) + return NULL; + + iter->mask = 1UL << (root->index_bits - 1); + iter->cur = root->prio_tree_node; + + while (1) { + if (overlap(iter, r_index, h_index)) + return iter->cur; + + if (prio_tree_left(iter, &r_index, &h_index)) + continue; + + if (prio_tree_right(iter, &r_index, &h_index)) + continue; + + break; + } + return NULL; +} + +/* + * prio_tree_next: + * + * Get the next prio_tree_node that overlaps with the input interval in iter + */ +struct prio_tree_node *prio_tree_next(struct prio_tree_iter *iter) +{ + unsigned long r_index, h_index; + + if (iter->cur == NULL) + return prio_tree_first(iter); + +repeat: + while (prio_tree_left(iter, &r_index, &h_index)) + if (overlap(iter, r_index, h_index)) + return iter->cur; + + while (!prio_tree_right(iter, &r_index, &h_index)) { + while (!prio_tree_root(iter->cur) && + iter->cur->parent->right == iter->cur) + prio_tree_parent(iter); + + if (prio_tree_root(iter->cur)) + return NULL; + + prio_tree_parent(iter); + } + + if (overlap(iter, r_index, h_index)) + return iter->cur; + + goto repeat; +} diff --git a/lib/prio_tree.h b/lib/prio_tree.h new file mode 100644 index 0000000..9bd458f --- /dev/null +++ b/lib/prio_tree.h @@ -0,0 +1,89 @@ +#ifndef _LINUX_PRIO_TREE_H +#define _LINUX_PRIO_TREE_H + +#include + +struct prio_tree_node { + struct prio_tree_node *left; + struct prio_tree_node *right; + struct prio_tree_node *parent; + uint64_t start; + uint64_t last; /* last location _in_ interval */ +}; + +struct prio_tree_root { + struct prio_tree_node *prio_tree_node; + unsigned short index_bits; +}; + +struct prio_tree_iter { + struct prio_tree_node *cur; + unsigned long mask; + unsigned long value; + int size_level; + + struct prio_tree_root *root; + uint64_t r_index; + uint64_t h_index; +}; + +static inline void prio_tree_iter_init(struct prio_tree_iter *iter, + struct prio_tree_root *root, uint64_t r_index, uint64_t h_index) +{ + iter->root = root; + iter->r_index = r_index; + iter->h_index = h_index; + iter->cur = NULL; +} + +#define INIT_PRIO_TREE_ROOT(ptr) \ +do { \ + (ptr)->prio_tree_node = NULL; \ + (ptr)->index_bits = 1; \ +} while (0) + +#define INIT_PRIO_TREE_NODE(ptr) \ +do { \ + (ptr)->left = (ptr)->right = (ptr)->parent = (ptr); \ +} while (0) + +#define INIT_PRIO_TREE_ITER(ptr) \ +do { \ + (ptr)->cur = NULL; \ + (ptr)->mask = 0UL; \ + (ptr)->value = 0UL; \ + (ptr)->size_level = 0; \ +} while (0) + +#define prio_tree_entry(ptr, type, member) \ + ((type *)((char *)(ptr)-(unsigned long)(&((type *)0)->member))) + +static inline int prio_tree_empty(const struct prio_tree_root *root) +{ + return root->prio_tree_node == NULL; +} + +static inline int prio_tree_root(const struct prio_tree_node *node) +{ + return node->parent == node; +} + +static inline int prio_tree_left_empty(const struct prio_tree_node *node) +{ + return node->left == node; +} + +static inline int prio_tree_right_empty(const struct prio_tree_node *node) +{ + return node->right == node; +} + + +struct prio_tree_node *prio_tree_replace(struct prio_tree_root *root, + struct prio_tree_node *old, struct prio_tree_node *node); +struct prio_tree_node *prio_tree_insert(struct prio_tree_root *root, + struct prio_tree_node *node); +void 
prio_tree_remove(struct prio_tree_root *root, struct prio_tree_node *node); +struct prio_tree_node *prio_tree_next(struct prio_tree_iter *iter); + +#endif /* _LINUX_PRIO_TREE_H */ diff --git a/lib/rand.c b/lib/rand.c new file mode 100644 index 0000000..69acb06 --- /dev/null +++ b/lib/rand.c @@ -0,0 +1,199 @@ +/* + This is a maximally equidistributed combined Tausworthe generator + based on code from GNU Scientific Library 1.5 (30 Jun 2004) + + x_n = (s1_n ^ s2_n ^ s3_n) + + s1_{n+1} = (((s1_n & 4294967294) <<12) ^ (((s1_n <<13) ^ s1_n) >>19)) + s2_{n+1} = (((s2_n & 4294967288) << 4) ^ (((s2_n << 2) ^ s2_n) >>25)) + s3_{n+1} = (((s3_n & 4294967280) <<17) ^ (((s3_n << 3) ^ s3_n) >>11)) + + The period of this generator is about 2^88. + + From: P. L'Ecuyer, "Maximally Equidistributed Combined Tausworthe + Generators", Mathematics of Computation, 65, 213 (1996), 203--213. + + This is available on the net from L'Ecuyer's home page, + + http://www.iro.umontreal.ca/~lecuyer/myftp/papers/tausme.ps + ftp://ftp.iro.umontreal.ca/pub/simulation/lecuyer/papers/tausme.ps + + There is an erratum in the paper "Tables of Maximally + Equidistributed Combined LFSR Generators", Mathematics of + Computation, 68, 225 (1999), 261--269: + http://www.iro.umontreal.ca/~lecuyer/myftp/papers/tausme2.ps + + ... the k_j most significant bits of z_j must be non- + zero, for each j. (Note: this restriction also applies to the + computer code given in [4], but was mistakenly not mentioned in + that paper.) + + This affects the seeding procedure by imposing the requirement + s1 > 1, s2 > 7, s3 > 15. + +*/ + +#include +#include "rand.h" +#include "pattern.h" +#include "../hash.h" + +int arch_random; + +static inline uint64_t __seed(uint64_t x, uint64_t m) +{ + return (x < m) ? x + m : x; +} + +static void __init_rand32(struct taus88_state *state, unsigned int seed) +{ + int cranks = 6; + +#define LCG(x, seed) ((x) * 69069 ^ (seed)) + + state->s1 = __seed(LCG((2^31) + (2^17) + (2^7), seed), 1); + state->s2 = __seed(LCG(state->s1, seed), 7); + state->s3 = __seed(LCG(state->s2, seed), 15); + + while (cranks--) + __rand32(state); +} + +static void __init_rand64(struct taus258_state *state, uint64_t seed) +{ + int cranks = 6; + +#define LCG64(x, seed) ((x) * 6906969069ULL ^ (seed)) + + state->s1 = __seed(LCG64((2^31) + (2^17) + (2^7), seed), 1); + state->s2 = __seed(LCG64(state->s1, seed), 7); + state->s3 = __seed(LCG64(state->s2, seed), 15); + state->s4 = __seed(LCG64(state->s3, seed), 33); + state->s5 = __seed(LCG64(state->s4, seed), 49); + + while (cranks--) + __rand64(state); +} + +void init_rand(struct frand_state *state, bool use64) +{ + state->use64 = use64; + + if (!use64) + __init_rand32(&state->state32, 1); + else + __init_rand64(&state->state64, 1); +} + +void init_rand_seed(struct frand_state *state, unsigned int seed, bool use64) +{ + state->use64 = use64; + + if (!use64) + __init_rand32(&state->state32, seed); + else + __init_rand64(&state->state64, seed); +} + +void __fill_random_buf(void *buf, unsigned int len, uint64_t seed) +{ + void *ptr = buf; + + while (len) { + int this_len; + + if (len >= sizeof(int64_t)) { + *((int64_t *) ptr) = seed; + this_len = sizeof(int64_t); + } else if (len >= sizeof(int32_t)) { + *((int32_t *) ptr) = seed; + this_len = sizeof(int32_t); + } else if (len >= sizeof(int16_t)) { + *((int16_t *) ptr) = seed; + this_len = sizeof(int16_t); + } else { + *((int8_t *) ptr) = seed; + this_len = sizeof(int8_t); + } + ptr += this_len; + len -= this_len; + seed *= GOLDEN_RATIO_PRIME; + seed >>= 
3;
+	}
+}
+
+uint64_t fill_random_buf(struct frand_state *fs, void *buf,
+			 unsigned int len)
+{
+	uint64_t r = __rand(fs);
+
+	if (sizeof(int) != sizeof(long *))
+		r *= (unsigned long) __rand(fs);
+
+	__fill_random_buf(buf, len, r);
+	return r;
+}
+
+void __fill_random_buf_percentage(uint64_t seed, void *buf,
+				  unsigned int percentage,
+				  unsigned int segment, unsigned int len,
+				  char *pattern, unsigned int pbytes)
+{
+	unsigned int this_len;
+
+	if (percentage == 100) {
+		if (pbytes)
+			(void)cpy_pattern(pattern, pbytes, buf, len);
+		else
+			memset(buf, 0, len);
+		return;
+	}
+
+	if (segment > len)
+		segment = len;
+
+	while (len) {
+		/*
+		 * Fill random chunk
+		 */
+		this_len = ((unsigned long long)segment * (100 - percentage)) / 100;
+		if (this_len > len)
+			this_len = len;
+
+		__fill_random_buf(buf, this_len, seed);
+
+		len -= this_len;
+		if (!len)
+			break;
+		buf += this_len;
+		this_len = segment - this_len;
+
+		if (this_len > len)
+			this_len = len;
+		else if (len - this_len <= sizeof(long))
+			this_len = len;
+
+		if (pbytes)
+			(void)cpy_pattern(pattern, pbytes, buf, this_len);
+		else
+			memset(buf, 0, this_len);
+
+		len -= this_len;
+		buf += this_len;
+	}
+}
+
+uint64_t fill_random_buf_percentage(struct frand_state *fs, void *buf,
+				    unsigned int percentage,
+				    unsigned int segment, unsigned int len,
+				    char *pattern, unsigned int pbytes)
+{
+	uint64_t r = __rand(fs);
+
+	if (sizeof(int) != sizeof(long *))
+		r *= (unsigned long) __rand(fs);
+
+	__fill_random_buf_percentage(r, buf, percentage, segment, len,
+					pattern, pbytes);
+	return r;
+}
diff --git a/lib/rand.h b/lib/rand.h
new file mode 100644
index 0000000..95d4f6d
--- /dev/null
+++ b/lib/rand.h
@@ -0,0 +1,158 @@
+#ifndef FIO_RAND_H
+#define FIO_RAND_H
+
+#include <inttypes.h>
+#include <assert.h>
+#include "types.h"
+
+#define FRAND32_MAX	(-1U)
+#define FRAND64_MAX	(-1ULL)
+
+struct taus88_state {
+	unsigned int s1, s2, s3;
+};
+
+struct taus258_state {
+	uint64_t s1, s2, s3, s4, s5;
+};
+
+struct frand_state {
+	unsigned int use64;
+	union {
+		struct taus88_state state32;
+		struct taus258_state state64;
+	};
+};
+
+static inline uint64_t rand_max(struct frand_state *state)
+{
+	if (state->use64)
+		return FRAND64_MAX;
+	else
+		return FRAND32_MAX;
+}
+
+static inline void __frand32_copy(struct taus88_state *dst,
+				  struct taus88_state *src)
+{
+	dst->s1 = src->s1;
+	dst->s2 = src->s2;
+	dst->s3 = src->s3;
+}
+
+static inline void __frand64_copy(struct taus258_state *dst,
+				  struct taus258_state *src)
+{
+	dst->s1 = src->s1;
+	dst->s2 = src->s2;
+	dst->s3 = src->s3;
+	dst->s4 = src->s4;
+	dst->s5 = src->s5;
+}
+
+static inline void frand_copy(struct frand_state *dst, struct frand_state *src)
+{
+	if (src->use64)
+		__frand64_copy(&dst->state64, &src->state64);
+	else
+		__frand32_copy(&dst->state32, &src->state32);
+
+	dst->use64 = src->use64;
+}
+
+static inline unsigned int __rand32(struct taus88_state *state)
+{
+#define TAUSWORTHE(s,a,b,c,d) (((s&c)<<d) ^ (((s<<a) ^ s)>>b))
+
+	state->s1 = TAUSWORTHE(state->s1, 13, 19, 4294967294UL, 12);
+	state->s2 = TAUSWORTHE(state->s2, 2, 25, 4294967288UL, 4);
+	state->s3 = TAUSWORTHE(state->s3, 3, 11, 4294967280UL, 17);
+
+	return (state->s1 ^ state->s2 ^ state->s3);
+}
+
+static inline uint64_t __rand64(struct taus258_state *state)
+{
+	uint64_t xval;
+
+	xval = ((state->s1 << 1) ^ state->s1) >> 53;
+	state->s1 = ((state->s1 & 18446744073709551614ULL) << 10) ^ xval;
+
+	xval = ((state->s2 << 24) ^ state->s2) >> 50;
+	state->s2 = ((state->s2 & 18446744073709551104ULL) << 5) ^ xval;
+
+	xval = ((state->s3 << 3) ^ state->s3) >> 23;
+	state->s3 =
((state->s3 & 18446744073709547520ULL) << 29) ^ xval; + + xval = ((state->s4 << 5) ^ state->s4) >> 24; + state->s4 = ((state->s4 & 18446744073709420544ULL) << 23) ^ xval; + + xval = ((state->s5 << 3) ^ state->s5) >> 33; + state->s5 = ((state->s5 & 18446744073701163008ULL) << 8) ^ xval; + + return (state->s1 ^ state->s2 ^ state->s3 ^ state->s4 ^ state->s5); +} + +static inline uint64_t __rand(struct frand_state *state) +{ + if (state->use64) + return __rand64(&state->state64); + else + return __rand32(&state->state32); +} + +static inline double __rand_0_1(struct frand_state *state) +{ + if (state->use64) { + uint64_t val = __rand64(&state->state64); + + return (val + 1.0) / (FRAND64_MAX + 1.0); + } else { + uint32_t val = __rand32(&state->state32); + + return (val + 1.0) / (FRAND32_MAX + 1.0); + } +} + +static inline uint32_t rand32_upto(struct frand_state *state, uint32_t end) +{ + uint32_t r; + + assert(!state->use64); + + r = __rand32(&state->state32); + end++; + return (int) ((double)end * (r / (FRAND32_MAX + 1.0))); +} + +static inline uint64_t rand64_upto(struct frand_state *state, uint64_t end) +{ + uint64_t r; + + assert(state->use64); + + r = __rand64(&state->state64); + end++; + return (uint64_t) ((double)end * (r / (FRAND64_MAX + 1.0))); +} + +/* + * Generate a random value between 'start' and 'end', both inclusive + */ +static inline uint64_t rand_between(struct frand_state *state, uint64_t start, + uint64_t end) +{ + if (state->use64) + return start + rand64_upto(state, end - start); + else + return start + rand32_upto(state, end - start); +} + +extern void init_rand(struct frand_state *, bool); +extern void init_rand_seed(struct frand_state *, unsigned int seed, bool); +extern void __fill_random_buf(void *buf, unsigned int len, uint64_t seed); +extern uint64_t fill_random_buf(struct frand_state *, void *buf, unsigned int len); +extern void __fill_random_buf_percentage(uint64_t, void *, unsigned int, unsigned int, unsigned int, char *, unsigned int); +extern uint64_t fill_random_buf_percentage(struct frand_state *, void *, unsigned int, unsigned int, unsigned int, char *, unsigned int); + +#endif diff --git a/lib/rbtree.c b/lib/rbtree.c new file mode 100644 index 0000000..6f0feae --- /dev/null +++ b/lib/rbtree.c @@ -0,0 +1,334 @@ +/* + Red Black Trees + (C) 1999 Andrea Arcangeli + (C) 2002 David Woodhouse + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. 
+ + linux/lib/rbtree.c +*/ + +#include "rbtree.h" + +static void __rb_rotate_left(struct fio_rb_node *node, struct rb_root *root) +{ + struct fio_rb_node *right = node->rb_right; + struct fio_rb_node *parent = rb_parent(node); + + if ((node->rb_right = right->rb_left)) + rb_set_parent(right->rb_left, node); + right->rb_left = node; + + rb_set_parent(right, parent); + + if (parent) + { + if (node == parent->rb_left) + parent->rb_left = right; + else + parent->rb_right = right; + } + else + root->rb_node = right; + rb_set_parent(node, right); +} + +static void __rb_rotate_right(struct fio_rb_node *node, struct rb_root *root) +{ + struct fio_rb_node *left = node->rb_left; + struct fio_rb_node *parent = rb_parent(node); + + if ((node->rb_left = left->rb_right)) + rb_set_parent(left->rb_right, node); + left->rb_right = node; + + rb_set_parent(left, parent); + + if (parent) + { + if (node == parent->rb_right) + parent->rb_right = left; + else + parent->rb_left = left; + } + else + root->rb_node = left; + rb_set_parent(node, left); +} + +void rb_insert_color(struct fio_rb_node *node, struct rb_root *root) +{ + struct fio_rb_node *parent, *gparent; + + while ((parent = rb_parent(node)) && rb_is_red(parent)) + { + gparent = rb_parent(parent); + + if (parent == gparent->rb_left) + { + { + register struct fio_rb_node *uncle = gparent->rb_right; + if (uncle && rb_is_red(uncle)) + { + rb_set_black(uncle); + rb_set_black(parent); + rb_set_red(gparent); + node = gparent; + continue; + } + } + + if (parent->rb_right == node) + { + register struct fio_rb_node *tmp; + __rb_rotate_left(parent, root); + tmp = parent; + parent = node; + node = tmp; + } + + rb_set_black(parent); + rb_set_red(gparent); + __rb_rotate_right(gparent, root); + } else { + { + register struct fio_rb_node *uncle = gparent->rb_left; + if (uncle && rb_is_red(uncle)) + { + rb_set_black(uncle); + rb_set_black(parent); + rb_set_red(gparent); + node = gparent; + continue; + } + } + + if (parent->rb_left == node) + { + register struct fio_rb_node *tmp; + __rb_rotate_right(parent, root); + tmp = parent; + parent = node; + node = tmp; + } + + rb_set_black(parent); + rb_set_red(gparent); + __rb_rotate_left(gparent, root); + } + } + + rb_set_black(root->rb_node); +} + +static void __rb_erase_color(struct fio_rb_node *node, + struct fio_rb_node *parent, + struct rb_root *root) +{ + struct fio_rb_node *other; + + while ((!node || rb_is_black(node)) && node != root->rb_node) + { + if (parent->rb_left == node) + { + other = parent->rb_right; + if (rb_is_red(other)) + { + rb_set_black(other); + rb_set_red(parent); + __rb_rotate_left(parent, root); + other = parent->rb_right; + } + if ((!other->rb_left || rb_is_black(other->rb_left)) && + (!other->rb_right || rb_is_black(other->rb_right))) + { + rb_set_red(other); + node = parent; + parent = rb_parent(node); + } + else + { + if (!other->rb_right || rb_is_black(other->rb_right)) + { + struct fio_rb_node *o_left; + if ((o_left = other->rb_left)) + rb_set_black(o_left); + rb_set_red(other); + __rb_rotate_right(other, root); + other = parent->rb_right; + } + rb_set_color(other, rb_color(parent)); + rb_set_black(parent); + if (other->rb_right) + rb_set_black(other->rb_right); + __rb_rotate_left(parent, root); + node = root->rb_node; + break; + } + } + else + { + other = parent->rb_left; + if (rb_is_red(other)) + { + rb_set_black(other); + rb_set_red(parent); + __rb_rotate_right(parent, root); + other = parent->rb_left; + } + if ((!other->rb_left || rb_is_black(other->rb_left)) && + (!other->rb_right || 
rb_is_black(other->rb_right))) + { + rb_set_red(other); + node = parent; + parent = rb_parent(node); + } + else + { + if (!other->rb_left || rb_is_black(other->rb_left)) + { + register struct fio_rb_node *o_right; + if ((o_right = other->rb_right)) + rb_set_black(o_right); + rb_set_red(other); + __rb_rotate_left(other, root); + other = parent->rb_left; + } + rb_set_color(other, rb_color(parent)); + rb_set_black(parent); + if (other->rb_left) + rb_set_black(other->rb_left); + __rb_rotate_right(parent, root); + node = root->rb_node; + break; + } + } + } + if (node) + rb_set_black(node); +} + +void rb_erase(struct fio_rb_node *node, struct rb_root *root) +{ + struct fio_rb_node *child, *parent; + int color; + + if (!node->rb_left) + child = node->rb_right; + else if (!node->rb_right) + child = node->rb_left; + else + { + struct fio_rb_node *old = node, *left; + + node = node->rb_right; + while ((left = node->rb_left) != NULL) + node = left; + child = node->rb_right; + parent = rb_parent(node); + color = rb_color(node); + + if (child) + rb_set_parent(child, parent); + if (parent == old) { + parent->rb_right = child; + parent = node; + } else + parent->rb_left = child; + + node->rb_parent_color = old->rb_parent_color; + node->rb_right = old->rb_right; + node->rb_left = old->rb_left; + + if (rb_parent(old)) + { + if (rb_parent(old)->rb_left == old) + rb_parent(old)->rb_left = node; + else + rb_parent(old)->rb_right = node; + } else + root->rb_node = node; + + rb_set_parent(old->rb_left, node); + if (old->rb_right) + rb_set_parent(old->rb_right, node); + goto color; + } + + parent = rb_parent(node); + color = rb_color(node); + + if (child) + rb_set_parent(child, parent); + if (parent) + { + if (parent->rb_left == node) + parent->rb_left = child; + else + parent->rb_right = child; + } + else + root->rb_node = child; + + color: + if (color == RB_BLACK) + __rb_erase_color(child, parent, root); +} + +/* + * This function returns the first node (in sort order) of the tree. + */ +struct fio_rb_node *rb_first(struct rb_root *root) +{ + struct fio_rb_node *n; + + n = root->rb_node; + if (!n) + return NULL; + while (n->rb_left) + n = n->rb_left; + return n; +} + +struct fio_rb_node *rb_next(const struct fio_rb_node *node) +{ + struct fio_rb_node *parent; + + if (RB_EMPTY_NODE(node)) + return NULL; + + /* + * If we have a right-hand child, go down and then left as far + * as we can. + */ + if (node->rb_right) { + node = node->rb_right; + while (node->rb_left) + node=node->rb_left; + return (struct fio_rb_node *)node; + } + + /* + * No right-hand children. Everything down and left is smaller than us, + * so any 'next' node must be in the general direction of our parent. + * Go up the tree; any time the ancestor is a right-hand child of its + * parent, keep going up. First time it's a left-hand child of its + * parent, said parent is our 'next' node. + */ + while ((parent = rb_parent(node)) && node == parent->rb_right) + node = parent; + + return parent; +} diff --git a/lib/rbtree.h b/lib/rbtree.h new file mode 100644 index 0000000..82ab97a --- /dev/null +++ b/lib/rbtree.h @@ -0,0 +1,156 @@ +/* + Red Black Trees + (C) 1999 Andrea Arcangeli + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. 
+
+  This program is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with this program; if not, write to the Free Software
+  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+
+  linux/include/linux/rbtree.h
+
+  To use rbtrees you'll have to implement your own insert and search cores.
+  This avoids callbacks, which would otherwise cost a dramatic amount of
+  performance. It's not the cleanest way, but it is how C (as opposed to
+  C++) gets both performance and genericity...
+
+  Examples of insert and search follow here. The search is a plain,
+  normal search over an ordered tree. The insert instead must be
+  implemented in two steps: first, insert the element in order as a red
+  leaf in the tree, then call the support library function
+  rb_insert_color(). That function does the non-trivial work of
+  rebalancing the rbtree when necessary.
+
+-----------------------------------------------------------------------
+static inline struct page * rb_search_page_cache(struct inode * inode,
+						 unsigned long offset)
+{
+	struct fio_rb_node * n = inode->i_rb_page_cache.rb_node;
+	struct page * page;
+
+	while (n)
+	{
+		page = rb_entry(n, struct page, rb_page_cache);
+
+		if (offset < page->offset)
+			n = n->rb_left;
+		else if (offset > page->offset)
+			n = n->rb_right;
+		else
+			return page;
+	}
+	return NULL;
+}
+
+static inline struct page * __rb_insert_page_cache(struct inode * inode,
+						   unsigned long offset,
+						   struct fio_rb_node * node)
+{
+	struct fio_rb_node ** p = &inode->i_rb_page_cache.rb_node;
+	struct fio_rb_node * parent = NULL;
+	struct page * page;
+
+	while (*p)
+	{
+		parent = *p;
+		page = rb_entry(parent, struct page, rb_page_cache);
+
+		if (offset < page->offset)
+			p = &(*p)->rb_left;
+		else if (offset > page->offset)
+			p = &(*p)->rb_right;
+		else
+			return page;
+	}
+
+	rb_link_node(node, parent, p);
+
+	return NULL;
+}
+
+static inline struct page * rb_insert_page_cache(struct inode * inode,
+						 unsigned long offset,
+						 struct fio_rb_node * node)
+{
+	struct page * ret;
+	if ((ret = __rb_insert_page_cache(inode, offset, node)))
+		goto out;
+	rb_insert_color(node, &inode->i_rb_page_cache);
+ out:
+	return ret;
+}
+-----------------------------------------------------------------------
+*/
+
+#ifndef _LINUX_RBTREE_H
+#define _LINUX_RBTREE_H
+
+#include <stdlib.h>
+#include <stdint.h>
+
+struct fio_rb_node
+{
+	intptr_t rb_parent_color;
+#define RB_RED		0
+#define RB_BLACK	1
+	struct fio_rb_node *rb_right;
+	struct fio_rb_node *rb_left;
+} __attribute__((aligned(sizeof(long))));
+    /* The alignment might seem pointless, but allegedly CRIS needs it */
+
+struct rb_root
+{
+	struct fio_rb_node *rb_node;
+};
+
+
+#define rb_parent(r)   ((struct fio_rb_node *)((r)->rb_parent_color & ~3))
+#define rb_color(r)   ((r)->rb_parent_color & 1)
+#define rb_is_red(r)   (!rb_color(r))
+#define rb_is_black(r) rb_color(r)
+#define rb_set_red(r)  do { (r)->rb_parent_color &= ~1; } while (0)
+#define rb_set_black(r)  do { (r)->rb_parent_color |= 1; } while (0)
+
+static inline void rb_set_parent(struct fio_rb_node *rb, struct fio_rb_node *p)
+{
+	rb->rb_parent_color = (rb->rb_parent_color & 3) | (uintptr_t)p;
+}
+static inline void rb_set_color(struct fio_rb_node *rb, int color)
+{
+	rb->rb_parent_color = (rb->rb_parent_color & ~1) | color;
+}
+
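+/*
+ * A further editorial example (not from the original source): once nodes
+ * are linked and colored as above, an in-order walk of the whole tree
+ * uses rb_first() and rb_next(), declared below:
+ *
+ *	struct fio_rb_node *n;
+ *
+ *	for (n = rb_first(&inode->i_rb_page_cache); n; n = rb_next(n)) {
+ *		struct page *page = rb_entry(n, struct page, rb_page_cache);
+ *		...
+ *	}
+ */
+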
+#define RB_ROOT	(struct rb_root) { NULL, }
+#define rb_entry(ptr, type, member) container_of(ptr, type, member)
+
+#define RB_EMPTY_ROOT(root)	((root)->rb_node == NULL)
+#define RB_EMPTY_NODE(node)	(rb_parent(node) == node)
+#define RB_CLEAR_NODE(node)	(rb_set_parent(node, node))
+
+extern void rb_insert_color(struct fio_rb_node *, struct rb_root *);
+extern void rb_erase(struct fio_rb_node *, struct rb_root *);
+
+/* Find logical next and previous nodes in a tree */
+extern struct fio_rb_node *rb_first(struct rb_root *);
+extern struct fio_rb_node *rb_next(const struct fio_rb_node *);
+
+static inline void rb_link_node(struct fio_rb_node * node,
+				struct fio_rb_node * parent,
+				struct fio_rb_node ** rb_link)
+{
+	node->rb_parent_color = (uintptr_t)parent;
+	node->rb_left = node->rb_right = NULL;
+
+	*rb_link = node;
+}
+
+#endif /* _LINUX_RBTREE_H */
diff --git a/lib/seqlock.h b/lib/seqlock.h
new file mode 100644
index 0000000..762b6ec
--- /dev/null
+++ b/lib/seqlock.h
@@ -0,0 +1,49 @@
+#ifndef FIO_SEQLOCK_H
+#define FIO_SEQLOCK_H
+
+#include "types.h"
+#include "../arch/arch.h"
+
+struct seqlock {
+	volatile int sequence;
+};
+
+static inline void seqlock_init(struct seqlock *s)
+{
+	s->sequence = 0;
+}
+
+static inline unsigned int read_seqlock_begin(struct seqlock *s)
+{
+	unsigned int seq;
+
+	do {
+		seq = s->sequence;
+		if (!(seq & 1))
+			break;
+		nop;
+	} while (1);
+
+	read_barrier();
+	return seq;
+}
+
+static inline bool read_seqlock_retry(struct seqlock *s, unsigned int seq)
+{
+	read_barrier();
+	return s->sequence != seq;
+}
+
+static inline void write_seqlock_begin(struct seqlock *s)
+{
+	s->sequence++;
+	write_barrier();
+}
+
+static inline void write_seqlock_end(struct seqlock *s)
+{
+	write_barrier();
+	s->sequence++;
+}
+
+#endif
diff --git a/lib/strntol.c b/lib/strntol.c
new file mode 100644
index 0000000..c3a55a1
--- /dev/null
+++ b/lib/strntol.c
@@ -0,0 +1,33 @@
+#include <stdlib.h>
+#include <limits.h>
+#include <string.h>
+
+#include "strntol.h"
+
+long strntol(const char *str, size_t sz, char **end, int base)
+{
+	/* Expect that the digit representation of LONG_MAX/MIN
+	 * is no longer than this buffer */
+	char buf[24];
+	long ret;
+	const char *beg = str;
+
+	/* Skip leading spaces */
+	for (; beg && sz && *beg == ' '; beg++, sz--)
+		;
+
+	if (!sz || sz >= sizeof(buf)) {
+		if (end)
+			*end = (char *)str;
+		return 0;
+	}
+
+	memcpy(buf, beg, sz);
+	buf[sz] = '\0';
+	ret = strtol(buf, end, base);
+	if (ret == LONG_MIN || ret == LONG_MAX)
+		return ret;
+	if (end)
+		*end = (char *)beg + (*end - buf);
+	return ret;
+}
diff --git a/lib/strntol.h b/lib/strntol.h
new file mode 100644
index 0000000..59c090d
--- /dev/null
+++ b/lib/strntol.h
@@ -0,0 +1,8 @@
+#ifndef FIO_STRNTOL_H
+#define FIO_STRNTOL_H
+
+#include <stddef.h>
+
+long strntol(const char *str, size_t sz, char **end, int base);
+
+#endif
diff --git a/lib/types.h b/lib/types.h
new file mode 100644
index 0000000..d92b064
--- /dev/null
+++ b/lib/types.h
@@ -0,0 +1,20 @@
+#ifndef FIO_TYPES_H
+#define FIO_TYPES_H
+
+#if !defined(CONFIG_HAVE_BOOL) && !defined(__cplusplus)
+typedef int bool;
+#ifndef false
+#define false	0
+#endif
+#ifndef true
+#define true	1
+#endif
+#else
+#include <stdbool.h> /* IWYU pragma: export */
+#endif
+
+#if !defined(CONFIG_HAVE_KERNEL_RWF_T)
+typedef int __kernel_rwf_t;
+#endif
+
+#endif
diff --git a/lib/zipf.c b/lib/zipf.c
new file mode 100644
index 0000000..321a4fb
--- /dev/null
+++ b/lib/zipf.c
@@ -0,0 +1,96 @@
+#include <math.h>
+#include <string.h>
+#include "zipf.h"
+#include "../minmax.h"
+#include "../hash.h"
+
+#define ZIPF_MAX_GEN	10000000UL
+
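+/*
+ * Editorial sketch of how the zipf/pareto generators below are driven
+ * (nranges, seed and n_draws are placeholders): returned values lie in
+ * [0, nranges), skewed according to theta.
+ *
+ *	struct zipf_state zs;
+ *	uint64_t i, off;
+ *
+ *	zipf_init(&zs, nranges, 1.2, seed);
+ *	for (i = 0; i < n_draws; i++)
+ *		off = zipf_next(&zs);
+ */
+static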
void zipf_update(struct zipf_state *zs) +{ + uint64_t to_gen; + unsigned int i; + + /* + * It can become very costly to generate long sequences. Just cap it at + * 10M max, that should be doable in 1-2s on even slow machines. + * Precision will take a slight hit, but nothing major. + */ + to_gen = min(zs->nranges, (uint64_t) ZIPF_MAX_GEN); + + for (i = 0; i < to_gen; i++) + zs->zetan += pow(1.0 / (double) (i + 1), zs->theta); +} + +static void shared_rand_init(struct zipf_state *zs, uint64_t nranges, + unsigned int seed) +{ + memset(zs, 0, sizeof(*zs)); + zs->nranges = nranges; + + init_rand_seed(&zs->rand, seed, 0); + zs->rand_off = __rand(&zs->rand); +} + +void zipf_init(struct zipf_state *zs, uint64_t nranges, double theta, + unsigned int seed) +{ + shared_rand_init(zs, nranges, seed); + + zs->theta = theta; + zs->zeta2 = pow(1.0, zs->theta) + pow(0.5, zs->theta); + + zipf_update(zs); +} + +uint64_t zipf_next(struct zipf_state *zs) +{ + double alpha, eta, rand_uni, rand_z; + unsigned long long n = zs->nranges; + unsigned long long val; + + alpha = 1.0 / (1.0 - zs->theta); + eta = (1.0 - pow(2.0 / n, 1.0 - zs->theta)) / (1.0 - zs->zeta2 / zs->zetan); + + rand_uni = (double) __rand(&zs->rand) / (double) FRAND32_MAX; + rand_z = rand_uni * zs->zetan; + + if (rand_z < 1.0) + val = 1; + else if (rand_z < (1.0 + pow(0.5, zs->theta))) + val = 2; + else + val = 1 + (unsigned long long)(n * pow(eta*rand_uni - eta + 1.0, alpha)); + + val--; + + if (!zs->disable_hash) + val = __hash_u64(val); + + return (val + zs->rand_off) % zs->nranges; +} + +void pareto_init(struct zipf_state *zs, uint64_t nranges, double h, + unsigned int seed) +{ + shared_rand_init(zs, nranges, seed); + zs->pareto_pow = log(h) / log(1.0 - h); +} + +uint64_t pareto_next(struct zipf_state *zs) +{ + double rand = (double) __rand(&zs->rand) / (double) FRAND32_MAX; + unsigned long long n; + + n = (zs->nranges - 1) * pow(rand, zs->pareto_pow); + + if (!zs->disable_hash) + n = __hash_u64(n); + + return (n + zs->rand_off) % zs->nranges; +} + +void zipf_disable_hash(struct zipf_state *zs) +{ + zs->disable_hash = true; +} diff --git a/lib/zipf.h b/lib/zipf.h new file mode 100644 index 0000000..16b65f5 --- /dev/null +++ b/lib/zipf.h @@ -0,0 +1,26 @@ +#ifndef FIO_ZIPF_H +#define FIO_ZIPF_H + +#include +#include "rand.h" +#include "types.h" + +struct zipf_state { + uint64_t nranges; + double theta; + double zeta2; + double zetan; + double pareto_pow; + struct frand_state rand; + uint64_t rand_off; + bool disable_hash; +}; + +void zipf_init(struct zipf_state *zs, uint64_t nranges, double theta, unsigned int seed); +uint64_t zipf_next(struct zipf_state *zs); + +void pareto_init(struct zipf_state *zs, uint64_t nranges, double h, unsigned int seed); +uint64_t pareto_next(struct zipf_state *zs); +void zipf_disable_hash(struct zipf_state *zs); + +#endif diff --git a/libfio.c b/libfio.c new file mode 100644 index 0000000..7348b16 --- /dev/null +++ b/libfio.c @@ -0,0 +1,436 @@ +/* + * fio - the flexible io tester + * + * Copyright (C) 2005 Jens Axboe + * Copyright (C) 2006-2012 Jens Axboe + * + * The license below covers all files distributed with fio unless otherwise + * noted in the file itself. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + * + */ + +#include +#include +#include +#include +#include + +#include "fio.h" +#include "smalloc.h" +#include "os/os.h" +#include "filelock.h" +#include "helper_thread.h" +#include "filehash.h" + +FLIST_HEAD(disk_list); + +unsigned long arch_flags = 0; + +uintptr_t page_mask = 0; +uintptr_t page_size = 0; + +/* see os/os.h */ +static const char *fio_os_strings[os_nr] = { + "Invalid", + "Linux", + "AIX", + "FreeBSD", + "HP-UX", + "OSX", + "NetBSD", + "OpenBSD", + "Solaris", + "Windows", + "Android", + "DragonFly", +}; + +/* see arch/arch.h */ +static const char *fio_arch_strings[arch_nr] = { + "Invalid", + "x86-64", + "x86", + "ppc", + "ia64", + "s390", + "alpha", + "sparc", + "sparc64", + "arm", + "sh", + "hppa", + "mips", + "aarch64", + "generic" +}; + +static void reset_io_counters(struct thread_data *td, int all) +{ + int ddir; + + if (all) { + for (ddir = 0; ddir < DDIR_RWDIR_CNT; ddir++) { + td->stat_io_bytes[ddir] = 0; + td->this_io_bytes[ddir] = 0; + td->stat_io_blocks[ddir] = 0; + td->this_io_blocks[ddir] = 0; + td->rate_bytes[ddir] = 0; + td->rate_blocks[ddir] = 0; + td->bytes_done[ddir] = 0; + td->rate_io_issue_bytes[ddir] = 0; + td->rate_next_io_time[ddir] = 0; + td->last_usec[ddir] = 0; + } + } + + td->zone_bytes = 0; + + td->last_was_sync = false; + td->rwmix_issues = 0; + + /* + * reset file done count if we are to start over + */ + if (td->o.time_based || td->o.loops || td->o.do_verify) + td->nr_done_files = 0; +} + +void clear_io_state(struct thread_data *td, int all) +{ + struct fio_file *f; + unsigned int i; + + reset_io_counters(td, all); + + close_files(td); + for_each_file(td, f, i) { + fio_file_clear_done(f); + f->file_offset = get_start_offset(td, f); + } + + /* + * Re-Seed random number generator if rand_repeatable is true + */ + if (td->o.rand_repeatable) + td_fill_rand_seeds(td); +} + +void reset_all_stats(struct thread_data *td) +{ + int i; + + reset_io_counters(td, 1); + + for (i = 0; i < DDIR_RWDIR_CNT; i++) { + td->io_bytes[i] = 0; + td->io_blocks[i] = 0; + td->io_issues[i] = 0; + td->ts.total_io_u[i] = 0; + td->ts.runtime[i] = 0; + td->rwmix_issues = 0; + } + + set_epoch_time(td, td->o.log_unix_epoch); + memcpy(&td->start, &td->epoch, sizeof(td->epoch)); + memcpy(&td->iops_sample_time, &td->epoch, sizeof(td->epoch)); + memcpy(&td->bw_sample_time, &td->epoch, sizeof(td->epoch)); + memcpy(&td->ss.prev_time, &td->epoch, sizeof(td->epoch)); + + lat_target_reset(td); + clear_rusage_stat(td); + helper_reset(); +} + +void reset_fio_state(void) +{ + groupid = 0; + thread_number = 0; + stat_number = 0; + done_secs = 0; +} + +const char *fio_get_os_string(int nr) +{ + if (nr < os_nr) + return fio_os_strings[nr]; + + return NULL; +} + +const char *fio_get_arch_string(int nr) +{ + if (nr < arch_nr) + return fio_arch_strings[nr]; + + return NULL; +} + +static const char *td_runstates[] = { + "NOT_CREATED", + "CREATED", + "INITIALIZED", + "RAMP", + "SETTING_UP", + "RUNNING", + "PRE_READING", + "VERIFYING", + "FSYNCING", + "FINISHING", + "EXITED", + "REAPED", +}; + +const char *runstate_to_name(int 
runstate) +{ + compiletime_assert(TD_LAST == 12, "td runstate list"); + if (runstate >= 0 && runstate < TD_LAST) + return td_runstates[runstate]; + + return "invalid"; +} + +void td_set_runstate(struct thread_data *td, int runstate) +{ + if (td->runstate == runstate) + return; + + dprint(FD_PROCESS, "pid=%d: runstate %s -> %s\n", (int) td->pid, + runstate_to_name(td->runstate), + runstate_to_name(runstate)); + td->runstate = runstate; +} + +int td_bump_runstate(struct thread_data *td, int new_state) +{ + int old_state = td->runstate; + + td_set_runstate(td, new_state); + return old_state; +} + +void td_restore_runstate(struct thread_data *td, int old_state) +{ + td_set_runstate(td, old_state); +} + +void fio_mark_td_terminate(struct thread_data *td) +{ + fio_gettime(&td->terminate_time, NULL); + write_barrier(); + td->terminate = true; +} + +void fio_terminate_threads(unsigned int group_id, unsigned int terminate) +{ + struct thread_data *td; + pid_t pid = getpid(); + int i; + + dprint(FD_PROCESS, "terminate group_id=%d\n", group_id); + + for_each_td(td, i) { + if ((terminate == TERMINATE_GROUP && group_id == TERMINATE_ALL) || + (terminate == TERMINATE_GROUP && group_id == td->groupid) || + (terminate == TERMINATE_STONEWALL && td->runstate >= TD_RUNNING) || + (terminate == TERMINATE_ALL)) { + dprint(FD_PROCESS, "setting terminate on %s/%d\n", + td->o.name, (int) td->pid); + + if (td->terminate) + continue; + + fio_mark_td_terminate(td); + td->o.start_delay = 0; + + /* + * if the thread is running, just let it exit + */ + if (!td->pid || pid == td->pid) + continue; + else if (td->runstate < TD_RAMP) + kill(td->pid, SIGTERM); + else { + struct ioengine_ops *ops = td->io_ops; + + if (ops && ops->terminate) + ops->terminate(td); + } + } + } +} + +int fio_running_or_pending_io_threads(void) +{ + struct thread_data *td; + int i; + int nr_io_threads = 0; + + for_each_td(td, i) { + if (td->io_ops_init && td_ioengine_flagged(td, FIO_NOIO)) + continue; + nr_io_threads++; + if (td->runstate < TD_EXITED) + return 1; + } + + if (!nr_io_threads) + return -1; /* we only had cpuio threads to begin with */ + return 0; +} + +int fio_set_fd_nonblocking(int fd, const char *who) +{ + int flags; + + flags = fcntl(fd, F_GETFL); + if (flags < 0) + log_err("fio: %s failed to get file flags: %s\n", who, strerror(errno)); + else { + int new_flags = flags | O_NONBLOCK; + + new_flags = fcntl(fd, F_SETFL, new_flags); + if (new_flags < 0) + log_err("fio: %s failed to get file flags: %s\n", who, strerror(errno)); + } + + return flags; +} + +enum { + ENDIAN_INVALID_BE = 1, + ENDIAN_INVALID_LE, + ENDIAN_INVALID_CONFIG, + ENDIAN_BROKEN, +}; + +static int endian_check(void) +{ + union { + uint8_t c[8]; + uint64_t v; + } u; + int le = 0, be = 0; + + u.v = 0x12; + if (u.c[7] == 0x12) + be = 1; + else if (u.c[0] == 0x12) + le = 1; + +#if defined(CONFIG_LITTLE_ENDIAN) + if (be) + return ENDIAN_INVALID_BE; +#elif defined(CONFIG_BIG_ENDIAN) + if (le) + return ENDIAN_INVALID_LE; +#else + return ENDIAN_INVALID_CONFIG; +#endif + + if (!le && !be) + return ENDIAN_BROKEN; + + return 0; +} + +int initialize_fio(char *envp[]) +{ + long ps; + int err; + + /* + * We need these to be properly 64-bit aligned, otherwise we + * can run into problems on archs that fault on unaligned fp + * access (ARM). 
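+ *
+ * (Editorial note: e.g. a uint64_t or double member that ended up at an
+ * offset which is not a multiple of 8 could fault on such systems; the
+ * offsetof() checks below are what guard against that.)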
+ */ + compiletime_assert((offsetof(struct thread_data, ts) % sizeof(void *)) == 0, "ts"); + compiletime_assert((offsetof(struct thread_stat, percentile_list) % 8) == 0, "stat percentile_list"); + compiletime_assert((offsetof(struct thread_stat, total_run_time) % 8) == 0, "total_run_time"); + compiletime_assert((offsetof(struct thread_stat, total_err_count) % 8) == 0, "total_err_count"); + compiletime_assert((offsetof(struct thread_stat, latency_percentile) % 8) == 0, "stat latency_percentile"); + compiletime_assert((offsetof(struct thread_data, ts.clat_stat) % 8) == 0, "ts.clat_stat"); + compiletime_assert((offsetof(struct thread_options_pack, zipf_theta) % 8) == 0, "zipf_theta"); + compiletime_assert((offsetof(struct thread_options_pack, pareto_h) % 8) == 0, "pareto_h"); + compiletime_assert((offsetof(struct thread_options_pack, percentile_list) % 8) == 0, "percentile_list"); + compiletime_assert((offsetof(struct thread_options_pack, latency_percentile) % 8) == 0, "latency_percentile"); + compiletime_assert((offsetof(struct jobs_eta, m_rate) % 8) == 0, "m_rate"); + + compiletime_assert(__TD_F_LAST <= TD_ENG_FLAG_SHIFT, "TD_ENG_FLAG_SHIFT"); + compiletime_assert(BSSPLIT_MAX <= ZONESPLIT_MAX, "bsssplit/zone max"); + + err = endian_check(); + if (err) { + log_err("fio: endianness settings appear wrong.\n"); + switch (err) { + case ENDIAN_INVALID_BE: + log_err("fio: got big-endian when configured for little\n"); + break; + case ENDIAN_INVALID_LE: + log_err("fio: got little-endian when configured for big\n"); + break; + case ENDIAN_INVALID_CONFIG: + log_err("fio: not configured to any endianness\n"); + break; + case ENDIAN_BROKEN: + log_err("fio: failed to detect endianness\n"); + break; + default: + assert(0); + break; + } + log_err("fio: please report this to fio@vger.kernel.org\n"); + return 1; + } + +#if !defined(CONFIG_GETTIMEOFDAY) && !defined(CONFIG_CLOCK_GETTIME) +#error "No available clock source!" +#endif + + arch_init(envp); + + sinit(); + + if (fio_filelock_init()) { + log_err("fio: failed initializing filelock subsys\n"); + return 1; + } + + file_hash_init(); + + /* + * We need locale for number printing, if it isn't set then just + * go with the US format. + */ + if (!getenv("LC_NUMERIC")) + setlocale(LC_NUMERIC, "en_US"); + + ps = sysconf(_SC_PAGESIZE); + if (ps < 0) { + log_err("Failed to get page size\n"); + return 1; + } + + page_size = ps; + page_mask = ps - 1; + + fio_keywords_init(); + return 0; +} + +void deinitialize_fio(void) +{ + fio_keywords_exit(); +} diff --git a/log.c b/log.c new file mode 100644 index 0000000..6c36813 --- /dev/null +++ b/log.c @@ -0,0 +1,148 @@ +#include +#include +#include +#include + +#include "fio.h" +#include "oslib/asprintf.h" + +size_t log_info_buf(const char *buf, size_t len) +{ + /* + * buf could be NULL (not just ""). 
+ */ + if (!buf) + return 0; + + if (is_backend) { + ssize_t ret = fio_server_text_output(FIO_LOG_INFO, buf, len); + if (ret != -1) + return ret; + } + + if (log_syslog) { + syslog(LOG_INFO, "%s", buf); + return len; + } else + return fwrite(buf, len, 1, f_out); +} + +size_t log_valist(const char *fmt, va_list args) +{ + char *buffer; + int len; + + len = vasprintf(&buffer, fmt, args); + if (len < 0) + return 0; + len = log_info_buf(buffer, len); + free(buffer); + + return len; +} + +/* add prefix for the specified type in front of the valist */ +void log_prevalist(int type, const char *fmt, va_list args) +{ + char *buf1, *buf2; + int len; + pid_t pid; + + pid = gettid(); + if (fio_debug_jobp && *fio_debug_jobp != -1U + && pid != *fio_debug_jobp) + return; + + len = vasprintf(&buf1, fmt, args); + if (len < 0) + return; + len = asprintf(&buf2, "%-8s %-5u %s", debug_levels[type].name, + (int) pid, buf1); + free(buf1); + if (len < 0) + return; + len = log_info_buf(buf2, len); + free(buf2); +} + +ssize_t log_info(const char *format, ...) +{ + va_list args; + ssize_t ret; + + va_start(args, format); + ret = log_valist(format, args); + va_end(args); + + return ret; +} + +size_t __log_buf(struct buf_output *buf, const char *format, ...) +{ + char *buffer; + va_list args; + int len; + + va_start(args, format); + len = vasprintf(&buffer, format, args); + va_end(args); + if (len < 0) + return 0; + len = buf_output_add(buf, buffer, len); + free(buffer); + + return len; +} + +int log_info_flush(void) +{ + if (is_backend || log_syslog) + return 0; + + return fflush(f_out); +} + +ssize_t log_err(const char *format, ...) +{ + ssize_t ret; + int len; + char *buffer; + va_list args; + + va_start(args, format); + len = vasprintf(&buffer, format, args); + va_end(args); + if (len < 0) + return len; + + if (is_backend) { + ret = fio_server_text_output(FIO_LOG_ERR, buffer, len); + if (ret != -1) + goto done; + } + + if (log_syslog) { + syslog(LOG_INFO, "%s", buffer); + ret = len; + } else { + if (f_err != stderr) + ret = fwrite(buffer, len, 1, stderr); + + ret = fwrite(buffer, len, 1, f_err); + } + +done: + free(buffer); + return ret; +} + +const char *log_get_level(int level) +{ + static const char *levels[] = { "Unknown", "Debug", "Info", "Error", + "Unknown" }; + + if (level >= FIO_LOG_NR) + level = FIO_LOG_NR; + + return levels[level]; +} diff --git a/log.h b/log.h new file mode 100644 index 0000000..562f3f4 --- /dev/null +++ b/log.h @@ -0,0 +1,40 @@ +#ifndef FIO_LOG_H +#define FIO_LOG_H + +#include +#include +#include + +#include "lib/output_buffer.h" + +extern FILE *f_out; +extern FILE *f_err; + +extern ssize_t log_err(const char *format, ...) __attribute__ ((__format__ (__printf__, 1, 2))); +extern ssize_t log_info(const char *format, ...) __attribute__ ((__format__ (__printf__, 1, 2))); +extern size_t __log_buf(struct buf_output *, const char *format, ...) __attribute__ ((__format__ (__printf__, 2, 3))); +extern size_t log_valist(const char *str, va_list); +extern void log_prevalist(int type, const char *str, va_list); +extern size_t log_info_buf(const char *buf, size_t len); +extern int log_info_flush(void); + +#define log_buf(buf, format, args...) 
\ +({ \ + size_t __ret; \ + if ((buf) != NULL) \ + __ret = __log_buf(buf, format, ##args); \ + else \ + __ret = log_info(format, ##args); \ + __ret; \ +}) + +enum { + FIO_LOG_DEBUG = 1, + FIO_LOG_INFO = 2, + FIO_LOG_ERR = 3, + FIO_LOG_NR = 4, +}; + +extern const char *log_get_level(int level); + +#endif diff --git a/memory.c b/memory.c new file mode 100644 index 0000000..5f0225f --- /dev/null +++ b/memory.c @@ -0,0 +1,362 @@ +/* + * Memory helpers + */ +#include +#include +#include +#include + +#include "fio.h" +#ifndef FIO_NO_HAVE_SHM_H +#include +#endif + +void fio_unpin_memory(struct thread_data *td) +{ + if (td->pinned_mem) { + dprint(FD_MEM, "unpinning %llu bytes\n", td->o.lockmem); + if (munlock(td->pinned_mem, td->o.lockmem) < 0) + perror("munlock"); + munmap(td->pinned_mem, td->o.lockmem); + td->pinned_mem = NULL; + } +} + +int fio_pin_memory(struct thread_data *td) +{ + unsigned long long phys_mem; + + if (!td->o.lockmem) + return 0; + + dprint(FD_MEM, "pinning %llu bytes\n", td->o.lockmem); + + /* + * Don't allow mlock of more than real_mem-128MiB + */ + phys_mem = os_phys_mem(); + if (phys_mem) { + if ((td->o.lockmem + 128 * 1024 * 1024) > phys_mem) { + td->o.lockmem = phys_mem - 128 * 1024 * 1024; + log_info("fio: limiting mlocked memory to %lluMiB\n", + td->o.lockmem >> 20); + } + } + + td->pinned_mem = mmap(NULL, td->o.lockmem, PROT_READ | PROT_WRITE, + MAP_PRIVATE | OS_MAP_ANON, -1, 0); + if (td->pinned_mem == MAP_FAILED) { + perror("malloc locked mem"); + td->pinned_mem = NULL; + return 1; + } + if (mlock(td->pinned_mem, td->o.lockmem) < 0) { + perror("mlock"); + munmap(td->pinned_mem, td->o.lockmem); + td->pinned_mem = NULL; + return 1; + } + + return 0; +} + +static int alloc_mem_shm(struct thread_data *td, unsigned int total_mem) +{ +#ifndef CONFIG_NO_SHM + int flags = IPC_CREAT | S_IRUSR | S_IWUSR; + + if (td->o.mem_type == MEM_SHMHUGE) { + unsigned long mask = td->o.hugepage_size - 1; + + flags |= SHM_HUGETLB; + total_mem = (total_mem + mask) & ~mask; + } + + td->shm_id = shmget(IPC_PRIVATE, total_mem, flags); + dprint(FD_MEM, "shmget %u, %d\n", total_mem, td->shm_id); + if (td->shm_id < 0) { + td_verror(td, errno, "shmget"); + if (geteuid() != 0 && (errno == ENOMEM || errno == EPERM)) + log_err("fio: you may need to run this job as root\n"); + if (td->o.mem_type == MEM_SHMHUGE) { + if (errno == EINVAL) { + log_err("fio: check that you have free huge" + " pages and that hugepage-size is" + " correct.\n"); + } else if (errno == ENOSYS) { + log_err("fio: your system does not appear to" + " support huge pages.\n"); + } else if (errno == ENOMEM) { + log_err("fio: no huge pages available, do you" + " need to allocate some? 
See HOWTO.\n"); + } + } + + return 1; + } + + td->orig_buffer = shmat(td->shm_id, NULL, 0); + dprint(FD_MEM, "shmat %d, %p\n", td->shm_id, td->orig_buffer); + if (td->orig_buffer == (void *) -1) { + td_verror(td, errno, "shmat"); + td->orig_buffer = NULL; + return 1; + } + + return 0; +#else + log_err("fio: shm not supported\n"); + return 1; +#endif +} + +static void free_mem_shm(struct thread_data *td) +{ +#ifndef CONFIG_NO_SHM + struct shmid_ds sbuf; + + dprint(FD_MEM, "shmdt/ctl %d %p\n", td->shm_id, td->orig_buffer); + shmdt(td->orig_buffer); + shmctl(td->shm_id, IPC_RMID, &sbuf); +#endif +} + +static int alloc_mem_mmap(struct thread_data *td, size_t total_mem) +{ + int flags = 0; + + td->mmapfd = -1; + + if (td->o.mem_type == MEM_MMAPHUGE) { + unsigned long mask = td->o.hugepage_size - 1; + + /* TODO: make sure the file is a real hugetlbfs file */ + if (!td->o.mmapfile) + flags |= MAP_HUGETLB; + total_mem = (total_mem + mask) & ~mask; + } + + if (td->o.mmapfile) { + if (access(td->o.mmapfile, F_OK) == 0) + td->flags |= TD_F_MMAP_KEEP; + + td->mmapfd = open(td->o.mmapfile, O_RDWR|O_CREAT, 0644); + + if (td->mmapfd < 0) { + td_verror(td, errno, "open mmap file"); + td->orig_buffer = NULL; + return 1; + } + if (td->o.mem_type != MEM_MMAPHUGE && + td->o.mem_type != MEM_MMAPSHARED && + ftruncate(td->mmapfd, total_mem) < 0) { + td_verror(td, errno, "truncate mmap file"); + td->orig_buffer = NULL; + return 1; + } + if (td->o.mem_type == MEM_MMAPHUGE || + td->o.mem_type == MEM_MMAPSHARED) + flags |= MAP_SHARED; + else + flags |= MAP_PRIVATE; + } else + flags |= OS_MAP_ANON | MAP_PRIVATE; + + td->orig_buffer = mmap(NULL, total_mem, PROT_READ | PROT_WRITE, flags, + td->mmapfd, 0); + dprint(FD_MEM, "mmap %llu/%d %p\n", (unsigned long long) total_mem, + td->mmapfd, td->orig_buffer); + if (td->orig_buffer == MAP_FAILED) { + td_verror(td, errno, "mmap"); + td->orig_buffer = NULL; + if (td->mmapfd != 1 && td->mmapfd != -1) { + close(td->mmapfd); + if (td->o.mmapfile && !(td->flags & TD_F_MMAP_KEEP)) + unlink(td->o.mmapfile); + } + + return 1; + } + + return 0; +} + +static void free_mem_mmap(struct thread_data *td, size_t total_mem) +{ + dprint(FD_MEM, "munmap %llu %p\n", (unsigned long long) total_mem, + td->orig_buffer); + munmap(td->orig_buffer, td->orig_buffer_size); + if (td->o.mmapfile) { + if (td->mmapfd != -1) + close(td->mmapfd); + if (!(td->flags & TD_F_MMAP_KEEP)) + unlink(td->o.mmapfile); + free(td->o.mmapfile); + } +} + +static int alloc_mem_malloc(struct thread_data *td, size_t total_mem) +{ + td->orig_buffer = malloc(total_mem); + dprint(FD_MEM, "malloc %llu %p\n", (unsigned long long) total_mem, + td->orig_buffer); + + return td->orig_buffer == NULL; +} + +static void free_mem_malloc(struct thread_data *td) +{ + dprint(FD_MEM, "free malloc mem %p\n", td->orig_buffer); + free(td->orig_buffer); +} + +static int alloc_mem_cudamalloc(struct thread_data *td, size_t total_mem) +{ +#ifdef CONFIG_CUDA + CUresult ret; + char name[128]; + + ret = cuInit(0); + if (ret != CUDA_SUCCESS) { + log_err("fio: failed initialize cuda driver api\n"); + return 1; + } + + ret = cuDeviceGetCount(&td->gpu_dev_cnt); + if (ret != CUDA_SUCCESS) { + log_err("fio: failed get device count\n"); + return 1; + } + dprint(FD_MEM, "found %d GPU devices\n", td->gpu_dev_cnt); + + if (td->gpu_dev_cnt == 0) { + log_err("fio: no GPU device found. 
" + "Can not perform GPUDirect RDMA.\n"); + return 1; + } + + td->gpu_dev_id = td->o.gpu_dev_id; + ret = cuDeviceGet(&td->cu_dev, td->gpu_dev_id); + if (ret != CUDA_SUCCESS) { + log_err("fio: failed get GPU device\n"); + return 1; + } + + ret = cuDeviceGetName(name, sizeof(name), td->gpu_dev_id); + if (ret != CUDA_SUCCESS) { + log_err("fio: failed get device name\n"); + return 1; + } + dprint(FD_MEM, "dev_id = [%d], device name = [%s]\n", \ + td->gpu_dev_id, name); + + ret = cuCtxCreate(&td->cu_ctx, CU_CTX_MAP_HOST, td->cu_dev); + if (ret != CUDA_SUCCESS) { + log_err("fio: failed to create cuda context: %d\n", ret); + return 1; + } + + ret = cuMemAlloc(&td->dev_mem_ptr, total_mem); + if (ret != CUDA_SUCCESS) { + log_err("fio: cuMemAlloc %zu bytes failed\n", total_mem); + return 1; + } + td->orig_buffer = (void *) td->dev_mem_ptr; + + dprint(FD_MEM, "cudaMalloc %llu %p\n", \ + (unsigned long long) total_mem, td->orig_buffer); + return 0; +#else + return -EINVAL; +#endif +} + +static void free_mem_cudamalloc(struct thread_data *td) +{ +#ifdef CONFIG_CUDA + if (td->dev_mem_ptr != NULL) + cuMemFree(td->dev_mem_ptr); + + if (cuCtxDestroy(td->cu_ctx) != CUDA_SUCCESS) + log_err("fio: failed to destroy cuda context\n"); +#endif +} + +/* + * Set up the buffer area we need for io. + */ +int allocate_io_mem(struct thread_data *td) +{ + size_t total_mem; + int ret = 0; + + if (td_ioengine_flagged(td, FIO_NOIO)) + return 0; + + total_mem = td->orig_buffer_size; + + if (td->o.odirect || td->o.mem_align || td->o.oatomic || + td_ioengine_flagged(td, FIO_MEMALIGN)) { + total_mem += page_mask; + if (td->o.mem_align && td->o.mem_align > page_size) + total_mem += td->o.mem_align - page_size; + } + + dprint(FD_MEM, "Alloc %llu for buffers\n", (unsigned long long) total_mem); + + /* + * If the IO engine has hooks to allocate/free memory, use those. But + * error out if the user explicitly asked for something else. 
+ */ + if (td->io_ops->iomem_alloc) { + if (fio_option_is_set(&td->o, mem_type)) { + log_err("fio: option 'mem/iomem' conflicts with specified IO engine\n"); + ret = 1; + } else + ret = td->io_ops->iomem_alloc(td, total_mem); + } else if (td->o.mem_type == MEM_MALLOC) + ret = alloc_mem_malloc(td, total_mem); + else if (td->o.mem_type == MEM_SHM || td->o.mem_type == MEM_SHMHUGE) + ret = alloc_mem_shm(td, total_mem); + else if (td->o.mem_type == MEM_MMAP || td->o.mem_type == MEM_MMAPHUGE || + td->o.mem_type == MEM_MMAPSHARED) + ret = alloc_mem_mmap(td, total_mem); + else if (td->o.mem_type == MEM_CUDA_MALLOC) + ret = alloc_mem_cudamalloc(td, total_mem); + else { + log_err("fio: bad mem type: %d\n", td->o.mem_type); + ret = 1; + } + + if (ret) + td_verror(td, ENOMEM, "iomem allocation"); + + return ret; +} + +void free_io_mem(struct thread_data *td) +{ + unsigned int total_mem; + + total_mem = td->orig_buffer_size; + if (td->o.odirect || td->o.oatomic) + total_mem += page_mask; + + if (td->io_ops->iomem_alloc) { + if (td->io_ops->iomem_free) + td->io_ops->iomem_free(td); + } else if (td->o.mem_type == MEM_MALLOC) + free_mem_malloc(td); + else if (td->o.mem_type == MEM_SHM || td->o.mem_type == MEM_SHMHUGE) + free_mem_shm(td); + else if (td->o.mem_type == MEM_MMAP || td->o.mem_type == MEM_MMAPHUGE || + td->o.mem_type == MEM_MMAPSHARED) + free_mem_mmap(td, total_mem); + else if (td->o.mem_type == MEM_CUDA_MALLOC) + free_mem_cudamalloc(td); + else + log_err("Bad memory type %u\n", td->o.mem_type); + + td->orig_buffer = NULL; + td->orig_buffer_size = 0; +} diff --git a/minmax.h b/minmax.h new file mode 100644 index 0000000..ec0848c --- /dev/null +++ b/minmax.h @@ -0,0 +1,25 @@ +#ifndef FIO_MIN_MAX_H +#define FIO_MIN_MAX_H + +#ifndef min +#define min(x,y) ({ \ + __typeof__(x) _x = (x); \ + __typeof__(y) _y = (y); \ + (void) (&_x == &_y); \ + _x < _y ? _x : _y; }) +#endif + +#ifndef max +#define max(x,y) ({ \ + __typeof__(x) _x = (x); \ + __typeof__(y) _y = (y); \ + (void) (&_x == &_y); \ + _x > _y ? _x : _y; }) +#endif + +#define min_not_zero(x, y) ({ \ + __typeof__(x) __x = (x); \ + __typeof__(y) __y = (y); \ + __x == 0 ? __y : ((__y == 0) ? 
__x : min(__x, __y)); }) + +#endif diff --git a/optgroup.c b/optgroup.c new file mode 100644 index 0000000..c228ff2 --- /dev/null +++ b/optgroup.c @@ -0,0 +1,213 @@ +#include +#include +#include "optgroup.h" +#include "compiler/compiler.h" + +/* + * Option grouping + */ +static const struct opt_group fio_opt_groups[] = { + { + .name = "General", + .mask = FIO_OPT_C_GENERAL, + }, + { + .name = "I/O", + .mask = FIO_OPT_C_IO, + }, + { + .name = "File", + .mask = FIO_OPT_C_FILE, + }, + { + .name = "Statistics", + .mask = FIO_OPT_C_STAT, + }, + { + .name = "Logging", + .mask = FIO_OPT_C_LOG, + }, + { + .name = "Profiles", + .mask = FIO_OPT_C_PROFILE, + }, + { + .name = "I/O engines", + .mask = FIO_OPT_C_ENGINE, + }, + { + .name = NULL, + }, +}; + +static const struct opt_group fio_opt_cat_groups[] = { + { + .name = "Rate", + .mask = FIO_OPT_G_RATE, + }, + { + .name = "Zone", + .mask = FIO_OPT_G_ZONE, + }, + { + .name = "Read/write mix", + .mask = FIO_OPT_G_RWMIX, + }, + { + .name = "Verify", + .mask = FIO_OPT_G_VERIFY, + }, + { + .name = "Trim", + .mask = FIO_OPT_G_TRIM, + }, + { + .name = "I/O Logging", + .mask = FIO_OPT_G_IOLOG, + }, + { + .name = "I/O Depth", + .mask = FIO_OPT_G_IO_DEPTH, + }, + { + .name = "I/O Flow", + .mask = FIO_OPT_G_IO_FLOW, + }, + { + .name = "Description", + .mask = FIO_OPT_G_DESC, + }, + { + .name = "Filename", + .mask = FIO_OPT_G_FILENAME, + }, + { + .name = "General I/O", + .mask = FIO_OPT_G_IO_BASIC, + }, + { + .name = "Cgroups", + .mask = FIO_OPT_G_CGROUP, + }, + { + .name = "Runtime", + .mask = FIO_OPT_G_RUNTIME, + }, + { + .name = "Process", + .mask = FIO_OPT_G_PROCESS, + }, + { + .name = "Job credentials / priority", + .mask = FIO_OPT_G_CRED, + }, + { + .name = "Clock settings", + .mask = FIO_OPT_G_CLOCK, + }, + { + .name = "I/O Type", + .mask = FIO_OPT_G_IO_TYPE, + }, + { + .name = "I/O Thinktime", + .mask = FIO_OPT_G_THINKTIME, + }, + { + .name = "Randomizations", + .mask = FIO_OPT_G_RANDOM, + }, + { + .name = "I/O buffers", + .mask = FIO_OPT_G_IO_BUF, + }, + { + .name = "Tiobench profile", + .mask = FIO_OPT_G_TIOBENCH, + }, + { + .name = "Error handling", + .mask = FIO_OPT_G_ERR, + }, + { + .name = "Ext4 defrag I/O engine", /* e4defrag */ + .mask = FIO_OPT_G_E4DEFRAG, + }, + { + .name = "Network I/O engine", /* net */ + .mask = FIO_OPT_G_NETIO, + }, + { + .name = "RDMA I/O engine", /* rdma */ + .mask = FIO_OPT_G_RDMA, + }, + { + .name = "libaio I/O engine", /* libaio */ + .mask = FIO_OPT_G_LIBAIO, + }, + { + .name = "ACT Aerospike like benchmark profile", + .mask = FIO_OPT_G_ACT, + }, + { + .name = "Latency profiling", + .mask = FIO_OPT_G_LATPROF, + }, + { + .name = "RBD I/O engine", /* rbd */ + .mask = FIO_OPT_G_RBD, + }, + { + .name = "GlusterFS I/O engine", /* gfapi,gfapi_async */ + .mask = FIO_OPT_G_GFAPI, + }, + { + .name = "MTD I/O engine", /* mtd */ + .mask = FIO_OPT_G_MTD, + }, + { + .name = "libhdfs I/O engine", /* libhdfs */ + .mask = FIO_OPT_G_HDFS, + }, + { + .name = "NBD I/O engine", /* NBD */ + .mask = FIO_OPT_G_NBD, + }, + { + .name = NULL, + }, +}; + +static const struct opt_group *group_from_mask(const struct opt_group *ogs, + uint64_t *mask, + uint64_t inv_mask) +{ + int i; + + if (*mask == inv_mask || !*mask) + return NULL; + + for (i = 0; ogs[i].name; i++) { + const struct opt_group *og = &ogs[i]; + + if (*mask & og->mask) { + *mask &= ~(og->mask); + return og; + } + } + + return NULL; +} + +const struct opt_group *opt_group_from_mask(uint64_t *mask) +{ + return group_from_mask(fio_opt_groups, mask, FIO_OPT_C_INVALID); +} + +const 
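+/*
+ * Editorial sketch of the intended calling pattern (category_bits is a
+ * placeholder): each call strips one matching group bit from *mask, so
+ * callers can loop until no group matches.
+ *
+ *	uint64_t mask = category_bits;
+ *	const struct opt_group *og;
+ *
+ *	while ((og = opt_group_from_mask(&mask)) != NULL)
+ *		printf("%s\n", og->name);
+ */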
struct opt_group *opt_group_cat_from_mask(uint64_t *mask) +{ + compiletime_assert(__FIO_OPT_G_NR <= 8 * sizeof(uint64_t), + "__FIO_OPT_G_NR"); + + return group_from_mask(fio_opt_cat_groups, mask, FIO_OPT_G_INVALID); +} diff --git a/optgroup.h b/optgroup.h new file mode 100644 index 0000000..5789afd --- /dev/null +++ b/optgroup.h @@ -0,0 +1,116 @@ +#ifndef FIO_OPT_GROUP_H +#define FIO_OPT_GROUP_H + +struct opt_group { + const char *name; + uint64_t mask; +}; + +enum opt_category { + __FIO_OPT_C_GENERAL = 0, + __FIO_OPT_C_IO, + __FIO_OPT_C_FILE, + __FIO_OPT_C_STAT, + __FIO_OPT_C_LOG, + __FIO_OPT_C_PROFILE, + __FIO_OPT_C_ENGINE, + __FIO_OPT_C_NR, + + FIO_OPT_C_GENERAL = (1ULL << __FIO_OPT_C_GENERAL), + FIO_OPT_C_IO = (1ULL << __FIO_OPT_C_IO), + FIO_OPT_C_FILE = (1ULL << __FIO_OPT_C_FILE), + FIO_OPT_C_STAT = (1ULL << __FIO_OPT_C_STAT), + FIO_OPT_C_LOG = (1ULL << __FIO_OPT_C_LOG), + FIO_OPT_C_PROFILE = (1ULL << __FIO_OPT_C_PROFILE), + FIO_OPT_C_ENGINE = (1ULL << __FIO_OPT_C_ENGINE), + FIO_OPT_C_INVALID = (1ULL << __FIO_OPT_C_NR), +}; + +enum opt_category_group { + __FIO_OPT_G_RATE = 0, + __FIO_OPT_G_ZONE, + __FIO_OPT_G_RWMIX, + __FIO_OPT_G_VERIFY, + __FIO_OPT_G_TRIM, + __FIO_OPT_G_IOLOG, + __FIO_OPT_G_IO_DEPTH, + __FIO_OPT_G_IO_FLOW, + __FIO_OPT_G_DESC, + __FIO_OPT_G_FILENAME, + __FIO_OPT_G_IO_BASIC, + __FIO_OPT_G_CGROUP, + __FIO_OPT_G_RUNTIME, + __FIO_OPT_G_PROCESS, + __FIO_OPT_G_CRED, + __FIO_OPT_G_CLOCK, + __FIO_OPT_G_IO_TYPE, + __FIO_OPT_G_THINKTIME, + __FIO_OPT_G_RANDOM, + __FIO_OPT_G_IO_BUF, + __FIO_OPT_G_TIOBENCH, + __FIO_OPT_G_ERR, + __FIO_OPT_G_E4DEFRAG, + __FIO_OPT_G_NETIO, + __FIO_OPT_G_RDMA, + __FIO_OPT_G_LIBAIO, + __FIO_OPT_G_ACT, + __FIO_OPT_G_LATPROF, + __FIO_OPT_G_RBD, + __FIO_OPT_G_HTTP, + __FIO_OPT_G_GFAPI, + __FIO_OPT_G_MTD, + __FIO_OPT_G_HDFS, + __FIO_OPT_G_SG, + __FIO_OPT_G_MMAP, + __FIO_OPT_G_ISCSI, + __FIO_OPT_G_NBD, + __FIO_OPT_G_IOURING, + __FIO_OPT_G_FILESTAT, + __FIO_OPT_G_NR, + + FIO_OPT_G_RATE = (1ULL << __FIO_OPT_G_RATE), + FIO_OPT_G_ZONE = (1ULL << __FIO_OPT_G_ZONE), + FIO_OPT_G_RWMIX = (1ULL << __FIO_OPT_G_RWMIX), + FIO_OPT_G_VERIFY = (1ULL << __FIO_OPT_G_VERIFY), + FIO_OPT_G_TRIM = (1ULL << __FIO_OPT_G_TRIM), + FIO_OPT_G_IOLOG = (1ULL << __FIO_OPT_G_IOLOG), + FIO_OPT_G_IO_DEPTH = (1ULL << __FIO_OPT_G_IO_DEPTH), + FIO_OPT_G_IO_FLOW = (1ULL << __FIO_OPT_G_IO_FLOW), + FIO_OPT_G_DESC = (1ULL << __FIO_OPT_G_DESC), + FIO_OPT_G_FILENAME = (1ULL << __FIO_OPT_G_FILENAME), + FIO_OPT_G_IO_BASIC = (1ULL << __FIO_OPT_G_IO_BASIC), + FIO_OPT_G_CGROUP = (1ULL << __FIO_OPT_G_CGROUP), + FIO_OPT_G_RUNTIME = (1ULL << __FIO_OPT_G_RUNTIME), + FIO_OPT_G_PROCESS = (1ULL << __FIO_OPT_G_PROCESS), + FIO_OPT_G_CRED = (1ULL << __FIO_OPT_G_CRED), + FIO_OPT_G_CLOCK = (1ULL << __FIO_OPT_G_CLOCK), + FIO_OPT_G_IO_TYPE = (1ULL << __FIO_OPT_G_IO_TYPE), + FIO_OPT_G_THINKTIME = (1ULL << __FIO_OPT_G_THINKTIME), + FIO_OPT_G_RANDOM = (1ULL << __FIO_OPT_G_RANDOM), + FIO_OPT_G_IO_BUF = (1ULL << __FIO_OPT_G_IO_BUF), + FIO_OPT_G_TIOBENCH = (1ULL << __FIO_OPT_G_TIOBENCH), + FIO_OPT_G_ERR = (1ULL << __FIO_OPT_G_ERR), + FIO_OPT_G_E4DEFRAG = (1ULL << __FIO_OPT_G_E4DEFRAG), + FIO_OPT_G_NETIO = (1ULL << __FIO_OPT_G_NETIO), + FIO_OPT_G_RDMA = (1ULL << __FIO_OPT_G_RDMA), + FIO_OPT_G_LIBAIO = (1ULL << __FIO_OPT_G_LIBAIO), + FIO_OPT_G_ACT = (1ULL << __FIO_OPT_G_ACT), + FIO_OPT_G_LATPROF = (1ULL << __FIO_OPT_G_LATPROF), + FIO_OPT_G_RBD = (1ULL << __FIO_OPT_G_RBD), + FIO_OPT_G_HTTP = (1ULL << __FIO_OPT_G_HTTP), + FIO_OPT_G_GFAPI = (1ULL << __FIO_OPT_G_GFAPI), + FIO_OPT_G_MTD = (1ULL << __FIO_OPT_G_MTD), + 
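/*
 * How these one-hot masks are meant to be consumed: a caller hands the
 * lookup helpers in optgroup.c a writable mask and loops; every call
 * returns one named group and clears its bit until the mask is empty.
 * A minimal sketch of that loop (illustrative, not fio source):
 *
 *	uint64_t mask = FIO_OPT_G_RATE | FIO_OPT_G_VERIFY;
 *	const struct opt_group *og;
 *
 *	while ((og = opt_group_cat_from_mask(&mask)) != NULL)
 *		printf("%s\n", og->name);	// prints "Rate", then "Verify"
 */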
FIO_OPT_G_HDFS = (1ULL << __FIO_OPT_G_HDFS), + FIO_OPT_G_SG = (1ULL << __FIO_OPT_G_SG), + FIO_OPT_G_MMAP = (1ULL << __FIO_OPT_G_MMAP), + FIO_OPT_G_INVALID = (1ULL << __FIO_OPT_G_NR), + FIO_OPT_G_ISCSI = (1ULL << __FIO_OPT_G_ISCSI), + FIO_OPT_G_NBD = (1ULL << __FIO_OPT_G_NBD), + FIO_OPT_G_IOURING = (1ULL << __FIO_OPT_G_IOURING), + FIO_OPT_G_FILESTAT = (1ULL << __FIO_OPT_G_FILESTAT), +}; + +extern const struct opt_group *opt_group_from_mask(uint64_t *mask); +extern const struct opt_group *opt_group_cat_from_mask(uint64_t *mask); + +#endif diff --git a/options.c b/options.c new file mode 100644 index 0000000..4714a3a --- /dev/null +++ b/options.c @@ -0,0 +1,5369 @@ +#include +#include +#include +#include +#include +#include +#include +#include + +#include "fio.h" +#include "verify.h" +#include "parse.h" +#include "lib/pattern.h" +#include "options.h" +#include "optgroup.h" + +char client_sockaddr_str[INET6_ADDRSTRLEN] = { 0 }; + +#define cb_data_to_td(data) container_of(data, struct thread_data, o) + +static struct pattern_fmt_desc fmt_desc[] = { + { + .fmt = "%o", + .len = FIELD_SIZE(struct io_u *, offset), + .paste = paste_blockoff + } +}; + +/* + * Check if mmap/mmaphuge has a :/foo/bar/file at the end. If so, return that. + */ +static char *get_opt_postfix(const char *str) +{ + char *p = strstr(str, ":"); + + if (!p) + return NULL; + + p++; + strip_blank_front(&p); + strip_blank_end(p); + return strdup(p); +} + +static int bs_cmp(const void *p1, const void *p2) +{ + const struct bssplit *bsp1 = p1; + const struct bssplit *bsp2 = p2; + + return (int) bsp1->perc - (int) bsp2->perc; +} + +struct split { + unsigned int nr; + unsigned long long val1[ZONESPLIT_MAX]; + unsigned long long val2[ZONESPLIT_MAX]; +}; + +static int split_parse_ddir(struct thread_options *o, struct split *split, + char *str, bool absolute, unsigned int max_splits) +{ + unsigned long long perc; + unsigned int i; + long long val; + char *fname; + + split->nr = 0; + + i = 0; + while ((fname = strsep(&str, ":")) != NULL) { + char *perc_str; + + if (!strlen(fname)) + break; + + perc_str = strstr(fname, "/"); + if (perc_str) { + *perc_str = '\0'; + perc_str++; + if (absolute) { + if (str_to_decimal(perc_str, &val, 1, o, 0, 0)) { + log_err("fio: split conversion failed\n"); + return 1; + } + perc = val; + } else { + perc = atoi(perc_str); + if (perc > 100) + perc = 100; + else if (!perc) + perc = -1U; + } + } else { + if (absolute) + perc = 0; + else + perc = -1U; + } + + if (str_to_decimal(fname, &val, 1, o, 0, 0)) { + log_err("fio: split conversion failed\n"); + return 1; + } + + split->val1[i] = val; + split->val2[i] = perc; + i++; + if (i == max_splits) { + log_err("fio: hit max of %d split entries\n", i); + break; + } + } + + split->nr = i; + return 0; +} + +static int bssplit_ddir(struct thread_options *o, enum fio_ddir ddir, char *str, + bool data) +{ + unsigned int i, perc, perc_missing; + unsigned long long max_bs, min_bs; + struct split split; + + memset(&split, 0, sizeof(split)); + + if (split_parse_ddir(o, &split, str, data, BSSPLIT_MAX)) + return 1; + if (!split.nr) + return 0; + + max_bs = 0; + min_bs = -1; + o->bssplit[ddir] = malloc(split.nr * sizeof(struct bssplit)); + o->bssplit_nr[ddir] = split.nr; + for (i = 0; i < split.nr; i++) { + if (split.val1[i] > max_bs) + max_bs = split.val1[i]; + if (split.val1[i] < min_bs) + min_bs = split.val1[i]; + + o->bssplit[ddir][i].bs = split.val1[i]; + o->bssplit[ddir][i].perc =split.val2[i]; + } + + /* + * Now check if the percentages add up, and how much is missing + */ 
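/*
 * Worked example (illustrative): with bssplit=4k/50:16k:64k, the 4k
 * entry arrives with perc == 50 while the two unset entries carry the
 * -1U marker, so the scan below finds perc == 50 and perc_missing == 2,
 * and the fill-in loop further down gives 16k and 64k each
 * (100 - 50) / 2 = 25%.
 */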
+ perc = perc_missing = 0; + for (i = 0; i < o->bssplit_nr[ddir]; i++) { + struct bssplit *bsp = &o->bssplit[ddir][i]; + + if (bsp->perc == -1U) + perc_missing++; + else + perc += bsp->perc; + } + + if (perc > 100 && perc_missing > 1) { + log_err("fio: bssplit percentages add to more than 100%%\n"); + free(o->bssplit[ddir]); + o->bssplit[ddir] = NULL; + return 1; + } + + /* + * If values didn't have a percentage set, divide the remains between + * them. + */ + if (perc_missing) { + if (perc_missing == 1 && o->bssplit_nr[ddir] == 1) + perc = 100; + for (i = 0; i < o->bssplit_nr[ddir]; i++) { + struct bssplit *bsp = &o->bssplit[ddir][i]; + + if (bsp->perc == -1U) + bsp->perc = (100 - perc) / perc_missing; + } + } + + o->min_bs[ddir] = min_bs; + o->max_bs[ddir] = max_bs; + + /* + * now sort based on percentages, for ease of lookup + */ + qsort(o->bssplit[ddir], o->bssplit_nr[ddir], sizeof(struct bssplit), bs_cmp); + return 0; +} + +typedef int (split_parse_fn)(struct thread_options *, enum fio_ddir, char *, bool); + +static int str_split_parse(struct thread_data *td, char *str, + split_parse_fn *fn, bool data) +{ + char *odir, *ddir; + int ret = 0; + + odir = strchr(str, ','); + if (odir) { + ddir = strchr(odir + 1, ','); + if (ddir) { + ret = fn(&td->o, DDIR_TRIM, ddir + 1, data); + if (!ret) + *ddir = '\0'; + } else { + char *op; + + op = strdup(odir + 1); + ret = fn(&td->o, DDIR_TRIM, op, data); + + free(op); + } + if (!ret) + ret = fn(&td->o, DDIR_WRITE, odir + 1, data); + if (!ret) { + *odir = '\0'; + ret = fn(&td->o, DDIR_READ, str, data); + } + } else { + char *op; + + op = strdup(str); + ret = fn(&td->o, DDIR_WRITE, op, data); + free(op); + + if (!ret) { + op = strdup(str); + ret = fn(&td->o, DDIR_TRIM, op, data); + free(op); + } + if (!ret) + ret = fn(&td->o, DDIR_READ, str, data); + } + + return ret; +} + +static int str_bssplit_cb(void *data, const char *input) +{ + struct thread_data *td = cb_data_to_td(data); + char *str, *p; + int ret = 0; + + p = str = strdup(input); + + strip_blank_front(&str); + strip_blank_end(str); + + ret = str_split_parse(td, str, bssplit_ddir, false); + + if (parse_dryrun()) { + int i; + + for (i = 0; i < DDIR_RWDIR_CNT; i++) { + free(td->o.bssplit[i]); + td->o.bssplit[i] = NULL; + td->o.bssplit_nr[i] = 0; + } + } + + free(p); + return ret; +} + +static int str2error(char *str) +{ + const char *err[] = { "EPERM", "ENOENT", "ESRCH", "EINTR", "EIO", + "ENXIO", "E2BIG", "ENOEXEC", "EBADF", + "ECHILD", "EAGAIN", "ENOMEM", "EACCES", + "EFAULT", "ENOTBLK", "EBUSY", "EEXIST", + "EXDEV", "ENODEV", "ENOTDIR", "EISDIR", + "EINVAL", "ENFILE", "EMFILE", "ENOTTY", + "ETXTBSY","EFBIG", "ENOSPC", "ESPIPE", + "EROFS","EMLINK", "EPIPE", "EDOM", "ERANGE" }; + int i = 0, num = sizeof(err) / sizeof(char *); + + while (i < num) { + if (!strcmp(err[i], str)) + return i + 1; + i++; + } + return 0; +} + +static int ignore_error_type(struct thread_data *td, enum error_type_bit etype, + char *str) +{ + unsigned int i; + int *error; + char *fname; + + if (etype >= ERROR_TYPE_CNT) { + log_err("Illegal error type\n"); + return 1; + } + + td->o.ignore_error_nr[etype] = 4; + error = calloc(4, sizeof(int)); + + i = 0; + while ((fname = strsep(&str, ":")) != NULL) { + + if (!strlen(fname)) + break; + + /* + * grow struct buffer, if needed + */ + if (i == td->o.ignore_error_nr[etype]) { + td->o.ignore_error_nr[etype] <<= 1; + error = realloc(error, td->o.ignore_error_nr[etype] + * sizeof(int)); + } + if (fname[0] == 'E') { + error[i] = str2error(fname); + } else { + error[i] = 
atoi(fname); + if (error[i] < 0) + error[i] = -error[i]; + } + if (!error[i]) { + log_err("Unknown error %s, please use number value\n", + fname); + td->o.ignore_error_nr[etype] = 0; + free(error); + return 1; + } + i++; + } + if (i) { + td->o.continue_on_error |= 1 << etype; + td->o.ignore_error_nr[etype] = i; + td->o.ignore_error[etype] = error; + } else { + td->o.ignore_error_nr[etype] = 0; + free(error); + } + + return 0; + +} + +static int str_replay_skip_cb(void *data, const char *input) +{ + struct thread_data *td = cb_data_to_td(data); + char *str, *p, *n; + int ret = 0; + + if (parse_dryrun()) + return 0; + + p = str = strdup(input); + + strip_blank_front(&str); + strip_blank_end(str); + + while (p) { + n = strchr(p, ','); + if (n) + *n++ = '\0'; + if (!strcmp(p, "read")) + td->o.replay_skip |= 1u << DDIR_READ; + else if (!strcmp(p, "write")) + td->o.replay_skip |= 1u << DDIR_WRITE; + else if (!strcmp(p, "trim")) + td->o.replay_skip |= 1u << DDIR_TRIM; + else if (!strcmp(p, "sync")) + td->o.replay_skip |= 1u << DDIR_SYNC; + else { + log_err("Unknown skip type: %s\n", p); + ret = 1; + break; + } + p = n; + } + free(str); + return ret; +} + +static int str_ignore_error_cb(void *data, const char *input) +{ + struct thread_data *td = cb_data_to_td(data); + char *str, *p, *n; + int ret = 1; + enum error_type_bit type = 0; + + if (parse_dryrun()) + return 0; + + p = str = strdup(input); + + strip_blank_front(&str); + strip_blank_end(str); + + while (p) { + n = strchr(p, ','); + if (n) + *n++ = '\0'; + ret = ignore_error_type(td, type, p); + if (ret) + break; + p = n; + type++; + } + free(str); + return ret; +} + +static int str_rw_cb(void *data, const char *str) +{ + struct thread_data *td = cb_data_to_td(data); + struct thread_options *o = &td->o; + char *nr; + + if (parse_dryrun()) + return 0; + + o->ddir_seq_nr = 1; + o->ddir_seq_add = 0; + + nr = get_opt_postfix(str); + if (!nr) + return 0; + + if (td_random(td)) + o->ddir_seq_nr = atoi(nr); + else { + long long val; + + if (str_to_decimal(nr, &val, 1, o, 0, 0)) { + log_err("fio: rw postfix parsing failed\n"); + free(nr); + return 1; + } + + o->ddir_seq_add = val; + } + + free(nr); + return 0; +} + +static int str_mem_cb(void *data, const char *mem) +{ + struct thread_data *td = cb_data_to_td(data); + + if (td->o.mem_type == MEM_MMAPHUGE || td->o.mem_type == MEM_MMAP || + td->o.mem_type == MEM_MMAPSHARED) + td->o.mmapfile = get_opt_postfix(mem); + + return 0; +} + +static int fio_clock_source_cb(void *data, const char *str) +{ + struct thread_data *td = cb_data_to_td(data); + + fio_clock_source = td->o.clocksource; + fio_clock_source_set = 1; + fio_clock_init(); + return 0; +} + +static int str_rwmix_read_cb(void *data, unsigned long long *val) +{ + struct thread_data *td = cb_data_to_td(data); + + td->o.rwmix[DDIR_READ] = *val; + td->o.rwmix[DDIR_WRITE] = 100 - *val; + return 0; +} + +static int str_rwmix_write_cb(void *data, unsigned long long *val) +{ + struct thread_data *td = cb_data_to_td(data); + + td->o.rwmix[DDIR_WRITE] = *val; + td->o.rwmix[DDIR_READ] = 100 - *val; + return 0; +} + +static int str_exitall_cb(void) +{ + exitall_on_terminate = true; + return 0; +} + +#ifdef FIO_HAVE_CPU_AFFINITY +int fio_cpus_split(os_cpu_mask_t *mask, unsigned int cpu_index) +{ + unsigned int i, index, cpus_in_mask; + const long max_cpu = cpus_online(); + + cpus_in_mask = fio_cpu_count(mask); + if (!cpus_in_mask) + return 0; + + cpu_index = cpu_index % cpus_in_mask; + + index = 0; + for (i = 0; i < max_cpu; i++) { + if (!fio_cpu_isset(mask, 
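/*
 * Illustrative walk-through of this loop: if the shared mask holds CPUs
 * {2, 5, 9} and this job was handed cpu_index == 4, the modulo above
 * yields 4 % 3 == 1, so every set CPU except the second one (CPU 5) is
 * cleared and the job ends up pinned to CPU 5 alone.
 */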
i)) + continue; + + if (cpu_index != index) + fio_cpu_clear(mask, i); + + index++; + } + + return fio_cpu_count(mask); +} + +static int str_cpumask_cb(void *data, unsigned long long *val) +{ + struct thread_data *td = cb_data_to_td(data); + unsigned int i; + long max_cpu; + int ret; + + if (parse_dryrun()) + return 0; + + ret = fio_cpuset_init(&td->o.cpumask); + if (ret < 0) { + log_err("fio: cpuset_init failed\n"); + td_verror(td, ret, "fio_cpuset_init"); + return 1; + } + + max_cpu = cpus_online(); + + for (i = 0; i < sizeof(int) * 8; i++) { + if ((1 << i) & *val) { + if (i >= max_cpu) { + log_err("fio: CPU %d too large (max=%ld)\n", i, + max_cpu - 1); + return 1; + } + dprint(FD_PARSE, "set cpu allowed %d\n", i); + fio_cpu_set(&td->o.cpumask, i); + } + } + + return 0; +} + +static int set_cpus_allowed(struct thread_data *td, os_cpu_mask_t *mask, + const char *input) +{ + char *cpu, *str, *p; + long max_cpu; + int ret = 0; + + ret = fio_cpuset_init(mask); + if (ret < 0) { + log_err("fio: cpuset_init failed\n"); + td_verror(td, ret, "fio_cpuset_init"); + return 1; + } + + p = str = strdup(input); + + strip_blank_front(&str); + strip_blank_end(str); + + max_cpu = cpus_online(); + + while ((cpu = strsep(&str, ",")) != NULL) { + char *str2, *cpu2; + int icpu, icpu2; + + if (!strlen(cpu)) + break; + + str2 = cpu; + icpu2 = -1; + while ((cpu2 = strsep(&str2, "-")) != NULL) { + if (!strlen(cpu2)) + break; + + icpu2 = atoi(cpu2); + } + + icpu = atoi(cpu); + if (icpu2 == -1) + icpu2 = icpu; + while (icpu <= icpu2) { + if (icpu >= FIO_MAX_CPUS) { + log_err("fio: your OS only supports up to" + " %d CPUs\n", (int) FIO_MAX_CPUS); + ret = 1; + break; + } + if (icpu >= max_cpu) { + log_err("fio: CPU %d too large (max=%ld)\n", + icpu, max_cpu - 1); + ret = 1; + break; + } + + dprint(FD_PARSE, "set cpu allowed %d\n", icpu); + fio_cpu_set(mask, icpu); + icpu++; + } + if (ret) + break; + } + + free(p); + return ret; +} + +static int str_cpus_allowed_cb(void *data, const char *input) +{ + struct thread_data *td = cb_data_to_td(data); + + if (parse_dryrun()) + return 0; + + return set_cpus_allowed(td, &td->o.cpumask, input); +} + +static int str_verify_cpus_allowed_cb(void *data, const char *input) +{ + struct thread_data *td = cb_data_to_td(data); + + if (parse_dryrun()) + return 0; + + return set_cpus_allowed(td, &td->o.verify_cpumask, input); +} + +#ifdef CONFIG_ZLIB +static int str_log_cpus_allowed_cb(void *data, const char *input) +{ + struct thread_data *td = cb_data_to_td(data); + + if (parse_dryrun()) + return 0; + + return set_cpus_allowed(td, &td->o.log_gz_cpumask, input); +} +#endif /* CONFIG_ZLIB */ + +#endif /* FIO_HAVE_CPU_AFFINITY */ + +#ifdef CONFIG_LIBNUMA +static int str_numa_cpunodes_cb(void *data, char *input) +{ + struct thread_data *td = cb_data_to_td(data); + struct bitmask *verify_bitmask; + + if (parse_dryrun()) + return 0; + + /* numa_parse_nodestring() parses a character string list + * of nodes into a bit mask. The bit mask is allocated by + * numa_allocate_nodemask(), so it should be freed by + * numa_free_nodemask(). 
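 * In this callback the parse is only a validation pass: the accepted
 * nodestring syntax is libnuma's, e.g. "0-3", "0,2" or "all", and on
 * success only the original string is kept via strdup() below for
 * later use.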
+ */ + verify_bitmask = numa_parse_nodestring(input); + if (verify_bitmask == NULL) { + log_err("fio: numa_parse_nodestring failed\n"); + td_verror(td, 1, "str_numa_cpunodes_cb"); + return 1; + } + numa_free_nodemask(verify_bitmask); + + td->o.numa_cpunodes = strdup(input); + return 0; +} + +static int str_numa_mpol_cb(void *data, char *input) +{ + struct thread_data *td = cb_data_to_td(data); + const char * const policy_types[] = + { "default", "prefer", "bind", "interleave", "local", NULL }; + int i; + char *nodelist; + struct bitmask *verify_bitmask; + + if (parse_dryrun()) + return 0; + + nodelist = strchr(input, ':'); + if (nodelist) { + /* NUL-terminate mode */ + *nodelist++ = '\0'; + } + + for (i = 0; i <= MPOL_LOCAL; i++) { + if (!strcmp(input, policy_types[i])) { + td->o.numa_mem_mode = i; + break; + } + } + if (i > MPOL_LOCAL) { + log_err("fio: memory policy should be: default, prefer, bind, interleave, local\n"); + goto out; + } + + switch (td->o.numa_mem_mode) { + case MPOL_PREFERRED: + /* + * Insist on a nodelist of one node only + */ + if (nodelist) { + char *rest = nodelist; + while (isdigit(*rest)) + rest++; + if (*rest) { + log_err("fio: one node only for \'prefer\'\n"); + goto out; + } + } else { + log_err("fio: one node is needed for \'prefer\'\n"); + goto out; + } + break; + case MPOL_INTERLEAVE: + /* + * Default to online nodes with memory if no nodelist + */ + if (!nodelist) + nodelist = strdup("all"); + break; + case MPOL_LOCAL: + case MPOL_DEFAULT: + /* + * Don't allow a nodelist + */ + if (nodelist) { + log_err("fio: NO nodelist for \'local\'\n"); + goto out; + } + break; + case MPOL_BIND: + /* + * Insist on a nodelist + */ + if (!nodelist) { + log_err("fio: a nodelist is needed for \'bind\'\n"); + goto out; + } + break; + } + + + /* numa_parse_nodestring() parses a character string list + * of nodes into a bit mask. The bit mask is allocated by + * numa_allocate_nodemask(), so it should be freed by + * numa_free_nodemask(). 
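 * For reference, the option text parsed above has the form
 * "<mode>[:<nodelist>]": "prefer:1" selects MPOL_PREFERRED with node 1,
 * "bind:0-1" selects MPOL_BIND across nodes 0 and 1, and a bare "local"
 * selects MPOL_LOCAL, which rejects any nodelist (while "interleave"
 * with no nodelist defaults to "all" below).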
+ */ + switch (td->o.numa_mem_mode) { + case MPOL_PREFERRED: + td->o.numa_mem_prefer_node = atoi(nodelist); + break; + case MPOL_INTERLEAVE: + case MPOL_BIND: + verify_bitmask = numa_parse_nodestring(nodelist); + if (verify_bitmask == NULL) { + log_err("fio: numa_parse_nodestring failed\n"); + td_verror(td, 1, "str_numa_memnodes_cb"); + return 1; + } + td->o.numa_memnodes = strdup(nodelist); + numa_free_nodemask(verify_bitmask); + + break; + case MPOL_LOCAL: + case MPOL_DEFAULT: + default: + break; + } + + return 0; +out: + return 1; +} +#endif + +static int str_fst_cb(void *data, const char *str) +{ + struct thread_data *td = cb_data_to_td(data); + double val; + bool done = false; + char *nr; + + td->file_service_nr = 1; + + switch (td->o.file_service_type) { + case FIO_FSERVICE_RANDOM: + case FIO_FSERVICE_RR: + case FIO_FSERVICE_SEQ: + nr = get_opt_postfix(str); + if (nr) { + td->file_service_nr = atoi(nr); + free(nr); + } + done = true; + break; + case FIO_FSERVICE_ZIPF: + val = FIO_DEF_ZIPF; + break; + case FIO_FSERVICE_PARETO: + val = FIO_DEF_PARETO; + break; + case FIO_FSERVICE_GAUSS: + val = 0.0; + break; + default: + log_err("fio: bad file service type: %d\n", td->o.file_service_type); + return 1; + } + + if (done) + return 0; + + nr = get_opt_postfix(str); + if (nr && !str_to_float(nr, &val, 0)) { + log_err("fio: file service type random postfix parsing failed\n"); + free(nr); + return 1; + } + + free(nr); + + switch (td->o.file_service_type) { + case FIO_FSERVICE_ZIPF: + if (val == 1.00) { + log_err("fio: zipf theta must be different than 1.0\n"); + return 1; + } + if (parse_dryrun()) + return 0; + td->zipf_theta = val; + break; + case FIO_FSERVICE_PARETO: + if (val <= 0.00 || val >= 1.00) { + log_err("fio: pareto input out of range (0 < input < 1.0)\n"); + return 1; + } + if (parse_dryrun()) + return 0; + td->pareto_h = val; + break; + case FIO_FSERVICE_GAUSS: + if (val < 0.00 || val >= 100.00) { + log_err("fio: normal deviation out of range (0 <= input < 100.0)\n"); + return 1; + } + if (parse_dryrun()) + return 0; + td->gauss_dev = val; + break; + } + + return 0; +} + +#ifdef CONFIG_SYNC_FILE_RANGE +static int str_sfr_cb(void *data, const char *str) +{ + struct thread_data *td = cb_data_to_td(data); + char *nr = get_opt_postfix(str); + + td->sync_file_range_nr = 1; + if (nr) { + td->sync_file_range_nr = atoi(nr); + free(nr); + } + + return 0; +} +#endif + +static int zone_split_ddir(struct thread_options *o, enum fio_ddir ddir, + char *str, bool absolute) +{ + unsigned int i, perc, perc_missing, sperc, sperc_missing; + struct split split; + + memset(&split, 0, sizeof(split)); + + if (split_parse_ddir(o, &split, str, absolute, ZONESPLIT_MAX)) + return 1; + if (!split.nr) + return 0; + + o->zone_split[ddir] = malloc(split.nr * sizeof(struct zone_split)); + o->zone_split_nr[ddir] = split.nr; + for (i = 0; i < split.nr; i++) { + o->zone_split[ddir][i].access_perc = split.val1[i]; + if (absolute) + o->zone_split[ddir][i].size = split.val2[i]; + else + o->zone_split[ddir][i].size_perc = split.val2[i]; + } + + /* + * Now check if the percentages add up, and how much is missing + */ + perc = perc_missing = 0; + sperc = sperc_missing = 0; + for (i = 0; i < o->zone_split_nr[ddir]; i++) { + struct zone_split *zsp = &o->zone_split[ddir][i]; + + if (zsp->access_perc == (uint8_t) -1U) + perc_missing++; + else + perc += zsp->access_perc; + + if (!absolute) { + if (zsp->size_perc == (uint8_t) -1U) + sperc_missing++; + else + sperc += zsp->size_perc; + } + } + + if (perc > 100 || sperc > 100) 
{ + log_err("fio: zone_split percentages add to more than 100%%\n"); + free(o->zone_split[ddir]); + o->zone_split[ddir] = NULL; + return 1; + } + if (perc < 100) { + log_err("fio: access percentage don't add up to 100 for zoned " + "random distribution (got=%u)\n", perc); + free(o->zone_split[ddir]); + o->zone_split[ddir] = NULL; + return 1; + } + + /* + * If values didn't have a percentage set, divide the remains between + * them. + */ + if (perc_missing) { + if (perc_missing == 1 && o->zone_split_nr[ddir] == 1) + perc = 100; + for (i = 0; i < o->zone_split_nr[ddir]; i++) { + struct zone_split *zsp = &o->zone_split[ddir][i]; + + if (zsp->access_perc == (uint8_t) -1U) + zsp->access_perc = (100 - perc) / perc_missing; + } + } + if (sperc_missing) { + if (sperc_missing == 1 && o->zone_split_nr[ddir] == 1) + sperc = 100; + for (i = 0; i < o->zone_split_nr[ddir]; i++) { + struct zone_split *zsp = &o->zone_split[ddir][i]; + + if (zsp->size_perc == (uint8_t) -1U) + zsp->size_perc = (100 - sperc) / sperc_missing; + } + } + + return 0; +} + +static int parse_zoned_distribution(struct thread_data *td, const char *input, + bool absolute) +{ + const char *pre = absolute ? "zoned_abs:" : "zoned:"; + char *str, *p; + int i, ret = 0; + + p = str = strdup(input); + + strip_blank_front(&str); + strip_blank_end(str); + + /* We expect it to start like that, bail if not */ + if (strncmp(str, pre, strlen(pre))) { + log_err("fio: mismatch in zoned input <%s>\n", str); + free(p); + return 1; + } + str += strlen(pre); + + ret = str_split_parse(td, str, zone_split_ddir, absolute); + + free(p); + + for (i = 0; i < DDIR_RWDIR_CNT; i++) { + int j; + + dprint(FD_PARSE, "zone ddir %d (nr=%u): \n", i, td->o.zone_split_nr[i]); + + for (j = 0; j < td->o.zone_split_nr[i]; j++) { + struct zone_split *zsp = &td->o.zone_split[i][j]; + + if (absolute) { + dprint(FD_PARSE, "\t%d: %u/%llu\n", j, + zsp->access_perc, + (unsigned long long) zsp->size); + } else { + dprint(FD_PARSE, "\t%d: %u/%u\n", j, + zsp->access_perc, + zsp->size_perc); + } + } + } + + if (parse_dryrun()) { + for (i = 0; i < DDIR_RWDIR_CNT; i++) { + free(td->o.zone_split[i]); + td->o.zone_split[i] = NULL; + td->o.zone_split_nr[i] = 0; + } + + return ret; + } + + if (ret) { + for (i = 0; i < DDIR_RWDIR_CNT; i++) + td->o.zone_split_nr[i] = 0; + } + + return ret; +} + +static int str_random_distribution_cb(void *data, const char *str) +{ + struct thread_data *td = cb_data_to_td(data); + double val; + char *nr; + + if (td->o.random_distribution == FIO_RAND_DIST_ZIPF) + val = FIO_DEF_ZIPF; + else if (td->o.random_distribution == FIO_RAND_DIST_PARETO) + val = FIO_DEF_PARETO; + else if (td->o.random_distribution == FIO_RAND_DIST_GAUSS) + val = 0.0; + else if (td->o.random_distribution == FIO_RAND_DIST_ZONED) + return parse_zoned_distribution(td, str, false); + else if (td->o.random_distribution == FIO_RAND_DIST_ZONED_ABS) + return parse_zoned_distribution(td, str, true); + else + return 0; + + nr = get_opt_postfix(str); + if (nr && !str_to_float(nr, &val, 0)) { + log_err("fio: random postfix parsing failed\n"); + free(nr); + return 1; + } + + free(nr); + + if (td->o.random_distribution == FIO_RAND_DIST_ZIPF) { + if (val == 1.00) { + log_err("fio: zipf theta must different than 1.0\n"); + return 1; + } + if (parse_dryrun()) + return 0; + td->o.zipf_theta.u.f = val; + } else if (td->o.random_distribution == FIO_RAND_DIST_PARETO) { + if (val <= 0.00 || val >= 1.00) { + log_err("fio: pareto input out of range (0 < input < 1.0)\n"); + return 1; + } + if (parse_dryrun()) + 
return 0; + td->o.pareto_h.u.f = val; + } else { + if (val < 0.00 || val >= 100.0) { + log_err("fio: normal deviation out of range (0 <= input < 100.0)\n"); + return 1; + } + if (parse_dryrun()) + return 0; + td->o.gauss_dev.u.f = val; + } + + return 0; +} + +static int str_steadystate_cb(void *data, const char *str) +{ + struct thread_data *td = cb_data_to_td(data); + double val; + char *nr; + char *pct; + long long ll; + + if (td->o.ss_state != FIO_SS_IOPS && td->o.ss_state != FIO_SS_IOPS_SLOPE && + td->o.ss_state != FIO_SS_BW && td->o.ss_state != FIO_SS_BW_SLOPE) { + /* should be impossible to get here */ + log_err("fio: unknown steady state criterion\n"); + return 1; + } + + nr = get_opt_postfix(str); + if (!nr) { + log_err("fio: steadystate threshold must be specified in addition to criterion\n"); + free(nr); + return 1; + } + + /* ENHANCEMENT Allow fio to understand size=10.2% and use here */ + pct = strstr(nr, "%"); + if (pct) { + *pct = '\0'; + strip_blank_end(nr); + if (!str_to_float(nr, &val, 0)) { + log_err("fio: could not parse steadystate threshold percentage\n"); + free(nr); + return 1; + } + + dprint(FD_PARSE, "set steady state threshold to %f%%\n", val); + free(nr); + if (parse_dryrun()) + return 0; + + td->o.ss_state |= FIO_SS_PCT; + td->o.ss_limit.u.f = val; + } else if (td->o.ss_state & FIO_SS_IOPS) { + if (!str_to_float(nr, &val, 0)) { + log_err("fio: steadystate IOPS threshold postfix parsing failed\n"); + free(nr); + return 1; + } + + dprint(FD_PARSE, "set steady state IOPS threshold to %f\n", val); + free(nr); + if (parse_dryrun()) + return 0; + + td->o.ss_limit.u.f = val; + } else { /* bandwidth criterion */ + if (str_to_decimal(nr, &ll, 1, td, 0, 0)) { + log_err("fio: steadystate BW threshold postfix parsing failed\n"); + free(nr); + return 1; + } + + dprint(FD_PARSE, "set steady state BW threshold to %lld\n", ll); + free(nr); + if (parse_dryrun()) + return 0; + + td->o.ss_limit.u.f = (double) ll; + } + + td->ss.state = td->o.ss_state; + return 0; +} + +/* + * Return next name in the string. Files are separated with ':'. If the ':' + * is escaped with a '\', then that ':' is part of the filename and does not + * indicate a new file. + */ +char *get_next_str(char **ptr) +{ + char *str = *ptr; + char *p, *start; + + if (!str || !strlen(str)) + return NULL; + + start = str; + do { + /* + * No colon, we are done + */ + p = strchr(str, ':'); + if (!p) { + *ptr = NULL; + break; + } + + /* + * We got a colon, but it's the first character. 
Skip and
+		 * continue
+		 */
+		if (p == start) {
+			str = ++start;
+			continue;
+		}
+
+		if (*(p - 1) != '\\') {
+			*p = '\0';
+			*ptr = p + 1;
+			break;
+		}
+
+		memmove(p - 1, p, strlen(p) + 1);
+		str = p;
+	} while (1);
+
+	return start;
+}
+
+
+int get_max_str_idx(char *input)
+{
+	unsigned int cur_idx;
+	char *str, *p;
+
+	p = str = strdup(input);
+	for (cur_idx = 0; ; cur_idx++)
+		if (get_next_str(&str) == NULL)
+			break;
+
+	free(p);
+	return cur_idx;
+}
+
+/*
+ * Returns the directory at the index; indexes > entries will be
+ * assigned via modulo division of the index
+ */
+int set_name_idx(char *target, size_t tlen, char *input, int index,
+		 bool unique_filename)
+{
+	unsigned int cur_idx;
+	int len;
+	char *fname, *str, *p;
+
+	p = str = strdup(input);
+
+	index %= get_max_str_idx(input);
+	for (cur_idx = 0; cur_idx <= index; cur_idx++)
+		fname = get_next_str(&str);
+
+	if (client_sockaddr_str[0] && unique_filename) {
+		len = snprintf(target, tlen, "%s/%s.", fname,
+			       client_sockaddr_str);
+	} else
+		len = snprintf(target, tlen, "%s%c", fname,
+			       FIO_OS_PATH_SEPARATOR);
+
+	target[tlen - 1] = '\0';
+	free(p);
+
+	return len;
+}
+
+char* get_name_by_idx(char *input, int index)
+{
+	unsigned int cur_idx;
+	char *fname, *str, *p;
+
+	p = str = strdup(input);
+
+	index %= get_max_str_idx(input);
+	for (cur_idx = 0; cur_idx <= index; cur_idx++)
+		fname = get_next_str(&str);
+
+	fname = strdup(fname);
+	free(p);
+
+	return fname;
+}
+
+static int str_filename_cb(void *data, const char *input)
+{
+	struct thread_data *td = cb_data_to_td(data);
+	char *fname, *str, *p;
+
+	p = str = strdup(input);
+
+	strip_blank_front(&str);
+	strip_blank_end(str);
+
+	/*
+	 * Ignore what we may already have from nrfiles option.
+	 */
+	if (!td->files_index)
+		td->o.nr_files = 0;
+
+	while ((fname = get_next_str(&str)) != NULL) {
+		if (!strlen(fname))
+			break;
+		add_file(td, fname, 0, 1);
+	}
+
+	free(p);
+	return 0;
+}
+
+static int str_directory_cb(void *data, const char fio_unused *unused)
+{
+	struct thread_data *td = cb_data_to_td(data);
+	struct stat sb;
+	char *dirname, *str, *p;
+	int ret = 0;
+
+	if (parse_dryrun())
+		return 0;
+
+	p = str = strdup(td->o.directory);
+	while ((dirname = get_next_str(&str)) != NULL) {
+		if (lstat(dirname, &sb) < 0) {
+			ret = errno;
+
+			log_err("fio: %s is not a directory\n", dirname);
+			td_verror(td, ret, "lstat");
+			goto out;
+		}
+		if (!S_ISDIR(sb.st_mode)) {
+			log_err("fio: %s is not a directory\n", dirname);
+			ret = 1;
+			goto out;
+		}
+	}
+
+out:
+	free(p);
+	return ret;
+}
+
+static int str_opendir_cb(void *data, const char fio_unused *str)
+{
+	struct thread_data *td = cb_data_to_td(data);
+
+	if (parse_dryrun())
+		return 0;
+
+	if (!td->files_index)
+		td->o.nr_files = 0;
+
+	return add_dir_files(td, td->o.opendir);
+}
+
+static int str_buffer_pattern_cb(void *data, const char *input)
+{
+	struct thread_data *td = cb_data_to_td(data);
+	int ret;
+
+	/* FIXME: for now buffer pattern does not support formats */
+	ret = parse_and_fill_pattern(input, strlen(input), td->o.buffer_pattern,
+				     MAX_PATTERN_SIZE, NULL, 0, NULL, NULL);
+	if (ret < 0)
+		return 1;
+
+	assert(ret != 0);
+	td->o.buffer_pattern_bytes = ret;
+
+	/*
+	 * If this job is doing any reading or has compression set,
+	 * ensure that we refill buffers for writes or we could be
+	 * invalidating the pattern through reads.
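 * (Concretely: a write-only job with buffer_pattern set and no
 * compress_percentage keeps refill_buffers off, while the same pattern
 * with rw=randrw forces it on, because completed reads land in these
 * same buffers and would clobber the pattern before later writes reuse
 * it.)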
+	 */
+	if (!td->o.compress_percentage && !td_read(td))
+		td->o.refill_buffers = 0;
+	else
+		td->o.refill_buffers = 1;
+
+	td->o.scramble_buffers = 0;
+	td->o.zero_buffers = 0;
+
+	return 0;
+}
+
+static int str_buffer_compress_cb(void *data, unsigned long long *il)
+{
+	struct thread_data *td = cb_data_to_td(data);
+
+	td->flags |= TD_F_COMPRESS;
+	td->o.compress_percentage = *il;
+	return 0;
+}
+
+static int str_dedupe_cb(void *data, unsigned long long *il)
+{
+	struct thread_data *td = cb_data_to_td(data);
+
+	td->flags |= TD_F_COMPRESS;
+	td->o.dedupe_percentage = *il;
+	td->o.refill_buffers = 1;
+	return 0;
+}
+
+static int str_verify_pattern_cb(void *data, const char *input)
+{
+	struct thread_data *td = cb_data_to_td(data);
+	int ret;
+
+	td->o.verify_fmt_sz = ARRAY_SIZE(td->o.verify_fmt);
+	ret = parse_and_fill_pattern(input, strlen(input), td->o.verify_pattern,
+				     MAX_PATTERN_SIZE, fmt_desc, sizeof(fmt_desc),
+				     td->o.verify_fmt, &td->o.verify_fmt_sz);
+	if (ret < 0)
+		return 1;
+
+	assert(ret != 0);
+	td->o.verify_pattern_bytes = ret;
+	/*
+	 * VERIFY_* could already be set
+	 */
+	if (!fio_option_is_set(&td->o, verify))
+		td->o.verify = VERIFY_PATTERN;
+
+	return 0;
+}
+
+static int str_gtod_reduce_cb(void *data, int *il)
+{
+	struct thread_data *td = cb_data_to_td(data);
+	int val = *il;
+
+	/*
+	 * Only modify options if gtod_reduce==1
+	 * Otherwise leave settings alone.
+	 */
+	if (val) {
+		td->o.disable_lat = 1;
+		td->o.disable_clat = 1;
+		td->o.disable_slat = 1;
+		td->o.disable_bw = 1;
+		td->o.clat_percentiles = 0;
+		td->o.lat_percentiles = 0;
+		td->o.slat_percentiles = 0;
+		td->ts_cache_mask = 63;
+	}
+
+	return 0;
+}
+
+static int str_offset_cb(void *data, unsigned long long *__val)
+{
+	struct thread_data *td = cb_data_to_td(data);
+	unsigned long long v = *__val;
+
+	if (parse_is_percent(v)) {
+		td->o.start_offset = 0;
+		td->o.start_offset_percent = -1ULL - v;
+		dprint(FD_PARSE, "SET start_offset_percent %d\n",
+					td->o.start_offset_percent);
+	} else
+		td->o.start_offset = v;
+
+	return 0;
+}
+
+static int str_offset_increment_cb(void *data, unsigned long long *__val)
+{
+	struct thread_data *td = cb_data_to_td(data);
+	unsigned long long v = *__val;
+
+	if (parse_is_percent(v)) {
+		td->o.offset_increment = 0;
+		td->o.offset_increment_percent = -1ULL - v;
+		dprint(FD_PARSE, "SET offset_increment_percent %d\n",
+					td->o.offset_increment_percent);
+	} else
+		td->o.offset_increment = v;
+
+	return 0;
+}
+
+static int str_size_cb(void *data, unsigned long long *__val)
+{
+	struct thread_data *td = cb_data_to_td(data);
+	unsigned long long v = *__val;
+
+	if (parse_is_percent(v)) {
+		td->o.size = 0;
+		td->o.size_percent = -1ULL - v;
+		dprint(FD_PARSE, "SET size_percent %d\n",
+					td->o.size_percent);
+	} else
+		td->o.size = v;
+
+	return 0;
+}
+
+static int str_write_bw_log_cb(void *data, const char *str)
+{
+	struct thread_data *td = cb_data_to_td(data);
+
+	if (str)
+		td->o.bw_log_file = strdup(str);
+
+	td->o.write_bw_log = 1;
+	return 0;
+}
+
+static int str_write_lat_log_cb(void *data, const char *str)
+{
+	struct thread_data *td = cb_data_to_td(data);
+
+	if (str)
+		td->o.lat_log_file = strdup(str);
+
+	td->o.write_lat_log = 1;
+	return 0;
+}
+
+static int str_write_iops_log_cb(void *data, const char *str)
+{
+	struct thread_data *td = cb_data_to_td(data);
+
+	if (str)
+		td->o.iops_log_file = strdup(str);
+
+	td->o.write_iops_log = 1;
+	return 0;
+}
+
+static int str_write_hist_log_cb(void *data, const char *str)
+{
+	struct thread_data *td = cb_data_to_td(data);
+
+	if (str)
td->o.hist_log_file = strdup(str);
+
+	td->o.write_hist_log = 1;
+	return 0;
+}
+
+/*
+ * str is supposed to be a substring of the strdup'd original string,
+ * and is valid only if it's a regular file path.
+ * This function keeps the pointer to the path as needed later.
+ *
+ * "external:/path/to/so\0" <- original pointer updated with strdup'd
+ * "external\0" <- above pointer after parsed, i.e. ->ioengine
+ * "/path/to/so\0" <- str argument, i.e. ->ioengine_so_path
+ */
+static int str_ioengine_external_cb(void *data, const char *str)
+{
+	struct thread_data *td = cb_data_to_td(data);
+	struct stat sb;
+	char *p;
+
+	if (!str) {
+		log_err("fio: null external ioengine path\n");
+		return 1;
+	}
+
+	p = (char *)str; /* str is mutable */
+	strip_blank_front(&p);
+	strip_blank_end(p);
+
+	if (stat(p, &sb) || !S_ISREG(sb.st_mode)) {
+		log_err("fio: invalid external ioengine path \"%s\"\n", p);
+		return 1;
+	}
+
+	td->o.ioengine_so_path = p;
+	return 0;
+}
+
+static int rw_verify(const struct fio_option *o, void *data)
+{
+	struct thread_data *td = cb_data_to_td(data);
+
+	if (read_only && (td_write(td) || td_trim(td))) {
+		log_err("fio: job <%s> has write or trim bit set, but"
+			" fio is in read-only mode\n", td->o.name);
+		return 1;
+	}
+
+	return 0;
+}
+
+static int gtod_cpu_verify(const struct fio_option *o, void *data)
+{
+#ifndef FIO_HAVE_CPU_AFFINITY
+	struct thread_data *td = cb_data_to_td(data);
+
+	if (td->o.gtod_cpu) {
+		log_err("fio: platform must support CPU affinity for "
+			"gettimeofday() offloading\n");
+		return 1;
+	}
+#endif
+
+	return 0;
+}
+
+/*
+ * Map of job/command line options
+ */
+struct fio_option fio_options[FIO_MAX_OPTS] = {
+	{
+		.name = "description",
+		.lname = "Description of job",
+		.type = FIO_OPT_STR_STORE,
+		.off1 = offsetof(struct thread_options, description),
+		.help = "Text job description",
+		.category = FIO_OPT_C_GENERAL,
+		.group = FIO_OPT_G_DESC,
+	},
+	{
+		.name = "name",
+		.lname = "Job name",
+		.type = FIO_OPT_STR_STORE,
+		.off1 = offsetof(struct thread_options, name),
+		.help = "Name of this job",
+		.category = FIO_OPT_C_GENERAL,
+		.group = FIO_OPT_G_DESC,
+	},
+	{
+		.name = "wait_for",
+		.lname = "Waitee name",
+		.type = FIO_OPT_STR_STORE,
+		.off1 = offsetof(struct thread_options, wait_for),
+		.help = "Name of the job this one wants to wait for before starting",
+		.category = FIO_OPT_C_GENERAL,
+		.group = FIO_OPT_G_DESC,
+	},
+	{
+		.name = "filename",
+		.lname = "Filename(s)",
+		.type = FIO_OPT_STR_STORE,
+		.off1 = offsetof(struct thread_options, filename),
+		.cb = str_filename_cb,
+		.prio = -1, /* must come after "directory" */
+		.help = "File(s) to use for the workload",
+		.category = FIO_OPT_C_FILE,
+		.group = FIO_OPT_G_FILENAME,
+	},
+	{
+		.name = "directory",
+		.lname = "Directory",
+		.type = FIO_OPT_STR_STORE,
+		.off1 = offsetof(struct thread_options, directory),
+		.cb = str_directory_cb,
+		.help = "Directory to store files in",
+		.category = FIO_OPT_C_FILE,
+		.group = FIO_OPT_G_FILENAME,
+	},
+	{
+		.name = "filename_format",
+		.lname = "Filename Format",
+		.type = FIO_OPT_STR_STORE,
+		.off1 = offsetof(struct thread_options, filename_format),
+		.prio = -1, /* must come after "directory" */
+		.help = "Override default $jobname.$jobnum.$filenum naming",
+		.def = "$jobname.$jobnum.$filenum",
+		.category = FIO_OPT_C_FILE,
+		.group = FIO_OPT_G_FILENAME,
+	},
+	{
+		.name = "unique_filename",
+		.lname = "Unique Filename",
+		.type = FIO_OPT_BOOL,
+		.off1 = offsetof(struct thread_options, unique_filename),
+		.help = "For network clients, prefix file
with source IP", + .def = "1", + .category = FIO_OPT_C_FILE, + .group = FIO_OPT_G_FILENAME, + }, + { + .name = "lockfile", + .lname = "Lockfile", + .type = FIO_OPT_STR, + .off1 = offsetof(struct thread_options, file_lock_mode), + .help = "Lock file when doing IO to it", + .prio = 1, + .parent = "filename", + .hide = 0, + .def = "none", + .category = FIO_OPT_C_FILE, + .group = FIO_OPT_G_FILENAME, + .posval = { + { .ival = "none", + .oval = FILE_LOCK_NONE, + .help = "No file locking", + }, + { .ival = "exclusive", + .oval = FILE_LOCK_EXCLUSIVE, + .help = "Exclusive file lock", + }, + { + .ival = "readwrite", + .oval = FILE_LOCK_READWRITE, + .help = "Read vs write lock", + }, + }, + }, + { + .name = "opendir", + .lname = "Open directory", + .type = FIO_OPT_STR_STORE, + .off1 = offsetof(struct thread_options, opendir), + .cb = str_opendir_cb, + .help = "Recursively add files from this directory and down", + .category = FIO_OPT_C_FILE, + .group = FIO_OPT_G_FILENAME, + }, + { + .name = "rw", + .lname = "Read/write", + .alias = "readwrite", + .type = FIO_OPT_STR, + .cb = str_rw_cb, + .off1 = offsetof(struct thread_options, td_ddir), + .help = "IO direction", + .def = "read", + .verify = rw_verify, + .category = FIO_OPT_C_IO, + .group = FIO_OPT_G_IO_BASIC, + .posval = { + { .ival = "read", + .oval = TD_DDIR_READ, + .help = "Sequential read", + }, + { .ival = "write", + .oval = TD_DDIR_WRITE, + .help = "Sequential write", + }, + { .ival = "trim", + .oval = TD_DDIR_TRIM, + .help = "Sequential trim", + }, + { .ival = "randread", + .oval = TD_DDIR_RANDREAD, + .help = "Random read", + }, + { .ival = "randwrite", + .oval = TD_DDIR_RANDWRITE, + .help = "Random write", + }, + { .ival = "randtrim", + .oval = TD_DDIR_RANDTRIM, + .help = "Random trim", + }, + { .ival = "rw", + .oval = TD_DDIR_RW, + .help = "Sequential read and write mix", + }, + { .ival = "readwrite", + .oval = TD_DDIR_RW, + .help = "Sequential read and write mix", + }, + { .ival = "randrw", + .oval = TD_DDIR_RANDRW, + .help = "Random read and write mix" + }, + { .ival = "trimwrite", + .oval = TD_DDIR_TRIMWRITE, + .help = "Trim and write mix, trims preceding writes" + }, + }, + }, + { + .name = "rw_sequencer", + .lname = "RW Sequencer", + .type = FIO_OPT_STR, + .off1 = offsetof(struct thread_options, rw_seq), + .help = "IO offset generator modifier", + .def = "sequential", + .category = FIO_OPT_C_IO, + .group = FIO_OPT_G_IO_BASIC, + .posval = { + { .ival = "sequential", + .oval = RW_SEQ_SEQ, + .help = "Generate sequential offsets", + }, + { .ival = "identical", + .oval = RW_SEQ_IDENT, + .help = "Generate identical offsets", + }, + }, + }, + + { + .name = "ioengine", + .lname = "IO Engine", + .type = FIO_OPT_STR_STORE, + .off1 = offsetof(struct thread_options, ioengine), + .help = "IO engine to use", + .def = FIO_PREFERRED_ENGINE, + .category = FIO_OPT_C_IO, + .group = FIO_OPT_G_IO_BASIC, + .posval = { + { .ival = "sync", + .help = "Use read/write", + }, + { .ival = "psync", + .help = "Use pread/pwrite", + }, + { .ival = "vsync", + .help = "Use readv/writev", + }, +#ifdef CONFIG_PWRITEV + { .ival = "pvsync", + .help = "Use preadv/pwritev", + }, +#endif +#ifdef FIO_HAVE_PWRITEV2 + { .ival = "pvsync2", + .help = "Use preadv2/pwritev2", + }, +#endif +#ifdef CONFIG_LIBAIO + { .ival = "libaio", + .help = "Linux native asynchronous IO", + }, +#endif +#ifdef ARCH_HAVE_IOURING + { .ival = "io_uring", + .help = "Fast Linux native aio", + }, +#endif +#ifdef CONFIG_POSIXAIO + { .ival = "posixaio", + .help = "POSIX asynchronous IO", + }, +#endif +#ifdef 
CONFIG_SOLARISAIO + { .ival = "solarisaio", + .help = "Solaris native asynchronous IO", + }, +#endif +#ifdef CONFIG_WINDOWSAIO + { .ival = "windowsaio", + .help = "Windows native asynchronous IO" + }, +#endif +#ifdef CONFIG_RBD + { .ival = "rbd", + .help = "Rados Block Device asynchronous IO" + }, +#endif + { .ival = "mmap", + .help = "Memory mapped IO" + }, +#ifdef CONFIG_LINUX_SPLICE + { .ival = "splice", + .help = "splice/vmsplice based IO", + }, + { .ival = "netsplice", + .help = "splice/vmsplice to/from the network", + }, +#endif +#ifdef FIO_HAVE_SGIO + { .ival = "sg", + .help = "SCSI generic v3 IO", + }, +#endif + { .ival = "null", + .help = "Testing engine (no data transfer)", + }, + { .ival = "net", + .help = "Network IO", + }, + { .ival = "cpuio", + .help = "CPU cycle burner engine", + }, +#ifdef CONFIG_GUASI + { .ival = "guasi", + .help = "GUASI IO engine", + }, +#endif +#ifdef CONFIG_RDMA + { .ival = "rdma", + .help = "RDMA IO engine", + }, +#endif +#ifdef CONFIG_LINUX_EXT4_MOVE_EXTENT + { .ival = "e4defrag", + .help = "ext4 defrag engine", + }, +#endif +#ifdef CONFIG_LINUX_FALLOCATE + { .ival = "falloc", + .help = "fallocate() file based engine", + }, +#endif +#ifdef CONFIG_GFAPI + { .ival = "gfapi", + .help = "Glusterfs libgfapi(sync) based engine" + }, + { .ival = "gfapi_async", + .help = "Glusterfs libgfapi(async) based engine" + }, +#endif +#ifdef CONFIG_LIBHDFS + { .ival = "libhdfs", + .help = "Hadoop Distributed Filesystem (HDFS) engine" + }, +#endif +#ifdef CONFIG_PMEMBLK + { .ival = "pmemblk", + .help = "PMDK libpmemblk based IO engine", + }, + +#endif +#ifdef CONFIG_IME + { .ival = "ime_psync", + .help = "DDN's IME synchronous IO engine", + }, + { .ival = "ime_psyncv", + .help = "DDN's IME synchronous IO engine using iovecs", + }, + { .ival = "ime_aio", + .help = "DDN's IME asynchronous IO engine", + }, +#endif +#ifdef CONFIG_LINUX_DEVDAX + { .ival = "dev-dax", + .help = "DAX Device based IO engine", + }, +#endif + { + .ival = "filecreate", + .help = "File creation engine", + }, + { .ival = "external", + .help = "Load external engine (append name)", + .cb = str_ioengine_external_cb, + }, +#ifdef CONFIG_LIBPMEM + { .ival = "libpmem", + .help = "PMDK libpmem based IO engine", + }, +#endif +#ifdef CONFIG_HTTP + { .ival = "http", + .help = "HTTP (WebDAV/S3) IO engine", + }, +#endif + { .ival = "nbd", + .help = "Network Block Device (NBD) IO engine" + }, + }, + }, + { + .name = "iodepth", + .lname = "IO Depth", + .type = FIO_OPT_INT, + .off1 = offsetof(struct thread_options, iodepth), + .help = "Number of IO buffers to keep in flight", + .minval = 1, + .interval = 1, + .def = "1", + .category = FIO_OPT_C_IO, + .group = FIO_OPT_G_IO_BASIC, + }, + { + .name = "iodepth_batch", + .lname = "IO Depth batch", + .alias = "iodepth_batch_submit", + .type = FIO_OPT_INT, + .off1 = offsetof(struct thread_options, iodepth_batch), + .help = "Number of IO buffers to submit in one go", + .parent = "iodepth", + .hide = 1, + .interval = 1, + .def = "1", + .category = FIO_OPT_C_IO, + .group = FIO_OPT_G_IO_BASIC, + }, + { + .name = "iodepth_batch_complete_min", + .lname = "Min IO depth batch complete", + .alias = "iodepth_batch_complete", + .type = FIO_OPT_INT, + .off1 = offsetof(struct thread_options, iodepth_batch_complete_min), + .help = "Min number of IO buffers to retrieve in one go", + .parent = "iodepth", + .hide = 1, + .minval = 0, + .interval = 1, + .def = "1", + .category = FIO_OPT_C_IO, + .group = FIO_OPT_G_IO_BASIC, + }, + { + .name = "iodepth_batch_complete_max", + .lname = "Max IO 
depth batch complete", + .type = FIO_OPT_INT, + .off1 = offsetof(struct thread_options, iodepth_batch_complete_max), + .help = "Max number of IO buffers to retrieve in one go", + .parent = "iodepth", + .hide = 1, + .minval = 0, + .interval = 1, + .category = FIO_OPT_C_IO, + .group = FIO_OPT_G_IO_BASIC, + }, + { + .name = "iodepth_low", + .lname = "IO Depth batch low", + .type = FIO_OPT_INT, + .off1 = offsetof(struct thread_options, iodepth_low), + .help = "Low water mark for queuing depth", + .parent = "iodepth", + .hide = 1, + .interval = 1, + .category = FIO_OPT_C_IO, + .group = FIO_OPT_G_IO_BASIC, + }, + { + .name = "serialize_overlap", + .lname = "Serialize overlap", + .off1 = offsetof(struct thread_options, serialize_overlap), + .type = FIO_OPT_BOOL, + .help = "Wait for in-flight IOs that collide to complete", + .parent = "iodepth", + .def = "0", + .category = FIO_OPT_C_IO, + .group = FIO_OPT_G_IO_BASIC, + }, + { + .name = "io_submit_mode", + .lname = "IO submit mode", + .type = FIO_OPT_STR, + .off1 = offsetof(struct thread_options, io_submit_mode), + .help = "How IO submissions and completions are done", + .def = "inline", + .category = FIO_OPT_C_IO, + .group = FIO_OPT_G_IO_BASIC, + .posval = { + { .ival = "inline", + .oval = IO_MODE_INLINE, + .help = "Submit and complete IO inline", + }, + { .ival = "offload", + .oval = IO_MODE_OFFLOAD, + .help = "Offload submit and complete to threads", + }, + }, + }, + { + .name = "size", + .lname = "Size", + .type = FIO_OPT_STR_VAL, + .cb = str_size_cb, + .off1 = offsetof(struct thread_options, size), + .help = "Total size of device or files", + .interval = 1024 * 1024, + .category = FIO_OPT_C_IO, + .group = FIO_OPT_G_INVALID, + }, + { + .name = "io_size", + .alias = "io_limit", + .lname = "IO Size", + .type = FIO_OPT_STR_VAL, + .off1 = offsetof(struct thread_options, io_size), + .help = "Total size of I/O to be performed", + .interval = 1024 * 1024, + .category = FIO_OPT_C_IO, + .group = FIO_OPT_G_INVALID, + }, + { + .name = "fill_device", + .lname = "Fill device", + .alias = "fill_fs", + .type = FIO_OPT_BOOL, + .off1 = offsetof(struct thread_options, fill_device), + .help = "Write until an ENOSPC error occurs", + .def = "0", + .category = FIO_OPT_C_FILE, + .group = FIO_OPT_G_INVALID, + }, + { + .name = "filesize", + .lname = "File size", + .type = FIO_OPT_STR_VAL, + .off1 = offsetof(struct thread_options, file_size_low), + .off2 = offsetof(struct thread_options, file_size_high), + .minval = 1, + .help = "Size of individual files", + .interval = 1024 * 1024, + .category = FIO_OPT_C_FILE, + .group = FIO_OPT_G_INVALID, + }, + { + .name = "file_append", + .lname = "File append", + .type = FIO_OPT_BOOL, + .off1 = offsetof(struct thread_options, file_append), + .help = "IO will start at the end of the file(s)", + .def = "0", + .category = FIO_OPT_C_FILE, + .group = FIO_OPT_G_INVALID, + }, + { + .name = "offset", + .lname = "IO offset", + .alias = "fileoffset", + .type = FIO_OPT_STR_VAL, + .cb = str_offset_cb, + .off1 = offsetof(struct thread_options, start_offset), + .help = "Start IO from this offset", + .def = "0", + .interval = 1024 * 1024, + .category = FIO_OPT_C_IO, + .group = FIO_OPT_G_INVALID, + }, + { + .name = "offset_align", + .lname = "IO offset alignment", + .type = FIO_OPT_INT, + .off1 = offsetof(struct thread_options, start_offset_align), + .help = "Start IO from this offset alignment", + .def = "0", + .interval = 512, + .category = FIO_OPT_C_IO, + .group = FIO_OPT_G_INVALID, + }, + { + .name = "offset_increment", + .lname = "IO offset 
increment", + .type = FIO_OPT_STR_VAL, + .cb = str_offset_increment_cb, + .off1 = offsetof(struct thread_options, offset_increment), + .help = "What is the increment from one offset to the next", + .parent = "offset", + .hide = 1, + .def = "0", + .interval = 1024 * 1024, + .category = FIO_OPT_C_IO, + .group = FIO_OPT_G_INVALID, + }, + { + .name = "number_ios", + .lname = "Number of IOs to perform", + .type = FIO_OPT_STR_VAL, + .off1 = offsetof(struct thread_options, number_ios), + .help = "Force job completion after this number of IOs", + .def = "0", + .category = FIO_OPT_C_IO, + .group = FIO_OPT_G_INVALID, + }, + { + .name = "bs", + .lname = "Block size", + .alias = "blocksize", + .type = FIO_OPT_ULL, + .off1 = offsetof(struct thread_options, bs[DDIR_READ]), + .off2 = offsetof(struct thread_options, bs[DDIR_WRITE]), + .off3 = offsetof(struct thread_options, bs[DDIR_TRIM]), + .minval = 1, + .help = "Block size unit", + .def = "4096", + .parent = "rw", + .hide = 1, + .interval = 512, + .category = FIO_OPT_C_IO, + .group = FIO_OPT_G_INVALID, + }, + { + .name = "ba", + .lname = "Block size align", + .alias = "blockalign", + .type = FIO_OPT_ULL, + .off1 = offsetof(struct thread_options, ba[DDIR_READ]), + .off2 = offsetof(struct thread_options, ba[DDIR_WRITE]), + .off3 = offsetof(struct thread_options, ba[DDIR_TRIM]), + .minval = 1, + .help = "IO block offset alignment", + .parent = "rw", + .hide = 1, + .interval = 512, + .category = FIO_OPT_C_IO, + .group = FIO_OPT_G_INVALID, + }, + { + .name = "bsrange", + .lname = "Block size range", + .alias = "blocksize_range", + .type = FIO_OPT_RANGE, + .off1 = offsetof(struct thread_options, min_bs[DDIR_READ]), + .off2 = offsetof(struct thread_options, max_bs[DDIR_READ]), + .off3 = offsetof(struct thread_options, min_bs[DDIR_WRITE]), + .off4 = offsetof(struct thread_options, max_bs[DDIR_WRITE]), + .off5 = offsetof(struct thread_options, min_bs[DDIR_TRIM]), + .off6 = offsetof(struct thread_options, max_bs[DDIR_TRIM]), + .minval = 1, + .help = "Set block size range (in more detail than bs)", + .parent = "rw", + .hide = 1, + .interval = 4096, + .category = FIO_OPT_C_IO, + .group = FIO_OPT_G_INVALID, + }, + { + .name = "bssplit", + .lname = "Block size split", + .type = FIO_OPT_STR_ULL, + .cb = str_bssplit_cb, + .off1 = offsetof(struct thread_options, bssplit), + .help = "Set a specific mix of block sizes", + .parent = "rw", + .hide = 1, + .category = FIO_OPT_C_IO, + .group = FIO_OPT_G_INVALID, + }, + { + .name = "bs_unaligned", + .lname = "Block size unaligned", + .alias = "blocksize_unaligned", + .type = FIO_OPT_STR_SET, + .off1 = offsetof(struct thread_options, bs_unaligned), + .help = "Don't sector align IO buffer sizes", + .parent = "rw", + .hide = 1, + .category = FIO_OPT_C_IO, + .group = FIO_OPT_G_INVALID, + }, + { + .name = "bs_is_seq_rand", + .lname = "Block size division is seq/random (not read/write)", + .type = FIO_OPT_BOOL, + .off1 = offsetof(struct thread_options, bs_is_seq_rand), + .help = "Consider any blocksize setting to be sequential,random", + .def = "0", + .parent = "blocksize", + .category = FIO_OPT_C_IO, + .group = FIO_OPT_G_INVALID, + }, + { + .name = "randrepeat", + .lname = "Random repeatable", + .type = FIO_OPT_BOOL, + .off1 = offsetof(struct thread_options, rand_repeatable), + .help = "Use repeatable random IO pattern", + .def = "1", + .parent = "rw", + .hide = 1, + .category = FIO_OPT_C_IO, + .group = FIO_OPT_G_RANDOM, + }, + { + .name = "randseed", + .lname = "The random generator seed", + .type = FIO_OPT_STR_VAL, + .off1 = 
offsetof(struct thread_options, rand_seed), + .help = "Set the random generator seed value", + .def = "0x89", + .parent = "rw", + .category = FIO_OPT_C_IO, + .group = FIO_OPT_G_RANDOM, + }, + { + .name = "norandommap", + .lname = "No randommap", + .type = FIO_OPT_STR_SET, + .off1 = offsetof(struct thread_options, norandommap), + .help = "Accept potential duplicate random blocks", + .parent = "rw", + .hide = 1, + .hide_on_set = 1, + .category = FIO_OPT_C_IO, + .group = FIO_OPT_G_RANDOM, + }, + { + .name = "softrandommap", + .lname = "Soft randommap", + .type = FIO_OPT_BOOL, + .off1 = offsetof(struct thread_options, softrandommap), + .help = "Set norandommap if randommap allocation fails", + .parent = "norandommap", + .hide = 1, + .def = "0", + .category = FIO_OPT_C_IO, + .group = FIO_OPT_G_RANDOM, + }, + { + .name = "random_generator", + .lname = "Random Generator", + .type = FIO_OPT_STR, + .off1 = offsetof(struct thread_options, random_generator), + .help = "Type of random number generator to use", + .def = "tausworthe", + .posval = { + { .ival = "tausworthe", + .oval = FIO_RAND_GEN_TAUSWORTHE, + .help = "Strong Tausworthe generator", + }, + { .ival = "lfsr", + .oval = FIO_RAND_GEN_LFSR, + .help = "Variable length LFSR", + }, + { + .ival = "tausworthe64", + .oval = FIO_RAND_GEN_TAUSWORTHE64, + .help = "64-bit Tausworthe variant", + }, + }, + .category = FIO_OPT_C_IO, + .group = FIO_OPT_G_RANDOM, + }, + { + .name = "random_distribution", + .lname = "Random Distribution", + .type = FIO_OPT_STR, + .off1 = offsetof(struct thread_options, random_distribution), + .cb = str_random_distribution_cb, + .help = "Random offset distribution generator", + .def = "random", + .posval = { + { .ival = "random", + .oval = FIO_RAND_DIST_RANDOM, + .help = "Completely random", + }, + { .ival = "zipf", + .oval = FIO_RAND_DIST_ZIPF, + .help = "Zipf distribution", + }, + { .ival = "pareto", + .oval = FIO_RAND_DIST_PARETO, + .help = "Pareto distribution", + }, + { .ival = "normal", + .oval = FIO_RAND_DIST_GAUSS, + .help = "Normal (Gaussian) distribution", + }, + { .ival = "zoned", + .oval = FIO_RAND_DIST_ZONED, + .help = "Zoned random distribution", + }, + { .ival = "zoned_abs", + .oval = FIO_RAND_DIST_ZONED_ABS, + .help = "Zoned absolute random distribution", + }, + }, + .category = FIO_OPT_C_IO, + .group = FIO_OPT_G_RANDOM, + }, + { + .name = "percentage_random", + .lname = "Percentage Random", + .type = FIO_OPT_INT, + .off1 = offsetof(struct thread_options, perc_rand[DDIR_READ]), + .off2 = offsetof(struct thread_options, perc_rand[DDIR_WRITE]), + .off3 = offsetof(struct thread_options, perc_rand[DDIR_TRIM]), + .maxval = 100, + .help = "Percentage of seq/random mix that should be random", + .def = "100,100,100", + .interval = 5, + .inverse = "percentage_sequential", + .category = FIO_OPT_C_IO, + .group = FIO_OPT_G_RANDOM, + }, + { + .name = "percentage_sequential", + .lname = "Percentage Sequential", + .type = FIO_OPT_DEPRECATED, + .category = FIO_OPT_C_IO, + .group = FIO_OPT_G_RANDOM, + }, + { + .name = "allrandrepeat", + .lname = "All Random Repeat", + .type = FIO_OPT_BOOL, + .off1 = offsetof(struct thread_options, allrand_repeatable), + .help = "Use repeatable random numbers for everything", + .def = "0", + .category = FIO_OPT_C_IO, + .group = FIO_OPT_G_RANDOM, + }, + { + .name = "nrfiles", + .lname = "Number of files", + .alias = "nr_files", + .type = FIO_OPT_INT, + .off1 = offsetof(struct thread_options, nr_files), + .help = "Split job workload between this number of files", + .def = "1", + .interval = 1, + 
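/*
 * Aside on the percentage_random entry above: it stores the seq/random
 * mix with one value per data direction (read, write, trim -- hence the
 * "100,100,100" default), and its .inverse field names
 * percentage_sequential as the logical complement, which is why
 * percentage_sequential itself survives only as a FIO_OPT_DEPRECATED
 * stub that stores nothing.
 */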
.category = FIO_OPT_C_FILE, + .group = FIO_OPT_G_INVALID, + }, + { + .name = "openfiles", + .lname = "Number of open files", + .type = FIO_OPT_INT, + .off1 = offsetof(struct thread_options, open_files), + .help = "Number of files to keep open at the same time", + .category = FIO_OPT_C_FILE, + .group = FIO_OPT_G_INVALID, + }, + { + .name = "file_service_type", + .lname = "File service type", + .type = FIO_OPT_STR, + .cb = str_fst_cb, + .off1 = offsetof(struct thread_options, file_service_type), + .help = "How to select which file to service next", + .def = "roundrobin", + .category = FIO_OPT_C_FILE, + .group = FIO_OPT_G_INVALID, + .posval = { + { .ival = "random", + .oval = FIO_FSERVICE_RANDOM, + .help = "Choose a file at random (uniform)", + }, + { .ival = "zipf", + .oval = FIO_FSERVICE_ZIPF, + .help = "Zipf randomized", + }, + { .ival = "pareto", + .oval = FIO_FSERVICE_PARETO, + .help = "Pareto randomized", + }, + { .ival = "normal", + .oval = FIO_FSERVICE_GAUSS, + .help = "Normal (Gaussian) randomized", + }, + { .ival = "gauss", + .oval = FIO_FSERVICE_GAUSS, + .help = "Alias for normal", + }, + { .ival = "roundrobin", + .oval = FIO_FSERVICE_RR, + .help = "Round robin select files", + }, + { .ival = "sequential", + .oval = FIO_FSERVICE_SEQ, + .help = "Finish one file before moving to the next", + }, + }, + .parent = "nrfiles", + .hide = 1, + }, + { + .name = "fallocate", + .lname = "Fallocate", + .type = FIO_OPT_STR, + .off1 = offsetof(struct thread_options, fallocate_mode), + .help = "Whether pre-allocation is performed when laying out files", +#ifdef FIO_HAVE_DEFAULT_FALLOCATE + .def = "native", +#else + .def = "none", +#endif + .category = FIO_OPT_C_FILE, + .group = FIO_OPT_G_INVALID, + .posval = { + { .ival = "none", + .oval = FIO_FALLOCATE_NONE, + .help = "Do not pre-allocate space", + }, + { .ival = "native", + .oval = FIO_FALLOCATE_NATIVE, + .help = "Use native pre-allocation if possible", + }, +#ifdef CONFIG_POSIX_FALLOCATE + { .ival = "posix", + .oval = FIO_FALLOCATE_POSIX, + .help = "Use posix_fallocate()", + }, +#endif +#ifdef CONFIG_LINUX_FALLOCATE + { .ival = "keep", + .oval = FIO_FALLOCATE_KEEP_SIZE, + .help = "Use fallocate(..., FALLOC_FL_KEEP_SIZE, ...)", + }, +#endif + { .ival = "truncate", + .oval = FIO_FALLOCATE_TRUNCATE, + .help = "Truncate file to final size instead of allocating" + }, + /* Compatibility with former boolean values */ + { .ival = "0", + .oval = FIO_FALLOCATE_NONE, + .help = "Alias for 'none'", + }, +#ifdef CONFIG_POSIX_FALLOCATE + { .ival = "1", + .oval = FIO_FALLOCATE_POSIX, + .help = "Alias for 'posix'", + }, +#endif + }, + }, + { + .name = "fadvise_hint", + .lname = "Fadvise hint", + .type = FIO_OPT_STR, + .off1 = offsetof(struct thread_options, fadvise_hint), + .posval = { + { .ival = "0", + .oval = F_ADV_NONE, + .help = "Don't issue fadvise/madvise", + }, + { .ival = "1", + .oval = F_ADV_TYPE, + .help = "Advise using fio IO pattern", + }, + { .ival = "random", + .oval = F_ADV_RANDOM, + .help = "Advise using FADV_RANDOM", + }, + { .ival = "sequential", + .oval = F_ADV_SEQUENTIAL, + .help = "Advise using FADV_SEQUENTIAL", + }, + }, + .help = "Use fadvise() to advise the kernel on IO pattern", + .def = "1", + .category = FIO_OPT_C_FILE, + .group = FIO_OPT_G_INVALID, + }, + { + .name = "fsync", + .lname = "Fsync", + .type = FIO_OPT_INT, + .off1 = offsetof(struct thread_options, fsync_blocks), + .help = "Issue fsync for writes every given number of blocks", + .def = "0", + .interval = 1, + .category = FIO_OPT_C_FILE, + .group = FIO_OPT_G_INVALID, + }, 
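The file-related options above (nrfiles, file_service_type, fallocate, fsync) are all set from ordinary job files. A minimal sketch, with illustrative values only; every option name comes from the table in this patch, and fallocate=posix assumes a build with CONFIG_POSIX_FALLOCATE:

# hypothetical job file, values for illustration only
[file-options-demo]
rw=randwrite
size=64m
# split this job's workload across four files
nrfiles=4
# pick the next file to service with a Pareto-distributed choice
file_service_type=pareto
# pre-allocate file space via posix_fallocate()
fallocate=posix
# issue an fsync after every 32 written blocks
fsync=32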
+ { + .name = "fdatasync", + .lname = "Fdatasync", + .type = FIO_OPT_INT, + .off1 = offsetof(struct thread_options, fdatasync_blocks), + .help = "Issue fdatasync for writes every given number of blocks", + .def = "0", + .interval = 1, + .category = FIO_OPT_C_FILE, + .group = FIO_OPT_G_INVALID, + }, + { + .name = "write_barrier", + .lname = "Write barrier", + .type = FIO_OPT_INT, + .off1 = offsetof(struct thread_options, barrier_blocks), + .help = "Make every Nth write a barrier write", + .def = "0", + .interval = 1, + .category = FIO_OPT_C_IO, + .group = FIO_OPT_G_INVALID, + }, +#ifdef CONFIG_SYNC_FILE_RANGE + { + .name = "sync_file_range", + .lname = "Sync file range", + .posval = { + { .ival = "wait_before", + .oval = SYNC_FILE_RANGE_WAIT_BEFORE, + .help = "SYNC_FILE_RANGE_WAIT_BEFORE", + .orval = 1, + }, + { .ival = "write", + .oval = SYNC_FILE_RANGE_WRITE, + .help = "SYNC_FILE_RANGE_WRITE", + .orval = 1, + }, + { + .ival = "wait_after", + .oval = SYNC_FILE_RANGE_WAIT_AFTER, + .help = "SYNC_FILE_RANGE_WAIT_AFTER", + .orval = 1, + }, + }, + .type = FIO_OPT_STR_MULTI, + .cb = str_sfr_cb, + .off1 = offsetof(struct thread_options, sync_file_range), + .help = "Use sync_file_range()", + .category = FIO_OPT_C_FILE, + .group = FIO_OPT_G_INVALID, + }, +#else + { + .name = "sync_file_range", + .lname = "Sync file range", + .type = FIO_OPT_UNSUPPORTED, + .help = "Your platform does not support sync_file_range", + }, +#endif + { + .name = "direct", + .lname = "Direct I/O", + .type = FIO_OPT_BOOL, + .off1 = offsetof(struct thread_options, odirect), + .help = "Use O_DIRECT IO (negates buffered)", + .def = "0", + .inverse = "buffered", + .category = FIO_OPT_C_IO, + .group = FIO_OPT_G_IO_TYPE, + }, + { + .name = "atomic", + .lname = "Atomic I/O", + .type = FIO_OPT_BOOL, + .off1 = offsetof(struct thread_options, oatomic), + .help = "Use Atomic IO with O_DIRECT (implies O_DIRECT)", + .def = "0", + .category = FIO_OPT_C_IO, + .group = FIO_OPT_G_IO_TYPE, + }, + { + .name = "buffered", + .lname = "Buffered I/O", + .type = FIO_OPT_BOOL, + .off1 = offsetof(struct thread_options, odirect), + .neg = 1, + .help = "Use buffered IO (negates direct)", + .def = "1", + .inverse = "direct", + .category = FIO_OPT_C_IO, + .group = FIO_OPT_G_IO_TYPE, + }, + { + .name = "overwrite", + .lname = "Overwrite", + .type = FIO_OPT_BOOL, + .off1 = offsetof(struct thread_options, overwrite), + .help = "When writing, set whether to overwrite current data", + .def = "0", + .category = FIO_OPT_C_FILE, + .group = FIO_OPT_G_INVALID, + }, + { + .name = "loops", + .lname = "Loops", + .type = FIO_OPT_INT, + .off1 = offsetof(struct thread_options, loops), + .help = "Number of times to run the job", + .def = "1", + .interval = 1, + .category = FIO_OPT_C_GENERAL, + .group = FIO_OPT_G_RUNTIME, + }, + { + .name = "numjobs", + .lname = "Number of jobs", + .type = FIO_OPT_INT, + .off1 = offsetof(struct thread_options, numjobs), + .help = "Duplicate this job this many times", + .def = "1", + .interval = 1, + .category = FIO_OPT_C_GENERAL, + .group = FIO_OPT_G_RUNTIME, + }, + { + .name = "startdelay", + .lname = "Start delay", + .type = FIO_OPT_STR_VAL_TIME, + .off1 = offsetof(struct thread_options, start_delay), + .off2 = offsetof(struct thread_options, start_delay_high), + .help = "Only start job when this period has passed", + .def = "0", + .is_seconds = 1, + .is_time = 1, + .category = FIO_OPT_C_GENERAL, + .group = FIO_OPT_G_RUNTIME, + }, + { + .name = "runtime", + .lname = "Runtime", + .alias = "timeout", + .type = FIO_OPT_STR_VAL_TIME, + 
.off1 = offsetof(struct thread_options, timeout), + .help = "Stop workload when this amount of time has passed", + .def = "0", + .is_seconds = 1, + .is_time = 1, + .category = FIO_OPT_C_GENERAL, + .group = FIO_OPT_G_RUNTIME, + }, + { + .name = "time_based", + .lname = "Time based", + .type = FIO_OPT_STR_SET, + .off1 = offsetof(struct thread_options, time_based), + .help = "Keep running until runtime/timeout is met", + .category = FIO_OPT_C_GENERAL, + .group = FIO_OPT_G_RUNTIME, + }, + { + .name = "verify_only", + .lname = "Verify only", + .type = FIO_OPT_STR_SET, + .off1 = offsetof(struct thread_options, verify_only), + .help = "Verifies previously written data is still valid", + .category = FIO_OPT_C_GENERAL, + .group = FIO_OPT_G_RUNTIME, + }, + { + .name = "ramp_time", + .lname = "Ramp time", + .type = FIO_OPT_STR_VAL_TIME, + .off1 = offsetof(struct thread_options, ramp_time), + .help = "Ramp up time before measuring performance", + .is_seconds = 1, + .is_time = 1, + .category = FIO_OPT_C_GENERAL, + .group = FIO_OPT_G_RUNTIME, + }, + { + .name = "clocksource", + .lname = "Clock source", + .type = FIO_OPT_STR, + .cb = fio_clock_source_cb, + .off1 = offsetof(struct thread_options, clocksource), + .help = "What type of timing source to use", + .category = FIO_OPT_C_GENERAL, + .group = FIO_OPT_G_CLOCK, + .posval = { +#ifdef CONFIG_GETTIMEOFDAY + { .ival = "gettimeofday", + .oval = CS_GTOD, + .help = "Use gettimeofday(2) for timing", + }, +#endif +#ifdef CONFIG_CLOCK_GETTIME + { .ival = "clock_gettime", + .oval = CS_CGETTIME, + .help = "Use clock_gettime(2) for timing", + }, +#endif +#ifdef ARCH_HAVE_CPU_CLOCK + { .ival = "cpu", + .oval = CS_CPUCLOCK, + .help = "Use CPU private clock", + }, +#endif + }, + }, + { + .name = "mem", + .alias = "iomem", + .lname = "I/O Memory", + .type = FIO_OPT_STR, + .cb = str_mem_cb, + .off1 = offsetof(struct thread_options, mem_type), + .help = "Backing type for IO buffers", + .def = "malloc", + .category = FIO_OPT_C_IO, + .group = FIO_OPT_G_INVALID, + .posval = { + { .ival = "malloc", + .oval = MEM_MALLOC, + .help = "Use malloc(3) for IO buffers", + }, +#ifndef CONFIG_NO_SHM + { .ival = "shm", + .oval = MEM_SHM, + .help = "Use shared memory segments for IO buffers", + }, +#ifdef FIO_HAVE_HUGETLB + { .ival = "shmhuge", + .oval = MEM_SHMHUGE, + .help = "Like shm, but use huge pages", + }, +#endif +#endif + { .ival = "mmap", + .oval = MEM_MMAP, + .help = "Use mmap(2) (file or anon) for IO buffers", + }, + { .ival = "mmapshared", + .oval = MEM_MMAPSHARED, + .help = "Like mmap, but use the shared flag", + }, +#ifdef FIO_HAVE_HUGETLB + { .ival = "mmaphuge", + .oval = MEM_MMAPHUGE, + .help = "Like mmap, but use huge pages", + }, +#endif +#ifdef CONFIG_CUDA + { .ival = "cudamalloc", + .oval = MEM_CUDA_MALLOC, + .help = "Allocate GPU device memory for GPUDirect RDMA", + }, +#endif + }, + }, + { + .name = "iomem_align", + .alias = "mem_align", + .lname = "I/O memory alignment", + .type = FIO_OPT_INT, + .off1 = offsetof(struct thread_options, mem_align), + .minval = 0, + .help = "IO memory buffer offset alignment", + .def = "0", + .parent = "iomem", + .hide = 1, + .category = FIO_OPT_C_IO, + .group = FIO_OPT_G_INVALID, + }, + { + .name = "verify", + .lname = "Verify", + .type = FIO_OPT_STR, + .off1 = offsetof(struct thread_options, verify), + .help = "Verify data written", + .def = "0", + .category = FIO_OPT_C_IO, + .group = FIO_OPT_G_VERIFY, + .posval = { + { .ival = "0", + .oval = VERIFY_NONE, + .help = "Don't do IO verification", + }, + { .ival = "md5", + .oval = 
VERIFY_MD5, + .help = "Use md5 checksums for verification", + }, + { .ival = "crc64", + .oval = VERIFY_CRC64, + .help = "Use crc64 checksums for verification", + }, + { .ival = "crc32", + .oval = VERIFY_CRC32, + .help = "Use crc32 checksums for verification", + }, + { .ival = "crc32c-intel", + .oval = VERIFY_CRC32C, + .help = "Use crc32c checksums for verification (hw assisted, if available)", + }, + { .ival = "crc32c", + .oval = VERIFY_CRC32C, + .help = "Use crc32c checksums for verification (hw assisted, if available)", + }, + { .ival = "crc16", + .oval = VERIFY_CRC16, + .help = "Use crc16 checksums for verification", + }, + { .ival = "crc7", + .oval = VERIFY_CRC7, + .help = "Use crc7 checksums for verification", + }, + { .ival = "sha1", + .oval = VERIFY_SHA1, + .help = "Use sha1 checksums for verification", + }, + { .ival = "sha256", + .oval = VERIFY_SHA256, + .help = "Use sha256 checksums for verification", + }, + { .ival = "sha512", + .oval = VERIFY_SHA512, + .help = "Use sha512 checksums for verification", + }, + { .ival = "sha3-224", + .oval = VERIFY_SHA3_224, + .help = "Use sha3-224 checksums for verification", + }, + { .ival = "sha3-256", + .oval = VERIFY_SHA3_256, + .help = "Use sha3-256 checksums for verification", + }, + { .ival = "sha3-384", + .oval = VERIFY_SHA3_384, + .help = "Use sha3-384 checksums for verification", + }, + { .ival = "sha3-512", + .oval = VERIFY_SHA3_512, + .help = "Use sha3-512 checksums for verification", + }, + { .ival = "xxhash", + .oval = VERIFY_XXHASH, + .help = "Use xxhash checksums for verification", + }, + /* Meta information was included in verify_header; + * 'meta' verification is implied by default. */ + { .ival = "meta", + .oval = VERIFY_HDR_ONLY, + .help = "Use IO information for verification. " + "This is now implied by default, so the option is obsolete; " + "don't use it", + }, + { .ival = "pattern", + .oval = VERIFY_PATTERN_NO_HDR, + .help = "Verify strict pattern", + }, + { + .ival = "null", + .oval = VERIFY_NULL, + .help = "Pretend to verify", + }, + }, + }, + { + .name = "do_verify", + .lname = "Perform verify step", + .type = FIO_OPT_BOOL, + .off1 = offsetof(struct thread_options, do_verify), + .help = "Run verification stage after write", + .def = "1", + .parent = "verify", + .hide = 1, + .category = FIO_OPT_C_IO, + .group = FIO_OPT_G_VERIFY, + }, + { + .name = "verifysort", + .lname = "Verify sort", + .type = FIO_OPT_SOFT_DEPRECATED, + .category = FIO_OPT_C_IO, + .group = FIO_OPT_G_VERIFY, + }, + { + .name = "verifysort_nr", + .lname = "Verify Sort Nr", + .type = FIO_OPT_SOFT_DEPRECATED, + .category = FIO_OPT_C_IO, + .group = FIO_OPT_G_VERIFY, + }, + { + .name = "verify_interval", + .lname = "Verify interval", + .type = FIO_OPT_INT, + .off1 = offsetof(struct thread_options, verify_interval), + .minval = 2 * sizeof(struct verify_header), + .help = "Store verify buffer header every N bytes", + .parent = "verify", + .hide = 1, + .interval = 2 * sizeof(struct verify_header), + .category = FIO_OPT_C_IO, + .group = FIO_OPT_G_VERIFY, + }, + { + .name = "verify_offset", + .lname = "Verify offset", + .type = FIO_OPT_INT, + .help = "Offset verify header location by N bytes", + .off1 = offsetof(struct thread_options, verify_offset), + .minval = sizeof(struct verify_header), + .parent = "verify", + .hide = 1, + .category = FIO_OPT_C_IO, + .group = FIO_OPT_G_VERIFY, + }, + { + .name = "verify_pattern", + .lname = "Verify pattern", + .type = FIO_OPT_STR, + .cb = str_verify_pattern_cb, + .off1 = offsetof(struct thread_options, verify_pattern), + .help = 
"Fill pattern for IO buffers", + .parent = "verify", + .hide = 1, + .category = FIO_OPT_C_IO, + .group = FIO_OPT_G_VERIFY, + }, + { + .name = "verify_fatal", + .lname = "Verify fatal", + .type = FIO_OPT_BOOL, + .off1 = offsetof(struct thread_options, verify_fatal), + .def = "0", + .help = "Exit on a single verify failure, don't continue", + .parent = "verify", + .hide = 1, + .category = FIO_OPT_C_IO, + .group = FIO_OPT_G_VERIFY, + }, + { + .name = "verify_dump", + .lname = "Verify dump", + .type = FIO_OPT_BOOL, + .off1 = offsetof(struct thread_options, verify_dump), + .def = "0", + .help = "Dump contents of good and bad blocks on failure", + .parent = "verify", + .hide = 1, + .category = FIO_OPT_C_IO, + .group = FIO_OPT_G_VERIFY, + }, + { + .name = "verify_async", + .lname = "Verify asynchronously", + .type = FIO_OPT_INT, + .off1 = offsetof(struct thread_options, verify_async), + .def = "0", + .help = "Number of async verifier threads to use", + .parent = "verify", + .hide = 1, + .category = FIO_OPT_C_IO, + .group = FIO_OPT_G_VERIFY, + }, + { + .name = "verify_backlog", + .lname = "Verify backlog", + .type = FIO_OPT_STR_VAL, + .off1 = offsetof(struct thread_options, verify_backlog), + .help = "Verify after this number of blocks are written", + .parent = "verify", + .hide = 1, + .category = FIO_OPT_C_IO, + .group = FIO_OPT_G_VERIFY, + }, + { + .name = "verify_backlog_batch", + .lname = "Verify backlog batch", + .type = FIO_OPT_INT, + .off1 = offsetof(struct thread_options, verify_batch), + .help = "Verify this number of IO blocks", + .parent = "verify", + .hide = 1, + .category = FIO_OPT_C_IO, + .group = FIO_OPT_G_VERIFY, + }, +#ifdef FIO_HAVE_CPU_AFFINITY + { + .name = "verify_async_cpus", + .lname = "Async verify CPUs", + .type = FIO_OPT_STR, + .cb = str_verify_cpus_allowed_cb, + .off1 = offsetof(struct thread_options, verify_cpumask), + .help = "Set CPUs allowed for async verify threads", + .parent = "verify_async", + .hide = 1, + .category = FIO_OPT_C_IO, + .group = FIO_OPT_G_VERIFY, + }, +#else + { + .name = "verify_async_cpus", + .lname = "Async verify CPUs", + .type = FIO_OPT_UNSUPPORTED, + .help = "Your platform does not support CPU affinities", + }, +#endif + { + .name = "experimental_verify", + .lname = "Experimental Verify", + .off1 = offsetof(struct thread_options, experimental_verify), + .type = FIO_OPT_BOOL, + .help = "Enable experimental verification", + .parent = "verify", + .category = FIO_OPT_C_IO, + .group = FIO_OPT_G_VERIFY, + }, + { + .name = "verify_state_load", + .lname = "Load verify state", + .off1 = offsetof(struct thread_options, verify_state), + .type = FIO_OPT_BOOL, + .help = "Load verify termination state", + .parent = "verify", + .category = FIO_OPT_C_IO, + .group = FIO_OPT_G_VERIFY, + }, + { + .name = "verify_state_save", + .lname = "Save verify state", + .off1 = offsetof(struct thread_options, verify_state_save), + .type = FIO_OPT_BOOL, + .def = "1", + .help = "Save verify state on termination", + .parent = "verify", + .category = FIO_OPT_C_IO, + .group = FIO_OPT_G_VERIFY, + }, +#ifdef FIO_HAVE_TRIM + { + .name = "trim_percentage", + .lname = "Trim percentage", + .type = FIO_OPT_INT, + .off1 = offsetof(struct thread_options, trim_percentage), + .minval = 0, + .maxval = 100, + .help = "Number of verify blocks to trim (i.e., discard)", + .parent = "verify", + .def = "0", + .interval = 1, + .hide = 1, + .category = FIO_OPT_C_IO, + .group = FIO_OPT_G_TRIM, + }, + { + .name = "trim_verify_zero", + .lname = "Verify trim zero", + .type = FIO_OPT_BOOL, + .help = 
"Verify that trimmed (i.e., discarded) blocks are returned as zeroes", + .off1 = offsetof(struct thread_options, trim_zero), + .parent = "trim_percentage", + .hide = 1, + .def = "1", + .category = FIO_OPT_C_IO, + .group = FIO_OPT_G_TRIM, + }, + { + .name = "trim_backlog", + .lname = "Trim backlog", + .type = FIO_OPT_STR_VAL, + .off1 = offsetof(struct thread_options, trim_backlog), + .help = "Trim after this number of blocks are written", + .parent = "trim_percentage", + .hide = 1, + .interval = 1, + .category = FIO_OPT_C_IO, + .group = FIO_OPT_G_TRIM, + }, + { + .name = "trim_backlog_batch", + .lname = "Trim backlog batch", + .type = FIO_OPT_INT, + .off1 = offsetof(struct thread_options, trim_batch), + .help = "Trim this number of IO blocks", + .parent = "trim_percentage", + .hide = 1, + .interval = 1, + .category = FIO_OPT_C_IO, + .group = FIO_OPT_G_TRIM, + }, +#else + { + .name = "trim_percentage", + .lname = "Trim percentage", + .type = FIO_OPT_UNSUPPORTED, + .help = "Fio does not support TRIM on your platform", + }, + { + .name = "trim_verify_zero", + .lname = "Verify trim zero", + .type = FIO_OPT_UNSUPPORTED, + .help = "Fio does not support TRIM on your platform", + }, + { + .name = "trim_backlog", + .lname = "Trim backlog", + .type = FIO_OPT_UNSUPPORTED, + .help = "Fio does not support TRIM on your platform", + }, + { + .name = "trim_backlog_batch", + .lname = "Trim backlog batch", + .type = FIO_OPT_UNSUPPORTED, + .help = "Fio does not support TRIM on your platform", + }, +#endif + { + .name = "write_iolog", + .lname = "Write I/O log", + .type = FIO_OPT_STR_STORE, + .off1 = offsetof(struct thread_options, write_iolog_file), + .help = "Store IO pattern to file", + .category = FIO_OPT_C_IO, + .group = FIO_OPT_G_IOLOG, + }, + { + .name = "read_iolog", + .lname = "Read I/O log", + .type = FIO_OPT_STR_STORE, + .off1 = offsetof(struct thread_options, read_iolog_file), + .help = "Play back IO pattern from file", + .category = FIO_OPT_C_IO, + .group = FIO_OPT_G_IOLOG, + }, + { + .name = "read_iolog_chunked", + .lname = "Read I/O log in parts", + .type = FIO_OPT_BOOL, + .off1 = offsetof(struct thread_options, read_iolog_chunked), + .def = "0", + .parent = "read_iolog", + .help = "Parse IO pattern in chunks", + .category = FIO_OPT_C_IO, + .group = FIO_OPT_G_IOLOG, + }, + { + .name = "replay_no_stall", + .lname = "Don't stall on replay", + .type = FIO_OPT_BOOL, + .off1 = offsetof(struct thread_options, no_stall), + .def = "0", + .parent = "read_iolog", + .hide = 1, + .help = "Play back IO pattern file as fast as possible without stalls", + .category = FIO_OPT_C_IO, + .group = FIO_OPT_G_IOLOG, + }, + { + .name = "replay_redirect", + .lname = "Redirect device for replay", + .type = FIO_OPT_STR_STORE, + .off1 = offsetof(struct thread_options, replay_redirect), + .parent = "read_iolog", + .hide = 1, + .help = "Replay all I/O onto this device, regardless of trace device", + .category = FIO_OPT_C_IO, + .group = FIO_OPT_G_IOLOG, + }, + { + .name = "replay_scale", + .lname = "Replay offset scale factor", + .type = FIO_OPT_INT, + .off1 = offsetof(struct thread_options, replay_scale), + .parent = "read_iolog", + .def = "1", + .help = "Scale offset down by this factor", + .category = FIO_OPT_C_IO, + .group = FIO_OPT_G_IOLOG, + }, + { + .name = "replay_align", + .lname = "Replay alignment", + .type = FIO_OPT_INT, + .off1 = offsetof(struct thread_options, replay_align), + .parent = "read_iolog", + .help = "Align offsets to this blocksize", + .category = FIO_OPT_C_IO, + .group = FIO_OPT_G_IOLOG, + .pow2 = 1, 
+ }, + { + .name = "replay_time_scale", + .lname = "Replay Time Scale", + .type = FIO_OPT_INT, + .off1 = offsetof(struct thread_options, replay_time_scale), + .def = "100", + .minval = 1, + .parent = "read_iolog", + .hide = 1, + .help = "Scale time for replay events", + .category = FIO_OPT_C_IO, + .group = FIO_OPT_G_IOLOG, + }, + { + .name = "replay_skip", + .lname = "Replay Skip", + .type = FIO_OPT_STR, + .cb = str_replay_skip_cb, + .off1 = offsetof(struct thread_options, replay_skip), + .parent = "read_iolog", + .help = "Skip certain IO types (read,write,trim,flush)", + .category = FIO_OPT_C_IO, + .group = FIO_OPT_G_IOLOG, + }, + { + .name = "merge_blktrace_file", + .lname = "Merged blktrace output filename", + .type = FIO_OPT_STR_STORE, + .off1 = offsetof(struct thread_options, merge_blktrace_file), + .help = "Merged blktrace output filename", + .category = FIO_OPT_C_IO, + .group = FIO_OPT_G_IOLOG, + }, + { + .name = "merge_blktrace_scalars", + .lname = "Percentage to scale each trace", + .type = FIO_OPT_FLOAT_LIST, + .off1 = offsetof(struct thread_options, merge_blktrace_scalars), + .maxlen = FIO_IO_U_LIST_MAX_LEN, + .help = "Percentage to scale each trace", + .category = FIO_OPT_C_IO, + .group = FIO_OPT_G_IOLOG, + }, + { + .name = "merge_blktrace_iters", + .lname = "Number of iterations to run per trace", + .type = FIO_OPT_FLOAT_LIST, + .off1 = offsetof(struct thread_options, merge_blktrace_iters), + .maxlen = FIO_IO_U_LIST_MAX_LEN, + .help = "Number of iterations to run per trace", + .category = FIO_OPT_C_IO, + .group = FIO_OPT_G_IOLOG, + }, + { + .name = "exec_prerun", + .lname = "Pre-execute runnable", + .type = FIO_OPT_STR_STORE, + .off1 = offsetof(struct thread_options, exec_prerun), + .help = "Execute this file prior to running job", + .category = FIO_OPT_C_GENERAL, + .group = FIO_OPT_G_INVALID, + }, + { + .name = "exec_postrun", + .lname = "Post-execute runnable", + .type = FIO_OPT_STR_STORE, + .off1 = offsetof(struct thread_options, exec_postrun), + .help = "Execute this file after running job", + .category = FIO_OPT_C_GENERAL, + .group = FIO_OPT_G_INVALID, + }, +#ifdef FIO_HAVE_IOSCHED_SWITCH + { + .name = "ioscheduler", + .lname = "I/O scheduler", + .type = FIO_OPT_STR_STORE, + .off1 = offsetof(struct thread_options, ioscheduler), + .help = "Use this IO scheduler on the backing device", + .category = FIO_OPT_C_FILE, + .group = FIO_OPT_G_INVALID, + }, +#else + { + .name = "ioscheduler", + .lname = "I/O scheduler", + .type = FIO_OPT_UNSUPPORTED, + .help = "Your platform does not support IO scheduler switching", + }, +#endif + { + .name = "zonemode", + .lname = "Zone mode", + .help = "Mode for the zonesize, zonerange and zoneskip parameters", + .type = FIO_OPT_STR, + .off1 = offsetof(struct thread_options, zone_mode), + .def = "none", + .category = FIO_OPT_C_IO, + .group = FIO_OPT_G_ZONE, + .posval = { + { .ival = "none", + .oval = ZONE_MODE_NONE, + .help = "no zoning", + }, + { .ival = "strided", + .oval = ZONE_MODE_STRIDED, + .help = "strided mode - random I/O is restricted to a single zone", + }, + { .ival = "zbd", + .oval = ZONE_MODE_ZBD, + .help = "zoned block device mode - random I/O selects one of multiple zones randomly", + }, + }, + }, + { + .name = "zonesize", + .lname = "Zone size", + .type = FIO_OPT_STR_VAL, + .off1 = offsetof(struct thread_options, zone_size), + .help = "Amount of data to read per zone", + .def = "0", + .interval = 1024 * 1024, + .category = FIO_OPT_C_IO, + .group = FIO_OPT_G_ZONE, + }, + { + .name = "zonerange", + .lname = "Zone range", + .type = 
FIO_OPT_STR_VAL, + .off1 = offsetof(struct thread_options, zone_range), + .help = "Give size of an IO zone", + .def = "0", + .interval = 1024 * 1024, + .category = FIO_OPT_C_IO, + .group = FIO_OPT_G_ZONE, + }, + { + .name = "zoneskip", + .lname = "Zone skip", + .type = FIO_OPT_STR_VAL, + .off1 = offsetof(struct thread_options, zone_skip), + .help = "Space between IO zones", + .def = "0", + .interval = 1024 * 1024, + .category = FIO_OPT_C_IO, + .group = FIO_OPT_G_ZONE, + }, + { + .name = "read_beyond_wp", + .lname = "Allow reads beyond the zone write pointer", + .type = FIO_OPT_BOOL, + .off1 = offsetof(struct thread_options, read_beyond_wp), + .help = "Allow reads beyond the zone write pointer", + .def = "0", + .category = FIO_OPT_C_IO, + .group = FIO_OPT_G_INVALID, + }, + { + .name = "max_open_zones", + .lname = "Maximum number of open zones", + .type = FIO_OPT_INT, + .off1 = offsetof(struct thread_options, max_open_zones), + .maxval = FIO_MAX_OPEN_ZBD_ZONES, + .help = "Limit random writes to SMR drives to the specified" + " number of sequential zones", + .def = "0", + .category = FIO_OPT_C_IO, + .group = FIO_OPT_G_INVALID, + }, + { + .name = "zone_reset_threshold", + .lname = "Zone reset threshold", + .help = "Zoned block device reset threshold", + .type = FIO_OPT_FLOAT_LIST, + .maxlen = 1, + .off1 = offsetof(struct thread_options, zrt), + .minfp = 0, + .maxfp = 1, + .category = FIO_OPT_C_IO, + .group = FIO_OPT_G_ZONE, + }, + { + .name = "zone_reset_frequency", + .lname = "Zone reset frequency", + .help = "Zoned block device zone reset frequency in HZ", + .type = FIO_OPT_FLOAT_LIST, + .maxlen = 1, + .off1 = offsetof(struct thread_options, zrf), + .minfp = 0, + .maxfp = 1, + .category = FIO_OPT_C_IO, + .group = FIO_OPT_G_ZONE, + }, + { + .name = "lockmem", + .lname = "Lock memory", + .type = FIO_OPT_STR_VAL, + .off1 = offsetof(struct thread_options, lockmem), + .help = "Lock down this amount of memory (per worker)", + .def = "0", + .interval = 1024 * 1024, + .category = FIO_OPT_C_GENERAL, + .group = FIO_OPT_G_INVALID, + }, + { + .name = "rwmixread", + .lname = "Read/write mix read", + .type = FIO_OPT_INT, + .cb = str_rwmix_read_cb, + .off1 = offsetof(struct thread_options, rwmix[DDIR_READ]), + .maxval = 100, + .help = "Percentage of mixed workload that is reads", + .def = "50", + .interval = 5, + .inverse = "rwmixwrite", + .category = FIO_OPT_C_IO, + .group = FIO_OPT_G_RWMIX, + }, + { + .name = "rwmixwrite", + .lname = "Read/write mix write", + .type = FIO_OPT_INT, + .cb = str_rwmix_write_cb, + .off1 = offsetof(struct thread_options, rwmix[DDIR_WRITE]), + .maxval = 100, + .help = "Percentage of mixed workload that is writes", + .def = "50", + .interval = 5, + .inverse = "rwmixread", + .category = FIO_OPT_C_IO, + .group = FIO_OPT_G_RWMIX, + }, + { + .name = "rwmixcycle", + .lname = "Read/write mix cycle", + .type = FIO_OPT_DEPRECATED, + .category = FIO_OPT_C_IO, + .group = FIO_OPT_G_RWMIX, + }, + { + .name = "nice", + .lname = "Nice", + .type = FIO_OPT_INT, + .off1 = offsetof(struct thread_options, nice), + .help = "Set job CPU nice value", + .minval = -20, + .maxval = 19, + .def = "0", + .interval = 1, + .category = FIO_OPT_C_GENERAL, + .group = FIO_OPT_G_CRED, + }, +#ifdef FIO_HAVE_IOPRIO + { + .name = "prio", + .lname = "I/O nice priority", + .type = FIO_OPT_INT, + .off1 = offsetof(struct thread_options, ioprio), + .help = "Set job IO priority value", + .minval = IOPRIO_MIN_PRIO, + .maxval = IOPRIO_MAX_PRIO, + .interval = 1, + .category = FIO_OPT_C_GENERAL, + .group = FIO_OPT_G_CRED, + 
}, +#else + { + .name = "prio", + .lname = "I/O nice priority", + .type = FIO_OPT_UNSUPPORTED, + .help = "Your platform does not support IO priorities", + }, +#endif +#ifdef FIO_HAVE_IOPRIO_CLASS +#ifndef FIO_HAVE_IOPRIO +#error "FIO_HAVE_IOPRIO_CLASS requires FIO_HAVE_IOPRIO" +#endif + { + .name = "prioclass", + .lname = "I/O nice priority class", + .type = FIO_OPT_INT, + .off1 = offsetof(struct thread_options, ioprio_class), + .help = "Set job IO priority class", + .minval = IOPRIO_MIN_PRIO_CLASS, + .maxval = IOPRIO_MAX_PRIO_CLASS, + .interval = 1, + .category = FIO_OPT_C_GENERAL, + .group = FIO_OPT_G_CRED, + }, +#else + { + .name = "prioclass", + .lname = "I/O nice priority class", + .type = FIO_OPT_UNSUPPORTED, + .help = "Your platform does not support IO priority classes", + }, +#endif + { + .name = "thinktime", + .lname = "Thinktime", + .type = FIO_OPT_INT, + .off1 = offsetof(struct thread_options, thinktime), + .help = "Idle time between IO buffers (usec)", + .def = "0", + .is_time = 1, + .category = FIO_OPT_C_IO, + .group = FIO_OPT_G_THINKTIME, + }, + { + .name = "thinktime_spin", + .lname = "Thinktime spin", + .type = FIO_OPT_INT, + .off1 = offsetof(struct thread_options, thinktime_spin), + .help = "Start think time by spinning this amount (usec)", + .def = "0", + .is_time = 1, + .parent = "thinktime", + .hide = 1, + .category = FIO_OPT_C_IO, + .group = FIO_OPT_G_THINKTIME, + }, + { + .name = "thinktime_blocks", + .lname = "Thinktime blocks", + .type = FIO_OPT_INT, + .off1 = offsetof(struct thread_options, thinktime_blocks), + .help = "IO buffer period between 'thinktime'", + .def = "1", + .parent = "thinktime", + .hide = 1, + .category = FIO_OPT_C_IO, + .group = FIO_OPT_G_THINKTIME, + }, + { + .name = "rate", + .lname = "I/O rate", + .type = FIO_OPT_INT, + .off1 = offsetof(struct thread_options, rate[DDIR_READ]), + .off2 = offsetof(struct thread_options, rate[DDIR_WRITE]), + .off3 = offsetof(struct thread_options, rate[DDIR_TRIM]), + .help = "Set bandwidth rate", + .category = FIO_OPT_C_IO, + .group = FIO_OPT_G_RATE, + }, + { + .name = "rate_min", + .alias = "ratemin", + .lname = "I/O min rate", + .type = FIO_OPT_INT, + .off1 = offsetof(struct thread_options, ratemin[DDIR_READ]), + .off2 = offsetof(struct thread_options, ratemin[DDIR_WRITE]), + .off3 = offsetof(struct thread_options, ratemin[DDIR_TRIM]), + .help = "Job must meet this rate or it will be shut down", + .parent = "rate", + .hide = 1, + .category = FIO_OPT_C_IO, + .group = FIO_OPT_G_RATE, + }, + { + .name = "rate_iops", + .lname = "I/O rate IOPS", + .type = FIO_OPT_INT, + .off1 = offsetof(struct thread_options, rate_iops[DDIR_READ]), + .off2 = offsetof(struct thread_options, rate_iops[DDIR_WRITE]), + .off3 = offsetof(struct thread_options, rate_iops[DDIR_TRIM]), + .help = "Limit IO used to this number of IO operations/sec", + .hide = 1, + .category = FIO_OPT_C_IO, + .group = FIO_OPT_G_RATE, + }, + { + .name = "rate_iops_min", + .lname = "I/O min rate IOPS", + .type = FIO_OPT_INT, + .off1 = offsetof(struct thread_options, rate_iops_min[DDIR_READ]), + .off2 = offsetof(struct thread_options, rate_iops_min[DDIR_WRITE]), + .off3 = offsetof(struct thread_options, rate_iops_min[DDIR_TRIM]), + .help = "Job must meet this rate or it will be shut down", + .parent = "rate_iops", + .hide = 1, + .category = FIO_OPT_C_IO, + .group = FIO_OPT_G_RATE, + }, + { + .name = "rate_process", + .lname = "Rate Process", + .type = FIO_OPT_STR, + .off1 = offsetof(struct thread_options, rate_process), + .help = "What process controls how rated IO 
is managed", + .def = "linear", + .category = FIO_OPT_C_IO, + .group = FIO_OPT_G_RATE, + .posval = { + { .ival = "linear", + .oval = RATE_PROCESS_LINEAR, + .help = "Linear rate of IO", + }, + { + .ival = "poisson", + .oval = RATE_PROCESS_POISSON, + .help = "Rate follows Poisson process", + }, + }, + .parent = "rate", + }, + { + .name = "rate_cycle", + .alias = "ratecycle", + .lname = "I/O rate cycle", + .type = FIO_OPT_INT, + .off1 = offsetof(struct thread_options, ratecycle), + .help = "Window average for rate limits (msec)", + .def = "1000", + .parent = "rate", + .hide = 1, + .category = FIO_OPT_C_IO, + .group = FIO_OPT_G_RATE, + }, + { + .name = "rate_ignore_thinktime", + .lname = "Rate ignore thinktime", + .type = FIO_OPT_BOOL, + .off1 = offsetof(struct thread_options, rate_ign_think), + .help = "Rated IO ignores thinktime settings", + .parent = "rate", + .category = FIO_OPT_C_IO, + .group = FIO_OPT_G_RATE, + }, + { + .name = "max_latency", + .lname = "Max Latency (usec)", + .type = FIO_OPT_STR_VAL_TIME, + .off1 = offsetof(struct thread_options, max_latency), + .help = "Maximum tolerated IO latency (usec)", + .is_time = 1, + .category = FIO_OPT_C_IO, + .group = FIO_OPT_G_LATPROF, + }, + { + .name = "latency_target", + .lname = "Latency Target (usec)", + .type = FIO_OPT_STR_VAL_TIME, + .off1 = offsetof(struct thread_options, latency_target), + .help = "Ramp to max queue depth supporting this latency", + .is_time = 1, + .category = FIO_OPT_C_IO, + .group = FIO_OPT_G_LATPROF, + }, + { + .name = "latency_window", + .lname = "Latency Window (usec)", + .type = FIO_OPT_STR_VAL_TIME, + .off1 = offsetof(struct thread_options, latency_window), + .help = "Time to sustain latency_target", + .is_time = 1, + .category = FIO_OPT_C_IO, + .group = FIO_OPT_G_LATPROF, + }, + { + .name = "latency_percentile", + .lname = "Latency Percentile", + .type = FIO_OPT_FLOAT_LIST, + .off1 = offsetof(struct thread_options, latency_percentile), + .help = "Percentile of IOs must be below latency_target", + .def = "100", + .maxlen = 1, + .minfp = 0.0, + .maxfp = 100.0, + .category = FIO_OPT_C_IO, + .group = FIO_OPT_G_LATPROF, + }, + { + .name = "invalidate", + .lname = "Cache invalidate", + .type = FIO_OPT_BOOL, + .off1 = offsetof(struct thread_options, invalidate_cache), + .help = "Invalidate buffer/page cache prior to running job", + .def = "1", + .category = FIO_OPT_C_IO, + .group = FIO_OPT_G_IO_TYPE, + }, + { + .name = "sync", + .lname = "Synchronous I/O", + .type = FIO_OPT_BOOL, + .off1 = offsetof(struct thread_options, sync_io), + .help = "Use O_SYNC for buffered writes", + .def = "0", + .parent = "buffered", + .hide = 1, + .category = FIO_OPT_C_IO, + .group = FIO_OPT_G_IO_TYPE, + }, +#ifdef FIO_HAVE_WRITE_HINT + { + .name = "write_hint", + .lname = "Write hint", + .type = FIO_OPT_STR, + .off1 = offsetof(struct thread_options, write_hint), + .help = "Set expected write life time", + .category = FIO_OPT_C_ENGINE, + .group = FIO_OPT_G_INVALID, + .posval = { + { .ival = "none", + .oval = RWH_WRITE_LIFE_NONE, + }, + { .ival = "short", + .oval = RWH_WRITE_LIFE_SHORT, + }, + { .ival = "medium", + .oval = RWH_WRITE_LIFE_MEDIUM, + }, + { .ival = "long", + .oval = RWH_WRITE_LIFE_LONG, + }, + { .ival = "extreme", + .oval = RWH_WRITE_LIFE_EXTREME, + }, + }, + }, +#endif + { + .name = "create_serialize", + .lname = "Create serialize", + .type = FIO_OPT_BOOL, + .off1 = offsetof(struct thread_options, create_serialize), + .help = "Serialize creation of job files", + .def = "1", + .category = FIO_OPT_C_FILE, + .group = 
FIO_OPT_G_INVALID, + }, + { + .name = "create_fsync", + .lname = "Create fsync", + .type = FIO_OPT_BOOL, + .off1 = offsetof(struct thread_options, create_fsync), + .help = "fsync file after creation", + .def = "1", + .category = FIO_OPT_C_FILE, + .group = FIO_OPT_G_INVALID, + }, + { + .name = "create_on_open", + .lname = "Create on open", + .type = FIO_OPT_BOOL, + .off1 = offsetof(struct thread_options, create_on_open), + .help = "Create files when they are opened for IO", + .def = "0", + .category = FIO_OPT_C_FILE, + .group = FIO_OPT_G_INVALID, + }, + { + .name = "create_only", + .lname = "Create Only", + .type = FIO_OPT_BOOL, + .off1 = offsetof(struct thread_options, create_only), + .help = "Only perform file creation phase", + .category = FIO_OPT_C_FILE, + .def = "0", + }, + { + .name = "allow_file_create", + .lname = "Allow file create", + .type = FIO_OPT_BOOL, + .off1 = offsetof(struct thread_options, allow_create), + .help = "Permit fio to create files, if they don't exist", + .def = "1", + .category = FIO_OPT_C_FILE, + .group = FIO_OPT_G_FILENAME, + }, + { + .name = "allow_mounted_write", + .lname = "Allow mounted write", + .type = FIO_OPT_BOOL, + .off1 = offsetof(struct thread_options, allow_mounted_write), + .help = "Allow writes to a mounted partition", + .def = "0", + .category = FIO_OPT_C_FILE, + .group = FIO_OPT_G_FILENAME, + }, + { + .name = "pre_read", + .lname = "Pre-read files", + .type = FIO_OPT_BOOL, + .off1 = offsetof(struct thread_options, pre_read), + .help = "Pre-read files before starting official testing", + .def = "0", + .category = FIO_OPT_C_FILE, + .group = FIO_OPT_G_INVALID, + }, +#ifdef FIO_HAVE_CPU_AFFINITY + { + .name = "cpumask", + .lname = "CPU mask", + .type = FIO_OPT_INT, + .cb = str_cpumask_cb, + .off1 = offsetof(struct thread_options, cpumask), + .help = "CPU affinity mask", + .category = FIO_OPT_C_GENERAL, + .group = FIO_OPT_G_CRED, + }, + { + .name = "cpus_allowed", + .lname = "CPUs allowed", + .type = FIO_OPT_STR, + .cb = str_cpus_allowed_cb, + .off1 = offsetof(struct thread_options, cpumask), + .help = "Set CPUs allowed", + .category = FIO_OPT_C_GENERAL, + .group = FIO_OPT_G_CRED, + }, + { + .name = "cpus_allowed_policy", + .lname = "CPUs allowed distribution policy", + .type = FIO_OPT_STR, + .off1 = offsetof(struct thread_options, cpus_allowed_policy), + .help = "Distribution policy for cpus_allowed", + .parent = "cpus_allowed", + .prio = 1, + .posval = { + { .ival = "shared", + .oval = FIO_CPUS_SHARED, + .help = "Mask shared between threads", + }, + { .ival = "split", + .oval = FIO_CPUS_SPLIT, + .help = "Mask split between threads", + }, + }, + .category = FIO_OPT_C_GENERAL, + .group = FIO_OPT_G_CRED, + }, +#else + { + .name = "cpumask", + .lname = "CPU mask", + .type = FIO_OPT_UNSUPPORTED, + .help = "Your platform does not support CPU affinities", + }, + { + .name = "cpus_allowed", + .lname = "CPUs allowed", + .type = FIO_OPT_UNSUPPORTED, + .help = "Your platform does not support CPU affinities", + }, + { + .name = "cpus_allowed_policy", + .lname = "CPUs allowed distribution policy", + .type = FIO_OPT_UNSUPPORTED, + .help = "Your platform does not support CPU affinities", + }, +#endif +#ifdef CONFIG_LIBNUMA + { + .name = "numa_cpu_nodes", + .lname = "NUMA CPU Nodes", + .type = FIO_OPT_STR, + .cb = str_numa_cpunodes_cb, + .off1 = offsetof(struct thread_options, numa_cpunodes), + .help = "NUMA CPU nodes bind", + .category = FIO_OPT_C_GENERAL, + .group = FIO_OPT_G_INVALID, + }, + { + .name = "numa_mem_policy", + .lname = "NUMA Memory Policy", + 
.type = FIO_OPT_STR, + .cb = str_numa_mpol_cb, + .off1 = offsetof(struct thread_options, numa_memnodes), + .help = "NUMA memory policy setup", + .category = FIO_OPT_C_GENERAL, + .group = FIO_OPT_G_INVALID, + }, +#else + { + .name = "numa_cpu_nodes", + .lname = "NUMA CPU Nodes", + .type = FIO_OPT_UNSUPPORTED, + .help = "Build fio with libnuma-dev(el) to enable this option", + }, + { + .name = "numa_mem_policy", + .lname = "NUMA Memory Policy", + .type = FIO_OPT_UNSUPPORTED, + .help = "Build fio with libnuma-dev(el) to enable this option", + }, +#endif +#ifdef CONFIG_CUDA + { + .name = "gpu_dev_id", + .lname = "GPU device ID", + .type = FIO_OPT_INT, + .off1 = offsetof(struct thread_options, gpu_dev_id), + .help = "Set GPU device ID for GPUDirect RDMA", + .def = "0", + .category = FIO_OPT_C_GENERAL, + .group = FIO_OPT_G_INVALID, + }, +#endif + { + .name = "end_fsync", + .lname = "End fsync", + .type = FIO_OPT_BOOL, + .off1 = offsetof(struct thread_options, end_fsync), + .help = "Include fsync at the end of job", + .def = "0", + .category = FIO_OPT_C_FILE, + .group = FIO_OPT_G_INVALID, + }, + { + .name = "fsync_on_close", + .lname = "Fsync on close", + .type = FIO_OPT_BOOL, + .off1 = offsetof(struct thread_options, fsync_on_close), + .help = "fsync files on close", + .def = "0", + .category = FIO_OPT_C_FILE, + .group = FIO_OPT_G_INVALID, + }, + { + .name = "unlink", + .lname = "Unlink file", + .type = FIO_OPT_BOOL, + .off1 = offsetof(struct thread_options, unlink), + .help = "Unlink created files after job has completed", + .def = "0", + .category = FIO_OPT_C_FILE, + .group = FIO_OPT_G_INVALID, + }, + { + .name = "unlink_each_loop", + .lname = "Unlink file after each loop of a job", + .type = FIO_OPT_BOOL, + .off1 = offsetof(struct thread_options, unlink_each_loop), + .help = "Unlink created files after each loop in a job has completed", + .def = "0", + .category = FIO_OPT_C_FILE, + .group = FIO_OPT_G_INVALID, + }, + { + .name = "exitall", + .lname = "Exit-all on terminate", + .type = FIO_OPT_STR_SET, + .cb = str_exitall_cb, + .help = "Terminate all jobs when one exits", + .category = FIO_OPT_C_GENERAL, + .group = FIO_OPT_G_PROCESS, + }, + { + .name = "exit_what", + .lname = "What jobs to quit on terminate", + .type = FIO_OPT_STR, + .off1 = offsetof(struct thread_options, exit_what), + .help = "Fine-grained control for exitall", + .def = "group", + .category = FIO_OPT_C_GENERAL, + .group = FIO_OPT_G_PROCESS, + .posval = { + { .ival = "group", + .oval = TERMINATE_GROUP, + .help = "exit_all=1 default behaviour", + }, + { .ival = "stonewall", + .oval = TERMINATE_STONEWALL, + .help = "quit all currently running jobs; continue with next stonewall", + }, + { .ival = "all", + .oval = TERMINATE_ALL, + .help = "Quit everything", + }, + }, + }, + { + .name = "exitall_on_error", + .lname = "Exit-all on terminate in error", + .type = FIO_OPT_STR_SET, + .off1 = offsetof(struct thread_options, exitall_error), + .help = "Terminate all jobs when one exits in error", + .category = FIO_OPT_C_GENERAL, + .group = FIO_OPT_G_PROCESS, + }, + { + .name = "stonewall", + .lname = "Wait for previous", + .alias = "wait_for_previous", + .type = FIO_OPT_STR_SET, + .off1 = offsetof(struct thread_options, stonewall), + .help = "Insert a hard barrier between this job and previous", + .category = FIO_OPT_C_GENERAL, + .group = FIO_OPT_G_PROCESS, + }, + { + .name = "new_group", + .lname = "New group", + .type = FIO_OPT_STR_SET, + .off1 = offsetof(struct thread_options, new_group), + .help = "Mark the start of a new group (for 
reporting)", + .category = FIO_OPT_C_GENERAL, + .group = FIO_OPT_G_PROCESS, + }, + { + .name = "thread", + .lname = "Thread", + .type = FIO_OPT_STR_SET, + .off1 = offsetof(struct thread_options, use_thread), + .help = "Use threads instead of processes", +#ifdef CONFIG_NO_SHM + .def = "1", + .no_warn_def = 1, +#endif + .category = FIO_OPT_C_GENERAL, + .group = FIO_OPT_G_PROCESS, + }, + { + .name = "per_job_logs", + .lname = "Per Job Logs", + .type = FIO_OPT_BOOL, + .off1 = offsetof(struct thread_options, per_job_logs), + .help = "Include job number in generated log files or not", + .def = "1", + .category = FIO_OPT_C_LOG, + .group = FIO_OPT_G_INVALID, + }, + { + .name = "write_bw_log", + .lname = "Write bandwidth log", + .type = FIO_OPT_STR, + .off1 = offsetof(struct thread_options, bw_log_file), + .cb = str_write_bw_log_cb, + .help = "Write log of bandwidth during run", + .category = FIO_OPT_C_LOG, + .group = FIO_OPT_G_INVALID, + }, + { + .name = "write_lat_log", + .lname = "Write latency log", + .type = FIO_OPT_STR, + .off1 = offsetof(struct thread_options, lat_log_file), + .cb = str_write_lat_log_cb, + .help = "Write log of latency during run", + .category = FIO_OPT_C_LOG, + .group = FIO_OPT_G_INVALID, + }, + { + .name = "write_iops_log", + .lname = "Write IOPS log", + .type = FIO_OPT_STR, + .off1 = offsetof(struct thread_options, iops_log_file), + .cb = str_write_iops_log_cb, + .help = "Write log of IOPS during run", + .category = FIO_OPT_C_LOG, + .group = FIO_OPT_G_INVALID, + }, + { + .name = "log_avg_msec", + .lname = "Log averaging (msec)", + .type = FIO_OPT_INT, + .off1 = offsetof(struct thread_options, log_avg_msec), + .help = "Average bw/iops/lat logs over this period of time", + .def = "0", + .category = FIO_OPT_C_LOG, + .group = FIO_OPT_G_INVALID, + }, + { + .name = "log_hist_msec", + .lname = "Log histograms (msec)", + .type = FIO_OPT_INT, + .off1 = offsetof(struct thread_options, log_hist_msec), + .help = "Dump completion latency histograms at frequency of this time value", + .def = "0", + .category = FIO_OPT_C_LOG, + .group = FIO_OPT_G_INVALID, + }, + { + .name = "log_hist_coarseness", + .lname = "Histogram logs coarseness", + .type = FIO_OPT_INT, + .off1 = offsetof(struct thread_options, log_hist_coarseness), + .help = "Integer in range [0,6]. Higher coarseness outputs" + " fewer histogram bins per sample. 
The numbers of bins for" + " these are [1216, 608, 304, 152, 76, 38, 19], respectively.", + .def = "0", + .category = FIO_OPT_C_LOG, + .group = FIO_OPT_G_INVALID, + }, + { + .name = "write_hist_log", + .lname = "Write latency histogram logs", + .type = FIO_OPT_STR, + .off1 = offsetof(struct thread_options, hist_log_file), + .cb = str_write_hist_log_cb, + .help = "Write log of latency histograms during run", + .category = FIO_OPT_C_LOG, + .group = FIO_OPT_G_INVALID, + }, + { + .name = "log_max_value", + .lname = "Log maximum instead of average", + .type = FIO_OPT_BOOL, + .off1 = offsetof(struct thread_options, log_max), + .help = "Log max sample in a window instead of average", + .def = "0", + .category = FIO_OPT_C_LOG, + .group = FIO_OPT_G_INVALID, + }, + { + .name = "log_offset", + .lname = "Log offset of IO", + .type = FIO_OPT_BOOL, + .off1 = offsetof(struct thread_options, log_offset), + .help = "Include offset of IO for each log entry", + .def = "0", + .category = FIO_OPT_C_LOG, + .group = FIO_OPT_G_INVALID, + }, +#ifdef CONFIG_ZLIB + { + .name = "log_compression", + .lname = "Log compression", + .type = FIO_OPT_INT, + .off1 = offsetof(struct thread_options, log_gz), + .help = "Log in compressed chunks of this size", + .minval = 1024ULL, + .maxval = 512 * 1024 * 1024ULL, + .category = FIO_OPT_C_LOG, + .group = FIO_OPT_G_INVALID, + }, +#ifdef FIO_HAVE_CPU_AFFINITY + { + .name = "log_compression_cpus", + .lname = "Log Compression CPUs", + .type = FIO_OPT_STR, + .cb = str_log_cpus_allowed_cb, + .off1 = offsetof(struct thread_options, log_gz_cpumask), + .parent = "log_compression", + .help = "Limit log compression to these CPUs", + .category = FIO_OPT_C_LOG, + .group = FIO_OPT_G_INVALID, + }, +#else + { + .name = "log_compression_cpus", + .lname = "Log Compression CPUs", + .type = FIO_OPT_UNSUPPORTED, + .help = "Your platform does not support CPU affinities", + }, +#endif + { + .name = "log_store_compressed", + .lname = "Log store compressed", + .type = FIO_OPT_BOOL, + .off1 = offsetof(struct thread_options, log_gz_store), + .help = "Store logs in a compressed format", + .category = FIO_OPT_C_LOG, + .group = FIO_OPT_G_INVALID, + }, +#else + { + .name = "log_compression", + .lname = "Log compression", + .type = FIO_OPT_UNSUPPORTED, + .help = "Install libz-dev(el) to get compression support", + }, + { + .name = "log_store_compressed", + .lname = "Log store compressed", + .type = FIO_OPT_UNSUPPORTED, + .help = "Install libz-dev(el) to get compression support", + }, +#endif + { + .name = "log_unix_epoch", + .lname = "Log epoch unix", + .type = FIO_OPT_BOOL, + .off1 = offsetof(struct thread_options, log_unix_epoch), + .help = "Use Unix time in log files", + .category = FIO_OPT_C_LOG, + .group = FIO_OPT_G_INVALID, + }, + { + .name = "block_error_percentiles", + .lname = "Block error percentiles", + .type = FIO_OPT_BOOL, + .off1 = offsetof(struct thread_options, block_error_hist), + .help = "Record trim block errors and make a histogram", + .def = "0", + .category = FIO_OPT_C_LOG, + .group = FIO_OPT_G_INVALID, + }, + { + .name = "bwavgtime", + .lname = "Bandwidth average time", + .type = FIO_OPT_INT, + .off1 = offsetof(struct thread_options, bw_avg_time), + .help = "Time window over which to calculate bandwidth" + " (msec)", + .def = "500", + .parent = "write_bw_log", + .hide = 1, + .interval = 100, + .category = FIO_OPT_C_LOG, + .group = FIO_OPT_G_INVALID, + }, + { + .name = "iopsavgtime", + .lname = "IOPS average time", + .type = FIO_OPT_INT, + .off1 = offsetof(struct thread_options, 
iops_avg_time), + .help = "Time window over which to calculate IOPS (msec)", + .def = "500", + .parent = "write_iops_log", + .hide = 1, + .interval = 100, + .category = FIO_OPT_C_LOG, + .group = FIO_OPT_G_INVALID, + }, + { + .name = "group_reporting", + .lname = "Group reporting", + .type = FIO_OPT_STR_SET, + .off1 = offsetof(struct thread_options, group_reporting), + .help = "Do reporting on a per-group basis", + .category = FIO_OPT_C_STAT, + .group = FIO_OPT_G_INVALID, + }, + { + .name = "stats", + .lname = "Stats", + .type = FIO_OPT_BOOL, + .off1 = offsetof(struct thread_options, stats), + .help = "Enable collection of stats", + .def = "1", + .category = FIO_OPT_C_STAT, + .group = FIO_OPT_G_INVALID, + }, + { + .name = "zero_buffers", + .lname = "Zero I/O buffers", + .type = FIO_OPT_STR_SET, + .off1 = offsetof(struct thread_options, zero_buffers), + .help = "Init IO buffers to all zeroes", + .category = FIO_OPT_C_IO, + .group = FIO_OPT_G_IO_BUF, + }, + { + .name = "refill_buffers", + .lname = "Refill I/O buffers", + .type = FIO_OPT_STR_SET, + .off1 = offsetof(struct thread_options, refill_buffers), + .help = "Refill IO buffers on every IO submit", + .category = FIO_OPT_C_IO, + .group = FIO_OPT_G_IO_BUF, + }, + { + .name = "scramble_buffers", + .lname = "Scramble I/O buffers", + .type = FIO_OPT_BOOL, + .off1 = offsetof(struct thread_options, scramble_buffers), + .help = "Slightly scramble buffers on every IO submit", + .def = "1", + .category = FIO_OPT_C_IO, + .group = FIO_OPT_G_IO_BUF, + }, + { + .name = "buffer_pattern", + .lname = "Buffer pattern", + .type = FIO_OPT_STR, + .cb = str_buffer_pattern_cb, + .off1 = offsetof(struct thread_options, buffer_pattern), + .help = "Fill pattern for IO buffers", + .category = FIO_OPT_C_IO, + .group = FIO_OPT_G_IO_BUF, + }, + { + .name = "buffer_compress_percentage", + .lname = "Buffer compression percentage", + .type = FIO_OPT_INT, + .cb = str_buffer_compress_cb, + .off1 = offsetof(struct thread_options, compress_percentage), + .maxval = 100, + .minval = 0, + .help = "How compressible the buffer is (approximately)", + .interval = 5, + .category = FIO_OPT_C_IO, + .group = FIO_OPT_G_IO_BUF, + }, + { + .name = "buffer_compress_chunk", + .lname = "Buffer compression chunk size", + .type = FIO_OPT_INT, + .off1 = offsetof(struct thread_options, compress_chunk), + .parent = "buffer_compress_percentage", + .hide = 1, + .help = "Size of compressible region in buffer", + .def = "512", + .interval = 256, + .category = FIO_OPT_C_IO, + .group = FIO_OPT_G_IO_BUF, + }, + { + .name = "dedupe_percentage", + .lname = "Dedupe percentage", + .type = FIO_OPT_INT, + .cb = str_dedupe_cb, + .off1 = offsetof(struct thread_options, dedupe_percentage), + .maxval = 100, + .minval = 0, + .help = "Percentage of buffers that are dedupable", + .interval = 1, + .category = FIO_OPT_C_IO, + .group = FIO_OPT_G_IO_BUF, + }, + { + .name = "clat_percentiles", + .lname = "Completion latency percentiles", + .type = FIO_OPT_BOOL, + .off1 = offsetof(struct thread_options, clat_percentiles), + .help = "Enable the reporting of completion latency percentiles", + .def = "1", + .category = FIO_OPT_C_STAT, + .group = FIO_OPT_G_INVALID, + }, + { + .name = "lat_percentiles", + .lname = "IO latency percentiles", + .type = FIO_OPT_BOOL, + .off1 = offsetof(struct thread_options, lat_percentiles), + .help = "Enable the reporting of IO latency percentiles", + .def = "0", + .category = FIO_OPT_C_STAT, + .group = FIO_OPT_G_INVALID, + }, + { + .name = "slat_percentiles", + .lname = "Submission latency 
percentiles", + .type = FIO_OPT_BOOL, + .off1 = offsetof(struct thread_options, slat_percentiles), + .help = "Enable the reporting of submission latency percentiles", + .def = "0", + .category = FIO_OPT_C_STAT, + .group = FIO_OPT_G_INVALID, + }, + { + .name = "percentile_list", + .lname = "Percentile list", + .type = FIO_OPT_FLOAT_LIST, + .off1 = offsetof(struct thread_options, percentile_list), + .off2 = offsetof(struct thread_options, percentile_precision), + .help = "Specify a custom list of percentiles to report for " + "completion latency and block errors", + .def = "1:5:10:20:30:40:50:60:70:80:90:95:99:99.5:99.9:99.95:99.99", + .maxlen = FIO_IO_U_LIST_MAX_LEN, + .minfp = 0.0, + .maxfp = 100.0, + .category = FIO_OPT_C_STAT, + .group = FIO_OPT_G_INVALID, + }, + { + .name = "significant_figures", + .lname = "Significant figures", + .type = FIO_OPT_INT, + .off1 = offsetof(struct thread_options, sig_figs), + .maxval = 10, + .minval = 1, + .help = "Significant figures for output-format set to normal", + .def = "4", + .interval = 1, + .category = FIO_OPT_C_STAT, + .group = FIO_OPT_G_INVALID, + }, + +#ifdef FIO_HAVE_DISK_UTIL + { + .name = "disk_util", + .lname = "Disk utilization", + .type = FIO_OPT_BOOL, + .off1 = offsetof(struct thread_options, do_disk_util), + .help = "Log disk utilization statistics", + .def = "1", + .category = FIO_OPT_C_STAT, + .group = FIO_OPT_G_INVALID, + }, +#else + { + .name = "disk_util", + .lname = "Disk utilization", + .type = FIO_OPT_UNSUPPORTED, + .help = "Your platform does not support disk utilization", + }, +#endif + { + .name = "gtod_reduce", + .lname = "Reduce gettimeofday() calls", + .type = FIO_OPT_BOOL, + .help = "Greatly reduce number of gettimeofday() calls", + .cb = str_gtod_reduce_cb, + .def = "0", + .hide_on_set = 1, + .category = FIO_OPT_C_STAT, + .group = FIO_OPT_G_INVALID, + }, + { + .name = "disable_lat", + .lname = "Disable all latency stats", + .type = FIO_OPT_BOOL, + .off1 = offsetof(struct thread_options, disable_lat), + .help = "Disable latency numbers", + .parent = "gtod_reduce", + .hide = 1, + .def = "0", + .category = FIO_OPT_C_STAT, + .group = FIO_OPT_G_INVALID, + }, + { + .name = "disable_clat", + .lname = "Disable completion latency stats", + .type = FIO_OPT_BOOL, + .off1 = offsetof(struct thread_options, disable_clat), + .help = "Disable completion latency numbers", + .parent = "gtod_reduce", + .hide = 1, + .def = "0", + .category = FIO_OPT_C_STAT, + .group = FIO_OPT_G_INVALID, + }, + { + .name = "disable_slat", + .lname = "Disable submission latency stats", + .type = FIO_OPT_BOOL, + .off1 = offsetof(struct thread_options, disable_slat), + .help = "Disable submission latency numbers", + .parent = "gtod_reduce", + .hide = 1, + .def = "0", + .category = FIO_OPT_C_STAT, + .group = FIO_OPT_G_INVALID, + }, + { + .name = "disable_bw_measurement", + .alias = "disable_bw", + .lname = "Disable bandwidth stats", + .type = FIO_OPT_BOOL, + .off1 = offsetof(struct thread_options, disable_bw), + .help = "Disable bandwidth logging", + .parent = "gtod_reduce", + .hide = 1, + .def = "0", + .category = FIO_OPT_C_STAT, + .group = FIO_OPT_G_INVALID, + }, + { + .name = "gtod_cpu", + .lname = "Dedicated gettimeofday() CPU", + .type = FIO_OPT_INT, + .off1 = offsetof(struct thread_options, gtod_cpu), + .help = "Set up dedicated gettimeofday() thread on this CPU", + .verify = gtod_cpu_verify, + .category = FIO_OPT_C_GENERAL, + .group = FIO_OPT_G_CLOCK, + }, + { + .name = "unified_rw_reporting", + .lname = "Unified RW Reporting", + .type = FIO_OPT_BOOL, + 
.off1 = offsetof(struct thread_options, unified_rw_rep), + .help = "Unify reporting across data direction", + .def = "0", + .category = FIO_OPT_C_GENERAL, + .group = FIO_OPT_G_INVALID, + }, + { + .name = "continue_on_error", + .lname = "Continue on error", + .type = FIO_OPT_STR, + .off1 = offsetof(struct thread_options, continue_on_error), + .help = "Continue on non-fatal errors during IO", + .def = "none", + .category = FIO_OPT_C_GENERAL, + .group = FIO_OPT_G_ERR, + .posval = { + { .ival = "none", + .oval = ERROR_TYPE_NONE, + .help = "Exit when an error is encountered", + }, + { .ival = "read", + .oval = ERROR_TYPE_READ, + .help = "Continue on read errors only", + }, + { .ival = "write", + .oval = ERROR_TYPE_WRITE, + .help = "Continue on write errors only", + }, + { .ival = "io", + .oval = ERROR_TYPE_READ | ERROR_TYPE_WRITE, + .help = "Continue on any IO errors", + }, + { .ival = "verify", + .oval = ERROR_TYPE_VERIFY, + .help = "Continue on verify errors only", + }, + { .ival = "all", + .oval = ERROR_TYPE_ANY, + .help = "Continue on all io and verify errors", + }, + { .ival = "0", + .oval = ERROR_TYPE_NONE, + .help = "Alias for 'none'", + }, + { .ival = "1", + .oval = ERROR_TYPE_ANY, + .help = "Alias for 'all'", + }, + }, + }, + { + .name = "ignore_error", + .lname = "Ignore Error", + .type = FIO_OPT_STR, + .cb = str_ignore_error_cb, + .off1 = offsetof(struct thread_options, ignore_error_nr), + .help = "Set a specific list of errors to ignore", + .parent = "rw", + .category = FIO_OPT_C_GENERAL, + .group = FIO_OPT_G_ERR, + }, + { + .name = "error_dump", + .lname = "Error Dump", + .type = FIO_OPT_BOOL, + .off1 = offsetof(struct thread_options, error_dump), + .def = "0", + .help = "Dump info on each error", + .category = FIO_OPT_C_GENERAL, + .group = FIO_OPT_G_ERR, + }, + { + .name = "profile", + .lname = "Profile", + .type = FIO_OPT_STR_STORE, + .off1 = offsetof(struct thread_options, profile), + .help = "Select a specific builtin performance test", + .category = FIO_OPT_C_PROFILE, + .group = FIO_OPT_G_INVALID, + }, + { + .name = "cgroup", + .lname = "Cgroup", + .type = FIO_OPT_STR_STORE, + .off1 = offsetof(struct thread_options, cgroup), + .help = "Add job to cgroup of this name", + .category = FIO_OPT_C_GENERAL, + .group = FIO_OPT_G_CGROUP, + }, + { + .name = "cgroup_nodelete", + .lname = "Cgroup no-delete", + .type = FIO_OPT_BOOL, + .off1 = offsetof(struct thread_options, cgroup_nodelete), + .help = "Do not delete cgroups after job completion", + .def = "0", + .parent = "cgroup", + .category = FIO_OPT_C_GENERAL, + .group = FIO_OPT_G_CGROUP, + }, + { + .name = "cgroup_weight", + .lname = "Cgroup weight", + .type = FIO_OPT_INT, + .off1 = offsetof(struct thread_options, cgroup_weight), + .help = "Use given weight for cgroup", + .minval = 100, + .maxval = 1000, + .parent = "cgroup", + .category = FIO_OPT_C_GENERAL, + .group = FIO_OPT_G_CGROUP, + }, + { + .name = "uid", + .lname = "User ID", + .type = FIO_OPT_INT, + .off1 = offsetof(struct thread_options, uid), + .help = "Run job with this user ID", + .category = FIO_OPT_C_GENERAL, + .group = FIO_OPT_G_CRED, + }, + { + .name = "gid", + .lname = "Group ID", + .type = FIO_OPT_INT, + .off1 = offsetof(struct thread_options, gid), + .help = "Run job with this group ID", + .category = FIO_OPT_C_GENERAL, + .group = FIO_OPT_G_CRED, + }, + { + .name = "kb_base", + .lname = "KB Base", + .type = FIO_OPT_STR, + .off1 = offsetof(struct thread_options, kb_base), + .prio = 1, + .def = "1024", + .posval = { + { .ival = "1024", + .oval = 1024, + .help = 
"Inputs invert IEC and SI prefixes (for compatibility); outputs prefer binary", + }, + { .ival = "1000", + .oval = 1000, + .help = "Inputs use IEC and SI prefixes; outputs prefer SI", + }, + }, + .help = "Unit prefix interpretation for quantities of data (IEC and SI)", + .category = FIO_OPT_C_GENERAL, + .group = FIO_OPT_G_INVALID, + }, + { + .name = "unit_base", + .lname = "Unit for quantities of data (Bits or Bytes)", + .type = FIO_OPT_STR, + .off1 = offsetof(struct thread_options, unit_base), + .prio = 1, + .posval = { + { .ival = "0", + .oval = N2S_NONE, + .help = "Auto-detect", + }, + { .ival = "8", + .oval = N2S_BYTEPERSEC, + .help = "Normal (byte based)", + }, + { .ival = "1", + .oval = N2S_BITPERSEC, + .help = "Bit based", + }, + }, + .help = "Bit multiple of result summary data (8 for byte, 1 for bit)", + .category = FIO_OPT_C_GENERAL, + .group = FIO_OPT_G_INVALID, + }, + { + .name = "hugepage-size", + .lname = "Hugepage size", + .type = FIO_OPT_INT, + .off1 = offsetof(struct thread_options, hugepage_size), + .help = "When using hugepages, specify size of each page", + .def = __fio_stringify(FIO_HUGE_PAGE), + .interval = 1024 * 1024, + .category = FIO_OPT_C_GENERAL, + .group = FIO_OPT_G_INVALID, + }, + { + .name = "flow_id", + .lname = "I/O flow ID", + .type = FIO_OPT_INT, + .off1 = offsetof(struct thread_options, flow_id), + .help = "The flow index ID to use", + .def = "0", + .category = FIO_OPT_C_IO, + .group = FIO_OPT_G_IO_FLOW, + }, + { + .name = "flow", + .lname = "I/O flow weight", + .type = FIO_OPT_INT, + .off1 = offsetof(struct thread_options, flow), + .help = "Weight for flow control of this job", + .parent = "flow_id", + .hide = 1, + .def = "0", + .category = FIO_OPT_C_IO, + .group = FIO_OPT_G_IO_FLOW, + }, + { + .name = "flow_watermark", + .lname = "I/O flow watermark", + .type = FIO_OPT_INT, + .off1 = offsetof(struct thread_options, flow_watermark), + .help = "High watermark for flow control. 
This option" + " should be set to the same value for all threads" + " with non-zero flow.", + .parent = "flow_id", + .hide = 1, + .def = "1024", + .category = FIO_OPT_C_IO, + .group = FIO_OPT_G_IO_FLOW, + }, + { + .name = "flow_sleep", + .lname = "I/O flow sleep", + .type = FIO_OPT_INT, + .off1 = offsetof(struct thread_options, flow_sleep), + .help = "How many microseconds to sleep after being held" + " back by the flow control mechanism", + .parent = "flow_id", + .hide = 1, + .def = "0", + .category = FIO_OPT_C_IO, + .group = FIO_OPT_G_IO_FLOW, + }, + { + .name = "steadystate", + .lname = "Steady state threshold", + .alias = "ss", + .type = FIO_OPT_STR, + .off1 = offsetof(struct thread_options, ss_state), + .cb = str_steadystate_cb, + .help = "Define the criterion and limit to judge when a job has reached steady state", + .def = "iops_slope:0.01%", + .posval = { + { .ival = "iops", + .oval = FIO_SS_IOPS, + .help = "maximum mean deviation of IOPS measurements", + }, + { .ival = "iops_slope", + .oval = FIO_SS_IOPS_SLOPE, + .help = "slope calculated from IOPS measurements", + }, + { .ival = "bw", + .oval = FIO_SS_BW, + .help = "maximum mean deviation of bandwidth measurements", + }, + { + .ival = "bw_slope", + .oval = FIO_SS_BW_SLOPE, + .help = "slope calculated from bandwidth measurements", + }, + }, + .category = FIO_OPT_C_GENERAL, + .group = FIO_OPT_G_RUNTIME, + }, + { + .name = "steadystate_duration", + .lname = "Steady state duration", + .alias = "ss_dur", + .parent = "steadystate", + .type = FIO_OPT_STR_VAL_TIME, + .off1 = offsetof(struct thread_options, ss_dur), + .help = "Stop workload upon attaining steady state for specified duration", + .def = "0", + .is_seconds = 1, + .is_time = 1, + .category = FIO_OPT_C_GENERAL, + .group = FIO_OPT_G_RUNTIME, + }, + { + .name = "steadystate_ramp_time", + .lname = "Steady state ramp time", + .alias = "ss_ramp", + .parent = "steadystate", + .type = FIO_OPT_STR_VAL_TIME, + .off1 = offsetof(struct thread_options, ss_ramp_time), + .help = "Delay before initiation of data collection for steady state job termination testing", + .def = "0", + .is_seconds = 1, + .is_time = 1, + .category = FIO_OPT_C_GENERAL, + .group = FIO_OPT_G_RUNTIME, + }, + { + .name = NULL, + }, +}; + +static void add_to_lopt(struct option *lopt, struct fio_option *o, + const char *name, int val) +{ + lopt->name = (char *) name; + lopt->val = val; + if (o->type == FIO_OPT_STR_SET) + lopt->has_arg = optional_argument; + else + lopt->has_arg = required_argument; +} + +static void options_to_lopts(struct fio_option *opts, + struct option *long_options, + int i, int option_type) +{ + struct fio_option *o = &opts[0]; + while (o->name) { + add_to_lopt(&long_options[i], o, o->name, option_type); + if (o->alias) { + i++; + add_to_lopt(&long_options[i], o, o->alias, option_type); + } + + i++; + o++; + assert(i < FIO_NR_OPTIONS); + } +} + +void fio_options_set_ioengine_opts(struct option *long_options, + struct thread_data *td) +{ + unsigned int i; + + i = 0; + while (long_options[i].name) { + if (long_options[i].val == FIO_GETOPT_IOENGINE) { + memset(&long_options[i], 0, sizeof(*long_options)); + break; + } + i++; + } + + /* + * Just clear out the prior ioengine options. 
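 *
 * A sketch of what the options_to_lopts() call below hands to
 * getopt_long(3); the option name here is hypothetical. An ioengine
 * option "hipri" ends up as the equivalent of
 *
 *	struct option lopt = {
 *		.name    = "hipri",
 *		.has_arg = required_argument,	(optional_argument is used
 *						 only for FIO_OPT_STR_SET)
 *		.val     = FIO_GETOPT_IOENGINE,
 *	};
 *
 * so that a later command-line match is routed back to the engine's own
 * option table rather than fio_options.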
+ */ + if (!td || !td->eo) + return; + + options_to_lopts(td->io_ops->options, long_options, i, + FIO_GETOPT_IOENGINE); +} + +void fio_options_dup_and_init(struct option *long_options) +{ + unsigned int i; + + options_init(fio_options); + + i = 0; + while (long_options[i].name) + i++; + + options_to_lopts(fio_options, long_options, i, FIO_GETOPT_JOB); +} + +struct fio_keyword { + const char *word; + const char *desc; + char *replace; +}; + +static struct fio_keyword fio_keywords[] = { + { + .word = "$pagesize", + .desc = "Page size in the system", + }, + { + .word = "$mb_memory", + .desc = "Megabytes of memory online", + }, + { + .word = "$ncpus", + .desc = "Number of CPUs online in the system", + }, + { + .word = NULL, + }, +}; + +void fio_keywords_exit(void) +{ + struct fio_keyword *kw; + + kw = &fio_keywords[0]; + while (kw->word) { + free(kw->replace); + kw->replace = NULL; + kw++; + } +} + +void fio_keywords_init(void) +{ + unsigned long long mb_memory; + char buf[128]; + long l; + + sprintf(buf, "%lu", (unsigned long) page_size); + fio_keywords[0].replace = strdup(buf); + + mb_memory = os_phys_mem() / (1024 * 1024); + sprintf(buf, "%llu", mb_memory); + fio_keywords[1].replace = strdup(buf); + + l = cpus_online(); + sprintf(buf, "%lu", l); + fio_keywords[2].replace = strdup(buf); +} + +#define BC_APP "bc" + +static char *bc_calc(char *str) +{ + char buf[128], *tmp; + FILE *f; + int ret; + + /* + * No math, just return string + */ + if ((!strchr(str, '+') && !strchr(str, '-') && !strchr(str, '*') && + !strchr(str, '/')) || strchr(str, '\'')) + return str; + + /* + * Split option from value, we only need to calculate the value + */ + tmp = strchr(str, '='); + if (!tmp) + return str; + + tmp++; + + /* + * Prevent buffer overflows; such a case isn't reasonable anyway + */ + if (strlen(str) >= 128 || strlen(tmp) > 100) + return str; + + sprintf(buf, "which %s > /dev/null", BC_APP); + if (system(buf)) { + log_err("fio: bc is needed for performing math\n"); + return NULL; + } + + sprintf(buf, "echo '%s' | %s", tmp, BC_APP); + f = popen(buf, "r"); + if (!f) + return NULL; + + ret = fread(&buf[tmp - str], 1, 128 - (tmp - str), f); + if (ret <= 0) { + pclose(f); + return NULL; + } + + pclose(f); + buf[(tmp - str) + ret - 1] = '\0'; + memcpy(buf, str, tmp - str); + free(str); + return strdup(buf); +} + +/* + * Return a copy of the input string with substrings of the form ${VARNAME} + * substituted with the value of the environment variable VARNAME. The + * substitution always occurs, even if VARNAME is empty or the corresponding + * environment variable undefined. 
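 *
 * Usage sketch; the variable name and value are made up:
 *
 *	setenv("FIO_SIZE", "4g", 1);
 *	char *s = fio_option_dup_subs("size=${FIO_SIZE}");
 *
 * leaves s pointing at a fresh copy reading "size=4g", which the caller
 * must free(). Had FIO_SIZE been unset, the ${FIO_SIZE} text would simply
 * have been dropped from the copy.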
+ */ +char *fio_option_dup_subs(const char *opt) +{ + char out[OPT_LEN_MAX+1]; + char in[OPT_LEN_MAX+1]; + char *outptr = out; + char *inptr = in; + char *ch1, *ch2, *env; + ssize_t nchr = OPT_LEN_MAX; + size_t envlen; + + if (strlen(opt) + 1 > OPT_LEN_MAX) { + log_err("OPT_LEN_MAX (%d) is too small\n", OPT_LEN_MAX); + return NULL; + } + + snprintf(in, sizeof(in), "%s", opt); + + while (*inptr && nchr > 0) { + if (inptr[0] == '$' && inptr[1] == '{') { + ch2 = strchr(inptr, '}'); + if (ch2 && inptr+1 < ch2) { + ch1 = inptr+2; + inptr = ch2+1; + *ch2 = '\0'; + + env = getenv(ch1); + if (env) { + envlen = strlen(env); + if (envlen <= nchr) { + memcpy(outptr, env, envlen); + outptr += envlen; + nchr -= envlen; + } + } + + continue; + } + } + + *outptr++ = *inptr++; + --nchr; + } + + *outptr = '\0'; + return strdup(out); +} + +/* + * Look for reserved variable names and replace them with real values + */ +static char *fio_keyword_replace(char *opt) +{ + char *s; + int i; + int docalc = 0; + + for (i = 0; fio_keywords[i].word != NULL; i++) { + struct fio_keyword *kw = &fio_keywords[i]; + + while ((s = strstr(opt, kw->word)) != NULL) { + char *new = malloc(strlen(opt) + 1); + char *o_org = opt; + int olen = s - opt; + int len; + + /* + * Copy part of the string before the keyword and + * sprintf() the replacement after it. + */ + memcpy(new, opt, olen); + len = sprintf(new + olen, "%s", kw->replace); + + /* + * If there's more in the original string, copy that + * in too + */ + opt += strlen(kw->word) + olen; + if (strlen(opt)) + memcpy(new + olen + len, opt, opt - o_org - 1); + + /* + * replace opt and free the old opt + */ + opt = new; + free(o_org); + + docalc = 1; + } + } + + /* + * Check for potential math and invoke bc, if possible + */ + if (docalc) + opt = bc_calc(opt); + + return opt; +} + +static char **dup_and_sub_options(char **opts, int num_opts) +{ + int i; + char **opts_copy = malloc(num_opts * sizeof(*opts)); + for (i = 0; i < num_opts; i++) { + opts_copy[i] = fio_option_dup_subs(opts[i]); + if (!opts_copy[i]) + continue; + opts_copy[i] = fio_keyword_replace(opts_copy[i]); + } + return opts_copy; +} + +static void show_closest_option(const char *opt) +{ + int best_option, best_distance; + int i, distance; + char *name; + + if (!strlen(opt)) + return; + + name = strdup(opt); + i = 0; + while (name[i] != '\0' && name[i] != '=') + i++; + name[i] = '\0'; + + best_option = -1; + best_distance = INT_MAX; + i = 0; + while (fio_options[i].name) { + distance = string_distance(name, fio_options[i].name); + if (distance < best_distance) { + best_distance = distance; + best_option = i; + } + i++; + } + + if (best_option != -1 && string_distance_ok(name, best_distance) && + fio_options[best_option].type != FIO_OPT_UNSUPPORTED) + log_err("Did you mean %s?\n", fio_options[best_option].name); + + free(name); +} + +int fio_options_parse(struct thread_data *td, char **opts, int num_opts) +{ + int i, ret, unknown; + char **opts_copy; + + sort_options(opts, fio_options, num_opts); + opts_copy = dup_and_sub_options(opts, num_opts); + + for (ret = 0, i = 0, unknown = 0; i < num_opts; i++) { + const struct fio_option *o; + int newret = parse_option(opts_copy[i], opts[i], fio_options, + &o, &td->o, &td->opt_list); + + if (!newret && o) + fio_option_mark_set(&td->o, o); + + if (opts_copy[i]) { + if (newret && !o) { + unknown++; + continue; + } + free(opts_copy[i]); + opts_copy[i] = NULL; + } + + ret |= newret; + } + + if (unknown) { + ret |= ioengine_load(td); + if (td->eo) { + sort_options(opts_copy, 
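/*
 * Sketch of this second pass; the misspelled option is made up. An input
 * such as "iodepht=16" fails the job-option pass above, survives in
 * opts_copy, and gets one more try against the ioengine's table here. If
 * that also misses, the user ends up with
 *
 *	Bad option <iodepht=16>
 *	Did you mean iodepth?
 *
 * courtesy of show_closest_option() and its string_distance() scan.
 */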
td->io_ops->options, num_opts); + opts = opts_copy; + } + for (i = 0; i < num_opts; i++) { + const struct fio_option *o = NULL; + int newret = 1; + + if (!opts_copy[i]) + continue; + + if (td->eo) + newret = parse_option(opts_copy[i], opts[i], + td->io_ops->options, &o, + td->eo, &td->opt_list); + + ret |= newret; + if (!o) { + log_err("Bad option <%s>\n", opts[i]); + show_closest_option(opts[i]); + } + free(opts_copy[i]); + opts_copy[i] = NULL; + } + } + + free(opts_copy); + return ret; +} + +int fio_cmd_option_parse(struct thread_data *td, const char *opt, char *val) +{ + int ret; + + ret = parse_cmd_option(opt, val, fio_options, &td->o, &td->opt_list); + if (!ret) { + const struct fio_option *o; + + o = find_option_c(fio_options, opt); + if (o) + fio_option_mark_set(&td->o, o); + } + + return ret; +} + +int fio_cmd_ioengine_option_parse(struct thread_data *td, const char *opt, + char *val) +{ + return parse_cmd_option(opt, val, td->io_ops->options, td->eo, + &td->opt_list); +} + +void fio_fill_default_options(struct thread_data *td) +{ + td->o.magic = OPT_MAGIC; + fill_default_options(&td->o, fio_options); +} + +int fio_show_option_help(const char *opt) +{ + return show_cmd_help(fio_options, opt); +} + +/* + * dupe FIO_OPT_STR_STORE options + */ +void fio_options_mem_dupe(struct thread_data *td) +{ + options_mem_dupe(fio_options, &td->o); + + if (td->eo && td->io_ops) { + void *oldeo = td->eo; + + td->eo = malloc(td->io_ops->option_struct_size); + memcpy(td->eo, oldeo, td->io_ops->option_struct_size); + options_mem_dupe(td->io_ops->options, td->eo); + } +} + +unsigned int fio_get_kb_base(void *data) +{ + struct thread_data *td = cb_data_to_td(data); + struct thread_options *o = &td->o; + unsigned int kb_base = 0; + + /* + * This is a hack... For private options, *data is not holding + * a pointer to the thread_options, but to private data. This means + * we can't safely dereference it, but magic is first so mem wise + * it is valid. But this also means that if the job first sets + * kb_base and expects that to be honored by private options, + * it will be disappointed. We will return the global default + * for this. 
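 *
 * A sketch of the layout this relies on; the engine-private struct shown
 * is hypothetical:
 *
 *	struct thread_options { unsigned int magic; ... };	magic == OPT_MAGIC
 *	struct hypothetical_eo { unsigned int other; ... };	no magic word
 *
 * Reading ->magic through either pointer is an in-bounds access of the
 * same size and offset, but only a genuine thread_options compares equal
 * to OPT_MAGIC; anything else falls through to the 1024 default below.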
+ */ + if (o && o->magic == OPT_MAGIC) + kb_base = o->kb_base; + if (!kb_base) + kb_base = 1024; + + return kb_base; +} + +int add_option(const struct fio_option *o) +{ + struct fio_option *__o; + int opt_index = 0; + + __o = fio_options; + while (__o->name) { + opt_index++; + __o++; + } + + if (opt_index + 1 == FIO_MAX_OPTS) { + log_err("fio: FIO_MAX_OPTS is too small\n"); + return 1; + } + + memcpy(&fio_options[opt_index], o, sizeof(*o)); + fio_options[opt_index + 1].name = NULL; + return 0; +} + +void invalidate_profile_options(const char *prof_name) +{ + struct fio_option *o; + + o = fio_options; + while (o->name) { + if (o->prof_name && !strcmp(o->prof_name, prof_name)) { + o->type = FIO_OPT_INVALID; + o->prof_name = NULL; + } + o++; + } +} + +void add_opt_posval(const char *optname, const char *ival, const char *help) +{ + struct fio_option *o; + unsigned int i; + + o = find_option(fio_options, optname); + if (!o) + return; + + for (i = 0; i < PARSE_MAX_VP; i++) { + if (o->posval[i].ival) + continue; + + o->posval[i].ival = ival; + o->posval[i].help = help; + break; + } +} + +void del_opt_posval(const char *optname, const char *ival) +{ + struct fio_option *o; + unsigned int i; + + o = find_option(fio_options, optname); + if (!o) + return; + + for (i = 0; i < PARSE_MAX_VP; i++) { + if (!o->posval[i].ival) + continue; + if (strcmp(o->posval[i].ival, ival)) + continue; + + o->posval[i].ival = NULL; + o->posval[i].help = NULL; + } +} + +void fio_options_free(struct thread_data *td) +{ + options_free(fio_options, &td->o); + if (td->eo && td->io_ops && td->io_ops->options) { + options_free(td->io_ops->options, td->eo); + free(td->eo); + td->eo = NULL; + } +} + +struct fio_option *fio_option_find(const char *name) +{ + return find_option(fio_options, name); +} + +static struct fio_option *find_next_opt(struct fio_option *from, + unsigned int off1) +{ + struct fio_option *opt; + + if (!from) + from = &fio_options[0]; + else + from++; + + opt = NULL; + do { + if (off1 == from->off1) { + opt = from; + break; + } + from++; + } while (from->name); + + return opt; +} + +static int opt_is_set(struct thread_options *o, struct fio_option *opt) +{ + unsigned int opt_off, index, offset; + + opt_off = opt - &fio_options[0]; + index = opt_off / (8 * sizeof(uint64_t)); + offset = opt_off & ((8 * sizeof(uint64_t)) - 1); + return (o->set_options[index] & ((uint64_t)1 << offset)) != 0; +} + +bool __fio_option_is_set(struct thread_options *o, unsigned int off1) +{ + struct fio_option *opt, *next; + + next = NULL; + while ((opt = find_next_opt(next, off1)) != NULL) { + if (opt_is_set(o, opt)) + return true; + + next = opt; + } + + return false; +} + +void fio_option_mark_set(struct thread_options *o, const struct fio_option *opt) +{ + unsigned int opt_off, index, offset; + + opt_off = opt - &fio_options[0]; + index = opt_off / (8 * sizeof(uint64_t)); + offset = opt_off & ((8 * sizeof(uint64_t)) - 1); + o->set_options[index] |= (uint64_t)1 << offset; +} diff --git a/options.h b/options.h new file mode 100644 index 0000000..5276f31 --- /dev/null +++ b/options.h @@ -0,0 +1,56 @@ +#ifndef FIO_OPTION_H +#define FIO_OPTION_H + +#define FIO_MAX_OPTS 512 + +#include +#include +#include "parse.h" +#include "lib/types.h" + +int add_option(const struct fio_option *); +void invalidate_profile_options(const char *); +extern char *exec_profile; + +void add_opt_posval(const char *, const char *, const char *); +void del_opt_posval(const char *, const char *); +struct thread_data; +void fio_options_free(struct thread_data 
*); +char *get_next_str(char **ptr); +int get_max_str_idx(char *input); +char* get_name_by_idx(char *input, int index); +int set_name_idx(char *, size_t, char *, int, bool); + +extern char client_sockaddr_str[]; /* used with --client option */ + +extern struct fio_option fio_options[FIO_MAX_OPTS]; + +extern bool __fio_option_is_set(struct thread_options *, unsigned int off); + +#define fio_option_is_set(__td, name) \ +({ \ + const unsigned int off = offsetof(struct thread_options, name); \ + bool __r = __fio_option_is_set((__td), off); \ + __r; \ +}) + +extern void fio_option_mark_set(struct thread_options *, + const struct fio_option *); + +static inline bool o_match(const struct fio_option *o, const char *opt) +{ + if (!strcmp(o->name, opt)) + return true; + else if (o->alias && !strcmp(o->alias, opt)) + return true; + + return false; +} + +extern struct fio_option *find_option(struct fio_option *, const char *); +extern const struct fio_option * +find_option_c(const struct fio_option *, const char *); +extern struct fio_option *fio_option_find(const char *); +extern unsigned int fio_get_kb_base(void *); + +#endif diff --git a/os/kcompat.h b/os/kcompat.h new file mode 100644 index 0000000..9ef1f33 --- /dev/null +++ b/os/kcompat.h @@ -0,0 +1,9 @@ +#ifndef _KCOMPAT_H_ +#define _KCOMPAT_H_ + +#include + +#define u64 uint64_t +#define u32 uint32_t + +#endif diff --git a/os/linux/io_uring.h b/os/linux/io_uring.h new file mode 100644 index 0000000..03d2dde --- /dev/null +++ b/os/linux/io_uring.h @@ -0,0 +1,194 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +/* + * Header file for the io_uring interface. + * + * Copyright (C) 2019 Jens Axboe + * Copyright (C) 2019 Christoph Hellwig + */ +#ifndef LINUX_IO_URING_H +#define LINUX_IO_URING_H + +#include +#include + +/* + * IO submission data structure (Submission Queue Entry) + */ +struct io_uring_sqe { + __u8 opcode; /* type of operation for this sqe */ + __u8 flags; /* IOSQE_ flags */ + __u16 ioprio; /* ioprio for the request */ + __s32 fd; /* file descriptor to do IO on */ + union { + __u64 off; /* offset into file */ + __u64 addr2; + }; + __u64 addr; /* pointer to buffer or iovecs */ + __u32 len; /* buffer size or number of iovecs */ + union { + __kernel_rwf_t rw_flags; + __u32 fsync_flags; + __u16 poll_events; + __u32 sync_range_flags; + __u32 msg_flags; + __u32 timeout_flags; + __u32 accept_flags; + __u32 cancel_flags; + __u32 open_flags; + __u32 statx_flags; + }; + __u64 user_data; /* data to be passed back at completion time */ + union { + __u16 buf_index; /* index into fixed buffers, if used */ + __u64 __pad2[3]; + }; +}; + +/* + * sqe->flags + */ +#define IOSQE_FIXED_FILE (1U << 0) /* use fixed fileset */ +#define IOSQE_IO_DRAIN (1U << 1) /* issue after inflight IO */ +#define IOSQE_IO_LINK (1U << 2) /* links next sqe */ +#define IOSQE_IO_HARDLINK (1U << 3) /* like LINK, but stronger */ +#define IOSQE_ASYNC (1U << 4) /* always go async */ + +/* + * io_uring_setup() flags + */ +#define IORING_SETUP_IOPOLL (1U << 0) /* io_context is polled */ +#define IORING_SETUP_SQPOLL (1U << 1) /* SQ poll thread */ +#define IORING_SETUP_SQ_AFF (1U << 2) /* sq_thread_cpu is valid */ +#define IORING_SETUP_CQSIZE (1U << 3) /* app defines CQ size */ + +enum { + IORING_OP_NOP, + IORING_OP_READV, + IORING_OP_WRITEV, + IORING_OP_FSYNC, + IORING_OP_READ_FIXED, + IORING_OP_WRITE_FIXED, + IORING_OP_POLL_ADD, + IORING_OP_POLL_REMOVE, + IORING_OP_SYNC_FILE_RANGE, + IORING_OP_SENDMSG, + IORING_OP_RECVMSG, + IORING_OP_TIMEOUT, + 
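/*
 * A sketch, not part of the ABI header, of how one of these opcodes is
 * consumed. Ring setup via io_uring_setup(2) and the mmap(2) of the rings
 * is omitted; sqes, sq_tail and ring_mask are assumed to come from that
 * setup:
 *
 *	struct io_uring_sqe *sqe = &sqes[sq_tail & ring_mask];
 *	memset(sqe, 0, sizeof(*sqe));
 *	sqe->opcode    = IORING_OP_READV;
 *	sqe->fd        = fd;
 *	sqe->addr      = (unsigned long) iov;	// struct iovec array
 *	sqe->len       = nr_iov;
 *	sqe->off       = file_offset;
 *	sqe->user_data = tag;			// echoed back in the CQE
 *
 * after which the new tail is published and io_uring_enter(2) submits it.
 */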
IORING_OP_TIMEOUT_REMOVE, + IORING_OP_ACCEPT, + IORING_OP_ASYNC_CANCEL, + IORING_OP_LINK_TIMEOUT, + IORING_OP_CONNECT, + IORING_OP_FALLOCATE, + IORING_OP_OPENAT, + IORING_OP_CLOSE, + IORING_OP_FILES_UPDATE, + IORING_OP_STATX, + IORING_OP_READ, + IORING_OP_WRITE, + + /* this goes last, obviously */ + IORING_OP_LAST, +}; + +/* + * sqe->fsync_flags + */ +#define IORING_FSYNC_DATASYNC (1U << 0) + +/* + * sqe->timeout_flags + */ +#define IORING_TIMEOUT_ABS (1U << 0) + +/* + * IO completion data structure (Completion Queue Entry) + */ +struct io_uring_cqe { + __u64 user_data; /* sqe->data submission passed back */ + __s32 res; /* result code for this event */ + __u32 flags; +}; + +/* + * Magic offsets for the application to mmap the data it needs + */ +#define IORING_OFF_SQ_RING 0ULL +#define IORING_OFF_CQ_RING 0x8000000ULL +#define IORING_OFF_SQES 0x10000000ULL + +/* + * Filled with the offset for mmap(2) + */ +struct io_sqring_offsets { + __u32 head; + __u32 tail; + __u32 ring_mask; + __u32 ring_entries; + __u32 flags; + __u32 dropped; + __u32 array; + __u32 resv1; + __u64 resv2; +}; + +/* + * sq_ring->flags + */ +#define IORING_SQ_NEED_WAKEUP (1U << 0) /* needs io_uring_enter wakeup */ + +struct io_cqring_offsets { + __u32 head; + __u32 tail; + __u32 ring_mask; + __u32 ring_entries; + __u32 overflow; + __u32 cqes; + __u64 resv[2]; +}; + +/* + * io_uring_enter(2) flags + */ +#define IORING_ENTER_GETEVENTS (1U << 0) +#define IORING_ENTER_SQ_WAKEUP (1U << 1) + +/* + * Passed in for io_uring_setup(2). Copied back with updated info on success + */ +struct io_uring_params { + __u32 sq_entries; + __u32 cq_entries; + __u32 flags; + __u32 sq_thread_cpu; + __u32 sq_thread_idle; + __u32 features; + __u32 resv[4]; + struct io_sqring_offsets sq_off; + struct io_cqring_offsets cq_off; +}; + +/* + * io_uring_params->features flags + */ +#define IORING_FEAT_SINGLE_MMAP (1U << 0) +#define IORING_FEAT_NODROP (1U << 1) +#define IORING_FEAT_SUBMIT_STABLE (1U << 2) + +/* + * io_uring_register(2) opcodes and arguments + */ +#define IORING_REGISTER_BUFFERS 0 +#define IORING_UNREGISTER_BUFFERS 1 +#define IORING_REGISTER_FILES 2 +#define IORING_UNREGISTER_FILES 3 +#define IORING_REGISTER_EVENTFD 4 +#define IORING_UNREGISTER_EVENTFD 5 +#define IORING_REGISTER_FILES_UPDATE 6 + +struct io_uring_files_update { + __u32 offset; + __s32 *fds; +}; + +#endif diff --git a/os/os-aix.h b/os/os-aix.h new file mode 100644 index 0000000..1aab96e --- /dev/null +++ b/os/os-aix.h @@ -0,0 +1,49 @@ +#ifndef FIO_OS_AIX_H +#define FIO_OS_AIX_H + +#define FIO_OS os_aix + +#include +#include +#include +#include + +#include "../file.h" + +#define FIO_HAVE_ODIRECT +#define FIO_USE_GENERIC_INIT_RANDOM_STATE + +#define OS_MAP_ANON MAP_ANON +#define OS_MSG_DONTWAIT 0 + +#define FIO_USE_GENERIC_SWAP + +static inline int blockdev_invalidate_cache(struct fio_file *f) +{ + return ENOTSUP; +} + +static inline int blockdev_size(struct fio_file *f, unsigned long long *bytes) +{ + struct devinfo info; + + if (!ioctl(f->fd, IOCINFO, &info)) { + *bytes = (unsigned long long)info.un.scdk.numblks * + info.un.scdk.blksize; + return 0; + } + + return errno; +} + +static inline unsigned long long os_phys_mem(void) +{ + long mem = sysconf(_SC_AIX_REALMEM); + + if (mem == -1) + return 0; + + return (unsigned long long) mem * 1024; +} + +#endif diff --git a/os/os-android.h b/os/os-android.h new file mode 100644 index 0000000..3c05077 --- /dev/null +++ b/os/os-android.h @@ -0,0 +1,281 @@ +#ifndef FIO_OS_ANDROID_H +#define FIO_OS_ANDROID_H + +#define FIO_OS 
os_android + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "./os-linux-syscall.h" +#include "../file.h" + +#ifndef __has_builtin // Optional of course. + #define __has_builtin(x) 0 // Compatibility with non-clang compilers. +#endif + +#define FIO_HAVE_DISK_UTIL +#define FIO_HAVE_IOSCHED_SWITCH +#define FIO_HAVE_IOPRIO +#define FIO_HAVE_IOPRIO_CLASS +#define FIO_HAVE_ODIRECT +#define FIO_HAVE_HUGETLB +#define FIO_HAVE_BLKTRACE +#define FIO_HAVE_CL_SIZE +#define FIO_HAVE_CGROUPS +#define FIO_HAVE_FS_STAT +#define FIO_HAVE_TRIM +#define FIO_HAVE_GETTID +#define FIO_USE_GENERIC_INIT_RANDOM_STATE +#define FIO_HAVE_E4_ENG +#define FIO_HAVE_BYTEORDER_FUNCS +#define FIO_HAVE_MMAP_HUGE +#define FIO_NO_HAVE_SHM_H + +#define OS_MAP_ANON MAP_ANONYMOUS + +#ifndef POSIX_MADV_DONTNEED +#define posix_madvise madvise +#define POSIX_MADV_DONTNEED MADV_DONTNEED +#define POSIX_MADV_SEQUENTIAL MADV_SEQUENTIAL +#define POSIX_MADV_RANDOM MADV_RANDOM +#endif + +#ifdef MADV_REMOVE +#define FIO_MADV_FREE MADV_REMOVE +#endif +#ifndef MAP_HUGETLB +#define MAP_HUGETLB 0x40000 /* arch specific */ +#endif + +#ifndef CONFIG_NO_SHM +/* + * Bionic doesn't support SysV shared memeory, so implement it using ashmem + */ +#include +#include +#include +#define shmid_ds shmid64_ds +#define SHM_HUGETLB 04000 + +#define ASHMEM_DEVICE "/dev/ashmem" + +static inline int shmctl(int __shmid, int __cmd, struct shmid_ds *__buf) +{ + int ret=0; + if (__cmd == IPC_RMID) + { + int length = ioctl(__shmid, ASHMEM_GET_SIZE, NULL); + struct ashmem_pin pin = {0 , length}; + ret = ioctl(__shmid, ASHMEM_UNPIN, &pin); + close(__shmid); + } + return ret; +} + +static inline int shmget(key_t __key, size_t __size, int __shmflg) +{ + int fd,ret; + char keybuf[11]; + + fd = open(ASHMEM_DEVICE, O_RDWR); + if (fd < 0) + return fd; + + sprintf(keybuf,"%d",__key); + ret = ioctl(fd, ASHMEM_SET_NAME, keybuf); + if (ret < 0) + goto error; + + /* Stores size in first 8 bytes, allocate extra space */ + ret = ioctl(fd, ASHMEM_SET_SIZE, __size + sizeof(uint64_t)); + if (ret < 0) + goto error; + + return fd; + +error: + close(fd); + return ret; +} + +static inline void *shmat(int __shmid, const void *__shmaddr, int __shmflg) +{ + size_t size = ioctl(__shmid, ASHMEM_GET_SIZE, NULL); + /* Needs to be 8-byte aligned to prevent SIGBUS on 32-bit ARM */ + uint64_t *ptr = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED, __shmid, 0); + /* Save size at beginning of buffer, for use with munmap */ + *ptr = size; + return ptr + 1; +} + +static inline int shmdt (const void *__shmaddr) +{ + /* Find mmap size which we stored at the beginning of the buffer */ + uint64_t *ptr = (uint64_t *)__shmaddr - 1; + size_t size = *ptr; + return munmap(ptr, size); +} +#endif + +#define SPLICE_DEF_SIZE (64*1024) + +enum { + IOPRIO_CLASS_NONE, + IOPRIO_CLASS_RT, + IOPRIO_CLASS_BE, + IOPRIO_CLASS_IDLE, +}; + +enum { + IOPRIO_WHO_PROCESS = 1, + IOPRIO_WHO_PGRP, + IOPRIO_WHO_USER, +}; + +#define IOPRIO_BITS 16 +#define IOPRIO_CLASS_SHIFT 13 + +#define IOPRIO_MIN_PRIO 0 /* highest priority */ +#define IOPRIO_MAX_PRIO 7 /* lowest priority */ + +#define IOPRIO_MIN_PRIO_CLASS 0 +#define IOPRIO_MAX_PRIO_CLASS 3 + +static inline int ioprio_set(int which, int who, int ioprio_class, int ioprio) +{ + /* + * If no class is set, assume BE + */ + if (!ioprio_class) + ioprio_class = IOPRIO_CLASS_BE; + + ioprio |= ioprio_class << IOPRIO_CLASS_SHIFT; + return syscall(__NR_ioprio_set, which, who, ioprio); +} + 
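/*
 * Usage sketch for the wrapper above; the helper name is made up and not
 * part of the patch. For IOPRIO_WHO_PROCESS, a who of 0 addresses the
 * calling thread, and the wrapper folds the class into the top bits of
 * the priority word before the raw syscall.
 */
static inline int fio_demo_set_be_prio(int prio)
{
	/* best-effort class, prio IOPRIO_MIN_PRIO..IOPRIO_MAX_PRIO (0..7) */
	return ioprio_set(IOPRIO_WHO_PROCESS, 0, IOPRIO_CLASS_BE, prio);
}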
+#ifndef BLKGETSIZE64 +#define BLKGETSIZE64 _IOR(0x12,114,size_t) +#endif + +#ifndef BLKFLSBUF +#define BLKFLSBUF _IO(0x12,97) +#endif + +#ifndef BLKDISCARD +#define BLKDISCARD _IO(0x12,119) +#endif + +static inline int blockdev_invalidate_cache(struct fio_file *f) +{ + return ioctl(f->fd, BLKFLSBUF); +} + +static inline int blockdev_size(struct fio_file *f, unsigned long long *bytes) +{ + if (!ioctl(f->fd, BLKGETSIZE64, bytes)) + return 0; + + return errno; +} + +static inline unsigned long long os_phys_mem(void) +{ + long pagesize, pages; + + pagesize = sysconf(_SC_PAGESIZE); + pages = sysconf(_SC_PHYS_PAGES); + if (pages == -1 || pagesize == -1) + return 0; + + return (unsigned long long) pages * (unsigned long long) pagesize; +} + +#ifdef O_NOATIME +#define FIO_O_NOATIME O_NOATIME +#else +#define FIO_O_NOATIME 0 +#endif + +/* Check for GCC or Clang byte swap intrinsics */ +#if (__has_builtin(__builtin_bswap16) && __has_builtin(__builtin_bswap32) \ + && __has_builtin(__builtin_bswap64)) || (__GNUC__ > 4 \ + || (__GNUC__ == 4 && __GNUC_MINOR__ >= 8)) /* fio_swapN */ +#define fio_swap16(x) __builtin_bswap16(x) +#define fio_swap32(x) __builtin_bswap32(x) +#define fio_swap64(x) __builtin_bswap64(x) +#else +#include +#define fio_swap16(x) bswap_16(x) +#define fio_swap32(x) bswap_32(x) +#define fio_swap64(x) bswap_64(x) +#endif /* fio_swapN */ + +#define CACHE_LINE_FILE \ + "/sys/devices/system/cpu/cpu0/cache/index0/coherency_line_size" + +static inline int arch_cache_line_size(void) +{ + char size[32]; + int fd, ret; + + fd = open(CACHE_LINE_FILE, O_RDONLY); + if (fd < 0) + return -1; + + ret = read(fd, size, sizeof(size)); + + close(fd); + + if (ret <= 0) + return -1; + else + return atoi(size); +} + +static inline unsigned long long get_fs_free_size(const char *path) +{ + unsigned long long ret; + struct statfs s; + + if (statfs(path, &s) < 0) + return -1ULL; + + ret = s.f_bsize; + ret *= (unsigned long long) s.f_bfree; + return ret; +} + +static inline int os_trim(struct fio_file *f, unsigned long long start, + unsigned long long len) +{ + uint64_t range[2]; + + range[0] = start; + range[1] = len; + + if (!ioctl(f->fd, BLKDISCARD, range)) + return 0; + + return errno; +} + +#ifdef CONFIG_SCHED_IDLE +static inline int fio_set_sched_idle(void) +{ + struct sched_param p = { .sched_priority = 0, }; + return sched_setscheduler(gettid(), SCHED_IDLE, &p); +} +#endif + +#endif diff --git a/os/os-dragonfly.h b/os/os-dragonfly.h new file mode 100644 index 0000000..44bfcd5 --- /dev/null +++ b/os/os-dragonfly.h @@ -0,0 +1,253 @@ +#ifndef FIO_OS_DRAGONFLY_H +#define FIO_OS_DRAGONFLY_H + +#define FIO_OS os_dragonfly + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* API changed during "5.3 development" */ +#if __DragonFly_version < 500302 +#include +#define DAIOCTRIM IOCTLTRIM +#else +#include +#endif + +#include "../file.h" +#include "../lib/types.h" + +#define FIO_HAVE_ODIRECT +#define FIO_USE_GENERIC_INIT_RANDOM_STATE +#define FIO_HAVE_FS_STAT +#define FIO_HAVE_TRIM +#define FIO_HAVE_CHARDEV_SIZE +#define FIO_HAVE_GETTID +#define FIO_HAVE_CPU_AFFINITY +#define FIO_HAVE_IOPRIO +#define FIO_HAVE_SHM_ATTACH_REMOVED + +#define OS_MAP_ANON MAP_ANON + +#ifndef PTHREAD_STACK_MIN +#define PTHREAD_STACK_MIN 4096 +#endif + +#define fio_swap16(x) bswap16(x) +#define fio_swap32(x) bswap32(x) +#define fio_swap64(x) bswap64(x) + +/* This is supposed to equal (sizeof(cpumask_t)*8) */ +#define FIO_MAX_CPUS SMP_MAXCPU + +typedef cpumask_t os_cpu_mask_t; + +/* + * These 
macros are copied from sys/cpu/x86_64/include/types.h. + * It's okay to copy from an arch-dependent header because x86_64 is the only + * supported arch, and no other arch is going to be supported any time soon. + * + * These are supposed to be able to be included from userspace by defining + * _KERNEL_STRUCTURES, however this scheme is so badly broken that enabling it + * causes compile-time conflicts with other headers. Although the current + * upstream code no longer requires _KERNEL_STRUCTURES, they should be kept + * here for compatibility with older versions. + */ +#ifndef CPUMASK_SIMPLE +#define CPUMASK_SIMPLE(cpu) ((uint64_t)1 << (cpu)) +#define CPUMASK_TESTBIT(val, i) ((val).ary[((i) >> 6) & 3] & \ + CPUMASK_SIMPLE((i) & 63)) +#define CPUMASK_ORBIT(mask, i) ((mask).ary[((i) >> 6) & 3] |= \ + CPUMASK_SIMPLE((i) & 63)) +#define CPUMASK_NANDBIT(mask, i) ((mask).ary[((i) >> 6) & 3] &= \ + ~CPUMASK_SIMPLE((i) & 63)) +#define CPUMASK_ASSZERO(mask) do { \ + (mask).ary[0] = 0; \ + (mask).ary[1] = 0; \ + (mask).ary[2] = 0; \ + (mask).ary[3] = 0; \ + } while(0) +#endif + +/* + * Define USCHED_GET_CPUMASK since the macro didn't exist until release 4.5. + * usched_set(2) returns EINVAL if the kernel doesn't support it. + * + * Also note usched_set(2) works only for the current thread regardless of + * the command type. It doesn't work against another thread regardless of + * a caller's privilege. A caller would generally specify 0 for pid for the + * current thread, though that's the only choice anyway. See BUGS in + * usched_set(2). + */ +#ifndef USCHED_GET_CPUMASK +#define USCHED_GET_CPUMASK 5 +#endif + +/* No CPU_COUNT(), but use the default function defined in os/os.h */ +#define fio_cpu_count(mask) CPU_COUNT((mask)) + +static inline int fio_cpuset_init(os_cpu_mask_t *mask) +{ + CPUMASK_ASSZERO(*mask); + return 0; +} + +static inline int fio_cpuset_exit(os_cpu_mask_t *mask) +{ + return 0; +} + +static inline void fio_cpu_clear(os_cpu_mask_t *mask, int cpu) +{ + CPUMASK_NANDBIT(*mask, cpu); +} + +static inline void fio_cpu_set(os_cpu_mask_t *mask, int cpu) +{ + CPUMASK_ORBIT(*mask, cpu); +} + +static inline bool fio_cpu_isset(os_cpu_mask_t *mask, int cpu) +{ + return CPUMASK_TESTBIT(*mask, cpu) != 0; +} + +static inline int fio_setaffinity(int pid, os_cpu_mask_t mask) +{ + int i, firstcall = 1; + + /* 0 for the current thread, see BUGS in usched_set(2) */ + pid = 0; + + for (i = 0; i < FIO_MAX_CPUS; i++) { + if (!CPUMASK_TESTBIT(mask, i)) + continue; + if (firstcall) { + if (usched_set(pid, USCHED_SET_CPU, &i, sizeof(int))) + return -1; + firstcall = 0; + } else { + if (usched_set(pid, USCHED_ADD_CPU, &i, sizeof(int))) + return -1; + } + } + + return 0; +} + +static inline int fio_getaffinity(int pid, os_cpu_mask_t *mask) +{ + /* 0 for the current thread, see BUGS in usched_set(2) */ + pid = 0; + + if (usched_set(pid, USCHED_GET_CPUMASK, mask, sizeof(*mask))) + return -1; + + return 0; +} + +/* fio code is Linux-based, so rename macros to Linux style */ +#define IOPRIO_WHO_PROCESS PRIO_PROCESS +#define IOPRIO_WHO_PGRP PRIO_PGRP +#define IOPRIO_WHO_USER PRIO_USER + +#define IOPRIO_MIN_PRIO 1 /* lowest priority */ +#define IOPRIO_MAX_PRIO 10 /* highest priority */ + +/* + * Prototypes declared in sys/sys/resource.h prevent us from defining + * ioprio_set() with 4 arguments, so define fio's ioprio_set() as a macro. + * Note that there is no notion of a priority class within ioprio_set(2), + * unlike Linux. 
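 *
 * For example (a sketch; some_class stands for whatever value the common
 * code passes), the four-argument call
 *
 *	ioprio_set(IOPRIO_WHO_PROCESS, 0, some_class, 5);
 *
 * compiles down to the native three-argument
 *
 *	ioprio_set(IOPRIO_WHO_PROCESS, 0, 5);
 *
 * with the class argument silently discarded.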
+ */ +#define ioprio_set(which, who, ioprio_class, ioprio) \ + ioprio_set(which, who, ioprio) + +static inline int blockdev_size(struct fio_file *f, unsigned long long *bytes) +{ + struct partinfo pi; + + if (!ioctl(f->fd, DIOCGPART, &pi)) { + *bytes = (unsigned long long) pi.media_size; + return 0; + } + + *bytes = 0; + return errno; +} + +static inline int chardev_size(struct fio_file *f, unsigned long long *bytes) +{ + return blockdev_size(f, bytes); +} + +static inline int blockdev_invalidate_cache(struct fio_file *f) +{ + return ENOTSUP; +} + +static inline unsigned long long os_phys_mem(void) +{ + int mib[2] = { CTL_HW, HW_PHYSMEM }; + uint64_t mem; + size_t len = sizeof(mem); + + sysctl(mib, 2, &mem, &len, NULL, 0); + return mem; +} + +#ifndef CONFIG_HAVE_GETTID +static inline int gettid(void) +{ + return (int) lwp_gettid(); +} +#endif + +static inline unsigned long long get_fs_free_size(const char *path) +{ + unsigned long long ret; + struct statvfs s; + + if (statvfs(path, &s) < 0) + return -1ULL; + + ret = s.f_frsize; + ret *= (unsigned long long) s.f_bfree; + return ret; +} + +static inline int os_trim(struct fio_file *f, unsigned long long start, + unsigned long long len) +{ + off_t range[2]; + + range[0] = start; + range[1] = len; + + if (!ioctl(f->fd, DAIOCTRIM, range)) + return 0; + + return errno; +} + +#ifdef MADV_FREE +#define FIO_MADV_FREE MADV_FREE +#endif + +static inline int shm_attach_to_open_removed(void) +{ + int x; + size_t len = sizeof(x); + + if (sysctlbyname("kern.ipc.shm_allow_removed", &x, &len, NULL, 0) < 0) + return 0; + + return x > 0 ? 1 : 0; +} + +#endif diff --git a/os/os-freebsd.h b/os/os-freebsd.h new file mode 100644 index 0000000..b3addf9 --- /dev/null +++ b/os/os-freebsd.h @@ -0,0 +1,146 @@ +#ifndef FIO_OS_FREEBSD_H +#define FIO_OS_FREEBSD_H + +#define FIO_OS os_freebsd + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "../file.h" + +#define FIO_HAVE_ODIRECT +#define FIO_USE_GENERIC_INIT_RANDOM_STATE +#define FIO_HAVE_CHARDEV_SIZE +#define FIO_HAVE_FS_STAT +#define FIO_HAVE_TRIM +#define FIO_HAVE_GETTID +#define FIO_HAVE_CPU_AFFINITY +#define FIO_HAVE_SHM_ATTACH_REMOVED + +#define OS_MAP_ANON MAP_ANON + +#define fio_swap16(x) bswap16(x) +#define fio_swap32(x) bswap32(x) +#define fio_swap64(x) bswap64(x) + +typedef cpuset_t os_cpu_mask_t; + +#define fio_cpu_clear(mask, cpu) (void) CPU_CLR((cpu), (mask)) +#define fio_cpu_set(mask, cpu) (void) CPU_SET((cpu), (mask)) +#define fio_cpu_isset(mask, cpu) (CPU_ISSET((cpu), (mask)) != 0) +#define fio_cpu_count(mask) CPU_COUNT((mask)) + +static inline int fio_cpuset_init(os_cpu_mask_t *mask) +{ + CPU_ZERO(mask); + return 0; +} + +static inline int fio_cpuset_exit(os_cpu_mask_t *mask) +{ + return 0; +} + +static inline int fio_setaffinity(int pid, os_cpu_mask_t cpumask) +{ + return cpuset_setaffinity(CPU_LEVEL_WHICH, CPU_WHICH_TID, pid, sizeof(cpumask), &cpumask); +} + +static inline int fio_getaffinity(int pid, os_cpu_mask_t *cpumask) +{ + return cpuset_getaffinity(CPU_LEVEL_WHICH, CPU_WHICH_PID, pid, sizeof(cpumask), cpumask); +} + +#define FIO_MAX_CPUS CPU_SETSIZE + +static inline int blockdev_size(struct fio_file *f, unsigned long long *bytes) +{ + off_t size; + + if (!ioctl(f->fd, DIOCGMEDIASIZE, &size)) { + *bytes = size; + return 0; + } + + *bytes = 0; + return errno; +} + +static inline int chardev_size(struct fio_file *f, unsigned long long *bytes) +{ + return blockdev_size(f, bytes); +} + +static inline int blockdev_invalidate_cache(struct 
fio_file *f) +{ + return ENOTSUP; +} + +static inline unsigned long long os_phys_mem(void) +{ + int mib[2] = { CTL_HW, HW_PHYSMEM }; + unsigned long long mem; + size_t len = sizeof(mem); + + sysctl(mib, 2, &mem, &len, NULL, 0); + return mem; +} + +static inline int gettid(void) +{ + long lwpid; + + thr_self(&lwpid); + return (int) lwpid; +} + +static inline unsigned long long get_fs_free_size(const char *path) +{ + unsigned long long ret; + struct statvfs s; + + if (statvfs(path, &s) < 0) + return -1ULL; + + ret = s.f_frsize; + ret *= (unsigned long long) s.f_bfree; + return ret; +} + +static inline int os_trim(struct fio_file *f, unsigned long long start, + unsigned long long len) +{ + off_t range[2]; + + range[0] = start; + range[1] = len; + + if (!ioctl(f->fd, DIOCGDELETE, range)) + return 0; + + return errno; +} + +#ifdef MADV_FREE +#define FIO_MADV_FREE MADV_FREE +#endif + +static inline int shm_attach_to_open_removed(void) +{ + int x; + size_t len = sizeof(x); + + if (sysctlbyname("kern.ipc.shm_allow_removed", &x, &len, NULL, 0) < 0) + return 0; + + return x > 0 ? 1 : 0; +} + +#endif diff --git a/os/os-hpux.h b/os/os-hpux.h new file mode 100644 index 0000000..c1dafe4 --- /dev/null +++ b/os/os-hpux.h @@ -0,0 +1,91 @@ +#ifndef FIO_OS_HPUX_H +#define FIO_OS_HPUX_H + +#define FIO_OS os_hpux + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "../file.h" + +#define FIO_HAVE_ODIRECT +#define FIO_USE_GENERIC_INIT_RANDOM_STATE +#define FIO_HAVE_CHARDEV_SIZE + +#define OS_MAP_ANON MAP_ANONYMOUS +#define OS_MSG_DONTWAIT 0 + +#define POSIX_MADV_DONTNEED MADV_DONTNEED +#define POSIX_MADV_SEQUENTIAL MADV_SEQUENTIAL +#define POSIX_MADV_RANDOM MADV_RANDOM +#define posix_madvise(ptr, sz, hint) madvise((ptr), (sz), (hint)) + +#ifndef MSG_WAITALL +#define MSG_WAITALL 0x40 +#endif + +#define FIO_USE_GENERIC_SWAP + +#define FIO_OS_HAVE_AIOCB_TYPEDEF +typedef struct aiocb64 os_aiocb_t; + +static inline int blockdev_invalidate_cache(struct fio_file *f) +{ + return ENOTSUP; +} + +static inline int blockdev_size(struct fio_file *f, unsigned long long *bytes) +{ + disk_describe_type_ext_t dext; + + if (!ioctl(f->fd, DIOC_DESCRIBE_EXT, &dext)) { + unsigned long long lba; + + lba = ((uint64_t) dext.maxsva_high << 32) | dext.maxsva_low; + *bytes = lba * dext.lgblksz; + return 0; + } + + *bytes = 0; + return errno; +} + +static inline int chardev_size(struct fio_file *f, unsigned long long *bytes) +{ + return blockdev_size(f, bytes); +} + +static inline unsigned long long os_phys_mem(void) +{ + unsigned long long ret; + struct pst_static pst; + union pstun pu; + + pu.pst_static = &pst; + if (pstat(PSTAT_STATIC, pu, sizeof(pst), 0, 0) == -1) + return 0; + + ret = pst.physical_memory; + ret *= pst.page_size; + return ret; +} + +#define FIO_HAVE_CPU_ONLINE_SYSCONF + +static inline unsigned int cpus_online(void) +{ + return mpctl(MPC_GETNUMSPUS, 0, NULL); +} + +#endif diff --git a/os/os-linux-syscall.h b/os/os-linux-syscall.h new file mode 100644 index 0000000..c399b2f --- /dev/null +++ b/os/os-linux-syscall.h @@ -0,0 +1,277 @@ +#ifndef FIO_OS_LINUX_SYSCALL_H +#define FIO_OS_LINUX_SYSCALL_H + +#include "../arch/arch.h" + +/* Linux syscalls for x86 */ +#if defined(ARCH_X86_H) +#ifndef __NR_ioprio_set +#define __NR_ioprio_set 289 +#define __NR_ioprio_get 290 +#endif + +#ifndef __NR_fadvise64 +#define __NR_fadvise64 250 +#endif + +#ifndef __NR_sys_splice +#define __NR_sys_splice 313 +#define __NR_sys_tee 315 +#define 
__NR_sys_vmsplice 316 +#endif + +#ifndef __NR_preadv2 +#define __NR_preadv2 378 +#endif +#ifndef __NR_pwritev2 +#define __NR_pwritev2 379 +#endif + +/* Linux syscalls for x86_64 */ +#elif defined(ARCH_X86_64_H) +#ifndef __NR_ioprio_set +#define __NR_ioprio_set 251 +#define __NR_ioprio_get 252 +#endif + +#ifndef __NR_fadvise64 +#define __NR_fadvise64 221 +#endif + +#ifndef __NR_sys_splice +#define __NR_sys_splice 275 +#define __NR_sys_tee 276 +#define __NR_sys_vmsplice 278 +#endif + +#ifndef __NR_shmget +#define __NR_shmget 29 +#define __NR_shmat 30 +#define __NR_shmctl 31 +#define __NR_shmdt 67 +#endif + +#ifndef __NR_preadv2 +#define __NR_preadv2 327 +#endif +#ifndef __NR_pwritev2 +#define __NR_pwritev2 328 +#endif + +/* Linux syscalls for ppc */ +#elif defined(ARCH_PPC_H) +#ifndef __NR_ioprio_set +#define __NR_ioprio_set 273 +#define __NR_ioprio_get 274 +#endif + +#ifndef __NR_fadvise64 +#define __NR_fadvise64 233 +#endif + +#ifndef __NR_sys_splice +#define __NR_sys_splice 283 +#define __NR_sys_tee 284 +#define __NR_sys_vmsplice 285 +#endif + +/* Linux syscalls for ia64 */ +#elif defined(ARCH_IA64_H) +#ifndef __NR_ioprio_set +#define __NR_ioprio_set 1274 +#define __NR_ioprio_get 1275 +#endif + +#ifndef __NR_fadvise64 +#define __NR_fadvise64 1234 +#endif + +#ifndef __NR_sys_splice +#define __NR_sys_splice 1297 +#define __NR_sys_tee 1301 +#define __NR_sys_vmsplice 1302 +#endif + +#ifndef __NR_preadv2 +#define __NR_preadv2 1348 +#endif +#ifndef __NR_pwritev2 +#define __NR_pwritev2 1349 +#endif + +/* Linux syscalls for alpha */ +#elif defined(ARCH_ALPHA_H) +#ifndef __NR_ioprio_set +#define __NR_ioprio_set 442 +#define __NR_ioprio_get 443 +#endif + +#ifndef __NR_fadvise64 +#define __NR_fadvise64 413 +#endif + +#ifndef __NR_sys_splice +#define __NR_sys_splice 468 +#define __NR_sys_tee 470 +#define __NR_sys_vmsplice 471 +#endif + +/* Linux syscalls for s390 */ +#elif defined(ARCH_S390_H) +#ifndef __NR_ioprio_set +#define __NR_ioprio_set 282 +#define __NR_ioprio_get 283 +#endif + +#ifndef __NR_fadvise64 +#define __NR_fadvise64 253 +#endif + +#ifndef __NR_sys_splice +#define __NR_sys_splice 306 +#define __NR_sys_tee 308 +#define __NR_sys_vmsplice 309 +#endif + +#ifndef __NR_preadv2 +#define __NR_preadv2 376 +#endif +#ifndef __NR_pwritev2 +#define __NR_pwritev2 377 +#endif + +/* Linux syscalls for sparc */ +#elif defined(ARCH_SPARC_H) +#ifndef __NR_ioprio_set +#define __NR_ioprio_set 196 +#define __NR_ioprio_get 218 +#endif + +#ifndef __NR_fadvise64 +#define __NR_fadvise64 209 +#endif + +#ifndef __NR_sys_splice +#define __NR_sys_splice 232 +#define __NR_sys_tee 280 +#define __NR_sys_vmsplice 25 +#endif + +#ifndef __NR_preadv2 +#define __NR_preadv2 358 +#endif +#ifndef __NR_pwritev2 +#define __NR_pwritev2 359 +#endif + +/* Linux syscalls for sparc64 */ +#elif defined(ARCH_SPARC64_H) +#ifndef __NR_ioprio_set +#define __NR_ioprio_set 196 +#define __NR_ioprio_get 218 +#endif + +#ifndef __NR_fadvise64 +#define __NR_fadvise64 209 +#endif + +#ifndef __NR_sys_splice +#define __NR_sys_splice 232 +#define __NR_sys_tee 280 +#define __NR_sys_vmsplice 25 +#endif + +#ifndef __NR_preadv2 +#define __NR_preadv2 358 +#endif +#ifndef __NR_pwritev2 +#define __NR_pwritev2 359 +#endif + +/* Linux syscalls for arm */ +#elif defined(ARCH_ARM_H) +#ifndef __NR_ioprio_set +#define __NR_ioprio_set 314 +#define __NR_ioprio_get 315 +#endif + +#ifndef __NR_fadvise64 +#define __NR_fadvise64 270 +#endif + +#ifndef __NR_sys_splice +#define __NR_sys_splice 340 +#define __NR_sys_tee 342 +#define __NR_sys_vmsplice 343 +#endif + 
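/*
 * Sketch, not part of the patch: with the fallback numbers above, fio can
 * still reach a syscall even when the build host's libc headers predate
 * it; a kernel that lacks it answers -1 with errno set to ENOSYS. Assumes
 * <unistd.h> and <sys/syscall.h> for syscall(2).
 */
static inline int fio_demo_ioprio_get(void)
{
	/* 1 == IOPRIO_WHO_PROCESS, 0 == the calling thread */
	return syscall(__NR_ioprio_get, 1, 0);
}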
+#ifndef __NR_preadv2 +#define __NR_preadv2 392 +#endif +#ifndef __NR_pwritev2 +#define __NR_pwritev2 393 +#endif + +/* Linux syscalls for mips */ +#elif defined(ARCH_MIPS64_H) +#ifndef __NR_ioprio_set +#define __NR_ioprio_set 314 +#define __NR_ioprio_get 315 +#endif + +#ifndef __NR_fadvise64 +#define __NR_fadvise64 215 +#endif + +#ifndef __NR_sys_splice +#define __NR_sys_splice 263 +#define __NR_sys_tee 265 +#define __NR_sys_vmsplice 266 +#endif + +/* Linux syscalls for sh */ +#elif defined(ARCH_SH_H) +#ifndef __NR_ioprio_set +#define __NR_ioprio_set 288 +#define __NR_ioprio_get 289 +#endif + +#ifndef __NR_fadvise64 +#define __NR_fadvise64 250 +#endif + +#ifndef __NR_sys_splice +#define __NR_sys_splice 313 +#define __NR_sys_tee 315 +#define __NR_sys_vmsplice 316 +#endif + +/* Linux syscalls for hppa */ +#elif defined(ARCH_HPPA_H) +#ifndef __NR_ioprio_set +#define __NR_ioprio_set 267 +#define __NR_ioprio_get 268 +#endif + +#ifndef __NR_fadvise64 +#define __NR_fadvise64 236 +#endif + +#ifndef __NR_sys_splice +#define __NR_sys_splice 291 +#define __NR_sys_tee 293 +#define __NR_sys_vmsplice 294 +#endif + +/* Linux syscalls for aarch64 */ +#elif defined(ARCH_AARCH64_H) +#ifndef __NR_ioprio_set +#define __NR_ioprio_set 30 +#define __NR_ioprio_get 31 +#endif + +#else +#warning "Unknown architecture" +#endif + +#endif /* FIO_OS_LINUX_SYSCALL_H */ diff --git a/os/os-linux.h b/os/os-linux.h new file mode 100644 index 0000000..0f0bcc3 --- /dev/null +++ b/os/os-linux.h @@ -0,0 +1,429 @@ +#ifndef FIO_OS_LINUX_H +#define FIO_OS_LINUX_H + +#define FIO_OS os_linux + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#ifdef ARCH_HAVE_CRC_CRYPTO +#include +#ifndef HWCAP_CRC32 +#define HWCAP_CRC32 (1 << 7) +#endif /* HWCAP_CRC32 */ +#endif /* ARCH_HAVE_CRC_CRYPTO */ + +#include "./os-linux-syscall.h" +#include "../file.h" + +#ifndef __has_builtin // Optional of course. + #define __has_builtin(x) 0 // Compatibility with non-clang compilers. 
+#endif + +#define FIO_HAVE_CPU_AFFINITY +#define FIO_HAVE_DISK_UTIL +#define FIO_HAVE_SGIO +#define FIO_HAVE_IOPRIO +#define FIO_HAVE_IOPRIO_CLASS +#define FIO_HAVE_IOSCHED_SWITCH +#define FIO_HAVE_ODIRECT +#define FIO_HAVE_HUGETLB +#define FIO_HAVE_RAWBIND +#define FIO_HAVE_BLKTRACE +#define FIO_HAVE_CL_SIZE +#define FIO_HAVE_CGROUPS +#define FIO_HAVE_FS_STAT +#define FIO_HAVE_TRIM +#define FIO_HAVE_GETTID +#define FIO_USE_GENERIC_INIT_RANDOM_STATE +#define FIO_HAVE_PWRITEV2 +#define FIO_HAVE_SHM_ATTACH_REMOVED + +#ifdef MAP_HUGETLB +#define FIO_HAVE_MMAP_HUGE +#endif + +#define OS_MAP_ANON MAP_ANONYMOUS + +typedef cpu_set_t os_cpu_mask_t; + +#ifdef CONFIG_3ARG_AFFINITY +#define fio_setaffinity(pid, cpumask) \ + sched_setaffinity((pid), sizeof(cpumask), &(cpumask)) +#define fio_getaffinity(pid, ptr) \ + sched_getaffinity((pid), sizeof(cpu_set_t), (ptr)) +#elif defined(CONFIG_2ARG_AFFINITY) +#define fio_setaffinity(pid, cpumask) \ + sched_setaffinity((pid), &(cpumask)) +#define fio_getaffinity(pid, ptr) \ + sched_getaffinity((pid), (ptr)) +#endif + +#define fio_cpu_clear(mask, cpu) (void) CPU_CLR((cpu), (mask)) +#define fio_cpu_set(mask, cpu) (void) CPU_SET((cpu), (mask)) +#define fio_cpu_isset(mask, cpu) (CPU_ISSET((cpu), (mask)) != 0) +#define fio_cpu_count(mask) CPU_COUNT((mask)) + +static inline int fio_cpuset_init(os_cpu_mask_t *mask) +{ + CPU_ZERO(mask); + return 0; +} + +static inline int fio_cpuset_exit(os_cpu_mask_t *mask) +{ + return 0; +} + +#define FIO_MAX_CPUS CPU_SETSIZE + +enum { + IOPRIO_CLASS_NONE, + IOPRIO_CLASS_RT, + IOPRIO_CLASS_BE, + IOPRIO_CLASS_IDLE, +}; + +enum { + IOPRIO_WHO_PROCESS = 1, + IOPRIO_WHO_PGRP, + IOPRIO_WHO_USER, +}; + +#define IOPRIO_BITS 16 +#define IOPRIO_CLASS_SHIFT 13 + +#define IOPRIO_MIN_PRIO 0 /* highest priority */ +#define IOPRIO_MAX_PRIO 7 /* lowest priority */ + +#define IOPRIO_MIN_PRIO_CLASS 0 +#define IOPRIO_MAX_PRIO_CLASS 3 + +static inline int ioprio_set(int which, int who, int ioprio_class, int ioprio) +{ + /* + * If no class is set, assume BE + */ + if (!ioprio_class) + ioprio_class = IOPRIO_CLASS_BE; + + ioprio |= ioprio_class << IOPRIO_CLASS_SHIFT; + return syscall(__NR_ioprio_set, which, who, ioprio); +} + +#ifndef CONFIG_HAVE_GETTID +static inline int gettid(void) +{ + return syscall(__NR_gettid); +} +#endif + +#define SPLICE_DEF_SIZE (64*1024) + +#ifndef BLKGETSIZE64 +#define BLKGETSIZE64 _IOR(0x12,114,size_t) +#endif + +#ifndef BLKFLSBUF +#define BLKFLSBUF _IO(0x12,97) +#endif + +#ifndef BLKDISCARD +#define BLKDISCARD _IO(0x12,119) +#endif + +static inline int blockdev_invalidate_cache(struct fio_file *f) +{ + return ioctl(f->fd, BLKFLSBUF); +} + +static inline int blockdev_size(struct fio_file *f, unsigned long long *bytes) +{ + if (!ioctl(f->fd, BLKGETSIZE64, bytes)) + return 0; + + return errno; +} + +static inline unsigned long long os_phys_mem(void) +{ + long pagesize, pages; + + pagesize = sysconf(_SC_PAGESIZE); + pages = sysconf(_SC_PHYS_PAGES); + if (pages == -1 || pagesize == -1) + return 0; + + return (unsigned long long) pages * (unsigned long long) pagesize; +} + +static inline int fio_lookup_raw(dev_t dev, int *majdev, int *mindev) +{ + struct raw_config_request rq; + int fd; + + if (major(dev) != RAW_MAJOR) + return 1; + + /* + * we should be able to find /dev/rawctl or /dev/raw/rawctl + */ + fd = open("/dev/rawctl", O_RDONLY); + if (fd < 0) { + fd = open("/dev/raw/rawctl", O_RDONLY); + if (fd < 0) + return 1; + } + + rq.raw_minor = minor(dev); + if (ioctl(fd, RAW_GETBIND, &rq) < 0) { + close(fd); + return 1; + } + 
+ close(fd); + *majdev = rq.block_major; + *mindev = rq.block_minor; + return 0; +} + +#ifdef O_NOATIME +#define FIO_O_NOATIME O_NOATIME +#else +#define FIO_O_NOATIME 0 +#endif + +#ifdef O_ATOMIC +#define OS_O_ATOMIC O_ATOMIC +#else +#define OS_O_ATOMIC 040000000 +#endif + +#ifdef MADV_REMOVE +#define FIO_MADV_FREE MADV_REMOVE +#endif + +/* Check for GCC or Clang byte swap intrinsics */ +#if (__has_builtin(__builtin_bswap16) && __has_builtin(__builtin_bswap32) \ + && __has_builtin(__builtin_bswap64)) || (__GNUC__ > 4 \ + || (__GNUC__ == 4 && __GNUC_MINOR__ >= 8)) /* fio_swapN */ +#define fio_swap16(x) __builtin_bswap16(x) +#define fio_swap32(x) __builtin_bswap32(x) +#define fio_swap64(x) __builtin_bswap64(x) +#else +#include +#define fio_swap16(x) bswap_16(x) +#define fio_swap32(x) bswap_32(x) +#define fio_swap64(x) bswap_64(x) +#endif /* fio_swapN */ + +#define CACHE_LINE_FILE \ + "/sys/devices/system/cpu/cpu0/cache/index0/coherency_line_size" + +static inline int arch_cache_line_size(void) +{ + char size[32]; + int fd, ret; + + fd = open(CACHE_LINE_FILE, O_RDONLY); + if (fd < 0) + return -1; + + ret = read(fd, size, sizeof(size)); + + close(fd); + + if (ret <= 0) + return -1; + else + return atoi(size); +} + +#ifdef __powerpc64__ +#define FIO_HAVE_CPU_ONLINE_SYSCONF +static inline unsigned int cpus_online(void) +{ + return sysconf(_SC_NPROCESSORS_CONF); +} +#endif + +static inline unsigned long long get_fs_free_size(const char *path) +{ + unsigned long long ret; + struct statfs s; + + if (statfs(path, &s) < 0) + return -1ULL; + + ret = s.f_bsize; + ret *= (unsigned long long) s.f_bfree; + return ret; +} + +static inline int os_trim(struct fio_file *f, unsigned long long start, + unsigned long long len) +{ + uint64_t range[2]; + + range[0] = start; + range[1] = len; + + if (!ioctl(f->fd, BLKDISCARD, range)) + return 0; + + return errno; +} + +#ifdef CONFIG_SCHED_IDLE +static inline int fio_set_sched_idle(void) +{ + struct sched_param p = { .sched_priority = 0, }; + return sched_setscheduler(gettid(), SCHED_IDLE, &p); +} +#endif + +#ifndef F_GET_RW_HINT +#ifndef F_LINUX_SPECIFIC_BASE +#define F_LINUX_SPECIFIC_BASE 1024 +#endif +#define F_GET_RW_HINT (F_LINUX_SPECIFIC_BASE + 11) +#define F_SET_RW_HINT (F_LINUX_SPECIFIC_BASE + 12) +#define F_GET_FILE_RW_HINT (F_LINUX_SPECIFIC_BASE + 13) +#define F_SET_FILE_RW_HINT (F_LINUX_SPECIFIC_BASE + 14) +#endif + +#ifndef RWH_WRITE_LIFE_NONE +#define RWH_WRITE_LIFE_NOT_SET 0 +#define RWH_WRITE_LIFE_NONE 1 +#define RWH_WRITE_LIFE_SHORT 2 +#define RWH_WRITE_LIFE_MEDIUM 3 +#define RWH_WRITE_LIFE_LONG 4 +#define RWH_WRITE_LIFE_EXTREME 5 +#endif + +#define FIO_HAVE_WRITE_HINT + +#ifndef RWF_HIPRI +#define RWF_HIPRI 0x00000001 +#endif +#ifndef RWF_DSYNC +#define RWF_DSYNC 0x00000002 +#endif +#ifndef RWF_SYNC +#define RWF_SYNC 0x00000004 +#endif + +#ifndef RWF_UNCACHED +#define RWF_UNCACHED 0x00000040 +#endif + +#ifndef RWF_WRITE_LIFE_SHIFT +#define RWF_WRITE_LIFE_SHIFT 4 +#define RWF_WRITE_LIFE_SHORT (1 << RWF_WRITE_LIFE_SHIFT) +#define RWF_WRITE_LIFE_MEDIUM (2 << RWF_WRITE_LIFE_SHIFT) +#define RWF_WRITE_LIFE_LONG (3 << RWF_WRITE_LIFE_SHIFT) +#define RWF_WRITE_LIFE_EXTREME (4 << RWF_WRITE_LIFE_SHIFT) +#endif + +#ifndef CONFIG_PWRITEV2 +#ifdef __NR_preadv2 +static inline void make_pos_h_l(unsigned long *pos_h, unsigned long *pos_l, + off_t offset) +{ +#if BITS_PER_LONG == 64 + *pos_l = offset; + *pos_h = 0; +#else + *pos_l = offset & 0xffffffff; + *pos_h = ((uint64_t) offset) >> 32; +#endif +} +static inline ssize_t preadv2(int fd, const struct iovec *iov, 
int iovcnt, + off_t offset, unsigned int flags) +{ + unsigned long pos_l, pos_h; + + make_pos_h_l(&pos_h, &pos_l, offset); + return syscall(__NR_preadv2, fd, iov, iovcnt, pos_l, pos_h, flags); +} +static inline ssize_t pwritev2(int fd, const struct iovec *iov, int iovcnt, + off_t offset, unsigned int flags) +{ + unsigned long pos_l, pos_h; + + make_pos_h_l(&pos_h, &pos_l, offset); + return syscall(__NR_pwritev2, fd, iov, iovcnt, pos_l, pos_h, flags); +} +#else +static inline ssize_t preadv2(int fd, const struct iovec *iov, int iovcnt, + off_t offset, unsigned int flags) +{ + errno = ENOSYS; + return -1; +} +static inline ssize_t pwritev2(int fd, const struct iovec *iov, int iovcnt, + off_t offset, unsigned int flags) +{ + errno = ENOSYS; + return -1; +} +#endif /* __NR_preadv2 */ +#endif /* CONFIG_PWRITEV2 */ + +static inline int shm_attach_to_open_removed(void) +{ + return 1; +} + +#ifdef CONFIG_LINUX_FALLOCATE +#define FIO_HAVE_NATIVE_FALLOCATE +static inline bool fio_fallocate(struct fio_file *f, uint64_t offset, + uint64_t len) +{ + int ret; + ret = fallocate(f->fd, 0, offset, len); + if (ret == 0) + return true; + + /* Work around buggy old glibc versions... */ + if (ret > 0) + errno = ret; + + return false; +} +#endif + +#define FIO_HAVE_CPU_HAS +static inline bool os_cpu_has(cpu_features feature) +{ + bool have_feature; + unsigned long fio_unused hwcap; + + switch (feature) { +#ifdef ARCH_HAVE_CRC_CRYPTO + case CPU_ARM64_CRC32C: + hwcap = getauxval(AT_HWCAP); + have_feature = (hwcap & HWCAP_CRC32) != 0; + break; +#endif + default: + have_feature = false; + } + + return have_feature; +} + +#endif diff --git a/os/os-mac.h b/os/os-mac.h new file mode 100644 index 0000000..2852ac6 --- /dev/null +++ b/os/os-mac.h @@ -0,0 +1,109 @@ +#ifndef FIO_OS_APPLE_H +#define FIO_OS_APPLE_H + +#define FIO_OS os_mac + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "../file.h" + +#define FIO_USE_GENERIC_INIT_RANDOM_STATE +#define FIO_HAVE_GETTID +#define FIO_HAVE_CHARDEV_SIZE +#define FIO_HAVE_NATIVE_FALLOCATE + +#define OS_MAP_ANON MAP_ANON + +#define fio_swap16(x) OSSwapInt16(x) +#define fio_swap32(x) OSSwapInt32(x) +#define fio_swap64(x) OSSwapInt64(x) + +/* + * OSX has a pitifully small shared memory segment by default, + * so default to a lower number of max jobs supported + */ +#define FIO_MAX_JOBS 128 + +#ifndef CONFIG_CLOCKID_T +typedef unsigned int clockid_t; +#endif + +#define FIO_OS_DIRECTIO +static inline int fio_set_odirect(struct fio_file *f) +{ + if (fcntl(f->fd, F_NOCACHE, 1) == -1) + return errno; + return 0; +} + +static inline int blockdev_size(struct fio_file *f, unsigned long long *bytes) +{ + uint32_t block_size; + uint64_t block_count; + + if (ioctl(f->fd, DKIOCGETBLOCKCOUNT, &block_count) == -1) + return errno; + if (ioctl(f->fd, DKIOCGETBLOCKSIZE, &block_size) == -1) + return errno; + + *bytes = block_size; + *bytes *= block_count; + return 0; +} + +static inline int chardev_size(struct fio_file *f, unsigned long long *bytes) +{ + /* + * Could be a raw block device, this is better than just assuming + * we can't get the size at all. 
+ */ + if (!blockdev_size(f, bytes)) + return 0; + + *bytes = -1ULL; + return 0; +} + +static inline int blockdev_invalidate_cache(struct fio_file *f) +{ + return ENOTSUP; +} + +static inline unsigned long long os_phys_mem(void) +{ + int mib[2] = { CTL_HW, HW_PHYSMEM }; + unsigned long long mem; + size_t len = sizeof(mem); + + sysctl(mib, 2, &mem, &len, NULL, 0); + return mem; +} + +#ifndef CONFIG_HAVE_GETTID +static inline int gettid(void) +{ + return mach_thread_self(); +} +#endif + +static inline bool fio_fallocate(struct fio_file *f, uint64_t offset, uint64_t len) +{ + fstore_t store = {F_ALLOCATEALL, F_PEOFPOSMODE, offset, len}; + if (fcntl(f->fd, F_PREALLOCATE, &store) != -1) { + if (ftruncate(f->fd, len) == 0) + return true; + } + + return false; +} + +#endif diff --git a/os/os-netbsd.h b/os/os-netbsd.h new file mode 100644 index 0000000..abc1d3c --- /dev/null +++ b/os/os-netbsd.h @@ -0,0 +1,90 @@ +#ifndef FIO_OS_NETBSD_H +#define FIO_OS_NETBSD_H + +#define FIO_OS os_netbsd + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* XXX hack to avoid confilcts between rbtree.h and */ +#undef rb_node +#undef rb_left +#undef rb_right + +#include "../file.h" + +#define FIO_HAVE_ODIRECT +#define FIO_USE_GENERIC_INIT_RANDOM_STATE +#define FIO_HAVE_FS_STAT +#define FIO_HAVE_GETTID + +#define OS_MAP_ANON MAP_ANON + +#ifndef PTHREAD_STACK_MIN +#define PTHREAD_STACK_MIN 4096 +#endif + +#define fio_swap16(x) bswap16(x) +#define fio_swap32(x) bswap32(x) +#define fio_swap64(x) bswap64(x) + +static inline int blockdev_size(struct fio_file *f, unsigned long long *bytes) +{ + struct disklabel dl; + + if (!ioctl(f->fd, DIOCGDINFO, &dl)) { + *bytes = ((unsigned long long)dl.d_secperunit) * dl.d_secsize; + return 0; + } + + *bytes = 0; + return errno; +} + +static inline int blockdev_invalidate_cache(struct fio_file *f) +{ + return ENOTSUP; +} + +static inline unsigned long long os_phys_mem(void) +{ + int mib[2] = { CTL_HW, HW_PHYSMEM64 }; + uint64_t mem; + size_t len = sizeof(mem); + + sysctl(mib, 2, &mem, &len, NULL, 0); + return mem; +} + +#ifndef CONFIG_HAVE_GETTID +static inline int gettid(void) +{ + return (int) _lwp_self(); +} +#endif + +static inline unsigned long long get_fs_free_size(const char *path) +{ + unsigned long long ret; + struct statvfs s; + + if (statvfs(path, &s) < 0) + return -1ULL; + + ret = s.f_frsize; + ret *= (unsigned long long) s.f_bfree; + return ret; +} + +#ifdef MADV_FREE +#define FIO_MADV_FREE MADV_FREE +#endif + +#endif diff --git a/os/os-openbsd.h b/os/os-openbsd.h new file mode 100644 index 0000000..994bf07 --- /dev/null +++ b/os/os-openbsd.h @@ -0,0 +1,120 @@ +#ifndef FIO_OS_OPENBSD_H +#define FIO_OS_OPENBSD_H + +#define FIO_OS os_openbsd + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* XXX hack to avoid conflicts between rbtree.h and */ +#undef RB_BLACK +#undef RB_RED +#undef RB_ROOT + +#include "../file.h" + +#define FIO_USE_GENERIC_INIT_RANDOM_STATE +#define FIO_HAVE_FS_STAT +#define FIO_HAVE_GETTID +#define FIO_HAVE_SHM_ATTACH_REMOVED + +#define OS_MAP_ANON MAP_ANON + +#ifndef PTHREAD_STACK_MIN +#define PTHREAD_STACK_MIN 4096 +#endif + +#define fio_swap16(x) swap16(x) +#define fio_swap32(x) swap32(x) +#define fio_swap64(x) swap64(x) + +static inline int blockdev_size(struct fio_file *f, unsigned long long *bytes) +{ + struct disklabel dl; + + if (!ioctl(f->fd, DIOCGDINFO, &dl)) { + *bytes = ((unsigned long long)dl.d_secperunit) * dl.d_secsize; + return 0; + } + + *bytes = 0; 
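+ /* DIOCGDINFO failed; report the ioctl's errno to the caller */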
+ return errno; +} + +static inline int blockdev_invalidate_cache(struct fio_file *f) +{ + return ENOTSUP; +} + +static inline unsigned long long os_phys_mem(void) +{ + int mib[2] = { CTL_HW, HW_PHYSMEM64 }; + uint64_t mem; + size_t len = sizeof(mem); + + sysctl(mib, 2, &mem, &len, NULL, 0); + return mem; +} + +#ifndef CONFIG_HAVE_GETTID +static inline int gettid(void) +{ + return (int)(intptr_t) pthread_self(); +} +#endif + +static inline unsigned long long get_fs_free_size(const char *path) +{ + unsigned long long ret; + struct statvfs s; + + if (statvfs(path, &s) < 0) + return -1ULL; + + ret = s.f_frsize; + ret *= (unsigned long long) s.f_bfree; + return ret; +} + +#ifdef MADV_FREE +#define FIO_MADV_FREE MADV_FREE +#endif + +static inline int shm_attach_to_open_removed(void) +{ + struct utsname uts; + int major, minor; + + if (uname(&uts) == -1) + return 0; + + /* + * Return 1 if >= OpenBSD 5.1 according to 97900ebf, + * assuming both major/minor versions are < 10. + */ + if (uts.release[0] > '9' || uts.release[0] < '0') + return 0; + if (uts.release[1] != '.') + return 0; + if (uts.release[2] > '9' || uts.release[2] < '0') + return 0; + + major = uts.release[0] - '0'; + minor = uts.release[2] - '0'; + + if (major > 5) + return 1; + if (major == 5 && minor >= 1) + return 1; + + return 0; +} + +#endif diff --git a/os/os-solaris.h b/os/os-solaris.h new file mode 100644 index 0000000..f1966f4 --- /dev/null +++ b/os/os-solaris.h @@ -0,0 +1,183 @@ +#ifndef FIO_OS_SOLARIS_H +#define FIO_OS_SOLARIS_H + +#define FIO_OS os_solaris + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "../file.h" +#include "../lib/types.h" + +#define FIO_HAVE_CPU_AFFINITY +#define FIO_HAVE_CHARDEV_SIZE +#define FIO_USE_GENERIC_BDEV_SIZE +#define FIO_HAVE_FS_STAT +#define FIO_USE_GENERIC_INIT_RANDOM_STATE +#define FIO_HAVE_GETTID + +#define OS_MAP_ANON MAP_ANON +#define OS_RAND_MAX 2147483648UL + +#define fio_swap16(x) BSWAP_16(x) +#define fio_swap32(x) BSWAP_32(x) +#define fio_swap64(x) BSWAP_64(x) + +struct solaris_rand_seed { + unsigned short r[3]; +}; + +#ifndef POSIX_MADV_SEQUENTIAL +#define posix_madvise madvise +#define POSIX_MADV_SEQUENTIAL MADV_SEQUENTIAL +#define POSIX_MADV_DONTNEED MADV_DONTNEED +#define POSIX_MADV_RANDOM MADV_RANDOM +#endif + +#define os_ctime_r(x, y, z) ctime_r((x), (y), (z)) +#define FIO_OS_HAS_CTIME_R + +typedef psetid_t os_cpu_mask_t; + +static inline int chardev_size(struct fio_file *f, unsigned long long *bytes) +{ + struct dk_minfo info; + + *bytes = 0; + + if (ioctl(f->fd, DKIOCGMEDIAINFO, &info) < 0) + return errno; + + *bytes = info.dki_lbsize * info.dki_capacity; + return 0; +} + +static inline int blockdev_invalidate_cache(struct fio_file *f) +{ + return ENOTSUP; +} + +static inline unsigned long long os_phys_mem(void) +{ + long pagesize, pages; + + pagesize = sysconf(_SC_PAGESIZE); + pages = sysconf(_SC_PHYS_PAGES); + if (pages == -1 || pagesize == -1) + return 0; + + return (unsigned long long) pages * (unsigned long long) pagesize; +} + +static inline unsigned long long get_fs_free_size(const char *path) +{ + unsigned long long ret; + struct statvfs s; + + if (statvfs(path, &s) < 0) + return -1ULL; + + ret = s.f_frsize; + ret *= (unsigned long long) s.f_bfree; + return ret; +} + +#define FIO_OS_DIRECTIO +extern int directio(int, int); +static inline int fio_set_odirect(struct fio_file *f) +{ + if (directio(f->fd, DIRECTIO_ON) < 0) + return errno; + + return 0; +} + +/* + * pset binding hooks for fio + 
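+ *
+ * Solaris expresses affinity through processor sets rather than per-CPU
+ * bit masks, so os_cpu_mask_t is a psetid_t and the mask operations
+ * below map onto pset_*() calls. An illustrative sequence (sketch only):
+ *
+ *	os_cpu_mask_t mask;
+ *	fio_cpuset_init(&mask);			(pset_create)
+ *	fio_cpu_set(&mask, 3);			(pset_assign CPU 3 to the set)
+ *	fio_setaffinity(gettid(), mask);	(pset_bind via P_LWPID)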
*/ +#define fio_setaffinity(pid, cpumask) \ + pset_bind((cpumask), P_LWPID, (pid), NULL) +#define fio_getaffinity(pid, ptr) ({ 0; }) + +#define fio_cpu_clear(mask, cpu) pset_assign(PS_NONE, (cpu), NULL) +#define fio_cpu_set(mask, cpu) pset_assign(*(mask), (cpu), NULL) + +static inline bool fio_cpu_isset(os_cpu_mask_t *mask, int cpu) +{ + const unsigned int max_cpus = sysconf(_SC_NPROCESSORS_ONLN); + unsigned int num_cpus; + processorid_t *cpus; + bool ret; + int i; + + cpus = malloc(sizeof(*cpus) * max_cpus); + + if (pset_info(*mask, NULL, &num_cpus, cpus) < 0) { + free(cpus); + return false; + } + + ret = false; + for (i = 0; i < num_cpus; i++) { + if (cpus[i] == cpu) { + ret = true; + break; + } + } + + free(cpus); + return ret; +} + +static inline int fio_cpu_count(os_cpu_mask_t *mask) +{ + unsigned int num_cpus; + + if (pset_info(*mask, NULL, &num_cpus, NULL) < 0) + return 0; + + return num_cpus; +} + +static inline int fio_cpuset_init(os_cpu_mask_t *mask) +{ + if (pset_create(mask) < 0) + return -1; + + return 0; +} + +static inline int fio_cpuset_exit(os_cpu_mask_t *mask) +{ + if (pset_destroy(*mask) < 0) + return -1; + + return 0; +} + +#ifndef CONFIG_HAVE_GETTID +static inline int gettid(void) +{ + return pthread_self(); +} +#endif + +/* + * Should be enough, not aware of what (if any) restrictions Solaris has + */ +#define FIO_MAX_CPUS 16384 + +#ifdef MADV_FREE +#define FIO_MADV_FREE MADV_FREE +#endif + +#endif diff --git a/os/os-windows-7.h b/os/os-windows-7.h new file mode 100644 index 0000000..b8bd9e7 --- /dev/null +++ b/os/os-windows-7.h @@ -0,0 +1,7 @@ +#define FIO_MAX_CPUS 512 /* From Hyper-V 2016's max logical processors */ +#define FIO_CPU_MASK_STRIDE 64 +#define FIO_CPU_MASK_ROWS (FIO_MAX_CPUS / FIO_CPU_MASK_STRIDE) + +typedef struct { + uint64_t row[FIO_CPU_MASK_ROWS]; +} os_cpu_mask_t; diff --git a/os/os-windows-xp.h b/os/os-windows-xp.h new file mode 100644 index 0000000..fbc23e2 --- /dev/null +++ b/os/os-windows-xp.h @@ -0,0 +1,3 @@ +#define FIO_MAX_CPUS MAXIMUM_PROCESSORS + +typedef DWORD_PTR os_cpu_mask_t; diff --git a/os/os-windows.h b/os/os-windows.h new file mode 100644 index 0000000..fa2955f --- /dev/null +++ b/os/os-windows.h @@ -0,0 +1,236 @@ +#ifndef FIO_OS_WINDOWS_H +#define FIO_OS_WINDOWS_H + +#define FIO_OS os_windows + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "../smalloc.h" +#include "../debug.h" +#include "../file.h" +#include "../log.h" +#include "../lib/hweight.h" +#include "../oslib/strcasestr.h" +#include "../lib/types.h" + +#include "windows/posix.h" + +#ifndef PTHREAD_STACK_MIN +#define PTHREAD_STACK_MIN 65535 +#endif + +#define FIO_HAVE_ODIRECT +#define FIO_HAVE_CPU_AFFINITY +#define FIO_HAVE_CHARDEV_SIZE +#define FIO_HAVE_GETTID +#define FIO_EMULATED_MKDIR_TWO + +#define FIO_PREFERRED_ENGINE "windowsaio" +#define FIO_PREFERRED_CLOCK_SOURCE CS_CGETTIME +#define FIO_OS_PATH_SEPARATOR '\\' + +#define OS_MAP_ANON MAP_ANON + +#define fio_swap16(x) _byteswap_ushort(x) +#define fio_swap32(x) _byteswap_ulong(x) +#define fio_swap64(x) _byteswap_uint64(x) + +#define _SC_PAGESIZE 0x1 +#define _SC_NPROCESSORS_ONLN 0x2 +#define _SC_PHYS_PAGES 0x4 + +#define SA_RESTART 0 +#define SIGPIPE 0 + +/* + * Windows doesn't have O_DIRECT or O_SYNC, so define them + * here so we can reject them at runtime when using the _open + * interface (windowsaio uses CreateFile) + */ +#define O_DIRECT 0x1000000 +#define O_SYNC 0x2000000 + +/* Windows doesn't support madvise, so any values will work */ +#define POSIX_MADV_DONTNEED 
0 +#define POSIX_MADV_SEQUENTIAL 0 +#define POSIX_MADV_RANDOM 0 + +#define F_SETFL 0x1 +#define F_GETFL 0x2 +#define O_NONBLOCK FIONBIO + +/* Winsock doesn't support MSG_WAIT */ +#define OS_MSG_DONTWAIT 0 + +#ifndef S_ISSOCK +#define S_ISSOCK(x) 0 +#endif + +#define SIGCONT 0 +#define SIGUSR1 1 +#define SIGUSR2 2 + +typedef int sigset_t; +typedef int siginfo_t; + +struct sigaction +{ + void (*sa_handler)(int); + sigset_t sa_mask; + int sa_flags; + void* (*sa_sigaction)(int, siginfo_t *, void*); +}; + +long sysconf(int name); + +int kill(pid_t pid, int sig); +pid_t setsid(void); +int setgid(gid_t gid); +int setuid(uid_t uid); +int nice(int incr); +int sigaction(int sig, const struct sigaction *act, + struct sigaction *oact); +int fsync(int fildes); +int fork(void); +int fcntl(int fildes, int cmd, ...); +int fdatasync(int fildes); +int lstat(const char * path, struct stat * buf); +uid_t geteuid(void); +char* ctime_r(const time_t *t, char *buf); +int nanosleep(const struct timespec *rqtp, struct timespec *rmtp); +ssize_t pread(int fildes, void *buf, size_t nbyte, off_t offset); +ssize_t pwrite(int fildes, const void *buf, size_t nbyte, + off_t offset); + +static inline int blockdev_size(struct fio_file *f, unsigned long long *bytes) +{ + int rc = 0; + HANDLE hFile; + GET_LENGTH_INFORMATION info; + DWORD outBytes; + + if (f->hFile == NULL) { + hFile = CreateFile(f->file_name, GENERIC_READ, FILE_SHARE_READ | FILE_SHARE_WRITE, + NULL, OPEN_EXISTING, 0, NULL); + } else { + hFile = f->hFile; + } + + if (DeviceIoControl(hFile, IOCTL_DISK_GET_LENGTH_INFO, NULL, 0, &info, sizeof(info), &outBytes, NULL)) + *bytes = info.Length.QuadPart; + else + rc = EIO; + + /* If we were passed a POSIX fd, + * close the HANDLE we created via CreateFile */ + if (hFile != INVALID_HANDLE_VALUE && f->hFile == NULL) + CloseHandle(hFile); + + return rc; +} + +static inline int chardev_size(struct fio_file *f, unsigned long long *bytes) +{ + return blockdev_size(f, bytes); +} + +static inline int blockdev_invalidate_cache(struct fio_file *f) +{ + return ENOTSUP; +} + +static inline unsigned long long os_phys_mem(void) +{ + long pagesize, pages; + + pagesize = sysconf(_SC_PAGESIZE); + pages = sysconf(_SC_PHYS_PAGES); + if (pages == -1 || pagesize == -1) + return 0; + + return (unsigned long long) pages * (unsigned long long) pagesize; +} + +#ifndef CONFIG_HAVE_GETTID +static inline int gettid(void) +{ + return GetCurrentThreadId(); +} +#endif + +static inline int init_random_seeds(uint64_t *rand_seeds, int size) +{ + HCRYPTPROV hCryptProv; + + if (!CryptAcquireContext(&hCryptProv, NULL, NULL, PROV_RSA_FULL, CRYPT_VERIFYCONTEXT)) + { + errno = GetLastError(); + log_err("CryptAcquireContext() failed: error %d\n", errno); + return 1; + } + + if (!CryptGenRandom(hCryptProv, size, (BYTE*)rand_seeds)) { + errno = GetLastError(); + log_err("CryptGenRandom() failed, error %d\n", errno); + CryptReleaseContext(hCryptProv, 0); + return 1; + } + + CryptReleaseContext(hCryptProv, 0); + return 0; +} + +static inline int fio_set_sched_idle(void) +{ + /* SetThreadPriority returns nonzero for success */ + return (SetThreadPriority(GetCurrentThread(), THREAD_PRIORITY_IDLE))? 
0 : -1; +} + +static inline int fio_mkdir(const char *path, mode_t mode) { + DWORD dwAttr = GetFileAttributesA(path); + + if (dwAttr != INVALID_FILE_ATTRIBUTES && + (dwAttr & FILE_ATTRIBUTE_DIRECTORY)) { + errno = EEXIST; + return -1; + } + + if (CreateDirectoryA(path, NULL) == 0) { + /* Ignore errors if path is a device namespace */ + if (strcmp(path, "\\\\.") == 0) { + errno = EEXIST; + return -1; + } + errno = win_to_posix_error(GetLastError()); + return -1; + } + + return 0; +} + +#ifdef CONFIG_WINDOWS_XP +#include "os-windows-xp.h" +#else +#define FIO_HAVE_CPU_ONLINE_SYSCONF +unsigned int cpus_online(void); +#include "os-windows-7.h" +#endif + +int first_set_cpu(os_cpu_mask_t *cpumask); +int fio_setaffinity(int pid, os_cpu_mask_t cpumask); +int fio_cpuset_init(os_cpu_mask_t *mask); +int fio_getaffinity(int pid, os_cpu_mask_t *mask); +void fio_cpu_clear(os_cpu_mask_t *mask, int cpu); +void fio_cpu_set(os_cpu_mask_t *mask, int cpu); +int fio_cpu_isset(os_cpu_mask_t *mask, int cpu); +int fio_cpu_count(os_cpu_mask_t *mask); +int fio_cpuset_exit(os_cpu_mask_t *mask); + +#endif /* FIO_OS_WINDOWS_H */ diff --git a/os/os.h b/os/os.h new file mode 100644 index 0000000..9a280e5 --- /dev/null +++ b/os/os.h @@ -0,0 +1,414 @@ +#ifndef FIO_OS_H +#define FIO_OS_H + +#include +#include +#include +#include +#include +#include + +#include "../arch/arch.h" /* IWYU pragma: export */ +#include "../lib/types.h" + +enum { + os_linux = 1, + os_aix, + os_freebsd, + os_hpux, + os_mac, + os_netbsd, + os_openbsd, + os_solaris, + os_windows, + os_android, + os_dragonfly, + + os_nr, +}; + +typedef enum { + CPU_ARM64_CRC32C, +} cpu_features; + +/* IWYU pragma: begin_exports */ +#if defined(__ANDROID__) +#include "os-android.h" +#elif defined(__linux__) +#include "os-linux.h" +#elif defined(__FreeBSD__) +#include "os-freebsd.h" +#elif defined(__OpenBSD__) +#include "os-openbsd.h" +#elif defined(__NetBSD__) +#include "os-netbsd.h" +#elif defined(__sun__) +#include "os-solaris.h" +#elif defined(__APPLE__) +#include "os-mac.h" +#elif defined(_AIX) +#include "os-aix.h" +#elif defined(__hpux) +#include "os-hpux.h" +#elif defined(WIN32) +#include "os-windows.h" +#elif defined (__DragonFly__) +#include "os-dragonfly.h" +#else +#error "unsupported os" +#endif + +#ifdef CONFIG_POSIXAIO +#include +#ifndef FIO_OS_HAVE_AIOCB_TYPEDEF +typedef struct aiocb os_aiocb_t; +#endif +#endif + +#ifndef CONFIG_STRSEP +#include "../oslib/strsep.h" +#endif + +#ifndef CONFIG_STRLCAT +#include "../oslib/strlcat.h" +#endif +/* IWYU pragma: end_exports */ + +#ifdef MSG_DONTWAIT +#define OS_MSG_DONTWAIT MSG_DONTWAIT +#endif + +#ifndef POSIX_FADV_DONTNEED +#define POSIX_FADV_DONTNEED (0) +#define POSIX_FADV_SEQUENTIAL (0) +#define POSIX_FADV_RANDOM (0) +#define POSIX_FADV_NORMAL (0) +#endif + +#ifndef FIO_HAVE_CPU_AFFINITY +#define fio_cpu_clear(mask, cpu) do { } while (0) +typedef unsigned long os_cpu_mask_t; + +static inline int fio_setaffinity(int pid, os_cpu_mask_t cpumask) +{ + return 0; +} + +static inline int fio_getaffinity(int pid, os_cpu_mask_t *cpumask) +{ + return -1; +} + +static inline int fio_cpuset_exit(os_cpu_mask_t *mask) +{ + return -1; +} + +static inline int fio_cpus_split(os_cpu_mask_t *mask, unsigned int cpu_index) +{ + return 0; +} +#else +extern int fio_cpus_split(os_cpu_mask_t *mask, unsigned int cpu); +#endif + +#ifndef FIO_HAVE_IOPRIO +#define ioprio_set(which, who, prioclass, prio) (0) +#endif + +#ifndef FIO_HAVE_ODIRECT +#define OS_O_DIRECT 0 +#else +#define OS_O_DIRECT O_DIRECT +#endif + +#ifdef OS_O_ATOMIC +#define 
FIO_O_ATOMIC OS_O_ATOMIC +#else +#define FIO_O_ATOMIC 0 +#endif + +#ifndef FIO_HAVE_HUGETLB +#define SHM_HUGETLB 0 +#define MAP_HUGETLB 0 +#ifndef FIO_HUGE_PAGE +#define FIO_HUGE_PAGE 0 +#endif +#else +#ifndef FIO_HUGE_PAGE +#define FIO_HUGE_PAGE 4194304 +#endif +#endif + +#ifndef FIO_HAVE_MMAP_HUGE +#define MAP_HUGETLB 0 +#endif + +#ifndef FIO_O_NOATIME +#define FIO_O_NOATIME 0 +#endif + +#ifndef OS_RAND_MAX +#define OS_RAND_MAX RAND_MAX +#endif + +#ifndef FIO_HAVE_RAWBIND +#define fio_lookup_raw(dev, majdev, mindev) 1 +#endif + +#ifndef FIO_PREFERRED_ENGINE +#define FIO_PREFERRED_ENGINE "psync" +#endif + +#ifndef FIO_OS_PATH_SEPARATOR +#define FIO_OS_PATH_SEPARATOR '/' +#endif + +#ifndef FIO_PREFERRED_CLOCK_SOURCE +#ifdef CONFIG_CLOCK_GETTIME +#define FIO_PREFERRED_CLOCK_SOURCE CS_CGETTIME +#else +#define FIO_PREFERRED_CLOCK_SOURCE CS_GTOD +#endif +#endif + +#ifndef FIO_MAX_JOBS +#define FIO_MAX_JOBS 4096 +#endif + +#ifndef CONFIG_SOCKLEN_T +typedef unsigned int socklen_t; +#endif + +#ifndef FIO_OS_HAS_CTIME_R +#define os_ctime_r(x, y, z) (void) ctime_r((x), (y)) +#endif + +#ifdef FIO_USE_GENERIC_SWAP +static inline uint16_t fio_swap16(uint16_t val) +{ + return (val << 8) | (val >> 8); +} + +static inline uint32_t fio_swap32(uint32_t val) +{ + val = ((val & 0xff00ff00UL) >> 8) | ((val & 0x00ff00ffUL) << 8); + + return (val >> 16) | (val << 16); +} + +static inline uint64_t fio_swap64(uint64_t val) +{ + val = ((val & 0xff00ff00ff00ff00ULL) >> 8) | + ((val & 0x00ff00ff00ff00ffULL) << 8); + val = ((val & 0xffff0000ffff0000ULL) >> 16) | + ((val & 0x0000ffff0000ffffULL) << 16); + + return (val >> 32) | (val << 32); +} +#endif + +#ifndef FIO_HAVE_BYTEORDER_FUNCS +#ifdef CONFIG_LITTLE_ENDIAN +#define __be16_to_cpu(x) fio_swap16(x) +#define __be32_to_cpu(x) fio_swap32(x) +#define __be64_to_cpu(x) fio_swap64(x) +#define __le16_to_cpu(x) (x) +#define __le32_to_cpu(x) (x) +#define __le64_to_cpu(x) (x) +#define __cpu_to_be16(x) fio_swap16(x) +#define __cpu_to_be32(x) fio_swap32(x) +#define __cpu_to_be64(x) fio_swap64(x) +#define __cpu_to_le16(x) (x) +#define __cpu_to_le32(x) (x) +#define __cpu_to_le64(x) (x) +#else +#define __be16_to_cpu(x) (x) +#define __be32_to_cpu(x) (x) +#define __be64_to_cpu(x) (x) +#define __le16_to_cpu(x) fio_swap16(x) +#define __le32_to_cpu(x) fio_swap32(x) +#define __le64_to_cpu(x) fio_swap64(x) +#define __cpu_to_be16(x) (x) +#define __cpu_to_be32(x) (x) +#define __cpu_to_be64(x) (x) +#define __cpu_to_le16(x) fio_swap16(x) +#define __cpu_to_le32(x) fio_swap32(x) +#define __cpu_to_le64(x) fio_swap64(x) +#endif +#endif /* FIO_HAVE_BYTEORDER_FUNCS */ + +#ifdef FIO_INTERNAL +#define be16_to_cpu(val) ({ \ + typecheck(uint16_t, val); \ + __be16_to_cpu(val); \ +}) +#define be32_to_cpu(val) ({ \ + typecheck(uint32_t, val); \ + __be32_to_cpu(val); \ +}) +#define be64_to_cpu(val) ({ \ + typecheck(uint64_t, val); \ + __be64_to_cpu(val); \ +}) +#define le16_to_cpu(val) ({ \ + typecheck(uint16_t, val); \ + __le16_to_cpu(val); \ +}) +#define le32_to_cpu(val) ({ \ + typecheck(uint32_t, val); \ + __le32_to_cpu(val); \ +}) +#define le64_to_cpu(val) ({ \ + typecheck(uint64_t, val); \ + __le64_to_cpu(val); \ +}) +#endif + +#define cpu_to_be16(val) ({ \ + typecheck(uint16_t, val); \ + __cpu_to_be16(val); \ +}) +#define cpu_to_be32(val) ({ \ + typecheck(uint32_t, val); \ + __cpu_to_be32(val); \ +}) +#define cpu_to_be64(val) ({ \ + typecheck(uint64_t, val); \ + __cpu_to_be64(val); \ +}) +#define cpu_to_le16(val) ({ \ + typecheck(uint16_t, val); \ + __cpu_to_le16(val); \ +}) +#define 
cpu_to_le32(val) ({ \ + typecheck(uint32_t, val); \ + __cpu_to_le32(val); \ +}) +#define cpu_to_le64(val) ({ \ + typecheck(uint64_t, val); \ + __cpu_to_le64(val); \ +}) + +#define FIO_DEF_CL_SIZE 128 + +static inline int os_cache_line_size(void) +{ +#ifdef FIO_HAVE_CL_SIZE + int ret = arch_cache_line_size(); + + if (ret <= 0) + return FIO_DEF_CL_SIZE; + + return ret; +#else + return FIO_DEF_CL_SIZE; +#endif +} + +#ifdef FIO_USE_GENERIC_BDEV_SIZE +static inline int blockdev_size(struct fio_file *f, unsigned long long *bytes) +{ + off_t end; + + *bytes = 0; + + end = lseek(f->fd, 0, SEEK_END); + if (end < 0) + return errno; + + *bytes = end; + return 0; +} +#endif + +#ifdef FIO_USE_GENERIC_INIT_RANDOM_STATE +static inline int init_random_seeds(uint64_t *rand_seeds, int size) +{ + int fd; + + fd = open("/dev/urandom", O_RDONLY); + if (fd == -1) { + return 1; + } + + if (read(fd, rand_seeds, size) < size) { + close(fd); + return 1; + } + + close(fd); + return 0; +} +#endif + +#ifndef FIO_HAVE_FS_STAT +static inline unsigned long long get_fs_free_size(const char *path) +{ + return 0; +} +#endif + +#ifndef FIO_HAVE_CPU_ONLINE_SYSCONF +static inline unsigned int cpus_online(void) +{ + return sysconf(_SC_NPROCESSORS_ONLN); +} +#endif + +#ifndef CPU_COUNT +#ifdef FIO_HAVE_CPU_AFFINITY +static inline int CPU_COUNT(os_cpu_mask_t *mask) +{ + int max_cpus = cpus_online(); + int nr_cpus, i; + + for (i = 0, nr_cpus = 0; i < max_cpus; i++) + if (fio_cpu_isset(mask, i)) + nr_cpus++; + + return nr_cpus; +} +#endif +#endif + +#ifndef FIO_HAVE_GETTID +#ifndef CONFIG_HAVE_GETTID +static inline int gettid(void) +{ + return getpid(); +} +#endif +#endif + +#ifndef FIO_HAVE_SHM_ATTACH_REMOVED +static inline int shm_attach_to_open_removed(void) +{ + return 0; +} +#endif + +#ifndef FIO_HAVE_NATIVE_FALLOCATE +static inline bool fio_fallocate(struct fio_file *f, uint64_t offset, uint64_t len) +{ + errno = ENOSYS; + return false; +} +#endif + +#if defined(CONFIG_POSIX_FALLOCATE) || defined(FIO_HAVE_NATIVE_FALLOCATE) +# define FIO_HAVE_DEFAULT_FALLOCATE +#endif + +#ifndef FIO_HAVE_CPU_HAS +static inline bool os_cpu_has(cpu_features feature) +{ + return false; +} +#endif + +#ifndef FIO_EMULATED_MKDIR_TWO +# define fio_mkdir(path, mode) mkdir(path, mode) +#endif + +#endif /* FIO_OS_H */ diff --git a/os/windows/cpu-affinity.c b/os/windows/cpu-affinity.c new file mode 100644 index 0000000..69997b2 --- /dev/null +++ b/os/windows/cpu-affinity.c @@ -0,0 +1,444 @@ +#include "os/os.h" + +#include + +#ifdef CONFIG_WINDOWS_XP +int fio_setaffinity(int pid, os_cpu_mask_t cpumask) +{ + HANDLE h; + BOOL bSuccess = FALSE; + + h = OpenThread(THREAD_QUERY_INFORMATION | THREAD_SET_INFORMATION, TRUE, + pid); + if (h != NULL) { + bSuccess = SetThreadAffinityMask(h, cpumask); + if (!bSuccess) + log_err("fio_setaffinity failed: failed to set thread affinity (pid %d, mask %.16llx)\n", + pid, cpumask); + + CloseHandle(h); + } else { + log_err("fio_setaffinity failed: failed to get handle for pid %d\n", + pid); + } + + return bSuccess ? 
0 : -1; +} + +int fio_getaffinity(int pid, os_cpu_mask_t *mask) +{ + os_cpu_mask_t systemMask; + + HANDLE h = OpenProcess(PROCESS_QUERY_INFORMATION, TRUE, pid); + + if (h != NULL) { + GetProcessAffinityMask(h, mask, &systemMask); + CloseHandle(h); + } else { + log_err("fio_getaffinity failed: failed to get handle for pid %d\n", + pid); + return -1; + } + + return 0; +} + +void fio_cpu_clear(os_cpu_mask_t *mask, int cpu) +{ + *mask &= ~(1ULL << cpu); +} + +void fio_cpu_set(os_cpu_mask_t *mask, int cpu) +{ + *mask |= 1ULL << cpu; +} + +int fio_cpu_isset(os_cpu_mask_t *mask, int cpu) +{ + return (*mask & (1ULL << cpu)) != 0; +} + +int fio_cpu_count(os_cpu_mask_t *mask) +{ + return hweight64(*mask); +} + +int fio_cpuset_init(os_cpu_mask_t *mask) +{ + *mask = 0; + return 0; +} + +int fio_cpuset_exit(os_cpu_mask_t *mask) +{ + return 0; +} +#else /* CONFIG_WINDOWS_XP */ +/* Return all processors regardless of processor group */ +unsigned int cpus_online(void) +{ + return GetActiveProcessorCount(ALL_PROCESSOR_GROUPS); +} + +static void print_mask(os_cpu_mask_t *cpumask) +{ + for (int i = 0; i < FIO_CPU_MASK_ROWS; i++) + dprint(FD_PROCESS, "cpumask[%d]=%lu\n", i, cpumask->row[i]); +} + +/* Return the index of the least significant set CPU in cpumask or -1 if no + * CPUs are set */ +int first_set_cpu(os_cpu_mask_t *cpumask) +{ + int cpus_offset, mask_first_cpu, row; + + cpus_offset = 0; + row = 0; + mask_first_cpu = -1; + while (mask_first_cpu < 0 && row < FIO_CPU_MASK_ROWS) { + int row_first_cpu; + + row_first_cpu = __builtin_ffsll(cpumask->row[row]) - 1; + dprint(FD_PROCESS, "row_first_cpu=%d cpumask->row[%d]=%lu\n", + row_first_cpu, row, cpumask->row[row]); + if (row_first_cpu > -1) { + mask_first_cpu = cpus_offset + row_first_cpu; + dprint(FD_PROCESS, "first set cpu in mask is at index %d\n", + mask_first_cpu); + } else { + cpus_offset += FIO_CPU_MASK_STRIDE; + row++; + } + } + + return mask_first_cpu; +} + +/* Return the index of the most significant set CPU in cpumask or -1 if no + * CPUs are set */ +static int last_set_cpu(os_cpu_mask_t *cpumask) +{ + int cpus_offset, mask_last_cpu, row; + + cpus_offset = (FIO_CPU_MASK_ROWS - 1) * FIO_CPU_MASK_STRIDE; + row = FIO_CPU_MASK_ROWS - 1; + mask_last_cpu = -1; + while (mask_last_cpu < 0 && row >= 0) { + int row_last_cpu; + + if (cpumask->row[row] == 0) + row_last_cpu = -1; + else { + uint64_t tmp = cpumask->row[row]; + + row_last_cpu = 0; + while (tmp >>= 1) + row_last_cpu++; + } + + dprint(FD_PROCESS, "row_last_cpu=%d cpumask->row[%d]=%lu\n", + row_last_cpu, row, cpumask->row[row]); + if (row_last_cpu > -1) { + mask_last_cpu = cpus_offset + row_last_cpu; + dprint(FD_PROCESS, "last set cpu in mask is at index %d\n", + mask_last_cpu); + } else { + cpus_offset -= FIO_CPU_MASK_STRIDE; + row--; + } + } + + return mask_last_cpu; +} + +static int mask_to_group_mask(os_cpu_mask_t *cpumask, int *processor_group, uint64_t *affinity_mask) +{ + WORD online_groups, group, group_size; + bool found; + int cpus_offset, search_cpu, last_cpu, bit_offset, row, end; + uint64_t group_cpumask; + + search_cpu = first_set_cpu(cpumask); + if (search_cpu < 0) { + log_info("CPU mask doesn't set any CPUs\n"); + return 1; + } + + /* Find processor group first set CPU applies to */ + online_groups = GetActiveProcessorGroupCount(); + group = 0; + found = false; + cpus_offset = 0; + group_size = 0; + while (!found && group < online_groups) { + group_size = GetActiveProcessorCount(group); + dprint(FD_PROCESS, "group=%d group_start=%d group_size=%u search_cpu=%d\n", + group, 
cpus_offset, group_size, search_cpu); + if (cpus_offset + group_size > search_cpu) + found = true; + else { + cpus_offset += group_size; + group++; + } + } + + if (!found) { + log_err("CPU mask contains processor beyond last active processor index (%d)\n", + cpus_offset - 1); + print_mask(cpumask); + return 1; + } + + /* Check all the CPUs in the mask apply to ONLY that processor group */ + last_cpu = last_set_cpu(cpumask); + if (last_cpu > (cpus_offset + group_size - 1)) { + log_info("CPU mask cannot bind CPUs (e.g. %d, %d) that are " + "in different processor groups\n", search_cpu, + last_cpu); + print_mask(cpumask); + return 1; + } + + /* Extract the current processor group mask from the cpumask */ + row = cpus_offset / FIO_CPU_MASK_STRIDE; + bit_offset = cpus_offset % FIO_CPU_MASK_STRIDE; + group_cpumask = cpumask->row[row] >> bit_offset; + end = bit_offset + group_size; + if (end > FIO_CPU_MASK_STRIDE && (row + 1 < FIO_CPU_MASK_ROWS)) { + /* Some of the next row needs to be part of the mask */ + int needed, needed_shift, needed_mask_shift; + uint64_t needed_mask; + + needed = end - FIO_CPU_MASK_STRIDE; + needed_shift = FIO_CPU_MASK_STRIDE - bit_offset; + needed_mask_shift = FIO_CPU_MASK_STRIDE - needed; + needed_mask = (uint64_t)-1 >> needed_mask_shift; + dprint(FD_PROCESS, "bit_offset=%d end=%d needed=%d needed_shift=%d needed_mask=%ld needed_mask_shift=%d\n", bit_offset, end, needed, needed_shift, needed_mask, needed_mask_shift); + group_cpumask |= (cpumask->row[row + 1] & needed_mask) << needed_shift; + } + group_cpumask &= (uint64_t)-1 >> (FIO_CPU_MASK_STRIDE - group_size); + + /* Return group and mask */ + dprint(FD_PROCESS, "Returning group=%d group_mask=%lu\n", group, group_cpumask); + *processor_group = group; + *affinity_mask = group_cpumask; + + return 0; +} + +int fio_setaffinity(int pid, os_cpu_mask_t cpumask) +{ + HANDLE handle = NULL; + int group, ret; + uint64_t group_mask = 0; + GROUP_AFFINITY new_group_affinity; + + ret = -1; + + if (mask_to_group_mask(&cpumask, &group, &group_mask) != 0) + goto err; + + handle = OpenThread(THREAD_QUERY_INFORMATION | THREAD_SET_INFORMATION, + TRUE, pid); + if (handle == NULL) { + log_err("fio_setaffinity: failed to get handle for pid %d\n", pid); + goto err; + } + + /* Set group and mask. 
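+ * Windows expresses affinity as a (processor group, 64-bit mask
+ * relative to that group) pair; e.g. on a machine with two 48-CPU
+ * processor groups, global CPU 70 maps to group 1, bit 22.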
+ * Note: if the GROUP_AFFINITY struct's Reserved members are not + * initialised to 0 then SetThreadGroupAffinity will fail with + * GetLastError() set to ERROR_INVALID_PARAMETER */ + new_group_affinity.Mask = (KAFFINITY) group_mask; + new_group_affinity.Group = group; + new_group_affinity.Reserved[0] = 0; + new_group_affinity.Reserved[1] = 0; + new_group_affinity.Reserved[2] = 0; + if (SetThreadGroupAffinity(handle, &new_group_affinity, NULL) != 0) + ret = 0; + else { + log_err("fio_setaffinity: failed to set thread affinity " + "(pid %d, group %d, mask %" PRIx64 ", " + "GetLastError=%d)\n", pid, group, group_mask, + GetLastError()); + goto err; + } + +err: + if (handle) + CloseHandle(handle); + return ret; +} + +static void cpu_to_row_offset(int cpu, int *row, int *offset) +{ + *row = cpu / FIO_CPU_MASK_STRIDE; + *offset = cpu << FIO_CPU_MASK_STRIDE * *row; +} + +int fio_cpuset_init(os_cpu_mask_t *mask) +{ + for (int i = 0; i < FIO_CPU_MASK_ROWS; i++) + mask->row[i] = 0; + return 0; +} + +/* + * fio_getaffinity() should not be called once a fio_setaffinity() call has + * been made because fio_setaffinity() may put the process into multiple + * processor groups + */ +int fio_getaffinity(int pid, os_cpu_mask_t *mask) +{ + int ret; + int row, offset, end, group, group_size, group_start_cpu; + DWORD_PTR process_mask, system_mask; + HANDLE handle; + PUSHORT current_groups; + USHORT group_count; + WORD online_groups; + + ret = -1; + current_groups = NULL; + handle = OpenProcess(PROCESS_QUERY_INFORMATION, TRUE, pid); + if (handle == NULL) { + log_err("fio_getaffinity: failed to get handle for pid %d\n", + pid); + goto err; + } + + group_count = 16; + /* + * GetProcessGroupAffinity() seems to expect more than the natural + * alignment for a USHORT from the area pointed to by current_groups so + * arrange for maximum alignment by allocating via malloc() + */ + current_groups = malloc(group_count * sizeof(USHORT)); + if (!current_groups) { + log_err("fio_getaffinity: malloc failed\n"); + goto err; + } + if (!GetProcessGroupAffinity(handle, &group_count, current_groups)) { + log_err("%s: failed to get single group affinity for pid %d (%d)\n", + __func__, pid, GetLastError()); + goto err; + } + if (group_count > 1) { + log_err("%s: pid %d is associated with %d process groups\n", + __func__, pid, group_count); + goto err; + } + if (!GetProcessAffinityMask(handle, &process_mask, &system_mask)) { + log_err("%s: GetProcessAffinityMask() failed for pid\n", + __func__, pid); + goto err; + } + + /* Convert group and group relative mask to full CPU mask */ + online_groups = GetActiveProcessorGroupCount(); + if (online_groups == 0) { + log_err("fio_getaffinity: error retrieving total processor groups\n"); + goto err; + } + + group = 0; + group_start_cpu = 0; + group_size = 0; + dprint(FD_PROCESS, "current_groups=%d group_count=%d\n", + current_groups[0], group_count); + while (true) { + group_size = GetActiveProcessorCount(group); + if (group_size == 0) { + log_err("fio_getaffinity: error retrieving size of " + "processor group %d\n", group); + goto err; + } else if (group >= current_groups[0] || group >= online_groups) + break; + else { + group_start_cpu += group_size; + group++; + } + } + + if (group != current_groups[0]) { + log_err("fio_getaffinity: could not find processor group %d\n", + current_groups[0]); + goto err; + } + + dprint(FD_PROCESS, "group_start_cpu=%d, group size=%u\n", + group_start_cpu, group_size); + if ((group_start_cpu + group_size) >= FIO_MAX_CPUS) { + log_err("fio_getaffinity 
failed: current CPU affinity (group " + "%d, group_start_cpu %d, group_size %d) extends " + "beyond mask's highest CPU (%d)\n", group, + group_start_cpu, group_size, FIO_MAX_CPUS); + goto err; + } + + fio_cpuset_init(mask); + cpu_to_row_offset(group_start_cpu, &row, &offset); + mask->row[row] = process_mask; + mask->row[row] <<= offset; + end = offset + group_size; + if (end > FIO_CPU_MASK_STRIDE) { + int needed; + uint64_t needed_mask; + + needed = FIO_CPU_MASK_STRIDE - end; + needed_mask = (uint64_t)-1 >> (FIO_CPU_MASK_STRIDE - needed); + row++; + mask->row[row] = process_mask; + mask->row[row] >>= needed; + mask->row[row] &= needed_mask; + } + ret = 0; + +err: + if (handle) + CloseHandle(handle); + if (current_groups) + free(current_groups); + + return ret; +} + +void fio_cpu_clear(os_cpu_mask_t *mask, int cpu) +{ + int row, offset; + cpu_to_row_offset(cpu, &row, &offset); + + mask->row[row] &= ~(1ULL << offset); +} + +void fio_cpu_set(os_cpu_mask_t *mask, int cpu) +{ + int row, offset; + cpu_to_row_offset(cpu, &row, &offset); + + mask->row[row] |= 1ULL << offset; +} + +int fio_cpu_isset(os_cpu_mask_t *mask, int cpu) +{ + int row, offset; + cpu_to_row_offset(cpu, &row, &offset); + + return (mask->row[row] & (1ULL << offset)) != 0; +} + +int fio_cpu_count(os_cpu_mask_t *mask) +{ + int count = 0; + + for (int i = 0; i < FIO_CPU_MASK_ROWS; i++) + count += hweight64(mask->row[i]); + + return count; +} + +int fio_cpuset_exit(os_cpu_mask_t *mask) +{ + return 0; +} +#endif /* CONFIG_WINDOWS_XP */ diff --git a/os/windows/dobuild.cmd b/os/windows/dobuild.cmd new file mode 100644 index 0000000..ef12d82 --- /dev/null +++ b/os/windows/dobuild.cmd @@ -0,0 +1,34 @@ +@echo off +setlocal enabledelayedexpansion +set /a counter=1 +for /f "tokens=3" %%i in (..\..\FIO-VERSION-FILE) do ( + if "!counter!"=="1" set FIO_VERSION=%%i + set /a counter+=1 +) + +for /f "tokens=2 delims=-" %%i in ("%FIO_VERSION%") do ( + set FIO_VERSION_NUMBERS=%%i +) + +if not defined FIO_VERSION_NUMBERS ( + echo Could not find version numbers in the string '%FIO_VERSION%' + echo Expected version to follow format 'fio-^([0-9]+.[0-9.]+^)' + goto end +) + +if "%1"=="x86" set FIO_ARCH=x86 +if "%1"=="x64" set FIO_ARCH=x64 + +if not defined FIO_ARCH ( + echo Error: must specify the architecture. 
+ echo Usage: dobuild x86 + echo Usage: dobuild x64 + goto end +) + +"%WIX%bin\candle" -nologo -arch %FIO_ARCH% -dFioVersionNumbers="%FIO_VERSION_NUMBERS%" install.wxs +@if ERRORLEVEL 1 goto end +"%WIX%bin\candle" -nologo -arch %FIO_ARCH% examples.wxs +@if ERRORLEVEL 1 goto end +"%WIX%bin\light" -nologo -sice:ICE61 install.wixobj examples.wixobj -ext WixUIExtension -out %FIO_VERSION%-%FIO_ARCH%.msi +:end diff --git a/os/windows/eula.rtf b/os/windows/eula.rtf new file mode 100755 index 0000000..01472be Binary files /dev/null and b/os/windows/eula.rtf differ diff --git a/os/windows/examples.wxs b/os/windows/examples.wxs new file mode 100755 index 0000000..9308ba8 --- /dev/null +++ b/os/windows/examples.wxs @@ -0,0 +1,231 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/os/windows/install.wxs b/os/windows/install.wxs new file mode 100755 index 0000000..dcb8c92 --- /dev/null +++ b/os/windows/install.wxs @@ -0,0 +1,107 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + fio@vger.kernel.org + http://www.spinics.net/lists/fio/ + https://bluestop.org/fio/ + + + + + + + + diff --git a/os/windows/posix.c b/os/windows/posix.c new file mode 100644 index 0000000..e36453e --- /dev/null +++ b/os/windows/posix.c @@ -0,0 +1,1114 @@ +/* This file contains functions which implement those POSIX and Linux functions + * that MinGW and Microsoft don't provide. The implementations contain just enough + * functionality to support fio. 
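+ * They are not general-purpose replacements: fork(), setsid() and
+ * waitpid() simply fail with ENOSYS, and the rest implement only the
+ * subset of behaviour fio actually exercises.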
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "../os-windows.h" +#include "../../lib/hweight.h" + +extern unsigned long mtime_since_now(struct timespec *); +extern void fio_gettime(struct timespec *, void *); + +int win_to_posix_error(DWORD winerr) +{ + switch (winerr) { + case ERROR_SUCCESS: + return 0; + case ERROR_FILE_NOT_FOUND: + return ENOENT; + case ERROR_PATH_NOT_FOUND: + return ENOENT; + case ERROR_ACCESS_DENIED: + return EACCES; + case ERROR_INVALID_HANDLE: + return EBADF; + case ERROR_NOT_ENOUGH_MEMORY: + return ENOMEM; + case ERROR_INVALID_DATA: + return EINVAL; + case ERROR_OUTOFMEMORY: + return ENOMEM; + case ERROR_INVALID_DRIVE: + return ENODEV; + case ERROR_NOT_SAME_DEVICE: + return EXDEV; + case ERROR_WRITE_PROTECT: + return EROFS; + case ERROR_BAD_UNIT: + return ENODEV; + case ERROR_NOT_READY: + return EAGAIN; + case ERROR_SHARING_VIOLATION: + return EACCES; + case ERROR_LOCK_VIOLATION: + return EACCES; + case ERROR_SHARING_BUFFER_EXCEEDED: + return ENOLCK; + case ERROR_HANDLE_DISK_FULL: + return ENOSPC; + case ERROR_NOT_SUPPORTED: + return ENOSYS; + case ERROR_FILE_EXISTS: + return EEXIST; + case ERROR_CANNOT_MAKE: + return EPERM; + case ERROR_INVALID_PARAMETER: + return EINVAL; + case ERROR_NO_PROC_SLOTS: + return EAGAIN; + case ERROR_BROKEN_PIPE: + return EPIPE; + case ERROR_OPEN_FAILED: + return EIO; + case ERROR_NO_MORE_SEARCH_HANDLES: + return ENFILE; + case ERROR_CALL_NOT_IMPLEMENTED: + return ENOSYS; + case ERROR_INVALID_NAME: + return ENOENT; + case ERROR_WAIT_NO_CHILDREN: + return ECHILD; + case ERROR_CHILD_NOT_COMPLETE: + return EBUSY; + case ERROR_DIR_NOT_EMPTY: + return ENOTEMPTY; + case ERROR_SIGNAL_REFUSED: + return EIO; + case ERROR_BAD_PATHNAME: + return ENOENT; + case ERROR_SIGNAL_PENDING: + return EBUSY; + case ERROR_MAX_THRDS_REACHED: + return EAGAIN; + case ERROR_BUSY: + return EBUSY; + case ERROR_ALREADY_EXISTS: + return EEXIST; + case ERROR_NO_SIGNAL_SENT: + return EIO; + case ERROR_FILENAME_EXCED_RANGE: + return EINVAL; + case ERROR_META_EXPANSION_TOO_LONG: + return EINVAL; + case ERROR_INVALID_SIGNAL_NUMBER: + return EINVAL; + case ERROR_THREAD_1_INACTIVE: + return EINVAL; + case ERROR_BAD_PIPE: + return EINVAL; + case ERROR_PIPE_BUSY: + return EBUSY; + case ERROR_NO_DATA: + return EPIPE; + case ERROR_MORE_DATA: + return EAGAIN; + case ERROR_DIRECTORY: + return ENOTDIR; + case ERROR_PIPE_CONNECTED: + return EBUSY; + case ERROR_NO_TOKEN: + return EINVAL; + case ERROR_PROCESS_ABORTED: + return EFAULT; + case ERROR_BAD_DEVICE: + return ENODEV; + case ERROR_BAD_USERNAME: + return EINVAL; + case ERROR_OPEN_FILES: + return EAGAIN; + case ERROR_ACTIVE_CONNECTIONS: + return EAGAIN; + case ERROR_DEVICE_IN_USE: + return EBUSY; + case ERROR_INVALID_AT_INTERRUPT_TIME: + return EINTR; + case ERROR_IO_DEVICE: + return EIO; + case ERROR_NOT_OWNER: + return EPERM; + case ERROR_END_OF_MEDIA: + return ENOSPC; + case ERROR_EOM_OVERFLOW: + return ENOSPC; + case ERROR_BEGINNING_OF_MEDIA: + return ESPIPE; + case ERROR_SETMARK_DETECTED: + return ESPIPE; + case ERROR_NO_DATA_DETECTED: + return ENOSPC; + case ERROR_POSSIBLE_DEADLOCK: + return EDEADLOCK; + case ERROR_CRC: + return EIO; + case ERROR_NEGATIVE_SEEK: + return EINVAL; + case ERROR_DISK_FULL: + return ENOSPC; + case ERROR_NOACCESS: + return EFAULT; + case ERROR_FILE_INVALID: + return ENXIO; + default: + log_err("fio: windows error %d not handled\n", winerr); + 
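+ /* Anything unmapped was logged above; fold it into a generic EIO */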
return EIO; + } + + return winerr; +} + +int GetNumLogicalProcessors(void) +{ + SYSTEM_LOGICAL_PROCESSOR_INFORMATION *processor_info = NULL; + DWORD len = 0; + DWORD num_processors = 0; + DWORD error = 0; + DWORD i; + + while (!GetLogicalProcessorInformation(processor_info, &len)) { + error = GetLastError(); + if (error == ERROR_INSUFFICIENT_BUFFER) + processor_info = malloc(len); + else { + log_err("Error: GetLogicalProcessorInformation failed: %d\n", error); + return -1; + } + + if (processor_info == NULL) { + log_err("Error: failed to allocate memory for GetLogicalProcessorInformation"); + return -1; + } + } + + for (i = 0; i < len / sizeof(SYSTEM_LOGICAL_PROCESSOR_INFORMATION); i++) { + if (processor_info[i].Relationship == RelationProcessorCore) + num_processors += hweight64(processor_info[i].ProcessorMask); + } + + free(processor_info); + return num_processors; +} + +long sysconf(int name) +{ + long val = -1; + long val2 = -1; + SYSTEM_INFO sysInfo; + MEMORYSTATUSEX status; + + switch (name) { + case _SC_NPROCESSORS_ONLN: + val = GetNumLogicalProcessors(); + if (val == -1) + log_err("sysconf(_SC_NPROCESSORS_ONLN) failed\n"); + + break; + + case _SC_PAGESIZE: + GetSystemInfo(&sysInfo); + val = sysInfo.dwPageSize; + break; + + case _SC_PHYS_PAGES: + status.dwLength = sizeof(status); + val2 = sysconf(_SC_PAGESIZE); + if (GlobalMemoryStatusEx(&status) && val2 != -1) + val = status.ullTotalPhys / val2; + else + log_err("sysconf(_SC_PHYS_PAGES) failed\n"); + break; + default: + log_err("sysconf(%d) is not implemented\n", name); + break; + } + + return val; +} + +char *dl_error = NULL; + +int dlclose(void *handle) +{ + return !FreeLibrary((HMODULE)handle); +} + +void *dlopen(const char *file, int mode) +{ + HMODULE hMod; + + hMod = LoadLibrary(file); + if (hMod == INVALID_HANDLE_VALUE) + dl_error = (char*)"LoadLibrary failed"; + else + dl_error = NULL; + + return hMod; +} + +void *dlsym(void *handle, const char *name) +{ + FARPROC fnPtr; + + fnPtr = GetProcAddress((HMODULE)handle, name); + if (fnPtr == NULL) + dl_error = (char*)"GetProcAddress failed"; + else + dl_error = NULL; + + return fnPtr; +} + +char *dlerror(void) +{ + return dl_error; +} + +/* Copied from http://blogs.msdn.com/b/joshpoley/archive/2007/12/19/date-time-formats-and-conversions.aspx */ +void Time_tToSystemTime(time_t dosTime, SYSTEMTIME *systemTime) +{ + FILETIME utcFT; + LONGLONG jan1970; + SYSTEMTIME tempSystemTime; + + jan1970 = Int32x32To64(dosTime, 10000000) + 116444736000000000; + utcFT.dwLowDateTime = (DWORD)jan1970; + utcFT.dwHighDateTime = jan1970 >> 32; + + FileTimeToSystemTime((FILETIME*)&utcFT, &tempSystemTime); + SystemTimeToTzSpecificLocalTime(NULL, &tempSystemTime, systemTime); +} + +char *ctime_r(const time_t *t, char *buf) +{ + SYSTEMTIME systime; + const char * const dayOfWeek[] = { "Sun", "Mon", "Tue", "Wed", "Thu", "Fri", "Sat" }; + const char * const monthOfYear[] = { "Jan", "Feb", "Mar", "Apr", "May", "Jun", "Jul", "Aug", "Sep", "Oct", "Nov", "Dec" }; + + Time_tToSystemTime(*t, &systime); + + /* + * We don't know how long `buf` is, but assume it's rounded up from + * the minimum of 25 to 32 + */ + snprintf(buf, 32, "%s %s %d %02d:%02d:%02d %04d\n", + dayOfWeek[systime.wDayOfWeek % 7], + monthOfYear[(systime.wMonth - 1) % 12], + systime.wDay, systime.wHour, systime.wMinute, + systime.wSecond, systime.wYear); + return buf; +} + +int gettimeofday(struct timeval *restrict tp, void *restrict tzp) +{ + FILETIME fileTime; + uint64_t unix_time, windows_time; + const uint64_t 
MILLISECONDS_BETWEEN_1601_AND_1970 = 11644473600000; + + /* Ignore the timezone parameter */ + (void)tzp; + + /* + * Windows time is stored as the number 100 ns intervals since January 1 1601. + * Conversion details from http://www.informit.com/articles/article.aspx?p=102236&seqNum=3 + * Its precision is 100 ns but accuracy is only one clock tick, or normally around 15 ms. + */ + GetSystemTimeAsFileTime(&fileTime); + windows_time = ((uint64_t)fileTime.dwHighDateTime << 32) + fileTime.dwLowDateTime; + /* Divide by 10,000 to convert to ms and subtract the time between 1601 and 1970 */ + unix_time = (((windows_time)/10000) - MILLISECONDS_BETWEEN_1601_AND_1970); + /* unix_time is now the number of milliseconds since 1970 (the Unix epoch) */ + tp->tv_sec = unix_time / 1000; + tp->tv_usec = (unix_time % 1000) * 1000; + return 0; +} + +int sigaction(int sig, const struct sigaction *act, struct sigaction *oact) +{ + int rc = 0; + void (*prev_handler)(int); + + prev_handler = signal(sig, act->sa_handler); + if (oact != NULL) + oact->sa_handler = prev_handler; + + if (prev_handler == SIG_ERR) + rc = -1; + + return rc; +} + +int lstat(const char *path, struct stat *buf) +{ + return stat(path, buf); +} + +void *mmap(void *addr, size_t len, int prot, int flags, int fildes, off_t off) +{ + DWORD vaProt = 0; + DWORD mapAccess = 0; + DWORD lenlow; + DWORD lenhigh; + HANDLE hMap; + void* allocAddr = NULL; + + if (prot & PROT_NONE) + vaProt |= PAGE_NOACCESS; + + if ((prot & PROT_READ) && !(prot & PROT_WRITE)) { + vaProt |= PAGE_READONLY; + mapAccess = FILE_MAP_READ; + } + + if (prot & PROT_WRITE) { + vaProt |= PAGE_READWRITE; + mapAccess |= FILE_MAP_WRITE; + } + + lenlow = len & 0xFFFF; + lenhigh = len >> 16; + /* If the low DWORD is zero and the high DWORD is non-zero, `CreateFileMapping` + will return ERROR_INVALID_PARAMETER. To avoid this, set both to zero. */ + if (lenlow == 0) + lenhigh = 0; + + if (flags & MAP_ANON || flags & MAP_ANONYMOUS) { + allocAddr = VirtualAlloc(addr, len, MEM_COMMIT, vaProt); + if (allocAddr == NULL) + errno = win_to_posix_error(GetLastError()); + } else { + hMap = CreateFileMapping((HANDLE)_get_osfhandle(fildes), NULL, + vaProt, lenhigh, lenlow, NULL); + + if (hMap != NULL) + allocAddr = MapViewOfFile(hMap, mapAccess, off >> 16, + off & 0xFFFF, len); + if (hMap == NULL || allocAddr == NULL) + errno = win_to_posix_error(GetLastError()); + + } + + return allocAddr; +} + +int munmap(void *addr, size_t len) +{ + BOOL success; + + /* We may have allocated the memory with either MapViewOfFile or + VirtualAlloc. Therefore, try calling UnmapViewOfFile first, and if that + fails, call VirtualFree. */ + success = UnmapViewOfFile(addr); + + if (!success) + success = VirtualFree(addr, 0, MEM_RELEASE); + + return !success; +} + +int msync(void *addr, size_t len, int flags) +{ + return !FlushViewOfFile(addr, len); +} + +int fork(void) +{ + log_err("%s is not implemented\n", __func__); + errno = ENOSYS; + return -1; +} + +pid_t setsid(void) +{ + log_err("%s is not implemented\n", __func__); + errno = ENOSYS; + return -1; +} + +static HANDLE log_file = INVALID_HANDLE_VALUE; + +void openlog(const char *ident, int logopt, int facility) +{ + if (log_file != INVALID_HANDLE_VALUE) + return; + + log_file = CreateFileA("syslog.txt", GENERIC_WRITE, + FILE_SHARE_READ | FILE_SHARE_WRITE, NULL, + OPEN_ALWAYS, 0, NULL); +} + +void closelog(void) +{ + CloseHandle(log_file); + log_file = INVALID_HANDLE_VALUE; +} + +void syslog(int priority, const char *message, ... 
/* argument */) +{ + va_list v; + int len; + char *output; + DWORD bytes_written; + + if (log_file == INVALID_HANDLE_VALUE) { + log_file = CreateFileA("syslog.txt", GENERIC_WRITE, + FILE_SHARE_READ | FILE_SHARE_WRITE, + NULL, OPEN_ALWAYS, 0, NULL); + } + + if (log_file == INVALID_HANDLE_VALUE) { + log_err("syslog: failed to open log file\n"); + return; + } + + va_start(v, message); + len = _vscprintf(message, v); + output = malloc(len + sizeof(char)); + vsprintf(output, message, v); + WriteFile(log_file, output, len, &bytes_written, NULL); + va_end(v); + free(output); +} + +int kill(pid_t pid, int sig) +{ + errno = ESRCH; + return -1; +} + +/* + * This is assumed to be used only by the network code, + * and so doesn't try and handle any of the other cases + */ +int fcntl(int fildes, int cmd, ...) +{ + /* + * non-blocking mode doesn't work the same as in BSD sockets, + * so ignore it. + */ +#if 0 + va_list ap; + int val, opt, status; + + if (cmd == F_GETFL) + return 0; + else if (cmd != F_SETFL) { + errno = EINVAL; + return -1; + } + + va_start(ap, 1); + + opt = va_arg(ap, int); + if (opt & O_NONBLOCK) + val = 1; + else + val = 0; + + status = ioctlsocket((SOCKET)fildes, opt, &val); + + if (status == SOCKET_ERROR) { + errno = EINVAL; + val = -1; + } + + va_end(ap); + + return val; +#endif +return 0; +} + +/* + * Get the value of a local clock source. + * This implementation supports 2 clocks: CLOCK_MONOTONIC provides high-accuracy + * relative time, while CLOCK_REALTIME provides a low-accuracy wall time. + */ +int clock_gettime(clockid_t clock_id, struct timespec *tp) +{ + int rc = 0; + + if (clock_id == CLOCK_MONOTONIC) { + static LARGE_INTEGER freq = {{0,0}}; + LARGE_INTEGER counts; + uint64_t t; + + QueryPerformanceCounter(&counts); + if (freq.QuadPart == 0) + QueryPerformanceFrequency(&freq); + + tp->tv_sec = counts.QuadPart / freq.QuadPart; + /* Get the difference between the number of ns stored + * in 'tv_sec' and that stored in 'counts' */ + t = tp->tv_sec * freq.QuadPart; + t = counts.QuadPart - t; + /* 't' now contains the number of cycles since the last second. + * We want the number of nanoseconds, so multiply out by 1,000,000,000 + * and then divide by the frequency. */ + t *= 1000000000; + tp->tv_nsec = t / freq.QuadPart; + } else if (clock_id == CLOCK_REALTIME) { + /* clock_gettime(CLOCK_REALTIME,...) is just an alias for gettimeofday with a + * higher-precision field. 
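+ * The extra precision is nominal: the value still derives from
+ * GetSystemTimeAsFileTime(), whose accuracy is bounded by the system
+ * clock tick (normally around 15 ms, per the gettimeofday() note above).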
*/ + struct timeval tv; + gettimeofday(&tv, NULL); + tp->tv_sec = tv.tv_sec; + tp->tv_nsec = tv.tv_usec * 1000; + } else { + errno = EINVAL; + rc = -1; + } + + return rc; +} + +int mlock(const void * addr, size_t len) +{ + SIZE_T min, max; + BOOL success; + HANDLE process = GetCurrentProcess(); + + success = GetProcessWorkingSetSize(process, &min, &max); + if (!success) { + errno = win_to_posix_error(GetLastError()); + return -1; + } + + min += len; + max += len; + success = SetProcessWorkingSetSize(process, min, max); + if (!success) { + errno = win_to_posix_error(GetLastError()); + return -1; + } + + success = VirtualLock((LPVOID)addr, len); + if (!success) { + errno = win_to_posix_error(GetLastError()); + return -1; + } + + return 0; +} + +int munlock(const void * addr, size_t len) +{ + BOOL success = VirtualUnlock((LPVOID)addr, len); + + if (!success) { + errno = win_to_posix_error(GetLastError()); + return -1; + } + + return 0; +} + +pid_t waitpid(pid_t pid, int *stat_loc, int options) +{ + log_err("%s is not implemented\n", __func__); + errno = ENOSYS; + return -1; +} + +int usleep(useconds_t useconds) +{ + Sleep(useconds / 1000); + return 0; +} + +char *basename(char *path) +{ + static char name[MAX_PATH]; + int i; + + if (path == NULL || strlen(path) == 0) + return (char*)"."; + + i = strlen(path) - 1; + + while (path[i] != '\\' && path[i] != '/' && i >= 0) + i--; + + name[MAX_PATH - 1] = '\0'; + strncpy(name, path + i + 1, MAX_PATH - 1); + + return name; +} + +int fsync(int fildes) +{ + HANDLE hFile = (HANDLE)_get_osfhandle(fildes); + if (!FlushFileBuffers(hFile)) { + errno = win_to_posix_error(GetLastError()); + return -1; + } + + return 0; +} + +int nFileMappings = 0; +HANDLE fileMappings[1024]; + +int shmget(key_t key, size_t size, int shmflg) +{ + int mapid = -1; + uint32_t size_low = size & 0xFFFFFFFF; + uint32_t size_high = ((uint64_t)size) >> 32; + HANDLE hMapping; + + hMapping = CreateFileMapping(INVALID_HANDLE_VALUE, NULL, + PAGE_EXECUTE_READWRITE | SEC_RESERVE, + size_high, size_low, NULL); + if (hMapping != NULL) { + fileMappings[nFileMappings] = hMapping; + mapid = nFileMappings; + nFileMappings++; + } else + errno = ENOSYS; + + return mapid; +} + +void *shmat(int shmid, const void *shmaddr, int shmflg) +{ + void *mapAddr; + MEMORY_BASIC_INFORMATION memInfo; + + mapAddr = MapViewOfFile(fileMappings[shmid], FILE_MAP_ALL_ACCESS, 0, 0, 0); + if (mapAddr == NULL) { + errno = win_to_posix_error(GetLastError()); + return (void*)-1; + } + + if (VirtualQuery(mapAddr, &memInfo, sizeof(memInfo)) == 0) { + errno = win_to_posix_error(GetLastError()); + return (void*)-1; + } + + mapAddr = VirtualAlloc(mapAddr, memInfo.RegionSize, MEM_COMMIT, PAGE_READWRITE); + if (mapAddr == NULL) { + errno = win_to_posix_error(GetLastError()); + return (void*)-1; + } + + return mapAddr; +} + +int shmdt(const void *shmaddr) +{ + if (!UnmapViewOfFile(shmaddr)) { + errno = win_to_posix_error(GetLastError()); + return -1; + } + + return 0; +} + +int shmctl(int shmid, int cmd, struct shmid_ds *buf) +{ + if (cmd == IPC_RMID) { + fileMappings[shmid] = INVALID_HANDLE_VALUE; + return 0; + } + + log_err("%s is not implemented\n", __func__); + errno = ENOSYS; + return -1; +} + +int setuid(uid_t uid) +{ + log_err("%s is not implemented\n", __func__); + errno = ENOSYS; + return -1; +} + +int setgid(gid_t gid) +{ + log_err("%s is not implemented\n", __func__); + errno = ENOSYS; + return -1; +} + +int nice(int incr) +{ + DWORD prioclass = NORMAL_PRIORITY_CLASS; + + if (incr < -15) + prioclass = 
HIGH_PRIORITY_CLASS; + else if (incr < 0) + prioclass = ABOVE_NORMAL_PRIORITY_CLASS; + else if (incr > 15) + prioclass = IDLE_PRIORITY_CLASS; + else if (incr > 0) + prioclass = BELOW_NORMAL_PRIORITY_CLASS; + + if (!SetPriorityClass(GetCurrentProcess(), prioclass)) + log_err("fio: SetPriorityClass failed\n"); + + return 0; +} + +int getrusage(int who, struct rusage *r_usage) +{ + const uint64_t SECONDS_BETWEEN_1601_AND_1970 = 11644473600; + FILETIME cTime, eTime, kTime, uTime; + time_t time; + HANDLE h; + + memset(r_usage, 0, sizeof(*r_usage)); + + if (who == RUSAGE_SELF) { + h = GetCurrentProcess(); + GetProcessTimes(h, &cTime, &eTime, &kTime, &uTime); + } else if (who == RUSAGE_THREAD) { + h = GetCurrentThread(); + GetThreadTimes(h, &cTime, &eTime, &kTime, &uTime); + } else { + log_err("fio: getrusage %d is not implemented\n", who); + return -1; + } + + time = ((uint64_t)uTime.dwHighDateTime << 32) + uTime.dwLowDateTime; + /* Divide by 10,000,000 to get the number of seconds and move the epoch from + * 1601 to 1970 */ + time = (time_t)(((time)/10000000) - SECONDS_BETWEEN_1601_AND_1970); + r_usage->ru_utime.tv_sec = time; + /* getrusage() doesn't care about anything other than seconds, so set tv_usec to 0 */ + r_usage->ru_utime.tv_usec = 0; + time = ((uint64_t)kTime.dwHighDateTime << 32) + kTime.dwLowDateTime; + /* Divide by 10,000,000 to get the number of seconds and move the epoch from + * 1601 to 1970 */ + time = (time_t)(((time)/10000000) - SECONDS_BETWEEN_1601_AND_1970); + r_usage->ru_stime.tv_sec = time; + r_usage->ru_stime.tv_usec = 0; + return 0; +} + +int posix_madvise(void *addr, size_t len, int advice) +{ + return ENOSYS; +} + +int fdatasync(int fildes) +{ + return fsync(fildes); +} + +ssize_t pwrite(int fildes, const void *buf, size_t nbyte, + off_t offset) +{ + int64_t pos = _telli64(fildes); + ssize_t len = _write(fildes, buf, nbyte); + + _lseeki64(fildes, pos, SEEK_SET); + return len; +} + +ssize_t pread(int fildes, void *buf, size_t nbyte, off_t offset) +{ + int64_t pos = _telli64(fildes); + ssize_t len = read(fildes, buf, nbyte); + + _lseeki64(fildes, pos, SEEK_SET); + return len; +} + +ssize_t readv(int fildes, const struct iovec *iov, int iovcnt) +{ + log_err("%s is not implemented\n", __func__); + errno = ENOSYS; + return -1; +} + +ssize_t writev(int fildes, const struct iovec *iov, int iovcnt) +{ + int i; + DWORD bytes_written = 0; + + for (i = 0; i < iovcnt; i++) { + int len; + + len = send((SOCKET)fildes, iov[i].iov_base, iov[i].iov_len, 0); + if (len == SOCKET_ERROR) { + DWORD err = GetLastError(); + errno = win_to_posix_error(err); + bytes_written = -1; + break; + } + bytes_written += len; + } + + return bytes_written; +} + +long long strtoll(const char *restrict str, char **restrict endptr, int base) +{ + return _strtoi64(str, endptr, base); +} + +int poll(struct pollfd fds[], nfds_t nfds, int timeout) +{ + struct timeval tv; + struct timeval *to = NULL; + fd_set readfds, writefds, exceptfds; + int i; + int rc; + + if (timeout != -1) { + to = &tv; + to->tv_sec = timeout / 1000; + to->tv_usec = (timeout % 1000) * 1000; + } + + FD_ZERO(&readfds); + FD_ZERO(&writefds); + FD_ZERO(&exceptfds); + + for (i = 0; i < nfds; i++) { + if (fds[i].fd < 0) { + fds[i].revents = 0; + continue; + } + + if (fds[i].events & POLLIN) + FD_SET(fds[i].fd, &readfds); + + if (fds[i].events & POLLOUT) + FD_SET(fds[i].fd, &writefds); + + FD_SET(fds[i].fd, &exceptfds); + } + rc = select(nfds, &readfds, &writefds, &exceptfds, to); + + if (rc != SOCKET_ERROR) { + for (i = 0; i < nfds; i++) { + 
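+ /* Translate the select() result sets back into poll()-style revents */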
if (fds[i].fd < 0) + continue; + + if ((fds[i].events & POLLIN) && FD_ISSET(fds[i].fd, &readfds)) + fds[i].revents |= POLLIN; + + if ((fds[i].events & POLLOUT) && FD_ISSET(fds[i].fd, &writefds)) + fds[i].revents |= POLLOUT; + + if (FD_ISSET(fds[i].fd, &exceptfds)) + fds[i].revents |= POLLHUP; + } + } + return rc; +} + +int nanosleep(const struct timespec *rqtp, struct timespec *rmtp) +{ + struct timespec tv; + DWORD ms_remaining; + DWORD ms_total = (rqtp->tv_sec * 1000) + (rqtp->tv_nsec / 1000000.0); + + if (ms_total == 0) + ms_total = 1; + + ms_remaining = ms_total; + + /* Since Sleep() can sleep for less than the requested time, add a loop to + ensure we only return after the requested length of time has elapsed */ + do { + fio_gettime(&tv, NULL); + Sleep(ms_remaining); + ms_remaining = ms_total - mtime_since_now(&tv); + } while (ms_remaining > 0 && ms_remaining < ms_total); + + /* this implementation will never sleep for less than the requested time */ + if (rmtp != NULL) { + rmtp->tv_sec = 0; + rmtp->tv_nsec = 0; + } + + return 0; +} + +DIR *opendir(const char *dirname) +{ + struct dirent_ctx *dc = NULL; + HANDLE file; + + /* See if we can open it. If not, we'll return an error here */ + file = CreateFileA(dirname, 0, FILE_SHARE_READ | FILE_SHARE_WRITE, NULL, + OPEN_EXISTING, FILE_FLAG_BACKUP_SEMANTICS, NULL); + if (file != INVALID_HANDLE_VALUE) { + CloseHandle(file); + dc = malloc(sizeof(struct dirent_ctx)); + snprintf(dc->dirname, sizeof(dc->dirname), "%s", dirname); + dc->find_handle = INVALID_HANDLE_VALUE; + } else { + DWORD error = GetLastError(); + if (error == ERROR_FILE_NOT_FOUND) + errno = ENOENT; + + else if (error == ERROR_PATH_NOT_FOUND) + errno = ENOTDIR; + else if (error == ERROR_TOO_MANY_OPEN_FILES) + errno = ENFILE; + else if (error == ERROR_ACCESS_DENIED) + errno = EACCES; + else + errno = error; + } + + return dc; +} + +int closedir(DIR *dirp) +{ + if (dirp != NULL && dirp->find_handle != INVALID_HANDLE_VALUE) + FindClose(dirp->find_handle); + + free(dirp); + return 0; +} + +struct dirent *readdir(DIR *dirp) +{ + static struct dirent de; + WIN32_FIND_DATA find_data; + + if (dirp == NULL) + return NULL; + + if (dirp->find_handle == INVALID_HANDLE_VALUE) { + char search_pattern[MAX_PATH]; + + snprintf(search_pattern, sizeof(search_pattern), "%s\\*", + dirp->dirname); + dirp->find_handle = FindFirstFileA(search_pattern, &find_data); + if (dirp->find_handle == INVALID_HANDLE_VALUE) + return NULL; + } else { + if (!FindNextFile(dirp->find_handle, &find_data)) + return NULL; + } + + snprintf(de.d_name, sizeof(de.d_name), find_data.cFileName); + de.d_ino = 0; + + return &de; +} + +uid_t geteuid(void) +{ + log_err("%s is not implemented\n", __func__); + errno = ENOSYS; + return -1; +} + +in_addr_t inet_network(const char *cp) +{ + in_addr_t hbo; + in_addr_t nbo = inet_addr(cp); + hbo = ((nbo & 0xFF) << 24) + ((nbo & 0xFF00) << 8) + ((nbo & 0xFF0000) >> 8) + ((nbo & 0xFF000000) >> 24); + return hbo; +} + +#ifdef CONFIG_WINDOWS_XP +const char *inet_ntop(int af, const void *restrict src, char *restrict dst, + socklen_t size) +{ + INT status = SOCKET_ERROR; + WSADATA wsd; + char *ret = NULL; + + if (af != AF_INET && af != AF_INET6) { + errno = EAFNOSUPPORT; + return NULL; + } + + WSAStartup(MAKEWORD(2,2), &wsd); + + if (af == AF_INET) { + struct sockaddr_in si; + DWORD len = size; + + memset(&si, 0, sizeof(si)); + si.sin_family = af; + memcpy(&si.sin_addr, src, sizeof(si.sin_addr)); + status = WSAAddressToString((struct sockaddr*)&si, sizeof(si), NULL, dst, &len); + } else if (af 
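The poll() shim above translates the event mask into three select() sets: POLLIN maps to the read set, POLLOUT to the write set, and every valid descriptor is also placed in the exception set, with exceptions surfaced as POLLHUP. This leans on two Winsock properties: select()'s first argument is ignored on Windows, and the fd_set macros accept SOCKET handles rather than small integers. Since revents is only cleared for negative descriptors, callers are expected to pass zeroed revents fields, as in this sketch (sock is assumed to be a connected Winsock socket, handle_readable() a hypothetical handler):

struct pollfd pfd = { .fd = (int)sock, .events = POLLIN, .revents = 0 };

if (poll(&pfd, 1, 1000) > 0 && (pfd.revents & POLLIN))
	handle_readable(sock);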
== AF_INET6) { + struct sockaddr_in6 si6; + DWORD len = size; + + memset(&si6, 0, sizeof(si6)); + si6.sin6_family = af; + memcpy(&si6.sin6_addr, src, sizeof(si6.sin6_addr)); + status = WSAAddressToString((struct sockaddr*)&si6, sizeof(si6), NULL, dst, &len); + } + + if (status != SOCKET_ERROR) + ret = dst; + else + errno = ENOSPC; + + WSACleanup(); + + return ret; +} + +int inet_pton(int af, const char *restrict src, void *restrict dst) +{ + INT status = SOCKET_ERROR; + WSADATA wsd; + int ret = 1; + + if (af != AF_INET && af != AF_INET6) { + errno = EAFNOSUPPORT; + return -1; + } + + WSAStartup(MAKEWORD(2,2), &wsd); + + if (af == AF_INET) { + struct sockaddr_in si; + INT len = sizeof(si); + + memset(&si, 0, sizeof(si)); + si.sin_family = af; + status = WSAStringToAddressA((char*)src, af, NULL, (struct sockaddr*)&si, &len); + if (status != SOCKET_ERROR) + memcpy(dst, &si.sin_addr, sizeof(si.sin_addr)); + } else if (af == AF_INET6) { + struct sockaddr_in6 si6; + INT len = sizeof(si6); + + memset(&si6, 0, sizeof(si6)); + si6.sin6_family = af; + status = WSAStringToAddressA((char*)src, af, NULL, (struct sockaddr*)&si6, &len); + if (status != SOCKET_ERROR) + memcpy(dst, &si6.sin6_addr, sizeof(si6.sin6_addr)); + } + + if (status == SOCKET_ERROR) { + errno = ENOSPC; + ret = 0; + } + + WSACleanup(); + + return ret; +} +#endif /* CONFIG_WINDOWS_XP */ diff --git a/os/windows/posix.h b/os/windows/posix.h new file mode 100644 index 0000000..02a9075 --- /dev/null +++ b/os/windows/posix.h @@ -0,0 +1,10 @@ +#ifndef FIO_WINDOWS_POSIX_H +#define FIO_WINDOWS_POSIX_H + +typedef int clockid_t; + +extern int clock_gettime(clockid_t clock_id, struct timespec *tp); +extern int inet_aton(const char *, struct in_addr *); +extern int win_to_posix_error(DWORD winerr); + +#endif diff --git a/os/windows/posix/include/arpa/inet.h b/os/windows/posix/include/arpa/inet.h new file mode 100644 index 0000000..056f1dd --- /dev/null +++ b/os/windows/posix/include/arpa/inet.h @@ -0,0 +1,21 @@ +#ifndef ARPA_INET_H +#define ARPA_INET_H + +#include +#include + +typedef int socklen_t; +typedef int in_addr_t; + +/* EAI_SYSTEM isn't used on Windows, so map it to EAI_FAIL */ +#define EAI_SYSTEM EAI_FAIL + +in_addr_t inet_network(const char *cp); + +#ifdef CONFIG_WINDOWS_XP +const char *inet_ntop(int af, const void *restrict src, + char *restrict dst, socklen_t size); +int inet_pton(int af, const char *restrict src, void *restrict dst); +#endif + +#endif /* ARPA_INET_H */ diff --git a/os/windows/posix/include/asm/types.h b/os/windows/posix/include/asm/types.h new file mode 100644 index 0000000..7e1489a --- /dev/null +++ b/os/windows/posix/include/asm/types.h @@ -0,0 +1,8 @@ +#ifndef ASM_TYPES_H +#define ASM_TYPES_H + +typedef unsigned short __u16; +typedef unsigned int __u32; +typedef unsigned long long __u64; + +#endif /* ASM_TYPES_H */ diff --git a/os/windows/posix/include/dirent.h b/os/windows/posix/include/dirent.h new file mode 100644 index 0000000..6b4e671 --- /dev/null +++ b/os/windows/posix/include/dirent.h @@ -0,0 +1,24 @@ +#ifndef DIRENT_H +#define DIRENT_H + +#include + +struct dirent +{ + ino_t d_ino; /* File serial number */ + char d_name[MAX_PATH]; /* Name of entry */ +}; + +struct dirent_ctx +{ + HANDLE find_handle; + char dirname[MAX_PATH]; +}; + +typedef struct dirent_ctx DIR; + +DIR *opendir(const char *dirname); +struct dirent *readdir(DIR *dirp); +int closedir(DIR *dirp); + +#endif /* DIRENT_H */ diff --git a/os/windows/posix/include/dlfcn.h b/os/windows/posix/include/dlfcn.h new file mode 100644 index 
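A minimal consumer of the dirent emulation declared above, assuming a directory that CreateFileA can open with backup semantics:

#include <stdio.h>
#include "dirent.h"

/* Sketch: print every entry (including "." and "..") in a directory. */
static int list_dir(const char *path)
{
	DIR *d = opendir(path);
	struct dirent *de;

	if (!d)
		return -1;	/* errno mapped from the Windows error */
	while ((de = readdir(d)) != NULL)
		printf("%s\n", de->d_name);
	return closedir(d);
}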
0000000..57f2930 --- /dev/null +++ b/os/windows/posix/include/dlfcn.h @@ -0,0 +1,11 @@ +#ifndef DLFCN_H +#define DLFCN_H + +#define RTLD_LAZY 1 + +void *dlopen(const char *file, int mode); +int dlclose(void *handle); +void *dlsym(void *restrict handle, const char *restrict name); +char *dlerror(void); + +#endif /* DLFCN_H */ diff --git a/os/windows/posix/include/libgen.h b/os/windows/posix/include/libgen.h new file mode 100644 index 0000000..5c4fb23 --- /dev/null +++ b/os/windows/posix/include/libgen.h @@ -0,0 +1,6 @@ +#ifndef LIBGEN_H +#define LIBGEN_H + +char *basename(char *path); + +#endif /* LIBGEN_H */ diff --git a/os/windows/posix/include/netdb.h b/os/windows/posix/include/netdb.h new file mode 100644 index 0000000..5dace16 --- /dev/null +++ b/os/windows/posix/include/netdb.h @@ -0,0 +1,4 @@ +#ifndef NETDB_H +#define NETDB_H + +#endif /* NETDB_H */ diff --git a/os/windows/posix/include/netinet/in.h b/os/windows/posix/include/netinet/in.h new file mode 100644 index 0000000..f7e2419 --- /dev/null +++ b/os/windows/posix/include/netinet/in.h @@ -0,0 +1,8 @@ +#ifndef NETINET_IN_H +#define NETINET_IN_H + +#include +#include + + +#endif /* NETINET_IN_H */ diff --git a/os/windows/posix/include/netinet/tcp.h b/os/windows/posix/include/netinet/tcp.h new file mode 100644 index 0000000..250c4c3 --- /dev/null +++ b/os/windows/posix/include/netinet/tcp.h @@ -0,0 +1,4 @@ +#ifndef NETINET_TCP_H +#define NETINET_TCP_H + +#endif diff --git a/os/windows/posix/include/poll.h b/os/windows/posix/include/poll.h new file mode 100644 index 0000000..25b8183 --- /dev/null +++ b/os/windows/posix/include/poll.h @@ -0,0 +1,24 @@ +#ifndef POLL_H +#define POLL_H + +#include + +typedef int nfds_t; + +#ifdef CONFIG_WINDOWS_XP +struct pollfd +{ + int fd; + short events; + short revents; +}; + +#define POLLOUT 1 +#define POLLIN 2 +#define POLLERR 0 +#define POLLHUP 1 +#endif /* CONFIG_WINDOWS_XP */ + +int poll(struct pollfd fds[], nfds_t nfds, int timeout); + +#endif /* POLL_H */ diff --git a/os/windows/posix/include/semaphore.h b/os/windows/posix/include/semaphore.h new file mode 100644 index 0000000..39cd624 --- /dev/null +++ b/os/windows/posix/include/semaphore.h @@ -0,0 +1,4 @@ +#ifndef SEMAPHORE_H +#define SEMAPHORE_H + +#endif /* SEMAPHORE_H */ diff --git a/os/windows/posix/include/sys/ioctl.h b/os/windows/posix/include/sys/ioctl.h new file mode 100644 index 0000000..a42247d --- /dev/null +++ b/os/windows/posix/include/sys/ioctl.h @@ -0,0 +1,7 @@ +#ifndef IOCTL_H +#define IOCTL_H + +/* This file is empty since it only needs to exist on Windows + but isn't otherwise used */ + +#endif /* IOCTL_H */ \ No newline at end of file diff --git a/os/windows/posix/include/sys/ipc.h b/os/windows/posix/include/sys/ipc.h new file mode 100644 index 0000000..abf26e8 --- /dev/null +++ b/os/windows/posix/include/sys/ipc.h @@ -0,0 +1,4 @@ +#ifndef SYS_IPC_H +#define SYS_IPC_H + +#endif /* SYS_IPC_H */ diff --git a/os/windows/posix/include/sys/mman.h b/os/windows/posix/include/sys/mman.h new file mode 100644 index 0000000..8edd9fc --- /dev/null +++ b/os/windows/posix/include/sys/mman.h @@ -0,0 +1,36 @@ +#ifndef SYS_MMAN_H +#define SYS_MMAN_H + +#include + +#define PROT_NONE 0x1 +#define PROT_READ 0x2 +#define PROT_WRITE 0x4 + +#define MAP_ANON 0x1 +#define MAP_ANONYMOUS MAP_ANON +#define MAP_FIXED 0x2 +#define MAP_HASSEMAPHORE 0x4 +#define MAP_INHERIT 0x8 +#define MAP_NOCORE 0x10 +#define MAP_NOSYNC 0x20 +#define MAP_PREFAULT_READ 0x40 +#define MAP_PRIVATE 0x80 +#define MAP_SHARED 0x100 +#define MAP_STACK 0x200 + +#define 
MAP_FAILED NULL + +#define MS_ASYNC 0x1 +#define MS_SYNC 0x2 +#define MS_INVALIDATE 0x3 + +int posix_madvise(void *addr, size_t len, int advice); +void *mmap(void *addr, size_t len, int prot, int flags, + int fildes, off_t off); +int munmap(void *addr, size_t len); +int msync(void *addr, size_t len, int flags); +int munlock(const void * addr, size_t len); +int mlock(const void *addr, size_t len); + +#endif /* SYS_MMAN_H */ diff --git a/os/windows/posix/include/sys/resource.h b/os/windows/posix/include/sys/resource.h new file mode 100644 index 0000000..aa95705 --- /dev/null +++ b/os/windows/posix/include/sys/resource.h @@ -0,0 +1,19 @@ +#ifndef SYS_RESOURCE_H +#define SYS_RESOURCE_H + +#define RUSAGE_SELF 0 +#define RUSAGE_THREAD 1 + +struct rusage +{ + struct timeval ru_utime; + struct timeval ru_stime; + int ru_nvcsw; + int ru_minflt; + int ru_majflt; + int ru_nivcsw; +}; + +int getrusage(int who, struct rusage *r_usage); + +#endif /* SYS_RESOURCE_H */ diff --git a/os/windows/posix/include/sys/shm.h b/os/windows/posix/include/sys/shm.h new file mode 100644 index 0000000..c059274 --- /dev/null +++ b/os/windows/posix/include/sys/shm.h @@ -0,0 +1,41 @@ +#ifndef SYS_SHM_H +#define SYS_SHM_H + +#define IPC_RMID 0x1 +#define IPC_CREAT 0x2 +#define IPC_PRIVATE 0x4 + +typedef int uid_t; +typedef int gid_t; + +typedef int shmatt_t; +typedef int key_t; + +struct ipc_perm +{ + uid_t uid; /* owner's user ID */ + gid_t gid; /* owner's group ID */ + uid_t cuid; /* creator's user ID */ + gid_t cgid; /* creator's group ID */ + mode_t mode; /* read/write permission */ +}; + + +struct shmid_ds +{ + struct ipc_perm shm_perm; /* operation permission structure */ + size_t shm_segsz; /* size of segment in bytes */ + pid_t shm_lpid; /* process ID of last shared memory operation */ + pid_t shm_cpid; /* process ID of creator */ + shmatt_t shm_nattch; /* number of current attaches */ + time_t shm_atime; /* time of last shmat() */ + time_t shm_dtime; /* time of last shmdt() */ + time_t shm_ctime; /* time of last change by shmctl() */ +}; + +int shmctl(int shmid, int cmd, struct shmid_ds *buf); +int shmget(key_t key, size_t size, int shmflg); +void *shmat(int shmid, const void *shmaddr, int shmflg); +int shmdt(const void *shmaddr); + +#endif /* SYS_SHM_H */ diff --git a/os/windows/posix/include/sys/socket.h b/os/windows/posix/include/sys/socket.h new file mode 100644 index 0000000..4da6f2f --- /dev/null +++ b/os/windows/posix/include/sys/socket.h @@ -0,0 +1,4 @@ +#ifndef SYS_SOCKET_H +#define SYS_SOCKET_H + +#endif /* SYS_SOCKET_H */ diff --git a/os/windows/posix/include/sys/uio.h b/os/windows/posix/include/sys/uio.h new file mode 100644 index 0000000..402e988 --- /dev/null +++ b/os/windows/posix/include/sys/uio.h @@ -0,0 +1,16 @@ +#ifndef SYS_UIO_H +#define SYS_UIO_H + +#include +#include + +struct iovec +{ + void *iov_base; /* Base address of a memory region for input or output */ + size_t iov_len; /* The size of the memory pointed to by iov_base */ +}; + + ssize_t readv(int fildes, const struct iovec *iov, int iovcnt); + ssize_t writev(int fildes, const struct iovec *iov, int iovcnt); + +#endif /* SYS_UIO_H */ diff --git a/os/windows/posix/include/sys/un.h b/os/windows/posix/include/sys/un.h new file mode 100644 index 0000000..e418c6d --- /dev/null +++ b/os/windows/posix/include/sys/un.h @@ -0,0 +1,13 @@ +#ifndef SYS_UN_H +#define SYS_UN_H + +typedef int sa_family_t; +typedef int in_port_t; + + struct sockaddr_un + { + sa_family_t sun_family; /* Address family */ + char sun_path[260]; /* Socket pathname */ +}; + 
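These readv()/writev() prototypes pair with the shims defined earlier in posix.c: readv() simply fails with ENOSYS, and writev() walks the vector calling send(), so it only works on sockets. Gathered writes to a regular file need manual flattening, as in this sketch (file_writev() is an illustrative name):

/* Sketch: emulate writev() on a plain file with one write() per iovec. */
static ssize_t file_writev(int fd, const struct iovec *iov, int iovcnt)
{
	ssize_t total = 0;
	int i;

	for (i = 0; i < iovcnt; i++) {
		ssize_t n = write(fd, iov[i].iov_base, iov[i].iov_len);

		if (n < 0)
			return total ? total : -1;
		total += n;
		if ((size_t)n < iov[i].iov_len)
			break;	/* short write: report what landed */
	}
	return total;
}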
+#endif /* SYS_UN_H */ diff --git a/os/windows/posix/include/sys/wait.h b/os/windows/posix/include/sys/wait.h new file mode 100644 index 0000000..ac50aa8 --- /dev/null +++ b/os/windows/posix/include/sys/wait.h @@ -0,0 +1,12 @@ +#ifndef SYS_WAIT_H +#define SYS_WAIT_H + +#define WIFSIGNALED(a) 0 +#define WIFEXITED(a) 0 +#define WTERMSIG(a) 0 +#define WEXITSTATUS(a) 0 +#define WNOHANG 1 + +pid_t waitpid(pid_t, int *stat_loc, int options); + +#endif /* SYS_WAIT_H */ diff --git a/os/windows/posix/include/syslog.h b/os/windows/posix/include/syslog.h new file mode 100644 index 0000000..b8582e9 --- /dev/null +++ b/os/windows/posix/include/syslog.h @@ -0,0 +1,18 @@ +#ifndef SYSLOG_H +#define SYSLOG_H + +int syslog(); + +#define LOG_INFO 0x1 +#define LOG_ERROR 0x2 +#define LOG_WARN 0x4 + +#define LOG_NDELAY 0x1 +#define LOG_NOWAIT 0x2 +#define LOG_PID 0x4 +#define LOG_USER 0x8 + +void closelog(void); +void openlog(const char *ident, int logopt, int facility); + +#endif /* SYSLOG_H */ diff --git a/oslib/asprintf.c b/oslib/asprintf.c new file mode 100644 index 0000000..ff503c5 --- /dev/null +++ b/oslib/asprintf.c @@ -0,0 +1,43 @@ +#include +#include +#include +#include "oslib/asprintf.h" + +#ifndef CONFIG_HAVE_VASPRINTF +int vasprintf(char **strp, const char *fmt, va_list ap) +{ + va_list ap_copy; + char *str; + int len; + +#ifdef va_copy + va_copy(ap_copy, ap); +#else + __va_copy(ap_copy, ap); +#endif + len = vsnprintf(NULL, 0, fmt, ap_copy); + va_end(ap_copy); + + if (len < 0) + return len; + + len++; + str = malloc(len); + *strp = str; + return str ? vsnprintf(str, len, fmt, ap) : -1; +} +#endif + +#ifndef CONFIG_HAVE_ASPRINTF +int asprintf(char **strp, const char *fmt, ...) +{ + va_list arg; + int done; + + va_start(arg, fmt); + done = vasprintf(strp, fmt, arg); + va_end(arg); + + return done; +} +#endif diff --git a/oslib/asprintf.h b/oslib/asprintf.h new file mode 100644 index 0000000..7425300 --- /dev/null +++ b/oslib/asprintf.h @@ -0,0 +1,11 @@ +#ifndef FIO_ASPRINTF_H +#define FIO_ASPRINTF_H + +#ifndef CONFIG_HAVE_VASPRINTF +int vasprintf(char **strp, const char *fmt, va_list ap); +#endif +#ifndef CONFIG_HAVE_ASPRINTF +int asprintf(char **strp, const char *fmt, ...); +#endif + +#endif /* FIO_ASPRINTF_H */ diff --git a/oslib/getopt.h b/oslib/getopt.h new file mode 100644 index 0000000..bc8a268 --- /dev/null +++ b/oslib/getopt.h @@ -0,0 +1,26 @@ +#ifdef CONFIG_GETOPT_LONG_ONLY + +#include + +#else + +#ifndef _GETOPT_H +#define _GETOPT_H + +struct option { + const char *name; + int has_arg; + int *flag; + int val; +}; + +enum { + no_argument = 0, + required_argument = 1, + optional_argument = 2, +}; + +int getopt_long_only(int, char *const *, const char *, const struct option *, int *); + +#endif +#endif diff --git a/oslib/getopt_long.c b/oslib/getopt_long.c new file mode 100644 index 0000000..8ec7741 --- /dev/null +++ b/oslib/getopt_long.c @@ -0,0 +1,193 @@ +/* + * getopt.c + * + * getopt_long(), or at least a common subset thereof: + * + * - Option reordering is not supported + * - -W foo is not supported + * - First optstring character "-" not supported. 
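The vasprintf() fallback above uses the standard two-pass idiom: a NULL/0 vsnprintf() call measures the formatted length, then the real format runs into a buffer of exactly that size. Typical use through asprintf(), assuming the fallback is in effect (CONFIG_HAVE_ASPRINTF unset); mtd_name_path() is an illustrative name:

#include <stdlib.h>
#include "oslib/asprintf.h"

/* Sketch: build a sysfs path; the caller frees the result. */
static char *mtd_name_path(int mtd_num)
{
	char *path;

	if (asprintf(&path, "/sys/class/mtd/mtd%d/name", mtd_num) < 0)
		return NULL;	/* allocation or formatting failed */
	return path;
}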
+ * + * This file was imported from the klibc library from hpa + */ + +#include +#include +#include + +#include "getopt.h" + +char *optarg = NULL; +int optind = 0, opterr = 0, optopt = 0; + +static struct getopt_private_state { + const char *optptr; + const char *last_optstring; + char *const *last_argv; +} pvt; + +static inline const char *option_matches(const char *arg_str, + const char *opt_name, int smatch) +{ + while (*arg_str != '\0' && *arg_str != '=') { + if (*arg_str++ != *opt_name++) + return NULL; + } + + if (*opt_name && !smatch) + return NULL; + + return arg_str; +} + +int getopt_long_only(int argc, char *const *argv, const char *optstring, + const struct option *longopts, int *longindex) +{ + const char *carg; + const char *osptr; + int opt; + + optarg = NULL; + + /* getopt() relies on a number of different global state + variables, which can make this really confusing if there is + more than one use of getopt() in the same program. This + attempts to detect that situation by detecting if the + "optstring" or "argv" argument have changed since last time + we were called; if so, reinitialize the query state. */ + + if (optstring != pvt.last_optstring || argv != pvt.last_argv || + optind < 1 || optind > argc) { + /* optind doesn't match the current query */ + pvt.last_optstring = optstring; + pvt.last_argv = argv; + optind = 1; + pvt.optptr = NULL; + } + + carg = argv[optind]; + + /* First, eliminate all non-option cases */ + + if (!carg || carg[0] != '-' || !carg[1]) + return -1; + + if (carg[1] == '-') { + const struct option *lo; + const char *opt_end = NULL; + + optind++; + + /* Either it's a long option, or it's -- */ + if (!carg[2]) { + /* It's -- */ + return -1; + } + + for (lo = longopts; lo->name; lo++) { + opt_end = option_matches(carg+2, lo->name, 0); + if (opt_end) + break; + } + /* + * The GNU getopt_long_only() apparently allows a short match, + * if it's unique and if we don't have a full match. Let's + * do the same here, search and see if there is one (and only + * one) short match. + */ + if (!opt_end) { + const struct option *lo_match = NULL; + + for (lo = longopts; lo->name; lo++) { + const char *ret; + + ret = option_matches(carg+2, lo->name, 1); + if (!ret) + continue; + if (!opt_end) { + opt_end = ret; + lo_match = lo; + } else { + opt_end = NULL; + break; + } + } + if (!opt_end) + return '?'; + lo = lo_match; + } + + if (longindex) + *longindex = lo-longopts; + + if (*opt_end == '=') { + if (lo->has_arg) + optarg = (char *)opt_end+1; + else + return '?'; + } else if (lo->has_arg == 1) { + if (!(optarg = argv[optind])) + return '?'; + optind++; + } + + if (lo->flag) { + *lo->flag = lo->val; + return 0; + } else { + return lo->val; + } + } + + if ((uintptr_t) (pvt.optptr - carg) > (uintptr_t) strlen(carg)) { + /* Someone frobbed optind, change to new opt. */ + pvt.optptr = carg + 1; + } + + opt = *pvt.optptr++; + + if (opt != ':' && (osptr = strchr(optstring, opt))) { + if (osptr[1] == ':') { + if (*pvt.optptr) { + /* Argument-taking option with attached + argument */ + optarg = (char *)pvt.optptr; + optind++; + } else { + /* Argument-taking option with non-attached + argument */ + if (osptr[2] == ':') { + if (argv[optind + 1]) { + optarg = (char *)argv[optind+1]; + optind += 2; + } else { + optarg = NULL; + optind++; + } + return opt; + } else if (argv[optind + 1]) { + optarg = (char *)argv[optind+1]; + optind += 2; + } else { + /* Missing argument */ + optind++; + return (optstring[0] == ':') + ? 
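/*
 * Classic getopt convention, preserved by the fallback above: when the
 * optstring begins with ':', a missing option argument is reported as
 * ':' so callers can tell it apart from an unknown option, which is
 * always reported as '?'.
 */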
':' : '?'; + } + } + return opt; + } else { + /* Non-argument-taking option */ + /* pvt.optptr will remember the exact position to + resume at */ + if (!*pvt.optptr) + optind++; + return opt; + } + } else { + /* Unknown option */ + optopt = opt; + if (!*pvt.optptr) + optind++; + return '?'; + } +} diff --git a/oslib/inet_aton.c b/oslib/inet_aton.c new file mode 100644 index 0000000..7ae7db7 --- /dev/null +++ b/oslib/inet_aton.c @@ -0,0 +1,6 @@ +#include "inet_aton.h" + +int inet_aton(const char *cp, struct in_addr *inp) +{ + return inet_pton(AF_INET, cp, inp); +} diff --git a/oslib/inet_aton.h b/oslib/inet_aton.h new file mode 100644 index 0000000..c93c87f --- /dev/null +++ b/oslib/inet_aton.h @@ -0,0 +1,8 @@ +#ifndef FIO_INET_ATON_LIB_H +#define FIO_INET_ATON_LIB_H + +#include + +int inet_aton(const char *cp, struct in_addr *inp); + +#endif diff --git a/oslib/libmtd.c b/oslib/libmtd.c new file mode 100644 index 0000000..385b9d2 --- /dev/null +++ b/oslib/libmtd.c @@ -0,0 +1,1424 @@ +/* + * Copyright (c) International Business Machines Corp., 2006 + * Copyright (C) 2009 Nokia Corporation + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + * + * Author: Artem Bityutskiy + * + * MTD library. + */ + +/* Imported from mtd-utils by dehrenberg */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include "libmtd.h" + +#include "libmtd_int.h" +#include "libmtd_common.h" + +/** + * mkpath - compose full path from 2 given components. + * @path: the first component + * @name: the second component + * + * This function returns the resulting path in case of success and %NULL in + * case of failure. + */ +static char *mkpath(const char *path, const char *name) +{ + char *n; + size_t len1 = strlen(path); + size_t len2 = strlen(name); + + n = xmalloc(len1 + len2 + 6); + + memcpy(n, path, len1); + if (n[len1 - 1] != '/') + n[len1++] = '/'; + + memcpy(n + len1, name, len2 + 1); + return n; +} + +/** + * read_data - read data from a file. + * @file: the file to read from + * @buf: the buffer to read to + * @buf_len: buffer length + * + * This function returns number of read bytes in case of success and %-1 in + * case of failure. Note, if the file contains more then @buf_len bytes of + * date, this function fails with %EINVAL error code. 
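A compact caller for the getopt_long_only() subset implemented above; since option reordering is unsupported, options must precede positional arguments. The option names here are illustrative:

#include <stdio.h>
#include "getopt.h"

extern char *optarg;	/* defined in oslib/getopt_long.c */

/* Sketch: parse --verbose and --output <file>. */
static int parse_args(int argc, char *const argv[])
{
	static const struct option longopts[] = {
		{ "verbose", no_argument,       NULL, 'v' },
		{ "output",  required_argument, NULL, 'o' },
		{ NULL, 0, NULL, 0 },
	};
	int c;

	while ((c = getopt_long_only(argc, argv, "vo:", longopts, NULL)) != -1) {
		switch (c) {
		case 'v':
			puts("verbose");
			break;
		case 'o':
			printf("output=%s\n", optarg);
			break;
		default:
			return -1;	/* '?' for unknown options */
		}
	}
	return 0;
}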
+ */
+static int read_data(const char *file, void *buf, int buf_len)
+{
+	int fd, rd, tmp, tmp1;
+
+	fd = open(file, O_RDONLY | O_CLOEXEC);
+	if (fd == -1)
+		return -1;
+
+	rd = read(fd, buf, buf_len);
+	if (rd == -1) {
+		sys_errmsg("cannot read \"%s\"", file);
+		goto out_error;
+	}
+
+	if (rd == buf_len) {
+		errmsg("contents of \"%s\" is too long", file);
+		errno = EINVAL;
+		goto out_error;
+	}
+
+	((char *)buf)[rd] = '\0';
+
+	/* Make sure all data is read */
+	tmp1 = read(fd, &tmp, 1);
+	if (tmp1 == -1) {
+		sys_errmsg("cannot read \"%s\"", file);
+		goto out_error;
+	}
+	if (tmp1) {
+		errmsg("file \"%s\" contains too much data (> %d bytes)",
+		       file, buf_len);
+		errno = EINVAL;
+		goto out_error;
+	}
+
+	if (close(fd)) {
+		sys_errmsg("close failed on \"%s\"", file);
+		return -1;
+	}
+
+	return rd;
+
+out_error:
+	close(fd);
+	return -1;
+}
+
+/**
+ * read_major - read major and minor numbers from a file.
+ * @file: name of the file to read from
+ * @major: major number is returned here
+ * @minor: minor number is returned here
+ *
+ * This function returns %0 in case of success, and %-1 in case of failure.
+ */
+static int read_major(const char *file, int *major, int *minor)
+{
+	int ret;
+	char buf[50];
+
+	ret = read_data(file, buf, 50);
+	if (ret < 0)
+		return ret;
+
+	ret = sscanf(buf, "%d:%d\n", major, minor);
+	if (ret != 2) {
+		errno = EINVAL;
+		return errmsg("\"%s\" does not have major:minor format", file);
+	}
+
+	if (*major < 0 || *minor < 0) {
+		errno = EINVAL;
+		return errmsg("bad major:minor %d:%d in \"%s\"",
+			      *major, *minor, file);
+	}
+
+	return 0;
+}
+
+/**
+ * dev_get_major - get major and minor numbers of an MTD device.
+ * @lib: libmtd descriptor
+ * @mtd_num: MTD device number
+ * @major: major number is returned here
+ * @minor: minor number is returned here
+ *
+ * This function returns zero in case of success and %-1 in case of failure.
+ */
+static int dev_get_major(struct libmtd *lib, int mtd_num, int *major, int *minor)
+{
+	char file[strlen(lib->mtd_dev) + 50];
+
+	sprintf(file, lib->mtd_dev, mtd_num);
+	return read_major(file, major, minor);
+}
+
+/**
+ * dev_read_data - read data from an MTD device's sysfs file.
+ * @patt: file pattern to read from
+ * @mtd_num: MTD device number
+ * @buf: buffer to read to
+ * @buf_len: buffer length
+ *
+ * This function returns number of read bytes in case of success and %-1 in
+ * case of failure.
+ */
+static int dev_read_data(const char *patt, int mtd_num, void *buf, int buf_len)
+{
+	char file[strlen(patt) + 100];
+
+	sprintf(file, patt, mtd_num);
+	return read_data(file, buf, buf_len);
+}
+
+/**
+ * read_hex_ll - read a hex 'long long' value from a file.
+ * @file: the file to read from
+ * @value: the result is stored here
+ *
+ * This function reads file @file and interprets its contents as hexadecimal
+ * 'long long' integer. If this is not true, it fails with %EINVAL error code.
+ * Returns %0 in case of success and %-1 in case of failure.
+ */
+static int read_hex_ll(const char *file, long long *value)
+{
+	int fd, rd;
+	char buf[50];
+
+	fd = open(file, O_RDONLY | O_CLOEXEC);
+	if (fd == -1)
+		return -1;
+
+	rd = read(fd, buf, sizeof(buf));
+	if (rd == -1) {
+		sys_errmsg("cannot read \"%s\"", file);
+		goto out_error;
+	}
+	if (rd == sizeof(buf)) {
+		errmsg("contents of \"%s\" is too long", file);
+		errno = EINVAL;
+		goto out_error;
+	}
+	buf[rd] = '\0';
+
+	if (sscanf(buf, "%llx\n", value) != 1) {
+		errmsg("cannot read integer from \"%s\"\n", file);
+		errno = EINVAL;
+		goto out_error;
+	}
+
+	if (*value < 0) {
+		errmsg("negative value %lld in \"%s\"", *value, file);
+		errno = EINVAL;
+		goto out_error;
+	}
+
+	if (close(fd))
+		return sys_errmsg("close failed on \"%s\"", file);
+
+	return 0;
+
+out_error:
+	close(fd);
+	return -1;
+}
+
+/**
+ * read_pos_ll - read a positive 'long long' value from a file.
+ * @file: the file to read from
+ * @value: the result is stored here
+ *
+ * This function reads file @file and interprets its contents as a positive
+ * 'long long' integer. If this is not true, it fails with %EINVAL error code.
+ * Returns %0 in case of success and %-1 in case of failure.
+ */
+static int read_pos_ll(const char *file, long long *value)
+{
+	int fd, rd;
+	char buf[50];
+
+	fd = open(file, O_RDONLY | O_CLOEXEC);
+	if (fd == -1)
+		return -1;
+
+	rd = read(fd, buf, 50);
+	if (rd == -1) {
+		sys_errmsg("cannot read \"%s\"", file);
+		goto out_error;
+	}
+	if (rd == 50) {
+		errmsg("contents of \"%s\" is too long", file);
+		errno = EINVAL;
+		goto out_error;
+	}
+	buf[rd] = '\0';
+
+	if (sscanf(buf, "%lld\n", value) != 1) {
+		errmsg("cannot read integer from \"%s\"\n", file);
+		errno = EINVAL;
+		goto out_error;
+	}
+
+	if (*value < 0) {
+		errmsg("negative value %lld in \"%s\"", *value, file);
+		errno = EINVAL;
+		goto out_error;
+	}
+
+	if (close(fd))
+		return sys_errmsg("close failed on \"%s\"", file);
+
+	return 0;
+
+out_error:
+	close(fd);
+	return -1;
+}
+
+/**
+ * read_hex_int - read an 'int' value from a file.
+ * @file: the file to read from
+ * @value: the result is stored here
+ *
+ * This function is the same as 'read_hex_ll()', but it reads an 'int'
+ * value, not 'long long'.
+ */
+static int read_hex_int(const char *file, int *value)
+{
+	long long res;
+
+	if (read_hex_ll(file, &res))
+		return -1;
+
+	/* Make sure the value has correct range */
+	if (res > INT_MAX || res < INT_MIN) {
+		errmsg("value %lld read from file \"%s\" is out of range",
+		       res, file);
+		errno = EINVAL;
+		return -1;
+	}
+
+	*value = res;
+	return 0;
+}
+
+/**
+ * read_pos_int - read a positive 'int' value from a file.
+ * @file: the file to read from
+ * @value: the result is stored here
+ *
+ * This function is the same as 'read_pos_ll()', but it reads an 'int'
+ * value, not 'long long'.
+ */
+static int read_pos_int(const char *file, int *value)
+{
+	long long res;
+
+	if (read_pos_ll(file, &res))
+		return -1;
+
+	/* Make sure the value is not too big */
+	if (res > INT_MAX) {
+		errmsg("value %lld read from file \"%s\" is out of range",
+		       res, file);
+		errno = EINVAL;
+		return -1;
+	}
+
+	*value = res;
+	return 0;
+}
+
+/**
+ * dev_read_hex_int - read a hex 'int' value from an MTD device sysfs file.
+ * @patt: file pattern to read from
+ * @mtd_num: MTD device number
+ * @value: the result is stored here
+ *
+ * This function returns %0 in case of success and %-1 in case of failure.
+ */ +static int dev_read_hex_int(const char *patt, int mtd_num, int *value) +{ + char file[strlen(patt) + 50]; + + sprintf(file, patt, mtd_num); + return read_hex_int(file, value); +} + +/** + * dev_read_pos_int - read a positive 'int' value from an MTD device sysfs file. + * @patt: file pattern to read from + * @mtd_num: MTD device number + * @value: the result is stored here + * + * This function returns %0 in case of success and %-1 in case of failure. + */ +static int dev_read_pos_int(const char *patt, int mtd_num, int *value) +{ + char file[strlen(patt) + 50]; + + sprintf(file, patt, mtd_num); + return read_pos_int(file, value); +} + +/** + * dev_read_pos_ll - read a positive 'long long' value from an MTD device sysfs file. + * @patt: file pattern to read from + * @mtd_num: MTD device number + * @value: the result is stored here + * + * This function returns %0 in case of success and %-1 in case of failure. + */ +static int dev_read_pos_ll(const char *patt, int mtd_num, long long *value) +{ + char file[strlen(patt) + 50]; + + sprintf(file, patt, mtd_num); + return read_pos_ll(file, value); +} + +/** + * type_str2int - convert MTD device type to integer. + * @str: MTD device type string to convert + * + * This function converts MTD device type string @str, read from sysfs, into an + * integer. + */ +static int type_str2int(const char *str) +{ + if (!strcmp(str, "nand")) + return MTD_NANDFLASH; + if (!strcmp(str, "mlc-nand")) + return MTD_MLCNANDFLASH; + if (!strcmp(str, "nor")) + return MTD_NORFLASH; + if (!strcmp(str, "rom")) + return MTD_ROM; + if (!strcmp(str, "absent")) + return MTD_ABSENT; + if (!strcmp(str, "dataflash")) + return MTD_DATAFLASH; + if (!strcmp(str, "ram")) + return MTD_RAM; + if (!strcmp(str, "ubi")) + return MTD_UBIVOLUME; + return -1; +} + +/** + * dev_node2num - find UBI device number by its character device node. + * @lib: MTD library descriptor + * @node: name of the MTD device node + * @mtd_num: MTD device number is returned here + * + * This function returns %0 in case of success and %-1 in case of failure. + */ +static int dev_node2num(struct libmtd *lib, const char *node, int *mtd_num) +{ + struct stat st; + int i, mjr, mnr; + struct mtd_info info; + + if (stat(node, &st)) + return sys_errmsg("cannot get information about \"%s\"", node); + + if (!S_ISCHR(st.st_mode)) { + errmsg("\"%s\" is not a character device", node); + errno = EINVAL; + return -1; + } + + mjr = major(st.st_rdev); + mnr = minor(st.st_rdev); + + if (mtd_get_info((libmtd_t *)lib, &info)) + return -1; + + for (i = info.lowest_mtd_num; i <= info.highest_mtd_num; i++) { + int mjr1, mnr1, ret; + + ret = dev_get_major(lib, i, &mjr1, &mnr1); + if (ret) { + if (errno == ENOENT) + continue; + if (!errno) + break; + return -1; + } + + if (mjr1 == mjr && mnr1 == mnr) { + errno = 0; + *mtd_num = i; + return 0; + } + } + + errno = ENODEV; + return -1; +} + +/** + * sysfs_is_supported - check whether the MTD sub-system supports MTD. + * @lib: MTD library descriptor + * + * The Linux kernel MTD subsystem gained MTD support starting from kernel + * 2.6.30 and libmtd tries to use sysfs interface if possible, because the NAND + * sub-page size is available there (and not available at all in pre-sysfs + * kernels). + * + * Very old kernels did not have "/sys/class/mtd" directory. Not very old + * kernels (e.g., 2.6.29) did have "/sys/class/mtd/mtdX" directories, by there + * were no files there, e.g., the "name" file was not present. So all we can do + * is to check for a "/sys/class/mtd/mtdX/name" file. 
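dev_node2num() above resolves a character device node to an MTD number by matching the node's major:minor pair against every mtdX entry reported through sysfs. The stat() half of that lookup, in isolation:

#include <sys/stat.h>
#include <sys/sysmacros.h>	/* major()/minor() on modern glibc */
#include <errno.h>

/* Sketch: fetch the device numbers of a character device node. */
static int chrdev_majmin(const char *node, int *mjr, int *mnr)
{
	struct stat st;

	if (stat(node, &st))
		return -1;
	if (!S_ISCHR(st.st_mode)) {
		errno = EINVAL;
		return -1;
	}
	*mjr = major(st.st_rdev);
	*mnr = minor(st.st_rdev);
	return 0;
}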
But this is not a + * reliable check, because if this is a new system with no MTD devices - we'll + * treat it as a pre-sysfs system. + */ +static int sysfs_is_supported(struct libmtd *lib) +{ + int fd, num = -1; + DIR *sysfs_mtd; + char file[strlen(lib->mtd_name) + 10]; + + sysfs_mtd = opendir(lib->sysfs_mtd); + if (!sysfs_mtd) { + if (errno == ENOENT) { + errno = 0; + return 0; + } + return sys_errmsg("cannot open \"%s\"", lib->sysfs_mtd); + } + + /* + * First of all find an "mtdX" directory. This is needed because there + * may be, for example, mtd1 but no mtd0. + */ + while (1) { + int ret, mtd_num; + char tmp_buf[256]; + struct dirent *dirent; + + dirent = readdir(sysfs_mtd); + if (!dirent) + break; + + if (strlen(dirent->d_name) >= 255) { + errmsg("invalid entry in %s: \"%s\"", + lib->sysfs_mtd, dirent->d_name); + errno = EINVAL; + closedir(sysfs_mtd); + return -1; + } + + ret = sscanf(dirent->d_name, MTD_NAME_PATT"%s", + &mtd_num, tmp_buf); + if (ret == 1) { + num = mtd_num; + break; + } + } + + if (closedir(sysfs_mtd)) + return sys_errmsg("closedir failed on \"%s\"", lib->sysfs_mtd); + + if (num == -1) + /* No mtd device, treat this as pre-sysfs system */ + return 0; + + sprintf(file, lib->mtd_name, num); + fd = open(file, O_RDONLY | O_CLOEXEC); + if (fd == -1) + return 0; + + if (close(fd)) { + sys_errmsg("close failed on \"%s\"", file); + return -1; + } + + return 1; +} + +libmtd_t libmtd_open(void) +{ + struct libmtd *lib; + + lib = xzalloc(sizeof(*lib)); + + lib->offs64_ioctls = OFFS64_IOCTLS_UNKNOWN; + + lib->sysfs_mtd = mkpath("/sys", SYSFS_MTD); + if (!lib->sysfs_mtd) + goto out_error; + + lib->mtd = mkpath(lib->sysfs_mtd, MTD_NAME_PATT); + if (!lib->mtd) + goto out_error; + + lib->mtd_name = mkpath(lib->mtd, MTD_NAME); + if (!lib->mtd_name) + goto out_error; + + if (!sysfs_is_supported(lib)) { + free(lib->mtd); + free(lib->sysfs_mtd); + free(lib->mtd_name); + lib->mtd_name = lib->mtd = lib->sysfs_mtd = NULL; + return lib; + } + + lib->mtd_dev = mkpath(lib->mtd, MTD_DEV); + if (!lib->mtd_dev) + goto out_error; + + lib->mtd_type = mkpath(lib->mtd, MTD_TYPE); + if (!lib->mtd_type) + goto out_error; + + lib->mtd_eb_size = mkpath(lib->mtd, MTD_EB_SIZE); + if (!lib->mtd_eb_size) + goto out_error; + + lib->mtd_size = mkpath(lib->mtd, MTD_SIZE); + if (!lib->mtd_size) + goto out_error; + + lib->mtd_min_io_size = mkpath(lib->mtd, MTD_MIN_IO_SIZE); + if (!lib->mtd_min_io_size) + goto out_error; + + lib->mtd_subpage_size = mkpath(lib->mtd, MTD_SUBPAGE_SIZE); + if (!lib->mtd_subpage_size) + goto out_error; + + lib->mtd_oob_size = mkpath(lib->mtd, MTD_OOB_SIZE); + if (!lib->mtd_oob_size) + goto out_error; + + lib->mtd_region_cnt = mkpath(lib->mtd, MTD_REGION_CNT); + if (!lib->mtd_region_cnt) + goto out_error; + + lib->mtd_flags = mkpath(lib->mtd, MTD_FLAGS); + if (!lib->mtd_flags) + goto out_error; + + lib->sysfs_supported = 1; + return lib; + +out_error: + libmtd_close((libmtd_t)lib); + return NULL; +} + +void libmtd_close(libmtd_t desc) +{ + struct libmtd *lib = (struct libmtd *)desc; + + free(lib->mtd_flags); + free(lib->mtd_region_cnt); + free(lib->mtd_oob_size); + free(lib->mtd_subpage_size); + free(lib->mtd_min_io_size); + free(lib->mtd_size); + free(lib->mtd_eb_size); + free(lib->mtd_type); + free(lib->mtd_dev); + free(lib->mtd_name); + free(lib->mtd); + free(lib->sysfs_mtd); + free(lib); +} + +int mtd_dev_present(libmtd_t desc, int mtd_num) { + struct stat st; + struct libmtd *lib = (struct libmtd *)desc; + + if (!lib->sysfs_supported) { + return legacy_dev_present(mtd_num) == 1; 
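/*
 * The ret == 1 test after sscanf(dirent->d_name, MTD_NAME_PATT"%s", ...)
 * is doing double duty. With MTD_NAME_PATT presumably expanding to
 * "mtd%d" (it comes from libmtd_int.h, which is not part of this hunk),
 * sscanf() returns 1 only when the entry is exactly "mtd<N>"; a return
 * of 2 means trailing characters matched the "%s", e.g. a read-only
 * companion entry such as "mtd0ro", which must be skipped.
 */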
+ } else { + char file[strlen(lib->mtd) + 10]; + + sprintf(file, lib->mtd, mtd_num); + return !stat(file, &st); + } +} + +int mtd_get_info(libmtd_t desc, struct mtd_info *info) +{ + DIR *sysfs_mtd; + struct dirent *dirent; + struct libmtd *lib = (struct libmtd *)desc; + + memset(info, 0, sizeof(struct mtd_info)); + + if (!lib->sysfs_supported) + return legacy_mtd_get_info(info); + + info->sysfs_supported = 1; + + /* + * We have to scan the MTD sysfs directory to identify how many MTD + * devices are present. + */ + sysfs_mtd = opendir(lib->sysfs_mtd); + if (!sysfs_mtd) { + if (errno == ENOENT) { + errno = ENODEV; + return -1; + } + return sys_errmsg("cannot open \"%s\"", lib->sysfs_mtd); + } + + info->lowest_mtd_num = INT_MAX; + while (1) { + int mtd_num, ret; + char tmp_buf[256]; + + errno = 0; + dirent = readdir(sysfs_mtd); + if (!dirent) + break; + + if (strlen(dirent->d_name) >= 255) { + errmsg("invalid entry in %s: \"%s\"", + lib->sysfs_mtd, dirent->d_name); + errno = EINVAL; + goto out_close; + } + + ret = sscanf(dirent->d_name, MTD_NAME_PATT"%s", + &mtd_num, tmp_buf); + if (ret == 1) { + info->mtd_dev_cnt += 1; + if (mtd_num > info->highest_mtd_num) + info->highest_mtd_num = mtd_num; + if (mtd_num < info->lowest_mtd_num) + info->lowest_mtd_num = mtd_num; + } + } + + if (!dirent && errno) { + sys_errmsg("readdir failed on \"%s\"", lib->sysfs_mtd); + goto out_close; + } + + if (closedir(sysfs_mtd)) + return sys_errmsg("closedir failed on \"%s\"", lib->sysfs_mtd); + + if (info->lowest_mtd_num == INT_MAX) + info->lowest_mtd_num = 0; + + return 0; + +out_close: + closedir(sysfs_mtd); + return -1; +} + +int mtd_get_dev_info1(libmtd_t desc, int mtd_num, struct mtd_dev_info *mtd) +{ + int ret; + struct libmtd *lib = (struct libmtd *)desc; + + memset(mtd, 0, sizeof(struct mtd_dev_info)); + mtd->mtd_num = mtd_num; + + if (!mtd_dev_present(desc, mtd_num)) { + errno = ENODEV; + return -1; + } else if (!lib->sysfs_supported) + return legacy_get_dev_info1(mtd_num, mtd); + + if (dev_get_major(lib, mtd_num, &mtd->major, &mtd->minor)) + return -1; + + ret = dev_read_data(lib->mtd_name, mtd_num, &mtd->name, + MTD_NAME_MAX + 1); + if (ret < 0) + return -1; + ((char *)mtd->name)[ret - 1] = '\0'; + + ret = dev_read_data(lib->mtd_type, mtd_num, &mtd->type_str, + MTD_TYPE_MAX + 1); + if (ret < 0) + return -1; + ((char *)mtd->type_str)[ret - 1] = '\0'; + + if (dev_read_pos_int(lib->mtd_eb_size, mtd_num, &mtd->eb_size)) + return -1; + if (dev_read_pos_ll(lib->mtd_size, mtd_num, &mtd->size)) + return -1; + if (dev_read_pos_int(lib->mtd_min_io_size, mtd_num, &mtd->min_io_size)) + return -1; + if (dev_read_pos_int(lib->mtd_subpage_size, mtd_num, &mtd->subpage_size)) + return -1; + if (dev_read_pos_int(lib->mtd_oob_size, mtd_num, &mtd->oob_size)) + return -1; + if (dev_read_pos_int(lib->mtd_region_cnt, mtd_num, &mtd->region_cnt)) + return -1; + if (dev_read_hex_int(lib->mtd_flags, mtd_num, &ret)) + return -1; + mtd->writable = !!(ret & MTD_WRITEABLE); + + mtd->eb_cnt = mtd->size / mtd->eb_size; + mtd->type = type_str2int(mtd->type_str); + mtd->bb_allowed = !!(mtd->type == MTD_NANDFLASH || + mtd->type == MTD_MLCNANDFLASH); + + return 0; +} + +int mtd_get_dev_info(libmtd_t desc, const char *node, struct mtd_dev_info *mtd) +{ + int mtd_num; + struct libmtd *lib = (struct libmtd *)desc; + + if (!lib->sysfs_supported) + return legacy_get_dev_info(node, mtd); + + if (dev_node2num(lib, node, &mtd_num)) + return -1; + + return mtd_get_dev_info1(desc, mtd_num, mtd); +} + +static inline int mtd_ioctl_error(const struct 
mtd_dev_info *mtd, int eb, + const char *sreq) +{ + return sys_errmsg("%s ioctl failed for eraseblock %d (mtd%d)", + sreq, eb, mtd->mtd_num); +} + +static int mtd_valid_erase_block(const struct mtd_dev_info *mtd, int eb) +{ + if (eb < 0 || eb >= mtd->eb_cnt) { + errmsg("bad eraseblock number %d, mtd%d has %d eraseblocks", + eb, mtd->mtd_num, mtd->eb_cnt); + errno = EINVAL; + return -1; + } + return 0; +} + +static int mtd_xlock(const struct mtd_dev_info *mtd, int fd, int eb, int req, + const char *sreq) +{ + int ret; + struct erase_info_user ei; + + ret = mtd_valid_erase_block(mtd, eb); + if (ret) + return ret; + + ei.start = eb * mtd->eb_size; + ei.length = mtd->eb_size; + + ret = ioctl(fd, req, &ei); + if (ret < 0) + return mtd_ioctl_error(mtd, eb, sreq); + + return 0; +} +#define mtd_xlock(mtd, fd, eb, req) mtd_xlock(mtd, fd, eb, req, #req) + +int mtd_lock(const struct mtd_dev_info *mtd, int fd, int eb) +{ + return mtd_xlock(mtd, fd, eb, MEMLOCK); +} + +int mtd_unlock(const struct mtd_dev_info *mtd, int fd, int eb) +{ + return mtd_xlock(mtd, fd, eb, MEMUNLOCK); +} + +int mtd_erase(libmtd_t desc, const struct mtd_dev_info *mtd, int fd, int eb) +{ + int ret; + struct libmtd *lib = (struct libmtd *)desc; + struct erase_info_user64 ei64; + struct erase_info_user ei; + + ret = mtd_valid_erase_block(mtd, eb); + if (ret) + return ret; + + ei64.start = (__u64)eb * mtd->eb_size; + ei64.length = mtd->eb_size; + + if (lib->offs64_ioctls == OFFS64_IOCTLS_SUPPORTED || + lib->offs64_ioctls == OFFS64_IOCTLS_UNKNOWN) { + ret = ioctl(fd, MEMERASE64, &ei64); + if (ret == 0) + return ret; + + if (errno != ENOTTY || + lib->offs64_ioctls != OFFS64_IOCTLS_UNKNOWN) + return mtd_ioctl_error(mtd, eb, "MEMERASE64"); + + /* + * MEMERASE64 support was added in kernel version 2.6.31, so + * probably we are working with older kernel and this ioctl is + * not supported. + */ + lib->offs64_ioctls = OFFS64_IOCTLS_NOT_SUPPORTED; + } + + if (ei64.start + ei64.length > 0xFFFFFFFF) { + errmsg("this system can address only %u eraseblocks", + 0xFFFFFFFFU / mtd->eb_size); + errno = EINVAL; + return -1; + } + + ei.start = ei64.start; + ei.length = ei64.length; + ret = ioctl(fd, MEMERASE, &ei); + if (ret < 0) + return mtd_ioctl_error(mtd, eb, "MEMERASE"); + return 0; +} + +int mtd_regioninfo(int fd, int regidx, struct region_info_user *reginfo) +{ + int ret; + + if (regidx < 0) { + errno = ENODEV; + return -1; + } + + reginfo->regionindex = regidx; + + ret = ioctl(fd, MEMGETREGIONINFO, reginfo); + if (ret < 0) + return sys_errmsg("%s ioctl failed for erase region %d", + "MEMGETREGIONINFO", regidx); + + return 0; +} + +int mtd_is_locked(const struct mtd_dev_info *mtd, int fd, int eb) +{ + int ret; + erase_info_t ei; + + ei.start = eb * mtd->eb_size; + ei.length = mtd->eb_size; + + ret = ioctl(fd, MEMISLOCKED, &ei); + if (ret < 0) { + if (errno != ENOTTY && errno != EOPNOTSUPP) + return mtd_ioctl_error(mtd, eb, "MEMISLOCKED"); + else + errno = EOPNOTSUPP; + } + + return ret; +} + +/* Patterns to write to a physical eraseblock when torturing it */ +static uint8_t patterns[] = {0xa5, 0x5a, 0x0}; + +/** + * check_pattern - check if buffer contains only a certain byte pattern. + * @buf: buffer to check + * @patt: the pattern to check + * @size: buffer size in bytes + * + * This function returns %1 in there are only @patt bytes in @buf, and %0 if + * something else was also found. 
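mtd_lock() and mtd_unlock() above share one helper through a self-shadowing macro: after the function body, #define mtd_xlock(mtd, fd, eb, req) mtd_xlock(mtd, fd, eb, req, #req) reroutes all later calls through the macro, which appends the stringized ioctl name for error reporting. The same trick in miniature (report() is a made-up example):

#include <stdio.h>

static void report(int code, const char *name)
{
	printf("request %s = %d\n", name, code);
}
/* Shadow the function: later calls gain a stringized argument. */
#define report(code) report(code, #code)

/* report(MEMLOCK) now expands to report(MEMLOCK, "MEMLOCK"). */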
+ */ +static int check_pattern(const void *buf, uint8_t patt, int size) +{ + int i; + + for (i = 0; i < size; i++) + if (((const uint8_t *)buf)[i] != patt) + return 0; + return 1; +} + +int mtd_torture(libmtd_t desc, const struct mtd_dev_info *mtd, int fd, int eb) +{ + int err, i, patt_count; + void *buf; + + normsg("run torture test for PEB %d", eb); + patt_count = ARRAY_SIZE(patterns); + + buf = xmalloc(mtd->eb_size); + + for (i = 0; i < patt_count; i++) { + err = mtd_erase(desc, mtd, fd, eb); + if (err) + goto out; + + /* Make sure the PEB contains only 0xFF bytes */ + err = mtd_read(mtd, fd, eb, 0, buf, mtd->eb_size); + if (err) + goto out; + + err = check_pattern(buf, 0xFF, mtd->eb_size); + if (err == 0) { + errmsg("erased PEB %d, but a non-0xFF byte found", eb); + errno = EIO; + goto out; + } + + /* Write a pattern and check it */ + memset(buf, patterns[i], mtd->eb_size); + err = mtd_write(desc, mtd, fd, eb, 0, buf, mtd->eb_size, NULL, + 0, 0); + if (err) + goto out; + + memset(buf, ~patterns[i], mtd->eb_size); + err = mtd_read(mtd, fd, eb, 0, buf, mtd->eb_size); + if (err) + goto out; + + err = check_pattern(buf, patterns[i], mtd->eb_size); + if (err == 0) { + errmsg("pattern %x checking failed for PEB %d", + patterns[i], eb); + errno = EIO; + goto out; + } + } + + normsg("PEB %d passed torture test, do not mark it a bad", eb); + +out: + free(buf); + return -1; +} + +int mtd_is_bad(const struct mtd_dev_info *mtd, int fd, int eb) +{ + int ret; + loff_t seek; + + ret = mtd_valid_erase_block(mtd, eb); + if (ret) + return ret; + + if (!mtd->bb_allowed) + return 0; + + seek = (loff_t)eb * mtd->eb_size; + ret = ioctl(fd, MEMGETBADBLOCK, &seek); + if (ret == -1) + return mtd_ioctl_error(mtd, eb, "MEMGETBADBLOCK"); + return ret; +} + +int mtd_mark_bad(const struct mtd_dev_info *mtd, int fd, int eb) +{ + int ret; + loff_t seek; + + if (!mtd->bb_allowed) { + errno = EINVAL; + return -1; + } + + ret = mtd_valid_erase_block(mtd, eb); + if (ret) + return ret; + + seek = (loff_t)eb * mtd->eb_size; + ret = ioctl(fd, MEMSETBADBLOCK, &seek); + if (ret == -1) + return mtd_ioctl_error(mtd, eb, "MEMSETBADBLOCK"); + return 0; +} + +int mtd_read(const struct mtd_dev_info *mtd, int fd, int eb, int offs, + void *buf, int len) +{ + int ret, rd = 0; + off_t seek; + + ret = mtd_valid_erase_block(mtd, eb); + if (ret) + return ret; + + if (offs < 0 || offs + len > mtd->eb_size) { + errmsg("bad offset %d or length %d, mtd%d eraseblock size is %d", + offs, len, mtd->mtd_num, mtd->eb_size); + errno = EINVAL; + return -1; + } + + /* Seek to the beginning of the eraseblock */ + seek = (off_t)eb * mtd->eb_size + offs; + if (lseek(fd, seek, SEEK_SET) != seek) + return sys_errmsg("cannot seek mtd%d to offset %"PRIdoff_t, + mtd->mtd_num, seek); + + while (rd < len) { + ret = read(fd, buf, len); + if (ret < 0) + return sys_errmsg("cannot read %d bytes from mtd%d (eraseblock %d, offset %d)", + len, mtd->mtd_num, eb, offs); + rd += ret; + } + + return 0; +} + +static int legacy_auto_oob_layout(const struct mtd_dev_info *mtd, int fd, + int ooblen, void *oob) { + struct nand_oobinfo old_oobinfo; + int start, len; + uint8_t *tmp_buf; + + /* Read the current oob info */ + if (ioctl(fd, MEMGETOOBSEL, &old_oobinfo)) + return sys_errmsg("MEMGETOOBSEL failed"); + + tmp_buf = malloc(ooblen); + memcpy(tmp_buf, oob, ooblen); + + /* + * We use autoplacement and have the oobinfo with the autoplacement + * information from the kernel available + */ + if (old_oobinfo.useecc == MTD_NANDECC_AUTOPLACE) { + int i, tags_pos = 0; + for (i = 0; 
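/*
 * Two details of mtd_torture() above are easy to miss: the read-back
 * buffer is pre-filled with the complement of the pattern before each
 * verify pass, so a read that silently leaves the buffer untouched
 * cannot pass check_pattern(); and the success path falls through to
 * the out label, so the function as written returns -1 even after
 * printing the "passed torture test" message - the return value alone
 * does not distinguish success from failure.
 */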
old_oobinfo.oobfree[i][1]; i++) { + /* Set the reserved bytes to 0xff */ + start = old_oobinfo.oobfree[i][0]; + len = old_oobinfo.oobfree[i][1]; + memcpy(oob + start, tmp_buf + tags_pos, len); + tags_pos += len; + } + } else { + /* Set at least the ecc byte positions to 0xff */ + start = old_oobinfo.eccbytes; + len = mtd->oob_size - start; + memcpy(oob + start, tmp_buf + start, len); + } + free(tmp_buf); + + return 0; +} + +int mtd_write(libmtd_t desc, const struct mtd_dev_info *mtd, int fd, int eb, + int offs, void *data, int len, void *oob, int ooblen, + uint8_t mode) +{ + int ret; + off_t seek; + struct mtd_write_req ops; + + ret = mtd_valid_erase_block(mtd, eb); + if (ret) + return ret; + + if (offs < 0 || offs + len > mtd->eb_size) { + errmsg("bad offset %d or length %d, mtd%d eraseblock size is %d", + offs, len, mtd->mtd_num, mtd->eb_size); + errno = EINVAL; + return -1; + } + if (offs % mtd->subpage_size) { + errmsg("write offset %d is not aligned to mtd%d min. I/O size %d", + offs, mtd->mtd_num, mtd->subpage_size); + errno = EINVAL; + return -1; + } + if (len % mtd->subpage_size) { + errmsg("write length %d is not aligned to mtd%d min. I/O size %d", + len, mtd->mtd_num, mtd->subpage_size); + errno = EINVAL; + return -1; + } + + /* Calculate seek address */ + seek = (off_t)eb * mtd->eb_size + offs; + + if (oob) { + ops.start = seek; + ops.len = len; + ops.ooblen = ooblen; + ops.usr_data = (uint64_t)(unsigned long)data; + ops.usr_oob = (uint64_t)(unsigned long)oob; + ops.mode = mode; + + ret = ioctl(fd, MEMWRITE, &ops); + if (ret == 0) + return 0; + else if (errno != ENOTTY && errno != EOPNOTSUPP) + return mtd_ioctl_error(mtd, eb, "MEMWRITE"); + + /* Fall back to old OOB ioctl() if necessary */ + if (mode == MTD_OPS_AUTO_OOB) + if (legacy_auto_oob_layout(mtd, fd, ooblen, oob)) + return -1; + if (mtd_write_oob(desc, mtd, fd, seek, ooblen, oob) < 0) + return sys_errmsg("cannot write to OOB"); + } + if (data) { + /* Seek to the beginning of the eraseblock */ + if (lseek(fd, seek, SEEK_SET) != seek) + return sys_errmsg("cannot seek mtd%d to offset %"PRIdoff_t, + mtd->mtd_num, seek); + ret = write(fd, data, len); + if (ret != len) + return sys_errmsg("cannot write %d bytes to mtd%d " + "(eraseblock %d, offset %d)", + len, mtd->mtd_num, eb, offs); + } + + return 0; +} + +static int do_oob_op(libmtd_t desc, const struct mtd_dev_info *mtd, int fd, + uint64_t start, uint64_t length, void *data, unsigned int cmd64, + unsigned int cmd) +{ + int ret, oob_offs; + struct mtd_oob_buf64 oob64; + struct mtd_oob_buf oob; + unsigned long long max_offs; + const char *cmd64_str, *cmd_str; + struct libmtd *lib = (struct libmtd *)desc; + + if (cmd64 == MEMREADOOB64) { + cmd64_str = "MEMREADOOB64"; + cmd_str = "MEMREADOOB"; + } else { + cmd64_str = "MEMWRITEOOB64"; + cmd_str = "MEMWRITEOOB"; + } + + max_offs = (unsigned long long)mtd->eb_cnt * mtd->eb_size; + if (start >= max_offs) { + errmsg("bad page address %" PRIu64 ", mtd%d has %d eraseblocks (%llu bytes)", + start, mtd->mtd_num, mtd->eb_cnt, max_offs); + errno = EINVAL; + return -1; + } + + oob_offs = start & (mtd->min_io_size - 1); + if (oob_offs + length > mtd->oob_size || length == 0) { + errmsg("Cannot write %" PRIu64 " OOB bytes to address %" PRIu64 " (OOB offset %u) - mtd%d OOB size is only %d bytes", + length, start, oob_offs, mtd->mtd_num, mtd->oob_size); + errno = EINVAL; + return -1; + } + + oob64.start = start; + oob64.length = length; + oob64.usr_ptr = (uint64_t)(unsigned long)data; + + if (lib->offs64_ioctls == OFFS64_IOCTLS_SUPPORTED || 
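/*
 * The (uint64_t)(unsigned long) double cast used for usr_data, usr_oob
 * and usr_ptr above goes through the pointer-sized integer first so
 * that, on 32-bit builds, the pointer is zero-extended rather than
 * sign-extended into the kernel's 64-bit field.
 */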
+ lib->offs64_ioctls == OFFS64_IOCTLS_UNKNOWN) { + ret = ioctl(fd, cmd64, &oob64); + if (ret == 0) + return ret; + + if (errno != ENOTTY || + lib->offs64_ioctls != OFFS64_IOCTLS_UNKNOWN) { + sys_errmsg("%s ioctl failed for mtd%d, offset %" PRIu64 " (eraseblock %" PRIu64 ")", + cmd64_str, mtd->mtd_num, start, start / mtd->eb_size); + } + + /* + * MEMREADOOB64/MEMWRITEOOB64 support was added in kernel + * version 2.6.31, so probably we are working with older kernel + * and these ioctls are not supported. + */ + lib->offs64_ioctls = OFFS64_IOCTLS_NOT_SUPPORTED; + } + + if (oob64.start > 0xFFFFFFFFULL) { + errmsg("this system can address only up to address %lu", + 0xFFFFFFFFUL); + errno = EINVAL; + return -1; + } + + oob.start = oob64.start; + oob.length = oob64.length; + oob.ptr = data; + + ret = ioctl(fd, cmd, &oob); + if (ret < 0) + sys_errmsg("%s ioctl failed for mtd%d, offset %" PRIu64 " (eraseblock %" PRIu64 ")", + cmd_str, mtd->mtd_num, start, start / mtd->eb_size); + return ret; +} + +int mtd_read_oob(libmtd_t desc, const struct mtd_dev_info *mtd, int fd, + uint64_t start, uint64_t length, void *data) +{ + return do_oob_op(desc, mtd, fd, start, length, data, + MEMREADOOB64, MEMREADOOB); +} + +int mtd_write_oob(libmtd_t desc, const struct mtd_dev_info *mtd, int fd, + uint64_t start, uint64_t length, void *data) +{ + return do_oob_op(desc, mtd, fd, start, length, data, + MEMWRITEOOB64, MEMWRITEOOB); +} + +int mtd_write_img(const struct mtd_dev_info *mtd, int fd, int eb, int offs, + const char *img_name) +{ + int tmp, ret, in_fd, len, written = 0; + off_t seek; + struct stat st; + char *buf; + + ret = mtd_valid_erase_block(mtd, eb); + if (ret) + return ret; + + if (offs < 0 || offs >= mtd->eb_size) { + errmsg("bad offset %d, mtd%d eraseblock size is %d", + offs, mtd->mtd_num, mtd->eb_size); + errno = EINVAL; + return -1; + } + if (offs % mtd->subpage_size) { + errmsg("write offset %d is not aligned to mtd%d min. I/O size %d", + offs, mtd->mtd_num, mtd->subpage_size); + errno = EINVAL; + return -1; + } + + in_fd = open(img_name, O_RDONLY | O_CLOEXEC); + if (in_fd == -1) + return sys_errmsg("cannot open \"%s\"", img_name); + + if (fstat(in_fd, &st)) { + sys_errmsg("cannot stat %s", img_name); + goto out_close; + } + + len = st.st_size; + if (len % mtd->subpage_size) { + errmsg("size of \"%s\" is %d byte, which is not aligned to " + "mtd%d min. 
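/*
 * Note that do_oob_op() above computes the within-page offset as
 * start & (mtd->min_io_size - 1), which assumes min_io_size is a
 * power of two - true for NAND page sizes, and cheaper than a modulo.
 */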
I/O size %d", img_name, len, mtd->mtd_num, + mtd->subpage_size); + errno = EINVAL; + goto out_close; + } + tmp = (offs + len + mtd->eb_size - 1) / mtd->eb_size; + if (eb + tmp > mtd->eb_cnt) { + errmsg("\"%s\" image size is %d bytes, mtd%d size is %d " + "eraseblocks, the image does not fit if we write it " + "starting from eraseblock %d, offset %d", + img_name, len, mtd->mtd_num, mtd->eb_cnt, eb, offs); + errno = EINVAL; + goto out_close; + } + + /* Seek to the beginning of the eraseblock */ + seek = (off_t)eb * mtd->eb_size + offs; + if (lseek(fd, seek, SEEK_SET) != seek) { + sys_errmsg("cannot seek mtd%d to offset %"PRIdoff_t, + mtd->mtd_num, seek); + goto out_close; + } + + buf = xmalloc(mtd->eb_size); + + while (written < len) { + int rd = 0; + + do { + ret = read(in_fd, buf, mtd->eb_size - offs - rd); + if (ret == -1) { + sys_errmsg("cannot read \"%s\"", img_name); + goto out_free; + } + rd += ret; + } while (ret && rd < mtd->eb_size - offs); + + ret = write(fd, buf, rd); + if (ret != rd) { + sys_errmsg("cannot write %d bytes to mtd%d (eraseblock %d, offset %d)", + len, mtd->mtd_num, eb, offs); + goto out_free; + } + + offs = 0; + eb += 1; + written += rd; + } + + free(buf); + close(in_fd); + return 0; + +out_free: + free(buf); +out_close: + close(in_fd); + return -1; +} + +int mtd_probe_node(libmtd_t desc, const char *node) +{ + struct stat st; + struct mtd_info info; + int i, mjr, mnr; + struct libmtd *lib = (struct libmtd *)desc; + + if (stat(node, &st)) + return sys_errmsg("cannot get information about \"%s\"", node); + + if (!S_ISCHR(st.st_mode)) { + errmsg("\"%s\" is not a character device", node); + errno = EINVAL; + return -1; + } + + mjr = major(st.st_rdev); + mnr = minor(st.st_rdev); + + if (mtd_get_info((libmtd_t *)lib, &info)) + return -1; + + if (!lib->sysfs_supported) + return 0; + + for (i = info.lowest_mtd_num; i <= info.highest_mtd_num; i++) { + int mjr1, mnr1, ret; + + ret = dev_get_major(lib, i, &mjr1, &mnr1); + if (ret) { + if (errno == ENOENT) + continue; + if (!errno) + break; + return -1; + } + + if (mjr1 == mjr && mnr1 == mnr) + return 1; + } + + errno = 0; + return -1; +} diff --git a/oslib/libmtd.h b/oslib/libmtd.h new file mode 100644 index 0000000..a0c90dc --- /dev/null +++ b/oslib/libmtd.h @@ -0,0 +1,357 @@ +/* + * Copyright (C) 2008, 2009 Nokia Corporation + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + * + * Author: Artem Bityutskiy + * + * MTD library. 
+ */ + +/* Imported from mtd-utils by dehrenberg */ + +#ifndef __LIBMTD_H__ +#define __LIBMTD_H__ + +#ifdef __cplusplus +extern "C" { +#endif + +// Needed for uint8_t, uint64_t +#include + +/* Maximum MTD device name length */ +#define MTD_NAME_MAX 127 +/* Maximum MTD device type string length */ +#define MTD_TYPE_MAX 64 + +/* MTD library descriptor */ +typedef void * libmtd_t; + +/* Forward decls */ +struct region_info_user; + +/** + * @mtd_dev_cnt: count of MTD devices in system + * @lowest_mtd_num: lowest MTD device number in system + * @highest_mtd_num: highest MTD device number in system + * @sysfs_supported: non-zero if sysfs is supported by MTD + */ +struct mtd_info +{ + int mtd_dev_cnt; + int lowest_mtd_num; + int highest_mtd_num; + unsigned int sysfs_supported:1; +}; + +/** + * struct mtd_dev_info - information about an MTD device. + * @mtd_num: MTD device number + * @major: major number of corresponding character device + * @minor: minor number of corresponding character device + * @type: flash type (constants like %MTD_NANDFLASH defined in mtd-abi.h) + * @type_str: static R/O flash type string + * @name: device name + * @size: device size in bytes + * @eb_cnt: count of eraseblocks + * @eb_size: eraseblock size + * @min_io_size: minimum input/output unit size + * @subpage_size: sub-page size + * @oob_size: OOB size (zero if the device does not have OOB area) + * @region_cnt: count of additional erase regions + * @writable: zero if the device is read-only + * @bb_allowed: non-zero if the MTD device may have bad eraseblocks + */ +struct mtd_dev_info +{ + int mtd_num; + int major; + int minor; + int type; + char type_str[MTD_TYPE_MAX + 1]; + char name[MTD_NAME_MAX + 1]; + long long size; + int eb_cnt; + int eb_size; + int min_io_size; + int subpage_size; + int oob_size; + int region_cnt; + unsigned int writable:1; + unsigned int bb_allowed:1; +}; + +/** + * libmtd_open - open MTD library. + * + * This function initializes and opens the MTD library and returns MTD library + * descriptor in case of success and %NULL in case of failure. In case of + * failure, errno contains zero if MTD is not present in the system, or + * contains the error code if a real error happened. + */ +libmtd_t libmtd_open(void); + +/** + * libmtd_close - close MTD library. + * @desc: MTD library descriptor + */ +void libmtd_close(libmtd_t desc); + +/** + * mtd_dev_present - check whether a MTD device is present. + * @desc: MTD library descriptor + * @mtd_num: MTD device number to check + * + * This function returns %1 if MTD device is present and %0 if not. + */ +int mtd_dev_present(libmtd_t desc, int mtd_num); + +/** + * mtd_get_info - get general MTD information. + * @desc: MTD library descriptor + * @info: the MTD device information is returned here + * + * This function fills the passed @info object with general MTD information and + * returns %0 in case of success and %-1 in case of failure. If MTD subsystem is + * not present in the system, errno is set to @ENODEV. + */ +int mtd_get_info(libmtd_t desc, struct mtd_info *info); + +/** + * mtd_get_dev_info - get information about an MTD device. + * @desc: MTD library descriptor + * @node: name of the MTD device node + * @mtd: the MTD device information is returned here + * + * This function gets information about MTD device defined by the @node device + * node file and saves this information in the @mtd object. Returns %0 in case + * of success and %-1 in case of failure. 
+ * If MTD subsystem is not present in the system, or the MTD device does not
+ * exist, errno is set to @ENODEV.
+ */
+int mtd_get_dev_info(libmtd_t desc, const char *node, struct mtd_dev_info *mtd);
+
+/**
+ * mtd_get_dev_info1 - get information about an MTD device.
+ * @desc: MTD library descriptor
+ * @mtd_num: MTD device number to fetch information about
+ * @mtd: the MTD device information is returned here
+ *
+ * This function is identical to 'mtd_get_dev_info()' except that it accepts
+ * an MTD device number, not an MTD character device.
+ */
+int mtd_get_dev_info1(libmtd_t desc, int mtd_num, struct mtd_dev_info *mtd);
+
+/**
+ * mtd_lock - lock eraseblocks.
+ * @mtd: MTD device description object
+ * @fd: MTD device node file descriptor
+ * @eb: eraseblock to lock
+ *
+ * This function locks eraseblock @eb. Returns %0 in case of success and %-1
+ * in case of failure.
+ */
+int mtd_lock(const struct mtd_dev_info *mtd, int fd, int eb);
+
+/**
+ * mtd_unlock - unlock eraseblocks.
+ * @mtd: MTD device description object
+ * @fd: MTD device node file descriptor
+ * @eb: eraseblock to unlock
+ *
+ * This function unlocks eraseblock @eb. Returns %0 in case of success and %-1
+ * in case of failure.
+ */
+int mtd_unlock(const struct mtd_dev_info *mtd, int fd, int eb);
+
+/**
+ * mtd_erase - erase an eraseblock.
+ * @desc: MTD library descriptor
+ * @mtd: MTD device description object
+ * @fd: MTD device node file descriptor
+ * @eb: eraseblock to erase
+ *
+ * This function erases eraseblock @eb of the MTD device described by @fd.
+ * Returns %0 in case of success and %-1 in case of failure.
+ */
+int mtd_erase(libmtd_t desc, const struct mtd_dev_info *mtd, int fd, int eb);
+
+/**
+ * mtd_regioninfo - get information about an erase region.
+ * @fd: MTD device node file descriptor
+ * @regidx: index of region to look up
+ * @reginfo: the region information is returned here
+ *
+ * This function gets information about an erase region defined by the
+ * @regidx index and saves this information in the @reginfo object.
+ * Returns %0 in case of success and %-1 in case of failure. If the
+ * @regidx is not valid or unavailable, errno is set to @ENODEV.
+ */
+int mtd_regioninfo(int fd, int regidx, struct region_info_user *reginfo);
+
+/**
+ * mtd_is_locked - see if the specified eraseblock is locked.
+ * @mtd: MTD device description object
+ * @fd: MTD device node file descriptor
+ * @eb: eraseblock to check
+ *
+ * This function checks to see if eraseblock @eb of the MTD device described
+ * by @fd is locked. Returns %0 if it is unlocked, %1 if it is locked, and
+ * %-1 in case of failure. If the ioctl is not supported (support was added in
+ * Linux kernel 2.6.36) or this particular device does not support it, errno is
+ * set to @ENOTSUPP.
+ */
+int mtd_is_locked(const struct mtd_dev_info *mtd, int fd, int eb);
+
+/**
+ * mtd_torture - torture an eraseblock.
+ * @desc: MTD library descriptor
+ * @mtd: MTD device description object
+ * @fd: MTD device node file descriptor
+ * @eb: eraseblock to torture
+ *
+ * This function tortures eraseblock @eb. Returns %0 in case of success and %-1
+ * in case of failure.
+ */
+int mtd_torture(libmtd_t desc, const struct mtd_dev_info *mtd, int fd, int eb);
+
+/**
+ * mtd_is_bad - check if an eraseblock is bad.
+ * @mtd: MTD device description object
+ * @fd: MTD device node file descriptor
+ * @eb: eraseblock to check
+ *
+ * This function checks if eraseblock @eb is bad.
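+ * The typical use is a bad-block scan before reading or writing; a sketch
+ * only, assuming @mtd and @fd were set up beforehand:
+ *
+ *	for (eb = 0; eb < mtd->eb_cnt; eb++)
+ *		if (mtd_is_bad(mtd, fd, eb) == 1)
+ *			printf("eraseblock %d is bad\n", eb);
+ *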
+ * Returns %0 if not, %1 if yes, and %-1 in case of failure.
+ */
+int mtd_is_bad(const struct mtd_dev_info *mtd, int fd, int eb);
+
+/**
+ * mtd_mark_bad - mark an eraseblock as bad.
+ * @mtd: MTD device description object
+ * @fd: MTD device node file descriptor
+ * @eb: eraseblock to mark as bad
+ *
+ * This function marks eraseblock @eb as bad. Returns %0 in case of success and
+ * %-1 in case of failure.
+ */
+int mtd_mark_bad(const struct mtd_dev_info *mtd, int fd, int eb);
+
+/**
+ * mtd_read - read data from an MTD device.
+ * @mtd: MTD device description object
+ * @fd: MTD device node file descriptor
+ * @eb: eraseblock to read from
+ * @offs: offset within the eraseblock to read from
+ * @buf: buffer to read data to
+ * @len: how many bytes to read
+ *
+ * This function reads @len bytes of data from eraseblock @eb and offset @offs
+ * of the MTD device defined by @mtd and stores the read data in buffer @buf.
+ * Returns %0 in case of success and %-1 in case of failure.
+ */
+int mtd_read(const struct mtd_dev_info *mtd, int fd, int eb, int offs,
+	     void *buf, int len);
+
+/**
+ * mtd_write - write data to an MTD device.
+ * @desc: MTD library descriptor
+ * @mtd: MTD device description object
+ * @fd: MTD device node file descriptor
+ * @eb: eraseblock to write to
+ * @offs: offset within the eraseblock to write to
+ * @data: data buffer to write
+ * @len: how many data bytes to write
+ * @oob: OOB buffer to write
+ * @ooblen: how many OOB bytes to write
+ * @mode: write mode (e.g., %MTD_OOB_PLACE, %MTD_OOB_RAW)
+ *
+ * This function writes @len bytes of data to eraseblock @eb and offset @offs
+ * of the MTD device defined by @mtd. Returns %0 in case of success and %-1 in
+ * case of failure.
+ *
+ * Can only write to a single page at a time if writing to OOB.
+ */
+int mtd_write(libmtd_t desc, const struct mtd_dev_info *mtd, int fd, int eb,
+	      int offs, void *data, int len, void *oob, int ooblen,
+	      uint8_t mode);
+
+/**
+ * mtd_read_oob - read out-of-band area.
+ * @desc: MTD library descriptor
+ * @mtd: MTD device description object
+ * @fd: MTD device node file descriptor
+ * @start: page-aligned start address
+ * @length: number of OOB bytes to read
+ * @data: read buffer
+ *
+ * This function reads @length OOB bytes starting from address @start on
+ * MTD device described by @fd. The address is specified as page byte offset
+ * from the beginning of the MTD device. This function returns %0 in case of
+ * success and %-1 in case of failure.
+ */
+int mtd_read_oob(libmtd_t desc, const struct mtd_dev_info *mtd, int fd,
+		 uint64_t start, uint64_t length, void *data);
+
+/**
+ * mtd_write_oob - write out-of-band area.
+ * @desc: MTD library descriptor
+ * @mtd: MTD device description object
+ * @fd: MTD device node file descriptor
+ * @start: page-aligned start address
+ * @length: number of OOB bytes to write
+ * @data: write buffer
+ *
+ * This function writes @length OOB bytes starting from address @start on
+ * MTD device described by @fd. The address is specified as page byte offset
+ * from the beginning of the MTD device. Returns %0 in case of success and %-1
+ * in case of failure.
+ */
+int mtd_write_oob(libmtd_t desc, const struct mtd_dev_info *mtd, int fd,
+		  uint64_t start, uint64_t length, void *data);
+
+/**
+ * mtd_write_img - write a file to an MTD device.
+ * @mtd: MTD device description object
+ * @fd: MTD device node file descriptor
+ * @eb: eraseblock to write to
+ * @offs: offset within the eraseblock to write to
+ * @img_name: the file to write
+ *
+ * This function writes the image @img_name to the MTD device defined by @mtd.
+ * @eb and @offs are the starting eraseblock and offset on the MTD device.
+ * Returns %0 in case of success and %-1 in case of failure.
+ */
+int mtd_write_img(const struct mtd_dev_info *mtd, int fd, int eb, int offs,
+		  const char *img_name);
+
+/**
+ * mtd_probe_node - test MTD node.
+ * @desc: MTD library descriptor
+ * @node: the node to test
+ *
+ * This function tests whether @node is an MTD device node and returns %1 if it
+ * is, and %-1 if it is not (errno is %ENODEV in this case) or if an error
+ * occurred.
+ */
+int mtd_probe_node(libmtd_t desc, const char *node);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* __LIBMTD_H__ */
diff --git a/oslib/libmtd_common.h b/oslib/libmtd_common.h
new file mode 100644
index 0000000..4ed9f0b
--- /dev/null
+++ b/oslib/libmtd_common.h
@@ -0,0 +1,173 @@
+/*
+ * Copyright (c) Artem Bityutskiy, 2007, 2008
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
+ * the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ */
+
+/* Imported from mtd-utils by dehrenberg */
+
+#ifndef __MTD_UTILS_COMMON_H__
+#define __MTD_UTILS_COMMON_H__
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#ifndef PROGRAM_NAME
+# error "You must define PROGRAM_NAME before including this header"
+#endif
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#ifndef MIN	/* some C lib headers define this for us */
+#define MIN(a, b) ((a) < (b) ? (a) : (b))
+#endif
+#ifndef MAX
+#define MAX(a, b) ((a) > (b) ? (a) : (b))
+#endif
+#define min(a, b) MIN(a, b) /* glue for linux kernel source */
+#define ARRAY_SIZE(a) (sizeof(a) / sizeof((a)[0]))
+
+#define ALIGN(x,a) __ALIGN_MASK(x,(__typeof__(x))(a)-1)
+#define __ALIGN_MASK(x,mask) (((x)+(mask))&~(mask))
+
+#define min_t(t,x,y) ({ \
+	__typeof__((x)) _x = (x); \
+	__typeof__((y)) _y = (y); \
+	(_x < _y) ? _x : _y; \
+})
+
+#define max_t(t,x,y) ({ \
+	__typeof__((x)) _x = (x); \
+	__typeof__((y)) _y = (y); \
+	(_x > _y) ? _x : _y; \
+})
+
+#ifndef O_CLOEXEC
+#define O_CLOEXEC 0
+#endif
+
+/* define a print format specifier for off_t */
+#ifdef __USE_FILE_OFFSET64
+#define PRIxoff_t PRIx64
+#define PRIdoff_t PRId64
+#else
+#define PRIxoff_t "l"PRIx32
+#define PRIdoff_t "l"PRId32
+#endif
+
+/* Verbose messages */
+#define bareverbose(verbose, fmt, ...) do { \
+	if (verbose) \
+		printf(fmt, ##__VA_ARGS__); \
+} while(0)
+#define verbose(verbose, fmt, ...) \
+	bareverbose(verbose, "%s: " fmt "\n", PROGRAM_NAME, ##__VA_ARGS__)
+
+/* Normal messages */
+#define normsg_cont(fmt, ...) do { \
+	printf("%s: " fmt, PROGRAM_NAME, ##__VA_ARGS__); \
+} while(0)
+#define normsg(fmt, ...)
do { \ + normsg_cont(fmt "\n", ##__VA_ARGS__); \ +} while(0) + +/* Error messages */ +#define errmsg(fmt, ...) ({ \ + fprintf(stderr, "%s: error!: " fmt "\n", PROGRAM_NAME, ##__VA_ARGS__); \ + -1; \ +}) +#define errmsg_die(fmt, ...) do { \ + exit(errmsg(fmt, ##__VA_ARGS__)); \ +} while(0) + +/* System error messages */ +#define sys_errmsg(fmt, ...) ({ \ + int _err = errno; \ + errmsg(fmt, ##__VA_ARGS__); \ + fprintf(stderr, "%*serror %d (%s)\n", (int)sizeof(PROGRAM_NAME) + 1,\ + "", _err, strerror(_err)); \ + -1; \ +}) +#define sys_errmsg_die(fmt, ...) do { \ + exit(sys_errmsg(fmt, ##__VA_ARGS__)); \ +} while(0) + +/* Warnings */ +#define warnmsg(fmt, ...) do { \ + fprintf(stderr, "%s: warning!: " fmt "\n", PROGRAM_NAME, ##__VA_ARGS__); \ +} while(0) + +static inline int is_power_of_2(unsigned long long n) +{ + return (n != 0 && ((n & (n - 1)) == 0)); +} + +/** + * simple_strtoX - convert a hex/dec/oct string into a number + * @snum: buffer to convert + * @error: set to 1 when buffer isn't fully consumed + * + * These functions are similar to the standard strtoX() functions, but they are + * a little bit easier to use if you want to convert full string of digits into + * the binary form. The typical usage: + * + * int error = 0; + * unsigned long num; + * + * num = simple_strtoul(str, &error); + * if (error || ... if needed, your check that num is not out of range ...) + * error_happened(); + */ +#define simple_strtoX(func, type) \ +static inline type simple_##func(const char *snum, int *error) \ +{ \ + char *endptr; \ + type ret = func(snum, &endptr, 0); \ + \ + if (error && (!*snum || *endptr)) { \ + errmsg("%s: unable to parse the number '%s'", #func, snum); \ + *error = 1; \ + } \ + \ + return ret; \ +} +simple_strtoX(strtol, long int) +simple_strtoX(strtoll, long long int) +simple_strtoX(strtoul, unsigned long int) +simple_strtoX(strtoull, unsigned long long int) + +/* Simple version-printing for utils */ +#define common_print_version() \ +do { \ + printf("%s %s\n", PROGRAM_NAME, VERSION); \ +} while (0) + +#include "libmtd_xalloc.h" + +#ifdef __cplusplus +} +#endif + +#endif /* !__MTD_UTILS_COMMON_H__ */ diff --git a/oslib/libmtd_int.h b/oslib/libmtd_int.h new file mode 100644 index 0000000..a08e574 --- /dev/null +++ b/oslib/libmtd_int.h @@ -0,0 +1,109 @@ +/* + * Copyright (c) International Business Machines Corp., 2006 + * Copyright (C) 2009 Nokia Corporation + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + * + * Author: Artem Bityutskiy + * + * MTD library. 
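+ *
+ * For orientation, the sysfs file patterns defined below expand to paths
+ * like the following (the device number is illustrative):
+ *
+ *	/sys/class/mtd/mtd0/erasesize
+ *	/sys/class/mtd/mtd0/subpagesize
+ *	/sys/class/mtd/mtd0/dev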
+ */ + +/* Imported from mtd-utils by dehrenberg */ + +#ifndef __LIBMTD_INT_H__ +#define __LIBMTD_INT_H__ + +#ifdef __cplusplus +extern "C" { +#endif + +#define PROGRAM_NAME "libmtd" + +#define SYSFS_MTD "class/mtd" +#define MTD_NAME_PATT "mtd%d" +#define MTD_DEV "dev" +#define MTD_NAME "name" +#define MTD_TYPE "type" +#define MTD_EB_SIZE "erasesize" +#define MTD_SIZE "size" +#define MTD_MIN_IO_SIZE "writesize" +#define MTD_SUBPAGE_SIZE "subpagesize" +#define MTD_OOB_SIZE "oobsize" +#define MTD_REGION_CNT "numeraseregions" +#define MTD_FLAGS "flags" + +#define OFFS64_IOCTLS_UNKNOWN 0 +#define OFFS64_IOCTLS_NOT_SUPPORTED 1 +#define OFFS64_IOCTLS_SUPPORTED 2 + +/** + * libmtd - MTD library description data structure. + * @sysfs_mtd: MTD directory in sysfs + * @mtd: MTD device sysfs directory pattern + * @mtd_dev: MTD device major/minor numbers file pattern + * @mtd_name: MTD device name file pattern + * @mtd_type: MTD device type file pattern + * @mtd_eb_size: MTD device eraseblock size file pattern + * @mtd_size: MTD device size file pattern + * @mtd_min_io_size: minimum I/O unit size file pattern + * @mtd_subpage_size: sub-page size file pattern + * @mtd_oob_size: MTD device OOB size file pattern + * @mtd_region_cnt: count of additional erase regions file pattern + * @mtd_flags: MTD device flags file pattern + * @sysfs_supported: non-zero if sysfs is supported by MTD + * @offs64_ioctls: %OFFS64_IOCTLS_SUPPORTED if 64-bit %MEMERASE64, + * %MEMREADOOB64, %MEMWRITEOOB64 MTD device ioctls are + * supported, %OFFS64_IOCTLS_NOT_SUPPORTED if not, and + * %OFFS64_IOCTLS_UNKNOWN if it is not known yet; + * + * Note, we cannot find out whether 64-bit ioctls are supported by MTD when we + * are initializing the library, because this requires an MTD device node. + * Indeed, we have to actually call the ioctl and check for %ENOTTY to find + * out whether it is supported or not. + * + * Thus, we leave %offs64_ioctls uninitialized in 'libmtd_open()', and + * initialize it later, when corresponding libmtd function is used, and when + * we actually have a device node and can invoke an ioctl command on it. + */ +struct libmtd +{ + char *sysfs_mtd; + char *mtd; + char *mtd_dev; + char *mtd_name; + char *mtd_type; + char *mtd_eb_size; + char *mtd_size; + char *mtd_min_io_size; + char *mtd_subpage_size; + char *mtd_oob_size; + char *mtd_region_cnt; + char *mtd_flags; + unsigned int sysfs_supported:1; + unsigned int offs64_ioctls:2; +}; + +int legacy_libmtd_open(void); +int legacy_dev_present(int mtd_num); +int legacy_mtd_get_info(struct mtd_info *info); +int legacy_get_dev_info(const char *node, struct mtd_dev_info *mtd); +int legacy_get_dev_info1(int dev_num, struct mtd_dev_info *mtd); + +#ifdef __cplusplus +} +#endif + +#endif /* !__LIBMTD_INT_H__ */ diff --git a/oslib/libmtd_legacy.c b/oslib/libmtd_legacy.c new file mode 100644 index 0000000..137e80a --- /dev/null +++ b/oslib/libmtd_legacy.c @@ -0,0 +1,384 @@ +/* + * Copyright (C) 2009 Nokia Corporation + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + * + * Author: Artem Bityutskiy + * + * This file is part of the MTD library. Implements pre-2.6.30 kernels support, + * where MTD did not have sysfs interface. The main limitation of the old + * kernels was that the sub-page size was not exported to user-space, so it was + * not possible to get sub-page size. + */ + +/* Imported from mtd-utils by dehrenberg */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "libmtd.h" +#include "libmtd_int.h" +#include "libmtd_common.h" + +#define MTD_PROC_FILE "/proc/mtd" +#define MTD_DEV_PATT "/dev/mtd%d" +#define MTD_DEV_MAJOR 90 + +#define PROC_MTD_FIRST "dev: size erasesize name\n" +#define PROC_MTD_FIRST_LEN (sizeof(PROC_MTD_FIRST) - 1) +#define PROC_MTD_MAX_LEN 4096 +#define PROC_MTD_PATT "mtd%d: %llx %x" + +/** + * struct proc_parse_info - /proc/mtd parsing information. + * @mtd_num: MTD device number + * @size: device size + * @eb_size: eraseblock size + * @name: device name + * @buf: contents of /proc/mtd + * @data_size: how much data was read into @buf + * @pos: next string in @buf to parse + */ +struct proc_parse_info +{ + int mtd_num; + long long size; + char name[MTD_NAME_MAX + 1]; + int eb_size; + char *buf; + int data_size; + char *next; +}; + +static int proc_parse_start(struct proc_parse_info *pi) +{ + int fd, ret; + + fd = open(MTD_PROC_FILE, O_RDONLY); + if (fd == -1) + return -1; + + pi->buf = xmalloc(PROC_MTD_MAX_LEN); + + ret = read(fd, pi->buf, PROC_MTD_MAX_LEN); + if (ret == -1) { + sys_errmsg("cannot read \"%s\"", MTD_PROC_FILE); + goto out_free; + } + + if (ret < PROC_MTD_FIRST_LEN || + memcmp(pi->buf, PROC_MTD_FIRST, PROC_MTD_FIRST_LEN)) { + errmsg("\"%s\" does not start with \"%s\"", MTD_PROC_FILE, + PROC_MTD_FIRST); + goto out_free; + } + + pi->data_size = ret; + pi->next = pi->buf + PROC_MTD_FIRST_LEN; + + close(fd); + return 0; + +out_free: + free(pi->buf); + close(fd); + return -1; +} + +static int proc_parse_next(struct proc_parse_info *pi) +{ + int ret, len, pos = pi->next - pi->buf; + char *p, *p1; + + if (pos >= pi->data_size) { + free(pi->buf); + return 0; + } + + ret = sscanf(pi->next, PROC_MTD_PATT, &pi->mtd_num, &pi->size, + &pi->eb_size); + if (ret != 3) + return errmsg("\"%s\" pattern not found", PROC_MTD_PATT); + + p = memchr(pi->next, '\"', pi->data_size - pos); + if (!p) + return errmsg("opening \" not found"); + p += 1; + pos = p - pi->buf; + if (pos >= pi->data_size) + return errmsg("opening \" not found"); + + p1 = memchr(p, '\"', pi->data_size - pos); + if (!p1) + return errmsg("closing \" not found"); + pos = p1 - pi->buf; + if (pos >= pi->data_size) + return errmsg("closing \" not found"); + + len = p1 - p; + if (len > MTD_NAME_MAX) + return errmsg("too long mtd%d device name", pi->mtd_num); + + memcpy(pi->name, p, len); + pi->name[len] = '\0'; + + if (p1[1] != '\n') + return errmsg("opening \"\n\" not found"); + pi->next = p1 + 2; + return 1; +} + +/** + * legacy_libmtd_open - legacy version of 'libmtd_open()'. + * + * This function is just checks that MTD is present in the system. Returns + * zero in case of success and %-1 in case of failure. In case of failure, + * errno contains zero if MTD is not present in the system, or contains the + * error code if a real error happened. 
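+ * For reference, the /proc/mtd layout parsed by this legacy path looks like
+ * the following (the header line corresponds to %PROC_MTD_FIRST, each entry
+ * to %PROC_MTD_PATT plus a quoted device name; the entries shown are
+ * illustrative):
+ *
+ *	dev: size erasesize name
+ *	mtd0: 00100000 00020000 "bootloader"
+ *	mtd1: 07f00000 00020000 "rootfs"
+ *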
+ * (This mirrors the 'libmtd_open()' return conventions.)
+ */
+int legacy_libmtd_open(void)
+{
+	int fd;
+
+	fd = open(MTD_PROC_FILE, O_RDONLY);
+	if (fd == -1) {
+		if (errno == ENOENT)
+			errno = 0;
+		return -1;
+	}
+
+	close(fd);
+	return 0;
+}
+
+/**
+ * legacy_dev_present - legacy version of 'mtd_dev_present()'.
+ * @mtd_num: MTD device number to check
+ *
+ * When the kernel does not provide sysfs files for the MTD subsystem,
+ * fall back to parsing the /proc/mtd file to determine whether an mtd device
+ * number @mtd_num is present.
+ */
+int legacy_dev_present(int mtd_num)
+{
+	int ret;
+	struct proc_parse_info pi;
+
+	ret = proc_parse_start(&pi);
+	if (ret)
+		return -1;
+
+	while (proc_parse_next(&pi)) {
+		if (pi.mtd_num == mtd_num)
+			return 1;
+	}
+
+	return 0;
+}
+
+/**
+ * legacy_mtd_get_info - legacy version of 'mtd_get_info()'.
+ * @info: the MTD device information is returned here
+ *
+ * This function is similar to 'mtd_get_info()' and has the same conventions.
+ */
+int legacy_mtd_get_info(struct mtd_info *info)
+{
+	int ret;
+	struct proc_parse_info pi;
+
+	ret = proc_parse_start(&pi);
+	if (ret)
+		return -1;
+
+	info->lowest_mtd_num = INT_MAX;
+	while (proc_parse_next(&pi)) {
+		info->mtd_dev_cnt += 1;
+		if (pi.mtd_num > info->highest_mtd_num)
+			info->highest_mtd_num = pi.mtd_num;
+		if (pi.mtd_num < info->lowest_mtd_num)
+			info->lowest_mtd_num = pi.mtd_num;
+	}
+
+	return 0;
+}
+
+/**
+ * legacy_get_dev_info - legacy version of 'mtd_get_dev_info()'.
+ * @node: name of the MTD device node
+ * @mtd: the MTD device information is returned here
+ *
+ * This function is similar to 'mtd_get_dev_info()' and has the same
+ * conventions.
+ */
+int legacy_get_dev_info(const char *node, struct mtd_dev_info *mtd)
+{
+	struct stat st;
+	struct mtd_info_user ui;
+	int fd, ret;
+	loff_t offs = 0;
+	struct proc_parse_info pi;
+
+	if (stat(node, &st)) {
+		sys_errmsg("cannot open \"%s\"", node);
+		if (errno == ENOENT)
+			normsg("MTD subsystem is old and does not support "
+			       "sysfs, so MTD character device nodes have "
+			       "to exist");
+	}
+
+	if (!S_ISCHR(st.st_mode)) {
+		errno = EINVAL;
+		return errmsg("\"%s\" is not a character device", node);
+	}
+
+	memset(mtd, '\0', sizeof(struct mtd_dev_info));
+	mtd->major = major(st.st_rdev);
+	mtd->minor = minor(st.st_rdev);
+
+	if (mtd->major != MTD_DEV_MAJOR) {
+		errno = EINVAL;
+		return errmsg("\"%s\" has major number %d, MTD devices have "
+			      "major %d", node, mtd->major, MTD_DEV_MAJOR);
+	}
+
+	mtd->mtd_num = mtd->minor / 2;
+
+	fd = open(node, O_RDONLY);
+	if (fd == -1)
+		return sys_errmsg("cannot open \"%s\"", node);
+
+	if (ioctl(fd, MEMGETINFO, &ui)) {
+		sys_errmsg("MEMGETINFO ioctl request failed");
+		goto out_close;
+	}
+
+	ret = ioctl(fd, MEMGETBADBLOCK, &offs);
+	if (ret == -1) {
+		if (errno != EOPNOTSUPP) {
+			sys_errmsg("MEMGETBADBLOCK ioctl failed");
+			goto out_close;
+		}
+		errno = 0;
+		mtd->bb_allowed = 0;
+	} else
+		mtd->bb_allowed = 1;
+
+	mtd->type = ui.type;
+	mtd->size = ui.size;
+	mtd->eb_size = ui.erasesize;
+	mtd->min_io_size = ui.writesize;
+	mtd->oob_size = ui.oobsize;
+
+	if (mtd->min_io_size <= 0) {
+		errmsg("mtd%d (%s) has insane min.
I/O unit size %d", + mtd->mtd_num, node, mtd->min_io_size); + goto out_close; + } + if (mtd->eb_size <= 0 || mtd->eb_size < mtd->min_io_size) { + errmsg("mtd%d (%s) has insane eraseblock size %d", + mtd->mtd_num, node, mtd->eb_size); + goto out_close; + } + if (mtd->size <= 0 || mtd->size < mtd->eb_size) { + errmsg("mtd%d (%s) has insane size %lld", + mtd->mtd_num, node, mtd->size); + goto out_close; + } + mtd->eb_cnt = mtd->size / mtd->eb_size; + + switch(mtd->type) { + case MTD_ABSENT: + errmsg("mtd%d (%s) is removable and is not present", + mtd->mtd_num, node); + goto out_close; + case MTD_RAM: + strcpy((char *)mtd->type_str, "ram"); + break; + case MTD_ROM: + strcpy((char *)mtd->type_str, "rom"); + break; + case MTD_NORFLASH: + strcpy((char *)mtd->type_str, "nor"); + break; + case MTD_NANDFLASH: + strcpy((char *)mtd->type_str, "nand"); + break; + case MTD_MLCNANDFLASH: + strcpy((char *)mtd->type_str, "mlc-nand"); + break; + case MTD_DATAFLASH: + strcpy((char *)mtd->type_str, "dataflash"); + break; + case MTD_UBIVOLUME: + strcpy((char *)mtd->type_str, "ubi"); + break; + default: + goto out_close; + } + + if (ui.flags & MTD_WRITEABLE) + mtd->writable = 1; + mtd->subpage_size = mtd->min_io_size; + + close(fd); + + /* + * Unfortunately, the device name is not available via ioctl, and + * we have to parse /proc/mtd to get it. + */ + ret = proc_parse_start(&pi); + if (ret) + return -1; + + while (proc_parse_next(&pi)) { + if (pi.mtd_num == mtd->mtd_num) { + strcpy((char *)mtd->name, pi.name); + return 0; + } + } + + errmsg("mtd%d not found in \"%s\"", mtd->mtd_num, MTD_PROC_FILE); + errno = ENOENT; + return -1; + +out_close: + close(fd); + return -1; +} + +/** + * legacy_get_dev_info1 - legacy version of 'mtd_get_dev_info1()'. + * @node: name of the MTD device node + * @mtd: the MTD device information is returned here + * + * This function is similar to 'mtd_get_dev_info1()' and has the same + * conventions. + */ +int legacy_get_dev_info1(int mtd_num, struct mtd_dev_info *mtd) +{ + char node[sizeof(MTD_DEV_PATT) + 20]; + + sprintf(node, MTD_DEV_PATT, mtd_num); + return legacy_get_dev_info(node, mtd); +} diff --git a/oslib/libmtd_xalloc.h b/oslib/libmtd_xalloc.h new file mode 100644 index 0000000..6ac595a --- /dev/null +++ b/oslib/libmtd_xalloc.h @@ -0,0 +1,106 @@ +/* + * memory wrappers + * + * Copyright (c) Artem Bityutskiy, 2007, 2008 + * Copyright 2001, 2002 Red Hat, Inc. + * 2001 David A. Schleef + * 2002 Axis Communications AB + * 2001, 2002 Erik Andersen + * 2004 University of Szeged, Hungary + * 2006 KaiGai Kohei + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + */ + +#ifndef __MTD_UTILS_XALLOC_H__ +#define __MTD_UTILS_XALLOC_H__ + +#include +#include +#include + +/* + * Mark these functions as unused so that gcc does not emit warnings + * when people include this header but don't use every function. 
+ */ + +__attribute__((unused)) +static void *xmalloc(size_t size) +{ + void *ptr = malloc(size); + + if (ptr == NULL && size != 0) + sys_errmsg_die("out of memory"); + return ptr; +} + +__attribute__((unused)) +static void *xcalloc(size_t nmemb, size_t size) +{ + void *ptr = calloc(nmemb, size); + + if (ptr == NULL && nmemb != 0 && size != 0) + sys_errmsg_die("out of memory"); + return ptr; +} + +__attribute__((unused)) +static void *xzalloc(size_t size) +{ + return xcalloc(1, size); +} + +__attribute__((unused)) +static void *xrealloc(void *ptr, size_t size) +{ + ptr = realloc(ptr, size); + if (ptr == NULL && size != 0) + sys_errmsg_die("out of memory"); + return ptr; +} + +__attribute__((unused)) +static char *xstrdup(const char *s) +{ + char *t; + + if (s == NULL) + return NULL; + t = strdup(s); + if (t == NULL) + sys_errmsg_die("out of memory"); + return t; +} + +#ifdef _GNU_SOURCE + +__attribute__((unused)) +static int xasprintf(char **strp, const char *fmt, ...) +{ + int cnt; + va_list ap; + + va_start(ap, fmt); + cnt = vasprintf(strp, fmt, ap); + va_end(ap); + + if (cnt == -1) + sys_errmsg_die("out of memory"); + + return cnt; +} +#endif + +#endif /* !__MTD_UTILS_XALLOC_H__ */ diff --git a/oslib/linux-dev-lookup.c b/oslib/linux-dev-lookup.c new file mode 100644 index 0000000..1dda93f --- /dev/null +++ b/oslib/linux-dev-lookup.c @@ -0,0 +1,67 @@ +#include +#include +#include +#include +#include +#include +#include + +#include "linux-dev-lookup.h" + +int blktrace_lookup_device(const char *redirect, char *path, unsigned int maj, + unsigned int min) +{ + struct dirent *dir; + struct stat st; + int found = 0; + DIR *D; + + D = opendir(path); + if (!D) + return 0; + + while ((dir = readdir(D)) != NULL) { + char full_path[257]; + + if (!strcmp(dir->d_name, ".") || !strcmp(dir->d_name, "..")) + continue; + + sprintf(full_path, "%s/%s", path, dir->d_name); + if (lstat(full_path, &st) == -1) { + perror("lstat"); + break; + } + + if (S_ISDIR(st.st_mode)) { + found = blktrace_lookup_device(redirect, full_path, + maj, min); + if (found) { + strcpy(path, full_path); + break; + } + } + + if (!S_ISBLK(st.st_mode)) + continue; + + /* + * If replay_redirect is set then always return this device + * upon lookup which overrides the device lookup based on + * major minor in the actual blktrace + */ + if (redirect) { + strcpy(path, redirect); + found = 1; + break; + } + + if (maj == major(st.st_rdev) && min == minor(st.st_rdev)) { + strcpy(path, full_path); + found = 1; + break; + } + } + + closedir(D); + return found; +} diff --git a/oslib/linux-dev-lookup.h b/oslib/linux-dev-lookup.h new file mode 100644 index 0000000..144f33a --- /dev/null +++ b/oslib/linux-dev-lookup.h @@ -0,0 +1,7 @@ +#ifndef LINUX_DEV_LOOKUP +#define LINUX_DEV_LOOKUP + +int blktrace_lookup_device(const char *redirect, char *path, unsigned int maj, + unsigned int min); + +#endif diff --git a/oslib/statx.c b/oslib/statx.c new file mode 100644 index 0000000..1ca81ad --- /dev/null +++ b/oslib/statx.c @@ -0,0 +1,23 @@ +#ifndef CONFIG_HAVE_STATX +#include "statx.h" + +#ifdef CONFIG_HAVE_STATX_SYSCALL +#include +#include + +int statx(int dfd, const char *pathname, int flags, unsigned int mask, + struct statx *buffer) +{ + return syscall(__NR_statx, dfd, pathname, flags, mask, buffer); +} +#else +#include + +int statx(int dfd, const char *pathname, int flags, unsigned int mask, + struct statx *buffer) +{ + errno = EINVAL; + return -1; +} +#endif +#endif diff --git a/oslib/statx.h b/oslib/statx.h new file mode 100644 index 0000000..d9758f7 
--- /dev/null +++ b/oslib/statx.h @@ -0,0 +1,14 @@ +#ifndef CONFIG_HAVE_STATX +#ifdef CONFIG_HAVE_STATX_SYSCALL +#include +#include +#else +#define STATX_ALL 0 +#undef statx +struct statx +{ +}; +#endif +int statx(int dfd, const char *pathname, int flags, unsigned int mask, + struct statx *buffer); +#endif diff --git a/oslib/strcasestr.c b/oslib/strcasestr.c new file mode 100644 index 0000000..5fa05fa --- /dev/null +++ b/oslib/strcasestr.c @@ -0,0 +1,30 @@ +#ifndef CONFIG_STRCASESTR + +#include +#include +#include "strcasestr.h" + +char *strcasestr(const char *s1, const char *s2) +{ + const char *s = s1; + const char *p = s2; + + do { + if (!*p) + return (char *) s1; + if ((*p == *s) || + (tolower(*p) == tolower(*s))) { + ++p; + ++s; + } else { + p = s2; + if (!*s) + return NULL; + s = ++s1; + } + } while (1); + + return *p ? NULL : (char *) s1; +} + +#endif diff --git a/oslib/strcasestr.h b/oslib/strcasestr.h new file mode 100644 index 0000000..f13e929 --- /dev/null +++ b/oslib/strcasestr.h @@ -0,0 +1,10 @@ +#ifndef CONFIG_STRCASESTR + +#ifndef FIO_STRCASESTR_H +#define FIO_STRCASESTR_H + +char *strcasestr(const char *haystack, const char *needle); + +#endif + +#endif diff --git a/oslib/strlcat.c b/oslib/strlcat.c new file mode 100644 index 0000000..3e86eeb --- /dev/null +++ b/oslib/strlcat.c @@ -0,0 +1,57 @@ +#ifndef CONFIG_STRLCAT +/* + * Copyright (c) 1998, 2015 Todd C. Miller + * + * Permission to use, copy, modify, and distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + */ + +#include +#include +#include "strlcat.h" + +/* + * Appends src to string dst of size dsize (unlike strncat, dsize is the + * full size of dst, not space left). At most dsize-1 characters + * will be copied. Always NUL terminates (unless dsize <= strlen(dst)). + * Returns strlen(src) + MIN(dsize, strlen(initial dst)). + * If retval >= dsize, truncation occurred. + */ +size_t +strlcat(char *dst, const char *src, size_t dsize) +{ + const char *odst = dst; + const char *osrc = src; + size_t n = dsize; + size_t dlen; + + /* Find the end of dst and adjust bytes left but don't go past end. 
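+	 * As a usage note (per the contract documented above), callers can
+	 * detect truncation like this:
+	 *
+	 *	if (strlcat(buf, name, sizeof(buf)) >= sizeof(buf))
+	 *		... the result was truncated ...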
+	 */
+	while (n-- != 0 && *dst != '\0')
+		dst++;
+	dlen = dst - odst;
+	n = dsize - dlen;
+
+	if (n-- == 0)
+		return(dlen + strlen(src));
+	while (*src != '\0') {
+		if (n != 0) {
+			*dst++ = *src;
+			n--;
+		}
+		src++;
+	}
+	*dst = '\0';
+
+	return(dlen + (src - osrc));	/* count does not include NUL */
+}
+
+#endif
diff --git a/oslib/strlcat.h b/oslib/strlcat.h
new file mode 100644
index 0000000..85e4bda
--- /dev/null
+++ b/oslib/strlcat.h
@@ -0,0 +1,12 @@
+#ifndef CONFIG_STRLCAT
+
+#ifndef FIO_STRLCAT_H
+#define FIO_STRLCAT_H
+
+#include
+
+size_t strlcat(char *dst, const char *src, size_t dsize);
+
+#endif
+
+#endif
diff --git a/oslib/strndup.c b/oslib/strndup.c
new file mode 100644
index 0000000..657904a
--- /dev/null
+++ b/oslib/strndup.c
@@ -0,0 +1,19 @@
+#ifndef CONFIG_HAVE_STRNDUP
+
+#include
+#include
+#include "strndup.h"
+
+char *strndup(const char *s, size_t n)
+{
+	char *str = malloc(n + 1);
+
+	if (str) {
+		strncpy(str, s, n);
+		str[n] = '\0';
+	}
+
+	return str;
+}
+
+#endif
diff --git a/oslib/strndup.h b/oslib/strndup.h
new file mode 100644
index 0000000..2f41848
--- /dev/null
+++ b/oslib/strndup.h
@@ -0,0 +1,12 @@
+#ifndef CONFIG_HAVE_STRNDUP
+
+#ifndef FIO_STRNDUP_LIB_H
+#define FIO_STRNDUP_LIB_H
+
+#include
+
+char *strndup(const char *s, size_t n);
+
+#endif
+
+#endif
diff --git a/oslib/strsep.c b/oslib/strsep.c
new file mode 100644
index 0000000..2d42ca0
--- /dev/null
+++ b/oslib/strsep.c
@@ -0,0 +1,34 @@
+#ifndef CONFIG_STRSEP
+
+#include
+#include "strsep.h"
+
+char *strsep(char **stringp, const char *delim)
+{
+	char *s, *tok;
+	const char *spanp;
+	int c, sc;
+
+	s = *stringp;
+	if (!s)
+		return NULL;
+
+	tok = s;
+	do {
+		c = *s++;
+		spanp = delim;
+		do {
+			sc = *spanp++;
+			if (sc == c) {
+				if (c == 0)
+					s = NULL;
+				else
+					s[-1] = 0;
+				*stringp = s;
+				return tok;
+			}
+		} while (sc != 0);
+	} while (1);
+}
+
+#endif
diff --git a/oslib/strsep.h b/oslib/strsep.h
new file mode 100644
index 0000000..8cd9ada
--- /dev/null
+++ b/oslib/strsep.h
@@ -0,0 +1,10 @@
+#ifndef CONFIG_STRSEP
+
+#ifndef FIO_STRSEP_LIB_H
+#define FIO_STRSEP_LIB_H
+
+char *strsep(char **, const char *);
+
+#endif
+
+#endif
diff --git a/parse.c b/parse.c
new file mode 100644
index 0000000..04b2e19
--- /dev/null
+++ b/parse.c
@@ -0,0 +1,1474 @@
+/*
+ * This file contains the ini and command line parser main code.
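+ *
+ * Both entry styles funnel into the same option table. As an illustrative
+ * example (the option names live in fio's option table, not in this file),
+ * an ini-style job file fragment such as
+ *
+ *	[job1]
+ *	bs=4k
+ *	rw=randread
+ *
+ * is handed to parse_option() one "name=value" line at a time, while
+ * command-line options arrive through parse_cmd_option() with the name and
+ * value already split.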
+ */ +#include +#include +#include +#include +#include +#include +#include + +#include "compiler/compiler.h" +#include "parse.h" +#include "debug.h" +#include "log.h" +#include "options.h" +#include "optgroup.h" +#include "minmax.h" +#include "lib/ieee754.h" +#include "lib/pow2.h" + +#ifdef CONFIG_ARITHMETIC +#include "y.tab.h" +#endif + +static const char *opt_type_names[] = { + "OPT_INVALID", + "OPT_STR", + "OPT_STR_ULL", + "OPT_STR_MULTI", + "OPT_STR_VAL", + "OPT_STR_VAL_TIME", + "OPT_STR_STORE", + "OPT_RANGE", + "OPT_INT", + "OPT_ULL", + "OPT_BOOL", + "OPT_FLOAT_LIST", + "OPT_STR_SET", + "OPT_DEPRECATED", + "OPT_SOFT_DEPRECATED", + "OPT_UNSUPPORTED", +}; + +static const struct fio_option *__fio_options; + +static int vp_cmp(const void *p1, const void *p2) +{ + const struct value_pair *vp1 = p1; + const struct value_pair *vp2 = p2; + + return strlen(vp2->ival) - strlen(vp1->ival); +} + +static void posval_sort(const struct fio_option *o, struct value_pair *vpmap) +{ + const struct value_pair *vp; + int entries; + + memset(vpmap, 0, PARSE_MAX_VP * sizeof(struct value_pair)); + + for (entries = 0; entries < PARSE_MAX_VP; entries++) { + vp = &o->posval[entries]; + if (!vp->ival || vp->ival[0] == '\0') + break; + + memcpy(&vpmap[entries], vp, sizeof(*vp)); + } + + qsort(vpmap, entries, sizeof(struct value_pair), vp_cmp); +} + +static void show_option_range(const struct fio_option *o, + ssize_t (*logger)(const char *format, ...)) +{ + if (o->type == FIO_OPT_FLOAT_LIST) { + const char *sep = ""; + if (!o->minfp && !o->maxfp) + return; + + logger("%20s: ", "range"); + if (o->minfp != DBL_MIN) { + logger("min=%f", o->minfp); + sep = ", "; + } + if (o->maxfp != DBL_MAX) + logger("%smax=%f", sep, o->maxfp); + logger("\n"); + } else if (!o->posval[0].ival) { + if (!o->minval && !o->maxval) + return; + + logger("%20s: min=%d", "range", o->minval); + if (o->maxval) + logger(", max=%d", o->maxval); + logger("\n"); + } +} + +static void show_option_values(const struct fio_option *o) +{ + int i; + + for (i = 0; i < PARSE_MAX_VP; i++) { + const struct value_pair *vp = &o->posval[i]; + + if (!vp->ival) + continue; + + log_info("%20s: %-10s", i == 0 ? "valid values" : "", vp->ival); + if (vp->help) + log_info(" %s", vp->help); + log_info("\n"); + } + + if (i) + log_info("\n"); +} + +static void show_option_help(const struct fio_option *o, int is_err) +{ + const char *typehelp[] = { + [FIO_OPT_INVALID] = "invalid", + [FIO_OPT_STR] = "string (opt=bla)", + [FIO_OPT_STR_ULL] = "string (opt=bla)", + [FIO_OPT_STR_MULTI] = "string with possible k/m/g postfix (opt=4k)", + [FIO_OPT_STR_VAL] = "string (opt=bla)", + [FIO_OPT_STR_VAL_TIME] = "string with time postfix (opt=10s)", + [FIO_OPT_STR_STORE] = "string (opt=bla)", + [FIO_OPT_RANGE] = "one to three ranges (opt=1k-4k[,4k-8k[,1k-8k]])", + [FIO_OPT_INT] = "integer value (opt=100)", + [FIO_OPT_ULL] = "integer value (opt=100)", + [FIO_OPT_BOOL] = "boolean value (opt=1)", + [FIO_OPT_FLOAT_LIST] = "list of floating point values separated by ':' (opt=5.9:7.8)", + [FIO_OPT_STR_SET] = "empty or boolean value ([0|1])", + [FIO_OPT_DEPRECATED] = "deprecated", + [FIO_OPT_SOFT_DEPRECATED] = "deprecated", + [FIO_OPT_UNSUPPORTED] = "unsupported", + }; + ssize_t (*logger)(const char *format, ...); + + if (is_err) + logger = log_err; + else + logger = log_info; + + if (o->alias) + logger("%20s: %s\n", "alias", o->alias); + + logger("%20s: %s\n", "type", typehelp[o->type]); + logger("%20s: %s\n", "default", o->def ? 
o->def : "no default"); + if (o->prof_name) + logger("%20s: only for profile '%s'\n", "valid", o->prof_name); + show_option_range(o, logger); + show_option_values(o); +} + +static unsigned long long get_mult_time(const char *str, int len, + int is_seconds) +{ + const char *p = str; + char *c; + unsigned long long mult = 1; + int i; + + /* + * Go forward until we hit a non-digit, or +/- sign + */ + while ((p - str) <= len) { + if (!isdigit((int) *p) && (*p != '+') && (*p != '-')) + break; + p++; + } + + if (!isalpha((int) *p)) { + if (is_seconds) + return 1000000UL; + else + return 1; + } + + c = strdup(p); + for (i = 0; i < strlen(c); i++) + c[i] = tolower((unsigned char)c[i]); + + if (!strncmp("us", c, 2) || !strncmp("usec", c, 4)) + mult = 1; + else if (!strncmp("ms", c, 2) || !strncmp("msec", c, 4)) + mult = 1000; + else if (!strcmp("s", c)) + mult = 1000000; + else if (!strcmp("m", c)) + mult = 60 * 1000000UL; + else if (!strcmp("h", c)) + mult = 60 * 60 * 1000000UL; + else if (!strcmp("d", c)) + mult = 24 * 60 * 60 * 1000000ULL; + + free(c); + return mult; +} + +static int is_separator(char c) +{ + switch (c) { + case ':': + case '-': + case ',': + case '/': + return 1; + default: + return 0; + } +} + +static unsigned long long __get_mult_bytes(const char *p, void *data, + int *percent) +{ + unsigned int kb_base = fio_get_kb_base(data); + unsigned long long ret = 1; + unsigned int i, pow = 0, mult = kb_base; + char *c; + + if (!p) + return 1; + + c = strdup(p); + + for (i = 0; i < strlen(c); i++) { + c[i] = tolower((unsigned char)c[i]); + if (is_separator(c[i])) { + c[i] = '\0'; + break; + } + } + + /* If kb_base is 1000, use true units. + * If kb_base is 1024, use opposite units. + */ + if (!strncmp("pib", c, 3)) { + pow = 5; + if (kb_base == 1000) + mult = 1024; + else if (kb_base == 1024) + mult = 1000; + } else if (!strncmp("tib", c, 3)) { + pow = 4; + if (kb_base == 1000) + mult = 1024; + else if (kb_base == 1024) + mult = 1000; + } else if (!strncmp("gib", c, 3)) { + pow = 3; + if (kb_base == 1000) + mult = 1024; + else if (kb_base == 1024) + mult = 1000; + } else if (!strncmp("mib", c, 3)) { + pow = 2; + if (kb_base == 1000) + mult = 1024; + else if (kb_base == 1024) + mult = 1000; + } else if (!strncmp("kib", c, 3)) { + pow = 1; + if (kb_base == 1000) + mult = 1024; + else if (kb_base == 1024) + mult = 1000; + } else if (!strncmp("p", c, 1) || !strncmp("pb", c, 2)) { + pow = 5; + } else if (!strncmp("t", c, 1) || !strncmp("tb", c, 2)) { + pow = 4; + } else if (!strncmp("g", c, 1) || !strncmp("gb", c, 2)) { + pow = 3; + } else if (!strncmp("m", c, 1) || !strncmp("mb", c, 2)) { + pow = 2; + } else if (!strncmp("k", c, 1) || !strncmp("kb", c, 2)) { + pow = 1; + } else if (!strncmp("%", c, 1)) { + *percent = 1; + free(c); + return ret; + } + + while (pow--) + ret *= (unsigned long long) mult; + + free(c); + return ret; +} + +static unsigned long long get_mult_bytes(const char *str, int len, void *data, + int *percent) +{ + const char *p = str; + int digit_seen = 0; + + if (len < 2) + return __get_mult_bytes(str, data, percent); + + /* + * Go forward until we hit a non-digit, or +/- sign + */ + while ((p - str) <= len) { + if (!isdigit((int) *p) && + (((*p != '+') && (*p != '-')) || digit_seen)) + break; + digit_seen |= isdigit((int) *p); + p++; + } + + if (!isalpha((int) *p) && (*p != '%')) + p = NULL; + + return __get_mult_bytes(p, data, percent); +} + +extern int evaluate_arithmetic_expression(const char *buffer, long long *ival, + double *dval, double implied_units, + int 
is_time); + +/* + * Convert string into a floating number. Return 1 for success and 0 otherwise. + */ +int str_to_float(const char *str, double *val, int is_time) +{ +#ifdef CONFIG_ARITHMETIC + int rc; + long long ival; + double dval; + + if (str[0] == '(') { + rc = evaluate_arithmetic_expression(str, &ival, &dval, 1.0, is_time); + if (!rc) { + *val = dval; + return 1; + } + } +#endif + return 1 == sscanf(str, "%lf", val); +} + +/* + * convert string into decimal value, noting any size suffix + */ +int str_to_decimal(const char *str, long long *val, int kilo, void *data, + int is_seconds, int is_time) +{ + int len, base; + int rc = 1; +#ifdef CONFIG_ARITHMETIC + long long ival; + double dval; + double implied_units = 1.0; +#endif + + len = strlen(str); + if (!len) + return 1; + +#ifdef CONFIG_ARITHMETIC + if (is_seconds) + implied_units = 1000000.0; + if (str[0] == '(') + rc = evaluate_arithmetic_expression(str, &ival, &dval, implied_units, is_time); + if (str[0] == '(' && !rc) { + if (!kilo && is_seconds) + *val = ival / 1000000LL; + else + *val = ival; + } +#endif + + if (rc == 1) { + char *endptr; + + if (strstr(str, "0x") || strstr(str, "0X")) + base = 16; + else + base = 10; + + *val = strtoll(str, &endptr, base); + if (*val == 0 && endptr == str) + return 1; + if (*val == LONG_MAX && errno == ERANGE) + return 1; + } + + if (kilo) { + unsigned long long mult; + int perc = 0; + + mult = get_mult_bytes(str, len, data, &perc); + if (perc) + *val = -1ULL - *val; + else + *val *= mult; + } else + *val *= get_mult_time(str, len, is_seconds); + + return 0; +} + +int check_str_bytes(const char *p, long long *val, void *data) +{ + return str_to_decimal(p, val, 1, data, 0, 0); +} + +int check_str_time(const char *p, long long *val, int is_seconds) +{ + return str_to_decimal(p, val, 0, NULL, is_seconds, 1); +} + +void strip_blank_front(char **p) +{ + char *s = *p; + + if (!strlen(s)) + return; + while (isspace((int) *s)) + s++; + + *p = s; +} + +void strip_blank_end(char *p) +{ + char *start = p, *s; + + if (!strlen(p)) + return; + + s = strchr(p, ';'); + if (s) + *s = '\0'; + s = strchr(p, '#'); + if (s) + *s = '\0'; + if (s) + p = s; + + s = p + strlen(p); + while ((isspace((int) *s) || iscntrl((int) *s)) && (s > start)) + s--; + + *(s + 1) = '\0'; +} + +static int check_range_bytes(const char *str, long long *val, void *data) +{ + long long __val; + + if (!str_to_decimal(str, &__val, 1, data, 0, 0)) { + *val = __val; + return 0; + } + + return 1; +} + +static int check_int(const char *p, int *val) +{ + if (!strlen(p)) + return 1; + if (strstr(p, "0x") || strstr(p, "0X")) { + if (sscanf(p, "%x", val) == 1) + return 0; + } else { + if (sscanf(p, "%u", val) == 1) + return 0; + } + + return 1; +} + +static size_t opt_len(const char *str) +{ + char *postfix; + + postfix = strchr(str, ':'); + if (!postfix) + return strlen(str); + + return (int)(postfix - str); +} + +static int str_match_len(const struct value_pair *vp, const char *str) +{ + return max(strlen(vp->ival), opt_len(str)); +} + +#define val_store(ptr, val, off, or, data, o) \ + do { \ + ptr = td_var((data), (o), (off)); \ + if ((or)) \ + *ptr |= (val); \ + else \ + *ptr = (val); \ + } while (0) + +static const char *opt_type_name(const struct fio_option *o) +{ + compiletime_assert(ARRAY_SIZE(opt_type_names) - 1 == FIO_OPT_UNSUPPORTED, + "opt_type_names[] index"); + + if (o->type <= FIO_OPT_UNSUPPORTED) + return opt_type_names[o->type]; + + return "OPT_UNKNOWN?"; +} + +static bool val_too_large(const struct fio_option *o, unsigned long 
long val, + bool is_uint) +{ + if (!o->maxval) + return false; + + if (is_uint) { + if ((int) val < 0) + return (int) val > (int) o->maxval; + return (unsigned int) val > o->maxval; + } + + return val > o->maxval; +} + +static bool val_too_small(const struct fio_option *o, unsigned long long val, + bool is_uint) +{ + if (!o->minval) + return false; + + if (is_uint) + return (int) val < o->minval; + + return val < o->minval; +} + +static int __handle_option(const struct fio_option *o, const char *ptr, + void *data, int first, int more, int curr) +{ + int il=0, *ilp; + fio_fp64_t *flp; + long long ull, *ullp; + long ul2; + long long ull1, ull2; + double uf; + char **cp = NULL; + int ret = 0, is_time = 0; + const struct value_pair *vp; + struct value_pair posval[PARSE_MAX_VP]; + int i, all_skipped = 1; + + dprint(FD_PARSE, "__handle_option=%s, type=%s, ptr=%s\n", o->name, + opt_type_name(o), ptr); + + if (!ptr && o->type != FIO_OPT_STR_SET && o->type != FIO_OPT_STR) { + log_err("Option %s requires an argument\n", o->name); + return 1; + } + + switch (o->type) { + case FIO_OPT_STR: + case FIO_OPT_STR_ULL: + case FIO_OPT_STR_MULTI: { + fio_opt_str_fn *fn = o->cb; + + posval_sort(o, posval); + + ret = 1; + for (i = 0; i < PARSE_MAX_VP; i++) { + vp = &posval[i]; + if (!vp->ival || vp->ival[0] == '\0') + continue; + all_skipped = 0; + if (!ptr) + break; + if (!strncmp(vp->ival, ptr, str_match_len(vp, ptr))) { + ret = 0; + if (!o->off1) + continue; + if (o->type == FIO_OPT_STR_ULL) + val_store(ullp, vp->oval, o->off1, vp->orval, data, o); + else + val_store(ilp, vp->oval, o->off1, vp->orval, data, o); + continue; + } + } + + if (ret && !all_skipped) + show_option_values(o); + else if (fn) + ret = fn(data, ptr); + break; + } + case FIO_OPT_STR_VAL_TIME: + is_time = 1; + /* fall through */ + case FIO_OPT_ULL: + case FIO_OPT_INT: + case FIO_OPT_STR_VAL: { + fio_opt_str_val_fn *fn = o->cb; + char tmp[128], *p; + + if (!is_time && o->is_time) + is_time = o->is_time; + + snprintf(tmp, sizeof(tmp), "%s", ptr); + p = strchr(tmp, ','); + if (p) + *p = '\0'; + + if (is_time) + ret = check_str_time(tmp, &ull, o->is_seconds); + else + ret = check_str_bytes(tmp, &ull, data); + + dprint(FD_PARSE, " ret=%d, out=%llu\n", ret, ull); + + if (ret) + break; + if (o->pow2 && !is_power_of_2(ull)) { + log_err("%s: must be a power-of-2\n", o->name); + return 1; + } + + if (val_too_large(o, ull, o->type == FIO_OPT_INT)) { + log_err("%s: max value out of range: %llu" + " (%llu max)\n", o->name, ull, o->maxval); + return 1; + } + if (val_too_small(o, ull, o->type == FIO_OPT_INT)) { + log_err("%s: min value out of range: %lld" + " (%d min)\n", o->name, ull, o->minval); + return 1; + } + if (o->posval[0].ival) { + posval_sort(o, posval); + + ret = 1; + for (i = 0; i < PARSE_MAX_VP; i++) { + vp = &posval[i]; + if (!vp->ival || vp->ival[0] == '\0') + continue; + if (vp->oval == ull) { + ret = 0; + break; + } + } + if (ret) { + log_err("fio: value %llu not allowed:\n", ull); + show_option_values(o); + return 1; + } + } + + if (fn) + ret = fn(data, &ull); + else { + if (o->type == FIO_OPT_INT) { + if (first) + val_store(ilp, ull, o->off1, 0, data, o); + if (curr == 1) { + if (o->off2) + val_store(ilp, ull, o->off2, 0, data, o); + } + if (curr == 2) { + if (o->off3) + val_store(ilp, ull, o->off3, 0, data, o); + } + if (!more) { + if (curr < 1) { + if (o->off2) + val_store(ilp, ull, o->off2, 0, data, o); + } + if (curr < 2) { + if (o->off3) + val_store(ilp, ull, o->off3, 0, data, o); + } + } + } else if (o->type == FIO_OPT_ULL) { + 
if (first) + val_store(ullp, ull, o->off1, 0, data, o); + if (curr == 1) { + if (o->off2) + val_store(ullp, ull, o->off2, 0, data, o); + } + if (curr == 2) { + if (o->off3) + val_store(ullp, ull, o->off3, 0, data, o); + } + if (!more) { + if (curr < 1) { + if (o->off2) + val_store(ullp, ull, o->off2, 0, data, o); + } + if (curr < 2) { + if (o->off3) + val_store(ullp, ull, o->off3, 0, data, o); + } + } + } else { + if (first) + val_store(ullp, ull, o->off1, 0, data, o); + if (!more) { + if (o->off2) + val_store(ullp, ull, o->off2, 0, data, o); + } + } + } + break; + } + case FIO_OPT_FLOAT_LIST: { + char *cp2; + + if (first) { + /* + ** Initialize precision to 0 and zero out list + ** in case specified list is shorter than default + */ + if (o->off2) { + ul2 = 0; + ilp = td_var(data, o, o->off2); + *ilp = ul2; + } + + flp = td_var(data, o, o->off1); + for(i = 0; i < o->maxlen; i++) + flp[i].u.f = 0.0; + } + if (curr >= o->maxlen) { + log_err("the list exceeding max length %d\n", + o->maxlen); + return 1; + } + if (!str_to_float(ptr, &uf, 0)) { /* this breaks if we ever have lists of times */ + log_err("not a floating point value: %s\n", ptr); + return 1; + } + if (o->minfp || o->maxfp) { + if (uf > o->maxfp) { + log_err("value out of range: %f" + " (range max: %f)\n", uf, o->maxfp); + return 1; + } + if (uf < o->minfp) { + log_err("value out of range: %f" + " (range min: %f)\n", uf, o->minfp); + return 1; + } + } + + flp = td_var(data, o, o->off1); + flp[curr].u.f = uf; + + dprint(FD_PARSE, " out=%f\n", uf); + + /* + ** Calculate precision for output by counting + ** number of digits after period. Find first + ** period in entire remaining list each time + */ + cp2 = strchr(ptr, '.'); + if (cp2 != NULL) { + int len = 0; + + while (*++cp2 != '\0' && *cp2 >= '0' && *cp2 <= '9') + len++; + + if (o->off2) { + ilp = td_var(data, o, o->off2); + if (len > *ilp) + *ilp = len; + } + } + + break; + } + case FIO_OPT_STR_STORE: { + fio_opt_str_fn *fn = o->cb; + + if (!strlen(ptr)) + return 1; + + if (o->off1) { + cp = td_var(data, o, o->off1); + *cp = strdup(ptr); + } + + if (fn) + ret = fn(data, ptr); + else if (o->posval[0].ival) { + posval_sort(o, posval); + + ret = 1; + for (i = 0; i < PARSE_MAX_VP; i++) { + vp = &posval[i]; + if (!vp->ival || vp->ival[0] == '\0' || !cp) + continue; + all_skipped = 0; + if (!strncmp(vp->ival, ptr, str_match_len(vp, ptr))) { + char *rest; + + ret = 0; + if (vp->cb) + fn = vp->cb; + rest = strstr(*cp ?: ptr, ":"); + if (rest) { + if (*cp) + *rest = '\0'; + ptr = rest + 1; + } else + ptr = NULL; + break; + } + } + } + + if (!all_skipped) { + if (ret && !*cp) + show_option_values(o); + else if (ret && *cp) + ret = 0; + else if (fn && ptr) + ret = fn(data, ptr); + } + + break; + } + case FIO_OPT_RANGE: { + char tmp[128]; + char *p1, *p2; + + snprintf(tmp, sizeof(tmp), "%s", ptr); + + /* Handle bsrange with separate read,write values: */ + p1 = strchr(tmp, ','); + if (p1) + *p1 = '\0'; + + p1 = strchr(tmp, '-'); + if (!p1) { + p1 = strchr(tmp, ':'); + if (!p1) { + ret = 1; + break; + } + } + + p2 = p1 + 1; + *p1 = '\0'; + p1 = tmp; + + ret = 1; + if (!check_range_bytes(p1, &ull1, data) && + !check_range_bytes(p2, &ull2, data)) { + ret = 0; + if (ull1 > ull2) { + unsigned long long foo = ull1; + + ull1 = ull2; + ull2 = foo; + } + + if (first) { + val_store(ullp, ull1, o->off1, 0, data, o); + val_store(ullp, ull2, o->off2, 0, data, o); + } + if (curr == 1) { + if (o->off3 && o->off4) { + val_store(ullp, ull1, o->off3, 0, data, o); + val_store(ullp, ull2, o->off4, 0, data, 
o); + } + } + if (curr == 2) { + if (o->off5 && o->off6) { + val_store(ullp, ull1, o->off5, 0, data, o); + val_store(ullp, ull2, o->off6, 0, data, o); + } + } + if (!more) { + if (curr < 1) { + if (o->off3 && o->off4) { + val_store(ullp, ull1, o->off3, 0, data, o); + val_store(ullp, ull2, o->off4, 0, data, o); + } + } + if (curr < 2) { + if (o->off5 && o->off6) { + val_store(ullp, ull1, o->off5, 0, data, o); + val_store(ullp, ull2, o->off6, 0, data, o); + } + } + } + } + + break; + } + case FIO_OPT_BOOL: + case FIO_OPT_STR_SET: { + fio_opt_int_fn *fn = o->cb; + + if (ptr) + ret = check_int(ptr, &il); + else if (o->type == FIO_OPT_BOOL) + ret = 1; + else + il = 1; + + dprint(FD_PARSE, " ret=%d, out=%d\n", ret, il); + + if (ret) + break; + + if (o->maxval && il > (int) o->maxval) { + log_err("max value out of range: %d (%llu max)\n", + il, o->maxval); + return 1; + } + if (o->minval && il < o->minval) { + log_err("min value out of range: %d (%d min)\n", + il, o->minval); + return 1; + } + + if (o->neg) + il = !il; + + if (fn) + ret = fn(data, &il); + else { + if (first) + val_store(ilp, il, o->off1, 0, data, o); + if (!more) { + if (o->off2) + val_store(ilp, il, o->off2, 0, data, o); + } + } + break; + } + case FIO_OPT_DEPRECATED: + ret = 1; + /* fall through */ + case FIO_OPT_SOFT_DEPRECATED: + log_info("Option %s is deprecated\n", o->name); + break; + default: + log_err("Bad option type %u\n", o->type); + ret = 1; + } + + if (ret) + return ret; + + if (o->verify) { + ret = o->verify(o, data); + if (ret) { + log_err("Correct format for offending option\n"); + log_err("%20s: %s\n", o->name, o->help); + show_option_help(o, 1); + } + } + + return ret; +} + +static int handle_option(const struct fio_option *o, const char *__ptr, + void *data) +{ + char *o_ptr, *ptr, *ptr2; + int ret, done; + + dprint(FD_PARSE, "handle_option=%s, ptr=%s\n", o->name, __ptr); + + o_ptr = ptr = NULL; + if (__ptr) + o_ptr = ptr = strdup(__ptr); + + /* + * See if we have another set of parameters, hidden after a comma. + * Do this before parsing this round, to check if we should + * copy set 1 options to set 2. + */ + done = 0; + ret = 1; + do { + int __ret; + + ptr2 = NULL; + if (ptr && + (o->type != FIO_OPT_STR_STORE) && + (o->type != FIO_OPT_STR) && + (o->type != FIO_OPT_STR_ULL) && + (o->type != FIO_OPT_FLOAT_LIST)) { + ptr2 = strchr(ptr, ','); + if (ptr2 && *(ptr2 + 1) == '\0') + *ptr2 = '\0'; + if (o->type != FIO_OPT_STR_MULTI && o->type != FIO_OPT_RANGE) { + if (!ptr2) + ptr2 = strchr(ptr, ':'); + if (!ptr2) + ptr2 = strchr(ptr, '-'); + } + } else if (ptr && o->type == FIO_OPT_FLOAT_LIST) { + ptr2 = strchr(ptr, ':'); + } + + /* + * Don't return early if parsing the first option fails - if + * we are doing multiple arguments, we can allow the first one + * being empty. 
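+ *
+ * As an illustration (values invented): "bsrange=1k-4k,2k-8k" arrives here
+ * as a single string; the loop below splits it at the comma and feeds
+ * "1k-4k" and then "2k-8k" to __handle_option(), so the first set applies
+ * to reads and the second to writes.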
+ */ + __ret = __handle_option(o, ptr, data, !done, !!ptr2, done); + if (ret) + ret = __ret; + + if (!ptr2) + break; + + ptr = ptr2 + 1; + done++; + } while (1); + + if (o_ptr) + free(o_ptr); + return ret; +} + +struct fio_option *find_option(struct fio_option *options, const char *opt) +{ + struct fio_option *o; + + for (o = &options[0]; o->name; o++) { + if (!o_match(o, opt)) + continue; + if (o->type == FIO_OPT_UNSUPPORTED) { + log_err("Option <%s>: %s\n", o->name, o->help); + continue; + } + + return o; + } + + return NULL; +} + +const struct fio_option * +find_option_c(const struct fio_option *options, const char *opt) +{ + const struct fio_option *o; + + for (o = &options[0]; o->name; o++) { + if (!o_match(o, opt)) + continue; + if (o->type == FIO_OPT_UNSUPPORTED) { + log_err("Option <%s>: %s\n", o->name, o->help); + continue; + } + + return o; + } + + return NULL; +} + +static const struct fio_option * +get_option(char *opt, const struct fio_option *options, char **post) +{ + const struct fio_option *o; + char *ret; + + ret = strchr(opt, '='); + if (ret) { + *post = ret; + *ret = '\0'; + ret = opt; + (*post)++; + strip_blank_end(ret); + o = find_option_c(options, ret); + } else { + o = find_option_c(options, opt); + *post = NULL; + } + + return o; +} + +static int opt_cmp(const void *p1, const void *p2) +{ + const struct fio_option *o; + char *s, *foo; + int prio1, prio2; + + prio1 = prio2 = 0; + + if (*(char **)p1) { + s = strdup(*((char **) p1)); + o = get_option(s, __fio_options, &foo); + if (o) + prio1 = o->prio; + free(s); + } + if (*(char **)p2) { + s = strdup(*((char **) p2)); + o = get_option(s, __fio_options, &foo); + if (o) + prio2 = o->prio; + free(s); + } + + return prio2 - prio1; +} + +void sort_options(char **opts, const struct fio_option *options, int num_opts) +{ + __fio_options = options; + qsort(opts, num_opts, sizeof(char *), opt_cmp); + __fio_options = NULL; +} + +static void add_to_dump_list(const struct fio_option *o, + struct flist_head *dump_list, const char *post) +{ + struct print_option *p; + + if (!dump_list) + return; + + p = malloc(sizeof(*p)); + p->name = strdup(o->name); + if (post) + p->value = strdup(post); + else + p->value = NULL; + + flist_add_tail(&p->list, dump_list); +} + +int parse_cmd_option(const char *opt, const char *val, + const struct fio_option *options, void *data, + struct flist_head *dump_list) +{ + const struct fio_option *o; + + o = find_option_c(options, opt); + if (!o) { + log_err("Bad option <%s>\n", opt); + return 1; + } + + if (handle_option(o, val, data)) { + log_err("fio: failed parsing %s=%s\n", opt, val); + return 1; + } + + add_to_dump_list(o, dump_list, val); + return 0; +} + +int parse_option(char *opt, const char *input, const struct fio_option *options, + const struct fio_option **o, void *data, + struct flist_head *dump_list) +{ + char *post; + + if (!opt) { + log_err("fio: failed parsing %s\n", input); + *o = NULL; + return 1; + } + + *o = get_option(opt, options, &post); + if (!*o) { + if (post) { + int len = strlen(opt); + if (opt + len + 1 != post) + memmove(opt + len + 1, post, strlen(post)); + opt[len] = '='; + } + return 1; + } + + if (handle_option(*o, post, data)) { + log_err("fio: failed parsing %s\n", input); + return 1; + } + + add_to_dump_list(*o, dump_list, post); + return 0; +} + +/* + * Option match, levenshtein distance. Handy for not quite remembering what + * the option name is. 
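+ *
+ * Worked example: string_distance("iodept", "iodepth") is 1 (one inserted
+ * character), so the near miss can be matched to the real option;
+ * string_distance_ok() below decides whether a distance is small enough
+ * to be worth showing.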
+ */ +int string_distance(const char *s1, const char *s2) +{ + unsigned int s1_len = strlen(s1); + unsigned int s2_len = strlen(s2); + unsigned int *p, *q, *r; + unsigned int i, j; + + p = malloc(sizeof(unsigned int) * (s2_len + 1)); + q = malloc(sizeof(unsigned int) * (s2_len + 1)); + + p[0] = 0; + for (i = 1; i <= s2_len; i++) + p[i] = p[i - 1] + 1; + + for (i = 1; i <= s1_len; i++) { + q[0] = p[0] + 1; + for (j = 1; j <= s2_len; j++) { + unsigned int sub = p[j - 1]; + unsigned int pmin; + + if (s1[i - 1] != s2[j - 1]) + sub++; + + pmin = min(q[j - 1] + 1, sub); + q[j] = min(p[j] + 1, pmin); + } + r = p; + p = q; + q = r; + } + + i = p[s2_len]; + free(p); + free(q); + return i; +} + +/* + * Make a guess of whether the distance from 's1' is significant enough + * to warrant printing the guess. We set this to a 1/2 match. + */ +int string_distance_ok(const char *opt, int distance) +{ + size_t len; + + len = strlen(opt); + len = (len + 1) / 2; + return distance <= len; +} + +static const struct fio_option *find_child(const struct fio_option *options, + const struct fio_option *o) +{ + const struct fio_option *__o; + + for (__o = options + 1; __o->name; __o++) + if (__o->parent && !strcmp(__o->parent, o->name)) + return __o; + + return NULL; +} + +static void __print_option(const struct fio_option *o, + const struct fio_option *org, + int level) +{ + char name[256], *p; + int depth; + + if (!o) + return; + + p = name; + depth = level; + while (depth--) + p += sprintf(p, "%s", " "); + + sprintf(p, "%s", o->name); + + log_info("%-24s: %s\n", name, o->help); +} + +static void print_option(const struct fio_option *o) +{ + const struct fio_option *parent; + const struct fio_option *__o; + unsigned int printed; + unsigned int level; + + __print_option(o, NULL, 0); + parent = o; + level = 0; + do { + level++; + printed = 0; + + while ((__o = find_child(o, parent)) != NULL) { + __print_option(__o, o, level); + o = __o; + printed++; + } + + parent = o; + } while (printed); +} + +int show_cmd_help(const struct fio_option *options, const char *name) +{ + const struct fio_option *o, *closest; + unsigned int best_dist = -1U; + int found = 0; + int show_all = 0; + + if (!name || !strcmp(name, "all")) + show_all = 1; + + closest = NULL; + best_dist = -1; + for (o = &options[0]; o->name; o++) { + int match = 0; + + if (o->type == FIO_OPT_DEPRECATED || + o->type == FIO_OPT_SOFT_DEPRECATED) + continue; + if (!exec_profile && o->prof_name) + continue; + if (exec_profile && !(o->prof_name && !strcmp(exec_profile, o->prof_name))) + continue; + + if (name) { + if (!strcmp(name, o->name) || + (o->alias && !strcmp(name, o->alias))) + match = 1; + else { + unsigned int dist; + + dist = string_distance(name, o->name); + if (dist < best_dist) { + best_dist = dist; + closest = o; + } + } + } + + if (show_all || match) { + found = 1; + if (match) + log_info("%20s: %s\n", o->name, o->help); + if (show_all) { + if (!o->parent) + print_option(o); + continue; + } + } + + if (!match) + continue; + + show_option_help(o, 0); + } + + if (found) + return 0; + + log_err("No such command: %s", name); + + /* + * Only print an appropriately close option, one where the edit + * distance isn't too big. Otherwise we get crazy matches. + */ + if (closest && best_dist < 3) { + log_info(" - showing closest match\n"); + log_info("%20s: %s\n", closest->name, closest->help); + show_option_help(closest, 0); + } else + log_info("\n"); + + return 1; +} + +/* + * Handle parsing of default parameters. 
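+ * Every option that carries a ->def string is simply fed back through
+ * handle_option(), so defaults take the exact same parsing path as
+ * user-supplied values.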
+ */
+void fill_default_options(void *data, const struct fio_option *options)
+{
+	const struct fio_option *o;
+
+	dprint(FD_PARSE, "filling default options\n");
+
+	for (o = &options[0]; o->name; o++)
+		if (o->def)
+			handle_option(o, o->def, data);
+}
+
+static void option_init(struct fio_option *o)
+{
+	if (o->type == FIO_OPT_DEPRECATED || o->type == FIO_OPT_UNSUPPORTED ||
+	    o->type == FIO_OPT_SOFT_DEPRECATED)
+		return;
+	if (o->name && !o->lname)
+		log_err("Option %s: missing long option name\n", o->name);
+	if (o->type == FIO_OPT_BOOL) {
+		o->minval = 0;
+		o->maxval = 1;
+	}
+	if (o->type == FIO_OPT_INT) {
+		if (!o->maxval)
+			o->maxval = UINT_MAX;
+	}
+	if (o->type == FIO_OPT_ULL) {
+		if (!o->maxval)
+			o->maxval = ULLONG_MAX;
+	}
+	if (o->type == FIO_OPT_STR_SET && o->def && !o->no_warn_def) {
+		log_err("Option %s: string set option with"
+			" default will always be true\n", o->name);
+	}
+	if (!o->cb && !o->off1)
+		log_err("Option %s: neither cb nor offset given\n", o->name);
+	if (!o->category) {
+		log_info("Option %s: no category defined. Setting to misc\n", o->name);
+		o->category = FIO_OPT_C_GENERAL;
+		o->group = FIO_OPT_G_INVALID;
+	}
+}
+
+/*
+ * Sanitize the options structure. For now it just sets min/max for bool
+ * values and whether both callback and offsets are given.
+ */
+void options_init(struct fio_option *options)
+{
+	struct fio_option *o;
+
+	dprint(FD_PARSE, "init options\n");
+
+	for (o = &options[0]; o->name; o++) {
+		option_init(o);
+		if (o->inverse)
+			o->inv_opt = find_option(options, o->inverse);
+	}
+}
+
+void options_mem_dupe(const struct fio_option *options, void *data)
+{
+	const struct fio_option *o;
+	char **ptr;
+
+	dprint(FD_PARSE, "dup options\n");
+
+	for (o = &options[0]; o->name; o++) {
+		if (o->type != FIO_OPT_STR_STORE)
+			continue;
+
+		ptr = td_var(data, o, o->off1);
+		if (*ptr)
+			*ptr = strdup(*ptr);
+	}
+}
+
+void options_free(const struct fio_option *options, void *data)
+{
+	const struct fio_option *o;
+	char **ptr;
+
+	dprint(FD_PARSE, "free options\n");
+
+	for (o = &options[0]; o->name; o++) {
+		if (o->type != FIO_OPT_STR_STORE || !o->off1 || o->no_free)
+			continue;
+
+		ptr = td_var(data, o, o->off1);
+		if (*ptr) {
+			free(*ptr);
+			*ptr = NULL;
+		}
+	}
+}
diff --git a/parse.h b/parse.h
new file mode 100644
index 0000000..5828654
--- /dev/null
+++ b/parse.h
@@ -0,0 +1,142 @@
+#ifndef FIO_PARSE_H
+#define FIO_PARSE_H
+
+#include <stdio.h>
+#include "flist.h"
+
+/*
+ * Option types
+ */
+enum fio_opt_type {
+	FIO_OPT_INVALID = 0,
+	FIO_OPT_STR,
+	FIO_OPT_STR_ULL,
+	FIO_OPT_STR_MULTI,
+	FIO_OPT_STR_VAL,
+	FIO_OPT_STR_VAL_TIME,
+	FIO_OPT_STR_STORE,
+	FIO_OPT_RANGE,
+	FIO_OPT_INT,
+	FIO_OPT_ULL,
+	FIO_OPT_BOOL,
+	FIO_OPT_FLOAT_LIST,
+	FIO_OPT_STR_SET,
+	FIO_OPT_DEPRECATED,
+	FIO_OPT_SOFT_DEPRECATED,
+	FIO_OPT_UNSUPPORTED,	/* keep this last */
+};
+
+/*
+ * Match a possible value string with the integer option.
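+ * A string option lists the values it accepts as value_pair entries in
+ * ->posval[]; when the input matches an ->ival, the numeric ->oval is
+ * what gets stored.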
+ */ +struct value_pair { + const char *ival; /* string option */ + unsigned long long oval;/* output value */ + const char *help; /* help text for sub option */ + int orval; /* OR value */ + void *cb; /* sub-option callback */ +}; + +#define OPT_LEN_MAX 8192 +#define PARSE_MAX_VP 32 + +/* + * Option define + */ +struct fio_option { + const char *name; /* option name */ + const char *lname; /* long option name */ + const char *alias; /* possible old allowed name */ + enum fio_opt_type type; /* option type */ + unsigned int off1; /* potential parameters */ + unsigned int off2; + unsigned int off3; + unsigned int off4; + unsigned int off5; + unsigned int off6; + unsigned long long maxval; /* max and min value */ + int minval; + double maxfp; /* max and min floating value */ + double minfp; + unsigned int interval; /* client hint for suitable interval */ + unsigned int maxlen; /* max length */ + int neg; /* negate value stored */ + int prio; + void *cb; /* callback */ + const char *help; /* help text for option */ + const char *def; /* default setting */ + struct value_pair posval[PARSE_MAX_VP];/* possible values */ + const char *parent; /* parent option */ + int hide; /* hide if parent isn't set */ + int hide_on_set; /* hide on set, not on unset */ + const char *inverse; /* if set, apply opposite action to this option */ + struct fio_option *inv_opt; /* cached lookup */ + int (*verify)(const struct fio_option *, void *); + const char *prof_name; /* only valid for specific profile */ + void *prof_opts; + uint64_t category; /* what type of option */ + uint64_t group; /* who to group with */ + void *gui_data; + int is_seconds; /* time value with seconds base */ + int is_time; /* time based value */ + int no_warn_def; + int pow2; /* must be a power-of-2 */ + int no_free; +}; + +extern int parse_option(char *, const char *, const struct fio_option *, + const struct fio_option **, void *, + struct flist_head *); +extern void sort_options(char **, const struct fio_option *, int); +extern int parse_cmd_option(const char *t, const char *l, + const struct fio_option *, void *, + struct flist_head *); +extern int show_cmd_help(const struct fio_option *, const char *); +extern void fill_default_options(void *, const struct fio_option *); +extern void options_init(struct fio_option *); +extern void options_mem_dupe(const struct fio_option *, void *); +extern void options_free(const struct fio_option *, void *); + +extern void strip_blank_front(char **); +extern void strip_blank_end(char *); +extern int str_to_decimal(const char *, long long *, int, void *, int, int); +extern int check_str_bytes(const char *p, long long *val, void *data); +extern int check_str_time(const char *p, long long *val, int); +extern int str_to_float(const char *str, double *val, int is_time); + +extern int string_distance(const char *s1, const char *s2); +extern int string_distance_ok(const char *s1, int dist); + +/* + * Handlers for the options + */ +typedef int (fio_opt_str_fn)(void *, const char *); +typedef int (fio_opt_str_val_fn)(void *, long long *); +typedef int (fio_opt_int_fn)(void *, int *); + +struct thread_options; +static inline void *td_var(void *to, const struct fio_option *o, + unsigned int offset) +{ + void *ret; + + if (o->prof_opts) + ret = o->prof_opts; + else + ret = to; + + return ret + offset; +} + +static inline int parse_is_percent(unsigned long long val) +{ + return val <= -1ULL && val >= (-1ULL - 100ULL); +} + +struct print_option { + struct flist_head list; + char *name; + char *value; +}; + +#endif 
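The parse_is_percent() inline above leans on an encoding convention: percentage inputs share the same unsigned long long slot as absolute values by being stored as the values just below -1ULL, i.e. -1ULL - pct. A minimal standalone sketch of the round trip (is_percent() here is a local mirror of the parse.h inline, not part of the patch itself):

	#include <stdio.h>

	/* Mirrors parse_is_percent(): a percentage p is stored as -1ULL - p,
	 * so byte counts and percentages can share one 64-bit field. */
	static int is_percent(unsigned long long val)
	{
		return val <= -1ULL && val >= (-1ULL - 100ULL);
	}

	int main(void)
	{
		unsigned long long stored = -1ULL - 50;	/* encode "50%" */

		if (is_percent(stored))
			printf("%llu%%\n", -1ULL - stored);	/* decodes to 50 */
		return 0;
	}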
diff --git a/printing.c b/printing.c
new file mode 100644
index 0000000..b58996b
--- /dev/null
+++ b/printing.c
@@ -0,0 +1,139 @@
+#include <gtk/gtk.h>
+#include <cairo.h>
+
+#include "gfio.h"
+#include "cairo_text_helpers.h"
+#include "printing.h"
+
+
+static struct printing_parameters {
+	gdouble width, height, xdpi, ydpi;
+	GtkPrintSettings *settings;
+	GtkPageSetup *page_setup;
+} print_params = { 0 };
+
+static void begin_print(GtkPrintOperation *operation,
+			GtkPrintContext *context, gpointer data)
+{
+	print_params.page_setup = gtk_print_context_get_page_setup(context);
+
+	print_params.width = gtk_print_context_get_width(context);
+	print_params.height = gtk_print_context_get_height(context);
+	print_params.xdpi = gtk_print_context_get_dpi_x(context);
+	print_params.ydpi = gtk_print_context_get_dpi_y(context);
+
+	/* assume 1 page for now. */
+	gtk_print_operation_set_n_pages(operation, 1);
+}
+
+static void results_draw_page(GtkPrintOperation *operation,
+			GtkPrintContext *context, gint page_nr,
+			gpointer data)
+{
+	cairo_t *cr;
+	char str[32];
+	double x, y;
+
+	cr = gtk_print_context_get_cairo_context(context);
+
+	cairo_set_source_rgb(cr, 0, 0, 0);
+	cairo_set_line_width(cr, 5.0);
+	cairo_move_to(cr, 0.0, 0.0);
+	cairo_line_to(cr, print_params.width, print_params.height);
+	cairo_move_to(cr, 0.0, print_params.height);
+	cairo_line_to(cr, print_params.width, 0.0);
+	cairo_stroke(cr);
+
+	x = print_params.width / 4.0;
+	y = print_params.height / 5.0;
+	sprintf(str, "(%g,%g)", x, y);
+	draw_right_justified_text(cr, "Sans", x, y, 12.0, str);
+	cairo_set_source_rgb(cr, 0, 0, 0);
+	cairo_set_line_width(cr, 2.0);
+	cairo_move_to(cr, x, y - 30.0);
+	cairo_line_to(cr, x, y + 30.0);
+	cairo_move_to(cr, x - 30, y);
+	cairo_line_to(cr, x + 30, y);
+
+	y *= 4.0;
+	x *= 2.0;
+	sprintf(str, "(%g,%g)", x, y);
+	draw_right_justified_text(cr, "Sans", x, y, 12.0, str);
+	cairo_set_source_rgb(cr, 0, 0, 0);
+	cairo_set_line_width(cr, 2.0);
+	cairo_move_to(cr, x, y - 30.0);
+	cairo_line_to(cr, x, y + 30.0);
+	cairo_move_to(cr, x - 30, y);
+	cairo_line_to(cr, x + 30, y);
+	cairo_stroke(cr);
+}
+
+static void printing_error_dialog(GtkWidget *window, GError *print_error)
+{
+	GtkWidget *error_dialog;
+
+	printf("printing_error_dialog called\n");
+	printf("error message = %s\n", print_error->message);
+	error_dialog = gtk_message_dialog_new(GTK_WINDOW(window),
+			GTK_DIALOG_DESTROY_WITH_PARENT, GTK_MESSAGE_ERROR,
+			GTK_BUTTONS_CLOSE, "Print error:\n%s",
+			print_error->message);
+	g_signal_connect(error_dialog, "response",
+			G_CALLBACK(gtk_widget_destroy), NULL);
+	gtk_widget_show(error_dialog);
+}
+
+static void results_print_done(GtkPrintOperation *operation,
+			GtkPrintOperationResult result, gpointer data)
+{
+	GError *print_error;
+	struct gui_entry *ge = data;
+
+	if (result != GTK_PRINT_OPERATION_RESULT_ERROR)
+		return;
+
+	gtk_print_operation_get_error(operation, &print_error);
+	printing_error_dialog(ge->results_window, print_error);
+	g_error_free(print_error);
+}
+
+void gfio_print_results(struct gui_entry *ge)
+{
+	GtkPrintOperation *print;
+	GtkPrintOperationResult res;
+	GError *print_error;
+
+	print = gtk_print_operation_new();
+	if (print_params.settings != NULL)
+		gtk_print_operation_set_print_settings(print, print_params.settings);
+
+	if (print_params.page_setup != NULL)
+		gtk_print_operation_set_default_page_setup(print, print_params.page_setup);
+
+	g_signal_connect(print, "begin_print", G_CALLBACK(begin_print), NULL);
+	g_signal_connect(print, "draw_page", G_CALLBACK(results_draw_page), NULL);
+	
g_signal_connect(print, "done", G_CALLBACK(results_print_done), NULL); + gtk_print_operation_set_allow_async(print, TRUE); + res = gtk_print_operation_run(print, GTK_PRINT_OPERATION_ACTION_PRINT_DIALOG, + GTK_WINDOW(ge->results_window), &print_error); + + /* + * Something's not quite right about the error handling. If I print + * to a file, and the file exists, and I don't have write permission + * on that file but attempt to replace it anyway, then it just kind of + * hangs and I don't get into any of this error handling stuff at all, + * neither here, nor in results_print_done(). + */ + + if (res == GTK_PRINT_OPERATION_RESULT_ERROR) { + printing_error_dialog(ge->results_window, print_error); + g_error_free(print_error); + } else { + if (res == GTK_PRINT_OPERATION_RESULT_APPLY) { + if (print_params.settings != NULL) + g_object_unref(print_params.settings); + print_params.settings = g_object_ref(gtk_print_operation_get_print_settings(print)); + } + } + g_object_unref(print); +} diff --git a/printing.h b/printing.h new file mode 100644 index 0000000..0079919 --- /dev/null +++ b/printing.h @@ -0,0 +1,6 @@ +#ifndef PRINTING_H +#define PRINTING_H + +void gfio_print_results(struct gui_entry *ge); + +#endif diff --git a/profile.c b/profile.c new file mode 100644 index 0000000..90c9ea8 --- /dev/null +++ b/profile.c @@ -0,0 +1,122 @@ +#include "fio.h" +#include "profile.h" +#include "debug.h" +#include "flist.h" +#include "options.h" + +static FLIST_HEAD(profile_list); + +struct profile_ops *find_profile(const char *profile) +{ + struct profile_ops *ops = NULL; + struct flist_head *n; + + flist_for_each(n, &profile_list) { + ops = flist_entry(n, struct profile_ops, list); + if (!strcmp(profile, ops->name)) + break; + + ops = NULL; + } + + return ops; +} + +int load_profile(const char *profile) +{ + struct profile_ops *ops; + + dprint(FD_PROFILE, "loading profile '%s'\n", profile); + + ops = find_profile(profile); + if (ops) { + if (ops->prep_cmd()) { + log_err("fio: profile %s prep failed\n", profile); + return 1; + } + add_job_opts(ops->cmdline, FIO_CLIENT_TYPE_CLI); + return 0; + } + + log_err("fio: profile '%s' not found\n", profile); + return 1; +} + +static int add_profile_options(struct profile_ops *ops) +{ + struct fio_option *o; + + if (!ops->options) + return 0; + + o = ops->options; + while (o->name) { + o->prof_name = ops->name; + o->prof_opts = ops->opt_data; + if (add_option(o)) + return 1; + o++; + } + + return 0; +} + +int register_profile(struct profile_ops *ops) +{ + int ret; + + dprint(FD_PROFILE, "register profile '%s'\n", ops->name); + + ret = add_profile_options(ops); + if (!ret) { + flist_add_tail(&ops->list, &profile_list); + add_opt_posval("profile", ops->name, ops->desc); + return 0; + } + + invalidate_profile_options(ops->name); + return ret; +} + +void unregister_profile(struct profile_ops *ops) +{ + dprint(FD_PROFILE, "unregister profile '%s'\n", ops->name); + flist_del(&ops->list); + invalidate_profile_options(ops->name); + del_opt_posval("profile", ops->name); +} + +void profile_add_hooks(struct thread_data *td) +{ + struct profile_ops *ops; + + if (!exec_profile) + return; + + ops = find_profile(exec_profile); + if (!ops) + return; + + if (ops->io_ops) { + td->prof_io_ops = *ops->io_ops; + td->flags |= TD_F_PROFILE_OPS; + } +} + +int profile_td_init(struct thread_data *td) +{ + struct prof_io_ops *ops = &td->prof_io_ops; + + if (ops->td_init) + return ops->td_init(td); + + return 0; +} + +void profile_td_exit(struct thread_data *td) +{ + struct prof_io_ops *ops = 
&td->prof_io_ops; + + if (ops->td_exit) + ops->td_exit(td); +} diff --git a/profile.h b/profile.h new file mode 100644 index 0000000..414151e --- /dev/null +++ b/profile.h @@ -0,0 +1,50 @@ +#ifndef FIO_PROFILE_H +#define FIO_PROFILE_H + +#include "flist.h" + +/* + * Functions for overriding internal fio io_u functions + */ +struct prof_io_ops { + int (*td_init)(struct thread_data *); + void (*td_exit)(struct thread_data *); + + int (*io_u_lat)(struct thread_data *, uint64_t); +}; + +struct profile_ops { + struct flist_head list; + char name[32]; + char desc[64]; + int flags; + + /* + * Profile specific options + */ + struct fio_option *options; + void *opt_data; + + /* + * Called after parsing options, to prepare 'cmdline' + */ + int (*prep_cmd)(void); + + /* + * The complete command line + */ + const char **cmdline; + + struct prof_io_ops *io_ops; +}; + +int register_profile(struct profile_ops *); +void unregister_profile(struct profile_ops *); +int load_profile(const char *); +struct profile_ops *find_profile(const char *); +void profile_add_hooks(struct thread_data *); + +int profile_td_init(struct thread_data *); +void profile_td_exit(struct thread_data *); + +#endif diff --git a/profiles/act.c b/profiles/act.c new file mode 100644 index 0000000..5d3bd25 --- /dev/null +++ b/profiles/act.c @@ -0,0 +1,483 @@ +#include "../fio.h" +#include "../profile.h" +#include "../parse.h" +#include "../optgroup.h" + +/* + * 1x loads + */ +#define R_LOAD 2000 +#define W_LOAD 1000 + +#define SAMPLE_SEC 3600 /* 1h checks */ + +struct act_pass_criteria { + unsigned int max_usec; + unsigned int max_perm; +}; +#define ACT_MAX_CRIT 3 + +static struct act_pass_criteria act_pass[ACT_MAX_CRIT] = { + { + .max_usec = 1000, + .max_perm = 50, + }, + { + .max_usec = 8000, + .max_perm = 10, + }, + { + .max_usec = 64000, + .max_perm = 1, + }, +}; + +struct act_slice { + uint64_t lat_buckets[ACT_MAX_CRIT]; + uint64_t total_ios; +}; + +struct act_run_data { + struct fio_sem *sem; + unsigned int pending; + + struct act_slice *slices; + unsigned int nr_slices; +}; +static struct act_run_data *act_run_data; + +struct act_prof_data { + struct timespec sample_tv; + struct act_slice *slices; + unsigned int cur_slice; + unsigned int nr_slices; +}; + +#define ACT_MAX_OPTS 128 +static const char *act_opts[ACT_MAX_OPTS] = { + "direct=1", + "ioengine=sync", + "random_generator=lfsr", + "group_reporting=1", + "thread", + NULL, +}; +static unsigned int opt_idx = 5; +static unsigned int org_idx; + +static int act_add_opt(const char *format, ...) 
__attribute__ ((__format__ (__printf__, 1, 2))); + +struct act_options { + unsigned int pad; + char *device_names; + unsigned int load; + unsigned int prep; + unsigned int threads_per_queue; + unsigned int num_read_blocks; + unsigned int write_size; + unsigned long long test_duration; +}; + +static struct act_options act_options; + +static struct fio_option options[] = { + { + .name = "device-names", + .lname = "device-names", + .type = FIO_OPT_STR_STORE, + .off1 = offsetof(struct act_options, device_names), + .help = "Devices to use", + .category = FIO_OPT_C_PROFILE, + .group = FIO_OPT_G_ACT, + .no_free = true, + }, + { + .name = "load", + .lname = "Load multiplier", + .type = FIO_OPT_INT, + .off1 = offsetof(struct act_options, load), + .help = "ACT load multipler (default 1x)", + .def = "1", + .category = FIO_OPT_C_PROFILE, + .group = FIO_OPT_G_ACT, + }, + { + .name = "test-duration", + .lname = "Test duration", + .type = FIO_OPT_STR_VAL_TIME, + .off1 = offsetof(struct act_options, test_duration), + .help = "How long the entire test takes to run", + .def = "24h", + .category = FIO_OPT_C_PROFILE, + .group = FIO_OPT_G_ACT, + }, + { + .name = "threads-per-queue", + .lname = "Number of read IO threads per device", + .type = FIO_OPT_INT, + .off1 = offsetof(struct act_options, threads_per_queue), + .help = "Number of read IO threads per device", + .def = "8", + .category = FIO_OPT_C_PROFILE, + .group = FIO_OPT_G_ACT, + }, + { + .name = "read-req-num-512-blocks", + .lname = "Number of 512B blocks to read", + .type = FIO_OPT_INT, + .off1 = offsetof(struct act_options, num_read_blocks), + .help = "Number of 512B blocks to read at the time", + .def = "3", + .category = FIO_OPT_C_PROFILE, + .group = FIO_OPT_G_ACT, + }, + { + .name = "large-block-op-kbytes", + .lname = "Size of large block ops in KiB (writes)", + .type = FIO_OPT_INT, + .off1 = offsetof(struct act_options, write_size), + .help = "Size of large block ops in KiB (writes)", + .def = "131072", + .category = FIO_OPT_C_PROFILE, + .group = FIO_OPT_G_ACT, + }, + { + .name = "prep", + .lname = "Run ACT prep phase", + .type = FIO_OPT_STR_SET, + .off1 = offsetof(struct act_options, prep), + .help = "Set to run ACT prep phase", + .category = FIO_OPT_C_PROFILE, + .group = FIO_OPT_G_ACT, + }, + { + .name = NULL, + }, +}; + +static int act_add_opt(const char *str, ...) +{ + char buffer[512]; + va_list args; + size_t len; + + if (opt_idx == ACT_MAX_OPTS) { + log_err("act: ACT_MAX_OPTS is too small\n"); + return 1; + } + + va_start(args, str); + len = vsnprintf(buffer, sizeof(buffer), str, args); + va_end(args); + + if (len) + act_opts[opt_idx++] = strdup(buffer); + + return 0; +} + +static int act_add_rw(const char *dev, int reads) +{ + struct act_options *ao = &act_options; + + if (act_add_opt("name=act-%s-%s", reads ? "read" : "write", dev)) + return 1; + if (act_add_opt("filename=%s", dev)) + return 1; + if (act_add_opt("rw=%s", reads ? 
"randread" : "randwrite")) + return 1; + if (reads) { + int rload = ao->load * R_LOAD / ao->threads_per_queue; + + if (act_add_opt("numjobs=%u", ao->threads_per_queue)) + return 1; + if (act_add_opt("rate_iops=%u", rload)) + return 1; + if (act_add_opt("bs=%u", ao->num_read_blocks * 512)) + return 1; + } else { + const int rsize = ao->write_size / (ao->num_read_blocks * 512); + int wload = (ao->load * W_LOAD + rsize - 1) / rsize; + + if (act_add_opt("rate_iops=%u", wload)) + return 1; + if (act_add_opt("bs=%u", ao->write_size)) + return 1; + } + + return 0; +} + +static int act_add_dev_prep(const char *dev) +{ + /* Add sequential zero phase */ + if (act_add_opt("name=act-prep-zeroes-%s", dev)) + return 1; + if (act_add_opt("filename=%s", dev)) + return 1; + if (act_add_opt("bs=1048576")) + return 1; + if (act_add_opt("zero_buffers")) + return 1; + if (act_add_opt("rw=write")) + return 1; + + /* Randomly overwrite device */ + if (act_add_opt("name=act-prep-salt-%s", dev)) + return 1; + if (act_add_opt("stonewall")) + return 1; + if (act_add_opt("filename=%s", dev)) + return 1; + if (act_add_opt("bs=4096")) + return 1; + if (act_add_opt("ioengine=libaio")) + return 1; + if (act_add_opt("iodepth=64")) + return 1; + if (act_add_opt("rw=randwrite")) + return 1; + + return 0; +} + +static int act_add_dev(const char *dev) +{ + if (act_options.prep) + return act_add_dev_prep(dev); + + if (act_add_opt("runtime=%llus", act_options.test_duration)) + return 1; + if (act_add_opt("time_based=1")) + return 1; + + if (act_add_rw(dev, 1)) + return 1; + if (act_add_rw(dev, 0)) + return 1; + + return 0; +} + +/* + * Fill our private options into the command line + */ +static int act_prep_cmdline(void) +{ + if (!act_options.device_names) { + log_err("act: you need to set IO target(s) with the " + "device-names option.\n"); + return 1; + } + + org_idx = opt_idx; + + do { + char *dev; + + dev = strsep(&act_options.device_names, ","); + if (!dev) + break; + + if (act_add_dev(dev)) { + log_err("act: failed adding device to the mix\n"); + break; + } + } while (1); + + return 0; +} + +static int act_io_u_lat(struct thread_data *td, uint64_t nsec) +{ + struct act_prof_data *apd = td->prof_data; + struct act_slice *slice; + uint64_t usec = nsec / 1000ULL; + int i, ret = 0; + double perm; + + if (act_options.prep) + return 0; + + /* + * Really should not happen, but lets not let jitter at the end + * ruin our day. 
+ */ + if (apd->cur_slice >= apd->nr_slices) + return 0; + + slice = &apd->slices[apd->cur_slice]; + slice->total_ios++; + + for (i = ACT_MAX_CRIT - 1; i >= 0; i--) { + if (usec > act_pass[i].max_usec) { + slice->lat_buckets[i]++; + break; + } + } + + if (time_since_now(&apd->sample_tv) < SAMPLE_SEC) + return 0; + + /* SAMPLE_SEC has passed, check criteria for pass */ + for (i = 0; i < ACT_MAX_CRIT; i++) { + perm = (1000.0 * slice->lat_buckets[i]) / slice->total_ios; + if (perm < act_pass[i].max_perm) + continue; + + log_err("act: %f%% exceeds pass criteria of %f%%\n", perm / 10.0, (double) act_pass[i].max_perm / 10.0); + ret = 1; + break; + } + + fio_gettime(&apd->sample_tv, NULL); + apd->cur_slice++; + return ret; +} + +static void get_act_ref(void) +{ + fio_sem_down(act_run_data->sem); + act_run_data->pending++; + fio_sem_up(act_run_data->sem); +} + +static int show_slice(struct act_slice *slice, unsigned int slice_num) +{ + unsigned int i, failed = 0; + + log_info(" %2u", slice_num); + + for (i = 0; i < ACT_MAX_CRIT; i++) { + double perc = 0.0; + + if (slice->total_ios) + perc = 100.0 * (double) slice->lat_buckets[i] / (double) slice->total_ios; + if ((perc * 10.0) >= act_pass[i].max_perm) + failed++; + log_info("\t%2.2f", perc); + } + for (i = 0; i < ACT_MAX_CRIT; i++) { + double perc = 0.0; + + if (slice->total_ios) + perc = 100.0 * (double) slice->lat_buckets[i] / (double) slice->total_ios; + log_info("\t%2.2f", perc); + } + log_info("\n"); + + return failed; +} + +static void act_show_all_stats(void) +{ + unsigned int i, fails = 0; + + log_info(" trans device\n"); + log_info(" %%>(ms) %%>(ms)\n"); + log_info(" slice"); + + for (i = 0; i < ACT_MAX_CRIT; i++) + log_info("\t %2u", act_pass[i].max_usec / 1000); + for (i = 0; i < ACT_MAX_CRIT; i++) + log_info("\t %2u", act_pass[i].max_usec / 1000); + + log_info("\n"); + log_info(" ----- ----- ----- ------ ----- ----- ------\n"); + + for (i = 0; i < act_run_data->nr_slices; i++) + fails += show_slice(&act_run_data->slices[i], i + 1); + + log_info("\nact: test complete, device(s): %s\n", fails ? 
"FAILED" : "PASSED"); +} + +static void put_act_ref(struct thread_data *td) +{ + struct act_prof_data *apd = td->prof_data; + unsigned int i, slice; + + fio_sem_down(act_run_data->sem); + + if (!act_run_data->slices) { + act_run_data->slices = calloc(apd->nr_slices, sizeof(struct act_slice)); + act_run_data->nr_slices = apd->nr_slices; + } + + for (slice = 0; slice < apd->nr_slices; slice++) { + struct act_slice *dst = &act_run_data->slices[slice]; + struct act_slice *src = &apd->slices[slice]; + + dst->total_ios += src->total_ios; + + for (i = 0; i < ACT_MAX_CRIT; i++) + dst->lat_buckets[i] += src->lat_buckets[i]; + } + + if (!--act_run_data->pending) + act_show_all_stats(); + + fio_sem_up(act_run_data->sem); +} + +static int act_td_init(struct thread_data *td) +{ + struct act_prof_data *apd; + unsigned int nr_slices; + + get_act_ref(); + + apd = calloc(1, sizeof(*apd)); + nr_slices = (act_options.test_duration + SAMPLE_SEC - 1) / SAMPLE_SEC; + apd->slices = calloc(nr_slices, sizeof(struct act_slice)); + apd->nr_slices = nr_slices; + fio_gettime(&apd->sample_tv, NULL); + td->prof_data = apd; + return 0; +} + +static void act_td_exit(struct thread_data *td) +{ + struct act_prof_data *apd = td->prof_data; + + put_act_ref(td); + free(apd->slices); + free(apd); + td->prof_data = NULL; +} + +static struct prof_io_ops act_io_ops = { + .td_init = act_td_init, + .td_exit = act_td_exit, + .io_u_lat = act_io_u_lat, +}; + +static struct profile_ops act_profile = { + .name = "act", + .desc = "ACT Aerospike like benchmark", + .options = options, + .opt_data = &act_options, + .prep_cmd = act_prep_cmdline, + .cmdline = act_opts, + .io_ops = &act_io_ops, +}; + +static void fio_init act_register(void) +{ + act_run_data = calloc(1, sizeof(*act_run_data)); + act_run_data->sem = fio_sem_init(FIO_SEM_UNLOCKED); + + if (register_profile(&act_profile)) + log_err("fio: failed to register profile 'act'\n"); +} + +static void fio_exit act_unregister(void) +{ + while (org_idx && org_idx < opt_idx) + free((void *) act_opts[++org_idx]); + + unregister_profile(&act_profile); + fio_sem_remove(act_run_data->sem); + free(act_run_data->slices); + free(act_run_data); + act_run_data = NULL; +} diff --git a/profiles/tiobench.c b/profiles/tiobench.c new file mode 100644 index 0000000..f19a085 --- /dev/null +++ b/profiles/tiobench.c @@ -0,0 +1,133 @@ +#include "../fio.h" +#include "../profile.h" +#include "../parse.h" +#include "../optgroup.h" + +static unsigned long long size; +static unsigned int loops = 1; +static unsigned int bs = 4096; +static unsigned int nthreads = 1; +static char *dir; + +static char sz_idx[80], bs_idx[80], loop_idx[80], dir_idx[80], t_idx[80]; + +static const char *tb_opts[] = { + "buffered=0", sz_idx, bs_idx, loop_idx, dir_idx, t_idx, + "timeout=600", "group_reporting", "thread", "overwrite=1", + "filename=.fio.tio.1:.fio.tio.2:.fio.tio.3:.fio.tio.4", + "ioengine=sync", + "name=seqwrite", "rw=write", "end_fsync=1", + "name=randwrite", "stonewall", "rw=randwrite", "end_fsync=1", + "name=seqread", "stonewall", "rw=read", + "name=randread", "stonewall", "rw=randread", NULL, +}; + +struct tiobench_options { + unsigned int pad; + unsigned long long size; + unsigned int loops; + unsigned int bs; + unsigned int nthreads; + char *dir; +}; + +static struct tiobench_options tiobench_options; + +static struct fio_option options[] = { + { + .name = "size", + .lname = "Tiobench size", + .type = FIO_OPT_STR_VAL, + .off1 = offsetof(struct tiobench_options, size), + .help = "Size in MiB", + .category = 
FIO_OPT_C_PROFILE,
+		.group	= FIO_OPT_G_TIOBENCH,
+	},
+	{
+		.name	= "block",
+		.lname	= "Tiobench block",
+		.type	= FIO_OPT_INT,
+		.off1	= offsetof(struct tiobench_options, bs),
+		.help	= "Block size in bytes",
+		.def	= "4096",
+		.category = FIO_OPT_C_PROFILE,
+		.group	= FIO_OPT_G_TIOBENCH,
+	},
+	{
+		.name	= "numruns",
+		.lname	= "Tiobench numruns",
+		.type	= FIO_OPT_INT,
+		.off1	= offsetof(struct tiobench_options, loops),
+		.help	= "Number of runs",
+		.category = FIO_OPT_C_PROFILE,
+		.group	= FIO_OPT_G_TIOBENCH,
+	},
+	{
+		.name	= "dir",
+		.lname	= "Tiobench directory",
+		.type	= FIO_OPT_STR_STORE,
+		.off1	= offsetof(struct tiobench_options, dir),
+		.help	= "Test directory",
+		.category = FIO_OPT_C_PROFILE,
+		.group	= FIO_OPT_G_TIOBENCH,
+		.no_free = true,
+	},
+	{
+		.name	= "threads",
+		.lname	= "Tiobench threads",
+		.type	= FIO_OPT_INT,
+		.off1	= offsetof(struct tiobench_options, nthreads),
+		.help	= "Number of Threads",
+		.category = FIO_OPT_C_PROFILE,
+		.group	= FIO_OPT_G_TIOBENCH,
+	},
+	{
+		.name = NULL,
+	},
+};
+
+/*
+ * Fill our private options into the command line
+ */
+static int tb_prep_cmdline(void)
+{
+	/*
+	 * tiobench uses size as MiB, so multiply up
+	 */
+	size *= 1024 * 1024ULL;
+	if (size)
+		sprintf(sz_idx, "size=%llu", size);
+	else
+		strcpy(sz_idx, "size=4*1024*$mb_memory");
+
+	sprintf(bs_idx, "bs=%u", bs);
+	sprintf(loop_idx, "loops=%u", loops);
+
+	if (dir)
+		sprintf(dir_idx, "directory=%s", dir);
+	else
+		sprintf(dir_idx, "directory=./");
+
+	sprintf(t_idx, "numjobs=%u", nthreads);
+	return 0;
+}
+
+static struct profile_ops tiobench_profile = {
+	.name		= "tiobench",
+	.desc		= "tiotest/tiobench benchmark",
+	.prep_cmd	= tb_prep_cmdline,
+	.cmdline	= tb_opts,
+	.options	= options,
+	.opt_data	= &tiobench_options,
+};
+
+static void fio_init tiobench_register(void)
+{
+	if (register_profile(&tiobench_profile))
+		log_err("fio: failed to register profile 'tiobench'\n");
+}
+
+static void fio_exit tiobench_unregister(void)
+{
+	unregister_profile(&tiobench_profile);
+}
diff --git a/pshared.c b/pshared.c
new file mode 100644
index 0000000..2119255
--- /dev/null
+++ b/pshared.c
@@ -0,0 +1,85 @@
+#include <string.h>
+
+#include "log.h"
+#include "pshared.h"
+
+int cond_init_pshared(pthread_cond_t *cond)
+{
+	pthread_condattr_t cattr;
+	int ret;
+
+	ret = pthread_condattr_init(&cattr);
+	if (ret) {
+		log_err("pthread_condattr_init: %s\n", strerror(ret));
+		return ret;
+	}
+
+#ifdef CONFIG_PSHARED
+	ret = pthread_condattr_setpshared(&cattr, PTHREAD_PROCESS_SHARED);
+	if (ret) {
+		log_err("pthread_condattr_setpshared: %s\n", strerror(ret));
+		return ret;
+	}
+#endif
+
+#ifdef CONFIG_PTHREAD_CONDATTR_SETCLOCK
+	ret = pthread_condattr_setclock(&cattr, CLOCK_MONOTONIC);
+	if (ret) {
+		log_err("pthread_condattr_setclock: %s\n", strerror(ret));
+		return ret;
+	}
+#endif
+
+	ret = pthread_cond_init(cond, &cattr);
+	if (ret) {
+		log_err("pthread_cond_init: %s\n", strerror(ret));
+		return ret;
+	}
+
+	return 0;
+}
+
+int mutex_init_pshared(pthread_mutex_t *mutex)
+{
+	pthread_mutexattr_t mattr;
+	int ret;
+
+	ret = pthread_mutexattr_init(&mattr);
+	if (ret) {
+		log_err("pthread_mutexattr_init: %s\n", strerror(ret));
+		return ret;
+	}
+
+	/*
+	 * Not all platforms support process shared mutexes (FreeBSD)
+	 */
+#ifdef CONFIG_PSHARED
+	ret = pthread_mutexattr_setpshared(&mattr, PTHREAD_PROCESS_SHARED);
+	if (ret) {
+		log_err("pthread_mutexattr_setpshared: %s\n", strerror(ret));
+		return ret;
+	}
+#endif
+	ret = pthread_mutex_init(mutex, &mattr);
+	if (ret) {
+		log_err("pthread_mutex_init: %s\n", strerror(ret));
+		return ret;
+	}
+
+	return 0;
+}
+
+int mutex_cond_init_pshared(pthread_mutex_t *mutex, pthread_cond_t *cond)
+{
+	int ret;
+
+	ret = mutex_init_pshared(mutex);
+	if (ret)
+		return ret;
+
+	ret = cond_init_pshared(cond);
+	if (ret)
+		return ret;
+
+	return 0;
+}
diff --git a/pshared.h b/pshared.h
new file mode 100644
index 0000000..a58df6f
--- /dev/null
+++ b/pshared.h
@@ -0,0 +1,10 @@
+#ifndef FIO_PSHARED_H
+#define FIO_PSHARED_H
+
+#include <pthread.h>
+
+extern int mutex_init_pshared(pthread_mutex_t *);
+extern int cond_init_pshared(pthread_cond_t *);
+extern int mutex_cond_init_pshared(pthread_mutex_t *, pthread_cond_t *);
+
+#endif
diff --git a/rate-submit.c b/rate-submit.c
new file mode 100644
index 0000000..cf00d9b
--- /dev/null
+++ b/rate-submit.c
@@ -0,0 +1,295 @@
+/*
+ * Rated submission helpers
+ *
+ * Copyright (C) 2015 Jens Axboe <axboe@kernel.dk>
+ *
+ */
+#include "fio.h"
+#include "ioengines.h"
+#include "lib/getrusage.h"
+#include "rate-submit.h"
+
+static void check_overlap(struct io_u *io_u)
+{
+	int i;
+	struct thread_data *td;
+	bool overlap = false;
+
+	do {
+		/*
+		 * Allow only one thread to check for overlap at a
+		 * time to prevent two threads from thinking the coast
+		 * is clear and then submitting IOs that overlap with
+		 * each other
+		 *
+		 * If an overlap is found, release the lock and
+		 * re-acquire it before checking again to give other
+		 * threads a chance to make progress
+		 *
+		 * If an overlap is not found, release the lock when the
+		 * io_u's IO_U_F_FLIGHT flag is set so that this io_u
+		 * can be checked by other threads as they assess overlap
+		 */
+		pthread_mutex_lock(&overlap_check);
+		for_each_td(td, i) {
+			if (td->runstate <= TD_SETTING_UP ||
+			    td->runstate >= TD_FINISHING ||
+			    !td->o.serialize_overlap ||
+			    td->o.io_submit_mode != IO_MODE_OFFLOAD)
+				continue;
+
+			overlap = in_flight_overlap(&td->io_u_all, io_u);
+			if (overlap) {
+				pthread_mutex_unlock(&overlap_check);
+				break;
+			}
+		}
+	} while (overlap);
+}
+
+static int io_workqueue_fn(struct submit_worker *sw,
+			   struct workqueue_work *work)
+{
+	struct io_u *io_u = container_of(work, struct io_u, work);
+	const enum fio_ddir ddir = io_u->ddir;
+	struct thread_data *td = sw->priv;
+	int ret, error;
+
+	if (td->o.serialize_overlap)
+		check_overlap(io_u);
+
+	dprint(FD_RATE, "io_u %p queued by %u\n", io_u, gettid());
+
+	io_u_set(td, io_u, IO_U_F_NO_FILE_PUT);
+
+	td->cur_depth++;
+
+	do {
+		ret = td_io_queue(td, io_u);
+		if (ret != FIO_Q_BUSY)
+			break;
+		ret = io_u_queued_complete(td, 1);
+		if (ret > 0)
+			td->cur_depth -= ret;
+		else if (ret < 0)
+			break;
+		io_u_clear(td, io_u, IO_U_F_FLIGHT);
+	} while (1);
+
+	dprint(FD_RATE, "io_u %p ret %d by %u\n", io_u, ret, gettid());
+
+	error = io_queue_event(td, io_u, &ret, ddir, NULL, 0, NULL);
+
+	if (ret == FIO_Q_COMPLETED)
+		td->cur_depth--;
+	else if (ret == FIO_Q_QUEUED) {
+		unsigned int min_evts;
+
+		if (td->o.iodepth == 1)
+			min_evts = 1;
+		else
+			min_evts = 0;
+
+		ret = io_u_queued_complete(td, min_evts);
+		if (ret > 0)
+			td->cur_depth -= ret;
+	}
+
+	if (error || td->error)
+		pthread_cond_signal(&td->parent->free_cond);
+
+	return 0;
+}
+
+static bool io_workqueue_pre_sleep_flush_fn(struct submit_worker *sw)
+{
+	struct thread_data *td = sw->priv;
+
+	if (td->error)
+		return false;
+	if (td->io_u_queued || td->cur_depth || td->io_u_in_flight)
+		return true;
+
+	return false;
+}
+
+static void io_workqueue_pre_sleep_fn(struct submit_worker *sw)
+{
+	struct thread_data *td = sw->priv;
+	int ret;
+
+	ret = io_u_quiesce(td);
+	if (ret > 0)
+		td->cur_depth -= ret;
+}
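+
+/*
+ * Worker lifecycle for the offload path: ->alloc_worker_fn allocates a
+ * shadow thread_data, ->init_worker_fn clones the parent's options and
+ * files and loads the ioengine, ->fn services one queued io_u at a
+ * time, and ->exit_worker_fn folds the worker's stats back into the
+ * parent.
+ */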
+ +static int io_workqueue_alloc_fn(struct submit_worker *sw) +{ + struct thread_data *td; + + td = calloc(1, sizeof(*td)); + sw->priv = td; + return 0; +} + +static void io_workqueue_free_fn(struct submit_worker *sw) +{ + free(sw->priv); + sw->priv = NULL; +} + +static int io_workqueue_init_worker_fn(struct submit_worker *sw) +{ + struct thread_data *parent = sw->wq->td; + struct thread_data *td = sw->priv; + + memcpy(&td->o, &parent->o, sizeof(td->o)); + memcpy(&td->ts, &parent->ts, sizeof(td->ts)); + td->o.uid = td->o.gid = -1U; + dup_files(td, parent); + td->eo = parent->eo; + fio_options_mem_dupe(td); + + if (ioengine_load(td)) + goto err; + + td->pid = gettid(); + + INIT_FLIST_HEAD(&td->io_log_list); + INIT_FLIST_HEAD(&td->io_hist_list); + INIT_FLIST_HEAD(&td->verify_list); + INIT_FLIST_HEAD(&td->trim_list); + td->io_hist_tree = RB_ROOT; + + td->o.iodepth = 1; + if (td_io_init(td)) + goto err_io_init; + + if (td->io_ops->post_init && td->io_ops->post_init(td)) + goto err_io_init; + + set_epoch_time(td, td->o.log_unix_epoch); + fio_getrusage(&td->ru_start); + clear_io_state(td, 1); + + td_set_runstate(td, TD_RUNNING); + td->flags |= TD_F_CHILD | TD_F_NEED_LOCK; + td->parent = parent; + return 0; + +err_io_init: + close_ioengine(td); +err: + return 1; + +} + +static void io_workqueue_exit_worker_fn(struct submit_worker *sw, + unsigned int *sum_cnt) +{ + struct thread_data *td = sw->priv; + + (*sum_cnt)++; + sum_thread_stats(&sw->wq->td->ts, &td->ts, *sum_cnt == 1); + + fio_options_free(td); + close_and_free_files(td); + if (td->io_ops) + close_ioengine(td); + td_set_runstate(td, TD_EXITED); +} + +#ifdef CONFIG_SFAA +static void sum_val(uint64_t *dst, uint64_t *src) +{ + if (*src) { + __sync_fetch_and_add(dst, *src); + *src = 0; + } +} +#else +static void sum_val(uint64_t *dst, uint64_t *src) +{ + if (*src) { + *dst += *src; + *src = 0; + } +} +#endif + +static void pthread_double_unlock(pthread_mutex_t *lock1, + pthread_mutex_t *lock2) +{ +#ifndef CONFIG_SFAA + pthread_mutex_unlock(lock1); + pthread_mutex_unlock(lock2); +#endif +} + +static void pthread_double_lock(pthread_mutex_t *lock1, pthread_mutex_t *lock2) +{ +#ifndef CONFIG_SFAA + if (lock1 < lock2) { + pthread_mutex_lock(lock1); + pthread_mutex_lock(lock2); + } else { + pthread_mutex_lock(lock2); + pthread_mutex_lock(lock1); + } +#endif +} + +static void sum_ddir(struct thread_data *dst, struct thread_data *src, + enum fio_ddir ddir) +{ + pthread_double_lock(&dst->io_wq.stat_lock, &src->io_wq.stat_lock); + + sum_val(&dst->io_bytes[ddir], &src->io_bytes[ddir]); + sum_val(&dst->io_blocks[ddir], &src->io_blocks[ddir]); + sum_val(&dst->this_io_blocks[ddir], &src->this_io_blocks[ddir]); + sum_val(&dst->this_io_bytes[ddir], &src->this_io_bytes[ddir]); + sum_val(&dst->bytes_done[ddir], &src->bytes_done[ddir]); + + pthread_double_unlock(&dst->io_wq.stat_lock, &src->io_wq.stat_lock); +} + +static void io_workqueue_update_acct_fn(struct submit_worker *sw) +{ + struct thread_data *src = sw->priv; + struct thread_data *dst = sw->wq->td; + + if (td_read(src)) + sum_ddir(dst, src, DDIR_READ); + if (td_write(src)) + sum_ddir(dst, src, DDIR_WRITE); + if (td_trim(src)) + sum_ddir(dst, src, DDIR_TRIM); + +} + +static struct workqueue_ops rated_wq_ops = { + .fn = io_workqueue_fn, + .pre_sleep_flush_fn = io_workqueue_pre_sleep_flush_fn, + .pre_sleep_fn = io_workqueue_pre_sleep_fn, + .update_acct_fn = io_workqueue_update_acct_fn, + .alloc_worker_fn = io_workqueue_alloc_fn, + .free_worker_fn = io_workqueue_free_fn, + .init_worker_fn = 
io_workqueue_init_worker_fn,
+	.exit_worker_fn		= io_workqueue_exit_worker_fn,
+};
+
+int rate_submit_init(struct thread_data *td, struct sk_out *sk_out)
+{
+	if (td->o.io_submit_mode != IO_MODE_OFFLOAD)
+		return 0;
+
+	return workqueue_init(td, &td->io_wq, &rated_wq_ops, td->o.iodepth, sk_out);
+}
+
+void rate_submit_exit(struct thread_data *td)
+{
+	if (td->o.io_submit_mode != IO_MODE_OFFLOAD)
+		return;
+
+	workqueue_exit(&td->io_wq);
+}
diff --git a/rate-submit.h b/rate-submit.h
new file mode 100644
index 0000000..19fde3a
--- /dev/null
+++ b/rate-submit.h
@@ -0,0 +1,7 @@
+#ifndef FIO_RATE_SUBMIT
+#define FIO_RATE_SUBMIT
+
+int rate_submit_init(struct thread_data *, struct sk_out *);
+void rate_submit_exit(struct thread_data *);
+
+#endif
diff --git a/rwlock.c b/rwlock.c
new file mode 100644
index 0000000..00e3809
--- /dev/null
+++ b/rwlock.c
@@ -0,0 +1,83 @@
+#include <assert.h>
+#include <stdio.h>
+#include <string.h>
+#include <sys/mman.h>
+
+#include "log.h"
+#include "rwlock.h"
+#include "os/os.h"
+
+void fio_rwlock_write(struct fio_rwlock *lock)
+{
+	assert(lock->magic == FIO_RWLOCK_MAGIC);
+	pthread_rwlock_wrlock(&lock->lock);
+}
+
+void fio_rwlock_read(struct fio_rwlock *lock)
+{
+	assert(lock->magic == FIO_RWLOCK_MAGIC);
+	pthread_rwlock_rdlock(&lock->lock);
+}
+
+void fio_rwlock_unlock(struct fio_rwlock *lock)
+{
+	assert(lock->magic == FIO_RWLOCK_MAGIC);
+	pthread_rwlock_unlock(&lock->lock);
+}
+
+void fio_rwlock_remove(struct fio_rwlock *lock)
+{
+	assert(lock->magic == FIO_RWLOCK_MAGIC);
+	pthread_rwlock_destroy(&lock->lock);
+	munmap((void *) lock, sizeof(*lock));
+}
+
+struct fio_rwlock *fio_rwlock_init(void)
+{
+	struct fio_rwlock *lock;
+	pthread_rwlockattr_t attr;
+	int ret;
+
+	lock = (void *) mmap(NULL, sizeof(struct fio_rwlock),
+				PROT_READ | PROT_WRITE,
+				OS_MAP_ANON | MAP_SHARED, -1, 0);
+	if (lock == MAP_FAILED) {
+		perror("mmap rwlock");
+		lock = NULL;
+		goto err;
+	}
+
+	lock->magic = FIO_RWLOCK_MAGIC;
+
+	ret = pthread_rwlockattr_init(&attr);
+	if (ret) {
+		log_err("pthread_rwlockattr_init: %s\n", strerror(ret));
+		goto err;
+	}
+#ifdef CONFIG_PSHARED
+	ret = pthread_rwlockattr_setpshared(&attr, PTHREAD_PROCESS_SHARED);
+	if (ret) {
+		log_err("pthread_rwlockattr_setpshared: %s\n", strerror(ret));
+		goto destroy_attr;
+	}
+
+	ret = pthread_rwlock_init(&lock->lock, &attr);
+#else
+	ret = pthread_rwlock_init(&lock->lock, NULL);
+#endif
+
+	if (ret) {
+		log_err("pthread_rwlock_init: %s\n", strerror(ret));
+		goto destroy_attr;
+	}
+
+	pthread_rwlockattr_destroy(&attr);
+
+	return lock;
+destroy_attr:
+	pthread_rwlockattr_destroy(&attr);
+err:
+	if (lock)
+		fio_rwlock_remove(lock);
+	return NULL;
+}
diff --git a/rwlock.h b/rwlock.h
new file mode 100644
index 0000000..2968eed
--- /dev/null
+++ b/rwlock.h
@@ -0,0 +1,19 @@
+#ifndef FIO_RWLOCK_H
+#define FIO_RWLOCK_H
+
+#include <pthread.h>
+
+#define FIO_RWLOCK_MAGIC	0x52574c4fU
+
+struct fio_rwlock {
+	pthread_rwlock_t lock;
+	int magic;
+};
+
+extern void fio_rwlock_read(struct fio_rwlock *);
+extern void fio_rwlock_write(struct fio_rwlock *);
+extern void fio_rwlock_unlock(struct fio_rwlock *);
+extern struct fio_rwlock *fio_rwlock_init(void);
+extern void fio_rwlock_remove(struct fio_rwlock *);
+
+#endif
diff --git a/server.c b/server.c
new file mode 100644
index 0000000..248a2d4
--- /dev/null
+++ b/server.c
@@ -0,0 +1,2616 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <errno.h>
+#include <sys/poll.h>
+#include <sys/types.h>
+#include <sys/wait.h>
+#include <sys/socket.h>
+#include <sys/stat.h>
+#include <sys/un.h>
+#include <sys/uio.h>
+#include <netinet/in.h>
+#include <arpa/inet.h>
+#include <netdb.h>
+#include <syslog.h>
+#include <signal.h>
+#ifdef CONFIG_ZLIB
+#include <zlib.h>
+#endif
+
+#include "fio.h"
+#include "options.h"
+#include 
"server.h" +#include "crc/crc16.h" +#include "lib/ieee754.h" +#include "verify-state.h" +#include "smalloc.h" + +int fio_net_port = FIO_NET_PORT; + +bool exit_backend = false; + +enum { + SK_F_FREE = 1, + SK_F_COPY = 2, + SK_F_SIMPLE = 4, + SK_F_VEC = 8, + SK_F_INLINE = 16, +}; + +struct sk_entry { + struct flist_head list; /* link on sk_out->list */ + int flags; /* SK_F_* */ + int opcode; /* Actual command fields */ + void *buf; + off_t size; + uint64_t tag; + struct flist_head next; /* Other sk_entry's, if linked command */ +}; + +static char *fio_server_arg; +static char *bind_sock; +static struct sockaddr_in saddr_in; +static struct sockaddr_in6 saddr_in6; +static int use_ipv6; +#ifdef CONFIG_ZLIB +static unsigned int has_zlib = 1; +#else +static unsigned int has_zlib = 0; +#endif +static unsigned int use_zlib; +static char me[128]; + +static pthread_key_t sk_out_key; + +struct fio_fork_item { + struct flist_head list; + int exitval; + int signal; + int exited; + pid_t pid; +}; + +struct cmd_reply { + struct fio_sem lock; + void *data; + size_t size; + int error; +}; + +static const char *fio_server_ops[FIO_NET_CMD_NR] = { + "", + "QUIT", + "EXIT", + "JOB", + "JOBLINE", + "TEXT", + "TS", + "GS", + "SEND_ETA", + "ETA", + "PROBE", + "START", + "STOP", + "DISK_UTIL", + "SERVER_START", + "ADD_JOB", + "RUN", + "IOLOG", + "UPDATE_JOB", + "LOAD_FILE", + "VTRIGGER", + "SENDFILE", + "JOB_OPT", +}; + +static void sk_lock(struct sk_out *sk_out) +{ + fio_sem_down(&sk_out->lock); +} + +static void sk_unlock(struct sk_out *sk_out) +{ + fio_sem_up(&sk_out->lock); +} + +void sk_out_assign(struct sk_out *sk_out) +{ + if (!sk_out) + return; + + sk_lock(sk_out); + sk_out->refs++; + sk_unlock(sk_out); + pthread_setspecific(sk_out_key, sk_out); +} + +static void sk_out_free(struct sk_out *sk_out) +{ + __fio_sem_remove(&sk_out->lock); + __fio_sem_remove(&sk_out->wait); + __fio_sem_remove(&sk_out->xmit); + sfree(sk_out); +} + +static int __sk_out_drop(struct sk_out *sk_out) +{ + if (sk_out) { + int refs; + + sk_lock(sk_out); + assert(sk_out->refs != 0); + refs = --sk_out->refs; + sk_unlock(sk_out); + + if (!refs) { + sk_out_free(sk_out); + pthread_setspecific(sk_out_key, NULL); + return 0; + } + } + + return 1; +} + +void sk_out_drop(void) +{ + struct sk_out *sk_out; + + sk_out = pthread_getspecific(sk_out_key); + __sk_out_drop(sk_out); +} + +static void __fio_init_net_cmd(struct fio_net_cmd *cmd, uint16_t opcode, + uint32_t pdu_len, uint64_t tag) +{ + memset(cmd, 0, sizeof(*cmd)); + + cmd->version = __cpu_to_le16(FIO_SERVER_VER); + cmd->opcode = cpu_to_le16(opcode); + cmd->tag = cpu_to_le64(tag); + cmd->pdu_len = cpu_to_le32(pdu_len); +} + + +static void fio_init_net_cmd(struct fio_net_cmd *cmd, uint16_t opcode, + const void *pdu, uint32_t pdu_len, uint64_t tag) +{ + __fio_init_net_cmd(cmd, opcode, pdu_len, tag); + + if (pdu) + memcpy(&cmd->payload, pdu, pdu_len); +} + +const char *fio_server_op(unsigned int op) +{ + static char buf[32]; + + if (op < FIO_NET_CMD_NR) + return fio_server_ops[op]; + + sprintf(buf, "UNKNOWN/%d", op); + return buf; +} + +static ssize_t iov_total_len(const struct iovec *iov, int count) +{ + ssize_t ret = 0; + + while (count--) { + ret += iov->iov_len; + iov++; + } + + return ret; +} + +static int fio_sendv_data(int sk, struct iovec *iov, int count) +{ + ssize_t total_len = iov_total_len(iov, count); + ssize_t ret; + + do { + ret = writev(sk, iov, count); + if (ret > 0) { + total_len -= ret; + if (!total_len) + break; + + while (ret) { + if (ret >= iov->iov_len) { + ret -= 
iov->iov_len; + iov++; + continue; + } + iov->iov_base += ret; + iov->iov_len -= ret; + ret = 0; + } + } else if (!ret) + break; + else if (errno == EAGAIN || errno == EINTR) + continue; + else + break; + } while (!exit_backend); + + if (!total_len) + return 0; + + return 1; +} + +static int fio_send_data(int sk, const void *p, unsigned int len) +{ + struct iovec iov = { .iov_base = (void *) p, .iov_len = len }; + + assert(len <= sizeof(struct fio_net_cmd) + FIO_SERVER_MAX_FRAGMENT_PDU); + + return fio_sendv_data(sk, &iov, 1); +} + +static int fio_recv_data(int sk, void *buf, unsigned int len, bool wait) +{ + int flags; + char *p = buf; + + if (wait) + flags = MSG_WAITALL; + else + flags = OS_MSG_DONTWAIT; + + do { + int ret = recv(sk, p, len, flags); + + if (ret > 0) { + len -= ret; + if (!len) + break; + p += ret; + continue; + } else if (!ret) + break; + else if (errno == EAGAIN || errno == EINTR) { + if (wait) + continue; + break; + } else + break; + } while (!exit_backend); + + if (!len) + return 0; + + return -1; +} + +static int verify_convert_cmd(struct fio_net_cmd *cmd) +{ + uint16_t crc; + + cmd->cmd_crc16 = le16_to_cpu(cmd->cmd_crc16); + cmd->pdu_crc16 = le16_to_cpu(cmd->pdu_crc16); + + crc = fio_crc16(cmd, FIO_NET_CMD_CRC_SZ); + if (crc != cmd->cmd_crc16) { + log_err("fio: server bad crc on command (got %x, wanted %x)\n", + cmd->cmd_crc16, crc); + fprintf(f_err, "fio: server bad crc on command (got %x, wanted %x)\n", + cmd->cmd_crc16, crc); + return 1; + } + + cmd->version = le16_to_cpu(cmd->version); + cmd->opcode = le16_to_cpu(cmd->opcode); + cmd->flags = le32_to_cpu(cmd->flags); + cmd->tag = le64_to_cpu(cmd->tag); + cmd->pdu_len = le32_to_cpu(cmd->pdu_len); + + switch (cmd->version) { + case FIO_SERVER_VER: + break; + default: + log_err("fio: bad server cmd version %d\n", cmd->version); + fprintf(f_err, "fio: client/server version mismatch (%d != %d)\n", + cmd->version, FIO_SERVER_VER); + return 1; + } + + if (cmd->pdu_len > FIO_SERVER_MAX_FRAGMENT_PDU) { + log_err("fio: command payload too large: %u\n", cmd->pdu_len); + return 1; + } + + return 0; +} + +/* + * Read (and defragment, if necessary) incoming commands + */ +struct fio_net_cmd *fio_net_recv_cmd(int sk, bool wait) +{ + struct fio_net_cmd cmd, *tmp, *cmdret = NULL; + size_t cmd_size = 0, pdu_offset = 0; + uint16_t crc; + int ret, first = 1; + void *pdu = NULL; + + do { + ret = fio_recv_data(sk, &cmd, sizeof(cmd), wait); + if (ret) + break; + + /* We have a command, verify it and swap if need be */ + ret = verify_convert_cmd(&cmd); + if (ret) + break; + + if (first) { + /* if this is text, add room for \0 at the end */ + cmd_size = sizeof(cmd) + cmd.pdu_len + 1; + assert(!cmdret); + } else + cmd_size += cmd.pdu_len; + + if (cmd_size / 1024 > FIO_SERVER_MAX_CMD_MB * 1024) { + log_err("fio: cmd+pdu too large (%llu)\n", (unsigned long long) cmd_size); + ret = 1; + break; + } + + tmp = realloc(cmdret, cmd_size); + if (!tmp) { + log_err("fio: server failed allocating cmd\n"); + ret = 1; + break; + } + cmdret = tmp; + + if (first) + memcpy(cmdret, &cmd, sizeof(cmd)); + else if (cmdret->opcode != cmd.opcode) { + log_err("fio: fragment opcode mismatch (%d != %d)\n", + cmdret->opcode, cmd.opcode); + ret = 1; + break; + } + + if (!cmd.pdu_len) + break; + + /* There's payload, get it */ + pdu = (char *) cmdret->payload + pdu_offset; + ret = fio_recv_data(sk, pdu, cmd.pdu_len, wait); + if (ret) + break; + + /* Verify payload crc */ + crc = fio_crc16(pdu, cmd.pdu_len); + if (crc != cmd.pdu_crc16) { + log_err("fio: server bad crc 
on payload "); + log_err("(got %x, wanted %x)\n", cmd.pdu_crc16, crc); + ret = 1; + break; + } + + pdu_offset += cmd.pdu_len; + if (!first) + cmdret->pdu_len += cmd.pdu_len; + first = 0; + } while (cmd.flags & FIO_NET_CMD_F_MORE); + + if (ret) { + free(cmdret); + cmdret = NULL; + } else if (cmdret) { + /* zero-terminate text input */ + if (cmdret->pdu_len) { + if (cmdret->opcode == FIO_NET_CMD_TEXT) { + struct cmd_text_pdu *__pdu = (struct cmd_text_pdu *) cmdret->payload; + char *buf = (char *) __pdu->buf; + + buf[__pdu->buf_len] = '\0'; + } else if (cmdret->opcode == FIO_NET_CMD_JOB) { + struct cmd_job_pdu *__pdu = (struct cmd_job_pdu *) cmdret->payload; + char *buf = (char *) __pdu->buf; + int len = le32_to_cpu(__pdu->buf_len); + + buf[len] = '\0'; + } + } + + /* frag flag is internal */ + cmdret->flags &= ~FIO_NET_CMD_F_MORE; + } + + return cmdret; +} + +static void add_reply(uint64_t tag, struct flist_head *list) +{ + struct fio_net_cmd_reply *reply; + + reply = (struct fio_net_cmd_reply *) (uintptr_t) tag; + flist_add_tail(&reply->list, list); +} + +static uint64_t alloc_reply(uint64_t tag, uint16_t opcode) +{ + struct fio_net_cmd_reply *reply; + + reply = calloc(1, sizeof(*reply)); + INIT_FLIST_HEAD(&reply->list); + fio_gettime(&reply->ts, NULL); + reply->saved_tag = tag; + reply->opcode = opcode; + + return (uintptr_t) reply; +} + +static void free_reply(uint64_t tag) +{ + struct fio_net_cmd_reply *reply; + + reply = (struct fio_net_cmd_reply *) (uintptr_t) tag; + free(reply); +} + +static void fio_net_cmd_crc_pdu(struct fio_net_cmd *cmd, const void *pdu) +{ + uint32_t pdu_len; + + cmd->cmd_crc16 = __cpu_to_le16(fio_crc16(cmd, FIO_NET_CMD_CRC_SZ)); + + pdu_len = le32_to_cpu(cmd->pdu_len); + cmd->pdu_crc16 = __cpu_to_le16(fio_crc16(pdu, pdu_len)); +} + +static void fio_net_cmd_crc(struct fio_net_cmd *cmd) +{ + fio_net_cmd_crc_pdu(cmd, cmd->payload); +} + +int fio_net_send_cmd(int fd, uint16_t opcode, const void *buf, off_t size, + uint64_t *tagptr, struct flist_head *list) +{ + struct fio_net_cmd *cmd = NULL; + size_t this_len, cur_len = 0; + uint64_t tag; + int ret; + + if (list) { + assert(tagptr); + tag = *tagptr = alloc_reply(*tagptr, opcode); + } else + tag = tagptr ? 
*tagptr : 0; + + do { + this_len = size; + if (this_len > FIO_SERVER_MAX_FRAGMENT_PDU) + this_len = FIO_SERVER_MAX_FRAGMENT_PDU; + + if (!cmd || cur_len < sizeof(*cmd) + this_len) { + if (cmd) + free(cmd); + + cur_len = sizeof(*cmd) + this_len; + cmd = malloc(cur_len); + } + + fio_init_net_cmd(cmd, opcode, buf, this_len, tag); + + if (this_len < size) + cmd->flags = __cpu_to_le32(FIO_NET_CMD_F_MORE); + + fio_net_cmd_crc(cmd); + + ret = fio_send_data(fd, cmd, sizeof(*cmd) + this_len); + size -= this_len; + buf += this_len; + } while (!ret && size); + + if (list) { + if (ret) + free_reply(tag); + else + add_reply(tag, list); + } + + if (cmd) + free(cmd); + + return ret; +} + +static struct sk_entry *fio_net_prep_cmd(uint16_t opcode, void *buf, + size_t size, uint64_t *tagptr, + int flags) +{ + struct sk_entry *entry; + + entry = smalloc(sizeof(*entry)); + if (!entry) + return NULL; + + INIT_FLIST_HEAD(&entry->next); + entry->opcode = opcode; + if (flags & SK_F_COPY) { + entry->buf = smalloc(size); + memcpy(entry->buf, buf, size); + } else + entry->buf = buf; + + entry->size = size; + if (tagptr) + entry->tag = *tagptr; + else + entry->tag = 0; + entry->flags = flags; + return entry; +} + +static int handle_sk_entry(struct sk_out *sk_out, struct sk_entry *entry); + +static void fio_net_queue_entry(struct sk_entry *entry) +{ + struct sk_out *sk_out = pthread_getspecific(sk_out_key); + + if (entry->flags & SK_F_INLINE) + handle_sk_entry(sk_out, entry); + else { + sk_lock(sk_out); + flist_add_tail(&entry->list, &sk_out->list); + sk_unlock(sk_out); + + fio_sem_up(&sk_out->wait); + } +} + +static int fio_net_queue_cmd(uint16_t opcode, void *buf, off_t size, + uint64_t *tagptr, int flags) +{ + struct sk_entry *entry; + + entry = fio_net_prep_cmd(opcode, buf, size, tagptr, flags); + if (entry) { + fio_net_queue_entry(entry); + return 0; + } + + return 1; +} + +static int fio_net_send_simple_stack_cmd(int sk, uint16_t opcode, uint64_t tag) +{ + struct fio_net_cmd cmd; + + fio_init_net_cmd(&cmd, opcode, NULL, 0, tag); + fio_net_cmd_crc(&cmd); + + return fio_send_data(sk, &cmd, sizeof(cmd)); +} + +/* + * If 'list' is non-NULL, then allocate and store the sent command for + * later verification. 
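+ * The tag carried by such a command is really the address of the
+ * allocated fio_net_cmd_reply, so the eventual reply can be matched
+ * straight back to it.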
+ */ +int fio_net_send_simple_cmd(int sk, uint16_t opcode, uint64_t tag, + struct flist_head *list) +{ + int ret; + + if (list) + tag = alloc_reply(tag, opcode); + + ret = fio_net_send_simple_stack_cmd(sk, opcode, tag); + if (ret) { + if (list) + free_reply(tag); + + return ret; + } + + if (list) + add_reply(tag, list); + + return 0; +} + +static int fio_net_queue_quit(void) +{ + dprint(FD_NET, "server: sending quit\n"); + + return fio_net_queue_cmd(FIO_NET_CMD_QUIT, NULL, 0, NULL, SK_F_SIMPLE); +} + +int fio_net_send_quit(int sk) +{ + dprint(FD_NET, "server: sending quit\n"); + + return fio_net_send_simple_cmd(sk, FIO_NET_CMD_QUIT, 0, NULL); +} + +static int fio_net_send_ack(struct fio_net_cmd *cmd, int error, int signal) +{ + struct cmd_end_pdu epdu; + uint64_t tag = 0; + + if (cmd) + tag = cmd->tag; + + epdu.error = __cpu_to_le32(error); + epdu.signal = __cpu_to_le32(signal); + return fio_net_queue_cmd(FIO_NET_CMD_STOP, &epdu, sizeof(epdu), &tag, SK_F_COPY); +} + +static int fio_net_queue_stop(int error, int signal) +{ + dprint(FD_NET, "server: sending stop (%d, %d)\n", error, signal); + return fio_net_send_ack(NULL, error, signal); +} + +static void fio_server_add_fork_item(pid_t pid, struct flist_head *list) +{ + struct fio_fork_item *ffi; + + ffi = malloc(sizeof(*ffi)); + ffi->exitval = 0; + ffi->signal = 0; + ffi->exited = 0; + ffi->pid = pid; + flist_add_tail(&ffi->list, list); +} + +static void fio_server_add_conn_pid(struct flist_head *conn_list, pid_t pid) +{ + dprint(FD_NET, "server: forked off connection job (pid=%u)\n", (int) pid); + fio_server_add_fork_item(pid, conn_list); +} + +static void fio_server_add_job_pid(struct flist_head *job_list, pid_t pid) +{ + dprint(FD_NET, "server: forked off job job (pid=%u)\n", (int) pid); + fio_server_add_fork_item(pid, job_list); +} + +static void fio_server_check_fork_item(struct fio_fork_item *ffi) +{ + int ret, status; + + ret = waitpid(ffi->pid, &status, WNOHANG); + if (ret < 0) { + if (errno == ECHILD) { + log_err("fio: connection pid %u disappeared\n", (int) ffi->pid); + ffi->exited = 1; + } else + log_err("fio: waitpid: %s\n", strerror(errno)); + } else if (ret == ffi->pid) { + if (WIFSIGNALED(status)) { + ffi->signal = WTERMSIG(status); + ffi->exited = 1; + } + if (WIFEXITED(status)) { + if (WEXITSTATUS(status)) + ffi->exitval = WEXITSTATUS(status); + ffi->exited = 1; + } + } +} + +static void fio_server_fork_item_done(struct fio_fork_item *ffi, bool stop) +{ + dprint(FD_NET, "pid %u exited, sig=%u, exitval=%d\n", (int) ffi->pid, ffi->signal, ffi->exitval); + + /* + * Fold STOP and QUIT... 
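+	 * (i.e. queue a STOP carrying the exit status and signal, followed
+	 * by a QUIT to tear the connection down.)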
+ */ + if (stop) { + fio_net_queue_stop(ffi->exitval, ffi->signal); + fio_net_queue_quit(); + } + + flist_del(&ffi->list); + free(ffi); +} + +static void fio_server_check_fork_items(struct flist_head *list, bool stop) +{ + struct flist_head *entry, *tmp; + struct fio_fork_item *ffi; + + flist_for_each_safe(entry, tmp, list) { + ffi = flist_entry(entry, struct fio_fork_item, list); + + fio_server_check_fork_item(ffi); + + if (ffi->exited) + fio_server_fork_item_done(ffi, stop); + } +} + +static void fio_server_check_jobs(struct flist_head *job_list) +{ + fio_server_check_fork_items(job_list, true); +} + +static void fio_server_check_conns(struct flist_head *conn_list) +{ + fio_server_check_fork_items(conn_list, false); +} + +static int handle_load_file_cmd(struct fio_net_cmd *cmd) +{ + struct cmd_load_file_pdu *pdu = (struct cmd_load_file_pdu *) cmd->payload; + void *file_name = pdu->file; + struct cmd_start_pdu spdu; + + dprint(FD_NET, "server: loading local file %s\n", (char *) file_name); + + pdu->name_len = le16_to_cpu(pdu->name_len); + pdu->client_type = le16_to_cpu(pdu->client_type); + + if (parse_jobs_ini(file_name, 0, 0, pdu->client_type)) { + fio_net_queue_quit(); + return -1; + } + + spdu.jobs = cpu_to_le32(thread_number); + spdu.stat_outputs = cpu_to_le32(stat_number); + fio_net_queue_cmd(FIO_NET_CMD_START, &spdu, sizeof(spdu), NULL, SK_F_COPY); + return 0; +} + +static int handle_run_cmd(struct sk_out *sk_out, struct flist_head *job_list, + struct fio_net_cmd *cmd) +{ + pid_t pid; + int ret; + + sk_out_assign(sk_out); + + fio_time_init(); + set_genesis_time(); + + pid = fork(); + if (pid) { + fio_server_add_job_pid(job_list, pid); + return 0; + } + + ret = fio_backend(sk_out); + free_threads_shm(); + sk_out_drop(); + _exit(ret); +} + +static int handle_job_cmd(struct fio_net_cmd *cmd) +{ + struct cmd_job_pdu *pdu = (struct cmd_job_pdu *) cmd->payload; + void *buf = pdu->buf; + struct cmd_start_pdu spdu; + + pdu->buf_len = le32_to_cpu(pdu->buf_len); + pdu->client_type = le32_to_cpu(pdu->client_type); + + if (parse_jobs_ini(buf, 1, 0, pdu->client_type)) { + fio_net_queue_quit(); + return -1; + } + + spdu.jobs = cpu_to_le32(thread_number); + spdu.stat_outputs = cpu_to_le32(stat_number); + + fio_net_queue_cmd(FIO_NET_CMD_START, &spdu, sizeof(spdu), NULL, SK_F_COPY); + return 0; +} + +static int handle_jobline_cmd(struct fio_net_cmd *cmd) +{ + void *pdu = cmd->payload; + struct cmd_single_line_pdu *cslp; + struct cmd_line_pdu *clp; + unsigned long offset; + struct cmd_start_pdu spdu; + char **argv; + int i; + + clp = pdu; + clp->lines = le16_to_cpu(clp->lines); + clp->client_type = le16_to_cpu(clp->client_type); + argv = malloc(clp->lines * sizeof(char *)); + offset = sizeof(*clp); + + dprint(FD_NET, "server: %d command line args\n", clp->lines); + + for (i = 0; i < clp->lines; i++) { + cslp = pdu + offset; + argv[i] = (char *) cslp->text; + + offset += sizeof(*cslp) + le16_to_cpu(cslp->len); + dprint(FD_NET, "server: %d: %s\n", i, argv[i]); + } + + if (parse_cmd_line(clp->lines, argv, clp->client_type)) { + fio_net_queue_quit(); + free(argv); + return -1; + } + + free(argv); + + spdu.jobs = cpu_to_le32(thread_number); + spdu.stat_outputs = cpu_to_le32(stat_number); + + fio_net_queue_cmd(FIO_NET_CMD_START, &spdu, sizeof(spdu), NULL, SK_F_COPY); + return 0; +} + +static int handle_probe_cmd(struct fio_net_cmd *cmd) +{ + struct cmd_client_probe_pdu *pdu = (struct cmd_client_probe_pdu *) cmd->payload; + uint64_t tag = cmd->tag; + struct cmd_probe_reply_pdu probe = { +#ifdef 
CONFIG_BIG_ENDIAN + .bigendian = 1, +#endif + .os = FIO_OS, + .arch = FIO_ARCH, + .bpp = sizeof(void *), + .cpus = __cpu_to_le32(cpus_online()), + }; + + dprint(FD_NET, "server: sending probe reply\n"); + + strcpy(me, (char *) pdu->server); + + gethostname((char *) probe.hostname, sizeof(probe.hostname)); + snprintf((char *) probe.fio_version, sizeof(probe.fio_version), "%s", + fio_version_string); + + /* + * If the client supports compression and we do too, then enable it + */ + if (has_zlib && le64_to_cpu(pdu->flags) & FIO_PROBE_FLAG_ZLIB) { + probe.flags = __cpu_to_le64(FIO_PROBE_FLAG_ZLIB); + use_zlib = 1; + } else { + probe.flags = 0; + use_zlib = 0; + } + + return fio_net_queue_cmd(FIO_NET_CMD_PROBE, &probe, sizeof(probe), &tag, SK_F_COPY); +} + +static int handle_send_eta_cmd(struct fio_net_cmd *cmd) +{ + struct jobs_eta *je; + uint64_t tag = cmd->tag; + size_t size; + int i; + + dprint(FD_NET, "server sending status\n"); + + /* + * Fake ETA return if we don't have a local one, otherwise the client + * will end up timing out waiting for a response to the ETA request + */ + je = get_jobs_eta(true, &size); + if (!je) { + size = sizeof(*je); + je = calloc(1, size); + } else { + je->nr_running = cpu_to_le32(je->nr_running); + je->nr_ramp = cpu_to_le32(je->nr_ramp); + je->nr_pending = cpu_to_le32(je->nr_pending); + je->nr_setting_up = cpu_to_le32(je->nr_setting_up); + je->files_open = cpu_to_le32(je->files_open); + + for (i = 0; i < DDIR_RWDIR_CNT; i++) { + je->m_rate[i] = cpu_to_le64(je->m_rate[i]); + je->t_rate[i] = cpu_to_le64(je->t_rate[i]); + je->m_iops[i] = cpu_to_le32(je->m_iops[i]); + je->t_iops[i] = cpu_to_le32(je->t_iops[i]); + je->rate[i] = cpu_to_le64(je->rate[i]); + je->iops[i] = cpu_to_le32(je->iops[i]); + } + + je->elapsed_sec = cpu_to_le64(je->elapsed_sec); + je->eta_sec = cpu_to_le64(je->eta_sec); + je->nr_threads = cpu_to_le32(je->nr_threads); + je->is_pow2 = cpu_to_le32(je->is_pow2); + je->unit_base = cpu_to_le32(je->unit_base); + } + + fio_net_queue_cmd(FIO_NET_CMD_ETA, je, size, &tag, SK_F_FREE); + return 0; +} + +static int send_update_job_reply(uint64_t __tag, int error) +{ + uint64_t tag = __tag; + uint32_t pdu_error; + + pdu_error = __cpu_to_le32(error); + return fio_net_queue_cmd(FIO_NET_CMD_UPDATE_JOB, &pdu_error, sizeof(pdu_error), &tag, SK_F_COPY); +} + +static int handle_update_job_cmd(struct fio_net_cmd *cmd) +{ + struct cmd_add_job_pdu *pdu = (struct cmd_add_job_pdu *) cmd->payload; + struct thread_data *td; + uint32_t tnumber; + + tnumber = le32_to_cpu(pdu->thread_number); + + dprint(FD_NET, "server: updating options for job %u\n", tnumber); + + if (!tnumber || tnumber > thread_number) { + send_update_job_reply(cmd->tag, ENODEV); + return 0; + } + + td = &threads[tnumber - 1]; + convert_thread_options_to_cpu(&td->o, &pdu->top); + send_update_job_reply(cmd->tag, 0); + return 0; +} + +static int handle_trigger_cmd(struct fio_net_cmd *cmd, struct flist_head *job_list) +{ + struct cmd_vtrigger_pdu *pdu = (struct cmd_vtrigger_pdu *) cmd->payload; + char *buf = (char *) pdu->cmd; + struct all_io_list *rep; + size_t sz; + + pdu->len = le16_to_cpu(pdu->len); + buf[pdu->len] = '\0'; + + rep = get_all_io_list(IO_LIST_ALL, &sz); + if (!rep) { + struct all_io_list state; + + state.threads = cpu_to_le64((uint64_t) 0); + fio_net_queue_cmd(FIO_NET_CMD_VTRIGGER, &state, sizeof(state), NULL, SK_F_COPY | SK_F_INLINE); + } else + fio_net_queue_cmd(FIO_NET_CMD_VTRIGGER, rep, sz, NULL, SK_F_FREE | SK_F_INLINE); + + fio_terminate_threads(TERMINATE_ALL, TERMINATE_ALL); + 
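+	/*
+	 * The state reply above was queued with SK_F_INLINE, so it has
+	 * already been pushed out on the socket; now reap the jobs we
+	 * just terminated before handing control to the trigger command.
+	 */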
fio_server_check_jobs(job_list); + exec_trigger(buf); + return 0; +} + +static int handle_command(struct sk_out *sk_out, struct flist_head *job_list, + struct fio_net_cmd *cmd) +{ + int ret; + + dprint(FD_NET, "server: got op [%s], pdu=%u, tag=%llx\n", + fio_server_op(cmd->opcode), cmd->pdu_len, + (unsigned long long) cmd->tag); + + switch (cmd->opcode) { + case FIO_NET_CMD_QUIT: + fio_terminate_threads(TERMINATE_ALL, TERMINATE_ALL); + ret = 0; + break; + case FIO_NET_CMD_EXIT: + exit_backend = true; + return -1; + case FIO_NET_CMD_LOAD_FILE: + ret = handle_load_file_cmd(cmd); + break; + case FIO_NET_CMD_JOB: + ret = handle_job_cmd(cmd); + break; + case FIO_NET_CMD_JOBLINE: + ret = handle_jobline_cmd(cmd); + break; + case FIO_NET_CMD_PROBE: + ret = handle_probe_cmd(cmd); + break; + case FIO_NET_CMD_SEND_ETA: + ret = handle_send_eta_cmd(cmd); + break; + case FIO_NET_CMD_RUN: + ret = handle_run_cmd(sk_out, job_list, cmd); + break; + case FIO_NET_CMD_UPDATE_JOB: + ret = handle_update_job_cmd(cmd); + break; + case FIO_NET_CMD_VTRIGGER: + ret = handle_trigger_cmd(cmd, job_list); + break; + case FIO_NET_CMD_SENDFILE: { + struct cmd_sendfile_reply *in; + struct cmd_reply *rep; + + rep = (struct cmd_reply *) (uintptr_t) cmd->tag; + + in = (struct cmd_sendfile_reply *) cmd->payload; + in->size = le32_to_cpu(in->size); + in->error = le32_to_cpu(in->error); + if (in->error) { + ret = 1; + rep->error = in->error; + } else { + ret = 0; + rep->data = smalloc(in->size); + if (!rep->data) { + ret = 1; + rep->error = ENOMEM; + } else { + rep->size = in->size; + memcpy(rep->data, in->data, in->size); + } + } + fio_sem_up(&rep->lock); + break; + } + default: + log_err("fio: unknown opcode: %s\n", fio_server_op(cmd->opcode)); + ret = 1; + } + + return ret; +} + +/* + * Send a command with a separate PDU, not inlined in the command + */ +static int fio_send_cmd_ext_pdu(int sk, uint16_t opcode, const void *buf, + off_t size, uint64_t tag, uint32_t flags) +{ + struct fio_net_cmd cmd; + struct iovec iov[2]; + size_t this_len; + int ret; + + iov[0].iov_base = (void *) &cmd; + iov[0].iov_len = sizeof(cmd); + + do { + uint32_t this_flags = flags; + + this_len = size; + if (this_len > FIO_SERVER_MAX_FRAGMENT_PDU) + this_len = FIO_SERVER_MAX_FRAGMENT_PDU; + + if (this_len < size) + this_flags |= FIO_NET_CMD_F_MORE; + + __fio_init_net_cmd(&cmd, opcode, this_len, tag); + cmd.flags = __cpu_to_le32(this_flags); + fio_net_cmd_crc_pdu(&cmd, buf); + + iov[1].iov_base = (void *) buf; + iov[1].iov_len = this_len; + + ret = fio_sendv_data(sk, iov, 2); + size -= this_len; + buf += this_len; + } while (!ret && size); + + return ret; +} + +static void finish_entry(struct sk_entry *entry) +{ + if (entry->flags & SK_F_FREE) + free(entry->buf); + else if (entry->flags & SK_F_COPY) + sfree(entry->buf); + + sfree(entry); +} + +static void entry_set_flags(struct sk_entry *entry, struct flist_head *list, + unsigned int *flags) +{ + if (!flist_empty(list)) + *flags = FIO_NET_CMD_F_MORE; + else + *flags = 0; +} + +static int send_vec_entry(struct sk_out *sk_out, struct sk_entry *first) +{ + unsigned int flags; + int ret; + + entry_set_flags(first, &first->next, &flags); + + ret = fio_send_cmd_ext_pdu(sk_out->sk, first->opcode, first->buf, + first->size, first->tag, flags); + + while (!flist_empty(&first->next)) { + struct sk_entry *next; + + next = flist_first_entry(&first->next, struct sk_entry, list); + flist_del_init(&next->list); + + entry_set_flags(next, &first->next, &flags); + + ret += fio_send_cmd_ext_pdu(sk_out->sk, next->opcode, 
next->buf, + next->size, next->tag, flags); + finish_entry(next); + } + + return ret; +} + +static int handle_sk_entry(struct sk_out *sk_out, struct sk_entry *entry) +{ + int ret; + + fio_sem_down(&sk_out->xmit); + + if (entry->flags & SK_F_VEC) + ret = send_vec_entry(sk_out, entry); + else if (entry->flags & SK_F_SIMPLE) { + ret = fio_net_send_simple_cmd(sk_out->sk, entry->opcode, + entry->tag, NULL); + } else { + ret = fio_net_send_cmd(sk_out->sk, entry->opcode, entry->buf, + entry->size, &entry->tag, NULL); + } + + fio_sem_up(&sk_out->xmit); + + if (ret) + log_err("fio: failed handling cmd %s\n", fio_server_op(entry->opcode)); + + finish_entry(entry); + return ret; +} + +static int handle_xmits(struct sk_out *sk_out) +{ + struct sk_entry *entry; + FLIST_HEAD(list); + int ret = 0; + + sk_lock(sk_out); + if (flist_empty(&sk_out->list)) { + sk_unlock(sk_out); + return 0; + } + + flist_splice_init(&sk_out->list, &list); + sk_unlock(sk_out); + + while (!flist_empty(&list)) { + entry = flist_entry(list.next, struct sk_entry, list); + flist_del(&entry->list); + ret += handle_sk_entry(sk_out, entry); + } + + return ret; +} + +static int handle_connection(struct sk_out *sk_out) +{ + struct fio_net_cmd *cmd = NULL; + FLIST_HEAD(job_list); + int ret = 0; + + reset_fio_state(); + + /* read forever */ + while (!exit_backend) { + struct pollfd pfd = { + .fd = sk_out->sk, + .events = POLLIN, + }; + + do { + int timeout = 1000; + + if (!flist_empty(&job_list)) + timeout = 100; + + handle_xmits(sk_out); + + ret = poll(&pfd, 1, 0); + if (ret < 0) { + if (errno == EINTR) + break; + log_err("fio: poll: %s\n", strerror(errno)); + break; + } else if (!ret) { + fio_server_check_jobs(&job_list); + fio_sem_down_timeout(&sk_out->wait, timeout); + continue; + } + + if (pfd.revents & POLLIN) + break; + if (pfd.revents & (POLLERR|POLLHUP)) { + ret = 1; + break; + } + } while (!exit_backend); + + fio_server_check_jobs(&job_list); + + if (ret < 0) + break; + + cmd = fio_net_recv_cmd(sk_out->sk, true); + if (!cmd) { + ret = -1; + break; + } + + ret = handle_command(sk_out, &job_list, cmd); + if (ret) + break; + + free(cmd); + cmd = NULL; + } + + if (cmd) + free(cmd); + + handle_xmits(sk_out); + + close(sk_out->sk); + sk_out->sk = -1; + __sk_out_drop(sk_out); + _exit(ret); +} + +/* get the address on this host bound by the input socket, + * whether it is ipv6 or ipv4 */ + +static int get_my_addr_str(int sk) +{ + struct sockaddr_in6 myaddr6 = { 0, }; + struct sockaddr_in myaddr4 = { 0, }; + struct sockaddr *sockaddr_p; + char *net_addr; + socklen_t len; + int ret; + + if (use_ipv6) { + len = sizeof(myaddr6); + sockaddr_p = (struct sockaddr * )&myaddr6; + net_addr = (char * )&myaddr6.sin6_addr; + } else { + len = sizeof(myaddr4); + sockaddr_p = (struct sockaddr * )&myaddr4; + net_addr = (char * )&myaddr4.sin_addr; + } + + ret = getsockname(sk, sockaddr_p, &len); + if (ret) { + log_err("fio: getsockname: %s\n", strerror(errno)); + return -1; + } + + if (!inet_ntop(use_ipv6?AF_INET6:AF_INET, net_addr, client_sockaddr_str, INET6_ADDRSTRLEN - 1)) { + log_err("inet_ntop: failed to convert addr to string\n"); + return -1; + } + + dprint(FD_NET, "fio server bound to addr %s\n", client_sockaddr_str); + return 0; +} + +static int accept_loop(int listen_sk) +{ + struct sockaddr_in addr; + struct sockaddr_in6 addr6; + socklen_t len = use_ipv6 ? 
sizeof(addr6) : sizeof(addr); + struct pollfd pfd; + int ret = 0, sk, exitval = 0; + FLIST_HEAD(conn_list); + + dprint(FD_NET, "server enter accept loop\n"); + + fio_set_fd_nonblocking(listen_sk, "server"); + + while (!exit_backend) { + struct sk_out *sk_out; + const char *from; + char buf[64]; + pid_t pid; + + pfd.fd = listen_sk; + pfd.events = POLLIN; + do { + int timeout = 1000; + + if (!flist_empty(&conn_list)) + timeout = 100; + + ret = poll(&pfd, 1, timeout); + if (ret < 0) { + if (errno == EINTR) + break; + log_err("fio: poll: %s\n", strerror(errno)); + break; + } else if (!ret) { + fio_server_check_conns(&conn_list); + continue; + } + + if (pfd.revents & POLLIN) + break; + } while (!exit_backend); + + fio_server_check_conns(&conn_list); + + if (exit_backend || ret < 0) + break; + + if (use_ipv6) + sk = accept(listen_sk, (struct sockaddr *) &addr6, &len); + else + sk = accept(listen_sk, (struct sockaddr *) &addr, &len); + + if (sk < 0) { + log_err("fio: accept: %s\n", strerror(errno)); + return -1; + } + + if (use_ipv6) + from = inet_ntop(AF_INET6, (struct sockaddr *) &addr6.sin6_addr, buf, sizeof(buf)); + else + from = inet_ntop(AF_INET, (struct sockaddr *) &addr.sin_addr, buf, sizeof(buf)); + + dprint(FD_NET, "server: connect from %s\n", from); + + sk_out = scalloc(1, sizeof(*sk_out)); + if (!sk_out) { + close(sk); + return -1; + } + + sk_out->sk = sk; + INIT_FLIST_HEAD(&sk_out->list); + __fio_sem_init(&sk_out->lock, FIO_SEM_UNLOCKED); + __fio_sem_init(&sk_out->wait, FIO_SEM_LOCKED); + __fio_sem_init(&sk_out->xmit, FIO_SEM_UNLOCKED); + + pid = fork(); + if (pid) { + close(sk); + fio_server_add_conn_pid(&conn_list, pid); + continue; + } + + /* if error, it's already logged, non-fatal */ + get_my_addr_str(sk); + + /* + * Assign sk_out here, it'll be dropped in handle_connection() + * since that function calls _exit() when done + */ + sk_out_assign(sk_out); + handle_connection(sk_out); + } + + return exitval; +} + +int fio_server_text_output(int level, const char *buf, size_t len) +{ + struct sk_out *sk_out = pthread_getspecific(sk_out_key); + struct cmd_text_pdu *pdu; + unsigned int tlen; + struct timeval tv; + + if (!sk_out || sk_out->sk == -1) + return -1; + + tlen = sizeof(*pdu) + len; + pdu = malloc(tlen); + + pdu->level = __cpu_to_le32(level); + pdu->buf_len = __cpu_to_le32(len); + + gettimeofday(&tv, NULL); + pdu->log_sec = __cpu_to_le64(tv.tv_sec); + pdu->log_usec = __cpu_to_le64(tv.tv_usec); + + memcpy(pdu->buf, buf, len); + + fio_net_queue_cmd(FIO_NET_CMD_TEXT, pdu, tlen, NULL, SK_F_COPY); + free(pdu); + return len; +} + +static void convert_io_stat(struct io_stat *dst, struct io_stat *src) +{ + dst->max_val = cpu_to_le64(src->max_val); + dst->min_val = cpu_to_le64(src->min_val); + dst->samples = cpu_to_le64(src->samples); + + /* + * Encode to IEEE 754 for network transfer + */ + dst->mean.u.i = cpu_to_le64(fio_double_to_uint64(src->mean.u.f)); + dst->S.u.i = cpu_to_le64(fio_double_to_uint64(src->S.u.f)); +} + +static void convert_gs(struct group_run_stats *dst, struct group_run_stats *src) +{ + int i; + + for (i = 0; i < DDIR_RWDIR_CNT; i++) { + dst->max_run[i] = cpu_to_le64(src->max_run[i]); + dst->min_run[i] = cpu_to_le64(src->min_run[i]); + dst->max_bw[i] = cpu_to_le64(src->max_bw[i]); + dst->min_bw[i] = cpu_to_le64(src->min_bw[i]); + dst->iobytes[i] = cpu_to_le64(src->iobytes[i]); + dst->agg[i] = cpu_to_le64(src->agg[i]); + } + + dst->kb_base = cpu_to_le32(src->kb_base); + dst->unit_base = cpu_to_le32(src->unit_base); + dst->groupid = cpu_to_le32(src->groupid); + 
dst->unified_rw_rep = cpu_to_le32(src->unified_rw_rep); + dst->sig_figs = cpu_to_le32(src->sig_figs); +} + +/* + * Send a CMD_TS, which packs struct thread_stat and group_run_stats + * into a single payload. + */ +void fio_server_send_ts(struct thread_stat *ts, struct group_run_stats *rs) +{ + struct cmd_ts_pdu p; + int i, j, k; + void *ss_buf; + uint64_t *ss_iops, *ss_bw; + + dprint(FD_NET, "server sending end stats\n"); + + memset(&p, 0, sizeof(p)); + + snprintf(p.ts.name, sizeof(p.ts.name), "%s", ts->name); + snprintf(p.ts.verror, sizeof(p.ts.verror), "%s", ts->verror); + snprintf(p.ts.description, sizeof(p.ts.description), "%s", + ts->description); + + p.ts.error = cpu_to_le32(ts->error); + p.ts.thread_number = cpu_to_le32(ts->thread_number); + p.ts.groupid = cpu_to_le32(ts->groupid); + p.ts.pid = cpu_to_le32(ts->pid); + p.ts.members = cpu_to_le32(ts->members); + p.ts.unified_rw_rep = cpu_to_le32(ts->unified_rw_rep); + + for (i = 0; i < DDIR_RWDIR_CNT; i++) { + convert_io_stat(&p.ts.clat_stat[i], &ts->clat_stat[i]); + convert_io_stat(&p.ts.slat_stat[i], &ts->slat_stat[i]); + convert_io_stat(&p.ts.lat_stat[i], &ts->lat_stat[i]); + convert_io_stat(&p.ts.bw_stat[i], &ts->bw_stat[i]); + convert_io_stat(&p.ts.iops_stat[i], &ts->iops_stat[i]); + } + convert_io_stat(&p.ts.sync_stat, &ts->sync_stat); + + p.ts.usr_time = cpu_to_le64(ts->usr_time); + p.ts.sys_time = cpu_to_le64(ts->sys_time); + p.ts.ctx = cpu_to_le64(ts->ctx); + p.ts.minf = cpu_to_le64(ts->minf); + p.ts.majf = cpu_to_le64(ts->majf); + p.ts.clat_percentiles = cpu_to_le32(ts->clat_percentiles); + p.ts.lat_percentiles = cpu_to_le32(ts->lat_percentiles); + p.ts.slat_percentiles = cpu_to_le32(ts->slat_percentiles); + p.ts.percentile_precision = cpu_to_le64(ts->percentile_precision); + + for (i = 0; i < FIO_IO_U_LIST_MAX_LEN; i++) { + fio_fp64_t *src = &ts->percentile_list[i]; + fio_fp64_t *dst = &p.ts.percentile_list[i]; + + dst->u.i = cpu_to_le64(fio_double_to_uint64(src->u.f)); + } + + for (i = 0; i < FIO_IO_U_MAP_NR; i++) { + p.ts.io_u_map[i] = cpu_to_le64(ts->io_u_map[i]); + p.ts.io_u_submit[i] = cpu_to_le64(ts->io_u_submit[i]); + p.ts.io_u_complete[i] = cpu_to_le64(ts->io_u_complete[i]); + } + + for (i = 0; i < FIO_IO_U_LAT_N_NR; i++) + p.ts.io_u_lat_n[i] = cpu_to_le64(ts->io_u_lat_n[i]); + for (i = 0; i < FIO_IO_U_LAT_U_NR; i++) + p.ts.io_u_lat_u[i] = cpu_to_le64(ts->io_u_lat_u[i]); + for (i = 0; i < FIO_IO_U_LAT_M_NR; i++) + p.ts.io_u_lat_m[i] = cpu_to_le64(ts->io_u_lat_m[i]); + + for (i = 0; i < FIO_LAT_CNT; i++) + for (j = 0; j < DDIR_RWDIR_CNT; j++) + for (k = 0; k < FIO_IO_U_PLAT_NR; k++) + p.ts.io_u_plat[i][j][k] = cpu_to_le64(ts->io_u_plat[i][j][k]); + + for (j = 0; j < FIO_IO_U_PLAT_NR; j++) + p.ts.io_u_sync_plat[j] = cpu_to_le64(ts->io_u_sync_plat[j]); + + for (i = 0; i < DDIR_RWDIR_SYNC_CNT; i++) + p.ts.total_io_u[i] = cpu_to_le64(ts->total_io_u[i]); + + for (i = 0; i < DDIR_RWDIR_CNT; i++) { + p.ts.short_io_u[i] = cpu_to_le64(ts->short_io_u[i]); + p.ts.drop_io_u[i] = cpu_to_le64(ts->drop_io_u[i]); + } + + p.ts.total_submit = cpu_to_le64(ts->total_submit); + p.ts.total_complete = cpu_to_le64(ts->total_complete); + p.ts.nr_zone_resets = cpu_to_le64(ts->nr_zone_resets); + + for (i = 0; i < DDIR_RWDIR_CNT; i++) { + p.ts.io_bytes[i] = cpu_to_le64(ts->io_bytes[i]); + p.ts.runtime[i] = cpu_to_le64(ts->runtime[i]); + } + + p.ts.total_run_time = cpu_to_le64(ts->total_run_time); + p.ts.continue_on_error = cpu_to_le16(ts->continue_on_error); + p.ts.total_err_count = cpu_to_le64(ts->total_err_count); + p.ts.first_error = 
cpu_to_le32(ts->first_error); + p.ts.kb_base = cpu_to_le32(ts->kb_base); + p.ts.unit_base = cpu_to_le32(ts->unit_base); + + p.ts.latency_depth = cpu_to_le32(ts->latency_depth); + p.ts.latency_target = cpu_to_le64(ts->latency_target); + p.ts.latency_window = cpu_to_le64(ts->latency_window); + p.ts.latency_percentile.u.i = cpu_to_le64(fio_double_to_uint64(ts->latency_percentile.u.f)); + + p.ts.sig_figs = cpu_to_le32(ts->sig_figs); + + p.ts.nr_block_infos = cpu_to_le64(ts->nr_block_infos); + for (i = 0; i < p.ts.nr_block_infos; i++) + p.ts.block_infos[i] = cpu_to_le32(ts->block_infos[i]); + + p.ts.ss_dur = cpu_to_le64(ts->ss_dur); + p.ts.ss_state = cpu_to_le32(ts->ss_state); + p.ts.ss_head = cpu_to_le32(ts->ss_head); + p.ts.ss_limit.u.i = cpu_to_le64(fio_double_to_uint64(ts->ss_limit.u.f)); + p.ts.ss_slope.u.i = cpu_to_le64(fio_double_to_uint64(ts->ss_slope.u.f)); + p.ts.ss_deviation.u.i = cpu_to_le64(fio_double_to_uint64(ts->ss_deviation.u.f)); + p.ts.ss_criterion.u.i = cpu_to_le64(fio_double_to_uint64(ts->ss_criterion.u.f)); + + p.ts.cachehit = cpu_to_le64(ts->cachehit); + p.ts.cachemiss = cpu_to_le64(ts->cachemiss); + + for (i = 0; i < DDIR_RWDIR_CNT; i++) { + for (j = 0; j < FIO_IO_U_PLAT_NR; j++) { + p.ts.io_u_plat_high_prio[i][j] = cpu_to_le64(ts->io_u_plat_high_prio[i][j]); + p.ts.io_u_plat_low_prio[i][j] = cpu_to_le64(ts->io_u_plat_low_prio[i][j]); + } + convert_io_stat(&p.ts.clat_high_prio_stat[i], &ts->clat_high_prio_stat[i]); + convert_io_stat(&p.ts.clat_low_prio_stat[i], &ts->clat_low_prio_stat[i]); + } + + convert_gs(&p.rs, rs); + + dprint(FD_NET, "ts->ss_state = %d\n", ts->ss_state); + if (ts->ss_state & FIO_SS_DATA) { + dprint(FD_NET, "server sending steadystate ring buffers\n"); + + ss_buf = malloc(sizeof(p) + 2*ts->ss_dur*sizeof(uint64_t)); + + memcpy(ss_buf, &p, sizeof(p)); + + ss_iops = (uint64_t *) ((struct cmd_ts_pdu *)ss_buf + 1); + ss_bw = ss_iops + (int) ts->ss_dur; + for (i = 0; i < ts->ss_dur; i++) { + ss_iops[i] = cpu_to_le64(ts->ss_iops_data[i]); + ss_bw[i] = cpu_to_le64(ts->ss_bw_data[i]); + } + + fio_net_queue_cmd(FIO_NET_CMD_TS, ss_buf, sizeof(p) + 2*ts->ss_dur*sizeof(uint64_t), NULL, SK_F_COPY); + + free(ss_buf); + } + else + fio_net_queue_cmd(FIO_NET_CMD_TS, &p, sizeof(p), NULL, SK_F_COPY); +} + +void fio_server_send_gs(struct group_run_stats *rs) +{ + struct group_run_stats gs; + + dprint(FD_NET, "server sending group run stats\n"); + + convert_gs(&gs, rs); + fio_net_queue_cmd(FIO_NET_CMD_GS, &gs, sizeof(gs), NULL, SK_F_COPY); +} + +void fio_server_send_job_options(struct flist_head *opt_list, + unsigned int gid) +{ + struct cmd_job_option pdu; + struct flist_head *entry; + + if (flist_empty(opt_list)) + return; + + flist_for_each(entry, opt_list) { + struct print_option *p; + size_t len; + + p = flist_entry(entry, struct print_option, list); + memset(&pdu, 0, sizeof(pdu)); + + if (gid == -1U) { + pdu.global = __cpu_to_le16(1); + pdu.groupid = 0; + } else { + pdu.global = 0; + pdu.groupid = cpu_to_le32(gid); + } + len = strlen(p->name); + if (len >= sizeof(pdu.name)) { + len = sizeof(pdu.name) - 1; + pdu.truncated = __cpu_to_le16(1); + } + memcpy(pdu.name, p->name, len); + if (p->value) { + len = strlen(p->value); + if (len >= sizeof(pdu.value)) { + len = sizeof(pdu.value) - 1; + pdu.truncated = __cpu_to_le16(1); + } + memcpy(pdu.value, p->value, len); + } + fio_net_queue_cmd(FIO_NET_CMD_JOB_OPT, &pdu, sizeof(pdu), NULL, SK_F_COPY); + } +} + +static void convert_agg(struct disk_util_agg *dst, struct disk_util_agg *src) +{ + int i; + + for (i = 0; i < 2; i++) 
{ + dst->ios[i] = cpu_to_le64(src->ios[i]); + dst->merges[i] = cpu_to_le64(src->merges[i]); + dst->sectors[i] = cpu_to_le64(src->sectors[i]); + dst->ticks[i] = cpu_to_le64(src->ticks[i]); + } + + dst->io_ticks = cpu_to_le64(src->io_ticks); + dst->time_in_queue = cpu_to_le64(src->time_in_queue); + dst->slavecount = cpu_to_le32(src->slavecount); + dst->max_util.u.i = cpu_to_le64(fio_double_to_uint64(src->max_util.u.f)); +} + +static void convert_dus(struct disk_util_stat *dst, struct disk_util_stat *src) +{ + int i; + + snprintf((char *) dst->name, sizeof(dst->name), "%s", src->name); + + for (i = 0; i < 2; i++) { + dst->s.ios[i] = cpu_to_le64(src->s.ios[i]); + dst->s.merges[i] = cpu_to_le64(src->s.merges[i]); + dst->s.sectors[i] = cpu_to_le64(src->s.sectors[i]); + dst->s.ticks[i] = cpu_to_le64(src->s.ticks[i]); + } + + dst->s.io_ticks = cpu_to_le64(src->s.io_ticks); + dst->s.time_in_queue = cpu_to_le64(src->s.time_in_queue); + dst->s.msec = cpu_to_le64(src->s.msec); +} + +void fio_server_send_du(void) +{ + struct disk_util *du; + struct flist_head *entry; + struct cmd_du_pdu pdu; + + dprint(FD_NET, "server: sending disk_util %d\n", !flist_empty(&disk_list)); + + memset(&pdu, 0, sizeof(pdu)); + + flist_for_each(entry, &disk_list) { + du = flist_entry(entry, struct disk_util, list); + + convert_dus(&pdu.dus, &du->dus); + convert_agg(&pdu.agg, &du->agg); + + fio_net_queue_cmd(FIO_NET_CMD_DU, &pdu, sizeof(pdu), NULL, SK_F_COPY); + } +} + +#ifdef CONFIG_ZLIB + +static inline void __fio_net_prep_tail(z_stream *stream, void *out_pdu, + struct sk_entry **last_entry, + struct sk_entry *first) +{ + unsigned int this_len = FIO_SERVER_MAX_FRAGMENT_PDU - stream->avail_out; + + *last_entry = fio_net_prep_cmd(FIO_NET_CMD_IOLOG, out_pdu, this_len, + NULL, SK_F_VEC | SK_F_INLINE | SK_F_FREE); + if (*last_entry) + flist_add_tail(&(*last_entry)->list, &first->next); +} + +/* + * Deflates the next input given, creating as many new packets in the + * linked list as necessary. + */ +static int __deflate_pdu_buffer(void *next_in, unsigned int next_sz, void **out_pdu, + struct sk_entry **last_entry, z_stream *stream, + struct sk_entry *first) +{ + int ret; + + stream->next_in = next_in; + stream->avail_in = next_sz; + do { + if (!stream->avail_out) { + __fio_net_prep_tail(stream, *out_pdu, last_entry, first); + if (*last_entry == NULL) + return 1; + + *out_pdu = malloc(FIO_SERVER_MAX_FRAGMENT_PDU); + + stream->avail_out = FIO_SERVER_MAX_FRAGMENT_PDU; + stream->next_out = *out_pdu; + } + + ret = deflate(stream, Z_BLOCK); + + if (ret < 0) { + free(*out_pdu); + return 1; + } + } while (stream->avail_in); + + return 0; +} + +static int __fio_append_iolog_gz_hist(struct sk_entry *first, struct io_log *log, + struct io_logs *cur_log, z_stream *stream) +{ + struct sk_entry *entry; + void *out_pdu; + int ret, i, j; + int sample_sz = log_entry_sz(log); + + out_pdu = malloc(FIO_SERVER_MAX_FRAGMENT_PDU); + stream->avail_out = FIO_SERVER_MAX_FRAGMENT_PDU; + stream->next_out = out_pdu; + + for (i = 0; i < cur_log->nr_samples; i++) { + struct io_sample *s; + struct io_u_plat_entry *cur_plat_entry, *prev_plat_entry; + uint64_t *cur_plat, *prev_plat; + + s = get_sample(log, cur_log, i); + ret = __deflate_pdu_buffer(s, sample_sz, &out_pdu, &entry, stream, first); + if (ret) + return ret; + + /* Do the subtraction on server side so that client doesn't have to + * reconstruct our linked list from packets. 
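+		 * In effect the per-bucket counts are delta-encoded: if
+		 * bucket j held 10 I/Os at the previous sample and 14 now,
+		 * only the difference (4) goes out on the wire.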
+		 */
+		cur_plat_entry = s->data.plat_entry;
+		prev_plat_entry = flist_first_entry(&cur_plat_entry->list, struct io_u_plat_entry, list);
+		cur_plat = cur_plat_entry->io_u_plat;
+		prev_plat = prev_plat_entry->io_u_plat;
+
+		for (j = 0; j < FIO_IO_U_PLAT_NR; j++) {
+			cur_plat[j] -= prev_plat[j];
+		}
+
+		flist_del(&prev_plat_entry->list);
+		free(prev_plat_entry);
+
+		ret = __deflate_pdu_buffer(cur_plat_entry, sizeof(*cur_plat_entry),
+					   &out_pdu, &entry, stream, first);
+
+		if (ret)
+			return ret;
+	}
+
+	__fio_net_prep_tail(stream, out_pdu, &entry, first);
+	return entry == NULL;
+}
+
+static int __fio_append_iolog_gz(struct sk_entry *first, struct io_log *log,
+				 struct io_logs *cur_log, z_stream *stream)
+{
+	unsigned int this_len;
+	void *out_pdu;
+	int ret;
+
+	if (log->log_type == IO_LOG_TYPE_HIST)
+		return __fio_append_iolog_gz_hist(first, log, cur_log, stream);
+
+	stream->next_in = (void *) cur_log->log;
+	stream->avail_in = cur_log->nr_samples * log_entry_sz(log);
+
+	do {
+		struct sk_entry *entry;
+
+		/*
+		 * Dirty - since the log is potentially huge, compress it into
+		 * FIO_SERVER_MAX_FRAGMENT_PDU chunks and let the receiving
+		 * side defragment it.
+		 */
+		out_pdu = malloc(FIO_SERVER_MAX_FRAGMENT_PDU);
+
+		stream->avail_out = FIO_SERVER_MAX_FRAGMENT_PDU;
+		stream->next_out = out_pdu;
+		ret = deflate(stream, Z_BLOCK);
+		/* may be Z_OK, or Z_STREAM_END */
+		if (ret < 0) {
+			free(out_pdu);
+			return 1;
+		}
+
+		this_len = FIO_SERVER_MAX_FRAGMENT_PDU - stream->avail_out;
+
+		entry = fio_net_prep_cmd(FIO_NET_CMD_IOLOG, out_pdu, this_len,
+					 NULL, SK_F_VEC | SK_F_INLINE | SK_F_FREE);
+		if (!entry) {
+			free(out_pdu);
+			return 1;
+		}
+		flist_add_tail(&entry->list, &first->next);
+	} while (stream->avail_in);
+
+	return 0;
+}
+
+static int fio_append_iolog_gz(struct sk_entry *first, struct io_log *log)
+{
+	z_stream stream = {
+		.zalloc = Z_NULL,
+		.zfree = Z_NULL,
+		.opaque = Z_NULL,
+	};
+	int ret = 0;
+
+	if (deflateInit(&stream, Z_DEFAULT_COMPRESSION) != Z_OK)
+		return 1;
+
+	while (!flist_empty(&log->io_logs)) {
+		struct io_logs *cur_log;
+
+		cur_log = flist_first_entry(&log->io_logs, struct io_logs, list);
+		flist_del_init(&cur_log->list);
+
+		ret = __fio_append_iolog_gz(first, log, cur_log, &stream);
+		if (ret)
+			break;
+	}
+
+	ret = deflate(&stream, Z_FINISH);
+
+	while (ret != Z_STREAM_END) {
+		struct sk_entry *entry;
+		unsigned int this_len;
+		void *out_pdu;
+
+		out_pdu = malloc(FIO_SERVER_MAX_FRAGMENT_PDU);
+		stream.avail_out = FIO_SERVER_MAX_FRAGMENT_PDU;
+		stream.next_out = out_pdu;
+
+		ret = deflate(&stream, Z_FINISH);
+		/* may be Z_OK, or Z_STREAM_END */
+		if (ret < 0) {
+			free(out_pdu);
+			break;
+		}
+
+		this_len = FIO_SERVER_MAX_FRAGMENT_PDU - stream.avail_out;
+
+		entry = fio_net_prep_cmd(FIO_NET_CMD_IOLOG, out_pdu, this_len,
+					 NULL, SK_F_VEC | SK_F_INLINE | SK_F_FREE);
+		if (!entry) {
+			free(out_pdu);
+			break;
+		}
+		flist_add_tail(&entry->list, &first->next);
+	}
+
+	ret = deflateEnd(&stream);
+	if (ret == Z_OK)
+		return 0;
+
+	return 1;
+}
+#else
+static int fio_append_iolog_gz(struct sk_entry *first, struct io_log *log)
+{
+	return 1;
+}
+#endif
+
+static int fio_append_gz_chunks(struct sk_entry *first, struct io_log *log)
+{
+	struct sk_entry *entry;
+	struct flist_head *node;
+	int ret = 0;
+
+	pthread_mutex_lock(&log->chunk_lock);
+	flist_for_each(node, &log->chunk_list) {
+		struct iolog_compress *c;
+
+		c = flist_entry(node, struct iolog_compress, list);
+		entry = fio_net_prep_cmd(FIO_NET_CMD_IOLOG, c->buf, c->len,
+					 NULL, SK_F_VEC |
SK_F_INLINE); + if (!entry) { + ret = 1; + break; + } + flist_add_tail(&entry->list, &first->next); + } + pthread_mutex_unlock(&log->chunk_lock); + return ret; +} + +static int fio_append_text_log(struct sk_entry *first, struct io_log *log) +{ + struct sk_entry *entry; + int ret = 0; + + while (!flist_empty(&log->io_logs)) { + struct io_logs *cur_log; + size_t size; + + cur_log = flist_first_entry(&log->io_logs, struct io_logs, list); + flist_del_init(&cur_log->list); + + size = cur_log->nr_samples * log_entry_sz(log); + + entry = fio_net_prep_cmd(FIO_NET_CMD_IOLOG, cur_log->log, size, + NULL, SK_F_VEC | SK_F_INLINE); + if (!entry) { + ret = 1; + break; + } + flist_add_tail(&entry->list, &first->next); + } + + return ret; +} + +int fio_send_iolog(struct thread_data *td, struct io_log *log, const char *name) +{ + struct cmd_iolog_pdu pdu = { + .nr_samples = cpu_to_le64(iolog_nr_samples(log)), + .thread_number = cpu_to_le32(td->thread_number), + .log_type = cpu_to_le32(log->log_type), + .log_hist_coarseness = cpu_to_le32(log->hist_coarseness), + }; + struct sk_entry *first; + struct flist_head *entry; + int ret = 0; + + if (!flist_empty(&log->chunk_list)) + pdu.compressed = __cpu_to_le32(STORE_COMPRESSED); + else if (use_zlib) + pdu.compressed = __cpu_to_le32(XMIT_COMPRESSED); + else + pdu.compressed = 0; + + snprintf((char *) pdu.name, sizeof(pdu.name), "%s", name); + + /* + * We can't do this for a pre-compressed log, but for that case, + * log->nr_samples is zero anyway. + */ + flist_for_each(entry, &log->io_logs) { + struct io_logs *cur_log; + int i; + + cur_log = flist_entry(entry, struct io_logs, list); + + for (i = 0; i < cur_log->nr_samples; i++) { + struct io_sample *s = get_sample(log, cur_log, i); + + s->time = cpu_to_le64(s->time); + s->data.val = cpu_to_le64(s->data.val); + s->__ddir = __cpu_to_le32(s->__ddir); + s->bs = cpu_to_le64(s->bs); + + if (log->log_offset) { + struct io_sample_offset *so = (void *) s; + + so->offset = cpu_to_le64(so->offset); + } + } + } + + /* + * Assemble header entry first + */ + first = fio_net_prep_cmd(FIO_NET_CMD_IOLOG, &pdu, sizeof(pdu), NULL, SK_F_VEC | SK_F_INLINE | SK_F_COPY); + if (!first) + return 1; + + /* + * Now append actual log entries. If log compression was enabled on + * the job, just send out the compressed chunks directly. If we + * have a plain log, compress if we can, then send. Otherwise, send + * the plain text output. 
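+	 *
+	 * The pdu.compressed marker set above tells the receiver which
+	 * case applies: STORE_COMPRESSED for pre-compressed chunks,
+	 * XMIT_COMPRESSED when we deflate here, and 0 for plain samples.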
+ */ + if (!flist_empty(&log->chunk_list)) + ret = fio_append_gz_chunks(first, log); + else if (use_zlib) + ret = fio_append_iolog_gz(first, log); + else + ret = fio_append_text_log(first, log); + + fio_net_queue_entry(first); + return ret; +} + +void fio_server_send_add_job(struct thread_data *td) +{ + struct cmd_add_job_pdu pdu = { + .thread_number = cpu_to_le32(td->thread_number), + .groupid = cpu_to_le32(td->groupid), + }; + + convert_thread_options_to_net(&pdu.top, &td->o); + + fio_net_queue_cmd(FIO_NET_CMD_ADD_JOB, &pdu, sizeof(pdu), NULL, + SK_F_COPY); +} + +void fio_server_send_start(struct thread_data *td) +{ + struct sk_out *sk_out = pthread_getspecific(sk_out_key); + + assert(sk_out->sk != -1); + + fio_net_queue_cmd(FIO_NET_CMD_SERVER_START, NULL, 0, NULL, SK_F_SIMPLE); +} + +int fio_server_get_verify_state(const char *name, int threadnumber, + void **datap) +{ + struct thread_io_list *s; + struct cmd_sendfile out; + struct cmd_reply *rep; + uint64_t tag; + void *data; + int ret; + + dprint(FD_NET, "server: request verify state\n"); + + rep = smalloc(sizeof(*rep)); + if (!rep) + return ENOMEM; + + __fio_sem_init(&rep->lock, FIO_SEM_LOCKED); + rep->data = NULL; + rep->error = 0; + + verify_state_gen_name((char *) out.path, sizeof(out.path), name, me, + threadnumber); + tag = (uint64_t) (uintptr_t) rep; + fio_net_queue_cmd(FIO_NET_CMD_SENDFILE, &out, sizeof(out), &tag, + SK_F_COPY); + + /* + * Wait for the backend to receive the reply + */ + if (fio_sem_down_timeout(&rep->lock, 10000)) { + log_err("fio: timed out waiting for reply\n"); + ret = ETIMEDOUT; + goto fail; + } + + if (rep->error) { + log_err("fio: failure on receiving state file %s: %s\n", + out.path, strerror(rep->error)); + ret = rep->error; +fail: + *datap = NULL; + sfree(rep); + fio_net_queue_quit(); + return ret; + } + + /* + * The format is verify_state_hdr, then thread_io_list. 
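+	 * On the wire that is:
+	 *
+	 *	struct verify_state_hdr	hdr;
+	 *	struct thread_io_list	state;
+	 *
+	 * hence the sizeof() offset below.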
Verify + * the header, and the thread_io_list checksum + */ + s = rep->data + sizeof(struct verify_state_hdr); + if (verify_state_hdr(rep->data, s)) { + ret = EILSEQ; + goto fail; + } + + /* + * Don't need the header from now, copy just the thread_io_list + */ + ret = 0; + rep->size -= sizeof(struct verify_state_hdr); + data = malloc(rep->size); + memcpy(data, s, rep->size); + *datap = data; + + sfree(rep->data); + __fio_sem_remove(&rep->lock); + sfree(rep); + return ret; +} + +static int fio_init_server_ip(void) +{ + struct sockaddr *addr; + socklen_t socklen; + char buf[80]; + const char *str; + int sk, opt; + + if (use_ipv6) + sk = socket(AF_INET6, SOCK_STREAM, 0); + else + sk = socket(AF_INET, SOCK_STREAM, 0); + + if (sk < 0) { + log_err("fio: socket: %s\n", strerror(errno)); + return -1; + } + + opt = 1; + if (setsockopt(sk, SOL_SOCKET, SO_REUSEADDR, (void *)&opt, sizeof(opt)) < 0) { + log_err("fio: setsockopt(REUSEADDR): %s\n", strerror(errno)); + close(sk); + return -1; + } +#ifdef SO_REUSEPORT + /* + * Not fatal if fails, so just ignore it if that happens + */ + if (setsockopt(sk, SOL_SOCKET, SO_REUSEPORT, &opt, sizeof(opt))) { + } +#endif + + if (use_ipv6) { + void *src = &saddr_in6.sin6_addr; + + addr = (struct sockaddr *) &saddr_in6; + socklen = sizeof(saddr_in6); + saddr_in6.sin6_family = AF_INET6; + str = inet_ntop(AF_INET6, src, buf, sizeof(buf)); + } else { + void *src = &saddr_in.sin_addr; + + addr = (struct sockaddr *) &saddr_in; + socklen = sizeof(saddr_in); + saddr_in.sin_family = AF_INET; + str = inet_ntop(AF_INET, src, buf, sizeof(buf)); + } + + if (bind(sk, addr, socklen) < 0) { + log_err("fio: bind: %s\n", strerror(errno)); + log_info("fio: failed with IPv%c %s\n", use_ipv6 ? '6' : '4', str); + close(sk); + return -1; + } + + return sk; +} + +static int fio_init_server_sock(void) +{ + struct sockaddr_un addr; + socklen_t len; + mode_t mode; + int sk; + + sk = socket(AF_UNIX, SOCK_STREAM, 0); + if (sk < 0) { + log_err("fio: socket: %s\n", strerror(errno)); + return -1; + } + + mode = umask(000); + + addr.sun_family = AF_UNIX; + snprintf(addr.sun_path, sizeof(addr.sun_path), "%s", bind_sock); + + len = sizeof(addr.sun_family) + strlen(bind_sock) + 1; + + if (bind(sk, (struct sockaddr *) &addr, len) < 0) { + log_err("fio: bind: %s\n", strerror(errno)); + close(sk); + return -1; + } + + umask(mode); + return sk; +} + +static int fio_init_server_connection(void) +{ + char bind_str[128]; + int sk; + + dprint(FD_NET, "starting server\n"); + + if (!bind_sock) + sk = fio_init_server_ip(); + else + sk = fio_init_server_sock(); + + if (sk < 0) + return sk; + + memset(bind_str, 0, sizeof(bind_str)); + + if (!bind_sock) { + char *p, port[16]; + void *src; + int af; + + if (use_ipv6) { + af = AF_INET6; + src = &saddr_in6.sin6_addr; + } else { + af = AF_INET; + src = &saddr_in.sin_addr; + } + + p = (char *) inet_ntop(af, src, bind_str, sizeof(bind_str)); + + sprintf(port, ",%u", fio_net_port); + if (p) + strcat(p, port); + else + snprintf(bind_str, sizeof(bind_str), "%s", port); + } else + snprintf(bind_str, sizeof(bind_str), "%s", bind_sock); + + log_info("fio: server listening on %s\n", bind_str); + + if (listen(sk, 4) < 0) { + log_err("fio: listen: %s\n", strerror(errno)); + close(sk); + return -1; + } + + return sk; +} + +int fio_server_parse_host(const char *host, int ipv6, struct in_addr *inp, + struct in6_addr *inp6) + +{ + int ret = 0; + + if (ipv6) + ret = inet_pton(AF_INET6, host, inp6); + else + ret = inet_pton(AF_INET, host, inp); + + if (ret != 1) { + struct addrinfo 
*res, hints = { + .ai_family = ipv6 ? AF_INET6 : AF_INET, + .ai_socktype = SOCK_STREAM, + }; + + ret = getaddrinfo(host, NULL, &hints, &res); + if (ret) { + log_err("fio: failed to resolve <%s> (%s)\n", host, + gai_strerror(ret)); + return 1; + } + + if (ipv6) + memcpy(inp6, &((struct sockaddr_in6 *) res->ai_addr)->sin6_addr, sizeof(*inp6)); + else + memcpy(inp, &((struct sockaddr_in *) res->ai_addr)->sin_addr, sizeof(*inp)); + + ret = 1; + freeaddrinfo(res); + } + + return !(ret == 1); +} + +/* + * Parse a host/ip/port string. Reads from 'str'. + * + * Outputs: + * + * For IPv4: + * *ptr is the host, *port is the port, inp is the destination. + * For IPv6: + * *ptr is the host, *port is the port, inp6 is the dest, and *ipv6 is 1. + * For local domain sockets: + * *ptr is the filename, *is_sock is 1. + */ +int fio_server_parse_string(const char *str, char **ptr, bool *is_sock, + int *port, struct in_addr *inp, + struct in6_addr *inp6, int *ipv6) +{ + const char *host = str; + char *portp; + int lport = 0; + + *ptr = NULL; + *is_sock = false; + *port = fio_net_port; + *ipv6 = 0; + + if (!strncmp(str, "sock:", 5)) { + *ptr = strdup(str + 5); + *is_sock = true; + + return 0; + } + + /* + * Is it ip::port + */ + if (!strncmp(host, "ip:", 3)) + host += 3; + else if (!strncmp(host, "ip4:", 4)) + host += 4; + else if (!strncmp(host, "ip6:", 4)) { + host += 4; + *ipv6 = 1; + } else if (host[0] == ':') { + /* String is :port */ + host++; + lport = atoi(host); + if (!lport || lport > 65535) { + log_err("fio: bad server port %u\n", lport); + return 1; + } + /* no hostname given, we are done */ + *port = lport; + return 0; + } + + /* + * If no port seen yet, check if there's a last ',' at the end + */ + if (!lport) { + portp = strchr(host, ','); + if (portp) { + *portp = '\0'; + portp++; + lport = atoi(portp); + if (!lport || lport > 65535) { + log_err("fio: bad server port %u\n", lport); + return 1; + } + } + } + + if (lport) + *port = lport; + + if (!strlen(host)) + return 0; + + *ptr = strdup(host); + + if (fio_server_parse_host(*ptr, *ipv6, inp, inp6)) { + free(*ptr); + *ptr = NULL; + return 1; + } + + if (*port == 0) + *port = fio_net_port; + + return 0; +} + +/* + * Server arg should be one of: + * + * sock:/path/to/socket + * ip:1.2.3.4 + * 1.2.3.4 + * + * Where sock uses unix domain sockets, and ip binds the server to + * a specific interface. If no arguments are given to the server, it + * uses IP and binds to 0.0.0.0. 
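+ *
+ * fio_server_parse_string() above also accepts, for example:
+ *
+ *	ip6:::1		IPv6 loopback, default port 8765
+ *	:4000		all interfaces, port 4000
+ *	host.foo,4000	resolve host.foo, use port 4000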
+ * + */ +static int fio_handle_server_arg(void) +{ + int port = fio_net_port; + bool is_sock; + int ret = 0; + + saddr_in.sin_addr.s_addr = htonl(INADDR_ANY); + + if (!fio_server_arg) + goto out; + + ret = fio_server_parse_string(fio_server_arg, &bind_sock, &is_sock, + &port, &saddr_in.sin_addr, + &saddr_in6.sin6_addr, &use_ipv6); + + if (!is_sock && bind_sock) { + free(bind_sock); + bind_sock = NULL; + } + +out: + fio_net_port = port; + saddr_in.sin_port = htons(port); + saddr_in6.sin6_port = htons(port); + return ret; +} + +static void sig_int(int sig) +{ + if (bind_sock) + unlink(bind_sock); +} + +static void set_sig_handlers(void) +{ + struct sigaction act = { + .sa_handler = sig_int, + .sa_flags = SA_RESTART, + }; + + sigaction(SIGINT, &act, NULL); +} + +void fio_server_destroy_sk_key(void) +{ + pthread_key_delete(sk_out_key); +} + +int fio_server_create_sk_key(void) +{ + if (pthread_key_create(&sk_out_key, NULL)) { + log_err("fio: can't create sk_out backend key\n"); + return 1; + } + + pthread_setspecific(sk_out_key, NULL); + return 0; +} + +static int fio_server(void) +{ + int sk, ret; + + dprint(FD_NET, "starting server\n"); + + if (fio_handle_server_arg()) + return -1; + + sk = fio_init_server_connection(); + if (sk < 0) + return -1; + + set_sig_handlers(); + + ret = accept_loop(sk); + + close(sk); + + if (fio_server_arg) { + free(fio_server_arg); + fio_server_arg = NULL; + } + if (bind_sock) + free(bind_sock); + + return ret; +} + +void fio_server_got_signal(int signal) +{ + struct sk_out *sk_out = pthread_getspecific(sk_out_key); + + assert(sk_out); + + if (signal == SIGPIPE) + sk_out->sk = -1; + else { + log_info("\nfio: terminating on signal %d\n", signal); + exit_backend = true; + } +} + +static int check_existing_pidfile(const char *pidfile) +{ + struct stat sb; + char buf[16]; + pid_t pid; + FILE *f; + + if (stat(pidfile, &sb)) + return 0; + + f = fopen(pidfile, "r"); + if (!f) + return 0; + + if (fread(buf, sb.st_size, 1, f) <= 0) { + fclose(f); + return 1; + } + fclose(f); + + pid = atoi(buf); + if (kill(pid, SIGCONT) < 0) + return errno != ESRCH; + + return 1; +} + +static int write_pid(pid_t pid, const char *pidfile) +{ + FILE *fpid; + + fpid = fopen(pidfile, "w"); + if (!fpid) { + log_err("fio: failed opening pid file %s\n", pidfile); + return 1; + } + + fprintf(fpid, "%u\n", (unsigned int) pid); + fclose(fpid); + return 0; +} + +/* + * If pidfile is specified, background us. 
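+ * Concretely: refuse to start if a live server already owns the
+ * pidfile, fork off the server (the parent writes the child's pid to
+ * the pidfile and exits), then have the child detach with setsid(),
+ * switch logging to syslog and close the stdio descriptors.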
+ */ +int fio_start_server(char *pidfile) +{ + pid_t pid; + int ret; + +#if defined(WIN32) + WSADATA wsd; + WSAStartup(MAKEWORD(2, 2), &wsd); +#endif + + if (!pidfile) + return fio_server(); + + if (check_existing_pidfile(pidfile)) { + log_err("fio: pidfile %s exists and server appears alive\n", + pidfile); + free(pidfile); + return -1; + } + + pid = fork(); + if (pid < 0) { + log_err("fio: failed server fork: %s\n", strerror(errno)); + free(pidfile); + return -1; + } else if (pid) { + ret = write_pid(pid, pidfile); + free(pidfile); + _exit(ret); + } + + setsid(); + openlog("fio", LOG_NDELAY|LOG_NOWAIT|LOG_PID, LOG_USER); + log_syslog = true; + close(STDIN_FILENO); + close(STDOUT_FILENO); + close(STDERR_FILENO); + f_out = NULL; + f_err = NULL; + + ret = fio_server(); + + closelog(); + unlink(pidfile); + free(pidfile); + return ret; +} + +void fio_server_set_arg(const char *arg) +{ + fio_server_arg = strdup(arg); +} diff --git a/server.h b/server.h new file mode 100644 index 0000000..279b691 --- /dev/null +++ b/server.h @@ -0,0 +1,238 @@ +#ifndef FIO_SERVER_H +#define FIO_SERVER_H + +#include +#include +#include +#include + +#include "stat.h" +#include "diskutil.h" + +#define FIO_NET_PORT 8765 + +struct sk_out { + unsigned int refs; /* frees sk_out when it drops to zero. + * protected by below ->lock */ + + int sk; /* socket fd to talk to client */ + struct fio_sem lock; /* protects ref and below list */ + struct flist_head list; /* list of pending transmit work */ + struct fio_sem wait; /* wake backend when items added to list */ + struct fio_sem xmit; /* held while sending data */ +}; + +/* + * On-wire encoding is little endian + */ +struct fio_net_cmd { + uint16_t version; /* protocol version */ + uint16_t opcode; /* command opcode */ + uint32_t flags; /* modifier flags */ + uint64_t tag; /* passed back on reply */ + uint32_t pdu_len; /* length of post-cmd layload */ + /* + * These must be immediately before the payload, anything before + * these fields are checksummed. + */ + uint16_t cmd_crc16; /* cmd checksum */ + uint16_t pdu_crc16; /* payload checksum */ + uint8_t payload[]; /* payload */ +}; + +struct fio_net_cmd_reply { + struct flist_head list; + struct timespec ts; + uint64_t saved_tag; + uint16_t opcode; +}; + +enum { + FIO_SERVER_VER = 82, + + FIO_SERVER_MAX_FRAGMENT_PDU = 1024, + FIO_SERVER_MAX_CMD_MB = 2048, + + FIO_NET_CMD_QUIT = 1, + FIO_NET_CMD_EXIT = 2, + FIO_NET_CMD_JOB = 3, + FIO_NET_CMD_JOBLINE = 4, + FIO_NET_CMD_TEXT = 5, + FIO_NET_CMD_TS = 6, + FIO_NET_CMD_GS = 7, + FIO_NET_CMD_SEND_ETA = 8, + FIO_NET_CMD_ETA = 9, + FIO_NET_CMD_PROBE = 10, + FIO_NET_CMD_START = 11, + FIO_NET_CMD_STOP = 12, + FIO_NET_CMD_DU = 13, + FIO_NET_CMD_SERVER_START = 14, + FIO_NET_CMD_ADD_JOB = 15, + FIO_NET_CMD_RUN = 16, + FIO_NET_CMD_IOLOG = 17, + FIO_NET_CMD_UPDATE_JOB = 18, + FIO_NET_CMD_LOAD_FILE = 19, + FIO_NET_CMD_VTRIGGER = 20, + FIO_NET_CMD_SENDFILE = 21, + FIO_NET_CMD_JOB_OPT = 22, + FIO_NET_CMD_NR = 23, + + FIO_NET_CMD_F_MORE = 1UL << 0, + + /* crc does not include the crc fields */ + FIO_NET_CMD_CRC_SZ = sizeof(struct fio_net_cmd) - + 2 * sizeof(uint16_t), + + FIO_NET_NAME_MAX = 256, + + FIO_NET_CLIENT_TIMEOUT = 5000, + + FIO_PROBE_FLAG_ZLIB = 1UL << 0, +}; + +struct cmd_sendfile { + uint8_t path[FIO_NET_NAME_MAX]; +}; + +struct cmd_sendfile_reply { + uint32_t size; + uint32_t error; + uint8_t data[0]; +}; + +/* + * Client sends this to server on VTRIGGER, server sends back a full + * all_io_list structure. 
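+ * The pdu below is just a length-prefixed command line; the reply goes
+ * out as FIO_NET_CMD_VTRIGGER carrying the (possibly empty) list,
+ * after which the server reaps its jobs and runs the trigger command
+ * (see handle_trigger_cmd() in server.c).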
+ */ +struct cmd_vtrigger_pdu { + uint16_t len; + uint8_t cmd[]; +}; + +struct cmd_load_file_pdu { + uint16_t name_len; + uint16_t client_type; + uint8_t file[]; +}; + +struct cmd_ts_pdu { + struct thread_stat ts; + struct group_run_stats rs; +}; + +struct cmd_du_pdu { + struct disk_util_stat dus; + struct disk_util_agg agg; +}; + +struct cmd_client_probe_pdu { + uint64_t flags; + uint8_t server[128]; +}; + +struct cmd_probe_reply_pdu { + uint8_t hostname[64]; + uint8_t bigendian; + uint8_t fio_version[32]; + uint8_t os; + uint8_t arch; + uint8_t bpp; + uint32_t cpus; + uint64_t flags; +}; + +struct cmd_single_line_pdu { + uint16_t len; + uint8_t text[]; +}; + +struct cmd_line_pdu { + uint16_t lines; + uint16_t client_type; + struct cmd_single_line_pdu options[]; +}; + +struct cmd_job_pdu { + uint32_t buf_len; + uint32_t client_type; + uint8_t buf[0]; +}; + +struct cmd_start_pdu { + uint32_t jobs; + uint32_t stat_outputs; +}; + +struct cmd_end_pdu { + uint32_t error; + uint32_t signal; +}; + +struct cmd_add_job_pdu { + uint32_t thread_number; + uint32_t groupid; + struct thread_options_pack top; +}; + +struct cmd_text_pdu { + uint32_t level; + uint32_t buf_len; + uint64_t log_sec; + uint64_t log_usec; + uint8_t buf[0]; +}; + +enum { + XMIT_COMPRESSED = 1U, + STORE_COMPRESSED = 2U, +}; + +struct cmd_iolog_pdu { + uint64_t nr_samples; + uint32_t thread_number; + uint32_t log_type; + uint32_t compressed; + uint32_t log_offset; + uint32_t log_hist_coarseness; + uint8_t name[FIO_NET_NAME_MAX]; + struct io_sample samples[0]; +}; + +struct cmd_job_option { + uint16_t global; + uint16_t truncated; + uint32_t groupid; + uint8_t name[64]; + uint8_t value[128]; +}; + +extern int fio_start_server(char *); +extern int fio_server_text_output(int, const char *, size_t); +extern int fio_net_send_cmd(int, uint16_t, const void *, off_t, uint64_t *, struct flist_head *); +extern int fio_net_send_simple_cmd(int, uint16_t, uint64_t, struct flist_head *); +extern void fio_server_set_arg(const char *); +extern int fio_server_parse_string(const char *, char **, bool *, int *, struct in_addr *, struct in6_addr *, int *); +extern int fio_server_parse_host(const char *, int, struct in_addr *, struct in6_addr *); +extern const char *fio_server_op(unsigned int); +extern void fio_server_got_signal(int); + +extern void fio_server_send_ts(struct thread_stat *, struct group_run_stats *); +extern void fio_server_send_gs(struct group_run_stats *); +extern void fio_server_send_du(void); +extern void fio_server_send_job_options(struct flist_head *, unsigned int); +extern int fio_server_get_verify_state(const char *, int, void **); + +extern struct fio_net_cmd *fio_net_recv_cmd(int sk, bool wait); + +extern int fio_send_iolog(struct thread_data *, struct io_log *, const char *); +extern void fio_server_send_add_job(struct thread_data *); +extern void fio_server_send_start(struct thread_data *); +extern int fio_net_send_quit(int sk); + +extern int fio_server_create_sk_key(void); +extern void fio_server_destroy_sk_key(void); + +extern bool exit_backend; +extern int fio_net_port; + +#endif diff --git a/smalloc.c b/smalloc.c new file mode 100644 index 0000000..fa00f0e --- /dev/null +++ b/smalloc.c @@ -0,0 +1,580 @@ +/* + * simple memory allocator, backed by mmap() so that it hands out memory + * that can be shared across processes and threads + */ +#include +#include +#include + +#include "fio.h" +#include "fio_sem.h" +#include "os/os.h" +#include "smalloc.h" +#include "log.h" + +#define SMALLOC_REDZONE /* define to detect memory 
corruption */ + +#define SMALLOC_BPB 32 /* block size, bytes-per-bit in bitmap */ +#define SMALLOC_BPI (sizeof(unsigned int) * 8) +#define SMALLOC_BPL (SMALLOC_BPB * SMALLOC_BPI) + +#define INITIAL_SIZE 16*1024*1024 /* new pool size */ +#define INITIAL_POOLS 8 /* maximum number of pools to setup */ + +#define MAX_POOLS 16 + +#define SMALLOC_PRE_RED 0xdeadbeefU +#define SMALLOC_POST_RED 0x5aa55aa5U + +unsigned int smalloc_pool_size = INITIAL_SIZE; +#ifdef SMALLOC_REDZONE +static const int int_mask = sizeof(int) - 1; +#endif + +struct pool { + struct fio_sem *lock; /* protects this pool */ + void *map; /* map of blocks */ + unsigned int *bitmap; /* blocks free/busy map */ + size_t free_blocks; /* free blocks */ + size_t nr_blocks; /* total blocks */ + size_t next_non_full; + size_t mmap_size; +}; + +struct block_hdr { + size_t size; +#ifdef SMALLOC_REDZONE + unsigned int prered; +#endif +}; + +/* + * This suppresses the voluminous potential bitmap printout when + * smalloc encounters an OOM error + */ +static const bool enable_smalloc_debug = false; + +static struct pool *mp; +static unsigned int nr_pools; +static unsigned int last_pool; + +static inline int ptr_valid(struct pool *pool, void *ptr) +{ + unsigned int pool_size = pool->nr_blocks * SMALLOC_BPL; + + return (ptr >= pool->map) && (ptr < pool->map + pool_size); +} + +static inline size_t size_to_blocks(size_t size) +{ + return (size + SMALLOC_BPB - 1) / SMALLOC_BPB; +} + +static int blocks_iter(struct pool *pool, unsigned int pool_idx, + unsigned int idx, size_t nr_blocks, + int (*func)(unsigned int *map, unsigned int mask)) +{ + + while (nr_blocks) { + unsigned int this_blocks, mask; + unsigned int *map; + + if (pool_idx >= pool->nr_blocks) + return 0; + + map = &pool->bitmap[pool_idx]; + + this_blocks = nr_blocks; + if (this_blocks + idx > SMALLOC_BPI) { + this_blocks = SMALLOC_BPI - idx; + idx = SMALLOC_BPI - this_blocks; + } + + if (this_blocks == SMALLOC_BPI) + mask = -1U; + else + mask = ((1U << this_blocks) - 1) << idx; + + if (!func(map, mask)) + return 0; + + nr_blocks -= this_blocks; + idx = 0; + pool_idx++; + } + + return 1; +} + +static int mask_cmp(unsigned int *map, unsigned int mask) +{ + return !(*map & mask); +} + +static int mask_clear(unsigned int *map, unsigned int mask) +{ + assert((*map & mask) == mask); + *map &= ~mask; + return 1; +} + +static int mask_set(unsigned int *map, unsigned int mask) +{ + assert(!(*map & mask)); + *map |= mask; + return 1; +} + +static int blocks_free(struct pool *pool, unsigned int pool_idx, + unsigned int idx, size_t nr_blocks) +{ + return blocks_iter(pool, pool_idx, idx, nr_blocks, mask_cmp); +} + +static void set_blocks(struct pool *pool, unsigned int pool_idx, + unsigned int idx, size_t nr_blocks) +{ + blocks_iter(pool, pool_idx, idx, nr_blocks, mask_set); +} + +static void clear_blocks(struct pool *pool, unsigned int pool_idx, + unsigned int idx, size_t nr_blocks) +{ + blocks_iter(pool, pool_idx, idx, nr_blocks, mask_clear); +} + +static int find_next_zero(int word, int start) +{ + assert(word != -1U); + word >>= start; + return ffz(word) + start; +} + +static bool add_pool(struct pool *pool, unsigned int alloc_size) +{ + int bitmap_blocks; + int mmap_flags; + void *ptr; + + if (nr_pools == MAX_POOLS) + return false; + +#ifdef SMALLOC_REDZONE + alloc_size += sizeof(unsigned int); +#endif + alloc_size += sizeof(struct block_hdr); + if (alloc_size < INITIAL_SIZE) + alloc_size = INITIAL_SIZE; + + /* round up to nearest full number of blocks */ + alloc_size = (alloc_size + 
SMALLOC_BPL - 1) & ~(SMALLOC_BPL - 1); + bitmap_blocks = alloc_size / SMALLOC_BPL; + alloc_size += bitmap_blocks * sizeof(unsigned int); + pool->mmap_size = alloc_size; + + pool->nr_blocks = bitmap_blocks; + pool->free_blocks = bitmap_blocks * SMALLOC_BPI; + + mmap_flags = OS_MAP_ANON; +#ifdef CONFIG_ESX + mmap_flags |= MAP_PRIVATE; +#else + mmap_flags |= MAP_SHARED; +#endif + ptr = mmap(NULL, alloc_size, PROT_READ|PROT_WRITE, mmap_flags, -1, 0); + + if (ptr == MAP_FAILED) + goto out_fail; + + pool->map = ptr; + pool->bitmap = (unsigned int *)((char *) ptr + (pool->nr_blocks * SMALLOC_BPL)); + memset(pool->bitmap, 0, bitmap_blocks * sizeof(unsigned int)); + + pool->lock = fio_sem_init(FIO_SEM_UNLOCKED); + if (!pool->lock) + goto out_fail; + + nr_pools++; + return true; +out_fail: + log_err("smalloc: failed adding pool\n"); + if (pool->map) + munmap(pool->map, pool->mmap_size); + return false; +} + +void sinit(void) +{ + bool ret; + int i; + + /* + * sinit() can be called more than once if alloc-size is + * set. But we want to allocate space for the struct pool + * instances only once. + */ + if (!mp) { + mp = (struct pool *) mmap(NULL, + MAX_POOLS * sizeof(struct pool), + PROT_READ | PROT_WRITE, + OS_MAP_ANON | MAP_SHARED, -1, 0); + + assert(mp != MAP_FAILED); + } + + for (i = 0; i < INITIAL_POOLS; i++) { + ret = add_pool(&mp[nr_pools], smalloc_pool_size); + if (!ret) + break; + } + + /* + * If we added at least one pool, we should be OK for most + * cases. + */ + assert(i); +} + +static void cleanup_pool(struct pool *pool) +{ + /* + * This will also remove the temporary file we used as a backing + * store, it was already unlinked + */ + munmap(pool->map, pool->mmap_size); + + if (pool->lock) + fio_sem_remove(pool->lock); +} + +void scleanup(void) +{ + unsigned int i; + + for (i = 0; i < nr_pools; i++) + cleanup_pool(&mp[i]); + + munmap(mp, MAX_POOLS * sizeof(struct pool)); +} + +#ifdef SMALLOC_REDZONE +static void *postred_ptr(struct block_hdr *hdr) +{ + uintptr_t ptr; + + ptr = (uintptr_t) hdr + hdr->size - sizeof(unsigned int); + ptr = (uintptr_t) PTR_ALIGN(ptr, int_mask); + + return (void *) ptr; +} + +static void fill_redzone(struct block_hdr *hdr) +{ + unsigned int *postred = postred_ptr(hdr); + + hdr->prered = SMALLOC_PRE_RED; + *postred = SMALLOC_POST_RED; +} + +static void sfree_check_redzone(struct block_hdr *hdr) +{ + unsigned int *postred = postred_ptr(hdr); + + if (hdr->prered != SMALLOC_PRE_RED) { + log_err("smalloc pre redzone destroyed!\n" + " ptr=%p, prered=%x, expected %x\n", + hdr, hdr->prered, SMALLOC_PRE_RED); + assert(0); + } + if (*postred != SMALLOC_POST_RED) { + log_err("smalloc post redzone destroyed!\n" + " ptr=%p, postred=%x, expected %x\n", + hdr, *postred, SMALLOC_POST_RED); + assert(0); + } +} +#else +static void fill_redzone(struct block_hdr *hdr) +{ +} + +static void sfree_check_redzone(struct block_hdr *hdr) +{ +} +#endif + +static void sfree_pool(struct pool *pool, void *ptr) +{ + struct block_hdr *hdr; + unsigned int i, idx; + unsigned long offset; + + if (!ptr) + return; + + ptr -= sizeof(*hdr); + hdr = ptr; + + assert(ptr_valid(pool, ptr)); + + sfree_check_redzone(hdr); + + offset = ptr - pool->map; + i = offset / SMALLOC_BPL; + idx = (offset % SMALLOC_BPL) / SMALLOC_BPB; + + fio_sem_down(pool->lock); + clear_blocks(pool, i, idx, size_to_blocks(hdr->size)); + if (i < pool->next_non_full) + pool->next_non_full = i; + pool->free_blocks += size_to_blocks(hdr->size); + fio_sem_up(pool->lock); +} + +void sfree(void *ptr) +{ + struct pool *pool = NULL; + 
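+	/*
+	 * Each pool owns the address range [map, map + nr_blocks *
+	 * SMALLOC_BPL); walk the pools to find the one this pointer
+	 * belongs to (see ptr_valid()).
+	 */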
unsigned int i; + + if (!ptr) + return; + + for (i = 0; i < nr_pools; i++) { + if (ptr_valid(&mp[i], ptr)) { + pool = &mp[i]; + break; + } + } + + if (pool) { + sfree_pool(pool, ptr); + return; + } + + log_err("smalloc: ptr %p not from smalloc pool\n", ptr); +} + +static unsigned int find_best_index(struct pool *pool) +{ + unsigned int i; + + assert(pool->free_blocks); + + for (i = pool->next_non_full; pool->bitmap[i] == -1U; i++) { + if (i == pool->nr_blocks - 1) { + unsigned int j; + + for (j = 0; j < pool->nr_blocks; j++) + if (pool->bitmap[j] != -1U) + return j; + } + } + + return i; +} + +static void *__smalloc_pool(struct pool *pool, size_t size) +{ + size_t nr_blocks; + unsigned int i; + unsigned int offset; + unsigned int last_idx; + void *ret = NULL; + + fio_sem_down(pool->lock); + + nr_blocks = size_to_blocks(size); + if (nr_blocks > pool->free_blocks) + goto fail; + + pool->next_non_full = find_best_index(pool); + + last_idx = 0; + offset = -1U; + i = pool->next_non_full; + while (i < pool->nr_blocks) { + unsigned int idx; + + if (pool->bitmap[i] == -1U) { + i++; + last_idx = 0; + continue; + } + + idx = find_next_zero(pool->bitmap[i], last_idx); + if (!blocks_free(pool, i, idx, nr_blocks)) { + idx += nr_blocks; + if (idx < SMALLOC_BPI) + last_idx = idx; + else { + last_idx = 0; + while (idx >= SMALLOC_BPI) { + i++; + idx -= SMALLOC_BPI; + } + } + continue; + } + set_blocks(pool, i, idx, nr_blocks); + offset = i * SMALLOC_BPL + idx * SMALLOC_BPB; + break; + } + + if (i < pool->nr_blocks) { + pool->free_blocks -= nr_blocks; + ret = pool->map + offset; + } +fail: + fio_sem_up(pool->lock); + return ret; +} + +static size_t size_to_alloc_size(size_t size) +{ + size_t alloc_size = size + sizeof(struct block_hdr); + + /* + * Round to int alignment, so that the postred pointer will + * be naturally aligned as well. 
+ */ +#ifdef SMALLOC_REDZONE + alloc_size += sizeof(unsigned int); + alloc_size = (alloc_size + int_mask) & ~int_mask; +#endif + + return alloc_size; +} + +static void *smalloc_pool(struct pool *pool, size_t size) +{ + size_t alloc_size = size_to_alloc_size(size); + void *ptr; + + ptr = __smalloc_pool(pool, alloc_size); + if (ptr) { + struct block_hdr *hdr = ptr; + + hdr->size = alloc_size; + fill_redzone(hdr); + + ptr += sizeof(*hdr); + memset(ptr, 0, size); + } + + return ptr; +} + +static void smalloc_print_bitmap(struct pool *pool) +{ + size_t nr_blocks = pool->nr_blocks; + unsigned int *bitmap = pool->bitmap; + unsigned int i, j; + char *buffer; + + if (!enable_smalloc_debug) + return; + + buffer = malloc(SMALLOC_BPI + 1); + if (!buffer) + return; + buffer[SMALLOC_BPI] = '\0'; + + for (i = 0; i < nr_blocks; i++) { + unsigned int line = bitmap[i]; + + /* skip completely full lines */ + if (line == -1U) + continue; + + for (j = 0; j < SMALLOC_BPI; j++) + if ((1 << j) & line) + buffer[SMALLOC_BPI-1-j] = '1'; + else + buffer[SMALLOC_BPI-1-j] = '0'; + + log_err("smalloc: bitmap %5u, %s\n", i, buffer); + } + + free(buffer); +} + +void smalloc_debug(size_t size) +{ + unsigned int i; + size_t alloc_size = size_to_alloc_size(size); + size_t alloc_blocks; + + alloc_blocks = size_to_blocks(alloc_size); + + if (size) + log_err("smalloc: size = %lu, alloc_size = %lu, blocks = %lu\n", + (unsigned long) size, (unsigned long) alloc_size, + (unsigned long) alloc_blocks); + for (i = 0; i < nr_pools; i++) { + log_err("smalloc: pool %u, free/total blocks %u/%u\n", i, + (unsigned int) (mp[i].free_blocks), + (unsigned int) (mp[i].nr_blocks*sizeof(unsigned int)*8)); + if (size && mp[i].free_blocks >= alloc_blocks) { + void *ptr = smalloc_pool(&mp[i], size); + if (ptr) { + sfree(ptr); + last_pool = i; + log_err("smalloc: smalloc_pool %u succeeded\n", i); + } else { + log_err("smalloc: smalloc_pool %u failed\n", i); + log_err("smalloc: next_non_full=%u, nr_blocks=%u\n", + (unsigned int) mp[i].next_non_full, (unsigned int) mp[i].nr_blocks); + smalloc_print_bitmap(&mp[i]); + } + } + } +} + +void *smalloc(size_t size) +{ + unsigned int i, end_pool; + + if (size != (unsigned int) size) + return NULL; + + i = last_pool; + end_pool = nr_pools; + + do { + for (; i < end_pool; i++) { + void *ptr = smalloc_pool(&mp[i], size); + + if (ptr) { + last_pool = i; + return ptr; + } + } + if (last_pool) { + end_pool = last_pool; + last_pool = i = 0; + continue; + } + + break; + } while (1); + + log_err("smalloc: OOM. 
Consider using --alloc-size to increase the "
+		"shared memory available.\n");
+	smalloc_debug(size);
+	return NULL;
+}
+
+void *scalloc(size_t nmemb, size_t size)
+{
+	return smalloc(nmemb * size);
+}
+
+char *smalloc_strdup(const char *str)
+{
+	char *ptr = NULL;
+
+	ptr = smalloc(strlen(str) + 1);
+	if (ptr)
+		strcpy(ptr, str);
+	return ptr;
+}
diff --git a/smalloc.h b/smalloc.h
new file mode 100644
index 0000000..1f7716f
--- /dev/null
+++ b/smalloc.h
@@ -0,0 +1,16 @@
+#ifndef FIO_SMALLOC_H
+#define FIO_SMALLOC_H
+
+#include <stddef.h>
+
+extern void *smalloc(size_t);
+extern void *scalloc(size_t, size_t);
+extern void sfree(void *);
+extern char *smalloc_strdup(const char *);
+extern void sinit(void);
+extern void scleanup(void);
+extern void smalloc_debug(size_t);
+
+extern unsigned int smalloc_pool_size;
+
+#endif
diff --git a/stat.c b/stat.c
new file mode 100644
index 0000000..d8c01d1
--- /dev/null
+++ b/stat.c
@@ -0,0 +1,3148 @@
+#include <stdio.h>
+#include <string.h>
+#include <sys/time.h>
+#include <sys/stat.h>
+#include <math.h>
+
+#include "fio.h"
+#include "diskutil.h"
+#include "lib/ieee754.h"
+#include "json.h"
+#include "lib/getrusage.h"
+#include "idletime.h"
+#include "lib/pow2.h"
+#include "lib/output_buffer.h"
+#include "helper_thread.h"
+#include "smalloc.h"
+#include "zbd.h"
+#include "oslib/asprintf.h"
+
+#define LOG_MSEC_SLACK 1
+
+struct fio_sem *stat_sem;
+
+void clear_rusage_stat(struct thread_data *td)
+{
+	struct thread_stat *ts = &td->ts;
+
+	fio_getrusage(&td->ru_start);
+	ts->usr_time = ts->sys_time = 0;
+	ts->ctx = 0;
+	ts->minf = ts->majf = 0;
+}
+
+void update_rusage_stat(struct thread_data *td)
+{
+	struct thread_stat *ts = &td->ts;
+
+	fio_getrusage(&td->ru_end);
+	ts->usr_time += mtime_since_tv(&td->ru_start.ru_utime,
+					&td->ru_end.ru_utime);
+	ts->sys_time += mtime_since_tv(&td->ru_start.ru_stime,
+					&td->ru_end.ru_stime);
+	ts->ctx += td->ru_end.ru_nvcsw + td->ru_end.ru_nivcsw
+			- (td->ru_start.ru_nvcsw + td->ru_start.ru_nivcsw);
+	ts->minf += td->ru_end.ru_minflt - td->ru_start.ru_minflt;
+	ts->majf += td->ru_end.ru_majflt - td->ru_start.ru_majflt;
+
+	memcpy(&td->ru_start, &td->ru_end, sizeof(td->ru_end));
+}
+
+/*
+ * Given a latency, return the index of the corresponding bucket in
+ * the structure tracking percentiles.
+ *
+ * (1) find the group (and error bits) that the value (latency)
+ * belongs to by looking at its MSB. (2) find the bucket number in the
+ * group by looking at the index bits.
+ *
+ */
+static unsigned int plat_val_to_idx(unsigned long long val)
+{
+	unsigned int msb, error_bits, base, offset, idx;
+
+	/* Find MSB starting from bit 0 */
+	if (val == 0)
+		msb = 0;
+	else
+		msb = (sizeof(val)*8) - __builtin_clzll(val) - 1;
+
+	/*
+	 * MSB <= (FIO_IO_U_PLAT_BITS-1), cannot be rounded off. Use
+	 * all bits of the sample as index
+	 */
+	if (msb <= FIO_IO_U_PLAT_BITS)
+		return val;
+
+	/* Compute the number of error bits to discard */
+	error_bits = msb - FIO_IO_U_PLAT_BITS;
+
+	/* Compute the number of buckets before the group */
+	base = (error_bits + 1) << FIO_IO_U_PLAT_BITS;
+
+	/*
+	 * Discard the error bits and apply the mask to find the
+	 * index for the buckets in the group
+	 */
+	offset = (FIO_IO_U_PLAT_VAL - 1) & (val >> error_bits);
+
+	/* Make sure the index does not exceed (array size - 1) */
+	idx = (base + offset) < (FIO_IO_U_PLAT_NR - 1) ?
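+	/*
+	 * Clamp so that an oversized latency falls into the last, catch-all
+	 * bucket.  Worked example, assuming fio's default FIO_IO_U_PLAT_BITS
+	 * of 6: val = 1000 gives msb = 9, error_bits = 3, base = 4 << 6 = 256,
+	 * offset = 125 & 63 = 61, hence idx = 317.
+	 */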
+ (base + offset) : (FIO_IO_U_PLAT_NR - 1); + + return idx; +} + +/* + * Convert the given index of the bucket array to the value + * represented by the bucket + */ +static unsigned long long plat_idx_to_val(unsigned int idx) +{ + unsigned int error_bits; + unsigned long long k, base; + + assert(idx < FIO_IO_U_PLAT_NR); + + /* MSB <= (FIO_IO_U_PLAT_BITS-1), cannot be rounded off. Use + * all bits of the sample as index */ + if (idx < (FIO_IO_U_PLAT_VAL << 1)) + return idx; + + /* Find the group and compute the minimum value of that group */ + error_bits = (idx >> FIO_IO_U_PLAT_BITS) - 1; + base = ((unsigned long long) 1) << (error_bits + FIO_IO_U_PLAT_BITS); + + /* Find its bucket number of the group */ + k = idx % FIO_IO_U_PLAT_VAL; + + /* Return the mean of the range of the bucket */ + return base + ((k + 0.5) * (1 << error_bits)); +} + +static int double_cmp(const void *a, const void *b) +{ + const fio_fp64_t fa = *(const fio_fp64_t *) a; + const fio_fp64_t fb = *(const fio_fp64_t *) b; + int cmp = 0; + + if (fa.u.f > fb.u.f) + cmp = 1; + else if (fa.u.f < fb.u.f) + cmp = -1; + + return cmp; +} + +unsigned int calc_clat_percentiles(uint64_t *io_u_plat, unsigned long long nr, + fio_fp64_t *plist, unsigned long long **output, + unsigned long long *maxv, unsigned long long *minv) +{ + unsigned long long sum = 0; + unsigned int len, i, j = 0; + unsigned long long *ovals = NULL; + bool is_last; + + *minv = -1ULL; + *maxv = 0; + + len = 0; + while (len < FIO_IO_U_LIST_MAX_LEN && plist[len].u.f != 0.0) + len++; + + if (!len) + return 0; + + /* + * Sort the percentile list. Note that it may already be sorted if + * we are using the default values, but since it's a short list this + * isn't a worry. Also note that this does not work for NaN values. + */ + if (len > 1) + qsort(plist, len, sizeof(plist[0]), double_cmp); + + ovals = malloc(len * sizeof(*ovals)); + if (!ovals) + return 0; + + /* + * Calculate bucket values, note down max and min values + */ + is_last = false; + for (i = 0; i < FIO_IO_U_PLAT_NR && !is_last; i++) { + sum += io_u_plat[i]; + while (sum >= ((long double) plist[j].u.f / 100.0 * nr)) { + assert(plist[j].u.f <= 100.0); + + ovals[j] = plat_idx_to_val(i); + if (ovals[j] < *minv) + *minv = ovals[j]; + if (ovals[j] > *maxv) + *maxv = ovals[j]; + + is_last = (j == len - 1) != 0; + if (is_last) + break; + + j++; + } + } + + if (!is_last) + log_err("fio: error calculating latency percentiles\n"); + + *output = ovals; + return len; +} + +/* + * Find and display the p-th percentile of clat + */ +static void show_clat_percentiles(uint64_t *io_u_plat, unsigned long long nr, + fio_fp64_t *plist, unsigned int precision, + const char *pre, struct buf_output *out) +{ + unsigned int divisor, len, i, j = 0; + unsigned long long minv, maxv; + unsigned long long *ovals; + int per_line, scale_down, time_width; + bool is_last; + char fmt[32]; + + len = calc_clat_percentiles(io_u_plat, nr, plist, &ovals, &maxv, &minv); + if (!len || !ovals) + goto out; + + /* + * We default to nsecs, but if the value range is such that we + * should scale down to usecs or msecs, do that. 
+ */ + if (minv > 2000000 && maxv > 99999999ULL) { + scale_down = 2; + divisor = 1000000; + log_buf(out, " %s percentiles (msec):\n |", pre); + } else if (minv > 2000 && maxv > 99999) { + scale_down = 1; + divisor = 1000; + log_buf(out, " %s percentiles (usec):\n |", pre); + } else { + scale_down = 0; + divisor = 1; + log_buf(out, " %s percentiles (nsec):\n |", pre); + } + + + time_width = max(5, (int) (log10(maxv / divisor) + 1)); + snprintf(fmt, sizeof(fmt), " %%%u.%ufth=[%%%dllu]%%c", precision + 3, + precision, time_width); + /* fmt will be something like " %5.2fth=[%4llu]%c" */ + per_line = (80 - 7) / (precision + 10 + time_width); + + for (j = 0; j < len; j++) { + /* for formatting */ + if (j != 0 && (j % per_line) == 0) + log_buf(out, " |"); + + /* end of the list */ + is_last = (j == len - 1) != 0; + + for (i = 0; i < scale_down; i++) + ovals[j] = (ovals[j] + 999) / 1000; + + log_buf(out, fmt, plist[j].u.f, ovals[j], is_last ? '\n' : ','); + + if (is_last) + break; + + if ((j % per_line) == per_line - 1) /* for formatting */ + log_buf(out, "\n"); + } + +out: + free(ovals); +} + +bool calc_lat(struct io_stat *is, unsigned long long *min, + unsigned long long *max, double *mean, double *dev) +{ + double n = (double) is->samples; + + if (n == 0) + return false; + + *min = is->min_val; + *max = is->max_val; + *mean = is->mean.u.f; + + if (n > 1.0) + *dev = sqrt(is->S.u.f / (n - 1.0)); + else + *dev = 0; + + return true; +} + +void show_group_stats(struct group_run_stats *rs, struct buf_output *out) +{ + char *io, *agg, *min, *max; + char *ioalt, *aggalt, *minalt, *maxalt; + const char *str[] = { " READ", " WRITE" , " TRIM"}; + int i; + + log_buf(out, "\nRun status group %d (all jobs):\n", rs->groupid); + + for (i = 0; i < DDIR_RWDIR_CNT; i++) { + const int i2p = is_power_of_2(rs->kb_base); + + if (!rs->max_run[i]) + continue; + + io = num2str(rs->iobytes[i], rs->sig_figs, 1, i2p, N2S_BYTE); + ioalt = num2str(rs->iobytes[i], rs->sig_figs, 1, !i2p, N2S_BYTE); + agg = num2str(rs->agg[i], rs->sig_figs, 1, i2p, rs->unit_base); + aggalt = num2str(rs->agg[i], rs->sig_figs, 1, !i2p, rs->unit_base); + min = num2str(rs->min_bw[i], rs->sig_figs, 1, i2p, rs->unit_base); + minalt = num2str(rs->min_bw[i], rs->sig_figs, 1, !i2p, rs->unit_base); + max = num2str(rs->max_bw[i], rs->sig_figs, 1, i2p, rs->unit_base); + maxalt = num2str(rs->max_bw[i], rs->sig_figs, 1, !i2p, rs->unit_base); + log_buf(out, "%s: bw=%s (%s), %s-%s (%s-%s), io=%s (%s), run=%llu-%llumsec\n", + rs->unified_rw_rep ? 
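+		/*
+		 * With unified_rw_rep, reads/writes/trims were accumulated
+		 * into a single direction, so report one "MIXED" row.
+		 */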
" MIXED" : str[i], + agg, aggalt, min, max, minalt, maxalt, io, ioalt, + (unsigned long long) rs->min_run[i], + (unsigned long long) rs->max_run[i]); + + free(io); + free(agg); + free(min); + free(max); + free(ioalt); + free(aggalt); + free(minalt); + free(maxalt); + } +} + +void stat_calc_dist(uint64_t *map, unsigned long total, double *io_u_dist) +{ + int i; + + /* + * Do depth distribution calculations + */ + for (i = 0; i < FIO_IO_U_MAP_NR; i++) { + if (total) { + io_u_dist[i] = (double) map[i] / (double) total; + io_u_dist[i] *= 100.0; + if (io_u_dist[i] < 0.1 && map[i]) + io_u_dist[i] = 0.1; + } else + io_u_dist[i] = 0.0; + } +} + +static void stat_calc_lat(struct thread_stat *ts, double *dst, + uint64_t *src, int nr) +{ + unsigned long total = ddir_rw_sum(ts->total_io_u); + int i; + + /* + * Do latency distribution calculations + */ + for (i = 0; i < nr; i++) { + if (total) { + dst[i] = (double) src[i] / (double) total; + dst[i] *= 100.0; + if (dst[i] < 0.01 && src[i]) + dst[i] = 0.01; + } else + dst[i] = 0.0; + } +} + +/* + * To keep the terse format unaltered, add all of the ns latency + * buckets to the first us latency bucket + */ +static void stat_calc_lat_nu(struct thread_stat *ts, double *io_u_lat_u) +{ + unsigned long ntotal = 0, total = ddir_rw_sum(ts->total_io_u); + int i; + + stat_calc_lat(ts, io_u_lat_u, ts->io_u_lat_u, FIO_IO_U_LAT_U_NR); + + for (i = 0; i < FIO_IO_U_LAT_N_NR; i++) + ntotal += ts->io_u_lat_n[i]; + + io_u_lat_u[0] += 100.0 * (double) ntotal / (double) total; +} + +void stat_calc_lat_n(struct thread_stat *ts, double *io_u_lat) +{ + stat_calc_lat(ts, io_u_lat, ts->io_u_lat_n, FIO_IO_U_LAT_N_NR); +} + +void stat_calc_lat_u(struct thread_stat *ts, double *io_u_lat) +{ + stat_calc_lat(ts, io_u_lat, ts->io_u_lat_u, FIO_IO_U_LAT_U_NR); +} + +void stat_calc_lat_m(struct thread_stat *ts, double *io_u_lat) +{ + stat_calc_lat(ts, io_u_lat, ts->io_u_lat_m, FIO_IO_U_LAT_M_NR); +} + +static void display_lat(const char *name, unsigned long long min, + unsigned long long max, double mean, double dev, + struct buf_output *out) +{ + const char *base = "(nsec)"; + char *minp, *maxp; + + if (nsec_to_msec(&min, &max, &mean, &dev)) + base = "(msec)"; + else if (nsec_to_usec(&min, &max, &mean, &dev)) + base = "(usec)"; + + minp = num2str(min, 6, 1, 0, N2S_NONE); + maxp = num2str(max, 6, 1, 0, N2S_NONE); + + log_buf(out, " %s %s: min=%s, max=%s, avg=%5.02f," + " stdev=%5.02f\n", name, base, minp, maxp, mean, dev); + + free(minp); + free(maxp); +} + +static void show_ddir_status(struct group_run_stats *rs, struct thread_stat *ts, + int ddir, struct buf_output *out) +{ + unsigned long runt; + unsigned long long min, max, bw, iops; + double mean, dev; + char *io_p, *bw_p, *bw_p_alt, *iops_p, *post_st = NULL; + int i2p; + + if (ddir_sync(ddir)) { + if (calc_lat(&ts->sync_stat, &min, &max, &mean, &dev)) { + log_buf(out, " %s:\n", "fsync/fdatasync/sync_file_range"); + display_lat(io_ddir_name(ddir), min, max, mean, dev, out); + show_clat_percentiles(ts->io_u_sync_plat, + ts->sync_stat.samples, + ts->percentile_list, + ts->percentile_precision, + io_ddir_name(ddir), out); + } + return; + } + + assert(ddir_rw(ddir)); + + if (!ts->runtime[ddir]) + return; + + i2p = is_power_of_2(rs->kb_base); + runt = ts->runtime[ddir]; + + bw = (1000 * ts->io_bytes[ddir]) / runt; + io_p = num2str(ts->io_bytes[ddir], ts->sig_figs, 1, i2p, N2S_BYTE); + bw_p = num2str(bw, ts->sig_figs, 1, i2p, ts->unit_base); + bw_p_alt = num2str(bw, ts->sig_figs, 1, !i2p, ts->unit_base); + + iops = (1000 * 
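+	/*
+	 * ts->runtime[] is in milliseconds, hence the factor of 1000 to
+	 * turn the running totals into per-second rates.
+	 */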
(uint64_t)ts->total_io_u[ddir]) / runt; + iops_p = num2str(iops, ts->sig_figs, 1, 0, N2S_NONE); + if (ddir == DDIR_WRITE) + post_st = zbd_write_status(ts); + else if (ddir == DDIR_READ && ts->cachehit && ts->cachemiss) { + uint64_t total; + double hit; + + total = ts->cachehit + ts->cachemiss; + hit = (double) ts->cachehit / (double) total; + hit *= 100.0; + if (asprintf(&post_st, "; Cachehit=%0.2f%%", hit) < 0) + post_st = NULL; + } + + log_buf(out, " %s: IOPS=%s, BW=%s (%s)(%s/%llumsec)%s\n", + rs->unified_rw_rep ? "mixed" : io_ddir_name(ddir), + iops_p, bw_p, bw_p_alt, io_p, + (unsigned long long) ts->runtime[ddir], + post_st ? : ""); + + free(post_st); + free(io_p); + free(bw_p); + free(bw_p_alt); + free(iops_p); + + if (calc_lat(&ts->slat_stat[ddir], &min, &max, &mean, &dev)) + display_lat("slat", min, max, mean, dev, out); + if (calc_lat(&ts->clat_stat[ddir], &min, &max, &mean, &dev)) + display_lat("clat", min, max, mean, dev, out); + if (calc_lat(&ts->lat_stat[ddir], &min, &max, &mean, &dev)) + display_lat(" lat", min, max, mean, dev, out); + if (calc_lat(&ts->clat_high_prio_stat[ddir], &min, &max, &mean, &dev)) { + display_lat(ts->lat_percentiles ? "high prio_lat" : "high prio_clat", + min, max, mean, dev, out); + if (calc_lat(&ts->clat_low_prio_stat[ddir], &min, &max, &mean, &dev)) + display_lat(ts->lat_percentiles ? "low prio_lat" : "low prio_clat", + min, max, mean, dev, out); + } + + if (ts->slat_percentiles && ts->slat_stat[ddir].samples > 0) + show_clat_percentiles(ts->io_u_plat[FIO_SLAT][ddir], + ts->slat_stat[ddir].samples, + ts->percentile_list, + ts->percentile_precision, "slat", out); + if (ts->clat_percentiles && ts->clat_stat[ddir].samples > 0) + show_clat_percentiles(ts->io_u_plat[FIO_CLAT][ddir], + ts->clat_stat[ddir].samples, + ts->percentile_list, + ts->percentile_precision, "clat", out); + if (ts->lat_percentiles && ts->lat_stat[ddir].samples > 0) + show_clat_percentiles(ts->io_u_plat[FIO_LAT][ddir], + ts->lat_stat[ddir].samples, + ts->percentile_list, + ts->percentile_precision, "lat", out); + + if (ts->clat_percentiles || ts->lat_percentiles) { + const char *name = ts->lat_percentiles ? "lat" : "clat"; + char prio_name[32]; + uint64_t samples; + + if (ts->lat_percentiles) + samples = ts->lat_stat[ddir].samples; + else + samples = ts->clat_stat[ddir].samples; + + /* Only print this if some high and low priority stats were collected */ + if (ts->clat_high_prio_stat[ddir].samples > 0 && + ts->clat_low_prio_stat[ddir].samples > 0) + { + sprintf(prio_name, "high prio (%.2f%%) %s", + 100. * (double) ts->clat_high_prio_stat[ddir].samples / (double) samples, + name); + show_clat_percentiles(ts->io_u_plat_high_prio[ddir], + ts->clat_high_prio_stat[ddir].samples, + ts->percentile_list, + ts->percentile_precision, prio_name, out); + + sprintf(prio_name, "low prio (%.2f%%) %s", + 100. 
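+				/*
+				 * Fraction of all completion samples that
+				 * were low priority, as a percentage.
+				 */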
* (double) ts->clat_low_prio_stat[ddir].samples / (double) samples, + name); + show_clat_percentiles(ts->io_u_plat_low_prio[ddir], + ts->clat_low_prio_stat[ddir].samples, + ts->percentile_list, + ts->percentile_precision, prio_name, out); + } + } + + if (calc_lat(&ts->bw_stat[ddir], &min, &max, &mean, &dev)) { + double p_of_agg = 100.0, fkb_base = (double)rs->kb_base; + const char *bw_str; + + if ((rs->unit_base == 1) && i2p) + bw_str = "Kibit"; + else if (rs->unit_base == 1) + bw_str = "kbit"; + else if (i2p) + bw_str = "KiB"; + else + bw_str = "kB"; + + if (rs->agg[ddir]) { + p_of_agg = mean * 100 / (double) (rs->agg[ddir] / 1024); + if (p_of_agg > 100.0) + p_of_agg = 100.0; + } + + if (rs->unit_base == 1) { + min *= 8.0; + max *= 8.0; + mean *= 8.0; + dev *= 8.0; + } + + if (mean > fkb_base * fkb_base) { + min /= fkb_base; + max /= fkb_base; + mean /= fkb_base; + dev /= fkb_base; + bw_str = (rs->unit_base == 1 ? "Mibit" : "MiB"); + } + + log_buf(out, " bw (%5s/s): min=%5llu, max=%5llu, per=%3.2f%%, " + "avg=%5.02f, stdev=%5.02f, samples=%" PRIu64 "\n", + bw_str, min, max, p_of_agg, mean, dev, + (&ts->bw_stat[ddir])->samples); + } + if (calc_lat(&ts->iops_stat[ddir], &min, &max, &mean, &dev)) { + log_buf(out, " iops : min=%5llu, max=%5llu, " + "avg=%5.02f, stdev=%5.02f, samples=%" PRIu64 "\n", + min, max, mean, dev, (&ts->iops_stat[ddir])->samples); + } +} + +static bool show_lat(double *io_u_lat, int nr, const char **ranges, + const char *msg, struct buf_output *out) +{ + bool new_line = true, shown = false; + int i, line = 0; + + for (i = 0; i < nr; i++) { + if (io_u_lat[i] <= 0.0) + continue; + shown = true; + if (new_line) { + if (line) + log_buf(out, "\n"); + log_buf(out, " lat (%s) : ", msg); + new_line = false; + line = 0; + } + if (line) + log_buf(out, ", "); + log_buf(out, "%s%3.2f%%", ranges[i], io_u_lat[i]); + line++; + if (line == 5) + new_line = true; + } + + if (shown) + log_buf(out, "\n"); + + return true; +} + +static void show_lat_n(double *io_u_lat_n, struct buf_output *out) +{ + const char *ranges[] = { "2=", "4=", "10=", "20=", "50=", "100=", + "250=", "500=", "750=", "1000=", }; + + show_lat(io_u_lat_n, FIO_IO_U_LAT_N_NR, ranges, "nsec", out); +} + +static void show_lat_u(double *io_u_lat_u, struct buf_output *out) +{ + const char *ranges[] = { "2=", "4=", "10=", "20=", "50=", "100=", + "250=", "500=", "750=", "1000=", }; + + show_lat(io_u_lat_u, FIO_IO_U_LAT_U_NR, ranges, "usec", out); +} + +static void show_lat_m(double *io_u_lat_m, struct buf_output *out) +{ + const char *ranges[] = { "2=", "4=", "10=", "20=", "50=", "100=", + "250=", "500=", "750=", "1000=", "2000=", + ">=2000=", }; + + show_lat(io_u_lat_m, FIO_IO_U_LAT_M_NR, ranges, "msec", out); +} + +static void show_latencies(struct thread_stat *ts, struct buf_output *out) +{ + double io_u_lat_n[FIO_IO_U_LAT_N_NR]; + double io_u_lat_u[FIO_IO_U_LAT_U_NR]; + double io_u_lat_m[FIO_IO_U_LAT_M_NR]; + + stat_calc_lat_n(ts, io_u_lat_n); + stat_calc_lat_u(ts, io_u_lat_u); + stat_calc_lat_m(ts, io_u_lat_m); + + show_lat_n(io_u_lat_n, out); + show_lat_u(io_u_lat_u, out); + show_lat_m(io_u_lat_m, out); +} + +static int block_state_category(int block_state) +{ + switch (block_state) { + case BLOCK_STATE_UNINIT: + return 0; + case BLOCK_STATE_TRIMMED: + case BLOCK_STATE_WRITTEN: + return 1; + case BLOCK_STATE_WRITE_FAILURE: + case BLOCK_STATE_TRIM_FAILURE: + return 2; + default: + /* Silence compile warning on some BSDs and have a return */ + assert(0); + return -1; + } +} + +static int compare_block_infos(const void 
*bs1, const void *bs2) +{ + uint64_t block1 = *(uint64_t *)bs1; + uint64_t block2 = *(uint64_t *)bs2; + int state1 = BLOCK_INFO_STATE(block1); + int state2 = BLOCK_INFO_STATE(block2); + int bscat1 = block_state_category(state1); + int bscat2 = block_state_category(state2); + int cycles1 = BLOCK_INFO_TRIMS(block1); + int cycles2 = BLOCK_INFO_TRIMS(block2); + + if (bscat1 < bscat2) + return -1; + if (bscat1 > bscat2) + return 1; + + if (cycles1 < cycles2) + return -1; + if (cycles1 > cycles2) + return 1; + + if (state1 < state2) + return -1; + if (state1 > state2) + return 1; + + assert(block1 == block2); + return 0; +} + +static int calc_block_percentiles(int nr_block_infos, uint32_t *block_infos, + fio_fp64_t *plist, unsigned int **percentiles, + unsigned int *types) +{ + int len = 0; + int i, nr_uninit; + + qsort(block_infos, nr_block_infos, sizeof(uint32_t), compare_block_infos); + + while (len < FIO_IO_U_LIST_MAX_LEN && plist[len].u.f != 0.0) + len++; + + if (!len) + return 0; + + /* + * Sort the percentile list. Note that it may already be sorted if + * we are using the default values, but since it's a short list this + * isn't a worry. Also note that this does not work for NaN values. + */ + if (len > 1) + qsort(plist, len, sizeof(plist[0]), double_cmp); + + /* Start only after the uninit entries end */ + for (nr_uninit = 0; + nr_uninit < nr_block_infos + && BLOCK_INFO_STATE(block_infos[nr_uninit]) == BLOCK_STATE_UNINIT; + nr_uninit ++) + ; + + if (nr_uninit == nr_block_infos) + return 0; + + *percentiles = calloc(len, sizeof(**percentiles)); + + for (i = 0; i < len; i++) { + int idx = (plist[i].u.f * (nr_block_infos - nr_uninit) / 100) + + nr_uninit; + (*percentiles)[i] = BLOCK_INFO_TRIMS(block_infos[idx]); + } + + memset(types, 0, sizeof(*types) * BLOCK_STATE_COUNT); + for (i = 0; i < nr_block_infos; i++) + types[BLOCK_INFO_STATE(block_infos[i])]++; + + return len; +} + +static const char *block_state_names[] = { + [BLOCK_STATE_UNINIT] = "unwritten", + [BLOCK_STATE_TRIMMED] = "trimmed", + [BLOCK_STATE_WRITTEN] = "written", + [BLOCK_STATE_TRIM_FAILURE] = "trim failure", + [BLOCK_STATE_WRITE_FAILURE] = "write failure", +}; + +static void show_block_infos(int nr_block_infos, uint32_t *block_infos, + fio_fp64_t *plist, struct buf_output *out) +{ + int len, pos, i; + unsigned int *percentiles = NULL; + unsigned int block_state_counts[BLOCK_STATE_COUNT]; + + len = calc_block_percentiles(nr_block_infos, block_infos, plist, + &percentiles, block_state_counts); + + log_buf(out, " block lifetime percentiles :\n |"); + pos = 0; + for (i = 0; i < len; i++) { + uint32_t block_info = percentiles[i]; +#define LINE_LENGTH 75 + char str[LINE_LENGTH]; + int strln = snprintf(str, LINE_LENGTH, " %3.2fth=%u%c", + plist[i].u.f, block_info, + i == len - 1 ? '\n' : ','); + assert(strln < LINE_LENGTH); + if (pos + strln > LINE_LENGTH) { + pos = 0; + log_buf(out, "\n |"); + } + log_buf(out, "%s", str); + pos += strln; +#undef LINE_LENGTH + } + if (percentiles) + free(percentiles); + + log_buf(out, " states :"); + for (i = 0; i < BLOCK_STATE_COUNT; i++) + log_buf(out, " %s=%u%c", + block_state_names[i], block_state_counts[i], + i == BLOCK_STATE_COUNT - 1 ? 
'\n' : ','); +} + +static void show_ss_normal(struct thread_stat *ts, struct buf_output *out) +{ + char *p1, *p1alt, *p2; + unsigned long long bw_mean, iops_mean; + const int i2p = is_power_of_2(ts->kb_base); + + if (!ts->ss_dur) + return; + + bw_mean = steadystate_bw_mean(ts); + iops_mean = steadystate_iops_mean(ts); + + p1 = num2str(bw_mean / ts->kb_base, ts->sig_figs, ts->kb_base, i2p, ts->unit_base); + p1alt = num2str(bw_mean / ts->kb_base, ts->sig_figs, ts->kb_base, !i2p, ts->unit_base); + p2 = num2str(iops_mean, ts->sig_figs, 1, 0, N2S_NONE); + + log_buf(out, " steadystate : attained=%s, bw=%s (%s), iops=%s, %s%s=%.3f%s\n", + ts->ss_state & FIO_SS_ATTAINED ? "yes" : "no", + p1, p1alt, p2, + ts->ss_state & FIO_SS_IOPS ? "iops" : "bw", + ts->ss_state & FIO_SS_SLOPE ? " slope": " mean dev", + ts->ss_criterion.u.f, + ts->ss_state & FIO_SS_PCT ? "%" : ""); + + free(p1); + free(p1alt); + free(p2); +} + +static void show_agg_stats(struct disk_util_agg *agg, int terse, + struct buf_output *out) +{ + if (!agg->slavecount) + return; + + if (!terse) { + log_buf(out, ", aggrios=%llu/%llu, aggrmerge=%llu/%llu, " + "aggrticks=%llu/%llu, aggrin_queue=%llu, " + "aggrutil=%3.2f%%", + (unsigned long long) agg->ios[0] / agg->slavecount, + (unsigned long long) agg->ios[1] / agg->slavecount, + (unsigned long long) agg->merges[0] / agg->slavecount, + (unsigned long long) agg->merges[1] / agg->slavecount, + (unsigned long long) agg->ticks[0] / agg->slavecount, + (unsigned long long) agg->ticks[1] / agg->slavecount, + (unsigned long long) agg->time_in_queue / agg->slavecount, + agg->max_util.u.f); + } else { + log_buf(out, ";slaves;%llu;%llu;%llu;%llu;%llu;%llu;%llu;%3.2f%%", + (unsigned long long) agg->ios[0] / agg->slavecount, + (unsigned long long) agg->ios[1] / agg->slavecount, + (unsigned long long) agg->merges[0] / agg->slavecount, + (unsigned long long) agg->merges[1] / agg->slavecount, + (unsigned long long) agg->ticks[0] / agg->slavecount, + (unsigned long long) agg->ticks[1] / agg->slavecount, + (unsigned long long) agg->time_in_queue / agg->slavecount, + agg->max_util.u.f); + } +} + +static void aggregate_slaves_stats(struct disk_util *masterdu) +{ + struct disk_util_agg *agg = &masterdu->agg; + struct disk_util_stat *dus; + struct flist_head *entry; + struct disk_util *slavedu; + double util; + + flist_for_each(entry, &masterdu->slaves) { + slavedu = flist_entry(entry, struct disk_util, slavelist); + dus = &slavedu->dus; + agg->ios[0] += dus->s.ios[0]; + agg->ios[1] += dus->s.ios[1]; + agg->merges[0] += dus->s.merges[0]; + agg->merges[1] += dus->s.merges[1]; + agg->sectors[0] += dus->s.sectors[0]; + agg->sectors[1] += dus->s.sectors[1]; + agg->ticks[0] += dus->s.ticks[0]; + agg->ticks[1] += dus->s.ticks[1]; + agg->time_in_queue += dus->s.time_in_queue; + agg->slavecount++; + + util = (double) (100 * dus->s.io_ticks / (double) slavedu->dus.s.msec); + /* System utilization is the utilization of the + * component with the highest utilization. 
+ */ + if (util > agg->max_util.u.f) + agg->max_util.u.f = util; + + } + + if (agg->max_util.u.f > 100.0) + agg->max_util.u.f = 100.0; +} + +void print_disk_util(struct disk_util_stat *dus, struct disk_util_agg *agg, + int terse, struct buf_output *out) +{ + double util = 0; + + if (dus->s.msec) + util = (double) 100 * dus->s.io_ticks / (double) dus->s.msec; + if (util > 100.0) + util = 100.0; + + if (!terse) { + if (agg->slavecount) + log_buf(out, " "); + + log_buf(out, " %s: ios=%llu/%llu, merge=%llu/%llu, " + "ticks=%llu/%llu, in_queue=%llu, util=%3.2f%%", + dus->name, + (unsigned long long) dus->s.ios[0], + (unsigned long long) dus->s.ios[1], + (unsigned long long) dus->s.merges[0], + (unsigned long long) dus->s.merges[1], + (unsigned long long) dus->s.ticks[0], + (unsigned long long) dus->s.ticks[1], + (unsigned long long) dus->s.time_in_queue, + util); + } else { + log_buf(out, ";%s;%llu;%llu;%llu;%llu;%llu;%llu;%llu;%3.2f%%", + dus->name, + (unsigned long long) dus->s.ios[0], + (unsigned long long) dus->s.ios[1], + (unsigned long long) dus->s.merges[0], + (unsigned long long) dus->s.merges[1], + (unsigned long long) dus->s.ticks[0], + (unsigned long long) dus->s.ticks[1], + (unsigned long long) dus->s.time_in_queue, + util); + } + + /* + * If the device has slaves, aggregate the stats for + * those slave devices also. + */ + show_agg_stats(agg, terse, out); + + if (!terse) + log_buf(out, "\n"); +} + +void json_array_add_disk_util(struct disk_util_stat *dus, + struct disk_util_agg *agg, struct json_array *array) +{ + struct json_object *obj; + double util = 0; + + if (dus->s.msec) + util = (double) 100 * dus->s.io_ticks / (double) dus->s.msec; + if (util > 100.0) + util = 100.0; + + obj = json_create_object(); + json_array_add_value_object(array, obj); + + json_object_add_value_string(obj, "name", (const char *)dus->name); + json_object_add_value_int(obj, "read_ios", dus->s.ios[0]); + json_object_add_value_int(obj, "write_ios", dus->s.ios[1]); + json_object_add_value_int(obj, "read_merges", dus->s.merges[0]); + json_object_add_value_int(obj, "write_merges", dus->s.merges[1]); + json_object_add_value_int(obj, "read_ticks", dus->s.ticks[0]); + json_object_add_value_int(obj, "write_ticks", dus->s.ticks[1]); + json_object_add_value_int(obj, "in_queue", dus->s.time_in_queue); + json_object_add_value_float(obj, "util", util); + + /* + * If the device has slaves, aggregate the stats for + * those slave devices also. 
+ */ + if (!agg->slavecount) + return; + json_object_add_value_int(obj, "aggr_read_ios", + agg->ios[0] / agg->slavecount); + json_object_add_value_int(obj, "aggr_write_ios", + agg->ios[1] / agg->slavecount); + json_object_add_value_int(obj, "aggr_read_merges", + agg->merges[0] / agg->slavecount); + json_object_add_value_int(obj, "aggr_write_merge", + agg->merges[1] / agg->slavecount); + json_object_add_value_int(obj, "aggr_read_ticks", + agg->ticks[0] / agg->slavecount); + json_object_add_value_int(obj, "aggr_write_ticks", + agg->ticks[1] / agg->slavecount); + json_object_add_value_int(obj, "aggr_in_queue", + agg->time_in_queue / agg->slavecount); + json_object_add_value_float(obj, "aggr_util", agg->max_util.u.f); +} + +static void json_object_add_disk_utils(struct json_object *obj, + struct flist_head *head) +{ + struct json_array *array = json_create_array(); + struct flist_head *entry; + struct disk_util *du; + + json_object_add_value_array(obj, "disk_util", array); + + flist_for_each(entry, head) { + du = flist_entry(entry, struct disk_util, list); + + aggregate_slaves_stats(du); + json_array_add_disk_util(&du->dus, &du->agg, array); + } +} + +void show_disk_util(int terse, struct json_object *parent, + struct buf_output *out) +{ + struct flist_head *entry; + struct disk_util *du; + bool do_json; + + if (!is_running_backend()) + return; + + if (flist_empty(&disk_list)) { + return; + } + + if ((output_format & FIO_OUTPUT_JSON) && parent) + do_json = true; + else + do_json = false; + + if (!terse && !do_json) + log_buf(out, "\nDisk stats (read/write):\n"); + + if (do_json) + json_object_add_disk_utils(parent, &disk_list); + else if (output_format & ~(FIO_OUTPUT_JSON | FIO_OUTPUT_JSON_PLUS)) { + flist_for_each(entry, &disk_list) { + du = flist_entry(entry, struct disk_util, list); + + aggregate_slaves_stats(du); + print_disk_util(&du->dus, &du->agg, terse, out); + } + } +} + +static void show_thread_status_normal(struct thread_stat *ts, + struct group_run_stats *rs, + struct buf_output *out) +{ + double usr_cpu, sys_cpu; + unsigned long runtime; + double io_u_dist[FIO_IO_U_MAP_NR]; + time_t time_p; + char time_buf[32]; + + if (!ddir_rw_sum(ts->io_bytes) && !ddir_rw_sum(ts->total_io_u)) + return; + + memset(time_buf, 0, sizeof(time_buf)); + + time(&time_p); + os_ctime_r((const time_t *) &time_p, time_buf, sizeof(time_buf)); + + if (!ts->error) { + log_buf(out, "%s: (groupid=%d, jobs=%d): err=%2d: pid=%d: %s", + ts->name, ts->groupid, ts->members, + ts->error, (int) ts->pid, time_buf); + } else { + log_buf(out, "%s: (groupid=%d, jobs=%d): err=%2d (%s): pid=%d: %s", + ts->name, ts->groupid, ts->members, + ts->error, ts->verror, (int) ts->pid, + time_buf); + } + + if (strlen(ts->description)) + log_buf(out, " Description : [%s]\n", ts->description); + + if (ts->io_bytes[DDIR_READ]) + show_ddir_status(rs, ts, DDIR_READ, out); + if (ts->io_bytes[DDIR_WRITE]) + show_ddir_status(rs, ts, DDIR_WRITE, out); + if (ts->io_bytes[DDIR_TRIM]) + show_ddir_status(rs, ts, DDIR_TRIM, out); + + show_latencies(ts, out); + + if (ts->sync_stat.samples) + show_ddir_status(rs, ts, DDIR_SYNC, out); + + runtime = ts->total_run_time; + if (runtime) { + double runt = (double) runtime; + + usr_cpu = (double) ts->usr_time * 100 / runt; + sys_cpu = (double) ts->sys_time * 100 / runt; + } else { + usr_cpu = 0; + sys_cpu = 0; + } + + log_buf(out, " cpu : usr=%3.2f%%, sys=%3.2f%%, ctx=%llu," + " majf=%llu, minf=%llu\n", usr_cpu, sys_cpu, + (unsigned long long) ts->ctx, + (unsigned long long) ts->majf, + (unsigned long long) 
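+		/*
+		 * ctx, majf and minf are context switches and major/minor
+		 * page faults, accumulated from the getrusage() deltas taken
+		 * in update_rusage_stat().
+		 */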
ts->minf); + + stat_calc_dist(ts->io_u_map, ddir_rw_sum(ts->total_io_u), io_u_dist); + log_buf(out, " IO depths : 1=%3.1f%%, 2=%3.1f%%, 4=%3.1f%%, 8=%3.1f%%," + " 16=%3.1f%%, 32=%3.1f%%, >=64=%3.1f%%\n", io_u_dist[0], + io_u_dist[1], io_u_dist[2], + io_u_dist[3], io_u_dist[4], + io_u_dist[5], io_u_dist[6]); + + stat_calc_dist(ts->io_u_submit, ts->total_submit, io_u_dist); + log_buf(out, " submit : 0=%3.1f%%, 4=%3.1f%%, 8=%3.1f%%, 16=%3.1f%%," + " 32=%3.1f%%, 64=%3.1f%%, >=64=%3.1f%%\n", io_u_dist[0], + io_u_dist[1], io_u_dist[2], + io_u_dist[3], io_u_dist[4], + io_u_dist[5], io_u_dist[6]); + stat_calc_dist(ts->io_u_complete, ts->total_complete, io_u_dist); + log_buf(out, " complete : 0=%3.1f%%, 4=%3.1f%%, 8=%3.1f%%, 16=%3.1f%%," + " 32=%3.1f%%, 64=%3.1f%%, >=64=%3.1f%%\n", io_u_dist[0], + io_u_dist[1], io_u_dist[2], + io_u_dist[3], io_u_dist[4], + io_u_dist[5], io_u_dist[6]); + log_buf(out, " issued rwts: total=%llu,%llu,%llu,%llu" + " short=%llu,%llu,%llu,0" + " dropped=%llu,%llu,%llu,0\n", + (unsigned long long) ts->total_io_u[0], + (unsigned long long) ts->total_io_u[1], + (unsigned long long) ts->total_io_u[2], + (unsigned long long) ts->total_io_u[3], + (unsigned long long) ts->short_io_u[0], + (unsigned long long) ts->short_io_u[1], + (unsigned long long) ts->short_io_u[2], + (unsigned long long) ts->drop_io_u[0], + (unsigned long long) ts->drop_io_u[1], + (unsigned long long) ts->drop_io_u[2]); + if (ts->continue_on_error) { + log_buf(out, " errors : total=%llu, first_error=%d/<%s>\n", + (unsigned long long)ts->total_err_count, + ts->first_error, + strerror(ts->first_error)); + } + if (ts->latency_depth) { + log_buf(out, " latency : target=%llu, window=%llu, percentile=%.2f%%, depth=%u\n", + (unsigned long long)ts->latency_target, + (unsigned long long)ts->latency_window, + ts->latency_percentile.u.f, + ts->latency_depth); + } + + if (ts->nr_block_infos) + show_block_infos(ts->nr_block_infos, ts->block_infos, + ts->percentile_list, out); + + if (ts->ss_dur) + show_ss_normal(ts, out); +} + +static void show_ddir_status_terse(struct thread_stat *ts, + struct group_run_stats *rs, int ddir, + int ver, struct buf_output *out) +{ + unsigned long long min, max, minv, maxv, bw, iops; + unsigned long long *ovals = NULL; + double mean, dev; + unsigned int len; + int i, bw_stat; + + assert(ddir_rw(ddir)); + + iops = bw = 0; + if (ts->runtime[ddir]) { + uint64_t runt = ts->runtime[ddir]; + + bw = ((1000 * ts->io_bytes[ddir]) / runt) / 1024; /* KiB/s */ + iops = (1000 * (uint64_t) ts->total_io_u[ddir]) / runt; + } + + log_buf(out, ";%llu;%llu;%llu;%llu", + (unsigned long long) ts->io_bytes[ddir] >> 10, bw, iops, + (unsigned long long) ts->runtime[ddir]); + + if (calc_lat(&ts->slat_stat[ddir], &min, &max, &mean, &dev)) + log_buf(out, ";%llu;%llu;%f;%f", min/1000, max/1000, mean/1000, dev/1000); + else + log_buf(out, ";%llu;%llu;%f;%f", 0ULL, 0ULL, 0.0, 0.0); + + if (calc_lat(&ts->clat_stat[ddir], &min, &max, &mean, &dev)) + log_buf(out, ";%llu;%llu;%f;%f", min/1000, max/1000, mean/1000, dev/1000); + else + log_buf(out, ";%llu;%llu;%f;%f", 0ULL, 0ULL, 0.0, 0.0); + + if (ts->lat_percentiles) + len = calc_clat_percentiles(ts->io_u_plat[FIO_LAT][ddir], + ts->lat_stat[ddir].samples, + ts->percentile_list, &ovals, &maxv, + &minv); + else if (ts->clat_percentiles) + len = calc_clat_percentiles(ts->io_u_plat[FIO_CLAT][ddir], + ts->clat_stat[ddir].samples, + ts->percentile_list, &ovals, &maxv, + &minv); + else + len = 0; + + for (i = 0; i < FIO_IO_U_LIST_MAX_LEN; i++) { + if (i >= len) { + log_buf(out, 
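+			/*
+			 * Terse output always carries FIO_IO_U_LIST_MAX_LEN
+			 * percentile fields; pad unused slots with "0%=0" so
+			 * the column count stays fixed for parsers.
+			 */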
";0%%=0"); + continue; + } + log_buf(out, ";%f%%=%llu", ts->percentile_list[i].u.f, ovals[i]/1000); + } + + if (calc_lat(&ts->lat_stat[ddir], &min, &max, &mean, &dev)) + log_buf(out, ";%llu;%llu;%f;%f", min/1000, max/1000, mean/1000, dev/1000); + else + log_buf(out, ";%llu;%llu;%f;%f", 0ULL, 0ULL, 0.0, 0.0); + + free(ovals); + + bw_stat = calc_lat(&ts->bw_stat[ddir], &min, &max, &mean, &dev); + if (bw_stat) { + double p_of_agg = 100.0; + + if (rs->agg[ddir]) { + p_of_agg = mean * 100 / (double) (rs->agg[ddir] / 1024); + if (p_of_agg > 100.0) + p_of_agg = 100.0; + } + + log_buf(out, ";%llu;%llu;%f%%;%f;%f", min, max, p_of_agg, mean, dev); + } else + log_buf(out, ";%llu;%llu;%f%%;%f;%f", 0ULL, 0ULL, 0.0, 0.0, 0.0); + + if (ver == 5) { + if (bw_stat) + log_buf(out, ";%" PRIu64, (&ts->bw_stat[ddir])->samples); + else + log_buf(out, ";%lu", 0UL); + + if (calc_lat(&ts->iops_stat[ddir], &min, &max, &mean, &dev)) + log_buf(out, ";%llu;%llu;%f;%f;%" PRIu64, min, max, + mean, dev, (&ts->iops_stat[ddir])->samples); + else + log_buf(out, ";%llu;%llu;%f;%f;%lu", 0ULL, 0ULL, 0.0, 0.0, 0UL); + } +} + +static struct json_object *add_ddir_lat_json(struct thread_stat *ts, uint32_t percentiles, + struct io_stat *lat_stat, uint64_t *io_u_plat) +{ + char buf[120]; + double mean, dev; + unsigned int i, len; + struct json_object *lat_object, *percentile_object, *clat_bins_object; + unsigned long long min, max, maxv, minv, *ovals = NULL; + + if (!calc_lat(lat_stat, &min, &max, &mean, &dev)) { + min = max = 0; + mean = dev = 0.0; + } + lat_object = json_create_object(); + json_object_add_value_int(lat_object, "min", min); + json_object_add_value_int(lat_object, "max", max); + json_object_add_value_float(lat_object, "mean", mean); + json_object_add_value_float(lat_object, "stddev", dev); + json_object_add_value_int(lat_object, "N", lat_stat->samples); + + if (percentiles && lat_stat->samples) { + len = calc_clat_percentiles(io_u_plat, lat_stat->samples, + ts->percentile_list, &ovals, &maxv, &minv); + + if (len > FIO_IO_U_LIST_MAX_LEN) + len = FIO_IO_U_LIST_MAX_LEN; + + percentile_object = json_create_object(); + json_object_add_value_object(lat_object, "percentile", percentile_object); + for (i = 0; i < len; i++) { + snprintf(buf, sizeof(buf), "%f", ts->percentile_list[i].u.f); + json_object_add_value_int(percentile_object, buf, ovals[i]); + } + free(ovals); + + if (output_format & FIO_OUTPUT_JSON_PLUS) { + clat_bins_object = json_create_object(); + json_object_add_value_object(lat_object, "bins", clat_bins_object); + + for(i = 0; i < FIO_IO_U_PLAT_NR; i++) + if (io_u_plat[i]) { + snprintf(buf, sizeof(buf), "%llu", plat_idx_to_val(i)); + json_object_add_value_int(clat_bins_object, buf, io_u_plat[i]); + } + } + } + + return lat_object; +} + +static void add_ddir_status_json(struct thread_stat *ts, + struct group_run_stats *rs, int ddir, struct json_object *parent) +{ + unsigned long long min, max; + unsigned long long bw_bytes, bw; + double mean, dev, iops; + struct json_object *dir_object, *tmp_object; + double p_of_agg = 100.0; + + assert(ddir_rw(ddir) || ddir_sync(ddir)); + + if (ts->unified_rw_rep && ddir != DDIR_READ) + return; + + dir_object = json_create_object(); + json_object_add_value_object(parent, + ts->unified_rw_rep ? 
"mixed" : io_ddir_name(ddir), dir_object); + + if (ddir_rw(ddir)) { + bw_bytes = 0; + bw = 0; + iops = 0.0; + if (ts->runtime[ddir]) { + uint64_t runt = ts->runtime[ddir]; + + bw_bytes = ((1000 * ts->io_bytes[ddir]) / runt); /* Bytes/s */ + bw = bw_bytes / 1024; /* KiB/s */ + iops = (1000.0 * (uint64_t) ts->total_io_u[ddir]) / runt; + } + + json_object_add_value_int(dir_object, "io_bytes", ts->io_bytes[ddir]); + json_object_add_value_int(dir_object, "io_kbytes", ts->io_bytes[ddir] >> 10); + json_object_add_value_int(dir_object, "bw_bytes", bw_bytes); + json_object_add_value_int(dir_object, "bw", bw); + json_object_add_value_float(dir_object, "iops", iops); + json_object_add_value_int(dir_object, "runtime", ts->runtime[ddir]); + json_object_add_value_int(dir_object, "total_ios", ts->total_io_u[ddir]); + json_object_add_value_int(dir_object, "short_ios", ts->short_io_u[ddir]); + json_object_add_value_int(dir_object, "drop_ios", ts->drop_io_u[ddir]); + + tmp_object = add_ddir_lat_json(ts, ts->slat_percentiles, + &ts->slat_stat[ddir], ts->io_u_plat[FIO_SLAT][ddir]); + json_object_add_value_object(dir_object, "slat_ns", tmp_object); + + tmp_object = add_ddir_lat_json(ts, ts->clat_percentiles, + &ts->clat_stat[ddir], ts->io_u_plat[FIO_CLAT][ddir]); + json_object_add_value_object(dir_object, "clat_ns", tmp_object); + + tmp_object = add_ddir_lat_json(ts, ts->lat_percentiles, + &ts->lat_stat[ddir], ts->io_u_plat[FIO_LAT][ddir]); + json_object_add_value_object(dir_object, "lat_ns", tmp_object); + } else { + json_object_add_value_int(dir_object, "total_ios", ts->total_io_u[DDIR_SYNC]); + tmp_object = add_ddir_lat_json(ts, ts->lat_percentiles | ts->clat_percentiles, + &ts->sync_stat, ts->io_u_sync_plat); + json_object_add_value_object(dir_object, "lat_ns", tmp_object); + } + + if (!ddir_rw(ddir)) + return; + + /* Only print PRIO latencies if some high priority samples were gathered */ + if (ts->clat_high_prio_stat[ddir].samples > 0) { + const char *high, *low; + + if (ts->lat_percentiles) { + high = "lat_high_prio"; + low = "lat_low_prio"; + } else { + high = "clat_high_prio"; + low = "clat_low_prio"; + } + + tmp_object = add_ddir_lat_json(ts, ts->clat_percentiles | ts->lat_percentiles, + &ts->clat_high_prio_stat[ddir], ts->io_u_plat_high_prio[ddir]); + json_object_add_value_object(dir_object, high, tmp_object); + + tmp_object = add_ddir_lat_json(ts, ts->clat_percentiles | ts->lat_percentiles, + &ts->clat_low_prio_stat[ddir], ts->io_u_plat_low_prio[ddir]); + json_object_add_value_object(dir_object, low, tmp_object); + } + + if (calc_lat(&ts->bw_stat[ddir], &min, &max, &mean, &dev)) { + if (rs->agg[ddir]) { + p_of_agg = mean * 100 / (double) (rs->agg[ddir] / 1024); + if (p_of_agg > 100.0) + p_of_agg = 100.0; + } + } else { + min = max = 0; + p_of_agg = mean = dev = 0.0; + } + + json_object_add_value_int(dir_object, "bw_min", min); + json_object_add_value_int(dir_object, "bw_max", max); + json_object_add_value_float(dir_object, "bw_agg", p_of_agg); + json_object_add_value_float(dir_object, "bw_mean", mean); + json_object_add_value_float(dir_object, "bw_dev", dev); + json_object_add_value_int(dir_object, "bw_samples", + (&ts->bw_stat[ddir])->samples); + + if (!calc_lat(&ts->iops_stat[ddir], &min, &max, &mean, &dev)) { + min = max = 0; + mean = dev = 0.0; + } + json_object_add_value_int(dir_object, "iops_min", min); + json_object_add_value_int(dir_object, "iops_max", max); + json_object_add_value_float(dir_object, "iops_mean", mean); + json_object_add_value_float(dir_object, "iops_stddev", dev); + 
json_object_add_value_int(dir_object, "iops_samples", + (&ts->iops_stat[ddir])->samples); + + if (ts->cachehit + ts->cachemiss) { + uint64_t total; + double hit; + + total = ts->cachehit + ts->cachemiss; + hit = (double) ts->cachehit / (double) total; + hit *= 100.0; + json_object_add_value_float(dir_object, "cachehit", hit); + } +} + +static void show_thread_status_terse_all(struct thread_stat *ts, + struct group_run_stats *rs, int ver, + struct buf_output *out) +{ + double io_u_dist[FIO_IO_U_MAP_NR]; + double io_u_lat_u[FIO_IO_U_LAT_U_NR]; + double io_u_lat_m[FIO_IO_U_LAT_M_NR]; + double usr_cpu, sys_cpu; + int i; + + /* General Info */ + if (ver == 2) + log_buf(out, "2;%s;%d;%d", ts->name, ts->groupid, ts->error); + else + log_buf(out, "%d;%s;%s;%d;%d", ver, fio_version_string, + ts->name, ts->groupid, ts->error); + + /* Log Read Status */ + show_ddir_status_terse(ts, rs, DDIR_READ, ver, out); + /* Log Write Status */ + show_ddir_status_terse(ts, rs, DDIR_WRITE, ver, out); + /* Log Trim Status */ + if (ver == 2 || ver == 4 || ver == 5) + show_ddir_status_terse(ts, rs, DDIR_TRIM, ver, out); + + /* CPU Usage */ + if (ts->total_run_time) { + double runt = (double) ts->total_run_time; + + usr_cpu = (double) ts->usr_time * 100 / runt; + sys_cpu = (double) ts->sys_time * 100 / runt; + } else { + usr_cpu = 0; + sys_cpu = 0; + } + + log_buf(out, ";%f%%;%f%%;%llu;%llu;%llu", usr_cpu, sys_cpu, + (unsigned long long) ts->ctx, + (unsigned long long) ts->majf, + (unsigned long long) ts->minf); + + /* Calc % distribution of IO depths, usecond, msecond latency */ + stat_calc_dist(ts->io_u_map, ddir_rw_sum(ts->total_io_u), io_u_dist); + stat_calc_lat_nu(ts, io_u_lat_u); + stat_calc_lat_m(ts, io_u_lat_m); + + /* Only show fixed 7 I/O depth levels*/ + log_buf(out, ";%3.1f%%;%3.1f%%;%3.1f%%;%3.1f%%;%3.1f%%;%3.1f%%;%3.1f%%", + io_u_dist[0], io_u_dist[1], io_u_dist[2], io_u_dist[3], + io_u_dist[4], io_u_dist[5], io_u_dist[6]); + + /* Microsecond latency */ + for (i = 0; i < FIO_IO_U_LAT_U_NR; i++) + log_buf(out, ";%3.2f%%", io_u_lat_u[i]); + /* Millisecond latency */ + for (i = 0; i < FIO_IO_U_LAT_M_NR; i++) + log_buf(out, ";%3.2f%%", io_u_lat_m[i]); + + /* disk util stats, if any */ + if (ver >= 3 && is_running_backend()) + show_disk_util(1, NULL, out); + + /* Additional output if continue_on_error set - default off*/ + if (ts->continue_on_error) + log_buf(out, ";%llu;%d", (unsigned long long) ts->total_err_count, ts->first_error); + + /* Additional output if description is set */ + if (strlen(ts->description)) { + if (ver == 2) + log_buf(out, "\n"); + log_buf(out, ";%s", ts->description); + } + + log_buf(out, "\n"); +} + +static void json_add_job_opts(struct json_object *root, const char *name, + struct flist_head *opt_list) +{ + struct json_object *dir_object; + struct flist_head *entry; + struct print_option *p; + + if (flist_empty(opt_list)) + return; + + dir_object = json_create_object(); + json_object_add_value_object(root, name, dir_object); + + flist_for_each(entry, opt_list) { + const char *pos = ""; + + p = flist_entry(entry, struct print_option, list); + if (p->value) + pos = p->value; + json_object_add_value_string(dir_object, p->name, pos); + } +} + +static struct json_object *show_thread_status_json(struct thread_stat *ts, + struct group_run_stats *rs, + struct flist_head *opt_list) +{ + struct json_object *root, *tmp; + struct jobs_eta *je; + double io_u_dist[FIO_IO_U_MAP_NR]; + double io_u_lat_n[FIO_IO_U_LAT_N_NR]; + double io_u_lat_u[FIO_IO_U_LAT_U_NR]; + double 
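+	/* per-bucket distributions, in percent, filled by stat_calc_dist()/stat_calc_lat_*() */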
io_u_lat_m[FIO_IO_U_LAT_M_NR]; + double usr_cpu, sys_cpu; + int i; + size_t size; + + root = json_create_object(); + json_object_add_value_string(root, "jobname", ts->name); + json_object_add_value_int(root, "groupid", ts->groupid); + json_object_add_value_int(root, "error", ts->error); + + /* ETA Info */ + je = get_jobs_eta(true, &size); + if (je) { + json_object_add_value_int(root, "eta", je->eta_sec); + json_object_add_value_int(root, "elapsed", je->elapsed_sec); + } + + if (opt_list) + json_add_job_opts(root, "job options", opt_list); + + add_ddir_status_json(ts, rs, DDIR_READ, root); + add_ddir_status_json(ts, rs, DDIR_WRITE, root); + add_ddir_status_json(ts, rs, DDIR_TRIM, root); + add_ddir_status_json(ts, rs, DDIR_SYNC, root); + + /* CPU Usage */ + if (ts->total_run_time) { + double runt = (double) ts->total_run_time; + + usr_cpu = (double) ts->usr_time * 100 / runt; + sys_cpu = (double) ts->sys_time * 100 / runt; + } else { + usr_cpu = 0; + sys_cpu = 0; + } + json_object_add_value_int(root, "job_runtime", ts->total_run_time); + json_object_add_value_float(root, "usr_cpu", usr_cpu); + json_object_add_value_float(root, "sys_cpu", sys_cpu); + json_object_add_value_int(root, "ctx", ts->ctx); + json_object_add_value_int(root, "majf", ts->majf); + json_object_add_value_int(root, "minf", ts->minf); + + /* Calc % distribution of IO depths */ + stat_calc_dist(ts->io_u_map, ddir_rw_sum(ts->total_io_u), io_u_dist); + tmp = json_create_object(); + json_object_add_value_object(root, "iodepth_level", tmp); + /* Only show fixed 7 I/O depth levels*/ + for (i = 0; i < 7; i++) { + char name[20]; + if (i < 6) + snprintf(name, 20, "%d", 1 << i); + else + snprintf(name, 20, ">=%d", 1 << i); + json_object_add_value_float(tmp, (const char *)name, io_u_dist[i]); + } + + /* Calc % distribution of submit IO depths */ + stat_calc_dist(ts->io_u_submit, ts->total_submit, io_u_dist); + tmp = json_create_object(); + json_object_add_value_object(root, "iodepth_submit", tmp); + /* Only show fixed 7 I/O depth levels*/ + for (i = 0; i < 7; i++) { + char name[20]; + if (i == 0) + snprintf(name, 20, "0"); + else if (i < 6) + snprintf(name, 20, "%d", 1 << (i+1)); + else + snprintf(name, 20, ">=%d", 1 << i); + json_object_add_value_float(tmp, (const char *)name, io_u_dist[i]); + } + + /* Calc % distribution of completion IO depths */ + stat_calc_dist(ts->io_u_complete, ts->total_complete, io_u_dist); + tmp = json_create_object(); + json_object_add_value_object(root, "iodepth_complete", tmp); + /* Only show fixed 7 I/O depth levels*/ + for (i = 0; i < 7; i++) { + char name[20]; + if (i == 0) + snprintf(name, 20, "0"); + else if (i < 6) + snprintf(name, 20, "%d", 1 << (i+1)); + else + snprintf(name, 20, ">=%d", 1 << i); + json_object_add_value_float(tmp, (const char *)name, io_u_dist[i]); + } + + /* Calc % distribution of nsecond, usecond, msecond latency */ + stat_calc_dist(ts->io_u_map, ddir_rw_sum(ts->total_io_u), io_u_dist); + stat_calc_lat_n(ts, io_u_lat_n); + stat_calc_lat_u(ts, io_u_lat_u); + stat_calc_lat_m(ts, io_u_lat_m); + + /* Nanosecond latency */ + tmp = json_create_object(); + json_object_add_value_object(root, "latency_ns", tmp); + for (i = 0; i < FIO_IO_U_LAT_N_NR; i++) { + const char *ranges[] = { "2", "4", "10", "20", "50", "100", + "250", "500", "750", "1000", }; + json_object_add_value_float(tmp, ranges[i], io_u_lat_n[i]); + } + /* Microsecond latency */ + tmp = json_create_object(); + json_object_add_value_object(root, "latency_us", tmp); + for (i = 0; i < FIO_IO_U_LAT_U_NR; i++) { + const char 
*ranges[] = { "2", "4", "10", "20", "50", "100", + "250", "500", "750", "1000", }; + json_object_add_value_float(tmp, ranges[i], io_u_lat_u[i]); + } + /* Millisecond latency */ + tmp = json_create_object(); + json_object_add_value_object(root, "latency_ms", tmp); + for (i = 0; i < FIO_IO_U_LAT_M_NR; i++) { + const char *ranges[] = { "2", "4", "10", "20", "50", "100", + "250", "500", "750", "1000", "2000", + ">=2000", }; + json_object_add_value_float(tmp, ranges[i], io_u_lat_m[i]); + } + + /* Additional output if continue_on_error set - default off*/ + if (ts->continue_on_error) { + json_object_add_value_int(root, "total_err", ts->total_err_count); + json_object_add_value_int(root, "first_error", ts->first_error); + } + + if (ts->latency_depth) { + json_object_add_value_int(root, "latency_depth", ts->latency_depth); + json_object_add_value_int(root, "latency_target", ts->latency_target); + json_object_add_value_float(root, "latency_percentile", ts->latency_percentile.u.f); + json_object_add_value_int(root, "latency_window", ts->latency_window); + } + + /* Additional output if description is set */ + if (strlen(ts->description)) + json_object_add_value_string(root, "desc", ts->description); + + if (ts->nr_block_infos) { + /* Block error histogram and types */ + int len; + unsigned int *percentiles = NULL; + unsigned int block_state_counts[BLOCK_STATE_COUNT]; + + len = calc_block_percentiles(ts->nr_block_infos, ts->block_infos, + ts->percentile_list, + &percentiles, block_state_counts); + + if (len) { + struct json_object *block, *percentile_object, *states; + int state; + block = json_create_object(); + json_object_add_value_object(root, "block", block); + + percentile_object = json_create_object(); + json_object_add_value_object(block, "percentiles", + percentile_object); + for (i = 0; i < len; i++) { + char buf[20]; + snprintf(buf, sizeof(buf), "%f", + ts->percentile_list[i].u.f); + json_object_add_value_int(percentile_object, + buf, + percentiles[i]); + } + + states = json_create_object(); + json_object_add_value_object(block, "states", states); + for (state = 0; state < BLOCK_STATE_COUNT; state++) { + json_object_add_value_int(states, + block_state_names[state], + block_state_counts[state]); + } + free(percentiles); + } + } + + if (ts->ss_dur) { + struct json_object *data; + struct json_array *iops, *bw; + int j, k, l; + char ss_buf[64]; + + snprintf(ss_buf, sizeof(ss_buf), "%s%s:%f%s", + ts->ss_state & FIO_SS_IOPS ? "iops" : "bw", + ts->ss_state & FIO_SS_SLOPE ? "_slope" : "", + (float) ts->ss_limit.u.f, + ts->ss_state & FIO_SS_PCT ? "%" : ""); + + tmp = json_create_object(); + json_object_add_value_object(root, "steadystate", tmp); + json_object_add_value_string(tmp, "ss", ss_buf); + json_object_add_value_int(tmp, "duration", (int)ts->ss_dur); + json_object_add_value_int(tmp, "attained", (ts->ss_state & FIO_SS_ATTAINED) > 0); + + snprintf(ss_buf, sizeof(ss_buf), "%f%s", (float) ts->ss_criterion.u.f, + ts->ss_state & FIO_SS_PCT ? "%" : ""); + json_object_add_value_string(tmp, "criterion", ss_buf); + json_object_add_value_float(tmp, "max_deviation", ts->ss_deviation.u.f); + json_object_add_value_float(tmp, "slope", ts->ss_slope.u.f); + + data = json_create_object(); + json_object_add_value_object(tmp, "data", data); + bw = json_create_array(); + iops = json_create_array(); + + /* + ** if ss was attained or the buffer is not full, + ** ss->head points to the first element in the list. 
+ ** otherwise it actually points to the second element + ** in the list + */ + if ((ts->ss_state & FIO_SS_ATTAINED) || !(ts->ss_state & FIO_SS_BUFFER_FULL)) + j = ts->ss_head; + else + j = ts->ss_head == 0 ? ts->ss_dur - 1 : ts->ss_head - 1; + for (l = 0; l < ts->ss_dur; l++) { + k = (j + l) % ts->ss_dur; + json_array_add_value_int(bw, ts->ss_bw_data[k]); + json_array_add_value_int(iops, ts->ss_iops_data[k]); + } + json_object_add_value_int(data, "bw_mean", steadystate_bw_mean(ts)); + json_object_add_value_int(data, "iops_mean", steadystate_iops_mean(ts)); + json_object_add_value_array(data, "iops", iops); + json_object_add_value_array(data, "bw", bw); + } + + return root; +} + +static void show_thread_status_terse(struct thread_stat *ts, + struct group_run_stats *rs, + struct buf_output *out) +{ + if (terse_version >= 2 && terse_version <= 5) + show_thread_status_terse_all(ts, rs, terse_version, out); + else + log_err("fio: bad terse version!? %d\n", terse_version); +} + +struct json_object *show_thread_status(struct thread_stat *ts, + struct group_run_stats *rs, + struct flist_head *opt_list, + struct buf_output *out) +{ + struct json_object *ret = NULL; + + if (output_format & FIO_OUTPUT_TERSE) + show_thread_status_terse(ts, rs, out); + if (output_format & FIO_OUTPUT_JSON) + ret = show_thread_status_json(ts, rs, opt_list); + if (output_format & FIO_OUTPUT_NORMAL) + show_thread_status_normal(ts, rs, out); + + return ret; +} + +static void __sum_stat(struct io_stat *dst, struct io_stat *src, bool first) +{ + double mean, S; + + dst->min_val = min(dst->min_val, src->min_val); + dst->max_val = max(dst->max_val, src->max_val); + + /* + * Compute new mean and S after the merge + * + */ + if (first) { + mean = src->mean.u.f; + S = src->S.u.f; + } else { + double delta = src->mean.u.f - dst->mean.u.f; + + mean = ((src->mean.u.f * src->samples) + + (dst->mean.u.f * dst->samples)) / + (dst->samples + src->samples); + + S = src->S.u.f + dst->S.u.f + pow(delta, 2.0) * + (dst->samples * src->samples) / + (dst->samples + src->samples); + } + + dst->samples += src->samples; + dst->mean.u.f = mean; + dst->S.u.f = S; + +} + +/* + * We sum two kinds of stats - one that is time based, in which case we + * apply the proper summing technique, and then one that is iops/bw + * numbers. For group_reporting, we should just add those up, not make + * them the mean of everything. 
+ */ +static void sum_stat(struct io_stat *dst, struct io_stat *src, bool first, + bool pure_sum) +{ + if (src->samples == 0) + return; + + if (!pure_sum) { + __sum_stat(dst, src, first); + return; + } + + if (first) { + dst->min_val = src->min_val; + dst->max_val = src->max_val; + dst->samples = src->samples; + dst->mean.u.f = src->mean.u.f; + dst->S.u.f = src->S.u.f; + } else { + dst->min_val += src->min_val; + dst->max_val += src->max_val; + dst->samples += src->samples; + dst->mean.u.f += src->mean.u.f; + dst->S.u.f += src->S.u.f; + } +} + +void sum_group_stats(struct group_run_stats *dst, struct group_run_stats *src) +{ + int i; + + for (i = 0; i < DDIR_RWDIR_CNT; i++) { + if (dst->max_run[i] < src->max_run[i]) + dst->max_run[i] = src->max_run[i]; + if (dst->min_run[i] && dst->min_run[i] > src->min_run[i]) + dst->min_run[i] = src->min_run[i]; + if (dst->max_bw[i] < src->max_bw[i]) + dst->max_bw[i] = src->max_bw[i]; + if (dst->min_bw[i] && dst->min_bw[i] > src->min_bw[i]) + dst->min_bw[i] = src->min_bw[i]; + + dst->iobytes[i] += src->iobytes[i]; + dst->agg[i] += src->agg[i]; + } + + if (!dst->kb_base) + dst->kb_base = src->kb_base; + if (!dst->unit_base) + dst->unit_base = src->unit_base; + if (!dst->sig_figs) + dst->sig_figs = src->sig_figs; +} + +void sum_thread_stats(struct thread_stat *dst, struct thread_stat *src, + bool first) +{ + int k, l, m; + + for (l = 0; l < DDIR_RWDIR_CNT; l++) { + if (!dst->unified_rw_rep) { + sum_stat(&dst->clat_stat[l], &src->clat_stat[l], first, false); + sum_stat(&dst->clat_high_prio_stat[l], &src->clat_high_prio_stat[l], first, false); + sum_stat(&dst->clat_low_prio_stat[l], &src->clat_low_prio_stat[l], first, false); + sum_stat(&dst->slat_stat[l], &src->slat_stat[l], first, false); + sum_stat(&dst->lat_stat[l], &src->lat_stat[l], first, false); + sum_stat(&dst->bw_stat[l], &src->bw_stat[l], first, true); + sum_stat(&dst->iops_stat[l], &src->iops_stat[l], first, true); + + dst->io_bytes[l] += src->io_bytes[l]; + + if (dst->runtime[l] < src->runtime[l]) + dst->runtime[l] = src->runtime[l]; + } else { + sum_stat(&dst->clat_stat[0], &src->clat_stat[l], first, false); + sum_stat(&dst->clat_high_prio_stat[0], &src->clat_high_prio_stat[l], first, false); + sum_stat(&dst->clat_low_prio_stat[0], &src->clat_low_prio_stat[l], first, false); + sum_stat(&dst->slat_stat[0], &src->slat_stat[l], first, false); + sum_stat(&dst->lat_stat[0], &src->lat_stat[l], first, false); + sum_stat(&dst->bw_stat[0], &src->bw_stat[l], first, true); + sum_stat(&dst->iops_stat[0], &src->iops_stat[l], first, true); + + dst->io_bytes[0] += src->io_bytes[l]; + + if (dst->runtime[0] < src->runtime[l]) + dst->runtime[0] = src->runtime[l]; + + /* + * We're summing to the same destination, so override + * 'first' after the first iteration of the loop + */ + first = false; + } + } + + sum_stat(&dst->sync_stat, &src->sync_stat, first, false); + dst->usr_time += src->usr_time; + dst->sys_time += src->sys_time; + dst->ctx += src->ctx; + dst->majf += src->majf; + dst->minf += src->minf; + + for (k = 0; k < FIO_IO_U_MAP_NR; k++) { + dst->io_u_map[k] += src->io_u_map[k]; + dst->io_u_submit[k] += src->io_u_submit[k]; + dst->io_u_complete[k] += src->io_u_complete[k]; + } + + for (k = 0; k < FIO_IO_U_LAT_N_NR; k++) + dst->io_u_lat_n[k] += src->io_u_lat_n[k]; + for (k = 0; k < FIO_IO_U_LAT_U_NR; k++) + dst->io_u_lat_u[k] += src->io_u_lat_u[k]; + for (k = 0; k < FIO_IO_U_LAT_M_NR; k++) + dst->io_u_lat_m[k] += src->io_u_lat_m[k]; + + for (k = 0; k < DDIR_RWDIR_CNT; k++) { + if (!dst->unified_rw_rep) { 
+ dst->total_io_u[k] += src->total_io_u[k]; + dst->short_io_u[k] += src->short_io_u[k]; + dst->drop_io_u[k] += src->drop_io_u[k]; + } else { + dst->total_io_u[0] += src->total_io_u[k]; + dst->short_io_u[0] += src->short_io_u[k]; + dst->drop_io_u[0] += src->drop_io_u[k]; + } + } + + dst->total_io_u[DDIR_SYNC] += src->total_io_u[DDIR_SYNC]; + + for (k = 0; k < FIO_LAT_CNT; k++) + for (l = 0; l < DDIR_RWDIR_CNT; l++) + for (m = 0; m < FIO_IO_U_PLAT_NR; m++) + if (!dst->unified_rw_rep) + dst->io_u_plat[k][l][m] += src->io_u_plat[k][l][m]; + else + dst->io_u_plat[k][0][m] += src->io_u_plat[k][l][m]; + + for (k = 0; k < FIO_IO_U_PLAT_NR; k++) + dst->io_u_sync_plat[k] += src->io_u_sync_plat[k]; + + for (k = 0; k < DDIR_RWDIR_CNT; k++) { + for (m = 0; m < FIO_IO_U_PLAT_NR; m++) { + if (!dst->unified_rw_rep) { + dst->io_u_plat_high_prio[k][m] += src->io_u_plat_high_prio[k][m]; + dst->io_u_plat_low_prio[k][m] += src->io_u_plat_low_prio[k][m]; + } else { + dst->io_u_plat_high_prio[0][m] += src->io_u_plat_high_prio[k][m]; + dst->io_u_plat_low_prio[0][m] += src->io_u_plat_low_prio[k][m]; + } + + } + } + + dst->total_run_time += src->total_run_time; + dst->total_submit += src->total_submit; + dst->total_complete += src->total_complete; + dst->nr_zone_resets += src->nr_zone_resets; + dst->cachehit += src->cachehit; + dst->cachemiss += src->cachemiss; +} + +void init_group_run_stat(struct group_run_stats *gs) +{ + int i; + memset(gs, 0, sizeof(*gs)); + + for (i = 0; i < DDIR_RWDIR_CNT; i++) + gs->min_bw[i] = gs->min_run[i] = ~0UL; +} + +void init_thread_stat(struct thread_stat *ts) +{ + int j; + + memset(ts, 0, sizeof(*ts)); + + for (j = 0; j < DDIR_RWDIR_CNT; j++) { + ts->lat_stat[j].min_val = -1UL; + ts->clat_stat[j].min_val = -1UL; + ts->slat_stat[j].min_val = -1UL; + ts->bw_stat[j].min_val = -1UL; + ts->iops_stat[j].min_val = -1UL; + ts->clat_high_prio_stat[j].min_val = -1UL; + ts->clat_low_prio_stat[j].min_val = -1UL; + } + ts->sync_stat.min_val = -1UL; + ts->groupid = -1; +} + +void __show_run_stats(void) +{ + struct group_run_stats *runstats, *rs; + struct thread_data *td; + struct thread_stat *threadstats, *ts; + int i, j, k, nr_ts, last_ts, idx; + bool kb_base_warned = false; + bool unit_base_warned = false; + struct json_object *root = NULL; + struct json_array *array = NULL; + struct buf_output output[FIO_OUTPUT_NR]; + struct flist_head **opt_lists; + + runstats = malloc(sizeof(struct group_run_stats) * (groupid + 1)); + + for (i = 0; i < groupid + 1; i++) + init_group_run_stat(&runstats[i]); + + /* + * find out how many threads stats we need. if group reporting isn't + * enabled, it's one-per-td. 
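+ * with group reporting enabled, all jobs sharing a groupid are
+ * folded into a single thread_stat entry instead.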
+ */ + nr_ts = 0; + last_ts = -1; + for_each_td(td, i) { + if (!td->o.group_reporting) { + nr_ts++; + continue; + } + if (last_ts == td->groupid) + continue; + if (!td->o.stats) + continue; + + last_ts = td->groupid; + nr_ts++; + } + + threadstats = malloc(nr_ts * sizeof(struct thread_stat)); + opt_lists = malloc(nr_ts * sizeof(struct flist_head *)); + + for (i = 0; i < nr_ts; i++) { + init_thread_stat(&threadstats[i]); + opt_lists[i] = NULL; + } + + j = 0; + last_ts = -1; + idx = 0; + for_each_td(td, i) { + if (!td->o.stats) + continue; + if (idx && (!td->o.group_reporting || + (td->o.group_reporting && last_ts != td->groupid))) { + idx = 0; + j++; + } + + last_ts = td->groupid; + + ts = &threadstats[j]; + + ts->clat_percentiles = td->o.clat_percentiles; + ts->lat_percentiles = td->o.lat_percentiles; + ts->slat_percentiles = td->o.slat_percentiles; + ts->percentile_precision = td->o.percentile_precision; + memcpy(ts->percentile_list, td->o.percentile_list, sizeof(td->o.percentile_list)); + opt_lists[j] = &td->opt_list; + + idx++; + ts->members++; + + if (ts->groupid == -1) { + /* + * These are per-group shared already + */ + snprintf(ts->name, sizeof(ts->name), "%s", td->o.name); + if (td->o.description) + snprintf(ts->description, + sizeof(ts->description), "%s", + td->o.description); + else + memset(ts->description, 0, FIO_JOBDESC_SIZE); + + /* + * If multiple entries in this group, this is + * the first member. + */ + ts->thread_number = td->thread_number; + ts->groupid = td->groupid; + + /* + * first pid in group, not very useful... + */ + ts->pid = td->pid; + + ts->kb_base = td->o.kb_base; + ts->unit_base = td->o.unit_base; + ts->sig_figs = td->o.sig_figs; + ts->unified_rw_rep = td->o.unified_rw_rep; + } else if (ts->kb_base != td->o.kb_base && !kb_base_warned) { + log_info("fio: kb_base differs for jobs in group, using" + " %u as the base\n", ts->kb_base); + kb_base_warned = true; + } else if (ts->unit_base != td->o.unit_base && !unit_base_warned) { + log_info("fio: unit_base differs for jobs in group, using" + " %u as the base\n", ts->unit_base); + unit_base_warned = true; + } + + ts->continue_on_error = td->o.continue_on_error; + ts->total_err_count += td->total_err_count; + ts->first_error = td->first_error; + if (!ts->error) { + if (!td->error && td->o.continue_on_error && + td->first_error) { + ts->error = td->first_error; + snprintf(ts->verror, sizeof(ts->verror), "%s", + td->verror); + } else if (td->error) { + ts->error = td->error; + snprintf(ts->verror, sizeof(ts->verror), "%s", + td->verror); + } + } + + ts->latency_depth = td->latency_qd; + ts->latency_target = td->o.latency_target; + ts->latency_percentile = td->o.latency_percentile; + ts->latency_window = td->o.latency_window; + + ts->nr_block_infos = td->ts.nr_block_infos; + for (k = 0; k < ts->nr_block_infos; k++) + ts->block_infos[k] = td->ts.block_infos[k]; + + sum_thread_stats(ts, &td->ts, idx == 1); + + if (td->o.ss_dur) { + ts->ss_state = td->ss.state; + ts->ss_dur = td->ss.dur; + ts->ss_head = td->ss.head; + ts->ss_bw_data = td->ss.bw_data; + ts->ss_iops_data = td->ss.iops_data; + ts->ss_limit.u.f = td->ss.limit; + ts->ss_slope.u.f = td->ss.slope; + ts->ss_deviation.u.f = td->ss.deviation; + ts->ss_criterion.u.f = td->ss.criterion; + } + else + ts->ss_dur = ts->ss_state = 0; + } + + for (i = 0; i < nr_ts; i++) { + unsigned long long bw; + + ts = &threadstats[i]; + if (ts->groupid == -1) + continue; + rs = &runstats[ts->groupid]; + rs->kb_base = ts->kb_base; + rs->unit_base = ts->unit_base; + rs->sig_figs = 
ts->sig_figs; + rs->unified_rw_rep += ts->unified_rw_rep; + + for (j = 0; j < DDIR_RWDIR_CNT; j++) { + if (!ts->runtime[j]) + continue; + if (ts->runtime[j] < rs->min_run[j] || !rs->min_run[j]) + rs->min_run[j] = ts->runtime[j]; + if (ts->runtime[j] > rs->max_run[j]) + rs->max_run[j] = ts->runtime[j]; + + bw = 0; + if (ts->runtime[j]) + bw = ts->io_bytes[j] * 1000 / ts->runtime[j]; + if (bw < rs->min_bw[j]) + rs->min_bw[j] = bw; + if (bw > rs->max_bw[j]) + rs->max_bw[j] = bw; + + rs->iobytes[j] += ts->io_bytes[j]; + } + } + + for (i = 0; i < groupid + 1; i++) { + int ddir; + + rs = &runstats[i]; + + for (ddir = 0; ddir < DDIR_RWDIR_CNT; ddir++) { + if (rs->max_run[ddir]) + rs->agg[ddir] = (rs->iobytes[ddir] * 1000) / + rs->max_run[ddir]; + } + } + + for (i = 0; i < FIO_OUTPUT_NR; i++) + buf_output_init(&output[i]); + + /* + * don't overwrite last signal output + */ + if (output_format & FIO_OUTPUT_NORMAL) + log_buf(&output[__FIO_OUTPUT_NORMAL], "\n"); + if (output_format & FIO_OUTPUT_JSON) { + struct thread_data *global; + char time_buf[32]; + struct timeval now; + unsigned long long ms_since_epoch; + time_t tv_sec; + + gettimeofday(&now, NULL); + ms_since_epoch = (unsigned long long)(now.tv_sec) * 1000 + + (unsigned long long)(now.tv_usec) / 1000; + + tv_sec = now.tv_sec; + os_ctime_r(&tv_sec, time_buf, sizeof(time_buf)); + if (time_buf[strlen(time_buf) - 1] == '\n') + time_buf[strlen(time_buf) - 1] = '\0'; + + root = json_create_object(); + json_object_add_value_string(root, "fio version", fio_version_string); + json_object_add_value_int(root, "timestamp", now.tv_sec); + json_object_add_value_int(root, "timestamp_ms", ms_since_epoch); + json_object_add_value_string(root, "time", time_buf); + global = get_global_options(); + json_add_job_opts(root, "global options", &global->opt_list); + array = json_create_array(); + json_object_add_value_array(root, "jobs", array); + } + + if (is_backend) + fio_server_send_job_options(&get_global_options()->opt_list, -1U); + + for (i = 0; i < nr_ts; i++) { + ts = &threadstats[i]; + rs = &runstats[ts->groupid]; + + if (is_backend) { + fio_server_send_job_options(opt_lists[i], i); + fio_server_send_ts(ts, rs); + } else { + if (output_format & FIO_OUTPUT_TERSE) + show_thread_status_terse(ts, rs, &output[__FIO_OUTPUT_TERSE]); + if (output_format & FIO_OUTPUT_JSON) { + struct json_object *tmp = show_thread_status_json(ts, rs, opt_lists[i]); + json_array_add_value_object(array, tmp); + } + if (output_format & FIO_OUTPUT_NORMAL) + show_thread_status_normal(ts, rs, &output[__FIO_OUTPUT_NORMAL]); + } + } + if (!is_backend && (output_format & FIO_OUTPUT_JSON)) { + /* disk util stats, if any */ + show_disk_util(1, root, &output[__FIO_OUTPUT_JSON]); + + show_idle_prof_stats(FIO_OUTPUT_JSON, root, &output[__FIO_OUTPUT_JSON]); + + json_print_object(root, &output[__FIO_OUTPUT_JSON]); + log_buf(&output[__FIO_OUTPUT_JSON], "\n"); + json_free_object(root); + } + + for (i = 0; i < groupid + 1; i++) { + rs = &runstats[i]; + + rs->groupid = i; + if (is_backend) + fio_server_send_gs(rs); + else if (output_format & FIO_OUTPUT_NORMAL) + show_group_stats(rs, &output[__FIO_OUTPUT_NORMAL]); + } + + if (is_backend) + fio_server_send_du(); + else if (output_format & FIO_OUTPUT_NORMAL) { + show_disk_util(0, NULL, &output[__FIO_OUTPUT_NORMAL]); + show_idle_prof_stats(FIO_OUTPUT_NORMAL, NULL, &output[__FIO_OUTPUT_NORMAL]); + } + + for (i = 0; i < FIO_OUTPUT_NR; i++) { + struct buf_output *out = &output[i]; + + log_info_buf(out->buf, out->buflen); + buf_output_free(out); + } + + 
fio_idle_prof_cleanup(); + + log_info_flush(); + free(runstats); + free(threadstats); + free(opt_lists); +} + +void __show_running_run_stats(void) +{ + struct thread_data *td; + unsigned long long *rt; + struct timespec ts; + int i; + + fio_sem_down(stat_sem); + + rt = malloc(thread_number * sizeof(unsigned long long)); + fio_gettime(&ts, NULL); + + for_each_td(td, i) { + td->update_rusage = 1; + td->ts.io_bytes[DDIR_READ] = td->io_bytes[DDIR_READ]; + td->ts.io_bytes[DDIR_WRITE] = td->io_bytes[DDIR_WRITE]; + td->ts.io_bytes[DDIR_TRIM] = td->io_bytes[DDIR_TRIM]; + td->ts.total_run_time = mtime_since(&td->epoch, &ts); + + rt[i] = mtime_since(&td->start, &ts); + if (td_read(td) && td->ts.io_bytes[DDIR_READ]) + td->ts.runtime[DDIR_READ] += rt[i]; + if (td_write(td) && td->ts.io_bytes[DDIR_WRITE]) + td->ts.runtime[DDIR_WRITE] += rt[i]; + if (td_trim(td) && td->ts.io_bytes[DDIR_TRIM]) + td->ts.runtime[DDIR_TRIM] += rt[i]; + } + + for_each_td(td, i) { + if (td->runstate >= TD_EXITED) + continue; + if (td->rusage_sem) { + td->update_rusage = 1; + fio_sem_down(td->rusage_sem); + } + td->update_rusage = 0; + } + + __show_run_stats(); + + for_each_td(td, i) { + if (td_read(td) && td->ts.io_bytes[DDIR_READ]) + td->ts.runtime[DDIR_READ] -= rt[i]; + if (td_write(td) && td->ts.io_bytes[DDIR_WRITE]) + td->ts.runtime[DDIR_WRITE] -= rt[i]; + if (td_trim(td) && td->ts.io_bytes[DDIR_TRIM]) + td->ts.runtime[DDIR_TRIM] -= rt[i]; + } + + free(rt); + fio_sem_up(stat_sem); +} + +static bool status_interval_init; +static struct timespec status_time; +static bool status_file_disabled; + +#define FIO_STATUS_FILE "fio-dump-status" + +static int check_status_file(void) +{ + struct stat sb; + const char *temp_dir; + char fio_status_file_path[PATH_MAX]; + + if (status_file_disabled) + return 0; + + temp_dir = getenv("TMPDIR"); + if (temp_dir == NULL) { + temp_dir = getenv("TEMP"); + if (temp_dir && strlen(temp_dir) >= PATH_MAX) + temp_dir = NULL; + } + if (temp_dir == NULL) + temp_dir = "/tmp"; +#ifdef __COVERITY__ + __coverity_tainted_data_sanitize__(temp_dir); +#endif + + snprintf(fio_status_file_path, sizeof(fio_status_file_path), "%s/%s", temp_dir, FIO_STATUS_FILE); + + if (stat(fio_status_file_path, &sb)) + return 0; + + if (unlink(fio_status_file_path) < 0) { + log_err("fio: failed to unlink %s: %s\n", fio_status_file_path, + strerror(errno)); + log_err("fio: disabling status file updates\n"); + status_file_disabled = true; + } + + return 1; +} + +void check_for_running_stats(void) +{ + if (status_interval) { + if (!status_interval_init) { + fio_gettime(&status_time, NULL); + status_interval_init = true; + } else if (mtime_since_now(&status_time) >= status_interval) { + show_running_run_stats(); + fio_gettime(&status_time, NULL); + return; + } + } + if (check_status_file()) { + show_running_run_stats(); + return; + } +} + +static inline void add_stat_sample(struct io_stat *is, unsigned long long data) +{ + double val = data; + double delta; + + if (data > is->max_val) + is->max_val = data; + if (data < is->min_val) + is->min_val = data; + + delta = val - is->mean.u.f; + if (delta) { + is->mean.u.f += delta / (is->samples + 1.0); + is->S.u.f += delta * (val - is->mean.u.f); + } + + is->samples++; +} + +/* + * Return a struct io_logs, which is added to the tail of the log + * list for 'iolog'. 
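+ * Returns NULL if the list could not be grown, i.e. if either
+ * allocation below fails.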
+ */ +static struct io_logs *get_new_log(struct io_log *iolog) +{ + size_t new_size, new_samples; + struct io_logs *cur_log; + + /* + * Cap the size at MAX_LOG_ENTRIES, so we don't keep doubling + * forever + */ + if (!iolog->cur_log_max) + new_samples = DEF_LOG_ENTRIES; + else { + new_samples = iolog->cur_log_max * 2; + if (new_samples > MAX_LOG_ENTRIES) + new_samples = MAX_LOG_ENTRIES; + } + + new_size = new_samples * log_entry_sz(iolog); + + cur_log = smalloc(sizeof(*cur_log)); + if (cur_log) { + INIT_FLIST_HEAD(&cur_log->list); + cur_log->log = malloc(new_size); + if (cur_log->log) { + cur_log->nr_samples = 0; + cur_log->max_samples = new_samples; + flist_add_tail(&cur_log->list, &iolog->io_logs); + iolog->cur_log_max = new_samples; + return cur_log; + } + sfree(cur_log); + } + + return NULL; +} + +/* + * Add and return a new log chunk, or return current log if big enough + */ +static struct io_logs *regrow_log(struct io_log *iolog) +{ + struct io_logs *cur_log; + int i; + + if (!iolog || iolog->disabled) + goto disable; + + cur_log = iolog_cur_log(iolog); + if (!cur_log) { + cur_log = get_new_log(iolog); + if (!cur_log) + return NULL; + } + + if (cur_log->nr_samples < cur_log->max_samples) + return cur_log; + + /* + * No room for a new sample. If we're compressing on the fly, flush + * out the current chunk + */ + if (iolog->log_gz) { + if (iolog_cur_flush(iolog, cur_log)) { + log_err("fio: failed flushing iolog! Will stop logging.\n"); + return NULL; + } + } + + /* + * Get a new log array, and add to our list + */ + cur_log = get_new_log(iolog); + if (!cur_log) { + log_err("fio: failed extending iolog! Will stop logging.\n"); + return NULL; + } + + if (!iolog->pending || !iolog->pending->nr_samples) + return cur_log; + + /* + * Flush pending items to new log + */ + for (i = 0; i < iolog->pending->nr_samples; i++) { + struct io_sample *src, *dst; + + src = get_sample(iolog, iolog->pending, i); + dst = get_sample(iolog, cur_log, i); + memcpy(dst, src, log_entry_sz(iolog)); + } + cur_log->nr_samples = iolog->pending->nr_samples; + + iolog->pending->nr_samples = 0; + return cur_log; +disable: + if (iolog) + iolog->disabled = true; + return NULL; +} + +void regrow_logs(struct thread_data *td) +{ + regrow_log(td->slat_log); + regrow_log(td->clat_log); + regrow_log(td->clat_hist_log); + regrow_log(td->lat_log); + regrow_log(td->bw_log); + regrow_log(td->iops_log); + td->flags &= ~TD_F_REGROW_LOGS; +} + +static struct io_logs *get_cur_log(struct io_log *iolog) +{ + struct io_logs *cur_log; + + cur_log = iolog_cur_log(iolog); + if (!cur_log) { + cur_log = get_new_log(iolog); + if (!cur_log) + return NULL; + } + + if (cur_log->nr_samples < cur_log->max_samples) + return cur_log; + + /* + * Out of space. If we're in IO offload mode, or we're not doing + * per unit logging (hence logging happens outside of the IO thread + * as well), add a new log chunk inline. If we're doing inline + * submissions, flag 'td' as needing a log regrow and we'll take + * care of it on the submission side. 
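+ * Until then, new samples land in iolog->pending and are copied
+ * into the next chunk once regrow_logs() runs.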
+ */ + if ((iolog->td && iolog->td->o.io_submit_mode == IO_MODE_OFFLOAD) || + !per_unit_log(iolog)) + return regrow_log(iolog); + + if (iolog->td) + iolog->td->flags |= TD_F_REGROW_LOGS; + if (iolog->pending) + assert(iolog->pending->nr_samples < iolog->pending->max_samples); + return iolog->pending; +} + +static void __add_log_sample(struct io_log *iolog, union io_sample_data data, + enum fio_ddir ddir, unsigned long long bs, + unsigned long t, uint64_t offset, uint8_t priority_bit) +{ + struct io_logs *cur_log; + + if (iolog->disabled) + return; + if (flist_empty(&iolog->io_logs)) + iolog->avg_last[ddir] = t; + + cur_log = get_cur_log(iolog); + if (cur_log) { + struct io_sample *s; + + s = get_sample(iolog, cur_log, cur_log->nr_samples); + + s->data = data; + s->time = t + (iolog->td ? iolog->td->unix_epoch : 0); + io_sample_set_ddir(iolog, s, ddir); + s->bs = bs; + s->priority_bit = priority_bit; + + if (iolog->log_offset) { + struct io_sample_offset *so = (void *) s; + + so->offset = offset; + } + + cur_log->nr_samples++; + return; + } + + iolog->disabled = true; +} + +static inline void reset_io_stat(struct io_stat *ios) +{ + ios->min_val = -1ULL; + ios->max_val = ios->samples = 0; + ios->mean.u.f = ios->S.u.f = 0; +} + +void reset_io_stats(struct thread_data *td) +{ + struct thread_stat *ts = &td->ts; + int i, j, k; + + for (i = 0; i < DDIR_RWDIR_CNT; i++) { + reset_io_stat(&ts->clat_high_prio_stat[i]); + reset_io_stat(&ts->clat_low_prio_stat[i]); + reset_io_stat(&ts->clat_stat[i]); + reset_io_stat(&ts->slat_stat[i]); + reset_io_stat(&ts->lat_stat[i]); + reset_io_stat(&ts->bw_stat[i]); + reset_io_stat(&ts->iops_stat[i]); + + ts->io_bytes[i] = 0; + ts->runtime[i] = 0; + ts->total_io_u[i] = 0; + ts->short_io_u[i] = 0; + ts->drop_io_u[i] = 0; + + for (j = 0; j < FIO_IO_U_PLAT_NR; j++) { + ts->io_u_plat_high_prio[i][j] = 0; + ts->io_u_plat_low_prio[i][j] = 0; + if (!i) + ts->io_u_sync_plat[j] = 0; + } + } + + for (i = 0; i < FIO_LAT_CNT; i++) + for (j = 0; j < DDIR_RWDIR_CNT; j++) + for (k = 0; k < FIO_IO_U_PLAT_NR; k++) + ts->io_u_plat[i][j][k] = 0; + + ts->total_io_u[DDIR_SYNC] = 0; + + for (i = 0; i < FIO_IO_U_MAP_NR; i++) { + ts->io_u_map[i] = 0; + ts->io_u_submit[i] = 0; + ts->io_u_complete[i] = 0; + } + + for (i = 0; i < FIO_IO_U_LAT_N_NR; i++) + ts->io_u_lat_n[i] = 0; + for (i = 0; i < FIO_IO_U_LAT_U_NR; i++) + ts->io_u_lat_u[i] = 0; + for (i = 0; i < FIO_IO_U_LAT_M_NR; i++) + ts->io_u_lat_m[i] = 0; + + ts->total_submit = 0; + ts->total_complete = 0; + ts->nr_zone_resets = 0; + ts->cachehit = ts->cachemiss = 0; +} + +static void __add_stat_to_log(struct io_log *iolog, enum fio_ddir ddir, + unsigned long elapsed, bool log_max, uint8_t priority_bit) +{ + /* + * Note an entry in the log. Use the mean from the logged samples, + * making sure to properly round up. Only write a log entry if we + * had actual samples done. 
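+ * "Properly round up" simply means adding 0.50 to the floating
+ * point mean before the implicit truncation to an integer below.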
+ */ + if (iolog->avg_window[ddir].samples) { + union io_sample_data data; + + if (log_max) + data.val = iolog->avg_window[ddir].max_val; + else + data.val = iolog->avg_window[ddir].mean.u.f + 0.50; + + __add_log_sample(iolog, data, ddir, 0, elapsed, 0, priority_bit); + } + + reset_io_stat(&iolog->avg_window[ddir]); +} + +static void _add_stat_to_log(struct io_log *iolog, unsigned long elapsed, + bool log_max, uint8_t priority_bit) +{ + int ddir; + + for (ddir = 0; ddir < DDIR_RWDIR_CNT; ddir++) + __add_stat_to_log(iolog, ddir, elapsed, log_max, priority_bit); +} + +static unsigned long add_log_sample(struct thread_data *td, + struct io_log *iolog, + union io_sample_data data, + enum fio_ddir ddir, unsigned long long bs, + uint64_t offset, uint8_t priority_bit) +{ + unsigned long elapsed, this_window; + + if (!ddir_rw(ddir)) + return 0; + + elapsed = mtime_since_now(&td->epoch); + + /* + * If no time averaging, just add the log sample. + */ + if (!iolog->avg_msec) { + __add_log_sample(iolog, data, ddir, bs, elapsed, offset, priority_bit); + return 0; + } + + /* + * Add the sample. If the time period has passed, then + * add that entry to the log and clear. + */ + add_stat_sample(&iolog->avg_window[ddir], data.val); + + /* + * If period hasn't passed, adding the above sample is all we + * need to do. + */ + this_window = elapsed - iolog->avg_last[ddir]; + if (elapsed < iolog->avg_last[ddir]) + return iolog->avg_last[ddir] - elapsed; + else if (this_window < iolog->avg_msec) { + unsigned long diff = iolog->avg_msec - this_window; + + if (inline_log(iolog) || diff > LOG_MSEC_SLACK) + return diff; + } + + _add_stat_to_log(iolog, elapsed, td->o.log_max != 0, priority_bit); + + iolog->avg_last[ddir] = elapsed - (this_window - iolog->avg_msec); + return iolog->avg_msec; +} + +void finalize_logs(struct thread_data *td, bool unit_logs) +{ + unsigned long elapsed; + + elapsed = mtime_since_now(&td->epoch); + + if (td->clat_log && unit_logs) + _add_stat_to_log(td->clat_log, elapsed, td->o.log_max != 0, 0); + if (td->slat_log && unit_logs) + _add_stat_to_log(td->slat_log, elapsed, td->o.log_max != 0, 0); + if (td->lat_log && unit_logs) + _add_stat_to_log(td->lat_log, elapsed, td->o.log_max != 0, 0); + if (td->bw_log && (unit_logs == per_unit_log(td->bw_log))) + _add_stat_to_log(td->bw_log, elapsed, td->o.log_max != 0, 0); + if (td->iops_log && (unit_logs == per_unit_log(td->iops_log))) + _add_stat_to_log(td->iops_log, elapsed, td->o.log_max != 0, 0); +} + +void add_agg_sample(union io_sample_data data, enum fio_ddir ddir, unsigned long long bs, + uint8_t priority_bit) +{ + struct io_log *iolog; + + if (!ddir_rw(ddir)) + return; + + iolog = agg_io_log[ddir]; + __add_log_sample(iolog, data, ddir, bs, mtime_since_genesis(), 0, priority_bit); +} + +void add_sync_clat_sample(struct thread_stat *ts, unsigned long long nsec) +{ + unsigned int idx = plat_val_to_idx(nsec); + assert(idx < FIO_IO_U_PLAT_NR); + + ts->io_u_sync_plat[idx]++; + add_stat_sample(&ts->sync_stat, nsec); +} + +static void add_lat_percentile_sample_noprio(struct thread_stat *ts, + unsigned long long nsec, enum fio_ddir ddir, enum fio_lat lat) +{ + unsigned int idx = plat_val_to_idx(nsec); + assert(idx < FIO_IO_U_PLAT_NR); + + ts->io_u_plat[lat][ddir][idx]++; +} + +static void add_lat_percentile_sample(struct thread_stat *ts, + unsigned long long nsec, enum fio_ddir ddir, uint8_t priority_bit, + enum fio_lat lat) +{ + unsigned int idx = plat_val_to_idx(nsec); + + add_lat_percentile_sample_noprio(ts, nsec, ddir, lat); + + if (!priority_bit) 
+ ts->io_u_plat_low_prio[ddir][idx]++; + else + ts->io_u_plat_high_prio[ddir][idx]++; +} + +void add_clat_sample(struct thread_data *td, enum fio_ddir ddir, + unsigned long long nsec, unsigned long long bs, + uint64_t offset, uint8_t priority_bit) +{ + const bool needs_lock = td_async_processing(td); + unsigned long elapsed, this_window; + struct thread_stat *ts = &td->ts; + struct io_log *iolog = td->clat_hist_log; + + if (needs_lock) + __td_io_u_lock(td); + + add_stat_sample(&ts->clat_stat[ddir], nsec); + + if (!ts->lat_percentiles) { + if (priority_bit) + add_stat_sample(&ts->clat_high_prio_stat[ddir], nsec); + else + add_stat_sample(&ts->clat_low_prio_stat[ddir], nsec); + } + + if (td->clat_log) + add_log_sample(td, td->clat_log, sample_val(nsec), ddir, bs, + offset, priority_bit); + + if (ts->clat_percentiles) { + if (ts->lat_percentiles) + add_lat_percentile_sample_noprio(ts, nsec, ddir, FIO_CLAT); + else + add_lat_percentile_sample(ts, nsec, ddir, priority_bit, FIO_CLAT); + } + + if (iolog && iolog->hist_msec) { + struct io_hist *hw = &iolog->hist_window[ddir]; + + hw->samples++; + elapsed = mtime_since_now(&td->epoch); + if (!hw->hist_last) + hw->hist_last = elapsed; + this_window = elapsed - hw->hist_last; + + if (this_window >= iolog->hist_msec) { + uint64_t *io_u_plat; + struct io_u_plat_entry *dst; + + /* + * Make a byte-for-byte copy of the latency histogram + * stored in td->ts.io_u_plat[ddir], recording it in a + * log sample. Note that the matching call to free() is + * located in iolog.c after printing this sample to the + * log file. + */ + io_u_plat = (uint64_t *) td->ts.io_u_plat[FIO_CLAT][ddir]; + dst = malloc(sizeof(struct io_u_plat_entry)); + memcpy(&(dst->io_u_plat), io_u_plat, + FIO_IO_U_PLAT_NR * sizeof(uint64_t)); + flist_add(&dst->list, &hw->list); + __add_log_sample(iolog, sample_plat(dst), ddir, bs, + elapsed, offset, priority_bit); + + /* + * Update the last time we recorded as being now, minus + * any drift in time we encountered before actually + * making the record. 
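+ * E.g. if we were 30 msec late on a 1000 msec window, the next
+ * window is due 970 msec from now rather than a full 1000.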
+ */ + hw->hist_last = elapsed - (this_window - iolog->hist_msec); + hw->samples = 0; + } + } + + if (needs_lock) + __td_io_u_unlock(td); +} + +void add_slat_sample(struct thread_data *td, enum fio_ddir ddir, + unsigned long long nsec, unsigned long long bs, uint64_t offset, + uint8_t priority_bit) +{ + const bool needs_lock = td_async_processing(td); + struct thread_stat *ts = &td->ts; + + if (!ddir_rw(ddir)) + return; + + if (needs_lock) + __td_io_u_lock(td); + + add_stat_sample(&ts->slat_stat[ddir], nsec); + + if (td->slat_log) + add_log_sample(td, td->slat_log, sample_val(nsec), ddir, bs, offset, + priority_bit); + + if (ts->slat_percentiles) + add_lat_percentile_sample_noprio(ts, nsec, ddir, FIO_SLAT); + + if (needs_lock) + __td_io_u_unlock(td); +} + +void add_lat_sample(struct thread_data *td, enum fio_ddir ddir, + unsigned long long nsec, unsigned long long bs, + uint64_t offset, uint8_t priority_bit) +{ + const bool needs_lock = td_async_processing(td); + struct thread_stat *ts = &td->ts; + + if (!ddir_rw(ddir)) + return; + + if (needs_lock) + __td_io_u_lock(td); + + add_stat_sample(&ts->lat_stat[ddir], nsec); + + if (td->lat_log) + add_log_sample(td, td->lat_log, sample_val(nsec), ddir, bs, + offset, priority_bit); + + if (ts->lat_percentiles) { + add_lat_percentile_sample(ts, nsec, ddir, priority_bit, FIO_LAT); + if (priority_bit) + add_stat_sample(&ts->clat_high_prio_stat[ddir], nsec); + else + add_stat_sample(&ts->clat_low_prio_stat[ddir], nsec); + + } + if (needs_lock) + __td_io_u_unlock(td); +} + +void add_bw_sample(struct thread_data *td, struct io_u *io_u, + unsigned int bytes, unsigned long long spent) +{ + const bool needs_lock = td_async_processing(td); + struct thread_stat *ts = &td->ts; + unsigned long rate; + + if (spent) + rate = (unsigned long) (bytes * 1000000ULL / spent); + else + rate = 0; + + if (needs_lock) + __td_io_u_lock(td); + + add_stat_sample(&ts->bw_stat[io_u->ddir], rate); + + if (td->bw_log) + add_log_sample(td, td->bw_log, sample_val(rate), io_u->ddir, + bytes, io_u->offset, io_u_is_prio(io_u)); + + td->stat_io_bytes[io_u->ddir] = td->this_io_bytes[io_u->ddir]; + + if (needs_lock) + __td_io_u_unlock(td); +} + +static int __add_samples(struct thread_data *td, struct timespec *parent_tv, + struct timespec *t, unsigned int avg_time, + uint64_t *this_io_bytes, uint64_t *stat_io_bytes, + struct io_stat *stat, struct io_log *log, + bool is_kb) +{ + const bool needs_lock = td_async_processing(td); + unsigned long spent, rate; + enum fio_ddir ddir; + unsigned long next, next_log; + + next_log = avg_time; + + spent = mtime_since(parent_tv, t); + if (spent < avg_time && avg_time - spent >= LOG_MSEC_SLACK) + return avg_time - spent; + + if (needs_lock) + __td_io_u_lock(td); + + /* + * Compute both read and write rates for the interval. 
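+ * The same helper serves both bandwidth (scaled to KiB/s when
+ * is_kb is set) and IOPS averaging, depending on the counters
+ * that were passed in.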
+ */ + for (ddir = 0; ddir < DDIR_RWDIR_CNT; ddir++) { + uint64_t delta; + + delta = this_io_bytes[ddir] - stat_io_bytes[ddir]; + if (!delta) + continue; /* No entries for interval */ + + if (spent) { + if (is_kb) + rate = delta * 1000 / spent / 1024; /* KiB/s */ + else + rate = (delta * 1000) / spent; + } else + rate = 0; + + add_stat_sample(&stat[ddir], rate); + + if (log) { + unsigned long long bs = 0; + + if (td->o.min_bs[ddir] == td->o.max_bs[ddir]) + bs = td->o.min_bs[ddir]; + + next = add_log_sample(td, log, sample_val(rate), ddir, bs, 0, 0); + next_log = min(next_log, next); + } + + stat_io_bytes[ddir] = this_io_bytes[ddir]; + } + + *parent_tv = *t; + + if (needs_lock) + __td_io_u_unlock(td); + + if (spent <= avg_time) + next = avg_time; + else + next = avg_time - (1 + spent - avg_time); + + return min(next, next_log); +} + +static int add_bw_samples(struct thread_data *td, struct timespec *t) +{ + return __add_samples(td, &td->bw_sample_time, t, td->o.bw_avg_time, + td->this_io_bytes, td->stat_io_bytes, + td->ts.bw_stat, td->bw_log, true); +} + +void add_iops_sample(struct thread_data *td, struct io_u *io_u, + unsigned int bytes) +{ + const bool needs_lock = td_async_processing(td); + struct thread_stat *ts = &td->ts; + + if (needs_lock) + __td_io_u_lock(td); + + add_stat_sample(&ts->iops_stat[io_u->ddir], 1); + + if (td->iops_log) + add_log_sample(td, td->iops_log, sample_val(1), io_u->ddir, + bytes, io_u->offset, io_u_is_prio(io_u)); + + td->stat_io_blocks[io_u->ddir] = td->this_io_blocks[io_u->ddir]; + + if (needs_lock) + __td_io_u_unlock(td); +} + +static int add_iops_samples(struct thread_data *td, struct timespec *t) +{ + return __add_samples(td, &td->iops_sample_time, t, td->o.iops_avg_time, + td->this_io_blocks, td->stat_io_blocks, + td->ts.iops_stat, td->iops_log, false); +} + +/* + * Returns msecs to next event + */ +int calc_log_samples(void) +{ + struct thread_data *td; + unsigned int next = ~0U, tmp; + struct timespec now; + int i; + + fio_gettime(&now, NULL); + + for_each_td(td, i) { + if (!td->o.stats) + continue; + if (in_ramp_time(td) || + !(td->runstate == TD_RUNNING || td->runstate == TD_VERIFYING)) { + next = min(td->o.iops_avg_time, td->o.bw_avg_time); + continue; + } + if (!td->bw_log || + (td->bw_log && !per_unit_log(td->bw_log))) { + tmp = add_bw_samples(td, &now); + if (tmp < next) + next = tmp; + } + if (!td->iops_log || + (td->iops_log && !per_unit_log(td->iops_log))) { + tmp = add_iops_samples(td, &now); + if (tmp < next) + next = tmp; + } + } + + return next == ~0U ? 0 : next; +} + +void stat_init(void) +{ + stat_sem = fio_sem_init(FIO_SEM_UNLOCKED); +} + +void stat_exit(void) +{ + /* + * When we have the mutex, we know out-of-band access to it + * have ended. + */ + fio_sem_down(stat_sem); + fio_sem_remove(stat_sem); +} + +/* + * Called from signal handler. Wake up status thread. + */ +void show_running_run_stats(void) +{ + helper_do_stat(); +} + +uint32_t *io_u_block_info(struct thread_data *td, struct io_u *io_u) +{ + /* Ignore io_u's which span multiple blocks--they will just get + * inaccurate counts. 
*/
+ int idx = (io_u->offset - io_u->file->file_offset)
+ / td->o.bs[DDIR_TRIM];
+ uint32_t *info = &td->ts.block_infos[idx];
+ assert(idx < td->ts.nr_block_infos);
+ return info;
+}
diff --git a/stat.h b/stat.h
new file mode 100644
index 0000000..0d14166
--- /dev/null
+++ b/stat.h
@@ -0,0 +1,401 @@
+#ifndef FIO_STAT_H
+#define FIO_STAT_H
+
+#include "iolog.h"
+#include "lib/output_buffer.h"
+#include "diskutil.h"
+#include "json.h"
+
+struct group_run_stats {
+ uint64_t max_run[DDIR_RWDIR_CNT], min_run[DDIR_RWDIR_CNT];
+ uint64_t max_bw[DDIR_RWDIR_CNT], min_bw[DDIR_RWDIR_CNT];
+ uint64_t iobytes[DDIR_RWDIR_CNT];
+ uint64_t agg[DDIR_RWDIR_CNT];
+ uint32_t kb_base;
+ uint32_t unit_base;
+ uint32_t sig_figs;
+ uint32_t groupid;
+ uint32_t unified_rw_rep;
+} __attribute__((packed));
+
+/*
+ * How many depth levels to log
+ */
+#define FIO_IO_U_MAP_NR 7
+#define FIO_IO_U_LAT_N_NR 10
+#define FIO_IO_U_LAT_U_NR 10
+#define FIO_IO_U_LAT_M_NR 12
+
+/*
+ * Constants for clat percentiles
+ */
+#define FIO_IO_U_PLAT_BITS 6
+#define FIO_IO_U_PLAT_VAL (1 << FIO_IO_U_PLAT_BITS)
+#define FIO_IO_U_PLAT_GROUP_NR 29
+#define FIO_IO_U_PLAT_NR (FIO_IO_U_PLAT_GROUP_NR * FIO_IO_U_PLAT_VAL)
+#define FIO_IO_U_LIST_MAX_LEN 20 /* The size of the default and user-specified
+ list of percentiles */
+
+/*
+ * Aggregate latency samples for reporting percentile(s).
+ *
+ * EXECUTIVE SUMMARY
+ *
+ * FIO_IO_U_PLAT_BITS determines the maximum statistical error on the
+ * value of resulting percentiles. The error will be approximately
+ * 1/2^(FIO_IO_U_PLAT_BITS+1) of the value.
+ *
+ * FIO_IO_U_PLAT_GROUP_NR and FIO_IO_U_PLAT_BITS determine the maximum
+ * range being tracked for latency samples. The maximum value tracked
+ * accurately will be 2^(GROUP_NR + PLAT_BITS - 1) nanoseconds.
+ *
+ * FIO_IO_U_PLAT_GROUP_NR and FIO_IO_U_PLAT_BITS determine the memory
+ * requirement of storing those aggregate counts. The memory used will
+ * be (FIO_IO_U_PLAT_GROUP_NR * 2^FIO_IO_U_PLAT_BITS) * sizeof(int)
+ * bytes.
+ *
+ * FIO_IO_U_PLAT_NR is the total number of buckets.
+ *
+ * DETAILS
+ *
+ * Suppose the latency varies from 0 to 999 (usec). The straightforward
+ * method is to keep an array of (999 + 1) buckets, in which a counter
+ * keeps the count of samples which fall in the bucket, e.g.,
+ * {[0],[1],...,[999]}. However, this consumes a huge amount of space,
+ * and can be avoided if an approximation is acceptable.
+ *
+ * One such method is to let the range of the bucket be greater than
+ * one. This method has low accuracy when the value is small. For
+ * example, let the buckets be {[0,99],[100,199],...,[900,999]}, and
+ * the represented value of each bucket be the mean of the range. Then
+ * a value 0 has a round-off error of 49.5. To improve on this, we
+ * use buckets with non-uniform ranges, while bounding the error of
+ * each bucket within a ratio of the sample value. A simple example
+ * would be when error_bound = 0.005, buckets are {
+ * {[0],[1],...,[99]}, {[100,101],[102,103],...,[198,199]},..,
+ * {[900,909],[910,919]...} }. The total range is partitioned into
+ * groups with different ranges, then buckets with uniform ranges. An
+ * upper bound of the error is (range_of_bucket/2)/value_of_bucket.
+ *
+ * For better efficiency, we implement this using base two. We group
+ * samples by their Most Significant Bit (MSB), extract the next M bits
+ * of them as an index within the group, and discard the rest of the
+ * bits. 
+ *
+ * E.g., assume a sample 'x' whose MSB is bit n (starting from bit 0),
+ * and use M bits for indexing
+ *
+ * | n | M bits | bit (n-M-1) ... bit 0 |
+ *
+ * Because x is at least 2^n, and bit 0 to bit (n-M-1) is at most
+ * (2^(n-M) - 1), discarding bit 0 to (n-M-1) makes the round-off
+ * error
+ *
+ *        2^(n-M)-1    2^(n-M)    1
+ * e <=   ---------  <= ------- = ---
+ *           2^n          2^n     2^M
+ *
+ * Furthermore, since we use the mean of the range to represent the
+ * bucket, the error e can be lowered by half to 1 / 2^(M+1). By using
+ * M bits as the index, each group must contain 2^M buckets.
+ *
+ * E.g. Let M (FIO_IO_U_PLAT_BITS) be 6
+ *      Error bound is 1/2^(6+1) = 0.0078125 (< 1%)
+ *
+ * Group   MSB     #discarded      range of                #buckets
+ *                 error_bits      value
+ * ----------------------------------------------------------------
+ * 0*      0~5     0               [0,63]                  64
+ * 1*      6       0               [64,127]                64
+ * 2       7       1               [128,255]               64
+ * 3       8       2               [256,511]               64
+ * 4       9       3               [512,1023]              64
+ * ...     ...     ...             [...,...]               ...
+ * 28      33      27              [8589934592,+inf]**     64
+ *
+ * * Special cases: when n < (M-1) or when n == (M-1); in both cases
+ * the value cannot be rounded off. Use all bits of the sample as
+ * the index.
+ *
+ * ** If a sample's MSB is greater than 33, it will be counted as 33.
+ */
+
+/*
+ * Trim cycle count measurements
+ */
+#define MAX_NR_BLOCK_INFOS 8192
+#define BLOCK_INFO_STATE_SHIFT 29
+#define BLOCK_INFO_TRIMS(block_info) \
+ ((block_info) & ((1 << BLOCK_INFO_STATE_SHIFT) - 1))
+#define BLOCK_INFO_STATE(block_info) \
+ ((block_info) >> BLOCK_INFO_STATE_SHIFT)
+#define BLOCK_INFO(state, trim_cycles) \
+ ((trim_cycles) | ((unsigned int) (state) << BLOCK_INFO_STATE_SHIFT))
+#define BLOCK_INFO_SET_STATE(block_info, state) \
+ BLOCK_INFO(state, BLOCK_INFO_TRIMS(block_info))
+enum block_info_state {
+ BLOCK_STATE_UNINIT,
+ BLOCK_STATE_TRIMMED,
+ BLOCK_STATE_WRITTEN,
+ BLOCK_STATE_TRIM_FAILURE,
+ BLOCK_STATE_WRITE_FAILURE,
+ BLOCK_STATE_COUNT,
+};
+
+#define MAX_PATTERN_SIZE 512
+#define FIO_JOBNAME_SIZE 128
+#define FIO_JOBDESC_SIZE 256
+#define FIO_VERROR_SIZE 128
+
+enum fio_lat {
+ FIO_SLAT = 0,
+ FIO_CLAT,
+ FIO_LAT,
+
+ FIO_LAT_CNT = 3,
+};
+
+struct thread_stat {
+ char name[FIO_JOBNAME_SIZE];
+ char verror[FIO_VERROR_SIZE];
+ uint32_t error;
+ uint32_t thread_number;
+ uint32_t groupid;
+ uint32_t pid;
+ char description[FIO_JOBDESC_SIZE];
+ uint32_t members;
+ uint32_t unified_rw_rep;
+
+ /*
+ * bandwidth and latency stats
+ */
+ struct io_stat sync_stat __attribute__((aligned(8))); /* fsync etc stats */
+ struct io_stat clat_stat[DDIR_RWDIR_CNT]; /* completion latency */
+ struct io_stat slat_stat[DDIR_RWDIR_CNT]; /* submission latency */
+ struct io_stat lat_stat[DDIR_RWDIR_CNT]; /* total latency */
+ struct io_stat bw_stat[DDIR_RWDIR_CNT]; /* bandwidth stats */
+ struct io_stat iops_stat[DDIR_RWDIR_CNT]; /* IOPS stats */
+
+ /*
+ * fio system usage accounting
+ */
+ uint64_t usr_time;
+ uint64_t sys_time;
+ uint64_t ctx;
+ uint64_t minf, majf;
+
+ /*
+ * IO depth and latency stats
+ */
+ uint32_t clat_percentiles;
+ uint32_t lat_percentiles;
+ uint32_t slat_percentiles;
+ uint32_t pad;
+ uint64_t percentile_precision;
+ fio_fp64_t percentile_list[FIO_IO_U_LIST_MAX_LEN];
+
+ uint64_t io_u_map[FIO_IO_U_MAP_NR];
+ uint64_t io_u_submit[FIO_IO_U_MAP_NR];
+ uint64_t io_u_complete[FIO_IO_U_MAP_NR];
+ uint64_t io_u_lat_n[FIO_IO_U_LAT_N_NR];
+ uint64_t io_u_lat_u[FIO_IO_U_LAT_U_NR];
+ uint64_t io_u_lat_m[FIO_IO_U_LAT_M_NR];
+ uint64_t io_u_plat[FIO_LAT_CNT][DDIR_RWDIR_CNT][FIO_IO_U_PLAT_NR];
+ uint64_t io_u_sync_plat[FIO_IO_U_PLAT_NR];
+
+ uint64_t 
total_io_u[DDIR_RWDIR_SYNC_CNT];
+ uint64_t short_io_u[DDIR_RWDIR_CNT];
+ uint64_t drop_io_u[DDIR_RWDIR_CNT];
+ uint64_t total_submit;
+ uint64_t total_complete;
+
+ uint64_t io_bytes[DDIR_RWDIR_CNT];
+ uint64_t runtime[DDIR_RWDIR_CNT];
+ uint64_t total_run_time;
+
+ /*
+ * IO Error related stats
+ */
+ union {
+ uint16_t continue_on_error;
+ uint32_t pad2;
+ };
+ uint32_t first_error;
+ uint64_t total_err_count;
+
+ /* ZBD stats */
+ uint64_t nr_zone_resets;
+
+ uint64_t nr_block_infos;
+ uint32_t block_infos[MAX_NR_BLOCK_INFOS];
+
+ uint32_t kb_base;
+ uint32_t unit_base;
+
+ uint32_t latency_depth;
+ uint32_t pad3;
+ uint64_t latency_target;
+ fio_fp64_t latency_percentile;
+ uint64_t latency_window;
+
+ uint32_t sig_figs;
+
+ uint64_t ss_dur;
+ uint32_t ss_state;
+ uint32_t ss_head;
+
+ fio_fp64_t ss_limit;
+ fio_fp64_t ss_slope;
+ fio_fp64_t ss_deviation;
+ fio_fp64_t ss_criterion;
+
+ uint64_t io_u_plat_high_prio[DDIR_RWDIR_CNT][FIO_IO_U_PLAT_NR] __attribute__((aligned(8)));
+ uint64_t io_u_plat_low_prio[DDIR_RWDIR_CNT][FIO_IO_U_PLAT_NR];
+ struct io_stat clat_high_prio_stat[DDIR_RWDIR_CNT] __attribute__((aligned(8)));
+ struct io_stat clat_low_prio_stat[DDIR_RWDIR_CNT];
+
+ union {
+ uint64_t *ss_iops_data;
+ uint64_t pad4;
+ };
+
+ union {
+ uint64_t *ss_bw_data;
+ uint64_t pad5;
+ };
+
+ uint64_t cachehit;
+ uint64_t cachemiss;
+} __attribute__((packed));
+
+#define JOBS_ETA { \
+ uint32_t nr_running; \
+ uint32_t nr_ramp; \
+ \
+ uint32_t nr_pending; \
+ uint32_t nr_setting_up; \
+ \
+ uint64_t m_rate[DDIR_RWDIR_CNT]; \
+ uint64_t t_rate[DDIR_RWDIR_CNT]; \
+ uint64_t rate[DDIR_RWDIR_CNT]; \
+ uint32_t m_iops[DDIR_RWDIR_CNT]; \
+ uint32_t t_iops[DDIR_RWDIR_CNT]; \
+ uint32_t iops[DDIR_RWDIR_CNT]; \
+ uint32_t pad; \
+ uint64_t elapsed_sec; \
+ uint64_t eta_sec; \
+ uint32_t is_pow2; \
+ uint32_t unit_base; \
+ \
+ uint32_t sig_figs; \
+ \
+ uint32_t files_open; \
+ \
+ /* \
+ * Network 'copy' of run_str[] \
+ */ \
+ uint32_t nr_threads; \
+ uint32_t pad2; \
+ uint8_t run_str[]; \
+}
+
+struct jobs_eta JOBS_ETA;
+struct jobs_eta_packed JOBS_ETA __attribute__((packed));
+
+struct io_u_plat_entry {
+ struct flist_head list;
+ uint64_t io_u_plat[FIO_IO_U_PLAT_NR];
+};
+
+extern struct fio_sem *stat_sem;
+
+extern struct jobs_eta *get_jobs_eta(bool force, size_t *size);
+
+extern void stat_init(void);
+extern void stat_exit(void);
+
+extern struct json_object * show_thread_status(struct thread_stat *ts, struct group_run_stats *rs, struct flist_head *, struct buf_output *);
+extern void show_group_stats(struct group_run_stats *rs, struct buf_output *);
+extern bool calc_thread_status(struct jobs_eta *je, int force);
+extern void display_thread_status(struct jobs_eta *je);
+extern void __show_run_stats(void);
+extern void __show_running_run_stats(void);
+extern void show_running_run_stats(void);
+extern void check_for_running_stats(void);
+extern void sum_thread_stats(struct thread_stat *dst, struct thread_stat *src, bool first);
+extern void sum_group_stats(struct group_run_stats *dst, struct group_run_stats *src);
+extern void init_thread_stat(struct thread_stat *ts);
+extern void init_group_run_stat(struct group_run_stats *gs);
+extern void eta_to_str(char *str, unsigned long eta_sec);
+extern bool calc_lat(struct io_stat *is, unsigned long long *min, unsigned long long *max, double *mean, double *dev);
+extern unsigned int calc_clat_percentiles(uint64_t *io_u_plat, unsigned long long nr, fio_fp64_t *plist, unsigned long long **output, unsigned long long *maxv, unsigned long long 
*minv);
+extern void stat_calc_lat_n(struct thread_stat *ts, double *io_u_lat);
+extern void stat_calc_lat_m(struct thread_stat *ts, double *io_u_lat);
+extern void stat_calc_lat_u(struct thread_stat *ts, double *io_u_lat);
+extern void stat_calc_dist(uint64_t *map, unsigned long total, double *io_u_dist);
+extern void reset_io_stats(struct thread_data *);
+extern void update_rusage_stat(struct thread_data *);
+extern void clear_rusage_stat(struct thread_data *);
+
+extern void add_lat_sample(struct thread_data *, enum fio_ddir, unsigned long long,
+ unsigned long long, uint64_t, uint8_t);
+extern void add_clat_sample(struct thread_data *, enum fio_ddir, unsigned long long,
+ unsigned long long, uint64_t, uint8_t);
+extern void add_slat_sample(struct thread_data *, enum fio_ddir, unsigned long long,
+ unsigned long long, uint64_t, uint8_t);
+extern void add_agg_sample(union io_sample_data, enum fio_ddir, unsigned long long bs,
+ uint8_t priority_bit);
+extern void add_iops_sample(struct thread_data *, struct io_u *,
+ unsigned int);
+extern void add_bw_sample(struct thread_data *, struct io_u *,
+ unsigned int, unsigned long long);
+extern void add_sync_clat_sample(struct thread_stat *ts,
+ unsigned long long nsec);
+extern int calc_log_samples(void);
+
+extern void print_disk_util(struct disk_util_stat *, struct disk_util_agg *, int terse, struct buf_output *);
+extern void json_array_add_disk_util(struct disk_util_stat *dus,
+ struct disk_util_agg *agg, struct json_array *parent);
+
+extern struct io_log *agg_io_log[DDIR_RWDIR_CNT];
+extern bool write_bw_log;
+
+static inline bool nsec_to_usec(unsigned long long *min,
+ unsigned long long *max, double *mean,
+ double *dev)
+{
+ if (*min > 2000 && *max > 99999 && *dev > 1000.0) {
+ *min /= 1000;
+ *max /= 1000;
+ *mean /= 1000.0;
+ *dev /= 1000.0;
+ return true;
+ }
+
+ return false;
+}
+
+static inline bool nsec_to_msec(unsigned long long *min,
+ unsigned long long *max, double *mean,
+ double *dev)
+{
+ if (*min > 2000000 && *max > 99999999ULL && *dev > 1000000.0) {
+ *min /= 1000000;
+ *max /= 1000000;
+ *mean /= 1000000.0;
+ *dev /= 1000000.0;
+ return true;
+ }
+
+ return false;
+}
+
+/*
+ * Worst level condensing would be 1:5, so allow enough room for that
+ */
+#define __THREAD_RUNSTR_SZ(nr) ((nr) * 5)
+#define THREAD_RUNSTR_SZ __THREAD_RUNSTR_SZ(thread_number)
+
+uint32_t *io_u_block_info(struct thread_data *td, struct io_u *io_u);
+
+#endif
diff --git a/steadystate.c b/steadystate.c
new file mode 100644
index 0000000..bd2f70d
--- /dev/null
+++ b/steadystate.c
@@ -0,0 +1,379 @@
+#include <stdlib.h>
+
+#include "fio.h"
+#include "steadystate.h"
+
+bool steadystate_enabled = false;
+
+void steadystate_free(struct thread_data *td)
+{
+ free(td->ss.iops_data);
+ free(td->ss.bw_data);
+ td->ss.iops_data = NULL;
+ td->ss.bw_data = NULL;
+}
+
+static void steadystate_alloc(struct thread_data *td)
+{
+ td->ss.bw_data = calloc(td->ss.dur, sizeof(uint64_t));
+ td->ss.iops_data = calloc(td->ss.dur, sizeof(uint64_t));
+
+ td->ss.state |= FIO_SS_DATA;
+}
+
+void steadystate_setup(void)
+{
+ struct thread_data *td, *prev_td;
+ int i, prev_groupid;
+
+ if (!steadystate_enabled)
+ return;
+
+ /*
+ * if group reporting is enabled, identify the last td
+ * for each group and use it for storing steady state
+ * data
+ */
+ prev_groupid = -1;
+ prev_td = NULL;
+ for_each_td(td, i) {
+ if (!td->ss.dur)
+ continue;
+
+ if (!td->o.group_reporting) {
+ steadystate_alloc(td);
+ continue;
+ }
+
+ if (prev_groupid != td->groupid) {
+ if (prev_td)
+ 
steadystate_alloc(prev_td); + prev_groupid = td->groupid; + } + prev_td = td; + } + + if (prev_td && prev_td->o.group_reporting) + steadystate_alloc(prev_td); +} + +static bool steadystate_slope(uint64_t iops, uint64_t bw, + struct thread_data *td) +{ + int i, j; + double result; + struct steadystate_data *ss = &td->ss; + uint64_t new_val; + + ss->bw_data[ss->tail] = bw; + ss->iops_data[ss->tail] = iops; + + if (ss->state & FIO_SS_IOPS) + new_val = iops; + else + new_val = bw; + + if (ss->state & FIO_SS_BUFFER_FULL || ss->tail - ss->head == ss->dur - 1) { + if (!(ss->state & FIO_SS_BUFFER_FULL)) { + /* first time through */ + for(i = 0, ss->sum_y = 0; i < ss->dur; i++) { + if (ss->state & FIO_SS_IOPS) + ss->sum_y += ss->iops_data[i]; + else + ss->sum_y += ss->bw_data[i]; + j = (ss->head + i) % ss->dur; + if (ss->state & FIO_SS_IOPS) + ss->sum_xy += i * ss->iops_data[j]; + else + ss->sum_xy += i * ss->bw_data[j]; + } + ss->state |= FIO_SS_BUFFER_FULL; + } else { /* easy to update the sums */ + ss->sum_y -= ss->oldest_y; + ss->sum_y += new_val; + ss->sum_xy = ss->sum_xy - ss->sum_y + ss->dur * new_val; + } + + if (ss->state & FIO_SS_IOPS) + ss->oldest_y = ss->iops_data[ss->head]; + else + ss->oldest_y = ss->bw_data[ss->head]; + + /* + * calculate slope as (sum_xy - sum_x * sum_y / n) / (sum_(x^2) + * - (sum_x)^2 / n) This code assumes that all x values are + * equally spaced when they are often off by a few milliseconds. + * This assumption greatly simplifies the calculations. + */ + ss->slope = (ss->sum_xy - (double) ss->sum_x * ss->sum_y / ss->dur) / + (ss->sum_x_sq - (double) ss->sum_x * ss->sum_x / ss->dur); + if (ss->state & FIO_SS_PCT) + ss->criterion = 100.0 * ss->slope / (ss->sum_y / ss->dur); + else + ss->criterion = ss->slope; + + dprint(FD_STEADYSTATE, "sum_y: %llu, sum_xy: %llu, slope: %f, " + "criterion: %f, limit: %f\n", + (unsigned long long) ss->sum_y, + (unsigned long long) ss->sum_xy, + ss->slope, ss->criterion, ss->limit); + + result = ss->criterion * (ss->criterion < 0.0 ? -1.0 : 1.0); + if (result < ss->limit) + return true; + } + + ss->tail = (ss->tail + 1) % ss->dur; + if (ss->tail <= ss->head) + ss->head = (ss->head + 1) % ss->dur; + + return false; +} + +static bool steadystate_deviation(uint64_t iops, uint64_t bw, + struct thread_data *td) +{ + int i; + double diff; + double mean; + + struct steadystate_data *ss = &td->ss; + + ss->bw_data[ss->tail] = bw; + ss->iops_data[ss->tail] = iops; + + if (ss->state & FIO_SS_BUFFER_FULL || ss->tail - ss->head == ss->dur - 1) { + if (!(ss->state & FIO_SS_BUFFER_FULL)) { + /* first time through */ + for(i = 0, ss->sum_y = 0; i < ss->dur; i++) + if (ss->state & FIO_SS_IOPS) + ss->sum_y += ss->iops_data[i]; + else + ss->sum_y += ss->bw_data[i]; + ss->state |= FIO_SS_BUFFER_FULL; + } else { /* easy to update the sum */ + ss->sum_y -= ss->oldest_y; + if (ss->state & FIO_SS_IOPS) + ss->sum_y += ss->iops_data[ss->tail]; + else + ss->sum_y += ss->bw_data[ss->tail]; + } + + if (ss->state & FIO_SS_IOPS) + ss->oldest_y = ss->iops_data[ss->head]; + else + ss->oldest_y = ss->bw_data[ss->head]; + + mean = (double) ss->sum_y / ss->dur; + ss->deviation = 0.0; + + for (i = 0; i < ss->dur; i++) { + if (ss->state & FIO_SS_IOPS) + diff = ss->iops_data[i] - mean; + else + diff = ss->bw_data[i] - mean; + ss->deviation = max(ss->deviation, diff * (diff < 0.0 ? 
-1.0 : 1.0)); + } + + if (ss->state & FIO_SS_PCT) + ss->criterion = 100.0 * ss->deviation / mean; + else + ss->criterion = ss->deviation; + + dprint(FD_STEADYSTATE, "sum_y: %llu, mean: %f, max diff: %f, " + "objective: %f, limit: %f\n", + (unsigned long long) ss->sum_y, mean, + ss->deviation, ss->criterion, ss->limit); + + if (ss->criterion < ss->limit) + return true; + } + + ss->tail = (ss->tail + 1) % ss->dur; + if (ss->tail <= ss->head) + ss->head = (ss->head + 1) % ss->dur; + + return false; +} + +void steadystate_check(void) +{ + int i, j, ddir, prev_groupid, group_ramp_time_over = 0; + unsigned long rate_time; + struct thread_data *td, *td2; + struct timespec now; + uint64_t group_bw = 0, group_iops = 0; + uint64_t td_iops, td_bytes; + bool ret; + + prev_groupid = -1; + for_each_td(td, i) { + const bool needs_lock = td_async_processing(td); + struct steadystate_data *ss = &td->ss; + + if (!ss->dur || td->runstate <= TD_SETTING_UP || + td->runstate >= TD_EXITED || !ss->state || + ss->state & FIO_SS_ATTAINED) + continue; + + td_iops = 0; + td_bytes = 0; + if (!td->o.group_reporting || + (td->o.group_reporting && td->groupid != prev_groupid)) { + group_bw = 0; + group_iops = 0; + group_ramp_time_over = 0; + } + prev_groupid = td->groupid; + + fio_gettime(&now, NULL); + if (ss->ramp_time && !(ss->state & FIO_SS_RAMP_OVER)) { + /* + * Begin recording data one second after ss->ramp_time + * has elapsed + */ + if (utime_since(&td->epoch, &now) >= (ss->ramp_time + 1000000L)) + ss->state |= FIO_SS_RAMP_OVER; + } + + if (needs_lock) + __td_io_u_lock(td); + + for (ddir = 0; ddir < DDIR_RWDIR_CNT; ddir++) { + td_iops += td->io_blocks[ddir]; + td_bytes += td->io_bytes[ddir]; + } + + if (needs_lock) + __td_io_u_unlock(td); + + rate_time = mtime_since(&ss->prev_time, &now); + memcpy(&ss->prev_time, &now, sizeof(now)); + + /* + * Begin monitoring when job starts but don't actually use + * data in checking stopping criterion until ss->ramp_time is + * over. This ensures that we will have a sane value in + * prev_iops/bw the first time through after ss->ramp_time + * is done. 
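+ * prev_iops/prev_bytes are still updated below on every pass,
+ * so the first post-ramp rate covers exactly one interval.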
+ */ + if (ss->state & FIO_SS_RAMP_OVER) { + group_bw += 1000 * (td_bytes - ss->prev_bytes) / rate_time; + group_iops += 1000 * (td_iops - ss->prev_iops) / rate_time; + ++group_ramp_time_over; + } + ss->prev_iops = td_iops; + ss->prev_bytes = td_bytes; + + if (td->o.group_reporting && !(ss->state & FIO_SS_DATA)) + continue; + + /* + * Don't begin checking criterion until ss->ramp_time is over + * for at least one thread in group + */ + if (!group_ramp_time_over) + continue; + + dprint(FD_STEADYSTATE, "steadystate_check() thread: %d, " + "groupid: %u, rate_msec: %ld, " + "iops: %llu, bw: %llu, head: %d, tail: %d\n", + i, td->groupid, rate_time, + (unsigned long long) group_iops, + (unsigned long long) group_bw, + ss->head, ss->tail); + + if (ss->state & FIO_SS_SLOPE) + ret = steadystate_slope(group_iops, group_bw, td); + else + ret = steadystate_deviation(group_iops, group_bw, td); + + if (ret) { + if (td->o.group_reporting) { + for_each_td(td2, j) { + if (td2->groupid == td->groupid) { + td2->ss.state |= FIO_SS_ATTAINED; + fio_mark_td_terminate(td2); + } + } + } else { + ss->state |= FIO_SS_ATTAINED; + fio_mark_td_terminate(td); + } + } + } +} + +int td_steadystate_init(struct thread_data *td) +{ + struct steadystate_data *ss = &td->ss; + struct thread_options *o = &td->o; + struct thread_data *td2; + int j; + + memset(ss, 0, sizeof(*ss)); + + if (o->ss_dur) { + steadystate_enabled = true; + o->ss_dur /= 1000000L; + + /* put all steady state info in one place */ + ss->dur = o->ss_dur; + ss->limit = o->ss_limit.u.f; + ss->ramp_time = o->ss_ramp_time; + + ss->state = o->ss_state; + if (!td->ss.ramp_time) + ss->state |= FIO_SS_RAMP_OVER; + + ss->sum_x = o->ss_dur * (o->ss_dur - 1) / 2; + ss->sum_x_sq = (o->ss_dur - 1) * (o->ss_dur) * (2*o->ss_dur - 1) / 6; + } + + /* make sure that ss options are consistent within reporting group */ + for_each_td(td2, j) { + if (td2->groupid == td->groupid) { + struct steadystate_data *ss2 = &td2->ss; + + if (ss2->dur != ss->dur || + ss2->limit != ss->limit || + ss2->ramp_time != ss->ramp_time || + ss2->state != ss->state || + ss2->sum_x != ss->sum_x || + ss2->sum_x_sq != ss->sum_x_sq) { + td_verror(td, EINVAL, "job rejected: steadystate options must be consistent within reporting groups"); + return 1; + } + } + } + + return 0; +} + +uint64_t steadystate_bw_mean(struct thread_stat *ts) +{ + int i; + uint64_t sum; + + if (!ts->ss_dur) + return 0; + + for (i = 0, sum = 0; i < ts->ss_dur; i++) + sum += ts->ss_bw_data[i]; + + return sum / ts->ss_dur; +} + +uint64_t steadystate_iops_mean(struct thread_stat *ts) +{ + int i; + uint64_t sum; + + if (!ts->ss_dur) + return 0; + + for (i = 0, sum = 0; i < ts->ss_dur; i++) + sum += ts->ss_iops_data[i]; + + return sum / ts->ss_dur; +} diff --git a/steadystate.h b/steadystate.h new file mode 100644 index 0000000..51472c4 --- /dev/null +++ b/steadystate.h @@ -0,0 +1,69 @@ +#ifndef FIO_STEADYSTATE_H +#define FIO_STEADYSTATE_H + +#include "thread_options.h" + +extern void steadystate_free(struct thread_data *); +extern void steadystate_check(void); +extern void steadystate_setup(void); +extern int td_steadystate_init(struct thread_data *); +extern uint64_t steadystate_bw_mean(struct thread_stat *); +extern uint64_t steadystate_iops_mean(struct thread_stat *); + +extern bool steadystate_enabled; + +struct steadystate_data { + double limit; + unsigned long long dur; + unsigned long long ramp_time; + + uint32_t state; + + unsigned int head; + unsigned int tail; + uint64_t *iops_data; + uint64_t *bw_data; + + double slope; + 
	double deviation;
+	double criterion;
+
+	uint64_t sum_y;
+	uint64_t sum_x;
+	uint64_t sum_x_sq;
+	uint64_t sum_xy;
+	uint64_t oldest_y;
+
+	struct timespec prev_time;
+	uint64_t prev_iops;
+	uint64_t prev_bytes;
+};
+
+enum {
+	__FIO_SS_IOPS = 0,
+	__FIO_SS_BW,
+	__FIO_SS_SLOPE,
+	__FIO_SS_ATTAINED,
+	__FIO_SS_RAMP_OVER,
+	__FIO_SS_DATA,
+	__FIO_SS_PCT,
+	__FIO_SS_BUFFER_FULL,
+};
+
+enum {
+	FIO_SS_IOPS		= 1 << __FIO_SS_IOPS,
+	FIO_SS_BW		= 1 << __FIO_SS_BW,
+	FIO_SS_SLOPE		= 1 << __FIO_SS_SLOPE,
+	FIO_SS_ATTAINED		= 1 << __FIO_SS_ATTAINED,
+	FIO_SS_RAMP_OVER	= 1 << __FIO_SS_RAMP_OVER,
+	FIO_SS_DATA		= 1 << __FIO_SS_DATA,
+	FIO_SS_PCT		= 1 << __FIO_SS_PCT,
+	FIO_SS_BUFFER_FULL	= 1 << __FIO_SS_BUFFER_FULL,
+
+	FIO_SS_IOPS_SLOPE	= FIO_SS_IOPS | FIO_SS_SLOPE,
+	FIO_SS_BW_SLOPE		= FIO_SS_BW | FIO_SS_SLOPE,
+};
+
+#define STEADYSTATE_MSEC	1000
+
+#endif
diff --git a/t/arch.c b/t/arch.c
new file mode 100644
index 0000000..a72cef3
--- /dev/null
+++ b/t/arch.c
@@ -0,0 +1,4 @@
+#include "../arch/arch.h"
+
+unsigned long arch_flags = 0;
+int arch_random;
diff --git a/t/axmap.c b/t/axmap.c
new file mode 100644
index 0000000..9d6bdee
--- /dev/null
+++ b/t/axmap.c
@@ -0,0 +1,427 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "../lib/lfsr.h"
+#include "../lib/axmap.h"
+
+static int test_regular(uint64_t size, int seed)
+{
+	struct fio_lfsr lfsr;
+	struct axmap *map;
+	int err;
+
+	printf("Using %llu entries...", (unsigned long long) size);
+	fflush(stdout);
+
+	lfsr_init(&lfsr, size, seed, seed & 0xF);
+	map = axmap_new(size);
+	err = 0;
+
+	while (size--) {
+		uint64_t val;
+
+		if (lfsr_next(&lfsr, &val)) {
+			printf("lfsr: short loop\n");
+			err = 1;
+			break;
+		}
+		if (axmap_isset(map, val)) {
+			printf("bit already set\n");
+			err = 1;
+			break;
+		}
+		axmap_set(map, val);
+		if (!axmap_isset(map, val)) {
+			printf("bit not set\n");
+			err = 1;
+			break;
+		}
+	}
+
+	if (err)
+		return err;
+
+	printf("pass!\n");
+	axmap_free(map);
+	return 0;
+}
+
+static int check_next_free(struct axmap *map, uint64_t start, uint64_t expected)
+{
+	uint64_t ff;
+
+	ff = axmap_next_free(map, start);
+	if (ff != expected) {
+		printf("axmap_next_free broken: Expected %llu, got %llu\n",
+				(unsigned long long)expected, (unsigned long long) ff);
+		return 1;
+	}
+	return 0;
+}
+
+static int test_next_free(uint64_t size, int seed)
+{
+	struct fio_lfsr lfsr;
+	struct axmap *map;
+	uint64_t osize;
+	uint64_t ff, lastfree;
+	int err, i;
+
+	printf("Test next_free %llu entries...", (unsigned long long) size);
+	fflush(stdout);
+
+	map = axmap_new(size);
+	err = 0;
+
+	/* Empty map. Next free after 0 should be 1. */
+	if (check_next_free(map, 0, 1))
+		err = 1;
+
+	/* Empty map. Next free after 63 should be 64. */
+	if (check_next_free(map, 63, 64))
+		err = 1;
+
+	/* Empty map. Next free after size - 2 should be size - 1 */
+	if (check_next_free(map, size - 2, size - 1))
+		err = 1;
+
+	/* Empty map. Next free after size - 1 should be 0 */
+	if (check_next_free(map, size - 1, 0))
+		err = 1;
+
+	/* Empty map. Next free after 63 should be 64. */
+	if (check_next_free(map, 63, 64))
+		err = 1;
+
+	/* Bit 63 set. Next free after 62 should be 64. */
+	axmap_set(map, 63);
+	if (check_next_free(map, 62, 64))
+		err = 1;
+
+	/* Last bit set. Next free after size - 2 should be 0. */
+	axmap_set(map, size - 1);
+	if (check_next_free(map, size - 2, 0))
+		err = 1;
+
+	/* Last bit set. Next free after size - 1 should be 0. */
+	if (check_next_free(map, size - 1, 0))
+		err = 1;
+
+	/* Last 64 bits set. Next free after size - 66 or size - 65 should be 0.
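+	 * (Illustrating the wrap-around semantics these checks assume:
+	 *
+	 *	axmap_set(map, size - 1);
+	 *	axmap_next_free(map, size - 2);	- wraps past the end, returns 0
+	 *
+	 * the search starts just past the given bit and wraps to bit 0 at
+	 * the end of the map; -1ULL is only returned once no free bit is
+	 * left, which the full-map checks further down verify.)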
*/ + for (i=size - 65; i < size; i++) + axmap_set(map, i); + if (check_next_free(map, size - 66, 0)) + err = 1; + if (check_next_free(map, size - 65, 0)) + err = 1; + + /* Last 64 bits set. Next free after size - 67 should be size - 66. */ + if (check_next_free(map, size - 67, size - 66)) + err = 1; + + axmap_free(map); + + /* Start with a fresh map and mostly fill it up */ + lfsr_init(&lfsr, size, seed, seed & 0xF); + map = axmap_new(size); + osize = size; + + /* Leave 1 entry free */ + size--; + while (size--) { + uint64_t val; + + if (lfsr_next(&lfsr, &val)) { + printf("lfsr: short loop\n"); + err = 1; + break; + } + if (axmap_isset(map, val)) { + printf("bit already set\n"); + err = 1; + break; + } + axmap_set(map, val); + if (!axmap_isset(map, val)) { + printf("bit not set\n"); + err = 1; + break; + } + } + + /* Get last free bit */ + lastfree = axmap_next_free(map, 0); + if (lastfree == -1ULL) { + printf("axmap_next_free broken: Couldn't find last free bit\n"); + err = 1; + } + + /* Start with last free bit and test wrap-around */ + ff = axmap_next_free(map, lastfree); + if (ff != lastfree) { + printf("axmap_next_free broken: wrap-around test #1 failed\n"); + err = 1; + } + + /* Start with last bit and test wrap-around */ + ff = axmap_next_free(map, osize - 1); + if (ff != lastfree) { + printf("axmap_next_free broken: wrap-around test #2 failed\n"); + err = 1; + } + + /* Set last free bit */ + axmap_set(map, lastfree); + ff = axmap_next_free(map, 0); + if (ff != -1ULL) { + printf("axmap_next_free broken: Expected -1 from full map\n"); + err = 1; + } + + ff = axmap_next_free(map, osize); + if (ff != -1ULL) { + printf("axmap_next_free broken: Expected -1 from out of bounds request\n"); + err = 1; + } + + if (err) + return err; + + printf("pass!\n"); + axmap_free(map); + return 0; +} + +static int test_multi(uint64_t size, unsigned int bit_off) +{ + unsigned int map_size = size; + struct axmap *map; + uint64_t val = bit_off; + int i, err; + + printf("Test multi %llu entries %u offset...", (unsigned long long) size, bit_off); + fflush(stdout); + + map = axmap_new(map_size); + while (val + 128 <= map_size) { + err = 0; + for (i = val; i < val + 128; i++) { + if (axmap_isset(map, val + i)) { + printf("bit already set\n"); + err = 1; + break; + } + } + + if (err) + break; + + err = axmap_set_nr(map, val, 128); + if (err != 128) { + printf("only set %u bits\n", err); + break; + } + + err = 0; + for (i = 0; i < 128; i++) { + if (!axmap_isset(map, val + i)) { + printf("bit not set: %llu\n", (unsigned long long) val + i); + err = 1; + break; + } + } + + val += 128; + if (err) + break; + } + + if (!err) + printf("pass!\n"); + + axmap_free(map); + return err; +} + +struct overlap_test { + unsigned int start; + unsigned int nr; + unsigned int ret; +}; + +static int test_overlap(void) +{ + struct overlap_test tests[] = { + { + .start = 0, + .nr = 0, + .ret = 0, + }, + { + .start = 16, + .nr = 16, + .ret = 16, + }, + { + .start = 16, + .nr = 0, + .ret = 0, + }, + { + .start = 0, + .nr = 32, + .ret = 16, + }, + { + .start = 48, + .nr = 32, + .ret = 32, + }, + { + .start = 32, + .nr = 32, + .ret = 16, + }, + { + .start = 79, + .nr = 1, + .ret = 0, + }, + { + .start = 80, + .nr = 21, + .ret = 21, + }, + { + .start = 102, + .nr = 1, + .ret = 1, + }, + { + .start = 101, + .nr = 3, + .ret = 1, + }, + { + .start = 106, + .nr = 4, + .ret = 4, + }, + { + .start = 105, + .nr = 3, + .ret = 1, + }, + { + .start = 120, + .nr = 4, + .ret = 4, + }, + { + .start = 118, + .nr = 2, + .ret = 2, + }, + { + .start = 
118,
+		.nr	= 2,
+		.ret	= 0,
+	},
+	{
+		.start	= 1100,
+		.nr	= 1,
+		.ret	= 1,
+	},
+	{
+		.start	= 1000,
+		.nr	= 256,
+		.ret	= 100,
+	},
+	{
+		.start	= 22684,
+		.nr	= 1,
+		.ret	= 1,
+	},
+	{
+		.start	= 22670,
+		.nr	= 60,
+		.ret	= 14,
+	},
+	{
+		.start	= 22670,
+		.nr	= 60,
+		.ret	= 0,
+	},
+	{
+		.start	= -1U,
+	},
+	};
+	struct axmap *map;
+	int entries, i, ret, err = 0;
+
+	entries = 0;
+	for (i = 0; tests[i].start != -1U; i++) {
+		unsigned int this = tests[i].start + tests[i].nr;
+
+		if (this > entries)
+			entries = this;
+	}
+
+	printf("Test overlaps...\n");
+	fflush(stdout);
+
+	map = axmap_new(entries);
+
+	for (i = 0; tests[i].start != -1U; i++) {
+		struct overlap_test *t = &tests[i];
+
+		printf("\tstart=%6u, nr=%3u: ", t->start, t->nr);
+		ret = axmap_set_nr(map, t->start, t->nr);
+		if (ret != t->ret) {
+			printf("%3d (FAIL, wanted %d)\n", ret, t->ret);
+			err = 1;
+			break;
+		}
+		printf("%3d (PASS)\n", ret);
+	}
+
+	axmap_free(map);
+	return err;
+}
+
+int main(int argc, char *argv[])
+{
+	uint64_t size = (1ULL << 23) - 200;
+	int seed = 1;
+
+	if (argc > 1) {
+		size = strtoul(argv[1], NULL, 10);
+		if (argc > 2)
+			seed = strtoul(argv[2], NULL, 10);
+	}
+
+	if (test_regular(size, seed))
+		return 1;
+	if (test_multi(size, 0))
+		return 2;
+	if (test_multi(size, 17))
+		return 3;
+	if (test_overlap())
+		return 4;
+	if (test_next_free(size, seed))
+		return 5;
+
+	/* Test 3 levels, all full: 64*64*64 */
+	if (test_next_free(64*64*64, seed))
+		return 6;
+
+	/* Test 4 levels, with 2 inner levels not full */
+	if (test_next_free(((((64*64)-63)*64)-63)*64*12, seed))
+		return 7;
+
+	return 0;
+}
diff --git a/t/btrace2fio.c b/t/btrace2fio.c
new file mode 100644
index 0000000..a8a9d62
--- /dev/null
+++ b/t/btrace2fio.c
@@ -0,0 +1,1144 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <fcntl.h>
+#include <math.h>
+
+#include "../io_ddir.h"
+#include "../flist.h"
+#include "../hash.h"
+#include "../fifo.h"
+#include "../blktrace_api.h"
+#include "../os/os.h"
+#include "../log.h"
+#include "../minmax.h"
+#include "../oslib/linux-dev-lookup.h"
+
+#define TRACE_FIFO_SIZE	8192
+
+static unsigned int rt_threshold = 1000000;
+static unsigned int ios_threshold = 10;
+static unsigned int rate_threshold;
+static unsigned int set_rate;
+static unsigned int max_depth = 256;
+static int output_ascii = 1;
+static char *filename;
+
+static char **add_opts;
+static int n_add_opts;
+
+/*
+ * Collapse defaults
+ */
+static unsigned int collapse_entries = 0;
+static unsigned int depth_diff = 1;
+static unsigned int random_diff = 5;
+
+struct bs {
+	unsigned int bs;
+	unsigned int nr;
+	int merges;
+};
+
+struct trace_file {
+	char *name;
+	int major, minor;
+};
+
+struct btrace_out {
+	unsigned long ios[DDIR_RWDIR_CNT];
+	unsigned long merges[DDIR_RWDIR_CNT];
+
+	uint64_t last_end[DDIR_RWDIR_CNT];
+	uint64_t seq[DDIR_RWDIR_CNT];
+
+	struct bs *bs[DDIR_RWDIR_CNT];
+	unsigned int nr_bs[DDIR_RWDIR_CNT];
+
+	int inflight;
+	unsigned int depth;
+	int depth_disabled;
+	int complete_seen;
+
+	uint64_t first_ttime[DDIR_RWDIR_CNT];
+	uint64_t last_ttime[DDIR_RWDIR_CNT];
+	uint64_t kib[DDIR_RWDIR_CNT];
+
+	uint64_t start_delay;
+};
+
+struct btrace_pid {
+	struct flist_head hash_list;
+	struct flist_head pid_list;
+	pid_t pid;
+
+	pid_t *merge_pids;
+	unsigned int nr_merge_pids;
+
+	struct trace_file *files;
+	int nr_files;
+	unsigned int last_major, last_minor;
+	int numjobs;
+	int ignore;
+
+	struct btrace_out o;
+};
+
+struct inflight {
+	struct flist_head list;
+	struct btrace_pid *p;
+	uint64_t end_sector;
+};
+
+#define PID_HASH_BITS	10
+#define 
PID_HASH_SIZE (1U << PID_HASH_BITS) + +static struct flist_head pid_hash[PID_HASH_SIZE]; +static FLIST_HEAD(pid_list); + +#define INFLIGHT_HASH_BITS 8 +#define INFLIGHT_HASH_SIZE (1U << INFLIGHT_HASH_BITS) +static struct flist_head inflight_hash[INFLIGHT_HASH_SIZE]; + +static uint64_t first_ttime = -1ULL; + +static struct inflight *inflight_find(uint64_t sector) +{ + struct flist_head *inflight_list; + struct flist_head *e; + + inflight_list = &inflight_hash[hash_long(sector, INFLIGHT_HASH_BITS)]; + + flist_for_each(e, inflight_list) { + struct inflight *i = flist_entry(e, struct inflight, list); + + if (i->end_sector == sector) + return i; + } + + return NULL; +} + +static void inflight_remove(struct inflight *i) +{ + struct btrace_out *o = &i->p->o; + + o->inflight--; + assert(o->inflight >= 0); + flist_del(&i->list); + free(i); +} + +static void __inflight_add(struct inflight *i) +{ + struct flist_head *list; + + list = &inflight_hash[hash_long(i->end_sector, INFLIGHT_HASH_BITS)]; + flist_add_tail(&i->list, list); +} + +static void inflight_add(struct btrace_pid *p, uint64_t sector, uint32_t len) +{ + struct btrace_out *o = &p->o; + struct inflight *i; + + i = calloc(1, sizeof(*i)); + i->p = p; + o->inflight++; + if (!o->depth_disabled) { + o->depth = max((int) o->depth, o->inflight); + if (o->depth >= max_depth && !o->complete_seen) { + o->depth_disabled = 1; + o->depth = max_depth; + } + } + i->end_sector = sector + (len >> 9); + __inflight_add(i); +} + +static void inflight_merge(struct inflight *i, int rw, unsigned int size) +{ + i->p->o.merges[rw]++; + if (size) { + i->end_sector += (size >> 9); + flist_del(&i->list); + __inflight_add(i); + } +} + +/* + * fifo refill frontend, to avoid reading data in trace sized bites + */ +static int refill_fifo(struct fifo *fifo, int fd) +{ + char buf[TRACE_FIFO_SIZE]; + unsigned int total; + int ret; + + total = sizeof(buf); + if (total > fifo_room(fifo)) + total = fifo_room(fifo); + + ret = read(fd, buf, total); + if (ret < 0) { + perror("read refill"); + return -1; + } + + if (ret > 0) + ret = fifo_put(fifo, buf, ret); + + return ret; +} + +/* + * Retrieve 'len' bytes from the fifo, refilling if necessary. + */ +static int trace_fifo_get(struct fifo *fifo, int fd, void *buf, + unsigned int len) +{ + if (fifo_len(fifo) < len) { + int ret = refill_fifo(fifo, fd); + + if (ret < 0) + return ret; + } + + return fifo_get(fifo, buf, len); +} + +/* + * Just discard the pdu by seeking past it. 
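+ * (Sketch of the mechanism this relies on: fifo_get() with a NULL
+ * destination just advances the fifo read pointer without copying,
+ * so
+ *
+ *	trace_fifo_get(fifo, fd, NULL, t->pdu_len);
+ *
+ * steps over the variable-size payload that follows a trace header.)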
+ */ +static int discard_pdu(struct fifo *fifo, int fd, struct blk_io_trace *t) +{ + if (t->pdu_len == 0) + return 0; + + return trace_fifo_get(fifo, fd, NULL, t->pdu_len); +} + +static int handle_trace_notify(struct blk_io_trace *t) +{ + switch (t->action) { + case BLK_TN_PROCESS: + //printf("got process notify: %x, %d\n", t->action, t->pid); + break; + case BLK_TN_TIMESTAMP: + //printf("got timestamp notify: %x, %d\n", t->action, t->pid); + break; + case BLK_TN_MESSAGE: + break; + default: + log_err("unknown trace act %x\n", t->action); + return 1; + } + + return 0; +} + +static void __add_bs(struct btrace_out *o, unsigned int len, int rw) +{ + o->bs[rw] = realloc(o->bs[rw], (o->nr_bs[rw] + 1) * sizeof(struct bs)); + o->bs[rw][o->nr_bs[rw]].bs = len; + o->bs[rw][o->nr_bs[rw]].nr = 1; + o->nr_bs[rw]++; +} + +static void add_bs(struct btrace_out *o, unsigned int len, int rw) +{ + struct bs *bs = o->bs[rw]; + int i; + + if (!o->nr_bs[rw]) { + __add_bs(o, len, rw); + return; + } + + for (i = 0; i < o->nr_bs[rw]; i++) { + if (bs[i].bs == len) { + bs[i].nr++; + return; + } + } + + __add_bs(o, len, rw); +} + +#define FMINORBITS 20 +#define FMINORMASK ((1U << FMINORBITS) - 1) +#define FMAJOR(dev) ((unsigned int) ((dev) >> FMINORBITS)) +#define FMINOR(dev) ((unsigned int) ((dev) & FMINORMASK)) + +static int btrace_add_file(struct btrace_pid *p, uint32_t devno) +{ + unsigned int maj = FMAJOR(devno); + unsigned int min = FMINOR(devno); + struct trace_file *f; + unsigned int i; + char dev[256]; + + if (filename) + return 0; + if (p->last_major == maj && p->last_minor == min) + return 0; + + p->last_major = maj; + p->last_minor = min; + + /* + * check for this file in our list + */ + for (i = 0; i < p->nr_files; i++) { + f = &p->files[i]; + + if (f->major == maj && f->minor == min) + return 0; + } + + strcpy(dev, "/dev"); + if (!blktrace_lookup_device(NULL, dev, maj, min)) { + log_err("fio: failed to find device %u/%u\n", maj, min); + if (!output_ascii) { + log_err("fio: use -d to specify device\n"); + return 1; + } + return 0; + } + + p->files = realloc(p->files, (p->nr_files + 1) * sizeof(*f)); + f = &p->files[p->nr_files]; + f->name = strdup(dev); + f->major = maj; + f->minor = min; + p->nr_files++; + return 0; +} + +static int t_to_rwdir(struct blk_io_trace *t) +{ + if (t->action & BLK_TC_ACT(BLK_TC_DISCARD)) + return DDIR_TRIM; + + return (t->action & BLK_TC_ACT(BLK_TC_WRITE)) != 0; +} + +static int handle_trace_discard(struct blk_io_trace *t, struct btrace_pid *p) +{ + struct btrace_out *o = &p->o; + + if (btrace_add_file(p, t->device)) + return 1; + + if (o->first_ttime[2] == -1ULL) + o->first_ttime[2] = t->time; + + o->ios[DDIR_TRIM]++; + add_bs(o, t->bytes, DDIR_TRIM); + return 0; +} + +static int handle_trace_fs(struct blk_io_trace *t, struct btrace_pid *p) +{ + struct btrace_out *o = &p->o; + int rw; + + if (btrace_add_file(p, t->device)) + return 1; + + first_ttime = min(first_ttime, (uint64_t) t->time); + + rw = (t->action & BLK_TC_ACT(BLK_TC_WRITE)) != 0; + + if (o->first_ttime[rw] == -1ULL) + o->first_ttime[rw] = t->time; + + add_bs(o, t->bytes, rw); + o->ios[rw]++; + + if (t->sector == o->last_end[rw] || o->last_end[rw] == -1ULL) + o->seq[rw]++; + + o->last_end[rw] = t->sector + (t->bytes >> 9); + return 0; +} + +static int handle_queue_trace(struct blk_io_trace *t, struct btrace_pid *p) +{ + if (t->action & BLK_TC_ACT(BLK_TC_NOTIFY)) + return handle_trace_notify(t); + else if (t->action & BLK_TC_ACT(BLK_TC_DISCARD)) + return handle_trace_discard(t, p); + else + return 
handle_trace_fs(t, p); +} + +static int handle_trace(struct blk_io_trace *t, struct btrace_pid *p) +{ + unsigned int act = t->action & 0xffff; + int ret = 0; + + if (act == __BLK_TA_QUEUE) { + inflight_add(p, t->sector, t->bytes); + ret = handle_queue_trace(t, p); + } else if (act == __BLK_TA_BACKMERGE) { + struct inflight *i; + + i = inflight_find(t->sector + (t->bytes >> 9)); + if (i) + inflight_remove(i); + + i = inflight_find(t->sector); + if (i) + inflight_merge(i, t_to_rwdir(t), t->bytes); + } else if (act == __BLK_TA_FRONTMERGE) { + struct inflight *i; + + i = inflight_find(t->sector + (t->bytes >> 9)); + if (i) + inflight_remove(i); + + i = inflight_find(t->sector); + if (i) + inflight_merge(i, t_to_rwdir(t), 0); + } else if (act == __BLK_TA_COMPLETE) { + struct inflight *i; + + i = inflight_find(t->sector + (t->bytes >> 9)); + if (i) { + i->p->o.kib[t_to_rwdir(t)] += (t->bytes >> 10); + i->p->o.complete_seen = 1; + inflight_remove(i); + } + } + + return ret; +} + +static void byteswap_trace(struct blk_io_trace *t) +{ + t->magic = fio_swap32(t->magic); + t->sequence = fio_swap32(t->sequence); + t->time = fio_swap64(t->time); + t->sector = fio_swap64(t->sector); + t->bytes = fio_swap32(t->bytes); + t->action = fio_swap32(t->action); + t->pid = fio_swap32(t->pid); + t->device = fio_swap32(t->device); + t->cpu = fio_swap32(t->cpu); + t->error = fio_swap16(t->error); + t->pdu_len = fio_swap16(t->pdu_len); +} + +static struct btrace_pid *pid_hash_find(pid_t pid, struct flist_head *list) +{ + struct flist_head *e; + struct btrace_pid *p; + + flist_for_each(e, list) { + p = flist_entry(e, struct btrace_pid, hash_list); + if (p->pid == pid) + return p; + } + + return NULL; +} + +static struct btrace_pid *pid_hash_get(pid_t pid) +{ + struct flist_head *hash_list; + struct btrace_pid *p; + + hash_list = &pid_hash[hash_long(pid, PID_HASH_BITS)]; + + p = pid_hash_find(pid, hash_list); + if (!p) { + int i; + + p = calloc(1, sizeof(*p)); + + for (i = 0; i < DDIR_RWDIR_CNT; i++) { + p->o.first_ttime[i] = -1ULL; + p->o.last_ttime[i] = -1ULL; + p->o.last_end[i] = -1ULL; + } + + p->pid = pid; + p->numjobs = 1; + flist_add_tail(&p->hash_list, hash_list); + flist_add_tail(&p->pid_list, &pid_list); + } + + return p; +} + +/* + * Load a blktrace file by reading all the blk_io_trace entries, and storing + * them as io_pieces like the fio text version would do. 
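+ *
+ * Roughly, the loop below amounts to:
+ *
+ *	while (trace_fifo_get(fifo, fd, &t, sizeof(t)) == sizeof(t)) {
+ *		if (need_swap)
+ *			byteswap_trace(&t);
+ *		check magic/version, discard_pdu(fifo, fd, &t), then
+ *		handle_trace(&t, pid_hash_get(t.pid));
+ *	}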
+ */ +static int load_blktrace(const char *fname, int need_swap) +{ + struct btrace_pid *p; + unsigned long traces; + struct blk_io_trace t; + struct fifo *fifo; + int fd, ret = 0; + + fd = open(fname, O_RDONLY); + if (fd < 0) { + perror("open trace file\n"); + return 1; + } + + fifo = fifo_alloc(TRACE_FIFO_SIZE); + + traces = 0; + do { + ret = trace_fifo_get(fifo, fd, &t, sizeof(t)); + if (ret < 0) + goto err; + else if (!ret) + break; + else if (ret < (int) sizeof(t)) { + log_err("fio: short fifo get\n"); + break; + } + + if (need_swap) + byteswap_trace(&t); + + if ((t.magic & 0xffffff00) != BLK_IO_TRACE_MAGIC) { + log_err("fio: bad magic in blktrace data: %x\n", t.magic); + goto err; + } + if ((t.magic & 0xff) != BLK_IO_TRACE_VERSION) { + log_err("fio: bad blktrace version %d\n", t.magic & 0xff); + goto err; + } + ret = discard_pdu(fifo, fd, &t); + if (ret < 0) { + log_err("blktrace lseek\n"); + goto err; + } else if (t.pdu_len != ret) { + log_err("fio: discarded %d of %d\n", ret, t.pdu_len); + goto err; + } + + p = pid_hash_get(t.pid); + ret = handle_trace(&t, p); + if (ret) + break; + p->o.last_ttime[t_to_rwdir(&t)] = t.time; + traces++; + } while (1); + + fifo_free(fifo); + close(fd); + + if (ret) + return ret; + + if (output_ascii) + printf("Traces loaded: %lu\n", traces); + + return 0; +err: + close(fd); + fifo_free(fifo); + return 1; +} + +static int bs_cmp(const void *ba, const void *bb) +{ + const struct bs *bsa = ba; + const struct bs *bsb = bb; + + return bsb->nr - bsa->nr; +} + +static unsigned long o_to_kib_rate(struct btrace_out *o, int rw) +{ + uint64_t usec = (o->last_ttime[rw] - o->first_ttime[rw]) / 1000ULL; + uint64_t val; + + if (!usec) + return 0; + + usec /= 1000; + if (!usec) + return 0; + + val = o->kib[rw] * 1000ULL; + return val / usec; +} + +static uint64_t o_first_ttime(struct btrace_out *o) +{ + uint64_t first; + + first = min(o->first_ttime[0], o->first_ttime[1]); + return min(first, o->first_ttime[2]); +} + +static uint64_t o_longest_ttime(struct btrace_out *o) +{ + uint64_t ret = 0; + int i; + + for (i = 0; i < DDIR_RWDIR_CNT; i++) { + uint64_t diff; + + diff = o->last_ttime[i] - o->first_ttime[i]; + ret = max(diff, ret); + } + + return ret; +} + +static void __output_p_ascii(struct btrace_pid *p, unsigned long *ios) +{ + const char *msg[] = { "reads", "writes", "trims" }; + struct btrace_out *o = &p->o; + unsigned long total, usec; + int i, j; + + printf("[pid:\t%u", p->pid); + if (p->nr_merge_pids) + for (i = 0; i < p->nr_merge_pids; i++) + printf(", %u", p->merge_pids[i]); + printf("]\n"); + + total = ddir_rw_sum(o->ios); + for (i = 0; i < DDIR_RWDIR_CNT; i++) { + float perc; + + if (!o->ios[i]) + continue; + + ios[i] += o->ios[i] + o->merges[i]; + printf("%s\n", msg[i]); + perc = ((float) o->ios[i] * 100.0) / (float) total; + printf("\tios: %lu (perc=%3.2f%%)\n", o->ios[i], perc); + perc = ((float) o->merges[i] * 100.0) / (float) total; + printf("\tmerges: %lu (perc=%3.2f%%)\n", o->merges[i], perc); + perc = ((float) o->seq[i] * 100.0) / (float) o->ios[i]; + printf("\tseq: %lu (perc=%3.2f%%)\n", (unsigned long) o->seq[i], perc); + printf("\trate: %lu KiB/sec\n", o_to_kib_rate(o, i)); + + for (j = 0; j < o->nr_bs[i]; j++) { + struct bs *bs = &o->bs[i][j]; + + perc = (((float) bs->nr * 100.0) / (float) o->ios[i]); + printf("\tbs=%u, perc=%3.2f%%\n", bs->bs, perc); + } + } + + printf("depth:\t%u\n", o->depth); + usec = o_longest_ttime(o) / 1000ULL; + printf("usec:\t%lu (delay=%llu)\n", usec, (unsigned long long) o->start_delay); + + printf("files:\t"); + 
for (i = 0; i < p->nr_files; i++) + printf("%s,", p->files[i].name); + printf("\n"); + + printf("\n"); +} + +static int __output_p_fio(struct btrace_pid *p, unsigned long *ios) +{ + struct btrace_out *o = &p->o; + unsigned long total; + unsigned long long time; + float perc; + int i, j; + + if ((o->ios[0] + o->ios[1]) && o->ios[2]) { + log_err("fio: trace has both read/write and trim\n"); + return 1; + } + if (!p->nr_files) { + log_err("fio: no devices found\n"); + return 1; + } + + printf("[pid%u", p->pid); + if (p->nr_merge_pids) + for (i = 0; i < p->nr_merge_pids; i++) + printf(",pid%u", p->merge_pids[i]); + printf("]\n"); + + printf("numjobs=%u\n", p->numjobs); + printf("direct=1\n"); + if (o->depth == 1) + printf("ioengine=sync\n"); + else + printf("ioengine=libaio\niodepth=%u\n", o->depth); + + if (o->ios[0] && !o->ios[1]) + printf("rw=randread\n"); + else if (!o->ios[0] && o->ios[1]) + printf("rw=randwrite\n"); + else if (o->ios[2]) + printf("rw=randtrim\n"); + else { + printf("rw=randrw\n"); + total = ddir_rw_sum(o->ios); + perc = ((float) o->ios[0] * 100.0) / (float) total; + printf("rwmixread=%u\n", (int) floor(perc + 0.50)); + } + + printf("percentage_random="); + for (i = 0; i < DDIR_RWDIR_CNT; i++) { + if (o->seq[i] && o->ios[i]) { + perc = ((float) o->seq[i] * 100.0) / (float) o->ios[i]; + if (perc >= 99.0) + perc = 100.0; + } else + perc = 100.0; + + if (i) + printf(","); + perc = 100.0 - perc; + printf("%u", (int) floor(perc + 0.5)); + } + printf("\n"); + + printf("filename="); + for (i = 0; i < p->nr_files; i++) { + if (i) + printf(":"); + printf("%s", p->files[i].name); + } + printf("\n"); + + if (o->start_delay / 1000000ULL) + printf("startdelay=%llus\n", o->start_delay / 1000000ULL); + + time = o_longest_ttime(o); + time = (time + 1000000000ULL - 1) / 1000000000ULL; + printf("runtime=%llus\n", time); + + printf("bssplit="); + for (i = 0; i < DDIR_RWDIR_CNT; i++) { + + if (i && o->nr_bs[i - 1] && o->nr_bs[i]) + printf(","); + + for (j = 0; j < o->nr_bs[i]; j++) { + struct bs *bs = &o->bs[i][j]; + + perc = (((float) bs->nr * 100.0) / (float) o->ios[i]); + if (perc < 1.00) + continue; + if (j) + printf(":"); + if (j + 1 == o->nr_bs[i]) + printf("%u/", bs->bs); + else + printf("%u/%u", bs->bs, (int) floor(perc + 0.5)); + } + } + printf("\n"); + + if (set_rate) { + printf("rate="); + for (i = 0; i < DDIR_RWDIR_CNT; i++) { + unsigned long rate; + + rate = o_to_kib_rate(o, i); + if (i) + printf(","); + if (rate) + printf("%luk", rate); + } + printf("\n"); + } + + if (n_add_opts) + for (i = 0; i < n_add_opts; i++) + printf("%s\n", add_opts[i]); + + printf("\n"); + return 0; +} + +static int __output_p(struct btrace_pid *p, unsigned long *ios) +{ + struct btrace_out *o = &p->o; + int i, ret = 0; + + for (i = 0; i < DDIR_RWDIR_CNT; i++) { + if (o->nr_bs[i] <= 1) + continue; + qsort(o->bs[i], o->nr_bs[i], sizeof(struct bs), bs_cmp); + } + + if (filename) { + p->files = malloc(sizeof(struct trace_file)); + p->nr_files++; + p->files[0].name = filename; + } + + if (output_ascii) + __output_p_ascii(p, ios); + else + ret = __output_p_fio(p, ios); + + return ret; +} + +static void remove_ddir(struct btrace_out *o, int rw) +{ + o->ios[rw] = 0; +} + +static int prune_entry(struct btrace_out *o) +{ + unsigned long rate; + uint64_t time; + int i; + + if (ddir_rw_sum(o->ios) < ios_threshold) + return 1; + + time = o_longest_ttime(o) / 1000ULL; + if (time < rt_threshold) + return 1; + + rate = 0; + for (i = 0; i < DDIR_RWDIR_CNT; i++) { + unsigned long this_rate; + + this_rate = 
o_to_kib_rate(o, i); + if (this_rate < rate_threshold) { + remove_ddir(o, i); + this_rate = 0; + } + rate += this_rate; + } + + if (rate < rate_threshold) + return 1; + + return 0; +} + +static int entry_cmp(void *priv, struct flist_head *a, struct flist_head *b) +{ + struct btrace_pid *pa = flist_entry(a, struct btrace_pid, pid_list); + struct btrace_pid *pb = flist_entry(b, struct btrace_pid, pid_list); + + return ddir_rw_sum(pb->o.ios) - ddir_rw_sum(pa->o.ios); +} + +static void free_p(struct btrace_pid *p) +{ + struct btrace_out *o = &p->o; + int i; + + for (i = 0; i < p->nr_files; i++) { + if (p->files[i].name && p->files[i].name != filename) + free(p->files[i].name); + } + + for (i = 0; i < DDIR_RWDIR_CNT; i++) + free(o->bs[i]); + + free(p->files); + flist_del(&p->pid_list); + flist_del(&p->hash_list); + free(p); +} + +static int entries_close(struct btrace_pid *pida, struct btrace_pid *pidb) +{ + float perca, percb, fdiff; + int i, idiff; + + for (i = 0; i < DDIR_RWDIR_CNT; i++) { + if ((pida->o.ios[i] && !pidb->o.ios[i]) || + (pidb->o.ios[i] && !pida->o.ios[i])) + return 0; + if (pida->o.ios[i] && pidb->o.ios[i]) { + perca = ((float) pida->o.seq[i] * 100.0) / (float) pida->o.ios[i]; + percb = ((float) pidb->o.seq[i] * 100.0) / (float) pidb->o.ios[i]; + fdiff = perca - percb; + if (fabs(fdiff) > random_diff) + return 0; + } + + idiff = pida->o.depth - pidb->o.depth; + if (abs(idiff) > depth_diff) + return 0; + } + + return 1; +} + +static void merge_bs(struct bs **bsap, unsigned int *nr_bsap, + struct bs *bsb, unsigned int nr_bsb) +{ + struct bs *bsa = *bsap; + unsigned int nr_bsa = *nr_bsap; + int a, b; + + for (b = 0; b < nr_bsb; b++) { + int next, found = 0; + + for (a = 0; a < nr_bsa; a++) { + if (bsb[b].bs != bsa[a].bs) + continue; + + bsa[a].nr += bsb[b].nr; + bsa[a].merges += bsb[b].merges; + found = 1; + break; + } + + if (found) + continue; + + next = *nr_bsap; + bsa = realloc(bsa, (next + 1) * sizeof(struct bs)); + bsa[next].bs = bsb[b].bs; + bsa[next].nr = bsb[b].nr; + (*nr_bsap)++; + *bsap = bsa; + } +} + +static int merge_entries(struct btrace_pid *pida, struct btrace_pid *pidb) +{ + int i; + + if (!entries_close(pida, pidb)) + return 0; + + pida->nr_merge_pids++; + pida->merge_pids = realloc(pida->merge_pids, pida->nr_merge_pids * sizeof(pid_t)); + pida->merge_pids[pida->nr_merge_pids - 1] = pidb->pid; + + for (i = 0; i < DDIR_RWDIR_CNT; i++) { + struct btrace_out *oa = &pida->o; + struct btrace_out *ob = &pidb->o; + + oa->ios[i] += ob->ios[i]; + oa->merges[i] += ob->merges[i]; + oa->seq[i] += ob->seq[i]; + oa->kib[i] += ob->kib[i]; + oa->first_ttime[i] = min(oa->first_ttime[i], ob->first_ttime[i]); + oa->last_ttime[i] = max(oa->last_ttime[i], ob->last_ttime[i]); + merge_bs(&oa->bs[i], &oa->nr_bs[i], ob->bs[i], ob->nr_bs[i]); + } + + pida->o.start_delay = min(pida->o.start_delay, pidb->o.start_delay); + pida->o.depth = (pida->o.depth + pidb->o.depth) / 2; + return 1; +} + +static void check_merges(struct btrace_pid *p, struct flist_head *pidlist) +{ + struct flist_head *e, *tmp; + + if (p->ignore) + return; + + flist_for_each_safe(e, tmp, pidlist) { + struct btrace_pid *pidb; + + pidb = flist_entry(e, struct btrace_pid, pid_list); + if (pidb == p) + continue; + + if (merge_entries(p, pidb)) { + pidb->ignore = 1; + p->numjobs++; + } + } +} + +static int output_p(void) +{ + unsigned long ios[DDIR_RWDIR_CNT]; + struct flist_head *e, *tmp; + int depth_disabled = 0; + int ret = 0; + + flist_for_each_safe(e, tmp, &pid_list) { + struct btrace_pid *p; + + p = flist_entry(e, 
struct btrace_pid, pid_list); + if (prune_entry(&p->o)) { + free_p(p); + continue; + } + p->o.start_delay = (o_first_ttime(&p->o) / 1000ULL) - first_ttime; + depth_disabled += p->o.depth_disabled; + } + + if (collapse_entries) { + struct btrace_pid *p; + + flist_for_each_safe(e, tmp, &pid_list) { + p = flist_entry(e, struct btrace_pid, pid_list); + check_merges(p, &pid_list); + } + + flist_for_each_safe(e, tmp, &pid_list) { + p = flist_entry(e, struct btrace_pid, pid_list); + if (p->ignore) + free_p(p); + } + } + + if (depth_disabled) + log_err("fio: missing completion traces, depths capped at %u\n", max_depth); + + memset(ios, 0, sizeof(ios)); + + flist_sort(NULL, &pid_list, entry_cmp); + + flist_for_each(e, &pid_list) { + struct btrace_pid *p; + + p = flist_entry(e, struct btrace_pid, pid_list); + ret |= __output_p(p, ios); + if (ret && !output_ascii) + break; + } + + if (output_ascii) + printf("Total: reads=%lu, writes=%lu\n", ios[0], ios[1]); + + return ret; +} + +static int usage(char *argv[]) +{ + log_err("%s: [options] \n", argv[0]); + log_err("\t-t\tUsec threshold to ignore task\n"); + log_err("\t-n\tNumber IOS threshold to ignore task\n"); + log_err("\t-f\tFio job file output\n"); + log_err("\t-d\tUse this file/device for replay\n"); + log_err("\t-r\tIgnore jobs with less than this KiB/sec rate\n"); + log_err("\t-R\tSet rate in fio job (def=%u)\n", set_rate); + log_err("\t-D\tCap queue depth at this value (def=%u)\n", max_depth); + log_err("\t-c\tCollapse \"identical\" jobs (def=%u)\n", collapse_entries); + log_err("\t-u\tDepth difference for collapse (def=%u)\n", depth_diff); + log_err("\t-x\tRandom difference for collapse (def=%u)\n", random_diff); + log_err("\t-a\tAdditional fio option to add to job file\n"); + return 1; +} + +static int trace_needs_swap(const char *trace_file, int *swap) +{ + struct blk_io_trace t; + int fd, ret; + + *swap = -1; + + fd = open(trace_file, O_RDONLY); + if (fd < 0) { + perror("open"); + return 1; + } + + ret = read(fd, &t, sizeof(t)); + if (ret < 0) { + close(fd); + perror("read"); + return 1; + } else if (ret != sizeof(t)) { + close(fd); + log_err("fio: short read on trace file\n"); + return 1; + } + + close(fd); + + if ((t.magic & 0xffffff00) == BLK_IO_TRACE_MAGIC) + *swap = 0; + else { + /* + * Maybe it needs to be endian swapped... 
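+		 * (blktrace records the magic in the tracing machine's
+		 * native byte order, so a trace taken on an opposite-endian
+		 * host only matches BLK_IO_TRACE_MAGIC after fio_swap32();
+		 * if neither orientation matches, this is not blktrace data.)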
+ */ + t.magic = fio_swap32(t.magic); + if ((t.magic & 0xffffff00) == BLK_IO_TRACE_MAGIC) + *swap = 1; + } + + if (*swap == -1) { + log_err("fio: blktrace appears corrupt\n"); + return 1; + } + + return 0; +} + +int main(int argc, char *argv[]) +{ + int need_swap, i, c; + + if (argc < 2) + return usage(argv); + + while ((c = getopt(argc, argv, "t:n:fd:r:RD:c:u:x:a:")) != -1) { + switch (c) { + case 'R': + set_rate = 1; + break; + case 'r': + rate_threshold = atoi(optarg); + break; + case 't': + rt_threshold = atoi(optarg); + break; + case 'n': + ios_threshold = atoi(optarg); + break; + case 'f': + output_ascii = 0; + break; + case 'd': + filename = strdup(optarg); + break; + case 'D': + max_depth = atoi(optarg); + break; + case 'c': + collapse_entries = atoi(optarg); + break; + case 'u': + depth_diff = atoi(optarg); + break; + case 'x': + random_diff = atoi(optarg); + break; + case 'a': + add_opts = realloc(add_opts, (n_add_opts + 1) * sizeof(char *)); + add_opts[n_add_opts] = strdup(optarg); + n_add_opts++; + break; + case '?': + default: + return usage(argv); + } + } + + if (argc == optind) + return usage(argv); + + if (trace_needs_swap(argv[optind], &need_swap)) + return 1; + + for (i = 0; i < PID_HASH_SIZE; i++) + INIT_FLIST_HEAD(&pid_hash[i]); + for (i = 0; i < INFLIGHT_HASH_SIZE; i++) + INIT_FLIST_HEAD(&inflight_hash[i]); + + load_blktrace(argv[optind], need_swap); + first_ttime /= 1000ULL; + + return output_p(); +} diff --git a/t/debug.c b/t/debug.c new file mode 100644 index 0000000..8965cfb --- /dev/null +++ b/t/debug.c @@ -0,0 +1,14 @@ +#include + +FILE *f_err; +struct timespec *fio_ts = NULL; +unsigned long fio_debug = 0; + +void __dprint(int type, const char *str, ...) +{ +} + +void debug_init(void) +{ + f_err = stderr; +} diff --git a/t/debug.h b/t/debug.h new file mode 100644 index 0000000..9d1d415 --- /dev/null +++ b/t/debug.h @@ -0,0 +1,6 @@ +#ifndef FIO_DEBUG_INC_H +#define FIO_DEBUG_INC_H + +extern void debug_init(void); + +#endif diff --git a/t/dedupe.c b/t/dedupe.c new file mode 100644 index 0000000..68d31f1 --- /dev/null +++ b/t/dedupe.c @@ -0,0 +1,596 @@ +/* + * Small tool to check for dedupable blocks in a file or device. Basically + * just scans the filename for extents of the given size, checksums them, + * and orders them up. 
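+ *
+ * Rough flow: carve the file into chunk_size work units, let each
+ * worker thread MD5 every blocksize block in its unit, then count
+ * repeated hashes either exactly (rbtree of chunks, used for -c and
+ * -d) or probabilistically (bloom filter, the default) to estimate
+ * the dedupable fraction.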
 */
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <string.h>
+#include <fcntl.h>
+#include <sys/stat.h>
+
+#include "../fio.h"
+#include "../flist.h"
+#include "../log.h"
+#include "../fio_sem.h"
+#include "../smalloc.h"
+#include "../minmax.h"
+#include "../crc/md5.h"
+#include "../os/os.h"
+#include "../gettime.h"
+#include "../fio_time.h"
+#include "../lib/rbtree.h"
+
+#include "../lib/bloom.h"
+#include "debug.h"
+
+struct worker_thread {
+	pthread_t thread;
+
+	volatile int done;
+
+	int fd;
+	uint64_t cur_offset;
+	uint64_t size;
+
+	unsigned long items;
+	unsigned long dupes;
+	int err;
+};
+
+struct extent {
+	struct flist_head list;
+	uint64_t offset;
+};
+
+struct chunk {
+	struct fio_rb_node rb_node;
+	uint64_t count;
+	uint32_t hash[MD5_HASH_WORDS];
+	struct flist_head extent_list[0];
+};
+
+struct item {
+	uint64_t offset;
+	uint32_t hash[MD5_HASH_WORDS];
+};
+
+static struct rb_root rb_root;
+static struct bloom *bloom;
+static struct fio_sem *rb_lock;
+
+static unsigned int blocksize = 4096;
+static unsigned int num_threads;
+static unsigned int chunk_size = 1048576;
+static unsigned int dump_output;
+static unsigned int odirect;
+static unsigned int collision_check;
+static unsigned int print_progress = 1;
+static unsigned int use_bloom = 1;
+
+static uint64_t total_size;
+static uint64_t cur_offset;
+static struct fio_sem *size_lock;
+
+static struct fio_file file;
+
+static uint64_t get_size(struct fio_file *f, struct stat *sb)
+{
+	uint64_t ret;
+
+	if (S_ISBLK(sb->st_mode)) {
+		unsigned long long bytes = 0;
+
+		if (blockdev_size(f, &bytes)) {
+			log_err("dedupe: failed getting bdev size\n");
+			return 0;
+		}
+		ret = bytes;
+	} else
+		ret = sb->st_size;
+
+	return (ret & ~((uint64_t)blocksize - 1));
+}
+
+static int get_work(uint64_t *offset, uint64_t *size)
+{
+	uint64_t this_chunk;
+	int ret = 1;
+
+	fio_sem_down(size_lock);
+
+	if (cur_offset < total_size) {
+		*offset = cur_offset;
+		this_chunk = min((uint64_t)chunk_size, total_size - cur_offset);
+		*size = this_chunk;
+		cur_offset += this_chunk;
+		ret = 0;
+	}
+
+	fio_sem_up(size_lock);
+	return ret;
+}
+
+static int __read_block(int fd, void *buf, off_t offset, size_t count)
+{
+	ssize_t ret;
+
+	ret = pread(fd, buf, count, offset);
+	if (ret < 0) {
+		perror("pread");
+		return 1;
+	} else if (!ret)
+		return 1;
+	else if (ret != count) {
+		log_err("dedupe: short read on block\n");
+		return 1;
+	}
+
+	return 0;
+}
+
+static int read_block(int fd, void *buf, off_t offset)
+{
+	return __read_block(fd, buf, offset, blocksize);
+}
+
+static void add_item(struct chunk *c, struct item *i)
+{
+	/*
+	 * Save some memory and don't add extent items, if we don't
+	 * use them.
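+	 * (The extent list is only consulted by the -d dump output and by
+	 * col_check(), which re-reads one stored extent and memcmp()s it
+	 * against the new block; the default bloom/counting path never
+	 * walks it, so the allocation can be skipped entirely.)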
+ */ + if (dump_output || collision_check) { + struct extent *e; + + e = malloc(sizeof(*e)); + e->offset = i->offset; + flist_add_tail(&e->list, &c->extent_list[0]); + } + + c->count++; +} + +static int col_check(struct chunk *c, struct item *i) +{ + struct extent *e; + char *cbuf, *ibuf; + int ret = 1; + + cbuf = fio_memalign(blocksize, blocksize, false); + ibuf = fio_memalign(blocksize, blocksize, false); + + e = flist_entry(c->extent_list[0].next, struct extent, list); + if (read_block(file.fd, cbuf, e->offset)) + goto out; + + if (read_block(file.fd, ibuf, i->offset)) + goto out; + + ret = memcmp(ibuf, cbuf, blocksize); +out: + fio_memfree(cbuf, blocksize, false); + fio_memfree(ibuf, blocksize, false); + return ret; +} + +static struct chunk *alloc_chunk(void) +{ + struct chunk *c; + + if (collision_check || dump_output) { + c = malloc(sizeof(struct chunk) + sizeof(struct flist_head)); + INIT_FLIST_HEAD(&c->extent_list[0]); + } else + c = malloc(sizeof(struct chunk)); + + return c; +} + +static void insert_chunk(struct item *i) +{ + struct fio_rb_node **p, *parent; + struct chunk *c; + int diff; + + p = &rb_root.rb_node; + parent = NULL; + while (*p) { + parent = *p; + + c = rb_entry(parent, struct chunk, rb_node); + diff = memcmp(i->hash, c->hash, sizeof(i->hash)); + if (diff < 0) + p = &(*p)->rb_left; + else if (diff > 0) + p = &(*p)->rb_right; + else { + int ret; + + if (!collision_check) + goto add; + + fio_sem_up(rb_lock); + ret = col_check(c, i); + fio_sem_down(rb_lock); + + if (!ret) + goto add; + + p = &(*p)->rb_right; + } + } + + c = alloc_chunk(); + RB_CLEAR_NODE(&c->rb_node); + c->count = 0; + memcpy(c->hash, i->hash, sizeof(i->hash)); + rb_link_node(&c->rb_node, parent, p); + rb_insert_color(&c->rb_node, &rb_root); +add: + add_item(c, i); +} + +static void insert_chunks(struct item *items, unsigned int nitems, + uint64_t *ndupes) +{ + int i; + + fio_sem_down(rb_lock); + + for (i = 0; i < nitems; i++) { + if (bloom) { + unsigned int s; + int r; + + s = sizeof(items[i].hash) / sizeof(uint32_t); + r = bloom_set(bloom, items[i].hash, s); + *ndupes += r; + } else + insert_chunk(&items[i]); + } + + fio_sem_up(rb_lock); +} + +static void crc_buf(void *buf, uint32_t *hash) +{ + struct fio_md5_ctx ctx = { .hash = hash }; + + fio_md5_init(&ctx); + fio_md5_update(&ctx, buf, blocksize); + fio_md5_final(&ctx); +} + +static unsigned int read_blocks(int fd, void *buf, off_t offset, size_t size) +{ + if (__read_block(fd, buf, offset, size)) + return 0; + + return size / blocksize; +} + +static int do_work(struct worker_thread *thread, void *buf) +{ + unsigned int nblocks, i; + off_t offset; + int nitems = 0; + uint64_t ndupes = 0; + struct item *items; + + offset = thread->cur_offset; + + nblocks = read_blocks(thread->fd, buf, offset, min(thread->size, (uint64_t)chunk_size)); + if (!nblocks) + return 1; + + items = malloc(sizeof(*items) * nblocks); + + for (i = 0; i < nblocks; i++) { + void *thisptr = buf + (i * blocksize); + + items[i].offset = offset; + crc_buf(thisptr, items[i].hash); + offset += blocksize; + nitems++; + } + + insert_chunks(items, nitems, &ndupes); + + free(items); + thread->items += nitems; + thread->dupes += ndupes; + return 0; +} + +static void *thread_fn(void *data) +{ + struct worker_thread *thread = data; + void *buf; + + buf = fio_memalign(blocksize, chunk_size, false); + + do { + if (get_work(&thread->cur_offset, &thread->size)) { + thread->err = 1; + break; + } + if (do_work(thread, buf)) { + thread->err = 1; + break; + } + } while (1); + + thread->done = 1; + 
fio_memfree(buf, chunk_size, false); + return NULL; +} + +static void show_progress(struct worker_thread *threads, unsigned long total) +{ + unsigned long last_nitems = 0; + struct timespec last_tv; + + fio_gettime(&last_tv, NULL); + + while (print_progress) { + unsigned long this_items; + unsigned long nitems = 0; + uint64_t tdiff; + float perc; + int some_done = 0; + int i; + + for (i = 0; i < num_threads; i++) { + nitems += threads[i].items; + some_done = threads[i].done; + if (some_done) + break; + } + + if (some_done) + break; + + perc = (float) nitems / (float) total; + perc *= 100.0; + this_items = nitems - last_nitems; + this_items *= blocksize; + tdiff = mtime_since_now(&last_tv); + if (tdiff) { + this_items = (this_items * 1000) / (tdiff * 1024); + printf("%3.2f%% done (%luKiB/sec)\r", perc, this_items); + last_nitems = nitems; + fio_gettime(&last_tv, NULL); + } else + printf("%3.2f%% done\r", perc); + fflush(stdout); + usleep(250000); + }; +} + +static int run_dedupe_threads(struct fio_file *f, uint64_t dev_size, + uint64_t *nextents, uint64_t *nchunks) +{ + struct worker_thread *threads; + unsigned long nitems, total_items; + int i, err = 0; + + total_size = dev_size; + total_items = dev_size / blocksize; + cur_offset = 0; + size_lock = fio_sem_init(FIO_SEM_UNLOCKED); + + threads = malloc(num_threads * sizeof(struct worker_thread)); + for (i = 0; i < num_threads; i++) { + memset(&threads[i], 0, sizeof(struct worker_thread)); + threads[i].fd = f->fd; + + err = pthread_create(&threads[i].thread, NULL, thread_fn, &threads[i]); + if (err) { + log_err("fio: thread startup failed\n"); + break; + } + } + + show_progress(threads, total_items); + + nitems = 0; + *nextents = 0; + *nchunks = 1; + for (i = 0; i < num_threads; i++) { + void *ret; + pthread_join(threads[i].thread, &ret); + nitems += threads[i].items; + *nchunks += threads[i].dupes; + } + + printf("Threads(%u): %lu items processed\n", num_threads, nitems); + + *nextents = nitems; + *nchunks = nitems - *nchunks; + + fio_sem_remove(size_lock); + free(threads); + return err; +} + +static int dedupe_check(const char *filename, uint64_t *nextents, + uint64_t *nchunks) +{ + uint64_t dev_size; + struct stat sb; + int flags; + + flags = O_RDONLY; + if (odirect) + flags |= OS_O_DIRECT; + + memset(&file, 0, sizeof(file)); + file.file_name = strdup(filename); + + file.fd = open(filename, flags); + if (file.fd == -1) { + perror("open"); + goto err; + } + + if (fstat(file.fd, &sb) < 0) { + perror("fstat"); + goto err; + } + + dev_size = get_size(&file, &sb); + if (!dev_size) + goto err; + + if (use_bloom) { + uint64_t bloom_entries; + + bloom_entries = 8 * (dev_size / blocksize); + bloom = bloom_new(bloom_entries); + } + + printf("Will check <%s>, size <%llu>, using %u threads\n", filename, (unsigned long long) dev_size, num_threads); + + return run_dedupe_threads(&file, dev_size, nextents, nchunks); +err: + if (file.fd != -1) + close(file.fd); + free(file.file_name); + return 1; +} + +static void show_chunk(struct chunk *c) +{ + struct flist_head *n; + struct extent *e; + + printf("c hash %8x %8x %8x %8x, count %lu\n", c->hash[0], c->hash[1], c->hash[2], c->hash[3], (unsigned long) c->count); + flist_for_each(n, &c->extent_list[0]) { + e = flist_entry(n, struct extent, list); + printf("\toffset %llu\n", (unsigned long long) e->offset); + } +} + +static void show_stat(uint64_t nextents, uint64_t nchunks) +{ + double perc, ratio; + + printf("Extents=%lu, Unique extents=%lu\n", (unsigned long) nextents, (unsigned long) nchunks); + + if 
(nchunks) {
+		ratio = (double) nextents / (double) nchunks;
+		printf("De-dupe ratio: 1:%3.2f\n", ratio - 1.0);
+	} else
+		printf("De-dupe ratio: 1:infinite\n");
+
+	perc = 1.00 - ((double) nchunks / (double) nextents);
+	perc *= 100.0;
+	printf("Fio setting: dedupe_percentage=%u\n", (int) (perc + 0.50));
+
+}
+
+static void iter_rb_tree(uint64_t *nextents, uint64_t *nchunks)
+{
+	struct fio_rb_node *n;
+
+	*nchunks = *nextents = 0;
+
+	n = rb_first(&rb_root);
+	if (!n)
+		return;
+
+	do {
+		struct chunk *c;
+
+		c = rb_entry(n, struct chunk, rb_node);
+		(*nchunks)++;
+		*nextents += c->count;
+
+		if (dump_output)
+			show_chunk(c);
+
+	} while ((n = rb_next(n)) != NULL);
+}
+
+static int usage(char *argv[])
+{
+	log_err("Check for dedupable blocks on a device/file\n\n");
+	log_err("%s: [options] <device or file>\n", argv[0]);
+	log_err("\t-b\tChunk size to use\n");
+	log_err("\t-t\tNumber of threads to use\n");
+	log_err("\t-d\tFull extent/chunk debug output\n");
+	log_err("\t-o\tUse O_DIRECT\n");
+	log_err("\t-c\tFull collision check\n");
+	log_err("\t-B\tUse probabilistic bloom filter\n");
+	log_err("\t-p\tPrint progress indicator\n");
+	return 1;
+}
+
+int main(int argc, char *argv[])
+{
+	uint64_t nextents = 0, nchunks = 0;
+	int c, ret;
+
+	arch_init(argv);
+	debug_init();
+
+	while ((c = getopt(argc, argv, "b:t:d:o:c:p:B:")) != -1) {
+		switch (c) {
+		case 'b':
+			blocksize = atoi(optarg);
+			break;
+		case 't':
+			num_threads = atoi(optarg);
+			break;
+		case 'd':
+			dump_output = atoi(optarg);
+			break;
+		case 'o':
+			odirect = atoi(optarg);
+			break;
+		case 'c':
+			collision_check = atoi(optarg);
+			break;
+		case 'p':
+			print_progress = atoi(optarg);
+			break;
+		case 'B':
+			use_bloom = atoi(optarg);
+			break;
+		case '?':
+		default:
+			return usage(argv);
+		}
+	}
+
+	if (collision_check || dump_output)
+		use_bloom = 0;
+
+	if (!num_threads)
+		num_threads = cpus_online();
+
+	if (argc == optind)
+		return usage(argv);
+
+	sinit();
+
+	rb_root = RB_ROOT;
+	rb_lock = fio_sem_init(FIO_SEM_UNLOCKED);
+
+	ret = dedupe_check(argv[optind], &nextents, &nchunks);
+
+	if (!ret) {
+		if (!bloom)
+			iter_rb_tree(&nextents, &nchunks);
+
+		show_stat(nextents, nchunks);
+	}
+
+	fio_sem_remove(rb_lock);
+	if (bloom)
+		bloom_free(bloom);
+	scleanup();
+	return ret;
+}
diff --git a/t/gen-rand.c b/t/gen-rand.c
new file mode 100644
index 0000000..b050bd7
--- /dev/null
+++ b/t/gen-rand.c
@@ -0,0 +1,62 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <math.h>
+
+#include "../lib/types.h"
+#include "../lib/rand.h"
+#include "../log.h"
+
+int main(int argc, char *argv[])
+{
+	struct frand_state s;
+	uint64_t i, start, end, nvalues;
+	unsigned long *buckets, index, pass, fail;
+	double p, dev, mean, vmin, vmax;
+
+	if (argc < 4) {
+		log_err("%s: start end nvalues\n", argv[0]);
+		return 1;
+	}
+
+	start = strtoul(argv[1], NULL, 10);
+	end = strtoul(argv[2], NULL, 10);
+
+	if (start >= end) {
+		log_err("%s: start must be smaller than end\n", argv[0]);
+		return 1;
+	}
+	index = 1 + end - start;
+	buckets = calloc(index, sizeof(unsigned long));
+
+	nvalues = strtoul(argv[3], NULL, 10);
+
+	init_rand(&s, false);
+
+	for (i = 0; i < nvalues; i++) {
+		int v = rand_between(&s, start, end);
+
+		buckets[v - start]++;
+	}
+
+	p = 1.0 / index;
+	dev = sqrt(nvalues * p * (1.0 - p));
+	mean = nvalues * p;
+	vmin = mean - dev;
+	vmax = mean + dev;
+
+	pass = fail = 0;
+	for (i = 0; i < index; i++) {
+		if (buckets[i] < vmin || buckets[i] > vmax) {
+			printf("FAIL bucket%4lu: val=%8lu (%.1f < %.1f > %.1f)\n", (unsigned long) i + 1, buckets[i], vmin, mean, vmax);
+			fail++;
+		} else {
+			printf("PASS bucket%4lu: val=%8lu (%.1f < %.1f > %.1f)\n", (unsigned long) i + 1, buckets[i], vmin, mean, vmax);
+			pass++;
+		}
+	}
+
+	printf("Passes=%lu, Fail=%lu\n", pass, fail);
+	free(buckets);
+	return 0;
+}
diff --git a/t/genzipf.c b/t/genzipf.c
new file mode 100644
index 0000000..4fc10ae
--- /dev/null
+++ b/t/genzipf.c
@@ -0,0 +1,349 @@
+/*
+ * Generate/analyze pareto/zipf distributions to better understand
+ * what an access pattern would look like.
+ *
+ * For instance, the following would generate a zipf distribution
+ * with theta 1.2, using 262144 (1 GiB / 4096) values and split the
+ * reporting into 20 buckets:
+ *
+ *	./t/fio-genzipf -t zipf -i 1.2 -g 1 -b 4096 -o 20
+ *
+ * Only the distribution type (zipf or pareto) and spread input need
+ * to be given, if not given defaults are used.
+ *
+ */
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+
+#include "../lib/zipf.h"
+#include "../lib/gauss.h"
+#include "../flist.h"
+#include "../hash.h"
+
+#define DEF_NR_OUTPUT	20
+
+struct node {
+	struct flist_head list;
+	unsigned long long val;
+	unsigned long hits;
+};
+
+static struct flist_head *hash;
+static unsigned long hash_bits = 24;
+static unsigned long hash_size = 1 << 24;
+
+enum {
+	TYPE_NONE = 0,
+	TYPE_ZIPF,
+	TYPE_PARETO,
+	TYPE_NORMAL,
+};
+static const char *dist_types[] = { "None", "Zipf", "Pareto", "Normal" };
+
+enum {
+	OUTPUT_NORMAL,
+	OUTPUT_CSV,
+};
+
+static int dist_type = TYPE_ZIPF;
+static unsigned long gib_size = 500;
+static unsigned long block_size = 4096;
+static unsigned long output_nranges = DEF_NR_OUTPUT;
+static double percentage;
+static double dist_val;
+static int output_type = OUTPUT_NORMAL;
+
+#define DEF_ZIPF_VAL	1.2
+#define DEF_PARETO_VAL	0.3
+
+static unsigned int hashv(unsigned long long val)
+{
+	return jhash(&val, sizeof(val), 0) & (hash_size - 1);
+}
+
+static struct node *hash_lookup(unsigned long long val)
+{
+	struct flist_head *l = &hash[hashv(val)];
+	struct flist_head *entry;
+	struct node *n;
+
+	flist_for_each(entry, l) {
+		n = flist_entry(entry, struct node, list);
+		if (n->val == val)
+			return n;
+	}
+
+	return NULL;
+}
+
+static void hash_insert(struct node *n, unsigned long long val)
+{
+	struct flist_head *l = &hash[hashv(val)];
+
+	n->val = val;
+	n->hits = 1;
+	flist_add_tail(&n->list, l);
+}
+
+static void usage(void)
+{
+	printf("genzipf: test zipf/pareto values for fio input\n");
+	printf("\t-h\tThis help screen\n");
+	printf("\t-p\tGenerate size of data set that are hit by this percentage\n");
+	printf("\t-t\tDistribution type (zipf, pareto, or normal)\n");
+	printf("\t-i\tDistribution algorithm input (zipf theta, pareto power,\n"
+	       "\t\tor normal %% deviation)\n");
+	printf("\t-b\tBlock size of a given range (in bytes)\n");
+	printf("\t-g\tSize of data set (in gigabytes)\n");
+	printf("\t-o\tNumber of output rows\n");
+	printf("\t-c\tOutput ranges in CSV format\n");
+}
+
+static int parse_options(int argc, char *argv[])
+{
+	const char *optstring = "t:g:i:o:b:p:ch";
+	int c, dist_val_set = 0;
+
+	while ((c = getopt(argc, argv, optstring)) != -1) {
+		switch (c) {
+		case 'h':
+			usage();
+			return 1;
+		case 'p':
+			percentage = atof(optarg);
+			break;
+		case 'b':
+			block_size = strtoul(optarg, NULL, 10);
+			break;
+		case 't':
+			if (!strncmp(optarg, "zipf", 4))
+				dist_type = TYPE_ZIPF;
+			else if (!strncmp(optarg, "pareto", 6))
+				dist_type = TYPE_PARETO;
+			else if (!strncmp(optarg, "normal", 6))
+				dist_type = TYPE_NORMAL;
+			else {
+				printf("wrong dist type: %s\n", optarg);
+				return 1;
+			}
+			break;
+		case 'g':
+			gib_size = 
strtoul(optarg, NULL, 10); + break; + case 'i': + dist_val = atof(optarg); + dist_val_set = 1; + break; + case 'o': + output_nranges = strtoul(optarg, NULL, 10); + break; + case 'c': + output_type = OUTPUT_CSV; + break; + default: + printf("bad option %c\n", c); + return 1; + } + } + + if (dist_type == TYPE_PARETO) { + if ((dist_val >= 1.00 || dist_val < 0.00)) { + printf("pareto input must be > 0.00 and < 1.00\n"); + return 1; + } + if (!dist_val_set) + dist_val = DEF_PARETO_VAL; + } else if (dist_type == TYPE_ZIPF) { + if (dist_val == 1.0) { + printf("zipf input must be different than 1.0\n"); + return 1; + } + if (!dist_val_set) + dist_val = DEF_ZIPF_VAL; + } + + return 0; +} + +struct output_sum { + double output; + unsigned int nranges; +}; + +static int node_cmp(const void *p1, const void *p2) +{ + const struct node *n1 = p1; + const struct node *n2 = p2; + + return n2->hits - n1->hits; +} + +static void output_csv(struct node *nodes, unsigned long nnodes) +{ + unsigned long i; + + printf("rank, count\n"); + for (i = 0; i < nnodes; i++) + printf("%lu, %lu\n", i, nodes[i].hits); +} + +static void output_normal(struct node *nodes, unsigned long nnodes, + unsigned long nranges) +{ + unsigned long i, j, cur_vals, interval_step, next_interval, total_vals; + unsigned long blocks = percentage * nnodes / 100; + double hit_percent_sum = 0; + unsigned long long hit_sum = 0; + double perc, perc_i; + struct output_sum *output_sums; + + interval_step = (nnodes - 1) / output_nranges + 1; + next_interval = interval_step; + output_sums = malloc(output_nranges * sizeof(struct output_sum)); + + for (i = 0; i < output_nranges; i++) { + output_sums[i].output = 0.0; + output_sums[i].nranges = 0; + } + + j = total_vals = cur_vals = 0; + + for (i = 0; i < nnodes; i++) { + struct output_sum *os = &output_sums[j]; + struct node *node = &nodes[i]; + cur_vals += node->hits; + total_vals += node->hits; + os->nranges += node->hits; + if (i == (next_interval) -1 || i == nnodes - 1) { + os->output = (double) cur_vals / (double) nranges; + os->output *= 100.0; + cur_vals = 0; + next_interval += interval_step; + j++; + } + + if (percentage) { + if (total_vals >= blocks) { + double cs = (double) i * block_size / (1024.0 * 1024.0); + char p = 'M'; + + if (cs > 1024.0) { + cs /= 1024.0; + p = 'G'; + } + if (cs > 1024.0) { + cs /= 1024.0; + p = 'T'; + } + + printf("%.2f%% of hits satisfied in %.3f%cB of cache\n", percentage, cs, p); + percentage = 0.0; + } + } + } + + perc_i = 100.0 / (double)output_nranges; + perc = 0.0; + + printf("\n Rows Hits %% Sum %% # Hits Size\n"); + printf("-----------------------------------------------------------------------\n"); + for (i = 0; i < output_nranges; i++) { + struct output_sum *os = &output_sums[i]; + double gb = (double)os->nranges * block_size / 1024.0; + char p = 'K'; + + if (gb > 1024.0) { + p = 'M'; + gb /= 1024.0; + } + if (gb > 1024.0) { + p = 'G'; + gb /= 1024.0; + } + + perc += perc_i; + hit_percent_sum += os->output; + hit_sum += os->nranges; + printf("%s %6.2f%%\t%6.2f%%\t\t%6.2f%%\t\t%8u\t%6.2f%c\n", + i ? 
"|->" : "Top", perc, os->output, hit_percent_sum, + os->nranges, gb, p); + } + + printf("-----------------------------------------------------------------------\n"); + printf("Total\t\t\t\t\t\t%8llu\n", hit_sum); + free(output_sums); +} + +int main(int argc, char *argv[]) +{ + unsigned long offset; + unsigned long long nranges; + unsigned long nnodes; + struct node *nodes; + struct zipf_state zs; + struct gauss_state gs; + int i, j; + + if (parse_options(argc, argv)) + return 1; + + if (output_type != OUTPUT_CSV) + printf("Generating %s distribution with %f input and %lu GiB size and %lu block_size.\n", + dist_types[dist_type], dist_val, gib_size, block_size); + + nranges = gib_size * 1024 * 1024 * 1024ULL; + nranges /= block_size; + + if (dist_type == TYPE_ZIPF) + zipf_init(&zs, nranges, dist_val, 1); + else if (dist_type == TYPE_PARETO) + pareto_init(&zs, nranges, dist_val, 1); + else + gauss_init(&gs, nranges, dist_val, 1); + + hash_bits = 0; + hash_size = nranges; + while ((hash_size >>= 1) != 0) + hash_bits++; + + hash_size = 1 << hash_bits; + + hash = calloc(hash_size, sizeof(struct flist_head)); + for (i = 0; i < hash_size; i++) + INIT_FLIST_HEAD(&hash[i]); + + nodes = malloc(nranges * sizeof(struct node)); + + for (i = j = 0; i < nranges; i++) { + struct node *n; + + if (dist_type == TYPE_ZIPF) + offset = zipf_next(&zs); + else if (dist_type == TYPE_PARETO) + offset = pareto_next(&zs); + else + offset = gauss_next(&gs); + + n = hash_lookup(offset); + if (n) + n->hits++; + else { + hash_insert(&nodes[j], offset); + j++; + } + } + + qsort(nodes, j, sizeof(struct node), node_cmp); + nnodes = j; + + if (output_type == OUTPUT_CSV) + output_csv(nodes, nnodes); + else + output_normal(nodes, nnodes, nranges); + + free(hash); + free(nodes); + return 0; +} diff --git a/t/ieee754.c b/t/ieee754.c new file mode 100644 index 0000000..b652639 --- /dev/null +++ b/t/ieee754.c @@ -0,0 +1,26 @@ +#include +#include "../lib/ieee754.h" + +static double values[] = { -17.23, 17.23, 123.4567, 98765.4321, + 3.14159265358979323, 0.0 }; + +int main(int argc, char *argv[]) +{ + uint64_t i; + double f, delta; + int j, differences = 0; + + j = 0; + do { + i = fio_double_to_uint64(values[j]); + f = fio_uint64_to_double(i); + delta = values[j] - f; + printf("%26.20lf -> %26.20lf, delta = %26.20lf\n", values[j], + f, delta); + if (f != values[j]) + differences++; + j++; + } while (values[j] != 0.0); + + return differences; +} diff --git a/t/io_uring.c b/t/io_uring.c new file mode 100644 index 0000000..e84a2b6 --- /dev/null +++ b/t/io_uring.c @@ -0,0 +1,627 @@ +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "../arch/arch.h" +#include "../lib/types.h" +#include "../os/linux/io_uring.h" + +#define min(a, b) ((a < b) ? 
(a) : (b)) + +struct io_sq_ring { + unsigned *head; + unsigned *tail; + unsigned *ring_mask; + unsigned *ring_entries; + unsigned *flags; + unsigned *array; +}; + +struct io_cq_ring { + unsigned *head; + unsigned *tail; + unsigned *ring_mask; + unsigned *ring_entries; + struct io_uring_cqe *cqes; +}; + +#define DEPTH 128 +#define BATCH_SUBMIT 32 +#define BATCH_COMPLETE 32 + +#define BS 4096 + +#define MAX_FDS 16 + +static unsigned sq_ring_mask, cq_ring_mask; + +struct file { + unsigned long max_blocks; + unsigned pending_ios; + int real_fd; + int fixed_fd; +}; + +struct submitter { + pthread_t thread; + int ring_fd; + struct drand48_data rand; + struct io_sq_ring sq_ring; + struct io_uring_sqe *sqes; + struct io_cq_ring cq_ring; + int inflight; + unsigned long reaps; + unsigned long done; + unsigned long calls; + volatile int finish; + + __s32 *fds; + + struct file files[MAX_FDS]; + unsigned nr_files; + unsigned cur_file; + struct iovec iovecs[]; +}; + +static struct submitter *submitter; +static volatile int finish; + +static int depth = DEPTH; +static int batch_submit = BATCH_SUBMIT; +static int batch_complete = BATCH_COMPLETE; +static int polled = 1; /* use IO polling */ +static int fixedbufs = 1; /* use fixed user buffers */ +static int register_files = 1; /* use fixed files */ +static int buffered = 0; /* use buffered IO, not O_DIRECT */ +static int sq_thread_poll = 0; /* use kernel submission/poller thread */ +static int sq_thread_cpu = -1; /* pin above thread to this CPU */ +static int do_nop = 0; /* no-op SQ ring commands */ + +static int io_uring_register_buffers(struct submitter *s) +{ + if (do_nop) + return 0; + + return syscall(__NR_io_uring_register, s->ring_fd, + IORING_REGISTER_BUFFERS, s->iovecs, depth); +} + +static int io_uring_register_files(struct submitter *s) +{ + int i; + + if (do_nop) + return 0; + + s->fds = calloc(s->nr_files, sizeof(__s32)); + for (i = 0; i < s->nr_files; i++) { + s->fds[i] = s->files[i].real_fd; + s->files[i].fixed_fd = i; + } + + return syscall(__NR_io_uring_register, s->ring_fd, + IORING_REGISTER_FILES, s->fds, s->nr_files); +} + +static int io_uring_setup(unsigned entries, struct io_uring_params *p) +{ + return syscall(__NR_io_uring_setup, entries, p); +} + +static int io_uring_enter(struct submitter *s, unsigned int to_submit, + unsigned int min_complete, unsigned int flags) +{ + return syscall(__NR_io_uring_enter, s->ring_fd, to_submit, min_complete, + flags, NULL, 0); +} + +#ifndef CONFIG_HAVE_GETTID +static int gettid(void) +{ + return syscall(__NR_gettid); +} +#endif + +static unsigned file_depth(struct submitter *s) +{ + return (depth + s->nr_files - 1) / s->nr_files; +} + +static void init_io(struct submitter *s, unsigned index) +{ + struct io_uring_sqe *sqe = &s->sqes[index]; + unsigned long offset; + struct file *f; + long r; + + if (do_nop) { + sqe->opcode = IORING_OP_NOP; + return; + } + + if (s->nr_files == 1) { + f = &s->files[0]; + } else { + f = &s->files[s->cur_file]; + if (f->pending_ios >= file_depth(s)) { + s->cur_file++; + if (s->cur_file == s->nr_files) + s->cur_file = 0; + f = &s->files[s->cur_file]; + } + } + f->pending_ios++; + + lrand48_r(&s->rand, &r); + offset = (r % (f->max_blocks - 1)) * BS; + + if (register_files) { + sqe->flags = IOSQE_FIXED_FILE; + sqe->fd = f->fixed_fd; + } else { + sqe->flags = 0; + sqe->fd = f->real_fd; + } + if (fixedbufs) { + sqe->opcode = IORING_OP_READ_FIXED; + sqe->addr = (unsigned long) s->iovecs[index].iov_base; + sqe->len = BS; + sqe->buf_index = index; + } else { + sqe->opcode = 
IORING_OP_READV; + sqe->addr = (unsigned long) &s->iovecs[index]; + sqe->len = 1; + sqe->buf_index = 0; + } + sqe->ioprio = 0; + sqe->off = offset; + sqe->user_data = (unsigned long) f; +} + +static int prep_more_ios(struct submitter *s, int max_ios) +{ + struct io_sq_ring *ring = &s->sq_ring; + unsigned index, tail, next_tail, prepped = 0; + + next_tail = tail = *ring->tail; + do { + next_tail++; + read_barrier(); + if (next_tail == *ring->head) + break; + + index = tail & sq_ring_mask; + init_io(s, index); + ring->array[index] = index; + prepped++; + tail = next_tail; + } while (prepped < max_ios); + + if (*ring->tail != tail) { + *ring->tail = tail; + write_barrier(); + } + return prepped; +} + +static int get_file_size(struct file *f) +{ + struct stat st; + + if (fstat(f->real_fd, &st) < 0) + return -1; + if (S_ISBLK(st.st_mode)) { + unsigned long long bytes; + + if (ioctl(f->real_fd, BLKGETSIZE64, &bytes) != 0) + return -1; + + f->max_blocks = bytes / BS; + return 0; + } else if (S_ISREG(st.st_mode)) { + f->max_blocks = st.st_size / BS; + return 0; + } + + return -1; +} + +static int reap_events(struct submitter *s) +{ + struct io_cq_ring *ring = &s->cq_ring; + struct io_uring_cqe *cqe; + unsigned head, reaped = 0; + + head = *ring->head; + do { + struct file *f; + + read_barrier(); + if (head == *ring->tail) + break; + cqe = &ring->cqes[head & cq_ring_mask]; + if (!do_nop) { + f = (struct file *) (uintptr_t) cqe->user_data; + f->pending_ios--; + if (cqe->res != BS) { + printf("io: unexpected ret=%d\n", cqe->res); + if (polled && cqe->res == -EOPNOTSUPP) + printf("Your filesystem/driver/kernel doesn't support polled IO\n"); + return -1; + } + } + reaped++; + head++; + } while (1); + + s->inflight -= reaped; + *ring->head = head; + write_barrier(); + return reaped; +} + +static void *submitter_fn(void *data) +{ + struct submitter *s = data; + struct io_sq_ring *ring = &s->sq_ring; + int ret, prepped; + + printf("submitter=%d\n", gettid()); + + srand48_r(pthread_self(), &s->rand); + + prepped = 0; + do { + int to_wait, to_submit, this_reap, to_prep; + + if (!prepped && s->inflight < depth) { + to_prep = min(depth - s->inflight, batch_submit); + prepped = prep_more_ios(s, to_prep); + } + s->inflight += prepped; +submit_more: + to_submit = prepped; +submit: + if (to_submit && (s->inflight + to_submit <= depth)) + to_wait = 0; + else + to_wait = min(s->inflight + to_submit, batch_complete); + + /* + * Only need to call io_uring_enter if we're not using SQ thread + * poll, or if IORING_SQ_NEED_WAKEUP is set. + */ + if (!sq_thread_poll || (*ring->flags & IORING_SQ_NEED_WAKEUP)) { + unsigned flags = 0; + + if (to_wait) + flags = IORING_ENTER_GETEVENTS; + if ((*ring->flags & IORING_SQ_NEED_WAKEUP)) + flags |= IORING_ENTER_SQ_WAKEUP; + ret = io_uring_enter(s, to_submit, to_wait, flags); + s->calls++; + } + + /* + * For non SQ thread poll, we already got the events we needed + * through the io_uring_enter() above. For SQ thread poll, we + * need to loop here until we find enough events. 
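+		 * (Added note: with SQPOLL the kernel thread consumes SQEs on
+		 * its own, so the io_uring_enter() call above may be skipped
+		 * entirely; completions are then only discovered by re-reading
+		 * the CQ ring in this loop.)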
+ */ + this_reap = 0; + do { + int r; + r = reap_events(s); + if (r == -1) { + s->finish = 1; + break; + } else if (r > 0) + this_reap += r; + } while (sq_thread_poll && this_reap < to_wait); + s->reaps += this_reap; + + if (ret >= 0) { + if (!ret) { + to_submit = 0; + if (s->inflight) + goto submit; + continue; + } else if (ret < to_submit) { + int diff = to_submit - ret; + + s->done += ret; + prepped -= diff; + goto submit_more; + } + s->done += ret; + prepped = 0; + continue; + } else if (ret < 0) { + if (errno == EAGAIN) { + if (s->finish) + break; + if (this_reap) + goto submit; + to_submit = 0; + goto submit; + } + printf("io_submit: %s\n", strerror(errno)); + break; + } + } while (!s->finish); + + finish = 1; + return NULL; +} + +static void sig_int(int sig) +{ + printf("Exiting on signal %d\n", sig); + submitter->finish = 1; + finish = 1; +} + +static void arm_sig_int(void) +{ + struct sigaction act; + + memset(&act, 0, sizeof(act)); + act.sa_handler = sig_int; + act.sa_flags = SA_RESTART; + sigaction(SIGINT, &act, NULL); +} + +static int setup_ring(struct submitter *s) +{ + struct io_sq_ring *sring = &s->sq_ring; + struct io_cq_ring *cring = &s->cq_ring; + struct io_uring_params p; + int ret, fd; + void *ptr; + + memset(&p, 0, sizeof(p)); + + if (polled && !do_nop) + p.flags |= IORING_SETUP_IOPOLL; + if (sq_thread_poll) { + p.flags |= IORING_SETUP_SQPOLL; + if (sq_thread_cpu != -1) { + p.flags |= IORING_SETUP_SQ_AFF; + p.sq_thread_cpu = sq_thread_cpu; + } + } + + fd = io_uring_setup(depth, &p); + if (fd < 0) { + perror("io_uring_setup"); + return 1; + } + s->ring_fd = fd; + + if (fixedbufs) { + ret = io_uring_register_buffers(s); + if (ret < 0) { + perror("io_uring_register_buffers"); + return 1; + } + } + + if (register_files) { + ret = io_uring_register_files(s); + if (ret < 0) { + perror("io_uring_register_files"); + return 1; + } + } + + ptr = mmap(0, p.sq_off.array + p.sq_entries * sizeof(__u32), + PROT_READ | PROT_WRITE, MAP_SHARED | MAP_POPULATE, fd, + IORING_OFF_SQ_RING); + printf("sq_ring ptr = 0x%p\n", ptr); + sring->head = ptr + p.sq_off.head; + sring->tail = ptr + p.sq_off.tail; + sring->ring_mask = ptr + p.sq_off.ring_mask; + sring->ring_entries = ptr + p.sq_off.ring_entries; + sring->flags = ptr + p.sq_off.flags; + sring->array = ptr + p.sq_off.array; + sq_ring_mask = *sring->ring_mask; + + s->sqes = mmap(0, p.sq_entries * sizeof(struct io_uring_sqe), + PROT_READ | PROT_WRITE, MAP_SHARED | MAP_POPULATE, fd, + IORING_OFF_SQES); + printf("sqes ptr = 0x%p\n", s->sqes); + + ptr = mmap(0, p.cq_off.cqes + p.cq_entries * sizeof(struct io_uring_cqe), + PROT_READ | PROT_WRITE, MAP_SHARED | MAP_POPULATE, fd, + IORING_OFF_CQ_RING); + printf("cq_ring ptr = 0x%p\n", ptr); + cring->head = ptr + p.cq_off.head; + cring->tail = ptr + p.cq_off.tail; + cring->ring_mask = ptr + p.cq_off.ring_mask; + cring->ring_entries = ptr + p.cq_off.ring_entries; + cring->cqes = ptr + p.cq_off.cqes; + cq_ring_mask = *cring->ring_mask; + return 0; +} + +static void file_depths(char *buf) +{ + struct submitter *s = submitter; + char *p; + int i; + + buf[0] = '\0'; + p = buf; + for (i = 0; i < s->nr_files; i++) { + struct file *f = &s->files[i]; + + if (i + 1 == s->nr_files) + p += sprintf(p, "%d", f->pending_ios); + else + p += sprintf(p, "%d, ", f->pending_ios); + } +} + +static void usage(char *argv) +{ + printf("%s [options] -- [filenames]\n" + " -d : IO Depth, default %d\n" + " -s : Batch submit, default %d\n" + " -c : Batch complete, default %d\n", + argv, DEPTH, BATCH_SUBMIT, BATCH_COMPLETE); + 
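+	/*
+	 * Only -d, -s and -c (plus -h/-?) are runtime options in this tool;
+	 * polled I/O, fixed buffers, registered files and the SQ-poll settings
+	 * are compile-time defaults held in the static variables near the top
+	 * of this file.
+	 */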
exit(0); +} + +int main(int argc, char *argv[]) +{ + struct submitter *s; + unsigned long done, calls, reap; + int err, i, flags, fd, opt; + char *fdepths; + void *ret; + + if (!do_nop && argc < 2) { + printf("%s: filename [options]\n", argv[0]); + return 1; + } + + while ((opt = getopt(argc, argv, "d:s:c:h?")) != -1) { + switch (opt) { + case 'd': + depth = atoi(optarg); + break; + case 's': + batch_submit = atoi(optarg); + break; + case 'c': + batch_complete = atoi(optarg); + break; + case 'h': + case '?': + default: + usage(argv[0]); + break; + } + } + + submitter = malloc(sizeof(*submitter) + depth * sizeof(struct iovec)); + memset(submitter, 0, sizeof(*submitter) + depth * sizeof(struct iovec)); + s = submitter; + + flags = O_RDONLY | O_NOATIME; + if (!buffered) + flags |= O_DIRECT; + + i = optind; + while (!do_nop && i < argc) { + struct file *f; + + if (s->nr_files == MAX_FDS) { + printf("Max number of files (%d) reached\n", MAX_FDS); + break; + } + fd = open(argv[i], flags); + if (fd < 0) { + perror("open"); + return 1; + } + + f = &s->files[s->nr_files]; + f->real_fd = fd; + if (get_file_size(f)) { + printf("failed getting size of device/file\n"); + return 1; + } + if (f->max_blocks <= 1) { + printf("Zero file/device size?\n"); + return 1; + } + f->max_blocks--; + + printf("Added file %s\n", argv[i]); + s->nr_files++; + i++; + } + + if (fixedbufs) { + struct rlimit rlim; + + rlim.rlim_cur = RLIM_INFINITY; + rlim.rlim_max = RLIM_INFINITY; + if (setrlimit(RLIMIT_MEMLOCK, &rlim) < 0) { + perror("setrlimit"); + return 1; + } + } + + arm_sig_int(); + + for (i = 0; i < depth; i++) { + void *buf; + + if (posix_memalign(&buf, BS, BS)) { + printf("failed alloc\n"); + return 1; + } + s->iovecs[i].iov_base = buf; + s->iovecs[i].iov_len = BS; + } + + err = setup_ring(s); + if (err) { + printf("ring setup failed: %s, %d\n", strerror(errno), err); + return 1; + } + printf("polled=%d, fixedbufs=%d, buffered=%d", polled, fixedbufs, buffered); + printf(" QD=%d, sq_ring=%d, cq_ring=%d\n", depth, *s->sq_ring.ring_entries, *s->cq_ring.ring_entries); + + pthread_create(&s->thread, NULL, submitter_fn, s); + + fdepths = malloc(8 * s->nr_files); + reap = calls = done = 0; + do { + unsigned long this_done = 0; + unsigned long this_reap = 0; + unsigned long this_call = 0; + unsigned long rpc = 0, ipc = 0; + + sleep(1); + this_done += s->done; + this_call += s->calls; + this_reap += s->reaps; + if (this_call - calls) { + rpc = (this_done - done) / (this_call - calls); + ipc = (this_reap - reap) / (this_call - calls); + } else + rpc = ipc = -1; + file_depths(fdepths); + printf("IOPS=%lu, IOS/call=%ld/%ld, inflight=%u (%s)\n", + this_done - done, rpc, ipc, s->inflight, + fdepths); + done = this_done; + calls = this_call; + reap = this_reap; + } while (!finish); + + pthread_join(s->thread, &ret); + close(s->ring_fd); + free(fdepths); + return 0; +} diff --git a/t/jobs/t0001-52c58027.fio b/t/jobs/t0001-52c58027.fio new file mode 100644 index 0000000..49b7d11 --- /dev/null +++ b/t/jobs/t0001-52c58027.fio @@ -0,0 +1,6 @@ +#Commit 52c580272d87d2b9b8a65d317bf7c2d432a30fec +[foo] +size=20000 +bsrange=1k-4k +rw=randread +ioengine=null diff --git a/t/jobs/t0002-13af05ae-post.fio b/t/jobs/t0002-13af05ae-post.fio new file mode 100644 index 0000000..d141d40 --- /dev/null +++ b/t/jobs/t0002-13af05ae-post.fio @@ -0,0 +1,24 @@ +[global] +ioengine=libaio +direct=1 +filename=t0002file +iodepth=128 +size=1G +loops=1 +group_reporting=1 +readwrite=read +do_verify=1 +verify=md5 +verify_fatal=1 +numjobs=1 +thread 
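+# The bssplit line below mixes block sizes by weight (blocksize/percentage
+# pairs separated by ':'): 50% of blocks at 512 bytes, 50% at 1M.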
+bssplit=512/50:1M/50 + +[thread0] +offset=0G + +[thread-mix0] +offset=4G +size=1G +readwrite=rw +bsrange=512:1M diff --git a/t/jobs/t0002-13af05ae-pre.fio b/t/jobs/t0002-13af05ae-pre.fio new file mode 100644 index 0000000..0e044d4 --- /dev/null +++ b/t/jobs/t0002-13af05ae-pre.fio @@ -0,0 +1,23 @@ +[global] +ioengine=libaio +direct=1 +filename=t0002file +iodepth=128 +size=1G +loops=1 +group_reporting=1 +readwrite=write +do_verify=0 +verify=md5 +numjobs=1 +thread +bssplit=512/50:1M/50 + +[thread0] +offset=0G + +[thread-mix0] +offset=4G +readwrite=rw +size=1G +bsrange=512:1M diff --git a/t/jobs/t0003-0ae2c6e1-post.fio b/t/jobs/t0003-0ae2c6e1-post.fio new file mode 100644 index 0000000..4e7887a --- /dev/null +++ b/t/jobs/t0003-0ae2c6e1-post.fio @@ -0,0 +1,26 @@ +# Expected result: verify fails. +# Buggy result: fio segfaults +[global] +ioengine=libaio +direct=1 +filename=foo +iodepth=128 +size=1M +loops=1 +group_reporting=1 +readwrite=read +do_verify=1 +verify=md5 +verify_fatal=1 +numjobs=1 +thread +verify_dump=1 +bs=4k + +[large_reads] +offset=0G +blocksize=1M + +[small_reads] +offset=1G +blocksize=512 diff --git a/t/jobs/t0003-0ae2c6e1-pre.fio b/t/jobs/t0003-0ae2c6e1-pre.fio new file mode 100644 index 0000000..a9a9f31 --- /dev/null +++ b/t/jobs/t0003-0ae2c6e1-pre.fio @@ -0,0 +1,22 @@ +[global] +ioengine=libaio +direct=1 +filename=foo +iodepth=128 +size=10M +loops=1 +group_reporting=1 +readwrite=write +do_verify=0 +verify=md5 +numjobs=1 +thread +verify_dump=1 + +[small_writes] +offset=0G +blocksize=512 + +[large_writes] +offset=1G +blocksize=1M diff --git a/t/jobs/t0004-8a99fdf6.fio b/t/jobs/t0004-8a99fdf6.fio new file mode 100644 index 0000000..0fc3e0d --- /dev/null +++ b/t/jobs/t0004-8a99fdf6.fio @@ -0,0 +1,27 @@ +# Expected result: fio runs to completion +# Buggy result: fio segfaults +[global] +ioengine=libaio +direct=1 +filename=foo +iodepth=128 +size=10M +loops=1 +group_reporting=1 +readwrite=write +do_verify=0 +verify=md5 +numjobs=1 +thread +verify_dump=1 + +[small_writes] +offset=0G +blocksize=512 +verify_interval=1M + +[large_writes] +stonewall +offset=1G +blocksize=1M +verify_interval=512 diff --git a/t/jobs/t0005-f7078f7b.fio b/t/jobs/t0005-f7078f7b.fio new file mode 100644 index 0000000..3481d63 --- /dev/null +++ b/t/jobs/t0005-f7078f7b.fio @@ -0,0 +1,13 @@ +# Expected result: fio reads and writes 100m +# Buggy result: fio reads and writes ~100m/2 +[global] +bs=4k +ioengine=sync +size=100m +direct=1 +filename=xxx + +[write] +verify=md5 +verify_backlog=32 +rw=write diff --git a/t/jobs/t0006-82af2a7c.fio b/t/jobs/t0006-82af2a7c.fio new file mode 100644 index 0000000..7117067 --- /dev/null +++ b/t/jobs/t0006-82af2a7c.fio @@ -0,0 +1,16 @@ +# Expected results: workload runs and switches between 'm' and 'V' state +# Buggy result: workload stays in 'm' mode, never doing actual verifies +[global] +rw=randrw +bs=4k +direct=1 +ioengine=libaio +iodepth=32 +verify=meta +verify_backlog=1024 +verify_fatal=1 + +[ver-test] +filename=foo +size=4g +verify_pattern=0xaaa diff --git a/t/jobs/t0007-37cf9e3c.fio b/t/jobs/t0007-37cf9e3c.fio new file mode 100644 index 0000000..d3c9875 --- /dev/null +++ b/t/jobs/t0007-37cf9e3c.fio @@ -0,0 +1,9 @@ +# Expected result: fio reads 87040KB of data +# Buggy result: fio reads the full 128MB of data +[foo] +size=128mb +rw=read:512k +bs=1m +norandommap +write_iolog=log +direct=1 diff --git a/t/jobs/t0008-ae2fafc8.fio b/t/jobs/t0008-ae2fafc8.fio new file mode 100644 index 0000000..4b36485 --- /dev/null +++ b/t/jobs/t0008-ae2fafc8.fio @@ -0,0 +1,12 @@ +# Expected 
result: fio writes 16MB, reads 16+16MB +# Buggy result: fio writes 16MB, reads ~21MB +[global] +bs=4k +verify=crc32c +rw=readwrite +direct=1 + +[foo] +size=32m +do_verify=1 +verify_backlog=1 diff --git a/t/jobs/t0009-f8b0bd10.fio b/t/jobs/t0009-f8b0bd10.fio new file mode 100644 index 0000000..20f376e --- /dev/null +++ b/t/jobs/t0009-f8b0bd10.fio @@ -0,0 +1,40 @@ +# Expected result: fio verifies and runs for 1m +# Buggy result: fio crashes with: +# __get_io_u: Assertion `io_u->flags & IO_U_F_FREE' failed + +[global] +direct=1 +ioengine=null +size=20g +norandommap +randrepeat=0 +bs=4096 +iodepth=170 +#iodepth=96 +#numjobs=1 +numjobs=1 +#numjobs=24 +# number_ios=1 +# runtime=216000 +#runtime=3600 +time_based=1 +group_reporting=1 +thread +gtod_reduce=1 +iodepth_batch=4 +iodepth_batch_complete=4 +cpus_allowed=0-3 +cpus_allowed_policy=split +rw=randwrite +verify=crc32c-intel +verify_backlog=1m +do_verify=1 +verify_async=6 +verify_async_cpus=0-3 +runtime=1m + +[4_KiB_RR_drive_r] + +[4_KiB_RR_drive_s] + + diff --git a/t/jobs/t0010-b7aae4ba.fio b/t/jobs/t0010-b7aae4ba.fio new file mode 100644 index 0000000..0223770 --- /dev/null +++ b/t/jobs/t0010-b7aae4ba.fio @@ -0,0 +1,8 @@ +# Expected result: fio runs and completes the job +# Buggy result: fio segfaults +# +[test] +ioengine=null +size=10g +io_submit_mode=offload +iodepth=16 diff --git a/t/jobs/t0011-5d2788d5.fio b/t/jobs/t0011-5d2788d5.fio new file mode 100644 index 0000000..50daf61 --- /dev/null +++ b/t/jobs/t0011-5d2788d5.fio @@ -0,0 +1,18 @@ +# Expected results: no parse warnings, runs and with roughly 1/8 iops between +# the two jobs. +# Buggy result: parse warning on flow value overflow, no 1/8 division between +# jobs. +# +[global] +bs=4k +ioengine=null +size=100g +runtime=3 +flow_id=1 + +[flow1] +flow=-8 +rate_iops=1000 + +[flow2] +flow=1 diff --git a/t/latency_percentiles.py b/t/latency_percentiles.py new file mode 100755 index 0000000..5cdd49c --- /dev/null +++ b/t/latency_percentiles.py @@ -0,0 +1,1329 @@ +#!/usr/bin/env python3 +# SPDX-License-Identifier: GPL-2.0-only +# +# Copyright (c) 2020 Western Digital Corporation or its affiliates. 
+# +""" +# latency_percentiles.py +# +# Test the code that produces latency percentiles +# This is mostly to test the code changes to allow reporting +# of slat, clat, and lat percentiles +# +# USAGE +# python3 latency-tests.py [-f fio-path] [-a artifact-root] [--debug] +# +# +# Test scenarios: +# +# - DONE json +# unified rw reporting +# compare with latency log +# try various combinations of the ?lat_percentile options +# null, aio +# r, w, t +# - DONE json+ +# check presence of latency bins +# if the json percentiles match those from the raw data +# then the latency bin values and counts are probably ok +# - DONE terse +# produce both terse, JSON output and confirm that they match +# lat only; both lat and clat +# - DONE sync_lat +# confirm that sync_lat data appears +# - MANUAL TESTING normal output: +# null ioengine +# enable all, but only clat and lat appear +# enable subset of latency types +# read, write, trim, unified +# libaio ioengine +# enable all latency types +# enable subset of latency types +# read, write, trim, unified +# ./fio/fio --name=test --randrepeat=0 --norandommap --time_based --runtime=2s --size=512M \ +# --ioengine=null --slat_percentiles=1 --clat_percentiles=1 --lat_percentiles=1 +# echo confirm that clat and lat percentiles appear +# ./fio/fio --name=test --randrepeat=0 --norandommap --time_based --runtime=2s --size=512M \ +# --ioengine=null --slat_percentiles=0 --clat_percentiles=0 --lat_percentiles=1 +# echo confirm that only lat percentiles appear +# ./fio/fio --name=test --randrepeat=0 --norandommap --time_based --runtime=2s --size=512M \ +# --ioengine=null --slat_percentiles=0 --clat_percentiles=1 --lat_percentiles=0 +# echo confirm that only clat percentiles appear +# ./fio/fio --name=test --randrepeat=0 --norandommap --time_based --runtime=2s --size=512M \ +# --ioengine=libaio --slat_percentiles=1 --clat_percentiles=1 --lat_percentiles=1 +# echo confirm that slat, clat, lat percentiles appear +# ./fio/fio --name=test --randrepeat=0 --norandommap --time_based --runtime=2s --size=512M \ +# --ioengine=libaio --slat_percentiles=0 --clat_percentiles=1 --lat_percentiles=1 +# echo confirm that clat and lat percentiles appear +# ./fio/fio --name=test --randrepeat=0 --norandommap --time_based --runtime=2s --size=512M \ +# --ioengine=libaio -rw=randrw +# echo confirm that clat percentiles appear for reads and writes +# ./fio/fio --name=test --randrepeat=0 --norandommap --time_based --runtime=2s --size=512M \ +# --ioengine=libaio --slat_percentiles=1 --clat_percentiles=0 --lat_percentiles=0 --rw=randrw +# echo confirm that slat percentiles appear for both reads and writes +# ./fio/fio --name=test --randrepeat=0 --norandommap --time_based --runtime=2s --size=512M \ +# --ioengine=libaio --slat_percentiles=1 --clat_percentiles=1 --lat_percentiles=1 \ +# --rw=randrw --unified_rw_reporting=1 +# echo confirm that slat, clat, and lat percentiles appear for 'mixed' IOs +#./fio/fio --name=test --randrepeat=0 --norandommap --time_based --runtime=2s --size=512M \ +# --ioengine=null --slat_percentiles=1 --clat_percentiles=1 --lat_percentiles=1 \ +# --rw=randrw --fsync=32 +# echo confirm that fsync latencies appear +""" + +import os +import csv +import sys +import json +import math +import time +import argparse +import platform +import subprocess +from pathlib import Path + + +class FioLatTest(): + """fio latency percentile test.""" + + def __init__(self, artifact_root, test_options, debug): + """ + artifact_root root directory for artifacts (subdirectory will be created under 
here) + test test specification + """ + self.artifact_root = artifact_root + self.test_options = test_options + self.debug = debug + self.filename = None + self.json_data = None + self.terse_data = None + + self.test_dir = os.path.join(self.artifact_root, + "{:03d}".format(self.test_options['test_id'])) + if not os.path.exists(self.test_dir): + os.mkdir(self.test_dir) + + self.filename = "latency{:03d}".format(self.test_options['test_id']) + + def run_fio(self, fio_path): + """Run a test.""" + + fio_args = [ + "--name=latency", + "--randrepeat=0", + "--norandommap", + "--time_based", + "--size=16M", + "--rwmixread=50", + "--group_reporting=1", + "--write_lat_log={0}".format(self.filename), + "--output={0}.out".format(self.filename), + "--ioengine={ioengine}".format(**self.test_options), + "--rw={rw}".format(**self.test_options), + "--runtime={runtime}".format(**self.test_options), + "--output-format={output-format}".format(**self.test_options), + ] + for opt in ['slat_percentiles', 'clat_percentiles', 'lat_percentiles', + 'unified_rw_reporting', 'fsync', 'fdatasync', 'numjobs', 'cmdprio_percentage']: + if opt in self.test_options: + option = '--{0}={{{0}}}'.format(opt) + fio_args.append(option.format(**self.test_options)) + + command = [fio_path] + fio_args + with open(os.path.join(self.test_dir, "{0}.command".format(self.filename)), "w+") as \ + command_file: + command_file.write("%s\n" % command) + + passed = True + stdout_file = open(os.path.join(self.test_dir, "{0}.stdout".format(self.filename)), "w+") + stderr_file = open(os.path.join(self.test_dir, "{0}.stderr".format(self.filename)), "w+") + exitcode_file = open(os.path.join(self.test_dir, + "{0}.exitcode".format(self.filename)), "w+") + try: + proc = None + # Avoid using subprocess.run() here because when a timeout occurs, + # fio will be stopped with SIGKILL. This does not give fio a + # chance to clean up and means that child processes may continue + # running and submitting IO. + proc = subprocess.Popen(command, + stdout=stdout_file, + stderr=stderr_file, + cwd=self.test_dir, + universal_newlines=True) + proc.communicate(timeout=300) + exitcode_file.write('{0}\n'.format(proc.returncode)) + passed &= (proc.returncode == 0) + except subprocess.TimeoutExpired: + proc.terminate() + proc.communicate() + assert proc.poll() + print("Timeout expired") + passed = False + except Exception: + if proc: + if not proc.poll(): + proc.terminate() + proc.communicate() + print("Exception: %s" % sys.exc_info()) + passed = False + finally: + stdout_file.close() + stderr_file.close() + exitcode_file.close() + + if passed: + if 'json' in self.test_options['output-format']: + if not self.get_json(): + print('Unable to decode JSON data') + passed = False + if 'terse' in self.test_options['output-format']: + if not self.get_terse(): + print('Unable to decode terse data') + passed = False + + return passed + + def get_json(self): + """Convert fio JSON output into a python JSON object""" + + filename = os.path.join(self.test_dir, "{0}.out".format(self.filename)) + with open(filename, 'r') as file: + file_data = file.read() + + # + # Sometimes fio informational messages are included at the top of the + # JSON output, especially under Windows. 
Try to decode output as JSON
+        # data, lopping off up to the first four lines
+        #
+        lines = file_data.splitlines()
+        for i in range(5):
+            file_data = '\n'.join(lines[i:])
+            try:
+                self.json_data = json.loads(file_data)
+            except json.JSONDecodeError:
+                continue
+            else:
+                return True
+
+        return False
+
+    def get_terse(self):
+        """Read fio output and return terse format data."""
+
+        filename = os.path.join(self.test_dir, "{0}.out".format(self.filename))
+        with open(filename, 'r') as file:
+            file_data = file.read()
+
+        #
+        # Read the first few lines and see if any of them begin with '3;fio-'
+        # If so, the line is probably terse output. Obviously, this only
+        # works for fio terse version 3 and it does not work for
+        # multi-line terse output
+        #
+        lines = file_data.splitlines()
+        for i in range(8):
+            file_data = lines[i]
+            if file_data.startswith('3;fio-'):
+                self.terse_data = file_data.split(';')
+                return True
+
+        return False
+
+    def check_latencies(self, jsondata, ddir, slat=True, clat=True, tlat=True, plus=False,
+                        unified=False):
+        """Check fio latency data.
+
+        ddir        data direction to check (0=read, 1=write, 2=trim)
+        slat        True if submission latency data available to check
+        clat        True if completion latency data available to check
+        tlat        True if total latency data available to check
+        plus        True if we actually have json+ format data where additional checks can
+                    be carried out
+        unified     True if fio is reporting unified r/w data
+        """
+
+        types = {
+            'slat': slat,
+            'clat': clat,
+            'lat': tlat
+        }
+
+        retval = True
+
+        for lat in ['slat', 'clat', 'lat']:
+            this_iter = True
+            if not types[lat]:
+                if 'percentile' in jsondata[lat+'_ns']:
+                    this_iter = False
+                    print('unexpected %s percentiles found' % lat)
+                else:
+                    print("%s percentiles skipped" % lat)
+                continue
+            else:
+                if 'percentile' not in jsondata[lat+'_ns']:
+                    this_iter = False
+                    print('%s percentiles not found in fio output' % lat)
+
+            #
+            # Check only for the presence/absence of json+
+            # latency bins. Future work can check the
+            # accuracy of the bin values and counts.
+            #
+            # Because the latency percentiles are based on
+            # the bins, we can be confident that the bin
+            # values and counts are correct if fio's
+            # latency percentiles match what we compute
+            # from the raw data.
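+            #
+            # (Worked example of the rank computation reimplemented further
+            # below: for the 99th percentile of 1000 logged latencies,
+            # rank = ceil(0.99 * 1000) = 990, so fio's reported value should
+            # equal latencies[989] after sorting.)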
+ # + if plus: + if 'bins' not in jsondata[lat+'_ns']: + print('bins not found with json+ output format') + this_iter = False + else: + if not self.check_jsonplus(jsondata[lat+'_ns']): + this_iter = False + else: + if 'bins' in jsondata[lat+'_ns']: + print('json+ bins found with json output format') + this_iter = False + + latencies = [] + for i in range(10): + lat_file = os.path.join(self.test_dir, "%s_%s.%s.log" % (self.filename, lat, i+1)) + if not os.path.exists(lat_file): + break + with open(lat_file, 'r', newline='') as file: + reader = csv.reader(file) + for line in reader: + if unified or int(line[2]) == ddir: + latencies.append(int(line[1])) + + if int(jsondata['total_ios']) != len(latencies): + this_iter = False + print('%s: total_ios = %s, latencies logged = %d' % \ + (lat, jsondata['total_ios'], len(latencies))) + elif self.debug: + print("total_ios %s match latencies logged" % jsondata['total_ios']) + + latencies.sort() + ptiles = jsondata[lat+'_ns']['percentile'] + + for percentile in ptiles.keys(): + # + # numpy.percentile(latencies, float(percentile), + # interpolation='higher') + # produces values that mostly match what fio reports + # however, in the tails of the distribution, the values produced + # by fio's and numpy.percentile's algorithms are occasionally off + # by one latency measurement. So instead of relying on the canned + # numpy.percentile routine, implement here fio's algorithm + # + rank = math.ceil(float(percentile)/100 * len(latencies)) + if rank > 0: + index = rank - 1 + else: + index = 0 + value = latencies[int(index)] + fio_val = int(ptiles[percentile]) + # The theory in stat.h says that the proportional error will be + # less than 1/128 + if not self.similar(fio_val, value): + delta = abs(fio_val - value) / value + print("Error with %s %sth percentile: " + "fio: %d, expected: %d, proportional delta: %f" % + (lat, percentile, fio_val, value, delta)) + print("Rank: %d, index: %d" % (rank, index)) + this_iter = False + elif self.debug: + print('%s %sth percentile values match: %d, %d' % + (lat, percentile, fio_val, value)) + + if this_iter: + print("%s percentiles match" % lat) + else: + retval = False + + return retval + + @staticmethod + def check_empty(job): + """ + Make sure JSON data is empty. + + Some data structures should be empty. This function makes sure that they are. + + job JSON object that we need to check for emptiness + """ + + return job['total_ios'] == 0 and \ + job['slat_ns']['N'] == 0 and \ + job['clat_ns']['N'] == 0 and \ + job['lat_ns']['N'] == 0 + + def check_nocmdprio_lat(self, job): + """ + Make sure no high/low priority latencies appear. + + job JSON object to check + """ + + for ddir in ['read', 'write', 'trim']: + if ddir in job: + if 'lat_high_prio' in job[ddir] or 'lat_low_prio' in job[ddir] or \ + 'clat_high_prio' in job[ddir] or 'clat_low_prio' in job[ddir]: + print("Unexpected high/low priority latencies found in %s output" % ddir) + return False + + if self.debug: + print("No high/low priority latencies found") + + return True + + @staticmethod + def similar(approximation, actual): + """ + Check whether the approximate values recorded by fio are within the theoretical bound. + + Since it is impractical to store exact latency measurements for each and every IO, fio + groups similar latency measurements into variable-sized bins. The theory in stat.h says + that the proportional error will be less than 1/128. This function checks whether this + is true. 
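+        For example, a measured latency of 12800 ns passes this check only if
+        fio's binned value lies within 100 ns of it (12800 / 128 = 100).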
+ + TODO This test will fail when comparing a value from the largest latency bin against its + actual measurement. Find some way to detect this and avoid failing. + + approximation value of the bin used by fio to store a given latency + actual actual latency value + """ + + # Avoid a division by zero. The smallest latency values have no error. + if actual == 0: + return approximation == 0 + + delta = abs(approximation - actual) / actual + return delta <= 1/128 + + def check_jsonplus(self, jsondata): + """Check consistency of json+ data + + When we have json+ data we can check the min value, max value, and + sample size reported by fio + + jsondata json+ data that we need to check + """ + + retval = True + + keys = [int(k) for k in jsondata['bins'].keys()] + values = [int(jsondata['bins'][k]) for k in jsondata['bins'].keys()] + smallest = min(keys) + biggest = max(keys) + sampsize = sum(values) + + if not self.similar(jsondata['min'], smallest): + retval = False + print('reported min %d does not match json+ min %d' % (jsondata['min'], smallest)) + elif self.debug: + print('json+ min values match: %d' % jsondata['min']) + + if not self.similar(jsondata['max'], biggest): + retval = False + print('reported max %d does not match json+ max %d' % (jsondata['max'], biggest)) + elif self.debug: + print('json+ max values match: %d' % jsondata['max']) + + if sampsize != jsondata['N']: + retval = False + print('reported sample size %d does not match json+ total count %d' % \ + (jsondata['N'], sampsize)) + elif self.debug: + print('json+ sample sizes match: %d' % sampsize) + + return retval + + def check_sync_lat(self, jsondata, plus=False): + """Check fsync latency percentile data. + + All we can check is that some percentiles are reported, unless we have json+ data. + If we actually have json+ data then we can do more checking. + + jsondata JSON data for fsync operations + plus True if we actually have json+ data + """ + retval = True + + if 'percentile' not in jsondata['lat_ns']: + print("Sync percentile data not found") + return False + + if int(jsondata['total_ios']) != int(jsondata['lat_ns']['N']): + retval = False + print('Mismatch between total_ios and lat_ns sample size') + elif self.debug: + print('sync sample sizes match: %d' % jsondata['total_ios']) + + if not plus: + if 'bins' in jsondata['lat_ns']: + print('Unexpected json+ bin data found') + return False + + if not self.check_jsonplus(jsondata['lat_ns']): + retval = False + + return retval + + def check_terse(self, terse, jsondata): + """Compare terse latencies with JSON latencies. + + terse terse format data for checking + jsondata JSON format data for checking + """ + + retval = True + + for lat in terse: + split = lat.split('%') + pct = split[0] + terse_val = int(split[1][1:]) + json_val = math.floor(jsondata[pct]/1000) + if terse_val != json_val: + retval = False + print('Mismatch with %sth percentile: json value=%d,%d terse value=%d' % \ + (pct, jsondata[pct], json_val, terse_val)) + elif self.debug: + print('Terse %sth percentile matches JSON value: %d' % (pct, terse_val)) + + return retval + + def check_prio_latencies(self, jsondata, clat=True, plus=False): + """Check consistency of high/low priority latencies. 
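+        (Background: when cmdprio_percentage sends a fraction of I/Os at a
+        different priority, fio keeps separate high/low-priority latency
+        stats; the checks below verify that merging the two reproduces the
+        combined stats.)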
+ + clat True if we should check clat data; other check lat data + plus True if we have json+ format data where additional checks can + be carried out + unified True if fio is reporting unified r/w data + """ + + if clat: + high = 'clat_high_prio' + low = 'clat_low_prio' + combined = 'clat_ns' + else: + high = 'lat_high_prio' + low = 'lat_low_prio' + combined = 'lat_ns' + + if not high in jsondata or not low in jsondata or not combined in jsondata: + print("Error identifying high/low priority latencies") + return False + + if jsondata[high]['N'] + jsondata[low]['N'] != jsondata[combined]['N']: + print("High %d + low %d != combined sample size %d" % \ + (jsondata[high]['N'], jsondata[low]['N'], jsondata[combined]['N'])) + return False + elif self.debug: + print("High %d + low %d == combined sample size %d" % \ + (jsondata[high]['N'], jsondata[low]['N'], jsondata[combined]['N'])) + + if min(jsondata[high]['min'], jsondata[low]['min']) != jsondata[combined]['min']: + print("Min of high %d, low %d min latencies does not match min %d from combined data" % \ + (jsondata[high]['min'], jsondata[low]['min'], jsondata[combined]['min'])) + return False + elif self.debug: + print("Min of high %d, low %d min latencies matches min %d from combined data" % \ + (jsondata[high]['min'], jsondata[low]['min'], jsondata[combined]['min'])) + + if max(jsondata[high]['max'], jsondata[low]['max']) != jsondata[combined]['max']: + print("Max of high %d, low %d max latencies does not match max %d from combined data" % \ + (jsondata[high]['max'], jsondata[low]['max'], jsondata[combined]['max'])) + return False + elif self.debug: + print("Max of high %d, low %d max latencies matches max %d from combined data" % \ + (jsondata[high]['max'], jsondata[low]['max'], jsondata[combined]['max'])) + + weighted_avg = (jsondata[high]['mean'] * jsondata[high]['N'] + \ + jsondata[low]['mean'] * jsondata[low]['N']) / jsondata[combined]['N'] + delta = abs(weighted_avg - jsondata[combined]['mean']) + if (delta / jsondata[combined]['mean']) > 0.0001: + print("Difference between weighted average %f of high, low means " + "and actual mean %f exceeds 0.01%%" % (weighted_avg, jsondata[combined]['mean'])) + return False + elif self.debug: + print("Weighted average %f of high, low means matches actual mean %f" % \ + (weighted_avg, jsondata[combined]['mean'])) + + if plus: + if not self.check_jsonplus(jsondata[high]): + return False + if not self.check_jsonplus(jsondata[low]): + return False + + bins = {**jsondata[high]['bins'], **jsondata[low]['bins']} + for duration in bins.keys(): + if duration in jsondata[high]['bins'] and duration in jsondata[low]['bins']: + bins[duration] = jsondata[high]['bins'][duration] + \ + jsondata[low]['bins'][duration] + + if len(bins) != len(jsondata[combined]['bins']): + print("Number of combined high/low bins does not match number of overall bins") + return False + elif self.debug: + print("Number of bins from merged high/low data matches number of overall bins") + + for duration in bins.keys(): + if bins[duration] != jsondata[combined]['bins'][duration]: + print("Merged high/low count does not match overall count for duration %d" \ + % duration) + return False + + print("Merged high/low priority latency data match combined latency data") + return True + + def check(self): + """Check test output.""" + + raise NotImplementedError() + + +class Test001(FioLatTest): + """Test object for Test 1.""" + + def check(self): + """Check Test 1 output.""" + + job = self.json_data['jobs'][0] + + retval = True + if not 
self.check_empty(job['write']): + print("Unexpected write data found in output") + retval = False + if not self.check_empty(job['trim']): + print("Unexpected trim data found in output") + retval = False + if not self.check_nocmdprio_lat(job): + print("Unexpected high/low priority latencies found") + retval = False + + retval &= self.check_latencies(job['read'], 0, slat=False) + + return retval + + +class Test002(FioLatTest): + """Test object for Test 2.""" + + def check(self): + """Check Test 2 output.""" + + job = self.json_data['jobs'][0] + + retval = True + if not self.check_empty(job['read']): + print("Unexpected read data found in output") + retval = False + if not self.check_empty(job['trim']): + print("Unexpected trim data found in output") + retval = False + if not self.check_nocmdprio_lat(job): + print("Unexpected high/low priority latencies found") + retval = False + + retval &= self.check_latencies(job['write'], 1, slat=False, clat=False) + + return retval + + +class Test003(FioLatTest): + """Test object for Test 3.""" + + def check(self): + """Check Test 3 output.""" + + job = self.json_data['jobs'][0] + + retval = True + if not self.check_empty(job['read']): + print("Unexpected read data found in output") + retval = False + if not self.check_empty(job['write']): + print("Unexpected write data found in output") + retval = False + if not self.check_nocmdprio_lat(job): + print("Unexpected high/low priority latencies found") + retval = False + + retval &= self.check_latencies(job['trim'], 2, slat=False, tlat=False) + + return retval + + +class Test004(FioLatTest): + """Test object for Tests 4, 13.""" + + def check(self): + """Check Test 4, 13 output.""" + + job = self.json_data['jobs'][0] + + retval = True + if not self.check_empty(job['write']): + print("Unexpected write data found in output") + retval = False + if not self.check_empty(job['trim']): + print("Unexpected trim data found in output") + retval = False + if not self.check_nocmdprio_lat(job): + print("Unexpected high/low priority latencies found") + retval = False + + retval &= self.check_latencies(job['read'], 0, plus=True) + + return retval + + +class Test005(FioLatTest): + """Test object for Test 5.""" + + def check(self): + """Check Test 5 output.""" + + job = self.json_data['jobs'][0] + + retval = True + if not self.check_empty(job['read']): + print("Unexpected read data found in output") + retval = False + if not self.check_empty(job['trim']): + print("Unexpected trim data found in output") + retval = False + if not self.check_nocmdprio_lat(job): + print("Unexpected high/low priority latencies found") + retval = False + + retval &= self.check_latencies(job['write'], 1, slat=False, plus=True) + + return retval + + +class Test006(FioLatTest): + """Test object for Test 6.""" + + def check(self): + """Check Test 6 output.""" + + job = self.json_data['jobs'][0] + + retval = True + if not self.check_empty(job['write']): + print("Unexpected write data found in output") + retval = False + if not self.check_empty(job['trim']): + print("Unexpected trim data found in output") + retval = False + if not self.check_nocmdprio_lat(job): + print("Unexpected high/low priority latencies found") + retval = False + + retval &= self.check_latencies(job['read'], 0, slat=False, tlat=False, plus=True) + + return retval + + +class Test007(FioLatTest): + """Test object for Test 7.""" + + def check(self): + """Check Test 7 output.""" + + job = self.json_data['jobs'][0] + + retval = True + if not self.check_empty(job['trim']): + 
print("Unexpected trim data found in output") + retval = False + if not self.check_nocmdprio_lat(job): + print("Unexpected high/low priority latencies found") + retval = False + + retval &= self.check_latencies(job['read'], 0, clat=False, tlat=False, plus=True) + retval &= self.check_latencies(job['write'], 1, clat=False, tlat=False, plus=True) + + return retval + + +class Test008(FioLatTest): + """Test object for Tests 8, 14.""" + + def check(self): + """Check Test 8, 14 output.""" + + job = self.json_data['jobs'][0] + + retval = True + if 'read' in job or 'write'in job or 'trim' in job: + print("Unexpected data direction found in fio output") + retval = False + if not self.check_nocmdprio_lat(job): + print("Unexpected high/low priority latencies found") + retval = False + + retval &= self.check_latencies(job['mixed'], 0, plus=True, unified=True) + + return retval + + +class Test009(FioLatTest): + """Test object for Test 9.""" + + def check(self): + """Check Test 9 output.""" + + job = self.json_data['jobs'][0] + + retval = True + if not self.check_empty(job['read']): + print("Unexpected read data found in output") + retval = False + if not self.check_empty(job['trim']): + print("Unexpected trim data found in output") + retval = False + if not self.check_sync_lat(job['sync'], plus=True): + print("Error checking fsync latency data") + retval = False + if not self.check_nocmdprio_lat(job): + print("Unexpected high/low priority latencies found") + retval = False + + retval &= self.check_latencies(job['write'], 1, slat=False, plus=True) + + return retval + + +class Test010(FioLatTest): + """Test object for Test 10.""" + + def check(self): + """Check Test 10 output.""" + + job = self.json_data['jobs'][0] + + retval = True + if not self.check_empty(job['trim']): + print("Unexpected trim data found in output") + retval = False + if not self.check_nocmdprio_lat(job): + print("Unexpected high/low priority latencies found") + retval = False + + retval &= self.check_latencies(job['read'], 0, plus=True) + retval &= self.check_latencies(job['write'], 1, plus=True) + retval &= self.check_terse(self.terse_data[17:34], job['read']['lat_ns']['percentile']) + retval &= self.check_terse(self.terse_data[58:75], job['write']['lat_ns']['percentile']) + # Terse data checking only works for default percentiles. + # This needs to be changed if something other than the default is ever used. + + return retval + + +class Test011(FioLatTest): + """Test object for Test 11.""" + + def check(self): + """Check Test 11 output.""" + + job = self.json_data['jobs'][0] + + retval = True + if not self.check_empty(job['trim']): + print("Unexpected trim data found in output") + retval = False + if not self.check_nocmdprio_lat(job): + print("Unexpected high/low priority latencies found") + retval = False + + retval &= self.check_latencies(job['read'], 0, slat=False, clat=False, plus=True) + retval &= self.check_latencies(job['write'], 1, slat=False, clat=False, plus=True) + retval &= self.check_terse(self.terse_data[17:34], job['read']['lat_ns']['percentile']) + retval &= self.check_terse(self.terse_data[58:75], job['write']['lat_ns']['percentile']) + # Terse data checking only works for default percentiles. + # This needs to be changed if something other than the default is ever used. 
+ + return retval + + +class Test015(FioLatTest): + """Test object for Test 15.""" + + def check(self): + """Check Test 15 output.""" + + job = self.json_data['jobs'][0] + + retval = True + if not self.check_empty(job['write']): + print("Unexpected write data found in output") + retval = False + if not self.check_empty(job['trim']): + print("Unexpected trim data found in output") + retval = False + + retval &= self.check_latencies(job['read'], 0, plus=True) + retval &= self.check_prio_latencies(job['read'], clat=False, plus=True) + + return retval + + +class Test016(FioLatTest): + """Test object for Test 16.""" + + def check(self): + """Check Test 16 output.""" + + job = self.json_data['jobs'][0] + + retval = True + if not self.check_empty(job['read']): + print("Unexpected read data found in output") + retval = False + if not self.check_empty(job['trim']): + print("Unexpected trim data found in output") + retval = False + + retval &= self.check_latencies(job['write'], 1, slat=False, plus=True) + retval &= self.check_prio_latencies(job['write'], clat=False, plus=True) + + return retval + + +class Test017(FioLatTest): + """Test object for Test 17.""" + + def check(self): + """Check Test 17 output.""" + + job = self.json_data['jobs'][0] + + retval = True + if not self.check_empty(job['write']): + print("Unexpected write data found in output") + retval = False + if not self.check_empty(job['trim']): + print("Unexpected trim data found in output") + retval = False + + retval &= self.check_latencies(job['read'], 0, slat=False, tlat=False, plus=True) + retval &= self.check_prio_latencies(job['read'], plus=True) + + return retval + + +class Test018(FioLatTest): + """Test object for Test 18.""" + + def check(self): + """Check Test 18 output.""" + + job = self.json_data['jobs'][0] + + retval = True + if not self.check_empty(job['trim']): + print("Unexpected trim data found in output") + retval = False + + retval &= self.check_latencies(job['read'], 0, clat=False, tlat=False, plus=True) + retval &= self.check_latencies(job['write'], 1, clat=False, tlat=False, plus=True) + + # We actually have json+ data but setting plus=False below avoids checking the + # json+ bins which did not exist for clat and lat because this job is run with + # clat_percentiles=0, lat_percentiles=0, However, we can still check the summary + # statistics + retval &= self.check_prio_latencies(job['write'], plus=False) + retval &= self.check_prio_latencies(job['read'], plus=False) + + return retval + + +class Test019(FioLatTest): + """Test object for Tests 19, 20.""" + + def check(self): + """Check Test 19, 20 output.""" + + job = self.json_data['jobs'][0] + + retval = True + if 'read' in job or 'write'in job or 'trim' in job: + print("Unexpected data direction found in fio output") + retval = False + + retval &= self.check_latencies(job['mixed'], 0, plus=True, unified=True) + retval &= self.check_prio_latencies(job['mixed'], clat=False, plus=True) + + return retval + + +def parse_args(): + """Parse command-line arguments.""" + + parser = argparse.ArgumentParser() + parser.add_argument('-f', '--fio', help='path to file executable (e.g., ./fio)') + parser.add_argument('-a', '--artifact-root', help='artifact root directory') + parser.add_argument('-d', '--debug', help='enable debug output', action='store_true') + parser.add_argument('-s', '--skip', nargs='+', type=int, + help='list of test(s) to skip') + parser.add_argument('-o', '--run-only', nargs='+', type=int, + help='list of test(s) to run, skipping all others') + args = 
parser.parse_args() + + return args + + +def main(): + """Run tests of fio latency percentile reporting""" + + args = parse_args() + + artifact_root = args.artifact_root if args.artifact_root else \ + "latency-test-{0}".format(time.strftime("%Y%m%d-%H%M%S")) + os.mkdir(artifact_root) + print("Artifact directory is %s" % artifact_root) + + if args.fio: + fio = str(Path(args.fio).absolute()) + else: + fio = 'fio' + print("fio path is %s" % fio) + + if platform.system() == 'Linux': + aio = 'libaio' + elif platform.system() == 'Windows': + aio = 'windowsaio' + else: + aio = 'posixaio' + + test_list = [ + { + # randread, null + # enable slat, clat, lat + # only clat and lat will appear because + # because the null ioengine is syncrhonous + "test_id": 1, + "runtime": 2, + "output-format": "json", + "slat_percentiles": 1, + "clat_percentiles": 1, + "lat_percentiles": 1, + "ioengine": 'null', + 'rw': 'randread', + "test_obj": Test001, + }, + { + # randwrite, null + # enable lat only + "test_id": 2, + "runtime": 2, + "output-format": "json", + "slat_percentiles": 0, + "clat_percentiles": 0, + "lat_percentiles": 1, + "ioengine": 'null', + 'rw': 'randwrite', + "test_obj": Test002, + }, + { + # randtrim, null + # enable clat only + "test_id": 3, + "runtime": 2, + "output-format": "json", + "slat_percentiles": 0, + "clat_percentiles": 1, + "lat_percentiles": 0, + "ioengine": 'null', + 'rw': 'randtrim', + "test_obj": Test003, + }, + { + # randread, aio + # enable slat, clat, lat + # all will appear because liaio is asynchronous + "test_id": 4, + "runtime": 5, + "output-format": "json+", + "slat_percentiles": 1, + "clat_percentiles": 1, + "lat_percentiles": 1, + "ioengine": aio, + 'rw': 'randread', + "test_obj": Test004, + }, + { + # randwrite, aio + # enable only clat, lat + "test_id": 5, + "runtime": 5, + "output-format": "json+", + "slat_percentiles": 0, + "clat_percentiles": 1, + "lat_percentiles": 1, + "ioengine": aio, + 'rw': 'randwrite', + "test_obj": Test005, + }, + { + # randread, aio + # by default only clat should appear + "test_id": 6, + "runtime": 5, + "output-format": "json+", + "ioengine": aio, + 'rw': 'randread', + "test_obj": Test006, + }, + { + # 50/50 r/w, aio + # enable only slat + "test_id": 7, + "runtime": 5, + "output-format": "json+", + "slat_percentiles": 1, + "clat_percentiles": 0, + "lat_percentiles": 0, + "ioengine": aio, + 'rw': 'randrw', + "test_obj": Test007, + }, + { + # 50/50 r/w, aio, unified_rw_reporting + # enable slat, clat, lat + "test_id": 8, + "runtime": 5, + "output-format": "json+", + "slat_percentiles": 1, + "clat_percentiles": 1, + "lat_percentiles": 1, + "ioengine": aio, + 'rw': 'randrw', + 'unified_rw_reporting': 1, + "test_obj": Test008, + }, + { + # randwrite, null + # enable slat, clat, lat + # fsync + "test_id": 9, + "runtime": 2, + "output-format": "json+", + "slat_percentiles": 1, + "clat_percentiles": 1, + "lat_percentiles": 1, + "ioengine": 'null', + 'rw': 'randwrite', + 'fsync': 32, + "test_obj": Test009, + }, + { + # 50/50 r/w, aio + # enable slat, clat, lat + "test_id": 10, + "runtime": 5, + "output-format": "terse,json+", + "slat_percentiles": 1, + "clat_percentiles": 1, + "lat_percentiles": 1, + "ioengine": aio, + 'rw': 'randrw', + "test_obj": Test010, + }, + { + # 50/50 r/w, aio + # enable only lat + "test_id": 11, + "runtime": 5, + "output-format": "terse,json+", + "slat_percentiles": 0, + "clat_percentiles": 0, + "lat_percentiles": 1, + "ioengine": aio, + 'rw': 'randrw', + "test_obj": Test011, + }, + { + # randread, null + # enable slat, clat, 
lat + # only clat and lat will appear because + # because the null ioengine is syncrhonous + # same as Test 1 except + # numjobs = 4 to test sum_thread_stats() changes + "test_id": 12, + "runtime": 2, + "output-format": "json", + "slat_percentiles": 1, + "clat_percentiles": 1, + "lat_percentiles": 1, + "ioengine": 'null', + 'rw': 'randread', + 'numjobs': 4, + "test_obj": Test001, + }, + { + # randread, aio + # enable slat, clat, lat + # all will appear because liaio is asynchronous + # same as Test 4 except + # numjobs = 4 to test sum_thread_stats() changes + "test_id": 13, + "runtime": 5, + "output-format": "json+", + "slat_percentiles": 1, + "clat_percentiles": 1, + "lat_percentiles": 1, + "ioengine": aio, + 'rw': 'randread', + 'numjobs': 4, + "test_obj": Test004, + }, + { + # 50/50 r/w, aio, unified_rw_reporting + # enable slat, clat, lata + # same as Test 8 except + # numjobs = 4 to test sum_thread_stats() changes + "test_id": 14, + "runtime": 5, + "output-format": "json+", + "slat_percentiles": 1, + "clat_percentiles": 1, + "lat_percentiles": 1, + "ioengine": aio, + 'rw': 'randrw', + 'unified_rw_reporting': 1, + 'numjobs': 4, + "test_obj": Test008, + }, + { + # randread, aio + # enable slat, clat, lat + # all will appear because liaio is asynchronous + # same as Test 4 except add cmdprio_percentage + "test_id": 15, + "runtime": 5, + "output-format": "json+", + "slat_percentiles": 1, + "clat_percentiles": 1, + "lat_percentiles": 1, + "ioengine": aio, + 'rw': 'randread', + 'cmdprio_percentage': 50, + "test_obj": Test015, + }, + { + # randwrite, aio + # enable only clat, lat + # same as Test 5 except add cmdprio_percentage + "test_id": 16, + "runtime": 5, + "output-format": "json+", + "slat_percentiles": 0, + "clat_percentiles": 1, + "lat_percentiles": 1, + "ioengine": aio, + 'rw': 'randwrite', + 'cmdprio_percentage': 50, + "test_obj": Test016, + }, + { + # randread, aio + # by default only clat should appear + # same as Test 6 except add cmdprio_percentage + "test_id": 17, + "runtime": 5, + "output-format": "json+", + "ioengine": aio, + 'rw': 'randread', + 'cmdprio_percentage': 50, + "test_obj": Test017, + }, + { + # 50/50 r/w, aio + # enable only slat + # same as Test 7 except add cmdprio_percentage + "test_id": 18, + "runtime": 5, + "output-format": "json+", + "slat_percentiles": 1, + "clat_percentiles": 0, + "lat_percentiles": 0, + "ioengine": aio, + 'rw': 'randrw', + 'cmdprio_percentage': 50, + "test_obj": Test018, + }, + { + # 50/50 r/w, aio, unified_rw_reporting + # enable slat, clat, lat + # same as Test 8 except add cmdprio_percentage + "test_id": 19, + "runtime": 5, + "output-format": "json+", + "slat_percentiles": 1, + "clat_percentiles": 1, + "lat_percentiles": 1, + "ioengine": aio, + 'rw': 'randrw', + 'unified_rw_reporting': 1, + 'cmdprio_percentage': 50, + "test_obj": Test019, + }, + { + # 50/50 r/w, aio, unified_rw_reporting + # enable slat, clat, lat + # same as Test 19 except + # add numjobs = 4 to test sum_thread_stats() changes + "test_id": 20, + "runtime": 5, + "output-format": "json+", + "slat_percentiles": 1, + "clat_percentiles": 1, + "lat_percentiles": 1, + "ioengine": aio, + 'rw': 'randrw', + 'unified_rw_reporting': 1, + 'cmdprio_percentage': 50, + 'numjobs': 4, + "test_obj": Test019, + }, + ] + + passed = 0 + failed = 0 + skipped = 0 + + for test in test_list: + if (args.skip and test['test_id'] in args.skip) or \ + (args.run_only and test['test_id'] not in args.run_only): + skipped = skipped + 1 + outcome = 'SKIPPED (User request)' + elif platform.system() != 
'Linux' and 'cmdprio_percentage' in test: + skipped = skipped + 1 + outcome = 'SKIPPED (Linux required for cmdprio_percentage tests)' + else: + test_obj = test['test_obj'](artifact_root, test, args.debug) + status = test_obj.run_fio(fio) + if status: + status = test_obj.check() + if status: + passed = passed + 1 + outcome = 'PASSED' + else: + failed = failed + 1 + outcome = 'FAILED' + + print("**********Test {0} {1}**********".format(test['test_id'], outcome)) + + print("{0} tests passed, {1} failed, {2} skipped".format(passed, failed, skipped)) + + sys.exit(failed) + + +if __name__ == '__main__': + main() diff --git a/t/lfsr-test.c b/t/lfsr-test.c new file mode 100644 index 0000000..ea8c8dd --- /dev/null +++ b/t/lfsr-test.c @@ -0,0 +1,132 @@ +#include +#include +#include +#include + +#include "../lib/lfsr.h" +#include "../gettime.h" +#include "../fio_time.h" + +static void usage(void) +{ + printf("Usage: lfsr-test 0x [seed] [spin] [verify]\n"); + printf("-------------------------------------------------------------\n"); + printf("*numbers: how many random numbers to produce (in hex)\n" + "seed: initial value\n" + "spin: how many iterations before we produce a number\n" + "verify: check if LFSR has iterated correctly\n\n" + "Only is required. The rest are evaluated to 0 or false\n" + "Elapsed/mean time and verification results are printed at the" + "end of the test\n"); +} + +int main(int argc, char *argv[]) +{ + int r; + struct timespec start, end; + struct fio_lfsr *fl; + int verify = 0; + unsigned int spin = 0; + uint64_t seed = 0; + uint64_t numbers; + uint64_t v_size; + uint64_t i; + void *v = NULL, *v_start; + double total, mean; + + arch_init(argv); + + /* Read arguments */ + switch (argc) { + case 5: if (strncmp(argv[4], "verify", 7) == 0) + verify = 1; + /* fall through */ + case 4: spin = atoi(argv[3]); + /* fall through */ + case 3: seed = atol(argv[2]); + /* fall through */ + case 2: numbers = strtol(argv[1], NULL, 16); + break; + default: usage(); + return 1; + } + + /* Initialize LFSR */ + fl = malloc(sizeof(struct fio_lfsr)); + if (!fl) { + perror("malloc"); + return 1; + } + + r = lfsr_init(fl, numbers, seed, spin); + if (r) { + printf("Initialization failed.\n"); + return r; + } + + /* Print specs */ + printf("LFSR specs\n"); + printf("==========================\n"); + printf("Size is %u\n", 64 - __builtin_clzl(fl->cached_bit)); + printf("Max val is %lu\n", (unsigned long) fl->max_val); + printf("XOR-mask is 0x%lX\n", (unsigned long) fl->xormask); + printf("Seed is %lu\n", (unsigned long) fl->last_val); + printf("Spin is %u\n", fl->spin); + printf("Cycle length is %lu\n", (unsigned long) fl->cycle_length); + + /* Create verification table */ + if (verify) { + v_size = numbers * sizeof(uint8_t); + v = malloc(v_size); + memset(v, 0, v_size); + printf("\nVerification table is %lf KiB\n", (double)(v_size) / 1024); + } + v_start = v; + + /* + * Iterate over a tight loop until we have produced all the requested + * numbers. Verifying the results should introduce some small yet not + * negligible overhead. + */ + fprintf(stderr, "\nTest initiated... "); + fio_gettime(&start, NULL); + while (!lfsr_next(fl, &i)) { + if (verify) + *(uint8_t *)(v + i) += 1; + } + fio_gettime(&end, NULL); + fprintf(stderr, "finished.\n"); + + + /* Check if all expected numbers within range have been calculated */ + r = 0; + if (verify) { + fprintf(stderr, "Verifying results... 
"); + for (i = 0; i < numbers; i++) { + if (*(uint8_t *)(v + i) != 1) { + fprintf(stderr, "failed (%lu = %d).\n", + (unsigned long) i, + *(uint8_t *)(v + i)); + r = 1; + break; + } + } + if (!r) + fprintf(stderr, "OK!\n"); + } + + /* Calculate elapsed time and mean time per number */ + total = utime_since(&start, &end); + mean = total / fl->num_vals; + + printf("\nTime results "); + if (verify) + printf("(slower due to verification)"); + printf("\n==============================\n"); + printf("Elapsed: %lf s\n", total / pow(10,6)); + printf("Mean: %lf us\n", mean); + + free(v_start); + free(fl); + return r; +} diff --git a/t/log.c b/t/log.c new file mode 100644 index 0000000..929aac6 --- /dev/null +++ b/t/log.c @@ -0,0 +1,31 @@ +#include +#include +#include "../minmax.h" + +size_t log_err(const char *format, ...) +{ + char buffer[1024]; + va_list args; + size_t len; + + va_start(args, format); + len = vsnprintf(buffer, sizeof(buffer), format, args); + va_end(args); + len = min(len, sizeof(buffer) - 1); + + return fwrite(buffer, len, 1, stderr); +} + +size_t log_info(const char *format, ...) +{ + char buffer[1024]; + va_list args; + size_t len; + + va_start(args, format); + len = vsnprintf(buffer, sizeof(buffer), format, args); + va_end(args); + len = min(len, sizeof(buffer) - 1); + + return fwrite(buffer, len, 1, stdout); +} diff --git a/t/memlock.c b/t/memlock.c new file mode 100644 index 0000000..418dc3c --- /dev/null +++ b/t/memlock.c @@ -0,0 +1,63 @@ +#include +#include +#include +#include + +static struct thread_data { + unsigned long mib; +} td; + +static void *worker(void *data) +{ + struct thread_data *td = data; + unsigned long index; + size_t size; + char *buf; + int i, first = 1; + + size = td->mib * 1024UL * 1024UL; + buf = malloc(size); + + for (i = 0; i < 100000; i++) { + for (index = 0; index + 4096 < size; index += 4096) + memset(&buf[index+512], 0x89, 512); + if (first) { + printf("loop%d: did %lu MiB\n", i+1, size/(1024UL*1024UL)); + first = 0; + } + } + free(buf); + return NULL; +} + +int main(int argc, char *argv[]) +{ + unsigned long mib, threads; + pthread_t *pthreads; + int i; + + if (argc < 3) { + printf("%s: \n", argv[0]); + return 1; + } + + mib = strtoul(argv[1], NULL, 10); + threads = strtoul(argv[2], NULL, 10); + if (threads < 1 || threads > 65536) { + printf("%s: invalid 'threads' argument\n", argv[0]); + return 1; + } + + pthreads = calloc(threads, sizeof(pthread_t)); + td.mib = mib; + + for (i = 0; i < threads; i++) + pthread_create(&pthreads[i], NULL, worker, &td); + + for (i = 0; i < threads; i++) { + void *ret; + + pthread_join(pthreads[i], &ret); + } + return 0; +} diff --git a/t/read-to-pipe-async.c b/t/read-to-pipe-async.c new file mode 100644 index 0000000..586e3c9 --- /dev/null +++ b/t/read-to-pipe-async.c @@ -0,0 +1,700 @@ +/* + * Read a file and write the contents to stdout. If a given read takes + * longer than 'max_us' time, then we schedule a new thread to handle + * the next read. This avoids the coordinated omission problem, where + * one request appears to take a long time, but in reality a lot of + * requests would have been slow, but we don't notice since new submissions + * are not being issued if just 1 is held up. + * + * One test case: + * + * $ time (./read-to-pipe-async -f randfile.gz | gzip -dc > outfile; sync) + * + * This will read randfile.gz and log the latencies of doing so, while + * piping the output to gzip to decompress it. 
Any latencies over max_us
+ * are logged when they happen, and latency buckets are displayed at the
+ * end of the run
+ *
+ * gcc -Wall -g -O2 -o read-to-pipe-async read-to-pipe-async.c -lpthread
+ *
+ * Copyright (C) 2016 Jens Axboe
+ *
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <fcntl.h>
+#include <sys/time.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <pthread.h>
+#include <errno.h>
+#include <string.h>
+#include <inttypes.h>
+#include <assert.h>
+
+#include "../flist.h"
+
+static int bs = 4096;
+static int max_us = 10000;
+static char *file;
+static int separate_writer = 1;
+
+#define PLAT_BITS 8
+#define PLAT_VAL (1 << PLAT_BITS)
+#define PLAT_GROUP_NR 19
+#define PLAT_NR (PLAT_GROUP_NR * PLAT_VAL)
+#define PLAT_LIST_MAX 20
+
+struct stats {
+    unsigned int plat[PLAT_NR];
+    unsigned int nr_samples;
+    unsigned int max;
+    unsigned int min;
+    unsigned int over;
+};
+
+static double plist[PLAT_LIST_MAX] = { 50.0, 75.0, 90.0, 95.0, 99.0, 99.5, 99.9, 99.99, 99.999, 99.9999, };
+
+struct thread_data {
+    int exit;
+    int done;
+    pthread_mutex_t lock;
+    pthread_cond_t cond;
+    pthread_mutex_t done_lock;
+    pthread_cond_t done_cond;
+    pthread_t thread;
+};
+
+struct writer_thread {
+    struct flist_head list;
+    struct flist_head done_list;
+    struct stats s;
+    struct thread_data thread;
+};
+
+struct reader_thread {
+    struct flist_head list;
+    struct flist_head done_list;
+    int started;
+    int busy;
+    int write_seq;
+    struct stats s;
+    struct thread_data thread;
+};
+
+struct work_item {
+    struct flist_head list;
+    void *buf;
+    size_t buf_size;
+    off_t off;
+    int fd;
+    int seq;
+    struct writer_thread *writer;
+    struct reader_thread *reader;
+    pthread_mutex_t lock;
+    pthread_cond_t cond;
+    pthread_t thread;
+};
+
+static struct reader_thread reader_thread;
+static struct writer_thread writer_thread;
+
+uint64_t utime_since(const struct timespec *s, const struct timespec *e)
+{
+    long sec, usec;
+    uint64_t ret;
+
+    sec = e->tv_sec - s->tv_sec;
+    usec = (e->tv_nsec - s->tv_nsec) / 1000;
+    if (sec > 0 && usec < 0) {
+        sec--;
+        usec += 1000000;
+    }
+
+    if (sec < 0 || (sec == 0 && usec < 0))
+        return 0;
+
+    ret = sec * 1000000ULL + usec;
+
+    return ret;
+}
+
+static struct work_item *find_seq(struct writer_thread *w, unsigned int seq)
+{
+    struct work_item *work;
+    struct flist_head *entry;
+
+    if (flist_empty(&w->list))
+        return NULL;
+
+    flist_for_each(entry, &w->list) {
+        work = flist_entry(entry, struct work_item, list);
+        if (work->seq == seq)
+            return work;
+    }
+
+    return NULL;
+}
+
+static unsigned int plat_val_to_idx(unsigned int val)
+{
+    unsigned int msb, error_bits, base, offset;
+
+    /* Find MSB starting from bit 0 */
+    if (val == 0)
+        msb = 0;
+    else
+        msb = sizeof(val)*8 - __builtin_clz(val) - 1;
+
+    /*
+     * MSB <= (PLAT_BITS-1), cannot be rounded off. Use
+     * all bits of the sample as index
+     */
+    if (msb <= PLAT_BITS)
+        return val;
+
+    /* Compute the number of error bits to discard */
+    error_bits = msb - PLAT_BITS;
+
+    /* Compute the number of buckets before the group */
+    base = (error_bits + 1) << PLAT_BITS;
+
+    /*
+     * Discard the error bits and apply the mask to find the
+     * index for the buckets in the group
+     */
+    offset = (PLAT_VAL - 1) & (val >> error_bits);
+
+    /* Make sure the index does not exceed (array size - 1) */
+    return (base + offset) < (PLAT_NR - 1) ?
+        (base + offset) : (PLAT_NR - 1);
+}
+
+/*
+ * Convert the given index of the bucket array to the value
+ * represented by the bucket
+ */
+static unsigned int plat_idx_to_val(unsigned int idx)
+{
+    unsigned int error_bits, k, base;
+
+    assert(idx < PLAT_NR);
+
+    /* MSB <= (PLAT_BITS-1), cannot be rounded off. 
Use + * all bits of the sample as index */ + if (idx < (PLAT_VAL << 1)) + return idx; + + /* Find the group and compute the minimum value of that group */ + error_bits = (idx >> PLAT_BITS) - 1; + base = 1 << (error_bits + PLAT_BITS); + + /* Find its bucket number of the group */ + k = idx % PLAT_VAL; + + /* Return the mean of the range of the bucket */ + return base + ((k + 0.5) * (1 << error_bits)); +} + +static void add_lat(struct stats *s, unsigned int us, const char *name) +{ + int lat_index = 0; + + if (us > s->max) + s->max = us; + if (us < s->min) + s->min = us; + + if (us > max_us) { + fprintf(stderr, "%s latency=%u usec\n", name, us); + s->over++; + } + + lat_index = plat_val_to_idx(us); + __sync_fetch_and_add(&s->plat[lat_index], 1); + __sync_fetch_and_add(&s->nr_samples, 1); +} + +static int write_work(struct work_item *work) +{ + struct timespec s, e; + ssize_t ret; + + clock_gettime(CLOCK_MONOTONIC, &s); + ret = write(STDOUT_FILENO, work->buf, work->buf_size); + clock_gettime(CLOCK_MONOTONIC, &e); + assert(ret == work->buf_size); + + add_lat(&work->writer->s, utime_since(&s, &e), "write"); + return work->seq + 1; +} + +static void thread_exiting(struct thread_data *thread) +{ + __sync_fetch_and_add(&thread->done, 1); + pthread_cond_signal(&thread->done_cond); +} + +static void *writer_fn(void *data) +{ + struct writer_thread *wt = data; + struct work_item *work; + unsigned int seq = 1; + + work = NULL; + while (!wt->thread.exit || !flist_empty(&wt->list)) { + pthread_mutex_lock(&wt->thread.lock); + + if (work) { + flist_add_tail(&work->list, &wt->done_list); + work = NULL; + } + + work = find_seq(wt, seq); + if (work) + flist_del_init(&work->list); + else + pthread_cond_wait(&wt->thread.cond, &wt->thread.lock); + + pthread_mutex_unlock(&wt->thread.lock); + + if (work) + seq = write_work(work); + } + + thread_exiting(&wt->thread); + return NULL; +} + +static void reader_work(struct work_item *work) +{ + struct timespec s, e; + ssize_t ret; + size_t left; + void *buf; + off_t off; + + clock_gettime(CLOCK_MONOTONIC, &s); + + left = work->buf_size; + buf = work->buf; + off = work->off; + while (left) { + ret = pread(work->fd, buf, left, off); + if (!ret) { + fprintf(stderr, "zero read\n"); + break; + } else if (ret < 0) { + fprintf(stderr, "errno=%d\n", errno); + break; + } + left -= ret; + off += ret; + buf += ret; + } + + clock_gettime(CLOCK_MONOTONIC, &e); + + add_lat(&work->reader->s, utime_since(&s, &e), "read"); + + pthread_cond_signal(&work->cond); + + if (separate_writer) { + pthread_mutex_lock(&work->writer->thread.lock); + flist_add_tail(&work->list, &work->writer->list); + pthread_mutex_unlock(&work->writer->thread.lock); + pthread_cond_signal(&work->writer->thread.cond); + } else { + struct reader_thread *rt = work->reader; + struct work_item *next = NULL; + struct flist_head *entry; + + /* + * Write current work if it matches in sequence. 
+ */ + if (work->seq == rt->write_seq) + goto write_it; + + pthread_mutex_lock(&rt->thread.lock); + + flist_add_tail(&work->list, &rt->done_list); + + /* + * See if the next work item is here, if so, write it + */ + work = NULL; + flist_for_each(entry, &rt->done_list) { + next = flist_entry(entry, struct work_item, list); + if (next->seq == rt->write_seq) { + work = next; + flist_del(&work->list); + break; + } + } + + pthread_mutex_unlock(&rt->thread.lock); + + if (work) { +write_it: + write_work(work); + __sync_fetch_and_add(&rt->write_seq, 1); + } + } +} + +static void *reader_one_off(void *data) +{ + reader_work(data); + return NULL; +} + +static void *reader_fn(void *data) +{ + struct reader_thread *rt = data; + struct work_item *work; + + while (!rt->thread.exit || !flist_empty(&rt->list)) { + work = NULL; + pthread_mutex_lock(&rt->thread.lock); + if (!flist_empty(&rt->list)) { + work = flist_first_entry(&rt->list, struct work_item, list); + flist_del_init(&work->list); + } else + pthread_cond_wait(&rt->thread.cond, &rt->thread.lock); + pthread_mutex_unlock(&rt->thread.lock); + + if (work) { + __sync_fetch_and_add(&rt->busy, 1); + reader_work(work); + __sync_fetch_and_sub(&rt->busy, 1); + } + } + + thread_exiting(&rt->thread); + return NULL; +} + +static void queue_work(struct reader_thread *rt, struct work_item *work) +{ + if (!rt->started) { + pthread_mutex_lock(&rt->thread.lock); + flist_add_tail(&work->list, &rt->list); + pthread_mutex_unlock(&rt->thread.lock); + + rt->started = 1; + pthread_create(&rt->thread.thread, NULL, reader_fn, rt); + } else if (!rt->busy && !pthread_mutex_trylock(&rt->thread.lock)) { + flist_add_tail(&work->list, &rt->list); + pthread_mutex_unlock(&rt->thread.lock); + + pthread_cond_signal(&rt->thread.cond); + } else { + int ret = pthread_create(&work->thread, NULL, reader_one_off, work); + if (ret) { + fprintf(stderr, "pthread_create=%d\n", ret); + } else { + ret = pthread_detach(work->thread); + if (ret) + fprintf(stderr, "pthread_detach=%d\n", ret); + } + } +} + +static unsigned int calc_percentiles(unsigned int *io_u_plat, unsigned long nr, + unsigned int **output) +{ + unsigned long sum = 0; + unsigned int len, i, j = 0; + unsigned int oval_len = 0; + unsigned int *ovals = NULL; + int is_last; + + len = 0; + while (len < PLAT_LIST_MAX && plist[len] != 0.0) + len++; + + if (!len) + return 0; + + /* + * Calculate bucket values, note down max and min values + */ + is_last = 0; + for (i = 0; i < PLAT_NR && !is_last; i++) { + sum += io_u_plat[i]; + while (sum >= (plist[j] / 100.0 * nr)) { + assert(plist[j] <= 100.0); + + if (j == oval_len) { + oval_len += 100; + ovals = realloc(ovals, oval_len * sizeof(unsigned int)); + } + + ovals[j] = plat_idx_to_val(i); + is_last = (j == len - 1); + if (is_last) + break; + + j++; + } + } + + *output = ovals; + return len; +} + +static void show_latencies(struct stats *s, const char *msg) +{ + unsigned int *ovals = NULL; + unsigned int len, i; + + len = calc_percentiles(s->plat, s->nr_samples, &ovals); + if (len) { + fprintf(stderr, "Latency percentiles (usec) (%s)\n", msg); + for (i = 0; i < len; i++) + fprintf(stderr, "\t%2.4fth: %u\n", plist[i], ovals[i]); + } + + if (ovals) + free(ovals); + + fprintf(stderr, "\tOver=%u, min=%u, max=%u\n", s->over, s->min, s->max); +} + +static void init_thread(struct thread_data *thread) +{ + pthread_condattr_t cattr; + int ret; + + ret = pthread_condattr_init(&cattr); + assert(ret == 0); +#ifdef CONFIG_PTHREAD_CONDATTR_SETCLOCK + ret = pthread_condattr_setclock(&cattr, 
CLOCK_MONOTONIC); + assert(ret == 0); +#endif + pthread_cond_init(&thread->cond, &cattr); + pthread_cond_init(&thread->done_cond, &cattr); + pthread_mutex_init(&thread->lock, NULL); + pthread_mutex_init(&thread->done_lock, NULL); + thread->exit = 0; +} + +static void exit_thread(struct thread_data *thread, + void fn(struct writer_thread *), + struct writer_thread *wt) +{ + __sync_fetch_and_add(&thread->exit, 1); + pthread_cond_signal(&thread->cond); + + while (!thread->done) { + pthread_mutex_lock(&thread->done_lock); + + if (fn) { + struct timespec ts; + +#ifdef CONFIG_PTHREAD_CONDATTR_SETCLOCK + clock_gettime(CLOCK_MONOTONIC, &ts); +#else + clock_gettime(CLOCK_REALTIME, &ts); +#endif + ts.tv_sec++; + + pthread_cond_timedwait(&thread->done_cond, &thread->done_lock, &ts); + fn(wt); + } else + pthread_cond_wait(&thread->done_cond, &thread->done_lock); + + pthread_mutex_unlock(&thread->done_lock); + } +} + +static int usage(char *argv[]) +{ + fprintf(stderr, "%s: [-b blocksize] [-t max usec] [-w separate writer] -f file\n", argv[0]); + return 1; +} + +static int parse_options(int argc, char *argv[]) +{ + int c; + + while ((c = getopt(argc, argv, "f:b:t:w:")) != -1) { + switch (c) { + case 'f': + if (file) + return usage(argv); + file = strdup(optarg); + break; + case 'b': + bs = atoi(optarg); + break; + case 't': + max_us = atoi(optarg); + break; + case 'w': + separate_writer = atoi(optarg); + if (!separate_writer) + fprintf(stderr, "inline writing is broken\n"); + break; + case '?': + default: + return usage(argv); + } + } + + if (!file) + return usage(argv); + + return 0; +} + +static void prune_done_entries(struct writer_thread *wt) +{ + FLIST_HEAD(list); + + if (flist_empty(&wt->done_list)) + return; + + if (pthread_mutex_trylock(&wt->thread.lock)) + return; + + if (!flist_empty(&wt->done_list)) + flist_splice_init(&wt->done_list, &list); + pthread_mutex_unlock(&wt->thread.lock); + + while (!flist_empty(&list)) { + struct work_item *work; + + work = flist_first_entry(&list, struct work_item, list); + flist_del(&work->list); + + pthread_cond_destroy(&work->cond); + pthread_mutex_destroy(&work->lock); + free(work->buf); + free(work); + } +} + +int main(int argc, char *argv[]) +{ + pthread_condattr_t cattr; + struct timespec s, re, we; + struct reader_thread *rt; + struct writer_thread *wt; + unsigned long rate; + uint64_t elapsed; + struct stat sb; + size_t bytes; + off_t off; + int fd, seq; + int ret; + + if (parse_options(argc, argv)) + return 1; + + fd = open(file, O_RDONLY); + if (fd < 0) { + perror("open"); + return 2; + } + + if (fstat(fd, &sb) < 0) { + perror("stat"); + return 3; + } + + wt = &writer_thread; + init_thread(&wt->thread); + INIT_FLIST_HEAD(&wt->list); + INIT_FLIST_HEAD(&wt->done_list); + wt->s.max = 0; + wt->s.min = -1U; + pthread_create(&wt->thread.thread, NULL, writer_fn, wt); + + rt = &reader_thread; + init_thread(&rt->thread); + INIT_FLIST_HEAD(&rt->list); + INIT_FLIST_HEAD(&rt->done_list); + rt->s.max = 0; + rt->s.min = -1U; + rt->write_seq = 1; + + off = 0; + seq = 0; + bytes = 0; + + ret = pthread_condattr_init(&cattr); + assert(ret == 0); +#ifdef CONFIG_PTHREAD_CONDATTR_SETCLOCK + ret = pthread_condattr_setclock(&cattr, CLOCK_MONOTONIC); + assert(ret == 0); +#endif + + clock_gettime(CLOCK_MONOTONIC, &s); + + while (sb.st_size) { + struct work_item *work; + size_t this_len; + struct timespec ts; + + prune_done_entries(wt); + + this_len = sb.st_size; + if (this_len > bs) + this_len = bs; + + work = calloc(1, sizeof(*work)); + work->buf = malloc(this_len); + 
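/*
+ * Annotation (not in the original patch): each work item describes one
+ * chunk for the reader thread -- destination buffer, file offset, source
+ * fd, and a sequence number that the writer thread matches (via
+ * find_seq()) so chunks are written to stdout in order.
+ */
+ 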
work->buf_size = this_len;
+        work->off = off;
+        work->fd = fd;
+        work->seq = ++seq;
+        work->writer = wt;
+        work->reader = rt;
+        pthread_cond_init(&work->cond, &cattr);
+        pthread_mutex_init(&work->lock, NULL);
+
+        queue_work(rt, work);
+
+#ifdef CONFIG_PTHREAD_CONDATTR_SETCLOCK
+        clock_gettime(CLOCK_MONOTONIC, &ts);
+#else
+        clock_gettime(CLOCK_REALTIME, &ts);
+#endif
+        ts.tv_nsec += max_us * 1000ULL;
+        if (ts.tv_nsec >= 1000000000ULL) {
+            ts.tv_nsec -= 1000000000ULL;
+            ts.tv_sec++;
+        }
+
+        pthread_mutex_lock(&work->lock);
+        pthread_cond_timedwait(&work->cond, &work->lock, &ts);
+        pthread_mutex_unlock(&work->lock);
+
+        off += this_len;
+        sb.st_size -= this_len;
+        bytes += this_len;
+    }
+
+    exit_thread(&rt->thread, NULL, NULL);
+    clock_gettime(CLOCK_MONOTONIC, &re);
+
+    exit_thread(&wt->thread, prune_done_entries, wt);
+    clock_gettime(CLOCK_MONOTONIC, &we);
+
+    show_latencies(&rt->s, "READERS");
+    show_latencies(&wt->s, "WRITERS");
+
+    bytes /= 1024;
+    elapsed = utime_since(&s, &re);
+    rate = elapsed ? (bytes * 1000UL * 1000UL) / elapsed : 0;
+    fprintf(stderr, "Read rate (KiB/sec) : %lu\n", rate);
+    elapsed = utime_since(&s, &we);
+    rate = elapsed ? (bytes * 1000UL * 1000UL) / elapsed : 0;
+    fprintf(stderr, "Write rate (KiB/sec): %lu\n", rate);
+
+    close(fd);
+    return 0;
+}
diff --git a/t/readonly.py b/t/readonly.py new file mode 100755 index 0000000..43686c9 --- /dev/null +++ b/t/readonly.py @@ -0,0 +1,138 @@
+#!/usr/bin/env python3
+# SPDX-License-Identifier: GPL-2.0-only
+#
+# Copyright (c) 2019 Western Digital Corporation or its affiliates.
+#
+#
+# readonly.py
+#
+# Do some basic tests of the --readonly parameter
+#
+# USAGE
+# python readonly.py [-f fio-executable]
+#
+# EXAMPLES
+# python t/readonly.py
+# python t/readonly.py -f ./fio
+#
+# REQUIREMENTS
+# Python 3.5+
+#
+#
+
+import sys
+import argparse
+import subprocess
+
+
+def parse_args():
+    parser = argparse.ArgumentParser()
+    parser.add_argument('-f', '--fio',
+                        help='path to fio executable (e.g., ./fio)')
+    args = parser.parse_args()
+
+    return args
+
+
+def run_fio(fio, test, index):
+    fio_args = [
+        "--name=readonly",
+        "--ioengine=null",
+        "--time_based",
+        "--runtime=1s",
+        "--size=1M",
+        "--rw={rw}".format(**test),
+    ]
+    if 'readonly-pre' in test:
+        fio_args.insert(0, "--readonly")
+    if 'readonly-post' in test:
+        fio_args.append("--readonly")
+
+    output = subprocess.run([fio] + fio_args, stdout=subprocess.PIPE,
+                            stderr=subprocess.PIPE)
+
+    return output
+
+
+def check_output(output, test):
+    expect_error = False
+    if 'readonly-pre' in test or 'readonly-post' in test:
+        if 'write' in test['rw'] or 'trim' in test['rw']:
+            expect_error = True
+
+#    print(output.stdout)
+#    print(output.stderr)
+
+    if output.returncode == 0:
+        if expect_error:
+            return False
+        else:
+            return True
+    else:
+        if expect_error:
+            return True
+        else:
+            return False
+
+
+if __name__ == '__main__':
+    args = parse_args()
+
+    tests = [
+        {
+            "rw": "randread",
+            "readonly-pre": 1,
+        },
+        {
+            "rw": "randwrite",
+            "readonly-pre": 1,
+        },
+        {
+            "rw": "randtrim",
+            "readonly-pre": 1,
+        },
+        {
+            "rw": "randread",
+            "readonly-post": 1,
+        },
+        {
+            "rw": "randwrite",
+            "readonly-post": 1,
+        },
+        {
+            "rw": "randtrim",
+            "readonly-post": 1,
+        },
+        {
+            "rw": "randread",
+        },
+        {
+            "rw": "randwrite",
+        },
+        {
+            "rw": "randtrim",
+        },
+    ]
+
+    index = 1
+    passed = 0
+    failed = 0
+
+    if args.fio:
+        fio_path = args.fio
+    else:
+        fio_path = 'fio'
+
+    for test in tests:
+        output = run_fio(fio_path, test, index)
+        status = check_output(output, test)
+        print("Test {0} 
{1}".format(index, ("PASSED" if status else "FAILED"))) + if status: + passed = passed + 1 + else: + failed = failed + 1 + index = index + 1 + + print("{0} tests passed, {1} failed".format(passed, failed)) + + sys.exit(failed) diff --git a/t/run-fio-tests.py b/t/run-fio-tests.py new file mode 100755 index 0000000..36fcb2f --- /dev/null +++ b/t/run-fio-tests.py @@ -0,0 +1,906 @@ +#!/usr/bin/env python3 +# SPDX-License-Identifier: GPL-2.0-only +# +# Copyright (c) 2019 Western Digital Corporation or its affiliates. +# +""" +# run-fio-tests.py +# +# Automate running of fio tests +# +# USAGE +# python3 run-fio-tests.py [-r fio-root] [-f fio-path] [-a artifact-root] +# [--skip # # #...] [--run-only # # #...] +# +# +# EXAMPLE +# # git clone git://git.kernel.dk/fio.git +# # cd fio +# # make -j +# # python3 t/run-fio-tests.py +# +# +# REQUIREMENTS +# - Python 3.5 (subprocess.run) +# - Linux (libaio ioengine, zbd tests, etc) +# - The artifact directory must be on a file system that accepts 512-byte IO +# (t0002, t0003, t0004). +# - The artifact directory needs to be on an SSD. Otherwise tests that carry +# out file-based IO will trigger a timeout (t0006). +# - 4 CPUs (t0009) +# - SciPy (steadystate_tests.py) +# - libzbc (zbd tests) +# - root privileges (zbd test) +# - kernel 4.19 or later for zoned null block devices (zbd tests) +# - CUnit support (unittests) +# +""" + +# +# TODO run multiple tests simultaneously +# TODO Add sgunmap tests (requires SAS SSD) +# + +import os +import sys +import json +import time +import shutil +import logging +import argparse +import platform +import subprocess +import multiprocessing +from pathlib import Path + + +class FioTest(object): + """Base for all fio tests.""" + + def __init__(self, exe_path, parameters, success): + self.exe_path = exe_path + self.parameters = parameters + self.success = success + self.output = {} + self.artifact_root = None + self.testnum = None + self.test_dir = None + self.passed = True + self.failure_reason = '' + self.command_file = None + self.stdout_file = None + self.stderr_file = None + self.exitcode_file = None + + def setup(self, artifact_root, testnum): + """Setup instance variables for test.""" + + self.artifact_root = artifact_root + self.testnum = testnum + self.test_dir = os.path.join(artifact_root, "{:04d}".format(testnum)) + if not os.path.exists(self.test_dir): + os.mkdir(self.test_dir) + + self.command_file = os.path.join( + self.test_dir, + "{0}.command".format(os.path.basename(self.exe_path))) + self.stdout_file = os.path.join( + self.test_dir, + "{0}.stdout".format(os.path.basename(self.exe_path))) + self.stderr_file = os.path.join( + self.test_dir, + "{0}.stderr".format(os.path.basename(self.exe_path))) + self.exitcode_file = os.path.join( + self.test_dir, + "{0}.exitcode".format(os.path.basename(self.exe_path))) + + def run(self): + """Run the test.""" + + raise NotImplementedError() + + def check_result(self): + """Check test results.""" + + raise NotImplementedError() + + +class FioExeTest(FioTest): + """Test consists of an executable binary or script""" + + def __init__(self, exe_path, parameters, success): + """Construct a FioExeTest which is a FioTest consisting of an + executable binary or script. 
+ + exe_path: location of executable binary or script + parameters: list of parameters for executable + success: Definition of test success + """ + + FioTest.__init__(self, exe_path, parameters, success) + + def run(self): + """Execute the binary or script described by this instance.""" + + if self.parameters: + command = [self.exe_path] + self.parameters + else: + command = [self.exe_path] + command_file = open(self.command_file, "w+") + command_file.write("%s\n" % command) + command_file.close() + + stdout_file = open(self.stdout_file, "w+") + stderr_file = open(self.stderr_file, "w+") + exitcode_file = open(self.exitcode_file, "w+") + try: + proc = None + # Avoid using subprocess.run() here because when a timeout occurs, + # fio will be stopped with SIGKILL. This does not give fio a + # chance to clean up and means that child processes may continue + # running and submitting IO. + proc = subprocess.Popen(command, + stdout=stdout_file, + stderr=stderr_file, + cwd=self.test_dir, + universal_newlines=True) + proc.communicate(timeout=self.success['timeout']) + exitcode_file.write('{0}\n'.format(proc.returncode)) + logging.debug("Test %d: return code: %d", self.testnum, proc.returncode) + self.output['proc'] = proc + except subprocess.TimeoutExpired: + proc.terminate() + proc.communicate() + assert proc.poll() + self.output['failure'] = 'timeout' + except Exception: + if proc: + if not proc.poll(): + proc.terminate() + proc.communicate() + self.output['failure'] = 'exception' + self.output['exc_info'] = sys.exc_info() + finally: + stdout_file.close() + stderr_file.close() + exitcode_file.close() + + def check_result(self): + """Check results of test run.""" + + if 'proc' not in self.output: + if self.output['failure'] == 'timeout': + self.failure_reason = "{0} timeout,".format(self.failure_reason) + else: + assert self.output['failure'] == 'exception' + self.failure_reason = '{0} exception: {1}, {2}'.format( + self.failure_reason, self.output['exc_info'][0], + self.output['exc_info'][1]) + + self.passed = False + return + + if 'zero_return' in self.success: + if self.success['zero_return']: + if self.output['proc'].returncode != 0: + self.passed = False + self.failure_reason = "{0} non-zero return code,".format(self.failure_reason) + else: + if self.output['proc'].returncode == 0: + self.failure_reason = "{0} zero return code,".format(self.failure_reason) + self.passed = False + + stderr_size = os.path.getsize(self.stderr_file) + if 'stderr_empty' in self.success: + if self.success['stderr_empty']: + if stderr_size != 0: + self.failure_reason = "{0} stderr not empty,".format(self.failure_reason) + self.passed = False + else: + if stderr_size == 0: + self.failure_reason = "{0} stderr empty,".format(self.failure_reason) + self.passed = False + + +class FioJobTest(FioExeTest): + """Test consists of a fio job""" + + def __init__(self, fio_path, fio_job, success, fio_pre_job=None, + fio_pre_success=None, output_format="normal"): + """Construct a FioJobTest which is a FioExeTest consisting of a + single fio job file with an optional setup step. 
+ + fio_path: location of fio executable + fio_job: location of fio job file + success: Definition of test success + fio_pre_job: fio job for preconditioning + fio_pre_success: Definition of test success for fio precon job + output_format: normal (default), json, jsonplus, or terse + """ + + self.fio_job = fio_job + self.fio_pre_job = fio_pre_job + self.fio_pre_success = fio_pre_success if fio_pre_success else success + self.output_format = output_format + self.precon_failed = False + self.json_data = None + self.fio_output = "{0}.output".format(os.path.basename(self.fio_job)) + self.fio_args = [ + "--output-format={0}".format(self.output_format), + "--output={0}".format(self.fio_output), + self.fio_job, + ] + FioExeTest.__init__(self, fio_path, self.fio_args, success) + + def setup(self, artifact_root, testnum): + """Setup instance variables for fio job test.""" + + super(FioJobTest, self).setup(artifact_root, testnum) + + self.command_file = os.path.join( + self.test_dir, + "{0}.command".format(os.path.basename(self.fio_job))) + self.stdout_file = os.path.join( + self.test_dir, + "{0}.stdout".format(os.path.basename(self.fio_job))) + self.stderr_file = os.path.join( + self.test_dir, + "{0}.stderr".format(os.path.basename(self.fio_job))) + self.exitcode_file = os.path.join( + self.test_dir, + "{0}.exitcode".format(os.path.basename(self.fio_job))) + + def run_pre_job(self): + """Run fio job precondition step.""" + + precon = FioJobTest(self.exe_path, self.fio_pre_job, + self.fio_pre_success, + output_format=self.output_format) + precon.setup(self.artifact_root, self.testnum) + precon.run() + precon.check_result() + self.precon_failed = not precon.passed + self.failure_reason = precon.failure_reason + + def run(self): + """Run fio job test.""" + + if self.fio_pre_job: + self.run_pre_job() + + if not self.precon_failed: + super(FioJobTest, self).run() + else: + logging.debug("Test %d: precondition step failed", self.testnum) + + def check_result(self): + """Check fio job results.""" + + if self.precon_failed: + self.passed = False + self.failure_reason = "{0} precondition step failed,".format(self.failure_reason) + return + + super(FioJobTest, self).check_result() + + if not self.passed: + return + + if 'json' not in self.output_format: + return + + try: + with open(os.path.join(self.test_dir, self.fio_output), "r") as output_file: + file_data = output_file.read() + except EnvironmentError: + self.failure_reason = "{0} unable to open output file,".format(self.failure_reason) + self.passed = False + return + + # + # Sometimes fio informational messages are included at the top of the + # JSON output, especially under Windows. 
Try to decode output as JSON + # data, lopping off up to the first four lines + # + lines = file_data.splitlines() + for i in range(5): + file_data = '\n'.join(lines[i:]) + try: + self.json_data = json.loads(file_data) + except json.JSONDecodeError: + continue + else: + logging.debug("Test %d: skipped %d lines decoding JSON data", self.testnum, i) + return + + self.failure_reason = "{0} unable to decode JSON data,".format(self.failure_reason) + self.passed = False + + +class FioJobTest_t0005(FioJobTest): + """Test consists of fio test job t0005 + Confirm that read['io_kbytes'] == write['io_kbytes'] == 102400""" + + def check_result(self): + super(FioJobTest_t0005, self).check_result() + + if not self.passed: + return + + if self.json_data['jobs'][0]['read']['io_kbytes'] != 102400: + self.failure_reason = "{0} bytes read mismatch,".format(self.failure_reason) + self.passed = False + if self.json_data['jobs'][0]['write']['io_kbytes'] != 102400: + self.failure_reason = "{0} bytes written mismatch,".format(self.failure_reason) + self.passed = False + + +class FioJobTest_t0006(FioJobTest): + """Test consists of fio test job t0006 + Confirm that read['io_kbytes'] ~ 2*write['io_kbytes']""" + + def check_result(self): + super(FioJobTest_t0006, self).check_result() + + if not self.passed: + return + + ratio = self.json_data['jobs'][0]['read']['io_kbytes'] \ + / self.json_data['jobs'][0]['write']['io_kbytes'] + logging.debug("Test %d: ratio: %f", self.testnum, ratio) + if ratio < 1.99 or ratio > 2.01: + self.failure_reason = "{0} read/write ratio mismatch,".format(self.failure_reason) + self.passed = False + + +class FioJobTest_t0007(FioJobTest): + """Test consists of fio test job t0007 + Confirm that read['io_kbytes'] = 87040""" + + def check_result(self): + super(FioJobTest_t0007, self).check_result() + + if not self.passed: + return + + if self.json_data['jobs'][0]['read']['io_kbytes'] != 87040: + self.failure_reason = "{0} bytes read mismatch,".format(self.failure_reason) + self.passed = False + + +class FioJobTest_t0008(FioJobTest): + """Test consists of fio test job t0008 + Confirm that read['io_kbytes'] = 32768 and that + write['io_kbytes'] ~ 16568 + + I did runs with fio-ae2fafc8 and saw write['io_kbytes'] values of + 16585, 16588. 
With two runs of fio-3.16 I obtained 16568""" + + def check_result(self): + super(FioJobTest_t0008, self).check_result() + + if not self.passed: + return + + ratio = self.json_data['jobs'][0]['write']['io_kbytes'] / 16568 + logging.debug("Test %d: ratio: %f", self.testnum, ratio) + + if ratio < 0.99 or ratio > 1.01: + self.failure_reason = "{0} bytes written mismatch,".format(self.failure_reason) + self.passed = False + if self.json_data['jobs'][0]['read']['io_kbytes'] != 32768: + self.failure_reason = "{0} bytes read mismatch,".format(self.failure_reason) + self.passed = False + + +class FioJobTest_t0009(FioJobTest): + """Test consists of fio test job t0009 + Confirm that runtime >= 60s""" + + def check_result(self): + super(FioJobTest_t0009, self).check_result() + + if not self.passed: + return + + logging.debug('Test %d: elapsed: %d', self.testnum, self.json_data['jobs'][0]['elapsed']) + + if self.json_data['jobs'][0]['elapsed'] < 60: + self.failure_reason = "{0} elapsed time mismatch,".format(self.failure_reason) + self.passed = False + + +class FioJobTest_t0011(FioJobTest): + """Test consists of fio test job t0009 + Confirm that job0 iops == 1000 + and that job1_iops / job0_iops ~ 8 + With two runs of fio-3.16 I observed a ratio of 8.3""" + + def check_result(self): + super(FioJobTest_t0011, self).check_result() + + if not self.passed: + return + + iops1 = self.json_data['jobs'][0]['read']['iops'] + iops2 = self.json_data['jobs'][1]['read']['iops'] + ratio = iops2 / iops1 + logging.debug("Test %d: iops1: %f", self.testnum, iops1) + logging.debug("Test %d: ratio: %f", self.testnum, ratio) + + if iops1 < 998 or iops1 > 1002: + self.failure_reason = "{0} iops value mismatch,".format(self.failure_reason) + self.passed = False + + if ratio < 7 or ratio > 9: + self.failure_reason = "{0} iops ratio mismatch,".format(self.failure_reason) + self.passed = False + + +class Requirements(object): + """Requirements consists of multiple run environment characteristics. 
These are used to determine whether a particular test can be run"""
+
+    _linux = False
+    _libaio = False
+    _zbd = False
+    _root = False
+    _zoned_nullb = False
+    _not_macos = False
+    _not_windows = False
+    _unittests = False
+    _cpucount4 = False
+
+    def __init__(self, fio_root):
+        Requirements._not_macos = platform.system() != "Darwin"
+        Requirements._not_windows = platform.system() != "Windows"
+        Requirements._linux = platform.system() == "Linux"
+
+        if Requirements._linux:
+            try:
+                config_file = os.path.join(fio_root, "config-host.h")
+                with open(config_file, "r") as config:
+                    contents = config.read()
+            except Exception:
+                print("Unable to open {0} to check requirements".format(config_file))
+                Requirements._zbd = True
+            else:
+                Requirements._zbd = "CONFIG_LINUX_BLKZONED" in contents
+                Requirements._libaio = "CONFIG_LIBAIO" in contents
+
+            Requirements._root = (os.geteuid() == 0)
+            if Requirements._zbd and Requirements._root:
+                subprocess.run(["modprobe", "null_blk"],
+                               stdout=subprocess.PIPE,
+                               stderr=subprocess.PIPE)
+                if os.path.exists("/sys/module/null_blk/parameters/zoned"):
+                    Requirements._zoned_nullb = True
+
+        if platform.system() == "Windows":
+            utest_exe = "unittest.exe"
+        else:
+            utest_exe = "unittest"
+        unittest_path = os.path.join(fio_root, "unittests", utest_exe)
+        Requirements._unittests = os.path.exists(unittest_path)
+
+        Requirements._cpucount4 = multiprocessing.cpu_count() >= 4
+
+        req_list = [Requirements.linux,
+                    Requirements.libaio,
+                    Requirements.zbd,
+                    Requirements.root,
+                    Requirements.zoned_nullb,
+                    Requirements.not_macos,
+                    Requirements.not_windows,
+                    Requirements.unittests,
+                    Requirements.cpucount4]
+        for req in req_list:
+            value, desc = req()
+            logging.debug("Requirements: Requirement '%s' met? %s", desc, value)
+
+    @classmethod
+    def linux(cls):
+        """Are we running on Linux?"""
+        return Requirements._linux, "Linux required"
+
+    @classmethod
+    def libaio(cls):
+        """Is libaio available?"""
+        return Requirements._libaio, "libaio required"
+
+    @classmethod
+    def zbd(cls):
+        """Is ZBD support available?"""
+        return Requirements._zbd, "Zoned block device support required"
+
+    @classmethod
+    def root(cls):
+        """Are we running as root?"""
+        return Requirements._root, "root required"
+
+    @classmethod
+    def zoned_nullb(cls):
+        """Are zoned null block devices available?"""
+        return Requirements._zoned_nullb, "Zoned null block device support required"
+
+    @classmethod
+    def not_macos(cls):
+        """Are we running on a platform other than macOS?"""
+        return Requirements._not_macos, "platform other than macOS required"
+
+    @classmethod
+    def not_windows(cls):
+        """Are we running on a platform other than Windows?"""
+        return Requirements._not_windows, "platform other than Windows required"
+
+    @classmethod
+    def unittests(cls):
+        """Were unittests built?"""
+        return Requirements._unittests, "Unittests support required"
+
+    @classmethod
+    def cpucount4(cls):
+        """Do we have at least 4 CPUs?"""
+        return Requirements._cpucount4, "4+ CPUs required"
+
+
+SUCCESS_DEFAULT = {
+    'zero_return': True,
+    'stderr_empty': True,
+    'timeout': 600,
+    }
+SUCCESS_NONZERO = {
+    'zero_return': False,
+    'stderr_empty': False,
+    'timeout': 600,
+    }
+SUCCESS_STDERR = {
+    'zero_return': True,
+    'stderr_empty': False,
+    'timeout': 600,
+    }
+TEST_LIST = [
+    {
+        'test_id': 1,
+        'test_class': FioJobTest,
+        'job': 't0001-52c58027.fio',
+        'success': SUCCESS_DEFAULT,
+        'pre_job': None,
+        'pre_success': None,
+        'requirements': [],
+    },
+    {
+        'test_id': 2,
+        'test_class': FioJobTest,
+        'job': 't0002-13af05ae-post.fio',
+ 
'success': SUCCESS_DEFAULT, + 'pre_job': 't0002-13af05ae-pre.fio', + 'pre_success': None, + 'requirements': [Requirements.linux, Requirements.libaio], + }, + { + 'test_id': 3, + 'test_class': FioJobTest, + 'job': 't0003-0ae2c6e1-post.fio', + 'success': SUCCESS_NONZERO, + 'pre_job': 't0003-0ae2c6e1-pre.fio', + 'pre_success': SUCCESS_DEFAULT, + 'requirements': [Requirements.linux, Requirements.libaio], + }, + { + 'test_id': 4, + 'test_class': FioJobTest, + 'job': 't0004-8a99fdf6.fio', + 'success': SUCCESS_DEFAULT, + 'pre_job': None, + 'pre_success': None, + 'requirements': [Requirements.linux, Requirements.libaio], + }, + { + 'test_id': 5, + 'test_class': FioJobTest_t0005, + 'job': 't0005-f7078f7b.fio', + 'success': SUCCESS_DEFAULT, + 'pre_job': None, + 'pre_success': None, + 'output_format': 'json', + 'requirements': [Requirements.not_windows], + }, + { + 'test_id': 6, + 'test_class': FioJobTest_t0006, + 'job': 't0006-82af2a7c.fio', + 'success': SUCCESS_DEFAULT, + 'pre_job': None, + 'pre_success': None, + 'output_format': 'json', + 'requirements': [Requirements.linux, Requirements.libaio], + }, + { + 'test_id': 7, + 'test_class': FioJobTest_t0007, + 'job': 't0007-37cf9e3c.fio', + 'success': SUCCESS_DEFAULT, + 'pre_job': None, + 'pre_success': None, + 'output_format': 'json', + 'requirements': [], + }, + { + 'test_id': 8, + 'test_class': FioJobTest_t0008, + 'job': 't0008-ae2fafc8.fio', + 'success': SUCCESS_DEFAULT, + 'pre_job': None, + 'pre_success': None, + 'output_format': 'json', + 'requirements': [], + }, + { + 'test_id': 9, + 'test_class': FioJobTest_t0009, + 'job': 't0009-f8b0bd10.fio', + 'success': SUCCESS_DEFAULT, + 'pre_job': None, + 'pre_success': None, + 'output_format': 'json', + 'requirements': [Requirements.not_macos, + Requirements.cpucount4], + # mac os does not support CPU affinity + }, + { + 'test_id': 10, + 'test_class': FioJobTest, + 'job': 't0010-b7aae4ba.fio', + 'success': SUCCESS_DEFAULT, + 'pre_job': None, + 'pre_success': None, + 'requirements': [], + }, + { + 'test_id': 11, + 'test_class': FioJobTest_t0011, + 'job': 't0011-5d2788d5.fio', + 'success': SUCCESS_DEFAULT, + 'pre_job': None, + 'pre_success': None, + 'output_format': 'json', + 'requirements': [], + }, + { + 'test_id': 1000, + 'test_class': FioExeTest, + 'exe': 't/axmap', + 'parameters': None, + 'success': SUCCESS_DEFAULT, + 'requirements': [], + }, + { + 'test_id': 1001, + 'test_class': FioExeTest, + 'exe': 't/ieee754', + 'parameters': None, + 'success': SUCCESS_DEFAULT, + 'requirements': [], + }, + { + 'test_id': 1002, + 'test_class': FioExeTest, + 'exe': 't/lfsr-test', + 'parameters': ['0xFFFFFF', '0', '0', 'verify'], + 'success': SUCCESS_STDERR, + 'requirements': [], + }, + { + 'test_id': 1003, + 'test_class': FioExeTest, + 'exe': 't/readonly.py', + 'parameters': ['-f', '{fio_path}'], + 'success': SUCCESS_DEFAULT, + 'requirements': [], + }, + { + 'test_id': 1004, + 'test_class': FioExeTest, + 'exe': 't/steadystate_tests.py', + 'parameters': ['{fio_path}'], + 'success': SUCCESS_DEFAULT, + 'requirements': [], + }, + { + 'test_id': 1005, + 'test_class': FioExeTest, + 'exe': 't/stest', + 'parameters': None, + 'success': SUCCESS_STDERR, + 'requirements': [], + }, + { + 'test_id': 1006, + 'test_class': FioExeTest, + 'exe': 't/strided.py', + 'parameters': ['{fio_path}'], + 'success': SUCCESS_DEFAULT, + 'requirements': [], + }, + { + 'test_id': 1007, + 'test_class': FioExeTest, + 'exe': 't/zbd/run-tests-against-regular-nullb', + 'parameters': None, + 'success': SUCCESS_DEFAULT, + 'requirements': 
[Requirements.linux, Requirements.zbd, + Requirements.root], + }, + { + 'test_id': 1008, + 'test_class': FioExeTest, + 'exe': 't/zbd/run-tests-against-zoned-nullb', + 'parameters': None, + 'success': SUCCESS_DEFAULT, + 'requirements': [Requirements.linux, Requirements.zbd, + Requirements.root, Requirements.zoned_nullb], + }, + { + 'test_id': 1009, + 'test_class': FioExeTest, + 'exe': 'unittests/unittest', + 'parameters': None, + 'success': SUCCESS_DEFAULT, + 'requirements': [Requirements.unittests], + }, + { + 'test_id': 1010, + 'test_class': FioExeTest, + 'exe': 't/latency_percentiles.py', + 'parameters': ['-f', '{fio_path}'], + 'success': SUCCESS_DEFAULT, + 'requirements': [], + }, +] + + +def parse_args(): + """Parse command-line arguments.""" + + parser = argparse.ArgumentParser() + parser.add_argument('-r', '--fio-root', + help='fio root path') + parser.add_argument('-f', '--fio', + help='path to fio executable (e.g., ./fio)') + parser.add_argument('-a', '--artifact-root', + help='artifact root directory') + parser.add_argument('-s', '--skip', nargs='+', type=int, + help='list of test(s) to skip') + parser.add_argument('-o', '--run-only', nargs='+', type=int, + help='list of test(s) to run, skipping all others') + parser.add_argument('-d', '--debug', action='store_true', + help='provide debug output') + parser.add_argument('-k', '--skip-req', action='store_true', + help='skip requirements checking') + args = parser.parse_args() + + return args + + +def main(): + """Entry point.""" + + args = parse_args() + if args.debug: + logging.basicConfig(level=logging.DEBUG) + else: + logging.basicConfig(level=logging.INFO) + + if args.fio_root: + fio_root = args.fio_root + else: + fio_root = str(Path(__file__).absolute().parent.parent) + print("fio root is %s" % fio_root) + + if args.fio: + fio_path = args.fio + else: + if platform.system() == "Windows": + fio_exe = "fio.exe" + else: + fio_exe = "fio" + fio_path = os.path.join(fio_root, fio_exe) + print("fio path is %s" % fio_path) + if not shutil.which(fio_path): + print("Warning: fio executable not found") + + artifact_root = args.artifact_root if args.artifact_root else \ + "fio-test-{0}".format(time.strftime("%Y%m%d-%H%M%S")) + os.mkdir(artifact_root) + print("Artifact directory is %s" % artifact_root) + + if not args.skip_req: + req = Requirements(fio_root) + + passed = 0 + failed = 0 + skipped = 0 + + for config in TEST_LIST: + if (args.skip and config['test_id'] in args.skip) or \ + (args.run_only and config['test_id'] not in args.run_only): + skipped = skipped + 1 + print("Test {0} SKIPPED (User request)".format(config['test_id'])) + continue + + if issubclass(config['test_class'], FioJobTest): + if config['pre_job']: + fio_pre_job = os.path.join(fio_root, 't', 'jobs', + config['pre_job']) + else: + fio_pre_job = None + if config['pre_success']: + fio_pre_success = config['pre_success'] + else: + fio_pre_success = None + if 'output_format' in config: + output_format = config['output_format'] + else: + output_format = 'normal' + test = config['test_class']( + fio_path, + os.path.join(fio_root, 't', 'jobs', config['job']), + config['success'], + fio_pre_job=fio_pre_job, + fio_pre_success=fio_pre_success, + output_format=output_format) + elif issubclass(config['test_class'], FioExeTest): + exe_path = os.path.join(fio_root, config['exe']) + if config['parameters']: + parameters = [p.format(fio_path=fio_path) for p in config['parameters']] + else: + parameters = None + if Path(exe_path).suffix == '.py' and platform.system() == "Windows": + if 
parameters:
+                    parameters.insert(0, exe_path)
+                else:
+                    parameters = [exe_path]
+                exe_path = "python.exe"
+            test = config['test_class'](exe_path, parameters,
+                                        config['success'])
+        else:
+            print("Test {0} FAILED: unable to process test config".format(config['test_id']))
+            failed = failed + 1
+            continue
+
+        if not args.skip_req:
+            reqs_met = True
+            for req in config['requirements']:
+                reqs_met, reason = req()
+                logging.debug("Test %d: Requirement '%s' met? %s", config['test_id'], reason,
+                              reqs_met)
+                if not reqs_met:
+                    break
+            if not reqs_met:
+                print("Test {0} SKIPPED ({1})".format(config['test_id'], reason))
+                skipped = skipped + 1
+                continue
+
+        test.setup(artifact_root, config['test_id'])
+        test.run()
+        test.check_result()
+        if test.passed:
+            result = "PASSED"
+            passed = passed + 1
+        else:
+            result = "FAILED: {0}".format(test.failure_reason)
+            failed = failed + 1
+            with open(test.stderr_file, "r") as stderr_file:
+                logging.debug("Test %d: stderr:\n%s", config['test_id'], stderr_file.read())
+            with open(test.stdout_file, "r") as stdout_file:
+                logging.debug("Test %d: stdout:\n%s", config['test_id'], stdout_file.read())
+        print("Test {0} {1}".format(config['test_id'], result))
+
+    print("{0} test(s) passed, {1} failed, {2} skipped".format(passed, failed, skipped))
+
+    sys.exit(failed)
+
+
+if __name__ == '__main__':
+    main()
diff --git a/t/sgunmap-perf.py b/t/sgunmap-perf.py new file mode 100755 index 0000000..fadbb85 --- /dev/null +++ b/t/sgunmap-perf.py @@ -0,0 +1,115 @@
+#!/usr/bin/python2.7
+#
+# sgunmap-perf.py
+#
+# Basic performance testing using fio's sg ioengine
+#
+# USAGE
+# sgunmap-perf.py char-device block-device candidate-fio reference-fio
+#
+# EXAMPLE
+# t/sgunmap-perf.py /dev/sg1 /dev/sdb ./fio-candidate ./fio-reference
+#
+# REQUIREMENTS
+# Python 2.6+
+#
+#
+
+from __future__ import absolute_import
+from __future__ import print_function
+import sys
+import json
+import argparse
+import subprocess
+from six.moves import range
+
+
+def parse_args():
+    parser = argparse.ArgumentParser()
+    parser.add_argument('cdev',
+                        help='character device target (e.g., /dev/sg0)')
+    parser.add_argument('bdev',
+                        help='block device target (e.g., /dev/sda)')
+    parser.add_argument('fioc',
+                        help='path to candidate fio executable (e.g., ./fio)')
+    parser.add_argument('fior',
+                        help='path to reference fio executable (e.g., ./fio)')
+    args = parser.parse_args()
+
+    return args
+
+
+def fulldevice(fio, dev, ioengine='psync', rw='trim', bs='1M'):
+    parameters = ["--name=test",
+                  "--output-format=json",
+                  "--random_generator=lfsr",
+                  "--bs={0}".format(bs),
+                  "--rw={0}".format(rw),
+                  "--ioengine={0}".format(ioengine),
+                  "--filename={0}".format(dev)]
+
+    output = subprocess.check_output([fio] + parameters)
+    jsondata = json.loads(output)
+    jobdata = jsondata['jobs'][0]
+    return jobdata
+
+
+def runtest(fio, dev, rw, qd, batch, bs='512', runtime='30s'):
+    parameters = ["--name=test",
+                  "--random_generator=tausworthe64",
+                  "--time_based",
+                  "--runtime={0}".format(runtime),
+                  "--output-format=json",
+                  "--ioengine=sg",
+                  "--blocksize={0}".format(bs),
+                  "--rw={0}".format(rw),
+                  "--filename={0}".format(dev),
+                  "--iodepth={0}".format(qd),
+                  "--iodepth_batch={0}".format(batch)]
+
+    output = subprocess.check_output([fio] + parameters)
+    jsondata = json.loads(output)
+    jobdata = jsondata['jobs'][0]
+#    print(parameters)
+
+    return jobdata
+
+
+def runtests(fio, dev, qd, batch, rw, bs='512', trials=5):
+    iops = []
+    for x in range(trials):
+        jd = runtest(fio, dev, rw, qd, batch, bs=bs)
+        total = jd['read']['iops'] + jd['write']['iops'] + 
jd['trim']['iops']
+#        print(total)
+        iops.extend([total])
+    return iops, (sum(iops) / trials)
+
+if __name__ == '__main__':
+    args = parse_args()
+
+    print("Trimming full device {0}".format(args.cdev))
+    fulldevice(args.fior, args.cdev, ioengine='sg')
+
+    print("Running rand read tests on {0}"
+          " with fio candidate build {1}".format(args.cdev, args.fioc))
+    randread, rrmean = runtests(args.fioc, args.cdev, 16, 1, 'randread',
+                                trials=5)
+    print("IOPS mean {0}, trials {1}".format(rrmean, randread))
+
+    print("Running rand read tests on {0}"
+          " with fio reference build {1}".format(args.cdev, args.fior))
+    randread, rrmean = runtests(args.fior, args.cdev, 16, 1, 'randread',
+                                trials=5)
+    print("IOPS mean {0}, trials {1}".format(rrmean, randread))
+
+    print("Running rand write tests on {0}"
+          " with fio candidate build {1}".format(args.cdev, args.fioc))
+    randwrite, rwmean = runtests(args.fioc, args.cdev, 16, 1, 'randwrite',
+                                 trials=5)
+    print("IOPS mean {0}, trials {1}".format(rwmean, randwrite))
+
+    print("Running rand write tests on {0}"
+          " with fio reference build {1}".format(args.cdev, args.fior))
+    randwrite, rwmean = runtests(args.fior, args.cdev, 16, 1, 'randwrite',
+                                 trials=5)
+    print("IOPS mean {0}, trials {1}".format(rwmean, randwrite))
diff --git a/t/sgunmap-test.py b/t/sgunmap-test.py new file mode 100755 index 0000000..f8f10ab --- /dev/null +++ b/t/sgunmap-test.py @@ -0,0 +1,172 @@
+#!/usr/bin/python2.7
+# Note: this script is python2 and python3 compatible.
+#
+# sgunmap-test.py
+#
+# Limited functionality test for trim workloads using fio's sg ioengine
+# This checks only the three sets of reported iodepths
+#
+# !!!WARNING!!!
+# This script carries out destructive tests. Be sure that
+# there is no data you want to keep on the supplied devices. 
+# +# USAGE +# sgunmap-test.py char-device block-device fio-executable +# +# EXAMPLE +# t/sgunmap-test.py /dev/sg1 /dev/sdb ./fio +# +# REQUIREMENTS +# Python 2.6+ +# +# TEST MATRIX +# For both char-dev and block-dev these are the expected +# submit/complete IO depths +# +# blockdev chardev +# iodepth iodepth +# R QD1 sub/comp: 1-4=100% sub/comp: 1-4=100% +# W QD1 sub/comp: 1-4=100% sub/comp: 1-4=100% +# T QD1 sub/comp: 1-4=100% sub/comp: 1-4=100% +# +# R QD16, batch8 sub/comp: 1-4=100% sub/comp: 1-4=100% +# W QD16, batch8 sub/comp: 1-4=100% sub/comp: 1-4=100% +# T QD16, batch8 sub/comp: 1-4=100% sub/comp: 5-8=100% +# +# R QD16, batch16 sub/comp: 1-4=100% sub/comp: 1-4=100% +# W QD16, batch16 sub/comp: 1-4=100% sub/comp: 1-4=100% +# T QD16, batch16 sub/comp: 1-4=100% sub/comp: 9-16=100% +# + +from __future__ import absolute_import +from __future__ import print_function +import sys +import json +import argparse +import traceback +import subprocess + + +def parse_args(): + parser = argparse.ArgumentParser() + parser.add_argument('chardev', + help='character device target (e.g., /dev/sg0)') + parser.add_argument('blockdev', + help='block device target (e.g., /dev/sda)') + parser.add_argument('fio', + help='path to fio executable (e.g., ./fio)') + args = parser.parse_args() + + return args + +# +# With block devices, +# iodepth = 1 always +# submit = complete = 1-4 always +# With character devices, +# RW +# iodepth = qd +# submit = 1-4 +# complete = 1-4 except for the IOs in flight +# when the job is ending +# T +# iodepth = qd +# submit = qdbatch +# complete = qdbatch except for the IOs in flight +# when the job is ending +# + + +def check(jsondata, parameters, block, qd, qdbatch, rw): + iodepth = jsondata['iodepth_level'] + submit = jsondata['iodepth_submit'] + complete = jsondata['iodepth_complete'] + + try: + if block: + assert iodepth['1'] == 100.0 + assert submit['4'] == 100.0 + assert complete['4'] == 100.0 + elif 'read' in rw or 'write' in rw: + assert iodepth[str(qd)] > 99.9 + assert submit['4'] == 100.0 + assert complete['4'] > 99.9 + else: + if qdbatch <= 4: + batchkey = '4' + elif qdbatch > 64: + batchkey = '>=64' + else: + batchkey = str(qdbatch) + if qd >= 64: + qdkey = ">=64" + else: + qdkey = str(qd) + assert iodepth[qdkey] > 99 + assert submit[batchkey] == 100.0 + assert complete[batchkey] > 99 + except AssertionError: + print("Assertion failed") + traceback.print_exc() + print(jsondata) + return + + print("**********passed*********") + + +def runalltests(args, qd, batch): + block = False + for dev in [args.chardev, args.blockdev]: + for rw in ["randread", "randwrite", "randtrim"]: + parameters = ["--name=test", + "--time_based", + "--runtime=30s", + "--output-format=json", + "--ioengine=sg", + "--rw={0}".format(rw), + "--filename={0}".format(dev), + "--iodepth={0}".format(qd), + "--iodepth_batch={0}".format(batch)] + + print(parameters) + output = subprocess.check_output([args.fio] + parameters) + jsondata = json.loads(output) + jobdata = jsondata['jobs'][0] + check(jobdata, parameters, block, qd, batch, rw) + block = True + + +def runcdevtrimtest(args, qd, batch): + parameters = ["--name=test", + "--time_based", + "--runtime=30s", + "--output-format=json", + "--ioengine=sg", + "--rw=randtrim", + "--filename={0}".format(args.chardev), + "--iodepth={0}".format(qd), + "--iodepth_batch={0}".format(batch)] + + print(parameters) + output = subprocess.check_output([args.fio] + parameters) + jsondata = json.loads(output) + jobdata = jsondata['jobs'][0] + check(jobdata, parameters, 
False, qd, batch, "randtrim") + + +if __name__ == '__main__': + args = parse_args() + + runcdevtrimtest(args, 32, 2) + runcdevtrimtest(args, 32, 4) + runcdevtrimtest(args, 32, 8) + runcdevtrimtest(args, 64, 4) + runcdevtrimtest(args, 64, 8) + runcdevtrimtest(args, 64, 16) + runcdevtrimtest(args, 128, 8) + runcdevtrimtest(args, 128, 16) + runcdevtrimtest(args, 128, 32) + + runalltests(args, 1, 1) + runalltests(args, 16, 2) + runalltests(args, 16, 16) diff --git a/t/steadystate_tests.py b/t/steadystate_tests.py new file mode 100755 index 0000000..b55a67a --- /dev/null +++ b/t/steadystate_tests.py @@ -0,0 +1,231 @@ +#!/usr/bin/env python +# Note: this script is python2 and python3 compatible. +# +# steadystate_tests.py +# +# Test option parsing and functonality for fio's steady state detection feature. +# +# steadystate_tests.py --read file-for-read-testing --write file-for-write-testing ./fio +# +# REQUIREMENTS +# Python 2.6+ +# SciPy +# +# KNOWN ISSUES +# only option parsing and read tests are carried out +# On Windows this script works under Cygwin but not from cmd.exe +# On Windows I encounter frequent fio problems generating JSON output (nothing to decode) +# min runtime: +# if ss attained: min runtime = ss_dur + ss_ramp +# if not attained: runtime = timeout + +from __future__ import absolute_import +from __future__ import print_function +import os +import sys +import json +import pprint +import argparse +import subprocess +from scipy import stats + +def parse_args(): + parser = argparse.ArgumentParser() + parser.add_argument('fio', help='path to fio executable') + args = parser.parse_args() + + return args + + +def check(data, iops, slope, pct, limit, dur, criterion): + measurement = 'iops' if iops else 'bw' + data = data[measurement] + mean = sum(data) / len(data) + if slope: + x = list(range(len(data))) + m, intercept, r_value, p_value, std_err = stats.linregress(x,data) + m = abs(m) + if pct: + target = (m / mean * 100) if mean != 0 else 0 + criterion = criterion[:-1] + else: + target = m + else: + maxdev = 0 + for x in data: + maxdev = max(abs(mean-x), maxdev) + if pct: + target = maxdev / mean * 100 + criterion = criterion[:-1] + else: + target = maxdev + + criterion = float(criterion) + if criterion == 0.0: + objsame = False + else: + objsame = abs(target - criterion) / criterion < 0.005 + return (objsame, target < limit, mean, target) + + +if __name__ == '__main__': + args = parse_args() + + pp = pprint.PrettyPrinter(indent=4) + + passed = 0 + failed = 0 + +# +# test option parsing +# + parsing = [ { 'args': ["--parse-only", "--debug=parse", "--ss_dur=10s", "--ss=iops:10", "--ss_ramp=5"], + 'output': "set steady state IOPS threshold to 10.000000" }, + { 'args': ["--parse-only", "--debug=parse", "--ss_dur=10s", "--ss=iops:10%", "--ss_ramp=5"], + 'output': "set steady state threshold to 10.000000%" }, + { 'args': ["--parse-only", "--debug=parse", "--ss_dur=10s", "--ss=iops:.1%", "--ss_ramp=5"], + 'output': "set steady state threshold to 0.100000%" }, + { 'args': ["--parse-only", "--debug=parse", "--ss_dur=10s", "--ss=bw:10%", "--ss_ramp=5"], + 'output': "set steady state threshold to 10.000000%" }, + { 'args': ["--parse-only", "--debug=parse", "--ss_dur=10s", "--ss=bw:.1%", "--ss_ramp=5"], + 'output': "set steady state threshold to 0.100000%" }, + { 'args': ["--parse-only", "--debug=parse", "--ss_dur=10s", "--ss=bw:12", "--ss_ramp=5"], + 'output': "set steady state BW threshold to 12" }, + ] + for test in parsing: + output = subprocess.check_output([args.fio] + test['args']) + if 
test['output'] in output.decode(): + print("PASSED '{0}' found with arguments {1}".format(test['output'], test['args'])) + passed = passed + 1 + else: + print("FAILED '{0}' NOT found with arguments {1}".format(test['output'], test['args'])) + failed = failed + 1 + +# +# test some read workloads +# +# if ss active and attained, +# check that runtime is less than job time +# check criteria +# how to check ramp time? +# +# if ss inactive +# check that runtime is what was specified +# + reads = [ {'s': True, 'timeout': 100, 'numjobs': 1, 'ss_dur': 5, 'ss_ramp': 3, 'iops': True, 'slope': True, 'ss_limit': 0.1, 'pct': True}, + {'s': False, 'timeout': 20, 'numjobs': 2}, + {'s': True, 'timeout': 100, 'numjobs': 3, 'ss_dur': 10, 'ss_ramp': 5, 'iops': False, 'slope': True, 'ss_limit': 0.1, 'pct': True}, + {'s': True, 'timeout': 10, 'numjobs': 3, 'ss_dur': 10, 'ss_ramp': 500, 'iops': False, 'slope': True, 'ss_limit': 0.1, 'pct': True}, + ] + + jobnum = 0 + for job in reads: + + tf = "steadystate_job{0}.json".format(jobnum) + parameters = [ "--name=job{0}".format(jobnum) ] + parameters.extend([ "--thread", + "--output-format=json", + "--output={0}".format(tf), + "--ioengine=null", + "--size=1G", + "--rw=randrw", + "--rwmixread=100", + "--stonewall", + "--group_reporting", + "--numjobs={0}".format(job['numjobs']), + "--time_based", + "--runtime={0}".format(job['timeout']) ]) + if job['s']: + if job['iops']: + ss = 'iops' + else: + ss = 'bw' + if job['slope']: + ss += "_slope" + ss += ":" + str(job['ss_limit']) + if job['pct']: + ss += '%' + parameters.extend([ '--ss_dur={0}'.format(job['ss_dur']), + '--ss={0}'.format(ss), + '--ss_ramp={0}'.format(job['ss_ramp']) ]) + + output = subprocess.call([args.fio] + parameters) + with open(tf, 'r') as source: + jsondata = json.loads(source.read()) + source.close() + + for jsonjob in jsondata['jobs']: + line = "{0}".format(jsonjob['job options']['name']) + if job['s']: + if jsonjob['steadystate']['attained'] == 1: + # check runtime >= ss_dur + ss_ramp, check criterion, check criterion < limit + mintime = (job['ss_dur'] + job['ss_ramp']) * 1000 + actual = jsonjob['read']['runtime'] + if mintime > actual: + line = 'FAILED ' + line + ' ss attained, runtime {0} < ss_dur {1} + ss_ramp {2}'.format(actual, job['ss_dur'], job['ss_ramp']) + failed = failed + 1 + else: + line = line + ' ss attained, runtime {0} > ss_dur {1} + ss_ramp {2},'.format(actual, job['ss_dur'], job['ss_ramp']) + objsame, met, mean, target = check(data=jsonjob['steadystate']['data'], + iops=job['iops'], + slope=job['slope'], + pct=job['pct'], + limit=job['ss_limit'], + dur=job['ss_dur'], + criterion=jsonjob['steadystate']['criterion']) + if not objsame: + line = 'FAILED ' + line + ' fio criterion {0} != calculated criterion {1} '.format(jsonjob['steadystate']['criterion'], target) + failed = failed + 1 + else: + if met: + line = 'PASSED ' + line + ' target {0} < limit {1}'.format(target, job['ss_limit']) + passed = passed + 1 + else: + line = 'FAILED ' + line + ' target {0} < limit {1} but fio reports ss not attained '.format(target, job['ss_limit']) + failed = failed + 1 + else: + # check runtime, confirm criterion calculation, and confirm that criterion was not met + expected = job['timeout'] * 1000 + actual = jsonjob['read']['runtime'] + if abs(expected - actual) > 50: + line = 'FAILED ' + line + ' ss not attained, expected runtime {0} != actual runtime {1}'.format(expected, actual) + else: + line = line + ' ss not attained, runtime {0} != ss_dur {1} + ss_ramp {2},'.format(actual, job['ss_dur'], 
job['ss_ramp']) + objsame, met, mean, target = check(data=jsonjob['steadystate']['data'], + iops=job['iops'], + slope=job['slope'], + pct=job['pct'], + limit=job['ss_limit'], + dur=job['ss_dur'], + criterion=jsonjob['steadystate']['criterion']) + if not objsame: + if actual > (job['ss_dur'] + job['ss_ramp'])*1000: + line = 'FAILED ' + line + ' fio criterion {0} != calculated criterion {1} '.format(jsonjob['steadystate']['criterion'], target) + failed = failed + 1 + else: + line = 'PASSED ' + line + ' fio criterion {0} == 0.0 since ss_dur + ss_ramp has not elapsed '.format(jsonjob['steadystate']['criterion']) + passed = passed + 1 + else: + if met: + line = 'FAILED ' + line + ' target {0} < threshold {1} but fio reports ss not attained '.format(target, job['ss_limit']) + failed = failed + 1 + else: + line = 'PASSED ' + line + ' criterion {0} > threshold {1}'.format(target, job['ss_limit']) + passed = passed + 1 + else: + expected = job['timeout'] * 1000 + actual = jsonjob['read']['runtime'] + if abs(expected - actual) > 50: + result = 'FAILED ' + failed = failed + 1 + else: + result = 'PASSED ' + passed = passed + 1 + line = result + line + ' no ss, expected runtime {0} ~= actual runtime {1}'.format(expected, actual) + print(line) + if 'steadystate' in jsonjob: + pp.pprint(jsonjob['steadystate']) + jobnum += 1 + + print("{0} test(s) PASSED, {1} test(s) FAILED".format(passed,failed)) + sys.exit(failed) diff --git a/t/stest.c b/t/stest.c new file mode 100644 index 0000000..c6bf2d1 --- /dev/null +++ b/t/stest.c @@ -0,0 +1,96 @@ +#include +#include +#include + +#include "../smalloc.h" +#include "../flist.h" +#include "../arch/arch.h" +#include "debug.h" + +#define MAGIC1 0xa9b1c8d2 +#define MAGIC2 0xf0a1e9b3 + +#define LOOPS 32 +#define MAXSMALLOC 120*1024*1024UL +#define LARGESMALLOC 128*1024U + +struct elem { + unsigned int magic1; + struct flist_head list; + unsigned int magic2; + unsigned int size; +}; + +static FLIST_HEAD(list); + +static int do_rand_allocs(void) +{ + unsigned int size, nr, rounds = 0, ret = 0; + unsigned long total; + struct elem *e; + bool error; + + while (rounds++ < LOOPS) { +#ifdef STEST_SEED + srand(MAGIC1); +#endif + error = false; + nr = total = 0; + while (total < MAXSMALLOC) { + size = 8 * sizeof(struct elem) + (int) (999.0 * (rand() / (RAND_MAX + 1.0))); + e = smalloc(size); + if (!e) { + printf("fail at %lu, size %u\n", total, size); + ret++; + break; + } + e->magic1 = MAGIC1; + e->magic2 = MAGIC2; + e->size = size; + total += size; + flist_add_tail(&e->list, &list); + nr++; + } + + printf("Got items: %u\n", nr); + + while (!flist_empty(&list)) { + e = flist_entry(list.next, struct elem, list); + assert(e->magic1 == MAGIC1); + assert(e->magic2 == MAGIC2); + total -= e->size; + flist_del(&e->list); + sfree(e); + + if (!error) { + e = smalloc(LARGESMALLOC); + if (!e) { + error = true; + ret++; + printf("failure allocating %u bytes at %lu allocated during sfree phase\n", + LARGESMALLOC, total); + } + else + sfree(e); + } + } + } + + return ret; +} + +int main(int argc, char *argv[]) +{ + int ret; + + arch_init(argv); + sinit(); + debug_init(); + + ret = do_rand_allocs(); + smalloc_debug(0); /* TODO: check that free and total blocks + ** match */ + + scleanup(); + return ret; +} diff --git a/t/strided.py b/t/strided.py new file mode 100755 index 0000000..aac15d1 --- /dev/null +++ b/t/strided.py @@ -0,0 +1,349 @@ +#!/usr/bin/python +# Note: this script is python2 and python3 compatible. +# +# strided.py +# +# Test zonemode=strided. 
This uses the null ioengine when no file is +# specified. If a file is specified, use it for randdom read testing. +# Some of the zoneranges in the tests are 16MiB. So when using a file +# a minimum size of 32MiB is recommended. +# +# USAGE +# python strided.py fio-executable [-f file/device] +# +# EXAMPLES +# python t/strided.py ./fio +# python t/strided.py ./fio -f /dev/sda +# dd if=/dev/zero of=temp bs=1M count=32 +# python t/strided.py ./fio -f temp +# +# REQUIREMENTS +# Python 2.6+ +# +# ===TEST MATRIX=== +# +# --zonemode=strided, zoneskip unset +# w/ randommap and LFSR +# zonesize=zonerange all blocks in zonerange touched +# zonesize>zonerange all blocks touched and roll-over back into zone +# zonesize= test['filesize']: + zonestart = 0 if 'offset' not in test else test['offset'] + + iosperzone = iosperzone + 1 + tokens = line.split(',') + offset = int(tokens[4]) + if offset < zonestart or offset >= zonestart + test['zonerange']: + print("Offset {0} outside of zone starting at {1}".format( + offset, zonestart)) + return False + + # skip next section if norandommap is enabled with no + # random_generator or with a random_generator != lfsr + if 'norandommap' in test: + if 'random_generator' in test: + if test['random_generator'] != 'lfsr': + continue + else: + continue + + # we either have a random map enabled or we + # are using an LFSR + # so all blocks should be unique and we should have + # covered the entire zone when iosperzone % iosperrange == 0 + block = (offset - zonestart) / test['bs'] + if block in zoneset: + print("Offset {0} in zone already touched".format(offset)) + return False + + zoneset.add(block) + if iosperzone % iosperrange == 0: + if len(zoneset) != iosperrange: + print("Expected {0} blocks in zone but only saw {1}".format( + iosperrange, len(zoneset))) + return False + zoneset = set() + + return True + + +if __name__ == '__main__': + args = parse_args() + + tests = [ # randommap enabled + { + "zonerange": 4096, + "zonesize": 4096, + "bs": 4096, + "offset": 8*4096, + "size": 16*4096, + "io_size": 16*4096, + }, + { + "zonerange": 4096, + "zonesize": 4096, + "bs": 4096, + "size": 16*4096, + "io_size": 16*4096, + }, + { + "zonerange": 16*1024*1024, + "zonesize": 16*1024*1024, + "bs": 4096, + "size": 256*1024*1024, + "io_size": 256*1024*204, + }, + { + "zonerange": 4096, + "zonesize": 4*4096, + "bs": 4096, + "size": 16*4096, + "io_size": 16*4096, + }, + { + "zonerange": 16*1024*1024, + "zonesize": 32*1024*1024, + "bs": 4096, + "size": 256*1024*1024, + "io_size": 256*1024*204, + }, + { + "zonerange": 8192, + "zonesize": 4096, + "bs": 4096, + "size": 16*4096, + "io_size": 16*4096, + }, + { + "zonerange": 16*1024*1024, + "zonesize": 8*1024*1024, + "bs": 4096, + "size": 256*1024*1024, + "io_size": 256*1024*204, + }, + # lfsr + { + "random_generator": "lfsr", + "zonerange": 4096*1024, + "zonesize": 4096*1024, + "bs": 4096, + "offset": 8*4096*1024, + "size": 16*4096*1024, + "io_size": 16*4096*1024, + }, + { + "random_generator": "lfsr", + "zonerange": 4096*1024, + "zonesize": 4096*1024, + "bs": 4096, + "size": 16*4096*1024, + "io_size": 16*4096*1024, + }, + { + "random_generator": "lfsr", + "zonerange": 16*1024*1024, + "zonesize": 16*1024*1024, + "bs": 4096, + "size": 256*1024*1024, + "io_size": 256*1024*204, + }, + { + "random_generator": "lfsr", + "zonerange": 4096*1024, + "zonesize": 4*4096*1024, + "bs": 4096, + "size": 16*4096*1024, + "io_size": 16*4096*1024, + }, + { + "random_generator": "lfsr", + "zonerange": 16*1024*1024, + "zonesize": 32*1024*1024, + "bs": 4096, + 
"size": 256*1024*1024, + "io_size": 256*1024*204, + }, + { + "random_generator": "lfsr", + "zonerange": 8192*1024, + "zonesize": 4096*1024, + "bs": 4096, + "size": 16*4096*1024, + "io_size": 16*4096*1024, + }, + { + "random_generator": "lfsr", + "zonerange": 16*1024*1024, + "zonesize": 8*1024*1024, + "bs": 4096, + "size": 256*1024*1024, + "io_size": 256*1024*204, + }, + # norandommap + { + "norandommap": 1, + "zonerange": 4096, + "zonesize": 4096, + "bs": 4096, + "offset": 8*4096, + "size": 16*4096, + "io_size": 16*4096, + }, + { + "norandommap": 1, + "zonerange": 4096, + "zonesize": 4096, + "bs": 4096, + "size": 16*4096, + "io_size": 16*4096, + }, + { + "norandommap": 1, + "zonerange": 16*1024*1024, + "zonesize": 16*1024*1024, + "bs": 4096, + "size": 256*1024*1024, + "io_size": 256*1024*204, + }, + { + "norandommap": 1, + "zonerange": 4096, + "zonesize": 8192, + "bs": 4096, + "size": 16*4096, + "io_size": 16*4096, + }, + { + "norandommap": 1, + "zonerange": 16*1024*1024, + "zonesize": 32*1024*1024, + "bs": 4096, + "size": 256*1024*1024, + "io_size": 256*1024*204, + }, + { + "norandommap": 1, + "zonerange": 8192, + "zonesize": 4096, + "bs": 4096, + "size": 16*4096, + "io_size": 16*4096, + }, + { + "norandommap": 1, + "zonerange": 16*1024*1024, + "zonesize": 8*1024*1024, + "bs": 4096, + "size": 256*1024*1024, + "io_size": 256*1024*1024, + }, + + ] + + index = 1 + passed = 0 + failed = 0 + + if args.filename: + statinfo = os.stat(args.filename) + filesize = statinfo.st_size + if filesize == 0: + f = os.open(args.filename, os.O_RDONLY) + filesize = os.lseek(f, 0, os.SEEK_END) + os.close(f) + + for test in tests: + if args.filename: + test['filename'] = args.filename + test['filesize'] = filesize + else: + test['filesize'] = test['size'] + iops_log = run_fio(args.fio, test, index) + status = check_output(iops_log, test) + print("Test {0} {1}".format(index, ("PASSED" if status else "FAILED"))) + if status: + passed = passed + 1 + else: + failed = failed + 1 + index = index + 1 + + print("{0} tests passed, {1} failed".format(passed, failed)) + + sys.exit(failed) diff --git a/t/time-test.c b/t/time-test.c new file mode 100644 index 0000000..a74d920 --- /dev/null +++ b/t/time-test.c @@ -0,0 +1,544 @@ +/* + * Carry out arithmetic to explore conversion of CPU clock ticks to nsec + * + * When we use the CPU clock for timing, we do the following: + * + * 1) Calibrate the CPU clock to relate the frequency of CPU clock ticks + * to actual time. + * + * Using gettimeofday() or clock_gettime(), count how many CPU clock + * ticks occur per usec + * + * 2) Calculate conversion factors so that we can ultimately convert + * from clocks ticks to nsec with + * nsec = (ticks * clock_mult) >> clock_shift + * + * This is equivalent to + * nsec = ticks * (MULTIPLIER / cycles_per_nsec) / MULTIPLIER + * where + * clock_mult = MULTIPLIER / cycles_per_nsec + * MULTIPLIER = 2^clock_shift + * + * It would be simpler to just calculate nsec = ticks / cycles_per_nsec, + * but all of this is necessary because of rounding when calculating + * cycles_per_nsec. With a 3.0GHz CPU, cycles_per_nsec would simply + * be 3. But with a 3.33GHz CPU or a 4.5GHz CPU, the fractional + * portion is lost with integer arithmetic. + * + * This multiply and shift calculation also has a performance benefit + * as multiplication and bit shift operations are faster than integer + * division. + * + * 3) Dynamically determine clock_shift and clock_mult at run time based + * on MAX_CLOCK_SEC and cycles_per_usec. 
MAX_CLOCK_SEC is the maximum + * duration for which the conversion will be valid. + * + * The primary constraint is that (ticks * clock_mult) must not overflow + * when ticks is at its maximum value. + * + * So we have + * max_ticks = MAX_CLOCK_SEC * 1000000000 * cycles_per_nsec + * max_ticks * clock_mult <= ULLONG_MAX + * max_ticks * MULTIPLIER / cycles_per_nsec <= ULLONG_MAX + * MULTIPLIER <= ULLONG_MAX * cycles_per_nsec / max_ticks + * + * Then choose the largest clock_shift that satisfies + * 2^clock_shift <= ULLONG_MAX * cycles_per_nsec / max_ticks + * + * Finally calculate the appropriate clock_mult associated with clock_shift + * clock_mult = 2^clock_shift / cycles_per_nsec + * + * 4) In the code below we have cycles_per_usec and use + * cycles_per_nsec = cycles_per_usec / 1000 + * + * + * The code below implements 4 clock tick to nsec conversion strategies + * + * i) 64-bit arithmetic for the (ticks * clock_mult) product with the + * conversion valid for at most MAX_CLOCK_SEC + * + * ii) NOT IMPLEMENTED Use 64-bit integers to emulate 128-bit multiplication + * for the (ticks * clock_mult) product + * + * iii) 64-bit arithmetic with clock ticks to nsec conversion occurring in + * two stages. The first stage counts the number of discrete, large chunks + * of time that have elapsed. To this is added the time represented by + * the remaining clock ticks. The advantage of this strategy is better + * accuracy because the (ticks * clock_mult) product used for final + * fractional chunk + * + * iv) 64-bit arithmetic with the clock ticks to nsec conversion occuring in + * two stages. This is carried out using locks to update the number of + * large time chunks (MAX_CLOCK_SEC_2STAGE) that have elapsed. + * + * v) 128-bit arithmetic used for the clock ticks to nsec conversion. + * + */ + +#include +#include +#include +#include +#include +#include "lib/seqlock.h" + +#define DEBUG 0 +#define MAX_CLOCK_SEC 365*24*60*60ULL +#define MAX_CLOCK_SEC_2STAGE 60*60ULL +#define dprintf(...) 
if (DEBUG) { printf(__VA_ARGS__); } + +enum { + __CLOCK64_BIT = 1 << 0, + __CLOCK128_BIT = 1 << 1, + __CLOCK_MULT_SHIFT = 1 << 2, + __CLOCK_EMULATE_128 = 1 << 3, + __CLOCK_2STAGE = 1 << 4, + __CLOCK_LOCK = 1 << 5, + + CLOCK64_MULT_SHIFT = __CLOCK64_BIT | __CLOCK_MULT_SHIFT, + CLOCK64_EMULATE_128 = __CLOCK64_BIT | __CLOCK_EMULATE_128, + CLOCK64_2STAGE = __CLOCK64_BIT | __CLOCK_2STAGE, + CLOCK64_LOCK = __CLOCK64_BIT | __CLOCK_LOCK, + CLOCK128_MULT_SHIFT = __CLOCK128_BIT | __CLOCK_MULT_SHIFT, +}; + +static struct seqlock clock_seqlock; +static unsigned long long cycles_start; +static unsigned long long elapsed_nsec; + +static unsigned int max_cycles_shift; +static unsigned long long max_cycles_mask; +static unsigned long long nsecs_for_max_cycles; + +static unsigned int clock_shift; +static unsigned long long clock_mult; + +static unsigned long long *nsecs; +static unsigned long long clock_mult64_128[2]; +static __uint128_t clock_mult128; + +/* + * Functions for carrying out 128-bit + * arithmetic using 64-bit integers + * + * 128-bit integers are stored as + * arrays of two 64-bit integers + * + * Ordering is little endian + * + * a[0] has the less significant bits + * a[1] has the more significant bits + * + * NOT FULLY IMPLEMENTED + */ +static void do_mult(unsigned long long a[2], unsigned long long b, + unsigned long long product[2]) +{ + product[0] = product[1] = 0; + return; +} + +static void do_div(unsigned long long a[2], unsigned long long b, + unsigned long long c[2]) +{ + return; +} + +static void do_shift64(unsigned long long a[2], unsigned int count) +{ + a[0] = a[1] >> (count-64); + a[1] = 0; +} + +static void do_shift(unsigned long long a[2], unsigned int count) +{ + if (count > 64) + do_shift64(a, count); + else { + while (count--) { + a[0] >>= 1; + a[0] |= a[1] << 63; + a[1] >>= 1; + } + } +} + +static void update_clock(unsigned long long t) +{ + write_seqlock_begin(&clock_seqlock); + elapsed_nsec = (t >> max_cycles_shift) * nsecs_for_max_cycles; + cycles_start = t & ~max_cycles_mask; + write_seqlock_end(&clock_seqlock); +} + +static unsigned long long _get_nsec(int mode, unsigned long long t) +{ + switch(mode) { + case CLOCK64_MULT_SHIFT: + return (t * clock_mult) >> clock_shift; + case CLOCK64_EMULATE_128: { + unsigned long long product[2] = { }; + + do_mult(clock_mult64_128, t, product); + do_shift(product, clock_shift); + return product[0]; + } + case CLOCK64_2STAGE: { + unsigned long long multiples, nsec; + + multiples = t >> max_cycles_shift; + dprintf("multiples=%llu\n", multiples); + nsec = multiples * nsecs_for_max_cycles; + nsec += ((t & max_cycles_mask) * clock_mult) >> clock_shift; + return nsec; + } + case CLOCK64_LOCK: { + unsigned int seq; + unsigned long long nsec; + + do { + seq = read_seqlock_begin(&clock_seqlock); + nsec = elapsed_nsec; + nsec += ((t - cycles_start) * clock_mult) >> clock_shift; + } while (read_seqlock_retry(&clock_seqlock, seq)); + return nsec; + } + case CLOCK128_MULT_SHIFT: + return (unsigned long long)((t * clock_mult128) >> clock_shift); + default: + assert(0); + } +} + +static unsigned long long get_nsec(int mode, unsigned long long t) +{ + if (mode == CLOCK64_LOCK) { + update_clock(t); + } + + return _get_nsec(mode, t); +} + +static void calc_mult_shift(int mode, void *mult, unsigned int *shift, + unsigned long long max_sec, + unsigned long long cycles_per_usec) +{ + unsigned long long max_ticks; + max_ticks = max_sec * cycles_per_usec * 1000000ULL; + + switch (mode) { + case CLOCK64_MULT_SHIFT: { + unsigned long long max_mult, tmp; 
+ unsigned int sft = 0; + + /* + * Calculate the largest multiplier that will not + * produce a 64-bit overflow in the multiplication + * step of the clock ticks to nsec conversion + */ + max_mult = ULLONG_MAX / max_ticks; + dprintf("max_ticks=%llu, __builtin_clzll=%d, max_mult=%llu\n", max_ticks, __builtin_clzll(max_ticks), max_mult); + + /* + * Find the largest shift count that will produce + * a multiplier less than max_mult + */ + tmp = max_mult * cycles_per_usec / 1000; + while (tmp > 1) { + tmp >>= 1; + sft++; + dprintf("tmp=%llu, sft=%u\n", tmp, sft); + } + + *shift = sft; + *((unsigned long long *)mult) = (unsigned long long) ((1ULL << sft) * 1000 / cycles_per_usec); + break; + } + case CLOCK64_EMULATE_128: { + unsigned long long max_mult[2], tmp[2] = { }; + unsigned int sft = 0; + + /* + * Calculate the largest multiplier that will not + * produce a 128-bit overflow in the multiplication + * step of the clock ticks to nsec conversion, + * but use only 64-bit integers in the process + */ + max_mult[0] = max_mult[1] = ULLONG_MAX; + do_div(max_mult, max_ticks, max_mult); + dprintf("max_ticks=%llu, __builtin_clzll=%d, max_mult=0x%016llx%016llx\n", + max_ticks, __builtin_clzll(max_ticks), max_mult[1], max_mult[0]); + + /* + * Find the largest shift count that will produce + * a multiplier less than max_mult + */ + do_div(max_mult, cycles_per_usec, tmp); + do_div(tmp, 1000ULL, tmp); + while (tmp[0] > 1 || tmp[1] > 1) { + do_shift(tmp, 1); + sft++; + dprintf("tmp=0x%016llx%016llx, sft=%u\n", tmp[1], tmp[0], sft); + } + + *shift = sft; +// *((unsigned long long *)mult) = (__uint128_t) (((__uint128_t)1 << sft) * 1000 / cycles_per_usec); + break; + } + case CLOCK64_2STAGE: { + unsigned long long tmp; +/* + * This clock tick to nsec conversion requires two stages. + * + * Stage 1: Determine how many ~MAX_CLOCK_SEC_2STAGE periods worth of clock ticks + * have elapsed and set nsecs to the appropriate value for those + * ~MAX_CLOCK_SEC_2STAGE periods. + * Stage 2: Subtract the ticks for the elapsed ~MAX_CLOCK_SEC_2STAGE periods from + * Stage 1. Convert remaining clock ticks to nsecs and add to previously + * set nsec value. + * + * To optimize the arithmetic operations, use the greatest power of 2 ticks + * less than the number of ticks in MAX_CLOCK_SEC_2STAGE seconds. 
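+ *
+ * A worked example (hypothetical 3.0 GHz clock, i.e. cycles_per_usec=3000):
+ * MAX_CLOCK_SEC_2STAGE seconds span 60*60 * 1000000 * 3000 ~= 1.08e13 ticks,
+ * so max_cycles_shift becomes 43, since 2^43 ~= 8.8e12 is the greatest
+ * power of 2 below that tick count, and nsecs_for_max_cycles is simply
+ * 2^43 ticks converted to nsec with the mult/shift pair computed above.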
+ * + */ + // Use a period shorter than MAX_CLOCK_SEC here for better accuracy + calc_mult_shift(CLOCK64_MULT_SHIFT, mult, shift, MAX_CLOCK_SEC_2STAGE, cycles_per_usec); + + // Find the greatest power of 2 clock ticks that is less than the ticks in MAX_CLOCK_SEC_2STAGE + max_cycles_shift = max_cycles_mask = 0; + tmp = MAX_CLOCK_SEC_2STAGE * 1000000ULL * cycles_per_usec; + dprintf("tmp=%llu, max_cycles_shift=%u\n", tmp, max_cycles_shift); + while (tmp > 1) { + tmp >>= 1; + max_cycles_shift++; + dprintf("tmp=%llu, max_cycles_shift=%u\n", tmp, max_cycles_shift); + } + // if use use (1ULL << max_cycles_shift) * 1000 / cycles_per_usec here we will + // have a discontinuity every (1ULL << max_cycles_shift) cycles + nsecs_for_max_cycles = (1ULL << max_cycles_shift) * *((unsigned long long *)mult) >> *shift; + + // Use a bitmask to calculate ticks % (1ULL << max_cycles_shift) + for (tmp = 0; tmp < max_cycles_shift; tmp++) + max_cycles_mask |= 1ULL << tmp; + + dprintf("max_cycles_shift=%u, 2^max_cycles_shift=%llu, nsecs_for_max_cycles=%llu, max_cycles_mask=%016llx\n", + max_cycles_shift, (1ULL << max_cycles_shift), + nsecs_for_max_cycles, max_cycles_mask); + + + break; + } + case CLOCK64_LOCK: { +/* + * This clock tick to nsec conversion also requires two stages. + * + * Stage 1: Add to nsec the current running total of elapsed long periods + * Stage 2: Subtract from clock ticks the tick count corresponding to the + * most recently elapsed long period. Convert the remaining ticks to + * nsec and add to the previous nsec value. + * + * In practice the elapsed nsec from Stage 1 and the tick count subtracted + * in Stage 2 will be maintained in a separate thread. + * + */ + calc_mult_shift(CLOCK64_2STAGE, mult, shift, MAX_CLOCK_SEC, cycles_per_usec); + cycles_start = 0; + break; + } + case CLOCK128_MULT_SHIFT: { + __uint128_t max_mult, tmp; + unsigned int sft = 0; + + /* + * Calculate the largest multiplier that will not + * produce a 128-bit overflow in the multiplication + * step of the clock ticks to nsec conversion + */ + max_mult = ((__uint128_t) ULLONG_MAX) << 64 | ULLONG_MAX; + max_mult /= max_ticks; + dprintf("max_ticks=%llu, __builtin_clzll=%d, max_mult=0x%016llx%016llx\n", + max_ticks, __builtin_clzll(max_ticks), + (unsigned long long) (max_mult >> 64), + (unsigned long long) max_mult); + + /* + * Find the largest shift count that will produce + * a multiplier less than max_mult + */ + tmp = max_mult * cycles_per_usec / 1000; + while (tmp > 1) { + tmp >>= 1; + sft++; + dprintf("tmp=0x%016llx%016llx, sft=%u\n", + (unsigned long long) (tmp >> 64), + (unsigned long long) tmp, sft); + } + + *shift = sft; + *((__uint128_t *)mult) = (__uint128_t) (((__uint128_t)1 << sft) * 1000 / cycles_per_usec); + break; + } + } +} + +static int discontinuity(int mode, int delta_ticks, int delta_nsec, + unsigned long long start, unsigned long len) +{ + int i; + unsigned long mismatches = 0, bad_mismatches = 0; + unsigned long long delta, max_mismatch = 0; + unsigned long long *ns = nsecs; + + for (i = 0; i < len; ns++, i++) { + *ns = get_nsec(mode, start + i); + if (i - delta_ticks >= 0) { + if (*ns > *(ns - delta_ticks)) + delta = *ns - *(ns - delta_ticks); + else + delta = *(ns - delta_ticks) - *ns; + if (delta > delta_nsec) + delta -= delta_nsec; + else + delta = delta_nsec - delta; + if (delta) { + mismatches++; + if (delta > 1) + bad_mismatches++; + if (delta > max_mismatch) + max_mismatch = delta; + } + } + if (!bad_mismatches) + assert(max_mismatch == 0 || max_mismatch == 1); + if (!mismatches) + 
assert(max_mismatch == 0); + } + + printf("%lu discontinuities (%lu%%) (%lu errors > 1ns, max delta = %lluns) for ticks = %llu...%llu\n", + mismatches, (mismatches * 100) / len, bad_mismatches, max_mismatch, start, + start + len - 1); + return mismatches; +} + +#define MIN_TICKS 1ULL +#define LEN 1000000000ULL +#define NSEC_ONE_SEC 1000000000ULL +#define TESTLEN 9 + +static long long test_clock(int mode, int cycles_per_usec, int fast_test, + int quiet, int delta_ticks, int delta_nsec) +{ + int i; + long long delta; + unsigned long long max_ticks; + unsigned long long nsecs; + void *mult; + unsigned long long test_ns[TESTLEN] = + {NSEC_ONE_SEC, NSEC_ONE_SEC, + NSEC_ONE_SEC, NSEC_ONE_SEC*60, NSEC_ONE_SEC*60*60, + NSEC_ONE_SEC*60*60*2, NSEC_ONE_SEC*60*60*4, + NSEC_ONE_SEC*60*60*8, NSEC_ONE_SEC*60*60*24}; + unsigned long long test_ticks[TESTLEN]; + + max_ticks = MAX_CLOCK_SEC * (unsigned long long) cycles_per_usec * 1000000ULL; + + switch(mode) { + case CLOCK64_MULT_SHIFT: + mult = &clock_mult; + break; + case CLOCK64_EMULATE_128: + mult = clock_mult64_128; + break; + case CLOCK64_2STAGE: + mult = &clock_mult; + break; + case CLOCK64_LOCK: + mult = &clock_mult; + break; + case CLOCK128_MULT_SHIFT: + mult = &clock_mult128; + break; + default: + assert(0); + } + calc_mult_shift(mode, mult, &clock_shift, MAX_CLOCK_SEC, cycles_per_usec); + nsecs = get_nsec(mode, max_ticks); + delta = nsecs/1000000 - MAX_CLOCK_SEC*1000; + + if (mode == CLOCK64_2STAGE) { + test_ns[0] = nsecs_for_max_cycles - 1; + test_ns[1] = nsecs_for_max_cycles; + test_ticks[0] = (1ULL << max_cycles_shift) - 1; + test_ticks[1] = (1ULL << max_cycles_shift); + + for (i = 2; i < TESTLEN; i++) + test_ticks[i] = test_ns[i] / 1000 * cycles_per_usec; + } + else { + for (i = 0; i < TESTLEN; i++) + test_ticks[i] = test_ns[i] / 1000 * cycles_per_usec; + } + + if (!quiet) { + printf("cycles_per_usec=%d, delta_ticks=%d, delta_nsec=%d, max_ticks=%llu, shift=%u, 2^shift=%llu\n", + cycles_per_usec, delta_ticks, delta_nsec, max_ticks, clock_shift, (1ULL << clock_shift)); + switch(mode) { + case CLOCK64_LOCK: + case CLOCK64_2STAGE: + case CLOCK64_MULT_SHIFT: { + printf("clock_mult=%llu, clock_mult / 2^clock_shift=%f\n", + clock_mult, (double) clock_mult / (1ULL << clock_shift)); + break; + } + case CLOCK64_EMULATE_128: { + printf("clock_mult=0x%016llx%016llx\n", + clock_mult64_128[1], clock_mult64_128[0]); + break; + } + case CLOCK128_MULT_SHIFT: { + printf("clock_mult=0x%016llx%016llx\n", + (unsigned long long) (clock_mult128 >> 64), + (unsigned long long) clock_mult128); + break; + } + } + printf("get_nsec(max_ticks) = %lluns, should be %lluns, error<=abs(%lld)ms\n", + nsecs, MAX_CLOCK_SEC*1000000000ULL, delta); + } + + for (i = 0; i < TESTLEN; i++) + { + nsecs = get_nsec(mode, test_ticks[i]); + delta = nsecs > test_ns[i] ? 
nsecs - test_ns[i] : test_ns[i] - nsecs; + if (!quiet || delta > 0) + printf("get_nsec(%llu)=%llu, expected %llu, delta=%llu\n", + test_ticks[i], nsecs, test_ns[i], delta); + } + + if (!fast_test) { + discontinuity(mode, delta_ticks, delta_nsec, max_ticks - LEN + 1, LEN); + discontinuity(mode, delta_ticks, delta_nsec, MIN_TICKS, LEN); + } + + if (!quiet) + printf("\n\n"); + + return delta; +} + +int main(int argc, char *argv[]) +{ + nsecs = malloc(LEN * sizeof(unsigned long long)); + + test_clock(CLOCK64_LOCK, 3333, 1, 0, 0, 0); + test_clock(CLOCK64_LOCK, 1000, 1, 0, 1, 1); + test_clock(CLOCK64_LOCK, 1100, 1, 0, 11, 10); + test_clock(CLOCK64_LOCK, 3000, 1, 0, 3, 1); + test_clock(CLOCK64_LOCK, 3333, 1, 0, 3333, 1000); + test_clock(CLOCK64_LOCK, 3392, 1, 0, 424, 125); + test_clock(CLOCK64_LOCK, 4500, 1, 0, 9, 2); + test_clock(CLOCK64_LOCK, 5000, 1, 0, 5, 1); + + free(nsecs); + return 0; +} diff --git a/t/verify-state.c b/t/verify-state.c new file mode 100644 index 0000000..734c1e4 --- /dev/null +++ b/t/verify-state.c @@ -0,0 +1,157 @@ +/* + * Dump the contents of a verify state file in plain text + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include "../log.h" +#include "../os/os.h" +#include "../verify-state.h" +#include "../crc/crc32c.h" +#include "debug.h" + +static void show_s(struct thread_io_list *s, unsigned int no_s) +{ + int i; + + printf("Thread:\t\t%u\n", no_s); + printf("Name:\t\t%s\n", s->name); + printf("Completions:\t%llu\n", (unsigned long long) s->no_comps); + printf("Depth:\t\t%llu\n", (unsigned long long) s->depth); + printf("Number IOs:\t%llu\n", (unsigned long long) s->numberio); + printf("Index:\t\t%llu\n", (unsigned long long) s->index); + + printf("Completions:\n"); + if (!s->no_comps) + return; + for (i = s->no_comps - 1; i >= 0; i--) { + printf("\t(file=%2llu) %llu\n", + (unsigned long long) s->comps[i].fileno, + (unsigned long long) s->comps[i].offset); + } +} + +static void show(struct thread_io_list *s, size_t size) +{ + int no_s; + + no_s = 0; + do { + int i; + + s->no_comps = le64_to_cpu(s->no_comps); + s->depth = le32_to_cpu(s->depth); + s->nofiles = le32_to_cpu(s->nofiles); + s->numberio = le64_to_cpu(s->numberio); + s->index = le64_to_cpu(s->index); + + for (i = 0; i < s->no_comps; i++) { + s->comps[i].fileno = le64_to_cpu(s->comps[i].fileno); + s->comps[i].offset = le64_to_cpu(s->comps[i].offset); + } + + show_s(s, no_s); + no_s++; + size -= __thread_io_list_sz(s->depth, s->nofiles); + s = (struct thread_io_list *)((char *) s + + __thread_io_list_sz(s->depth, s->nofiles)); + } while (size != 0); +} + +static void show_verify_state(void *buf, size_t size) +{ + struct verify_state_hdr *hdr = buf; + struct thread_io_list *s; + uint32_t crc; + + hdr->version = le64_to_cpu(hdr->version); + hdr->size = le64_to_cpu(hdr->size); + hdr->crc = le64_to_cpu(hdr->crc); + + printf("Version:\t0x%x\n", (unsigned int) hdr->version); + printf("Size:\t\t%u\n", (unsigned int) hdr->size); + printf("CRC:\t\t0x%x\n", (unsigned int) hdr->crc); + + size -= sizeof(*hdr); + if (hdr->size != size) { + log_err("Size mismatch\n"); + return; + } + + s = buf + sizeof(*hdr); + crc = fio_crc32c((unsigned char *) s, hdr->size); + if (crc != hdr->crc) { + log_err("crc mismatch %x != %x\n", crc, (unsigned int) hdr->crc); + return; + } + + if (hdr->version == 0x03) + show(s, size); + else + log_err("Unsupported version %d\n", (int) hdr->version); +} + +static int show_file(const char *file) +{ + struct stat sb; + void *buf; + int ret, fd; + + fd = 
open(file, O_RDONLY); + if (fd < 0) { + log_err("open %s: %s\n", file, strerror(errno)); + return 1; + } + + if (fstat(fd, &sb) < 0) { + log_err("stat: %s\n", strerror(errno)); + close(fd); + return 1; + } + + buf = malloc(sb.st_size); + ret = read(fd, buf, sb.st_size); + if (ret < 0) { + log_err("read: %s\n", strerror(errno)); + close(fd); + free(buf); + return 1; + } else if (ret != sb.st_size) { + log_err("Short read\n"); + close(fd); + free(buf); + return 1; + } + + close(fd); + show_verify_state(buf, sb.st_size); + + free(buf); + return 0; +} + +int main(int argc, char *argv[]) +{ + int i, ret; + + debug_init(); + + if (argc < 2) { + log_err("Usage: %s \n", argv[0]); + return 1; + } + + ret = 0; + for (i = 1; i < argc; i++) { + ret = show_file(argv[i]); + if (ret) + break; + } + + return ret; +} diff --git a/t/zbd/functions b/t/zbd/functions new file mode 100644 index 0000000..d49555a --- /dev/null +++ b/t/zbd/functions @@ -0,0 +1,122 @@ +#!/bin/bash + +blkzone=$(type -p blkzone 2>/dev/null) +sg_inq=$(type -p sg_inq 2>/dev/null) +zbc_report_zones=$(type -p zbc_report_zones 2>/dev/null) +zbc_reset_zone=$(type -p zbc_reset_zone 2>/dev/null) +if [ -z "${blkzone}" ] && + { [ -z "${zbc_report_zones}" ] || [ -z "${zbc_reset_zone}" ]; }; then + echo "Error: neither blkzone nor zbc_report_zones is available" + exit 1 +fi + +# Reports the starting sector and length of the first sequential zone of device +# $1. +first_sequential_zone() { + local dev=$1 + + if [ -n "${blkzone}" ]; then + ${blkzone} report "$dev" | + sed -n 's/^[[:blank:]]*start:[[:blank:]]\([0-9a-zA-Z]*\),[[:blank:]]len[[:blank:]]\([0-9a-zA-Z]*\),.*type:[[:blank:]]2(.*/\1 \2/p' | + { + read -r starting_sector length && + # Convert from hex to decimal + echo $((starting_sector)) $((length)) + } + else + ${zbc_report_zones} "$dev" | + sed -n 's/^Zone [0-9]*: type 0x2 .*, sector \([0-9]*\), \([0-9]*\) sectors,.*$/\1 \2/p' | + head -n1 + fi +} + +max_open_zones() { + local dev=$1 + + if [ -n "${sg_inq}" ]; then + if ! ${sg_inq} -e --page=0xB6 --len=20 --hex "$dev" 2> /dev/null; then + # Non scsi device such as null_blk can not return max open zones. + # Use default value. + echo 128 + else + ${sg_inq} -e --page=0xB6 --len=20 --hex "$dev" | tail -1 | + { + read -r offset b0 b1 b2 b3 trailer || return $? + # Convert from hex to decimal + max_nr_open_zones=$((0x${b0})) + max_nr_open_zones=$((max_nr_open_zones * 256 + 0x${b1})) + max_nr_open_zones=$((max_nr_open_zones * 256 + 0x${b2})) + max_nr_open_zones=$((max_nr_open_zones * 256 + 0x${b3})) + echo ${max_nr_open_zones} + } + fi + else + ${zbc_report_zones} "$dev" | + sed -n 's/^[[:blank:]]*Maximum number of open sequential write required zones:[[:blank:]]*//p' + fi +} + +# Reset the write pointer of one zone on device $1 at offset $2. The offset +# must be specified in units of 512 byte sectors. Offset -1 means reset all +# zones. 
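+#
+# Example invocations (device and sector values are hypothetical):
+#   reset_zone /dev/sdb 274726912   # reset the zone starting at that sector
+#   reset_zone /dev/sdb -1          # reset every zone on the device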
+reset_zone() { + local dev=$1 offset=$2 sectors + + if [ -n "${blkzone}" ]; then + if [ "$offset" -lt 0 ]; then + sectors=$(<"/sys/class/block/${dev#/dev/}/size") + ${blkzone} reset -o "${offset}" -l "$sectors" "$dev" + else + ${blkzone} reset -o "${offset}" -c 1 "$dev" + fi + else + if [ "$offset" -lt 0 ]; then + ${zbc_reset_zone} -all "$dev" "${offset}" >/dev/null + else + ${zbc_reset_zone} -sector "$dev" "${offset}" >/dev/null + fi + fi +} + +# Extract the number of bytes that have been transferred from a line like +# READ: bw=6847KiB/s (7011kB/s), 6847KiB/s-6847KiB/s (7011kB/s-7011kB/s), io=257MiB (269MB), run=38406-38406msec +fio_io() { + sed -n 's/^[[:blank:]]*'"$1"'.*, io=\([^[:blank:]]*\).*/\1/p' | + tail -n 1 | + ( + read -r io; + # Parse . into n1, n2 and s. See also + # num2str(). + shopt -s extglob + n1=${io%${io##*([0-9])}} + s=${io#${io%%*([a-zA-Z])}} + n2=${io#${n1}} + n2=${n2#.} + n2=${n2%$s}000 + n2=${n2:0:3} + case "$s" in + KiB) m=10;; + MiB) m=20;; + GiB) m=30;; + B) m=0;; + *) return 1;; + esac + [ -n "$n1" ] || return 1 + echo $(((n1 << m) + (n2 << m) / 1000)) + ) +} + +fio_read() { + fio_io 'READ:' +} + +fio_written() { + fio_io 'WRITE:' +} + +fio_reset_count() { + local count + + count=$(sed -n 's/^.*write:[^;]*; \([0-9]*\) zone resets$/\1/p') + echo "${count:-0}" +} diff --git a/t/zbd/run-tests-against-regular-nullb b/t/zbd/run-tests-against-regular-nullb new file mode 100755 index 0000000..0f6e4b6 --- /dev/null +++ b/t/zbd/run-tests-against-regular-nullb @@ -0,0 +1,27 @@ +#!/bin/bash +# +# Copyright (C) 2018 Western Digital Corporation or its affiliates. +# +# This file is released under the GPL. + +scriptdir="$(cd "$(dirname "$0")" && pwd)" + +for d in /sys/kernel/config/nullb/*; do [ -d "$d" ] && rmdir "$d"; done +modprobe -r null_blk +modprobe null_blk nr_devices=0 || return $? +for d in /sys/kernel/config/nullb/*; do + [ -d "$d" ] && rmdir "$d" +done +modprobe -r null_blk +[ -e /sys/module/null_blk ] && exit $? +modprobe null_blk nr_devices=0 && + cd /sys/kernel/config/nullb && + mkdir nullb0 && + cd nullb0 && + echo 0 > completion_nsec && + echo 4096 > blocksize && + echo 1024 > size && + echo 1 > memory_backed && + echo 1 > power + +"${scriptdir}"/test-zbd-support "$@" /dev/nullb0 diff --git a/t/zbd/run-tests-against-zoned-nullb b/t/zbd/run-tests-against-zoned-nullb new file mode 100755 index 0000000..0952011 --- /dev/null +++ b/t/zbd/run-tests-against-zoned-nullb @@ -0,0 +1,29 @@ +#!/bin/bash +# +# Copyright (C) 2018 Western Digital Corporation or its affiliates. +# +# This file is released under the GPL. + +scriptdir="$(cd "$(dirname "$0")" && pwd)" + +for d in /sys/kernel/config/nullb/*; do [ -d "$d" ] && rmdir "$d"; done +modprobe -r null_blk +modprobe null_blk nr_devices=0 || return $? +for d in /sys/kernel/config/nullb/*; do + [ -d "$d" ] && rmdir "$d" +done +modprobe -r null_blk +[ -e /sys/module/null_blk ] && exit $? +modprobe null_blk nr_devices=0 && + cd /sys/kernel/config/nullb && + mkdir nullb0 && + cd nullb0 && + echo 1 > zoned && + echo 1 > zone_size && + echo 0 > completion_nsec && + echo 4096 > blocksize && + echo 1024 > size && + echo 1 > memory_backed && + echo 1 > power || exit $? + +"${scriptdir}"/test-zbd-support "$@" /dev/nullb0 diff --git a/t/zbd/test-zbd-support b/t/zbd/test-zbd-support new file mode 100755 index 0000000..5d079a8 --- /dev/null +++ b/t/zbd/test-zbd-support @@ -0,0 +1,857 @@ +#!/bin/bash +# +# Copyright (C) 2018 Western Digital Corporation or its affiliates. +# +# This file is released under the GPL. 
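+#
+# Example invocations (the device path is illustrative):
+#   ./test-zbd-support /dev/sdc        # run every test against /dev/sdc
+#   ./test-zbd-support -t 5 /dev/sdc   # run only test 5
+#   ./test-zbd-support -v /dev/sdc     # run the tests under valgrind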
+ +usage() { + echo "Usage: $(basename "$0") [-d] [-e] [-r] [-v] [-t ] " +} + +max() { + if [ "$1" -gt "$2" ]; then + echo "$1" + else + echo "$2" + fi +} + +min() { + if [ "$1" -lt "$2" ]; then + echo "$1" + else + echo "$2" + fi +} + +set_io_scheduler() { + local dev=$1 sched=$2 + + [ -e "/sys/block/$dev" ] || return $? + if [ -e "/sys/block/$dev/mq" ]; then + case "$sched" in + noop) sched=none;; + deadline) sched=mq-deadline;; + esac + else + case "$sched" in + none) sched=noop;; + mq-deadline) sched=deadline;; + esac + fi + + echo "$sched" >"/sys/block/$dev/queue/scheduler" +} + +check_read() { + local read + + read=$(fio_read <"${logfile}.${test_number}") + echo "read: $read <> $1" >> "${logfile}.${test_number}" + [ "$read" = "$1" ] +} + +check_written() { + local written + + written=$(fio_written <"${logfile}.${test_number}") + echo "written: $written <> $1" >> "${logfile}.${test_number}" + [ "$written" = "$1" ] +} + +# Compare the reset count from the log file with reset count $2 using operator +# $1 (=, -ge, -gt, -le, -lt). +check_reset_count() { + local reset_count + + reset_count=$(fio_reset_count <"${logfile}.${test_number}") + echo "reset_count: test $reset_count $1 $2" >> "${logfile}.${test_number}" + eval "[ '$reset_count' '$1' '$2' ]" +} + +# Whether or not $1 (/dev/...) is a SCSI device. +is_scsi_device() { + local d f + + d=$(basename "$dev") + for f in /sys/class/scsi_device/*/device/block/"$d"; do + [ -e "$f" ] && return 0 + done + return 1 +} + +run_fio() { + local fio opts + + fio=$(dirname "$0")/../../fio + + opts=("--aux-path=/tmp" "--allow_file_create=0" \ + "--significant_figures=10" "$@") + { echo; echo "fio ${opts[*]}"; echo; } >>"${logfile}.${test_number}" + + "${dynamic_analyzer[@]}" "$fio" "${opts[@]}" +} + +run_one_fio_job() { + local r + + r=$(((RANDOM << 16) | RANDOM)) + run_fio --name="$dev" --filename="$dev" "$@" --randseed="$r" \ + --thread=1 --direct=1 +} + +# Run fio on the first four sequential zones of the disk. +run_fio_on_seq() { + local opts=() + + opts+=("--offset=$((first_sequential_zone_sector * 512))") + opts+=("--size=$((4 * zone_size))" "--zonemode=zbd") + if [ -z "$is_zbd" ]; then + opts+=("--zonesize=${zone_size}") + fi + run_one_fio_job "${opts[@]}" "$@" +} + +# Check whether buffered writes are refused. +test1() { + run_fio --name=job1 --filename="$dev" --rw=write --direct=0 --bs=4K \ + --size="${zone_size}" --thread=1 \ + --zonemode=zbd --zonesize="${zone_size}" 2>&1 | + tee -a "${logfile}.${test_number}" | + grep -q 'Using direct I/O is mandatory for writing to ZBD drives' + local fio_rc=${PIPESTATUS[0]} grep_rc=${PIPESTATUS[2]} + case "$fio_rc" in + 0|1) ;; + *) return "$fio_rc" + esac + if [ -n "$is_zbd" ]; then + [ "$grep_rc" = 0 ] + else + [ "$grep_rc" != 0 ] + fi +} + +# Block size exceeds zone size. +test2() { + local bs off opts=() rc + + off=$(((first_sequential_zone_sector + 2 * sectors_per_zone) * 512)) + bs=$((2 * zone_size)) + opts+=("--name=job1" "--filename=$dev" "--rw=write" "--direct=1") + opts+=("--zonemode=zbd" "--offset=$off" "--bs=$bs" "--size=$bs") + if [ -z "$is_zbd" ]; then + opts+=("--zonesize=${zone_size}") + fi + run_fio "${opts[@]}" >> "${logfile}.${test_number}" 2>&1 || return $? + ! grep -q 'WRITE:' "${logfile}.${test_number}" +} + +# Run fio against an empty zone. This causes fio to report "No I/O performed". 
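+# With the default read_beyond_wp=0, reads in a zone are capped at the zone's
+# write pointer, and an empty zone has its write pointer at the zone start,
+# so no reads can be issued.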
+test3() { + local off opts=() rc + + off=$((first_sequential_zone_sector * 512 + 128 * zone_size)) + size=$((zone_size)) + [ -n "$is_zbd" ] && reset_zone "$dev" $((off / 512)) + opts+=("--name=$dev" "--filename=$dev" "--offset=$off" "--bs=4K") + opts+=("--size=$size" "--zonemode=zbd") + opts+=("--ioengine=psync" "--rw=read" "--direct=1" "--thread=1") + if [ -z "$is_zbd" ]; then + opts+=("--zonesize=${zone_size}") + fi + run_fio "${opts[@]}" >> "${logfile}.${test_number}" 2>&1 || return $? + grep -q 'READ:' "${logfile}.${test_number}" + rc=$? + if [ -n "$is_zbd" ]; then + [ $rc != 0 ] + else + [ $rc = 0 ] + fi +} + +# Run fio with --read_beyond_wp=1 against an empty zone. +test4() { + local off opts=() + + off=$((first_sequential_zone_sector * 512 + 129 * zone_size)) + size=$((zone_size)) + [ -n "$is_zbd" ] && reset_zone "$dev" $((off / 512)) + opts+=("--name=$dev" "--filename=$dev" "--offset=$off" "--bs=$size") + opts+=("--size=$size" "--thread=1" "--read_beyond_wp=1") + opts+=("--ioengine=psync" "--rw=read" "--direct=1" "--disable_lat=1") + opts+=("--zonemode=zbd" "--zonesize=${zone_size}") + run_fio "${opts[@]}" >> "${logfile}.${test_number}" 2>&1 || return $? + check_read $size || return $? +} + +# Sequential write to sequential zones. +test5() { + local size + + size=$((4 * zone_size)) + run_fio_on_seq --ioengine=psync --iodepth=1 --rw=write \ + --bs="$(max $((zone_size / 64)) "$logical_block_size")"\ + --do_verify=1 --verify=md5 \ + >>"${logfile}.${test_number}" 2>&1 || return $? + check_written $size || return $? + check_read $size || return $? +} + +# Sequential read from sequential zones. Must be run after test5. +test6() { + local size + + size=$((4 * zone_size)) + run_fio_on_seq --ioengine=psync --iodepth=1 --rw=read \ + --bs="$(max $((zone_size / 64)) "$logical_block_size")"\ + >>"${logfile}.${test_number}" 2>&1 || return $? + check_read $size || return $? +} + +# Random write to sequential zones, libaio, queue depth 1. +test7() { + local size=$((zone_size)) + + run_fio_on_seq --ioengine=libaio --iodepth=1 --rw=randwrite \ + --bs="$(min 16384 "${zone_size}")" \ + --do_verify=1 --verify=md5 --size="$size" \ + >>"${logfile}.${test_number}" 2>&1 || return $? + check_written $size || return $? + check_read $size || return $? +} + +# Random write to sequential zones, libaio, queue depth 64. +test8() { + local size + + size=$((4 * zone_size)) + run_fio_on_seq --ioengine=libaio --iodepth=64 --rw=randwrite \ + --bs="$(min 16384 "${zone_size}")" \ + --do_verify=1 --verify=md5 \ + >>"${logfile}.${test_number}" 2>&1 || return $? + check_written $size || return $? + check_read $size || return $? +} + +# Random write to sequential zones, sg, queue depth 1. +test9() { + local size + + if ! is_scsi_device "$dev"; then + echo "$dev is not a SCSI device" >>"${logfile}.${test_number}" + return 0 + fi + + size=$((4 * zone_size)) + run_fio_on_seq --ioengine=sg --iodepth=1 --rw=randwrite --bs=16K \ + --do_verify=1 --verify=md5 \ + >>"${logfile}.${test_number}" 2>&1 || return $? + check_written $size || return $? + check_read $size || return $? +} + +# Random write to sequential zones, sg, queue depth 64. +test10() { + local size + + if ! is_scsi_device "$dev"; then + echo "$dev is not a SCSI device" >>"${logfile}.${test_number}" + return 0 + fi + + size=$((4 * zone_size)) + run_fio_on_seq --ioengine=sg --iodepth=64 --rw=randwrite --bs=16K \ + --do_verify=1 --verify=md5 \ + >>"${logfile}.${test_number}" 2>&1 || return $? + check_written $size || return $? + check_read $size || return $? 
+} + +# Random write to sequential zones, libaio, queue depth 64, random block size. +test11() { + local size + + size=$((4 * zone_size)) + run_fio_on_seq --ioengine=libaio --iodepth=64 --rw=randwrite \ + --bsrange=4K-64K --do_verify=1 --verify=md5 \ + --debug=zbd >>"${logfile}.${test_number}" 2>&1 || return $? + check_written $size || return $? + check_read $size || return $? +} + +# Random write to sequential zones, libaio, queue depth 64, max 1 open zone. +test12() { + local size + + size=$((8 * zone_size)) + run_fio_on_seq --ioengine=libaio --iodepth=64 --rw=randwrite --bs=16K \ + --max_open_zones=1 --size=$size --do_verify=1 --verify=md5 \ + --debug=zbd >>"${logfile}.${test_number}" 2>&1 || return $? + check_written $size || return $? + check_read $size || return $? +} + +# Random write to sequential zones, libaio, queue depth 64, max 4 open zones. +test13() { + local size + + size=$((8 * zone_size)) + run_fio_on_seq --ioengine=libaio --iodepth=64 --rw=randwrite --bs=16K \ + --max_open_zones=4 --size=$size --do_verify=1 --verify=md5 \ + --debug=zbd \ + >>"${logfile}.${test_number}" 2>&1 || return $? + check_written $size || return $? + check_read $size || return $? +} + +# Random write to conventional zones. +test14() { + local size + + size=$((16 * 2**20)) # 20 MB + if [ $size -gt $((first_sequential_zone_sector * 512)) ]; then + echo "$dev does not have enough sequential zones" \ + >>"${logfile}.${test_number}" + return 0 + fi + run_one_fio_job --ioengine=libaio --iodepth=64 --rw=randwrite --bs=16K \ + --zonemode=zbd --zonesize="${zone_size}" --do_verify=1 \ + --verify=md5 --size=$size \ + >>"${logfile}.${test_number}" 2>&1 || return $? + check_written $((size)) || return $? + check_read $((size)) || return $? +} + +# Sequential read on a mix of empty and full zones. +test15() { + local i off size + + for ((i=0;i<4;i++)); do + [ -n "$is_zbd" ] && + reset_zone "$dev" $((first_sequential_zone_sector + + i*sectors_per_zone)) + done + off=$(((first_sequential_zone_sector + 2 * sectors_per_zone) * 512)) + size=$((2 * zone_size)) + run_one_fio_job --ioengine=psync --rw=write --bs=$((zone_size / 16))\ + --zonemode=zbd --zonesize="${zone_size}" --offset=$off \ + --size=$size >>"${logfile}.${test_number}" 2>&1 || + return $? + check_written $size || return $? + off=$((first_sequential_zone_sector * 512)) + size=$((4 * zone_size)) + run_one_fio_job --ioengine=psync --rw=read --bs=$((zone_size / 16)) \ + --zonemode=zbd --zonesize="${zone_size}" --offset=$off \ + --size=$((size)) >>"${logfile}.${test_number}" 2>&1 || + return $? + if [ -n "$is_zbd" ]; then + check_read $((size / 2)) + else + check_read $size + fi +} + +# Random read on a mix of empty and full zones. Must be run after test15. +test16() { + local off size + + off=$((first_sequential_zone_sector * 512)) + size=$((4 * zone_size)) + run_one_fio_job --ioengine=libaio --iodepth=64 --rw=randread --bs=16K \ + --zonemode=zbd --zonesize="${zone_size}" --offset=$off \ + --size=$size >>"${logfile}.${test_number}" 2>&1 || return $? + check_read $size || return $? +} + +# Random reads and writes in the last zone. +test17() { + local io off read size written + + off=$(((disk_size / zone_size - 1) * zone_size)) + size=$((disk_size - off)) + # Overwrite the last zone to avoid that reading from that zone fails. + if [ -n "$is_zbd" ]; then + reset_zone "$dev" $((off / 512)) || return $? 
+ fi + run_one_fio_job --ioengine=psync --rw=write --offset="$off" \ + --zonemode=zbd --zonesize="${zone_size}" \ + --bs="$zone_size" --size="$zone_size" \ + >>"${logfile}.${test_number}" 2>&1 || return $? + check_written "$zone_size" || return $? + run_one_fio_job --ioengine=libaio --iodepth=8 --rw=randrw --bs=4K \ + --zonemode=zbd --zonesize="${zone_size}" \ + --offset=$off --loops=2 --norandommap=1\ + >>"${logfile}.${test_number}" 2>&1 || return $? + written=$(fio_written <"${logfile}.${test_number}") + read=$(fio_read <"${logfile}.${test_number}") + io=$((written + read)) + echo "Total number of bytes read and written: $io <> $size" \ + >>"${logfile}.${test_number}" + [ $io = $((size * 2)) ]; +} + +# Out-of-range zone reset threshold and frequency parameters. +test18() { + run_fio_on_seq --zone_reset_threshold=-1 |& + tee -a "${logfile}.${test_number}" | + grep -q 'value out of range' || return $? +} + +test19() { + run_fio_on_seq --zone_reset_threshold=2 |& + tee -a "${logfile}.${test_number}" | + grep -q 'value out of range' || return $? +} + +test20() { + run_fio_on_seq --zone_reset_threshold=.4:.6 |& + tee -a "${logfile}.${test_number}" | + grep -q 'the list exceeding max length' || return $? +} + +test21() { + run_fio_on_seq --zone_reset_frequency=-1 |& + tee -a "${logfile}.${test_number}" | + grep -q 'value out of range' || return $? +} + +test22() { + run_fio_on_seq --zone_reset_frequency=2 |& + tee -a "${logfile}.${test_number}" | + grep -q 'value out of range' || return $? +} + +test23() { + run_fio_on_seq --zone_reset_frequency=.4:.6 |& + tee -a "${logfile}.${test_number}" | + grep -q 'the list exceeding max length' || return $? +} + +test24() { + local bs loops=9 size=$((zone_size)) + + bs=$(min $((256*1024)) "$zone_size") + run_fio_on_seq --ioengine=psync --rw=write --bs="$bs" --size=$size \ + --loops=$loops \ + --zone_reset_frequency=.01 --zone_reset_threshold=.90 \ + >> "${logfile}.${test_number}" 2>&1 || return $? + check_written $((size * loops)) || return $? + check_reset_count -eq 8 || + check_reset_count -eq 9 || + check_reset_count -eq 10 || return $? +} + +# Multiple non-overlapping sequential write jobs for the same drive. +test25() { + local i opts=() + + for ((i=0;i<16;i++)); do + [ -n "$is_zbd" ] && + reset_zone "$dev" $((first_sequential_zone_sector + i*sectors_per_zone)) + done + for ((i=0;i<16;i++)); do + opts+=("--name=job$i" "--filename=$dev" "--thread=1" "--direct=1") + opts+=("--offset=$((first_sequential_zone_sector*512 + zone_size*i))") + opts+=("--size=$zone_size" "--ioengine=psync" "--rw=write" "--bs=16K") + opts+=("--zonemode=zbd" "--zonesize=${zone_size}" "--group_reporting=1") + done + run_fio "${opts[@]}" >> "${logfile}.${test_number}" 2>&1 || return $? +} + +write_to_first_seq_zone() { + local loops=4 r + + r=$(((RANDOM << 16) | RANDOM)) + run_fio --name="$dev" --filename="$dev" --ioengine=psync --rw="$1" \ + --thread=1 --do_verify=1 --verify=md5 --direct=1 --bs=4K \ + --offset=$((first_sequential_zone_sector * 512)) \ + "--size=$zone_size" --loops=$loops --randseed="$r" \ + --zonemode=zbd --zonesize="${zone_size}" --group_reporting=1 \ + --gtod_reduce=1 >> "${logfile}.${test_number}" 2>&1 || return $? + check_written $((loops * zone_size)) || return $? +} + +# Overwrite the first sequential zone four times sequentially. +test26() { + write_to_first_seq_zone write +} + +# Overwrite the first sequential zone four times using random writes. 
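+# (With --zonemode=zbd, random writes inside a zone are directed to the
+# zone's write pointer, so each of the four loops still fills the zone
+# front to back.)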
+test27() { + write_to_first_seq_zone randwrite +} + +# Multiple overlapping random write jobs for the same drive. +test28() { + local i jobs=16 off opts + + off=$((first_sequential_zone_sector * 512 + 64 * zone_size)) + [ -n "$is_zbd" ] && reset_zone "$dev" $((off / 512)) + opts=("--debug=zbd") + for ((i=0;i> "${logfile}.${test_number}" 2>&1 || return $? + check_written $((jobs * zone_size)) || return $? + check_reset_count -eq $jobs || + check_reset_count -eq $((jobs - 1)) || + return $? +} + +# Multiple overlapping random write jobs for the same drive and with a limited +# number of open zones. +test29() { + local i jobs=16 off opts=() + + off=$((first_sequential_zone_sector * 512 + 64 * zone_size)) + size=$((16*zone_size)) + [ -n "$is_zbd" ] && reset_zone "$dev" $((off / 512)) + opts=("--debug=zbd") + for ((i=0;i> "${logfile}.${test_number}" 2>&1 || return $? + check_written $((jobs * zone_size)) || return $? +} + +# Random reads and writes across the entire disk for 30s. +test30() { + local off + + off=$((first_sequential_zone_sector * 512)) + run_one_fio_job --ioengine=libaio --iodepth=8 --rw=randrw \ + --bs="$(max $((zone_size / 128)) "$logical_block_size")"\ + --zonemode=zbd --zonesize="${zone_size}" --offset=$off\ + --loops=2 --time_based --runtime=30s --norandommap=1\ + >>"${logfile}.${test_number}" 2>&1 +} + +# Random reads across all sequential zones for 30s. This is not only a fio +# test but also allows to verify the performance of a drive. +test31() { + local bs inc nz off opts size + + # Start with writing 128 KB to 128 sequential zones. + bs=128K + nz=128 + # shellcheck disable=SC2017 + inc=$(((disk_size - (first_sequential_zone_sector * 512)) / (nz * zone_size) + * zone_size)) + opts=() + for ((off = first_sequential_zone_sector * 512; off < disk_size; + off += inc)); do + opts+=("--name=$dev" "--filename=$dev" "--offset=$off" "--io_size=$bs") + opts+=("--bs=$bs" "--size=$zone_size" "--ioengine=libaio") + opts+=("--rw=write" "--direct=1" "--thread=1" "--stats=0") + opts+=("--zonemode=zbd" "--zonesize=${zone_size}") + done + "$(dirname "$0")/../../fio" "${opts[@]}" >> "${logfile}.${test_number}" 2>&1 + # Next, run the test. + off=$((first_sequential_zone_sector * 512)) + size=$((disk_size - off)) + opts=("--name=$dev" "--filename=$dev" "--offset=$off" "--size=$size") + opts+=("--bs=$bs" "--ioengine=psync" "--rw=randread" "--direct=1") + opts+=("--thread=1" "--time_based" "--runtime=30" "--zonemode=zbd") + opts+=("--zonesize=${zone_size}") + run_fio "${opts[@]}" >> "${logfile}.${test_number}" 2>&1 || return $? +} + +# Random writes across all sequential zones. This is not only a fio test but +# also allows to verify the performance of a drive. +test32() { + local off opts=() size + + off=$((first_sequential_zone_sector * 512)) + size=$((disk_size - off)) + opts+=("--name=$dev" "--filename=$dev" "--offset=$off" "--size=$size") + opts+=("--bs=128K" "--ioengine=psync" "--rw=randwrite" "--direct=1") + opts+=("--thread=1" "--time_based" "--runtime=30") + opts+=("--max_open_zones=$max_open_zones" "--zonemode=zbd") + opts+=("--zonesize=${zone_size}") + run_fio "${opts[@]}" >> "${logfile}.${test_number}" 2>&1 || return $? +} + +# Write to sequential zones with a block size that is not a divisor of the +# zone size. 
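+# (For example, with a hypothetical 1 MiB zone and bs = 768 KiB, the second
+# write in each zone would cross the zone boundary unless fio truncates it
+# to the remaining 256 KiB.)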
+test33() { + local bs io_size size + + size=$((2 * zone_size)) + io_size=$((5 * zone_size)) + bs=$((3 * zone_size / 4)) + run_fio_on_seq --ioengine=psync --iodepth=1 --rw=write --size=$size \ + --io_size=$io_size --bs=$bs \ + >> "${logfile}.${test_number}" 2>&1 || return $? + check_written $(((io_size + bs - 1) / bs * bs)) || return $? +} + +# Write to sequential zones with a block size that is not a divisor of the +# zone size and with data verification enabled. +test34() { + local size + + size=$((2 * zone_size)) + run_fio_on_seq --ioengine=psync --iodepth=1 --rw=write --size=$size \ + --do_verify=1 --verify=md5 --bs=$((3 * zone_size / 4)) \ + >> "${logfile}.${test_number}" 2>&1 && return 1 + grep -q 'not a divisor of' "${logfile}.${test_number}" +} + +# Test 1/4 for the I/O boundary rounding code: $size < $zone_size. +test35() { + local bs off io_size size + + off=$(((first_sequential_zone_sector + 1) * 512)) + size=$((zone_size - 2 * 512)) + bs=$((zone_size / 4)) + run_one_fio_job --offset=$off --size=$size --ioengine=psync --iodepth=1 \ + --rw=write --do_verify=1 --verify=md5 --bs=$bs \ + --zonemode=zbd --zonesize="${zone_size}" \ + >> "${logfile}.${test_number}" 2>&1 && return 1 + grep -q 'io_size must be at least one zone' "${logfile}.${test_number}" +} + +# Test 2/4 for the I/O boundary rounding code: $size < $zone_size. +test36() { + local bs off io_size size + + off=$(((first_sequential_zone_sector) * 512)) + size=$((zone_size - 512)) + bs=$((zone_size / 4)) + run_one_fio_job --offset=$off --size=$size --ioengine=psync --iodepth=1 \ + --rw=write --do_verify=1 --verify=md5 --bs=$bs \ + --zonemode=zbd --zonesize="${zone_size}" \ + >> "${logfile}.${test_number}" 2>&1 && return 1 + grep -q 'io_size must be at least one zone' "${logfile}.${test_number}" +} + +# Test 3/4 for the I/O boundary rounding code: $size > $zone_size. +test37() { + local bs off size + + if [ "$first_sequential_zone_sector" = 0 ]; then + off=0 + else + off=$(((first_sequential_zone_sector - 1) * 512)) + fi + size=$((zone_size + 2 * 512)) + bs=$((zone_size / 4)) + run_one_fio_job --offset=$off --size=$size --ioengine=psync --iodepth=1 \ + --rw=write --do_verify=1 --verify=md5 --bs=$bs \ + --zonemode=zbd --zonesize="${zone_size}" \ + >> "${logfile}.${test_number}" 2>&1 + check_written $((zone_size)) || return $? +} + +# Test 4/4 for the I/O boundary rounding code: $offset > $disk_size - $zone_size +test38() { + local bs off size + + size=$((logical_block_size)) + off=$((disk_size - logical_block_size)) + bs=$((logical_block_size)) + run_one_fio_job --offset=$off --size=$size --ioengine=psync --iodepth=1 \ + --rw=write --do_verify=1 --verify=md5 --bs=$bs \ + --zonemode=zbd --zonesize="${zone_size}" \ + >> "${logfile}.${test_number}" 2>&1 && return 1 + grep -q 'io_size must be at least one zone' "${logfile}.${test_number}" +} + +# Read one block from a block device. +read_one_block() { + local bs + + bs=$((logical_block_size)) + run_one_fio_job --rw=read --ioengine=psync --bs=$bs --size=$bs "$@" 2>&1 | + tee -a "${logfile}.${test_number}" +} + +# Check whether fio accepts --zonemode=none for zoned block devices. +test39() { + [ -n "$is_zbd" ] || return 0 + read_one_block --zonemode=none >/dev/null || return $? + check_read $((logical_block_size)) || return $? +} + +# Check whether fio accepts --zonemode=strided for zoned block devices. 
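+# (Without --zonesize the option should be rejected; with --zonesize set to a
+# single logical block, the one-block read below should succeed.)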
+test40() { + local bs + + bs=$((logical_block_size)) + [ -n "$is_zbd" ] || return 0 + read_one_block --zonemode=strided | + grep -q 'fio: --zonesize must be specified when using --zonemode=strided' || + return $? + read_one_block --zonemode=strided --zonesize=$bs >/dev/null || return $? + check_read $bs || return $? +} + +# Check whether fio checks the zone size for zoned block devices. +test41() { + [ -n "$is_zbd" ] || return 0 + read_one_block --zonemode=zbd --zonesize=$((2 * zone_size)) | + grep -q 'job parameter zonesize.*does not match disk zone size' +} + +# Check whether fio handles --zonesize=0 correctly for regular block devices. +test42() { + [ -n "$is_zbd" ] && return 0 + read_one_block --zonemode=zbd --zonesize=0 | + grep -q 'Specifying the zone size is mandatory for regular block devices with --zonemode=zbd' +} + +# Check whether fio handles --zonesize=1 correctly for regular block devices. +test43() { + [ -n "$is_zbd" ] && return 0 + read_one_block --zonemode=zbd --zonesize=1 | + grep -q 'zone size must be at least 512 bytes for --zonemode=zbd' +} + +# Check whether fio handles --zonemode=none --zonesize=1 correctly. +test44() { + read_one_block --zonemode=none --zonesize=1 | + grep -q 'fio: --zonemode=none and --zonesize are not compatible' +} + +test45() { + local bs i + + [ -z "$is_zbd" ] && return 0 + bs=$((logical_block_size)) + run_one_fio_job --ioengine=psync --iodepth=1 --rw=randwrite --bs=$bs\ + --offset=$((first_sequential_zone_sector * 512)) \ + --size="$zone_size" --do_verify=1 --verify=md5 2>&1 | + tee -a "${logfile}.${test_number}" | + grep -q "fio: first I/O failed. If .* is a zoned block device, consider --zonemode=zbd" +} + +# Random write to sequential zones, libaio, 8 jobs, queue depth 64 per job +test46() { + local size + + size=$((4 * zone_size)) + run_fio_on_seq --ioengine=libaio --iodepth=64 --rw=randwrite --bs=4K \ + --group_reporting=1 --numjobs=8 \ + >> "${logfile}.${test_number}" 2>&1 || return $? + check_written $((size * 8)) || return $? +} + +# Check whether fio handles --zonemode=zbd --zoneskip=1 correctly. +test47() { + local bs + + [ -z "$is_zbd" ] && return 0 + bs=$((logical_block_size)) + run_one_fio_job --ioengine=psync --rw=write --bs=$bs \ + --zonemode=zbd --zoneskip=1 \ + >> "${logfile}.${test_number}" 2>&1 && return 1 + grep -q 'zoneskip 1 is not a multiple of the device zone size' "${logfile}.${test_number}" +} + +tests=() +dynamic_analyzer=() +reset_all_zones= + +while [ "${1#-}" != "$1" ]; do + case "$1" in + -d) dynamic_analyzer=(valgrind "--read-var-info=yes" "--tool=drd" + "--show-confl-seg=no"); + shift;; + -e) dynamic_analyzer=(valgrind "--read-var-info=yes" "--tool=helgrind"); + shift;; + -r) reset_all_zones=1; shift;; + -t) tests+=("$2"); shift; shift;; + -v) dynamic_analyzer=(valgrind "--read-var-info=yes"); + shift;; + --) shift; break;; + esac +done + +if [ $# != 1 ]; then + usage + exit 1 +fi + +# shellcheck source=functions +source "$(dirname "$0")/functions" || exit $? + +dev=$1 +realdev=$(readlink -f "$dev") +basename=$(basename "$realdev") +major=$((0x$(stat -L -c '%t' "$realdev"))) || exit $? +minor=$((0x$(stat -L -c '%T' "$realdev"))) || exit $? 
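+# Example of what the lookups above and below produce (device name and
+# numbers are illustrative, not taken from a real run): for a whole disk
+# /dev/sda with major:minor 8:0, stat prints the hex pair "8" and "0", and
+# sysfs reports the capacity in 512-byte sectors, so:
+#
+#   major=$((0x8)) minor=$((0x0))
+#   echo $(( $(<"/sys/dev/block/8:0/size") * 512 ))	# capacity in bytes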
+disk_size=$(($(<"/sys/dev/block/$major:$minor/size")*512)) +# When the target is a partition device, get basename of its holder device to +# access sysfs path of the holder device +if [[ -r "/sys/dev/block/$major:$minor/partition" ]]; then + realsysfs=$(readlink "/sys/dev/block/$major:$minor") + basename=$(basename "${realsysfs%/*}") +fi +logical_block_size=$(<"/sys/block/$basename/queue/logical_block_size") +case "$(<"/sys/class/block/$basename/queue/zoned")" in + host-managed|host-aware) + is_zbd=true + if ! result=($(first_sequential_zone "$dev")); then + echo "Failed to determine first sequential zone" + exit 1 + fi + first_sequential_zone_sector=${result[0]} + sectors_per_zone=${result[1]} + zone_size=$((sectors_per_zone * 512)) + if ! max_open_zones=$(max_open_zones "$dev"); then + echo "Failed to determine maximum number of open zones" + exit 1 + fi + echo "First sequential zone starts at sector $first_sequential_zone_sector; zone size: $((zone_size >> 20)) MB" + set_io_scheduler "$basename" deadline || exit $? + if [ -n "$reset_all_zones" ]; then + reset_zone "$dev" -1 + fi + ;; + *) + first_sequential_zone_sector=$(((disk_size / 2) & + (logical_block_size - 1))) + zone_size=$(max 65536 "$logical_block_size") + sectors_per_zone=$((zone_size / 512)) + max_open_zones=128 + set_io_scheduler "$basename" none || exit $? + ;; +esac + +if [ "${#tests[@]}" = 0 ]; then + readarray -t tests < <(declare -F | grep "test[0-9]*" | \ + tr -c -d "[:digit:]\n" | sort -n) +fi + +logfile=$0.log + +passed=0 +failed=0 +rc=0 +for test_number in "${tests[@]}"; do + rm -f "${logfile}.${test_number}" + echo -n "Running test $test_number ... " + if eval "test$test_number"; then + status="PASS" + ((passed++)) + else + status="FAIL" + ((failed++)) + rc=1 + fi + echo "$status" + echo "$status" >> "${logfile}.${test_number}" +done + +echo "$passed tests passed" +if [ $failed -gt 0 ]; then + echo " and $failed tests failed" +fi +exit $rc diff --git a/td_error.c b/td_error.c new file mode 100644 index 0000000..9d58a31 --- /dev/null +++ b/td_error.c @@ -0,0 +1,40 @@ +#include "fio.h" +#include "io_ddir.h" +#include "td_error.h" + +static int __NON_FATAL_ERR[] = { EIO, EILSEQ }; + +enum error_type_bit td_error_type(enum fio_ddir ddir, int err) +{ + if (err == EILSEQ) + return ERROR_TYPE_VERIFY_BIT; + if (ddir == DDIR_READ) + return ERROR_TYPE_READ_BIT; + return ERROR_TYPE_WRITE_BIT; +} + +int td_non_fatal_error(struct thread_data *td, enum error_type_bit etype, + int err) +{ + unsigned int i; + + if (!td->o.ignore_error[etype]) { + td->o.ignore_error[etype] = __NON_FATAL_ERR; + td->o.ignore_error_nr[etype] = ARRAY_SIZE(__NON_FATAL_ERR); + } + + if (!(td->o.continue_on_error & (1 << etype))) + return 0; + for (i = 0; i < td->o.ignore_error_nr[etype]; i++) + if (td->o.ignore_error[etype][i] == err) + return 1; + + return 0; +} + +void update_error_count(struct thread_data *td, int err) +{ + td->total_err_count++; + if (td->total_err_count == 1) + td->first_error = err; +} diff --git a/td_error.h b/td_error.h new file mode 100644 index 0000000..1cc3a75 --- /dev/null +++ b/td_error.h @@ -0,0 +1,30 @@ +#ifndef FIO_TD_ERROR_H +#define FIO_TD_ERROR_H + +#include "io_ddir.h" + +/* + * What type of errors to continue on when continue_on_error is used, + * and what type of errors to ignore when ignore_error is used. 
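+ *
+ * A rough sketch of how the helpers declared below fit together
+ * (illustrative only, not copied from fio's I/O path):
+ *
+ *	enum error_type_bit eb = td_error_type(ddir, err);
+ *	if (td_non_fatal_error(td, eb, err))
+ *		update_error_count(td, err);	// swallow the error, go on
+ *	else
+ *		...treat err as fatal and stop the job...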
+ */ +enum error_type_bit { + ERROR_TYPE_READ_BIT = 0, + ERROR_TYPE_WRITE_BIT = 1, + ERROR_TYPE_VERIFY_BIT = 2, + ERROR_TYPE_CNT = 3, +}; + +enum error_type { + ERROR_TYPE_NONE = 0, + ERROR_TYPE_READ = 1 << ERROR_TYPE_READ_BIT, + ERROR_TYPE_WRITE = 1 << ERROR_TYPE_WRITE_BIT, + ERROR_TYPE_VERIFY = 1 << ERROR_TYPE_VERIFY_BIT, + ERROR_TYPE_ANY = 0xffff, +}; + +enum error_type_bit td_error_type(enum fio_ddir ddir, int err); +int td_non_fatal_error(struct thread_data *td, enum error_type_bit etype, + int err); +void update_error_count(struct thread_data *td, int err); + +#endif diff --git a/thread_options.h b/thread_options.h new file mode 100644 index 0000000..c78ed43 --- /dev/null +++ b/thread_options.h @@ -0,0 +1,638 @@ +#ifndef FIO_THREAD_OPTIONS_H +#define FIO_THREAD_OPTIONS_H + +#include "arch/arch.h" +#include "os/os.h" +#include "options.h" +#include "stat.h" +#include "gettime.h" +#include "lib/ieee754.h" +#include "lib/pattern.h" +#include "td_error.h" + +enum fio_zone_mode { + ZONE_MODE_NOT_SPECIFIED = 0, + ZONE_MODE_NONE = 1, + ZONE_MODE_STRIDED = 2, /* perform I/O in one zone at a time */ + /* perform I/O across multiple zones simultaneously */ + ZONE_MODE_ZBD = 3, +}; + +/* + * What type of allocation to use for io buffers + */ +enum fio_memtype { + MEM_MALLOC = 0, /* ordinary malloc */ + MEM_SHM, /* use shared memory segments */ + MEM_SHMHUGE, /* use shared memory segments with huge pages */ + MEM_MMAP, /* use anonynomous mmap */ + MEM_MMAPHUGE, /* memory mapped huge file */ + MEM_MMAPSHARED, /* use mmap with shared flag */ + MEM_CUDA_MALLOC,/* use GPU memory */ +}; + +#define ERROR_STR_MAX 128 + +#define BSSPLIT_MAX 64 +#define ZONESPLIT_MAX 256 + +struct bssplit { + uint64_t bs; + uint32_t perc; +}; + +struct zone_split { + uint8_t access_perc; + uint8_t size_perc; + uint8_t pad[6]; + uint64_t size; +}; + +#define NR_OPTS_SZ (FIO_MAX_OPTS / (8 * sizeof(uint64_t))) + +#define OPT_MAGIC 0x4f50544e + +struct thread_options { + int magic; + uint64_t set_options[NR_OPTS_SZ]; + char *description; + char *name; + char *wait_for; + char *directory; + char *filename; + char *filename_format; + char *opendir; + char *ioengine; + char *ioengine_so_path; + char *mmapfile; + enum td_ddir td_ddir; + unsigned int rw_seq; + unsigned int kb_base; + unsigned int unit_base; + unsigned int ddir_seq_nr; + long long ddir_seq_add; + unsigned int iodepth; + unsigned int iodepth_low; + unsigned int iodepth_batch; + unsigned int iodepth_batch_complete_min; + unsigned int iodepth_batch_complete_max; + unsigned int serialize_overlap; + + unsigned int unique_filename; + + unsigned long long size; + unsigned long long io_size; + unsigned int size_percent; + unsigned int fill_device; + unsigned int file_append; + unsigned long long file_size_low; + unsigned long long file_size_high; + unsigned long long start_offset; + unsigned long long start_offset_align; + + unsigned long long bs[DDIR_RWDIR_CNT]; + unsigned long long ba[DDIR_RWDIR_CNT]; + unsigned long long min_bs[DDIR_RWDIR_CNT]; + unsigned long long max_bs[DDIR_RWDIR_CNT]; + struct bssplit *bssplit[DDIR_RWDIR_CNT]; + unsigned int bssplit_nr[DDIR_RWDIR_CNT]; + + int *ignore_error[ERROR_TYPE_CNT]; + unsigned int ignore_error_nr[ERROR_TYPE_CNT]; + unsigned int error_dump; + + unsigned int nr_files; + unsigned int open_files; + enum file_lock_mode file_lock_mode; + + unsigned int odirect; + unsigned int oatomic; + unsigned int invalidate_cache; + unsigned int create_serialize; + unsigned int create_fsync; + unsigned int create_on_open; + unsigned int 
create_only; + unsigned int end_fsync; + unsigned int pre_read; + unsigned int sync_io; + unsigned int write_hint; + unsigned int verify; + unsigned int do_verify; + unsigned int verify_interval; + unsigned int verify_offset; + char verify_pattern[MAX_PATTERN_SIZE]; + unsigned int verify_pattern_bytes; + struct pattern_fmt verify_fmt[8]; + unsigned int verify_fmt_sz; + unsigned int verify_fatal; + unsigned int verify_dump; + unsigned int verify_async; + unsigned long long verify_backlog; + unsigned int verify_batch; + unsigned int experimental_verify; + unsigned int verify_state; + unsigned int verify_state_save; + unsigned int use_thread; + unsigned int unlink; + unsigned int unlink_each_loop; + unsigned int do_disk_util; + unsigned int override_sync; + unsigned int rand_repeatable; + unsigned int allrand_repeatable; + unsigned long long rand_seed; + unsigned int log_avg_msec; + unsigned int log_hist_msec; + unsigned int log_hist_coarseness; + unsigned int log_max; + unsigned int log_offset; + unsigned int log_gz; + unsigned int log_gz_store; + unsigned int log_unix_epoch; + unsigned int norandommap; + unsigned int softrandommap; + unsigned int bs_unaligned; + unsigned int fsync_on_close; + unsigned int bs_is_seq_rand; + + unsigned int verify_only; + + unsigned int random_distribution; + unsigned int exitall_error; + + struct zone_split *zone_split[DDIR_RWDIR_CNT]; + unsigned int zone_split_nr[DDIR_RWDIR_CNT]; + + fio_fp64_t zipf_theta; + fio_fp64_t pareto_h; + fio_fp64_t gauss_dev; + + unsigned int random_generator; + + unsigned int perc_rand[DDIR_RWDIR_CNT]; + + unsigned int hugepage_size; + unsigned long long rw_min_bs; + unsigned int thinktime; + unsigned int thinktime_spin; + unsigned int thinktime_blocks; + unsigned int fsync_blocks; + unsigned int fdatasync_blocks; + unsigned int barrier_blocks; + unsigned long long start_delay; + unsigned long long start_delay_orig; + unsigned long long start_delay_high; + unsigned long long timeout; + unsigned long long ramp_time; + unsigned int ss_state; + fio_fp64_t ss_limit; + unsigned long long ss_dur; + unsigned long long ss_ramp_time; + unsigned int overwrite; + unsigned int bw_avg_time; + unsigned int iops_avg_time; + unsigned int loops; + unsigned long long zone_range; + unsigned long long zone_size; + unsigned long long zone_skip; + enum fio_zone_mode zone_mode; + unsigned long long lockmem; + enum fio_memtype mem_type; + unsigned int mem_align; + + unsigned long long max_latency; + + unsigned short exit_what; + unsigned short stonewall; + unsigned int new_group; + unsigned int numjobs; + os_cpu_mask_t cpumask; + os_cpu_mask_t verify_cpumask; + os_cpu_mask_t log_gz_cpumask; + unsigned int cpus_allowed_policy; + char *numa_cpunodes; + unsigned short numa_mem_mode; + unsigned int numa_mem_prefer_node; + char *numa_memnodes; + unsigned int gpu_dev_id; + unsigned int start_offset_percent; + + unsigned int iolog; + unsigned int rwmixcycle; + unsigned int rwmix[DDIR_RWDIR_CNT]; + unsigned int nice; + unsigned int ioprio; + unsigned int ioprio_class; + unsigned int file_service_type; + unsigned int group_reporting; + unsigned int stats; + unsigned int fadvise_hint; + enum fio_fallocate_mode fallocate_mode; + unsigned int zero_buffers; + unsigned int refill_buffers; + unsigned int scramble_buffers; + char buffer_pattern[MAX_PATTERN_SIZE]; + unsigned int buffer_pattern_bytes; + unsigned int compress_percentage; + unsigned int compress_chunk; + unsigned int dedupe_percentage; + unsigned int time_based; + unsigned int disable_lat; + unsigned int 
disable_clat; + unsigned int disable_slat; + unsigned int disable_bw; + unsigned int unified_rw_rep; + unsigned int gtod_reduce; + unsigned int gtod_cpu; + enum fio_cs clocksource; + unsigned int no_stall; + unsigned int trim_percentage; + unsigned int trim_batch; + unsigned int trim_zero; + unsigned long long trim_backlog; + unsigned int clat_percentiles; + unsigned int slat_percentiles; + unsigned int lat_percentiles; + unsigned int percentile_precision; /* digits after decimal for percentiles */ + fio_fp64_t percentile_list[FIO_IO_U_LIST_MAX_LEN]; + + char *read_iolog_file; + bool read_iolog_chunked; + char *write_iolog_file; + char *merge_blktrace_file; + fio_fp64_t merge_blktrace_scalars[FIO_IO_U_LIST_MAX_LEN]; + fio_fp64_t merge_blktrace_iters[FIO_IO_U_LIST_MAX_LEN]; + + unsigned int write_bw_log; + unsigned int write_lat_log; + unsigned int write_iops_log; + unsigned int write_hist_log; + + char *bw_log_file; + char *lat_log_file; + char *iops_log_file; + char *hist_log_file; + char *replay_redirect; + + /* + * Pre-run and post-run shell + */ + char *exec_prerun; + char *exec_postrun; + + uint64_t rate[DDIR_RWDIR_CNT]; + uint64_t ratemin[DDIR_RWDIR_CNT]; + unsigned int ratecycle; + unsigned int io_submit_mode; + unsigned int rate_iops[DDIR_RWDIR_CNT]; + unsigned int rate_iops_min[DDIR_RWDIR_CNT]; + unsigned int rate_process; + unsigned int rate_ign_think; + + char *ioscheduler; + + /* + * I/O Error handling + */ + enum error_type continue_on_error; + + /* + * Benchmark profile type + */ + char *profile; + + /* + * blkio cgroup support + */ + char *cgroup; + unsigned int cgroup_weight; + unsigned int cgroup_nodelete; + + unsigned int uid; + unsigned int gid; + + int flow_id; + int flow; + int flow_watermark; + unsigned int flow_sleep; + + unsigned int offset_increment_percent; + unsigned long long offset_increment; + unsigned long long number_ios; + + unsigned int sync_file_range; + + unsigned long long latency_target; + unsigned long long latency_window; + fio_fp64_t latency_percentile; + + unsigned int sig_figs; + + unsigned block_error_hist; + + unsigned int replay_align; + unsigned int replay_scale; + unsigned int replay_time_scale; + unsigned int replay_skip; + + unsigned int per_job_logs; + + unsigned int allow_create; + unsigned int allow_mounted_write; + + /* Parameters that affect zonemode=zbd */ + unsigned int read_beyond_wp; + int max_open_zones; + fio_fp64_t zrt; + fio_fp64_t zrf; +}; + +#define FIO_TOP_STR_MAX 256 + +struct thread_options_pack { + uint64_t set_options[NR_OPTS_SZ]; + uint8_t description[FIO_TOP_STR_MAX]; + uint8_t name[FIO_TOP_STR_MAX]; + uint8_t wait_for[FIO_TOP_STR_MAX]; + uint8_t directory[FIO_TOP_STR_MAX]; + uint8_t filename[FIO_TOP_STR_MAX]; + uint8_t filename_format[FIO_TOP_STR_MAX]; + uint8_t opendir[FIO_TOP_STR_MAX]; + uint8_t ioengine[FIO_TOP_STR_MAX]; + uint8_t mmapfile[FIO_TOP_STR_MAX]; + uint32_t td_ddir; + uint32_t rw_seq; + uint32_t kb_base; + uint32_t unit_base; + uint32_t ddir_seq_nr; + uint64_t ddir_seq_add; + uint32_t iodepth; + uint32_t iodepth_low; + uint32_t iodepth_batch; + uint32_t iodepth_batch_complete_min; + uint32_t iodepth_batch_complete_max; + uint32_t serialize_overlap; + uint32_t pad; + + uint64_t size; + uint64_t io_size; + uint32_t size_percent; + uint32_t fill_device; + uint32_t file_append; + uint32_t unique_filename; + uint64_t file_size_low; + uint64_t file_size_high; + uint64_t start_offset; + uint64_t start_offset_align; + + uint64_t bs[DDIR_RWDIR_CNT]; + uint64_t ba[DDIR_RWDIR_CNT]; + uint64_t 
min_bs[DDIR_RWDIR_CNT]; + uint64_t max_bs[DDIR_RWDIR_CNT]; + struct bssplit bssplit[DDIR_RWDIR_CNT][BSSPLIT_MAX]; + uint32_t bssplit_nr[DDIR_RWDIR_CNT]; + + uint32_t ignore_error[ERROR_TYPE_CNT][ERROR_STR_MAX]; + uint32_t ignore_error_nr[ERROR_TYPE_CNT]; + uint32_t error_dump; + + uint32_t nr_files; + uint32_t open_files; + uint32_t file_lock_mode; + + uint32_t odirect; + uint32_t oatomic; + uint32_t invalidate_cache; + uint32_t create_serialize; + uint32_t create_fsync; + uint32_t create_on_open; + uint32_t create_only; + uint32_t end_fsync; + uint32_t pre_read; + uint32_t sync_io; + uint32_t write_hint; + uint32_t verify; + uint32_t do_verify; + uint32_t verify_interval; + uint32_t verify_offset; + uint8_t verify_pattern[MAX_PATTERN_SIZE]; + uint32_t verify_pattern_bytes; + uint32_t verify_fatal; + uint32_t verify_dump; + uint32_t verify_async; + uint64_t verify_backlog; + uint32_t verify_batch; + uint32_t experimental_verify; + uint32_t verify_state; + uint32_t verify_state_save; + uint32_t use_thread; + uint32_t unlink; + uint32_t unlink_each_loop; + uint32_t do_disk_util; + uint32_t override_sync; + uint32_t rand_repeatable; + uint32_t allrand_repeatable; + uint32_t pad2; + uint64_t rand_seed; + uint32_t log_avg_msec; + uint32_t log_hist_msec; + uint32_t log_hist_coarseness; + uint32_t log_max; + uint32_t log_offset; + uint32_t log_gz; + uint32_t log_gz_store; + uint32_t log_unix_epoch; + uint32_t norandommap; + uint32_t softrandommap; + uint32_t bs_unaligned; + uint32_t fsync_on_close; + uint32_t bs_is_seq_rand; + + uint32_t random_distribution; + uint32_t exitall_error; + + uint32_t sync_file_range; + + struct zone_split zone_split[DDIR_RWDIR_CNT][ZONESPLIT_MAX]; + uint32_t zone_split_nr[DDIR_RWDIR_CNT]; + + fio_fp64_t zipf_theta; + fio_fp64_t pareto_h; + fio_fp64_t gauss_dev; + + uint32_t random_generator; + + uint32_t perc_rand[DDIR_RWDIR_CNT]; + + uint32_t hugepage_size; + uint64_t rw_min_bs; + uint32_t thinktime; + uint32_t thinktime_spin; + uint32_t thinktime_blocks; + uint32_t fsync_blocks; + uint32_t fdatasync_blocks; + uint32_t barrier_blocks; + uint64_t start_delay; + uint64_t start_delay_high; + uint64_t timeout; + uint64_t ramp_time; + uint64_t ss_dur; + uint64_t ss_ramp_time; + uint32_t ss_state; + fio_fp64_t ss_limit; + uint32_t overwrite; + uint32_t bw_avg_time; + uint32_t iops_avg_time; + uint32_t loops; + uint64_t zone_range; + uint64_t zone_size; + uint64_t zone_skip; + uint64_t lockmem; + uint32_t mem_type; + uint32_t mem_align; + + uint16_t exit_what; + uint16_t stonewall; + uint32_t new_group; + uint32_t numjobs; + /* + * We currently can't convert these, so don't enable them + */ +#if 0 + uint8_t cpumask[FIO_TOP_STR_MAX]; + uint8_t verify_cpumask[FIO_TOP_STR_MAX]; + uint8_t log_gz_cpumask[FIO_TOP_STR_MAX]; +#endif + uint32_t gpu_dev_id; + uint32_t start_offset_percent; + uint32_t cpus_allowed_policy; + uint32_t iolog; + uint32_t rwmixcycle; + uint32_t rwmix[DDIR_RWDIR_CNT]; + uint32_t nice; + uint32_t ioprio; + uint32_t ioprio_class; + uint32_t file_service_type; + uint32_t group_reporting; + uint32_t stats; + uint32_t fadvise_hint; + uint32_t fallocate_mode; + uint32_t zero_buffers; + uint32_t refill_buffers; + uint32_t scramble_buffers; + uint8_t buffer_pattern[MAX_PATTERN_SIZE]; + uint32_t buffer_pattern_bytes; + uint32_t compress_percentage; + uint32_t compress_chunk; + uint32_t dedupe_percentage; + uint32_t time_based; + uint32_t disable_lat; + uint32_t disable_clat; + uint32_t disable_slat; + uint32_t disable_bw; + uint32_t unified_rw_rep; + uint32_t 
gtod_reduce;
+	uint32_t gtod_cpu;
+	uint32_t clocksource;
+	uint32_t no_stall;
+	uint32_t trim_percentage;
+	uint32_t trim_batch;
+	uint32_t trim_zero;
+	uint64_t trim_backlog;
+	uint32_t clat_percentiles;
+	uint32_t lat_percentiles;
+	uint32_t slat_percentiles;
+	uint32_t percentile_precision;
+	uint32_t pad3;
+	fio_fp64_t percentile_list[FIO_IO_U_LIST_MAX_LEN];
+
+	uint8_t read_iolog_file[FIO_TOP_STR_MAX];
+	uint8_t write_iolog_file[FIO_TOP_STR_MAX];
+	uint8_t merge_blktrace_file[FIO_TOP_STR_MAX];
+	fio_fp64_t merge_blktrace_scalars[FIO_IO_U_LIST_MAX_LEN];
+	fio_fp64_t merge_blktrace_iters[FIO_IO_U_LIST_MAX_LEN];
+
+	uint32_t write_bw_log;
+	uint32_t write_lat_log;
+	uint32_t write_iops_log;
+	uint32_t write_hist_log;
+
+	uint8_t bw_log_file[FIO_TOP_STR_MAX];
+	uint8_t lat_log_file[FIO_TOP_STR_MAX];
+	uint8_t iops_log_file[FIO_TOP_STR_MAX];
+	uint8_t hist_log_file[FIO_TOP_STR_MAX];
+	uint8_t replay_redirect[FIO_TOP_STR_MAX];
+
+	/*
+	 * Pre-run and post-run shell
+	 */
+	uint8_t exec_prerun[FIO_TOP_STR_MAX];
+	uint8_t exec_postrun[FIO_TOP_STR_MAX];
+
+	uint64_t rate[DDIR_RWDIR_CNT];
+	uint64_t ratemin[DDIR_RWDIR_CNT];
+	uint32_t ratecycle;
+	uint32_t io_submit_mode;
+	uint32_t rate_iops[DDIR_RWDIR_CNT];
+	uint32_t rate_iops_min[DDIR_RWDIR_CNT];
+	uint32_t rate_process;
+	uint32_t rate_ign_think;
+
+	uint8_t ioscheduler[FIO_TOP_STR_MAX];
+
+	/*
+	 * I/O Error handling
+	 */
+	uint32_t continue_on_error;
+
+	/*
+	 * Benchmark profile type
+	 */
+	uint8_t profile[FIO_TOP_STR_MAX];
+
+	/*
+	 * blkio cgroup support
+	 */
+	uint8_t cgroup[FIO_TOP_STR_MAX];
+	uint32_t cgroup_weight;
+	uint32_t cgroup_nodelete;
+
+	uint32_t uid;
+	uint32_t gid;
+
+	int32_t flow_id;
+	int32_t flow;
+	int32_t flow_watermark;
+	uint32_t flow_sleep;
+
+	uint32_t offset_increment_percent;
+	uint64_t offset_increment;
+	uint64_t number_ios;
+
+	uint64_t latency_target;
+	uint64_t latency_window;
+	uint64_t max_latency;
+	fio_fp64_t latency_percentile;
+
+	uint32_t sig_figs;
+
+	uint32_t block_error_hist;
+
+	uint32_t replay_align;
+	uint32_t replay_scale;
+	uint32_t replay_time_scale;
+	uint32_t replay_skip;
+
+	uint32_t per_job_logs;
+
+	uint32_t allow_create;
+	uint32_t allow_mounted_write;
+
+	uint32_t zone_mode;
+} __attribute__((packed));
+
+extern void convert_thread_options_to_cpu(struct thread_options *o, struct thread_options_pack *top);
+extern void convert_thread_options_to_net(struct thread_options_pack *top, struct thread_options *);
+extern int fio_test_cconv(struct thread_options *);
+extern void options_default_fill(struct thread_options *o);
+
+#endif
diff --git a/tickmarks.c b/tickmarks.c
new file mode 100644
index 0000000..88bace0
--- /dev/null
+++ b/tickmarks.c
@@ -0,0 +1,147 @@
+#include <stdio.h>
+#include <string.h>
+#include <math.h>
+#include <stdlib.h>
+
+/*
+ * adapted from Paul Heckbert's algorithm on p 657-659 of
+ * Andrew S. Glassner's book, "Graphics Gems"
+ * ISBN 0-12-286166-3
+ *
+ */
+
+#include "tickmarks.h"
+
+#define MAX(a, b) (((a) < (b)) ? (b) : (a))
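+
+/*
+ * Illustrative worked example for the functions below: calc_tickmarks(0, 87,
+ * 10, ...) computes range = nicenum(87, 0) = 100 and a tick spacing of
+ * d = nicenum(100 / 9, 1) = 10, so graphmin = 0, graphmax = 90 and the
+ * ticks come out as 0, 10, 20, ..., 90, printed with no fractional digits
+ * since nfrac = MAX(-floor(log10(10)), 0) = 0.
+ */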
+
+static double nicenum(double x, int round)
+{
+	int exp;	/* exponent of x */
+	double f;	/* fractional part of x */
+
+	exp = floor(log10(x));
+	f = x / pow(10.0, exp);
+	if (round) {
+		if (f < 1.5)
+			return 1.0 * pow(10.0, exp);
+		if (f < 3.0)
+			return 2.0 * pow(10.0, exp);
+		if (f < 7.0)
+			return 5.0 * pow(10.0, exp);
+		return 10.0 * pow(10.0, exp);
+	}
+	if (f <= 1.0)
+		return 1.0 * pow(10.0, exp);
+	if (f <= 2.0)
+		return 2.0 * pow(10.0, exp);
+	if (f <= 5.0)
+		return 5.0 * pow(10.0, exp);
+	return 10.0 * pow(10.0, exp);
+}
+
+static void shorten(struct tickmark *tm, int nticks, int *power_of_ten,
+		int use_KMG_symbols, int base_offset)
+{
+	const char shorten_chr[] = { 0, 'K', 'M', 'G', 'P', 'E', 0 };
+	int i, l, minshorten, shorten_idx = 0;
+	char *str;
+
+	minshorten = 100;
+	for (i = 0; i < nticks; i++) {
+		str = tm[i].string;
+		l = strlen(str);
+
+		if (strcmp(str, "0") == 0)
+			continue;
+		if (l > 9 && strcmp(&str[l - 9], "000000000") == 0) {
+			*power_of_ten = 9;
+			shorten_idx = 3;
+		} else if (6 < minshorten && l > 6 &&
+				strcmp(&str[l - 6], "000000") == 0) {
+			*power_of_ten = 6;
+			shorten_idx = 2;
+		} else if (l > 3 && strcmp(&str[l - 3], "000") == 0) {
+			*power_of_ten = 3;
+			shorten_idx = 1;
+		} else {
+			*power_of_ten = 0;
+		}
+
+		if (*power_of_ten < minshorten)
+			minshorten = *power_of_ten;
+	}
+
+	if (minshorten == 0)
+		return;
+	if (!use_KMG_symbols)
+		shorten_idx = 0;
+	else if (base_offset)
+		shorten_idx += base_offset;
+
+	for (i = 0; i < nticks; i++) {
+		str = tm[i].string;
+		l = strlen(str);
+		str[l - minshorten] = shorten_chr[shorten_idx];
+		if (shorten_idx)
+			str[l - minshorten + 1] = '\0';
+	}
+}
+
+int calc_tickmarks(double min, double max, int nticks, struct tickmark **tm,
+		int *power_of_ten, int use_KMG_symbols, int base_offset)
+{
+	char str[100];
+	int nfrac;
+	double d;	/* tick mark spacing */
+	double graphmin, graphmax;	/* graph range min and max */
+	double range, x;
+	int count, i;
+
+	/* we expect min != max */
+	range = nicenum(max - min, 0);
+	d = nicenum(range / (nticks - 1), 1);
+	graphmin = floor(min / d) * d;
+	graphmax = ceil(max / d) * d;
+	nfrac = MAX(-floor(log10(d)), 0);
+	snprintf(str, sizeof(str)-1, "%%.%df", nfrac);
+
+	count = ((graphmax + 0.5 * d) - graphmin) / d + 1;
+	*tm = malloc(sizeof(**tm) * count);
+
+	i = 0;
+	for (x = graphmin; x < graphmax + 0.5 * d; x += d) {
+		(*tm)[i].value = x;
+		sprintf((*tm)[i].string, str, x);
+		i++;
+	}
+	shorten(*tm, i, power_of_ten, use_KMG_symbols, base_offset);
+	return i;
+}
+
+#if 0
+
+static void test_range(double x, double y)
+{
+	int nticks, i;
+
+	struct tickmark *tm = NULL;
+	printf("Testing range %g - %g\n", x, y);
+	nticks = calc_tickmarks(x, y, 10, &tm);
+
+	for (i = 0; i < nticks; i++)
+		printf("   (%s) %g\n", tm[i].string, tm[i].value);
+
+	printf("\n\n");
+	free(tm);
+}
+
+int main(int argc, char *argv[])
+{
+	test_range(0.0005, 0.008);
+	test_range(0.5, 0.8);
+	test_range(5.5, 8.8);
+	test_range(50.5, 80.8);
+	test_range(-20, 20.8);
+	test_range(-30, 700.8);
+}
+#endif
diff --git a/tickmarks.h b/tickmarks.h
new file mode 100644
index 0000000..1e310db
--- /dev/null
+++ b/tickmarks.h
@@ -0,0 +1,12 @@
+#ifndef TICKMARKS_H
+#define TICKMARKS_H
+
+struct tickmark {
+	double value;
+	char string[20];
+};
+
+int calc_tickmarks(double min, double max, int nticks, struct tickmark **tm,
+	int *power_of_ten, int use_KMG_symbols, int base_off);
+
+#endif
diff --git a/time.c b/time.c
new file mode 100644
index 0000000..cd0e2a8
--- /dev/null
+++ b/time.c
@@ -0,0 +1,189 @@
+#include <time.h>
+#include <sys/time.h>
+
+#include "fio.h"
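+
+/*
+ * Illustrative arithmetic for timespec_add_msec() below: adding 250 msec to
+ * { .tv_sec = 1, .tv_nsec = 900000000 } first bumps tv_nsec to 1150000000,
+ * and the normalization step then folds the overflow into
+ * { .tv_sec = 2, .tv_nsec = 150000000 }.
+ */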
+
+static struct timespec genesis;
+static unsigned long ns_granularity;
+
+void timespec_add_msec(struct timespec *ts, unsigned int msec)
+{
+	uint64_t adj_nsec = 1000000ULL * msec;
+
+	ts->tv_nsec += adj_nsec;
+	if (adj_nsec >= 1000000000) {
+		uint64_t adj_sec = adj_nsec / 1000000000;
+
+		ts->tv_nsec -= adj_sec * 1000000000;
+		ts->tv_sec += adj_sec;
+	}
+	if (ts->tv_nsec >= 1000000000) {
+		ts->tv_nsec -= 1000000000;
+		ts->tv_sec++;
+	}
+}
+
+/*
+ * busy looping version for the last few usec
+ */
+uint64_t usec_spin(unsigned int usec)
+{
+	struct timespec start;
+	uint64_t t;
+
+	fio_gettime(&start, NULL);
+	while ((t = utime_since_now(&start)) < usec)
+		nop;
+
+	return t;
+}
+
+uint64_t usec_sleep(struct thread_data *td, unsigned long usec)
+{
+	struct timespec req;
+	struct timespec tv;
+	uint64_t t = 0;
+
+	do {
+		unsigned long ts = usec;
+
+		if (usec < ns_granularity) {
+			t += usec_spin(usec);
+			break;
+		}
+
+		ts = usec - ns_granularity;
+
+		if (ts >= 1000000) {
+			req.tv_sec = ts / 1000000;
+			ts -= 1000000 * req.tv_sec;
+			/*
+			 * Limit sleep to ~1 second at most, otherwise we
+			 * don't notice when someone signals the job to
+			 * exit manually.
+			 */
+			if (req.tv_sec > 1)
+				req.tv_sec = 1;
+		} else
+			req.tv_sec = 0;
+
+		req.tv_nsec = ts * 1000;
+		fio_gettime(&tv, NULL);
+
+		if (nanosleep(&req, NULL) < 0)
+			break;
+
+		ts = utime_since_now(&tv);
+		t += ts;
+		if (ts >= usec)
+			break;
+
+		usec -= ts;
+	} while (!td->terminate);
+
+	return t;
+}
+
+uint64_t time_since_genesis(void)
+{
+	return time_since_now(&genesis);
+}
+
+uint64_t mtime_since_genesis(void)
+{
+	return mtime_since_now(&genesis);
+}
+
+uint64_t utime_since_genesis(void)
+{
+	return utime_since_now(&genesis);
+}
+
+bool in_ramp_time(struct thread_data *td)
+{
+	return td->o.ramp_time && !td->ramp_time_over;
+}
+
+static bool parent_update_ramp(struct thread_data *td)
+{
+	struct thread_data *parent = td->parent;
+
+	if (!parent || parent->ramp_time_over)
+		return false;
+
+	reset_all_stats(parent);
+	parent->ramp_time_over = true;
+	td_set_runstate(parent, TD_RAMP);
+	return true;
+}
+
+bool ramp_time_over(struct thread_data *td)
+{
+	if (!td->o.ramp_time || td->ramp_time_over)
+		return true;
+
+	if (utime_since_now(&td->epoch) >= td->o.ramp_time) {
+		td->ramp_time_over = true;
+		reset_all_stats(td);
+		reset_io_stats(td);
+		td_set_runstate(td, TD_RAMP);
+
+		/*
+		 * If we have a parent, the parent isn't doing IO. Hence
+		 * the parent never enters do_io(), which will switch us
+		 * from RAMP -> RUNNING. Do this manually here.
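+		 *
+		 * (Illustrative numbers: with ramp_time=5 and a time_based
+		 * run of 30 seconds, the first 5 seconds of I/O execute but
+		 * are dropped from the stats by the reset_all_stats() and
+		 * reset_io_stats() calls above, so the reported results
+		 * cover only the remaining 25 seconds.)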
+ */ + if (parent_update_ramp(td)) + td_set_runstate(td, TD_RUNNING); + + return true; + } + + return false; +} + +void fio_time_init(void) +{ + int i; + + fio_clock_init(); + + /* + * Check the granularity of the nanosleep function + */ + for (i = 0; i < 10; i++) { + struct timespec tv, ts; + unsigned long elapsed; + + fio_gettime(&tv, NULL); + ts.tv_sec = 0; + ts.tv_nsec = 1000; + + nanosleep(&ts, NULL); + elapsed = utime_since_now(&tv); + + if (elapsed > ns_granularity) + ns_granularity = elapsed; + } +} + +void set_genesis_time(void) +{ + fio_gettime(&genesis, NULL); +} + +void set_epoch_time(struct thread_data *td, int log_unix_epoch) +{ + fio_gettime(&td->epoch, NULL); + if (log_unix_epoch) { + struct timeval tv; + gettimeofday(&tv, NULL); + td->unix_epoch = (unsigned long long)(tv.tv_sec) * 1000 + + (unsigned long long)(tv.tv_usec) / 1000; + } +} + +void fill_start_time(struct timespec *t) +{ + memcpy(t, &genesis, sizeof(genesis)); +} diff --git a/tools/.gitignore b/tools/.gitignore new file mode 100644 index 0000000..b25c15b --- /dev/null +++ b/tools/.gitignore @@ -0,0 +1 @@ +*~ diff --git a/tools/fio.service b/tools/fio.service new file mode 100644 index 0000000..678158b --- /dev/null +++ b/tools/fio.service @@ -0,0 +1,10 @@ +[Unit] +Description=Flexible I/O tester server +After=network.target + +[Service] +Type=simple +ExecStart=/usr/bin/fio --server + +[Install] +WantedBy=multi-user.target diff --git a/tools/fio_generate_plots b/tools/fio_generate_plots new file mode 100755 index 0000000..8872206 --- /dev/null +++ b/tools/fio_generate_plots @@ -0,0 +1,132 @@ +#!/bin/sh +# +# This script is an almost total rewrite by Louwrentius +# of the original fio_generate_plots script provided as part of the FIO storage +# benchmark utiliy. I only retained how GNUplot is used to generate graphs, as +# that is something I know nothing about. +# +# The script uses the files generated by FIO to create nice graphs in the +# SVG format. This output format is supported by most modern browsers and +# allows resolution independent graphs to be generated. +# +# This script supports GNUPLOT 4.4 and higher. +# +# Version 1.0 @ 20121231 +# +# +# + +if [ -z "$1" ]; then + echo "Usage: fio_generate_plots subtitle [xres yres]" + exit 1 +fi + +GNUPLOT=$(which gnuplot) +if [ ! -x "$GNUPLOT" ] +then + echo You need gnuplot installed to generate graphs + exit 1 +fi + +TITLE="$1" + +# set resolution +if [ ! -z "$2" ] && [ ! 
-z "$3" ] +then + XRES="$2" + YRES="$3" +else + XRES=1280 + YRES=768 +fi + +if [ -z "$SAMPLE_DURATION" ] +then + SAMPLE_DURATION="*" +fi + +DEFAULT_GRID_LINE_TYPE=3 +DEFAULT_LINE_WIDTH=2 +DEFAULT_LINE_COLORS=" +set object 1 rectangle from screen 0,0 to screen 1,1 fillcolor rgb\"#ffffff\" behind +set style line 1 lc rgb \"#E41A1C\" lw $DEFAULT_LINE_WIDTH lt 1; +set style line 2 lc rgb \"#377EB8\" lw $DEFAULT_LINE_WIDTH lt 1; +set style line 3 lc rgb \"#4DAF4A\" lw $DEFAULT_LINE_WIDTH lt 1; +set style line 4 lc rgb \"#984EA3\" lw $DEFAULT_LINE_WIDTH lt 1; +set style line 5 lc rgb \"#FF7F00\" lw $DEFAULT_LINE_WIDTH lt 1; +set style line 6 lc rgb \"#DADA33\" lw $DEFAULT_LINE_WIDTH lt 1; +set style line 7 lc rgb \"#A65628\" lw $DEFAULT_LINE_WIDTH lt 1; +set style line 20 lc rgb \"#000000\" lt $DEFAULT_GRID_LINE_TYPE lw $DEFAULT_LINE_WIDTH; +" + +DEFAULT_TERMINAL="set terminal svg enhanced dashed size $XRES,$YRES dynamic" +DEFAULT_TITLE_FONT="\"Helvetica,28\"" +DEFAULT_AXIS_FONT="\"Helvetica,14\"" +DEFAULT_AXIS_LABEL_FONT="\"Helvetica,16\"" +DEFAULT_XLABEL="set xlabel \"Time (sec)\" font $DEFAULT_AXIS_LABEL_FONT" +DEFAULT_XTIC="set xtics font $DEFAULT_AXIS_FONT" +DEFAULT_YTIC="set ytics font $DEFAULT_AXIS_FONT" +DEFAULT_MXTIC="set mxtics 0" +DEFAULT_MYTIC="set mytics 2" +DEFAULT_XRANGE="set xrange [0:$SAMPLE_DURATION]" +DEFAULT_YRANGE="set yrange [0:*]" +DEFAULT_GRID="set grid ls 20" +DEFAULT_KEY="set key outside bottom center ; set key box enhanced spacing 2.0 samplen 3 horizontal width 4 height 1.2 " +DEFAULT_SOURCE="set label 30 \"Data source: http://example.com\" font $DEFAULT_AXIS_FONT tc rgb \"#00000f\" at screen 0.976,0.175 right" +DEFAULT_OPTS="$DEFAULT_LINE_COLORS ; $DEFAULT_GRID_LINE ; $DEFAULT_GRID ; $DEFAULT_GRID_MINOR ; $DEFAULT_XLABEL ; $DEFAULT_XRANGE ; $DEFAULT_YRANGE ; $DEFAULT_XTIC ; $DEFAULT_YTIC ; $DEFAULT_MXTIC ; $DEFAULT_MYTIC ; $DEFAULT_KEY ; $DEFAULT_TERMINAL ; $DEFAULT_SOURCE" + +plot () { + + if [ -z "$TITLE" ] + then + PLOT_TITLE=" set title \"$1\" font $DEFAULT_TITLE_FONT" + else + PLOT_TITLE=" set title \"$TITLE\\\n\\\n{/*0.6 "$1"}\" font $DEFAULT_TITLE_FONT" + fi + FILETYPE="$2" + YAXIS="set ylabel \"$3\" font $DEFAULT_AXIS_LABEL_FONT" + SCALE=$4 + + echo "Title: $PLOT_TITLE" + echo "File type: $FILETYPE" + echo "yaxis: $YAXIS" + + i=0 + + for x in *_"$FILETYPE".log *_"$FILETYPE".*.log + do + if [ -e "$x" ]; then + i=$((i+1)) + PT=$(echo $x | sed 's/\(.*\)_'$FILETYPE'\(.*\).log$/\1\2/') + if [ ! 
-z "$PLOT_LINE" ] + then + PLOT_LINE=$PLOT_LINE", " + fi + + DEPTH=$(echo $PT | cut -d "-" -f 4) + PLOT_LINE=$PLOT_LINE"'$x' using (\$1/1000):(\$2/$SCALE) title \"Queue depth $DEPTH\" with lines ls $i" + fi + done + + if [ $i -eq 0 ]; then + echo "No log files found" + exit 1 + fi + + OUTPUT="set output \"$TITLE-$FILETYPE.svg\" " + + echo " $PLOT_TITLE ; $YAXIS ; $DEFAULT_OPTS ; show style lines ; $OUTPUT ; plot " $PLOT_LINE | $GNUPLOT - + unset PLOT_LINE +} + +# +# plot +# + +plot "I/O Latency" lat "Time (msec)" 1000 +plot "I/O Operations Per Second" iops "IOPS" 1 +plot "I/O Submission Latency" slat "Time (μsec)" 1 +plot "I/O Completion Latency" clat "Time (msec)" 1000 +plot "I/O Bandwidth" bw "Throughput (KB/s)" 1 + + diff --git a/tools/fio_generate_plots.1 b/tools/fio_generate_plots.1 new file mode 100644 index 0000000..92b2421 --- /dev/null +++ b/tools/fio_generate_plots.1 @@ -0,0 +1,44 @@ +.\" Hey, EMACS: -*- nroff -*- +.\" First parameter, NAME, should be all caps +.\" Second parameter, SECTION, should be 1-8, maybe w/ subsection +.\" other parameters are allowed: see man(7), man(1) +.TH FIO_GENERATE_PLOTS 1 "May 19, 2009" +.\" Please adjust this date whenever revising the manpage. +.\" +.\" Some roff macros, for reference: +.\" .nh disable hyphenation +.\" .hy enable hyphenation +.\" .ad l left justify +.\" .ad b justify to both left and right margins +.\" .nf disable filling +.\" .fi enable filling +.\" .br insert line break +.\" .sp insert n+1 empty lines +.\" for manpage-specific macros, see man(7) +.SH NAME +fio_generate_plots \- Generate plots for Flexible I/O Tester +.SH SYNOPSIS +.B fio_generate_plots +.RI " title" +.br +.SH DESCRIPTION +This manual page documents briefly the +.B fio_generate_plots +command. This manual page was written for the Debian distribution +because the original program does not have a manual page. +.PP +.\" TeX users may be more comfortable with the \fB\fP and +.\" \fI\fP escape sequences to invode bold face and italics, +.\" respectively. +\fBfio_generate_plots\fP is a shell script that uses gnuplot to +generate plots from fio run with \-\-latency-log (\-l) and/or +\-\-bandwidth-log (\-w). It expects the log files that fio +generated in the current directory. +.SH OPTIONS +The script takes the title of the plot as only argument. It does +not offer any additional options. +.SH AUTHOR +fio_generate_plots was written by Jens Axboe +.PP +This manual page was written by Martin Steigerwald , +for the Debian project (but may be used by others). diff --git a/tools/fio_jsonplus_clat2csv b/tools/fio_jsonplus_clat2csv new file mode 100755 index 0000000..78a007e --- /dev/null +++ b/tools/fio_jsonplus_clat2csv @@ -0,0 +1,177 @@ +#!/usr/bin/python2.7 +# Note: this script is python2 and python3 compatible. +# +# fio_jsonplus_clat2csv +# +# This script converts fio's json+ completion latency data to CSV format. 
+# +# For example: +# +# Run the following fio jobs: +# ../fio --output=fio-jsonplus.output --output-format=json+ --name=test1 +# --ioengine=null --time_based --runtime=5s --size=1G --rw=randrw +# --name=test2 --ioengine=null --time_based --runtime=3s --size=1G +# --rw=read --name=test3 --ioengine=null --time_based --runtime=4s +# --size=8G --rw=write +# +# Then run: +# fio_jsonplus_clat2csv fio-jsonplus.output fio-latency.csv +# +# You will end up with the following 3 files +# +# -rw-r--r-- 1 root root 6467 Jun 27 14:57 fio-latency_job0.csv +# -rw-r--r-- 1 root root 3985 Jun 27 14:57 fio-latency_job1.csv +# -rw-r--r-- 1 root root 4490 Jun 27 14:57 fio-latency_job2.csv +# +# fio-latency_job0.csv will look something like: +# +# clat_nsec, read_count, read_cumulative, read_percentile, write_count, +# write_cumulative, write_percentile, trim_count, trim_cumulative, +# trim_percentile, +# 25, 1, 1, 1.50870705013e-07, , , , , , , +# 26, 12, 13, 1.96131916517e-06, 947, 947, 0.000142955890032, , , , +# 27, 843677, 843690, 0.127288105112, 838347, 839294, 0.126696959629, , , , +# 28, 1877982, 2721672, 0.410620573454, 1870189, 2709483, 0.409014312345, , , , +# 29, 4471, 2726143, 0.411295116376, 7718, 2717201, 0.410179395301, , , , +# 30, 2142885, 4869028, 0.734593687087, 2138164, 4855365, 0.732949340025, , , , +# ... +# 2544, , , , 2, 6624404, 0.999997433738, , , , +# 2576, 3, 6628178, 0.99999788781, 4, 6624408, 0.999998037564, , , , +# 2608, 4, 6628182, 0.999998491293, 4, 6624412, 0.999998641391, , , , +# 2640, 3, 6628185, 0.999998943905, 2, 6624414, 0.999998943304, , , , +# 2672, 1, 6628186, 0.999999094776, 3, 6624417, 0.999999396174, , , , +# 2736, 1, 6628187, 0.999999245646, 1, 6624418, 0.99999954713, , , , +# 2768, 2, 6628189, 0.999999547388, 1, 6624419, 0.999999698087, , , , +# 2800, , , , 1, 6624420, 0.999999849043, , , , +# 2832, 1, 6628190, 0.999999698259, , , , , , , +# 4192, 1, 6628191, 0.999999849129, , , , , , , +# 5792, , , , 1, 6624421, 1.0, , , , +# 10304, 1, 6628192, 1.0, , , , , , , +# +# The first line says that you had one read IO with 25ns clat, +# the cumulative number of read IOs at or below 25ns is 1, and +# 25ns is the 0.00001509th percentile for read latency +# +# The job had 2 write IOs complete in 2544ns, +# 6624404 write IOs completed in 2544ns or less, +# and this represents the 99.99974th percentile for write latency +# +# The last line says that one read IO had 10304ns clat, +# 6628192 read IOs had 10304ns or shorter clat, and +# 10304ns is the 100th percentile for read latency +# + +from __future__ import absolute_import +from __future__ import print_function +import os +import json +import argparse +import six +from six.moves import range + + +def parse_args(): + parser = argparse.ArgumentParser() + parser.add_argument('source', + help='fio json+ output file containing completion ' + 'latency data') + parser.add_argument('dest', + help='destination file stub for latency data in CSV ' + 'format. 
job number will be appended to filename') + args = parser.parse_args() + + return args + + +def percentile(idx, run_total): + total = run_total[len(run_total)-1] + if total == 0: + return 0 + + return float(run_total[idx]) / total + + +def more_lines(indices, bins): + for key, value in six.iteritems(indices): + if value < len(bins[key]): + return True + + return False + + +def main(): + args = parse_args() + + with open(args.source, 'r') as source: + jsondata = json.loads(source.read()) + + for jobnum in range(0, len(jsondata['jobs'])): + bins = {} + run_total = {} + ddir_set = set(['read', 'write', 'trim']) + + prev_ddir = None + for ddir in ddir_set: + if 'bins' in jsondata['jobs'][jobnum][ddir]['clat_ns']: + bins_loc = 'clat_ns' + elif 'bins' in jsondata['jobs'][jobnum][ddir]['lat_ns']: + bins_loc = 'lat_ns' + else: + raise RuntimeError("Latency bins not found. " + "Are you sure you are using json+ output?") + + bins[ddir] = [[int(key), value] for key, value in + six.iteritems(jsondata['jobs'][jobnum][ddir][bins_loc] + ['bins'])] + bins[ddir] = sorted(bins[ddir], key=lambda bin: bin[0]) + + run_total[ddir] = [0 for x in range(0, len(bins[ddir]))] + if len(bins[ddir]) > 0: + run_total[ddir][0] = bins[ddir][0][1] + for x in range(1, len(bins[ddir])): + run_total[ddir][x] = run_total[ddir][x-1] + \ + bins[ddir][x][1] + + stub, ext = os.path.splitext(args.dest) + outfile = stub + '_job' + str(jobnum) + ext + + with open(outfile, 'w') as output: + output.write("{0}ec, ".format(bins_loc)) + ddir_list = list(ddir_set) + for ddir in ddir_list: + output.write("{0}_count, {0}_cumulative, {0}_percentile, ". + format(ddir)) + output.write("\n") + +# +# Have a counter for each ddir +# In each round, pick the shortest remaining duration +# and output a line with any values for that duration +# + indices = {x: 0 for x in ddir_list} + while more_lines(indices, bins): + min_lat = 17112760320 + for ddir in ddir_list: + if indices[ddir] < len(bins[ddir]): + min_lat = min(bins[ddir][indices[ddir]][0], min_lat) + + output.write("{0}, ".format(min_lat)) + + for ddir in ddir_list: + if indices[ddir] < len(bins[ddir]) and \ + min_lat == bins[ddir][indices[ddir]][0]: + count = bins[ddir][indices[ddir]][1] + cumulative = run_total[ddir][indices[ddir]] + ptile = percentile(indices[ddir], run_total[ddir]) + output.write("{0}, {1}, {2}, ".format(count, + cumulative, ptile)) + indices[ddir] += 1 + else: + output.write(", , , ") + output.write("\n") + + print("{0} generated".format(outfile)) + + +if __name__ == '__main__': + main() diff --git a/tools/fiologparser.py b/tools/fiologparser.py new file mode 100755 index 0000000..cc29f1c --- /dev/null +++ b/tools/fiologparser.py @@ -0,0 +1,223 @@ +#!/usr/bin/python2.7 +# Note: this script is python2 and python 3 compatible. +# +# fiologparser.py +# +# This tool lets you parse multiple fio log files and look at interaval +# statistics even when samples are non-uniform. For instance: +# +# fiologparser.py -s *bw* +# +# to see per-interval sums for all bandwidth logs or: +# +# fiologparser.py -a *clat* +# +# to see per-interval average completion latency. 
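+#
+# Interval accounting sketch (illustrative numbers; see
+# Sample.get_contribution() below): with --interval=1000, a sample covering
+# [900, 1100) msec with value 10 overlaps the [1000, 2000) interval for
+# 100 msec and therefore contributes 10 * 100 / 1000 = 1 to that interval.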
+ +from __future__ import absolute_import +from __future__ import print_function +import argparse +import math + +def parse_args(): + parser = argparse.ArgumentParser() + parser.add_argument('-i', '--interval', required=False, type=int, default=1000, help='interval of time in seconds.') + parser.add_argument('-d', '--divisor', required=False, type=int, default=1, help='divide the results by this value.') + parser.add_argument('-f', '--full', dest='full', action='store_true', default=False, help='print full output.') + parser.add_argument('-A', '--all', dest='allstats', action='store_true', default=False, + help='print all stats for each interval.') + parser.add_argument('-a', '--average', dest='average', action='store_true', default=False, help='print the average for each interval.') + parser.add_argument('-s', '--sum', dest='sum', action='store_true', default=False, help='print the sum for each interval.') + parser.add_argument("FILE", help="collectl log output files to parse", nargs="+") + args = parser.parse_args() + + return args + +def get_ftime(series): + ftime = 0 + for ts in series: + if ftime == 0 or ts.last.end < ftime: + ftime = ts.last.end + return ftime + +def print_full(ctx, series): + ftime = get_ftime(series) + start = 0 + end = ctx.interval + + while (start < ftime): + end = ftime if ftime < end else end + results = [ts.get_value(start, end) for ts in series] + print("%s, %s" % (end, ', '.join(["%0.3f" % i for i in results]))) + start += ctx.interval + end += ctx.interval + +def print_sums(ctx, series): + ftime = get_ftime(series) + start = 0 + end = ctx.interval + + while (start < ftime): + end = ftime if ftime < end else end + results = [ts.get_value(start, end) for ts in series] + print("%s, %0.3f" % (end, sum(results))) + start += ctx.interval + end += ctx.interval + +def print_averages(ctx, series): + ftime = get_ftime(series) + start = 0 + end = ctx.interval + + while (start < ftime): + end = ftime if ftime < end else end + results = [ts.get_value(start, end) for ts in series] + print("%s, %0.3f" % (end, float(sum(results))/len(results))) + start += ctx.interval + end += ctx.interval + +# FIXME: this routine is computationally inefficient +# and has O(N^2) behavior +# it would be better to make one pass through samples +# to segment them into a series of time intervals, and +# then compute stats on each time interval instead. +# to debug this routine, use +# # sort -n -t ',' -k 2 small.log +# on your input. 
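+
+# One-pass segmentation along the lines the FIXME above suggests: group the
+# samples by every interval they overlap, so per-interval statistics can be
+# computed without rescanning the whole series each time. This is only a
+# sketch and is not called anywhere in this tool; the helper name and
+# signature are made up.
+def bucket_samples(samples, interval, ftime):
+    nbuckets = int(math.ceil(float(ftime) / interval))
+    buckets = [[] for _ in range(nbuckets)]
+    for s in samples:
+        first = max(int(s.start // interval), 0)
+        last = min(int(s.end // interval), nbuckets - 1)
+        for i in range(first, last + 1):
+            buckets[i].append(s)
+    return buckets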
+
+# reduce() lives in functools on python3 (and python2.6+), so import it
+# explicitly to keep the py2/py3 compatibility promised above.
+from functools import reduce
+
+def my_extend( vlist, val ):
+    vlist.extend(val)
+    return vlist
+
+array_collapser = lambda vlist, val:  my_extend(vlist, val)
+
+def print_all_stats(ctx, series):
+    ftime = get_ftime(series)
+    start = 0
+    end = ctx.interval
+    print('start-time, samples, min, avg, median, 90%, 95%, 99%, max')
+    while (start < ftime):  # for each time interval
+        end = ftime if ftime < end else end
+        sample_arrays = [ s.get_samples(start, end) for s in series ]
+        samplevalue_arrays = []
+        for sample_array in sample_arrays:
+            samplevalue_arrays.append(
+                [ sample.value for sample in sample_array ] )
+        # collapse list of lists of sample values into list of sample values
+        samplevalues = reduce( array_collapser, samplevalue_arrays, [] )
+        # compute all stats and print them
+        mymin = min(samplevalues)
+        myavg = sum(samplevalues) / float(len(samplevalues))
+        mymedian = median(samplevalues)
+        my90th = percentile(samplevalues, 0.90)
+        my95th = percentile(samplevalues, 0.95)
+        my99th = percentile(samplevalues, 0.99)
+        mymax = max(samplevalues)
+        print( '%f, %d, %f, %f, %f, %f, %f, %f, %f' % (
+            start, len(samplevalues),
+            mymin, myavg, mymedian, my90th, my95th, my99th, mymax))
+
+        # advance to next interval
+        start += ctx.interval
+        end += ctx.interval
+
+def median(values):
+    s=sorted(values)
+    # use integer (floor) division so the indices stay ints on python3
+    return float(s[(len(s)-1)//2]+s[(len(s)//2)])/2
+
+def percentile(values, p):
+    s = sorted(values)
+    k = (len(s)-1) * p
+    f = math.floor(k)
+    c = math.ceil(k)
+    if f == c:
+        return s[int(k)]
+    return (s[int(f)] * (c-k)) + (s[int(c)] * (k-f))
+
+def print_default(ctx, series):
+    ftime = get_ftime(series)
+    start = 0
+    end = ctx.interval
+    averages = []
+    weights = []
+
+    while (start < ftime):
+        end = ftime if ftime < end else end
+        results = [ts.get_value(start, end) for ts in series]
+        averages.append(sum(results))
+        weights.append(end-start)
+        start += ctx.interval
+        end += ctx.interval
+
+    total = 0
+    for i in range(0, len(averages)):
+        total += averages[i]*weights[i]
+    print('%0.3f' % (total/sum(weights)))
+
+class TimeSeries(object):
+    def __init__(self, ctx, fn):
+        self.ctx = ctx
+        self.last = None
+        self.samples = []
+        self.read_data(fn)
+
+    def read_data(self, fn):
+        f = open(fn, 'r')
+        p_time = 0
+        for line in f:
+            (time, value, foo, bar) = line.rstrip('\r\n').rsplit(', ')
+            self.add_sample(p_time, int(time), int(value))
+            p_time = int(time)
+
+    def add_sample(self, start, end, value):
+        # pass our own context along instead of relying on the global 'ctx'
+        sample = Sample(self.ctx, start, end, value)
+        if not self.last or self.last.end < end:
+            self.last = sample
+        self.samples.append(sample)
+
+    def get_samples(self, start, end):
+        sample_list = []
+        for s in self.samples:
+            if s.start >= start and s.end <= end:
+                sample_list.append(s)
+        return sample_list
+
+    def get_value(self, start, end):
+        value = 0
+        for sample in self.samples:
+            value += sample.get_contribution(start, end)
+        return value
+
+class Sample(object):
+    def __init__(self, ctx, start, end, value):
+        self.ctx = ctx
+        self.start = start
+        self.end = end
+        self.value = value
+
+    def get_contribution(self, start, end):
+        # short circuit if not within the bound
+        if (end < self.start or start > self.end):
+            return 0
+
+        sbound = self.start if start < self.start else start
+        ebound = self.end if end > self.end else end
+        ratio = float(ebound-sbound) / (end-start)
+        return self.value*ratio/self.ctx.divisor
+
+
+if __name__ == '__main__':
+    ctx = parse_args()
+    series = []
+    for fn in ctx.FILE:
+        series.append(TimeSeries(ctx, fn))
+    if ctx.sum:
+        print_sums(ctx, series)
+    elif ctx.average:
+        print_averages(ctx, series)
+    elif ctx.full:
+ print_full(ctx, series) + elif ctx.allstats: + print_all_stats(ctx, series) + else: + print_default(ctx, series) diff --git a/tools/genfio b/tools/genfio new file mode 100755 index 0000000..8518bbc --- /dev/null +++ b/tools/genfio @@ -0,0 +1,355 @@ +#!/bin/bash +# +# Copyright (C) 2013 eNovance SAS +# Author: Erwan Velu +# +# The license below covers all files distributed with fio unless otherwise +# noted in the file itself. +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the Free Software +# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + +BLK_SIZE= +BLOCK_SIZE=4k +SEQ=-1 +TEMPLATE=/tmp/template.fio +OUTFILE= +DISKS= +PRINTABLE_DISKS= +RUNTIME=300 +ETA=0 +MODES="write,randwrite,read,randread" +SHORT_HOSTNAME= +CACHED_IO="FALSE" +PREFIX="" +PREFIX_FILENAME="" +IODEPTH=1 + +show_help() { + PROG=$(basename $0) + echo "usage of $PROG:" + cat << EOF +-h : Show this help & exit +-c : Enable cached-based IOs + Disabled by default +-a : Run sequential test then parallel one + Disabled by default +-s : Run sequential test (default value) + one test after another then one disk after another + Disabled by default +-p : Run parallel test + one test after anoter but all disks at the same time + Enabled by default +-D iodepth : Run with the specified iodepth + Default is $IODEPTH +-d disk1[,disk2,disk3,..] : Run the tests on the selected disks + Separated each disk with a comma +-z filesize : Specify the working file size, if you are passing filepaths to -d + Disabled by default +-r seconds : Time in seconds per benchmark + 0 means till the end of the device + Default is $RUNTIME seconds +-b blocksize[,blocksize1, ...] : The blocksizes to test under fio format (4k, 1m, ...) + Separated each blocksize with a comma + Default is $BLOCK_SIZE +-m mode1,[mode2,mode3, ...] 
: Define the fio IO profile to use like read, write, randread, randwrite + Default is "$MODES" +-x prefix : Add a prefix to the fio filename + Useful to let a context associated with the file + If the prefix features a / (slash), prefix will be considered as a directory +-A cmd_to_run : System command to run after each job (exec_postrun in fio) +-B cmd_to_run : System command to run before each job (exec_prerun in fio) + +Example: + +$PROG -d /dev/sdb,/dev/sdc,/dev/sdd,/dev/sde -a -b 4k,128k,1m -r 100 -a -x dellr720-day2/ + + Will generate an fio file that will run + - a sequential bench on /dev/sdb /dev/sdc /dev/sdd /dev/sde for block size = 4k with write,randwrite,read,randread tests + ETA ~ 4 tests * 4 disks * 100 seconds + - a sequential bench on /dev/sdb /dev/sdc /dev/sdd /dev/sde for block size = 128k with write,randwrite,read,randread tests + ETA ~ 4 tests * 4 disks * 100 seconds + - a sequential bench on /dev/sdb /dev/sdc /dev/sdd /dev/sde for block size = 1m with write,randwrite,read,randread tests + ETA ~ 4 tests * 4 disks * 100 seconds + - a parallel bench on /dev/sdb /dev/sdc /dev/sdd /dev/sde for block size = 4k with write,randwrite,read,randread tests + ETA ~ 4 tests * 100 seconds + - a parallel bench on /dev/sdb /dev/sdc /dev/sdd /dev/sde for block size = 128k with write,randwrite,read,randread tests + ETA ~ 4 tests * 100 seconds + - a parallel bench on /dev/sdb /dev/sdc /dev/sdd /dev/sde for block size = 1m with write,randwrite,read,randread tests + ETA ~ 4 tests * 100 seconds + +Generating dellr720-day2/localhost-4k,128k,1m-all-write,randwrite,read,randread-sdb,sdc,sdd,sde.fio +Estimated Time = 6000 seconds : 1 hour 40 minutes +EOF +} + +finish_template() { +echo "iodepth=$IODEPTH" >> $TEMPLATE + +if [ "$RUNTIME" != "0" ]; then + echo "runtime=$RUNTIME" >> $TEMPLATE + echo "time_based" >> $TEMPLATE +fi + +if [ "$CACHED_IO" = "FALSE" ]; then + echo "direct=1" >> $TEMPLATE +fi +} + + +diskname_to_printable() { +COUNT=0 +for disk in $(echo $@ | tr "," " "); do + R=$(basename $disk | sed 's|/|_|g') + COUNT=$(($COUNT + 1)) + if [ $COUNT -eq 1 ]; then + P="$R" + else + P="$P,$R" + fi +done +echo $P +} + +gen_template() { +cat >$TEMPLATE << EOF +[global] +ioengine=libaio +invalidate=1 +ramp_time=5 +EOF +} + +gen_seq_suite() { +TYPE=$1 +disk=$2 +PRINTABLE_DISK=$(diskname_to_printable $disk) +cat >> $OUTFILE << EOF +[$TYPE-$PRINTABLE_DISK-$BLK_SIZE-seq] +stonewall +bs=$BLK_SIZE +filename=$disk +rw=$TYPE +write_bw_log=${PREFIX_FILENAME}$SHORT_HOSTNAME-$BLK_SIZE-$PRINTABLE_DISK-$TYPE-seq.results +write_iops_log=${PREFIX_FILENAME}$SHORT_HOSTNAME-$BLK_SIZE-$PRINTABLE_DISK-$TYPE-seq.results +EOF +ETA=$(($ETA + $RUNTIME)) +} + +gen_seq_fio() { +for disk in $(echo $DISKS | tr "," " "); do + for mode in $(echo $MODES | tr "," " "); do + gen_seq_suite "$mode" "$disk" + done +done +} + + +gen_para_suite() { +TYPE=$1 +NEED_WALL=$2 +D=0 +for disk in $(echo $DISKS | tr "," " "); do + PRINTABLE_DISK=$(diskname_to_printable $disk) + cat >> $OUTFILE << EOF +[$TYPE-$PRINTABLE_DISK-$BLK_SIZE-para] +bs=$BLK_SIZE +EOF + +if [ "$D" = 0 ]; then + echo "stonewall" >> $OUTFILE + D=1 +fi + +cat >> $OUTFILE << EOF +filename=$disk +rw=$TYPE +write_bw_log=${PREFIX_FILENAME}$SHORT_HOSTNAME-$BLK_SIZE-$PRINTABLE_DISK-$TYPE-para.results +write_iops_log=${PREFIX_FILENAME}$SHORT_HOSTNAME-$BLK_SIZE-$PRINTABLE_DISK-$TYPE-para.results +EOF +done + +ETA=$(($ETA + $RUNTIME)) +echo >> $OUTFILE +} + +gen_para_fio() { +for mode in $(echo $MODES | tr "," " "); do + gen_para_suite "$mode" +done +} + +gen_fio() { +case $SEQ 
in + 2) + gen_seq_fio + gen_para_fio + ;; + 1) + gen_seq_fio + ;; + 0) + gen_para_fio + ;; +esac +} + +parse_cmdline() { +while getopts "hacpsd:b:r:m:x:z:D:A:B:" opt; do + case $opt in + h) + show_help + exit 0 + ;; + b) + BLOCK_SIZE=$OPTARG + ;; + c) + CACHED_IO="TRUE" + ;; + s) + if [ "$SEQ" = "-1" ]; then + SEQ=1 + fi + ;; + x) + PREFIX=$OPTARG + echo "$PREFIX" | grep -q "/" + if [ "$?" -eq 0 ]; then + mkdir -p $PREFIX + # No need to keep the prefix for the log files + # we do have a directory for that + PREFIX_FILENAME="" + else + # We need to keep the prefix for the log files + PREFIX_FILENAME=$PREFIX + fi + ;; + r) + RUNTIME=$OPTARG + ;; + p) + if [ "$SEQ" = "-1" ]; then + SEQ=0 + fi + ;; + m) + MODES=$OPTARG; + ;; + d) + DISKS=$OPTARG + PRINTABLE_DISKS=$(diskname_to_printable "$DISKS") + ;; + D) + IODEPTH=$OPTARG + ;; + a) + SEQ=2 + ;; + B) + echo "exec_prerun=$OPTARG" >> $TEMPLATE + ;; + A) + echo "exec_postrun=$OPTARG" >> $TEMPLATE + ;; + z) + FSIZE=$OPTARG + echo "size=$FSIZE" >> $TEMPLATE + ;; + \?) + echo "Invalid option: -$OPTARG" >&2 + ;; + esac +done + +if [ "$SEQ" = "-1" ]; then + SEQ=0 +fi + +SHORT_HOSTNAME=$(hostname -s) +case $SEQ in + 2) + OUTFILE=${PREFIX}$SHORT_HOSTNAME-$BLOCK_SIZE-all-$MODES-$PRINTABLE_DISKS.fio + ;; + + 1) + OUTFILE=${PREFIX}$SHORT_HOSTNAME-$BLOCK_SIZE-sequential-$MODES-$PRINTABLE_DISKS.fio + ;; + 0) + OUTFILE=${PREFIX}$SHORT_HOSTNAME-$BLOCK_SIZE-parallel-$MODES-$PRINTABLE_DISKS.fio + ;; +esac + +if [ -z "$DISKS" ]; then + echo "Missing DISKS !" + echo "Please read the help !" + show_help + exit 1 +fi + +} + +check_mode_order() { +FOUND_WRITE="NO" +CAUSE="You are reading data before writing them " + +# If no write occurs, let's show a different message +echo $MODES | grep -q "write" +if [ "$?" -ne 0 ]; then + CAUSE="You are reading data while never wrote them before" +fi + +for mode in $(echo $MODES | tr "," " "); do + echo $mode | grep -q write + if [ "$?" -eq 0 ]; then + FOUND_WRITE="YES" + fi + echo $mode | grep -q "read" + if [ "$?" 
-eq 0 ]; then
+		if [ "$FOUND_WRITE" = "NO" ]; then
+			echo "###############################################################"
+			echo "# Warning : $CAUSE#"
+			echo "# On some storage devices, this could lead to invalid results #"
+			echo "#                                                             #"
+			echo "# Press Ctrl-C to adjust pattern order if you have doubts     #"
+			echo "# Or wait 5 seconds and the file will be created              #"
+			echo "###############################################################"
+			sleep 5
+			# No need to show the message more than once
+			return
+		fi
+	fi
+done
+}
+
+
+########## MAIN
+gen_template
+parse_cmdline "$@"
+finish_template
+check_mode_order
+
+echo "Generating $OUTFILE"
+cp -f $TEMPLATE $OUTFILE
+echo >> $OUTFILE
+
+for BLK_SIZE in $(echo $BLOCK_SIZE | tr "," " "); do
+	gen_fio
+done
+ETA_H=$(($ETA / 3600))
+ETA_M=$((($ETA - ($ETA_H*3600)) / 60))
+if [ "$ETA" = "0" ]; then
+	echo "Cannot estimate ETA as RUNTIME=0"
+else
+	echo "Estimated Time = $ETA seconds : $ETA_H hour $ETA_M minutes"
+fi
diff --git a/tools/hist/.gitignore b/tools/hist/.gitignore
new file mode 100644
index 0000000..4f875da
--- /dev/null
+++ b/tools/hist/.gitignore
@@ -0,0 +1,3 @@
+*.pyc
+*.ipynb
+.ipynb_checkpoints
diff --git a/tools/hist/fio-histo-log-pctiles.py b/tools/hist/fio-histo-log-pctiles.py
new file mode 100755
index 0000000..f9df2a3
--- /dev/null
+++ b/tools/hist/fio-histo-log-pctiles.py
@@ -0,0 +1,774 @@
+#!/usr/bin/env python
+
+# module to parse fio histogram log files, not using pandas
+# runs in python v2 or v3
+# to get help with the CLI: $ python fio-histo-log-pctiles.py -h
+# this can be run standalone as a script but is also callable from other python code
+# assumes all threads run for same time duration
+# assumes all threads are doing the same thing for the entire run
+
+# percentiles:
+#  0 - min latency
+#  50 - median
+#  100 - max latency
+
+# TO-DO:
+#   separate read and write stats for randrw mixed workload
+#   report average latency if needed
+#   prove that it works (partially done with unit tests)
+
+# to run unit tests, set UNITTEST environment variable to anything
+# if you do this, don't pass normal CLI parameters to it
+# otherwise it runs the CLI
+
+import sys, os, math, copy, time
+from copy import deepcopy
+# reduce() is a builtin in python v2 but lives in functools in python v3;
+# this import works in both and is needed by get_samples() below
+from functools import reduce
+import argparse
+
+unittest2_imported = True
+try:
+    import unittest2
+except ImportError:
+    unittest2_imported = False
+
+msec_per_sec = 1000
+nsec_per_usec = 1000
+direction_read = 0
+direction_write = 1
+
+class FioHistoLogExc(Exception):
+    pass
+
+# if there is an error, print message, and exit with error status
+
+def myabort(msg):
+    print('ERROR: ' + msg)
+    sys.exit(1)
+
+# convert histogram log file into a list of
+# (time_ms, direction, bsz, buckets) tuples where
+# - time_ms is the time in msec at which the log record was written
+# - direction is 0 (read) or 1 (write)
+# - bsz is block size (not used)
+# - buckets is a CSV list of counters that make up the histogram
+# caller decides if the expected number of counters is present
+
+
+def exception_suffix( record_num, pathname ):
+    return 'in histogram record %d file %s' % (record_num+1, pathname)
+
+# log file parser raises FioHistoLogExc exceptions
+# it returns histogram buckets in whatever unit fio uses
+# inputs:
+#  logfn: pathname to histogram log file
+#  buckets_per_interval - how many histogram buckets to expect
+#  log_hist_msec - if not None, expected time interval between histogram records
+
+def parse_hist_file(logfn, buckets_per_interval, log_hist_msec):
+    previous_ts_ms_read = -1
+    previous_ts_ms_write = -1
+
+    with open(logfn, 'r') as f:
+        records = [
l.strip() for l in f.readlines() ] + intervals = [] + last_time_ms = -1 + last_direction = -1 + for k, r in enumerate(records): + if r == '': + continue + tokens = r.split(',') + try: + int_tokens = [ int(t) for t in tokens ] + except ValueError as e: + raise FioHistoLogExc('non-integer value %s' % exception_suffix(k+1, logfn)) + + neg_ints = list(filter( lambda tk : tk < 0, int_tokens )) + if len(neg_ints) > 0: + raise FioHistoLogExc('negative integer value %s' % exception_suffix(k+1, logfn)) + + if len(int_tokens) < 3: + raise FioHistoLogExc('too few numbers %s' % exception_suffix(k+1, logfn)) + + direction = int_tokens[1] + if direction != direction_read and direction != direction_write: + raise FioHistoLogExc('invalid I/O direction %s' % exception_suffix(k+1, logfn)) + + time_ms = int_tokens[0] + if direction == direction_read: + if time_ms < previous_ts_ms_read: + raise FioHistoLogExc('read timestamp in column 1 decreased %s' % exception_suffix(k+1, logfn)) + previous_ts_ms_read = time_ms + elif direction == direction_write: + if time_ms < previous_ts_ms_write: + raise FioHistoLogExc('write timestamp in column 1 decreased %s' % exception_suffix(k+1, logfn)) + previous_ts_ms_write = time_ms + + bsz = int_tokens[2] + if bsz > (1 << 24): + raise FioHistoLogExc('block size too large %s' % exception_suffix(k+1, logfn)) + + buckets = int_tokens[3:] + if len(buckets) != buckets_per_interval: + raise FioHistoLogExc('%d buckets per interval but %d expected in %s' % + (len(buckets), buckets_per_interval, exception_suffix(k+1, logfn))) + + # hack to filter out records with the same timestamp + # we should not have to do this if fio logs histogram records correctly + + if time_ms == last_time_ms and direction == last_direction: + continue + last_time_ms = time_ms + last_direction = direction + + intervals.append((time_ms, direction, bsz, buckets)) + if len(intervals) == 0: + raise FioHistoLogExc('no records in %s' % logfn) + (first_timestamp, _, _, _) = intervals[0] + if first_timestamp < 1000000: + start_time = 0 # assume log_unix_epoch = 0 + elif log_hist_msec != None: + start_time = first_timestamp - log_hist_msec + elif len(intervals) > 1: + (second_timestamp, _, _, _) = intervals[1] + start_time = first_timestamp - (second_timestamp - first_timestamp) + else: + raise FioHistoLogExc('no way to estimate test start time') + (end_timestamp, _, _, _) = intervals[-1] + + return (intervals, start_time, end_timestamp) + + +# compute time range for each bucket index in histogram record +# see comments in https://github.com/axboe/fio/blob/master/stat.h +# for description of bucket groups and buckets +# fio v3 bucket ranges are in nanosec (since response times are measured in nanosec) +# but we convert fio v3 nanosecs to floating-point microseconds + +def time_ranges(groups, counters_per_group, fio_version=3): + bucket_width = 1 + bucket_base = 0 + bucket_intervals = [] + for g in range(0, groups): + for b in range(0, counters_per_group): + rmin = float(bucket_base) + rmax = rmin + bucket_width + if fio_version == 3: + rmin /= nsec_per_usec + rmax /= nsec_per_usec + bucket_intervals.append( [rmin, rmax] ) + bucket_base += bucket_width + if g != 0: + bucket_width *= 2 + return bucket_intervals + + +# compute number of time quantum intervals in the test + +def get_time_intervals(time_quantum, min_timestamp_ms, max_timestamp_ms): + # round down to nearest second + max_timestamp = max_timestamp_ms // msec_per_sec + min_timestamp = min_timestamp_ms // msec_per_sec + # round up to nearest whole multiple of 
time_quantum + time_interval_count = ((max_timestamp - min_timestamp) + time_quantum) // time_quantum + end_time = min_timestamp + (time_interval_count * time_quantum) + return (end_time, time_interval_count) + +# align raw histogram log data to time quantum so +# we can then combine histograms from different threads with addition +# for randrw workload we count both reads and writes in same output bucket +# but we separate reads and writes for purposes of calculating +# end time for histogram record. +# this requires us to weight a raw histogram bucket by the +# fraction of time quantum that the bucket overlaps the current +# time quantum interval +# for example, if we have a bucket with 515 samples for time interval +# [ 1010, 2014 ] msec since start of test, and time quantum is 1 sec, then +# for time quantum interval [ 1000, 2000 ] msec, the overlap is +# (2000 - 1010) / (2000 - 1000) = 0.99 +# so the contribution of this bucket to this time quantum is +# 515 x 0.99 = 509.85 + +def align_histo_log(raw_histogram_log, time_quantum, bucket_count, min_timestamp_ms, max_timestamp_ms): + + # slice up test time int intervals of time_quantum seconds + + (end_time, time_interval_count) = get_time_intervals(time_quantum, min_timestamp_ms, max_timestamp_ms) + time_qtm_ms = time_quantum * msec_per_sec + end_time_ms = end_time * msec_per_sec + aligned_intervals = [] + for j in range(0, time_interval_count): + aligned_intervals.append(( + min_timestamp_ms + (j * time_qtm_ms), + [ 0.0 for j in range(0, bucket_count) ] )) + + log_record_count = len(raw_histogram_log) + for k, record in enumerate(raw_histogram_log): + + # find next record with same direction to get end-time + # have to avoid going past end of array + # for fio randrw workload, + # we have read and write records on same time interval + # sometimes read and write records are in opposite order + # assertion checks that next read/write record + # can be separated by at most 2 other records + + (time_msec, direction, sz, interval_buckets) = record + if k+1 < log_record_count: + (time_msec_end, direction2, _, _) = raw_histogram_log[k+1] + if direction2 != direction: + if k+2 < log_record_count: + (time_msec_end, direction2, _, _) = raw_histogram_log[k+2] + if direction2 != direction: + if k+3 < log_record_count: + (time_msec_end, direction2, _, _) = raw_histogram_log[k+3] + assert direction2 == direction + else: + time_msec_end = end_time_ms + else: + time_msec_end = end_time_ms + else: + time_msec_end = end_time_ms + + # calculate first quantum that overlaps this histogram record + + offset_from_min_ts = time_msec - min_timestamp_ms + qtm_start_ms = min_timestamp_ms + (offset_from_min_ts // time_qtm_ms) * time_qtm_ms + qtm_end_ms = min_timestamp_ms + ((offset_from_min_ts + time_qtm_ms) // time_qtm_ms) * time_qtm_ms + qtm_index = offset_from_min_ts // time_qtm_ms + + # for each quantum that overlaps this histogram record's time interval + + while qtm_start_ms < time_msec_end: # while quantum overlaps record + + # some histogram logs may be longer than others + + if len(aligned_intervals) <= qtm_index: + break + + # calculate fraction of time that this quantum + # overlaps histogram record's time interval + + overlap_start = max(qtm_start_ms, time_msec) + overlap_end = min(qtm_end_ms, time_msec_end) + weight = float(overlap_end - overlap_start) + weight /= (time_msec_end - time_msec) + (_,aligned_histogram) = aligned_intervals[qtm_index] + for bx, b in enumerate(interval_buckets): + weighted_bucket = weight * b + aligned_histogram[bx] += 
weighted_bucket + + # advance to the next time quantum + + qtm_start_ms += time_qtm_ms + qtm_end_ms += time_qtm_ms + qtm_index += 1 + + return aligned_intervals + +# add histogram in "source" to histogram in "target" +# it is assumed that the 2 histograms are precisely time-aligned + +def add_to_histo_from( target, source ): + for b in range(0, len(source)): + target[b] += source[b] + + +# calculate total samples in the histogram buckets + +def get_samples(buckets): + return reduce( lambda x,y: x + y, buckets) + + +# compute percentiles +# inputs: +# buckets: histogram bucket array +# wanted: list of floating-pt percentiles to calculate +# time_ranges: [tmin,tmax) time interval for each bucket +# returns None if no I/O reported. +# otherwise we would be dividing by zero +# think of buckets as probability distribution function +# and this loop is integrating to get cumulative distribution function + +def get_pctiles(buckets, wanted, time_ranges): + + # get total of IO requests done + total_ios = 0 + for io_count in buckets: + total_ios += io_count + + # don't return percentiles if no I/O was done during interval + if total_ios == 0.0: + return None + + pctile_count = len(wanted) + + # results returned as dictionary keyed by percentile + pctile_result = {} + + # index of next percentile in list + pctile_index = 0 + + # next percentile + next_pctile = wanted[pctile_index] + + # no one is interested in percentiles bigger than this but not 100.0 + # this prevents floating-point error from preventing loop exit + almost_100 = 99.9999 + + # pct is the percentile corresponding to + # all I/O requests up through bucket b + pct = 0.0 + total_so_far = 0 + for b, io_count in enumerate(buckets): + if io_count == 0: + continue + total_so_far += io_count + # last_pct_lt is the percentile corresponding to + # all I/O requests up to, but not including, bucket b + last_pct = pct + pct = 100.0 * float(total_so_far) / total_ios + # a single bucket could satisfy multiple pctiles + # so this must be a while loop + # for 100-percentile (max latency) case, no bucket exceeds it + # so we must stop there. 
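+        # worked example of the interpolation below: if a bucket spanning
+        # [2.0, 4.0) usec moves the cumulative percentage from last_pct = 40.0
+        # to pct = 60.0, then the 50th percentile is reported as
+        # 2.0 + ((50.0 - 40.0) / (60.0 - 40.0)) * (4.0 - 2.0) = 3.0 usec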
+        while ((next_pctile == 100.0 and pct >= almost_100) or
+               (next_pctile < 100.0 and pct > next_pctile)):
+            # interpolate between min and max time for bucket time interval
+            # we keep the time_ranges access inside this loop,
+            # even though it could be above the loop,
+            # because in many cases we will not be even entering
+            # the loop so we optimize out these accesses
+            range_max_time = time_ranges[b][1]
+            range_min_time = time_ranges[b][0]
+            offset_frac = (next_pctile - last_pct)/(pct - last_pct)
+            interpolation = range_min_time + (offset_frac*(range_max_time - range_min_time))
+            pctile_result[next_pctile] = interpolation
+            pctile_index += 1
+            if pctile_index == pctile_count:
+                break
+            next_pctile = wanted[pctile_index]
+        if pctile_index == pctile_count:
+            break
+    assert pctile_index == pctile_count
+    return pctile_result
+
+
+# this is really the main program
+
+def compute_percentiles_from_logs():
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--fio-version", dest="fio_version",
+        default="3", choices=[2,3], type=int,
+        help="fio version (default=3)")
+    parser.add_argument("--bucket-groups", dest="bucket_groups", default="29", type=int,
+        help="fio histogram bucket groups (default=29)")
+    parser.add_argument("--bucket-bits", dest="bucket_bits",
+        default="6", type=int,
+        help="fio histogram buckets-per-group bits (default=6 means 64 buckets/group)")
+    parser.add_argument("--percentiles", dest="pctiles_wanted",
+        default=[ 0., 50., 95., 99., 100.], type=float, nargs='+',
+        help="percentiles to calculate (default=0 50 95 99 100)")
+    parser.add_argument("--time-quantum", dest="time_quantum",
+        default="1", type=int,
+        help="time quantum in seconds (default=1)")
+    parser.add_argument("--log-hist-msec", dest="log_hist_msec",
+        type=int, default=None,
+        help="log_hist_msec value in fio job file")
+    parser.add_argument("--output-unit", dest="output_unit",
+        default="usec", type=str,
+        help="Latency percentile output unit: msec|usec|nsec (default usec)")
+    parser.add_argument("file_list", nargs='+',
+        help='list of files, preceded by " -- " if necessary')
+    args = parser.parse_args()
+
+    # default changes based on fio version
+    if args.fio_version == 2:
+        args.bucket_groups = 19
+
+    # print parameters
+
+    print('fio version = %d' % args.fio_version)
+    print('bucket groups = %d' % args.bucket_groups)
+    print('bucket bits = %d' % args.bucket_bits)
+    print('time quantum = %d sec' % args.time_quantum)
+    print('percentiles = %s' % ','.join([ str(p) for p in args.pctiles_wanted ]))
+    buckets_per_group = 1 << args.bucket_bits
+    print('buckets per group = %d' % buckets_per_group)
+    buckets_per_interval = buckets_per_group * args.bucket_groups
+    print('buckets per interval = %d ' % buckets_per_interval)
+    bucket_index_range = range(0, buckets_per_interval)
+    if args.log_hist_msec != None:
+        print('log_hist_msec = %d' % args.log_hist_msec)
+    if args.time_quantum == 0:
+        myabort('time-quantum must be a positive number of seconds')
+    print('output unit = ' + args.output_unit)
+    if args.output_unit == 'msec':
+        time_divisor = float(msec_per_sec)
+    elif args.output_unit == 'usec':
+        time_divisor = 1.0
+    elif args.output_unit == 'nsec':
+        # bucket times are kept in usec internally,
+        # so dividing by 1/1000 scales them up to nsec
+        time_divisor = 1.0 / nsec_per_usec
+    else:
+        myabort('output unit must be one of msec, usec or nsec')
+
+    # construct template for each histogram bucket array with buckets all zeroes
+    # we just copy this for each new histogram
+
+    zeroed_buckets = [ 0.0 for r in bucket_index_range ]
+
+    # calculate response time interval associated with each histogram bucket
+
+    bucket_times = time_ranges(args.bucket_groups, buckets_per_group, fio_version=args.fio_version)
+
+    # parse the
histogram logs + # assumption: each bucket has a monotonically increasing time + # assumption: time ranges do not overlap for a single thread's records + # (exception: if randrw workload, then there is a read and a write + # record for the same time interval) + + test_start_time = 0 + test_end_time = 1.0e18 + hist_files = {} + for fn in args.file_list: + try: + (hist_files[fn], log_start_time, log_end_time) = parse_hist_file(fn, buckets_per_interval, args.log_hist_msec) + except FioHistoLogExc as e: + myabort(str(e)) + # we consider the test started when all threads have started logging + test_start_time = max(test_start_time, log_start_time) + # we consider the test over when one of the logs has ended + test_end_time = min(test_end_time, log_end_time) + + if test_start_time >= test_end_time: + raise FioHistoLogExc('no time interval when all threads logs overlapped') + if test_start_time > 0: + print('all threads running as of unix epoch time %d = %s' % ( + test_start_time/float(msec_per_sec), + time.ctime(test_start_time/1000.0))) + + (end_time, time_interval_count) = get_time_intervals(args.time_quantum, test_start_time, test_end_time) + all_threads_histograms = [ ((j*args.time_quantum*msec_per_sec), deepcopy(zeroed_buckets)) + for j in range(0, time_interval_count) ] + + for logfn in hist_files.keys(): + aligned_per_thread = align_histo_log(hist_files[logfn], + args.time_quantum, + buckets_per_interval, + test_start_time, + test_end_time) + for t in range(0, time_interval_count): + (_, all_threads_histo_t) = all_threads_histograms[t] + (_, log_histo_t) = aligned_per_thread[t] + add_to_histo_from( all_threads_histo_t, log_histo_t ) + + # calculate percentiles across aggregate histogram for all threads + # print CSV header just like fiologparser_hist does + + header = 'msec-since-start, samples, ' + for p in args.pctiles_wanted: + if p == 0.: + next_pctile_header = 'min' + elif p == 100.: + next_pctile_header = 'max' + elif p == 50.: + next_pctile_header = 'median' + else: + next_pctile_header = '%3.1f' % p + header += '%s, ' % next_pctile_header + + print('time (millisec), percentiles in increasing order with values in ' + args.output_unit) + print(header) + + for (t_msec, all_threads_histo_t) in all_threads_histograms: + samples = get_samples(all_threads_histo_t) + record = '%8d, %8d, ' % (t_msec, samples) + pct = get_pctiles(all_threads_histo_t, args.pctiles_wanted, bucket_times) + if not pct: + for w in args.pctiles_wanted: + record += ', ' + else: + pct_keys = [ k for k in pct.keys() ] + pct_values = [ str(pct[wanted]/time_divisor) for wanted in sorted(pct_keys) ] + record += ', '.join(pct_values) + print(record) + + + +#end of MAIN PROGRAM + + +##### below are unit tests ############## + +if unittest2_imported: + import tempfile, shutil + from os.path import join + should_not_get_here = False + + class Test(unittest2.TestCase): + tempdir = None + + # a little less typing please + def A(self, boolean_val): + self.assertTrue(boolean_val) + + # initialize unit test environment + + @classmethod + def setUpClass(cls): + d = tempfile.mkdtemp() + Test.tempdir = d + + # remove anything left by unit test environment + # unless user sets UNITTEST_LEAVE_FILES environment variable + + @classmethod + def tearDownClass(cls): + if not os.getenv("UNITTEST_LEAVE_FILES"): + shutil.rmtree(cls.tempdir) + + def setUp(self): + self.fn = join(Test.tempdir, self.id()) + + def test_a_add_histos(self): + a = [ 1.0, 2.0 ] + b = [ 1.5, 2.5 ] + add_to_histo_from( a, b ) + self.A(a == [2.5, 4.5]) + self.A(b == 
[1.5, 2.5]) + + def test_b1_parse_log(self): + with open(self.fn, 'w') as f: + f.write('1234, 0, 4096, 1, 2, 3, 4\n') + f.write('5678,1,16384,5,6,7,8 \n') + (raw_histo_log, min_timestamp, max_timestamp) = parse_hist_file(self.fn, 4, None) # 4 buckets per interval + # if not log_unix_epoch=1, then min_timestamp will always be set to zero + self.A(len(raw_histo_log) == 2 and min_timestamp == 0 and max_timestamp == 5678) + (time_ms, direction, bsz, histo) = raw_histo_log[0] + self.A(time_ms == 1234 and direction == 0 and bsz == 4096 and histo == [ 1, 2, 3, 4 ]) + (time_ms, direction, bsz, histo) = raw_histo_log[1] + self.A(time_ms == 5678 and direction == 1 and bsz == 16384 and histo == [ 5, 6, 7, 8 ]) + + def test_b2_parse_empty_log(self): + with open(self.fn, 'w') as f: + pass + try: + (raw_histo_log, _, _) = parse_hist_file(self.fn, 4, None) + self.A(should_not_get_here) + except FioHistoLogExc as e: + self.A(str(e).startswith('no records')) + + def test_b3_parse_empty_records(self): + with open(self.fn, 'w') as f: + f.write('\n') + f.write('1234, 0, 4096, 1, 2, 3, 4\n') + f.write('5678,1,16384,5,6,7,8 \n') + f.write('\n') + (raw_histo_log, _, max_timestamp_ms) = parse_hist_file(self.fn, 4, None) + self.A(len(raw_histo_log) == 2 and max_timestamp_ms == 5678) + (time_ms, direction, bsz, histo) = raw_histo_log[0] + self.A(time_ms == 1234 and direction == 0 and bsz == 4096 and histo == [ 1, 2, 3, 4 ]) + (time_ms, direction, bsz, histo) = raw_histo_log[1] + self.A(time_ms == 5678 and direction == 1 and bsz == 16384 and histo == [ 5, 6, 7, 8 ]) + + def test_b4_parse_non_int(self): + with open(self.fn, 'w') as f: + f.write('12, 0, 4096, 1a, 2, 3, 4\n') + try: + (raw_histo_log, _, _) = parse_hist_file(self.fn, 4, None) + self.A(False) + except FioHistoLogExc as e: + self.A(str(e).startswith('non-integer')) + + def test_b5_parse_neg_int(self): + with open(self.fn, 'w') as f: + f.write('-12, 0, 4096, 1, 2, 3, 4\n') + try: + (raw_histo_log, _, _) = parse_hist_file(self.fn, 4, None) + self.A(False) + except FioHistoLogExc as e: + self.A(str(e).startswith('negative integer')) + + def test_b6_parse_too_few_int(self): + with open(self.fn, 'w') as f: + f.write('0, 0\n') + try: + (raw_histo_log, _, _) = parse_hist_file(self.fn, 4, None) + self.A(False) + except FioHistoLogExc as e: + self.A(str(e).startswith('too few numbers')) + + def test_b7_parse_invalid_direction(self): + with open(self.fn, 'w') as f: + f.write('100, 2, 4096, 1, 2, 3, 4\n') + try: + (raw_histo_log, _, _) = parse_hist_file(self.fn, 4, None) + self.A(False) + except FioHistoLogExc as e: + self.A(str(e).startswith('invalid I/O direction')) + + def test_b8_parse_bsz_too_big(self): + with open(self.fn+'_good', 'w') as f: + f.write('100, 1, %d, 1, 2, 3, 4\n' % (1<<24)) + (raw_histo_log, _, _) = parse_hist_file(self.fn+'_good', 4, None) + with open(self.fn+'_bad', 'w') as f: + f.write('100, 1, 20000000, 1, 2, 3, 4\n') + try: + (raw_histo_log, _, _) = parse_hist_file(self.fn+'_bad', 4, None) + self.A(False) + except FioHistoLogExc as e: + self.A(str(e).startswith('block size too large')) + + def test_b9_parse_wrong_bucket_count(self): + with open(self.fn, 'w') as f: + f.write('100, 1, %d, 1, 2, 3, 4, 5\n' % (1<<24)) + try: + (raw_histo_log, _, _) = parse_hist_file(self.fn, 4, None) + self.A(False) + except FioHistoLogExc as e: + self.A(str(e).__contains__('buckets per interval')) + + def test_c1_time_ranges(self): + ranges = time_ranges(3, 2) # fio_version defaults to 3 + expected_ranges = [ # fio_version 3 is in nanoseconds + [0.000, 0.001], 
[0.001, 0.002], # first group + [0.002, 0.003], [0.003, 0.004], # second group same width + [0.004, 0.006], [0.006, 0.008]] # subsequent groups double width + self.A(ranges == expected_ranges) + ranges = time_ranges(3, 2, fio_version=3) + self.A(ranges == expected_ranges) + ranges = time_ranges(3, 2, fio_version=2) + expected_ranges_v2 = [ [ 1000.0 * min_or_max for min_or_max in time_range ] + for time_range in expected_ranges ] + self.A(ranges == expected_ranges_v2) + # see fio V3 stat.h for why 29 groups and 2^6 buckets/group + normal_ranges_v3 = time_ranges(29, 64) + # for v3, bucket time intervals are measured in nanoseconds + self.A(len(normal_ranges_v3) == 29 * 64 and normal_ranges_v3[-1][1] == 64*(1<<(29-1))/1000.0) + normal_ranges_v2 = time_ranges(19, 64, fio_version=2) + # for v2, bucket time intervals are measured in microseconds so we have fewer buckets + self.A(len(normal_ranges_v2) == 19 * 64 and normal_ranges_v2[-1][1] == 64*(1<<(19-1))) + + def test_d1_align_histo_log_1_quantum(self): + with open(self.fn, 'w') as f: + f.write('100, 1, 4096, 1, 2, 3, 4') + (raw_histo_log, min_timestamp_ms, max_timestamp_ms) = parse_hist_file(self.fn, 4, None) + self.A(min_timestamp_ms == 0 and max_timestamp_ms == 100) + aligned_log = align_histo_log(raw_histo_log, 5, 4, min_timestamp_ms, max_timestamp_ms) + self.A(len(aligned_log) == 1) + (time_ms0, h) = aligned_log[0] + self.A(time_ms0 == 0 and h == [1., 2., 3., 4.]) + + # handle case with log_unix_epoch=1 timestamps, 1-second time quantum + # here both records will be separated into 2 aligned intervals + + def test_d1a_align_2rec_histo_log_epoch_1_quantum_1sec(self): + with open(self.fn, 'w') as f: + f.write('1536504002123, 1, 4096, 1, 2, 3, 4\n') + f.write('1536504003123, 1, 4096, 4, 3, 2, 1\n') + (raw_histo_log, min_timestamp_ms, max_timestamp_ms) = parse_hist_file(self.fn, 4, None) + self.A(min_timestamp_ms == 1536504001123 and max_timestamp_ms == 1536504003123) + aligned_log = align_histo_log(raw_histo_log, 1, 4, min_timestamp_ms, max_timestamp_ms) + self.A(len(aligned_log) == 3) + (time_ms0, h) = aligned_log[0] + self.A(time_ms0 == 1536504001123 and h == [0., 0., 0., 0.]) + (time_ms1, h) = aligned_log[1] + self.A(time_ms1 == 1536504002123 and h == [1., 2., 3., 4.]) + (time_ms2, h) = aligned_log[2] + self.A(time_ms2 == 1536504003123 and h == [4., 3., 2., 1.]) + + # handle case with log_unix_epoch=1 timestamps, 5-second time quantum + # here both records will be merged into a single aligned time interval + + def test_d1b_align_2rec_histo_log_epoch_1_quantum_5sec(self): + with open(self.fn, 'w') as f: + f.write('1536504002123, 1, 4096, 1, 2, 3, 4\n') + f.write('1536504003123, 1, 4096, 4, 3, 2, 1\n') + (raw_histo_log, min_timestamp_ms, max_timestamp_ms) = parse_hist_file(self.fn, 4, None) + self.A(min_timestamp_ms == 1536504001123 and max_timestamp_ms == 1536504003123) + aligned_log = align_histo_log(raw_histo_log, 5, 4, min_timestamp_ms, max_timestamp_ms) + self.A(len(aligned_log) == 1) + (time_ms0, h) = aligned_log[0] + self.A(time_ms0 == 1536504001123 and h == [5., 5., 5., 5.]) + + # we need this to compare 2 lists of floating point numbers for equality + # because of floating-point imprecision + + def compare_2_floats(self, x, y): + if x == 0.0 or y == 0.0: + return (x+y) < 0.0000001 + else: + return (math.fabs(x-y)/x) < 0.00001 + + def is_close(self, buckets, buckets_expected): + if len(buckets) != len(buckets_expected): + return False + compare_buckets = lambda k: self.compare_2_floats(buckets[k], buckets_expected[k]) + indices_close 
= list(filter(compare_buckets, range(0, len(buckets)))) + return len(indices_close) == len(buckets) + + def test_d2_align_histo_log_2_quantum(self): + with open(self.fn, 'w') as f: + f.write('2000, 1, 4096, 1, 2, 3, 4\n') + f.write('7000, 1, 4096, 1, 2, 3, 4\n') + (raw_histo_log, min_timestamp_ms, max_timestamp_ms) = parse_hist_file(self.fn, 4, None) + self.A(min_timestamp_ms == 0 and max_timestamp_ms == 7000) + (_, _, _, raw_buckets1) = raw_histo_log[0] + (_, _, _, raw_buckets2) = raw_histo_log[1] + aligned_log = align_histo_log(raw_histo_log, 5, 4, min_timestamp_ms, max_timestamp_ms) + self.A(len(aligned_log) == 2) + (time_ms1, h1) = aligned_log[0] + (time_ms2, h2) = aligned_log[1] + # because first record is from time interval [2000, 7000] + # we weight it according + expect1 = [float(b) * 0.6 for b in raw_buckets1] + expect2 = [float(b) * 0.4 for b in raw_buckets1] + for e in range(0, len(expect2)): + expect2[e] += raw_buckets2[e] + self.A(time_ms1 == 0 and self.is_close(h1, expect1)) + self.A(time_ms2 == 5000 and self.is_close(h2, expect2)) + + # what to expect if histogram buckets are all equal + def test_e1_get_pctiles_flat_histo(self): + with open(self.fn, 'w') as f: + buckets = [ 100 for j in range(0, 128) ] + f.write('9000, 1, 4096, %s\n' % ', '.join([str(b) for b in buckets])) + (raw_histo_log, min_timestamp_ms, max_timestamp_ms) = parse_hist_file(self.fn, 128, None) + self.A(min_timestamp_ms == 0 and max_timestamp_ms == 9000) + aligned_log = align_histo_log(raw_histo_log, 5, 128, min_timestamp_ms, max_timestamp_ms) + time_intervals = time_ranges(4, 32) + # since buckets are all equal, then median is halfway through time_intervals + # and max latency interval is at end of time_intervals + self.A(time_intervals[64][1] == 0.066 and time_intervals[127][1] == 0.256) + pctiles_wanted = [ 0, 50, 100 ] + pct_vs_time = [] + for (time_ms, histo) in aligned_log: + pct_vs_time.append(get_pctiles(histo, pctiles_wanted, time_intervals)) + self.A(pct_vs_time[0] == None) # no I/O in this time interval + expected_pctiles = { 0:0.000, 50:0.064, 100:0.256 } + self.A(pct_vs_time[1] == expected_pctiles) + + # what to expect if just the highest histogram bucket is used + def test_e2_get_pctiles_highest_pct(self): + fio_v3_bucket_count = 29 * 64 + with open(self.fn, 'w') as f: + # make a empty fio v3 histogram + buckets = [ 0 for j in range(0, fio_v3_bucket_count) ] + # add one I/O request to last bucket + buckets[-1] = 1 + f.write('9000, 1, 4096, %s\n' % ', '.join([str(b) for b in buckets])) + (raw_histo_log, min_timestamp_ms, max_timestamp_ms) = parse_hist_file(self.fn, fio_v3_bucket_count, None) + self.A(min_timestamp_ms == 0 and max_timestamp_ms == 9000) + aligned_log = align_histo_log(raw_histo_log, 5, fio_v3_bucket_count, min_timestamp_ms, max_timestamp_ms) + (time_ms, histo) = aligned_log[1] + time_intervals = time_ranges(29, 64) + expected_pctiles = { 100.0:(64*(1<<28))/1000.0 } + pct = get_pctiles( histo, [ 100.0 ], time_intervals ) + self.A(pct == expected_pctiles) + +# we are using this module as a standalone program + +if __name__ == '__main__': + if os.getenv('UNITTEST'): + if unittest2_imported: + sys.exit(unittest2.main()) + else: + raise Exception('you must install unittest2 module to run unit test') + else: + compute_percentiles_from_logs() + diff --git a/tools/hist/fiologparser_hist.py b/tools/hist/fiologparser_hist.py new file mode 100755 index 0000000..8910d5f --- /dev/null +++ b/tools/hist/fiologparser_hist.py @@ -0,0 +1,612 @@ +#!/usr/bin/python2.7 +""" + Utility for converting 
*_clat_hist* files generated by fio into latency statistics. + + Example usage: + + $ fiologparser_hist.py *_clat_hist* + end-time, samples, min, avg, median, 90%, 95%, 99%, max + 1000, 15, 192, 1678.107, 1788.859, 1856.076, 1880.040, 1899.208, 1888.000 + 2000, 43, 152, 1642.368, 1714.099, 1816.659, 1845.552, 1888.131, 1888.000 + 4000, 39, 1152, 1546.962, 1545.785, 1627.192, 1640.019, 1691.204, 1744 + ... + + @author Karl Cronburg +""" +import os +import sys +import pandas +import re +import numpy as np + +runascmd = False + +err = sys.stderr.write + +class HistFileRdr(): + """ Class to read a hist file line by line, buffering + a value array for the latest line, and allowing a preview + of the next timestamp in next line + Note: this does not follow a generator pattern, but must explicitly + get next bin array. + """ + def __init__(self, file): + self.fp = open(file, 'r') + self.data = self.nextData() + + def close(self): + self.fp.close() + self.fp = None + + def nextData(self): + self.data = None + if self.fp: + line = self.fp.readline() + if line == "": + self.close() + else: + self.data = [int(x) for x in line.replace(' ', '').rstrip().split(',')] + + return self.data + + @property + def curTS(self): + ts = None + if self.data: + ts = self.data[0] + return ts + + @property + def curDir(self): + d = None + if self.data: + d = self.data[1] + return d + + @property + def curBins(self): + return self.data[3:] + +def weighted_percentile(percs, vs, ws): + """ Use linear interpolation to calculate the weighted percentile. + + Value and weight arrays are first sorted by value. The cumulative + distribution function (cdf) is then computed, after which np.interp + finds the two values closest to our desired weighted percentile(s) + and linearly interpolates them. + + percs :: List of percentiles we want to calculate + vs :: Array of values we are computing the percentile of + ws :: Array of weights for our corresponding values + return :: Array of percentiles + """ + idx = np.argsort(vs) + vs, ws = vs[idx], ws[idx] # weights and values sorted by value + cdf = 100 * (ws.cumsum() - ws / 2.0) / ws.sum() + return np.interp(percs, cdf, vs) # linear interpolation + +def weights(start_ts, end_ts, start, end): + """ Calculate weights based on fraction of sample falling in the + given interval [start,end]. Weights computed using vector / array + computation instead of for-loops. + + Note that samples with zero time length are effectively ignored + (we set their weight to zero). + + start_ts :: Array of start times for a set of samples + end_ts :: Array of end times for a set of samples + start :: int + end :: int + return :: Array of weights + """ + sbounds = np.maximum(start_ts, start).astype(float) + ebounds = np.minimum(end_ts, end).astype(float) + ws = (ebounds - sbounds) / (end_ts - start_ts) + if np.any(np.isnan(ws)): + err("WARNING: zero-length sample(s) detected. Log file corrupt" + " / bad time values? 
Ignoring these samples.\n") + ws[np.where(np.isnan(ws))] = 0.0; + return ws + +def weighted_average(vs, ws): + return np.sum(vs * ws) / np.sum(ws) + + +percs = None +columns = None + +def gen_output_columns(ctx): + global percs,columns + strpercs = re.split('[,:]', ctx.percentiles) + percs = [50.0] # always print 50% in 'median' column + percs.extend(list(map(float,strpercs))) + if ctx.directions: + columns = ["end-time", "dir", "samples", "min", "avg", "median"] + else: + columns = ["end-time", "samples", "min", "avg", "median"] + columns.extend(list(map(lambda x: x+'%', strpercs))) + columns.append("max") + +def fmt_float_list(ctx, num=1): + """ Return a comma separated list of float formatters to the required number + of decimal places. For instance: + + fmt_float_list(ctx.decimals=4, num=3) == "%.4f, %.4f, %.4f" + """ + return ', '.join(["%%.%df" % ctx.decimals] * num) + +# Default values - see beginning of main() for how we detect number columns in +# the input files: +__HIST_COLUMNS = 1216 +__NON_HIST_COLUMNS = 3 +__TOTAL_COLUMNS = __HIST_COLUMNS + __NON_HIST_COLUMNS + +def read_chunk(rdr, sz): + """ Read the next chunk of size sz from the given reader. """ + try: + """ StopIteration occurs when the pandas reader is empty, and AttributeError + occurs if rdr is None due to the file being empty. """ + new_arr = rdr.read().values + except (StopIteration, AttributeError): + return None + + # Let's leave the array as is, and let later code ignore the block size + return new_arr + + #""" Extract array of the times, directions wo times, and histograms matrix without times column. """ + #times, rws, szs = new_arr[:,0], new_arr[:,1], new_arr[:,2] + #hists = new_arr[:,__NON_HIST_COLUMNS:] + #times = times.reshape((len(times),1)) + #dirs = rws.reshape((len(rws),1)) + #arr = np.append(times, hists, axis=1) + #return arr + +def get_min(fps, arrs): + """ Find the file with the current first row with the smallest start time """ + return min([fp for fp in fps if not arrs[fp] is None], key=lambda fp: arrs.get(fp)[0][0]) + +def histogram_generator(ctx, fps, sz): + + # Create a chunked pandas reader for each of the files: + rdrs = {} + for fp in fps: + try: + rdrs[fp] = pandas.read_csv(fp, dtype=int, header=None, chunksize=sz) + except ValueError as e: + if e.message == 'No columns to parse from file': + if ctx.warn: sys.stderr.write("WARNING: Empty input file encountered.\n") + rdrs[fp] = None + else: + raise(e) + + # Initial histograms from disk: + arrs = {fp: read_chunk(rdr, sz) for fp,rdr in rdrs.items()} + while True: + + try: + """ ValueError occurs when nothing more to read """ + fp = get_min(fps, arrs) + except ValueError: + return + arr = arrs[fp] + arri = np.insert(arr[0], 1, fps.index(fp)) + yield arri + arrs[fp] = arr[1:] + + if arrs[fp].shape[0] == 0: + arrs[fp] = read_chunk(rdrs[fp], sz) + +def _plat_idx_to_val(idx, edge=0.5, FIO_IO_U_PLAT_BITS=6, FIO_IO_U_PLAT_VAL=64): + """ Taken from fio's stat.c for calculating the latency value of a bin + from that bin's index. + + idx : the value of the index into the histogram bins + edge : fractional value in the range [0,1]** indicating how far into + the bin we wish to compute the latency value of. + + ** edge = 0.0 and 1.0 computes the lower and upper latency bounds + respectively of the given bin index. """ + + # MSB <= (FIO_IO_U_PLAT_BITS-1), cannot be rounded off. 
Use + # all bits of the sample as index + if (idx < (FIO_IO_U_PLAT_VAL << 1)): + return idx + + # Find the group and compute the minimum value of that group + error_bits = (idx >> FIO_IO_U_PLAT_BITS) - 1 + base = 1 << (error_bits + FIO_IO_U_PLAT_BITS) + + # Find its bucket number of the group + k = idx % FIO_IO_U_PLAT_VAL + + # Return the mean (if edge=0.5) of the range of the bucket + return base + ((k + edge) * (1 << error_bits)) + +def plat_idx_to_val_coarse(idx, coarseness, edge=0.5): + """ Converts the given *coarse* index into a non-coarse index as used by fio + in stat.h:plat_idx_to_val(), subsequently computing the appropriate + latency value for that bin. + """ + + # Multiply the index by the power of 2 coarseness to get the bin + # bin index with a max of 1536 bins (FIO_IO_U_PLAT_GROUP_NR = 24 in stat.h) + stride = 1 << coarseness + idx = idx * stride + lower = _plat_idx_to_val(idx, edge=0.0) + upper = _plat_idx_to_val(idx + stride, edge=1.0) + return lower + (upper - lower) * edge + +def print_all_stats(ctx, end, mn, ss_cnt, vs, ws, mx, dir=dir): + ps = weighted_percentile(percs, vs, ws) + + avg = weighted_average(vs, ws) + values = [mn, avg] + list(ps) + [mx] + if ctx.directions: + row = [end, dir, ss_cnt] + fmt = "%d, %s, %d, " + else: + row = [end, ss_cnt] + fmt = "%d, %d, " + row = row + [float(x) / ctx.divisor for x in values] + if ctx.divisor > 1: + fmt = fmt + fmt_float_list(ctx, len(percs)+3) + else: + # max and min are decimal values if no divisor + fmt = fmt + "%d, " + fmt_float_list(ctx, len(percs)+1) + ", %d" + + print (fmt % tuple(row)) + +def update_extreme(val, fncn, new_val): + """ Calculate min / max in the presence of None values """ + if val is None: return new_val + else: return fncn(val, new_val) + +# See beginning of main() for how bin_vals are computed +bin_vals = [] +lower_bin_vals = [] # lower edge of each bin +upper_bin_vals = [] # upper edge of each bin + +def process_interval(ctx, iHist, iEnd, dir): + """ print estimated percentages for the given merged sample + """ + ss_cnt = 0 # number of samples affecting this interval + mn_bin_val, mx_bin_val = None, None + + # Update total number of samples affecting current interval histogram: + ss_cnt += np.sum(iHist) + + # Update min and max bin values + idxs = np.nonzero(iHist != 0)[0] + if idxs.size > 0: + mn_bin_val = bin_vals[idxs[0]] + mx_bin_val = bin_vals[idxs[-1]] + + if ss_cnt > 0: print_all_stats(ctx, iEnd, mn_bin_val, ss_cnt, bin_vals, iHist, mx_bin_val, dir=dir) + + +dir_map = ['r', 'w', 't'] # map of directional value in log to textual representation +def process_weighted_interval(ctx, samples, iStart, iEnd, printdirs): + """ Construct the weighted histogram for the given interval by scanning + through all the histograms and figuring out which of their bins have + samples with latencies which overlap with the given interval + [iStart,iEnd]. 
+ """ + + times, files, dirs, sizes, hists = samples[:,0], samples[:,1], samples[:,2], samples[:,3], samples[:,4:] + iHist={}; ss_cnt = {}; mn_bin_val={}; mx_bin_val={} + for dir in printdirs: + iHist[dir] = np.zeros(__HIST_COLUMNS, dtype=float) + ss_cnt[dir] = 0 # number of samples affecting this interval + mn_bin_val[dir] = None + mx_bin_val[dir] = None + + for end_time,file,dir,hist in zip(times,files,dirs,hists): + + # Only look at bins of the current histogram sample which + # started before the end of the current time interval [start,end] + start_times = (end_time - 0.5 * ctx.interval) - bin_vals / ctx.time_divisor + idx = np.where(start_times < iEnd) + s_ts, l_bvs, u_bvs, hs = start_times[idx], lower_bin_vals[idx], upper_bin_vals[idx], hist[idx] + + # Increment current interval histogram by weighted values of future histogram + # total number of samples + # and min and max values as necessary + textdir = dir_map[dir] + ws = hs * weights(s_ts, end_time, iStart, iEnd) + mmidx = np.where(hs != 0)[0] + if 'm' in printdirs: + iHist['m'][idx] += ws + ss_cnt['m'] += np.sum(hs) + if mmidx.size > 0: + mn_bin_val['m'] = update_extreme(mn_bin_val['m'], min, l_bvs[max(0, mmidx[0] - 1)]) + mx_bin_val['m'] = update_extreme(mx_bin_val['m'], max, u_bvs[min(len(hs) - 1, mmidx[-1] + 1)]) + if textdir in printdirs: + iHist[textdir][idx] += ws + ss_cnt[textdir] += np.sum(hs) # Update total number of samples affecting current interval histogram: + if mmidx.size > 0: + mn_bin_val[textdir] = update_extreme(mn_bin_val[textdir], min, l_bvs[max(0, mmidx[0] - 1)]) + mx_bin_val[textdir] = update_extreme(mx_bin_val[textdir], max, u_bvs[min(len(hs) - 1, mmidx[-1] + 1)]) + + for textdir in sorted(printdirs): + if ss_cnt[textdir] > 0: print_all_stats(ctx, iEnd, mn_bin_val[textdir], ss_cnt[textdir], bin_vals, iHist[textdir], mx_bin_val[textdir], dir=textdir) + +def guess_max_from_bins(ctx, hist_cols): + """ Try to guess the GROUP_NR from given # of histogram + columns seen in an input file """ + max_coarse = 8 + if ctx.group_nr < 19 or ctx.group_nr > 26: + bins = [ctx.group_nr * (1 << 6)] + else: + bins = [1216,1280,1344,1408,1472,1536,1600,1664] + coarses = range(max_coarse + 1) + fncn = lambda z: list(map(lambda x: z/2**x if z % 2**x == 0 else -10, coarses)) + + arr = np.transpose(list(map(fncn, bins))) + idx = np.where(arr == hist_cols) + if len(idx[1]) == 0: + table = repr(arr.astype(int)).replace('-10', 'N/A').replace('array',' ') + errmsg = ("Unable to determine bin values from input clat_hist files. Namely \n" + "the first line of file '%s' " % ctx.FILE[0] + "has %d \n" % (__TOTAL_COLUMNS,) + + "columns of which we assume %d " % (hist_cols,) + "correspond to histogram bins. \n" + "This number needs to be equal to one of the following numbers:\n\n" + + table + "\n\n" + "Possible reasons and corresponding solutions:\n" + " - Input file(s) does not contain histograms.\n" + " - You recompiled fio with a different GROUP_NR. 
If so please specify this\n" + " new GROUP_NR on the command line with --group_nr\n") + if runascmd: + err(errmsg) + exit(1) + else: + raise RuntimeError(errmsg) + + return bins[idx[1][0]] + +def output_weighted_interval_data(ctx,printdirs): + + fps = [open(f, 'r') for f in ctx.FILE] + gen = histogram_generator(ctx, fps, ctx.buff_size) + + print(', '.join(columns)) + + try: + start, end = 0, ctx.interval + arr = np.empty(shape=(0,__TOTAL_COLUMNS + 1),dtype=int) + more_data = True + while more_data or len(arr) > 0: + + # Read up to ctx.max_latency (default 20 seconds) of data from end of current interval. + while len(arr) == 0 or arr[-1][0] < ctx.max_latency * 1000 + end: + try: + new_arr = next(gen) + except StopIteration: + more_data = False + break + nashape = new_arr.reshape((1,__TOTAL_COLUMNS + 1)) + arr = np.append(arr, nashape, axis=0) + #arr = arr.astype(int) + + if arr.size > 0: + # Jump immediately to the start of the input, rounding + # down to the nearest multiple of the interval (useful when --log_unix_epoch + # was used to create these histograms): + if start == 0 and arr[0][0] - ctx.max_latency > end: + start = arr[0][0] - ctx.max_latency + start = start - (start % ctx.interval) + end = start + ctx.interval + + process_weighted_interval(ctx, arr, start, end, printdirs) + + # Update arr to throw away samples we no longer need - samples which + # end before the start of the next interval, i.e. the end of the + # current interval: + idx = np.where(arr[:,0] > end) + arr = arr[idx] + + start += ctx.interval + end = start + ctx.interval + finally: + for fp in fps: + fp.close() + +def output_interval_data(ctx,directions): + fps = [HistFileRdr(f) for f in ctx.FILE] + + print(', '.join(columns)) + + start = 0 + end = ctx.interval + while True: + + more_data = False + + # add bins from all files in target intervals + arr = None + numSamples = 0 + while True: + foundSamples = False + for fp in fps: + ts = fp.curTS + if ts and ts+10 < end: # shift sample time when very close to an end time + curdirect = fp.curDir + numSamples += 1 + foundSamples = True + if arr is None: + arr = {} + for d in directions: + arr[d] = np.zeros(shape=(__HIST_COLUMNS), dtype=int) + if 'm' in arr: + arr['m'] = np.add(arr['m'], fp.curBins) + if 'r' in arr and curdirect == 0: + arr['r'] = np.add(arr['r'], fp.curBins) + if 'w' in arr and curdirect == 1: + arr['w'] = np.add(arr['w'], fp.curBins) + if 't' in arr and curdirect == 2: + arr['t'] = np.add(arr['t'], fp.curBins) + + more_data = True + fp.nextData() + elif ts: + more_data = True + + # reached end of all files + # or gone through all files without finding sample in interval + if not more_data or not foundSamples: + break + + if arr is not None: + #print("{} size({}) samples({}) nonzero({}):".format(end, arr.size, numSamples, np.count_nonzero(arr)), str(arr), ) + for d in sorted(arr.keys()): + aval = arr[d] + process_interval(ctx, aval, end, d) + + # reach end of all files + if not more_data: + break + + start += ctx.interval + end = start + ctx.interval + +def main(ctx): + + if ctx.job_file: + try: + from configparser import SafeConfigParser, NoOptionError + except ImportError: + from ConfigParser import SafeConfigParser, NoOptionError + + cp = SafeConfigParser(allow_no_value=True) + with open(ctx.job_file, 'r') as fp: + cp.readfp(fp) + + if ctx.interval is None: + # Auto detect --interval value + for s in cp.sections(): + try: + hist_msec = cp.get(s, 'log_hist_msec') + if hist_msec is not None: + ctx.interval = int(hist_msec) + except NoOptionError: + pass 
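+                # e.g. a job file section containing "log_hist_msec=250"
+                # leaves ctx.interval set to 250 ms after this loop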
+ + if not hasattr(ctx, 'percentiles'): + ctx.percentiles = "90,95,99" + + if ctx.directions: + ctx.directions = ctx.directions.lower() + + if ctx.interval is None: + ctx.interval = 1000 + + if ctx.usbin: + ctx.time_divisor = 1000.0 # bins are in us + else: + ctx.time_divisor = 1000000.0 # bins are in ns + + gen_output_columns(ctx) + + + # Automatically detect how many columns are in the input files, + # calculate the corresponding 'coarseness' parameter used to generate + # those files, and calculate the appropriate bin latency values: + with open(ctx.FILE[0], 'r') as fp: + global bin_vals,lower_bin_vals,upper_bin_vals,__HIST_COLUMNS,__TOTAL_COLUMNS + __TOTAL_COLUMNS = len(fp.readline().split(',')) + __HIST_COLUMNS = __TOTAL_COLUMNS - __NON_HIST_COLUMNS + + max_cols = guess_max_from_bins(ctx, __HIST_COLUMNS) + coarseness = int(np.log2(float(max_cols) / __HIST_COLUMNS)) + bin_vals = np.array([plat_idx_to_val_coarse(x, coarseness) for x in np.arange(__HIST_COLUMNS)], dtype=float) + lower_bin_vals = np.array([plat_idx_to_val_coarse(x, coarseness, 0.0) for x in np.arange(__HIST_COLUMNS)], dtype=float) + upper_bin_vals = np.array([plat_idx_to_val_coarse(x, coarseness, 1.0) for x in np.arange(__HIST_COLUMNS)], dtype=float) + + # indicate which directions to output (read(0), write(1), trim(2), mixed(3)) + directions = set() + if not ctx.directions or 'm' in ctx.directions: directions.add('m') + if ctx.directions and 'r' in ctx.directions: directions.add('r') + if ctx.directions and 'w' in ctx.directions: directions.add('w') + if ctx.directions and 't' in ctx.directions: directions.add('t') + + if ctx.noweight: + output_interval_data(ctx, directions) + else: + output_weighted_interval_data(ctx, directions) + + +if __name__ == '__main__': + import argparse + runascmd = True + p = argparse.ArgumentParser() + arg = p.add_argument + arg("FILE", help='space separated list of latency log filenames', nargs='+') + arg('--buff_size', + default=10000, + type=int, + help='number of samples to buffer into numpy at a time') + + arg('--max_latency', + default=20, + type=float, + help='number of seconds of data to process at a time') + + arg('-i', '--interval', + type=int, + help='interval width (ms), default 1000 ms') + + arg('--noweight', + action='store_true', + default=False, + help='do not perform weighting of samples between output intervals') + + arg('-d', '--divisor', + required=False, + type=int, + default=1, + help='divide the results by this value.') + + arg('--decimals', + default=3, + type=int, + help='number of decimal places to print floats to') + + arg('--warn', + dest='warn', + action='store_true', + default=False, + help='print warning messages to stderr') + + arg('--group_nr', + default=29, + type=int, + help='FIO_IO_U_PLAT_GROUP_NR as defined in stat.h') + + arg('--job-file', + default=None, + type=str, + help='Optional argument pointing to the job file used to create the ' + 'given histogram files. Useful for auto-detecting --log_hist_msec and ' + '--log_unix_epoch (in fio) values.') + + arg('--percentiles', + default="90:95:99", + type=str, + help='Optional argument of comma or colon separated percentiles to print. ' + 'The default is "90.0:95.0:99.0". min, median(50%%) and max percentiles are always printed') + + arg('--usbin', + default=False, + action='store_true', + help='histogram bin latencies are in us (fio versions < 2.99. fio uses ns for version >= 2.99') + + arg('--directions', + default=None, + type=str, + help='Optionally split results output by reads, writes, trims or mixed. 
'
+        'Value may be any combination of "rwtm" characters. '
+        'By default, only "mixed" results are output without a "dir" field. '
+        'But specifying the --directions option '
+        'adds a "dir" field to the output content, and separate rows for each of the indicated '
+        'directions.')
+
+    main(p.parse_args())
+
diff --git a/tools/hist/fiologparser_hist.py.1 b/tools/hist/fiologparser_hist.py.1
new file mode 100644
index 0000000..449f248
--- /dev/null
+++ b/tools/hist/fiologparser_hist.py.1
@@ -0,0 +1,220 @@
+.TH fiologparser_hist.py 1 "August 18, 2016"
+.SH NAME
+fiologparser_hist.py \- Calculate statistics from fio histograms
+.SH SYNOPSIS
+.B fiologparser_hist.py
+[\fIoptions\fR] [clat_hist_files]...
+.SH DESCRIPTION
+.B fiologparser_hist.py
+is a utility for converting *_clat_hist* files
+generated by fio into a CSV of latency statistics including minimum,
+average, maximum latency, and selectable percentiles.
+.SH EXAMPLES
+.PP
+.nf
+$ fiologparser_hist.py *_clat_hist*
+end-time, samples, min, avg, median, 90%, 95%, 99%, max
+1000, 15, 192, 1678.107, 1788.859, 1856.076, 1880.040, 1899.208, 1888.000
+2000, 43, 152, 1642.368, 1714.099, 1816.659, 1845.552, 1888.131, 1888.000
+4000, 39, 1152, 1546.962, 1545.785, 1627.192, 1640.019, 1691.204, 1744
+\[char46]..
+.fi
+.PP
+
+.SH OPTIONS
+.TP
+.BR \-\-help
+Print these options.
+.TP
+.BR \-\-buff_size \fR=\fPint
+Number of samples to buffer into numpy at a time. Default is 10,000.
+This can be adjusted to help performance.
+.TP
+.BR \-\-max_latency \fR=\fPint
+Number of seconds of data to process at a time. Defaults to 20 seconds,
+in order to handle the 17 second upper bound on latency in histograms
+reported by fio. This should be increased if fio has been
+run with a larger maximum latency. Lowering this when a lower maximum
+latency is known can improve performance. See NOTES for more details.
+.TP
+.BR \-i ", " \-\-interval \fR=\fPint
+Interval at which statistics are reported. Defaults to 1000 ms. This
+should be set to no less than the value of \fBlog_hist_msec\fR as given
+to fio.
+.TP
+.BR \-\-noweight
+Do not perform weighting of samples between output intervals. Default is False.
+.TP
+.BR \-d ", " \-\-divisor \fR=\fPint
+Divide statistics by this value. Defaults to 1. Useful if you want to
+convert latencies from milliseconds to seconds (\fBdivisor\fR=\fP1000\fR).
+.TP
+.BR \-\-warn
+Enables warning messages printed to stderr, useful for debugging.
+.TP
+.BR \-\-group_nr \fR=\fPint
+Set this to the value of \fIFIO_IO_U_PLAT_GROUP_NR\fR as defined in
+\fPstat.h\fR if fio has been recompiled. Defaults to 29, the
+current value used in fio. See NOTES for more details.
+.TP
+.BR \-\-percentiles \fR=\fPstr
+Pass desired list of comma or colon separated percentiles to print.
+The default is "90.0:95.0:99.0", but min, median(50%) and max percentiles are always printed.
+.TP
+.BR \-\-usbin
+Use to indicate to the parser that histogram bin latency values are in microseconds.
+The default is to use nanoseconds, but histogram logs from fio versions < 2.99 are in microseconds.
+.TP
+.BR \-\-directions \fR=\fPstr
+By default, all directions (e.g. read and write) histogram bins are combined,
+producing one 'mixed' result.
+To produce independent directional results, pass some combination of
+\'rwtm\' characters with the \-\-directions\fR=\fPrwtm option.
+A \'dir\' column is added indicating the result direction for a row.
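+.PP
+For example, assuming histogram logs from a mixed read/write run, an
+invocation such as:
+.RS
+.PP
+.nf
+$ fiologparser_hist.py --directions rw *_clat_hist*
+end-time, dir, samples, min, avg, median, 90%, 95%, 99%, max
+.fi
+.PP
+.RE
+prints separate rows for reads (r) and writes (w) in each interval; only
+the header line is shown here.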
+
+.SH NOTES
+End-times are calculated to be uniform increments of the \fB\-\-interval\fR value given,
+regardless of when histogram samples are reported. Of note:
+
+.RS
+Intervals with no samples are omitted. In the example above this means
+"no statistics from 2 to 3 seconds" and "39 samples influenced the statistics
+of the interval from 3 to 4 seconds".
+.LP
+Intervals with a single sample will have the same value for all statistics.
+.RE
+
+.PP
+The number of samples is unweighted, corresponding to the total number of samples
+which have any effect whatsoever on the interval.
+
+Min statistics are computed using the value of the lower boundary of the first bin
+(in increasing bin order) with non-zero samples in it. Similarly for max,
+we take the upper boundary of the last bin with non-zero samples in it.
+This is semantically identical to taking the 0th and 100th percentiles with a
+50% bin-width buffer (because percentiles are computed using mid-points of
+the bins). This enforces the following nice properties:
+
+.RS
+min <= 50th <= 90th <= 95th <= 99th <= max
+.LP
+min and max are strict lower and upper bounds on the actual
+min / max seen by fio (and reported in *_clat.* with averaging turned off).
+.RE
+
+.PP
+Average statistics use a standard weighted arithmetic mean.
+
+When the \fB\-\-noweight\fR option is not given (the default),
+percentile statistics are computed using the weighted percentile method as
+described here: \fIhttps://en.wikipedia.org/wiki/Percentile#Weighted_percentile\fR.
+See the weights() method for details on how weights are computed for individual
+samples. In process_interval() we further multiply by the height of each bin
+to get weighted histograms.
+
+We convert files given on the command line, assumed to be fio histogram files.
+An individual histogram file can contain the
+histograms for multiple different r/w directions (notably when \fB\-\-rw\fR=\fPrandrw\fR). This
+is accounted for by tracking each r/w direction separately. In the statistics
+reported we ultimately merge *all* histograms (regardless of r/w direction).
+
+The value of *_GROUP_NR in \fIstat.h\fR (and *_BITS) determines how many latency bins
+fio outputs when histogramming is enabled. Namely for GROUP_NR=19 (the default
+in older, microsecond-based versions of fio), we get 1,216 bins with a maximum latency of approximately 17
+seconds. For certain applications this may not be sufficient. With GROUP_NR=24
+we have 1,536 bins, giving us a maximum latency of 541 seconds (~ 9 minutes). If
+you expect your application to experience latencies greater than 17 seconds,
+you will need to recompile fio with a larger GROUP_NR, e.g. with:
+
+.RS
+.PP
+.nf
+sed -i.bak 's/^#define FIO_IO_U_PLAT_GROUP_NR 19$/#define FIO_IO_U_PLAT_GROUP_NR 24/' stat.h
+make fio
+.fi
+.PP
+.RE
+
+.PP
+Quick reference table for the max latency corresponding to a sampling of
+values for GROUP_NR:
+
+.RS
+.PP
+.nf
+GROUP_NR | # bins | max latency bin value
+19       | 1216   | 16.9 sec
+20       | 1280   | 33.8 sec
+21       | 1344   | 67.6 sec
+22       | 1408   | 2 min, 15 sec
+23       | 1472   | 4 min, 32 sec
+24       | 1536   | 9 min, 4 sec
+25       | 1600   | 18 min, 8 sec
+26       | 1664   | 36 min, 16 sec
+.fi
+.PP
+.RE
+
+.PP
+At present this program automatically detects the number of histogram bins in
+the log files, and adjusts the bin latency values accordingly.
In particular if +you use the \fB\-\-log_hist_coarseness\fR parameter of fio, you get output files with +a number of bins according to the following table (note that the first +row is identical to the table above): + +.RS +.PP +.nf +coarse \\ GROUP_NR + 19 20 21 22 23 24 25 26 + ------------------------------------------------------- + 0 [[ 1216, 1280, 1344, 1408, 1472, 1536, 1600, 1664], + 1 [ 608, 640, 672, 704, 736, 768, 800, 832], + 2 [ 304, 320, 336, 352, 368, 384, 400, 416], + 3 [ 152, 160, 168, 176, 184, 192, 200, 208], + 4 [ 76, 80, 84, 88, 92, 96, 100, 104], + 5 [ 38, 40, 42, 44, 46, 48, 50, 52], + 6 [ 19, 20, 21, 22, 23, 24, 25, 26], + 7 [ N/A, 10, N/A, 11, N/A, 12, N/A, 13], + 8 [ N/A, 5, N/A, N/A, N/A, 6, N/A, N/A]] +.fi +.PP +.RE + +.PP +For other values of GROUP_NR and coarseness, this table can be computed like this: + +.RS +.PP +.nf +bins = [1216,1280,1344,1408,1472,1536,1600,1664] +max_coarse = 8 +fncn = lambda z: list(map(lambda x: z/2**x if z % 2**x == 0 else nan, range(max_coarse + 1))) +np.transpose(list(map(fncn, bins))) +.fi +.PP +.RE + +.PP +If you have not adjusted GROUP_NR for your (high latency) application, then you +will see the percentiles computed by this tool max out at the max latency bin +value as in the first table above, and in this plot (where GROUP_NR=19 and thus we see +a max latency of ~16.7 seconds in the red line): + +.RS +\fIhttps://www.cronburg.com/fio/max_latency_bin_value_bug.png +.RE + +.PP +Motivation for, design decisions, and the implementation process are +described in further detail here: + +.RS +\fIhttps://www.cronburg.com/fio/cloud-latency-problem-measurement/ +.RE + +.SH AUTHOR +.B fiologparser_hist.py +and this manual page were written by Karl Cronburg . +.SH "REPORTING BUGS" +Report bugs to the \fBfio\fR mailing list . diff --git a/tools/hist/half-bins.py b/tools/hist/half-bins.py new file mode 100755 index 0000000..1bba8ff --- /dev/null +++ b/tools/hist/half-bins.py @@ -0,0 +1,37 @@ +#!/usr/bin/python2.7 +""" Cut the number bins in half in fio histogram output. Example usage: + + $ half-bins.py -c 2 output_clat_hist.1.log > smaller_clat_hist.1.log + + Which merges e.g. bins [0 .. 3], [4 .. 7], ..., [1212 .. 1215] resulting in + 304 = 1216 / (2**2) merged bins per histogram sample. + + @author Karl Cronburg +""" +import sys + +def main(ctx): + stride = 1 << ctx.coarseness + with open(ctx.FILENAME, 'r') as fp: + for line in fp.readlines(): + vals = line.split(', ') + sys.stdout.write("%s, %s, %s, " % tuple(vals[:3])) + + hist = list(map(int, vals[3:])) + for i in range(0, len(hist) - stride, stride): + sys.stdout.write("%d, " % sum(hist[i : i + stride],)) + sys.stdout.write("%d\n" % sum(hist[len(hist) - stride:])) + +if __name__ == '__main__': + import argparse + p = argparse.ArgumentParser() + arg = p.add_argument + arg( 'FILENAME', help='clat_hist file for which we will reduce' + ' (by half or more) the number of bins.') + arg('-c', '--coarseness', + default=1, + type=int, + help='number of times to reduce number of bins by half, ' + 'e.g. coarseness of 4 merges each 2^4 = 16 consecutive ' + 'bins.') + main(p.parse_args()) diff --git a/tools/plot/fio2gnuplot b/tools/plot/fio2gnuplot new file mode 100755 index 0000000..cc4ea4c --- /dev/null +++ b/tools/plot/fio2gnuplot @@ -0,0 +1,532 @@ +#!/usr/bin/python2.7 +# Note: this script is python2 and python3 compatible. +# +# Copyright (C) 2013 eNovance SAS +# Author: Erwan Velu +# +# The license below covers all files distributed with fio unless otherwise +# noted in the file itself. 
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+
+from __future__ import absolute_import
+from __future__ import print_function
+import os
+import fnmatch
+import sys
+import getopt
+import re
+import math
+import shutil
+from six.moves import map
+from six.moves import range
+
+def find_file(path, pattern):
+    fio_data_file=[]
+    # For all the local files
+    for file in os.listdir(path):
+        # If the file matches the glob
+        if fnmatch.fnmatch(file, pattern):
+            # Let's consider this file
+            fio_data_file.append(file)
+
+    return fio_data_file
+
+def generate_gnuplot_script(fio_data_file,title,gnuplot_output_filename,gnuplot_output_dir,mode,disk_perf,gpm_dir):
+    if verbose: print("Generating rendering scripts")
+    filename=gnuplot_output_dir+'mygraph'
+    temporary_files.append(filename)
+    f=open(filename,'w')
+
+    # Plotting 3D or comparing graphs only makes sense if there are at least 2 traces
+    if len(fio_data_file) > 1:
+        f.write("call \'%s/graph3D.gpm\' \'%s' \'%s\' \'\' \'%s\' \'%s\'\n" % (gpm_dir,title,gnuplot_output_filename,gnuplot_output_filename,mode))
+
+        # Setting up the compare files that will be plotted later
+        compare=open(gnuplot_output_dir + 'compare.gnuplot','w')
+        compare.write('''
+set title '%s'
+set terminal png size 1280,1024
+set ytics axis out auto
+set key top left reverse
+set xlabel "Time (Seconds)"
+set ylabel '%s'
+set yrange [0:]
+set style line 1 lt 1 lw 3 pt 3 linecolor rgb "green"
+'''% (title,mode))
+        compare.close()
+        # Copying the common file for all kinds of graphs (raw/smooth/trend)
+        compare_raw_filename="compare-%s-2Draw" % (gnuplot_output_filename)
+        compare_smooth_filename="compare-%s-2Dsmooth" % (gnuplot_output_filename)
+        compare_trend_filename="compare-%s-2Dtrend" % (gnuplot_output_filename)
+
+        shutil.copy(gnuplot_output_dir+'compare.gnuplot',gnuplot_output_dir+compare_raw_filename+".gnuplot")
+        shutil.copy(gnuplot_output_dir+'compare.gnuplot',gnuplot_output_dir+compare_smooth_filename+".gnuplot")
+        shutil.copy(gnuplot_output_dir+'compare.gnuplot',gnuplot_output_dir+compare_trend_filename+".gnuplot")
+        temporary_files.append(gnuplot_output_dir+compare_raw_filename+".gnuplot")
+        temporary_files.append(gnuplot_output_dir+compare_smooth_filename+".gnuplot")
+        temporary_files.append(gnuplot_output_dir+compare_trend_filename+".gnuplot")
+
+        # Setting up a different output filename for each kind of graph
+        compare_raw=open(gnuplot_output_dir+compare_raw_filename + ".gnuplot",'a')
+        compare_raw.write("set output '%s.png'\n" % compare_raw_filename)
+        compare_smooth=open(gnuplot_output_dir+compare_smooth_filename+".gnuplot",'a')
+        compare_smooth.write("set output '%s.png'\n" % compare_smooth_filename)
+        compare_trend=open(gnuplot_output_dir+compare_trend_filename+".gnuplot",'a')
+        compare_trend.write("set output '%s.png'\n" % compare_trend_filename)
+
+        # Let's plot the average value for all the traces
+        global_disk_perf = sum(disk_perf, [])
+        global_avg = average(global_disk_perf)
+        compare_raw.write("plot %s w l ls 1 ti 'Global average value (%.2f)'" % (global_avg,global_avg))
+        compare_smooth.write("plot %s w l ls 1 ti 'Global average value (%.2f)'" % (global_avg,global_avg))
+        compare_trend.write("plot %s w l ls 1 ti 'Global average value (%.2f)'" % (global_avg,global_avg))
+
+    pos=0
+    # Let's create a temporary file for each selected fio file
+    for file in fio_data_file:
+        tmp_filename = "gnuplot_temp_file.%d" % pos
+
+        # Comparing graphs only makes sense if there are at least 2 traces
+        if len(fio_data_file) > 1:
+            # Adding the plot instruction for each kind of comparing graph
+            compare_raw.write(",\\\n'%s' using 2:3 with linespoints title '%s'" % (tmp_filename,fio_data_file[pos]))
+            compare_smooth.write(",\\\n'%s' using 2:3 smooth csplines title '%s'" % (tmp_filename,fio_data_file[pos]))
+            compare_trend.write(",\\\n'%s' using 2:3 smooth bezier title '%s'" % (tmp_filename,fio_data_file[pos]))
+
+        png_file=file.replace('.log','')
+        raw_filename = "%s-2Draw" % (png_file)
+        smooth_filename = "%s-2Dsmooth" % (png_file)
+        trend_filename = "%s-2Dtrend" % (png_file)
+        avg = average(disk_perf[pos])
+        f.write("call \'%s/graph2D.gpm\' \'%s' \'%s\' \'%s\' \'%s\' \'%s\' \'%s\' \'%s\' \'%f\'\n" % (gpm_dir,title,tmp_filename,fio_data_file[pos],raw_filename,mode,smooth_filename,trend_filename,avg))
+        pos += 1
+
+    # Comparing graphs only makes sense if there are at least 2 traces
+    if len(fio_data_file) > 1:
+        os.remove(gnuplot_output_dir+"compare.gnuplot")
+        compare_raw.close()
+        compare_smooth.close()
+        compare_trend.close()
+    f.close()
+
+def generate_gnuplot_math_script(title,gnuplot_output_filename,mode,average,gnuplot_output_dir,gpm_dir):
+    filename=gnuplot_output_dir+'mymath'
+    temporary_files.append(filename)
+    f=open(filename,'a')
+    f.write("call \'%s/math.gpm\' \'%s' \'%s\' \'\' \'%s\' \'%s\' %s\n" % (gpm_dir,title,gnuplot_output_filename,gnuplot_output_filename,mode,average))
+    f.close()
+
+def compute_aggregated_file(fio_data_file, gnuplot_output_filename, gnuplot_output_dir):
+    if verbose: print("Processing data file 2/2")
+    temp_files=[]
+    pos=0
+
+    # Let's open the temporary file previously created for each selected fio file
+    for file in fio_data_file:
+        tmp_filename = "%sgnuplot_temp_file.%d" % (gnuplot_output_dir, pos)
+        temp_files.append(open(tmp_filename,'r'))
+        pos += 1
+
+    f = open(gnuplot_output_dir+gnuplot_output_filename, "w")
+    temporary_files.append(gnuplot_output_dir+gnuplot_output_filename)
+    index=0
+    # Let's add some information
+    for tempfile in temp_files:
+        f.write("# Disk%d was coming from %s\n" % (index,fio_data_file[index]))
+        f.write(tempfile.read())
+        f.write("\n")
+        tempfile.close()
+        index += 1
+    f.close()
+
+def average(s): return sum(s) * 1.0 / len(s)
+
+def compute_temp_file(fio_data_file,disk_perf,gnuplot_output_dir, min_time, max_time):
+    end_time=max_time
+    if end_time == -1:
+        end_time="infinite"
+    if verbose: print("Processing data file 1/2 with min_time=%s and max_time=%s" % (min_time, end_time))
+    files=[]
+    temp_outfile=[]
+    blk_size=0
+    # NOTE: the body that opens each trace, creates the per-trace temporary
+    # files and walks the merged samples is truncated in this copy of the
+    # patch; only its final time-window filter survives:
+            if ((int(time) > (float(min_time)*1000)) and ((int(time) < (int(max_time)*1000)) or max_time==-1)):
+                disk_perf[index].append(int(perf))
+                perfs.append("%d %s %s" % (index, time, perf))
+
+    # If we reach this point, it means that all the traces are coherent
+    for p in perfs:
+        index, perf_time, perf = p.split()
+        temp_outfile[int(index)].write("%s %.2f %s\n" % (index, float(perf_time)/1000, perf))
+
+    for file in files:
+        file.close()
+    for file in temp_outfile:
+        file.close()
+    return blk_size
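+# Illustrative sketch, not part of the original script: fio bw/iops logs are
+# comma-separated "time_msec, value, direction, blocksize" records, and
+# compute_temp_file() above turns the samples falling inside the
+# [min_time, max_time] window into "index time_sec value" rows, writing one
+# temporary file per input trace. The per-line conversion amounts to:
+#
+#   time_ms, value, direction, bs = line.replace(',', ' ').split()[:4]
+#   if int(time_ms) > float(min_time) * 1000 and \
+#      (max_time == -1 or int(time_ms) < int(max_time) * 1000):
+#       out.write("%d %.2f %s\n" % (index, int(time_ms) / 1000.0, value))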
+def compute_math(fio_data_file, title,gnuplot_output_filename,gnuplot_output_dir,mode,disk_perf,gpm_dir):
+    if verbose: print("Computing Maths")
+    global_min=[]
+    global_max=[]
+    average_file=open(gnuplot_output_dir+gnuplot_output_filename+'.average', 'w')
+    min_file=open(gnuplot_output_dir+gnuplot_output_filename+'.min', 'w')
+    max_file=open(gnuplot_output_dir+gnuplot_output_filename+'.max', 'w')
+    stddev_file=open(gnuplot_output_dir+gnuplot_output_filename+'.stddev', 'w')
+    global_file=open(gnuplot_output_dir+gnuplot_output_filename+'.global','w')
+    temporary_files.append(gnuplot_output_dir+gnuplot_output_filename+'.average')
+    temporary_files.append(gnuplot_output_dir+gnuplot_output_filename+'.min')
+    temporary_files.append(gnuplot_output_dir+gnuplot_output_filename+'.max')
+    temporary_files.append(gnuplot_output_dir+gnuplot_output_filename+'.stddev')
+    temporary_files.append(gnuplot_output_dir+gnuplot_output_filename+'.global')
+
+    min_file.write('DiskName %s\n' % mode)
+    max_file.write('DiskName %s\n' % mode)
+    average_file.write('DiskName %s\n' % mode)
+    stddev_file.write('DiskName %s\n' % mode)
+    for disk in range(len(fio_data_file)):
+        # print(disk_perf[disk])
+        min_file.write("# Disk%d was coming from %s\n" % (disk,fio_data_file[disk]))
+        max_file.write("# Disk%d was coming from %s\n" % (disk,fio_data_file[disk]))
+        average_file.write("# Disk%d was coming from %s\n" % (disk,fio_data_file[disk]))
+        stddev_file.write("# Disk%d was coming from %s\n" % (disk,fio_data_file[disk]))
+        avg = average(disk_perf[disk])
+        variance = [(x - avg)**2 for x in disk_perf[disk]]
+        standard_deviation = math.sqrt(average(variance))
+        # print("Disk%d [ min=%.2f max=%.2f avg=%.2f stddev=%.2f ]" % (disk,min(disk_perf[disk]),max(disk_perf[disk]),avg,standard_deviation))
+        average_file.write('%d %d\n' % (disk, avg))
+        stddev_file.write('%d %d\n' % (disk, standard_deviation))
+        local_min=min(disk_perf[disk])
+        local_max=max(disk_perf[disk])
+        min_file.write('%d %d\n' % (disk, local_min))
+        max_file.write('%d %d\n' % (disk, local_max))
+        global_min.append(int(local_min))
+        global_max.append(int(local_max))
+
+    global_disk_perf = sum(disk_perf, [])
+    avg = average(global_disk_perf)
+    variance = [(x - avg)**2 for x in global_disk_perf]
+    standard_deviation = math.sqrt(average(variance))
+
+    global_file.write('min=%.2f\n' % min(global_disk_perf))
+    global_file.write('max=%.2f\n' % max(global_disk_perf))
+    global_file.write('avg=%.2f\n' % avg)
+    global_file.write('stddev=%.2f\n' % standard_deviation)
+    global_file.write('values_count=%d\n' % len(global_disk_perf))
+    global_file.write('disks_count=%d\n' % len(fio_data_file))
+    # print("Global [ min=%.2f max=%.2f avg=%.2f stddev=%.2f ]" % (min(global_disk_perf),max(global_disk_perf),avg,standard_deviation))
+
+    average_file.close()
+    min_file.close()
+    max_file.close()
+    stddev_file.close()
+    global_file.close()
+    try:
+        os.remove(gnuplot_output_dir+'mymath')
+    except OSError:
+        pass
+
+    generate_gnuplot_math_script("Average values of "+title,gnuplot_output_filename+'.average',mode,int(avg),gnuplot_output_dir,gpm_dir)
+    generate_gnuplot_math_script("Min values of "+title,gnuplot_output_filename+'.min',mode,average(global_min),gnuplot_output_dir,gpm_dir)
+    generate_gnuplot_math_script("Max values of "+title,gnuplot_output_filename+'.max',mode,average(global_max),gnuplot_output_dir,gpm_dir)
+    generate_gnuplot_math_script("Standard Deviation of "+title,gnuplot_output_filename+'.stddev',mode,int(standard_deviation),gnuplot_output_dir,gpm_dir)
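+# Illustrative note, not part of the original script: given the writes in
+# compute_math() above, a .global file is a small "name=value" record, e.g.
+# (values are made up):
+#
+#   min=120.00
+#   max=4270.00
+#   avg=2240.50
+#   stddev=310.25
+#   values_count=1200
+#   disks_count=4
+#
+# parse_global_files() below scans these files line by line, splitting on '='.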
+def parse_global_files(fio_data_file, global_search):
+    max_result=0
+    max_file=''
+    for file in fio_data_file:
+        f=open(file)
+        disks_count=0
+        search_value=-1
+
+        # Let's read the complete file
+        while True:
+            try:
+                # Split the name from the value
+                name,value=f.readline().split("=")
+            except:
+                f.close()
+                break
+            # If we reached the end of the file
+            if not name:
+                # Let's process what we have
+                f.close()
+                break
+            else:
+                # disks_count is not a global_search item,
+                # but we need it for some computations, so let's save it
+                if name=="disks_count":
+                    disks_count=int(value)
+
+                # Let's catch the searched item
+                if global_search in name:
+                    search_value=float(value)
+
+        # Let's process the avg value by estimating the global bandwidth per file
+        # We keep the biggest in memory for reporting
+        if global_search == "avg":
+            if (disks_count > 0) and (search_value != -1):
+                result=disks_count*search_value
+                if (result > max_result):
+                    max_result=result
+                    max_file=file
+
+    # Let's print the avg output
+    if global_search == "avg":
+        print("Biggest aggregated value of %s was %.2f in file %s\n" % (global_search, max_result, max_file))
+    else:
+        print("Global search %s is not yet implemented\n" % global_search)
+
+def render_gnuplot(fio_data_file, gnuplot_output_dir):
+    print("Running gnuplot rendering")
+    try:
+        # Let's render all the compared files, if any
+        if len(fio_data_file) > 1:
+            if verbose: print(" |-> Rendering comparing traces")
+            os.system("cd %s; for i in *.gnuplot; do gnuplot $i; done" % gnuplot_output_dir)
+        if verbose: print(" |-> Rendering math traces")
+        os.system("cd %s; gnuplot mymath" % gnuplot_output_dir)
+        if verbose: print(" |-> Rendering 2D & 3D traces")
+        os.system("cd %s; gnuplot mygraph" % gnuplot_output_dir)
+
+        name_of_directory="the current"
+        if gnuplot_output_dir != "./":
+            name_of_directory=gnuplot_output_dir
+        print("\nRendered traces are available in %s directory" % name_of_directory)
+        global keep_temp_files
+        keep_temp_files=False
+    except:
+        print("Could not run gnuplot on mymath or mygraph!\n")
+        sys.exit(1)
+
+def print_help():
+    print('fio2gnuplot -ghbiodvk -t <title> -o <outputfile> -p <pattern> -G <type> -m <time> -M <time>')
+    print()
+    print('-h --help                           : Print this help')
+    print('-p <pattern> or --pattern <pattern> : A glob pattern to select fio input files')
+    print('-b or --bandwidth                   : A predefined pattern for selecting *_bw.log files')
+    print('-i or --iops                        : A predefined pattern for selecting *_iops.log files')
+    print('-g or --gnuplot                     : Render gnuplot traces before exiting')
+    print('-o or --outputfile <file>           : The basename for gnuplot traces')
+    print('                                       - Basename is set from the pattern if defined')
+    print('-d or --outputdir <dir>             : The directory where gnuplot shall render files')
+    print('-t or --title <title>               : The title of the gnuplot traces')
+    print('                                       - Title is completed with the block size detected in fio traces')
+    print('-G or --Global <type>               : Search for <type> in .global files matched by a pattern')
+    print('                                       - Available types are : min, max, avg, stddev')
+    print('                                       - The .global extension is added automatically to the pattern')
+    print('-m or --min_time <time>             : Only consider data starting from <time> seconds (default is 0)')
+    print('-M or --max_time <time>             : Only consider data ending before <time> seconds (default is -1, i.e. no limit)')
+    print('-v or --verbose                     : Increase verbosity')
+    print("-k or --keep                        : Keep all temporary files from gnuplot's output dir")
+
+def main(argv):
+    mode='unknown'
+    pattern=''
+    pattern_set_by_user=False
+    title='No title'
+    gnuplot_output_filename='result'
+    gnuplot_output_dir='./'
+    gpm_dir="/usr/share/fio/"
+    disk_perf=[]
+    run_gnuplot=False
+    parse_global=False
+    global_search=''
+    min_time=0
+    max_time=-1
+    global verbose
+    verbose=False
+    global temporary_files
+    temporary_files=[]
+    global keep_temp_files
+    keep_temp_files=True
+    force_keep_temp_files=False
+
+    if not os.path.isfile(gpm_dir+'math.gpm'):
+        gpm_dir="/usr/local/share/fio/"
+        if not os.path.isfile(gpm_dir+'math.gpm'):
+            print("Looks like fio wasn't installed properly: no gpm files found in '/usr/share/fio' or '/usr/local/share/fio'\n")
+            sys.exit(3)
+
+    try:
+        # Long options taking a value need a trailing '=' so getopt accepts their argument
+        opts, args = getopt.getopt(argv[1:],"ghkbivo:d:t:p:G:m:M:",['bandwidth', 'iops', 'pattern=', 'outputfile=', 'outputdir=', 'title=', 'min_time=', 'max_time=', 'gnuplot', 'Global=', 'help', 'verbose', 'keep'])
+    except getopt.GetoptError:
+        print("Error: one of the options passed on the cmdline was not supported")
+        print("Please fix your command line or read the help (-h option)")
+        sys.exit(2)
+
+    for opt, arg in opts:
+        if opt in ("-b", "--bandwidth"):
+            pattern='*_bw.log'
+        elif opt in ("-i", "--iops"):
+            pattern='*_iops.log'
+        elif opt in ("-v", "--verbose"):
+            verbose=True
+        elif opt in ("-k", "--keep"):
+            # User really wants to keep the temporary files
+            force_keep_temp_files=True
+        elif opt in ("-p", "--pattern"):
+            pattern_set_by_user=True
+            pattern=arg
+            pattern=pattern.replace('\\','')
+        elif opt in ("-o", "--outputfile"):
+            gnuplot_output_filename=arg
+        elif opt in ("-d", "--outputdir"):
+            gnuplot_output_dir=arg
+            if not gnuplot_output_dir.endswith('/'):
+                gnuplot_output_dir=gnuplot_output_dir+'/'
+            if not os.path.exists(gnuplot_output_dir):
+                os.makedirs(gnuplot_output_dir)
+        elif opt in ("-t", "--title"):
+            title=arg
+        elif opt in ("-m", "--min_time"):
+            min_time=arg
+        elif opt in ("-M", "--max_time"):
+            max_time=arg
+        elif opt in ("-g", "--gnuplot"):
+            run_gnuplot=True
+        elif opt in ("-G", "--Global"):
+            parse_global=True
+            global_search=arg
+        elif opt in ("-h", "--help"):
+            print_help()
+            sys.exit(1)
+
+    # Add the .global extension to the pattern when searching .global files
+    if parse_global==True:
+        if not gnuplot_output_filename.endswith('.global'):
+            pattern = pattern+'.global'
+
+    fio_data_file=find_file('.',pattern)
+    if len(fio_data_file) == 0:
+        print("No log file found with pattern %s!" % pattern)
+        # Try the per-numjob log file format (used when per_numjob_logs=1)
+        if (pattern == '*_bw.log'):
+            fio_data_file=find_file('.','*_bw.*.log')
+        if (pattern == '*_iops.log'):
+            fio_data_file=find_file('.','*_iops.*.log')
+        if len(fio_data_file) == 0:
+            sys.exit(1)
+        else:
+            print("Using log file per job format instead")
+    else:
+        print("%d files selected with pattern '%s'" % (len(fio_data_file), pattern))
+
+    fio_data_file=sorted(fio_data_file, key=str.lower)
+    for file in fio_data_file:
+        print(' |-> %s' % file)
+        if "_bw.log" in file:
+            mode="Bandwidth (KB/sec)"
+        if "_iops.log" in file:
+            mode="IO per second (IO/sec)"
+    if (title == 'No title') and (mode != 'unknown'):
+        if "Bandwidth" in mode:
+            title='Bandwidth benchmark with %d fio results' % len(fio_data_file)
+        if "IO" in mode:
+            title='IO benchmark with %d fio results' % len(fio_data_file)
+
+    print()
+    # We need to adjust the output filename regarding the pattern required by the user
+    if (pattern_set_by_user == True):
+        gnuplot_output_filename=pattern
+        # As the pattern contains globs, remove the glob parts
+        # to get a clean output file name
+        gnuplot_output_filename=gnuplot_output_filename.replace('-*-','-')
+        gnuplot_output_filename=gnuplot_output_filename.replace('*','-')
+        gnuplot_output_filename=gnuplot_output_filename.replace('--','-')
+        gnuplot_output_filename=gnuplot_output_filename.replace('.log','')
+        # Ensure that we don't have any leading or trailing dash in the filename
+        gnuplot_output_filename = gnuplot_output_filename[:-1] if gnuplot_output_filename.endswith('-') else gnuplot_output_filename
+        gnuplot_output_filename = gnuplot_output_filename[1:] if gnuplot_output_filename.startswith('-') else gnuplot_output_filename
+        if (gnuplot_output_filename == ''):
+            gnuplot_output_filename='default'
+
+    if parse_global==True:
+        parse_global_files(fio_data_file, global_search)
+    else:
+        blk_size=compute_temp_file(fio_data_file,disk_perf,gnuplot_output_dir,min_time,max_time)
+        title="%s @ Blocksize = %dK" % (title,blk_size/1024)
+        compute_aggregated_file(fio_data_file, gnuplot_output_filename, gnuplot_output_dir)
+        compute_math(fio_data_file,title,gnuplot_output_filename,gnuplot_output_dir,mode,disk_perf,gpm_dir)
+        generate_gnuplot_script(fio_data_file,title,gnuplot_output_filename,gnuplot_output_dir,mode,disk_perf,gpm_dir)
+
+        if (run_gnuplot==True):
+            render_gnuplot(fio_data_file, gnuplot_output_dir)
+
+    # Shall we clean the temporary files?
+    if keep_temp_files==False and force_keep_temp_files==False:
+        # Cleaning temporary files
+        if verbose: print("Cleaning temporary files")
+        for f in temporary_files:
+            if verbose: print(" -> %s" % f)
+            try:
+                os.remove(f)
+            except OSError:
+                pass
+
+# Main
+if __name__ == "__main__":
+    sys.exit(main(sys.argv))
diff --git a/tools/plot/fio2gnuplot.1 b/tools/plot/fio2gnuplot.1
new file mode 100644
index 0000000..6fb1283
--- /dev/null
+++ b/tools/plot/fio2gnuplot.1
@@ -0,0 +1,161 @@
+.\" Text automatically generated by txt2man
+.TH fio2gnuplot 1 "August 2013"
+.SH NAME
+\fBfio2gnuplot \fP- Render fio's output files with gnuplot
+.SH SYNOPSIS
+.nf
+.fam C
+\fBfio2gnuplot\fP [\fB-ghbiodvk\fP] [\fB-t\fP \fItitle\fP] [\fB-o\fP \fIoutputfile\fP]
+            [\fB-d\fP \fIoutput_dir\fP] [\fB-p\fP \fIpattern\fP]
+            [\fB-G\fP \fItype\fP] [\fB-m\fP \fImin_time\fP] [\fB-M\fP \fImax_time\fP]
+
+.fam T
+.fi
+.SH DESCRIPTION
+\fBfio2gnuplot\fP analyzes a set of fio log files and turns them into a set of graphical traces using the gnuplot tool.
+Several flavors of plotting are produced:
+.TP
+.B
+Individual 2D Graph
+Each file is plotted in a separate image file with several options:
+.RS
+.IP \(bu 3
+raw : a plot of the exact reported performance. This plot can be difficult to read.
+.IP \(bu 3
+smooth : a smoother version of the raw plot.
+Using the csplines option of gnuplot, the rendering is
+filtered to give an easier-to-read graph.
+.IP \(bu 3
+trend : an even smoother version of the raw plot, showing trends.
+Bezier curves give much more filtered plots;
+the resulting graph helps in understanding trends.
+.RE
+.TP
+.B
+Grouped 2D graph
+All files are plotted in a single image to ease comparison. The same rendering options as for the individual 2D graphs are used:
+.RS
+.IP \(bu 3
+raw
+.IP \(bu 3
+smooth
+.IP \(bu 3
+trend
+.RE
+.TP
+.B
+Grouped 3D graph
+All files are plotted into a single 3D graph.
+The 3D plotting generates a 'surface' to estimate how close
+the performances were.
+A flat surface means good coherency between traces;
+a rugged surface means a lack of coherency between traces.
+.TP
+.B
+Mathematical Plotting
+.RS
+.TP
+.B
+Average graph
+A bar graph to show the average performance of each file.
+A green line is added to show the global average performance.
+This green line helps in understanding how far from the average
+each individual file is.
+.TP
+.B
+Min graph
+A bar graph to show the minimum performance of each file.
+A green line is added to show the global average of minimal performance.
+This green line helps in understanding how far from the average
+each individual file is.
+.TP
+.B
+Max graph
+A bar graph to show the maximum performance of each file.
+A green line is added to show the global average of maximal performance.
+This green line helps in understanding how far from the average
+each individual file is.
+.TP
+.B
+Standard Deviation
+A bar graph to show the standard deviation of each file.
+A green line is added to show the global average of standard deviation.
+This green line helps in understanding how far from the average
+each individual file is.
+.RE
+.SH OPTIONS
+.TP
+.B
+\fB-h\fP or \fB--help\fP
+The option \fB-h\fP displays help
+.TP
+.B
+\fB-p\fP '\fIpattern\fP' or --\fIpattern\fP '\fIpattern\fP'
+A glob \fIpattern\fP to select fio input files.
+Don't forget the single quotes to prevent the shell from expanding the pattern
+.TP
+.B
+\fB-b\fP or \fB--bandwidth\fP
+A predefined \fIpattern\fP for selecting *_bw.log files
+.TP
+.B
+\fB-i\fP or \fB--iops\fP
+A predefined \fIpattern\fP for selecting *_iops.log files
+.TP
+.B
+\fB-g\fP or \fB--gnuplot\fP
+Render gnuplot traces before exiting
+.TP
+.B
+\fB-o\fP file or --\fIoutputfile\fP file
+The basename for gnuplot traces (set from the \fIpattern\fP if defined)
+.TP
+.B
+\fB-d\fP dir or \fB--outputdir\fP dir
+The directory where gnuplot shall render files.
+.TP
+.B
+\fB-t\fP \fItitle\fP or --\fItitle\fP \fItitle\fP
+The \fItitle\fP of the gnuplot traces.
+The title is completed with the block size detected in the fio traces
+.TP
+.B
+\fB-G\fP \fItype\fP or \fB--Global\fP \fItype\fP
+Search for '\fItype\fP' in the .global files matched by a \fIpattern\fP.
+Available types are: min, max, avg, stddev.
+The .global extension is added automatically to the \fIpattern\fP
+.TP
+.B
+\fB-m\fP time or --\fImin_time\fP time
+Only consider data starting from 'time' seconds. Default is 0
+.TP
+.B
+\fB-M\fP time or --\fImax_time\fP time
+Only consider data ending before 'time' seconds.
+Default is \fB-1\fP (no limit)
+.TP
+.B
+\fB-v\fP or \fB--verbose\fP
+Increase verbosity
+.TP
+.B
+\fB-k\fP or \fB--keep\fP
+Keep all temporary files from gnuplot's output dir
+.SH EXAMPLE
+.TP
+.B
+To plot all the traces named like 'host*_read_4k_iops.log'
+$ \fBfio2gnuplot\fP \fB-p\fP 'host*_read_4k_iops.log' \fB-g\fP
+.TP
+.B
+To plot all IO oriented log files from the current directory
+$ \fBfio2gnuplot\fP \fB-g\fP \fB-i\fP
+.TP
+.B
+To plot all bandwidth oriented log files from the current directory
+$ \fBfio2gnuplot\fP \fB-g\fP \fB-b\fP
+.TP
+.B
+To plot all bandwidth oriented log files in a directory named 'outdir'
+$ \fBfio2gnuplot\fP \fB-g\fP \fB-b\fP \fB-d\fP outdir
+.SH AUTHOR
+Erwan Velu <erwan@enovance.com>
diff --git a/tools/plot/fio2gnuplot.manpage b/tools/plot/fio2gnuplot.manpage
new file mode 100644
index 0000000..6a12cf8
--- /dev/null
+++ b/tools/plot/fio2gnuplot.manpage
@@ -0,0 +1,117 @@
+NAME
+fio2gnuplot - Render fio's output files with gnuplot
+SYNOPSIS
+fio2gnuplot [-ghbiodvk] [-t title] [-o outputfile]
+            [-d output_dir] [-p pattern]
+            [-G type] [-m min_time] [-M max_time]
+
+DESCRIPTION
+        fio2gnuplot analyzes a set of fio log files and turns them into a set of graphical traces using the gnuplot tool.
+        Several flavors of plotting are produced:
+
+        Individual 2D Graph
+                Each file is plotted in a separate image file with several options:
+                - raw    : a plot of the exact reported performance. This plot can be difficult to read
+                - smooth : a smoother version of the raw plot.
+                           Using the csplines option of gnuplot, the rendering is
+                           filtered to give an easier-to-read graph.
+                - trend  : an even smoother version of the raw plot, showing trends.
+                           Bezier curves give much more filtered plots;
+                           the resulting graph helps in understanding trends.
+
+        Grouped 2D graph
+                All files are plotted in a single image to ease comparison. The same rendering options as for the individual 2D graphs are used:
+                - raw
+                - smooth
+                - trend
+
+        Grouped 3D graph
+                All files are plotted into a single 3D graph.
+                The 3D plotting generates a 'surface' to estimate how close
+                the performances were.
+                A flat surface means good coherency between traces;
+                a rugged surface means a lack of coherency between traces.
+
+        Mathematical Plotting
+                Average graph
+                        A bar graph to show the average performance of each file.
+                        A green line is added to show the global average performance.
+                        This green line helps in understanding how far from the
+                        average each individual file is.
+
+                Min graph
+                        A bar graph to show the minimum performance of each file.
+                        A green line is added to show the global average of minimal performance.
+                        This green line helps in understanding how far from the
+                        average each individual file is.
+
+                Max graph
+                        A bar graph to show the maximum performance of each file.
+                        A green line is added to show the global average of maximal performance.
+                        This green line helps in understanding how far from the
+                        average each individual file is.
+
+                Standard Deviation
+                        A bar graph to show the standard deviation of each file.
+                        A green line is added to show the global average of standard deviation.
+                        This green line helps in understanding how far from the
+                        average each individual file is.
+
+OPTIONS
+        -h or --help
+                The option -h displays help
+
+        -p 'pattern' or --pattern 'pattern'
+                A glob pattern to select fio input files.
+ Don't forget the simple quotes to avoid shell's interactions + + -b or --bandwidth + A predefined pattern for selecting *_bw.log files + + -i or --iops + A predefined pattern for selecting *_iops.log files + + -g or --gnuplot + Render gnuplot traces before exiting + + -o file or --outputfile file + The basename for gnuplot traces (set with the pattern if defined) + + -d dir or --outputdir dir + The directory where gnuplot shall render files. + + -t title or --title title + The title of the gnuplot traces. + Title is set with the block size detected in fio trace + + -G type or --Global type + Search for 'type' in .global files match by a pattern. + Available types are : min, max, avg, stddev. + The .global extension is added automatically to the pattern + + -m time or --min_time time + Only consider data starting from 'time' seconds. Default is 0 + + -M time or --max_time time + Only consider data ending before 'time' seconds. Default is -1 aka nolimit + + -v or --verbose + Increasing verbosity + + -k or --keep + Keep all temporary files from gnuplot's output dir + +EXAMPLE +To plot all the traces named like 'host*_read_4k_iops.log' + $ fio2gnuplot -p 'host*_read_4k_iops.log' -g + +To plot all IO oriented log files from the current directory + $ fio2gnuplot -g -i + +To plot all Bandwidth oriented log files from the current directory + $ fio2gnuplot -g -b + +To plot all Bandwidth oriented log files in a directory name 'outdir' + $ fio2gnuplot -g -b -d outdir + +AUTHOR + Erwan Velu <erwan@enovance.com> diff --git a/tools/plot/graph2D.gpm b/tools/plot/graph2D.gpm new file mode 100644 index 0000000..769b754 --- /dev/null +++ b/tools/plot/graph2D.gpm @@ -0,0 +1,55 @@ +# This Gnuplot file has been generated by eNovance + +needed_args = 8 +if (exists("ARGC") && ARGC >= needed_args) \ + found_args = 1; \ +else if (strlen("$$#") < 3 && "$#" >= needed_args) \ + found_args = 1; \ + ARG1 = "$0"; \ + ARG2 = "$1"; \ + ARG3 = "$2"; \ + ARG4 = "$3"; \ + ARG5 = "$4"; \ + ARG6 = "$5"; \ + ARG7 = "$6"; \ + ARG8 = "$7"; \ +else \ + found_args = 0; \ + print "Aborting: could not find all arguments"; \ + exit + +avg_num = ARG8 + 0 +avg_str = sprintf("%g", avg_num) + +set title ARG1 + +set terminal png size 1280,1024 +set output ARG4 . '.png' +#set terminal x11 + +#Preparing Axes +#set logscale x +set ytics axis out auto +#set data style lines +set key top left reverse +set xlabel "Time (Seconds)" +set ylabel ARG5 +set xrange [0:] +set yrange [0:] + +#Set Color style +#set palette rgbformulae 22,9,23 +#set palette rgbformulae 7,5,15 +set style line 100 lt 7 lw 0.5 +set style line 1 lt 1 lw 3 pt 3 linecolor rgb "green" + +plot ARG2 using 2:3 with linespoints title ARG3, avg_num w l ls 1 ti 'Global average value (' . avg_str . ')' + +set output ARG6 . '.png' +plot ARG2 using 2:3 smooth csplines title ARG3, avg_num w l ls 1 ti 'Global average value (' . avg_str . ')' + +set output ARG7 . '.png' +plot ARG2 using 2:3 smooth bezier title ARG3, avg_num w l ls 1 ti 'Global average value (' . 
avg_str .')' + +#pause -1 +#The End diff --git a/tools/plot/graph3D.gpm b/tools/plot/graph3D.gpm new file mode 100644 index 0000000..ac2cdf6 --- /dev/null +++ b/tools/plot/graph3D.gpm @@ -0,0 +1,95 @@ +# This Gnuplot file has been generated by eNovance + +needed_args = 5 +if (exists("ARGC") && ARGC >= needed_args) \ + found_args = 1; \ +else if (strlen("$$#") < 3 && "$#" >= needed_args) \ + found_args = 1; \ + ARG1 = "$0"; \ + ARG2 = "$1"; \ + ARG3 = "$2"; \ + ARG4 = "$3"; \ + ARG5 = "$4"; \ +else \ + found_args = 0; \ + print "Aborting: could not find all arguments"; \ + exit + +set title ARG1 + +set terminal png size 1280,1024 +set output ARG4 . '.png' +#set terminal x11 +#3D Config +set isosamples 30 +set hidden3d +set pm3d at s solid hidden3d 100 scansbackward +set pm3d depthorder + +#Preparing Axes +#set logscale x +set ytics axis out 0,1 +#set data style lines +set grid back +set key top left reverse +set ylabel "Disk" +set xlabel "Time (Seconds)" +set zlabel ARG5 +set cbrange [0:] +set zrange [0:] + +#Set Color style +#set palette rgbformulae 22,9,23 +set palette rgbformulae 7,5,15 +set style line 100 lt 7 lw 0.5 + +#Multiploting +set multiplot + +#Top Left View +set size 0.5,0.5 +set view 64,216 +set origin 0,0.5 +splot ARG2 using 2:1:3 with linespoints title ARG3 + +#Top Right View +set size 0.5,0.5 +set origin 0.5,0.5 +set view 90,0 +set pm3d at s solid hidden3d 100 scansbackward +set pm3d depthorder +splot ARG2 using 2:1:3 with linespoints title ARG3 + +#Bottom Right View +set size 0.5,0.5 +set origin 0.5,0 +set view 63,161 +set pm3d at s solid hidden3d 100 scansbackward +set pm3d depthorder +splot ARG2 using 2:1:3 with linespoints title ARG3 + +#Bottom Left View +set size 0.5,0.5 +set origin 0,0 +set pm3d map +splot ARG2 using 2:1:3 with linespoints title ARG3 + +#Unsetting multiplotting +unset multiplot +#pause -1 + +#Preparing 3D Interactive view +set mouse +set terminal png size 1024,768 +set output ARG4 . '-3D.png' + +#set term x11 +set view 64,216 +set origin 0,0 +set size 1,1 +set pm3d at bs solid hidden3d 100 scansbackward +set pm3d depthorder +splot ARG2 using 2:1:3 with linespoints title ARG3 + +#pause -1 +#The End diff --git a/tools/plot/math.gpm b/tools/plot/math.gpm new file mode 100644 index 0000000..0a2aff5 --- /dev/null +++ b/tools/plot/math.gpm @@ -0,0 +1,42 @@ +# This Gnuplot file has been generated by eNovance +if (exists("ARGC") && ARGC > 5) \ + found_args = 1; \ +else if (strlen("$$#") < 3 && "$#" > 5) \ + found_args = 1; \ + ARG1 = "$0"; \ + ARG2 = "$1"; \ + ARG3 = "$2"; \ + ARG4 = "$3"; \ + ARG5 = "$4"; \ + ARG6 = "$5"; \ +else \ + found_args = 0; \ + print "Aborting: could not find all arguments"; \ + exit + +avg_num = ARG6 + 0 +avg_str = sprintf("%g", avg_num) + +set title ARG1 + +set terminal png size 1280,1024 +set output ARG4 . '.png' + +set palette rgbformulae 7,5,15 +set style line 100 lt 7 lw 0.5 +set style fill transparent solid 0.9 noborder +set auto x +set ylabel ARG5 +set xlabel "Disk" +set yrange [0:] +set style data histogram +set style histogram cluster gap 1 +set style fill solid border -1 +set boxwidth 2 +#set xtic rotate by -10 scale 10 font ",8" +set bmargin 3 +set xtics axis out +set xtic rotate by 45 scale 0 font ",8" autojustify +set xtics offset 0,-1 border -5,1,5 +set style line 1 lt 1 lw 3 pt 3 linecolor rgb "green" +plot ARG2 using 2:xtic(1) ti col, avg_num w l ls 1 ti 'Global average value (' . avg_str . 
')' diff --git a/tools/plot/samples/Makefile b/tools/plot/samples/Makefile new file mode 100644 index 0000000..df0480f --- /dev/null +++ b/tools/plot/samples/Makefile @@ -0,0 +1,19 @@ +all: clean m2sw1-128k-sdb-randwrite-para.results_bw.log io bandwidth + +m2sw1-128k-sdb-randwrite-para.results_bw.log: + tar -xf fio-logs.tar.gz + +io: setup + ./fio2gnuplot.py -p 'm2sw1-128k-*-read-para*iops.log' -g + +bandwidth: setup + ./fio2gnuplot.py -p 'm2sw1-128k-*-read-para*bw.log' -g + +setup: + ln -sf ../*py ../*gpm . + +clean: + rm -rf *png mygraph mymath *py *gpm gnuplot_temp_file* *~ + rm -rf *.average *.stddev *.min *.max *.global + rm -rf m2sw1-128k-read-para-bw m2sw1-128k-read-para-iops + rm -rf *log diff --git a/tools/plot/samples/fio-logs.tar.gz b/tools/plot/samples/fio-logs.tar.gz new file mode 100644 index 0000000..2237f5e Binary files /dev/null and b/tools/plot/samples/fio-logs.tar.gz differ diff --git a/trim.c b/trim.c new file mode 100644 index 0000000..bf825db --- /dev/null +++ b/trim.c @@ -0,0 +1,83 @@ +/* + * TRIM/DISCARD support + */ +#include <string.h> +#include <assert.h> + +#include "fio.h" +#include "trim.h" + +#ifdef FIO_HAVE_TRIM +bool get_next_trim(struct thread_data *td, struct io_u *io_u) +{ + struct io_piece *ipo; + + /* + * this io_u is from a requeue, we already filled the offsets + */ + if (io_u->file) + return true; + if (flist_empty(&td->trim_list)) + return false; + + assert(td->trim_entries); + ipo = flist_first_entry(&td->trim_list, struct io_piece, trim_list); + remove_trim_entry(td, ipo); + + io_u->offset = ipo->offset; + io_u->buflen = ipo->len; + io_u->file = ipo->file; + + /* + * If not verifying that trimmed ranges return zeroed data, + * remove this from the to-read verify lists + */ + if (!td->o.trim_zero) { + if (ipo->flags & IP_F_ONLIST) + flist_del(&ipo->list); + else { + assert(ipo->flags & IP_F_ONRB); + rb_erase(&ipo->rb_node, &td->io_hist_tree); + } + td->io_hist_len--; + free(ipo); + } else + ipo->flags |= IP_F_TRIMMED; + + if (!fio_file_open(io_u->file)) { + int r = td_io_open_file(td, io_u->file); + + if (r) { + dprint(FD_VERIFY, "failed file %s open\n", + io_u->file->file_name); + return false; + } + } + + get_file(io_u->file); + assert(fio_file_open(io_u->file)); + io_u->ddir = DDIR_TRIM; + io_u->xfer_buf = NULL; + io_u->xfer_buflen = io_u->buflen; + + dprint(FD_VERIFY, "get_next_trim: ret io_u %p\n", io_u); + return true; +} + +bool io_u_should_trim(struct thread_data *td, struct io_u *io_u) +{ + unsigned long long val; + uint64_t frand_max; + unsigned long r; + + if (!td->o.trim_percentage) + return false; + + frand_max = rand_max(&td->trim_state); + r = __rand(&td->trim_state); + val = (frand_max / 100ULL); + + val *= (unsigned long long) td->o.trim_percentage; + return r <= val; +} +#endif diff --git a/trim.h b/trim.h new file mode 100644 index 0000000..fe8f9fe --- /dev/null +++ b/trim.h @@ -0,0 +1,40 @@ +#ifndef FIO_TRIM_H +#define FIO_TRIM_H + +#ifdef FIO_HAVE_TRIM +#include "flist.h" +#include "iolog.h" +#include "compiler/compiler.h" +#include "lib/types.h" +#include "os/os.h" + +extern bool __must_check get_next_trim(struct thread_data *td, struct io_u *io_u); +extern bool io_u_should_trim(struct thread_data *td, struct io_u *io_u); + +/* + * Determine whether a given io_u should be logged for verify or + * for discard + */ +static inline void remove_trim_entry(struct thread_data *td, struct io_piece *ipo) +{ + if (!flist_empty(&ipo->trim_list)) { + flist_del_init(&ipo->trim_list); + td->trim_entries--; + } +} + +#else +static inline 
bool get_next_trim(struct thread_data *td, struct io_u *io_u) +{ + return false; +} +static inline bool io_u_should_trim(struct thread_data *td, struct io_u *io_u) +{ + return false; +} +static inline void remove_trim_entry(struct thread_data *td, struct io_piece *ipo) +{ +} +#endif + +#endif diff --git a/unittests/lib/memalign.c b/unittests/lib/memalign.c new file mode 100644 index 0000000..42a2e31 --- /dev/null +++ b/unittests/lib/memalign.c @@ -0,0 +1,28 @@ +#include <stdlib.h> +#include "../unittest.h" + +#include "../../lib/memalign.h" + +static void test_memalign_1(void) +{ + size_t align = 4096; + void *p = __fio_memalign(align, 1234, malloc); + + if (p) + CU_ASSERT_EQUAL(((int)(uintptr_t)p) & (align - 1), 0); +} + +static struct fio_unittest_entry tests[] = { + { + .name = "memalign/1", + .fn = test_memalign_1, + }, + { + .name = NULL, + }, +}; + +CU_ErrorCode fio_unittest_lib_memalign(void) +{ + return fio_unittest_add_suite("lib/memalign.c", NULL, NULL, tests); +} diff --git a/unittests/lib/strntol.c b/unittests/lib/strntol.c new file mode 100644 index 0000000..14adde2 --- /dev/null +++ b/unittests/lib/strntol.c @@ -0,0 +1,59 @@ +#include "../unittest.h" + +#include "../../lib/strntol.h" + +static void test_strntol_1(void) +{ + char s[] = "12345"; + char *endp = NULL; + long ret = strntol(s, strlen(s), &endp, 10); + + CU_ASSERT_EQUAL(ret, 12345); + CU_ASSERT_NOT_EQUAL(endp, NULL); + CU_ASSERT_EQUAL(*endp, '\0'); +} + +static void test_strntol_2(void) +{ + char s[] = " 12345"; + char *endp = NULL; + long ret = strntol(s, strlen(s), &endp, 10); + + CU_ASSERT_EQUAL(ret, 12345); + CU_ASSERT_NOT_EQUAL(endp, NULL); + CU_ASSERT_EQUAL(*endp, '\0'); +} + +static void test_strntol_3(void) +{ + char s[] = "0x12345"; + char *endp = NULL; + long ret = strntol(s, strlen(s), &endp, 16); + + CU_ASSERT_EQUAL(ret, 0x12345); + CU_ASSERT_NOT_EQUAL(endp, NULL); + CU_ASSERT_EQUAL(*endp, '\0'); +} + +static struct fio_unittest_entry tests[] = { + { + .name = "strntol/1", + .fn = test_strntol_1, + }, + { + .name = "strntol/2", + .fn = test_strntol_2, + }, + { + .name = "strntol/3", + .fn = test_strntol_3, + }, + { + .name = NULL, + }, +}; + +CU_ErrorCode fio_unittest_lib_strntol(void) +{ + return fio_unittest_add_suite("lib/strntol.c", NULL, NULL, tests); +} diff --git a/unittests/oslib/strcasestr.c b/unittests/oslib/strcasestr.c new file mode 100644 index 0000000..19a2de3 --- /dev/null +++ b/unittests/oslib/strcasestr.c @@ -0,0 +1,87 @@ +/* + * Copyright (C) 2019 Tomohiro Kusumi <tkusumi@netbsd.org> + */ +#include "../unittest.h" + +#ifndef CONFIG_STRCASESTR +#include "../../oslib/strcasestr.h" +#else +#include <string.h> +#endif + +static void test_strcasestr_1(void) +{ + const char *haystack = "0123456789"; + const char *p; + + p = strcasestr(haystack, "012"); + CU_ASSERT_EQUAL(p, haystack); + + p = strcasestr(haystack, "12345"); + CU_ASSERT_EQUAL(p, haystack + 1); + + p = strcasestr(haystack, "1234567890"); + CU_ASSERT_EQUAL(p, NULL); + + p = strcasestr(haystack, ""); + CU_ASSERT_EQUAL(p, haystack); /* is this expected ? */ +} + +static void test_strcasestr_2(void) +{ + const char *haystack = "ABCDEFG"; + const char *p; + + p = strcasestr(haystack, "ABC"); + CU_ASSERT_EQUAL(p, haystack); + + p = strcasestr(haystack, "BCD"); + CU_ASSERT_EQUAL(p, haystack + 1); + + p = strcasestr(haystack, "ABCDEFGH"); + CU_ASSERT_EQUAL(p, NULL); + + p = strcasestr(haystack, ""); + CU_ASSERT_EQUAL(p, haystack); /* is this expected ? 
*/ +} + +static void test_strcasestr_3(void) +{ + const char *haystack = "ABCDEFG"; + const char *p; + + p = strcasestr(haystack, "AbC"); + CU_ASSERT_EQUAL(p, haystack); + + p = strcasestr(haystack, "bCd"); + CU_ASSERT_EQUAL(p, haystack + 1); + + p = strcasestr(haystack, "AbcdEFGH"); + CU_ASSERT_EQUAL(p, NULL); + + p = strcasestr(haystack, ""); + CU_ASSERT_EQUAL(p, haystack); /* is this expected ? */ +} + +static struct fio_unittest_entry tests[] = { + { + .name = "strcasestr/1", + .fn = test_strcasestr_1, + }, + { + .name = "strcasestr/2", + .fn = test_strcasestr_2, + }, + { + .name = "strcasestr/3", + .fn = test_strcasestr_3, + }, + { + .name = NULL, + }, +}; + +CU_ErrorCode fio_unittest_oslib_strcasestr(void) +{ + return fio_unittest_add_suite("oslib/strcasestr.c", NULL, NULL, tests); +} diff --git a/unittests/oslib/strlcat.c b/unittests/oslib/strlcat.c new file mode 100644 index 0000000..8d35d41 --- /dev/null +++ b/unittests/oslib/strlcat.c @@ -0,0 +1,52 @@ +#include "../unittest.h" + +#ifndef CONFIG_STRLCAT +#include "../../oslib/strlcat.h" +#else +#include <string.h> +#endif + +static void test_strlcat_1(void) +{ + char dst[32]; + char src[] = "test"; + size_t ret; + + dst[0] = '\0'; + ret = strlcat(dst, src, sizeof(dst)); + + CU_ASSERT_EQUAL(strcmp(dst, "test"), 0); + CU_ASSERT_EQUAL(ret, 4); /* total length it tried to create */ +} + +static void test_strlcat_2(void) +{ + char dst[32]; + char src[] = "test"; + size_t ret; + + dst[0] = '\0'; + ret = strlcat(dst, src, strlen(dst)); + + CU_ASSERT_EQUAL(strcmp(dst, ""), 0); + CU_ASSERT_EQUAL(ret, 4); /* total length it tried to create */ +} + +static struct fio_unittest_entry tests[] = { + { + .name = "strlcat/1", + .fn = test_strlcat_1, + }, + { + .name = "strlcat/2", + .fn = test_strlcat_2, + }, + { + .name = NULL, + }, +}; + +CU_ErrorCode fio_unittest_oslib_strlcat(void) +{ + return fio_unittest_add_suite("oslib/strlcat.c", NULL, NULL, tests); +} diff --git a/unittests/oslib/strndup.c b/unittests/oslib/strndup.c new file mode 100644 index 0000000..2d1baf1 --- /dev/null +++ b/unittests/oslib/strndup.c @@ -0,0 +1,63 @@ +#include "../unittest.h" + +#ifndef CONFIG_HAVE_STRNDUP +#include "../../oslib/strndup.h" +#else +#include <string.h> +#endif + +static void test_strndup_1(void) +{ + char s[] = "test"; + char *p = strndup(s, 3); + + if (p) { + CU_ASSERT_EQUAL(strcmp(p, "tes"), 0); + CU_ASSERT_EQUAL(strlen(p), 3); + } +} + +static void test_strndup_2(void) +{ + char s[] = "test"; + char *p = strndup(s, 4); + + if (p) { + CU_ASSERT_EQUAL(strcmp(p, s), 0); + CU_ASSERT_EQUAL(strlen(p), 4); + } +} + +static void test_strndup_3(void) +{ + char s[] = "test"; + char *p = strndup(s, 5); + + if (p) { + CU_ASSERT_EQUAL(strcmp(p, s), 0); + CU_ASSERT_EQUAL(strlen(p), 4); + } +} + +static struct fio_unittest_entry tests[] = { + { + .name = "strndup/1", + .fn = test_strndup_1, + }, + { + .name = "strndup/2", + .fn = test_strndup_2, + }, + { + .name = "strndup/3", + .fn = test_strndup_3, + }, + { + .name = NULL, + }, +}; + +CU_ErrorCode fio_unittest_oslib_strndup(void) +{ + return fio_unittest_add_suite("oslib/strndup.c", NULL, NULL, tests); +} diff --git a/unittests/oslib/strsep.c b/unittests/oslib/strsep.c new file mode 100644 index 0000000..7f645f4 --- /dev/null +++ b/unittests/oslib/strsep.c @@ -0,0 +1,107 @@ +/* + * Copyright (C) 2019 Tomohiro Kusumi <tkusumi@netbsd.org> + */ +#include "../unittest.h" + +#ifndef CONFIG_STRSEP +#include "../../oslib/strsep.h" +#else +#include <string.h> +#endif + +/* + * strsep(3) - "If *stringp is NULL, the 
strsep() function returns NULL and does + * nothing else." + */ +static void test_strsep_1(void) +{ + char *string = NULL; + const char *p; + + p = strsep(&string, ""); + CU_ASSERT_EQUAL(p, NULL); + CU_ASSERT_EQUAL(string, NULL); + + p = strsep(&string, "ABC"); + CU_ASSERT_EQUAL(p, NULL); + CU_ASSERT_EQUAL(string, NULL); +} + +/* + * strsep(3) - "In case no delimiter was found, the token is taken to be the + * entire string *stringp, and *stringp is made NULL." + */ +static void test_strsep_2(void) +{ + char src[] = "ABCDEFG"; + char *string = src; + const char *p; + + p = strsep(&string, ""); + CU_ASSERT_EQUAL(p, src); + CU_ASSERT_EQUAL(*p, 'A'); + CU_ASSERT_EQUAL(string, NULL); + + string = src; + p = strsep(&string, "@"); + CU_ASSERT_EQUAL(p, src); + CU_ASSERT_EQUAL(*p, 'A'); + CU_ASSERT_EQUAL(string, NULL); +} + +/* + * strsep(3) - "This token is terminated with a '\0' character (by overwriting + * the delimiter) and *stringp is updated to point past the token." + */ +static void test_strsep_3(void) +{ + char src[] = "ABCDEFG"; + char *string = src; + const char *p; + + p = strsep(&string, "ABC"); + CU_ASSERT_EQUAL(p, &src[0]); + CU_ASSERT_EQUAL(*p, '\0'); + CU_ASSERT_EQUAL(strcmp(string, "BCDEFG"), 0); + CU_ASSERT_EQUAL(*string, 'B'); + + p = strsep(&string, "ABC"); + CU_ASSERT_EQUAL(p, &src[1]); + CU_ASSERT_EQUAL(*p, '\0'); + CU_ASSERT_EQUAL(strcmp(string, "CDEFG"), 0); + CU_ASSERT_EQUAL(*string, 'C'); + + p = strsep(&string, "ABC"); + CU_ASSERT_EQUAL(p, &src[2]); + CU_ASSERT_EQUAL(*p, '\0'); + CU_ASSERT_EQUAL(strcmp(string, "DEFG"), 0); + CU_ASSERT_EQUAL(*string, 'D'); + + p = strsep(&string, "ABC"); + CU_ASSERT_EQUAL(p, &src[3]); + CU_ASSERT_EQUAL(*p, 'D'); + CU_ASSERT_EQUAL(string, NULL); +} + +static struct fio_unittest_entry tests[] = { + { + .name = "strsep/1", + .fn = test_strsep_1, + }, + { + .name = "strsep/2", + .fn = test_strsep_2, + }, + { + .name = "strsep/3", + .fn = test_strsep_3, + }, + { + .name = NULL, + }, +}; + +CU_ErrorCode fio_unittest_oslib_strsep(void) +{ + return fio_unittest_add_suite("oslib/strsep.c", NULL, NULL, tests); +} diff --git a/unittests/unittest.c b/unittests/unittest.c new file mode 100644 index 0000000..c37e197 --- /dev/null +++ b/unittests/unittest.c @@ -0,0 +1,62 @@ +/* + * fio unittest + * Copyright (C) 2018 Tomohiro Kusumi <kusumi.tomohiro@osnexus.com> + */ + +#include <stdio.h> +#include <stdlib.h> + +#include "./unittest.h" + +CU_ErrorCode fio_unittest_add_suite(const char *name, CU_InitializeFunc initfn, + CU_CleanupFunc cleanfn, struct fio_unittest_entry *tvec) +{ + CU_pSuite pSuite; + struct fio_unittest_entry *t; + + pSuite = CU_add_suite(name, initfn, cleanfn); + if (!pSuite) { + CU_cleanup_registry(); + return CU_get_error(); + } + + t = tvec; + while (t && t->name) { + if (!CU_add_test(pSuite, t->name, t->fn)) { + CU_cleanup_registry(); + return CU_get_error(); + } + t++; + } + + return CUE_SUCCESS; +} + +static void fio_unittest_register(CU_ErrorCode (*fn)(void)) +{ + if (fn && fn() != CUE_SUCCESS) { + fprintf(stderr, "%s\n", CU_get_error_msg()); + exit(1); + } +} + +int main(void) +{ + if (CU_initialize_registry() != CUE_SUCCESS) { + fprintf(stderr, "%s\n", CU_get_error_msg()); + exit(1); + } + + fio_unittest_register(fio_unittest_lib_memalign); + fio_unittest_register(fio_unittest_lib_strntol); + fio_unittest_register(fio_unittest_oslib_strlcat); + fio_unittest_register(fio_unittest_oslib_strndup); + fio_unittest_register(fio_unittest_oslib_strcasestr); + fio_unittest_register(fio_unittest_oslib_strsep); + + 
CU_basic_set_mode(CU_BRM_VERBOSE); + CU_basic_run_tests(); + CU_cleanup_registry(); + + return CU_get_error(); +} diff --git a/unittests/unittest.h b/unittests/unittest.h new file mode 100644 index 0000000..786c1c9 --- /dev/null +++ b/unittests/unittest.h @@ -0,0 +1,24 @@ +#ifndef FIO_UNITTEST_H +#define FIO_UNITTEST_H + +#include <sys/types.h> + +#include <CUnit/CUnit.h> +#include <CUnit/Basic.h> + +struct fio_unittest_entry { + const char *name; + CU_TestFunc fn; +}; + +CU_ErrorCode fio_unittest_add_suite(const char*, CU_InitializeFunc, + CU_CleanupFunc, struct fio_unittest_entry*); + +CU_ErrorCode fio_unittest_lib_memalign(void); +CU_ErrorCode fio_unittest_lib_strntol(void); +CU_ErrorCode fio_unittest_oslib_strlcat(void); +CU_ErrorCode fio_unittest_oslib_strndup(void); +CU_ErrorCode fio_unittest_oslib_strcasestr(void); +CU_ErrorCode fio_unittest_oslib_strsep(void); + +#endif diff --git a/verify-state.h b/verify-state.h new file mode 100644 index 0000000..6da1585 --- /dev/null +++ b/verify-state.h @@ -0,0 +1,109 @@ +#ifndef FIO_VERIFY_STATE_H +#define FIO_VERIFY_STATE_H + +#include <stdint.h> +#include <string.h> +#include <limits.h> +#include "lib/nowarn_snprintf.h" + +struct thread_rand32_state { + uint32_t s[4]; +}; + +struct thread_rand64_state { + uint64_t s[6]; +}; + +struct thread_rand_state { + uint64_t use64; + union { + struct thread_rand32_state state32; + struct thread_rand64_state state64; + }; +}; + +/* + * For dumping current write state + */ +struct file_comp { + uint64_t fileno; + uint64_t offset; +}; + +struct thread_io_list { + uint64_t no_comps; + uint32_t depth; + uint32_t nofiles; + uint64_t numberio; + uint64_t index; + struct thread_rand_state rand; + uint8_t name[64]; + struct file_comp comps[0]; +}; + +struct all_io_list { + uint64_t threads; + struct thread_io_list state[0]; +}; + +#define VSTATE_HDR_VERSION 0x03 + +struct verify_state_hdr { + uint64_t version; + uint64_t size; + uint64_t crc; +}; + +#define IO_LIST_ALL 0xffffffff + +struct io_u; +extern struct all_io_list *get_all_io_list(int, size_t *); +extern void __verify_save_state(struct all_io_list *, const char *); +extern void verify_save_state(int mask); +extern int verify_load_state(struct thread_data *, const char *); +extern void verify_free_state(struct thread_data *); +extern int verify_state_should_stop(struct thread_data *, struct io_u *); +extern void verify_assign_state(struct thread_data *, void *); +extern int verify_state_hdr(struct verify_state_hdr *, struct thread_io_list *); + +static inline size_t __thread_io_list_sz(uint32_t depth, uint32_t nofiles) +{ + return sizeof(struct thread_io_list) + depth * nofiles * sizeof(struct file_comp); +} + +static inline size_t thread_io_list_sz(struct thread_io_list *s) +{ + return __thread_io_list_sz(le32_to_cpu(s->depth), le32_to_cpu(s->nofiles)); +} + +static inline struct thread_io_list *io_list_next(struct thread_io_list *s) +{ + return (struct thread_io_list *)((char *) s + thread_io_list_sz(s)); +} + +static inline void verify_state_gen_name(char *out, size_t size, + const char *name, const char *prefix, + int num) +{ + char ename[PATH_MAX]; + char *ptr; + + /* + * Escape '/', just turn them into '.' 
+ */ + ptr = ename; + do { + *ptr = *name; + if (*ptr == '\0') + break; + else if (*ptr == '/') + *ptr = '.'; + ptr++; + name++; + } while (1); + + nowarn_snprintf(out, size, "%s-%s-%d-verify.state", prefix, ename, num); + out[size - 1] = '\0'; +} + +#endif diff --git a/verify.c b/verify.c new file mode 100644 index 0000000..cf299eb --- /dev/null +++ b/verify.c @@ -0,0 +1,1884 @@ +/* + * IO verification helpers + */ +#include <unistd.h> +#include <fcntl.h> +#include <string.h> +#include <assert.h> +#include <pthread.h> +#include <libgen.h> + +#include "fio.h" +#include "verify.h" +#include "trim.h" +#include "lib/rand.h" +#include "lib/hweight.h" +#include "lib/pattern.h" +#include "oslib/asprintf.h" + +#include "crc/md5.h" +#include "crc/crc64.h" +#include "crc/crc32.h" +#include "crc/crc32c.h" +#include "crc/crc16.h" +#include "crc/crc7.h" +#include "crc/sha256.h" +#include "crc/sha512.h" +#include "crc/sha1.h" +#include "crc/xxhash.h" +#include "crc/sha3.h" + +static void populate_hdr(struct thread_data *td, struct io_u *io_u, + struct verify_header *hdr, unsigned int header_num, + unsigned int header_len); +static void __fill_hdr(struct thread_data *td, struct io_u *io_u, + struct verify_header *hdr, unsigned int header_num, + unsigned int header_len, uint64_t rand_seed); + +void fill_buffer_pattern(struct thread_data *td, void *p, unsigned int len) +{ + (void)cpy_pattern(td->o.buffer_pattern, td->o.buffer_pattern_bytes, p, len); +} + +static void __fill_buffer(struct thread_options *o, uint64_t seed, void *p, + unsigned int len) +{ + __fill_random_buf_percentage(seed, p, o->compress_percentage, len, len, o->buffer_pattern, o->buffer_pattern_bytes); +} + +static uint64_t fill_buffer(struct thread_data *td, void *p, + unsigned int len) +{ + struct frand_state *fs = &td->verify_state; + struct thread_options *o = &td->o; + + return fill_random_buf_percentage(fs, p, o->compress_percentage, len, len, o->buffer_pattern, o->buffer_pattern_bytes); +} + +void fill_verify_pattern(struct thread_data *td, void *p, unsigned int len, + struct io_u *io_u, uint64_t seed, int use_seed) +{ + struct thread_options *o = &td->o; + + if (!o->verify_pattern_bytes) { + dprint(FD_VERIFY, "fill random bytes len=%u\n", len); + + if (use_seed) + __fill_buffer(o, seed, p, len); + else + io_u->rand_seed = fill_buffer(td, p, len); + return; + } + + /* Skip if we were here and we do not need to patch pattern + * with format */ + if (!td->o.verify_fmt_sz && io_u->buf_filled_len >= len) { + dprint(FD_VERIFY, "using already filled verify pattern b=%d len=%u\n", + o->verify_pattern_bytes, len); + return; + } + + (void)paste_format(td->o.verify_pattern, td->o.verify_pattern_bytes, + td->o.verify_fmt, td->o.verify_fmt_sz, + p, len, io_u); + io_u->buf_filled_len = len; +} + +static unsigned int get_hdr_inc(struct thread_data *td, struct io_u *io_u) +{ + unsigned int hdr_inc; + + /* + * If we use bs_unaligned, buflen can be larger than the verify + * interval (which just defaults to the smallest blocksize possible). 
+ */ + hdr_inc = io_u->buflen; + if (td->o.verify_interval && td->o.verify_interval <= io_u->buflen && + !td->o.bs_unaligned) + hdr_inc = td->o.verify_interval; + + return hdr_inc; +} + +static void fill_pattern_headers(struct thread_data *td, struct io_u *io_u, + uint64_t seed, int use_seed) +{ + unsigned int hdr_inc, header_num; + struct verify_header *hdr; + void *p = io_u->buf; + + fill_verify_pattern(td, p, io_u->buflen, io_u, seed, use_seed); + + hdr_inc = get_hdr_inc(td, io_u); + header_num = 0; + for (; p < io_u->buf + io_u->buflen; p += hdr_inc) { + hdr = p; + populate_hdr(td, io_u, hdr, header_num, hdr_inc); + header_num++; + } +} + +static void memswp(void *buf1, void *buf2, unsigned int len) +{ + char swap[200]; + + assert(len <= sizeof(swap)); + + memcpy(&swap, buf1, len); + memcpy(buf1, buf2, len); + memcpy(buf2, &swap, len); +} + +static void hexdump(void *buffer, int len) +{ + unsigned char *p = buffer; + int i; + + for (i = 0; i < len; i++) + log_err("%02x", p[i]); + log_err("\n"); +} + +/* + * Prepare for separation of verify_header and checksum header + */ +static inline unsigned int __hdr_size(int verify_type) +{ + unsigned int len = 0; + + switch (verify_type) { + case VERIFY_NONE: + case VERIFY_HDR_ONLY: + case VERIFY_NULL: + case VERIFY_PATTERN: + len = 0; + break; + case VERIFY_MD5: + len = sizeof(struct vhdr_md5); + break; + case VERIFY_CRC64: + len = sizeof(struct vhdr_crc64); + break; + case VERIFY_CRC32C: + case VERIFY_CRC32: + case VERIFY_CRC32C_INTEL: + len = sizeof(struct vhdr_crc32); + break; + case VERIFY_CRC16: + len = sizeof(struct vhdr_crc16); + break; + case VERIFY_CRC7: + len = sizeof(struct vhdr_crc7); + break; + case VERIFY_SHA256: + len = sizeof(struct vhdr_sha256); + break; + case VERIFY_SHA512: + len = sizeof(struct vhdr_sha512); + break; + case VERIFY_SHA3_224: + len = sizeof(struct vhdr_sha3_224); + break; + case VERIFY_SHA3_256: + len = sizeof(struct vhdr_sha3_256); + break; + case VERIFY_SHA3_384: + len = sizeof(struct vhdr_sha3_384); + break; + case VERIFY_SHA3_512: + len = sizeof(struct vhdr_sha3_512); + break; + case VERIFY_XXHASH: + len = sizeof(struct vhdr_xxhash); + break; + case VERIFY_SHA1: + len = sizeof(struct vhdr_sha1); + break; + case VERIFY_PATTERN_NO_HDR: + return 0; + default: + log_err("fio: unknown verify header!\n"); + assert(0); + } + + return len + sizeof(struct verify_header); +} + +static inline unsigned int hdr_size(struct thread_data *td, + struct verify_header *hdr) +{ + if (td->o.verify == VERIFY_PATTERN_NO_HDR) + return 0; + + return __hdr_size(hdr->verify_type); +} + +static void *hdr_priv(struct verify_header *hdr) +{ + void *priv = hdr; + + return priv + sizeof(struct verify_header); +} + +/* + * Verify container, pass info to verify handlers and allow them to + * pass info back in case of error + */ +struct vcont { + /* + * Input + */ + struct io_u *io_u; + unsigned int hdr_num; + struct thread_data *td; + + /* + * Output, only valid in case of error + */ + const char *name; + void *good_crc; + void *bad_crc; + unsigned int crc_len; +}; + +#define DUMP_BUF_SZ 255 + +static void dump_buf(char *buf, unsigned int len, unsigned long long offset, + const char *type, struct fio_file *f) +{ + char *ptr, *fname; + char sep[2] = { FIO_OS_PATH_SEPARATOR, 0 }; + int ret, fd; + + ptr = strdup(f->file_name); + + if (asprintf(&fname, "%s%s%s.%llu.%s", aux_path ? : "", + aux_path ? 
sep : "", basename(ptr), offset, type) < 0) { + if (!fio_did_warn(FIO_WARN_VERIFY_BUF)) + log_err("fio: not enough memory for dump buffer filename\n"); + goto free_ptr; + } + + fd = open(fname, O_CREAT | O_TRUNC | O_WRONLY, 0644); + if (fd < 0) { + perror("open verify buf file"); + goto free_fname; + } + + while (len) { + ret = write(fd, buf, len); + if (!ret) + break; + else if (ret < 0) { + perror("write verify buf file"); + break; + } + len -= ret; + buf += ret; + } + + close(fd); + log_err(" %s data dumped as %s\n", type, fname); + +free_fname: + free(fname); + +free_ptr: + free(ptr); +} + +/* + * Dump the contents of the read block and re-generate the correct data + * and dump that too. + */ +static void __dump_verify_buffers(struct verify_header *hdr, struct vcont *vc) +{ + struct thread_data *td = vc->td; + struct io_u *io_u = vc->io_u; + unsigned long hdr_offset; + struct io_u dummy; + void *buf; + + if (!td->o.verify_dump) + return; + + /* + * Dump the contents we just read off disk + */ + hdr_offset = vc->hdr_num * hdr->len; + + dump_buf(io_u->buf + hdr_offset, hdr->len, io_u->offset + hdr_offset, + "received", vc->io_u->file); + + /* + * Allocate a new buf and re-generate the original data + */ + buf = malloc(io_u->buflen); + dummy = *io_u; + dummy.buf = buf; + dummy.rand_seed = hdr->rand_seed; + dummy.buf_filled_len = 0; + dummy.buflen = io_u->buflen; + + fill_pattern_headers(td, &dummy, hdr->rand_seed, 1); + + dump_buf(buf + hdr_offset, hdr->len, io_u->offset + hdr_offset, + "expected", vc->io_u->file); + free(buf); +} + +static void dump_verify_buffers(struct verify_header *hdr, struct vcont *vc) +{ + struct thread_data *td = vc->td; + struct verify_header shdr; + + if (td->o.verify == VERIFY_PATTERN_NO_HDR) { + __fill_hdr(td, vc->io_u, &shdr, 0, vc->io_u->buflen, 0); + hdr = &shdr; + } + + __dump_verify_buffers(hdr, vc); +} + +static void log_verify_failure(struct verify_header *hdr, struct vcont *vc) +{ + unsigned long long offset; + + offset = vc->io_u->offset; + offset += vc->hdr_num * hdr->len; + log_err("%.8s: verify failed at file %s offset %llu, length %u" + " (requested block: offset=%llu, length=%llu)\n", + vc->name, vc->io_u->file->file_name, offset, hdr->len, + vc->io_u->offset, vc->io_u->buflen); + + if (vc->good_crc && vc->bad_crc) { + log_err(" Expected CRC: "); + hexdump(vc->good_crc, vc->crc_len); + log_err(" Received CRC: "); + hexdump(vc->bad_crc, vc->crc_len); + } + + dump_verify_buffers(hdr, vc); +} + +/* + * Return data area 'header_num' + */ +static inline void *io_u_verify_off(struct verify_header *hdr, struct vcont *vc) +{ + return vc->io_u->buf + vc->hdr_num * hdr->len + hdr_size(vc->td, hdr); +} + +static int verify_io_u_pattern(struct verify_header *hdr, struct vcont *vc) +{ + struct thread_data *td = vc->td; + struct io_u *io_u = vc->io_u; + char *buf, *pattern; + unsigned int header_size = __hdr_size(td->o.verify); + unsigned int len, mod, i, pattern_size; + int rc; + + pattern = td->o.verify_pattern; + pattern_size = td->o.verify_pattern_bytes; + assert(pattern_size != 0); + + (void)paste_format_inplace(pattern, pattern_size, + td->o.verify_fmt, td->o.verify_fmt_sz, io_u); + + buf = (char *) hdr + header_size; + len = get_hdr_inc(td, io_u) - header_size; + mod = (get_hdr_inc(td, io_u) * vc->hdr_num + header_size) % pattern_size; + + rc = cmp_pattern(pattern, pattern_size, mod, buf, len); + if (!rc) + return 0; + + /* Slow path, compare each byte */ + for (i = 0; i < len; i++) { + if (buf[i] != pattern[mod]) { + unsigned int bits; + + bits = 
hweight8(buf[i] ^ pattern[mod]); + log_err("fio: got pattern '%02x', wanted '%02x'. Bad bits %d\n", + (unsigned char)buf[i], + (unsigned char)pattern[mod], + bits); + log_err("fio: bad pattern block offset %u\n", i); + vc->name = "pattern"; + log_verify_failure(hdr, vc); + return EILSEQ; + } + mod++; + if (mod == td->o.verify_pattern_bytes) + mod = 0; + } + + /* Unreachable line */ + assert(0); + return EILSEQ; +} + +static int verify_io_u_xxhash(struct verify_header *hdr, struct vcont *vc) +{ + void *p = io_u_verify_off(hdr, vc); + struct vhdr_xxhash *vh = hdr_priv(hdr); + uint32_t hash; + void *state; + + dprint(FD_VERIFY, "xxhash verify io_u %p, len %u\n", vc->io_u, hdr->len); + + state = XXH32_init(1); + XXH32_update(state, p, hdr->len - hdr_size(vc->td, hdr)); + hash = XXH32_digest(state); + + if (vh->hash == hash) + return 0; + + vc->name = "xxhash"; + vc->good_crc = &vh->hash; + vc->bad_crc = &hash; + vc->crc_len = sizeof(hash); + log_verify_failure(hdr, vc); + return EILSEQ; +} + +static int verify_io_u_sha3(struct verify_header *hdr, struct vcont *vc, + struct fio_sha3_ctx *sha3_ctx, uint8_t *sha, + unsigned int sha_size, const char *name) +{ + void *p = io_u_verify_off(hdr, vc); + + dprint(FD_VERIFY, "%s verify io_u %p, len %u\n", name, vc->io_u, hdr->len); + + fio_sha3_update(sha3_ctx, p, hdr->len - hdr_size(vc->td, hdr)); + fio_sha3_final(sha3_ctx); + + if (!memcmp(sha, sha3_ctx->sha, sha_size)) + return 0; + + vc->name = name; + vc->good_crc = sha; + vc->bad_crc = sha3_ctx->sha; + vc->crc_len = sha_size; + log_verify_failure(hdr, vc); + return EILSEQ; +} + +static int verify_io_u_sha3_224(struct verify_header *hdr, struct vcont *vc) +{ + struct vhdr_sha3_224 *vh = hdr_priv(hdr); + uint8_t sha[SHA3_224_DIGEST_SIZE]; + struct fio_sha3_ctx sha3_ctx = { + .sha = sha, + }; + + fio_sha3_224_init(&sha3_ctx); + + return verify_io_u_sha3(hdr, vc, &sha3_ctx, vh->sha, + SHA3_224_DIGEST_SIZE, "sha3-224"); +} + +static int verify_io_u_sha3_256(struct verify_header *hdr, struct vcont *vc) +{ + struct vhdr_sha3_256 *vh = hdr_priv(hdr); + uint8_t sha[SHA3_256_DIGEST_SIZE]; + struct fio_sha3_ctx sha3_ctx = { + .sha = sha, + }; + + fio_sha3_256_init(&sha3_ctx); + + return verify_io_u_sha3(hdr, vc, &sha3_ctx, vh->sha, + SHA3_256_DIGEST_SIZE, "sha3-256"); +} + +static int verify_io_u_sha3_384(struct verify_header *hdr, struct vcont *vc) +{ + struct vhdr_sha3_384 *vh = hdr_priv(hdr); + uint8_t sha[SHA3_384_DIGEST_SIZE]; + struct fio_sha3_ctx sha3_ctx = { + .sha = sha, + }; + + fio_sha3_384_init(&sha3_ctx); + + return verify_io_u_sha3(hdr, vc, &sha3_ctx, vh->sha, + SHA3_384_DIGEST_SIZE, "sha3-384"); +} + +static int verify_io_u_sha3_512(struct verify_header *hdr, struct vcont *vc) +{ + struct vhdr_sha3_512 *vh = hdr_priv(hdr); + uint8_t sha[SHA3_512_DIGEST_SIZE]; + struct fio_sha3_ctx sha3_ctx = { + .sha = sha, + }; + + fio_sha3_512_init(&sha3_ctx); + + return verify_io_u_sha3(hdr, vc, &sha3_ctx, vh->sha, + SHA3_512_DIGEST_SIZE, "sha3-512"); +} + +static int verify_io_u_sha512(struct verify_header *hdr, struct vcont *vc) +{ + void *p = io_u_verify_off(hdr, vc); + struct vhdr_sha512 *vh = hdr_priv(hdr); + uint8_t sha512[128]; + struct fio_sha512_ctx sha512_ctx = { + .buf = sha512, + }; + + dprint(FD_VERIFY, "sha512 verify io_u %p, len %u\n", vc->io_u, hdr->len); + + fio_sha512_init(&sha512_ctx); + fio_sha512_update(&sha512_ctx, p, hdr->len - hdr_size(vc->td, hdr)); + + if (!memcmp(vh->sha512, sha512_ctx.buf, sizeof(sha512))) + return 0; + + vc->name = "sha512"; + vc->good_crc = vh->sha512; + 
vc->bad_crc = sha512_ctx.buf; + vc->crc_len = sizeof(vh->sha512); + log_verify_failure(hdr, vc); + return EILSEQ; +} + +static int verify_io_u_sha256(struct verify_header *hdr, struct vcont *vc) +{ + void *p = io_u_verify_off(hdr, vc); + struct vhdr_sha256 *vh = hdr_priv(hdr); + uint8_t sha256[64]; + struct fio_sha256_ctx sha256_ctx = { + .buf = sha256, + }; + + dprint(FD_VERIFY, "sha256 verify io_u %p, len %u\n", vc->io_u, hdr->len); + + fio_sha256_init(&sha256_ctx); + fio_sha256_update(&sha256_ctx, p, hdr->len - hdr_size(vc->td, hdr)); + fio_sha256_final(&sha256_ctx); + + if (!memcmp(vh->sha256, sha256_ctx.buf, sizeof(sha256))) + return 0; + + vc->name = "sha256"; + vc->good_crc = vh->sha256; + vc->bad_crc = sha256_ctx.buf; + vc->crc_len = sizeof(vh->sha256); + log_verify_failure(hdr, vc); + return EILSEQ; +} + +static int verify_io_u_sha1(struct verify_header *hdr, struct vcont *vc) +{ + void *p = io_u_verify_off(hdr, vc); + struct vhdr_sha1 *vh = hdr_priv(hdr); + uint32_t sha1[5]; + struct fio_sha1_ctx sha1_ctx = { + .H = sha1, + }; + + dprint(FD_VERIFY, "sha1 verify io_u %p, len %u\n", vc->io_u, hdr->len); + + fio_sha1_init(&sha1_ctx); + fio_sha1_update(&sha1_ctx, p, hdr->len - hdr_size(vc->td, hdr)); + fio_sha1_final(&sha1_ctx); + + if (!memcmp(vh->sha1, sha1_ctx.H, sizeof(sha1))) + return 0; + + vc->name = "sha1"; + vc->good_crc = vh->sha1; + vc->bad_crc = sha1_ctx.H; + vc->crc_len = sizeof(vh->sha1); + log_verify_failure(hdr, vc); + return EILSEQ; +} + +static int verify_io_u_crc7(struct verify_header *hdr, struct vcont *vc) +{ + void *p = io_u_verify_off(hdr, vc); + struct vhdr_crc7 *vh = hdr_priv(hdr); + unsigned char c; + + dprint(FD_VERIFY, "crc7 verify io_u %p, len %u\n", vc->io_u, hdr->len); + + c = fio_crc7(p, hdr->len - hdr_size(vc->td, hdr)); + + if (c == vh->crc7) + return 0; + + vc->name = "crc7"; + vc->good_crc = &vh->crc7; + vc->bad_crc = &c; + vc->crc_len = 1; + log_verify_failure(hdr, vc); + return EILSEQ; +} + +static int verify_io_u_crc16(struct verify_header *hdr, struct vcont *vc) +{ + void *p = io_u_verify_off(hdr, vc); + struct vhdr_crc16 *vh = hdr_priv(hdr); + unsigned short c; + + dprint(FD_VERIFY, "crc16 verify io_u %p, len %u\n", vc->io_u, hdr->len); + + c = fio_crc16(p, hdr->len - hdr_size(vc->td, hdr)); + + if (c == vh->crc16) + return 0; + + vc->name = "crc16"; + vc->good_crc = &vh->crc16; + vc->bad_crc = &c; + vc->crc_len = 2; + log_verify_failure(hdr, vc); + return EILSEQ; +} + +static int verify_io_u_crc64(struct verify_header *hdr, struct vcont *vc) +{ + void *p = io_u_verify_off(hdr, vc); + struct vhdr_crc64 *vh = hdr_priv(hdr); + unsigned long long c; + + dprint(FD_VERIFY, "crc64 verify io_u %p, len %u\n", vc->io_u, hdr->len); + + c = fio_crc64(p, hdr->len - hdr_size(vc->td, hdr)); + + if (c == vh->crc64) + return 0; + + vc->name = "crc64"; + vc->good_crc = &vh->crc64; + vc->bad_crc = &c; + vc->crc_len = 8; + log_verify_failure(hdr, vc); + return EILSEQ; +} + +static int verify_io_u_crc32(struct verify_header *hdr, struct vcont *vc) +{ + void *p = io_u_verify_off(hdr, vc); + struct vhdr_crc32 *vh = hdr_priv(hdr); + uint32_t c; + + dprint(FD_VERIFY, "crc32 verify io_u %p, len %u\n", vc->io_u, hdr->len); + + c = fio_crc32(p, hdr->len - hdr_size(vc->td, hdr)); + + if (c == vh->crc32) + return 0; + + vc->name = "crc32"; + vc->good_crc = &vh->crc32; + vc->bad_crc = &c; + vc->crc_len = 4; + log_verify_failure(hdr, vc); + return EILSEQ; +} + +static int verify_io_u_crc32c(struct verify_header *hdr, struct vcont *vc) +{ + void *p = io_u_verify_off(hdr, 
vc); + struct vhdr_crc32 *vh = hdr_priv(hdr); + uint32_t c; + + dprint(FD_VERIFY, "crc32c verify io_u %p, len %u\n", vc->io_u, hdr->len); + + c = fio_crc32c(p, hdr->len - hdr_size(vc->td, hdr)); + + if (c == vh->crc32) + return 0; + + vc->name = "crc32c"; + vc->good_crc = &vh->crc32; + vc->bad_crc = &c; + vc->crc_len = 4; + log_verify_failure(hdr, vc); + return EILSEQ; +} + +static int verify_io_u_md5(struct verify_header *hdr, struct vcont *vc) +{ + void *p = io_u_verify_off(hdr, vc); + struct vhdr_md5 *vh = hdr_priv(hdr); + uint32_t hash[MD5_HASH_WORDS]; + struct fio_md5_ctx md5_ctx = { + .hash = hash, + }; + + dprint(FD_VERIFY, "md5 verify io_u %p, len %u\n", vc->io_u, hdr->len); + + fio_md5_init(&md5_ctx); + fio_md5_update(&md5_ctx, p, hdr->len - hdr_size(vc->td, hdr)); + fio_md5_final(&md5_ctx); + + if (!memcmp(vh->md5_digest, md5_ctx.hash, sizeof(hash))) + return 0; + + vc->name = "md5"; + vc->good_crc = vh->md5_digest; + vc->bad_crc = md5_ctx.hash; + vc->crc_len = sizeof(hash); + log_verify_failure(hdr, vc); + return EILSEQ; +} + +/* + * Push IO verification to a separate thread + */ +int verify_io_u_async(struct thread_data *td, struct io_u **io_u_ptr) +{ + struct io_u *io_u = *io_u_ptr; + + pthread_mutex_lock(&td->io_u_lock); + + if (io_u->file) + put_file_log(td, io_u->file); + + if (io_u->flags & IO_U_F_IN_CUR_DEPTH) { + td->cur_depth--; + io_u_clear(td, io_u, IO_U_F_IN_CUR_DEPTH); + } + flist_add_tail(&io_u->verify_list, &td->verify_list); + *io_u_ptr = NULL; + + pthread_cond_signal(&td->verify_cond); + pthread_mutex_unlock(&td->io_u_lock); + return 0; +} + +/* + * Thanks Rusty, for spending the time so I don't have to. + * + * http://rusty.ozlabs.org/?p=560 + */ +static int mem_is_zero(const void *data, size_t length) +{ + const unsigned char *p = data; + size_t len; + + /* Check first 16 bytes manually */ + for (len = 0; len < 16; len++) { + if (!length) + return 1; + if (*p) + return 0; + p++; + length--; + } + + /* Now we know that's zero, memcmp with self. 
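+ * Comparing the buffer against itself at a 16-byte offset works
+ * because the first 16 bytes are known zero: equality forces every
+ * byte to equal the byte 16 positions before it, hence zero as well.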
*/ + return memcmp(data, p, length) == 0; +} + +static int mem_is_zero_slow(const void *data, size_t length, size_t *offset) +{ + const unsigned char *p = data; + + *offset = 0; + while (length) { + if (*p) + break; + (*offset)++; + length--; + p++; + } + + return !length; +} + +static int verify_trimmed_io_u(struct thread_data *td, struct io_u *io_u) +{ + size_t offset; + + if (!td->o.trim_zero) + return 0; + + if (mem_is_zero(io_u->buf, io_u->buflen)) + return 0; + + mem_is_zero_slow(io_u->buf, io_u->buflen, &offset); + + log_err("trim: verify failed at file %s offset %llu, length %llu" + ", block offset %lu\n", + io_u->file->file_name, io_u->offset, io_u->buflen, + (unsigned long) offset); + return EILSEQ; +} + +static int verify_header(struct io_u *io_u, struct thread_data *td, + struct verify_header *hdr, unsigned int hdr_num, + unsigned int hdr_len) +{ + void *p = hdr; + uint32_t crc; + + if (hdr->magic != FIO_HDR_MAGIC) { + log_err("verify: bad magic header %x, wanted %x", + hdr->magic, FIO_HDR_MAGIC); + goto err; + } + if (hdr->len != hdr_len) { + log_err("verify: bad header length %u, wanted %u", + hdr->len, hdr_len); + goto err; + } + if (hdr->rand_seed != io_u->rand_seed) { + log_err("verify: bad header rand_seed %"PRIu64 + ", wanted %"PRIu64, + hdr->rand_seed, io_u->rand_seed); + goto err; + } + if (hdr->offset != io_u->offset + hdr_num * td->o.verify_interval) { + log_err("verify: bad header offset %"PRIu64 + ", wanted %llu", + hdr->offset, io_u->offset); + goto err; + } + + /* + * For read-only workloads, the program cannot be certain of the + * last numberio written to a block. Checking of numberio will be + * done only for workloads that write data. For verify_only, + * numberio check is skipped. + */ + if (td_write(td) && (td_min_bs(td) == td_max_bs(td)) && + !td->o.time_based) + if (!td->o.verify_only) + if (hdr->numberio != io_u->numberio) { + log_err("verify: bad header numberio %"PRIu16 + ", wanted %"PRIu16, + hdr->numberio, io_u->numberio); + goto err; + } + + crc = fio_crc32c(p, offsetof(struct verify_header, crc32)); + if (crc != hdr->crc32) { + log_err("verify: bad header crc %x, calculated %x", + hdr->crc32, crc); + goto err; + } + return 0; + +err: + log_err(" at file %s offset %llu, length %u" + " (requested block: offset=%llu, length=%llu)\n", + io_u->file->file_name, + io_u->offset + hdr_num * hdr_len, hdr_len, + io_u->offset, io_u->buflen); + + if (td->o.verify_dump) + dump_buf(p, hdr_len, io_u->offset + hdr_num * hdr_len, + "hdr_fail", io_u->file); + + return EILSEQ; +} + +int verify_io_u(struct thread_data *td, struct io_u **io_u_ptr) +{ + struct verify_header *hdr; + struct io_u *io_u = *io_u_ptr; + unsigned int header_size, hdr_inc, hdr_num = 0; + void *p; + int ret; + + if (td->o.verify == VERIFY_NULL || io_u->ddir != DDIR_READ) + return 0; + /* + * If the IO engine is faking IO (like null), then just pretend + * we verified everything. 
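+ * A faking engine never moves real data into the buffer, so any
+ * checksum or pattern comparison against it would be meaningless.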
+ */ + if (td_ioengine_flagged(td, FIO_FAKEIO)) + return 0; + + if (io_u->flags & IO_U_F_TRIMMED) { + ret = verify_trimmed_io_u(td, io_u); + goto done; + } + + hdr_inc = get_hdr_inc(td, io_u); + + ret = 0; + for (p = io_u->buf; p < io_u->buf + io_u->buflen; + p += hdr_inc, hdr_num++) { + struct vcont vc = { + .io_u = io_u, + .hdr_num = hdr_num, + .td = td, + }; + unsigned int verify_type; + + if (ret && td->o.verify_fatal) + break; + + header_size = __hdr_size(td->o.verify); + if (td->o.verify_offset) + memswp(p, p + td->o.verify_offset, header_size); + hdr = p; + + /* + * Make rand_seed check pass when we have verify_backlog. + */ + if (!td_rw(td) || (td->flags & TD_F_VER_BACKLOG)) + io_u->rand_seed = hdr->rand_seed; + + if (td->o.verify != VERIFY_PATTERN_NO_HDR) { + ret = verify_header(io_u, td, hdr, hdr_num, hdr_inc); + if (ret) + return ret; + } + + if (td->o.verify != VERIFY_NONE) + verify_type = td->o.verify; + else + verify_type = hdr->verify_type; + + switch (verify_type) { + case VERIFY_HDR_ONLY: + /* Header is always verified, check if pattern is left + * for verification. */ + if (td->o.verify_pattern_bytes) + ret = verify_io_u_pattern(hdr, &vc); + break; + case VERIFY_MD5: + ret = verify_io_u_md5(hdr, &vc); + break; + case VERIFY_CRC64: + ret = verify_io_u_crc64(hdr, &vc); + break; + case VERIFY_CRC32C: + case VERIFY_CRC32C_INTEL: + ret = verify_io_u_crc32c(hdr, &vc); + break; + case VERIFY_CRC32: + ret = verify_io_u_crc32(hdr, &vc); + break; + case VERIFY_CRC16: + ret = verify_io_u_crc16(hdr, &vc); + break; + case VERIFY_CRC7: + ret = verify_io_u_crc7(hdr, &vc); + break; + case VERIFY_SHA256: + ret = verify_io_u_sha256(hdr, &vc); + break; + case VERIFY_SHA512: + ret = verify_io_u_sha512(hdr, &vc); + break; + case VERIFY_SHA3_224: + ret = verify_io_u_sha3_224(hdr, &vc); + break; + case VERIFY_SHA3_256: + ret = verify_io_u_sha3_256(hdr, &vc); + break; + case VERIFY_SHA3_384: + ret = verify_io_u_sha3_384(hdr, &vc); + break; + case VERIFY_SHA3_512: + ret = verify_io_u_sha3_512(hdr, &vc); + break; + case VERIFY_XXHASH: + ret = verify_io_u_xxhash(hdr, &vc); + break; + case VERIFY_SHA1: + ret = verify_io_u_sha1(hdr, &vc); + break; + case VERIFY_PATTERN: + case VERIFY_PATTERN_NO_HDR: + ret = verify_io_u_pattern(hdr, &vc); + break; + default: + log_err("Bad verify type %u\n", hdr->verify_type); + ret = EINVAL; + } + + if (ret && verify_type != hdr->verify_type) + log_err("fio: verify type mismatch (%u media, %u given)\n", + hdr->verify_type, verify_type); + } + +done: + if (ret && td->o.verify_fatal) + fio_mark_td_terminate(td); + + return ret; +} + +static void fill_xxhash(struct verify_header *hdr, void *p, unsigned int len) +{ + struct vhdr_xxhash *vh = hdr_priv(hdr); + void *state; + + state = XXH32_init(1); + XXH32_update(state, p, len); + vh->hash = XXH32_digest(state); +} + +static void fill_sha3(struct fio_sha3_ctx *sha3_ctx, void *p, unsigned int len) +{ + fio_sha3_update(sha3_ctx, p, len); + fio_sha3_final(sha3_ctx); +} + +static void fill_sha3_224(struct verify_header *hdr, void *p, unsigned int len) +{ + struct vhdr_sha3_224 *vh = hdr_priv(hdr); + struct fio_sha3_ctx sha3_ctx = { + .sha = vh->sha, + }; + + fio_sha3_224_init(&sha3_ctx); + fill_sha3(&sha3_ctx, p, len); +} + +static void fill_sha3_256(struct verify_header *hdr, void *p, unsigned int len) +{ + struct vhdr_sha3_256 *vh = hdr_priv(hdr); + struct fio_sha3_ctx sha3_ctx = { + .sha = vh->sha, + }; + + fio_sha3_256_init(&sha3_ctx); + fill_sha3(&sha3_ctx, p, len); +} + +static void fill_sha3_384(struct verify_header *hdr, 
void *p, unsigned int len) +{ + struct vhdr_sha3_384 *vh = hdr_priv(hdr); + struct fio_sha3_ctx sha3_ctx = { + .sha = vh->sha, + }; + + fio_sha3_384_init(&sha3_ctx); + fill_sha3(&sha3_ctx, p, len); +} + +static void fill_sha3_512(struct verify_header *hdr, void *p, unsigned int len) +{ + struct vhdr_sha3_512 *vh = hdr_priv(hdr); + struct fio_sha3_ctx sha3_ctx = { + .sha = vh->sha, + }; + + fio_sha3_512_init(&sha3_ctx); + fill_sha3(&sha3_ctx, p, len); +} + +static void fill_sha512(struct verify_header *hdr, void *p, unsigned int len) +{ + struct vhdr_sha512 *vh = hdr_priv(hdr); + struct fio_sha512_ctx sha512_ctx = { + .buf = vh->sha512, + }; + + fio_sha512_init(&sha512_ctx); + fio_sha512_update(&sha512_ctx, p, len); +} + +static void fill_sha256(struct verify_header *hdr, void *p, unsigned int len) +{ + struct vhdr_sha256 *vh = hdr_priv(hdr); + struct fio_sha256_ctx sha256_ctx = { + .buf = vh->sha256, + }; + + fio_sha256_init(&sha256_ctx); + fio_sha256_update(&sha256_ctx, p, len); + fio_sha256_final(&sha256_ctx); +} + +static void fill_sha1(struct verify_header *hdr, void *p, unsigned int len) +{ + struct vhdr_sha1 *vh = hdr_priv(hdr); + struct fio_sha1_ctx sha1_ctx = { + .H = vh->sha1, + }; + + fio_sha1_init(&sha1_ctx); + fio_sha1_update(&sha1_ctx, p, len); + fio_sha1_final(&sha1_ctx); +} + +static void fill_crc7(struct verify_header *hdr, void *p, unsigned int len) +{ + struct vhdr_crc7 *vh = hdr_priv(hdr); + + vh->crc7 = fio_crc7(p, len); +} + +static void fill_crc16(struct verify_header *hdr, void *p, unsigned int len) +{ + struct vhdr_crc16 *vh = hdr_priv(hdr); + + vh->crc16 = fio_crc16(p, len); +} + +static void fill_crc32(struct verify_header *hdr, void *p, unsigned int len) +{ + struct vhdr_crc32 *vh = hdr_priv(hdr); + + vh->crc32 = fio_crc32(p, len); +} + +static void fill_crc32c(struct verify_header *hdr, void *p, unsigned int len) +{ + struct vhdr_crc32 *vh = hdr_priv(hdr); + + vh->crc32 = fio_crc32c(p, len); +} + +static void fill_crc64(struct verify_header *hdr, void *p, unsigned int len) +{ + struct vhdr_crc64 *vh = hdr_priv(hdr); + + vh->crc64 = fio_crc64(p, len); +} + +static void fill_md5(struct verify_header *hdr, void *p, unsigned int len) +{ + struct vhdr_md5 *vh = hdr_priv(hdr); + struct fio_md5_ctx md5_ctx = { + .hash = (uint32_t *) vh->md5_digest, + }; + + fio_md5_init(&md5_ctx); + fio_md5_update(&md5_ctx, p, len); + fio_md5_final(&md5_ctx); +} + +static void __fill_hdr(struct thread_data *td, struct io_u *io_u, + struct verify_header *hdr, unsigned int header_num, + unsigned int header_len, uint64_t rand_seed) +{ + void *p = hdr; + + hdr->magic = FIO_HDR_MAGIC; + hdr->verify_type = td->o.verify; + hdr->len = header_len; + hdr->rand_seed = rand_seed; + hdr->offset = io_u->offset + header_num * td->o.verify_interval; + hdr->time_sec = io_u->start_time.tv_sec; + hdr->time_nsec = io_u->start_time.tv_nsec; + hdr->thread = td->thread_number; + hdr->numberio = io_u->numberio; + hdr->crc32 = fio_crc32c(p, offsetof(struct verify_header, crc32)); +} + + +static void fill_hdr(struct thread_data *td, struct io_u *io_u, + struct verify_header *hdr, unsigned int header_num, + unsigned int header_len, uint64_t rand_seed) +{ + if (td->o.verify != VERIFY_PATTERN_NO_HDR) + __fill_hdr(td, io_u, hdr, header_num, header_len, rand_seed); +} + +static void populate_hdr(struct thread_data *td, struct io_u *io_u, + struct verify_header *hdr, unsigned int header_num, + unsigned int header_len) +{ + unsigned int data_len; + void *data; + char *p; + + p = (char *) hdr; + + fill_hdr(td, io_u, 
hdr, header_num, header_len, io_u->rand_seed); + + if (header_len <= hdr_size(td, hdr)) { + td_verror(td, EINVAL, "Blocksize too small"); + return; + } + data_len = header_len - hdr_size(td, hdr); + + data = p + hdr_size(td, hdr); + switch (td->o.verify) { + case VERIFY_MD5: + dprint(FD_VERIFY, "fill md5 io_u %p, len %u\n", + io_u, hdr->len); + fill_md5(hdr, data, data_len); + break; + case VERIFY_CRC64: + dprint(FD_VERIFY, "fill crc64 io_u %p, len %u\n", + io_u, hdr->len); + fill_crc64(hdr, data, data_len); + break; + case VERIFY_CRC32C: + case VERIFY_CRC32C_INTEL: + dprint(FD_VERIFY, "fill crc32c io_u %p, len %u\n", + io_u, hdr->len); + fill_crc32c(hdr, data, data_len); + break; + case VERIFY_CRC32: + dprint(FD_VERIFY, "fill crc32 io_u %p, len %u\n", + io_u, hdr->len); + fill_crc32(hdr, data, data_len); + break; + case VERIFY_CRC16: + dprint(FD_VERIFY, "fill crc16 io_u %p, len %u\n", + io_u, hdr->len); + fill_crc16(hdr, data, data_len); + break; + case VERIFY_CRC7: + dprint(FD_VERIFY, "fill crc7 io_u %p, len %u\n", + io_u, hdr->len); + fill_crc7(hdr, data, data_len); + break; + case VERIFY_SHA256: + dprint(FD_VERIFY, "fill sha256 io_u %p, len %u\n", + io_u, hdr->len); + fill_sha256(hdr, data, data_len); + break; + case VERIFY_SHA512: + dprint(FD_VERIFY, "fill sha512 io_u %p, len %u\n", + io_u, hdr->len); + fill_sha512(hdr, data, data_len); + break; + case VERIFY_SHA3_224: + dprint(FD_VERIFY, "fill sha3-224 io_u %p, len %u\n", + io_u, hdr->len); + fill_sha3_224(hdr, data, data_len); + break; + case VERIFY_SHA3_256: + dprint(FD_VERIFY, "fill sha3-256 io_u %p, len %u\n", + io_u, hdr->len); + fill_sha3_256(hdr, data, data_len); + break; + case VERIFY_SHA3_384: + dprint(FD_VERIFY, "fill sha3-384 io_u %p, len %u\n", + io_u, hdr->len); + fill_sha3_384(hdr, data, data_len); + break; + case VERIFY_SHA3_512: + dprint(FD_VERIFY, "fill sha3-512 io_u %p, len %u\n", + io_u, hdr->len); + fill_sha3_512(hdr, data, data_len); + break; + case VERIFY_XXHASH: + dprint(FD_VERIFY, "fill xxhash io_u %p, len %u\n", + io_u, hdr->len); + fill_xxhash(hdr, data, data_len); + break; + case VERIFY_SHA1: + dprint(FD_VERIFY, "fill sha1 io_u %p, len %u\n", + io_u, hdr->len); + fill_sha1(hdr, data, data_len); + break; + case VERIFY_HDR_ONLY: + case VERIFY_PATTERN: + case VERIFY_PATTERN_NO_HDR: + /* nothing to do here */ + break; + default: + log_err("fio: bad verify type: %d\n", td->o.verify); + assert(0); + } + + if (td->o.verify_offset && hdr_size(td, hdr)) + memswp(p, p + td->o.verify_offset, hdr_size(td, hdr)); +} + +/* + * fill body of io_u->buf with random data and add a header with the + * checksum of choice + */ +void populate_verify_io_u(struct thread_data *td, struct io_u *io_u) +{ + if (td->o.verify == VERIFY_NULL) + return; + + io_u->numberio = td->io_issues[io_u->ddir]; + + fill_pattern_headers(td, io_u, 0, 0); +} + +int get_next_verify(struct thread_data *td, struct io_u *io_u) +{ + struct io_piece *ipo = NULL; + + /* + * this io_u is from a requeue, we already filled the offsets + */ + if (io_u->file) + return 0; + + if (!RB_EMPTY_ROOT(&td->io_hist_tree)) { + struct fio_rb_node *n = rb_first(&td->io_hist_tree); + + ipo = rb_entry(n, struct io_piece, rb_node); + + /* + * Ensure that the associated IO has completed + */ + read_barrier(); + if (ipo->flags & IP_F_IN_FLIGHT) + goto nothing; + + rb_erase(n, &td->io_hist_tree); + assert(ipo->flags & IP_F_ONRB); + ipo->flags &= ~IP_F_ONRB; + } else if (!flist_empty(&td->io_hist_list)) { + ipo = flist_first_entry(&td->io_hist_list, struct io_piece, list); + + /* + * 
Ensure that the associated IO has completed + */ + read_barrier(); + if (ipo->flags & IP_F_IN_FLIGHT) + goto nothing; + + flist_del(&ipo->list); + assert(ipo->flags & IP_F_ONLIST); + ipo->flags &= ~IP_F_ONLIST; + } + + if (ipo) { + td->io_hist_len--; + + io_u->offset = ipo->offset; + io_u->buflen = ipo->len; + io_u->numberio = ipo->numberio; + io_u->file = ipo->file; + io_u_set(td, io_u, IO_U_F_VER_LIST); + + if (ipo->flags & IP_F_TRIMMED) + io_u_set(td, io_u, IO_U_F_TRIMMED); + + if (!fio_file_open(io_u->file)) { + int r = td_io_open_file(td, io_u->file); + + if (r) { + dprint(FD_VERIFY, "failed file %s open\n", + io_u->file->file_name); + return 1; + } + } + + get_file(ipo->file); + assert(fio_file_open(io_u->file)); + io_u->ddir = DDIR_READ; + io_u->xfer_buf = io_u->buf; + io_u->xfer_buflen = io_u->buflen; + + remove_trim_entry(td, ipo); + free(ipo); + dprint(FD_VERIFY, "get_next_verify: ret io_u %p\n", io_u); + + if (!td->o.verify_pattern_bytes) { + io_u->rand_seed = __rand(&td->verify_state); + if (sizeof(int) != sizeof(long *)) + io_u->rand_seed *= __rand(&td->verify_state); + } + return 0; + } + +nothing: + dprint(FD_VERIFY, "get_next_verify: empty\n"); + return 1; +} + +void fio_verify_init(struct thread_data *td) +{ + if (td->o.verify == VERIFY_CRC32C_INTEL || + td->o.verify == VERIFY_CRC32C) { + crc32c_arm64_probe(); + crc32c_intel_probe(); + } +} + +static void *verify_async_thread(void *data) +{ + struct thread_data *td = data; + struct io_u *io_u; + int ret = 0; + + if (fio_option_is_set(&td->o, verify_cpumask) && + fio_setaffinity(td->pid, td->o.verify_cpumask)) { + log_err("fio: failed setting verify thread affinity\n"); + goto done; + } + + do { + FLIST_HEAD(list); + + read_barrier(); + if (td->verify_thread_exit) + break; + + pthread_mutex_lock(&td->io_u_lock); + + while (flist_empty(&td->verify_list) && + !td->verify_thread_exit) { + ret = pthread_cond_wait(&td->verify_cond, + &td->io_u_lock); + if (ret) { + pthread_mutex_unlock(&td->io_u_lock); + break; + } + } + + flist_splice_init(&td->verify_list, &list); + pthread_mutex_unlock(&td->io_u_lock); + + if (flist_empty(&list)) + continue; + + while (!flist_empty(&list)) { + io_u = flist_first_entry(&list, struct io_u, verify_list); + flist_del_init(&io_u->verify_list); + + io_u_set(td, io_u, IO_U_F_NO_FILE_PUT); + ret = verify_io_u(td, &io_u); + + put_io_u(td, io_u); + if (!ret) + continue; + if (td_non_fatal_error(td, ERROR_TYPE_VERIFY_BIT, ret)) { + update_error_count(td, ret); + td_clear_error(td); + ret = 0; + } + } + } while (!ret); + + if (ret) { + td_verror(td, ret, "async_verify"); + if (td->o.verify_fatal) + fio_mark_td_terminate(td); + } + +done: + pthread_mutex_lock(&td->io_u_lock); + td->nr_verify_threads--; + pthread_cond_signal(&td->free_cond); + pthread_mutex_unlock(&td->io_u_lock); + + return NULL; +} + +int verify_async_init(struct thread_data *td) +{ + int i, ret; + pthread_attr_t attr; + + pthread_attr_init(&attr); + pthread_attr_setstacksize(&attr, 2 * PTHREAD_STACK_MIN); + + td->verify_thread_exit = 0; + + td->verify_threads = malloc(sizeof(pthread_t) * td->o.verify_async); + for (i = 0; i < td->o.verify_async; i++) { + ret = pthread_create(&td->verify_threads[i], &attr, + verify_async_thread, td); + if (ret) { + log_err("fio: async verify creation failed: %s\n", + strerror(ret)); + break; + } + ret = pthread_detach(td->verify_threads[i]); + if (ret) { + log_err("fio: async verify thread detach failed: %s\n", + strerror(ret)); + break; + } + td->nr_verify_threads++; + } + + pthread_attr_destroy(&attr); 
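+ /*
+ * A failed pthread_create() or pthread_detach() breaks the loop
+ * early, leaving i short of verify_async; the error path below
+ * asks any threads that did start to exit.
+ */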
+ + if (i != td->o.verify_async) { + log_err("fio: only %d verify threads started, exiting\n", i); + + pthread_mutex_lock(&td->io_u_lock); + td->verify_thread_exit = 1; + pthread_cond_broadcast(&td->verify_cond); + pthread_mutex_unlock(&td->io_u_lock); + + return 1; + } + + return 0; +} + +void verify_async_exit(struct thread_data *td) +{ + pthread_mutex_lock(&td->io_u_lock); + td->verify_thread_exit = 1; + pthread_cond_broadcast(&td->verify_cond); + + while (td->nr_verify_threads) + pthread_cond_wait(&td->free_cond, &td->io_u_lock); + + pthread_mutex_unlock(&td->io_u_lock); + free(td->verify_threads); + td->verify_threads = NULL; +} + +int paste_blockoff(char *buf, unsigned int len, void *priv) +{ + struct io_u *io = priv; + unsigned long long off; + + typecheck(__typeof__(off), io->offset); + off = cpu_to_le64((uint64_t)io->offset); + len = min(len, (unsigned int)sizeof(off)); + memcpy(buf, &off, len); + return 0; +} + +static int __fill_file_completions(struct thread_data *td, + struct thread_io_list *s, + struct fio_file *f, unsigned int *index) +{ + unsigned int comps; + int i, j; + + if (!f->last_write_comp) + return 0; + + if (td->io_blocks[DDIR_WRITE] < td->o.iodepth) + comps = td->io_blocks[DDIR_WRITE]; + else + comps = td->o.iodepth; + + j = f->last_write_idx - 1; + for (i = 0; i < comps; i++) { + if (j == -1) + j = td->o.iodepth - 1; + s->comps[*index].fileno = __cpu_to_le64(f->fileno); + s->comps[*index].offset = cpu_to_le64(f->last_write_comp[j]); + (*index)++; + j--; + } + + return comps; +} + +static int fill_file_completions(struct thread_data *td, + struct thread_io_list *s, unsigned int *index) +{ + struct fio_file *f; + unsigned int i; + int comps = 0; + + for_each_file(td, f, i) + comps += __fill_file_completions(td, s, f, index); + + return comps; +} + +struct all_io_list *get_all_io_list(int save_mask, size_t *sz) +{ + struct all_io_list *rep; + struct thread_data *td; + size_t depth; + void *next; + int i, nr; + + compiletime_assert(sizeof(struct all_io_list) == 8, "all_io_list"); + + /* + * Calculate reply space needed. We need one 'io_state' per thread, + * and the size will vary depending on depth. 
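+ * For example, four jobs at iodepth=16 with one file each need four
+ * thread_io_list entries plus 4 * 16 file_comp slots on top of the
+ * fixed all_io_list header.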
+ */ + depth = 0; + nr = 0; + for_each_td(td, i) { + if (save_mask != IO_LIST_ALL && (i + 1) != save_mask) + continue; + td->stop_io = 1; + td->flags |= TD_F_VSTATE_SAVED; + depth += (td->o.iodepth * td->o.nr_files); + nr++; + } + + if (!nr) + return NULL; + + *sz = sizeof(*rep); + *sz += nr * sizeof(struct thread_io_list); + *sz += depth * sizeof(struct file_comp); + rep = malloc(*sz); + memset(rep, 0, *sz); + + rep->threads = cpu_to_le64((uint64_t) nr); + + next = &rep->state[0]; + for_each_td(td, i) { + struct thread_io_list *s = next; + unsigned int comps, index = 0; + + if (save_mask != IO_LIST_ALL && (i + 1) != save_mask) + continue; + + comps = fill_file_completions(td, s, &index); + + s->no_comps = cpu_to_le64((uint64_t) comps); + s->depth = cpu_to_le64((uint64_t) td->o.iodepth); + s->nofiles = cpu_to_le64((uint64_t) td->o.nr_files); + s->numberio = cpu_to_le64((uint64_t) td->io_issues[DDIR_WRITE]); + s->index = cpu_to_le64((uint64_t) i); + if (td->random_state.use64) { + s->rand.state64.s[0] = cpu_to_le64(td->random_state.state64.s1); + s->rand.state64.s[1] = cpu_to_le64(td->random_state.state64.s2); + s->rand.state64.s[2] = cpu_to_le64(td->random_state.state64.s3); + s->rand.state64.s[3] = cpu_to_le64(td->random_state.state64.s4); + s->rand.state64.s[4] = cpu_to_le64(td->random_state.state64.s5); + s->rand.state64.s[5] = 0; + s->rand.use64 = cpu_to_le64((uint64_t)1); + } else { + s->rand.state32.s[0] = cpu_to_le32(td->random_state.state32.s1); + s->rand.state32.s[1] = cpu_to_le32(td->random_state.state32.s2); + s->rand.state32.s[2] = cpu_to_le32(td->random_state.state32.s3); + s->rand.state32.s[3] = 0; + s->rand.use64 = 0; + } + snprintf((char *) s->name, sizeof(s->name), "%s", td->o.name); + next = io_list_next(s); + } + + return rep; +} + +static int open_state_file(const char *name, const char *prefix, int num, + int for_write) +{ + char out[PATH_MAX]; + int flags; + int fd; + + if (for_write) + flags = O_CREAT | O_TRUNC | O_WRONLY | O_SYNC; + else + flags = O_RDONLY; + + verify_state_gen_name(out, sizeof(out), name, prefix, num); + + fd = open(out, flags, 0644); + if (fd == -1) { + perror("fio: open state file"); + log_err("fio: state file: %s (for_write=%d)\n", out, for_write); + return -1; + } + + return fd; +} + +static int write_thread_list_state(struct thread_io_list *s, + const char *prefix) +{ + struct verify_state_hdr hdr; + uint64_t crc; + ssize_t ret; + int fd; + + fd = open_state_file((const char *) s->name, prefix, s->index, 1); + if (fd == -1) + return 1; + + crc = fio_crc32c((void *)s, thread_io_list_sz(s)); + + hdr.version = cpu_to_le64((uint64_t) VSTATE_HDR_VERSION); + hdr.size = cpu_to_le64((uint64_t) thread_io_list_sz(s)); + hdr.crc = cpu_to_le64(crc); + ret = write(fd, &hdr, sizeof(hdr)); + if (ret != sizeof(hdr)) + goto write_fail; + + ret = write(fd, s, thread_io_list_sz(s)); + if (ret != thread_io_list_sz(s)) { +write_fail: + if (ret < 0) + perror("fio: write state file"); + log_err("fio: failed to write state file\n"); + ret = 1; + } else + ret = 0; + + close(fd); + return ret; +} + +void __verify_save_state(struct all_io_list *state, const char *prefix) +{ + struct thread_io_list *s = &state->state[0]; + unsigned int i; + + for (i = 0; i < le64_to_cpu(state->threads); i++) { + write_thread_list_state(s, prefix); + s = io_list_next(s); + } +} + +void verify_save_state(int mask) +{ + struct all_io_list *state; + size_t sz; + + state = get_all_io_list(mask, &sz); + if (state) { + char prefix[PATH_MAX]; + + if (aux_path) + sprintf(prefix, "%s%clocal", 
aux_path, FIO_OS_PATH_SEPARATOR); + else + strcpy(prefix, "local"); + + __verify_save_state(state, prefix); + free(state); + } +} + +void verify_free_state(struct thread_data *td) +{ + if (td->vstate) + free(td->vstate); +} + +void verify_assign_state(struct thread_data *td, void *p) +{ + struct thread_io_list *s = p; + int i; + + s->no_comps = le64_to_cpu(s->no_comps); + s->depth = le32_to_cpu(s->depth); + s->nofiles = le32_to_cpu(s->nofiles); + s->numberio = le64_to_cpu(s->numberio); + s->rand.use64 = le64_to_cpu(s->rand.use64); + + if (s->rand.use64) { + for (i = 0; i < 6; i++) + s->rand.state64.s[i] = le64_to_cpu(s->rand.state64.s[i]); + } else { + for (i = 0; i < 4; i++) + s->rand.state32.s[i] = le32_to_cpu(s->rand.state32.s[i]); + } + + for (i = 0; i < s->no_comps; i++) { + s->comps[i].fileno = le64_to_cpu(s->comps[i].fileno); + s->comps[i].offset = le64_to_cpu(s->comps[i].offset); + } + + td->vstate = p; +} + +int verify_state_hdr(struct verify_state_hdr *hdr, struct thread_io_list *s) +{ + uint64_t crc; + + hdr->version = le64_to_cpu(hdr->version); + hdr->size = le64_to_cpu(hdr->size); + hdr->crc = le64_to_cpu(hdr->crc); + + if (hdr->version != VSTATE_HDR_VERSION) + return 1; + + crc = fio_crc32c((void *)s, hdr->size); + if (crc != hdr->crc) + return 1; + + return 0; +} + +int verify_load_state(struct thread_data *td, const char *prefix) +{ + struct verify_state_hdr hdr; + void *s = NULL; + uint64_t crc; + ssize_t ret; + int fd; + + if (!td->o.verify_state) + return 0; + + fd = open_state_file(td->o.name, prefix, td->thread_number - 1, 0); + if (fd == -1) + return 1; + + ret = read(fd, &hdr, sizeof(hdr)); + if (ret != sizeof(hdr)) { + if (ret < 0) + td_verror(td, errno, "read verify state hdr"); + log_err("fio: failed reading verify state header\n"); + goto err; + } + + hdr.version = le64_to_cpu(hdr.version); + hdr.size = le64_to_cpu(hdr.size); + hdr.crc = le64_to_cpu(hdr.crc); + + if (hdr.version != VSTATE_HDR_VERSION) { + log_err("fio: unsupported (%d) version in verify state header\n", + (unsigned int) hdr.version); + goto err; + } + + s = malloc(hdr.size); + ret = read(fd, s, hdr.size); + if (ret != hdr.size) { + if (ret < 0) + td_verror(td, errno, "read verify state"); + log_err("fio: failed reading verify state\n"); + goto err; + } + + crc = fio_crc32c(s, hdr.size); + if (crc != hdr.crc) { + log_err("fio: verify state is corrupt\n"); + goto err; + } + + close(fd); + + verify_assign_state(td, s); + return 0; +err: + if (s) + free(s); + close(fd); + return 1; +} + +/* + * Use the loaded verify state to know when to stop doing verification + */ +int verify_state_should_stop(struct thread_data *td, struct io_u *io_u) +{ + struct thread_io_list *s = td->vstate; + struct fio_file *f = io_u->file; + int i; + + if (!s || !f) + return 0; + + /* + * If we're not yet inside the window of the last 'depth' issued + * blocks, continue. If no more than 'depth' blocks were issued in + * total, always do the check. + */ + if ((td->io_blocks[DDIR_READ] < s->depth || + s->numberio - td->io_blocks[DDIR_READ] > s->depth) && + s->numberio > s->depth) + return 0; + + /* + * We're in the window of having to check if this io was + * completed or not. If the IO was seen as completed, then + * let's verify it. 
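+ * E.g. with a saved numberio of 100 and a depth of 16, only the last
+ * 16 blocks are uncertain, and only they consult the completion list
+ * below.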
+ */ + for (i = 0; i < s->no_comps; i++) { + if (s->comps[i].fileno != f->fileno) + continue; + if (io_u->offset == s->comps[i].offset) + return 0; + } + + /* + * Not found, we have to stop + */ + return 1; +} diff --git a/verify.h b/verify.h new file mode 100644 index 0000000..539e6f6 --- /dev/null +++ b/verify.h @@ -0,0 +1,115 @@ +#ifndef FIO_VERIFY_H +#define FIO_VERIFY_H + +#include <stdint.h> +#include "compiler/compiler.h" +#include "verify-state.h" + +#define FIO_HDR_MAGIC 0xacca + +enum { + VERIFY_NONE = 0, /* no verification */ + VERIFY_HDR_ONLY, /* verify header only, kept for sake of + * compatibility with old configurations + * which use 'verify=meta' */ + VERIFY_MD5, /* md5 sum data blocks */ + VERIFY_CRC64, /* crc64 sum data blocks */ + VERIFY_CRC32, /* crc32 sum data blocks */ + VERIFY_CRC32C, /* crc32c sum data blocks */ + VERIFY_CRC32C_INTEL, /* crc32c sum data blocks with hw */ + VERIFY_CRC16, /* crc16 sum data blocks */ + VERIFY_CRC7, /* crc7 sum data blocks */ + VERIFY_SHA256, /* sha256 sum data blocks */ + VERIFY_SHA512, /* sha512 sum data blocks */ + VERIFY_SHA3_224, /* sha3-224 sum data blocks */ + VERIFY_SHA3_256, /* sha3-256 sum data blocks */ + VERIFY_SHA3_384, /* sha3-384 sum data blocks */ + VERIFY_SHA3_512, /* sha3-512 sum data blocks */ + VERIFY_XXHASH, /* xxhash sum data blocks */ + VERIFY_SHA1, /* sha1 sum data blocks */ + VERIFY_PATTERN, /* verify specific patterns */ + VERIFY_PATTERN_NO_HDR, /* verify specific patterns, no hdr */ + VERIFY_NULL, /* pretend to verify */ +}; + +/* + * A header structure associated with each checksummed data block. It is + * followed by a checksum specific header that contains the verification + * data. + */ +struct verify_header { + uint16_t magic; + uint16_t verify_type; + uint32_t len; + uint64_t rand_seed; + uint64_t offset; + uint32_t time_sec; + uint32_t time_nsec; + uint16_t thread; + uint16_t numberio; + uint32_t crc32; +}; + +struct vhdr_md5 { + uint32_t md5_digest[4]; +}; +struct vhdr_sha3_224 { + uint8_t sha[224 / 8]; +}; +struct vhdr_sha3_256 { + uint8_t sha[256 / 8]; +}; +struct vhdr_sha3_384 { + uint8_t sha[384 / 8]; +}; +struct vhdr_sha3_512 { + uint8_t sha[512 / 8]; +}; +struct vhdr_sha512 { + uint8_t sha512[128]; +}; +struct vhdr_sha256 { + uint8_t sha256[64]; +}; +struct vhdr_sha1 { + uint32_t sha1[5]; +}; +struct vhdr_crc64 { + uint64_t crc64; +}; +struct vhdr_crc32 { + uint32_t crc32; +}; +struct vhdr_crc16 { + uint16_t crc16; +}; +struct vhdr_crc7 { + uint8_t crc7; +}; +struct vhdr_xxhash { + uint32_t hash; +}; + +/* + * Verify helpers + */ +extern void populate_verify_io_u(struct thread_data *, struct io_u *); +extern int __must_check get_next_verify(struct thread_data *td, struct io_u *); +extern int __must_check verify_io_u(struct thread_data *, struct io_u **); +extern int verify_io_u_async(struct thread_data *, struct io_u **); +extern void fill_verify_pattern(struct thread_data *td, void *p, unsigned int len, struct io_u *io_u, uint64_t seed, int use_seed); +extern void fill_buffer_pattern(struct thread_data *td, void *p, unsigned int len); +extern void fio_verify_init(struct thread_data *td); + +/* + * Async verify offload + */ +extern int verify_async_init(struct thread_data *); +extern void verify_async_exit(struct thread_data *); + +/* + * Callbacks for pasting formats in the pattern buffer + */ +extern int paste_blockoff(char *buf, unsigned int len, void *priv); + +#endif diff --git a/workqueue.c b/workqueue.c new file mode 100644 index 0000000..b595951 --- /dev/null +++ b/workqueue.c @@ -0,0 
+1,371 @@ +/* + * Generic workqueue offload mechanism + * + * Copyright (C) 2015 Jens Axboe <axboe@kernel.dk> + * + */ +#include <unistd.h> + +#include "fio.h" +#include "flist.h" +#include "workqueue.h" +#include "smalloc.h" +#include "pshared.h" + +enum { + SW_F_IDLE = 1 << 0, + SW_F_RUNNING = 1 << 1, + SW_F_EXIT = 1 << 2, + SW_F_ACCOUNTED = 1 << 3, + SW_F_ERROR = 1 << 4, +}; + +static struct submit_worker *__get_submit_worker(struct workqueue *wq, + unsigned int start, + unsigned int end, + struct submit_worker **best) +{ + struct submit_worker *sw = NULL; + + while (start <= end) { + sw = &wq->workers[start]; + if (sw->flags & SW_F_IDLE) + return sw; + if (!(*best) || sw->seq < (*best)->seq) + *best = sw; + start++; + } + + return NULL; +} + +static struct submit_worker *get_submit_worker(struct workqueue *wq) +{ + unsigned int next = wq->next_free_worker; + struct submit_worker *sw, *best = NULL; + + assert(next < wq->max_workers); + + sw = __get_submit_worker(wq, next, wq->max_workers - 1, &best); + if (!sw && next) + sw = __get_submit_worker(wq, 0, next - 1, &best); + + /* + * No truly idle found, use best match + */ + if (!sw) + sw = best; + + if (sw->index == wq->next_free_worker) { + if (sw->index + 1 < wq->max_workers) + wq->next_free_worker = sw->index + 1; + else + wq->next_free_worker = 0; + } + + return sw; +} + +static bool all_sw_idle(struct workqueue *wq) +{ + int i; + + for (i = 0; i < wq->max_workers; i++) { + struct submit_worker *sw = &wq->workers[i]; + + if (!(sw->flags & SW_F_IDLE)) + return false; + } + + return true; +} + +/* + * Must be serialized wrt workqueue_enqueue() by caller + */ +void workqueue_flush(struct workqueue *wq) +{ + wq->wake_idle = 1; + + while (!all_sw_idle(wq)) { + pthread_mutex_lock(&wq->flush_lock); + pthread_cond_wait(&wq->flush_cond, &wq->flush_lock); + pthread_mutex_unlock(&wq->flush_lock); + } + + wq->wake_idle = 0; +} + +/* + * Must be serialized by caller. 
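+ * Only the per-worker sw->lock is taken below; wq->work_seq and
+ * next_free_worker are updated without a queue-wide lock, so two
+ * concurrent enqueuers would race on them.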
+ */ +void workqueue_enqueue(struct workqueue *wq, struct workqueue_work *work) +{ + struct submit_worker *sw; + + sw = get_submit_worker(wq); + assert(sw); + + pthread_mutex_lock(&sw->lock); + flist_add_tail(&work->list, &sw->work_list); + sw->seq = ++wq->work_seq; + sw->flags &= ~SW_F_IDLE; + + pthread_cond_signal(&sw->cond); + pthread_mutex_unlock(&sw->lock); +} + +static void handle_list(struct submit_worker *sw, struct flist_head *list) +{ + struct workqueue *wq = sw->wq; + struct workqueue_work *work; + + while (!flist_empty(list)) { + work = flist_first_entry(list, struct workqueue_work, list); + flist_del_init(&work->list); + wq->ops.fn(sw, work); + } +} + +static void *worker_thread(void *data) +{ + struct submit_worker *sw = data; + struct workqueue *wq = sw->wq; + unsigned int ret = 0; + FLIST_HEAD(local_list); + + sk_out_assign(sw->sk_out); + + if (wq->ops.nice) { + if (nice(wq->ops.nice) < 0) { + log_err("workqueue: nice %s\n", strerror(errno)); + ret = 1; + } + } + + if (!ret) + ret = workqueue_init_worker(sw); + + pthread_mutex_lock(&sw->lock); + sw->flags |= SW_F_RUNNING; + if (ret) + sw->flags |= SW_F_ERROR; + pthread_mutex_unlock(&sw->lock); + + pthread_mutex_lock(&wq->flush_lock); + pthread_cond_signal(&wq->flush_cond); + pthread_mutex_unlock(&wq->flush_lock); + + if (sw->flags & SW_F_ERROR) + goto done; + + while (1) { + pthread_mutex_lock(&sw->lock); + + if (flist_empty(&sw->work_list)) { + if (sw->flags & SW_F_EXIT) { + pthread_mutex_unlock(&sw->lock); + break; + } + + if (workqueue_pre_sleep_check(sw)) { + pthread_mutex_unlock(&sw->lock); + workqueue_pre_sleep(sw); + pthread_mutex_lock(&sw->lock); + } + + /* + * We dropped and reacquired the lock, check + * state again. + */ + if (!flist_empty(&sw->work_list)) + goto handle_work; + + if (sw->flags & SW_F_EXIT) { + pthread_mutex_unlock(&sw->lock); + break; + } else if (!(sw->flags & SW_F_IDLE)) { + sw->flags |= SW_F_IDLE; + wq->next_free_worker = sw->index; + if (wq->wake_idle) + pthread_cond_signal(&wq->flush_cond); + } + + pthread_cond_wait(&sw->cond, &sw->lock); + } else { +handle_work: + flist_splice_init(&sw->work_list, &local_list); + } + pthread_mutex_unlock(&sw->lock); + handle_list(sw, &local_list); + if (wq->ops.update_acct_fn) + wq->ops.update_acct_fn(sw); + } + +done: + sk_out_drop(); + return NULL; +} + +static void free_worker(struct submit_worker *sw, unsigned int *sum_cnt) +{ + struct workqueue *wq = sw->wq; + + workqueue_exit_worker(sw, sum_cnt); + + pthread_cond_destroy(&sw->cond); + pthread_mutex_destroy(&sw->lock); + + if (wq->ops.free_worker_fn) + wq->ops.free_worker_fn(sw); +} + +static void shutdown_worker(struct submit_worker *sw, unsigned int *sum_cnt) +{ + pthread_join(sw->thread, NULL); + free_worker(sw, sum_cnt); +} + +void workqueue_exit(struct workqueue *wq) +{ + unsigned int shutdown, sum_cnt = 0; + struct submit_worker *sw; + int i; + + if (!wq->workers) + return; + + for (i = 0; i < wq->max_workers; i++) { + sw = &wq->workers[i]; + + pthread_mutex_lock(&sw->lock); + sw->flags |= SW_F_EXIT; + pthread_cond_signal(&sw->cond); + pthread_mutex_unlock(&sw->lock); + } + + do { + shutdown = 0; + for (i = 0; i < wq->max_workers; i++) { + sw = &wq->workers[i]; + if (sw->flags & SW_F_ACCOUNTED) + continue; + pthread_mutex_lock(&sw->lock); + sw->flags |= SW_F_ACCOUNTED; + pthread_mutex_unlock(&sw->lock); + shutdown_worker(sw, &sum_cnt); + shutdown++; + } + } while (shutdown && shutdown != wq->max_workers); + + sfree(wq->workers); + wq->workers = NULL; + pthread_mutex_destroy(&wq->flush_lock); + 
pthread_cond_destroy(&wq->flush_cond); + pthread_mutex_destroy(&wq->stat_lock); +} + +static int start_worker(struct workqueue *wq, unsigned int index, + struct sk_out *sk_out) +{ + struct submit_worker *sw = &wq->workers[index]; + int ret; + + INIT_FLIST_HEAD(&sw->work_list); + + ret = mutex_cond_init_pshared(&sw->lock, &sw->cond); + if (ret) + return ret; + + sw->wq = wq; + sw->index = index; + sw->sk_out = sk_out; + + if (wq->ops.alloc_worker_fn) { + ret = wq->ops.alloc_worker_fn(sw); + if (ret) + return ret; + } + + ret = pthread_create(&sw->thread, NULL, worker_thread, sw); + if (!ret) { + pthread_mutex_lock(&sw->lock); + sw->flags = SW_F_IDLE; + pthread_mutex_unlock(&sw->lock); + return 0; + } + + free_worker(sw, NULL); + return 1; +} + +int workqueue_init(struct thread_data *td, struct workqueue *wq, + struct workqueue_ops *ops, unsigned int max_workers, + struct sk_out *sk_out) +{ + unsigned int running; + int i, error; + int ret; + + wq->max_workers = max_workers; + wq->td = td; + wq->ops = *ops; + wq->work_seq = 0; + wq->next_free_worker = 0; + + ret = mutex_cond_init_pshared(&wq->flush_lock, &wq->flush_cond); + if (ret) + goto err; + ret = mutex_init_pshared(&wq->stat_lock); + if (ret) + goto err; + + wq->workers = smalloc(wq->max_workers * sizeof(struct submit_worker)); + if (!wq->workers) + goto err; + + for (i = 0; i < wq->max_workers; i++) + if (start_worker(wq, i, sk_out)) + break; + + wq->max_workers = i; + if (!wq->max_workers) + goto err; + + /* + * Wait for them all to be started and initialized + */ + error = 0; + do { + struct submit_worker *sw; + + running = 0; + pthread_mutex_lock(&wq->flush_lock); + for (i = 0; i < wq->max_workers; i++) { + sw = &wq->workers[i]; + pthread_mutex_lock(&sw->lock); + if (sw->flags & SW_F_RUNNING) + running++; + if (sw->flags & SW_F_ERROR) + error++; + pthread_mutex_unlock(&sw->lock); + } + + if (error || running == wq->max_workers) { + pthread_mutex_unlock(&wq->flush_lock); + break; + } + + pthread_cond_wait(&wq->flush_cond, &wq->flush_lock); + pthread_mutex_unlock(&wq->flush_lock); + } while (1); + + if (!error) + return 0; + +err: + log_err("Can't create rate workqueue\n"); + td_verror(td, ESRCH, "workqueue_init"); + workqueue_exit(wq); + return 1; +} diff --git a/workqueue.h b/workqueue.h new file mode 100644 index 0000000..0a62b5f --- /dev/null +++ b/workqueue.h @@ -0,0 +1,119 @@ +#ifndef FIO_RATE_H +#define FIO_RATE_H + +#include <inttypes.h> +#include <pthread.h> + +#include "flist.h" +#include "lib/types.h" + +struct sk_out; +struct thread_data; + +struct workqueue_work { + struct flist_head list; +}; + +struct submit_worker { + pthread_t thread; + pthread_mutex_t lock; + pthread_cond_t cond; + struct flist_head work_list; + unsigned int flags; + unsigned int index; + uint64_t seq; + struct workqueue *wq; + void *priv; + struct sk_out *sk_out; +}; + +typedef int (workqueue_work_fn)(struct submit_worker *, struct workqueue_work *); +typedef bool (workqueue_pre_sleep_flush_fn)(struct submit_worker *); +typedef void (workqueue_pre_sleep_fn)(struct submit_worker *); +typedef int (workqueue_alloc_worker_fn)(struct submit_worker *); +typedef void (workqueue_free_worker_fn)(struct submit_worker *); +typedef int (workqueue_init_worker_fn)(struct submit_worker *); +typedef void (workqueue_exit_worker_fn)(struct submit_worker *, unsigned int *); +typedef void (workqueue_update_acct_fn)(struct submit_worker *); + +struct workqueue_ops { + workqueue_work_fn *fn; + workqueue_pre_sleep_flush_fn *pre_sleep_flush_fn; + workqueue_pre_sleep_fn 
*pre_sleep_fn; + + workqueue_update_acct_fn *update_acct_fn; + + workqueue_alloc_worker_fn *alloc_worker_fn; + workqueue_free_worker_fn *free_worker_fn; + + workqueue_init_worker_fn *init_worker_fn; + workqueue_exit_worker_fn *exit_worker_fn; + + unsigned int nice; +}; + +struct workqueue { + unsigned int max_workers; + + struct thread_data *td; + struct workqueue_ops ops; + + uint64_t work_seq; + struct submit_worker *workers; + unsigned int next_free_worker; + + pthread_cond_t flush_cond; + pthread_mutex_t flush_lock; + pthread_mutex_t stat_lock; + volatile int wake_idle; +}; + +int workqueue_init(struct thread_data *td, struct workqueue *wq, struct workqueue_ops *ops, unsigned int max_workers, struct sk_out *sk_out); +void workqueue_exit(struct workqueue *wq); + +void workqueue_enqueue(struct workqueue *wq, struct workqueue_work *work); +void workqueue_flush(struct workqueue *wq); + +static inline bool workqueue_pre_sleep_check(struct submit_worker *sw) +{ + struct workqueue *wq = sw->wq; + + if (!wq->ops.pre_sleep_flush_fn) + return false; + + return wq->ops.pre_sleep_flush_fn(sw); +} + +static inline void workqueue_pre_sleep(struct submit_worker *sw) +{ + struct workqueue *wq = sw->wq; + + if (wq->ops.pre_sleep_fn) + wq->ops.pre_sleep_fn(sw); +} + +static inline int workqueue_init_worker(struct submit_worker *sw) +{ + struct workqueue *wq = sw->wq; + + if (!wq->ops.init_worker_fn) + return 0; + + return wq->ops.init_worker_fn(sw); +} + +static inline void workqueue_exit_worker(struct submit_worker *sw, + unsigned int *sum_cnt) +{ + struct workqueue *wq = sw->wq; + unsigned int tmp = 1; + + if (!wq->ops.exit_worker_fn) + return; + + if (!sum_cnt) + sum_cnt = &tmp; + + wq->ops.exit_worker_fn(sw, sum_cnt); +} +#endif diff --git a/zbd.c b/zbd.c new file mode 100644 index 0000000..ee8bcb3 --- /dev/null +++ b/zbd.c @@ -0,0 +1,1521 @@ +/* + * Copyright (C) 2018 Western Digital Corporation or its affiliates. + * + * This file is released under the GPL. + */ + +#include <errno.h> +#include <string.h> +#include <stdlib.h> +#include <dirent.h> +#include <fcntl.h> +#include <sys/ioctl.h> +#include <sys/stat.h> +#include <unistd.h> +#include <linux/blkzoned.h> + +#include "file.h" +#include "fio.h" +#include "lib/pow2.h" +#include "log.h" +#include "oslib/asprintf.h" +#include "smalloc.h" +#include "verify.h" +#include "zbd.h" + +/** + * zbd_zone_idx - convert an offset into a zone number + * @f: file pointer. + * @offset: offset in bytes. If this offset is in the first zone_size bytes + * past the disk size then the index of the sentinel is returned. + */ +static uint32_t zbd_zone_idx(const struct fio_file *f, uint64_t offset) +{ + uint32_t zone_idx; + + if (f->zbd_info->zone_size_log2 > 0) + zone_idx = offset >> f->zbd_info->zone_size_log2; + else + zone_idx = offset / f->zbd_info->zone_size; + + return min(zone_idx, f->zbd_info->nr_zones); +} + +/** + * zbd_zone_full - verify whether a minimum number of bytes remain in a zone + * @f: file pointer. + * @z: zone info pointer. + * @required: minimum number of bytes that must remain in a zone. + * + * The caller must hold z->mutex. 
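+ * 'required' is in bytes and must be 512-byte aligned, matching the
+ * sector granularity of the zone bookkeeping; the assert below
+ * enforces this.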
+ */ +static bool zbd_zone_full(const struct fio_file *f, struct fio_zone_info *z, + uint64_t required) +{ + assert((required & 511) == 0); + + return z->type == BLK_ZONE_TYPE_SEQWRITE_REQ && + z->wp + required > z->start + f->zbd_info->zone_size; +} + +static bool is_valid_offset(const struct fio_file *f, uint64_t offset) +{ + return (uint64_t)(offset - f->file_offset) < f->io_size; +} + +/* Verify whether direct I/O is used for all host-managed zoned drives. */ +static bool zbd_using_direct_io(void) +{ + struct thread_data *td; + struct fio_file *f; + int i, j; + + for_each_td(td, i) { + if (td->o.odirect || !(td->o.td_ddir & TD_DDIR_WRITE)) + continue; + for_each_file(td, f, j) { + if (f->zbd_info && + f->zbd_info->model == ZBD_DM_HOST_MANAGED) + return false; + } + } + + return true; +} + +/* Whether or not the I/O range for f includes one or more sequential zones */ +static bool zbd_is_seq_job(struct fio_file *f) +{ + uint32_t zone_idx, zone_idx_b, zone_idx_e; + + assert(f->zbd_info); + if (f->io_size == 0) + return false; + zone_idx_b = zbd_zone_idx(f, f->file_offset); + zone_idx_e = zbd_zone_idx(f, f->file_offset + f->io_size - 1); + for (zone_idx = zone_idx_b; zone_idx <= zone_idx_e; zone_idx++) + if (f->zbd_info->zone_info[zone_idx].type == + BLK_ZONE_TYPE_SEQWRITE_REQ) + return true; + + return false; +} + +/* + * Verify whether offset and size parameters are aligned with zone boundaries. + */ +static bool zbd_verify_sizes(void) +{ + const struct fio_zone_info *z; + struct thread_data *td; + struct fio_file *f; + uint64_t new_offset, new_end; + uint32_t zone_idx; + int i, j; + + for_each_td(td, i) { + for_each_file(td, f, j) { + if (!f->zbd_info) + continue; + if (f->file_offset >= f->real_file_size) + continue; + if (!zbd_is_seq_job(f)) + continue; + + if (!td->o.zone_size) { + td->o.zone_size = f->zbd_info->zone_size; + if (!td->o.zone_size) { + log_err("%s: invalid 0 zone size\n", + f->file_name); + return false; + } + } else if (td->o.zone_size != f->zbd_info->zone_size) { + log_err("%s: job parameter zonesize %llu does not match disk zone size %llu.\n", + f->file_name, (unsigned long long) td->o.zone_size, + (unsigned long long) f->zbd_info->zone_size); + return false; + } + + if (td->o.zone_skip && + (td->o.zone_skip < td->o.zone_size || + td->o.zone_skip % td->o.zone_size)) { + log_err("%s: zoneskip %llu is not a multiple of the device zone size %llu.\n", + f->file_name, (unsigned long long) td->o.zone_skip, + (unsigned long long) td->o.zone_size); + return false; + } + + zone_idx = zbd_zone_idx(f, f->file_offset); + z = &f->zbd_info->zone_info[zone_idx]; + if (f->file_offset != z->start) { + new_offset = (z+1)->start; + if (new_offset >= f->file_offset + f->io_size) { + log_info("%s: io_size must be at least one zone\n", + f->file_name); + return false; + } + log_info("%s: rounded up offset from %llu to %llu\n", + f->file_name, (unsigned long long) f->file_offset, + (unsigned long long) new_offset); + f->io_size -= (new_offset - f->file_offset); + f->file_offset = new_offset; + } + zone_idx = zbd_zone_idx(f, f->file_offset + f->io_size); + z = &f->zbd_info->zone_info[zone_idx]; + new_end = z->start; + if (f->file_offset + f->io_size != new_end) { + if (new_end <= f->file_offset) { + log_info("%s: io_size must be at least one zone\n", + f->file_name); + return false; + } + log_info("%s: rounded down io_size from %llu to %llu\n", + f->file_name, (unsigned long long) f->io_size, + (unsigned long long) new_end - f->file_offset); + f->io_size = new_end - f->file_offset; + } + 
+
+static bool zbd_verify_bs(void)
+{
+	struct thread_data *td;
+	struct fio_file *f;
+	uint32_t zone_size;
+	int i, j, k;
+
+	for_each_td(td, i) {
+		for_each_file(td, f, j) {
+			if (!f->zbd_info)
+				continue;
+			zone_size = f->zbd_info->zone_size;
+			for (k = 0; k < ARRAY_SIZE(td->o.bs); k++) {
+				if (td->o.verify != VERIFY_NONE &&
+				    zone_size % td->o.bs[k] != 0) {
+					log_info("%s: block size %llu is not a divisor of the zone size %u\n",
+						 f->file_name, td->o.bs[k],
+						 zone_size);
+					return false;
+				}
+			}
+		}
+	}
+	return true;
+}
+
+/*
+ * Read zone information into @buf starting from sector @start_sector.
+ * @fd is a file descriptor that refers to a block device and @bufsz is the
+ * size of @buf.
+ *
+ * Returns 0 upon success and a negative error code upon failure.
+ * If the zone report is empty, always assume an error (device problem) and
+ * return -EIO.
+ */
+static int read_zone_info(int fd, uint64_t start_sector,
+			  void *buf, unsigned int bufsz)
+{
+	struct blk_zone_report *hdr = buf;
+	int ret;
+
+	if (bufsz < sizeof(*hdr))
+		return -EINVAL;
+
+	memset(hdr, 0, sizeof(*hdr));
+
+	hdr->nr_zones = (bufsz - sizeof(*hdr)) / sizeof(struct blk_zone);
+	hdr->sector = start_sector;
+	ret = ioctl(fd, BLKREPORTZONE, hdr);
+	if (ret)
+		return -errno;
+	if (!hdr->nr_zones)
+		return -EIO;
+	return 0;
+}
+
+/*
+ * Read up to 255 characters from the first line of a file. Strip the trailing
+ * newline.
+ */
+static char *read_file(const char *path)
+{
+	char line[256], *p = line;
+	FILE *f;
+
+	f = fopen(path, "rb");
+	if (!f)
+		return NULL;
+	if (!fgets(line, sizeof(line), f))
+		line[0] = '\0';
+	strsep(&p, "\n");
+	fclose(f);
+
+	return strdup(line);
+}
+
+static enum blk_zoned_model get_zbd_model(const char *file_name)
+{
+	enum blk_zoned_model model = ZBD_DM_NONE;
+	char *zoned_attr_path = NULL;
+	char *model_str = NULL;
+	struct stat statbuf;
+	char *sys_devno_path = NULL;
+	char *part_attr_path = NULL;
+	char *part_str = NULL;
+	char sys_path[PATH_MAX];
+	ssize_t sz;
+	char *delim = NULL;
+
+	if (stat(file_name, &statbuf) < 0)
+		goto out;
+
+	if (asprintf(&sys_devno_path, "/sys/dev/block/%d:%d",
+		     major(statbuf.st_rdev), minor(statbuf.st_rdev)) < 0)
+		goto out;
+
+	sz = readlink(sys_devno_path, sys_path, sizeof(sys_path) - 1);
+	if (sz < 0)
+		goto out;
+	sys_path[sz] = '\0';
+
+	/*
+	 * If the device is a partition device, cut the device name in the
+	 * canonical sysfs path to obtain the sysfs path of the holder device.
+	 * e.g.: /sys/devices/.../sda/sda1 -> /sys/devices/.../sda
+	 */
+	if (asprintf(&part_attr_path, "/sys/dev/block/%s/partition",
+		     sys_path) < 0)
+		goto out;
+	part_str = read_file(part_attr_path);
+	if (part_str && *part_str == '1') {
+		delim = strrchr(sys_path, '/');
+		if (!delim)
+			goto out;
+		*delim = '\0';
+	}
+
+	if (asprintf(&zoned_attr_path,
+		     "/sys/dev/block/%s/queue/zoned", sys_path) < 0)
+		goto out;
+
+	model_str = read_file(zoned_attr_path);
+	if (!model_str)
+		goto out;
+	dprint(FD_ZBD, "%s: zbd model string: %s\n", file_name, model_str);
+	if (strcmp(model_str, "host-aware") == 0)
+		model = ZBD_DM_HOST_AWARE;
+	else if (strcmp(model_str, "host-managed") == 0)
+		model = ZBD_DM_HOST_MANAGED;
+
+out:
+	free(model_str);
+	free(zoned_attr_path);
+	free(part_str);
+	free(part_attr_path);
+	free(sys_devno_path);
+	return model;
+}
+
+static int ilog2(uint64_t i)
+{
+	int log = -1;
+
+	while (i) {
+		i >>= 1;
+		log++;
+	}
+	return log;
+}
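
For illustration: the ilog2() result above is cached as zone_size_log2 so that zbd_zone_idx() can replace a 64-bit division with a shift whenever the zone size is a power of two. A minimal editorial sketch of that equivalence (helper name is invented, not part of the patch):

#include <assert.h>
#include <stdint.h>

/* Mirrors the ilog2() helper above; shift-based zone lookup equals
 * division whenever the zone size is a power of two.
 */
static int ilog2_u64(uint64_t i)
{
	int log = -1;

	while (i) {
		i >>= 1;
		log++;
	}
	return log;
}

int main(void)
{
	const uint64_t zone_size = 1ULL << 28;	/* 256 MiB */
	const int log2_zs = ilog2_u64(zone_size);
	uint64_t offset;

	for (offset = 0; offset < zone_size * 8; offset += zone_size / 2)
		assert(offset >> log2_zs == offset / zone_size);
	return 0;
}
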
+
+/*
+ * Initialize f->zbd_info for devices that are not zoned block devices. This
+ * makes it possible to execute a ZBD workload against a non-ZBD device.
+ */
+static int init_zone_info(struct thread_data *td, struct fio_file *f)
+{
+	uint32_t nr_zones;
+	struct fio_zone_info *p;
+	uint64_t zone_size = td->o.zone_size;
+	struct zoned_block_device_info *zbd_info = NULL;
+	pthread_mutexattr_t attr;
+	int i;
+
+	if (zone_size == 0) {
+		log_err("%s: Specifying the zone size is mandatory for regular block devices with --zonemode=zbd\n\n",
+			f->file_name);
+		return 1;
+	}
+
+	if (zone_size < 512) {
+		log_err("%s: zone size must be at least 512 bytes for --zonemode=zbd\n\n",
+			f->file_name);
+		return 1;
+	}
+
+	nr_zones = (f->real_file_size + zone_size - 1) / zone_size;
+	zbd_info = scalloc(1, sizeof(*zbd_info) +
+			   (nr_zones + 1) * sizeof(zbd_info->zone_info[0]));
+	if (!zbd_info)
+		return -ENOMEM;
+
+	pthread_mutexattr_init(&attr);
+	pthread_mutexattr_settype(&attr, PTHREAD_MUTEX_RECURSIVE);
+	pthread_mutexattr_setpshared(&attr, true);
+	pthread_mutex_init(&zbd_info->mutex, &attr);
+	zbd_info->refcount = 1;
+	p = &zbd_info->zone_info[0];
+	for (i = 0; i < nr_zones; i++, p++) {
+		pthread_mutex_init(&p->mutex, &attr);
+		p->start = i * zone_size;
+		p->wp = p->start + zone_size;
+		p->type = BLK_ZONE_TYPE_SEQWRITE_REQ;
+		p->cond = BLK_ZONE_COND_EMPTY;
+	}
+	/* a sentinel */
+	p->start = nr_zones * zone_size;
+
+	f->zbd_info = zbd_info;
+	f->zbd_info->zone_size = zone_size;
+	f->zbd_info->zone_size_log2 = is_power_of_2(zone_size) ?
+		ilog2(zone_size) : 0;
+	f->zbd_info->nr_zones = nr_zones;
+	pthread_mutexattr_destroy(&attr);
+	return 0;
+}
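
For illustration: nr_zones is computed with ceiling division so that a file whose size is not a multiple of the zone size still gets a final (short) zone, and one extra sentinel entry marks the end of the zone array. A small editorial sketch with invented numbers:

#include <assert.h>
#include <stdint.h>

int main(void)
{
	const uint64_t zone_size = 4096;
	const uint64_t file_size = 10000;	/* not zone aligned */
	uint64_t nr_zones = (file_size + zone_size - 1) / zone_size;

	assert(nr_zones == 3);			/* zones cover bytes 0..12287 */
	/* The sentinel entry starts at the aligned end of the last zone,
	 * so (z+1)->start - z->start is always a full zone_size. */
	assert(nr_zones * zone_size == 12288);
	return 0;
}
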
+
+/*
+ * Parse the BLKREPORTZONE output and store it in f->zbd_info. Must be called
+ * only for devices that support this ioctl, namely zoned block devices.
+ */
+static int parse_zone_info(struct thread_data *td, struct fio_file *f)
+{
+	const unsigned int bufsz = sizeof(struct blk_zone_report) +
+		4096 * sizeof(struct blk_zone);
+	uint32_t nr_zones;
+	struct blk_zone_report *hdr;
+	const struct blk_zone *z;
+	struct fio_zone_info *p;
+	uint64_t zone_size, start_sector;
+	struct zoned_block_device_info *zbd_info = NULL;
+	pthread_mutexattr_t attr;
+	void *buf;
+	int fd, i, j, ret = 0;
+
+	pthread_mutexattr_init(&attr);
+	pthread_mutexattr_settype(&attr, PTHREAD_MUTEX_RECURSIVE);
+	pthread_mutexattr_setpshared(&attr, true);
+
+	buf = malloc(bufsz);
+	if (!buf)
+		goto out;
+
+	fd = open(f->file_name, O_RDONLY | O_LARGEFILE);
+	if (fd < 0) {
+		ret = -errno;
+		goto free;
+	}
+
+	ret = read_zone_info(fd, 0, buf, bufsz);
+	if (ret < 0) {
+		log_info("fio: BLKREPORTZONE(%lu) failed for %s (%d).\n",
+			 0UL, f->file_name, -ret);
+		goto close;
+	}
+	hdr = buf;
+	if (hdr->nr_zones < 1) {
+		log_info("fio: %s has invalid zone information.\n",
+			 f->file_name);
+		goto close;
+	}
+	z = (void *)(hdr + 1);
+	zone_size = z->len << 9;
+	nr_zones = (f->real_file_size + zone_size - 1) / zone_size;
+
+	if (td->o.zone_size == 0) {
+		td->o.zone_size = zone_size;
+	} else if (td->o.zone_size != zone_size) {
+		log_err("fio: %s job parameter zonesize %llu does not match disk zone size %llu.\n",
+			f->file_name, (unsigned long long) td->o.zone_size,
+			(unsigned long long) zone_size);
+		ret = -EINVAL;
+		goto close;
+	}
+
+	dprint(FD_ZBD, "Device %s has %u zones of size %llu KB\n", f->file_name,
+	       nr_zones, (unsigned long long) zone_size / 1024);
+
+	zbd_info = scalloc(1, sizeof(*zbd_info) +
+			   (nr_zones + 1) * sizeof(zbd_info->zone_info[0]));
+	ret = -ENOMEM;
+	if (!zbd_info)
+		goto close;
+	pthread_mutex_init(&zbd_info->mutex, &attr);
+	zbd_info->refcount = 1;
+	p = &zbd_info->zone_info[0];
+	for (start_sector = 0, j = 0; j < nr_zones;) {
+		z = (void *)(hdr + 1);
+		for (i = 0; i < hdr->nr_zones; i++, j++, z++, p++) {
+			pthread_mutex_init(&p->mutex, &attr);
+			p->start = z->start << 9;
+			switch (z->cond) {
+			case BLK_ZONE_COND_NOT_WP:
+			case BLK_ZONE_COND_FULL:
+				p->wp = p->start + zone_size;
+				break;
+			default:
+				assert(z->start <= z->wp);
+				assert(z->wp <= z->start + (zone_size >> 9));
+				p->wp = z->wp << 9;
+				break;
+			}
+			p->type = z->type;
+			p->cond = z->cond;
+			if (j > 0 && p->start != p[-1].start + zone_size) {
+				log_info("%s: invalid zone data\n",
+					 f->file_name);
+				ret = -EINVAL;
+				goto close;
+			}
+		}
+		z--;
+		start_sector = z->start + z->len;
+		if (j >= nr_zones)
+			break;
+		ret = read_zone_info(fd, start_sector, buf, bufsz);
+		if (ret < 0) {
+			log_info("fio: BLKREPORTZONE(%llu) failed for %s (%d).\n",
+				 (unsigned long long) start_sector, f->file_name, -ret);
+			goto close;
+		}
+	}
+	/* a sentinel */
+	zbd_info->zone_info[nr_zones].start = start_sector << 9;
+
+	f->zbd_info = zbd_info;
+	f->zbd_info->zone_size = zone_size;
+	f->zbd_info->zone_size_log2 = is_power_of_2(zone_size) ?
+		ilog2(zone_size) : 0;
+	f->zbd_info->nr_zones = nr_zones;
+	zbd_info = NULL;
+	ret = 0;
+
+close:
+	sfree(zbd_info);
+	close(fd);
+free:
+	free(buf);
+out:
+	pthread_mutexattr_destroy(&attr);
+	return ret;
+}
+
+/*
+ * Allocate zone information and store it into f->zbd_info if zonemode=zbd.
+ *
+ * Returns 0 upon success and a negative error code upon failure.
+ */
+static int zbd_create_zone_info(struct thread_data *td, struct fio_file *f)
+{
+	enum blk_zoned_model zbd_model;
+	int ret = 0;
+
+	assert(td->o.zone_mode == ZONE_MODE_ZBD);
+
+	zbd_model = get_zbd_model(f->file_name);
+	switch (zbd_model) {
+	case ZBD_DM_HOST_AWARE:
+	case ZBD_DM_HOST_MANAGED:
+		ret = parse_zone_info(td, f);
+		break;
+	case ZBD_DM_NONE:
+		ret = init_zone_info(td, f);
+		break;
+	}
+	if (ret == 0)
+		f->zbd_info->model = zbd_model;
+	return ret;
+}
+
+void zbd_free_zone_info(struct fio_file *f)
+{
+	uint32_t refcount;
+
+	if (!f->zbd_info)
+		return;
+
+	pthread_mutex_lock(&f->zbd_info->mutex);
+	refcount = --f->zbd_info->refcount;
+	pthread_mutex_unlock(&f->zbd_info->mutex);
+
+	assert((int32_t)refcount >= 0);
+	if (refcount == 0)
+		sfree(f->zbd_info);
+	f->zbd_info = NULL;
+}
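
For illustration: parse_zone_info() walks the device in batches because one BLKREPORTZONE call only returns as many zones as the buffer holds. A minimal standalone walker in the same spirit (editorial sketch, not part of the patch; assumes Linux with <linux/blkzoned.h>, error handling reduced to the bare minimum):

#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/ioctl.h>
#include <unistd.h>
#include <linux/blkzoned.h>

int main(int argc, char **argv)
{
	const unsigned int nrz = 128;	/* zones per ioctl call, arbitrary */
	struct blk_zone_report *hdr;
	__u64 sector = 0;
	unsigned int i;
	int fd;

	if (argc != 2)
		return 1;
	fd = open(argv[1], O_RDONLY);
	if (fd < 0)
		return 1;
	hdr = malloc(sizeof(*hdr) + nrz * sizeof(struct blk_zone));
	for (;;) {
		memset(hdr, 0, sizeof(*hdr));
		hdr->sector = sector;
		hdr->nr_zones = nrz;	/* capacity in; count returned out */
		if (ioctl(fd, BLKREPORTZONE, hdr) < 0 || !hdr->nr_zones)
			break;
		for (i = 0; i < hdr->nr_zones; i++) {
			const struct blk_zone *z = &hdr->zones[i];

			printf("zone @%llu len %llu wp %llu cond %u\n",
			       (unsigned long long)z->start,
			       (unsigned long long)z->len,
			       (unsigned long long)z->wp,
			       (unsigned int)z->cond);
			sector = z->start + z->len;
		}
	}
	free(hdr);
	close(fd);
	return 0;
}
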
+
+/*
+ * Initialize f->zbd_info.
+ *
+ * Returns 0 upon success and a negative error code upon failure.
+ *
+ * Note: this function can only work correctly if it is called before the first
+ * fio fork() call.
+ */
+static int zbd_init_zone_info(struct thread_data *td, struct fio_file *file)
+{
+	struct thread_data *td2;
+	struct fio_file *f2;
+	int i, j, ret;
+
+	for_each_td(td2, i) {
+		for_each_file(td2, f2, j) {
+			if (td2 == td && f2 == file)
+				continue;
+			if (!f2->zbd_info ||
+			    strcmp(f2->file_name, file->file_name) != 0)
+				continue;
+			file->zbd_info = f2->zbd_info;
+			file->zbd_info->refcount++;
+			return 0;
+		}
+	}
+
+	ret = zbd_create_zone_info(td, file);
+	if (ret < 0)
+		td_verror(td, -ret, "zbd_create_zone_info() failed");
+	return ret;
+}
+
+int zbd_init(struct thread_data *td)
+{
+	struct fio_file *f;
+	int i;
+
+	for_each_file(td, f, i) {
+		if (f->filetype != FIO_TYPE_BLOCK)
+			continue;
+		if (zbd_init_zone_info(td, f))
+			return 1;
+	}
+
+	if (!zbd_using_direct_io()) {
+		log_err("Using direct I/O is mandatory for writing to ZBD drives\n\n");
+		return 1;
+	}
+
+	if (!zbd_verify_sizes())
+		return 1;
+
+	if (!zbd_verify_bs())
+		return 1;
+
+	return 0;
+}
+
+/**
+ * zbd_reset_range - reset zones for a range of bytes
+ * @td: FIO thread data.
+ * @f: Fio file for which to reset zones
+ * @offset: Starting offset of the range in units of bytes
+ * @length: Length of the range in units of bytes
+ *
+ * Returns 0 upon success and a negative error code upon failure.
+ */
+static int zbd_reset_range(struct thread_data *td, const struct fio_file *f,
+			   uint64_t offset, uint64_t length)
+{
+	struct blk_zone_range zr = {
+		.sector = offset >> 9,
+		.nr_sectors = length >> 9,
+	};
+	uint32_t zone_idx_b, zone_idx_e;
+	struct fio_zone_info *zb, *ze, *z;
+	int ret = 0;
+
+	assert(f->fd != -1);
+	assert(is_valid_offset(f, offset + length - 1));
+	switch (f->zbd_info->model) {
+	case ZBD_DM_HOST_AWARE:
+	case ZBD_DM_HOST_MANAGED:
+		ret = ioctl(f->fd, BLKRESETZONE, &zr);
+		if (ret < 0) {
+			td_verror(td, errno, "resetting wp failed");
+			log_err("%s: resetting wp for %llu sectors at sector %llu failed (%d).\n",
+				f->file_name, zr.nr_sectors, zr.sector, errno);
+			return ret;
+		}
+		break;
+	case ZBD_DM_NONE:
+		break;
+	}
+
+	zone_idx_b = zbd_zone_idx(f, offset);
+	zb = &f->zbd_info->zone_info[zone_idx_b];
+	zone_idx_e = zbd_zone_idx(f, offset + length);
+	ze = &f->zbd_info->zone_info[zone_idx_e];
+	for (z = zb; z < ze; z++) {
+		pthread_mutex_lock(&z->mutex);
+		pthread_mutex_lock(&f->zbd_info->mutex);
+		f->zbd_info->sectors_with_data -= z->wp - z->start;
+		pthread_mutex_unlock(&f->zbd_info->mutex);
+		z->wp = z->start;
+		z->verify_block = 0;
+		pthread_mutex_unlock(&z->mutex);
+	}
+
+	td->ts.nr_zone_resets += ze - zb;
+
+	return ret;
+}
+
+static unsigned int zbd_zone_nr(struct zoned_block_device_info *zbd_info,
+				struct fio_zone_info *zone)
+{
+	return zone - zbd_info->zone_info;
+}
+
+/**
+ * zbd_reset_zone - reset the write pointer of a single zone
+ * @td: FIO thread data.
+ * @f: FIO file associated with the disk for which to reset a write pointer.
+ * @z: Zone to reset.
+ *
+ * Returns 0 upon success and a negative error code upon failure.
+ */
+static int zbd_reset_zone(struct thread_data *td, const struct fio_file *f,
+			  struct fio_zone_info *z)
+{
+	dprint(FD_ZBD, "%s: resetting wp of zone %u.\n", f->file_name,
+	       zbd_zone_nr(f->zbd_info, z));
+
+	return zbd_reset_range(td, f, z->start, (z+1)->start - z->start);
+}
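
For illustration: zbd_reset_range() above converts byte offsets into 512-byte sectors before issuing BLKRESETZONE. A small editorial sketch of that conversion (values invented, the ioctl itself is omitted):

#include <assert.h>
#include <stdint.h>
#include <linux/blkzoned.h>

/* Byte-to-sector conversion as done by zbd_reset_range(). Both offset
 * and length must be multiples of 512 for the shift to be lossless.
 */
int main(void)
{
	const uint64_t zone_size = 256ULL << 20;	/* 256 MiB zones */
	uint64_t offset = 2 * zone_size;		/* reset zones 2 and 3 */
	uint64_t length = 2 * zone_size;
	struct blk_zone_range zr = {
		.sector = offset >> 9,
		.nr_sectors = length >> 9,
	};

	assert(zr.sector == offset / 512);
	assert(zr.nr_sectors == length / 512);
	/* On a real zoned device, ioctl(fd, BLKRESETZONE, &zr) would be
	 * issued here; omitted in this sketch. */
	return 0;
}
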
+
+/*
+ * Reset a range of zones. Returns 0 upon success and 1 upon failure.
+ * @td: fio thread data.
+ * @f: fio file for which to reset zones
+ * @zb: first zone to reset.
+ * @ze: first zone not to reset.
+ * @all_zones: whether to reset all zones or only those zones for which the
+ *	write pointer is not a multiple of td->o.min_bs[DDIR_WRITE].
+ */
+static int zbd_reset_zones(struct thread_data *td, struct fio_file *f,
+			   struct fio_zone_info *const zb,
+			   struct fio_zone_info *const ze, bool all_zones)
+{
+	struct fio_zone_info *z, *start_z = ze;
+	const uint32_t min_bs = td->o.min_bs[DDIR_WRITE];
+	bool reset_wp;
+	int res = 0;
+
+	dprint(FD_ZBD, "%s: examining zones %u .. %u\n", f->file_name,
+	       zbd_zone_nr(f->zbd_info, zb), zbd_zone_nr(f->zbd_info, ze));
+	assert(f->fd != -1);
+	for (z = zb; z < ze; z++) {
+		pthread_mutex_lock(&z->mutex);
+		switch (z->type) {
+		case BLK_ZONE_TYPE_SEQWRITE_REQ:
+			reset_wp = all_zones ? z->wp != z->start :
+					(td->o.td_ddir & TD_DDIR_WRITE) &&
+					z->wp % min_bs != 0;
+			if (start_z == ze && reset_wp) {
+				start_z = z;
+			} else if (start_z < ze && !reset_wp) {
+				dprint(FD_ZBD,
+				       "%s: resetting zones %u .. %u\n",
+				       f->file_name,
+				       zbd_zone_nr(f->zbd_info, start_z),
+				       zbd_zone_nr(f->zbd_info, z));
+				if (zbd_reset_range(td, f, start_z->start,
+						z->start - start_z->start) < 0)
+					res = 1;
+				start_z = ze;
+			}
+			break;
+		default:
+			if (start_z == ze)
+				break;
+			dprint(FD_ZBD, "%s: resetting zones %u .. %u\n",
+			       f->file_name, zbd_zone_nr(f->zbd_info, start_z),
+			       zbd_zone_nr(f->zbd_info, z));
+			if (zbd_reset_range(td, f, start_z->start,
+					    z->start - start_z->start) < 0)
+				res = 1;
+			start_z = ze;
+			break;
+		}
+	}
+	if (start_z < ze) {
+		dprint(FD_ZBD, "%s: resetting zones %u .. %u\n", f->file_name,
+		       zbd_zone_nr(f->zbd_info, start_z),
+		       zbd_zone_nr(f->zbd_info, z));
+		if (zbd_reset_range(td, f, start_z->start,
+				    z->start - start_z->start) < 0)
+			res = 1;
+	}
+	for (z = zb; z < ze; z++)
+		pthread_mutex_unlock(&z->mutex);
+
+	return res;
+}
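
For illustration: zbd_reset_zones() above coalesces consecutive zones that need a reset into a single ranged call. The same pattern reduced to plain arrays (editorial sketch, invented data):

#include <stdbool.h>
#include <stdio.h>

/* Merge runs of flagged indices into half-open [start, end) ranges. */
static void emit_ranges(const bool *flag, int n)
{
	int i, start = -1;

	for (i = 0; i < n; i++) {
		if (flag[i] && start < 0) {
			start = i;		/* open a run */
		} else if (!flag[i] && start >= 0) {
			printf("reset [%d, %d)\n", start, i);
			start = -1;		/* close the run */
		}
	}
	if (start >= 0)				/* run extends to the end */
		printf("reset [%d, %d)\n", start, n);
}

int main(void)
{
	const bool need_reset[] = { true, true, false, true, false, true, true };

	emit_ranges(need_reset, 7);	/* prints [0,2), [3,4), [5,7) */
	return 0;
}
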
+
+/*
+ * Reset zbd_info.write_cnt, the counter that counts down towards the next
+ * zone reset.
+ */
+static void zbd_reset_write_cnt(const struct thread_data *td,
+				const struct fio_file *f)
+{
+	assert(0 <= td->o.zrf.u.f && td->o.zrf.u.f <= 1);
+
+	pthread_mutex_lock(&f->zbd_info->mutex);
+	f->zbd_info->write_cnt = td->o.zrf.u.f ?
+		min(1.0 / td->o.zrf.u.f, 0.0 + UINT_MAX) : UINT_MAX;
+	pthread_mutex_unlock(&f->zbd_info->mutex);
+}
+
+static bool zbd_dec_and_reset_write_cnt(const struct thread_data *td,
+					const struct fio_file *f)
+{
+	uint32_t write_cnt = 0;
+
+	pthread_mutex_lock(&f->zbd_info->mutex);
+	assert(f->zbd_info->write_cnt);
+	if (f->zbd_info->write_cnt)
+		write_cnt = --f->zbd_info->write_cnt;
+	if (write_cnt == 0)
+		zbd_reset_write_cnt(td, f);
+	pthread_mutex_unlock(&f->zbd_info->mutex);
+
+	return write_cnt == 0;
+}
+
+enum swd_action {
+	CHECK_SWD,
+	SET_SWD,
+};
+
+/* Calculate the number of sectors with data (swd) and perform action 'a' */
+static uint64_t zbd_process_swd(const struct fio_file *f, enum swd_action a)
+{
+	struct fio_zone_info *zb, *ze, *z;
+	uint64_t swd = 0;
+
+	zb = &f->zbd_info->zone_info[zbd_zone_idx(f, f->file_offset)];
+	ze = &f->zbd_info->zone_info[zbd_zone_idx(f, f->file_offset +
+						  f->io_size)];
+	for (z = zb; z < ze; z++) {
+		pthread_mutex_lock(&z->mutex);
+		swd += z->wp - z->start;
+	}
+	pthread_mutex_lock(&f->zbd_info->mutex);
+	switch (a) {
+	case CHECK_SWD:
+		assert(f->zbd_info->sectors_with_data == swd);
+		break;
+	case SET_SWD:
+		f->zbd_info->sectors_with_data = swd;
+		break;
+	}
+	pthread_mutex_unlock(&f->zbd_info->mutex);
+	for (z = zb; z < ze; z++)
+		pthread_mutex_unlock(&z->mutex);
+
+	return swd;
+}
+
+/*
+ * The swd check is useful for debugging but takes too much time to leave
+ * it enabled all the time. Hence it is disabled by default.
+ */
+static const bool enable_check_swd = false;
+
+/* Check whether the value of zbd_info.sectors_with_data is correct. */
+static void zbd_check_swd(const struct fio_file *f)
+{
+	if (!enable_check_swd)
+		return;
+
+	zbd_process_swd(f, CHECK_SWD);
+}
+
+static void zbd_init_swd(struct fio_file *f)
+{
+	uint64_t swd;
+
+	swd = zbd_process_swd(f, SET_SWD);
+	dprint(FD_ZBD, "%s(%s): swd = %" PRIu64 "\n", __func__, f->file_name,
+	       swd);
+}
+
+void zbd_file_reset(struct thread_data *td, struct fio_file *f)
+{
+	struct fio_zone_info *zb, *ze;
+	uint32_t zone_idx_e;
+
+	if (!f->zbd_info)
+		return;
+
+	zb = &f->zbd_info->zone_info[zbd_zone_idx(f, f->file_offset)];
+	zone_idx_e = zbd_zone_idx(f, f->file_offset + f->io_size);
+	ze = &f->zbd_info->zone_info[zone_idx_e];
+	zbd_init_swd(f);
+	/*
+	 * If data verification is enabled, reset the affected zones before
+	 * writing any data so that a zone reset does not have to be issued
+	 * while data is being written, since such a reset would cause data
+	 * loss.
+	 */
+	zbd_reset_zones(td, f, zb, ze, td->o.verify != VERIFY_NONE &&
+			(td->o.td_ddir & TD_DDIR_WRITE) &&
+			td->runstate != TD_VERIFYING);
+	zbd_reset_write_cnt(td, f);
+}
+
+/* The caller must hold f->zbd_info->mutex. */
+static bool is_zone_open(const struct thread_data *td, const struct fio_file *f,
+			 unsigned int zone_idx)
+{
+	struct zoned_block_device_info *zbdi = f->zbd_info;
+	int i;
+
+	assert(td->o.max_open_zones <= ARRAY_SIZE(zbdi->open_zones));
+	assert(zbdi->num_open_zones <= td->o.max_open_zones);
+
+	for (i = 0; i < zbdi->num_open_zones; i++)
+		if (zbdi->open_zones[i] == zone_idx)
+			return true;
+
+	return false;
+}
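
For illustration: the arithmetic in zbd_reset_write_cnt() above maps the zone_reset_frequency fraction to a countdown of roughly 1/frequency writes between resets. A minimal numeric sketch (editorial; helper name and values invented, fractions chosen to be exactly representable in binary floating point):

#include <assert.h>
#include <limits.h>

static unsigned int write_cnt_for(double freq)
{
	double cnt;

	if (freq == 0.0)
		return UINT_MAX;	/* 0 disables the mechanism */
	cnt = 1.0 / freq;
	return cnt < (double)UINT_MAX ? (unsigned int)cnt : UINT_MAX;
}

int main(void)
{
	assert(write_cnt_for(0.0) == UINT_MAX);	/* never reset */
	assert(write_cnt_for(1.0) == 1);	/* reset after every write */
	assert(write_cnt_for(0.5) == 2);	/* reset every other write */
	assert(write_cnt_for(0.25) == 4);	/* reset every 4th write */
	return 0;
}
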
+
+/*
+ * Open a ZBD zone if it was not yet open. Returns true if either the zone was
+ * already open or if opening a new zone is allowed. Returns false if the zone
+ * was not yet open and opening a new zone would cause the zone limit to be
+ * exceeded.
+ */
+static bool zbd_open_zone(struct thread_data *td, const struct io_u *io_u,
+			  uint32_t zone_idx)
+{
+	const uint32_t min_bs = td->o.min_bs[DDIR_WRITE];
+	const struct fio_file *f = io_u->file;
+	struct fio_zone_info *z = &f->zbd_info->zone_info[zone_idx];
+	bool res = true;
+
+	if (z->cond == BLK_ZONE_COND_OFFLINE)
+		return false;
+
+	/*
+	 * Skip full zones with data verification enabled because resetting a
+	 * zone causes data loss and hence causes verification to fail.
+	 */
+	if (td->o.verify != VERIFY_NONE && zbd_zone_full(f, z, min_bs))
+		return false;
+
+	/* Zero means no limit */
+	if (!td->o.max_open_zones)
+		return true;
+
+	pthread_mutex_lock(&f->zbd_info->mutex);
+	if (is_zone_open(td, f, zone_idx))
+		goto out;
+	res = false;
+	if (f->zbd_info->num_open_zones >= td->o.max_open_zones)
+		goto out;
+	dprint(FD_ZBD, "%s: opening zone %d\n", f->file_name, zone_idx);
+	f->zbd_info->open_zones[f->zbd_info->num_open_zones++] = zone_idx;
+	z->open = 1;
+	res = true;
+
+out:
+	pthread_mutex_unlock(&f->zbd_info->mutex);
+	return res;
+}
+
+/* The caller must hold f->zbd_info->mutex */
+static void zbd_close_zone(struct thread_data *td, const struct fio_file *f,
+			   unsigned int open_zone_idx)
+{
+	uint32_t zone_idx;
+
+	assert(open_zone_idx < f->zbd_info->num_open_zones);
+	zone_idx = f->zbd_info->open_zones[open_zone_idx];
+	memmove(f->zbd_info->open_zones + open_zone_idx,
+		f->zbd_info->open_zones + open_zone_idx + 1,
+		(FIO_MAX_OPEN_ZBD_ZONES - (open_zone_idx + 1)) *
+		sizeof(f->zbd_info->open_zones[0]));
+	f->zbd_info->num_open_zones--;
+	f->zbd_info->zone_info[zone_idx].open = 0;
+}
+
+/*
+ * Modify the offset of an I/O unit that does not refer to an open zone such
+ * that it refers to an open zone. Close an open zone and open a new zone if
+ * necessary. This algorithm can only work correctly if all write pointers are
+ * a multiple of the fio block size. The caller must neither hold z->mutex
+ * nor f->zbd_info->mutex. Returns with z->mutex held upon success.
+ */
+static struct fio_zone_info *zbd_convert_to_open_zone(struct thread_data *td,
+						      struct io_u *io_u)
+{
+	const uint32_t min_bs = td->o.min_bs[io_u->ddir];
+	const struct fio_file *f = io_u->file;
+	struct fio_zone_info *z;
+	unsigned int open_zone_idx = -1;
+	uint32_t zone_idx, new_zone_idx;
+	int i;
+
+	assert(is_valid_offset(f, io_u->offset));
+
+	if (td->o.max_open_zones) {
+		/*
+		 * This statement accesses f->zbd_info->open_zones[] on purpose
+		 * without locking.
+		 */
+		zone_idx = f->zbd_info->open_zones[(io_u->offset -
+						    f->file_offset) *
+				f->zbd_info->num_open_zones / f->io_size];
+	} else {
+		zone_idx = zbd_zone_idx(f, io_u->offset);
+	}
+	dprint(FD_ZBD, "%s(%s): starting from zone %d (offset %lld, buflen %lld)\n",
+	       __func__, f->file_name, zone_idx, io_u->offset, io_u->buflen);
+
+	/*
+	 * Since z->mutex is the outer lock and f->zbd_info->mutex the inner
+	 * lock it can happen that the state of the zone with index zone_idx
+	 * has changed after 'z' has been assigned and before f->zbd_info->mutex
+	 * has been obtained. Hence the loop.
+	 */
+	for (;;) {
+		z = &f->zbd_info->zone_info[zone_idx];
+
+		pthread_mutex_lock(&z->mutex);
+		pthread_mutex_lock(&f->zbd_info->mutex);
+		if (td->o.max_open_zones == 0)
+			goto examine_zone;
+		if (f->zbd_info->num_open_zones == 0) {
+			pthread_mutex_unlock(&f->zbd_info->mutex);
+			pthread_mutex_unlock(&z->mutex);
+			dprint(FD_ZBD, "%s(%s): no zones are open\n",
+			       __func__, f->file_name);
+			return NULL;
+		}
+		open_zone_idx = (io_u->offset - f->file_offset) *
+			f->zbd_info->num_open_zones / f->io_size;
+		assert(open_zone_idx < f->zbd_info->num_open_zones);
+		new_zone_idx = f->zbd_info->open_zones[open_zone_idx];
+		if (new_zone_idx == zone_idx)
+			break;
+		zone_idx = new_zone_idx;
+		pthread_mutex_unlock(&f->zbd_info->mutex);
+		pthread_mutex_unlock(&z->mutex);
+	}
+
+	/* Both z->mutex and f->zbd_info->mutex are held. */
+
+examine_zone:
+	if (z->wp + min_bs <= (z+1)->start) {
+		pthread_mutex_unlock(&f->zbd_info->mutex);
+		goto out;
+	}
+	dprint(FD_ZBD, "%s(%s): closing zone %d\n", __func__, f->file_name,
+	       zone_idx);
+	if (td->o.max_open_zones)
+		zbd_close_zone(td, f, open_zone_idx);
+	pthread_mutex_unlock(&f->zbd_info->mutex);
+
+	/* Only z->mutex is held. */
+
+	/* Zone 'z' is full, so try to open a new zone. */
+	for (i = f->io_size / f->zbd_info->zone_size; i > 0; i--) {
+		zone_idx++;
+		pthread_mutex_unlock(&z->mutex);
+		z++;
+		if (!is_valid_offset(f, z->start)) {
+			/* Wrap-around. */
+			zone_idx = zbd_zone_idx(f, f->file_offset);
+			z = &f->zbd_info->zone_info[zone_idx];
+		}
+		assert(is_valid_offset(f, z->start));
+		pthread_mutex_lock(&z->mutex);
+		if (z->open)
+			continue;
+		if (zbd_open_zone(td, io_u, zone_idx))
+			goto out;
+	}
+
+	/* Only z->mutex is held. */
+
+	/* Check whether the write fits in any of the already opened zones. */
+	pthread_mutex_lock(&f->zbd_info->mutex);
+	for (i = 0; i < f->zbd_info->num_open_zones; i++) {
+		zone_idx = f->zbd_info->open_zones[i];
+		pthread_mutex_unlock(&f->zbd_info->mutex);
+		pthread_mutex_unlock(&z->mutex);
+
+		z = &f->zbd_info->zone_info[zone_idx];
+
+		pthread_mutex_lock(&z->mutex);
+		if (z->wp + min_bs <= (z+1)->start)
+			goto out;
+		pthread_mutex_lock(&f->zbd_info->mutex);
+	}
+	pthread_mutex_unlock(&f->zbd_info->mutex);
+	pthread_mutex_unlock(&z->mutex);
+	dprint(FD_ZBD, "%s(%s): did not open another zone\n", __func__,
+	       f->file_name);
+	return NULL;
+
+out:
+	dprint(FD_ZBD, "%s(%s): returning zone %d\n", __func__, f->file_name,
+	       zone_idx);
+	io_u->offset = z->start;
+	return z;
+}
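
For illustration: the retry loop in zbd_convert_to_open_zone() above exists because the outer per-zone lock must be taken before the inner table lock, so the zone chosen before locking may be stale once both locks are held. A generic editorial sketch of that revalidate-and-retry idiom (all types and names invented):

#include <pthread.h>

struct item {
	pthread_mutex_t lock;
};

struct table {
	pthread_mutex_t lock;
	struct item *items;
	int current;		/* index that may change concurrently */
};

static struct item *lock_current(struct table *t)
{
	int idx = t->current;	/* unlocked peek, may be stale */

	for (;;) {
		struct item *it = &t->items[idx];

		pthread_mutex_lock(&it->lock);	/* outer lock */
		pthread_mutex_lock(&t->lock);	/* inner lock */
		if (t->current == idx)
			break;			/* still valid: keep it locked */
		idx = t->current;		/* stale: drop both and retry */
		pthread_mutex_unlock(&t->lock);
		pthread_mutex_unlock(&it->lock);
	}
	pthread_mutex_unlock(&t->lock);
	return &t->items[idx];	/* returned with the item lock held */
}

int main(void)
{
	struct item items[2] = {
		{ PTHREAD_MUTEX_INITIALIZER },
		{ PTHREAD_MUTEX_INITIALIZER },
	};
	struct table t = { PTHREAD_MUTEX_INITIALIZER, items, 1 };
	struct item *it = lock_current(&t);

	pthread_mutex_unlock(&it->lock);
	return 0;
}
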
+
+/* The caller must hold z->mutex. */
+static struct fio_zone_info *zbd_replay_write_order(struct thread_data *td,
+						    struct io_u *io_u,
+						    struct fio_zone_info *z)
+{
+	const struct fio_file *f = io_u->file;
+	const uint32_t min_bs = td->o.min_bs[DDIR_WRITE];
+
+	if (!zbd_open_zone(td, io_u, z - f->zbd_info->zone_info)) {
+		pthread_mutex_unlock(&z->mutex);
+		z = zbd_convert_to_open_zone(td, io_u);
+		assert(z);
+	}
+
+	if (z->verify_block * min_bs >= f->zbd_info->zone_size)
+		log_err("%s: %d * %d >= %llu\n", f->file_name, z->verify_block,
+			min_bs, (unsigned long long) f->zbd_info->zone_size);
+	io_u->offset = z->start + z->verify_block++ * min_bs;
+	return z;
+}
+
+/*
+ * Find another zone for which @io_u fits below the write pointer. Start
+ * searching in zones @zb + 1 .. @zl and continue searching in zones
+ * @zf .. @zb - 1.
+ *
+ * Either returns NULL or returns a zone pointer and holds the mutex for that
+ * zone.
+ */
+static struct fio_zone_info *
+zbd_find_zone(struct thread_data *td, struct io_u *io_u,
+	      struct fio_zone_info *zb, struct fio_zone_info *zl)
+{
+	const uint32_t min_bs = td->o.min_bs[io_u->ddir];
+	const struct fio_file *f = io_u->file;
+	struct fio_zone_info *z1, *z2;
+	const struct fio_zone_info *const zf =
+		&f->zbd_info->zone_info[zbd_zone_idx(f, f->file_offset)];
+
+	/*
+	 * Skip to the next non-empty zone in case of sequential I/O and to
+	 * the nearest non-empty zone in case of random I/O.
+	 */
+	for (z1 = zb + 1, z2 = zb - 1; z1 < zl || z2 >= zf; z1++, z2--) {
+		if (z1 < zl && z1->cond != BLK_ZONE_COND_OFFLINE) {
+			pthread_mutex_lock(&z1->mutex);
+			if (z1->start + min_bs <= z1->wp)
+				return z1;
+			pthread_mutex_unlock(&z1->mutex);
+		} else if (!td_random(td)) {
+			break;
+		}
+		if (td_random(td) && z2 >= zf &&
+		    z2->cond != BLK_ZONE_COND_OFFLINE) {
+			pthread_mutex_lock(&z2->mutex);
+			if (z2->start + min_bs <= z2->wp)
+				return z2;
+			pthread_mutex_unlock(&z2->mutex);
+		}
+	}
+	dprint(FD_ZBD, "%s: adjusting random read offset failed\n",
+	       f->file_name);
+	return NULL;
+}
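
For illustration: zbd_find_zone() above walks outward from a starting zone with two cursors, alternating forward and backward, and stops at the first zone that satisfies the predicate. A reduced editorial sketch with invented data:

#include <stdio.h>

/* Two-cursor nearest-match search over a plain array. */
static int find_nearest(const int *has_data, int n, int start)
{
	int fwd, bwd;

	for (fwd = start + 1, bwd = start - 1; fwd < n || bwd >= 0;
	     fwd++, bwd--) {
		if (fwd < n && has_data[fwd])
			return fwd;
		if (bwd >= 0 && has_data[bwd])
			return bwd;
	}
	return -1;
}

int main(void)
{
	const int has_data[] = { 0, 1, 0, 0, 0, 0, 1, 0 };

	printf("%d\n", find_nearest(has_data, 8, 4));	/* prints 6 */
	return 0;
}
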
+
+/**
+ * zbd_queue_io - update the write pointer of a sequential zone
+ * @io_u: I/O unit
+ * @q: queueing status (busy, completed or queued).
+ * @success: Whether or not the I/O unit has been queued successfully
+ *
+ * For write and trim operations, update the write pointer of the I/O unit
+ * target zone.
+ */
+static void zbd_queue_io(struct io_u *io_u, int q, bool success)
+{
+	const struct fio_file *f = io_u->file;
+	struct zoned_block_device_info *zbd_info = f->zbd_info;
+	struct fio_zone_info *z;
+	uint32_t zone_idx;
+	uint64_t zone_end;
+
+	if (!zbd_info)
+		return;
+
+	zone_idx = zbd_zone_idx(f, io_u->offset);
+	assert(zone_idx < zbd_info->nr_zones);
+	z = &zbd_info->zone_info[zone_idx];
+
+	if (z->type != BLK_ZONE_TYPE_SEQWRITE_REQ)
+		return;
+
+	if (!success)
+		goto unlock;
+
+	dprint(FD_ZBD,
+	       "%s: queued I/O (%lld, %llu) for zone %u\n",
+	       f->file_name, io_u->offset, io_u->buflen, zone_idx);
+
+	switch (io_u->ddir) {
+	case DDIR_WRITE:
+		zone_end = min((uint64_t)(io_u->offset + io_u->buflen),
+			       (z + 1)->start);
+		pthread_mutex_lock(&zbd_info->mutex);
+		/*
+		 * z->wp > zone_end means that one or more I/O errors
+		 * have occurred.
+		 */
+		if (z->wp <= zone_end)
+			zbd_info->sectors_with_data += zone_end - z->wp;
+		pthread_mutex_unlock(&zbd_info->mutex);
+		z->wp = zone_end;
+		break;
+	case DDIR_TRIM:
+		assert(z->wp == z->start);
+		break;
+	default:
+		break;
+	}
+
+unlock:
+	if (!success || q != FIO_Q_QUEUED) {
+		/* BUSY or COMPLETED: unlock the zone */
+		pthread_mutex_unlock(&z->mutex);
+		io_u->zbd_put_io = NULL;
+	}
+}
+
+/**
+ * zbd_put_io - Unlock an I/O unit target zone lock
+ * @io_u: I/O unit
+ */
+static void zbd_put_io(const struct io_u *io_u)
+{
+	const struct fio_file *f = io_u->file;
+	struct zoned_block_device_info *zbd_info = f->zbd_info;
+	struct fio_zone_info *z;
+	uint32_t zone_idx;
+	int ret;
+
+	if (!zbd_info)
+		return;
+
+	zone_idx = zbd_zone_idx(f, io_u->offset);
+	assert(zone_idx < zbd_info->nr_zones);
+	z = &zbd_info->zone_info[zone_idx];
+
+	if (z->type != BLK_ZONE_TYPE_SEQWRITE_REQ)
+		return;
+
+	dprint(FD_ZBD,
+	       "%s: terminate I/O (%lld, %llu) for zone %u\n",
+	       f->file_name, io_u->offset, io_u->buflen, zone_idx);
+
+	ret = pthread_mutex_unlock(&z->mutex);
+	assert(ret == 0);
+	zbd_check_swd(f);
+}
+
+bool zbd_unaligned_write(int error_code)
+{
+	switch (error_code) {
+	case EIO:
+	case EREMOTEIO:
+		return true;
+	}
+	return false;
+}
+
+/**
+ * setup_zbd_zone_mode - handle zoneskip as necessary for ZBD drives
+ * @td: FIO thread data.
+ * @io_u: FIO I/O unit.
+ *
+ * For sequential workloads, change the file offset to skip zoneskip bytes when
+ * no more IO can be performed in the current zone.
+ * - For read workloads, zoneskip is applied when the io has reached the end of
+ *   the zone or the zone write position (when td->o.read_beyond_wp is false).
+ * - For write workloads, zoneskip is applied when the zone is full.
+ * This applies only to read and write operations.
+ */
+void setup_zbd_zone_mode(struct thread_data *td, struct io_u *io_u)
+{
+	struct fio_file *f = io_u->file;
+	enum fio_ddir ddir = io_u->ddir;
+	struct fio_zone_info *z;
+	uint32_t zone_idx;
+
+	assert(td->o.zone_mode == ZONE_MODE_ZBD);
+	assert(td->o.zone_size);
+
+	/*
+	 * zone_skip is valid only for sequential workloads.
+	 */
+	if (td_random(td) || !td->o.zone_skip)
+		return;
+
+	/*
+	 * It is time to switch to a new zone if:
+	 * - zone_bytes == zone_size bytes have already been accessed
+	 * - The last position reached the end of the current zone.
+	 * - For reads with td->o.read_beyond_wp == false, the last position
+	 *   reached the zone write pointer.
+	 */
+	zone_idx = zbd_zone_idx(f, f->last_pos[ddir]);
+	z = &f->zbd_info->zone_info[zone_idx];
+
+	if (td->zone_bytes >= td->o.zone_size ||
+	    f->last_pos[ddir] >= (z+1)->start ||
+	    (ddir == DDIR_READ &&
+	     (!td->o.read_beyond_wp) && f->last_pos[ddir] >= z->wp)) {
+		/*
+		 * Skip zones.
+		 */
+		td->zone_bytes = 0;
+		f->file_offset += td->o.zone_size + td->o.zone_skip;
+
+		/*
+		 * Wrap from the beginning, if we exceed the file size
+		 */
+		if (f->file_offset >= f->real_file_size)
+			f->file_offset = get_start_offset(td, f);
+
+		f->last_pos[ddir] = f->file_offset;
+		td->io_skip_bytes += td->o.zone_skip;
+	}
+}
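
For illustration: the skip step in setup_zbd_zone_mode() above jumps over the zone just processed plus zone_skip bytes and wraps at end of file. A small editorial sketch with invented numbers (zone_skip must be a multiple of the zone size, as enforced by zbd_verify_sizes()):

#include <assert.h>
#include <stdint.h>

int main(void)
{
	const uint64_t zone_size = 4096, zone_skip = 8192;
	const uint64_t file_size = 16 * zone_size, start = 0;
	uint64_t offset = 0;

	/* One skip step: the current zone plus zone_skip bytes. */
	offset += zone_size + zone_skip;
	assert(offset == 12288);	/* landed at the start of zone 3 */

	/* Wrap from the beginning once the file size is exceeded. */
	offset = 15 * zone_size + zone_size + zone_skip;
	if (offset >= file_size)
		offset = start;
	assert(offset == 0);
	return 0;
}
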
+
+/**
+ * zbd_adjust_block - adjust the offset and length as necessary for ZBD drives
+ * @td: FIO thread data.
+ * @io_u: FIO I/O unit.
+ *
+ * Locking strategy: returns with z->mutex locked if and only if z refers
+ * to a sequential zone and if io_u_accept is returned. z is the zone that
+ * corresponds to io_u->offset at the end of this function.
+ */
+enum io_u_action zbd_adjust_block(struct thread_data *td, struct io_u *io_u)
+{
+	const struct fio_file *f = io_u->file;
+	uint32_t zone_idx_b;
+	struct fio_zone_info *zb, *zl, *orig_zb;
+	uint32_t orig_len = io_u->buflen;
+	uint32_t min_bs = td->o.min_bs[io_u->ddir];
+	uint64_t new_len;
+	int64_t range;
+
+	if (!f->zbd_info)
+		return io_u_accept;
+
+	assert(is_valid_offset(f, io_u->offset));
+	assert(io_u->buflen);
+	zone_idx_b = zbd_zone_idx(f, io_u->offset);
+	zb = &f->zbd_info->zone_info[zone_idx_b];
+	orig_zb = zb;
+
+	/* Accept the I/O offset for conventional zones. */
+	if (zb->type == BLK_ZONE_TYPE_CONVENTIONAL)
+		return io_u_accept;
+
+	/*
+	 * Accept the I/O offset for reads if reading beyond the write pointer
+	 * is enabled.
+	 */
+	if (zb->cond != BLK_ZONE_COND_OFFLINE &&
+	    io_u->ddir == DDIR_READ && td->o.read_beyond_wp)
+		return io_u_accept;
+
+	zbd_check_swd(f);
+
+	/*
+	 * Lock the io_u target zone. The zone will be unlocked if io_u offset
+	 * is changed or when io_u completes and zbd_put_io() executed.
+	 * To avoid multiple jobs doing asynchronous I/Os from deadlocking each
+	 * other waiting for zone locks when building an io_u batch, first
+	 * only trylock the zone. If the zone is already locked by another job,
+	 * process the currently queued I/Os so that I/O progress is made and
+	 * zones unlocked.
+	 */
+	if (pthread_mutex_trylock(&zb->mutex) != 0) {
+		if (!td_ioengine_flagged(td, FIO_SYNCIO))
+			io_u_quiesce(td);
+		pthread_mutex_lock(&zb->mutex);
+	}
+
+	switch (io_u->ddir) {
+	case DDIR_READ:
+		if (td->runstate == TD_VERIFYING) {
+			zb = zbd_replay_write_order(td, io_u, zb);
+			goto accept;
+		}
+		/*
+		 * Check that there is enough written data in the zone to do an
+		 * I/O of at least min_bs B. If there isn't, find a new zone for
+		 * the I/O.
+		 */
+		range = zb->cond != BLK_ZONE_COND_OFFLINE ?
+			zb->wp - zb->start : 0;
+		if (range < min_bs ||
+		    ((!td_random(td)) && (io_u->offset + min_bs > zb->wp))) {
+			pthread_mutex_unlock(&zb->mutex);
+			zl = &f->zbd_info->zone_info[zbd_zone_idx(f,
+						f->file_offset + f->io_size)];
+			zb = zbd_find_zone(td, io_u, zb, zl);
+			if (!zb) {
+				dprint(FD_ZBD,
+				       "%s: zbd_find_zone(%lld, %llu) failed\n",
+				       f->file_name, io_u->offset,
+				       io_u->buflen);
+				goto eof;
+			}
+			/*
+			 * zbd_find_zone() returned a zone with a range of at
+			 * least min_bs.
+			 */
+			range = zb->wp - zb->start;
+			assert(range >= min_bs);
+
+			if (!td_random(td))
+				io_u->offset = zb->start;
+		}
+		/*
+		 * Make sure the I/O is within the zone valid data range while
+		 * maximizing the I/O size and preserving randomness.
+		 */
+		if (range <= io_u->buflen)
+			io_u->offset = zb->start;
+		else if (td_random(td))
+			io_u->offset = zb->start +
+				((io_u->offset - orig_zb->start) %
+				 (range - io_u->buflen)) / min_bs * min_bs;
+		/*
+		 * Make sure the I/O does not cross over the zone wp position.
+		 */
+		new_len = min((unsigned long long)io_u->buflen,
+			      (unsigned long long)(zb->wp - io_u->offset));
+		new_len = new_len / min_bs * min_bs;
+		if (new_len < io_u->buflen) {
+			io_u->buflen = new_len;
+			dprint(FD_IO, "Changed length from %u into %llu\n",
+			       orig_len, io_u->buflen);
+		}
+		assert(zb->start <= io_u->offset);
+		assert(io_u->offset + io_u->buflen <= zb->wp);
+		goto accept;
+	case DDIR_WRITE:
+		if (io_u->buflen > f->zbd_info->zone_size)
+			goto eof;
+		if (!zbd_open_zone(td, io_u, zone_idx_b)) {
+			pthread_mutex_unlock(&zb->mutex);
+			zb = zbd_convert_to_open_zone(td, io_u);
+			if (!zb)
+				goto eof;
+			zone_idx_b = zb - f->zbd_info->zone_info;
+		}
+		/* Check whether the zone reset threshold has been exceeded */
+		if (td->o.zrf.u.f) {
+			if (f->zbd_info->sectors_with_data >=
+			    f->io_size * td->o.zrt.u.f &&
+			    zbd_dec_and_reset_write_cnt(td, f)) {
+				zb->reset_zone = 1;
+			}
+		}
+		/* Reset the zone pointer if necessary */
+		if (zb->reset_zone || zbd_zone_full(f, zb, min_bs)) {
+			assert(td->o.verify == VERIFY_NONE);
+			/*
+			 * Since previous write requests may have been submitted
+			 * asynchronously and since we will submit the zone
+			 * reset synchronously, wait until previously submitted
+			 * write requests have completed before issuing a
+			 * zone reset.
+			 */
+			io_u_quiesce(td);
+			zb->reset_zone = 0;
+			if (zbd_reset_zone(td, f, zb) < 0)
+				goto eof;
+		}
+		/* Make writes occur at the write pointer */
+		assert(!zbd_zone_full(f, zb, min_bs));
+		io_u->offset = zb->wp;
+		if (!is_valid_offset(f, io_u->offset)) {
+			dprint(FD_ZBD, "Dropped request with offset %llu\n",
+			       io_u->offset);
+			goto eof;
+		}
+		/*
+		 * Make sure that the buflen is a multiple of the minimal
+		 * block size. Give up if shrinking would make the request too
+		 * small.
+		 */
+		new_len = min((unsigned long long)io_u->buflen,
+			      (zb + 1)->start - io_u->offset);
+		new_len = new_len / min_bs * min_bs;
+		if (new_len == io_u->buflen)
+			goto accept;
+		if (new_len >= min_bs) {
+			io_u->buflen = new_len;
+			dprint(FD_IO, "Changed length from %u into %llu\n",
+			       orig_len, io_u->buflen);
+			goto accept;
+		}
+		log_err("Zone remainder %lld smaller than minimum block size %d\n",
+			((zb + 1)->start - io_u->offset),
+			min_bs);
+		goto eof;
+	case DDIR_TRIM:
+		/* fall-through */
+	case DDIR_SYNC:
+	case DDIR_DATASYNC:
+	case DDIR_SYNC_FILE_RANGE:
+	case DDIR_WAIT:
+	case DDIR_LAST:
+	case DDIR_INVAL:
+		goto accept;
+	}
+
+	assert(false);
+
+accept:
+	assert(zb);
+	assert(zb->cond != BLK_ZONE_COND_OFFLINE);
+	assert(!io_u->zbd_queue_io);
+	assert(!io_u->zbd_put_io);
+	io_u->zbd_queue_io = zbd_queue_io;
+	io_u->zbd_put_io = zbd_put_io;
+	return io_u_accept;
+
+eof:
+	if (zb)
+		pthread_mutex_unlock(&zb->mutex);
+	return io_u_eof;
+}
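
For illustration: the length clamping in zbd_adjust_block() first caps the request at the remaining room, then rounds down to a multiple of the minimum block size. A small editorial sketch with invented numbers:

#include <assert.h>
#include <stdint.h>

int main(void)
{
	const uint64_t min_bs = 4096;
	uint64_t buflen = 128 * 1024;	/* requested: 128 KiB */
	uint64_t room = 66000;		/* bytes left before the limit */
	uint64_t new_len = buflen < room ? buflen : room;

	new_len = new_len / min_bs * min_bs;	/* round down to min_bs */
	assert(new_len == 65536);		/* 16 blocks of 4 KiB */
	/* A result below min_bs would mean the request cannot be shrunk
	 * safely and has to be rejected instead. */
	return 0;
}
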
+
+/* Return a string with ZBD statistics */
+char *zbd_write_status(const struct thread_stat *ts)
+{
+	char *res;
+
+	if (asprintf(&res, "; %llu zone resets", (unsigned long long) ts->nr_zone_resets) < 0)
+		return NULL;
+	return res;
+}
diff --git a/zbd.h b/zbd.h
new file mode 100644
index 0000000..e0a7e44
--- /dev/null
+++ b/zbd.h
@@ -0,0 +1,159 @@
+/*
+ * Copyright (C) 2018 Western Digital Corporation or its affiliates.
+ *
+ * This file is released under the GPL.
+ */
+
+#ifndef FIO_ZBD_H
+#define FIO_ZBD_H
+
+#include <inttypes.h>
+#include "fio.h"	/* FIO_MAX_OPEN_ZBD_ZONES */
+#ifdef CONFIG_LINUX_BLKZONED
+#include <linux/blkzoned.h>
+#endif
+
+struct fio_file;
+
+/*
+ * Zoned block device models.
+ */
+enum blk_zoned_model {
+	ZBD_DM_NONE,		/* Regular block device */
+	ZBD_DM_HOST_AWARE,	/* Host-aware zoned block device */
+	ZBD_DM_HOST_MANAGED,	/* Host-managed zoned block device */
+};
+
+enum io_u_action {
+	io_u_accept	= 0,
+	io_u_eof	= 1,
+};
+
+/**
+ * struct fio_zone_info - information about a single ZBD zone
+ * @mutex: protects the modifiable members in this structure
+ * @start: zone start location (bytes)
+ * @wp: zone write pointer location (bytes)
+ * @verify_block: number of blocks that have been verified for this zone
+ * @type: zone type (BLK_ZONE_TYPE_*)
+ * @cond: zone state (BLK_ZONE_COND_*)
+ * @open: whether or not this zone is currently open. Only relevant if
+ *	max_open_zones > 0.
+ * @reset_zone: whether or not this zone should be reset before writing to it
+ */
+struct fio_zone_info {
+#ifdef CONFIG_LINUX_BLKZONED
+	pthread_mutex_t		mutex;
+	uint64_t		start;
+	uint64_t		wp;
+	uint32_t		verify_block;
+	enum blk_zone_type	type:2;
+	enum blk_zone_cond	cond:4;
+	unsigned int		open:1;
+	unsigned int		reset_zone:1;
+#endif
+};
+
+/**
+ * zoned_block_device_info - zoned block device characteristics
+ * @model: Device model.
+ * @mutex: Protects the modifiable members in this structure (refcount and
+ *		num_open_zones).
+ * @zone_size: size of a single zone in units of bytes
+ * @sectors_with_data: total size of data in all zones in units of bytes
+ *		(despite the name)
+ * @zone_size_log2: log2 of the zone size in bytes if it is a power of 2 or 0
+ *		if the zone size is not a power of 2.
+ * @nr_zones: number of zones
+ * @refcount: number of fio files that share this structure
+ * @num_open_zones: number of open zones
+ * @write_cnt: Number of writes since the latest zone reset triggered by
+ *	       the zone_reset_frequency fio job parameter.
+ * @open_zones: zone numbers of open zones
+ * @zone_info: description of the individual zones
+ *
+ * Only devices for which all zones have the same size are supported.
+ * Note: if the capacity is not a multiple of the zone size then the last zone
+ * will be smaller than 'zone_size'.
+ */
+struct zoned_block_device_info {
+	enum blk_zoned_model	model;
+	pthread_mutex_t		mutex;
+	uint64_t		zone_size;
+	uint64_t		sectors_with_data;
+	uint32_t		zone_size_log2;
+	uint32_t		nr_zones;
+	uint32_t		refcount;
+	uint32_t		num_open_zones;
+	uint32_t		write_cnt;
+	uint32_t		open_zones[FIO_MAX_OPEN_ZBD_ZONES];
+	struct fio_zone_info	zone_info[0];
+};
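
For illustration: struct zoned_block_device_info above ends in a trailing zone_info array so that the header and all zone slots (plus the sentinel) live in one allocation. An editorial sketch of the same technique using the standard C99 flexible array member instead of the zone_info[0] GNU idiom (all names invented):

#include <stdint.h>
#include <stdlib.h>

struct zone {
	uint64_t start;
};

struct device_info {
	uint32_t nr_zones;
	struct zone zone_info[];	/* C99 flexible array member */
};

static struct device_info *alloc_device_info(uint32_t nr_zones,
					     uint64_t zone_size)
{
	struct device_info *di;
	uint32_t i;

	/* One allocation: header plus nr_zones + 1 slots (the extra slot
	 * is the sentinel whose start marks the end of the last zone). */
	di = calloc(1, sizeof(*di) +
		    (nr_zones + 1) * sizeof(di->zone_info[0]));
	if (!di)
		return NULL;
	di->nr_zones = nr_zones;
	for (i = 0; i <= nr_zones; i++)		/* includes the sentinel */
		di->zone_info[i].start = (uint64_t)i * zone_size;
	return di;
}

int main(void)
{
	struct device_info *di = alloc_device_info(4, 4096);

	free(di);
	return 0;
}
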
+
+#ifdef CONFIG_LINUX_BLKZONED
+void zbd_free_zone_info(struct fio_file *f);
+int zbd_init(struct thread_data *td);
+void zbd_file_reset(struct thread_data *td, struct fio_file *f);
+bool zbd_unaligned_write(int error_code);
+void setup_zbd_zone_mode(struct thread_data *td, struct io_u *io_u);
+enum io_u_action zbd_adjust_block(struct thread_data *td, struct io_u *io_u);
+char *zbd_write_status(const struct thread_stat *ts);
+
+static inline void zbd_queue_io_u(struct io_u *io_u, enum fio_q_status status)
+{
+	if (io_u->zbd_queue_io) {
+		io_u->zbd_queue_io(io_u, status, io_u->error == 0);
+		io_u->zbd_queue_io = NULL;
+	}
+}
+
+static inline void zbd_put_io_u(struct io_u *io_u)
+{
+	if (io_u->zbd_put_io) {
+		io_u->zbd_put_io(io_u);
+		io_u->zbd_queue_io = NULL;
+		io_u->zbd_put_io = NULL;
+	}
+}
+
+#else
+static inline void zbd_free_zone_info(struct fio_file *f)
+{
+}
+
+static inline int zbd_init(struct thread_data *td)
+{
+	return 0;
+}
+
+static inline void zbd_file_reset(struct thread_data *td, struct fio_file *f)
+{
+}
+
+static inline bool zbd_unaligned_write(int error_code)
+{
+	return false;
+}
+
+static inline enum io_u_action zbd_adjust_block(struct thread_data *td,
+						struct io_u *io_u)
+{
+	return io_u_accept;
+}
+
+static inline char *zbd_write_status(const struct thread_stat *ts)
+{
+	return NULL;
+}
+
+static inline void zbd_queue_io_u(struct io_u *io_u,
+				  enum fio_q_status status) {}
+static inline void zbd_put_io_u(struct io_u *io_u) {}
+
+static inline void setup_zbd_zone_mode(struct thread_data *td,
+				       struct io_u *io_u)
+{
+}
+
+#endif
+
+#endif /* FIO_ZBD_H */
diff --git a/zone-dist.c b/zone-dist.c
new file mode 100644
index 0000000..819d531
--- /dev/null
+++ b/zone-dist.c
@@ -0,0 +1,74 @@
+#include <stdlib.h>
+#include "fio.h"
+#include "zone-dist.h"
+
+static void __td_zone_gen_index(struct thread_data *td, enum fio_ddir ddir)
+{
+	unsigned int i, j, sprev, aprev;
+	uint64_t sprev_sz;
+
+	td->zone_state_index[ddir] = malloc(sizeof(struct zone_split_index) * 100);
+
+	sprev_sz = sprev = aprev = 0;
+	for (i = 0; i < td->o.zone_split_nr[ddir]; i++) {
+		struct zone_split *zsp = &td->o.zone_split[ddir][i];
+
+		for (j = aprev; j < aprev + zsp->access_perc; j++) {
+			struct zone_split_index *zsi = &td->zone_state_index[ddir][j];
+
+			zsi->size_perc = sprev + zsp->size_perc;
+			zsi->size_perc_prev = sprev;
+
+			zsi->size = sprev_sz + zsp->size;
+			zsi->size_prev = sprev_sz;
+		}
+
+		aprev += zsp->access_perc;
+		sprev += zsp->size_perc;
+		sprev_sz += zsp->size;
+	}
+}
+
+static bool has_zones(struct thread_data *td)
+{
+	int i, zones = 0;
+
+	for (i = 0; i < DDIR_RWDIR_CNT; i++)
+		zones += td->o.zone_split_nr[i];
+
+	return zones != 0;
+}
+
+/*
+ * Generate state table for indexes, so we don't have to do it inline from
+ * the hot IO path
+ */
+void td_zone_gen_index(struct thread_data *td)
+{
+	int i;
+
+	if (!has_zones(td))
+		return;
+
+	td->zone_state_index = malloc(DDIR_RWDIR_CNT *
+				      sizeof(struct zone_split_index *));
+
+	for (i = 0; i < DDIR_RWDIR_CNT; i++)
+		__td_zone_gen_index(td, i);
+}
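
For illustration: the 100-entry table built by __td_zone_gen_index() above is a direct-lookup structure, so a zone split that owns N percent of the accesses occupies N consecutive slots and a random draw in 0..99 selects a split in constant time. An editorial sketch of how such a table is consumed (simplified types, invented distribution):

#include <stdio.h>
#include <stdlib.h>

struct slot {
	unsigned int size_perc_prev;
	unsigned int size_perc;
};

int main(void)
{
	struct slot table[100];
	unsigned int i;

	/* Example distribution: 80% of accesses target the first 10% of
	 * the range, the remaining 20% target the other 90%. */
	for (i = 0; i < 80; i++)
		table[i] = (struct slot){ .size_perc_prev = 0, .size_perc = 10 };
	for (; i < 100; i++)
		table[i] = (struct slot){ .size_perc_prev = 10, .size_perc = 100 };
	for (i = 0; i < 5; i++) {
		unsigned int r = rand() % 100;	/* access draw */

		printf("draw %u -> range %u%%..%u%%\n", r,
		       table[r].size_perc_prev, table[r].size_perc);
	}
	return 0;
}
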
+
+void td_zone_free_index(struct thread_data *td)
+{
+	int i;
+
+	if (!td->zone_state_index)
+		return;
+
+	for (i = 0; i < DDIR_RWDIR_CNT; i++) {
+		free(td->zone_state_index[i]);
+		td->zone_state_index[i] = NULL;
+	}
+
+	free(td->zone_state_index);
+	td->zone_state_index = NULL;
+}
diff --git a/zone-dist.h b/zone-dist.h
new file mode 100644
index 0000000..c0b2884
--- /dev/null
+++ b/zone-dist.h
@@ -0,0 +1,7 @@
+#ifndef FIO_ZONE_DIST_H
+#define FIO_ZONE_DIST_H
+
+void td_zone_gen_index(struct thread_data *td);
+void td_zone_free_index(struct thread_data *td);
+
+#endif