From 360c39361a3769f2e6281399a650d64a67ccc0bd Mon Sep 17 00:00:00 2001 From: Packit Service Date: Dec 09 2020 13:59:32 +0000 Subject: gfs2-utils-3.2.0 base --- diff --git a/Makefile.am b/Makefile.am new file mode 100644 index 0000000..adeb56d --- /dev/null +++ b/Makefile.am @@ -0,0 +1,35 @@ +EXTRA_DIST = autogen.sh README + +AUTOMAKE_OPTIONS = foreign + +MAINTAINERCLEANFILES = \ + Makefile.in \ + aclocal.m4 \ + configure \ + depcomp \ + config.guess \ + config.sub \ + missing \ + install-sh \ + autoheader \ + automake \ + autoconf \ + libtool \ + libtoolize \ + ltmain.sh \ + compile \ + make/clusterautoconfig.h.in \ + make/clusterautoconfig.h.in~ + +noinst_HEADERS = make/copyright.cf + +ACLOCAL_AMFLAGS = -I m4 + +SUBDIRS = \ + po \ + gfs2 \ + doc \ + tests + +maintainer-clean-local: + rm -rf m4 diff --git a/README b/README new file mode 100644 index 0000000..a04653f --- /dev/null +++ b/README @@ -0,0 +1,71 @@ +gfs2-utils +---------- + +This package contains the tools needed to create, check, manipulate and analyze +gfs2 filesystems, along with important scripts required to support gfs2 +clusters. + +Build instructions +------------------ + +The following development packages are required to build gfs2-utils: + + o autoconf + o automake + o libtool + o GNU make + o ncurses + o gettext + o bison + o flex + o zlib + o libblkid + o libuuid + o check (optional, enables unit tests) + +The kernel header include/linux/gfs2_ondisk.h and its dependencies are also +required. + +To build gfs2-utils, run the following commands: + + $ ./autogen.sh + $ ./configure + $ make + +See ./configure --help for more build configuration options. + +Test Suite +---------- + +To run the test suite, use: + + $ make check + +See doc/README.tests for more details regarding the test suite. 
+ +Installation +------------ + +gfs2-utils requires the following libraries: + + o zlib + o ncurses + o libblkid + o libuuid + +To install gfs2-utils, run: + + # make install + +Support scripts +--------------- + +The following scripts (located in gfs2/scripts) are used to complete +the userland portion of the gfs2 withdraw feature using uevents. They +will be installed by 'make install' to these directories by default: + + 82-gfs2-withdraw.rules in /usr/lib/udev/rules.d/ + gfs2_withdraw_helper in /usr/sbin/ + +See also doc/README.contributing for details on submitting patches. + diff --git a/autogen.sh b/autogen.sh new file mode 100755 index 0000000..3c5e1d9 --- /dev/null +++ b/autogen.sh @@ -0,0 +1,4 @@ +#!/bin/sh +# Run this to generate all the initial makefiles, etc. +mkdir -p m4 +autoreconf -i -v && echo Now run ./configure and make diff --git a/configure.ac b/configure.ac new file mode 100644 index 0000000..1b76c7d --- /dev/null +++ b/configure.ac @@ -0,0 +1,261 @@ + +# Process this file with autoconf to produce a configure script. + +AC_PREREQ([2.63]) +AC_INIT([gfs2-utils], [master], [cluster-devel@redhat.com]) +AM_INIT_AUTOMAKE([-Wno-portability]) +AM_SILENT_RULES([yes]) +LT_PREREQ([2.2.6]) +LT_INIT + +AC_CONFIG_MACRO_DIR([m4]) +AC_CONFIG_SRCDIR([gfs2/libgfs2/libgfs2.h]) +AC_CONFIG_HEADERS([make/clusterautoconfig.h]) + +AC_CANONICAL_HOST +AC_PROG_LIBTOOL + +AC_LANG([C]) + +# i18n support (GNU gettext) +AM_GNU_GETTEXT([external]) +AM_GNU_GETTEXT_VERSION([0.18]) + +# Sanitize paths: with no --prefix given, default to /usr and map the stock localstatedir, sysconfdir and libdir defaults to /var, /etc and /usr/lib64 (or /usr/lib) + +if test "$prefix" = "NONE"; then + prefix="/usr" + if test "$localstatedir" = "\${prefix}/var"; then + localstatedir="/var" + fi + if test "$sysconfdir" = "\${prefix}/etc"; then + sysconfdir="/etc" + fi + if test "$libdir" = "\${exec_prefix}/lib"; then + if test -e /usr/lib64; then + libdir="/usr/lib64" + else + libdir="/usr/lib" + fi + fi +fi + +case $exec_prefix in + NONE) exec_prefix=$prefix;; + prefix) exec_prefix=$prefix;; +esac + +# Checks for programs. 
+ +# check stolen from gnulib/m4/gnu-make.m4 +if ! ${MAKE-make} --version /cannot/make/this >/dev/null 2>&1; then + AC_MSG_ERROR([GNU make was not found but is required.]) +fi + +AC_PROG_CC +AM_PROG_CC_C_O +AC_PROG_LN_S +AC_PROG_INSTALL +AC_PROG_MAKE_SET +AC_PROG_LEX +test "$LEX" != "flex" && AC_MSG_ERROR([flex not found]) +AC_CHECK_PROG([YACC], [bison], [bison -y]) +test x"$YACC" = x && AC_MSG_ERROR([bison not found]) + +## local helper functions + +# this function checks if CC supports the options passed as args +# (all args are joined into one flag string). Global CFLAGS are ignored during this test. +cc_supports_flag() { + local CFLAGS="$*" + AC_MSG_CHECKING([whether $CC supports $CFLAGS]) + AC_COMPILE_IFELSE([AC_LANG_SOURCE([int main(){return 0;}])], + [RC=0; AC_MSG_RESULT([yes])], + [RC=1; AC_MSG_RESULT([no])]) + return $RC +} + +# this function tests if a library has a certain function +# by using AC_CHECK_LIB but restores the original LIBS global +# envvar. This is required to avoid libtool to link everything +# with everything. 
+check_lib_no_libs() { + AC_CHECK_LIB([$1], [$2],, + [AC_MSG_ERROR([Unable to find $1 library])]) + LIBS=$ac_check_lib_save_LIBS +} + +# local options: debug, gcov and gprof builds, all disabled by default +AC_ARG_ENABLE([debug], + AS_HELP_STRING([--enable-debug],[enable debug build [default=no]]), + [], [enable_debug="no"]) +AC_ARG_ENABLE([gcov], + AS_HELP_STRING([--enable-gcov],[enable coverage instrumentation [default=no]]), + [], [enable_gcov="no"]) +AC_ARG_ENABLE([gprof], + AS_HELP_STRING([--enable-gprof],[enable profiling instrumentation [default=no]]), + [], [enable_gprof="no"]) + + +# We use the Check framework for unit tests; it is optional and only gates HAVE_CHECK +PKG_CHECK_MODULES([check], [check >= 0.9.8], + [have_check=yes], + [have_check=no]) +AM_CONDITIONAL([HAVE_CHECK], [test "x$have_check" = "xyes"]) + +PKG_CHECK_MODULES([zlib],[zlib]) +PKG_CHECK_MODULES([blkid],[blkid]) +PKG_CHECK_MODULES([uuid],[uuid], + [have_uuid=yes], + [have_uuid=no]) + +# old versions of ncurses don't ship pkg-config files, so fall back to a plain library check +PKG_CHECK_MODULES([ncurses],[ncurses],, + [check_lib_no_libs ncurses printw]) + +if test -z "$ncurses_CFLAGS" && test -z "$ncurses_LIBS"; then + ncurses_LIBS=-lncurses +fi + +AC_ARG_WITH([udevdir], + AS_HELP_STRING([--with-udevdir=DIR], + [udev directory containing rules.d [default=${prefix}/lib/udev]]), + [], [with_udevdir=\${prefix}/lib/udev]) +AC_SUBST([udevdir], [$with_udevdir]) + +# Checks for header files. 
+AC_CHECK_HEADERS([fcntl.h libintl.h limits.h locale.h mntent.h stddef.h sys/file.h sys/ioctl.h sys/mount.h sys/time.h sys/vfs.h syslog.h termios.h]) +AC_CHECK_HEADER([linux/fs.h], [], [AC_MSG_ERROR([Unable to find linux/fs.h])]) +AC_CHECK_HEADER([linux/types.h], [], [AC_MSG_ERROR([Unable to find linux/types.h])]) +AC_CHECK_HEADER([linux/limits.h], [], [AC_MSG_ERROR([Unable to find linux/limits.h])]) +AC_CHECK_HEADER([linux/gfs2_ondisk.h], [], [AC_MSG_ERROR([Unable to find linux/gfs2_ondisk.h])]) +AC_CHECK_MEMBER([struct gfs2_sb.sb_uuid], [sb_has_uuid=yes], [sb_has_uuid=no], + [[#include <linux/gfs2_ondisk.h>]]) +AC_CHECK_MEMBER([struct gfs2_leaf.lf_inode],[AC_DEFINE([GFS2_HAS_LEAF_HINTS],[],[Leaf block hints])], + [], [[#include <linux/gfs2_ondisk.h>]]) +AC_CHECK_MEMBER([struct gfs2_dirent.de_rahead],[AC_DEFINE([GFS2_HAS_DE_RAHEAD],[],[Dirent readahead field])], + [], [[#include <linux/gfs2_ondisk.h>]]) +AC_CHECK_MEMBER([struct gfs2_dirent.de_cookie],[AC_DEFINE([GFS2_HAS_DE_COOKIE],[],[Dirent cookie field])], + [], [[#include <linux/gfs2_ondisk.h>]]) +AC_CHECK_MEMBER([struct gfs2_rgrp.rg_skip],[AC_DEFINE([GFS2_HAS_RG_SKIP],[],[Next resource group pointer])], + [], [[#include <linux/gfs2_ondisk.h>]]) +AC_CHECK_MEMBER([struct gfs2_rgrp.rg_data0],[AC_DEFINE([GFS2_HAS_RG_RI_FIELDS],[],[Resource group fields duplicated from the rindex])], + [], [[#include <linux/gfs2_ondisk.h>]]) +AC_CHECK_MEMBER([struct gfs2_log_header.lh_crc],[AC_DEFINE([GFS2_HAS_LH_V2],[],[v2 log header format])], + [], [[#include <linux/gfs2_ondisk.h>]]) + +# libuuid is only required if struct gfs2_sb.sb_uuid exists +if test "$sb_has_uuid" = "yes" && test "$have_uuid" = "no"; then + AC_MSG_ERROR([libuuid is required for this version of gfs2]) +fi + +# *FLAGS handling: save the user-supplied flags so they can be merged with ours below +ENV_CFLAGS="$CFLAGS" +ENV_CPPFLAGS="$CPPFLAGS" +ENV_LDFLAGS="$LDFLAGS" + +# debug build stuff: -O0 and DEBUG defined for debug builds, -O2 with _FORTIFY_SOURCE otherwise +if test "x${enable_debug}" = xyes; then + AC_DEFINE_UNQUOTED([DEBUG], [1], [Compiling Debugging code]) + OPT_CFLAGS="-O0" + OPT_CPPFLAGS="" +else + OPT_CFLAGS="-O2" + OPT_CPPFLAGS="-D_FORTIFY_SOURCE=2" +fi + +# gdb flags: -ggdb3 when compiling with GCC, plain -g otherwise +if test "x${GCC}" = xyes; then + GDB_FLAGS="-ggdb3" +else + 
GDB_FLAGS="-g" +fi + +# gcov works without optimization, so it replaces OPT_CFLAGS with -O0 plus the coverage flags +if test "x${enable_gcov}" = xyes; then + GCOV_CFLAGS="-fprofile-arcs -ftest-coverage" + if ! cc_supports_flag $GCOV_CFLAGS; then + AC_MSG_ERROR([your compiler does not support coverage instrumentation]) + fi + if test "x${enable_debug}" = xyes; then + enable_debug="no (gcov enabled)" + fi + OPT_CFLAGS="-O0 $GCOV_CFLAGS" +fi + +if test "x${enable_gprof}" = xyes; then + GPROF_CFLAGS="-pg" + if ! cc_supports_flag $GPROF_CFLAGS; then + AC_MSG_ERROR([your compiler does not support gprof instrumentation]) + fi + OPT_CFLAGS="$OPT_CFLAGS $GPROF_CFLAGS" +fi + +# extra warnings: keep only the -W options the compiler accepts +EXTRA_WARNINGS="" + +WARNLIST=" + all + shadow + missing-prototypes + missing-declarations + strict-prototypes + declaration-after-statement + pointer-arith + write-strings + cast-align + bad-function-cast + missing-format-attribute + format=2 + format-security + format-nonliteral + no-long-long + no-strict-aliasing + " + +for j in $WARNLIST; do + if cc_supports_flag -W$j; then + EXTRA_WARNINGS="$EXTRA_WARNINGS -W$j"; + fi +done + +CFLAGS="$ENV_CFLAGS $OPT_CFLAGS $GDB_FLAGS $EXTRA_WARNINGS $WERROR_CFLAGS" +CPPFLAGS="-I\$(top_builddir)/make -I\$(top_srcdir)/make -I. 
$ENV_CPPFLAGS $OPT_CPPFLAGS" +LDFLAGS="$ENV_LDFLAGS" + +AC_CONFIG_TESTDIR([tests], [gfs2/libgfs2:gfs2/mkfs:gfs2/fsck:gfs2/edit:gfs2/convert:gfs2/tune:tests]) +AC_CONFIG_FILES([Makefile + gfs2/Makefile + gfs2/include/Makefile + gfs2/libgfs2/Makefile + gfs2/convert/Makefile + gfs2/edit/Makefile + gfs2/fsck/Makefile + gfs2/mkfs/Makefile + gfs2/tune/Makefile + gfs2/man/Makefile + gfs2/scripts/Makefile + gfs2/glocktop/Makefile + doc/Makefile + tests/Makefile + tests/atlocal + po/Makefile.in + ]) + +AC_OUTPUT + +echo +echo " Configure summary" +echo " ===================" +echo " prefix : $prefix" +echo " exec_prefix : $exec_prefix" +echo " libdir : $libdir" +echo " sbindir : $sbindir" +echo " udevdir : $udevdir" +echo " ------------------" +echo " debug build : $enable_debug" +echo " C unit tests : $have_check" +echo " gprof build : $enable_gprof" +echo " gcov build : $enable_gcov" +echo +echo "Now run 'make' to build and 'make check' to run tests" diff --git a/doc/COPYING.applications b/doc/COPYING.applications new file mode 100644 index 0000000..d511905 --- /dev/null +++ b/doc/COPYING.applications @@ -0,0 +1,339 @@ + GNU GENERAL PUBLIC LICENSE + Version 2, June 1991 + + Copyright (C) 1989, 1991 Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + Everyone is permitted to copy and distribute verbatim copies + of this license document, but changing it is not allowed. + + Preamble + + The licenses for most software are designed to take away your +freedom to share and change it. By contrast, the GNU General Public +License is intended to guarantee your freedom to share and change free +software--to make sure the software is free for all its users. This +General Public License applies to most of the Free Software +Foundation's software and to any other program whose authors commit to +using it. (Some other Free Software Foundation software is covered by +the GNU Lesser General Public License instead.) 
You can apply it to +your programs, too. + + When we speak of free software, we are referring to freedom, not +price. Our General Public Licenses are designed to make sure that you +have the freedom to distribute copies of free software (and charge for +this service if you wish), that you receive source code or can get it +if you want it, that you can change the software or use pieces of it +in new free programs; and that you know you can do these things. + + To protect your rights, we need to make restrictions that forbid +anyone to deny you these rights or to ask you to surrender the rights. +These restrictions translate to certain responsibilities for you if you +distribute copies of the software, or if you modify it. + + For example, if you distribute copies of such a program, whether +gratis or for a fee, you must give the recipients all the rights that +you have. You must make sure that they, too, receive or can get the +source code. And you must show them these terms so they know their +rights. + + We protect your rights with two steps: (1) copyright the software, and +(2) offer you this license which gives you legal permission to copy, +distribute and/or modify the software. + + Also, for each author's protection and ours, we want to make certain +that everyone understands that there is no warranty for this free +software. If the software is modified by someone else and passed on, we +want its recipients to know that what they have is not the original, so +that any problems introduced by others will not reflect on the original +authors' reputations. + + Finally, any free program is threatened constantly by software +patents. We wish to avoid the danger that redistributors of a free +program will individually obtain patent licenses, in effect making the +program proprietary. To prevent this, we have made it clear that any +patent must be licensed for everyone's free use or not licensed at all. 
+ + The precise terms and conditions for copying, distribution and +modification follow. + + GNU GENERAL PUBLIC LICENSE + TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION + + 0. This License applies to any program or other work which contains +a notice placed by the copyright holder saying it may be distributed +under the terms of this General Public License. The "Program", below, +refers to any such program or work, and a "work based on the Program" +means either the Program or any derivative work under copyright law: +that is to say, a work containing the Program or a portion of it, +either verbatim or with modifications and/or translated into another +language. (Hereinafter, translation is included without limitation in +the term "modification".) Each licensee is addressed as "you". + +Activities other than copying, distribution and modification are not +covered by this License; they are outside its scope. The act of +running the Program is not restricted, and the output from the Program +is covered only if its contents constitute a work based on the +Program (independent of having been made by running the Program). +Whether that is true depends on what the Program does. + + 1. You may copy and distribute verbatim copies of the Program's +source code as you receive it, in any medium, provided that you +conspicuously and appropriately publish on each copy an appropriate +copyright notice and disclaimer of warranty; keep intact all the +notices that refer to this License and to the absence of any warranty; +and give any other recipients of the Program a copy of this License +along with the Program. + +You may charge a fee for the physical act of transferring a copy, and +you may at your option offer warranty protection in exchange for a fee. + + 2. 
You may modify your copy or copies of the Program or any portion +of it, thus forming a work based on the Program, and copy and +distribute such modifications or work under the terms of Section 1 +above, provided that you also meet all of these conditions: + + a) You must cause the modified files to carry prominent notices + stating that you changed the files and the date of any change. + + b) You must cause any work that you distribute or publish, that in + whole or in part contains or is derived from the Program or any + part thereof, to be licensed as a whole at no charge to all third + parties under the terms of this License. + + c) If the modified program normally reads commands interactively + when run, you must cause it, when started running for such + interactive use in the most ordinary way, to print or display an + announcement including an appropriate copyright notice and a + notice that there is no warranty (or else, saying that you provide + a warranty) and that users may redistribute the program under + these conditions, and telling the user how to view a copy of this + License. (Exception: if the Program itself is interactive but + does not normally print such an announcement, your work based on + the Program is not required to print an announcement.) + +These requirements apply to the modified work as a whole. If +identifiable sections of that work are not derived from the Program, +and can be reasonably considered independent and separate works in +themselves, then this License, and its terms, do not apply to those +sections when you distribute them as separate works. But when you +distribute the same sections as part of a whole which is a work based +on the Program, the distribution of the whole must be on the terms of +this License, whose permissions for other licensees extend to the +entire whole, and thus to each and every part regardless of who wrote it. 
+ +Thus, it is not the intent of this section to claim rights or contest +your rights to work written entirely by you; rather, the intent is to +exercise the right to control the distribution of derivative or +collective works based on the Program. + +In addition, mere aggregation of another work not based on the Program +with the Program (or with a work based on the Program) on a volume of +a storage or distribution medium does not bring the other work under +the scope of this License. + + 3. You may copy and distribute the Program (or a work based on it, +under Section 2) in object code or executable form under the terms of +Sections 1 and 2 above provided that you also do one of the following: + + a) Accompany it with the complete corresponding machine-readable + source code, which must be distributed under the terms of Sections + 1 and 2 above on a medium customarily used for software interchange; or, + + b) Accompany it with a written offer, valid for at least three + years, to give any third party, for a charge no more than your + cost of physically performing source distribution, a complete + machine-readable copy of the corresponding source code, to be + distributed under the terms of Sections 1 and 2 above on a medium + customarily used for software interchange; or, + + c) Accompany it with the information you received as to the offer + to distribute corresponding source code. (This alternative is + allowed only for noncommercial distribution and only if you + received the program in object code or executable form with such + an offer, in accord with Subsection b above.) + +The source code for a work means the preferred form of the work for +making modifications to it. For an executable work, complete source +code means all the source code for all modules it contains, plus any +associated interface definition files, plus the scripts used to +control compilation and installation of the executable. 
However, as a +special exception, the source code distributed need not include +anything that is normally distributed (in either source or binary +form) with the major components (compiler, kernel, and so on) of the +operating system on which the executable runs, unless that component +itself accompanies the executable. + +If distribution of executable or object code is made by offering +access to copy from a designated place, then offering equivalent +access to copy the source code from the same place counts as +distribution of the source code, even though third parties are not +compelled to copy the source along with the object code. + + 4. You may not copy, modify, sublicense, or distribute the Program +except as expressly provided under this License. Any attempt +otherwise to copy, modify, sublicense or distribute the Program is +void, and will automatically terminate your rights under this License. +However, parties who have received copies, or rights, from you under +this License will not have their licenses terminated so long as such +parties remain in full compliance. + + 5. You are not required to accept this License, since you have not +signed it. However, nothing else grants you permission to modify or +distribute the Program or its derivative works. These actions are +prohibited by law if you do not accept this License. Therefore, by +modifying or distributing the Program (or any work based on the +Program), you indicate your acceptance of this License to do so, and +all its terms and conditions for copying, distributing or modifying +the Program or works based on it. + + 6. Each time you redistribute the Program (or any work based on the +Program), the recipient automatically receives a license from the +original licensor to copy, distribute or modify the Program subject to +these terms and conditions. You may not impose any further +restrictions on the recipients' exercise of the rights granted herein. 
+You are not responsible for enforcing compliance by third parties to +this License. + + 7. If, as a consequence of a court judgment or allegation of patent +infringement or for any other reason (not limited to patent issues), +conditions are imposed on you (whether by court order, agreement or +otherwise) that contradict the conditions of this License, they do not +excuse you from the conditions of this License. If you cannot +distribute so as to satisfy simultaneously your obligations under this +License and any other pertinent obligations, then as a consequence you +may not distribute the Program at all. For example, if a patent +license would not permit royalty-free redistribution of the Program by +all those who receive copies directly or indirectly through you, then +the only way you could satisfy both it and this License would be to +refrain entirely from distribution of the Program. + +If any portion of this section is held invalid or unenforceable under +any particular circumstance, the balance of the section is intended to +apply and the section as a whole is intended to apply in other +circumstances. + +It is not the purpose of this section to induce you to infringe any +patents or other property right claims or to contest validity of any +such claims; this section has the sole purpose of protecting the +integrity of the free software distribution system, which is +implemented by public license practices. Many people have made +generous contributions to the wide range of software distributed +through that system in reliance on consistent application of that +system; it is up to the author/donor to decide if he or she is willing +to distribute software through any other system and a licensee cannot +impose that choice. + +This section is intended to make thoroughly clear what is believed to +be a consequence of the rest of this License. + + 8. 
If the distribution and/or use of the Program is restricted in +certain countries either by patents or by copyrighted interfaces, the +original copyright holder who places the Program under this License +may add an explicit geographical distribution limitation excluding +those countries, so that distribution is permitted only in or among +countries not thus excluded. In such case, this License incorporates +the limitation as if written in the body of this License. + + 9. The Free Software Foundation may publish revised and/or new versions +of the General Public License from time to time. Such new versions will +be similar in spirit to the present version, but may differ in detail to +address new problems or concerns. + +Each version is given a distinguishing version number. If the Program +specifies a version number of this License which applies to it and "any +later version", you have the option of following the terms and conditions +either of that version or of any later version published by the Free +Software Foundation. If the Program does not specify a version number of +this License, you may choose any version ever published by the Free Software +Foundation. + + 10. If you wish to incorporate parts of the Program into other free +programs whose distribution conditions are different, write to the author +to ask for permission. For software which is copyrighted by the Free +Software Foundation, write to the Free Software Foundation; we sometimes +make exceptions for this. Our decision will be guided by the two goals +of preserving the free status of all derivatives of our free software and +of promoting the sharing and reuse of software generally. + + NO WARRANTY + + 11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY +FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. 
EXCEPT WHEN +OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES +PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED +OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF +MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS +TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE +PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING, +REPAIR OR CORRECTION. + + 12. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING +WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR +REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, +INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING +OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED +TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY +YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER +PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE +POSSIBILITY OF SUCH DAMAGES. + + END OF TERMS AND CONDITIONS + + How to Apply These Terms to Your New Programs + + If you develop a new program, and you want it to be of the greatest +possible use to the public, the best way to achieve this is to make it +free software which everyone can redistribute and change under these terms. + + To do so, attach the following notices to the program. It is safest +to attach them to the start of each source file to most effectively +convey the exclusion of warranty; and each file should have at least +the "copyright" line and a pointer to where the full notice is found. + + + <one line to give the program's name and a brief idea of what it does.> + Copyright (C) <year> <name of author> + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. 
+ + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License along + with this program; if not, write to the Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + +Also add information on how to contact you by electronic and paper mail. + +If the program is interactive, make it output a short notice like this +when it starts in an interactive mode: + + Gnomovision version 69, Copyright (C) year name of author + Gnomovision comes with ABSOLUTELY NO WARRANTY; for details type `show w'. + This is free software, and you are welcome to redistribute it + under certain conditions; type `show c' for details. + +The hypothetical commands `show w' and `show c' should show the appropriate +parts of the General Public License. Of course, the commands you use may +be called something other than `show w' and `show c'; they could even be +mouse-clicks or menu items--whatever suits your program. + +You should also get your employer (if you work as a programmer) or your +school, if any, to sign a "copyright disclaimer" for the program, if +necessary. Here is a sample; alter the names: + + Yoyodyne, Inc., hereby disclaims all copyright interest in the program + `Gnomovision' (which makes passes at compilers) written by James Hacker. + + <signature of Ty Coon>, 1 April 1989 + Ty Coon, President of Vice + +This General Public License does not permit incorporating your program into +proprietary programs. If your program is a subroutine library, you may +consider it more useful to permit linking proprietary applications with the +library. If this is what you want to do, use the GNU Lesser General +Public License instead of this License. 
diff --git a/doc/COPYING.libraries b/doc/COPYING.libraries new file mode 100644 index 0000000..2d2d780 --- /dev/null +++ b/doc/COPYING.libraries @@ -0,0 +1,510 @@ + + GNU LESSER GENERAL PUBLIC LICENSE + Version 2.1, February 1999 + + Copyright (C) 1991, 1999 Free Software Foundation, Inc. + 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + Everyone is permitted to copy and distribute verbatim copies + of this license document, but changing it is not allowed. + +[This is the first released version of the Lesser GPL. It also counts + as the successor of the GNU Library Public License, version 2, hence + the version number 2.1.] + + Preamble + + The licenses for most software are designed to take away your +freedom to share and change it. By contrast, the GNU General Public +Licenses are intended to guarantee your freedom to share and change +free software--to make sure the software is free for all its users. + + This license, the Lesser General Public License, applies to some +specially designated software packages--typically libraries--of the +Free Software Foundation and other authors who decide to use it. You +can use it too, but we suggest you first think carefully about whether +this license or the ordinary General Public License is the better +strategy to use in any particular case, based on the explanations +below. + + When we speak of free software, we are referring to freedom of use, +not price. Our General Public Licenses are designed to make sure that +you have the freedom to distribute copies of free software (and charge +for this service if you wish); that you receive source code or can get +it if you want it; that you can change the software and use pieces of +it in new free programs; and that you are informed that you can do +these things. + + To protect your rights, we need to make restrictions that forbid +distributors to deny you these rights or to ask you to surrender these +rights. 
These restrictions translate to certain responsibilities for +you if you distribute copies of the library or if you modify it. + + For example, if you distribute copies of the library, whether gratis +or for a fee, you must give the recipients all the rights that we gave +you. You must make sure that they, too, receive or can get the source +code. If you link other code with the library, you must provide +complete object files to the recipients, so that they can relink them +with the library after making changes to the library and recompiling +it. And you must show them these terms so they know their rights. + + We protect your rights with a two-step method: (1) we copyright the +library, and (2) we offer you this license, which gives you legal +permission to copy, distribute and/or modify the library. + + To protect each distributor, we want to make it very clear that +there is no warranty for the free library. Also, if the library is +modified by someone else and passed on, the recipients should know +that what they have is not the original version, so that the original +author's reputation will not be affected by problems that might be +introduced by others. + + Finally, software patents pose a constant threat to the existence of +any free program. We wish to make sure that a company cannot +effectively restrict the users of a free program by obtaining a +restrictive license from a patent holder. Therefore, we insist that +any patent license obtained for a version of the library must be +consistent with the full freedom of use specified in this license. + + Most GNU software, including some libraries, is covered by the +ordinary GNU General Public License. This license, the GNU Lesser +General Public License, applies to certain designated libraries, and +is quite different from the ordinary General Public License. We use +this license for certain libraries in order to permit linking those +libraries into non-free programs. 
+ + When a program is linked with a library, whether statically or using +a shared library, the combination of the two is legally speaking a +combined work, a derivative of the original library. The ordinary +General Public License therefore permits such linking only if the +entire combination fits its criteria of freedom. The Lesser General +Public License permits more lax criteria for linking other code with +the library. + + We call this license the "Lesser" General Public License because it +does Less to protect the user's freedom than the ordinary General +Public License. It also provides other free software developers Less +of an advantage over competing non-free programs. These disadvantages +are the reason we use the ordinary General Public License for many +libraries. However, the Lesser license provides advantages in certain +special circumstances. + + For example, on rare occasions, there may be a special need to +encourage the widest possible use of a certain library, so that it +becomes a de-facto standard. To achieve this, non-free programs must +be allowed to use the library. A more frequent case is that a free +library does the same job as widely used non-free libraries. In this +case, there is little to gain by limiting the free library to free +software only, so we use the Lesser General Public License. + + In other cases, permission to use a particular library in non-free +programs enables a greater number of people to use a large body of +free software. For example, permission to use the GNU C Library in +non-free programs enables many more people to use the whole GNU +operating system, as well as its variant, the GNU/Linux operating +system. + + Although the Lesser General Public License is Less protective of the +users' freedom, it does ensure that the user of a program that is +linked with the Library has the freedom and the wherewithal to run +that program using a modified version of the Library. 
+ + The precise terms and conditions for copying, distribution and +modification follow. Pay close attention to the difference between a +"work based on the library" and a "work that uses the library". The +former contains code derived from the library, whereas the latter must +be combined with the library in order to run. + + GNU LESSER GENERAL PUBLIC LICENSE + TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION + + 0. This License Agreement applies to any software library or other +program which contains a notice placed by the copyright holder or +other authorized party saying it may be distributed under the terms of +this Lesser General Public License (also called "this License"). +Each licensee is addressed as "you". + + A "library" means a collection of software functions and/or data +prepared so as to be conveniently linked with application programs +(which use some of those functions and data) to form executables. + + The "Library", below, refers to any such software library or work +which has been distributed under these terms. A "work based on the +Library" means either the Library or any derivative work under +copyright law: that is to say, a work containing the Library or a +portion of it, either verbatim or with modifications and/or translated +straightforwardly into another language. (Hereinafter, translation is +included without limitation in the term "modification".) + + "Source code" for a work means the preferred form of the work for +making modifications to it. For a library, complete source code means +all the source code for all modules it contains, plus any associated +interface definition files, plus the scripts used to control +compilation and installation of the library. + + Activities other than copying, distribution and modification are not +covered by this License; they are outside its scope. 
The act of +running a program using the Library is not restricted, and output from +such a program is covered only if its contents constitute a work based +on the Library (independent of the use of the Library in a tool for +writing it). Whether that is true depends on what the Library does +and what the program that uses the Library does. + + 1. You may copy and distribute verbatim copies of the Library's +complete source code as you receive it, in any medium, provided that +you conspicuously and appropriately publish on each copy an +appropriate copyright notice and disclaimer of warranty; keep intact +all the notices that refer to this License and to the absence of any +warranty; and distribute a copy of this License along with the +Library. + + You may charge a fee for the physical act of transferring a copy, +and you may at your option offer warranty protection in exchange for a +fee. + + 2. You may modify your copy or copies of the Library or any portion +of it, thus forming a work based on the Library, and copy and +distribute such modifications or work under the terms of Section 1 +above, provided that you also meet all of these conditions: + + a) The modified work must itself be a software library. + + b) You must cause the files modified to carry prominent notices + stating that you changed the files and the date of any change. + + c) You must cause the whole of the work to be licensed at no + charge to all third parties under the terms of this License. + + d) If a facility in the modified Library refers to a function or a + table of data to be supplied by an application program that uses + the facility, other than as an argument passed when the facility + is invoked, then you must make a good faith effort to ensure that, + in the event an application does not supply such function or + table, the facility still operates, and performs whatever part of + its purpose remains meaningful. 
+ + (For example, a function in a library to compute square roots has + a purpose that is entirely well-defined independent of the + application. Therefore, Subsection 2d requires that any + application-supplied function or table used by this function must + be optional: if the application does not supply it, the square + root function must still compute square roots.) + +These requirements apply to the modified work as a whole. If +identifiable sections of that work are not derived from the Library, +and can be reasonably considered independent and separate works in +themselves, then this License, and its terms, do not apply to those +sections when you distribute them as separate works. But when you +distribute the same sections as part of a whole which is a work based +on the Library, the distribution of the whole must be on the terms of +this License, whose permissions for other licensees extend to the +entire whole, and thus to each and every part regardless of who wrote +it. + +Thus, it is not the intent of this section to claim rights or contest +your rights to work written entirely by you; rather, the intent is to +exercise the right to control the distribution of derivative or +collective works based on the Library. + +In addition, mere aggregation of another work not based on the Library +with the Library (or with a work based on the Library) on a volume of +a storage or distribution medium does not bring the other work under +the scope of this License. + + 3. You may opt to apply the terms of the ordinary GNU General Public +License instead of this License to a given copy of the Library. To do +this, you must alter all the notices that refer to this License, so +that they refer to the ordinary GNU General Public License, version 2, +instead of to this License. (If a newer version than version 2 of the +ordinary GNU General Public License has appeared, then you can specify +that version instead if you wish.) Do not make any other change in +these notices. 
+ + Once this change is made in a given copy, it is irreversible for +that copy, so the ordinary GNU General Public License applies to all +subsequent copies and derivative works made from that copy. + + This option is useful when you wish to copy part of the code of +the Library into a program that is not a library. + + 4. You may copy and distribute the Library (or a portion or +derivative of it, under Section 2) in object code or executable form +under the terms of Sections 1 and 2 above provided that you accompany +it with the complete corresponding machine-readable source code, which +must be distributed under the terms of Sections 1 and 2 above on a +medium customarily used for software interchange. + + If distribution of object code is made by offering access to copy +from a designated place, then offering equivalent access to copy the +source code from the same place satisfies the requirement to +distribute the source code, even though third parties are not +compelled to copy the source along with the object code. + + 5. A program that contains no derivative of any portion of the +Library, but is designed to work with the Library by being compiled or +linked with it, is called a "work that uses the Library". Such a +work, in isolation, is not a derivative work of the Library, and +therefore falls outside the scope of this License. + + However, linking a "work that uses the Library" with the Library +creates an executable that is a derivative of the Library (because it +contains portions of the Library), rather than a "work that uses the +library". The executable is therefore covered by this License. +Section 6 states terms for distribution of such executables. + + When a "work that uses the Library" uses material from a header file +that is part of the Library, the object code for the work may be a +derivative work of the Library even though the source code is not. 
+Whether this is true is especially significant if the work can be +linked without the Library, or if the work is itself a library. The +threshold for this to be true is not precisely defined by law. + + If such an object file uses only numerical parameters, data +structure layouts and accessors, and small macros and small inline +functions (ten lines or less in length), then the use of the object +file is unrestricted, regardless of whether it is legally a derivative +work. (Executables containing this object code plus portions of the +Library will still fall under Section 6.) + + Otherwise, if the work is a derivative of the Library, you may +distribute the object code for the work under the terms of Section 6. +Any executables containing that work also fall under Section 6, +whether or not they are linked directly with the Library itself. + + 6. As an exception to the Sections above, you may also combine or +link a "work that uses the Library" with the Library to produce a +work containing portions of the Library, and distribute that work +under terms of your choice, provided that the terms permit +modification of the work for the customer's own use and reverse +engineering for debugging such modifications. + + You must give prominent notice with each copy of the work that the +Library is used in it and that the Library and its use are covered by +this License. You must supply a copy of this License. If the work +during execution displays copyright notices, you must include the +copyright notice for the Library among them, as well as a reference +directing the user to the copy of this License. 
Also, you must do one +of these things: + + a) Accompany the work with the complete corresponding + machine-readable source code for the Library including whatever + changes were used in the work (which must be distributed under + Sections 1 and 2 above); and, if the work is an executable linked + with the Library, with the complete machine-readable "work that + uses the Library", as object code and/or source code, so that the + user can modify the Library and then relink to produce a modified + executable containing the modified Library. (It is understood + that the user who changes the contents of definitions files in the + Library will not necessarily be able to recompile the application + to use the modified definitions.) + + b) Use a suitable shared library mechanism for linking with the + Library. A suitable mechanism is one that (1) uses at run time a + copy of the library already present on the user's computer system, + rather than copying library functions into the executable, and (2) + will operate properly with a modified version of the library, if + the user installs one, as long as the modified version is + interface-compatible with the version that the work was made with. + + c) Accompany the work with a written offer, valid for at least + three years, to give the same user the materials specified in + Subsection 6a, above, for a charge no more than the cost of + performing this distribution. + + d) If distribution of the work is made by offering access to copy + from a designated place, offer equivalent access to copy the above + specified materials from the same place. + + e) Verify that the user has already received a copy of these + materials or that you have already sent this user a copy. + + For an executable, the required form of the "work that uses the +Library" must include any data and utility programs needed for +reproducing the executable from it. 
However, as a special exception, +the materials to be distributed need not include anything that is +normally distributed (in either source or binary form) with the major +components (compiler, kernel, and so on) of the operating system on +which the executable runs, unless that component itself accompanies +the executable. + + It may happen that this requirement contradicts the license +restrictions of other proprietary libraries that do not normally +accompany the operating system. Such a contradiction means you cannot +use both them and the Library together in an executable that you +distribute. + + 7. You may place library facilities that are a work based on the +Library side-by-side in a single library together with other library +facilities not covered by this License, and distribute such a combined +library, provided that the separate distribution of the work based on +the Library and of the other library facilities is otherwise +permitted, and provided that you do these two things: + + a) Accompany the combined library with a copy of the same work + based on the Library, uncombined with any other library + facilities. This must be distributed under the terms of the + Sections above. + + b) Give prominent notice with the combined library of the fact + that part of it is a work based on the Library, and explaining + where to find the accompanying uncombined form of the same work. + + 8. You may not copy, modify, sublicense, link with, or distribute +the Library except as expressly provided under this License. Any +attempt otherwise to copy, modify, sublicense, link with, or +distribute the Library is void, and will automatically terminate your +rights under this License. However, parties who have received copies, +or rights, from you under this License will not have their licenses +terminated so long as such parties remain in full compliance. + + 9. You are not required to accept this License, since you have not +signed it. 
However, nothing else grants you permission to modify or +distribute the Library or its derivative works. These actions are +prohibited by law if you do not accept this License. Therefore, by +modifying or distributing the Library (or any work based on the +Library), you indicate your acceptance of this License to do so, and +all its terms and conditions for copying, distributing or modifying +the Library or works based on it. + + 10. Each time you redistribute the Library (or any work based on the +Library), the recipient automatically receives a license from the +original licensor to copy, distribute, link with or modify the Library +subject to these terms and conditions. You may not impose any further +restrictions on the recipients' exercise of the rights granted herein. +You are not responsible for enforcing compliance by third parties with +this License. + + 11. If, as a consequence of a court judgment or allegation of patent +infringement or for any other reason (not limited to patent issues), +conditions are imposed on you (whether by court order, agreement or +otherwise) that contradict the conditions of this License, they do not +excuse you from the conditions of this License. If you cannot +distribute so as to satisfy simultaneously your obligations under this +License and any other pertinent obligations, then as a consequence you +may not distribute the Library at all. For example, if a patent +license would not permit royalty-free redistribution of the Library by +all those who receive copies directly or indirectly through you, then +the only way you could satisfy both it and this License would be to +refrain entirely from distribution of the Library. + +If any portion of this section is held invalid or unenforceable under +any particular circumstance, the balance of the section is intended to +apply, and the section as a whole is intended to apply in other +circumstances. 
+ +It is not the purpose of this section to induce you to infringe any +patents or other property right claims or to contest validity of any +such claims; this section has the sole purpose of protecting the +integrity of the free software distribution system which is +implemented by public license practices. Many people have made +generous contributions to the wide range of software distributed +through that system in reliance on consistent application of that +system; it is up to the author/donor to decide if he or she is willing +to distribute software through any other system and a licensee cannot +impose that choice. + +This section is intended to make thoroughly clear what is believed to +be a consequence of the rest of this License. + + 12. If the distribution and/or use of the Library is restricted in +certain countries either by patents or by copyrighted interfaces, the +original copyright holder who places the Library under this License +may add an explicit geographical distribution limitation excluding those +countries, so that distribution is permitted only in or among +countries not thus excluded. In such case, this License incorporates +the limitation as if written in the body of this License. + + 13. The Free Software Foundation may publish revised and/or new +versions of the Lesser General Public License from time to time. +Such new versions will be similar in spirit to the present version, +but may differ in detail to address new problems or concerns. + +Each version is given a distinguishing version number. If the Library +specifies a version number of this License which applies to it and +"any later version", you have the option of following the terms and +conditions either of that version or of any later version published by +the Free Software Foundation. If the Library does not specify a +license version number, you may choose any version ever published by +the Free Software Foundation. + + 14. 
If you wish to incorporate parts of the Library into other free +programs whose distribution conditions are incompatible with these, +write to the author to ask for permission. For software which is +copyrighted by the Free Software Foundation, write to the Free +Software Foundation; we sometimes make exceptions for this. Our +decision will be guided by the two goals of preserving the free status +of all derivatives of our free software and of promoting the sharing +and reuse of software generally. + + NO WARRANTY + + 15. BECAUSE THE LIBRARY IS LICENSED FREE OF CHARGE, THERE IS NO +WARRANTY FOR THE LIBRARY, TO THE EXTENT PERMITTED BY APPLICABLE LAW. +EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR +OTHER PARTIES PROVIDE THE LIBRARY "AS IS" WITHOUT WARRANTY OF ANY +KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE +LIBRARY IS WITH YOU. SHOULD THE LIBRARY PROVE DEFECTIVE, YOU ASSUME +THE COST OF ALL NECESSARY SERVICING, REPAIR OR CORRECTION. + + 16. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN +WRITING WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY +AND/OR REDISTRIBUTE THE LIBRARY AS PERMITTED ABOVE, BE LIABLE TO YOU +FOR DAMAGES, INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR +CONSEQUENTIAL DAMAGES ARISING OUT OF THE USE OR INABILITY TO USE THE +LIBRARY (INCLUDING BUT NOT LIMITED TO LOSS OF DATA OR DATA BEING +RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD PARTIES OR A +FAILURE OF THE LIBRARY TO OPERATE WITH ANY OTHER SOFTWARE), EVEN IF +SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH +DAMAGES. 
+ + END OF TERMS AND CONDITIONS + + How to Apply These Terms to Your New Libraries + + If you develop a new library, and you want it to be of the greatest +possible use to the public, we recommend making it free software that +everyone can redistribute and change. You can do so by permitting +redistribution under these terms (or, alternatively, under the terms +of the ordinary General Public License). + + To apply these terms, attach the following notices to the library. +It is safest to attach them to the start of each source file to most +effectively convey the exclusion of warranty; and each file should +have at least the "copyright" line and a pointer to where the full +notice is found. + + + <one line to give the library's name and a brief idea of what it does.> + Copyright (C) <year> <name of author> + + This library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + This library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with this library; if not, write to the Free Software + Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + +Also add information on how to contact you by electronic and paper mail. + +You should also get your employer (if you work as a programmer) or +your school, if any, to sign a "copyright disclaimer" for the library, +if necessary. Here is a sample; alter the names: + + Yoyodyne, Inc., hereby disclaims all copyright interest in the + library `Frob' (a library for tweaking knobs) written by James + Random Hacker. + + <signature of Ty Coon>, 1 April 1990 + Ty Coon, President of Vice + +That's all there is to it! 
+ + diff --git a/doc/COPYRIGHT b/doc/COPYRIGHT new file mode 100644 index 0000000..8974c3b --- /dev/null +++ b/doc/COPYRIGHT @@ -0,0 +1,44 @@ +Unless specified otherwise in the "exceptions section" below: + +Copyright (C) 1997-2003 Sistina Software, Inc. All rights reserved. +Copyright (C) 2004-2010 Red Hat, Inc. All rights reserved. + +Exceptions: + +gfs2/man/gfs2_mount.8: + Portions copyright (C) 2001-2003 The OpenGFS2 Project + Portions copyright (C) 2004 + Copyright (C) 2004-2010 Red Hat, Inc. All rights reserved. + +Authors as known by current RCS as of the time of writing: + +Abhijith Das +Adam Manthei +A. J. Lewis +Alasdair G. Kergon +Andrew Price +Benjamin Marzinski +Bob Peterson +Chris Feist +Christine Caulfield +Daniel Phillips +David Teigland +Fabio M. Di Nitto +James Parsons +Joel Becker +Jonathan Brassow +jparsons +Ken Preslan +Lon Hohberger +Marc - A. Dahlhaus +Marek 'marx' Grac +Mark Hlawatschek +Michael Conrad Tadpol Tilstra +Patrick Caulfield +Robert Peterson +Ross Vandegrift +Ryan McCabe +Ryan O'Hara +Stanko Kupcevic +Steven Whitehouse +Wendy Cheng diff --git a/doc/Makefile.am b/doc/Makefile.am new file mode 100644 index 0000000..cf671d5 --- /dev/null +++ b/doc/Makefile.am @@ -0,0 +1,10 @@ +MAINTAINERCLEANFILES = Makefile.in + +dist_doc_DATA = \ + gfs2.txt \ + journaling.txt \ + COPYING.applications \ + COPYING.libraries \ + COPYRIGHT \ + README.contributing \ + README.licence diff --git a/doc/README.contributing b/doc/README.contributing new file mode 100644 index 0000000..26ec77d --- /dev/null +++ b/doc/README.contributing @@ -0,0 +1,65 @@ +Contributing to gfs2-utils +-------------------------- + +Here are some brief guidelines to follow when contributing to gfs2-utils. + +Translations +------------ + +We use the Zanata translation service: + + https://fedora.zanata.org/project/view/gfs2-utils + +See the documentation there for submitting translations. 
+ +Patches +------- + +We don't dictate any particular coding style but please try to use a style +consistent with the existing code. If in doubt, the Linux kernel coding style +document is a good guideline: + + http://www.kernel.org/doc/Documentation/CodingStyle + +We use git for managing our source code and we assume here that you're familiar +with git. Patches should apply cleanly to the latest master branch of +gfs2-utils.git + + https://pagure.io/gfs2-utils + +For ease of review and maintenance each of your patches should address a single +issue and if there are multiple issues please consider spreading your work over +several patches. Ideally none of the individual patches should break the build. + +We value good commit logs, which should be of the form: + + component: short patch summary + + Longer description wrapped at approx. 72 columns explaining the problem the + patch addresses and how the patch addresses it. + + Signed-off-by: Your Name <your.name@example.com> + +The "component" should be the name of the tool or the part of the code which +the patch touches. As we share a mailing list with several projects it should +make clear that it's a gfs2-utils patch. Some examples: + +Bad short logs: + + Fix a bug + Add a test + +Good short logs: + + fsck.gfs2: Fix a null pointer dereference in foo + gfs2-utils: Add a test for lgfs2_do_stuff + +Be sure to reference any relevant bug reports in your long description, e.g. + + Ref: rhbz#012345 + Fixes: rhbz#98765 + +Please send patches to <cluster-devel@redhat.com>. We recommend using +`git format-patch' to generate patch emails from your commits and `git +send-email' for sending them to the list. See the git documentation for +details. diff --git a/doc/README.licence b/doc/README.licence new file mode 100644 index 0000000..075aa77 --- /dev/null +++ b/doc/README.licence @@ -0,0 +1,33 @@ +The Red Hat Cluster is a collection of free software built on top of different +libraries and applications. 
+ +For a detailed list of authors and copyright holders, please check the +included COPYRIGHT file. + +Libraries: + +You can redistribute them and/or modify them under the terms of the GNU Lesser +General Public License as published by the Free Software Foundation; either +version 2.1 of the License, or (at your option) any later version. + +The libraries are distributed in the hope that they will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more +details. + +Applications: + +You can redistribute them and/or modify them under the terms of the GNU General +Public License as published by the Free Software Foundation; either version +2 of the License, or (at your option) any later version. + +The applications are distributed in the hope that they will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more +details. + +A copy of each license is included for your convenience in COPYING.applications +and COPYING.libraries. + +If missing, write to the Free Software Foundation, Inc., 51 Franklin St, +Fifth Floor, Boston, MA 02110-1301 USA. diff --git a/doc/README.tests b/doc/README.tests new file mode 100644 index 0000000..1b90a0c --- /dev/null +++ b/doc/README.tests @@ -0,0 +1,70 @@ +Working with the gfs2-utils test suite +-------------------------------------- + +Overview +-------- +The test suite in the tests directory of the gfs2-utils source tree is based on +the Autotest framework provided by Autoconf. The basic idea is that the +testsuite.at file is the main source file for the tests, written in m4 and +generating a Bourne shell script called testsuite, which we run with 'make check'. 
+ +When run, the test suite sources tests/atconfig and tests/atlocal for +configuration and then runs the whole suite or a specified set of tests, +depending on options passed to the test suite using TOPTS. For example, to see +a list of available options, use 'make check TOPTS=-h' and to see a numbered +list of the available tests, use 'make check TOPTS=-l'. + +A subset of the available tests can be run using keywords and/or by specifying +the test ID numbers in TOPTS, e.g. make check TOPTS='-k mkfs 24 25' + +Test output is captured and, if a test fails, a log is kept in +tests/testsuite.dir/$n/testsuite.log where $n is the test's ID number. Failed +tests can be re-run using make check TOPTS='--recheck' although it's better to +re-run the entire suite after fixing tests as a fix for one test could break +another. + +Writing tests +------------- +A number of GFS2-specific convenience macros have been defined in testsuite.at +to make defining new tests quick and easy. Also, some variables have been +defined in atlocal.in so that full paths to programs do not have to be included +in each test. Configuration should be specified in atlocal.in as atconfig is +generated by the configure script and atlocal is generated from atlocal.in at +build time. + +To keep the test suite organised, the testsuite.at file sources the actual +tests from other files, e.g. mkfs.at. + +A single test, specified as a test group in Autotest terms, follows the form + + AT_SETUP([Test title]) + ...test goes here... + AT_CLEANUP + +so, when adding tests, this is generally all that is required unless the tests +do not fit into an existing category, in which case AT_BANNER can be used to +group them, and they can be organised into a new .at file and sourced from +testsuite.at. + +As the tests can be run individually, any new tests which require the dummy +volume $GFS_TGT to be present should call GFS_TGT_REGEN before attempting to +use it. 
+ +Documentation for Autotest, including the AT_* macros used to define tests, can +be found in the autoconf manual at: + + http://www.gnu.org/software/autoconf/manual/index.html + +Generating coverage reports +--------------------------- +Test coverage instrumentation can be enabled using the --enable-gcov option at +the configure stage. Once the tools have been built and run with this option +enabled, coverage data will be written to files in the source directories for +use by tools such as gcov or lcov. For example, to generate a HTML report of +testsuite code coverage, using lcov, in a directory named 'coverage': + + ./autogen.sh + ./configure --enable-gcov + make check + lcov --directory . -c -o gfs2-utils.info + genhtml -o coverage gfs2-utils.info diff --git a/doc/cluster.fig b/doc/cluster.fig new file mode 100644 index 0000000..b896a10 --- /dev/null +++ b/doc/cluster.fig @@ -0,0 +1,102 @@ +#FIG 3.2 Produced by xfig version 3.2.5a +Landscape +Center +Inches +Letter +100.00 +Single +-2 +1200 2 +5 1 0 1 0 7 50 -1 -1 0.000 0 1 1 1 7332.065 8049.457 3900 5025 3075 6375 2775 7650 + 1 1 1.00 60.00 120.00 + 1 0 1.00 60.00 120.00 +5 1 0 1 0 7 50 -1 -1 0.000 0 1 1 0 2640.072 5956.954 3300 7650 4275 6750 4200 5025 + 1 1 1.00 60.00 120.00 +5 1 0 1 0 7 50 -1 -1 0.000 0 0 1 1 2812.500 5137.500 2175 4350 2775 4125 3450 4350 + 1 1 1.00 60.00 120.00 + 1 1 1.00 60.00 120.00 +5 1 0 1 0 7 50 -1 -1 0.000 0 0 1 1 3770.455 7155.682 1650 5925 2700 4950 3450 4725 + 1 1 1.00 60.00 120.00 + 1 1 1.00 60.00 120.00 +5 1 0 1 0 7 50 -1 -1 0.000 0 1 0 1 9962.903 13539.746 9075 5025 4950 6600 3750 7650 + 1 1 1.00 60.00 120.00 +5 1 0 1 0 7 50 -1 -1 0.000 0 1 1 0 10339.926 6368.284 9300 5025 8700 5925 9225 7650 + 1 1 1.00 60.00 120.00 +5 1 0 1 0 7 50 -1 -1 0.000 0 0 0 1 7917.672 6484.914 9525 5025 10050 6075 9750 7650 + 1 1 1.00 60.00 120.00 +5 1 0 1 0 7 50 -1 -1 0.000 0 0 1 1 7531.731 2331.731 4125 3975 3750 2400 3900 1275 + 1 1 1.00 60.00 120.00 + 1 1 1.00 60.00 120.00 +5 1 0 1 0 7 50 -1 -1 
0.000 0 0 1 1 10477.754 2669.174 8850 3975 8400 2475 8925 1275 + 1 1 1.00 60.00 120.00 + 1 1 1.00 60.00 120.00 +5 1 0 1 0 7 50 -1 -1 0.000 0 1 1 1 6825.000 2625.000 9675 3975 9975 2475 9675 1275 + 1 1 1.00 60.00 120.00 + 1 1 1.00 60.00 120.00 +5 1 0 1 0 7 50 -1 -1 0.000 0 0 1 1 2690.779 2326.844 5025 1275 5250 2400 4650 3975 + 1 1 1.00 60.00 120.00 + 1 1 1.00 60.00 120.00 +5 1 0 1 0 7 50 -1 -1 0.000 0 0 1 1 6675.000 6531.250 5250 4500 6675 4050 8100 4500 + 1 1 1.00 60.00 120.00 + 1 1 1.00 60.00 120.00 +5 1 0 1 0 7 50 -1 -1 0.000 0 0 1 1 10499.364 5185.805 9900 4500 10500 4275 11175 4575 + 1 1 1.00 60.00 120.00 + 1 1 1.00 60.00 120.00 +5 1 0 1 0 7 50 -1 -1 0.000 0 0 1 1 6478.423 21308.838 3975 8175 5925 7950 8550 8100 + 1 1 1.00 60.00 120.00 + 1 1 1.00 60.00 120.00 +5 1 0 1 0 7 50 -1 -1 0.000 0 1 1 1 6102.404 -1403.365 3975 8400 5850 8625 8550 8325 + 1 1 1.00 60.00 120.00 + 1 1 1.00 60.00 120.00 +5 1 0 1 0 7 50 -1 -1 0.000 0 1 1 1 9490.909 6835.048 11400 7650 11475 6225 9900 4800 + 1 1 1.00 60.00 120.00 + 1 1 1.00 60.00 120.00 +1 2 0 1 0 4 100 -1 20 0.000 1 0.0000 8534 4503 3825 600 4709 4503 12359 4503 +1 2 0 1 0 4 100 -1 20 0.000 1 0.0000 1961 4596 1575 2250 386 4596 3536 4596 +2 1 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 2 + 225 7425 12825 7425 +2 4 0 1 0 6 50 -1 20 0.000 0 0 7 0 0 5 + 12675 1275 600 1275 600 375 12675 375 12675 1275 +2 4 0 1 0 1 50 -1 20 0.000 0 0 7 0 0 5 + 3975 8625 825 8625 825 7650 3975 7650 3975 8625 +2 4 0 1 0 1 50 -1 20 0.000 0 0 7 0 0 5 + 12225 8550 8550 8550 8550 7650 12225 7650 12225 8550 +2 4 0 1 0 2 50 -1 20 0.000 0 0 7 0 0 5 + 2250 6975 375 6975 375 5925 2250 5925 2250 6975 +2 4 0 1 0 2 50 -1 20 0.000 0 0 7 0 0 5 + 2175 5025 300 5025 300 3975 2175 3975 2175 5025 +2 4 0 1 0 6 50 -1 20 0.000 0 0 7 0 0 5 + 5250 5025 3450 5025 3450 3975 5250 3975 5250 5025 +2 4 0 1 0 6 50 -1 20 0.000 0 0 7 0 0 5 + 9900 5025 8100 5025 8100 3975 9900 3975 9900 5025 +2 4 0 1 0 2 50 -1 20 0.000 0 0 7 0 0 5 + 13050 5025 11175 5025 11175 3975 13050 3975 13050 5025 +4 
0 0 50 -1 0 12 0.0000 4 180 780 4050 900 Corosync\001 +4 0 0 50 -1 0 12 0.0000 4 180 930 675 4650 gfs_control\001 +4 0 0 50 -1 0 12 0.0000 4 180 1035 3825 4575 gfs_controld\001 +4 0 0 50 -1 0 12 0.0000 4 180 1035 825 6525 mount.gfs[2]\001 +4 0 0 50 -1 0 12 0.0000 4 165 570 1950 8250 GFS[2]\001 +4 0 0 50 -1 0 12 0.0000 4 135 435 9900 8175 DLM\001 +4 0 0 50 -1 0 12 0.0000 4 135 540 5100 7650 Kernel\001 +4 0 0 50 -1 0 12 0.0000 4 180 810 5100 7350 Userspace\001 +4 0 0 50 -1 0 12 0.0000 4 180 420 6375 6000 Sysfs\001 +4 0 0 50 -1 0 12 0.0000 4 135 1005 2400 5325 Unix Socket\001 +4 0 0 50 -1 0 12 0.0000 4 135 1005 2400 4050 Unix Socket\001 +4 0 0 50 -1 0 12 0.0000 4 135 585 4500 5625 Uevent\001 +4 0 0 50 -1 0 12 0.0000 4 180 420 8700 6375 Sysfs\001 +4 0 0 50 -1 0 12 0.0000 4 135 585 10050 5775 Uevent\001 +4 0 0 50 -1 0 12 4.7124 4 180 2235 3525 1575 CPG "gfs:mount:"\001 +4 0 0 50 -1 0 12 4.7124 4 180 1560 4875 1575 CPG "gfs:controld"\001 +4 0 0 50 -1 0 12 4.7124 4 135 1635 8700 1800 CPG "dlm:controld"\001 +4 0 0 50 -1 0 12 4.7124 4 135 1920 10050 1725 GPG "dlm:ls:"\001 +4 0 0 50 -1 0 12 0.0000 4 165 1110 8550 4575 dlm_controld\001 +4 0 0 50 -1 0 12 0.0000 4 165 1005 11625 4575 dlm_control\001 +4 0 0 50 -1 0 12 0.0000 4 135 1005 10050 4200 Unix Socket\001 +4 0 0 50 -1 0 12 0.0000 4 180 2160 5100 8775 Posix lock requests/replies\001 +4 0 0 50 -1 0 12 0.0000 4 180 2370 5850 7875 DLM lock requests/callbacks\001 +4 0 0 50 -1 0 12 0.0000 4 165 1050 11550 6150 (Posix locks)\001 +4 0 0 50 -1 0 12 0.0000 4 135 1020 11475 5925 Misc Device\001 +4 0 0 50 -1 0 12 0.0000 4 135 1005 6225 4275 Unix Socket\001 +4 0 4 50 -1 0 12 0.0000 4 135 1125 6975 3825 libdlmcontrol\001 +4 0 4 50 -1 0 12 0.0000 4 180 1050 1350 2250 libgfscontrol\001 +4 0 0 50 -1 0 12 0.0000 4 180 420 3000 6750 Sysfs\001 diff --git a/doc/gfs2.txt b/doc/gfs2.txt new file mode 100644 index 0000000..f2660d8 --- /dev/null +++ b/doc/gfs2.txt @@ -0,0 +1,46 @@ +Global File System +------------------ + 
+http://sources.redhat.com/cluster/ + +GFS2 is a cluster file system. It allows a cluster of computers to +simultaneously use a block device that is shared between them (with FC, +iSCSI, NBD, etc). GFS2 reads and writes to the block device like a local +file system, but also uses a lock module to allow the computers to coordinate +their I/O so file system consistency is maintained. One of the nifty +features of GFS2 is perfect consistency -- changes made to the file system +on one machine show up immediately on all other machines in the cluster. + +GFS2 uses interchangeable inter-node locking mechanisms. The currently +supported methods are: + + lock_nolock -- does no real locking and allows gfs to be used as a + local file system + + lock_dlm -- uses a distributed lock manager (dlm) for inter-node locking + The dlm is found at linux/fs/dlm/ + +Lock_dlm depends on user space cluster management systems found +at the URL above. + +To use GFS2 as a local file system, no external clustering systems are +needed, simply: + + $ gfs2_mkfs -p lock_nolock -j 1 /dev/block_device + $ mount -t gfs2 /dev/block_device /dir + +GFS2 is not on-disk compatible with previous versions of GFS, but it does +use a very similar on-disk format, so that upgrading a filesystem can be +done in place and makes relatively few changes. Upgrading a filesystem +to GFS2 is not currently reversible. 
+ +The following man pages can be found at the URL above: + mkfs.gfs2 to make a filesystem + fsck.gfs2 to repair a filesystem + gfs2_grow to expand a filesystem online + gfs2_jadd to add journals to a filesystem online + gfs2_tool to manipulate, examine and tune a filesystem + gfs2_quota to examine and change quota values in a filesystem + gfs2_convert to convert a gfs filesystem to gfs2 + mount.gfs2 to find mount options + diff --git a/doc/journaling.txt b/doc/journaling.txt new file mode 100644 index 0000000..955885a --- /dev/null +++ b/doc/journaling.txt @@ -0,0 +1,166 @@ +o Journaling & Replay + +The fundamental problem with a journaled cluster filesystem is +handling journal replay with multiple journals. A single block of +metadata can be modified sequentially by many different nodes in the +cluster. As the block is modified by each node, it gets logged in the +journal for each node. If care is not taken, it's possible to get +into a situation where a journal replay can actually corrupt a +filesystem. The error scenario is: + +1) Node A modifies a metadata block by putting an updated copy into its + incore log. +2) Node B wants to read and modify the block so it requests the lock + and a blocking callback is sent to Node A. +3) Node A flushes its incore log to disk, and then syncs out the + metadata block to its inplace location. +4) Node A then releases the lock. +5) Node B reads in the block and puts a modified copy into its ondisk + log and then the inplace block location. +6) Node A crashes. + +At this point, Node A's journal needs to be replayed. Since there is +a newer version of the block inplace, if that block is replayed, the +filesystem will be corrupted. There are a few different ways of +avoiding this problem. + +1) Generation Numbers (GFS1) + + Each metadata block has a header in it that contains a 64-bit + generation number. As each block is logged into a journal, the + generation number is incremented. 
This provides a strict ordering + of the different versions of the block as they are logged in the FS' + different journals. When journal replay happens, each block in the + journal is not replayed if the generation number in the journal is less + than the generation number in place. This ensures that a newer + version of a block is never replaced with an older version. So, + this solution basically allows multiple copies of the same block in + different journals, but it allows you to always know which is the + correct one. + + Pros: + + A) This method allows the fastest callbacks. To release a lock, + the incore log for the lock must be flushed and then the inplace + data and metadata must be synced. That's it. The sync + operations involved are: start the log body and wait for it to + become stable on the disk, synchronously write the commit block, + start the inplace metadata and wait for it to become stable on + the disk. + + Cons: + + A) Maintaining the generation numbers is expensive. All newly + allocated metadata blocks must be read off the disk in order to + figure out what the previous value of the generation number was. + When deallocating metadata, extra work and care must be taken to + make sure dirty data isn't thrown away in such a way that the + generation numbers stop doing their thing. + B) You can't continue to modify the filesystem during journal + replay. Basically, replay of a block is a read-modify-write + operation: the block is read from disk, the generation number is + compared, and (maybe) the new version is written out. Replay + requires that the R-M-W operation is atomic with respect to + other R-M-W operations that might be happening (say by a normal + I/O process). Since journal replay doesn't (and can't) play by + the normal metadata locking rules, you can't count on them to + protect replay. Hence, GFS1 quiesces all writes on a filesystem + before starting replay. 
This provides the mutual exclusion + required, but it's slow and unnecessarily interrupts service on + the whole cluster. + +2) Total Metadata Sync (OCFS2) + + This method is really simple in that it uses exactly the same + infrastructure that a local journaled filesystem uses. Every time + a node receives a callback, it stops all metadata modification, + syncs out the whole incore journal, syncs out any dirty data, marks + the journal as being clean (unmounted), and then releases the lock. + Because the journal is marked as clean and recovery won't look at any + of the journaled blocks in it, a valid copy of any particular block + only exists in one journal at a time and that journal is always the + journal that modified it last. + + Pros: + + A) Very simple to implement. + B) You can reuse journaling code from other places (such as JBD). + C) No quiesce necessary for replay. + D) No need for generation numbers sprinkled throughout the metadata. + + Cons: + + A) This method has the slowest possible callbacks. The sync + operations are: stop all metadata operations, start and wait for + the log body, write the log commit block, start and wait for all + the FS' dirty metadata, write an unmount block. Writing the + metadata for the whole filesystem can be particularly expensive + because it can be scattered all over the disk and there can be a + whole journal's worth of it. + +3) Revocation of a lock's buffers (GFS2) + + This method prevents a block from appearing in more than one + journal by canceling out the metadata blocks in the journal that + belong to the lock being released. Journaling works very similarly + to a local filesystem or to #2 above. + + The biggest difference is you have to keep track of buffers in the + active region of the ondisk journal, even after the inplace blocks + have been written back. This is done in GFS2 by adding a second + part to the Active Items List. 
The first part (in GFS2 called + AIL1) contains a list of all the blocks which have been logged to + the journal, but not written back to their inplace location. Once + an item in AIL1 has been written back to its inplace location, it + is moved to AIL2. Once the tail of the log moves past the block's + transaction in the log, it can be removed from AIL2. + + When a callback occurs, the log is flushed to the disk and the + metadata for the lock is synced to disk. At this point, any + metadata blocks for the lock that are in the current active region + of the log will be in the AIL2 list. We then build a transaction + that contains revoke tags for each buffer in the AIL2 list that + belongs to that lock. + + Pros: + + A) No quiesce necessary for replay + B) No need for generation numbers sprinkled throughout the + metadata. + C) The sync operations are: stop all metadata operations, start and + wait for the log body, write the log commit block, start and + wait for all the FS' dirty metadata, start and wait for the log + body of a transaction that revokes any of the lock's metadata + buffers in the journal's active region, and write the commit + block for that transaction. + + Cons: + + A) Recovery takes two passes, one to find all the revoke tags in + the log and one to replay the metadata blocks using the revoke + tags as a filter. This is necessary for a local filesystem and + the total sync method, too. It's just that there will probably + be more tags. + +Comparing #2 and #3, both do extra I/O during a lock callback to make +sure that any metadata blocks in the log for that lock will be +removed. I believe #2 will be slower because syncing out all the +dirty metadata for the entire filesystem requires lots of little, +scattered I/O across the whole disk. The extra I/O done by #3 is a +log write to the disk. So, not only should it be less I/O, but it +should also be better suited to get good performance out of the disk +subsystem. 
+ +KWP 07/06/05 + +Further notes (Steven Whitehouse) +------------- + +Number 3 is slow due to having to do two write/wait transactions +in the log each time we release a glock. So far as I can see there +is no way around that, but it should be possible, if we so wish to +change to using #2 at some future date and still remain backward +compatible. So that option is open to us, but I'm not sure that we +want to take it yet. There may well be other ways to speed things +up in this area. More work remains to be done. + diff --git a/gfs2/Makefile.am b/gfs2/Makefile.am new file mode 100644 index 0000000..5a2eefd --- /dev/null +++ b/gfs2/Makefile.am @@ -0,0 +1,13 @@ +MAINTAINERCLEANFILES = Makefile.in + +SUBDIRS = \ + include \ + libgfs2 \ + convert \ + edit \ + fsck \ + mkfs \ + man \ + tune \ + glocktop \ + scripts diff --git a/gfs2/convert/Makefile.am b/gfs2/convert/Makefile.am new file mode 100644 index 0000000..4db643f --- /dev/null +++ b/gfs2/convert/Makefile.am @@ -0,0 +1,11 @@ +MAINTAINERCLEANFILES = Makefile.in + +sbin_PROGRAMS = gfs2_convert + +gfs2_convert_CPPFLAGS = \ + -D_FILE_OFFSET_BITS=64 \ + -I$(top_srcdir)/gfs2/include \ + -I$(top_srcdir)/gfs2/libgfs2 + +gfs2_convert_LDADD = $(top_builddir)/gfs2/libgfs2/libgfs2.la +gfs2_convert_LDFLAGS = $(uuid_LIBS) diff --git a/gfs2/convert/gfs2_convert.c b/gfs2/convert/gfs2_convert.c new file mode 100644 index 0000000..9cf97b6 --- /dev/null +++ b/gfs2/convert/gfs2_convert.c @@ -0,0 +1,2368 @@ +/***************************************************************************** +****************************************************************************** +** +** gfs2_convert - convert a gfs1 filesystem into a gfs2 filesystem. 
+** +****************************************************************************** +*****************************************************************************/ + +#include "clusterautoconfig.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#define _(String) gettext(String) + +#include +#include +#include +#include "osi_list.h" +#include "copyright.cf" +#include "libgfs2.h" + +/* The following declares are needed because gfs2 can't have */ +/* dependencies on gfs1: */ +#define RGRP_STUFFED_BLKS(sb) (((sb)->sb_bsize - sizeof(struct gfs2_rgrp)) * GFS2_NBBY) +#define RGRP_BITMAP_BLKS(sb) (((sb)->sb_bsize - sizeof(struct gfs2_meta_header)) * GFS2_NBBY) + +/* Define some gfs1 constants from gfs1's gfs_ondisk.h */ +#define GFS_METATYPE_NONE (0) +#define GFS_METATYPE_SB (1) /* Super-Block */ +#define GFS_METATYPE_RG (2) /* Resource Group Header */ +#define GFS_METATYPE_RB (3) /* Resource Group Block Alloc BitBlock */ +#define GFS_METATYPE_DI (4) /* "Disk" inode (dinode) */ +#define GFS_METATYPE_IN (5) /* Indirect dinode block list */ +#define GFS_METATYPE_LF (6) /* Leaf dinode block list */ +#define GFS_METATYPE_JD (7) /* Journal Data */ +#define GFS_METATYPE_LH (8) /* Log Header (gfs_log_header) */ +#define GFS_METATYPE_LD (9) /* Log Descriptor (gfs_log_descriptor) */ +#define GFS_METATYPE_EA (10) /* Extended Attribute */ +#define GFS_METATYPE_ED (11) /* Extended Attribute data */ + +/* GFS1 Dinode types */ +#define GFS_FILE_NON (0) +#define GFS_FILE_REG (1) /* regular file */ +#define GFS_FILE_DIR (2) /* directory */ +#define GFS_FILE_LNK (5) /* link */ +#define GFS_FILE_BLK (7) /* block device node */ +#define GFS_FILE_CHR (8) /* character device node */ +#define GFS_FILE_FIFO (101) /* fifo/pipe */ +#define GFS_FILE_SOCK (102) /* socket */ + +#define GFS_FORMAT_SB (100) /* Super-Block */ +#define GFS_FORMAT_FS (1309) /* Filesystem 
(all-encompassing) */ +#define GFS_FORMAT_MULTI (1401) /* Multi-Host */ + +#define DIV_RU(x, y) (((x) + (y) - 1) / (y)) /* integer division of x by y, rounded up */ + +struct inode_dir_block { + osi_list_t list; + uint64_t di_addr; + uint64_t di_paddr; /* Parent dir inode addr */ +}; + +struct inode_block { + osi_list_t list; + uint64_t di_addr; +}; + +struct blocklist { + osi_list_t list; + uint64_t block; /* disk address of this metadata block */ + struct metapath mp; /* metapath leading to this block */ + int height; /* metadata height of this block (0 for the dinode entry) */ + char *ptrbuf; /* saved copy of the block's pointer area */ +}; + +struct gfs2_options { + char *device; + unsigned int yes:1; + unsigned int no:1; + unsigned int query:1; +}; + +struct gfs_sb raw_gfs1_ondisk_sb; +struct gfs2_sbd sb2; +struct inode_block dirs_to_fix; /* linked list of directories to fix */ +struct inode_dir_block cdpns_to_fix; /* linked list of cdpn symlinks */ +int seconds; +struct timeval tv; +uint64_t dirs_fixed; +uint64_t cdpns_fixed; +uint64_t dirents_fixed; +struct gfs_jindex *sd_jindex = NULL; /* gfs1 journal index in memory */ +int gfs2_inptrs; +uint64_t gfs2_heightsize[GFS2_MAX_META_HEIGHT]; +uint64_t gfs2_jheightsize[GFS2_MAX_META_HEIGHT]; +uint32_t gfs2_max_height; +uint32_t gfs2_max_jheight; +uint64_t jindex_addr = 0, rindex_addr = 0; +int print_level = MSG_NOTICE; +unsigned orig_journals = 0; + +/* ------------------------------------------------------------------------- */ +/* This function is for libgfs's sake: it prints "label: " followed by fmt, formatted with the arguments that come after fmt2 (fmt2 itself is not used). */ +/* ------------------------------------------------------------------------- */ +void print_it(const char *label, const char *fmt, const char *fmt2, ...) +{ + va_list args; + + va_start(args, fmt2); + printf("%s: ", label); + vprintf(fmt, args); + va_end(args); +} + +/* ------------------------------------------------------------------------- */ +/* convert_bitmaps - Convert gfs1 bitmaps to gfs2 bitmaps. */ +/* Fixes all unallocated metadata bitmap states (which are */ +/* valid in gfs1 but invalid in gfs2). 
*/ +/* ------------------------------------------------------------------------- */ +static void convert_bitmaps(struct gfs2_sbd *sdp, struct rgrp_tree *rg) +{ + uint32_t blk; + int x, y; + struct gfs2_rindex *ri; + unsigned char state; + + ri = &rg->ri; + for (blk = 0; blk < ri->ri_length; blk++) { + struct gfs2_bitmap *bi; + x = (blk) ? sizeof(struct gfs2_meta_header) : + sizeof(struct gfs2_rgrp); /* skip the header: block 0 starts with the rgrp header, later blocks with a meta header */ + + bi = &rg->bits[blk]; + for (; x < sdp->bsize; x++) + for (y = 0; y < GFS2_NBBY; y++) { /* one GFS2_BIT_SIZE-bit state per step within byte x */ + state = (bi->bi_bh->b_data[x] >> + (GFS2_BIT_SIZE * y)) & 0x03; + if (state == 0x02) {/* unallocated metadata state invalid: clear bit 0x02 so the state becomes 0x00 */ + bi->bi_bh->b_data[x] &= ~(0x02 << (GFS2_BIT_SIZE * y)); + bmodified(bi->bi_bh); + } + } + } +}/* convert_bitmaps */ + +/* ------------------------------------------------------------------------- */ +/* convert_rgs - Convert gfs1 resource groups to gfs2, folding rg_freemeta into rg_free and accumulating the block and dinode totals in sbp. */ +/* Returns: 0 on success, -1 on failure (the code below only ever returns 0) */ +/* ------------------------------------------------------------------------- */ +static int convert_rgs(struct gfs2_sbd *sbp) +{ + struct rgrp_tree *rgd; + struct osi_node *n, *next = NULL; + struct gfs_rgrp *rgd1; + int rgs = 0; + + /* --------------------------------- */ + /* Now convert its rgs into gfs2 rgs */ + /* --------------------------------- */ + for (n = osi_first(&sbp->rgtree); n; n = next) { + next = osi_next(n); + rgd = (struct rgrp_tree *)n; + + rgd1 = (struct gfs_rgrp *)&rgd->rg; /* recast as gfs1 structure */ + /* rg_freemeta is a gfs1 field, so libgfs2 doesn't know to */ + /* convert from be to cpu. We must do it now. 
*/ + rgd->rg.rg_free = rgd1->rg_free + be32_to_cpu(rgd1->rg_freemeta); + /* Zero it out so we don't add it again in case something breaks */ + /* later on in the process and we have to re-run convert */ + rgd1->rg_freemeta = 0; + + sbp->blks_total += rgd->ri.ri_data; + sbp->blks_alloced += (rgd->ri.ri_data - rgd->rg.rg_free); + sbp->dinodes_alloced += rgd1->rg_useddi; + convert_bitmaps(sbp, rgd); + /* Write the updated rgrp to the gfs2 buffer */ + gfs2_rgrp_out(&rgd->rg, rgd->bits[0].bi_bh->b_data); + bmodified(rgd->bits[0].bi_bh); + rgs++; + if (rgs % 100 == 0) { /* progress dot every 100 resource groups */ + printf("."); + fflush(stdout); + } + } + return 0; +}/* convert_rgs */ + +/* ------------------------------------------------------------------------- */ +/* calc_gfs2_tree_height - calculate new dinode height as if this is gfs2 */ +/* Uses the larger of size and the inode's di_size; directories use the gfs2_jheightsize table, other inodes gfs2_heightsize. Never returns 0. */ +/* This is similar to calc_tree_height in libgfs2 but at the point this */ +/* function is called, I have the wrong (gfs1 not gfs2) constants in place. */ +/* ------------------------------------------------------------------------- */ +static unsigned int calc_gfs2_tree_height(struct gfs2_inode *ip, uint64_t size) +{ + uint64_t *arr; + unsigned int max, height; + + if (ip->i_di.di_size > size) + size = ip->i_di.di_size; + + if (S_ISDIR(ip->i_di.di_mode)) { + arr = gfs2_jheightsize; + max = gfs2_max_jheight; + } else { + arr = gfs2_heightsize; + max = gfs2_max_height; + } + + for (height = 0; height < max; height++) + if (arr[height] >= size) + break; + /* If calc_gfs2_tree_height was called, the dinode is not stuffed or + we would have returned before this point. After the call, a call is + made to fix_metatree, which unstuffs the dinode. Therefore, the + smallest height that can result after this call is 1. */ + if (!height) + height = 1; + + return height; +} + +/* ------------------------------------------------------------------------- */ +/* mp_gfs1_to_gfs2 - convert a gfs1 metapath to a gfs2 metapath. 
*/ +/* ------------------------------------------------------------------------- */ +static void mp_gfs1_to_gfs2(struct gfs2_sbd *sbp, int gfs1_h, int gfs2_h, + struct metapath *gfs1mp, struct metapath *gfs2mp) +{ + uint64_t lblock; + int h; + uint64_t gfs1factor[GFS2_MAX_META_HEIGHT]; + uint64_t gfs2factor[GFS2_MAX_META_HEIGHT]; + + /* figure out multiplication factors for each height - gfs1 */ + memset(&gfs1factor, 0, sizeof(gfs1factor)); + gfs1factor[gfs1_h - 1] = 1ull; /* NOTE(review): indexes [-1] if gfs1_h is 0; callers presumably pass heights >= 1 - confirm */ + for (h = gfs1_h - 1; h > 0; h--) + gfs1factor[h - 1] = gfs1factor[h] * sbp->sd_inptrs; + + /* figure out multiplication factors for each height - gfs2 */ + memset(&gfs2factor, 0, sizeof(gfs2factor)); + gfs2factor[gfs2_h - 1] = 1ull; /* NOTE(review): same assumption, gfs2_h >= 1 */ + for (h = gfs2_h - 1; h > 0; h--) + gfs2factor[h - 1] = gfs2factor[h] * gfs2_inptrs; + + /* Convert from gfs1 to a logical block */ + lblock = 0; + for (h = 0; h < gfs1_h; h++) + lblock += (gfs1mp->mp_list[h] * gfs1factor[h]); + + /* Convert from a logical block back to gfs2 */ + memset(gfs2mp, 0, sizeof(*gfs2mp)); + for (h = 0; h < gfs2_h; h++) { + /* Can't use do_div here because the factors are too large. */ + gfs2mp->mp_list[h] = lblock / gfs2factor[h]; /* pointer index at height h */ + lblock %= gfs2factor[h]; + } +} + +/* ------------------------------------------------------------------------- */ +/* fix_metatree - Fix up the metatree to match the gfs2 metapath info */ +/* Similar to gfs2_writei in libgfs2 but we're only */ +/* interested in rearranging the metadata while leaving the */ +/* actual data blocks intact. Copies size bytes of pointers starting at first_nonzero_ptr into the tree at blk->mp. 
*/ +/* ------------------------------------------------------------------------- */ +static void fix_metatree(struct gfs2_sbd *sbp, struct gfs2_inode *ip, + struct blocklist *blk, uint64_t *first_nonzero_ptr, + unsigned int size) +{ + uint64_t block; + struct gfs2_buffer_head *bh; + unsigned int amount, ptramt; + int hdrsize, h, copied = 0, new; + struct gfs2_meta_header mh; + char *srcptr = (char *)first_nonzero_ptr; + + mh.mh_magic = GFS2_MAGIC; + mh.mh_type = GFS2_METATYPE_IN; + mh.mh_format = GFS2_FORMAT_IN; + if (!ip->i_di.di_height) + unstuff_dinode(ip); + + ptramt = blk->mp.mp_list[blk->height] * sizeof(uint64_t); /* byte offset of the starting pointer slot */ + amount = size; + + while (copied < size) { + bh = ip->i_bh; + /* First, build up the metatree */ + for (h = 0; h < blk->height; h++) { + new = 0; + lookup_block(ip, bh, h, &blk->mp, 1, &new, &block); + if (bh != ip->i_bh) + brelse(bh); + if (!block) /* NOTE(review): when h > 0, bh was released just above yet is still dereferenced after this break; confirm lookup_block cannot yield 0 here */ + break; + + bh = bread(sbp, block); + if (new) + memset(bh->b_data, 0, sbp->bsize); + gfs2_meta_header_out(&mh, bh->b_data); + bmodified(bh); + } + + hdrsize = blk->height ? sizeof(struct gfs2_meta_header) : + sizeof(struct gfs2_dinode); + + if (amount > sbp->bsize - hdrsize - ptramt) + amount = sbp->bsize - hdrsize - ptramt; + + memcpy(bh->b_data + hdrsize + ptramt, (char *)srcptr, amount); + srcptr += amount; + bmodified(bh); + if (bh != ip->i_bh) + brelse(bh); + + copied += amount; + + if (hdrsize + ptramt + amount >= sbp->bsize) { + /* advance to the next metablock */ + blk->mp.mp_list[blk->height] += + (amount / sizeof(uint64_t)); + for (h = blk->height; h > 0; h--) { + if (blk->mp.mp_list[h] >= gfs2_inptrs) { + blk->mp.mp_list[h] = 0; + blk->mp.mp_list[h - 1]++; + continue; + } + break; + } + } + amount = size - copied; + ptramt = 0; /* subsequent blocks are filled starting at their first slot */ + } +} + +/* ------------------------------------------------------------------------- */ +/* adjust_indirect_blocks - convert all gfs_indirect blocks to gfs2. */ +/* */ +/* This function converts all gfs_indirect blocks to GFS2. 
The difference */ +/* is that a gfs1 indirect block has a 64-byte chunk of reserved space that */ +/* gfs2 does not. Since GFS block locations (relative to the start of the */ +/* file) have their locations defined by the offset from the end of the */ +/* structure, all block pointers must be shifted. */ +/* */ +/* Stuffed inodes don't need to be shifted at all since there are no indirect */ +/* blocks. Inodes with height 1 don't need to be shifted either, because */ +/* the dinode size is the same between gfs and gfs2 (232 bytes), and */ +/* therefore you can fit the same number of block pointers after the dinode */ +/* structure. For the normal 4K block size, that's 483 pointers. For 1K */ +/* blocks, it's 99 pointers. */ +/* */ +/* At height 2 things get complex. GFS1 reserves an area of 64 (0x40) bytes */ +/* at the start of the indirect block, so for 4K blocks, you can fit 501 */ +/* pointers. GFS2 doesn't reserve that space, so you can fit 509 pointers. */ +/* For 1K blocks, it's 117 pointers in GFS1 and 125 in GFS2. */ +/* */ +/* That means, for example, that if you have 4K blocks, a 946MB file will */ +/* require a height of 3 for GFS, but only a height of 2 for GFS2. */ +/* There isn't a good way to shift the pointers around from one height to */ +/* another, so the only way to do it is to rebuild all those indirect blocks */ +/* from empty ones. */ +/* */ +/* For example, with a 1K block size, if you do: */ +/* */ +/* dd if=/mnt/gfs/big of=/tmp/tocompare skip=496572346368 bs=1024 count=1 */ +/* */ +/* the resulting metadata paths will look vastly different for the data: */ +/* */ +/* height 0 1 2 3 4 5 */ +/* GFS1: 0x16 0x4b 0x70 0x11 0x5e 0x48 */ +/* GFS2: 0x10 0x21 0x78 0x05 0x14 0x76 */ +/* */ +/* To complicate matters, we can't really require free space. A user might */ +/* be trying to migrate a "full" gfs1 file system to GFS2. After we */ +/* convert the journals to GFS2, we might have more free space, so we can */ +/* allocate blocks at that time. 
*/ +/* */ +/* Assumes: GFS1 values are in place for diptrs and inptrs. */ +/* */ +/* Returns: 0 on success, -1 on failure */ +/* */ +/* Adapted from fsck.gfs2 metawalk.c's build_and_check_metalist */ +/* ------------------------------------------------------------------------- */ +/* jdata_mp_gfs1_to_gfs2 - convert a gfs1 jdata metapath to a gfs2 metapath by way of a byte offset. The extra entry gfs2mp->mp_list[gfs2_h] receives the byte offset within the final block, and *len is clamped to the bytes remaining before dinode_size. */ +static void jdata_mp_gfs1_to_gfs2(struct gfs2_sbd *sbp, int gfs1_h, int gfs2_h, + struct metapath *gfs1mp, struct metapath *gfs2mp, + unsigned int *len, uint64_t dinode_size) +{ + uint64_t offset; + int h; + uint64_t gfs1factor[GFS2_MAX_META_HEIGHT]; + uint64_t gfs2factor[GFS2_MAX_META_HEIGHT]; + + /* figure out multiplication factors for each height - gfs1 */ + memset(&gfs1factor, 0, sizeof(gfs1factor)); + gfs1factor[gfs1_h - 1] = sbp->bsize - sizeof(struct gfs2_meta_header); /* bytes addressed per lowest-level pointer; NOTE(review): assumes gfs1_h >= 1 */ + for (h = gfs1_h - 1; h > 0; h--) + gfs1factor[h - 1] = gfs1factor[h] * sbp->sd_inptrs; + + /* figure out multiplication factors for each height - gfs2 */ + memset(&gfs2factor, 0, sizeof(gfs2factor)); + gfs2factor[gfs2_h] = 1ull; /* extra level: byte offset within the block. NOTE(review): out of bounds if gfs2_h == GFS2_MAX_META_HEIGHT - confirm callers */ + gfs2factor[gfs2_h - 1] = sbp->bsize; + for (h = gfs2_h - 1; h > 0; h--) + gfs2factor[h - 1] = gfs2factor[h] * gfs2_inptrs; + + /* Convert from gfs1 to an offset */ + offset = 0; + for (h = 0; h < gfs1_h; h++) + offset += (gfs1mp->mp_list[h] * gfs1factor[h]); + + if (dinode_size - offset < *len) + *len = dinode_size - offset; + + /* Convert from an offset back to gfs2 */ + memset(gfs2mp, 0, sizeof(*gfs2mp)); + for (h = 0; h <= gfs2_h; h++) { + /* Can't use do_div here because the factors are too large. 
*/ + gfs2mp->mp_list[h] = offset / gfs2factor[h]; + offset %= gfs2factor[h]; + } +} +/* fix_jdatatree - copy size bytes from srcptr into the gfs2 metatree at blk->mp, looking up (and creating) the blocks along the path. Returns the last block address obtained from lookup_block. */ +static uint64_t fix_jdatatree(struct gfs2_sbd *sbp, struct gfs2_inode *ip, + struct blocklist *blk, char *srcptr, + unsigned int size) +{ + uint64_t block; + struct gfs2_buffer_head *bh; + unsigned int amount, ptramt; + int h, copied = 0, new = 0; + struct gfs2_meta_header mh; + + mh.mh_magic = GFS2_MAGIC; + mh.mh_type = GFS2_METATYPE_IN; + mh.mh_format = GFS2_FORMAT_IN; + + if (!ip->i_di.di_height) + unstuff_dinode(ip); + + ptramt = blk->mp.mp_list[blk->height]; /* used below as a byte offset into the block, not a pointer index */ + amount = size; + + while (copied < size) { + bh = ip->i_bh; + /* First, build up the metatree */ + for (h = 0; h < blk->height; h++) { + new = 0; + lookup_block(ip, bh, h, &blk->mp, 1, &new, &block); + if (bh != ip->i_bh) + brelse(bh); + if (!block) /* NOTE(review): when h > 0, bh was released just above yet is still dereferenced after this break */ + break; + + bh = bread(sbp, block); + if (new) + memset(bh->b_data, 0, sbp->bsize); + if (h < (blk->height - 1)) { /* the final block is written raw below, without a meta header */ + gfs2_meta_header_out(&mh, bh->b_data); + bmodified(bh); + } + } + + if (amount > sbp->bsize - ptramt) + amount = sbp->bsize - ptramt; + + memcpy(bh->b_data + ptramt, (char *)srcptr, amount); + srcptr += amount; + bmodified(bh); + if (bh != ip->i_bh) + brelse(bh); + + copied += amount; + + if (ptramt + amount >= sbp->bsize) { + /* advance to the next metablock */ + blk->mp.mp_list[blk->height] += amount; + for (h = blk->height; h > 0; h--) { + if (blk->mp.mp_list[h] >= gfs2_inptrs) { + blk->mp.mp_list[h] = 0; + blk->mp.mp_list[h - 1]++; + continue; + } + break; + } + } + amount = size - copied; + ptramt = 0; + } + return block; /* NOTE(review): block is never assigned if size == 0 or blk->height == 0, so this can return an uninitialized value */ +} +/* get_inode_metablocks - queue on blocks an entry for the dinode and for each indirect block reached from it, saving every block's pointer area in ptrbuf and zeroing it in the buffer. Returns 0 on success, -1 if memory allocation fails. */ +static int get_inode_metablocks(struct gfs2_sbd *sbp, struct gfs2_inode *ip, struct blocklist *blocks) +{ + struct blocklist *blk, *newblk; + struct gfs2_buffer_head *bh, *dibh = ip->i_bh; + osi_list_t *tmp; + uint64_t *ptr1, block; + int h, ptrnum; + int bufsize = sbp->bsize - sizeof(struct gfs_indirect); + + /* Add dinode block to the list */ + blk = malloc(sizeof(struct blocklist)); + if (!blk) { + log_crit(_("Error: Can't 
allocate memory for indirect block fix\n")); + return -1; + } + memset(blk, 0, sizeof(*blk)); + blk->block = dibh->b_blocknr; + blk->ptrbuf = malloc(bufsize); + if (!blk->ptrbuf) { + log_crit(_("Error: Can't allocate memory" + " for file conversion.\n")); + free(blk); + return -1; + } + memset(blk->ptrbuf, 0, bufsize); + /* Fill in the pointers from the dinode buffer */ + memcpy(blk->ptrbuf, dibh->b_data + sizeof(struct gfs_dinode), + sbp->bsize - sizeof(struct gfs_dinode)); + /* Zero out the pointers so we can fill them in later. */ + memset(dibh->b_data + sizeof(struct gfs_dinode), 0, + sbp->bsize - sizeof(struct gfs_dinode)); + osi_list_add_prev(&blk->list, &blocks->list); + + /* Now run the metadata chain and build lists of all metadata blocks */ + osi_list_foreach(tmp, &blocks->list) { + blk = osi_list_entry(tmp, struct blocklist, list); + + if (blk->height >= ip->i_di.di_height - 1) /* blocks at the lowest pointer level are not expanded here */ + continue; + for (ptr1 = (uint64_t *)blk->ptrbuf, ptrnum = 0; + ptrnum < sbp->sd_inptrs; ptr1++, ptrnum++) { + if (!*ptr1) + continue; + block = be64_to_cpu(*ptr1); + + newblk = malloc(sizeof(struct blocklist)); + if (!newblk) { + log_crit(_("Error: Can't allocate memory for indirect block fix.\n")); + return -1; /* NOTE(review): entries already queued on blocks are not freed here; presumably the caller cleans up - confirm */ + } + memset(newblk, 0, sizeof(*newblk)); + newblk->ptrbuf = malloc(bufsize); + if (!newblk->ptrbuf) { + /* FIXME: This message should be different, to not conflict with the above file conversion */ + log_crit(_("Error: Can't allocate memory for file conversion.\n")); + free(newblk); + return -1; + } + memset(newblk->ptrbuf, 0, bufsize); + newblk->block = block; + newblk->height = blk->height + 1; + /* Build the metapointer list from our predecessors */ + for (h = 0; h < blk->height; h++) + newblk->mp.mp_list[h] = blk->mp.mp_list[h]; + newblk->mp.mp_list[h] = ptrnum; + /* Queue it to be processed later on in the loop. 
*/ + osi_list_add_prev(&newblk->list, &blocks->list); + /* read the new metadata block's pointers */ + bh = bread(sbp, block); + memcpy(newblk->ptrbuf, bh->b_data + sizeof(struct gfs_indirect), bufsize); + /* Zero the buffer so we can fill it in later */ + memset(bh->b_data + sizeof(struct gfs_indirect), 0, bufsize); + bmodified(bh); + brelse(bh); + /* Free the block so we can reuse it. This allows us to + convert a "full" file system. */ + ip->i_di.di_blocks--; + gfs2_free_block(sbp, block); + } + } + return 0; +} +/* fix_ind_reg_or_dir - rewrite the pointers saved in blk->ptrbuf into the gfs2 metatree of ip, trimming leading and trailing zero pointers first. Always returns 0; the blocks parameter is not used. */ +static int fix_ind_reg_or_dir(struct gfs2_sbd *sbp, struct gfs2_inode *ip, uint32_t di_height, + uint32_t gfs2_hgt, struct blocklist *blk, struct blocklist *blocks) +{ + unsigned int len, bufsize; + uint64_t *ptr1, *ptr2; + int ptrnum; + struct metapath gfs2mp; + + bufsize = sbp->bsize - sizeof(struct gfs_indirect); + len = bufsize; + + /* Skip zero pointers at the start of the buffer. This may + seem pointless, but the gfs1 blocks won't align with the + gfs2 blocks. That means that a single block write of + gfs1's pointers is likely to span two blocks on gfs2. + That's a problem if the file system is full. + So I'm trying to truncate the data at the start and end + of the buffers (i.e. write only what we need to). 
*/ + for (ptr1 = (uint64_t *)blk->ptrbuf, ptrnum = 0; + ptrnum < sbp->sd_inptrs; ptr1++, ptrnum++) { + if (*ptr1 != 0x00) + break; + len -= sizeof(uint64_t); + } + /* Skip zero bytes at the end of the buffer */ + ptr2 = (uint64_t *)(blk->ptrbuf + bufsize) - 1; + while (len > 0 && *ptr2 == 0) { + ptr2--; + len -= sizeof(uint64_t); + } + blk->mp.mp_list[di_height - 1] = ptrnum; + mp_gfs1_to_gfs2(sbp, di_height, gfs2_hgt, &blk->mp, &gfs2mp); + memcpy(&blk->mp, &gfs2mp, sizeof(struct metapath)); + blk->height -= di_height - gfs2_hgt; + if (len) { + fix_metatree(sbp, ip, blk, ptr1, len); + ip->i_di.di_goal_meta = be64_to_cpu(*ptr2); + } + + return 0; +} + +static int fix_ind_jdata(struct gfs2_sbd *sbp, struct gfs2_inode *ip, uint32_t di_height, + uint32_t gfs2_hgt, uint64_t dinode_size, struct blocklist *blk, + struct blocklist *blocks) +{ + /*FIXME: Messages here should be different, to not conflit with messages in get_inode_metablocks */ + struct blocklist *newblk; + unsigned int len, bufsize; + uint64_t *ptr1, block; + int ptrnum, h; + struct metapath gfs2mp; + struct gfs2_buffer_head *bh; + + bufsize = sbp->bsize - sizeof(struct gfs2_meta_header); + /* + * For each metadata block that holds jdata block pointers, + * get the blk pointers and copy them block by block + */ + for (ptr1 = (uint64_t *) blk->ptrbuf, ptrnum = 0; + ptrnum < sbp->sd_inptrs; ptr1++, ptrnum++) { + if (!*ptr1) + continue; + block = be64_to_cpu(*ptr1); + + newblk = malloc(sizeof(struct blocklist)); + if (!newblk) { + log_crit(_("Error: Can't allocate memory for indirect block fix.\n")); + return -1; + } + memset(newblk, 0, sizeof(*newblk)); + newblk->ptrbuf = malloc(bufsize); + if (!newblk->ptrbuf) { + log_crit(_("Error: Can't allocate memory for file conversion.\n")); + free(newblk); + return -1; + } + memset(newblk->ptrbuf, 0, bufsize); + newblk->block = block; + newblk->height = blk->height + 1; + /* Build the metapointer list from our predecessors */ + for (h=0; h < blk->height; h++) + 
newblk->mp.mp_list[h] = blk->mp.mp_list[h]; + newblk->mp.mp_list[h] = ptrnum; + bh = bread(sbp, block); + /* This is a data block. i.e newblk->height == ip->i_di.di_height */ + /* read in the jdata block */ + memcpy(newblk->ptrbuf, bh->b_data + + sizeof(struct gfs2_meta_header), bufsize); + memset(bh->b_data + sizeof(struct gfs2_meta_header), 0, bufsize); + bmodified(bh); + brelse(bh); + /* Free the block so we can reuse it. This allows us to + convert a "full" file system */ + ip->i_di.di_blocks--; + gfs2_free_block(sbp, block); + + len = bufsize; + jdata_mp_gfs1_to_gfs2(sbp, di_height, gfs2_hgt, &newblk->mp, &gfs2mp, + &len, dinode_size); + memcpy(&newblk->mp, &gfs2mp, sizeof(struct metapath)); + newblk->height -= di_height - gfs2_hgt; + if (len) + ip->i_di.di_goal_meta = fix_jdatatree(sbp, ip, newblk, + newblk->ptrbuf, len); + free(newblk->ptrbuf); + free(newblk); + } + return 0; +} + +static int adjust_indirect_blocks(struct gfs2_sbd *sbp, struct gfs2_inode *ip) +{ + uint64_t dinode_size; + uint32_t gfs2_hgt, di_height; + osi_list_t *tmp=NULL, *x; + struct blocklist blocks, *blk; + int error = 0; + + int isdir = S_ISDIR(ip->i_di.di_mode); /* is always jdata */ + int isjdata = ((GFS2_DIF_JDATA & ip->i_di.di_flags) && !isdir); + int isreg = (!isjdata && !isdir); + int issys = (GFS2_DIF_SYSTEM & ip->i_di.di_flags); + + /* regular files and dirs are same upto height=2 + jdata files (not dirs) are same only when height=0 */ + if (((isreg||isdir) && ip->i_di.di_height <= 1) || + (isjdata && ip->i_di.di_height == 0)) { + if (!issys) + ip->i_di.di_goal_meta = ip->i_di.di_num.no_addr; + return 0; /* nothing to do */ + } + + osi_list_init(&blocks.list); + + error = get_inode_metablocks(sbp, ip, &blocks); + if (error) + goto out; + + /* The gfs2 height may be different. We need to rebuild the + metadata tree to the gfs2 height. 
*/ + gfs2_hgt = calc_gfs2_tree_height(ip, ip->i_di.di_size); + /* Save off the size because we're going to empty the contents + and add the data blocks back in later. */ + dinode_size = ip->i_di.di_size; + ip->i_di.di_size = 0ULL; + di_height = ip->i_di.di_height; + ip->i_di.di_height = 0; + + /* Now run through the block list a second time. If the block + is a data block, rewrite the data to the gfs2 offset. */ + osi_list_foreach_safe(tmp, &blocks.list, x) { + + blk = osi_list_entry(tmp, struct blocklist, list); + /* If it's not metadata that holds data block pointers + (i.e. metadata pointing to other metadata) */ + if (blk->height != di_height - 1) { + osi_list_del(tmp); + free(blk->ptrbuf); + free(blk); + continue; + } + if (isreg || isdir) /* more or less same way to deal with either */ + error = fix_ind_reg_or_dir(sbp, ip, di_height, + gfs2_hgt, blk, &blocks); + else if (isjdata) + error = fix_ind_jdata(sbp, ip, di_height, gfs2_hgt, + dinode_size, blk, &blocks); + if (error) + goto out; + + osi_list_del(tmp); + free(blk->ptrbuf); + free(blk); + } + + ip->i_di.di_size = dinode_size; + + /* Set the new dinode height, which may or may not have changed. 
*/ + /* The caller will take it from the ip and write it to the buffer */ + ip->i_di.di_height = gfs2_hgt; + return error; + +out: + while (!osi_list_empty(&blocks.list)) { + blk = osi_list_entry(tmp, struct blocklist, list); + osi_list_del(&blocks.list); + free(blk->ptrbuf); + free(blk); + } + return error; +} + +const char *cdpn[14] = {"{hostname}", "{mach}", "{os}", "{uid}", "{gid}", "{sys}", "{jid}", + "@hostname", "@mach", "@os", "@uid", "@gid", "@sys", "@jid"}; +static int has_cdpn(const char *str) +{ + int i; + for (i=0; i<14; i++) + if (strstr(str, cdpn[i]) != NULL) + return 1; + return 0; +} + +static int fix_cdpn_symlink(struct gfs2_sbd *sbp, struct gfs2_buffer_head *bh, struct gfs2_inode *ip) +{ + char *linkptr = NULL; + + if (ip->i_di.di_height != 0) + return 0; + + linkptr = bh->b_data + sizeof(struct gfs_dinode); + if (has_cdpn(linkptr)) { + struct inode_dir_block *fix; + /* Save the symlink di_addr. We'll find the parent di_addr later */ + fix = malloc(sizeof(struct inode_dir_block)); + if (!fix) { + log_crit(_("Error: out of memory.\n")); + return -1; + } + memset(fix, 0, sizeof(struct inode_dir_block)); + fix->di_addr = ip->i_di.di_num.no_addr; + osi_list_add_prev((osi_list_t *)&fix->list, + (osi_list_t *)&cdpns_to_fix); + } + + return 0; +} + +/* + * fix_xattr - + * Extended attributes can be either direct (in the ip->i_di.di_eattr block) or + * then can be at a maximum of 1 indirect level. Multiple levels of indirection + * are not supported. If the di_eattr block contains extended attribute data, + * i.e block type = GFS_METATYPE_EA, we ignore it. + * If the di_eattr block contains block pointers to extended attributes we need + * to fix the header. gfs1 uses gfs_indirect as the header which is 64 bytes + * bigger than gfs2_meta_header that gfs2 uses. 
+ */ +static int fix_xattr(struct gfs2_sbd *sbp, struct gfs2_buffer_head *bh, struct gfs2_inode *ip) +{ + int len, old_hdr_sz, new_hdr_sz; + struct gfs2_buffer_head *eabh; + char *buf; + + /* Read in the i_di.di_eattr block */ + eabh = bread(sbp, ip->i_di.di_eattr); + if (!gfs2_check_meta(eabh, GFS_METATYPE_IN)) {/* if it is an indirect block */ + len = sbp->bsize - sizeof(struct gfs_indirect); + buf = malloc(len); + if (!buf) { + /*FIXME: Same message as fix_cdpn_symlink */ + log_crit(_("Error: out of memory.\n")); + return -1; + } + old_hdr_sz = sizeof(struct gfs_indirect); + new_hdr_sz = sizeof(struct gfs2_meta_header); + memcpy(buf, eabh->b_data + old_hdr_sz, sbp->bsize - old_hdr_sz); + memset(eabh->b_data + new_hdr_sz, 0, sbp->bsize - new_hdr_sz); + memcpy(eabh->b_data + new_hdr_sz, buf, len); + free(buf); + bmodified(eabh); + } + brelse(eabh); + + return 0; +} + +/* ------------------------------------------------------------------------- */ +/* adjust_inode - change an inode from gfs1 to gfs2 */ +/* */ +/* Returns: 0 on success, -1 on failure */ +/* ------------------------------------------------------------------------- */ +static int adjust_inode(struct gfs2_sbd *sbp, struct gfs2_buffer_head *bh) +{ + struct gfs2_inode *inode; + struct inode_block *fixdir; + int inode_was_gfs1; + + inode = lgfs2_gfs_inode_get(sbp, bh); + if (inode == NULL) { + log_crit(_("Error reading inode: %s\n"), strerror(errno)); + return -1; + } + + inode_was_gfs1 = (inode->i_di.di_num.no_formal_ino == + inode->i_di.di_num.no_addr); + /* Fix the inode number: */ + inode->i_di.di_num.no_formal_ino = sbp->md.next_inum; + + /* Fix the inode type: gfs1 uses di_type, gfs2 uses di_mode. */ + inode->i_di.di_mode &= ~S_IFMT; + switch (inode->i_di.__pad1) { /* formerly di_type */ + case GFS_FILE_DIR: /* directory */ + inode->i_di.di_mode |= S_IFDIR; + /* Add this directory to the list of dirs to fix later. 
*/ + fixdir = malloc(sizeof(struct inode_block)); + if (!fixdir) { + /*FIXME: Same message as fix_cdpn_symlink */ + log_crit(_("Error: out of memory.\n")); + goto err_freei; + } + memset(fixdir, 0, sizeof(struct inode_block)); + fixdir->di_addr = inode->i_di.di_num.no_addr; + osi_list_add_prev((osi_list_t *)&fixdir->list, + (osi_list_t *)&dirs_to_fix); + break; + case GFS_FILE_REG: /* regular file */ + inode->i_di.di_mode |= S_IFREG; + break; + case GFS_FILE_LNK: /* symlink */ + inode->i_di.di_mode |= S_IFLNK; + break; + case GFS_FILE_BLK: /* block device */ + inode->i_di.di_mode |= S_IFBLK; + break; + case GFS_FILE_CHR: /* character device */ + inode->i_di.di_mode |= S_IFCHR; + break; + case GFS_FILE_FIFO: /* fifo / pipe */ + inode->i_di.di_mode |= S_IFIFO; + break; + case GFS_FILE_SOCK: /* socket */ + inode->i_di.di_mode |= S_IFSOCK; + break; + } + + /* ----------------------------------------------------------- */ + /* gfs2 inodes are slightly different from gfs1 inodes in that */ + /* di_goal_meta has shifted locations and di_goal_data has */ + /* changed from 32-bits to 64-bits. The following code */ + /* adjusts for the shift. */ + /* */ + /* Note: It may sound absurd, but we need to check if this */ + /* inode has already been converted to gfs2 or if it's */ + /* still a gfs1 inode. That's just in case there was a */ + /* prior attempt to run gfs2_convert that never finished */ + /* (due to power out, ctrl-c, kill, segfault, whatever.) */ + /* If it is unconverted gfs1 we want to do a full */ + /* conversion. If it's a gfs2 inode from a prior run, */ + /* we still need to renumber the inode, but here we */ + /* don't want to shift the data around. 
*/ + /* ----------------------------------------------------------- */ + if (inode_was_gfs1) { + struct gfs_dinode *gfs1_dinode_struct; + int ret = 0; + + gfs1_dinode_struct = (struct gfs_dinode *)&inode->i_di; + inode->i_di.di_goal_meta = inode->i_di.di_goal_data; + inode->i_di.di_goal_data = 0; /* make sure the upper 32b are 0 */ + inode->i_di.di_goal_data = gfs1_dinode_struct->di_goal_dblk; + inode->i_di.di_generation = 0; + + if (adjust_indirect_blocks(sbp, inode)) + goto err_freei; + /* Check for cdpns */ + if (S_ISLNK(inode->i_di.di_mode)) { + ret = fix_cdpn_symlink(sbp, bh, inode); + if (ret) + goto err_freei; + } + /* Check for extended attributes */ + if (inode->i_di.di_eattr) { + ret = fix_xattr(sbp, bh, inode); + if (ret) + goto err_freei; + } + } + + bmodified(inode->i_bh); + inode_put(&inode); /* does gfs2_dinode_out if modified */ + sbp->md.next_inum++; /* update inode count */ + return 0; +err_freei: + inode_put(&inode); + return -1; +} /* adjust_inode */ + +static int next_rg_meta(struct rgrp_tree *rgd, uint64_t *block, int first) +{ + struct gfs2_bitmap *bits = NULL; + uint32_t length = rgd->ri.ri_length; + uint32_t blk = (first)? 
0: (uint32_t)((*block + 1) - rgd->ri.ri_data0); + int i; + + if (!first && (*block < rgd->ri.ri_data0)) { + fprintf(stderr, "next_rg_meta: Start block is outside rgrp bounds.\n"); + exit(1); + } + for (i = 0; i < length; i++){ + bits = &rgd->bits[i]; + if (blk < bits->bi_len * GFS2_NBBY) + break; + blk -= bits->bi_len * GFS2_NBBY; + } + for (; i < length; i++){ + bits = &rgd->bits[i]; + blk = gfs2_bitfit((uint8_t *)bits->bi_bh->b_data + bits->bi_offset, + bits->bi_len, blk, GFS2_BLKST_DINODE); + if(blk != BFITNOENT){ + *block = blk + (bits->bi_start * GFS2_NBBY) + + rgd->ri.ri_data0; + break; + } + blk = 0; + } + if (i == length) + return -1; + return 0; +} + +static int next_rg_metatype(struct gfs2_sbd *sdp, struct rgrp_tree *rgd, + uint64_t *block, uint32_t type, int first) +{ + struct gfs2_buffer_head *bh = NULL; + + do{ + if (bh) + brelse(bh); + if (next_rg_meta(rgd, block, first)) + return -1; + bh = bread(sdp, *block); + first = 0; + } while(gfs2_check_meta(bh, type)); + brelse(bh); + return 0; +} + +/* ------------------------------------------------------------------------- */ +/* inode_renumber - renumber the inodes */ +/* */ +/* In gfs1, the inode number WAS the inode address. In gfs2, the inodes are */ +/* numbered sequentially. */ +/* */ +/* Returns: 0 on success, -1 on failure */ +/* ------------------------------------------------------------------------- */ +static int inode_renumber(struct gfs2_sbd *sbp, uint64_t root_inode_addr, osi_list_t *cdpn_to_fix) +{ + struct rgrp_tree *rgd; + struct osi_node *n, *next = NULL; + uint64_t block = 0; + struct gfs2_buffer_head *bh; + int first; + int error = 0; + int rgs_processed = 0; + + log_notice(_("Converting inodes.\n")); + sbp->md.next_inum = 1; /* starting inode numbering */ + gettimeofday(&tv, NULL); + seconds = tv.tv_sec; + + /* ---------------------------------------------------------------- */ + /* Traverse the resource groups to figure out where the inodes are. 
*/ + /* ---------------------------------------------------------------- */ + for (n = osi_first(&sbp->rgtree); n; n = next) { + next = osi_next(n); + rgd = (struct rgrp_tree *)n; + rgs_processed++; + first = 1; + while (1) { /* for all inodes in the resource group */ + gettimeofday(&tv, NULL); + /* Put out a warm, fuzzy message every second so the customer */ + /* doesn't think we hung. (This may take a long time). */ + if (tv.tv_sec - seconds) { + seconds = tv.tv_sec; + log_notice(_("\r%llu inodes from %d rgs converted."), + (unsigned long long)sbp->md.next_inum, + rgs_processed); + fflush(stdout); + } + /* Get the next metadata block. Break out if we reach the end. */ + /* We have to check all metadata blocks because the bitmap may */ + /* be "11" (used meta) for both inodes and indirect blocks. */ + /* We need to process the inodes and change the indirect blocks */ + /* to have a bitmap type of "01" (data). */ + if (next_rg_metatype(sbp, rgd, &block, 0, first)) + break; + /* If this is the root inode block, remember it for later: */ + if (block == root_inode_addr) { + sbp->sd_sb.sb_root_dir.no_addr = block; + sbp->sd_sb.sb_root_dir.no_formal_ino = sbp->md.next_inum; + } + bh = bread(sbp, block); + if (!gfs2_check_meta(bh, GFS_METATYPE_DI)) {/* if it is an dinode */ + /* Skip the rindex and jindex inodes for now. */ + if (block != rindex_addr && block != jindex_addr) { + error = adjust_inode(sbp, bh); + if (error) + return error; + } + } else { /* It's metadata, but not an inode, so fix the bitmap. */ + int blk, buf_offset; + int bitmap_byte; /* byte within the bitmap to fix */ + int byte_bit; /* bit within the byte */ + + /* Figure out the absolute bitmap byte we need to fix. */ + /* ignoring structure offsets and bitmap blocks for now. 
*/ + bitmap_byte = (block - rgd->ri.ri_data0) / GFS2_NBBY; + byte_bit = (block - rgd->ri.ri_data0) % GFS2_NBBY; + /* Now figure out which bitmap block the byte is on */ + for (blk = 0; blk < rgd->ri.ri_length; blk++) { + struct gfs2_bitmap *bi = &rgd->bits[blk]; + /* figure out offset of first bitmap byte for this map: */ + buf_offset = (blk) ? sizeof(struct gfs2_meta_header) : + sizeof(struct gfs2_rgrp); + /* if it's on this page */ + if (buf_offset + bitmap_byte < sbp->bsize) { + bi->bi_bh->b_data[buf_offset + bitmap_byte] &= + ~(0x03 << (GFS2_BIT_SIZE * byte_bit)); + bi->bi_bh->b_data[buf_offset + bitmap_byte] |= + (0x01 << (GFS2_BIT_SIZE * byte_bit)); + bmodified(bi->bi_bh); + break; + } + bitmap_byte -= (sbp->bsize - buf_offset); + } + } + brelse(bh); + first = 0; + } /* while 1 */ + } /* for all rgs */ + log_notice(_("\r%llu inodes from %d rgs converted."), + (unsigned long long)sbp->md.next_inum, rgs_processed); + fflush(stdout); + return 0; +}/* inode_renumber */ + +/* ------------------------------------------------------------------------- */ +/* fetch_inum - fetch an inum entry from disk, given its block */ +/* ------------------------------------------------------------------------- */ +static int fetch_inum(struct gfs2_sbd *sbp, uint64_t iblock, + struct gfs2_inum *inum, uint64_t *eablk) +{ + struct gfs2_inode *fix_inode; + + fix_inode = lgfs2_inode_read(sbp, iblock); + if (fix_inode == NULL) + return 1; + inum->no_formal_ino = fix_inode->i_di.di_num.no_formal_ino; + inum->no_addr = fix_inode->i_di.di_num.no_addr; + if (eablk) + *eablk = fix_inode->i_di.di_eattr; + + inode_put(&fix_inode); + return 0; +}/* fetch_inum */ + +/* ------------------------------------------------------------------------- */ +/* process_dirent_info - fix one dirent (directory entry) buffer */ +/* */ +/* We changed inode numbers, so we must update that number into the */ +/* directory entries themselves. 
*/ +/* */ +/* Returns: 0 on success, -1 on failure, -EISDIR when dentmod marked DT_DIR */ +/* ------------------------------------------------------------------------- */ +static int process_dirent_info(struct gfs2_inode *dip, struct gfs2_sbd *sbp, + struct gfs2_buffer_head *bh, int dir_entries, uint64_t dentmod) +{ + int error = 0; + struct gfs2_dirent *dent; + int de; /* directory entry index */ + + error = gfs2_dirent_first(dip, bh, &dent); + if (error != IS_LEAF && error != IS_DINODE) { + log_crit(_("Error retrieving directory.\n")); + return -1; + } + error = 0; + /* Go through every dirent in the buffer and process it. */ + /* Turns out you can't trust dir_entries is correct. */ + for (de = 0; ; de++) { + struct gfs2_inum inum; + int dent_was_gfs1; + + if (dentmod) { + if (dent->de_type == cpu_to_be16(DT_LNK) + && cpu_to_be64(dent->de_inum.no_addr) == dentmod) { + dent->de_type = cpu_to_be16(DT_DIR); + error = -EISDIR; + break; + } + goto skip_next; + } + + gettimeofday(&tv, NULL); + /* Do more warm fuzzy stuff for the customer. */ + dirents_fixed++; + if (tv.tv_sec - seconds) { + seconds = tv.tv_sec; + log_notice(_("\r%llu directories, %llu dirents fixed."), + (unsigned long long)dirs_fixed, + (unsigned long long)dirents_fixed); + fflush(stdout); + } + /* fix the dirent's inode number based on the inode */ + gfs2_inum_in(&inum, (char *)&dent->de_inum); + dent_was_gfs1 = (dent->de_inum.no_addr == dent->de_inum.no_formal_ino); + if (inum.no_formal_ino) { /* if not a sentinel (placeholder) */ + error = fetch_inum(sbp, inum.no_addr, &inum, NULL); + if (error) { + log_crit(_("Error retrieving inode 0x%llx\n"), + (unsigned long long)inum.no_addr); + break; + } + /* fix the dirent's inode number from the fetched inum. 
*/ + dent->de_inum.no_formal_ino = cpu_to_be64(inum.no_formal_ino); + } + /* Fix the dirent's filename hash: They are the same as gfs1 */ + /* dent->de_hash = cpu_to_be32(gfs2_disk_hash((char *)(dent + 1), */ + /* be16_to_cpu(dent->de_name_len))); */ + /* Fix the dirent's file type. Gfs1 used home-grown values. */ + /* Gfs2 uses standard values from include/linux/fs.h */ + /* Only do this if the dent was a true gfs1 dent, and not a */ + /* gfs2 dent converted from a previously aborted run. */ + if (dent_was_gfs1) { + switch be16_to_cpu(dent->de_type) { + case GFS_FILE_NON: + dent->de_type = cpu_to_be16(DT_UNKNOWN); + break; + case GFS_FILE_REG: /* regular file */ + dent->de_type = cpu_to_be16(DT_REG); + break; + case GFS_FILE_DIR: /* directory */ + dent->de_type = cpu_to_be16(DT_DIR); + break; + case GFS_FILE_LNK: /* link */ + dent->de_type = cpu_to_be16(DT_LNK); + break; + case GFS_FILE_BLK: /* block device node */ + dent->de_type = cpu_to_be16(DT_BLK); + break; + case GFS_FILE_CHR: /* character device node */ + dent->de_type = cpu_to_be16(DT_CHR); + break; + case GFS_FILE_FIFO: /* fifo/pipe */ + dent->de_type = cpu_to_be16(DT_FIFO); + break; + case GFS_FILE_SOCK: /* socket */ + dent->de_type = cpu_to_be16(DT_SOCK); + break; + } + } + /* + * Compare this dirent address with every one in the + * cdpns_to_fix list to find if this directory (dip) is + * a cdpn symlink's parent. 
If so add it to the list element + */ + if (dent->de_type == cpu_to_be16(DT_LNK)) { + osi_list_t *tmp; + struct inode_dir_block *fix; + osi_list_foreach(tmp, &cdpns_to_fix.list) { + fix = osi_list_entry(tmp, struct inode_dir_block, list); + if (fix->di_addr == inum.no_addr) + fix->di_paddr = dip->i_di.di_num.no_addr; + } + } + + skip_next: + error = gfs2_dirent_next(dip, bh, &dent); + if (error) { + if (error == -ENOENT) /* beyond the end of this bh */ + error = 0; + break; + } + } /* for every directory entry */ + return error; +}/* process_dirent_info */ + +/* ------------------------------------------------------------------------- */ +/* fix_one_directory_exhash - fix one directory's inode numbers. */ +/* */ +/* This is for exhash directories, where the inode has a list of "leaf" */ +/* blocks, each of which is a buffer full of dirents that must be processed. */ +/* */ +/* Returns: 0 on success, -1 on failure */ +/* ------------------------------------------------------------------------- */ +static int fix_one_directory_exhash(struct gfs2_sbd *sbp, struct gfs2_inode *dip, uint64_t dentmod) +{ + struct gfs2_buffer_head *bh_leaf; + int error; + uint64_t leaf_block, prev_leaf_block; + uint32_t leaf_num; + + prev_leaf_block = 0; + /* for all the leafs, get the leaf block and process the dirents inside */ + for (leaf_num = 0; ; leaf_num++) { + uint64_t buf; + struct gfs2_leaf leaf; + + error = gfs2_readi(dip, (char *)&buf, leaf_num * sizeof(uint64_t), + sizeof(uint64_t)); + if (!error) /* end of file */ + return 0; /* success */ + else if (error != sizeof(uint64_t)) { + log_crit(_("fix_one_directory_exhash: error reading directory.\n")); + return -1; + } + else { + leaf_block = be64_to_cpu(buf); + error = 0; + } + leaf_chain: + /* leaf blocks may be repeated, so skip the duplicates: */ + if (leaf_block == prev_leaf_block) /* same block? 
*/ + continue; /* already converted */ + + prev_leaf_block = leaf_block; + /* read the leaf buffer in */ + error = gfs2_get_leaf(dip, leaf_block, &bh_leaf); + if (error) { + log_crit(_("Error reading leaf %llx\n"), + (unsigned long long)leaf_block); + break; + } + gfs2_leaf_in(&leaf, bh_leaf->b_data); + error = process_dirent_info(dip, sbp, bh_leaf, leaf.lf_entries, dentmod); + bmodified(bh_leaf); + brelse(bh_leaf); + if (dentmod && error == -EISDIR) /* dentmod was marked DT_DIR, break out */ + break; + if (leaf.lf_next) { /* leaf has a leaf chain, process leaves in chain */ + leaf_block = leaf.lf_next; + error = 0; + goto leaf_chain; + } + } /* for leaf_num */ + return 0; +}/* fix_one_directory_exhash */ + +static int process_directory(struct gfs2_sbd *sbp, uint64_t dirblock, uint64_t dentmod) +{ + struct gfs2_inode *dip; + int error = 0; + /* read in the directory inode */ + dip = lgfs2_inode_read(sbp, dirblock); + if (dip == NULL) + return -1; + /* fix the directory: either exhash (leaves) or linear (stuffed) */ + if (dip->i_di.di_flags & GFS2_DIF_EXHASH) { + if (fix_one_directory_exhash(sbp, dip, dentmod)) { + log_crit(_("Error fixing exhash directory.\n")); + inode_put(&dip); + return -1; + } + } else { + error = process_dirent_info(dip, sbp, dip->i_bh, dip->i_di.di_entries, dentmod); + if (error && error != -EISDIR) { + log_crit(_("Error fixing linear directory.\n")); + inode_put(&dip); + return -1; + } + } + bmodified(dip->i_bh); + inode_put(&dip); + return 0; +} +/* ------------------------------------------------------------------------- */ +/* fix_directory_info - sync new inode numbers with directory info */ +/* Returns: 0 on success, -1 on failure */ +/* ------------------------------------------------------------------------- */ +static int fix_directory_info(struct gfs2_sbd *sbp, osi_list_t *dir_to_fix) +{ + osi_list_t *tmp, *fix; + struct inode_block *dir_iblk; + uint64_t dirblock; + uint32_t gfs1_inptrs = sbp->sd_inptrs; + /* Directory inodes have 
been converted to gfs2, use gfs2 inptrs */ + sbp->sd_inptrs = (sbp->bsize - sizeof(struct gfs2_meta_header)) + / sizeof(uint64_t); + + dirs_fixed = 0; + dirents_fixed = 0; + gettimeofday(&tv, NULL); + seconds = tv.tv_sec; + log_notice(_("\nFixing file and directory information.\n")); + fflush(stdout); + tmp = NULL; + /* for every directory in the list */ + for (fix = dir_to_fix->next; fix != dir_to_fix; fix = fix->next) { + if (tmp) { + osi_list_del(tmp); + free(tmp); + } + tmp = fix; /* remember the addr to free next time */ + dirs_fixed++; + /* figure out the directory inode block and read it in */ + dir_iblk = (struct inode_block *)fix; + dirblock = dir_iblk->di_addr; /* addr of dir inode */ + if (process_directory(sbp, dirblock, 0)) { + log_crit(_("Error processing directory\n")); + return -1; + } + } + /* Free the last entry in memory: */ + if (tmp) { + osi_list_del(tmp); + free(tmp); + } + sbp->sd_inptrs = gfs1_inptrs; + return 0; +}/* fix_directory_info */ + +/* ------------------------------------------------------------------------- */ +/* fix_cdpn_symlinks - convert cdpn symlinks to empty directories */ +/* Returns: 0 on success, -1 on failure */ +/* ------------------------------------------------------------------------- */ +static int fix_cdpn_symlinks(struct gfs2_sbd *sbp, osi_list_t *cdpn_to_fix) +{ + osi_list_t *tmp, *x; + int error = 0; + + cdpns_fixed = 0; + osi_list_foreach_safe(tmp, cdpn_to_fix, x) { + struct gfs2_inum fix, dir; + struct inode_dir_block *l_fix; + struct gfs2_buffer_head *bh = NULL; + struct gfs2_inode *fix_inode; + uint64_t eablk; + + l_fix = osi_list_entry(tmp, struct inode_dir_block, list); + osi_list_del(tmp); + + /* convert symlink to empty dir */ + error = fetch_inum(sbp, l_fix->di_addr, &fix, &eablk); + if (error) { + log_crit(_("Error retrieving inode at block %llx\n"), + (unsigned long long)l_fix->di_addr); + break; + } + error = fetch_inum(sbp, l_fix->di_paddr, &dir, NULL); + if (error) { + log_crit(_("Error retrieving 
inode at block %llx\n"), + (unsigned long long)l_fix->di_paddr); + break; + } + + /* initialize the symlink inode to be a directory */ + error = init_dinode(sbp, &bh, &fix, S_IFDIR | 0755, 0, &dir); + if (error != 0) + return -1; + + fix_inode = lgfs2_inode_get(sbp, bh); + if (fix_inode == NULL) + return -1; + fix_inode->i_di.di_eattr = eablk; /*fix extended attribute */ + inode_put(&fix_inode); + bmodified(bh); + brelse(bh); + + /* fix the parent directory dirent entry for this inode */ + error = process_directory(sbp, l_fix->di_paddr, l_fix->di_addr); + if (error) { + log_crit(_("Error trying to fix cdpn dentry\n")); + break; + } + free(l_fix); + cdpns_fixed++; + } + return error; +} /* fix_cdpn_symlinks */ + +/* ------------------------------------------------------------------------- */ +/* Fetch gfs1 jindex structure from buffer */ +/* ------------------------------------------------------------------------- */ +static void gfs1_jindex_in(struct gfs_jindex *jindex, char *buf) +{ + struct gfs_jindex *str = (struct gfs_jindex *)buf; + + jindex->ji_addr = be64_to_cpu(str->ji_addr); + jindex->ji_nsegment = be32_to_cpu(str->ji_nsegment); + memset(jindex->ji_reserved, 0, 64); +} + +/* ------------------------------------------------------------------------- */ +/* read_gfs1_jiindex - read the gfs1 jindex file. 
*/ +/* Returns: 0 on success, -1 on failure */ +/* ------------------------------------------------------------------------- */ +static int read_gfs1_jiindex(struct gfs2_sbd *sdp) +{ + struct gfs2_inode *ip = sdp->md.jiinode; + char buf[sizeof(struct gfs_jindex)]; + unsigned int j; + int error=0; + unsigned int tmp_mode = 0; + + if(ip->i_di.di_size % sizeof(struct gfs_jindex) != 0){ + log_crit(_("The size reported in the journal index" + " inode is not a\n" + "\tmultiple of the size of a journal index.\n")); + return -1; + } + if(!(sd_jindex = (struct gfs_jindex *)malloc(ip->i_di.di_size))) { + log_crit(_("Unable to allocate journal index\n")); + return -1; + } + if(!memset(sd_jindex, 0, ip->i_di.di_size)) { + log_crit(_("Unable to zero journal index\n")); + return -1; + } + /* ugly hack + * Faking the gfs1_jindex inode as a directory to gfs2_readi + * so it skips the metaheader struct in the data blocks + * in the inode. gfs2_jindex inode doesn't have metaheaders + * in the data blocks */ + tmp_mode = ip->i_di.di_mode; + ip->i_di.di_mode &= ~S_IFMT; + ip->i_di.di_mode |= S_IFDIR; + for (j = 0; ; j++) { + struct gfs_jindex *journ; + + error = gfs2_readi(ip, buf, j * sizeof(struct gfs_jindex), + sizeof(struct gfs_jindex)); + if(!error) + break; + if (error != sizeof(struct gfs_jindex)){ + log_crit(_("An error occurred while reading the" + " journal index file.\n")); + goto fail; + } + journ = sd_jindex + j; + gfs1_jindex_in(journ, buf); + sdp->jsize = (journ->ji_nsegment * 16 * sdp->bsize) >> 20; + } + ip->i_di.di_mode = tmp_mode; + if(j * sizeof(struct gfs_jindex) != ip->i_di.di_size){ + log_crit(_("journal inode size invalid\n")); + goto fail; + } + sdp->md.journals = orig_journals = j; + return 0; + + fail: + free(sd_jindex); + return -1; +} + +static int sanity_check(struct gfs2_sbd *sdp) +{ + int error = 0; + if (!raw_gfs1_ondisk_sb.sb_quota_di.no_addr) { + log_crit(_("Error: Superblock Quota inode address is NULL\n")); + error = 1; + } + if 
(!raw_gfs1_ondisk_sb.sb_license_di.no_addr) { + log_crit(_("Error: Superblock Statfs inode address is NULL\n")); + error = 1; + } + if (!raw_gfs1_ondisk_sb.sb_seg_size) { + log_crit(_("Error: Superblock segment size is zero\n")); + error = 1; + } + return error; +} + +/* ------------------------------------------------------------------------- */ +/* init - initialization code */ +/* Returns: 0 on success, -1 on failure */ +/* ------------------------------------------------------------------------- */ +static int init(struct gfs2_sbd *sbp, struct gfs2_options *opts) +{ + struct gfs2_buffer_head *bh; + int rgcount; + struct gfs2_inum inum; + + memset(sbp, 0, sizeof(struct gfs2_sbd)); + if ((sbp->device_fd = open(opts->device, O_RDWR)) < 0) { + perror(opts->device); + exit(-1); + } + /* --------------------------------- */ + /* initialize the incore superblock */ + /* --------------------------------- */ + sbp->sd_sb.sb_header.mh_magic = GFS2_MAGIC; + sbp->sd_sb.sb_header.mh_type = GFS2_METATYPE_SB; + sbp->sd_sb.sb_header.mh_format = GFS2_FORMAT_SB; + + osi_list_init((osi_list_t *)&dirs_to_fix); + osi_list_init((osi_list_t *)&cdpns_to_fix); + /* ---------------------------------------------- */ + /* Initialize lists and read in the superblock. 
*/ + /* ---------------------------------------------- */ + sbp->jsize = GFS2_DEFAULT_JSIZE; + sbp->rgsize = GFS2_DEFAULT_RGSIZE; + sbp->qcsize = GFS2_DEFAULT_QCSIZE; + sbp->time = time(NULL); + sbp->blks_total = 0; /* total blocks - total them up later */ + sbp->blks_alloced = 0; /* blocks allocated - total them up later */ + sbp->dinodes_alloced = 0; /* dinodes allocated - total them up later */ + sbp->sd_sb.sb_bsize = GFS2_DEFAULT_BSIZE; + sbp->bsize = sbp->sd_sb.sb_bsize; + sbp->rgtree.osi_node = NULL; + if (compute_constants(sbp)) { + log_crit("%s\n", _("Failed to compute file system constants")); + exit(-1); + } + + bh = bread(sbp, GFS2_SB_ADDR >> sbp->sd_fsb2bb_shift); + memcpy(&raw_gfs1_ondisk_sb, (struct gfs_sb *)bh->b_data, + sizeof(struct gfs_sb)); + gfs2_sb_in(&sbp->sd_sb, bh->b_data); + + jindex_addr = be64_to_cpu(raw_gfs1_ondisk_sb.sb_jindex_di.no_addr); + rindex_addr = be64_to_cpu(raw_gfs1_ondisk_sb.sb_rindex_di.no_addr); + + sbp->bsize = sbp->sd_sb.sb_bsize; + sbp->fssize = lseek(sbp->device_fd, 0, SEEK_END) / sbp->sd_sb.sb_bsize; + sbp->sd_inptrs = (sbp->bsize - sizeof(struct gfs_indirect)) / + sizeof(uint64_t); + sbp->sd_diptrs = (sbp->bsize - sizeof(struct gfs_dinode)) / + sizeof(uint64_t); + sbp->sd_jbsize = sbp->bsize - sizeof(struct gfs2_meta_header); + brelse(bh); + if (compute_heightsize(sbp->bsize, sbp->sd_heightsize, &sbp->sd_max_height, + sbp->bsize, sbp->sd_diptrs, sbp->sd_inptrs)) { + log_crit("%s\n", _("Failed to compute file system constants")); + exit(-1); + } + + if (compute_heightsize(sbp->bsize, sbp->sd_jheightsize, &sbp->sd_max_jheight, + sbp->sd_jbsize, sbp->sd_diptrs, sbp->sd_inptrs)) { + log_crit("%s\n", _("Failed to compute file system constants")); + exit(-1); + } + /* -------------------------------------------------------- */ + /* Our constants are for gfs1. Need some for gfs2 as well. 
*/ + /* -------------------------------------------------------- */ + gfs2_inptrs = (sbp->bsize - sizeof(struct gfs2_meta_header)) / + sizeof(uint64_t); /* How many ptrs can we fit on a block? */ + memset(gfs2_heightsize, 0, sizeof(gfs2_heightsize)); + if (compute_heightsize(sbp->bsize, gfs2_heightsize, &gfs2_max_height, + sbp->bsize, sbp->sd_diptrs, gfs2_inptrs)) { + log_crit("%s\n", _("Failed to compute file system constants")); + exit(-1); + } + memset(gfs2_jheightsize, 0, sizeof(gfs2_jheightsize)); + if (compute_heightsize(sbp->bsize, gfs2_jheightsize, &gfs2_max_jheight, + sbp->sd_jbsize, sbp->sd_diptrs, gfs2_inptrs)) { + log_crit("%s\n", _("Failed to compute file system constants")); + exit(-1); + } + + /* ---------------------------------------------- */ + /* Make sure we're really gfs1 */ + /* ---------------------------------------------- */ + if (sbp->sd_sb.sb_fs_format != GFS_FORMAT_FS || + sbp->sd_sb.sb_header.mh_type != GFS_METATYPE_SB || + sbp->sd_sb.sb_header.mh_format != GFS_FORMAT_SB || + sbp->sd_sb.sb_multihost_format != GFS_FORMAT_MULTI) { + log_crit(_("Error: %s does not look like a gfs1 filesystem.\n"), opts->device); + close(sbp->device_fd); + exit(-1); + } + /* get gfs1 rindex inode - gfs1's rindex inode ptr became __pad2 */ + gfs2_inum_in(&inum, (char *)&raw_gfs1_ondisk_sb.sb_rindex_di); + sbp->md.riinode = lgfs2_gfs_inode_read(sbp, inum.no_addr); + if (sbp->md.riinode == NULL) { + log_crit(_("Could not read resource group index: %s\n"), strerror(errno)); + exit(-1); + } + /* get gfs1 jindex inode - gfs1's journal index inode ptr became master */ + gfs2_inum_in(&inum, (char *)&raw_gfs1_ondisk_sb.sb_jindex_di); + sbp->md.jiinode = lgfs2_inode_read(sbp, inum.no_addr); + if (sbp->md.jiinode == NULL) { + log_crit(_("Could not read journal index: %s\n"), strerror(errno)); + exit(-1); + } + /* read in the journal index data */ + read_gfs1_jiindex(sbp); + /* read in the resource group index data: */ + + /* We've got a slight dilemma here. 
In gfs1, we used to have a meta */
+	/* header in front of the rgindex pages. In gfs2, we don't. That's */
+	/* apparently only for directories. So we need to fake out libgfs2 */
+	/* so that it adjusts for the metaheader by faking out the inode to */
+	/* look like a directory, temporarily. */
+	sbp->md.riinode->i_di.di_mode &= ~S_IFMT;
+	sbp->md.riinode->i_di.di_mode |= S_IFDIR;
+	printf(_("Examining file system"));
+	if (gfs1_ri_update(sbp, 0, &rgcount, 0)){
+		log_crit(_("Unable to fill in resource group information.\n"));
+		return -1; /* the one failure in init() that returns instead of exiting */
+	}
+	printf("\n");
+	fflush(stdout);
+	inode_put(&sbp->md.riinode); /* both gfs1 index inodes are released before returning */
+	inode_put(&sbp->md.jiinode);
+	log_debug(_("%d rgs found.\n"), rgcount);
+	return 0;
+}/* init */
+
+/* ------------------------------------------------------------------------- */
+/* give_warning - give the all-important warning message. */
+/* ------------------------------------------------------------------------- */
+static void give_warning(void)
+{
+	printf(_("This program will convert a gfs1 filesystem to a " \
+		 "gfs2 filesystem.\n"));
+	printf(_("WARNING: This can't be undone. It is strongly advised " \
+		 "that you:\n\n"));
+	printf(_(" 1. Back up your entire filesystem first.\n"));
+	printf(_(" 2. Run fsck.gfs2 first to ensure filesystem integrity.\n"));
+	printf(_(" 3. Make sure the filesystem is NOT mounted from any node.\n"));
+	printf(_(" 4.
Make sure you have the latest software versions.\n")); +}/* give_warning */ + +/* ------------------------------------------------------------------------- */ +/* version - print version information */ +/* ------------------------------------------------------------------------- */ +static void version(void) +{ + log_notice(_("gfs2_convert version %s (built %s %s)\n"), VERSION, + __DATE__, __TIME__); + log_notice("%s\n\n", REDHAT_COPYRIGHT); +} + +/* ------------------------------------------------------------------------- */ +/* usage - print usage information */ +/* ------------------------------------------------------------------------- */ +static void usage(const char *name) +{ + give_warning(); + printf(_("\nUsage:\n")); + printf(_("%s [-hnqvVy] \n\n"), name); + printf("Flags:\n"); + printf(_("\th - print this help message\n")); + printf(_("\tn - assume 'no' to all questions\n")); + printf(_("\tq - quieter output\n")); + printf(_("\tv - more verbose output\n")); + printf(_("\tV - print version information\n")); + printf(_("\ty - assume 'yes' to all questions\n")); +}/* usage */ + +/* ------------------------------------------------------------------------- */ +/* process_parameters */ +/* ------------------------------------------------------------------------- */ +static void process_parameters(int argc, char **argv, struct gfs2_options *opts) + +{ + int c; + + opts->yes = 0; + opts->no = 0; + if (argc == 1) { + usage(argv[0]); + exit(0); + } + while((c = getopt(argc, argv, "hnqvyV")) != -1) { + switch(c) { + + case 'h': + usage(argv[0]); + exit(0); + break; + case 'n': + opts->no = 1; + break; + case 'q': + decrease_verbosity(); + break; + case 'v': + increase_verbosity(); + break; + case 'V': + exit(0); + case 'y': + opts->yes = 1; + break; + default: + fprintf(stderr,_("Parameter not understood: %c\n"), c); + usage(argv[0]); + exit(0); + } + } + if(argc > optind) { + opts->device = argv[optind]; + } else { + fprintf(stderr, _("No device specified. 
Please use '-h' for help\n")); + exit(1); + } +} /* process_parameters */ + +/* ------------------------------------------------------------------------- */ +/* rgrp_length - Calculate the length of a resource group */ +/* @size: The total size of the resource group */ +/* ------------------------------------------------------------------------- */ +static uint64_t rgrp_length(uint64_t size, struct gfs2_sbd *sdp) +{ + uint64_t bitbytes = RGRP_BITMAP_BLKS(&sdp->sd_sb) + 1; + uint64_t stuff = RGRP_STUFFED_BLKS(&sdp->sd_sb) + 1; + uint64_t blocks = 1; + + if (size >= stuff) { + size -= stuff; + while (size > bitbytes) { + blocks++; + size -= bitbytes; + } + if (size) + blocks++; + } + return blocks; +}/* rgrp_length */ + +/* ------------------------------------------------------------------------- */ +/* journ_space_to_rg - convert gfs1 journal space to gfs2 rg space. */ +/* */ +/* In gfs1, the journals were kept separate from the files and directories. */ +/* They had a dedicated section of the fs carved out for them. */ +/* In gfs2, the journals are just files like any other, (but still hidden). */ +/* Therefore, the old journal space has to be converted to normal resource */ +/* group space. 
*/
+/* */
+/* Returns: 0 on success, -1 on failure (only when no resource group lies
+ * below a journal's address; a bitmap conversion error exits instead) */
+/* ------------------------------------------------------------------------- */
+static int journ_space_to_rg(struct gfs2_sbd *sdp)
+{
+	int error = 0; /* never assigned again, so the final return is always 0 */
+	int j, x;
+	struct gfs_jindex *jndx;
+	struct rgrp_tree *rgd, *rgdhigh;
+	struct osi_node *n, *next = NULL;
+	struct gfs2_meta_header mh; /* header written to every bitmap block after the first */
+	uint64_t ri_addr;
+
+	mh.mh_magic = GFS2_MAGIC;
+	mh.mh_type = GFS2_METATYPE_RB;
+	mh.mh_format = GFS2_FORMAT_RB;
+	log_notice(_("Converting journal space to rg space.\n"));
+	/* Go through each journal, converting them one by one */
+	for (j = 0; j < orig_journals; j++) { /* for each journal */
+		uint64_t size;
+
+		jndx = &sd_jindex[j];
+		/* go through all rg index entries, keeping track of the
+		   highest that's still in the first subdevice.
+		   Note: we really should go through all of the rgindex because
+		   we might have had rg's added by gfs_grow, and journals added
+		   by jadd. gfs_grow adds rgs out of order, so we can't count
+		   on them being in ascending order. */
+		rgdhigh = NULL; /* used only for the check and the log message below */
+		for (n = osi_first(&sdp->rgtree); n; n = next) {
+			next = osi_next(n);
+			rgd = (struct rgrp_tree *)n;
+			if (rgd->ri.ri_addr < jndx->ji_addr &&
+			    ((rgdhigh == NULL) ||
+			     (rgd->ri.ri_addr > rgdhigh->ri.ri_addr)))
+				rgdhigh = rgd;
+		} /* for each rg */
+		if (!rgdhigh) { /* if we somehow didn't find one. */
+			log_crit(_("Error: No suitable rg found for journal.\n"));
+			return -1;
+		}
+		log_info(_("Addr 0x%llx comes after rg at addr 0x%llx\n"),
+			 (unsigned long long)jndx->ji_addr,
+			 (unsigned long long)rgdhigh->ri.ri_addr);
+		ri_addr = jndx->ji_addr; /* the new rg starts where the journal started */
+		/* Allocate a new rgd entry which includes rg and ri.
*/ + rgd = rgrp_insert(&sdp->rgtree, ri_addr); + /* convert the gfs1 rgrp into a new gfs2 rgrp */ + size = jndx->ji_nsegment * + be32_to_cpu(raw_gfs1_ondisk_sb.sb_seg_size); + rgd->rg.rg_header.mh_magic = GFS2_MAGIC; + rgd->rg.rg_header.mh_type = GFS2_METATYPE_RG; + rgd->rg.rg_header.mh_format = GFS2_FORMAT_RG; + rgd->rg.rg_flags = 0; + rgd->rg.rg_dinodes = 0; + + rgd->ri.ri_addr = jndx->ji_addr; /* new rg addr becomes ji addr */ + rgd->ri.ri_length = rgrp_length(size, sdp); /* aka bitblocks */ + + rgd->ri.ri_data0 = jndx->ji_addr + rgd->ri.ri_length; + rgd->ri.ri_data = size - rgd->ri.ri_length; + /* Round down to nearest multiple of GFS2_NBBY */ + while (rgd->ri.ri_data & 0x03) + rgd->ri.ri_data--; + sdp->blks_total += rgd->ri.ri_data; /* For statfs file update */ + rgd->rg.rg_free = rgd->ri.ri_data; + rgd->ri.ri_bitbytes = rgd->ri.ri_data / GFS2_NBBY; + + if (gfs2_compute_bitstructs(sdp->sd_sb.sb_bsize, rgd)) { + log_crit(_("gfs2_convert: Error converting bitmaps.\n")); + exit(-1); + } + + for (x = 0; x < rgd->ri.ri_length; x++) + rgd->bits[x].bi_bh = bget(sdp, rgd->ri.ri_addr + x); + + convert_bitmaps(sdp, rgd); + for (x = 0; x < rgd->ri.ri_length; x++) { + if (x) + gfs2_meta_header_out(&mh, rgd->bits[x].bi_bh->b_data); + else + gfs2_rgrp_out(&rgd->rg, rgd->bits[x].bi_bh->b_data); + bmodified(rgd->bits[x].bi_bh); + } + } /* for each journal */ + return error; +}/* journ_space_to_rg */ + +/* ------------------------------------------------------------------------- */ +/* update_inode_file - update the inode file with the new next_inum */ +/* ------------------------------------------------------------------------- */ +static void update_inode_file(struct gfs2_sbd *sdp) +{ + struct gfs2_inode *ip = sdp->md.inum; + uint64_t buf; + int count; + + buf = cpu_to_be64(sdp->md.next_inum); + count = gfs2_writei(ip, &buf, 0, sizeof(uint64_t)); + if (count != sizeof(uint64_t)) { + fprintf(stderr, "update_inode_file\n"); + exit(1); + } + + log_debug(_("\nNext Inum: 
%llu\n"), (unsigned long long)sdp->md.next_inum); +}/* update_inode_file */ + +/* ------------------------------------------------------------------------- */ +/* write_statfs_file - write the statfs file */ +/* ------------------------------------------------------------------------- */ +static void write_statfs_file(struct gfs2_sbd *sdp) +{ + struct gfs2_inode *ip = sdp->md.statfs; + struct gfs2_statfs_change sc; + char buf[sizeof(struct gfs2_statfs_change)]; + int count; + + sc.sc_total = sdp->blks_total; + sc.sc_free = sdp->blks_total - sdp->blks_alloced; + sc.sc_dinodes = sdp->dinodes_alloced; + + gfs2_statfs_change_out(&sc, buf); + count = gfs2_writei(ip, buf, 0, sizeof(struct gfs2_statfs_change)); + if (count != sizeof(struct gfs2_statfs_change)) { + fprintf(stderr, "do_init (2)\n"); + exit(1); + } +}/* write_statfs_file */ + +/* ------------------------------------------------------------------------- */ +/* remove_obsolete_gfs1 - remove obsolete gfs1 inodes. */ +/* ------------------------------------------------------------------------- */ +static void remove_obsolete_gfs1(struct gfs2_sbd *sbp) +{ + struct gfs2_inum inum; + + log_notice(_("Removing obsolete GFS1 file system structures.\n")); + fflush(stdout); + /* Delete the old gfs1 Journal index: */ + gfs2_inum_in(&inum, (char *)&raw_gfs1_ondisk_sb.sb_jindex_di); + gfs2_freedi(sbp, inum.no_addr); + + /* Delete the old gfs1 rgindex: */ + gfs2_inum_in(&inum, (char *)&raw_gfs1_ondisk_sb.sb_rindex_di); + gfs2_freedi(sbp, inum.no_addr); + + /* Delete the old gfs1 Quota file: */ + gfs2_inum_in(&inum, (char *)&raw_gfs1_ondisk_sb.sb_quota_di); + gfs2_freedi(sbp, inum.no_addr); + + /* Delete the old gfs1 License file: */ + gfs2_inum_in(&inum, (char *)&raw_gfs1_ondisk_sb.sb_license_di); + gfs2_freedi(sbp, inum.no_addr); +} + +/* ------------------------------------------------------------------------- */ +/* lifted from libgfs2/structures.c */ +/* 
------------------------------------------------------------------------- */
+/* conv_build_jindex - create the gfs2 "jindex" directory in the master
+ * directory and one "journal<N>" file per entry in sdp->md.journals.
+ * Returns: 0 on success, otherwise an errno value */
+static int conv_build_jindex(struct gfs2_sbd *sdp)
+{
+	unsigned int j;
+	int err;
+
+	sdp->md.jiinode = createi(sdp->master_dir, "jindex", S_IFDIR | 0700,
+				  GFS2_DIF_SYSTEM);
+	if (sdp->md.jiinode == NULL) {
+		return errno;
+	}
+
+	sdp->md.journal = malloc(sdp->md.journals *
+				 sizeof(struct gfs2_inode *));
+	if (sdp->md.journal == NULL) {
+		/* don't leave the freshly created jindex inode held */
+		err = errno;
+		inode_put(&sdp->md.jiinode);
+		return err;
+	}
+	for (j = 0; j < sdp->md.journals; j++) {
+		char name[256];
+
+		printf(_("Writing journal #%d..."), j + 1);
+		fflush(stdout);
+		sprintf(name, "journal%u", j);
+		sdp->md.journal[j] = createi(sdp->md.jiinode, name, S_IFREG |
+					     0600, GFS2_DIF_SYSTEM);
+		if (sdp->md.journal[j] == NULL) {
+			/* the first createi() call above is checked for
+			 * NULL, so check this one too before the inode is
+			 * handed to write_journal(); make sure the caller
+			 * sees a non-zero code */
+			err = errno ? errno : EIO;
+			free(sdp->md.journal);
+			sdp->md.journal = NULL;
+			inode_put(&sdp->md.jiinode);
+			return err;
+		}
+		/* journal length: jsize is in MB, converted to fs blocks */
+		write_journal(sdp->md.journal[j], sdp->bsize,
+			      sdp->jsize << 20 >> sdp->sd_sb.sb_bsize_shift);
+		inode_put(&sdp->md.journal[j]);
+		printf(_("done.\n"));
+		fflush(stdout);
+	}
+
+	free(sdp->md.journal);
+	sdp->md.journal = NULL; /* no dangling pointer to the freed array */
+	inode_put(&sdp->md.jiinode);
+	return 0;
+}
+
+/* total_file_blocks - estimate the blocks needed for a file of filesize
+ * bytes, using the journaled-data constants when journaled is non-zero.
+ * Returns 0 when the data fits in the dinode block itself. */
+static unsigned int total_file_blocks(struct gfs2_sbd *sdp,
+				      uint64_t filesize, int journaled)
+{
+	unsigned int data_blks = 0, meta_blks = 0;
+	unsigned int max, height, bsize;
+	uint64_t *arr;
+
+	/* Now find the total meta blocks required for data_blks */
+	if (filesize <= sdp->bsize - sizeof(struct gfs2_dinode)) {
+		goto out;
+	}
+
+	if (journaled) {
+		arr = sdp->sd_jheightsize;
+		max = sdp->sd_max_jheight;
+		bsize = sdp->sd_jbsize;
+	} else {
+		arr = sdp->sd_heightsize;
+		max = sdp->sd_max_height;
+		bsize = sdp->bsize;
+	}
+	data_blks = DIV_RU(filesize, bsize); /* total data blocks reqd */
+
+	/* first height whose capacity covers the file */
+	for (height = 0; height < max; height++)
+		if (arr[height] >= filesize)
+			break;
+	if (height == 1) {
+		goto out;
+	}
+
+	meta_blks = DIV_RU(data_blks, sdp->sd_inptrs);
+out:
+	return data_blks + meta_blks;
+}
+
+/* We check if the GFS2 filesystem files/structures created after the call to
+ * check_fit() in main() will fit in the currently available free blocks
+ */
+static int check_fit(struct gfs2_sbd *sdp)
+{
+	unsigned int blks_need = 0, blks_avail =
sdp->blks_total - sdp->blks_alloced;
+
+	/* build_master() */
+	blks_need++; /* creation of master dir inode - 1 block */
+
+	/* conv_build_jindex() */
+	{
+		blks_need++; /* creation of 'jindex' disk inode */
+		/* creation of journals */
+		blks_need += sdp->md.journals *
+			total_file_blocks(sdp, sdp->jsize << 20, 1); /* jsize is in MB */
+	}
+	/* build_per_node() */
+	{
+		blks_need++; /* creation of 'per_node' dir inode */
+		/* njourn x (inum_range + statfs_change + quota_change inodes) */
+		blks_need += sdp->md.journals * 3;
+		/* quota change inodes are prealloced */
+		blks_need += sdp->md.journals *
+			total_file_blocks(sdp, sdp->qcsize << 20, 1);
+	}
+	/* build_inum() */
+	blks_need++; /* creation of 'inum' disk inode */
+
+	/* build_statfs() */
+	blks_need++; /* creation of 'statfs' disk inode */
+
+	/* build_rindex() */
+	{
+		struct osi_node *n, *next = NULL;
+		unsigned int rg_count = 0;
+
+		blks_need++; /* creation of 'rindex' disk inode */
+		/* find the total # of rindex entries, gives size of rindex inode */
+		for (n = osi_first(&sdp->rgtree); n; n = next) {
+			next = osi_next(n);
+			rg_count++;
+		}
+		blks_need += total_file_blocks(sdp, rg_count *
+					       sizeof(struct gfs2_rindex), 1);
+	}
+	/* build_quota() */
+	blks_need++; /* quota inode block and uid=gid=0 quota - total 1 block */
+
+	/* Up until this point we require blks_need blocks. We don't
+	 * include the blocks freed by the next step (remove_obsolete_gfs1)
+	 * because it's possible for us to exceed the available blocks
+	 * before this step */
+
+	return blks_avail > blks_need; /* non-zero only when strictly more blocks are free than needed */
+}
+
+/* We fetch the old quota inode block and copy the contents of the block
+ * (minus the struct gfs2_dinode) into the new quota block. We update the
+ * inode height/size of the new quota file to that of the old one and set the
+ * old quota inode height/size to zero, so only the inode block gets freed.
+ */ +static void copy_quotas(struct gfs2_sbd *sdp) +{ + struct gfs2_inum inum; + struct gfs2_inode *oq_ip, *nq_ip; + int err; + + err = gfs2_lookupi(sdp->master_dir, "quota", 5, &nq_ip); + if (err) { + fprintf(stderr, _("Couldn't lookup new quota file: %d\n"), err); + exit(1); + } + + gfs2_inum_in(&inum, (char *)&raw_gfs1_ondisk_sb.sb_quota_di); + oq_ip = lgfs2_inode_read(sdp, inum.no_addr); + if (oq_ip == NULL) { + fprintf(stderr, _("Couldn't lookup old quota file: %s\n"), strerror(errno)); + exit(1); + } + + nq_ip->i_di.di_height = oq_ip->i_di.di_height; + nq_ip->i_di.di_size = oq_ip->i_di.di_size; + nq_ip->i_di.di_blocks = oq_ip->i_di.di_blocks; + memcpy(nq_ip->i_bh->b_data + sizeof(struct gfs2_dinode), + oq_ip->i_bh->b_data + sizeof(struct gfs2_dinode), + sdp->bsize - sizeof(struct gfs2_dinode)); + + oq_ip->i_di.di_height = 0; + oq_ip->i_di.di_size = 0; + + bmodified(nq_ip->i_bh); + inode_put(&nq_ip); + + bmodified(oq_ip->i_bh); + inode_put(&oq_ip); +} + +static int gfs2_query(struct gfs2_options *opts, const char *dev) +{ + int res = 0; + + if(opts->yes) + return 1; + if(opts->no) + return 0; + + opts->query = TRUE; + while (1) { + char *line = NULL; + size_t len = 0; + int ret; + + printf(_("Convert %s from GFS1 to GFS2? (y/n)"), dev); + fflush(stdout); + ret = getline(&line, &len, stdin); + res = rpmatch(line); + free(line); + if (ret <= 0) + continue; + if (res == 1 || res == 0) + break; + /* Unrecognized input; go again. */ + } + opts->query = FALSE; + return res; +} + +int main(int argc, char **argv) +{ + int error; + struct gfs2_buffer_head *bh; + struct gfs2_options opts; + + /* Set i18n support to gfs2_convert */ + setlocale(LC_ALL, ""); + textdomain("gfs2-utils"); + + version(); + process_parameters(argc, argv, &opts); + error = init(&sb2, &opts); + + /* + * Check for some common fs errors + */ + if (!error) { + if (sanity_check(&sb2)) { + log_crit(_("%s is not a clean gfs filesystem. 
Please use the" + " fsck.gfs2 utility to correct these errors and" + " try again.\n"), opts.device); + exit(0); + } + } + /* ---------------------------------------------- */ + /* Make them seal their fate. */ + /* ---------------------------------------------- */ + if (!error) { + give_warning(); + if (!gfs2_query(&opts, opts.device)) { + log_crit(_("%s not converted.\n"), opts.device); + close(sb2.device_fd); + exit(0); + } + } + /* ---------------------------------------------- */ + /* Convert incore gfs1 sb to gfs2 sb */ + /* ---------------------------------------------- */ + if (!error) { + log_notice(_("Converting resource groups.")); + fflush(stdout); + error = convert_rgs(&sb2); + log_notice("\n"); + if (error) + log_crit(_("%s: Unable to convert resource groups.\n"), opts.device); + fsync(sb2.device_fd); /* write the buffers to disk */ + } + /* ---------------------------------------------- */ + /* Renumber the inodes consecutively. */ + /* ---------------------------------------------- */ + if (!error) { + /* Add a string notifying inode converstion start? */ + error = inode_renumber(&sb2, sb2.sd_sb.sb_root_dir.no_addr, + (osi_list_t *)&cdpns_to_fix); + if (error) + log_crit(_("\n%s: Error renumbering inodes.\n"), opts.device); + fsync(sb2.device_fd); /* write the buffers to disk */ + } + /* ---------------------------------------------- */ + /* Fix the directories to match the new numbers. 
*/ + /* ---------------------------------------------- */ + if (!error) { + error = fix_directory_info(&sb2, (osi_list_t *)&dirs_to_fix); + log_notice(_("\r%llu directories, %llu dirents fixed."), + (unsigned long long)dirs_fixed, + (unsigned long long)dirents_fixed); + fflush(stdout); + if (error) + log_crit(_("\n%s: Error fixing directories.\n"), opts.device); + } + /* ---------------------------------------------- */ + /* Convert cdpn symlinks to empty dirs */ + /* ---------------------------------------------- */ + if (!error) { + error = fix_cdpn_symlinks(&sb2, (osi_list_t *)&cdpns_to_fix); + log_notice(_("\r%llu cdpn symlinks moved to empty directories."), + (unsigned long long)cdpns_fixed); + fflush(stdout); + if (error) + log_crit(_("\n%s: Error fixing cdpn symlinks.\n"), opts.device); + } + /* ---------------------------------------------- */ + /* Convert journal space to rg space */ + /* ---------------------------------------------- */ + if (!error) { + log_notice(_("\nConverting journals.\n")); + error = journ_space_to_rg(&sb2); + if (error) + log_crit(_("%s: Error converting journal space.\n"), opts.device); + fsync(sb2.device_fd); /* write the buffers to disk */ + } + /* ---------------------------------------------- */ + /* Create our system files and directories. */ + /* ---------------------------------------------- */ + if (!error) { + int jreduce = 0; + + /* Now we've got to treat it as a gfs2 file system */ + if (compute_constants(&sb2)) { + log_crit("%s\n", _("Failed to compute file system constants")); + exit(-1); + } + + /* Check if all the files we're about to create will + * fit into the space remaining on the device */ + while (!check_fit(&sb2)) { + sb2.jsize--; /* reduce jsize by 1MB each time */ + jreduce = 1; + } + if (jreduce) + log_notice(_("Reduced journal size to %u MB to accommodate " + "GFS2 file system structures.\n"), sb2.jsize); + /* Build the master subdirectory. 
*/ + build_master(&sb2); /* Does not do inode_put */ + sb2.sd_sb.sb_master_dir = sb2.master_dir->i_di.di_num; + /* Build empty journal index file. */ + error = conv_build_jindex(&sb2); + if (error) { + log_crit(_("Error: could not build jindex: %s\n"), strerror(error)); + exit(-1); + } + log_notice(_("Building GFS2 file system structures.\n")); + /* Build the per-node directories */ + error = build_per_node(&sb2); + if (error) { + log_crit(_("Error building per-node directories: %s\n"), + strerror(error)); + exit(-1); + } + /* Create the empty inode number file */ + error = build_inum(&sb2); /* Does not do inode_put */ + if (error) { + log_crit(_("Error building inum inode: %s\n"), + strerror(error)); + exit(-1); + } + gfs2_lookupi(sb2.master_dir, "inum", 4, &sb2.md.inum); + /* Create the statfs file */ + error = build_statfs(&sb2); /* Does not do inode_put */ + if (error) { + log_crit(_("Error building statfs inode: %s\n"), + strerror(error)); + exit(-1); + } + gfs2_lookupi(sb2.master_dir, "statfs", 6, &sb2.md.statfs); + do_init_statfs(&sb2); + + /* Create the resource group index file */ + error = build_rindex(&sb2); + if (error) { + log_crit(_("Error building rindex inode: %s\n"), + strerror(error)); + exit(-1); + } + /* Create the quota file */ + error = build_quota(&sb2); + if (error) { + log_crit(_("Error building quota inode: %s\n"), + strerror(error)); + exit(-1); + } + + /* Copy out the master dinode */ + { + struct gfs2_inode *ip = sb2.master_dir; + if (ip->i_bh->b_modified) + gfs2_dinode_out(&ip->i_di, ip->i_bh->b_data); + } + /* Copy old quotas */ + copy_quotas(&sb2); + + update_inode_file(&sb2); + /* Now delete the now-obsolete gfs1 files: */ + remove_obsolete_gfs1(&sb2); + + write_statfs_file(&sb2); + + inode_put(&sb2.master_dir); + inode_put(&sb2.md.inum); + inode_put(&sb2.md.statfs); + + fsync(sb2.device_fd); /* write the buffers to disk */ + + /* Now free all the in memory */ + gfs2_rgrp_free(&sb2.rgtree); + log_notice(_("Committing changes to 
disk.\n"));
+		fflush(stdout);
+		/* Set filesystem type in superblock to gfs2. We do this at the */
+		/* end because if the tool is interrupted in the middle, we want */
+		/* it to not reject the partially converted fs as already done */
+		/* when it's run a second time. */
+		bh = bread(&sb2, LGFS2_SB_ADDR(&sb2));
+		/* the buffer is written through below; stop if it is missing */
+		if (bh == NULL) {
+			perror(opts.device);
+			exit(-1);
+		}
+		sb2.sd_sb.sb_fs_format = GFS2_FORMAT_FS;
+		sb2.sd_sb.sb_multihost_format = GFS2_FORMAT_MULTI;
+		gfs2_sb_out(&sb2.sd_sb, bh->b_data);
+		bmodified(bh);
+		brelse(bh);
+
+		error = fsync(sb2.device_fd);
+		if (error)
+			perror(opts.device);
+		else
+			log_notice(_("%s: filesystem converted successfully to gfs2.\n"), opts.device);
+	}
+	close(sb2.device_fd);
+	if (sd_jindex)
+		free(sd_jindex);
+	/* error is non-zero when any step above failed; let the caller see
+	 * that instead of an unconditional success status */
+	exit(error ? 1 : 0);
+}
diff --git a/gfs2/edit/Makefile.am b/gfs2/edit/Makefile.am
new file mode 100644
index 0000000..a9b177e
--- /dev/null
+++ b/gfs2/edit/Makefile.am
@@ -0,0 +1,33 @@
+MAINTAINERCLEANFILES = Makefile.in
+
+sbin_PROGRAMS = gfs2_edit
+
+noinst_HEADERS = \
+	gfs2hex.h \
+	hexedit.h \
+	extended.h \
+	journal.h
+
+gfs2_edit_SOURCES = \
+	gfs2hex.c \
+	hexedit.c \
+	savemeta.c \
+	extended.c \
+	journal.c
+
+gfs2_edit_CPPFLAGS = \
+	-D_FILE_OFFSET_BITS=64 \
+	-I$(top_srcdir)/gfs2/include \
+	-I$(top_srcdir)/gfs2/libgfs2
+
+gfs2_edit_CFLAGS = \
+	$(ncurses_CFLAGS) \
+	$(zlib_CFLAGS) \
+	$(uuid_CFLAGS)
+
+gfs2_edit_LDFLAGS = \
+	$(ncurses_LIBS) \
+	$(zlib_LIBS) \
+	$(uuid_LIBS)
+
+gfs2_edit_LDADD = $(top_builddir)/gfs2/libgfs2/libgfs2.la
diff --git a/gfs2/edit/extended.c b/gfs2/edit/extended.c
new file mode 100644
index 0000000..d24d755
--- /dev/null
+++ b/gfs2/edit/extended.c
@@ -0,0 +1,727 @@
+#include "clusterautoconfig.h"
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include
+#include "copyright.cf"
+
+#include "hexedit.h"
+#include "libgfs2.h"
+#include "extended.h"
+#include "gfs2hex.h"
+
+extern uint64_t block;
+
+static void
print_block_details(struct iinfo *ind, int level, int cur_height,
+				int pndx, uint64_t file_offset);
+
+/* get_height - count how many steps back in the block history (at most 5)
+ * the nearest dinode lies; 0 when the current block is itself a dinode. */
+static int get_height(void)
+{
+	int cur_height = 0, i;
+
+	if (gfs2_struct_type != GFS2_METATYPE_DI) {
+		for (i = 0; i <= blockhist && i < 5; i++) {
+			if (blockstack[(blockhist - i) %
+				       BLOCK_STACK_SIZE].gfs2_struct_type ==
+			    GFS2_METATYPE_DI)
+				break;
+			cur_height++;
+		}
+	}
+	return cur_height;
+}
+
+/* _do_indirect_extended - collect the non-zero block pointers of an
+ * indirect block.
+ * @diebuf: raw block contents
+ * @iinf: table that receives one entry per non-zero pointer
+ * @hgt: metapath slot in which each entry's index is recorded
+ * Returns: the number of entries stored in iinf->ii[] */
+static int _do_indirect_extended(char *diebuf, struct iinfo *iinf, int hgt)
+{
+	unsigned int x;
+	off_t headoff;
+	uint64_t p;
+	int i_blocks;
+
+	i_blocks = 0;
+	for (x = 0; x < 512; x++) {
+		iinf->ii[x].is_dir = 0;
+		iinf->ii[x].height = 0;
+		iinf->ii[x].block = 0;
+		iinf->ii[x].dirents = 0;
+		memset(&iinf->ii[x].dirent, 0, sizeof(struct gfs2_dirents));
+	}
+	/* pointers start right after the format-specific block header */
+	headoff = sbd.gfs1 ? sizeof(struct gfs_indirect) : sizeof(struct gfs2_meta_header);
+	for (x = headoff; x < sbd.bsize; x += sizeof(uint64_t)) {
+		p = be64_to_cpu(*(uint64_t *)(diebuf + x));
+		if (p) {
+			/* Only the 512 entries reset above are known to
+			 * exist; stop instead of writing past them when a
+			 * block yields more pointers than that. */
+			if (i_blocks >= 512)
+				break;
+			iinf->ii[i_blocks].block = p;
+			iinf->ii[i_blocks].mp.mp_list[hgt] = i_blocks;
+			iinf->ii[i_blocks].is_dir = FALSE;
+			iinf->ii[i_blocks].ptroff = (x - headoff) / sizeof(uint64_t);
+			i_blocks++;
+		}
+	}
+	return i_blocks;
+}
+
+/* do_indirect_extended - as above, at the height reported by get_height() */
+int do_indirect_extended(char *diebuf, struct iinfo *iinf)
+{
+	return _do_indirect_extended(diebuf, iinf, get_height());
+}
+
+/* ------------------------------------------------------------------------ */
+/* dinode_valid - check if we have a dinode in recent history */
+/* Returns: 1 if the current block or one of the last 5 history entries is */
+/* a dinode, otherwise 0 */
+/* ------------------------------------------------------------------------ */
+static int dinode_valid(void)
+{
+	int i;
+
+	if (gfs2_struct_type == GFS2_METATYPE_DI)
+		return 1;
+	for (i = 0; i <= blockhist && i < 5; i++) {
+		if (blockstack[(blockhist - i) %
+			       BLOCK_STACK_SIZE].gfs2_struct_type ==
+		    GFS2_METATYPE_DI)
+			return 1;
+	}
+	return 0;
+}
+
+static uint64_t metapath_to_lblock(struct metapath *mp, int hgt)
+{
+	int h;
+	uint64_t lblock = 0;
+	uint64_t factor[GFS2_MAX_META_HEIGHT];
+
+	if
(di.di_height < 2) + return mp->mp_list[0]; + /* figure out multiplication factors for each height */ + memset(&factor, 0, sizeof(factor)); + factor[di.di_height - 1] = 1ull; + for (h = di.di_height - 2; h >= 0; h--) + factor[h] = factor[h + 1] * sbd.sd_inptrs; + for (h = 0; h <= hgt; h++) + lblock += (mp->mp_list[h] * factor[h]); + return lblock; +} + +static int display_indirect(struct iinfo *ind, int indblocks, int level, + uint64_t startoff) +{ + int start_line; + int cur_height = -1, pndx; + + last_entry_onscreen[dmode] = 0; + if (!has_indirect_blocks()) + return -1; + if (!level) { + if (gfs2_struct_type == GFS2_METATYPE_DI) { + if (S_ISDIR(di.di_mode)) + print_gfs2("This directory contains %d indirect blocks", + indblocks); + else + print_gfs2("This inode contains %d indirect blocks", + indblocks); + } else + print_gfs2("This indirect block contains %d indirect blocks", + indblocks); + } + if (dinode_valid() && !S_ISDIR(di.di_mode)) { + /* See if we are on an inode or have one in history. 
*/
+		if (level)
+			cur_height = level; /* a non-zero level is used as the height */
+		else {
+			cur_height = get_height();
+			print_gfs2(" (at height %d of %d)",
+				   cur_height, di.di_height);
+		}
+	}
+	eol(0);
+	if (!level && indblocks) {
+		print_gfs2("Indirect blocks:");
+		eol(0);
+	}
+	start_line = line;
+	for (pndx = start_row[dmode];
+	     (!termlines || pndx < termlines - start_line - 1
+	      + start_row[dmode]) && pndx < indblocks;
+	     pndx++) {
+		uint64_t file_offset;
+
+		if (pndx && ind->ii[pndx].block == ind->ii[pndx - 1].block)
+			continue; /* skip an entry repeating the previous block address */
+		print_entry_ndx = pndx;
+		if (termlines) {
+			if (edit_row[dmode] >= 0 &&
+			    line - start_line ==
+			    edit_row[dmode] - start_row[dmode])
+				COLORS_HIGHLIGHT;
+			move(line, 1);
+		}
+		if (!termlines) {
+			int h;
+
+			for (h = 0; h < level; h++) /* one padding string per nesting level */
+				print_gfs2(" ");
+		}
+		print_gfs2("%d: 0x%"PRIx64" => ", pndx, ind->ii[pndx].ptroff);
+		if (termlines)
+			move(line,9);
+		print_gfs2("0x%"PRIx64" / %"PRId64, ind->ii[pndx].block,
+			   ind->ii[pndx].block);
+		if (termlines) {
+			if (edit_row[dmode] >= 0 &&
+			    line - start_line ==
+			    edit_row[dmode] - start_row[dmode]) {
+				/* the highlighted row's block number becomes the edit string */
+				sprintf(estring, "%llx",
+					(unsigned long long)ind->ii[print_entry_ndx].block);
+				strcpy(edit_fmt, "%llx");
+				edit_size[dmode] = strlen(estring);
+				COLORS_NORMAL;
+			}
+		}
+		if (dinode_valid() && !S_ISDIR(di.di_mode)) {
+			float human_off;
+			char h;
+
+			file_offset = metapath_to_lblock(&ind->ii[pndx].mp,
+							 cur_height) *
+				sbd.bsize;
+			print_gfs2(" ");
+			h = 'K'; /* start in KiB and step up one unit per division below */
+			human_off = (file_offset / 1024.0);
+			if (human_off > 1024.0) { h = 'M'; human_off /= 1024.0; }
+			if (human_off > 1024.0) { h = 'G'; human_off /= 1024.0; }
+			if (human_off > 1024.0) { h = 'T'; human_off /= 1024.0; }
+			if (human_off > 1024.0) { h = 'P'; human_off /= 1024.0; }
+			if (human_off > 1024.0) { h = 'E'; human_off /= 1024.0; }
+			print_gfs2("(data offset 0x%"PRIx64" / %"PRId64" / %6.2f%c)",
+				   file_offset, file_offset, human_off, h);
+			print_gfs2(" ");
+		}
+		else
+			file_offset = 0;
+		if (dinode_valid() && !termlines &&
+		    ((level + 1 < di.di_height) ||
+
(S_ISDIR(di.di_mode) && level <= di.di_height))) { + print_block_details(ind, level, cur_height, pndx, + file_offset); + } + print_entry_ndx = pndx; /* restore after recursion */ + eol(0); + } /* for each display row */ + if (line >= 7) /* 7 because it was bumped at the end */ + last_entry_onscreen[dmode] = line - 7; + eol(0); + end_row[dmode] = indblocks; + if (end_row[dmode] < last_entry_onscreen[dmode]) + end_row[dmode] = last_entry_onscreen[dmode]; + lines_per_row[dmode] = 1; + return 0; +} + +static void print_inode_type(__be16 de_type) +{ + if (sbd.gfs1) { + switch(de_type) { + case GFS_FILE_NON: + print_gfs2("Unknown"); + break; + case GFS_FILE_REG: + print_gfs2("File "); + break; + case GFS_FILE_DIR: + print_gfs2("Dir "); + break; + case GFS_FILE_LNK: + print_gfs2("Symlink"); + break; + case GFS_FILE_BLK: + print_gfs2("BlkDev "); + break; + case GFS_FILE_CHR: + print_gfs2("ChrDev "); + break; + case GFS_FILE_FIFO: + print_gfs2("Fifo "); + break; + case GFS_FILE_SOCK: + print_gfs2("Socket "); + break; + default: + print_gfs2("%04x ", de_type); + break; + } + return; + } + switch(de_type) { + case DT_UNKNOWN: + print_gfs2("Unknown"); + break; + case DT_REG: + print_gfs2("File "); + break; + case DT_DIR: + print_gfs2("Dir "); + break; + case DT_LNK: + print_gfs2("Symlink"); + break; + case DT_BLK: + print_gfs2("BlkDev "); + break; + case DT_CHR: + print_gfs2("ChrDev "); + break; + case DT_FIFO: + print_gfs2("Fifo "); + break; + case DT_SOCK: + print_gfs2("Socket "); + break; + default: + print_gfs2("%04x ", de_type); + break; + } +} + +#ifdef GFS2_HAS_LEAF_HINTS +#define LEAF_HINT_FMTS "lf_inode: 0x%llx, lf_dist: %u, " \ + "lf_nsec: %u, lf_sec: %llu, " +#define LEAF_HINT_FIELDS(lp) lp->lf_inode, lp->lf_dist, lp->lf_nsec, lp->lf_sec, +#else +#define LEAF_HINT_FMTS +#define LEAF_HINT_FIELDS(lp) +#endif + +static int display_leaf(struct iinfo *ind) +{ + struct gfs2_leaf *leaf = &ind->ii[0].lf; + int start_line, total_dirents = start_row[dmode]; + int d; + + 
eol(0); + if (gfs2_struct_type == GFS2_METATYPE_SB) + print_gfs2("The superblock has 2 directories"); + else + print_gfs2("Directory block: lf_depth:%d, lf_entries:%d, " + LEAF_HINT_FMTS + "fmt:%d next=0x%llx (%d dirents).", + leaf->lf_depth, leaf->lf_entries, + LEAF_HINT_FIELDS(leaf) + leaf->lf_dirent_format, + leaf->lf_next, + ind->ii[0].dirents); + + start_line = line; + for (d = start_row[dmode]; d < ind->ii[0].dirents; d++) { + if (termlines && d >= termlines - start_line - 2 + + start_row[dmode]) + break; + total_dirents++; + if (ind->ii[0].dirents >= 1) { + eol(3); + if (termlines) { + if (edit_row[dmode] >=0 && + line - start_line - 1 == + edit_row[dmode] - start_row[dmode]) { + COLORS_HIGHLIGHT; + sprintf(estring, "%llx", + (unsigned long long)ind->ii[0].dirent[d].block); + strcpy(edit_fmt, "%llx"); + } + } + print_gfs2("%d/%d [%08x] %lld/%"PRId64" (0x%llx/0x%"PRIx64") +%u: ", + total_dirents, d + 1, + ind->ii[0].dirent[d].dirent.de_hash, + ind->ii[0].dirent[d].dirent.de_inum.no_formal_ino, + ind->ii[0].dirent[d].block, + ind->ii[0].dirent[d].dirent.de_inum.no_formal_ino, + ind->ii[0].dirent[d].block, +#ifdef GFS2_HAS_DE_RAHEAD + (unsigned int)ind->ii[0].dirent[d].dirent.de_rahead +#else + 0 +#endif + ); + } + print_inode_type(ind->ii[0].dirent[d].dirent.de_type); + print_gfs2(" %s", ind->ii[0].dirent[d].filename); + if (termlines) { + if (edit_row[dmode] >= 0 && + line - start_line - 1 == edit_row[dmode] - + start_row[dmode]) + COLORS_NORMAL; + } + } + if (line >= 4) + last_entry_onscreen[dmode] = line - 4; + eol(0); + end_row[dmode] = ind->ii[0].dirents; + if (end_row[dmode] < last_entry_onscreen[dmode]) + end_row[dmode] = last_entry_onscreen[dmode]; + return 0; +} + +static void print_block_details(struct iinfo *ind, int level, int cur_height, + int pndx, uint64_t file_offset) +{ + struct iinfo *more_indir; + int more_ind; + char *tmpbuf; + uint64_t thisblk; + + thisblk = ind->ii[pndx].block; + more_indir = malloc(sizeof(struct iinfo)); + if 
(!more_indir) { + fprintf(stderr, "Out of memory in function " + "display_indirect\n"); + return; + } + tmpbuf = malloc(sbd.bsize); + if (!tmpbuf) { + fprintf(stderr, "Out of memory in function " + "display_indirect\n"); + free(more_indir); + return; + } + while (thisblk) { + /* read in the desired block */ + if (pread(sbd.device_fd, tmpbuf, sbd.bsize, thisblk * sbd.bsize) != sbd.bsize) { + fprintf(stderr, "bad read: %s from %s:%d: block %lld " + "(0x%llx)\n", strerror(errno), __FUNCTION__, + __LINE__, + (unsigned long long)ind->ii[pndx].block, + (unsigned long long)ind->ii[pndx].block); + exit(-1); + } + thisblk = 0; + memset(more_indir, 0, sizeof(struct iinfo)); + if (S_ISDIR(di.di_mode) && level == di.di_height) { + thisblk = do_leaf_extended(tmpbuf, more_indir); + display_leaf(more_indir); + } else { + int x; + + for (x = 0; x < 512; x++) { + memcpy(&more_indir->ii[x].mp, + &ind->ii[pndx].mp, + sizeof(struct metapath)); + more_indir->ii[x].mp.mp_list[cur_height+1] = x; + } + more_ind = _do_indirect_extended(tmpbuf, more_indir, + cur_height + 1); + display_indirect(more_indir, more_ind, level + 1, + file_offset); + } + if (thisblk) { + eol(0); + if (termlines) + move(line,9); + print_gfs2("Continuation block 0x%"PRIx64" / %"PRId64, + thisblk, thisblk); + } + } + free(tmpbuf); + free(more_indir); +} + +static void gfs_jindex_print(struct gfs_jindex *ji) +{ + pv((unsigned long long)ji, ji_addr, "%llu", "0x%llx"); + pv(ji, ji_nsegment, "%u", "0x%x"); + pv(ji, ji_pad, "%u", "0x%x"); +} + +static int print_gfs_jindex(struct gfs2_inode *dij) +{ + int error, start_line; + struct gfs_jindex ji; + char jbuf[sizeof(struct gfs_jindex)]; + + start_line = line; + print_gfs2("Journal index entries found: %lld.", + dij->i_di.di_size / sizeof(struct gfs_jindex)); + eol(0); + lines_per_row[dmode] = 4; + for (print_entry_ndx=0; ; print_entry_ndx++) { + error = gfs2_readi(dij, (void *)&jbuf, + print_entry_ndx*sizeof(struct gfs_jindex), + sizeof(struct gfs_jindex)); + 
gfs_jindex_in(&ji, jbuf); + if (!error) /* end of file */ + break; + if (!termlines || + (print_entry_ndx >= start_row[dmode] && + ((print_entry_ndx - start_row[dmode])+1) * + lines_per_row[dmode] <= termlines - start_line - 2)) { + if (edit_row[dmode] == print_entry_ndx) { + COLORS_HIGHLIGHT; + strcpy(efield, "ji_addr"); + sprintf(estring, "%llx", (unsigned long long)ji.ji_addr); + } + print_gfs2("Journal #%d", print_entry_ndx); + eol(0); + if (edit_row[dmode] == print_entry_ndx) + COLORS_NORMAL; + gfs_jindex_print(&ji); + last_entry_onscreen[dmode] = print_entry_ndx; + } + } + end_row[dmode] = print_entry_ndx; + return error; +} + +static int print_gfs2_jindex(void) +{ + int d, error; + struct gfs2_log_header head; + struct gfs2_inode *ip; + + for (d = 0; d < indirect->ii[0].dirents; d++) { + if (strncmp(indirect->ii[0].dirent[d].filename, "journal", 7)) + continue; + ip = lgfs2_inode_read(&sbd, indirect->ii[0].dirent[d].block); + print_gfs2("%s: 0x%-5"PRIx64" %lldMB ", + indirect->ii[0].dirent[d].filename, + indirect->ii[0].dirent[d].block, + ip->i_di.di_size / 1048576); + error = gfs2_find_jhead(ip, &head); + if (error) { + print_gfs2("corrupt."); + } else { + if (head.lh_flags & GFS2_LOG_HEAD_UNMOUNT) + print_gfs2("clean."); + else + print_gfs2("dirty."); + } + eol(0); + inode_put(&ip); + } + return 0; +} + +static int parse_rindex(struct gfs2_inode *dip, int print_rindex) +{ + int error, start_line; + struct gfs2_rindex ri; + char rbuf[sizeof(struct gfs2_rindex)]; + char highlighted_addr[32]; + + start_line = line; + print_gfs2("RG index entries found: %lld.", dip->i_di.di_size / + sizeof(struct gfs2_rindex)); + eol(0); + lines_per_row[dmode] = 6; + memset(highlighted_addr, 0, sizeof(highlighted_addr)); + + for (print_entry_ndx=0; ; print_entry_ndx++) { + uint64_t roff; + + roff = print_entry_ndx * sizeof(struct gfs2_rindex); + + error = gfs2_readi(dip, (void *)&rbuf, roff, + sizeof(struct gfs2_rindex)); + if (!error) /* end of file */ + break; + 
gfs2_rindex_in(&ri, rbuf); + if (!termlines || + (print_entry_ndx >= start_row[dmode] && + ((print_entry_ndx - start_row[dmode])+1) * lines_per_row[dmode] <= + termlines - start_line - 2)) { + if (edit_row[dmode] == print_entry_ndx) { + COLORS_HIGHLIGHT; + sprintf(highlighted_addr, "%llx", (unsigned long long)ri.ri_addr); + } + print_gfs2("RG #%d", print_entry_ndx); + if (!print_rindex) + print_gfs2(" located at: %llu (0x%llx)", + ri.ri_addr, ri.ri_addr); + eol(0); + if (edit_row[dmode] == print_entry_ndx) + COLORS_NORMAL; + if(print_rindex) + gfs2_rindex_print(&ri); + else { + struct gfs2_buffer_head *tmp_bh; + + tmp_bh = bread(&sbd, ri.ri_addr); + if (sbd.gfs1) { + struct gfs_rgrp rg1; + gfs_rgrp_in(&rg1, tmp_bh); + gfs_rgrp_print(&rg1); + } else { + struct gfs2_rgrp rg; + gfs2_rgrp_in(&rg, tmp_bh->b_data); + gfs2_rgrp_print(&rg); + } + brelse(tmp_bh); + } + last_entry_onscreen[dmode] = print_entry_ndx; + } + } + strcpy(estring, highlighted_addr); + end_row[dmode] = print_entry_ndx; + return error; +} + +static int print_inum(struct gfs2_inode *dii) +{ + uint64_t inum, inodenum; + int rc; + + rc = gfs2_readi(dii, (void *)&inum, 0, sizeof(inum)); + if (!rc) { + print_gfs2("The inum file is empty."); + eol(0); + return 0; + } + if (rc != sizeof(inum)) { + print_gfs2("Error reading inum file."); + eol(0); + return -1; + } + inodenum = be64_to_cpu(inum); + print_gfs2("Next inode num = %"PRId64" (0x%"PRIx64")", inodenum, inodenum); + eol(0); + return 0; +} + +static int print_statfs(struct gfs2_inode *dis) +{ + struct gfs2_statfs_change sfb, sfc; + int rc; + + rc = gfs2_readi(dis, (void *)&sfb, 0, sizeof(sfb)); + if (!rc) { + print_gfs2("The statfs file is empty."); + eol(0); + return 0; + } + if (rc != sizeof(sfb)) { + print_gfs2("Error reading statfs file."); + eol(0); + return -1; + } + gfs2_statfs_change_in(&sfc, (char *)&sfb); + print_gfs2("statfs file contents:"); + eol(0); + gfs2_statfs_change_print(&sfc); + return 0; +} + +static int print_quota(struct 
gfs2_inode *diq) +{ + struct gfs2_quota qbuf, q; + int i, error; + + print_gfs2("quota file contents:"); + eol(0); + print_gfs2("quota entries found: %lld.", diq->i_di.di_size / sizeof(q)); + eol(0); + for (i=0; ; i++) { + error = gfs2_readi(diq, (void *)&qbuf, i * sizeof(q), sizeof(qbuf)); + if (!error) + break; + if (error != sizeof(qbuf)) { + print_gfs2("Error reading quota file."); + eol(0); + return -1; + } + gfs2_quota_in(&q, (char *)&qbuf); + print_gfs2("Entry #%d", i + 1); + eol(0); + gfs2_quota_print(&q); + } + return 0; +} + +int display_extended(void) +{ + struct gfs2_inode *tmp_inode; + struct gfs2_buffer_head *tmp_bh; + + dsplines = termlines - line - 1; + /* Display any indirect pointers that we have. */ + if (block_is_rindex(block)) { + tmp_bh = bread(&sbd, block); + tmp_inode = lgfs2_inode_get(&sbd, tmp_bh); + if (tmp_inode == NULL) + return -1; + parse_rindex(tmp_inode, TRUE); + inode_put(&tmp_inode); + brelse(tmp_bh); + } else if (block_is_journals(block)) { + if (sbd.gfs1) + block = sbd1->sb_jindex_di.no_addr; + else + block = masterblock("jindex"); + print_gfs2_jindex(); + } else if (has_indirect_blocks() && !indirect_blocks && + !display_leaf(indirect)) + return -1; + else if (display_indirect(indirect, indirect_blocks, 0, 0) == 0) + return -1; + else if (block_is_rgtree(block)) { + if (sbd.gfs1) + tmp_bh = bread(&sbd, sbd1->sb_rindex_di.no_addr); + else + tmp_bh = bread(&sbd, masterblock("rindex")); + tmp_inode = lgfs2_inode_get(&sbd, tmp_bh); + if (tmp_inode == NULL) + return -1; + parse_rindex(tmp_inode, FALSE); + inode_put(&tmp_inode); + brelse(tmp_bh); + } else if (block_is_jindex(block)) { + tmp_bh = bread(&sbd, block); + tmp_inode = lgfs2_inode_get(&sbd, tmp_bh); + if (tmp_inode == NULL) + return -1; + print_gfs_jindex(tmp_inode); + inode_put(&tmp_inode); + brelse(tmp_bh); + } + else if (block_is_inum_file(block)) { + tmp_bh = bread(&sbd, block); + tmp_inode = lgfs2_inode_get(&sbd, tmp_bh); + if (tmp_inode == NULL) + return -1; + 
print_inum(tmp_inode); + inode_put(&tmp_inode); + brelse(tmp_bh); + } + else if (block_is_statfs_file(block)) { + tmp_bh = bread(&sbd, block); + tmp_inode = lgfs2_inode_get(&sbd, tmp_bh); + if (tmp_inode == NULL) + return -1; + print_statfs(tmp_inode); + inode_put(&tmp_inode); + brelse(tmp_bh); + } + else if (block_is_quota_file(block)) { + tmp_bh = bread(&sbd, block); + tmp_inode = lgfs2_inode_get(&sbd, tmp_bh); + if (tmp_inode == NULL) + return -1; + print_quota(tmp_inode); + inode_put(&tmp_inode); + brelse(tmp_bh); + } + return 0; +} + diff --git a/gfs2/edit/extended.h b/gfs2/edit/extended.h new file mode 100644 index 0000000..cfb67be --- /dev/null +++ b/gfs2/edit/extended.h @@ -0,0 +1,8 @@ +#ifndef __EXTENDED_DOT_H__ +#define __EXTENDED_DOT_H__ + +extern int do_indirect_extended(char *diebuf, struct iinfo *iinf); +extern int display_extended(void); + +#endif + diff --git a/gfs2/edit/gfs2hex.c b/gfs2/edit/gfs2hex.c new file mode 100644 index 0000000..62d5cab --- /dev/null +++ b/gfs2/edit/gfs2hex.c @@ -0,0 +1,571 @@ +#include "clusterautoconfig.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "hexedit.h" + +#define WANT_GFS_CONVERSION_FUNCTIONS +#include + +#include "extended.h" +#include "gfs2hex.h" +#include "libgfs2.h" +#ifdef GFS2_HAS_UUID +#include +#endif + +#define pv(struct, member, fmt, fmt2) do { \ + print_it(" "#member, fmt, fmt2, struct->member); \ + } while (FALSE); +#define pv2(struct, member, fmt, fmt2) do { \ + print_it(" ", fmt, fmt2, struct->member); \ + } while (FALSE); + +struct gfs2_sb sb; +struct gfs2_dinode di; +int line, termlines, modelines[DMODES]; +char edit_fmt[80]; +char estring[1024]; +char efield[64]; +int edit_mode = 0; +int edit_row[DMODES], edit_col[DMODES]; +int edit_size[DMODES], last_entry_onscreen[DMODES]; +enum dsp_mode dmode = HEX_MODE; /* display mode */ +uint64_t block = 0; +int blockhist = 0; +struct iinfo *indirect; +int indirect_blocks; 
+struct gfs2_sbd sbd;
+uint64_t starting_blk;
+struct blkstack_info blockstack[BLOCK_STACK_SIZE];
+int identify = FALSE;
+uint64_t max_block = 0;
+int start_row[DMODES], end_row[DMODES], lines_per_row[DMODES];
+struct gfs_sb *sbd1;
+int gfs2_struct_type;
+unsigned int offset;
+struct indirect_info masterdir;
+struct gfs2_inum gfs1_quota_di;
+int print_entry_ndx;
+struct gfs2_inum gfs1_license_di;
+int screen_chunk_size = 512;
+uint64_t temp_blk;
+int color_scheme = 0;
+int struct_len;
+uint64_t dev_offset = 0;
+int editing = 0;
+int insert = 0;
+const char *termtype;
+WINDOW *wind;
+int dsplines = 0;
+
+const char *block_type_str[15] = {
+	"Clump",
+	"Superblock",
+	"Resource Group Header",
+	"Resource Group Bitmap",
+	"Dinode",
+	"Indirect Block",
+	"Leaf",
+	"Journaled Data",
+	"Log Header",
+	"Log descriptor",
+	"Ext. attrib",
+	"Eattr Data",
+	"Log Buffer",
+	"Metatype 13",
+	"Quota Change",
+};
+
+/* In screen mode (termlines != 0) advance to the next screen line at column
+ * col; otherwise print a newline followed by col spaces. */
+void eol(int col) /* end of line */
+{
+	if (termlines) {
+		line++;
+		move(line, col);
+	} else {
+		printf("\n");
+		for (; col > 0; col--)
+			printf(" ");
+	}
+}
+
+/**
+ * print_gfs2 - printf-style output to the screen or to stdout
+ * @fmt: printf format string, followed by its arguments
+ *
+ * The text is formatted into a PATH_MAX-byte buffer and then written with
+ * printw() when termlines is non-zero, or printf() otherwise. Output that
+ * does not fit in the buffer is truncated rather than overflowing it.
+ */
+void print_gfs2(const char *fmt, ...)
+{
+	va_list args;
+	char string[PATH_MAX];
+
+	memset(string, 0, sizeof(string));
+	va_start(args, fmt);
+	/* bounded: a long filename or format must not smash the stack */
+	vsnprintf(string, sizeof(string), fmt, args);
+	if (termlines)
+		printw("%s", string);
+	else
+		printf("%s", string);
+	va_end(args);
+}
+
+/* Switch the highlight colours on (highlight != 0) or back to normal when the
+ * current screen line is the one selected by edit_row[dmode]. Does nothing
+ * when not in screen mode or when line is at or past termlines. Turning the
+ * highlight on also records print_entry_ndx in last_entry_onscreen[dmode]. */
+static void check_highlight(int highlight)
+{
+	if (!termlines || line >= termlines) /* If printing or out of bounds */
+		return;
+	if (dmode == HEX_MODE) {
+		if (line == (edit_row[dmode] * lines_per_row[dmode]) + 4) {
+			if (highlight) {
+				COLORS_HIGHLIGHT;
+				last_entry_onscreen[dmode] = print_entry_ndx;
+			} else
+				COLORS_NORMAL;
+		}
+	} else {
+		if ((line * lines_per_row[dmode]) - 4 ==
+		    (edit_row[dmode] - start_row[dmode]) * lines_per_row[dmode]) {
+			if (highlight) {
+				COLORS_HIGHLIGHT;
+				last_entry_onscreen[dmode] = print_entry_ndx;
+			}
+			else
+				COLORS_NORMAL;
+		}
+	}
+}
+
+/**
+ * print_it - print one labelled field, optionally in two formats
+ * @label: field label (the editable-field name is taken from label + 2)
+ * @fmt: printf format for the first rendering of the value
+ * @fmt2: printf format for a second rendering of the same value, or NULL to
+ *        print a "(hex)" / "(decimal)" / empty hint derived from @fmt instead
+ * @...: the value; the same argument list is consumed once per format
+ *
+ * Nothing is printed in screen mode once line has reached termlines. When
+ * the line printed is the one being edited, efield, estring, edit_fmt and
+ * edit_size[dmode] are updated to describe it.
+ */
+void print_it(const char *label, const char *fmt, const char *fmt2, ...)
+{
+	va_list args;
+	char tmp_string[NAME_MAX];
+	const char *fmtstring;
+	int decimalsize;
+
+	if (!termlines || line < termlines) {
+		va_start(args, fmt2);
+		check_highlight(TRUE);
+		if (termlines) {
+			move(line,0);
+			printw("%s", label);
+			move(line,24);
+		} else {
+			if (!strcmp(label, " "))
+				printf("%-11s", label);
+			else
+				printf("%-24s", label);
+		}
+		/* bounded: "%s" fields come straight from on-disk data */
+		vsnprintf(tmp_string, sizeof(tmp_string), fmt, args);
+
+		if (termlines)
+			printw("%s", tmp_string);
+		else
+			printf("%s", tmp_string);
+		check_highlight(FALSE);
+
+		if (fmt2) {
+			decimalsize = strlen(tmp_string);
+			/* restart the argument list for the second format */
+			va_end(args);
+			va_start(args, fmt2);
+			vsnprintf(tmp_string, sizeof(tmp_string), fmt2, args);
+			check_highlight(TRUE);
+			if (termlines) {
+				move(line, 50);
+				printw("%s", tmp_string);
+			} else {
+				int i;
+				for (i=20 - decimalsize; i > 0; i--)
+					printf(" ");
+				printf("%s", tmp_string);
+			}
+			check_highlight(FALSE);
+		} else {
+			if (strstr(fmt,"X") || strstr(fmt,"x"))
+				fmtstring="(hex)";
+			else if (strstr(fmt,"s"))
+				fmtstring="";
+			else
+				fmtstring="(decimal)";
+			if (termlines) {
+				move(line, 50);
+				printw("%s", fmtstring);
+			}
+			else
+				printf("%s", fmtstring);
+		}
+		if (termlines) {
+			refresh();
+			if (line == (edit_row[dmode] * lines_per_row[dmode]) + 4) {
+				strncpy(efield, label + 2, 63); /* it's indented */
+				efield[63] = '\0';
+				strcpy(estring, tmp_string);
+				strncpy(edit_fmt, fmt, 79);
+				edit_fmt[79] = '\0';
+				edit_size[dmode] = strlen(estring);
+				COLORS_NORMAL;
+			}
+			last_entry_onscreen[dmode] = (line / lines_per_row[dmode]) - 4;
+		}
+		eol(0);
+		va_end(args);
+	}
+}
+
+/* Decode the on-disk directory entry at ptr. If it has a non-zero inode
+ * address, store it (entry, NUL-terminated name, block) in indir->dirent[d]
+ * and bump indir->dirents.
+ * Returns the entry's de_rec_len, or -1 if de_rec_len is smaller than a
+ * struct gfs2_dirent or larger than 4096 minus that size. */
+static int indirect_dirent(struct indirect_info *indir, char *ptr, int d)
+{
+	struct gfs2_dirent de;
+
+	gfs2_dirent_in(&de, ptr);
+	if (de.de_rec_len < sizeof(struct gfs2_dirent) ||
+	    de.de_rec_len > 4096 - sizeof(struct gfs2_dirent))
+		return -1;
+	if (de.de_inum.no_addr) {
+		indir->block = de.de_inum.no_addr;
+		memcpy(&indir->dirent[d].dirent, &de, sizeof(struct gfs2_dirent));
+		memcpy(&indir->dirent[d].filename,
+		       ptr + sizeof(struct gfs2_dirent), de.de_name_len);
+		indir->dirent[d].filename[de.de_name_len] = '\0';
+		indir->dirent[d].block = de.de_inum.no_addr;
+		indir->is_dir = TRUE;
+		indir->dirents++;
+	}
+	return de.de_rec_len;
+}
+
+void do_dinode_extended(struct gfs2_dinode *dine, struct gfs2_buffer_head *lbh)
+{
+	unsigned int x, y, ptroff = 0;
+	uint64_t p, last;
+	int isdir = !!(S_ISDIR(dine->di_mode)) ||
+		(sbd.gfs1 && dine->__pad1 == GFS_FILE_DIR);
+
+	indirect_blocks = 0;
+	memset(indirect, 0, sizeof(struct iinfo));
+	if (dine->di_height > 0) {
+		/* Indirect pointers */
+		for (x = sizeof(struct gfs2_dinode); x < sbd.bsize;
+		     x += sizeof(uint64_t)) {
+			p = be64_to_cpu(*(uint64_t *)(lbh->b_data + x));
+			if (p) {
+				indirect->ii[indirect_blocks].block = p;
+				indirect->ii[indirect_blocks].mp.mp_list[0] =
+					ptroff;
+				indirect->ii[indirect_blocks].is_dir = FALSE;
+				indirect->ii[indirect_blocks].ptroff =
+					(x - sizeof(*dine)) / sizeof(uint64_t);
+				indirect_blocks++;
+			}
+			ptroff++;
+		}
+	}
+	else if (isdir && !(dine->di_flags & GFS2_DIF_EXHASH)) {
+		int skip = 0;
+
+		/* Directory Entries: */
+		indirect->ii[0].dirents = 0;
+		indirect->ii[0].block =
block; + indirect->ii[0].is_dir = TRUE; + for (x = sizeof(struct gfs2_dinode); x < sbd.bsize; x += skip) { + skip = indirect_dirent(indirect->ii, lbh->b_data + x, + indirect->ii[0].dirents); + if (skip <= 0) + break; + } + } + else if (isdir && + (dine->di_flags & GFS2_DIF_EXHASH) && + dine->di_height == 0) { + /* Leaf Pointers: */ + + last = be64_to_cpu(*(uint64_t *)(lbh->b_data + + sizeof(struct gfs2_dinode))); + + for (x = sizeof(struct gfs2_dinode), y = 0; + y < (1 << dine->di_depth); + x += sizeof(uint64_t), y++) { + p = be64_to_cpu(*(uint64_t *)(lbh->b_data + x)); + + if (p != last || ((y + 1) * sizeof(uint64_t) == dine->di_size)) { + struct gfs2_buffer_head *tmp_bh; + int skip = 0, direntcount = 0; + struct gfs2_leaf leaf; + unsigned int bufoffset; + + if (last >= max_block) + break; + tmp_bh = bread(&sbd, last); + gfs2_leaf_in(&leaf, tmp_bh->b_data); + indirect->ii[indirect_blocks].dirents = 0; + for (direntcount = 0, bufoffset = sizeof(struct gfs2_leaf); + bufoffset < sbd.bsize; + direntcount++, bufoffset += skip) { + skip = indirect_dirent(&indirect->ii[indirect_blocks], + tmp_bh->b_data + bufoffset, + direntcount); + if (skip <= 0) + break; + } + brelse(tmp_bh); + indirect->ii[indirect_blocks].block = last; + indirect_blocks++; + last = p; + } /* if not duplicate pointer */ + } /* for indirect pointers found */ + } /* if exhash */ +}/* do_dinode_extended */ + +/** + * Returns: next leaf block, if any, in a chain of leaf blocks + */ +uint64_t do_leaf_extended(char *dlebuf, struct iinfo *indir) +{ + int x, i; + struct gfs2_dirent de; + + x = 0; + memset(indir, 0, sizeof(*indir)); + gfs2_leaf_in(&indir->ii[0].lf, dlebuf); + /* Directory Entries: */ + for (i = sizeof(struct gfs2_leaf); i < sbd.bsize; + i += de.de_rec_len) { + gfs2_dirent_in(&de, dlebuf + i); + if (de.de_inum.no_addr) { + indir->ii[0].block = de.de_inum.no_addr; + indir->ii[0].dirent[x].block = de.de_inum.no_addr; + memcpy(&indir->ii[0].dirent[x].dirent, + &de, sizeof(struct gfs2_dirent)); + 
memcpy(&indir->ii[0].dirent[x].filename, + dlebuf + i + sizeof(struct gfs2_dirent), + de.de_name_len); + indir->ii[0].dirent[x].filename[de.de_name_len] = '\0'; + indir->ii[0].is_dir = TRUE; + indir->ii[0].dirents++; + x++; + } + if (de.de_rec_len <= sizeof(struct gfs2_dirent)) + break; + } + return indir->ii[0].lf.lf_next; +} + +static void do_eattr_extended(struct gfs2_buffer_head *ebh) +{ + struct gfs2_ea_header ea; + unsigned int x; + + eol(0); + print_gfs2("Eattr Entries:"); + eol(0); + + for (x = sizeof(struct gfs2_meta_header); x < sbd.bsize; + x += ea.ea_rec_len) + { + eol(0); + gfs2_ea_header_in(&ea, ebh->b_data + x); + gfs2_ea_header_print(&ea, ebh->b_data + x + + sizeof(struct gfs2_ea_header)); + } +} + +static void gfs2_inum_print2(const char *title,struct gfs2_inum *no) +{ + if (termlines) { + check_highlight(TRUE); + move(line,2); + printw(title); + check_highlight(FALSE); + } + else + printf(" %s:",title); + pv2(no, no_formal_ino, "%llu", "0x%llx"); + if (!termlines) + printf(" addr:"); + pv2(no, no_addr, "%llu", "0x%llx"); +} + +/** + * gfs2_sb_print2 - Print out a superblock + * @sb: the cpu-order buffer + */ +static void gfs2_sb_print2(struct gfs2_sb *sbp2) +{ + gfs2_meta_header_print(&sbp2->sb_header); + + pv(sbp2, sb_fs_format, "%u", "0x%x"); + pv(sbp2, sb_multihost_format, "%u", "0x%x"); + + if (sbd.gfs1) + pv(sbd1, sb_flags, "%u", "0x%x"); + pv(sbp2, sb_bsize, "%u", "0x%x"); + pv(sbp2, sb_bsize_shift, "%u", "0x%x"); + if (sbd.gfs1) { + pv(sbd1, sb_seg_size, "%u", "0x%x"); + gfs2_inum_print2("jindex ino", &sbd1->sb_jindex_di); + gfs2_inum_print2("rindex ino", &sbd1->sb_rindex_di); + } + else + gfs2_inum_print2("master dir", &sbp2->sb_master_dir); + gfs2_inum_print2("root dir ", &sbp2->sb_root_dir); + + pv(sbp2, sb_lockproto, "%s", NULL); + pv(sbp2, sb_locktable, "%s", NULL); + if (sbd.gfs1) { + gfs2_inum_print2("quota ino ", &gfs1_quota_di); + gfs2_inum_print2("license ", &gfs1_license_di); + } +#ifdef GFS2_HAS_UUID + { + char 
readable_uuid[36+1]; + + uuid_unparse(sbp2->sb_uuid, readable_uuid); + print_it(" sb_uuid", "%s", NULL, readable_uuid); + } +#endif +} + +/** + * gfs1_rgrp_in - read in a gfs1 rgrp + */ +static void gfs1_rgrp_in(struct gfs_rgrp *rgrp, struct gfs2_buffer_head *rbh) +{ + struct gfs_rgrp *str = (struct gfs_rgrp *)rbh->b_data; + + gfs2_meta_header_in(&rgrp->rg_header, rbh->b_data); + rgrp->rg_flags = be32_to_cpu(str->rg_flags); + rgrp->rg_free = be32_to_cpu(str->rg_free); + rgrp->rg_useddi = be32_to_cpu(str->rg_useddi); + rgrp->rg_freedi = be32_to_cpu(str->rg_freedi); + gfs2_inum_in(&rgrp->rg_freedi_list, (char *)&str->rg_freedi_list); + rgrp->rg_usedmeta = be32_to_cpu(str->rg_usedmeta); + rgrp->rg_freemeta = be32_to_cpu(str->rg_freemeta); + memcpy(rgrp->rg_reserved, str->rg_reserved, 64); +} + +/** + * gfs_rgrp_print - Print out a resource group header + */ +static void gfs1_rgrp_print(struct gfs_rgrp *rg) +{ + gfs2_meta_header_print(&rg->rg_header); + pv(rg, rg_flags, "%u", "0x%x"); + pv(rg, rg_free, "%u", "0x%x"); + pv(rg, rg_useddi, "%u", "0x%x"); + pv(rg, rg_freedi, "%u", "0x%x"); + gfs2_inum_print(&rg->rg_freedi_list); + + pv(rg, rg_usedmeta, "%u", "0x%x"); + pv(rg, rg_freemeta, "%u", "0x%x"); +} + +int display_gfs2(struct gfs2_buffer_head *dbh) +{ + struct gfs2_meta_header mh; + struct gfs2_rgrp rg; + struct gfs2_leaf lf; + struct gfs_log_header lh1; + struct gfs2_log_header lh; + struct gfs2_log_descriptor ld; + struct gfs2_quota_change qc; + + uint32_t magic; + + magic = be32_to_cpu(*(uint32_t *)dbh->b_data); + + switch (magic) + { + case GFS2_MAGIC: + gfs2_meta_header_in(&mh, dbh->b_data); + if (mh.mh_type > GFS2_METATYPE_QC) + print_gfs2("Unknown metadata type"); + else + print_gfs2("%s:", block_type_str[mh.mh_type]); + eol(0); + + switch (mh.mh_type) + { + case GFS2_METATYPE_SB: + gfs2_sb_in(&sbd.sd_sb, dbh->b_data); + gfs2_sb_print2(&sbd.sd_sb); + break; + + case GFS2_METATYPE_RG: + if (sbd.gfs1) { + struct gfs_rgrp rg1; + + gfs1_rgrp_in(&rg1, dbh); + 
gfs1_rgrp_print(&rg1); + } else { + gfs2_rgrp_in(&rg, dbh->b_data); + gfs2_rgrp_print(&rg); + } + break; + + case GFS2_METATYPE_RB: + gfs2_meta_header_print(&mh); + break; + + case GFS2_METATYPE_DI: + gfs2_dinode_print(&di); + break; + + case GFS2_METATYPE_IN: + gfs2_meta_header_print(&mh); + break; + + case GFS2_METATYPE_LF: + gfs2_leaf_in(&lf, dbh->b_data); + gfs2_leaf_print(&lf); + break; + + case GFS2_METATYPE_JD: + gfs2_meta_header_print(&mh); + break; + + case GFS2_METATYPE_LH: + if (sbd.gfs1) { + gfs_log_header_in(&lh1, dbh); + gfs_log_header_print(&lh1); + } else { + gfs2_log_header_in(&lh, dbh->b_data); + gfs2_log_header_print(&lh); + } + break; + + case GFS2_METATYPE_LD: + gfs2_log_descriptor_in(&ld, dbh->b_data); + gfs2_log_descriptor_print(&ld); + break; + + case GFS2_METATYPE_EA: + do_eattr_extended(dbh); + break; + + case GFS2_METATYPE_ED: + gfs2_meta_header_print(&mh); + break; + + case GFS2_METATYPE_LB: + gfs2_meta_header_print(&mh); + break; + + case GFS2_METATYPE_QC: + gfs2_quota_change_in(&qc, dbh->b_data); + gfs2_quota_change_print(&qc); + break; + + default: + break; + } + break; + + default: + print_gfs2("Unknown block type"); + eol(0); + break; + }; + return(0); +} diff --git a/gfs2/edit/gfs2hex.h b/gfs2/edit/gfs2hex.h new file mode 100644 index 0000000..c3efb27 --- /dev/null +++ b/gfs2/edit/gfs2hex.h @@ -0,0 +1,14 @@ +#ifndef __GFS2HEX_DOT_H__ +#define __GFS2HEX_DOT_H__ + +#include "hexedit.h" + +extern int display_gfs2(struct gfs2_buffer_head *dbh); +extern int edit_gfs2(void); +extern void do_dinode_extended(struct gfs2_dinode *di, + struct gfs2_buffer_head *lbh); +extern void print_gfs2(const char *fmt, ...) 
__attribute__((format (printf, 1, 2))); +extern uint64_t do_leaf_extended(char *dlebuf, struct iinfo *indir); +extern void eol(int col); + +#endif /* __GFS2HEX_DOT_H__ */ diff --git a/gfs2/edit/hexedit.c b/gfs2/edit/hexedit.c new file mode 100644 index 0000000..f015828 --- /dev/null +++ b/gfs2/edit/hexedit.c @@ -0,0 +1,2722 @@ +#include "clusterautoconfig.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include "copyright.cf" + +#include "hexedit.h" +#include "libgfs2.h" +#include "gfs2hex.h" +#include "extended.h" +#include "journal.h" + +const char *mtypes[] = {"none", "sb", "rg", "rb", "di", "in", "lf", "jd", + "lh", "ld", "ea", "ed", "lb", "13", "qc"}; +const char *allocdesc[2][5] = { + {"Free ", "Data ", "Unlnk", "Meta ", "Resrv"}, + {"Free ", "Data ", "FreeM", "Meta ", "Resrv"},}; + +struct gfs2_buffer_head *bh; +struct gfs2_rgrp *lrgrp; +struct gfs2_meta_header *lmh; +struct gfs2_dinode *ldi; +struct gfs2_leaf *lleaf; +struct gfs2_log_header *llh; +struct gfs2_log_descriptor *lld; +int pgnum; +int details = 0; +long int gziplevel = 9; +static int termcols; +char *device = NULL; +extern uint64_t block; + +/* ------------------------------------------------------------------------- */ +/* erase - clear the screen */ +/* ------------------------------------------------------------------------- */ +static void Erase(void) +{ + bkgd(A_NORMAL|COLOR_PAIR(COLOR_NORMAL)); + /* clear();*/ /* doesn't set background correctly */ + erase(); + /*bkgd(bg);*/ +} + +/* ------------------------------------------------------------------------- */ +/* display_title_lines */ +/* ------------------------------------------------------------------------- */ +static void display_title_lines(void) +{ + Erase(); + COLORS_TITLE; + move(0, 0); + printw("%-80s",TITLE1); + move(termlines, 0); + printw("%-79s",TITLE2); + COLORS_NORMAL; +} + 
+/* ------------------------------------------------------------------------- */ +/* bobgets - get a string */ +/* returns: 1 if user exited by hitting enter */ +/* 0 if user exited by hitting escape */ +/* ------------------------------------------------------------------------- */ +static int bobgets(char string[],int x,int y,int sz,int *ch) +{ + int done,runningy,rc; + + move(x,y); + done=FALSE; + COLORS_INVERSE; + move(x,y); + addstr(string); + move(x,y); + curs_set(2); + refresh(); + runningy=y; + rc=0; + while (!done) { + *ch = getch(); + + if(*ch < 0x0100 && isprint(*ch)) { + char *p=string+strlen(string); // end of the string + + *(p+1)='\0'; + while (insert && p > &string[runningy-y]) { + *p=*(p-1); + p--; + } + string[runningy-y]=*ch; + runningy++; + move(x,y); + addstr(string); + if (runningy-y >= sz) { + rc=1; + *ch = KEY_RIGHT; + done = TRUE; + } + } + else { + // special character, is it one we recognize? + switch(*ch) + { + case(KEY_ENTER): + case('\n'): + case('\r'): + rc=1; + done=TRUE; + string[runningy-y] = '\0'; + break; + case(KEY_CANCEL): + case(0x01B): + rc=0; + done=TRUE; + break; + case(KEY_LEFT): + if (dmode == HEX_MODE) { + done = TRUE; + rc = 1; + } + else + runningy--; + break; + case(KEY_RIGHT): + if (dmode == HEX_MODE) { + done = TRUE; + rc = 1; + } + else + runningy++; + break; + case(KEY_DC): + case(0x07F): + if (runningy>=y) { + char *p; + p = &string[runningy - y]; + while (*p) { + *p = *(p + 1); + p++; + } + *p = '\0'; + runningy--; + // remove the character from the string + move(x,y); + addstr(string); + COLORS_NORMAL; + addstr(" "); + COLORS_INVERSE; + runningy++; + } + break; + case(KEY_BACKSPACE): + if (runningy>y) { + char *p; + + p = &string[runningy - y - 1]; + while (*p) { + *p = *(p + 1); + p++; + } + *p='\0'; + runningy--; + // remove the character from the string + move(x,y); + addstr(string); + COLORS_NORMAL; + addstr(" "); + COLORS_INVERSE; + } + break; + case KEY_DOWN: // Down + rc=0x5000U; + done=TRUE; + break; + 
case KEY_UP: // Up + rc=0x4800U; + done=TRUE; + break; + case 0x014b: + insert=!insert; + move(0,68); + if (insert) + printw("insert "); + else + printw("replace"); + break; + default: + move(0,70); + printw("%08x",*ch); + // ignore all other characters + break; + } // end switch on non-printable character + } // end non-printable character + move(x,runningy); + refresh(); + } // while !done + if (sz>0) + string[sz]='\0'; + COLORS_NORMAL; + return rc; +}/* bobgets */ + +/****************************************************************************** +** instr - instructions +******************************************************************************/ +static void gfs2instr(const char *s1, const char *s2) +{ + COLORS_HIGHLIGHT; + move(line,0); + printw(s1); + COLORS_NORMAL; + move(line,17); + printw(s2); + line++; +} + +/****************************************************************************** +******************************************************************************* +** +** void print_usage() +** +** Description: +** This routine prints out the appropriate commands for this application. 
+** +******************************************************************************* +******************************************************************************/ + +static void print_usage(void) +{ + int ch; + + line = 2; + Erase(); + display_title_lines(); + move(line++,0); + printw("Supported commands: (roughly conforming to the rules of 'less')"); + line++; + move(line++,0); + printw("Navigation:"); + gfs2instr("/","Move up or down one screen full"); + gfs2instr("/","Move up or down one line"); + gfs2instr("/","Move left or right one byte"); + gfs2instr("","Return to the superblock."); + gfs2instr(" f","Forward one 4K block"); + gfs2instr(" b","Backward one 4K block"); + gfs2instr(" g","Goto a given block (number, master, root, rindex, jindex, etc)"); + gfs2instr(" j","Jump to the highlighted 64-bit block number."); + gfs2instr(" ","(You may also arrow up to the block number and hit enter)"); + gfs2instr("","Return to a previous block (a block stack is kept)"); + gfs2instr("","Jump forward to block before backspace (opposite of backspace)"); + line++; + move(line++, 0); + printw("Other commands:"); + gfs2instr(" h","This Help display"); + gfs2instr(" c","Toggle the color scheme"); + gfs2instr(" m","Switch display mode: hex -> GFS2 structure -> Extended"); + gfs2instr(" q","Quit (same as hitting key)"); + gfs2instr("","Edit a value (enter to save, esc to discard)"); + gfs2instr(" ","(Currently only works on the hex display)"); + gfs2instr("","Quit the program"); + line++; + move(line++, 0); + printw("Notes: Areas shown in red are outside the bounds of the struct/file."); + move(line++, 0); + printw(" Areas shown in blue are file contents."); + move(line++, 0); + printw(" Characters shown in green are selected for edit on ."); + move(line++, 0); + move(line++, 0); + printw("Press any key to return."); + refresh(); + while ((ch=getch()) == 0); // wait for input + Erase(); +} + +/* ------------------------------------------------------------------------ */ +/* 
get_block_type */ +/* returns: metatype if block is a GFS2 structure block type */ +/* 0 if block is not a GFS2 structure */ +/* ------------------------------------------------------------------------ */ +uint32_t get_block_type(const struct gfs2_buffer_head *lbh, int *structlen) +{ + uint32_t ty = lgfs2_get_block_type(lbh); + + if (ty != 0 && structlen != NULL) { + unsigned ver = sbd.gfs1 ? LGFS2_MD_GFS1 : LGFS2_MD_GFS2; + const struct lgfs2_metadata *mtype = lgfs2_find_mtype(ty, ver); + if (mtype != NULL) + *structlen = mtype->size; + else + *structlen = sbd.bsize; + } + return ty; +} + +/* ------------------------------------------------------------------------ */ +/* display_block_type */ +/* returns: metatype if block is a GFS2 structure block type */ +/* 0 if block is not a GFS2 structure */ +/* ------------------------------------------------------------------------ */ +int display_block_type(struct gfs2_buffer_head *dbh, int from_restore) +{ + const struct gfs2_meta_header *mh; + int ret_type = 0; /* return type */ + + /* first, print out the kind of GFS2 block this is */ + if (termlines) { + line = 1; + move(line, 0); + } + print_gfs2("Block #"); + if (termlines) { + if (edit_row[dmode] == -1) + COLORS_HIGHLIGHT; + } + if (block == RGLIST_DUMMY_BLOCK) + print_gfs2("RG List "); + else if (block == JOURNALS_DUMMY_BLOCK) + print_gfs2("Journal Status: "); + else + print_gfs2("%"PRIu64" (0x%"PRIx64")", dbh->b_blocknr, dbh->b_blocknr); + if (termlines) { + if (edit_row[dmode] == -1) + COLORS_NORMAL; + } + print_gfs2(" "); + if (!from_restore) + print_gfs2("of %"PRIu64" (0x%"PRIx64") ", max_block, max_block); + if (block == RGLIST_DUMMY_BLOCK) { + ret_type = GFS2_METATYPE_RG; + struct_len = sbd.gfs1 ? 
sizeof(struct gfs_rgrp) : + sizeof(struct gfs2_rgrp); + } else if (block == JOURNALS_DUMMY_BLOCK) { + ret_type = GFS2_METATYPE_DI; + struct_len = 0; + } else { + ret_type = get_block_type(dbh, &struct_len); + switch (ret_type) { + case GFS2_METATYPE_SB: /* 1 */ + print_gfs2("(superblock)"); + break; + case GFS2_METATYPE_RG: /* 2 */ + print_gfs2("(rsrc grp hdr)"); + break; + case GFS2_METATYPE_RB: /* 3 */ + print_gfs2("(rsrc grp bitblk)"); + break; + case GFS2_METATYPE_DI: /* 4 */ + print_gfs2("(disk inode)"); + break; + case GFS2_METATYPE_IN: /* 5 */ + print_gfs2("(indir blklist)"); + break; + case GFS2_METATYPE_LF: /* 6 */ + print_gfs2("(directory leaf)"); + break; + case GFS2_METATYPE_JD: + print_gfs2("(journal data)"); + break; + case GFS2_METATYPE_LH: + print_gfs2("(log header)"); + break; + case GFS2_METATYPE_LD: + print_gfs2("(log descriptor)"); + break; + case GFS2_METATYPE_EA: + print_gfs2("(extended attr hdr)"); + break; + case GFS2_METATYPE_ED: + print_gfs2("(extended attr data)"); + break; + case GFS2_METATYPE_LB: + print_gfs2("(log buffer)"); + break; + case GFS2_METATYPE_QC: + print_gfs2("(quota change)"); + break; + case 0: + struct_len = sbd.bsize; + break; + default: + print_gfs2("(wtf?)"); + break; + } + } + mh = dbh->iov.iov_base; + eol(0); + if (from_restore) + return ret_type; + if (termlines && dmode == HEX_MODE) { + int type; + struct rgrp_tree *rgd; + + rgd = gfs2_blk2rgrpd(&sbd, block); + if (rgd) { + gfs2_rgrp_read(&sbd, rgd); + if ((be32_to_cpu(mh->mh_type) == GFS2_METATYPE_RG) || + (be32_to_cpu(mh->mh_type) == GFS2_METATYPE_RB)) + type = 4; + else { + type = lgfs2_get_bitmap(&sbd, block, rgd); + } + } else + type = 4; + screen_chunk_size = ((termlines - 4) * 16) >> 8 << 8; + if (!screen_chunk_size) + screen_chunk_size = 256; + pgnum = (offset / screen_chunk_size); + if (type >= 0) { + print_gfs2("(p.%d of %d--%s)", pgnum + 1, + (sbd.bsize % screen_chunk_size) > 0 ? 
+ sbd.bsize / screen_chunk_size + 1 : sbd.bsize / + screen_chunk_size, allocdesc[sbd.gfs1][type]); + } + /*eol(9);*/ + if ((be32_to_cpu(mh->mh_type) == GFS2_METATYPE_RG)) { + int ptroffset = edit_row[dmode] * 16 + edit_col[dmode]; + + if (rgd && (ptroffset >= struct_len || pgnum)) { + int blknum, b, btype; + + blknum = pgnum * screen_chunk_size; + blknum += (ptroffset - struct_len); + blknum *= 4; + blknum += rgd->ri.ri_data0; + + print_gfs2(" blk "); + for (b = blknum; b < blknum + 4; b++) { + btype = lgfs2_get_bitmap(&sbd, b, rgd); + if (btype >= 0) { + print_gfs2("0x%x-%s ", b, + allocdesc[sbd.gfs1][btype]); + } + } + } + } else if ((be32_to_cpu(mh->mh_type) == GFS2_METATYPE_RB)) { + int ptroffset = edit_row[dmode] * 16 + edit_col[dmode]; + + if (rgd && (ptroffset >= struct_len || pgnum)) { + int blknum, b, btype, rb_number; + + rb_number = block - rgd->ri.ri_addr; + blknum = 0; + /* count the number of bytes representing + blocks prior to the displayed screen. */ + for (b = 0; b < rb_number; b++) { + struct_len = (b ? 
+ sizeof(struct gfs2_meta_header) : + sizeof(struct gfs2_rgrp)); + blknum += (sbd.bsize - struct_len); + } + struct_len = sizeof(struct gfs2_meta_header); + /* add the number of bytes on this screen */ + blknum += (ptroffset - struct_len); + /* factor in the page number */ + blknum += pgnum * screen_chunk_size; + /* convert bytes to blocks */ + blknum *= GFS2_NBBY; + /* add the starting offset for this rgrp */ + blknum += rgd->ri.ri_data0; + print_gfs2(" blk "); + for (b = blknum; b < blknum + 4; b++) { + btype = lgfs2_get_bitmap(&sbd, b, rgd); + if (btype >= 0) { + print_gfs2("0x%x-%s ", b, + allocdesc[sbd.gfs1][btype]); + } + } + } + } + if (rgd) + gfs2_rgrp_relse(rgd); + } + if (block == sbd.sd_sb.sb_root_dir.no_addr) + print_gfs2("--------------- Root directory ------------------"); + else if (!sbd.gfs1 && block == sbd.sd_sb.sb_master_dir.no_addr) + print_gfs2("-------------- Master directory -----------------"); + else if (!sbd.gfs1 && block == RGLIST_DUMMY_BLOCK) + print_gfs2("------------------ RG List ----------------------"); + else if (!sbd.gfs1 && block == JOURNALS_DUMMY_BLOCK) + print_gfs2("-------------------- Journal List --------------------"); + else { + if (sbd.gfs1) { + if (block == sbd1->sb_rindex_di.no_addr) + print_gfs2("---------------- rindex file -------------------"); + else if (block == gfs1_quota_di.no_addr) + print_gfs2("---------------- Quota file --------------------"); + else if (block == sbd1->sb_jindex_di.no_addr) + print_gfs2("--------------- Journal Index ------------------"); + else if (block == gfs1_license_di.no_addr) + print_gfs2("--------------- License file -------------------"); + } + else { + int d; + + for (d = 2; d < 8; d++) { + if (block == masterdir.dirent[d].block) { + if (!strncmp(masterdir.dirent[d].filename, "jindex", 6)) + print_gfs2("--------------- Journal Index ------------------"); + else if (!strncmp(masterdir.dirent[d].filename, "per_node", 8)) + print_gfs2("--------------- Per-node Dir 
-------------------"); + else if (!strncmp(masterdir.dirent[d].filename, "inum", 4)) + print_gfs2("---------------- Inum file ---------------------"); + else if (!strncmp(masterdir.dirent[d].filename, "statfs", 6)) + print_gfs2("---------------- statfs file -------------------"); + else if (!strncmp(masterdir.dirent[d].filename, "rindex", 6)) + print_gfs2("---------------- rindex file -------------------"); + else if (!strncmp(masterdir.dirent[d].filename, "quota", 5)) + print_gfs2("---------------- Quota file --------------------"); + } + } + } + } + eol(0); + return ret_type; +} + +static const struct lgfs2_metadata *find_mtype(uint32_t mtype, const unsigned versions) +{ + const struct lgfs2_metadata *m = lgfs2_metadata; + unsigned n = 0; + + do { + if ((m[n].versions & versions) && m[n].mh_type == mtype) + return &m[n]; + n++; + } while (n < lgfs2_metadata_size); + + return NULL; +} + +static int get_pnum(int ptroffset) +{ + int pnum; + + pnum = pgnum * screen_chunk_size; + pnum += (ptroffset - struct_len); + pnum /= sizeof(uint64_t); + + return pnum; +} + +/* ------------------------------------------------------------------------ */ +/* hexdump - hex dump the filesystem block to the screen */ +/* ------------------------------------------------------------------------ */ +static int hexdump(uint64_t startaddr, int len, int trunc_zeros, + uint64_t flagref, uint64_t ref_blk) +{ + const unsigned char *pointer, *ptr2; + int i; + uint64_t l; + const char *lpBuffer = bh->b_data; + const char *zeros_strt = lpBuffer + sbd.bsize; + int print_field, cursor_line; + const uint32_t block_type = get_block_type(bh, NULL); + uint64_t *ref; + int ptroffset = 0; + + strcpy(edit_fmt,"%02x"); + pointer = (unsigned char *)lpBuffer + offset; + ptr2 = (unsigned char *)lpBuffer + offset; + ref = (uint64_t *)lpBuffer + offset; + if (trunc_zeros) { + while (zeros_strt > lpBuffer && (*(zeros_strt - 1) == 0)) + zeros_strt--; + } + l = offset; + print_entry_ndx = 0; + while (((termlines 
&& line < termlines && + line <= ((screen_chunk_size / 16) + 2)) || + (!termlines && l < len)) && l < sbd.bsize) { + int ptr_not_null = 0; + + if (termlines) { + move(line, 0); + COLORS_OFFSETS; /* cyan for offsets */ + } + if (startaddr < 0xffffffff) + print_gfs2("%.8"PRIx64, startaddr + l); + else + print_gfs2("%.16"PRIx64, startaddr + l); + if (termlines) { + if (l < struct_len) + COLORS_NORMAL; /* normal part of structure */ + else if (gfs2_struct_type == GFS2_METATYPE_DI && + l < struct_len + di.di_size) + COLORS_CONTENTS; /* after struct but not eof */ + else + COLORS_SPECIAL; /* beyond end of the struct */ + } + print_field = -1; + cursor_line = 0; + for (i = 0; i < 16; i++) { /* first print it in hex */ + /* Figure out if we have a null pointer--for colors */ + if (((gfs2_struct_type == GFS2_METATYPE_IN) || + (gfs2_struct_type == GFS2_METATYPE_DI && + l < struct_len + di.di_size && + (di.di_height > 0 || !S_ISREG(di.di_mode)))) && + (i==0 || i==8)) { + int j; + + ptr_not_null = 0; + for (j = 0; j < 8; j++) { + if (*(pointer + j)) { + ptr_not_null = 1; + break; + } + } + } + if (termlines) { + if (l + i < struct_len) + COLORS_NORMAL; /* in the structure */ + else if (gfs2_struct_type == GFS2_METATYPE_DI + && l + i < struct_len + di.di_size) { + if ((!di.di_height && + S_ISREG(di.di_mode)) || + !ptr_not_null) + COLORS_CONTENTS;/*stuff data */ + else + COLORS_SPECIAL;/* non-null */ + } + else if (gfs2_struct_type == GFS2_METATYPE_IN){ + if (ptr_not_null) + COLORS_SPECIAL;/* non-null */ + else + COLORS_CONTENTS;/* null */ + } else + COLORS_SPECIAL; /* past the struct */ + } + if (i%4 == 0) + print_gfs2(" "); + if (termlines && line == edit_row[dmode] + 3 && + i == edit_col[dmode]) { + COLORS_HIGHLIGHT; /* in the structure */ + memset(estring,0,3); + sprintf(estring,"%02x",*pointer); + cursor_line = 1; + print_field = (char *)pointer - bh->b_data; + } + print_gfs2("%02x",*pointer); + if (termlines && line == edit_row[dmode] + 3 && + i == edit_col[dmode]) { + if 
(l < struct_len + offset) + COLORS_NORMAL; /* in the structure */ + else + COLORS_SPECIAL; /* beyond structure */ + } + pointer++; + } + print_gfs2(" ["); + for (i=0; i<16; i++) { /* now print it in character format */ + if ((*ptr2 >=' ') && (*ptr2 <= '~')) + print_gfs2("%c",*ptr2); + else + print_gfs2("."); + ptr2++; + } + print_gfs2("] "); + if (print_field >= 0) { + const struct lgfs2_metadata *m = find_mtype(block_type, + sbd.gfs1 ? LGFS2_MD_GFS1 : LGFS2_MD_GFS2); + if (m) { + const struct lgfs2_metafield *f; + unsigned n; + for (n = 0; n < m->nfields; n++) { + f = &m->fields[n]; + if (print_field >= f->offset && + print_field < (f->offset + f->length)) { + print_gfs2("%s", m->fields[n].name); + break; + } + } + } + + } + if (cursor_line) { + if (block_type == GFS2_METATYPE_IN || + block_type == GFS2_METATYPE_LD || + ((block_type == GFS2_METATYPE_DI) && + ((struct gfs2_dinode*)bh->b_data)->di_height) || + S_ISDIR(di.di_mode)) { + ptroffset = edit_row[dmode] * 16 + + edit_col[dmode]; + + if (ptroffset >= struct_len || pgnum) { + int pnum = get_pnum(ptroffset); + if (block_type == GFS2_METATYPE_LD) + print_gfs2("*"); + print_gfs2("pointer 0x%x", pnum); + } + } + } + if (line - 3 > last_entry_onscreen[dmode]) + last_entry_onscreen[dmode] = line - 3; + if (flagref && be64_to_cpu(*ref) == flagref) + print_gfs2("<------------------------- ref in 0x%"PRIx64" " + "to 0x%"PRIx64, ref_blk, flagref); + ref++; + if (flagref && be64_to_cpu(*ref) == flagref) + print_gfs2("<------------------------- ref in 0x%"PRIx64" " + "to 0x%"PRIx64, ref_blk, flagref); + ref++; + eol(0); + l += 16; + print_entry_ndx++; + /* This should only happen if trunc_zeros is specified: */ + if ((const char *)pointer >= zeros_strt) + break; + } /* while */ + if (block_type == GFS2_METATYPE_LD && ptroffset >= struct_len) { + COLORS_NORMAL; + eol(0); + print_gfs2(" * 'j' will jump to the journaled block, " + "not the absolute block."); + eol(0); + } + if (sbd.gfs1) { + COLORS_NORMAL; + print_gfs2(" 
*** This seems to be a GFS-1 file system ***"); + eol(0); + } + return (offset+len); +}/* hexdump */ + +/* ------------------------------------------------------------------------ */ +/* masterblock - find a file (by name) in the master directory and return */ +/* its block number. */ +/* ------------------------------------------------------------------------ */ +uint64_t masterblock(const char *fn) +{ + int d; + + for (d = 2; d < 8; d++) + if (!strncmp(masterdir.dirent[d].filename, fn, strlen(fn))) + return (masterdir.dirent[d].block); + return 0; +} + +/* ------------------------------------------------------------------------ */ +/* rgcount - return how many rgrps there are. */ +/* ------------------------------------------------------------------------ */ +static void rgcount(void) +{ + printf("%lld RGs in this file system.\n", + (unsigned long long)sbd.md.riinode->i_di.di_size / + sizeof(struct gfs2_rindex)); + inode_put(&sbd.md.riinode); + gfs2_rgrp_free(&sbd.rgtree); + exit(EXIT_SUCCESS); +} + +/* ------------------------------------------------------------------------ */ +/* find_rgrp_block - locate the block for a given rgrp number */ +/* ------------------------------------------------------------------------ */ +static uint64_t find_rgrp_block(struct gfs2_inode *dif, int rg) +{ + int amt; + struct gfs2_rindex fbuf, ri; + uint64_t foffset, gfs1_adj = 0; + + foffset = rg * sizeof(struct gfs2_rindex); + if (sbd.gfs1) { + uint64_t sd_jbsize = + (sbd.bsize - sizeof(struct gfs2_meta_header)); + + gfs1_adj = (foffset / sd_jbsize) * + sizeof(struct gfs2_meta_header); + gfs1_adj += sizeof(struct gfs2_meta_header); + } + amt = gfs2_readi(dif, (void *)&fbuf, foffset + gfs1_adj, + sizeof(struct gfs2_rindex)); + if (!amt) /* end of file */ + return 0; + gfs2_rindex_in(&ri, (void *)&fbuf); + return ri.ri_addr; +} + +/* ------------------------------------------------------------------------ */ +/* gfs_rgrp_print - print a gfs1 resource group */ +/* 
------------------------------------------------------------------------ */ +void gfs_rgrp_print(struct gfs_rgrp *rg) +{ + gfs2_meta_header_print(&rg->rg_header); + pv(rg, rg_flags, "%u", "0x%x"); + pv(rg, rg_free, "%u", "0x%x"); + pv(rg, rg_useddi, "%u", "0x%x"); + pv(rg, rg_freedi, "%u", "0x%x"); + gfs2_inum_print(&rg->rg_freedi_list); + pv(rg, rg_usedmeta, "%u", "0x%x"); + pv(rg, rg_freemeta, "%u", "0x%x"); +} + +/* ------------------------------------------------------------------------ */ +/* get_rg_addr */ +/* ------------------------------------------------------------------------ */ +static uint64_t get_rg_addr(int rgnum) +{ + uint64_t rgblk = 0, gblock; + struct gfs2_inode *riinode; + + if (sbd.gfs1) + gblock = sbd1->sb_rindex_di.no_addr; + else + gblock = masterblock("rindex"); + riinode = lgfs2_inode_read(&sbd, gblock); + if (riinode == NULL) + return 0; + if (rgnum < riinode->i_di.di_size / sizeof(struct gfs2_rindex)) + rgblk = find_rgrp_block(riinode, rgnum); + else + fprintf(stderr, "Error: File system only has %lld RGs.\n", + (unsigned long long)riinode->i_di.di_size / + sizeof(struct gfs2_rindex)); + inode_put(&riinode); + return rgblk; +} + +/* ------------------------------------------------------------------------ */ +/* set_rgrp_flags - Set an rgrp's flags to a given value */ +/* rgnum: which rg to print or modify flags for (0 - X) */ +/* new_flags: value to set new rg_flags to (if modify == TRUE) */ +/* modify: TRUE if the value is to be modified, FALSE if it's to be printed */ +/* full: TRUE if the full RG should be printed. 
*/ +/* ------------------------------------------------------------------------ */ +static void set_rgrp_flags(int rgnum, uint32_t new_flags, int modify, int full) +{ + union { + struct gfs2_rgrp rg2; + struct gfs_rgrp rg1; + } rg; + struct gfs2_buffer_head *rbh; + uint64_t rgblk; + + rgblk = get_rg_addr(rgnum); + rbh = bread(&sbd, rgblk); + if (sbd.gfs1) + gfs_rgrp_in(&rg.rg1, rbh); + else + gfs2_rgrp_in(&rg.rg2, rbh->b_data); + if (modify) { + printf("RG #%d (block %llu / 0x%llx) rg_flags changed from 0x%08x to 0x%08x\n", + rgnum, (unsigned long long)rgblk, + (unsigned long long)rgblk, rg.rg2.rg_flags, new_flags); + rg.rg2.rg_flags = new_flags; + if (sbd.gfs1) + gfs_rgrp_out(&rg.rg1, rbh); + else + gfs2_rgrp_out(&rg.rg2, rbh->b_data); + bmodified(rbh); + brelse(rbh); + } else { + if (full) { + print_gfs2("RG #%d", rgnum); + print_gfs2(" located at: %"PRIu64" (0x%"PRIx64")", rgblk, rgblk); + eol(0); + if (sbd.gfs1) + gfs_rgrp_print(&rg.rg1); + else + gfs2_rgrp_print(&rg.rg2); + } + else + printf("RG #%d (block %llu / 0x%llx) rg_flags = 0x%08x\n", + rgnum, (unsigned long long)rgblk, + (unsigned long long)rgblk, rg.rg2.rg_flags); + brelse(rbh); + } + if (modify) + fsync(sbd.device_fd); +} + +/* ------------------------------------------------------------------------ */ +/* has_indirect_blocks */ +/* ------------------------------------------------------------------------ */ +int has_indirect_blocks(void) +{ + if (indirect_blocks || gfs2_struct_type == GFS2_METATYPE_SB || + gfs2_struct_type == GFS2_METATYPE_LF || + (gfs2_struct_type == GFS2_METATYPE_DI && + (S_ISDIR(di.di_mode) || (sbd.gfs1 && di.__pad1 == GFS_FILE_DIR)))) + return TRUE; + return FALSE; +} + +int block_is_rindex(uint64_t blk) +{ + if ((sbd.gfs1 && blk == sbd1->sb_rindex_di.no_addr) || + (blk == masterblock("rindex"))) + return TRUE; + return FALSE; +} + +int block_is_jindex(uint64_t blk) +{ + if ((sbd.gfs1 && blk == sbd1->sb_jindex_di.no_addr)) + return TRUE; + return FALSE; +} + +int 
block_is_inum_file(uint64_t blk) +{ + if (!sbd.gfs1 && blk == masterblock("inum")) + return TRUE; + return FALSE; +} + +int block_is_statfs_file(uint64_t blk) +{ + if (sbd.gfs1 && blk == gfs1_license_di.no_addr) + return TRUE; + if (!sbd.gfs1 && blk == masterblock("statfs")) + return TRUE; + return FALSE; +} + +int block_is_quota_file(uint64_t blk) +{ + if (sbd.gfs1 && blk == gfs1_quota_di.no_addr) + return TRUE; + if (!sbd.gfs1 && blk == masterblock("quota")) + return TRUE; + return FALSE; +} + +int block_is_per_node(uint64_t blk) +{ + if (!sbd.gfs1 && blk == masterblock("per_node")) + return TRUE; + return FALSE; +} + +/* ------------------------------------------------------------------------ */ +/* block_has_extended_info */ +/* ------------------------------------------------------------------------ */ +static int block_has_extended_info(void) +{ + if (has_indirect_blocks() || + block_is_rindex(block) || + block_is_rgtree(block) || + block_is_journals(block) || + block_is_jindex(block) || + block_is_inum_file(block) || + block_is_statfs_file(block) || + block_is_quota_file(block)) + return TRUE; + return FALSE; +} + +static void read_superblock(int fd) +{ + sbd1 = (struct gfs_sb *)&sbd.sd_sb; + ioctl(fd, BLKFLSBUF, 0); + memset(&sbd, 0, sizeof(struct gfs2_sbd)); + sbd.bsize = GFS2_DEFAULT_BSIZE; + sbd.device_fd = fd; + bh = bread(&sbd, 0x10); + sbd.jsize = GFS2_DEFAULT_JSIZE; + sbd.rgsize = GFS2_DEFAULT_RGSIZE; + sbd.qcsize = GFS2_DEFAULT_QCSIZE; + sbd.time = time(NULL); + sbd.rgtree.osi_node = NULL; + gfs2_sb_in(&sbd.sd_sb, bh->b_data); + /* Check to see if this is really gfs1 */ + if (sbd1->sb_fs_format == GFS_FORMAT_FS && + sbd1->sb_header.mh_type == GFS_METATYPE_SB && + sbd1->sb_header.mh_format == GFS_FORMAT_SB && + sbd1->sb_multihost_format == GFS_FORMAT_MULTI) { + struct gfs_sb *sbbuf = (struct gfs_sb *)bh->b_data; + + sbd.gfs1 = TRUE; + sbd1->sb_flags = be32_to_cpu(sbbuf->sb_flags); + sbd1->sb_seg_size = be32_to_cpu(sbbuf->sb_seg_size); + 
gfs2_inum_in(&sbd1->sb_rindex_di, (void *)&sbbuf->sb_rindex_di); + gfs2_inum_in(&gfs1_quota_di, (void *)&sbbuf->sb_quota_di); + gfs2_inum_in(&gfs1_license_di, (void *)&sbbuf->sb_license_di); + } + else + sbd.gfs1 = FALSE; + sbd.bsize = sbd.sd_sb.sb_bsize; + if (!sbd.bsize) + sbd.bsize = GFS2_DEFAULT_BSIZE; + if (lgfs2_get_dev_info(fd, &sbd.dinfo)) { + perror(device); + exit(-1); + } + if(compute_constants(&sbd)) { + fprintf(stderr, "Failed to compute constants.\n"); + exit(-1); + } + if (sbd.gfs1 || (sbd.sd_sb.sb_header.mh_magic == GFS2_MAGIC && + sbd.sd_sb.sb_header.mh_type == GFS2_METATYPE_SB)) + block = 0x10 * (GFS2_DEFAULT_BSIZE / sbd.bsize); + else { + block = starting_blk = 0; + } + fix_device_geometry(&sbd); + if(sbd.gfs1) { + sbd.sd_inptrs = (sbd.bsize - sizeof(struct gfs_indirect)) / + sizeof(uint64_t); + sbd.sd_diptrs = (sbd.bsize - sizeof(struct gfs_dinode)) / + sizeof(uint64_t); + sbd.md.riinode = lgfs2_inode_read(&sbd, sbd1->sb_rindex_di.no_addr); + } else { + sbd.sd_inptrs = (sbd.bsize - sizeof(struct gfs2_meta_header)) / + sizeof(uint64_t); + sbd.sd_diptrs = (sbd.bsize - sizeof(struct gfs2_dinode)) / + sizeof(uint64_t); + sbd.master_dir = lgfs2_inode_read(&sbd, + sbd.sd_sb.sb_master_dir.no_addr); + if (sbd.master_dir == NULL) { + sbd.md.riinode = NULL; + } else { + gfs2_lookupi(sbd.master_dir, "rindex", 6, &sbd.md.riinode); + } + } + brelse(bh); + bh = NULL; +} + +static int read_rindex(void) +{ + struct gfs2_rindex *ri; + uint64_t count; + int sane; + + sbd.fssize = sbd.device.length; + if (sbd.md.riinode) /* If we found the rindex */ + rindex_read(&sbd, 0, &count, &sane); + + if (!OSI_EMPTY_ROOT(&sbd.rgtree)) { + ri = &((struct rgrp_tree *)osi_last(&sbd.rgtree))->ri; + sbd.fssize = ri->ri_data0 + ri->ri_data; + } + return 0; +} + +static int read_master_dir(void) +{ + ioctl(sbd.device_fd, BLKFLSBUF, 0); + + bh = bread(&sbd, sbd.sd_sb.sb_master_dir.no_addr); + if (bh == NULL) + return 1; + gfs2_dinode_in(&di, bh->b_data); + do_dinode_extended(&di, 
bh); /* get extended data, if any */ + memcpy(&masterdir, &indirect[0], sizeof(struct indirect_info)); + return 0; +} + +int display(int identify_only, int trunc_zeros, uint64_t flagref, + uint64_t ref_blk) +{ + uint64_t blk; + + if (block == RGLIST_DUMMY_BLOCK) { + if (sbd.gfs1) + blk = sbd1->sb_rindex_di.no_addr; + else + blk = masterblock("rindex"); + } else if (block == JOURNALS_DUMMY_BLOCK) { + if (sbd.gfs1) + blk = sbd1->sb_jindex_di.no_addr; + else + blk = masterblock("jindex"); + } else + blk = block; + if (termlines) { + display_title_lines(); + move(2,0); + } + if (bh == NULL || bh->b_blocknr != blk) { /* If we changed blocks from the last read */ + if (bh != NULL) + brelse(bh); + dev_offset = blk * sbd.bsize; + ioctl(sbd.device_fd, BLKFLSBUF, 0); + if (!(bh = bread(&sbd, blk))) { + fprintf(stderr, "read error: %s from %s:%d: " + "offset %lld (0x%llx)\n", + strerror(errno), __FUNCTION__, __LINE__, + (unsigned long long)dev_offset, + (unsigned long long)dev_offset); + exit(-1); + } + } + line = 1; + gfs2_struct_type = display_block_type(bh, FALSE); + if (identify_only) + return 0; + indirect_blocks = 0; + lines_per_row[dmode] = 1; + if (gfs2_struct_type == GFS2_METATYPE_SB || blk == 0x10 * (4096 / sbd.bsize)) { + gfs2_sb_in(&sbd.sd_sb, bh->b_data); + memset(indirect, 0, sizeof(struct iinfo)); + indirect->ii[0].block = sbd.sd_sb.sb_master_dir.no_addr; + indirect->ii[0].is_dir = TRUE; + indirect->ii[0].dirents = 2; + + memcpy(&indirect->ii[0].dirent[0].filename, "root", 4); + indirect->ii[0].dirent[0].dirent.de_inum.no_formal_ino = + sbd.sd_sb.sb_root_dir.no_formal_ino; + indirect->ii[0].dirent[0].dirent.de_inum.no_addr = + sbd.sd_sb.sb_root_dir.no_addr; + indirect->ii[0].dirent[0].block = sbd.sd_sb.sb_root_dir.no_addr; + indirect->ii[0].dirent[0].dirent.de_type = DT_DIR; + + memcpy(&indirect->ii[0].dirent[1].filename, "master", 7); + indirect->ii[0].dirent[1].dirent.de_inum.no_formal_ino = + sbd.sd_sb.sb_master_dir.no_formal_ino; + 
indirect->ii[0].dirent[1].dirent.de_inum.no_addr = + sbd.sd_sb.sb_master_dir.no_addr; + indirect->ii[0].dirent[1].block = sbd.sd_sb.sb_master_dir.no_addr; + indirect->ii[0].dirent[1].dirent.de_type = DT_DIR; + } + else if (gfs2_struct_type == GFS2_METATYPE_DI) { + gfs2_dinode_in(&di, bh->b_data); + do_dinode_extended(&di, bh); /* get extended data, if any */ + } + else if (gfs2_struct_type == GFS2_METATYPE_IN) { /* indirect block list */ + if (blockhist) { + int i; + + for (i = 0; i < 512; i++) + memcpy(&indirect->ii[i].mp, + &blockstack[blockhist - 1].mp, + sizeof(struct metapath)); + } + indirect_blocks = do_indirect_extended(bh->b_data, indirect); + } + else if (gfs2_struct_type == GFS2_METATYPE_LF) { /* directory leaf */ + do_leaf_extended(bh->b_data, indirect); + } + + last_entry_onscreen[dmode] = 0; + if (dmode == EXTENDED_MODE && !block_has_extended_info()) + dmode = HEX_MODE; + if (termlines) { + move(termlines, 63); + if (dmode==HEX_MODE) + printw("Mode: Hex %s", (editing?"edit ":"view ")); + else + printw("Mode: %s", (dmode==GFS2_MODE?"Structure": + "Pointers ")); + move(line, 0); + } + if (dmode == HEX_MODE) /* if hex display mode */ + hexdump(dev_offset, (gfs2_struct_type == GFS2_METATYPE_DI)? 
+ struct_len + di.di_size:sbd.bsize, trunc_zeros, + flagref, ref_blk); + else if (dmode == GFS2_MODE) { /* if structure display */ + if (block != JOURNALS_DUMMY_BLOCK) + display_gfs2(bh); /* display the gfs2 structure */ + } else + display_extended(); /* display extended blocks */ + /* No else here because display_extended can switch back to hex mode */ + if (termlines) + refresh(); + return(0); +} + +/* ------------------------------------------------------------------------ */ +/* push_block - push a block onto the block stack */ +/* ------------------------------------------------------------------------ */ +static void push_block(uint64_t blk) +{ + int i, bhst; + + bhst = blockhist % BLOCK_STACK_SIZE; + if (blk) { + blockstack[bhst].dmode = dmode; + for (i = 0; i < DMODES; i++) { + blockstack[bhst].start_row[i] = start_row[i]; + blockstack[bhst].end_row[i] = end_row[i]; + blockstack[bhst].edit_row[i] = edit_row[i]; + blockstack[bhst].edit_col[i] = edit_col[i]; + blockstack[bhst].lines_per_row[i] = lines_per_row[i]; + } + blockstack[bhst].gfs2_struct_type = gfs2_struct_type; + if (edit_row[dmode] >= 0 && !block_is_rindex(block)) + memcpy(&blockstack[bhst].mp, + &indirect->ii[edit_row[dmode]].mp, + sizeof(struct metapath)); + blockhist++; + blockstack[blockhist % BLOCK_STACK_SIZE].block = blk; + } +} + +/* ------------------------------------------------------------------------ */ +/* pop_block - pop a block off the block stack */ +/* ------------------------------------------------------------------------ */ +static uint64_t pop_block(void) +{ + int i, bhst; + + if (!blockhist) + return block; + blockhist--; + bhst = blockhist % BLOCK_STACK_SIZE; + dmode = blockstack[bhst].dmode; + for (i = 0; i < DMODES; i++) { + start_row[i] = blockstack[bhst].start_row[i]; + end_row[i] = blockstack[bhst].end_row[i]; + edit_row[i] = blockstack[bhst].edit_row[i]; + edit_col[i] = blockstack[bhst].edit_col[i]; + lines_per_row[i] = blockstack[bhst].lines_per_row[i]; + } + 
gfs2_struct_type = blockstack[bhst].gfs2_struct_type; + return blockstack[bhst].block; +} + +/* ------------------------------------------------------------------------ */ +/* Find next metadata block of a given type AFTER a given point in the fs */ +/* */ +/* This is used to find blocks that aren't represented in the bitmaps, such */ +/* as the RGs and bitmaps or the superblock. */ +/* ------------------------------------------------------------------------ */ +static uint64_t find_metablockoftype_slow(uint64_t startblk, int metatype, int print) +{ + uint64_t blk, last_fs_block; + int found = 0; + struct gfs2_buffer_head *lbh; + + last_fs_block = lseek(sbd.device_fd, 0, SEEK_END) / sbd.bsize; + for (blk = startblk + 1; blk < last_fs_block; blk++) { + lbh = bread(&sbd, blk); + /* Can't use get_block_type here (returns false "none") */ + if (lbh->b_data[0] == 0x01 && lbh->b_data[1] == 0x16 && + lbh->b_data[2] == 0x19 && lbh->b_data[3] == 0x70 && + lbh->b_data[4] == 0x00 && lbh->b_data[5] == 0x00 && + lbh->b_data[6] == 0x00 && lbh->b_data[7] == metatype) { + found = 1; + brelse(lbh); + break; + } + brelse(lbh); + } + if (!found) + blk = 0; + if (print) { + if (dmode == HEX_MODE) + printf("0x%llx\n", (unsigned long long)blk); + else + printf("%llu\n", (unsigned long long)blk); + } + gfs2_rgrp_free(&sbd.rgtree); + if (print) + exit(0); + return blk; +} + +static int find_rg_metatype(struct rgrp_tree *rgd, uint64_t *blk, uint64_t startblk, int mtype) +{ + int found; + unsigned i, j, m; + struct gfs2_buffer_head *bhp = NULL; + uint64_t *ibuf = malloc(sbd.bsize * GFS2_NBBY * sizeof(uint64_t)); + + for (i = 0; i < rgd->ri.ri_length; i++) { + m = lgfs2_bm_scan(rgd, i, ibuf, GFS2_BLKST_DINODE); + + for (j = 0; j < m; j++) { + *blk = ibuf[j]; + bhp = bread(&sbd, *blk); + found = (*blk > startblk) && !gfs2_check_meta(bhp, mtype); + brelse(bhp); + if (found) { + free(ibuf); + return 0; + } + } + } + free(ibuf); + return -1; +} + +/* 
------------------------------------------------------------------------ */ +/* Find next "metadata in use" block AFTER a given point in the fs */ +/* */ +/* This version does its magic by searching the bitmaps of the RG. After */ +/* all, if we're searching for a dinode, we want a real allocated inode, */ +/* not just some block that used to be an inode in a previous incarnation. */ +/* ------------------------------------------------------------------------ */ +static uint64_t find_metablockoftype_rg(uint64_t startblk, int metatype, int print) +{ + struct osi_node *next = NULL; + uint64_t blk, errblk; + int first = 1, found = 0; + struct rgrp_tree *rgd = NULL; + struct gfs2_rindex *ri; + + blk = 0; + /* Skip the rgs prior to the block we've been given */ + for (next = osi_first(&sbd.rgtree); next; next = osi_next(next)) { + rgd = (struct rgrp_tree *)next; + ri = &rgd->ri; + if (first && startblk <= ri->ri_data0) { + startblk = ri->ri_data0; + break; + } else if (ri->ri_addr <= startblk && + startblk < ri->ri_data0 + ri->ri_data) + break; + else + rgd = NULL; + first = 0; + } + if (!rgd) { + if (print) + printf("0\n"); + gfs2_rgrp_free(&sbd.rgtree); + if (print) + exit(-1); + } + for (; !found && next; next = osi_next(next)){ + rgd = (struct rgrp_tree *)next; + errblk = gfs2_rgrp_read(&sbd, rgd); + if (errblk) + continue; + + found = !find_rg_metatype(rgd, &blk, startblk, metatype); + if (found) + break; + + gfs2_rgrp_relse(rgd); + } + + if (!found) + blk = 0; + if (print) { + if (dmode == HEX_MODE) + printf("0x%llx\n", (unsigned long long)blk); + else + printf("%llu\n", (unsigned long long)blk); + } + gfs2_rgrp_free(&sbd.rgtree); + if (print) + exit(0); + return blk; +} + +/* ------------------------------------------------------------------------ */ +/* Find next metadata block AFTER a given point in the fs */ +/* ------------------------------------------------------------------------ */ +static uint64_t find_metablockoftype(const char *strtype, int print) +{ + 
int mtype = 0; + uint64_t startblk, blk = 0; + + if (print) + startblk = blockstack[blockhist % BLOCK_STACK_SIZE].block; + else + startblk = block; + + for (mtype = GFS2_METATYPE_NONE; + mtype <= GFS2_METATYPE_QC; mtype++) + if (!strcasecmp(strtype, mtypes[mtype])) + break; + if (!strcmp(strtype, "dinode")) + mtype = GFS2_METATYPE_DI; + if (mtype >= GFS2_METATYPE_NONE && mtype <= GFS2_METATYPE_RB) + blk = find_metablockoftype_slow(startblk, mtype, print); + else if (mtype >= GFS2_METATYPE_DI && mtype <= GFS2_METATYPE_QC) + blk = find_metablockoftype_rg(startblk, mtype, print); + else if (print) { + fprintf(stderr, "Error: metadata type not " + "specified: must be one of:\n"); + fprintf(stderr, "sb rg rb di in lf jd lh ld" + " ea ed lb 13 qc\n"); + gfs2_rgrp_free(&sbd.rgtree); + exit(-1); + } + return blk; +} + +/* ------------------------------------------------------------------------ */ +/* Check if the word is a keyword such as "sb" or "rindex" */ +/* Returns: block number if it is, else 0 */ +/* ------------------------------------------------------------------------ */ +uint64_t check_keywords(const char *kword) +{ + unsigned long long blk = 0; + + if (!strcmp(kword, "sb") ||!strcmp(kword, "superblock")) + blk = 0x10 * (4096 / sbd.bsize); /* superblock */ + else if (!strcmp(kword, "root") || !strcmp(kword, "rootdir")) + blk = sbd.sd_sb.sb_root_dir.no_addr; + else if (!strcmp(kword, "master")) { + if (sbd.gfs1) + fprintf(stderr, "This is GFS1; there's no master directory.\n"); + else if (!sbd.sd_sb.sb_master_dir.no_addr) { + fprintf(stderr, "GFS2 master directory not found on %s\n", device); + exit(-1); + } else + blk = sbd.sd_sb.sb_master_dir.no_addr; + } + else if (!strcmp(kword, "jindex")) { + if (sbd.gfs1) + blk = sbd1->sb_jindex_di.no_addr; + else + blk = masterblock("jindex"); /* journal index */ + } + else if (!sbd.gfs1 && !strcmp(kword, "per_node")) + blk = masterblock("per_node"); + else if (!sbd.gfs1 && !strcmp(kword, "inum")) + blk = 
masterblock("inum"); + else if (!strcmp(kword, "statfs")) { + if (sbd.gfs1) + blk = gfs1_license_di.no_addr; + else + blk = masterblock("statfs"); + } + else if (!strcmp(kword, "rindex") || !strcmp(kword, "rgindex")) { + if (sbd.gfs1) + blk = sbd1->sb_rindex_di.no_addr; + else + blk = masterblock("rindex"); + } else if (!strcmp(kword, "rgs")) { + blk = RGLIST_DUMMY_BLOCK; + } else if (!strcmp(kword, "quota")) { + if (sbd.gfs1) + blk = gfs1_quota_di.no_addr; + else + blk = masterblock("quota"); + } else if (!strncmp(kword, "rg ", 3)) { + int rgnum = 0; + + rgnum = atoi(kword + 3); + blk = get_rg_addr(rgnum); + } else if (!strncmp(kword, "journals", 8)) { + blk = JOURNALS_DUMMY_BLOCK; + } else if (strlen(kword) > 7 && !strncmp(kword, "journal", 7) && isdigit(kword[7])) { + uint64_t j_size; + + blk = find_journal_block(kword, &j_size); + } else if (kword[0]=='/') /* search */ + blk = find_metablockoftype(&kword[1], 0); + else if (kword[0]=='0' && kword[1]=='x') /* hex addr */ + sscanf(kword, "%llx", &blk);/* retrieve in hex */ + else + sscanf(kword, "%llu", &blk); /* retrieve decimal */ + + return blk; +} + +/* ------------------------------------------------------------------------ */ +/* goto_block - go to a desired block entered by the user */ +/* ------------------------------------------------------------------------ */ +static uint64_t goto_block(void) +{ + char string[256]; + int ch, delta; + + memset(string, 0, sizeof(string)); + sprintf(string,"%lld", (long long)block); + if (bobgets(string, 1, 7, 16, &ch)) { + if (isalnum(string[0]) || string[0] == '/') + temp_blk = check_keywords(string); + else if (string[0] == '+' || string[0] == '-') { + if (string[1] == '0' && string[2] == 'x') + sscanf(string, "%x", &delta); + else + sscanf(string, "%d", &delta); + temp_blk = block + delta; + } + + if (temp_blk == RGLIST_DUMMY_BLOCK || + temp_blk == JOURNALS_DUMMY_BLOCK || temp_blk < max_block) { + offset = 0; + block = temp_blk; + push_block(block); + } + } + return 
block; +} + +/* ------------------------------------------------------------------------ */ +/* init_colors */ +/* ------------------------------------------------------------------------ */ +static void init_colors(void) +{ + + if (color_scheme) { + init_pair(COLOR_TITLE, COLOR_BLACK, COLOR_CYAN); + init_pair(COLOR_NORMAL, COLOR_WHITE, COLOR_BLACK); + init_pair(COLOR_INVERSE, COLOR_BLACK, COLOR_WHITE); + init_pair(COLOR_SPECIAL, COLOR_RED, COLOR_BLACK); + init_pair(COLOR_HIGHLIGHT, COLOR_GREEN, COLOR_BLACK); + init_pair(COLOR_OFFSETS, COLOR_CYAN, COLOR_BLACK); + init_pair(COLOR_CONTENTS, COLOR_YELLOW, COLOR_BLACK); + } + else { + init_pair(COLOR_TITLE, COLOR_BLACK, COLOR_CYAN); + init_pair(COLOR_NORMAL, COLOR_BLACK, COLOR_WHITE); + init_pair(COLOR_INVERSE, COLOR_WHITE, COLOR_BLACK); + init_pair(COLOR_SPECIAL, COLOR_MAGENTA, COLOR_WHITE); + init_pair(COLOR_HIGHLIGHT, COLOR_RED, COLOR_WHITE); /*cursor*/ + init_pair(COLOR_OFFSETS, COLOR_CYAN, COLOR_WHITE); + init_pair(COLOR_CONTENTS, COLOR_BLUE, COLOR_WHITE); + } +} + +/* ------------------------------------------------------------------------ */ +/* hex_edit - Allow the user to edit the page by entering hex digits */ +/* ------------------------------------------------------------------------ */ +static void hex_edit(int *exitch) +{ + int left_off; + int ch; + + left_off = ((block * sbd.bsize) < 0xffffffff) ? 
9 : 17; + /* 8 and 16 char addresses on screen */ + + if (bobgets(estring, edit_row[HEX_MODE] + 3, + (edit_col[HEX_MODE] * 2) + (edit_col[HEX_MODE] / 4) + + left_off, 2, exitch)) { + if (strstr(edit_fmt,"X") || strstr(edit_fmt,"x")) { + int hexoffset; + int i, sl = strlen(estring); + + for (i = 0; i < sl; i+=2) { + hexoffset = (edit_row[HEX_MODE] * 16) + + edit_col[HEX_MODE] + (i / 2); + ch = 0x00; + if (isdigit(estring[i])) + ch = (estring[i] - '0') * 0x10; + else if (estring[i] >= 'a' && + estring[i] <= 'f') + ch = (estring[i]-'a' + 0x0a)*0x10; + else if (estring[i] >= 'A' && + estring[i] <= 'F') + ch = (estring[i] - 'A' + 0x0a) * 0x10; + if (isdigit(estring[i+1])) + ch += (estring[i+1] - '0'); + else if (estring[i+1] >= 'a' && + estring[i+1] <= 'f') + ch += (estring[i+1] - 'a' + 0x0a); + else if (estring[i+1] >= 'A' && + estring[i+1] <= 'F') + ch += (estring[i+1] - 'A' + 0x0a); + bh->b_data[offset + hexoffset] = ch; + } + if (pwrite(sbd.device_fd, bh->b_data, sbd.bsize, dev_offset) != + sbd.bsize) { + fprintf(stderr, "write error: %s from %s:%d: " + "offset %lld (0x%llx)\n", + strerror(errno), + __FUNCTION__, __LINE__, + (unsigned long long)dev_offset, + (unsigned long long)dev_offset); + exit(-1); + } + fsync(sbd.device_fd); + } + } +} + +/* ------------------------------------------------------------------------ */ +/* page up */ +/* ------------------------------------------------------------------------ */ +static void pageup(void) +{ + if (dmode == EXTENDED_MODE) { + if (edit_row[dmode] - (dsplines / lines_per_row[dmode]) > 0) + edit_row[dmode] -= (dsplines / lines_per_row[dmode]); + else + edit_row[dmode] = 0; + if (start_row[dmode] - (dsplines / lines_per_row[dmode]) > 0) + start_row[dmode] -= (dsplines / lines_per_row[dmode]); + else + start_row[dmode] = 0; + } + else { + start_row[dmode] = edit_row[dmode] = 0; + if (dmode == GFS2_MODE || offset==0) { + block--; + if (dmode == HEX_MODE) + offset = (sbd.bsize % screen_chunk_size) > 0 ? 
+ screen_chunk_size * + (sbd.bsize / screen_chunk_size) : + sbd.bsize - screen_chunk_size; + else + offset = 0; + } else + offset -= screen_chunk_size; + } +} + +/* ------------------------------------------------------------------------ */ +/* page down */ +/* ------------------------------------------------------------------------ */ +static void pagedn(void) +{ + if (dmode == EXTENDED_MODE) { + if ((edit_row[dmode] + dsplines) / lines_per_row[dmode] + 1 <= + end_row[dmode]) { + start_row[dmode] += dsplines / lines_per_row[dmode]; + edit_row[dmode] += dsplines / lines_per_row[dmode]; + } else { + edit_row[dmode] = end_row[dmode] - 1; + while (edit_row[dmode] - start_row[dmode] + + 1 > last_entry_onscreen[dmode]) + start_row[dmode]++; + } + } + else { + start_row[dmode] = edit_row[dmode] = 0; + if (dmode == GFS2_MODE || + offset + screen_chunk_size >= sbd.bsize) { + block++; + offset = 0; + } else + offset += screen_chunk_size; + } +} + +/* ------------------------------------------------------------------------ */ +/* jump - jump to the address the cursor is on */ +/* */ +/* If the cursor is in a log descriptor, jump to the log-descriptor version */ +/* of the block instead of the "real" block. 
*/ +/* ------------------------------------------------------------------------ */ +static void jump(void) +{ + if (dmode == HEX_MODE) { + unsigned int col2; + uint64_t *b; + const uint32_t block_type = get_block_type(bh, NULL); + + /* special exception for log descriptors: jump the journaled + version of the block, not the "real" block */ + if (block_type == GFS2_METATYPE_LD) { + int ptroffset = edit_row[dmode] * 16 + edit_col[dmode]; + int pnum = get_pnum(ptroffset); + temp_blk = bh->b_blocknr + pnum + 1; + } else if (edit_row[dmode] >= 0) { + col2 = edit_col[dmode] & 0x08;/* thus 0-7->0, 8-15->8 */ + b = (uint64_t *)&bh->b_data[edit_row[dmode]*16 + + offset + col2]; + temp_blk = be64_to_cpu(*b); + } + } + else + sscanf(estring, "%"SCNx64, &temp_blk);/* retrieve in hex */ + if (temp_blk < max_block) { /* if the block number is valid */ + int i; + + offset = 0; + push_block(temp_blk); + block = temp_blk; + for (i = 0; i < DMODES; i++) { + start_row[i] = end_row[i] = edit_row[i] = 0; + edit_col[i] = 0; + } + } +} + +/* ------------------------------------------------------------------------ */ +/* print block type */ +/* ------------------------------------------------------------------------ */ +static void print_block_type(uint64_t tblock, int type, const char *additional) +{ + if (type <= GFS2_METATYPE_QC) + printf("%d (Block %lld is type %d: %s%s)\n", type, + (unsigned long long)tblock, type, block_type_str[type], + additional); + else + printf("%d (Block %lld is type %d: unknown%s)\n", type, + (unsigned long long)tblock, type, additional); +} + +/* ------------------------------------------------------------------------ */ +/* find_print block type */ +/* ------------------------------------------------------------------------ */ +static void find_print_block_type(void) +{ + uint64_t tblock; + struct gfs2_buffer_head *lbh; + int type; + + tblock = blockstack[blockhist % BLOCK_STACK_SIZE].block; + lbh = bread(&sbd, tblock); + type = get_block_type(lbh, NULL); + 
print_block_type(tblock, type, ""); + brelse(lbh); + gfs2_rgrp_free(&sbd.rgtree); + exit(0); +} + +/* ------------------------------------------------------------------------ */ +/* Find and print the resource group associated with a given block */ +/* ------------------------------------------------------------------------ */ +static void find_print_block_rg(int bitmap) +{ + uint64_t rblock, rgblock; + int i; + struct rgrp_tree *rgd; + + rblock = blockstack[blockhist % BLOCK_STACK_SIZE].block; + if (rblock == LGFS2_SB_ADDR(&sbd)) + printf("0 (the superblock is not in the bitmap)\n"); + else { + rgd = gfs2_blk2rgrpd(&sbd, rblock); + if (rgd) { + rgblock = rgd->ri.ri_addr; + if (bitmap) { + struct gfs2_bitmap *bits = NULL; + + for (i = 0; i < rgd->ri.ri_length; i++) { + bits = &(rgd->bits[i]); + if (rblock - rgd->ri.ri_data0 < + ((bits->bi_start + bits->bi_len) * + GFS2_NBBY)) { + break; + } + } + if (i < rgd->ri.ri_length) + rgblock += i; + + } + if (dmode == HEX_MODE) + printf("0x%llx\n",(unsigned long long)rgblock); + else + printf("%llu\n", (unsigned long long)rgblock); + } else { + printf("-1 (block invalid or part of an rgrp).\n"); + } + } + gfs2_rgrp_free(&sbd.rgtree); + exit(0); +} + +/* ------------------------------------------------------------------------ */ +/* find/change/print block allocation (what the bitmap says about block) */ +/* ------------------------------------------------------------------------ */ +static void find_change_block_alloc(int *newval) +{ + uint64_t ablock; + int type; + struct rgrp_tree *rgd; + + if (newval && + (*newval < GFS2_BLKST_FREE || *newval > GFS2_BLKST_DINODE)) { + int i; + + printf("Error: value %d is not valid.\nValid values are:\n", + *newval); + for (i = GFS2_BLKST_FREE; i <= GFS2_BLKST_DINODE; i++) + printf("%d - %s\n", i, allocdesc[sbd.gfs1][i]); + gfs2_rgrp_free(&sbd.rgtree); + exit(-1); + } + ablock = blockstack[blockhist % BLOCK_STACK_SIZE].block; + if (ablock == LGFS2_SB_ADDR(&sbd)) + printf("3 (the 
superblock is not in the bitmap)\n"); + else { + rgd = gfs2_blk2rgrpd(&sbd, ablock); + if (rgd) { + gfs2_rgrp_read(&sbd, rgd); + if (newval) { + if (gfs2_set_bitmap(rgd, ablock, *newval)) + printf("-1 (block invalid or part of an rgrp).\n"); + else + printf("%d\n", *newval); + } else { + type = lgfs2_get_bitmap(&sbd, ablock, rgd); + if (type < 0) { + printf("-1 (block invalid or part of " + "an rgrp).\n"); + exit(-1); + } + printf("%d (%s)\n", type, allocdesc[sbd.gfs1][type]); + } + gfs2_rgrp_relse(rgd); + } else { + gfs2_rgrp_free(&sbd.rgtree); + printf("-1 (block invalid or part of an rgrp).\n"); + exit(-1); + } + } + gfs2_rgrp_free(&sbd.rgtree); + if (newval) + fsync(sbd.device_fd); + exit(0); +} + +/** + * process request to print a certain field from a previously pushed block + */ +static void process_field(const char *field, const char *nstr) +{ + uint64_t fblock; + struct gfs2_buffer_head *rbh; + int type; + const struct lgfs2_metadata *mtype; + const struct lgfs2_metafield *mfield; + + fblock = blockstack[blockhist % BLOCK_STACK_SIZE].block; + rbh = bread(&sbd, fblock); + type = get_block_type(rbh, NULL); + + mtype = lgfs2_find_mtype(type, sbd.gfs1 ? 
LGFS2_MD_GFS1 : LGFS2_MD_GFS2); + if (mtype == NULL) { + fprintf(stderr, "Metadata type '%d' invalid\n", type); + exit(1); + } + + mfield = lgfs2_find_mfield_name(field, mtype); + if (mfield == NULL) { + fprintf(stderr, "No field '%s' in block type '%s'\n", field, mtype->name); + exit(1); + } + + if (nstr != device) { + int err = 0; + if (mfield->flags & (LGFS2_MFF_UUID|LGFS2_MFF_STRING)) { + err = lgfs2_field_assign(rbh->b_data, mfield, nstr); + } else { + uint64_t val = 0; + err = sscanf(nstr, "%"SCNi64, &val); + if (err == 1) + err = lgfs2_field_assign(rbh->b_data, mfield, &val); + else + err = -1; + } + if (err != 0) { + fprintf(stderr, "Could not set '%s' to '%s': %s\n", field, nstr, + strerror(errno)); + exit(1); + } + bmodified(rbh); + } + + if (!termlines) { + char str[GFS2_LOCKNAME_LEN] = ""; + lgfs2_field_str(str, GFS2_LOCKNAME_LEN, rbh->b_data, mfield, (dmode == HEX_MODE)); + printf("%s\n", str); + } + + brelse(rbh); + fsync(sbd.device_fd); + exit(0); +} + +/* ------------------------------------------------------------------------ */ +/* interactive_mode - accept keystrokes from user and display structures */ +/* ------------------------------------------------------------------------ */ +static void interactive_mode(void) +{ + int ch = 0, Quit; + + if ((wind = initscr()) == NULL) { + fprintf(stderr, "Error: unable to initialize screen."); + eol(0); + exit(-1); + } + getmaxyx(stdscr, termlines, termcols); + termlines--; + /* Do our initial screen stuff: */ + clear(); /* don't use Erase */ + start_color(); + noecho(); + keypad(stdscr, TRUE); + raw(); + curs_set(0); + init_colors(); + /* Accept keystrokes and act on them accordingly */ + Quit = FALSE; + editing = FALSE; + while (!Quit) { + display(FALSE, 0, 0, 0); + if (editing) { + if (edit_row[dmode] == -1) + block = goto_block(); + else { + if (dmode == HEX_MODE) + hex_edit(&ch); + else if (dmode == GFS2_MODE) { + bobgets(estring, edit_row[dmode]+4, 24, + 10, &ch); + process_field(efield, estring); + } 
else + bobgets(estring, edit_row[dmode]+6, 14, + edit_size[dmode], &ch); + } + } + else + while ((ch=getch()) == 0); // wait for input + + switch (ch) + { + /* --------------------------------------------------------- */ + /* escape or 'q' */ + /* --------------------------------------------------------- */ + case 0x1b: + case 0x03: + case 'q': + if (editing) + editing = FALSE; + else + Quit=TRUE; + break; + /* --------------------------------------------------------- */ + /* home - return to the superblock */ + /* --------------------------------------------------------- */ + case KEY_HOME: + if (dmode == EXTENDED_MODE) { + start_row[dmode] = end_row[dmode] = 0; + edit_row[dmode] = 0; + } + else { + block = 0x10 * (4096 / sbd.bsize); + push_block(block); + offset = 0; + } + break; + /* --------------------------------------------------------- */ + /* backspace - return to the previous block on the stack */ + /* --------------------------------------------------------- */ + case KEY_BACKSPACE: + case 0x7f: + block = pop_block(); + offset = 0; + break; + /* --------------------------------------------------------- */ + /* space - go down the block stack (opposite of backspace) */ + /* --------------------------------------------------------- */ + case ' ': + blockhist++; + block = blockstack[blockhist % BLOCK_STACK_SIZE].block; + offset = 0; + break; + /* --------------------------------------------------------- */ + /* arrow up */ + /* --------------------------------------------------------- */ + case KEY_UP: + case '-': + if (dmode == EXTENDED_MODE) { + if (edit_row[dmode] > 0) + edit_row[dmode]--; + if (edit_row[dmode] < start_row[dmode]) + start_row[dmode] = edit_row[dmode]; + } + else { + if (edit_row[dmode] >= 0) + edit_row[dmode]--; + } + break; + /* --------------------------------------------------------- */ + /* arrow down */ + /* --------------------------------------------------------- */ + case KEY_DOWN: + case '+': + if (dmode == EXTENDED_MODE) { + if 
(edit_row[dmode] + 1 < end_row[dmode]) { + if (edit_row[dmode] - start_row[dmode] + + 1 > last_entry_onscreen[dmode]) + start_row[dmode]++; + edit_row[dmode]++; + } + } + else { + if (edit_row[dmode] < last_entry_onscreen[dmode]) + edit_row[dmode]++; + } + break; + /* --------------------------------------------------------- */ + /* arrow left */ + /* --------------------------------------------------------- */ + case KEY_LEFT: + if (dmode == HEX_MODE) { + if (edit_col[dmode] > 0) + edit_col[dmode]--; + else + edit_col[dmode] = 15; + } + break; + /* --------------------------------------------------------- */ + /* arrow right */ + /* --------------------------------------------------------- */ + case KEY_RIGHT: + if (dmode == HEX_MODE) { + if (edit_col[dmode] < 15) + edit_col[dmode]++; + else + edit_col[dmode] = 0; + } + break; + /* --------------------------------------------------------- */ + /* m - change display mode key */ + /* --------------------------------------------------------- */ + case 'm': + dmode = ((dmode + 1) % DMODES); + break; + /* --------------------------------------------------------- */ + /* J - Jump to highlighted block number */ + /* --------------------------------------------------------- */ + case 'j': + jump(); + break; + /* --------------------------------------------------------- */ + /* g - goto block */ + /* --------------------------------------------------------- */ + case 'g': + block = goto_block(); + break; + /* --------------------------------------------------------- */ + /* h - help key */ + /* --------------------------------------------------------- */ + case 'h': + print_usage(); + break; + /* --------------------------------------------------------- */ + /* e - change to extended mode */ + /* --------------------------------------------------------- */ + case 'e': + dmode = EXTENDED_MODE; + break; + /* --------------------------------------------------------- */ + /* b - Back one 4K block */ + /* 
--------------------------------------------------------- */ + case 'b': + start_row[dmode] = end_row[dmode] = edit_row[dmode] = 0; + if (block > 0) + block--; + offset = 0; + break; + /* --------------------------------------------------------- */ + /* c - Change color scheme */ + /* --------------------------------------------------------- */ + case 'c': + color_scheme = !color_scheme; + init_colors(); + break; + /* --------------------------------------------------------- */ + /* page up key */ + /* --------------------------------------------------------- */ + case 0x19: // ctrl-y for vt100 + case KEY_PPAGE: // PgUp + case 0x15: // ctrl-u for vi compat. + case 0x02: // ctrl-b for less compat. + pageup(); + break; + /* --------------------------------------------------------- */ + /* end - Jump to the end of the list */ + /* --------------------------------------------------------- */ + case 0x168: + if (dmode == EXTENDED_MODE) { + int ents_per_screen = dsplines / + lines_per_row[dmode]; + + edit_row[dmode] = end_row[dmode] - 1; + if ((edit_row[dmode] - ents_per_screen)+1 > 0) + start_row[dmode] = edit_row[dmode] - + ents_per_screen + 1; + else + start_row[dmode] = 0; + } + /* TODO: Make end key work for other display modes. */ + break; + /* --------------------------------------------------------- */ + /* f - Forward one 4K block */ + /* --------------------------------------------------------- */ + case 'f': + start_row[dmode]=end_row[dmode]=edit_row[dmode] = 0; + lines_per_row[dmode] = 1; + block++; + offset = 0; + break; + /* --------------------------------------------------------- */ + /* page down key */ + /* --------------------------------------------------------- */ + case 0x16: // ctrl-v for vt100 + case KEY_NPAGE: // PgDown + case 0x04: // ctrl-d for vi compat. 
+ pagedn(); + break; + /* --------------------------------------------------------- */ + /* enter key - change a value */ + /* --------------------------------------------------------- */ + case KEY_ENTER: + case('\n'): + case('\r'): + editing = !editing; + break; + case KEY_RESIZE: + getmaxyx(stdscr, termlines, termcols); + termlines--; + break; + default: + move(termlines - 1, 0); + printw("Keystroke not understood: 0x%03x",ch); + refresh(); + usleep(50000); + break; + } /* switch */ + } /* while !Quit */ + + Erase(); + refresh(); + endwin(); +}/* interactive_mode */ + +/* ------------------------------------------------------------------------ */ +/* gfs_log_header_in - read in a gfs1-style log header */ +/* ------------------------------------------------------------------------ */ +void gfs_log_header_in(struct gfs_log_header *head, + struct gfs2_buffer_head *lbh) +{ + struct gfs_log_header *str = lbh->iov.iov_base; + + gfs2_meta_header_in(&head->lh_header, lbh->b_data); + + head->lh_flags = be32_to_cpu(str->lh_flags); + head->lh_pad = be32_to_cpu(str->lh_pad); + + head->lh_first = be64_to_cpu(str->lh_first); + head->lh_sequence = be64_to_cpu(str->lh_sequence); + + head->lh_tail = be64_to_cpu(str->lh_tail); + head->lh_last_dump = be64_to_cpu(str->lh_last_dump); + + memcpy(head->lh_reserved, str->lh_reserved, 64); +} + + +/* ------------------------------------------------------------------------ */ +/* gfs_log_header_print - print a gfs1-style log header */ +/* ------------------------------------------------------------------------ */ +void gfs_log_header_print(struct gfs_log_header *lh) +{ + gfs2_meta_header_print(&lh->lh_header); + pv(lh, lh_flags, "%u", "0x%.8x"); + pv(lh, lh_pad, "%u", "%x"); + pv((unsigned long long)lh, lh_first, "%llu", "%llx"); + pv((unsigned long long)lh, lh_sequence, "%llu", "%llx"); + pv((unsigned long long)lh, lh_tail, "%llu", "%llx"); + pv((unsigned long long)lh, lh_last_dump, "%llu", "%llx"); +} + +/* 
------------------------------------------------------------------------ */ +/* usage - print command line usage */ +/* ------------------------------------------------------------------------ */ +static void usage(void) +{ + fprintf(stderr,"\nFormat is: gfs2_edit [-c 1] [-V] [-x] [-h] [identify] [-z <0-9>] [-p structures|blocks][blocktype][blockalloc [val]][blockbits][blockrg][rgcount][rgflags][rgbitmaps][find sb|rg|rb|di|in|lf|jd|lh|ld|ea|ed|lb|13|qc][field [val]] /dev/device\n\n"); + fprintf(stderr,"If only the device is specified, it enters into hexedit mode.\n"); + fprintf(stderr,"identify - prints out only the block type, not the details.\n"); + fprintf(stderr,"printsavedmeta - prints out the saved metadata blocks from a savemeta file.\n"); + fprintf(stderr,"savemeta - save off your metadata for analysis and debugging.\n"); + fprintf(stderr," (The intelligent way: assume bitmap is correct).\n"); + fprintf(stderr,"savemetaslow - save off your metadata for analysis and debugging. The SLOW way (block by block).\n"); + fprintf(stderr,"savergs - save off only the resource group information (rindex and rgs).\n"); + fprintf(stderr,"restoremeta - restore metadata for debugging (DANGEROUS).\n"); + fprintf(stderr,"rgcount - print how many RGs in the file system.\n"); + fprintf(stderr,"rgflags rgnum [new flags] - print or modify flags for rg #rgnum (0 - X)\n"); + fprintf(stderr,"rgbitmaps - print out the bitmaps for rgrp " + "rgnum.\n"); + fprintf(stderr,"rgrepair - find and repair damaged rgrp.\n"); + fprintf(stderr,"-V prints version number.\n"); + fprintf(stderr,"-c 1 selects alternate color scheme 1\n"); + fprintf(stderr,"-d prints details (for printing journals)\n"); + fprintf(stderr,"-p prints GFS2 structures or blocks to stdout.\n"); + fprintf(stderr," sb - prints the superblock.\n"); + fprintf(stderr," size - prints the filesystem size.\n"); + fprintf(stderr," master - prints the master directory.\n"); + fprintf(stderr," root - prints the root directory.\n"); + 
fprintf(stderr," jindex - prints the journal index directory.\n"); + fprintf(stderr," journals - prints the journal status.\n"); + fprintf(stderr," per_node - prints the per_node directory.\n"); + fprintf(stderr," inum - prints the inum file.\n"); + fprintf(stderr," statfs - prints the statfs file.\n"); + fprintf(stderr," rindex - prints the rindex file.\n"); + fprintf(stderr," rg X - print resource group X.\n"); + fprintf(stderr," rgs - prints all the resource groups (rgs).\n"); + fprintf(stderr," quota - prints the quota file.\n"); + fprintf(stderr," 0x1234 - prints the specified block\n"); + fprintf(stderr,"-p blocktype - prints the type " + "of the specified block\n"); + fprintf(stderr,"-p blockrg - prints the resource group " + "block corresponding to the specified block\n"); + fprintf(stderr,"-p blockbits - prints the block with " + "the bitmap corresponding to the specified block\n"); + fprintf(stderr,"-p blockalloc [0|1|2|3] - print or change " + "the allocation type of the specified block\n"); + fprintf(stderr,"-p field [new_value] - prints or change the " + "structure field\n"); + fprintf(stderr,"-p find sb|rg|rb|di|in|lf|jd|lh|ld|ea|ed|lb|" + "13|qc - find block of given type after block \n"); + fprintf(stderr," specifies the starting block for search\n"); + fprintf(stderr,"-z 1 use gzip compression level 1 for savemeta (default 9)\n"); + fprintf(stderr,"-z 0 do not use compression\n"); + fprintf(stderr,"-s specifies a starting block such as root, rindex, quota, inum.\n"); + fprintf(stderr,"-x print in hexmode.\n"); + fprintf(stderr,"-h prints this help.\n\n"); + fprintf(stderr,"Examples:\n"); + fprintf(stderr," To run in interactive mode:\n"); + fprintf(stderr," gfs2_edit /dev/bobs_vg/lvol0\n"); + fprintf(stderr," To print out the superblock and master directory:\n"); + fprintf(stderr," gfs2_edit -p sb master /dev/bobs_vg/lvol0\n"); + fprintf(stderr," To print out the master directory in hex:\n"); + fprintf(stderr," gfs2_edit -x -p master 
/dev/bobs_vg/lvol0\n"); + fprintf(stderr," To print out the block-type for block 0x27381:\n"); + fprintf(stderr," gfs2_edit identify -p 0x27381 /dev/bobs_vg/lvol0\n"); + fprintf(stderr," To print out the fourth Resource Group. (the first R is #0)\n"); + fprintf(stderr," gfs2_edit -p rg 3 /dev/sdb1\n"); + fprintf(stderr," To print out the metadata type of block 1234\n"); + fprintf(stderr," gfs2_edit -p 1234 blocktype /dev/roth_vg/roth_lb\n"); + fprintf(stderr," To print out the allocation type of block 2345\n"); + fprintf(stderr," gfs2_edit -p 2345 blockalloc /dev/vg/lv\n"); + fprintf(stderr," To change the allocation type of block 2345 to a 'free block'\n"); + fprintf(stderr," gfs2_edit -p 2345 blockalloc 0 /dev/vg/lv\n"); + fprintf(stderr," To print out the file size of the dinode at block 0x118\n"); + fprintf(stderr," gfs2_edit -p 0x118 field di_size /dev/roth_vg/roth_lb\n"); + fprintf(stderr," To find any dinode higher than the quota file dinode:\n"); + fprintf(stderr," gfs2_edit -p quota find di /dev/x/y\n"); + fprintf(stderr," To set the Resource Group flags for rg #7 to 3.\n"); + fprintf(stderr," gfs2_edit rgflags 7 3 /dev/sdc2\n"); + fprintf(stderr," To save off all metadata for /dev/vg/lv:\n"); + fprintf(stderr," gfs2_edit savemeta /dev/vg/lv /tmp/metasave.gz\n"); +}/* usage */ + +/** + * getgziplevel - Process the -z parameter to savemeta operations + * argv - argv + * i - a pointer to the argv index at which to begin processing + * The index pointed to by i will be incremented past the -z option if found + */ +static void getgziplevel(char *argv[], int *i) +{ + char *opt, *arg; + char *endptr; + + arg = argv[1 + *i]; + if (strncmp(arg, "-z", 2)) { + return; + } else if (arg[2] != '\0') { + opt = &arg[2]; + } else { + (*i)++; + opt = argv[1 + *i]; + } + errno = 0; + gziplevel = strtol(opt, &endptr, 10); + if (errno || endptr == opt || gziplevel < 0 || gziplevel > 9) { + fprintf(stderr, "Compression level out of range: %s\n", opt); + exit(-1); + } + (*i)++; 
+} + +static int count_dinode_blks(struct rgrp_tree *rgd, int bitmap, + struct gfs2_buffer_head *rbh) +{ + struct gfs2_buffer_head *tbh; + uint64_t b; + int dinodes = 0; + char *byte, cur_state, new_state; + int bit, off; + + if (bitmap) + off = sizeof(struct gfs2_meta_header); + else + off = sizeof(struct gfs2_rgrp); + + for (b = 0; b < rgd->bits[bitmap].bi_len << GFS2_BIT_SIZE; b++) { + tbh = bread(&sbd, rgd->ri.ri_data0 + + rgd->bits[bitmap].bi_start + b); + byte = rbh->b_data + off + (b / GFS2_NBBY); + bit = (b % GFS2_NBBY) * GFS2_BIT_SIZE; + if (gfs2_check_meta(tbh, GFS2_METATYPE_DI) == 0) { + dinodes++; + new_state = GFS2_BLKST_DINODE; + } else { + new_state = GFS2_BLKST_USED; + } + cur_state = (*byte >> bit) & GFS2_BIT_MASK; + *byte ^= cur_state << bit; + *byte |= new_state << bit; + brelse(tbh); + } + bmodified(rbh); + return dinodes; +} + +static int count_dinode_bits(struct gfs2_buffer_head *rbh) +{ + uint64_t blk; + struct gfs2_meta_header *mh = (struct gfs2_meta_header *)rbh->b_data; + char *byte; + int bit; + int dinodes = 0; + + if (be32_to_cpu(mh->mh_type) == GFS2_METATYPE_RG) + blk = sizeof(struct gfs2_rgrp); + else + blk = sizeof(struct gfs2_meta_header); + + for (; blk < sbd.bsize; blk++) { + byte = rbh->b_data + (blk / GFS2_NBBY); + bit = (blk % GFS2_NBBY) * GFS2_BIT_SIZE; + if (((*byte >> bit) & GFS2_BIT_MASK) == GFS2_BLKST_DINODE) + dinodes++; + } + return dinodes; +} + +static void rg_repair(void) +{ + struct gfs2_buffer_head *rbh; + struct rgrp_tree *rgd; + struct osi_node *n; + int b; + int rgs_fixed = 0; + int dinodes_found = 0, dinodes_total = 0; + + /* Walk through the resource groups saving everything within */ + for (n = osi_first(&sbd.rgtree); n; n = osi_next(n)) { + rgd = (struct rgrp_tree *)n; + if (gfs2_rgrp_read(&sbd, rgd) == 0) { /* was read in okay */ + gfs2_rgrp_relse(rgd); + continue; /* ignore it */ + } + /* If we get here, it's because we have an rgrp in the rindex + file that can't be read in. So attempt to repair it. 
+ If we find a damaged rgrp or bitmap, fix the metadata. + Then scan all its blocks: if we find a dinode, set the + repaired bitmap to GFS2_BLKST_DINODE. Set all others to + GFS2_BLKST_USED so fsck can sort it out. If we set them + to FREE, fsck would just nuke it all. */ + printf("Resource group at block %llu (0x%llx) appears to be " + "damaged. Attempting to fix it (in reverse order).\n", + (unsigned long long)rgd->ri.ri_addr, + (unsigned long long)rgd->ri.ri_addr); + + for (b = rgd->ri.ri_length - 1; b >= 0; b--) { + int mtype = (b ? GFS2_METATYPE_RB : GFS2_METATYPE_RG); + struct gfs2_meta_header *mh; + + printf("Bitmap #%d:", b); + rbh = bread(&sbd, rgd->ri.ri_addr + b); + if (gfs2_check_meta(rbh, mtype)) { /* wrong type */ + printf("Damaged. Repairing..."); + /* Fix the meta header */ + memset(rbh->b_data, 0, sbd.bsize); + mh = (struct gfs2_meta_header *)rbh->b_data; + mh->mh_magic = cpu_to_be32(GFS2_MAGIC); + mh->mh_type = cpu_to_be32(mtype); + if (b) + mh->mh_format = + cpu_to_be32(GFS2_FORMAT_RB); + else + mh->mh_format = + cpu_to_be32(GFS2_FORMAT_RG); + bmodified(rbh); + /* Count the dinode blocks */ + dinodes_found = count_dinode_blks(rgd, b, rbh); + } else { /* bitmap info is okay: tally it. */ + printf("Undamaged. Analyzing..."); + dinodes_found = count_dinode_bits(rbh); + } + printf("Dinodes found: %d\n", dinodes_found); + dinodes_total += dinodes_found; + if (b == 0) { /* rgrp itself was damaged */ + rgd->rg.rg_dinodes = dinodes_total; + rgd->rg.rg_free = 0; + } + brelse(rbh); + } + rgs_fixed++; + } + if (rgs_fixed) + printf("%d resource groups fixed.\n" + "You should run fsck.gfs2 to reconcile the bitmaps.\n", + rgs_fixed); + else + printf("All resource groups are okay. 
No repairs needed.\n"); + exit(0); +} + +/* ------------------------------------------------------------------------ */ +/* parameterpass1 - pre-processing for command-line parameters */ +/* ------------------------------------------------------------------------ */ +static void parameterpass1(int argc, char *argv[], int i) +{ + if (!strcasecmp(argv[i], "-V")) { + printf("%s version %s (built %s %s)\n", + argv[0], VERSION, __DATE__, __TIME__); + printf("%s\n", REDHAT_COPYRIGHT); + exit(0); + } + else if (!strcasecmp(argv[i], "-h") || + !strcasecmp(argv[i], "-help") || + !strcasecmp(argv[i], "-usage")) { + usage(); + exit(0); + } + else if (!strcasecmp(argv[i], "-c")) { + i++; + color_scheme = atoi(argv[i]); + } + else if (!strcasecmp(argv[i], "-p") || + !strcasecmp(argv[i], "-print")) { + termlines = 0; /* initial value--we'll figure + it out later */ + dmode = GFS2_MODE; + } + else if (!strcasecmp(argv[i], "-d") || + !strcasecmp(argv[i], "-details")) + details = 1; + else if (!strcasecmp(argv[i], "savemeta")) + termlines = 0; + else if (!strcasecmp(argv[i], "savemetaslow")) + termlines = 0; + else if (!strcasecmp(argv[i], "savergs")) + termlines = 0; + else if (!strcasecmp(argv[i], "printsavedmeta")) { + if (dmode == INIT_MODE) + dmode = GFS2_MODE; + restoremeta(argv[i+1], argv[i+2], TRUE); + } else if (!strcasecmp(argv[i], "restoremeta")) { + if (dmode == INIT_MODE) + dmode = HEX_MODE; /* hopefully not used */ + restoremeta(argv[i+1], argv[i+2], FALSE); + } else if (!strcmp(argv[i], "rgcount")) + termlines = 0; + else if (!strcmp(argv[i], "rgflags")) + termlines = 0; + else if (!strcmp(argv[i], "rgrepair")) + termlines = 0; + else if (!strcmp(argv[i], "rg")) + termlines = 0; + else if (!strcasecmp(argv[i], "-x")) + dmode = HEX_MODE; + else if (device == NULL && strchr(argv[i],'/')) { + device = argv[i]; + } +} + +/* ------------------------------------------------------------------------ */ +/* process_parameters - process commandline parameters */ +/* pass - we 
make two passes through the parameters; the first pass gathers */ +/* normals parameters, device name, etc. The second pass is for */ +/* figuring out what structures to print out. */ +/* ------------------------------------------------------------------------ */ +static void process_parameters(int argc, char *argv[], int pass) +{ + int i; + uint64_t keyword_blk; + + if (argc < 2) { + usage(); + die("no device specified\n"); + } + for (i = 1; i < argc; i++) { + if (!pass) { /* first pass */ + parameterpass1(argc, argv, i); + continue; + } + /* second pass */ + if (!strcasecmp(argv[i], "-s")) { + i++; + if (i >= argc - 1) { + printf("Error: starting block not specified " + "with -s.\n"); + printf("%s -s [starting block | keyword] " + "\n", argv[0]); + printf("For example: %s -s \"rg 3\" " + "/dev/exxon_vg/exxon_lv\n", argv[0]); + exit(EXIT_FAILURE); + } + starting_blk = check_keywords(argv[i]); + continue; + } + if (termlines || strchr(argv[i],'/')) /* if print or slash */ + continue; + + if (!strncmp(argv[i], "journal", 7) && isdigit(argv[i][7]) && + strcmp(argv[i+1], "field")) { + int blk = 0; + + if (i < argc - 1 && isdigit(argv[i + 1][0])) { + if (argv[i + 1][0]=='0' && argv[i + 1][1]=='x') + sscanf(argv[i + 1], "%x", &blk); + else + blk = atoi(argv[i + 1]); + } + dump_journal(argv[i], blk); + continue; + } + keyword_blk = check_keywords(argv[i]); + if (keyword_blk) + push_block(keyword_blk); + else if (!strcasecmp(argv[i], "-x")) + dmode = HEX_MODE; + else if (argv[i][0] == '-') /* if it starts with a dash */ + ; /* ignore it--meant for pass == 0 */ + else if (!strcmp(argv[i], "identify")) + identify = TRUE; + else if (!strcmp(argv[i], "size")) { + printf("Device size: %llu (0x%llx)\n", + (unsigned long long)max_block, + (unsigned long long)max_block); + exit(EXIT_SUCCESS); + } else if (!strcmp(argv[i], "rgcount")) + rgcount(); + else if (!strcmp(argv[i], "field")) { + i++; + if (i >= argc - 1) { + printf("Error: field not specified.\n"); + printf("Format is: 
%s -p field " + " [newvalue]\n", argv[0]); + gfs2_rgrp_free(&sbd.rgtree); + exit(EXIT_FAILURE); + } + process_field(argv[i], argv[i + 1]); + } else if (!strcmp(argv[i], "blocktype")) { + find_print_block_type(); + } else if (!strcmp(argv[i], "blockrg")) { + find_print_block_rg(0); + } else if (!strcmp(argv[i], "blockbits")) { + find_print_block_rg(1); + } else if (!strcmp(argv[i], "blockalloc")) { + if (isdigit(argv[i + 1][0])) { + int newval; + + if (argv[i + 1][0]=='0' && argv[i + 1][1]=='x') + sscanf(argv[i + 1], "%x", &newval); + else + newval = (uint64_t)atoi(argv[i + 1]); + find_change_block_alloc(&newval); + } else { + find_change_block_alloc(NULL); + } + } else if (!strcmp(argv[i], "find")) { + find_metablockoftype(argv[i + 1], 1); + } else if (!strcmp(argv[i], "rgflags")) { + int rg, set = FALSE; + uint32_t new_flags = 0; + + i++; + if (i >= argc - 1) { + printf("Error: rg # not specified.\n"); + printf("Format is: %s rgflags rgnum" + "[newvalue]\n", argv[0]); + gfs2_rgrp_free(&sbd.rgtree); + exit(EXIT_FAILURE); + } + if (argv[i][0]=='0' && argv[i][1]=='x') + sscanf(argv[i], "%"SCNx32, &rg); + else + rg = atoi(argv[i]); + i++; + if (i < argc - 1 && + isdigit(argv[i][0])) { + set = TRUE; + if (argv[i][0]=='0' && argv[i][1]=='x') + sscanf(argv[i], "%"SCNx32, &new_flags); + else + new_flags = atoi(argv[i]); + } + set_rgrp_flags(rg, new_flags, set, FALSE); + gfs2_rgrp_free(&sbd.rgtree); + exit(EXIT_SUCCESS); + } else if (!strcmp(argv[i], "rg")) { + int rg; + + i++; + if (i >= argc - 1) { + printf("Error: rg # not specified.\n"); + printf("Format is: %s rg rgnum\n", argv[0]); + gfs2_rgrp_free(&sbd.rgtree); + exit(EXIT_FAILURE); + } + rg = atoi(argv[i]); + if (!strcasecmp(argv[i + 1], "find")) { + temp_blk = get_rg_addr(rg); + push_block(temp_blk); + } else { + set_rgrp_flags(rg, 0, FALSE, TRUE); + gfs2_rgrp_free(&sbd.rgtree); + exit(EXIT_SUCCESS); + } + } else if (!strcmp(argv[i], "rgbitmaps")) { + int rg, bmap; + uint64_t rgblk; + struct rgrp_tree *rgd; + + 
i++; + if (i >= argc - 1) { + printf("Error: rg # not specified.\n"); + printf("Format is: %s rgbitmaps rgnum\n", + argv[0]); + gfs2_rgrp_free(&sbd.rgtree); + exit(EXIT_FAILURE); + } + rg = atoi(argv[i]); + rgblk = get_rg_addr(rg); + rgd = gfs2_blk2rgrpd(&sbd, rgblk); + if (rgd == NULL) { + printf("Error: rg # is invalid.\n"); + gfs2_rgrp_free(&sbd.rgtree); + exit(EXIT_FAILURE); + } + for (bmap = 0; bmap < rgd->ri.ri_length; bmap++) + push_block(rgblk + bmap); + } + else if (!strcmp(argv[i], "rgrepair")) + rg_repair(); + else if (!strcasecmp(argv[i], "savemeta")) { + getgziplevel(argv, &i); + savemeta(argv[i+2], 0, gziplevel); + } else if (!strcasecmp(argv[i], "savemetaslow")) { + getgziplevel(argv, &i); + savemeta(argv[i+2], 1, gziplevel); + } else if (!strcasecmp(argv[i], "savergs")) { + getgziplevel(argv, &i); + savemeta(argv[i+2], 2, gziplevel); + } else if (isdigit(argv[i][0])) { /* decimal addr */ + sscanf(argv[i], "%"SCNd64, &temp_blk); + push_block(temp_blk); + } else { + fprintf(stderr,"I don't know what '%s' means.\n", + argv[i]); + usage(); + exit(EXIT_FAILURE); + } + } /* for */ +}/* process_parameters */ + +int main(int argc, char *argv[]) +{ + int i, j, fd; + + indirect = malloc(sizeof(struct iinfo)); + if (!indirect) + die("Out of memory."); + memset(indirect, 0, sizeof(struct iinfo)); + memset(start_row, 0, sizeof(start_row)); + memset(lines_per_row, 0, sizeof(lines_per_row)); + memset(end_row, 0, sizeof(end_row)); + memset(edit_row, 0, sizeof(edit_row)); + memset(edit_col, 0, sizeof(edit_col)); + memset(edit_size, 0, sizeof(edit_size)); + memset(last_entry_onscreen, 0, sizeof(last_entry_onscreen)); + dmode = INIT_MODE; + sbd.bsize = 4096; + block = starting_blk = 0x10; + for (i = 0; i < BLOCK_STACK_SIZE; i++) { + blockstack[i].dmode = HEX_MODE; + blockstack[i].block = block; + for (j = 0; j < DMODES; j++) { + blockstack[i].start_row[j] = 0; + blockstack[i].end_row[j] = 0; + blockstack[i].edit_row[j] = 0; + blockstack[i].edit_col[j] = 0; + 
blockstack[i].lines_per_row[j] = 0; + } + } + + edit_row[GFS2_MODE] = 10; /* Start off at root inode + pointer in superblock */ + termlines = 30; /* assume interactive mode until we find -p */ + process_parameters(argc, argv, 0); + if (dmode == INIT_MODE) + dmode = HEX_MODE; + + fd = open(device, O_RDWR); + if (fd < 0) + die("can't open %s: %s\n", device, strerror(errno)); + max_block = lseek(fd, 0, SEEK_END) / sbd.bsize; + + read_superblock(fd); + if (read_rindex()) + exit(-1); + max_block = lseek(fd, 0, SEEK_END) / sbd.bsize; + if (sbd.gfs1) + edit_row[GFS2_MODE]++; + else if (read_master_dir() != 0) + exit(-1); + + process_parameters(argc, argv, 1); /* get what to print from cmdline */ + + block = blockstack[0].block = starting_blk * (4096 / sbd.bsize); + + if (termlines) + interactive_mode(); + else { /* print all the structures requested */ + i = 0; + while (blockhist > 0) { + block = blockstack[i + 1].block; + if (!block) + break; + display(identify, 0, 0, 0); + if (!identify) { + display_extended(); + printf("-------------------------------------" \ + "-----------------"); + eol(0); + } + block = pop_block(); + i++; + } + } + close(fd); + if (indirect) + free(indirect); + gfs2_rgrp_free(&sbd.rgtree); + exit(EXIT_SUCCESS); +} diff --git a/gfs2/edit/hexedit.h b/gfs2/edit/hexedit.h new file mode 100644 index 0000000..d2992d8 --- /dev/null +++ b/gfs2/edit/hexedit.h @@ -0,0 +1,243 @@ +#ifndef __HEXVIEW_DOT_H__ +#define __HEXVIEW_DOT_H__ + +#include +#include +#include +#include +#include + +#include "libgfs2.h" +#include "copyright.cf" + +#ifndef TRUE +#define TRUE 1 +#endif +#ifndef FALSE +#define FALSE 0 +#endif + +#define DMODES 3 +enum dsp_mode { HEX_MODE = 0, GFS2_MODE = 1, EXTENDED_MODE = 2, INIT_MODE = 3 }; +#define BLOCK_STACK_SIZE 256 + +#define pv(struct, member, fmt, fmt2) do { \ + print_it(" "#member, fmt, fmt2, struct->member); \ + } while (FALSE); +#define RGLIST_DUMMY_BLOCK -2 +#define JOURNALS_DUMMY_BLOCK -3 + +extern const char *mtypes[]; +extern 
struct gfs2_sb sb;
+extern int blockhist;
+extern int edit_mode;
+extern int line;
+extern char edit_fmt[80];
+extern char estring[1024]; /* edit string */
+extern char efield[64];
+extern uint64_t dev_offset;
+extern uint64_t max_block;
+extern int termlines; /* zero selects non-interactive (print) output; see the COLORS_* macros */
+extern int insert;
+extern const char *termtype;
+extern int line;
+extern int struct_len;
+extern unsigned int offset;
+extern int edit_row[DMODES], edit_col[DMODES], print_entry_ndx;
+extern int start_row[DMODES], end_row[DMODES], lines_per_row[DMODES];
+extern int edit_size[DMODES], last_entry_onscreen[DMODES];
+extern char edit_fmt[80];
+extern struct gfs2_sbd sbd;
+extern struct gfs_sb *sbd1;
+extern struct gfs2_inum gfs1_quota_di; /* kludge because gfs2 sb too small */
+extern struct gfs2_inum gfs1_license_di; /* kludge because gfs2 sb too small */
+extern struct gfs2_dinode di;
+extern int screen_chunk_size; /* how much of the 4K can fit on screen */
+extern int gfs2_struct_type;
+extern int identify;
+extern int color_scheme;
+extern WINDOW *wind;
+extern int editing;
+extern uint64_t temp_blk;
+extern uint64_t starting_blk;
+extern const char *block_type_str[15];
+extern int dsplines;
+extern int dsp_lines[DMODES];
+extern int combined_display;
+extern int details;
+extern const char *allocdesc[2][5];
+
+struct gfs2_dirents { /* one directory entry: its block, the dirent and its name */
+	uint64_t block;
+	struct gfs2_dirent dirent;
+	char filename[NAME_MAX];
+};
+
+struct indirect_info {
+	int is_dir;
+	int height;
+	uint64_t block;
+	uint32_t dirents; /* presumably the number of dirent[] slots in use -- TODO confirm */
+	struct gfs2_leaf lf;
+	struct metapath mp;
+	struct gfs2_dirents dirent[64];
+	uint64_t ptroff;
+};
+
+struct iinfo {
+	struct indirect_info ii[512];
+};
+
+struct blkstack_info { /* one entry of blockstack[]: a block plus per-mode display state */
+	uint64_t block;
+	int start_row[DMODES];
+	int end_row[DMODES];
+	int lines_per_row[DMODES];
+	int edit_row[DMODES];
+	int edit_col[DMODES];
+	enum dsp_mode dmode;
+	int gfs2_struct_type;
+	struct metapath mp;
+};
+
+extern struct blkstack_info blockstack[BLOCK_STACK_SIZE];
+extern struct iinfo *indirect; /* more than the most indirect
+				  pointers possible for any given 4K block */
+extern struct indirect_info masterdir; /* Master directory info */
+extern int indirect_blocks; /* count of indirect blocks */
+extern enum dsp_mode dmode;
+
+/* ------------------------------------------------------------------------ */
+/* block_is_rgtree - there's no such block as the rglist. This is a */
+/* special case meant to parse the rindex and follow the */
+/* blocks to the real rgs. */
+/* ------------------------------------------------------------------------ */
+static inline int block_is_rgtree(uint64_t blk)
+{
+	if (blk == RGLIST_DUMMY_BLOCK)
+		return TRUE;
+	return FALSE;
+}
+
+static inline int block_is_journals(uint64_t blk) /* TRUE when blk equals JOURNALS_DUMMY_BLOCK */
+{
+	if (blk == JOURNALS_DUMMY_BLOCK)
+		return TRUE;
+	return FALSE;
+}
+
+#define SCREEN_HEIGHT (16)
+#define SCREEN_WIDTH (16)
+
+/* die() used to be in libgfs2.h.  It prints a printf-style message to
+   stderr and then calls exit(-1); it never returns. */
+static __inline__ __attribute__((noreturn, format (printf, 1, 2)))
+void die(const char *fmt, ...)
+{
+	va_list ap;
+	va_start(ap, fmt);
+	vfprintf(stderr, fmt, ap);
+	va_end(ap);
+	exit(-1);
+}
+
+/* Memory macros */
+
+/* type_alloc assigns malloc(sizeof(type) * count) to ptr and calls die()
+   when the allocation fails. */
+#define type_alloc(ptr, type, count) \
+{ \
+	(ptr) = (type *)malloc(sizeof(type) * (count)); \
+	if (!(ptr)) \
+		die("unable to allocate memory on line %d of file %s\n", \
+		    __LINE__, __FILE__); \
+}
+
+#define printk printw
+
+/* Divide x by y. Round up if there is a remainder.
*/
+#define DIV_RU(x, y) (((x) + (y) - 1) / (y))
+
+#define TITLE1 "gfs2_edit - Global File System Editor (use with extreme caution)"
+#define TITLE2 REDHAT_COPYRIGHT " - Press H for help"
+
+#define COLOR_TITLE 1
+#define COLOR_NORMAL 2
+#define COLOR_INVERSE 3
+#define COLOR_SPECIAL 4
+#define COLOR_HIGHLIGHT 5
+#define COLOR_OFFSETS 6
+#define COLOR_CONTENTS 7
+
+/* Each COLORS_* macro selects its color pair with attrset() and turns on
+   A_BOLD, but only when termlines is non-zero. */
+#define COLORS_TITLE \
+	do { \
+		if (termlines) { \
+			attrset(COLOR_PAIR(COLOR_TITLE)); \
+			attron(A_BOLD); \
+		} \
+	} while (0)
+#define COLORS_NORMAL \
+	do { \
+		if (termlines) { \
+			attrset(COLOR_PAIR(COLOR_NORMAL)); \
+			attron(A_BOLD); \
+		} \
+	} while (0)
+#define COLORS_INVERSE \
+	do { \
+		if (termlines) { \
+			attrset(COLOR_PAIR(COLOR_INVERSE)); \
+			attron(A_BOLD); \
+		} \
+	} while (0)
+#define COLORS_SPECIAL \
+	do { \
+		if (termlines) { \
+			attrset(COLOR_PAIR(COLOR_SPECIAL)); \
+			attron(A_BOLD); \
+		} \
+	} while (0)
+#define COLORS_HIGHLIGHT \
+	do { \
+		if (termlines) { \
+			attrset(COLOR_PAIR(COLOR_HIGHLIGHT)); \
+			attron(A_BOLD); \
+		} \
+	} while (0)
+#define COLORS_OFFSETS \
+	do { \
+		if (termlines) { \
+			attrset(COLOR_PAIR(COLOR_OFFSETS)); \
+			attron(A_BOLD); \
+		} \
+	} while (0)
+#define COLORS_CONTENTS \
+	do { \
+		if (termlines) { \
+			attrset(COLOR_PAIR(COLOR_CONTENTS)); \
+			attron(A_BOLD); \
+		} \
+	} while (0)
+
+extern int block_is_jindex(uint64_t blk);
+extern int block_is_rindex(uint64_t blk);
+extern int block_is_inum_file(uint64_t blk);
+extern int block_is_statfs_file(uint64_t blk);
+extern int block_is_quota_file(uint64_t blk);
+extern int block_is_per_node(uint64_t blk);
+extern int display_block_type(struct gfs2_buffer_head *bh, int from_restore);
+extern void gfs_jindex_in(struct gfs_jindex *jindex, char *buf);
+extern void gfs_log_header_in(struct gfs_log_header *head,
+			      struct gfs2_buffer_head *bh);
+extern void gfs_log_header_print(struct gfs_log_header *lh);
+extern void gfs_dinode_in(struct gfs_dinode *di, struct gfs2_buffer_head *bh);
+extern void savemeta(char
*out_fn, int saveoption, int gziplevel); +extern void restoremeta(const char *in_fn, const char *out_device, + uint64_t printblocksonly); +extern int display(int identify_only, int trunc_zeros, uint64_t flagref, + uint64_t ref_blk); +extern uint64_t check_keywords(const char *kword); +extern uint64_t masterblock(const char *fn); +extern void gfs_rgrp_print(struct gfs_rgrp *rg); +extern int has_indirect_blocks(void); +extern uint32_t get_block_type(const struct gfs2_buffer_head *lbh, + int *structlen); + +#endif /* __HEXVIEW_DOT_H__ */ diff --git a/gfs2/edit/journal.c b/gfs2/edit/journal.c new file mode 100644 index 0000000..559bd4e --- /dev/null +++ b/gfs2/edit/journal.c @@ -0,0 +1,652 @@ +#include "clusterautoconfig.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include "copyright.cf" + +#include "hexedit.h" +#include "libgfs2.h" +#include "extended.h" +#include "gfs2hex.h" +#include "journal.h" + +extern uint64_t block; + +/** + * find_journal_block - figure out where a journal starts, given the name + * Returns: journal block number, changes j_size to the journal size + */ +uint64_t find_journal_block(const char *journal, uint64_t *j_size) +{ + int journal_num; + uint64_t jindex_block, jblock = 0; + int amtread; + struct gfs2_buffer_head *jindex_bh, *j_bh; + char jbuf[sbd.bsize]; + + journal_num = atoi(journal + 7); + if (journal_num < 0) + return 0; + + /* Figure out the block of the jindex file */ + if (sbd.gfs1) + jindex_block = sbd1->sb_jindex_di.no_addr; + else + jindex_block = masterblock("jindex"); + /* read in the block */ + jindex_bh = bread(&sbd, jindex_block); + /* get the dinode data from it. */ + gfs2_dinode_in(&di, jindex_bh->b_data); + + if (!sbd.gfs1) + do_dinode_extended(&di, jindex_bh); /* parse dir. 
*/ + + if (sbd.gfs1) { + struct gfs2_inode *jiinode; + struct gfs_jindex ji; + + jiinode = lgfs2_inode_get(&sbd, jindex_bh); + if (jiinode == NULL) + return 0; + amtread = gfs2_readi(jiinode, (void *)&jbuf, + journal_num * sizeof(struct gfs_jindex), + sizeof(struct gfs_jindex)); + if (amtread) { + gfs_jindex_in(&ji, jbuf); + jblock = ji.ji_addr; + *j_size = (uint64_t)ji.ji_nsegment * 0x10; + } + inode_put(&jiinode); + } else { + struct gfs2_dinode jdi; + + if (journal_num > indirect->ii[0].dirents - 2) + return 0; + jblock = indirect->ii[0].dirent[journal_num + 2].block; + j_bh = bread(&sbd, jblock); + gfs2_dinode_in(&jdi, j_bh->b_data); + *j_size = jdi.di_size; + brelse(j_bh); + } + brelse(jindex_bh); + return jblock; +} + +static void check_journal_wrap(uint64_t seq, uint64_t *highest_seq) +{ + if (seq < *highest_seq) { + print_gfs2("------------------------------------------------" + "------------------------------------------------"); + eol(0); + print_gfs2("Journal wrapped here."); + eol(0); + print_gfs2("------------------------------------------------" + "------------------------------------------------"); + eol(0); + } + *highest_seq = seq; +} + +/** + * fsck_readi - same as libgfs2's gfs2_readi, but sets absolute block # + * of the first bit of data read. 
+ */ +static int fsck_readi(struct gfs2_inode *ip, void *rbuf, uint64_t roffset, + unsigned int size, uint64_t *abs_block) +{ + struct gfs2_sbd *sdp; + struct gfs2_buffer_head *lbh; + uint64_t lblock, dblock; + unsigned int o; + uint32_t extlen = 0; + unsigned int amount; + int not_new = 0; + int isdir; + int copied = 0; + + if (ip == NULL) + return 0; + sdp = ip->i_sbd; + isdir = !!(S_ISDIR(ip->i_di.di_mode)); + *abs_block = 0; + if (roffset >= ip->i_di.di_size) + return 0; + if ((roffset + size) > ip->i_di.di_size) + size = ip->i_di.di_size - roffset; + if (!size) + return 0; + if (isdir) { + o = roffset % sdp->sd_jbsize; + lblock = roffset / sdp->sd_jbsize; + } else { + lblock = roffset >> sdp->sd_sb.sb_bsize_shift; + o = roffset & (sdp->bsize - 1); + } + + if (!ip->i_di.di_height) /* inode_is_stuffed */ + o += sizeof(struct gfs2_dinode); + else if (isdir) + o += sizeof(struct gfs2_meta_header); + + while (copied < size) { + amount = size - copied; + if (amount > sdp->bsize - o) + amount = sdp->bsize - o; + if (!extlen) + block_map(ip, lblock, ¬_new, &dblock, &extlen, + FALSE); + if (dblock) { + lbh = bread(sdp, dblock); + if (*abs_block == 0) + *abs_block = lbh->b_blocknr; + dblock++; + extlen--; + } else + lbh = NULL; + if (lbh) { + memcpy(rbuf, lbh->b_data + o, amount); + brelse(lbh); + } else { + memset(rbuf, 0, amount); + } + copied += amount; + lblock++; + o = (isdir) ? sizeof(struct gfs2_meta_header) : 0; + } + return copied; +} + +/** + * ld_is_pertinent - determine if a log descriptor is pertinent + * + * This function checks a log descriptor buffer to see if it contains + * references to a given traced block, or its rgrp bitmap block. 
+ */ +static int ld_is_pertinent(const uint64_t *b, const char *end, uint64_t tblk, + struct rgrp_tree *rgd, uint64_t bitblk) +{ + const uint64_t *blk = b; + + if (!tblk) + return 1; + + while (*blk && (char *)blk < end) { + if (be64_to_cpu(*blk) == tblk || be64_to_cpu(*blk) == bitblk) + return 1; + blk++; + if (sbd.gfs1) + blk++; + } + return 0; +} + +/** + * print_ld_blks - print all blocks given in a log descriptor + * returns: the number of block numbers it printed + */ +static int print_ld_blks(const uint64_t *b, const char *end, int start_line, + uint64_t tblk, uint64_t *tblk_off, uint64_t bitblk, + struct rgrp_tree *rgd, uint64_t abs_block, int prnt, + uint64_t *bblk_off, int is_meta_ld) +{ + int bcount = 0, found_tblk = 0, found_bblk = 0; + static char str[256]; + struct gfs2_buffer_head *j_bmap_bh; + + if (tblk_off) + *tblk_off = 0; + if (bblk_off) + *bblk_off = 0; + while (*b && (char *)b < end) { + if (!termlines || + (print_entry_ndx >= start_row[dmode] && + ((print_entry_ndx - start_row[dmode])+1) * + lines_per_row[dmode] <= termlines - start_line - 2)) { + if (prnt && bcount && bcount % 4 == 0) { + eol(0); + print_gfs2(" "); + } + bcount++; + if (prnt) { + if (is_meta_ld) { + j_bmap_bh = bread(&sbd, abs_block + + bcount); + sprintf(str, "0x%llx %2s", + (unsigned long long)be64_to_cpu(*b), + mtypes[lgfs2_get_block_type(j_bmap_bh)]); + brelse(j_bmap_bh); + } else { + sprintf(str, "0x%llx", + (unsigned long long)be64_to_cpu(*b)); + } + print_gfs2("%-18.18s ", str); + } + if (!found_tblk && tblk_off) + (*tblk_off)++; + if (!found_bblk && bblk_off) + (*bblk_off)++; + if (tblk && (be64_to_cpu(*b) == tblk)) { + found_tblk = 1; + print_gfs2("<-------------------------0x%llx ", + (unsigned long long)tblk); + eol(18 * (bcount % 4) + 1); + print_gfs2(" "); + } + if (tblk && rgd && (be64_to_cpu(*b) == bitblk)) { + int type, bmap = 0; + uint64_t o; + struct gfs2_buffer_head *save_bh; + + found_bblk = 1; + print_gfs2("<-------------------------"); + if (is_meta_ld) 
{ + o = tblk - rgd->ri.ri_data0; + if (o >= (rgd->bits->bi_start + + rgd->bits->bi_len) * + GFS2_NBBY) + o += (sizeof(struct gfs2_rgrp) - + sizeof(struct gfs2_meta_header)) + * GFS2_NBBY; + bmap = o / sbd.sd_blocks_per_bitmap; + save_bh = rgd->bits[bmap].bi_bh; + j_bmap_bh = bread(&sbd, abs_block + + bcount); + rgd->bits[bmap].bi_bh = j_bmap_bh; + type = lgfs2_get_bitmap(&sbd, tblk, rgd); + brelse(j_bmap_bh); + if (type < 0) { + perror("Error printing log descriptor blocks"); + exit(1); + } + rgd->bits[bmap].bi_bh = save_bh; + print_gfs2("bit for blk 0x%llx is %d " + "(%s)", + (unsigned long long)tblk, + type, + allocdesc[sbd.gfs1][type]); + } else { + print_gfs2("bitmap for blk 0x%llx " + "was revoked", + (unsigned long long)tblk); + } + eol(18 * (bcount % 4) + 1); + print_gfs2(" "); + } + } + b++; + if (sbd.gfs1) + b++; + } + if (prnt) + eol(0); + if (tblk_off && (!found_tblk || !is_meta_ld)) + *tblk_off = 0; + if (bblk_off && (!found_bblk || !is_meta_ld)) + *bblk_off = 0; + return bcount; +} + +static int is_wrap_pt(char *buf, uint64_t *highest_seq) +{ + struct gfs2_buffer_head tbh = { .b_data = buf }; + + if (get_block_type(&tbh, NULL) == GFS2_METATYPE_LH) { + uint64_t seq; + + if (sbd.gfs1) { + struct gfs_log_header lh; + gfs_log_header_in(&lh, &tbh); + seq = lh.lh_sequence; + } else { + struct gfs2_log_header lh; + gfs2_log_header_in(&lh, buf); + seq = lh.lh_sequence; + } + if (seq < *highest_seq) + return 1; + *highest_seq = seq; + } + return 0; +} + +/** + * find_wrap_pt - figure out where a journal wraps + * Returns: The wrap point, in bytes + */ +static uint64_t find_wrap_pt(struct gfs2_inode *ji, char *jbuf, uint64_t jblock, uint64_t j_size) +{ + uint64_t jb = 0; + uint64_t highest_seq = 0; + + for (jb = 0; jb < j_size; jb += (sbd.gfs1 ? 
1 : sbd.bsize)) { + int found = 0; + + if (sbd.gfs1) { + struct gfs2_buffer_head *j_bh; + + j_bh = bread(&sbd, jblock + jb); + found = is_wrap_pt(j_bh->b_data, &highest_seq); + brelse(j_bh); + } else { + int copied; + uint64_t abs_block; + + copied = fsck_readi(ji, jbuf, jb, sbd.bsize, &abs_block); + if (!copied) /* end of file */ + break; + found = is_wrap_pt(jbuf, &highest_seq); + } + if (found) + return jb; + } + return 0; +} + +/** + * process_ld - process a log descriptor + */ +static int process_ld(uint64_t abs_block, uint64_t wrappt, uint64_t j_size, + uint64_t jb, char *buf, int tblk, + uint64_t *tblk_off, uint64_t bitblk, + struct rgrp_tree *rgd, int *prnt, uint64_t *bblk_off) +{ + uint64_t *b; + struct gfs2_log_descriptor ld; + int ltndx, is_meta_ld = 0; + int ld_blocks = 0; + uint32_t logtypes[2][6] = { + {GFS2_LOG_DESC_METADATA, GFS2_LOG_DESC_REVOKE, + GFS2_LOG_DESC_JDATA, 0, 0, 0}, + {GFS_LOG_DESC_METADATA, GFS_LOG_DESC_IUL, GFS_LOG_DESC_IDA, + GFS_LOG_DESC_Q, GFS_LOG_DESC_LAST, 0}}; + const char *logtypestr[2][6] = { + {"Metadata", "Revoke", "Jdata", + "Unknown", "Unknown", "Unknown"}, + {"Metadata", "Unlinked inode", "Dealloc inode", + "Quota", "Final Entry", "Unknown"}}; + + gfs2_log_descriptor_in(&ld, buf); + if (sbd.gfs1) + b = (uint64_t *)(buf + sizeof(struct gfs_log_descriptor)); + else + b = (uint64_t *)(buf + sizeof(struct gfs2_log_descriptor)); + *prnt = ld_is_pertinent(b, (buf + sbd.bsize), tblk, rgd, bitblk); + + if (*prnt) { + print_gfs2("0x%"PRIx64" (j+%4"PRIx64"): Log descriptor, ", + abs_block, ((jb + wrappt) % j_size) / sbd.bsize); + print_gfs2("type %d ", ld.ld_type); + + for (ltndx = 0;; ltndx++) { + if (ld.ld_type == logtypes[sbd.gfs1][ltndx] || + logtypes[sbd.gfs1][ltndx] == 0) + break; + } + print_gfs2("(%s) ", logtypestr[sbd.gfs1][ltndx]); + print_gfs2("len:%u, data1: %u", ld.ld_length, ld.ld_data1); + eol(0); + print_gfs2(" "); + } + ld_blocks = ld.ld_data1; + if (ld.ld_type == GFS2_LOG_DESC_METADATA || + ld.ld_type == 
GFS_LOG_DESC_METADATA) + is_meta_ld = 1; + ld_blocks -= print_ld_blks(b, (buf + sbd.bsize), line, tblk, tblk_off, + bitblk, rgd, abs_block, *prnt, bblk_off, + is_meta_ld); + + return ld_blocks; +} + +/** + * meta_has_ref - check if a metadata block references a given block + */ +static int meta_has_ref(uint64_t abs_block, int tblk) +{ + struct gfs2_buffer_head *mbh; + int structlen, ty, has_ref = 0; + uint64_t *b; + struct gfs2_dinode *dinode; + + mbh = bread(&sbd, abs_block); + ty = get_block_type(mbh, &structlen); + if (ty == GFS2_METATYPE_DI) { + dinode = (struct gfs2_dinode *)mbh->b_data; + if (be64_to_cpu(dinode->di_eattr) == tblk) + has_ref = 1; + } + b = (uint64_t *)(mbh->b_data + structlen); + while (!has_ref && ty && (char *)b < mbh->b_data + sbd.bsize) { + if (be64_to_cpu(*b) == tblk) + has_ref = 1; + b++; + } + brelse(mbh); + return has_ref; +} + + +/** + * get_ldref - get a log descriptor reference block, given a block number + * + * Note that we can't pass in abs_block here, because journal wrap may + * mean that the block we're interested in, in the journal, is before the + * log descriptor that holds the reference we need. + */ +static uint64_t get_ldref(uint64_t abs_ld, int offset_from_ld) +{ + struct gfs2_buffer_head *jbh; + uint64_t *b, refblk; + + jbh = bread(&sbd, abs_ld); + b = (uint64_t *)(jbh->b_data + sizeof(struct gfs2_log_descriptor)); + b += offset_from_ld - 1; + refblk = be64_to_cpu(*b); + brelse(jbh); + return refblk; +} + +/** + * dump_journal - dump a journal file's contents. + * @journal: name of the journal to dump + * @tblk: block number to trace in the journals + * + * This function dumps the contents of a journal. If a trace block is specified + * then only information printed is: (1) log descriptors that reference that + * block, (2) metadata in the journal that references the block, or (3) + * rgrp bitmaps that reference that block's allocation bit status. 
+ */ +void dump_journal(const char *journal, int tblk) +{ + struct gfs2_buffer_head *j_bh = NULL, dummy_bh; + uint64_t jblock, j_size, jb, abs_block, saveblk, wrappt = 0; + int start_line, journal_num; + struct gfs2_inode *j_inode = NULL; + int ld_blocks = 0, offset_from_ld = 0; + uint64_t tblk_off = 0, bblk_off = 0, bitblk = 0; + uint64_t highest_seq = 0; + char *jbuf = NULL; + struct rgrp_tree *rgd = NULL; + uint64_t abs_ld = 0; + + start_line = line; + lines_per_row[dmode] = 1; + journal_num = atoi(journal + 7); + print_gfs2("Dumping journal #%d.", journal_num); + if (tblk) { + dmode = HEX_MODE; + print_gfs2(" Tracing block 0x%llx", (unsigned long long)tblk); + } + eol(0); + jblock = find_journal_block(journal, &j_size); + if (!jblock) + return; + + if (!sbd.gfs1) { + j_bh = bread(&sbd, jblock); + j_inode = lgfs2_inode_get(&sbd, j_bh); + if (j_inode == NULL) { + fprintf(stderr, "Out of memory\n"); + exit(-1); + } + jbuf = malloc(sbd.bsize); + if (jbuf == NULL) { + fprintf(stderr, "Out of memory\n"); + exit(-1); + } + } + + if (tblk) { + uint64_t wp; + + rgd = gfs2_blk2rgrpd(&sbd, tblk); + if (!rgd) { + print_gfs2("Can't locate the rgrp for block 0x%x", + tblk); + eol(0); + } else { + uint64_t o; + int bmap = 0; + + print_gfs2("rgd: 0x%llx for 0x%x, ", rgd->ri.ri_addr, + rgd->ri.ri_length); + o = tblk - rgd->ri.ri_data0; + if (o >= (rgd->bits->bi_start + + rgd->bits->bi_len) * (uint64_t)GFS2_NBBY) + o += (sizeof(struct gfs2_rgrp) - + sizeof(struct gfs2_meta_header)) + * GFS2_NBBY; + bmap = o / sbd.sd_blocks_per_bitmap; + bitblk = rgd->ri.ri_addr + bmap; + print_gfs2("bitmap: %d, bitblk: 0x%llx", bmap, + (unsigned long long)bitblk); + eol(0); + } + + wrappt = find_wrap_pt(j_inode, jbuf, jblock, j_size); + wp = wrappt / (sbd.gfs1 ? 1 : sbd.bsize); + print_gfs2("Starting at journal wrap block: 0x%llx " + "(j + 0x%llx)", + (unsigned long long)jblock + wp, + (unsigned long long)wp); + eol(0); + } + + for (jb = 0; jb < j_size; jb += (sbd.gfs1 ? 
1 : sbd.bsize)) { + int is_pertinent = 1; + uint32_t block_type = 0; + + if (sbd.gfs1) { + if (j_bh) + brelse(j_bh); + abs_block = jblock + ((jb + wrappt) % j_size); + j_bh = bread(&sbd, abs_block); + dummy_bh.b_data = j_bh->b_data; + } else { + int error = fsck_readi(j_inode, (void *)jbuf, + ((jb + wrappt) % j_size), + sbd.bsize, &abs_block); + if (!error) /* end of file */ + break; + dummy_bh.b_data = jbuf; + } + offset_from_ld++; + block_type = get_block_type(&dummy_bh, NULL); + if (block_type == GFS2_METATYPE_LD) { + ld_blocks = process_ld(abs_block, wrappt, j_size, jb, + dummy_bh.b_data, tblk, &tblk_off, + bitblk, rgd, &is_pertinent, + &bblk_off); + offset_from_ld = 0; + abs_ld = abs_block; + } else if (!tblk && block_type == GFS2_METATYPE_LH) { + struct gfs2_log_header lh; + struct gfs_log_header lh1; + + if (sbd.gfs1) { + gfs_log_header_in(&lh1, &dummy_bh); + check_journal_wrap(lh1.lh_sequence, + &highest_seq); + print_gfs2("0x%"PRIx64" (j+%4"PRIx64"): Log header: " + "Flags:%x, Seq: 0x%llx, 1st: 0x%llx, " + "tail: 0x%llx, last: 0x%llx", + abs_block, jb + wrappt, + lh1.lh_flags, lh1.lh_sequence, + lh1.lh_first, lh1.lh_tail, + lh1.lh_last_dump); + } else { + gfs2_log_header_in(&lh, dummy_bh.b_data); + check_journal_wrap(lh.lh_sequence, + &highest_seq); + print_gfs2("0x%"PRIx64" (j+%4"PRIx64"): Log header: Seq" + ": 0x%llx, tail: 0x%x, blk: 0x%x%s", + abs_block, ((jb + wrappt) % j_size) + / sbd.bsize, lh.lh_sequence, + lh.lh_tail, lh.lh_blkno, + lh.lh_flags == + GFS2_LOG_HEAD_UNMOUNT ? + " [UNMOUNTED]" : ""); + } + eol(0); + } else if ((ld_blocks > 0) && + (sbd.gfs1 || block_type == GFS2_METATYPE_LB)) { + print_gfs2("0x%"PRIx64" (j+%4"PRIx64"): Log descriptor" + " continuation block", abs_block, + ((jb + wrappt) % j_size) / sbd.bsize); + eol(0); + print_gfs2(" "); + ld_blocks -= print_ld_blks((uint64_t *)dummy_bh.b_data + + (sbd.gfs1 ? 
0 : + sizeof(struct gfs2_meta_header)), + (dummy_bh.b_data + + sbd.bsize), start_line, + tblk, &tblk_off, 0, rgd, + 0, 1, NULL, 0); + } else if (block_type == 0) { + continue; + } + /* Check if this metadata block references the block we're + trying to trace. */ + if (details || (tblk && ((is_pertinent && + ((tblk_off && offset_from_ld == tblk_off) || + (bblk_off && offset_from_ld == bblk_off))) || + meta_has_ref(abs_block, tblk)))) { + uint64_t ref_blk = 0; + + saveblk = block; + block = abs_block; + if (tblk && !details) { + ref_blk = get_ldref(abs_ld, offset_from_ld); + display(0, 1, tblk, ref_blk); + } else { + display(0, 0, 0, 0); + } + block = saveblk; + } + } + if (j_inode != NULL) + inode_put(&j_inode); + brelse(j_bh); + blockhist = -1; /* So we don't print anything else */ + free(jbuf); + if (!termlines) + fflush(stdout); +} diff --git a/gfs2/edit/journal.h b/gfs2/edit/journal.h new file mode 100644 index 0000000..1e5968b --- /dev/null +++ b/gfs2/edit/journal.h @@ -0,0 +1,7 @@ +#ifndef __JOURNAL_DOT_H__ +#define __JOURNAL_DOT_H__ + +extern void dump_journal(const char *journal, int tblk); +extern uint64_t find_journal_block(const char *journal, uint64_t *j_size); + +#endif diff --git a/gfs2/edit/savemeta.c b/gfs2/edit/savemeta.c new file mode 100644 index 0000000..04f1221 --- /dev/null +++ b/gfs2/edit/savemeta.c @@ -0,0 +1,1247 @@ +#include "clusterautoconfig.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include "osi_list.h" +#include "gfs2hex.h" +#include "hexedit.h" +#include "libgfs2.h" + +#define DFT_SAVE_FILE "/tmp/gfsmeta.XXXXXX" +#define MAX_JOURNALS_SAVED 256 + +/* Header for the savemeta output file */ +struct savemeta_header { +#define SAVEMETA_MAGIC (0x01171970) + uint32_t sh_magic; +#define SAVEMETA_FORMAT (1) + uint32_t sh_format; /* In case we want to change the layout */ 
+	uint64_t sh_time; /* When savemeta was run */
+	uint64_t sh_fs_bytes; /* Size of the fs */
+	uint8_t __reserved[104];
+};
+
+/* Record header that precedes each saved block in the savemeta file.
+   save_bh() stores both fields big-endian and restore_block() converts
+   them back to host order. */
+struct saved_metablock {
+	uint64_t blk;
+	uint16_t siglen; /* significant data length */
+/* This needs to be packed because old versions of gfs2_edit read and write the
+   individual fields separately, so the hole after siglen must be eradicated
+   before the struct reflects what's on disk. */
+} __attribute__((__packed__));
+
+/* Handle for the savemeta output file, filled in by savemetaopen() */
+struct metafd {
+	int fd; /* descriptor of the output file */
+	gzFile gzfd; /* zlib stream on top of fd; only opened when gziplevel > 0 */
+	const char *filename; /* path of the output file */
+	int gziplevel; /* 0 = uncompressed, otherwise the gzip level */
+};
+
+static uint64_t blks_saved; /* running count of block records saved/restored */
+static uint64_t journal_blocks[MAX_JOURNALS_SAVED]; /* filled by get_journal_inode_blocks() */
+static uint64_t gfs1_journal_size = 0; /* in blocks */
+static int journals_found = 0; /* number of valid entries in journal_blocks[] */
+int print_level = MSG_NOTICE;
+extern char *device;
+
+/* Return TRUE if blk equals one of the journals_found addresses recorded
+   in journal_blocks[], otherwise FALSE. */
+static int block_is_a_journal(uint64_t blk)
+{
+	int j;
+
+	for (j = 0; j < journals_found; j++)
+		if (blk == journal_blocks[j])
+			return TRUE;
+	return FALSE;
+}
+
+/* Search tree of block addresses, keyed on per_node_node.block. It is
+   populated by init_per_node_lookup() and queried by block_is_in_per_node(). */
+struct osi_root per_node_tree;
+struct per_node_node {
+	struct osi_node node; /* must stay first: code casts osi_node * to per_node_node * */
+	uint64_t block;
+};
+
+/* Erase and free every node in per_node_tree. */
+static void destroy_per_node_lookup(void)
+{
+	struct osi_node *n;
+	struct per_node_node *pnp;
+
+	while ((n = osi_first(&per_node_tree))) {
+		pnp = (struct per_node_node *)n;
+		osi_erase(n, &per_node_tree);
+		free(pnp);
+	}
+}
+
+/* Walk per_node_tree from the root, going left for smaller and right for
+   larger keys. Returns 1 if blk is present, 0 if not. */
+static int block_is_in_per_node(uint64_t blk)
+{
+	struct per_node_node *pnp = (struct per_node_node *)per_node_tree.osi_node;
+
+	while (pnp) {
+		if (blk < pnp->block)
+			pnp = (struct per_node_node *)pnp->node.osi_left;
+		else if (blk > pnp->block)
+			pnp = (struct per_node_node *)pnp->node.osi_right;
+		else
+			return 1;
+	}
+
+	return 0;
+}
+
+/* Add blk to per_node_tree. Returns 0 on success or when blk is already
+   in the tree, 1 if the node cannot be allocated. */
+static int insert_per_node_lookup(uint64_t blk)
+{
+	struct osi_node **newn = &per_node_tree.osi_node, *parent = NULL;
+	struct per_node_node *pnp;
+
+	/* Descend to the empty slot where blk belongs, remembering its parent */
+	while (*newn) {
+		struct per_node_node *cur = (struct per_node_node *)*newn;
+
+		parent = *newn;
+		if (blk < cur->block)
+			newn = &((*newn)->osi_left);
+		else if (blk > cur->block)
+			newn = &((*newn)->osi_right);
+		else
+			return
0;
+	}
+
+	pnp = calloc(1, sizeof(struct per_node_node));
+	if (pnp == NULL) {
+		perror("Failed to insert per_node lookup entry");
+		return 1;
+	}
+	pnp->block = blk;
+	osi_link_node(&pnp->node, parent, newn);
+	osi_insert_color(&pnp->node, &per_node_tree);
+	return 0;
+}
+
+/* Load the per_node inode and insert the block address of every entry in
+   indirect->ii[].dirent[] into per_node_tree. Nothing is loaded for gfs1.
+   Returns 0 on success, non-zero if per_node cannot be read or an insert
+   fails. */
+static int init_per_node_lookup(void)
+{
+	int i;
+	struct gfs2_inode *per_node_di;
+
+	if (sbd.gfs1)
+		return FALSE;
+
+	per_node_di = lgfs2_inode_read(&sbd, masterblock("per_node"));
+	if (per_node_di == NULL) {
+		fprintf(stderr, "Failed to read per_node: %s\n", strerror(errno));
+		return 1;
+	}
+
+	/* NOTE(review): presumably this is what fills the global 'indirect'
+	   with per_node's entries - confirm against do_dinode_extended() */
+	do_dinode_extended(&per_node_di->i_di, per_node_di->i_bh);
+	inode_put(&per_node_di);
+
+	for (i = 0; i < indirect_blocks; i++) {
+		int d;
+		for (d = 0; d < indirect->ii[i].dirents; d++) {
+			int ret = insert_per_node_lookup(indirect->ii[i].dirent[d].block);
+			if (ret != 0)
+				return ret;
+		}
+	}
+	return 0;
+}
+
+/* True if blk is one of the system files whose contents are saved in full:
+   any of the block_is_*() checks, a journal, or an entry of per_node. */
+static int block_is_systemfile(uint64_t blk)
+{
+	return block_is_jindex(blk) || block_is_inum_file(blk) ||
+		block_is_statfs_file(blk) || block_is_quota_file(blk) ||
+		block_is_rindex(blk) || block_is_a_journal(blk) ||
+		block_is_per_node(blk) || block_is_in_per_node(blk);
+}
+
+/**
+ * anthropomorphize - make a uint64_t number more human
+ * @inhuman_value: a size in bytes
+ *
+ * Scales the value down by 1024 until it is below 1024 (or the largest unit,
+ * EB, is reached) and formats it with one decimal place, e.g. 1536 -> "1.5KB".
+ *
+ * Returns a pointer to a static buffer which is overwritten by the next call.
+ */
+static const char *anthropomorphize(unsigned long long inhuman_value)
+{
+	const char *symbols = " KMGTPE";
+	int i;
+	unsigned long long val = inhuman_value, remainder = 0;
+	static char out_val[32];
+
+	memset(out_val, 0, sizeof(out_val));
+	/* >= so that exactly 1024 is reported as 1.0KB, not 1024.0 B */
+	for (i = 0; i < 6 && val >= 1024; i++) {
+		remainder = val % 1024;
+		val /= 1024;
+	}
+	/* remainder counts 1024ths of the chosen unit; convert it to tenths
+	   so the digit after the point really is a decimal fraction */
+	snprintf(out_val, sizeof(out_val), "%llu.%llu%cB", val,
+		 (remainder * 10) / 1024, symbols[i]);
+	return out_val;
+}
+
+/* Decide how many bytes of the dinode block in bh are worth saving: the
+   whole block when it holds pointers, directory entries, a symlink or
+   system file data, otherwise only the dinode header. Returns 0 when the
+   inode cannot be read. */
+static size_t di_save_len(struct gfs2_buffer_head *bh, uint64_t owner)
+{
+	struct gfs2_inode *inode;
+	struct gfs2_dinode *dn;
+	size_t len;
+
+	if (sbd.gfs1)
+		inode = lgfs2_gfs_inode_get(&sbd, bh);
+	else
+		inode = lgfs2_inode_get(&sbd, bh);
+
+	if (inode == NULL) {
+		fprintf(stderr, "Error reading inode at %"PRIu64": %s\n",
bh->b_blocknr, strerror(errno));
+		return 0; /* Skip the block */
+	}
+	dn = &inode->i_di;
+	/* Default: only the dinode header itself is saved */
+	len = sizeof(struct gfs2_dinode);
+
+	/* Do not save (user) data from the inode block unless they are
+	   indirect pointers, dirents, symlinks or fs internal data */
+	if (dn->di_height != 0 ||
+	    S_ISDIR(dn->di_mode) ||
+	    S_ISLNK(dn->di_mode) ||
+	    (sbd.gfs1 && dn->__pad1 == GFS_FILE_DIR) ||
+	    block_is_systemfile(owner))
+		len = sbd.bsize;
+
+	inode_put(&inode);
+	return len;
+}
+
+/*
+ * get_gfs_struct_info - get block type and structure length
+ *
+ * @lbh - The block buffer to examine
+ * @owner - The block address of the parent structure
+ * @block_type - pointer to integer to hold the block type (may be NULL)
+ * @gstruct_len - pointer to integer to hold the structure length
+ *
+ * When the block has no GFS2_MAGIC, *block_type is set to 0 and *gstruct_len
+ * to the full block size before -1 is returned.
+ *
+ * returns: 0 if successful
+ *          -1 if this isn't gfs metadata.
+ */
+static int get_gfs_struct_info(struct gfs2_buffer_head *lbh, uint64_t owner,
+                               int *block_type, size_t *gstruct_len)
+{
+	struct gfs2_meta_header mh;
+
+	if (block_type != NULL)
+		*block_type = 0;
+	*gstruct_len = sbd.bsize;
+
+	gfs2_meta_header_in(&mh, lbh->b_data);
+	if (mh.mh_magic != GFS2_MAGIC)
+		return -1;
+
+	if (block_type != NULL)
+		*block_type = mh.mh_type;
+
+	/* Most types are saved whole; only SB, DI and LH get a shorter length */
+	switch (mh.mh_type) {
+	case GFS2_METATYPE_SB:   /* 1 (superblock) */
+		*gstruct_len = sizeof(struct gfs_sb);
+		break;
+	case GFS2_METATYPE_RG:   /* 2 (rsrc grp hdr) */
+		*gstruct_len = sbd.bsize; /*sizeof(struct gfs_rgrp);*/
+		break;
+	case GFS2_METATYPE_RB:   /* 3 (rsrc grp bitblk) */
+		*gstruct_len = sbd.bsize;
+		break;
+	case GFS2_METATYPE_DI:   /* 4 (disk inode) */
+		*gstruct_len = di_save_len(lbh, owner);
+		break;
+	case GFS2_METATYPE_IN:   /* 5 (indir inode blklst) */
+		*gstruct_len = sbd.bsize; /*sizeof(struct gfs_indirect);*/
+		break;
+	case GFS2_METATYPE_LF:   /* 6 (leaf dinode blklst) */
+		*gstruct_len = sbd.bsize; /*sizeof(struct gfs_leaf);*/
+		break;
+	case GFS2_METATYPE_JD:   /* 7 (journal data) */
+		*gstruct_len = sbd.bsize;
+		break;
+	case GFS2_METATYPE_LH:   /* 8 (log
header) */
+		if (sbd.gfs1)
+			*gstruct_len = 512; /* gfs copies the log header
+					       twice and compares the copy,
+					       so we need to save all 512
+					       bytes of it. */
+		else
+			*gstruct_len = sizeof(struct gfs2_log_header);
+		break;
+	case GFS2_METATYPE_LD:   /* 9 (log descriptor) */
+		*gstruct_len = sbd.bsize;
+		break;
+	case GFS2_METATYPE_EA:   /* 10 (extended attr hdr) */
+		*gstruct_len = sbd.bsize;
+		break;
+	case GFS2_METATYPE_ED:   /* 11 (extended attr data) */
+		*gstruct_len = sbd.bsize;
+		break;
+	default:
+		*gstruct_len = sbd.bsize;
+		break;
+	}
+	return 0;
+}
+
+/* Put out a warm, fuzzy message every second so the user */
+/* doesn't think we hung.  (This may take a long time).   */
+/* The clock is read on every call; a progress line is    */
+/* printed only when the second has changed since the     */
+/* last report, or when force is set (which also ends the */
+/* line with a newline). Nothing is printed while         */
+/* sbd.fssize is zero.                                     */
+static void warm_fuzzy_stuff(uint64_t wfsblock, int force)
+{
+	static struct timeval tv;
+	static uint32_t seconds = 0;
+
+	gettimeofday(&tv, NULL);
+	if (!seconds)
+		seconds = tv.tv_sec;
+	if (force || tv.tv_sec - seconds) {
+		static uint64_t percent;
+
+		seconds = tv.tv_sec;
+		if (sbd.fssize) {
+			/* "\r" rewrites the same terminal line on each report */
+			printf("\r");
+			percent = (wfsblock * 100) / sbd.fssize;
+			printf("%llu blocks processed, %llu saved (%llu%%)",
+			       (unsigned long long)wfsblock,
+			       (unsigned long long)blks_saved,
+			       (unsigned long long)percent);
+			if (force)
+				printf("\n");
+			fflush(stdout);
+		}
+	}
+}
+
+/**
+ * Open a file and prepare it for writing by savemeta()
+ * out_fn: the path to the file, which will be truncated if it exists;
+ *         NULL means create a temporary file from DFT_SAVE_FILE
+ * gziplevel: 0   - do not compress the file,
+ *            1-9 - use gzip compression level 1-9
+ * Returns a struct metafd containing the opened file descriptor.
+ * Exits the program if the file cannot be opened or truncated.
+ */
+static struct metafd savemetaopen(char *out_fn, int gziplevel)
+{
+	struct metafd mfd = {-1, NULL, NULL, gziplevel};
+	char gzmode[3] = "w9";
+	char dft_fn[] = DFT_SAVE_FILE;
+	/* Tighten the umask while the output file is created; restored below */
+	mode_t mask = umask(S_IXUSR | S_IRWXG | S_IRWXO);
+
+	if (!out_fn) {
+		out_fn = dft_fn;
+		mfd.fd = mkstemp(out_fn);
+	} else {
+		mfd.fd = open(out_fn, O_RDWR | O_CREAT, 0644);
+	}
+	umask(mask);
+
+	if (mfd.fd < 0) {
+		fprintf(stderr, "Can't open %s: %s\n", out_fn, strerror(errno));
+		exit(1);
+	}
+
+	/* out_fn may point at dft_fn, which lives in this function's stack
+	   frame, so give the caller a heap copy that is still valid after we
+	   return. The copy is never freed; savemeta() exits when it is done. */
+	mfd.filename = strdup(out_fn);
+	if (mfd.filename == NULL) {
+		perror("Failed to save output file name");
+		exit(1);
+	}
+
+	if (ftruncate(mfd.fd, 0)) {
+		fprintf(stderr, "Can't truncate %s: %s\n", out_fn, strerror(errno));
+		exit(1);
+	}
+
+	if (gziplevel > 0) {
+		/* Turn "w9" into "w<level>" */
+		gzmode[1] = '0' + gziplevel;
+		mfd.gzfd = gzdopen(mfd.fd, gzmode);
+		if (!mfd.gzfd) {
+			fprintf(stderr, "gzdopen error: %s\n", strerror(errno));
+			exit(1);
+		}
+	}
+
+	return mfd;
+}
+
+/**
+ * Write nbyte bytes from buf to a file opened with savemetaopen()
+ * mfd: the file descriptor opened using savemetaopen()
+ * buf: the buffer to write data from
+ * nbyte: the number of bytes to write
+ * Returns the number of bytes written from buf. Any value other than nbyte
+ * means failure: write() reports -1, gzwrite() reports 0.
+ */
+static ssize_t savemetawrite(struct metafd *mfd, const void *buf, size_t nbyte)
+{
+	ssize_t ret;
+	int gzerr;
+	const char *gzerrmsg;
+
+	if (mfd->gziplevel == 0) {
+		return write(mfd->fd, buf, nbyte);
+	}
+
+	ret = gzwrite(mfd->gzfd, buf, nbyte);
+	if (ret != nbyte) {
+		gzerrmsg = gzerror(mfd->gzfd, &gzerr);
+		/* For Z_ERRNO the caller reports strerror(errno) itself */
+		if (gzerr != Z_ERRNO) {
+			fprintf(stderr, "Error: zlib: %s\n", gzerrmsg);
+		}
+	}
+	return ret;
+}
+
+/**
+ * Closes a file descriptor previously opened using savemetaopen()
+ * mfd: the file descriptor previously opened using savemetaopen()
+ * Returns 0 on success or -1 on error
+ */
+static int savemetaclose(struct metafd *mfd)
+{
+	int gzret;
+
+	if (mfd->gziplevel > 0) {
+		gzret = gzclose(mfd->gzfd);
+		if (gzret == Z_STREAM_ERROR) {
+			fprintf(stderr, "gzclose: file is not valid\n");
+			return -1;
+		}
+		/* The stream was created with gzdopen(), so gzclose() has
+		   already closed mfd->fd. Calling close() on it again would
+		   fail with EBADF, or close an unrelated descriptor that
+		   reused the number. */
+		mfd->fd = -1;
+		return (gzret == Z_OK) ? 0 : -1;
+	}
+	return close(mfd->fd);
+}
+
+/* Append one block to the savemeta file as a saved_metablock header followed
+   by the block's significant bytes (trailing zeroes are trimmed). Blocks
+   that are not gfs metadata and do not belong to a system file are skipped.
+   *blktype, when not NULL, receives the metadata type (0 if none).
+   Returns 0; exits the program on allocation or write failure. */
+static int save_bh(struct metafd *mfd, struct gfs2_buffer_head *savebh, uint64_t owner, int *blktype)
+{
+	struct saved_metablock *savedata;
+	size_t blklen;
+	size_t outsz;
+
+	/* If this isn't metadata and isn't a
system file, we don't want it.
+	   Note that we're checking "owner" here rather than blk.  That's
+	   because we want to know if the source inode is a system inode
+	   not the block within the inode "blk".  They may or may not
+	   be the same thing. */
+	if (get_gfs_struct_info(savebh, owner, blktype, &blklen) &&
+	    !block_is_systemfile(owner) && owner != 0)
+		return 0; /* Not metadata, and not system file, so skip it */
+
+	/* No need to save trailing zeroes */
+	for (; blklen > 0 && savebh->b_data[blklen - 1] == '\0'; blklen--);
+
+	if (blklen == 0) /* No significant data; skip. */
+		return 0;
+
+	/* One buffer holds the record header immediately followed by the data */
+	outsz = sizeof(*savedata) + blklen;
+	savedata = calloc(1, outsz);
+	if (savedata == NULL) {
+		perror("Failed to save block");
+		exit(1);
+	}
+	savedata->blk = cpu_to_be64(savebh->b_blocknr);
+	savedata->siglen = cpu_to_be16(blklen);
+	memcpy(savedata + 1, savebh->b_data, blklen);
+
+	if (savemetawrite(mfd, savedata, outsz) != outsz) {
+		/* Report the host-order block number; savedata->blk has
+		   already been converted to big-endian for the file. */
+		fprintf(stderr, "write error: %s from %s:%d: block %lld (0x%llx)\n",
+		        strerror(errno), __FUNCTION__, __LINE__,
+			(unsigned long long)savebh->b_blocknr,
+			(unsigned long long)savebh->b_blocknr);
+		free(savedata);
+		exit(-1);
+	}
+
+	blks_saved++;
+	free(savedata);
+	return 0;
+}
+
+/* Read block blk from the file system and pass it to save_bh().
+   fd:      unused here; the block is read through bread(&sbd, ...)
+   owner:   block address of the structure that referenced blk
+   blktype: if not NULL, receives the metadata type of the block, or 0
+            when the block was not examined (bad pointer or failed read)
+   Returns 0 when the block was saved or deliberately skipped, 1 if it
+   could not be read. */
+static int save_block(int fd, struct metafd *mfd, uint64_t blk, uint64_t owner, int *blktype)
+{
+	struct gfs2_buffer_head *savebh;
+	int err;
+
+	/* Callers test *blktype after every call, including the early
+	   returns below, so it must never be left uninitialised. */
+	if (blktype != NULL)
+		*blktype = 0;
+
+	if (gfs2_check_range(&sbd, blk) && blk != LGFS2_SB_ADDR(&sbd)) {
+		fprintf(stderr, "\nWarning: bad block pointer '0x%llx' "
+			"ignored in block (block %llu (0x%llx))",
+			(unsigned long long)blk,
+			(unsigned long long)owner, (unsigned long long)owner);
+		return 0;
+	}
+	savebh = bread(&sbd, blk);
+	if (savebh == NULL)
+		return 1;
+	err = save_bh(mfd, savebh, owner, blktype);
+	brelse(savebh);
+	return err;
+}
+
+/*
+ * save_ea_block - save off an extended attribute block
+ *
+ * Walks the EA records in metabh and saves every block that the records'
+ * pointers refer to.
+ */
+static void save_ea_block(struct metafd *mfd, struct gfs2_buffer_head *metabh, uint64_t owner)
+{
+	int e;
+	struct gfs2_ea_header ea;
+
+	for (e =
sizeof(struct gfs2_meta_header); e < sbd.bsize; e += ea.ea_rec_len) {
+		uint64_t blk, *b;
+		int charoff, i;
+
+		/* A record header that would run past the end of the block
+		   cannot be valid; stop instead of reading beyond the buffer. */
+		if (e + sizeof(struct gfs2_ea_header) > sbd.bsize)
+			break;
+
+		gfs2_ea_header_in(&ea, metabh->b_data + e);
+		for (i = 0; i < ea.ea_num_ptrs; i++) {
+			/* Pointers follow the header and name, rounded up to
+			   the next 8-byte boundary; charoff is that offset in
+			   units of uint64_t. */
+			charoff = e + ea.ea_name_len +
+				sizeof(struct gfs2_ea_header) +
+				sizeof(uint64_t) - 1;
+			charoff /= sizeof(uint64_t);
+			/* Corrupt ea_name_len/ea_num_ptrs must not take us
+			   outside the block */
+			if ((charoff + i + 1) * sizeof(uint64_t) > sbd.bsize)
+				break;
+			b = (uint64_t *)(metabh->b_data);
+			b += charoff + i;
+			blk = be64_to_cpu(*b);
+			save_block(sbd.device_fd, mfd, blk, owner, NULL);
+		}
+		/* A zero record length would never advance e */
+		if (!ea.ea_rec_len)
+			break;
+	}
+}
+
+/*
+ * save_indirect_blocks - save all indirect blocks for the given buffer
+ *
+ * mfd      - destination file
+ * cur_list - list that receives the buffers of the blocks found here, so the
+ *            caller can descend into them at the next height
+ * mybh     - the buffer whose pointers are to be followed
+ * owner    - block address of the inode that owns mybh
+ * height   - the height at which to stop descending
+ * hgt      - the height currently being processed (1 = pointers that live
+ *            in the dinode block itself)
+ */
+static void save_indirect_blocks(struct metafd *mfd, osi_list_t *cur_list,
+           struct gfs2_buffer_head *mybh, uint64_t owner, int height, int hgt)
+{
+	uint64_t old_block = 0, indir_block;
+	uint64_t *ptr;
+	int head_size, blktype;
+	struct gfs2_buffer_head *nbh;
+
+	/* Pointers start after the dinode at height 1, after a plain
+	   metadata header on indirect blocks */
+	head_size = (hgt > 1 ?
+		     sizeof(struct gfs2_meta_header) :
+		     sizeof(struct gfs2_dinode));
+
+	for (ptr = (uint64_t *)(mybh->b_data + head_size);
+	     (char *)ptr < (mybh->b_data + sbd.bsize); ptr++) {
+		if (!*ptr)
+			continue;
+		indir_block = be64_to_cpu(*ptr);
+		/* Skip consecutive duplicate pointers */
+		if (indir_block == old_block)
+			continue;
+		old_block = indir_block;
+		/* save_block() can return before it stores a type, so start
+		   from "not metadata" rather than an indeterminate value */
+		blktype = 0;
+		save_block(sbd.device_fd, mfd, indir_block, owner, &blktype);
+		if (blktype == GFS2_METATYPE_EA) {
+			nbh = bread(&sbd, indir_block);
+			if (nbh != NULL) {
+				save_ea_block(mfd, nbh, owner);
+				brelse(nbh);
+			}
+		}
+		if (height != hgt && /* If not at max height and */
+		    (!gfs2_check_range(&sbd, indir_block))) {
+			nbh = bread(&sbd, indir_block);
+			if (nbh != NULL)
+				osi_list_add_prev(&nbh->b_altlist, cur_list);
+			/* The buffer_head needs to be queued ahead, so
+			   don't release it!
+			   brelse(nbh);*/
+		}
+	} /* for all data on the indirect block */
+}
+
+/* Save the directory leaf block blk and every leaf chained to it through
+   lf_next. Blocks that do not pass gfs2_check_meta() as leaves are not
+   saved, but their lf_next field is still followed. The walk stops at an
+   lf_next of zero or at an address that fails gfs2_check_range().
+   Returns 0 on success, non-zero if a block cannot be read or saved. */
+static int save_leaf_chain(struct metafd *mfd, struct gfs2_sbd *sdp, uint64_t blk)
+{
+	struct gfs2_buffer_head *bh;
+	struct gfs2_leaf leaf;
+
+	do {
+		if (gfs2_check_range(sdp, blk) != 0)
+			return 0;
+		bh = bread(sdp, blk);
+		if (bh == NULL) {
+			perror("Failed to read leaf block");
+			return 1;
+		}
+		warm_fuzzy_stuff(blk, FALSE);
+		if (gfs2_check_meta(bh, GFS2_METATYPE_LF) == 0) {
+			int ret = save_bh(mfd, bh, blk, NULL);
+			if (ret != 0) {
+				brelse(bh);
+				return ret;
+			}
+		}
+		gfs2_leaf_in(&leaf, bh->b_data);
+		brelse(bh);
+		/* A leaf whose lf_next points back at itself would make
+		   this loop spin forever on a damaged file system */
+		if (leaf.lf_next == blk)
+			break;
+		blk = leaf.lf_next;
+	} while (leaf.lf_next != 0);
+
+	return 0;
+}
+
+/*
+ * save_inode_data - save off important data associated with an inode
+ *
+ * mfd - destination file descriptor
+ * iblk - block number of the inode to save the data for
+ *
+ * For user files, we don't want anything except all the indirect block
+ * pointers that reside on blocks on all but the highest height.
+ *
+ * For system files like statfs and inum, we want everything because they
+ * may contain important clues and no user data.
+ *
+ * For file system journals, the "data" is a mixture of metadata and
+ * journaled data.  We want all the metadata and none of the user data.
+ *
+ * Exits the program if the inode block cannot be read.
+ */
+static void save_inode_data(struct metafd *mfd, uint64_t iblk)
+{
+	uint32_t height;
+	struct gfs2_inode *inode;
+	osi_list_t metalist[GFS2_MAX_META_HEIGHT];
+	osi_list_t *prev_list, *cur_list, *tmp;
+	struct gfs2_buffer_head *metabh, *mybh;
+	int i;
+
+	for (i = 0; i < GFS2_MAX_META_HEIGHT; i++)
+		osi_list_init(&metalist[i]);
+	metabh = bread(&sbd, iblk);
+	/* The inode getters below dereference the buffer, so bail out
+	   the same way as for a failed inode read */
+	if (metabh == NULL) {
+		perror("Failed to read inode");
+		exit(-1);
+	}
+	if (sbd.gfs1) {
+		inode = lgfs2_gfs_inode_get(&sbd, metabh);
+	} else {
+		inode = lgfs2_inode_get(&sbd, metabh);
+	}
+	if (inode == NULL) {
+		perror("Failed to read inode");
+		exit(-1);
+	}
+	height = inode->i_di.di_height;
+	/* If this is a user inode, we don't follow to the file height.
+	   We stop one level less.
That way we save off the indirect
+	   pointer blocks but not the actual file contents.  The exception
+	   is directories, where the height represents the level at which
+	   the hash table exists, and we have to save the directory data. */
+	if (inode->i_di.di_flags & GFS2_DIF_EXHASH &&
+	    (S_ISDIR(inode->i_di.di_mode) ||
+	     (sbd.gfs1 && inode->i_di.__pad1 == GFS_FILE_DIR)))
+		height++;
+	else if (height && !(inode->i_di.di_flags & GFS2_DIF_SYSTEM) &&
+		 !block_is_systemfile(iblk) && !S_ISDIR(inode->i_di.di_mode))
+		height--;
+	/* metalist[] only has GFS2_MAX_META_HEIGHT slots. di_height comes
+	   straight from disk and may be corrupt, so it must not be allowed
+	   to drive the loop below past the end of the array. */
+	if (height >= GFS2_MAX_META_HEIGHT)
+		height = GFS2_MAX_META_HEIGHT - 1;
+	/* Height 0 is the dinode block itself */
+	osi_list_add(&metabh->b_altlist, &metalist[0]);
+	for (i = 1; i <= height; i++){
+		prev_list = &metalist[i - 1];
+		cur_list = &metalist[i];
+
+		/* Follow the pointers in every buffer queued at the previous
+		   height; save_indirect_blocks() queues the next height on
+		   cur_list */
+		for (tmp = prev_list->next; tmp != prev_list; tmp = tmp->next){
+			mybh = osi_list_entry(tmp, struct gfs2_buffer_head,
+					      b_altlist);
+			warm_fuzzy_stuff(iblk, FALSE);
+			save_indirect_blocks(mfd, cur_list, mybh, iblk,
+					     height, i);
+		} /* for blocks at that height */
+	} /* for height */
+	/* free metalists */
+	for (i = 0; i < GFS2_MAX_META_HEIGHT; i++) {
+		cur_list = &metalist[i];
+		while (!osi_list_empty(cur_list)) {
+			mybh = osi_list_entry(cur_list->next,
+					    struct gfs2_buffer_head,
+					    b_altlist);
+			/* The inode's own buffer is still needed below, so
+			   it is only unlinked here, not released */
+			if (mybh == inode->i_bh)
+				osi_list_del(&mybh->b_altlist);
+			else
+				brelse(mybh);
+		}
+	}
+	/* Process directory exhash inodes */
+	if (S_ISDIR(inode->i_di.di_mode) &&
+	    inode->i_di.di_flags & GFS2_DIF_EXHASH) {
+		uint64_t  leaf_no, old_leaf = -1;
+		int li;
+
+		for (li = 0; li < (1 << inode->i_di.di_depth); li++) {
+			if (lgfs2_get_leaf_ptr(inode, li, &leaf_no)) {
+				fprintf(stderr, "Could not read leaf index %d in dinode %"PRIu64"\n", li,
+				        (uint64_t)inode->i_di.di_num.no_addr);
+				exit(-1);
+			}
+			/* Consecutive hash table slots often share a leaf;
+			   save each run only once */
+			if (leaf_no != old_leaf && save_leaf_chain(mfd, &sbd, leaf_no) != 0)
+				exit(-1);
+			old_leaf = leaf_no;
+		}
+	}
+	if (inode->i_di.di_eattr) { /* if this inode has extended attributes */
+		struct gfs2_meta_header mh;
+		struct gfs2_buffer_head *lbh;
+
+		lbh = bread(&sbd, inode->i_di.di_eattr);
+		save_block(sbd.device_fd, mfd,
inode->i_di.di_eattr, iblk, NULL);
+		gfs2_meta_header_in(&mh, lbh->b_data);
+		/* di_eattr may point at an EA block directly or at an
+		   indirect block of EA pointers; anything else is reported
+		   as corrupt */
+		if (mh.mh_magic == GFS2_MAGIC &&
+		    mh.mh_type == GFS2_METATYPE_EA)
+			save_ea_block(mfd, lbh, iblk);
+		else if (mh.mh_magic == GFS2_MAGIC &&
+			 mh.mh_type == GFS2_METATYPE_IN)
+			/* height == hgt, so nothing is queued on cur_list */
+			save_indirect_blocks(mfd, cur_list, lbh, iblk, 2, 2);
+		else {
+			if (mh.mh_magic == GFS2_MAGIC) /* if it's metadata */
+				save_block(sbd.device_fd, mfd, inode->i_di.di_eattr,
+				           iblk, NULL);
+			fprintf(stderr,
+				"\nWarning: corrupt extended "
+				"attribute at block %llu (0x%llx) "
+				"detected in inode %lld (0x%llx).\n",
+				(unsigned long long)inode->i_di.di_eattr,
+				(unsigned long long)inode->i_di.di_eattr,
+				(unsigned long long)iblk,
+				(unsigned long long)iblk);
+		}
+		brelse(lbh);
+	}
+	inode_put(&inode);
+	brelse(metabh);
+}
+
+/* Reset journal_blocks[] and journals_found, then record the block address
+   of each journal. For gfs1 the addresses are read from the jindex file and
+   gfs1_journal_size is set as a side effect; for gfs2 they are taken from
+   the entries in indirect->ii[0] (presumably the jindex directory loaded by
+   the caller - TODO confirm). */
+static void get_journal_inode_blocks(void)
+{
+	int journal;
+
+	journals_found = 0;
+	memset(journal_blocks, 0, sizeof(journal_blocks));
+	/* Save off all the journals--but only the metadata.
+	 * This is confusing so I'll explain.  The journals contain important
+	 * metadata.  However, in gfs2 the journals are regular files within
+	 * the system directory.  Since they're regular files, the blocks
+	 * within the journals are considered data, not metadata.  Therefore,
+	 * they won't have been saved by the code above.  We want to dump
+	 * these blocks, but we have to be careful.  We only care about the
+	 * journal blocks that look like metadata, and we need to not save
+	 * journaled user data that may exist there as well.
*/ + for (journal = 0; ; journal++) { /* while journals exist */ + uint64_t jblock; + int amt; + struct gfs2_inode *j_inode = NULL; + + if (sbd.gfs1) { + struct gfs_jindex ji; + char jbuf[sizeof(struct gfs_jindex)]; + + j_inode = lgfs2_gfs_inode_read(&sbd, + sbd1->sb_jindex_di.no_addr); + if (j_inode == NULL) { + fprintf(stderr, "Error reading journal inode: %s\n", strerror(errno)); + return; + } + amt = gfs2_readi(j_inode, (void *)&jbuf, + journal * sizeof(struct gfs_jindex), + sizeof(struct gfs_jindex)); + inode_put(&j_inode); + if (!amt) + break; + gfs_jindex_in(&ji, jbuf); + jblock = ji.ji_addr; + gfs1_journal_size = (uint64_t)ji.ji_nsegment * 16; + } else { + if (journal > indirect->ii[0].dirents - 3) + break; + jblock = indirect->ii[0].dirent[journal + 2].block; + } + journal_blocks[journals_found++] = jblock; + } +} + +static void save_allocated(struct rgrp_tree *rgd, struct metafd *mfd) +{ + int blktype; + uint64_t blk = 0; + unsigned i, j, m; + uint64_t *ibuf = malloc(sbd.bsize * GFS2_NBBY * sizeof(uint64_t)); + + for (i = 0; i < rgd->ri.ri_length; i++) { + m = lgfs2_bm_scan(rgd, i, ibuf, GFS2_BLKST_DINODE); + + for (j = 0; j < m; j++) { + blk = ibuf[j]; + warm_fuzzy_stuff(blk, FALSE); + save_block(sbd.device_fd, mfd, blk, blk, &blktype); + if (blktype == GFS2_METATYPE_DI) + save_inode_data(mfd, blk); + } + + if (!sbd.gfs1) + continue; + + /* For gfs1, Save off the free/unlinked meta blocks too. + * If we don't, we may run into metadata allocation issues. */ + m = lgfs2_bm_scan(rgd, i, ibuf, GFS2_BLKST_UNLINKED); + for (j = 0; j < m; j++) { + save_block(sbd.device_fd, mfd, blk, blk, NULL); + } + } + free(ibuf); +} + +/* We don't use gfs2_rgrp_read() here as it checks for metadata sanity and we + want to save rgrp headers even if they're corrupt. 
 */
+/* Read the ri_length blocks of resource group rgd, starting at ri_addr,
+   attach them to rgd->bits[] and decode the header from the first block
+   into rgd->rg.
+   Returns 0 on success, -1 if the length is zero, the address is out of
+   range, the temporary array cannot be allocated or breadm() fails. */
+static int rgrp_read(struct gfs2_sbd *sdp, struct rgrp_tree *rgd)
+{
+	unsigned x, length = rgd->ri.ri_length;
+	struct gfs2_buffer_head **bhs;
+
+	if (length == 0 || gfs2_check_range(sdp, rgd->ri.ri_addr))
+		return -1;
+
+	bhs = calloc(length, sizeof(struct gfs2_buffer_head *));
+	if (bhs == NULL)
+		return -1;
+
+	if (breadm(sdp, bhs, length, rgd->ri.ri_addr)) {
+		free(bhs);
+		return -1;
+	}
+	/* The buffers now belong to rgd; only the pointer array is freed */
+	for (x = 0; x < length; x++)
+		rgd->bits[x].bi_bh = bhs[x];
+
+	if (sdp->gfs1)
+		gfs_rgrp_in((struct gfs_rgrp *)&rgd->rg, rgd->bits[0].bi_bh);
+	else
+		gfs2_rgrp_in(&rgd->rg, rgd->bits[0].bi_bh->b_data);
+	free(bhs);
+	return 0;
+}
+
+/* Save the header and bitmap blocks of resource group rgd and, when
+   withcontents is non-zero, the metadata allocated inside it. A resource
+   group that cannot be read is silently skipped. */
+static void save_rgrp(struct metafd *mfd, struct rgrp_tree *rgd, int withcontents)
+{
+	uint64_t addr = rgd->ri.ri_addr;
+	uint32_t i;
+
+	if (rgrp_read(&sbd, rgd))
+		return;
+	log_debug("RG at %"PRIu64" (0x%"PRIx64") is %u long\n",
+		  addr, addr, rgd->ri.ri_length);
+	/* Save the rg and bitmaps */
+	for (i = 0; i < rgd->ri.ri_length; i++) {
+		warm_fuzzy_stuff(rgd->ri.ri_addr + i, FALSE);
+		/* owner 0: save_bh() keeps these blocks even when they do
+		   not look like metadata */
+		save_bh(mfd, rgd->bits[i].bi_bh, 0, NULL);
+	}
+	/* Save the other metadata: inodes, etc.
if mode is not 'savergs' */
+	if (withcontents)
+		save_allocated(rgd, mfd);
+	gfs2_rgrp_relse(rgd);
+}
+
+/* Write the savemeta file header (magic, format, current time and the file
+   system size in bytes, all big-endian) to mfd.
+   Returns 0 on success, -1 if the header could not be written in full. */
+static int save_header(struct metafd *mfd, uint64_t fsbytes)
+{
+	struct savemeta_header smh = {
+		.sh_magic = cpu_to_be32(SAVEMETA_MAGIC),
+		.sh_format = cpu_to_be32(SAVEMETA_FORMAT),
+		.sh_time = cpu_to_be64(time(NULL)),
+		.sh_fs_bytes = cpu_to_be64(fsbytes)
+	};
+
+	if (savemetawrite(mfd, (char *)(&smh), sizeof(smh)) != sizeof(smh))
+		return -1;
+	return 0;
+}
+
+/* Read the header from the start of gzin_fd and convert it to host order.
+   Returns 0 on success, 1 if the file is too short to hold a header (smh
+   is left untouched) and -1 on a read error. */
+static int read_header(gzFile gzin_fd, struct savemeta_header *smh)
+{
+	int rs; /* gzread() returns int, with -1 meaning error */
+	struct savemeta_header smh_be = {0};
+
+	gzseek(gzin_fd, 0, SEEK_SET);
+	rs = gzread(gzin_fd, &smh_be, sizeof(smh_be));
+	if (rs < 0) {
+		perror("Failed to read savemeta file header");
+		return -1;
+	}
+	if ((size_t)rs != sizeof(smh_be))
+		return 1;
+
+	smh->sh_magic = be32_to_cpu(smh_be.sh_magic);
+	smh->sh_format = be32_to_cpu(smh_be.sh_format);
+	smh->sh_time = be64_to_cpu(smh_be.sh_time);
+	smh->sh_fs_bytes = be64_to_cpu(smh_be.sh_fs_bytes);
+
+	return 0;
+}
+
+/* Validate a header obtained from read_header() and print its contents.
+   Returns 0 if the magic matches and the format is one we understand,
+   otherwise -1 (nothing is printed). */
+static int check_header(struct savemeta_header *smh)
+{
+	time_t saved_at;
+	const char *when;
+
+	if (smh->sh_magic != SAVEMETA_MAGIC || smh->sh_format > SAVEMETA_FORMAT)
+		return -1;
+	/* sh_time is always 64 bits wide but time_t need not be, so convert
+	   by value instead of reinterpreting the field through a pointer */
+	saved_at = (time_t)smh->sh_time;
+	when = ctime(&saved_at);
+	printf("Metadata saved at %s", when ? when : "(unknown)\n"); /* ctime() adds \n */
+	printf("File system size %s\n", anthropomorphize(smh->sh_fs_bytes));
+	return 0;
+}
+
+/* Save the metadata of the file system described by the global sbd.
+   out_fn:     output file, or NULL for a temporary file
+   saveoption: 2 saves only the resource group headers and bitmaps; any
+               other value also saves the metadata inside each group
+   gziplevel:  0 for no compression, otherwise the gzip level
+   This function does not return; it exits the program when finished. */
+void savemeta(char *out_fn, int saveoption, int gziplevel)
+{
+	uint64_t jindex_block;
+	struct gfs2_buffer_head *lbh;
+	struct metafd mfd;
+	struct osi_node *n;
+	int err = 0;
+
+	sbd.md.journals = 1;
+
+	mfd = savemetaopen(out_fn, gziplevel);
+
+	blks_saved = 0;
+	if (sbd.gfs1)
+		sbd.bsize = sbd.sd_sb.sb_bsize;
+	printf("There are %llu blocks of %u bytes in the filesystem.\n",
+	       (unsigned long long)sbd.fssize, sbd.bsize);
+	if (sbd.gfs1)
+		jindex_block = sbd1->sb_jindex_di.no_addr;
+	else
+		jindex_block = masterblock("jindex");
+	lbh = bread(&sbd, jindex_block);
+	if (lbh == NULL) {
+		perror("Failed to read jindex");
+		exit(1);
+	}
+	gfs2_dinode_in(&di, lbh->b_data);
+	if (!sbd.gfs1)
+		do_dinode_extended(&di,
lbh);
+	brelse(lbh);
+
+	printf("Filesystem size: %s\n", anthropomorphize(sbd.fssize * sbd.bsize));
+	/* Both lookups must be ready before any block is saved, because
+	   block_is_systemfile() consults them */
+	get_journal_inode_blocks();
+
+	err = init_per_node_lookup();
+	if (err)
+		exit(1);
+
+	/* Write the savemeta file header */
+	err = save_header(&mfd, sbd.fssize * sbd.bsize);
+	if (err) {
+		perror("Failed to write metadata file header");
+		exit(1);
+	}
+	/* Save off the superblock */
+	save_block(sbd.device_fd, &mfd, GFS2_SB_ADDR * GFS2_BASIC_BLOCK / sbd.bsize, 0, NULL);
+	/* If this is gfs1, save off the rindex because it's not
+	   part of the file system as it is in gfs2. */
+	if (sbd.gfs1) {
+		uint64_t blk;
+		int j;
+
+		blk = sbd1->sb_rindex_di.no_addr;
+		save_block(sbd.device_fd, &mfd, blk, blk, NULL);
+		save_inode_data(&mfd, blk);
+		/* In GFS1, journals aren't part of the RG space */
+		for (j = 0; j < journals_found; j++) {
+			log_debug("Saving journal #%d\n", j + 1);
+			/* Each journal is gfs1_journal_size consecutive
+			   blocks starting at its recorded address */
+			for (blk = journal_blocks[j];
+			     blk < journal_blocks[j] + gfs1_journal_size;
+			     blk++)
+				save_block(sbd.device_fd, &mfd, blk, blk, NULL);
+		}
+	}
+	/* Walk through the resource groups saving everything within */
+	for (n = osi_first(&sbd.rgtree); n; n = osi_next(n)) {
+		struct rgrp_tree *rgd;
+
+		rgd = (struct rgrp_tree *)n;
+		/* saveoption 2 limits the save to rgrp headers and bitmaps */
+		save_rgrp(&mfd, rgd, (saveoption != 2));
+	}
+	/* Clean up */
+	/* There may be a gap between end of file system and end of device */
+	/* so we tell the user that we've processed everything.
*/ + warm_fuzzy_stuff(sbd.fssize, TRUE); + printf("\nMetadata saved to file %s ", mfd.filename); + if (mfd.gziplevel) { + printf("(gzipped, level %d).\n", mfd.gziplevel); + } else { + printf("(uncompressed).\n"); + } + savemetaclose(&mfd); + close(sbd.device_fd); + destroy_per_node_lookup(); + free(indirect); + gfs2_rgrp_free(&sbd.rgtree); + exit(0); +} + +static off_t restore_init(gzFile gzfd, struct savemeta_header *smh) +{ + int err; + unsigned i; + size_t rs; + char buf[256]; + off_t startpos = 0; + struct gfs2_meta_header sbmh; + + err = read_header(gzfd, smh); + if (err < 0) { + exit(1); + } else if (check_header(smh) != 0) { + printf("No valid file header found. Falling back to old format...\n"); + } else if (err == 0) { + startpos = sizeof(*smh); + } + + gzseek(gzfd, startpos, SEEK_SET); + rs = gzread(gzfd, buf, sizeof(buf)); + if (rs != sizeof(buf)) { + fprintf(stderr, "Error: File is too small.\n"); + exit(1); + } + /* Scan for the beginning of the file body. Required to support old formats(?). */ + for (i = 0; i < (256 - sizeof(struct saved_metablock) - sizeof(sbmh)); i++) { + off_t off = i + sizeof(struct saved_metablock); + + memcpy(&sbmh, &buf[off], sizeof(sbmh)); + if (sbmh.mh_magic == cpu_to_be32(GFS2_MAGIC) && + sbmh.mh_type == cpu_to_be32(GFS2_METATYPE_SB)) + break; + } + if (i == (sizeof(buf) - sizeof(struct saved_metablock) - sizeof(sbmh))) + i = 0; + return startpos + i; /* File offset of saved sb */ +} + + +static int restore_block(gzFile gzfd, struct saved_metablock *svb, char *buf, uint16_t maxlen) +{ + int gzerr; + int ret; + uint16_t checklen; + const char *errstr; + + ret = gzread(gzfd, svb, sizeof(*svb)); + if (ret < sizeof(*svb)) { + goto gzread_err; + } + svb->blk = be64_to_cpu(svb->blk); + svb->siglen = be16_to_cpu(svb->siglen); + + if (sbd.fssize && svb->blk >= sbd.fssize) { + fprintf(stderr, "Error: File system is too small to restore this metadata.\n"); + fprintf(stderr, "File system is %llu blocks. 
Restore block = %llu\n",
+		        (unsigned long long)sbd.fssize, (unsigned long long)svb->blk);
+		return -1;
+	}
+
+	/* Without a caller-supplied limit, fall back to the block size */
+	if (maxlen)
+		checklen = maxlen;
+	else
+		checklen = sbd.bsize;
+
+	if (checklen && svb->siglen > checklen) {
+		fprintf(stderr, "Bad record length: %u for block %"PRIu64" (0x%"PRIx64").\n",
+			svb->siglen, svb->blk, svb->blk);
+		return -1;
+	}
+
+	if (buf != NULL && maxlen != 0) {
+		ret = gzread(gzfd, buf, svb->siglen);
+		if (ret < svb->siglen) {
+			goto gzread_err;
+		}
+	}
+
+	return 0;
+
+gzread_err:
+	/* A short read at end of file is the normal way to finish */
+	if (gzeof(gzfd))
+		return 1;
+
+	errstr = gzerror(gzfd, &gzerr);
+	if (gzerr == Z_ERRNO)
+		errstr = strerror(errno);
+	fprintf(stderr, "Failed to restore block: %s\n", errstr);
+	return -1;
+}
+
+/* Read the saved superblock record at file offset pos, decode it into
+   sbd.sd_sb and set sbd.gfs1 and sbd.bsize from it.
+   Returns 0 on success, -1 if the record cannot be read or the superblock
+   fails check_sb(). Exits the program if memory cannot be allocated. */
+static int restore_super(gzFile gzfd, off_t pos)
+{
+	int ret;
+	struct saved_metablock svb = {0};
+	char *buf;
+
+	buf = calloc(1, sizeof(struct gfs2_sb));
+	if (buf == NULL) {
+		perror("Failed to restore super block");
+		exit(1);
+	}
+	gzseek(gzfd, pos, SEEK_SET);
+	ret = restore_block(gzfd, &svb, buf, sizeof(struct gfs2_sb));
+	if (ret == 1) {
+		fprintf(stderr, "Reached end of file while restoring superblock\n");
+		goto err;
+	} else if (ret != 0) {
+		goto err;
+	}
+
+	gfs2_sb_in(&sbd.sd_sb, buf);
+	sbd1 = (struct gfs_sb *)&sbd.sd_sb;
+	ret = check_sb(&sbd.sd_sb);
+	if (ret < 0) {
+		fprintf(stderr,"Error: Invalid superblock data.\n");
+		goto err;
+	}
+	/* check_sb() returning 1 is treated as "this is a gfs1 superblock" */
+	if (ret == 1)
+		sbd.gfs1 = 1;
+	sbd.bsize = sbd.sd_sb.sb_bsize;
+	free(buf);
+	printf("Block size is %uB\n", sbd.bsize);
+	return 0;
+err:
+	free(buf);
+	return -1;
+}
+
+/* Scan every record header from file offset pos to the end of the file to
+   find the highest saved block number, then set sbd.fssize: to fssize when
+   that is non-zero, otherwise to the highest block number plus one.
+   Returns 0 on success, -1 if a record cannot be read. */
+static int find_highest_block(gzFile gzfd, off_t pos, uint64_t fssize)
+{
+	int err = 0;
+	uint64_t highest = 0;
+	struct saved_metablock svb = {0};
+
+	while (1) {
+		gzseek(gzfd, pos, SEEK_SET);
+		/* Header only; the data is skipped by advancing pos */
+		err = restore_block(gzfd, &svb, NULL, 0);
+		if (err == 1)
+			break;
+		if (err != 0)
+			return -1;
+
+		if (svb.blk > highest)
+			highest = svb.blk;
+		pos += sizeof(svb) + svb.siglen;
+	}
+
+	if (fssize > 0) {
+		printf("Saved file system size is %"PRIu64" 
(0x%"PRIx64") blocks, %s\n",
+		       fssize, fssize, anthropomorphize(fssize * sbd.bsize));
+		sbd.fssize = fssize;
+	} else {
+		sbd.fssize = highest + 1;
+	}
+
+	printf("Highest saved block is %"PRIu64" (0x%"PRIx64")\n", highest, highest);
+	return 0;
+}
+
+/* Replay the records of a savemeta file, starting at file offset pos.
+   fd:        destination device (used only when printonly is 0)
+   printonly: 0 - write every block to fd, zero-padded to the block size
+              1 - print the type of every saved block
+              other - print only the saved block with that block number
+   printonly is 64 bits wide so that block numbers beyond the range of an
+   int can still be matched.
+   Returns 0 on success, -1 on a read or write error. */
+static int restore_data(int fd, gzFile gzin_fd, off_t pos, uint64_t printonly)
+{
+	struct saved_metablock savedata = {0};
+	uint64_t writes = 0;
+	char *buf;
+
+	buf = calloc(1, sbd.bsize);
+	if (buf == NULL) {
+		perror("Failed to restore data");
+		exit(1);
+	}
+
+	gzseek(gzin_fd, pos, SEEK_SET);
+	blks_saved = 0;
+	while (TRUE) {
+		int err;
+		err = restore_block(gzin_fd, &savedata, buf, sbd.bsize);
+		if (err == 1)
+			break;
+		if (err != 0) {
+			free(buf);
+			return -1;
+		}
+
+		if (printonly) {
+			/* Wrap the data so the display helpers can use it */
+			struct gfs2_buffer_head dummy_bh = {
+				.b_data = buf,
+				.b_blocknr = savedata.blk,
+			};
+			if (printonly > 1 && printonly == savedata.blk) {
+				display_block_type(&dummy_bh, TRUE);
+				display_gfs2(&dummy_bh);
+				break;
+			} else if (printonly == 1) {
+				print_gfs2("%"PRId64" (l=0x%x): ", blks_saved, savedata.siglen);
+				display_block_type(&dummy_bh, TRUE);
+			}
+		} else {
+			warm_fuzzy_stuff(savedata.blk, FALSE);
+			/* Only siglen bytes were saved; the rest of the
+			   block was trailing zeroes */
+			memset(buf + savedata.siglen, 0, sbd.bsize - savedata.siglen);
+			if (pwrite(fd, buf, sbd.bsize, savedata.blk * sbd.bsize) != sbd.bsize) {
+				fprintf(stderr, "write error: %s from %s:%d: block %lld (0x%llx)\n",
+					strerror(errno), __FUNCTION__, __LINE__,
+					(unsigned long long)savedata.blk,
+					(unsigned long long)savedata.blk);
+				free(buf);
+				return -1;
+			}
+			writes++;
+			/* Flush to the device every 1000 blocks */
+			if (writes % 1000 == 0)
+				fsync(fd);
+		}
+		blks_saved++;
+	}
+	if (!printonly)
+		warm_fuzzy_stuff(sbd.fssize, 1);
+	free(buf);
+	return 0;
+}
+
+/* Print complaint followed by the restoremeta usage text, then terminate
+   through die(). */
+static void complain(const char *complaint)
+{
+	fprintf(stderr, "%s\n", complaint);
+	die("Format is: \ngfs2_edit restoremeta <file to restore> "
+	    "<dest file system>\n");
+}
+
+/* Restore (or, when printonly is non-zero, just print) the metadata saved in
+   in_fn. out_device is the destination file system when restoring; when
+   printing it is an optional block number. Does not return. */
+void restoremeta(const char *in_fn, const char *out_device, uint64_t printonly)
+{
+	int error;
+	gzFile gzfd;
+	off_t pos = 0;
+	struct savemeta_header smh = {0};
+
+	termlines = 0;
+	
if (!in_fn) + complain("No source file specified."); + if (!printonly && !out_device) + complain("No destination file system specified."); + + gzfd = gzopen(in_fn, "rb"); + if (!gzfd) + die("Can't open source file %s: %s\n", + in_fn, strerror(errno)); + + if (!printonly) { + sbd.device_fd = open(out_device, O_RDWR); + if (sbd.device_fd < 0) + die("Can't open destination file system %s: %s\n", + out_device, strerror(errno)); + } else if (out_device) /* for printsavedmeta, the out_device is an + optional block no */ + printonly = check_keywords(out_device); + + pos = restore_init(gzfd, &smh); + error = restore_super(gzfd, pos); + if (error) + exit(1); + + printf("This is gfs%c metadata.\n", sbd.gfs1 ? '1': '2'); + + if (!printonly) { + uint64_t space = lseek(sbd.device_fd, 0, SEEK_END) / sbd.bsize; + printf("There are %"PRIu64" free blocks on the destination device.\n", space); + } + + error = find_highest_block(gzfd, pos, sbd.fssize); + if (error) + exit(1); + + error = restore_data(sbd.device_fd, gzfd, pos, printonly); + printf("File %s %s %s.\n", in_fn, + (printonly ? "print" : "restore"), + (error ? "error" : "successful")); + + gzclose(gzfd); + if (!printonly) + close(sbd.device_fd); + free(indirect); + exit(error); +} diff --git a/gfs2/fsck/FEATURES b/gfs2/fsck/FEATURES new file mode 100644 index 0000000..8a63591 --- /dev/null +++ b/gfs2/fsck/FEATURES @@ -0,0 +1,25 @@ +This is a completely rewritten filesystem checker for GFS. Performance +characteristics are significantly improved. The design follows the 5-pass +fsck design found in "Fsck - The UNIX File System Check Program" +by McKusick & Kowalski (1994) + - http://citeseer.ist.psu.edu/mckusick94fsck.html + + +Line item list of supported features: + +1. Detects and replaces missing/bad root inode +2. Detects and relinks unlinked inodes to l+f + o If a file is zero length, it is not relinked to l+f - unless it + has an extended attribute attached to it. +3. 
Detects duplicate blocks and removes inodes containing them +4. Detects bad blocks (block number out of range) and removes inodes + containing them - Currently EAs that have blocks are removed but + the inode containing them is left. +5. Detects bad metadata headers and clears the structure +6. Fixes bad resource group bitmaps +7. Fixes incorrect resource group counts +8. Creates l+f directory if missing +9. Detects and removes duplicate '.' and '..' entries +10. Creates '.' if missing +11. Beginning of support for internationalization +12. Checks extended attributes diff --git a/gfs2/fsck/Makefile.am b/gfs2/fsck/Makefile.am new file mode 100644 index 0000000..97fcd40 --- /dev/null +++ b/gfs2/fsck/Makefile.am @@ -0,0 +1,41 @@ +MAINTAINERCLEANFILES = Makefile.in + +sbin_PROGRAMS = fsck.gfs2 + +noinst_HEADERS = \ + afterpass1_common.h \ + fsck.h \ + fs_recovery.h \ + inode_hash.h \ + link.h \ + lost_n_found.h \ + metawalk.h \ + util.h + +fsck_gfs2_SOURCES = \ + fs_recovery.c \ + initialize.c \ + inode_hash.c \ + link.c \ + lost_n_found.c \ + main.c \ + metawalk.c \ + afterpass1_common.c \ + pass1b.c \ + pass1.c \ + pass2.c \ + pass3.c \ + pass4.c \ + pass5.c \ + rgrepair.c \ + util.c + +fsck_gfs2_CPPFLAGS = \ + -D_FILE_OFFSET_BITS=64 \ + -I$(top_srcdir)/gfs2/include \ + -I$(top_srcdir)/gfs2/libgfs2 + +fsck_gfs2_LDADD = \ + $(top_builddir)/gfs2/libgfs2/libgfs2.la +fsck_gfs2_LDFLAGS = \ + $(uuid_LIBS) diff --git a/gfs2/fsck/afterpass1_common.c b/gfs2/fsck/afterpass1_common.c new file mode 100644 index 0000000..b747640 --- /dev/null +++ b/gfs2/fsck/afterpass1_common.c @@ -0,0 +1,320 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#define _(String) gettext(String) + +#include +#include "libgfs2.h" +#include "fsck.h" +#include "afterpass1_common.h" +#include "metawalk.h" +#include "util.h" + +/** + * find_remove_dup - find out if this is a duplicate ref. If so, remove it. 
+ *
+ * Returns: 1 if there are any remaining references to this block, else 0.
+ */
+static int find_remove_dup(struct gfs2_inode *ip, uint64_t block,
+			   const char *btype, int *removed_last_meta)
+{
+	struct duptree *tree;
+	struct inode_with_dups *ref;
+	int meta_ref_dropped = 0;
+	int meta_refs = 0;
+
+	/* A block that is not in the duplicate tree has nothing to drop. */
+	tree = dupfind(block);
+	if (tree == NULL)
+		return 0;
+
+	/* Drop this inode's reference entry, if it holds one. */
+	ref = find_dup_ref_inode(tree, ip);
+	if (ref != NULL) {
+		/* Sample the count before the entry is deleted. */
+		if (ref->reftypecount[ref_as_meta])
+			meta_ref_dropped = 1;
+		dup_listent_delete(tree, ref);
+
+		if (tree->refs == 0) {
+			log_info( _("This was the last reference: it's no longer a "
+				    "duplicate.\n"));
+			dup_delete(tree); /* not duplicate now */
+			if (meta_ref_dropped) {
+				log_debug("Removed the last reference as metadata.\n");
+				*removed_last_meta = 1;
+			}
+			return 0;
+		}
+
+		/* A metadata reference went away: count how many of the
+		   surviving references are still metadata references. */
+		if (meta_ref_dropped)
+			meta_refs = count_dup_meta_refs(tree);
+	}
+
+	log_info(_("%d block reference(s) remain (%d as metadata).\n"),
+		 tree->refs, meta_refs);
+	if (meta_ref_dropped && meta_refs == 0) {
+		log_debug("Removed the last reference as metadata.\n");
+		*removed_last_meta = 1;
+	}
+	return 1; /* references still exist so do not free the block. */
+}
+
+/**
+ * delete_block_if_notdup - delete blocks associated with an inode
+ *
+ * Ignore blocks that are already marked free.
+ * If it has been identified as duplicate, remove the duplicate reference.
+ * If all duplicate references have been removed, delete the block.
+ */ +static int delete_block_if_notdup(struct gfs2_inode *ip, uint64_t block, + struct gfs2_buffer_head **bh, + const char *btype, int *was_duplicate, + void *private) +{ + int q; + int removed_lastmeta = 0; + + if (!valid_block_ip(ip, block)) + return meta_error; + + q = bitmap_type(ip->i_sbd, block); + if (q == GFS2_BLKST_FREE) { + log_info( _("%s block %lld (0x%llx), part of inode " + "%lld (0x%llx), was already free.\n"), + btype, (unsigned long long)block, + (unsigned long long)block, + (unsigned long long)ip->i_di.di_num.no_addr, + (unsigned long long)ip->i_di.di_num.no_addr); + return meta_is_good; + } + if (find_remove_dup(ip, block, btype, &removed_lastmeta)) { /* a dup */ + if (was_duplicate) { + if (removed_lastmeta) + log_debug("Removed last reference as meta.\n"); + else + *was_duplicate = 1; + } + log_err( _("Not clearing duplicate reference in inode " + "at block #%llu (0x%llx) to block #%llu (0x%llx) " + "because it's referenced by another inode.\n"), + (unsigned long long)ip->i_di.di_num.no_addr, + (unsigned long long)ip->i_di.di_num.no_addr, + (unsigned long long)block, (unsigned long long)block); + } else { + check_n_fix_bitmap(ip->i_sbd, ip->i_rgd, block, 0, + GFS2_BLKST_FREE); + } + return meta_is_good; +} + +static int remove_dentry(struct gfs2_inode *ip, struct gfs2_dirent *dent, + struct gfs2_dirent *prev_de, + struct gfs2_buffer_head *bh, + char *filename, uint32_t *count, int *lindex, + void *private) +{ + /* the metawalk_fxn's private field must be set to the dentry + * block we want to clear */ + uint64_t *dentryblock = (uint64_t *) private; + struct gfs2_dirent dentry, *de; + + memset(&dentry, 0, sizeof(struct gfs2_dirent)); + gfs2_dirent_in(&dentry, (char *)dent); + de = &dentry; + + if (de->de_inum.no_addr == *dentryblock) + dirent2_del(ip, bh, prev_de, dent); + else + (*count)++; + + return 0; + +} + +int remove_dentry_from_dir(struct gfs2_sbd *sdp, uint64_t dir, + uint64_t dentryblock) +{ + struct metawalk_fxns remove_dentry_fxns = 
{0}; + struct gfs2_inode *ip; + int q; + int error; + + log_debug( _("Removing dentry %llu (0x%llx) from directory %llu" + " (0x%llx)\n"), (unsigned long long)dentryblock, + (unsigned long long)dentryblock, + (unsigned long long)dir, (unsigned long long)dir); + if (!valid_block(sdp, dir)) { + log_err( _("Parent directory is invalid\n")); + return 1; + } + remove_dentry_fxns.private = &dentryblock; + remove_dentry_fxns.check_dentry = remove_dentry; + + q = bitmap_type(sdp, dir); + if (q != GFS2_BLKST_DINODE) { + log_info( _("Parent block is not an inode...ignoring\n")); + return 1; + } + + ip = fsck_load_inode(sdp, dir); + if (ip == NULL) { + stack; + return -1; + } + /* Need to run check_dir with a private var of dentryblock, + * and fxns that remove that dentry if found */ + error = check_dir(sdp, ip, &remove_dentry_fxns); + fsck_inode_put(&ip); + return error; +} + +int delete_metadata(struct gfs2_inode *ip, uint64_t block, + struct gfs2_buffer_head **bh, int h, int *is_valid, + int *was_duplicate, void *private) +{ + *is_valid = 1; + *was_duplicate = 0; + return delete_block_if_notdup(ip, block, bh, _("metadata"), + was_duplicate, private); +} + +int delete_leaf(struct gfs2_inode *ip, uint64_t block, void *private) +{ + return delete_block_if_notdup(ip, block, NULL, _("leaf"), NULL, + private); +} + +int delete_data(struct gfs2_inode *ip, uint64_t metablock, + uint64_t block, void *private, struct gfs2_buffer_head *bh, + uint64_t *ptr) +{ + return delete_block_if_notdup(ip, block, NULL, _("data"), NULL, + private); +} + +static int del_eattr_generic(struct gfs2_inode *ip, uint64_t block, + uint64_t parent, struct gfs2_buffer_head **bh, + void *private, const char *eatype) +{ + int ret = 0; + int was_free = 0; + int q; + + if (valid_block_ip(ip, block)) { + q = bitmap_type(ip->i_sbd, block); + if (q == GFS2_BLKST_FREE) + was_free = 1; + ret = delete_block_if_notdup(ip, block, NULL, eatype, + NULL, private); + if (!ret) { + *bh = bread(ip->i_sbd, block); + if 
(!was_free)
+				ip->i_di.di_blocks--;
+			bmodified(ip->i_bh);
+		}
+	}
+	/* Even if it's a duplicate reference, we want to eliminate the
+	   reference itself, and adjust di_blocks accordingly. */
+	if (ip->i_di.di_eattr) {
+		if (block == ip->i_di.di_eattr)
+			ip->i_di.di_eattr = 0;
+		bmodified(ip->i_bh);
+	}
+	return ret;
+}
+
+/**
+ * delete_eattr_indir - delete an indirect extended attribute block
+ *
+ * Thin wrapper around del_eattr_generic() that supplies the block type
+ * description used in log messages.
+ *
+ * Returns: whatever del_eattr_generic() returns
+ */
+int delete_eattr_indir(struct gfs2_inode *ip, uint64_t block, uint64_t parent,
+		       struct gfs2_buffer_head **bh, void *private)
+{
+	/* The description used to be swapped with delete_eattr_leaf()'s. */
+	return del_eattr_generic(ip, block, parent, bh, private,
+				 _("indirect extended attribute"));
+}
+
+/**
+ * delete_eattr_leaf - delete an extended attribute leaf block
+ *
+ * Thin wrapper around del_eattr_generic() that supplies the block type
+ * description used in log messages.
+ *
+ * Returns: whatever del_eattr_generic() returns
+ */
+int delete_eattr_leaf(struct gfs2_inode *ip, uint64_t block, uint64_t parent,
+		      struct gfs2_buffer_head **bh, void *private)
+{
+	return del_eattr_generic(ip, block, parent, bh, private,
+				 _("extended attribute"));
+}
+
+/**
+ * delete_eattr_entry - sanity check one extended attribute header
+ * @ip: the inode that owns the extended attribute
+ * @leaf_bh: buffer holding the leaf block (not examined here)
+ * @ea_hdr: the header being checked
+ * @ea_hdr_prev: the previous header in the leaf, or NULL for the first one
+ * @private: unused
+ *
+ * Returns: 1 if the entry has no name, has an invalid type, or reports
+ *          fewer pointers than its data length requires; otherwise 0
+ */
+int delete_eattr_entry(struct gfs2_inode *ip, struct gfs2_buffer_head *leaf_bh,
+		       struct gfs2_ea_header *ea_hdr,
+		       struct gfs2_ea_header *ea_hdr_prev, void *private)
+{
+	struct gfs2_sbd *sdp = ip->i_sbd;
+	uint32_t avail_size;
+	int max_ptrs;
+
+	if (!ea_hdr->ea_name_len){
+		/* Skip this entry for now */
+		return 1;
+	}
+
+	/* The name used to be copied into a local buffer here, but nothing
+	   ever read that copy, so the copy has been dropped. */
+
+	if (!GFS2_EATYPE_VALID(ea_hdr->ea_type) &&
+	    ((ea_hdr_prev) || (!ea_hdr_prev && ea_hdr->ea_type))){
+		/* Skip invalid entry */
+		return 1;
+	}
+
+	if (!ea_hdr->ea_num_ptrs)
+		return 0;
+
+	/* Number of blocks needed to hold ea_data_len bytes, rounding up. */
+	avail_size = sdp->sd_sb.sb_bsize - sizeof(struct gfs2_meta_header);
+	max_ptrs = (be32_to_cpu(ea_hdr->ea_data_len) + avail_size - 1) /
+		avail_size;
+
+	if (max_ptrs > ea_hdr->ea_num_ptrs)
+		return 1;
+
+	log_debug( _(" Pointers Required: %d\n Pointers Reported: %d\n"),
+		   max_ptrs, ea_hdr->ea_num_ptrs);
+
+	return 0;
+}
+
+int delete_eattr_extentry(struct gfs2_inode *ip, int i, uint64_t *ea_data_ptr,
+			  struct gfs2_buffer_head *leaf_bh, uint32_t tot_ealen,
+			  struct gfs2_ea_header *ea_hdr,
+			  struct gfs2_ea_header *ea_hdr_prev, void *private)
+{
+	
uint64_t block = be64_to_cpu(*ea_data_ptr); + int error; + + error = delete_block_if_notdup(ip, block, NULL, + _("extended attribute"), NULL, private); + if (error) { + log_err(_("Bad extended attribute found at block %lld " + "(0x%llx)"), + (unsigned long long)be64_to_cpu(*ea_data_ptr), + (unsigned long long)be64_to_cpu(*ea_data_ptr)); + if (query( _("Repair the bad Extended Attribute? (y/n) "))) { + ea_hdr->ea_num_ptrs = i; + ea_hdr->ea_data_len = cpu_to_be32(tot_ealen); + *ea_data_ptr = 0; + bmodified(leaf_bh); + /* Endianness doesn't matter in this case because it's + a single byte. */ + fsck_bitmap_set(ip, ip->i_di.di_eattr, + _("extended attribute"), + ip->i_sbd->gfs1 ? GFS2_BLKST_DINODE : + GFS2_BLKST_USED); + log_err( _("The EA was fixed.\n")); + } else { + error = 1; + log_err( _("The bad EA was not fixed.\n")); + } + } + return error; +} diff --git a/gfs2/fsck/afterpass1_common.h b/gfs2/fsck/afterpass1_common.h new file mode 100644 index 0000000..829828f --- /dev/null +++ b/gfs2/fsck/afterpass1_common.h @@ -0,0 +1,31 @@ +#ifndef _AFTERPASS1_H +#define _AFTERPASS1_H + +#include "util.h" + +extern int delete_metadata(struct gfs2_inode *ip, uint64_t block, + struct gfs2_buffer_head **bh, int h, int *is_valid, + int *was_duplicate, void *private); +extern int delete_leaf(struct gfs2_inode *ip, uint64_t block, void *private); +extern int delete_data(struct gfs2_inode *ip, uint64_t metablock, + uint64_t block, void *private, + struct gfs2_buffer_head *bh, uint64_t *ptr); +extern int delete_eattr_indir(struct gfs2_inode *ip, uint64_t block, uint64_t parent, + struct gfs2_buffer_head **bh, void *private); +extern int delete_eattr_leaf(struct gfs2_inode *ip, uint64_t block, uint64_t parent, + struct gfs2_buffer_head **bh, void *private); +extern int delete_eattr_entry(struct gfs2_inode *ip, + struct gfs2_buffer_head *leaf_bh, + struct gfs2_ea_header *ea_hdr, + struct gfs2_ea_header *ea_hdr_prev, + void *private); +extern int delete_eattr_extentry(struct gfs2_inode 
*ip, int i, + uint64_t *ea_data_ptr, + struct gfs2_buffer_head *leaf_bh, + uint32_t tot_ealen, + struct gfs2_ea_header *ea_hdr, + struct gfs2_ea_header *ea_hdr_prev, + void *private); +extern int remove_dentry_from_dir(struct gfs2_sbd *sdp, uint64_t dir, + uint64_t dentryblock); +#endif diff --git a/gfs2/fsck/fs_recovery.c b/gfs2/fsck/fs_recovery.c new file mode 100644 index 0000000..677abd7 --- /dev/null +++ b/gfs2/fsck/fs_recovery.c @@ -0,0 +1,952 @@ +#include "clusterautoconfig.h" + +#include +#include +#include +#include +#include +#include +#include +#define _(String) gettext(String) + +#include +#include "fsck.h" +#include "fs_recovery.h" +#include "libgfs2.h" +#include "metawalk.h" +#include "util.h" + +#define JOURNAL_NAME_SIZE 18 +#define JOURNAL_SEQ_TOLERANCE 10 + +unsigned int sd_found_jblocks = 0, sd_replayed_jblocks = 0; +unsigned int sd_found_metablocks = 0, sd_replayed_metablocks = 0; +unsigned int sd_found_revokes = 0; +osi_list_t sd_revoke_list; +unsigned int sd_replay_tail; + +struct gfs2_revoke_replay { + osi_list_t rr_list; + uint64_t rr_blkno; + unsigned int rr_where; +}; + +int gfs2_revoke_add(struct gfs2_sbd *sdp, uint64_t blkno, unsigned int where) +{ + osi_list_t *tmp, *head = &sd_revoke_list; + struct gfs2_revoke_replay *rr; + int found = 0; + + osi_list_foreach(tmp, head) { + rr = osi_list_entry(tmp, struct gfs2_revoke_replay, rr_list); + if (rr->rr_blkno == blkno) { + found = 1; + break; + } + } + + if (found) { + rr->rr_where = where; + return 0; + } + + rr = malloc(sizeof(struct gfs2_revoke_replay)); + if (!rr) + return -ENOMEM; + + rr->rr_blkno = blkno; + rr->rr_where = where; + osi_list_add(&rr->rr_list, head); + return 1; +} + +int gfs2_revoke_check(struct gfs2_sbd *sdp, uint64_t blkno, unsigned int where) +{ + osi_list_t *tmp; + struct gfs2_revoke_replay *rr; + int wrap, a, b; + int found = 0; + + osi_list_foreach(tmp, &sd_revoke_list) { + rr = osi_list_entry(tmp, struct gfs2_revoke_replay, rr_list); + if (rr->rr_blkno == blkno) { 
+ found = 1; + break; + } + } + + if (!found) + return 0; + + wrap = (rr->rr_where < sd_replay_tail); + a = (sd_replay_tail < where); + b = (where < rr->rr_where); + return (wrap) ? (a || b) : (a && b); +} + +void gfs2_revoke_clean(struct gfs2_sbd *sdp) +{ + osi_list_t *head = &sd_revoke_list; + struct gfs2_revoke_replay *rr; + + while (!osi_list_empty(head)) { + rr = osi_list_entry(head->next, struct gfs2_revoke_replay, rr_list); + osi_list_del(&rr->rr_list); + free(rr); + } +} + +static void refresh_rgrp(struct gfs2_sbd *sdp, struct rgrp_tree *rgd, + struct gfs2_buffer_head *bh, uint64_t blkno) +{ + int i; + + log_debug(_("Block is part of rgrp 0x%llx; refreshing the rgrp.\n"), + (unsigned long long)rgd->ri.ri_addr); + for (i = 0; i < rgd->ri.ri_length; i++) { + if (rgd->bits[i].bi_bh->b_blocknr != blkno) + continue; + + memcpy(rgd->bits[i].bi_bh->b_data, bh->b_data, sdp->bsize); + bmodified(rgd->bits[i].bi_bh); + if (i == 0) { /* this is the rgrp itself */ + if (sdp->gfs1) + gfs_rgrp_in((struct gfs_rgrp *)&rgd->rg, + rgd->bits[0].bi_bh); + else + gfs2_rgrp_in(&rgd->rg, rgd->bits[0].bi_bh->b_data); + } + break; + } +} + +static int buf_lo_scan_elements(struct gfs2_inode *ip, unsigned int start, + struct gfs2_log_descriptor *ld, __be64 *ptr, + int pass) +{ + struct gfs2_sbd *sdp = ip->i_sbd; + unsigned int blks = be32_to_cpu(ld->ld_data1); + struct gfs2_buffer_head *bh_log, *bh_ip; + uint64_t blkno; + int error = 0; + struct rgrp_tree *rgd; + + if (pass != 1 || be32_to_cpu(ld->ld_type) != GFS2_LOG_DESC_METADATA) + return 0; + + gfs2_replay_incr_blk(ip, &start); + + for (; blks; gfs2_replay_incr_blk(ip, &start), blks--) { + uint32_t check_magic; + + sd_found_metablocks++; + + blkno = be64_to_cpu(*ptr); + ptr++; + if (gfs2_revoke_check(sdp, blkno, start)) + continue; + + error = gfs2_replay_read_block(ip, start, &bh_log); + if (error) + return error; + + log_info( _("Journal replay writing metadata block #" + "%lld (0x%llx) for journal+0x%x\n"), + (unsigned long 
long)blkno, (unsigned long long)blkno, + start); + bh_ip = bget(sdp, blkno); + if (!bh_ip) { + log_err(_("Out of memory when replaying journals.\n")); + return FSCK_ERROR; + } + memcpy(bh_ip->b_data, bh_log->b_data, sdp->bsize); + + check_magic = ((struct gfs2_meta_header *) + (bh_ip->b_data))->mh_magic; + check_magic = be32_to_cpu(check_magic); + if (check_magic != GFS2_MAGIC) { + log_err(_("Journal corruption detected at block #" + "%lld (0x%llx) for journal+0x%x.\n"), + (unsigned long long)blkno, (unsigned long long)blkno, + start); + error = -EIO; + } else { + bmodified(bh_ip); + rgd = gfs2_blk2rgrpd(sdp, blkno); + if (rgd && blkno < rgd->ri.ri_data0) + refresh_rgrp(sdp, rgd, bh_ip, blkno); + } + + brelse(bh_log); + brelse(bh_ip); + if (error) + break; + + sd_replayed_metablocks++; + } + return error; +} + +static int revoke_lo_scan_elements(struct gfs2_inode *ip, unsigned int start, + struct gfs2_log_descriptor *ld, __be64 *ptr, + int pass) +{ + struct gfs2_sbd *sdp = ip->i_sbd; + unsigned int blks = be32_to_cpu(ld->ld_length); + unsigned int revokes = be32_to_cpu(ld->ld_data1); + struct gfs2_buffer_head *bh; + unsigned int offset; + uint64_t blkno; + int first = 1; + int error; + + if (pass != 0 || be32_to_cpu(ld->ld_type) != GFS2_LOG_DESC_REVOKE) + return 0; + + offset = sizeof(struct gfs2_log_descriptor); + + for (; blks; gfs2_replay_incr_blk(ip, &start), blks--) { + error = gfs2_replay_read_block(ip, start, &bh); + if (error) + return error; + + if (!first) { + if (gfs2_check_meta(bh, GFS2_METATYPE_LB)) + continue; + } + while (offset + sizeof(uint64_t) <= sdp->sd_sb.sb_bsize) { + blkno = be64_to_cpu(*(__be64 *)(bh->b_data + offset)); + log_info( _("Journal replay processing revoke for " + "block #%lld (0x%llx) for journal+0x%x\n"), + (unsigned long long)blkno, + (unsigned long long)blkno, + start); + error = gfs2_revoke_add(sdp, blkno, start); + if (error < 0) + return error; + else if (error) + sd_found_revokes++; + + if (!--revokes) + break; + offset += 
sizeof(uint64_t); + } + + bmodified(bh); + brelse(bh); + offset = sizeof(struct gfs2_meta_header); + first = 0; + } + return 0; +} + +static int databuf_lo_scan_elements(struct gfs2_inode *ip, unsigned int start, + struct gfs2_log_descriptor *ld, + __be64 *ptr, int pass) +{ + struct gfs2_sbd *sdp = ip->i_sbd; + unsigned int blks = be32_to_cpu(ld->ld_data1); + struct gfs2_buffer_head *bh_log, *bh_ip; + uint64_t blkno; + uint64_t esc; + int error = 0; + + if (pass != 1 || be32_to_cpu(ld->ld_type) != GFS2_LOG_DESC_JDATA) + return 0; + + gfs2_replay_incr_blk(ip, &start); + for (; blks; gfs2_replay_incr_blk(ip, &start), blks--) { + blkno = be64_to_cpu(*ptr); + ptr++; + esc = be64_to_cpu(*ptr); + ptr++; + + sd_found_jblocks++; + + if (gfs2_revoke_check(sdp, blkno, start)) + continue; + + error = gfs2_replay_read_block(ip, start, &bh_log); + if (error) + return error; + + log_info( _("Journal replay writing data block #%lld (0x%llx)" + " for journal+0x%x\n"), + (unsigned long long)blkno, (unsigned long long)blkno, + start); + bh_ip = bget(sdp, blkno); + if (!bh_ip) { + log_err(_("Out of memory when replaying journals.\n")); + return FSCK_ERROR; + } + memcpy(bh_ip->b_data, bh_log->b_data, sdp->bsize); + + /* Unescape */ + if (esc) { + __be32 *eptr = (__be32 *)bh_ip->b_data; + *eptr = cpu_to_be32(GFS2_MAGIC); + } + + brelse(bh_log); + bmodified(bh_ip); + brelse(bh_ip); + + sd_replayed_jblocks++; + } + return error; +} + +/** + * foreach_descriptor - go through the active part of the log + * @ip: the journal incore inode + * @start: the first log header in the active region + * @end: the last log header (don't process the contents of this entry)) + * + * Call a given function once for every log descriptor in the active + * portion of the log. 
+ *
+ * Returns: errno
+ */
+
+static int foreach_descriptor(struct gfs2_inode *ip, unsigned int start,
+			      unsigned int end, int pass)
+{
+	struct gfs2_buffer_head *bh;
+	struct gfs2_log_descriptor *ld;
+	struct gfs2_log_header lh;
+	uint32_t magic;
+	uint32_t length;
+	__be64 *ptr;
+	int error = 0;
+	/* The element array starts at the descriptor size rounded up to a
+	   multiple of sizeof(__be64). */
+	unsigned int offset = (sizeof(struct gfs2_log_descriptor) +
+			       sizeof(__be64) - 1) & ~(sizeof(__be64) - 1);
+
+	while (start != end) {
+		error = gfs2_replay_read_block(ip, start, &bh);
+		if (error)
+			return error;
+
+		magic = be32_to_cpu(((struct gfs2_meta_header *)
+				     (bh->b_data))->mh_magic);
+		if (magic != GFS2_MAGIC) {
+			error = -EIO;
+			goto out_release;
+		}
+		ld = (struct gfs2_log_descriptor *)bh->b_data;
+		length = be32_to_cpu(ld->ld_length);
+
+		if (be32_to_cpu(ld->ld_header.mh_type) == GFS2_METATYPE_LH) {
+			/* A valid log header is stepped over; anything else
+			   ends the walk. */
+			error = get_log_header(ip, start, &lh);
+			if (!error) {
+				gfs2_replay_incr_blk(ip, &start);
+				bmodified(bh);
+				brelse(bh);
+				continue;
+			}
+			if (error == 1) {
+				log_err(_("Journal corruption detected at "
+					  "journal+0x%x.\n"), start);
+				error = -EIO;
+			}
+			goto out_release;
+		}
+		if (gfs2_check_meta(bh, GFS2_METATYPE_LD)) {
+			error = -EIO;
+			goto out_release;
+		}
+
+		/* Each scanner returns 0 at once unless the pass and the
+		   descriptor type are the ones it handles. */
+		ptr = (__be64 *)(bh->b_data + offset);
+		error = databuf_lo_scan_elements(ip, start, ld, ptr, pass);
+		if (error)
+			goto out_release;
+		error = buf_lo_scan_elements(ip, start, ld, ptr, pass);
+		if (error)
+			goto out_release;
+		error = revoke_lo_scan_elements(ip, start, ld, ptr, pass);
+		if (error)
+			goto out_release;
+
+		/* Step over the descriptor and the blocks it describes. */
+		for (; length; length--)
+			gfs2_replay_incr_blk(ip, &start);
+
+		bmodified(bh);
+		brelse(bh);
+	}
+
+	return 0;
+
+out_release:
+	/* Every failure path hands the buffer back the same way. */
+	bmodified(bh);
+	brelse(bh);
+	return error;
+}
+
+/**
+ * check_journal_seq_no - Check and Fix log header sequencing problems
+ * @ip: the journal incore inode
+ * @fix: if 1, fix the sequence numbers, otherwise 
just report the problem + * + * Returns: The number of sequencing errors (hopefully none). + */ +static int check_journal_seq_no(struct gfs2_inode *ip, int fix) +{ + int error = 0, wrapped = 0; + uint32_t jd_blocks = ip->i_di.di_size / ip->i_sbd->sd_sb.sb_bsize; + uint32_t blk; + struct gfs2_log_header lh; + uint64_t highest_seq = 0, lowest_seq = 0, prev_seq = 0; + int new = 0; + uint64_t dblock; + struct gfs2_buffer_head *bh; + int seq_errors = 0; + + memset(&lh, 0, sizeof(lh)); + for (blk = 0; blk < jd_blocks; blk++) { + error = get_log_header(ip, blk, &lh); + if (error == 1) /* if not a log header */ + continue; /* just journal data--ignore it */ + if (!lowest_seq || lh.lh_sequence < lowest_seq) + lowest_seq = lh.lh_sequence; + if (!highest_seq || lh.lh_sequence > highest_seq) + highest_seq = lh.lh_sequence; + if (lh.lh_sequence > prev_seq) { + prev_seq = lh.lh_sequence; + continue; + } + /* The sequence number is not higher than the previous one, + so it's either wrap-around or a sequencing problem. */ + if (!wrapped && lh.lh_sequence == lowest_seq) { + wrapped = 1; + prev_seq = lh.lh_sequence; + continue; + } + log_err( _("Journal block %u (0x%x): sequence no. 0x%llx " + "out of order.\n"), blk, blk, lh.lh_sequence); + log_info( _("Low: 0x%llx, High: 0x%llx, Prev: 0x%llx\n"), + (unsigned long long)lowest_seq, + (unsigned long long)highest_seq, + (unsigned long long)prev_seq); + seq_errors++; + if (!fix) + continue; + highest_seq++; + lh.lh_sequence = highest_seq; + prev_seq = lh.lh_sequence; + log_warn( _("Renumbering it as 0x%llx\n"), lh.lh_sequence); + block_map(ip, blk, &new, &dblock, NULL, FALSE); + bh = bread(ip->i_sbd, dblock); + gfs2_log_header_out(&lh, bh->b_data); + bmodified(bh); + brelse(bh); + } + if (seq_errors && fix) { + log_err(_("%d sequence errors fixed.\n"), seq_errors); + seq_errors = 0; + } + return seq_errors; +} + +/** + * preen_is_safe - Can we safely preen the file system? 
+ * + * If a preen option was specified (-a or -p) we're likely to have been + * called from rc.sysinit. We need to determine whether this is shared + * storage or not. If it's local storage (locking protocol==lock_nolock) + * it's safe to preen the file system. If it's lock_dlm, it's likely + * mounted by other nodes in the cluster, which is dangerous and therefore, + * we should warn the user to run fsck.gfs2 manually when it's safe. + */ +int preen_is_safe(struct gfs2_sbd *sdp, int preen, int force_check) +{ + if (!preen) /* If preen was not specified */ + return 1; /* not called by rc.sysinit--we're okay to preen */ + if (force_check) /* If check was forced by the user? */ + return 1; /* user's responsibility--we're okay to preen */ + if (!memcmp(sdp->sd_sb.sb_lockproto + 5, "nolock", 6)) + return 1; /* local file system--preen is okay */ + return 0; /* might be mounted on another node--not guaranteed safe */ +} + +/** + * gfs2_recover_journal - recovery a given journal + * @ip: the journal incore inode + * j: which journal to check + * preen: Was preen (-a or -p) specified? + * force_check: Was -f specified to force the check? + * @was_clean: if the journal was originally clean, this is set to 1. + * if the journal was dirty from the start, this is set to 0. + * + * Acquire the journal's lock, check to see if the journal is clean, and + * do recovery if necessary. 
+ *
+ * Returns: errno
+ */
+
+static int gfs2_recover_journal(struct gfs2_inode *ip, int j, int preen,
+				int force_check, int *was_clean)
+{
+	struct gfs2_sbd *sdp = ip->i_sbd;
+	struct gfs2_log_header head;
+	unsigned int pass;
+	int error;
+
+	*was_clean = 0;
+	log_info( _("jid=%u: Looking at journal...\n"), j);
+
+	osi_list_init(&sd_revoke_list);
+	/* Step 1: locate the journal head, then count sequencing errors
+	   without fixing them.  From here on "error" may hold either an
+	   error code or that count. */
+	error = gfs2_find_jhead(ip, &head);
+	if (!error) {
+		error = check_journal_seq_no(ip, 0);
+		if (error > JOURNAL_SEQ_TOLERANCE) {
+			log_err( _("Journal #%d (\"journal%d\") has %d "
+				   "sequencing errors; tolerance is %d.\n"),
+				 j+1, j, error, JOURNAL_SEQ_TOLERANCE);
+			goto out;
+		}
+	}
+	/* Step 2: the journal is corrupt.  Repair it only if -n is not in
+	   effect, preening is safe and the user agrees. */
+	if (error) {
+		if (opts.no) {
+			log_err( _("Journal #%d (\"journal%d\") is corrupt\n"),j+1, j);
+			log_err( _("Not fixing it due to the -n option.\n"));
+			goto out;
+		}
+		if (!preen_is_safe(sdp, preen, force_check)) {
+			log_err(_("Journal #%d (\"journal%d\") is corrupt.\n"),
+				j+1, j);
+			log_err(_("I'm not fixing it because it may be unsafe:\n"
+				 "Locking protocol is not lock_nolock and "
+				 "the -a or -p option was specified.\n"));
+			log_err(_("Please make sure no node has the file system "
+				 "mounted then rerun fsck.gfs2 manually "
+				 "without -a or -p.\n"));
+			goto out;
+		}
+		if (!query( _("\nJournal #%d (\"journal%d\") is "
+			      "corrupt. Okay to repair it? (y/n)"),
+			    j+1, j)) {
+			log_err( _("jid=%u: The journal was not repaired.\n"),
+				 j);
+			goto out;
+		}
+		log_info( _("jid=%u: Repairing journal...\n"), j);
+		/* Renumber the bad headers, then look for the head again. */
+		error = check_journal_seq_no(ip, 1);
+		if (error) {
+			log_err( _("jid=%u: Unable to fix the bad journal.\n"),
+				 j);
+			goto out;
+		}
+		error = gfs2_find_jhead(ip, &head);
+		if (error) {
+			log_err( _("jid=%u: Unable to fix the bad journal.\n"),
+				 j);
+			goto out;
+		}
+		log_err( _("jid=%u: The journal was successfully fixed.\n"),
+			 j);
+	}
+	/* Step 3: a head with the UNMOUNT flag means the journal is clean
+	   and there is nothing to replay. */
+	if (head.lh_flags & GFS2_LOG_HEAD_UNMOUNT) {
+		log_info( _("jid=%u: Journal is clean.\n"), j);
+		*was_clean = 1;
+		return 0;
+	}
+	/* Step 4: the journal is dirty.  "error" is 0 here, so the -n exit
+	   below reaches "out" and reports Done without replaying. */
+	if (opts.no) {
+		log_err(_("Journal #%d (\"journal%d\") is dirty\n"),j+1, j);
+		log_err(_("not replaying due to the -n option.\n"));
+		goto out;
+	}
+	if (!preen_is_safe(sdp, preen, force_check)) {
+		log_err( _("Journal #%d (\"journal%d\") is dirty\n"), j+1, j);
+		log_err( _("I'm not replaying it because it may be unsafe:\n"
+			   "Locking protocol is not lock_nolock and "
+			   "the -a or -p option was specified.\n"));
+		log_err( _("Please make sure no node has the file system "
+			   "mounted then rerun fsck.gfs2 manually "
+			   "without -a or -p.\n"));
+		error = FSCK_ERROR;
+		goto out;
+	}
+	/* Declining the replay skips straight to the offer to clear. */
+	if (!query( _("\nJournal #%d (\"journal%d\") is dirty. Okay to "
+		      "replay it? (y/n)"), j+1, j))
+		goto reinit;
+
+	log_info( _("jid=%u: Replaying journal...\n"), j);
+
+	sd_found_jblocks = sd_replayed_jblocks = 0;
+	sd_found_metablocks = sd_replayed_metablocks = 0;
+	sd_found_revokes = 0;
+	sd_replay_tail = head.lh_tail;
+	/* Two passes over the active log, from the tail to the head block;
+	   the scanners called by foreach_descriptor() gate on the pass. */
+	for (pass = 0; pass < 2; pass++) {
+		error = foreach_descriptor(ip, head.lh_tail,
+					   head.lh_blkno, pass);
+		if (error) {
+			log_err(_("Error found during journal replay.\n"));
+			goto out;
+		}
+	}
+	log_info( _("jid=%u: Found %u revoke tags\n"), j, sd_found_revokes);
+	gfs2_revoke_clean(sdp);
+	error = clean_journal(ip, &head);
+	if (error)
+		goto out;
+	log_err( _("jid=%u: Replayed %u of %u journaled data blocks\n"),
+		 j, sd_replayed_jblocks, sd_found_jblocks);
+	log_err( _("jid=%u: Replayed %u of %u metadata blocks\n"),
+		 j, sd_replayed_metablocks, sd_found_metablocks);
+
+	/* Check for errors and give them the option to reinitialize the
+	   journal. */
+out:
+	if (!error) {
+		log_info( _("jid=%u: Done\n"), j);
+		return 0;
+	}
+	log_err( _("jid=%u: Failed\n"), j);
+	/* A failure falls through to the same offer as a declined replay. */
+reinit:
+	if (query( _("Do you want to clear the journal instead? (y/n)"))) {
+		error = write_journal(sdp->md.journal[j], sdp->bsize,
+				      sdp->md.journal[j]->i_di.di_size /
+				      sdp->sd_sb.sb_bsize);
+		log_err(_("jid=%u: journal was cleared.\n"), j);
+	} else {
+		log_err( _("jid=%u: journal not cleared.\n"), j);
+	}
+	return error;
+}
+
+/* We can't use the rangecheck function from pass1 because we haven't gone
+ * through initialization properly yet. 
*/ +static int rangecheck_jblock(struct gfs2_inode *ip, uint64_t block) +{ + if((block > ip->i_sbd->fssize) || (block <= LGFS2_SB_ADDR(ip->i_sbd))) { + log_info( _("Bad block pointer (out of range) found in " + "journal inode %lld (0x%llx).\n"), + (unsigned long long)ip->i_di.di_num.no_addr, + (unsigned long long)ip->i_di.di_num.no_addr); + return meta_error; /* Exits check_metatree quicker */ + } + return meta_is_good; +} + +static int rangecheck_jmeta(struct gfs2_inode *ip, uint64_t block, + struct gfs2_buffer_head **bh, int h, + int *is_valid, int *was_duplicate, + void *private) +{ + int rc; + + *bh = NULL; + *was_duplicate = 0; + *is_valid = 0; + rc = rangecheck_jblock(ip, block); + if (rc == meta_is_good) { + *bh = bread(ip->i_sbd, block); + *is_valid = (gfs2_check_meta(*bh, GFS2_METATYPE_IN) == 0); + if (!(*is_valid)) { + log_err( _("Journal at block %lld (0x%llx) has a bad " + "indirect block pointer %lld (0x%llx) " + "(points to something that is not an " + "indirect block).\n"), + (unsigned long long)ip->i_di.di_num.no_addr, + (unsigned long long)ip->i_di.di_num.no_addr, + (unsigned long long)block, + (unsigned long long)block); + brelse(*bh); + *bh = NULL; + return meta_skip_further; + } + } + return rc; +} + +static int rangecheck_jdata(struct gfs2_inode *ip, uint64_t metablock, + uint64_t block, void *private, + struct gfs2_buffer_head *bh, uint64_t *ptr) +{ + return rangecheck_jblock(ip, block); +} + +struct metawalk_fxns rangecheck_journal = { + .private = NULL, + .invalid_meta_is_fatal = 1, + .check_metalist = rangecheck_jmeta, + .check_data = rangecheck_jdata, +}; + +/* + * replay_journals - replay the journals + * sdp: the super block + * preen: Was preen (-a or -p) specified? + * force_check: Was -f specified to force the check? + * @clean_journals - set to the number of clean journals we find + * + * There should be a flag to the fsck to enable/disable this + * feature. 
The fsck falls back to clearing the journal if an
+ * inconsistency is found, but only for the bad journal.
+ *
+ * Returns: 0 on success, -1 on failure
+ * NOTE(review): the value returned is whatever "error" holds after the
+ * loop, i.e. the result of the last gfs2_recover_journal() call; results
+ * for earlier journals are overwritten.  Confirm callers expect that.
+ */
+int replay_journals(struct gfs2_sbd *sdp, int preen, int force_check,
+		    int *clean_journals)
+{
+	int i;
+	int clean = 0, dirty_journals = 0, error = 0, gave_msg = 0;
+
+	*clean_journals = 0;
+
+	sdp->jsize = GFS2_DEFAULT_JSIZE;
+
+	for(i = 0; i < sdp->md.journals; i++) {
+		/* Range check the journal's metadata tree first.  A journal
+		   that fails is dropped so the missing-journal branch below
+		   handles it. */
+		if (sdp->md.journal[i]) {
+			error = check_metatree(sdp->md.journal[i],
+					       &rangecheck_journal);
+			if (error)
+				/* Don't use fsck_inode_put here because it's a
+				   system file and we need to dismantle it. */
+				inode_put(&sdp->md.journal[i]);
+			error = 0; /* bad journal is non-fatal */
+		}
+		if (!sdp->md.journal[i]) {
+			log_err(_("File system journal \"journal%d\" is "
+				  "missing or corrupt: pass1 will try to "
+				  "recreate it.\n"), i);
+			continue;
+		}
+		/* error was reset to 0 above whenever a journal is present. */
+		if (!error) {
+			/* Journal size in megabytes. */
+			uint64_t jsize = sdp->md.journal[i]->i_di.di_size /
+				(1024 * 1024);
+
+			/* The first journal with a non-default, non-zero
+			   size replaces the default sdp->jsize. */
+			if (sdp->jsize == GFS2_DEFAULT_JSIZE && jsize &&
+			    jsize != sdp->jsize)
+				sdp->jsize = jsize;
+			error = gfs2_recover_journal(sdp->md.journal[i], i,
+						     preen, force_check,
+						     &clean);
+			if (!clean)
+				dirty_journals++;
+			/* Printed at most once, after the first dirty
+			   journal has been processed. */
+			if (!gave_msg && dirty_journals == 1 && !opts.no &&
+			    preen_is_safe(sdp, preen, force_check)) {
+				gave_msg = 1;
+				log_notice( _("Recovering journals (this may "
+					      "take a while)\n"));
+			}
+			*clean_journals += clean;
+		}
+	}
+	/* Sync the buffers to disk so we get a fresh start. */
+	fsync(sdp->device_fd);
+	return error;
+}
+
+/*
+ * ji_update - fill in journal info
+ * sdp: the incore superblock pointer
+ *
+ * Given the inode for the journal index, read in all
+ * the journal inodes. 
+ * + * Returns: 0 on success, -1 on failure + */ +int ji_update(struct gfs2_sbd *sdp) +{ + struct gfs2_inode *jip, *ip = sdp->md.jiinode; + char journal_name[JOURNAL_NAME_SIZE]; + int i, error; + char buf[sizeof(struct gfs_jindex)]; + struct gfs_jindex ji; + + if (!ip) { + log_crit(_("Journal index inode not found.\n")); + return -1; + } + + /* The per_node directory will have 3 directory entries per node, + plus two for "." and "..". So we subtract the 2 and divide by 3. + If per_node is missing or damaged, we have to trust jindex has + the correct number of entries. */ + if (sdp->gfs1) + sdp->md.journals = ip->i_di.di_size / sizeof(struct gfs_jindex); + else if (sdp->md.pinode) /* if per_node was read in properly */ + sdp->md.journals = (sdp->md.pinode->i_di.di_entries - 2) / 3; + else + sdp->md.journals = ip->i_di.di_entries - 2; + + if (!(sdp->md.journal = calloc(sdp->md.journals, + sizeof(struct gfs2_inode *)))) { + log_err(_("Unable to allocate journal index\n")); + return -1; + } + memset(journal_name, 0, sizeof(*journal_name)); + for (i = 0; i < sdp->md.journals; i++) { + if (sdp->gfs1) { + error = gfs2_readi(ip, + buf, i * sizeof(struct gfs_jindex), + sizeof(struct gfs_jindex)); + if (!error) + break; + if (error != sizeof(struct gfs_jindex)){ + log_err(_("An error occurred while reading the" + " journal index file.\n")); + return -1; + } + gfs_jindex_in(&ji, buf); + sdp->md.journal[i] = lgfs2_inode_read(sdp, ji.ji_addr); + if (sdp->md.journal[i] == NULL) + return -1; + } else { + /* FIXME check snprintf return code */ + snprintf(journal_name, JOURNAL_NAME_SIZE, + "journal%u", i); + gfs2_lookupi(sdp->md.jiinode, journal_name, + strlen(journal_name), &jip); + sdp->md.journal[i] = jip; + } + } + return 0; +} + +static void bad_journalname(const char *filename, int len) +{ + if (len >= 64) + len = 63; + log_debug(_("Journal index entry '%.*s' has an invalid filename.\n"), + len, filename); +} + +/** + * check_jindex_dent - check the jindex directory entries 
+ * + * This function makes sure the directory entries of the jindex are valid. + * If they're not '.' or '..' they better have the form journalXXX. + */ +static int check_jindex_dent(struct gfs2_inode *ip, struct gfs2_dirent *dent, + struct gfs2_dirent *prev_de, + struct gfs2_buffer_head *bh, char *filename, + uint32_t *count, int *lindex, void *priv) +{ + struct gfs2_dirent dentry, *de; + int i; + + memset(&dentry, 0, sizeof(struct gfs2_dirent)); + gfs2_dirent_in(&dentry, (char *)dent); + de = &dentry; + + if (de->de_name_len == 1 && filename[0] == '.') + goto dirent_good; + if (de->de_name_len == 2 && filename[0] == '.' && filename[1] == '.') + goto dirent_good; + + if ((de->de_name_len >= 11) || /* "journal9999" */ + (de->de_name_len <= 7) || + (strncmp(filename, "journal", 7))) { + bad_journalname(filename, de->de_name_len); + return -1; + } + for (i = 7; i < de->de_name_len; i++) { + if (filename[i] < '0' || filename[i] > '9') { + bad_journalname(filename, de->de_name_len); + return -2; + } + } + +dirent_good: + /* Return the number of leaf entries so metawalk doesn't flag this + leaf as having none. */ + *count = be16_to_cpu(((struct gfs2_leaf *)bh->b_data)->lf_entries); + return 0; +} + +struct metawalk_fxns jindex_check_fxns = { + .private = NULL, + .check_dentry = check_jindex_dent, +}; + +/** + * init_jindex - read in the rindex file + */ +int init_jindex(struct gfs2_sbd *sdp, int allow_ji_rebuild) +{ + /******************************************************************* + ****************** Fill in journal information ****************** + *******************************************************************/ + + log_debug(_("Validating the journal index.\n")); + /* rgrepair requires the journals be read in in order to distinguish + "real" rgrps from rgrps that are just copies left in journals. 
*/ + if (sdp->gfs1) + sdp->md.jiinode = lgfs2_inode_read(sdp, sbd1->sb_jindex_di.no_addr); + else + gfs2_lookupi(sdp->master_dir, "jindex", 6, &sdp->md.jiinode); + + if (!sdp->md.jiinode) { + int err; + + if (!allow_ji_rebuild) { + log_crit(_("Error: jindex and rindex files are both " + "corrupt.\n")); + return -1; + } + if (!query( _("The gfs2 system jindex inode is missing. " + "Okay to rebuild it? (y/n) "))) { + log_crit(_("Error: cannot proceed without a valid " + "jindex file.\n")); + return -1; + } + + err = build_jindex(sdp); + if (err) { + log_crit(_("Error %d rebuilding jindex\n"), err); + return err; + } + gfs2_lookupi(sdp->master_dir, "jindex", 6, &sdp->md.jiinode); + } + + /* check for irrelevant entries in jindex. Can't use check_dir because + that creates and destroys the inode, which we don't want. */ + if (!sdp->gfs1) { + int error; + + log_debug(_("Checking the integrity of the journal index.\n")); + if (sdp->md.jiinode->i_di.di_flags & GFS2_DIF_EXHASH) + error = check_leaf_blks(sdp->md.jiinode, + &jindex_check_fxns); + else + error = check_linear_dir(sdp->md.jiinode, + sdp->md.jiinode->i_bh, + &jindex_check_fxns); + if (error) { + log_err(_("The system journal index is damaged.\n")); + if (!query( _("Okay to rebuild it? 
(y/n) "))) { + log_crit(_("Error: cannot proceed without a " + "valid jindex file.\n")); + return -1; + } + inode_put(&sdp->md.jiinode); + gfs2_dirent_del(sdp->master_dir, "jindex", 6); + log_err(_("Corrupt journal index was removed.\n")); + error = build_jindex(sdp); + if (error) { + log_err(_("Error rebuilding journal " + "index: Cannot continue.\n")); + return error; + } + gfs2_lookupi(sdp->master_dir, "jindex", 6, + &sdp->md.jiinode); + } + } + + /* read in the ji data */ + if (ji_update(sdp)){ + log_err( _("Unable to read jindex inode.\n")); + return -1; + } + return 0; +} diff --git a/gfs2/fsck/fs_recovery.h b/gfs2/fsck/fs_recovery.h new file mode 100644 index 0000000..d687627 --- /dev/null +++ b/gfs2/fsck/fs_recovery.h @@ -0,0 +1,13 @@ +#ifndef __FS_RECOVERY_H__ +#define __FS_RECOVERY_H__ + +#include "libgfs2.h" + +extern int replay_journals(struct gfs2_sbd *sdp, int preen, int force_check, + int *clean_journals); +extern int preen_is_safe(struct gfs2_sbd *sdp, int preen, int force_check); + +extern int ji_update(struct gfs2_sbd *sdp); +extern int init_jindex(struct gfs2_sbd *sdp, int allow_ji_rebuild); +#endif /* __FS_RECOVERY_H__ */ + diff --git a/gfs2/fsck/fsck.h b/gfs2/fsck/fsck.h new file mode 100644 index 0000000..d3f7635 --- /dev/null +++ b/gfs2/fsck/fsck.h @@ -0,0 +1,200 @@ +#ifndef _FSCK_H +#define _FSCK_H + +#include "libgfs2.h" +#include "osi_tree.h" + +#define FSCK_HASH_SHIFT (13) +#define FSCK_HASH_SIZE (1 << FSCK_HASH_SHIFT) +#define FSCK_HASH_MASK (FSCK_HASH_SIZE - 1) + +#define query(fmt, args...) 
fsck_query(fmt, ##args)
+
+/*
+ * Exit codes used by fsck-type programs
+ * Copied from e2fsck's e2fsck.h
+ */
+#define FSCK_OK 0 /* No errors */
+#define FSCK_NONDESTRUCT 1 /* File system errors corrected */
+#define FSCK_REBOOT 2 /* System should be rebooted */
+#define FSCK_UNCORRECTED 4 /* File system errors left uncorrected */
+#define FSCK_ERROR 8 /* Operational error */
+#define FSCK_USAGE 16 /* Usage or syntax error */
+#define FSCK_CANCELED 32 /* Aborted with a signal or ^C */
+#define FSCK_LIBRARY 128 /* Shared library error */
+
+#define BAD_POINTER_TOLERANCE 10 /* How many bad pointers is too many? */
+
+struct gfs2_bmap {
+	uint64_t size;
+	uint64_t mapsize;
+	unsigned char *map;
+};
+
+struct inode_info
+{
+	struct osi_node node;
+	struct gfs2_inum di_num;
+	uint32_t di_nlink; /* the number of links the inode
+			    * thinks it has */
+	uint32_t counted_links; /* the number of links we've found */
+};
+
+struct dir_info
+{
+	struct osi_node node;
+	struct gfs2_inum dinode;
+	uint64_t treewalk_parent;
+	struct gfs2_inum dotdot_parent;
+	uint32_t di_nlink;
+	uint32_t counted_links;
+	uint8_t checked:1;
+};
+
+struct dir_status {
+	uint8_t dotdir:1;
+	uint8_t dotdotdir:1;
+	int q;
+	uint32_t entry_count;
+};
+
+#define DUPFLAG_REF1_FOUND 1 /* Has the original reference been found? */
+#define DUPFLAG_REF1_IS_DUPL 2 /* The original reference is also where we
+				  determined there was a duplicate. */
+
+struct duptree {
+	struct osi_node node;
+	int dup_flags;
+	int refs;
+	uint64_t block;
+	osi_list_t ref_inode_list; /* list of inodes referencing a dup block */
+	osi_list_t ref_invinode_list; /* list of invalid inodes referencing */
+};
+
+enum dup_ref_type {
+	ref_as_data = 0, /* dinode references this block as a data block */
+	ref_as_meta = 1, /* dinode references this block as a metadata block */
+	ref_as_ea = 2, /* dinode references this block as an extended attr */
+	ref_is_inode= 3, /* The reference is itself a dinode.  In other words,
+			    it's a dinode, not pointed to as data or
+			    metadata */
+	ref_types = 4, /* number of reference types; sizes reftypecount[] below */
+};
+
+struct inode_with_dups {
+	osi_list_t list;
+	uint64_t block_no;
+	int dup_count;
+	int reftypecount[ref_types];
+	uint64_t parent;
+	char *name;
+};
+
+enum rgindex_trust_level { /* how far can we trust our RG index? */
+	blind_faith = 0, /* We'd like to trust the rgindex. We always used to
+			    before bz 179069. This should cover most cases. */
+	ye_of_little_faith = 1, /* The rindex seems trustworthy but there's
+				   rg damage that need to be fixed. */
+	open_minded = 2, /* At least 1 RG is corrupt. Try to calculate what it
+			    should be, in a perfect world where our RGs are all
+			    on even boundaries. Blue sky. Chirping birds. */
+	distrust = 3, /* The world isn't perfect, our RGs are not on nice neat
+			 boundaries. The fs must have been messed with by
+			 gfs2_grow or something. Count the RGs by hand. */
+	indignation = 4 /* Not only do we have corruption, but the rgrps
+			   aren't on even boundaries, so this file system
+			   must have been converted from gfs2_convert. */
+};
+
+struct error_block {
+	uint64_t metablk; /* metadata block where error was found */
+	int metaoff; /* offset in that metadata block where error found */
+	uint64_t errblk; /* error block */
+};
+
+extern struct gfs2_inode *fsck_load_inode(struct gfs2_sbd *sdp, uint64_t block);
+extern struct gfs2_inode *fsck_inode_get(struct gfs2_sbd *sdp,
+					  struct rgrp_tree *rgd,
+					  struct gfs2_buffer_head *bh);
+extern void fsck_inode_put(struct gfs2_inode **ip);
+
+extern int initialize(struct gfs2_sbd *sdp, int force_check, int preen,
+		      int *all_clean);
+extern void destroy(struct gfs2_sbd *sdp);
+extern int pass1(struct gfs2_sbd *sdp);
+extern int pass1b(struct gfs2_sbd *sdp);
+extern int pass1c(struct gfs2_sbd *sdp);
+extern int pass2(struct gfs2_sbd *sdp);
+extern int pass3(struct gfs2_sbd *sdp);
+extern int pass4(struct gfs2_sbd *sdp);
+extern int pass5(struct gfs2_sbd *sdp, struct gfs2_bmap *bl);
+extern int rg_repair(struct gfs2_sbd *sdp, int trust_lvl, int *rg_count,
+		     int *sane);
+extern int fsck_query(const char *format, ...)
+	__attribute__((format(printf,1,2)));
+extern struct dir_info *dirtree_find(uint64_t block);
+extern void dup_delete(struct duptree *dt);
+extern void dirtree_delete(struct dir_info *b);
+
+/* FIXME: Hack to get this going for pass2 - this should be pulled out
+ * of pass1 and put somewhere else... */
+struct dir_info *dirtree_insert(struct gfs2_inum inum);
+
+struct gfs2_options {
+	char *device;
+	unsigned int yes:1;
+	unsigned int no:1;
+	unsigned int query:1;
+};
+
+extern struct gfs2_options opts;
+extern struct gfs2_inode *lf_dip; /* Lost and found directory inode */
+extern int lf_was_created;
+extern uint64_t last_fs_block, last_reported_block;
+extern int64_t last_reported_fblock;
+extern int skip_this_pass, fsck_abort;
+extern int errors_found, errors_corrected;
+extern uint64_t last_data_block;
+extern uint64_t first_data_block;
+extern struct osi_root dup_blocks;
+extern struct osi_root dirtree;
+extern struct osi_root inodetree;
+extern int dups_found; /* How many duplicate references have we found? */
+extern int dups_found_first; /* How many duplicates have we found the original
+				 reference for? */
+extern struct gfs_sb *sbd1;
+/*
+ * valid_block - nonzero when blkno is no greater than sdp->fssize, is
+ * above LGFS2_SB_ADDR(sdp), and lgfs2_get_bitmap() does not return a
+ * negative value for it; zero otherwise.
+ */
+static inline int valid_block(struct gfs2_sbd *sdp, uint64_t blkno)
+{
+	return !((blkno > sdp->fssize) || (blkno <= LGFS2_SB_ADDR(sdp)) ||
+		 (lgfs2_get_bitmap(sdp, blkno, NULL) < 0));
+}
+/*
+ * rgrp_contains_block - 1 when blk lies in the half-open range
+ * [ri_addr, ri_data0 + ri_data) of the given resource group, else 0.
+ */
+static inline int rgrp_contains_block(struct rgrp_tree *rgd, uint64_t blk)
+{
+	if (blk < rgd->ri.ri_addr)
+		return 0;
+	if (blk >= rgd->ri.ri_data0 + rgd->ri.ri_data)
+		return 0;
+	return 1;
+}
+/*
+ * valid_block_ip - same outer range test as valid_block(), but the block
+ * is then matched against the inode's own resource group (ip->i_rgd)
+ * first, falling back to a gfs2_blk2rgrpd() lookup only when that rgrp
+ * is missing or does not contain the block.
+ */
+static inline int valid_block_ip(struct gfs2_inode *ip, uint64_t blk)
+{
+	struct gfs2_sbd *sdp = ip->i_sbd;
+	struct rgrp_tree *rgd = ip->i_rgd;
+
+	if (blk > sdp->fssize)
+		return 0;
+	if (blk <= LGFS2_SB_ADDR(sdp))
+		return 0;
+	if (rgd == NULL || !rgrp_contains_block(rgd, blk)) {
+		rgd = gfs2_blk2rgrpd(sdp, blk);
+		if (rgd == NULL)
+			return 0;
+	}
+
+	return rgrp_contains_block(rgd, blk);
+}
+
+#endif /* _FSCK_H */
diff --git a/gfs2/fsck/initialize.c b/gfs2/fsck/initialize.c
new file mode 100644
index 0000000..ebe62b9
--- /dev/null
+++ b/gfs2/fsck/initialize.c
@@ -0,0 +1,1713 @@
+#include "clusterautoconfig.h"
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#define _(String) gettext(String) + +#include +#include "libgfs2.h" +#include "fsck.h" +#include "util.h" +#include "fs_recovery.h" +#include "metawalk.h" +#include "inode_hash.h" + +#define CLEAR_POINTER(x) \ + if (x) { \ + free(x); \ + x = NULL; \ + } +#define HIGHEST_BLOCK 0xffffffffffffffff + +static int was_mounted_ro = 0; +static uint64_t possible_root = HIGHEST_BLOCK; +static struct master_dir fix_md; +static unsigned long long blks_2free = 0; +extern int sb_fixed; + +/** + * block_mounters + * + * Change the lock protocol so nobody can mount the fs + * + */ +static int block_mounters(struct gfs2_sbd *sdp, int block_em) +{ + if (block_em) { + /* verify it starts with lock_ */ + if (!strncmp(sdp->sd_sb.sb_lockproto, "lock_", 5)) { + /* Change lock_ to fsck_ */ + memcpy(sdp->sd_sb.sb_lockproto, "fsck_", 5); + } + /* FIXME: Need to do other verification in the else + * case */ + } else { + /* verify it starts with fsck_ */ + /* verify it starts with lock_ */ + if (!strncmp(sdp->sd_sb.sb_lockproto, "fsck_", 5)) { + /* Change fsck_ to lock_ */ + memcpy(sdp->sd_sb.sb_lockproto, "lock_", 5); + } + } + + if (lgfs2_sb_write(&sdp->sd_sb, sdp->device_fd, sdp->bsize)) { + stack; + return -1; + } + return 0; +} + +static void gfs2_dup_free(void) +{ + struct osi_node *n; + struct duptree *dt; + + while ((n = osi_first(&dup_blocks))) { + dt = (struct duptree *)n; + dup_delete(dt); + } +} + +static void gfs2_dirtree_free(void) +{ + struct osi_node *n; + struct dir_info *dt; + + while ((n = osi_first(&dirtree))) { + dt = (struct dir_info *)n; + dirtree_delete(dt); + } +} + +static void gfs2_inodetree_free(void) +{ + struct osi_node *n; + struct inode_info *dt; + + while ((n = osi_first(&inodetree))) { + dt = (struct inode_info *)n; + inodetree_delete(dt); + } +} + +/* + * empty_super_block - free all structures in the super block + * sdp: the in-core super block + * + * This function frees all allocated structures within the + * super block. 
It does not free the super block itself. + * + * Returns: Nothing + */ +static void empty_super_block(struct gfs2_sbd *sdp) +{ + log_info( _("Freeing buffers.\n")); + gfs2_rgrp_free(&sdp->rgtree); + + gfs2_inodetree_free(); + gfs2_dirtree_free(); + gfs2_dup_free(); +} + + +/** + * set_block_ranges + * @sdp: superblock + * + * Uses info in rgrps and jindex to determine boundaries of the + * file system. + * + * Returns: 0 on success, -1 on failure + */ +static int set_block_ranges(struct gfs2_sbd *sdp) +{ + struct osi_node *n, *next = NULL; + struct rgrp_tree *rgd; + struct gfs2_rindex *ri; + char buf[sdp->sd_sb.sb_bsize]; + uint64_t rmax = 0; + uint64_t rmin = 0; + int error; + + log_info( _("Setting block ranges...")); + + for (n = osi_first(&sdp->rgtree); n; n = next) { + next = osi_next(n); + rgd = (struct rgrp_tree *)n; + ri = &rgd->ri; + if (ri->ri_data0 + ri->ri_data && + ri->ri_data0 + ri->ri_data - 1 > rmax) + rmax = ri->ri_data0 + ri->ri_data - 1; + if (!rmin || ri->ri_data0 < rmin) + rmin = ri->ri_data0; + } + + last_fs_block = rmax; + if (last_fs_block > 0xffffffff && sizeof(unsigned long) <= 4) { + log_crit( _("This file system is too big for this computer to handle.\n")); + log_crit( _("Last fs block = 0x%llx, but sizeof(unsigned long) is %zu bytes.\n"), + (unsigned long long)last_fs_block, + sizeof(unsigned long)); + goto fail; + } + + last_data_block = rmax; + first_data_block = rmin; + + if (fsck_lseek(sdp->device_fd, (last_fs_block * sdp->sd_sb.sb_bsize))){ + log_crit( _("Can't seek to last block in file system: %llu" + " (0x%llx)\n"), (unsigned long long)last_fs_block, + (unsigned long long)last_fs_block); + goto fail; + } + + memset(buf, 0, sdp->sd_sb.sb_bsize); + error = read(sdp->device_fd, buf, sdp->sd_sb.sb_bsize); + if (error != sdp->sd_sb.sb_bsize){ + log_crit( _("Can't read last block in file system (error %u), " + "last_fs_block: %llu (0x%llx)\n"), error, + (unsigned long long)last_fs_block, + (unsigned long long)last_fs_block); + goto 
fail; + } + + log_info(_("0x%llx to 0x%llx\n"), (unsigned long long)first_data_block, + (unsigned long long)last_data_block); + return 0; + + fail: + log_info( _("Error\n")); + return -1; +} + +/** + * check_rgrp_integrity - verify a rgrp free block count against the bitmap + */ +static void check_rgrp_integrity(struct gfs2_sbd *sdp, struct rgrp_tree *rgd, + int *fixit, int *this_rg_fixed, + int *this_rg_bad, int *this_rg_cleaned) +{ + uint32_t rg_free, rg_reclaimed, rg_unlinked, rg_usedmeta, rg_useddi; + int rgb, x, y, off, bytes_to_check, total_bytes_to_check, asked = 0; + unsigned int state; + struct gfs_rgrp *gfs1rg = (struct gfs_rgrp *)&rgd->rg; + uint64_t diblock; + struct gfs2_buffer_head *bh; + + rg_free = rg_reclaimed = rg_unlinked = rg_usedmeta = rg_useddi = 0; + total_bytes_to_check = rgd->ri.ri_bitbytes; + + *this_rg_fixed = *this_rg_bad = *this_rg_cleaned = 0; + + diblock = rgd->ri.ri_data0; + for (rgb = 0; rgb < rgd->ri.ri_length; rgb++){ + /* Count up the free blocks in the bitmap */ + off = (rgb) ? 
sizeof(struct gfs2_meta_header) : + sizeof(struct gfs2_rgrp); + if (total_bytes_to_check <= sdp->bsize - off) + bytes_to_check = total_bytes_to_check; + else + bytes_to_check = sdp->bsize - off; + total_bytes_to_check -= bytes_to_check; + for (x = 0; x < bytes_to_check; x++) { + unsigned char *byte; + + byte = (unsigned char *)&rgd->bits[rgb].bi_bh->b_data[off + x]; + if (*byte == 0x55) { + diblock += GFS2_NBBY; + continue; + } + if (*byte == 0x00) { + diblock += GFS2_NBBY; + rg_free += GFS2_NBBY; + continue; + } + for (y = 0; y < GFS2_NBBY; y++) { + state = (*byte >> + (GFS2_BIT_SIZE * y)) & GFS2_BIT_MASK; + if (state == GFS2_BLKST_USED) { + diblock++; + continue; + } + if (state == GFS2_BLKST_DINODE) { + if (sdp->gfs1) { + bh = bread(sdp, diblock); + if (!gfs2_check_meta(bh, + GFS2_METATYPE_DI)) + rg_useddi++; + else + rg_usedmeta++; + brelse(bh); + } + diblock++; + continue; + } + if (state == GFS2_BLKST_FREE) { + diblock++; + rg_free++; + continue; + } + /* GFS2_BLKST_UNLINKED */ + if (sdp->gfs1) + log_info(_("Free metadata block 0x%llx" + " found.\n"), + (unsigned long long)diblock); + else + log_info(_("Unlinked dinode 0x%llx " + "found.\n"), + (unsigned long long)diblock); + if (!asked) { + char msg[256]; + + asked = 1; + sprintf(msg, + _("Okay to reclaim free " + "metadata in resource group " + "%lld (0x%llx)? 
(y/n)"), + (unsigned long long)rgd->ri.ri_addr, + (unsigned long long)rgd->ri.ri_addr); + if (query("%s", msg)) + *fixit = 1; + } + if (!(*fixit)) { + rg_unlinked++; + diblock++; + continue; + } + *byte &= ~(GFS2_BIT_MASK << + (GFS2_BIT_SIZE * y)); + bmodified(rgd->bits[rgb].bi_bh); + rg_reclaimed++; + rg_free++; + rgd->rg.rg_free++; + if (sdp->gfs1 && gfs1rg->rg_freemeta) + gfs1rg->rg_freemeta--; + log_info(_("Free metadata block %lld (0x%llx) " + "reclaimed.\n"), + (unsigned long long)diblock, + (unsigned long long)diblock); + bh = bread(sdp, diblock); + if (!gfs2_check_meta(bh, GFS2_METATYPE_DI)) { + struct gfs2_inode *ip = + fsck_inode_get(sdp, rgd, bh); + if (ip->i_di.di_blocks > 1) { + blks_2free += + ip->i_di.di_blocks - 1; + log_info(_("%lld blocks " + "(total) may need " + "to be freed in " + "pass 5.\n"), + blks_2free); + } + fsck_inode_put(&ip); + } + brelse(bh); + diblock++; + } + } + } + /* The unlinked blocks we reclaim shouldn't be considered errors, + since we're just reclaiming them as a courtesy. If we already + got permission to reclaim them, we adjust the rgrp counts + accordingly. That way, only "real" rgrp count inconsistencies + will be reported. */ + if (rg_reclaimed && *fixit) { + if (sdp->gfs1) + gfs_rgrp_out((struct gfs_rgrp *)&rgd->rg, rgd->bits[0].bi_bh); + else + gfs2_rgrp_out(&rgd->rg, rgd->bits[0].bi_bh->b_data); + bmodified(rgd->bits[0].bi_bh); + *this_rg_cleaned = 1; + log_info( _("The rgrp at %lld (0x%llx) was cleaned of %d " + "free metadata blocks.\n"), + (unsigned long long)rgd->ri.ri_addr, + (unsigned long long)rgd->ri.ri_addr, + rg_reclaimed); + } + if (rgd->rg.rg_free != rg_free) { + *this_rg_bad = 1; + *this_rg_cleaned = 0; + log_err( _("Error: resource group %lld (0x%llx): " + "free space (%d) does not match bitmap (%d)\n"), + (unsigned long long)rgd->ri.ri_addr, + (unsigned long long)rgd->ri.ri_addr, + rgd->rg.rg_free, rg_free); + if (query( _("Fix the rgrp free blocks count? 
(y/n)"))) { + rgd->rg.rg_free = rg_free; + if (sdp->gfs1) + gfs_rgrp_out((struct gfs_rgrp *)&rgd->rg, rgd->bits[0].bi_bh); + else + gfs2_rgrp_out(&rgd->rg, rgd->bits[0].bi_bh->b_data); + bmodified(rgd->bits[0].bi_bh); + *this_rg_fixed = 1; + log_err( _("The rgrp was fixed.\n")); + } else + log_err( _("The rgrp was not fixed.\n")); + } + if (!sdp->gfs1) + return; + + if (gfs1rg->rg_freemeta != rg_unlinked) { + *this_rg_bad = 1; + *this_rg_cleaned = 0; + log_err( _("Error: resource group %lld (0x%llx): " + "free meta (%d) does not match bitmap (%d)\n"), + (unsigned long long)rgd->ri.ri_addr, + (unsigned long long)rgd->ri.ri_addr, + gfs1rg->rg_freemeta, rg_unlinked); + if (query( _("Fix the rgrp free meta blocks count? (y/n)"))) { + gfs1rg->rg_freemeta = rg_unlinked; + gfs_rgrp_out((struct gfs_rgrp *)&rgd->rg, rgd->bits[0].bi_bh); + *this_rg_fixed = 1; + log_err( _("The rgrp was fixed.\n")); + } else + log_err( _("The rgrp was not fixed.\n")); + } + if (gfs1rg->rg_useddi != rg_useddi) { + *this_rg_bad = 1; + *this_rg_cleaned = 0; + log_err( _("Error: resource group %lld (0x%llx): used dinode " + "count (%d) does not match bitmap (%d)\n"), + (unsigned long long)rgd->ri.ri_addr, + (unsigned long long)rgd->ri.ri_addr, + gfs1rg->rg_useddi, rg_useddi); + if (query( _("Fix the rgrp used dinode block count? (y/n)"))) { + gfs1rg->rg_useddi = rg_useddi; + gfs_rgrp_out((struct gfs_rgrp *)&rgd->rg, + rgd->bits[0].bi_bh); + *this_rg_fixed = 1; + log_err( _("The rgrp was fixed.\n")); + } else + log_err( _("The rgrp was not fixed.\n")); + } + if (gfs1rg->rg_usedmeta != rg_usedmeta) { + *this_rg_bad = 1; + *this_rg_cleaned = 0; + log_err( _("Error: resource group %lld (0x%llx): used " + "metadata (%d) does not match bitmap (%d)\n"), + (unsigned long long)rgd->ri.ri_addr, + (unsigned long long)rgd->ri.ri_addr, + gfs1rg->rg_usedmeta, rg_usedmeta); + if (query( _("Fix the rgrp used meta blocks count? 
(y/n)"))) { + gfs1rg->rg_usedmeta = rg_usedmeta; + gfs_rgrp_out((struct gfs_rgrp *)&rgd->rg, + rgd->bits[0].bi_bh); + *this_rg_fixed = 1; + log_err( _("The rgrp was fixed.\n")); + } else + log_err( _("The rgrp was not fixed.\n")); + } + /* + else { + log_debug( _("Resource group %lld (0x%llx) free space " + "is consistent: free: %d reclaimed: %d\n"), + (unsigned long long)rgd->ri.ri_addr, + (unsigned long long)rgd->ri.ri_addr, + rg_free, rg_reclaimed); + }*/ +} + +/** + * check_rgrps_integrity - verify rgrp consistency + * Note: We consider an rgrp "cleaned" if the unlinked meta blocks are + * cleaned, so not quite "bad" and not quite "good" but rewritten anyway. + * + * Returns: 0 on success, 1 if errors were detected + */ +static void check_rgrps_integrity(struct gfs2_sbd *sdp) +{ + struct osi_node *n, *next = NULL; + int rgs_good = 0, rgs_bad = 0, rgs_fixed = 0, rgs_cleaned = 0; + int was_bad = 0, was_fixed = 0, was_cleaned = 0; + struct rgrp_tree *rgd; + int reclaim_unlinked = 0; + + log_info( _("Checking the integrity of all resource groups.\n")); + for (n = osi_first(&sdp->rgtree); n; n = next) { + next = osi_next(n); + rgd = (struct rgrp_tree *)n; + if (fsck_abort) + return; + check_rgrp_integrity(sdp, rgd, &reclaim_unlinked, + &was_fixed, &was_bad, &was_cleaned); + if (was_fixed) + rgs_fixed++; + if (was_cleaned) + rgs_cleaned++; + else if (was_bad) + rgs_bad++; + else + rgs_good++; + } + if (rgs_bad || rgs_cleaned) { + log_err( _("RGs: Consistent: %d Cleaned: %d Inconsistent: " + "%d Fixed: %d Total: %d\n"), + rgs_good, rgs_cleaned, rgs_bad, rgs_fixed, + rgs_good + rgs_bad + rgs_cleaned); + if (rgs_cleaned && blks_2free) + log_err(_("%lld blocks may need to be freed in pass 5 " + "due to the cleaned resource groups.\n"), + blks_2free); + } +} + +/** + * rebuild_master - rebuild a destroyed master directory + */ +static int rebuild_master(struct gfs2_sbd *sdp) +{ + struct gfs2_inum inum; + struct gfs2_buffer_head *bh = NULL; + int err = 0; + + 
log_err(_("The system master directory seems to be destroyed.\n")); + if (!query(_("Okay to rebuild it? (y/n)"))) { + log_err(_("System master not rebuilt; aborting.\n")); + return -1; + } + log_err(_("Trying to rebuild the master directory.\n")); + inum.no_formal_ino = sdp->md.next_inum++; + inum.no_addr = sdp->sd_sb.sb_master_dir.no_addr; + err = init_dinode(sdp, &bh, &inum, S_IFDIR | 0755, GFS2_DIF_SYSTEM, &inum); + if (err != 0) + return -1; + sdp->master_dir = lgfs2_inode_get(sdp, bh); + if (sdp->master_dir == NULL) { + log_crit(_("Error reading master: %s\n"), strerror(errno)); + return -1; + } + sdp->master_dir->bh_owned = 1; + + if (fix_md.jiinode) { + inum.no_formal_ino = sdp->md.next_inum++; + inum.no_addr = fix_md.jiinode->i_di.di_num.no_addr; + err = dir_add(sdp->master_dir, "jindex", 6, &inum, + IF2DT(S_IFDIR | 0700)); + if (err) { + log_crit(_("Error %d adding jindex directory\n"), errno); + exit(FSCK_ERROR); + } + sdp->master_dir->i_di.di_nlink++; + } else { + err = build_jindex(sdp); + if (err) { + log_crit(_("Error %d building jindex\n"), err); + exit(FSCK_ERROR); + } + } + + if (fix_md.pinode) { + inum.no_formal_ino = sdp->md.next_inum++; + inum.no_addr = fix_md.pinode->i_di.di_num.no_addr; + err = dir_add(sdp->master_dir, "per_node", 8, &inum, + IF2DT(S_IFDIR | 0700)); + if (err) { + log_crit(_("Error %d adding per_node directory\n"), + errno); + exit(FSCK_ERROR); + } + sdp->master_dir->i_di.di_nlink++; + } else { + err = build_per_node(sdp); + if (err) { + log_crit(_("Error %d building per_node directory\n"), + err); + exit(FSCK_ERROR); + } + } + + if (fix_md.inum) { + inum.no_formal_ino = sdp->md.next_inum++; + inum.no_addr = fix_md.inum->i_di.di_num.no_addr; + err = dir_add(sdp->master_dir, "inum", 4, &inum, + IF2DT(S_IFREG | 0600)); + if (err) { + log_crit(_("Error %d adding inum inode\n"), errno); + exit(FSCK_ERROR); + } + } else { + err = build_inum(sdp); + if (err) { + log_crit(_("Error %d building inum inode\n"), err); + exit(FSCK_ERROR); 
+ } + gfs2_lookupi(sdp->master_dir, "inum", 4, &sdp->md.inum); + } + + if (fix_md.statfs) { + inum.no_formal_ino = sdp->md.next_inum++; + inum.no_addr = fix_md.statfs->i_di.di_num.no_addr; + err = dir_add(sdp->master_dir, "statfs", 6, &inum, + IF2DT(S_IFREG | 0600)); + if (err) { + log_crit(_("Error %d adding statfs inode\n"), errno); + exit(FSCK_ERROR); + } + } else { + err = build_statfs(sdp); + if (err) { + log_crit(_("Error %d building statfs inode\n"), err); + exit(FSCK_ERROR); + } + gfs2_lookupi(sdp->master_dir, "statfs", 6, &sdp->md.statfs); + } + + if (fix_md.riinode) { + inum.no_formal_ino = sdp->md.next_inum++; + inum.no_addr = fix_md.riinode->i_di.di_num.no_addr; + err = dir_add(sdp->master_dir, "rindex", 6, &inum, + IF2DT(S_IFREG | 0600)); + if (err) { + log_crit(_("Error %d adding rindex inode\n"), errno); + exit(FSCK_ERROR); + } + } else { + err = build_rindex(sdp); + if (err) { + log_crit(_("Error %d building rindex inode\n"), err); + exit(FSCK_ERROR); + } + } + + if (fix_md.qinode) { + inum.no_formal_ino = sdp->md.next_inum++; + inum.no_addr = fix_md.qinode->i_di.di_num.no_addr; + err = dir_add(sdp->master_dir, "quota", 5, &inum, + IF2DT(S_IFREG | 0600)); + if (err) { + log_crit(_("Error %d adding quota inode\n"), errno); + exit(FSCK_ERROR); + } + } else { + err = build_quota(sdp); + if (err) { + log_crit(_("Error %d building quota inode\n"), err); + exit(FSCK_ERROR); + } + } + + log_err(_("Master directory rebuilt.\n")); + inode_put(&sdp->md.inum); + inode_put(&sdp->md.statfs); + inode_put(&sdp->master_dir); + return 0; +} + +/** + * lookup_per_node - Make sure the per_node directory is read in + * + * This function is used to read in the per_node directory. It is called + * twice. The first call tries to read in the dinode early on. That ensures + * that if any journals are missing, we can figure out the number of journals + * from per_node. 
However, we unfortunately can't rebuild per_node at that
+ * point in time because our resource groups aren't read in yet.
+ * The second time it's called is much later when we can rebuild it.
+ *
+ * allow_rebuild: 0 if rebuilds are not allowed
+ *                1 if rebuilds are allowed
+ *
+ * Returns nothing.  With allow_rebuild set, the process exits with
+ * FSCK_ERROR if build_per_node() fails or if per_node still cannot be
+ * looked up afterwards (which includes the user declining the rebuild).
+ */
+static void lookup_per_node(struct gfs2_sbd *sdp, int allow_rebuild)
+{
+	if (sdp->md.pinode) /* already read in by an earlier call */
+		return;
+
+	gfs2_lookupi(sdp->master_dir, "per_node", 8, &sdp->md.pinode);
+	if (sdp->md.pinode)
+		return;
+	if (!allow_rebuild) {
+		log_err( _("The gfs2 system per_node directory "
+			   "inode is missing, so we might not be \nable to "
+			   "rebuild missing journals this run.\n"));
+		return;
+	}
+
+	if (query( _("The gfs2 system per_node directory "
+		     "inode is missing. Okay to rebuild it? (y/n) "))) {
+		int err;
+
+		err = build_per_node(sdp);
+		if (err) {
+			log_crit(_("Error %d rebuilding per_node directory\n"),
+				 err);
+			exit(FSCK_ERROR);
+		}
+	}
+	gfs2_lookupi(sdp->master_dir, "per_node", 8, &sdp->md.pinode);
+	if (!sdp->md.pinode) {
+		log_err( _("Unable to rebuild per_node; aborting.\n"));
+		exit(FSCK_ERROR);
+	}
+}
+
+/**
+ * fetch_rgrps - fetch the resource groups from disk, and check their integrity
+ *
+ * Tries rg_repair() followed by ri_update() at each trust level from
+ * blind_faith up to indignation and stops at the first level where both
+ * return 0, or when fsck_abort is set.
+ *
+ * Returns: 0 once a level passes (or the loop was aborted), -1 if every
+ *          level failed
+ */
+static int fetch_rgrps(struct gfs2_sbd *sdp)
+{
+	enum rgindex_trust_level trust_lvl;
+	int rgcount, sane = 1;
+
+	const char *level_desc[] = { /* indexed by trust level */
+		_("Checking if all rgrp and rindex values are good"),
+		_("Checking if rindex values may be easily repaired"),
+		_("Calculating where the rgrps should be if evenly spaced"),
+		_("Trying to rebuild rindex assuming evenly spaced rgrps"),
+		_("Trying to rebuild rindex assuming unevenly spaced rgrps"),
+	};
+	const char *fail_desc[] = { /* indexed by trust level */
+		_("Some damage was found; we need to take remedial measures"),
+		_("rindex is unevenly spaced: either gfs1-style or corrupt"),
+		_("rindex calculations don't match: uneven rgrp boundaries"),
+		_("Too many rgrp misses: rgrps must be unevenly spaced"),
+		_("Too much damage found: we cannot rebuild this rindex"),
+	};
+	/*******************************************************************
+	 ********  Validate and read in resource group information  ********
+	 *******************************************************************/
+	log_notice(_("Validating resource group index.\n"));
+	for (trust_lvl = blind_faith; trust_lvl <= indignation; trust_lvl++) {
+		int ret = 0; /* stays 0 when rg_repair() itself fails */
+
+		log_notice(_("Level %d resource group check: %s.\n"), trust_lvl + 1,
+			   level_desc[trust_lvl]);
+		if ((rg_repair(sdp, trust_lvl, &rgcount, &sane) == 0) &&
+		    ((ret = ri_update(sdp, 0, &rgcount, &sane)) == 0)) {
+			log_notice(_("(level %d passed)\n"), trust_lvl + 1);
+			break;
+		} else {
+			if (ret == -1)
+				log_err( _("(level %d failed: %s)\n"),
+					 trust_lvl + 1, fail_desc[trust_lvl]);
+			else /* ret is printed as the failing block number */
+				log_err( _("(level %d failed at block %lld "
+					   "(0x%llx): %s)\n"), trust_lvl + 1,
+					 (unsigned long long)ret,
+					 (unsigned long long)ret,
+					 fail_desc[trust_lvl]);
+		}
+		if (fsck_abort)
+			break;
+	}
+	if (trust_lvl > indignation) { /* the loop ran out of levels */
+		log_err( _("Resource group recovery impossible; I can't fix "
+			   "this file system.\n"));
+		return -1;
+	}
+	log_info( _("%u resource groups found.\n"), rgcount);
+
+	check_rgrps_integrity(sdp);
+	return 0;
+}
+
+/**
+ * init_system_inodes
+ *
+ * Returns: 0 on success, -1 on failure
+ */
+static int init_system_inodes(struct gfs2_sbd *sdp)
+{
+	uint64_t inumbuf = 0;
+	char *buf;
+	struct gfs2_statfs_change sc;
+	int err;
+
+	/*******************************************************************
+	 ******************  Initialize important inodes  ******************
+	 *******************************************************************/
+
+	log_info( _("Initializing special inodes...\n"));
+
+	/* Get root dinode */
+	sdp->md.rooti = lgfs2_inode_read(sdp, sdp->sd_sb.sb_root_dir.no_addr);
+	if (sdp->md.rooti == NULL)
+		return -1;
+
+	/*******************************************************************
+	 *****************  Initialize more system inodes  *****************
+
*******************************************************************/ + if (!sdp->gfs1) { + /* Look for "inum" entry in master dinode */ + gfs2_lookupi(sdp->master_dir, "inum", 4, &sdp->md.inum); + if (!sdp->md.inum) { + if (!query( _("The gfs2 system inum inode is missing. " + "Okay to rebuild it? (y/n) "))) { + log_err( _("fsck.gfs2 cannot continue without " + "a valid inum file; aborting.\n")); + goto fail; + } + err = build_inum(sdp); + if (err) { + log_crit(_("Error %d rebuilding inum inode\n"), + err); + exit(FSCK_ERROR); + } + gfs2_lookupi(sdp->master_dir, "inum", 4, + &sdp->md.inum); + if (!sdp->md.inum) { + log_crit(_("System inum inode was not rebuilt." + " Aborting.\n")); + goto fail; + } + } + /* Read inum entry into buffer */ + err = gfs2_readi(sdp->md.inum, &inumbuf, 0, + sdp->md.inum->i_di.di_size); + if (err != sdp->md.inum->i_di.di_size) { + log_crit(_("Error %d reading system inum inode. " + "Aborting.\n"), err); + goto fail; + } + /* call gfs2_inum_range_in() to retrieve range */ + sdp->md.next_inum = be64_to_cpu(inumbuf); + } + + if (sdp->gfs1) { + /* In gfs1, the license_di is always 3 blocks after the jindex_di */ + if ((sbd1->sb_license_di.no_addr != sbd1->sb_jindex_di.no_addr + 3) || + (sbd1->sb_license_di.no_formal_ino != sbd1->sb_jindex_di.no_addr + 3)) { + if (!query( _("The gfs system statfs inode pointer is incorrect. " + "Okay to correct? (y/n) "))) { + log_err( _("fsck.gfs2 cannot continue without a valid " + "statfs file; aborting.\n")); + goto fail; + } + sbd1->sb_license_di.no_addr = sbd1->sb_license_di.no_formal_ino + = sbd1->sb_jindex_di.no_addr + 3; + } + + sdp->md.statfs = lgfs2_inode_read(sdp, sbd1->sb_license_di.no_addr); + if (sdp->md.statfs == NULL) { + log_crit(_("Error reading statfs inode: %s\n"), strerror(errno)); + goto fail; + } + } else + gfs2_lookupi(sdp->master_dir, "statfs", 6, &sdp->md.statfs); + if (!sdp->gfs1 && !sdp->md.statfs) { + if (!query( _("The gfs2 system statfs inode is missing. " + "Okay to rebuild it? 
(y/n) "))) { + log_err( _("fsck.gfs2 cannot continue without a valid " + "statfs file; aborting.\n")); + goto fail; + } + err = build_statfs(sdp); + if (err) { + log_crit(_("Error %d rebuilding statfs inode\n"), err); + exit(FSCK_ERROR); + } + gfs2_lookupi(sdp->master_dir, "statfs", 6, &sdp->md.statfs); + if (!sdp->md.statfs) { + log_err( _("Rebuild of statfs system file failed.")); + log_err( _("fsck.gfs2 cannot continue without " + "a valid statfs file; aborting.\n")); + goto fail; + } + do_init_statfs(sdp); + } + if (sdp->md.statfs->i_di.di_size) { + buf = malloc(sdp->md.statfs->i_di.di_size); + if (buf) { + err = gfs2_readi(sdp->md.statfs, buf, 0, + sdp->md.statfs->i_di.di_size); + if (err != sdp->md.statfs->i_di.di_size) { + log_crit(_("Error %d reading statfs file. " + "Aborting.\n"), err); + free(buf); + goto fail; + } + /* call gfs2_inum_range_in() to retrieve range */ + gfs2_statfs_change_in(&sc, buf); + free(buf); + } + } + + if (sdp->gfs1) { + /* In gfs1, the quota_di is always 2 blocks after the jindex_di */ + if ((sbd1->sb_quota_di.no_addr != sbd1->sb_jindex_di.no_addr + 2) || + (sbd1->sb_quota_di.no_formal_ino != sbd1->sb_jindex_di.no_addr + 2)) { + if (!query( _("The gfs system quota inode pointer is incorrect. " + " Okay to correct? (y/n) "))) { + log_err( _("fsck.gfs2 cannot continue without a valid " + "quota file; aborting.\n")); + goto fail; + } + sbd1->sb_quota_di.no_addr = sbd1->sb_quota_di.no_formal_ino + = sbd1->sb_jindex_di.no_addr + 2; + } + + sdp->md.qinode = lgfs2_inode_read(sdp, sbd1->sb_quota_di.no_addr); + if (sdp->md.qinode == NULL) { + log_crit(_("Error reading quota inode: %s\n"), strerror(errno)); + goto fail; + } + } else + gfs2_lookupi(sdp->master_dir, "quota", 5, &sdp->md.qinode); + if (!sdp->gfs1 && !sdp->md.qinode) { + if (!query( _("The gfs2 system quota inode is missing. " + "Okay to rebuild it? (y/n) "))) { + log_crit(_("System quota inode was not " + "rebuilt. 
Aborting.\n")); + goto fail; + } + err = build_quota(sdp); + if (err) { + log_crit(_("Error %d rebuilding quota inode\n"), err); + exit(FSCK_ERROR); + } + gfs2_lookupi(sdp->master_dir, "quota", 5, &sdp->md.qinode); + if (!sdp->md.qinode) { + log_crit(_("Unable to rebuild system quota file " + "inode. Aborting.\n")); + goto fail; + } + } + + /* Try to lookup the per_node inode. If it was missing, it is now + safe to rebuild it. */ + if (!sdp->gfs1) + lookup_per_node(sdp, 1); + + /******************************************************************* + ******* Now, set boundary fields in the super block ************* + *******************************************************************/ + if (set_block_ranges(sdp)){ + log_err( _("Unable to determine the boundaries of the" + " file system.\n")); + goto fail; + } + + return 0; + fail: + empty_super_block(sdp); + + return -1; +} + +/** + * is_journal_copy - Is this a "real" dinode or a copy inside a journal? + * A real dinode will be located at the block number in its no_addr. + * A journal-copy will be at a different block (inside the journal). + */ +static int is_journal_copy(struct gfs2_inode *ip, struct gfs2_buffer_head *bh) +{ + if (ip->i_di.di_num.no_addr == bh->b_blocknr) + return 0; + return 1; /* journal copy */ +} + +/** + * peruse_system_dinode - process a system dinode + * + * This function looks at a system dinode and tries to figure out which + * dinode it is: statfs, inum, per_node, master, etc. Some of them we + * can deduce from the contents. For example, di_size will be a multiple + * of 96 for the rindex. di_size will be 8 for inum, 24 for statfs, etc. + * the per_node directory will have a ".." entry that will lead us to + * the master dinode if it's been destroyed. 
+ */
+static void peruse_system_dinode(struct gfs2_sbd *sdp, struct gfs2_dinode *di,
+				 struct gfs2_buffer_head *bh)
+{
+	/* child_ip starts as NULL so the "if (child_ip)" tests below never
+	   read an indeterminate pointer should a lookup leave it untouched. */
+	struct gfs2_inode *ip, *child_ip = NULL;
+	struct gfs2_inum inum;
+	int error;
+
+	/* Formal inode number 2 is taken to be the master directory. */
+	if (di->di_num.no_formal_ino == 2) {
+		if (sdp->sd_sb.sb_master_dir.no_addr)
+			return;
+		log_warn(_("Found system master directory at: 0x%llx.\n"),
+			 (unsigned long long)di->di_num.no_addr);
+		sdp->sd_sb.sb_master_dir.no_addr = di->di_num.no_addr;
+		return;
+	}
+	ip = lgfs2_inode_read(sdp, di->di_num.no_addr);
+	if (ip == NULL) {
+		log_crit(_("Error reading inode: %s\n"), strerror(errno));
+		return;
+	}
+	/* From here on, ip is either remembered in fix_md or released at
+	   out_discard_ip. */
+	if ((!sdp->gfs1 && di->di_num.no_formal_ino == 3) ||
+	    (sdp->gfs1 && (di->di_flags & GFS2_DIF_JDATA) &&
+	     (di->di_size % sizeof(struct gfs_jindex) == 0))) {
+		if (fix_md.jiinode || is_journal_copy(ip, bh))
+			goto out_discard_ip;
+		log_warn(_("Found system jindex file at: 0x%llx\n"),
+			 (unsigned long long)di->di_num.no_addr);
+		fix_md.jiinode = ip;
+	} else if (!sdp->gfs1 && is_dir(di, sdp->gfs1)) {
+		/* Check for a jindex dir entry. Only one system dir has a
+		   jindex: master */
+		gfs2_lookupi(ip, "jindex", 6, &child_ip);
+		if (child_ip) {
+			if (fix_md.jiinode || is_journal_copy(ip, bh)) {
+				inode_put(&child_ip);
+				goto out_discard_ip;
+			}
+			fix_md.jiinode = child_ip;
+			sdp->sd_sb.sb_master_dir.no_addr = di->di_num.no_addr;
+			log_warn(_("Found system master directory at: "
+				   "0x%llx\n"),
+				 (unsigned long long)di->di_num.no_addr);
+			/* Only the jindex child is kept; the directory inode
+			   itself is not stored anywhere, so release it rather
+			   than leaking it. */
+			inode_put(&ip);
+			return;
+		}
+
+		/* Check for a statfs_change0 dir entry. Only one system dir
+		   has a statfs_change: per_node, and its .. will be master. */
+		gfs2_lookupi(ip, "statfs_change0", 14, &child_ip);
+		if (child_ip) {
+			inode_put(&child_ip);
+			if (fix_md.pinode || is_journal_copy(ip, bh))
+				goto out_discard_ip;
+			log_warn(_("Found system per_node directory at: "
+				   "0x%llx\n"),
+				 (unsigned long long)ip->i_di.di_num.no_addr);
+			fix_md.pinode = ip;
+			error = dir_search(ip, "..", 2, NULL, &inum);
+			if (!error && inum.no_addr) {
+				sdp->sd_sb.sb_master_dir.no_addr =
+					inum.no_addr;
+				log_warn(_("From per_node\'s \'..\' I "
+					   "backtracked the master directory "
+					   "to: 0x%llx\n"),
+					 (unsigned long long)inum.no_addr);
+			}
+			return;
+		}
+		log_debug(_("Unknown system directory at block 0x%llx\n"),
+			  (unsigned long long)di->di_num.no_addr);
+		goto out_discard_ip;
+	} else if (!sdp->gfs1 && di->di_size == 8) {
+		if (fix_md.inum || is_journal_copy(ip, bh))
+			goto out_discard_ip;
+		fix_md.inum = ip;
+		log_warn(_("Found system inum file at: 0x%llx\n"),
+			 (unsigned long long)di->di_num.no_addr);
+	} else if (di->di_size == 24) {
+		if (fix_md.statfs || is_journal_copy(ip, bh))
+			goto out_discard_ip;
+		fix_md.statfs = ip;
+		log_warn(_("Found system statfs file at: 0x%llx\n"),
+			 (unsigned long long)di->di_num.no_addr);
+	} else if ((di->di_size % 96) == 0) {
+		if (fix_md.riinode || is_journal_copy(ip, bh))
+			goto out_discard_ip;
+		fix_md.riinode = ip;
+		log_warn(_("Found system rindex file at: 0x%llx\n"),
+			 (unsigned long long)di->di_num.no_addr);
+	} else if (!fix_md.qinode && di->di_size >= 176 &&
+		   di->di_num.no_formal_ino >= 12 &&
+		   di->di_num.no_formal_ino <= 100) {
+		if (is_journal_copy(ip, bh))
+			goto out_discard_ip;
+		fix_md.qinode = ip;
+		log_warn(_("Found system quota file at: 0x%llx\n"),
+			 (unsigned long long)di->di_num.no_addr);
+	} else {
+		/* Not a dinode we recognise */
+		goto out_discard_ip;
+	}
+	/* ip is now referenced from fix_md: keep it. */
+	return;
+
+out_discard_ip:
+	inode_put(&ip);
+}
+
+/**
+ * peruse_user_dinode - process a user dinode trying to find the root directory
+ *
+ * Does nothing once sb_root_dir.no_addr is known or when @di is not a
+ * directory. Otherwise it either recognises the root by formal inode
+ * number 1, or follows ".." entries upward until a directory is its own
+ * parent. If the walk dead-ends, the parent block named by the last
+ * directory's ".." entry is remembered in possible_root when it is lower
+ * than the current candidate.
+ */
+static void peruse_user_dinode(struct gfs2_sbd *sdp, struct gfs2_dinode *di,
+			       struct gfs2_buffer_head *bh)
+{
+	struct gfs2_inode *ip, *parent_ip = NULL;
+	struct gfs2_inum inum;
+	int error;
+
+	if (sdp->sd_sb.sb_root_dir.no_addr) /* if we know the root dinode */
+		return; /* we don't need to find the root */
+	if (!is_dir(di, sdp->gfs1)) /* if this isn't a directory */
+		return; /* it can't lead us to the root anyway */
+
+	if (di->di_num.no_formal_ino == 1) {
+		struct gfs2_buffer_head *root_bh;
+
+		if (di->di_num.no_addr == bh->b_blocknr) {
+			log_warn(_("Found the root directory at: 0x%llx.\n"),
+				 (unsigned long long)di->di_num.no_addr);
+			sdp->sd_sb.sb_root_dir.no_addr = di->di_num.no_addr;
+			return;
+		}
+		/* The dinode claims another block address, so this buffer is
+		   a copy (see is_journal_copy) of the root dinode. */
+		log_warn(_("The root dinode should be at block 0x%llx but it "
+			   "seems to be destroyed.\n"),
+			 (unsigned long long)di->di_num.no_addr);
+		log_warn(_("Found a copy of the root directory in a journal "
+			   "at block: 0x%llx.\n"),
+			 (unsigned long long)bh->b_blocknr);
+		if (!query(_("Do you want to replace the root dinode from the "
+			     "copy? (y/n)"))) {
+			log_err(_("Damaged root dinode not fixed.\n"));
+			return;
+		}
+		root_bh = bread(sdp, di->di_num.no_addr);
+		memcpy(root_bh->b_data, bh->b_data, sdp->bsize);
+		bmodified(root_bh);
+		brelse(root_bh);
+		log_warn(_("Root directory copied from the journal.\n"));
+		return;
+	}
+	ip = lgfs2_inode_read(sdp, di->di_num.no_addr);
+	if (ip == NULL) {
+		log_crit(_("Error reading inode: %s\n"), strerror(errno));
+		return;
+	}
+	/* NOTE(review): this walk has no hop limit, so a corrupt pair of
+	   directories whose ".." entries point at each other would loop
+	   forever - confirm whether a bound is wanted here. */
+	while (ip) {
+		/* Cleared first so a stale pointer from the previous pass can
+		   never be mistaken for a fresh lookup result. */
+		parent_ip = NULL;
+		gfs2_lookupi(ip, "..", 2, &parent_ip);
+		if (parent_ip && parent_ip->i_di.di_num.no_addr ==
+		    ip->i_di.di_num.no_addr) {
+			log_warn(_("Found the root directory at: 0x%llx\n"),
+				 (unsigned long long)ip->i_di.di_num.no_addr);
+			sdp->sd_sb.sb_root_dir.no_addr =
+				ip->i_di.di_num.no_addr;
+			inode_put(&parent_ip);
+			inode_put(&ip);
+			return;
+		}
+		if (!parent_ip)
+			break;
+		inode_put(&ip);
+		ip = parent_ip;
+	}
+	error = dir_search(ip, "..", 2, NULL, &inum);
+	if (!error && inum.no_addr && inum.no_addr < possible_root) {
+		possible_root = inum.no_addr;
+		log_debug(_("Found a possible root at: 0x%llx\n"),
+			  (unsigned long long)possible_root);
+	}
+	inode_put(&ip);
+}
+
+/**
+ * find_rgs_for_bsize - check a range of blocks for rgrps to determine bsize.
+ * Assumes: device is open.
+ */
+static int find_rgs_for_bsize(struct gfs2_sbd *sdp, uint64_t startblock,
+			      uint32_t *known_bsize)
+{
+	/*
+	 * Scans up to 524288 blocks of GFS2_DEFAULT_BSIZE bytes, starting
+	 * at startblock, for a resource group header on any
+	 * GFS2_BASIC_BLOCK boundary. For each hit, every candidate block
+	 * size is tried until the block following the header reads back
+	 * as a GFS2_METATYPE_RB block. The first size that fits is stored
+	 * in *known_bsize and in sdp->bsize.
+	 *
+	 * Always returns 0; callers detect failure by *known_bsize still
+	 * being 0. sdp->bsize is changed temporarily while probing.
+	 */
+	uint64_t blk, max_rg_size, rb_addr;
+	struct gfs2_buffer_head *bh, *rb_bh;
+	uint32_t bsize, bsize2;
+	uint32_t chk;
+	char *p;
+	int found_rg;
+	struct gfs2_meta_header mh;
+
+	sdp->bsize = GFS2_DEFAULT_BSIZE;
+	max_rg_size = 524288;
+	/* Max RG size is 2GB. Max block size is 4K. 2G / 4K blks = 524288,
+	   So this is traversing 2GB in 4K block increments. */
+	for (blk = startblock; blk < startblock + max_rg_size; blk++) {
+		bh = bread(sdp, blk);
+		found_rg = 0;
+		/* bsize doubles as the byte offset of the candidate header
+		   inside this 4K buffer. */
+		for (bsize = 0; bsize < GFS2_DEFAULT_BSIZE;
+		     bsize += GFS2_BASIC_BLOCK) {
+			p = bh->b_data + bsize;
+			chk = ((struct gfs2_meta_header *)p)->mh_magic;
+			if (be32_to_cpu(chk) != GFS2_MAGIC)
+				continue;
+			chk = ((struct gfs2_meta_header *)p)->mh_type;
+			if (be32_to_cpu(chk) == GFS2_METATYPE_RG) {
+				found_rg = 1;
+				break;
+			}
+		}
+		if (!found_rg) {
+			/* Release the buffer before moving on; skipping this
+			   leaked one buffer for every block scanned that
+			   held no rgrp header. */
+			brelse(bh);
+			continue;
+		}
+		/* Try all the block sizes in 512 byte multiples */
+		for (bsize2 = GFS2_BASIC_BLOCK; bsize2 <= GFS2_DEFAULT_BSIZE;
+		     bsize2 += GFS2_BASIC_BLOCK) {
+			/* Address, in bsize2-sized blocks, of the block right
+			   after the rgrp header that was found. */
+			rb_addr = (bh->b_blocknr *
+				   (GFS2_DEFAULT_BSIZE / bsize2)) +
+				(bsize / bsize2) + 1;
+			sdp->bsize = bsize2; /* temporarily */
+			rb_bh = bread(sdp, rb_addr);
+			gfs2_meta_header_in(&mh, rb_bh->b_data);
+			brelse(rb_bh);
+			if (mh.mh_magic == GFS2_MAGIC &&
+			    mh.mh_type == GFS2_METATYPE_RB) {
+				log_debug(_("boff:%d bsize2:%d rg:0x%llx, "
+					    "rb:0x%llx\n"), bsize, bsize2,
+					  (unsigned long long)blk,
+					  (unsigned long long)rb_addr);
+				*known_bsize = bsize2;
+				break;
+			}
+		}
+		brelse(bh);
+		if (!(*known_bsize)) {
+			/* No block size fit this header; restore the default
+			   and keep scanning. */
+			sdp->bsize = GFS2_DEFAULT_BSIZE;
+			continue;
+		}
+
+		sdp->bsize = *known_bsize;
+		log_warn(_("Block size determined to be: %d\n"), *known_bsize);
+		return 0;
+	}
+	return 0;
+}
+
+/**
+ * peruse_metadata - check a range of blocks for metadata
+ * Assumes: device is open.
+ */
+static int peruse_metadata(struct gfs2_sbd *sdp, uint64_t startblock)
+{
+	/*
+	 * Reads every block in the 2GB window starting at startblock and
+	 * hands each dinode block to peruse_system_dinode() or
+	 * peruse_user_dinode(), depending on GFS2_DIF_SYSTEM.
+	 * Always returns 0.
+	 */
+	uint64_t blk, max_rg_size;
+	struct gfs2_buffer_head *bh;
+	struct gfs2_dinode di;
+
+	max_rg_size = 2147483648ull / sdp->bsize;
+	/* Max RG size is 2GB. 2G / bsize. */
+	for (blk = startblock; blk < startblock + max_rg_size; blk++) {
+		bh = bread(sdp, blk);
+		/* Non-zero means this block is not a dinode: skip it. */
+		if (gfs2_check_meta(bh, GFS2_METATYPE_DI)) {
+			brelse(bh);
+			continue;
+		}
+		gfs2_dinode_in(&di, bh->b_data);
+		if (di.di_flags & GFS2_DIF_SYSTEM)
+			peruse_system_dinode(sdp, &di, bh);
+		else
+			peruse_user_dinode(sdp, &di, bh);
+		brelse(bh);
+	}
+	return 0;
+}
+
+/**
+ * sb_repair - repair a damaged superblock
+ * Assumes: device is open.
+ * The biggest RG size is 2GB
+ *
+ * Step 1 determines the block size, step 2 scans for the system and root
+ * dinodes, and step 3 (with the user's consent) writes a new superblock
+ * with the default lock protocol and the lock table name "unknown".
+ *
+ * Returns: 0 if a new superblock was written, non-zero otherwise
+ */
+static int sb_repair(struct gfs2_sbd *sdp)
+{
+	uint64_t half;
+	uint32_t known_bsize = 0;
+	int error = 0;
+
+	memset(&fix_md, 0, sizeof(fix_md));
+	/* Step 1 - First we need to determine the correct block size. */
+	sdp->bsize = GFS2_DEFAULT_BSIZE;
+	log_warn(_("Gathering information to repair the gfs2 superblock. "
+		   "This may take some time.\n"));
+	error = find_rgs_for_bsize(sdp, (GFS2_SB_ADDR * GFS2_BASIC_BLOCK) /
+				   GFS2_DEFAULT_BSIZE, &known_bsize);
+	if (error)
+		return error;
+	if (!known_bsize) {
+		log_warn(_("Block size not apparent; checking elsewhere.\n"));
+		/* First, figure out the device size. We need that so we can
+		   find a suitable start point to determine what's what. */
+		half = sdp->dinfo.size / 2; /* in bytes */
+		half /= sdp->bsize;
+		/* Start looking halfway through the device for gfs2
+		   structures. If there aren't any at all, forget it. */
+		error = find_rgs_for_bsize(sdp, half, &known_bsize);
+		if (error)
+			return error;
+	}
+	if (!known_bsize) {
+		log_err(_("Unable to determine the block size; this "
+			  "does not look like a gfs2 file system.\n"));
+		return -1;
+	}
+	/* Step 2 - look for the system dinodes */
+	error = peruse_metadata(sdp, (GFS2_SB_ADDR * GFS2_BASIC_BLOCK) /
+				GFS2_DEFAULT_BSIZE);
+	if (error)
+		return error;
+	if (!sdp->sd_sb.sb_master_dir.no_addr) {
+		log_err(_("Unable to locate the system master directory.\n"));
+		return -1;
+	}
+	if (!sdp->sd_sb.sb_root_dir.no_addr) {
+		struct gfs2_inum inum;
+
+		log_err(_("Unable to locate the root directory.\n"));
+		if (possible_root == HIGHEST_BLOCK) {
+			/* Take advantage of the fact that mkfs.gfs2
+			   creates master immediately after root. */
+			log_err(_("Can't find any dinodes that might "
+				  "be the root; using master - 1.\n"));
+			possible_root = sdp->sd_sb.sb_master_dir.no_addr - 1;
+		}
+		log_err(_("Found a possible root at: 0x%llx\n"),
+			(unsigned long long)possible_root);
+		sdp->sd_sb.sb_root_dir.no_addr = possible_root;
+		sdp->md.rooti = lgfs2_inode_read(sdp, possible_root);
+		if (!sdp->md.rooti ||
+		    sdp->md.rooti->i_di.di_header.mh_magic != GFS2_MAGIC) {
+			struct gfs2_buffer_head *bh = NULL;
+
+			log_err(_("The root dinode block is destroyed.\n"));
+			log_err(_("At this point I recommend "
+				  "reinitializing it.\n"
+				  "Hopefully everything will later "
+				  "be put into lost+found.\n"));
+			if (!query(_("Okay to reinitialize the root "
+				     "dinode? (y/n)"))) {
+				log_err(_("The root dinode was not "
+					  "reinitialized; aborting.\n"));
+				return -1;
+			}
+			inum.no_formal_ino = 1;
+			inum.no_addr = possible_root;
+			error = init_dinode(sdp, &bh, &inum, S_IFDIR | 0755, 0, &inum);
+			if (error != 0)
+				return -1;
+			brelse(bh);
+		}
+	}
+	/* Step 3 - Rebuild the lock protocol and file system table name */
+	if (query(_("Okay to fix the GFS2 superblock? (y/n)"))) {
+		struct gfs2_sb sb;
+		log_info(_("Found system master directory at: 0x%llx\n"),
+			 (unsigned long long)sdp->sd_sb.sb_master_dir.no_addr);
+		sdp->master_dir = lgfs2_inode_read(sdp,
+				    sdp->sd_sb.sb_master_dir.no_addr);
+		if (sdp->master_dir == NULL) {
+			log_crit(_("Error reading master inode: %s\n"), strerror(errno));
+			return -1;
+		}
+		sdp->master_dir->i_di.di_num.no_addr =
+			sdp->sd_sb.sb_master_dir.no_addr;
+		log_info(_("Found the root directory at: 0x%llx\n"),
+			 (unsigned long long)sdp->sd_sb.sb_root_dir.no_addr);
+		/* Step 2 may already have read a root inode (possibly from a
+		   block that was re-initialised afterwards). Release that
+		   copy before reading the dinode again, so the old inode is
+		   not leaked when the pointer is overwritten. */
+		if (sdp->md.rooti)
+			inode_put(&sdp->md.rooti);
+		sdp->md.rooti = lgfs2_inode_read(sdp,
+				    sdp->sd_sb.sb_root_dir.no_addr);
+		if (sdp->md.rooti == NULL) {
+			log_crit(_("Error reading root inode: %s\n"), strerror(errno));
+			return -1;
+		}
+		lgfs2_sb_init(&sb, sdp->bsize);
+		strcpy(sb.sb_lockproto, GFS2_DEFAULT_LOCKPROTO);
+		strcpy(sb.sb_locktable, "unknown");
+		sb.sb_master_dir = sdp->master_dir->i_di.di_num;
+		sb.sb_root_dir = sdp->md.rooti->i_di.di_num;
+		lgfs2_sb_write(&sb, sdp->device_fd, sdp->bsize);
+		inode_put(&sdp->md.rooti);
+		inode_put(&sdp->master_dir);
+		sb_fixed = 1;
+	} else {
+		log_crit(_("GFS2 superblock not fixed; fsck cannot proceed "
+			   "without a valid superblock.\n"));
+		return -1;
+	}
+	return 0;
+}
+
+/**
+ * fill_super_block - read the on-disk superblock, repairing it if necessary
+ * @sdp: the superblock descriptor
+ *
+ * When read_sb() fails, sb_repair() is attempted once and the superblock is
+ * read again. For a gfs1 file system the global sbd1 is pointed at
+ * sdp->sd_sb.
+ *
+ * Returns: 0 on success, -1 on failure
+ */
+static int fill_super_block(struct gfs2_sbd *sdp)
+{
+	int ret;
+
+	sync();
+
+	/********************************************************************
+	 ***************** First, initialize all lists **********************
+	 ********************************************************************/
+	log_info( _("Initializing lists...\n"));
+	sdp->rgtree.osi_node = NULL;
+
+	/********************************************************************
+	 ************ next, read in on-disk SB and set constants **********
+	 ********************************************************************/
+	sdp->sd_sb.sb_bsize = GFS2_DEFAULT_BSIZE;
+	sdp->bsize = sdp->sd_sb.sb_bsize;
+
+	if (sizeof(struct gfs2_sb) > sdp->sd_sb.sb_bsize){
+		log_crit( _("GFS superblock is larger than the blocksize!\n"));
+		log_debug("sizeof(struct gfs2_sb) > sdp->sd_sb.sb_bsize\n");
+		return -1;
+	}
+
+	if (compute_constants(sdp)) {
+		log_crit("%s\n", _("Failed to compute file system constants"));
+		exit(FSCK_ERROR);
+	}
+	ret = read_sb(sdp);
+	if (ret < 0) {
+		if (sb_repair(sdp) != 0)
+			return -1; /* unrepairable, so exit */
+		/* Now that we've tried to repair it, re-read it. */
+		ret = read_sb(sdp);
+		if (ret < 0)
+			return -1;
+	}
+	if (sdp->gfs1)
+		sbd1 = (struct gfs_sb *)&sdp->sd_sb;
+	return 0;
+}
+
+/*
+ * gfs_log_header_out - store a gfs1 log header into a buffer
+ * @head: the log header, in CPU byte order
+ * @buf: destination; must have room for a struct gfs_log_header
+ *
+ * Each field is converted with cpu_to_be32()/cpu_to_be64() as it is stored.
+ */
+static void gfs_log_header_out(struct gfs_log_header *head, char *buf)
+{
+	struct gfs_log_header *str = (struct gfs_log_header *) buf;
+
+	str->lh_header.mh_magic = cpu_to_be32(head->lh_header.mh_magic);
+	str->lh_header.mh_type = cpu_to_be32(head->lh_header.mh_type);
+	str->lh_header.mh_format = cpu_to_be32(head->lh_header.mh_format);
+	str->lh_header.__pad0 = cpu_to_be32(head->lh_header.__pad0);
+
+	str->lh_flags = cpu_to_be32(head->lh_flags);
+	str->lh_pad = cpu_to_be32(head->lh_pad);
+	str->lh_first = cpu_to_be64(head->lh_first);
+	str->lh_sequence = cpu_to_be64(head->lh_sequence);
+	str->lh_tail = cpu_to_be64(head->lh_tail);
+	str->lh_last_dump = cpu_to_be64(head->lh_last_dump);
+}
+
+/*
+ * reconstruct_single_journal - write a fresh GFS1 journal
+ * @sdp: superblock
+ * @jnum: journal number
+ *
+ * This function will write a fresh journal over the top of
+ * the previous journal. All journal information is lost. This
+ * process is basically stolen from write_journals() in the mkfs code.
+ *
+ * Returns: -1 on error, 0 otherwise
+ */
+static int reconstruct_single_journal(struct gfs2_sbd *sdp, int jnum,
+				      uint32_t ji_nsegment)
+{
+	struct gfs_log_header lh;
+	uint32_t seg, sequence;
+	struct gfs2_buffer_head *bh;
+
+	/* Start the sequence numbers at a random segment index. */
+	srandom(time(NULL));
+	sequence = ji_nsegment / (RAND_MAX + 1.0) * random();
+
+	log_info(_("Clearing journal %d\n"), jnum);
+
+	for (seg = 0; seg < ji_nsegment; seg++){
+		memset(&lh, 0, sizeof(struct gfs_log_header));
+
+		lh.lh_header.mh_magic = GFS2_MAGIC;
+		lh.lh_header.mh_type = GFS2_METATYPE_LH;
+		lh.lh_header.mh_format = GFS2_FORMAT_LH;
+		lh.lh_header.__pad0 = 0x101674; /* mh_generation */
+		lh.lh_flags = GFS2_LOG_HEAD_UNMOUNT;
+		lh.lh_first = sdp->md.journal[jnum]->i_di.di_num.no_addr +
+			(seg * sbd1->sb_seg_size);
+		lh.lh_sequence = sequence;
+
+		/* NOTE(review): unlike peruse_user_dinode(), this buffer is
+		   never passed to bmodified() before brelse(), and bget() is
+		   given lh_first * bsize where other callers in this file
+		   pass block numbers to bread(). Confirm both against
+		   libgfs2 before changing anything here. */
+		bh = bget(sdp, lh.lh_first * sdp->bsize);
+		memset(bh->b_data, 0, sdp->bsize);
+		/* The header is stored twice: at the start of the block and
+		   at the end of its first GFS2_BASIC_BLOCK bytes. */
+		gfs_log_header_out(&lh, bh->b_data);
+		gfs_log_header_out(&lh, bh->b_data + GFS2_BASIC_BLOCK -
+				   sizeof(struct gfs_log_header));
+		brelse(bh);
+
+		/* Sequence numbers wrap around at the segment count. */
+		if (++sequence == ji_nsegment)
+			sequence = 0;
+	}
+	return 0;
+}
+
+/*
+ * reset_journal_seg_size - recompute sb_seg_size and offer to reset it
+ * @jsize: journal size in bytes
+ * @nsegs: number of segments in the journal
+ * @bsize: file system block size in bytes
+ *
+ * The segment size is jsize / (nsegs * bsize), falling back to 16 when
+ * that works out to 0 or cannot be computed.
+ *
+ * Returns: 0 if the value was already correct or the user accepted the
+ *          new one, -1 if the user refused
+ */
+static int reset_journal_seg_size(unsigned int jsize, unsigned int nsegs,
+				  unsigned int bsize)
+{
+	unsigned int divisor = nsegs * bsize;
+	unsigned int seg_size = 0;
+
+	/* A damaged jindex entry or superblock can make nsegs or bsize 0
+	   (or make the product wrap to 0); dividing by that would crash
+	   fsck, so treat it like any other uncomputable value. */
+	if (divisor)
+		seg_size = jsize / divisor;
+	if (!seg_size)
+		seg_size = 16; /* The default with 128MB journal and 4K bsize */
+	if (seg_size != sbd1->sb_seg_size) {
+		/* The new value is stored before the user is asked; a refusal
+		   returns -1 and the caller gives up. */
+		sbd1->sb_seg_size = seg_size;
+		if (!query(_("Computed correct journal segment size to %u."
+			     " Reset it? (y/n) "), seg_size)) {
+			log_crit(_("Error: Cannot proceed without a valid journal"
+				   " segment size value.\n"));
+			return -1;
+		}
+		log_err(_("Resetting journal segment size to %u\n"), sbd1->sb_seg_size);
+	}
+	return 0;
+}
+
+/*
+ * correct_journal_seg_size - make sure the gfs1 sb_seg_size is usable
+ * @sdp: the superblock descriptor
+ *
+ * With two or more journals the journal size is derived from the distance
+ * between the first two jindex entries. With a single journal the size
+ * cannot be derived: a non-zero sb_seg_size is left alone and a zero one
+ * is replaced (with consent) using the default journal size.
+ *
+ * Returns: 0 on success, -1 on failure
+ */
+static int correct_journal_seg_size(struct gfs2_sbd *sdp)
+{
+	int count;
+	struct gfs_jindex ji_0, ji_1;
+	char buf[sizeof(struct gfs_jindex)];
+	unsigned int jsize = GFS2_DEFAULT_JSIZE * 1024 * 1024;
+
+	count = gfs2_readi(sdp->md.jiinode, buf, 0, sizeof(struct gfs_jindex));
+	if (count != sizeof(struct gfs_jindex)) {
+		log_crit(_("Error %d reading system journal index inode. "
+			   "Aborting\n"), count);
+		return -1;
+	}
+	gfs_jindex_in(&ji_0, buf);
+
+	if (sdp->md.journals == 1) {
+		if (sbd1->sb_seg_size == 0) {
+			if (!query(_("The gfs2 journal segment size is 0 and a"
+				     " correct value cannot be determined in a"
+				     " single-journal filesystem.\n"
+				     "Continue with default? (y/n) "))) {
+				log_crit(_("Error: Cannot proceed without a valid"
+					   " sb_seg_size value.\n"));
+				return -1;
+			}
+			goto out;
+		}
+		/* Don't mess with sb_seg_size because we don't know what
+		 * it needs to be
+		 */
+		return 0;
+	}
+
+	count = gfs2_readi(sdp->md.jiinode, buf, sizeof(struct gfs_jindex),
+			   sizeof(struct gfs_jindex));
+	if (count != sizeof(struct gfs_jindex)) {
+		log_crit(_("Error %d reading system journal index inode. "
+			   "Aborting\n"), count);
+		return -1;
+	}
+	gfs_jindex_in(&ji_1, buf);
+
+	jsize = (ji_1.ji_addr - ji_0.ji_addr) * sbd1->sb_bsize;
+out:
+	return reset_journal_seg_size(jsize, ji_0.ji_nsegment, sbd1->sb_bsize);
+}
+
+/*
+ * reconstruct_journals - write fresh journals for GFS1 only
+ * sdp: the super block
+ *
+ * Returns: 0 on success, -1 on failure
+ */
+static int reconstruct_journals(struct gfs2_sbd *sdp)
+{
+	int i, count;
+	struct gfs_jindex ji;
+	char buf[sizeof(struct gfs_jindex)];
+
+	/* Ensure that sb_seg_size is valid */
+	if (correct_journal_seg_size(sdp)) {
+		log_crit(_("Failed to set correct journal segment size. Cannot continue\n"));
+		return -1;
+	}
+
+	log_err(_("Clearing GFS journals (this may take a while)\n"));
+	for (i = 0; i < sdp->md.journals; i++) {
+		count = gfs2_readi(sdp->md.jiinode, buf,
+				   i * sizeof(struct gfs_jindex),
+				   sizeof(struct gfs_jindex));
+		/* A short read ends the pass early and still reports
+		   success; the remaining journals are left untouched. */
+		if (count != sizeof(struct gfs_jindex))
+			return 0;
+		gfs_jindex_in(&ji, buf);
+		if ((i % 2) == 0)
+			log_err(".");
+		if (reconstruct_single_journal(sdp, i, ji.ji_nsegment))
+			return -1;
+	}
+	log_err(_("\nJournals cleared.\n"));
+	return 0;
+}
+
+/**
+ * init_rindex - read in the rindex file
+ *
+ * If the rindex inode cannot be found, the user is asked whether to
+ * rebuild it with build_rindex().
+ *
+ * Returns: 0 on success, -1 on failure
+ */
+static int init_rindex(struct gfs2_sbd *sdp)
+{
+	int err;
+
+	if (sdp->gfs1)
+		sdp->md.riinode = lgfs2_inode_read(sdp, sbd1->sb_rindex_di.no_addr);
+	else
+		gfs2_lookupi(sdp->master_dir, "rindex", 6, &sdp->md.riinode);
+
+	if (sdp->md.riinode)
+		return 0;
+
+	if (!query( _("The gfs2 system rindex inode is missing. "
+		      "Okay to rebuild it? (y/n) "))) {
+		log_crit(_("Error: Cannot proceed without a valid rindex.\n"));
+		return -1;
+	}
+	if ((err = build_rindex(sdp))) {
+		log_crit(_("Error %d rebuilding rindex\n"), err);
+		return -1;
+	}
+	return 0;
+}
+
+/**
+ * initialize - initialize superblock pointer
+ * @sdp: the superblock descriptor to fill in
+ * @force_check: passed through to preen_is_safe() and replay_journals()
+ * @preen: passed through to preen_is_safe() and replay_journals()
+ * @all_clean: set to 1 when every journal was found clean, else 0
+ *
+ * Opens the device, reads (or repairs) the superblock, blocks other
+ * mounters, then reads the master directory, rindex, resource groups,
+ * journals and the remaining system inodes.
+ *
+ * Returns: FSCK_OK on success, FSCK_USAGE or FSCK_ERROR on failure
+ */
+int initialize(struct gfs2_sbd *sdp, int force_check, int preen,
+	       int *all_clean)
+{
+	int clean_journals = 0, open_flag;
+
+	*all_clean = 0;
+
+	if (opts.no)
+		open_flag = O_RDONLY;
+	else
+		open_flag = O_RDWR | O_EXCL;
+
+	sdp->device_fd = open(opts.device, open_flag);
+	if (sdp->device_fd < 0) {
+		struct mntent *mnt;
+		if (open_flag == O_RDONLY || errno != EBUSY) {
+			log_crit( _("Unable to open device: %s\n"),
+				  opts.device);
+			return FSCK_USAGE;
+		}
+		/* We can't open it EXCL. It may be already open rw (in which
+		   case we want to deny them access) or it may be mounted as
+		   the root file system at boot time (in which case we need to
+		   allow it.)
+		   If the device is busy, but not because it's mounted, fail.
+		   This protects against cases where the file system is LVM
+		   and perhaps mounted on a different node.
+		   Try opening without O_EXCL. */
+		sdp->device_fd = lgfs2_open_mnt_dev(opts.device, O_RDWR, &mnt);
+		if (sdp->device_fd < 0)
+			goto mount_fail;
+		/* If the device is mounted, but not mounted RO, fail. This
+		   protects them against cases where the file system is
+		   mounted RW, but still allows us to check our own root
+		   file system. */
+		if (!hasmntopt(mnt, MNTOPT_RO))
+			goto close_fail;
+		/* The device is mounted RO, so it's likely our own root
+		   file system. We can only do so much to protect the users
+		   from themselves. */
+		was_mounted_ro = 1;
+	}
+
+	if (lgfs2_get_dev_info(sdp->device_fd, &sdp->dinfo)) {
+		perror(opts.device);
+		return FSCK_ERROR;
+	}
+
+	/* read in sb from disk */
+	if (fill_super_block(sdp))
+		return FSCK_ERROR;
+
+	/* Change lock protocol to be fsck_* instead of lock_* */
+	if (!opts.no && preen_is_safe(sdp, preen, force_check)) {
+		if (block_mounters(sdp, 1)) {
+			log_err( _("Unable to block other mounters\n"));
+			return FSCK_USAGE;
+		}
+	}
+
+	/* Get master dinode */
+	if (sdp->gfs1)
+		sdp->master_dir = NULL;
+	else
+		sdp->master_dir = lgfs2_inode_read(sdp,
+					   sdp->sd_sb.sb_master_dir.no_addr);
+	/* A failed read must be caught before the header checks below
+	   dereference the pointer. */
+	if (!sdp->gfs1 && sdp->master_dir == NULL) {
+		log_crit(_("Error reading master directory: %s\n"), strerror(errno));
+		return FSCK_ERROR;
+	}
+	if (!sdp->gfs1 &&
+	    (sdp->master_dir->i_di.di_header.mh_magic != GFS2_MAGIC ||
+	     sdp->master_dir->i_di.di_header.mh_type != GFS2_METATYPE_DI ||
+	     !sdp->master_dir->i_di.di_size)) {
+		/* What was read is not a usable dinode: rebuild the master
+		   directory and read it again. */
+		inode_put(&sdp->master_dir);
+		rebuild_master(sdp);
+		sdp->master_dir = lgfs2_inode_read(sdp,
+					   sdp->sd_sb.sb_master_dir.no_addr);
+		if (sdp->master_dir == NULL) {
+			log_crit(_("Error reading master directory: %s\n"), strerror(errno));
+			return FSCK_ERROR;
+		}
+	}
+
+	/* Look up the "per_node" inode. If there are journals missing, we
+	   need to figure out what's missing from per_node. And we need all
+	   our journals to be there before we can replay them. */
+	if (!sdp->gfs1)
+		lookup_per_node(sdp, 0);
+
+	/* We need rindex first in case jindex is missing and needs to read
+	   in the rgrps before rebuilding it. However, note that if the rindex
+	   is damaged, we need the journals to repair it. That's because the
+	   journals likely contain rgrps and bitmaps, which we need to ignore
+	   when we're trying to find the rgrps. */
+	if (init_rindex(sdp))
+		return FSCK_ERROR;
+
+	if (fetch_rgrps(sdp))
+		return FSCK_ERROR;
+
+	/* We need to read in jindex in order to replay the journals. If
+	   there's an error, we may proceed and let init_system_inodes
+	   try to rebuild it. */
+	if (init_jindex(sdp, 1) == 0) {
+		/* If GFS, rebuild the journals. If GFS2, replay them. We don't
+		   have the smarts to replay GFS1 journals (neither did
+		   gfs_fsck). */
+		if (sdp->gfs1) {
+			if (reconstruct_journals(sdp))
+				return FSCK_ERROR;
+		} else if (replay_journals(sdp, preen, force_check,
+					   &clean_journals)) {
+			if (!opts.no && preen_is_safe(sdp, preen, force_check))
+				block_mounters(sdp, 0);
+			stack;
+			return FSCK_ERROR;
+		}
+		if (sdp->md.journals == clean_journals)
+			*all_clean = 1;
+		else if (force_check || !preen)
+			log_notice( _("\nJournal recovery complete.\n"));
+
+		/* Preening a clean file system: nothing more to do. */
+		if (!force_check && *all_clean && preen)
+			return FSCK_OK;
+	}
+
+	if (init_system_inodes(sdp))
+		return FSCK_ERROR;
+
+	return FSCK_OK;
+
+close_fail:
+	close(sdp->device_fd);
+mount_fail:
+	log_crit( _("Device %s is busy.\n"), opts.device);
+	return FSCK_USAGE;
+}
+
+/*
+ * destroy - undo initialize(): unblock mounters, sync and close the device
+ * @sdp: the superblock descriptor
+ *
+ * If the device was mounted read-only and errors were corrected, "2" is
+ * also written to /proc/sys/vm/drop_caches; failure to do so is only
+ * warned about.
+ */
+void destroy(struct gfs2_sbd *sdp)
+{
+	if (!opts.no) {
+		if (block_mounters(sdp, 0)) {
+			log_warn( _("Unable to unblock other mounters - manual intervention required\n"));
+			log_warn( _("Use 'gfs2_tool sb proto' to fix\n"));
+		}
+		log_info( _("Syncing the device.\n"));
+		fsync(sdp->device_fd);
+	}
+	empty_super_block(sdp);
+	close(sdp->device_fd);
+	if (was_mounted_ro && errors_corrected) {
+		/* device_fd is reused for the drop_caches file from here on */
+		sdp->device_fd = open("/proc/sys/vm/drop_caches", O_WRONLY);
+		if (sdp->device_fd >= 0) {
+			/* One byte is written, so success is a return value
+			   of 1. Comparing against 2 could never match and
+			   made the warning below fire on every run. */
+			if (write(sdp->device_fd, "2", 1) == 1) {
+				close(sdp->device_fd);
+				return;
+			}
+			close(sdp->device_fd);
+		}
+		log_warn(_("fsck.gfs2: Could not flush caches (non-fatal).\n"));
+	}
+}
diff 
--git a/gfs2/fsck/inode_hash.c b/gfs2/fsck/inode_hash.c
new file mode 100644
index 0000000..09303d7
--- /dev/null
+++ b/gfs2/fsck/inode_hash.c
@@ -0,0 +1,67 @@
+#include "clusterautoconfig.h"
+
+#include
+#include
+#include
+#include
+
+#include
+#include "libgfs2.h"
+#include "osi_list.h"
+#include "inode_hash.h"
+#include "fsck.h"
+#define _(String) gettext(String)
+
+/*
+ * inodetree_find - look up the inode_info recorded for a block address
+ * @block: the dinode's block address (di_num.no_addr)
+ *
+ * Returns: the matching entry of inodetree, or NULL if there is none
+ */
+struct inode_info *inodetree_find(uint64_t block)
+{
+	struct osi_node *node = inodetree.osi_node;
+
+	while (node) {
+		/* NOTE(review): this cast presumes the osi_node is the first
+		   member of struct inode_info - confirm in the header. */
+		struct inode_info *data = (struct inode_info *)node;
+
+		if (block < data->di_num.no_addr)
+			node = node->osi_left;
+		else if (block > data->di_num.no_addr)
+			node = node->osi_right;
+		else
+			return data;
+	}
+	return NULL;
+}
+
+/*
+ * inodetree_insert - find or create the inode_info for an inode number
+ * @di_num: the inode number; entries are keyed on di_num.no_addr
+ *
+ * Returns: the existing entry when one already has this block address,
+ *          otherwise a new zero-filled entry linked into inodetree, or
+ *          NULL if memory could not be allocated
+ */
+struct inode_info *inodetree_insert(struct gfs2_inum di_num)
+{
+	struct osi_node **newn = &inodetree.osi_node, *parent = NULL;
+	struct inode_info *data;
+
+	/* Figure out where to put new node */
+	while (*newn) {
+		struct inode_info *cur = (struct inode_info *)*newn;
+
+		parent = *newn;
+		if (di_num.no_addr < cur->di_num.no_addr)
+			newn = &((*newn)->osi_left);
+		else if (di_num.no_addr > cur->di_num.no_addr)
+			newn = &((*newn)->osi_right);
+		else
+			return cur;
+	}
+
+	data = calloc(1, sizeof(struct inode_info));
+	if (!data) {
+		log_crit( _("Unable to allocate inode_info structure\n"));
+		return NULL;
+	}
+	/* Add new node and rebalance tree. */
+	data->di_num = di_num;
+	osi_link_node(&data->node, parent, newn);
+	osi_insert_color(&data->node, &inodetree);
+
+	return data;
+}
+
+/*
+ * inodetree_delete - unlink an entry from inodetree and free it
+ * @b: the entry to remove; it must not be used afterwards
+ */
+void inodetree_delete(struct inode_info *b)
+{
+	osi_erase(&b->node, &inodetree);
+	free(b);
+}
diff --git a/gfs2/fsck/inode_hash.h b/gfs2/fsck/inode_hash.h
new file mode 100644
index 0000000..ba18ab2
--- /dev/null
+++ b/gfs2/fsck/inode_hash.h
@@ -0,0 +1,10 @@
+#ifndef _INODE_HASH_H
+#define _INODE_HASH_H
+
+struct inode_info;
+
+extern struct inode_info *inodetree_find(uint64_t block);
+extern struct inode_info *inodetree_insert(struct gfs2_inum di_num);
+extern void inodetree_delete(struct inode_info *b);
+
+#endif /* _INODE_HASH_H */
diff --git a/gfs2/fsck/link.c b/gfs2/fsck/link.c
new file mode 100644
index 0000000..8ea09c7
--- /dev/null
+++ b/gfs2/fsck/link.c
@@ -0,0 +1,201 @@
+#include "clusterautoconfig.h"
+
+#include
+#include
+#include
+#include
+#include
+#include
+#define _(String) gettext(String)
+
+#include
+#include "libgfs2.h"
+#include "fsck.h"
+#include "inode_hash.h"
+#include "link.h"
+#include "util.h"
+
+struct gfs2_bmap nlink1map = { 0 }; /* map of dinodes with nlink == 1 */
+struct gfs2_bmap clink1map = { 0 }; /* map of dinodes w/counted links == 1 */
+
+/*
+ * link1_set - store the mark for one block in a link-count bitmap
+ * @bmap: the map to update (nlink1map or clink1map)
+ * @bblock: the block address
+ * @mark: new value; only the bits in BLOCKMAP_MASK1 are stored
+ *
+ * Returns: 0 on success or when @bmap is NULL, -1 if @bblock is greater
+ *          than bmap->size
+ */
+int link1_set(struct gfs2_bmap *bmap, uint64_t bblock, int mark)
+{
+	/* Plain automatic variables: both are assigned before every use, so
+	   keeping them static only added hidden shared state. */
+	unsigned char *byte;
+	uint64_t b;
+
+	if (!bmap)
+		return 0;
+	if (bblock > bmap->size)
+		return -1;
+
+	byte = bmap->map + BLOCKMAP_SIZE1(bblock);
+	b = BLOCKMAP_BYTE_OFFSET1(bblock);
+	/* Clear the old mark, then store the new one. */
+	*byte &= ~(BLOCKMAP_MASK1 << b);
+	*byte |= (mark & BLOCKMAP_MASK1) << b;
+	return 0;
+}
+
+/*
+ * set_di_nlink - record an inode's on-disk link count for later checking
+ * @ip: the inode
+ *
+ * Directories are recorded in their dir_info, inodes with exactly one link
+ * in nlink1map, and everything else in an inodetree entry (created when
+ * missing).
+ *
+ * Returns: 0 on success, -1 if the directory is not in the dir tree or the
+ *          inodetree entry could not be created
+ */
+int set_di_nlink(struct gfs2_inode *ip)
+{
+	struct inode_info *ii;
+	struct dir_info *di;
+
+	if (is_dir(&ip->i_di, ip->i_sbd->gfs1)) {
+		di = dirtree_find(ip->i_di.di_num.no_addr);
+		if (di == NULL) {
+			log_err(_("Error: directory %lld (0x%llx) is not "
+				  "in the dir_tree (set).\n"),
+				(unsigned long long)ip->i_di.di_num.no_addr,
+				(unsigned long long)ip->i_di.di_num.no_addr);
+			return -1;
+		}
+		di->di_nlink = ip->i_di.di_nlink;
+		return 0;
+	}
+	if (ip->i_di.di_nlink == 1) {
+		link1_set(&nlink1map, ip->i_di.di_num.no_addr, 1);
+		return 0;
+	}
+	/*log_debug( _("Setting link count to %u for %" PRIu64
+	  " (0x%" PRIx64 ")\n"), count, inode_no, inode_no);*/
+	/* If the list has entries, look for one that matches inode_no */
+	ii = inodetree_find(ip->i_di.di_num.no_addr);
+	if (!ii)
+		ii = inodetree_insert(ip->i_di.di_num);
+	if (ii)
+		ii->di_nlink = ip->i_di.di_nlink;
+	else
+		return -1;
+	return 0;
+}
+
+/* I'm making whyincr a macro rather than function so that the debug output
+ * matches older versions. */
+#define whyincr(no_addr, why, referenced_from, counted_links)		\
+	log_debug(_("Dir (0x%llx) incremented counted links to %u "	\
+		    "for (0x%llx) via %s\n"),				\
+		  (unsigned long long)referenced_from, counted_links,	\
+		  (unsigned long long)no_addr, why);
+
+int incr_link_count(struct gfs2_inum no, struct gfs2_inode *ip,
+		    const char *why)
+{
+	struct inode_info *ii = NULL;
+	uint64_t referenced_from = ip ? ip->i_di.di_num.no_addr : 0;
+	struct dir_info *di;
+	struct gfs2_inode *link_ip;
+
+	di = dirtree_find(no.no_addr);
+	if (di) {
+		if (di->dinode.no_formal_ino != no.no_formal_ino)
+			return incr_link_ino_mismatch;
+
+		di->counted_links++;
+		whyincr(no.no_addr, why, referenced_from, di->counted_links);
+		return incr_link_good;
+	}
+	ii = inodetree_find(no.no_addr);
+	/* If the list has entries, look for one that matches inode_no */
+	if (ii) {
+		if (ii->di_num.no_formal_ino != no.no_formal_ino)
+			return incr_link_ino_mismatch;
+
+		ii->counted_links++;
+		whyincr(no.no_addr, why, referenced_from, ii->counted_links);
+		return incr_link_good;
+	}
+	if (link1_type(&clink1map, no.no_addr) != 1) {
+		link1_set(&clink1map, no.no_addr, 1);
+		whyincr(no.no_addr, why, referenced_from, 1);
+		return incr_link_good;
+	}
+
+	link_ip = fsck_load_inode(ip->i_sbd, no.no_addr);
+	/* Check formal ino against dinode before adding to inode tree.
*/ + if (no.no_formal_ino != link_ip->i_di.di_num.no_formal_ino) { + fsck_inode_put(&link_ip); + return incr_link_ino_mismatch; /* inode mismatch */ + } + /* Move it from the link1 maps to a real inode tree entry */ + link1_set(&nlink1map, no.no_addr, 0); + link1_set(&clink1map, no.no_addr, 0); + + /* If no match was found, it must be a hard link. In theory, it can't + be a duplicate because those were resolved in pass1b. Add a new + inodetree entry and set its counted links to 2 */ + ii = inodetree_insert(no); + if (!ii) { + log_debug( _("Ref: (0x%llx) Error incrementing link for " + "(0x%llx)!\n"), + (unsigned long long)referenced_from, + (unsigned long long)no.no_addr); + fsck_inode_put(&link_ip); + return incr_link_bad; + } + ii->di_num = link_ip->i_di.di_num; + fsck_inode_put(&link_ip); + ii->di_nlink = 1; /* Must be 1 or it wouldn't have gotten into the + nlink1map */ + ii->counted_links = 2; + whyincr(no.no_addr, why, referenced_from, ii->counted_links); + /* We transitioned a dentry link count from 1 to 2, and we know it's + not a directory. But the new reference has the correct formal + inode number, so the first reference is suspect: we need to + check it in case it's a bad reference, and not just a hard link. 
*/ + return incr_link_check_orig; +} + +#define whydecr(no_addr, why, referenced_from, counted_links) \ + log_debug(_("Dir (0x%llx) decremented counted links to %u " \ + "for (0x%llx) via %s\n"), \ + (unsigned long long)referenced_from, counted_links, \ + (unsigned long long)no_addr, why); + +int decr_link_count(uint64_t inode_no, uint64_t referenced_from, int gfs1, + const char *why) +{ + struct inode_info *ii = NULL; + struct dir_info *di; + + di = dirtree_find(inode_no); + if (di) { + if (!di->counted_links) { + log_debug( _("Dir (0x%llx)'s link to " + "(0x%llx) via %s is zero!\n"), + (unsigned long long)referenced_from, + (unsigned long long)inode_no, why); + return 0; + } + di->counted_links--; + whydecr(inode_no, why, referenced_from, di->counted_links); + return 0; + } + + ii = inodetree_find(inode_no); + /* If the list has entries, look for one that matches + * inode_no */ + if (ii) { + if (!ii->counted_links) { + log_debug( _("Dir (0x%llx)'s link to " + "(0x%llx) via %s is zero!\n"), + (unsigned long long)referenced_from, + (unsigned long long)inode_no, why); + return 0; + } + ii->counted_links--; + whydecr(inode_no, why, referenced_from, ii->counted_links); + return 0; + } + if (link1_type(&clink1map, inode_no) == 1) { /* 1 -> 0 */ + link1_set(&clink1map, inode_no, 0); + whydecr(inode_no, why, referenced_from, 0); + return 0; + } + + log_debug( _("No match found when decrementing link for (0x%llx)!\n"), + (unsigned long long)inode_no); + return -1; + +} + + diff --git a/gfs2/fsck/link.h b/gfs2/fsck/link.h new file mode 100644 index 0000000..a5dd1c8 --- /dev/null +++ b/gfs2/fsck/link.h @@ -0,0 +1,21 @@ +#ifndef _LINK_H +#define _LINK_H + +extern struct gfs2_bmap nlink1map; /* map of dinodes with nlink == 1 */ +extern struct gfs2_bmap clink1map; /* map of dinodes w/counted links == 1 */ + +enum { + incr_link_bad = -1, + incr_link_good = 0, + incr_link_ino_mismatch = 1, + incr_link_check_orig = 2, +}; + +int link1_set(struct gfs2_bmap *bmap, uint64_t bblock, 
int mark); +int set_di_nlink(struct gfs2_inode *ip); +int incr_link_count(struct gfs2_inum no, struct gfs2_inode *ip, + const char *why); +int decr_link_count(uint64_t inode_no, uint64_t referenced_from, int gfs1, + const char *why); + +#endif /* _LINK_H */ diff --git a/gfs2/fsck/lost_n_found.c b/gfs2/fsck/lost_n_found.c new file mode 100644 index 0000000..4d5d52c --- /dev/null +++ b/gfs2/fsck/lost_n_found.c @@ -0,0 +1,259 @@ +#include "clusterautoconfig.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#define _(String) gettext(String) + +#include +#include "fsck.h" +#include "libgfs2.h" +#include "lost_n_found.h" +#include "link.h" +#include "metawalk.h" +#include "util.h" + +static void add_dotdot(struct gfs2_inode *ip) +{ + struct dir_info *di; + struct gfs2_sbd *sdp = ip->i_sbd; + int err; + + log_info( _("Adding .. entry to directory %llu (0x%llx) pointing back " + "to lost+found\n"), + (unsigned long long)ip->i_di.di_num.no_addr, + (unsigned long long)ip->i_di.di_num.no_addr); + + /* If there's a pre-existing .. directory entry, we have to + back out the links. */ + di = dirtree_find(ip->i_di.di_num.no_addr); + if (di && valid_block(sdp, di->dotdot_parent.no_addr)) { + struct gfs2_inode *dip; + + log_debug(_("Directory (0x%llx) already had a " + "\"..\" link to (0x%llx).\n"), + (unsigned long long)ip->i_di.di_num.no_addr, + (unsigned long long)di->dotdot_parent.no_addr); + dip = fsck_load_inode(sdp, di->dotdot_parent.no_addr); + if (dip->i_di.di_num.no_formal_ino == + di->dotdot_parent.no_formal_ino) { + decr_link_count(di->dotdot_parent.no_addr, + ip->i_di.di_num.no_addr, sdp->gfs1, + _(".. 
unlinked, moving to lost+found")); + if (dip->i_di.di_nlink > 0) { + dip->i_di.di_nlink--; + set_di_nlink(dip); /* keep inode tree in sync */ + log_debug(_("Decrementing its links to %d\n"), + dip->i_di.di_nlink); + bmodified(dip->i_bh); + } else if (!dip->i_di.di_nlink) { + log_debug(_("Its link count is zero.\n")); + } else { + log_debug(_("Its link count is %d! Changing " + "it to 0.\n"), dip->i_di.di_nlink); + dip->i_di.di_nlink = 0; + set_di_nlink(dip); /* keep inode tree in sync */ + bmodified(dip->i_bh); + } + } else { + log_debug(_("Directory (0x%llx)'s link to parent " + "(0x%llx) had a formal inode discrepancy: " + "was 0x%llx, expected 0x%llx\n"), + (unsigned long long)ip->i_di.di_num.no_addr, + (unsigned long long)di->dotdot_parent.no_addr, + di->dotdot_parent.no_formal_ino, + dip->i_di.di_num.no_formal_ino); + log_debug(_("The parent directory was not changed.\n")); + } + fsck_inode_put(&dip); + di = NULL; + } else { + if (di) + log_debug(_("Couldn't find a valid \"..\" entry " + "for orphan directory (0x%llx): " + "'..' = 0x%llx\n"), + (unsigned long long)ip->i_di.di_num.no_addr, + (unsigned long long)di->dotdot_parent.no_addr); + else + log_debug(_("Couldn't find directory (0x%llx) " + "in directory tree.\n"), + (unsigned long long)ip->i_di.di_num.no_addr); + } + if (gfs2_dirent_del(ip, "..", 2)) + log_warn( _("add_inode_to_lf: Unable to remove " + "\"..\" directory entry.\n")); + + err = dir_add(ip, "..", 2, &(lf_dip->i_di.di_num), + (sdp->gfs1 ? GFS_FILE_DIR : DT_DIR)); + if (err) { + log_crit(_("Error adding .. 
directory: %s\n"), + strerror(errno)); + exit(FSCK_ERROR); + } +} + +void make_sure_lf_exists(struct gfs2_inode *ip) +{ + struct dir_info *di; + struct gfs2_sbd *sdp = ip->i_sbd; + uint32_t mode; + int root_entries; + + if (lf_dip) + return; + + root_entries = sdp->md.rooti->i_di.di_entries; + log_info( _("Locating/Creating lost+found directory\n")); + + /* if this is gfs1, we have to trick createi into using + no_formal_ino = no_addr, so we set next_inum to the + free block we're about to allocate. */ + if (sdp->gfs1) + sdp->md.next_inum = find_free_blk(sdp); + mode = (sdp->gfs1 ? DT2IF(GFS_FILE_DIR) : S_IFDIR) | 0700; + if (sdp->gfs1) + lf_dip = gfs_createi(sdp->md.rooti, "lost+found", mode, 0); + else + lf_dip = createi(sdp->md.rooti, "lost+found", + S_IFDIR | 0700, 0); + if (lf_dip == NULL) { + log_crit(_("Error creating lost+found: %s\n"), + strerror(errno)); + exit(FSCK_ERROR); + } + + /* createi will have incremented the di_nlink link count for the root + directory. We must set the nlink value in the hash table to keep + them in sync so that pass4 can detect and fix any descrepancies. */ + set_di_nlink(sdp->md.rooti); + + if (sdp->md.rooti->i_di.di_entries > root_entries) { + lf_was_created = 1; + /* This is a new lost+found directory, so set its block type + and increment link counts for the directories */ + /* FIXME: i'd feel better about this if fs_mkdir returned + whether it created a new directory or just found an old one, + and we used that instead of the bitmap_type to run this */ + dirtree_insert(lf_dip->i_di.di_num); + /* Set the bitmap AFTER the dirtree insert so that function + check_n_fix_bitmap will realize it's a dinode and adjust + the rgrp counts properly. */ + fsck_bitmap_set(ip, lf_dip->i_di.di_num.no_addr, + _("lost+found dinode"), GFS2_BLKST_DINODE); + /* root inode links to lost+found */ + incr_link_count(sdp->md.rooti->i_di.di_num, lf_dip, _("root")); + /* lost+found link for '.' 
from itself */ + incr_link_count(lf_dip->i_di.di_num, lf_dip, "\".\""); + /* lost+found link for '..' back to root */ + incr_link_count(lf_dip->i_di.di_num, sdp->md.rooti, "\"..\""); + if (sdp->gfs1) + lf_dip->i_di.__pad1 = GFS_FILE_DIR; + } + log_info( _("lost+found directory is dinode %lld (0x%llx)\n"), + (unsigned long long)lf_dip->i_di.di_num.no_addr, + (unsigned long long)lf_dip->i_di.di_num.no_addr); + di = dirtree_find(lf_dip->i_di.di_num.no_addr); + if (di) { + log_info( _("Marking lost+found inode connected\n")); + di->checked = 1; + di = NULL; + } +} + +/* add_inode_to_lf - Add dir entry to lost+found for the inode + * @ip: inode to add to lost + found + * + * This function adds an entry into the lost and found dir + * for the given inode. The name of the entry will be + * "lost_i_num.no_addr>". + * + * Returns: 0 on success, -1 on failure. + */ +int add_inode_to_lf(struct gfs2_inode *ip){ + char tmp_name[256]; + __be32 inode_type; + struct gfs2_sbd *sdp = ip->i_sbd; + int err = 0; + uint32_t mode; + + make_sure_lf_exists(ip); + if (ip->i_di.di_num.no_addr == lf_dip->i_di.di_num.no_addr) { + log_err( _("Trying to add lost+found to itself...skipping")); + return 0; + } + + if (sdp->gfs1) + mode = gfs_to_gfs2_mode(ip); + else + mode = ip->i_di.di_mode & S_IFMT; + + switch (mode) { + case S_IFDIR: + add_dotdot(ip); + sprintf(tmp_name, "lost_dir_%llu", + (unsigned long long)ip->i_di.di_num.no_addr); + inode_type = (sdp->gfs1 ? GFS_FILE_DIR : DT_DIR); + break; + case S_IFREG: + sprintf(tmp_name, "lost_file_%llu", + (unsigned long long)ip->i_di.di_num.no_addr); + inode_type = (sdp->gfs1 ? GFS_FILE_REG : DT_REG); + break; + case S_IFLNK: + sprintf(tmp_name, "lost_link_%llu", + (unsigned long long)ip->i_di.di_num.no_addr); + inode_type = (sdp->gfs1 ? GFS_FILE_LNK : DT_LNK); + break; + case S_IFBLK: + sprintf(tmp_name, "lost_blkdev_%llu", + (unsigned long long)ip->i_di.di_num.no_addr); + inode_type = (sdp->gfs1 ? 
GFS_FILE_BLK : DT_BLK); + break; + case S_IFCHR: + sprintf(tmp_name, "lost_chrdev_%llu", + (unsigned long long)ip->i_di.di_num.no_addr); + inode_type = (sdp->gfs1 ? GFS_FILE_CHR : DT_CHR); + break; + case S_IFIFO: + sprintf(tmp_name, "lost_fifo_%llu", + (unsigned long long)ip->i_di.di_num.no_addr); + inode_type = (sdp->gfs1 ? GFS_FILE_FIFO : DT_FIFO); + break; + case S_IFSOCK: + sprintf(tmp_name, "lost_socket_%llu", + (unsigned long long)ip->i_di.di_num.no_addr); + inode_type = (sdp->gfs1 ? GFS_FILE_SOCK : DT_SOCK); + break; + default: + sprintf(tmp_name, "lost_%llu", + (unsigned long long)ip->i_di.di_num.no_addr); + inode_type = (sdp->gfs1 ? GFS_FILE_REG : DT_REG); + break; + } + + err = dir_add(lf_dip, tmp_name, strlen(tmp_name), &(ip->i_di.di_num), + inode_type); + if (err) { + log_crit(_("Error adding directory %s: %s\n"), + tmp_name, strerror(errno)); + exit(FSCK_ERROR); + } + + /* This inode is linked from lost+found */ + incr_link_count(ip->i_di.di_num, lf_dip, _("from lost+found")); + /* If it's a directory, lost+found is back-linked to it via .. 
*/ + if (mode == S_IFDIR) + incr_link_count(lf_dip->i_di.di_num, ip, _("to lost+found")); + + log_notice( _("Added inode #%llu (0x%llx) to lost+found\n"), + (unsigned long long)ip->i_di.di_num.no_addr, + (unsigned long long)ip->i_di.di_num.no_addr); + gfs2_dinode_out(&lf_dip->i_di, lf_dip->i_bh->b_data); + bwrite(lf_dip->i_bh); + return 0; +} diff --git a/gfs2/fsck/lost_n_found.h b/gfs2/fsck/lost_n_found.h new file mode 100644 index 0000000..2b76cc2 --- /dev/null +++ b/gfs2/fsck/lost_n_found.h @@ -0,0 +1,9 @@ +#ifndef __LOST_N_FOUND_H__ +#define __LOST_N_FOUND_H__ + +#include "libgfs2.h" + +int add_inode_to_lf(struct gfs2_inode *ip); +void make_sure_lf_exists(struct gfs2_inode *ip); + +#endif /* __LOST_N_FOUND_H__ */ diff --git a/gfs2/fsck/main.c b/gfs2/fsck/main.c new file mode 100644 index 0000000..ecdcd0f --- /dev/null +++ b/gfs2/fsck/main.c @@ -0,0 +1,385 @@ +#include "clusterautoconfig.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#define _(String) gettext(String) +#include + +#include +#include "copyright.cf" +#include "libgfs2.h" +#include "fsck.h" +#include "link.h" +#include "osi_list.h" +#include "metawalk.h" +#include "util.h" + +struct gfs2_options opts = {0}; +struct gfs2_inode *lf_dip = NULL; /* Lost and found directory inode */ +int lf_was_created = 0; +uint64_t last_fs_block, last_reported_block = -1; +int64_t last_reported_fblock = -1000000; +int skip_this_pass = FALSE, fsck_abort = FALSE; +int errors_found = 0, errors_corrected = 0; +const char *pass = ""; +uint64_t last_data_block; +uint64_t first_data_block; +int preen = 0, force_check = 0; +struct osi_root dup_blocks; +struct osi_root dirtree; +struct osi_root inodetree; +int dups_found = 0, dups_found_first = 0; +struct gfs_sb *sbd1 = NULL; +int sb_fixed = 0; +int print_level = MSG_NOTICE; + +/* This function is for libgfs2's sake. 
*/
+/* Prints "label: " followed by fmt expanded with the variable arguments.
+   fmt2 is accepted but not used here. */
+void print_it(const char *label, const char *fmt, const char *fmt2, ...)
+{
+	va_list args;
+
+	va_start(args, fmt2);
+	printf("%s: ", label);
+	vprintf(fmt, args);
+	va_end(args);
+}
+
+/* Print the one-line synopsis; the device operand is mandatory (see
+   read_cmdline). */
+static void usage(char *name)
+{
+	printf("Usage: %s [-afhnpqvVy] <device> \n", basename(name));
+}
+
+/* Print the program version, build date/time and copyright notice. */
+static void version(void)
+{
+	printf( _("GFS2 fsck %s (built %s %s)\n"),
+	       VERSION, __DATE__, __TIME__);
+	printf(REDHAT_COPYRIGHT "\n");
+}
+
+/*
+ * read_cmdline - parse the command line into gopts and the global flags
+ *
+ * -a/-p set preen and imply "yes"; -y and -n set gopts->yes / gopts->no;
+ * these three groups are mutually exclusive.  -h and -V print and exit with
+ * FSCK_OK.  The first non-option argument becomes gopts->device.
+ *
+ * Returns: 0 on success, FSCK_USAGE on any usage error.
+ */
+static int read_cmdline(int argc, char **argv, struct gfs2_options *gopts)
+{
+	int c;
+
+	while ((c = getopt(argc, argv, "afhnpqvyV")) != -1) {
+		switch(c) {
+
+		case 'a':
+		case 'p':
+			if (gopts->yes || gopts->no) {
+				fprintf(stderr, _("Options -p/-a, -y and -n may not be used together\n"));
+				return FSCK_USAGE;
+			}
+			preen = 1;
+			gopts->yes = 1;
+			break;
+		case 'f':
+			force_check = 1;
+			break;
+		case 'h':
+			usage(argv[0]);
+			exit(FSCK_OK);
+			break;
+		case 'n':
+			if (gopts->yes || preen) {
+				fprintf(stderr, _("Options -p/-a, -y and -n may not be used together\n"));
+				return FSCK_USAGE;
+			}
+			gopts->no = 1;
+			break;
+		case 'q':
+			decrease_verbosity();
+			break;
+		case 'v':
+			increase_verbosity();
+			break;
+		case 'V':
+			version();
+			exit(FSCK_OK);
+			break;
+		case 'y':
+			if (gopts->no || preen) {
+				fprintf(stderr, _("Options -p/-a, -y and -n may not be used together\n"));
+				return FSCK_USAGE;
+			}
+			gopts->yes = 1;
+			break;
+		case ':':
+		case '?':
+			fprintf(stderr, _("Please use '-h' for help.\n"));
+			return FSCK_USAGE;
+		default:
+			fprintf(stderr, _("Invalid option %c\n"), c);
+			return FSCK_USAGE;
+
+		}
+	}
+	if (argc > optind) {
+		gopts->device = (argv[optind]);
+		if (!gopts->device) {
+			fprintf(stderr, _("Please use '-h' for help.\n"));
+			return FSCK_USAGE;
+		}
+	} else {
+		fprintf(stderr, _("No device specified (Please use '-h' for help)\n"));
+		return FSCK_USAGE;
+	}
+	return 0;
+}
+
+/*
+ * interrupt - SIGINT handler
+ *
+ * Reports progress and asks whether to abort, skip the current pass or
+ * continue; sets fsck_abort or skip_this_pass accordingly.
+ */
+static void interrupt(int sig)
+{
+	char response;
+	char progress[PATH_MAX];
+
+	/* Bounded writes, and the translated text is never itself used as
+	   a format without arguments. */
+	if (!last_reported_block || last_reported_block == last_fs_block)
+		snprintf(progress, sizeof(progress), "%s",
+			 _("progress unknown.\n"));
+	else
+		snprintf(progress, sizeof(progress),
+			 _("processing block %llu out of %llu\n"),
+			 (unsigned long long)last_reported_block,
+			 (unsigned long long)last_fs_block);
+
+	response = generic_interrupt("fsck.gfs2", pass, progress,
+				     _("Do you want to abort fsck.gfs2, skip " \
+				     "the rest of this pass or continue " \
+				     "(a/s/c)?"), "asc");
+	if (tolower(response) == 's') {
+		skip_this_pass = TRUE;
+		return;
+	}
+	else if (tolower(response) == 'a') {
+		fsck_abort = TRUE;
+		return;
+	}
+}
+
+/*
+ * check_statfs - compare the statfs file with totals computed from the rgrps
+ *
+ * Recomputes sdp->blks_total, sdp->blks_alloced and sdp->dinodes_alloced
+ * from the resource group tree and, if the statfs file disagrees, offers to
+ * rewrite it with do_init_statfs().
+ *
+ * Returns: 0 normally (including when the user declines the fix or a GFS1
+ *          file system has an empty statfs file), FSCK_ERROR if the statfs
+ *          file cannot be read at its expected size.
+ */
+static int check_statfs(struct gfs2_sbd *sdp)
+{
+	struct osi_node *n, *next = NULL;
+	struct rgrp_tree *rgd;
+	struct gfs2_rindex *ri;
+	struct gfs2_statfs_change sc = {0,};
+	char buf[sizeof(struct gfs2_statfs_change)];
+	uint64_t size;
+	int count;
+
+	if (sdp->gfs1 && !sdp->md.statfs->i_di.di_size) {
+		log_info("This GFS1 file system is not using fast_statfs.\n");
+		return 0;
+	}
+	/* Read the current statfs values.  di_size comes from disk and may
+	   be corrupt, so never ask for more than buf can hold. */
+	size = sdp->md.statfs->i_di.di_size;
+	count = gfs2_readi(sdp->md.statfs, buf, 0,
+			   size < sizeof(buf) ? size : sizeof(buf));
+	/* A file of any other size is still reported as unreadable */
+	if (size != sizeof(buf) || count != sizeof(struct gfs2_statfs_change)) {
+		log_err(_("Failed to read statfs values (%d of %"PRIu64" read)\n"),
+			count, (uint64_t)sdp->md.statfs->i_di.di_size);
+		return FSCK_ERROR;
+	}
+	gfs2_statfs_change_in(&sc, buf);
+	/* Calculate the real values from the rgrp information */
+	sdp->blks_total = 0;
+	sdp->blks_alloced = 0;
+	sdp->dinodes_alloced = 0;
+
+	for (n = osi_first(&sdp->rgtree); n; n = next) {
+		next = osi_next(n);
+		rgd = (struct rgrp_tree *)n;
+		ri = &rgd->ri;
+		sdp->blks_total += ri->ri_data;
+		sdp->blks_alloced += (ri->ri_data - rgd->rg.rg_free);
+		sdp->dinodes_alloced += rgd->rg.rg_dinodes;
+	}
+
+	/* See if they match */
+	if (sc.sc_total == sdp->blks_total &&
+	    sc.sc_free == (sdp->blks_total - sdp->blks_alloced) &&
+	    sc.sc_dinodes == sdp->dinodes_alloced) {
+		log_info( _("The statfs file is accurate.\n"));
+		return 0;
+	}
+	log_err( _("The statfs file is wrong:\n\n"));
+	log_err( _("Current statfs values:\n"));
+	log_err( _("blocks: %lld (0x%llx)\n"),
+		 (unsigned long long)sc.sc_total,
+		 (unsigned long long)sc.sc_total);
+	log_err( _("free: %lld (0x%llx)\n"),
+		 (unsigned long long)sc.sc_free,
+		 (unsigned long long)sc.sc_free);
+	log_err( _("dinodes: %lld (0x%llx)\n\n"),
+		 (unsigned long long)sc.sc_dinodes,
+		 (unsigned long long)sc.sc_dinodes);
+
+	log_err( _("Calculated statfs values:\n"));
+	log_err( _("blocks: %lld (0x%llx)\n"),
+		 (unsigned long long)sdp->blks_total,
+		 (unsigned long long)sdp->blks_total);
+	log_err( _("free: %lld (0x%llx)\n"),
+		 (unsigned long long)(sdp->blks_total - sdp->blks_alloced),
+		 (unsigned long long)(sdp->blks_total - sdp->blks_alloced));
+	log_err( _("dinodes: %lld (0x%llx)\n"),
+		 (unsigned long long)sdp->dinodes_alloced,
+		 (unsigned long long)sdp->dinodes_alloced);
+
+	errors_found++;
+	if (!query( _("Okay to fix the master statfs file? (y/n)"))) {
+		log_err( _("The statfs file was not fixed.\n"));
+		return 0;
+	}
+
+	do_init_statfs(sdp);
+	log_err( _("The statfs file was fixed.\n"));
+	errors_corrected++;
+	return 0;
+}
+
+/* The passes, in the order main() runs them; the list ends at the entry
+   whose name is NULL. */
+static const struct fsck_pass passes[] = {
+	{ .name = "pass1", .f = pass1 },
+	{ .name = "pass1b", .f = pass1b },
+	{ .name = "pass2", .f = pass2 },
+	{ .name = "pass3", .f = pass3 },
+	{ .name = "pass4", .f = pass4 },
+	{ .name = "check_statfs", .f = check_statfs },
+	{ .name = NULL, }
+};
+
+/*
+ * fsck_pass - run one pass and report how long it took
+ *
+ * Returns: 0 when the pass completed, FSCK_CANCELED when an abort is
+ *          pending or the pass was interrupted.  A non-zero result from the
+ *          pass itself terminates the program with that status.
+ */
+static int fsck_pass(const struct fsck_pass *p, struct gfs2_sbd *sdp)
+{
+	int ret;
+	struct timeval timer;
+
+	if (fsck_abort)
+		return FSCK_CANCELED;
+	pass = p->name;
+
+	log_notice( _("Starting %s\n"), p->name);
+	gettimeofday(&timer, NULL);
+
+	ret = p->f(sdp);
+	if (ret)
+		exit(ret);
+	if (skip_this_pass || fsck_abort) {
+		skip_this_pass = 0;
+		log_notice( _("%s interrupted \n"), p->name);
+		return FSCK_CANCELED;
+	}
+
+	print_pass_duration(p->name, &timer);
+	return 0;
+}
+
+/* on_exit() hook: record the exit status in syslog. */
+static void exitlog(int status, void *unused)
+{
+	syslog(LOG_INFO, "exit: %d", status);
+}
+
+/*
+ * startlog - record the command line in syslog
+ * @argc: number of strings in argv; may be 0
+ * @argv: the arguments to log, joined with single spaces
+ */
+static void startlog(int argc, char **argv)
+{
+	int i;
+	char *cmd, *p;
+	size_t len;
+
+	for (len = i = 0; i < argc; i++)
+		len += strlen(argv[i]);
+	len += argc; /* Add spaces and '\0' */
+	/* With no arguments we still need room for the terminator;
+	   previously this case wrote one byte before the buffer. */
+	if (len == 0)
+		len = 1;
+
+	cmd = malloc(len);
+	if (cmd == NULL) {
+		perror(argv[0]);
+		exit(FSCK_ERROR);
+	}
+	p = cmd;
+	*p = '\0';
+	for (i = 0; i < argc; i++) {
+		if (i)
+			*p++ = ' ';
+		p = stpcpy(p, argv[i]);
+	}
+	syslog(LOG_INFO, "started: %s", cmd);
+	free(cmd);
+}
+
+/*
+ * main - fsck.gfs2 entry point
+ *
+ * Parses the command line, initializes, runs every entry of passes[],
+ * releases the system inodes and exits with FSCK_OK, FSCK_NONDESTRUCT or
+ * FSCK_UNCORRECTED depending on the error counters (or with the status of
+ * the last pass if that was non-zero).
+ */
+int main(int argc, char **argv)
+{
+	struct gfs2_sbd sb;
+	struct gfs2_sbd *sdp = &sb;
+	int j;
+	int i;
+	int error = 0;
+	int all_clean = 0;
+	struct sigaction act = { .sa_handler = interrupt, };
+
+	setlocale(LC_ALL, "");
+	textdomain("gfs2-utils");
+
+	openlog("fsck.gfs2", LOG_CONS|LOG_PID, LOG_USER);
+	startlog(argc - 1, &argv[1]);
+	on_exit(exitlog, NULL);
+
+	memset(sdp, 0, sizeof(*sdp));
+
+	if ((error = read_cmdline(argc, argv, &opts)))
+		exit(error);
+	setbuf(stdout, NULL);
+	log_notice( _("Initializing fsck\n"));
+	if ((error = initialize(sdp, force_check, preen, &all_clean)))
+		exit(error);
+
+	if (!force_check && all_clean && preen) {
+		log_err( _("%s: clean.\n"), opts.device);
+		destroy(sdp);
+		exit(FSCK_OK);
+	}
+
+	sigaction(SIGINT, &act, NULL);
+
+	for (i = 0; passes[i].name; i++)
+		error = fsck_pass(passes + i, sdp);
+
+	/* Free up our system inodes */
+	if (!sdp->gfs1)
+		inode_put(&sdp->md.inum);
+	inode_put(&sdp->md.statfs);
+	for (j = 0; j < sdp->md.journals; j++)
+		inode_put(&sdp->md.journal[j]);
+	free(sdp->md.journal);
+	sdp->md.journal = NULL;
+	inode_put(&sdp->md.jiinode);
+	inode_put(&sdp->md.riinode);
+	inode_put(&sdp->md.qinode);
+	if (!sdp->gfs1)
+		inode_put(&sdp->md.pinode);
+	inode_put(&sdp->md.rooti);
+	if (!sdp->gfs1)
+		inode_put(&sdp->master_dir);
+	if (lf_dip)
+		inode_put(&lf_dip);
+
+	if (!opts.no && errors_corrected)
+		log_notice( _("Writing changes to disk\n"));
+	fsync(sdp->device_fd);
+	link1_destroy(&nlink1map);
+	link1_destroy(&clink1map);
+	destroy(sdp);
+	if (sb_fixed)
+		log_warn(_("Superblock was reset. Use tunegfs2 to manually "
+			   "set lock table before mounting.\n"));
+	log_notice( _("fsck.gfs2 complete\n"));
+
+	if (!error) {
+		if (!errors_found)
+			error = FSCK_OK;
+		else if (errors_found == errors_corrected)
+			error = FSCK_NONDESTRUCT;
+		else
+			error = FSCK_UNCORRECTED;
+	}
+	exit(error);
+}
diff --git a/gfs2/fsck/metawalk.c b/gfs2/fsck/metawalk.c
new file mode 100644
index 0000000..a7780d7
--- /dev/null
+++ b/gfs2/fsck/metawalk.c
@@ -0,0 +1,1713 @@
+#include "clusterautoconfig.h"
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#define _(String) gettext(String)
+
+#include
+#include "libgfs2.h"
+#include "link.h"
+#include "osi_tree.h"
+#include "fsck.h"
+#include "util.h"
+#include "metawalk.h"
+#include "inode_hash.h"
+
+#define COMFORTABLE_BLKS 5242880 /* 20GB in 4K blocks */
+
+/* There are two bitmaps: (1) The "blockmap" that fsck uses to keep track of
+   what block type has been discovered, and (2) The rgrp bitmap.  Function
+   gfs2_blockmap_set is used to set the former and gfs2_set_bitmap
+   is used to set the latter.  The two must be kept in sync, otherwise
+   you'll get bitmap mismatches.  This function checks the status of the
+   bitmap whenever the blockmap changes, and fixes it accordingly.
*/
+/*
+ * @sdp: the superblock
+ * @rgd: resource group believed to hold blk, or NULL to have it looked up
+ * @blk: block whose bitmap state is being checked
+ * @error_on_dinode: if set, a block currently marked dinode is not changed
+ *                   to any state other than free; 1 is returned instead
+ * @new_state: the GFS2_BLKST_* state the bitmap should have
+ *
+ * Returns: 0 if the bitmap already matched, was fixed, or the user declined
+ *          the fix; 1 for the error_on_dinode case; -1 if the block is not
+ *          represented in any rgrp bitmap.
+ */
+int check_n_fix_bitmap(struct gfs2_sbd *sdp, struct rgrp_tree *rgd,
+		       uint64_t blk, int error_on_dinode, int new_state)
+{
+	int old_state;
+	int treat_as_inode = 0;
+	int rewrite_rgrp = 0;
+	struct gfs_rgrp *gfs1rg;
+	const char *allocdesc[2][5] = { /* gfs2 descriptions */
+		{"free", "data", "unlinked", "inode", "reserved"},
+		/* gfs1 descriptions: */
+		{"free", "data", "free meta", "metadata", "reserved"}};
+	/* Last rgrp we had to look up; consecutive calls tend to hit it */
+	static struct rgrp_tree *prevrgd = NULL;
+
+	if (prevrgd && rgrp_contains_block(prevrgd, blk)) {
+		rgd = prevrgd;
+	} else if (rgd == NULL || !rgrp_contains_block(rgd, blk)) {
+		rgd = gfs2_blk2rgrpd(sdp, blk);
+		prevrgd = rgd;
+	}
+
+	/* The lookup above may have found no rgrp for this block; treat
+	   that exactly like a block the bitmap does not cover. */
+	old_state = rgd ? lgfs2_get_bitmap(sdp, blk, rgd) : -1;
+	if (old_state < 0) {
+		log_err( _("Block %llu (0x%llx) is not represented in the "
+			   "system bitmap; part of an rgrp or superblock.\n"),
+			 (unsigned long long)blk, (unsigned long long)blk);
+		return -1;
+	}
+	if (old_state == new_state)
+		return 0;
+
+	/* rgd is known to be valid from here on */
+	gfs1rg = (struct gfs_rgrp *)&rgd->rg;
+
+	if (error_on_dinode && old_state == GFS2_BLKST_DINODE &&
+	    new_state != GFS2_BLKST_FREE) {
+		log_debug(_("Reference as '%s' to block %llu (0x%llx) which "
+			    "was marked as dinode. Needs further "
+			    "investigation.\n"),
+			  allocdesc[sdp->gfs1][new_state],
+			  (unsigned long long)blk, (unsigned long long)blk);
+		return 1;
+	}
+	/* Keep these messages as short as possible, or the output gets to be
+	   huge and unmanageable. */
+	log_err( _("Block %llu (0x%llx) was '%s', should be %s.\n"),
+		 (unsigned long long)blk, (unsigned long long)blk,
+		 allocdesc[sdp->gfs1][old_state],
+		 allocdesc[sdp->gfs1][new_state]);
+	if (!query( _("Fix the bitmap? (y/n)"))) {
+		log_err( _("The bitmap inconsistency was ignored.\n"));
+		return 0;
+	}
+	/* If the new bitmap state is free (and therefore the old state was
+	   not) we have to add to the free space in the rgrp. If the old
+	   bitmap state was free (and therefore it no longer is) we have to
+	   subtract to the free space.  If the type changed from dinode to
+	   data or data to dinode, no change in free space. */
+	gfs2_set_bitmap(rgd, blk, new_state);
+	if (new_state == GFS2_BLKST_FREE) {
+		rgd->rg.rg_free++;
+		rewrite_rgrp = 1;
+	} else if (old_state == GFS2_BLKST_FREE) {
+		rgd->rg.rg_free--;
+		rewrite_rgrp = 1;
+	}
+	/* If we're freeing a dinode, get rid of the data structs for it. */
+	if (old_state == GFS2_BLKST_DINODE ||
+	    old_state == GFS2_BLKST_UNLINKED) {
+		struct dir_info *dt;
+		struct inode_info *ii;
+
+		dt = dirtree_find(blk);
+		if (dt) {
+			dirtree_delete(dt);
+			treat_as_inode = 1;
+		}
+		ii = inodetree_find(blk);
+		if (ii) {
+			inodetree_delete(ii);
+			treat_as_inode = 1;
+		} else if (!sdp->gfs1) {
+			treat_as_inode = 1;
+		} else if (link1_type(&nlink1map, blk) == 1) {
+			/* This is a GFS1 fs (so all metadata is marked inode).
+			   We need to verify it is an inode before we can decr
+			   the rgrp inode count. */
+			treat_as_inode = 1;
+		}
+		if (old_state == GFS2_BLKST_DINODE) {
+			if (treat_as_inode && rgd->rg.rg_dinodes > 0)
+				rgd->rg.rg_dinodes--;
+			else if (sdp->gfs1 && gfs1rg->rg_usedmeta > 0)
+				gfs1rg->rg_usedmeta--;
+			rewrite_rgrp = 1;
+		}
+		link1_set(&nlink1map, blk, 0);
+	} else if (new_state == GFS2_BLKST_DINODE) {
+		if (!sdp->gfs1) {
+			treat_as_inode = 1;
+		} else {
+			/* This is GFS1 (so all metadata is marked inode). We
+			   need to verify it is an inode before we can decr
+			   the rgrp inode count. */
+			if (link1_type(&nlink1map, blk) == 1)
+				treat_as_inode = 1;
+			else {
+				struct dir_info *dt;
+				struct inode_info *ii;
+
+				dt = dirtree_find(blk);
+				if (dt)
+					treat_as_inode = 1;
+				else {
+					ii = inodetree_find(blk);
+					if (ii)
+						treat_as_inode = 1;
+				}
+			}
+		}
+		if (treat_as_inode)
+			rgd->rg.rg_dinodes++;
+		else if (sdp->gfs1)
+			gfs1rg->rg_usedmeta++;
+		rewrite_rgrp = 1;
+	}
+	if (rewrite_rgrp) {
+		if (sdp->gfs1)
+			gfs_rgrp_out((struct gfs_rgrp *)&rgd->rg, rgd->bits[0].bi_bh);
+		else
+			gfs2_rgrp_out(&rgd->rg, rgd->bits[0].bi_bh->b_data);
+		bmodified(rgd->bits[0].bi_bh);
+	}
+	log_err( _("The bitmap was fixed.\n"));
+	return 0;
+}
+
+/*
+ * _fsck_bitmap_set - Mark a block in the bitmap, and adjust free space.
+ * @ip: inode that owns or references the block
+ * @bblock: the block to mark
+ * @btype: description of the block, for messages
+ * @mark: the new GFS2_BLKST_* state
+ * @error_on_dinode: passed through to check_n_fix_bitmap
+ * @caller, @fline: call site, printed when print_level >= MSG_DEBUG
+ *
+ * Returns: the result of check_n_fix_bitmap.
+ */
+int _fsck_bitmap_set(struct gfs2_inode *ip, uint64_t bblock,
+		     const char *btype, int mark,
+		     int error_on_dinode, const char *caller, int fline)
+{
+	int error;
+	/* Block addresses are 64 bits wide; an int here truncated them and
+	   made the "same inode as last time" test unreliable. */
+	static uint64_t prev_ino_addr = 0;
+	static int prev_mark = 0;
+	static int prevcount = 0;
+	static const char *prev_caller = NULL;
+
+	if (print_level >= MSG_DEBUG) {
+		if ((ip->i_di.di_num.no_addr == prev_ino_addr) &&
+		    (mark == prev_mark) && caller == prev_caller) {
+			log_info("(0x%llx) ", (unsigned long long)bblock);
+			prevcount++;
+			if (prevcount > 10) {
+				log_info("\n");
+				prevcount = 0;
+			}
+		/* I'm circumventing the log levels here on purpose to make the
+		   output easier to debug. */
+		} else if (ip->i_di.di_num.no_addr == bblock) {
+			if (prevcount) {
+				log_info("\n");
+				prevcount = 0;
+			}
+			printf( _("(%s:%d) %s inode found at block "
+				  "(0x%llx): marking as '%s'\n"), caller, fline,
+			       btype,
+			       (unsigned long long)ip->i_di.di_num.no_addr,
+			       block_type_string(mark));
+
+		} else {
+			if (prevcount) {
+				log_info("\n");
+				prevcount = 0;
+			}
+			printf( _("(%s:%d) inode (0x%llx) references %s block"
+				  " (0x%llx): marking as '%s'\n"),
+			       caller, fline,
+			       (unsigned long long)ip->i_di.di_num.no_addr,
+			       btype, (unsigned long long)bblock,
+			       block_type_string(mark));
+		}
+		prev_ino_addr = ip->i_di.di_num.no_addr;
+		prev_mark = mark;
+		prev_caller = caller;
+	}
+	error = check_n_fix_bitmap(ip->i_sbd, ip->i_rgd, bblock,
+				   error_on_dinode, mark);
+	if (error < 0)
+		log_err(_("This block is not represented in the bitmap.\n"));
+	return error;
+}
+
+/* dupfind - look up a block in the dup_blocks tree; NULL if absent. */
+struct duptree *dupfind(uint64_t block)
+{
+	struct osi_node *node = dup_blocks.osi_node;
+
+	while (node) {
+		struct duptree *dt = (struct duptree *)node;
+
+		if (block < dt->block)
+			node = node->osi_left;
+		else if (block > dt->block)
+			node = node->osi_right;
+		else
+			return dt;
+	}
+	return NULL;
+}
+
+/* fsck_system_inode - return the already-open inode (lost+found or one of
+   the system inodes held in sdp->md) that lives at block, or NULL if block
+   is not one of them. */
+struct gfs2_inode *fsck_system_inode(struct gfs2_sbd *sdp, uint64_t block)
+{
+	int j;
+
+	if (lf_dip && lf_dip->i_di.di_num.no_addr == block)
+		return lf_dip;
+	if (!sdp->gfs1)
+		return is_system_inode(sdp, block);
+
+	if (sdp->md.statfs && block == sdp->md.statfs->i_di.di_num.no_addr)
+		return sdp->md.statfs;
+	if (sdp->md.jiinode && block == sdp->md.jiinode->i_di.di_num.no_addr)
+		return sdp->md.jiinode;
+	if (sdp->md.riinode && block == sdp->md.riinode->i_di.di_num.no_addr)
+		return sdp->md.riinode;
+	if (sdp->md.qinode && block == sdp->md.qinode->i_di.di_num.no_addr)
+		return sdp->md.qinode;
+	if (sdp->md.rooti && block == sdp->md.rooti->i_di.di_num.no_addr)
+		return sdp->md.rooti;
+	for (j = 0; j < sdp->md.journals; j++)
+		if (sdp->md.journal && sdp->md.journal[j] &&
+		    block == sdp->md.journal[j]->i_di.di_num.no_addr)
+			return sdp->md.journal[j];
+	return NULL;
+}
+
+/* fsck_load_inode - same as gfs2_load_inode() in libgfs2 but system inodes
+   get special treatment. */
+struct gfs2_inode *fsck_load_inode(struct gfs2_sbd *sdp, uint64_t block)
+{
+	struct gfs2_inode *ip = NULL;
+
+	ip = fsck_system_inode(sdp, block);
+	if (ip)
+		return ip;
+	if (sdp->gfs1)
+		return lgfs2_gfs_inode_read(sdp, block);
+	return lgfs2_inode_read(sdp, block);
+}
+
+/* fsck_inode_get - same as inode_get() in libgfs2 but system inodes
+   get special treatment. */
+struct gfs2_inode *fsck_inode_get(struct gfs2_sbd *sdp, struct rgrp_tree *rgd,
+				  struct gfs2_buffer_head *bh)
+{
+	struct gfs2_inode *sysip;
+	struct gfs2_inode *ip;
+
+	sysip = fsck_system_inode(sdp, bh->b_blocknr);
+	if (sysip)
+		return sysip;
+
+	if (sdp->gfs1)
+		ip = lgfs2_gfs_inode_get(sdp, bh);
+	else
+		ip = lgfs2_inode_get(sdp, bh);
+	if (ip)
+		ip->i_rgd = rgd;
+	return ip;
+}
+
+/* fsck_inode_put - same as inode_put() in libgfs2 but system inodes
+   get special treatment. */
+void fsck_inode_put(struct gfs2_inode **ip_in)
+{
+	struct gfs2_inode *ip = *ip_in;
+	struct gfs2_inode *sysip;
+
+	/* System inodes stay open for the whole run; only release others */
+	sysip = fsck_system_inode(ip->i_sbd, ip->i_di.di_num.no_addr);
+	if (!sysip)
+		inode_put(ip_in);
+}
+
+/**
+ * dirent_repair - attempt to repair a corrupt directory entry.
+ * @bh - The buffer header that contains the bad dirent
+ * @de - The directory entry in native format
+ * @dent - The directory entry in on-disk format
+ * @type - Type of directory (DIR_LINEAR or DIR_EXHASH)
+ * @first - TRUE if this is the first dirent in the buffer
+ *
+ * This function tries to repair a corrupt directory entry.  All we
+ * know at this point is that the length field is wrong.
+ */
+static int dirent_repair(struct gfs2_inode *ip, struct gfs2_buffer_head *bh,
+			 struct gfs2_dirent *de, struct gfs2_dirent *dent,
+			 int type, int first)
+{
+	char *bh_end, *p;
+	int calc_de_name_len = 0;
+
+	/* If this is a sentinel, just fix the length and move on */
+	if (first && !de->de_inum.no_formal_ino) { /* Is it a sentinel? */
+		if (type == DIR_LINEAR)
+			de->de_rec_len = ip->i_sbd->bsize -
+				sizeof(struct gfs2_dinode);
+		else
+			de->de_rec_len = ip->i_sbd->bsize -
+				sizeof(struct gfs2_leaf);
+	} else {
+		bh_end = bh->b_data + ip->i_sbd->bsize;
+		/* first, figure out a probable name length */
+		p = (char *)dent + sizeof(struct gfs2_dirent);
+		while (p < bh_end && /* test bounds first so *p is never read past the buffer */
+		       *p && /* while there's a non-zero char and */
+		       isprint((unsigned char)*p)) { /* a printable char (cast: isprint needs an unsigned char value) */
+			calc_de_name_len++;
+			p++;
+		}
+		if (!calc_de_name_len)
+			return 1;
+		/* There can often be noise at the end, so only */
+		/* Trust the shorter of the two in case we have too much */
+		/* Or rather, only trust ours if it's shorter. */
+		if (!de->de_name_len || de->de_name_len > NAME_MAX ||
+		    calc_de_name_len < de->de_name_len) /* if dent is hosed */
+			de->de_name_len = calc_de_name_len; /* use ours */
+		de->de_rec_len = GFS2_DIRENT_SIZE(de->de_name_len);
+	}
+	gfs2_dirent_out(de, (char *)dent);
+	bmodified(bh);
+	return 0;
+}
+
+/**
+ * dirblk_truncate - truncate a directory block
+ */
+static void dirblk_truncate(struct gfs2_inode *ip, struct gfs2_dirent *fixb,
+			    struct gfs2_buffer_head *bh)
+{
+	char *bh_end;
+	struct gfs2_dirent de;
+
+	bh_end = bh->b_data + ip->i_sbd->sd_sb.sb_bsize;
+	/* truncate the block to save the most dentries. To do this we
+	   have to patch the previous dent.
*/ + gfs2_dirent_in(&de, (char *)fixb); + de.de_rec_len = bh_end - (char *)fixb; + gfs2_dirent_out(&de, (char *)fixb); + bmodified(bh); +} + +/* + * check_entries - check directory entries for a given block + * + * @ip - dinode associated with this leaf block + * bh - buffer for the leaf block + * type - type of block this is (linear or exhash) + * @count - set to the count entries + * @lindex - the last inde + * @pass - structure pointing to pass-specific functions + * + * returns: 0 - good block or it was repaired to be good + * -1 - error occurred + */ +static int check_entries(struct gfs2_inode *ip, struct gfs2_buffer_head *bh, + int type, uint32_t *count, int lindex, + struct metawalk_fxns *pass) +{ + struct gfs2_dirent *dent; + struct gfs2_dirent de, *prev; + int error = 0; + char *bh_end; + char *filename; + int first = 1; + + bh_end = bh->b_data + ip->i_sbd->bsize; + + if (type == DIR_LINEAR) { + dent = (struct gfs2_dirent *)(bh->b_data + sizeof(struct gfs2_dinode)); + } else { + dent = (struct gfs2_dirent *)(bh->b_data + sizeof(struct gfs2_leaf)); + log_debug( _("Checking leaf %llu (0x%llx)\n"), + (unsigned long long)bh->b_blocknr, + (unsigned long long)bh->b_blocknr); + } + + prev = NULL; + if (!pass->check_dentry) + return 0; + + while (1) { + if (skip_this_pass || fsck_abort) + return FSCK_OK; + memset(&de, 0, sizeof(struct gfs2_dirent)); + gfs2_dirent_in(&de, (char *)dent); + filename = (char *)dent + sizeof(struct gfs2_dirent); + + if (de.de_rec_len < sizeof(struct gfs2_dirent) + + de.de_name_len || + (de.de_inum.no_formal_ino && !de.de_name_len && !first)) { + log_err( _("Directory block %llu (0x%llx" + "), entry %d of directory %llu " + "(0x%llx) is corrupt.\n"), + (unsigned long long)bh->b_blocknr, + (unsigned long long)bh->b_blocknr, + (*count) + 1, + (unsigned long long)ip->i_di.di_num.no_addr, + (unsigned long long)ip->i_di.di_num.no_addr); + if (query( _("Attempt to repair it? 
(y/n) "))) { + if (dirent_repair(ip, bh, &de, dent, type, + first)) { + if (first) /* make a new sentinel */ + dirblk_truncate(ip, dent, bh); + else + dirblk_truncate(ip, prev, bh); + log_err( _("Unable to repair corrupt " + "directory entry; the " + "entry was removed " + "instead.\n")); + return 0; + } else { + log_err( _("Corrupt directory entry " + "repaired.\n")); + /* keep looping through dentries */ + } + } else { + log_err( _("Corrupt directory entry ignored, " + "stopped after checking %d entries.\n"), + *count); + return 0; + } + } + if (!de.de_inum.no_formal_ino){ + if (first){ + log_debug( _("First dirent is a sentinel (place holder).\n")); + first = 0; + } else { + log_err( _("Directory entry with inode number of " + "zero in leaf %llu (0x%llx) of " + "directory %llu (0x%llx)!\n"), + (unsigned long long)bh->b_blocknr, + (unsigned long long)bh->b_blocknr, + (unsigned long long)ip->i_di.di_num.no_addr, + (unsigned long long)ip->i_di.di_num.no_addr); + if (query(_("Attempt to remove it? 
(y/n) "))) { + dirblk_truncate(ip, prev, bh); + log_err(_("The corrupt directory " + "entry was removed.\n")); + } else { + log_err( _("Corrupt directory entry " + "ignored, stopped after " + "checking %d entries.\n"), + *count); + } + return 0; + } + } else { + if (!de.de_inum.no_addr && first) { /* reverse sentinel */ + log_debug( _("First dirent is a Sentinel (place holder).\n")); + /* Swap the two to silently make it a proper sentinel */ + de.de_inum.no_addr = de.de_inum.no_formal_ino; + de.de_inum.no_formal_ino = 0; + gfs2_dirent_out(&de, (char *)dent); + bmodified(bh); + /* Mark dirent buffer as modified */ + first = 0; + } else { + error = pass->check_dentry(ip, dent, prev, bh, + filename, count, + &lindex, + pass->private); + if (error < 0) { + stack; + return error; + } + } + } + + if ((char *)dent + de.de_rec_len >= bh_end){ + log_debug( _("Last entry processed for %lld->%lld " + "(0x%llx->0x%llx), di_blocks=%llu.\n"), + (unsigned long long)ip->i_di.di_num.no_addr, + (unsigned long long)bh->b_blocknr, + (unsigned long long)ip->i_di.di_num.no_addr, + (unsigned long long)bh->b_blocknr, + (unsigned long long)ip->i_di.di_blocks); + break; + } + + /* If we didn't clear the dentry, or if we did, but it + * was the first dentry, set prev */ + if (!error || first) + prev = dent; + first = 0; + dent = (struct gfs2_dirent *)((char *)dent + de.de_rec_len); + } + return 0; +} + +/** + * check_leaf - check a leaf block for errors + * Reads in the leaf block + * Leaves the buffer around for further analysis (caller must brelse) + */ +int check_leaf(struct gfs2_inode *ip, int lindex, struct metawalk_fxns *pass, + uint64_t *leaf_no, struct gfs2_leaf *leaf, int *ref_count) +{ + int error = 0, fix; + struct gfs2_buffer_head *lbh = NULL; + uint32_t count = 0; + struct gfs2_sbd *sdp = ip->i_sbd; + const char *msg; + int di_depth = ip->i_di.di_depth; + + /* Make sure the block number is in range. 
*/ + if (!valid_block_ip(ip, *leaf_no)) { + log_err( _("Leaf block #%llu (0x%llx) is out of range for " + "directory #%llu (0x%llx) at index %d (0x%x).\n"), + (unsigned long long)*leaf_no, + (unsigned long long)*leaf_no, + (unsigned long long)ip->i_di.di_num.no_addr, + (unsigned long long)ip->i_di.di_num.no_addr, + lindex, lindex); + msg = _("that is out of range"); + goto bad_leaf; + } + + /* Try to read in the leaf block. */ + lbh = bread(sdp, *leaf_no); + /* Make sure it's really a valid leaf block. */ + if (gfs2_check_meta(lbh, GFS2_METATYPE_LF)) { + msg = _("that is not really a leaf"); + goto bad_leaf; + } + if (pass->check_leaf_depth) + error = pass->check_leaf_depth(ip, *leaf_no, *ref_count, lbh); + + if (error >= 0 && pass->check_leaf) { + error = pass->check_leaf(ip, *leaf_no, pass->private); + if (error == -EEXIST) { + log_info(_("Previous reference to leaf %lld (0x%llx) " + "has already checked it; skipping.\n"), + (unsigned long long)*leaf_no, + (unsigned long long)*leaf_no); + brelse(lbh); + return error; + } + } + /* Early versions of GFS2 had an endianess bug in the kernel that set + lf_dirent_format to cpu_to_be16(GFS2_FORMAT_DE). This was fixed + to use cpu_to_be32(), but we should check for incorrect values and + replace them with the correct value. */ + + gfs2_leaf_in(leaf, lbh->b_data); + if (leaf->lf_dirent_format == (GFS2_FORMAT_DE << 16)) { + log_debug( _("incorrect lf_dirent_format at leaf #%" PRIu64 + "\n"), *leaf_no); + leaf->lf_dirent_format = GFS2_FORMAT_DE; + gfs2_leaf_out(leaf, lbh->b_data); + bmodified(lbh); + log_debug( _("Fixing lf_dirent_format.\n")); + } + + /* Make sure it's really a leaf. 
*/ + if (leaf->lf_header.mh_type != GFS2_METATYPE_LF) { + log_err( _("Inode %llu (0x%llx) points to bad leaf %llu" + " (0x%llx).\n"), + (unsigned long long)ip->i_di.di_num.no_addr, + (unsigned long long)ip->i_di.di_num.no_addr, + (unsigned long long)*leaf_no, + (unsigned long long)*leaf_no); + msg = _("that is not a leaf"); + goto bad_leaf; + } + + if (pass->check_dentry && is_dir(&ip->i_di, sdp->gfs1)) { + error = check_entries(ip, lbh, DIR_EXHASH, &count, lindex, + pass); + + if (skip_this_pass || fsck_abort) + goto out; + + if (error < 0) { + stack; + goto out; /* This seems wrong: needs investigation */ + } + + if (count == leaf->lf_entries) + goto out; + + /* release and re-read the leaf in case check_entries + changed it. */ + brelse(lbh); + lbh = bread(sdp, *leaf_no); + gfs2_leaf_in(leaf, lbh->b_data); + if (count != leaf->lf_entries) { + log_err( _("Leaf %llu (0x%llx) entry count in " + "directory %llu (0x%llx) does not match " + "number of entries found - is %u, found %u\n"), + (unsigned long long)*leaf_no, + (unsigned long long)*leaf_no, + (unsigned long long)ip->i_di.di_num.no_addr, + (unsigned long long)ip->i_di.di_num.no_addr, + leaf->lf_entries, count); + if (query( _("Update leaf entry count? 
(y/n) "))) { + leaf->lf_entries = count; + gfs2_leaf_out(leaf, lbh->b_data); + bmodified(lbh); + log_warn( _("Leaf entry count updated\n")); + } else + log_err( _("Leaf entry count left in " + "inconsistent state\n")); + } + } +out: + if (di_depth < ip->i_di.di_depth) { + log_debug(_("Depth of directory %lld (0x%llx) changed from " + "%d to %d; adjusting ref_count from %d to %d\n"), + (unsigned long long)ip->i_di.di_num.no_addr, + (unsigned long long)ip->i_di.di_num.no_addr, + di_depth, ip->i_di.di_depth, + *ref_count, + (*ref_count) << (ip->i_di.di_depth - di_depth)); + (*ref_count) <<= (ip->i_di.di_depth - di_depth); + } + brelse(lbh); + if (error < 0) + return error; + return 0; + +bad_leaf: + if (lbh) + brelse(lbh); + if (pass->repair_leaf) { + /* The leaf we read in is bad so we need to repair it. */ + fix = pass->repair_leaf(ip, leaf_no, lindex, *ref_count, msg); + if (fix < 0) + return fix; + + } + if (di_depth < ip->i_di.di_depth) { + log_debug(_("Depth of directory %lld (0x%llx) changed from " + "%d to %d. 
Adjusting ref_count from %d to %d\n"), + (unsigned long long)ip->i_di.di_num.no_addr, + (unsigned long long)ip->i_di.di_num.no_addr, + di_depth, ip->i_di.di_depth, + *ref_count, + (*ref_count) << (ip->i_di.di_depth - di_depth)); + (*ref_count) <<= (ip->i_di.di_depth - di_depth); + } + return 1; +} + +static int u64cmp(const void *p1, const void *p2) +{ + uint64_t a = *(uint64_t *)p1; + uint64_t b = *(uint64_t *)p2; + + if (a > b) + return 1; + if (a < b) + return -1; + + return 0; +} + +static void dir_leaf_reada(struct gfs2_inode *ip, uint64_t *tbl, unsigned hsize) +{ + uint64_t *t = alloca(hsize * sizeof(uint64_t)); + uint64_t leaf_no; + struct gfs2_sbd *sdp = ip->i_sbd; + unsigned n = 0; + unsigned i; + + for (i = 0; i < hsize; i++) { + leaf_no = be64_to_cpu(tbl[i]); + if (valid_block_ip(ip, leaf_no)) + t[n++] = leaf_no * sdp->bsize; + } + qsort(t, n, sizeof(uint64_t), u64cmp); + for (i = 0; i < n; i++) + posix_fadvise(sdp->device_fd, t[i], sdp->bsize, POSIX_FADV_WILLNEED); +} + +/* Checks exhash directory entries */ +int check_leaf_blks(struct gfs2_inode *ip, struct metawalk_fxns *pass) +{ + int error = 0; + unsigned hsize = (1 << ip->i_di.di_depth); + uint64_t leaf_no, leaf_next; + uint64_t first_ok_leaf, orig_di_blocks; + struct gfs2_buffer_head *lbh; + int lindex; + struct gfs2_sbd *sdp = ip->i_sbd; + int ref_count, orig_ref_count, orig_di_depth, orig_di_height; + uint64_t *tbl; + int chained_leaf, tbl_valid; + + tbl = get_dir_hash(ip); + if (tbl == NULL) { + perror("get_dir_hash"); + return -1; + } + tbl_valid = 1; + orig_di_depth = ip->i_di.di_depth; + orig_di_height = ip->i_di.di_height; + orig_di_blocks = ip->i_di.di_blocks; + + /* Turn off system readahead */ + posix_fadvise(sdp->device_fd, 0, 0, POSIX_FADV_RANDOM); + + /* Readahead */ + dir_leaf_reada(ip, tbl, hsize); + + if (pass->check_hash_tbl) { + error = pass->check_hash_tbl(ip, tbl, hsize, pass->private); + if (error < 0) { + free(tbl); + posix_fadvise(sdp->device_fd, 0, 0, POSIX_FADV_NORMAL); + 
return error; + } + /* If hash table changes were made, read it in again. */ + if (error) { + free(tbl); + tbl = get_dir_hash(ip); + if (tbl == NULL) { + perror("get_dir_hash"); + return -1; + } + } + } + + /* Find the first valid leaf pointer in range and use it as our "old" + leaf. That way, bad blocks at the beginning will be overwritten + with the first valid leaf. */ + first_ok_leaf = leaf_no = -1; + for (lindex = 0; lindex < hsize; lindex++) { + leaf_no = be64_to_cpu(tbl[lindex]); + if (valid_block_ip(ip, leaf_no)) { + lbh = bread(sdp, leaf_no); + /* Make sure it's really a valid leaf block. */ + if (gfs2_check_meta(lbh, GFS2_METATYPE_LF) == 0) { + brelse(lbh); + first_ok_leaf = leaf_no; + break; + } + brelse(lbh); + } + } + if (first_ok_leaf == -1) { /* no valid leaf found */ + log_err( _("Directory #%llu (0x%llx) has no valid leaf " + "blocks\n"), + (unsigned long long)ip->i_di.di_num.no_addr, + (unsigned long long)ip->i_di.di_num.no_addr); + free(tbl); + posix_fadvise(sdp->device_fd, 0, 0, POSIX_FADV_NORMAL); + return 1; + } + lindex = 0; + leaf_next = -1; + while (lindex < hsize) { + int l; + + if (fsck_abort) + break; + + if (!tbl_valid) { + free(tbl); + log_debug(_("Re-reading 0x%llx hash table.\n"), + (unsigned long long)ip->i_di.di_num.no_addr); + tbl = get_dir_hash(ip); + if (tbl == NULL) { + perror("get_dir_hash"); + return -1; + } + tbl_valid = 1; + orig_di_depth = ip->i_di.di_depth; + orig_di_height = ip->i_di.di_height; + orig_di_blocks = ip->i_di.di_blocks; + } + leaf_no = be64_to_cpu(tbl[lindex]); + + /* count the number of block pointers to this leaf. 
We don't + need to count the current lindex, because we already know + it's a reference */ + ref_count = 1; + + for (l = lindex + 1; l < hsize; l++) { + leaf_next = be64_to_cpu(tbl[l]); + if (leaf_next != leaf_no) + break; + ref_count++; + } + orig_ref_count = ref_count; + + chained_leaf = 0; + do { + struct gfs2_leaf leaf; + if (fsck_abort) { + free(tbl); + posix_fadvise(sdp->device_fd, 0, 0, POSIX_FADV_NORMAL); + return 0; + } + error = check_leaf(ip, lindex, pass, &leaf_no, &leaf, + &ref_count); + if (ref_count != orig_ref_count) { + log_debug(_("Ref count of leaf 0x%llx " + "changed from %d to %d.\n"), + (unsigned long long)leaf_no, + orig_ref_count, ref_count); + tbl_valid = 0; + } + if (error < 0) { + free(tbl); + return error; + } + if (!leaf.lf_next || error) + break; + leaf_no = leaf.lf_next; + chained_leaf++; + log_debug( _("Leaf chain #%d (0x%llx) detected.\n"), + chained_leaf, (unsigned long long)leaf_no); + } while (1); /* while we have chained leaf blocks */ + if (orig_di_depth != ip->i_di.di_depth) { + log_debug(_("Depth of 0x%llx changed from %d to %d\n"), + (unsigned long long)ip->i_di.di_num.no_addr, + orig_di_depth, ip->i_di.di_depth); + tbl_valid = 0; + lindex <<= (ip->i_di.di_depth - orig_di_depth); + hsize = (1 << ip->i_di.di_depth); + } + if (orig_di_height != ip->i_di.di_height) { + log_debug(_("Height of 0x%llx changed from %d to " + "%d\n"), + (unsigned long long)ip->i_di.di_num.no_addr, + orig_di_height, ip->i_di.di_height); + tbl_valid = 0; + } + if (orig_di_blocks != ip->i_di.di_blocks) { + log_debug(_("Block count of 0x%llx changed from %llu " + "to %llu\n"), + (unsigned long long)ip->i_di.di_num.no_addr, + (unsigned long long)orig_di_blocks, + (unsigned long long)ip->i_di.di_blocks); + tbl_valid = 0; + } + lindex += ref_count; + } /* for every leaf block */ + free(tbl); + posix_fadvise(sdp->device_fd, 0, 0, POSIX_FADV_NORMAL); + return 0; +} + +static int check_eattr_entries(struct gfs2_inode *ip, + struct gfs2_buffer_head *bh, + 
struct metawalk_fxns *pass) +{ + struct gfs2_ea_header *ea_hdr, *ea_hdr_prev = NULL; + uint64_t *ea_data_ptr = NULL; + int i; + int error = 0, err; + uint32_t offset = (uint32_t)sizeof(struct gfs2_meta_header); + + if (!pass->check_eattr_entry) + return 0; + + ea_hdr = (struct gfs2_ea_header *)(bh->b_data + + sizeof(struct gfs2_meta_header)); + + while (1){ + if (ea_hdr->ea_type == GFS2_EATYPE_UNUSED) + error = 0; + else + error = pass->check_eattr_entry(ip, bh, ea_hdr, + ea_hdr_prev, + pass->private); + if (error < 0) { + stack; + return -1; + } + if (error == 0 && pass->check_eattr_extentry && + ea_hdr->ea_num_ptrs) { + uint32_t tot_ealen = 0; + struct gfs2_sbd *sdp = ip->i_sbd; + + ea_data_ptr = ((uint64_t *)((char *)ea_hdr + + sizeof(struct gfs2_ea_header) + + ((ea_hdr->ea_name_len + 7) & ~7))); + + /* It is possible when a EA is shrunk + ** to have ea_num_ptrs be greater than + ** the number required for ** data. + ** In this case, the EA ** code leaves + ** the blocks ** there for ** + ** reuse........... 
*/ + + for(i = 0; i < ea_hdr->ea_num_ptrs; i++){ + err = pass->check_eattr_extentry(ip, i, + ea_data_ptr, bh, tot_ealen, + ea_hdr, ea_hdr_prev, + pass->private); + if (err) + error = err; + tot_ealen += sdp->sd_sb.sb_bsize - + sizeof(struct gfs2_meta_header); + ea_data_ptr++; + } + } + offset += be32_to_cpu(ea_hdr->ea_rec_len); + if (ea_hdr->ea_flags & GFS2_EAFLAG_LAST || + offset >= ip->i_sbd->sd_sb.sb_bsize || ea_hdr->ea_rec_len == 0){ + break; + } + ea_hdr_prev = ea_hdr; + ea_hdr = (struct gfs2_ea_header *) + ((char *)(ea_hdr) + + be32_to_cpu(ea_hdr->ea_rec_len)); + } + + return error; +} + +/** + * check_leaf_eattr + * @ip: the inode the eattr comes from + * @block: block number of the leaf + * + * Returns: 0 on success, 1 if removal is needed, -1 on error + */ +static int check_leaf_eattr(struct gfs2_inode *ip, uint64_t block, + uint64_t parent, struct metawalk_fxns *pass) +{ + struct gfs2_buffer_head *bh = NULL; + + if (pass->check_eattr_leaf) { + int error = 0; + + log_debug( _("Checking EA leaf block #%llu (0x%llx) for " + "inode #%llu (0x%llx).\n"), + (unsigned long long)block, + (unsigned long long)block, + (unsigned long long)ip->i_di.di_num.no_addr, + (unsigned long long)ip->i_di.di_num.no_addr); + + error = pass->check_eattr_leaf(ip, block, parent, &bh, + pass->private); + if (error < 0) { + stack; + return -1; + } + if (error > 0) { + if (bh) + brelse(bh); + return 1; + } + if (bh) { + error = check_eattr_entries(ip, bh, pass); + brelse(bh); + } + return error; + } + + return 0; +} + +/** + * check_indirect_eattr + * @ip: the inode the eattr comes from + * @indirect_block + * + * Returns: 0 on success -1 on error + */ +static int check_indirect_eattr(struct gfs2_inode *ip, uint64_t indirect, + struct gfs2_buffer_head *indirect_buf, + struct metawalk_fxns *pass) +{ + int error = 0, err; + uint64_t *ea_leaf_ptr, *end; + uint64_t block; + struct gfs2_sbd *sdp = ip->i_sbd; + int first_ea_is_bad = 0; + uint64_t di_eattr_save = ip->i_di.di_eattr; + uint64_t 
offset = ip->i_sbd->gfs1 ? sizeof(struct gfs_indirect) : sizeof(struct gfs2_meta_header);
+	int leaf_pointers = 0, leaf_pointer_errors = 0;
+
+	ea_leaf_ptr = (uint64_t *)(indirect_buf->b_data + offset);
+	end = ea_leaf_ptr + ((sdp->sd_sb.sb_bsize - offset) / 8);
+
+	while ((ea_leaf_ptr < end) && *ea_leaf_ptr){ /* bounds test first: never read the slot past the block */
+		block = be64_to_cpu(*ea_leaf_ptr);
+		leaf_pointers++;
+		err = check_leaf_eattr(ip, block, indirect, pass);
+		if (err) {
+			error = err;
+			log_err(_("Error detected in leaf block %lld (0x%llx) "
+				  "referenced by indirect block %lld (0x%llx)"
+				  ".\n"),
+				(unsigned long long)block,
+				(unsigned long long)block,
+				(unsigned long long)indirect,
+				(unsigned long long)indirect);
+			log_err(_("Subsequent leaf block pointers should be "
+				  "cleared.\n"));
+		}
+		if (error) { /* leaf blocks following an error must also be
+				treated as error blocks and cleared. */
+			leaf_pointer_errors++;
+			log_err(_("Pointer to EA leaf block %lld (0x%llx) in "
+				  "indirect block %lld (0x%llx) should be "
+				  "cleared.\n"),
+				(unsigned long long)block,
+				(unsigned long long)block,
+				(unsigned long long)indirect,
+				(unsigned long long)indirect);
+		}
+		/* If the first eattr leaf is bad, we can't have a hole, so we
+		   have to treat this as an unrecoverable eattr error and
+		   delete all eattr info. Calling finish_eattr_indir here
+		   causes ip->i_di.di_eattr = 0 and that ensures that
+		   subsequent calls to check_leaf_eattr result in the eattr
+		   check_leaf_block nuking them all "due to previous errors" */
+		if (leaf_pointers == 1 && leaf_pointer_errors == 1) {
+			first_ea_is_bad = 1;
+			if (pass->finish_eattr_indir)
+				pass->finish_eattr_indir(ip, leaf_pointers,
+							 leaf_pointer_errors,
+							 pass->private);
+		} else if (leaf_pointer_errors) {
+			/* This is a bit tricky. We can't have eattr holes.
+			   So if we have 4 good eattrs, 1 bad eattr and 5 more
+			   good ones: GGGGBGGGGG, we need to tell
+			   check_leaf_eattr to delete all eattrs after the bad
+			   one. So we want: GGGG when we finish.
To do that, + we set di_eattr to 0 temporarily. */ + ip->i_di.di_eattr = 0; + bmodified(ip->i_bh); + } + ea_leaf_ptr++; + } + /* If we temporarily nuked the ea block to prevent checking past + a corrupt ea leaf, we need to restore the saved di_eattr block. */ + if (di_eattr_save != 0) + ip->i_di.di_eattr = di_eattr_save; + if (pass->finish_eattr_indir) { + if (!first_ea_is_bad) { + pass->finish_eattr_indir(ip, leaf_pointers, + leaf_pointer_errors, + pass->private); + } + if (pass->delete_block && leaf_pointer_errors && + leaf_pointer_errors == leaf_pointers) { + pass->delete_block(ip, indirect, NULL, "leaf", NULL); + error = 1; + } + } + + return error; +} + +/** + * check_inode_eattr - check the EA's for a single inode + * @ip: the inode whose EA to check + * + * Returns: 0 on success, -1 on error + */ +int check_inode_eattr(struct gfs2_inode *ip, struct metawalk_fxns *pass) +{ + int error = 0; + struct gfs2_buffer_head *indirect_buf = NULL; + + if (!ip->i_di.di_eattr) + return 0; + + if (ip->i_di.di_flags & GFS2_DIF_EA_INDIRECT){ + if (!pass->check_eattr_indir) + return 0; + + log_debug( _("Checking EA indirect block #%llu (0x%llx) for " + "inode #%llu (0x%llx)..\n"), + (unsigned long long)ip->i_di.di_eattr, + (unsigned long long)ip->i_di.di_eattr, + (unsigned long long)ip->i_di.di_num.no_addr, + (unsigned long long)ip->i_di.di_num.no_addr); + error = pass->check_eattr_indir(ip, ip->i_di.di_eattr, + ip->i_di.di_num.no_addr, + &indirect_buf, pass->private); + if (!error) { + error = check_indirect_eattr(ip, ip->i_di.di_eattr, + indirect_buf, pass); + if (error) + stack; + } + if (indirect_buf) + brelse(indirect_buf); + return error; + } + error = check_leaf_eattr(ip, ip->i_di.di_eattr, + ip->i_di.di_num.no_addr, pass); + if (error) + stack; + + return error; +} + +/** + * free_metalist - free all metadata on a multi-level metadata list + */ +static void free_metalist(struct gfs2_inode *ip, osi_list_t *mlp) +{ + int i; + struct gfs2_buffer_head *nbh; + + for (i = 
0; i < GFS2_MAX_META_HEIGHT; i++) { + osi_list_t *list; + + list = &mlp[i]; + while (!osi_list_empty(list)) { + nbh = osi_list_entry(list->next, + struct gfs2_buffer_head, b_altlist); + if (nbh == ip->i_bh) + osi_list_del_init(&nbh->b_altlist); + else + brelse(nbh); + } + } +} + +static void file_ra(struct gfs2_inode *ip, struct gfs2_buffer_head *bh, + int head_size, int maxptrs, int h) +{ + struct gfs2_sbd *sdp = ip->i_sbd; + uint64_t *p, sblock = 0, block; + int extlen = 0; + + if (h + 2 == ip->i_di.di_height) { + p = (uint64_t *)(bh->b_data + head_size); + if (*p && *(p + 1)) { + sblock = be64_to_cpu(*p); + p++; + block = be64_to_cpu(*p); + extlen = block - sblock; + if (extlen > 1 && extlen <= maxptrs) { + posix_fadvise(sdp->device_fd, + sblock * sdp->bsize, + (extlen + 1) * sdp->bsize, + POSIX_FADV_WILLNEED); + return; + } + } + extlen = 0; + } + for (p = (uint64_t *)(bh->b_data + head_size); + p < (uint64_t *)(bh->b_data + sdp->bsize); p++) { + if (*p) { + if (!sblock) { + sblock = be64_to_cpu(*p); + extlen = 1; + continue; + } + block = be64_to_cpu(*p); + if (block == sblock + extlen) { + extlen++; + continue; + } + } + if (extlen && sblock) { + if (extlen > 1) + extlen--; + posix_fadvise(sdp->device_fd, sblock * sdp->bsize, + extlen * sdp->bsize, + POSIX_FADV_WILLNEED); + extlen = 0; + p--; + } + } + if (extlen) + posix_fadvise(sdp->device_fd, sblock * sdp->bsize, + extlen * sdp->bsize, POSIX_FADV_WILLNEED); +} + +/** + * build_and_check_metalist - check a bunch of indirect blocks + * This includes hash table blocks for directories + * which are technically "data" in the bitmap. 
+ * + * Returns: 0 - all is well, process the blocks this metadata references + * 1 - something went wrong, but process the sub-blocks anyway + * -1 - something went wrong, so don't process the sub-blocks + * @ip: + * @mlp: + */ +static int build_and_check_metalist(struct gfs2_inode *ip, osi_list_t *mlp, + struct metawalk_fxns *pass) +{ + uint32_t height = ip->i_di.di_height; + struct gfs2_buffer_head *bh, *nbh, *metabh = ip->i_bh; + osi_list_t *prev_list, *cur_list, *tmp; + int h, head_size, iblk_type; + uint64_t *ptr, block, *undoptr; + int error, was_duplicate, is_valid; + int maxptrs; + + osi_list_add(&metabh->b_altlist, &mlp[0]); + + /* Directories are special. Their 'data' is the hash table, which is + basically an indirect block list. Their height is not important + because it checks everything through the hash table using + "depth" field calculations. However, we still have to check the + indirect blocks, even if the height == 1. */ + if (is_dir(&ip->i_di, ip->i_sbd->gfs1)) + height++; + + /* if () */ + if (height < 2) + return meta_is_good; + for (h = 1; h < height; h++) { + if (h > 1) { + if (is_dir(&ip->i_di, ip->i_sbd->gfs1) && + h == ip->i_di.di_height + 1) + iblk_type = GFS2_METATYPE_JD; + else + iblk_type = GFS2_METATYPE_IN; + if (ip->i_sbd->gfs1) { + head_size = sizeof(struct gfs_indirect); + maxptrs = (ip->i_sbd->bsize - head_size) / + sizeof(uint64_t); + } else { + head_size = sizeof(struct gfs2_meta_header); + maxptrs = ip->i_sbd->sd_inptrs; + } + } else { + iblk_type = GFS2_METATYPE_DI; + head_size = sizeof(struct gfs2_dinode); + maxptrs = ip->i_sbd->sd_diptrs; + } + prev_list = &mlp[h - 1]; + cur_list = &mlp[h]; + + for (tmp = prev_list->next; tmp != prev_list; tmp = tmp->next){ + bh = osi_list_entry(tmp, struct gfs2_buffer_head, + b_altlist); + if (gfs2_check_meta(bh, iblk_type)) { + if (pass->invalid_meta_is_fatal) + return meta_error; + + continue; + } + + if (pass->readahead) + file_ra(ip, bh, head_size, maxptrs, h); + /* Now check the 
metadata itself */ + for (ptr = (uint64_t *)(bh->b_data + head_size); + (char *)ptr < (bh->b_data + ip->i_sbd->bsize); + ptr++) { + if (skip_this_pass || fsck_abort) { + free_metalist(ip, mlp); + return meta_is_good; + } + nbh = NULL; + + if (!*ptr) + continue; + + block = be64_to_cpu(*ptr); + was_duplicate = 0; + error = pass->check_metalist(ip, block, &nbh, + h, &is_valid, + &was_duplicate, + pass->private); + /* check_metalist should hold any buffers + it gets with "bread". */ + if (error == meta_error) { + stack; + log_info(_("\nSerious metadata " + "error on block %llu " + "(0x%llx).\n"), + (unsigned long long)block, + (unsigned long long)block); + goto error_undo; + } + if (error == meta_skip_further) { + log_info(_("\nUnrecoverable metadata " + "error on block %llu " + "(0x%llx). Further metadata" + " will be skipped.\n"), + (unsigned long long)block, + (unsigned long long)block); + goto error_undo; + } + if (!is_valid) { + log_debug( _("Skipping rejected block " + "%llu (0x%llx)\n"), + (unsigned long long)block, + (unsigned long long)block); + if (pass->invalid_meta_is_fatal) { + error = meta_error; + goto error_undo; + } + continue; + } + /* Note that there's a special case in which + we need to process the metadata block, even + if it was a duplicate. That's for cases + where we deleted the last reference as + metadata. 
*/ + if (was_duplicate) { + log_debug( _("Skipping duplicate %llu " + "(0x%llx)\n"), + (unsigned long long)block, + (unsigned long long)block); + continue; + } + if (!valid_block_ip(ip, block)) { + log_debug( _("Skipping invalid block " + "%lld (0x%llx)\n"), + (unsigned long long)block, + (unsigned long long)block); + if (pass->invalid_meta_is_fatal) { + error = meta_error; + goto error_undo; + } + continue; + } + if (!nbh) + nbh = bread(ip->i_sbd, block); + osi_list_add_prev(&nbh->b_altlist, cur_list); + } /* for all data on the indirect block */ + } /* for blocks at that height */ + } /* for height */ + return 0; + +error_undo: /* undo what we've done so far for this block */ + if (pass->undo_check_meta == NULL) + return error; + + log_info(_("Undoing the work we did before the error on block %llu " + "(0x%llx).\n"), (unsigned long long)bh->b_blocknr, + (unsigned long long)bh->b_blocknr); + for (undoptr = (uint64_t *)(bh->b_data + head_size); undoptr < ptr && + (char *)undoptr < (bh->b_data + ip->i_sbd->bsize); + undoptr++) { + if (!*undoptr) + continue; + + block = be64_to_cpu(*undoptr); + pass->undo_check_meta(ip, block, h, pass->private); + } + return error; +} + +/** + * check_data - check all data pointers for a given buffer + * This does not include "data" blocks that are really + * hash table blocks for directories. + * + * @ip: + * + * returns: +ENOENT if there are too many bad pointers + * -1 if a more serious error occurred. 
+ * 0 if no errors occurred + * 1 if errors were found and corrected + * 2 (ENOENT) is there were too many bad pointers + */ +static int check_data(struct gfs2_inode *ip, struct metawalk_fxns *pass, + struct gfs2_buffer_head *bh, int head_size, + uint64_t *blks_checked, struct error_block *error_blk) +{ + int error = 0, rc = 0; + uint64_t block, *ptr; + uint64_t *ptr_start = (uint64_t *)(bh->b_data + head_size); + char *ptr_end = (bh->b_data + ip->i_sbd->bsize); + uint64_t metablock = bh->b_blocknr; + + /* If there isn't much pointer corruption check the pointers */ + log_debug(_("\nProcessing data blocks for inode 0x%llx, metadata " + "block 0x%llx.\n"), + (unsigned long long)ip->i_di.di_num.no_addr, + (unsigned long long)bh->b_blocknr); + for (ptr = ptr_start ; (char *)ptr < ptr_end && !fsck_abort; ptr++) { + if (!*ptr) + continue; + + if (skip_this_pass || fsck_abort) + return error; + block = be64_to_cpu(*ptr); + /* It's important that we don't call valid_block() and + bypass calling check_data on invalid blocks because that + would defeat the rangecheck_block related functions in + pass1. Therefore the individual check_data functions + should do a range check. */ + rc = pass->check_data(ip, metablock, block, pass->private, + bh, ptr); + if (rc && (!error || (rc < error))) { + log_info("\n"); + if (rc < 0) { + /* A fatal error trumps a non-fatal one. 
*/ + if ((error_blk->errblk == 0) || + (rc < error)) { + log_debug(_("Fatal error on metadata " + "block 0x%llx, offset " + "0x%x, referencing block " + "0x%llx preempts non-fatal" + " error on block 0x%llx\n"), + (unsigned long long)metablock, + (int)(ptr - ptr_start), + (unsigned long long)block, + (unsigned long long)error_blk->errblk); + error_blk->metablk = metablock; + error_blk->metaoff = ptr - ptr_start; + error_blk->errblk = block; + } + log_info(_("Unrecoverable ")); + } else { /* nonfatal error */ + if (error_blk->errblk == 0) { + error_blk->metablk = metablock; + error_blk->metaoff = ptr - ptr_start; + error_blk->errblk = block; + } + } + log_info(_("data block error %d on metadata block " + "%lld (0x%llx), offset %d (0x%x), " + "referencing data block %lld (0x%llx).\n"), + rc, (unsigned long long)metablock, + (unsigned long long)metablock, + (int)(ptr - ptr_start), + (int)(ptr - ptr_start), + (unsigned long long)block, + (unsigned long long)block); + error = rc; + } + if (rc < 0) + return rc; + (*blks_checked)++; + } + return error; +} + +static int undo_check_data(struct gfs2_inode *ip, struct metawalk_fxns *pass, + uint64_t metablock, + uint64_t *ptr_start, char *ptr_end, + struct error_block *error_blk, int error) +{ + int rc = 0; + uint64_t block, *ptr; + int found_error_blk = 0; + + /* If there isn't much pointer corruption check the pointers */ + for (ptr = ptr_start ; (char *)ptr < ptr_end && !fsck_abort; ptr++) { + if (!*ptr) + continue; + + if (skip_this_pass || fsck_abort) + return 1; + block = be64_to_cpu(*ptr); + if (metablock == error_blk->metablk && + (ptr - ptr_start == error_blk->metaoff) && + block == error_blk->errblk) { + if (error < 0) { /* A fatal error that stopped it? 
*/ + log_debug(_("Stopping the undo process: " + "fatal error block 0x%llx was " + "found at metadata block 0x%llx," + "offset 0x%x.\n"), + (unsigned long long)error_blk->errblk, + (unsigned long long)error_blk->metablk, + error_blk->metaoff); + return 1; + } + found_error_blk = 1; + log_debug(_("The non-fatal error block 0x%llx was " + "found at metadata block 0x%llx, offset " + "0x%d, but undo processing will continue " + "until the end of this metadata block.\n"), + (unsigned long long)error_blk->errblk, + (unsigned long long)error_blk->metablk, + error_blk->metaoff); + } + rc = pass->undo_check_data(ip, block, pass->private); + if (rc < 0) + return rc; + } + return found_error_blk; +} + +static int hdr_size(struct gfs2_buffer_head *bh, int height) +{ + if (height > 1) { + if (gfs2_check_meta(bh, GFS2_METATYPE_IN)) + return 0; + if (bh->sdp->gfs1) + return sizeof(struct gfs_indirect); + else + return sizeof(struct gfs2_meta_header); + } + /* if this isn't really a dinode, skip it */ + if (gfs2_check_meta(bh, GFS2_METATYPE_DI)) + return 0; + + return sizeof(struct gfs2_dinode); +} + +/** + * check_metatree + * @ip: inode structure in memory + * @pass: structure passed in from caller to determine the sub-functions + * + */ +int check_metatree(struct gfs2_inode *ip, struct metawalk_fxns *pass) +{ + osi_list_t metalist[GFS2_MAX_META_HEIGHT]; + osi_list_t *list, *tmp; + struct gfs2_buffer_head *bh; + uint32_t height = ip->i_di.di_height; + int i, head_size; + uint64_t blks_checked = 0; + int error, rc; + int metadata_clean = 0; + struct error_block error_blk = {0, 0, 0}; + int hit_error_blk = 0; + + if (!height && !is_dir(&ip->i_di, ip->i_sbd->gfs1)) + return 0; + + for (i = 0; i < GFS2_MAX_META_HEIGHT; i++) + osi_list_init(&metalist[i]); + + /* create and check the metadata list for each height */ + error = build_and_check_metalist(ip, &metalist[0], pass); + if (error) { + stack; + goto undo_metalist; + } + + metadata_clean = 1; + /* For directories, we've already 
checked the "data" blocks which + * comprise the directory hash table, so we perform the directory + * checks and exit. */ + if (is_dir(&ip->i_di, ip->i_sbd->gfs1)) { + if (!(ip->i_di.di_flags & GFS2_DIF_EXHASH)) + goto out; + /* check validity of leaf blocks and leaf chains */ + error = check_leaf_blks(ip, pass); + if (error) + goto undo_metalist; + goto out; + } + + /* check data blocks */ + list = &metalist[height - 1]; + if (ip->i_di.di_blocks > COMFORTABLE_BLKS) + last_reported_fblock = -10000000; + + for (tmp = list->next; !error && tmp != list; tmp = tmp->next) { + if (fsck_abort) { + free_metalist(ip, &metalist[0]); + return 0; + } + bh = osi_list_entry(tmp, struct gfs2_buffer_head, b_altlist); + head_size = hdr_size(bh, height); + if (!head_size) + continue; + + if (pass->check_data) + error = check_data(ip, pass, bh, head_size, + &blks_checked, &error_blk); + if (pass->big_file_msg && ip->i_di.di_blocks > COMFORTABLE_BLKS) + pass->big_file_msg(ip, blks_checked); + } + if (pass->big_file_msg && ip->i_di.di_blocks > COMFORTABLE_BLKS) { + log_notice( _("\rLarge file at %lld (0x%llx) - 100 percent " + "complete. " + "\n"), + (unsigned long long)ip->i_di.di_num.no_addr, + (unsigned long long)ip->i_di.di_num.no_addr); + fflush(stdout); + } +undo_metalist: + if (!error) + goto out; + log_err( _("Error: inode %llu (0x%llx) had unrecoverable errors at " + "metadata block %lld (0x%llx), offset %d (0x%x), block " + "%lld (0x%llx).\n"), + (unsigned long long)ip->i_di.di_num.no_addr, + (unsigned long long)ip->i_di.di_num.no_addr, + (unsigned long long)error_blk.metablk, + (unsigned long long)error_blk.metablk, + error_blk.metaoff, error_blk.metaoff, + (unsigned long long)error_blk.errblk, + (unsigned long long)error_blk.errblk); + if (!query( _("Remove the invalid inode? 
(y/n) "))) { + free_metalist(ip, &metalist[0]); + log_err(_("Invalid inode not deleted.\n")); + return error; + } + for (i = 0; pass->undo_check_meta && i < height; i++) { + while (!osi_list_empty(&metalist[i])) { + list = &metalist[i]; + bh = osi_list_entry(list->next, + struct gfs2_buffer_head, + b_altlist); + log_err(_("Undoing metadata work for block %llu " + "(0x%llx)\n"), + (unsigned long long)bh->b_blocknr, + (unsigned long long)bh->b_blocknr); + if (i) + rc = pass->undo_check_meta(ip, bh->b_blocknr, + i, pass->private); + else + rc = 0; + if (metadata_clean && rc == 0 && i == height - 1 && + !hit_error_blk) { + head_size = hdr_size(bh, height); + if (head_size) { + rc = undo_check_data(ip, pass, + bh->b_blocknr, + (uint64_t *) + (bh->b_data + head_size), + (bh->b_data + ip->i_sbd->bsize), + &error_blk, + error); + if (rc > 0) { + hit_error_blk = 1; + log_err("Reached the error " + "block undoing work " + "for inode %lld " + "(0x%llx).\n", + (unsigned long long)ip->i_di.di_num.no_addr, + (unsigned long long)ip->i_di.di_num.no_addr); + rc = 0; + } + } + } + if (bh == ip->i_bh) + osi_list_del(&bh->b_altlist); + else + brelse(bh); + } + } + /* There may be leftover duplicate records, so we need to delete them. + For example, if a metadata block was found to be a duplicate, we + may not have added it to the metalist, which means it's not there + to undo. 
*/ + delete_all_dups(ip); + /* Set the dinode as "bad" so it gets deleted */ + fsck_bitmap_set(ip, ip->i_di.di_num.no_addr, _("corrupt"), + GFS2_BLKST_FREE); + log_err(_("The corrupt inode was invalidated.\n")); +out: + free_metalist(ip, &metalist[0]); + return error; +} + +/* Checks stuffed inode directories */ +int check_linear_dir(struct gfs2_inode *ip, struct gfs2_buffer_head *bh, + struct metawalk_fxns *pass) +{ + int error = 0; + uint32_t count = 0; + + error = check_entries(ip, bh, DIR_LINEAR, &count, 0, pass); + if (error < 0) { + stack; + return -1; + } + + return error; +} + +int check_dir(struct gfs2_sbd *sdp, struct gfs2_inode *ip, struct metawalk_fxns *pass) +{ + int error = 0; + + if (ip->i_di.di_flags & GFS2_DIF_EXHASH) + error = check_leaf_blks(ip, pass); + else + error = check_linear_dir(ip, ip->i_bh, pass); + + if (error < 0) + stack; + + return error; +} diff --git a/gfs2/fsck/metawalk.h b/gfs2/fsck/metawalk.h new file mode 100644 index 0000000..119efee --- /dev/null +++ b/gfs2/fsck/metawalk.h @@ -0,0 +1,127 @@ +#ifndef _METAWALK_H +#define _METAWALK_H + +#define DIR_LINEAR 1 +#define DIR_EXHASH 2 + +#include "util.h" + +struct metawalk_fxns; + +extern int check_inode_eattr(struct gfs2_inode *ip, + struct metawalk_fxns *pass); +extern int check_metatree(struct gfs2_inode *ip, struct metawalk_fxns *pass); +extern int check_leaf_blks(struct gfs2_inode *ip, struct metawalk_fxns *pass); +extern int check_dir(struct gfs2_sbd *sdp, struct gfs2_inode *ip, + struct metawalk_fxns *pass); +extern int check_linear_dir(struct gfs2_inode *ip, struct gfs2_buffer_head *bh, + struct metawalk_fxns *pass); +extern int check_leaf(struct gfs2_inode *ip, int lindex, + struct metawalk_fxns *pass, uint64_t *leaf_no, + struct gfs2_leaf *leaf, int *ref_count); +extern int _fsck_bitmap_set(struct gfs2_inode *ip, uint64_t bblock, + const char *btype, int mark, int error_on_dinode, + const char *caller, int line); +extern int check_n_fix_bitmap(struct gfs2_sbd *sdp, struct 
rgrp_tree *rgd, + uint64_t blk, int error_on_dinode, + int new_state); +extern struct duptree *dupfind(uint64_t block); +extern struct gfs2_inode *fsck_system_inode(struct gfs2_sbd *sdp, + uint64_t block); + +#define is_duplicate(dblock) ((dupfind(dblock)) ? 1 : 0) + +#define fsck_bitmap_set(ip, b, bt, m) \ + _fsck_bitmap_set(ip, b, bt, m, 0, __FUNCTION__, __LINE__) +#define fsck_bitmap_set_noino(ip, b, bt, m) \ + _fsck_bitmap_set(ip, b, bt, m, 1, __FUNCTION__, __LINE__) +enum meta_check_rc { + meta_error = -1, + meta_is_good = 0, + meta_skip_further = 1, +}; + +/* metawalk_fxns: function pointers to check various parts of the fs + * + * The functions should return -1 on fatal errors, 1 if the block + * should be skipped, and 0 on success + * + * private: Data that should be passed to the fxns + * check_leaf: + * check_metalist: + * check_data: + * check_eattr_indir: + * check_eattr_leaf: + * check_dentry: + * check_eattr_entry: + * check_eattr_extentry: + */ +struct metawalk_fxns { + void *private; + int invalid_meta_is_fatal; + int readahead; + int (*check_leaf_depth) (struct gfs2_inode *ip, uint64_t leaf_no, + int ref_count, struct gfs2_buffer_head *lbh); + int (*check_leaf) (struct gfs2_inode *ip, uint64_t block, + void *private); + /* parameters to the check_metalist sub-functions: + ip: incore inode pointer + block: block number of the metadata block to be checked + bh: buffer_head to be returned + h: height + is_valid: returned as 1 if the metadata block is valid and should + be added to the metadata list for further processing. + was_duplicate: returns as 1 if the metadata block was determined + to be a duplicate reference, in which case we want to + skip adding it to the metadata list. 
+ private: Pointer to pass-specific data + returns: 0 - everything is good, but there may be duplicates + 1 - skip further processing + */ + int (*check_metalist) (struct gfs2_inode *ip, uint64_t block, + struct gfs2_buffer_head **bh, int h, + int *is_valid, int *was_duplicate, + void *private); + int (*check_data) (struct gfs2_inode *ip, uint64_t metablock, + uint64_t block, void *private, + struct gfs2_buffer_head *bh, uint64_t *ptr); + int (*check_eattr_indir) (struct gfs2_inode *ip, uint64_t block, + uint64_t parent, + struct gfs2_buffer_head **bh, void *private); + int (*check_eattr_leaf) (struct gfs2_inode *ip, uint64_t block, + uint64_t parent, struct gfs2_buffer_head **bh, + void *private); + int (*check_dentry) (struct gfs2_inode *ip, struct gfs2_dirent *de, + struct gfs2_dirent *prev, + struct gfs2_buffer_head *bh, + char *filename, uint32_t *count, + int *lindex, void *private); + int (*check_eattr_entry) (struct gfs2_inode *ip, + struct gfs2_buffer_head *leaf_bh, + struct gfs2_ea_header *ea_hdr, + struct gfs2_ea_header *ea_hdr_prev, + void *private); + int (*check_eattr_extentry) (struct gfs2_inode *ip, int i, + uint64_t *ea_data_ptr, + struct gfs2_buffer_head *leaf_bh, + uint32_t tot_ealen, + struct gfs2_ea_header *ea_hdr, + struct gfs2_ea_header *ea_hdr_prev, + void *private); + int (*finish_eattr_indir) (struct gfs2_inode *ip, int leaf_pointers, + int leaf_pointer_errors, void *private); + void (*big_file_msg) (struct gfs2_inode *ip, uint64_t blks_checked); + int (*check_hash_tbl) (struct gfs2_inode *ip, uint64_t *tbl, + unsigned hsize, void *private); + int (*repair_leaf) (struct gfs2_inode *ip, uint64_t *leaf_no, + int lindex, int ref_count, const char *msg); + int (*undo_check_meta) (struct gfs2_inode *ip, uint64_t block, + int h, void *private); + int (*undo_check_data) (struct gfs2_inode *ip, uint64_t block, + void *private); + int (*delete_block) (struct gfs2_inode *ip, uint64_t block, + struct gfs2_buffer_head **bh, const char *btype, + void 
*private); +}; + +#endif /* _METAWALK_H */ diff --git a/gfs2/fsck/pass1.c b/gfs2/fsck/pass1.c new file mode 100644 index 0000000..3c2f74f --- /dev/null +++ b/gfs2/fsck/pass1.c @@ -0,0 +1,2229 @@ +/* pass1 checks inodes for format & type, duplicate blocks, & incorrect + * block count. + * + * It builds up tables that contains the state of each block (free, + * block in use, metadata type, etc), as well as bad blocks and + * duplicate blocks. (See block_list.[ch] for more info) + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#define _(String) gettext(String) + +#include +#include "libgfs2.h" +#include "fsck.h" +#include "inode_hash.h" +#include "util.h" +#include "link.h" +#include "metawalk.h" +#include "fs_recovery.h" + +struct special_blocks gfs1_rindex_blks; +struct gfs2_bmap *bl = NULL; + +struct block_count { + uint64_t indir_count; + uint64_t data_count; + uint64_t ea_count; +}; + +static int p1check_leaf(struct gfs2_inode *ip, uint64_t block, void *private); +static int check_metalist(struct gfs2_inode *ip, uint64_t block, + struct gfs2_buffer_head **bh, int h, int *is_valid, + int *was_duplicate, void *private); +static int undo_check_metalist(struct gfs2_inode *ip, uint64_t block, + int h, void *private); +static int check_data(struct gfs2_inode *ip, uint64_t metablock, + uint64_t block, void *private, + struct gfs2_buffer_head *bh, uint64_t *ptr); +static int undo_check_data(struct gfs2_inode *ip, uint64_t block, + void *private); +static int check_eattr_indir(struct gfs2_inode *ip, uint64_t indirect, + uint64_t parent, struct gfs2_buffer_head **bh, + void *private); +static int check_eattr_leaf(struct gfs2_inode *ip, uint64_t block, + uint64_t parent, struct gfs2_buffer_head **bh, + void *private); +static int check_eattr_entries(struct gfs2_inode *ip, + struct gfs2_buffer_head *leaf_bh, + struct gfs2_ea_header *ea_hdr, + struct gfs2_ea_header *ea_hdr_prev, + void *private); +static 
int check_extended_leaf_eattr(struct gfs2_inode *ip, int i, + uint64_t *data_ptr, + struct gfs2_buffer_head *leaf_bh, + uint32_t tot_ealen, + struct gfs2_ea_header *ea_hdr, + struct gfs2_ea_header *ea_hdr_prev, + void *private); +static int finish_eattr_indir(struct gfs2_inode *ip, int leaf_pointers, + int leaf_pointer_errors, void *private); +static int invalidate_metadata(struct gfs2_inode *ip, uint64_t block, + struct gfs2_buffer_head **bh, int h, + int *is_valid, int *was_duplicate, + void *private); +static int invalidate_leaf(struct gfs2_inode *ip, uint64_t block, + void *private); +static int invalidate_data(struct gfs2_inode *ip, uint64_t metablock, + uint64_t block, void *private, + struct gfs2_buffer_head *bh, uint64_t *ptr); +static int invalidate_eattr_indir(struct gfs2_inode *ip, uint64_t block, + uint64_t parent, + struct gfs2_buffer_head **bh, + void *private); +static int invalidate_eattr_leaf(struct gfs2_inode *ip, uint64_t block, + uint64_t parent, struct gfs2_buffer_head **bh, + void *private); +static int handle_ip(struct gfs2_sbd *sdp, struct gfs2_inode *ip); +static int delete_block(struct gfs2_inode *ip, uint64_t block, + struct gfs2_buffer_head **bh, const char *btype, + void *private); + +static int gfs2_blockmap_set(struct gfs2_bmap *bmap, uint64_t bblock, int mark) +{ + static unsigned char *byte; + static uint64_t b; + + if (!bmap) + return 0; + if (bblock > bmap->size) + return -1; + + byte = bmap->map + BLOCKMAP_SIZE2(bblock); + b = BLOCKMAP_BYTE_OFFSET2(bblock); + *byte &= ~(BLOCKMAP_MASK2 << b); + *byte |= (mark & BLOCKMAP_MASK2) << b; + return 0; +} + +/* + * _fsck_blockmap_set - Mark a block in the 4-bit blockmap and the 2-bit + * bitmap, and adjust free space accordingly. 
+ */ +static int _fsck_blockmap_set(struct gfs2_inode *ip, uint64_t bblock, + const char *btype, int mark, int error_on_dinode, + const char *caller, int fline) +{ + int error = _fsck_bitmap_set(ip, bblock, btype, mark, error_on_dinode, + caller, fline); + if (error) + return error; + + return gfs2_blockmap_set(bl, bblock, mark); +} + +#define fsck_blockmap_set(ip, b, bt, m) \ + _fsck_blockmap_set(ip, b, bt, m, 0, __FUNCTION__, __LINE__) +#define fsck_blkmap_set_noino(ip, b, bt, m) \ + _fsck_blockmap_set(ip, b, bt, m, 1, __FUNCTION__, __LINE__) + +/** + * delete_block - delete a block associated with an inode + */ +static int delete_block(struct gfs2_inode *ip, uint64_t block, + struct gfs2_buffer_head **bh, const char *btype, + void *private) +{ + if (valid_block_ip(ip, block)) { + fsck_blockmap_set(ip, block, btype, GFS2_BLKST_FREE); + return 0; + } + return -1; +} + +/* This is a pass1-specific leaf repair. Since we are not allowed to do + * block allocations, we do what we can. */ +static int pass1_repair_leaf(struct gfs2_inode *ip, uint64_t *leaf_no, + int lindex, int ref_count, const char *msg) +{ + uint64_t *cpyptr; + char *padbuf; + int pad_size, i; + + log_err( _("Directory Inode %llu (0x%llx) points to leaf %llu" + " (0x%llx) %s.\n"), + (unsigned long long)ip->i_di.di_num.no_addr, + (unsigned long long)ip->i_di.di_num.no_addr, + (unsigned long long)*leaf_no, + (unsigned long long)*leaf_no, msg); + if (!query( _("Attempt to patch around it? 
(y/n) "))) { + log_err( _("Bad leaf left in place.\n")); + goto out; + } + + padbuf = malloc(ref_count * sizeof(uint64_t)); + cpyptr = (uint64_t *)padbuf; + for (i = 0; i < ref_count; i++) { + *cpyptr = 0; + cpyptr++; + } + pad_size = ref_count * sizeof(uint64_t); + log_err(_("Writing zeros to the hash table of directory %lld " + "(0x%llx) at index: 0x%x for 0x%x pointers.\n"), + (unsigned long long)ip->i_di.di_num.no_addr, + (unsigned long long)ip->i_di.di_num.no_addr, lindex, + ref_count); + if (ip->i_sbd->gfs1) + gfs1_writei(ip, padbuf, lindex * sizeof(uint64_t), pad_size); + else + gfs2_writei(ip, padbuf, lindex * sizeof(uint64_t), pad_size); + free(padbuf); + log_err( _("Directory Inode %llu (0x%llx) patched.\n"), + (unsigned long long)ip->i_di.di_num.no_addr, + (unsigned long long)ip->i_di.di_num.no_addr); + +out: + *leaf_no = 0; + return 0; +} + +struct metawalk_fxns pass1_fxns = { + .private = NULL, + .check_leaf = p1check_leaf, + .check_metalist = check_metalist, + .check_data = check_data, + .check_eattr_indir = check_eattr_indir, + .check_eattr_leaf = check_eattr_leaf, + .check_dentry = NULL, + .check_eattr_entry = check_eattr_entries, + .check_eattr_extentry = check_extended_leaf_eattr, + .big_file_msg = big_file_comfort, + .repair_leaf = pass1_repair_leaf, + .undo_check_meta = undo_check_metalist, + .undo_check_data = undo_check_data, + .delete_block = delete_block, +}; + +struct metawalk_fxns invalidate_fxns = { + .private = NULL, + .check_metalist = invalidate_metadata, + .check_data = invalidate_data, + .check_leaf = invalidate_leaf, + .check_eattr_indir = invalidate_eattr_indir, + .check_eattr_leaf = invalidate_eattr_leaf, + .delete_block = delete_block, +}; + +/* + * resuscitate_metalist - make sure a system directory entry's metadata blocks + * are marked "in use" in the bitmap. + * + * This function makes sure metadata blocks for system and root directories are + * marked "in use" by the bitmap. 
You don't want root's indirect blocks + * deleted, do you? Or worse, reused for lost+found. + */ +static int resuscitate_metalist(struct gfs2_inode *ip, uint64_t block, + struct gfs2_buffer_head **bh, int h, + int *is_valid, int *was_duplicate, + void *private) +{ + struct block_count *bc = (struct block_count *)private; + + *is_valid = 1; + *was_duplicate = 0; + *bh = NULL; + if (!valid_block_ip(ip, block)){ /* blk outside of FS */ + fsck_blockmap_set(ip, ip->i_di.di_num.no_addr, + _("itself"), GFS2_BLKST_UNLINKED); + log_err( _("Bad indirect block pointer (invalid or out of " + "range) found in system inode %lld (0x%llx).\n"), + (unsigned long long)ip->i_di.di_num.no_addr, + (unsigned long long)ip->i_di.di_num.no_addr); + *is_valid = 0; + return meta_is_good; + } + if (fsck_system_inode(ip->i_sbd, block)) + fsck_blockmap_set(ip, block, _("system file"), + ip->i_sbd->gfs1 ? + GFS2_BLKST_DINODE : GFS2_BLKST_USED); + else + check_n_fix_bitmap(ip->i_sbd, ip->i_rgd, block, 0, + ip->i_sbd->gfs1 ? + GFS2_BLKST_DINODE : GFS2_BLKST_USED); + bc->indir_count++; + return meta_is_good; +} + +/* + * resuscitate_dentry - make sure a system directory entry is alive + * + * This function makes sure directory entries in system directories are + * kept alive. You don't want journal0 deleted from jindex, do you? 
+ */ +static int resuscitate_dentry(struct gfs2_inode *ip, struct gfs2_dirent *dent, + struct gfs2_dirent *prev_de, + struct gfs2_buffer_head *bh, char *filename, + uint32_t *count, int *lindex, void *priv) +{ + struct gfs2_sbd *sdp = ip->i_sbd; + struct gfs2_dirent dentry, *de; + char tmp_name[PATH_MAX]; + uint64_t block; + + memset(&dentry, 0, sizeof(struct gfs2_dirent)); + gfs2_dirent_in(&dentry, (char *)dent); + de = &dentry; + block = de->de_inum.no_addr; + /* Start of checks */ + memset(tmp_name, 0, sizeof(tmp_name)); + if (de->de_name_len < sizeof(tmp_name)) + strncpy(tmp_name, filename, de->de_name_len); + else + strncpy(tmp_name, filename, sizeof(tmp_name) - 1); + if (!valid_block_ip(ip, block)) { + log_err( _("Block # referenced by system directory entry %s " + "in inode %lld (0x%llx) is invalid or out of range;" + " ignored.\n"), + tmp_name, (unsigned long long)ip->i_di.di_num.no_addr, + (unsigned long long)ip->i_di.di_num.no_addr); + return 0; + } + /* If this is a system dinode, we'll handle it later in + check_system_inodes. If not, it'll be handled by pass1 but + since it's in a system directory we need to make sure it's + represented in the rgrp bitmap. */ + if (fsck_system_inode(sdp, block)) + fsck_blockmap_set(ip, block, _("system file"), + GFS2_BLKST_DINODE); + else + check_n_fix_bitmap(sdp, ip->i_rgd, block, 0, + GFS2_BLKST_DINODE); + /* Return the number of leaf entries so metawalk doesn't flag this + leaf as having none. 
*/ + *count = be16_to_cpu(((struct gfs2_leaf *)bh->b_data)->lf_entries); + return 0; +} + +struct metawalk_fxns sysdir_fxns = { + .private = NULL, + .check_metalist = resuscitate_metalist, + .check_dentry = resuscitate_dentry, + .delete_block = delete_block, +}; + +static int p1check_leaf(struct gfs2_inode *ip, uint64_t block, void *private) +{ + struct block_count *bc = (struct block_count *) private; + int q; + + /* Note if we've gotten this far, the block has already passed the + check in metawalk: gfs2_check_meta(lbh, GFS2_METATYPE_LF). + So we know it's a leaf block. */ + bc->indir_count++; + q = block_type(bl, block); + if (q != GFS2_BLKST_FREE) { + log_err( _("Found duplicate block #%llu (0x%llx) referenced " + "as a directory leaf in dinode " + "%llu (0x%llx) - was marked %d (%s)\n"), + (unsigned long long)block, + (unsigned long long)block, + (unsigned long long)ip->i_di.di_num.no_addr, + (unsigned long long)ip->i_di.di_num.no_addr, q, + block_type_string(q)); + add_duplicate_ref(ip, block, ref_as_meta, 0, INODE_VALID); + if (q == (ip->i_sbd->gfs1 ? GFS2_BLKST_DINODE : + GFS2_BLKST_USED)) + /* If the previous reference also saw this as a leaf, + it was already checked, so don't check again. */ + return EEXIST; /* non-fatal */ + } + fsck_blockmap_set(ip, block, _("directory leaf"), + ip->i_sbd->gfs1 ? GFS2_BLKST_DINODE : + GFS2_BLKST_USED); + return 0; +} + +static int check_metalist(struct gfs2_inode *ip, uint64_t block, + struct gfs2_buffer_head **bh, int h, int *is_valid, + int *was_duplicate, void *private) +{ + int q; + int iblk_type; + struct gfs2_buffer_head *nbh; + struct block_count *bc = (struct block_count *)private; + const char *blktypedesc; + + *bh = NULL; + + *was_duplicate = 0; + *is_valid = 0; + if (!valid_block_ip(ip, block)) { /* blk outside of FS */ + /* The bad dinode should be invalidated later due to + "unrecoverable" errors. The inode itself should be + set "free" and removed from the inodetree by + undo_check_metalist. 
*/ + fsck_blockmap_set(ip, ip->i_di.di_num.no_addr, + _("bad block referencing"), GFS2_BLKST_UNLINKED); + log_debug( _("Bad indirect block (invalid/out of range) " + "found in inode %lld (0x%llx).\n"), + (unsigned long long)ip->i_di.di_num.no_addr, + (unsigned long long)ip->i_di.di_num.no_addr); + + return meta_skip_further; + } + if (is_dir(&ip->i_di, ip->i_sbd->gfs1) && h == ip->i_di.di_height) { + iblk_type = GFS2_METATYPE_JD; + blktypedesc = _("a directory hash table block"); + } else { + iblk_type = GFS2_METATYPE_IN; + blktypedesc = _("a journaled data block"); + } + q = block_type(bl, block); + if (q != GFS2_BLKST_FREE) { + log_err( _("Found duplicate block #%llu (0x%llx) referenced " + "as metadata in indirect block for dinode " + "%llu (0x%llx) - was marked %d (%s)\n"), + (unsigned long long)block, + (unsigned long long)block, + (unsigned long long)ip->i_di.di_num.no_addr, + (unsigned long long)ip->i_di.di_num.no_addr, q, + block_type_string(q)); + *was_duplicate = 1; + } + nbh = bread(ip->i_sbd, block); + + *is_valid = (gfs2_check_meta(nbh, iblk_type) == 0); + + if (!(*is_valid)) { + log_err( _("Inode %lld (0x%llx) has a bad indirect block " + "pointer %lld (0x%llx) (points to something " + "that is not %s).\n"), + (unsigned long long)ip->i_di.di_num.no_addr, + (unsigned long long)ip->i_di.di_num.no_addr, + (unsigned long long)block, + (unsigned long long)block, blktypedesc); + brelse(nbh); + return meta_skip_further; + } + + bc->indir_count++; + if (*was_duplicate) { + add_duplicate_ref(ip, block, ref_as_meta, 0, + *is_valid ? INODE_VALID : INODE_INVALID); + brelse(nbh); + } else { + *bh = nbh; + fsck_blockmap_set(ip, block, _("indirect"), ip->i_sbd->gfs1 ? + GFS2_BLKST_DINODE : GFS2_BLKST_USED); + } + + if (*is_valid) + return meta_is_good; + return meta_skip_further; +} + +/* undo_reference - undo previously processed data or metadata + * We've treated the metadata for this dinode as good so far, but not we + * realize it's bad. 
So we need to undo what we've done. + * + * Returns: 0 - We need to process the block as metadata. In other words, + * we need to undo any blocks it refers to. + * 1 - We can't process the block as metadata. + */ + +static int undo_reference(struct gfs2_inode *ip, uint64_t block, int meta, + void *private) +{ + struct block_count *bc = (struct block_count *)private; + struct duptree *dt; + struct inode_with_dups *id; + int old_bitmap_state = 0; + struct rgrp_tree *rgd; + + if (!valid_block_ip(ip, block)) { /* blk outside of FS */ + fsck_blockmap_set(ip, ip->i_di.di_num.no_addr, + _("bad block referencing"), GFS2_BLKST_FREE); + return 1; + } + + if (meta) + bc->indir_count--; + dt = dupfind(block); + if (dt) { + /* remove all duplicate reference structures from this inode */ + do { + id = find_dup_ref_inode(dt, ip); + if (!id) + break; + + dup_listent_delete(dt, id); + } while (id); + + if (dt->refs) { + log_err(_("Block %llu (0x%llx) is still referenced " + "from another inode; not freeing.\n"), + (unsigned long long)block, + (unsigned long long)block); + if (dt->refs == 1) { + log_err(_("This was the only duplicate " + "reference so far; removing it.\n")); + dup_delete(dt); + } + return 1; + } + } + if (!meta) { + rgd = gfs2_blk2rgrpd(ip->i_sbd, block); + old_bitmap_state = lgfs2_get_bitmap(ip->i_sbd, block, rgd); + if (old_bitmap_state == GFS2_BLKST_DINODE) + return -1; + } + fsck_blockmap_set(ip, block, + meta ? _("bad indirect") : _("referenced data"), + GFS2_BLKST_FREE); + return 0; +} + +static int undo_check_metalist(struct gfs2_inode *ip, uint64_t block, + int h, void *private) +{ + return undo_reference(ip, block, 1, private); +} + +static int undo_check_data(struct gfs2_inode *ip, uint64_t block, + void *private) +{ + return undo_reference(ip, block, 0, private); +} + +/* blockmap_set_as_data - set block as 'data' in the blockmap, if not dinode + * + * This function tries to set a block that's referenced as data as 'data' + * in the fsck blockmap. 
But if that block is marked as 'dinode' in the + * rgrp bitmap, it does additional checks to see if it looks like a dinode. + * Note that previous checks were done for duplicate references, so this + * is checking for dinodes that we haven't processed yet. + */ +static int blockmap_set_as_data(struct gfs2_inode *ip, uint64_t block) +{ + int error; + struct gfs2_buffer_head *bh; + struct gfs2_dinode *di; + + error = fsck_blkmap_set_noino(ip, block, _("data"), GFS2_BLKST_USED); + if (!error) + return 0; + + error = 0; + /* The bitmap says it's a dinode, but a block reference begs to differ. + So which is it? */ + bh = bread(ip->i_sbd, block); + if (gfs2_check_meta(bh, GFS2_METATYPE_DI) != 0) + goto out; + + /* The meta header agrees it's a dinode. But it might be data in + disguise, so do some extra checks. */ + di = (struct gfs2_dinode *)bh->b_data; + if (be64_to_cpu(di->di_num.no_addr) != block) + goto out; + + log_err(_("Inode %lld (0x%llx) has a reference to block %lld (0x%llx) " + "as a data block, but it appears to be a dinode we " + "haven't checked yet.\n"), + (unsigned long long)ip->i_di.di_num.no_addr, + (unsigned long long)ip->i_di.di_num.no_addr, + (unsigned long long)block, (unsigned long long)block); + error = -1; +out: + if (!error) + fsck_blockmap_set(ip, block, _("data"), GFS2_BLKST_USED); + brelse(bh); + return error; +} + +static int check_data(struct gfs2_inode *ip, uint64_t metablock, + uint64_t block, void *private, + struct gfs2_buffer_head *bbh, uint64_t *ptr) +{ + int q; + struct block_count *bc = (struct block_count *) private; + + if (!valid_block_ip(ip, block)) { + log_err( _("inode %lld (0x%llx) has a bad data block pointer " + "%lld (0x%llx) (invalid or out of range) "), + (unsigned long long)ip->i_di.di_num.no_addr, + (unsigned long long)ip->i_di.di_num.no_addr, + (unsigned long long)block, (unsigned long long)block); + if (metablock == ip->i_di.di_num.no_addr) + log_err("\n"); + else + log_err(_("from metadata block %llu (0x%llx)\n"), 
+ (unsigned long long)metablock, + (unsigned long long)metablock); + /* Mark the owner of this block with the bad_block + * designator so we know to check it for out of range + * blocks later */ + fsck_blockmap_set(ip, ip->i_di.di_num.no_addr, + _("bad (out of range) data"), + GFS2_BLKST_UNLINKED); + return -1; + } + bc->data_count++; /* keep the count sane anyway */ + q = block_type(bl, block); + if (q != GFS2_BLKST_FREE) { + struct gfs2_buffer_head *bh; + struct gfs2_meta_header mh; + + log_err( _("Found duplicate %s block %llu (0x%llx) " + "referenced as data by dinode %llu (0x%llx) "), + block_type_string(q), + (unsigned long long)block, + (unsigned long long)block, + (unsigned long long)ip->i_di.di_num.no_addr, + (unsigned long long)ip->i_di.di_num.no_addr); + if (metablock == ip->i_di.di_num.no_addr) + log_err("\n"); + else + log_err(_("from metadata block %llu (0x%llx)\n"), + (unsigned long long)metablock, + (unsigned long long)metablock); + + switch (q) { + case GFS2_BLKST_DINODE: + log_info(_("The block was processed earlier as an " + "inode, so it can't possibly be data.\n")); + /* We still need to add a duplicate record here because + when check_metatree tries to delete the inode, we + can't have the "undo" functions freeing the block + out from other the original referencing inode. 
*/ + add_duplicate_ref(ip, block, ref_as_data, 0, + INODE_VALID); + return 1; + case GFS2_BLKST_USED: /* tough decision: May be data or meta */ + bh = bread(ip->i_sbd, block); + gfs2_meta_header_in(&mh, bh->b_data); + brelse(bh); + if (mh.mh_magic == GFS2_MAGIC && + mh.mh_type >= GFS2_METATYPE_RG && + mh.mh_type <= GFS2_METATYPE_QC && + mh.mh_type != GFS2_METATYPE_DI && + mh.mh_format % 100 == 0) { + log_info(_("The block was processed earlier " + "as valid metadata, so it can't " + "possibly be data.\n")); + /* We still need to add a duplicate record here + because when check_metatree tries to delete + the inode, we can't have the "undo" + functions freeing the block out from other + the original referencing inode. */ + add_duplicate_ref(ip, block, ref_as_data, 0, + INODE_VALID); + return 1; + } + log_info( _("Seems to be a normal duplicate; I'll " + "sort it out in pass1b.\n")); + add_duplicate_ref(ip, block, ref_as_data, 0, + INODE_VALID); + /* This inode references the block as data. So if this + all is validated, we want to keep this count. */ + return 0; + case GFS2_BLKST_UNLINKED: + log_info( _("The block was invalid as metadata but might be " + "okay as data. I'll sort it out in pass1b.\n")); + add_duplicate_ref(ip, block, ref_as_data, 0, INODE_VALID); + return 0; + } + } + /* In gfs1, rgrp indirect blocks are marked in the bitmap as "meta". + In gfs2, "meta" is only for dinodes. So here we dummy up the + blocks so that the bitmap isn't changed improperly. 
*/ + if (ip->i_sbd->gfs1 && ip == ip->i_sbd->md.riinode) { + log_info(_("Block %lld (0x%llx) is a GFS1 rindex block\n"), + (unsigned long long)block, (unsigned long long)block); + gfs2_special_set(&gfs1_rindex_blks, block); + fsck_blockmap_set(ip, block, _("rgrp"), GFS2_BLKST_DINODE); + /*gfs2_meta_rgrp);*/ + } else if (ip->i_sbd->gfs1 && ip->i_di.di_flags & GFS2_DIF_JDATA) { + log_info(_("Block %lld (0x%llx) is a GFS1 journaled data " + "block\n"), + (unsigned long long)block, (unsigned long long)block); + fsck_blockmap_set(ip, block, _("jdata"), GFS2_BLKST_DINODE); + } else + return blockmap_set_as_data(ip, block); + return 0; +} + +static int ask_remove_inode_eattr(struct gfs2_inode *ip, + struct block_count *bc) +{ + if (ip->i_di.di_eattr == 0) + return 0; /* eattr was removed prior to this call */ + log_err( _("Inode %lld (0x%llx) has unrecoverable Extended Attribute " + "errors.\n"), (unsigned long long)ip->i_di.di_num.no_addr, + (unsigned long long)ip->i_di.di_num.no_addr); + if (query( _("Clear all Extended Attributes from the inode? (y/n) "))){ + undo_reference(ip, ip->i_di.di_eattr, 0, bc); + ip->i_di.di_eattr = 0; + bc->ea_count = 0; + ip->i_di.di_blocks = 1 + bc->indir_count + bc->data_count; + ip->i_di.di_flags &= ~GFS2_DIF_EA_INDIRECT; + bmodified(ip->i_bh); + log_err( _("Extended attributes were removed.\n")); + } else { + log_err( _("Extended attributes were not removed.\n")); + } + return 0; +} + +static int undo_eattr_indir_or_leaf(struct gfs2_inode *ip, uint64_t block, + uint64_t parent, + struct gfs2_buffer_head **bh, + void *private) +{ + struct gfs2_sbd *sdp = ip->i_sbd; + int q; + int error; + struct block_count *bc = (struct block_count *) private; + + if (!valid_block_ip(ip, block)) + return meta_error; + + /* Need to check block_type before undoing the reference, which can + set it to free, which would cause the test below to fail. 
*/ + q = block_type(bl, block); + + error = undo_reference(ip, block, 0, private); + if (error) + return error; + + bc->ea_count--; + + if (q != (sdp->gfs1 ? GFS2_BLKST_DINODE : GFS2_BLKST_USED)) + return 1; + + *bh = bread(sdp, block); + return 0; +} + +/* complain_eas - complain about extended attribute errors for an inode + * + * @ip - in core inode pointer + * block - the block that had the problem + * duplicate - if this is a duplicate block, don't set it "free" + * emsg - what to tell the user about the eas being checked + * Returns: 1 if the EA is fixed, else 0 if it was not fixed. + */ +static void complain_eas(struct gfs2_inode *ip, uint64_t block, + const char *emsg) +{ + log_err(_("Inode #%llu (0x%llx): %s"), + (unsigned long long)ip->i_di.di_num.no_addr, + (unsigned long long)ip->i_di.di_num.no_addr, emsg); + log_err(_(" at block #%lld (0x%llx).\n"), + (unsigned long long)block, (unsigned long long)block); +} + +static int check_eattr_indir(struct gfs2_inode *ip, uint64_t indirect, + uint64_t parent, struct gfs2_buffer_head **bh, + void *private) +{ + struct gfs2_sbd *sdp = ip->i_sbd; + int ret = 0; + int q; + struct block_count *bc = (struct block_count *) private; + + /* This inode contains an eattr - it may be invalid, but the + * eattr attributes points to a non-zero block */ + if (!valid_block_ip(ip, indirect)) { + /* Doesn't help to mark this here - this gets checked + * in pass1c */ + return 1; + } + q = block_type(bl, indirect); + + /* Special duplicate processing: If we have an EA block, + check if it really is an EA. If it is, let duplicate + handling sort it out. If it isn't, clear it but don't + count it as a duplicate. */ + *bh = bread(sdp, indirect); + if (gfs2_check_meta(*bh, GFS2_METATYPE_IN)) { + bc->ea_count++; + if (q != GFS2_BLKST_FREE) { /* Duplicate? 
*/ + add_duplicate_ref(ip, indirect, ref_as_ea, 0, + INODE_VALID); + complain_eas(ip, indirect, + _("Bad indirect Extended Attribute " + "duplicate found")); + /* Return 0 here because if all that's wrong is a + duplicate block reference, we want pass1b to figure + it out. We don't want to delete all the extended + attributes as if they are in error. */ + return 0; + } + complain_eas(ip, indirect, + _("Extended Attribute indirect block has " + "incorrect type")); + return 1; + } + if (q != GFS2_BLKST_FREE) { /* Duplicate? */ + add_duplicate_ref(ip, indirect, ref_as_ea, 0, INODE_VALID); + complain_eas(ip, indirect, + _("Duplicate Extended Attribute indirect block")); + bc->ea_count++; + ret = 0; /* For the same reason stated above. */ + } else { + fsck_blockmap_set(ip, indirect, + _("indirect Extended Attribute"), sdp->gfs1 ? + GFS2_BLKST_DINODE : GFS2_BLKST_USED); + bc->ea_count++; + } + return ret; +} + +static int finish_eattr_indir(struct gfs2_inode *ip, int leaf_pointers, + int leaf_pointer_errors, void *private) +{ + struct block_count *bc = (struct block_count *) private; + + if (leaf_pointer_errors == leaf_pointers) /* All eas were bad */ + return ask_remove_inode_eattr(ip, bc); + log_debug( _("Marking inode #%llu (0x%llx) with extended " + "attribute block\n"), + (unsigned long long)ip->i_di.di_num.no_addr, + (unsigned long long)ip->i_di.di_num.no_addr); + if (!leaf_pointer_errors) + return 0; + log_err( _("Inode %lld (0x%llx) has recoverable indirect " + "Extended Attribute errors.\n"), + (unsigned long long)ip->i_di.di_num.no_addr, + (unsigned long long)ip->i_di.di_num.no_addr); + if (query( _("Okay to fix the block count for the inode? 
(y/n) "))) { + ip->i_di.di_blocks = 1 + bc->indir_count + + bc->data_count + bc->ea_count; + bmodified(ip->i_bh); + log_err(_("Block count fixed: 1+%lld+%lld+%lld = %lld.\n"), + (unsigned long long)bc->indir_count, + (unsigned long long)bc->data_count, + (unsigned long long)bc->ea_count, + (unsigned long long)ip->i_di.di_blocks); + return 1; + } + log_err( _("Block count not fixed.\n")); + return 1; +} + +/* check_ealeaf_block + * checks an extended attribute (not directory) leaf block + */ +static int check_ealeaf_block(struct gfs2_inode *ip, uint64_t block, int btype, + struct gfs2_buffer_head **bh, void *private) +{ + struct gfs2_buffer_head *leaf_bh = NULL; + struct gfs2_sbd *sdp = ip->i_sbd; + int q; + struct block_count *bc = (struct block_count *) private; + + q = block_type(bl, block); + /* Special duplicate processing: If we have an EA block, check if it + really is an EA. If it is, let duplicate handling sort it out. + If it isn't, clear it but don't count it as a duplicate. */ + leaf_bh = bread(sdp, block); + if (gfs2_check_meta(leaf_bh, btype)) { + bc->ea_count++; + if (q != GFS2_BLKST_FREE) { /* Duplicate? */ + add_duplicate_ref(ip, block, ref_as_ea, 0, + INODE_VALID); + complain_eas(ip, block, _("Extended attribute leaf " + "duplicate found")); + /* Return 0 here because if all that's wrong is a + duplicate block reference, we want pass1b to figure + it out. We don't want to delete all the extended + attributes as if they are in error. */ + return 0; + } + complain_eas(ip, block, _("Extended Attribute leaf block has " + "incorrect type")); + brelse(leaf_bh); + return 1; + } + if (q != GFS2_BLKST_FREE) { /* Duplicate? */ + complain_eas(ip, block, _("Extended Attribute leaf " + "duplicate found")); + add_duplicate_ref(ip, block, ref_as_data, 0, INODE_VALID); + bc->ea_count++; + brelse(leaf_bh); + /* Return 0 here because if all that's wrong is a duplicate + block reference, we want pass1b to figure it out. 
We don't + want to delete all the extended attributes as if they are + in error. */ + return 0; + } + /* Point of confusion: We've got to set the ea block itself to + GFS2_BLKST_USED here. Elsewhere we mark the inode with + gfs2_eattr_block meaning it contains an eattr. */ + fsck_blockmap_set(ip, block, _("Extended Attribute"), + sdp->gfs1 ? GFS2_BLKST_DINODE : GFS2_BLKST_USED); + bc->ea_count++; + *bh = leaf_bh; + return 0; +} + +/** + * check_extended_leaf_eattr + * @ip + * @el_blk: block number of the extended leaf + * + * An EA leaf block can contain EA's with pointers to blocks + * where the data for that EA is kept. Those blocks still + * have the gfs2 meta header of type GFS2_METATYPE_EA + * + * Returns: 0 if correct[able], -1 if removal is needed + */ +static int check_extended_leaf_eattr(struct gfs2_inode *ip, int i, + uint64_t *data_ptr, + struct gfs2_buffer_head *leaf_bh, + uint32_t tot_ealen, + struct gfs2_ea_header *ea_hdr, + struct gfs2_ea_header *ea_hdr_prev, + void *private) +{ + uint64_t el_blk = be64_to_cpu(*data_ptr); + struct gfs2_sbd *sdp = ip->i_sbd; + struct gfs2_buffer_head *bh = NULL; + int error = 0; + + if (!valid_block_ip(ip, el_blk)) { + log_err( _("Inode #%llu (0x%llx): Extended Attribute block " + "%llu (0x%llx) has an extended leaf block #%llu " + "(0x%llx) that is invalid or out of range.\n"), + (unsigned long long)ip->i_di.di_num.no_addr, + (unsigned long long)ip->i_di.di_num.no_addr, + (unsigned long long)ip->i_di.di_eattr, + (unsigned long long)ip->i_di.di_eattr, + (unsigned long long)el_blk, + (unsigned long long)el_blk); + fsck_blockmap_set(ip, ip->i_di.di_eattr, + _("bad (out of range) Extended Attribute "), + GFS2_BLKST_UNLINKED); + error = 1; + } else { + error = check_ealeaf_block(ip, el_blk, GFS2_METATYPE_ED, &bh, + private); + } + if (bh) + brelse(bh); + if (error) { + log_err(_("Bad extended attribute found at block %lld " + "(0x%llx)"), + (unsigned long long)be64_to_cpu(*data_ptr), + (unsigned long 
long)be64_to_cpu(*data_ptr)); + if (query( _("Repair the bad Extended Attribute? (y/n) "))) { + ea_hdr->ea_num_ptrs = i; + ea_hdr->ea_data_len = cpu_to_be32(tot_ealen); + *data_ptr = 0; + bmodified(leaf_bh); + /* Endianness doesn't matter in this case because it's + a single byte. */ + fsck_blockmap_set(ip, ip->i_di.di_eattr, + _("extended attribute"), + sdp->gfs1 ? GFS2_BLKST_DINODE : + GFS2_BLKST_USED); + log_err( _("The EA was fixed.\n")); + error = 0; + } else { + error = 1; + log_err( _("The bad EA was not fixed.\n")); + } + } + return error; +} + +static int check_eattr_leaf(struct gfs2_inode *ip, uint64_t block, + uint64_t parent, struct gfs2_buffer_head **bh, + void *private) +{ + if (!valid_block_ip(ip, block)) { + log_warn( _("Inode #%llu (0x%llx): Extended Attribute leaf " + "block #%llu (0x%llx) is invalid or out of " + "range.\n"), + (unsigned long long)ip->i_di.di_num.no_addr, + (unsigned long long)ip->i_di.di_num.no_addr, + (unsigned long long)block, (unsigned long long)block); + fsck_blockmap_set(ip, ip->i_di.di_eattr, + _("bad (out of range) Extended " + "Attribute leaf"), GFS2_BLKST_UNLINKED); + return 1; + } + return check_ealeaf_block(ip, block, GFS2_METATYPE_EA, bh, private); +} + +static int ask_remove_eattr_entry(struct gfs2_sbd *sdp, + struct gfs2_buffer_head *leaf_bh, + struct gfs2_ea_header *curr, + struct gfs2_ea_header *prev, + int fix_curr, int fix_curr_len) +{ + if (!query( _("Remove the bad Extended Attribute entry? 
(y/n) "))) { + log_err( _("Bad Extended Attribute not removed.\n")); + return 0; + } + if (fix_curr) + curr->ea_flags |= GFS2_EAFLAG_LAST; + if (fix_curr_len) { + uint32_t max_size = sdp->sd_sb.sb_bsize; + uint32_t offset = (uint32_t)(((unsigned long)curr) - + ((unsigned long)leaf_bh->b_data)); + curr->ea_rec_len = cpu_to_be32(max_size - offset); + } + if (!prev) + curr->ea_type = GFS2_EATYPE_UNUSED; + else { + uint32_t tmp32 = be32_to_cpu(curr->ea_rec_len) + + be32_to_cpu(prev->ea_rec_len); + prev->ea_rec_len = cpu_to_be32(tmp32); + if (curr->ea_flags & GFS2_EAFLAG_LAST) + prev->ea_flags |= GFS2_EAFLAG_LAST; + } + log_err( _("Bad Extended Attribute at block #%llu" + " (0x%llx) removed.\n"), + (unsigned long long)leaf_bh->b_blocknr, + (unsigned long long)leaf_bh->b_blocknr); + bmodified(leaf_bh); + return 1; +} + +static int check_eattr_entries(struct gfs2_inode *ip, + struct gfs2_buffer_head *leaf_bh, + struct gfs2_ea_header *ea_hdr, + struct gfs2_ea_header *ea_hdr_prev, + void *private) +{ + struct gfs2_sbd *sdp = ip->i_sbd; + char ea_name[256]; + uint32_t offset = (uint32_t)(((unsigned long)ea_hdr) - + ((unsigned long)leaf_bh->b_data)); + uint32_t max_size = sdp->sd_sb.sb_bsize; + uint32_t avail_size; + int max_ptrs; + + if (!ea_hdr->ea_name_len){ + log_err( _("EA has name length of zero\n")); + return ask_remove_eattr_entry(sdp, leaf_bh, ea_hdr, + ea_hdr_prev, 1, 1); + } + if (offset + be32_to_cpu(ea_hdr->ea_rec_len) > max_size){ + log_err( _("EA rec length too long\n")); + return ask_remove_eattr_entry(sdp, leaf_bh, ea_hdr, + ea_hdr_prev, 1, 1); + } + if (offset + be32_to_cpu(ea_hdr->ea_rec_len) == max_size && + (ea_hdr->ea_flags & GFS2_EAFLAG_LAST) == 0){ + log_err( _("last EA has no last entry flag\n")); + return ask_remove_eattr_entry(sdp, leaf_bh, ea_hdr, + ea_hdr_prev, 0, 0); + } + if (!ea_hdr->ea_name_len){ + log_err( _("EA has name length of zero\n")); + return ask_remove_eattr_entry(sdp, leaf_bh, ea_hdr, + ea_hdr_prev, 0, 0); + } + + memset(ea_name, 0, 
sizeof(ea_name)); + strncpy(ea_name, (char *)ea_hdr + sizeof(struct gfs2_ea_header), + ea_hdr->ea_name_len); + + if (!GFS2_EATYPE_VALID(ea_hdr->ea_type) && + ((ea_hdr_prev) || (!ea_hdr_prev && ea_hdr->ea_type))){ + /* Skip invalid entry */ + log_err(_("EA (%s) type is invalid (%d > %d).\n"), + ea_name, ea_hdr->ea_type, GFS2_EATYPE_LAST); + return ask_remove_eattr_entry(sdp, leaf_bh, ea_hdr, + ea_hdr_prev, 0, 0); + } + + if (!ea_hdr->ea_num_ptrs) + return 0; + + avail_size = sdp->sd_sb.sb_bsize - sizeof(struct gfs2_meta_header); + max_ptrs = (be32_to_cpu(ea_hdr->ea_data_len)+avail_size-1)/avail_size; + + if (max_ptrs > ea_hdr->ea_num_ptrs) { + log_err(_("EA (%s) has incorrect number of pointers.\n"), + ea_name); + log_err(_(" Required: %d\n Reported: %d\n"), + max_ptrs, ea_hdr->ea_num_ptrs); + return ask_remove_eattr_entry(sdp, leaf_bh, ea_hdr, + ea_hdr_prev, 0, 0); + } else { + log_debug( _(" Pointers Required: %d\n Pointers Reported: %d\n"), + max_ptrs, ea_hdr->ea_num_ptrs); + } + return 0; +} + +/** + * mark_block_invalid - mark blocks associated with an inode as invalid + * unless the block is a duplicate. + * + * An "invalid" block is now considered free in the bitmap, and pass2 will + * delete any invalid blocks. This is nearly identical to function + * delete_block_if_notdup. + */ +static int mark_block_invalid(struct gfs2_inode *ip, uint64_t block, + enum dup_ref_type reftype, const char *btype, + int *is_valid, int *was_duplicate) +{ + int q; + + /* If the block isn't valid, we obviously can't invalidate it. + * However, if we return an error, invalidating will stop, and + * we want it to continue to invalidate the valid blocks. If we + * don't do this, block references that follow that are also + * referenced elsewhere (duplicates) won't be flagged as such, + * and as a result, they'll be freed when this dinode is deleted, + * despite being used by another dinode as a valid block. 
*/ + if (is_valid) + *is_valid = 1; + if (was_duplicate) + *was_duplicate = 0; + if (!valid_block_ip(ip, block)) { + if (is_valid) + *is_valid = 0; + return meta_is_good; + } + + q = block_type(bl, block); + if (q != GFS2_BLKST_FREE) { + if (was_duplicate) + *was_duplicate = 1; + add_duplicate_ref(ip, block, reftype, 0, INODE_INVALID); + log_info( _("%s block %lld (0x%llx), part of inode " + "%lld (0x%llx), was previously referenced so " + "the invalid reference is ignored.\n"), + btype, (unsigned long long)block, + (unsigned long long)block, + (unsigned long long)ip->i_di.di_num.no_addr, + (unsigned long long)ip->i_di.di_num.no_addr); + return meta_is_good; + } + fsck_blockmap_set(ip, block, btype, GFS2_BLKST_UNLINKED); + return meta_is_good; +} + +static int invalidate_metadata(struct gfs2_inode *ip, uint64_t block, + struct gfs2_buffer_head **bh, int h, + int *is_valid, int *was_duplicate, + void *private) +{ + *is_valid = 1; + *was_duplicate = 0; + return mark_block_invalid(ip, block, ref_as_meta, _("metadata"), + is_valid, was_duplicate); +} + +static int invalidate_leaf(struct gfs2_inode *ip, uint64_t block, + void *private) +{ + return mark_block_invalid(ip, block, ref_as_meta, _("leaf"), + NULL, NULL); +} + +static int invalidate_data(struct gfs2_inode *ip, uint64_t metablock, + uint64_t block, void *private, + struct gfs2_buffer_head *bh, uint64_t *ptr) +{ + return mark_block_invalid(ip, block, ref_as_data, _("data"), + NULL, NULL); +} + +static int invalidate_eattr_indir(struct gfs2_inode *ip, uint64_t block, + uint64_t parent, + struct gfs2_buffer_head **bh, void *private) +{ + return mark_block_invalid(ip, block, ref_as_ea, + _("indirect extended attribute"), + NULL, NULL); +} + +static int invalidate_eattr_leaf(struct gfs2_inode *ip, uint64_t block, + uint64_t parent, struct gfs2_buffer_head **bh, + void *private) +{ + return mark_block_invalid(ip, block, ref_as_ea, + _("extended attribute"), + NULL, NULL); +} + +/** + * Check for massive amounts of 
pointer corruption. If the block has + * lots of out-of-range pointers, we can't trust any of the pointers. + * For example, a stray pointer with a value of 0x1d might be + * corruption/nonsense, and if so, we don't want to delete an + * important file (like master or the root directory) because of it. + * We need to check for a large number of bad pointers BEFORE we start + * messing with them because we don't want to mark a block as a + * duplicate (for example) until we know if the pointers in general can + * be trusted. Thus it needs to be in a separate loop. + * Returns: 0 if good range, otherwise != 0 + */ +enum b_types { btype_meta, btype_leaf, btype_data, btype_ieattr, btype_eattr}; +const char *btypes[5] = { + "metadata", "leaf", "data", "indirect extended attribute", + "extended attribute" }; + +static int rangecheck_block(struct gfs2_inode *ip, uint64_t block, + struct gfs2_buffer_head **bh, enum b_types btype, + void *private) +{ + long *bad_pointers = (long *)private; + int q; + + if (!valid_block_ip(ip, block)) { + (*bad_pointers)++; + log_info( _("Bad %s block pointer (invalid or out of range " + "#%ld) found in inode %lld (0x%llx).\n"), + btypes[btype], *bad_pointers, + (unsigned long long)ip->i_di.di_num.no_addr, + (unsigned long long)ip->i_di.di_num.no_addr); + if ((*bad_pointers) <= BAD_POINTER_TOLERANCE) + return meta_is_good; + else + return meta_error; /* Exits check_metatree quicker */ + } + /* See how many duplicate blocks it has */ + q = block_type(bl, block); + if (q != GFS2_BLKST_FREE) { + (*bad_pointers)++; + log_info( _("Duplicated %s block pointer (violation %ld, block" + " %lld (0x%llx)) found in inode %lld (0x%llx).\n"), + btypes[btype], *bad_pointers, + (unsigned long long)block, (unsigned long long)block, + (unsigned long long)ip->i_di.di_num.no_addr, + (unsigned long long)ip->i_di.di_num.no_addr); + if ((*bad_pointers) <= BAD_POINTER_TOLERANCE) + return meta_is_good; + else { + log_debug(_("Inode 0x%llx bad pointer tolerance " + 
"exceeded: block 0x%llx.\n"), + (unsigned long long)ip->i_di.di_num.no_addr, + (unsigned long long)block); + return meta_error; /* Exits check_metatree quicker */ + } + } + return meta_is_good; +} + +static int rangecheck_metadata(struct gfs2_inode *ip, uint64_t block, + struct gfs2_buffer_head **bh, int h, + int *is_valid, int *was_duplicate, + void *private) +{ + *is_valid = 1; + *was_duplicate = 0; + return rangecheck_block(ip, block, bh, btype_meta, private); +} + +static int rangecheck_leaf(struct gfs2_inode *ip, uint64_t block, + void *private) +{ + return rangecheck_block(ip, block, NULL, btype_leaf, private); +} + +static int rangecheck_data(struct gfs2_inode *ip, uint64_t metablock, + uint64_t block, void *private, + struct gfs2_buffer_head *bh, uint64_t *ptr) +{ + return rangecheck_block(ip, block, NULL, btype_data, private); +} + +static int rangecheck_eattr_indir(struct gfs2_inode *ip, uint64_t block, + uint64_t parent, + struct gfs2_buffer_head **bh, void *private) +{ + return rangecheck_block(ip, block, NULL, btype_ieattr, private); +} + +static int rangecheck_eattr_leaf(struct gfs2_inode *ip, uint64_t block, + uint64_t parent, struct gfs2_buffer_head **bh, + void *private) +{ + return rangecheck_block(ip, block, NULL, btype_eattr, private); +} + +struct metawalk_fxns rangecheck_fxns = { + .private = NULL, + .readahead = 1, + .check_metalist = rangecheck_metadata, + .check_data = rangecheck_data, + .check_leaf = rangecheck_leaf, + .check_eattr_indir = rangecheck_eattr_indir, + .check_eattr_leaf = rangecheck_eattr_leaf, + .delete_block = delete_block, +}; + +struct metawalk_fxns eattr_undo_fxns = { + .private = NULL, + .check_eattr_indir = undo_eattr_indir_or_leaf, + .check_eattr_leaf = undo_eattr_indir_or_leaf, + .finish_eattr_indir = finish_eattr_indir, + .delete_block = delete_block, +}; +/* set_ip_blockmap - set the blockmap for a dinode + * + * returns: 0 if no error, -EINVAL if dinode has a bad mode, -EPERM on error + */ +static int 
set_ip_blockmap(struct gfs2_inode *ip) +{ + uint64_t block = ip->i_bh->b_blocknr; + uint32_t mode; + const char *ty; + + if (ip->i_sbd->gfs1) + mode = gfs_to_gfs2_mode(ip); + else + mode = ip->i_di.di_mode & S_IFMT; + + switch (mode) { + case S_IFDIR: + ty = _("directory"); + break; + case S_IFREG: + ty = _("file"); + break; + case S_IFLNK: + ty = _("symlink"); + break; + case S_IFBLK: + ty = _("block device"); + break; + case S_IFCHR: + ty = _("character device"); + break; + case S_IFIFO: + ty = _("fifo"); + break; + case S_IFSOCK: + ty = _("socket"); + break; + default: + return -EINVAL; + } + if (fsck_blockmap_set(ip, block, ty, GFS2_BLKST_DINODE) || + (mode == S_IFDIR && !dirtree_insert(ip->i_di.di_num))) { + stack; + return -EPERM; + } + return 0; +} + +static int alloc_metalist(struct gfs2_inode *ip, uint64_t block, + struct gfs2_buffer_head **bh, int h, int *is_valid, + int *was_duplicate, void *private) +{ + int q; + const char *desc = (const char *)private; + + /* No need to range_check here--if it was added, it's in range. */ + /* We can't check the bitmap here because this function is called + after the bitmap has been set but before the blockmap has. */ + *is_valid = 1; + *was_duplicate = 0; + *bh = bread(ip->i_sbd, block); + q = bitmap_type(ip->i_sbd, block); + if (q == GFS2_BLKST_FREE) { + log_debug(_("%s reference to new metadata block " + "%lld (0x%llx) is now marked as indirect.\n"), + desc, (unsigned long long)block, + (unsigned long long)block); + gfs2_blockmap_set(bl, block, ip->i_sbd->gfs1 ? + GFS2_BLKST_DINODE : GFS2_BLKST_USED); + } + return meta_is_good; +} + +static int alloc_data(struct gfs2_inode *ip, uint64_t metablock, + uint64_t block, void *private, + struct gfs2_buffer_head *bh, uint64_t *ptr) +{ + int q; + const char *desc = (const char *)private; + + /* No need to range_check here--if it was added, it's in range. 
*/ + /* We can't check the bitmap here because this function is called + after the bitmap has been set but before the blockmap has. */ + q = bitmap_type(ip->i_sbd, block); + if (q == GFS2_BLKST_FREE) { + log_debug(_("%s reference to new data block " + "%lld (0x%llx) is now marked as data.\n"), + desc, (unsigned long long)block, + (unsigned long long)block); + gfs2_blockmap_set(bl, block, GFS2_BLKST_USED); + } + return 0; +} + +static int alloc_leaf(struct gfs2_inode *ip, uint64_t block, void *private) +{ + int q; + + /* No need to range_check here--if it was added, it's in range. */ + /* We can't check the bitmap here because this function is called + after the bitmap has been set but before the blockmap has. */ + q = bitmap_type(ip->i_sbd, block); + if (q == GFS2_BLKST_FREE) + fsck_blockmap_set(ip, block, _("newly allocated leaf"), + ip->i_sbd->gfs1 ? GFS2_BLKST_DINODE : + GFS2_BLKST_USED); + return 0; +} + +struct metawalk_fxns alloc_fxns = { + .private = NULL, + .check_leaf = alloc_leaf, + .check_metalist = alloc_metalist, + .check_data = alloc_data, + .check_eattr_indir = NULL, + .check_eattr_leaf = NULL, + .check_dentry = NULL, + .check_eattr_entry = NULL, + .check_eattr_extentry = NULL, + .finish_eattr_indir = NULL, + .delete_block = delete_block, +}; + +/* + * pass1_check_metatree - wrapper function for check_metatree + * + * Generic function check_metatree sets the bitmap values, but not the + * corresponding values in the blockmap. If we get an error, the inode will + * have been freed in the bitmap. We need to set the inode address as free + * as well. + */ +static int pass1_check_metatree(struct gfs2_inode *ip, + struct metawalk_fxns *pass) +{ + int error; + + error = check_metatree(ip, pass); + if (error) + gfs2_blockmap_set(bl, ip->i_di.di_num.no_addr, + GFS2_BLKST_FREE); + return error; +} + +/* + * reprocess_inode - fixes the blockmap to match the bitmap due to an + * unexpected block allocation via libgfs2. 
+ * + * The problem we're trying to overcome here is when a new block must be + * added to a dinode because of a write. This will happen when lost+found + * needs a new indirect block for its hash table. In that case, the write + * causes a new block to be assigned in the bitmap but that block is not yet + * accurately reflected in the fsck blockmap. We need to compensate here. + * + * We can't really use fsck_blockmap_set here because the new block + * was already allocated by libgfs2 and therefore it took care of + * the rgrp free space variable. fsck_blockmap_set adjusts the free space + * in the rgrp according to the change, which has already been done. + * So it's only our blockmap that now disagrees with the rgrp bitmap, so we + * need to fix only that. + */ +static void reprocess_inode(struct gfs2_inode *ip, const char *desc) +{ + int error; + + alloc_fxns.private = (void *)desc; + log_info( _("%s inode %llu (0x%llx) had blocks added; reprocessing " + "its metadata tree at height=%d.\n"), desc, + (unsigned long long)ip->i_di.di_num.no_addr, + (unsigned long long)ip->i_di.di_num.no_addr, + ip->i_di.di_height); + error = pass1_check_metatree(ip, &alloc_fxns); + if (error) + log_err( _("Error %d reprocessing the %s metadata tree.\n"), + error, desc); +} + +/* + * handle_ip - process an incore structure representing a dinode. + */ +static int handle_ip(struct gfs2_sbd *sdp, struct gfs2_inode *ip) +{ + int error; + struct block_count bc = {0}; + long bad_pointers; + uint64_t lf_blks = 0; + + bad_pointers = 0L; + + /* First, check the metadata for massive amounts of pointer corruption. + Such corruption can only lead us to ruin trying to clean it up, + so it's better to check it up front and delete the inode if + there is corruption. 
*/ + rangecheck_fxns.private = &bad_pointers; + error = pass1_check_metatree(ip, &rangecheck_fxns); + if (bad_pointers > BAD_POINTER_TOLERANCE) { + log_err( _("Error: inode %llu (0x%llx) has more than " + "%d bad pointers.\n"), + (unsigned long long)ip->i_di.di_num.no_addr, + (unsigned long long)ip->i_di.di_num.no_addr, + BAD_POINTER_TOLERANCE); + fsck_blockmap_set(ip, ip->i_di.di_num.no_addr, + _("badly corrupt"), GFS2_BLKST_FREE); + return 0; + } + + error = set_ip_blockmap(ip); + if (error == -EINVAL) { + /* We found a dinode that has an invalid mode. At this point + set_ip_blockmap returned an error, which means it never + got inserted into the inode tree. Since we haven't even + processed its metadata with pass1_fxns, none of its + metadata will be flagged as metadata or data blocks yet. + Therefore, we don't need to invalidate anything. */ + fsck_blockmap_set(ip, ip->i_di.di_num.no_addr, + _("invalid mode"), GFS2_BLKST_FREE); + return 0; + } else if (error) + goto bad_dinode; + + if (set_di_nlink(ip)) + goto bad_dinode; + + if (lf_dip) + lf_blks = lf_dip->i_di.di_blocks; + + pass1_fxns.private = &bc; + error = pass1_check_metatree(ip, &pass1_fxns); + + /* Pass1 may have added some blocks to lost+found by virtue of leafs + that were misplaced. If it did, we need to reprocess lost+found + to correctly account for its blocks. */ + if (lf_dip && lf_dip->i_di.di_blocks != lf_blks) + reprocess_inode(lf_dip, "lost+found"); + + /* We there was an error, we return 0 because we want fsck to continue + and analyze the other dinodes as well. */ + if (fsck_abort) + return 0; + + if (!error) { + error = check_inode_eattr(ip, &pass1_fxns); + + if (error) { + if (!query(_("Clear the bad Extended Attributes? 
" + "(y/n) "))) { + log_err( _("The bad Extended Attributes were " + "not fixed.\n")); + return 0; + } + log_err(_("Clearing the bad Extended Attributes in " + "inode %lld (0x%llx).\n"), + (unsigned long long)ip->i_di.di_num.no_addr, + (unsigned long long)ip->i_di.di_num.no_addr); + eattr_undo_fxns.private = &bc; + check_inode_eattr(ip, &eattr_undo_fxns); + ask_remove_inode_eattr(ip, &bc); + return 1; + } + } + + if (ip->i_di.di_blocks != + (1 + bc.indir_count + bc.data_count + bc.ea_count)) { + log_err( _("Inode #%llu (0x%llx): Ondisk block count (%llu" + ") does not match what fsck found (%llu)\n"), + (unsigned long long)ip->i_di.di_num.no_addr, + (unsigned long long)ip->i_di.di_num.no_addr, + (unsigned long long)ip->i_di.di_blocks, + (unsigned long long)1 + bc.indir_count + + bc.data_count + bc.ea_count); + log_info( _("inode has: %lld, but fsck counts: Dinode:1 + " + "indir:%lld + data: %lld + ea: %lld\n"), + (unsigned long long)ip->i_di.di_blocks, + (unsigned long long)bc.indir_count, + (unsigned long long)bc.data_count, + (unsigned long long)bc.ea_count); + if (query( _("Fix ondisk block count? 
(y/n) "))) { + ip->i_di.di_blocks = 1 + bc.indir_count + bc.data_count + + bc.ea_count; + bmodified(ip->i_bh); + log_err( _("Block count for #%llu (0x%llx) fixed\n"), + (unsigned long long)ip->i_di.di_num.no_addr, + (unsigned long long)ip->i_di.di_num.no_addr); + } else + log_err( _("Bad block count for #%llu (0x%llx" + ") not fixed\n"), + (unsigned long long)ip->i_di.di_num.no_addr, + (unsigned long long)ip->i_di.di_num.no_addr); + } + + return 0; +bad_dinode: + stack; + return -1; +} + +static void check_i_goal(struct gfs2_sbd *sdp, struct gfs2_inode *ip) +{ + if (sdp->gfs1 || ip->i_di.di_flags & GFS2_DIF_SYSTEM) + return; + + if (ip->i_di.di_goal_meta <= LGFS2_SB_ADDR(sdp) || + ip->i_di.di_goal_meta > sdp->fssize) { + log_err(_("Inode #%llu (0x%llx): Bad allocation goal block " + "found: %llu (0x%llx)\n"), + (unsigned long long)ip->i_di.di_num.no_addr, + (unsigned long long)ip->i_di.di_num.no_addr, + (unsigned long long)ip->i_di.di_goal_meta, + (unsigned long long)ip->i_di.di_goal_meta); + if (query( _("Fix goal block in inode #%llu (0x%llx)? (y/n) "), + (unsigned long long)ip->i_di.di_num.no_addr, + (unsigned long long)ip->i_di.di_num.no_addr)) { + ip->i_di.di_goal_meta = ip->i_di.di_num.no_addr; + bmodified(ip->i_bh); + } else + log_err(_("Allocation goal block in inode #%lld " + "(0x%llx) not fixed\n"), + (unsigned long long)ip->i_di.di_num.no_addr, + (unsigned long long)ip->i_di.di_num.no_addr); + } +} + +/* + * handle_di - This is now a wrapper function that takes a gfs2_buffer_head + * and calls handle_ip, which takes an in-code dinode structure. 
+ */ +static int handle_di(struct gfs2_sbd *sdp, struct rgrp_tree *rgd, + struct gfs2_buffer_head *bh) +{ + int error = 0; + uint64_t block = bh->b_blocknr; + struct gfs2_inode *ip; + + ip = fsck_inode_get(sdp, rgd, bh); + + if (ip->i_di.di_num.no_addr != block) { + log_err( _("Inode #%llu (0x%llx): Bad inode address found: %llu " + "(0x%llx)\n"), (unsigned long long)block, + (unsigned long long)block, + (unsigned long long)ip->i_di.di_num.no_addr, + (unsigned long long)ip->i_di.di_num.no_addr); + if (query( _("Fix address in inode at block #%llu" + " (0x%llx)? (y/n) "), + (unsigned long long)block, (unsigned long long)block)) { + ip->i_di.di_num.no_addr = ip->i_di.di_num.no_formal_ino = block; + bmodified(ip->i_bh); + } else + log_err( _("Address in inode at block #%llu" + " (0x%llx) not fixed\n"), + (unsigned long long)block, + (unsigned long long)block); + } + if (sdp->gfs1 && ip->i_di.di_num.no_formal_ino != block) { + log_err( _("Inode #%llu (0x%llx): GFS1 formal inode number " + "mismatch: was %llu (0x%llx)\n"), + (unsigned long long)block, (unsigned long long)block, + (unsigned long long)ip->i_di.di_num.no_formal_ino, + (unsigned long long)ip->i_di.di_num.no_formal_ino); + if (query( _("Fix formal inode number in inode #%llu" + " (0x%llx)? (y/n) "), (unsigned long long)block, + (unsigned long long)block)) { + ip->i_di.di_num.no_formal_ino = block; + bmodified(ip->i_bh); + } else + log_err( _("Inode number in inode at block #%lld " + "(0x%llx) not fixed\n"), + (unsigned long long)block, + (unsigned long long)block); + } + check_i_goal(sdp, ip); + error = handle_ip(sdp, ip); + fsck_inode_put(&ip); + return error; +} + +/* Check system inode and verify it's marked "in use" in the bitmap: */ +/* Should work for all system inodes: root, master, jindex, per_node, etc. */ +/* We have to pass the sysinode as ** because the pointer may change out from + under the reference by way of the builder() function. 
*/ +static int check_system_inode(struct gfs2_sbd *sdp, + struct gfs2_inode **sysinode, + const char *filename, + int builder(struct gfs2_sbd *sdp), int isdir, + struct gfs2_inode *sysdir, int needs_sysbit) +{ + uint64_t iblock = 0; + struct dir_status ds = {0}; + int error, err = 0; + + log_info( _("Checking system inode '%s'\n"), filename); + if (*sysinode) { + /* Read in the system inode, look at its dentries, and start + * reading through them */ + iblock = (*sysinode)->i_di.di_num.no_addr; + log_info( _("System inode for '%s' is located at block %llu" + " (0x%llx)\n"), filename, + (unsigned long long)iblock, + (unsigned long long)iblock); + if (gfs2_check_meta((*sysinode)->i_bh, GFS2_METATYPE_DI)) { + log_err( _("Found invalid system dinode at block #" + "%llu (0x%llx)\n"), + (unsigned long long)iblock, + (unsigned long long)iblock); + gfs2_blockmap_set(bl, iblock, GFS2_BLKST_FREE); + check_n_fix_bitmap(sdp, (*sysinode)->i_rgd, iblock, 0, + GFS2_BLKST_FREE); + inode_put(sysinode); + } + } + if (*sysinode) { + ds.q = block_type(bl, iblock); + /* If the inode exists but the block is marked free, we might + be recovering from a corrupt bitmap. In that case, don't + rebuild the inode. Just reuse the inode and fix the + bitmap. */ + if (ds.q == GFS2_BLKST_FREE) { + log_info( _("The inode exists but the block is not " + "marked 'in use'; fixing it.\n")); + fsck_blockmap_set(*sysinode, + (*sysinode)->i_di.di_num.no_addr, + filename, GFS2_BLKST_DINODE); + ds.q = GFS2_BLKST_DINODE; + if (isdir) + dirtree_insert((*sysinode)->i_di.di_num); + } + /* Make sure it's marked as a system file/directory */ + if (needs_sysbit && + !((*sysinode)->i_di.di_flags & GFS2_DIF_SYSTEM)) { + log_err( _("System inode %s is missing the 'system' " + "flag. It should be rebuilt.\n"), filename); + if (sysdir && query(_("Delete the corrupt %s system " + "inode? 
(y/n) "), filename)) { + inode_put(sysinode); + gfs2_dirent_del(sysdir, filename, + strlen(filename)); + /* Set the blockmap (but not bitmap) back to + 'free' so that it gets checked like any + normal dinode. */ + gfs2_blockmap_set(bl, iblock, GFS2_BLKST_FREE); + log_err( _("Removed system inode \"%s\".\n"), + filename); + } + } + } else + log_info( _("System inode for '%s' is corrupt or missing.\n"), + filename); + /* If there are errors with the inode here, we need to create a new + inode and get it all setup - of course, everything will be in + lost+found then, but we *need* our system inodes before we can + do any of that. */ + if (!(*sysinode) || ds.q != GFS2_BLKST_DINODE) { + log_err(_("Invalid or missing %s system inode (is '%s', " + "should be '%s').\n"), filename, + block_type_string(ds.q), + block_type_string(GFS2_BLKST_DINODE)); + if (query(_("Create new %s system inode? (y/n) "), filename)) { + log_err( _("Rebuilding system file \"%s\"\n"), + filename); + error = builder(sdp); + if (error) { + log_err( _("Error rebuilding system " + "inode %s: Cannot continue\n"), + filename); + return error; + } + if (*sysinode == sdp->md.jiinode) + ji_update(sdp); + fsck_blockmap_set(*sysinode, + (*sysinode)->i_di.di_num.no_addr, + filename, GFS2_BLKST_DINODE); + ds.q = GFS2_BLKST_DINODE; + if (isdir) + dirtree_insert((*sysinode)->i_di.di_num); + } else { + log_err( _("Cannot continue without valid %s inode\n"), + filename); + return -1; + } + } + if (is_dir(&(*sysinode)->i_di, sdp->gfs1)) { + struct block_count bc = {0}; + + sysdir_fxns.private = &bc; + if ((*sysinode)->i_di.di_flags & GFS2_DIF_EXHASH) + pass1_check_metatree(*sysinode, &sysdir_fxns); + else { + err = check_linear_dir(*sysinode, (*sysinode)->i_bh, + &sysdir_fxns); + /* If we encountered an error in our directory check + we should still call handle_ip, but return the + error later. 
*/ + if (err) + log_err(_("Error found in %s while checking " + "directory entries.\n"), filename); + } + } + check_i_goal(sdp, *sysinode); + error = handle_ip(sdp, *sysinode); + return error ? error : err; +} + +static int build_a_journal(struct gfs2_sbd *sdp) +{ + char name[256]; + int err = 0; + + /* First, try to delete the journal if it's in jindex */ + sprintf(name, "journal%u", sdp->md.journals); + gfs2_dirent_del(sdp->md.jiinode, name, strlen(name)); + /* Now rebuild it */ + err = build_journal(sdp, sdp->md.journals, sdp->md.jiinode); + if (err) { + log_crit(_("Error %d building journal\n"), err); + exit(FSCK_ERROR); + } + return 0; +} + +static int check_system_inodes(struct gfs2_sbd *sdp) +{ + int journal_count; + + /******************************************************************* + ******* Check the system inode integrity ************* + *******************************************************************/ + /* Mark the master system dinode as a "dinode" in the block map. + All other system dinodes in master will be taken care of by function + resuscitate_metalist. But master won't since it has no parent.*/ + if (!sdp->gfs1) { + fsck_blockmap_set(sdp->master_dir, + sdp->master_dir->i_di.di_num.no_addr, + "master", GFS2_BLKST_DINODE); + if (check_system_inode(sdp, &sdp->master_dir, "master", + build_master, 1, NULL, 1)) { + stack; + return -1; + } + } + /* Mark the root dinode as a "dinode" in the block map as we did + for master, since it has no parent. 
*/ + fsck_blockmap_set(sdp->md.rooti, sdp->md.rooti->i_di.di_num.no_addr, + "root", GFS2_BLKST_DINODE); + if (check_system_inode(sdp, &sdp->md.rooti, "root", build_root, 1, + NULL, 0)) { + stack; + return -1; + } + if (!sdp->gfs1 && + check_system_inode(sdp, &sdp->md.inum, "inum", build_inum, 0, + sdp->master_dir, 1)) { + stack; + return -1; + } + if (check_system_inode(sdp, &sdp->md.statfs, "statfs", build_statfs, 0, + sdp->master_dir, !sdp->gfs1)) { + stack; + return -1; + } + if (check_system_inode(sdp, &sdp->md.jiinode, "jindex", build_jindex, + (sdp->gfs1 ? 0 : 1), sdp->master_dir, + !sdp->gfs1)) { + stack; + return -1; + } + if (check_system_inode(sdp, &sdp->md.riinode, "rindex", build_rindex, + 0, sdp->master_dir, !sdp->gfs1)) { + stack; + return -1; + } + if (check_system_inode(sdp, &sdp->md.qinode, "quota", build_quota, + 0, sdp->master_dir, !sdp->gfs1)) { + stack; + return -1; + } + if (!sdp->gfs1 && + check_system_inode(sdp, &sdp->md.pinode, "per_node", + build_per_node, 1, sdp->master_dir, 1)) { + stack; + return -1; + } + /* We have to play a trick on build_journal: We swap md.journals + in order to keep a count of which journal we need to build. */ + journal_count = sdp->md.journals; + /* gfs1's journals aren't dinode, they're just a bunch of blocks. */ + if (sdp->gfs1) { + /* gfs1 has four dinodes that are set in the superblock and + therefore not linked to anything else. We need to adjust + the link counts so pass4 doesn't get confused. 
*/ + incr_link_count(sdp->md.statfs->i_di.di_num, NULL, + _("gfs1 statfs inode")); + incr_link_count(sdp->md.jiinode->i_di.di_num, NULL, + _("gfs1 jindex inode")); + incr_link_count(sdp->md.riinode->i_di.di_num, NULL, + _("gfs1 rindex inode")); + incr_link_count(sdp->md.qinode->i_di.di_num, NULL, + _("gfs1 quota inode")); + return 0; + } + for (sdp->md.journals = 0; sdp->md.journals < journal_count; + sdp->md.journals++) { + char jname[16]; + + sprintf(jname, "journal%d", sdp->md.journals); + if (check_system_inode(sdp, &sdp->md.journal[sdp->md.journals], + jname, build_a_journal, 0, + sdp->md.jiinode, 1)) { + stack; + return -1; + } + } + + return 0; +} + +static int pass1_process_bitmap(struct gfs2_sbd *sdp, struct rgrp_tree *rgd, uint64_t *ibuf, unsigned n) +{ + struct gfs2_buffer_head *bh; + unsigned i; + uint64_t block; + struct gfs2_inode *ip; + int q; + /* Readahead numbers arrived at by experiment */ + unsigned rawin = 50; + unsigned ralen = 100 * sdp->bsize; + unsigned r = 0; + + for (i = 0; i < n; i++) { + int is_inode; + uint32_t check_magic; + + block = ibuf[i]; + + if (r++ == rawin) { + posix_fadvise(sdp->device_fd, block * sdp->bsize, ralen, POSIX_FADV_WILLNEED); + r = 0; + } + + /* skip gfs1 rindex indirect blocks */ + if (sdp->gfs1 && blockfind(&gfs1_rindex_blks, block)) { + log_debug(_("Skipping rindex indir block " + "%lld (0x%llx)\n"), + (unsigned long long)block, + (unsigned long long)block); + continue; + } + warm_fuzzy_stuff(block); + + if (fsck_abort) { /* if asked to abort */ + gfs2_special_free(&gfs1_rindex_blks); + return FSCK_OK; + } + if (skip_this_pass) { + printf( _("Skipping pass 1 is not a good idea.\n")); + skip_this_pass = FALSE; + fflush(stdout); + } + if (fsck_system_inode(sdp, block)) { + log_debug(_("Already processed system inode " + "%lld (0x%llx)\n"), + (unsigned long long)block, + (unsigned long long)block); + continue; + } + + bh = bread(sdp, block); + + is_inode = 0; + if (gfs2_check_meta(bh, GFS2_METATYPE_DI) == 0) + 
is_inode = 1; + + check_magic = ((struct gfs2_meta_header *) + (bh->b_data))->mh_magic; + + q = block_type(bl, block); + if (q != GFS2_BLKST_FREE) { + if (be32_to_cpu(check_magic) == GFS2_MAGIC && + sdp->gfs1 && !is_inode) { + log_debug(_("Block 0x%llx assumed to be " + "previously processed GFS1 " + "non-dinode metadata.\n"), + (unsigned long long)block); + brelse(bh); + continue; + } + log_err( _("Found a duplicate inode block at #%llu " + "(0x%llx) previously marked as a %s\n"), + (unsigned long long)block, + (unsigned long long)block, + block_type_string(q)); + ip = fsck_inode_get(sdp, rgd, bh); + if (is_inode && ip->i_di.di_num.no_addr == block) + add_duplicate_ref(ip, block, ref_is_inode, 0, + INODE_VALID); + else + log_info(_("dinum.no_addr is wrong, so I " + "assume the bitmap is just " + "wrong.\n")); + fsck_inode_put(&ip); + brelse(bh); + continue; + } + + if (!is_inode) { + if (be32_to_cpu(check_magic) == GFS2_MAGIC) { + /* In gfs2, a bitmap mark of 2 means an inode, + but in gfs1 it means any metadata. So if + this is gfs1 and not an inode, it may be + okay. If it's non-dinode metadata, it will + be referenced by an inode, so we need to + skip it here and it will be sorted out + when the referencing inode is checked. */ + if (sdp->gfs1) { + log_debug( _("Deferring GFS1 " + "metadata block #" + "%" PRIu64" (0x%" + PRIx64 ")\n"), + block, block); + brelse(bh); + continue; + } + } + log_err( _("Found invalid inode at block #" + "%llu (0x%llx)\n"), + (unsigned long long)block, + (unsigned long long)block); + check_n_fix_bitmap(sdp, rgd, block, 0, + GFS2_BLKST_FREE); + } else if (handle_di(sdp, rgd, bh) < 0) { + stack; + brelse(bh); + gfs2_special_free(&gfs1_rindex_blks); + return FSCK_ERROR; + } + /* Ignore everything else - they should be hit by the + handle_di step. Don't check NONE either, because + check_meta passes everything if GFS2_METATYPE_NONE + is specified. 
Hopefully, other metadata types such + as indirect blocks will be handled when the inode + itself is processed, and if it's not, it should be + caught in pass5. */ + brelse(bh); + } + + return 0; +} + +static int pass1_process_rgrp(struct gfs2_sbd *sdp, struct rgrp_tree *rgd) +{ + unsigned k, n, i; + uint64_t *ibuf = malloc(sdp->bsize * GFS2_NBBY * sizeof(uint64_t)); + int ret = 0; + + if (ibuf == NULL) + return FSCK_ERROR; + + for (k = 0; k < rgd->ri.ri_length; k++) { + n = lgfs2_bm_scan(rgd, k, ibuf, GFS2_BLKST_DINODE); + + if (n) { + ret = pass1_process_bitmap(sdp, rgd, ibuf, n); + if (ret) + goto out; + } + + if (fsck_abort) + goto out; + /* + For GFS1, we have to count the "free meta" blocks in the + resource group and mark them specially so we can count them + properly in pass5. + */ + if (!sdp->gfs1) + continue; + + n = lgfs2_bm_scan(rgd, k, ibuf, GFS2_BLKST_UNLINKED); + for (i = 0; i < n; i++) { + gfs2_blockmap_set(bl, ibuf[i], GFS2_BLKST_UNLINKED); + if (fsck_abort) + goto out; + } + } + +out: + free(ibuf); + return ret; +} + +static int gfs2_blockmap_create(struct gfs2_bmap *bmap, uint64_t size) +{ + bmap->size = size; + + /* Have to add 1 to BLOCKMAP_SIZE since it's 0-based and mallocs + * must be 1-based */ + bmap->mapsize = BLOCKMAP_SIZE2(size) + 1; + + if (!(bmap->map = calloc(bmap->mapsize, sizeof(char)))) + return -ENOMEM; + return 0; +} + + +static int link1_create(struct gfs2_bmap *bmap, uint64_t size) +{ + bmap->size = size; + + /* Have to add 1 to BLOCKMAP_SIZE since it's 0-based and mallocs + * must be 1-based */ + bmap->mapsize = BLOCKMAP_SIZE1(size) + 1; + + if (!(bmap->map = calloc(bmap->mapsize, sizeof(char)))) + return -ENOMEM; + return 0; +} + +static struct gfs2_bmap *gfs2_bmap_create(struct gfs2_sbd *sdp, uint64_t size, + uint64_t *addl_mem_needed) +{ + struct gfs2_bmap *il; + + *addl_mem_needed = 0L; + il = calloc(1, sizeof(*il)); + if (!il) + return NULL; + + if (gfs2_blockmap_create(il, size)) { + *addl_mem_needed = il->mapsize; + 
free(il); + il = NULL; + } + return il; +} + +static void gfs2_blockmap_destroy(struct gfs2_bmap *bmap) +{ + if (bmap->map) + free(bmap->map); + bmap->size = 0; + bmap->mapsize = 0; +} + +static void *gfs2_bmap_destroy(struct gfs2_sbd *sdp, struct gfs2_bmap *il) +{ + if (il) { + gfs2_blockmap_destroy(il); + free(il); + il = NULL; + } + return il; +} + +static void enomem(uint64_t addl_mem_needed) +{ + log_crit( _("This system doesn't have enough memory and swap space to fsck this file system.\n")); + log_crit( _("Additional memory needed is approximately: %lluMB\n"), + (unsigned long long)(addl_mem_needed / 1048576ULL)); + log_crit( _("Please increase your swap space by that amount and run fsck.gfs2 again.\n")); +} + +/** + * pass1 - walk through inodes and check inode state + * + * this walk can be done using root inode and depth first search, + * watching for repeat inode numbers + * + * format & type + * link count + * duplicate blocks + * bad blocks + * inodes size + * dir info + */ +int pass1(struct gfs2_sbd *sdp) +{ + struct osi_node *n, *next = NULL; + struct rgrp_tree *rgd; + uint64_t i; + uint64_t rg_count = 0; + struct timeval timer; + int ret = FSCK_OK; + uint64_t addl_mem_needed; + + bl = gfs2_bmap_create(sdp, last_fs_block+1, &addl_mem_needed); + if (!bl) { + enomem(addl_mem_needed); + return FSCK_ERROR; + } + addl_mem_needed = link1_create(&nlink1map, last_fs_block+1); + if (addl_mem_needed) { + enomem(addl_mem_needed); + gfs2_bmap_destroy(sdp, bl); + return FSCK_ERROR; + } + addl_mem_needed = link1_create(&clink1map, last_fs_block+1); + if (addl_mem_needed) { + enomem(addl_mem_needed); + link1_destroy(&nlink1map); + gfs2_bmap_destroy(sdp, bl); + return FSCK_ERROR; + } + osi_list_init(&gfs1_rindex_blks.list); + + /* FIXME: In the gfs fsck, we had to mark things like the + * journals and indices and such as 'other_meta' - in gfs2, + * the journals are files and are found in the normal file + * sweep - is there any metadata we need to mark here before + 
* the sweeps start that we won't find otherwise? */ + + /* Make sure the system inodes are okay & represented in the bitmap. */ + check_system_inodes(sdp); + + /* So, do we do a depth first search starting at the root + * inode, or use the rg bitmaps, or just read every fs block + * to find the inodes? If we use the depth first search, why + * have pass3 at all - if we use the rg bitmaps, pass5 is at + * least partially invalidated - if we read every fs block, + * things will probably be intolerably slow. The current fsck + * uses the rg bitmaps, so maybe that's the best way to start + * things - we can change the method later if necessary. + */ + for (n = osi_first(&sdp->rgtree); n; n = next, rg_count++) { + if (fsck_abort) { + ret = FSCK_CANCELED; + goto out; + } + next = osi_next(n); + log_debug( _("Checking metadata in Resource Group #%llu\n"), + (unsigned long long)rg_count); + rgd = (struct rgrp_tree *)n; + for (i = 0; i < rgd->ri.ri_length; i++) { + log_debug( _("rgrp block %lld (0x%llx) " + "is now marked as 'rgrp data'\n"), + rgd->ri.ri_addr + i, rgd->ri.ri_addr + i); + if (gfs2_blockmap_set(bl, rgd->ri.ri_addr + i, + GFS2_BLKST_USED)) { + stack; + gfs2_special_free(&gfs1_rindex_blks); + ret = FSCK_ERROR; + goto out; + } + /* rgrps and bitmaps don't have bits to represent + their blocks, so don't do this: + check_n_fix_bitmap(sdp, rgd, rgd->ri.ri_addr + i, 0, + gfs2_meta_rgrp);*/ + } + + ret = pass1_process_rgrp(sdp, rgd); + if (ret) + goto out; + } + log_notice(_("Reconciling bitmaps.\n")); + gettimeofday(&timer, NULL); + pass5(sdp, bl); + print_pass_duration("reconcile_bitmaps", &timer); +out: + gfs2_special_free(&gfs1_rindex_blks); + if (bl) + gfs2_bmap_destroy(sdp, bl); + return ret; +} diff --git a/gfs2/fsck/pass1b.c b/gfs2/fsck/pass1b.c new file mode 100644 index 0000000..62686fe --- /dev/null +++ b/gfs2/fsck/pass1b.c @@ -0,0 +1,975 @@ +#include "clusterautoconfig.h" + +#include +#include +#include +#include +#include +#include +#define _(String) 
gettext(String) + +#include +#include "libgfs2.h" +#include "link.h" +#include "fsck.h" +#include "osi_list.h" +#include "util.h" +#include "metawalk.h" +#include "inode_hash.h" +#include "afterpass1_common.h" + +struct fxn_info { + uint64_t block; + int found; + int ea_only; /* The only dups were found in EAs */ +}; + +struct dup_handler { + struct duptree *dt; + int ref_inode_count; + int ref_count; +}; + +struct clone_target { + uint64_t dup_block; + int first; +}; + +struct meta_blk_ref { + uint64_t block; /* block to locate */ + uint64_t metablock; /* returned metadata block addr containing ref */ + int off; /* offset to the reference within the buffer */ +}; + +static int clone_data(struct gfs2_inode *ip, uint64_t metablock, + uint64_t block, void *private, + struct gfs2_buffer_head *bh, uint64_t *ptr); + +static void log_inode_reference(struct duptree *dt, osi_list_t *tmp, int inval) +{ + char reftypestring[32]; + struct inode_with_dups *id; + + id = osi_list_entry(tmp, struct inode_with_dups, list); + if (id->dup_count == 1) + sprintf(reftypestring, "as %s", reftypes[get_ref_type(id)]); + else + sprintf(reftypestring, "%d/%d/%d/%d", + id->reftypecount[ref_is_inode], + id->reftypecount[ref_as_data], + id->reftypecount[ref_as_meta], + id->reftypecount[ref_as_ea]); + if (inval) + log_warn( _("Invalid ")); + log_warn( _("Inode %s (%lld/0x%llx) has %d reference(s) to " + "block %llu (0x%llx) (%s)\n"), id->name, + (unsigned long long)id->block_no, + (unsigned long long)id->block_no, id->dup_count, + (unsigned long long)dt->block, + (unsigned long long)dt->block, reftypestring); +} + +static int findref_meta(struct gfs2_inode *ip, uint64_t block, + struct gfs2_buffer_head **bh, int h, + int *is_valid, int *was_duplicate, void *private) +{ + *is_valid = 1; + *was_duplicate = 0; + return meta_is_good; +} + +static int findref_data(struct gfs2_inode *ip, uint64_t metablock, + uint64_t block, void *private, + struct gfs2_buffer_head *bh, uint64_t *ptr) +{ + struct 
meta_blk_ref *mbr = (struct meta_blk_ref *)private; + + if (block == mbr->block) { + mbr->metablock = bh->b_blocknr; + mbr->off = (ptr - (uint64_t *)bh->b_data); + log_debug("Duplicate data reference located on metadata " + "block 0x%llx, offset 0x%x\n", + (unsigned long long)mbr->metablock, mbr->off); + } + return meta_is_good; +} + +static void clone_data_block(struct gfs2_sbd *sdp, struct duptree *dt, + struct inode_with_dups *id) +{ + struct meta_blk_ref metaref = { .block = dt->block, }; + struct metawalk_fxns find1ref_fxns = { + .private = &metaref, + .check_metalist = findref_meta, + .check_data = findref_data, + }; + struct clone_target clone = {.dup_block = dt->block,}; + struct gfs2_inode *ip; + struct gfs2_buffer_head *bh; + uint64_t *ptr; + + if (!(query(_("Okay to clone data block %lld (0x%llx) for inode " + "%lld (0x%llx)? (y/n) "), + (unsigned long long)dt->block, + (unsigned long long)dt->block, + (unsigned long long)id->block_no, + (unsigned long long)id->block_no))) { + log_warn(_("The duplicate reference was not cloned.\n")); + return; + } + ip = fsck_load_inode(sdp, id->block_no); + check_metatree(ip, &find1ref_fxns); + if (metaref.metablock == 0) { + log_err(_("Unable to clone data block.\n")); + } else { + if (metaref.metablock != id->block_no) + bh = bread(sdp, metaref.metablock); + else + bh = ip->i_bh; + ptr = (uint64_t *)bh->b_data + metaref.off; + clone_data(ip, 0, dt->block, &clone, bh, ptr); + if (metaref.metablock != id->block_no) + brelse(bh); + else + bmodified(ip->i_bh); + } + fsck_inode_put(&ip); /* out, brelse, free */ +} + +/* revise_dup_handler - get current information about a duplicate reference + * + * Function resolve_dup_references can delete dinodes that reference blocks + * which may have duplicate references. Therefore, the duplicate tree is + * constantly being changed. 
This function revises the duplicate handler so + * that it accurately matches what's in the duplicate tree regarding this block + */ +static void revise_dup_handler(uint64_t dup_blk, struct dup_handler *dh) +{ + osi_list_t *tmp; + struct duptree *dt; + struct inode_with_dups *id; + + dh->ref_inode_count = 0; + dh->ref_count = 0; + dh->dt = NULL; + + dt = dupfind(dup_blk); + if (!dt) + return; + + dh->dt = dt; + /* Count the duplicate references, both valid and invalid */ + osi_list_foreach(tmp, &dt->ref_invinode_list) { + id = osi_list_entry(tmp, struct inode_with_dups, list); + dh->ref_inode_count++; + dh->ref_count += id->dup_count; + } + osi_list_foreach(tmp, &dt->ref_inode_list) { + id = osi_list_entry(tmp, struct inode_with_dups, list); + dh->ref_inode_count++; + dh->ref_count += id->dup_count; + } +} + +/* + * resolve_dup_references - resolve all but the last dinode that has a + * duplicate reference to a given block. + * + * @sdp - pointer to the superblock structure + * @dt - pointer to the duplicate reference rbtree to use + * @ref_list - list of duplicate references to be resolved (invalid or valid) + * @dh - duplicate handler + * inval - The references on this ref_list are invalid. We prefer to delete + * these first before resorting to deleting valid dinodes. + * acceptable_ref - Delete dinodes that reference the given block as anything + * _but_ this type. Try to save references as this type. 
+ */
+static void resolve_dup_references(struct gfs2_sbd *sdp, struct duptree *dt,
+				   osi_list_t *ref_list,
+				   struct dup_handler *dh,
+				   int inval, int acceptable_ref)
+{
+	struct gfs2_inode *ip;
+	struct inode_with_dups *id;
+	osi_list_t *tmp, *x;
+	/* Callback set that deletes whatever the metadata walk visits. */
+	struct metawalk_fxns pass1b_fxns_delete = {
+		.private = NULL,
+		.check_metalist = delete_metadata,
+		.check_data = delete_data,
+		.check_leaf = delete_leaf,
+		.check_eattr_indir = delete_eattr_indir,
+		.check_eattr_leaf = delete_eattr_leaf,
+		.check_eattr_entry = delete_eattr_entry,
+		.check_eattr_extentry = delete_eattr_extentry,
+	};
+	enum dup_ref_type this_ref;
+	struct inode_info *ii;
+	struct dir_info *di;
+	int found_good_ref = 0;
+	int q;
+
+	/* The _safe iterator is used because entries are removed from the
+	   list inside the loop body. */
+	osi_list_foreach_safe(tmp, ref_list, x) {
+		if (skip_this_pass || fsck_abort)
+			return;
+
+		id = osi_list_entry(tmp, struct inode_with_dups, list);
+		dh->dt = dt;
+
+		if (dh->ref_inode_count == 1) /* down to the last reference */
+			return;
+
+		this_ref = get_ref_type(id);
+		q = bitmap_type(sdp, id->block_no);
+		if (inval)
+			log_warn( _("Invalid "));
+		/* FIXME: If we already found an acceptable reference to this
+		 * block, we should really duplicate the block and fix all
+		 * references to it in this inode. Unfortunately, we would
+		 * have to traverse the entire metadata tree to do that. */
+		if (acceptable_ref != ref_types && /* If we're nuking all but
+						      an acceptable reference
+						      type and */
+		    this_ref == acceptable_ref) { /* this ref is acceptable */
+			/* If this is an invalid inode, but not on the invalid
+			   list, it's better to delete it. */
+			if (q == GFS2_BLKST_DINODE) {
+				found_good_ref = 1;
+				log_warn( _("Inode %s (%lld/0x%llx)'s "
+					    "reference to block %llu (0x%llx) "
+					    "as '%s' is acceptable.\n"),
+					  id->name,
+					  (unsigned long long)id->block_no,
+					  (unsigned long long)id->block_no,
+					  (unsigned long long)dt->block,
+					  (unsigned long long)dt->block,
+					  reftypes[this_ref]);
+				continue; /* don't delete the dinode */
+			}
+		}
+		/* If this reference is from a system inode, for example, if
+		   it's data or metadata inside a journal, the reference
+		   should take priority over user dinodes that reference the
+		   block. */
+		if (!found_good_ref && fsck_system_inode(sdp, id->block_no)) {
+			found_good_ref = 1;
+			continue; /* don't delete the dinode */
+		}
+		/* NOTE(review): acceptable_ref may equal ref_types here, and
+		   it is used as an index into reftypes[] -- confirm the
+		   array has an entry for that value. */
+		log_warn( _("Inode %s (%lld/0x%llx) references block "
+			    "%llu (0x%llx) as '%s', but the block is "
+			    "really %s.\n"),
+			  id->name, (unsigned long long)id->block_no,
+			  (unsigned long long)id->block_no,
+			  (unsigned long long)dt->block,
+			  (unsigned long long)dt->block,
+			  reftypes[this_ref], reftypes[acceptable_ref]);
+		/* Three ways forward: strip the EAs, clone the data block,
+		   or delete the whole inode. Each asks for permission. */
+		if (this_ref == ref_as_ea) {
+			if (!(query( _("Okay to remove extended attributes "
+				       "from %s inode %lld (0x%llx)? (y/n) "),
+				     (inval ? _("invalidated") : ""),
+				     (unsigned long long)id->block_no,
+				     (unsigned long long)id->block_no))) {
+				log_warn( _("The bad EA reference was not "
+					    "cleared."));
+				/* delete the list entry so we don't leak
+				   memory but leave the reference count. If we
+				   decrement the ref count, we could get down
+				   to 1 and the dinode would be changed
+				   without a 'Yes' answer. */
+				/* (dh->ref_inode_count)--;*/
+				dup_listent_delete(dt, id);
+				continue;
+			}
+		} else if (acceptable_ref == ref_types &&
+			   this_ref == ref_as_data) {
+			clone_data_block(sdp, dt, id);
+			dup_listent_delete(dt, id);
+			/* Recount, since the list entry was just removed. */
+			revise_dup_handler(dt->block, dh);
+			continue;
+		} else if (!(query( _("Okay to delete %s inode %lld (0x%llx)? "
+				      "(y/n) "),
+				    (inval ? _("invalidated") : ""),
+				    (unsigned long long)id->block_no,
+				    (unsigned long long)id->block_no))) {
+			log_warn( _("The bad inode was not cleared."));
+			/* delete the list entry so we don't leak memory but
+			   leave the reference count. If we decrement the
+			   ref count, we could get down to 1 and the dinode
+			   would be changed without a 'Yes' answer. */
+			/* (dh->ref_inode_count)--;*/
+			dup_listent_delete(dt, id);
+			continue;
+		}
+		if (q == GFS2_BLKST_FREE)
+			log_warn( _("Inode %lld (0x%llx) was previously "
+				    "deleted.\n"),
+				  (unsigned long long)id->block_no,
+				  (unsigned long long)id->block_no);
+		else if (this_ref == ref_as_ea)
+			log_warn(_("Pass1b is removing extended attributes "
+				   "from inode %lld (0x%llx).\n"),
+				 (unsigned long long)id->block_no,
+				 (unsigned long long)id->block_no);
+		else
+			log_warn(_("Pass1b is deleting inode %lld (0x%llx).\n"),
+				 (unsigned long long)id->block_no,
+				 (unsigned long long)id->block_no);
+
+		ip = fsck_load_inode(sdp, id->block_no);
+		/* If we've already deleted this dinode, don't try to delete
+		   it again. That could free blocks that used to be duplicate
+		   references that are now resolved (and gone). */
+		if (q != GFS2_BLKST_FREE) {
+			/* If the inode's eattr pointer is to the duplicate
+			   ref block, we don't want to call check_inode_eattr
+			   because that would traverse the structure, and it's
+			   not ours to do anymore; it rightly belongs to a
+			   different dinode. On the other hand, if the dup
+			   block is buried deep within the eattr structure
+			   of this dinode, we need to traverse the structure
+			   because it IS ours, and we need to remove all the
+			   eattr leaf blocks: they do belong to us (except for
+			   the duplicate referenced one, which is handled). */
+			if (ip->i_di.di_eattr == dt->block) {
+				ip->i_di.di_eattr = 0;
+				if (ip->i_di.di_blocks > 0)
+					ip->i_di.di_blocks--;
+				ip->i_di.di_flags &= ~GFS2_DIF_EA_INDIRECT;
+				bmodified(ip->i_bh);
+				dup_listent_delete(dt, id);
+				(dh->ref_inode_count)--;
+			} else {
+				/* Clear the EAs for the inode first */
+				check_inode_eattr(ip, &pass1b_fxns_delete);
+				(dh->ref_inode_count)--;
+			}
+			/* If the reference was as metadata or data, we've got
+			   a corrupt dinode that will be deleted. */
+			/* NOTE(review): id may already have been handed to
+			   dup_listent_delete() in the branch above, yet it is
+			   read here -- confirm that helper leaves it valid.
+			   ref_inode_count is also decremented a second time
+			   inside this block -- confirm intended. */
+			if ((this_ref != ref_as_ea) &&
+			    (inval || id->reftypecount[ref_as_data] ||
+			     id->reftypecount[ref_as_meta])) {
+				/* Fix the bitmap first, while the inodetree
+				   and dirtree entries exist. That way, the
+				   bitmap_set will do proper accounting for
+				   the rgrp dinode count. */
+				fsck_bitmap_set(ip, ip->i_di.di_num.no_addr,
+						_("duplicate referencing bad"),
+						GFS2_BLKST_FREE);
+				/* Remove the inode from the inode tree */
+				ii = inodetree_find(ip->i_di.di_num.no_addr);
+				if (ii)
+					inodetree_delete(ii);
+				di = dirtree_find(ip->i_di.di_num.no_addr);
+				if (di)
+					dirtree_delete(di);
+				link1_set(&nlink1map, ip->i_di.di_num.no_addr,
+					  0);
+				/* We delete the dup_handler inode count and
+				   duplicate id BEFORE clearing the metadata,
+				   because if this is the last reference to
+				   this metadata block, we need to traverse the
+				   tree and free the data blocks it references.
+				   However, we don't want to delete other
+				   duplicates that may be used by other
+				   dinodes. */
+				(dh->ref_inode_count)--;
+				/* FIXME: other option should be to duplicate
+				   the block for each duplicate and point the
+				   metadata at the cloned blocks */
+				check_metatree(ip, &pass1b_fxns_delete);
+			}
+		}
+		/* Now we've got to go through and delete any other duplicate
+		   references from this dinode we're deleting. If we don't,
+		   pass1b will discover the other duplicate record, try to
+		   delete this dinode a second time, and this time its earlier
+		   duplicate references won't be seen as duplicates anymore
+		   (because they were eliminated earlier in pass1b). And so
+		   the blocks will be mistakenly freed, when, in fact, they're
+		   still being referenced by a valid dinode. */
+		if (this_ref != ref_as_ea)
+			delete_all_dups(ip);
+		fsck_inode_put(&ip); /* out, brelse, free */
+	}
+	return;
+}
+
+/* Metadata callback for the clone walk: marks every block valid and not a
+   duplicate, and reads it into *bh for the walk to use. */
+static int clone_check_meta(struct gfs2_inode *ip, uint64_t block,
+			    struct gfs2_buffer_head **bh, int h,
+			    int *is_valid, int *was_duplicate, void *private)
+{
+	*was_duplicate = 0;
+	*is_valid = 1;
+	*bh = bread(ip->i_sbd, block);
+	return 0;
+}
+
+/* clone_data - clone a duplicate reference
+ *
+ * This function remembers the first reference to the specified block, and
+ * clones all subsequent references to it (with permission).
+ *
+ * @private points to a struct clone_target; @ptr is the on-disk pointer
+ * inside @bh that refers to @block. Always returns 0.
+ */
+static int clone_data(struct gfs2_inode *ip, uint64_t metablock,
+		      uint64_t block, void *private,
+		      struct gfs2_buffer_head *bh, uint64_t *ptr)
+{
+	struct clone_target *clonet = (struct clone_target *)private;
+	struct gfs2_buffer_head *clone_bh;
+	uint64_t cloneblock;
+	int error;
+
+	/* Only references to the duplicated block are of interest. */
+	if (block != clonet->dup_block)
+		return 0;
+
+	/* The first reference keeps the original block. */
+	if (clonet->first) {
+		log_debug(_("Inode %lld (0x%llx)'s first reference to "
+			    "block %lld (0x%llx) is targeted for cloning.\n"),
+			  (unsigned long long)ip->i_di.di_num.no_addr,
+			  (unsigned long long)ip->i_di.di_num.no_addr,
+			  (unsigned long long)block,
+			  (unsigned long long)block);
+		clonet->first = 0;
+		return 0;
+	}
+	log_err(_("Error: Inode %lld (0x%llx)'s reference to block %lld "
+		  "(0x%llx) should be replaced with a clone.\n"),
+		(unsigned long long)ip->i_di.di_num.no_addr,
+		(unsigned long long)ip->i_di.di_num.no_addr,
+		(unsigned long long)block, (unsigned long long)block);
+	if (query( _("Okay to clone the duplicated reference? (y/n) "))) {
+		error = lgfs2_meta_alloc(ip, &cloneblock);
+		if (!error) {
+			clone_bh = bread(ip->i_sbd, clonet->dup_block);
+			if (clone_bh) {
+				fsck_bitmap_set(ip, cloneblock, _("data"),
+						GFS2_BLKST_USED);
+				/* Retarget the buffer read from the original
+				   block at the new block number and mark it
+				   modified -- presumably so its contents are
+				   written out at the clone's address. */
+				clone_bh->b_blocknr = cloneblock;
+				bmodified(clone_bh);
+				brelse(clone_bh);
+				/* Now fix the reference: */
+				*ptr = cpu_to_be64(cloneblock);
+				bmodified(bh);
+				log_err(_("Duplicate reference to block %lld "
+					  "(0x%llx) was cloned to block %lld "
+					  "(0x%llx).\n"),
+					(unsigned long long)block,
+					(unsigned long long)block,
+					(unsigned long long)cloneblock,
+					(unsigned long long)cloneblock);
+				return 0;
+			}
+		}
+		/* Reached when allocation failed or the read returned NULL;
+		   offer to zero the pointer instead. */
+		log_err(_("Error: Unable to allocate a new data block.\n"));
+		if (!query("Should I zero the reference instead? (y/n)")) {
+			log_err(_("Duplicate reference to block %lld "
+				  "(0x%llx) was not fixed.\n"),
+				(unsigned long long)block,
+				(unsigned long long)block);
+			return 0;
+		}
+		*ptr = 0;
+		bmodified(bh);
+		log_err(_("Duplicate reference to block %lld (0x%llx) was "
+			  "zeroed.\n"),
+			(unsigned long long)block,
+			(unsigned long long)block);
+	} else {
+		log_err(_("Duplicate reference to block %lld (0x%llx) "
+			  "was not fixed.\n"), (unsigned long long)block,
+			(unsigned long long)block);
+	}
+	return 0;
+}
+
+/* clone_dup_ref_in_inode - clone a duplicate reference within a single inode
+ *
+ * This function traverses the metadata tree of an inode, cloning all
+ * but the first reference to a duplicate block reference.
+ */ +static void clone_dup_ref_in_inode(struct gfs2_inode *ip, struct duptree *dt) +{ + int error; + struct clone_target clonet = {.dup_block = dt->block, .first = 1}; + struct metawalk_fxns pass1b_fxns_clone = { + .private = &clonet, + .check_metalist = clone_check_meta, + .check_data = clone_data, + }; + + log_err(_("There are multiple references to block %lld (0x%llx) in " + "inode %lld (0x%llx)\n"), + (unsigned long long)ip->i_di.di_num.no_addr, + (unsigned long long)ip->i_di.di_num.no_addr, + (unsigned long long)dt->block, (unsigned long long)dt->block); + error = check_metatree(ip, &pass1b_fxns_clone); + if (error) { + log_err(_("Error cloning duplicate reference(s) to block %lld " + "(0x%llx).\n"), (unsigned long long)dt->block, + (unsigned long long)dt->block); + } +} + +static int set_ip_bitmap(struct gfs2_inode *ip) +{ + uint64_t block = ip->i_bh->b_blocknr; + uint32_t mode; + const char *ty; + + if (ip->i_sbd->gfs1) + mode = gfs_to_gfs2_mode(ip); + else + mode = ip->i_di.di_mode & S_IFMT; + + switch (mode) { + case S_IFDIR: + ty = _("directory"); + break; + case S_IFREG: + ty = _("file"); + break; + case S_IFLNK: + ty = _("symlink"); + break; + case S_IFBLK: + ty = _("block device"); + break; + case S_IFCHR: + ty = _("character device"); + break; + case S_IFIFO: + ty = _("fifo"); + break; + case S_IFSOCK: + ty = _("socket"); + break; + default: + return -EINVAL; + } + fsck_bitmap_set(ip, block, ty, GFS2_BLKST_DINODE); + return 0; +} + +static void resolve_last_reference(struct gfs2_sbd *sdp, struct duptree *dt, + enum dup_ref_type acceptable_ref) +{ + struct gfs2_inode *ip; + struct inode_with_dups *id; + osi_list_t *tmp; + int q; + + log_notice( _("Block %llu (0x%llx) has only one remaining " + "valid inode referencing it.\n"), + (unsigned long long)dt->block, + (unsigned long long)dt->block); + /* If we're down to a single reference (and not all references + deleted, which may be the case of an inode that has only + itself and a reference), we need to 
reset the block type + from invalid to data or metadata. Start at the first one + in the list, not the structure's place holder. */ + tmp = dt->ref_inode_list.next; + id = osi_list_entry(tmp, struct inode_with_dups, list); + log_debug( _("----------------------------------------------\n" + "Step 4. Set block type based on the remaining " + "reference in inode %lld (0x%llx).\n"), + (unsigned long long)id->block_no, + (unsigned long long)id->block_no); + ip = fsck_load_inode(sdp, id->block_no); + + if (dt->dup_flags & DUPFLAG_REF1_IS_DUPL) + clone_dup_ref_in_inode(ip, dt); + + q = bitmap_type(sdp, id->block_no); + if (q == GFS2_BLKST_FREE) { + log_debug( _("The remaining reference inode %lld (0x%llx) was " + "already marked free.\n"), + (unsigned long long)id->block_no, + (unsigned long long)id->block_no); + } else if (id->reftypecount[ref_is_inode]) { + set_ip_bitmap(ip); + } else if (id->reftypecount[ref_as_data]) { + fsck_bitmap_set(ip, dt->block, _("reference-repaired data"), + GFS2_BLKST_USED); + } else if (id->reftypecount[ref_as_meta]) { + if (is_dir(&ip->i_di, sdp->gfs1)) + fsck_bitmap_set(ip, dt->block, + _("reference-repaired leaf"), + sdp->gfs1 ? GFS2_BLKST_DINODE : + GFS2_BLKST_USED); + else + fsck_bitmap_set(ip, dt->block, + _("reference-repaired indirect"), + sdp->gfs1 ? GFS2_BLKST_DINODE : + GFS2_BLKST_USED); + } else { + if (acceptable_ref == ref_as_ea) + fsck_bitmap_set(ip, dt->block, + _("reference-repaired extended " + "attribute"), + sdp->gfs1 ? GFS2_BLKST_DINODE : + GFS2_BLKST_USED); + else { + log_err(_("Error: The remaining reference to block " + " %lld (0x%llx) is as extended attribute, " + "in inode %lld (0x%llx) but the block is " + "not an EA.\n"), + (unsigned long long)dt->block, + (unsigned long long)dt->block, + (unsigned long long)id->block_no, + (unsigned long long)id->block_no); + if (query(_("Okay to remove the bad extended " + "attribute from inode %lld (0x%llx)? 
" + "(y/n) "), + (unsigned long long)id->block_no, + (unsigned long long)id->block_no)) { + ip->i_di.di_eattr = 0; + ip->i_di.di_flags &= ~GFS2_DIF_EA_INDIRECT; + ip->i_di.di_blocks--; + bmodified(ip->i_bh); + fsck_bitmap_set(ip, dt->block, + _("reference-repaired EA"), + GFS2_BLKST_FREE); + log_err(_("The bad extended attribute was " + "removed.\n")); + } else { + log_err(_("The bad extended attribute was not " + "removed.\n")); + } + } + } + fsck_inode_put(&ip); /* out, brelse, free */ + log_debug(_("Done with duplicate reference to block 0x%llx\n"), + (unsigned long long)dt->block); + dup_delete(dt); +} + +/* handle_dup_blk - handle a duplicate block reference. + * + * This function should resolve and delete the duplicate block reference given, + * iow dt. + */ +static int handle_dup_blk(struct gfs2_sbd *sdp, struct duptree *dt) +{ + osi_list_t *tmp; + struct dup_handler dh = {0}; + struct gfs2_buffer_head *bh; + uint32_t cmagic, ctype; + enum dup_ref_type acceptable_ref; + uint64_t dup_blk; + + dup_blk = dt->block; + revise_dup_handler(dup_blk, &dh); + + /* Log the duplicate references */ + log_notice( _("Block %llu (0x%llx) has %d inodes referencing it" + " for a total of %d duplicate references:\n"), + (unsigned long long)dt->block, + (unsigned long long)dt->block, + dh.ref_inode_count, dh.ref_count); + + osi_list_foreach(tmp, &dt->ref_invinode_list) + log_inode_reference(dt, tmp, 1); + osi_list_foreach(tmp, &dt->ref_inode_list) + log_inode_reference(dt, tmp, 0); + + /* Figure out the block type to see if we can eliminate references + to a different type. In other words, if the duplicate block looks + like metadata, we can delete dinodes that reference it as data. + If the block doesn't look like metadata, we can eliminate any + references to it as metadata. Dinodes with such references are + clearly corrupt and need to be deleted. + And if we're left with a single reference, problem solved. 
*/ + bh = bread(sdp, dt->block); + cmagic = ((struct gfs2_meta_header *)(bh->b_data))->mh_magic; + ctype = ((struct gfs2_meta_header *)(bh->b_data))->mh_type; + brelse(bh); + + /* If this is a dinode, any references to it (except in directory + entries) are invalid and should be deleted. */ + if (be32_to_cpu(cmagic) == GFS2_MAGIC && + be32_to_cpu(ctype) == GFS2_METATYPE_DI) + acceptable_ref = ref_is_inode; + else if (be32_to_cpu(cmagic) == GFS2_MAGIC && + (be32_to_cpu(ctype) == GFS2_METATYPE_EA || + be32_to_cpu(ctype) == GFS2_METATYPE_ED)) + acceptable_ref = ref_as_ea; + else if (be32_to_cpu(cmagic) == GFS2_MAGIC && + be32_to_cpu(ctype) <= GFS2_METATYPE_QC) + acceptable_ref = ref_as_meta; + else + acceptable_ref = ref_as_data; + + /* A single reference to the block implies a possible situation where + a data pointer points to a metadata block. In other words, the + duplicate reference in the file system is (1) Metadata block X and + (2) A dinode reference such as a data pointer pointing to block X. + We can't really check for that in pass1 because user data might + just _look_ like metadata by coincidence, and at the time we're + checking, we might not have processed the referenced block. + Here in pass1b we're sure. */ + /* Another possibility here is that there is a single reference + because all the other metadata references were in inodes that got + invalidated for other reasons, such as bad pointers. So we need to + make sure at this point that any inode deletes reverse out any + duplicate reference before we get to this point. */ + + /* Step 1 - eliminate references from inodes that are not valid. + * This may be because they were deleted due to corruption. + * All block types are unacceptable, so we use ref_types. 
+ */ + if (dh.ref_count > 1) { + log_debug( _("----------------------------------------------\n" + "Step 1: Eliminate references to block %llu " + "(0x%llx) that were previously marked " + "invalid.\n"), + (unsigned long long)dt->block, + (unsigned long long)dt->block); + resolve_dup_references(sdp, dt, &dt->ref_invinode_list, + &dh, 1, ref_types); + revise_dup_handler(dup_blk, &dh); + } + /* Step 2 - eliminate reference from inodes that reference it as the + * wrong type. For example, a data file referencing it as + * a data block, but it's really a metadata block. Or a + * directory inode referencing a data block as a leaf block. + */ + if (dh.ref_count > 1) { + log_debug( _("----------------------------------------------\n" + "Step 2: Eliminate references to block %llu " + "(0x%llx) that need the wrong block type.\n"), + (unsigned long long)dt->block, + (unsigned long long)dt->block); + resolve_dup_references(sdp, dt, &dt->ref_inode_list, &dh, 0, + acceptable_ref); + revise_dup_handler(dup_blk, &dh); + } + /* Step 3 - We have multiple dinodes referencing it as the correct + * type. Just blast one of them. + * All block types are fair game, so we use ref_types. + */ + if (dh.ref_count > 1) { + log_debug( _("----------------------------------------------\n" + "Step 3: Choose one reference to block %llu " + "(0x%llx) to keep.\n"), + (unsigned long long)dt->block, + (unsigned long long)dt->block); + resolve_dup_references(sdp, dt, &dt->ref_inode_list, &dh, 0, + ref_types); + revise_dup_handler(dup_blk, &dh); + } + /* If there's still a last remaining reference, and it's a valid + reference, use it to determine the correct block type for our + blockmap and bitmap. */ + if (dh.ref_inode_count == 1 && !osi_list_empty(&dt->ref_inode_list)) { + resolve_last_reference(sdp, dt, acceptable_ref); + } else { + /* They may have answered no and not fixed all references. 
*/ + log_debug( _("All duplicate references to block 0x%llx were " + "processed.\n"), (unsigned long long)dup_blk); + if (dh.ref_count) { + log_debug(_("Done with duplicate reference to block " + "0x%llx, but %d references remain.\n"), + (unsigned long long)dup_blk, dh.ref_count); + } else { + log_notice( _("Block %llu (0x%llx) has no more " + "references; Marking as 'free'.\n"), + (unsigned long long)dup_blk, + (unsigned long long)dup_blk); + if (dh.dt) + dup_delete(dh.dt); + check_n_fix_bitmap(sdp, NULL, dup_blk, 0, + GFS2_BLKST_FREE); + } + } + return 0; +} + +static int check_leaf_refs(struct gfs2_inode *ip, uint64_t block, + void *private) +{ + return add_duplicate_ref(ip, block, ref_as_meta, 1, INODE_VALID); +} + +static int check_metalist_refs(struct gfs2_inode *ip, uint64_t block, + struct gfs2_buffer_head **bh, int h, + int *is_valid, int *was_duplicate, + void *private) +{ + *was_duplicate = 0; + *is_valid = 1; + return add_duplicate_ref(ip, block, ref_as_meta, 1, INODE_VALID); +} + +static int check_data_refs(struct gfs2_inode *ip, uint64_t metablock, + uint64_t block, void *private, + struct gfs2_buffer_head *bh, uint64_t *ptr) +{ + return add_duplicate_ref(ip, block, ref_as_data, 1, INODE_VALID); +} + +static int check_eattr_indir_refs(struct gfs2_inode *ip, uint64_t block, + uint64_t parent, + struct gfs2_buffer_head **bh, void *private) +{ + struct gfs2_sbd *sdp = ip->i_sbd; + int error; + + error = add_duplicate_ref(ip, block, ref_as_ea, 1, INODE_VALID); + if (!error) + *bh = bread(sdp, block); + + return error; +} + +static int check_eattr_leaf_refs(struct gfs2_inode *ip, uint64_t block, + uint64_t parent, struct gfs2_buffer_head **bh, + void *private) +{ + struct gfs2_sbd *sdp = ip->i_sbd; + int error; + + error = add_duplicate_ref(ip, block, ref_as_ea, 1, INODE_VALID); + if (!error) + *bh = bread(sdp, block); + return error; +} + +static int check_eattr_entry_refs(struct gfs2_inode *ip, + struct gfs2_buffer_head *leaf_bh, + struct gfs2_ea_header 
*ea_hdr, + struct gfs2_ea_header *ea_hdr_prev, + void *private) +{ + return 0; +} + +static int check_eattr_extentry_refs(struct gfs2_inode *ip, int i, + uint64_t *ea_data_ptr, + struct gfs2_buffer_head *leaf_bh, + uint32_t tot_ealen, + struct gfs2_ea_header *ea_hdr, + struct gfs2_ea_header *ea_hdr_prev, + void *private) +{ + uint64_t block = be64_to_cpu(*ea_data_ptr); + + /* This is a case where a bad return code may be sent back, and + behavior has changed. Before, if add_duplicate_ref returned a + non-zero return code, the caller would delete the eattr from + the blockmap. In this case, we should be okay because the only + error possible is a malloc that fails, in which case we don't + want to delete the eattr anyway. */ + return add_duplicate_ref(ip, block, ref_as_ea, 1, INODE_VALID); +} + +/* find_block_ref - walk one dinode's metadata tree and extended attributes */ +/* Finds all references to duplicate blocks in the metadata */ +static int find_block_ref(struct gfs2_sbd *sdp, uint64_t inode) +{ + struct gfs2_inode *ip; + int error = 0; + struct metawalk_fxns find_refs = { + .private = NULL, + .check_leaf = check_leaf_refs, + .check_metalist = check_metalist_refs, + .check_data = check_data_refs, + .check_eattr_indir = check_eattr_indir_refs, + .check_eattr_leaf = check_eattr_leaf_refs, + .check_eattr_entry = check_eattr_entry_refs, + .check_eattr_extentry = check_eattr_extentry_refs, + }; + + ip = fsck_load_inode(sdp, inode); /* bread, inode_get */ + + /* double-check the meta header just to be sure it's metadata */ + if (ip->i_di.di_header.mh_magic != GFS2_MAGIC || + ip->i_di.di_header.mh_type != GFS2_METATYPE_DI) { + if (!sdp->gfs1) + log_debug( _("Block %lld (0x%llx) is not a dinode.\n"), + (unsigned long long)inode, + (unsigned long long)inode); + error = 1; + goto out; + } + /* Check to see if this inode was referenced by another by mistake */ + add_duplicate_ref(ip, inode, ref_is_inode, 1, INODE_VALID); + + /* Check this dinode's metadata for references to known
duplicates */ + error = check_metatree(ip, &find_refs); + if (error < 0) + stack; + + /* Check for ea references in the inode */ + if (!error) + error = check_inode_eattr(ip, &find_refs); + +out: + fsck_inode_put(&ip); /* out, brelse, free */ + return error; +} + +/* Pass 1b handles finding the previous inode for a duplicate block + * When found, store the inodes pointing to the duplicate block for + * use in pass2 */ +int pass1b(struct gfs2_sbd *sdp) +{ + struct duptree *dt; + uint64_t i; + int q; + struct osi_node *n; + int rc = FSCK_OK; + + log_info( _("Looking for duplicate blocks...\n")); + + /* If there were no dups in the bitmap, we don't need to do anymore */ + if (dup_blocks.osi_node == NULL) { + log_info( _("No duplicate blocks found\n")); + return FSCK_OK; + } + + /* Rescan the fs looking for pointers to blocks that are in + * the duplicate block map */ + log_info( _("Scanning filesystem for inodes containing duplicate blocks...\n")); + log_debug( _("Filesystem has %llu (0x%llx) blocks total\n"), + (unsigned long long)last_fs_block, + (unsigned long long)last_fs_block); + for (i = 0; i < last_fs_block; i++) { + if (skip_this_pass || fsck_abort) /* if asked to skip the rest */ + goto out; + + if (dups_found_first == dups_found) { + log_debug(_("Found all %d original references to " + "duplicates.\n"), dups_found); + break; + } + q = bitmap_type(sdp, i); + + if (q == GFS2_BLKST_FREE || q == GFS2_BLKST_USED || q < 0) + continue; + + if (q == GFS2_BLKST_UNLINKED) { + log_debug( _("Error: block %lld (0x%llx) is still " + "marked UNLINKED.\n"), + (unsigned long long)i, + (unsigned long long)i); + return FSCK_ERROR; + } + + warm_fuzzy_stuff(i); + if (find_block_ref(sdp, i) < 0) { + stack; + rc = FSCK_ERROR; + goto out; + } + } + + /* Fix dups here - it's going to slow things down a lot to fix + * it later */ + log_info( _("Handling duplicate blocks\n")); +out: + /* Resolve all duplicates by clearing out the dup tree */ + while ((n = osi_first(&dup_blocks))) { + 
dt = (struct duptree *)n; + if (!skip_this_pass && !rc) /* no error & not asked to skip the rest */ + handle_dup_blk(sdp, dt); + } + return rc; +} diff --git a/gfs2/fsck/pass2.c b/gfs2/fsck/pass2.c new file mode 100644 index 0000000..763e39a --- /dev/null +++ b/gfs2/fsck/pass2.c @@ -0,0 +1,2283 @@ +#include "clusterautoconfig.h" + +#include +#include +#include +#include +#include +#include +#include +#define _(String) gettext(String) + +#include +#include "libgfs2.h" +#include "fsck.h" +#include "util.h" +#include "metawalk.h" +#include "link.h" +#include "lost_n_found.h" +#include "inode_hash.h" +#include "afterpass1_common.h" + +#define MAX_FILENAME 256 + +struct metawalk_fxns pass2_fxns; + +struct metawalk_fxns delete_eattrs = { + .check_eattr_indir = delete_eattr_indir, + .check_eattr_leaf = delete_eattr_leaf, + .check_eattr_entry = delete_eattr_entry, + .check_eattr_extentry = delete_eattr_extentry, +}; + +/* Set children's parent inode in dir_info structure - ext2 does not set + * dotdot inode here, but instead in pass3 - should we? 
+ */ +static int set_parent_dir(struct gfs2_sbd *sdp, struct gfs2_inum child, + struct gfs2_inum parent) +{ + struct dir_info *di; + + di = dirtree_find(child.no_addr); + if (!di) { + log_err( _("Unable to find block %llu (0x%llx" + ") in dir_info list\n"), + (unsigned long long)child.no_addr, + (unsigned long long)child.no_addr); + return -1; + } + + if (di->dinode.no_addr == child.no_addr && + di->dinode.no_formal_ino == child.no_formal_ino) { + if (di->treewalk_parent) { + log_err( _("Another directory at block %lld (0x%llx) " + "already contains this child %lld (0x%llx)" + " - checking parent %lld (0x%llx)\n"), + (unsigned long long)di->treewalk_parent, + (unsigned long long)di->treewalk_parent, + (unsigned long long)child.no_addr, + (unsigned long long)child.no_addr, + (unsigned long long)parent.no_addr, + (unsigned long long)parent.no_addr); + return 1; + } + log_debug( _("Child %lld (0x%llx) has parent %lld (0x%llx)\n"), + (unsigned long long)child.no_addr, + (unsigned long long)child.no_addr, + (unsigned long long)parent.no_addr, + (unsigned long long)parent.no_addr); + di->treewalk_parent = parent.no_addr; + } + + return 0; +} + +/* Sets the child's '..' directory inode number in dir_info structure */ +static int set_dotdot_dir(struct gfs2_sbd *sdp, uint64_t childblock, + struct gfs2_inum parent) +{ + struct dir_info *di; + + di = dirtree_find(childblock); + if (!di) { + log_err( _("Unable to find block %"PRIu64" (0x%" PRIx64 + ") in dir_info tree\n"), childblock, childblock); + return -1; + } + if (di->dinode.no_addr != childblock) { + log_debug("'..'
doesn't point to what we found: childblock " + "(0x%llx) != dinode (0x%llx)\n", + (unsigned long long)childblock, + (unsigned long long)di->dinode.no_addr); + return -1; + } + /* Special case for root inode because we set it earlier */ + if (di->dotdot_parent.no_addr && + sdp->md.rooti->i_di.di_num.no_addr != di->dinode.no_addr) { + /* This should never happen */ + log_crit( _("Dotdot parent already set for block %llu (0x%llx)" + "-> %llu (0x%llx)\n"), + (unsigned long long)childblock, + (unsigned long long)childblock, + (unsigned long long)di->dotdot_parent.no_addr, + (unsigned long long)di->dotdot_parent.no_addr); + return -1; + } + log_debug("Setting '..' for directory block (0x%llx) to parent " + "(0x%llx)\n", (unsigned long long)childblock, + (unsigned long long)parent.no_addr); + di->dotdot_parent.no_addr = parent.no_addr; + di->dotdot_parent.no_formal_ino = parent.no_formal_ino; + return 0; +} + +static int check_eattr_indir(struct gfs2_inode *ip, uint64_t block, + uint64_t parent, struct gfs2_buffer_head **bh, + void *private) +{ + *bh = bread(ip->i_sbd, block); + return 0; +} +static int check_eattr_leaf(struct gfs2_inode *ip, uint64_t block, + uint64_t parent, struct gfs2_buffer_head **bh, + void *private) +{ + *bh = bread(ip->i_sbd, block); + return 0; +} + +static const char *de_type_string(uint8_t de_type) +{ + const char *de_types[15] = {"unknown", "fifo", "chrdev", "invalid", + "directory", "invalid", "blkdev", "invalid", + "file", "invalid", "symlink", "invalid", + "socket", "invalid", "wht"}; + if (de_type < 15) + return de_types[de_type]; + return de_types[3]; /* invalid */ +} + +static int check_file_type(uint64_t block, uint8_t de_type, int q, + int gfs1, int *isdir) +{ + struct dir_info *dt; + + *isdir = 0; + if (q != GFS2_BLKST_DINODE) { + log_err( _("Invalid block type\n")); + return -1; + } + if (de_type == (gfs1 ? 
GFS_FILE_DIR : DT_DIR)) + *isdir = 1; + /* Check if the dinode is in the dir tree */ + dt = dirtree_find(block); + /* This is a bit confusing, so let me explain: + If the dirent says the inode supposed to be for a directory, + it should be in the dir tree. If it is, no problem, return 0. + If it's not, return 1 (wrong type). If it's not supposed to be + a directory, it shouldn't be in the dir tree. */ + if (dt) + return !(*isdir); + return *isdir; +} + +struct metawalk_fxns pass2_fxns_delete = { + .private = NULL, + .check_metalist = delete_metadata, + .check_data = delete_data, + .check_leaf = delete_leaf, + .check_eattr_indir = delete_eattr_indir, + .check_eattr_leaf = delete_eattr_leaf, + .check_eattr_entry = delete_eattr_entry, + .check_eattr_extentry = delete_eattr_extentry, +}; + +/* bad_formal_ino - handle mismatches in formal inode number + * Returns: 0 if the dirent was repaired + * 1 if the caller should delete the dirent + */ +static int bad_formal_ino(struct gfs2_inode *ip, struct gfs2_dirent *dent, + struct gfs2_inum entry, const char *tmp_name, + int q, struct gfs2_dirent *de, + struct gfs2_buffer_head *bh) +{ + struct inode_info *ii; + struct dir_info *di = NULL; + struct gfs2_inode *child_ip; + struct gfs2_inum childs_dotdot; + struct gfs2_sbd *sdp = ip->i_sbd; + int error; + struct gfs2_inum inum = { 0 }; + + ii = inodetree_find(entry.no_addr); + if (ii) + inum = ii->di_num; + else { + di = dirtree_find(entry.no_addr); + if (di) + inum = di->dinode; + else if (link1_type(&clink1map, entry.no_addr) == 1) { + struct gfs2_inode *dent_ip; + + dent_ip = fsck_load_inode(ip->i_sbd, entry.no_addr); + inum = dent_ip->i_di.di_num; + fsck_inode_put(&dent_ip); + } + } + log_err( _("Directory entry '%s' pointing to block %llu (0x%llx) in " + "directory %llu (0x%llx) has the wrong 'formal' inode " + "number.\n"), tmp_name, (unsigned long long)entry.no_addr, + (unsigned long long)entry.no_addr, + (unsigned long long)ip->i_di.di_num.no_addr, + (unsigned long 
long)ip->i_di.di_num.no_addr); + log_err( _("The directory entry has %llu (0x%llx) but the inode has " + "%llu (0x%llx)\n"), (unsigned long long)entry.no_formal_ino, + (unsigned long long)entry.no_formal_ino, + (unsigned long long)inum.no_formal_ino, + (unsigned long long)inum.no_formal_ino); + if (q != GFS2_BLKST_DINODE || !strcmp("..", tmp_name)) { + if (query( _("Remove the corrupt directory entry? (y/n) "))) + return 1; + log_err( _("Corrupt directory entry not removed.\n")); + return 0; + } + /* We have a directory pointing to another directory, but the + formal inode number still doesn't match. If that directory + has a '..' pointing back, just fix up the no_formal_ino. */ + child_ip = lgfs2_inode_read(sdp, entry.no_addr); + error = dir_search(child_ip, "..", 2, NULL, &childs_dotdot); + if (!error && childs_dotdot.no_addr == ip->i_di.di_num.no_addr) { + log_err( _("The entry points to another directory with intact " + "linkage.\n")); + if (query( _("Fix the bad directory entry? (y/n) "))) { + log_err( _("Fixing the corrupt directory entry.\n")); + entry.no_formal_ino = inum.no_formal_ino; + de->de_inum.no_formal_ino = entry.no_formal_ino; + gfs2_dirent_out(de, (char *)dent); + bmodified(bh); + incr_link_count(entry, ip, _("fixed reference")); + set_parent_dir(sdp, entry, ip->i_di.di_num); + } else { + log_err( _("Directory entry not fixed.\n")); + } + } else { + if (query( _("Remove the corrupt directory entry? 
(y/n) "))) { + inode_put(&child_ip); + return 1; + } + log_err( _("Corrupt directory entry not removed.\n")); + } + inode_put(&child_ip); + return 0; +} + +static int hash_table_index(uint32_t hash, struct gfs2_inode *ip) +{ + return hash >> (32 - ip->i_di.di_depth); +} + +static int hash_table_max(int lindex, struct gfs2_inode *ip, + struct gfs2_buffer_head *bh) +{ + struct gfs2_leaf *leaf = (struct gfs2_leaf *)bh->b_data; + return (1 << (ip->i_di.di_depth - be16_to_cpu(leaf->lf_depth))) + + lindex - 1; +} + +static int check_leaf_depth(struct gfs2_inode *ip, uint64_t leaf_no, + int ref_count, struct gfs2_buffer_head *lbh) +{ + struct gfs2_leaf *leaf = (struct gfs2_leaf *)lbh->b_data; + int cur_depth = be16_to_cpu(leaf->lf_depth); + int exp_count = 1 << (ip->i_di.di_depth - cur_depth); + int divisor; + int factor, correct_depth; + + if (exp_count == ref_count) + return 0; + + factor = 0; + divisor = ref_count; + while (divisor > 1) { + factor++; + divisor >>= 1; + } + if (ip->i_di.di_depth < factor) /* can't be fixed--leaf must be on the + wrong dinode. */ + return -1; + correct_depth = ip->i_di.di_depth - factor; + if (cur_depth == correct_depth) + return 0; + + log_err(_("Leaf block %llu (0x%llx) in dinode %llu (0x%llx) has the " + "wrong depth: is %d (length %d), should be %d (length " + "%d).\n"), + (unsigned long long)leaf_no, (unsigned long long)leaf_no, + (unsigned long long)ip->i_di.di_num.no_addr, + (unsigned long long)ip->i_di.di_num.no_addr, + cur_depth, ref_count, correct_depth, exp_count); + if (!query( _("Fix the leaf block? 
(y/n)"))) { + log_err( _("The leaf block was not fixed.\n")); + return 0; + } + + leaf->lf_depth = cpu_to_be16(correct_depth); + bmodified(lbh); + log_err( _("The leaf block depth was fixed.\n")); + return 1; +} + +/* wrong_leaf: Deal with a dirent discovered to be on the wrong leaf block + * + * Returns: 1 if the dirent is to be removed, 0 if it needs to be kept, + * or -1 on error + */ +static int wrong_leaf(struct gfs2_inode *ip, struct gfs2_inum *entry, + const char *tmp_name, int *lindex, int lindex_max, + int hash_index, struct gfs2_buffer_head *bh, + struct dir_status *ds, struct gfs2_dirent *dent, + struct gfs2_dirent *de, struct gfs2_dirent *prev_de, + uint32_t *count, int q) +{ + struct gfs2_sbd *sdp = ip->i_sbd; + struct gfs2_buffer_head *dest_lbh; + uint64_t planned_leaf, real_leaf; + int li, dest_ref, error; + uint64_t *tbl; + int di_depth; + + log_err(_("Directory entry '%s' at block %lld (0x%llx) is on the " + "wrong leaf block.\n"), tmp_name, + (unsigned long long)entry->no_addr, + (unsigned long long)entry->no_addr); + log_err(_("Leaf index is: 0x%x. The range for this leaf block is " + "0x%x - 0x%x\n"), hash_index, *lindex, lindex_max); + if (!query( _("Move the misplaced directory entry to " + "a valid leaf block? 
(y/n) "))) { + log_err( _("Misplaced directory entry not moved.\n")); + return 0; + } + + /* check the destination leaf block's depth */ + tbl = get_dir_hash(ip); + if (tbl == NULL) { + perror("get_dir_hash"); + return -1; + } + planned_leaf = be64_to_cpu(tbl[hash_index]); + log_err(_("Moving it from leaf %llu (0x%llx) to %llu (0x%llx)\n"), + (unsigned long long)be64_to_cpu(tbl[*lindex]), + (unsigned long long)be64_to_cpu(tbl[*lindex]), + (unsigned long long)planned_leaf, + (unsigned long long)planned_leaf); + /* Can't trust lf_depth; we have to count */ + dest_ref = 0; + for (li = 0; li < (1 << ip->i_di.di_depth); li++) { + if (be64_to_cpu(tbl[li]) == planned_leaf) + dest_ref++; + else if (dest_ref) + break; + } + dest_lbh = bread(sdp, planned_leaf); + check_leaf_depth(ip, planned_leaf, dest_ref, dest_lbh); + brelse(dest_lbh); + free(tbl); + + /* check if it's already on the correct leaf block */ + error = dir_search(ip, tmp_name, de->de_name_len, NULL, &de->de_inum); + if (!error) { + log_err(_("The misplaced directory entry already appears on " + "the correct leaf block.\n")); + log_err( _("The bad duplicate directory entry " + "'%s' was cleared.\n"), tmp_name); + return 1; /* nuke the dent upon return */ + } + + di_depth = ip->i_di.di_depth; + if (dir_add(ip, tmp_name, de->de_name_len, &de->de_inum, + de->de_type) == 0) { + log_err(_("The misplaced directory entry was moved to a " + "valid leaf block.\n")); + if (ip->i_di.di_depth > di_depth) { + log_err(_("Directory hash table was doubled.\n")); + hash_index <<= (ip->i_di.di_depth - di_depth); + (*lindex) <<= (ip->i_di.di_depth - di_depth); + } + if (lgfs2_get_leaf_ptr(ip, hash_index, &real_leaf)) { + log_err(_("Could not read leaf %d in dinode %"PRIu64": %s\n"), hash_index, + (uint64_t)ip->i_di.di_num.no_addr, strerror(errno)); + } + if (real_leaf != planned_leaf) { + log_err(_("The planned leaf was split. The new leaf " + "is: %llu (0x%llx). 
di_blocks=%llu\n"), + (unsigned long long)real_leaf, + (unsigned long long)real_leaf, + (unsigned long long)ip->i_di.di_blocks); + fsck_bitmap_set(ip, real_leaf, _("split leaf"), + sdp->gfs1 ? GFS2_BLKST_DINODE : + GFS2_BLKST_USED); + } + /* If the misplaced dirent was supposed to be earlier in the + hash table, we need to adjust our counts for the blocks + that have already been processed. If it's supposed to + appear later, we'll count it has part of our normal + processing when we get to that leaf block later on in the + hash table. */ + if (hash_index > *lindex) { + log_err(_("Accounting deferred.\n")); + return 1; /* nuke the dent upon return */ + } + /* If we get here, it's because we moved a dent to another + leaf, but that leaf has already been processed. So we have + to nuke the dent from this leaf when we return, but we + still need to do the "good dent" accounting. */ + if (de->de_type == (sdp->gfs1 ? GFS_FILE_DIR : DT_DIR)) { + error = set_parent_dir(sdp, de->de_inum, + ip->i_di.di_num); + if (error > 0) + /* This is a bit of a kludge, but returning 0 + in this case causes the caller to go through + function set_parent_dir a second time and + deal properly with the hard link. */ + return 0; + } + error = incr_link_count(*entry, ip, + _("moved valid reference")); + if (error > 0 && + bad_formal_ino(ip, dent, *entry, tmp_name, q, de, bh) == 1) + return 1; /* nuke it */ + + /* You cannot do this: + (*count)++; + The reason is: *count is the count of dentries on the leaf, + and we moved the dentry to a previous leaf within the same + directory dinode. So the directory counts still get + incremented, but not leaf entries. When we called dir_add + above, it should have fixed that prev leaf's lf_entries. 
+ */ + ds->entry_count++; + return 1; + } else { + log_err(_("Error moving directory entry.\n")); + return 1; /* nuke it */ + } +} + +/* basic_dentry_checks - fundamental checks for directory entries + * + * @ip: pointer to the in-core inode structure + * @entry: pointer to the inum info + * @tmp_name: user-friendly file name + * @count: pointer to the entry count + * @de: pointer to the directory entry + * + * Returns: 1 means corruption, nuke the dentry, 0 means checks pass, -1 on error + */ +static int basic_dentry_checks(struct gfs2_inode *ip, struct gfs2_dirent *dent, + struct gfs2_inum *entry, const char *tmp_name, + uint32_t *count, struct gfs2_dirent *de, + struct dir_status *ds, int *q, + struct gfs2_buffer_head *bh, int *isdir) +{ + struct gfs2_sbd *sdp = ip->i_sbd; + uint32_t calculated_hash; + struct gfs2_inode *entry_ip = NULL; + int error; + struct inode_info *ii; + struct dir_info *di = NULL; + struct gfs2_inum inum = { 0 }; + + *isdir = 0; + if (!valid_block_ip(ip, entry->no_addr)) { + log_err( _("Block # referenced by directory entry %s in inode " + "%lld (0x%llx) is invalid\n"), + tmp_name, (unsigned long long)ip->i_di.di_num.no_addr, + (unsigned long long)ip->i_di.di_num.no_addr); + if (query( _("Clear directory entry to out of range block? " + "(y/n) "))) { + return 1; + } else { + log_err( _("Directory entry to out of range block remains\n")); + (*count)++; + ds->entry_count++; + /* can't do this because the block is out of range: + incr_link_count(entry); */ + return 0; + } + } + + if (de->de_rec_len < GFS2_DIRENT_SIZE(de->de_name_len) || + de->de_name_len > GFS2_FNAMESIZE) { + log_err( _("Dir entry with bad record or name length\n" + "\tRecord length = %u\n\tName length = %u\n"), + de->de_rec_len, de->de_name_len); + if (!query( _("Clear the directory entry?
(y/n) "))) { + log_err( _("Directory entry not fixed.\n")); + return 0; + } + /* Don't be tempted to do this: + fsck_bitmap_set(ip, ip->i_di.di_num.no_addr, + _("corrupt directory entry"), + GFS2_BLKST_FREE); + We can't free it because another dir may have a valid reference + to it. Just return 1 so we can delete the bad dirent. */ + log_err( _("Bad directory entry deleted.\n")); + return 1; + } + + calculated_hash = gfs2_disk_hash(tmp_name, de->de_name_len); + if (de->de_hash != calculated_hash){ + log_err( _("Dir entry with bad hash or name length\n" + "\tHash found = %u (0x%x)\n" + "\tFilename = %s\n"), + de->de_hash, de->de_hash, tmp_name); + log_err( _("\tName length found = %u\n" + "\tHash expected = %u (0x%x)\n"), + de->de_name_len, calculated_hash, calculated_hash); + if (!query( _("Fix directory hash for %s? (y/n) "), + tmp_name)) { + log_err( _("Directory entry hash for %s not " + "fixed.\n"), tmp_name); + return 0; + } + de->de_hash = calculated_hash; + gfs2_dirent_out(de, (char *)dent); + bmodified(bh); + log_err( _("Directory entry hash for %s fixed.\n"), + tmp_name); + } + + *q = bitmap_type(sdp, entry->no_addr); + /* Get the status of the directory inode */ + /** + * 1. Blocks marked "invalid" were invalidated due to duplicate + * block references. Pass1b should have already taken care of deleting + * their metadata, so here we only need to delete the directory entries + * pointing to them. We delete the metadata in pass1b because we need + * to eliminate the inode referencing the duplicate-referenced block + * from the list of candidates to keep. So we have a delete-as-we-go + * policy. + * + * 2. Blocks marked "bad" need to have their entire + * metadata tree deleted. 
+ */ + if (*q != GFS2_BLKST_DINODE) { + log_err( _("Directory entry '%s' referencing inode %llu " + "(0x%llx) in dir inode %llu (0x%llx) block type " + "%d: %s.\n"), tmp_name, + (unsigned long long)entry->no_addr, + (unsigned long long)entry->no_addr, + (unsigned long long)ip->i_di.di_num.no_addr, + (unsigned long long)ip->i_di.di_num.no_addr, + *q, *q == GFS2_BLKST_FREE ? + _("was previously marked invalid") : + _("was deleted or is not an inode")); + + if (!query( _("Clear directory entry to non-inode block? " + "(y/n) "))) { + log_err( _("Directory entry to non-inode block remains\n")); + return 0; + } + + /* Don't decrement the link here: Here in pass2, we increment + only when we know it's okay. + decr_link_count(ip->i_di.di_num.no_addr, blah); */ + /* If it was previously marked invalid (i.e. known + to be bad, not just a free block, etc.) then the temptation + would be to delete any metadata it holds. The trouble is: + if it's invalid, we may or _may_not_ have traversed its + metadata tree, and therefore may or may not have marked the + blocks it points to as a metadata type, or as a duplicate. + If there is really a duplicate reference, but we didn't + process the metadata tree because it's invalid, some other + inode has a reference to the metadata block, in which case + freeing it would do more harm than good. IOW we cannot + count on "delete_block_if_notdup" knowing whether it's + really a duplicate block if we never traversed the metadata + tree for the invalid inode. 
*/ + return 1; + } + + error = check_file_type(entry->no_addr, de->de_type, *q, sdp->gfs1, + isdir); + if (error < 0) { + log_err( _("Error: directory entry type is " + "incompatible with block type at block %lld " + "(0x%llx) in directory inode %llu (0x%llx).\n"), + (unsigned long long)entry->no_addr, + (unsigned long long)entry->no_addr, + (unsigned long long)ip->i_di.di_num.no_addr, + (unsigned long long)ip->i_di.di_num.no_addr); + log_err( _("Directory entry type is %d, block type is %d.\n"), + de->de_type, *q); + stack; + return -1; + } + if (error > 0) { + log_err( _("Type '%s' in dir entry (%s, %llu/0x%llx) conflicts" + " with type '%s' in dinode. (Dir entry is stale.)\n"), + de_type_string(de->de_type), tmp_name, + (unsigned long long)entry->no_addr, + (unsigned long long)entry->no_addr, + block_type_string(*q)); + if (!query( _("Clear stale directory entry? (y/n) "))) { + log_err( _("Stale directory entry remains\n")); + return 0; + } + if (ip->i_di.di_num.no_addr == entry->no_addr) + entry_ip = ip; + else + entry_ip = fsck_load_inode(sdp, entry->no_addr); + check_inode_eattr(entry_ip, &delete_eattrs); + if (entry_ip != ip) + fsck_inode_put(&entry_ip); + return 1; + } + /* We need to verify the formal inode number matches. If it doesn't, + it needs to be deleted. */ + ii = inodetree_find(entry->no_addr); + if (ii) + inum = ii->di_num; + else { + di = dirtree_find(entry->no_addr); + if (di) + inum = di->dinode; + else if (link1_type(&nlink1map, entry->no_addr) == 1) { + /* Since we don't have ii or di, the only way to + validate formal_ino is to read in the inode, which + would kill performance. So skip it for now. 
*/ + return 0; + } + } + if (inum.no_formal_ino != entry->no_formal_ino) { + log_err( _("Directory entry '%s' pointing to block %llu " + "(0x%llx) in directory %llu (0x%llx) has the " + "wrong 'formal' inode number.\n"), tmp_name, + (unsigned long long)entry->no_addr, + (unsigned long long)entry->no_addr, + (unsigned long long)ip->i_di.di_num.no_addr, + (unsigned long long)ip->i_di.di_num.no_addr); + log_err( _("The directory entry has %llu (0x%llx) but the " + "inode has %llu (0x%llx)\n"), + (unsigned long long)entry->no_formal_ino, + (unsigned long long)entry->no_formal_ino, + (unsigned long long)inum.no_formal_ino, + (unsigned long long)inum.no_formal_ino); + return 1; + } + /* Check for a special case where a (bad) GFS1 dirent points to what + * is not a known inode. It could be other GFS1 metadata, such as an + * eattr or indirect block, but marked "dinode" in the bitmap because + * gfs1 marked all gfs1 metadata that way. */ + if (ii == NULL && di == NULL && sdp->gfs1) { + struct gfs2_buffer_head *tbh; + + tbh = bread(sdp, entry->no_addr); + if (gfs2_check_meta(tbh, GFS2_METATYPE_DI)) { /* not dinode */ + log_err( _("Directory entry '%s' pointing to block " + "%llu (0x%llx) in directory %llu (0x%llx) " + "is not really a GFS1 dinode.\n"), tmp_name, + (unsigned long long)entry->no_addr, + (unsigned long long)entry->no_addr, + (unsigned long long)ip->i_di.di_num.no_addr, + (unsigned long long)ip->i_di.di_num.no_addr); + brelse(tbh); + return 1; + } + brelse(tbh); + } + return 0; +} + +/** + * dirref_find - check_dentry callback that hunts for an earlier reference + * @private: the known-good reference (struct gfs2_inum) being double-checked + * + * Dentries that point at other blocks are counted and skipped. A dentry + * that points at the same block with the same formal inode number is treated + * as a legitimate hard link. One with a different formal inode number is + * reported and, if the user agrees, decr_link_count() is called for the + * block and the dirent is deleted. + * + * Returns: 0 to keep walking, -1 as soon as a dentry for the block has been + * seen, so that check_dir stops. + */ +static int dirref_find(struct gfs2_inode *ip, struct gfs2_dirent *dent, + struct gfs2_dirent *prev, struct gfs2_buffer_head *bh, + char *filename, uint32_t *count, int *lindex, + void *private) +{ + /* the metawalk_fxn's private field must be set to the dentry + * block we want to clear */ + struct gfs2_inum *entry = (struct gfs2_inum *)private; + struct gfs2_dirent dentry, *de; + char fn[MAX_FILENAME]; + + memset(&dentry, 0, sizeof(struct gfs2_dirent)); + gfs2_dirent_in(&dentry,
(char *)dent); + de = &dentry; + + if (de->de_inum.no_addr != entry->no_addr) { + (*count)++; + return 0; + } + /* "de" is the CPU-endian copy made by gfs2_dirent_in(); compare it + * with the known-good reference, not with the raw on-disk dirent, + * otherwise the result depends on the host byte order. */ + if (de->de_inum.no_formal_ino == entry->no_formal_ino) { + log_debug("Formal inode number matches; must be a hard " + "link.\n"); + goto out; + } + log_err(_("The original reference to inode %lld (0x%llx) from " + "directory %lld (0x%llx) has the wrong 'formal' inode " + "number.\n"), (unsigned long long)entry->no_addr, + (unsigned long long)entry->no_addr, + (unsigned long long)ip->i_di.di_num.no_addr, + (unsigned long long)ip->i_di.di_num.no_addr); + memset(fn, 0, sizeof(fn)); + if (de->de_name_len < MAX_FILENAME) + strncpy(fn, filename, de->de_name_len); + else + strncpy(fn, filename, MAX_FILENAME - 1); + log_err(_("The bad reference '%s' had formal inode number: %lld " + "(0x%llx) but the correct value is: %lld (0x%llx)\n"), + fn, (unsigned long long)de->de_inum.no_formal_ino, + (unsigned long long)de->de_inum.no_formal_ino, + (unsigned long long)entry->no_formal_ino, + (unsigned long long)entry->no_formal_ino); + if (!query(_("Delete the bad directory entry? (y/n) "))) { + log_err(_("The corrupt directory entry was not fixed.\n")); + goto out; + } + decr_link_count(entry->no_addr, ip->i_di.di_num.no_addr, + ip->i_sbd->gfs1, _("bad original reference")); + dirent2_del(ip, bh, prev, dent); + log_err(_("The corrupt directory entry '%s' was deleted.\n"), fn); +out: + return -1; /* force check_dir to stop; don't waste time. */ +} + +/** + * check_suspicious_dirref - double-check a questionable first dentry ref + * + * This function is called when a dentry has caused us to increment the + * link count to a file from 1 to 2, and we know the object pointed to is + * not a directory. (Most likely, it's a file). The second directory to + * reference the dinode has the correct formal inode number, but when we + * created the original reference in the counted links bitmap (clink1map), + * we had no way to check the formal inode number.
(Well, we could have read + * in the dinode, but that would kill fsck.gfs2 performance.) + * So now we have to walk through the directory tree and find that original + * reference so make sure it's a valid reference. If the formal inode number + * is the same, it's a hard link (which is unlikely for gfs2). If it's not + * the same, that's an error, and we need to delete the damaged original + * dentry, since we failed to detect the problem earlier. + */ +static int check_suspicious_dirref(struct gfs2_sbd *sdp, + struct gfs2_inum *entry) +{ + struct osi_node *tmp, *next = NULL; + struct dir_info *dt; + struct gfs2_inode *ip; + uint64_t dirblk; + int error = FSCK_OK; + struct metawalk_fxns dirref_hunt = { + .private = (void *)entry, + .check_dentry = dirref_find, + }; + /* Run check_dir() over every directory in dirtree with dirref_find as + * the only callback; entry is handed to it through .private. + * Returns FSCK_ERROR if a directory inode cannot be loaded, otherwise + * 0 (whether or not the original reference was found). */ + log_debug("This dentry is good, but since this is a second " + "reference to block 0x%llx, we need to check the " + "original.\n", (unsigned long long)entry->no_addr); + for (tmp = osi_first(&dirtree); tmp; tmp = next) { + next = osi_next(tmp); + dt = (struct dir_info *)tmp; + dirblk = dt->dinode.no_addr; + if (skip_this_pass || fsck_abort) /* asked to skip the rest */ + break; + ip = fsck_load_inode(sdp, dirblk); + if (ip == NULL) { + stack; + return FSCK_ERROR; + } + error = check_dir(sdp, ip, &dirref_hunt); + fsck_inode_put(&ip); + /* Error just means we found the dentry and dealt with it. + * (dirref_find returns -1 for that; presumably check_dir + * passes it through - confirm against check_dir.) */ + if (error) + break; + } + log_debug("Original reference check complete. Found = %d.\n", + error ?
1 : 0); + return 0; +} + +/* FIXME: should maybe refactor this a bit - but need to deal with + * FIXMEs internally first */ +static int check_dentry(struct gfs2_inode *ip, struct gfs2_dirent *dent, + struct gfs2_dirent *prev_de, + struct gfs2_buffer_head *bh, char *filename, + uint32_t *count, int *lindex, void *priv) +{ + struct gfs2_sbd *sdp = ip->i_sbd; + int q = 0; + char tmp_name[MAX_FILENAME]; + struct gfs2_inum entry; + struct dir_status *ds = (struct dir_status *) priv; + int error; + struct gfs2_inode *entry_ip = NULL; + struct gfs2_dirent dentry, *de; + int hash_index; /* index into the hash table based on the hash */ + int lindex_max; /* largest acceptable hash table index for hash */ + int isdir; + + memset(&dentry, 0, sizeof(struct gfs2_dirent)); + gfs2_dirent_in(&dentry, (char *)dent); + de = &dentry; + + entry.no_addr = de->de_inum.no_addr; + entry.no_formal_ino = de->de_inum.no_formal_ino; + + /* Start of checks */ + memset(tmp_name, 0, MAX_FILENAME); + if (de->de_name_len < MAX_FILENAME) + strncpy(tmp_name, filename, de->de_name_len); + else + strncpy(tmp_name, filename, MAX_FILENAME - 1); + + error = basic_dentry_checks(ip, dent, &entry, tmp_name, count, de, + ds, &q, bh, &isdir); + if (error) + goto nuke_dentry; + + if (!strcmp(".", tmp_name)) { + log_debug( _("Found . dentry in directory %lld (0x%llx)\n"), + (unsigned long long)ip->i_di.di_num.no_addr, + (unsigned long long)ip->i_di.di_num.no_addr); + + if (ds->dotdir) { + log_err( _("Already found '.' entry in directory %llu" + " (0x%llx)\n"), + (unsigned long long)ip->i_di.di_num.no_addr, + (unsigned long long)ip->i_di.di_num.no_addr); + if (!query( _("Clear duplicate '.' entry? (y/n) "))) { + log_err( _("Duplicate '.' entry remains\n")); + /* FIXME: Should we continue on here + * and check the rest of the '.' entry? 
*/ + goto dentry_is_valid; + } + if (ip->i_di.di_num.no_addr == entry.no_addr) + entry_ip = ip; + else + entry_ip = fsck_load_inode(sdp, entry.no_addr); + check_inode_eattr(entry_ip, &delete_eattrs); + if (entry_ip != ip) + fsck_inode_put(&entry_ip); + goto nuke_dentry; + } + + /* GFS2 does not rely on '.' being in a certain + * location */ + + /* check that '.' refers to this inode */ + if (entry.no_addr != ip->i_di.di_num.no_addr) { + log_err( _("'.' entry's value incorrect in directory %llu" + " (0x%llx). Points to %llu" + " (0x%llx) when it should point to %llu" + " (0x%llx).\n"), + (unsigned long long)entry.no_addr, + (unsigned long long)entry.no_addr, + (unsigned long long)entry.no_addr, + (unsigned long long)entry.no_addr, + (unsigned long long)ip->i_di.di_num.no_addr, + (unsigned long long)ip->i_di.di_num.no_addr); + if (!query( _("Remove '.' reference? (y/n) "))) { + log_err( _("Invalid '.' reference remains\n")); + /* Not setting ds->dotdir here since + * this '.' entry is invalid */ + goto dentry_is_valid; + } + if (ip->i_di.di_num.no_addr == entry.no_addr) + entry_ip = ip; + else + entry_ip = fsck_load_inode(sdp, entry.no_addr); + check_inode_eattr(entry_ip, &delete_eattrs); + if (entry_ip != ip) + fsck_inode_put(&entry_ip); + goto nuke_dentry; + } + + ds->dotdir = 1; + goto dentry_is_valid; + } + if (!strcmp("..", tmp_name)) { + log_debug( _("Found '..' dentry in directory %lld (0x%llx)\n"), + (unsigned long long)ip->i_di.di_num.no_addr, + (unsigned long long)ip->i_di.di_num.no_addr); + if (ds->dotdotdir) { + log_err( _("Already had a '..' entry in directory %llu" + "(0x%llx)\n"), + (unsigned long long)ip->i_di.di_num.no_addr, + (unsigned long long)ip->i_di.di_num.no_addr); + if (!query( _("Clear duplicate '..' entry? (y/n) "))) { + log_err( _("Duplicate '..' entry remains\n")); + /* FIXME: Should we continue on here + * and check the rest of the '..' + * entry? 
*/ + goto dentry_is_valid; + } + + if (ip->i_di.di_num.no_addr == entry.no_addr) + entry_ip = ip; + else + entry_ip = fsck_load_inode(sdp, entry.no_addr); + check_inode_eattr(entry_ip, &delete_eattrs); + if (entry_ip != ip) + fsck_inode_put(&entry_ip); + + goto nuke_dentry; + } + if (!isdir) { + log_err( _("Found '..' entry in directory %llu (0x%llx) " + "pointing to something that's not a directory"), + (unsigned long long)ip->i_di.di_num.no_addr, + (unsigned long long)ip->i_di.di_num.no_addr); + if (!query( _("Clear bad '..' directory entry? (y/n) "))) { + log_err( _("Bad '..' directory entry remains\n")); + goto dentry_is_valid; + } + if (ip->i_di.di_num.no_addr == entry.no_addr) + entry_ip = ip; + else + entry_ip = fsck_load_inode(sdp, entry.no_addr); + check_inode_eattr(entry_ip, &delete_eattrs); + if (entry_ip != ip) + fsck_inode_put(&entry_ip); + + goto nuke_dentry; + } + /* GFS2 does not rely on '..' being in a certain location */ + + /* Add the address this entry is pointing to + * to this inode's dotdot_parent in + * dir_info */ + if (set_dotdot_dir(sdp, ip->i_di.di_num.no_addr, entry)) { + stack; + return -1; + } + + ds->dotdotdir = 1; + goto dentry_is_valid; + } + /* If this is an exhash directory, make sure the dentries in the leaf + block have a hash table index that fits */ + if (ip->i_di.di_flags & GFS2_DIF_EXHASH) { + hash_index = hash_table_index(de->de_hash, ip); + lindex_max = hash_table_max(*lindex, ip, bh); + if (hash_index < *lindex || hash_index > lindex_max) { + int nuke_dent; + + nuke_dent = wrong_leaf(ip, &entry, tmp_name, lindex, + lindex_max, hash_index, bh, ds, + dent, de, prev_de, count, q); + if (nuke_dent) + goto nuke_dentry; + } + } + + /* After this point we're only concerned with directories */ + if (!isdir) { + log_debug( _("Found non-dir inode dentry pointing to %lld " + "(0x%llx)\n"), + (unsigned long long)entry.no_addr, + (unsigned long long)entry.no_addr); + goto dentry_is_valid; + } + + /*log_debug( _("Found plain directory 
dentry\n"));*/ + error = set_parent_dir(sdp, entry, ip->i_di.di_num); + if (error > 0) { + log_err( _("%s: Hard link to block %llu (0x%llx" + ") detected.\n"), tmp_name, + (unsigned long long)entry.no_addr, + (unsigned long long)entry.no_addr); + + if (query( _("Clear hard link to directory? (y/n) "))) + goto nuke_dentry; + else { + log_err( _("Hard link to directory remains\n")); + goto dentry_is_valid; + } + } else if (error < 0) { + stack; + return -1; + } +dentry_is_valid: + /* This directory inode links to this inode via this dentry */ + error = incr_link_count(entry, ip, _("valid reference")); + if (error == incr_link_check_orig) { + error = check_suspicious_dirref(sdp, &entry); + } else if (error == incr_link_ino_mismatch) { + log_err("incr_link_count err=%d.\n", error); + if (bad_formal_ino(ip, dent, entry, tmp_name, q, de, bh) == 1) + goto nuke_dentry; + } + (*count)++; + ds->entry_count++; + /* End of checks */ + return 0; + +nuke_dentry: + dirent2_del(ip, bh, prev_de, dent); + log_err( _("Bad directory entry '%s' cleared.\n"), tmp_name); + return 1; +} + +/* + * write_new_leaf - allocate and write a new leaf to cover a gap in hash table + * @dip: the directory inode + * @start_lindex: where in the hash table to start writing + * @num_copies: number of copies of the pointer to write into hash table + * @before_or_after: desc. of whether this is being added before/after/etc. 
+ * @bn: pointer to return the newly allocated leaf's block number + */ +static int write_new_leaf(struct gfs2_inode *dip, int start_lindex, + int num_copies, const char *before_or_after, + uint64_t *bn) +{ + struct gfs2_buffer_head *nbh; + struct gfs2_leaf *leaf; + struct gfs2_dirent *dent; + int count, i; + int factor = 0, pad_size; + uint64_t *cpyptr; + char *padbuf; + int divisor = num_copies; + int end_lindex = start_lindex + num_copies; + + padbuf = malloc(num_copies * sizeof(uint64_t)); + /* calculate the depth needed for the new leaf */ + while (divisor > 1) { + factor++; + divisor /= 2; + } + /* Make sure the number of copies is properly a factor of 2 */ + if ((1 << factor) != num_copies) { + log_err(_("Program error: num_copies not a factor of 2.\n")); + log_err(_("num_copies=%d, dinode = %lld (0x%llx)\n"), + num_copies, + (unsigned long long)dip->i_di.di_num.no_addr, + (unsigned long long)dip->i_di.di_num.no_addr); + log_err(_("lindex = %d (0x%x)\n"), start_lindex, start_lindex); + stack; + free(padbuf); + return -1; + } + + /* allocate and write out a new leaf block */ + if (lgfs2_meta_alloc(dip, bn)) { + log_err( _("Error: allocation failed while fixing directory leaf " + "pointers.\n")); + free(padbuf); + return -1; + } + fsck_bitmap_set(dip, *bn, _("directory leaf"), dip->i_sbd->gfs1 ? 
+ GFS2_BLKST_DINODE : GFS2_BLKST_USED); + log_err(_("A new directory leaf was allocated at block %lld " + "(0x%llx) to fill the %d (0x%x) pointer gap %s the existing " + "pointer at index %d (0x%x).\n"), (unsigned long long)*bn, + (unsigned long long)*bn, num_copies, num_copies, + before_or_after, start_lindex, start_lindex); + dip->i_di.di_blocks++; + bmodified(dip->i_bh); + nbh = bget(dip->i_sbd, *bn); + memset(nbh->b_data, 0, dip->i_sbd->bsize); + leaf = (struct gfs2_leaf *)nbh->b_data; + leaf->lf_header.mh_magic = cpu_to_be32(GFS2_MAGIC); + leaf->lf_header.mh_type = cpu_to_be32(GFS2_METATYPE_LF); + leaf->lf_header.mh_format = cpu_to_be32(GFS2_FORMAT_LF); + leaf->lf_depth = cpu_to_be16(dip->i_di.di_depth - factor); + + /* initialize the first dirent on the new leaf block */ + dent = (struct gfs2_dirent *)(nbh->b_data + sizeof(struct gfs2_leaf)); + dent->de_rec_len = cpu_to_be16(dip->i_sbd->bsize - + sizeof(struct gfs2_leaf)); + bmodified(nbh); + brelse(nbh); + + /* pad the hash table with the new leaf block */ + cpyptr = (uint64_t *)padbuf; + for (i = start_lindex; i < end_lindex; i++) { + *cpyptr = cpu_to_be64(*bn); + cpyptr++; + } + pad_size = num_copies * sizeof(uint64_t); + log_err(_("Writing to the hash table of directory %lld " + "(0x%llx) at index: 0x%x for 0x%lx pointers.\n"), + (unsigned long long)dip->i_di.di_num.no_addr, + (unsigned long long)dip->i_di.di_num.no_addr, + start_lindex, (unsigned long)pad_size / sizeof(uint64_t)); + if (dip->i_sbd->gfs1) + count = gfs1_writei(dip, padbuf, start_lindex * + sizeof(uint64_t), pad_size); + else + count = gfs2_writei(dip, padbuf, start_lindex * + sizeof(uint64_t), pad_size); + free(padbuf); + if (count != pad_size) { + log_err( _("Error: bad write while fixing directory leaf " + "pointers.\n")); + return -1; + } + return 0; +} + +/* pad_with_leafblks - pad a hash table with pointers to new leaf blocks + * + * @ip: pointer to the dinode structure + * @tbl: pointer to the hash table in memory + * @lindex: index 
location within the hash table to pad + * @len: number of pointers to be padded + */ +static void pad_with_leafblks(struct gfs2_inode *ip, uint64_t *tbl, + int lindex, int len) +{ + int new_len, i; + uint32_t proper_start = lindex; + uint64_t new_leaf_blk; + /* Fills len hash table slots starting at lindex with pointers to new + * leaf blocks, one write_new_leaf() call per power-of-two sized run. + * write_new_leaf() writes the on-disk table; the in-memory copy tbl is + * patched here afterwards. */ + log_err(_("Padding inode %llu (0x%llx) hash table at offset %d (0x%x) " + "for %d pointers.\n"), + (unsigned long long)ip->i_di.di_num.no_addr, + (unsigned long long)ip->i_di.di_num.no_addr, lindex, lindex, + len); + while (len) { + new_len = 1; + /* Determine the next factor of 2 down from extras. We can't + just write out a leaf block on a power-of-two boundary. + We also need to make sure it has a length that will + ensure a "proper start" block as well. */ + while ((new_len << 1) <= len) { + /* Translation: If doubling the size of the new leaf + will make its start boundary wrong, we have to + settle for a smaller length (and iterate more). */ + proper_start = (lindex & ~((new_len << 1) - 1)); + if (lindex != proper_start) + break; + new_len <<= 1; + } + write_new_leaf(ip, lindex, new_len, "after", &new_leaf_blk); /* NOTE(review): return value ignored; new_leaf_blk stays unset if this fails before a block is allocated */ + log_err(_("New leaf block was allocated at %llu (0x%llx) for " + "index %d (0x%x), length %d\n"), + (unsigned long long)new_leaf_blk, + (unsigned long long)new_leaf_blk, + lindex, lindex, new_len); + fsck_bitmap_set(ip, new_leaf_blk, _("pad leaf"), + ip->i_sbd->gfs1 ? + GFS2_BLKST_DINODE : GFS2_BLKST_USED); + /* Fix the hash table in memory to have the new leaf */ + for (i = 0; i < new_len; i++) + tbl[lindex + i] = cpu_to_be64(new_leaf_blk); + len -= new_len; + lindex += new_len; + } +} + +/* lost_leaf - repair a leaf block that's on the wrong directory inode + * + * If the correct index is less than the starting index, we have a problem. + * Since we process the index sequentially, the previous index has already + * been processed, fixed, and is now correct. But this leaf wants to overwrite + * a previously written good leaf.
The only thing we can do is move all the + * directory entries to lost+found so we don't overwrite the good leaf. Then + * we need to pad the gap we leave. + */ +static int lost_leaf(struct gfs2_inode *ip, uint64_t *tbl, uint64_t leafno, + int ref_count, int lindex, struct gfs2_buffer_head *bh) +{ + char *filename; + char *bh_end = bh->b_data + ip->i_sbd->bsize; + struct gfs2_dirent de, *dent; + int error; + int isdir = 0; + + log_err(_("Leaf block %llu (0x%llx) seems to be out of place and its " + "contents need to be moved to lost+found.\n"), + (unsigned long long)leafno, (unsigned long long)leafno); + if (!query( _("Attempt to fix it? (y/n) "))) { + log_err( _("Directory leaf was not fixed.\n")); + return 0; + } + make_sure_lf_exists(ip); + + dent = (struct gfs2_dirent *)(bh->b_data + sizeof(struct gfs2_leaf)); + while (1) { + char tmp_name[PATH_MAX]; + + memset(&de, 0, sizeof(struct gfs2_dirent)); + gfs2_dirent_in(&de, (char *)dent); + filename = (char *)dent + sizeof(struct gfs2_dirent); + memset(tmp_name, 0, sizeof(tmp_name)); + if (de.de_name_len > sizeof(filename)) { + log_debug(_("Encountered bad filename length; " + "stopped processing.\n")); + break; + } + memcpy(tmp_name, filename, de.de_name_len); + if ((de.de_name_len == 1 && filename[0] == '.')) { + log_debug(_("Skipping entry '.'\n")); + } else if (de.de_name_len == 2 && filename[0] == '.' 
&& + filename[1] == '.') { + log_debug(_("Skipping entry '..'\n")); + } else if (!de.de_inum.no_formal_ino) { /* sentinel */ + log_debug(_("Skipping sentinel '%s'\n"), tmp_name); + } else { + uint32_t count; + struct dir_status ds = {0}; + int q = 0; + + error = basic_dentry_checks(ip, dent, &de.de_inum, + tmp_name, &count, &de, + &ds, &q, bh, &isdir); + if (error) { + log_err(_("Not relocating corrupt entry " + "\"%s\".\n"), tmp_name); + } else { + error = dir_add(lf_dip, filename, + de.de_name_len, &de.de_inum, + de.de_type); + if (error && error != -EEXIST) { + log_err(_("Error %d encountered while " + "trying to relocate \"%s\" " + "to lost+found.\n"), error, + tmp_name); + return error; + } + /* This inode is linked from lost+found */ + incr_link_count(de.de_inum, lf_dip, + _("from lost+found")); + /* If it's a directory, lost+found is + back-linked to it via .. */ + if (isdir) + incr_link_count(lf_dip->i_di.di_num, + NULL, + _("to lost+found")); + log_err(_("Relocated \"%s\", block %llu " + "(0x%llx) to lost+found.\n"), + tmp_name, + (unsigned long long)de.de_inum.no_addr, + (unsigned long long)de.de_inum.no_addr); + } + } + if ((char *)dent + de.de_rec_len >= bh_end) + break; + dent = (struct gfs2_dirent *)((char *)dent + de.de_rec_len); + } + log_err(_("Directory entries from misplaced leaf block were relocated " + "to lost+found.\n")); + /* Free the lost leaf. */ + fsck_bitmap_set(ip, leafno, _("lost leaf"), GFS2_BLKST_FREE); + ip->i_di.di_blocks--; + bmodified(ip->i_bh); + /* Now we have to deal with the bad hash table entries pointing to the + misplaced leaf block. But we can't just fill the gap with a single + leaf. We have to write on nice power-of-two boundaries, and we have + to pad out any extra pointers. 
*/ + pad_with_leafblks(ip, tbl, lindex, ref_count); + return 1; +} +/* basic_check_dentry - check_dentry callback that only sanity-checks a dirent + * + * Converts the on-disk dirent, copies its name (truncated to fit tmp_name) + * and runs basic_dentry_checks(). This function itself does no link counting + * and no '.'/'..' or parent-directory handling. + * + * Returns: 1 if the checks failed and the dirent was deleted, otherwise 0 + * after incrementing *count. */ +static int basic_check_dentry(struct gfs2_inode *ip, struct gfs2_dirent *dent, + struct gfs2_dirent *prev_de, + struct gfs2_buffer_head *bh, char *filename, + uint32_t *count, int *lindex, void *priv) +{ + int q = 0; + char tmp_name[MAX_FILENAME]; + struct gfs2_inum entry; + struct dir_status *ds = (struct dir_status *) priv; + struct gfs2_dirent dentry, *de; + int error; + int isdir; + + memset(&dentry, 0, sizeof(struct gfs2_dirent)); + gfs2_dirent_in(&dentry, (char *)dent); + de = &dentry; + + entry.no_addr = de->de_inum.no_addr; + entry.no_formal_ino = de->de_inum.no_formal_ino; + + /* Start of checks */ + memset(tmp_name, 0, MAX_FILENAME); + if (de->de_name_len < MAX_FILENAME) + strncpy(tmp_name, filename, de->de_name_len); + else + strncpy(tmp_name, filename, MAX_FILENAME - 1); + + error = basic_dentry_checks(ip, dent, &entry, tmp_name, count, de, + ds, &q, bh, &isdir); + if (error) { + dirent2_del(ip, bh, prev_de, dent); + log_err( _("Bad directory entry '%s' cleared.\n"), tmp_name); + return 1; + } else { + (*count)++; + return 0; + } +} + +/* pass2_repair_leaf - Warn the user of an error and ask permission to fix it + * Process a bad leaf pointer and ask to repair the first time. + * The repair process involves extending the previous leaf's entries + * so that they replace the bad ones. We have to hack up the old + * leaf a bit, but it's better than deleting the whole directory, + * which is what used to happen before. + * + * NOTE(review): the code below does not touch a previous leaf; it fills the + * ref_count pointers starting at lindex with new leaf blocks obtained from + * write_new_leaf(), in power-of-two sized runs. + * + * Returns: the number of new leaf blocks written (0 if the user declined), + * or the error from write_new_leaf(). On the non-error paths *leaf_no is set + * to the last new block number (0 if none was written). */ +static int pass2_repair_leaf(struct gfs2_inode *ip, uint64_t *leaf_no, + int lindex, int ref_count, const char *msg) +{ + int new_leaf_blks = 0, error, refs; + uint64_t bn = 0; + + log_err( _("Directory Inode %llu (0x%llx) points to leaf %llu" + " (0x%llx) %s.\n"), + (unsigned long long)ip->i_di.di_num.no_addr, + (unsigned long long)ip->i_di.di_num.no_addr, + (unsigned long long)*leaf_no, + (unsigned long long)*leaf_no, msg); + if (!query( _("Attempt to patch around it?
(y/n) "))) { + log_err( _("Bad leaf left in place.\n")); + goto out; + } + /* We can only write leafs in quantities that are factors of + two, since leaves are doubled, not added sequentially. + So if we have a hole that's not a factor of 2, we have to + break it down into separate leaf blocks that are. */ + while (ref_count) { + refs = 1; /* grow refs to the largest power of two <= ref_count */ + while (refs <= ref_count) { + if (refs * 2 > ref_count) + break; + refs *= 2; + } + error = write_new_leaf(ip, lindex, refs, _("replacing"), &bn); + if (error) + return error; + + new_leaf_blks++; + lindex += refs; + ref_count -= refs; + } + log_err( _("Directory Inode %llu (0x%llx) repaired.\n"), + (unsigned long long)ip->i_di.di_num.no_addr, + (unsigned long long)ip->i_di.di_num.no_addr); +out: + *leaf_no = bn; + return new_leaf_blks; +} + +/* The purpose of leafck_fxns is to provide a means for function fix_hashtable + * to do basic sanity checks on leaf blocks before manipulating them, for + * example, splitting them. If they're corrupt, splitting them or trying to + * move their contents can cause a segfault. We can't really use the standard + * pass2_fxns because that will do things we don't want. For example, it will + * find '.' and '..' and increment the directory link count, which would be + * done a second time when the dirent is really checked in pass2_fxns. + * We don't want it to do the "wrong leaf" thing, or set_parent_dir either. + * We just want a basic sanity check on pointers and lengths. + */ +struct metawalk_fxns leafck_fxns = { + .check_leaf_depth = check_leaf_depth, + .check_dentry = basic_check_dentry, + .repair_leaf = pass2_repair_leaf, +}; + +/* fix_hashtable - fix a corrupt hash table + * + * The main intent of this function is to sort out hash table problems. + * That is, it needs to determine if leaf blocks are in the wrong place, + * if the count of pointers is wrong, and if there are extra pointers.
+ * Everything should be placed on correct power-of-two boundaries appropriate + * to their leaf depth, and extra pointers should be correctly padded with new + * leaf blocks. + * + * @ip: the directory dinode structure pointer + * @tbl: hash table that's already read into memory + * @hsize: hash table size, as dictated by the dinode's di_depth + * @leafblk: the leaf block number that appears at this lindex in the tbl + * @lindex: leaf index that has a problem + * @proper_start: where this leaf's pointers should start, as far as the + * hash table is concerned (sight unseen; trusting the leaf + * really belongs here). + * @len: count of pointers in the hash table to this leafblk + * @proper_len: pointer to return the proper number of pointers, as the kernel + * calculates it, based on the leaf depth. + * @factor: the proper depth, given this number of pointers (rounded down). + * + * Returns: 0 - no changes made, or X if changes were made + */ +static int fix_hashtable(struct gfs2_inode *ip, uint64_t *tbl, unsigned hsize, + uint64_t leafblk, int lindex, uint32_t proper_start, + int len, int *proper_len, int factor) +{ + struct gfs2_buffer_head *lbh; + struct gfs2_leaf leaf; + struct gfs2_dirent dentry, *de; + int changes = 0, error, i, extras, hash_index; + uint64_t new_leaf_blk; + uint64_t leaf_no; + uint32_t leaf_proper_start; + + *proper_len = len; + log_err(_("Dinode %llu (0x%llx) has a hash table error at index " + "0x%x, length 0x%x: leaf block %llu (0x%llx)\n"), + (unsigned long long)ip->i_di.di_num.no_addr, + (unsigned long long)ip->i_di.di_num.no_addr, lindex, len, + (unsigned long long)leafblk, (unsigned long long)leafblk); + if (!query( _("Fix the hash table? 
(y/n) "))) { + log_err(_("Hash table not fixed.\n")); + return 0; + } + + memset(&leaf, 0, sizeof(leaf)); + leaf_no = leafblk; + error = check_leaf(ip, lindex, &leafck_fxns, &leaf_no, &leaf, &len); + if (error) { + log_debug("Leaf repaired while fixing the hash table.\n"); + error = 0; + } + lbh = bread(ip->i_sbd, leafblk); + /* If the leaf's depth is out of range for this dinode, it's obviously + attached to the wrong dinode. Move the dirents to lost+found. */ + if (leaf.lf_depth > ip->i_di.di_depth) { + log_err(_("This leaf block's depth (%d) is too big for this " + "dinode's depth (%d)\n"), + leaf.lf_depth, ip->i_di.di_depth); + error = lost_leaf(ip, tbl, leafblk, len, lindex, lbh); + brelse(lbh); + return error; + } + + memset(&dentry, 0, sizeof(struct gfs2_dirent)); + de = (struct gfs2_dirent *)(lbh->b_data + sizeof(struct gfs2_leaf)); + gfs2_dirent_in(&dentry, (char *)de); + + /* If this is an empty leaf, we can just delete it and pad. */ + if ((dentry.de_rec_len == cpu_to_be16(ip->i_sbd->bsize - + sizeof(struct gfs2_leaf))) && + (dentry.de_inum.no_formal_ino == 0)) { + brelse(lbh); + gfs2_free_block(ip->i_sbd, leafblk); + log_err(_("Out of place leaf block %llu (0x%llx) had no " + "entries, so it was deleted.\n"), + (unsigned long long)leafblk, + (unsigned long long)leafblk); + pad_with_leafblks(ip, tbl, lindex, len); + log_err(_("Reprocessing index 0x%x (case 1).\n"), lindex); + return 1; + } + + /* Calculate the proper number of pointers based on the leaf depth. */ + *proper_len = 1 << (ip->i_di.di_depth - leaf.lf_depth); + + /* Look at the first dirent and check its hash value to see if it's + at the proper starting offset. */ + hash_index = hash_table_index(dentry.de_hash, ip); + /* Need to use len here, not *proper_len because the leaf block may + be valid within the range, but starts too soon in the hash table. 
*/ + if (hash_index < lindex || hash_index > lindex + len) { + log_err(_("This leaf block has hash index %d, which is out of " + "bounds for where it appears in the hash table " + "(%d - %d)\n"), + hash_index, lindex, lindex + *proper_len); + error = lost_leaf(ip, tbl, leafblk, len, lindex, lbh); + brelse(lbh); + return error; + } + + /* Now figure out where this leaf should start, and pad any pointers + up to that point with new leaf blocks. */ + leaf_proper_start = (hash_index & ~(*proper_len - 1)); + if (lindex < leaf_proper_start) { + log_err(_("Leaf pointers start at %d (0x%x), should be %d " + "(%x).\n"), lindex, lindex, + leaf_proper_start, leaf_proper_start); + pad_with_leafblks(ip, tbl, lindex, leaf_proper_start - lindex); + brelse(lbh); + return 1; /* reprocess the starting lindex */ + } + /* If the proper start according to the leaf's hash index is later + than the proper start according to the hash table, it's once + again lost and we have to relocate it. The same applies if the + leaf's hash index is prior to the proper state, but the leaf is + already at its maximum depth. */ + if ((leaf_proper_start < proper_start) || + ((*proper_len > len || lindex > leaf_proper_start) && + leaf.lf_depth == ip->i_di.di_depth)) { + log_err(_("Leaf block should start at 0x%x, but it appears at " + "0x%x in the hash table.\n"), leaf_proper_start, + proper_start); + error = lost_leaf(ip, tbl, leafblk, len, lindex, lbh); + brelse(lbh); + return error; + } + + /* If we SHOULD have more pointers than we do, we can solve the + problem by splitting the block to a lower depth. Then we may have + the right number of pointers. If the leaf block pointers start + later than they should, we can split the leaf to give it a smaller + footprint in the hash table. 
*/ + if ((*proper_len > len || lindex > leaf_proper_start) && + ip->i_di.di_depth > leaf.lf_depth) { + log_err(_("For depth %d, length %d, the proper start is: " + "0x%x.\n"), factor, len, proper_start); + changes++; + new_leaf_blk = find_free_blk(ip->i_sbd); + dir_split_leaf(ip, lindex, leafblk, lbh); + /* re-read the leaf to pick up dir_split_leaf's changes */ + gfs2_leaf_in(&leaf, lbh->b_data); + *proper_len = 1 << (ip->i_di.di_depth - leaf.lf_depth); + log_err(_("Leaf block %llu (0x%llx) was split from length " + "%d to %d\n"), (unsigned long long)leafblk, + (unsigned long long)leafblk, len, *proper_len); + if (*proper_len < 0) { + log_err(_("Programming error: proper_len=%d, " + "di_depth = %d, lf_depth = %d.\n"), + *proper_len, ip->i_di.di_depth, leaf.lf_depth); + exit(FSCK_ERROR); + } + log_err(_("New split-off leaf block was allocated at %lld " + "(0x%llx) for index %d (0x%x)\n"), + (unsigned long long)new_leaf_blk, + (unsigned long long)new_leaf_blk, lindex, lindex); + fsck_bitmap_set(ip, new_leaf_blk, _("split leaf"), + ip->i_sbd->gfs1 ? + GFS2_BLKST_DINODE : GFS2_BLKST_USED); + log_err(_("Hash table repaired.\n")); + /* Fix up the hash table in memory to include the new leaf */ + for (i = 0; i < *proper_len; i++) + tbl[lindex + i] = cpu_to_be64(new_leaf_blk); + if (*proper_len < (len >> 1)) { + log_err(_("One leaf split is not enough. The hash " + "table will need to be reprocessed.\n")); + brelse(lbh); + return changes; + } + lindex += (*proper_len); /* skip the new leaf from the split */ + len -= (*proper_len); + } + if (*proper_len < len) { + log_err(_("There are %d pointers, but leaf 0x%llx's " + "depth, %d, only allows %d\n"), + len, (unsigned long long)leafblk, leaf.lf_depth, + *proper_len); + } + brelse(lbh); + /* At this point, lindex should be at the proper end of the pointers. + Now we need to replace any extra duplicate pointers to the old + (original) leafblk (that ran off the end) with new leaf blocks. 
*/ + lindex += (*proper_len); /* Skip past the normal good pointers */ + len -= (*proper_len); + extras = 0; + for (i = 0; i < len; i++) { + if (be64_to_cpu(tbl[lindex + i]) == leafblk) + extras++; + else + break; + } + if (extras) { + log_err(_("Found %d extra pointers to leaf %llu (0x%llx)\n"), + extras, (unsigned long long)leafblk, + (unsigned long long)leafblk); + pad_with_leafblks(ip, tbl, lindex, extras); + log_err(_("Reprocessing index 0x%x (case 2).\n"), lindex); + return 1; + } + return changes; +} + +/* check_hash_tbl_dups - check for the same leaf in multiple places */ +static int check_hash_tbl_dups(struct gfs2_inode *ip, uint64_t *tbl, + unsigned hsize, int lindex, int len) +{ + int l, len2; + uint64_t leafblk, leaf_no; + struct gfs2_buffer_head *lbh; + struct gfs2_leaf leaf; + struct gfs2_dirent dentry, *de; + int hash_index; /* index into the hash table based on the hash */ + + leafblk = be64_to_cpu(tbl[lindex]); + for (l = 0; l < hsize; l++) { + if (l == lindex) { /* skip the valid reference */ + l += len - 1; + continue; + } + if (be64_to_cpu(tbl[l]) != leafblk) + continue; + + for (len2 = 0; l + len2 < hsize; len2++) { + if (l + len2 == lindex) + break; + if (be64_to_cpu(tbl[l + len2]) != leafblk) + break; + } + log_err(_("Dinode %llu (0x%llx) has duplicate leaf pointers " + "to block %llu (0x%llx) at offsets %u (0x%x) " + "(for 0x%x) and %u (0x%x) (for 0x%x)\n"), + (unsigned long long)ip->i_di.di_num.no_addr, + (unsigned long long)ip->i_di.di_num.no_addr, + (unsigned long long)leafblk, + (unsigned long long)leafblk, lindex, lindex, len, + l, l, len2); + + /* See which set of references is valid: the one passed in + or the duplicate we found. 
*/ + memset(&leaf, 0, sizeof(leaf)); + leaf_no = leafblk; + if (!valid_block_ip(ip, leaf_no)) /* Checked later */ + continue; + + lbh = bread(ip->i_sbd, leafblk); + if (gfs2_check_meta(lbh, GFS2_METATYPE_LF)) { /* Chked later */ + brelse(lbh); + continue; + } + + memset(&dentry, 0, sizeof(struct gfs2_dirent)); + de = (struct gfs2_dirent *)(lbh->b_data + + sizeof(struct gfs2_leaf)); + gfs2_dirent_in(&dentry, (char *)de); + hash_index = hash_table_index(dentry.de_hash, ip); + brelse(lbh); + /* check the duplicate ref first */ + if (hash_index < l || hash_index > l + len2) { + log_err(_("This leaf block has hash index %d, which " + "is out of bounds for lindex (%d - %d)\n"), + hash_index, l, l + len2); + if (!query( _("Fix the hash table? (y/n) "))) { + log_err(_("Hash table not fixed.\n")); + return 0; + } + /* Adjust the ondisk block count. The original value + may have been correct without the duplicates but + pass1 would have counted them and adjusted the + count to include them. So we must subtract them. */ + ip->i_di.di_blocks--; + bmodified(ip->i_bh); + pad_with_leafblks(ip, tbl, l, len2); + } else { + log_debug(_("Hash index 0x%x is the proper " + "reference to leaf 0x%llx.\n"), + l, (unsigned long long)leafblk); + } + /* Check the original ref: both references might be bad. + If both were bad, just return and if we encounter it + again, we'll treat it as new. If the original ref is not + bad, keep looking for (and fixing) other instances. */ + if (hash_index < lindex || hash_index > lindex + len) { + log_err(_("This leaf block has hash index %d, which " + "is out of bounds for lindex (%d - %d).\n"), + hash_index, lindex, lindex + len); + if (!query( _("Fix the hash table? 
(y/n) "))) { + log_err(_("Hash table not fixed.\n")); + return 0; + } + ip->i_di.di_blocks--; + bmodified(ip->i_bh); + pad_with_leafblks(ip, tbl, lindex, len); + /* At this point we know both copies are bad, so we + return to start fresh */ + return -EFAULT; + } else { + log_debug(_("Hash index 0x%x is the proper " + "reference to leaf 0x%llx.\n"), + lindex, (unsigned long long)leafblk); + } + } + return 0; +} + +/* check_hash_tbl - check that the hash table is sane + * + * We've got to make sure the hash table is sane. Each leaf needs to + * be counted a proper power of 2. We can't just have 3 pointers to a leaf. + * The number of pointers must correspond to the proper leaf depth, and they + * must all fall on power-of-two boundaries. The leaf block pointers all need + * to fall properly on these boundaries, otherwise the kernel code's + * calculations will land it on the wrong leaf block while it's searching, + * and the result will be files you can see with ls, but can't open, delete + * or use them. + * + * The goal of this function is to check the hash table to make sure the + * boundaries and lengths all line up properly, and if not, to fix it. + * + * Note: There's a delicate balance here, because this function gets called + * BEFORE leaf blocks are checked by function check_leaf from function + * check_leaf_blks: the hash table has to be sane before we can start + * checking all the leaf blocks. And yet if there's hash table corruption + * we may need to reference leaf blocks to fix it, which means we need + * to check and/or fix a leaf block along the way. 
+ */ +static int check_hash_tbl(struct gfs2_inode *ip, uint64_t *tbl, + unsigned hsize, void *private) +{ + int error = 0; + int lindex, len, proper_len, i, changes = 0; + uint64_t leafblk; + struct gfs2_leaf leaf; + struct gfs2_buffer_head *lbh; + int factor; + uint32_t proper_start; + int anomaly; + + lindex = 0; + while (lindex < hsize) { + if (fsck_abort) + return changes; + len = 1; + factor = 0; + leafblk = be64_to_cpu(tbl[lindex]); + anomaly = 0; + while (lindex + (len << 1) - 1 < hsize) { + uint32_t next_proper_start; + if (be64_to_cpu(tbl[lindex + (len << 1) - 1]) != + leafblk) + break; + next_proper_start = (lindex & ~((len << 1) - 1)); + if (lindex != next_proper_start) + anomaly = 1; + /* Check if there are other values written between + here and the next factor. */ + for (i = len; !anomaly && i + lindex < hsize && + i < (len << 1); i++) + if (be64_to_cpu(tbl[lindex + i]) != leafblk) + anomaly = 1; + if (anomaly) + break; + len <<= 1; + factor++; + } + + /* Check for leftover pointers after the factor of two: */ + proper_len = len; /* A factor of 2 that fits nicely */ + while (lindex + len < hsize && + be64_to_cpu(tbl[lindex + len]) == leafblk) + len++; + + /* See if that leaf block is valid. If not, write a new one + that falls on a proper boundary. If it doesn't naturally, + we may need more. */ + if (!valid_block_ip(ip, leafblk)) { + uint64_t new_leafblk; + + log_err(_("Dinode %llu (0x%llx) has bad leaf pointers " + "at offset %d for %d\n"), + (unsigned long long)ip->i_di.di_num.no_addr, + (unsigned long long)ip->i_di.di_num.no_addr, + lindex, len); + if (!query( _("Fix the hash table? 
(y/n) "))) { + log_err(_("Hash table not fixed.\n")); + lindex += len; + continue; + } + error = write_new_leaf(ip, lindex, proper_len, + _("replacing"), &new_leafblk); + if (error) + return error; + + for (i = lindex; i < lindex + proper_len; i++) + tbl[i] = cpu_to_be64(new_leafblk); + lindex += proper_len; + continue; + } + + if (check_hash_tbl_dups(ip, tbl, hsize, lindex, len)) + continue; + + /* Make sure they call on proper leaf-split boundaries. This + is the calculation used by the kernel, and dir_split_leaf */ + proper_start = (lindex & ~(proper_len - 1)); + if (lindex != proper_start) { + log_debug(_("lindex 0x%llx is not a proper starting " + "point for leaf %llu (0x%llx): 0x%llx\n"), + (unsigned long long)lindex, + (unsigned long long)leafblk, + (unsigned long long)leafblk, + (unsigned long long)proper_start); + changes = fix_hashtable(ip, tbl, hsize, leafblk, + lindex, proper_start, len, + &proper_len, factor); + /* Check if we need to split more leaf blocks */ + if (changes) { + if (proper_len < (len >> 1)) + log_err(_("More leaf splits are " + "needed; ")); + log_err(_("Reprocessing index 0x%x (case 3).\n"), + lindex); + continue; /* Make it reprocess the lindex */ + } + } + /* Check for extra pointers to this leaf. At this point, len + is the number of pointers we have. proper_len is the proper + number of pointers if the hash table is assumed correct. + Function fix_hashtable will read in the leaf block and + determine the "actual" proper length based on the leaf + depth, and adjust the hash table accordingly. */ + if (len != proper_len) { + log_err(_("Length %d (0x%x) is not a proper length " + "for leaf %llu (0x%llx). 
Valid boundary " + "assumed to be %d (0x%x).\n"), len, len, + (unsigned long long)leafblk, + (unsigned long long)leafblk, + proper_len, proper_len); + lbh = bread(ip->i_sbd, leafblk); + gfs2_leaf_in(&leaf, lbh->b_data); + if (gfs2_check_meta(lbh, GFS2_METATYPE_LF) || + leaf.lf_depth > ip->i_di.di_depth) + leaf.lf_depth = factor; + brelse(lbh); + changes = fix_hashtable(ip, tbl, hsize, leafblk, + lindex, lindex, len, + &proper_len, leaf.lf_depth); + /* If fixing the hash table made changes, we can no + longer count on the leaf block pointers all pointing + to the same leaf (which is checked below). To avoid + flagging another error, reprocess the offset. */ + if (changes) { + log_err(_("Reprocessing index 0x%x (case 4).\n"), + lindex); + continue; /* Make it reprocess the lindex */ + } + } + + /* Now make sure they're all the same pointer */ + for (i = lindex; i < lindex + proper_len; i++) { + if (fsck_abort) + return changes; + + if (be64_to_cpu(tbl[i]) == leafblk) /* No problem */ + continue; + + log_err(_("Dinode %llu (0x%llx) has a hash table " + "inconsistency at index %d (0x%x) for %d\n"), + (unsigned long long)ip->i_di.di_num.no_addr, + (unsigned long long)ip->i_di.di_num.no_addr, + i, i, len); + if (!query( _("Fix the hash table? (y/n) "))) { + log_err(_("Hash table not fixed.\n")); + continue; + } + changes++; + /* Now we have to determine if the hash table is + corrupt, or if the leaf has the wrong depth. */ + lbh = bread(ip->i_sbd, leafblk); + gfs2_leaf_in(&leaf, lbh->b_data); + brelse(lbh); + /* Calculate the expected pointer count based on the + leaf depth. 
*/ + proper_len = 1 << (ip->i_di.di_depth - leaf.lf_depth); + if (proper_len != len) { + log_debug(_("Length 0x%x is not proper for " + "leaf %llu (0x%llx): 0x%x\n"), + len, (unsigned long long)leafblk, + (unsigned long long)leafblk, + proper_len); + changes = fix_hashtable(ip, tbl, hsize, + leafblk, lindex, + lindex, len, + &proper_len, + leaf.lf_depth); + break; + } + } + lindex += proper_len; + } + if (!error && changes) + error = 1; + return error; +} + +struct metawalk_fxns pass2_fxns = { + .private = NULL, + .check_leaf_depth = check_leaf_depth, + .check_leaf = NULL, + .check_metalist = NULL, + .check_data = NULL, + .check_eattr_indir = check_eattr_indir, + .check_eattr_leaf = check_eattr_leaf, + .check_dentry = check_dentry, + .check_eattr_entry = NULL, + .check_hash_tbl = check_hash_tbl, + .repair_leaf = pass2_repair_leaf, +}; + +static int check_metalist_qc(struct gfs2_inode *ip, uint64_t block, + struct gfs2_buffer_head **bh, int h, + int *is_valid, int *was_duplicate, void *private) +{ + *was_duplicate = 0; + *is_valid = 1; + *bh = bread(ip->i_sbd, block); + return meta_is_good; +} + +static int check_data_qc(struct gfs2_inode *ip, uint64_t metablock, + uint64_t block, void *private, + struct gfs2_buffer_head *bbh, uint64_t *ptr) +{ + struct gfs2_buffer_head *bh; + + /* At this point, basic data block checks have already been done, + so we only need to make sure they're QC blocks. 
*/ + if (!valid_block_ip(ip, block)) + return -1; + + bh = bread(ip->i_sbd, block); + if (gfs2_check_meta(bh, GFS2_METATYPE_QC) != 0) { + log_crit(_("Error: quota_change block at %lld (0x%llx) is " + "the wrong metadata type.\n"), + (unsigned long long)block, (unsigned long long)block); + brelse(bh); + return -1; + } + brelse(bh); + return 0; +} + +struct metawalk_fxns quota_change_fxns = { + .check_metalist = check_metalist_qc, + .check_data = check_data_qc, +}; + +/* check_pernode_for - verify a file within the system per_node directory + * @x - index number X + * @per_node - pointer to the per_node inode + * @fn - system file name + * @filelen - the file length the system file needs to be + * @multiple - the file length must be a multiple (versus the exact value) + * @pass - a metawalk function for checking the data blocks (if any) + * @builder - a rebuild function for the file + * + * Returns: 0 if all went well, else error. */ +static int check_pernode_for(int x, struct gfs2_inode *pernode, const char *fn, + unsigned long long filelen, int multiple, + struct metawalk_fxns *pass, + int builder(struct gfs2_inode *per_node, + unsigned int j)) +{ + struct gfs2_inode *ip; + int error, valid_size = 1; + + log_debug(_("Checking system file %s\n"), fn); + error = gfs2_lookupi(pernode, fn, strlen(fn), &ip); + if (error) { + log_err(_("System file %s is missing.\n"), fn); + if (!query( _("Rebuild the system file? (y/n) "))) + return 0; + goto build_it; + } + if (!ip->i_di.di_size) + valid_size = 0; + else if (!multiple && ip->i_di.di_size != filelen) + valid_size = 0; + else if (multiple && (ip->i_di.di_size % filelen)) + valid_size = 0; + if (!valid_size) { + log_err(_("System file %s has an invalid size. Is %llu, " + "should be %llu.\n"), fn, ip->i_di.di_size, filelen); + if (!query( _("Rebuild the system file? 
(y/n) "))) + goto out_good; + fsck_inode_put(&ip); + goto build_it; + } + if (pass) { + error = check_metatree(ip, pass); + if (!error) + goto out_good; + log_err(_("System file %s has bad contents.\n"), fn); + if (!query( _("Delete and rebuild the system file? (y/n) "))) + goto out_good; + check_metatree(ip, &pass2_fxns_delete); + fsck_inode_put(&ip); + gfs2_dirent_del(pernode, fn, strlen(fn)); + goto build_it; + } +out_good: + fsck_inode_put(&ip); + return 0; + +build_it: + if (builder(pernode, x)) { + log_err(_("Error building %s\n"), fn); + return -1; + } + error = gfs2_lookupi(pernode, fn, strlen(fn), &ip); + if (error) { + log_err(_("Error rebuilding %s.\n"), fn); + return -1; + } + fsck_bitmap_set(ip, ip->i_di.di_num.no_addr, fn, GFS2_BLKST_DINODE); + log_err(_("System file %s rebuilt.\n"), fn); + goto out_good; +} + +/* Check system directory inode */ +/* Should work for all system directories: root, master, jindex, per_node */ +static int check_system_dir(struct gfs2_inode *sysinode, const char *dirname, + int builder(struct gfs2_sbd *sdp)) +{ + uint64_t iblock = 0; + struct dir_status ds = {0}; + int error = 0; + + log_info( _("Checking system directory inode '%s'\n"), dirname); + + if (!sysinode) { + log_err( _("Failed to check '%s': sysinode is null\n"), dirname); + stack; + return -1; + } + + iblock = sysinode->i_di.di_num.no_addr; + ds.q = bitmap_type(sysinode->i_sbd, iblock); + + pass2_fxns.private = (void *) &ds; + if (ds.q == GFS2_BLKST_FREE) { + /* First check that the directory's metatree is valid */ + error = check_metatree(sysinode, &pass2_fxns); + if (error < 0) { + stack; + return error; + } + } + error = check_dir(sysinode->i_sbd, sysinode, &pass2_fxns); + if (skip_this_pass || fsck_abort) /* if asked to skip the rest */ + return FSCK_OK; + if (error < 0) { + stack; + return -1; + } + if (error > 0) + fsck_bitmap_set(sysinode, iblock, dirname, GFS2_BLKST_FREE); + + if (check_inode_eattr(sysinode, &pass2_fxns)) { + stack; + return -1; + } + 
if (!ds.dotdir) { + log_err( _("No '.' entry found for %s directory.\n"), dirname); + if (query( _("Is it okay to add '.' entry? (y/n) "))) { + log_warn( _("Adding '.' entry\n")); + error = dir_add(sysinode, ".", 1, &(sysinode->i_di.di_num), + (sysinode->i_sbd->gfs1 ? GFS_FILE_DIR : DT_DIR)); + if (error) { + log_err(_("Error adding directory %s: %s\n"), "'.'", + strerror(errno)); + return -errno; + } + /* This system inode is linked to itself via '.' */ + incr_link_count(sysinode->i_di.di_num, sysinode, + "sysinode \".\""); + ds.entry_count++; + } else + log_err( _("The directory was not fixed.\n")); + } + if (sysinode->i_di.di_entries != ds.entry_count) { + log_err( _("%s inode %llu (0x%llx" + "): Entries is %d - should be %d\n"), dirname, + (unsigned long long)sysinode->i_di.di_num.no_addr, + (unsigned long long)sysinode->i_di.di_num.no_addr, + sysinode->i_di.di_entries, ds.entry_count); + if (query( _("Fix entries for %s inode %llu (0x%llx)? (y/n) "), + dirname, + (unsigned long long)sysinode->i_di.di_num.no_addr, + (unsigned long long)sysinode->i_di.di_num.no_addr)) { + sysinode->i_di.di_entries = ds.entry_count; + bmodified(sysinode->i_bh); + log_warn( _("Entries updated\n")); + } else { + log_err( _("Entries for inode %llu (0x%llx" + ") left out of sync\n"), + (unsigned long long) + sysinode->i_di.di_num.no_addr, + (unsigned long long) + sysinode->i_di.di_num.no_addr); + } + } + error = 0; + if (sysinode == sysinode->i_sbd->md.pinode) { + int j; + char fn[64]; + + /* Make sure all the per_node files are there, and valid */ + for (j = 0; j < sysinode->i_sbd->md.journals; j++) { + sprintf(fn, "inum_range%d", j); + error += check_pernode_for(j, sysinode, fn, 16, 0, + NULL, build_inum_range); + sprintf(fn, "statfs_change%d", j); + error += check_pernode_for(j, sysinode, fn, 24, 0, + NULL, build_statfs_change); + sprintf(fn, "quota_change%d", j); + error += check_pernode_for(j, sysinode, fn, 1048576, 1, + &quota_change_fxns, + build_quota_change); + } + } + return
error; +} + +/** + * is_system_dir - determine if a given block is for a system directory. + */ +static inline int is_system_dir(struct gfs2_sbd *sdp, uint64_t block) +{ + if (block == sdp->md.rooti->i_di.di_num.no_addr) + return TRUE; + if (sdp->gfs1) + return FALSE; + if (block == sdp->md.jiinode->i_di.di_num.no_addr || + block == sdp->md.pinode->i_di.di_num.no_addr || + block == sdp->master_dir->i_di.di_num.no_addr) + return TRUE; + return FALSE; +} + +static int pass2_check_dir(struct gfs2_sbd *sdp, struct gfs2_inode *ip) +{ + uint64_t dirblk = ip->i_di.di_num.no_addr; + struct dir_status ds = {0}; + int error; + + pass2_fxns.private = &ds; + error = check_dir(sdp, ip, &pass2_fxns); + if (skip_this_pass || fsck_abort) /* if asked to skip the rest */ + return FSCK_OK; + if (error < 0) { + stack; + return FSCK_ERROR; + } + if (error > 0) { + struct dir_info *di; + + di = dirtree_find(dirblk); + if (!di) { + stack; + return FSCK_ERROR; + } + if (query(_("Remove directory entry for bad inode " + "%llu (0x%llx) in %llu (0x%llx)? (y/n)"), + (unsigned long long)dirblk, + (unsigned long long)dirblk, + (unsigned long long)di->treewalk_parent, + (unsigned long long)di->treewalk_parent)) { + error = remove_dentry_from_dir(sdp, di->treewalk_parent, dirblk); + if (error < 0) { + stack; + return FSCK_ERROR; + } + if (error > 0) { + log_warn(_("Unable to find dentry for %llu (0x%llx) " + "in %llu (0x%llx)\n"), + (unsigned long long)dirblk, + (unsigned long long)dirblk, + (unsigned long long)di->treewalk_parent, + (unsigned long long)di->treewalk_parent); + } + log_warn(_("Directory entry removed\n")); + } else + log_err(_("Directory entry to invalid inode remains.\n")); + + log_debug(_("Directory block %lld (0x%llx) is now marked as 'invalid'\n"), + (unsigned long long)dirblk, (unsigned long long)dirblk); + check_n_fix_bitmap(sdp, ip->i_rgd, dirblk, 0, GFS2_BLKST_FREE); + } + + if (!ds.dotdir) { + log_err(_("No '.' 
entry found for directory inode at block %llu (0x%llx)\n"), + (unsigned long long)dirblk, (unsigned long long)dirblk); + + if (query( _("Is it okay to add '.' entry? (y/n) "))) { + error = dir_add(ip, ".", 1, &(ip->i_di.di_num), + (sdp->gfs1 ? GFS_FILE_DIR : DT_DIR)); + if (error) { + log_err(_("Error adding directory %s: %s\n"), "'.'", + strerror(errno)); + return -errno; + } + /* directory links to itself via '.' */ + incr_link_count(ip->i_di.di_num, ip, _("\". (itself)\"")); + ds.entry_count++; + log_err(_("The directory was fixed.\n")); + } else { + log_err(_("The directory was not fixed.\n")); + } + } + + if (!fsck_abort && ip->i_di.di_entries != ds.entry_count) { + log_err(_("Entries is %d - should be %d for inode block %llu (0x%llx)\n"), + ip->i_di.di_entries, ds.entry_count, + (unsigned long long)ip->i_di.di_num.no_addr, + (unsigned long long)ip->i_di.di_num.no_addr); + if (query(_("Fix the entry count? (y/n) "))) { + ip->i_di.di_entries = ds.entry_count; + bmodified(ip->i_bh); + } else { + log_err(_("The entry count was not fixed.\n")); + } + } + return FSCK_OK; +} + +/* What I need to do in this pass is check that the dentries aren't + * pointing to invalid blocks, verify the contents of each + * directory, and start filling in the directory info structure. */ + +/** + * pass2 - check pathnames + * + * verify root inode + * directory name length + * entries in range + */ +int pass2(struct gfs2_sbd *sdp) +{ + struct osi_node *tmp, *next = NULL; + struct gfs2_inode *ip; + struct dir_info *dt; + uint64_t dirblk; + int error; + + /* Check all the system directory inodes.
*/ + if (!sdp->gfs1 && + check_system_dir(sdp->md.jiinode, "jindex", build_jindex)) { + stack; + return FSCK_ERROR; + } + if (skip_this_pass || fsck_abort) /* if asked to skip the rest */ + return FSCK_OK; + if (!sdp->gfs1 && + check_system_dir(sdp->md.pinode, "per_node", build_per_node)) { + stack; + return FSCK_ERROR; + } + if (skip_this_pass || fsck_abort) /* if asked to skip the rest */ + return FSCK_OK; + if (!sdp->gfs1 && + check_system_dir(sdp->master_dir, "master", build_master)) { + stack; + return FSCK_ERROR; + } + if (skip_this_pass || fsck_abort) /* if asked to skip the rest */ + return FSCK_OK; + if (check_system_dir(sdp->md.rooti, "root", build_root)) { + stack; + return FSCK_ERROR; + } + if (skip_this_pass || fsck_abort) /* if asked to skip the rest */ + return FSCK_OK; + log_info( _("Checking directory inodes.\n")); + /* Grab each directory inode, and run checks on it */ + for (tmp = osi_first(&dirtree); tmp; tmp = next) { + next = osi_next(tmp); + + dt = (struct dir_info *)tmp; + dirblk = dt->dinode.no_addr; + warm_fuzzy_stuff(dirblk); + if (skip_this_pass || fsck_abort) /* if asked to skip the rest */ + return FSCK_OK; + + /* Skip the system inodes - they're checked above */ + if (is_system_dir(sdp, dirblk)) + continue; + + /* If we created lost+found, its links should have been + properly adjusted, so don't check it. 
*/ + if (lf_was_created && + (dirblk == lf_dip->i_di.di_num.no_addr)) { + log_debug(_("Pass2 skipping the new lost+found.\n")); + continue; + } + + log_debug(_("Checking directory inode at block %llu (0x%llx)\n"), + (unsigned long long)dirblk, (unsigned long long)dirblk); + + ip = fsck_load_inode(sdp, dirblk); + if (ip == NULL) { + stack; + return FSCK_ERROR; + } + error = pass2_check_dir(sdp, ip); + fsck_inode_put(&ip); + + if (skip_this_pass || fsck_abort) + return FSCK_OK; + + if (error != FSCK_OK) { + stack; + return error; + } + } + return FSCK_OK; +} diff --git a/gfs2/fsck/pass3.c b/gfs2/fsck/pass3.c new file mode 100644 index 0000000..4b9c54d --- /dev/null +++ b/gfs2/fsck/pass3.c @@ -0,0 +1,326 @@ +#include "clusterautoconfig.h" + +#include +#include +#include +#include +#include +#include +#define _(String) gettext(String) + +#include +#include "libgfs2.h" +#include "osi_list.h" +#include "fsck.h" +#include "lost_n_found.h" +#include "link.h" +#include "metawalk.h" +#include "util.h" +#include "afterpass1_common.h" + +static int attach_dotdot_to(struct gfs2_sbd *sdp, uint64_t newdotdot, + uint64_t olddotdot, uint64_t block) +{ + const char *filename = ".."; + int filename_len = 2; + int err; + struct gfs2_inode *ip, *pip; + + ip = fsck_load_inode(sdp, block); + pip = fsck_load_inode(sdp, newdotdot); + /* FIXME: Need to add some interactive + * options here and come up with a + * good default for non-interactive */ + /* FIXME: do i need to correct the + * '..' entry for this directory in + * this case? */ + + if (gfs2_dirent_del(ip, filename, filename_len)) + log_warn( _("Unable to remove \"..\" directory entry.\n")); + else + decr_link_count(olddotdot, block, sdp->gfs1, _("old \"..\"")); + err = dir_add(ip, filename, filename_len, &pip->i_di.di_num, + (sdp->gfs1 ? 
GFS_FILE_DIR : DT_DIR)); + if (err) { + log_err(_("Error adding directory %s: %s\n"), + filename, strerror(errno)); + exit(FSCK_ERROR); + } + incr_link_count(pip->i_di.di_num, ip, _("new \"..\"")); + fsck_inode_put(&ip); + fsck_inode_put(&pip); + return 0; +} + +static struct dir_info *mark_and_return_parent(struct gfs2_sbd *sdp, + struct dir_info *di) +{ + struct dir_info *pdi; + int q_dotdot, q_treewalk; + int error = 0; + struct dir_info *dt_dotdot, *dt_treewalk; + + di->checked = 1; + + if (!di->treewalk_parent) + return NULL; + + if (di->dotdot_parent.no_addr == di->treewalk_parent) { + q_dotdot = bitmap_type(sdp, di->dotdot_parent.no_addr); + if (q_dotdot != GFS2_BLKST_DINODE) { + log_err( _("Orphaned directory at block %llu (0x%llx) " + "moved to lost+found\n"), + (unsigned long long)di->dinode.no_addr, + (unsigned long long)di->dinode.no_addr); + return NULL; + } + goto out; + } + + log_warn( _("Directory '..' and treewalk connections disagree for " + "inode %llu (0x%llx)\n"), + (unsigned long long)di->dinode.no_addr, + (unsigned long long)di->dinode.no_addr); + log_notice( _("'..' has %llu (0x%llx), treewalk has %llu (0x%llx)\n"), + (unsigned long long)di->dotdot_parent.no_addr, + (unsigned long long)di->dotdot_parent.no_addr, + (unsigned long long)di->treewalk_parent, + (unsigned long long)di->treewalk_parent); + q_dotdot = bitmap_type(sdp, di->dotdot_parent.no_addr); + dt_dotdot = dirtree_find(di->dotdot_parent.no_addr); + q_treewalk = bitmap_type(sdp, di->treewalk_parent); + dt_treewalk = dirtree_find(di->treewalk_parent); + /* if the dotdot entry isn't a directory, but the + * treewalk is, treewalk is correct - if the treewalk + * entry isn't a directory, but the dotdot is, dotdot + * is correct - if both are directories, which do we + * choose? 
if neither are directories, we have a + * problem - need to move this directory into lost+found + */ + if (q_dotdot != GFS2_BLKST_DINODE || dt_dotdot == NULL) { + if (q_treewalk != GFS2_BLKST_DINODE) { + log_err( _("Orphaned directory, move to " + "lost+found\n")); + return NULL; + } else { + log_warn( _("Treewalk parent is correct, fixing " + "dotdot -> %llu (0x%llx)\n"), + (unsigned long long)di->treewalk_parent, + (unsigned long long)di->treewalk_parent); + attach_dotdot_to(sdp, di->treewalk_parent, + di->dotdot_parent.no_addr, + di->dinode.no_addr); + di->dotdot_parent.no_addr = di->treewalk_parent; + } + goto out; + } + if (dt_treewalk) { + log_err( _("Both .. and treewalk parents are directories, " + "going with treewalk...\n")); + attach_dotdot_to(sdp, di->treewalk_parent, + di->dotdot_parent.no_addr, + di->dinode.no_addr); + di->dotdot_parent.no_addr = di->treewalk_parent; + goto out; + } + log_warn( _(".. parent is valid, but treewalk is bad - reattaching to " + "lost+found")); + + /* FIXME: add a dinode for this entry instead? */ + + if (!query( _("Remove directory entry for bad inode %llu (0x%llx) in " + "%llu (0x%llx)? 
(y/n)"), + (unsigned long long)di->dinode.no_addr, + (unsigned long long)di->dinode.no_addr, + (unsigned long long)di->treewalk_parent, + (unsigned long long)di->treewalk_parent)) { + log_err( _("Directory entry to invalid inode remains\n")); + return NULL; + } + error = remove_dentry_from_dir(sdp, di->treewalk_parent, + di->dinode.no_addr); + if (error < 0) { + stack; + return NULL; + } + if (error > 0) + log_warn( _("Unable to find dentry for block %llu" + " (0x%llx) in %llu (0x%llx)\n"), + (unsigned long long)di->dinode.no_addr, + (unsigned long long)di->dinode.no_addr, + (unsigned long long)di->treewalk_parent, + (unsigned long long)di->treewalk_parent); + log_warn( _("Directory entry removed\n")); + log_info( _("Marking directory unlinked\n")); + + return NULL; + +out: + pdi = dirtree_find(di->dotdot_parent.no_addr); + + return pdi; +} + +/** + * pass3 - check connectivity of directories + * + * handle disconnected directories + * handle lost+found directory errors (missing, not a directory, no space) + */ +int pass3(struct gfs2_sbd *sdp) +{ + struct osi_node *tmp, *next = NULL; + struct dir_info *di, *tdi; + struct gfs2_inode *ip; + int q; + + di = dirtree_find(sdp->md.rooti->i_di.di_num.no_addr); + if (di) { + log_info( _("Marking root inode connected\n")); + di->checked = 1; + } + if (sdp->gfs1) { + di = dirtree_find(sdp->md.statfs->i_di.di_num.no_addr); + if (di) { + log_info( _("Marking GFS1 statfs file inode " + "connected\n")); + di->checked = 1; + } + di = dirtree_find(sdp->md.jiinode->i_di.di_num.no_addr); + if (di) { + log_info( _("Marking GFS1 jindex file inode " + "connected\n")); + di->checked = 1; + } + di = dirtree_find(sdp->md.riinode->i_di.di_num.no_addr); + if (di) { + log_info( _("Marking GFS1 rindex file inode " + "connected\n")); + di->checked = 1; + } + di = dirtree_find(sdp->md.qinode->i_di.di_num.no_addr); + if (di) { + log_info( _("Marking GFS1 quota file inode " + "connected\n")); + di->checked = 1; + } + } else { + di = 
dirtree_find(sdp->master_dir->i_di.di_num.no_addr); + if (di) { + log_info( _("Marking master directory inode " + "connected\n")); + di->checked = 1; + } + } + + /* Go through the directory list, working up through the parents + * until we find one that's been checked already. If we don't + * find a parent, put in lost+found. + */ + log_info( _("Checking directory linkage.\n")); + for (tmp = osi_first(&dirtree); tmp; tmp = next) { + next = osi_next(tmp); + di = (struct dir_info *)tmp; + while (!di->checked) { + /* FIXME: Change this so it returns success or + * failure and put the parent inode in a + * param */ + if (skip_this_pass || fsck_abort) /* if asked to skip the rest */ + return FSCK_OK; + tdi = mark_and_return_parent(sdp, di); + + if (tdi) { + log_debug( _("Directory at block %llu " + "(0x%llx) connected\n"), + (unsigned long long)di->dinode.no_addr, + (unsigned long long)di->dinode.no_addr); + di = tdi; + continue; + } + q = bitmap_type(sdp, di->dinode.no_addr); + ip = fsck_load_inode(sdp, di->dinode.no_addr); + if (q == GFS2_BLKST_FREE) { + log_err( _("Found unlinked directory " + "containing bad block at block %llu" + " (0x%llx)\n"), + (unsigned long long)di->dinode.no_addr, + (unsigned long long)di->dinode.no_addr); + if (query(_("Clear unlinked directory " + "with bad blocks? (y/n) "))) { + log_warn( _("inode %lld (0x%llx) is " + "now marked as free\n"), + (unsigned long long) + di->dinode.no_addr, + (unsigned long long) + di->dinode.no_addr); + check_n_fix_bitmap(sdp, ip->i_rgd, + di->dinode.no_addr, + 0, GFS2_BLKST_FREE); + fsck_inode_put(&ip); + break; + } else + log_err( _("Unlinked directory with bad block remains\n")); + } + if (q != GFS2_BLKST_DINODE) { + log_err( _("Unlinked block marked as an inode " + "is not an inode\n")); + if (!query(_("Clear the unlinked block?" 
+ " (y/n) "))) { + log_err( _("The block was not " + "cleared\n")); + fsck_inode_put(&ip); + break; + } + log_warn( _("inode %lld (0x%llx) is now " + "marked as free\n"), + (unsigned long long)di->dinode.no_addr, + (unsigned long long)di->dinode.no_addr); + check_n_fix_bitmap(sdp, ip->i_rgd, + di->dinode.no_addr, 0, + GFS2_BLKST_FREE); + log_err( _("The block was cleared\n")); + fsck_inode_put(&ip); + break; + } + + log_err( _("Found unlinked directory at block %llu" + " (0x%llx)\n"), + (unsigned long long)di->dinode.no_addr, + (unsigned long long)di->dinode.no_addr); + /* Don't skip zero size directories with eattrs */ + if (!ip->i_di.di_size && !ip->i_di.di_eattr){ + log_err( _("Unlinked directory has zero " + "size.\n")); + if (query( _("Remove zero-size unlinked " + "directory? (y/n) "))) { + fsck_bitmap_set(ip, di->dinode.no_addr, + _("zero-sized unlinked inode"), + GFS2_BLKST_FREE); + fsck_inode_put(&ip); + break; + } else { + log_err( _("Zero-size unlinked " + "directory remains\n")); + } + } + if (query( _("Add unlinked directory to " + "lost+found? 
(y/n) "))) { + if (add_inode_to_lf(ip)) { + fsck_inode_put(&ip); + stack; + return FSCK_ERROR; + } + log_warn( _("Directory relinked to lost+found\n")); + } else { + log_err( _("Unlinked directory remains unlinked\n")); + } + fsck_inode_put(&ip); + break; + } + } + if (lf_dip) { + log_debug( _("At end of pass3, lost+found entries is %u\n"), + lf_dip->i_di.di_entries); + } + return FSCK_OK; +} diff --git a/gfs2/fsck/pass4.c b/gfs2/fsck/pass4.c new file mode 100644 index 0000000..313ff0f --- /dev/null +++ b/gfs2/fsck/pass4.c @@ -0,0 +1,306 @@ +#include "clusterautoconfig.h" + +#include +#include +#include +#include +#define _(String) gettext(String) + +#include +#include "libgfs2.h" +#include "fsck.h" +#include "link.h" +#include "lost_n_found.h" +#include "inode_hash.h" +#include "metawalk.h" +#include "util.h" +#include "afterpass1_common.h" + +struct metawalk_fxns pass4_fxns_delete = { + .private = NULL, + .check_metalist = delete_metadata, + .check_data = delete_data, + .check_eattr_indir = delete_eattr_indir, + .check_eattr_leaf = delete_eattr_leaf, +}; + +/* Updates the link count of an inode to what the fsck has seen for + * link count */ +static int fix_link_count(uint32_t counted_links, struct gfs2_inode *ip) +{ + log_info( _("Fixing inode link count (%d->%d) for %llu (0x%llx) \n"), + ip->i_di.di_nlink, counted_links, + (unsigned long long)ip->i_di.di_num.no_addr, + (unsigned long long)ip->i_di.di_num.no_addr); + if (ip->i_di.di_nlink == counted_links) + return 0; + ip->i_di.di_nlink = counted_links; + bmodified(ip->i_bh); + + log_debug( _("Changing inode %llu (0x%llx) to have %u links\n"), + (unsigned long long)ip->i_di.di_num.no_addr, + (unsigned long long)ip->i_di.di_num.no_addr, counted_links); + return 0; +} + +/** + * handle_unlinked - handle an unlinked dinode + * + * Note: We need to pass in *counted_links here, not counted_links because + * add_inode_to_lf may be called here, and that might change the original + * value, whether that's in the 
dirtree or the inodetree. + * + * Returns: 1 if caller should do "continue", 0 if not. + */ +static int handle_unlinked(struct gfs2_sbd *sdp, uint64_t no_addr, + uint32_t *counted_links, int *lf_addition) +{ + struct gfs2_inode *ip; + int q; + + log_err( _("Found unlinked inode at %llu (0x%llx)\n"), + (unsigned long long)no_addr, (unsigned long long)no_addr); + q = bitmap_type(sdp, no_addr); + if (q == GFS2_BLKST_FREE) { + log_err( _("Unlinked inode %llu (0x%llx) contains bad " + "blocks\n"), (unsigned long long)no_addr, + (unsigned long long)no_addr); + if (query(_("Delete unlinked inode with bad blocks? " + "(y/n) "))) { + ip = fsck_load_inode(sdp, no_addr); + check_inode_eattr(ip, &pass4_fxns_delete); + check_metatree(ip, &pass4_fxns_delete); + fsck_bitmap_set(ip, no_addr, _("bad unlinked"), + GFS2_BLKST_FREE); + fsck_inode_put(&ip); + return 1; + } else { + log_err( _("Unlinked inode with bad blocks not " + "cleared\n")); + } + } + if (q != GFS2_BLKST_DINODE) { + log_err( _("Unlinked block %lld (0x%llx) marked as inode is " + "not an inode (%d)\n"), + (unsigned long long)no_addr, + (unsigned long long)no_addr, q); + ip = fsck_load_inode(sdp, no_addr); + if (query(_("Delete unlinked inode? (y/n) "))) { + check_inode_eattr(ip, &pass4_fxns_delete); + check_metatree(ip, &pass4_fxns_delete); + fsck_bitmap_set(ip, no_addr, _("invalid unlinked"), + GFS2_BLKST_FREE); + fsck_inode_put(&ip); + log_err( _("The inode was deleted\n")); + } else { + log_err( _("The inode was not deleted\n")); + fsck_inode_put(&ip); + } + return 1; + } + ip = fsck_load_inode(sdp, no_addr); + + /* We don't want to clear zero-size files with eattrs - there might be + relevent info in them. */ + if (!ip->i_di.di_size && !ip->i_di.di_eattr){ + log_err( _("Unlinked inode has zero size\n")); + if (query(_("Clear zero-size unlinked inode? 
(y/n) "))) { + fsck_bitmap_set(ip, no_addr, _("unlinked zero-length"), + GFS2_BLKST_FREE); + fsck_inode_put(&ip); + return 1; + } + } + if (query( _("Add unlinked inode to lost+found? (y/n)"))) { + if (add_inode_to_lf(ip)) { + stack; + fsck_inode_put(&ip); + return -1; + } else { + fix_link_count(*counted_links, ip); + *lf_addition = 1; + } + } else + log_err( _("Unlinked inode left unlinked\n")); + fsck_inode_put(&ip); + return 0; +} + +static void handle_inconsist(struct gfs2_sbd *sdp, uint64_t no_addr, + uint32_t *di_nlink, uint32_t counted_links) +{ + log_err( _("Link count inconsistent for inode %llu" + " (0x%llx) has %u but fsck found %u.\n"), + (unsigned long long)no_addr, (unsigned long long)no_addr, + *di_nlink, counted_links); + /* Read in the inode, adjust the link count, and write it back out */ + if (query( _("Update link count for inode %llu (0x%llx) ? (y/n) "), + (unsigned long long)no_addr, (unsigned long long)no_addr)) { + struct gfs2_inode *ip; + + ip = fsck_load_inode(sdp, no_addr); /* bread, inode_get */ + fix_link_count(counted_links, ip); + *di_nlink = counted_links; + fsck_inode_put(&ip); /* out, brelse, free */ + log_warn(_("Link count updated to %d for inode %llu " + "(0x%llx)\n"), *di_nlink, + (unsigned long long)no_addr, + (unsigned long long)no_addr); + } else { + log_err( _("Link count for inode %llu (0x%llx) still " + "incorrect\n"), + (unsigned long long)no_addr, + (unsigned long long)no_addr); + } +} + +static int adjust_lf_links(int lf_addition) +{ + struct dir_info *lf_di; + + if (lf_dip == NULL) + return 0; + + if (!lf_addition) + return 0; + + if (!(lf_di = dirtree_find(lf_dip->i_di.di_num.no_addr))) { + log_crit(_("Unable to find lost+found inode in " + "inode_hash!!\n")); + return -1; + } else { + fix_link_count(lf_di->counted_links, lf_dip); + } + return 0; +} + +static int scan_inode_list(struct gfs2_sbd *sdp) +{ + struct osi_node *tmp, *next = NULL; + struct inode_info *ii; + int lf_addition = 0; + + /* FIXME: should 
probably factor this out into a generic + * scanning fxn */ + for (tmp = osi_first(&inodetree); tmp; tmp = next) { + if (skip_this_pass || fsck_abort) /* if asked to skip the rest */ + return 0; + next = osi_next(tmp); + ii = (struct inode_info *)tmp; + /* Don't check reference counts on the special gfs files */ + if (sdp->gfs1 && + ((ii->di_num.no_addr == sdp->md.riinode->i_di.di_num.no_addr) || + (ii->di_num.no_addr == sdp->md.qinode->i_di.di_num.no_addr) || + (ii->di_num.no_addr == sdp->md.statfs->i_di.di_num.no_addr))) + continue; + if (ii->counted_links == 0) { + if (handle_unlinked(sdp, ii->di_num.no_addr, + &ii->counted_links, &lf_addition)) + continue; + } /* if (ii->counted_links == 0) */ + else if (ii->di_nlink != ii->counted_links) { + handle_inconsist(sdp, ii->di_num.no_addr, + &ii->di_nlink, ii->counted_links); + } + log_debug( _("block %llu (0x%llx) has link count %d\n"), + (unsigned long long)ii->di_num.no_addr, + (unsigned long long)ii->di_num.no_addr, ii->di_nlink); + } /* osi_list_foreach(tmp, list) */ + + return adjust_lf_links(lf_addition); +} + +static int scan_dir_list(struct gfs2_sbd *sdp) +{ + struct osi_node *tmp, *next = NULL; + struct dir_info *di; + int lf_addition = 0; + + /* FIXME: should probably factor this out into a generic + * scanning fxn */ + for (tmp = osi_first(&dirtree); tmp; tmp = next) { + if (skip_this_pass || fsck_abort) /* if asked to skip the rest */ + return 0; + next = osi_next(tmp); + di = (struct dir_info *)tmp; + /* Don't check reference counts on the special gfs files */ + if (sdp->gfs1 && + di->dinode.no_addr == sdp->md.jiinode->i_di.di_num.no_addr) + continue; + if (di->counted_links == 0) { + if (handle_unlinked(sdp, di->dinode.no_addr, + &di->counted_links, &lf_addition)) + continue; + } else if (di->di_nlink != di->counted_links) { + handle_inconsist(sdp, di->dinode.no_addr, + &di->di_nlink, di->counted_links); + } + log_debug( _("block %llu (0x%llx) has link count %d\n"), + (unsigned long 
long)di->dinode.no_addr, + (unsigned long long)di->dinode.no_addr, di->di_nlink); + } /* end of dirtree walk */ + + return adjust_lf_links(lf_addition); +} +/* scan_nlink1_list - for each block below last_fs_block where link1_type() is non-zero for nlink1map but zero for clink1map, call handle_unlinked(); returns 0 early on skip_this_pass/fsck_abort, otherwise the result of adjust_lf_links() */ +static int scan_nlink1_list(struct gfs2_sbd *sdp) +{ + uint64_t blk; + uint32_t counted_links; + int lf_addition = 0; + + for (blk = 0; blk < last_fs_block; blk++) { + if (skip_this_pass || fsck_abort) + return 0; + if (link1_type(&nlink1map, blk) == 0) + continue; + + if (link1_type(&clink1map, blk) == 0) { + /* In other cases, counted_links is a pointer to a + real count that gets incremented when it's added + to lost+found. In this case, however, there's not a + real count, so we fake it out to be 1. */ + counted_links = 1; + if (handle_unlinked(sdp, blk, &counted_links, + &lf_addition)) + continue; + } + } + return adjust_lf_links(lf_addition); +} + +/** + * pass4 - Check reference counts (pass 2 & 6 in current fsck) + * + * handle unreferenced files + * lost+found errors (missing, not a directory, no space) + * adjust link count + * handle unreferenced inodes of other types + * handle bad blocks + * + * Runs scan_inode_list(), scan_dir_list() and scan_nlink1_list() in that + * order. + * + * Returns: FSCK_ERROR as soon as one of the three scans returns non-zero, + * FSCK_OK otherwise. + */ +int pass4(struct gfs2_sbd *sdp) +{ + if (lf_dip) + log_debug( _("At beginning of pass4, lost+found entries is %u\n"), + lf_dip->i_di.di_entries); + log_info( _("Checking inode reference counts: multi-links.\n")); + if (scan_inode_list(sdp)) { + stack; + return FSCK_ERROR; + } + log_info( _("Checking inode reference counts: directories.\n")); + if (scan_dir_list(sdp)) { + stack; + return FSCK_ERROR; + } + log_info( _("Checking inode reference counts: normal links.\n")); + if (scan_nlink1_list(sdp)) { + stack; + return FSCK_ERROR; + } + + if (lf_dip) + log_debug( _("At end of pass4, lost+found entries is %u\n"), + lf_dip->i_di.di_entries); + return FSCK_OK; +} diff --git a/gfs2/fsck/pass5.c b/gfs2/fsck/pass5.c new file mode 100644 index 0000000..6234194 --- /dev/null +++ b/gfs2/fsck/pass5.c @@ -0,0 +1,242 @@ +#include "clusterautoconfig.h" + +#include +#include +#include +#include +#include +#define
_(String) gettext(String) + +#include +#include "libgfs2.h" +#include "fsck.h" +#include "util.h" + +#define GFS1_BLKST_USEDMETA 4 + +static int check_block_status(struct gfs2_sbd *sdp, struct gfs2_bmap *bl, + char *buffer, unsigned int buflen, + uint64_t *rg_block, uint64_t rg_data, + uint32_t *count) +{ + unsigned char *byte, *end; + unsigned int bit; + unsigned char rg_status; + int q; + uint64_t block; + + /* FIXME verify cast */ + byte = (unsigned char *) buffer; + bit = 0; + end = (unsigned char *) buffer + buflen; + + while (byte < end) { + rg_status = ((*byte >> bit) & GFS2_BIT_MASK); + block = rg_data + *rg_block; + warm_fuzzy_stuff(block); + if (skip_this_pass || fsck_abort) /* if asked to skip the rest */ + return 0; + + q = block_type(bl, block); + /* GFS1 file systems will have to suffer from slower fsck run + * times because in GFS, there's no 1:1 relationship between + * bits and counts. If a bit is marked "dinode" in GFS1, it + * may be dinode -OR- any kind of metadata. I consider GFS1 to + * be a rare exception, so acceptable loss at this point. So + * we must determine whether it's really a dinode or other + * metadata by reading it in. */ + if (sdp->gfs1 && q == GFS2_BLKST_DINODE) { + struct gfs2_buffer_head *bh; + + bh = bread(sdp, block); + if (gfs2_check_meta(bh, GFS2_METATYPE_DI) == 0) + count[GFS2_BLKST_DINODE]++; + else + count[GFS1_BLKST_USEDMETA]++; + brelse(bh); + } else { + count[q]++; + } + + /* If one node opens a file and another node deletes it, we + may be left with a block that appears to be "unlinked" in + the bitmap, but nothing links to it. This is a valid case + and should be cleaned up by the file system eventually. + So we ignore it. */ + if (q == GFS2_BLKST_UNLINKED) { + log_err( _("Unlinked inode found at block %llu " + "(0x%llx).\n"), + (unsigned long long)block, + (unsigned long long)block); + if (query(_("Do you want to reclaim the block? 
" + "(y/n) "))) { + lgfs2_rgrp_t rg = gfs2_blk2rgrpd(sdp, block); + if (gfs2_set_bitmap(rg, block, GFS2_BLKST_FREE)) + log_err(_("Unlinked block %llu " + "(0x%llx) bitmap not fixed." + "\n"), + (unsigned long long)block, + (unsigned long long)block); + else { + log_err(_("Unlinked block %llu " + "(0x%llx) bitmap fixed.\n"), + (unsigned long long)block, + (unsigned long long)block); + count[GFS2_BLKST_UNLINKED]--; + count[GFS2_BLKST_FREE]++; + } + } else { + log_info( _("Unlinked block found at block %llu" + " (0x%llx), left unchanged.\n"), + (unsigned long long)block, + (unsigned long long)block); + } + } else if (rg_status != q) { + log_err( _("Block %llu (0x%llx) bitmap says %u (%s) " + "but FSCK saw %u (%s)\n"), + (unsigned long long)block, + (unsigned long long)block, rg_status, + block_type_string(rg_status), q, + block_type_string(q)); + if (q) /* Don't print redundant "free" */ + log_err( _("Metadata type is %u (%s)\n"), q, + block_type_string(q)); + + if (query(_("Fix bitmap for block %llu (0x%llx) ? 
(y/n) "), + (unsigned long long)block, + (unsigned long long)block)) { + lgfs2_rgrp_t rg = gfs2_blk2rgrpd(sdp, block); + if (gfs2_set_bitmap(rg, block, q)) + log_err( _("Repair failed.\n")); + else + log_err( _("Fixed.\n")); + } else + log_err( _("Bitmap at block %llu (0x%llx) left inconsistent\n"), + (unsigned long long)block, + (unsigned long long)block); + } + (*rg_block)++; + bit += GFS2_BIT_SIZE; + if (bit >= 8){ + bit = 0; + byte++; + } + } + + return 0; +} + +static void update_rgrp(struct gfs2_sbd *sdp, struct rgrp_tree *rgp, + struct gfs2_bmap *bl, uint32_t *count) +{ + uint32_t i; + struct gfs2_bitmap *bits; + uint64_t rg_block = 0; + int update = 0; + struct gfs_rgrp *gfs1rg = (struct gfs_rgrp *)&rgp->rg; + + for(i = 0; i < rgp->ri.ri_length; i++) { + bits = &rgp->bits[i]; + + /* update the bitmaps */ + if (check_block_status(sdp, bl, bits->bi_bh->b_data + bits->bi_offset, + bits->bi_len, &rg_block, rgp->ri.ri_data0, count)) + return; + if (skip_this_pass || fsck_abort) /* if asked to skip the rest */ + return; + } + + /* actually adjust counters and write out to disk */ + if (rgp->rg.rg_free != count[GFS2_BLKST_FREE]) { + log_err( _("RG #%llu (0x%llx) free count inconsistent: " + "is %u should be %u\n"), + (unsigned long long)rgp->ri.ri_addr, + (unsigned long long)rgp->ri.ri_addr, + rgp->rg.rg_free, count[GFS2_BLKST_FREE]); + rgp->rg.rg_free = count[GFS2_BLKST_FREE]; + update = 1; + } + if (rgp->rg.rg_dinodes != count[GFS2_BLKST_DINODE]) { + log_err( _("RG #%llu (0x%llx) Inode count inconsistent: is " + "%u should be %u\n"), + (unsigned long long)rgp->ri.ri_addr, + (unsigned long long)rgp->ri.ri_addr, + rgp->rg.rg_dinodes, count[GFS2_BLKST_DINODE]); + rgp->rg.rg_dinodes = count[GFS2_BLKST_DINODE]; + update = 1; + } + if (sdp->gfs1 && gfs1rg->rg_usedmeta != count[GFS1_BLKST_USEDMETA]) { + log_err( _("RG #%llu (0x%llx) Used metadata count " + "inconsistent: is %u should be %u\n"), + (unsigned long long)rgp->ri.ri_addr, + (unsigned long 
long)rgp->ri.ri_addr, + gfs1rg->rg_usedmeta, count[GFS1_BLKST_USEDMETA]); + gfs1rg->rg_usedmeta = count[GFS1_BLKST_USEDMETA]; + update = 1; + } + if (sdp->gfs1 && gfs1rg->rg_freemeta != count[GFS2_BLKST_UNLINKED]) { + log_err( _("RG #%llu (0x%llx) Free metadata count " + "inconsistent: is %u should be %u\n"), + (unsigned long long)rgp->ri.ri_addr, + (unsigned long long)rgp->ri.ri_addr, + gfs1rg->rg_freemeta, count[GFS2_BLKST_UNLINKED]); + gfs1rg->rg_freemeta = count[GFS2_BLKST_UNLINKED]; + update = 1; + } + if (!sdp->gfs1 && (rgp->ri.ri_data != count[GFS2_BLKST_FREE] + + count[GFS2_BLKST_USED] + + count[GFS2_BLKST_UNLINKED] + + count[GFS2_BLKST_DINODE])) { + /* FIXME not sure how to handle this case ATM - it + * means that the total number of blocks we've counted + * exceeds the blocks in the rg */ + log_err( _("Internal fsck error: %u != %u + %u + %u + %u\n"), + rgp->ri.ri_data, count[GFS2_BLKST_FREE], + count[GFS2_BLKST_USED], count[GFS2_BLKST_UNLINKED], + count[GFS2_BLKST_DINODE]); + exit(FSCK_ERROR); + } + if (update) { + if (query( _("Update resource group counts? 
(y/n) "))) { + log_warn( _("Resource group counts updated\n")); + /* write out the rgrp */ + if (sdp->gfs1) + gfs_rgrp_out(gfs1rg, rgp->bits[0].bi_bh); + else + gfs2_rgrp_out(&rgp->rg, rgp->bits[0].bi_bh->b_data); + } else + log_err( _("Resource group counts left inconsistent\n")); + } +} + +/** + * pass5 - check resource groups + * + * fix free block maps + * fix used inode maps + */ +int pass5(struct gfs2_sbd *sdp, struct gfs2_bmap *bl) +{ + struct osi_node *n, *next = NULL; + struct rgrp_tree *rgp = NULL; + uint32_t count[5]; /* we need 5 because of GFS1 usedmeta */ + uint64_t rg_count = 0; + + /* Reconcile RG bitmaps with fsck bitmap */ + for (n = osi_first(&sdp->rgtree); n; n = next) { + next = osi_next(n); + if (skip_this_pass || fsck_abort) /* if asked to skip the rest */ + return FSCK_OK; + log_info( _("Verifying Resource Group #%llu\n"), (unsigned long long)rg_count); + memset(count, 0, sizeof(count)); + rgp = (struct rgrp_tree *)n; + + rg_count++; + /* Compare the bitmaps and report the differences */ + update_rgrp(sdp, rgp, bl, count); + } + /* Fix up superblock info based on this - don't think there's + * anything to do here... 
*/ + + return FSCK_OK; +} diff --git a/gfs2/fsck/rgrepair.c b/gfs2/fsck/rgrepair.c new file mode 100644 index 0000000..2f14590 --- /dev/null +++ b/gfs2/fsck/rgrepair.c @@ -0,0 +1,1245 @@ +#include "clusterautoconfig.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#define _(String) gettext(String) + +#include +#include "libgfs2.h" +#include "osi_list.h" +#include "fsck.h" +#include "fs_recovery.h" + +int rindex_modified = FALSE; +struct special_blocks false_rgrps; + +#define BAD_RG_PERCENT_TOLERANCE 11 +#define AWAY_FROM_BITMAPS 0x1000 +#define MAX_RGSEGMENTS 20 + +#define ri_equal(ondisk, expected, field) (ondisk.field == expected.field) +/* ri_compare - if ondisk.field differs from expected.field: warn, copy expected.field into ondisk.field and set rindex_modified */ +#define ri_compare(rg, ondisk, expected, field, fmt, type) \ + if (ondisk.field != expected.field) { \ + log_warn( _("rindex #%d " #field " discrepancy: index 0x%" \ + fmt " != expected: 0x%" fmt "\n"), \ + rg + 1, (type)ondisk.field, (type)expected.field); \ + ondisk.field = expected.field; \ + rindex_modified = TRUE; \ + } + +/* + * find_journaled_rgs - find all RG blocks within all journals + * + * Since Resource Groups (RGs) are journaled, it is not uncommon for them + * to appear inside a journal. But if there is severe damage to the rindex + * file or some of the RGs, we may need to hunt and peck for RGs and in that + * case, we don't want to mistake these blocks that look just like a real RG + * for a real RG block. These are "fake" RGs that need to be ignored for + * the purposes of finding where things are. + * + * Every journal block for which gfs2_check_meta(bh, GFS2_METATYPE_RG) + * returns 0 is recorded in false_rgrps with gfs2_special_set(). + * + * NOTE: This function assumes that the jindex and journals have been read in, + * which isn't often the case. Normally the rindex needs to be read in + * first. If the rindex is damaged, that's not an option.
+ */ +static void find_journaled_rgs(struct gfs2_sbd *sdp) +{ + int j, new = 0; + unsigned int jblocks; + uint64_t b, dblock; + struct gfs2_inode *ip; + struct gfs2_buffer_head *bh; + int false_count; + + osi_list_init(&false_rgrps.list); + for (j = 0; j < sdp->md.journals; j++) { + ip = sdp->md.journal[j]; + log_debug(_("Checking for rgrps in journal%d which starts " + "at block 0x%llx.\n"), j, + (unsigned long long)ip->i_di.di_num.no_addr); + jblocks = ip->i_di.di_size / sdp->sd_sb.sb_bsize; + false_count = 0; + for (b = 0; b < jblocks; b++) { + block_map(ip, b, &new, &dblock, NULL, 0); + if (!dblock) /* dblock came back 0: stop scanning this journal */ + break; + bh = bread(sdp, dblock); + if (!gfs2_check_meta(bh, GFS2_METATYPE_RG)) { + /* False rgrp found at block dblock */ + false_count++; + gfs2_special_set(&false_rgrps, dblock); + } + brelse(bh); + } + log_debug("\n%d false positives identified.\n", false_count); + } +} +/* is_false_rg - return 1 if blockfind() locates block in false_rgrps, otherwise 0 */ +static int is_false_rg(uint64_t block) +{ + if (blockfind(&false_rgrps, block)) + return 1; + return 0; +} + +/* + * find_shortest_rgdist - hunt and peck for the shortest distance between RGs. + * + * Sample several of them because an RG that's been blasted may + * look like twice the distance. If we can find 6 of them, that + * should be enough to figure out the correct layout. + * This also figures out first_rg_dist since that's always different. + * + * This function was revised to return the number of segments, usually 2. + * The shortest distance is now returned in the highest entry in rg_dist + */ +static int find_shortest_rgdist(struct gfs2_sbd *sdp, uint64_t *dist_array, + int *dist_cnt) +{ + uint64_t blk, block_last_rg, shortest_dist_btwn_rgs; + struct gfs2_buffer_head *bh; + int rgs_sampled = 0; + struct gfs2_rindex buf, tmpndx; + uint64_t initial_first_rg_dist; + int gsegment = 0; + int is_rgrp; + + /* Figure out if there are any RG-looking blocks in the journal we + need to ignore.
*/ + find_journaled_rgs(sdp); + + initial_first_rg_dist = dist_array[0] = block_last_rg = + LGFS2_SB_ADDR(sdp) + 1; + shortest_dist_btwn_rgs = sdp->device.length; + + for (blk = LGFS2_SB_ADDR(sdp) + 1; blk < sdp->device.length; blk++) { + uint64_t dist; + + if (blk == LGFS2_SB_ADDR(sdp) + 1) + is_rgrp = 1; + else if (is_false_rg(blk)) + is_rgrp = 0; + else { + bh = bread(sdp, blk); + is_rgrp = (gfs2_check_meta(bh, GFS2_METATYPE_RG) == 0); + brelse(bh); + } + if (!is_rgrp) { + if (rgs_sampled >= 6) { + uint64_t nblk; + + log_info(_("rgrp not found at block 0x%llx. " + "Last found rgrp was 0x%llx. " + "Checking the next one.\n"), + (unsigned long long)blk, + (unsigned long long)block_last_rg); + /* check for just a damaged rgrp */ + nblk = blk + dist_array[gsegment]; + if (is_false_rg(nblk)) { + is_rgrp = 0; + } else { + bh = bread(sdp, nblk); + is_rgrp = (((gfs2_check_meta(bh, + GFS2_METATYPE_RG) == 0))); + brelse(bh); + } + if (is_rgrp) { + log_info(_("Next rgrp is intact, so " + "this one is damaged.\n")); + blk = nblk - 1; + dist_cnt[gsegment]++; + continue; + } + log_info(_("Looking for new segment.\n")); + blk -= 16; + rgs_sampled = 0; + shortest_dist_btwn_rgs = sdp->device.length; + /* That last one didn't pan out, so: */ + dist_cnt[gsegment]--; + gsegment++; + if (gsegment >= MAX_RGSEGMENTS) + break; + } + if ((blk - block_last_rg) > (524288 * 2)) { + log_info(_("No rgrps were found within 4GB " + "of the last rgrp. Must be the " + "end of the file system.\n")); + + break; + } + continue; + } + + dist_cnt[gsegment]++; + if (rgs_sampled >= 6) { + block_last_rg = blk; + blk += dist_array[gsegment] - 1; /* prev value in + array minus 1. 
*/ + continue; + } + log_info(_("segment %d: rgrp found at block 0x%llx\n"), + gsegment + 1, (unsigned long long)blk); + dist = blk - block_last_rg; + if (blk > LGFS2_SB_ADDR(sdp) + 1) { /* not the very first rgrp */ + + log_info("dist 0x%llx = 0x%llx - 0x%llx ", + (unsigned long long)dist, + (unsigned long long)blk, + (unsigned long long)block_last_rg); + /** + * We found an RG. Check to see if we need to set the + * first_rg_dist based on whether it is still at its + * initial value (i.e. the fs.) The first rg distance + * is different from the rest because of the + * superblock and 64K dead space. + **/ + if (dist_array[0] == initial_first_rg_dist) { + dist_array[0] = dist; + dist_cnt[0] = 1; + rgs_sampled = 0; + } + if (dist < shortest_dist_btwn_rgs) { + shortest_dist_btwn_rgs = dist; + log_info( _("(shortest so far)")); + } + log_info("\n"); + if (++rgs_sampled == 6) { + dist_array[gsegment] = shortest_dist_btwn_rgs; + log_info(_("Settled on distance 0x%llx for " + "segment %d\n"), + (unsigned long long) + dist_array[gsegment], gsegment + 1); + } + } else { + gsegment++; + if (gsegment >= MAX_RGSEGMENTS) + break; + } + block_last_rg = blk; + if (rgs_sampled < 6) + blk += 250; /* skip ahead for performance */ + else + blk += shortest_dist_btwn_rgs - 1; + } + if (gsegment >= MAX_RGSEGMENTS) { + log_err(_("Maximum number of rgrp grow segments reached.\n")); + log_err(_("This file system has more than %d resource " + "group segments.\n"), MAX_RGSEGMENTS); + } + /* -------------------------------------------------------------- */ + /* Sanity-check our first_rg_dist. If RG #2 got nuked, the */ + /* first_rg_dist would measure from #1 to #3, which would be bad. */ + /* We need to take remedial measures to fix it (from the index). */ + /* -------------------------------------------------------------- */ + if (*dist_array >= shortest_dist_btwn_rgs + + (shortest_dist_btwn_rgs / 4)) { + /* read in the second RG index entry for this subd. 
*/ + gfs2_readi(sdp->md.riinode, (char *)&buf, + sizeof(struct gfs2_rindex), + sizeof(struct gfs2_rindex)); + gfs2_rindex_in(&tmpndx, (char *)&buf); + if (tmpndx.ri_addr > LGFS2_SB_ADDR(sdp) + 1) { /* sanity check */ + log_warn( _("rgrp 2 is damaged: getting dist from index: ")); + *dist_array = tmpndx.ri_addr - (LGFS2_SB_ADDR(sdp) + 1); + log_warn("0x%llx\n", (unsigned long long)*dist_array); + } else { + log_warn( _("rgrp index 2 is damaged: extrapolating dist: ")); + *dist_array = sdp->device.length - (sdp->rgrps - 1) * + (sdp->device.length / sdp->rgrps); + log_warn("0x%llx\n", (unsigned long long)*dist_array); + } + log_debug( _("Adjusted first rgrp distance: 0x%llx\n"), + (unsigned long long)*dist_array); + } /* if first RG distance is within tolerance */ + + gfs2_special_free(&false_rgrps); + return gsegment; +} + +/* + * count_usedspace - count the used bits in a rgrp bitmap buffer + * + * Counts every bitmap entry in bh whose state is neither GFS2_BLKST_FREE + * nor GFS2_BLKST_UNLINKED. + * + * @first: non-zero: the bitmap starts after sizeof(struct gfs_rgrp) or + * sizeof(struct gfs2_rgrp) (chosen by sdp->gfs1); zero: it starts + * after sizeof(struct gfs2_meta_header). + * + * Returns: the number of entries counted (accumulated in a uint32_t). + */ +static uint64_t count_usedspace(struct gfs2_sbd *sdp, int first, + struct gfs2_buffer_head *bh) +{ + int off, x, y, bytes_to_check; + uint32_t rg_used = 0; + unsigned int state; + + /* Skip the header at the start of the block so that only bitmap bytes are scanned */ + if (first) { + if (sdp->gfs1) + off = sizeof(struct gfs_rgrp); + else + off = sizeof(struct gfs2_rgrp); + } else + off = sizeof(struct gfs2_meta_header); + bytes_to_check = sdp->bsize - off; + for (x = 0; x < bytes_to_check; x++) { + unsigned char *byte; + + byte = (unsigned char *)&bh->b_data[off + x]; + if (*byte == 0x55) { /* fast path: whole byte counted; presumably every entry is "used" - confirm against GFS2_BIT_SIZE */ + rg_used += GFS2_NBBY; + continue; + } + if (*byte == 0x00) /* fast path: nothing counted; presumably every entry is free - confirm */ + continue; + for (y = 0; y < GFS2_NBBY; y++) { + state = (*byte >> (GFS2_BIT_SIZE * y)) & GFS2_BIT_MASK; + if (state == GFS2_BLKST_FREE || + state == GFS2_BLKST_UNLINKED) + continue; + rg_used++; + } + } + return rg_used; +} + +/* + * find_next_rgrp_dist - find the distance to the next rgrp + * + * This function is only called if the rgrps are determined to be on uneven + * boundaries.
In a normal gfs2 file system, after mkfs.gfs2, all the + * rgrps but the first and second one will be the same distance from the + * previous rgrp. (The first rgrp will predictably be after the superblock + * and the second one will be adjusted based on the number 64KB skipped + * at the start of the file system.) The only way we can deviate from that + * pattern is if the user did gfs_grow on a gfs1 file system, then converted + * it to gfs2 using gfs2_convert. + * + * This function finds the distance to the next rgrp for these cases. + */ +static uint64_t find_next_rgrp_dist(struct gfs2_sbd *sdp, uint64_t blk, + struct rgrp_tree *prevrgd) +{ + struct osi_node *n, *next = NULL; + uint64_t rgrp_dist = 0, used_blocks, block, next_block, twogigs; + struct rgrp_tree *rgd = NULL, *next_rgd; + struct gfs2_buffer_head *bh; + struct gfs2_meta_header mh; + int first, length, b, found; + uint64_t mega_in_blocks; + uint32_t free_blocks; + + for (n = osi_first(&sdp->rgtree); n; n = next) { + next = osi_next(n); + rgd = (struct rgrp_tree *)n; + if (rgd->ri.ri_addr == blk) + break; + } + if (rgd && n && osi_next(n) && rgd->ri.ri_addr == blk) { + n = osi_next(n); + next_rgd = (struct rgrp_tree *)n; + rgrp_dist = next_rgd->ri.ri_addr - rgd->ri.ri_addr; + return rgrp_dist; + } + mega_in_blocks = (1024 * 1024) / sdp->bsize; + twogigs = (uint64_t)mega_in_blocks * 2048; + /* Unfortunately, if we fall through to here we can't trust the + rindex. So we have to analyze the current rgrp to figure out + the bare minimum block number where it ends. If we don't have + rindex, all we know about this rgrp is what's on disk: its + rg_free. If we analyze the rgrp's bitmap and the bitmaps that + follow, we can figure out how many bits are used. If we add + rg_free, we get the total number of blocks this rgrp + represents. After that should be the next rgrp, but it may + skip a few blocks (hopefully no more than 4). 
*/ + used_blocks = 0; + length = 0; + block = prevrgd->ri.ri_addr; + first = 1; + found = 0; + while (1) { + if (block >= sdp->device.length) + break; + if (block >= prevrgd->ri.ri_addr + twogigs) + break; + bh = bread(sdp, block); + gfs2_meta_header_in(&mh, bh->b_data); + if ((mh.mh_magic != GFS2_MAGIC) || + (first && mh.mh_type != GFS2_METATYPE_RG) || + (!first && mh.mh_type != GFS2_METATYPE_RB)) { + brelse(bh); + break; + } + if (first) { + struct gfs2_rgrp *rg; + + rg = (struct gfs2_rgrp *)bh->b_data; + free_blocks = be32_to_cpu(rg->rg_free); + } + used_blocks += count_usedspace(sdp, first, bh); + first = 0; + block++; + length++; + brelse(bh); + /* Check if this distance points to an rgrp: + We have to look for blocks that resemble rgrps and bitmaps. + If they do, we need to count blocks used and free and see + if adding that number of free blocks accounts for the + next rgrp we find. Otherwise, you could have a length of + 6 with additional user blocks that just happen to look like + bitmap blocks. Count them all as bitmaps and you'll be + hopelessly lost. */ + rgrp_dist = used_blocks + free_blocks + length; + next_block = prevrgd->ri.ri_addr + rgrp_dist; + /* Now we account for block rounding done by mkfs.gfs2 */ + for (b = 0; b <= length + GFS2_NBBY; b++) { + if (next_block >= sdp->device.length) + break; + bh = bread(sdp, next_block + b); + gfs2_meta_header_in(&mh, bh->b_data); + brelse(bh); + if (mh.mh_magic == GFS2_MAGIC) { + if (mh.mh_type == GFS2_METATYPE_RG) { + found = 1; + break; + } + /* if the first thing we find is a bitmap, + there must be a damaged rgrp on the + previous block. 
*/ + if (mh.mh_type == GFS2_METATYPE_RB) { + found = 1; + rgrp_dist--; + break; + } + } + rgrp_dist++; + } + if (found) { + log_info( _("rgrp found at 0x%llx, length=%d, " + "used=%llu, free=%d\n"), + prevrgd->ri.ri_addr, length, + (unsigned long long)used_blocks, + free_blocks); + break; + } + } + return rgrp_dist; +} + +/* + * hunt_and_peck - find the distance to the next rgrp + * + * This function is only called if the rgrps are determined to be on uneven + * boundaries, and also corrupt. So we have to go out searching for one. + */ +static uint64_t hunt_and_peck(struct gfs2_sbd *sdp, uint64_t blk, + struct rgrp_tree *prevrgd, uint64_t last_bump) +{ + uint64_t rgrp_dist = 0, block, twogigs, last_block, last_meg; + struct gfs2_buffer_head *bh; + struct gfs2_meta_header mh; + int b, mega_in_blocks; + + /* Skip ahead the previous amount: we might get lucky. + If we're close to the end of the device, take the rest. */ + if (gfs2_check_range(sdp, blk + last_bump)) + return sdp->fssize - blk; + + bh = bread(sdp, blk + last_bump); + gfs2_meta_header_in(&mh, bh->b_data); + brelse(bh); + if (mh.mh_magic == GFS2_MAGIC && mh.mh_type == GFS2_METATYPE_RG) { + log_info( _("rgrp found at 0x%llx, length=%lld\n"), + (unsigned long long)blk + last_bump, + (unsigned long long)last_bump); + return last_bump; + } + + rgrp_dist = AWAY_FROM_BITMAPS; /* Get away from any bitmaps + associated with the previous rgrp */ + block = prevrgd->ri.ri_addr + rgrp_dist; + /* Now we account for block rounding done by mkfs.gfs2. A rgrp can + be at most 2GB in size, so that's where we call it. We do somewhat + obscure math here to avoid integer overflows. */ + mega_in_blocks = (1024 * 1024) / sdp->bsize; + twogigs = 2048 * mega_in_blocks; + if (block + twogigs <= sdp->fssize) { + last_block = twogigs; + last_meg = 0; + } else { + /* There won't be a rgrp in the last megabyte. 
*/ + last_block = sdp->fssize - block - mega_in_blocks; + last_meg = mega_in_blocks; + } + for (b = AWAY_FROM_BITMAPS; b < last_block; b++) { + bh = bread(sdp, block + b); + gfs2_meta_header_in(&mh, bh->b_data); + brelse(bh); + if (mh.mh_magic == GFS2_MAGIC) { + if (mh.mh_type == GFS2_METATYPE_RG) + break; + /* if the first thing we find is a bitmap, there must + be a damaged rgrp on the previous block. */ + if (mh.mh_type == GFS2_METATYPE_RB) { + rgrp_dist--; + break; + } + } + rgrp_dist++; + } + return rgrp_dist + last_meg; +} + +/* + * gfs2_rindex_rebuild - rebuild a corrupt Resource Group (RG) index manually + * where trust_lvl == distrust + * + * If this routine is called, it means we have RGs in odd/unexpected places, + * and there is a corrupt RG or RG index entry. It also means we can't trust + * the RG index to be sane, and the RGs don't agree with how mkfs would have + * built them by default. So we have no choice but to go through and count + * them by hand. We've tried twice to recover the RGs and RG index, and + * failed, so this is our last chance to remedy the situation. + * + * This routine tries to minimize performance impact by: + * 1. Skipping through the filesystem at known increments when possible. + * 2. Shuffle through every block when RGs are not found at the predicted + * locations. + * + * Note: A GFS2 filesystem differs from a GFS1 file system in that there will + * only be ONE chunk (i.e. no artificial subdevices on either size of the + * journals). The journals and even the rindex are kept as part of the file + * system, so we need to rebuild that information by hand. Also, with GFS1, + * the different chunks ("subdevices") could have different RG sizes, which + * made for quite a mess when trying to recover RGs. GFS2 always uses the + * same RG size determined by the original mkfs, so recovery is easier. + * + * If "gfs_grow" is specified the file system was most likely converted + * from gfs1 to gfs2 after a gfs_grow operation. 
In that case, the rgrps + * will not be on predictable boundaries. + */ +static int gfs2_rindex_rebuild(struct gfs2_sbd *sdp, int *num_rgs, + int gfs_grow) +{ + struct osi_node *n, *next = NULL; + struct gfs2_buffer_head *bh; + uint64_t rg_dist[MAX_RGSEGMENTS] = {0, }; + int rg_dcnt[MAX_RGSEGMENTS] = {0, }; + uint64_t blk; + uint64_t fwd_block, block_bump; + struct rgrp_tree *calc_rgd, *prev_rgd; + int number_of_rgs, rgi, segment_rgs; + int rg_was_fnd = FALSE, corrupt_rgs = 0; + int error = -1, j, i; + int grow_segments, segment = 0; + + /* + * In order to continue, we need to initialize the jindex. We need + * the journals in order to correctly eliminate false positives during + * rgrp repair. IOW, we need to properly ignore rgrps that appear in + * the journals, and we can only do that if we have the journals. + * To make matters worse, journals may span several (small) rgrps, + * so we can't go by the rgrps. + */ + if (init_jindex(sdp, 0) != 0) { + log_crit(_("Error: Can't read jindex required for rindex " + "repairs.\n")); + return -1; + } + + sdp->rgcalc.osi_node = NULL; + grow_segments = find_shortest_rgdist(sdp, &rg_dist[0], &rg_dcnt[0]); + for (i = 0; i < grow_segments; i++) + log_info(_("Segment %d: rgrp distance: 0x%llx, count: %d\n"), + i + 1, (unsigned long long)rg_dist[i], rg_dcnt[i]); + number_of_rgs = segment_rgs = 0; + /* -------------------------------------------------------------- */ + /* Now go through the RGs and verify their integrity, fixing as */ + /* needed when corruption is encountered. */ + /* -------------------------------------------------------------- */ + prev_rgd = NULL; + block_bump = rg_dist[0]; + blk = LGFS2_SB_ADDR(sdp) + 1; + while (blk <= sdp->device.length) { + log_debug( _("Block 0x%llx\n"), (unsigned long long)blk); + bh = bread(sdp, blk); + rg_was_fnd = (!gfs2_check_meta(bh, GFS2_METATYPE_RG)); + brelse(bh); + /* Allocate a new RG and index. 
*/ + calc_rgd = rgrp_insert(&sdp->rgcalc, blk); + if (!calc_rgd) { + log_crit( _("Can't allocate memory for rgrp repair.\n")); + goto out; + } + calc_rgd->ri.ri_length = 1; + if (!rg_was_fnd) { /* if not an RG */ + /* ------------------------------------------------- */ + /* This SHOULD be an RG but isn't. */ + /* ------------------------------------------------- */ + corrupt_rgs++; + if (corrupt_rgs < 5) + log_debug(_("Missing or damaged rgrp at block " + "%llu (0x%llx)\n"), + (unsigned long long)blk, + (unsigned long long)blk); + else { + log_crit( _("Error: too many missing or " + "damaged rgrps using this method. " + "Time to try another method.\n")); + goto out; + } + } + /* ------------------------------------------------ */ + /* Now go through and count the bitmaps for this RG */ + /* ------------------------------------------------ */ + for (fwd_block = blk + 1; fwd_block < sdp->device.length; fwd_block++) { + int bitmap_was_fnd; + bh = bread(sdp, fwd_block); + bitmap_was_fnd = !gfs2_check_meta(bh, GFS2_METATYPE_RB); + brelse(bh); + if (bitmap_was_fnd) /* if a bitmap */ + calc_rgd->ri.ri_length++; + else + break; /* end of bitmap, so call it quits. 
*/ + } /* for subsequent bitmaps */ + + gfs2_compute_bitstructs(sdp->sd_sb.sb_bsize, calc_rgd); + calc_rgd->ri.ri_data0 = calc_rgd->ri.ri_addr + + calc_rgd->ri.ri_length; + if (prev_rgd) { + uint32_t rgblocks; + + prev_rgd->ri.ri_length = rgblocks2bitblocks(sdp->bsize, block_bump, &rgblocks); + prev_rgd->ri.ri_data = rgblocks; + prev_rgd->ri.ri_data0 = prev_rgd->ri.ri_addr + + prev_rgd->ri.ri_length; + prev_rgd->ri.ri_data -= prev_rgd->ri.ri_data % + GFS2_NBBY; + prev_rgd->ri.ri_bitbytes = prev_rgd->ri.ri_data / + GFS2_NBBY; + log_debug( _("Prev ri_data set to: %lx.\n"), + (unsigned long)prev_rgd->ri.ri_data); + } + number_of_rgs++; + segment_rgs++; + if (rg_was_fnd) + log_info( _(" rgrp %d at block 0x%llx intact\n"), + number_of_rgs, (unsigned long long)blk); + else + log_warn( _("* rgrp %d at block 0x%llx *** DAMAGED ***\n"), + number_of_rgs, (unsigned long long)blk); + prev_rgd = calc_rgd; + /* + * Figure out where our next rgrp should be. + */ + if ((blk == LGFS2_SB_ADDR(sdp) + 1) || (!gfs_grow)) { + block_bump = rg_dist[segment]; + if (segment_rgs >= rg_dcnt[segment]) { + log_debug(_("End of segment %d\n"), ++segment); + segment_rgs = 0; + if (segment >= grow_segments) { + log_debug(_("Last segment.\n")); + break; + } + } + /* if we have uniformly-spaced rgrps, there may be + some wasted space at the end of the device. + Since we don't want to create a short rgrp and + break our uniformity, just quit here. 
*/ + if (blk + (2 * block_bump) > sdp->device.length) + break; + } else if (rg_was_fnd) + block_bump = find_next_rgrp_dist(sdp, blk, prev_rgd); + else + block_bump = hunt_and_peck(sdp, blk, prev_rgd, + block_bump); + if (block_bump != 1) { + if (rg_was_fnd) + log_info( _(" [length 0x%llx]\n"), + (unsigned long long)block_bump); + else + log_warn( _(" [length 0x%llx]\n"), + (unsigned long long)block_bump); + } else { + log_warn("\n"); + } + blk += block_bump; + } /* for each rg block */ + /* ----------------------------------------------------------------- */ + /* If we got to the end of the fs, we still need to fix the */ + /* allocation information for the very last RG. */ + /* ----------------------------------------------------------------- */ + if (prev_rgd && !prev_rgd->ri.ri_data) { + uint32_t rgblocks; + + prev_rgd->ri.ri_length = rgblocks2bitblocks(sdp->bsize, block_bump, &rgblocks); + prev_rgd->ri.ri_data0 = prev_rgd->ri.ri_addr + + prev_rgd->ri.ri_length; + prev_rgd->ri.ri_data = rgblocks; + prev_rgd->ri.ri_data -= prev_rgd->ri.ri_data % GFS2_NBBY; + prev_rgd->ri.ri_bitbytes = prev_rgd->ri.ri_data / GFS2_NBBY; + log_debug( _("Prev ri_data set to: %lx.\n"), + (unsigned long)prev_rgd->ri.ri_data); + prev_rgd = NULL; /* make sure we don't use it later */ + } + /* ---------------------------------------------- */ + /* Now dump out the information (if verbose mode) */ + /* ---------------------------------------------- */ + log_debug( _("rindex rebuilt as follows:\n")); + for (n = osi_first(&sdp->rgcalc), rgi = 0; n; n = next, rgi++) { + next = osi_next(n); + calc_rgd = (struct rgrp_tree *)n; + log_debug("%d: 0x%llx / %x / 0x%llx" + " / 0x%x / 0x%x\n", rgi + 1, + (unsigned long long)calc_rgd->ri.ri_addr, + calc_rgd->ri.ri_length, + calc_rgd->ri.ri_data0, calc_rgd->ri.ri_data, + calc_rgd->ri.ri_bitbytes); + } + *num_rgs = number_of_rgs; + error = 0; +out: + for (j = 0; j < sdp->md.journals; j++) + inode_put(&sdp->md.journal[j]); + free(sdp->md.journal); + 
return error; +} + +#define DIV_RU(x, y) (((x) + (y) - 1) / (y)) + +/** + * how_many_rgrps - figure out how many RG to put in a subdevice + * @w: the command line + * @dev: the device + * + * Returns: the number of RGs + */ +static uint64_t how_many_rgrps(struct gfs2_sbd *sdp, struct device *dev, int rgsize_specified) +{ + uint64_t nrgrp; + uint32_t rgblocks1, rgblocksn, bitblocks1, bitblocksn; + int bitmap_overflow = 0; + + while (1) { + nrgrp = DIV_RU(dev->length, (sdp->rgsize << 20) / sdp->bsize); + + /* check to see if the rg length overflows max # bitblks */ + bitblocksn = rgblocks2bitblocks(sdp->bsize, dev->length / nrgrp, &rgblocksn); + /* calculate size of the first rgrp */ + bitblocks1 = rgblocks2bitblocks(sdp->bsize, dev->length - (nrgrp - 1) * (dev->length / nrgrp), + &rgblocks1); + if (bitblocks1 > 2149 || bitblocksn > 2149) { + bitmap_overflow = 1; + if (sdp->rgsize <= GFS2_DEFAULT_RGSIZE) { + fprintf(stderr, "error: It is not possible " + "to use the entire device with " + "block size %u bytes.\n", + sdp->bsize); + exit(-1); + } + sdp->rgsize -= GFS2_DEFAULT_RGSIZE; /* smaller rgs */ + continue; + } + if (bitmap_overflow || + rgsize_specified || /* If user specified an rg size or */ + nrgrp <= GFS2_EXCESSIVE_RGS || /* not an excessive # or */ + sdp->rgsize >= 2048) /* we reached the max rg size */ + break; + + sdp->rgsize += GFS2_DEFAULT_RGSIZE; /* bigger rgs */ + } + + log_debug(" rg sz = %"PRIu32"\n nrgrp = %"PRIu64"\n", sdp->rgsize, + nrgrp); + + return nrgrp; +} + +/** + * compute_rgrp_layout - figure out where the RG in a FS are + */ +static void compute_rgrp_layout(struct gfs2_sbd *sdp, struct osi_root *rgtree, int rgsize_specified) +{ + struct device *dev; + struct rgrp_tree *rl, *rlast = NULL; + struct osi_node *n, *next = NULL; + unsigned int rgrp = 0, nrgrp, rglength; + uint64_t rgaddr; + + sdp->new_rgrps = 0; + dev = &sdp->device; + + /* If this is a new file system, compute the length and number */ + /* of rgs based on the size of the 
device. */ + /* If we have existing RGs (i.e. gfs2_grow) find the last one. */ + if (!rgtree->osi_node) { + dev->length -= LGFS2_SB_ADDR(sdp) + 1; + nrgrp = how_many_rgrps(sdp, dev, rgsize_specified); + rglength = dev->length / nrgrp; + sdp->new_rgrps = nrgrp; + } else { + uint64_t old_length, new_chunk; + + printf("Existing resource groups:\n"); + for (rgrp = 0, n = osi_first(rgtree); n; n = next, rgrp++) { + next = osi_next(n); + rl = (struct rgrp_tree *)n; + + printf("%d: start: %" PRIu64 " (0x%" + PRIx64 "), length = %"PRIu64" (0x%" + PRIx64 ")\n", rgrp + 1, rl->start, rl->start, + rl->length, rl->length); + rlast = rl; + } + rlast->start = rlast->ri.ri_addr; + rglength = rgrp_size(rlast); + rlast->length = rglength; + old_length = rlast->ri.ri_addr + rglength; + new_chunk = dev->length - old_length; + sdp->new_rgrps = new_chunk / rglength; + nrgrp = rgrp + sdp->new_rgrps; + } + + if (rgrp < nrgrp) + printf("\nNew resource groups:\n"); + for (; rgrp < nrgrp; rgrp++) { + if (rgrp) { + rgaddr = rlast->start + rlast->length; + rl = rgrp_insert(rgtree, rgaddr); + rl->length = rglength; + } else { + rgaddr = LGFS2_SB_ADDR(sdp) + 1; + rl = rgrp_insert(rgtree, rgaddr); + rl->length = dev->length - + (nrgrp - 1) * (dev->length / nrgrp); + } + rl->start = rgaddr; + /* printf("%d: start: %" PRIu64 " (0x%" + PRIx64 "), length = %"PRIu64" (0x%" + PRIx64 ")\n", rgrp + 1, rl->start, rl->start, + rl->length, rl->length);*/ + rlast = rl; + } + + sdp->rgrps = nrgrp; +} + +/* + * gfs2_rindex_calculate - calculate what the rindex should look like + * in a perfect world (trust_lvl == open_minded) + * + * Calculate what the rindex should look like, + * so we can later check if all RG index entries are sane. + * This is a lot easier for gfs2 because we can just call the same libgfs2 + * functions used by mkfs. + * + * Returns: 0 on success, -1 on failure + * Sets: sdp->rglist to a linked list of fsck_rgrp structs representing + * what we think the rindex should really look like. 
+ */
+static int gfs2_rindex_calculate(struct gfs2_sbd *sdp, int *num_rgs)
+{
+	uint64_t layout_rgs = 0;
+
+	/*
+	 * The size of the rindex file tells us how many entries the index
+	 * claims to hold.  Our trust level is open-minded here: if that size
+	 * is not a multiple of the rindex structure size, something is wrong
+	 * and the index cannot be trusted.
+	 */
+	*num_rgs = sdp->md.riinode->i_di.di_size / sizeof(struct gfs2_rindex);
+
+	sdp->rgcalc.osi_node = NULL;
+	fix_device_geometry(sdp);
+
+	/*
+	 * Try rgrp sizes starting at GFS2_DEFAULT_RGSIZE and halving each
+	 * time, down to 32, until one produces the rgrp count recorded above.
+	 * NOTE(review): the original comment here listed the sizes as
+	 * "2048, 1024, 512, 256, 128, 64, 32" -- confirm whether starting at
+	 * GFS2_DEFAULT_RGSIZE covers that whole range.
+	 */
+	sdp->rgsize = GFS2_DEFAULT_RGSIZE;
+	while (sdp->rgsize >= 32) {
+		layout_rgs = how_many_rgrps(sdp, &sdp->device, TRUE);
+		if (layout_rgs == *num_rgs) {
+			log_info(_("rgsize must be: %lld (0x%llx)\n"),
+				 (unsigned long long)sdp->rgsize,
+				 (unsigned long long)sdp->rgsize);
+			break;
+		}
+		sdp->rgsize /= 2;
+	}
+
+	/* Lay the rgrps out the way mkfs would have by default. */
+	compute_rgrp_layout(sdp, &sdp->rgcalc, TRUE);
+	/* FALSE: calculate only, write nothing to disk. */
+	if (build_rgrps(sdp, FALSE)) {
+		fprintf(stderr, _("Failed to build resource groups\n"));
+		exit(-1);
+	}
+	log_debug( _("fs_total_size = 0x%llx blocks.\n"),
+		  (unsigned long long)sdp->device.length);
+	log_warn( _("L3: number of rgs in the index = %d.\n"), *num_rgs);
+	return 0;
+}
+
+/*
+ * rewrite_rg_block - rewrite ("fix") a buffer with rg or bitmap data
+ *
+ * The block at errblock is given a fresh rgrp header if it is the first
+ * block of the rgrp, or a fresh bitmap meta header otherwise.  The user is
+ * asked for confirmation first.
+ *
+ * returns: 0 if the rg was repaired, otherwise 1
+ */
+static int rewrite_rg_block(struct gfs2_sbd *sdp, struct rgrp_tree *rg,
+			    uint64_t errblock)
+{
+	/* Position of the bad block within the rgrp; 0 is the rgrp header. */
+	int idx = errblock - rg->ri.ri_addr;
+	const char *typedesc;
+
+	if (idx)
+		typedesc = "GFS2_METATYPE_RB";
+	else
+		typedesc = "GFS2_METATYPE_RG";
+
+	log_err( _("Block #%lld (0x%llx) (%d of %d) is not %s.\n"),
+		(unsigned long long)rg->ri.ri_addr + idx,
+		(unsigned long long)rg->ri.ri_addr + idx,
+		(int)idx+1, (int)rg->ri.ri_length, typedesc);
+
+	/* Nothing is touched unless the user agrees. */
+	if (!query( _("Fix the Resource Group? (y/n)")))
+		return 1;
+
+	log_err( _("Attempting to repair the rgrp.\n"));
+	rg->bits[idx].bi_bh = bread(sdp, rg->ri.ri_addr + idx);
+	if (!idx) {
+		/* Header block: start from a zeroed rgrp of the right
+		   on-disk flavour and mark every data block free. */
+		memset(&rg->rg, 0, sdp->gfs1 ? sizeof(struct gfs_rgrp) :
+					       sizeof(struct gfs2_rgrp));
+		rg->rg.rg_header.mh_magic = GFS2_MAGIC;
+		rg->rg.rg_header.mh_type = GFS2_METATYPE_RG;
+		rg->rg.rg_header.mh_format = GFS2_FORMAT_RG;
+		rg->rg.rg_free = rg->ri.ri_data;
+		if (sdp->gfs1)
+			gfs_rgrp_out((struct gfs_rgrp *)&rg->rg, rg->bits[idx].bi_bh);
+		else
+			gfs2_rgrp_out(&rg->rg, rg->bits[idx].bi_bh->b_data);
+	} else {
+		/* Bitmap block: only the meta header is rewritten. */
+		struct gfs2_meta_header mh;
+
+		mh.mh_magic = GFS2_MAGIC;
+		mh.mh_type = GFS2_METATYPE_RB;
+		mh.mh_format = GFS2_FORMAT_RB;
+		gfs2_meta_header_out(&mh, rg->bits[idx].bi_bh->b_data);
+	}
+	bmodified(rg->bits[idx].bi_bh);
+	brelse(rg->bits[idx].bi_bh);
+	rg->bits[idx].bi_bh = NULL;
+	return 0;
+}
+
+/*
+ * expect_rindex_sanity - the rindex file seems trustworthy, so use those
+ *                        values as our expected values and assume the
+ *                        damage is only to the rgrps themselves.
+ */ +static int expect_rindex_sanity(struct gfs2_sbd *sdp, int *num_rgs) +{ + struct osi_node *n, *next = NULL; + struct rgrp_tree *rgd, *exp; + + *num_rgs = sdp->md.riinode->i_di.di_size / sizeof(struct gfs2_rindex) ; + for (n = osi_first(&sdp->rgtree); n; n = next) { + next = osi_next(n); + rgd = (struct rgrp_tree *)n; + exp = rgrp_insert(&sdp->rgcalc, rgd->ri.ri_addr); + if (exp == NULL) { + fprintf(stderr, "Out of memory in %s\n", __FUNCTION__); + exit(-1); + } + exp->start = rgd->start; + exp->length = rgd->length; + memcpy(&exp->ri, &rgd->ri, sizeof(exp->ri)); + memcpy(&exp->rg, &rgd->rg, sizeof(exp->rg)); + exp->bits = NULL; + gfs2_compute_bitstructs(sdp->sd_sb.sb_bsize, exp); + } + sdp->rgrps = *num_rgs; + return 0; +} + +/* + * rg_repair - try to repair a damaged rg index (rindex) + * trust_lvl - This is how much we trust the rindex file. + * blind_faith means we take the rindex at face value. + * open_minded means it might be okay, but we should verify it. + * distrust means it's not to be trusted, so we should go to + * greater lengths to build it from scratch. + * indignation means we have corruption, but the file system + * was converted from GFS via gfs2_convert, and its rgrps are + * not on nice boundaries thanks to previous gfs_grow ops. Lovely. + */ +int rg_repair(struct gfs2_sbd *sdp, int trust_lvl, int *rg_count, int *sane) +{ + struct osi_node *n, *next = NULL, *e, *enext; + int error, discrepancies, percent; + int calc_rg_count = 0, rg; + struct gfs2_rindex buf; + + if (trust_lvl == blind_faith) + return 0; + if (trust_lvl == ye_of_little_faith) { /* if rindex seems sane */ + /* Don't free previous incarnations in memory, if any. 
+ * We need them to copy in the next function: + * gfs2_rgrp_free(&sdp->rglist); */ + if (!(*sane)) { + log_err(_("The rindex file does not meet our " + "expectations.\n")); + return -1; + } + error = expect_rindex_sanity(sdp, &calc_rg_count); + if (error) { + gfs2_rgrp_free(&sdp->rgcalc); + return error; + } + } else if (trust_lvl == open_minded) { /* If we can't trust RG index */ + /* Free previous incarnations in memory, if any. */ + gfs2_rgrp_free(&sdp->rgtree); + + /* Calculate our own RG index for comparison */ + error = gfs2_rindex_calculate(sdp, &calc_rg_count); + if (error) { /* If calculated RGs don't match the fs */ + gfs2_rgrp_free(&sdp->rgcalc); + return -1; + } + } else if (trust_lvl == distrust) { /* If we can't trust RG index */ + /* Free previous incarnations in memory, if any. */ + gfs2_rgrp_free(&sdp->rgtree); + + error = gfs2_rindex_rebuild(sdp, &calc_rg_count, 0); + if (error) { + log_crit( _("Error rebuilding rgrp list.\n")); + gfs2_rgrp_free(&sdp->rgcalc); + return -1; + } + } else if (trust_lvl == indignation) { /* If we can't trust anything */ + /* Free previous incarnations in memory, if any. */ + gfs2_rgrp_free(&sdp->rgtree); + + error = gfs2_rindex_rebuild(sdp, &calc_rg_count, 1); + if (error) { + log_crit( _("Error rebuilding rgrp list.\n")); + gfs2_rgrp_free(&sdp->rgcalc); + return -1; + } + } + /* Read in the rindex */ + sdp->rgtree.osi_node = NULL; /* Just to be safe */ + rindex_read(sdp, 0, &sdp->rgrps, sane); + if (sdp->md.riinode->i_di.di_size % sizeof(struct gfs2_rindex)) { + log_warn( _("WARNING: rindex file has an invalid size.\n")); + if (!query( _("Truncate the rindex size? 
(y/n)"))) { + log_err(_("The rindex was not repaired.\n")); + gfs2_rgrp_free(&sdp->rgcalc); + gfs2_rgrp_free(&sdp->rgtree); + return -1; + } + sdp->md.riinode->i_di.di_size /= sizeof(struct gfs2_rindex); + sdp->md.riinode->i_di.di_size *= sizeof(struct gfs2_rindex); + bmodified(sdp->md.riinode->i_bh); + log_err(_("Changing rindex size to %lld.\n"), + (unsigned long long)sdp->md.riinode->i_di.di_size); + } + log_warn( _("L%d: number of rgs expected = %lld.\n"), trust_lvl + 1, + (unsigned long long)sdp->rgrps); + if (calc_rg_count != sdp->rgrps) { + int most_that_fit; + + log_warn( _("L%d: They don't match; either (1) the fs was " + "extended, (2) an odd\n"), trust_lvl + 1); + log_warn( _("L%d: rgrp size was used, or (3) we have a corrupt " + "rg index.\n"), trust_lvl + 1); + /* If the trust level is open_minded, we would have calculated + the rindex based on the device size. If it's not the same + number, don't trust it. Complain about the discrepancy, + then try again with a little more distrust. */ + if ((trust_lvl < distrust) || + !query( _("Attempt to use what rgrps we can? (y/n)"))) { + gfs2_rgrp_free(&sdp->rgcalc); + gfs2_rgrp_free(&sdp->rgtree); + log_err(_("The rindex was not repaired.\n")); + return -1; + } + /* We cannot grow rindex at this point. Since pass1 has not + yet run, we can't allocate blocks. Therefore we must use + whatever will fix in the space given. */ + most_that_fit = sdp->md.riinode->i_di.di_size / + sizeof(struct gfs2_rindex); + log_debug(_("The most we can fit is %d rgrps\n"), + most_that_fit); + if (most_that_fit < calc_rg_count) + calc_rg_count = most_that_fit; + log_err(_("Attempting to fix rindex with %d rgrps.\n"), + calc_rg_count); + } + /* ------------------------------------------------------------- */ + /* Now compare the rindex to what we think it should be. */ + /* See how far off our expected values are. If too much, abort. 
*/ + /* The theory is: if we calculated the index to have 32 RGs and */ + /* we have a large number that are completely wrong, we should */ + /* abandon this method of recovery and try a better one. */ + /* ------------------------------------------------------------- */ + discrepancies = 0; + for (rg = 0, n = osi_first(&sdp->rgtree), e = osi_first(&sdp->rgcalc); + n && e && !fsck_abort && rg < calc_rg_count; rg++) { + struct rgrp_tree *expected, *actual; + + next = osi_next(n); + enext = osi_next(e); + + expected = (struct rgrp_tree *)e; + actual = (struct rgrp_tree *)n; + if (actual->ri.ri_addr < expected->ri.ri_addr) { + n = next; + discrepancies++; + log_info(_("%d addr: 0x%llx < 0x%llx * mismatch\n"), + rg + 1, actual->ri.ri_addr, + expected->ri.ri_addr); + continue; + } else if (expected->ri.ri_addr < actual->ri.ri_addr) { + e = enext; + discrepancies++; + log_info(_("%d addr: 0x%llx > 0x%llx * mismatch\n"), + rg + 1, actual->ri.ri_addr, + expected->ri.ri_addr); + continue; + } + if (!ri_equal(actual->ri, expected->ri, ri_length) || + !ri_equal(actual->ri, expected->ri, ri_data0) || + !ri_equal(actual->ri, expected->ri, ri_data) || + !ri_equal(actual->ri, expected->ri, ri_bitbytes)) { + discrepancies++; + log_info(_("%d addr: 0x%llx 0x%llx * has mismatch\n"), + rg + 1, actual->ri.ri_addr, + expected->ri.ri_addr); + } + n = next; + e = enext; + } + if (rg) { + /* Check to see if more than 2% of the rgrps are wrong. */ + percent = (discrepancies * 100) / rg; + if (percent > BAD_RG_PERCENT_TOLERANCE) { + log_warn( _("Level %d didn't work. 
Too many " + "discrepancies.\n"), trust_lvl + 1); + log_warn( _("%d out of %d rgrps (%d percent) did not " + "match what was expected.\n"), + discrepancies, rg, percent); + gfs2_rgrp_free(&sdp->rgcalc); + gfs2_rgrp_free(&sdp->rgtree); + return -1; + } + } + log_debug("Calculated %d rgrps: Total: %d Match: %d Mismatch: %d\n", + calc_rg_count, rg, rg - discrepancies, discrepancies); + /* ------------------------------------------------------------- */ + /* Now compare the rindex to what we think it should be. */ + /* Our rindex should be pretty predictable unless we've grown */ + /* so look for index problems first before looking at the rgs. */ + /* ------------------------------------------------------------- */ + for (rg = 0, n = osi_first(&sdp->rgtree), e = osi_first(&sdp->rgcalc); + e && !fsck_abort && rg < calc_rg_count; rg++) { + struct rgrp_tree *expected, *actual; + + if (n) + next = osi_next(n); + enext = osi_next(e); + expected = (struct rgrp_tree *)e; + actual = (struct rgrp_tree *)n; + + /* If the next "actual" rgrp in memory is too far away, + fill in a new one with the expected value. -or- + If we ran out of actual rindex entries due to rindex + damage, fill in a new one with the expected values. 
*/ + if (!n || /* end of actual rindex */ + expected->ri.ri_addr < actual->ri.ri_addr) { + log_err( _("Entry missing from rindex: 0x%llx\n"), + (unsigned long long)expected->ri.ri_addr); + actual = rgrp_insert(&sdp->rgtree, + expected->ri.ri_addr); + if (!actual) { + log_err(_("Out of memory!\n")); + break; + } + rindex_modified = 1; + next = n; /* Ensure that the old actual gets checked + against a new expected, since we added */ + } else { + ri_compare(rg, actual->ri, expected->ri, ri_addr, + "llx", unsigned long long); + ri_compare(rg, actual->ri, expected->ri, ri_length, + "lx", unsigned long); + ri_compare(rg, actual->ri, expected->ri, ri_data0, + "llx", unsigned long long); + ri_compare(rg, actual->ri, expected->ri, ri_data, + "lx", unsigned long); + ri_compare(rg, actual->ri, expected->ri, ri_bitbytes, + "lx", unsigned long); + } + /* If we modified the index, write it back to disk. */ + if (rindex_modified) { + if (query( _("Fix the index? (y/n)"))) { + gfs2_rindex_out(&expected->ri, (char *)&buf); + gfs2_writei(sdp->md.riinode, (char *)&buf, + rg * sizeof(struct gfs2_rindex), + sizeof(struct gfs2_rindex)); + actual->ri.ri_addr = expected->ri.ri_addr; + actual->ri.ri_length = expected->ri.ri_length; + actual->ri.ri_data0 = expected->ri.ri_data0; + actual->ri.ri_data = expected->ri.ri_data; + actual->ri.ri_bitbytes = + expected->ri.ri_bitbytes; + /* If our rindex was hosed, ri_length is bad */ + /* Therefore, gfs2_compute_bitstructs might */ + /* have malloced the wrong length for bitmap */ + /* buffers. So we have to redo it. */ + if (actual->bits) { + free(actual->bits); + actual->bits = NULL; + } + } + else + log_err( _("rindex not fixed.\n")); + gfs2_compute_bitstructs(sdp->sd_sb.sb_bsize, actual); + rindex_modified = FALSE; + } + e = enext; + if (n) + n = next; + } + /* ------------------------------------------------------------- */ + /* Read the real RGs and check their integrity. 
*/ + /* Now we can somewhat trust the rindex and the RG addresses, */ + /* so let's read them in, check them and optionally fix them. */ + /* ------------------------------------------------------------- */ + for (rg = 0, n = osi_first(&sdp->rgtree); n && !fsck_abort && + rg < calc_rg_count; n = next, rg++) { + struct rgrp_tree *rgd; + uint64_t prev_err = 0, errblock; + int i; + + next = osi_next(n); + /* Now we try repeatedly to read in the rg. For every block */ + /* we encounter that has errors, repair it and try again. */ + i = 0; + do { + rgd = (struct rgrp_tree *)n; + errblock = gfs2_rgrp_read(sdp, rgd); + if (errblock) { + if (errblock == prev_err) + break; + prev_err = errblock; + rewrite_rg_block(sdp, rgd, errblock); + } else { + gfs2_rgrp_relse(rgd); + break; + } + i++; + } while (i < rgd->ri.ri_length); + } + *rg_count = rg; + gfs2_rgrp_free(&sdp->rgcalc); + gfs2_rgrp_free(&sdp->rgtree); + /* We shouldn't need to worry about getting the user's permission to + make changes here. If b_modified is true, they already gave their + permission. 
*/ + if (sdp->md.riinode->i_bh->b_modified) { + log_debug("Syncing rindex inode changes to disk.\n"); + gfs2_dinode_out(&sdp->md.riinode->i_di, sdp->md.riinode->i_bh->b_data); + bwrite(sdp->md.riinode->i_bh); + } + return 0; +} diff --git a/gfs2/fsck/util.c b/gfs2/fsck/util.c new file mode 100644 index 0000000..1c3ed9c --- /dev/null +++ b/gfs2/fsck/util.c @@ -0,0 +1,694 @@ +#include "clusterautoconfig.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#define _(String) gettext(String) + +#include +#include "libgfs2.h" +#include "metawalk.h" +#include "util.h" + +const char *reftypes[ref_types + 1] = {"data", "metadata", + "an extended attribute", "an inode", + "unimportant"}; + +void big_file_comfort(struct gfs2_inode *ip, uint64_t blks_checked) +{ + static struct timeval tv; + static uint32_t seconds = 0; + static uint64_t percent, fsize, chksize; + uint64_t one_percent = 0; + int i, cs; + const char *human_abbrev = " KMGTPE"; + + one_percent = ip->i_di.di_blocks / 100; + if (blks_checked - last_reported_fblock < one_percent) + return; + + last_reported_fblock = blks_checked; + gettimeofday(&tv, NULL); + if (!seconds) + seconds = tv.tv_sec; + if (tv.tv_sec == seconds) + return; + + fsize = ip->i_di.di_size; + for (i = 0; i < 6 && fsize > 1024; i++) + fsize /= 1024; + chksize = blks_checked * ip->i_sbd->bsize; + for (cs = 0; cs < 6 && chksize > 1024; cs++) + chksize /= 1024; + seconds = tv.tv_sec; + percent = (blks_checked * 100) / ip->i_di.di_blocks; + log_notice( _("\rChecking %lld%c of %lld%c of file at %lld (0x%llx)" + "- %llu percent complete. \r"), + (long long)chksize, human_abbrev[cs], + (unsigned long long)fsize, human_abbrev[i], + (unsigned long long)ip->i_di.di_num.no_addr, + (unsigned long long)ip->i_di.di_num.no_addr, + (unsigned long long)percent); + fflush(stdout); +} + +/* Put out a warm, fuzzy message every second so the user */ +/* doesn't think we hung. (This may take a long time). 
*/ +void warm_fuzzy_stuff(uint64_t block) +{ + static uint64_t one_percent = 0; + static struct timeval tv; + static uint32_t seconds = 0; + + if (!one_percent) + one_percent = last_fs_block / 100; + if (!last_reported_block || + block - last_reported_block >= one_percent) { + last_reported_block = block; + gettimeofday(&tv, NULL); + if (!seconds) + seconds = tv.tv_sec; + if (tv.tv_sec - seconds) { + static uint64_t percent; + + seconds = tv.tv_sec; + if (last_fs_block) { + percent = (block * 100) / last_fs_block; + log_notice( _("\r%llu percent complete.\r"), + (unsigned long long)percent); + fflush(stdout); + } + } + } +} + +char gfs2_getch(void) +{ + struct termios termattr, savetermattr; + char ch; + ssize_t size; + + tcgetattr (STDIN_FILENO, &termattr); + savetermattr = termattr; + termattr.c_lflag &= ~(ICANON | IEXTEN | ISIG); + termattr.c_iflag &= ~(BRKINT | ICRNL | INPCK | ISTRIP | IXON); + termattr.c_cflag &= ~(CSIZE | PARENB); + termattr.c_cflag |= CS8; + termattr.c_oflag &= ~(OPOST); + termattr.c_cc[VMIN] = 0; + termattr.c_cc[VTIME] = 0; + + tcsetattr (STDIN_FILENO, TCSANOW, &termattr); + do { + size = read(STDIN_FILENO, &ch, 1); + if (size) + break; + usleep(50000); + } while (!size); + + tcsetattr (STDIN_FILENO, TCSANOW, &savetermattr); + return ch; +} + +char generic_interrupt(const char *caller, const char *where, + const char *progress, const char *question, + const char *answers) +{ + fd_set rfds; + struct timeval tv; + char response; + int err, i; + + FD_ZERO(&rfds); + FD_SET(STDIN_FILENO, &rfds); + + tv.tv_sec = 0; + tv.tv_usec = 0; + /* Make sure there isn't extraneous input before asking the + * user the question */ + while((err = select(STDIN_FILENO + 1, &rfds, NULL, NULL, &tv))) { + if(err < 0) { + log_debug("Error in select() on stdin\n"); + break; + } + if(read(STDIN_FILENO, &response, sizeof(char)) < 0) { + log_debug("Error in read() on stdin\n"); + break; + } + } + while (TRUE) { + printf("\n%s interrupted during %s: ", caller, where); + 
if (progress) + printf("%s.\n", progress); + printf("%s", question); + + /* Make sure query is printed out */ + fflush(NULL); + response = gfs2_getch(); + printf("\n"); + fflush(NULL); + if (strchr(answers, response)) + break; + printf("Bad response, please type "); + for (i = 0; i < strlen(answers) - 1; i++) + printf("'%c', ", answers[i]); + printf(" or '%c'.\n", answers[i]); + } + return response; +} + +/* fsck_query: Same as gfs2_query except it adjusts errors_found and + errors_corrected. */ +int fsck_query(const char *format, ...) +{ + va_list args; + char response; + int ret = 0; + + errors_found++; + fsck_abort = 0; + if (opts.yes) { + errors_corrected++; + return 1; + } + if (opts.no) + return 0; + + opts.query = TRUE; + while (1) { + va_start(args, format); + vprintf(format, args); + va_end(args); + + /* Make sure query is printed out */ + fflush(NULL); + response = gfs2_getch(); + + printf("\n"); + fflush(NULL); + if (response == 0x3) { /* if interrupted, by ctrl-c */ + response = generic_interrupt("Question", "response", + NULL, + "Do you want to abort " \ + "or continue (a/c)?", + "ac"); + if (response == 'a') { + ret = 0; + fsck_abort = 1; + break; + } + printf("Continuing.\n"); + } else if (tolower(response) == 'y') { + errors_corrected++; + ret = 1; + break; + } else if (tolower(response) == 'n') { + ret = 0; + break; + } else { + printf("Bad response %d, please type 'y' or 'n'.\n", + response); + } + } + + opts.query = FALSE; + return ret; +} + +/* + * gfs2_dup_set - Flag a block as a duplicate + * We keep the references in a red/black tree. We can't keep track of every + * single inode in the file system, so the first time this function is called + * will actually be for the second reference to the duplicated block. + * This will return the number of references to the block. + * + * create - will be set if the call is supposed to create the reference. 
*/ +static struct duptree *gfs2_dup_set(uint64_t dblock, int create) +{ + struct osi_node **newn = &dup_blocks.osi_node, *parent = NULL; + struct duptree *dt; + + /* Figure out where to put new node */ + while (*newn) { + struct duptree *cur = (struct duptree *)*newn; + + parent = *newn; + if (dblock < cur->block) + newn = &((*newn)->osi_left); + else if (dblock > cur->block) + newn = &((*newn)->osi_right); + else + return cur; + } + + if (!create) + return NULL; + dt = malloc(sizeof(struct duptree)); + if (dt == NULL) { + log_crit( _("Unable to allocate duptree structure\n")); + return NULL; + } + dups_found++; + memset(dt, 0, sizeof(struct duptree)); + /* Add new node and rebalance tree. */ + dt->block = dblock; + dt->refs = 1; /* reference 1 is actually the reference we need to + discover in pass1b. */ + osi_list_init(&dt->ref_inode_list); + osi_list_init(&dt->ref_invinode_list); + osi_link_node(&dt->node, parent, newn); + osi_insert_color(&dt->node, &dup_blocks); + + return dt; +} + +/** + * find_dup_ref_inode - find a duplicate reference inode entry for an inode + */ +struct inode_with_dups *find_dup_ref_inode(struct duptree *dt, + struct gfs2_inode *ip) +{ + osi_list_t *ref; + struct inode_with_dups *id; + + osi_list_foreach(ref, &dt->ref_invinode_list) { + id = osi_list_entry(ref, struct inode_with_dups, list); + + if (id->block_no == ip->i_di.di_num.no_addr) + return id; + } + osi_list_foreach(ref, &dt->ref_inode_list) { + id = osi_list_entry(ref, struct inode_with_dups, list); + + if (id->block_no == ip->i_di.di_num.no_addr) + return id; + } + return NULL; +} + +/** + * count_dup_meta_refs - count the number of remaining references as metadata + */ +int count_dup_meta_refs(struct duptree *dt) +{ + osi_list_t *ref; + struct inode_with_dups *id; + int metarefs = 0; + + osi_list_foreach(ref, &dt->ref_invinode_list) { + id = osi_list_entry(ref, struct inode_with_dups, list); + if (id->reftypecount[ref_as_meta]) + metarefs++; + } + osi_list_foreach(ref, 
&dt->ref_inode_list) { + id = osi_list_entry(ref, struct inode_with_dups, list); + if (id->reftypecount[ref_as_meta]) + metarefs++; + } + return metarefs; +} + +/* + * add_duplicate_ref - Add a duplicate reference to the duplicates tree list + * A new element of the tree will be created as needed + * When the first reference is discovered in pass1, it realizes it's a + * duplicate but it has already forgotten where the first reference was. + * So we need to recreate the duplicate reference structure if it's not there. + * Later, in pass1b, it has to go back through the file system + * and figure out those original references in order to resolve them. + * + * first - if 1, we're being called from pass1b, in which case we're trying + * to find the first reference to this block. If 0, we're being + * called from pass1, which is the second reference, which determined + * it was a duplicate.. + */ +int add_duplicate_ref(struct gfs2_inode *ip, uint64_t block, + enum dup_ref_type reftype, int first, int inode_valid) +{ + struct inode_with_dups *id; + struct duptree *dt; + + if (!valid_block_ip(ip, block)) + return meta_is_good; + /* If this is not the first reference (i.e. all calls from pass1) we + need to create the duplicate reference. If this is pass1b, we want + to ignore references that aren't found. */ + dt = gfs2_dup_set(block, !first); + if (!dt) /* If this isn't a duplicate */ + return meta_is_good; + + /* If we found the duplicate reference but we've already discovered + the first reference (in pass1b) and the other references in pass1, + we don't need to count it, so just return. */ + if (dt->dup_flags & DUPFLAG_REF1_FOUND) + return meta_is_good; + + /* Check for a previous reference to this duplicate */ + id = find_dup_ref_inode(dt, ip); + + /* We have to be careful here. The original referencing dinode may have + deemed to be bad and deleted/freed in pass1. In that case, pass1b + wouldn't discover the correct [deleted] original reference. 
In + that case, we don't want to be confused and consider this second + reference the same as the first. If we do, we'll never be able to + resolve it. The first reference can't be the second reference. */ + if (id && first && !(dt->dup_flags & DUPFLAG_REF1_FOUND)) { + log_info(_("Original reference to block %llu (0x%llx) was " + "either found to be bad and deleted, or else " + "a duplicate within the same inode.\n"), + (unsigned long long)block, + (unsigned long long)block); + log_info(_("I'll consider the reference from inode %llu " + "(0x%llx) the first reference.\n"), + (unsigned long long)ip->i_di.di_num.no_addr, + (unsigned long long)ip->i_di.di_num.no_addr); + dt->dup_flags |= DUPFLAG_REF1_IS_DUPL; + dt->refs++; + } + + /* The first time this is called from pass1 is actually the second + reference. When we go back in pass1b looking for the original + reference, we don't want to increment the reference count because + it's already accounted for. */ + if (first) { + dt->dup_flags |= DUPFLAG_REF1_FOUND; + dups_found_first++; /* We found another first ref. */ + } else { + dt->refs++; + } + + if (id == NULL) { + /* Check for the inode on the invalid inode reference list. */ + int q; + + id = calloc(1, sizeof(*id)); + if (!id) { + log_crit( _("Unable to allocate inode_with_dups structure\n")); + return meta_error; + } + id->block_no = ip->i_di.di_num.no_addr; + q = bitmap_type(ip->i_sbd, ip->i_di.di_num.no_addr); + /* If it's an invalid dinode, put it first on the invalid + inode reference list otherwise put it on the normal list. */ + if (!inode_valid || q == GFS2_BLKST_UNLINKED) + osi_list_add_prev(&id->list, &dt->ref_invinode_list); + else { + /* If this is a system dinode, we want the duplicate + processing to find it first. That way references + from inside journals, et al, will take priority. + We don't want to delete journals in favor of dinodes + that reference a block inside a journal. 
*/ + if (fsck_system_inode(ip->i_sbd, id->block_no)) + osi_list_add(&id->list, &dt->ref_inode_list); + else + osi_list_add_prev(&id->list, + &dt->ref_inode_list); + } + } + id->reftypecount[reftype]++; + id->dup_count++; + log_info( _("Found %d reference(s) to block %llu" + " (0x%llx) as %s in %s inode #%llu (0x%llx)\n"), + id->dup_count, (unsigned long long)block, + (unsigned long long)block, reftypes[reftype], + inode_valid ? _("valid") : _("invalid"), + (unsigned long long)ip->i_di.di_num.no_addr, + (unsigned long long)ip->i_di.di_num.no_addr); + if (first) + log_info( _("This is the original reference.\n")); + else { + /* Check for duplicate refs to the same block in one inode. */ + if (id->dup_count > 1) + dt->dup_flags |= DUPFLAG_REF1_FOUND; + log_info( _("This brings the total to: %d inode references, " + "%d from this inode.\n"), + dt->refs, id->dup_count); + } + return meta_is_good; +} + +struct dir_info *dirtree_insert(struct gfs2_inum inum) +{ + struct osi_node **newn = &dirtree.osi_node, *parent = NULL; + struct dir_info *data; + + /* Figure out where to put new node */ + while (*newn) { + struct dir_info *cur = (struct dir_info *)*newn; + + parent = *newn; + if (inum.no_addr < cur->dinode.no_addr) + newn = &((*newn)->osi_left); + else if (inum.no_addr > cur->dinode.no_addr) + newn = &((*newn)->osi_right); + else + return cur; + } + + data = calloc(1, sizeof(struct dir_info)); + if (!data) { + log_crit( _("Unable to allocate dir_info structure\n")); + return NULL; + } + /* Add new node and rebalance tree. 
*/ + data->dinode.no_addr = inum.no_addr; + data->dinode.no_formal_ino = inum.no_formal_ino; + osi_link_node(&data->node, parent, newn); + osi_insert_color(&data->node, &dirtree); + + return data; +} + +struct dir_info *dirtree_find(uint64_t block) +{ + struct osi_node *node = dirtree.osi_node; + + while (node) { + struct dir_info *data = (struct dir_info *)node; + + if (block < data->dinode.no_addr) + node = node->osi_left; + else if (block > data->dinode.no_addr) + node = node->osi_right; + else + return data; + } + return NULL; +} + +/* get_ref_type - figure out if all duplicate references from this inode + are the same type, and if so, return the type. */ +enum dup_ref_type get_ref_type(struct inode_with_dups *id) +{ + enum dup_ref_type t, i; + int found_type_with_ref; + int found_other_types; + + for (t = ref_as_data; t < ref_types; t++) { + found_type_with_ref = 0; + found_other_types = 0; + for (i = ref_as_data; i < ref_types; i++) { + if (id->reftypecount[i]) { + if (t == i) + found_type_with_ref = 1; + else + found_other_types = 1; + } + } + if (found_type_with_ref) + return found_other_types ? 
ref_types : t; + } + return ref_types; +} + +void dup_listent_delete(struct duptree *dt, struct inode_with_dups *id) +{ + log_err( _("Removing duplicate reference to block %llu (0x%llx) " + "referenced as %s by dinode %llu (0x%llx)\n"), + (unsigned long long)dt->block, (unsigned long long)dt->block, + reftypes[get_ref_type(id)], (unsigned long long)id->block_no, + (unsigned long long)id->block_no); + dt->refs--; /* one less reference */ + if (id->name) + free(id->name); + osi_list_del(&id->list); + free(id); +} + +void dup_delete(struct duptree *dt) +{ + struct inode_with_dups *id; + osi_list_t *tmp; + + while (!osi_list_empty(&dt->ref_invinode_list)) { + tmp = (&dt->ref_invinode_list)->next; + id = osi_list_entry(tmp, struct inode_with_dups, list); + dup_listent_delete(dt, id); + } + while (!osi_list_empty(&dt->ref_inode_list)) { + tmp = (&dt->ref_inode_list)->next; + id = osi_list_entry(tmp, struct inode_with_dups, list); + dup_listent_delete(dt, id); + } + osi_erase(&dt->node, &dup_blocks); + free(dt); +} + +void dirtree_delete(struct dir_info *b) +{ + osi_erase(&b->node, &dirtree); + free(b); +} + +uint64_t find_free_blk(struct gfs2_sbd *sdp) +{ + struct osi_node *n, *next = NULL; + struct rgrp_tree *rl = NULL; + struct gfs2_rindex *ri; + struct gfs2_rgrp *rg; + unsigned int block, bn = 0, x = 0, y = 0; + unsigned int state; + struct gfs2_buffer_head *bh; + + memset(&rg, 0, sizeof(rg)); + for (n = osi_first(&sdp->rgtree); n; n = next) { + next = osi_next(n); + rl = (struct rgrp_tree *)n; + if (rl->rg.rg_free) + break; + } + + if (n == NULL) + return 0; + + ri = &rl->ri; + rg = &rl->rg; + + for (block = 0; block < ri->ri_length; block++) { + bh = rl->bits[block].bi_bh; + x = (block) ? 
sizeof(struct gfs2_meta_header) : sizeof(struct gfs2_rgrp); + + for (; x < sdp->bsize; x++) + for (y = 0; y < GFS2_NBBY; y++) { + state = (bh->b_data[x] >> (GFS2_BIT_SIZE * y)) & 0x03; + if (state == GFS2_BLKST_FREE) + return ri->ri_data0 + bn; + bn++; + } + } + return 0; +} + +uint64_t *get_dir_hash(struct gfs2_inode *ip) +{ + unsigned hsize = (1 << ip->i_di.di_depth) * sizeof(uint64_t); + int ret; + uint64_t *tbl = malloc(hsize); + + if (tbl == NULL) + return NULL; + + ret = gfs2_readi(ip, tbl, 0, hsize); + if (ret != hsize) { + free(tbl); + return NULL; + } + + return tbl; +} + +void delete_all_dups(struct gfs2_inode *ip) +{ + struct osi_node *n, *next; + struct duptree *dt; + osi_list_t *tmp, *x; + struct inode_with_dups *id; + int found; + + for (n = osi_first(&dup_blocks); n; n = next) { + next = osi_next(n); + dt = (struct duptree *)n; + + found = 0; + id = NULL; + + osi_list_foreach_safe(tmp, &dt->ref_invinode_list, x) { + id = osi_list_entry(tmp, struct inode_with_dups, list); + if (id->block_no == ip->i_di.di_num.no_addr) { + dup_listent_delete(dt, id); + found = 1; + } + } + osi_list_foreach_safe(tmp, &dt->ref_inode_list, x) { + id = osi_list_entry(tmp, struct inode_with_dups, list); + if (id->block_no == ip->i_di.di_num.no_addr) { + dup_listent_delete(dt, id); + found = 1; + } + } + if (!found) + continue; + + if (dt->refs == 0) { + log_debug(_("This was the last reference: 0x%llx is " + "no longer a duplicate.\n"), + (unsigned long long)dt->block); + dup_delete(dt); /* not duplicate now */ + } else { + log_debug(_("%d references remain to 0x%llx\n"), + dt->refs, (unsigned long long)dt->block); + if (dt->refs > 1) + continue; + + id = NULL; + osi_list_foreach(tmp, &dt->ref_invinode_list) + id = osi_list_entry(tmp, + struct inode_with_dups, + list); + osi_list_foreach(tmp, &dt->ref_inode_list) + id = osi_list_entry(tmp, + struct inode_with_dups, + list); + if (id) + log_debug("Last reference is from inode " + "0x%llx\n", + (unsigned long 
long)id->block_no); + } + } +} + +void print_pass_duration(const char *name, struct timeval *start) +{ + char duration[17] = ""; /* strlen("XXdXXhXXmXX.XXXs") + 1 */ + struct timeval end, diff; + unsigned d, h, m, s; + char *p = duration; + + gettimeofday(&end, NULL); + timersub(&end, start, &diff); + + s = diff.tv_sec % 60; + diff.tv_sec /= 60; + m = diff.tv_sec % 60; + diff.tv_sec /= 60; + h = diff.tv_sec % 24; + d = diff.tv_sec / 24; + + if (d) + p += snprintf(p, 4, "%ud", d > 99 ? 99U : d); + if (h) + p += snprintf(p, 4, "%uh", h); + if (m) + p += snprintf(p, 4, "%um", m); + + snprintf(p, 8, "%u.%03lus", s, diff.tv_usec / 1000); + log_notice(_("%s completed in %s\n"), name, duration); +} + diff --git a/gfs2/fsck/util.h b/gfs2/fsck/util.h new file mode 100644 index 0000000..d93b65d --- /dev/null +++ b/gfs2/fsck/util.h @@ -0,0 +1,137 @@ +#ifndef __UTIL_H__ +#define __UTIL_H__ + +#include + +#include "fsck.h" +#include "libgfs2.h" + +#define fsck_lseek(fd, off) \ + ((lseek((fd), (off), SEEK_SET) == (off)) ? 
0 : -1) + +#define INODE_VALID 1 +#define INODE_INVALID 0 + +struct di_info *search_list(osi_list_t *list, uint64_t addr); +void big_file_comfort(struct gfs2_inode *ip, uint64_t blks_checked); +void warm_fuzzy_stuff(uint64_t block); +int add_duplicate_ref(struct gfs2_inode *ip, uint64_t block, + enum dup_ref_type reftype, int first, int inode_valid); +extern struct inode_with_dups *find_dup_ref_inode(struct duptree *dt, + struct gfs2_inode *ip); +extern void dup_listent_delete(struct duptree *dt, struct inode_with_dups *id); +extern int count_dup_meta_refs(struct duptree *dt); +extern const char *reftypes[ref_types + 1]; + +#define BLOCKMAP_SIZE1(size) ((size) >> 3) +#define BLOCKMAP_SIZE2(size) ((size) >> 2) +#define BLOCKMAP_BYTE_OFFSET2(x) ((x & 0x0000000000000003) << 1) +#define BLOCKMAP_BYTE_OFFSET1(x) (x & 0x0000000000000007) +#define BLOCKMAP_MASK2 (0x3) +#define BLOCKMAP_MASK1 (1) + +struct fsck_pass { + const char *name; + int (*f)(struct gfs2_sbd *sdp); +}; + +static inline int block_type(struct gfs2_bmap *bl, uint64_t bblock) +{ + static unsigned char *byte; + static uint64_t b; + static int btype; + + byte = bl->map + BLOCKMAP_SIZE2(bblock); + b = BLOCKMAP_BYTE_OFFSET2(bblock); + btype = (*byte & (BLOCKMAP_MASK2 << b )) >> b; + return btype; +} + +static inline int link1_type(struct gfs2_bmap *bl, uint64_t bblock) +{ + static unsigned char *byte; + static uint64_t b; + static int btype; + + byte = bl->map + BLOCKMAP_SIZE1(bblock); + b = BLOCKMAP_BYTE_OFFSET1(bblock); + btype = (*byte & (BLOCKMAP_MASK1 << b )) >> b; + return btype; +} + +static inline void link1_destroy(struct gfs2_bmap *bmap) +{ + if (bmap->map) + free(bmap->map); + bmap->size = 0; + bmap->mapsize = 0; +} + +static inline int bitmap_type(struct gfs2_sbd *sdp, uint64_t bblock) +{ + struct rgrp_tree *rgd; + + rgd = gfs2_blk2rgrpd(sdp, bblock); + return lgfs2_get_bitmap(sdp, bblock, rgd); +} + +static const inline char *block_type_string(int q) +{ + const char *blktyp[] = {"free", "data", 
"other", "inode", "invalid"}; + if (q >= GFS2_BLKST_FREE && q <= GFS2_BLKST_DINODE) + return (blktyp[q]); + return blktyp[4]; +} + +static inline int is_dir(struct gfs2_dinode *dinode, int gfs1) +{ + if (gfs1 && is_gfs_dir(dinode)) + return 1; + if (S_ISDIR(dinode->di_mode)) + return 1; + + return 0; +} + +static inline uint32_t gfs_to_gfs2_mode(struct gfs2_inode *ip) +{ + uint16_t gfs1mode = ip->i_di.__pad1; + + switch (gfs1mode) { + case GFS_FILE_DIR: + return S_IFDIR; + case GFS_FILE_REG: + return S_IFREG; + case GFS_FILE_LNK: + return S_IFLNK; + case GFS_FILE_BLK: + return S_IFBLK; + case GFS_FILE_CHR: + return S_IFCHR; + case GFS_FILE_FIFO: + return S_IFIFO; + case GFS_FILE_SOCK: + return S_IFSOCK; + default: + /* This could be an aborted gfs2_convert so look for both. */ + if (ip->i_di.di_entries || + (ip->i_di.di_mode & S_IFMT) == S_IFDIR) + return S_IFDIR; + else + return S_IFREG; + } +} + +extern enum dup_ref_type get_ref_type(struct inode_with_dups *id); +extern char generic_interrupt(const char *caller, const char *where, + const char *progress, const char *question, + const char *answers); +extern char gfs2_getch(void); +extern uint64_t find_free_blk(struct gfs2_sbd *sdp); +extern uint64_t *get_dir_hash(struct gfs2_inode *ip); +extern void delete_all_dups(struct gfs2_inode *ip); +extern void print_pass_duration(const char *name, struct timeval *start); + +#define stack log_debug(" - %s()\n", __func__) + +#endif /* __UTIL_H__ */ diff --git a/gfs2/glocktop/Makefile.am b/gfs2/glocktop/Makefile.am new file mode 100644 index 0000000..1102c8e --- /dev/null +++ b/gfs2/glocktop/Makefile.am @@ -0,0 +1,24 @@ +MAINTAINERCLEANFILES = Makefile.in + +sbin_PROGRAMS = \ + glocktop + +glocktop_SOURCES = \ + glocktop.c + +glocktop_CFLAGS = \ + $(ncurses_CFLAGS) + +glocktop_LDFLAGS = \ + $(ncurses_LIBS) \ + $(uuid_LIBS) + +glocktop_CPPFLAGS = \ + -D_FILE_OFFSET_BITS=64 \ + -D_LARGEFILE64_SOURCE \ + -D_GNU_SOURCE \ + -I$(top_srcdir)/gfs2/include \ + 
-I$(top_srcdir)/gfs2/libgfs2 + +glocktop_LDADD = \ + $(top_builddir)/gfs2/libgfs2/libgfs2.la diff --git a/gfs2/glocktop/glocktop.c b/gfs2/glocktop/glocktop.c new file mode 100644 index 0000000..7d65ac4 --- /dev/null +++ b/gfs2/glocktop/glocktop.c @@ -0,0 +1,1901 @@ +#include "clusterautoconfig.h" +/** + * glocktop.c - list/print the top GFS2 glock waiters + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define MAX_GLOCKS 20 +#define MAX_LINES 6000 +#define MAX_MOUNT_POINTS 100 +#define MAX_FILES 512 +#define MAX_CALLTRACE_LINES 4 +#define TITLE1 "glocktop - GFS2 glock monitor" +#define TITLE2 "Press or to exit" + +#define COLOR_TITLE 1 +#define COLOR_NORMAL 2 +#define COLOR_INVERSE 3 +#define COLOR_SPECIAL 4 +#define COLOR_HIGHLIGHT 5 +#define COLOR_OFFSETS 6 +#define COLOR_CONTENTS 7 +#define COLOR_HELD 8 + +/* init_pair(COLOR_TITLE, COLOR_BLACK, COLOR_CYAN); + init_pair(COLOR_INVERSE, COLOR_BLACK, COLOR_WHITE); + init_pair(COLOR_NORMAL, COLOR_WHITE, COLOR_BLACK); + init_pair(COLOR_SPECIAL, COLOR_MAGENTA, COLOR_WHITE); + init_pair(COLOR_HIGHLIGHT, COLOR_WHITE, COLOR_BLUE); + init_pair(COLOR_OFFSETS, COLOR_CYAN, COLOR_WHITE); + init_pair(COLOR_CONTENTS, COLOR_BLUE, COLOR_WHITE); + init_pair(COLOR_HELD, COLOR_CYAN, COLOR_BLACK); +*/ + +#define STR_BLACK "[\033[0;30m]" +#define STR_RED "[\033[0;31m]" +#define STR_GREEN "[\033[0;32m]" +#define STR_YELLOW "[\033[0;33m]" +#define STR_BLUE "[\033[0;34m]" +#define STR_MAGENTA "[\033[0;35m]" +#define STR_CYAN "[\033[0;36m]" +#define STR_WHITE "[\033[0;37m]" + +#define BOLD_WHITE "[\033[1;37m]" + +#define BKG_CYAN "[\033[46m]" +#define BKG_WHITE "[\033[47m]" +#define BKG_BLUE "[\033[44m]" + +#define REFRESH_TIME 30 +#define COLORS_TITLE \ + do { \ + if (termlines) \ + attrset(COLOR_PAIR(COLOR_TITLE)); \ + else \ + printf(BKG_CYAN); \ + } while (0) +#define 
COLORS_NORMAL_BOLD \ + do { \ + if (termlines) { \ + attrset(COLOR_PAIR(COLOR_NORMAL)); \ + attron(A_BOLD); \ + } else { \ + printf(BOLD_WHITE); \ + } \ + } while (0) +#define COLORS_NORMAL \ + do { \ + if (termlines) { \ + attrset(COLOR_PAIR(COLOR_NORMAL)); \ + } else { \ + printf(STR_WHITE); \ + } \ + } while (0) +#define COLORS_INVERSE_BOLD \ + do { \ + if (termlines) { \ + attrset(COLOR_PAIR(COLOR_INVERSE)); \ + attron(A_BOLD); \ + } else { \ + printf(BKG_WHITE); \ + } \ + } while (0) +#define COLORS_INVERSE \ + do { \ + if (termlines) { \ + attrset(COLOR_PAIR(COLOR_INVERSE)); \ + } else { \ + printf(BKG_WHITE); \ + } \ + } while (0) +#define COLORS_HELD \ + do { \ + if (termlines) { \ + attrset(COLOR_PAIR(COLOR_HELD)); \ + } else { \ + printf(STR_CYAN); \ + } \ + } while (0) +#define COLORS_HIGHLIGHT \ + do { \ + if (termlines) { \ + attrset(COLOR_PAIR(COLOR_HIGHLIGHT)); \ + } else { \ + printf(BKG_BLUE); \ + } \ + } while (0) +#define DLM_DIRTBL "/sys/kernel/config/dlm/cluster/dirtbl_size" +#define DLM_RSBTBL "/sys/kernel/config/dlm/cluster/rsbtbl_size" +#define DLM_LKBTBL "/sys/kernel/config/dlm/cluster/lkbtbl_size" + +#define GFS2_MAX_META_HEIGHT 10 + +#define DETAILS 0x00000001 +#define FRIENDLY 0x00000002 + +enum summary_types { + all = 0, + locked = 1, + held_ex = 2, + held_sh = 3, + held_df = 4, + has_waiter = 5, + tot_waiters = 6, + stypes = 7, +}; + +char debugfs[PATH_MAX]; +int termcols = 80, termlines = 30, done = 0; +unsigned glocks = 0; +const char *termtype; +WINDOW *wind; +int bufsize = 4 * 1024 * 1024; +char *glock[MAX_GLOCKS]; +int iterations = 0, show_reservations = 0, iters_done = 0; +char devices[MAX_MOUNT_POINTS][80]; +char mount_points[MAX_MOUNT_POINTS][80]; +int fs_fd[MAX_MOUNT_POINTS]; +int mounted = 0; +char dlmwlines[100][96]; /* waiters lines */ +char dlmglines[MAX_LINES][97]; /* granted lines */ +char contended_filenames[MAX_FILES][PATH_MAX]; +unsigned long long contended_blocks[MAX_FILES]; +int contended_count = 0; +int line = 0; 
+const char *prog_name; +char dlm_dirtbl_size[32], dlm_rsbtbl_size[32], dlm_lkbtbl_size[32]; +int bsize = 0; +struct gfs2_sb sd_sb[MAX_MOUNT_POINTS]; +int sd_diptrs = 0, sd_inptrs = 0; +uint64_t sd_heightsize[GFS2_MAX_META_HEIGHT]; +uint64_t sd_jheightsize[GFS2_MAX_META_HEIGHT]; +int sd_max_height, sd_max_jheight; +char print_dlm_grants = 1; +char *gbuf = NULL; /* glocks buffer */ +char *gpos = NULL; +char *gnextpos = NULL; +int gmaxpos = 0; + +char *dbuf = NULL; /* dlm locks buffer */ +char *dpos = NULL; +char *dnextpos = NULL; +int dmaxpos = 0; +char hostname[256]; + +/* + * init_colors + */ +static void init_colors(void) +{ + init_pair(COLOR_TITLE, COLOR_BLACK, COLOR_CYAN); + init_pair(COLOR_INVERSE, COLOR_BLACK, COLOR_WHITE); + init_pair(COLOR_NORMAL, COLOR_WHITE, COLOR_BLACK); + init_pair(COLOR_SPECIAL, COLOR_MAGENTA, COLOR_WHITE); + init_pair(COLOR_HIGHLIGHT, COLOR_WHITE, COLOR_BLUE); + init_pair(COLOR_OFFSETS, COLOR_CYAN, COLOR_WHITE); + init_pair(COLOR_CONTENTS, COLOR_BLUE, COLOR_WHITE); + init_pair(COLOR_HELD, COLOR_CYAN, COLOR_BLACK); +} + +/* + * UpdateSize - screen size changed, so update it + */ +static void UpdateSize(int sig) +{ + static char term_buffer[2048]; + int rc; + + if (termlines) { + termlines = 30; + termtype = getenv("TERM"); + if (termtype == NULL) + return; + rc=tgetent(term_buffer,termtype); + if (rc >= 0) { + termlines = tgetnum((char *)"li"); + if (termlines < 10) + termlines = 30; + termcols = tgetnum((char *)"co"); + if (termcols < 80) + termcols = 80; + } else + perror("Error: tgetent failed."); + termlines--; /* last line is number of lines -1 */ + } + signal(SIGWINCH, UpdateSize); +} + +static void read_superblock(int fd, int mntpt) +{ + struct gfs2_sbd sbd = { .device_fd = fd, .bsize = GFS2_BASIC_BLOCK }; + struct gfs2_buffer_head *bh; + int x; + uint64_t space = 0; + + ioctl(fd, BLKFLSBUF, 0); + bh = bread(&sbd, GFS2_SB_ADDR); + gfs2_sb_in(&sd_sb[mntpt], bh->b_data); + bsize = sd_sb[mntpt].sb_bsize; + if (!bsize) + bsize = 
4096; + sd_inptrs = (bsize - sizeof(struct gfs2_meta_header)) / + sizeof(uint64_t); + sd_diptrs = (bsize - sizeof(struct gfs2_dinode)) / + sizeof(uint64_t); + sd_heightsize[0] = bsize - sizeof(struct gfs2_dinode); + sd_heightsize[1] = bsize * sd_diptrs; + for (x = 2; ; x++) { + space = sd_heightsize[x - 1] * sd_inptrs; + if (space / sd_inptrs != sd_heightsize[x - 1] || + space % sd_inptrs != 0) + break; + sd_heightsize[x] = space; + } + sd_jheightsize[0] = bsize - sizeof(struct gfs2_dinode); + sd_jheightsize[1] = (bsize - sizeof(struct gfs2_meta_header)) * + sd_diptrs; + for (x = 2; ; x++){ + space = sd_jheightsize[x - 1] * sd_inptrs; + if (space / sd_inptrs != sd_jheightsize[x - 1] || + space % sd_inptrs != 0) + break; + sd_jheightsize[x] = space; + } + sd_max_jheight = x; +} + +static int parse_mounts(void) +{ + char str[PATH_MAX], dev[PATH_MAX], mnt[PATH_MAX], mtype[PATH_MAX]; + char opts[PATH_MAX]; + FILE *fp; + + memset(debugfs, 0, sizeof(debugfs)); + memset(mount_points, 0, sizeof(mount_points)); + memset(devices, 0, sizeof(devices)); + + fp = fopen("/proc/mounts", "rt"); + if (fp == NULL) { + perror("/proc/mounts"); + return 1; + } + while (fgets(str, sizeof(str) - 1, fp)) { + sscanf(str, "%s %s %s %s", dev, mnt, mtype, opts); + if (!strcmp(mtype, "debugfs")) { + strcpy(debugfs, mnt); + continue; + } + if (strcmp(mtype, "gfs2")) /* if not gfs2 */ + continue; + + strncpy(mount_points[mounted], mnt, 79); + mount_points[mounted][79] = '\0'; + strncpy(devices[mounted], dev, 79); + devices[mounted][79] = '\0'; + + /* Now find out the mount point's file system name */ + fs_fd[mounted] = open(dev, O_RDONLY); + if (fs_fd[mounted]) + read_superblock(fs_fd[mounted], mounted); + mounted++; + } + if (debugfs[0] == '\0') { + if (mount("debugfs", "/sys/kernel/debug", "debugfs", 0, NULL)){ + fprintf(stderr, "Unable to mount debugfs.\n"); + fprintf(stderr, "Please mount it manually.\n"); + exit(-1); + } + strcpy(debugfs, "/sys/kernel/debug"); + } + fclose(fp); + return 0; 
+} + +/* + * display_title_lines + */ +static void display_title_lines(void) +{ + if (termlines) { + clear(); /* don't use Erase */ + COLORS_TITLE; + attron(A_BOLD); + move(0, 0); + printw("%-80s", TITLE1); + move(termlines, 0); + printw("%-79s", TITLE2); + COLORS_NORMAL_BOLD; + move(1, 0); + } else { + printf("\n"); + } + line = 1; +} + +/* + * bobgets - get a string + * returns: 1 if user exited by hitting enter + * 0 if user exited by hitting escape + */ +static int bobgets(char string[], int x, int y, int sz, int *ch) +{ + int finished,runningy,rc; + + if (!termlines) + return 0; + move(x,y); + finished=FALSE; + COLORS_INVERSE_BOLD; + move(x,y); + addstr(string); + move(x,y); + curs_set(2); + refresh(); + runningy=y; + rc=0; + while (!finished) { + *ch = getch(); + + if(*ch < 0x0100 && isprint(*ch)) { + char *p=string+strlen(string); // end of the string + + *(p+1)='\0'; + string[runningy-y]=*ch; + runningy++; + move(x,y); + addstr(string); + if (runningy-y >= sz) { + rc=1; + *ch = KEY_RIGHT; + finished = TRUE; + } + } + else { + // special character, is it one we recognize? 
+ switch(*ch) + { + case(KEY_ENTER): + case('\n'): + case('\r'): + rc=1; + finished=TRUE; + string[runningy-y] = '\0'; + break; + case(KEY_CANCEL): + case(0x01B): + rc=0; + finished=TRUE; + break; + case(KEY_DC): + case(0x07F): + if (runningy>=y) { + char *p; + p = &string[runningy - y]; + while (*p) { + *p = *(p + 1); + p++; + } + *p = '\0'; + runningy--; + // remove the character from the string + move(x,y); + addstr(string); + COLORS_NORMAL_BOLD; + addstr(" "); + COLORS_INVERSE_BOLD; + runningy++; + } + break; + case(KEY_BACKSPACE): + if (runningy>y) { + char *p; + + p = &string[runningy - y - 1]; + while (*p) { + *p = *(p + 1); + p++; + } + *p='\0'; + runningy--; + // remove the character from the string + move(x,y); + addstr(string); + COLORS_NORMAL_BOLD; + addstr(" "); + COLORS_INVERSE_BOLD; + } + break; + default: + move(0,70); + printw("%08x", *ch); + // ignore all other characters + break; + } // end switch on non-printable character + } // end non-printable character + move(line, runningy); + refresh(); + } // while !finished + if (sz>0) + string[sz]='\0'; + COLORS_NORMAL_BOLD; + return rc; +}/* bobgets */ + +static char *bufgets(int fd, char *bigbuf, char **nextpos, char **pos, + int *maxpos) +{ + if (*nextpos == NULL) { + *maxpos = read(fd, bigbuf, bufsize - 1); + bigbuf[bufsize - 1] = '\0'; + if (*maxpos == 0) + return NULL; + *pos = bigbuf; + } else + *pos = *nextpos; + + *nextpos = memchr(*pos, '\n', (bigbuf + *maxpos) - *pos); + while (*nextpos && (**nextpos == '\n' || **nextpos == '\r') && + *nextpos < bigbuf + (bufsize - 1)) { + **nextpos = '\0'; + (*nextpos)++; + } + if (*nextpos >= bigbuf + *maxpos) + *nextpos = NULL; + return *pos; +} + +static char *glock_number(const char *str) +{ + const char *glockid; + char *p; + static char id[32]; + + glockid = strchr(str, '/'); + if (glockid == NULL) + return NULL; + glockid++; + strncpy(id, glockid, sizeof(id)); + id[31] = '\0'; + p = strchr(id, ' '); + if (p) + *p = '\0'; + return id; +} + +static int 
this_glock_requested(const char *str) +{ + const char *glockid; + int i; + + if (!glocks) + return 0; + + glockid = glock_number(str); + if (glockid == NULL) + return 0; + for (i = 0; i < glocks; i++) + if (!strcmp(glockid, glock[i])) + return 1; + return 0; +} + +static int is_iopen(const char *str) +{ + char *p; + + p = strchr(str, '/'); + if (p == NULL) + return 0; + p--; + if (*p == '5') + return 1; + return 0; +} + +static int this_lkb_requested(const char *str) +{ + int i; + + if (!glocks) + return 1; + + for (i = 0; i < glocks; i++) { + if (strstr(str, glock[i])) + return 1; + } + return 0; +} + +static void eol(int col) /* end of line */ +{ + if (termlines) { + line++; + move(line, col); + } else { + printf("\n"); + for (; col > 0; col--) + printf(" "); + } +} + +void print_it(const char *label, const char *fmt, const char *fmt2, ...) +{ + va_list args; + char tmp_string[128]; + + if (!termlines || line < termlines) { + va_start(args, fmt2); + vsnprintf(tmp_string, 127, fmt, args); + tmp_string[127] = '\0'; + + if (termlines) { + printw("%s", tmp_string); + refresh(); + } else { + printf("%s", tmp_string); + fflush(stdout); + } + } + va_end(args); +} + +static void display_filename(int fd, unsigned long long block, + unsigned long long dirarray[256], int subdepth) +{ + int i, subs; + char *mntpt = NULL; + char blk[32]; + DIR *dir = NULL; + struct dirent *dent; + + for (i = 0; i < mounted; i++) { + if (fd == fs_fd[i]) { + mntpt = mount_points[i]; + break; + } + } + if (i == mounted) + return; + for (i = 0; i < contended_count; i++) { + if (contended_blocks[i] == block) { + break; + } + } + sprintf(blk, "%lld", block); + if (i >= contended_count) { + memset(contended_filenames[i], 0, PATH_MAX); + strcat(contended_filenames[i], mntpt); + for (subs = subdepth - 2; subs >= 0; subs--) { + dir = opendir(contended_filenames[i]); + while ((dent = readdir(dir))) { + if (dent->d_ino == dirarray[subs]) { + strcat(contended_filenames[i], "/"); + 
strcat(contended_filenames[i], + dent->d_name); + break; + } + } + closedir(dir); + } + } + + print_it(NULL, "%s", NULL, contended_filenames[i]); + eol(0); +} + +static const char *show_inode(const char *id, int fd, unsigned long long block) +{ + struct gfs2_inode *ip; + const char *inode_type = NULL; + struct gfs2_sbd sbd = { .device_fd = fd, .bsize = bsize }; + + ip = lgfs2_inode_read(&sbd, block); + if (S_ISDIR(ip->i_di.di_mode)) { + struct gfs2_inode *parent; + unsigned long long dirarray[256]; + int subdepth = 0, error; + + inode_type = "directory "; + dirarray[0] = block; + subdepth++; + /* Backtrack the directory to its source */ + while (1) { + error = gfs2_lookupi(ip, "..", 2, &parent); + if (error) + break; + /* Stop at the root inode */ + if (ip->i_di.di_num.no_addr == + parent->i_di.di_num.no_addr) { + inode_put(&parent); + break; + } + inode_put(&ip); + ip = parent; + dirarray[subdepth++] = parent->i_di.di_num.no_addr; + } + display_filename(fd, block, dirarray, subdepth); + } else if (S_ISREG(ip->i_di.di_mode)) { + inode_type = "file "; + } else if (S_ISLNK(ip->i_di.di_mode)) { + inode_type = "link "; + } else if (S_ISCHR(ip->i_di.di_mode)) { + inode_type = "char device "; + } else if (S_ISBLK(ip->i_di.di_mode)) { + inode_type = "block device "; + } else if (S_ISFIFO(ip->i_di.di_mode)) { + inode_type = "fifo "; + } else if (S_ISSOCK(ip->i_di.di_mode)) { + inode_type = "socket "; + } else + inode_type = "file? 
"; + inode_put(&ip); + return inode_type; +} + +static const char *show_details(const char *id, const char *fsname, int btype, + int trace_dir_path) +{ + int mnt_num; + unsigned long long block = 0; + const char *blk_type = NULL; + FILE *dlmf; + + /* Figure out which mount point corresponds to this debugfs id */ + for (mnt_num = 0; mnt_num < mounted; mnt_num++) { + char *p; + + p = strchr(sd_sb[mnt_num].sb_locktable, ':'); + if (!p) + continue; + p++; + if (!strcmp(p, fsname)) + break; + } + memset(dlm_dirtbl_size, 0, sizeof(dlm_dirtbl_size)); + memset(dlm_rsbtbl_size, 0, sizeof(dlm_rsbtbl_size)); + memset(dlm_lkbtbl_size, 0, sizeof(dlm_lkbtbl_size)); + if (!strcmp(sd_sb[mnt_num].sb_lockproto, "lock_dlm")) { + char *sp; + char *p; + + dlmf = fopen(DLM_DIRTBL, "rt"); + if (dlmf) { + sp = fgets(dlm_dirtbl_size, sizeof(dlm_dirtbl_size), dlmf); + if (sp == NULL) + goto out_err; + p = strchr(dlm_dirtbl_size, '\n'); + if (p) + *p = '\0'; + fclose(dlmf); + } else { + strcpy(dlm_dirtbl_size, " "); + } + dlmf = fopen(DLM_RSBTBL, "rt"); + if (dlmf) { + sp = fgets(dlm_rsbtbl_size, sizeof(dlm_rsbtbl_size), dlmf); + if (sp == NULL) + goto out_err; + p = strchr(dlm_rsbtbl_size, '\n'); + if (p) + *p = '\0'; + fclose(dlmf); + } else { + strcpy(dlm_rsbtbl_size, " "); + } + dlmf = fopen(DLM_LKBTBL, "rt"); + if (dlmf) { + sp = fgets(dlm_lkbtbl_size, sizeof(dlm_lkbtbl_size), dlmf); + if (sp == NULL) + goto out_err; + p = strchr(dlm_lkbtbl_size, '\n'); + if (p) + *p = '\0'; + fclose(dlmf); + } else { + strcpy(dlm_lkbtbl_size, " "); + } + } else { + strcpy(dlm_dirtbl_size, "nolock"); + strcpy(dlm_lkbtbl_size, "nolock"); + strcpy(dlm_lkbtbl_size, "nolock"); + } + + if (mnt_num >= mounted) /* can't find the right superblock */ + return "unknown"; + + /* Read the inode in so we can see its type. 
*/ + sscanf(id, "%llx", &block); + if (block) { + if (btype == 2) + if (trace_dir_path) + blk_type = show_inode(id, fs_fd[mnt_num], + block); + else + blk_type = ""; + else + blk_type = ""; + } + return blk_type; +out_err: + fclose(dlmf); + return "error"; +} + +static int is_dlm_waiting(int dlmwaiters, int locktype, char *id) +{ + int i; + int dlmid, wait_type, nodeid, type; + char locknum[32]; + + for (i = 0; i < dlmwaiters && i < 100; i++) { + sscanf(dlmwlines[i], "%x %d %d %d %s", + &dlmid, &wait_type, &nodeid, &type, locknum); + if ((type == locktype) && (!strcmp(locknum, id))) + return 1; + } + return 0; +} + +static const char *friendly_state(const char *glock_line, const char *search) +{ + const char *p; + + p = strstr(glock_line, search); + + if (p == NULL) + return "Dazed"; + + p += 2; + if (*p == 'E') + return "Exclusive"; + else if (*p == 'S') + return "Shared"; + else if (*p == 'U') + return "Unlocked"; + else if (*p == 'D') + return "Deferred"; + else + return "Confused"; +} + +static const char *friendly_gflags(const char *glock_line) +{ + static char flagout[PATH_MAX]; + const char *p; + + memset(flagout, 0, sizeof(flagout)); + + p = strstr(glock_line, "f:"); + if (!p) + return " "; + p += 2; + strcpy(flagout, "["); + while (*p != ' ') { + switch (*p) { + case 'l': + /*strcat(flagout, "Locked");*/ + break; + case 'D': + strcat(flagout, "Demoting"); + break; + case 'd': + strcat(flagout, "Demote pending"); + break; + case 'p': + strcat(flagout, "Demote in progress"); + break; + case 'y': + strcat(flagout, "Dirty"); + break; + case 'f': + strcat(flagout, "Flush"); + break; + case 'i': + strcat(flagout, "Invalidating"); + break; + case 'r': + strcat(flagout, "Reply pending"); + break; + case 'I': + /*strcat(flagout, "Initial");*/ + break; + case 'F': + strcat(flagout, "Frozen"); + break; + case 'q': + strcat(flagout, "Queued"); + break; + case 'L': + strcat(flagout, "LRU"); + break; + case 'o': + /*strcat(flagout, "Object present");*/ + break; + case 
'b': + strcat(flagout, "Blocking"); + break; + default: + strcat(flagout, "Unknown"); + break; + } + if ((strlen(flagout)) > 1 && (!strchr(" lIo", *(p + 1)))) + strcat(flagout, ", "); + p++; + } + strcat(flagout, "]"); + return flagout; +} + +static const char *friendly_glock(const char *glock_line, char prefix) +{ + static char gline[PATH_MAX]; + + if (prefix == 'W') + sprintf(gline, "Is:%s, Want:%s %s", + friendly_state(glock_line, "s:"), + friendly_state(glock_line, "t:"), + friendly_gflags(glock_line)); + else + sprintf(gline, "Held:%s %s", + friendly_state(glock_line, "s:"), + friendly_gflags(glock_line)); + return gline; +} + +static const char *dlm_grtype(int grmode) +{ + const char *dlm_types[8] = {"NL", "CR", "CW", "PR", "PW", "EX", + "NA", "NA"}; + + if (grmode < 0) + return "-1"; + return dlm_types[grmode & 0x07]; +} + +static const char *dlm_status(int status) +{ + const char *dlm_statuses[4] = {"Unknown", "Waiting", "Granted", + "Converting"}; + if (status < 0) + return "unknown"; + return dlm_statuses[status & 0x03]; +} + +static const char *dlm_nodeid(int lkbnodeid) +{ + static char nodeid[16]; + + if (lkbnodeid == 0) + return "this node"; + sprintf(nodeid, "node %d", lkbnodeid); + return nodeid; +} + +static const char *getprocname(int ownpid) +{ + char fn[1024]; + static char str[80]; + const char *procname; + FILE *fp; + + sprintf(fn, "/proc/%d/status", ownpid); + fp = fopen(fn, "r"); + if (fp == NULL) + return "ended"; + + if (fgets(str, 80, fp) != NULL) { + char *p; + + procname = str + 6; + p = strchr(procname, '\n'); + if (p) + *p = '\0'; + } else + procname = "unknown"; + + fclose(fp); + return procname; +} + +static void show_dlm_grants(int locktype, const char *g_line, int dlmgrants, + int summary) +{ + int i; + char dlm_resid[75]; + unsigned int lkb_id, lkbnodeid, remid, ownpid, exflags, flags, status; + unsigned int grmode, rqmode, nodeid, length; + unsigned long long xid, us; + char trgt_res_name[64], res_name[64], *p1, *p2; + const char 
*procname; + /* The resource name is the text between the first slash of the glock line and the next space. */ + p1 = strchr(g_line, '/'); + if (!p1) + return; + p1++; + p2 = strchr(p1, ' '); + if (!p2 || (size_t)(p2 - p1) >= sizeof(trgt_res_name)) /* refuse names that would overflow the buffer, terminator included */ + return; + memset(trgt_res_name, 0, sizeof(trgt_res_name)); + memcpy(trgt_res_name, p1, p2 - p1); + sprintf(dlm_resid, "%8d%16s", locktype, trgt_res_name); /* 24 characters laid out like the quoted res_name column below */ + for (i = 0; i < dlmgrants; i++) { +/* +lkb_id n remid pid x e f s g rq u n ln res_name 1234567890123456 +1100003 1 2ae0006 8954 0 0 0 2 5 -1 0 1 24 " 2 102ab" +2a20001 1 30d0001 8934 0 0 0 2 3 -1 0 1 24 " 5 102ab" + b0001 2 860001 8868 0 0 10000 2 3 -1 0 0 24 " 1 2" +2450001 2 1be0002 8962 0 0 10000 1 -1 5 12214 0 24 " 2 102ab" +*/ + p1 = strchr(dlmglines[i], '\"'); + if (!p1) + continue; + p1++; + if (strncmp(dlm_resid, p1, 24)) + continue; /* this DLM lock belongs to another glock */ + if (sscanf(dlmglines[i], "%x %d %x %u %llu %x %x %d %d %d %llu " + "%u %d \"%24s\"\n", + &lkb_id, &lkbnodeid, &remid, &ownpid, &xid, &exflags, + &flags, &status, &grmode, &rqmode, &us, &nodeid, + &length, res_name) < 9) /* status and grmode are fields 8 and 9; without them the tests below would read uninitialised values */ + continue; + if (status == 1) { /* Waiting */ + if (!lkbnodeid) + procname = getprocname(ownpid); + else + procname = ""; + if (summary) + print_it(NULL, " (", NULL); + else + print_it(NULL, " D: ", NULL); + print_it(NULL, "%s for %s, pid %d %s", NULL, + dlm_status(status), dlm_nodeid(lkbnodeid), + ownpid, procname); + if (summary) + print_it(NULL, ")", NULL); + } else if (grmode == 0) { + continue; /* ignore "D: Granted NL on node X" */ + } else { + procname = getprocname(ownpid); + if (summary) + print_it(NULL, " (", NULL); + else + print_it(NULL, " D: ", NULL); + print_it(NULL, "%s %s on %s to pid %d %s", NULL, + dlm_status(status), dlm_grtype(grmode), + dlm_nodeid(lkbnodeid), ownpid, procname); + if (summary) + print_it(NULL, ")", NULL); + } + if (!summary) + eol(0); + } +} + /* Print up to MAX_CALLTRACE_LINES lines of /proc/PID/stack as C: lines, skipping frames that contain gfs2_glock_. PID is the text that starts two characters after the first p on the holder line and ends at the next space; nothing is printed when it is missing or does not fit in tmp. */ +static void print_call_trace(const char *hline) +{ + char *p, *pid, tmp[32], stackfn[64], str[96]; + FILE *fp; + int i; + + p = strchr(hline, 'p'); + if (!p || p[1] == '\0') /* p + 2 must not step over the terminator */ + return; + pid = p + 2; + p = strchr(pid, ' '); + if (!p || (size_t)(p - pid) >= sizeof(tmp)) /* keep the copy inside tmp, terminator included */ + return; + memset(tmp, 0, sizeof(tmp)); + memcpy(tmp, pid, p - pid); +
sprintf(stackfn, "/proc/%s/stack", tmp); + fp = fopen(stackfn, "rt"); + if (fp == NULL) + return; + for (i = 0; i < MAX_CALLTRACE_LINES; i++) { + if (fgets(str, sizeof(str) - 1, fp) == NULL) + break; + if (strstr(str, "gfs2_glock_")) { /* skip lines we don't + care about*/ + i--; + continue; + } + p = strchr(str, '\n'); + if (p) + *p = '\0'; + p = strchr(str, ']'); + if (p && p[1] != '\0') /* skip the bracketed prefix and the character after it, but never step past the terminator */ + p += 2; + else + p = str; + print_it(NULL, " C: %s ", NULL, p); + eol(0); + } + fclose(fp); +} + /* Return 1 when the holder line starts with the exact 9-character EX holder prefix, else 0. */ +static int is_ex(const char *hline) +{ + if (strncmp(hline, " H: s:EX ", 9) == 0) + return 1; + return 0; +} + /* Return 1 when flag occurs in the holder flags of hline, else 0. The flags are the characters between the colon that directly follows the first f on the line and the next space. */ +static int has_holder_flag(const char *hline, char flag) +{ + const char *p; + + p = strchr(hline, 'f'); + if (p == NULL) + return 0; + p++; + if (*p != ':') + return 0; + p++; + while (*p != '\0') { + if (*p == ' ') + return 0; + if (*p == flag) + return 1; + p++; + } + return 0; +} + /* Return 1 when the holder line carries the H flag. */ +static int is_holder(const char *hline) +{ + return has_holder_flag(hline, 'H'); +} + /* Return 1 when the holder line carries the W flag. */ +static int is_waiter(const char *hline) +{ + return has_holder_flag(hline, 'W'); +} + /* Return the lock type digit that precedes the first slash in str. Returns 0 when there is no slash, when the slash is the first character, or when the character before it is not a decimal digit, so the result is always 0-9 and safe to use as a table index. */ +static int get_lock_type(const char *str) +{ + const char *p; + + p = strchr(str, '/'); + return (p && p != str && p[-1] >= '0' && p[-1] <= '9' ?
(*(p - 1)) - '0' : 0); +} + /* Return the number that follows the second slash on the line, read up to the next space (presumably the demote time of a G: line - confirm). Returns 0 when either slash or the terminating space is missing. */ +static long long get_demote_time(const char *str) +{ + char *p; + char tmp[80]; + + p = strchr(str, '/'); + if (p == NULL) + return 0; + p++; + p = strchr(p, '/'); + if (p == NULL) + return 0; + p++; + strncpy(tmp, p, 79); + tmp[79] = '\0'; /* strncpy leaves tmp unterminated when the source has 79 or more bytes */ + p = strchr(tmp, ' '); + if (p == NULL) + return 0; + *p = '\0'; + return atoll(tmp); +} + /* Copy the text that starts two characters after the first p in str into a static buffer, cut just after the first closing bracket if there is one. Returns an empty string when str has no p. The buffer is overwritten by the next call. NOTE(review): p + 2 is not checked against the end of str. */ +static const char *pid_string(const char *str) +{ + char *p; + static char pidstr[80]; + + memset(pidstr, 0, sizeof(pidstr)); + p = strchr(str, 'p'); + if (p) { + strncpy(pidstr, p + 2, sizeof(pidstr)); + pidstr[79] = '\0'; + p = strchr(pidstr, ']'); + if (p) { + p++; + *p = '\0'; + } + } + return pidstr; +} + +/* If this glock is relevant, return 0, else the reason it is irrelevant: 1 the holder is not EX, 2-5 the holder line names one of the mount-time functions below, 6 the glock is a journal glock (type 9). The codes index the table in reason(). */ +static int irrelevant(const char *holder, const char *glockstr) +{ + int lock_type = get_lock_type(glockstr); + + /* Exclude holders that are not in the EX state */ + if (!is_ex(holder)) + return 1; + /* Exclude locks held at mount time: statfs*/ + if (strstr(holder, "init_per_node")) + return 2; + if (strstr(holder, "init_journal")) + return 3; + if (strstr(holder, "init_inodes")) + return 4; + if (strstr(holder, "fill_super")) + return 5; + if (lock_type == 9) /* Exclude journal locks */ + return 6; + return 0; +} + /* Map a code returned by irrelevant() to its fixed-width label. why is used as an index without a bounds check, so it must be 0-6. */ +static const char *reason(int why) +{ + const char *reasons[] = {"(N/A:------)", /* 0 */ + "(N/A:Not EX)", /* 1 */ + "(N/A:System)", /* 2 */ + "(N/A:journl)", /* 3 */ + "(N/A:System)", /* 4 */ + "(N/A:System)", /* 5 */ + "(N/A:Journl)"}; /* 6 */ + + return reasons[why]; +} + /* Print the U: prefix of a friendly line, including the reason label when the glock is irrelevant. Only line 1 (first line after the G: line) is examined as the holder. */ +static void print_friendly_prefix(char one_glocks_lines[MAX_LINES][97]) +{ + int why = irrelevant(one_glocks_lines[1], one_glocks_lines[0]); + + if (why) + print_it(NULL, " U: %s ", NULL, reason(why)); + else + print_it(NULL, " U: ", NULL); +} + /* Print one glock (line 0 is its G: line, lines 1 to gline - 1 the lines that followed it) in the form selected by flags: DETAILS prints the raw lines, FRIENDLY prints a decoded summary plus one line per holder or waiter with DLM status and call trace. */ +static void show_glock(char one_glocks_lines[MAX_LINES][97], int gline, + const char *fsname, int dlmwaiters, int dlmgrants, + int trace_dir_path, int prev_had_waiter, int flags, + int summary) +{ + int i, locktype =
0; + char id[33], *p; + char extras[80], prefix = '\0'; + long long demote_time = 0; + const char *ltype[] = {"N/A", "non-disk", "inode", "rgrp", "meta", + "i_open", "flock", "posix lock", "quota", + "journal"}; /* label for each lock type digit 0-9 */ + /* When termlines is set, choose the colour pair before anything is printed. */ + if (termlines) { + if (irrelevant(one_glocks_lines[1], one_glocks_lines[0])) + COLORS_HELD; + else + COLORS_NORMAL; + } + if (!gline) + return; + /* Decode line 0: id is the text between the first slash and the next space, extras is the lock type label, or for inodes (type 2) whatever show_details() returns followed by the word inode. */ + memset(extras, 0, sizeof(extras)); + p = strchr(one_glocks_lines[0], '/'); + memset(id, 0, sizeof(id)); + + if (p) { + locktype = get_lock_type(one_glocks_lines[0]); + demote_time = get_demote_time(one_glocks_lines[0]); + p++; + strncpy(id, p, sizeof(id) - 1); + id[sizeof(id) - 1] = '\0'; + p = strchr(id, ' '); + if (p) + *p = '\0'; + + if (locktype != 2) { + strncpy(extras, ltype[locktype], 79); /* locktype indexes the 10-entry table without a range check here */ + extras[79] = '\0'; + } else { + const char *i_type = show_details(id, fsname, 2, + trace_dir_path); + sprintf(extras, "%sinode", i_type); + } + } + if (flags & DETAILS) { /* raw G: line, its label, then the matching DLM locks one per line */ + print_it(NULL, " %s ", NULL, one_glocks_lines[0]); + print_it(NULL, "(%s)", NULL, extras); + if (demote_time) + print_it(NULL, " ** demote time is greater than 0 **", + NULL); + eol(0); + if (dlmgrants) + show_dlm_grants(locktype, one_glocks_lines[0], + dlmgrants, 0); + } + if (flags & FRIENDLY) { /* prefix ends up W if any H: line is a waiter, H if there are only holders, NUL if there is no H: line */ + print_friendly_prefix(one_glocks_lines); + for (i = 1; i < gline; i++) { + if (one_glocks_lines[i][0] == ' ' && + one_glocks_lines[i][1] == 'H' && + prefix != 'W') + prefix = (is_holder(one_glocks_lines[i]) ?
+ 'H' : 'W'); + } + print_it(NULL, " %c %-10.10s %-9.9s %s", NULL, prefix, + extras, id, friendly_glock(one_glocks_lines[0], + prefix)); + eol(0); + } + for (i = 1; i < gline; i++) { /* the lines that followed the G: line */ + if (!show_reservations && + one_glocks_lines[i][0] == ' ' && + one_glocks_lines[i][2] == 'B' && + one_glocks_lines[i][3] == ':') + continue; /* B: lines are shown only when reservations were requested */ + if (flags & DETAILS) { + print_it(NULL, " %-80.80s", NULL, one_glocks_lines[i]); + eol(0); + continue; + } + if ((flags & FRIENDLY) && + one_glocks_lines[i][1] == 'H') + print_friendly_prefix(one_glocks_lines); + + if (one_glocks_lines[i][0] == ' ' && + one_glocks_lines[i][1] == 'H') { + print_it(NULL, " %c ---> %s pid %s ", NULL, + prefix, (is_holder(one_glocks_lines[i]) ? + "held by" : "waiting"), + pid_string(one_glocks_lines[i])); + if (demote_time) + print_it(NULL, "** demote time is non-" + "zero ** ", NULL); + if (is_dlm_waiting(dlmwaiters, locktype, id)) { + print_it(NULL, "***** DLM is in a " + "comm wait for this lock " + "***** ", NULL); + } + show_dlm_grants(locktype, one_glocks_lines[0], + dlmgrants, 1); + eol(0); + print_call_trace(one_glocks_lines[i]); + } + } +} + /* Read the DLM waiters file into dlmwlines[], at most 79 characters per line, and return the number of lines stored. Reading stops at end of file or when every row of dlmwlines is in use, whichever comes first. fsname is unused. */ +static int parse_dlm_waiters(FILE *dlm, const char *fsname) +{ + int dlml = 0; + const int max_lines = sizeof(dlmwlines) / sizeof(dlmwlines[0]); /* row count of the table, taken from its declaration */ + memset(dlmwlines, 0, sizeof(dlmwlines)); + while (dlml < max_lines && fgets(dlmwlines[dlml], 80, dlm)) /* never write past the last row */ + dlml++; + + return dlml; +} + /* Read the DLM locks file through bufgets() and store the lines accepted by this_lkb_requested() in dlmglines[], 96 characters at most per line and MAX_LINES lines at most. Returns the number of lines stored. fsname is unused. */ +static int parse_dlm_grants(int dlmfd, const char *fsname) +{ + int dlml = 0; + char *dlmline; + + memset(dlmglines, 0, sizeof(dlmglines)); + dnextpos = NULL; + while ((dlmline = bufgets(dlmfd, dbuf, &dnextpos, &dpos, &dmaxpos))) { + if (!this_lkb_requested(dlmline)) + continue; + strncpy(dlmglines[dlml], dlmline, 96); + dlmglines[dlml][96] = '\0'; + dlml++; + if (dlml >= MAX_LINES) + break; + } + return dlml; +} + /* Print the summary table: one column per glock type plus a Total column (the plock and meta columns, 7 and 4, are never printed) and one row per counter. Row 10 of total_glocks is used as the running total of each counter and is modified by this function. */ +static void print_summary(int total_glocks[11][stypes], int dlmwaiters) +{ + int i; + int total_unlocked = 0; + const struct { + const char *name; + const int width; + } column[] = { + { "unknown", 7 }, { "nondisk", 7}, { "inode", 8 }, { "rgrp", 7 }, + { 
"meta", 4 }, { "iopen", 7 }, { "flock", 7 }, { "p", 1 }, + { "quota", 5 }, { "jrnl", 4 }, { "Total", 8 } + }; + const int ncols = sizeof(column) / sizeof(column[0]); + + /* Print column headers */ + print_it(NULL, "S glocks ", NULL); + for (i = 1; i < ncols; i++) + if (i != 7 && i != 4) /* Ignore plock and meta */ + print_it(NULL, "%*s ", NULL, column[i].width, column[i].name); + eol(0); + print_it(NULL, "S --------- ", NULL); + for (i = 1; i < ncols; i++) + if (i != 7 && i != 4) /* Ignore plock and meta */ + print_it(NULL, "%*s ", NULL, column[i].width, "--------"); + eol(0); + + /* Print rows. Unlocked is derived as all minus locked and summed in a local. Every other row adds columns 1-9 into total_glocks[10] as it goes, so when i reaches 10 the accumulated total is printed first and then added to itself; the doubled value is not read again in this function. */ + print_it(NULL, "S Unlocked: ", NULL); + for (i = 1; i < (ncols - 1); i++) { + if (i != 7 && i != 4) /* Ignore plock and meta */ + print_it(NULL, "%*d ", NULL, column[i].width, + total_glocks[i][all] - total_glocks[i][locked]); + total_unlocked += total_glocks[i][all] - + total_glocks[i][locked]; + } + print_it(NULL, "%*d ", NULL, column[i].width, total_unlocked); + eol(0); + print_it(NULL, "S Locked: ", NULL); + for (i = 1; i < ncols; i++) { + if (i != 7 && i != 4) /* Ignore plock and meta */ + print_it(NULL, "%*d ", NULL, column[i].width, + total_glocks[i][locked]); + total_glocks[10][locked] += total_glocks[i][locked]; + } + eol(0); + print_it(NULL, "S Total: ", NULL); + for (i = 1; i < ncols; i++) { + if (i != 7 && i != 4) /* Ignore plock and meta */ + print_it(NULL, "%*d ", NULL, column[i].width, + total_glocks[i][all]); + total_glocks[10][all] += total_glocks[i][all]; + } + eol(0); + print_it(NULL, "S", NULL); + eol(0); + print_it(NULL, "S Held EX: ", NULL); + for (i = 1; i < ncols; i++) { + if (i != 7 && i != 4) /* Ignore plock and meta */ + print_it(NULL, "%*d ", NULL, column[i].width, + total_glocks[i][held_ex]); + total_glocks[10][held_ex] += total_glocks[i][held_ex]; + } + eol(0); + print_it(NULL, "S Held SH: ", NULL); + for (i = 1; i < ncols; i++) { + if (i != 7 && i != 4) /* Ignore plock and meta */ + print_it(NULL, "%*d ", NULL, column[i].width, +
total_glocks[i][held_sh]); + total_glocks[10][held_sh] += total_glocks[i][held_sh]; + } + eol(0); + print_it(NULL, "S Held DF: ", NULL); + for (i = 1; i < ncols; i++) { + if (i != 7 && i != 4) /* Ignore plock and meta */ + print_it(NULL, "%*d ", NULL, column[i].width, + total_glocks[i][held_df]); + total_glocks[10][held_df] += total_glocks[i][held_df]; + } + eol(0); + print_it(NULL, "S G Waiting: ", NULL); /* number of glocks that have at least one waiter */ + for (i = 1; i < ncols; i++) { + if (i != 7 && i != 4) /* Ignore plock and meta */ + print_it(NULL, "%*d ", NULL, column[i].width, + total_glocks[i][has_waiter]); + total_glocks[10][has_waiter] += total_glocks[i][has_waiter]; + } + eol(0); + print_it(NULL, "S P Waiting: ", NULL); /* number of waiting holders summed over all glocks */ + for (i = 1; i < ncols; i++) { + if (i != 7 && i != 4) /* Ignore plock and meta */ + print_it(NULL, "%*d ", NULL, column[i].width, + total_glocks[i][tot_waiters]); + total_glocks[10][tot_waiters] += total_glocks[i][tot_waiters]; + } + eol(0); + print_it(NULL, "S DLM wait: %7d", NULL, dlmwaiters); + eol(0); + eol(0); +} + +/* Read the glocks file on fd line by line. Each G: line starts a new glock; the lines that follow it are collected in one_glocks_lines. While reading, per-type summary counters are accumulated, and every glock selected for display is printed twice by show_glock(), first with DETAILS and then with FRIENDLY. The summary table is printed when summary is non-zero and iters_done is a multiple of it. */ +static void glock_details(int fd, const char *fsname, int dlmwaiters, + int dlmgrants, int trace_dir_path, int show_held, + int summary) +{ + char *ln, *p; + char one_glocks_lines[MAX_LINES][97]; + int gline = 0; + int show_prev_glock = 0, prev_had_waiter = 0; + int total_glocks[11][stypes], locktype = 0; + int holders_this_glock_ex = 0; + int holders_this_glock_sh = 0; + int holders_this_glock_df = 0; + int waiters_this_glock = 0; + + memset(total_glocks, 0, sizeof(total_glocks)); + gnextpos = NULL; + while ((ln = bufgets(fd, gbuf, &gnextpos, &gpos, &gmaxpos))) { + if (ln[0] == ' ' && ln[1] == ' ' && ln[2] == ' ') + continue; /* lines indented by three spaces are ignored */ + if (ln[0] == 'G') { + /* Summary stuff------------------------------------ */ + if (waiters_this_glock) { /* fold the counters of the glock that just ended into the totals for its type */ + total_glocks[locktype][tot_waiters] += + waiters_this_glock; + total_glocks[locktype][has_waiter]++; + } + if (holders_this_glock_ex) + total_glocks[locktype][held_ex]++; + if
(holders_this_glock_sh) + total_glocks[locktype][held_sh]++; + if (holders_this_glock_df) + total_glocks[locktype][held_df]++; + locktype = get_lock_type(ln); /* from here on the counters belong to the new glock */ + p = ln + 6; /* NOTE(review): presumably the two state letters of the G: line; the fixed offset is not validated - confirm */ + if (*p != 'U' || *(p + 1) != 'N') + total_glocks[locktype][locked]++; + total_glocks[locktype][all]++; + holders_this_glock_ex = 0; + holders_this_glock_sh = 0; + holders_this_glock_df = 0; + waiters_this_glock = 0; + /* Detail stuff------------------------------------- */ + if (show_prev_glock) { /* the previous glock was selected: print it in both forms, then clear the line buffer */ + show_glock(one_glocks_lines, gline, fsname, + dlmwaiters, dlmgrants, + trace_dir_path, prev_had_waiter, + DETAILS, summary); + show_glock(one_glocks_lines, gline, fsname, + dlmwaiters, dlmgrants, + trace_dir_path, prev_had_waiter, + FRIENDLY, summary); + memset(one_glocks_lines, 0, + sizeof(one_glocks_lines)); + show_prev_glock = 0; + } + prev_had_waiter = 0; + gline = 0; + if (this_glock_requested(ln)) + show_prev_glock = 1; + } else if (ln[0] == ' ' && ln[1] == 'H') { + char *flag = strchr(ln, 'f'); + char *mode = strchr(ln, 's'); + /* Walk the characters after the first f on the line until a W, an H or a space ends the scan. NOTE(review): a terminating NUL falls into the default case and does not stop the loop, so a holder line with no space after its flags would be read past its end - confirm the input always has one. */ + /* Summary stuff------------------------------------ */ + while (flag) { + flag++; + switch (*flag) { + case ':': + break; + case 'W': + waiters_this_glock++; + flag = NULL; + break; + case 'H': /* a holder: classify by the state that follows the first s on the line, anything other than EX or SH counts as DF */ + flag = NULL; + if (mode == NULL) + holders_this_glock_df++; + else if (*(mode + 1) == ':' && + *(mode + 2) == 'E' && + *(mode + 3) == 'X') + holders_this_glock_ex++; + else if (*(mode + 1) == ':' && + *(mode + 2) == 'S' && + *(mode + 3) == 'H') + holders_this_glock_sh++; + else + holders_this_glock_df++; + break; + case ' ': + flag = NULL; + break; + default: + break; + }; + } + /* Detail stuff------------------------------------- */ + if (!glocks) { /* no glocks were named on the command line: select by waiter, by holder when show_held is set (iopen excluded), or by relevance */ + int haswaiter = is_waiter(ln); + + if (haswaiter) { + show_prev_glock = 1; + prev_had_waiter = 1; + } else if (show_held && is_holder(ln) && + !is_iopen(one_glocks_lines[0])) { + show_prev_glock = 1; + } else if (!irrelevant(ln, one_glocks_lines[0])) { + show_prev_glock = 1; + } + } + } + /* Detail
stuff--------------------------------------------- */ + strncpy(one_glocks_lines[gline], ln, 96); /* remember this line, 96 characters at most, as part of the current glock */ + one_glocks_lines[gline][96] = '\0'; + gline++; + if (gline >= MAX_LINES) + break; + if (termlines && line >= termlines) + break; + } /* NOTE(review): the holder and waiter counters of the last glock read are never folded into total_glocks, because that only happens when the next G: line arrives - confirm whether the summary should include it. */ + /* Detail stuff----------------------------------------------------- */ + if (show_prev_glock && gline < MAX_LINES && + (!termlines || line < termlines)) { + show_glock(one_glocks_lines, gline, fsname, dlmwaiters, + dlmgrants, trace_dir_path, prev_had_waiter, + DETAILS, summary); + show_glock(one_glocks_lines, gline, fsname, dlmwaiters, + dlmgrants, trace_dir_path, prev_had_waiter, + FRIENDLY, summary); + } + if (!summary || ((iters_done % summary) != 0)) + return; /* the summary is printed only every summary-th iteration */ + print_summary(total_glocks, dlmwaiters); +} + /* Print the key to the glock flag letters (help == 1) or to the holder flag letters (help == 2), two entries per output line. Any other value prints nothing. */ +static void show_help(int help) +{ + if (help == 1) { + COLORS_NORMAL; + eol(0); + print_it(NULL, " Glock flags: ", NULL); + eol(0); + print_it(NULL, " l - Locked ", NULL); + print_it(NULL, " r - Reply pending ", NULL); + eol(0); + print_it(NULL, " d - Demote pending ", NULL); + print_it(NULL, " I - Initial ", NULL); + eol(0); + print_it(NULL, " D - Demote requested ", NULL); + print_it(NULL, " F - Frozen ", NULL); + eol(0); + print_it(NULL, " p - Demote in progress ", NULL); + print_it(NULL, " q - Queued holder ", NULL); + eol(0); + print_it(NULL, " y - Dirty data ", NULL); + print_it(NULL, " L - LRU ", NULL); + eol(0); + print_it(NULL, " f - Flush ", NULL); + print_it(NULL, " o - Object present ", NULL); + eol(0); + print_it(NULL, " i - Invalidating ", NULL); + print_it(NULL, " b - Blocking request ", NULL); + eol(0); + } else if (help == 2) { + COLORS_NORMAL; + eol(0); + print_it(NULL, " Holder flags: ", NULL); + eol(0); + print_it(NULL, " t - Try (non-blocking) ", NULL); + print_it(NULL, " E - Exact lock ", NULL); + eol(0); + print_it(NULL, " T - Try with callback ", NULL); + print_it(NULL, " c - No Cache lock ", NULL); + eol(0); + print_it(NULL, " e - No exp ", NULL); + print_it(NULL, " H - Held (locked) ", NULL); +
eol(0); + print_it(NULL, " A - Any lock ", NULL); + print_it(NULL, " W - Waiting for lock ", NULL); + eol(0); + print_it(NULL, " p - Priority lock ", NULL); + print_it(NULL, " a - Asynchronous lock ", NULL); + eol(0); + print_it(NULL, " F - First ", NULL); + eol(0); + } +} + +/* Print the bold title line for one filesystem (name, local time, host name and, when there are DLM waiters, the dlm table sizes with a gauge holding one star per waiter padded to 10 characters), then the glock report from glock_details() and the help key selected by help. Both title buffers are written with bounded calls so a long name or a large waiter count is truncated instead of overflowing them. */ +static void parse_glocks_file(int fd, const char *fsname, int dlmwaiters, + int dlmgrants, int trace_dir_path, + int show_held, int help, int summary) +{ + char fstitle[96], fsdlm[105]; + char ctimestr[64]; + time_t t; + int i; + + tzset(); + t = time(NULL); + strftime(ctimestr, 64, "%a %b %d %T %Y", localtime(&t)); + ctimestr[63] = '\0'; + memset(fstitle, 0, sizeof(fstitle)); + memset(fsdlm, 0, sizeof(fsdlm)); + snprintf(fstitle, sizeof(fstitle), "@ %s %s ", fsname, ctimestr); + if (dlmwaiters) { + snprintf(fsdlm, sizeof(fsdlm) - 1, "dlm: %s/%s/%s [", dlm_dirtbl_size, /* one byte held back for the closing bracket */ + dlm_rsbtbl_size, dlm_lkbtbl_size); + for (i = 0; i < dlmwaiters && strlen(fsdlm) + 2 < sizeof(fsdlm); i++) /* stop while the bracket and the terminator still fit */ + strcat(fsdlm, "*"); + for (; i < 10 && strlen(fsdlm) + 2 < sizeof(fsdlm); i++) + strcat(fsdlm, " "); + strcat(fsdlm, "]"); + } + attron(A_BOLD); + print_it(NULL, "%s @%s %s", NULL, fstitle, hostname, fsdlm); + eol(0); + attroff(A_BOLD); + glock_details(fd, fsname, dlmwaiters, dlmgrants, trace_dir_path, + show_held, summary); + + show_help(help); + if (termlines) + refresh(); +} + /* Print the command line summary on stdout and exit with status 0. */ +static void usage(void) +{ + printf("Usage:\n"); + printf("glocktop [-i] [-d ] [-n ] [-sX] [-c] [-D] [-H] [-r] [-t]\n"); + printf("\n"); + printf("-i : Runs glocktop in interactive mode.\n"); + printf("-d : delay between refreshes, in seconds (default: %d).\n", REFRESH_TIME); + printf("-n : stop after refreshes.\n"); + printf("-H : don't show Held glocks, even if not waited on, excluding " + "iopen\n"); + printf("-r : show reservations when rgrp glocks are displayed\n"); + printf("-s : show glock summary information every X iterations\n"); + printf("-t : trace directory glocks back\n"); + printf("-D : don't show DLM lock status\n"); + printf("\n"); + fflush(stdout); + exit(0); +} + /* Program entry point: parse the options, set up the terminal and the read buffers, then report on every filesystem under the gfs2 debugfs directory once per refresh interval until told to stop. */ +int main(int argc, char
**argv) +{ + int fd; + DIR *dir = NULL; + char *fn; + struct dirent *dent; + int retval; + int refresh_time = REFRESH_TIME; + fd_set readfds; + char string[96]; + int ch, i, dlmwaiters = 0, dlmgrants = 0; + int cont = TRUE, optchar; + int trace_dir_path = 0; + int show_held = 1, help = 0; + int interactive = 0; + int summary = 10; /* print the summary table every 10th iteration unless -s says otherwise */ + int nfds = STDIN_FILENO + 1; /* first argument of select() for watching stdin */ + /* Reset the file-level tables before any option can add to them. */ + prog_name = argv[0]; + memset(glock, 0, sizeof(glock)); + memset(contended_filenames, 0, sizeof(contended_filenames)); + memset(contended_blocks, 0, sizeof(contended_blocks)); + UpdateSize(0); + /* decode command line arguments */ + while (cont) { + optchar = getopt(argc, argv, "-d:Dn:rs:thHi"); + /* The leading - in the option string makes GNU getopt return every non-option argument as option code 1; those are stored in glock[] below. */ + switch (optchar) { + case 'd': + refresh_time = atoi(optarg); + if (refresh_time < 1) { + fprintf(stderr, "Error: delay %d too small; " + "must be at least 1\n", refresh_time); + exit(-1); + } + break; + case 'D': + print_dlm_grants = 0; + break; + case 'n': + iterations = atoi(optarg); + break; + case 'r': + show_reservations = 1; + break; + case 's': + summary = atoi(optarg); + break; + case 't': + trace_dir_path = 1; + break; + case 'h': + usage(); + break; + case 'H': + show_held = 0; /* held, but not iopen held */ + break; + case 'i': + interactive = 1; + break; + case EOF: + cont = FALSE; + break; + case 1: + if (optarg && glocks < MAX_GLOCKS) /* arguments beyond MAX_GLOCKS are silently dropped */ + glock[glocks++] = optarg; + break; + + default: + fprintf(stderr, "unknown option: %c\n", optchar); + exit(-1); + }; + } + + if (interactive) { + printf("Initializing.
Please wait..."); + fflush(stdout); + } + if (gethostname(hostname, sizeof(hostname))) { + fprintf(stderr, "Error: unable to determine host name.\n"); + exit(-1); + } + if (parse_mounts()) + exit(-1); + + if (interactive && (wind = initscr()) == NULL) { + fprintf(stderr, "Error: unable to initialize screen.\n"); + exit(-1); + } + + if (interactive) { + /* Do our initial screen stuff: */ + signal(SIGWINCH, UpdateSize); /* handle term resize signal */ + UpdateSize(0); /* update screen size based on term settings */ + clear(); /* don't use Erase */ + start_color(); + noecho(); + keypad(stdscr, TRUE); + raw(); + curs_set(0); + init_colors(); + } else { + termlines = 0; + } + while (!gbuf) { + gbuf = malloc(bufsize); + if (gbuf) { + /*printf("bufsize=%dK\n", bufsize / 1024);*/ + break; + } + bufsize /= 2; + } + while (!dbuf) { + dbuf = malloc(bufsize); + if (dbuf) { + /*printf("bufsize=%dK\n", bufsize / 1024);*/ + break; + } + bufsize /= 2; + } + + while (!done) { + struct timeval tv; + + if (asprintf(&fn, "%s/gfs2/", debugfs) == -1) { + perror(argv[0]); + exit(-1); + } + dir = opendir(fn); + free(fn); + + if (!dir) { + if (interactive) { + refresh(); + endwin(); + } + fprintf(stderr, "Unable to open gfs2 debugfs directory.\n"); + fprintf(stderr, "Check if debugfs and gfs2 are mounted.\n"); + exit(-1); + } + display_title_lines(); + while ((dent = readdir(dir))) { + const char *fsname; + char dlm_fn[PATH_MAX+5+8]; /* "/dlm/" and "_waiters" */ + FILE *dlmf; + int dlmfd; + + if (!strcmp(dent->d_name, ".")) + continue; + if (!strcmp(dent->d_name, "..")) + continue; + + fsname = strchr(dent->d_name, ':'); + if (fsname) + fsname++; + else + fsname = dent->d_name; + + memset(dlm_fn, 0, sizeof(dlm_fn)); + sprintf(dlm_fn, "%s/dlm/%s_waiters", debugfs, fsname); + dlmf = fopen(dlm_fn, "rt"); + if (dlmf) { + dlmwaiters = parse_dlm_waiters(dlmf, fsname); + fclose(dlmf); + } + + if (print_dlm_grants) { + memset(dlm_fn, 0, sizeof(dlm_fn)); + sprintf(dlm_fn, "%s/dlm/%s_locks", 
debugfs, + fsname); + dlmfd = open(dlm_fn, O_RDONLY); + if (dlmfd >= 0) { /* 0 is a valid descriptor; only a negative value means failure */ + dlmgrants = parse_dlm_grants(dlmfd, + fsname); + close(dlmfd); + } + } + + if (asprintf(&fn, "%s/gfs2/%s/glocks", debugfs, dent->d_name) == -1) { + perror(argv[0]); + exit(-1); + } + fd = open(fn, O_RDONLY); + if (fd < 0) { + if (interactive) { + refresh(); + endwin(); + } + perror(fn); + free(fn); + exit(-1); + } + free(fn); + parse_glocks_file(fd, fsname, dlmwaiters, dlmgrants, + trace_dir_path, show_held, help, + summary); + close(fd); + } + closedir(dir); + tv.tv_sec = refresh_time; /* wait one refresh interval, or less if a key arrives */ + tv.tv_usec = 0; + FD_ZERO(&readfds); + if (nfds != 0) + FD_SET(STDIN_FILENO, &readfds); + retval = select(nfds, &readfds, NULL, NULL, &tv); + if (retval > 0) { /* read a key only when stdin is readable; -1 (for example a resize signal interrupting the wait) must not lead to a blocking read */ + if (interactive) + ch = getch(); + else + ch = getchar(); + switch (ch) { + case 0x1b: /* mount wheel? */ + case 0x03: + case 'q': + done = 1; + break; + case 'h': + help = (help + 1) % 3; /* cycle: no help, glock flags, holder flags */ + break; + case 's': + if (!interactive) + break; + move(1, 0); + printw("Change delay from %d to: ", + refresh_time); + if (bobgets(string, 1, 25, 5, &ch) == 1) + refresh_time = atoi(string); + if (refresh_time < 1) + refresh_time = 1; + break; + /* When we get EOF on stdin, remove it from the fd_set + to avoid shorting out the select() */ + case EOF: + nfds = 0; + break; + } + } + iters_done++; + if (iterations && iters_done >= iterations) + break; + } + for (i = 0; i < mounted; i++) + close(fs_fd[i]); + free(gbuf); + free(dbuf); + if (interactive) { + refresh(); + endwin(); + } + exit(0); +} diff --git a/gfs2/include/Makefile.am b/gfs2/include/Makefile.am new file mode 100644 index 0000000..fca6f6a --- /dev/null +++ b/gfs2/include/Makefile.am @@ -0,0 +1,7 @@ +MAINTAINERCLEANFILES = Makefile.in + +noinst_HEADERS = \ + osi_list.h \ + osi_tree.h \ + linux_endian.h \ + logging.h diff --git a/gfs2/include/linux_endian.h b/gfs2/include/linux_endian.h new file mode 100644 index 0000000..43089d2 --- /dev/null +++ b/gfs2/include/linux_endian.h @@ -0,0 +1,68 @@ +#ifndef
__LINUX_ENDIAN_DOT_H__ +#define __LINUX_ENDIAN_DOT_H__ + /* Byte order conversion macros in the style of the kernel helpers. The header names on the two include lines below had been lost; they are restored from what this file uses: the __BYTE_ORDER constants and the bswap_16, bswap_32 and bswap_64 helpers. */ + +#include <endian.h> +#include <byteswap.h> + + +/* I'm not sure which versions of alpha glibc/gcc are broken, + so fix all of them. */ +#ifdef __alpha__ +#undef bswap_64 +static __inline__ unsigned long bswap_64(unsigned long x) +{ + unsigned int h = x >> 32; + unsigned int l = x; + + h = bswap_32(h); + l = bswap_32(l); + + return ((unsigned long)l << 32) | h; +} +#endif /* __alpha__ */ + /* Big-endian host: the be conversions are identities and the le conversions swap bytes. */ + +#if __BYTE_ORDER == __BIG_ENDIAN + +#define be16_to_cpu(x) (x) +#define be32_to_cpu(x) (x) +#define be64_to_cpu(x) (x) + +#define cpu_to_be16(x) (x) +#define cpu_to_be32(x) (x) +#define cpu_to_be64(x) (x) + +#define le16_to_cpu(x) (bswap_16((x))) +#define le32_to_cpu(x) (bswap_32((x))) +#define le64_to_cpu(x) (bswap_64((x))) + +#define cpu_to_le16(x) (bswap_16((x))) +#define cpu_to_le32(x) (bswap_32((x))) +#define cpu_to_le64(x) (bswap_64((x))) + +#endif /* __BYTE_ORDER == __BIG_ENDIAN */ + /* Little-endian host: the be conversions swap bytes and the le conversions are identities. */ + +#if __BYTE_ORDER == __LITTLE_ENDIAN + +#define be16_to_cpu(x) (bswap_16((x))) +#define be32_to_cpu(x) (bswap_32((x))) +#define be64_to_cpu(x) (bswap_64((x))) + +#define cpu_to_be16(x) (bswap_16((x))) +#define cpu_to_be32(x) (bswap_32((x))) +#define cpu_to_be64(x) (bswap_64((x))) + +#define le16_to_cpu(x) (x) +#define le32_to_cpu(x) (x) +#define le64_to_cpu(x) (x) + +#define cpu_to_le16(x) (x) +#define cpu_to_le32(x) (x) +#define cpu_to_le64(x) (x) + +#endif /* __BYTE_ORDER == __LITTLE_ENDIAN */ + + +#endif /* __LINUX_ENDIAN_DOT_H__ */ diff --git a/gfs2/include/logging.h b/gfs2/include/logging.h new file mode 100644 index 0000000..18b5832 --- /dev/null +++ b/gfs2/include/logging.h @@ -0,0 +1,36 @@ +#ifndef __LOGGING_H__ +#define __LOGGING_H__ + /* Level-filtered logging macros: a message is printed when print_level is at least the level of the macro used. */ +extern int print_level; +#define increase_verbosity() do { print_level++; } while(0) +#define decrease_verbosity() do { print_level--; } while(0) + /* Message levels, most verbose first. */ +#define MSG_DEBUG 7 +#define MSG_INFO 6 +#define MSG_NOTICE 5 +#define MSG_WARN 4 +#define MSG_ERROR 3 +#define MSG_CRITICAL 2 +#define MSG_NULL 1 + +#define
log_debug(format...) \ + do { if (print_level >= MSG_DEBUG) { \ + printf("(%s:%d) ", __FUNCTION__, __LINE__); \ + printf(format); } } while(0) + /* log_info, log_notice and log_warn print to stdout at their level. */ +#define log_info(format...) \ + do { if (print_level >= MSG_INFO) printf(format); } while(0) + +#define log_notice(format...) \ + do { if (print_level >= MSG_NOTICE) printf(format); } while(0) + +#define log_warn(format...) \ + do { if (print_level >= MSG_WARN) printf(format); } while(0) + /* log_err and log_crit print to stderr at their level. */ +#define log_err(format...) \ + do { if (print_level >= MSG_ERROR) fprintf(stderr, format); } while(0) + +#define log_crit(format...) \ + do { if (print_level >= MSG_CRITICAL) fprintf(stderr, format); } while(0) + +#endif /* __LOGGING_H__ */ diff --git a/gfs2/include/osi_list.h b/gfs2/include/osi_list.h new file mode 100644 index 0000000..3b1483b --- /dev/null +++ b/gfs2/include/osi_list.h @@ -0,0 +1,84 @@ +#ifndef __OSI_LIST_DOT_H__ +#define __OSI_LIST_DOT_H__ + + /* Circular doubly linked list. The link structure is embedded in the caller's own structure; an empty list is a head whose next and prev point at itself. */ + +struct osi_list +{ + struct osi_list *next, *prev; +}; +typedef struct osi_list osi_list_t; + + /* osi_list_decl defines an empty head named var. osi_list_empty tests a head pointer. osi_list_entry turns a pointer to the embedded link (var) back into a pointer to the containing type by subtracting the offset of member mem. */ + +#define osi_list_decl(var) osi_list_t var = { &var, &var } + +#define osi_list_empty(var) ((var)->next == (var)) +#define osi_list_entry(var, type, mem) ((type *)((unsigned long)(var) - (unsigned long)(&((type *)NULL)->mem))) + + /* The macros below evaluate each argument exactly once by copying it into a local first. osi_list_init makes head an empty list. */ + +#define osi_list_init(head) \ +do \ +{ \ + osi_list_t *osi_list_var = (head); \ + osi_list_var->next = osi_list_var->prev = osi_list_var; \ +} \ +while (0) + /* osi_list_add links new directly after head. */ +#define osi_list_add(new, head) \ +do \ +{ \ + osi_list_t *osi_list_var_new = (new); \ + osi_list_t *osi_list_var_head = (head); \ + osi_list_var_new->next = osi_list_var_head->next; \ + osi_list_var_new->prev = osi_list_var_head; \ + osi_list_var_head->next->prev = osi_list_var_new; \ + osi_list_var_head->next = osi_list_var_new; \ +} \ +while (0) + +#define osi_list_add_next osi_list_add + /* osi_list_add_prev links new directly before head, which appends to the list when head is the list head. */ +#define osi_list_add_prev(new, head) \ +do \ +{ \ + osi_list_t *osi_list_var_new = (new); \ + osi_list_t *osi_list_var_head = (head); \ + osi_list_var_new->prev = osi_list_var_head->prev; \ +
osi_list_var_new->next = osi_list_var_head; \ + osi_list_var_head->prev->next = osi_list_var_new; \ + osi_list_var_head->prev = osi_list_var_new; \ +} \ +while (0) + /* osi_list_del unlinks var from its neighbours but leaves the pointers inside var unchanged. */ +#define osi_list_del(var) \ +do \ +{ \ + osi_list_t *osi_list_var = (var); \ + osi_list_var->next->prev = osi_list_var->prev; \ + osi_list_var->prev->next = osi_list_var->next; \ +} \ +while (0) + /* osi_list_del_init unlinks var and then makes it an empty list of its own. */ +#define osi_list_del_init(var) \ +do \ +{ \ + osi_list_t *osi_list_var = (var); \ + osi_list_var->next->prev = osi_list_var->prev; \ + osi_list_var->prev->next = osi_list_var->next; \ + osi_list_var->next = osi_list_var->prev = osi_list_var; \ +} \ +while (0) + /* osi_list_foreach walks every entry after head. The _safe form reads the next pointer into x before the loop body runs, so the body may unlink tmp. */ +#define osi_list_foreach(tmp, head) \ + for ((tmp) = (head)->next; (tmp) != (head); (tmp) = (tmp)->next) + +#define osi_list_foreach_safe(tmp, head, x) \ + for ((tmp) = (head)->next, (x) = (tmp)->next; \ + (tmp) != (head); \ + (tmp) = (x), (x) = (x)->next) + + + +#endif /* __OSI_LIST_DOT_H__ */ diff --git a/gfs2/include/osi_tree.h b/gfs2/include/osi_tree.h new file mode 100644 index 0000000..eca04a0 --- /dev/null +++ b/gfs2/include/osi_tree.h @@ -0,0 +1,404 @@ +#ifndef __OSI_RBTREE_DOT_H__ +#define __OSI_RBTREE_DOT_H__ + +#include +#include +#include + +/* Adapted from the kernel's rbtree.c */ +struct osi_node { + unsigned long osi_parent_color; /* parent pointer and node colour packed into one word; the colour is bit 0 */ +#define OSI_RED 0 +#define OSI_BLACK 1 + struct osi_node *osi_left; + struct osi_node *osi_right; +}; + /* Accessors for the packed word: osi_parent masks off the two low bits to recover the pointer, osi_color reads bit 0, and the set macros change bit 0 only. */ +#define osi_parent(r) ((struct osi_node *)((r)->osi_parent_color & ~3)) +#define osi_color(r) ((r)->osi_parent_color & 1) +#define osi_is_red(r) (!osi_color(r)) +#define osi_is_black(r) osi_color(r) +#define osi_set_red(r) do { (r)->osi_parent_color &= ~1; } while (0) +#define osi_set_black(r) do { (r)->osi_parent_color |= 1; } while (0) +#define OSI_EMPTY_NODE(node) (osi_parent(node) == node) + /* A tree is identified by a pointer to its root node; NULL means the tree is empty. */ +struct osi_root +{ + struct osi_node *osi_node; +}; + +#define OSI_EMPTY_ROOT(root) ((root)->osi_node == NULL) + /* Replace the parent pointer of rb with p while keeping the two low bits of the packed word. */ +static inline void osi_set_parent(struct osi_node *rb, struct osi_node *p) +{ +
rb->osi_parent_color = (rb->osi_parent_color & 3) | (unsigned long)p; +} + /* Set the colour bit of rb to color (OSI_RED or OSI_BLACK) without touching the parent pointer. */ +static inline void osi_set_color(struct osi_node *rb, int color) +{ + rb->osi_parent_color = (rb->osi_parent_color & ~1) | color; +} + /* Attach node as a new leaf under parent by storing it through osi_link, the address of the child pointer it should occupy. Storing the bare parent pointer leaves the colour bit clear, so the node starts out red; the caller is expected to rebalance afterwards (osi_insert_color below does that). */ +static inline void osi_link_node(struct osi_node *node, + struct osi_node *parent, + struct osi_node **osi_link) +{ + node->osi_parent_color = (unsigned long )parent; + node->osi_left = node->osi_right = NULL; + + *osi_link = node; +} + /* Rotate left around node: its right child takes its place and node becomes that child's left child. The root pointer is updated when node was the root. node must have a right child. */ +static inline void __osi_rotate_left(struct osi_node *node, + struct osi_root *root) +{ + struct osi_node *right = node->osi_right; + struct osi_node *parent = osi_parent(node); + + if ((node->osi_right = right->osi_left)) + osi_set_parent(right->osi_left, node); + right->osi_left = node; + + osi_set_parent(right, parent); + + if (parent) { + if (node == parent->osi_left) + parent->osi_left = right; + else + parent->osi_right = right; + } + else + root->osi_node = right; + osi_set_parent(node, right); +} + /* Mirror image of __osi_rotate_left: the left child of node takes its place and node becomes that child's right child. node must have a left child. */ +static inline void __osi_rotate_right(struct osi_node *node, + struct osi_root *root) +{ + struct osi_node *left = node->osi_left; + struct osi_node *parent = osi_parent(node); + + if ((node->osi_left = left->osi_right)) + osi_set_parent(left->osi_right, node); + left->osi_right = node; + + osi_set_parent(left, parent); + + if (parent) { + if (node == parent->osi_right) + parent->osi_right = left; + else + parent->osi_left = left; + } else + root->osi_node = left; + osi_set_parent(node, left); +} + /* Restore the red-black colouring after node has been linked in as a red leaf. While the parent is red: a red uncle is handled by recolouring and continuing from the grandparent, a black or missing uncle by one or two rotations. The root is always black on return. */ +static inline void osi_insert_color(struct osi_node *node, + struct osi_root *root) +{ + struct osi_node *parent, *gparent; + + while ((parent = osi_parent(node)) && osi_is_red(parent)) { + gparent = osi_parent(parent); + + if (parent == gparent->osi_left) { + { + register struct osi_node *uncle = gparent->osi_right; + if (uncle && osi_is_red(uncle)) { + osi_set_black(uncle); + osi_set_black(parent); + osi_set_red(gparent); + node = gparent; + continue; + } + } + + if (parent->osi_right == node) { + register struct osi_node
*tmp; + /* node is an inner grandchild: rotate it outward first, then swap the two names so parent is again the upper node */ + __osi_rotate_left(parent, root); + tmp = parent; + parent = node; + node = tmp; + } + + osi_set_black(parent); + osi_set_red(gparent); + __osi_rotate_right(gparent, root); + } else { /* mirror image: parent is the right child of gparent */ + { + register struct osi_node *uncle = gparent->osi_left; + if (uncle && osi_is_red(uncle)) { + osi_set_black(uncle); + osi_set_black(parent); + osi_set_red(gparent); + node = gparent; + continue; + } + } + + if (parent->osi_left == node) { + register struct osi_node *tmp; + __osi_rotate_right(parent, root); + tmp = parent; + parent = node; + node = tmp; + } + + osi_set_black(parent); + osi_set_red(gparent); + __osi_rotate_left(gparent, root); + } + } + + osi_set_black(root->osi_node); +} + /* Restore the red-black colouring after a black node has been unlinked. node is the child that took the place of the removed node (it may be NULL) and parent is its parent. The loop runs while node is black or NULL and is not the root; other is the sibling of node, which is dereferenced without a NULL check. */ +static inline void __osi_erase_color(struct osi_node *node, + struct osi_node *parent, + struct osi_root *root) +{ + struct osi_node *other; + + while ((!node || osi_is_black(node)) && node != root->osi_node) { + if (parent->osi_left == node) { + other = parent->osi_right; + if (osi_is_red(other)) { /* red sibling: rotate so that the sibling becomes black */ + osi_set_black(other); + osi_set_red(parent); + __osi_rotate_left(parent, root); + other = parent->osi_right; + } + if ((!other->osi_left || osi_is_black(other->osi_left)) && + (!other->osi_right || osi_is_black(other->osi_right))) + { /* sibling has no red child: colour it red and continue one level up */ + osi_set_red(other); + node = parent; + parent = osi_parent(node); + } else { + if (!other->osi_right || osi_is_black(other->osi_right)) + { + struct osi_node *o_left; + if ((o_left = other->osi_left)) + osi_set_black(o_left); + osi_set_red(other); + __osi_rotate_right(other, root); + other = parent->osi_right; + } + osi_set_color(other, osi_color(parent)); + osi_set_black(parent); + if (other->osi_right) + osi_set_black(other->osi_right); + __osi_rotate_left(parent, root); + node = root->osi_node; + break; + } + } else { /* mirror image: node is the right child of parent */ + other = parent->osi_left; + if (osi_is_red(other)) { + osi_set_black(other); + osi_set_red(parent); + __osi_rotate_right(parent, root); + other = parent->osi_left; + } + if ((!other->osi_left || osi_is_black(other->osi_left)) && +
/*
 * osi_erase - unlink a node from the tree
 * @node: the node to remove
 * @root: the tree that currently contains @node
 *
 * Removes @node and, when the colour recorded for the removed position is
 * OSI_BLACK, calls __osi_erase_color() to rebalance.  The node's memory is
 * not touched beyond its link fields; nothing is freed here.
 */
static inline void osi_erase(struct osi_node *node, struct osi_root *root)
{
	struct osi_node *child, *parent;
	int color;

	/* At most one child: that child (possibly NULL) takes node's place. */
	if (!node->osi_left)
		child = node->osi_right;
	else if (!node->osi_right)
		child = node->osi_left;
	else {
		/*
		 * Two children: the replacement is the leftmost node of the
		 * right subtree.  From here on "node" names the replacement
		 * and "old" the node being erased.
		 */
		struct osi_node *old = node, *left;

		node = node->osi_right;
		while ((left = node->osi_left) != NULL)
			node = left;

		/* Point old's parent (or the root) at the replacement. */
		if (osi_parent(old)) {
			if (osi_parent(old)->osi_left == old)
				osi_parent(old)->osi_left = node;
			else
				osi_parent(old)->osi_right = node;
		} else
			root->osi_node = node;

		/*
		 * Remember where the replacement used to live before its
		 * parent/colour word is overwritten below.
		 */
		child = node->osi_right;
		parent = osi_parent(node);
		color = osi_color(node);

		if (parent == old) {
			/* Replacement was old's direct right child. */
			parent = node;
		} else {
			/* Detach the replacement from its former parent ... */
			if (child)
				osi_set_parent(child, parent);
			parent->osi_left = child;

			/* ... and let it adopt old's right subtree. */
			node->osi_right = old->osi_right;
			osi_set_parent(old->osi_right, node);
		}

		/* The replacement inherits old's parent, colour and left subtree. */
		node->osi_parent_color = old->osi_parent_color;
		node->osi_left = old->osi_left;
		osi_set_parent(old->osi_left, node);

		goto color;
	}

	parent = osi_parent(node);
	color = osi_color(node);

	/* Splice the single child (or NULL) into node's position. */
	if (child)
		osi_set_parent(child, parent);
	if (parent)
	{
		if (parent->osi_left == node)
			parent->osi_left = child;
		else
			parent->osi_right = child;
	}
	else
		root->osi_node = child;

 color:
	/* Only a black removal needs the rebalancing pass. */
	if (color == OSI_BLACK)
		__osi_erase_color(child, parent, root);
}
node (in sort order) of the tree. + */ +static inline struct osi_node *osi_first(struct osi_root *root) +{ + struct osi_node *n; + + n = root->osi_node; + if (!n) + return NULL; + while (n->osi_left) + n = n->osi_left; + return n; +} + +static inline struct osi_node *osi_last(struct osi_root *root) +{ + struct osi_node *n; + + n = root->osi_node; + if (!n) + return NULL; + while (n->osi_right) + n = n->osi_right; + return n; +} + +static inline struct osi_node *osi_next(struct osi_node *node) +{ + struct osi_node *parent; + + if (OSI_EMPTY_NODE(node)) + return NULL; + + /* If we have a right-hand child, go down and then left as far + as we can. */ + if (node->osi_right) { + node = node->osi_right; + while (node->osi_left) + node=node->osi_left; + return node; + } + + /* No right-hand children. Everything down and left is + smaller than us, so any 'next' node must be in the general + direction of our parent. Go up the tree; any time the + ancestor is a right-hand child of its parent, keep going + up. First time it's a left-hand child of its parent, said + parent is our 'next' node. */ + while ((parent = osi_parent(node)) && node == parent->osi_right) + node = parent; + + return parent; +} + +static inline struct osi_node *osi_prev(struct osi_node *node) +{ + struct osi_node *parent; + + if (OSI_EMPTY_NODE(node)) + return NULL; + + /* If we have a left-hand child, go down and then right as far + as we can. */ + if (node->osi_left) { + node = node->osi_left; + while (node->osi_right) + node=node->osi_right; + return node; + } + + /* No left-hand children. 
#!/bin/bash
#
# gfs2 mount/unmount helper
#
# chkconfig: - 26 74
# description: mount/unmount gfs2 filesystems configured in /etc/fstab

### BEGIN INIT INFO
# Provides:		gfs2
# Required-Start:	$network cman gfs_controld
# Required-Stop:	$network cman gfs_controld
# Default-Start:
# Default-Stop:
# Short-Description:	mount/unmount gfs2 filesystems configured in /etc/fstab
# Description:		mount/unmount gfs2 filesystems configured in /etc/fstab
### END INIT INFO

# set secure PATH
PATH="/bin:/sbin:/usr/sbin:/usr/bin"

### generic wrapper functions
# These are defaults; on rpm based distros /etc/init.d/functions (sourced
# below) may replace success() and failure() with its own versions.

success()
{
	echo -ne "[ OK ]\r"
}

failure()
{
	echo -ne "[FAILED]\r"
}

# Report success and finish the status line.
ok() {
	success
	echo
}

# Print the captured error output in $errmsg, report failure and exit 1.
nok() {
	echo -e "$errmsg"
	failure
	echo
	exit 1
}

# rpm based distros
if [ -d /etc/sysconfig ]; then
	[ -f /etc/init.d/functions ] && . /etc/init.d/functions
	[ -f /etc/sysconfig/cluster ] && . /etc/sysconfig/cluster
	[ -f /etc/sysconfig/gfs2 ] && . /etc/sysconfig/gfs2
	[ -z "$LOCK_FILE" ] && LOCK_FILE="/var/lock/subsys/gfs2"
fi

# deb based distros
if [ -d /etc/default ]; then
	[ -f /etc/default/cluster ] && . /etc/default/cluster
	[ -f /etc/default/gfs2 ] && . /etc/default/gfs2
	[ -z "$LOCK_FILE" ] && LOCK_FILE="/var/lock/gfs2"
fi

# Neither layout matched: still make sure LOCK_FILE is non-empty, otherwise
# touch/rm would run without an operand and "[ -f ]" would always be true.
[ -z "$LOCK_FILE" ] && LOCK_FILE="/var/lock/gfs2"

# proc is required for both status and stop.
# start could live without, but better be consistent with the behavior
if [ ! -f /proc/mounts ]; then
	echo "GFS2: /proc is not available, unable to proceed"
	exit 1
fi

#
# This script's behavior is modeled closely after the netfs script.
#
# GFS2FSTAB: gfs2 mount points in /etc/fstab that are not marked noauto.
# GFS2MTAB:  gfs2 mount points currently mounted (except /), reverse sorted.
GFS2FSTAB=$(LC_ALL=C awk '!/^#/ && $3 == "gfs2" && $4 !~ /noauto/ { print $2 }' /etc/fstab)
GFS2MTAB=$(LC_ALL=C awk '!/^#/ && $3 == "gfs2" && $2 != "/" { print $2 }' /proc/mounts | sort -r)

if [ -z "$GFS2FSTAB" ]; then
	echo "GFS2: no entries found in /etc/fstab"
	exit 6
fi

# See how we were called.
case "$1" in
start)
	[ -z "$GFS2FSTAB" ] && exit 0
	echo -n "Mounting GFS2 filesystems: "
	errmsg="$(mount -a -t gfs2 2>&1)" || nok
	touch "$LOCK_FILE"
	ok
;;
stop)
	[ -z "$GFS2MTAB" ] && exit 0
	echo -n "Unmounting GFS2 filesystems: "
	errmsg="$(umount -a -t gfs2 2>&1)" || nok
	# Unloading the module is best effort only.
	modprobe -r gfs2 > /dev/null 2>&1 || true
	rm -f "$LOCK_FILE"
	ok
	;;

status)
	if [ -z "$GFS2MTAB" ] && [ -f "$LOCK_FILE" ]; then
		echo "GFS2: Found stale lock file $LOCK_FILE"
		exit 2
	fi

	if [ -n "$GFS2FSTAB" ] && [ -z "$GFS2MTAB" ]; then
		echo "GFS2: service is not running"
		exit 3
	fi

	echo "Configured GFS2 mountpoints: "
	for fs in $GFS2FSTAB; do
		echo "$fs";
	done

	echo "Active GFS2 mountpoints: "
	for fs in $GFS2MTAB; do
		echo "$fs";
	done
;;
condrestart|try-restart)
	"$0" status >/dev/null 2>&1 || exit 0
	"$0" restart
	# Hand the restart result back instead of the unconditional exit 0 below.
	exit $?
;;
restart|reload|force-reload)
	"$0" stop
	"$0" start
	# A failed start must not be reported as success.
	exit $?
;;
*)
	echo "Usage: $0 {start|stop|restart|reload|force-reload|condrestart|try-restart|status}"
	exit 2
;;
esac

exit 0
new file mode 100644 index 0000000..749da85 --- /dev/null +++ b/gfs2/libgfs2/Makefile.am @@ -0,0 +1,65 @@ +MAINTAINERCLEANFILES = Makefile.in + +CLEANFILES = \ + parser.h \ + parser.c \ + lexer.c \ + lexer.h + +BUILT_SOURCES = \ + parser.h \ + lexer.h + +AM_LFLAGS = --header-file=lexer.h +AM_YFLAGS = -d + +noinst_HEADERS = \ + libgfs2.h \ + crc32c.h \ + lang.h \ + config.h \ + rgrp.h + +noinst_LTLIBRARIES = libgfs2.la + +noinst_PROGRAMS = gfs2l + +libgfs2_la_SOURCES = \ + crc32c.c \ + block_list.c \ + fs_bits.c \ + gfs1.c \ + misc.c \ + rgrp.c \ + super.c \ + buf.c \ + fs_geometry.c \ + gfs2_disk_hash.c \ + ondisk.c \ + config.c \ + device_geometry.c \ + fs_ops.c \ + recovery.c \ + structures.c \ + meta.c \ + lang.c \ + parser.y \ + lexer.l + +libgfs2_la_CPPFLAGS = \ + -D_FILE_OFFSET_BITS=64 \ + -D_LARGEFILE64_SOURCE \ + -D_GNU_SOURCE \ + -I$(top_srcdir)/gfs2/include \ + $(uuid_CFLAGS) + +gfs2l_SOURCES = gfs2l.c +gfs2l_LDADD = libgfs2.la +gfs2l_LDFLAGS = $(uuid_LIBS) +gfs2l_CPPFLAGS = \ + -I$(top_srcdir)/gfs2/include \ + -D_FILE_OFFSET_BITS=64 + +# Autotools can't handle header files output by flex so we have to generate it manually +lexer.h: lexer.l + $(LEX) -o lexer.c $(AM_LFLAGS) $^ diff --git a/gfs2/libgfs2/block_list.c b/gfs2/libgfs2/block_list.c new file mode 100644 index 0000000..9d99845 --- /dev/null +++ b/gfs2/libgfs2/block_list.c @@ -0,0 +1,67 @@ +#include "clusterautoconfig.h" + +#include +#include +#include +#include +#include +#include +#include + +#include "libgfs2.h" + +void gfs2_special_free(struct special_blocks *blist) +{ + struct special_blocks *f; + + while(!osi_list_empty(&blist->list)) { + f = osi_list_entry(blist->list.next, struct special_blocks, + list); + osi_list_del(&f->list); + free(f); + } +} + +struct special_blocks *blockfind(struct special_blocks *blist, uint64_t num) +{ + osi_list_t *head = &blist->list; + osi_list_t *tmp; + struct special_blocks *b; + + for (tmp = head->next; tmp != head; tmp = tmp->next) { + b = 
osi_list_entry(tmp, struct special_blocks, list); + if (b->block == num) + return b; + } + return NULL; +} + +void gfs2_special_add(struct special_blocks *blocklist, uint64_t block) +{ + struct special_blocks *b; + + b = malloc(sizeof(struct special_blocks)); + if (b) { + memset(b, 0, sizeof(*b)); + b->block = block; + osi_list_add_prev(&b->list, &blocklist->list); + } +} + +void gfs2_special_set(struct special_blocks *blocklist, uint64_t block) +{ + if (blockfind(blocklist, block)) + return; + gfs2_special_add(blocklist, block); +} + +void gfs2_special_clear(struct special_blocks *blocklist, uint64_t block) +{ + struct special_blocks *b; + + b = blockfind(blocklist, block); + if (b) { + osi_list_del(&b->list); + free(b); + } +} diff --git a/gfs2/libgfs2/buf.c b/gfs2/libgfs2/buf.c new file mode 100644 index 0000000..92cd393 --- /dev/null +++ b/gfs2/libgfs2/buf.c @@ -0,0 +1,121 @@ +#include "clusterautoconfig.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "libgfs2.h" + +#ifndef IOV_MAX + #ifdef UIO_MAXIOV + #define IOV_MAX UIO_MAXIOV + #else + #define IOV_MAX (1024) + #endif +#endif + +struct gfs2_buffer_head *bget(struct gfs2_sbd *sdp, uint64_t num) +{ + struct gfs2_buffer_head *bh; + + bh = calloc(1, sizeof(struct gfs2_buffer_head) + sdp->bsize); + if (bh == NULL) + return NULL; + + bh->b_blocknr = num; + bh->sdp = sdp; + bh->iov.iov_base = (char *)bh + sizeof(struct gfs2_buffer_head); + bh->iov.iov_len = sdp->bsize; + + return bh; +} + +int __breadm(struct gfs2_sbd *sdp, struct gfs2_buffer_head **bhs, size_t n, + uint64_t block, int line, const char *caller) +{ + size_t v = (n < IOV_MAX) ? 
n : IOV_MAX; + struct iovec *iov = alloca(v * sizeof(struct iovec)); + struct iovec *iovbase = iov; + size_t i = 0; + + while (i < n) { + int j; + ssize_t ret; + ssize_t size = 0; + + for (j = 0; (i + j < n) && (j < IOV_MAX); j++) { + bhs[i + j] = bget(sdp, block + i + j); + if (bhs[i + j] == NULL) + return -1; + iov[j] = bhs[i + j]->iov; + size += bhs[i + j]->iov.iov_len; + } + + ret = preadv(sdp->device_fd, iovbase, j, (block + i) * sdp->bsize); + if (ret != size) { + fprintf(stderr, "bad read: %s from %s:%d: block %llu (0x%llx) " + "count: %d size: %zd ret: %zd\n", strerror(errno), + caller, line, (unsigned long long)block, + (unsigned long long)block, j, size, ret); + exit(-1); + } + i += j; + } + return 0; +} + +struct gfs2_buffer_head *__bread(struct gfs2_sbd *sdp, uint64_t num, int line, + const char *caller) +{ + struct gfs2_buffer_head *bh; + int ret; + + ret = __breadm(sdp, &bh, 1, num, line, caller); + if (ret >= 0) + return bh; + return NULL; +} + +int bwrite(struct gfs2_buffer_head *bh) +{ + struct gfs2_sbd *sdp = bh->sdp; + + if (pwritev(sdp->device_fd, &bh->iov, 1, bh->b_blocknr * sdp->bsize) != bh->iov.iov_len) + return -1; + bh->b_modified = 0; + return 0; +} + +int brelse(struct gfs2_buffer_head *bh) +{ + int error = 0; + + if (bh->b_blocknr == -1) + printf("Double free!\n"); + if (bh->b_modified) + error = bwrite(bh); + bh->b_blocknr = -1; + if (bh->b_altlist.next && !osi_list_empty(&bh->b_altlist)) + osi_list_del(&bh->b_altlist); + free(bh); + return error; +} + +uint32_t lgfs2_get_block_type(const struct gfs2_buffer_head *lbh) +{ + const struct gfs2_meta_header *mh = lbh->iov.iov_base; + + if (be32_to_cpu(mh->mh_magic) == GFS2_MAGIC) + return be32_to_cpu(mh->mh_type); + + return 0; +} diff --git a/gfs2/libgfs2/config.c b/gfs2/libgfs2/config.c new file mode 100644 index 0000000..d2431e4 --- /dev/null +++ b/gfs2/libgfs2/config.c @@ -0,0 +1,9 @@ +#include "libgfs2.h" +#include "config.h" + +int cfg_debug = 0; + +void lgfs2_set_debug(int enable) +{ 
#ifndef __LGFS2_CONFIG_H__
#define __LGFS2_CONFIG_H__

/*
 * Library-wide debug switch.  Defined in config.c (initially 0) and changed
 * through lgfs2_set_debug(); library code tests it before printing
 * diagnostic output.
 */
extern int cfg_debug;

#endif /* __LGFS2_CONFIG_H__ */
+ * CRC32C polynomial:0x1EDC6F41(BE)/0x82F63B78(LE) + * CRC32 is a new instruction in Intel SSE4.2, the reference can be found at: + * http://www.intel.com/products/processor/manuals/ + * Intel(R) 64 and IA-32 Architectures Software Developer's Manual + * Volume 2A: Instruction Set Reference, A-M + */ +#if __SIZEOF_LONG__ == 8 +#define REX_PRE "0x48, " +#define SCALE_F 8 +#else +#define REX_PRE +#define SCALE_F 4 +#endif + +static int crc32c_probed = 0; +static int crc32c_intel_available = 0; + +static uint32_t crc32c_intel_le_hw_byte(uint32_t crc, unsigned char const *data, + unsigned long length) +{ + while (length--) { + __asm__ __volatile__( + ".byte 0xf2, 0xf, 0x38, 0xf0, 0xf1" + :"=S"(crc) + :"0"(crc), "c"(*data) + ); + data++; + } + + return crc; +} + +/* + * Steps through buffer one byte at at time, calculates reflected + * crc using table. + */ +static uint32_t crc32c_intel(uint32_t crc, unsigned char const *data, unsigned long length) +{ + unsigned int iquotient = length / SCALE_F; + unsigned int iremainder = length % SCALE_F; + unsigned long *ptmp = (unsigned long *)data; + + while (iquotient--) { + __asm__ __volatile__( + ".byte 0xf2, " REX_PRE "0xf, 0x38, 0xf1, 0xf1;" + :"=S"(crc) + :"0"(crc), "c"(*ptmp) + ); + ptmp++; + } + + if (iremainder) + crc = crc32c_intel_le_hw_byte(crc, (unsigned char *)ptmp, + iremainder); + + return crc; +} + +static void do_cpuid(unsigned int *eax, unsigned int *ebx, unsigned int *ecx, + unsigned int *edx) +{ + int id = *eax; + + asm("movl %4, %%eax;" + "cpuid;" + "movl %%eax, %0;" + "movl %%ebx, %1;" + "movl %%ecx, %2;" + "movl %%edx, %3;" + : "=r" (*eax), "=r" (*ebx), "=r" (*ecx), "=r" (*edx) + : "r" (id) + : "eax", "ebx", "ecx", "edx"); +} + +static void crc32c_intel_probe(void) +{ + if (!crc32c_probed) { + unsigned int eax, ebx, ecx, edx; + + eax = 1; + + do_cpuid(&eax, &ebx, &ecx, &edx); + crc32c_intel_available = (ecx & (1 << 20)) != 0; + crc32c_probed = 1; + } +} + +void crc32c_optimization_init(void) +{ + 
/*
 * This is the CRC-32C table
 * Generated with:
 * width = 32 bits
 * poly = 0x1EDC6F41
 * reflect input bytes = true
 * reflect output bytes = true
 */

static const uint32_t crc32c_table[256] = {
	0x00000000L, 0xF26B8303L, 0xE13B70F7L, 0x1350F3F4L,
	0xC79A971FL, 0x35F1141CL, 0x26A1E7E8L, 0xD4CA64EBL,
	0x8AD958CFL, 0x78B2DBCCL, 0x6BE22838L, 0x9989AB3BL,
	0x4D43CFD0L, 0xBF284CD3L, 0xAC78BF27L, 0x5E133C24L,
	0x105EC76FL, 0xE235446CL, 0xF165B798L, 0x030E349BL,
	0xD7C45070L, 0x25AFD373L, 0x36FF2087L, 0xC494A384L,
	0x9A879FA0L, 0x68EC1CA3L, 0x7BBCEF57L, 0x89D76C54L,
	0x5D1D08BFL, 0xAF768BBCL, 0xBC267848L, 0x4E4DFB4BL,
	0x20BD8EDEL, 0xD2D60DDDL, 0xC186FE29L, 0x33ED7D2AL,
	0xE72719C1L, 0x154C9AC2L, 0x061C6936L, 0xF477EA35L,
	0xAA64D611L, 0x580F5512L, 0x4B5FA6E6L, 0xB93425E5L,
	0x6DFE410EL, 0x9F95C20DL, 0x8CC531F9L, 0x7EAEB2FAL,
	0x30E349B1L, 0xC288CAB2L, 0xD1D83946L, 0x23B3BA45L,
	0xF779DEAEL, 0x05125DADL, 0x1642AE59L, 0xE4292D5AL,
	0xBA3A117EL, 0x4851927DL, 0x5B016189L, 0xA96AE28AL,
	0x7DA08661L, 0x8FCB0562L, 0x9C9BF696L, 0x6EF07595L,
	0x417B1DBCL, 0xB3109EBFL, 0xA0406D4BL, 0x522BEE48L,
	0x86E18AA3L, 0x748A09A0L, 0x67DAFA54L, 0x95B17957L,
	0xCBA24573L, 0x39C9C670L, 0x2A993584L, 0xD8F2B687L,
	0x0C38D26CL, 0xFE53516FL, 0xED03A29BL, 0x1F682198L,
	0x5125DAD3L, 0xA34E59D0L, 0xB01EAA24L, 0x42752927L,
	0x96BF4DCCL, 0x64D4CECFL, 0x77843D3BL, 0x85EFBE38L,
	0xDBFC821CL, 0x2997011FL, 0x3AC7F2EBL, 0xC8AC71E8L,
	0x1C661503L, 0xEE0D9600L, 0xFD5D65F4L, 0x0F36E6F7L,
	0x61C69362L, 0x93AD1061L, 0x80FDE395L, 0x72966096L,
	0xA65C047DL, 0x5437877EL, 0x4767748AL, 0xB50CF789L,
	0xEB1FCBADL, 0x197448AEL, 0x0A24BB5AL, 0xF84F3859L,
	0x2C855CB2L, 0xDEEEDFB1L, 0xCDBE2C45L, 0x3FD5AF46L,
	0x7198540DL, 0x83F3D70EL, 0x90A324FAL, 0x62C8A7F9L,
	0xB602C312L, 0x44694011L, 0x5739B3E5L, 0xA55230E6L,
	0xFB410CC2L, 0x092A8FC1L, 0x1A7A7C35L, 0xE811FF36L,
	0x3CDB9BDDL, 0xCEB018DEL, 0xDDE0EB2AL, 0x2F8B6829L,
	0x82F63B78L, 0x709DB87BL, 0x63CD4B8FL, 0x91A6C88CL,
	0x456CAC67L, 0xB7072F64L, 0xA457DC90L, 0x563C5F93L,
	0x082F63B7L, 0xFA44E0B4L, 0xE9141340L, 0x1B7F9043L,
	0xCFB5F4A8L, 0x3DDE77ABL, 0x2E8E845FL, 0xDCE5075CL,
	0x92A8FC17L, 0x60C37F14L, 0x73938CE0L, 0x81F80FE3L,
	0x55326B08L, 0xA759E80BL, 0xB4091BFFL, 0x466298FCL,
	0x1871A4D8L, 0xEA1A27DBL, 0xF94AD42FL, 0x0B21572CL,
	0xDFEB33C7L, 0x2D80B0C4L, 0x3ED04330L, 0xCCBBC033L,
	0xA24BB5A6L, 0x502036A5L, 0x4370C551L, 0xB11B4652L,
	0x65D122B9L, 0x97BAA1BAL, 0x84EA524EL, 0x7681D14DL,
	0x2892ED69L, 0xDAF96E6AL, 0xC9A99D9EL, 0x3BC21E9DL,
	0xEF087A76L, 0x1D63F975L, 0x0E330A81L, 0xFC588982L,
	0xB21572C9L, 0x407EF1CAL, 0x532E023EL, 0xA145813DL,
	0x758FE5D6L, 0x87E466D5L, 0x94B49521L, 0x66DF1622L,
	0x38CC2A06L, 0xCAA7A905L, 0xD9F75AF1L, 0x2B9CD9F2L,
	0xFF56BD19L, 0x0D3D3E1AL, 0x1E6DCDEEL, 0xEC064EEDL,
	0xC38D26C4L, 0x31E6A5C7L, 0x22B65633L, 0xD0DDD530L,
	0x0417B1DBL, 0xF67C32D8L, 0xE52CC12CL, 0x1747422FL,
	0x49547E0BL, 0xBB3FFD08L, 0xA86F0EFCL, 0x5A048DFFL,
	0x8ECEE914L, 0x7CA56A17L, 0x6FF599E3L, 0x9D9E1AE0L,
	0xD3D3E1ABL, 0x21B862A8L, 0x32E8915CL, 0xC083125FL,
	0x144976B4L, 0xE622F5B7L, 0xF5720643L, 0x07198540L,
	0x590AB964L, 0xAB613A67L, 0xB831C993L, 0x4A5A4A90L,
	0x9E902E7BL, 0x6CFBAD78L, 0x7FAB5E8CL, 0x8DC0DD8FL,
	0xE330A81AL, 0x115B2B19L, 0x020BD8EDL, 0xF0605BEEL,
	0x24AA3F05L, 0xD6C1BC06L, 0xC5914FF2L, 0x37FACCF1L,
	0x69E9F0D5L, 0x9B8273D6L, 0x88D28022L, 0x7AB90321L,
	0xAE7367CAL, 0x5C18E4C9L, 0x4F48173DL, 0xBD23943EL,
	0xF36E6F75L, 0x0105EC76L, 0x12551F82L, 0xE03E9C81L,
	0x34F4F86AL, 0xC69F7B69L, 0xD5CF889DL, 0x27A40B9EL,
	0x79B737BAL, 0x8BDCB4B9L, 0x988C474DL, 0x6AE7C44EL,
	0xBE2DA0A5L, 0x4C4623A6L, 0x5F16D052L, 0xAD7D5351L
};

/*
 * Table-driven CRC32C: folds @length bytes of @data into @crc, one byte per
 * table lookup, and returns the updated value.  No initial or final
 * inversion is applied here.
 */
static uint32_t __crc32c_le(uint32_t crc, unsigned char const *data, size_t length)
{
	size_t pos;

	for (pos = 0; pos < length; pos++)
		crc = crc32c_table[(crc ^ data[pos]) & 0xFFL] ^ (crc >> 8);

	return crc;
}
i->io_optimal_size = i->stat.st_blksize; + goto size_check; + case S_IFBLK: + break; + default: + errno = ENOTBLK; + return -1; + } + + ioctl(fd, BLKRAGET, &i->ra_pages); + ioctl(fd, BLKBSZGET, &i->soft_block_size); + ioctl(fd, BLKSSZGET, &i->logical_block_size); + ioctl(fd, BLKIOMIN, &i->io_min_size); + ioctl(fd, BLKIOOPT, &i->io_optimal_size); + ioctl(fd, BLKPBSZGET, &i->physical_block_size); + ioctl(fd, BLKALIGNOFF, &i->io_align_offset); + ioctl(fd, BLKROGET, &ro); + if (ro) + i->readonly = 1; + off = lseek(fd, 0, SEEK_END); + if (off < 0) + return -1; + i->size = off; + +size_check: + if (i->size < (1 << 20)) { + errno = ENOSPC; + return -1; + } + + return 0; +} + +/** + * fix_device_geometry - round off address and lengths and convert to FS blocks + * @sdp: The super block + * + */ + +void fix_device_geometry(struct gfs2_sbd *sdp) +{ + struct device *device = &sdp->device; + + device->length = sdp->dinfo.size / sdp->bsize; + + if (cfg_debug) { + printf("\nDevice Geometry: (in FS blocks)\n"); + printf(" length = %"PRIu64"\n", device->length); + printf("\nDevice Size: %"PRIu64"\n", sdp->dinfo.size); + } +} diff --git a/gfs2/libgfs2/fs_bits.c b/gfs2/libgfs2/fs_bits.c new file mode 100644 index 0000000..e6aef55 --- /dev/null +++ b/gfs2/libgfs2/fs_bits.c @@ -0,0 +1,217 @@ +#include "clusterautoconfig.h" + +#include +#include +#include +#include +#include + +#include "libgfs2.h" + +#if BITS_PER_LONG == 32 +#define LBITMASK (0x55555555UL) +#define LBITSKIP55 (0x55555555UL) +#define LBITSKIP00 (0x00000000UL) +#else +#define LBITMASK (0x5555555555555555UL) +#define LBITSKIP55 (0x5555555555555555UL) +#define LBITSKIP00 (0x0000000000000000UL) +#endif + +#define ALIGN(x,a) (((x)+(a)-1)&~((a)-1)) + +/** + * gfs2_bit_search + * @ptr: Pointer to bitmap data + * @mask: Mask to use (normally 0x55555.... 
but adjusted for search start) + * @state: The state we are searching for + * + * We xor the bitmap data with a patter which is the bitwise opposite + * of what we are looking for, this gives rise to a pattern of ones + * wherever there is a match. Since we have two bits per entry, we + * take this pattern, shift it down by one place and then and it with + * the original. All the even bit positions (0,2,4, etc) then represent + * successful matches, so we mask with 0x55555..... to remove the unwanted + * odd bit positions. + * + * This allows searching of a whole u64 at once (32 blocks) with a + * single test (on 64 bit arches). + */ + +static inline uint64_t gfs2_bit_search(const unsigned long long *ptr, + unsigned long long mask, + uint8_t state) +{ + unsigned long long tmp; + static const unsigned long long search[] = { + [0] = 0xffffffffffffffffULL, + [1] = 0xaaaaaaaaaaaaaaaaULL, + [2] = 0x5555555555555555ULL, + [3] = 0x0000000000000000ULL, + }; + tmp = le64_to_cpu(*ptr) ^ search[state]; + tmp &= (tmp >> 1); + tmp &= mask; + return tmp; +} + +/** + * gfs2_bitfit - Find a free block in the bitmaps + * @buffer: the buffer that holds the bitmaps + * @buflen: the length (in bytes) of the buffer + * @goal: the block to try to allocate + * @old_state: the state of the block we're looking for + * + * Return: the block number that was allocated + */ +unsigned long gfs2_bitfit(const unsigned char *buf, const unsigned int len, + unsigned long goal, unsigned char state) +{ + unsigned long spoint = (goal << 1) & ((8 * sizeof(unsigned long long)) - 1); + const unsigned long long *ptr = ((unsigned long long *)buf) + (goal >> 5); + const unsigned long long *end = (unsigned long long *) + (buf + ALIGN(len, sizeof(unsigned long long))); + unsigned long long tmp; + unsigned long long mask = 0x5555555555555555ULL; + unsigned long bit; + + if (state > 3) + return 0; + + /* Mask off bits we don't care about at the start of the search */ + mask <<= spoint; + tmp = 
gfs2_bit_search(ptr, mask, state); + ptr++; + while(tmp == 0 && ptr < end) { + tmp = gfs2_bit_search(ptr, 0x5555555555555555ULL, state); + ptr++; + } + /* Mask off any bits which are more than len bytes from the start */ + if (ptr == end && (len & (sizeof(unsigned long long) - 1))) + tmp &= (((unsigned long long)~0) >> + (64 - 8 * (len & (sizeof(unsigned long long) - 1)))); + /* Didn't find anything, so return */ + if (tmp == 0) + return BFITNOENT; + ptr--; + bit = ffsll(tmp); + bit /= 2; /* two bits per entry in the bitmap */ + return (((const unsigned char *)ptr - buf) * GFS2_NBBY) + bit; +} + +/* + * check_range - check if blkno is within FS limits + * @sdp: super block + * @blkno: block number + * + * Returns: 0 if ok, -1 if out of bounds + */ +int gfs2_check_range(struct gfs2_sbd *sdp, uint64_t blkno) +{ + if((blkno > sdp->fssize) || (blkno <= LGFS2_SB_ADDR(sdp))) + return -1; + return 0; +} + +/* + * gfs2_set_bitmap + * @sdp: super block + * @blkno: block number relative to file system + * @state: one of three possible states + * + * This function sets the value of a bit of the + * file system bitmap. 
+ * + * Returns: 0 on success, -1 on error + */ +int gfs2_set_bitmap(lgfs2_rgrp_t rgd, uint64_t blkno, int state) +{ + int buf; + uint32_t rgrp_block; + struct gfs2_bitmap *bits = NULL; + unsigned char *byte, cur_state; + unsigned int bit; + + /* FIXME: should GFS2_BLKST_INVALID be allowed */ + if ((state < GFS2_BLKST_FREE) || (state > GFS2_BLKST_DINODE)) + return -1; + + if(!rgd || blkno < rgd->ri.ri_data0) + return -1; + + rgrp_block = (uint32_t)(blkno - rgd->ri.ri_data0); + for(buf= 0; buf < rgd->ri.ri_length; buf++){ + bits = &(rgd->bits[buf]); + if(rgrp_block < ((bits->bi_start + bits->bi_len)*GFS2_NBBY)) + break; + } + + if (bits == NULL) + return -1; + byte = (unsigned char *)(bits->bi_bh->b_data + bits->bi_offset) + + (rgrp_block/GFS2_NBBY - bits->bi_start); + bit = (rgrp_block % GFS2_NBBY) * GFS2_BIT_SIZE; + + cur_state = (*byte >> bit) & GFS2_BIT_MASK; + *byte ^= cur_state << bit; + *byte |= state << bit; + + bmodified(bits->bi_bh); + return 0; +} + +/* + * gfs2_get_bitmap - get value of FS bitmap + * @sdp: super block + * @blkno: block number relative to file system + * + * This function gets the value of a bit of the + * file system bitmap. 
+ * Possible state values for a block in the bitmap are: + * GFS_BLKST_FREE (0) + * GFS_BLKST_USED (1) + * GFS_BLKST_INVALID (2) + * GFS_BLKST_DINODE (3) + * + * Returns: state on success, -1 on error + */ +int lgfs2_get_bitmap(struct gfs2_sbd *sdp, uint64_t blkno, struct rgrp_tree *rgd) +{ + uint64_t offset; + uint32_t i = 0; + char *byte; + unsigned int bit; + struct gfs2_bitmap *bi; + + if (rgd == NULL) { + rgd = gfs2_blk2rgrpd(sdp, blkno); + if(rgd == NULL) + return -1; + } + + offset = blkno - rgd->ri.ri_data0; + if (offset > UINT_MAX) { + errno = EINVAL; + return -1; + } + if (offset >= rgd->ri.ri_data0 + rgd->ri.ri_data) { + errno = E2BIG; + return -1; + } + + if (offset >= (rgd->bits->bi_start + rgd->bits->bi_len) * GFS2_NBBY) { + offset += (sizeof(struct gfs2_rgrp) - sizeof(struct gfs2_meta_header)) + * GFS2_NBBY; + i = offset / sdp->sd_blocks_per_bitmap; + offset -= i * sdp->sd_blocks_per_bitmap; + } + + bi = &rgd->bits[i]; + if (!bi->bi_bh) + return GFS2_BLKST_FREE; + + byte = (bi->bi_bh->b_data + bi->bi_offset) + (offset/GFS2_NBBY); + bit = (offset % GFS2_NBBY) * GFS2_BIT_SIZE; + + return (*byte >> bit) & GFS2_BIT_MASK; +} diff --git a/gfs2/libgfs2/fs_geometry.c b/gfs2/libgfs2/fs_geometry.c new file mode 100644 index 0000000..df8a13a --- /dev/null +++ b/gfs2/libgfs2/fs_geometry.c @@ -0,0 +1,108 @@ +#include "clusterautoconfig.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include "libgfs2.h" +#include "config.h" + +/** + * Given a number of blocks in a resource group, return the number of blocks + * needed for bitmaps. Also calculate the adjusted number of free data blocks + * in the resource group and store it in *ri_data. 
+ */
+uint32_t rgblocks2bitblocks(const unsigned int bsize, const uint32_t rgblocks, uint32_t *ri_data)
+{
+	uint32_t mappable = 0;
+	uint32_t bitblocks = 0;
+	/* Number of blocks mappable by bitmap blocks with these header types */
+	const uint32_t blks_rgrp = GFS2_NBBY * (bsize - sizeof(struct gfs2_rgrp));
+	const uint32_t blks_meta = GFS2_NBBY * (bsize - sizeof(struct gfs2_meta_header));
+
+	/* Add bitmap blocks until they can map the blocks that remain, with the
+	   remainder rounded down to a multiple of 4 by the "& ~3" mask */
+	while (blks_rgrp + (blks_meta * bitblocks) < ((rgblocks - bitblocks) & ~(uint32_t)3))
+		bitblocks++;
+
+	/* Capacity of the bitmap blocks counted so far, less the last one */
+	if (bitblocks > 0)
+		mappable = blks_rgrp + (blks_meta * (bitblocks - 1));
+
+	/* Report the data block count for one more bitmap block than counted,
+	   and only keep that extra block if the smaller set cannot map it */
+	*ri_data = (rgblocks - (bitblocks + 1)) & ~(uint32_t)3;
+	if (mappable < *ri_data)
+		bitblocks++;
+
+	return bitblocks;
+}
+
+/**
+ * build_rgrps - compute the rindex entry and rgrp header of every resource group.
+ * If do_write is non-zero, walk sdp->rgtree and also set up the bitmap block
+ * buffers with bget() and mark them modified.
+ * Otherwise walk sdp->rgcalc and only perform the calculations.
+ * sdp->blks_total and sdp->fssize are updated for every resource group.
+ * Returns 0 on success or -1 if gfs2_compute_bitstructs() fails.
+ */
+int build_rgrps(struct gfs2_sbd *sdp, int do_write)
+{
+	struct osi_node *n, *next = NULL;
+	struct rgrp_tree *rl;
+	uint32_t rgblocks, bitblocks;
+	struct gfs2_rindex *ri;
+	struct gfs2_meta_header mh;
+	unsigned int x;
+
+	/* Header written to every bitmap block except the first */
+	mh.mh_magic = GFS2_MAGIC;
+	mh.mh_type = GFS2_METATYPE_RB;
+	mh.mh_format = GFS2_FORMAT_RB;
+	if (do_write)
+		n = osi_first(&sdp->rgtree);
+	else
+		n = osi_first(&sdp->rgcalc);
+
+	for (; n; n = next) {
+		next = osi_next(n);
+		rl = (struct rgrp_tree *)n;
+		ri = &rl->ri;
+
+		bitblocks = rgblocks2bitblocks(sdp->bsize, rl->length, &rgblocks);
+
+		/* Bitmap blocks come first; data blocks follow them directly */
+		ri->ri_addr = rl->start;
+		ri->ri_length = bitblocks;
+		ri->ri_data0 = rl->start + bitblocks;
+		ri->ri_data = rgblocks;
+		ri->ri_bitbytes = rgblocks / GFS2_NBBY;
+
+		memset(&rl->rg, 0, sizeof(rl->rg));
+		rl->rg.rg_header.mh_magic = GFS2_MAGIC;
+		rl->rg.rg_header.mh_type = GFS2_METATYPE_RG;
+		rl->rg.rg_header.mh_format = GFS2_FORMAT_RG;
+		rl->rg.rg_free = rgblocks;
+
+		if (gfs2_compute_bitstructs(sdp->sd_sb.sb_bsize, rl))
+			return -1;
+
+		if (do_write) {
+			for (x = 0; x < bitblocks; x++) {
+				rl->bits[x].bi_bh = bget(sdp, rl->start + x);
+				/* Block 0 carries the rgrp header, the rest a plain meta header */
+				if (x)
+					gfs2_meta_header_out(&mh, rl->bits[x].bi_bh->b_data);
+				else
+					gfs2_rgrp_out(&rl->rg, rl->bits[x].bi_bh->b_data);
+				bmodified(rl->bits[x].bi_bh);
+			}
+		}
+
+		if (cfg_debug) {
+			printf("\n");
+			gfs2_rindex_print(ri);
+		}
+
+		sdp->blks_total += rgblocks;
+		/* Overwritten on every pass, so it ends up as the end of the last rgrp visited */
+		sdp->fssize = ri->ri_data0 + ri->ri_data;
+	}
+	return 0;
+}
diff --git a/gfs2/libgfs2/fs_ops.c b/gfs2/libgfs2/fs_ops.c
new file mode 100644
index 0000000..7e87e43
--- /dev/null
+++ b/gfs2/libgfs2/fs_ops.c
@@ -0,0 +1,2007 @@
+#include "clusterautoconfig.h"
+
+/* NOTE(review): the header names on the bare #include lines below are missing
+   from this copy of the patch; restore them from the upstream file before use. */
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include
+#include "libgfs2.h"
+#include "rgrp.h"
+
+/* Return the address of pointer slot mp->mp_list[height] in bh. The slots
+   start after a meta header for height > 0 and after the dinode for height 0. */
+static __inline__ uint64_t *metapointer(struct gfs2_buffer_head *bh,
+					unsigned int height,
+					struct metapath *mp)
+{
+	unsigned int head_size = (height > 0) ?
+		sizeof(struct gfs2_meta_header) : sizeof(struct gfs2_dinode);
+
+	return ((uint64_t *)(bh->b_data + head_size)) + mp->mp_list[height];
+}
+
+/* Detect whether an inode is stuffed, i.e. its metadata tree height is 0 */
+static int inode_is_stuffed(const struct gfs2_inode *ip)
+{
+	return !ip->i_di.di_height;
+}
+
+/* Allocate an in-core inode for the dinode held in bh. The buffer is borrowed:
+   bh_owned stays 0 from calloc(). Returns NULL if the allocation fails. */
+struct gfs2_inode *lgfs2_inode_get(struct gfs2_sbd *sdp, struct gfs2_buffer_head *bh)
+{
+	struct gfs2_inode *ip;
+
+	ip = calloc(1, sizeof(struct gfs2_inode));
+	if (ip == NULL) {
+		return NULL;
+	}
+	gfs2_dinode_in(&ip->i_di, bh->b_data);
+	ip->i_bh = bh;
+	ip->i_sbd = sdp;
+	return ip;
+}
+
+/* Read the dinode block at di_addr and build an in-core inode that owns the
+   buffer. Returns NULL if the read or the allocation fails. */
+struct gfs2_inode *lgfs2_inode_read(struct gfs2_sbd *sdp, uint64_t di_addr)
+{
+	struct gfs2_inode *ip;
+	struct gfs2_buffer_head *bh = bread(sdp, di_addr);
+	if (bh == NULL) {
+		return NULL;
+	}
+	ip = lgfs2_inode_get(sdp, bh);
+	if (ip == NULL) {
+		brelse(bh);
+		return NULL;
+	}
+	ip->bh_owned = 1; /* We did the bread so we own the bh */
+	return ip;
+}
+
+/* Return the inode cached in sdp (sdp->md.*, master_dir or a journal) whose
+   address is block, or NULL if block is none of them. */
+struct gfs2_inode *is_system_inode(struct gfs2_sbd *sdp, uint64_t block)
+{
+	int j;
+
+	if (sdp->md.inum && block == sdp->md.inum->i_di.di_num.no_addr)
+		return sdp->md.inum;
+	if (sdp->md.statfs && block == sdp->md.statfs->i_di.di_num.no_addr)
+		return sdp->md.statfs;
+	if (sdp->md.jiinode && block == sdp->md.jiinode->i_di.di_num.no_addr)
+		return sdp->md.jiinode;
+	if (sdp->md.riinode && block == sdp->md.riinode->i_di.di_num.no_addr)
+		return sdp->md.riinode;
+	if (sdp->md.qinode && block == sdp->md.qinode->i_di.di_num.no_addr)
+		return sdp->md.qinode;
+	if (sdp->md.pinode && block == sdp->md.pinode->i_di.di_num.no_addr)
+		return sdp->md.pinode;
+	if (sdp->md.rooti && block == sdp->md.rooti->i_di.di_num.no_addr)
+		return sdp->md.rooti;
+	if (sdp->master_dir && block == sdp->master_dir->i_di.di_num.no_addr)
+		return sdp->master_dir;
+	for (j = 0; j < sdp->md.journals; j++)
+		if (sdp->md.journal && sdp->md.journal[j] &&
+		    block == sdp->md.journal[j]->i_di.di_num.no_addr)
+			return sdp->md.journal[j];
+	return NULL;
+}
+
+/* Release an in-core inode. If its buffer is marked modified the dinode is
+   copied back into the buffer first. The buffer is released only when the
+   inode owns it. *ip_in is set to NULL on return. */
+void inode_put(struct gfs2_inode **ip_in)
+{
+	struct gfs2_inode *ip = *ip_in;
+	uint64_t block = ip->i_di.di_num.no_addr;
+	struct gfs2_sbd *sdp = ip->i_sbd;
+
+	if (ip->i_bh->b_modified) {
+		gfs2_dinode_out(&ip->i_di, ip->i_bh->b_data);
+		if (!ip->bh_owned && is_system_inode(sdp, block))
+			fprintf(stderr, "Warning: Change made to inode "
+				"were discarded.\n");
+		/* This is for debugging only: a convenient place to set
+		   a breakpoint. This means a system inode was modified but
+		   not written.  That's not fatal: some places like
+		   adjust_inode in gfs2_convert will do this on purpose.
+		   It can also point out a coding problem, but we don't
+		   want to raise alarm in the users either. */
+	}
+	if (ip->bh_owned)
+		brelse(ip->i_bh);
+	ip->i_bh = NULL;
+	free(ip);
+	*ip_in = NULL; /* make sure the memory isn't accessed again */
+}
+
+/* Return the filesystem block number of the first free block found in rgd's
+   bitmaps, or 0 if none is found. errno is set to ENOSPC only when rgd is
+   NULL or its free count is already zero. */
+static uint64_t find_free_block(struct rgrp_tree *rgd)
+{
+	unsigned bm;
+	uint64_t blkno = 0;
+
+	if (rgd == NULL || rgd->rg.rg_free == 0) {
+		errno = ENOSPC;
+		return 0;
+	}
+
+	for (bm = 0; bm < rgd->ri.ri_length; bm++) {
+		unsigned long blk = 0;
+		struct gfs2_bitmap *bits = &rgd->bits[bm];
+
+		blk = gfs2_bitfit((uint8_t *)bits->bi_bh->b_data + bits->bi_offset,
+				  bits->bi_len, blk, GFS2_BLKST_FREE);
+		if (blk != BFITNOENT) {
+			/* Convert the offset within this bitmap to a filesystem block */
+			blkno = blk + (bits->bi_start * GFS2_NBBY) + rgd->ri.ri_data0;
+			break;
+		}
+	}
+	return blkno;
+}
+
+/* Mark blkno with the given state in rgd's bitmap, update the rgrp counters
+   and write the rgrp header into its first bitmap buffer.
+   Returns 0 on success, or -1 if blkno is 0 or the bitmap update fails. */
+static int blk_alloc_in_rg(struct gfs2_sbd *sdp, unsigned state, struct rgrp_tree *rgd, uint64_t blkno, int dinode)
+{
+	if (blkno == 0)
+		return -1;
+
+	if (gfs2_set_bitmap(rgd, blkno, state))
+		return -1;
+
+	if (state == GFS2_BLKST_DINODE) {
+		struct gfs_rgrp *gfs1rg = (struct gfs_rgrp *)&rgd->rg;
+
+		/* On gfs1 a non-dinode block in this state is counted as used metadata */
+		if (dinode)
+			rgd->rg.rg_dinodes++;
+		else if (sdp->gfs1)
+			gfs1rg->rg_usedmeta++;
+	}
+
+	rgd->rg.rg_free--;
+	if (sdp->gfs1)
+		gfs_rgrp_out((struct gfs_rgrp *)&rgd->rg, rgd->bits[0].bi_bh);
+	else
+		gfs2_rgrp_out(&rgd->rg, rgd->bits[0].bi_bh->b_data);
+	bmodified(rgd->bits[0].bi_bh);
+	sdp->blks_alloced++;
+	return 0;
+}
+
+/**
+ * Allocate a block in a bitmap.  In order to plan ahead we look for a
+ * resource group with blksreq free blocks but only allocate the one block.
+ * Returns 0 on success with the allocated block number in *blkno or non-zero otherwise.
+ */
+static int block_alloc(struct gfs2_sbd *sdp, const uint64_t blksreq, int state, uint64_t *blkno, int dinode)
+{
+	struct osi_node *node;
+	struct rgrp_tree *rgd = NULL;
+	uint64_t found;
+	int did_read = 0;
+	int err;
+
+	node = osi_first(&sdp->rgtree);
+	while (node != NULL) {
+		rgd = (struct rgrp_tree *)node;
+		if (rgd->rg.rg_free >= blksreq)
+			break;
+		node = osi_next(node);
+	}
+	/* NOTE(review): rgd is NULL only for an empty tree. When no resource
+	   group has blksreq free blocks the last one visited is used - confirm
+	   that this fallback is intended. */
+	if (!rgd)
+		return -1;
+
+	/* Read the bitmaps in only if nobody else holds them, and remember to
+	   drop them again afterwards */
+	if (!rgd->bits[0].bi_bh) {
+		if (gfs2_rgrp_read(sdp, rgd) != 0)
+			return -1;
+		did_read = 1;
+	}
+
+	found = find_free_block(rgd);
+	err = blk_alloc_in_rg(sdp, state, rgd, found, dinode);
+	if (did_read)
+		gfs2_rgrp_relse(rgd);
+	*blkno = found;
+	return err;
+}
+
+/* Allocate one block in the dinode state from a resource group with blksreq
+   free blocks and count it in sdp->dinodes_alloced.
+   Returns 0 with the block number in *blkno, or non-zero on failure. */
+int lgfs2_dinode_alloc(struct gfs2_sbd *sdp, const uint64_t blksreq, uint64_t *blkno)
+{
+	int err = block_alloc(sdp, blksreq, GFS2_BLKST_DINODE, blkno, TRUE);
+
+	if (err != 0)
+		return err;
+	sdp->dinodes_alloced++;
+	return 0;
+}
+
+/* Allocate one metadata block for ip (dinode state on gfs1, used state
+   otherwise) and record it as the inode's metadata goal.
+   Returns 0 with the block number in *blkno, or non-zero on failure. */
+int lgfs2_meta_alloc(struct gfs2_inode *ip, uint64_t *blkno)
+{
+	struct gfs2_sbd *sdp = ip->i_sbd;
+	int state = sdp->gfs1 ? GFS2_BLKST_DINODE : GFS2_BLKST_USED;
+	int err = block_alloc(sdp, 1, state, blkno, FALSE);
+
+	if (err != 0)
+		return err;
+	ip->i_di.di_goal_meta = *blkno;
+	bmodified(ip->i_bh);
+	return 0;
+}
+
+/* Zero everything in bh after the first 'head' bytes and mark bh modified */
+static __inline__ void buffer_clear_tail(struct gfs2_sbd *sdp,
+					 struct gfs2_buffer_head *bh, int head)
+{
+	char *tail = bh->b_data + head;
+
+	memset(tail, 0, sdp->bsize - head);
+	bmodified(bh);
+}
+
+/* Copy the payload that follows from_head in from_bh to offset to_head in
+   to_bh, zero the from_head - to_head bytes left over at the end of to_bh,
+   and mark to_bh modified. */
+static __inline__ void
+buffer_copy_tail(struct gfs2_sbd *sdp,
+		 struct gfs2_buffer_head *to_bh, int to_head,
+		 struct gfs2_buffer_head *from_bh, int from_head)
+{
+	size_t copy_len = sdp->bsize - from_head;
+	size_t pad_len = from_head - to_head;
+	char *dst = to_bh->b_data + to_head;
+
+	memcpy(dst, from_bh->b_data + from_head, copy_len);
+	memset(dst + copy_len, 0, pad_len);
+	bmodified(to_bh);
+}
+
+/* Convert a stuffed inode to height 1. Any data held in the dinode block is
+   moved to a newly allocated block (with a GFS2_METATYPE_JD header for
+   directories) and the dinode's first pointer is set to it.
+   Exits the process if the block cannot be allocated. */
+void unstuff_dinode(struct gfs2_inode *ip)
+{
+	struct gfs2_sbd *sdp = ip->i_sbd;
+	uint64_t block = 0;
+	int isdir = S_ISDIR(ip->i_di.di_mode) || is_gfs_dir(&ip->i_di);
+
+	if (ip->i_di.di_size) {
+		struct gfs2_buffer_head *bh;
+		int to_head = 0;
+
+		if (lgfs2_meta_alloc(ip, &block))
+			exit(1);
+		bh = bget(sdp, block);
+		if (isdir) {
+			struct gfs2_meta_header mh;
+
+			/* Directory data blocks are journaled and carry a header */
+			mh.mh_magic = GFS2_MAGIC;
+			mh.mh_type = GFS2_METATYPE_JD;
+			mh.mh_format = GFS2_FORMAT_JD;
+			gfs2_meta_header_out(&mh, bh->b_data);
+			to_head = sizeof(struct gfs2_meta_header);
+		}
+		buffer_copy_tail(sdp, bh, to_head,
+				 ip->i_bh, sizeof(struct gfs2_dinode));
+		if (isdir)
+			bmodified(bh);
+		brelse(bh);
+	}
+
+	buffer_clear_tail(sdp, ip->i_bh, sizeof(struct gfs2_dinode));
+
+	if (ip->i_di.di_size) {
+		/* buffer_clear_tail() has already marked ip->i_bh modified */
+		*(uint64_t *)(ip->i_bh->b_data + sizeof(struct gfs2_dinode)) = cpu_to_be64(block);
+		ip->i_di.di_blocks++;
+	}
+
+	ip->i_di.di_height = 1;
+}
+
+/**
+ * Calculate the total number of blocks required by a file containing 'bytes' bytes of data.
+ * The count includes the dinode block and every indirect block. A file that
+ * fits in the dinode block after the dinode header needs just that one block.
+ */
+uint64_t lgfs2_space_for_data(const struct gfs2_sbd *sdp, const unsigned bsize, const uint64_t bytes)
+{
+	const uint64_t inptrs = sdp->sd_inptrs;
+	uint64_t total = (bytes + bsize - 1) / bsize;	/* data blocks, rounded up */
+	uint64_t level = total;				/* pointers needed at the current level */
+
+	if (bytes <= bsize - sizeof(struct gfs2_dinode))
+		return 1;
+
+	/* Add one layer of indirect blocks at a time until the remaining
+	   pointers fit in the dinode itself */
+	while (level > sdp->sd_diptrs) {
+		level = (level + inptrs - 1) / inptrs;
+		total += level;
+	}
+	/* Plus one for the dinode block */
+	return total + 1;
+}
+
+/**
+ * Allocate an extent for a file in a resource group's bitmaps.
+ * rg: The resource group in which to allocate the extent
+ * di_size: The size of the file in bytes
+ * ip: A pointer to the inode structure, whose fields will be set appropriately.
+ *     If ip->i_di.di_num.no_addr is not 0, the extent search will be skipped and
+ *     the file allocated from that address.
+ * flags: GFS2_DIF_* flags
+ * mode: File mode flags, see creat(2)
+ * Returns 0 on success with the contents of ip set accordingly, or non-zero
+ * with errno set on error. If errno is ENOSPC then rg does not contain a
+ * large enough free extent for the given di_size.
+ */
+int lgfs2_file_alloc(lgfs2_rgrp_t rg, uint64_t di_size, struct gfs2_inode *ip, uint32_t flags, unsigned mode)
+{
+	unsigned extlen;
+	struct gfs2_dinode *di = &ip->i_di;
+	struct gfs2_sbd *sdp = rg->rgrps->sdp;
+	struct lgfs2_rbm rbm = { .rgd = rg, .offset = 0, .bii = 0 };
+	uint32_t blocks = lgfs2_space_for_data(sdp, sdp->bsize, di_size);
+
+	if (ip->i_di.di_num.no_addr != 0) {
+		if (lgfs2_rbm_from_block(&rbm, ip->i_di.di_num.no_addr) != 0)
+			return 1;
+	} else if (lgfs2_rbm_find(&rbm, GFS2_BLKST_FREE, &blocks) != 0) {
+		return 1;
+	}
+
+	extlen = lgfs2_alloc_extent(&rbm, GFS2_BLKST_DINODE, blocks);
+	if (extlen < blocks) {
+		errno = EINVAL;
+		return 1;
+	}
+
+	ip->i_sbd = sdp;
+
+	di->di_header.mh_magic = GFS2_MAGIC;
+	di->di_header.mh_type = GFS2_METATYPE_DI;
+	di->di_header.mh_format = GFS2_FORMAT_DI;
+	di->di_size = di_size;
+	di->di_num.no_addr = lgfs2_rbm_to_block(&rbm);
+	di->di_num.no_formal_ino = sdp->md.next_inum++;
+	di->di_mode = mode;
+	di->di_nlink = 1;
+	di->di_blocks = blocks;
+	di->di_atime = di->di_mtime = di->di_ctime = sdp->time;
+	/* The goals point at the last block of the extent and at the block
+	   just before the data blocks respectively */
+	di->di_goal_data = di->di_num.no_addr + di->di_blocks - 1;
+	di->di_goal_meta = di->di_goal_data - ((di_size + sdp->bsize - 1) / sdp->bsize);
+	di->di_height = calc_tree_height(ip, di_size);
+	di->di_flags = flags;
+
+	rg->rg.rg_free -= blocks;
+	rg->rg.rg_dinodes += 1;
+
+	sdp->dinodes_alloced++;
+	sdp->blks_alloced += blocks;
+
+	return 0;
+}
+
+/* Return the smallest tree height whose capacity covers the larger of 'size'
+   and the inode's current size, capped at the maximum height. Directories
+   use the journaled height table. */
+unsigned int calc_tree_height(struct gfs2_inode *ip, uint64_t size)
+{
+	struct gfs2_sbd *sdp = ip->i_sbd;
+	uint64_t *arr;
+	unsigned int max, height;
+
+	if (ip->i_di.di_size > size)
+		size = ip->i_di.di_size;
+
+	if (S_ISDIR(ip->i_di.di_mode)) {
+		arr = sdp->sd_jheightsize;
+		max = sdp->sd_max_jheight;
+	} else {
+		arr = sdp->sd_heightsize;
+		max = sdp->sd_max_height;
+	}
+
+	for (height = 0; height < max; height++)
+		if (arr[height] >= size)
+			break;
+
+	return height;
+}
+
+/* Grow the metadata tree of ip until it is 'height' levels tall. For each new
+   level the dinode's pointers move into a freshly allocated indirect block,
+   but only if at least one of them is non-zero.
+   Exits the process if a block cannot be allocated. */
+void build_height(struct gfs2_inode *ip, int height)
+{
+	struct gfs2_sbd *sdp = ip->i_sbd;
+	struct gfs2_buffer_head *bh;
+	uint64_t block = 0, *bp;
+	unsigned int x;
+	int new_block;
+
+	while (ip->i_di.di_height < height) {
+		new_block = FALSE;
+		bp = (uint64_t *)(ip->i_bh->b_data + sizeof(struct gfs2_dinode));
+		for (x = 0; x < sdp->sd_diptrs; x++, bp++)
+			if (*bp) {
+				new_block = TRUE;
+				break;
+			}
+
+		if (new_block) {
+			struct gfs2_meta_header mh;
+
+			if (lgfs2_meta_alloc(ip, &block))
+				exit(1);
+			bh = bget(sdp, block);
+			mh.mh_magic = GFS2_MAGIC;
+			mh.mh_type = GFS2_METATYPE_IN;
+			mh.mh_format = GFS2_FORMAT_IN;
+			gfs2_meta_header_out(&mh, bh->b_data);
+			buffer_copy_tail(sdp, bh,
+					 sizeof(struct gfs2_meta_header),
+					 ip->i_bh, sizeof(struct gfs2_dinode));
+			bmodified(bh);
+			brelse(bh);
+		}
+
+		buffer_clear_tail(sdp, ip->i_bh, sizeof(struct gfs2_dinode));
+
+		if (new_block) {
+			*(uint64_t *)(ip->i_bh->b_data + sizeof(struct gfs2_dinode)) = cpu_to_be64(block);
+			/* no need: bmodified(ip->i_bh);*/
+			ip->i_di.di_blocks++;
+		}
+
+		ip->i_di.di_height++;
+	}
+}
+
+/* Fill mp with the pointer index to follow at each tree level to reach
+   logical block 'block', working from the lowest level upwards. */
+void find_metapath(struct gfs2_inode *ip, uint64_t block, struct metapath *mp)
+{
+	const uint32_t inptrs = ip->i_sbd->sd_inptrs;
+	unsigned int i = ip->i_di.di_height;
+
+	memset(mp, 0, sizeof(struct metapath));
+	while (i--) {
+		mp->mp_list[i] = block % inptrs;
+		block /= inptrs;
+	}
+}
+
+/* Look up the pointer for 'height' in bh. If it is unset and 'create' is
+   non-zero, allocate a block and store it there. *block receives the block
+   number, or 0 if there is none and none was requested. *new is set to 1 only
+   when a block was allocated; it is never cleared here. */
+void lookup_block(struct gfs2_inode *ip, struct gfs2_buffer_head *bh,
+		  unsigned int height, struct metapath *mp,
+		  int create, int *new, uint64_t *block)
+{
+	uint64_t *ptr = metapointer(bh, height, mp);
+
+	if (*ptr) {
+		*block = be64_to_cpu(*ptr);
+		return;
+	}
+
+	*block = 0;
+
+	if (!create)
+		return;
+
+	if (lgfs2_meta_alloc(ip, block))
+		return;
+	*ptr = cpu_to_be64(*block);
+	bmodified(bh);
+	ip->i_di.di_blocks++;
+	bmodified(ip->i_bh);
+
+	*new = 1;
+}
+
+/* Map logical block 'lblock' of ip to a disk block.
+   On entry a non-zero *new asks for missing blocks to be allocated; on return
+   it is 1 if the last lookup allocated a block. *dblock receives the disk
+   block, or 0 if unmapped. If extlen is not NULL it receives the number of
+   consecutive disk blocks mapped from there. A non-zero 'prealloc' skips the
+   lookup at the lowest level of the tree. */
+void block_map(struct gfs2_inode *ip, uint64_t lblock, int *new,
+	       uint64_t *dblock, uint32_t *extlen, int prealloc)
+{
+	struct gfs2_sbd *sdp = ip->i_sbd;
+	struct gfs2_buffer_head *bh;
+	struct metapath mp;
+	int create = *new;
+	unsigned int bsize;
+	unsigned int height;
+	unsigned int end_of_metadata;
+	unsigned int x;
+
+	*new = 0;
+	*dblock = 0;
+	if (extlen)
+		*extlen = 0;
+
+	if (inode_is_stuffed(ip)) {
+		/* All the data lives in the dinode block itself */
+		if (!lblock) {
+			*dblock = ip->i_di.di_num.no_addr;
+			if (extlen)
+				*extlen = 1;
+		}
+		return;
+	}
+
+	bsize = (S_ISDIR(ip->i_di.di_mode)) ? sdp->sd_jbsize : sdp->bsize;
+
+	height = calc_tree_height(ip, (lblock + 1) * bsize);
+	if (ip->i_di.di_height < height) {
+		if (!create)
+			return;
+
+		build_height(ip, height);
+	}
+
+	find_metapath(ip, lblock, &mp);
+	end_of_metadata = ip->i_di.di_height - 1;
+
+	bh = ip->i_bh;
+
+	/* Walk down the indirect blocks, creating them on the way if asked to */
+	for (x = 0; x < end_of_metadata; x++) {
+		lookup_block(ip, bh, x, &mp, create, new, dblock);
+		if (bh != ip->i_bh)
+			brelse(bh);
+		if (!*dblock)
+			return;
+
+		if (*new) {
+			struct gfs2_meta_header mh;
+			bh = bget(sdp, *dblock);
+			mh.mh_magic = GFS2_MAGIC;
+			mh.mh_type = GFS2_METATYPE_IN;
+			mh.mh_format = GFS2_FORMAT_IN;
+			gfs2_meta_header_out(&mh, bh->b_data);
+			bmodified(bh);
+		} else {
+			if (*dblock == ip->i_di.di_num.no_addr)
+				bh = ip->i_bh;
+			else
+				bh = bread(sdp, *dblock);
+		}
+	}
+
+	if (!prealloc)
+		lookup_block(ip, bh, end_of_metadata, &mp, create, new, dblock);
+
+	if (extlen && *dblock) {
+		*extlen = 1;
+
+		if (!*new) {
+			uint64_t tmp_dblock;
+			int tmp_new;
+			unsigned int nptrs;
+
+			nptrs = (end_of_metadata) ? sdp->sd_inptrs : sdp->sd_diptrs;
+
+			/* Extend the extent while the following pointers are contiguous */
+			while (++mp.mp_list[end_of_metadata] < nptrs) {
+				lookup_block(ip, bh, end_of_metadata, &mp, FALSE, &tmp_new,
+					     &tmp_dblock);
+
+				if (*dblock + *extlen != tmp_dblock)
+					break;
+
+				(*extlen)++;
+			}
+		}
+	}
+
+	if (bh != ip->i_bh)
+		brelse(bh);
+}
+
+/* Copy 'size' bytes from bh at 'offset' to *buf, or write zeroes if bh is
+   NULL, then advance *buf past them. */
+static void
+copy2mem(struct gfs2_buffer_head *bh, void **buf, unsigned int offset,
+	 unsigned int size)
+{
+	char **p = (char **)buf;
+
+	if (bh)
+		memcpy(*p, bh->b_data + offset, size);
+	else
+		memset(*p, 0, size);
+
+	*p += size;
+}
+
+/* Read up to 'size' bytes from ip starting at 'offset' into buf. The read is
+   clipped to the inode size and unmapped blocks are returned as zeroes.
+   Returns the number of bytes copied, which is 0 at or beyond end of file. */
+int gfs2_readi(struct gfs2_inode *ip, void *buf,
+	       uint64_t offset, unsigned int size)
+{
+	struct gfs2_sbd *sdp = ip->i_sbd;
+	struct gfs2_buffer_head *bh;
+	uint64_t lblock, dblock;
+	unsigned int o;
+	uint32_t extlen = 0;
+	unsigned int amount;
+	int not_new = 0;
+	int isdir = !!(S_ISDIR(ip->i_di.di_mode));
+	int journaled = ip->i_di.di_flags & GFS2_DIF_JDATA;
+	int copied = 0;
+
+	if (offset >= ip->i_di.di_size)
+		return 0;
+
+	if ((offset + size) > ip->i_di.di_size)
+		size = ip->i_di.di_size - offset;
+
+	if (!size)
+		return 0;
+
+	/* Journaled blocks (jdata files on gfs1, directories on gfs2) hold
+	   sd_jbsize bytes of payload after a meta header */
+	if ((sdp->gfs1 && journaled) || (!sdp->gfs1 && isdir)) {
+		lblock = offset;
+		o = lblock % sdp->sd_jbsize;
+		lblock /= sdp->sd_jbsize;
+	} else {
+		lblock = offset >> sdp->sd_sb.sb_bsize_shift;
+		o = offset & (sdp->bsize - 1);
+	}
+
+	if (inode_is_stuffed(ip))
+		o += sizeof(struct gfs2_dinode);
+	else if ((sdp->gfs1 && journaled) || (!sdp->gfs1 && isdir))
+		o += sizeof(struct gfs2_meta_header);
+
+	while (copied < size) {
+		amount = size - copied;
+		if (amount > sdp->bsize - o)
+			amount = sdp->bsize - o;
+
+		if (!extlen) {
+			if (sdp->gfs1)
+				gfs1_block_map(ip, lblock, &not_new, &dblock,
+					       &extlen, FALSE);
+			else
+				block_map(ip, lblock, &not_new, &dblock,
+					  &extlen, FALSE);
+		}
+
+		if (dblock) {
+			if (dblock == ip->i_di.di_num.no_addr)
+				bh = ip->i_bh;
+			else
+				bh = bread(sdp, dblock);
+			dblock++;
+			extlen--;
+		} else
+			bh = NULL;
+
+		copy2mem(bh, &buf, o, amount);
+		if (bh && bh != ip->i_bh)
+			brelse(bh);
+
+		copied += amount;
+		lblock++;
+
+		if (sdp->gfs1)
+			o = (journaled) ? sizeof(struct gfs2_meta_header) : 0;
+		else
+			o = (isdir) ? sizeof(struct gfs2_meta_header) : 0;
+	}
+
+	return copied;
+}
+
+/* Copy 'size' bytes from *buf into bh at 'offset', mark bh modified and
+   advance *buf past them. */
+static void copy_from_mem(struct gfs2_buffer_head *bh, void **buf,
+			  unsigned int offset, unsigned int size)
+{
+	char **p = (char **)buf;
+
+	memcpy(bh->b_data + offset, *p, size);
+	bmodified(bh);
+	*p += size;
+}
+
+/* Write 'size' bytes from buf to ip starting at 'offset', allocating blocks
+   as needed. A stuffed inode is unstuffed first if the write would not fit
+   in the dinode block. If 'resize' is non-zero and the write ends beyond the
+   current size, di_size is extended. Returns the number of bytes written. */
+int __gfs2_writei(struct gfs2_inode *ip, void *buf,
+		  uint64_t offset, unsigned int size, int resize)
+{
+	struct gfs2_sbd *sdp = ip->i_sbd;
+	struct gfs2_buffer_head *bh;
+	uint64_t lblock, dblock;
+	unsigned int o;
+	uint32_t extlen = 0;
+	unsigned int amount;
+	int new;
+	int isdir = !!(S_ISDIR(ip->i_di.di_mode));
+	const uint64_t start = offset;
+	int copied = 0;
+
+	if (!size)
+		return 0;
+
+	if (inode_is_stuffed(ip) &&
+	    ((start + size) > (sdp->bsize - sizeof(struct gfs2_dinode))))
+		unstuff_dinode(ip);
+
+	if (isdir) {
+		lblock = offset;
+		o = lblock % sdp->sd_jbsize;
+		lblock /= sdp->sd_jbsize;
+	} else {
+		lblock = offset >> sdp->sd_sb.sb_bsize_shift;
+		o = offset & (sdp->bsize - 1);
+	}
+
+	if (inode_is_stuffed(ip))
+		o += sizeof(struct gfs2_dinode);
+	else if (isdir)
+		o += sizeof(struct gfs2_meta_header);
+
+	while (copied < size) {
+		amount = size - copied;
+		if (amount > sdp->bsize - o)
+			amount = sdp->bsize - o;
+
+		/* extlen starts at 0, so 'new' is always set before it is read */
+		if (!extlen) {
+			new = TRUE;
+			block_map(ip, lblock, &new, &dblock, &extlen, FALSE);
+		}
+
+		if (new) {
+			bh = bget(sdp, dblock);
+			if (isdir) {
+				struct gfs2_meta_header mh;
+				mh.mh_magic = GFS2_MAGIC;
+				mh.mh_type = GFS2_METATYPE_JD;
+				mh.mh_format = GFS2_FORMAT_JD;
+				gfs2_meta_header_out(&mh, bh->b_data);
+				bmodified(bh);
+			}
+		} else {
+			if (dblock == ip->i_di.di_num.no_addr)
+				bh = ip->i_bh;
+			else
+				bh = bread(sdp, dblock);
+		}
+		copy_from_mem(bh, &buf, o, amount);
+		if (bh != ip->i_bh)
+			brelse(bh);
+
+		copied += amount;
+		lblock++;
+		dblock++;
+		extlen--;
+
+		o = (isdir) ? sizeof(struct gfs2_meta_header) : 0;
+	}
+
+	if (resize && ip->i_di.di_size < start + copied) {
+		bmodified(ip->i_bh);
+		ip->i_di.di_size = start + copied;
+	}
+
+	return copied;
+}
+
+/* Return the buffer for logical block 'lbn' of ip, unstuffing the inode and
+   allocating the block if necessary. Unless 'prealloc' is set, a newly
+   allocated block also extends di_size to cover it.
+   Exits the process if the block cannot be mapped. */
+struct gfs2_buffer_head *get_file_buf(struct gfs2_inode *ip, uint64_t lbn,
+				      int prealloc)
+{
+	struct gfs2_sbd *sdp = ip->i_sbd;
+	uint64_t dbn;
+	int new = TRUE;
+
+	if (inode_is_stuffed(ip))
+		unstuff_dinode(ip);
+
+	block_map(ip, lbn, &new, &dbn, NULL, prealloc);
+	if (!dbn) {
+		fprintf(stderr, "get_file_buf\n");
+		exit(1);
+	}
+
+	if (!prealloc && new &&
+	    ip->i_di.di_size < (lbn + 1) << sdp->sd_sb.sb_bsize_shift) {
+		bmodified(ip->i_bh);
+		ip->i_di.di_size = (lbn + 1) << sdp->sd_sb.sb_bsize_shift;
+	}
+	if (dbn == ip->i_di.di_num.no_addr)
+		return ip->i_bh;
+	else
+		return bread(sdp, dbn);
+}
+
+/* Point *dent at the first dirent in bh. Returns IS_LEAF if bh holds a leaf
+   block; otherwise bh is treated as a dinode block and IS_DINODE is returned. */
+int gfs2_dirent_first(struct gfs2_inode *dip, struct gfs2_buffer_head *bh,
+		      struct gfs2_dirent **dent)
+{
+	struct gfs2_meta_header *h = (struct gfs2_meta_header *)bh->b_data;
+
+	if (be32_to_cpu(h->mh_type) == GFS2_METATYPE_LF) {
+		*dent = (struct gfs2_dirent *)(bh->b_data + sizeof(struct gfs2_leaf));
+		return IS_LEAF;
+	} else {
+		*dent = (struct gfs2_dirent *)(bh->b_data + sizeof(struct gfs2_dinode));
+		return IS_DINODE;
+	}
+}
+
+/* Advance *dent to the next dirent in bh. Returns 0 on success, or -ENOENT
+   if the record length is 0 or the next dirent would start at or beyond the
+   end of the block; *dent is left unchanged in that case. */
+int gfs2_dirent_next(struct gfs2_inode *dip, struct gfs2_buffer_head *bh,
+		     struct gfs2_dirent **dent)
+{
+	char *bh_end;
+	uint16_t cur_rec_len;
+
+	bh_end = bh->b_data + dip->i_sbd->bsize;
+	cur_rec_len = be16_to_cpu((*dent)->de_rec_len);
+
+	if (cur_rec_len == 0 || (char *)(*dent) + cur_rec_len >= bh_end)
+		return -ENOENT;
+
+	*dent = (struct gfs2_dirent *)((char *)(*dent) + cur_rec_len);
+
+	return 0;
+}
+
+/**
+ * Allocate a gfs2 dirent
+ * The dirent comes from an unused record that is large enough, or is split
+ * off the spare space at the end of a record that is in use. On success the
+ * directory's entry count is incremented.
+ * Returns 0 on success, with *dent_out pointing to the new dirent,
+ * or -1 on failure, with errno set
+ */
+static int dirent_alloc(struct gfs2_inode *dip, struct gfs2_buffer_head *bh,
+			int name_len, struct gfs2_dirent **dent_out)
+{
+	struct gfs2_dirent *dent, *new;
+	unsigned int rec_len = GFS2_DIRENT_SIZE(name_len);
+	unsigned int entries = 0, offset = 0;
+	int type;
+
+	type = gfs2_dirent_first(dip, bh, &dent);
+
+	if (type == IS_LEAF) {
+		struct gfs2_leaf *leaf = (struct gfs2_leaf *)bh->b_data;
+		entries = be16_to_cpu(leaf->lf_entries);
+		offset = sizeof(struct gfs2_leaf);
+	} else {
+		struct gfs2_dinode *dinode = (struct gfs2_dinode *)bh->b_data;
+		entries = be32_to_cpu(dinode->di_entries);
+		offset = sizeof(struct gfs2_dinode);
+	}
+
+	/* An empty block: the first dirent takes all the space after the header */
+	if (!entries) {
+		dent->de_rec_len = cpu_to_be16(dip->i_sbd->bsize - offset);
+		dent->de_name_len = cpu_to_be16(name_len);
+		bmodified(bh);
+		*dent_out = dent;
+		dip->i_di.di_entries++;
+		bmodified(dip->i_bh);
+		return 0;
+	}
+
+	do {
+		uint16_t cur_rec_len;
+		uint16_t cur_name_len;
+		uint16_t new_rec_len;
+
+		cur_rec_len = be16_to_cpu(dent->de_rec_len);
+		cur_name_len = be16_to_cpu(dent->de_name_len);
+
+		if ((!dent->de_inum.no_formal_ino && cur_rec_len >= rec_len) ||
+		    (cur_rec_len >= GFS2_DIRENT_SIZE(cur_name_len) + rec_len)) {
+
+			if (dent->de_inum.no_formal_ino) {
+				/* In use: carve the new dirent out of its slack space */
+				new = (struct gfs2_dirent *)((char *)dent +
+							    GFS2_DIRENT_SIZE(cur_name_len));
+				memset(new, 0, sizeof(struct gfs2_dirent));
+
+				new->de_rec_len = cpu_to_be16(cur_rec_len -
+					  GFS2_DIRENT_SIZE(cur_name_len));
+				new->de_name_len = cpu_to_be16(name_len);
+
+				new_rec_len = be16_to_cpu(new->de_rec_len);
+				dent->de_rec_len = cpu_to_be16(cur_rec_len - new_rec_len);
+
+				*dent_out = new;
+				bmodified(bh);
+				dip->i_di.di_entries++;
+				bmodified(dip->i_bh);
+				return 0;
+			}
+
+			/* Unused and big enough: reuse it as it is */
+			dent->de_name_len = cpu_to_be16(name_len);
+
+			*dent_out = dent;
+			bmodified(bh);
+			dip->i_di.di_entries++;
+			bmodified(dip->i_bh);
+			return 0;
+		}
+	} while (gfs2_dirent_next(dip, bh, &dent) == 0);
+
+	errno = ENOSPC;
+	return -1;
+}
+
+/* Remove dirent 'cur' from bh and decrement the entry counts. With no
+   previous dirent the entry is just marked unused by zeroing its inode
+   number; otherwise its record is merged into 'prev'. */
+void dirent2_del(struct gfs2_inode *dip, struct gfs2_buffer_head *bh,
+		 struct gfs2_dirent *prev, struct gfs2_dirent *cur)
+{
+	uint16_t cur_rec_len, prev_rec_len;
+
+	bmodified(bh);
+	if (gfs2_check_meta(bh, GFS2_METATYPE_LF) == 0) {
+		struct gfs2_leaf *lf = (struct gfs2_leaf *)bh->b_data;
+
+		lf->lf_entries = be16_to_cpu(lf->lf_entries) - 1;
+		lf->lf_entries = cpu_to_be16(lf->lf_entries);
+	}
+
+	if (dip->i_di.di_entries) {
+		bmodified(dip->i_bh);
+		dip->i_di.di_entries--;
+	}
+	if (!prev) {
+		cur->de_inum.no_addr = 0;
+		cur->de_inum.no_formal_ino = 0;
+		return;
+	}
+
+	prev_rec_len = be16_to_cpu(prev->de_rec_len);
+	cur_rec_len = be16_to_cpu(cur->de_rec_len);
+
+	prev_rec_len += cur_rec_len;
+	prev->de_rec_len = cpu_to_be16(prev_rec_len);
+}
+
+/* Read entry 'lindex' of the directory's hash table into *ptr in CPU byte
+   order. Returns 0 on success or -1 if the entry could not be read in full. */
+int lgfs2_get_leaf_ptr(struct gfs2_inode *dip, const uint32_t lindex, uint64_t *ptr)
+{
+	uint64_t leaf_no;
+	int count = gfs2_readi(dip, (char *)&leaf_no, lindex * sizeof(uint64_t), sizeof(uint64_t));
+	if (count != sizeof(uint64_t))
+		return -1;
+
+	*ptr = be64_to_cpu(leaf_no);
+	return 0;
+}
+
+/* Split the leaf in obh. A new leaf block is allocated, the first half of
+   the hash table entries from 'start' are pointed at it, and the dirents
+   whose hash is below the dividing value are moved into it. Both leaves end
+   up one level deeper. Exits the process on any failure. */
+void dir_split_leaf(struct gfs2_inode *dip, uint32_t start, uint64_t leaf_no,
+		    struct gfs2_buffer_head *obh)
+{
+	struct gfs2_buffer_head *nbh;
+	struct gfs2_leaf *nleaf, *oleaf;
+	struct gfs2_dirent *dent, *prev = NULL, *next = NULL, *new;
+	uint32_t len, half_len, divider;
+	uint64_t bn, *lp;
+	uint32_t name_len;
+	int x, moved = FALSE;
+	int count;
+
+	if (lgfs2_meta_alloc(dip, &bn))
+		exit(1);
+	nbh = bget(dip->i_sbd, bn);
+	{
+		struct gfs2_meta_header mh;
+		mh.mh_magic = GFS2_MAGIC;
+		mh.mh_type = GFS2_METATYPE_LF;
+		mh.mh_format = GFS2_FORMAT_LF;
+		gfs2_meta_header_out(&mh, nbh->b_data);
+		bmodified(nbh);
+		buffer_clear_tail(dip->i_sbd, nbh,
+				  sizeof(struct gfs2_meta_header));
+	}
+
+	nleaf = (struct gfs2_leaf *)nbh->b_data;
+	nleaf->lf_dirent_format = cpu_to_be32(GFS2_FORMAT_DE);
+
+	oleaf = (struct gfs2_leaf *)obh->b_data;
+
+	/* Number of hash table entries that currently point at the old leaf */
+	len = 1 << (dip->i_di.di_depth - be16_to_cpu(oleaf->lf_depth));
+	half_len = len >> 1;
+
+	lp = calloc(1, half_len * sizeof(uint64_t));
+	if (lp == NULL) {
+		fprintf(stderr, "Out of memory in %s\n", __FUNCTION__);
+		exit(-1);
+	}
+	for (x = 0; x < half_len; x++)
+		lp[x] = cpu_to_be64(bn);
+
+	if (dip->i_sbd->gfs1)
+		count = gfs1_writei(dip, (char *)lp, start * sizeof(uint64_t),
+				    half_len * sizeof(uint64_t));
+	else
+		count = gfs2_writei(dip, (char *)lp, start * sizeof(uint64_t),
+				    half_len * sizeof(uint64_t));
+	if (count != half_len * sizeof(uint64_t)) {
+		fprintf(stderr, "dir_split_leaf (2)\n");
+		exit(1);
+	}
+
+	free(lp);
+
+	divider = (start + half_len) << (32 - dip->i_di.di_depth);
+
+	gfs2_dirent_first(dip, obh, &dent);
+
+	do {
+		/* Find the successor first: deleting dent changes the records */
+		next = dent;
+		if (gfs2_dirent_next(dip, obh, &next))
+			next = NULL;
+
+		if (dent->de_inum.no_formal_ino &&
+		    be32_to_cpu(dent->de_hash) < divider) {
+			name_len = be16_to_cpu(dent->de_name_len);
+
+			if (dirent_alloc(dip, nbh, name_len, &new)) {
+				fprintf(stderr, "dir_split_leaf (3)\n");
+				exit(1);
+			}
+
+			new->de_inum = dent->de_inum;
+			new->de_hash = dent->de_hash;
+			new->de_type = dent->de_type;
+			memcpy((char *)(new + 1), (char *)(dent + 1), name_len);
+
+			nleaf->lf_entries = be16_to_cpu(nleaf->lf_entries) + 1;
+			nleaf->lf_entries = cpu_to_be16(nleaf->lf_entries);
+
+			dirent2_del(dip, obh, prev, dent);
+
+			if (!prev)
+				prev = dent;
+
+			moved = TRUE;
+		} else
+			prev = dent;
+
+		dent = next;
+	} while (dent);
+
+	if (!moved) {
+		if (dirent_alloc(dip, nbh, 0, &new)) {
+			fprintf(stderr, "dir_split_leaf (4)\n");
+			exit(1);
+		}
+		new->de_inum.no_formal_ino = 0;
+		/* Don't count the sentinel dirent as an entry */
+		dip->i_di.di_entries--;
+	}
+
+	oleaf->lf_depth = be16_to_cpu(oleaf->lf_depth) + 1;
+	oleaf->lf_depth = cpu_to_be16(oleaf->lf_depth);
+	nleaf->lf_depth = oleaf->lf_depth;
+
+#ifdef GFS2_HAS_LEAF_HINTS
+	nleaf->lf_inode = cpu_to_be64(dip->i_di.di_num.no_addr);
+#endif
+	dip->i_di.di_blocks++;
+	bmodified(dip->i_bh);
+
+	bmodified(obh); /* Need to do this in case nothing was moved */
+	bmodified(nbh);
+	brelse(nbh);
+}
+
+/* Double the size of the directory's hash table by writing every pointer
+   twice, then increment di_depth. The table is processed from its last
+   chunk to its first. Exits the process on any failure. */
+static void dir_double_exhash(struct gfs2_inode *dip)
+{
+	struct gfs2_sbd *sdp = dip->i_sbd;
+	uint64_t *buf;
+	uint64_t *from, *to;
+	uint64_t block;
+	int x;
+	int count;
+
+	/* The first third holds the chunk read in, the rest the doubled copy */
+	buf = calloc(1, 3 * sdp->sd_hash_bsize);
+	if (buf == NULL) {
+		fprintf(stderr, "Out of memory in %s\n", __FUNCTION__);
+		exit(-1);
+	}
+
+	for (block = dip->i_di.di_size >> sdp->sd_hash_bsize_shift; block--;) {
+		count = gfs2_readi(dip, (char *)buf,
+			      block * sdp->sd_hash_bsize,
+			      sdp->sd_hash_bsize);
+		if (count != sdp->sd_hash_bsize) {
+			fprintf(stderr, "dir_double_exhash (1)\n");
+			exit(1);
+		}
+
+		from = buf;
+		to = (uint64_t *)((char *)buf + sdp->sd_hash_bsize);
+
+		for (x = sdp->sd_hash_ptrs; x--; from++) {
+			*to++ = *from;
+			*to++ = *from;
+		}
+
+		if (sdp->gfs1)
+			count = gfs1_writei(dip, (char *)buf +
+					    sdp->sd_hash_bsize,
+					    block * sdp->bsize, sdp->bsize);
+		else
+			count = gfs2_writei(dip, (char *)buf +
+					    sdp->sd_hash_bsize,
+					    block * sdp->bsize, sdp->bsize);
+		if (count != sdp->bsize) {
+			fprintf(stderr, "dir_double_exhash (2)\n");
+			exit(1);
+		}
+	}
+
+	free(buf);
+
+	dip->i_di.di_depth++;
+	bmodified(dip->i_bh);
+}
+
+/**
+ * get_leaf - Get leaf
+ * @dip: The directory inode
+ * @leaf_no: The block number of the leaf
+ * @bhp: Receives the leaf's buffer on success, or NULL on failure
+ *
+ * Returns: 0 on success, error code otherwise
+ */
+
+int gfs2_get_leaf(struct gfs2_inode *dip, uint64_t leaf_no,
+		  struct gfs2_buffer_head **bhp)
+{
+	int error = 0;
+
+	*bhp = bread(dip->i_sbd, leaf_no);
+	/* Don't hand a failed read to the metadata check */
+	if (*bhp == NULL)
+		return -1;
+	error = gfs2_check_meta(*bhp, GFS2_METATYPE_LF);
+	if (error) {
+		brelse(*bhp);
+		/* Don't leave the caller holding a released buffer */
+		*bhp = NULL;
+	}
+	return error;
+}
+
+/**
+ * get_first_leaf - Get first leaf
+ * @dip: The GFS2 inode
+ * @index: The hash table index to look up
+ * @bh_out: Receives the buffer of the leaf that the entry points to
+ *
+ * Returns: 0 on success, error code otherwise
+ */
+
+static int get_first_leaf(struct gfs2_inode *dip, uint32_t lindex, struct gfs2_buffer_head **bh_out)
+{
+	uint64_t leaf_no;
+
+	if (lgfs2_get_leaf_ptr(dip, lindex, &leaf_no) != 0)
+		return -1;
+	*bh_out = bread(dip->i_sbd, leaf_no);
+	if (*bh_out == NULL)
+		return -1;
+	return 0;
+}
+
+/**
+ * get_next_leaf - Get next leaf
+ * @dip: The GFS2 inode
+ * @bh_in: The buffer
+ * @bh_out: Receives the buffer of the leaf that follows bh_in in its chain
+ *
+ * Returns: 0 on success, error code otherwise
+ */
+
+static int get_next_leaf(struct gfs2_inode *dip,struct gfs2_buffer_head *bh_in,
+			 struct gfs2_buffer_head **bh_out)
+{
+	struct gfs2_leaf *leaf;
+
+	leaf = (struct gfs2_leaf *)bh_in->b_data;
+
+	if (!leaf->lf_next)
+		return -1;
+	/* Check for a leaf that points to itself as "next" */
+	if (be64_to_cpu(leaf->lf_next) == bh_in->b_blocknr)
+		return -1;
+	*bh_out = bread(dip->i_sbd, be64_to_cpu(leaf->lf_next));
+	if (*bh_out == NULL)
+		return -ENOENT;
+	/* Check for a leaf pointing to a non-leaf */
+	if (gfs2_check_meta(*bh_out, GFS2_METATYPE_LF)) {
+		brelse(*bh_out);
+		*bh_out = NULL;
+		return -ENOENT;
+	}
+	return 0;
+}
+
+/* Add an entry to an exhash directory. If the target leaf is full it is
+   split, or the hash table is doubled, or the leaf chain is followed or
+   extended, in that order of preference.
+   Returns 0 on success or non-zero on failure; exits the process if a new
+   leaf block cannot be allocated. */
+static int dir_e_add(struct gfs2_inode *dip, const char *filename, int len,
+		      struct gfs2_inum *inum, unsigned int type)
+{
+	struct gfs2_buffer_head *bh, *nbh;
+	struct gfs2_leaf *leaf, *nleaf;
+	struct gfs2_dirent *dent;
+	uint32_t lindex, llen;
+	uint32_t hash;
+	uint64_t leaf_no, bn;
+	int err = 0;
+
+	hash = gfs2_disk_hash(filename, len);
+restart:
+	/* Have to kludge because (hash >> 32) gives hash for some reason. */
+	if (dip->i_di.di_depth)
+		lindex = hash >> (32 - dip->i_di.di_depth);
+	else
+		lindex = 0;
+
+	err = lgfs2_get_leaf_ptr(dip, lindex, &leaf_no);
+	if (err)
+		return err;
+
+	for (;;) {
+		bh = bread(dip->i_sbd, leaf_no);
+		/* A failed read must not be dereferenced below */
+		if (bh == NULL)
+			return -1;
+		leaf = (struct gfs2_leaf *)bh->b_data;
+
+		if (dirent_alloc(dip, bh, len, &dent)) {
+
+			if (be16_to_cpu(leaf->lf_depth) < dip->i_di.di_depth) {
+				llen = 1 << (dip->i_di.di_depth -
+					     be16_to_cpu(leaf->lf_depth));
+				dir_split_leaf(dip, lindex & ~(llen - 1),
+					       leaf_no, bh);
+				brelse(bh);
+				goto restart;
+
+			} else if (dip->i_di.di_depth < GFS2_DIR_MAX_DEPTH) {
+				brelse(bh);
+				dir_double_exhash(dip);
+				goto restart;
+
+			} else if (leaf->lf_next) {
+				leaf_no = be64_to_cpu(leaf->lf_next);
+				brelse(bh);
+				continue;
+
+			} else {
+				struct gfs2_meta_header mh;
+
+				if (lgfs2_meta_alloc(dip, &bn))
+					exit(1);
+				nbh = bget(dip->i_sbd, bn);
+				mh.mh_magic = GFS2_MAGIC;
+				mh.mh_type = GFS2_METATYPE_LF;
+				mh.mh_format = GFS2_FORMAT_LF;
+				gfs2_meta_header_out(&mh, nbh->b_data);
+				bmodified(nbh);
+
+				nleaf = (struct gfs2_leaf *)nbh->b_data;
+				nleaf->lf_depth = leaf->lf_depth;
+				nleaf->lf_dirent_format = cpu_to_be32(GFS2_FORMAT_DE);
+#ifdef GFS2_HAS_LEAF_HINTS
+				nleaf->lf_inode = cpu_to_be64(dip->i_di.di_num.no_addr);
+#endif
+				err = dirent_alloc(dip, nbh, len, &dent);
+				if (err) {
+					/* Release both buffers and leave the chain as it was */
+					brelse(nbh);
+					brelse(bh);
+					return err;
+				}
+				/* Link the new leaf in only once it is known to be usable */
+				leaf->lf_next = cpu_to_be64(bn);
+				dip->i_di.di_blocks++;
+				bmodified(dip->i_bh);
+				bmodified(bh);
+				brelse(bh);
+				bh = nbh;
+				leaf = nleaf;
+			}
+		}
+
+		gfs2_inum_out(inum, (char *)&dent->de_inum);
+		dent->de_hash = cpu_to_be32(hash);
+		dent->de_type = cpu_to_be16(type);
+		memcpy((char *)(dent + 1), filename, len);
+
+		leaf->lf_entries = be16_to_cpu(leaf->lf_entries) + 1;
+		leaf->lf_entries = cpu_to_be16(leaf->lf_entries);
+
+		bmodified(bh);
+		brelse(bh);
+		return err;
+	}
+}
+
+/* Convert a linear directory to exhash form. The dirents move from the
+   dinode block into a new leaf block, and the dinode is given a hash table
+   in which every pointer refers to that leaf.
+   Exits the process if the leaf block cannot be allocated. */
+static void dir_make_exhash(struct gfs2_inode *dip)
+{
+	struct gfs2_sbd *sdp = dip->i_sbd;
+	struct gfs2_dirent *dent;
+	struct gfs2_buffer_head *bh;
+	struct gfs2_leaf *leaf;
+	int y;
+	uint32_t x;
+	uint64_t *lp, bn;
+
+	if (lgfs2_meta_alloc(dip, &bn))
+		exit(1);
+	bh = bget(sdp, bn);
+	{
+		struct gfs2_meta_header mh;
+		mh.mh_magic = GFS2_MAGIC;
+		mh.mh_type = GFS2_METATYPE_LF;
+		mh.mh_format = GFS2_FORMAT_LF;
+		gfs2_meta_header_out(&mh, bh->b_data);
+		bmodified(bh);
+	}
+
+	leaf = (struct gfs2_leaf *)bh->b_data;
+	leaf->lf_dirent_format = cpu_to_be32(GFS2_FORMAT_DE);
+	leaf->lf_entries = cpu_to_be16(dip->i_di.di_entries);
+#ifdef GFS2_HAS_LEAF_HINTS
+	leaf->lf_inode = cpu_to_be64(dip->i_di.di_num.no_addr);
+#endif
+	buffer_copy_tail(sdp, bh, sizeof(struct gfs2_leaf),
+			 dip->i_bh, sizeof(struct gfs2_dinode));
+
+	/* Find the last dirent that is in use */
+	x = 0;
+	gfs2_dirent_first(dip, bh, &dent);
+
+	do {
+		if (!dent->de_inum.no_formal_ino)
+			continue;
+		if (++x == dip->i_di.di_entries)
+			break;
+	} while (gfs2_dirent_next(dip, bh, &dent) == 0);
+
+	/* Adjust that dirent's record length by the difference between the
+	   dinode and leaf header sizes */
+	dent->de_rec_len = be16_to_cpu(dent->de_rec_len);
+	dent->de_rec_len = cpu_to_be16(dent->de_rec_len +
+		sizeof(struct gfs2_dinode) - sizeof(struct gfs2_leaf));
+
+	/* no need to: bmodified(bh); (buffer_copy_tail does it) */
+	brelse(bh);
+
+	buffer_clear_tail(sdp, dip->i_bh, sizeof(struct gfs2_dinode));
+
+	lp = (uint64_t *)(dip->i_bh->b_data + sizeof(struct gfs2_dinode));
+
+	for (x = sdp->sd_hash_ptrs; x--; lp++)
+		*lp = cpu_to_be64(bn);
+
+	dip->i_di.di_size = sdp->bsize / 2;
+	dip->i_di.di_blocks++;
+	dip->i_di.di_flags |= GFS2_DIF_EXHASH;
+	dip->i_di.di_payload_format = 0;
+	/* no need: bmodified(dip->i_bh); buffer_clear_tail does it. */
+
+	/* di_depth is the position of the highest set bit of sd_hash_ptrs */
+	for (x = sdp->sd_hash_ptrs, y = -1; x; x >>= 1, y++) ;
+	dip->i_di.di_depth = y;
+
+	gfs2_dinode_out(&dip->i_di, dip->i_bh->b_data);
+	bwrite(dip->i_bh);
+}
+
+/* Add an entry to a linear directory. If the dinode block has no room, the
+   directory is converted to exhash form and the entry is added there.
+   Returns 0 on success or non-zero on failure. */
+static int dir_l_add(struct gfs2_inode *dip, const char *filename, int len,
+		      struct gfs2_inum *inum, unsigned int type)
+{
+	struct gfs2_dirent *dent;
+	int err = 0;
+
+	if (dirent_alloc(dip, dip->i_bh, len, &dent)) {
+		dir_make_exhash(dip);
+		err = dir_e_add(dip, filename, len, inum, type);
+		return err;
+	}
+
+	gfs2_inum_out(inum, (char *)&dent->de_inum);
+	dent->de_hash = gfs2_disk_hash(filename, len);
+	dent->de_hash = cpu_to_be32(dent->de_hash);
+	dent->de_type = cpu_to_be16(type);
+	memcpy((char *)(dent + 1), filename, len);
+	bmodified(dip->i_bh);
+	return err;
+}
+
+/* Add the entry 'filename' (of 'len' bytes) with the given inode number and
+   type to directory dip, using the exhash or linear code as the directory's
+   flags dictate. Returns 0 on success or non-zero on failure. */
+int dir_add(struct gfs2_inode *dip, const char *filename, int len,
+	    struct gfs2_inum *inum, unsigned int type)
+{
+	int err = 0;
+	if (dip->i_di.di_flags & GFS2_DIF_EXHASH)
+		err = dir_e_add(dip, filename, len, inum, type);
+	else
+		err = dir_l_add(dip, filename, len, inum, type);
+	return err;
+}
+
+/* Write a new dinode into *bhp, getting the buffer for inum->no_addr first if
+   *bhp is NULL. A directory is given its "." and ".." entries, the latter
+   pointing at *parent. 'gfs1' selects gfs1 directory type values.
+   Returns 0 on success, or 1 with errno set to EINVAL on failure. Note that
+   errno is set to EINVAL on the success path as well. */
+static int __init_dinode(struct gfs2_sbd *sdp, struct gfs2_buffer_head **bhp, struct gfs2_inum *inum,
+			 unsigned int mode, uint32_t flags, struct gfs2_inum *parent, int gfs1)
+{
+	struct gfs2_buffer_head *bh;
+	struct gfs2_dinode di = {{0}};
+	int is_dir;
+
+	if (gfs1)
+		is_dir = (IF2DT(mode) == GFS_FILE_DIR);
+	else
+		is_dir = S_ISDIR(mode);
+
+	errno = EINVAL;
+	if (bhp == NULL)
+		return 1;
+
+	if (*bhp == NULL) {
+		*bhp = bget(sdp, inum->no_addr);
+		if (*bhp == NULL)
+			return 1;
+	}
+
+	bh = *bhp;
+
+	di.di_header.mh_magic = GFS2_MAGIC;
+	di.di_header.mh_type = GFS2_METATYPE_DI;
+	di.di_header.mh_format = GFS2_FORMAT_DI;
+	di.di_num = *inum;
+	di.di_mode = mode;
+	di.di_nlink = 1;
+	di.di_blocks = 1;
+	di.di_atime = di.di_mtime = di.di_ctime = sdp->time;
+	di.di_goal_meta = di.di_goal_data = bh->b_blocknr;
+	di.di_flags = flags;
+
+	if (is_dir) {
+		struct gfs2_dirent de1, de2;
+
+		memset(&de1, 0, sizeof(struct gfs2_dirent));
+		de1.de_inum = di.di_num;
+		de1.de_hash = gfs2_disk_hash(".", 1);
+		de1.de_rec_len = GFS2_DIRENT_SIZE(1);
+		de1.de_name_len = 1;
+		de1.de_type = (gfs1 ? GFS_FILE_DIR : IF2DT(S_IFDIR));
+
+		/* ".." takes all the space that is left in the block */
+		memset(&de2, 0, sizeof(struct gfs2_dirent));
+		de2.de_inum = *parent;
+		de2.de_hash = gfs2_disk_hash("..", 2);
+		de2.de_rec_len = sdp->bsize - sizeof(struct gfs2_dinode) - de1.de_rec_len;
+		de2.de_name_len = 2;
+		de2.de_type = (gfs1 ? GFS_FILE_DIR : IF2DT(S_IFDIR));
+
+		gfs2_dirent_out(&de1, bh->b_data + sizeof(struct gfs2_dinode));
+		memcpy(bh->b_data +
+		       sizeof(struct gfs2_dinode) +
+		       sizeof(struct gfs2_dirent),
+		       ".", 1);
+		gfs2_dirent_out(&de2, bh->b_data + sizeof(struct gfs2_dinode) + de1.de_rec_len);
+		memcpy(bh->b_data +
+		       sizeof(struct gfs2_dinode) +
+		       de1.de_rec_len +
+		       sizeof(struct gfs2_dirent),
+		       "..", 2);
+
+		di.di_nlink = 2;
+		di.di_size = sdp->bsize - sizeof(struct gfs2_dinode);
+		di.di_flags |= GFS2_DIF_JDATA;
+		di.di_payload_format = GFS2_FORMAT_DE;
+		di.di_entries = 2;
+	}
+	gfs2_dinode_out(&di, bh->b_data);
+	bmodified(bh);
+	return 0;
+}
+
+/* Write a new gfs2 dinode; this is __init_dinode() with gfs1 set to 0. */
+int init_dinode(struct gfs2_sbd *sdp, struct gfs2_buffer_head **bhp, struct gfs2_inum *inum,
+		unsigned int mode, uint32_t flags, struct gfs2_inum *parent)
+{
+	return __init_dinode(sdp, bhp, inum, mode, flags, parent, 0);
+}
+
+/* Zero the region from start to end, then fill it with big-endian block
+   pointers ptr0 + *p, ptr0 + *p + 1, ... until the region is full or *p
+   reaches n. *p is advanced by the number of pointers written. */
+static void lgfs2_fill_indir(char *start, char *end, uint64_t ptr0, unsigned n, unsigned *p)
+{
+	char *bp;
+	memset(start, 0, end - start);
+	for (bp = start; bp < end && *p < n; bp += sizeof(uint64_t)) {
+		uint64_t pn = ptr0 + *p;
+		*(uint64_t *)bp = cpu_to_be64(pn);
+		(*p)++;
+	}
+}
+
+/**
+ *
Calculate and write the indirect blocks for a single-extent file of a given + * size. + * ip: The inode for which to write indirect blocks, with fields already set + * appropriately (see lgfs2_file_alloc). + * Returns 0 on success or non-zero with errno set on failure. + */ +int lgfs2_write_filemeta(struct gfs2_inode *ip) +{ + unsigned height = 0; + struct metapath mp; + struct gfs2_sbd *sdp = ip->i_sbd; + uint64_t dblocks = (ip->i_di.di_size + sdp->bsize - 1) / sdp->bsize; + uint64_t ptr0 = ip->i_di.di_num.no_addr + 1; + unsigned ptrs = 1; + struct gfs2_meta_header mh = { + .mh_magic = GFS2_MAGIC, + .mh_type = GFS2_METATYPE_IN, + .mh_format = GFS2_FORMAT_IN, + }; + struct gfs2_buffer_head *bh = bget(sdp, ip->i_di.di_num.no_addr); + if (bh == NULL) + return 1; + + /* Using find_metapath() to find the last data block in the file will + effectively give a remainder for the number of pointers at each + height. Just need to add 1 to convert ptr index to quantity later. */ + find_metapath(ip, dblocks - 1, &mp); + + for (height = 0; height < ip->i_di.di_height; height++) { + unsigned p; + /* The number of pointers in this height will be the number of + full indirect blocks pointed to by the previous height + multiplied by the pointer capacity of an indirect block, + plus the remainder which find_metapath() gave us. 
*/ + ptrs = ((ptrs - 1) * sdp->sd_inptrs) + mp.mp_list[height] + 1; + + for (p = 0; p < ptrs; bh->b_blocknr++) { + char *start = bh->b_data; + if (height == 0) { + start += sizeof(struct gfs2_dinode); + gfs2_dinode_out(&ip->i_di, bh->b_data); + } else { + start += sizeof(struct gfs2_meta_header); + gfs2_meta_header_out(&mh, bh->b_data); + } + lgfs2_fill_indir(start, bh->b_data + sdp->bsize, ptr0, ptrs, &p); + if (bwrite(bh)) { + free(bh); + return 1; + } + } + ptr0 += ptrs; + } + free(bh); + return 0; +} + +static struct gfs2_inode *__createi(struct gfs2_inode *dip, + const char *filename, unsigned int mode, + uint32_t flags, int if_gfs1) +{ + struct gfs2_sbd *sdp = dip->i_sbd; + uint64_t bn; + struct gfs2_inum inum; + struct gfs2_buffer_head *bh = NULL; + struct gfs2_inode *ip; + int err = 0; + int is_dir; + + gfs2_lookupi(dip, filename, strlen(filename), &ip); + if (!ip) { + err = lgfs2_dinode_alloc(sdp, 1, &bn); + if (err != 0) + return NULL; + + if (if_gfs1) + inum.no_formal_ino = bn; + else + inum.no_formal_ino = sdp->md.next_inum++; + inum.no_addr = bn; + + err = dir_add(dip, filename, strlen(filename), &inum, IF2DT(mode)); + if (err) + return NULL; + + if (if_gfs1) + is_dir = (IF2DT(mode) == GFS_FILE_DIR); + else + is_dir = S_ISDIR(mode); + if (is_dir) { + bmodified(dip->i_bh); + dip->i_di.di_nlink++; + } + + err = __init_dinode(sdp, &bh, &inum, mode, flags, &dip->i_di.di_num, if_gfs1); + if (err != 0) + return NULL; + + ip = lgfs2_inode_get(sdp, bh); + if (ip == NULL) + return NULL; + bmodified(bh); + } + ip->bh_owned = 1; + return ip; +} + +struct gfs2_inode *createi(struct gfs2_inode *dip, const char *filename, + unsigned int mode, uint32_t flags) +{ + return __createi(dip, filename, mode, flags, 0); +} + +struct gfs2_inode *gfs_createi(struct gfs2_inode *dip, const char *filename, + unsigned int mode, uint32_t flags) +{ + return __createi(dip, filename, mode, flags, 1); +} + +/** + * gfs2_filecmp - Compare two filenames + * @file1: The first filename + * 
@file2: The second filename + * @len_of_file2: The length of the second file + * + * This routine compares two filenames and returns 1 if they are equal. + * + * Returns: 1 if the files are the same, otherwise 0. + */ + +static int gfs2_filecmp(const char *file1, const char *file2, int len_of_file2) +{ + if (strlen(file1) != len_of_file2) + return 0; + if (memcmp(file1, file2, len_of_file2)) + return 0; + return 1; +} + +/** + * leaf_search - Search the dirents of one directory block for a name + * @dip: The directory inode (supplies the entry count for a dinode block) + * @bh: The block to search, classified by gfs2_dirent_first() as a leaf + * or as the dinode itself + * @filename: The name to look for + * @len: The length of @filename, used to compute the hash + * @dent_out: Set to the matching dirent on success + * @dent_prev: If not NULL, set on success to the dirent visited just before + * the match (NULL when the match is the first dirent) + * + * Returns: 0 if found, -ENOENT if the block was walked to its end without a + * match, -1 if @bh is neither a leaf nor a dinode or if more + * in-use, non-matching dirents were seen than the entry count of + * the block allows + */ +static int leaf_search(struct gfs2_inode *dip, struct gfs2_buffer_head *bh, + const char *filename, int len, + struct gfs2_dirent **dent_out, + struct gfs2_dirent **dent_prev) +{ + uint32_t hash; + struct gfs2_dirent *dent, *prev = NULL; + unsigned int entries = 0, x = 0; + int type; + + type = gfs2_dirent_first(dip, bh, &dent); + + if (type == IS_LEAF){ + struct gfs2_leaf *leaf = (struct gfs2_leaf *)bh->b_data; + entries = be16_to_cpu(leaf->lf_entries); + } else if (type == IS_DINODE) + entries = dip->i_di.di_entries; + else + return -1; + + hash = gfs2_disk_hash(filename, len); + + do{ + if (!dent->de_inum.no_formal_ino){ + prev = dent; + continue; + } + + if (be32_to_cpu(dent->de_hash) == hash && + gfs2_filecmp(filename, (char *)(dent + 1), + be16_to_cpu(dent->de_name_len))) { + *dent_out = dent; + if (dent_prev) + *dent_prev = prev; + return 0; + } + + if(x >= entries) + return -1; + x++; + prev = dent; + } while (gfs2_dirent_next(dip, bh, &dent) == 0); + + return -ENOENT; +} + +/** + * linked_leaf_search - Search the leaf chain selected by the hash of a name + * @dip: The GFS2 inode + * @filename: The name to look for + * @len: The length of @filename + * @dent_out: Set to the matching dirent on success + * @bh_out: Set on success to the buffer that holds @dent_out; it is + * returned without being released + * + * Returns: 0 on success, error code otherwise + */ + +static int linked_leaf_search(struct gfs2_inode *dip, const char *filename, + int len, struct gfs2_dirent **dent_out, + struct gfs2_buffer_head **bh_out) +{ + struct gfs2_buffer_head *bh = NULL, *bh_next; + uint32_t hsize, lindex; + uint32_t hash; + int error = 0; + + hsize = 1 << 
dip->i_di.di_depth; + if(hsize * sizeof(uint64_t) != dip->i_di.di_size) + return -1; + + /* Figure out the address of the leaf node. */ + + hash = gfs2_disk_hash(filename, len); + lindex = hash >> (32 - dip->i_di.di_depth); + + error = get_first_leaf(dip, lindex, &bh_next); + if (error) + return error; + if (bh_next == NULL) + return errno; + + /* Find the entry */ + do{ + if (bh && bh != dip->i_bh) + brelse(bh); + + bh = bh_next; + + error = leaf_search(dip, bh, filename, len, dent_out, NULL); + switch (error){ + case 0: + *bh_out = bh; + return 0; + + case -ENOENT: + break; + + default: + if (bh && bh != dip->i_bh) + brelse(bh); + return error; + } + + error = get_next_leaf(dip, bh, &bh_next); + } while (!error && bh_next != NULL); + + if (bh && bh != dip->i_bh) + brelse(bh); + + return error; +} + +/** + * dir_e_search - + * @dip: The GFS2 inode + * @id: + * @inode: + * + * Returns: + */ +static int dir_e_search(struct gfs2_inode *dip, const char *filename, + int len, unsigned int *type, struct gfs2_inum *inum) +{ + struct gfs2_buffer_head *bh = NULL; + struct gfs2_dirent *dent; + int error; + + error = linked_leaf_search(dip, filename, len, &dent, &bh); + if (error) + return error; + + gfs2_inum_in(inum, (char *)&dent->de_inum); + if (type) + *type = be16_to_cpu(dent->de_type); + + brelse(bh); + + return 0; +} + + +/** + * dir_l_search - + * @dip: The GFS2 inode + * @id: + * @inode: + * + * Returns: + */ +static int dir_l_search(struct gfs2_inode *dip, const char *filename, + int len, unsigned int *type, struct gfs2_inum *inum) +{ + struct gfs2_dirent *dent; + int error; + + if(!inode_is_stuffed(dip)) + return -1; + + error = leaf_search(dip, dip->i_bh, filename, len, &dent, NULL); + if (!error) { + gfs2_inum_in(inum, (char *)&dent->de_inum); + if(type) + *type = be16_to_cpu(dent->de_type); + } + return error; +} + +/** + * dir_search - Search a directory + * @dip: The GFS inode + * @id + * @type: + * + * This routine searches a directory for a file or another 
directory + * given its filename. The component of the identifier that is + * not being used to search will be filled in and must be freed by + * the caller. + * + * Returns: 0 if found, -1 on failure, -ENOENT if not found. + */ +int dir_search(struct gfs2_inode *dip, const char *filename, int len, + unsigned int *type, struct gfs2_inum *inum) +{ + int error; + + if(!S_ISDIR(dip->i_di.di_mode) && !is_gfs_dir(&dip->i_di)) + return -1; + + if (dip->i_di.di_flags & GFS2_DIF_EXHASH) + error = dir_e_search(dip, filename, len, type, inum); + else + error = dir_l_search(dip, filename, len, type, inum); + + return error; +} + +static int dir_e_del(struct gfs2_inode *dip, const char *filename, int len) +{ + int lindex; + int error; + int found = 0; + uint64_t leaf_no; + struct gfs2_buffer_head *bh = NULL; + struct gfs2_dirent *cur, *prev; + + lindex = (1 << (dip->i_di.di_depth))-1; + + for(; (lindex >= 0) && !found; lindex--){ + error = lgfs2_get_leaf_ptr(dip, lindex, &leaf_no); + if (error) + return error; + + while(leaf_no && !found){ + bh = bread(dip->i_sbd, leaf_no); + error = leaf_search(dip, bh, filename, len, &cur, &prev); + if (error) { + if(error != -ENOENT){ + brelse(bh); + return -1; + } + leaf_no = be64_to_cpu(((struct gfs2_leaf *)bh->b_data)->lf_next); + brelse(bh); + } else + found = 1; + } + } + + if(!found) + return 1; + + if (bh) { + dirent2_del(dip, bh, prev, cur); + brelse(bh); + } + return 0; +} + +static int dir_l_del(struct gfs2_inode *dip, const char *filename, int len) +{ + int error=0; + struct gfs2_dirent *cur, *prev; + + if(!inode_is_stuffed(dip)) + return -1; + + error = leaf_search(dip, dip->i_bh, filename, len, &cur, &prev); + if (error) { + if (error == -ENOENT) + return 1; + else + return -1; + } + + dirent2_del(dip, dip->i_bh, prev, cur); + return 0; +} + + +/* + * gfs2_dirent_del + * @dip + * filename + * + * Delete a directory entry from a directory. This _only_ + * removes the directory entry - leaving the dinode in + * place. 
(Likely without a link.) + * + * Returns: 0 on success, 1 if the entry does not exist, -1 (or the error from the leaf pointer lookup) on failure + */ +int gfs2_dirent_del(struct gfs2_inode *dip, const char *filename, int len) +{ + int error; + + if(!S_ISDIR(dip->i_di.di_mode) && !is_gfs_dir(&dip->i_di)) + return -1; + + if (dip->i_di.di_flags & GFS2_DIF_EXHASH) + error = dir_e_del(dip, filename, len); + else + error = dir_l_del(dip, filename, len); + bmodified(dip->i_bh); + return error; +} + +/** + * gfs2_lookupi - Look up a filename in a directory and return its inode + * @dip: The directory to search + * @filename: The name of the inode to look for + * @len: The length of @filename + * @ipp: Used to return the found inode if any + * + * Returns: 0 on success, -EXXXX on failure + */ +int gfs2_lookupi(struct gfs2_inode *dip, const char *filename, int len, + struct gfs2_inode **ipp) +{ + struct gfs2_sbd *sdp = dip->i_sbd; + int error = 0; + struct gfs2_inum inum; + + *ipp = NULL; + + if (!len || len > GFS2_FNAMESIZE) + return -ENAMETOOLONG; + if (gfs2_filecmp(filename, (char *)".", 1)) { + *ipp = dip; + return 0; + } + error = dir_search(dip, filename, len, NULL, &inum); + if (!error) + *ipp = lgfs2_inode_read(sdp, inum.no_addr); + + return error; +} + +/** + * gfs2_free_block - free up a block given its block number + */ +void gfs2_free_block(struct gfs2_sbd *sdp, uint64_t block) +{ + struct rgrp_tree *rgd; + + /* Adjust the free space count for the freed block */ + rgd = gfs2_blk2rgrpd(sdp, block); /* find the rg for indir block */ + if (rgd) { + gfs2_set_bitmap(rgd, block, GFS2_BLKST_FREE); + rgd->rg.rg_free++; /* adjust the free count */ + if (sdp->gfs1) + gfs_rgrp_out((struct gfs_rgrp *)&rgd->rg, rgd->bits[0].bi_bh); + else + gfs2_rgrp_out(&rgd->rg, rgd->bits[0].bi_bh->b_data); + bmodified(rgd->bits[0].bi_bh); + sdp->blks_alloced--; + } +} + +/** + * gfs2_freedi - unlink a disk inode by block number. + * Note: currently only works for regular files. 
+ */ +int gfs2_freedi(struct gfs2_sbd *sdp, uint64_t diblock) +{ + struct gfs2_inode *ip; + struct gfs2_buffer_head *bh, *nbh; + int h, head_size; + uint64_t *ptr, block; + struct rgrp_tree *rgd; + uint32_t height; + osi_list_t metalist[GFS2_MAX_META_HEIGHT]; + osi_list_t *cur_list, *next_list, *tmp; + + for (h = 0; h < GFS2_MAX_META_HEIGHT; h++) + osi_list_init(&metalist[h]); + + bh = bread(sdp, diblock); + if (bh == NULL) + return -1; + ip = lgfs2_inode_get(sdp, bh); + if (ip == NULL) + return -1; + height = ip->i_di.di_height; + osi_list_add(&bh->b_altlist, &metalist[0]); + + for (h = 0; h < height; h++){ + cur_list = &metalist[h]; + next_list = &metalist[h + 1]; + head_size = (h > 0 ? sizeof(struct gfs2_meta_header) : + sizeof(struct gfs2_dinode)); + + for (tmp = cur_list->next; tmp != cur_list; tmp = tmp->next){ + bh = osi_list_entry(tmp, struct gfs2_buffer_head, + b_altlist); + + for (ptr = (uint64_t *)(bh->b_data + head_size); + (char *)ptr < (bh->b_data + sdp->bsize); ptr++) { + if (!*ptr) + continue; + + block = be64_to_cpu(*ptr); + gfs2_free_block(sdp, block); + if (h == height - 1) /* if not metadata */ + continue; /* don't queue it up */ + /* Read the next metadata block in the chain */ + nbh = bread(sdp, block); + osi_list_add(&nbh->b_altlist, next_list); + brelse(nbh); + } + } + } + rgd = gfs2_blk2rgrpd(sdp, diblock); + gfs2_set_bitmap(rgd, diblock, GFS2_BLKST_FREE); + inode_put(&ip); + /* inode_put deallocated the extra block used by the disk inode, */ + /* so adjust it in the superblock struct */ + sdp->blks_alloced--; + rgd->rg.rg_free++; + rgd->rg.rg_dinodes--; + if (sdp->gfs1) + gfs_rgrp_out((struct gfs_rgrp *)&rgd->rg, rgd->bits[0].bi_bh); + else + gfs2_rgrp_out(&rgd->rg, rgd->bits[0].bi_bh->b_data); + bmodified(rgd->bits[0].bi_bh); + sdp->dinodes_alloced--; + return 0; +} diff --git a/gfs2/libgfs2/gfs1.c b/gfs2/libgfs2/gfs1.c new file mode 100644 index 0000000..b91cce5 --- /dev/null +++ b/gfs2/libgfs2/gfs1.c @@ -0,0 +1,391 @@ +#include 
"clusterautoconfig.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "osi_list.h" +#include "libgfs2.h" + +/* GFS1 compatibility functions - so that programs like gfs2_convert + and gfs2_edit can examine/manipulate GFS1 file systems. */ + +static __inline__ int fs_is_jdata(struct gfs2_inode *ip) +{ + return ip->i_di.di_flags & GFS2_DIF_JDATA; +} + +static __inline__ uint64_t * +gfs1_metapointer(struct gfs2_buffer_head *bh, unsigned int height, + struct metapath *mp) +{ + unsigned int head_size = (height > 0) ? + sizeof(struct gfs_indirect) : sizeof(struct gfs_dinode); + + return ((uint64_t *)(bh->b_data + head_size)) + mp->mp_list[height]; +} + +int is_gfs_dir(struct gfs2_dinode *dinode) +{ + if (dinode->__pad1 == GFS_FILE_DIR) + return 1; + return 0; +} + +void gfs1_lookup_block(struct gfs2_inode *ip, struct gfs2_buffer_head *bh, + unsigned int height, struct metapath *mp, + int create, int *new, uint64_t *block) +{ + uint64_t *ptr = gfs1_metapointer(bh, height, mp); + + if (*ptr) { + *block = be64_to_cpu(*ptr); + return; + } + + *block = 0; + + if (!create) + return; + + if (lgfs2_meta_alloc(ip, block)) { + *block = 0; + return; + } + + *ptr = cpu_to_be64(*block); + bmodified(bh); + ip->i_di.di_blocks++; + bmodified(ip->i_bh); + + *new = 1; +} + +void gfs1_block_map(struct gfs2_inode *ip, uint64_t lblock, int *new, + uint64_t *dblock, uint32_t *extlen, int prealloc) +{ + struct gfs2_sbd *sdp = ip->i_sbd; + struct gfs2_buffer_head *bh; + struct metapath mp; + int create = *new; + unsigned int bsize; + unsigned int height; + unsigned int end_of_metadata; + unsigned int x; + + *new = 0; + *dblock = 0; + if (extlen) + *extlen = 0; + + if (!ip->i_di.di_height) { /* stuffed */ + if (!lblock) { + *dblock = ip->i_di.di_num.no_addr; + if (extlen) + *extlen = 1; + } + return; + } + + bsize = (fs_is_jdata(ip)) ? 
sdp->sd_jbsize : sdp->bsize; + + height = calc_tree_height(ip, (lblock + 1) * bsize); + if (ip->i_di.di_height < height) { + if (!create) + return; + + build_height(ip, height); + } + + find_metapath(ip, lblock, &mp); + end_of_metadata = ip->i_di.di_height - 1; + + bh = ip->i_bh; + + for (x = 0; x < end_of_metadata; x++) { + gfs1_lookup_block(ip, bh, x, &mp, create, new, dblock); + if (bh != ip->i_bh) + brelse(bh); + if (!*dblock) + return; + + if (*new) { + struct gfs2_meta_header mh; + + bh = bget(sdp, *dblock); + mh.mh_magic = GFS2_MAGIC; + mh.mh_type = GFS2_METATYPE_IN; + mh.mh_format = GFS2_FORMAT_IN; + gfs2_meta_header_out(&mh, bh->b_data); + bmodified(bh); + } else { + if (*dblock == ip->i_di.di_num.no_addr) + bh = ip->i_bh; + else + bh = bread(sdp, *dblock); + } + } + + if (!prealloc) + gfs1_lookup_block(ip, bh, end_of_metadata, &mp, create, new, + dblock); + + if (extlen && *dblock) { + *extlen = 1; + + if (!*new) { + uint64_t tmp_dblock; + int tmp_new; + unsigned int nptrs; + + nptrs = (end_of_metadata) ? 
sdp->sd_inptrs : sdp->sd_diptrs; + + while (++mp.mp_list[end_of_metadata] < nptrs) { + gfs1_lookup_block(ip, bh, end_of_metadata, &mp, + FALSE, &tmp_new, + &tmp_dblock); + + if (*dblock + *extlen != tmp_dblock) + break; + + (*extlen)++; + } + } + } + + if (bh != ip->i_bh) + brelse(bh); +} + +int gfs1_writei(struct gfs2_inode *ip, char *buf, uint64_t offset, + unsigned int size) +{ + struct gfs2_sbd *sdp = ip->i_sbd; + struct gfs2_buffer_head *bh; + uint64_t lblock, dblock; + uint32_t extlen = 0; + unsigned int amount; + int new; + int journaled = fs_is_jdata(ip); + const uint64_t start = offset; + int copied = 0; + + if (!size) + return 0; + + if (!ip->i_di.di_height && /* stuffed */ + ((start + size) > (sdp->bsize - sizeof(struct gfs_dinode)))) + unstuff_dinode(ip); + + if (journaled) { + lblock = offset / sdp->sd_jbsize; + offset %= sdp->sd_jbsize; + } else { + lblock = offset >> sdp->sd_sb.sb_bsize_shift; + offset &= sdp->bsize - 1; + } + + if (!ip->i_di.di_height) /* stuffed */ + offset += sizeof(struct gfs_dinode); + else if (journaled) + offset += sizeof(struct gfs2_meta_header); + + while (copied < size) { + amount = size - copied; + if (amount > sdp->bsize - offset) + amount = sdp->bsize - offset; + + if (!extlen){ + new = TRUE; + gfs1_block_map(ip, lblock, &new, &dblock, &extlen, 0); + if (!dblock) + return -1; + } + + if (dblock == ip->i_di.di_num.no_addr) + bh = ip->i_bh; + else + bh = bread(sdp, dblock); + + if (journaled && dblock != ip->i_di.di_num.no_addr ) { + struct gfs2_meta_header mh; + + mh.mh_magic = GFS2_MAGIC; + mh.mh_type = GFS2_METATYPE_JD; + mh.mh_format = GFS2_FORMAT_JD; + gfs2_meta_header_out(&mh, bh->b_data); + } + + memcpy(bh->b_data + offset, buf + copied, amount); + bmodified(bh); + if (bh != ip->i_bh) + brelse(bh); + + copied += amount; + lblock++; + dblock++; + extlen--; + + offset = (journaled) ? 
sizeof(struct gfs2_meta_header) : 0; + } + + if (ip->i_di.di_size < start + copied) { + bmodified(ip->i_bh); + ip->i_di.di_size = start + copied; + } + ip->i_di.di_mtime = ip->i_di.di_ctime = time(NULL); + gfs2_dinode_out(&ip->i_di, ip->i_bh->b_data); + bmodified(ip->i_bh); + return copied; +} + +/* ------------------------------------------------------------------------ */ +/* gfs_dinode_in */ +/* ------------------------------------------------------------------------ */ +static void gfs_dinode_in(struct gfs_dinode *di, struct gfs2_buffer_head *bh) +{ + struct gfs_dinode *str = (struct gfs_dinode *)bh->b_data; + + gfs2_meta_header_in(&di->di_header, bh->b_data); + gfs2_inum_in(&di->di_num, (char *)&str->di_num); + + di->di_mode = be32_to_cpu(str->di_mode); + di->di_uid = be32_to_cpu(str->di_uid); + di->di_gid = be32_to_cpu(str->di_gid); + di->di_nlink = be32_to_cpu(str->di_nlink); + di->di_size = be64_to_cpu(str->di_size); + di->di_blocks = be64_to_cpu(str->di_blocks); + di->di_atime = be64_to_cpu(str->di_atime); + di->di_mtime = be64_to_cpu(str->di_mtime); + di->di_ctime = be64_to_cpu(str->di_ctime); + di->di_major = be32_to_cpu(str->di_major); + di->di_minor = be32_to_cpu(str->di_minor); + di->di_goal_dblk = be64_to_cpu(str->di_goal_dblk); + di->di_goal_mblk = be64_to_cpu(str->di_goal_mblk); + di->di_flags = be32_to_cpu(str->di_flags); + di->di_payload_format = be32_to_cpu(str->di_payload_format); + di->di_type = be16_to_cpu(str->di_type); + di->di_height = be16_to_cpu(str->di_height); + di->di_depth = be16_to_cpu(str->di_depth); + di->di_entries = be32_to_cpu(str->di_entries); + di->di_eattr = be64_to_cpu(str->di_eattr); +} + +static struct gfs2_inode *__gfs_inode_get(struct gfs2_sbd *sdp, + struct gfs2_buffer_head *bh, + uint64_t di_addr) +{ + struct gfs_dinode gfs1_dinode; + struct gfs2_inode *ip; + + ip = calloc(1, sizeof(struct gfs2_inode)); + if (ip == NULL) { + return NULL; + } + + ip->bh_owned = 0; + if (!bh) { + bh = bread(sdp, di_addr); + ip->bh_owned 
= 1; + } + gfs_dinode_in(&gfs1_dinode, bh); + memcpy(&ip->i_di.di_header, &gfs1_dinode.di_header, + sizeof(struct gfs2_meta_header)); + memcpy(&ip->i_di.di_num, &gfs1_dinode.di_num, + sizeof(struct gfs2_inum)); + ip->i_di.di_mode = gfs1_dinode.di_mode; + ip->i_di.di_uid = gfs1_dinode.di_uid; + ip->i_di.di_gid = gfs1_dinode.di_gid; + ip->i_di.di_nlink = gfs1_dinode.di_nlink; + ip->i_di.di_size = gfs1_dinode.di_size; + ip->i_di.di_blocks = gfs1_dinode.di_blocks; + ip->i_di.di_atime = gfs1_dinode.di_atime; + ip->i_di.di_mtime = gfs1_dinode.di_mtime; + ip->i_di.di_ctime = gfs1_dinode.di_ctime; + ip->i_di.di_major = gfs1_dinode.di_major; + ip->i_di.di_minor = gfs1_dinode.di_minor; + ip->i_di.di_goal_data = gfs1_dinode.di_goal_dblk; + ip->i_di.di_goal_meta = gfs1_dinode.di_goal_mblk; + ip->i_di.di_flags = gfs1_dinode.di_flags; + ip->i_di.di_payload_format = gfs1_dinode.di_payload_format; + ip->i_di.__pad1 = gfs1_dinode.di_type; + ip->i_di.di_height = gfs1_dinode.di_height; + ip->i_di.di_depth = gfs1_dinode.di_depth; + ip->i_di.di_entries = gfs1_dinode.di_entries; + ip->i_di.di_eattr = gfs1_dinode.di_eattr; + ip->i_bh = bh; + ip->i_sbd = sdp; + return ip; +} + +struct gfs2_inode *lgfs2_gfs_inode_get(struct gfs2_sbd *sdp, + struct gfs2_buffer_head *bh) +{ + return __gfs_inode_get(sdp, bh, 0); +} + +struct gfs2_inode *lgfs2_gfs_inode_read(struct gfs2_sbd *sdp, uint64_t di_addr) +{ + return __gfs_inode_get(sdp, NULL, di_addr); +} + +/* ------------------------------------------------------------------------ */ +/* gfs_jindex_in - read in a gfs1 jindex structure. 
*/ +/* ------------------------------------------------------------------------ */ +void gfs_jindex_in(struct gfs_jindex *jindex, char *jbuf) +{ + struct gfs_jindex *str = (struct gfs_jindex *) jbuf; + + jindex->ji_addr = be64_to_cpu(str->ji_addr); + jindex->ji_nsegment = be32_to_cpu(str->ji_nsegment); + jindex->ji_pad = be32_to_cpu(str->ji_pad); + memcpy(jindex->ji_reserved, str->ji_reserved, 64); +} + +/* ------------------------------------------------------------------------ */ +/* gfs_rgrp_in - Read in a resource group header */ +/* ------------------------------------------------------------------------ */ +void gfs_rgrp_in(struct gfs_rgrp *rgrp, struct gfs2_buffer_head *rbh) +{ + struct gfs_rgrp *str = (struct gfs_rgrp *)rbh->b_data; + + gfs2_meta_header_in(&rgrp->rg_header, rbh->b_data); + rgrp->rg_flags = be32_to_cpu(str->rg_flags); + rgrp->rg_free = be32_to_cpu(str->rg_free); + rgrp->rg_useddi = be32_to_cpu(str->rg_useddi); + rgrp->rg_freedi = be32_to_cpu(str->rg_freedi); + gfs2_inum_in(&rgrp->rg_freedi_list, (char *)&str->rg_freedi_list); + rgrp->rg_usedmeta = be32_to_cpu(str->rg_usedmeta); + rgrp->rg_freemeta = be32_to_cpu(str->rg_freemeta); + + memcpy(rgrp->rg_reserved, str->rg_reserved, 64); +} + +/* ------------------------------------------------------------------------ */ +/* gfs_rgrp_out */ +/* ------------------------------------------------------------------------ */ +void gfs_rgrp_out(struct gfs_rgrp *rgrp, struct gfs2_buffer_head *rbh) +{ + struct gfs_rgrp *str = (struct gfs_rgrp *)rbh->b_data; + + gfs2_meta_header_out(&rgrp->rg_header, rbh->b_data); + str->rg_flags = cpu_to_be32(rgrp->rg_flags); + str->rg_free = cpu_to_be32(rgrp->rg_free); + str->rg_useddi = cpu_to_be32(rgrp->rg_useddi); + str->rg_freedi = cpu_to_be32(rgrp->rg_freedi); + gfs2_inum_out(&rgrp->rg_freedi_list, (char *)&str->rg_freedi_list); + str->rg_usedmeta = cpu_to_be32(rgrp->rg_usedmeta); + str->rg_freemeta = cpu_to_be32(rgrp->rg_freemeta); + + memcpy(str->rg_reserved, 
rgrp->rg_reserved, 64); + bmodified(rbh); +} diff --git a/gfs2/libgfs2/gfs2_disk_hash.c b/gfs2/libgfs2/gfs2_disk_hash.c new file mode 100644 index 0000000..959f5dd --- /dev/null +++ b/gfs2/libgfs2/gfs2_disk_hash.c @@ -0,0 +1,74 @@ +#include "clusterautoconfig.h" + +#include +#include "libgfs2.h" + +static const uint32_t crc_32_tab[] = +{ + 0x00000000, 0x77073096, 0xee0e612c, 0x990951ba, 0x076dc419, 0x706af48f, 0xe963a535, 0x9e6495a3, + 0x0edb8832, 0x79dcb8a4, 0xe0d5e91e, 0x97d2d988, 0x09b64c2b, 0x7eb17cbd, 0xe7b82d07, 0x90bf1d91, + 0x1db71064, 0x6ab020f2, 0xf3b97148, 0x84be41de, 0x1adad47d, 0x6ddde4eb, 0xf4d4b551, 0x83d385c7, + 0x136c9856, 0x646ba8c0, 0xfd62f97a, 0x8a65c9ec, 0x14015c4f, 0x63066cd9, 0xfa0f3d63, 0x8d080df5, + 0x3b6e20c8, 0x4c69105e, 0xd56041e4, 0xa2677172, 0x3c03e4d1, 0x4b04d447, 0xd20d85fd, 0xa50ab56b, + 0x35b5a8fa, 0x42b2986c, 0xdbbbc9d6, 0xacbcf940, 0x32d86ce3, 0x45df5c75, 0xdcd60dcf, 0xabd13d59, + 0x26d930ac, 0x51de003a, 0xc8d75180, 0xbfd06116, 0x21b4f4b5, 0x56b3c423, 0xcfba9599, 0xb8bda50f, + 0x2802b89e, 0x5f058808, 0xc60cd9b2, 0xb10be924, 0x2f6f7c87, 0x58684c11, 0xc1611dab, 0xb6662d3d, + 0x76dc4190, 0x01db7106, 0x98d220bc, 0xefd5102a, 0x71b18589, 0x06b6b51f, 0x9fbfe4a5, 0xe8b8d433, + 0x7807c9a2, 0x0f00f934, 0x9609a88e, 0xe10e9818, 0x7f6a0dbb, 0x086d3d2d, 0x91646c97, 0xe6635c01, + 0x6b6b51f4, 0x1c6c6162, 0x856530d8, 0xf262004e, 0x6c0695ed, 0x1b01a57b, 0x8208f4c1, 0xf50fc457, + 0x65b0d9c6, 0x12b7e950, 0x8bbeb8ea, 0xfcb9887c, 0x62dd1ddf, 0x15da2d49, 0x8cd37cf3, 0xfbd44c65, + 0x4db26158, 0x3ab551ce, 0xa3bc0074, 0xd4bb30e2, 0x4adfa541, 0x3dd895d7, 0xa4d1c46d, 0xd3d6f4fb, + 0x4369e96a, 0x346ed9fc, 0xad678846, 0xda60b8d0, 0x44042d73, 0x33031de5, 0xaa0a4c5f, 0xdd0d7cc9, + 0x5005713c, 0x270241aa, 0xbe0b1010, 0xc90c2086, 0x5768b525, 0x206f85b3, 0xb966d409, 0xce61e49f, + 0x5edef90e, 0x29d9c998, 0xb0d09822, 0xc7d7a8b4, 0x59b33d17, 0x2eb40d81, 0xb7bd5c3b, 0xc0ba6cad, + 0xedb88320, 0x9abfb3b6, 0x03b6e20c, 0x74b1d29a, 0xead54739, 0x9dd277af, 0x04db2615, 
0x73dc1683, + 0xe3630b12, 0x94643b84, 0x0d6d6a3e, 0x7a6a5aa8, 0xe40ecf0b, 0x9309ff9d, 0x0a00ae27, 0x7d079eb1, + 0xf00f9344, 0x8708a3d2, 0x1e01f268, 0x6906c2fe, 0xf762575d, 0x806567cb, 0x196c3671, 0x6e6b06e7, + 0xfed41b76, 0x89d32be0, 0x10da7a5a, 0x67dd4acc, 0xf9b9df6f, 0x8ebeeff9, 0x17b7be43, 0x60b08ed5, + 0xd6d6a3e8, 0xa1d1937e, 0x38d8c2c4, 0x4fdff252, 0xd1bb67f1, 0xa6bc5767, 0x3fb506dd, 0x48b2364b, + 0xd80d2bda, 0xaf0a1b4c, 0x36034af6, 0x41047a60, 0xdf60efc3, 0xa867df55, 0x316e8eef, 0x4669be79, + 0xcb61b38c, 0xbc66831a, 0x256fd2a0, 0x5268e236, 0xcc0c7795, 0xbb0b4703, 0x220216b9, 0x5505262f, + 0xc5ba3bbe, 0xb2bd0b28, 0x2bb45a92, 0x5cb36a04, 0xc2d7ffa7, 0xb5d0cf31, 0x2cd99e8b, 0x5bdeae1d, + 0x9b64c2b0, 0xec63f226, 0x756aa39c, 0x026d930a, 0x9c0906a9, 0xeb0e363f, 0x72076785, 0x05005713, + 0x95bf4a82, 0xe2b87a14, 0x7bb12bae, 0x0cb61b38, 0x92d28e9b, 0xe5d5be0d, 0x7cdcefb7, 0x0bdbdf21, + 0x86d3d2d4, 0xf1d4e242, 0x68ddb3f8, 0x1fda836e, 0x81be16cd, 0xf6b9265b, 0x6fb077e1, 0x18b74777, + 0x88085ae6, 0xff0f6a70, 0x66063bca, 0x11010b5c, 0x8f659eff, 0xf862ae69, 0x616bffd3, 0x166ccf45, + 0xa00ae278, 0xd70dd2ee, 0x4e048354, 0x3903b3c2, 0xa7672661, 0xd06016f7, 0x4969474d, 0x3e6e77db, + 0xaed16a4a, 0xd9d65adc, 0x40df0b66, 0x37d83bf0, 0xa9bcae53, 0xdebb9ec5, 0x47b2cf7f, 0x30b5ffe9, + 0xbdbdf21c, 0xcabac28a, 0x53b39330, 0x24b4a3a6, 0xbad03605, 0xcdd70693, 0x54de5729, 0x23d967bf, + 0xb3667a2e, 0xc4614ab8, 0x5d681b02, 0x2a6f2b94, 0xb40bbe37, 0xc30c8ea1, 0x5a05df1b, 0x2d02ef8d +}; + +/** + * gfs2_disk_hash - hash an array of data + * @data: the data to be hashed + * @len: the length of data to be hashed + * + * This function must produce the same results as the one in the kernel: + * crc32_le(0xFFFFFFFF, data, len) ^ 0xFFFFFFFF + * + * Take some data and convert it to a 32-bit hash. + * + * The hash function is a 32-bit CRC of the data. The algorithm uses + * the crc_32_tab table above. 
+ * + * This may not be the fastest hash function, but it does a fair bit better + * at providing uniform results than the others I've looked at. That's + * really important for efficient directories. + * + * Returns: the hash + */ + +uint32_t gfs2_disk_hash(const char *data, int len) +{ + uint32_t hash = 0xFFFFFFFF; + + for (; len--; data++) + hash = crc_32_tab[(hash ^ *data) & 0xFF] ^ (hash >> 8); + + hash = ~hash; + + return hash; +} + + diff --git a/gfs2/libgfs2/gfs2l.c b/gfs2/libgfs2/gfs2l.c new file mode 100644 index 0000000..a8aa5fa --- /dev/null +++ b/gfs2/libgfs2/gfs2l.c @@ -0,0 +1,194 @@ +#include +#include +#include +#include "libgfs2.h" + +static void usage(const char *cmd) +{ + printf("A language for modifying and querying a gfs2 file system.\n"); + printf("Usage: %s [options] \n", cmd); + printf("Available options:\n"); + printf(" -h Print this help message and exit\n"); + printf(" -f Path to script file or '-' for stdin (the default)\n"); + printf(" -T Print a list of gfs2 structure types and exit\n"); + printf(" -F Print a list of fields belonging to a type and exit\n"); +} + +struct cmdopts { + char *fspath; + FILE *src; + unsigned help:1; +}; + +static int metastrcmp(const void *a, const void *b) +{ + const struct lgfs2_metadata *m1 = *(struct lgfs2_metadata **)a; + const struct lgfs2_metadata *m2 = *(struct lgfs2_metadata **)b; + return strcmp(m1->name, m2->name); +} + +static void print_structs(void) +{ + const struct lgfs2_metadata *mlist[lgfs2_metadata_size]; + int i; + for (i = 0; i < lgfs2_metadata_size; i++) + mlist[i] = &lgfs2_metadata[i]; + + qsort(mlist, lgfs2_metadata_size, sizeof(struct lgfs2_metadata *), metastrcmp); + for (i = 0; i < lgfs2_metadata_size; i++) + if (mlist[i]->mh_type != GFS2_METATYPE_NONE) + printf("%s\n", mlist[i]->name); +} + +static void print_fields(const char *name) +{ + const struct lgfs2_metadata *m = lgfs2_find_mtype_name(name, LGFS2_MD_GFS1|LGFS2_MD_GFS2); + if (m != NULL) { + const struct lgfs2_metafield 
*fields = m->fields; + const unsigned nfields = m->nfields; + int i; + for (i = 0; i < nfields; i++) + printf("0x%.4x %s\n", fields[i].offset, fields[i].name); + } +} + +static int getopts(int argc, char *argv[], struct cmdopts *opts) +{ + int opt; + opts->src = stdin; + while ((opt = getopt(argc, argv, "F:f:hT")) != -1) { + switch (opt) { + case 'f': + if (strcmp("-", optarg)) { + opts->src = fopen(optarg, "r"); + if (opts->src == NULL) { + perror("Failed to open source file"); + return 1; + } + } + break; + case 'T': + print_structs(); + exit(0); + case 'F': + print_fields(optarg); + exit(0); + case 'h': + opts->help = 1; + return 0; + default: + fprintf(stderr, "Use -h for help\n"); + return 1; + } + } + + if (argc - optind != 1) { + usage(argv[0]); + fprintf(stderr, "Missing file system path. Use -h for help.\n"); + return 1; + } + + opts->fspath = strdup(argv[optind]); + if (opts->fspath == NULL) { + perror("getopts"); + return 1; + } + return 0; +} + +static int openfs(const char *path, struct gfs2_sbd *sdp) +{ + int fd; + int ret; + int sane; + uint64_t count; + + fd = open(path, O_RDWR); + if (fd < 0) { + fprintf(stderr, "Failed to open %s\n", path); + return 1; + } + + memset(sdp, 0, sizeof(*sdp)); + sdp->bsize = GFS2_BASIC_BLOCK; + sdp->device_fd = fd; + ret = compute_constants(sdp); + if (ret != 0) { + perror("Bad constants"); + return 1; + } + ret = lgfs2_get_dev_info(fd, &sdp->dinfo); + if (ret != 0) { + perror("Failed to gather device info"); + return 1; + } + fix_device_geometry(sdp); + + ret = read_sb(sdp); + if (ret != 0) { + perror("Could not read sb"); + return 1; + } + + sdp->master_dir = lgfs2_inode_read(sdp, sdp->sd_sb.sb_master_dir.no_addr); + gfs2_lookupi(sdp->master_dir, "rindex", 6, &sdp->md.riinode); + sdp->fssize = sdp->device.length; + if (sdp->md.riinode) { + rindex_read(sdp, 0, &count, &sane); + } else { + perror("Failed to look up rindex"); + return 1; + } + return 0; +} + +int main(int argc, char *argv[]) +{ + int ret; + struct 
cmdopts opts = {NULL, NULL}; + struct gfs2_sbd sbd; + struct lgfs2_lang_result *result; + struct lgfs2_lang_state *state; + + if (getopts(argc, argv, &opts)) { + exit(1); + } + + if (opts.help) { + usage(argv[0]); + exit(0); + } + + if (openfs(argv[optind], &sbd)) + exit(1); + + state = lgfs2_lang_init(); + if (state == NULL) { + perror("lgfs2_lang_init failed"); + exit(1); + } + + ret = lgfs2_lang_parsef(state, opts.src); + if (ret != 0) { + fprintf(stderr, "Parse failed\n"); + free(opts.fspath); + return ret; + } + + for (result = lgfs2_lang_result_next(state, &sbd); + result != NULL; + result = lgfs2_lang_result_next(state, &sbd)) { + lgfs2_lang_result_print(result); + lgfs2_lang_result_free(&result); + } + + gfs2_rgrp_free(&sbd.rgtree); + inode_put(&sbd.md.riinode); + inode_put(&sbd.master_dir); + lgfs2_lang_free(&state); + free(opts.fspath); + return 0; +} + +// libgfs2 still requires an external print_it function +void print_it(const char *label, const char *fmt, const char *fmt2, ...) { return; } diff --git a/gfs2/libgfs2/lang.c b/gfs2/libgfs2/lang.c new file mode 100644 index 0000000..62ad2ab --- /dev/null +++ b/gfs2/libgfs2/lang.c @@ -0,0 +1,618 @@ +#include +#include +#include +#include +#include +#include +#include +#include + +#include "lang.h" +#include "parser.h" +#ifdef GFS2_HAS_UUID +#include +#endif + +const char* ast_type_string[] = { + [AST_NONE] = "NONE", + // Statements + [AST_ST_SET] = "SET", + [AST_ST_GET] = "GET", + + // Expressions + [AST_EX_ID] = "IDENTIFIER", + [AST_EX_NUMBER] = "NUMBER", + [AST_EX_STRING] = "STRING", + [AST_EX_ADDRESS] = "ADDRESS", + [AST_EX_PATH] = "PATH", + [AST_EX_SUBSCRIPT] = "SUBSCRIPT", + [AST_EX_OFFSET] = "OFFSET", + [AST_EX_BLOCKSPEC] = "BLOCKSPEC", + [AST_EX_STRUCTSPEC] = "STRUCTSPEC", + [AST_EX_FIELDSPEC] = "FIELDSPEC", + [AST_EX_TYPESPEC] = "TYPESPEC", + + // Keywords + [AST_KW_STATE] = "STATE", +}; + +/** + * Initialize an expression node of the given type from a source string. 
+ * Currently just converts numerical values and string values where + * appropriate. String values are duplicted into newly allocated buffers as the + * text from the parser will go away. + * Returns 0 on success or non-zero with errno set on failure + */ +static int ast_expr_init(struct ast_node *expr, ast_node_t type, const char *str) +{ + int ret = 0; + switch (type) { + case AST_EX_OFFSET: + str++; // Cut off the + + case AST_EX_NUMBER: + ret = sscanf(str, "%"SCNi64, &expr->ast_num); + if (ret != 1) { + return 1; + } + break; + case AST_EX_ID: + case AST_EX_PATH: + case AST_EX_STRING: + expr->ast_str = strdup(str); + if (expr->ast_str == NULL) { + return 1; + } + break; + case AST_EX_ADDRESS: + case AST_EX_SUBSCRIPT: + case AST_EX_BLOCKSPEC: + case AST_EX_STRUCTSPEC: + case AST_EX_FIELDSPEC: + case AST_EX_TYPESPEC: + case AST_KW_STATE: + break; + default: + errno = EINVAL; + return 1; + } + return 0; +} + +/** + * Create a new AST node of a given type from a source string. + * Returns a pointer to the new node or NULL on failure with errno set. 
+ */ +struct ast_node *ast_new(ast_node_t type, const char *text) +{ + struct ast_node *node; + node = (struct ast_node *)calloc(1, sizeof(struct ast_node)); + if (node == NULL) { + goto return_fail; + } + + if (type > _AST_EX_START && ast_expr_init(node, type, text)) { + goto return_free; + } + + node->ast_text = strdup(text); + if (node->ast_text == NULL) { + goto return_free; + } + node->ast_type = type; + + return node; + +return_free: + if (node->ast_text) { + free(node->ast_text); + } + if (node->ast_str) { + free(node->ast_str); + } + free(node); +return_fail: + fprintf(stderr, "Failed to create new value from %s: %s\n", text, strerror(errno)); + return NULL; +} + +/** + * Free the memory allocated for an AST node and set its pointer to NULL + */ +void ast_destroy(struct ast_node **node) +{ + if (*node == NULL) { + return; + } + ast_destroy(&(*node)->ast_left); + ast_destroy(&(*node)->ast_right); + switch((*node)->ast_type) { + case AST_EX_ID: + case AST_EX_PATH: + case AST_EX_STRING: + free((*node)->ast_str); + break; + default: + break; + } + free((*node)->ast_text); + free(*node); + *node = NULL; +} + +static void ast_string_unescape(char *str) +{ + int head, tail; + for (head = tail = 0; str[head] != '\0'; head++, tail++) { + if (str[head] == '\\' && str[head+1] != '\0') + head++; + str[tail] = str[head]; + } + str[tail] = '\0'; +} + +static uint64_t ast_lookup_path(char *path, struct gfs2_sbd *sbd) +{ + int err = 0; + char *c = NULL; + struct gfs2_inode *ip, *iptmp; + char *segment; + uint64_t bn = 0; + + segment = strtok_r(path, "/", &c); + ip = lgfs2_inode_read(sbd, sbd->sd_sb.sb_root_dir.no_addr); + + while (ip != NULL) { + if (segment == NULL) { // No more segments + bn = ip->i_di.di_num.no_addr; + inode_put(&ip); + return bn; + } + ast_string_unescape(segment); + err = gfs2_lookupi(ip, segment, strlen(segment), &iptmp); + inode_put(&ip); + if (err != 0) { + errno = -err; + break; + } + ip = iptmp; + segment = strtok_r(NULL, "/", &c); + } + + return 
0; +} + +enum block_id { + ID_SB = 0, + ID_MASTER, + ID_ROOT, + ID_RINDEX, + + ID_END +}; + +/** + * Names of blocks which can be uniquely identified in the fs + */ +static const char *block_ids[] = { + [ID_SB] = "sb", + [ID_MASTER] = "master", + [ID_ROOT] = "root", + [ID_RINDEX] = "rindex", + + [ID_END] = NULL +}; + +static uint64_t ast_lookup_id(const char *id, struct gfs2_sbd *sbd) +{ + uint64_t bn = 0; + int i; + for (i = 0; i < ID_END; i++) { + if (!strcmp(id, block_ids[i])) { + break; + } + } + switch (i) { + case ID_SB: + bn = LGFS2_SB_ADDR(sbd); + break; + case ID_MASTER: + bn = sbd->sd_sb.sb_master_dir.no_addr; + break; + case ID_ROOT: + bn = sbd->sd_sb.sb_root_dir.no_addr; + break; + case ID_RINDEX: + bn = sbd->md.riinode->i_di.di_num.no_addr; + break; + default: + return 0; + } + return bn; +} + +static uint64_t ast_lookup_rgrp(uint64_t rgnum, struct gfs2_sbd *sbd) +{ + uint64_t i = rgnum; + struct osi_node *n; + + for (n = osi_first(&sbd->rgtree); n != NULL && i > 0; n = osi_next(n), i--); + if (n != NULL && i == 0) + return ((struct rgrp_tree *)n)->ri.ri_addr; + fprintf(stderr, "Resource group number out of range: %"PRIu64"\n", rgnum); + return 0; +} + +static uint64_t ast_lookup_subscript(struct ast_node *id, struct ast_node *index, + struct gfs2_sbd *sbd) +{ + uint64_t bn = 0; + const char *name = id->ast_str; + if (!strcmp(name, "rgrp")) { + bn = ast_lookup_rgrp(index->ast_num, sbd); + } else { + fprintf(stderr, "Unrecognized identifier %s\n", name); + } + return bn; +} + +/** + * Look up a block and return its number. The kind of lookup depends on the + * type of the ast node. 
+ */ +static uint64_t ast_lookup_block_num(struct ast_node *ast, struct gfs2_sbd *sbd) +{ + uint64_t bn = 0; + switch (ast->ast_type) { + case AST_EX_OFFSET: + bn = ast_lookup_block_num(ast->ast_left, sbd) + ast->ast_num; + break; + case AST_EX_ADDRESS: + if (gfs2_check_range(sbd, ast->ast_num)) + break; + bn = ast->ast_num; + break; + case AST_EX_PATH: + bn = ast_lookup_path(ast->ast_str, sbd); + break; + case AST_EX_ID: + bn = ast_lookup_id(ast->ast_str, sbd); + break; + case AST_EX_SUBSCRIPT: + bn = ast_lookup_subscript(ast->ast_left, ast->ast_left->ast_left, sbd); + break; + default: + break; + } + return bn; +} + +static struct gfs2_buffer_head *ast_lookup_block(struct ast_node *node, struct gfs2_sbd *sbd) +{ + uint64_t bn = ast_lookup_block_num(node, sbd); + if (bn == 0) { + fprintf(stderr, "Block not found: %s\n", node->ast_text); + return NULL; + } + + return bread(sbd, bn); +} + +static const char *bitstate_strings[] = { + [GFS2_BLKST_FREE] = "Free", + [GFS2_BLKST_USED] = "Used", + [GFS2_BLKST_UNLINKED] = "Unlinked", + [GFS2_BLKST_DINODE] = "Dinode" +}; + +/** + * Print a representation of an arbitrary field of an arbitrary GFS2 block to stdout + * Returns 0 if successful, 1 otherwise + */ +static int field_print(const struct gfs2_buffer_head *bh, const struct lgfs2_metadata *mtype, + const struct lgfs2_metafield *field) +{ + const char *fieldp = (char *)bh->iov.iov_base + field->offset; + + printf("%s\t%"PRIu64"\t%u\t%u\t%s\t", mtype->name, bh->b_blocknr, field->offset, field->length, field->name); + if (field->flags & LGFS2_MFF_UUID) { +#ifdef GFS2_HAS_UUID + char readable_uuid[36+1]; + uuid_t uuid; + + memcpy(uuid, fieldp, sizeof(uuid_t)); + uuid_unparse(uuid, readable_uuid); + printf("'%s'\n", readable_uuid); +#endif + } else if (field->flags & LGFS2_MFF_STRING) { + printf("'%s'\n", fieldp); + } else { + switch(field->length) { + case 1: + printf("%"PRIu8"\n", *(uint8_t *)fieldp); + break; + case 2: + printf("%"PRIu16"\n", be16_to_cpu(*(uint16_t 
*)fieldp)); + break; + case 4: + printf("%"PRIu32"\n", be32_to_cpu(*(uint32_t *)fieldp)); + break; + case 8: + printf("%"PRIu64"\n", be64_to_cpu(*(uint64_t *)fieldp)); + break; + default: + // "Reserved" field so just print 0 + printf("0\n"); + return 1; + } + } + return 0; +} + +/** + * Print a representation of an arbitrary GFS2 block to stdout + */ +int lgfs2_lang_result_print(struct lgfs2_lang_result *result) +{ + int i; + if (result->lr_mtype != NULL) { + for (i = 0; i < result->lr_mtype->nfields; i++) { + field_print(result->lr_bh, result->lr_mtype, &result->lr_mtype->fields[i]); + } + } else { + printf("%"PRIu64": %s\n", result->lr_blocknr, bitstate_strings[result->lr_state]); + } + return 0; +} + +static int ast_get_bitstate(uint64_t bn, struct gfs2_sbd *sbd) +{ + int ret = 0; + int state = 0; + struct rgrp_tree *rgd = gfs2_blk2rgrpd(sbd, bn); + if (rgd == NULL) { + fprintf(stderr, "Could not find resource group for block %"PRIu64"\n", bn); + return -1; + } + + ret = gfs2_rgrp_read(sbd, rgd); + if (ret != 0) { + fprintf(stderr, "Failed to read resource group for block %"PRIu64": %d\n", bn, ret); + return -1; + } + + state = lgfs2_get_bitmap(sbd, bn, rgd); + if (state == -1) { + fprintf(stderr, "Failed to acquire bitmap state for block %"PRIu64"\n", bn); + return -1; + } + + gfs2_rgrp_relse(rgd); + return state; +} + +static const struct lgfs2_metadata *ast_lookup_mtype(const struct gfs2_buffer_head *bh) +{ + const struct lgfs2_metadata *mtype; + const uint32_t mh_type = lgfs2_get_block_type(bh); + if (mh_type == 0) { + fprintf(stderr, "Could not determine type for block %"PRIu64"\n", bh->b_blocknr); + return NULL; + } + + mtype = lgfs2_find_mtype(mh_type, bh->sdp->gfs1 ? LGFS2_MD_GFS1 : LGFS2_MD_GFS2); + if (mtype == NULL) { + fprintf(stderr, "Could not determine meta type for block %"PRIu64"\n", bh->b_blocknr); + return NULL; + } + return mtype; +} + +/** + * Interpret the get statement. 
+ */ +static struct lgfs2_lang_result *ast_interp_get(struct lgfs2_lang_state *state, + struct ast_node *ast, struct gfs2_sbd *sbd) +{ + struct lgfs2_lang_result *result = calloc(1, sizeof(struct lgfs2_lang_result)); + if (result == NULL) { + fprintf(stderr, "Failed to allocate memory for result\n"); + return NULL; + } + + if (ast->ast_right->ast_right == NULL) { + result->lr_bh = ast_lookup_block(ast->ast_right, sbd); + if (result->lr_bh == NULL) { + free(result); + return NULL; + } + result->lr_blocknr = result->lr_bh->b_blocknr; + result->lr_mtype = ast_lookup_mtype(result->lr_bh); + + } else if (ast->ast_right->ast_right->ast_type == AST_KW_STATE) { + result->lr_blocknr = ast_lookup_block_num(ast->ast_right, sbd); + if (result->lr_blocknr == 0) { + free(result); + return NULL; + } + result->lr_state = ast_get_bitstate(result->lr_blocknr, sbd); + } + + return result; +} + +/** + * Set a field of a gfs2 block of a given type to a given value. + * Returns AST_INTERP_* to signal success, an invalid field/value or an error. 
+ */ +static int ast_field_set(struct gfs2_buffer_head *bh, const struct lgfs2_metafield *field, + struct ast_node *val) +{ + int err = 0; + + if (field->flags & LGFS2_MFF_UUID) { +#ifdef GFS2_HAS_UUID + uuid_t uuid; + + if (uuid_parse(val->ast_str, uuid) != 0) { + fprintf(stderr, "Invalid UUID\n"); + return AST_INTERP_INVAL; + } + err = lgfs2_field_assign(bh->b_data, field, uuid); +#else + fprintf(stderr, "No UUID support\n"); + err = 1; +#endif + } else if (field->flags & LGFS2_MFF_STRING) { + err = lgfs2_field_assign(bh->b_data, field, val->ast_str); + } else { + err = lgfs2_field_assign(bh->b_data, field, &val->ast_num); + } + + if (err) { + fprintf(stderr, "Invalid field assignment: %s (size %d) = %s\n", + field->name, field->length, val->ast_text); + return AST_INTERP_INVAL; + } + + bmodified(bh); + return AST_INTERP_SUCCESS; +} + +static const struct lgfs2_metadata *lang_find_mtype(struct ast_node *node, struct gfs2_buffer_head *bh, unsigned ver) +{ + const struct lgfs2_metadata *mtype = NULL; + + if (node->ast_type == AST_EX_TYPESPEC) { + mtype = lgfs2_find_mtype_name(node->ast_str, ver); + if (mtype == NULL) + fprintf(stderr, "Invalid block type: %s\n", node->ast_text); + } else { + mtype = lgfs2_find_mtype(lgfs2_get_block_type(bh), ver); + if (mtype == NULL) + fprintf(stderr, "Unrecognised block at: %s\n", node->ast_text); + } + + return mtype; +} + +/** + * Interpret an assignment (set) + */ +static struct lgfs2_lang_result *ast_interp_set(struct lgfs2_lang_state *state, + struct ast_node *ast, struct gfs2_sbd *sbd) +{ + struct ast_node *lookup = ast->ast_right; + struct ast_node *fieldspec; + struct ast_node *fieldname; + struct ast_node *fieldval; + int ret = 0; + unsigned ver = sbd->gfs1 ? 
LGFS2_MD_GFS1 : LGFS2_MD_GFS2; + + struct lgfs2_lang_result *result = calloc(1, sizeof(struct lgfs2_lang_result)); + if (result == NULL) { + fprintf(stderr, "Failed to allocate memory for result\n"); + return NULL; + } + + result->lr_bh = ast_lookup_block(lookup, sbd); + if (result->lr_bh == NULL) { + goto out_err; + } + + result->lr_mtype = lang_find_mtype(lookup->ast_right, result->lr_bh, ver); + if (result->lr_mtype == NULL) { + fprintf(stderr, "Unrecognised block at: %s\n", lookup->ast_str); + goto out_err; + } + + if (lookup->ast_right->ast_type == AST_EX_TYPESPEC) { + struct gfs2_meta_header mh = { + .mh_magic = GFS2_MAGIC, + .mh_type = result->lr_mtype->mh_type, + .mh_format = result->lr_mtype->mh_format, + }; + gfs2_meta_header_out(&mh, result->lr_bh->iov.iov_base); + lookup = lookup->ast_right; + } + + for (fieldspec = lookup->ast_right; + fieldspec != NULL && fieldspec->ast_type == AST_EX_FIELDSPEC; + fieldspec = fieldspec->ast_left) { + const struct lgfs2_metafield *mfield; + + fieldname = fieldspec->ast_right; + fieldval = fieldname->ast_right; + + mfield = lgfs2_find_mfield_name(fieldname->ast_str, result->lr_mtype); + if (mfield == NULL) { + fprintf(stderr, "No field '%s' found in '%s'\n", + fieldname->ast_str, result->lr_mtype->name); + goto out_err; + } + + ret = ast_field_set(result->lr_bh, mfield, fieldval); + if (ret != AST_INTERP_SUCCESS) { + goto out_err; + } + } + + ret = bwrite(result->lr_bh); + if (ret != 0) { + fprintf(stderr, "Failed to write modified block %"PRIu64": %s\n", + result->lr_bh->b_blocknr, strerror(errno)); + goto out_err; + } + + return result; + +out_err: + lgfs2_lang_result_free(&result); + return NULL; +} + +static struct lgfs2_lang_result *ast_interpret_node(struct lgfs2_lang_state *state, + struct ast_node *ast, struct gfs2_sbd *sbd) +{ + struct lgfs2_lang_result *result = NULL; + + if (ast->ast_type == AST_ST_SET) { + result = ast_interp_set(state, ast, sbd); + } else if (ast->ast_type == AST_ST_GET) { + result = 
ast_interp_get(state, ast, sbd); + } else { + fprintf(stderr, "Invalid AST node type: %d\n", ast->ast_type); + } + return result; +} + +struct lgfs2_lang_result *lgfs2_lang_result_next(struct lgfs2_lang_state *state, + struct gfs2_sbd *sbd) +{ + struct lgfs2_lang_result *result; + if (state->ls_interp_curr == NULL) { + return NULL; + } + result = ast_interpret_node(state, state->ls_interp_curr, sbd); + if (result == NULL) { + return NULL; + } + state->ls_interp_curr = state->ls_interp_curr->ast_left; + return result; +} + +void lgfs2_lang_result_free(struct lgfs2_lang_result **result) +{ + if (*result == NULL) { + fprintf(stderr, "Warning: attempted to free a null result\n"); + return; + } + + if ((*result)->lr_mtype != NULL) { + (*result)->lr_bh->b_modified = 0; + brelse((*result)->lr_bh); + (*result)->lr_bh = NULL; + } + + free(*result); + *result = NULL; +} diff --git a/gfs2/libgfs2/lang.h b/gfs2/libgfs2/lang.h new file mode 100644 index 0000000..7d9a6e9 --- /dev/null +++ b/gfs2/libgfs2/lang.h @@ -0,0 +1,62 @@ +#ifndef LANG_H +#define LANG_H +#include +#include "libgfs2.h" + +struct lgfs2_lang_state { + int ls_colnum; + int ls_linenum; + int ls_errnum; + struct ast_node *ls_ast_root; + struct ast_node *ls_ast_tail; + struct ast_node *ls_interp_curr; +}; + +typedef enum { + AST_NONE, + // Statements + AST_ST_SET, + AST_ST_GET, + + _AST_EX_START, + // Expressions + AST_EX_ID, + AST_EX_NUMBER, + AST_EX_STRING, + AST_EX_ADDRESS, + AST_EX_PATH, + AST_EX_SUBSCRIPT, + AST_EX_OFFSET, + AST_EX_BLOCKSPEC, + AST_EX_STRUCTSPEC, + AST_EX_FIELDSPEC, + AST_EX_TYPESPEC, + + // Keywords + AST_KW_STATE, +} ast_node_t; + +enum { + AST_INTERP_SUCCESS = 0, // Success + AST_INTERP_FAIL = 1, // Failure + AST_INTERP_INVAL = 2, // Invalid field/type mismatch + AST_INTERP_ERR = 3, // Something went wrong, see errno +}; + +extern const char* ast_type_string[]; + +struct ast_node { + ast_node_t ast_type; + struct ast_node *ast_left; + struct ast_node *ast_right; + char *ast_text; + char 
*ast_str; + uint64_t ast_num; +}; + +extern struct ast_node *ast_new(ast_node_t type, const char *text); +extern void ast_destroy(struct ast_node **val); + +#define YYSTYPE struct ast_node * + +#endif /* LANG_H */ diff --git a/gfs2/libgfs2/lexer.l b/gfs2/libgfs2/lexer.l new file mode 100644 index 0000000..04b3883 --- /dev/null +++ b/gfs2/libgfs2/lexer.l @@ -0,0 +1,108 @@ +%{ +#include "lang.h" +#include "parser.h" + +#define EXTRA ((struct lgfs2_lang_state *)yyextra) + +#define P(token, type, text) do {\ + *(yylval) = ast_new(type, text);\ + if (*(yylval) == NULL) {\ + EXTRA->ls_errnum = errno;\ + return 1;\ + }\ + return (TOK_##token);\ +} while(0) + +#define COLNUM EXTRA->ls_colnum +#define YY_USER_ACTION COLNUM += yyleng; + +%} +%option bison-bridge reentrant +%option warn debug +%option nounput noinput +%option noyywrap +%option extra-type="struct lgfs2_lang_state *" + +letter [a-zA-Z_] +decdigit [0-9] +decnumber -?{decdigit}+ +hexdigit [0-9a-fA-F] +hexnumber -?0x{hexdigit}+ +number ({decnumber}|{hexnumber}) +offset \+{number} +id {letter}({letter}|{decdigit}|\.)* +string \'([^\']|\\\')*\' +path \'\/([^\']|\\\')*\' +ccomment \/\/.*\n +shcomment \#.*\n +comment ({ccomment}|{shcomment}) +whitespace [ \t\r]+ + +%% + +\{ { + return TOK_LBRACE; + } +\} { + return TOK_RBRACE; + } +\[ { + return TOK_LBRACKET; + } +\] { + P(RBRACKET, AST_EX_SUBSCRIPT, "[ ]"); + } +\, { + return TOK_COMMA; + } +\: { + P(COLON, AST_EX_FIELDSPEC, yytext); + } +\; { + return TOK_SEMI; + } +set { + P(SET, AST_ST_SET, yytext); + } +get { + P(GET, AST_ST_GET, yytext); + } +state { + P(STATE, AST_KW_STATE, yytext); + } +{path} { + yytext[yyleng-1] = '\0'; + P(PATH, AST_EX_PATH, yytext + 1); + } +{string} { + yytext[yyleng-1] = '\0'; + P(STRING, AST_EX_STRING, yytext + 1); + } +{offset} { + P(OFFSET, AST_EX_OFFSET, yytext); + } +{number} { + P(NUMBER, AST_EX_NUMBER, yytext); + } +{id} { + P(ID, AST_EX_ID, yytext); + } +{comment} { + COLNUM = 0; + EXTRA->ls_linenum++; + } +<> { + return 0; + } 
+\n { + COLNUM = 0; + EXTRA->ls_linenum++; + } +{whitespace} ; +. { + printf("Unexpected character '%s' on line %d column %d\n", + yytext, yylineno, COLNUM); + return 1; + } + +%% diff --git a/gfs2/libgfs2/libgfs2.h b/gfs2/libgfs2/libgfs2.h new file mode 100644 index 0000000..05e4512 --- /dev/null +++ b/gfs2/libgfs2/libgfs2.h @@ -0,0 +1,791 @@ +#ifndef __LIBGFS2_DOT_H__ +#define __LIBGFS2_DOT_H__ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include "osi_list.h" +#include "osi_tree.h" + +__BEGIN_DECLS + +#ifndef TRUE +#define TRUE (1) +#endif + +#ifndef FALSE +#define FALSE (0) +#endif + +#if __BYTE_ORDER == __BIG_ENDIAN + +#define be16_to_cpu(x) (x) +#define be32_to_cpu(x) (x) +#define be64_to_cpu(x) (x) + +#define cpu_to_be16(x) (x) +#define cpu_to_be32(x) (x) +#define cpu_to_be64(x) (x) + +#define le16_to_cpu(x) (bswap_16((x))) +#define le32_to_cpu(x) (bswap_32((x))) +#define le64_to_cpu(x) (bswap_64((x))) + +#define cpu_to_le16(x) (bswap_16((x))) +#define cpu_to_le32(x) (bswap_32((x))) +#define cpu_to_le64(x) (bswap_64((x))) + +#endif /* __BYTE_ORDER == __BIG_ENDIAN */ + + +#if __BYTE_ORDER == __LITTLE_ENDIAN + +#define be16_to_cpu(x) (bswap_16((x))) +#define be32_to_cpu(x) (bswap_32((x))) +#define be64_to_cpu(x) (bswap_64((x))) + +#define cpu_to_be16(x) (bswap_16((x))) +#define cpu_to_be32(x) (bswap_32((x))) +#define cpu_to_be64(x) (bswap_64((x))) + +#define le16_to_cpu(x) (x) +#define le32_to_cpu(x) (x) +#define le64_to_cpu(x) (x) + +#define cpu_to_le16(x) (x) +#define cpu_to_le32(x) (x) +#define cpu_to_le64(x) (x) + +#endif /* __BYTE_ORDER == __LITTLE_ENDIAN */ + +enum lgfs2_meta_type { + LGFS2_MT_GFS2_SB = 0, + LGFS2_MT_GFS_SB = 1, + LGFS2_MT_RINDEX = 2, + LGFS2_MT_GFS2_RGRP = 3, + LGFS2_MT_GFS_RGRP = 4, + LGFS2_MT_RGRP_BITMAP = 5, + LGFS2_MT_GFS2_DINODE = 6, + LGFS2_MT_GFS_DINODE = 7, + LGFS2_MT_GFS2_INDIRECT = 8, + LGFS2_MT_GFS_INDIRECT = 9, + LGFS2_MT_DIR_LEAF = 10, 
+ LGFS2_MT_JRNL_DATA = 11, + LGFS2_MT_GFS2_LOG_HEADER = 12, + LGFS2_MT_GFS_LOG_HEADER = 13, + LGFS2_MT_GFS2_LOG_DESC = 14, + LGFS2_MT_GFS_LOG_DESC = 15, + LGFS2_MT_GFS2_LOG_BLOCK = 16, + LGFS2_MT_EA_ATTR = 17, + LGFS2_MT_EA_DATA = 18, + LGFS2_MT_GFS2_QUOTA_CHANGE = 19, + LGFS2_MT_DIRENT = 20, + LGFS2_MT_EA_HEADER = 21, + LGFS2_MT_GFS2_INUM_RANGE = 22, + LGFS2_MT_STATFS_CHANGE = 23, + LGFS2_MT_GFS_JINDEX = 24, + LGFS2_MT_GFS_BLOCK_TAG = 25, + LGFS2_MT_DATA = 26, + LGFS2_MT_FREE = 27, + + LGFS2_MT_NR, +}; + +struct lgfs2_symbolic { + const uint32_t key; + const char *value; +}; + +struct lgfs2_metafield { + const char *name; + const unsigned offset; + const unsigned length; + const unsigned flags; + +#define LGFS2_MFF_RESERVED 0x00001 /* Field is reserved */ +#define LGFS2_MFF_POINTER 0x00002 /* Field is a pointer to a block */ +#define LGFS2_MFF_ENUM 0x00004 /* Field is an enum */ +#define LGFS2_MFF_MASK 0x00008 /* Field is a bitmask */ +#define LGFS2_MFF_UUID 0x00010 /* Field is a UUID */ +#define LGFS2_MFF_STRING 0x00020 /* Field in an ASCII string */ +#define LGFS2_MFF_UID 0x00040 /* Field is a UID */ +#define LGFS2_MFF_GID 0x00080 /* Field is a GID */ +#define LGFS2_MFF_MODE 0x00100 /* Field is a file mode */ +#define LGFS2_MFF_FSBLOCKS 0x00200 /* Units are fs blocks */ +#define LGFS2_MFF_BYTES 0x00400 /* Units are bytes */ +#define LGFS2_MFF_SHIFT 0x00800 /* Log_{2} quantity */ +#define LGFS2_MFF_CHECK 0x01000 /* Field is a checksum */ +#define LGFS2_MFF_SECS 0x02000 /* Units are seconds */ +#define LGFS2_MFF_NSECS 0x04000 /* Units are nsecs */ +#define LGFS2_MFF_MAJOR 0x08000 /* Major device number */ +#define LGFS2_MFF_MINOR 0x10000 /* Minor device number */ + + /* If it is a pointer, then this field must be set */ + const unsigned points_to; + /* If isenum or ismask are set, these must also be filled in */ + const struct lgfs2_symbolic *symtab; + const unsigned nsyms; +}; + +struct lgfs2_metadata { + const unsigned versions:2; +#define LGFS2_MD_GFS1 0x01 
+#define LGFS2_MD_GFS2 0x02 + const unsigned header:1; + const uint32_t mh_type; + const uint32_t mh_format; + const char *name; + const struct lgfs2_metafield *fields; + const unsigned nfields; + const unsigned size; +}; + +struct lgfs2_dev_info { + struct stat stat; + unsigned readonly:1; + long ra_pages; + int soft_block_size; + int logical_block_size; + unsigned int physical_block_size; + unsigned int io_min_size; + unsigned int io_optimal_size; + int io_align_offset; + uint64_t size; +}; + +struct device { + uint64_t length; +}; + +struct gfs2_bitmap +{ + struct gfs2_buffer_head *bi_bh; + uint32_t bi_offset; /* The offset in the buffer of the first byte */ + uint32_t bi_start; /* The position of the first byte in this block */ + uint32_t bi_len; /* The number of bytes in this block */ +}; + +struct gfs2_sbd; +struct gfs2_inode; +typedef struct _lgfs2_rgrps *lgfs2_rgrps_t; + +struct rgrp_tree { + struct osi_node node; + uint64_t start; /* The offset of the beginning of this resource group */ + uint64_t length; /* The length of this resource group */ + + struct gfs2_rindex ri; + struct gfs2_rgrp rg; + struct gfs2_bitmap *bits; + lgfs2_rgrps_t rgrps; +}; + +typedef struct rgrp_tree *lgfs2_rgrp_t; + +extern lgfs2_rgrps_t lgfs2_rgrps_init(struct gfs2_sbd *sdp, uint64_t align, uint64_t offset); +extern void lgfs2_rgrps_free(lgfs2_rgrps_t *rgs); +extern uint64_t lgfs2_rindex_entry_new(lgfs2_rgrps_t rgs, struct gfs2_rindex *entry, uint64_t addr, uint32_t len); +extern unsigned lgfs2_rindex_read_fd(int fd, lgfs2_rgrps_t rgs); +extern const struct gfs2_rindex *lgfs2_rindex_read_one(struct gfs2_inode *rip, lgfs2_rgrps_t rgs, unsigned i); +extern uint64_t lgfs2_rgrp_align_addr(const lgfs2_rgrps_t rgs, uint64_t addr); +extern uint32_t lgfs2_rgrp_align_len(const lgfs2_rgrps_t rgs, uint32_t len); +extern unsigned lgfs2_rgsize_for_data(uint64_t blksreq, unsigned bsize); +extern uint32_t lgfs2_rgrps_plan(const lgfs2_rgrps_t rgs, uint64_t space, uint32_t tgtsize); +extern 
lgfs2_rgrp_t lgfs2_rgrps_append(lgfs2_rgrps_t rgs, struct gfs2_rindex *entry, uint32_t rg_skip); +extern int lgfs2_rgrp_bitbuf_alloc(lgfs2_rgrp_t rg); +extern void lgfs2_rgrp_bitbuf_free(lgfs2_rgrp_t rg); +extern int lgfs2_rgrp_write(int fd, lgfs2_rgrp_t rg); +extern const struct gfs2_rindex *lgfs2_rgrp_index(lgfs2_rgrp_t rg); +extern const struct gfs2_rgrp *lgfs2_rgrp_rgrp(lgfs2_rgrp_t rg); +extern lgfs2_rgrp_t lgfs2_rgrp_first(lgfs2_rgrps_t rgs); +extern lgfs2_rgrp_t lgfs2_rgrp_last(lgfs2_rgrps_t rgs); +extern lgfs2_rgrp_t lgfs2_rgrp_next(lgfs2_rgrp_t rg); +extern lgfs2_rgrp_t lgfs2_rgrp_prev(lgfs2_rgrp_t rg); +// Temporary function to aid API migration +extern struct osi_node *lgfs2_rgrps_root(lgfs2_rgrps_t rgs) __attribute__((deprecated)); + +struct gfs2_buffer_head { + osi_list_t b_altlist; /* alternate list */ + uint64_t b_blocknr; + union { + char *b_data; + struct iovec iov; + }; + struct gfs2_sbd *sdp; + int b_modified; +}; + +struct special_blocks { + osi_list_t list; + uint64_t block; +}; + +struct gfs2_inode { + struct gfs2_dinode i_di; + struct gfs2_buffer_head *i_bh; + struct gfs2_sbd *i_sbd; + struct rgrp_tree *i_rgd; /* performance hint */ + int bh_owned; /* Is this bh owned, iow, should we release it later? 
*/ +}; + +struct master_dir +{ + struct gfs2_inode *inum; + uint64_t next_inum; + struct gfs2_inode *statfs; + struct gfs2_inode *qinode; + + struct gfs2_inode *jiinode; + struct gfs2_inode *riinode; + struct gfs2_inode *rooti; + struct gfs2_inode *pinode; + + struct gfs2_inode **journal; /* Array of journals */ + uint32_t journals; /* Journal count */ +}; + +#define LGFS2_SB_ADDR(sdp) (GFS2_SB_ADDR >> (sdp)->sd_fsb2bb_shift) +struct gfs2_sbd { + struct gfs2_sb sd_sb; /* a copy of the ondisk structure */ + + unsigned int bsize; /* The block size of the FS (in bytes) */ + unsigned int jsize; /* Size of journals (in MB) */ + unsigned int rgsize; /* Size of resource groups (in MB) */ + unsigned int qcsize; /* Size of quota change files (in MB) */ + + /* Constants */ + + uint32_t sd_fsb2bb; + uint32_t sd_fsb2bb_shift; + uint32_t sd_diptrs; + uint32_t sd_inptrs; + uint32_t sd_jbsize; + uint32_t sd_hash_bsize; + uint32_t sd_hash_bsize_shift; + uint32_t sd_hash_ptrs; + uint32_t sd_blocks_per_bitmap; + uint32_t sd_max_dirres; + uint32_t sd_max_height; + uint32_t sd_max_jheight; + uint64_t sd_heightsize[GFS2_MAX_META_HEIGHT]; + uint64_t sd_jheightsize[GFS2_MAX_META_HEIGHT]; + + /* Not specified on the command line, but... 
*/ + + int64_t time; + + struct lgfs2_dev_info dinfo; + struct device device; + + int device_fd; + int path_fd; + + uint64_t fssize; + uint64_t blks_total; + uint64_t blks_alloced; + uint64_t dinodes_alloced; + + uint64_t orig_rgrps; + uint64_t rgrps; + uint64_t new_rgrps; + struct osi_root rgtree; + struct osi_root rgcalc; + + struct gfs2_inode *master_dir; + struct master_dir md; + + uint64_t rg_one_length; + uint64_t rg_length; + int gfs1; +}; + +struct metapath { + unsigned int mp_list[GFS2_MAX_META_HEIGHT]; +}; + + +#define GFS2_DEFAULT_BSIZE (4096) +#define GFS2_DEFAULT_JSIZE (128) +#define GFS2_MAX_JSIZE (1024) +#define GFS2_MIN_JSIZE (8) +#define GFS2_DEFAULT_RGSIZE (256) +#define GFS2_DEFAULT_UTSIZE (1) +#define GFS2_DEFAULT_QCSIZE (1) +#define GFS2_DEFAULT_LOCKPROTO "lock_dlm" +#define GFS2_MIN_GROW_SIZE (10) +#define GFS2_EXCESSIVE_RGS (10000) + +#define GFS2_MIN_RGSIZE (32) +#define GFS2_MAX_RGSIZE (2048) + +/* meta.c */ +extern const struct lgfs2_metadata lgfs2_metadata[]; +extern const unsigned lgfs2_metadata_size; +extern const struct lgfs2_symbolic lgfs2_metatypes[]; +extern const unsigned lgfs2_metatype_size; +extern const struct lgfs2_symbolic lgfs2_metaformats[]; +extern const unsigned lgfs2_metaformat_size; +extern const struct lgfs2_symbolic lgfs2_di_flags[]; +extern const unsigned lgfs2_di_flag_size; +extern const struct lgfs2_symbolic lgfs2_lh_flags[]; +extern const unsigned lgfs2_lh_flag_size; +extern const struct lgfs2_symbolic lgfs2_ld_types[]; +extern const unsigned lgfs2_ld_type_size; +extern const struct lgfs2_symbolic lgfs2_ld1_types[]; +extern const unsigned lgfs2_ld1_type_size; +extern int lgfs2_selfcheck(void); +extern const struct lgfs2_metadata *lgfs2_find_mtype(uint32_t mh_type, const unsigned versions); +extern const struct lgfs2_metadata *lgfs2_find_mtype_name(const char *name, const unsigned versions); +extern const struct lgfs2_metafield *lgfs2_find_mfield_name(const char *name, const struct lgfs2_metadata *mtype); +extern 
int lgfs2_field_str(char *str, const size_t size, const char *blk, const struct lgfs2_metafield *field, int hex); +extern int lgfs2_field_assign(char *blk, const struct lgfs2_metafield *field, const void *val); + +/* block_list.c */ + +extern struct special_blocks *blockfind(struct special_blocks *blist, uint64_t num); +extern void gfs2_special_add(struct special_blocks *blocklist, uint64_t block); +extern void gfs2_special_set(struct special_blocks *blocklist, uint64_t block); +extern void gfs2_special_free(struct special_blocks *blist); +extern void gfs2_special_clear(struct special_blocks *blocklist, + uint64_t block); + +/* buf.c */ +extern struct gfs2_buffer_head *bget(struct gfs2_sbd *sdp, uint64_t num); +extern struct gfs2_buffer_head *__bread(struct gfs2_sbd *sdp, uint64_t num, + int line, const char *caller); +extern int __breadm(struct gfs2_sbd *sdp, struct gfs2_buffer_head **bhs, size_t n, uint64_t block, int line, const char *caller); +extern int bwrite(struct gfs2_buffer_head *bh); +extern int brelse(struct gfs2_buffer_head *bh); +extern uint32_t lgfs2_get_block_type(const struct gfs2_buffer_head *lbh); + +#define bmodified(bh) do { (bh)->b_modified = 1; } while(0) /* sets b_modified on the buffer head; the argument is parenthesized so pointer expressions such as *bhp expand correctly */ + +#define bread(bl, num) __bread(bl, num, __LINE__, __FUNCTION__) +#define breadm(bl, bhs, n, block) __breadm(bl, bhs, n, block, __LINE__, __FUNCTION__) + +/* config.c */ +extern void lgfs2_set_debug(int enable); + +/* device_geometry.c */ +extern int lgfs2_get_dev_info(int fd, struct lgfs2_dev_info *i); +extern void fix_device_geometry(struct gfs2_sbd *sdp); + +/* fs_bits.c */ +#define BFITNOENT (0xFFFFFFFF) + +/* functions with blk #'s that are buffer relative */ +extern unsigned long gfs2_bitfit(const unsigned char *buffer, + const unsigned int buflen, + unsigned long goal, unsigned char old_state); + +/* functions with blk #'s that are rgrp relative */ +extern uint32_t gfs2_blkalloc_internal(struct rgrp_tree *rgd, uint32_t goal, + unsigned char old_state, + unsigned char new_state, int 
do_it); +extern int gfs2_check_range(struct gfs2_sbd *sdp, uint64_t blkno); + +/* functions with blk #'s that are file system relative */ +extern int lgfs2_get_bitmap(struct gfs2_sbd *sdp, uint64_t blkno, struct rgrp_tree *rgd); +extern int gfs2_set_bitmap(lgfs2_rgrp_t rg, uint64_t blkno, int state); + +/* fs_geometry.c */ +extern uint32_t rgblocks2bitblocks(const unsigned int bsize, const uint32_t rgblocks, + uint32_t *ri_data) __attribute__((nonnull(3))); +extern int build_rgrps(struct gfs2_sbd *sdp, int write); + +/* fs_ops.c */ +#define IS_LEAF (1) +#define IS_DINODE (2) + +extern void find_metapath(struct gfs2_inode *ip, uint64_t block, struct metapath *mp); +extern void lookup_block(struct gfs2_inode *ip, struct gfs2_buffer_head *bh, + unsigned int height, struct metapath *mp, + int create, int *new, uint64_t *block); +extern struct gfs2_inode *lgfs2_inode_get(struct gfs2_sbd *sdp, + struct gfs2_buffer_head *bh); +extern struct gfs2_inode *lgfs2_inode_read(struct gfs2_sbd *sdp, uint64_t di_addr); +extern struct gfs2_inode *is_system_inode(struct gfs2_sbd *sdp, + uint64_t block); +extern void inode_put(struct gfs2_inode **ip); +extern uint64_t data_alloc(struct gfs2_inode *ip); +extern int lgfs2_meta_alloc(struct gfs2_inode *ip, uint64_t *blkno); +extern int lgfs2_dinode_alloc(struct gfs2_sbd *sdp, const uint64_t blksreq, uint64_t *blkno); +extern uint64_t lgfs2_space_for_data(const struct gfs2_sbd *sdp, unsigned bsize, uint64_t bytes); +extern int lgfs2_file_alloc(lgfs2_rgrp_t rg, uint64_t di_size, struct gfs2_inode *ip, uint32_t flags, unsigned mode); + +extern int gfs2_readi(struct gfs2_inode *ip, void *buf, uint64_t offset, + unsigned int size); +#define gfs2_writei(ip, buf, offset, size) \ + __gfs2_writei(ip, buf, offset, size, 1) +extern int __gfs2_writei(struct gfs2_inode *ip, void *buf, uint64_t offset, + unsigned int size, int resize); +extern struct gfs2_buffer_head *get_file_buf(struct gfs2_inode *ip, + uint64_t lbn, int prealloc); +extern int 
init_dinode(struct gfs2_sbd *sdp, struct gfs2_buffer_head **bhp, struct gfs2_inum *inum, + unsigned int mode, uint32_t flags, struct gfs2_inum *parent); +extern struct gfs2_inode *createi(struct gfs2_inode *dip, const char *filename, + unsigned int mode, uint32_t flags); +extern struct gfs2_inode *gfs_createi(struct gfs2_inode *dip, + const char *filename, unsigned int mode, + uint32_t flags); +extern void dirent2_del(struct gfs2_inode *dip, struct gfs2_buffer_head *bh, + struct gfs2_dirent *prev, struct gfs2_dirent *cur); +extern int dir_search(struct gfs2_inode *dip, const char *filename, int len, + unsigned int *type, struct gfs2_inum *inum); +extern int gfs2_lookupi(struct gfs2_inode *dip, const char *filename, int len, + struct gfs2_inode **ipp); +extern int dir_add(struct gfs2_inode *dip, const char *filename, int len, + struct gfs2_inum *inum, unsigned int type); +extern int gfs2_dirent_del(struct gfs2_inode *dip, const char *filename, + int filename_len); +extern void block_map(struct gfs2_inode *ip, uint64_t lblock, int *new, + uint64_t *dblock, uint32_t *extlen, int prealloc); +extern int lgfs2_get_leaf_ptr(struct gfs2_inode *dip, uint32_t index, uint64_t *ptr) __attribute__((warn_unused_result)); +extern void dir_split_leaf(struct gfs2_inode *dip, uint32_t start, + uint64_t leaf_no, struct gfs2_buffer_head *obh); +extern void gfs2_free_block(struct gfs2_sbd *sdp, uint64_t block); +extern int gfs2_freedi(struct gfs2_sbd *sdp, uint64_t block); +extern int gfs2_get_leaf(struct gfs2_inode *dip, uint64_t leaf_no, + struct gfs2_buffer_head **bhp); +extern int gfs2_dirent_first(struct gfs2_inode *dip, + struct gfs2_buffer_head *bh, + struct gfs2_dirent **dent); +extern int gfs2_dirent_next(struct gfs2_inode *dip, struct gfs2_buffer_head *bh, + struct gfs2_dirent **dent); +extern void build_height(struct gfs2_inode *ip, int height); +extern void unstuff_dinode(struct gfs2_inode *ip); +extern unsigned int calc_tree_height(struct gfs2_inode *ip, uint64_t size); 
+extern int write_journal(struct gfs2_inode *jnl, unsigned bsize, unsigned blocks); +extern int lgfs2_write_journal_data(struct gfs2_inode *ip); +extern int lgfs2_write_filemeta(struct gfs2_inode *ip); +extern uint32_t lgfs2_log_header_hash(char *buf); +extern uint32_t lgfs2_log_header_crc(char *buf, unsigned bsize); + +/* gfs1.c - GFS1 backward compatibility structures and functions */ + +#define GFS_FORMAT_SB (100) /* Super-Block */ +#define GFS_METATYPE_SB (1) /* Super-Block */ +#define GFS_FORMAT_FS (1309) /* Filesystem (all-encompassing) */ +#define GFS_FORMAT_MULTI (1401) /* Multi-Host */ +/* GFS1 Dinode types */ +#define GFS_FILE_NON (0) +#define GFS_FILE_REG (1) /* regular file */ +#define GFS_FILE_DIR (2) /* directory */ +#define GFS_FILE_LNK (5) /* link */ +#define GFS_FILE_BLK (7) /* block device node */ +#define GFS_FILE_CHR (8) /* character device node */ +#define GFS_FILE_FIFO (101) /* fifo/pipe */ +#define GFS_FILE_SOCK (102) /* socket */ + +/* GFS 1 journal block types: */ +#define GFS_LOG_DESC_METADATA (300) /* metadata */ +#define GFS_LOG_DESC_IUL (400) /* unlinked inode */ +#define GFS_LOG_DESC_IDA (401) /* de-allocated inode */ +#define GFS_LOG_DESC_Q (402) /* quota */ +#define GFS_LOG_DESC_LAST (500) /* final in a logged transaction */ + +struct gfs_indirect { + struct gfs2_meta_header in_header; + + char in_reserved[64]; +}; + +struct gfs_dinode { + struct gfs2_meta_header di_header; + + struct gfs2_inum di_num; /* formal inode # and block address */ + + __be32 di_mode; /* mode of file */ + __be32 di_uid; /* owner's user id */ + __be32 di_gid; /* owner's group id */ + __be32 di_nlink; /* number (qty) of links to this file */ + __be64 di_size; /* number (qty) of bytes in file */ + __be64 di_blocks; /* number (qty) of blocks in file */ + __be64 di_atime; /* time last accessed */ + __be64 di_mtime; /* time last modified */ + __be64 di_ctime; /* time last changed */ + + /* Non-zero only for character or block device nodes */ + __be32 di_major; /* 
device major number */ + __be32 di_minor; /* device minor number */ + + /* Block allocation strategy */ + __be64 di_rgrp; /* dinode rgrp block number */ + __be64 di_goal_rgrp; /* rgrp to alloc from next */ + __be32 di_goal_dblk; /* data block goal */ + __be32 di_goal_mblk; /* metadata block goal */ + + __be32 di_flags; /* GFS_DIF_... */ + + /* struct gfs_rindex, struct gfs_jindex, or struct gfs_dirent */ + __be32 di_payload_format; /* GFS_FORMAT_... */ + __be16 di_type; /* GFS_FILE_... type of file */ + __be16 di_height; /* height of metadata (0 == stuffed) */ + __be32 di_incarn; /* incarnation (unused, see gfs_meta_header) */ + __be16 di_pad; + + /* These only apply to directories */ + __be16 di_depth; /* Number of bits in the table */ + __be32 di_entries; /* The # (qty) of entries in the directory */ + + /* This formed an on-disk chain of unused dinodes */ + struct gfs2_inum di_next_unused; /* used in old versions only */ + + __be64 di_eattr; /* extended attribute block number */ + + char di_reserved[56]; +}; + +struct gfs_sb { + /* Order is important; need to be able to read old superblocks + in order to support on-disk version upgrades */ + struct gfs2_meta_header sb_header; + + __be32 sb_fs_format; /* GFS_FORMAT_FS (on-disk version) */ + __be32 sb_multihost_format; /* GFS_FORMAT_MULTI */ + __be32 sb_flags; /* ?? */ + + __be32 sb_bsize; /* fundamental FS block size in bytes */ + __be32 sb_bsize_shift; /* log2(sb_bsize) */ + __be32 sb_seg_size; /* Journal segment size in FS blocks */ + + /* These special inodes do not appear in any on-disk directory. 
*/ + struct gfs2_inum sb_jindex_di; /* journal index inode */ + struct gfs2_inum sb_rindex_di; /* resource group index inode */ + struct gfs2_inum sb_root_di; /* root directory inode */ + + /* Default inter-node locking protocol (lock module) and namespace */ + uint8_t sb_lockproto[GFS2_LOCKNAME_LEN]; /* lock protocol name */ + uint8_t sb_locktable[GFS2_LOCKNAME_LEN]; /* unique name for this FS */ + + /* More special inodes */ + struct gfs2_inum sb_quota_di; /* quota inode */ + struct gfs2_inum sb_license_di; /* license inode */ + + char sb_reserved[96]; +}; + +struct gfs_rgrp { + struct gfs2_meta_header rg_header; + + __be32 rg_flags; + __be32 rg_free; /* Number (qty) of free data blocks */ + + /* Dinodes are USEDMETA, but are handled separately from other METAs */ + __be32 rg_useddi; /* Number (qty) of dinodes (used or free) */ + __be32 rg_freedi; /* Number (qty) of unused (free) dinodes */ + struct gfs2_inum rg_freedi_list; /* 1st block in chain of free dinodes */ + + /* These META statistics do not include dinodes (used or free) */ + __be32 rg_usedmeta; /* Number (qty) of used metadata blocks */ + __be32 rg_freemeta; /* Number (qty) of unused metadata blocks */ + + char rg_reserved[64]; +}; + +struct gfs_log_header { + struct gfs2_meta_header lh_header; + + __be32 lh_flags; /* GFS_LOG_HEAD_... */ + __be32 lh_pad; + + __be64 lh_first; /* Block number of first header in this trans */ + __be64 lh_sequence; /* Sequence number of this transaction */ + + __be64 lh_tail; /* Block number of log tail */ + __be64 lh_last_dump; /* Block number of last dump */ + + uint8_t lh_reserved[64]; +}; + +struct gfs_jindex { + __be64 ji_addr; /* starting block of the journal */ + __be32 ji_nsegment; /* number (quantity) of segments in journal */ + __be32 ji_pad; + + uint8_t ji_reserved[64]; +}; + +struct gfs_log_descriptor { + struct gfs2_meta_header ld_header; + + __be32 ld_type; /* GFS_LOG_DESC_... 
Type of this log chunk */ + __be32 ld_length; /* Number of buffers in this chunk */ + __be32 ld_data1; /* descriptor-specific field */ + __be32 ld_data2; /* descriptor-specific field */ + uint8_t ld_reserved[64]; +}; + +extern int is_gfs_dir(struct gfs2_dinode *dinode); +extern void gfs1_lookup_block(struct gfs2_inode *ip, + struct gfs2_buffer_head *bh, + unsigned int height, struct metapath *mp, + int create, int *new, uint64_t *block); +extern void gfs1_block_map(struct gfs2_inode *ip, uint64_t lblock, int *new, + uint64_t *dblock, uint32_t *extlen, int prealloc); +extern int gfs1_writei(struct gfs2_inode *ip, char *buf, uint64_t offset, + unsigned int size); +extern int gfs1_ri_update(struct gfs2_sbd *sdp, int fd, int *rgcount, int quiet); +extern struct gfs2_inode *lgfs2_gfs_inode_get(struct gfs2_sbd *sdp, + struct gfs2_buffer_head *bh); +extern struct gfs2_inode *lgfs2_gfs_inode_read(struct gfs2_sbd *sdp, + uint64_t di_addr); +extern void gfs_jindex_in(struct gfs_jindex *jindex, char *buf); +extern void gfs_rgrp_in(struct gfs_rgrp *rg, struct gfs2_buffer_head *bh); +extern void gfs_rgrp_out(struct gfs_rgrp *rg, struct gfs2_buffer_head *bh); + +/* misc.c */ +extern int compute_heightsize(unsigned bsize, uint64_t *heightsize, + uint32_t *maxheight, uint32_t bsize1, int diptrs, int inptrs); +extern int compute_constants(struct gfs2_sbd *sdp); +extern int lgfs2_open_mnt(const char *path, int dirflags, int *dirfd, int devflags, int *devfd, struct mntent **mnt); +extern int lgfs2_open_mnt_dev(const char *path, int flags, struct mntent **mnt); +extern int lgfs2_open_mnt_dir(const char *path, int flags, struct mntent **mnt); + +/* recovery.c */ +extern void gfs2_replay_incr_blk(struct gfs2_inode *ip, unsigned int *blk); +extern int gfs2_replay_read_block(struct gfs2_inode *ip, unsigned int blk, + struct gfs2_buffer_head **bh); +extern int gfs2_revoke_add(struct gfs2_sbd *sdp, uint64_t blkno, unsigned int where); +extern int gfs2_revoke_check(struct gfs2_sbd *sdp, 
uint64_t blkno, + unsigned int where); +extern void gfs2_revoke_clean(struct gfs2_sbd *sdp); +extern int get_log_header(struct gfs2_inode *ip, unsigned int blk, + struct gfs2_log_header *head); +extern int gfs2_find_jhead(struct gfs2_inode *ip, struct gfs2_log_header *head); +extern int clean_journal(struct gfs2_inode *ip, struct gfs2_log_header *head); + +/* rgrp.c */ +extern int gfs2_compute_bitstructs(const uint32_t bsize, struct rgrp_tree *rgd); +extern struct rgrp_tree *gfs2_blk2rgrpd(struct gfs2_sbd *sdp, uint64_t blk); +extern int lgfs2_rgrp_crc_check(char *buf); +extern void lgfs2_rgrp_crc_set(char *buf); +extern uint64_t gfs2_rgrp_read(struct gfs2_sbd *sdp, struct rgrp_tree *rgd); +extern void gfs2_rgrp_relse(struct rgrp_tree *rgd); +extern struct rgrp_tree *rgrp_insert(struct osi_root *rgtree, + uint64_t rgblock); +extern void gfs2_rgrp_free(struct osi_root *rgrp_tree); +/* figure out the size of the given resource group, in blocks */ +static inline unsigned int rgrp_size(struct rgrp_tree *rgrp) +{ + return rgrp->ri.ri_data + rgrp->ri.ri_length; +} + +/* structures.c */ +extern int build_master(struct gfs2_sbd *sdp); +extern void lgfs2_sb_init(struct gfs2_sb *sb, unsigned bsize); +extern int lgfs2_sb_write(const struct gfs2_sb *sb, int fd, const unsigned bsize); +extern int build_journal(struct gfs2_sbd *sdp, int j, + struct gfs2_inode *jindex); +extern int build_jindex(struct gfs2_sbd *sdp); +extern int lgfs2_build_jindex(struct gfs2_inode *master, struct gfs2_inum *jnls, size_t nmemb); +extern int build_per_node(struct gfs2_sbd *sdp); +extern int build_inum(struct gfs2_sbd *sdp); +extern int build_statfs(struct gfs2_sbd *sdp); +extern int build_rindex(struct gfs2_sbd *sdp); +extern int build_quota(struct gfs2_sbd *sdp); +extern int build_root(struct gfs2_sbd *sdp); +extern int do_init_inum(struct gfs2_sbd *sdp); +extern int do_init_statfs(struct gfs2_sbd *sdp); +extern int gfs2_check_meta(struct gfs2_buffer_head *bh, int type); +extern unsigned 
lgfs2_bm_scan(struct rgrp_tree *rgd, unsigned idx, + uint64_t *buf, uint8_t state); +extern int build_inum_range(struct gfs2_inode *per_node, unsigned int j); +extern int build_statfs_change(struct gfs2_inode *per_node, unsigned int j); +extern int build_quota_change(struct gfs2_inode *per_node, unsigned int j); + +/* super.c */ +extern int check_sb(struct gfs2_sb *sb); +extern int read_sb(struct gfs2_sbd *sdp); +extern int rindex_read(struct gfs2_sbd *sdp, int fd, uint64_t *count1, + int *sane); +extern int ri_update(struct gfs2_sbd *sdp, int fd, int *rgcount, int *sane); +extern int write_sb(struct gfs2_sbd *sdp); + +/* ondisk.c */ +extern uint32_t gfs2_disk_hash(const char *data, int len); +extern void print_it(const char *label, const char *fmt, const char *fmt2, ...) + __attribute__((format(printf,2,4))); + +/* Translation functions */ + +extern void gfs2_inum_in(struct gfs2_inum *no, char *buf); +extern void gfs2_inum_out(const struct gfs2_inum *no, char *buf); +extern void gfs2_meta_header_in(struct gfs2_meta_header *mh, char *buf); +extern void gfs2_meta_header_out(const struct gfs2_meta_header *mh, char *buf); +extern void gfs2_sb_in(struct gfs2_sb *sb, char *buf); +extern void gfs2_sb_out(const struct gfs2_sb *sb, char *buf); +extern void gfs2_rindex_in(struct gfs2_rindex *ri, char *buf); +extern void gfs2_rindex_out(const struct gfs2_rindex *ri, char *buf); +extern void gfs2_rgrp_in(struct gfs2_rgrp *rg, char *buf); +extern void gfs2_rgrp_out(const struct gfs2_rgrp *rg, char *buf); +extern void gfs2_quota_in(struct gfs2_quota *qu, char *buf); +extern void gfs2_quota_out(struct gfs2_quota *qu, char *buf); +extern void gfs2_dinode_in(struct gfs2_dinode *di, char *buf); +extern void gfs2_dinode_out(struct gfs2_dinode *di, char *buf); +extern void gfs2_dirent_in(struct gfs2_dirent *de, char *buf); +extern void gfs2_dirent_out(struct gfs2_dirent *de, char *buf); +extern void gfs2_leaf_in(struct gfs2_leaf *lf, char *buf); +extern void gfs2_leaf_out(struct 
gfs2_leaf *lf, char *buf); +extern void gfs2_ea_header_in(struct gfs2_ea_header *ea, char *buf); +extern void gfs2_log_header_v1_in(struct gfs2_log_header *lh, char *buf); +extern void gfs2_log_header_in(struct gfs2_log_header *lh, char *buf); +extern void gfs2_log_header_v1_out(struct gfs2_log_header *lh, char *buf); +extern void gfs2_log_header_out(struct gfs2_log_header *lh, char *buf); +extern void gfs2_log_descriptor_in(struct gfs2_log_descriptor *ld, char *buf); +extern void gfs2_log_descriptor_out(struct gfs2_log_descriptor *ld, char *buf); +extern void gfs2_statfs_change_in(struct gfs2_statfs_change *sc, char *buf); +extern void gfs2_statfs_change_out(struct gfs2_statfs_change *sc, char *buf); +extern void gfs2_quota_change_in(struct gfs2_quota_change *qc, char *buf); +extern void gfs2_quota_change_out(struct gfs2_quota_change *qc, char *buf); + +/* Printing functions */ + +extern void gfs2_inum_print(const struct gfs2_inum *no); +extern void gfs2_meta_header_print(const struct gfs2_meta_header *mh); +extern void gfs2_sb_print(const struct gfs2_sb *sb); +extern void gfs2_rindex_print(const struct gfs2_rindex *ri); +extern void gfs2_rgrp_print(const struct gfs2_rgrp *rg); +extern void gfs2_quota_print(const struct gfs2_quota *qu); +extern void gfs2_dinode_print(const struct gfs2_dinode *di); +extern void gfs2_leaf_print(const struct gfs2_leaf *lf); +extern void gfs2_ea_header_print(const struct gfs2_ea_header *ea, char *name); +extern void gfs2_log_header_v1_print(const struct gfs2_log_header *lh); +extern void gfs2_log_header_print(const struct gfs2_log_header *lh); +extern void gfs2_log_descriptor_print(const struct gfs2_log_descriptor *ld); +extern void gfs2_statfs_change_print(const struct gfs2_statfs_change *sc); +extern void gfs2_quota_change_print(const struct gfs2_quota_change *qc); + +/* Language functions */ + +struct lgfs2_lang_state; + +struct lgfs2_lang_result { + uint64_t lr_blocknr; + struct gfs2_buffer_head *lr_bh; + const struct 
lgfs2_metadata *lr_mtype; + int lr_state; // GFS2_BLKST_* +}; + +extern struct lgfs2_lang_state *lgfs2_lang_init(void); +extern int lgfs2_lang_parsef(struct lgfs2_lang_state *state, FILE *script); +extern int lgfs2_lang_parses(struct lgfs2_lang_state *state, const char *script); +extern struct lgfs2_lang_result *lgfs2_lang_result_next(struct lgfs2_lang_state *state, struct gfs2_sbd *sbd); +extern int lgfs2_lang_result_print(struct lgfs2_lang_result *result); +extern void lgfs2_lang_result_free(struct lgfs2_lang_result **result); +extern void lgfs2_lang_free(struct lgfs2_lang_state **state); + +__END_DECLS + +#endif /* __LIBGFS2_DOT_H__ */ diff --git a/gfs2/libgfs2/meta.c b/gfs2/libgfs2/meta.c new file mode 100644 index 0000000..a828946 --- /dev/null +++ b/gfs2/libgfs2/meta.c @@ -0,0 +1,979 @@ +#include +#include +#include "libgfs2.h" +#include "clusterautoconfig.h" + +#ifdef GFS2_HAS_UUID +#include +#endif + +#define ARRAY_SIZE(x) (sizeof(x)/sizeof(x[0])) +#define SYM(x) { x, #x }, + +const struct lgfs2_symbolic lgfs2_metatypes[] = { +SYM(GFS2_METATYPE_NONE) +SYM(GFS2_METATYPE_SB) +SYM(GFS2_METATYPE_RG) +SYM(GFS2_METATYPE_RB) +SYM(GFS2_METATYPE_DI) +SYM(GFS2_METATYPE_IN) +SYM(GFS2_METATYPE_LF) +SYM(GFS2_METATYPE_JD) +SYM(GFS2_METATYPE_LH) +SYM(GFS2_METATYPE_LD) +SYM(GFS2_METATYPE_LB) +SYM(GFS2_METATYPE_EA) +SYM(GFS2_METATYPE_ED) +SYM(GFS2_METATYPE_QC) +}; + +const unsigned lgfs2_metatype_size = ARRAY_SIZE(lgfs2_metatypes); + +const struct lgfs2_symbolic lgfs2_metaformats[] = { +SYM(GFS2_FORMAT_NONE) +SYM(GFS2_FORMAT_SB) +SYM(GFS2_FORMAT_RG) +SYM(GFS2_FORMAT_RB) +SYM(GFS2_FORMAT_DI) +SYM(GFS2_FORMAT_IN) +SYM(GFS2_FORMAT_LF) +SYM(GFS2_FORMAT_JD) +SYM(GFS2_FORMAT_LH) +SYM(GFS2_FORMAT_LD) +SYM(GFS2_FORMAT_LB) +SYM(GFS2_FORMAT_EA) +SYM(GFS2_FORMAT_ED) +SYM(GFS2_FORMAT_QC) +SYM(GFS2_FORMAT_RI) +SYM(GFS2_FORMAT_DE) +SYM(GFS2_FORMAT_QU) +}; + +const unsigned lgfs2_metaformat_size = ARRAY_SIZE(lgfs2_metaformats); + +const struct lgfs2_symbolic lgfs2_di_flags[] = { 
+SYM(GFS2_DIF_JDATA) +SYM(GFS2_DIF_EXHASH) +SYM(GFS2_DIF_UNUSED) +SYM(GFS2_DIF_EA_INDIRECT) +SYM(GFS2_DIF_DIRECTIO) +SYM(GFS2_DIF_IMMUTABLE) +SYM(GFS2_DIF_APPENDONLY) +SYM(GFS2_DIF_NOATIME) +SYM(GFS2_DIF_SYNC) +SYM(GFS2_DIF_SYSTEM) +SYM(GFS2_DIF_TRUNC_IN_PROG) +SYM(GFS2_DIF_INHERIT_DIRECTIO) +SYM(GFS2_DIF_INHERIT_JDATA) +}; + +const unsigned lgfs2_di_flag_size = ARRAY_SIZE(lgfs2_di_flags); + +const struct lgfs2_symbolic lgfs2_lh_flags[] = { +SYM(GFS2_LOG_HEAD_UNMOUNT) +}; + +const unsigned int lgfs2_lh_flag_size = ARRAY_SIZE(lgfs2_lh_flags); + +const struct lgfs2_symbolic lgfs2_ld_types[] = { +SYM(GFS2_LOG_DESC_METADATA) +SYM(GFS2_LOG_DESC_REVOKE) +SYM(GFS2_LOG_DESC_JDATA) +}; + +const unsigned int lgfs2_ld_type_size = ARRAY_SIZE(lgfs2_ld_types); + +const struct lgfs2_symbolic lgfs2_ld1_types[] = { +SYM(GFS_LOG_DESC_METADATA) +SYM(GFS_LOG_DESC_IUL) +SYM(GFS_LOG_DESC_IDA) +SYM(GFS_LOG_DESC_Q) +SYM(GFS_LOG_DESC_LAST) +}; + +const unsigned int lgfs2_ld1_type_size = ARRAY_SIZE(lgfs2_ld1_types); + +#undef SYM + + + + +#define F(f,...) { .name = #f, \ + .offset = offsetof(struct STRUCT, f), \ + .length = sizeof(((struct STRUCT *)(0))->f), \ + __VA_ARGS__ }, +#define FP(f,...) F(f, .flags = LGFS2_MFF_POINTER, __VA_ARGS__) +#define RF(f) F(f, .flags = LGFS2_MFF_RESERVED) +#define RFP(f,...) F(f, .flags = LGFS2_MFF_POINTER|LGFS2_MFF_RESERVED, __VA_ARGS__) + + +#define MH(f) F(f.mh_magic) \ + F(f.mh_type, .flags = LGFS2_MFF_ENUM, .symtab=lgfs2_metatypes, .nsyms=ARRAY_SIZE(lgfs2_metatypes)) \ + RF(f.__pad0) \ + F(f.mh_format, .flags = LGFS2_MFF_ENUM, .symtab=lgfs2_metaformats, .nsyms=ARRAY_SIZE(lgfs2_metaformats)) \ + F(f.mh_jid) + +#define IN(f,...) F(f.no_formal_ino) \ + FP(f.no_addr, __VA_ARGS__) + +#define INR(f,...) 
RF(f.no_formal_ino) \ + RFP(f.no_addr, __VA_ARGS__) +#define ANY_COMMON_BLOCK ((1 << LGFS2_MT_DIR_LEAF) | \ + (1 << LGFS2_MT_JRNL_DATA) | \ + (1 << LGFS2_MT_EA_ATTR) | \ + (1 << LGFS2_MT_EA_DATA) | \ + (1 << LGFS2_MT_DATA)) /* bitmask of (1 << LGFS2_MT_*); parenthesized so it groups as one operand */ + +#define ANY_GFS2_BLOCK ((1 << LGFS2_MT_GFS2_DINODE) | \ + (1 << LGFS2_MT_GFS2_INDIRECT) | \ + (1 << LGFS2_MT_GFS2_LOG_HEADER) | \ + (1 << LGFS2_MT_GFS2_LOG_DESC) | \ + (1 << LGFS2_MT_GFS2_LOG_BLOCK) | \ + ANY_COMMON_BLOCK) /* GFS2-specific block types plus the common ones */ + +#define ANY_GFS_BLOCK ((1 << LGFS2_MT_GFS_DINODE) | \ + (1 << LGFS2_MT_GFS_INDIRECT) | \ + ANY_COMMON_BLOCK) /* GFS1-specific block types plus the common ones */ + +#undef STRUCT +#define STRUCT gfs2_sb + +static const struct lgfs2_metafield gfs2_sb_fields[] = { +MH(sb_header) +F(sb_fs_format) +F(sb_multihost_format) +RF(__pad0) +F(sb_bsize, .flags = LGFS2_MFF_BYTES) +F(sb_bsize_shift, .flags = LGFS2_MFF_BYTES|LGFS2_MFF_SHIFT) +RF(__pad1) +IN(sb_master_dir, .points_to = (1 << LGFS2_MT_GFS2_DINODE)) +INR(__pad2, .points_to = (1 << LGFS2_MT_GFS2_DINODE)) +IN(sb_root_dir, .points_to = (1 << LGFS2_MT_GFS2_DINODE)) +F(sb_lockproto, .flags = LGFS2_MFF_STRING) +F(sb_locktable, .flags = LGFS2_MFF_STRING) +INR(__pad3, .points_to = (1 << LGFS2_MT_GFS2_DINODE)) +INR(__pad4, .points_to = (1 << LGFS2_MT_GFS2_DINODE)) +#ifdef GFS2_HAS_UUID +F(sb_uuid, .flags = LGFS2_MFF_UUID) +#endif +}; + +#undef STRUCT +#define STRUCT gfs_sb + +static const struct lgfs2_metafield gfs_sb_fields[] = { +MH(sb_header) +F(sb_fs_format) +F(sb_multihost_format) +F(sb_flags) +F(sb_bsize, .flags = LGFS2_MFF_BYTES) +F(sb_bsize_shift, .flags = LGFS2_MFF_BYTES|LGFS2_MFF_SHIFT) +F(sb_seg_size, .flags = LGFS2_MFF_FSBLOCKS) +IN(sb_jindex_di, .points_to = (1 << LGFS2_MT_GFS_DINODE)) +IN(sb_rindex_di, .points_to = (1 << LGFS2_MT_GFS_DINODE)) +IN(sb_root_di, .points_to = (1 << LGFS2_MT_GFS_DINODE)) +F(sb_lockproto, .flags = LGFS2_MFF_STRING) +F(sb_locktable, .flags = LGFS2_MFF_STRING) +IN(sb_quota_di, .points_to = (1 << LGFS2_MT_GFS_DINODE)) +IN(sb_license_di, .points_to = (1 << LGFS2_MT_GFS_DINODE)) +RF(sb_reserved) +}; + 
+#undef STRUCT +#define STRUCT gfs2_rindex + +static const struct lgfs2_metafield gfs2_rindex_fields[] = { +FP(ri_addr, .points_to = (1 << LGFS2_MT_GFS2_RGRP)) +F(ri_length, .flags = LGFS2_MFF_FSBLOCKS) +RF(__pad) +FP(ri_data0, .points_to = ANY_GFS2_BLOCK|(1 << LGFS2_MT_FREE)) +F(ri_data, .flags = LGFS2_MFF_FSBLOCKS) +F(ri_bitbytes, .flags = LGFS2_MFF_BYTES) +F(ri_reserved) +}; + +#undef STRUCT +#define STRUCT gfs2_rgrp + +static const struct lgfs2_metafield gfs2_rgrp_fields[] = { +MH(rg_header) +F(rg_flags) +F(rg_free, .flags = LGFS2_MFF_FSBLOCKS) +F(rg_dinodes, .flags = LGFS2_MFF_FSBLOCKS) +#ifdef GFS2_HAS_RG_SKIP +FP(rg_skip, .points_to = (1 << LGFS2_MT_GFS2_RGRP)) +#else +RF(__pad) +#endif +F(rg_igeneration) +#ifdef GFS2_HAS_RG_RI_FIELDS +FP(rg_data0, .points_to = ANY_GFS2_BLOCK|(1 << LGFS2_MT_FREE)) +F(rg_data, .flags = LGFS2_MFF_FSBLOCKS) +F(rg_bitbytes, .flags = LGFS2_MFF_BYTES) +F(rg_crc, .flags = LGFS2_MFF_CHECK) +#endif +RF(rg_reserved) +}; + +#undef STRUCT +#define STRUCT gfs_rgrp + +static const struct lgfs2_metafield gfs_rgrp_fields[] = { +MH(rg_header) +F(rg_flags) +F(rg_free, .flags = LGFS2_MFF_FSBLOCKS) +F(rg_useddi, .flags = LGFS2_MFF_FSBLOCKS) +F(rg_freedi, .flags = LGFS2_MFF_FSBLOCKS) +IN(rg_freedi_list, .points_to = (1 << LGFS2_MT_GFS_DINODE)) +F(rg_usedmeta, .flags = LGFS2_MFF_FSBLOCKS) +F(rg_freemeta, .flags = LGFS2_MFF_FSBLOCKS) +RF(rg_reserved) +}; + +#undef STRUCT +struct gfs2_rgrp_bitmap { struct gfs2_meta_header rb_header; }; +#define STRUCT gfs2_rgrp_bitmap + +static const struct lgfs2_metafield gfs2_rgrp_bitmap_fields[] = { +MH(rb_header) +}; + +#undef STRUCT +#define STRUCT gfs2_dinode + +static const struct lgfs2_metafield gfs2_dinode_fields[] = { +MH(di_header) +IN(di_num, .points_to = (1 << LGFS2_MT_GFS2_DINODE)) +F(di_mode, .flags = LGFS2_MFF_MODE) +F(di_uid, .flags = LGFS2_MFF_UID) +F(di_gid, .flags = LGFS2_MFF_GID) +F(di_nlink) +F(di_size, .flags = LGFS2_MFF_BYTES) +F(di_blocks, .flags = LGFS2_MFF_FSBLOCKS) +F(di_atime, .flags = 
LGFS2_MFF_SECS) +F(di_mtime, .flags = LGFS2_MFF_SECS) +F(di_ctime, .flags = LGFS2_MFF_SECS) +F(di_major, .flags = LGFS2_MFF_MAJOR) +F(di_minor, .flags = LGFS2_MFF_MINOR) +FP(di_goal_meta, .points_to = ANY_GFS2_BLOCK | (1 << LGFS2_MT_FREE)) +FP(di_goal_data, .points_to = ANY_GFS2_BLOCK | (1 << LGFS2_MT_FREE)) +F(di_generation) +F(di_flags, .flags = LGFS2_MFF_MASK, .symtab=lgfs2_di_flags, .nsyms=ARRAY_SIZE(lgfs2_di_flags)) +F(di_payload_format) +RF(__pad1) +F(di_height) +RF(__pad2) +RF(__pad3) +F(di_depth) +F(di_entries) +INR(__pad4, .points_to = (1 << LGFS2_MT_GFS2_DINODE)) +FP(di_eattr, .points_to = (1 << LGFS2_MT_EA_ATTR)|(1 << LGFS2_MT_GFS2_INDIRECT)) +F(di_atime_nsec, .flags = LGFS2_MFF_NSECS) +F(di_mtime_nsec, .flags = LGFS2_MFF_NSECS) +F(di_ctime_nsec, .flags = LGFS2_MFF_NSECS) +RF(di_reserved) +}; + +#undef STRUCT +#define STRUCT gfs_dinode + +static const struct lgfs2_metafield gfs_dinode_fields[] = { +MH(di_header) +IN(di_num, .points_to = (1 << LGFS2_MT_GFS_DINODE)) +F(di_mode, .flags = LGFS2_MFF_MODE) +F(di_uid, .flags = LGFS2_MFF_UID) +F(di_gid, .flags = LGFS2_MFF_GID) +F(di_nlink) +F(di_size, .flags = LGFS2_MFF_BYTES) +F(di_blocks, .flags = LGFS2_MFF_FSBLOCKS) +F(di_atime, .flags = LGFS2_MFF_SECS) +F(di_mtime, .flags = LGFS2_MFF_SECS) +F(di_ctime, .flags = LGFS2_MFF_SECS) +F(di_major, .flags = LGFS2_MFF_MAJOR) +F(di_minor, .flags = LGFS2_MFF_MINOR) +FP(di_rgrp, .points_to = (1 << LGFS2_MT_GFS_RGRP)) /* points_to is a bitmask of (1 << LGFS2_MT_*), as in every other entry */ +FP(di_goal_rgrp, .points_to = (1 << LGFS2_MT_GFS_RGRP)) +F(di_goal_dblk) +F(di_goal_mblk) +F(di_flags, .flags = LGFS2_MFF_MASK, .symtab=lgfs2_di_flags, .nsyms=ARRAY_SIZE(lgfs2_di_flags)) +F(di_payload_format) +F(di_type) +F(di_height) +F(di_incarn) +F(di_pad) +F(di_depth) +F(di_entries) +INR(di_next_unused, .points_to = (1 << LGFS2_MT_GFS_DINODE)) +FP(di_eattr, .points_to = (1 << LGFS2_MT_EA_ATTR)|(1 << LGFS2_MT_GFS_INDIRECT)) +F(di_reserved) +}; + +#undef STRUCT +struct gfs2_indirect { struct gfs2_meta_header in_header; }; +#define STRUCT gfs2_indirect + +static const 
struct lgfs2_metafield gfs2_indirect_fields[] = { +MH(in_header) +}; + +#undef STRUCT +#define STRUCT gfs_indirect + +static const struct lgfs2_metafield gfs_indirect_fields[] = { +MH(in_header) +RF(in_reserved) +}; + +#undef STRUCT +#define STRUCT gfs2_leaf + +static const struct lgfs2_metafield gfs2_leaf_fields[] = { +MH(lf_header) +F(lf_depth) +F(lf_entries) +F(lf_dirent_format) +F(lf_next) +#ifdef GFS2_HAS_LEAF_HINTS +FP(lf_inode, .points_to = (1 << LGFS2_MT_GFS2_DINODE)) +F(lf_dist) +F(lf_nsec, .flags = LGFS2_MFF_NSECS) +F(lf_sec, .flags = LGFS2_MFF_SECS) +RF(lf_reserved2) +#else +RF(lf_reserved) +#endif +}; + +#undef STRUCT +struct gfs2_jrnl_data { struct gfs2_meta_header jd_header; }; +#define STRUCT gfs2_jrnl_data + +static const struct lgfs2_metafield gfs2_jdata_fields[] = { +MH(jd_header) +}; + +#undef STRUCT +#define STRUCT gfs2_log_header + +static const struct lgfs2_metafield gfs2_log_header_fields[] = { +MH(lh_header) +F(lh_sequence) +F(lh_flags) +F(lh_tail) +F(lh_blkno) +F(lh_hash, .flags = LGFS2_MFF_CHECK) +#ifdef GFS2_HAS_LH_V2 +F(lh_crc, .flags = LGFS2_MFF_CHECK) +F(lh_nsec, .flags = LGFS2_MFF_NSECS) +F(lh_sec, .flags = LGFS2_MFF_SECS) +FP(lh_addr, .points_to = (1 << LGFS2_MT_GFS2_LOG_BLOCK)) +FP(lh_jinode, .points_to = (1 << LGFS2_MT_GFS2_DINODE)) +FP(lh_statfs_addr, .points_to = (1 << LGFS2_MT_GFS2_DINODE)) +FP(lh_quota_addr, .points_to = (1 << LGFS2_MT_GFS2_DINODE)) +F(lh_local_total, .flags = LGFS2_MFF_FSBLOCKS) +F(lh_local_free, .flags = LGFS2_MFF_FSBLOCKS) +F(lh_local_dinodes, .flags = LGFS2_MFF_FSBLOCKS) +#endif +}; + +#undef STRUCT +#define STRUCT gfs_log_header + +static const struct lgfs2_metafield gfs_log_header_fields[] = { +MH(lh_header) +F(lh_flags, .flags = LGFS2_MFF_MASK, .symtab = lgfs2_lh_flags, .nsyms = ARRAY_SIZE(lgfs2_lh_flags)) +RF(lh_pad) +F(lh_first) +F(lh_sequence) +F(lh_tail) +F(lh_last_dump) +RF(lh_reserved) +}; + +#undef STRUCT +#define STRUCT gfs2_log_descriptor + +static const struct lgfs2_metafield 
gfs2_log_desc_fields[] = { +MH(ld_header) +F(ld_type, .flags = LGFS2_MFF_ENUM, .symtab = lgfs2_ld_types, .nsyms = ARRAY_SIZE(lgfs2_ld_types)) +F(ld_length, .flags = LGFS2_MFF_FSBLOCKS) +F(ld_data1) +F(ld_data2) +RF(ld_reserved) +}; + +#undef STRUCT +#define STRUCT gfs_log_descriptor + +static const struct lgfs2_metafield gfs_log_desc_fields[] = { +MH(ld_header) +F(ld_type, .flags = LGFS2_MFF_ENUM, .symtab = lgfs2_ld1_types, .nsyms = ARRAY_SIZE(lgfs2_ld1_types)) +F(ld_length, .flags = LGFS2_MFF_FSBLOCKS) +F(ld_data1) +F(ld_data2) +RF(ld_reserved) +}; + +#undef STRUCT +struct gfs2_log_block { struct gfs2_meta_header lb_header; }; +#define STRUCT gfs2_log_block + +static const struct lgfs2_metafield gfs2_log_block_fields[] = { +MH(lb_header) +}; + +#undef STRUCT +struct gfs2_ea_attr { struct gfs2_meta_header ea_header; }; +#define STRUCT gfs2_ea_attr + +static const struct lgfs2_metafield gfs2_ea_attr_fields[] = { +MH(ea_header) +}; + +#undef STRUCT +struct gfs2_ea_data { struct gfs2_meta_header ed_header; }; +#define STRUCT gfs2_ea_data + +static const struct lgfs2_metafield gfs2_ea_data_fields[] = { +MH(ed_header) +}; + +#undef STRUCT +#define STRUCT gfs2_quota_change + +static const struct lgfs2_metafield gfs2_quota_change_fields[] = { +F(qc_change, .flags = LGFS2_MFF_FSBLOCKS) +F(qc_flags) +F(qc_id) +}; + +#undef STRUCT +#define STRUCT gfs2_dirent + +static const struct lgfs2_metafield gfs2_dirent_fields[] = { +IN(de_inum, .points_to = (1 << LGFS2_MT_GFS_DINODE)|(1 << LGFS2_MT_GFS2_DINODE)) +F(de_hash, .flags = LGFS2_MFF_CHECK) +F(de_rec_len, .flags = LGFS2_MFF_BYTES) +F(de_name_len, .flags = LGFS2_MFF_BYTES) +F(de_type) +#ifdef GFS2_HAS_DE_RAHEAD +F(de_rahead) +#ifdef GFS2_HAS_DE_COOKIE +F(de_cookie) +RF(pad3) +#else +RF(pad2) +#endif /* GFS2_HAS_DE_COOKIE */ +#else +RF(__pad) +#endif /* GFS2_HAS_DE_RAHEAD */ +}; + +#undef STRUCT +#define STRUCT gfs2_ea_header + +static const struct lgfs2_metafield gfs2_ea_header_fields[] = { +F(ea_rec_len, .flags = 
LGFS2_MFF_BYTES) +F(ea_data_len, .flags = LGFS2_MFF_BYTES) +F(ea_name_len, .flags = LGFS2_MFF_BYTES) +F(ea_type) +F(ea_flags) +F(ea_num_ptrs) +RF(__pad) +}; + +#undef STRUCT +#define STRUCT gfs2_inum_range + +static const struct lgfs2_metafield gfs2_inum_range_fields[] = { +F(ir_start) +F(ir_length) +}; + +#undef STRUCT +#define STRUCT gfs2_statfs_change + +static const struct lgfs2_metafield gfs2_statfs_change_fields[] = { +F(sc_total, .flags = LGFS2_MFF_FSBLOCKS) +F(sc_free, .flags = LGFS2_MFF_FSBLOCKS) +F(sc_dinodes, .flags = LGFS2_MFF_FSBLOCKS) +}; + +#undef STRUCT +#define STRUCT gfs_jindex + +static const struct lgfs2_metafield gfs_jindex_fields[] = { +FP(ji_addr, .points_to = (1 << LGFS2_MT_DATA)) +F(ji_nsegment) +RF(ji_pad) +RF(ji_reserved) +}; + +#undef STRUCT +struct gfs_block_tag { + uint64_t bt_blkno; /* inplace block number */ + uint32_t bt_flags; /* ?? */ + uint32_t bt_pad; +}; +#define STRUCT gfs_block_tag + +static const struct lgfs2_metafield gfs_block_tag_fields[] = { +FP(bt_blkno, .points_to = ANY_GFS_BLOCK) +RF(bt_flags) +RF(bt_pad) +}; + +const struct lgfs2_metadata lgfs2_metadata[] = { + [LGFS2_MT_GFS2_SB] = { + .versions = LGFS2_MD_GFS2, + .header = 1, + .mh_type = GFS2_METATYPE_SB, + .mh_format = GFS2_FORMAT_SB, + .name = "gfs2_sb", + .fields = gfs2_sb_fields, + .nfields = ARRAY_SIZE(gfs2_sb_fields), + .size = sizeof(struct gfs2_sb), + }, + [LGFS2_MT_GFS_SB] = { + .versions = LGFS2_MD_GFS1, + .header = 1, + .mh_type = GFS2_METATYPE_SB, + .mh_format = GFS_FORMAT_SB, + .name = "gfs_sb", + .fields = gfs_sb_fields, + .nfields = ARRAY_SIZE(gfs_sb_fields), + .size = sizeof(struct gfs_sb), + }, + [LGFS2_MT_RINDEX] = { + .versions = LGFS2_MD_GFS1 | LGFS2_MD_GFS2, + .name = "rindex", + .fields = gfs2_rindex_fields, + .nfields = ARRAY_SIZE(gfs2_rindex_fields), + .size = sizeof(struct gfs2_rindex), + }, + [LGFS2_MT_GFS2_RGRP] = { + .versions = LGFS2_MD_GFS2, + .header = 1, + .mh_type = GFS2_METATYPE_RG, + .mh_format = GFS2_FORMAT_RG, + .name = 
"gfs2_rgrp", + .fields = gfs2_rgrp_fields, + .nfields = ARRAY_SIZE(gfs2_rgrp_fields), + .size = sizeof(struct gfs2_rgrp), + }, + [LGFS2_MT_GFS_RGRP] = { + .versions = LGFS2_MD_GFS1, + .header = 1, + .mh_type = GFS2_METATYPE_RG, + .mh_format = GFS2_FORMAT_RG, + .name = "gfs_rgrp", + .fields = gfs_rgrp_fields, + .nfields = ARRAY_SIZE(gfs_rgrp_fields), + .size = sizeof(struct gfs_rgrp), + }, + [LGFS2_MT_RGRP_BITMAP] = { + .versions = LGFS2_MD_GFS1 | LGFS2_MD_GFS2, + .header = 1, + .mh_type = GFS2_METATYPE_RB, + .mh_format = GFS2_FORMAT_RB, + .name = "gfs2_rgrp_bitmap", + .fields = gfs2_rgrp_bitmap_fields, + .nfields = ARRAY_SIZE(gfs2_rgrp_bitmap_fields), + .size = sizeof(struct gfs2_meta_header), + }, + [LGFS2_MT_GFS2_DINODE] = { + .versions = LGFS2_MD_GFS2, + .header = 1, + .mh_type = GFS2_METATYPE_DI, + .mh_format = GFS2_FORMAT_DI, + .name = "gfs2_dinode", + .fields = gfs2_dinode_fields, + .nfields = ARRAY_SIZE(gfs2_dinode_fields), + .size = sizeof(struct gfs2_dinode), + }, + [LGFS2_MT_GFS_DINODE] = { + .versions = LGFS2_MD_GFS1, + .header = 1, + .mh_type = GFS2_METATYPE_DI, + .mh_format = GFS2_FORMAT_DI, + .name = "gfs_dinode", + .fields = gfs_dinode_fields, + .nfields = ARRAY_SIZE(gfs_dinode_fields), + .size = sizeof(struct gfs_dinode), + }, + [LGFS2_MT_GFS2_INDIRECT] = { + .versions = LGFS2_MD_GFS2, + .header = 1, + .mh_type = GFS2_METATYPE_IN, + .mh_format = GFS2_FORMAT_IN, + .name = "gfs2_indirect", + .fields = gfs2_indirect_fields, + .nfields = ARRAY_SIZE(gfs2_indirect_fields), + .size = sizeof(struct gfs2_meta_header), + }, + [LGFS2_MT_GFS_INDIRECT] = { + .versions = LGFS2_MD_GFS1, + .header = 1, + .mh_type = GFS2_METATYPE_IN, + .mh_format = GFS2_FORMAT_IN, + .name = "gfs_indirect", + .fields = gfs_indirect_fields, + .nfields = ARRAY_SIZE(gfs_indirect_fields), + .size = sizeof(struct gfs_indirect), + }, + [LGFS2_MT_DIR_LEAF] = { + .versions = LGFS2_MD_GFS1 | LGFS2_MD_GFS2, + .header = 1, + .mh_type = GFS2_METATYPE_LF, + .mh_format = GFS2_FORMAT_LF, + .name = 
"gfs2_leaf", + .fields = gfs2_leaf_fields, + .nfields = ARRAY_SIZE(gfs2_leaf_fields), + .size = sizeof(struct gfs2_leaf), + }, + [LGFS2_MT_JRNL_DATA] = { + .versions = LGFS2_MD_GFS1 | LGFS2_MD_GFS2, + .header = 1, + .mh_type = GFS2_METATYPE_JD, + .mh_format = GFS2_FORMAT_JD, + .name = "gfs2_jdata", + .fields = gfs2_jdata_fields, + .nfields = ARRAY_SIZE(gfs2_jdata_fields), + .size = sizeof(struct gfs2_meta_header), + }, + [LGFS2_MT_GFS2_LOG_HEADER] = { + .versions = LGFS2_MD_GFS2, + .header = 1, + .mh_type = GFS2_METATYPE_LH, + .mh_format = GFS2_FORMAT_LH, + .name = "gfs2_log_header", + .fields = gfs2_log_header_fields, + .nfields = ARRAY_SIZE(gfs2_log_header_fields), + .size = sizeof(struct gfs2_log_header), + }, + [LGFS2_MT_GFS_LOG_HEADER] = { + .versions = LGFS2_MD_GFS1, + .header = 1, + .mh_type = GFS2_METATYPE_LH, + .mh_format = GFS2_FORMAT_LH, + .name = "gfs_log_header", + .fields = gfs_log_header_fields, + .nfields = ARRAY_SIZE(gfs_log_header_fields), + .size = sizeof(struct gfs_log_header), + }, + [LGFS2_MT_GFS2_LOG_DESC] = { + .versions = LGFS2_MD_GFS2, + .header = 1, + .mh_type = GFS2_METATYPE_LD, + .mh_format = GFS2_FORMAT_LD, + .name = "gfs2_log_desc", + .fields = gfs2_log_desc_fields, + .nfields = ARRAY_SIZE(gfs2_log_desc_fields), + .size = sizeof(struct gfs2_log_descriptor), + }, + [LGFS2_MT_GFS_LOG_DESC] = { + .versions = LGFS2_MD_GFS1, + .header = 1, + .mh_type = GFS2_METATYPE_LD, + .mh_format = GFS2_FORMAT_LD, + .name = "gfs_log_desc", + .fields = gfs_log_desc_fields, + .nfields = ARRAY_SIZE(gfs_log_desc_fields), + .size = sizeof(struct gfs_log_descriptor), + }, + [LGFS2_MT_GFS2_LOG_BLOCK] = { + .versions = LGFS2_MD_GFS2, + .header = 1, + .mh_type = GFS2_METATYPE_LB, + .mh_format = GFS2_FORMAT_LB, + .name = "gfs2_log_block", + .fields = gfs2_log_block_fields, + .nfields = ARRAY_SIZE(gfs2_log_block_fields), + .size = sizeof(struct gfs2_meta_header), + }, + [LGFS2_MT_EA_ATTR] = { + .versions = LGFS2_MD_GFS1 | LGFS2_MD_GFS2, + .header = 1, + .mh_type = 
GFS2_METATYPE_EA, + .mh_format = GFS2_FORMAT_EA, + .name = "gfs2_ea_attr", + .fields = gfs2_ea_attr_fields, + .nfields = ARRAY_SIZE(gfs2_ea_attr_fields), + .size = sizeof(struct gfs2_meta_header), + }, + [LGFS2_MT_EA_DATA] = { + .versions = LGFS2_MD_GFS1 | LGFS2_MD_GFS2, + .header = 1, + .mh_type = GFS2_METATYPE_ED, + .mh_format = GFS2_FORMAT_ED, + .name = "gfs2_ea_data", + .fields = gfs2_ea_data_fields, + .nfields = ARRAY_SIZE(gfs2_ea_data_fields), + .size = sizeof(struct gfs2_meta_header), + }, + [LGFS2_MT_GFS2_QUOTA_CHANGE] = { + .versions = LGFS2_MD_GFS2, + .name = "gfs2_quota_change", + .fields = gfs2_quota_change_fields, + .nfields = ARRAY_SIZE(gfs2_quota_change_fields), + .size = sizeof(struct gfs2_quota_change), + }, + [LGFS2_MT_DIRENT] = { + .versions = LGFS2_MD_GFS1 | LGFS2_MD_GFS2, + .name = "gfs2_dirent", + .fields = gfs2_dirent_fields, + .nfields = ARRAY_SIZE(gfs2_dirent_fields), + .size = sizeof(struct gfs2_dirent), + }, + [LGFS2_MT_EA_HEADER] = { + .versions = LGFS2_MD_GFS1 | LGFS2_MD_GFS2, + .name = "gfs2_ea_header", + .fields = gfs2_ea_header_fields, + .nfields = ARRAY_SIZE(gfs2_ea_header_fields), + .size = sizeof(struct gfs2_ea_header), + }, + [LGFS2_MT_GFS2_INUM_RANGE] = { + .versions = LGFS2_MD_GFS2, + .name = "gfs2_inum_range", + .fields = gfs2_inum_range_fields, + .nfields = ARRAY_SIZE(gfs2_inum_range_fields), + .size = sizeof(struct gfs2_inum_range), + }, + [LGFS2_MT_STATFS_CHANGE] = { + .versions = LGFS2_MD_GFS1 | LGFS2_MD_GFS2, + .name = "gfs2_statfs_change", + .fields = gfs2_statfs_change_fields, + .nfields = ARRAY_SIZE(gfs2_statfs_change_fields), + .size = sizeof(struct gfs2_statfs_change), + }, + [LGFS2_MT_GFS_JINDEX] = { + .versions = LGFS2_MD_GFS1, + .name = "gfs_jindex", + .fields = gfs_jindex_fields, + .nfields = ARRAY_SIZE(gfs_jindex_fields), + .size = sizeof(struct gfs_jindex), + }, + [LGFS2_MT_GFS_BLOCK_TAG] = { + .versions = LGFS2_MD_GFS1, + .name = "gfs_block_tag", + .fields = gfs_block_tag_fields, + .nfields = 
ARRAY_SIZE(gfs_block_tag_fields), + .size = sizeof(struct gfs_block_tag), + }, + [LGFS2_MT_DATA] = { + .versions = LGFS2_MD_GFS1 | LGFS2_MD_GFS2, + .name = "data", + }, + [LGFS2_MT_FREE] = { + .versions = LGFS2_MD_GFS1 | LGFS2_MD_GFS2, + .name = "free", + }, +}; + +const unsigned lgfs2_metadata_size = ARRAY_SIZE(lgfs2_metadata); + +const struct lgfs2_metafield *lgfs2_find_mfield_name(const char *name, const struct lgfs2_metadata *mtype) +{ + int j; + const struct lgfs2_metafield *f; + + for (j = 0; j < mtype->nfields; j++) { + f = &mtype->fields[j]; + if (strcmp(f->name, name) == 0) + return f; + } + return NULL; +} + +static int check_metadata_sizes(void) +{ + unsigned offset; + int i, j; + int ret = 0; + + for (i = 0; i < lgfs2_metadata_size; i++) { + const struct lgfs2_metadata *m = &lgfs2_metadata[i]; + offset = 0; + for (j = 0; j < m->nfields; j++) { + const struct lgfs2_metafield *f = &m->fields[j]; + if (f->offset != offset) { + fprintf(stderr, "%s: %s: offset is %u, expected %u\n", m->name, f->name, f->offset, offset); + ret = -1; + } + offset += f->length; + } + if (offset != m->size) { + fprintf(stderr, "%s: size mismatch between struct %u and fields %u\n", m->name, m->size, offset); + ret = -1; + } + } + + return ret; +} + +static int check_symtab(void) +{ + int i, j; + int ret = 0; + + for (i = 0; i < lgfs2_metadata_size; i++) { + const struct lgfs2_metadata *m = &lgfs2_metadata[i]; + for (j = 0; j < m->nfields; j++) { + const struct lgfs2_metafield *f = &m->fields[j]; + if (f->flags & (LGFS2_MFF_MASK|LGFS2_MFF_ENUM)) { + if (f->symtab == NULL) { + fprintf(stderr, "%s: Missing symtab for %s\n", m->name, f->name); + ret = -1; + } + } + if (f->symtab) { + if (!(f->flags & (LGFS2_MFF_MASK|LGFS2_MFF_ENUM))) { + fprintf(stderr, "%s: Symtab for non-enum and non-mask field %s\n", m->name, f->name); + ret = -1; + } + } + } + } + + return ret; +} + +static int check_ptrs(void) +{ + int i, j; + int ret = 0; + + for (i = 0; i < lgfs2_metadata_size; i++) { + const 
struct lgfs2_metadata *m = &lgfs2_metadata[i]; + for (j = 0; j < m->nfields; j++) { + const struct lgfs2_metafield *f = &m->fields[j]; + if ((f->flags & LGFS2_MFF_POINTER) && !f->points_to) { + fprintf(stderr, "%s: Pointer entry %s has no destination\n", m->name, f->name); + ret = -1; + } + } + } + + return ret; +} + +int lgfs2_selfcheck(void) +{ + int ret = 0; + + ret |= check_metadata_sizes(); + ret |= check_symtab(); + ret |= check_ptrs(); + + return ret; +} + +const struct lgfs2_metadata *lgfs2_find_mtype(uint32_t mh_type, const unsigned versions) +{ + const struct lgfs2_metadata *m = lgfs2_metadata; + unsigned n = 0; + + do { + if ((m[n].versions & versions) && m[n].mh_type == mh_type) + return &m[n]; + n++; + } while (n < lgfs2_metadata_size); + + return NULL; +} + +const struct lgfs2_metadata *lgfs2_find_mtype_name(const char *name, const unsigned versions) +{ + const struct lgfs2_metadata *m = lgfs2_metadata; + unsigned n = 0; + + do { + if ((m[n].versions & versions) && !strcmp(m[n].name, name)) + return &m[n]; + n++; + } while (n < lgfs2_metadata_size); + + return NULL; +} + +int lgfs2_field_str(char *str, const size_t size, const char *blk, const struct lgfs2_metafield *field, int hex) +{ + const char *fieldp = blk + field->offset; + + errno = EINVAL; + if (str == NULL) + return 1; + + if (field->flags & LGFS2_MFF_UUID) { +#ifdef GFS2_HAS_UUID + char readable_uuid[36+1]; + uuid_t uuid; + + memcpy(uuid, fieldp, sizeof(uuid_t)); + uuid_unparse(uuid, readable_uuid); + snprintf(str, size, "%s", readable_uuid); +#endif + } else if (field->flags & LGFS2_MFF_STRING) { + snprintf(str, size, "%s", fieldp); + } else { + switch(field->length) { + case sizeof(uint8_t): + snprintf(str, size, hex? "%"PRIx8 : "%"PRIu8, *(uint8_t *)fieldp); + break; + case sizeof(uint16_t): + snprintf(str, size, hex? "%"PRIx16 : "%"PRIu16, be16_to_cpu(*(uint16_t *)fieldp)); + break; + case sizeof(uint32_t): + snprintf(str, size, hex? 
"%"PRIx32 : "%"PRIu32, be32_to_cpu(*(uint32_t *)fieldp)); + break; + case sizeof(uint64_t): + snprintf(str, size, hex? "%"PRIx64 : "%"PRIu64, be64_to_cpu(*(uint64_t *)fieldp)); + break; + default: + break; + } + } + str[size - 1] = '\0'; + return 0; +} + +int lgfs2_field_assign(char *blk, const struct lgfs2_metafield *field, const void *val) +{ + char *fieldp = blk + field->offset; + + if (field->flags & LGFS2_MFF_UUID) { + memcpy(fieldp, val, 16); + return 0; + } + + errno = EINVAL; + if (field->flags & LGFS2_MFF_STRING) { + size_t len = strnlen(val, field->length); + + if (len >= field->length) + return 1; + strncpy(fieldp, val, field->length - 1); + fieldp[field->length - 1] = '\0'; + return 0; + } + + switch(field->length) { + case sizeof(uint8_t): + *fieldp = *(uint8_t *)val; + return 0; + case sizeof(uint16_t): + *(uint16_t *)fieldp = cpu_to_be16(*(uint16_t *)val); + return 0; + case sizeof(uint32_t): + *(uint32_t *)fieldp = cpu_to_be32(*(uint32_t *)val); + return 0; + case sizeof(uint64_t): + *(uint64_t *)fieldp = cpu_to_be64(*(uint64_t *)val); + return 0; + default: + /* Will never happen */ + break; + } + + return 1; +} diff --git a/gfs2/libgfs2/misc.c b/gfs2/libgfs2/misc.c new file mode 100644 index 0000000..6dfd9af --- /dev/null +++ b/gfs2/libgfs2/misc.c @@ -0,0 +1,182 @@ +#include "clusterautoconfig.h" + +#include +#include +#include +#include +#include +#include + +#include "libgfs2.h" + +#define PAGE_SIZE (4096) +#define DIV_RU(x, y) (((x) + (y) - 1) / (y)) + +int compute_heightsize(unsigned bsize, uint64_t *heightsize, + uint32_t *maxheight, uint32_t bsize1, int diptrs, int inptrs) +{ + heightsize[0] = bsize - sizeof(struct gfs2_dinode); + heightsize[1] = (uint64_t)bsize1 * diptrs; + for (*maxheight = 2;; (*maxheight)++) { + uint64_t space, d; + uint32_t m; + + space = heightsize[*maxheight - 1] * inptrs; + m = space % inptrs; + d = space / inptrs; + + if (d != heightsize[*maxheight - 1] || m) + break; + heightsize[*maxheight] = space; + } + if 
(*maxheight > GFS2_MAX_META_HEIGHT) { + errno = EINVAL; + return -1; + } + return 0; +} + +int compute_constants(struct gfs2_sbd *sdp) +{ + uint32_t hash_blocks, ind_blocks, leaf_blocks; + uint32_t tmp_blocks; + + sdp->md.next_inum = 1; + + sdp->sd_sb.sb_bsize_shift = ffs(sdp->bsize) - 1; + sdp->sd_fsb2bb_shift = sdp->sd_sb.sb_bsize_shift - + GFS2_BASIC_BLOCK_SHIFT; + sdp->sd_fsb2bb = 1 << sdp->sd_fsb2bb_shift; + sdp->sd_diptrs = (sdp->bsize - sizeof(struct gfs2_dinode)) / + sizeof(uint64_t); + sdp->sd_inptrs = (sdp->bsize - sizeof(struct gfs2_meta_header)) / + sizeof(uint64_t); + sdp->sd_jbsize = sdp->bsize - sizeof(struct gfs2_meta_header); + sdp->sd_hash_bsize = sdp->bsize / 2; + sdp->sd_hash_bsize_shift = sdp->sd_sb.sb_bsize_shift - 1; + sdp->sd_hash_ptrs = sdp->sd_hash_bsize / sizeof(uint64_t); + sdp->sd_blocks_per_bitmap = (sdp->sd_sb.sb_bsize - sizeof(struct gfs2_meta_header)) + * GFS2_NBBY; + + /* Compute maximum reservation required to add a entry to a directory */ + + hash_blocks = DIV_RU(sizeof(uint64_t) * (1 << GFS2_DIR_MAX_DEPTH), + sdp->sd_jbsize); + + ind_blocks = 0; + for (tmp_blocks = hash_blocks; tmp_blocks > sdp->sd_diptrs;) { + tmp_blocks = DIV_RU(tmp_blocks, sdp->sd_inptrs); + ind_blocks += tmp_blocks; + } + + leaf_blocks = 2 + GFS2_DIR_MAX_DEPTH; + + sdp->sd_max_dirres = hash_blocks + ind_blocks + leaf_blocks; + + if (compute_heightsize(sdp->bsize, sdp->sd_heightsize, &sdp->sd_max_height, + sdp->bsize, sdp->sd_diptrs, sdp->sd_inptrs)) { + return -1; + } + if (compute_heightsize(sdp->bsize, sdp->sd_jheightsize, &sdp->sd_max_jheight, + sdp->sd_jbsize, sdp->sd_diptrs, sdp->sd_inptrs)) { + return -1; + } + return 0; +} + +/* Returns 0 if fd1 and fd2 refer to the same device/file, 1 otherwise, or -1 on error */ +static int fdcmp(int fd1, int fd2) +{ + struct stat st1, st2; + + if (fd1 < 0 || fd2 < 0) + return -1; + if ((fstat(fd1, &st1) != 0) || (fstat(fd2, &st2) != 0)) + return -1; + if (S_ISBLK(st1.st_mode) && S_ISBLK(st2.st_mode)) { + if 
(st1.st_rdev == st2.st_rdev) { + return 0; + } + } else if ((st1.st_dev == st2.st_dev) && (st1.st_ino == st2.st_ino)) { + return 0; + } + return 1; +} + +int lgfs2_open_mnt(const char *path, int dirflags, int *dirfd, int devflags, int *devfd, struct mntent **mnt) +{ + FILE *fp = setmntent("/proc/mounts", "r"); + if (fp == NULL) { + perror("open: /proc/mounts"); + return 1; + } + /* Assume path is mount point until we know better. */ + *dirfd = open(path, dirflags); + if (*dirfd < 0) + return 1; + + while ((*mnt = getmntent(fp)) != NULL) { + int fd; + if (strcmp((*mnt)->mnt_type, "gfs2") != 0) + continue; + *devfd = open((*mnt)->mnt_fsname, devflags); + /* Defer checking *devfd until later: whether it's ok to ignore + * the error depends on whether we find the mount point. */ + + if (strcmp(path, (*mnt)->mnt_dir) == 0) + break; + if (strcmp(path, (*mnt)->mnt_fsname) == 0 || fdcmp(*dirfd, *devfd) == 0) { + /* We have a match but our above assumption was + incorrect and *dirfd is actually the device. */ + close(*dirfd); + *dirfd = open((*mnt)->mnt_dir, dirflags); + break; + } + + fd = open((*mnt)->mnt_dir, dirflags); + if (fd >= 0) { + int diff = fdcmp(*dirfd, fd); + close(fd); + if (diff == 0) + break; + } + if (*devfd >= 0) + close(*devfd); + } + endmntent(fp); + if (*mnt == NULL) { + close(*dirfd); + return 0; /* Success. Answer is no. Both fds closed. */ + } + if (*dirfd < 0) { + close(*devfd); + return 1; + } + if (*devfd < 0) { + close(*dirfd); + return 1; + } + return 0; /* Success. Answer is yes. Both fds open. 
*/ +} + +int lgfs2_open_mnt_dev(const char *path, int flags, struct mntent **mnt) +{ + int dirfd = -1; + int devfd = -1; + if (lgfs2_open_mnt(path, O_RDONLY, &dirfd, flags, &devfd, mnt) != 0) + return -1; + if (*mnt != NULL) + close(dirfd); + return devfd; +} + +int lgfs2_open_mnt_dir(const char *path, int flags, struct mntent **mnt) +{ + int dirfd = -1; + int devfd = -1; + if (lgfs2_open_mnt(path, flags, &dirfd, O_RDONLY, &devfd, mnt) != 0) + return -1; + if (*mnt != NULL) + close(devfd); + return dirfd; +} diff --git a/gfs2/libgfs2/ondisk.c b/gfs2/libgfs2/ondisk.c new file mode 100644 index 0000000..648a914 --- /dev/null +++ b/gfs2/libgfs2/ondisk.c @@ -0,0 +1,715 @@ +#include "clusterautoconfig.h" + +#include +#include +#include +#include +#include +#include +#include +#include "libgfs2.h" +#ifdef GFS2_HAS_UUID +#include +#endif + +#define pv(struct, member, fmt, fmt2) do { \ + print_it(" "#member, fmt, fmt2, struct->member); \ + } while (FALSE); +#define pv2(struct, member, fmt, fmt2) do { \ + print_it(" ", fmt, fmt2, struct->member); \ + } while (FALSE); + + +#define CPIN_08(s1, s2, member, count) {memcpy((s1->member), (s2->member), (count));} +#define CPOUT_08(s1, s2, member, count) {memcpy((s2->member), (s1->member), (count));} +#define CPIN_16(s1, s2, member) {(s1->member) = be16_to_cpu((s2->member));} +#define CPOUT_16(s1, s2, member) {(s2->member) = cpu_to_be16((s1->member));} +#define CPIN_32(s1, s2, member) {(s1->member) = be32_to_cpu((s2->member));} +#define CPOUT_32(s1, s2, member) {(s2->member) = cpu_to_be32((s1->member));} +#define CPIN_64(s1, s2, member) {(s1->member) = be64_to_cpu((s2->member));} +#define CPOUT_64(s1, s2, member) {(s2->member) = cpu_to_be64((s1->member));} + +/* + * gfs2_xxx_in - read in an xxx struct + * first arg: the cpu-order structure + * buf: the disk-order block data + * + * gfs2_xxx_out - write out an xxx struct + * first arg: the cpu-order structure + * buf: the disk-order block data + * + * gfs2_xxx_print - print out an 
xxx struct + * first arg: the cpu-order structure + */ + +void gfs2_inum_in(struct gfs2_inum *no, char *buf) +{ + struct gfs2_inum *str = (struct gfs2_inum *)buf; + + CPIN_64(no, str, no_formal_ino); + CPIN_64(no, str, no_addr); +} + +void gfs2_inum_out(const struct gfs2_inum *no, char *buf) +{ + struct gfs2_inum *str = (struct gfs2_inum *)buf; + + CPOUT_64(no, str, no_formal_ino); + CPOUT_64(no, str, no_addr); +} + +void gfs2_inum_print(const struct gfs2_inum *no) +{ + pv(no, no_formal_ino, "%llu", "0x%llx"); + pv(no, no_addr, "%llu", "0x%llx"); +} + +void gfs2_meta_header_in(struct gfs2_meta_header *mh, char *buf) +{ + struct gfs2_meta_header *str = (struct gfs2_meta_header *)buf; + + CPIN_32(mh, str, mh_magic); + CPIN_32(mh, str, mh_type); + CPIN_32(mh, str, mh_format); +} + +void gfs2_meta_header_out(const struct gfs2_meta_header *mh, char *buf) +{ + struct gfs2_meta_header *str = (struct gfs2_meta_header *)buf; + + CPOUT_32(mh, str, mh_magic); + CPOUT_32(mh, str, mh_type); + CPOUT_32(mh, str, mh_format); + str->__pad0 = 0; + str->__pad1 = 0; +} + +void gfs2_meta_header_print(const struct gfs2_meta_header *mh) +{ + pv(mh, mh_magic, "0x%08X", NULL); + pv(mh, mh_type, "%u", "0x%x"); + pv(mh, mh_format, "%u", "0x%x"); +} + +void gfs2_sb_in(struct gfs2_sb *sb, char *buf) +{ + struct gfs2_sb *str = (struct gfs2_sb *)buf; + + gfs2_meta_header_in(&sb->sb_header, buf); + + CPIN_32(sb, str, sb_fs_format); + CPIN_32(sb, str, sb_multihost_format); + CPIN_32(sb, str, __pad0); /* gfs sb_flags */ + + CPIN_32(sb, str, sb_bsize); + CPIN_32(sb, str, sb_bsize_shift); + CPIN_32(sb, str, __pad1); /* gfs sb_seg_size */ + + gfs2_inum_in(&sb->sb_master_dir, (char *)&str->sb_master_dir); + gfs2_inum_in(&sb->sb_root_dir, (char *)&str->sb_root_dir); + + CPIN_08(sb, str, sb_lockproto, GFS2_LOCKNAME_LEN); + CPIN_08(sb, str, sb_locktable, GFS2_LOCKNAME_LEN); + gfs2_inum_in(&sb->__pad2, (char *)&str->__pad2); /* gfs rindex */ + gfs2_inum_in(&sb->__pad3, (char *)&str->__pad3); /* gfs quota 
*/ + gfs2_inum_in(&sb->__pad4, (char *)&str->__pad4); /* gfs license */ +#ifdef GFS2_HAS_UUID + CPIN_08(sb, str, sb_uuid, sizeof(sb->sb_uuid)); +#endif +} + +void gfs2_sb_out(const struct gfs2_sb *sb, char *buf) +{ + struct gfs2_sb *str = (struct gfs2_sb *)buf; + + gfs2_meta_header_out(&sb->sb_header, buf); + + CPOUT_32(sb, str, sb_fs_format); + CPOUT_32(sb, str, sb_multihost_format); + CPOUT_32(sb, str, __pad0); /* gfs sb_flags */ + + CPOUT_32(sb, str, sb_bsize); + CPOUT_32(sb, str, sb_bsize_shift); + CPOUT_32(sb, str, __pad1); /* gfs sb_seg_size */ + + gfs2_inum_out(&sb->sb_master_dir, (char *)&str->sb_master_dir); + gfs2_inum_out(&sb->sb_root_dir, (char *)&str->sb_root_dir); + + CPOUT_08(sb, str, sb_lockproto, GFS2_LOCKNAME_LEN); + CPOUT_08(sb, str, sb_locktable, GFS2_LOCKNAME_LEN); + gfs2_inum_out(&sb->__pad2, (char *)&str->__pad2); /* gfs rindex */ + gfs2_inum_out(&sb->__pad3, (char *)&str->__pad3); /* gfs quota */ + gfs2_inum_out(&sb->__pad4, (char *)&str->__pad4); /* gfs license */ +#ifdef GFS2_HAS_UUID + memcpy(str->sb_uuid, sb->sb_uuid, 16); +#endif +} + +void gfs2_sb_print(const struct gfs2_sb *sb) +{ + gfs2_meta_header_print(&sb->sb_header); + + pv(sb, sb_fs_format, "%u", "0x%x"); + pv(sb, sb_multihost_format, "%u", "0x%x"); + + pv(sb, sb_bsize, "%u", "0x%x"); + pv(sb, sb_bsize_shift, "%u", "0x%x"); + + gfs2_inum_print(&sb->sb_master_dir); + gfs2_inum_print(&sb->sb_root_dir); + + pv(sb, sb_lockproto, "%s", NULL); + pv(sb, sb_locktable, "%s", NULL); + +#ifdef GFS2_HAS_UUID + { + char readable_uuid[36+1]; + + uuid_unparse(sb->sb_uuid, readable_uuid); + print_it(" uuid", "%36s", NULL, readable_uuid); + } +#endif +} + +void gfs2_rindex_in(struct gfs2_rindex *ri, char *buf) +{ + struct gfs2_rindex *str = (struct gfs2_rindex *)buf; + + CPIN_64(ri, str, ri_addr); + CPIN_32(ri, str, ri_length); + CPIN_32(ri, str, __pad); + CPIN_64(ri, str, ri_data0); + CPIN_32(ri, str, ri_data); + CPIN_32(ri, str, ri_bitbytes); + CPIN_08(ri, str, ri_reserved, 
sizeof(ri->ri_reserved)); +} + +void gfs2_rindex_out(const struct gfs2_rindex *ri, char *buf) +{ + struct gfs2_rindex *str = (struct gfs2_rindex *)buf; + + CPOUT_64(ri, str, ri_addr); + CPOUT_32(ri, str, ri_length); + str->__pad = 0; + + CPOUT_64(ri, str, ri_data0); + CPOUT_32(ri, str, ri_data); + + CPOUT_32(ri, str, ri_bitbytes); + + CPOUT_08(ri, str, ri_reserved, sizeof(ri->ri_reserved)); +} + +void gfs2_rindex_print(const struct gfs2_rindex *ri) +{ + pv(ri, ri_addr, "%llu", "0x%llx"); + pv(ri, ri_length, "%u", "0x%x"); + + pv(ri, ri_data0, "%llu", "0x%llx"); + pv(ri, ri_data, "%u", "0x%x"); + + pv(ri, ri_bitbytes, "%u", "0x%x"); +} + +void gfs2_rgrp_in(struct gfs2_rgrp *rg, char *buf) +{ + struct gfs2_rgrp *str = (struct gfs2_rgrp *)buf; + + gfs2_meta_header_in(&rg->rg_header, buf); + CPIN_32(rg, str, rg_flags); + CPIN_32(rg, str, rg_free); + CPIN_32(rg, str, rg_dinodes); +#ifdef GFS2_HAS_RG_SKIP + CPIN_32(rg, str, rg_skip); +#else + CPIN_32(rg, str, __pad); +#endif + CPIN_64(rg, str, rg_igeneration); +#ifdef GFS2_HAS_RG_RI_FIELDS + CPIN_64(rg, str, rg_data0); + CPIN_32(rg, str, rg_data); + CPIN_32(rg, str, rg_bitbytes); + CPIN_32(rg, str, rg_crc); +#endif + CPIN_08(rg, str, rg_reserved, sizeof(rg->rg_reserved)); +} + +void gfs2_rgrp_out(const struct gfs2_rgrp *rg, char *buf) +{ + struct gfs2_rgrp *str = (struct gfs2_rgrp *)buf; + + gfs2_meta_header_out(&rg->rg_header, buf); + CPOUT_32(rg, str, rg_flags); + CPOUT_32(rg, str, rg_free); + CPOUT_32(rg, str, rg_dinodes); +#ifdef GFS2_HAS_RG_SKIP + CPOUT_32(rg, str, rg_skip); +#else + CPOUT_32(rg, str, __pad); +#endif + CPOUT_64(rg, str, rg_igeneration); +#ifdef GFS2_HAS_RG_RI_FIELDS + CPOUT_64(rg, str, rg_data0); + CPOUT_32(rg, str, rg_data); + CPOUT_32(rg, str, rg_bitbytes); + CPOUT_08(rg, str, rg_reserved, sizeof(rg->rg_reserved)); + lgfs2_rgrp_crc_set(buf); +#else + CPOUT_08(rg, str, rg_reserved, sizeof(rg->rg_reserved)); +#endif +} + +void gfs2_rgrp_print(const struct gfs2_rgrp *rg) +{ + 
gfs2_meta_header_print(&rg->rg_header); + pv(rg, rg_flags, "%u", "0x%x"); + pv(rg, rg_free, "%u", "0x%x"); + pv(rg, rg_dinodes, "%u", "0x%x"); +#ifdef GFS2_HAS_RG_SKIP + pv(rg, rg_skip, "%u", "0x%x"); +#else + pv(rg, __pad, "%u", "0x%x"); +#endif + pv(rg, rg_igeneration, "%llu", "0x%llx"); +#ifdef GFS2_HAS_RG_RI_FIELDS + pv(rg, rg_data0, "%llu", "0x%llx"); + pv(rg, rg_data, "%u", "0x%x"); + pv(rg, rg_bitbytes, "%u", "0x%x"); + pv(rg, rg_crc, "%u", "0x%x"); +#endif +} + +void gfs2_quota_in(struct gfs2_quota *qu, char *buf) +{ + struct gfs2_quota *str = (struct gfs2_quota *)buf; + + CPIN_64(qu, str, qu_limit); + CPIN_64(qu, str, qu_warn); + CPIN_64(qu, str, qu_value); + CPIN_08(qu, str, qu_reserved, sizeof(qu->qu_reserved)); +} + +void gfs2_quota_out(struct gfs2_quota *qu, char *buf) +{ + struct gfs2_quota *str = (struct gfs2_quota *)buf; + + CPOUT_64(qu, str, qu_limit); + CPOUT_64(qu, str, qu_warn); + CPOUT_64(qu, str, qu_value); + memset(qu->qu_reserved, 0, sizeof(qu->qu_reserved)); +} + +void gfs2_quota_print(const struct gfs2_quota *qu) +{ + pv(qu, qu_limit, "%llu", "0x%llx"); + pv(qu, qu_warn, "%llu", "0x%llx"); + pv(qu, qu_value, "%lld", "0x%llx"); +} + +void gfs2_dinode_in(struct gfs2_dinode *di, char *buf) +{ + struct gfs2_dinode *str = (struct gfs2_dinode *)buf; + + gfs2_meta_header_in(&di->di_header, buf); + gfs2_inum_in(&di->di_num, (char *)&str->di_num); + + CPIN_32(di, str, di_mode); + CPIN_32(di, str, di_uid); + CPIN_32(di, str, di_gid); + CPIN_32(di, str, di_nlink); + CPIN_64(di, str, di_size); + CPIN_64(di, str, di_blocks); + CPIN_64(di, str, di_atime); + CPIN_64(di, str, di_mtime); + CPIN_64(di, str, di_ctime); + CPIN_32(di, str, di_major); + CPIN_32(di, str, di_minor); + + CPIN_64(di, str, di_goal_meta); + CPIN_64(di, str, di_goal_data); + + CPIN_32(di, str, di_flags); + CPIN_32(di, str, di_payload_format); + CPIN_16(di, str, __pad1); + CPIN_16(di, str, di_height); + + CPIN_16(di, str, di_depth); + CPIN_32(di, str, di_entries); + + CPIN_64(di, str, 
di_eattr);
+
+	CPIN_08(di, str, di_reserved, 32);
+}
+
+void gfs2_dinode_out(struct gfs2_dinode *di, char *buf)
+{
+	struct gfs2_dinode *str = (struct gfs2_dinode *)buf;
+
+	gfs2_meta_header_out(&di->di_header, buf);
+	gfs2_inum_out(&di->di_num, (char *)&str->di_num);
+
+	CPOUT_32(di, str, di_mode);
+	CPOUT_32(di, str, di_uid);
+	CPOUT_32(di, str, di_gid);
+	CPOUT_32(di, str, di_nlink);
+	CPOUT_64(di, str, di_size);
+	CPOUT_64(di, str, di_blocks);
+	CPOUT_64(di, str, di_atime);
+	CPOUT_64(di, str, di_mtime);
+	CPOUT_64(di, str, di_ctime);
+	CPOUT_32(di, str, di_major);
+	CPOUT_32(di, str, di_minor);
+
+	CPOUT_64(di, str, di_goal_meta);
+	CPOUT_64(di, str, di_goal_data);
+
+	CPOUT_32(di, str, di_flags);
+	CPOUT_32(di, str, di_payload_format);
+	CPOUT_16(di, str, __pad1);
+	CPOUT_16(di, str, di_height);
+
+	CPOUT_16(di, str, di_depth);
+	CPOUT_32(di, str, di_entries);
+
+	CPOUT_64(di, str, di_eattr);
+
+	CPOUT_08(di, str, di_reserved, 32);
+}
+
+/*
+ * gfs2_dinode_print - print out a dinode struct
+ * di: the cpu-order structure
+ *
+ * Prints the meta header and inode number, then each member through the
+ * pv() macro, which hands print_it() the member together with a primary
+ * format and, where given, a second (hex) format.
+ */
+void gfs2_dinode_print(const struct gfs2_dinode *di)
+{
+	gfs2_meta_header_print(&di->di_header);
+	gfs2_inum_print(&di->di_num);
+
+	pv(di, di_mode, "0%o", NULL);
+	pv(di, di_uid, "%u", "0x%x");
+	pv(di, di_gid, "%u", "0x%x");
+	pv(di, di_nlink, "%u", "0x%x");
+	pv(di, di_size, "%llu", "0x%llx");
+	pv(di, di_blocks, "%llu", "0x%llx");
+	pv(di, di_atime, "%lld", "0x%llx");
+	pv(di, di_mtime, "%lld", "0x%llx");
+	pv(di, di_ctime, "%lld", "0x%llx");
+	/* di_major/di_minor are 32-bit (gfs2_dinode_in/out convert them with
+	 * the 32-bit helpers), so the hex format must be "0x%x" like every
+	 * other "%u" member; "0x%llx" asked for a 64-bit argument. */
+	pv(di, di_major, "%u", "0x%x");
+	pv(di, di_minor, "%u", "0x%x");
+
+	pv(di, di_goal_meta, "%llu", "0x%llx");
+	pv(di, di_goal_data, "%llu", "0x%llx");
+
+	pv(di, di_flags, "0x%.8X", NULL);
+	pv(di, di_payload_format, "%u", "0x%x");
+	pv(di, di_height, "%u", "0x%x");
+
+	pv(di, di_depth, "%u", "0x%x");
+	pv(di, di_entries, "%u", "0x%x");
+
+	pv(di, di_eattr, "%llu", "0x%llx");
+}
+
+void gfs2_dirent_in(struct gfs2_dirent *de, char *buf)
+{
+	struct gfs2_dirent *str = (struct gfs2_dirent *)buf;
+
+	gfs2_inum_in(&de->de_inum, buf);
+	CPIN_32(de, str, de_hash);
+	CPIN_16(de, str,
de_rec_len); + CPIN_16(de, str, de_name_len); + CPIN_16(de, str, de_type); +#ifdef GFS2_HAS_DE_RAHEAD + CPIN_16(de, str, de_rahead); +#ifdef GFS2_HAS_DE_COOKIE + CPIN_32(de, str, de_cookie); + CPIN_08(de, str, pad3, 8); +#else + CPIN_08(de, str, pad2, 12); +#endif /* GFS2_HAS_DE_COOKIE */ +#else + CPIN_08(de, str, __pad, 14); +#endif /* GFS2_HAS_DE_RAHEAD */ +} + +/* + * Write the in-core directory entry *de into the struct gfs2_dirent overlaid + * on buf. Whichever optional members are compiled in, the trailing members + * add up to the same 14 bytes (2 + 4 + 8, 2 + 12, or 14). + */ +void gfs2_dirent_out(struct gfs2_dirent *de, char *buf) +{ + struct gfs2_dirent *str = (struct gfs2_dirent *)buf; + + gfs2_inum_out(&de->de_inum, buf); + CPOUT_32(de, str, de_hash); + CPOUT_16(de, str, de_rec_len); + CPOUT_16(de, str, de_name_len); + CPOUT_16(de, str, de_type); +#ifdef GFS2_HAS_DE_RAHEAD + CPOUT_16(de, str, de_rahead); +#ifdef GFS2_HAS_DE_COOKIE + CPOUT_32(de, str, de_cookie); + CPOUT_08(de, str, pad3, 8); +#else + CPOUT_08(de, str, pad2, 12); +#endif /* GFS2_HAS_DE_COOKIE */ +#else + CPOUT_08(de, str, __pad, 14); +#endif /* GFS2_HAS_DE_RAHEAD */ +} + +/* + * Read an on-disk leaf header from buf into *lf. + * Without GFS2_HAS_LEAF_HINTS the tail of the structure is the single + * lf_reserved array, which stands in for lf_inode (8), lf_dist (4), + * lf_nsec (4), lf_sec (8) and lf_reserved2 (40), i.e. 64 bytes. That is the + * length gfs2_leaf_out() writes, so the same 64 bytes are copied here; copying + * only 32 would drop the second half of the reserved area on a read/write + * round trip. + */ +void gfs2_leaf_in(struct gfs2_leaf *lf, char *buf) +{ + struct gfs2_leaf *str = (struct gfs2_leaf *)buf; + + gfs2_meta_header_in(&lf->lf_header, buf); + CPIN_16(lf, str, lf_depth); + CPIN_16(lf, str, lf_entries); + CPIN_32(lf, str, lf_dirent_format); + CPIN_64(lf, str, lf_next); +#ifdef GFS2_HAS_LEAF_HINTS + CPIN_64(lf, str, lf_inode); + CPIN_32(lf, str, lf_dist); + CPIN_32(lf, str, lf_nsec); + CPIN_64(lf, str, lf_sec); + CPIN_08(lf, str, lf_reserved2, 40); +#else + CPIN_08(lf, str, lf_reserved, 64); +#endif +} + +/* + * Write the in-core leaf header *lf into the struct gfs2_leaf overlaid on buf. + * Mirrors gfs2_leaf_in() member for member, including the 64-byte tail. + */ +void gfs2_leaf_out(struct gfs2_leaf *lf, char *buf) +{ + struct gfs2_leaf *str = (struct gfs2_leaf *)buf; + + gfs2_meta_header_out(&lf->lf_header, buf); + CPOUT_16(lf, str, lf_depth); + CPOUT_16(lf, str, lf_entries); + CPOUT_32(lf, str, lf_dirent_format); + CPOUT_64(lf, str, lf_next); +#ifdef GFS2_HAS_LEAF_HINTS + CPOUT_64(lf, str, lf_inode); + CPOUT_32(lf, str, lf_dist); + CPOUT_32(lf, str, lf_nsec); + CPOUT_64(lf, str, lf_sec); + CPOUT_08(lf, str, lf_reserved2, 40); +#else + CPOUT_08(lf, str, lf_reserved, 64); +#endif +} + +void 
gfs2_leaf_print(const struct gfs2_leaf *lf) +{ + gfs2_meta_header_print(&lf->lf_header); + pv(lf, lf_depth, "%u", "0x%x"); + pv(lf, lf_entries, "%u", "0x%x"); + pv(lf, lf_dirent_format, "%u", "0x%x"); + pv(lf, lf_next, "%llu", "0x%llx"); +#ifdef GFS2_HAS_LEAF_HINTS + pv(lf, lf_inode, "%llu", "0x%llx"); + pv(lf, lf_dist, "%u", "0x%x"); + pv(lf, lf_nsec, "%u", "0x%x"); + pv(lf, lf_sec, "%llu", "0x%llx"); +#endif +} + +void gfs2_ea_header_in(struct gfs2_ea_header *ea, char *buf) +{ + struct gfs2_ea_header *str = (struct gfs2_ea_header *)buf; + + CPIN_32(ea, str, ea_rec_len); + CPIN_32(ea, str, ea_data_len); + ea->ea_name_len = str->ea_name_len; + ea->ea_type = str->ea_type; + ea->ea_flags = str->ea_flags; + ea->ea_num_ptrs = str->ea_num_ptrs; +} + +void gfs2_ea_header_print(const struct gfs2_ea_header *ea, char *name) +{ + char buf[GFS2_EA_MAX_NAME_LEN + 1]; + + pv(ea, ea_rec_len, "%u", "0x%x"); + pv(ea, ea_data_len, "%u", "0x%x"); + pv(ea, ea_name_len, "%u", "0x%x"); + pv(ea, ea_type, "%u", "0x%x"); + pv(ea, ea_flags, "%u", "0x%x"); + pv(ea, ea_num_ptrs, "%u", "0x%x"); + + memset(buf, 0, GFS2_EA_MAX_NAME_LEN + 1); + memcpy(buf, name, ea->ea_name_len); + print_it(" name", "%s", NULL, buf); +} + +void gfs2_log_header_v1_in(struct gfs2_log_header *lh, char *buf) +{ + struct gfs2_log_header *str = (struct gfs2_log_header *)buf; + + gfs2_meta_header_in(&lh->lh_header, buf); + CPIN_64(lh, str, lh_sequence); + CPIN_32(lh, str, lh_flags); + CPIN_32(lh, str, lh_tail); + CPIN_32(lh, str, lh_blkno); + CPIN_32(lh, str, lh_hash); +} + +void gfs2_log_header_in(struct gfs2_log_header *lh, char *buf) +{ + gfs2_log_header_v1_in(lh, buf); +#ifdef GFS2_HAS_LH_V2 + { + struct gfs2_log_header *str = (struct gfs2_log_header *)buf; + + CPIN_32(lh, str, lh_crc); + CPIN_32(lh, str, lh_nsec); + CPIN_64(lh, str, lh_sec); + CPIN_64(lh, str, lh_addr); + CPIN_64(lh, str, lh_jinode); + CPIN_64(lh, str, lh_statfs_addr); + CPIN_64(lh, str, lh_quota_addr); + CPIN_64(lh, str, lh_local_total); + 
CPIN_64(lh, str, lh_local_free); + CPIN_64(lh, str, lh_local_dinodes); + } +#endif +} + +void gfs2_log_header_v1_out(struct gfs2_log_header *lh, char *buf) +{ + struct gfs2_log_header *str = (struct gfs2_log_header *)buf; + + gfs2_meta_header_out(&lh->lh_header, buf); + CPOUT_64(lh, str, lh_sequence); + CPOUT_32(lh, str, lh_flags); + CPOUT_32(lh, str, lh_tail); + CPOUT_32(lh, str, lh_blkno); + CPOUT_32(lh, str, lh_hash); +} + +void gfs2_log_header_out(struct gfs2_log_header *lh, char *buf) +{ + gfs2_log_header_v1_out(lh, buf); +#ifdef GFS2_HAS_LH_V2 + { + struct gfs2_log_header *str = (struct gfs2_log_header *)buf; + + CPOUT_32(lh, str, lh_crc); + CPOUT_32(lh, str, lh_nsec); + CPOUT_64(lh, str, lh_sec); + CPOUT_64(lh, str, lh_addr); + CPOUT_64(lh, str, lh_jinode); + CPOUT_64(lh, str, lh_statfs_addr); + CPOUT_64(lh, str, lh_quota_addr); + CPOUT_64(lh, str, lh_local_total); + CPOUT_64(lh, str, lh_local_free); + CPOUT_64(lh, str, lh_local_dinodes); + } +#endif +} + +void gfs2_log_header_v1_print(const struct gfs2_log_header *lh) +{ + gfs2_meta_header_print(&lh->lh_header); + pv(lh, lh_sequence, "%llu", "0x%llx"); + pv(lh, lh_flags, "0x%.8X", NULL); + pv(lh, lh_tail, "%u", "0x%x"); + pv(lh, lh_blkno, "%u", "0x%x"); + pv(lh, lh_hash, "0x%.8X", NULL); +} + +void gfs2_log_header_print(const struct gfs2_log_header *lh) +{ + gfs2_log_header_v1_print(lh); +#ifdef GFS2_HAS_LH_V2 + pv(lh, lh_crc, "0x%.8X", NULL); + pv(lh, lh_nsec, "%u", "0x%x"); + pv(lh, lh_sec, "%llu", "0x%llx"); + pv(lh, lh_addr, "%llu", "0x%llx"); + pv(lh, lh_jinode, "%llu", "0x%llx"); + pv(lh, lh_statfs_addr, "%llu", "0x%llx"); + pv(lh, lh_quota_addr, "%llu", "0x%llx"); + pv(lh, lh_local_total, "%lld", "0x%llx"); + pv(lh, lh_local_free, "%lld", "0x%llx"); + pv(lh, lh_local_dinodes, "%lld", "0x%llx"); +#endif +} + +void gfs2_log_descriptor_in(struct gfs2_log_descriptor *ld, char *buf) +{ + struct gfs2_log_descriptor *str = (struct gfs2_log_descriptor *)buf; + + gfs2_meta_header_in(&ld->ld_header, buf); + 
CPIN_32(ld, str, ld_type); + CPIN_32(ld, str, ld_length); + CPIN_32(ld, str, ld_data1); + CPIN_32(ld, str, ld_data2); + + CPIN_08(ld, str, ld_reserved, 32); +} + +void gfs2_log_descriptor_out(struct gfs2_log_descriptor *ld, char *buf) +{ + struct gfs2_log_descriptor *str = (struct gfs2_log_descriptor *)buf; + + gfs2_meta_header_out(&ld->ld_header, buf); + CPOUT_32(ld, str, ld_type); + CPOUT_32(ld, str, ld_length); + CPOUT_32(ld, str, ld_data1); + CPOUT_32(ld, str, ld_data2); + + CPOUT_08(ld, str, ld_reserved, 32); +} + +void gfs2_log_descriptor_print(const struct gfs2_log_descriptor *ld) +{ + gfs2_meta_header_print(&ld->ld_header); + pv(ld, ld_type, "%u", "0x%x"); + pv(ld, ld_length, "%u", "0x%x"); + pv(ld, ld_data1, "%u", "0x%x"); + pv(ld, ld_data2, "%u", "0x%x"); +} + +void gfs2_statfs_change_in(struct gfs2_statfs_change *sc, char *buf) +{ + struct gfs2_statfs_change *str = (struct gfs2_statfs_change *)buf; + + CPIN_64(sc, str, sc_total); + CPIN_64(sc, str, sc_free); + CPIN_64(sc, str, sc_dinodes); +} + +void gfs2_statfs_change_out(struct gfs2_statfs_change *sc, char *buf) +{ + struct gfs2_statfs_change *str = (struct gfs2_statfs_change *)buf; + + CPOUT_64(sc, str, sc_total); + CPOUT_64(sc, str, sc_free); + CPOUT_64(sc, str, sc_dinodes); +} + +void gfs2_statfs_change_print(const struct gfs2_statfs_change *sc) +{ + pv(sc, sc_total, "%lld", "0x%llx"); + pv(sc, sc_free, "%lld", "0x%llx"); + pv(sc, sc_dinodes, "%lld", "0x%llx"); +} + +void gfs2_quota_change_in(struct gfs2_quota_change *qc, char *buf) +{ + struct gfs2_quota_change *str = (struct gfs2_quota_change *)(buf + + sizeof(struct gfs2_meta_header)); + + CPIN_64(qc, str, qc_change); + CPIN_32(qc, str, qc_flags); + CPIN_32(qc, str, qc_id); +} + +void gfs2_quota_change_out(struct gfs2_quota_change *qc, char *buf) +{ + struct gfs2_quota_change *str = (struct gfs2_quota_change *)(buf + + sizeof(struct gfs2_meta_header)); + + CPOUT_64(qc, str, qc_change); + CPOUT_32(qc, str, qc_flags); + CPOUT_32(qc, str, qc_id); +} 
+ +void gfs2_quota_change_print(const struct gfs2_quota_change *qc) +{ + pv(qc, qc_change, "%lld", "0x%llx"); + pv(qc, qc_flags, "0x%.8X", NULL); + pv(qc, qc_id, "%u", "0x%x"); +} + diff --git a/gfs2/libgfs2/parser.y b/gfs2/libgfs2/parser.y new file mode 100644 index 0000000..521c5df --- /dev/null +++ b/gfs2/libgfs2/parser.y @@ -0,0 +1,202 @@ +%code requires { +/* Required to break a circular dependency introduced with bison 2.6 */ +typedef void* yyscan_t; +} +%code top { +#include +#include "lang.h" +#include "lexer.h" + +static int yyerror(struct lgfs2_lang_state *state, yyscan_t lexer, const char *errorstr) +{ + fprintf(stderr, "%d:%d: %s\n", state->ls_linenum, state->ls_colnum, errorstr); + return 1; +} + +} +%defines +%debug +%define api.pure +%parse-param { struct lgfs2_lang_state *state } +%parse-param { yyscan_t lexer } +%lex-param { yyscan_t lexer } +%start script +%token TOK_COLON +%token TOK_COMMA +%token TOK_ID +%token TOK_LBRACE +%token TOK_LBRACKET +%token TOK_NUMBER +%token TOK_OFFSET +%token TOK_RBRACE +%token TOK_RBRACKET +%token TOK_SEMI +%token TOK_SET +%token TOK_GET +%token TOK_STATE +%token TOK_STRING +%token TOK_PATH +%% +script: statements { + state->ls_ast_root = $1; + state->ls_interp_curr = $1; + } + | statements TOK_SEMI { + state->ls_ast_root = $1; + state->ls_interp_curr = $1; + } +; +statements: statements TOK_SEMI statement { + state->ls_ast_tail->ast_left = $3; + state->ls_ast_tail = $3; + $$ = $1; + } + | statement { + if (state->ls_ast_tail == NULL) + state->ls_ast_tail = $1; + $$ = $1; + } +; +statement: set_stmt { $$ = $1;} + | get_stmt { $$ = $1; } +; +set_stmt: TOK_SET blockspec structspec { + $1->ast_right = $2; + $2->ast_right = $3; + $$ = $1; + } + | TOK_SET blockspec typespec structspec { + $1->ast_right = $2; + $2->ast_right = $3; + $3->ast_right = $4; + $$ = $1; + } +; +get_stmt: TOK_GET blockspec { + $1->ast_right = $2; $$ = $1; + } + | TOK_GET blockspec TOK_STATE { + $1->ast_right = $2; + $2->ast_right = $3; + $$ = $1; 
+ } +; +blockspec: offset { $$ = $1; } + | address { $$ = $1; } + | path { $$ = $1; } + | block_literal { $$ = $1; } + | subscript { $$ = $1; } +; +offset: blockspec TOK_OFFSET { + $2->ast_left = $1; + $$ = $2; + } +; +typespec: identifier { + $1->ast_type = AST_EX_TYPESPEC; + $$ = $1; + } +; +block_literal: identifier { $$ = $1; } +; +subscript: block_literal TOK_LBRACKET index TOK_RBRACKET { + $4->ast_left = $1; + $1->ast_left = $3; + $$ = $4; + } +; +index: number { $$ = $1; } + | identifier { $$ = $1; } +; +address: number { + $1->ast_type = AST_EX_ADDRESS; + $$ = $1; + } +; +structspec: TOK_LBRACE fieldspecs TOK_RBRACE { $$ = $2; } + | TOK_LBRACE TOK_RBRACE { $$ = NULL; } +; +fieldspecs: fieldspecs TOK_COMMA fieldspec { + $1->ast_left = $3; + $$ = $1; + } + | fieldspec { $$ = $1; } +; +fieldspec: identifier TOK_COLON fieldvalue { + $2->ast_right = $1; + $1->ast_right = $3; + $$ = $2; + } +; +fieldvalue: number { $$ = $1; } + | string { $$ = $1; } +; +number: TOK_NUMBER { $$ = $1; } +string: TOK_STRING { $$ = $1; } +identifier: TOK_ID { $$ = $1; } +path: TOK_PATH { $$ = $1; } +%% + +/** + * Allocate and initialize a new parse state structure. The caller must free the + * memory returned by this function. 
+ */ +struct lgfs2_lang_state *lgfs2_lang_init(void) +{ + struct lgfs2_lang_state *state; + state = calloc(1, sizeof(struct lgfs2_lang_state)); + if (state == NULL) { + return NULL; + } + state->ls_linenum = 1; + return state; +} + +void lgfs2_lang_free(struct lgfs2_lang_state **state) +{ + ast_destroy(&(*state)->ls_ast_root); + free(*state); + *state = NULL; +} + +int lgfs2_lang_parsef(struct lgfs2_lang_state *state, FILE *src) +{ + int ret = 0; + yyscan_t lexer; + + ret = yylex_init_extra(state, &lexer); + if (ret != 0) { + fprintf(stderr, "Failed to initialize lexer.\n"); + return ret; + } + + yyset_in(src, lexer); + ret = yyparse(state, lexer); + yylex_destroy(lexer); + return ret; +} + +int lgfs2_lang_parses(struct lgfs2_lang_state *state, const char *cstr) +{ + int ret; + FILE *src; + char *str = strdup(cstr); + + if (str == NULL) { + perror("Failed to duplicate source string"); + return 1; + } + src = fmemopen(str, strlen(str), "r"); + if (src == NULL) { + perror("Failed to open string as source file"); + free(str); + return 1; + } + ret = lgfs2_lang_parsef(state, src); + fclose(src); + free(str); + if (ret != 0 || state->ls_errnum != 0) { + return 1; + } + return 0; +} diff --git a/gfs2/libgfs2/recovery.c b/gfs2/libgfs2/recovery.c new file mode 100644 index 0000000..6b14bf9 --- /dev/null +++ b/gfs2/libgfs2/recovery.c @@ -0,0 +1,251 @@ +#include "clusterautoconfig.h" + +/* + * NOTE: + * + * This code was pilfered from the gfs2 kernel and adapted to userland. + * If you change this part, you should evaluate whether the upstream kernel + * version of recovery.c should be changed as well. Likewise, if the + * upstream version changes, this part should be kept in sync. 
+ * + */ + +#include +#include +#include "libgfs2.h" + +void gfs2_replay_incr_blk(struct gfs2_inode *ip, unsigned int *blk) +{ + uint32_t jd_blocks = ip->i_di.di_size / ip->i_sbd->sd_sb.sb_bsize; + + if (++*blk == jd_blocks) + *blk = 0; +} + +int gfs2_replay_read_block(struct gfs2_inode *ip, unsigned int blk, + struct gfs2_buffer_head **bh) +{ + int new = 0; + uint64_t dblock; + + block_map(ip, blk, &new, &dblock, NULL, FALSE); + if (!dblock) + return -EIO; + + *bh = bread(ip->i_sbd, dblock); + return 0; +} + +/** + * get_log_header - read the log header for a given segment + * @ip: the journal incore inode + * @blk: the block to look at + * @head: the log header to return (only written on success) + * + * Read the log header for a given segment in a given journal. Do a few + * sanity checks on it: the stored block number must equal @blk, the stored + * hash must match the one recomputed with lh_hash zeroed, and (in + * GFS2_HAS_LH_V2 builds) a non-zero stored crc must match the recomputed crc. + * + * Returns: 0 on success, + * 1 if the header was invalid or incomplete, + * a negative errno (-EIO) if the block could not be mapped + */ + +int get_log_header(struct gfs2_inode *ip, unsigned int blk, + struct gfs2_log_header *head) +{ + struct gfs2_buffer_head *bh; + struct gfs2_log_header lh, *tmp; + uint32_t hash, saved_hash; + uint32_t lh_crc = 0; + uint32_t crc; + int error; + + error = gfs2_replay_read_block(ip, blk, &bh); + if (error) + return error; + + tmp = (struct gfs2_log_header *)bh->b_data; + saved_hash = tmp->lh_hash; + tmp->lh_hash = 0; + hash = lgfs2_log_header_hash(bh->b_data); + tmp->lh_hash = saved_hash; + crc = lgfs2_log_header_crc(bh->b_data, ip->i_sbd->bsize); + gfs2_log_header_in(&lh, bh->b_data); + brelse(bh); +#ifdef GFS2_HAS_LH_V2 + lh_crc = lh.lh_crc; +#endif + if (error || lh.lh_blkno != blk || lh.lh_hash != hash) + return 1; + /* Don't check the crc if it's zero, as it is in pre-v2 log headers */ + if (lh_crc != 0 && lh_crc != crc) + return 1; + + *head = lh; + + return 0; +} + +/** + * find_good_lh - find a good log header + * @ip: the journal incore inode + * @blk: the segment to start searching from; updated as the search advances + * @head: the log header to fill in + * + * There is no direction argument: the search only moves forward, wrapping
to block 0 at the end of the journal. + * + * Call get_log_header() to get a log header for a segment, but if the + * segment is bad, scan forward until we find a good one. + * + * Returns: 0 on success, a negative errno from get_log_header(), or -EIO if + * the search wraps back to the starting block without finding a good header + */ +static int find_good_lh(struct gfs2_inode *ip, unsigned int *blk, struct gfs2_log_header *head) +{ + unsigned int orig_blk = *blk; + int error; + uint32_t jd_blocks = ip->i_di.di_size / ip->i_sbd->sd_sb.sb_bsize; + + for (;;) { + error = get_log_header(ip, *blk, head); + if (error <= 0) + return error; + + if (++*blk == jd_blocks) + *blk = 0; + + if (*blk == orig_blk) + return -EIO; + } +} + +/** + * jhead_scan - make sure we've found the head of the log + * @ip: the journal incore inode + * @head: on entry, a log header at or just before the head of the log; on + * return it is filled in with the log header of the head + * + * Scan forward from @head, skipping invalid headers, until a header with a + * lower sequence number than the current candidate is found. + * + * Returns: 0 on success, -EIO if a later header repeats the candidate's + * sequence number, or a negative errno from get_log_header() + */ + +static int jhead_scan(struct gfs2_inode *ip, struct gfs2_log_header *head) +{ + unsigned int blk = head->lh_blkno; + uint32_t jd_blocks = ip->i_di.di_size / ip->i_sbd->sd_sb.sb_bsize; + struct gfs2_log_header lh; + int error; + + for (;;) { + if (++blk == jd_blocks) + blk = 0; + + error = get_log_header(ip, blk, &lh); + if (error < 0) + return error; + if (error == 1) + continue; + + if (lh.lh_sequence == head->lh_sequence) + return -EIO; + if (lh.lh_sequence < head->lh_sequence) + break; + + *head = lh; + } + + return 0; +} + +/** + * gfs2_find_jhead - find the head of a log + * @ip: the journal incore inode + * @head: the log descriptor for the head of the log is returned here + * + * Do a binary search of a journal and find the valid log entry with the + * highest sequence number. (i.e.
the log head) + * + * Returns: 0 on success, otherwise the error from find_good_lh() or + * jhead_scan() + */ + +int gfs2_find_jhead(struct gfs2_inode *ip, struct gfs2_log_header *head) +{ + struct gfs2_log_header lh_1, lh_m; + uint32_t blk_1, blk_2, blk_m; + uint32_t jd_blocks = ip->i_di.di_size / ip->i_sbd->sd_sb.sb_bsize; + int error; + + blk_1 = 0; + blk_2 = jd_blocks - 1; + + for (;;) { + blk_m = (blk_1 + blk_2) / 2; + + error = find_good_lh(ip, &blk_1, &lh_1); + if (error) + return error; + + error = find_good_lh(ip, &blk_m, &lh_m); + if (error) + return error; + + if (blk_1 == blk_m || blk_m == blk_2) + break; + + if (lh_1.lh_sequence <= lh_m.lh_sequence) + blk_1 = blk_m; + else + blk_2 = blk_m; + } + + error = jhead_scan(ip, &lh_1); + if (error) + return error; + + *head = lh_1; + + return error; +} + +/** + * clean_journal - mark a dirty journal as being clean + * @ip: the journal incore inode + * @head: the head journal to start from + * + * Overwrites the block following @head with a fresh log header whose sequence + * number is @head's plus one and whose flags are GFS2_LOG_HEAD_UNMOUNT. + * + * Returns: 0 on success, -EIO if that block cannot be mapped + */ + +int clean_journal(struct gfs2_inode *ip, struct gfs2_log_header *head) +{ + unsigned int lblock; + struct gfs2_log_header *lh; + uint32_t hash; + struct gfs2_buffer_head *bh; + int new = 0; + uint64_t dblock; + + lblock = head->lh_blkno; + gfs2_replay_incr_blk(ip, &lblock); + block_map(ip, lblock, &new, &dblock, NULL, 0); + if (!dblock) + return -EIO; + + bh = bread(ip->i_sbd, dblock); + memset(bh->b_data, 0, ip->i_sbd->bsize); + + lh = (struct gfs2_log_header *)bh->b_data; + memset(lh, 0, sizeof(struct gfs2_log_header)); + lh->lh_header.mh_magic = cpu_to_be32(GFS2_MAGIC); + lh->lh_header.mh_type = cpu_to_be32(GFS2_METATYPE_LH); + lh->lh_header.mh_format = cpu_to_be32(GFS2_FORMAT_LH); + lh->lh_sequence = cpu_to_be64(head->lh_sequence + 1); + lh->lh_flags = cpu_to_be32(GFS2_LOG_HEAD_UNMOUNT); + lh->lh_blkno = cpu_to_be32(lblock); + hash = gfs2_disk_hash((const char *)lh, sizeof(struct gfs2_log_header)); + lh->lh_hash = cpu_to_be32(hash); + bmodified(bh); + brelse(bh); + + return 0; +} + diff --git a/gfs2/libgfs2/rgrp.c b/gfs2/libgfs2/rgrp.c new file 
mode 100644 index 0000000..190715e --- /dev/null +++ b/gfs2/libgfs2/rgrp.c @@ -0,0 +1,1016 @@ +#include "clusterautoconfig.h" + +#include +#include +#include +#include +#include +#include + +#include "libgfs2.h" +#include "rgrp.h" + +#define RG_SYNC_TOLERANCE 1000 +#define ROUND_UP(N, S) ((((N) + (S) - 1) / (S)) * (S)) + +static void compute_bitmaps(lgfs2_rgrp_t rg, const unsigned bsize) +{ + int x; + + rg->bits[0].bi_offset = sizeof(struct gfs2_rgrp); + rg->bits[0].bi_start = 0; + rg->bits[0].bi_len = bsize - sizeof(struct gfs2_rgrp); + + for (x = 1; x < rg->ri.ri_length; x++) { + rg->bits[x].bi_offset = sizeof(struct gfs2_meta_header); + rg->bits[x].bi_start = rg->bits[x - 1].bi_start + rg->bits[x - 1].bi_len; + rg->bits[x].bi_len = bsize - sizeof(struct gfs2_meta_header); + } + x--; + rg->bits[x].bi_len = rg->ri.ri_bitbytes - rg->bits[x].bi_start; +} + +/** + * gfs2_compute_bitstructs - Compute the bitmap sizes + * bsize: Block size + * rgd: The resource group descriptor + * Returns: 0 on success, -1 on error + */ +int gfs2_compute_bitstructs(const uint32_t bsize, struct rgrp_tree *rgd) +{ + uint32_t length = rgd->ri.ri_length; + uint32_t bytes_left; + int ownbits = 0; + + /* Max size of an rg is 2GB. A 2GB RG with (minimum) 512-byte blocks + has 4194304 blocks. We can represent 4 blocks in one bitmap byte. + Therefore, all 4194304 blocks can be represented in 1048576 bytes. + Subtract a metadata header for each 512-byte block and we get + 488 bytes of bitmap per block. Divide 1048576 by 488 and we can + be assured we should never have more than 2149 of them. 
*/ + errno = EINVAL; + if (length > 2149 || length == 0) + return -1; + + if(rgd->bits == NULL) { + rgd->bits = calloc(length, sizeof(struct gfs2_bitmap)); + if(rgd->bits == NULL) + return -1; + ownbits = 1; + } + + compute_bitmaps(rgd, bsize); + bytes_left = rgd->ri.ri_bitbytes - (rgd->bits[rgd->ri.ri_length - 1].bi_start + + rgd->bits[rgd->ri.ri_length - 1].bi_len); + errno = EINVAL; + if(bytes_left) + goto errbits; + + if((rgd->bits[length - 1].bi_start + + rgd->bits[length - 1].bi_len) * GFS2_NBBY != rgd->ri.ri_data) + goto errbits; + + return 0; +errbits: + if (ownbits) { + free(rgd->bits); + rgd->bits = NULL; + } + return -1; +} + + +/** + * gfs2_blk2rgrpd - Find resource group for a given data block number + * @sdp: The GFS superblock + * @blk: The data block number + * + * Walks the tree rooted at sdp->rgtree: goes left when @blk is below the + * node's ri_addr, right when it is at or beyond ri_data0 + ri_data, and + * otherwise returns the node. + * + * Returns: The resource group, or NULL if not found + */ +struct rgrp_tree *gfs2_blk2rgrpd(struct gfs2_sbd *sdp, uint64_t blk) +{ + struct rgrp_tree *rgd = (struct rgrp_tree *)sdp->rgtree.osi_node; + while (rgd) { + if (blk < rgd->ri.ri_addr) + rgd = (struct rgrp_tree *)rgd->node.osi_left; + else if (blk >= rgd->ri.ri_data0 + rgd->ri.ri_data) + rgd = (struct rgrp_tree *)rgd->node.osi_right; + else + return rgd; + } + return NULL; +} + +/** + * Allocate a multi-block buffer for a resource group's bitmaps. This is done + * as one chunk and should be freed using lgfs2_rgrp_bitbuf_free(). + * Returns 0 on success with the bitmap buffer allocated in the resource group, + * or non-zero on failure with errno set. 
+ */ +int lgfs2_rgrp_bitbuf_alloc(lgfs2_rgrp_t rg) +{ + struct gfs2_sbd *sdp = rg->rgrps->sdp; + struct gfs2_buffer_head *bhs; + size_t len = rg->ri.ri_length * sdp->bsize; + unsigned long io_align = sdp->bsize; + unsigned i; + char *bufs; + + if (rg->rgrps->align > 0) { + len = ROUND_UP(len, rg->rgrps->align * sdp->bsize); + io_align = rg->rgrps->align_off * sdp->bsize; + } + bhs = calloc(rg->ri.ri_length, sizeof(struct gfs2_buffer_head)); + if (bhs == NULL) + return 1; + + if (posix_memalign((void **)&bufs, io_align, len) != 0) { + errno = ENOMEM; + free(bhs); + return 1; + } + memset(bufs, 0, len); + + for (i = 0; i < rg->ri.ri_length; i++) { + rg->bits[i].bi_bh = bhs + i; + rg->bits[i].bi_bh->iov.iov_base = bufs + (i * sdp->bsize); + rg->bits[i].bi_bh->iov.iov_len = sdp->bsize; + rg->bits[i].bi_bh->b_blocknr = rg->ri.ri_addr + i; + rg->bits[i].bi_bh->sdp = sdp; + } + return 0; +} + +/** + * Free the multi-block bitmap buffer from a resource group. The buffer should + * have been allocated as a single chunk as in lgfs2_rgrp_bitbuf_alloc(). + * This does not implicitly write the bitmaps to disk. Use lgfs2_rgrp_write() + * for that. + * rg: The resource groups whose bitmap buffer should be freed. 
+ */ +void lgfs2_rgrp_bitbuf_free(lgfs2_rgrp_t rg) +{ + unsigned i; + free(rg->bits[0].bi_bh->iov.iov_base); + free(rg->bits[0].bi_bh); + for (i = 0; i < rg->ri.ri_length; i++) + rg->bits[i].bi_bh = NULL; +} + +/** + * Check a resource group's crc + * buf: The on-disk resource group header; rg_crc is zeroed while the hash is + * computed and then restored + * A stored crc of zero is not checked and counts as success. When + * GFS2_HAS_RG_RI_FIELDS is not defined no check is made at all. + * Returns 0 on success, non-zero if crc is bad + */ +int lgfs2_rgrp_crc_check(char *buf) +{ + int ret = 0; +#ifdef GFS2_HAS_RG_RI_FIELDS + struct gfs2_rgrp *rg = (struct gfs2_rgrp *)buf; + uint32_t crc = rg->rg_crc; + + if (crc == 0) + return 0; + + rg->rg_crc = 0; + if (be32_to_cpu(crc) != gfs2_disk_hash(buf, sizeof(struct gfs2_rgrp))) + ret = 1; + rg->rg_crc = crc; +#endif + return ret; +} + +/** + * Set the crc of an on-disk resource group + * buf: The on-disk resource group header; the hash is computed over it with + * rg_crc zeroed and then stored big-endian in rg_crc + * Does nothing when GFS2_HAS_RG_RI_FIELDS is not defined. + */ +void lgfs2_rgrp_crc_set(char *buf) +{ +#ifdef GFS2_HAS_RG_RI_FIELDS + struct gfs2_rgrp *rg = (struct gfs2_rgrp *)buf; + uint32_t crc; + + rg->rg_crc = 0; + crc = gfs2_disk_hash(buf, sizeof(struct gfs2_rgrp)); + rg->rg_crc = cpu_to_be32(crc); +#endif +} + +/** + * gfs2_rgrp_read - read in the resource group information from disk. + * @sdp - the superblock structure + * @rgd - resource group structure + * returns: 0 if no error, otherwise the block number that failed. + * Note that -1 (converted to uint64_t) is returned instead of a block + * number when the rgrp has zero length, its address fails + * gfs2_check_range(), the buffer array cannot be allocated, or + * breadm() fails. + */ +uint64_t gfs2_rgrp_read(struct gfs2_sbd *sdp, struct rgrp_tree *rgd) +{ + unsigned x, length = rgd->ri.ri_length; + struct gfs2_buffer_head **bhs; + + if (length == 0 || gfs2_check_range(sdp, rgd->ri.ri_addr)) + return -1; + + bhs = calloc(length, sizeof(struct gfs2_buffer_head *)); + if (bhs == NULL) + return -1; + + if (breadm(sdp, bhs, length, rgd->ri.ri_addr)) { + free(bhs); + return -1; + } + + for (x = 0; x < length; x++) { + struct gfs2_bitmap *bi = &rgd->bits[x]; + int mtype = (x ? 
GFS2_METATYPE_RB : GFS2_METATYPE_RG); + + bi->bi_bh = bhs[x]; + if (gfs2_check_meta(bi->bi_bh, mtype)) { + unsigned err = x; + do { + brelse(rgd->bits[x].bi_bh); + rgd->bits[x].bi_bh = NULL; + } while (x-- != 0); + free(bhs); + return rgd->ri.ri_addr + err; + } + } + if (sdp->gfs1) + gfs_rgrp_in((struct gfs_rgrp *)&rgd->rg, rgd->bits[0].bi_bh); + else { + if (lgfs2_rgrp_crc_check(rgd->bits[0].bi_bh->b_data)) { + free(bhs); + return rgd->ri.ri_addr; + } + gfs2_rgrp_in(&rgd->rg, rgd->bits[0].bi_bh->b_data); + } + free(bhs); + return 0; +} + +void gfs2_rgrp_relse(struct rgrp_tree *rgd) +{ + int x, length = rgd->ri.ri_length; + + if (rgd->bits == NULL) + return; + for (x = 0; x < length; x++) { + if (rgd->bits[x].bi_bh && rgd->bits[x].bi_bh->b_data) { + brelse(rgd->bits[x].bi_bh); + rgd->bits[x].bi_bh = NULL; + } + } +} + +struct rgrp_tree *rgrp_insert(struct osi_root *rgtree, uint64_t rgblock) +{ + struct osi_node **newn = &rgtree->osi_node, *parent = NULL; + struct rgrp_tree *data; + + /* Figure out where to put new node */ + while (*newn) { + struct rgrp_tree *cur = (struct rgrp_tree *)*newn; + + parent = *newn; + if (rgblock < cur->ri.ri_addr) + newn = &((*newn)->osi_left); + else if (rgblock > cur->ri.ri_addr) + newn = &((*newn)->osi_right); + else + return cur; + } + + data = calloc(1, sizeof(struct rgrp_tree)); + if (!data) + return NULL; + /* Add new node and rebalance tree. 
*/ + data->ri.ri_addr = rgblock; + osi_link_node(&data->node, parent, newn); + osi_insert_color(&data->node, rgtree); + + return data; +} + +void gfs2_rgrp_free(struct osi_root *rgrp_tree) +{ + struct rgrp_tree *rgd; + int rgs_since_sync = 0; + struct osi_node *n; + struct gfs2_sbd *sdp = NULL; + + if (OSI_EMPTY_ROOT(rgrp_tree)) + return; + while ((n = osi_first(rgrp_tree))) { + rgd = (struct rgrp_tree *)n; + + if (rgd->bits) { + if (rgd->bits[0].bi_bh) { /* if a buffer exists */ + rgs_since_sync++; + if (rgs_since_sync >= RG_SYNC_TOLERANCE) { + if (!sdp) + sdp = rgd->bits[0].bi_bh->sdp; + fsync(sdp->device_fd); + rgs_since_sync = 0; + } + gfs2_rgrp_relse(rgd); /* free them all. */ + } + free(rgd->bits); + } + osi_erase(&rgd->node, rgrp_tree); + free(rgd); + } +} + +static uint64_t align_block(const uint64_t base, const uint64_t align) +{ + if ((align > 0) && ((base % align) > 0)) + return (base - (base % align)) + align; + return base; +} + +/** + * Calculate the aligned block address of a resource group. + * rgs: The resource groups handle + * addr: The unaligned address of the resource group, in blocks + * Returns addr rounded up to the next multiple of rgs->align, or addr itself + * when rgs->align is 0 or addr is already aligned. + */ +uint64_t lgfs2_rgrp_align_addr(const lgfs2_rgrps_t rgs, uint64_t addr) +{ + return align_block(addr, rgs->align); +} + +/** + * Calculate the aligned relative address of the next resource group (and thus + * the aligned length of this one). + * rgs: The resource groups handle + * len: The base length of the current resource group, in blocks + * Returns the length of the resource group (the aligned relative address of + * the next one): len rounded up to a multiple of rgs->align, plus + * rgs->align_off. + */ +uint32_t lgfs2_rgrp_align_len(const lgfs2_rgrps_t rgs, uint32_t len) +{ + return align_block(len, rgs->align) + rgs->align_off; +} + +/** + * Plan the sizes of resource groups for remaining free space, based on a + * target maximum size. 
In order to make best use of the space while keeping + * the resource groups aligned appropriately we need to either reduce the + * length of every resource group or of a subset of the resource groups, so + * we're left with either one or two resource group sizes. We keep track of + * both of these and the numbers of each size of resource group inside the + * resource groups descriptor. + * rgs: The resource groups descriptor + * space: The number of remaining blocks to be allocated + * tgtsize: The target resource group size in blocks + * Returns the number of resource groups planned to fit in the given space, or + * 0 if the smallest resource group would be smaller than GFS2_MIN_RGSIZE. + */ +uint32_t lgfs2_rgrps_plan(const lgfs2_rgrps_t rgs, uint64_t space, uint32_t tgtsize) +{ + uint32_t maxlen = (GFS2_MAX_RGSIZE << 20) / rgs->sdp->bsize; + uint32_t minlen = (GFS2_MIN_RGSIZE << 20) / rgs->sdp->bsize; + + /* Apps should already have checked that the rg size is <= + GFS2_MAX_RGSIZE but just in case alignment pushes it over we clamp + it back down while calculating the initial rgrp length. */ + do { + rgs->plan[0].len = lgfs2_rgrp_align_len(rgs, tgtsize); + tgtsize -= (rgs->align + 1); + } while (rgs->plan[0].len > maxlen); + + rgs->plan[0].num = space / rgs->plan[0].len; + + if ((space - (rgs->plan[0].num * rgs->plan[0].len)) > rgs->align) { + unsigned adj = (rgs->align > 0) ? rgs->align : 1; + + /* Spread the adjustment required to fit a new rgrp at the end + over all of the rgrps so that we don't end with a single + tiny one. */ + rgs->plan[0].num++; + while (((rgs->plan[0].len - adj) * (uint64_t)rgs->plan[0].num) >= space) + rgs->plan[0].len -= adj; + + /* We've adjusted the size of the rgrps down as far as we can + without leaving a large gap at the end of the device now, + but we still need to reduce the size of some rgrps in order + to make everything fit, so we use the second rgplan to + specify a second length for a subset of the resource groups. 
+ If plan[0].len already divides the space with no remainder, + plan[1].num will stay 0 and it won't be used. */ + rgs->plan[1].len = rgs->plan[0].len - adj; + rgs->plan[1].num = 0; + + while (((rgs->plan[0].len * rgs->plan[0].num) + + (rgs->plan[1].len * rgs->plan[1].num)) >= space) { + /* Total number of rgrps stays constant now. We just + need to shift some weight around */ + rgs->plan[0].num--; + rgs->plan[1].num++; + } + } + + /* Once we've reached this point, + (plan[0].num * plan[0].len) + (plan[1].num * plan[1].len) + will be less than one adjustment smaller than 'space'. */ + if (rgs->plan[0].len < minlen) + return 0; + + return rgs->plan[0].num + rgs->plan[1].num; +} + +/** + * Create and initialise an empty set of resource groups + * sdp: The superblock structure the resource groups belong to; the pointer is + * stored in the returned set + * align: The required stripe alignment of the resource groups. Must be a multiple of 'offset'. + * offset: The required stripe offset of the resource groups + * Returns an initialised lgfs2_rgrps_t or NULL if unsuccessful with errno set + * (EINVAL when 'offset' is non-zero and 'align' is not a multiple of it) + */ +lgfs2_rgrps_t lgfs2_rgrps_init(struct gfs2_sbd *sdp, uint64_t align, uint64_t offset) +{ + lgfs2_rgrps_t rgs; + + errno = EINVAL; + if (offset != 0 && (align % offset) != 0) + return NULL; + + rgs = calloc(1, sizeof(*rgs)); + if (rgs == NULL) + return NULL; + + rgs->sdp = sdp; + rgs->align = align; + rgs->align_off = offset; + memset(&rgs->root, 0, sizeof(rgs->root)); + + return rgs; +} + +/** + * Populate a set of resource groups from a gfs2 rindex file. + * fd: An open file descriptor for the rindex file. + * rgs: The set of resource groups. + * Returns the number of resource groups added to the set or 0 on error with + * errno set. 
+ */ +unsigned lgfs2_rindex_read_fd(int fd, lgfs2_rgrps_t rgs) +{ + unsigned count = 0; + char buf[sizeof(struct gfs2_rindex)]; + + errno = EINVAL; + if (fd < 0 || rgs == NULL) + return 0; + + while (1) { + lgfs2_rgrp_t rg; + struct gfs2_rindex ri; + ssize_t ret = read(fd, buf, sizeof(struct gfs2_rindex)); + if (ret == 0) + break; + + if (ret != sizeof(struct gfs2_rindex)) + return 0; + + gfs2_rindex_in(&ri, buf); + rg = lgfs2_rgrps_append(rgs, &ri, 0); + if (rg == NULL) + return 0; + count++; + } + return count; +} + +/** + * Read a rindex entry into a set of resource groups + * rip: The inode of the rindex file + * rgs: The set of resource groups. + * i: The index of the entry to read from the rindex file + * Returns the new rindex entry added to the set or NULL on error with errno + * set. + */ +const struct gfs2_rindex *lgfs2_rindex_read_one(struct gfs2_inode *rip, lgfs2_rgrps_t rgs, unsigned i) +{ + uint64_t off = i * sizeof(struct gfs2_rindex); + char buf[sizeof(struct gfs2_rindex)]; + struct gfs2_rindex ri; + lgfs2_rgrp_t rg; + int ret; + + errno = EINVAL; + if (rip == NULL || rgs == NULL) + return NULL; + + ret = gfs2_readi(rip, buf, off, sizeof(struct gfs2_rindex)); + if (ret != sizeof(struct gfs2_rindex)) + return NULL; + + gfs2_rindex_in(&ri, buf); + rg = lgfs2_rgrps_append(rgs, &ri, 0); + if (rg == NULL) + return NULL; + + return &rg->ri; +} + +/** + * Free a set of resource groups created with lgfs2_rgrps_append() etc. This + * does not write any dirty buffers to disk. See lgfs2_rgrp_write(). + * rgs: A pointer to the set of resource groups to be freed. 
+ */ +void lgfs2_rgrps_free(lgfs2_rgrps_t *rgs) +{ + lgfs2_rgrp_t rg; + struct osi_root *tree = &(*rgs)->root; + + while ((rg = (struct rgrp_tree *)osi_first(tree))) { + int i; + for (i = 0; i < rg->ri.ri_length; i++) { + if (rg->bits[i].bi_bh != NULL) { + free(rg->bits[i].bi_bh); + rg->bits[i].bi_bh = NULL; + } + } + osi_erase(&rg->node, tree); + free(rg); + } + free(*rgs); + *rgs = NULL; +} + +/** + * Calculate the fields for a new entry in the resource group index. + * ri: A pointer to the resource group index entry to be calculated. + * addr: The address at which to place this resource group + * len: The required length of the resource group, in fs blocks. + * If rglen is 0, geometry previously calculated by lgfs2_rgrps_plan() will be used. + * Returns the calculated address of the next resource group or 0 with errno set: + * EINVAL - The entry pointer is NULL + * ENOSPC - This rgrp would extend past the end of the device + */ +uint64_t lgfs2_rindex_entry_new(lgfs2_rgrps_t rgs, struct gfs2_rindex *ri, uint64_t addr, uint32_t len) +{ + int plan = -1; + errno = EINVAL; + if (!ri) + return 0; + + errno = ENOSPC; + if (rgs->plan[0].num > 0) + plan = 0; + else if (rgs->plan[1].num > 0) + plan = 1; + else if (len == 0) + return 0; + + if (plan >= 0 && (len == 0 || len == rgs->plan[plan].len)) { + len = rgs->plan[plan].len; + rgs->plan[plan].num--; + } + + if (addr + len > rgs->sdp->device.length) + return 0; + + ri->ri_addr = addr; + ri->ri_length = rgblocks2bitblocks(rgs->sdp->bsize, len, &ri->ri_data); + ri->__pad = 0; + ri->ri_data0 = ri->ri_addr + ri->ri_length; + ri->ri_bitbytes = ri->ri_data / GFS2_NBBY; + memset(&ri->ri_reserved, 0, sizeof(ri->ri_reserved)); + + return ri->ri_addr + len; +} + +/** + * Return the rindex structure relating to a resource group. + * The return type is const to advise callers that making changes to this + * structure directly isn't wise. libgfs2 functions should be used instead. 
+ */ +const struct gfs2_rindex *lgfs2_rgrp_index(lgfs2_rgrp_t rg) +{ + return &rg->ri; +} + +/** + * Return the rgrp structure relating to a resource group. + * The return type is const to advise callers that making changes to this + * structure directly isn't wise. libgfs2 functions should be used instead. + */ +const struct gfs2_rgrp *lgfs2_rgrp_rgrp(lgfs2_rgrp_t rg) +{ + return &rg->rg; +} + +/** + * Returns the total resource group size, in blocks, required to give blksreq data blocks + */ +unsigned lgfs2_rgsize_for_data(uint64_t blksreq, unsigned bsize) +{ + const uint32_t blks_rgrp = GFS2_NBBY * (bsize - sizeof(struct gfs2_rgrp)); + const uint32_t blks_meta = GFS2_NBBY * (bsize - sizeof(struct gfs2_meta_header)); + unsigned bitblocks = 1; + blksreq = (blksreq + 3) & ~3; + if (blksreq > blks_rgrp) + bitblocks += ((blksreq - blks_rgrp) + blks_meta - 1) / blks_meta; + return bitblocks + blksreq; +} + +// Temporary function to aid in API migration +struct osi_node *lgfs2_rgrps_root(lgfs2_rgrps_t rgs) +{ + return rgs->root.osi_node; +} + +/** + * Insert a new resource group after the last resource group in a set. + * rgs: The set of resource groups + * entry: The entry to be added + * rg_skip: The value to be used for this resource group's rg_skip field + * Returns the new resource group on success or NULL on failure with errno set. 
+ */
+lgfs2_rgrp_t lgfs2_rgrps_append(lgfs2_rgrps_t rgs, struct gfs2_rindex *entry, uint32_t rg_skip)
+{
+	lgfs2_rgrp_t rg;
+	struct osi_node **link = &rgs->root.osi_node; /* insertion slot used when the tree is empty */
+	struct osi_node *parent = osi_last(&rgs->root); /* current last node, NULL for an empty tree */
+	lgfs2_rgrp_t lastrg = (lgfs2_rgrp_t)parent; /* NOTE(review): cast presumes 'node' is the first member of the rgrp struct - confirm */
+
+	errno = EINVAL; /* preset for the two validation failures below */
+	if (entry == NULL)
+		return NULL;
+
+	if (lastrg != NULL) { /* Tree is not empty */
+		if (entry->ri_addr <= lastrg->ri.ri_addr)
+			return NULL; /* Appending with a lower address doesn't make sense */
+		link = &lastrg->node.osi_right; /* new node becomes the right child of the current last node */
+	}
+
+	rg = calloc(1, sizeof(*rg) + (entry->ri_length * sizeof(struct gfs2_bitmap))); /* rgrp plus ri_length bitmap descriptors in one zeroed allocation */
+	if (rg == NULL)
+		return NULL;
+
+	rg->bits = (struct gfs2_bitmap *)(rg + 1); /* the bitmap array lives directly behind the rgrp struct */
+
+	osi_link_node(&rg->node, parent, link);
+	osi_insert_color(&rg->node, &rgs->root);
+
+	memcpy(&rg->ri, entry, sizeof(struct gfs2_rindex)); /* the set keeps its own copy of the caller's entry */
+	rg->rg.rg_header.mh_magic = GFS2_MAGIC;
+	rg->rg.rg_header.mh_type = GFS2_METATYPE_RG;
+	rg->rg.rg_header.mh_format = GFS2_FORMAT_RG;
+	rg->rg.rg_free = rg->ri.ri_data; /* every data block starts out counted as free */
+#ifdef GFS2_HAS_RG_SKIP
+	rg->rg.rg_skip = rg_skip;
+#endif
+#ifdef GFS2_HAS_RG_RI_FIELDS
+	rg->rg.rg_data0 = rg->ri.ri_data0;
+	rg->rg.rg_data = rg->ri.ri_data;
+	rg->rg.rg_bitbytes = rg->ri.ri_bitbytes;
+	rg->rg.rg_crc = 0;
+#endif
+	compute_bitmaps(rg, rgs->sdp->bsize); /* NOTE(review): any return value is ignored here */
+	rg->rgrps = rgs; /* back-pointer to the owning set, used by lgfs2_rgrp_write() */
+	return rg;
+}
+
+/**
+ * Write a resource group to a file descriptor.
+ * Returns 0 on success or non-zero on failure with errno set
+ * fd: NOTE(review): this parameter is never referenced in the function body;
+ *     the data is written with pwrite() to rg->rgrps->sdp->device_fd instead.
+ * rg: The resource group to write. If its first bitmap has no buffer attached,
+ *     buffers are allocated for the duration of the call and freed afterwards.
+ */
+int lgfs2_rgrp_write(int fd, const lgfs2_rgrp_t rg)
+{
+	struct gfs2_sbd *sdp = rg->rgrps->sdp;
+	unsigned int i;
+	const struct gfs2_meta_header bmh = { /* header stamped into every bitmap block after the first */
+		.mh_magic = GFS2_MAGIC,
+		.mh_type = GFS2_METATYPE_RB,
+		.mh_format = GFS2_FORMAT_RB,
+	};
+	int freebufs = 0; /* set when this call allocated the bitmap buffers itself */
+	ssize_t ret;
+	size_t len;
+
+	if (rg->bits[0].bi_bh == NULL) {
+		freebufs = 1;
+		if (lgfs2_rgrp_bitbuf_alloc(rg) != 0)
+			return -1;
+	}
+	gfs2_rgrp_out(&rg->rg, rg->bits[0].bi_bh->b_data); /* first block carries the rgrp header */
+	for (i = 1; i < rg->ri.ri_length; i++)
+		gfs2_meta_header_out(&bmh, rg->bits[i].bi_bh->b_data);
+
+	len = sdp->bsize * rg->ri.ri_length; /* bytes covered by the header and bitmap blocks */
+	if (rg->rgrps->align > 0)
+		len = ROUND_UP(len, rg->rgrps->align * sdp->bsize); /* NOTE(review): presumes the buffer behind bits[0] is at least this large - confirm in lgfs2_rgrp_bitbuf_alloc() */
+
+	ret = pwrite(sdp->device_fd, rg->bits[0].bi_bh->b_data, len, /* one write starting at the first buffer; presumes the bitmap data is contiguous - confirm */
+		     rg->bits[0].bi_bh->b_blocknr * sdp->bsize);
+
+	if (freebufs)
+		lgfs2_rgrp_bitbuf_free(rg);
+
+	return ret == len ? 0 : -1; /* a pwrite() error (-1) and a short write both count as failure */
+}
+
+lgfs2_rgrp_t lgfs2_rgrp_first(lgfs2_rgrps_t rgs)
+{	/* Returns the result of osi_first() on the set's tree, cast to an rgrp */
+	return (lgfs2_rgrp_t)osi_first(&rgs->root);
+}
+
+lgfs2_rgrp_t lgfs2_rgrp_next(lgfs2_rgrp_t rg)
+{	/* Returns the result of osi_next() on this rgrp's tree node, cast to an rgrp */
+	return (lgfs2_rgrp_t)osi_next(&rg->node);
+}
+
+lgfs2_rgrp_t lgfs2_rgrp_prev(lgfs2_rgrp_t rg)
+{	/* Returns the result of osi_prev() on this rgrp's tree node, cast to an rgrp */
+	return (lgfs2_rgrp_t)osi_prev(&rg->node);
+}
+
+lgfs2_rgrp_t lgfs2_rgrp_last(lgfs2_rgrps_t rgs)
+{	/* Returns the result of osi_last() on the set's tree, cast to an rgrp */
+	return (lgfs2_rgrp_t)osi_last(&rgs->root);
+}
+
+/**
+ * gfs2_rbm_from_block - Set the rbm based upon rgd and block number
+ * @rbm: The rbm with rgd already set correctly
+ * @block: The block number (filesystem relative)
+ *
+ * This sets the bi and offset members of an rbm based on a
+ * resource group and a filesystem relative block number. The
+ * resource group must be set in the rbm on entry, the bi and
+ * offset members will be set by this function.
+ *
+ * Returns: 0 on success, or non-zero with errno set
+ *          (EINVAL if the block lies before the rgrp's first data block or
+ *          more than UINT_MAX blocks after it, E2BIG if it lies past the
+ *          rgrp's last data block)
+ */
+int lgfs2_rbm_from_block(struct lgfs2_rbm *rbm, uint64_t block)
+{
+	uint64_t rblock = block - rbm->rgd->ri.ri_data0; /* rgrp-relative block; wraps to a huge value if block < ri_data0 */
+	struct gfs2_sbd *sdp = rbm_bi(rbm)->bi_bh->sdp; /* NOTE(review): uses the caller's bii and needs that bitmap's bi_bh to be non-NULL */
+
+	if (rblock > UINT_MAX) {
+		errno = EINVAL;
+		return 1;
+	}
+	if (block >= rbm->rgd->ri.ri_data0 + rbm->rgd->ri.ri_data) {
+		errno = E2BIG;
+		return 1;
+	}
+
+	rbm->bii = 0;
+	rbm->offset = (uint32_t)(rblock);
+	/* Check if the block is within the first block */
+	if (rbm->offset < (rbm_bi(rbm)->bi_len * GFS2_NBBY))
+		return 0;
+
+	/* Adjust for the size diff between gfs2_meta_header and gfs2_rgrp */
+	rbm->offset += (sizeof(struct gfs2_rgrp) -
+			sizeof(struct gfs2_meta_header)) * GFS2_NBBY;
+	rbm->bii = rbm->offset / sdp->sd_blocks_per_bitmap; /* index of the bitmap block holding the target */
+	rbm->offset -= rbm->bii * sdp->sd_blocks_per_bitmap; /* remainder is the offset within that bitmap */
+	return 0;
+}
+
+/**
+ * lgfs2_rbm_incr - increment an rbm structure
+ * @rbm: The rbm with rgd already set correctly
+ *
+ * This function takes an existing rbm structure and increments it to the next
+ * viable block offset.
+ *
+ * Returns: If incrementing the offset would cause the rbm to go past the
+ * end of the rgrp, true is returned, otherwise false.
+ * + */ +static int lgfs2_rbm_incr(struct lgfs2_rbm *rbm) +{ + if (rbm->offset + 1 < (rbm_bi(rbm)->bi_len * GFS2_NBBY)) { /* in the same bitmap */ + rbm->offset++; + return 0; + } + if (rbm->bii == rbm->rgd->ri.ri_length - 1) /* at the last bitmap */ + return 1; + + rbm->offset = 0; + rbm->bii++; + return 0; +} + +/** + * lgfs2_testbit - test a bit in the bitmaps + * @rbm: The bit to test + * + * Returns: The two bit block state of the requested bit + */ +static inline uint8_t lgfs2_testbit(const struct lgfs2_rbm *rbm) +{ + struct gfs2_bitmap *bi = rbm_bi(rbm); + const uint8_t *buffer = (uint8_t *)bi->bi_bh->b_data + bi->bi_offset; + const uint8_t *byte; + unsigned int bit; + + byte = buffer + (rbm->offset / GFS2_NBBY); + bit = (rbm->offset % GFS2_NBBY) * GFS2_BIT_SIZE; + + return (*byte >> bit) & GFS2_BIT_MASK; +} + +/** + * lgfs2_unaligned_extlen - Look for free blocks which are not byte aligned + * @rbm: Position to search (value/result) + * @n_unaligned: Number of unaligned blocks to check + * @len: Decremented for each block found (terminate on zero) + * + * Returns: true if a non-free block is encountered + */ +static int lgfs2_unaligned_extlen(struct lgfs2_rbm *rbm, uint32_t n_unaligned, uint32_t *len) +{ + uint32_t n; + uint8_t res; + + for (n = 0; n < n_unaligned; n++) { + res = lgfs2_testbit(rbm); + if (res != GFS2_BLKST_FREE) + return 1; + (*len)--; + if (*len == 0) + return 1; + if (lgfs2_rbm_incr(rbm)) + return 1; + } + + return 0; +} + +static uint8_t *check_bytes8(const uint8_t *start, uint8_t value, unsigned bytes) +{ + while (bytes) { + if (*start != value) + return (void *)start; + start++; + bytes--; + } + return NULL; +} + +/** + * lgfs2_free_extlen - Return extent length of free blocks + * @rbm: Starting position + * @len: Max length to check + * + * Starting at the block specified by the rbm, see how many free blocks + * there are, not reading more than len blocks ahead. 
This can be done
+ * using check_bytes8 when the blocks are byte aligned, but has to be done
+ * on a block by block basis in case of unaligned blocks. Also this
+ * function can cope with bitmap boundaries (although it must stop on
+ * a resource group boundary)
+ *
+ * Returns: Number of free blocks in the extent
+ */
+static uint32_t lgfs2_free_extlen(const struct lgfs2_rbm *rrbm, uint32_t len)
+{
+	struct lgfs2_rbm rbm = *rrbm; /* work on a copy so the caller's position is left untouched */
+	uint32_t n_unaligned = rbm.offset & 3; /* blocks into the current bitmap byte */
+	uint32_t size = len; /* original budget; the result is size minus what remains of len */
+	uint32_t bytes;
+	uint32_t chunk_size;
+	uint8_t *ptr, *start, *end;
+	uint64_t block;
+	struct gfs2_bitmap *bi;
+
+	if (n_unaligned &&
+	    lgfs2_unaligned_extlen(&rbm, 4 - n_unaligned, &len)) /* walk block by block up to the next byte boundary */
+		goto out;
+
+	n_unaligned = len & 3;
+	/* Start is now byte aligned */
+	while (len > 3) {
+		bi = rbm_bi(&rbm);
+		start = (uint8_t *)bi->bi_bh->b_data;
+		end = start + bi->bi_bh->sdp->bsize; /* end of this bitmap block's buffer */
+		start += bi->bi_offset;
+		start += (rbm.offset / GFS2_NBBY); /* byte holding the current position */
+		bytes = (len / GFS2_NBBY) < (end - start) ? (len / GFS2_NBBY):(end - start); /* whole bytes to scan: limited by the budget and by the buffer end */
+		ptr = check_bytes8(start, 0, bytes); /* first byte that is not all-zero, i.e. not entirely free */
+		chunk_size = ((ptr == NULL) ? bytes : (ptr - start));
+		chunk_size *= GFS2_NBBY; /* convert whole free bytes to blocks */
+		len -= chunk_size;
+		block = lgfs2_rbm_to_block(&rbm);
+		if (lgfs2_rbm_from_block(&rbm, block + chunk_size)) { /* advancing failed, e.g. past the end of the rgrp */
+			n_unaligned = 0;
+			break;
+		}
+		if (ptr) { /* hit a partly used byte: check its leading blocks one by one */
+			n_unaligned = 3;
+			break;
+		}
+		n_unaligned = len & 3;
+	}
+
+	/* Deal with any bits left over at the end */
+	if (n_unaligned)
+		lgfs2_unaligned_extlen(&rbm, n_unaligned, &len);
+out:
+	return size - len;
+}
+
+/**
+ * lgfs2_rbm_find - Look for blocks of a particular state
+ * @rbm: Value/result starting position and final position
+ * @state: The state which we want to find
+ * @minext: Pointer to the requested extent length (NULL for a single block)
+ *          NOTE(review): the function only reads this value, it never stores to it.
+ *
+ * Returns: 0 on success, non-zero with errno == ENOSPC if there is no block of the requested state
+ */
+int lgfs2_rbm_find(struct lgfs2_rbm *rbm, uint8_t state, uint32_t *minext)
+{
+	int initial_bii;
+	uint32_t offset;
+	int n = 0;
+	int iters = rbm->rgd->ri.ri_length;
+	uint32_t extlen;
+	/* A NULL minext means "any single block will do". Nothing called from
+	   here writes through minext, so the value can be read once up front. */
+	const uint32_t want = (minext != NULL) ? *minext : 0;
+
+	/* If we are not starting at the beginning of a bitmap, then we
+	 * need to add one to the bitmap count to ensure that we search
+	 * the starting bitmap twice.
+	 */
+	if (rbm->offset != 0)
+		iters++;
+
+	for (n = 0; n < iters; n++) {
+		struct gfs2_bitmap *bi = rbm_bi(rbm);
+		struct gfs2_buffer_head *bh = bi->bi_bh;
+		uint8_t *buf = (uint8_t *)bh->b_data + bi->bi_offset;
+		uint64_t block;
+		int ret;
+
+		/* The rgrp cannot hold a free extent longer than its free count */
+		if ((rbm->rgd->rg.rg_free < want) && (state == GFS2_BLKST_FREE))
+			goto next_bitmap;
+
+		offset = gfs2_bitfit(buf, bi->bi_len, rbm->offset, state);
+		if (offset == BFITNOENT)
+			goto next_bitmap;
+
+		rbm->offset = offset;
+		initial_bii = rbm->bii;
+		block = lgfs2_rbm_to_block(rbm);
+		extlen = 1;
+
+		if (want != 0)
+			extlen = lgfs2_free_extlen(rbm, want);
+
+		if (extlen >= want)
+			return 0;
+
+		/* Extent too short: resume the search just past it */
+		ret = lgfs2_rbm_from_block(rbm, block + extlen);
+		if (ret == 0) {
+			/* Account for any bitmaps the skip moved us across */
+			n += (rbm->bii - initial_bii);
+			continue;
+		}
+
+		if (errno == E2BIG) {
+			/* The short extent reached the end of the rgrp */
+			rbm->bii = 0;
+			rbm->offset = 0;
+			n += (rbm->bii - initial_bii);
+			goto res_covered_end_of_rgrp;
+		}
+
+		return ret;
+
+next_bitmap:	/* Find next bitmap in the rgrp */
+		rbm->offset = 0;
+		rbm->bii++;
+		if (rbm->bii == rbm->rgd->ri.ri_length)
+			rbm->bii = 0;
+
+res_covered_end_of_rgrp:
+		/* Wrapping back to the first bitmap ends the search */
+		if (rbm->bii == 0)
+			break;
+	}
+
+	errno = ENOSPC;
+	return 1;
+}
+
+/**
+ * lgfs2_alloc_extent - allocate an extent from a given bitmap
+ * @rbm: the resource group information
+ * @state: The state of the first block, GFS2_BLKST_DINODE or GFS2_BLKST_USED
+ * @elen: The requested extent length
+ * Returns the length of the extent allocated.
+ */
+unsigned lgfs2_alloc_extent(const struct lgfs2_rbm *rbm, int state, const unsigned elen)
+{
+	struct lgfs2_rbm pos = { .rgd = rbm->rgd, }; /* scratch position; bii and offset start at zero */
+	const uint64_t block = lgfs2_rbm_to_block(rbm); /* fs block number of the first block of the extent */
+	unsigned len;
+
+	gfs2_set_bitmap(rbm->rgd, block, state); /* the first block is marked unconditionally, so the result is at least 1 */
+
+	for (len = 1; len < elen; len++) {
+		int ret = lgfs2_rbm_from_block(&pos, block + len);
+		if (ret || lgfs2_testbit(&pos) != GFS2_BLKST_FREE) /* stop at the end of the rgrp or at the first non-free block */
+			break;
+		gfs2_set_bitmap(pos.rgd, block + len, GFS2_BLKST_USED); /* blocks after the first are always marked USED */
+	}
+	return len;
+}
diff --git a/gfs2/libgfs2/rgrp.h b/gfs2/libgfs2/rgrp.h
new file mode 100644
index 0000000..fd442b1
--- /dev/null
+++ b/gfs2/libgfs2/rgrp.h
@@ -0,0 +1,51 @@
+#ifndef __RGRP_DOT_H__
+#define __RGRP_DOT_H__
+
+#include "libgfs2.h"
+
+struct rgplan { /* one entry of a resource group layout plan: 'num' rgrps of 'len' blocks each */
+	uint32_t num;
+	uint32_t len;
+};
+
+/**
+ * This structure is defined in libgfs2.h as an opaque type. It stores the
+ * constants and context required for creating resource groups from any point
+ * in an application.
+ */
+struct _lgfs2_rgrps {
+	struct osi_root root; /* tree of the resource groups in this set */
+	struct rgplan plan[2]; /* plan[0] is used first, plan[1] once plan[0].num reaches 0 */
+	struct gfs2_sbd *sdp;
+	unsigned long align; /* stripe alignment given to lgfs2_rgrps_init() */
+	unsigned long align_off; /* stripe offset given to lgfs2_rgrps_init() */
+};
+
+struct lgfs2_rbm {
+	lgfs2_rgrp_t rgd;
+	uint32_t offset;    /* The offset is bitmap relative */
+	unsigned bii;       /* Bitmap index */
+};
+
+static inline struct gfs2_bitmap *rbm_bi(const struct lgfs2_rbm *rbm)
+{	/* The bitmap descriptor that rbm->bii currently selects */
+	return rbm->rgd->bits + rbm->bii;
+}
+
+static inline uint64_t lgfs2_rbm_to_block(const struct lgfs2_rbm *rbm)
+{	/* Convert a (bitmap index, offset) position back to a filesystem block number */
+	return rbm->rgd->ri.ri_data0 + (rbm_bi(rbm)->bi_start * GFS2_NBBY) +
+		rbm->offset;
+}
+
+static inline int lgfs2_rbm_eq(const struct lgfs2_rbm *rbm1, const struct lgfs2_rbm *rbm2)
+{	/* Non-zero when both positions name the same rgrp, bitmap and offset */
+	return (rbm1->rgd == rbm2->rgd) && (rbm1->bii == rbm2->bii) &&
+	       (rbm1->offset == rbm2->offset);
+}
+
+extern int lgfs2_rbm_from_block(struct lgfs2_rbm *rbm, uint64_t block);
+extern int lgfs2_rbm_find(struct lgfs2_rbm *rbm, uint8_t state, uint32_t *minext);
+extern unsigned lgfs2_alloc_extent(const struct lgfs2_rbm *rbm, int state, const unsigned elen);
+
+#endif /*
__RGRP_DOT_H__ */
diff --git a/gfs2/libgfs2/structures.c b/gfs2/libgfs2/structures.c
new file mode 100644
index 0000000..c84701d
--- /dev/null
+++ b/gfs2/libgfs2/structures.c
@@ -0,0 +1,679 @@
+#include "clusterautoconfig.h" /* NOTE(review): the bare #include lines below carry no header name in this copy of the patch; the targets look stripped */
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include "libgfs2.h"
+#include "config.h"
+#include "crc32c.h"
+
+#ifdef GFS2_HAS_UUID
+#include
+#endif
+
+int build_master(struct gfs2_sbd *sdp)
+{	/*
+	 * Allocate a dinode block, initialise it as a system directory and
+	 * store the resulting inode in sdp->master_dir.
+	 * Returns 0 on success or -1 on failure.
+	 */
+	struct gfs2_inum inum;
+	uint64_t bn;
+	struct gfs2_buffer_head *bh = NULL;
+	int err = lgfs2_dinode_alloc(sdp, 1, &bn);
+
+	if (err != 0)
+		return -1;
+
+	inum.no_formal_ino = sdp->md.next_inum++; /* consumes the next formal inode number */
+	inum.no_addr = bn;
+
+	err = init_dinode(sdp, &bh, &inum, S_IFDIR | 0755, GFS2_DIF_SYSTEM, &inum); /* the same inum is passed twice; presumably the second is the parent - confirm */
+	if (err != 0)
+		return -1;
+
+	sdp->master_dir = lgfs2_inode_get(sdp, bh);
+	if (sdp->master_dir == NULL)
+		return -1;
+
+	if (cfg_debug) {
+		printf("\nMaster dir:\n");
+		gfs2_dinode_print(&sdp->master_dir->i_di);
+	}
+	sdp->master_dir->bh_owned = 1;
+	return 0;
+}
+
+/**
+ * Initialise a gfs2_sb structure with sensible defaults.
+ */ +void lgfs2_sb_init(struct gfs2_sb *sb, unsigned bsize) +{ + memset(sb, 0, sizeof(struct gfs2_sb)); + sb->sb_header.mh_magic = GFS2_MAGIC; + sb->sb_header.mh_type = GFS2_METATYPE_SB; + sb->sb_header.mh_format = GFS2_FORMAT_SB; + sb->sb_fs_format = GFS2_FORMAT_FS; + sb->sb_multihost_format = GFS2_FORMAT_MULTI; + sb->sb_bsize = bsize; + sb->sb_bsize_shift = ffs(bsize) - 1; +#ifdef GFS2_HAS_UUID + uuid_generate(sb->sb_uuid); +#endif +} + +int lgfs2_sb_write(const struct gfs2_sb *sb, int fd, const unsigned bsize) +{ + int i, err = -1; + struct iovec *iov; + const size_t sb_addr = GFS2_SB_ADDR * GFS2_BASIC_BLOCK / bsize; + const size_t len = sb_addr + 1; + + /* We only need 2 blocks: one for zeroing and a second for the superblock */ + char *buf = calloc(2, bsize); + if (buf == NULL) + return -1; + + iov = malloc(len * sizeof(*iov)); + if (iov == NULL) + goto out_buf; + + for (i = 0; i < len; i++) { + iov[i].iov_base = buf; + iov[i].iov_len = bsize; + } + + gfs2_sb_out(sb, buf + bsize); + iov[sb_addr].iov_base = buf + bsize; + + if (pwritev(fd, iov, len, 0) < (len * bsize)) + goto out_iov; + + err = 0; +out_iov: + free(iov); +out_buf: + free(buf); + return err; +} + +uint32_t lgfs2_log_header_hash(char *buf) +{ + /* lh_hash only CRCs the fields in the old lh, which ends where lh_crc is now */ + const off_t v1_end = offsetof(struct gfs2_log_header, lh_hash) + 4; + + return gfs2_disk_hash(buf, v1_end); +} + +uint32_t lgfs2_log_header_crc(char *buf, unsigned bsize) +{ +#ifdef GFS2_HAS_LH_V2 + /* lh_crc CRCs the rest of the block starting after lh_crc */ + const off_t v1_end = offsetof(struct gfs2_log_header, lh_hash) + 4; + const unsigned char *lb = (const unsigned char *)buf; + + return crc32c(~0, lb + v1_end + 4, bsize - v1_end - 4); +#else + return 0; +#endif +} + +/** + * Intialise and write the data blocks for a new journal as a contiguous + * extent. 
The indirect blocks pointing to these data blocks should have been
+ * written separately using lgfs2_write_filemeta() and the extent should have
+ * been allocated using lgfs2_file_alloc().
+ * ip: The journal's inode
+ * Returns 0 on success or -1 with errno set on error.
+ * The extent is taken to be the last 'blocks' blocks of the di_blocks blocks
+ * that start at the inode's own address (see the jext0 calculation).
+ */
+int lgfs2_write_journal_data(struct gfs2_inode *ip)
+{
+	struct gfs2_log_header lh = {
+		.lh_header.mh_magic = GFS2_MAGIC,
+		.lh_header.mh_type = GFS2_METATYPE_LH,
+		.lh_header.mh_format = GFS2_FORMAT_LH,
+		.lh_tail = 0,
+		.lh_blkno = 0,
+		.lh_hash = 0,
+#ifdef GFS2_HAS_LH_V2
+		.lh_flags = GFS2_LOG_HEAD_UNMOUNT | GFS2_LOG_HEAD_USERSPACE,
+		.lh_crc = 0,
+		.lh_nsec = 0,
+		.lh_sec = 0,
+		.lh_jinode = ip->i_di.di_num.no_addr,
+		.lh_statfs_addr = 0,
+		.lh_quota_addr = 0,
+		.lh_local_total = 0,
+		.lh_local_free = 0,
+		.lh_local_dinodes = 0,
+#else
+		.lh_flags = GFS2_LOG_HEAD_UNMOUNT,
+#endif
+	};
+	struct gfs2_buffer_head *bh;
+	struct gfs2_sbd *sdp = ip->i_sbd;
+	unsigned blocks = (ip->i_di.di_size + sdp->bsize - 1) / sdp->bsize; /* journal size in fs blocks, rounded up */
+	uint64_t jext0 = ip->i_di.di_num.no_addr + ip->i_di.di_blocks - blocks; /* first block of the journal data extent */
+	uint64_t seq = ((blocks) * (random() / (RAND_MAX + 1.0))); /* random starting sequence number in [0, blocks) */
+
+	bh = bget(sdp, jext0);
+	if (bh == NULL)
+		return -1;
+
+	crc32c_optimization_init();
+	do { /* NOTE(review): as a do/while, one block is written even if 'blocks' is 0 */
+		struct gfs2_log_header *buflh = (struct gfs2_log_header *)bh->b_data;
+
+		lh.lh_sequence = seq;
+		lh.lh_blkno = bh->b_blocknr - jext0; /* block index within the journal */
+		gfs2_log_header_out(&lh, bh->b_data);
+
+		buflh->lh_hash = cpu_to_be32(lgfs2_log_header_hash(bh->b_data)); /* patched into the buffer after encoding */
+#ifdef GFS2_HAS_LH_V2
+		buflh->lh_addr = cpu_to_be64(bh->b_blocknr);
+		buflh->lh_crc = cpu_to_be32(lgfs2_log_header_crc(bh->b_data, sdp->bsize));
+#endif
+
+		if (bwrite(bh)) {
+			free(bh);
+			return -1;
+		}
+
+		if (++seq == blocks) /* sequence numbers wrap within [0, blocks) */
+			seq = 0;
+
+	} while (++bh->b_blocknr < jext0 + blocks); /* the one buffer is reused for each successive block */
+
+	free(bh);
+	return 0;
+}
+
+int write_journal(struct gfs2_inode *jnl, unsigned bsize, unsigned int blocks)
+{	/*
+	 * Fill the first 'blocks' file blocks of the journal inode 'jnl' with
+	 * log headers, one per block of 'bsize' bytes.
+	 * Returns 0 on success or -1 if a file buffer could not be obtained.
+	 */
+	struct gfs2_log_header lh;
+	unsigned int x;
+	uint64_t seq = ((blocks) * (random() / (RAND_MAX + 1.0))); /* random starting sequence number in [0, blocks) */
+	uint32_t hash;
+	unsigned int height;
+
+	/* Build the height up so our journal blocks will be contiguous and */
+	/* not broken up by indirect block pages.                           */
+	height = calc_tree_height(jnl, (blocks + 1) * bsize);
+	build_height(jnl, height);
+
+	memset(&lh, 0, sizeof(struct gfs2_log_header));
+	lh.lh_header.mh_magic = GFS2_MAGIC;
+	lh.lh_header.mh_type = GFS2_METATYPE_LH;
+	lh.lh_header.mh_format = GFS2_FORMAT_LH;
+	lh.lh_flags = GFS2_LOG_HEAD_UNMOUNT;
+#ifdef GFS2_HAS_LH_V2
+	lh.lh_flags |= GFS2_LOG_HEAD_USERSPACE;
+	lh.lh_jinode = jnl->i_di.di_num.no_addr;
+#endif
+	for (x = 0; x < blocks; x++) { /* first pass: get_file_buf() with TRUE for every block, before any header is written */
+		struct gfs2_buffer_head *bh = get_file_buf(jnl, x, TRUE);
+		if (!bh)
+			return -1;
+		bmodified(bh);
+		brelse(bh);
+	}
+	crc32c_optimization_init();
+	for (x = 0; x < blocks; x++) { /* second pass: write a log header into each block */
+		struct gfs2_buffer_head *bh = get_file_buf(jnl, x, FALSE);
+		if (!bh)
+			return -1;
+
+		memset(bh->b_data, 0, bsize);
+		lh.lh_sequence = seq;
+		lh.lh_blkno = x;
+		gfs2_log_header_out(&lh, bh->b_data);
+		hash = lgfs2_log_header_hash(bh->b_data);
+		((struct gfs2_log_header *)bh->b_data)->lh_hash = cpu_to_be32(hash);
+#ifdef GFS2_HAS_LH_V2
+		((struct gfs2_log_header *)bh->b_data)->lh_addr = cpu_to_be64(bh->b_blocknr);
+		hash = lgfs2_log_header_crc(bh->b_data, bsize);
+		((struct gfs2_log_header *)bh->b_data)->lh_crc = cpu_to_be32(hash);
+#endif
+		bmodified(bh);
+		brelse(bh);
+
+		if (++seq == blocks) /* sequence numbers wrap within [0, blocks) */
+			seq = 0;
+	}
+
+	return 0;
+}
+
+int build_journal(struct gfs2_sbd *sdp, int j, struct gfs2_inode *jindex)
+{	/*
+	 * Create the file "journal<j>" in 'jindex', store its inode in
+	 * sdp->md.journal[j] and fill it using write_journal().
+	 * Returns 0 on success, errno if the file could not be created, or
+	 * the write_journal() result.
+	 */
+	char name[256];
+	int ret;
+
+	sprintf(name, "journal%u", j);
+	sdp->md.journal[j] = createi(jindex, name, S_IFREG | 0600,
+				     GFS2_DIF_SYSTEM);
+	if (sdp->md.journal[j] == NULL) {
+		return errno;
+	}
+	ret = write_journal(sdp->md.journal[j], sdp->bsize,
+			    sdp->jsize << 20 >> sdp->sd_sb.sb_bsize_shift); /* sdp->jsize scaled by 2^20, then converted to fs blocks */
+	return ret;
+}
+
+/**
+ * Write a jindex file given a list of journal inums.
+ * master: Inode of the master directory + * jnls: List of inum structures relating to previously created journals. + * nmemb: The number of entries in the list (number of journals). + * Returns 0 on success or non-zero on error with errno set. + */ +int lgfs2_build_jindex(struct gfs2_inode *master, struct gfs2_inum *jnls, size_t nmemb) +{ + char fname[GFS2_FNAMESIZE + 1]; + struct gfs2_inode *jindex; + unsigned j; + int ret; + + if (nmemb == 0 || jnls == NULL) { + errno = EINVAL; + return 1; + } + jindex = createi(master, "jindex", S_IFDIR | 0700, GFS2_DIF_SYSTEM); + if (jindex == NULL) + return 1; + + fname[GFS2_FNAMESIZE] = '\0'; + + for (j = 0; j < nmemb; j++) { + snprintf(fname, GFS2_FNAMESIZE, "journal%u", j); + ret = dir_add(jindex, fname, strlen(fname), &jnls[j], IF2DT(S_IFREG | 0600)); + if (ret) { + inode_put(&jindex); + return 1; + } + } + + if (cfg_debug) { + printf("\nJindex:\n"); + gfs2_dinode_print(&jindex->i_di); + } + + inode_put(&jindex); + return 0; +} + +int build_jindex(struct gfs2_sbd *sdp) +{ + struct gfs2_inode *jindex; + unsigned int j; + int ret; + + jindex = createi(sdp->master_dir, "jindex", S_IFDIR | 0700, + GFS2_DIF_SYSTEM); + if (jindex == NULL) { + return errno; + } + sdp->md.journal = malloc(sdp->md.journals * + sizeof(struct gfs2_inode *)); + for (j = 0; j < sdp->md.journals; j++) { + ret = build_journal(sdp, j, jindex); + if (ret) + return ret; + inode_put(&sdp->md.journal[j]); + } + if (cfg_debug) { + printf("\nJindex:\n"); + gfs2_dinode_print(&jindex->i_di); + } + + free(sdp->md.journal); + inode_put(&jindex); + return 0; +} + +int build_inum_range(struct gfs2_inode *per_node, unsigned int j) +{ + char name[256]; + struct gfs2_inode *ip; + + sprintf(name, "inum_range%u", j); + ip = createi(per_node, name, S_IFREG | 0600, + GFS2_DIF_SYSTEM | GFS2_DIF_JDATA); + if (ip == NULL) { + return errno; + } + ip->i_di.di_size = sizeof(struct gfs2_inum_range); + gfs2_dinode_out(&ip->i_di, ip->i_bh->b_data); + bmodified(ip->i_bh); + if 
(cfg_debug) { + printf("\nInum Range %u:\n", j); + gfs2_dinode_print(&ip->i_di); + } + + inode_put(&ip); + return 0; +} + +int build_statfs_change(struct gfs2_inode *per_node, unsigned int j) +{ + char name[256]; + struct gfs2_inode *ip; + + sprintf(name, "statfs_change%u", j); + ip = createi(per_node, name, S_IFREG | 0600, + GFS2_DIF_SYSTEM | GFS2_DIF_JDATA); + if (ip == NULL) { + return errno; + } + ip->i_di.di_size = sizeof(struct gfs2_statfs_change); + gfs2_dinode_out(&ip->i_di, ip->i_bh->b_data); + bmodified(ip->i_bh); + if (cfg_debug) { + printf("\nStatFS Change %u:\n", j); + gfs2_dinode_print(&ip->i_di); + } + + inode_put(&ip); + return 0; +} + +int build_quota_change(struct gfs2_inode *per_node, unsigned int j) +{ + struct gfs2_sbd *sdp = per_node->i_sbd; + struct gfs2_meta_header mh; + char name[256]; + struct gfs2_inode *ip; + unsigned int blocks = sdp->qcsize << (20 - sdp->sd_sb.sb_bsize_shift); + unsigned int x; + unsigned int hgt; + struct gfs2_buffer_head *bh; + + memset(&mh, 0, sizeof(struct gfs2_meta_header)); + mh.mh_magic = GFS2_MAGIC; + mh.mh_type = GFS2_METATYPE_QC; + mh.mh_format = GFS2_FORMAT_QC; + + sprintf(name, "quota_change%u", j); + ip = createi(per_node, name, S_IFREG | 0600, GFS2_DIF_SYSTEM); + if (ip == NULL) { + return errno; + } + + hgt = calc_tree_height(ip, (blocks + 1) * sdp->bsize); + build_height(ip, hgt); + + for (x = 0; x < blocks; x++) { + bh = get_file_buf(ip, x, FALSE); + if (!bh) + return -1; + + memset(bh->b_data, 0, sdp->bsize); + gfs2_meta_header_out(&mh, bh->b_data); + bmodified(bh); + brelse(bh); + } + + if (cfg_debug) { + printf("\nQuota Change %u:\n", j); + gfs2_dinode_print(&ip->i_di); + } + + inode_put(&ip); + return 0; +} + +int build_per_node(struct gfs2_sbd *sdp) +{ + struct gfs2_inode *per_node; + unsigned int j; + int err; + + per_node = createi(sdp->master_dir, "per_node", S_IFDIR | 0700, + GFS2_DIF_SYSTEM); + if (per_node == NULL) { + return errno; + } + + for (j = 0; j < sdp->md.journals; j++) { + err = 
build_inum_range(per_node, j); + if (err) { + return err; + } + err = build_statfs_change(per_node, j); + if (err) { + return err; + } + err = build_quota_change(per_node, j); + if (err) { + return err; + } + } + + if (cfg_debug) { + printf("\nper_node:\n"); + gfs2_dinode_print(&per_node->i_di); + } + + inode_put(&per_node); + return 0; +} + +int build_inum(struct gfs2_sbd *sdp) +{ + struct gfs2_inode *ip; + + ip = createi(sdp->master_dir, "inum", S_IFREG | 0600, + GFS2_DIF_SYSTEM | GFS2_DIF_JDATA); + if (ip == NULL) { + return errno; + } + + if (cfg_debug) { + printf("\nInum Inode:\n"); + gfs2_dinode_print(&ip->i_di); + } + + inode_put(&ip); + return 0; +} + +int build_statfs(struct gfs2_sbd *sdp) +{ + struct gfs2_inode *ip; + + ip = createi(sdp->master_dir, "statfs", S_IFREG | 0600, + GFS2_DIF_SYSTEM | GFS2_DIF_JDATA); + if (ip == NULL) { + return errno; + } + + if (cfg_debug) { + printf("\nStatFS Inode:\n"); + gfs2_dinode_print(&ip->i_di); + } + + inode_put(&ip); + return 0; +} + +int build_rindex(struct gfs2_sbd *sdp) +{ + struct gfs2_inode *ip; + struct osi_node *n, *next = NULL; + struct rgrp_tree *rl; + char buf[sizeof(struct gfs2_rindex)]; + int count; + + ip = createi(sdp->master_dir, "rindex", S_IFREG | 0600, + GFS2_DIF_SYSTEM | GFS2_DIF_JDATA); + if (ip == NULL) { + return errno; + } + ip->i_di.di_payload_format = GFS2_FORMAT_RI; + bmodified(ip->i_bh); + + for (n = osi_first(&sdp->rgtree); n; n = next) { + next = osi_next(n); + rl = (struct rgrp_tree *)n; + + gfs2_rindex_out(&rl->ri, buf); + + count = gfs2_writei(ip, buf, ip->i_di.di_size, + sizeof(struct gfs2_rindex)); + if (count != sizeof(struct gfs2_rindex)) + return -1; + } + memset(buf, 0, sizeof(struct gfs2_rindex)); + count = __gfs2_writei(ip, buf, ip->i_di.di_size, + sizeof(struct gfs2_rindex), 0); + if (count != sizeof(struct gfs2_rindex)) + return -1; + + if (cfg_debug) { + printf("\nResource Index:\n"); + gfs2_dinode_print(&ip->i_di); + } + + inode_put(&ip); + return 0; +} + +int 
build_quota(struct gfs2_sbd *sdp) +{ + struct gfs2_inode *ip; + struct gfs2_quota qu; + char buf[sizeof(struct gfs2_quota)]; + int count; + + ip = createi(sdp->master_dir, "quota", S_IFREG | 0600, + GFS2_DIF_SYSTEM | GFS2_DIF_JDATA); + if (ip == NULL) { + return errno; + } + ip->i_di.di_payload_format = GFS2_FORMAT_QU; + bmodified(ip->i_bh); + + memset(&qu, 0, sizeof(struct gfs2_quota)); + qu.qu_value = 1; + gfs2_quota_out(&qu, buf); + + count = gfs2_writei(ip, buf, ip->i_di.di_size, sizeof(struct gfs2_quota)); + if (count != sizeof(struct gfs2_quota)) + return -1; + count = gfs2_writei(ip, buf, ip->i_di.di_size, sizeof(struct gfs2_quota)); + if (count != sizeof(struct gfs2_quota)) + return -1; + + if (cfg_debug) { + printf("\nRoot quota:\n"); + gfs2_quota_print(&qu); + } + + inode_put(&ip); + return 0; +} + +int build_root(struct gfs2_sbd *sdp) +{ + struct gfs2_inum inum; + uint64_t bn; + struct gfs2_buffer_head *bh = NULL; + int err = lgfs2_dinode_alloc(sdp, 1, &bn); + + if (err != 0) + return -1; + + inum.no_formal_ino = sdp->md.next_inum++; + inum.no_addr = bn; + + err = init_dinode(sdp, &bh, &inum, S_IFDIR | 0755, 0, &inum); + if (err != 0) + return -1; + + sdp->md.rooti = lgfs2_inode_get(sdp, bh); + if (sdp->md.rooti == NULL) + return -1; + + if (cfg_debug) { + printf("\nRoot directory:\n"); + gfs2_dinode_print(&sdp->md.rooti->i_di); + } + sdp->md.rooti->bh_owned = 1; + return 0; +} + +int do_init_inum(struct gfs2_sbd *sdp) +{ + struct gfs2_inode *ip = sdp->md.inum; + uint64_t buf; + int count; + + buf = cpu_to_be64(sdp->md.next_inum); + count = gfs2_writei(ip, &buf, 0, sizeof(uint64_t)); + if (count != sizeof(uint64_t)) + return -1; + + if (cfg_debug) + printf("\nNext Inum: %"PRIu64"\n", + sdp->md.next_inum); + return 0; +} + +int do_init_statfs(struct gfs2_sbd *sdp) +{ + struct gfs2_inode *ip = sdp->md.statfs; + struct gfs2_statfs_change sc; + char buf[sizeof(struct gfs2_statfs_change)]; + int count; + + sc.sc_total = sdp->blks_total; + sc.sc_free = 
sdp->blks_total - sdp->blks_alloced; + sc.sc_dinodes = sdp->dinodes_alloced; + + gfs2_statfs_change_out(&sc, buf); + count = gfs2_writei(ip, buf, 0, sizeof(struct gfs2_statfs_change)); + if (count != sizeof(struct gfs2_statfs_change)) + return -1; + + if (cfg_debug) { + printf("\nStatfs:\n"); + gfs2_statfs_change_print(&sc); + } + return 0; +} + +int gfs2_check_meta(struct gfs2_buffer_head *bh, int type) +{ + uint32_t check_magic = ((struct gfs2_meta_header *)(bh->b_data))->mh_magic; + uint32_t check_type = ((struct gfs2_meta_header *)(bh->b_data))->mh_type; + + check_magic = be32_to_cpu(check_magic); + check_type = be32_to_cpu(check_type); + if((check_magic != GFS2_MAGIC) || (type && (check_type != type))) + return -1; + return 0; +} + +unsigned lgfs2_bm_scan(struct rgrp_tree *rgd, unsigned idx, uint64_t *buf, uint8_t state) +{ + struct gfs2_bitmap *bi = &rgd->bits[idx]; + unsigned n = 0; + uint32_t blk = 0; + + while(blk < (bi->bi_len * GFS2_NBBY)) { + blk = gfs2_bitfit((uint8_t *)bi->bi_bh->b_data + bi->bi_offset, + bi->bi_len, blk, state); + if (blk == BFITNOENT) + break; + buf[n++] = blk + (bi->bi_start * GFS2_NBBY) + rgd->ri.ri_data0; + blk++; + } + + return n; +} diff --git a/gfs2/libgfs2/super.c b/gfs2/libgfs2/super.c new file mode 100644 index 0000000..6e7d8c2 --- /dev/null +++ b/gfs2/libgfs2/super.c @@ -0,0 +1,366 @@ +#include "clusterautoconfig.h" + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "libgfs2.h" +#include "osi_list.h" + +/** + * check_sb - Check superblock + * @sb: The superblock + * + * Checks the version code of the FS is one that we understand how to + * read and that the sizes of the various on-disk structures have not + * changed. 
+ * + * Returns: -1 on failure, 1 if this is gfs (gfs1), 2 if this is gfs2 + */ +int check_sb(struct gfs2_sb *sb) +{ + if (sb->sb_header.mh_magic != GFS2_MAGIC || + sb->sb_header.mh_type != GFS2_METATYPE_SB) { + errno = EIO; /* header is not a superblock header; the only path here that sets errno */ + return -1; + } + if (sb->sb_fs_format == GFS_FORMAT_FS && + sb->sb_header.mh_format == GFS_FORMAT_SB && + sb->sb_multihost_format == GFS_FORMAT_MULTI) { + return 1; /* all three format fields carry the gfs1 constants */ + } + return 2; /* everything else is reported as gfs2; the format values are not validated further here */ +} + + +/* + * read_sb: read the super block from disk + * sdp: in-core super block + * + * This function reads in the super block from disk and + * initializes various constants maintained in the super + * block + * + * Returns: 0 on success, -1 on failure + * sdp->gfs1 will be set if this is gfs (gfs1) + */ +int read_sb(struct gfs2_sbd *sdp) +{ + struct gfs2_buffer_head *bh; + uint64_t space = 0; + unsigned int x; + int ret; + + bh = bread(sdp, GFS2_SB_ADDR >> sdp->sd_fsb2bb_shift); /* uses the sd_fsb2bb_shift already stored in sdp; it is recomputed from the superblock below */ + gfs2_sb_in(&sdp->sd_sb, bh->b_data); /* NOTE(review): bh is dereferenced unchecked - confirm bread() cannot return NULL */ + brelse(bh); + + ret = check_sb(&sdp->sd_sb); + if (ret < 0) + return ret; + if (ret == 1) + sdp->gfs1 = 1; + sdp->sd_fsb2bb_shift = sdp->sd_sb.sb_bsize_shift - GFS2_BASIC_BLOCK_SHIFT; + sdp->bsize = sdp->sd_sb.sb_bsize; + if (sdp->bsize < 512 || sdp->bsize != (sdp->bsize & -sdp->bsize)) { /* block size must be a power of two and at least 512 */ + return -1; /* NOTE(review): errno is not set on this path */ + } + if (sdp->gfs1) { /* gfs1 and gfs2 subtract different header structures to get the pointer counts */ + sdp->sd_diptrs = (sdp->sd_sb.sb_bsize - + sizeof(struct gfs_dinode)) / + sizeof(uint64_t); + sdp->sd_inptrs = (sdp->sd_sb.sb_bsize - + sizeof(struct gfs_indirect)) / + sizeof(uint64_t); + } else { + sdp->sd_diptrs = (sdp->sd_sb.sb_bsize - + sizeof(struct gfs2_dinode)) / + sizeof(uint64_t); + sdp->sd_inptrs = (sdp->sd_sb.sb_bsize - + sizeof(struct gfs2_meta_header)) / + sizeof(uint64_t); + } + sdp->sd_jbsize = sdp->sd_sb.sb_bsize - sizeof(struct gfs2_meta_header); + sdp->sd_hash_bsize = sdp->bsize / 2; + sdp->sd_hash_bsize_shift = sdp->sd_sb.sb_bsize_shift - 1; + sdp->sd_hash_ptrs = sdp->sd_hash_bsize / sizeof(uint64_t); + sdp->sd_heightsize[0] = sdp->sd_sb.sb_bsize - sizeof(struct gfs2_dinode); + sdp->sd_heightsize[1] = 
sdp->sd_sb.sb_bsize * sdp->sd_diptrs; + for (x = 2; x < GFS2_MAX_META_HEIGHT; x++){ + space = sdp->sd_heightsize[x - 1] * sdp->sd_inptrs; + /* FIXME: Do we really need this first check?? */ + if (space / sdp->sd_inptrs != sdp->sd_heightsize[x - 1] || + space % sdp->sd_inptrs != 0) + break; /* the 64-bit product wrapped, so no taller tree is representable */ + sdp->sd_heightsize[x] = space; + } + if (x > GFS2_MAX_META_HEIGHT){ /* NOTE(review): never true; the loop above cannot leave x above GFS2_MAX_META_HEIGHT */ + errno = E2BIG; + return -1; + } + + sdp->sd_jheightsize[0] = sdp->sd_sb.sb_bsize - sizeof(struct gfs2_dinode); + sdp->sd_jheightsize[1] = sdp->sd_jbsize * sdp->sd_diptrs; + for (x = 2; ; x++){ /* NOTE(review): x is unbounded here; sd_jheightsize[x] is written before the height limit is tested below */ + space = sdp->sd_jheightsize[x - 1] * sdp->sd_inptrs; + if (space / sdp->sd_inptrs != sdp->sd_jheightsize[x - 1] || + space % sdp->sd_inptrs != 0) + break; + sdp->sd_jheightsize[x] = space; + } + sdp->sd_max_jheight = x; + if(sdp->sd_max_jheight > GFS2_MAX_META_HEIGHT) { + errno = E2BIG; + return -1; + } + sdp->fssize = lseek(sdp->device_fd, 0, SEEK_END) / sdp->sd_sb.sb_bsize; /* device size in filesystem blocks; the lseek() result is not checked for failure */ + sdp->sd_blocks_per_bitmap = (sdp->sd_sb.sb_bsize - sizeof(struct gfs2_meta_header)) + * GFS2_NBBY; + sdp->qcsize = GFS2_DEFAULT_QCSIZE; + + return 0; +} + +/* rgd_seems_sane - check some general things about the rindex entry + * + * If rg lengths are not consistent, it's not sane (or it's converted from + * gfs1). The first RG will be a different length due to space reserved for + * the superblock, so we can't detect this until we check rgrp 3, when we + * can compare the distance between rgrp 1 and rgrp 2. + * + * Returns: 1 if the rgd seems relatively sane + */ +static int rgd_seems_sane(struct gfs2_sbd *sdp, struct rgrp_tree *rgd) +{ + uint32_t most_bitmaps_possible; + + /* rg length must be at least 1 */ + if (rgd->ri.ri_length == 0) + return 0; + + /* A max rgrp, 2GB, divided into blocksize, divided by blocks/byte + represented in the bitmap, NBBY. Rough approximation only, due to + metadata headers. I'm doing the math this way to avoid overflow. 
+ */ + most_bitmaps_possible = (GFS2_MAX_RGSIZE * 1024 * 256) / sdp->bsize; + if (rgd->ri.ri_length > most_bitmaps_possible) + return 0; + + if (rgd->ri.ri_data0 != rgd->ri.ri_addr + rgd->ri.ri_length) /* data must start exactly ri_length blocks after ri_addr */ + return 0; + + if (rgd->ri.ri_bitbytes != rgd->ri.ri_data / GFS2_NBBY) /* bitmap byte count must equal the data block count divided by GFS2_NBBY */ + return 0; + + return 1; +} + +/* good_on_disk - check if the rindex points to what looks like an rgrp on disk + * + * This is only called when the rindex pointers aren't spaced evenly, which + * isn't often. The rindex is pointing to an unexpected location, so we + * check if the block it is pointing to is really an rgrp. If so, we count the + * rindex entry as "sane" (after all, it did pass the previous checks above.) + * If not, we count it as not sane, and therefore, the whole rindex is not to + * be trusted by fsck.gfs2. + */ +static int good_on_disk(struct gfs2_sbd *sdp, struct rgrp_tree *rgd) +{ + struct gfs2_buffer_head *bh; + int is_rgrp; + + bh = bread(sdp, rgd->ri.ri_addr); + is_rgrp = (gfs2_check_meta(bh, GFS2_METATYPE_RG) == 0); /* 1 when the block at ri_addr passes the rgrp metadata check, else 0 */ + brelse(bh); + return is_rgrp; +} + +/** + * rindex_read - read in the rg index file + * @sdp: the incore superblock pointer + * @fd: optional file handle for rindex file (if meta_fs file system is mounted) + * (if fd is <= zero, it will read from raw device) + * @count1: return count of the rgs. 
+ * @sane: return whether rindex is consistent + * + * Returns: 0 on success, -1 on failure + */ +int rindex_read(struct gfs2_sbd *sdp, int fd, uint64_t *count1, int *sane) +{ + unsigned int rg; + int error; + union { + struct gfs2_rindex bufgfs2; + } buf; + struct gfs2_rindex ri; + struct rgrp_tree *rgd = NULL, *prev_rgd = NULL; + uint64_t prev_length = 0; + + *sane = 1; + *count1 = 0; + if (!fd && sdp->md.riinode->i_di.di_size % sizeof(struct gfs2_rindex)) + *sane = 0; /* rindex file size must be a multiple of 96; NOTE(review): only checked when fd == 0, although a negative fd also reads via riinode below */ + for (rg = 0; ; rg++) { + if (fd > 0) + error = read(fd, &buf, sizeof(struct gfs2_rindex)); + else + error = gfs2_readi(sdp->md.riinode, + (char *)&buf.bufgfs2, + rg * sizeof(struct gfs2_rindex), + sizeof(struct gfs2_rindex)); + if (!error) /* nothing read: end of the rindex */ + break; + if (error != sizeof(struct gfs2_rindex)) /* short read or read error */ + return -1; + + gfs2_rindex_in(&ri, (char *)&buf.bufgfs2); + if (gfs2_check_range(sdp, ri.ri_addr) != 0) { + *sane = 0; + if (prev_rgd == NULL) /* no earlier rgrp to base a guess on, so the entry is skipped */ + continue; + ri.ri_addr = prev_rgd->ri.ri_addr + prev_rgd->length; /* out-of-range address: assume it directly follows the previous rgrp */ + } + rgd = rgrp_insert(&sdp->rgtree, ri.ri_addr); /* NOTE(review): rgd is used unchecked below - confirm rgrp_insert() cannot return NULL */ + memcpy(&rgd->ri, &ri, sizeof(struct gfs2_rindex)); + + rgd->start = rgd->ri.ri_addr; + if (prev_rgd) { + /* If rg addresses go backwards, it's not sane + (or it's converted from gfs1). 
+ */ + if (!sdp->gfs1) { + if (prev_rgd->start >= rgd->start) + *sane = 0; + else if (!rgd_seems_sane(sdp, rgd)) + *sane = 0; + else if (*sane && rg > 2 && prev_length && + prev_length != rgd->start - + prev_rgd->start) + *sane = good_on_disk(sdp, rgd); /* uneven spacing: trust the entry only if an rgrp header is really there */ + } + prev_length = rgd->start - prev_rgd->start; + prev_rgd->length = rgrp_size(prev_rgd); + } + + if(gfs2_compute_bitstructs(sdp->sd_sb.sb_bsize, rgd)) + *sane = 0; + + (*count1)++; + prev_rgd = rgd; + } + if (prev_rgd) + prev_rgd->length = rgrp_size(prev_rgd); + if (*count1 == 0) /* an rindex that yielded no entries counts as a failure */ + return -1; + return 0; +} + +#define RA_WINDOW 32 /* upper bound on the tree nodes visited per gfs2_rgrp_reada() call */ + +static unsigned gfs2_rgrp_reada(struct gfs2_sbd *sdp, unsigned cur_window, + struct osi_node *n) +{ + struct rgrp_tree *rgd; + unsigned i; + off_t start, len; + + for (i = 0; i < RA_WINDOW; i++, n = osi_next(n)) { + if (n == NULL) /* ran off the end of the tree: report how many nodes were covered */ + return i; + if (i < cur_window) /* presumably already advised by an earlier call - TODO confirm */ + continue; + rgd = (struct rgrp_tree *)n; + start = rgd->ri.ri_addr * sdp->bsize; + len = rgd->ri.ri_length * sdp->bsize; + posix_fadvise(sdp->device_fd, start, len, POSIX_FADV_WILLNEED); /* advisory only; the return value is ignored */ + } + + return i; +} + +/** + * __ri_update - attach rgrps to the super block + * @sdp: incore superblock data + * @fd: optional file handle for rindex (through the meta_fs) + * @rgcount: returned count of rgs + * + * Given the rgrp index inode, link in all rgrps into the super block + * and be sure that they can be read. + * + * Returns: 0 on success, -1 on failure. 
+ */ +static int __ri_update(struct gfs2_sbd *sdp, int fd, int *rgcount, int *sane, + int quiet) +{ + struct rgrp_tree *rgd; + struct gfs2_rindex *ri; + uint64_t count1 = 0, count2 = 0; + uint64_t errblock = 0; + uint64_t rmax = 0; + struct osi_node *n, *next = NULL; + unsigned ra_window = 0; + + /* Turn off generic readahead */ + posix_fadvise(sdp->device_fd, 0, 0, POSIX_FADV_RANDOM); + + if (rindex_read(sdp, fd, &count1, sane)) + goto fail; + for (n = osi_first(&sdp->rgtree); n; n = next) { + next = osi_next(n); + rgd = (struct rgrp_tree *)n; + /* Top up the readahead window once it drops below half of RA_WINDOW */ + if (ra_window < RA_WINDOW/2) + ra_window = gfs2_rgrp_reada(sdp, ra_window, n); + /* Read resource group header */ + errblock = gfs2_rgrp_read(sdp, rgd); + if (errblock) + return errblock; /* NOTE(review): bypasses the fail: cleanup below and returns a uint64_t through an int, not the documented -1 */ + ra_window--; + count2++; + if (!quiet && count2 % 100 == 0) { /* one progress dot per 100 rgrps read */ + printf("."); + fflush(stdout); + } + ri = &rgd->ri; + if (ri->ri_data0 + ri->ri_data - 1 > rmax) + rmax = ri->ri_data0 + ri->ri_data - 1; /* highest data block address seen so far */ + } + + sdp->fssize = rmax; + *rgcount = count1; + if (count1 != count2) /* entries indexed and rgrps actually read must agree */ + goto fail; + + posix_fadvise(sdp->device_fd, 0, 0, POSIX_FADV_NORMAL); + return 0; + + fail: + posix_fadvise(sdp->device_fd, 0, 0, POSIX_FADV_NORMAL); + gfs2_rgrp_free(&sdp->rgtree); + return -1; +} + +int ri_update(struct gfs2_sbd *sdp, int fd, int *rgcount, int *sane) +{ + return __ri_update(sdp, fd, rgcount, sane, 1); /* quiet is fixed to 1: no progress dots */ +} + +/** + * gfs1_ri_update - attach rgrps to the super block + * Stolen from libgfs2/super.c, but modified to handle gfs1. + * @sdp: incore superblock data + * + * Given the rgrp index inode, link in all rgrps into the super block + * and be sure that they can be read. + * + * Returns: 0 on success, -1 on failure. 
+ */ +int gfs1_ri_update(struct gfs2_sbd *sdp, int fd, int *rgcount, int quiet) +{ + int sane; /* filled in by __ri_update but never read here */ + + return __ri_update(sdp, fd, rgcount, &sane, quiet); +} diff --git a/gfs2/man/Makefile.am b/gfs2/man/Makefile.am new file mode 100644 index 0000000..b6cc120 --- /dev/null +++ b/gfs2/man/Makefile.am @@ -0,0 +1,14 @@ +MAINTAINERCLEANFILES = Makefile.in + +dist_man_MANS = \ + fsck.gfs2.8 \ + gfs2.5 \ + gfs2_convert.8 \ + gfs2_edit.8 \ + gfs2_grow.8 \ + gfs2_jadd.8 \ + mkfs.gfs2.8 \ + tunegfs2.8 \ + gfs2_lockcapture.8 \ + gfs2_trace.8 \ + glocktop.8 diff --git a/gfs2/man/fsck.gfs2.8 b/gfs2/man/fsck.gfs2.8 new file mode 100644 index 0000000..b2b326f --- /dev/null +++ b/gfs2/man/fsck.gfs2.8 @@ -0,0 +1,88 @@ +.TH fsck.gfs2 8 + +.SH NAME +fsck.gfs2 - Offline GFS and GFS2 file system checker + +.SH SYNOPSIS +.B fsck.gfs2 +[\fIOPTION\fR]... \fIDEVICE\fR + +.SH WARNING +All computers \fImust\fP have the filesystem unmounted before running +fsck.gfs2. Failure to unmount from all nodes in a cluster will likely result +in filesystem corruption. + +.SH DESCRIPTION +fsck.gfs2 will check that the GFS or GFS2 file system on a device is structurally valid. +It should not be run on a mounted file system. If file system corruption is +detected, it will attempt to repair the file system. There is a limit to what +fsck.gfs2 can do. If important file system structures are destroyed, such that +the checker cannot determine what the repairs should be, reparations could +fail. + +GFS2 is a journaled file system, and as such should be able to repair damage to +the file system on its own. However, faulty hardware has the ability to write +incomplete blocks to a file system thereby causing corruption that GFS2 cannot +fix. The first step to ensuring a healthy file system is the selection of +reliable hardware (i.e. storage systems that will write complete blocks - even +in the event of power failure). + +Note: Most file system checkers will not check the file system if it is +"clean" (i.e. 
unmounted since the last use). The fsck.gfs2 program behaves +differently because the storage may be shared among several nodes in a +cluster, and therefore problems may have been introduced on a different +computer. Therefore, fsck.gfs2 will always check the file system unless +the -p (preen) option is used, in which case it follows special rules +(see below). + +fsck.gfs2 will log to the system log on start and exit to aid debugging and +administration. +.SH OPTIONS +.TP +\fB-a\fP +Same as the \fB-p\fP (preen) option. +.TP +\fB-f\fP +Force checking even if the file system seems clean. +.TP +\fB-h\fP +Help. + +This prints out the proper command line usage syntax. +.TP +\fB-q\fP +Quiet. +.TP +\fB-n\fP +No to all questions. By specifying this option, fsck.gfs2 will only show the changes that +would be made, but not make any changes to the filesystem. + +This option may not be used with the \fB-y\fP or \fB-p\fP/\fB-a\fP options. +.TP +\fB-p\fP +Automatically repair ("preen") the file system if it is dirty and safe to do so, +otherwise exit. + +If the file system has locking protocol \fIlock_nolock\fR, it is considered a +non-shared storage device and it is considered safe. If the locking protocol +is lock_dlm and \fB-a\fP or \fB-p\fP was specified, the check is considered unsafe as it +cannot be determined whether the device is mounted by other nodes in the cluster. +In this case a warning is given if any damage or dirty journals are found. The +file system should then be unmounted from all nodes in the cluster and +fsck.gfs2 should be run manually without the \fB-a\fP or \fB-p\fP options. + +This option may not be used with the \fB-n\fP or \fB-y\fP options. +.TP +\fB-V\fP +Print out the program version information. +.TP +\fB-v\fP +Verbose operation. + +Print more information while running. +.TP +\fB-y\fP +Yes to all questions. By specifying this option, fsck.gfs2 will not prompt before making +changes. 
+ +This option may not be used with the \fB-n\fP or \fB-p\fP/\fB-a\fP options. diff --git a/gfs2/man/gfs2.5 b/gfs2/man/gfs2.5 new file mode 100644 index 0000000..56d1a00 --- /dev/null +++ b/gfs2/man/gfs2.5 @@ -0,0 +1,419 @@ +.TH gfs2 5 + +.SH NAME +gfs2 \- GFS2 reference guide + +.SH SYNOPSIS +Overview of the GFS2 filesystem + +.SH DESCRIPTION + +GFS2 is a clustered filesystem, designed for sharing data between +multiple nodes +connected to a common shared storage device. It can also be used as a +local filesystem on a single node, however since the design is aimed +at clusters, that will usually result in lower performance than using +a filesystem designed specifically for single node use. + +GFS2 is a journaling filesystem and one journal is required for each node +that will mount the filesystem. The one exception to that is spectator +mounts which are equivalent to mounting a read-only block device and as +such can neither recover a journal or write to the filesystem, so do not +require a journal assigned to them. + +.SH MOUNT OPTIONS + +.TP +\fBlockproto=\fP\fILockProtoName\fR +This specifies which inter-node lock protocol is used by the GFS2 filesystem +for this mount, overriding the default lock protocol name stored in the +filesystem's on-disk superblock. + +The \fILockProtoName\fR must be one of the supported locking protocols, +currently these are \fIlock_nolock\fR and \fIlock_dlm\fR. + +The default lock protocol name is written to disk initially when creating the +filesystem with \fBmkfs.gfs2\fP(8), -p option. It can be changed on-disk by +using the \fBgfs2_tool\fP(8) utility's \fBsb proto\fP command. + +The \fBlockproto\fP mount option should be used only under special +circumstances in which you want to temporarily use a different lock protocol +without changing the on-disk default. Using the incorrect lock protocol +on a cluster filesystem mounted from more than one node will almost +certainly result in filesystem corruption. 
+.TP +\fBlocktable=\fP\fILockTableName\fR +This specifies the identity of the cluster and of the filesystem for this +mount, overriding the default cluster/filesystem identity stored in the +filesystem's on-disk superblock. The cluster/filesystem name is recognized +globally throughout the cluster, and establishes a unique namespace for +the inter-node locking system, enabling the mounting of multiple GFS2 +filesystems. + +The format of \fILockTableName\fR is lock-module-specific. For +\fIlock_dlm\fR, the format is \fIclustername:fsname\fR. For +\fIlock_nolock\fR, the field is ignored. + +The default cluster/filesystem name is written to disk initially when creating +the filesystem with \fBmkfs.gfs2\fP(8), -t option. It can be changed on-disk +by using the \fBgfs2_tool\fP(8) utility's \fBsb table\fP command. + +The \fBlocktable\fP mount option should be used only under special +circumstances in which you want to mount the filesystem in a different cluster, +or mount it as a different filesystem name, without changing the on-disk +default. +.TP +\fBlocalflocks\fP +This flag tells GFS2 that it is running as a local (not clustered) filesystem, +so it can allow the kernel VFS layer to do all flock and fcntl file locking. +When running in cluster mode, these file locks require inter-node locks, +and require the support of GFS2. When running locally, better performance +is achieved by letting VFS handle the whole job. + +This is turned on automatically by the lock_nolock module. +.TP +\fBerrors=\fP\fI[panic|withdraw]\fR +Setting errors=panic causes GFS2 to oops when encountering an error that +would otherwise cause the +mount to withdraw or print an assertion warning. The default setting +is errors=withdraw. This option should not be used in a production system. +It replaces the earlier \fBdebug\fP option on kernel versions 2.6.31 and +above. +.TP +\fBacl\fP +Enables POSIX Access Control List \fBacl\fP(5) support within GFS2. 
+.TP +\fBspectator\fP +Mount this filesystem using a special form of read-only mount. The mount +does not use one of the filesystem's journals. The node is unable to +recover journals for other nodes. +.TP +\fBnorecovery\fP +A synonym for \fBspectator\fP. +.TP +\fBsuiddir\fP +Sets owner of any newly created file or directory to be that of parent +directory, if parent directory has S_ISUID permission attribute bit set. +Sets S_ISUID in any new directory, if its parent directory's S_ISUID is set. +Strips all execution bits on a new file, if parent directory owner is different +from owner of process creating the file. Set this option only if you know +why you are setting it. +.TP +\fBquota=\fP\fI[off/account/on]\fR +Turns quotas on or off for a filesystem. Setting the quotas to be in +the "account" state causes the per UID/GID usage statistics to be +correctly maintained by the filesystem, but limit and warn values are +ignored. The default value is "off". +.TP +\fBdiscard\fP +Causes GFS2 to generate "discard" I/O requests for blocks which have +been freed. These can be used by suitable hardware to implement +thin-provisioning and similar schemes. This feature is supported +in kernel version 2.6.30 and above. +.TP +\fBbarrier\fP +This option, which defaults to on, causes GFS2 to send I/O barriers +when flushing the journal. The option is automatically turned off +if the underlying device does not support I/O barriers. We highly +recommend the use of I/O barriers with GFS2 at all times unless +the block device is designed so that it cannot lose its write cache +content (e.g. it's on a UPS, or it doesn't have a write cache). +.TP +\fBcommit=\fP\fIsecs\fR +This is similar to the ext3 \fBcommit=\fP option in that it sets +the maximum number of seconds between journal commits if there is +dirty data in the journal. The default is 60 seconds. This option +is only provided in kernel versions 2.6.31 and above. 
+.TP +\fBdata=\fP\fI[ordered|writeback]\fR +When data=ordered is set, the user data modified by a transaction is +flushed to the disk before the transaction is committed to disk. This +should prevent the user from seeing uninitialized blocks in a file +after a crash. Data=writeback mode writes the user data to the disk +at any time after it's dirtied. This doesn't provide the same +consistency guarantee as ordered mode, but it should be slightly +faster for some workloads. The default is ordered mode. +.TP +\fBmeta\fP +This option results in selecting the meta filesystem root rather than +the normal filesystem root. This option is normally only used by +the GFS2 utility functions. Altering any file on the GFS2 meta filesystem +may render the filesystem unusable, so only experts in the GFS2 +on-disk layout should use this option. +.TP +\fBquota_quantum=\fP\fIsecs\fR +This sets the number of seconds for which a change in the quota +information may sit on one node before being written to the quota +file. This is the preferred way to set this parameter. The value +is an integer number of seconds greater than zero. The default is +60 seconds. Shorter settings result in faster updates of the lazy +quota information and less likelihood of someone exceeding their +quota. Longer settings make filesystem operations involving quotas +faster and more efficient. +.TP +\fBstatfs_quantum=\fP\fIsecs\fR +Setting statfs_quantum to 0 is the preferred way to set the slow version +of statfs. The default value is 30 secs which sets the maximum time +period before statfs changes will be synced to the master statfs file. +This can be adjusted to allow for faster, less accurate statfs values +or slower, more accurate values. When set to 0, statfs will always +report the true values. 
+.TP +\fBstatfs_percent=\fP\fIvalue\fR +This setting provides a bound on the maximum percentage change in +the statfs information on a local basis before it is synced back +to the master statfs file, even if the time period has not +expired. If the setting of statfs_quantum is 0, then this setting +is ignored. +.TP +\fBrgrplvb\fP +This flag tells gfs2 to look for information about a resource group's free +space and unlinked inodes in its glock lock value block. This keeps gfs2 from +having to read in the resource group data from disk, speeding up allocations in +some cases. This option was added in the 3.6 Linux kernel. Prior to this +kernel, no information was saved to the resource group lvb. \fBNote:\fP To +safely turn on this option, all nodes mounting the filesystem must be running +at least a 3.6 Linux kernel. If any nodes had previously mounted the filesystem +using older kernels, the filesystem must be unmounted on all nodes before it +can be mounted with this option enabled. This option does not need to be +enabled on all nodes using a filesystem. +.TP +\fBloccookie\fP +This flag tells gfs2 to use location based readdir cookies, instead of its +usual filename hash readdir cookies. The filename hash cookies are not +guaranteed to be unique, and as the number of files in a directory increases, +so does the likelihood of a collision. NFS requires readdir cookies to be +unique, which can cause problems with very large directories (over 100,000 +files). With this flag set, gfs2 will try to give out location based cookies. +Since the cookie is 31 bits, gfs2 will eventually run out of unique cookies, +and will fall back to using hash cookies. The maximum number of files that +could have unique location cookies assuming perfectly even hashing and names of +8 or fewer characters is 1,073,741,824. An average directory should be able to +give out well over half a billion location based cookies. This option was added +in the 4.5 Linux kernel. 
Prior to this kernel, gfs2 did not add directory +entries in a way that allowed it to use location based readdir cookies. +\fBNote:\fP To safely turn on this option, all nodes mounting the filesystem +must be running at least a 4.5 Linux kernel. If this option is only enabled on +some of the nodes mounting a filesystem, the cookies returned by nodes using +this option will not be valid on nodes that are not using this option, and vice +versa. Finally, when first enabling this option on a filesystem that had been +previously mounted without it, you must make sure that there are no outstanding +cookies being cached by other software, such as NFS. + +.SH BUGS + +GFS2 doesn't support \fBerrors=\fP\fIremount-ro\fR or \fBdata=\fP\fIjournal\fR. +It is not possible to switch support for user and group quotas on and +off independently of each other. Some of the error messages are rather +cryptic, if you encounter one of these messages check firstly that gfs_controld +is running and secondly that you have enough journals on the filesystem +for the number of nodes in use. + +.SH SEE ALSO + +\fBmount\fP(8) for general mount options, +\fBchmod\fP(1) and \fBchmod\fP(2) for access permission flags, +\fBacl\fP(5) for access control lists, +\fBlvm\fP(8) for volume management, +\fBccs\fP(7) for cluster management, +\fBumount\fP(8), +\fBinitrd\fP(4). + +The GFS2 documentation has been split into a number of sections: + +\fBgfs2_edit\fP(8) A GFS2 debug tool (use with caution) +\fBfsck.gfs2\fP(8) The GFS2 file system checker +\fBgfs2_grow\fP(8) Growing a GFS2 file system +\fBgfs2_jadd\fP(8) Adding a journal to a GFS2 file system +\fBmkfs.gfs2\fP(8) Make a GFS2 file system +\fBgfs2_quota\fP(8) Manipulate GFS2 disk quotas +\fBgfs2_tool\fP(8) Tool to manipulate a GFS2 file system (obsolete) +\fBtunegfs2\fP(8) Tool to manipulate GFS2 superblocks + +.SH SETUP + +GFS2 clustering is driven by the dlm, which depends on dlm_controld to +provide clustering from userspace. 
dlm_controld clustering is built on +corosync cluster/group membership and messaging. + +Follow these steps to manually configure and run gfs2/dlm/corosync. + +.B 1. create /etc/corosync/corosync.conf and copy to all nodes + +In this sample, replace cluster_name and IP addresses, and add nodes as +needed. If using only two nodes, uncomment the two_node line. +See corosync.conf(5) for more information. + +.nf +totem { + version: 2 + secauth: off + cluster_name: abc +} + +nodelist { + node { + ring0_addr: 10.10.10.1 + nodeid: 1 + } + node { + ring0_addr: 10.10.10.2 + nodeid: 2 + } + node { + ring0_addr: 10.10.10.3 + nodeid: 3 + } +} + +quorum { + provider: corosync_votequorum +# two_node: 1 +} + +logging { + to_syslog: yes +} +.fi + +.PP + +.B 2. start corosync on all nodes + +.nf +systemctl start corosync +.fi + +Run corosync-quorumtool to verify that all nodes are listed. + +.PP + +.B 3. create /etc/dlm/dlm.conf and copy to all nodes + +.B * +To use no fencing, use this line: + +.nf +enable_fencing=0 +.fi + +.B * +To use no fencing, but exercise fencing functions, use this line: + +.nf +fence_all /bin/true +.fi + +The "true" binary will be executed for all nodes and will succeed (exit 0) +immediately. + +.B * +To use manual fencing, use this line: + +.nf +fence_all /bin/false +.fi + +The "false" binary will be executed for all nodes and will fail (exit 1) +immediately. + +When a node fails, manually run: dlm_tool fence_ack + +.B * +To use stonith/pacemaker for fencing, use this line: + +.nf +fence_all /usr/sbin/dlm_stonith +.fi + +The "dlm_stonith" binary will be executed for all nodes. If +stonith/pacemaker systems are not available, dlm_stonith will fail and +this config becomes the equivalent of the previous /bin/false config. 
+ +.B * +To use an APC power switch, use these lines: + +.nf +device apc /usr/sbin/fence_apc ipaddr=1.1.1.1 login=admin password=pw +connect apc node=1 port=1 +connect apc node=2 port=2 +connect apc node=3 port=3 +.fi + +Other network switch based agents are configured similarly. + +.B * +To use sanlock/watchdog fencing, use these lines: + +.nf +device wd /usr/sbin/fence_sanlock path=/dev/fence/leases +connect wd node=1 host_id=1 +connect wd node=2 host_id=2 +unfence wd +.fi + +See fence_sanlock(8) for more information. + +.B * +For other fencing configurations see dlm.conf(5) man page. + +.PP + +.B 4. start dlm_controld on all nodes + +.nf +systemctl start dlm +.fi + +Run "dlm_tool status" to verify that all nodes are listed. + +.PP + +.B 5. if using clvm, start clvmd on all nodes + +systemctl start clvmd + +.PP + +.B 6. make new gfs2 file systems + +mkfs.gfs2 -p lock_dlm -t cluster_name:fs_name -j num /path/to/storage + +The cluster_name must match the name used in step 1 above. +The fs_name must be a unique name in the cluster. +The -j option is the number of journals to create; there must +be one for each node that will mount the fs. + +.PP + +.B 7. mount gfs2 file systems + +mount /path/to/storage /mountpoint + +Run "dlm_tool ls" to verify the nodes that have each fs mounted. + +.PP + +.B 8. shut down + +.nf +umount -a -t gfs2 +systemctl stop clvmd +systemctl stop dlm +systemctl stop corosync +.fi + +.PP + +.B More setup information: +.br +.BR dlm_controld (8), +.br +.BR dlm_tool (8), +.br +.BR dlm.conf (5), +.br +.BR corosync (8), +.br +.BR corosync.conf (5) +.br diff --git a/gfs2/man/gfs2_convert.8 b/gfs2/man/gfs2_convert.8 new file mode 100644 index 0000000..ba70ee1 --- /dev/null +++ b/gfs2/man/gfs2_convert.8 @@ -0,0 +1,68 @@ +.TH gfs2_convert 8 + +.SH NAME +gfs2_convert - Convert a GFS1 filesystem to GFS2 + +.SH SYNOPSIS +.B gfs2_convert +[\fIOPTION\fR]... \fIDEVICE\fR + +.SH DESCRIPTION +gfs2_convert is used to convert a filesystem from GFS1 to GFS2. 
It is +required that the GFS1 filesystem be checked and fixed for errors using +\fBfsck.gfs2\fP and that the filesystem be backed up before +attempting to convert it. The convert process is irreversible and any +error encountered during the conversion can result in the abrupt +termination of the program and consequently an unusable filesystem. +See \fBNOTES\fP section below for more information. + +.SH OPTIONS +.TP +\fB-h\fP +Help. + +This prints out the proper command line usage syntax. +.TP +\fB-q\fP +Quiet. Print less information while running. +.TP +\fB-n\fP +No to all questions. +.TP +\fB-V\fP +Print program Version information only. + +Print out the current version name. +.TP +\fB-v\fP +Verbose operation. + +Print more information while running. +.TP +\fB-y\fP +Yes to all questions. + +By specifying this option, gfs2_convert will not prompt before making +changes. + +.SH EXAMPLE +.TP +gfs2_convert /dev/vg0/lvol0 +This will convert the Global File System on the block device +"/dev/vg0/lvol0" to gfs2 format. + +.SH NOTES +If gfs2_convert is interrupted for some reason other than a conversion +failure, DO NOT run \fBfsck.gfs2\fP on this partially converted filesystem. +When this occurs, reissue the gfs2_convert command on the partially converted +filesystem to complete the conversion process. + +The GFS2 filesystem does not support Context-Dependent Path Names (CDPNs). +gfs2_convert identifies such CDPNs and replaces them with empty directories +with the same name. The administrator can use bind mounts on these +directories to get the same effect as CDPNs. + +When converting full or nearly full filesystems, it is possible that there +won't be enough space available to fit all the GFS2 filesystem data +structures. In such cases, the size of all the journals is reduced uniformly +such that everything fits in the available space. 
diff --git a/gfs2/man/gfs2_edit.8 b/gfs2/man/gfs2_edit.8 new file mode 100644 index 0000000..1a5ccac --- /dev/null +++ b/gfs2/man/gfs2_edit.8 @@ -0,0 +1,417 @@ +.TH gfs2_edit 8 + +.SH NAME +gfs2_edit - Display, print or edit GFS2 or GFS internal structures. + +.SH SYNOPSIS +.B gfs2_edit +[\fIOPTION\fR]... [\fIDEVICE\fR] + +.SH DESCRIPTION +The gfs2_edit command is a tool used to examine, edit or +display internal data structures of a GFS2 or GFS file system. +The gfs2_edit command can be run interactively, as described +below in INTERACTIVE MODE. + +Caution: Several options of the gfs2_edit command alter the +file system metadata and can cause file system corruption. +These options should be used with great care. + +.SH OPTIONS +.TP +\fB-p\fP [\fIstruct\fR | \fIblock\fR] [\fIblocktype\fR] [\fIblockalloc [val]\fR] [\fIblockbits\fR] [\fIblockrg\fR] [\fIfind sb|rg|rb|di|in|lf|jd|lh|ld|ea|ed|lb|13|qc\fR] [\fIfield [val]\fR] +Print a gfs2 data structure in human-readable format to stdout. +You can enter either a block number or a data structure name. Block numbers +may be specified in hex (e.g., 0x10) or decimal (e.g., 16). + +You can specify the following well-known locations with the -p option. + +\fIsb\fR, \fIsuperblock\fR - Print the superblock. + +\fIroot\fR - Print the root directory. + +\fImaster\fR - Print the master system directory. + +\fIjindex\fR - Print the journal index system directory. + +\fIper_node\fR - Print the per_node system directory. + +\fIinum\fR - Print the system inum file. + +\fIstatfs\fR - Print the system statfs file. + +\fIrindex\fR, \fIrgindex\fR - Print the resource group index system file. + +\fIrg X\fR - Print the resource group information for RG X (zero-based). + +\fIrgs\fR - Print the resource group information. + +\fIquota\fR - Print the contents of the system quota file. + +\fIidentify\fR - Identify a data block rather than print the block's contents. + +\fIsize\fR - Print the device size information. 
+ +\fIjournalX\fR - Print the contents of journal X, where X is a journal +number from 0 to one less than the number of journals in the file system. +Only the journal headers and journal descriptors are dumped. For journal +descriptors, this option prints out every file system block number logged +in that section of the journal. The actual journaled blocks are not printed. + +If you specify a block number rather than a structure name, gfs2_edit will +print out a breakdown of the structure for that block. +For example: \fBgfs2_edit -p sb\fP will print the superblock, but so do +\fBgfs2_edit -p 0x10\fP and \fBgfs2_edit -p 16\fP. + +If you specify -p without a block or structure name, gfs2_edit prints the +superblock. + +You can specify more than one data structure with a single -p option. +For example, \fBgfs2_edit -p inum statfs /dev/sda1\fP prints the system inum +file and the system statfs file on /dev/sda1. + +Optionally, you may specify the keyword \fIblocktype\fR to print out the +gfs2 block type for the specified block. Valid gfs2 block types are: +0 (Clump), 1 (Superblock), 2 (Resource Group Header), 3 (Resource Group +Bitmap), 4 (Dinode), 5 (Indirect Block), 6 (Leaf), 7 (Journaled data), +8 (Log Header), 9 (Log descriptor), 10 (Extended attribute), +11 (Eattr Data), 12 (Log Buffer), 13 (Invalid), and 14 (Quota Change). + +Optionally, you may specify the keyword \fIblockalloc\fR with an +optional value to assign. If no value is specified, the blockalloc +keyword will print the block allocation type for the specified block. +Valid block allocation types are: 0 (Free block), 1 (Data block), +2 (Unlinked block), and 3 (Metadata block). If a value from 0 to 3 is +specified, the resource group bitmap will be changed to the new value. +This may be used, for example, to artificially free or allocate a block +in order to test fsck.gfs2's ability to detect and fix the problem. + +Optionally, you may specify the keyword \fIblockbits\fR. 
This option +will locate and print the block containing the bitmap corresponding to +the specified block. + +Optionally, you may specify the keyword \fIblockrg\fR. This option +will locate and print the block number of the resource group that holds +information about the specified block. + +You may also use gfs2_edit to find the next occurrence of a metadata +block of a certain type. Valid metadata types are: \fInone\fR (unused +metadata clump block), \fIsb\fR (superblock), \fIrg\fR (resource group), +\fIrb\fR (rg bitmap), \fIdi\fR (disk inode aka dinode), \fIin\fR (indirect +block list), \fIlf\fR (directory leaf), \fIjd\fR (journaled data), +\fIlh\fR (journal log header), \fIld\fR (journal log descriptor), +\fIea\fR (extended attribute), \fIed\fR (ea data block), \fIlb\fR (log buffer), +\fI13\fR (unused block type 13), \fIqc\fR (quota change). +The block AFTER the one specified with -p is the starting point for +the search. For example, if you specify \fBgfs2_edit -p rg 12 find rg +/dev/your/device\fP, it will find the rg that follows rg 12 (normally, +this would be rg 13). Note, however, that since metadata often appears +in the journals, it could be a copy of a different RG, inside a journal. +Also note that gfs2_edit will only find \fBallocated\fR metadata blocks +unless the type specified is none, sb, rg or rb. In other words, if you +try to find a disk inode, it will only find an allocated dinode, not a +deallocated one. + +Optionally, you may specify the keyword \fIfield\fR followed by a +valid metadata field name. Right now, only the fields in disk inodes +and resource groups are allowed. If no value is specified after the +field, the value of the field will be printed to stdout. If a value +is specified, the field's value will be changed. This may be used, +for example, to artificially change the di_size field for an inode +in order to test fsck.gfs2's ability to detect and fix the problem. 
+ +.TP +\fB-s\fP [\fIstructure\fR | \fIblock\fR] +Specify a starting block for interactive mode. Any of the well-known +locations found in the -p option may be specified. If you want to start +on a particular resource group, specify it in quotes, e.g. -s "rg 3" +.TP +\fB-h, -help, -usage\fP +Print help information. +.TP +\fB-c\fP [\fI0\fR | \fI1\fR] +Use alternate color scheme for interactive mode: 0=normal (dark colors on +white background), or 1 (light colors on black background). +.TP +\fB-V\fP +Print program version information only. +.TP +\fB-x\fP +Print in hex mode. +.TP +\fB-z <0-9>\fP +Compress metadata with gzip compression level 1 to 9 (default 9). 0 means no compression at all. +.TP +\fBrg\fP \fIrgnum\fR \fIdevice\fR +Print the contents of Resource Group \fIrgnum\fR on \fIdevice\fR. + +\fIrgnum\fR is a number from 0 to X - 1, where X is the number of RGs. +.TP +\fBrgcount\fP \fIdevice\fR +Print the number of Resource Groups in the file system on \fIdevice\fR. +.TP +\fBrgflags\fP \fIrgnum\fR [\fInew_value\fR] \fIdevice\fR +Print and/or modify the rg_flags value of Resource Group \fIrgnum\fR on +\fIdevice\fR. + +\fIrgnum\fR is a number from 0 to X - 1, where X is the number of RGs. +If \fInew_value\fR is not specified, the current rg_flags value will be +printed but not modified. If \fInew_value\fR is specified, the rg_flags +field will be overwritten with the new value. +.TP +\fBprintsavedmeta\fP \fIfilename\fR +Print off a list of blocks from \fIfilename\fR that were saved with the savemeta +option. +.TP +\fBsavemeta\fP \fIdevice\fR \fIfilename\fR +Save off the GFS2 metadata (not user data) for the file system on the +specified device to a file given by \fIfilename\fR. You can use this option +to analyze file system problems without revealing sensitive information +that may be contained in the files. This option works quickly by +using the system bitmap blocks in the resource groups to determine the +location of all the metadata.
If there is corruption +in the bitmaps, resource groups or rindex file, this method may fail and +you may need to use the savemetaslow option. The destination file is +compressed using gzip unless -z 0 is specified. +.TP +\fBsavemetaslow\fP \fIdevice\fR \fIfilename\fR +Save off GFS2 metadata, as with the savemeta option, examining every +block in the file system for metadata. This option is less prone to failure +due to file system corruption than the savemeta option, but it is +extremely slow. The destination file is compressed using gzip unless +-z 0 is specified. +.TP +\fBsavergs\fP \fIdevice\fR \fIfilename\fR +Save off only the GFS2 resource group metadata for the file system on the +specified device to a file given by \fIfilename\fR. The destination file is +compressed using gzip unless -z 0 is specified. +.TP +\fBrestoremeta\fP \fIfilename\fR \fIdest_device\fR +Take a compressed or uncompressed file created with the savemeta option and +restore its contents on top of the specified destination device. +\fBWARNING\fP: When you use this option, the file system and all data on the +destination device are destroyed. Since only metadata (but no data) is +restored, every file in the resulting file system is likely to be corrupt. The +ONLY purpose of this option is to examine and debug file system problems by +restoring and examining the state of the saved metadata. If the destination +file system is the same size or larger than the source file system where the +metadata was saved, the resulting file system will be the same size as the +source. If the destination device is smaller than the source file system, +gfs2_edit will restore as much as it can, then quit, leaving you with a file +system that probably will not mount, but from which you might still be able to +figure out what is wrong with the source file system.
+ +.SH INTERACTIVE MODE +If you specify a device on the gfs2_edit command line and you specify +no options other than -c, gfs2_edit will act as an interactive GFS2 +file system editor for the file system you specify. There +are three display modes: hex mode, structure mode and pointers mode. +You use the m key to switch between the modes, as described below. +The modes are as follows: +.TP +Hex mode (default) +Display or edit blocks of the file system in hexadecimal and ASCII. + +Lines at the top indicate the currently displayed block in both hex and +decimal. If the block contains a GFS2 data structure, the name of that +structure will appear in the upper right corner of the display. +If the block is a well-known block, such as the superblock or rindex, +there will be a line to indicate what it is. + +In hex mode, you can edit blocks by pressing \fBEnter\fP and entering +hexadecimal digits to replace the highlighted hex digits. Do NOT precede +the numbers with "0x". For example, if you want to change the value at +offset 0x60 from a 0x12 to 0xef, position your cursor to offset 0x60, +so that the 12 is highlighted, then press \fBEnter\fP and type in "ef". +Press \fBEsc\fP or \fBEnter\fP to exit edit mode. + +In hex mode, different colors indicate different things. +For example, in the default color scheme, the GFS2 data structure will +be black, data offsets will be light blue, and actual data (anything after +the gfs2 data structure) will be red. + +.TP +Structure mode +Decode the file system block into its GFS2 structure and +display the values of that structure. This mode is most useful for +jumping around the file system. For example, you can use the arrow +keys to position down to a pointer and press \fBJ\fP to jump to that block. + +.TP +Pointers mode +Display any additional information appearing on the block. +For example, if an inode has block pointers, this will display them and +allow you to scroll through them.
You can also position to one of them +and press \fBJ\fP to jump to that block. + +.SH Interactive mode command keys: +.TP +\fBq\fP or \fBEsc\fP +The \fBq\fP or \fBEsc\fP keys are used to exit gfs2_edit. + +.TP +\fBArrow/movement keys\fP up, down, right, left, pg-up, pg-down, home, end +The arrow keys are used to highlight an area of the display. The \fBJ\fP +key may be used to jump to the block that is highlighted. + +.TP +\fBm\fP - Mode switch +The \fBm\fP key is used to switch between the three display modes. +The initial mode is hex mode. Pressing the \fBm\fP key once switches to +structure mode. Pressing it a second time switches from structure mode +to pointers mode. Pressing it a third time takes you back to hex mode again. + +.TP +\fBj\fP - Jump to block +The \fBj\fP key jumps to the block number that is currently highlighted. +In hex mode, hitting J will work when any byte of the pointer is highlighted. + +.TP +\fBg\fP - Goto block +The \fBg\fP key asks for a block number, then jumps there. Note that +in many cases, you can also arrow up so that the current block number +is highlighted, then press \fBEnter\fP to enter a block number to jump to. + +.TP +\fBh\fP - Help display +The \fBh\fP key causes the interactive help display to be shown. + +.TP +\fBe\fP - Extended mode +The \fBe\fP key causes gfs2_edit to switch to extended ("pointers") mode. + +.TP +\fBc\fP - Color scheme +The \fBc\fP key causes gfs2_edit to switch to its alternate color scheme. + +.TP +\fBf\fP - Forward block +The \fBf\fP key causes you to scroll forward one block. This does +not affect the "jump" status. In other words, if you use the \fBf\fP +key to move forward several blocks, pressing \fBBackspace\fP will +not roll you back up. + +.TP +\fBEnter\fP - Edit value +The \fBEnter\fP key causes you to go from display mode to edit mode. +If you are in hex mode and you hit enter, you can type new hex values +at the cursor's current location.
Note: pressing \fBEnter\fP +in structure mode allows you to enter a new value, with the following +restrictions: For gfs2 disk inodes and resource groups, it will +actually change the value on disk. However, inode numbers may not be +changed. For all other structures, the values entered are ignored. + +If you use the up arrow key to highlight the block number, then press +\fBEnter\fP, you may then enter a new block number, or any of the +well-known block locations listed above (e.g. sb, rindex, inum, rg 17, +etc.) and gfs2_edit will jump to the block specified. If you specify +a slash character followed by a metadata type, gfs2_edit will search for +the next occurrence of that metadata block type, and jump there. It +will take you to block 0 if it does not find any more blocks of the +specified metadata type. + +.TP +\fBHome\fP +If you are in pointers mode, this takes you back to the start of the +pointers you are viewing. Otherwise it takes you back to the superblock. + +.TP +\fBBackspace\fP +This takes you back to the block you were displaying before a jump. + +.TP +\fBSpace\fP +This takes you forward to the block you were displaying when you hit +\fBBackspace\fP. + +.SH EXAMPLES +.TP +gfs2_edit /dev/roth_vg/roth_lv +Display and optionally edit the file system on /dev/roth_vg/roth_lv + +.TP +gfs2_edit -p sb /dev/vg0/lvol0 +Print the superblock of the gfs2 file system located on +/dev/vg0/lvol0. + +.TP +gfs2_edit -p identify 2746 2748 /dev/sda2 +Print out what kind of blocks are at block numbers 2746 and 2748 on +device /dev/sda2. + +.TP +gfs2_edit -p rindex /dev/sda1 +Print the resource group index system file located on device +/dev/sda1. + +.TP +gfs2_edit savemeta /dev/sda1 /tmp/our_fs.gz +Save off all metadata (but no user data) to file /tmp/our_fs.gz + +.TP +gfs2_edit -p root /dev/my_vg/my_lv +Print the contents of the root directory in /dev/my_vg/my_lv. + +.TP +gfs2_edit -x -p 0x3f7a /dev/sda1 +Print the contents of block 16250 of /dev/sda1 in hex.
+ +.TP +gfs2_edit -p 12345 /dev/sdc2 +Print the gfs2 data structure at block 12345. + +.TP +gfs2_edit rgcount /dev/sdb1 +Print how many Resource Groups exist for /dev/sdb1. + +.TP +gfs2_edit -p rg 17 /dev/sdb1 +Print the contents of the eighteenth Resource Group on /dev/sdb1. + +.TP +gfs2_edit rgflags 3 /dev/sdb1 +Print the rg_flags value for the fourth Resource Group on /dev/sdb1. + +.TP +gfs2_edit rgflags 3 8 /dev/sdb1 +Set the GFS2_RGF_NOALLOC flag on for the fourth Resource Group on /dev/sdb1. + +.TP +gfs2_edit -p 25 blockalloc /dev/roth_vg/roth_lv +Print the block allocation type of block 25. +May produce this output: +3 (Metadata) + +.TP +gfs2_edit -p 25 blockalloc 1 /dev/roth_vg/roth_lv +Change the block allocation type of block 25 to data. +May produce this output: +1 + +.TP +gfs2_edit -p 25 blocktype /dev/roth_vg/roth_lv +Print the metadata block type of block 25. +May produce this output: +4 (Block 25 is type 4: Dinode) + +.TP +gfs2_edit -p 25 field di_size /dev/roth_vg/roth_lv +Print the di_size field of block 25. +May produce this output: +134217728 + +.TP +gfs2_edit -x -p 25 field di_size /dev/roth_vg/roth_lv +Print the di_size field of block 25, in hexadecimal. +May produce this output: +0x8000000 + +.TP +gfs2_edit -p 25 field di_size 0x4000 /dev/roth_vg/roth_lv +Change the di_size field of block 25 to the hexadecimal value 0x4000. +May produce this output: +16384 +.SH KNOWN BUGS +.TP +The directory code does not work well. It might be confused +by directory "sentinel" entries. diff --git a/gfs2/man/gfs2_grow.8 b/gfs2/man/gfs2_grow.8 new file mode 100644 index 0000000..317d4f2 --- /dev/null +++ b/gfs2/man/gfs2_grow.8 @@ -0,0 +1,66 @@ +.TH gfs2_grow 8 + +.SH NAME +gfs2_grow - Expand a GFS2 filesystem + +.SH SYNOPSIS +.B gfs2_grow +[\fIOPTION\fR]... <\fIDEVICE\fR|\fIMOUNTPOINT\fR>... + +.SH DESCRIPTION +gfs2_grow is used to expand a GFS2 filesystem after the device +upon which the filesystem resides has also been expanded. 
By +running gfs2_grow on a GFS2 filesystem, you are requesting that +any spare space between the current end of the filesystem and +the end of the device is filled with a newly initialized GFS2 +filesystem extension. When this operation is complete, the resource +group index for the filesystem is updated so that all nodes in the +cluster can use the extra storage space that has been added. + +You may only run gfs2_grow on a mounted filesystem; expansion of +unmounted filesystems is not supported. You only need to +run gfs2_grow on one node in the cluster. All the other nodes will +see the expansion has occurred and automatically start to use the +newly available space. + +You must be superuser to execute \fBgfs2_grow\fP. The gfs2_grow +tool tries to prevent you from corrupting your filesystem by checking as +many of the likely problems as it can. When expanding a filesystem, +only the last step of updating the resource index affects the currently +mounted filesystem and so failure part way through the expansion process +should leave your filesystem in its original unexpanded state. + +You can run gfs2_grow with the \fB-T\fP flag to get a display +of the current state of a mounted GFS2 filesystem. + +The gfs2_grow tool uses the resource group (RG) size that was originally +calculated when mkfs.gfs2 was done. This allows tools like fsck.gfs2 +to better ensure the integrity of the file system. Since the new free +space often does not lie on even boundaries based on that RG size, +there may be some unused space on the device after gfs2_grow is run. +.SH OPTIONS +.TP +\fB-D\fP +Print out debugging information about the filesystem layout. +.TP +\fB-h\fP +Prints out a short usage message and exits. +.TP +\fB-q\fP +Be quiet. Don't print anything. +.TP +\fB-T\fP +Test. Do all calculations, but do not write any data to the disk and do +not expand the filesystem. This is used to discover what the tool would +have done were it run without this flag. +.TP +\fB-V\fP +Version. 
Print out version information, then exit. + +.SH BUGS + +There is no way to shrink a GFS2 filesystem. + +.SH SEE ALSO +mkfs.gfs2(8) gfs2_jadd(8) + diff --git a/gfs2/man/gfs2_jadd.8 b/gfs2/man/gfs2_jadd.8 new file mode 100644 index 0000000..5453edd --- /dev/null +++ b/gfs2/man/gfs2_jadd.8 @@ -0,0 +1,57 @@ +.TH gfs2_jadd 8 + +.SH NAME +gfs2_jadd \- Add journals to a GFS2 filesystem + +.SH SYNOPSIS +.B gfs2_jadd +[\fIOPTION\fR]... <\fIDEVICE\fR|\fIMOUNTPOINT\fR>... + +.SH DESCRIPTION +\fIgfs2_jadd\fR is used to add journals (and a few other per-node +files) to a GFS2 filesystem. When this operation is complete, the +journal index is updated so that machines mounting the filesystem at a +later date will see the newly created journals in addition to the +journals already there. Machines which are already running in the +cluster are unaffected. + +You may only run \fIgfs2_jadd\fR on a mounted filesystem; addition of +journals to unmounted filesystems is not supported. You only need to +run \fIgfs2_jadd\fR on one node in the cluster. All the other nodes +will see the expansion has occurred when required. + +You must be superuser to execute \fIgfs2_jadd\fR. The \fIgfs2_jadd\fR +tool tries to prevent you from corrupting your filesystem by checking +as many of the likely problems as it can. When growing a filesystem, +only the last step of updating the journal index affects the currently +mounted filesystem and so failure part way through the expansion +process should leave your filesystem in its original state. + +.SH OPTIONS +.TP +\fB-c MegaBytes\fP +Initial size of each journal's quota change file +.TP +\fB-D\fP +Print out debugging information about the filesystem layout. +.TP +\fB-h\fP +Prints out a short usage message and exits. +.TP +\fB-J size\fP +The size of the new journals in megabytes. The default is 32MB (the +minimum size allowed is 8MB).
If you want to add journals of different +sizes to the filesystem, you'll need to run gfs2_jadd once for each +different size of journal. +.TP +\fB-j num\fP +The number of new journals to add. +.TP +\fB-q\fP +Be quiet. Don't print anything. +.TP +\fB-V\fP +Version. Print version information, then exit. +. +.SH SEE ALSO +mkfs.gfs2(8) gfs2_grow(8) diff --git a/gfs2/man/gfs2_lockcapture.8 b/gfs2/man/gfs2_lockcapture.8 new file mode 100644 index 0000000..7d51ccb --- /dev/null +++ b/gfs2/man/gfs2_lockcapture.8 @@ -0,0 +1,101 @@ +.TH gfs2_lockcapture 8 + +.SH NAME +gfs2_lockcapture \- capture locking information from GFS2 file systems and DLM. + +.SH SYNOPSIS +.B gfs2_lockcapture \fR[-dqyP] [-o \fIoutput directory]\fR [-r \fInumber of runs]\fR [-s \fIseconds to sleep]\fR [-n \fIname of GFS2 file system]\fP +.PP +.B gfs2_lockcapture \fR[-dqyi] + +.SH DESCRIPTION +\fIgfs2_lockcapture\fR is used to capture the GFS2 lockdump data and +corresponding DLM data for GFS2 file systems. The number of captures and their +frequency can be configured. By default all of the mounted GFS2 file systems +will have their data collected unless GFS2 file systems are specified. +.PP +Please note that sysrq -t (thread) and -m (memory) dumps and the pid +directories in /proc are collected unless they are disabled with the -P option. + +.SH OPTIONS +.TP +\fB-h, --help\fP +Prints out a short usage message and exits. +.TP +\fB-d, --debug\fP +Enables debug logging. +.TP +\fB-q, --quiet\fP +Disables logging to console. +.TP +\fB-y, --no_ask\fP +Disables all questions and assumes yes. +.TP +\fB-i, --info\fP +Prints information about the mounted GFS2 file systems. +.TP +\fB-P, --disable_process_gather\fP +The gathering of process information will be disabled. +.TP +\fB-o \fIoutput directory\fR, \fB--path_to_output_dir\fR=\fIoutput directory\fP +The directory where all the collected data will be stored. +.TP +\fB-r \fInumber of runs\fR, \fB--num_of_runs\fR=\fInumber of runs\fP +The number of runs capturing the lockdump data. The default is 3 runs.
+.TP +\fB-s \fIseconds to sleep\fR, \fB--seconds_sleep\fR=\fIseconds to sleep\fP +The number of seconds to sleep between runs of capturing the lockdump data. The default is 120 seconds. +.TP +\fB-n \fIname of GFS2 file system\fR, \fB--fs_name\fR=\fIname of GFS2 file system\fP +The name of the GFS2 filesystem(s) that will have their lockdump data captured. By default, all mounted GFS2 file systems will have their data captured. +. +.SH NOTES +The output of the following commands will be captured: +.IP \(bu 2 +uname -a +.IP \(bu 2 +uptime +.IP \(bu 2 +ps h -AL -o "tid,s,cmd" +.IP \(bu 2 +df -h +.IP \(bu 2 +lsof +.IP \(bu 2 +mount -l +.IP \(bu 2 +dlm_tool ls +.IP \(bu 2 +dlm_tool lockdebug -v -s -w +.IP \(bu 2 +echo "t" > /proc/sysrq-trigger (If /proc/1/stack does not exist) +.IP \(bu 2 +echo "m" > /proc/sysrq-trigger (If /proc/1/stack does not exist) + +.SH AUTHOR +.nf +Shane Bradley +.fi +.SH FILES +.I /proc/mounts +.br +.I /proc/slabinfo +.br +.I /sys/kernel/config/dlm/cluster/lkbtbl_size +.br +.I /sys/kernel/config/dlm/cluster/dirtbl_size +.br +.I /sys/kernel/config/dlm/cluster/rsbtbl_size +.br +.I /sys/kernel/debug/gfs2/ +.br +.I /sys/kernel/debug/dlm/ +.br +.I /proc/PID/ +(If /proc/1/stack does exist) +.br +.I /var/log/messages +.br +.I /var/log/cluster/ +.br +.SH SEE ALSO diff --git a/gfs2/man/gfs2_trace.8 b/gfs2/man/gfs2_trace.8 new file mode 100644 index 0000000..dd98072 --- /dev/null +++ b/gfs2/man/gfs2_trace.8 @@ -0,0 +1,45 @@ +.TH gfs2_trace 8 + +.SH NAME +gfs2_trace \- can enable trace events, disable trace events, and capture data from GFS2 trace events. + +.SH SYNOPSIS +.B gfs2_trace \fR[-dqEN] [-e \fItrace event name]\fR [-n \fItrace event name]\fR [-o \fIoutput filename]\fR +.PP + +.SH DESCRIPTION +\fIgfs2_trace\fR can enable and disable trace events on all trace events or selected trace events. \fIgfs2_trace\fR can +capture the output of the trace events and write the output to a file. When capturing trace events, the script will exit +when control-c is pressed. The trace events will then be written to the selected file.
+.PP + +.SH OPTIONS +.TP +\fB-h, --help\fP +Prints out a short usage message and exits. +.TP +\fB-d, --debug\fP +enables debug logging. +.TP +\fB-q, --quiet\fP +disables logging to console. +.TP +\fB-l, --list\fP +lists the enabled state and filters for the GFS2 trace events +.TP +\fB-E, --enable_all_trace_events\fP +enables all trace_events for GFS2 +.TP +\fB-e \fItrace event name\fR, \fB--enable_trace_event\fR=\fItrace event name\fP +selected trace_events that will be enabled for GFS2 +.TP +\fB-N, --disable_all_trace_events\fP +disables all trace_events for GFS2 +.TP +\fB-n \fItrace event name\fR, \fB--disable_trace_event\fR=\fItrace event name\fP +selected trace_events that will be disabled for GFS2 +.TP +\fB-c \fIoutput filename\fR, \fB--capture\fR=\fIoutput filename\fP +enables capturing of trace events and will save the data to a file +. +.SH SEE ALSO diff --git a/gfs2/man/glocktop.8 b/gfs2/man/glocktop.8 new file mode 100644 index 0000000..19379c3 --- /dev/null +++ b/gfs2/man/glocktop.8 @@ -0,0 +1,281 @@ +.TH glocktop 8 + +.SH NAME +glocktop - Display or print active GFS2 locks. + +.SH SYNOPSIS +.B glocktop +[\fIOPTIONS\fR] + +.SH DESCRIPTION +The glocktop tool is used to display active GFS2 inter-node locks, +also known as glocks. Simply put, it's a tool to filter and interpret the +contents of the glocks debugfs file. The glocks debugfs file shows +all glocks known to GFS2, their holders, and technical data such as flags. +The glocktop tool will only show the glocks that are important: glocks that +are being held or for which there are waiters. It also interprets the debugfs +file of DLM (Distributed Lock Manager). + +.SH OPTIONS +.TP +\fB-d\fP \fIdelay\fP +Specify a time delay (in seconds) between reports. (Default is 30 seconds) +.TP +\fB-h\fP +Print help information. +.TP +\fB-i\fP +Interactive mode. In this mode, glocktop acts more like the top command. +It shows the pertinent glocks on the terminal session (as many as it can +fit). The advantage is that it uses different colors to draw attention to +what's important.
The disadvantage is that it's limited by the size of +your display, so you may not see all the glocks. +.TP +\fB-D\fP +Omit DLM status. This may be used to reduce the amount of output for +interactive mode. +.TP +\fB-n\fP \fIiterations\fP +End the program after the specified number of iterations (reports). The +default is to keep running until interrupted. +.TP +\fB-r\fP +Show resource group reservation information. Normally, glocktop omits +resource group reservation information to condense the output. This +information is only important when debugging information related to the +GFS2 block allocator and file system fragmentation. +.TP +\fB-s\fP \fIfreq\fR +Print glock summary information every \fIfreq\fR reports. +The glock summary information is bulky and often not needed, so by default it's +only printed once every 10 reports. You can eliminate it entirely from +the output by specifying a value of 0. If you want the statistics to +print after every report, specify freq as 1. +.TP +\fB-t\fP +Trace directory path. A lot of GFS2 glock performance problems are caused +by an application's contention for one or two directories. These show up +as regular inodes in the output, but there's no good way to tell from the +output which directory is contended. Ordinarily, glocktop won't try to +look up the full pathname of a contended directory because it's slow, +especially if there are millions of glocks. This option instructs glocktop +to try to determine the full directory path names when it can, so you can +tell the full path (within the mount point) of contended directories. +.TP +\fB-H\fP +Don't show Held glocks, unless there are also waiters for the lock. +Ordinarily, glocktop will show glocks that are held (but not iopen +glocks which are almost always held by the thousands) as well as glocks +for which there are waiters.
If it only showed glocks with waiters, you +could see, for example, that a glock is being blocked on one node, +but you couldn't see the information for a different node currently +holding the lock and thus, blocking the waiter. This option forces glocktop to +stop printing information for glocks with no waiters (on that node). +The advantage is that the output is smaller and easier to look at. +The disadvantage is that you can't see information from the node that's +blocking the waiter, unless both waiter and holder are on the same node. +.SH OUTPUT LINES +.TP +\fB@ name\fP +This is the GFS2 file system name for which the information is printed. It +also gives the time stamp of the report, and the cluster node name. +.TP +\fBG:\fP +This line represents a glock (internode GFS2 lock). + G: s:UN n:2/609b4 f:lIqob t:EX d:EX/0 a:0 v:0 r:3 m:200 (inode) +.TP +\fBD:\fP +This line gives you glocktop's interpretation of the glock's state as +far as DLM (distributed lock manager) is concerned. + D: Granted PR on node 2 to pid 17511 [python] +.TP +\fBH:\fP +This line represents a glock holder: a process that's either holding the +glock, or is waiting to hold it. The value after S: represents how this +holder needs the lock: EX (Exclusive), SH (Shared), PR (Protected Read), +or UN (Unlocked). The value after F: indicates the holder flags: a W +indicates the holder is Waiting for the lock to be granted. An H indicates +the holder is currently holding the lock. + H: s:EX f:W e:0 p:17511 [python] gfs2_unlink+0x7e/0x250 [gfs2] +.TP +\fBU:\fP +These lines represent glocktop's user interpretation of the data, both glock +and holder. Lines that begin with (N/A:...) can probably be ignored because +they ought to be unimportant: system files such as journals, etc. 
+ U: W inode 183f5 Is:Shared, Want:Exclusive [Demote pending, Reply pending, Queued, Blocking] + U: W ---> waiting pid 17511 [python] (Granted PR on node 2 to pid 17511 [python]) +.TP +\fBC:\fP +These lines give you the call trace (call stack) of the process that's +either holding or waiting to hold the glock. +.TP +\fBS\fP +These lines give you the summary of all glocks for this file system: How many of +each category are unlocked, locked, how many are held in EX, SH, and DF, and how +many are waiting. G Waiting is how many glocks have waiters. P Waiting is +how many processes are waiting. Thus, you could have one glock that's got +ten processes waiting, or ten glocks that have ten processes waiting. +.SH EXAMPLE OUTPUT +.nf +.RS +# glocktop +.PP +@ nate_bob1 Wed Jan 27 07:24:14 2016 @host-050 + G: s:EX n:9/1 f:Iqb t:EX d:EX/0 a:0 v:0 r:2 m:200 (journal) + D: Granted EX on node 2 to pid 17468 [ended] + H: s:EX f:eH e:0 p:17468 [(ended)] gfs2_glock_nq_num+0x5b/0xa0 [gfs2] + U: (N/A:Journl) H journal 1 Held:Exclusive [Queued, Blocking] + U: (N/A:Journl) H ---> held by pid 17468 [(ended)] (Granted EX on node 2 to pid 17468 [ended]) + G: s:SH n:1/1 f:Iqb t:SH d:EX/0 a:0 v:0 r:2 m:200 (non-disk) + D: Granted PR on node 2 to pid 17468 [ended] + H: s:SH f:eEH e:0 p:17468 [(ended)] gfs2_glock_nq_num+0x5b/0xa0 [gfs2] + U: (N/A:Not EX) H non-disk 1 Held:Shared [Queued, Blocking] + U: (N/A:Not EX) H ---> held by pid 17468 [(ended)] (Granted PR on node 2 to pid 17468 [ended]) + G: s:EX n:2/181ec f:yIqob t:EX d:EX/0 a:1 v:0 r:3 m:200 (inode) + D: Granted EX on this node to pid 17468 [ended] + H: s:EX f:H e:0 p:17468 [(ended)] init_per_node+0x17d/0x280 [gfs2] + I: n:12/98796 t:8 f:0x00 d:0x00000201 s:24 + U: (N/A:System) H inode 181ec Held:Exclusive [Dirty, Queued, Blocking] + U: (N/A:System) H ---> held by pid 17468 [(ended)] (Granted EX on this node to pid 17468 [ended]) + G: s:EX n:2/181ed f:Iqob t:EX d:EX/0 a:0 v:0 r:3 m:200 (inode) + D: Granted EX on this node to pid 17468 
[ended] + H: s:EX f:H e:0 p:17468 [(ended)] init_per_node+0x1b0/0x280 [gfs2] + I: n:13/98797 t:8 f:0x00 d:0x00000200 s:1048576 + U: (N/A:System) H inode 181ed Held:Exclusive [Queued, Blocking] + U: (N/A:System) H ---> held by pid 17468 [(ended)] (Granted EX on this node to pid 17468 [ended]) + G: s:SH n:2/183f5 f:ldrIqob t:EX d:UN/0 a:0 v:0 r:5 m:10 (inode) + D: Granted PR on node 2 to pid 17511 [python] + H: s:EX f:W e:0 p:17511 [python] gfs2_unlink+0x7e/0x250 [gfs2] + I: n:1/99317 t:4 f:0x00 d:0x00000003 s:2048 + U: W inode 183f5 Is:Shared, Want:Exclusive [Demote pending, Reply pending, Queued, Blocking] + U: W ---> waiting pid 17511 [python] (Granted PR on node 2 to pid 17511 [python]) + C: gfs2_unlink+0xdc/0x250 [gfs2] + C: vfs_unlink+0xa0/0xf0 + C: do_unlinkat+0x163/0x260 + C: sys_unlink+0x16/0x20 + G: s:SH n:2/805b f:Iqob t:SH d:EX/0 a:0 v:0 r:3 m:200 (inode) + D: Granted PR on node 2 to pid 17468 [ended] + H: s:SH f:eEcH e:0 p:17468 [(ended)] init_journal+0x185/0x500 [gfs2] + I: n:5/32859 t:8 f:0x01 d:0x00000200 s:134217728 + U: (N/A:Not EX) H inode 805b Held:Shared [Queued, Blocking] + U: (N/A:Not EX) H ---> held by pid 17468 [(ended)] (Granted PR on node 2 to pid 17468 [ended]) +S glocks nondisk inode rgrp iopen flock quota jrnl Total +S --------- ------- -------- ------- ------- ------- ----- ---- -------- +S Unlocked: 1 5 4 0 0 0 0 10 +S Locked: 2 245 6 58 0 0 1 313 +S Total: 3 250 10 58 0 0 1 323 +S +S Held EX: 0 2 0 0 0 0 1 3 +S Held SH: 1 1 0 57 0 0 0 59 +S Held DF: 0 0 0 0 0 0 0 0 +S G Waiting: 0 1 0 0 0 0 0 1 +S P Waiting: 0 1 0 0 0 0 0 1 +S DLM wait: 0 + +@ nate_bob0 Wed Jan 27 07:24:14 2016 @host-050 + G: s:EX n:2/180e9 f:yIqob t:EX d:EX/0 a:1 v:0 r:3 m:200 (inode) + D: Granted EX on this node to pid 17465 [ended] + H: s:EX f:H e:0 p:17465 [(ended)] init_per_node+0x17d/0x280 [gfs2] + I: n:9/98537 t:8 f:0x00 d:0x00000201 s:24 + U: (N/A:System) H inode 180e9 Held:Exclusive [Dirty, Queued, Blocking] + U: (N/A:System) H ---> held by pid 17465 
[(ended)] (Granted EX on this node to pid 17465 [ended]) + G: s:UN n:2/609b4 f:lIqob t:EX d:EX/0 a:0 v:0 r:3 m:200 (inode) + D: Granted EX on node 2 to pid 14367 [ended] + H: s:EX f:W e:0 p:16297 [delete_workqueu] gfs2_delete_inode+0x9d/0x450 [gfs2] + U: W inode 609b4 Is:Unlocked, Want:Exclusive [Queued, Blocking] + U: W ---> waiting pid 16297 [delete_workqueu] (Granted EX on node 2 to pid 14367 [ended]) + C: gfs2_delete_inode+0xa5/0x450 [gfs2] + C: generic_delete_inode+0xde/0x1d0 + C: generic_drop_inode+0x65/0x80 + C: gfs2_drop_inode+0x37/0x40 [gfs2] + G: s:SH n:2/19 f:Iqob t:SH d:EX/0 a:0 v:0 r:3 m:200 (inode) + D: Granted PR on this node to pid 17465 [ended] + H: s:SH f:eEcH e:0 p:17465 [(ended)] init_journal+0x185/0x500 [gfs2] + I: n:4/25 t:8 f:0x01 d:0x00000200 s:134217728 + U: (N/A:Not EX) H inode 19 Held:Shared [Queued, Blocking] + U: (N/A:Not EX) H ---> held by pid 17465 [(ended)] (Granted PR on this node to pid 17465 [ended]) + G: s:EX n:2/180ea f:Iqob t:EX d:EX/0 a:0 v:0 r:3 m:200 (inode) + D: Granted EX on this node to pid 17465 [ended] + H: s:EX f:H e:0 p:17465 [(ended)] init_per_node+0x1b0/0x280 [gfs2] + I: n:10/98538 t:8 f:0x00 d:0x00000200 s:1048576 + U: (N/A:System) H inode 180ea Held:Exclusive [Queued, Blocking] + U: (N/A:System) H ---> held by pid 17465 [(ended)] (Granted EX on this node to pid 17465 [ended]) + G: s:EX n:9/0 f:Iqb t:EX d:EX/0 a:0 v:0 r:2 m:200 (journal) + D: Granted EX on this node to pid 17465 [ended] + H: s:EX f:eH e:0 p:17465 [(ended)] gfs2_glock_nq_num+0x5b/0xa0 [gfs2] + U: (N/A:Journl) H journal 0 Held:Exclusive [Queued, Blocking] + U: (N/A:Journl) H ---> held by pid 17465 [(ended)] (Granted EX on this node to pid 17465 [ended]) + G: s:UN n:2/4fe12 f:ldIqob t:EX d:UN/0 a:0 v:0 r:4 m:10 (inode) + H: s:EX f:W e:0 p:17523 [python] gfs2_rename+0x344/0x8b0 [gfs2] + H: s:SH f:AW e:0 p:17527 [python] gfs2_permission+0x176/0x210 [gfs2] + U: W inode 4fe12 Is:Unlocked, Want:Exclusive [Demote pending, Queued, Blocking] + U: W ---> 
waiting pid 17523 [python] + C: gfs2_permission+0x17f/0x210 [gfs2] + C: __link_path_walk+0xb3/0x1000 + C: path_walk+0x6a/0xe0 + C: filename_lookup+0x6b/0xc0 + U: W ---> waiting pid 17527 [python] + C: do_unlinkat+0x107/0x260 + C: sys_unlink+0x16/0x20 + C: system_call_fastpath+0x16/0x1b + C: 0xffffffffffffffff + G: s:SH n:1/1 f:Iqb t:SH d:EX/0 a:0 v:0 r:2 m:200 (non-disk) + D: Granted PR on node 2 to pid 14285 [ended] + D: Granted PR on this node to pid 17465 [ended] + H: s:SH f:eEH e:0 p:17465 [(ended)] gfs2_glock_nq_num+0x5b/0xa0 [gfs2] + U: (N/A:Not EX) H non-disk 1 Held:Shared [Queued, Blocking] + U: (N/A:Not EX) H ---> held by pid 17465 [(ended)] (Granted PR on node 2 to pid 14285 [ended]) (Granted PR on this node to pid 17465 [ended]) +S glocks nondisk inode rgrp iopen flock quota jrnl Total +S --------- ------- -------- ------- ------- ------- ----- ---- -------- +S Unlocked: 1 8 7 0 0 0 0 16 +S Locked: 2 208 3 41 0 0 1 256 +S Total: 3 216 10 41 0 0 1 272 +S +S Held EX: 0 2 0 0 0 0 1 3 +S Held SH: 1 1 0 41 0 0 0 43 +S Held DF: 0 0 0 0 0 0 0 0 +S G Waiting: 0 2 0 0 0 0 0 2 +S P Waiting: 0 3 0 0 0 0 0 3 +S DLM wait: 0 +.RE +.fi +.PP +From this example output, we can see there are two GFS2 file systems +mounted on system host-050: nate_bob1 and nate_bob0. In nate_bob1, we can +see six glocks, but we can ignore all of them marked (N/A:...) because they +are system files or held in SHared mode, and therefore other nodes should +be able to hold the lock in SHared as well. +.PP +There is one glock, for inode 183f5, which has a process waiting to +hold it. The lock is currently in SHared mode (s:SH on the G: line) but +process 17511 (python) wants to hold the lock in EXclusive mode (s:EX +on the H: line). That process has a call stack that indicates it is trying +to hold the glock from gfs2_unlink. The DLM says the lock is currently +granted on node 2 in PR (Protected Read) mode. +.PP +For file system nate_bob0, there are 7 glocks listed. 
All but two are +uninteresting. Locks 2/609b4 and 2/4fe12 have processes waiting to +hold them. +.PP +In the summary data for nate_bob0, you can see there are 3 processes waiting +for 2 inode glocks (so one of those glocks has multiple processes waiting). +.PP +Since DLM wait is 0 in the summary data for both GFS2 mount points, +nobody is waiting for DLM to grant the lock. + +.SH KNOWN BUGS AND LIMITATIONS +.PP +Since the GFS2 debugfs files are completely separate from the DLM debugfs +files, and locks can change status in a few nanoseconds' time, there will +always be a lag between the GFS2 view of a lock and the DLM view of a lock. +If there is some kind of long-term hang, they are more likely to match. +However, under ordinary conditions, by the time glocktop gets around to +fetching the DLM status of a lock, the information has changed. Therefore, +don't be surprised if the DLM's view of a lock is at odds with its glock. +.PP +Since iopen glocks are held by the thousands, glocktop skips most of the +information related to them unless there's a waiter. For that reason, +iopen lock problems may be difficult to debug with glocktop. +.PP +glocktop does not handle very large numbers (millions) of glocks. + diff --git a/gfs2/man/mkfs.gfs2.8 b/gfs2/man/mkfs.gfs2.8 new file mode 100644 index 0000000..35e355a --- /dev/null +++ b/gfs2/man/mkfs.gfs2.8 @@ -0,0 +1,149 @@ +.TH mkfs.gfs2 8 + +.SH NAME +mkfs.gfs2 - create a gfs2 filesystem + +.SH SYNOPSIS +.B mkfs.gfs2 +[\fIoptions\fR] \fIdevice\fR \fI[block-count]\fR + +.SH DESCRIPTION +mkfs.gfs2 is used to create a gfs2 file system. + +.SH OPTIONS +The default values of the following options have been chosen for best results. +In most cases, there should be no need to choose different values. The +exceptions to this are the number of journals (\fB-j\fP) and the lock table +(\fB-t\fP), as these options will be specific to your cluster. 
+.TP +\fB-b\fP \fIbytes\fR +Set the filesystem block size to \fIbytes\fR which must be a power of two. The +minimum block size is 512 and the block size cannot exceed the machine's memory +page size, which on most architectures is 4096 bytes. The default block size +is 4096 bytes. +.TP +\fB-c\fP \fImegabytes\fR +Initial size of each journal's quota change file. The default is 1MB. +.TP +\fB-D\fP +Enable debugging output. +.TP +\fB-h\fP +Print out a help message describing the available options, then exit. +.TP +\fB-J\fP \fImegabytes\fR +The size of each journal. The minimum size is 8 megabytes and the maximum is +1024. If this is not specified, a value based on a sensible proportion of the +file system will be chosen. +.TP +\fB-j\fP \fIjournals\fR +The number of journals for mkfs.gfs2 to create. At least one journal is +required for each machine that will mount the filesystem concurrently. If this +option is not specified, only one journal will be created. This number may be +used as an indicator of the number of nodes in the cluster in order to optimize +the layout of the filesystem. As such, it is better to set this option with the +maximum number of mounters in mind than to add more journals later. +.TP +\fB-K\fP +Do not attempt to discard the block device contents. Issuing discards to the +device allows some solid state devices and sparse or thin-provisioned storage +devices to optimise free space. Other devices may emulate this behaviour by +zeroing the device contents, which can be a slow process. +.TP +\fB-O\fP +Override. This option prevents mkfs.gfs2 from asking for confirmation before +writing the filesystem. +.TP +\fB-o\fP +Specify extended options. Multiple options can be separated by commas. Valid +extended options are: +.RS 1.0i +.TP +.BI help +Display an extended options help summary, then exit. +.TP +.BI sunit= bytes +This is used to specify the stripe unit for a RAID device or striped logical +volume. 
This option ensures that resource groups will be stripe unit aligned +and overrides the stripe unit value obtained by probing the device. This value +must be a multiple of the file system block size and must be specified with the +.I swidth +option. +.TP +.BI swidth= bytes +This is used to specify the stripe width for a RAID device or striped logical +volume. This option ensures that resource groups will be stripe aligned and +overrides the stripe width value obtained by probing the device. This value +must be a multiple of the +.I sunit +option and must also be specified with it. +.TP +.BI align= [0|1] +Disable or enable the alignment of resource groups. The default behaviour is to +align resource groups to the stripe width and stripe unit values obtained from +probing the device or specified with the +.I swidth +and +.I sunit +extended options. +.RE +.TP +\fB-p\fP \fIprotocol\fR +Specify the locking protocol to use when no locking protocol is specified at +mount time. Valid locking protocols are: +.RS 1.0i +.TP +.BI lock_dlm +This is the default. It enables DLM-based locking for use in shared storage +configurations. +.TP +.BI lock_nolock +This enables single-node locking. +.RE +.TP +\fB-q\fP +Quiet mode. Do not print anything. +.TP +\fB-r\fP \fImegabytes\fR +mkfs.gfs2 will try to make resource groups approximately this large. The +minimum resource group size is 32 MB and the maximum is 2048 MB. A large +resource group size may increase performance on very large file systems. If +not specified, mkfs.gfs2 will choose the resource group size based on the +size and alignment characteristics of the target device. +.TP +\fB-t\fP \fIclustername:lockspace\fR +The "lock table" pair used to uniquely identify this filesystem in a cluster. +The cluster name segment (maximum 32 characters) must match the name given to +your cluster in its configuration; only members of this cluster are permitted +to use this file system. 
The lockspace segment (maximum 30 characters) is a +unique file system name used to distinguish this gfs2 file system. Valid +\fIclustername\fRs and \fIlockspace\fRs may only contain alphanumeric +characters, hyphens (-) and underscores (_). +.TP +\fB-V\fP +Print program version information, then exit. +.TP +\fIblock-count\fR +Use \fIblock-count\fR as the size of the filesystem instead of using the whole +device. \fIblock-count\fR is specified as a number of filesystem blocks. +.SH EXAMPLE +.nf +.RS +# mkfs.gfs2 -t mycluster:mygfs2 -p lock_dlm -j 2 /dev/vg0/lv_gfs2 +.PP +This will create a gfs2 filesystem on the block device /dev/vg0/lv_gfs2. It +will belong to a cluster named "mycluster" and use the "mygfs2" lock space. It +will use DLM for locking and create journals for a two-node cluster. +.PP +# mkfs.gfs2 -t mycluster:mygfs2 -p lock_nolock -j 3 /dev/vg0/lv_gfs2 +.PP +This will create a filesystem on the block device /dev/vg0/lv_gfs2. It +will belong to a cluster named "mycluster" and use the "mygfs2" lockspace, but +it will have no cluster locking by default as lock_nolock is used. It will +have journals for a three-node cluster. +.RE +.fi +.SH SEE ALSO +.BR gfs2 (5), +.BR gfs2_jadd (8), +.BR gfs2_grow (8) diff --git a/gfs2/man/tunegfs2.8 b/gfs2/man/tunegfs2.8 new file mode 100644 index 0000000..c4446f4 --- /dev/null +++ b/gfs2/man/tunegfs2.8 @@ -0,0 +1,59 @@ +.TH tunegfs2 8 + +.SH NAME +tunegfs2 - View and manipulate gfs2 superblocks + +.SH SYNOPSIS +.B tunegfs2 +[\fIOPTIONS\fR] +/dev/blockdevice + +.SH DESCRIPTION +tunegfs2 allows viewing and manipulating the values contained in a +GFS or GFS2 superblock. It is able to modify the \fIUUID\fR (on GFS2 only), +\fIlabel\fR, \fIlockproto\fR and \fIlocktable\fR. + +The values in the GFS2 superblock are read only on mount. Any +changes on a live filesystem will not take effect until the next +time it is mounted. Making changes on a live filesystem is not +recommended for this reason. 
+ +.SH OPTIONS + +.TP +\fB-h\fP + +Prints out usage information for this command. + +.TP +\fB-l\fP + +List contents of the filesystem superblock. Includes the current values of the +parameters that can be set by this program. + +.TP +\fB-L\fP \fI