From 2be57eec919ea0cd285a9e6109837a841b24335a Mon Sep 17 00:00:00 2001 From: CentOS Sources Date: Aug 16 2018 12:49:20 +0000 Subject: import systemd-219-57.el7_5.1 --- diff --git a/SOURCES/0614-umount-always-use-MNT_FORCE-in-umount_all-7213.patch b/SOURCES/0614-umount-always-use-MNT_FORCE-in-umount_all-7213.patch new file mode 100644 index 0000000..bf44671 --- /dev/null +++ b/SOURCES/0614-umount-always-use-MNT_FORCE-in-umount_all-7213.patch @@ -0,0 +1,72 @@ +From d9232a652080ac260c78ab647608860269b45d03 Mon Sep 17 00:00:00 2001 +From: NeilBrown +Date: Wed, 8 Nov 2017 19:29:32 +1100 +Subject: [PATCH] umount: always use MNT_FORCE in umount_all() (#7213) + +The linux umount2() systemcall accepts a MNT_FORCE flags +which some filesystems honor, particularly FUSE and various +network filesystems such as NFS. +These filesystems can sometimes wait for an indefinite period +for a response from an external service, and the wait if +sometimes "uninterruptible" meaning that the process cannot be +killed. +Using MNT_FORCE causes any such request that are outstanding to +be aborted. This normally allows the waiting process to +be killed. It will then realease and reference it has to the +filesytem, this allowing the filesystem to be unmounted. + +If there remain active references to the filesystem, MNT_FORCE +is *not* forcefull enough to unmount the filesystem anyway. + +By the time that umount_all() is run by systemd-shutdown, all +filesystems *should* be unmounted, and sync() will have been +called. Anything that remains cannot be unmounted in a +completely clean manner and just nees to be dealt with as firmly +as possible. So use MNT_FORCE and try to explain why in the +comment. + +Also enhance an earlier comment to explain why umount2() is +safe even though mount(MNT_REMOUNT) isn't. + +(cherry picked from commit c44cac7c6c43407d28bd8daebff39f6145a2a33e) + +Resolves: #1571098 +--- + src/core/umount.c | 16 +++++++++++----- + 1 file changed, 11 insertions(+), 5 deletions(-) + +diff --git a/src/core/umount.c b/src/core/umount.c +index 3eec0d459..91d67c06c 100644 +--- a/src/core/umount.c ++++ b/src/core/umount.c +@@ -377,7 +377,9 @@ static int mount_points_list_umount(MountPoint **head, bool *changed, bool log_e + the superblock here, not the bind mount. + If the filesystem is a network fs, also skip the + remount. It brings no value (we cannot leave +- a "dirty fs") and could hang if the network is down. */ ++ a "dirty fs") and could hang if the network is down. ++ Note that umount2() is more careful and will not ++ hang because of the network being down. */ + if (detect_container(NULL) <= 0 && + !fstype_is_network(m->type)) { + _cleanup_free_ char *options = NULL; +@@ -418,11 +420,15 @@ static int mount_points_list_umount(MountPoint **head, bool *changed, bool log_e + ) + continue; + +- /* Trying to umount. We don't force here since we rely +- * on busy NFS and FUSE file systems to return EBUSY +- * until we closed everything on top of them. */ ++ /* Trying to umount. Using MNT_FORCE causes some ++ * filesystems (e.g. FUSE and NFS and other network ++ * filesystems) to abort any pending requests and ++ * return -EIO rather than blocking indefinitely. ++ * If the filesysten is "busy", this may allow processes ++ * to die, thus making the filesystem less busy so ++ * the unmount might succeed (rather then return EBUSY).*/ + log_info("Unmounting %s.", m->path); +- if (umount2(m->path, 0) == 0) { ++ if (umount2(m->path, MNT_FORCE) == 0) { + if (changed) + *changed = true; + diff --git a/SOURCES/0615-core-Implement-timeout-based-umount-remount-limit.patch b/SOURCES/0615-core-Implement-timeout-based-umount-remount-limit.patch new file mode 100644 index 0000000..00af896 --- /dev/null +++ b/SOURCES/0615-core-Implement-timeout-based-umount-remount-limit.patch @@ -0,0 +1,304 @@ +From dd531b196e9dd794f7409f97320814e24f50081b Mon Sep 17 00:00:00 2001 +From: Kyle Walker +Date: Wed, 13 Dec 2017 12:49:26 -0500 +Subject: [PATCH] core: Implement timeout based umount/remount limit + +Remount, and subsequent umount, attempts can hang for inaccessible network +based mount points. This can leave a system in a hard hang state that +requires a hard reset in order to recover. This change moves the remount, +and umount attempts into separate child processes. The remount and umount +operations will block for up to 90 seconds (DEFAULT_TIMEOUT_USEC). Should +those waits fail, the parent will issue a SIGKILL to the child and continue +with the shutdown efforts. + +In addition, instead of only reporting some additional errors on the final +attempt, failures are reported as they occur. + +(cherry picked from commit d5641e0d7e8f55937fbc3a7ecd667e42c5836d80) + +Related: #1571098 +--- + src/core/umount.c | 112 +++++++++++++++++++++++++++++++++++++--------- + src/shared/def.h | 2 - + src/shared/login-shared.c | 1 + + src/shared/util.c | 61 +++++++++++++++++++++++++ + src/shared/util.h | 15 +++++++ + 5 files changed, 167 insertions(+), 24 deletions(-) + +diff --git a/src/core/umount.c b/src/core/umount.c +index 91d67c06c..bd3896612 100644 +--- a/src/core/umount.c ++++ b/src/core/umount.c +@@ -363,7 +363,84 @@ static int delete_dm(dev_t devnum) { + return r >= 0 ? 0 : -errno; + } + +-static int mount_points_list_umount(MountPoint **head, bool *changed, bool log_error) { ++static int remount_with_timeout(MountPoint *m, char *options, int *n_failed) { ++ pid_t pid; ++ int r; ++ ++ BLOCK_SIGNALS(SIGCHLD); ++ ++ /* Due to the possiblity of a remount operation hanging, we ++ * fork a child process and set a timeout. If the timeout ++ * lapses, the assumption is that that particular remount ++ * failed. */ ++ pid = fork(); ++ if (pid < 0) ++ return log_error_errno(errno, "Failed to fork: %m"); ++ ++ if (pid == 0) { ++ log_info("Remounting '%s' read-only in with options '%s'.", m->path, options); ++ ++ /* Start the mount operation here in the child */ ++ r = mount(NULL, m->path, NULL, MS_REMOUNT|MS_RDONLY, options); ++ if (r < 0) ++ log_error_errno(errno, "Failed to remount '%s' read-only: %m", m->path); ++ ++ _exit(r < 0 ? EXIT_FAILURE : EXIT_SUCCESS); ++ } ++ ++ r = wait_for_terminate_with_timeout(pid, DEFAULT_TIMEOUT_USEC); ++ if (r == -ETIMEDOUT) { ++ log_error_errno(errno, "Remounting '%s' - timed out, issuing SIGKILL to PID "PID_FMT".", m->path, pid); ++ (void) kill(pid, SIGKILL); ++ } else if (r < 0) ++ log_error_errno(r, "Failed to wait for process: %m"); ++ ++ return r; ++} ++ ++static int umount_with_timeout(MountPoint *m, bool *changed) { ++ pid_t pid; ++ int r; ++ ++ BLOCK_SIGNALS(SIGCHLD); ++ ++ /* Due to the possiblity of a umount operation hanging, we ++ * fork a child process and set a timeout. If the timeout ++ * lapses, the assumption is that that particular umount ++ * failed. */ ++ pid = fork(); ++ if (pid < 0) ++ return log_error_errno(errno, "Failed to fork: %m"); ++ ++ if (pid == 0) { ++ log_info("Unmounting '%s'.", m->path); ++ ++ /* Start the mount operation here in the child Using MNT_FORCE ++ * causes some filesystems (e.g. FUSE and NFS and other network ++ * filesystems) to abort any pending requests and return -EIO ++ * rather than blocking indefinitely. If the filesysten is ++ * "busy", this may allow processes to die, thus making the ++ * filesystem less busy so the unmount might succeed (rather ++ * then return EBUSY).*/ ++ r = umount2(m->path, MNT_FORCE); ++ if (r < 0) ++ log_error_errno(errno, "Failed to unmount %s: %m", m->path); ++ ++ _exit(r < 0 ? EXIT_FAILURE : EXIT_SUCCESS); ++ } ++ ++ r = wait_for_terminate_with_timeout(pid, DEFAULT_TIMEOUT_USEC); ++ if (r == -ETIMEDOUT) { ++ log_error_errno(errno, "Unmounting '%s' - timed out, issuing SIGKILL to PID "PID_FMT".", m->path, pid); ++ (void) kill(pid, SIGKILL); ++ } else if (r < 0) ++ log_error_errno(r, "Failed to wait for process: %m"); ++ ++ return r; ++} ++ ++ ++static int mount_points_list_umount(MountPoint **head, bool *changed) { + MountPoint *m, *n; + int n_failed = 0; + +@@ -405,9 +482,13 @@ static int mount_points_list_umount(MountPoint **head, bool *changed, bool log_e + * explicitly remount the super block of that + * alias read-only we hence should be + * relatively safe regarding keeping the fs we +- * can otherwise not see dirty. */ +- log_info("Remounting '%s' read-only with options '%s'.", m->path, options); +- (void) mount(NULL, m->path, NULL, MS_REMOUNT|MS_RDONLY, options); ++ * can otherwise not see dirty. ++ * ++ * Since the remount can hang in the instance of ++ * remote filesystems, we remount asynchronously ++ * and skip the subsequent umount if it fails */ ++ if (remount_with_timeout(m, options, &n_failed) < 0) ++ continue; + } + + /* Skip / and /usr since we cannot unmount that +@@ -420,22 +501,14 @@ static int mount_points_list_umount(MountPoint **head, bool *changed, bool log_e + ) + continue; + +- /* Trying to umount. Using MNT_FORCE causes some +- * filesystems (e.g. FUSE and NFS and other network +- * filesystems) to abort any pending requests and +- * return -EIO rather than blocking indefinitely. +- * If the filesysten is "busy", this may allow processes +- * to die, thus making the filesystem less busy so +- * the unmount might succeed (rather then return EBUSY).*/ +- log_info("Unmounting %s.", m->path); +- if (umount2(m->path, MNT_FORCE) == 0) { ++ /* Trying to umount */ ++ if (umount_with_timeout(m, changed) < 0) ++ n_failed++; ++ else { + if (changed) + *changed = true; + + mount_point_free(head, m); +- } else if (log_error) { +- log_warning_errno(errno, "Could not unmount %s: %m", m->path); +- n_failed++; + } + } + +@@ -550,17 +623,12 @@ int umount_all(bool *changed) { + do { + umount_changed = false; + +- mount_points_list_umount(&mp_list_head, &umount_changed, false); ++ mount_points_list_umount(&mp_list_head, &umount_changed); + if (umount_changed) + *changed = true; + + } while (umount_changed); + +- /* umount one more time with logging enabled */ +- r = mount_points_list_umount(&mp_list_head, &umount_changed, true); +- if (r <= 0) +- goto end; +- + end: + mount_points_list_free(&mp_list_head); + +diff --git a/src/shared/def.h b/src/shared/def.h +index 9e008a6d2..f193ab1f9 100644 +--- a/src/shared/def.h ++++ b/src/shared/def.h +@@ -21,8 +21,6 @@ + along with systemd; If not, see . + ***/ + +-#include "util.h" +- + #define DEFAULT_TIMEOUT_USEC (90*USEC_PER_SEC) + #define DEFAULT_RESTART_USEC (100*USEC_PER_MSEC) + #define DEFAULT_CONFIRM_USEC (30*USEC_PER_SEC) +diff --git a/src/shared/login-shared.c b/src/shared/login-shared.c +index 054c77503..5da0f0583 100644 +--- a/src/shared/login-shared.c ++++ b/src/shared/login-shared.c +@@ -21,6 +21,7 @@ + + #include "login-shared.h" + #include "def.h" ++#include "util.h" + + bool session_id_valid(const char *id) { + assert(id); +diff --git a/src/shared/util.c b/src/shared/util.c +index af0953273..5e27a41da 100644 +--- a/src/shared/util.c ++++ b/src/shared/util.c +@@ -8893,3 +8893,64 @@ uint64_t system_tasks_max_scale(uint64_t v, uint64_t max) { + + return m / max; + } ++ ++/* ++ * Return values: ++ * < 0 : wait_for_terminate_with_timeout() failed to get the state of the ++ * process, the process timed out, the process was terminated by a ++ * signal, or failed for an unknown reason. ++ * >=0 : The process terminated normally with no failures. ++ * ++ * Success is indicated by a return value of zero, a timeout is indicated ++ * by ETIMEDOUT, and all other child failure states are indicated by error ++ * is indicated by a non-zero value. ++*/ ++int wait_for_terminate_with_timeout(pid_t pid, usec_t timeout) { ++ sigset_t mask; ++ int r; ++ usec_t until; ++ ++ assert_se(sigemptyset(&mask) == 0); ++ assert_se(sigaddset(&mask, SIGCHLD) == 0); ++ ++ /* Drop into a sigtimewait-based timeout. Waiting for the ++ * pid to exit. */ ++ until = now(CLOCK_MONOTONIC) + timeout; ++ for (;;) { ++ usec_t n; ++ siginfo_t status = {}; ++ struct timespec ts; ++ ++ n = now(CLOCK_MONOTONIC); ++ if (n >= until) ++ break; ++ ++ r = sigtimedwait(&mask, NULL, timespec_store(&ts, until - n)) < 0 ? -errno : 0; ++ /* Assuming we woke due to the child exiting. */ ++ if (waitid(P_PID, pid, &status, WEXITED|WNOHANG) == 0) { ++ if (status.si_pid == pid) { ++ /* This is the correct child.*/ ++ if (status.si_code == CLD_EXITED) ++ return (status.si_status == 0) ? 0 : -EPROTO; ++ else ++ return -EPROTO; ++ } ++ } ++ /* Not the child, check for errors and proceed appropriately */ ++ if (r < 0) { ++ switch (r) { ++ case -EAGAIN: ++ /* Timed out, child is likely hung. */ ++ return -ETIMEDOUT; ++ case -EINTR: ++ /* Received a different signal and should retry */ ++ continue; ++ default: ++ /* Return any unexpected errors */ ++ return r; ++ } ++ } ++ } ++ ++ return -EPROTO; ++} +diff --git a/src/shared/util.h b/src/shared/util.h +index 526a6fe84..81aef034e 100644 +--- a/src/shared/util.h ++++ b/src/shared/util.h +@@ -22,6 +22,7 @@ + ***/ + + #include ++#include + #include + #include + #include +@@ -1112,3 +1113,17 @@ int parse_percent(const char *p); + + uint64_t system_tasks_max(void); + uint64_t system_tasks_max_scale(uint64_t v, uint64_t max); ++ ++int wait_for_terminate_with_timeout(pid_t pid, usec_t timeout); ++ ++static inline void block_signals_reset(sigset_t *ss) { ++ assert_se(sigprocmask(SIG_SETMASK, ss, NULL) >= 0); ++} ++ ++#define BLOCK_SIGNALS(...) \ ++ _cleanup_(block_signals_reset) _unused_ sigset_t _saved_sigset = ({ \ ++ sigset_t _t; \ ++ assert_se(sigprocmask(SIG_SETMASK, NULL, &_t) == 0); \ ++ assert_se(sigprocmask_many(SIG_BLOCK, __VA_ARGS__, -1) >= 0); \ ++ _t; \ ++ }) diff --git a/SOURCES/0616-core-Implement-sync_with_progress.patch b/SOURCES/0616-core-Implement-sync_with_progress.patch new file mode 100644 index 0000000..11ff498 --- /dev/null +++ b/SOURCES/0616-core-Implement-sync_with_progress.patch @@ -0,0 +1,169 @@ +From ddc1464f3b129ea8490d7da3dd000ab1874b3a4d Mon Sep 17 00:00:00 2001 +From: Kyle Walker +Date: Thu, 14 Dec 2017 11:46:03 -0500 +Subject: [PATCH] core: Implement sync_with_progress() + +In similar fashion to the previous change, sync() operations can stall +endlessly if cache is unable to be written out. In order to avoid an +unbounded hang, the sync takes place within a child process. Every 10 +seconds (SYNC_TIMEOUT_USEC), the value of /proc/meminfo "Dirty" is checked +to verify it is smaller than the last iteration. If the sync is not making +progress for 3 successive iterations (SYNC_PROGRESS_ATTEMPTS), a SIGKILL is +sent to the sync process and the shutdown continues. + +(cherry picked from commit 73ad712fcfea5d8ba475044698d31d2c15d4180d) + +Related: #1571098 +--- + src/core/shutdown.c | 116 +++++++++++++++++++++++++++++++++++++++++++++++++--- + 1 file changed, 111 insertions(+), 5 deletions(-) + +diff --git a/src/core/shutdown.c b/src/core/shutdown.c +index 71f001ac1..0b0a54a7d 100644 +--- a/src/core/shutdown.c ++++ b/src/core/shutdown.c +@@ -53,6 +53,9 @@ + + #define FINALIZE_ATTEMPTS 50 + ++#define SYNC_PROGRESS_ATTEMPTS 3 ++#define SYNC_TIMEOUT_USEC (10*USEC_PER_SEC) ++ + static char* arg_verb; + + static int parse_argv(int argc, char *argv[]) { +@@ -152,6 +155,102 @@ static int switch_root_initramfs(void) { + return switch_root("/run/initramfs", "/oldroot", false, MS_BIND); + } + ++/* Read the following fields from /proc/meminfo: ++ * ++ * NFS_Unstable ++ * Writeback ++ * Dirty ++ * ++ * Return true if the sum of these fields is greater than the previous ++ * value input. For all other issues, report the failure and indicate that ++ * the sync is not making progress. ++ */ ++static bool sync_making_progress(unsigned long long *prev_dirty) { ++ _cleanup_fclose_ FILE *f = NULL; ++ char line[LINE_MAX]; ++ bool r = false; ++ unsigned long long val = 0; ++ ++ f = fopen("/proc/meminfo", "re"); ++ if (!f) ++ return log_warning_errno(errno, "Failed to open /proc/meminfo: %m"); ++ ++ FOREACH_LINE(line, f, log_warning_errno(errno, "Failed to parse /proc/meminfo: %m")) { ++ unsigned long long ull = 0; ++ ++ if (!first_word(line, "NFS_Unstable:") && !first_word(line, "Writeback:") && !first_word(line, "Dirty:")) ++ continue; ++ ++ errno = 0; ++ if (sscanf(line, "%*s %llu %*s", &ull) != 1) { ++ if (errno != 0) ++ log_warning_errno(errno, "Failed to parse /proc/meminfo: %m"); ++ else ++ log_warning("Failed to parse /proc/meminfo"); ++ ++ return false; ++ } ++ ++ val += ull; ++ } ++ ++ r = *prev_dirty > val; ++ ++ *prev_dirty = val; ++ ++ return r; ++} ++ ++static void sync_with_progress(void) { ++ unsigned checks; ++ pid_t pid; ++ int r; ++ unsigned long long dirty = ULONG_LONG_MAX; ++ ++ BLOCK_SIGNALS(SIGCHLD); ++ ++ /* Due to the possiblity of the sync operation hanging, we fork ++ * a child process and monitor the progress. If the timeout ++ * lapses, the assumption is that that particular sync stalled. */ ++ pid = fork(); ++ if (pid < 0) { ++ log_error_errno(errno, "Failed to fork: %m"); ++ return; ++ } ++ ++ if (pid == 0) { ++ /* Start the sync operation here in the child */ ++ sync(); ++ _exit(EXIT_SUCCESS); ++ } ++ ++ log_info("Syncing filesystems and block devices."); ++ ++ /* Start monitoring the sync operation. If more than ++ * SYNC_PROGRESS_ATTEMPTS lapse without progress being made, ++ * we assume that the sync is stalled */ ++ for (checks = 0; checks < SYNC_PROGRESS_ATTEMPTS; checks++) { ++ r = wait_for_terminate_with_timeout(pid, SYNC_TIMEOUT_USEC); ++ if (r == 0) ++ /* Sync finished without error. ++ * (The sync itself does not return an error code) */ ++ return; ++ else if (r == -ETIMEDOUT) { ++ /* Reset the check counter if the "Dirty" value is ++ * decreasing */ ++ if (sync_making_progress(&dirty)) ++ checks = 0; ++ } else { ++ log_error_errno(r, "Failed to sync filesystems and block devices: %m"); ++ return; ++ } ++ } ++ ++ /* Only reached in the event of a timeout. We should issue a kill ++ * to the stray process. */ ++ log_error("Syncing filesystems and block devices - timed out, issuing SIGKILL to PID "PID_FMT".", pid); ++ (void) kill(pid, SIGKILL); ++} + + int main(int argc, char *argv[]) { + bool need_umount, need_swapoff, need_loop_detach, need_dm_detach; +@@ -202,6 +301,13 @@ int main(int argc, char *argv[]) { + /* lock us into memory */ + mlockall(MCL_CURRENT|MCL_FUTURE); + ++ /* Synchronize everything that is not written to disk yet at this point already. This is a good idea so that ++ * slow IO is processed here already and the final process killing spree is not impacted by processes ++ * desperately trying to sync IO to disk within their timeout. Do not remove this sync, data corruption will ++ * result. */ ++ if (!in_container) ++ sync_with_progress(); ++ + log_info("Sending SIGTERM to remaining processes..."); + broadcast_signal(SIGTERM, true, true); + +@@ -338,12 +444,12 @@ int main(int argc, char *argv[]) { + need_loop_detach ? " loop devices," : "", + need_dm_detach ? " DM devices," : ""); + +- /* The kernel will automaticall flush ATA disks and suchlike +- * on reboot(), but the file systems need to be synce'd +- * explicitly in advance. So let's do this here, but not +- * needlessly slow down containers. */ ++ /* The kernel will automatically flush ATA disks and suchlike on reboot(), but the file systems need to be ++ * sync'ed explicitly in advance. So let's do this here, but not needlessly slow down containers. Note that we ++ * sync'ed things already once above, but we did some more work since then which might have caused IO, hence ++ * let's do it once more. Do not remove this sync, data corruption will result. */ + if (!in_container) +- sync(); ++ sync_with_progress(); + + switch (cmd) { + diff --git a/SPECS/systemd.spec b/SPECS/systemd.spec index 6bdbb74..6a7ae36 100644 --- a/SPECS/systemd.spec +++ b/SPECS/systemd.spec @@ -7,7 +7,7 @@ Name: systemd Url: http://www.freedesktop.org/wiki/Software/systemd Version: 219 -Release: 57%{?dist} +Release: 57%{?dist}.1 # For a breakdown of the licensing, see README License: LGPLv2+ and MIT and GPLv2+ Summary: A System and Service Manager @@ -652,6 +652,9 @@ Patch0610: 0610-sd-journal-properly-handle-inotify-queue-overflow.patch Patch0611: 0611-sd-journal-make-sure-it-s-safe-to-call-sd_journal_pr.patch Patch0612: 0612-journalctl-Periodically-call-sd_journal_process-in-j.patch Patch0613: 0613-sd-journal-when-picking-up-a-new-file-compare-inode-.patch +Patch0614: 0614-umount-always-use-MNT_FORCE-in-umount_all-7213.patch +Patch0615: 0615-core-Implement-timeout-based-umount-remount-limit.patch +Patch0616: 0616-core-Implement-sync_with_progress.patch %global num_patches %{lua: c=0; for i,p in ipairs(patches) do c=c+1; end; print(c);} @@ -1625,6 +1628,11 @@ fi %{_mandir}/man8/systemd-resolved.* %changelog +* Mon Jun 25 2018 Lukas Nykryn - 219-57.1 +- umount: always use MNT_FORCE in umount_all() (#7213) (#1571098) +- core: Implement timeout based umount/remount limit (#1571098) +- core: Implement sync_with_progress() (#1571098) + * Tue Feb 20 2018 Lukas Nykryn - 219-57 - sd-journal: properly handle inotify queue overflow (#1540538) - sd-journal: make sure it's safe to call sd_journal_process() before the first sd_journal_wait() (#1540538)