From 6acc9f9f4d552ac34bd4831c09badaf5ffec6b09 Mon Sep 17 00:00:00 2001 From: Pavel Snajdr Date: Thu, 3 Sep 2020 12:11:39 +0200 Subject: [PATCH] Fix zfs_fsync deadlock There are three locks at play, when handling ZIL records. First is a zl_issuer_lock, that is supposed to be held, whenever we're issuing a new record to ZIL. Then there is zl_lock to actually protect the fields in the zilog struct. And then there's a last lock, zcw_lock, which is used to protect the lists of ZIL call-back waiters. The order of locking is supposed to be: zl_issuer_lock -> zcw_lock -> zl_lock; as implied by their usage in the zil_alloc_lwb and zil_commit_waiter_timeout functions. Function zil_commit_waiter_link_lwb goes against this: it is expecting to be entered with zl_lock already held and only then it is taking the zcw_lock. This patch straightens the locking in zil_commit_waiter_link_lwb to take the zl_lock on its own, correcting the locking order. Following is an attached message from the Linux lockdep mechanism describing the potential deadlock - in our production, it was a matter of a few hours to hit this one... (redacted to fit 72 cols) systemd-journal/5561 is trying to acquire lock: (&zilog->zl_lock){+.+.}-{4:4}, at: zil_alloc_lwb+0x1df/0x3e0 but task is already holding lock: (&zcw->zcw_lock){+.+.}-{4:4}, at: zil_commit_impl+0x52b/0x1850 which lock already depends on the new lock. 
the existing dependency chain (in reverse order) is: -> #1 (&zcw->zcw_lock){+.+.}-{4:4}: __mutex_lock+0xac/0x9e0 mutex_lock_nested+0x1b/0x20 zil_commit_waiter_link_lwb+0x51/0x1a0 [zfs] zil_commit_impl+0x12b0/0x1850 [zfs] zil_commit+0x43/0x60 [zfs] zpl_writepages+0xf8/0x1a0 [zfs] do_writepages+0x43/0xf0 __filemap_fdatawrite_range+0xd5/0x110 filemap_write_and_wait_range+0x4b/0xb0 zpl_fsync+0x4d/0xb0 [zfs] vfs_fsync_range+0x49/0x80 do_fsync+0x3d/0x70 __x64_sys_fsync+0x14/0x20 do_syscall_64+0x38/0x90 entry_SYSCALL_64_after_hwframe+0x44/0xa9 -> #0 (&zilog->zl_lock){+.+.}-{4:4}: __lock_acquire+0x12b6/0x2480 lock_acquire+0xab/0x380 __mutex_lock+0xac/0x9e0 mutex_lock_nested+0x1b/0x20 zil_alloc_lwb+0x1df/0x3e0 [zfs] zil_lwb_write_issue+0x265/0x3f0 [zfs] zil_commit_impl+0x577/0x1850 [zfs] zil_commit+0x43/0x60 [zfs] zpl_writepages+0xf8/0x1a0 [zfs] do_writepages+0x43/0xf0 __filemap_fdatawrite_range+0xd5/0x110 filemap_write_and_wait_range+0x4b/0xb0 zpl_fsync+0x4d/0xb0 [zfs] vfs_fsync_range+0x49/0x80 do_fsync+0x3d/0x70 __x64_sys_fsync+0x14/0x20 do_syscall_64+0x38/0x90 entry_SYSCALL_64_after_hwframe+0x44/0xa9 other info that might help us debug this: Possible unsafe locking scenario: CPU0 CPU1 ---- ---- lock(&zcw->zcw_lock); lock(&zilog->zl_lock); lock(&zcw->zcw_lock); lock(&zilog->zl_lock); *** DEADLOCK *** 2 locks held by systemd-journal/5561: #0: (&zilog->zl_issuer_lock){+.+.}-{4:4}, at: zil_commit_impl+0x4dc... 
#1: (&zcw->zcw_lock){+.+.}-{4:4}, at: zil_commit_impl+0x52b/0x1850 Fixes #7038 Fixes #10440 Signed-off-by: Pavel Snajdr --- module/zfs/zil.c | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) diff --git a/module/zfs/zil.c b/module/zfs/zil.c index 9dc20ba14f37..f1b737815200 100644 --- a/module/zfs/zil.c +++ b/module/zfs/zil.c @@ -998,11 +998,8 @@ zil_commit_waiter_skip(zil_commit_waiter_t *zcw) static void zil_commit_waiter_link_lwb(zil_commit_waiter_t *zcw, lwb_t *lwb) { - /* - * The lwb_waiters field of the lwb is protected by the zilog's - * zl_lock, thus it must be held when calling this function. - */ - ASSERT(MUTEX_HELD(&lwb->lwb_zilog->zl_lock)); + ASSERT(MUTEX_HELD(&lwb->lwb_zilog->zl_issuer_lock)); + ASSERT(!MUTEX_HELD(&lwb->lwb_zilog->zl_lock)); mutex_enter(&zcw->zcw_lock); ASSERT(!list_link_active(&zcw->zcw_node)); @@ -1012,8 +1009,10 @@ zil_commit_waiter_link_lwb(zil_commit_waiter_t *zcw, lwb_t *lwb) lwb->lwb_state == LWB_STATE_ISSUED || lwb->lwb_state == LWB_STATE_WRITE_DONE); + mutex_enter(&lwb->lwb_zilog->zl_lock); list_insert_tail(&lwb->lwb_waiters, zcw); zcw->zcw_lwb = lwb; + mutex_exit(&lwb->lwb_zilog->zl_lock); mutex_exit(&zcw->zcw_lock); } @@ -1635,10 +1634,8 @@ zil_lwb_commit(zilog_t *zilog, itx_t *itx, lwb_t *lwb) * For more details, see the comment above zil_commit(). */ if (lrc->lrc_txtype == TX_COMMIT) { - mutex_enter(&zilog->zl_lock); zil_commit_waiter_link_lwb(itx->itx_private, lwb); itx->itx_private = NULL; - mutex_exit(&zilog->zl_lock); return (lwb); } @@ -2168,8 +2165,8 @@ zil_prune_commit_list(zilog_t *zilog) break; mutex_enter(&zilog->zl_lock); - lwb_t *last_lwb = zilog->zl_last_lwb_opened; + if (last_lwb == NULL || last_lwb->lwb_state == LWB_STATE_FLUSH_DONE) { /* @@ -2178,13 +2175,14 @@ zil_prune_commit_list(zilog_t *zilog) * never any itx's for it to wait on), so it's * safe to skip this waiter and mark it done. 
*/ + mutex_exit(&zilog->zl_lock); zil_commit_waiter_skip(itx->itx_private); } else { + mutex_exit(&zilog->zl_lock); zil_commit_waiter_link_lwb(itx->itx_private, last_lwb); itx->itx_private = NULL; } - mutex_exit(&zilog->zl_lock); list_remove(&zilog->zl_itx_commit_list, itx); zil_itx_destroy(itx);