From 13e55573d3ac656165610ebc9a8f4da17f00d08a Mon Sep 17 00:00:00 2001 From: Shaan Nobee Date: Wed, 4 May 2022 00:23:26 +0400 Subject: [PATCH] Speed up WB_SYNC_NONE when a WB_SYNC_ALL occurs simultaneously Page writebacks with WB_SYNC_NONE can take several seconds to complete since they wait for the transaction group to close before being committed. This is usually not a problem since the caller does not need to wait. However, if we're simultaneously doing a writeback with WB_SYNC_ALL (e.g via msync), the latter can block for several seconds (up to zfs_txg_timeout) due to the active WB_SYNC_NONE writeback since it needs to wait for the transaction to complete and the PG_writeback bit to be cleared. This commit deals with 2 cases: - No page writeback is active. A WB_SYNC_ALL page writeback starts and even completes. But when it's about to check if the PG_writeback bit has been cleared, another writeback with WB_SYNC_NONE starts. The sync page writeback ends up waiting for the non-sync page writeback to complete. - A page writeback with WB_SYNC_NONE is already active when a WB_SYNC_ALL writeback starts. The WB_SYNC_ALL writeback ends up waiting for the WB_SYNC_NONE writeback. The fix works by carefully keeping track of active sync/non-sync writebacks and committing when beneficial. Reviewed-by: Brian Behlendorf Signed-off-by: Shaan Nobee Closes #12662 Closes #12790 --- include/os/linux/zfs/sys/trace_acl.h | 14 +- include/os/linux/zfs/sys/zfs_vnops_os.h | 2 +- include/sys/zfs_znode.h | 2 + module/os/freebsd/zfs/zfs_znode.c | 8 + module/os/linux/zfs/zfs_ctldir.c | 2 + module/os/linux/zfs/zfs_vnops_os.c | 52 +++++- module/os/linux/zfs/zfs_znode.c | 8 + module/os/linux/zfs/zpl_file.c | 55 ++++++- module/zfs/zfs_vnops.c | 2 + tests/runfiles/common.run | 2 +- tests/test-runner/bin/zts-report.py.in | 1 + tests/zfs-tests/cmd/.gitignore | 1 + tests/zfs-tests/cmd/mmap_sync.c | 152 ++++++++++++++++++ tests/zfs-tests/include/commands.cfg | 1 + .../functional/mmap/mmap_sync_001_pos.ksh | 63 ++++++++ 15 files changed, 347 insertions(+), 18 deletions(-) create mode 100644 tests/zfs-tests/cmd/mmap_sync.c create mode 100755 tests/zfs-tests/tests/functional/mmap/mmap_sync_001_pos.ksh diff --git a/include/os/linux/zfs/sys/trace_acl.h b/include/os/linux/zfs/sys/trace_acl.h index f8e0aa8acaf8..35bf78bed94e 100644 --- a/include/os/linux/zfs/sys/trace_acl.h +++ b/include/os/linux/zfs/sys/trace_acl.h @@ -58,6 +58,8 @@ DECLARE_EVENT_CLASS(zfs_ace_class, __field(uint64_t, z_size) __field(uint64_t, z_pflags) __field(uint32_t, z_sync_cnt) + __field(uint32_t, z_sync_writes_cnt) + __field(uint32_t, z_async_writes_cnt) __field(mode_t, z_mode) __field(boolean_t, z_is_sa) __field(boolean_t, z_is_mapped) @@ -91,6 +93,8 @@ DECLARE_EVENT_CLASS(zfs_ace_class, __entry->z_size = zn->z_size; __entry->z_pflags = zn->z_pflags; __entry->z_sync_cnt = zn->z_sync_cnt; + __entry->z_sync_writes_cnt = zn->z_sync_writes_cnt; + __entry->z_async_writes_cnt = zn->z_async_writes_cnt; __entry->z_mode = zn->z_mode; __entry->z_is_sa = zn->z_is_sa; __entry->z_is_mapped = zn->z_is_mapped; @@ -116,16 +120,18 @@ DECLARE_EVENT_CLASS(zfs_ace_class, TP_printk("zn { id %llu unlinked %u atime_dirty %u " "zn_prefetch %u blksz %u seq %u " "mapcnt %llu size %llu pflags %llu " - "sync_cnt %u mode 0x%x is_sa %d " - "is_mapped %d is_ctldir %d is_stale %d inode { " + "sync_cnt %u sync_writes_cnt %u async_writes_cnt %u " + "mode 0x%x is_sa %d is_mapped %d " + "is_ctldir %d is_stale %d inode { " "uid %u gid %u ino %lu nlink %u size %lli " "blkbits %u bytes %u mode 
0x%x generation %x } } " "ace { type %u flags %u access_mask %u } mask_matched %u", __entry->z_id, __entry->z_unlinked, __entry->z_atime_dirty, __entry->z_zn_prefetch, __entry->z_blksz, __entry->z_seq, __entry->z_mapcnt, __entry->z_size, - __entry->z_pflags, __entry->z_sync_cnt, __entry->z_mode, - __entry->z_is_sa, __entry->z_is_mapped, + __entry->z_pflags, __entry->z_sync_cnt, + __entry->z_sync_writes_cnt, __entry->z_async_writes_cnt, + __entry->z_mode, __entry->z_is_sa, __entry->z_is_mapped, __entry->z_is_ctldir, __entry->z_is_stale, __entry->i_uid, __entry->i_gid, __entry->i_ino, __entry->i_nlink, __entry->i_size, __entry->i_blkbits, diff --git a/include/os/linux/zfs/sys/zfs_vnops_os.h b/include/os/linux/zfs/sys/zfs_vnops_os.h index 47f91e4a6cf4..1d6a58d56b67 100644 --- a/include/os/linux/zfs/sys/zfs_vnops_os.h +++ b/include/os/linux/zfs/sys/zfs_vnops_os.h @@ -70,7 +70,7 @@ extern int zfs_space(znode_t *zp, int cmd, flock64_t *bfp, int flag, extern int zfs_fid(struct inode *ip, fid_t *fidp); extern int zfs_getpage(struct inode *ip, struct page *pl[], int nr_pages); extern int zfs_putpage(struct inode *ip, struct page *pp, - struct writeback_control *wbc); + struct writeback_control *wbc, boolean_t for_sync); extern int zfs_dirty_inode(struct inode *ip, int flags); extern int zfs_map(struct inode *ip, offset_t off, caddr_t *addrp, size_t len, unsigned long vm_flags); diff --git a/include/sys/zfs_znode.h b/include/sys/zfs_znode.h index 127fd8736ffc..098cf9dbc16f 100644 --- a/include/sys/zfs_znode.h +++ b/include/sys/zfs_znode.h @@ -199,6 +199,8 @@ typedef struct znode { uint64_t z_size; /* file size (cached) */ uint64_t z_pflags; /* pflags (cached) */ uint32_t z_sync_cnt; /* synchronous open count */ + uint32_t z_sync_writes_cnt; /* synchronous write count */ + uint32_t z_async_writes_cnt; /* asynchronous write count */ mode_t z_mode; /* mode (cached) */ kmutex_t z_acl_lock; /* acl data lock */ zfs_acl_t *z_acl_cached; /* cached acl */ diff --git a/module/os/freebsd/zfs/zfs_znode.c b/module/os/freebsd/zfs/zfs_znode.c index 5ca92f332c6f..84cff13f0dfe 100644 --- a/module/os/freebsd/zfs/zfs_znode.c +++ b/module/os/freebsd/zfs/zfs_znode.c @@ -153,6 +153,9 @@ zfs_znode_cache_constructor(void *buf, void *arg, int kmflags) zp->z_xattr_cached = NULL; zp->z_xattr_parent = 0; zp->z_vnode = NULL; + zp->z_sync_writes_cnt = 0; + zp->z_async_writes_cnt = 0; + return (0); } @@ -172,6 +175,9 @@ zfs_znode_cache_destructor(void *buf, void *arg) ASSERT3P(zp->z_acl_cached, ==, NULL); ASSERT3P(zp->z_xattr_cached, ==, NULL); + + ASSERT0(atomic_load_32(&zp->z_sync_writes_cnt)); + ASSERT0(atomic_load_32(&zp->z_async_writes_cnt)); } @@ -453,6 +459,8 @@ zfs_znode_alloc(zfsvfs_t *zfsvfs, dmu_buf_t *db, int blksz, zp->z_blksz = blksz; zp->z_seq = 0x7A4653; zp->z_sync_cnt = 0; + zp->z_sync_writes_cnt = 0; + zp->z_async_writes_cnt = 0; #if __FreeBSD_version >= 1300139 atomic_store_ptr(&zp->z_cached_symlink, NULL); #endif diff --git a/module/os/linux/zfs/zfs_ctldir.c b/module/os/linux/zfs/zfs_ctldir.c index 7723b52563c0..aae19f6346fd 100644 --- a/module/os/linux/zfs/zfs_ctldir.c +++ b/module/os/linux/zfs/zfs_ctldir.c @@ -496,6 +496,8 @@ zfsctl_inode_alloc(zfsvfs_t *zfsvfs, uint64_t id, zp->z_pflags = 0; zp->z_mode = 0; zp->z_sync_cnt = 0; + zp->z_sync_writes_cnt = 0; + zp->z_async_writes_cnt = 0; ip->i_generation = 0; ip->i_ino = id; ip->i_mode = (S_IFDIR | S_IRWXUGO); diff --git a/module/os/linux/zfs/zfs_vnops_os.c b/module/os/linux/zfs/zfs_vnops_os.c index 2ba90d889369..d6ff838806eb 100644 --- 
a/module/os/linux/zfs/zfs_vnops_os.c +++ b/module/os/linux/zfs/zfs_vnops_os.c @@ -3396,7 +3396,7 @@ zfs_link(znode_t *tdzp, znode_t *szp, char *name, cred_t *cr, } static void -zfs_putpage_commit_cb(void *arg) +zfs_putpage_sync_commit_cb(void *arg) { struct page *pp = arg; @@ -3404,13 +3404,26 @@ zfs_putpage_commit_cb(void *arg) end_page_writeback(pp); } +static void +zfs_putpage_async_commit_cb(void *arg) +{ + struct page *pp = arg; + znode_t *zp = ITOZ(pp->mapping->host); + + ClearPageError(pp); + end_page_writeback(pp); + atomic_dec_32(&zp->z_async_writes_cnt); +} + /* * Push a page out to disk, once the page is on stable storage the * registered commit callback will be run as notification of completion. * - * IN: ip - page mapped for inode. - * pp - page to push (page is locked) - * wbc - writeback control data + * IN: ip - page mapped for inode. + * pp - page to push (page is locked) + * wbc - writeback control data + * for_sync - does the caller intend to wait synchronously for the + * page writeback to complete? * * RETURN: 0 if success * error code if failure @@ -3419,7 +3432,8 @@ zfs_putpage_commit_cb(void *arg) * ip - ctime|mtime updated */ int -zfs_putpage(struct inode *ip, struct page *pp, struct writeback_control *wbc) +zfs_putpage(struct inode *ip, struct page *pp, struct writeback_control *wbc, + boolean_t for_sync) { znode_t *zp = ITOZ(ip); zfsvfs_t *zfsvfs = ITOZSB(ip); @@ -3517,6 +3531,16 @@ zfs_putpage(struct inode *ip, struct page *pp, struct writeback_control *wbc) zfs_rangelock_exit(lr); if (wbc->sync_mode != WB_SYNC_NONE) { + /* + * Speed up any non-sync page writebacks since + * they may take several seconds to complete. + * Refer to the comment in zpl_fsync() (when + * HAVE_FSYNC_RANGE is defined) for details. + */ + if (atomic_load_32(&zp->z_async_writes_cnt) > 0) { + zil_commit(zfsvfs->z_log, zp->z_id); + } + if (PageWriteback(pp)) #ifdef HAVE_PAGEMAP_FOLIO_WAIT_BIT folio_wait_bit(page_folio(pp), PG_writeback); @@ -3542,6 +3566,8 @@ zfs_putpage(struct inode *ip, struct page *pp, struct writeback_control *wbc) * was in fact not skipped and should not be counted as if it were. */ wbc->pages_skipped--; + if (!for_sync) + atomic_inc_32(&zp->z_async_writes_cnt); set_page_writeback(pp); unlock_page(pp); @@ -3563,6 +3589,8 @@ zfs_putpage(struct inode *ip, struct page *pp, struct writeback_control *wbc) #endif ClearPageError(pp); end_page_writeback(pp); + if (!for_sync) + atomic_dec_32(&zp->z_async_writes_cnt); zfs_rangelock_exit(lr); ZFS_EXIT(zfsvfs); return (err); @@ -3587,7 +3615,9 @@ zfs_putpage(struct inode *ip, struct page *pp, struct writeback_control *wbc) err = sa_bulk_update(zp->z_sa_hdl, bulk, cnt, tx); zfs_log_write(zfsvfs->z_log, tx, TX_WRITE, zp, pgoff, pglen, 0, - zfs_putpage_commit_cb, pp); + for_sync ? zfs_putpage_sync_commit_cb : + zfs_putpage_async_commit_cb, pp); + dmu_tx_commit(tx); zfs_rangelock_exit(lr); @@ -3599,6 +3629,16 @@ zfs_putpage(struct inode *ip, struct page *pp, struct writeback_control *wbc) * performance reasons. */ zil_commit(zfsvfs->z_log, zp->z_id); + } else if (!for_sync && atomic_load_32(&zp->z_sync_writes_cnt) > 0) { + /* + * If the caller does not intend to wait synchronously + * for this page writeback to complete and there are active + * synchronous calls on this file, do a commit so that + * the latter don't accidentally end up waiting for + * our writeback to complete. Refer to the comment in + * zpl_fsync() (when HAVE_FSYNC_RANGE is defined) for details. 
+ */ + zil_commit(zfsvfs->z_log, zp->z_id); } dataset_kstats_update_write_kstats(&zfsvfs->z_kstat, pglen); diff --git a/module/os/linux/zfs/zfs_znode.c b/module/os/linux/zfs/zfs_znode.c index b76e65d16822..d921f2b07463 100644 --- a/module/os/linux/zfs/zfs_znode.c +++ b/module/os/linux/zfs/zfs_znode.c @@ -134,6 +134,9 @@ zfs_znode_cache_constructor(void *buf, void *arg, int kmflags) zp->z_acl_cached = NULL; zp->z_xattr_cached = NULL; zp->z_xattr_parent = 0; + zp->z_sync_writes_cnt = 0; + zp->z_async_writes_cnt = 0; + return (0); } @@ -154,6 +157,9 @@ zfs_znode_cache_destructor(void *buf, void *arg) ASSERT3P(zp->z_dirlocks, ==, NULL); ASSERT3P(zp->z_acl_cached, ==, NULL); ASSERT3P(zp->z_xattr_cached, ==, NULL); + + ASSERT0(atomic_load_32(&zp->z_sync_writes_cnt)); + ASSERT0(atomic_load_32(&zp->z_async_writes_cnt)); } static int @@ -554,6 +560,8 @@ zfs_znode_alloc(zfsvfs_t *zfsvfs, dmu_buf_t *db, int blksz, zp->z_blksz = blksz; zp->z_seq = 0x7A4653; zp->z_sync_cnt = 0; + zp->z_sync_writes_cnt = 0; + zp->z_async_writes_cnt = 0; zfs_znode_sa_init(zfsvfs, zp, db, obj_type, hdl); diff --git a/module/os/linux/zfs/zpl_file.c b/module/os/linux/zfs/zpl_file.c index 2d7864ddb3c2..8b84eb795fc3 100644 --- a/module/os/linux/zfs/zpl_file.c +++ b/module/os/linux/zfs/zpl_file.c @@ -165,17 +165,56 @@ static int zpl_fsync(struct file *filp, loff_t start, loff_t end, int datasync) { struct inode *inode = filp->f_mapping->host; + znode_t *zp = ITOZ(inode); + zfsvfs_t *zfsvfs = ITOZSB(inode); cred_t *cr = CRED(); int error; fstrans_cookie_t cookie; + /* + * The variables z_sync_writes_cnt and z_async_writes_cnt work in + * tandem so that sync writes can detect if there are any non-sync + * writes going on and vice-versa. The "vice-versa" part to this logic + * is located in zfs_putpage() where non-sync writes check if there are + * any ongoing sync writes. If any sync and non-sync writes overlap, + * we do a commit to complete the non-sync writes since the latter can + * potentially take several seconds to complete and thus block sync + * writes in the upcoming call to filemap_write_and_wait_range(). + */ + atomic_inc_32(&zp->z_sync_writes_cnt); + /* + * If the following check does not detect an overlapping non-sync write + * (say because it's just about to start), then it is guaranteed that + * the non-sync write will detect this sync write. This is because we + * always increment z_sync_writes_cnt / z_async_writes_cnt before doing + * the check on z_async_writes_cnt / z_sync_writes_cnt here and in + * zfs_putpage() respectively. + */ + if (atomic_load_32(&zp->z_async_writes_cnt) > 0) { + ZPL_ENTER(zfsvfs); + zil_commit(zfsvfs->z_log, zp->z_id); + ZPL_EXIT(zfsvfs); + } + error = filemap_write_and_wait_range(inode->i_mapping, start, end); + + /* + * The sync write is not complete yet but we decrement + * z_sync_writes_cnt since zfs_fsync() increments and decrements + * it internally. If a non-sync write starts just after the decrement + * operation but before we call zfs_fsync(), it may not detect this + * overlapping sync write but it does not matter since we have already + * gone past filemap_write_and_wait_range() and we won't block due to + * the non-sync write. 
+ */ + atomic_dec_32(&zp->z_sync_writes_cnt); + if (error) return (error); crhold(cr); cookie = spl_fstrans_mark(); - error = -zfs_fsync(ITOZ(inode), datasync, cr); + error = -zfs_fsync(zp, datasync, cr); spl_fstrans_unmark(cookie); crfree(cr); ASSERT3S(error, <=, 0); @@ -680,14 +719,14 @@ zpl_readahead(struct readahead_control *ractl) static int zpl_putpage(struct page *pp, struct writeback_control *wbc, void *data) { - struct address_space *mapping = data; + boolean_t *for_sync = data; fstrans_cookie_t cookie; ASSERT(PageLocked(pp)); ASSERT(!PageWriteback(pp)); cookie = spl_fstrans_mark(); - (void) zfs_putpage(mapping->host, pp, wbc); + (void) zfs_putpage(pp->mapping->host, pp, wbc, *for_sync); spl_fstrans_unmark(cookie); return (0); @@ -714,8 +753,9 @@ zpl_writepages(struct address_space *mapping, struct writeback_control *wbc) * we run it once in non-SYNC mode so that the ZIL gets all the data, * and then we commit it all in one go. */ + boolean_t for_sync = (sync_mode == WB_SYNC_ALL); wbc->sync_mode = WB_SYNC_NONE; - result = write_cache_pages(mapping, wbc, zpl_putpage, mapping); + result = write_cache_pages(mapping, wbc, zpl_putpage, &for_sync); if (sync_mode != wbc->sync_mode) { ZPL_ENTER(zfsvfs); ZPL_VERIFY_ZP(zp); @@ -731,7 +771,8 @@ zpl_writepages(struct address_space *mapping, struct writeback_control *wbc) * details). That being said, this is a no-op in most cases. */ wbc->sync_mode = sync_mode; - result = write_cache_pages(mapping, wbc, zpl_putpage, mapping); + result = write_cache_pages(mapping, wbc, zpl_putpage, + &for_sync); } return (result); } @@ -748,7 +789,9 @@ zpl_writepage(struct page *pp, struct writeback_control *wbc) if (ITOZSB(pp->mapping->host)->z_os->os_sync == ZFS_SYNC_ALWAYS) wbc->sync_mode = WB_SYNC_ALL; - return (zpl_putpage(pp, wbc, pp->mapping)); + boolean_t for_sync = (wbc->sync_mode == WB_SYNC_ALL); + + return (zpl_putpage(pp, wbc, &for_sync)); } /* diff --git a/module/zfs/zfs_vnops.c b/module/zfs/zfs_vnops.c index 30a4a38bac9a..a9737c19ce1d 100644 --- a/module/zfs/zfs_vnops.c +++ b/module/zfs/zfs_vnops.c @@ -68,7 +68,9 @@ zfs_fsync(znode_t *zp, int syncflag, cred_t *cr) if (zfsvfs->z_os->os_sync != ZFS_SYNC_DISABLED) { ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(zp); + atomic_inc_32(&zp->z_sync_writes_cnt); zil_commit(zfsvfs->z_log, zp->z_id); + atomic_dec_32(&zp->z_sync_writes_cnt); ZFS_EXIT(zfsvfs); } tsd_set(zfs_fsyncer_key, NULL); diff --git a/tests/runfiles/common.run b/tests/runfiles/common.run index 9c3db923ffe1..4ff46e7af35f 100644 --- a/tests/runfiles/common.run +++ b/tests/runfiles/common.run @@ -685,7 +685,7 @@ tests = ['migration_001_pos', 'migration_002_pos', 'migration_003_pos', tags = ['functional', 'migration'] [tests/functional/mmap] -tests = ['mmap_write_001_pos', 'mmap_read_001_pos', 'mmap_seek_001_pos'] +tests = ['mmap_write_001_pos', 'mmap_read_001_pos', 'mmap_seek_001_pos', 'mmap_sync_001_pos'] tags = ['functional', 'mmap'] [tests/functional/mount] diff --git a/tests/test-runner/bin/zts-report.py.in b/tests/test-runner/bin/zts-report.py.in index 08256482aaf8..ddb9bb7eed1d 100755 --- a/tests/test-runner/bin/zts-report.py.in +++ b/tests/test-runner/bin/zts-report.py.in @@ -168,6 +168,7 @@ if sys.platform.startswith('freebsd'): 'cli_root/zfs_unshare/zfs_unshare_008_pos': ['SKIP', na_reason], 'link_count/link_count_001': ['SKIP', na_reason], 'casenorm/mixed_create_failure': ['FAIL', 13215], + 'mmap/mmap_sync_001_pos': ['SKIP', na_reason], }) elif sys.platform.startswith('linux'): known.update({ diff --git a/tests/zfs-tests/cmd/.gitignore 
b/tests/zfs-tests/cmd/.gitignore index c2ef0e8c8349..1830cab76fee 100644 --- a/tests/zfs-tests/cmd/.gitignore +++ b/tests/zfs-tests/cmd/.gitignore @@ -19,6 +19,7 @@ /mmap_exec /mmap_libaio /mmap_seek +/mmap_sync /mmapwrite /nvlist_to_lua /randfree_file diff --git a/tests/zfs-tests/cmd/mmap_sync.c b/tests/zfs-tests/cmd/mmap_sync.c new file mode 100644 index 000000000000..0e4bba37d7be --- /dev/null +++ b/tests/zfs-tests/cmd/mmap_sync.c @@ -0,0 +1,152 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://opensource.org/licenses/CDDL-1.0. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +#include <sys/types.h> +#include <sys/stat.h> +#include <sys/time.h> +#include <sys/mman.h> +#include <fcntl.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <unistd.h> + +static void +cleanup(char *file) +{ + remove(file); +} + +int +main(int argc, char *argv[]) +{ + char *testdir = getenv("TESTDIR"); + if (!testdir) { + fprintf(stderr, "environment variable TESTDIR not set\n"); + return (1); + } + + struct stat st; + umask(0); + if (stat(testdir, &st) != 0 && + mkdir(testdir, 0777) != 0) { + perror("mkdir"); + return (1); + } + + if (argc > 3) { + fprintf(stderr, "usage: %s " + "[run time in mins] " + "[max msync time in ms]\n", argv[0]); + return (1); + } + + int run_time_mins = 5; + if (argc >= 2) { + run_time_mins = atoi(argv[1]); + } + + int max_msync_time_ms = 1000; + if (argc >= 3) { + max_msync_time_ms = atoi(argv[2]); + } + + char filepath[512]; + filepath[0] = '\0'; + char *file = &filepath[0]; + + strcat(file, testdir); + strcat(file, "/msync_file"); + + const int LEN = 8; + cleanup(file); + + int fd = open(file, O_CREAT | O_RDWR, S_IRUSR | S_IWUSR | + S_IRGRP | S_IROTH); + + if (fd == -1) { + (void) fprintf(stderr, "%s: %s: ", argv[0], file); + perror("open"); + return (1); + } + + if (ftruncate(fd, LEN) != 0) { + perror("ftruncate"); + cleanup(file); + return (1); + } + + void *ptr = mmap(NULL, LEN, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0); + + if (ptr == MAP_FAILED) { + perror("mmap"); + cleanup(file); + return (1); + } + + struct timeval tstart; + gettimeofday(&tstart, NULL); + + long long x = 0LL; + + for (;;) { + *((long long *)ptr) = x; + x++; + + struct timeval t1, t2; + gettimeofday(&t1, NULL); + if (msync(ptr, LEN, MS_SYNC|MS_INVALIDATE) != 0) { + perror("msync"); + cleanup(file); + return (1); + } + + gettimeofday(&t2, NULL); + + double elapsed = (t2.tv_sec - t1.tv_sec) * 1000.0; + elapsed += ((t2.tv_usec - t1.tv_usec) / 1000.0); + if (elapsed > max_msync_time_ms) { + fprintf(stderr, "slow msync: %f ms\n", elapsed); + munmap(ptr, LEN); + cleanup(file); + return (1); + } + + double elapsed_start = (t2.tv_sec - tstart.tv_sec) * 1000.0; + elapsed_start += ((t2.tv_usec - tstart.tv_usec) / 1000.0); + if (elapsed_start > run_time_mins * 60 * 1000) { + break; + } + } + + if (munmap(ptr, LEN) != 0) { + perror("munmap"); + cleanup(file); + return (1); + } + + if (close(fd) != 0)
{ + perror("close"); + } + + cleanup(file); + return (0); +} diff --git a/tests/zfs-tests/include/commands.cfg b/tests/zfs-tests/include/commands.cfg index d51518049e40..9dc2b4d0e08b 100644 --- a/tests/zfs-tests/include/commands.cfg +++ b/tests/zfs-tests/include/commands.cfg @@ -194,6 +194,7 @@ export ZFSTEST_FILES='badsend mmap_exec mmap_libaio mmap_seek + mmap_sync mmapwrite nvlist_to_lua randfree_file diff --git a/tests/zfs-tests/tests/functional/mmap/mmap_sync_001_pos.ksh b/tests/zfs-tests/tests/functional/mmap/mmap_sync_001_pos.ksh new file mode 100755 index 000000000000..b764d6607ba6 --- /dev/null +++ b/tests/zfs-tests/tests/functional/mmap/mmap_sync_001_pos.ksh @@ -0,0 +1,63 @@ +#!/bin/ksh -p + +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright (c) 2015, 2016 by Delphix. All rights reserved. +# + +. $STF_SUITE/include/libtest.shlib + +# +# DESCRIPTION: +# msync()s of mmap()'ed file should complete quickly during +# background dirty page writebacks by the kernel. +# + +function cleanup +{ + log_must eval "echo $saved_vm_dirty_expire_centisecs > /proc/sys/vm/dirty_expire_centisecs" + log_must eval "echo $saved_vm_dirty_background_ratio > /proc/sys/vm/dirty_background_ratio" + log_must eval "echo $saved_vm_dirty_writeback_centisecs > /proc/sys/vm/dirty_writeback_centisecs" + + # revert to some sensible defaults if the values we saved + # were incorrect due to a previous run being interrupted + if [ $(</proc/sys/vm/dirty_expire_centisecs) -eq 1 ]; then + log_must eval "echo 3000 > /proc/sys/vm/dirty_expire_centisecs" + fi + + if [ $(</proc/sys/vm/dirty_background_ratio) -eq 0 ]; then + log_must eval "echo 10 > /proc/sys/vm/dirty_background_ratio" + fi + + if [ $(</proc/sys/vm/dirty_writeback_centisecs) -eq 1 ]; then + log_must eval "echo 500 > /proc/sys/vm/dirty_writeback_centisecs" + fi +} + +if ! is_linux; then + log_unsupported "Only supported on Linux, requires /proc/sys/vm/ tunables" +fi + +log_onexit cleanup +log_assert "Run the tests for mmap_sync" + +read -r saved_vm_dirty_expire_centisecs < /proc/sys/vm/dirty_expire_centisecs +read -r saved_vm_dirty_background_ratio < /proc/sys/vm/dirty_background_ratio +read -r saved_vm_dirty_writeback_centisecs < /proc/sys/vm/dirty_writeback_centisecs + +log_must eval "echo 1 > /proc/sys/vm/dirty_expire_centisecs" +log_must eval "echo 1 > /proc/sys/vm/dirty_background_bytes" +log_must eval "echo 1 > /proc/sys/vm/dirty_writeback_centisecs" + +log_must mmap_sync +log_pass "mmap_sync tests passed."
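Note on the handshake used by this patch: zpl_fsync() increments z_sync_writes_cnt before it checks z_async_writes_cnt, and the non-sync zfs_putpage() path increments z_async_writes_cnt before it checks z_sync_writes_cnt, so two writebacks that genuinely overlap cannot both miss each other; the worst a race can cause is one redundant zil_commit(), which is harmless. The stand-alone sketch below models that publish-then-check pattern with C11 atomics. It is illustrative only, not ZFS code: commit_log(), sync_writeback() and async_writeback() are hypothetical stand-ins for zil_commit(), zpl_fsync() and the for_sync == B_FALSE zfs_putpage() path, and the counter lifetimes are simplified.

/*
 * Illustrative model (assumed names, not ZFS code) of the
 * increment-before-check handshake between sync and async writebacks.
 */
#include <stdatomic.h>
#include <stdio.h>

static atomic_uint sync_writes_cnt;	/* plays the role of z_sync_writes_cnt */
static atomic_uint async_writes_cnt;	/* plays the role of z_async_writes_cnt */

/* Stand-in for zil_commit(): push pending writes to stable storage now. */
static void
commit_log(void)
{
	(void) puts("commit");
}

/* Models zpl_fsync(): publish, then look for overlapping async writebacks. */
static void
sync_writeback(void)
{
	atomic_fetch_add(&sync_writes_cnt, 1);
	if (atomic_load(&async_writes_cnt) > 0)
		commit_log();	/* avoid waiting up to zfs_txg_timeout behind them */
	/* filemap_write_and_wait_range() would run here */
	atomic_fetch_sub(&sync_writes_cnt, 1);
}

/* Models the non-sync zfs_putpage() path: publish, then look for sync waiters. */
static void
async_writeback(void)
{
	atomic_fetch_add(&async_writes_cnt, 1);
	/* the page would be logged and queued for the open txg here */
	if (atomic_load(&sync_writes_cnt) > 0)
		commit_log();	/* a sync waiter exists: commit now, not at txg close */
	atomic_fetch_sub(&async_writes_cnt, 1);	/* real code defers this to the commit callback */
}

int
main(void)
{
	/*
	 * Simulate an overlap by hand: an async writeback has published its
	 * counter but has not completed when the sync path runs, so the sync
	 * path detects it and issues one commit.
	 */
	atomic_fetch_add(&async_writes_cnt, 1);
	sync_writeback();
	atomic_fetch_sub(&async_writes_cnt, 1);

	async_writeback();	/* no sync waiter now, so no extra commit */
	return (0);
}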