Improve dirty page writeout performance #2250

Closed
wants to merge 1 commit into from
6 changes: 4 additions & 2 deletions include/sys/zfs_vnops.h
@@ -74,8 +74,10 @@ extern int zfs_getsecattr(struct inode *ip, vsecattr_t *vsecp, int flag,
extern int zfs_setsecattr(struct inode *ip, vsecattr_t *vsecp, int flag,
cred_t *cr);
extern int zfs_getpage(struct inode *ip, struct page *pl[], int nr_pages);
extern int zfs_putpage(struct inode *ip, struct page *pp,
struct writeback_control *wbc);
extern int zfs_putpage_single(struct page *pp, struct writeback_control *wbc,
cred_t *cr);
extern int zfs_putpage(struct inode *ip, struct writeback_control *wbc,
cred_t *cr);
extern int zfs_dirty_inode(struct inode *ip, int flags);
extern int zfs_map(struct inode *ip, offset_t off, caddr_t *addrp,
size_t len, unsigned long vm_flags);
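For context, a minimal sketch (not part of this diff) of how the two new entry points are expected to be wired into the Linux address_space_operations table. The table itself lives in zpl_file.c and is assumed here, since it is not shown in this change; only the write-side callbacks are sketched:

/*
 * Illustrative only: the split prototypes map onto the two VFS
 * writeback callbacks.  zfs_putpage_single() backs .writepage (one
 * locked page at a time, e.g. from memory reclaim), while the new
 * zfs_putpage() backs .writepages and handles an entire range per call.
 */
const struct address_space_operations zpl_address_space_operations = {
	/* ... read-side callbacks unchanged ... */
	.writepage	= zpl_writepage,	/* single page -> zfs_putpage_single() */
	.writepages	= zpl_writepages,	/* whole range  -> zfs_putpage() */
};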
175 changes: 158 additions & 17 deletions module/zfs/zfs_vnops.c
@@ -3851,10 +3851,10 @@ zfs_putpage_commit_cb(void *arg)
/*
* Push a page out to disk, once the page is on stable storage the
* registered commit callback will be run as notification of completion.
* Callers are responsible for calling zil_commit()
*
* IN: ip - page mapped for inode.
* pp - page to push (page is locked)
* wbc - writeback control data
*
* RETURN: 0 if success
* error code if failure
@@ -3864,25 +3864,22 @@ zfs_putpage_commit_cb(void *arg)
*/
/* ARGSUSED */
int
zfs_putpage(struct inode *ip, struct page *pp, struct writeback_control *wbc)
zfs_putapage(struct inode *ip, struct page *pp)
{
znode_t *zp = ITOZ(ip);
zfs_sb_t *zsb = ITOZSB(ip);
loff_t offset;
loff_t pgoff;
unsigned int pglen;
rl_t *rl;
dmu_tx_t *tx;
caddr_t va;
int err = 0;
uint64_t mtime[2], ctime[2];
sa_bulk_attr_t bulk[3];
int cnt = 0;

ZFS_ENTER(zsb);
ZFS_VERIFY_ZP(zp);

ASSERT(PageLocked(pp));
ASSERT(!PageWriteback(pp));

pgoff = page_offset(pp); /* Page byte-offset in file */
offset = i_size_read(ip); /* File length in bytes */
@@ -3892,7 +3889,6 @@ zfs_putpage(struct inode *ip, struct page *pp, struct writeback_control *wbc)
/* Page is beyond end of file */
if (pgoff >= offset) {
unlock_page(pp);
ZFS_EXIT(zsb);
return (0);
}

@@ -3915,7 +3911,6 @@ zfs_putpage(struct inode *ip, struct page *pp, struct writeback_control *wbc)
set_page_writeback(pp);
unlock_page(pp);

rl = zfs_range_lock(zp, pgoff, pglen, RL_WRITER);
tx = dmu_tx_create(zsb->z_os);

dmu_tx_hold_write(tx, zp->z_id, pgoff, pglen);
@@ -3931,8 +3926,6 @@ zfs_putpage(struct inode *ip, struct page *pp, struct writeback_control *wbc)
__set_page_dirty_nobuffers(pp);
ClearPageError(pp);
end_page_writeback(pp);
zfs_range_unlock(rl);
ZFS_EXIT(zsb);
return (err);
}

@@ -3957,18 +3950,166 @@ zfs_putpage(struct inode *ip, struct page *pp, struct writeback_control *wbc)
zfs_putpage_commit_cb, pp);
dmu_tx_commit(tx);

zfs_range_unlock(rl);
return (err);
}

if (wbc->sync_mode != WB_SYNC_NONE) {
/*
* Note that this single-page path is rarely used, because writepages()
* normally handles writeback (and the commit) for an entire range at a
* time for performance reasons.
*/
int
zfs_putpage_single(struct page *pp, struct writeback_control *wbc, cred_t *cr)
{
struct inode *ip = pp->mapping->host;
znode_t *zp = ITOZ(ip);
zfs_sb_t *zsb = ITOZSB(ip);
rl_t *rl;
loff_t pgoff;
unsigned int pglen;
int err;

ASSERT(PageLocked(pp));
ASSERT(!(current->flags & PF_NOFS));

ZFS_ENTER(zsb);
ZFS_VERIFY_ZP(zp);

pgoff = page_offset(pp); /* Page byte-offset in file */
pglen = PAGE_CACHE_SIZE; /* Page length in bytes */

/*
* Annotate this call path with a flag that indicates that it is
* unsafe to use KM_SLEEP during memory allocations due to the
* potential for a deadlock. KM_PUSHPAGE should be used instead.
*/
current->flags |= PF_NOFS;

rl = zfs_range_lock(zp, pgoff, pglen, RL_WRITER);
err = zfs_putapage(ip, pp);

zfs_range_unlock(rl);
if (wbc->sync_mode == WB_SYNC_ALL ||
zsb->z_os->os_sync == ZFS_SYNC_ALWAYS)
zil_commit(zsb->z_log, zp->z_id);
ZFS_EXIT(zsb);

current->flags &= ~PF_NOFS;

return (err);
}

int
zfs_putpage_cb(struct page *page, struct writeback_control *wbc, void *data)
{
struct inode *ip = data;
return (zfs_putapage(ip, page));
}

int
zfs_putpage(struct inode *ip, struct writeback_control *wbc, cred_t *cr)
{
znode_t *zp = ITOZ(ip);
zfs_sb_t *zsb = ITOZSB(ip);
rl_t *rl;
int err;
loff_t range_start, range_end;
long nr_to_write;
enum writeback_sync_modes sync_mode;

ASSERT(!(current->flags & PF_NOFS));
ASSERT(wbc);

ZFS_ENTER(zsb);
ZFS_VERIFY_ZP(zp);

/*
* There's nothing to do if no data is cached.
*/
if (ip->i_mapping->nrpages == 0) {
ZFS_EXIT(zsb);
return (0);
}

/*
* Annotate this call path with a flag that indicates that it is
* unsafe to use KM_SLEEP during memory allocations due to the
* potential for a deadlock. KM_PUSHPAGE should be used instead.
*/
current->flags |= PF_NOFS;

range_end = (wbc->range_end) ? wbc->range_end : zp->z_size;

ASSERT(wbc->range_start < range_end);

rl = zfs_range_lock(zp, wbc->range_start, range_end -
wbc->range_start, RL_WRITER);

/*
* Since we drop the range lock before calling zil_commit(), passing
* WB_SYNC_ALL would let write_cache_pages() block waiting for another
* thread to clear the writeback bit. Synchronous behavior is enforced
* by zil_commit(), so we avoid blocking in write_cache_pages() by
* changing wbc->sync_mode to WB_SYNC_NONE before the call and
* restoring it afterward. zfs_putapage() will have been called on all
* dirty pages by the time write_cache_pages() has returned, so a
* zil_commit() is all that is required to enforce synchronous
* behavior. It is also worth noting that we have no backing device,
* so there should be no chance of bdi throttling us in
* write_cache_pages().
*/
sync_mode = wbc->sync_mode;
wbc->sync_mode = WB_SYNC_NONE;

/*
* Similarly, we wish to write back all dirty pages in the range, so we
* set nr_to_write to LONG_MAX. We also cache range_start in case there
* are more than LONG_MAX dirty pages, which would require calling
* write_cache_pages() multiple times.
*/
range_start = wbc->range_start;
nr_to_write = wbc->nr_to_write;
begin_writeback:
wbc->nr_to_write = LONG_MAX;

err = write_cache_pages(ip->i_mapping, wbc, &zfs_putpage_cb, ip);
if (err)
goto out;

/*
* Invoke Linux's BUG_ON() to complain should write_cache_pages() ever
* decrement wbc->nr_to_write below zero. That would imply that
* write_cache_pages() has been changed in a way that we might not
* handle properly.
*/
BUG_ON(wbc->nr_to_write < 0);

/*
* The value of wbc->nr_to_write is decremented by write_cache_pages()
* on each zfs_putpage_cb() invocation. Since we initialize it to
* LONG_MAX, seeing wbc->nr_to_write == 0 means that precisely LONG_MAX
* dirty pages were written out. In that case we likely have more pages
* in the range to write out, so we restart. Because Linux's struct
* writeback_control only carries a long, we report the number of pages
* written out modulo LONG_MAX, but continue until all dirty pages in
* the range have been written out. Before restarting, we advance
* wbc->range_start to minimize repeat work iterating through Linux's
* radix tree.
*/
if (wbc->nr_to_write == 0) {
wbc->range_start += (LONG_MAX - wbc->nr_to_write) <<
PAGE_CACHE_SHIFT;

if (wbc->range_start <= range_end)
goto begin_writeback;
}

out:
zfs_range_unlock(rl);
if (sync_mode == WB_SYNC_ALL || zsb->z_os->os_sync == ZFS_SYNC_ALWAYS)
zil_commit(zsb->z_log, zp->z_id);
ZFS_EXIT(zsb);

current->flags &= ~PF_NOFS;
wbc->range_start = range_start;
wbc->sync_mode = sync_mode;
wbc->nr_to_write += nr_to_write - LONG_MAX;

return (err);
}

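The restart logic above reports pages written back through wbc->nr_to_write even when more than LONG_MAX pages were dirty. A stand-alone sketch of that accounting, using small numbers in place of LONG_MAX so the arithmetic is easy to follow (fake_wbc, BATCH_MAX and fake_write_cache_pages() are illustrative stand-ins, not part of this patch):

#include <stdio.h>

struct fake_wbc {
	long nr_to_write;
	long range_start;	/* tracked in pages here, for simplicity */
	long range_end;
};

/* Pretend every page in [range_start, range_end) is dirty. */
static int fake_write_cache_pages(struct fake_wbc *wbc)
{
	while (wbc->range_start < wbc->range_end && wbc->nr_to_write > 0) {
		wbc->range_start++;	/* "wrote" one page */
		wbc->nr_to_write--;
	}
	return (0);
}

int main(void)
{
	const long BATCH_MAX = 4;	/* stands in for LONG_MAX */
	struct fake_wbc wbc = { .nr_to_write = 100,
	    .range_start = 0, .range_end = 10 };
	long saved_nr = wbc.nr_to_write;

	do {
		wbc.nr_to_write = BATCH_MAX;
		fake_write_cache_pages(&wbc);
	} while (wbc.nr_to_write == 0 && wbc.range_start < wbc.range_end);

	/* Report pages written "modulo BATCH_MAX", as in the patch. */
	wbc.nr_to_write += saved_nr - BATCH_MAX;
	printf("nr_to_write reported back to caller: %ld\n", wbc.nr_to_write);
	return (0);
}

With 10 dirty pages and BATCH_MAX = 4, the program prints 98: the caller's budget of 100 is only charged for 10 mod 4 = 2 pages, which is the "modulo LONG_MAX" reporting described in the comment above, while every dirty page in the range still gets written out.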
83 changes: 22 additions & 61 deletions module/zfs/zpl_file.c
@@ -399,66 +399,16 @@ zpl_readpages(struct file *filp, struct address_space *mapping,
}

int
zpl_putpage(struct page *pp, struct writeback_control *wbc, void *data)
zpl_writepages(struct address_space *mapping, struct writeback_control *wbc)
{
struct address_space *mapping = data;
cred_t *cr = CRED();
int err;

ASSERT(PageLocked(pp));
ASSERT(!PageWriteback(pp));
ASSERT(!(current->flags & PF_NOFS));

/*
* Annotate this call path with a flag that indicates that it is
* unsafe to use KM_SLEEP during memory allocations due to the
* potential for a deadlock. KM_PUSHPAGE should be used instead.
*/
current->flags |= PF_NOFS;
(void) zfs_putpage(mapping->host, pp, wbc);
current->flags &= ~PF_NOFS;

return (0);
}
crhold(cr);
err = -zfs_putpage(mapping->host, wbc, cr);
crfree(cr);

static int
zpl_writepages(struct address_space *mapping, struct writeback_control *wbc)
{
znode_t *zp = ITOZ(mapping->host);
zfs_sb_t *zsb = ITOZSB(mapping->host);
enum writeback_sync_modes sync_mode;
int result;

ZFS_ENTER(zsb);
if (zsb->z_os->os_sync == ZFS_SYNC_ALWAYS)
wbc->sync_mode = WB_SYNC_ALL;
ZFS_EXIT(zsb);
sync_mode = wbc->sync_mode;

/*
* We don't want to run write_cache_pages() in SYNC mode here, because
* that would make putpage() wait for a single page to be committed to
* disk every single time, resulting in atrocious performance. Instead
* we run it once in non-SYNC mode so that the ZIL gets all the data,
* and then we commit it all in one go.
*/
wbc->sync_mode = WB_SYNC_NONE;
result = write_cache_pages(mapping, wbc, zpl_putpage, mapping);
if (sync_mode != wbc->sync_mode) {
ZFS_ENTER(zsb);
ZFS_VERIFY_ZP(zp);
zil_commit(zsb->z_log, zp->z_id);
ZFS_EXIT(zsb);

/*
* We need to call write_cache_pages() again (we can't just
* return after the commit) because the previous call in
* non-SYNC mode does not guarantee that we got all the dirty
* pages (see the implementation of write_cache_pages() for
* details). That being said, this is a no-op in most cases.
*/
wbc->sync_mode = sync_mode;
result = write_cache_pages(mapping, wbc, zpl_putpage, mapping);
}
return (result);
return (err);
}

/*
@@ -467,13 +417,24 @@ zpl_writepages(struct address_space *mapping, struct writeback_control *wbc)
* which never call .write(). These dirty pages are kept in sync with
* the ARC buffers via this hook.
*/
static int
int
zpl_writepage(struct page *pp, struct writeback_control *wbc)
{
if (ITOZSB(pp->mapping->host)->z_os->os_sync == ZFS_SYNC_ALWAYS)
wbc->sync_mode = WB_SYNC_ALL;
cred_t *cr = CRED();
int err;

ASSERT(PageLocked(pp));

if (pp->mapping == NULL) {
unlock_page(pp);
return (0);
}

crhold(cr);
err = -zfs_putpage_single(pp, wbc, cr);
crfree(cr);

return (zpl_putpage(pp, wbc, pp->mapping));
return (err);
}

/*
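For completeness, a user-space sketch of exercising the rewritten writepages path end to end (illustrative only; the mount point /tank/fs and file name are assumptions, not part of this PR). Dirtying many mapped pages and calling msync(MS_SYNC) drives .writepages with WB_SYNC_ALL, which with this change should be satisfied by a single zil_commit() covering the whole range:

#include <fcntl.h>
#include <string.h>
#include <sys/mman.h>
#include <unistd.h>

int main(void)
{
	size_t len = 64 * 1024 * 1024;	/* 64 MiB of dirty pages */
	int fd = open("/tank/fs/testfile", O_RDWR | O_CREAT, 0644);
	char *p;

	if (fd < 0 || ftruncate(fd, len) != 0)
		return (1);

	p = mmap(NULL, len, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
	if (p == MAP_FAILED)
		return (1);

	memset(p, 0xab, len);		/* dirty every page in the mapping */
	msync(p, len, MS_SYNC);		/* synchronous writeback of the range */

	munmap(p, len);
	close(fd);
	return (0);
}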