/*
Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com>
This file is part of GlusterFS.
This file is licensed to you under your choice of the GNU Lesser
General Public License, version 3 or any later version (LGPLv3 or
later), or the GNU General Public License, version 2 (GPLv2), in all
cases as published by the Free Software Foundation.
*/
/*
TODO:
- handle O_DIRECT
- maintain offset, flush on lseek
- ensure efficient memory management in case of random seek
*/
#include <glusterfs/glusterfs.h>
#include <glusterfs/logging.h>
#include <glusterfs/dict.h>
#include <glusterfs/xlator.h>
#include "read-ahead.h"
#include <glusterfs/statedump.h>
#include <assert.h>
#include <sys/time.h>
#include "read-ahead-messages.h"
static void
read_ahead(call_frame_t *frame, ra_file_t *file);
int
ra_open_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret,
int32_t op_errno, fd_t *fd, dict_t *xdata)
{
ra_conf_t *conf = NULL;
ra_file_t *file = NULL;
int ret = 0;
GF_ASSERT(frame);
GF_VALIDATE_OR_GOTO(frame->this->name, this, unwind);
conf = this->private;
if (op_ret == -1) {
goto unwind;
}
file = GF_CALLOC(1, sizeof(*file), gf_ra_mt_ra_file_t);
if (!file) {
op_ret = -1;
op_errno = ENOMEM;
goto unwind;
}
/* If O_DIRECT open, we disable caching on it */
if ((fd->flags & O_DIRECT) || ((fd->flags & O_ACCMODE) == O_WRONLY))
file->disabled = 1;
file->offset = (unsigned long long)0;
file->conf = conf;
file->pages.next = &file->pages;
file->pages.prev = &file->pages;
file->pages.offset = (unsigned long long)0;
file->pages.file = file;
ra_conf_lock(conf);
{
file->next = conf->files.next;
conf->files.next = file;
file->next->prev = file;
file->prev = &conf->files;
}
ra_conf_unlock(conf);
file->fd = fd;
file->page_count = conf->page_count;
file->page_size = conf->page_size;
pthread_mutex_init(&file->file_lock, NULL);
if (!file->disabled) {
file->page_count = 1;
}
ret = fd_ctx_set(fd, this, (uint64_t)(long)file);
if (ret == -1) {
gf_msg(frame->this->name, GF_LOG_WARNING, 0, READ_AHEAD_MSG_NO_MEMORY,
"cannot set read-ahead context"
"information in fd (%p)",
fd);
ra_file_destroy(file);
op_ret = -1;
op_errno = ENOMEM;
}
unwind:
frame->local = NULL;
STACK_UNWIND_STRICT(open, frame, op_ret, op_errno, fd, xdata);
return 0;
}
int
ra_create_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret,
int32_t op_errno, fd_t *fd, inode_t *inode, struct iatt *buf,
struct iatt *preparent, struct iatt *postparent, dict_t *xdata)
{
ra_conf_t *conf = NULL;
ra_file_t *file = NULL;
int ret = 0;
GF_ASSERT(frame);
GF_VALIDATE_OR_GOTO(frame->this->name, this, unwind);
conf = this->private;
if (op_ret == -1) {
goto unwind;
}
file = GF_CALLOC(1, sizeof(*file), gf_ra_mt_ra_file_t);
if (!file) {
op_ret = -1;
op_errno = ENOMEM;
goto unwind;
}
/* If O_DIRECT open, we disable caching on it */
if ((fd->flags & O_DIRECT) || ((fd->flags & O_ACCMODE) == O_WRONLY))
file->disabled = 1;
file->offset = (unsigned long long)0;
// file->size = fd->inode->buf.ia_size;
file->conf = conf;
file->pages.next = &file->pages;
file->pages.prev = &file->pages;
file->pages.offset = (unsigned long long)0;
file->pages.file = file;
ra_conf_lock(conf);
{
file->next = conf->files.next;
conf->files.next = file;
file->next->prev = file;
file->prev = &conf->files;
}
ra_conf_unlock(conf);
file->fd = fd;
file->page_count = conf->page_count;
file->page_size = conf->page_size;
pthread_mutex_init(&file->file_lock, NULL);
ret = fd_ctx_set(fd, this, (uint64_t)(long)file);
if (ret == -1) {
gf_msg(this->name, GF_LOG_WARNING, 0, READ_AHEAD_MSG_NO_MEMORY,
"cannot set read ahead context"
"information in fd (%p)",
fd);
ra_file_destroy(file);
op_ret = -1;
op_errno = ENOMEM;
}
unwind:
STACK_UNWIND_STRICT(create, frame, op_ret, op_errno, fd, inode, buf,
preparent, postparent, xdata);
return 0;
}
int
ra_open(call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags,
fd_t *fd, dict_t *xdata)
{
GF_ASSERT(frame);
GF_ASSERT(this);
STACK_WIND(frame, ra_open_cbk, FIRST_CHILD(this),
FIRST_CHILD(this)->fops->open, loc, flags, fd, xdata);
return 0;
}
int
ra_create(call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags,
mode_t mode, mode_t umask, fd_t *fd, dict_t *xdata)
{
GF_ASSERT(frame);
GF_ASSERT(this);
STACK_WIND(frame, ra_create_cbk, FIRST_CHILD(this),
FIRST_CHILD(this)->fops->create, loc, flags, mode, umask, fd,
xdata);
return 0;
}
/* free cache pages between offset and offset+size,
does not touch pages with frames waiting on it
*/
static void
flush_region(call_frame_t *frame, ra_file_t *file, off_t offset, off_t size,
int for_write)
{
ra_page_t *trav = NULL;
ra_page_t *next = NULL;
ra_file_lock(file);
{
trav = file->pages.next;
while (trav != &file->pages && trav->offset < (offset + size)) {
next = trav->next;
if (trav->offset >= offset) {
if (!trav->waitq) {
ra_page_purge(trav);
} else {
trav->stale = 1;
if (for_write) {
trav->poisoned = 1;
}
}
}
trav = next;
}
}
ra_file_unlock(file);
}
int
ra_release(xlator_t *this, fd_t *fd)
{
uint64_t tmp_file = 0;
int ret = 0;
GF_VALIDATE_OR_GOTO("read-ahead", this, out);
GF_VALIDATE_OR_GOTO(this->name, fd, out);
ret = fd_ctx_del(fd, this, &tmp_file);
if (!ret) {
ra_file_destroy((ra_file_t *)(long)tmp_file);
}
out:
return 0;
}
void
read_ahead(call_frame_t *frame, ra_file_t *file)
{
off_t ra_offset = 0;
size_t ra_size = 0;
off_t trav_offset = 0;
ra_page_t *trav = NULL;
off_t cap = 0;
char fault = 0;
GF_VALIDATE_OR_GOTO("read-ahead", frame, out);
GF_VALIDATE_OR_GOTO(frame->this->name, file, out);
if (!file->page_count) {
goto out;
}
ra_size = file->page_size * file->page_count;
ra_offset = gf_floor(file->offset, file->page_size);
cap = file->size ? file->size : file->offset + ra_size;
while (ra_offset < min(file->offset + ra_size, cap)) {
ra_file_lock(file);
{
trav = ra_page_get(file, ra_offset);
}
ra_file_unlock(file);
if (!trav)
break;
ra_offset += file->page_size;
}
if (trav) {
/* comfortable enough */
goto out;
}
trav_offset = ra_offset;
cap = file->size ? file->size : ra_offset + ra_size;
while (trav_offset < min(ra_offset + ra_size, cap)) {
fault = 0;
ra_file_lock(file);
{
trav = ra_page_get(file, trav_offset);
if (!trav) {
fault = 1;
trav = ra_page_create(file, trav_offset);
if (trav)
trav->dirty = 1;
}
}
ra_file_unlock(file);
if (!trav) {
/* OUT OF MEMORY */
break;
}
if (fault) {
gf_msg_trace(frame->this->name, 0, "RA at offset=%" PRId64,
trav_offset);
ra_page_fault(file, frame, trav_offset);
}
trav_offset += file->page_size;
}
out:
return;
}
int
ra_need_atime_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
int32_t op_ret, int32_t op_errno, struct iovec *vector,
int32_t count, struct iatt *stbuf, struct iobref *iobref,
dict_t *xdata)
{
GF_ASSERT(frame);
STACK_DESTROY(frame->root);
return 0;
}
static void
dispatch_requests(call_frame_t *frame, ra_file_t *file)
{
ra_local_t *local = NULL;
ra_conf_t *conf = NULL;
off_t rounded_offset = 0;
off_t rounded_end = 0;
off_t trav_offset = 0;
ra_page_t *trav = NULL;
call_frame_t *ra_frame = NULL;
char need_atime_update = 1;
char fault = 0;
GF_VALIDATE_OR_GOTO("read-ahead", frame, out);
GF_VALIDATE_OR_GOTO(frame->this->name, file, out);
local = frame->local;
conf = file->conf;
rounded_offset = gf_floor(local->offset, file->page_size);
rounded_end = gf_roof(local->offset + local->size, file->page_size);
trav_offset = rounded_offset;
while (trav_offset < rounded_end) {
fault = 0;
ra_file_lock(file);
{
trav = ra_page_get(file, trav_offset);
if (!trav) {
trav = ra_page_create(file, trav_offset);
if (!trav) {
local->op_ret = -1;
local->op_errno = ENOMEM;
goto unlock;
}
fault = 1;
need_atime_update = 0;
}
trav->dirty = 0;
if (trav->ready) {
gf_msg_trace(frame->this->name, 0, "HIT at offset=%" PRId64 ".",
trav_offset);
ra_frame_fill(trav, frame);
} else {
gf_msg_trace(frame->this->name, 0,
"IN-TRANSIT at "
"offset=%" PRId64 ".",
trav_offset);
ra_wait_on_page(trav, frame);
need_atime_update = 0;
}
}
unlock:
ra_file_unlock(file);
if (local->op_ret == -1) {
goto out;
}
if (fault) {
gf_msg_trace(frame->this->name, 0, "MISS at offset=%" PRId64 ".",
trav_offset);
ra_page_fault(file, frame, trav_offset);
}
trav_offset += file->page_size;
}
if (need_atime_update && conf->force_atime_update) {
/* TODO: use untimens() since readv() can confuse underlying
io-cache and others */
ra_frame = copy_frame(frame);
if (ra_frame == NULL) {
goto out;
}
STACK_WIND(ra_frame, ra_need_atime_cbk, FIRST_CHILD(frame->this),
FIRST_CHILD(frame->this)->fops->readv, file->fd, 1, 1, 0,
NULL);
}
out:
return;
}
int
ra_readv_disabled_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
int32_t op_ret, int32_t op_errno, struct iovec *vector,
int32_t count, struct iatt *stbuf, struct iobref *iobref,
dict_t *xdata)
{
GF_ASSERT(frame);
STACK_UNWIND_STRICT(readv, frame, op_ret, op_errno, vector, count, stbuf,
iobref, xdata);
return 0;
}
int
ra_readv(call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size,
off_t offset, uint32_t flags, dict_t *xdata)
{
ra_file_t *file = NULL;
ra_local_t *local = NULL;
ra_conf_t *conf = NULL;
int op_errno = EINVAL;
char expected_offset = 1;
uint64_t tmp_file = 0;
GF_ASSERT(frame);
GF_VALIDATE_OR_GOTO(frame->this->name, this, unwind);
GF_VALIDATE_OR_GOTO(frame->this->name, fd, unwind);
conf = this->private;
gf_msg_trace(this->name, 0,
"NEW REQ at offset=%" PRId64 " for size=%" GF_PRI_SIZET "",
offset, size);
fd_ctx_get(fd, this, &tmp_file);
file = (ra_file_t *)(long)tmp_file;
if (!file || file->disabled) {
goto disabled;
}
if (file->offset != offset) {
gf_msg_trace(this->name, 0,
"unexpected offset (%" PRId64 " != %" PRId64
") "
"resetting",
file->offset, offset);
expected_offset = file->expected = file->page_count = 0;
} else {
gf_msg_trace(this->name, 0,
"expected offset (%" PRId64 ") when page_count=%d", offset,
file->page_count);
if (file->expected < (file->page_size * conf->page_count)) {
file->expected += size;
file->page_count = min((file->expected / file->page_size),
conf->page_count);
}
}
if (!expected_offset) {
flush_region(frame, file, 0, file->pages.prev->offset + 1, 0);
}
local = mem_get0(this->local_pool);
if (!local) {
op_errno = ENOMEM;
goto unwind;
}
local->fd = fd;
local->offset = offset;
local->size = size;
local->wait_count = 1;
local->fill.next = &local->fill;
local->fill.prev = &local->fill;
pthread_mutex_init(&local->local_lock, NULL);
frame->local = local;
dispatch_requests(frame, file);
flush_region(frame, file, 0, gf_floor(offset, file->page_size), 0);
read_ahead(frame, file);
ra_frame_return(frame);
file->offset = offset + size;
return 0;
unwind:
STACK_UNWIND_STRICT(readv, frame, -1, op_errno, NULL, 0, NULL, NULL, NULL);
return 0;
disabled:
STACK_WIND(frame, ra_readv_disabled_cbk, FIRST_CHILD(frame->this),
FIRST_CHILD(frame->this)->fops->readv, fd, size, offset, flags,
xdata);
return 0;
}
int
ra_flush_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret,
int32_t op_errno, dict_t *xdata)
{
GF_ASSERT(frame);
STACK_UNWIND_STRICT(flush, frame, op_ret, op_errno, xdata);
return 0;
}
int
ra_fsync_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret,
int32_t op_errno, struct iatt *prebuf, struct iatt *postbuf,
dict_t *xdata)
{
GF_ASSERT(frame);
STACK_UNWIND_STRICT(fsync, frame, op_ret, op_errno, prebuf, postbuf, xdata);
return 0;
}
int
ra_flush(call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata)
{
int32_t op_errno = EINVAL;
GF_ASSERT(frame);
GF_VALIDATE_OR_GOTO(frame->this->name, this, unwind);
GF_VALIDATE_OR_GOTO(frame->this->name, fd, unwind);
STACK_WIND(frame, ra_flush_cbk, FIRST_CHILD(this),
FIRST_CHILD(this)->fops->flush, fd, xdata);
return 0;
unwind:
STACK_UNWIND_STRICT(flush, frame, -1, op_errno, NULL);
return 0;
}
int
ra_fsync(call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t datasync,
dict_t *xdata)
{
int32_t op_errno = EINVAL;
GF_ASSERT(frame);
GF_VALIDATE_OR_GOTO(frame->this->name, this, unwind);
GF_VALIDATE_OR_GOTO(frame->this->name, fd, unwind);
STACK_WIND(frame, ra_fsync_cbk, FIRST_CHILD(this),
FIRST_CHILD(this)->fops->fsync, fd, datasync, xdata);
return 0;
unwind:
STACK_UNWIND_STRICT(fsync, frame, -1, op_errno, NULL, NULL, NULL);
return 0;
}
int
ra_writev_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret,
int32_t op_errno, struct iatt *prebuf, struct iatt *postbuf,
dict_t *xdata)
{
ra_file_t *file = NULL;
GF_ASSERT(frame);
file = frame->local;
if (file) {
flush_region(frame, file, 0, file->pages.prev->offset + 1, 1);
}
frame->local = NULL;
STACK_UNWIND_STRICT(writev, frame, op_ret, op_errno, prebuf, postbuf,
xdata);
return 0;
}
int
ra_writev(call_frame_t *frame, xlator_t *this, fd_t *fd, struct iovec *vector,
int32_t count, off_t offset, uint32_t flags, struct iobref *iobref,
dict_t *xdata)
{
ra_file_t *file = NULL;
uint64_t tmp_file = 0;
int32_t op_errno = EINVAL;
inode_t *inode = NULL;
fd_t *iter_fd = NULL;
GF_ASSERT(frame);
GF_VALIDATE_OR_GOTO(frame->this->name, this, unwind);
GF_VALIDATE_OR_GOTO(frame->this->name, fd, unwind);
inode = fd->inode;
LOCK(&inode->lock);
{
list_for_each_entry(iter_fd, &inode->fd_list, inode_list)
{
tmp_file = 0;
fd_ctx_get(iter_fd, this, &tmp_file);
file = (ra_file_t *)(long)tmp_file;
if (!file)
continue;
if (iter_fd == fd)
frame->local = file;
flush_region(frame, file, 0, file->pages.prev->offset + 1, 1);
/* reset the read-ahead counters too */
file->expected = file->page_count = 0;
}
}
UNLOCK(&inode->lock);
STACK_WIND(frame, ra_writev_cbk, FIRST_CHILD(this),
FIRST_CHILD(this)->fops->writev, fd, vector, count, offset,
flags, iobref, xdata);
return 0;
unwind:
STACK_UNWIND_STRICT(writev, frame, -1, op_errno, NULL, NULL, NULL);
return 0;
}
int
ra_truncate_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
int32_t op_ret, int32_t op_errno, struct iatt *prebuf,
struct iatt *postbuf, dict_t *xdata)
{
GF_ASSERT(frame);
STACK_UNWIND_STRICT(truncate, frame, op_ret, op_errno, prebuf, postbuf,
xdata);
return 0;
}
int
ra_attr_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret,
int32_t op_errno, struct iatt *buf, dict_t *xdata)
{
GF_ASSERT(frame);
STACK_UNWIND_STRICT(stat, frame, op_ret, op_errno, buf, xdata);
return 0;
}
int
ra_truncate(call_frame_t *frame, xlator_t *this, loc_t *loc, off_t offset,
dict_t *xdata)
{
ra_file_t *file = NULL;
fd_t *iter_fd = NULL;
inode_t *inode = NULL;
uint64_t tmp_file = 0;
int32_t op_errno = EINVAL;
GF_ASSERT(frame);
GF_VALIDATE_OR_GOTO(frame->this->name, this, unwind);
GF_VALIDATE_OR_GOTO(frame->this->name, loc, unwind);
inode = loc->inode;
LOCK(&inode->lock);
{
list_for_each_entry(iter_fd, &inode->fd_list, inode_list)
{
tmp_file = 0;
fd_ctx_get(iter_fd, this, &tmp_file);
file = (ra_file_t *)(long)tmp_file;
if (!file)
continue;
/*
* Truncation invalidates reads just like writing does.
* TBD: this seems to flush more than it should. The
* only time we should flush at all is when we're
* shortening (not lengthening) the file, and then only
* from new EOF to old EOF. The same problem exists in
* ra_ftruncate.
*/
flush_region(frame, file, 0, file->pages.prev->offset + 1, 1);
}
}
UNLOCK(&inode->lock);
STACK_WIND(frame, ra_truncate_cbk, FIRST_CHILD(this),
FIRST_CHILD(this)->fops->truncate, loc, offset, xdata);
return 0;
unwind:
STACK_UNWIND_STRICT(truncate, frame, -1, op_errno, NULL, NULL, NULL);
return 0;
}
void
ra_page_dump(struct ra_page *page)
{
int i = 0;
call_frame_t *frame = NULL;
char key[GF_DUMP_MAX_BUF_LEN] = {
0,
};
ra_waitq_t *trav = NULL;
if (page == NULL) {
goto out;
}
gf_proc_dump_write("offset", "%" PRId64, page->offset);
gf_proc_dump_write("size", "%" GF_PRI_SIZET, page->size);
gf_proc_dump_write("dirty", "%s", page->dirty ? "yes" : "no");
gf_proc_dump_write("poisoned", "%s", page->poisoned ? "yes" : "no");
gf_proc_dump_write("ready", "%s", page->ready ? "yes" : "no");
for (trav = page->waitq; trav; trav = trav->next) {
frame = trav->data;
sprintf(key, "waiting-frame[%d]", i++);
gf_proc_dump_write(key, "%" PRId64, frame->root->unique);
}
out:
return;
}
int32_t
ra_fdctx_dump(xlator_t *this, fd_t *fd)
{
ra_file_t *file = NULL;
ra_page_t *page = NULL;
int32_t ret = 0, i = 0;
uint64_t tmp_file = 0;
char *path = NULL;
char key_prefix[GF_DUMP_MAX_BUF_LEN] = {
0,
};
fd_ctx_get(fd, this, &tmp_file);
file = (ra_file_t *)(long)tmp_file;
if (file == NULL) {
ret = 0;
goto out;
}
gf_proc_dump_build_key(key_prefix, "xlator.performance.read-ahead", "file");
gf_proc_dump_add_section("%s", key_prefix);
ret = __inode_path(fd->inode, NULL, &path);
if (path != NULL) {
gf_proc_dump_write("path", "%s", path);
GF_FREE(path);
}
gf_proc_dump_write("fd", "%p", fd);
gf_proc_dump_write("disabled", "%s", file->disabled ? "yes" : "no");
if (file->disabled) {
ret = 0;
goto out;
}
gf_proc_dump_write("page-size", "%" PRId64, file->page_size);
gf_proc_dump_write("page-count", "%u", file->page_count);
gf_proc_dump_write("next-expected-offset-for-sequential-reads", "%" PRId64,
file->offset);
for (page = file->pages.next; page != &file->pages; page = page->next) {
gf_proc_dump_write("page", "%d: %p", i++, (void *)page);
ra_page_dump(page);
}
ret = 0;
out:
return ret;
}
int
ra_fstat(call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata)
{
ra_file_t *file = NULL;
fd_t *iter_fd = NULL;
inode_t *inode = NULL;
uint64_t tmp_file = 0;
int32_t op_errno = EINVAL;
ra_conf_t *conf = NULL;
conf = this->private;
GF_ASSERT(frame);
GF_VALIDATE_OR_GOTO(frame->this->name, this, unwind);
GF_VALIDATE_OR_GOTO(frame->this->name, fd, unwind);
inode = fd->inode;
if (conf->force_atime_update) {
LOCK(&inode->lock);
{
list_for_each_entry(iter_fd, &inode->fd_list, inode_list)
{
tmp_file = 0;
fd_ctx_get(iter_fd, this, &tmp_file);
file = (ra_file_t *)(long)tmp_file;
if (!file)
continue;
flush_region(frame, file, 0, file->pages.prev->offset + 1, 0);
}
}
UNLOCK(&inode->lock);
}
STACK_WIND(frame, ra_attr_cbk, FIRST_CHILD(this),
FIRST_CHILD(this)->fops->fstat, fd, xdata);
return 0;
unwind:
STACK_UNWIND_STRICT(stat, frame, -1, op_errno, NULL, NULL);
return 0;
}
int
ra_ftruncate(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset,
dict_t *xdata)
{
ra_file_t *file = NULL;
fd_t *iter_fd = NULL;
inode_t *inode = NULL;
uint64_t tmp_file = 0;
int32_t op_errno = EINVAL;
GF_ASSERT(frame);
GF_VALIDATE_OR_GOTO(frame->this->name, this, unwind);
GF_VALIDATE_OR_GOTO(frame->this->name, fd, unwind);
inode = fd->inode;
LOCK(&inode->lock);
{
list_for_each_entry(iter_fd, &inode->fd_list, inode_list)
{
tmp_file = 0;
fd_ctx_get(iter_fd, this, &tmp_file);
file = (ra_file_t *)(long)tmp_file;
if (!file)
continue;
/*
* Truncation invalidates reads just like writing does.
* TBD: this seems to flush more than it should. The
* only time we should flush at all is when we're
* shortening (not lengthening) the file, and then only
* from new EOF to old EOF. The same problem exists in
* ra_truncate.
*/
flush_region(frame, file, 0, file->pages.prev->offset + 1, 1);
}
}
UNLOCK(&inode->lock);
STACK_WIND(frame, ra_truncate_cbk, FIRST_CHILD(this),
FIRST_CHILD(this)->fops->ftruncate, fd, offset, xdata);
return 0;
unwind:
STACK_UNWIND_STRICT(truncate, frame, -1, op_errno, NULL, NULL, NULL);
return 0;
}
int
ra_discard_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
int32_t op_ret, int32_t op_errno, struct iatt *prebuf,
struct iatt *postbuf, dict_t *xdata)
{
GF_ASSERT(frame);
STACK_UNWIND_STRICT(discard, frame, op_ret, op_errno, prebuf, postbuf,
xdata);
return 0;
}
static int
ra_discard(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset,
size_t len, dict_t *xdata)
{
ra_file_t *file = NULL;
fd_t *iter_fd = NULL;
inode_t *inode = NULL;
uint64_t tmp_file = 0;
int32_t op_errno = EINVAL;
GF_ASSERT(frame);
GF_VALIDATE_OR_GOTO(frame->this->name, this, unwind);
GF_VALIDATE_OR_GOTO(frame->this->name, fd, unwind);
inode = fd->inode;
LOCK(&inode->lock);
{
list_for_each_entry(iter_fd, &inode->fd_list, inode_list)
{
tmp_file = 0;
fd_ctx_get(iter_fd, this, &tmp_file);
file = (ra_file_t *)(long)tmp_file;
if (!file)
continue;
flush_region(frame, file, offset, len, 1);
}
}
UNLOCK(&inode->lock);
STACK_WIND(frame, ra_discard_cbk, FIRST_CHILD(this),
FIRST_CHILD(this)->fops->discard, fd, offset, len, xdata);
return 0;
unwind:
STACK_UNWIND_STRICT(discard, frame, -1, op_errno, NULL, NULL, NULL);
return 0;
}
int
ra_zerofill_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
int32_t op_ret, int32_t op_errno, struct iatt *prebuf,
struct iatt *postbuf, dict_t *xdata)
{
GF_ASSERT(frame);
STACK_UNWIND_STRICT(zerofill, frame, op_ret, op_errno, prebuf, postbuf,
xdata);
return 0;
}
static int
ra_zerofill(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset,
off_t len, dict_t *xdata)
{
ra_file_t *file = NULL;
fd_t *iter_fd = NULL;
inode_t *inode = NULL;
uint64_t tmp_file = 0;
int32_t op_errno = EINVAL;
GF_ASSERT(frame);
GF_VALIDATE_OR_GOTO(frame->this->name, this, unwind);
GF_VALIDATE_OR_GOTO(frame->this->name, fd, unwind);
inode = fd->inode;
LOCK(&inode->lock);
{
list_for_each_entry(iter_fd, &inode->fd_list, inode_list)
{
tmp_file = 0;
fd_ctx_get(iter_fd, this, &tmp_file);
file = (ra_file_t *)(long)tmp_file;
if (!file)
continue;
flush_region(frame, file, offset, len, 1);
}
}
UNLOCK(&inode->lock);
STACK_WIND(frame, ra_zerofill_cbk, FIRST_CHILD(this),
FIRST_CHILD(this)->fops->zerofill, fd, offset, len, xdata);
return 0;
unwind:
STACK_UNWIND_STRICT(zerofill, frame, -1, op_errno, NULL, NULL, NULL);
return 0;
}
int
ra_priv_dump(xlator_t *this)
{
ra_conf_t *conf = NULL;
int ret = -1;
char key_prefix[GF_DUMP_MAX_BUF_LEN] = {
0,
};
gf_boolean_t add_section = _gf_false;
if (!this) {
goto out;
}
conf = this->private;
if (!conf) {
gf_msg(this->name, GF_LOG_WARNING, 0, READ_AHEAD_MSG_XLATOR_CONF_NULL,
"conf null in xlator");
goto out;
}
gf_proc_dump_build_key(key_prefix, "xlator.performance.read-ahead", "priv");
gf_proc_dump_add_section("%s", key_prefix);
add_section = _gf_true;
ret = pthread_mutex_trylock(&conf->conf_lock);
if (ret)
goto out;
{
gf_proc_dump_write("page_size", "%" PRIu64, conf->page_size);
gf_proc_dump_write("page_count", "%d", conf->page_count);
gf_proc_dump_write("force_atime_update", "%d",
conf->force_atime_update);
}
pthread_mutex_unlock(&conf->conf_lock);
ret = 0;
out:
if (ret && conf) {
if (add_section == _gf_false)
gf_proc_dump_add_section("%s", key_prefix);
gf_proc_dump_write("Unable to dump priv",
"(Lock acquisition failed) %s", this->name);
}
return ret;
}
int32_t
mem_acct_init(xlator_t *this)
{
int ret = -1;
if (!this) {
goto out;
}
ret = xlator_mem_acct_init(this, gf_ra_mt_end + 1);
if (ret != 0) {
gf_msg(this->name, GF_LOG_ERROR, ENOMEM, READ_AHEAD_MSG_NO_MEMORY,
"Memory accounting init"
"failed");
}
out:
return ret;
}
int
reconfigure(xlator_t *this, dict_t *options)
{
ra_conf_t *conf = NULL;
int ret = -1;
GF_VALIDATE_OR_GOTO("read-ahead", this, out);
GF_VALIDATE_OR_GOTO("read-ahead", this->private, out);
conf = this->private;
GF_OPTION_RECONF("page-count", conf->page_count, options, uint32, out);
GF_OPTION_RECONF("page-size", conf->page_size, options, size_uint64, out);
GF_OPTION_RECONF("pass-through", this->pass_through, options, bool, out);
ret = 0;
out:
return ret;
}
int
init(xlator_t *this)
{
ra_conf_t *conf = NULL;
int32_t ret = -1;
GF_VALIDATE_OR_GOTO("read-ahead", this, out);
if (!this->children || this->children->next) {
gf_msg(this->name, GF_LOG_ERROR, 0,
READ_AHEAD_MSG_XLATOR_CHILD_MISCONFIGURED,
"FATAL: read-ahead not configured with exactly one"
" child");
goto out;
}
if (!this->parents) {
gf_msg(this->name, GF_LOG_WARNING, 0, READ_AHEAD_MSG_VOL_MISCONFIGURED,
"dangling volume. check volfile ");
}
conf = (void *)GF_CALLOC(1, sizeof(*conf), gf_ra_mt_ra_conf_t);
if (conf == NULL) {
goto out;
}
conf->page_size = this->ctx->page_size;
GF_OPTION_INIT("page-size", conf->page_size, size_uint64, out);
GF_OPTION_INIT("page-count", conf->page_count, uint32, out);
GF_OPTION_INIT("force-atime-update", conf->force_atime_update, bool, out);
GF_OPTION_INIT("pass-through", this->pass_through, bool, out);
conf->files.next = &conf->files;
conf->files.prev = &conf->files;
pthread_mutex_init(&conf->conf_lock, NULL);
this->local_pool = mem_pool_new(ra_local_t, 64);
if (!this->local_pool) {
ret = -1;
gf_msg(this->name, GF_LOG_ERROR, ENOMEM, READ_AHEAD_MSG_NO_MEMORY,
"failed to create local_t's memory pool");
goto out;
}
this->private = conf;
ret = 0;
out:
if (ret == -1) {
GF_FREE(conf);
}
return ret;
}
void
fini(xlator_t *this)
{
ra_conf_t *conf = NULL;
GF_VALIDATE_OR_GOTO("read-ahead", this, out);
conf = this->private;
if (conf == NULL) {
goto out;
}
this->private = NULL;
/* The files structures allocated in open and create are not deleted.
* until that is freed, marking the below assert as warning.
GF_ASSERT ((conf->files.next == &conf->files)
&& (conf->files.prev == &conf->files));
*/
if (!((conf->files.next == &conf->files) &&
(conf->files.prev == &conf->files))) {
gf_msg(this->name, GF_LOG_INFO, 0,
READ_AHEAD_MSG_UNDESTROYED_FILE_FOUND,
"undestroyed read ahead file structures found");
}
pthread_mutex_destroy(&conf->conf_lock);
GF_FREE(conf);
out:
return;
}
struct xlator_fops fops = {
.open = ra_open,
.create = ra_create,
.readv = ra_readv,
.writev = ra_writev,
.flush = ra_flush,
.fsync = ra_fsync,
.truncate = ra_truncate,
.ftruncate = ra_ftruncate,
.fstat = ra_fstat,
.discard = ra_discard,
.zerofill = ra_zerofill,
};
struct xlator_cbks cbks = {
.release = ra_release,
};
struct xlator_dumpops dumpops = {
.priv = ra_priv_dump,
.fdctx = ra_fdctx_dump,
};
struct volume_options options[] = {
{
.key = {"read-ahead"},
.type = GF_OPTION_TYPE_BOOL,
.default_value = "off",
.description = "enable/disable read-ahead",
.op_version = {GD_OP_VERSION_6_0},
.flags = OPT_FLAG_SETTABLE,
},
{.key = {"force-atime-update"},
.type = GF_OPTION_TYPE_BOOL,
.op_version = {1},
.tags = {"read-ahead"},
.default_value = "false"},
{.key = {"page-count"},
.type = GF_OPTION_TYPE_INT,
.min = 1,
.max = 16,
.default_value = "4",
.op_version = {1},
.tags = {"read-ahead"},
.description = "Number of pages that will be pre-fetched"},
{.key = {"page-size"},
.type = GF_OPTION_TYPE_SIZET,
.min = 4096,
.max = 1048576 * 64,
.default_value = "131072",
.op_version = {1},
.flags = OPT_FLAG_SETTABLE | OPT_FLAG_DOC | OPT_FLAG_CLIENT_OPT,
.tags = {"read-ahead"},
.description = "Page size with which read-ahead performs server I/O"},
{.key = {"pass-through"},
.type = GF_OPTION_TYPE_BOOL,
.default_value = "false",
.op_version = {GD_OP_VERSION_4_1_0},
.flags = OPT_FLAG_SETTABLE | OPT_FLAG_DOC | OPT_FLAG_CLIENT_OPT,
.tags = {"read-ahead"},
.description = "Enable/Disable read ahead translator"},
{.key = {NULL}},
};
xlator_api_t xlator_api = {
.init = init,
.fini = fini,
.reconfigure = reconfigure,
.mem_acct_init = mem_acct_init,
.op_version = {1}, /* Present from the initial version */
.dumpops = &dumpops,
.fops = &fops,
.cbks = &cbks,
.options = options,
.identifier = "read-ahead",
.category = GF_MAINTAINED,
};