/*
Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com>
This file is part of GlusterFS.
This file is licensed to you under your choice of the GNU Lesser
General Public License, version 3 or any later version (LGPLv3 or
later), or the GNU General Public License, version 2 (GPLv2), in all
cases as published by the Free Software Foundation.
*/
#include <libgen.h>
#include <unistd.h>
#include <fnmatch.h>
#include <sys/time.h>
#include <stdlib.h>
#include <signal.h>
#include "afr-common.c"
#include "afr-messages.h"
/* Tentative declaration of the option table; the definition with its
 * initializer appears further down in this file. */
struct volume_options options[];
/* Strings accepted for favorite-child policies, indexed by the
 * AFR_FAV_CHILD_* constant each one stands for. The slot at
 * AFR_FAV_CHILD_POLICY_MAX is an explicit NULL entry closing the table. */
static char *afr_favorite_child_policies[AFR_FAV_CHILD_POLICY_MAX + 1] = {
[AFR_FAV_CHILD_NONE] = "none",
[AFR_FAV_CHILD_BY_SIZE] = "size",
[AFR_FAV_CHILD_BY_CTIME] = "ctime",
[AFR_FAV_CHILD_BY_MTIME] = "mtime",
[AFR_FAV_CHILD_BY_MAJORITY] = "majority",
[AFR_FAV_CHILD_POLICY_MAX] = NULL,
};
/*
 * Xlator notification entry point.
 *
 * Reads one pointer argument from the variadic list and forwards the event,
 * together with both payloads, to afr_notify().
 *
 * Returns whatever afr_notify() returns.
 */
int32_t
notify(xlator_t *this, int32_t event, void *data, ...)
{
    va_list args;
    void *extra = NULL;

    va_start(args, data);
    extra = va_arg(args, dict_t *);
    va_end(args);

    return afr_notify(this, event, data, extra);
}
/*
 * Set up memory accounting for this translator.
 *
 * Returns -1 when 'this' is NULL; otherwise the result of
 * xlator_mem_acct_init() is handed back unchanged.
 */
int32_t
mem_acct_init(xlator_t *this)
{
    if (this == NULL) {
        return -1;
    }

    /* Success and failure codes are both passed straight through. */
    return xlator_mem_acct_init(this, gf_afr_mt_end + 1);
}
/*
 * Find the position of 'subvol' in this->children.
 *
 * A child matches when it is the same xlator object or when its name
 * compares equal to subvol->name.
 *
 * Returns the zero-based position of the first match, or -1 if none.
 */
int
xlator_subvolume_index(xlator_t *this, xlator_t *subvol)
{
    xlator_list_t *node = NULL;
    int pos = 0;

    for (node = this->children; node != NULL; node = node->next, pos++) {
        if (node->xlator == subvol)
            return pos;
        if (strcmp(subvol->name, node->xlator->name) == 0)
            return pos;
    }

    return -1;
}
/*
 * Reconcile quorum-type and quorum-count after both have been read.
 *
 * this    - translator, used for its name when logging
 * priv    - private state whose quorum_count may be rewritten
 * qtype   - configured quorum type string
 * options - dictionary checked for an explicit "quorum-type" key
 */
static void
fix_quorum_options(xlator_t *this, afr_private_t *priv, char *qtype,
                   dict_t *options)
{
    /* With no explicit quorum-type in the dictionary, replicas with more
     * than two subvolumes are switched to auto-quorum. */
    if (dict_get_sizen(options, "quorum-type") == NULL &&
        priv->child_count > 2) {
        qtype = "auto";
    }

    /* A non-zero count is only meaningful for the "fixed" type. */
    if (priv->quorum_count != 0 && strcmp(qtype, "fixed") != 0) {
        gf_msg(this->name, GF_LOG_WARNING, 0, AFR_MSG_QUORUM_OVERRIDE,
               "quorum-type %s overriding quorum-count %u", qtype,
               priv->quorum_count);
    }

    if (strcmp(qtype, "none") == 0) {
        priv->quorum_count = 0;
        return;
    }

    if (strcmp(qtype, "auto") == 0) {
        priv->quorum_count = AFR_QUORUM_AUTO;
    }
}
/*
 * Translate a policy string into its index in afr_favorite_child_policies
 * and store it in priv->fav_child_policy.
 *
 * Returns 0 when the string maps to an index below
 * AFR_FAV_CHILD_POLICY_MAX, -1 otherwise (priv is left untouched).
 */
int
afr_set_favorite_child_policy(afr_private_t *priv, char *policy)
{
    int slot = gf_get_index_by_elem(afr_favorite_child_policies, policy);

    if (slot >= 0 && slot < AFR_FAV_CHILD_POLICY_MAX) {
        priv->fav_child_policy = slot;
        return 0;
    }

    return -1;
}
/*
 * Store the data self-heal algorithm selected by 'algo' in priv.
 *
 * "full" and "diff" select their respective algorithms; a NULL pointer or
 * any other string selects the dynamic algorithm.
 */
static void
set_data_self_heal_algorithm(afr_private_t *priv, char *algo)
{
    int choice = AFR_SELFHEAL_DATA_DYNAMIC;

    if (algo != NULL) {
        if (strcmp(algo, "full") == 0)
            choice = AFR_SELFHEAL_DATA_FULL;
        else if (strcmp(algo, "diff") == 0)
            choice = AFR_SELFHEAL_DATA_DIFF;
    }

    /* Single store, so the field never holds an intermediate value. */
    priv->data_self_heal_algorithm = choice;
}
/*
 * Apply a changed option dictionary to an already initialised replicate
 * translator.
 *
 * Every GF_OPTION_RECONF() below reads one option from 'options' into the
 * given destination and is handed the 'out' label (presumably its failure
 * target -- the macro is defined elsewhere). Options are applied in the
 * order written, so values stored before an early exit remain in 'priv'.
 *
 * Returns 0 once the whole sequence has run, -1 otherwise.
 */
int
reconfigure(xlator_t *this, dict_t *options)
{
afr_private_t *priv = NULL;
xlator_t *read_subvol = NULL;
int read_subvol_index = -1;
int timeout_old = 0;
int ret = -1;
int index = -1;
char *qtype = NULL;
char *fav_child_policy = NULL;
char *data_self_heal = NULL;
char *data_self_heal_algorithm = NULL;
char *locking_scheme = NULL;
gf_boolean_t consistent_io = _gf_false;
gf_boolean_t choose_local_old = _gf_false;
gf_boolean_t enabled_old = _gf_false;
priv = this->private;
GF_OPTION_RECONF("metadata-splitbrain-forced-heal",
priv->metadata_splitbrain_forced_heal, options, bool, out);
GF_OPTION_RECONF("background-self-heal-count",
priv->background_self_heal_count, options, uint32, out);
GF_OPTION_RECONF("heal-wait-queue-length", priv->heal_wait_qlen, options,
uint32, out);
GF_OPTION_RECONF("metadata-self-heal", priv->metadata_self_heal, options,
bool, out);
/* data-self-heal is read as a string and then converted.
 * NOTE(review): the result of gf_string2boolean() is ignored, and the
 * option table also accepts "open" for this key -- confirm what
 * priv->data_self_heal is expected to hold in that case. */
GF_OPTION_RECONF("data-self-heal", data_self_heal, options, str, out);
gf_string2boolean(data_self_heal, &priv->data_self_heal);
GF_OPTION_RECONF("entry-self-heal", priv->entry_self_heal, options, bool,
out);
GF_OPTION_RECONF("data-self-heal-window-size",
priv->data_self_heal_window_size, options, uint32, out);
GF_OPTION_RECONF("data-self-heal-algorithm", data_self_heal_algorithm,
options, str, out);
set_data_self_heal_algorithm(priv, data_self_heal_algorithm);
GF_OPTION_RECONF("halo-enabled", priv->halo_enabled, options, bool, out);
GF_OPTION_RECONF("halo-shd-max-latency", priv->shd.halo_max_latency_msec,
options, uint32, out);
GF_OPTION_RECONF("halo-nfsd-max-latency", priv->nfsd.halo_max_latency_msec,
options, uint32, out);
GF_OPTION_RECONF("halo-max-latency", priv->halo_max_latency_msec, options,
uint32, out);
GF_OPTION_RECONF("halo-max-replicas", priv->halo_max_replicas, options,
uint32, out);
GF_OPTION_RECONF("halo-min-replicas", priv->halo_min_replicas, options,
uint32, out);
GF_OPTION_RECONF("read-subvolume", read_subvol, options, xlator, out);
/* Remember the previous choose-local value so a change can be detected. */
choose_local_old = priv->choose_local;
GF_OPTION_RECONF("choose-local", priv->choose_local, options, bool, out);
if (choose_local_old != priv->choose_local) {
/* The setting flipped: drop the current read child, and when it was
 * previously off also clear did_discovery. */
priv->read_child = -1;
if (choose_local_old == _gf_false)
priv->did_discovery = _gf_false;
}
GF_OPTION_RECONF("read-hash-mode", priv->hash_mode, options, uint32, out);
/* An explicit read-subvolume must name one of our children. */
if (read_subvol) {
index = xlator_subvolume_index(this, read_subvol);
if (index == -1) {
gf_msg(this->name, GF_LOG_ERROR, 0, AFR_MSG_INVALID_SUBVOL,
"%s not a subvolume", read_subvol->name);
goto out;
}
priv->read_child = index;
}
/* read-subvolume-index is handled after read-subvolume, so when both are
 * given the index is the value that ends up in priv->read_child. */
GF_OPTION_RECONF("read-subvolume-index", read_subvol_index, options, int32,
out);
if (read_subvol_index > -1) {
index = read_subvol_index;
if (index >= priv->child_count) {
gf_msg(this->name, GF_LOG_ERROR, 0, AFR_MSG_INVALID_SUBVOL,
"%d not a subvolume-index", index);
goto out;
}
priv->read_child = index;
}
GF_OPTION_RECONF("pre-op-compat", priv->pre_op_compat, options, bool, out);
GF_OPTION_RECONF("locking-scheme", locking_scheme, options, str, out);
priv->granular_locks = (strcmp(locking_scheme, "granular") == 0);
GF_OPTION_RECONF("full-lock", priv->full_lock, options, bool, out);
GF_OPTION_RECONF("granular-entry-heal", priv->esh_granular, options, bool,
out);
GF_OPTION_RECONF("eager-lock", priv->eager_lock, options, bool, out);
/* Both quorum options are read first; fix_quorum_options() then settles
 * the final quorum_count from the pair. */
GF_OPTION_RECONF("quorum-type", qtype, options, str, out);
GF_OPTION_RECONF("quorum-count", priv->quorum_count, options, uint32, out);
fix_quorum_options(this, priv, qtype, options);
/* Only a warning: reconfiguration continues even without quorum. */
if (priv->quorum_count && !afr_has_quorum(priv->child_up, this, NULL))
gf_msg(this->name, GF_LOG_WARNING, 0, AFR_MSG_QUORUM_FAIL,
"Client-quorum is not met");
GF_OPTION_RECONF("post-op-delay-secs", priv->post_op_delay_secs, options,
uint32, out);
GF_OPTION_RECONF(AFR_SH_READDIR_SIZE_KEY, priv->sh_readdir_size, options,
size_uint64, out);
GF_OPTION_RECONF("ensure-durability", priv->ensure_durability, options,
bool, out);
/* Snapshot the self-heal-daemon state and timeout so changes can be
 * detected at the end of the function. */
enabled_old = priv->shd.enabled;
GF_OPTION_RECONF("self-heal-daemon", priv->shd.enabled, options, bool, out);
GF_OPTION_RECONF("iam-self-heal-daemon", priv->shd.iamshd, options, bool,
out);
timeout_old = priv->shd.timeout;
GF_OPTION_RECONF("heal-timeout", priv->shd.timeout, options, int32, out);
GF_OPTION_RECONF("consistent-metadata", priv->consistent_metadata, options,
bool, out);
GF_OPTION_RECONF("shd-max-threads", priv->shd.max_threads, options, uint32,
out);
GF_OPTION_RECONF("shd-wait-qlength", priv->shd.wait_qlength, options,
uint32, out);
GF_OPTION_RECONF("favorite-child-policy", fav_child_policy, options, str,
out);
if (afr_set_favorite_child_policy(priv, fav_child_policy) == -1)
goto out;
/* Reset this so we re-discover in case the topology changed. */
priv->did_discovery = _gf_false;
/* consistent-io is read into a local first and forced off whenever a
 * quorum count is in effect. */
GF_OPTION_RECONF("consistent-io", consistent_io, options, bool, out);
if (priv->quorum_count != 0)
consistent_io = _gf_false;
priv->consistent_io = consistent_io;
/* Call afr_selfheal_childup() only when the daemon is enabled and either
 * its enabled flag or the heal timeout differs from the old value. */
if (priv->shd.enabled) {
if ((priv->shd.enabled != enabled_old) ||
(timeout_old != priv->shd.timeout))
afr_selfheal_childup(this, priv);
}
ret = 0;
out:
return ret;
}
/*
 * Allocate and fill priv->pending_key[], one key string per child (plus one
 * extra slot when a thin arbiter is configured).
 *
 * Keys are built as "<AFR_XATTR_PREFIX>.<name>". The names come from the
 * comma separated "afr-pending-xattr" option when it is set; otherwise the
 * names of this translator's children are used.
 *
 * Entries in the option beyond the number of allocated slots are ignored
 * with a warning; previously they were written past the end of the array.
 *
 * Returns 0 on success, -ENOMEM when an allocation fails, or the initial -1
 * if option parsing bails out early.
 */
static int
afr_pending_xattrs_init(afr_private_t *priv, xlator_t *this)
{
    int ret = -1;
    int i = 0;
    char *ptr = NULL;
    char *ptr1 = NULL;
    char *xattrs_list = NULL;
    xlator_list_t *trav = NULL;
    int child_count = -1;

    trav = this->children;
    child_count = priv->child_count;
    if (priv->thin_arbiter_count) {
        /* priv->pending_key[THIN_ARBITER_BRICK_INDEX] is used as the
         * name of the thin arbiter file for persistence across add/
         * removal of DHT subvols.*/
        child_count++;
    }

    GF_OPTION_INIT("afr-pending-xattr", xattrs_list, str, out);
    priv->pending_key = GF_CALLOC(sizeof(*priv->pending_key), child_count,
                                  gf_afr_mt_char);
    if (!priv->pending_key) {
        ret = -ENOMEM;
        goto out;
    }

    if (!xattrs_list) {
        gf_msg(this->name, GF_LOG_WARNING, 0, AFR_MSG_NO_CHANGELOG,
               "Unable to fetch afr-pending-xattr option from volfile."
               " Falling back to using client translator names. ");

        while (i < child_count) {
            ret = gf_asprintf(&priv->pending_key[i], "%s.%s", AFR_XATTR_PREFIX,
                              trav->xlator->name);
            if (ret == -1) {
                ret = -ENOMEM;
                goto out;
            }
            trav = trav->next;
            i++;
        }
        ret = 0;
        goto out;
    }

    /* strtok() modifies its input, so tokenize a private copy. ptr1 keeps
     * the start of that copy so it can be freed at 'out'. */
    ptr = ptr1 = gf_strdup(xattrs_list);
    if (!ptr) {
        ret = -ENOMEM;
        goto out;
    }

    for (i = 0, ptr = strtok(ptr, ","); ptr; ptr = strtok(NULL, ",")) {
        if (i >= child_count) {
            /* pending_key[] has exactly child_count slots; never write
             * beyond them. */
            gf_msg(this->name, GF_LOG_WARNING, 0, AFR_MSG_NO_CHANGELOG,
                   "afr-pending-xattr lists more than %d entries; "
                   "ignoring the extra ones.",
                   child_count);
            break;
        }
        ret = gf_asprintf(&priv->pending_key[i], "%s.%s", AFR_XATTR_PREFIX,
                          ptr);
        if (ret == -1) {
            ret = -ENOMEM;
            goto out;
        }
        i++;
    }

    ret = 0;

out:
    GF_FREE(ptr1);
    return ret;
}
/*
 * Put the thin-arbiter related fields of 'priv' into their starting state.
 *
 * One child is taken out of priv->child_count and recorded as the thin
 * arbiter instead; counters, flags and both wait queues start out empty.
 */
void
afr_ta_init(afr_private_t *priv)
{
    /* Move one child from the regular count to the thin-arbiter count. */
    priv->child_count -= 1;
    priv->thin_arbiter_count = 1;

    /* Identity and liveness of the thin arbiter. */
    priv->ta_gfid[0] = 0;
    priv->ta_child_up = 0;
    priv->ta_bad_child_index = AFR_CHILD_UNKNOWN;

    /* Notify-domain lock bookkeeping. */
    priv->release_ta_notify_dom_lock = _gf_false;
    priv->ta_notify_dom_lock_offset = 0;

    /* Transaction counters and their queues. */
    priv->ta_on_wire_txn_count = 0;
    priv->ta_in_mem_txn_count = 0;
    INIT_LIST_HEAD(&priv->ta_onwireq);
    INIT_LIST_HEAD(&priv->ta_waitq);
}
/*
 * Translator initialisation: allocate the private state, read every option
 * into it and allocate the per-child arrays.
 *
 * Returns 0 on success. On failure returns -1 or a negative errno value
 * (-ENOMEM for allocation failures). The 'out' label performs no cleanup of
 * what was allocated so far.
 * NOTE(review): presumably the caller/fini() releases it -- confirm.
 *
 * Two child counts are in play: the local 'child_count' keeps the full
 * number of subvolumes, while afr_ta_init() lowers priv->child_count by one
 * when a thin arbiter is configured. Per-child arrays allocated with the
 * local value therefore also have a slot for the thin arbiter.
 */
int32_t
init(xlator_t *this)
{
    afr_private_t *priv = NULL;
    int child_count = 0;
    xlator_list_t *trav = NULL;
    int i = 0;
    int ret = -1;
    GF_UNUSED int op_errno = 0;
    xlator_t *read_subvol = NULL;
    int read_subvol_index = -1;
    char *qtype = NULL;
    char *fav_child_policy = NULL;
    char *thin_arbiter = NULL;
    char *data_self_heal = NULL;
    char *locking_scheme = NULL;
    char *data_self_heal_algorithm = NULL;

    if (!this->children) {
        gf_msg(this->name, GF_LOG_ERROR, 0, AFR_MSG_CHILD_MISCONFIGURED,
               "replicate translator needs more than one "
               "subvolume defined.");
        return -1;
    }

    if (!this->parents) {
        gf_msg(this->name, GF_LOG_WARNING, 0, AFR_MSG_VOL_MISCONFIGURED,
               "Volume is dangling.");
    }

    this->private = GF_CALLOC(1, sizeof(afr_private_t),
                              gf_afr_mt_afr_private_t);
    if (!this->private)
        goto out;

    priv = this->private;
    LOCK_INIT(&priv->lock);

    child_count = xlator_subvolume_count(this);

    priv->child_count = child_count;

    priv->read_child = -1;

    GF_OPTION_INIT("arbiter-count", priv->arbiter_count, uint32, out);
    GF_OPTION_INIT("thin-arbiter", thin_arbiter, str, out);
    if (thin_arbiter && strlen(thin_arbiter) > 0) {
        afr_ta_init(priv);
    }
    INIT_LIST_HEAD(&priv->healing);
    INIT_LIST_HEAD(&priv->heal_waiting);

    priv->spb_choice_timeout = AFR_DEFAULT_SPB_CHOICE_TIMEOUT;

    GF_OPTION_INIT("afr-dirty-xattr", priv->afr_dirty, str, out);

    GF_OPTION_INIT("metadata-splitbrain-forced-heal",
                   priv->metadata_splitbrain_forced_heal, bool, out);

    /* An explicit read-subvolume must name one of our children. */
    GF_OPTION_INIT("read-subvolume", read_subvol, xlator, out);
    if (read_subvol) {
        priv->read_child = xlator_subvolume_index(this, read_subvol);
        if (priv->read_child == -1) {
            gf_msg(this->name, GF_LOG_ERROR, 0, AFR_MSG_INVALID_SUBVOL,
                   "%s not a subvolume", read_subvol->name);
            goto out;
        }
    }

    /* Handled after read-subvolume, so a valid index takes precedence. */
    GF_OPTION_INIT("read-subvolume-index", read_subvol_index, int32, out);
    if (read_subvol_index > -1) {
        if (read_subvol_index >= priv->child_count) {
            gf_msg(this->name, GF_LOG_ERROR, 0, AFR_MSG_INVALID_SUBVOL,
                   "%d not a subvolume-index", read_subvol_index);
            goto out;
        }
        priv->read_child = read_subvol_index;
    }
    GF_OPTION_INIT("choose-local", priv->choose_local, bool, out);

    priv->pending_reads = GF_CALLOC(sizeof(*priv->pending_reads),
                                    priv->child_count, gf_afr_mt_atomic_t);
    if (!priv->pending_reads) {
        /* Fail here like every other allocation in this function instead
         * of leaving a NULL array behind. */
        ret = -ENOMEM;
        goto out;
    }

    GF_OPTION_INIT("read-hash-mode", priv->hash_mode, uint32, out);

    priv->favorite_child = -1;

    GF_OPTION_INIT("favorite-child-policy", fav_child_policy, str, out);
    if (afr_set_favorite_child_policy(priv, fav_child_policy) == -1)
        goto out;

    GF_OPTION_INIT("shd-max-threads", priv->shd.max_threads, uint32, out);

    GF_OPTION_INIT("shd-wait-qlength", priv->shd.wait_qlength, uint32, out);

    GF_OPTION_INIT("background-self-heal-count",
                   priv->background_self_heal_count, uint32, out);

    GF_OPTION_INIT("heal-wait-queue-length", priv->heal_wait_qlen, uint32, out);

    /* NOTE(review): the result of gf_string2boolean() is ignored, and the
     * option table also accepts "open" for this key -- confirm what
     * priv->data_self_heal is expected to hold in that case. */
    GF_OPTION_INIT("data-self-heal", data_self_heal, str, out);
    gf_string2boolean(data_self_heal, &priv->data_self_heal);

    GF_OPTION_INIT("data-self-heal-algorithm", data_self_heal_algorithm, str,
                   out);
    set_data_self_heal_algorithm(priv, data_self_heal_algorithm);

    GF_OPTION_INIT("data-self-heal-window-size",
                   priv->data_self_heal_window_size, uint32, out);

    GF_OPTION_INIT("metadata-self-heal", priv->metadata_self_heal, bool, out);

    GF_OPTION_INIT("entry-self-heal", priv->entry_self_heal, bool, out);

    GF_OPTION_INIT("halo-shd-max-latency", priv->shd.halo_max_latency_msec,
                   uint32, out);

    GF_OPTION_INIT("halo-max-latency", priv->halo_max_latency_msec, uint32,
                   out);
    GF_OPTION_INIT("halo-max-replicas", priv->halo_max_replicas, uint32, out);
    GF_OPTION_INIT("halo-min-replicas", priv->halo_min_replicas, uint32, out);

    GF_OPTION_INIT("halo-enabled", priv->halo_enabled, bool, out);

    GF_OPTION_INIT("halo-nfsd-max-latency", priv->nfsd.halo_max_latency_msec,
                   uint32, out);

    GF_OPTION_INIT("iam-nfs-daemon", priv->nfsd.iamnfsd, bool, out);

    GF_OPTION_INIT("optimistic-change-log", priv->optimistic_change_log, bool,
                   out);

    GF_OPTION_INIT("pre-op-compat", priv->pre_op_compat, bool, out);
    GF_OPTION_INIT("locking-scheme", locking_scheme, str, out);
    priv->granular_locks = (strcmp(locking_scheme, "granular") == 0);
    GF_OPTION_INIT("full-lock", priv->full_lock, bool, out);
    GF_OPTION_INIT("granular-entry-heal", priv->esh_granular, bool, out);

    GF_OPTION_INIT("eager-lock", priv->eager_lock, bool, out);
    GF_OPTION_INIT("quorum-type", qtype, str, out);
    GF_OPTION_INIT("quorum-count", priv->quorum_count, uint32, out);
    GF_OPTION_INIT(AFR_SH_READDIR_SIZE_KEY, priv->sh_readdir_size, size_uint64,
                   out);
    /* Settle the final quorum_count from the type/count pair. */
    fix_quorum_options(this, priv, qtype, this->options);

    GF_OPTION_INIT("post-op-delay-secs", priv->post_op_delay_secs, uint32, out);
    GF_OPTION_INIT("ensure-durability", priv->ensure_durability, bool, out);

    GF_OPTION_INIT("self-heal-daemon", priv->shd.enabled, bool, out);

    GF_OPTION_INIT("iam-self-heal-daemon", priv->shd.iamshd, bool, out);
    GF_OPTION_INIT("heal-timeout", priv->shd.timeout, int32, out);

    GF_OPTION_INIT("consistent-metadata", priv->consistent_metadata, bool, out);
    GF_OPTION_INIT("consistent-io", priv->consistent_io, bool, out);

    /* consistent-io is forced off whenever a quorum count is in effect. */
    if (priv->quorum_count != 0)
        priv->consistent_io = _gf_false;

    priv->wait_count = 1;

    priv->local = GF_CALLOC(sizeof(unsigned char), child_count, gf_afr_mt_char);
    if (!priv->local) {
        ret = -ENOMEM;
        goto out;
    }

    priv->child_up = GF_CALLOC(sizeof(unsigned char), child_count,
                               gf_afr_mt_char);

    priv->child_latency = GF_MALLOC(sizeof(*priv->child_latency) * child_count,
                                    gf_afr_mt_child_latency_t);

    if (!priv->child_up || !priv->child_latency) {
        ret = -ENOMEM;
        goto out;
    }
    /*Initialize to -ve ping timeout so that they are not considered
     * in child-up events until ping-event comes*/
    for (i = 0; i < child_count; i++)
        priv->child_latency[i] = -1;

    priv->children = GF_CALLOC(sizeof(xlator_t *), child_count,
                               gf_afr_mt_xlator_t);
    if (!priv->children) {
        ret = -ENOMEM;
        goto out;
    }

    ret = afr_pending_xattrs_init(priv, this);
    if (ret)
        goto out;

    /* Mirror the child list into an array for indexed access. */
    trav = this->children;
    i = 0;
    while (i < child_count) {
        priv->children[i] = trav->xlator;

        trav = trav->next;
        i++;
    }

    ret = gf_asprintf(&priv->sh_domain, AFR_SH_DATA_DOMAIN_FMT, this->name);
    if (-1 == ret) {
        ret = -ENOMEM;
        goto out;
    }

    priv->last_event = GF_CALLOC(child_count, sizeof(*priv->last_event),
                                 gf_afr_mt_int32_t);
    if (!priv->last_event) {
        ret = -ENOMEM;
        goto out;
    }

    this->itable = inode_table_new(SHD_INODE_LRU_LIMIT, this);
    if (!this->itable) {
        ret = -ENOMEM;
        goto out;
    }

    /* The self-heal daemon machinery is only set up when this instance
     * was told it is running as the daemon. */
    if (priv->shd.iamshd) {
        ret = afr_selfheal_daemon_init(this);
        if (ret) {
            ret = -ENOMEM;
            goto out;
        }
    }

    /* keep more local here as we may need them for self-heal etc */
    this->local_pool = mem_pool_new(afr_local_t, 512);
    if (!this->local_pool) {
        ret = -1;
        goto out;
    }

    priv->root_inode = NULL;

    ret = 0;
out:
    return ret;
}
/*
 * Translator teardown: cancel any pending timer, destroy the private state
 * and destroy the inode table.
 *
 * Safe to call when this->private is NULL (for example if init() failed
 * before allocating it); in that case only the inode table is handled.
 */
void
fini(xlator_t *this)
{
    afr_private_t *priv = this->private;

    if (priv != NULL) {
        /* The timer is cancelled and cleared under the private lock. */
        LOCK(&priv->lock);
        if (priv->timer != NULL) {
            gf_timer_call_cancel(this->ctx, priv->timer);
            priv->timer = NULL;
        }
        UNLOCK(&priv->lock);

        /* Detach before destroying so this->private never dangles. */
        this->private = NULL;
        afr_priv_destroy(priv);
    }

    if (this->itable) {
        inode_table_destroy(this->itable);
        this->itable = NULL;
    }

    return;
}
/* File-operation table of the replicate translator: each fop named here is
 * bound to its afr_* handler. Members not listed are left zero-initialised
 * by the designated initializer. */
struct xlator_fops fops = {
.lookup = afr_lookup,
.lk = afr_lk,
.flush = afr_flush,
.statfs = afr_statfs,
.fsyncdir = afr_fsyncdir,
.inodelk = afr_inodelk,
.finodelk = afr_finodelk,
.entrylk = afr_entrylk,
.fentrylk = afr_fentrylk,
.ipc = afr_ipc,
.lease = afr_lease,
/* inode read */
.access = afr_access,
.stat = afr_stat,
.fstat = afr_fstat,
.readlink = afr_readlink,
.getxattr = afr_getxattr,
.fgetxattr = afr_fgetxattr,
.readv = afr_readv,
/* inode write */
.writev = afr_writev,
.truncate = afr_truncate,
.ftruncate = afr_ftruncate,
.setxattr = afr_setxattr,
.fsetxattr = afr_fsetxattr,
.setattr = afr_setattr,
.fsetattr = afr_fsetattr,
.removexattr = afr_removexattr,
.fremovexattr = afr_fremovexattr,
.fallocate = afr_fallocate,
.discard = afr_discard,
.zerofill = afr_zerofill,
.xattrop = afr_xattrop,
.fxattrop = afr_fxattrop,
.fsync = afr_fsync,
/* inode open */
.opendir = afr_opendir,
.open = afr_open,
/* dir read */
.readdir = afr_readdir,
.readdirp = afr_readdirp,
/* dir write */
.create = afr_create,
.mknod = afr_mknod,
.mkdir = afr_mkdir,
.unlink = afr_unlink,
.rmdir = afr_rmdir,
.link = afr_link,
.symlink = afr_symlink,
.rename = afr_rename,
};
/* State-dump hooks: only the private-state dumper is provided. */
struct xlator_dumpops dumpops = {
.priv = afr_priv_dump,
};
/* Callback table binding release, releasedir and forget to their afr_*
 * handlers. */
struct xlator_cbks cbks = {
.release = afr_release,
.releasedir = afr_releasedir,
.forget = afr_forget,
};
/*
 * Option table of the replicate translator.
 *
 * Each entry names one option key together with its type, optional range or
 * value list, default, op-version, flags, tags and help text. The table is
 * terminated by an entry whose key is NULL.
 */
struct volume_options options[] = {
    {.key = {"read-subvolume"},
     .type = GF_OPTION_TYPE_XLATOR,
     .op_version = {1},
     .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE | OPT_FLAG_DOC,
     .tags = {"replicate"},
     .description = "inode-read fops happen only on one of the bricks in "
                    "replicate. Afr will prefer the one specified using "
                    "this option if it is not stale. Option value must be "
                    "one of the xlator names of the children. "
                    "Ex: <volname>-client-0 till "
                    "<volname>-client-<number-of-bricks - 1>"},
    {.key = {"read-subvolume-index"},
     .type = GF_OPTION_TYPE_INT,
     .default_value = "-1",
     .op_version = {2},
     .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE | OPT_FLAG_DOC,
     .tags = {"replicate"},
     .description = "inode-read fops happen only on one of the bricks in "
                    "replicate. AFR will prefer the one specified using "
                    "this option if it is not stale. allowed options"
                    " include -1 till replica-count - 1"},
    {.key = {"read-hash-mode"},
     .type = GF_OPTION_TYPE_INT,
     .min = 0,
     .max = 3,
     .default_value = "1",
     .op_version = {2},
     .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE | OPT_FLAG_DOC,
     .tags = {"replicate"},
     .description =
         "inode-read fops happen only on one of the bricks in "
         "replicate. AFR will prefer the one computed using "
         "the method specified using this option.\n"
         "0 = first readable child of AFR, starting from 1st child.\n"
         "1 = hash by GFID of file (all clients use "
         "same subvolume).\n"
         "2 = hash by GFID of file and client PID.\n"
         "3 = brick having the least outstanding read requests."},
    {
        .key = {"choose-local"},
        .type = GF_OPTION_TYPE_BOOL,
        .default_value = "true",
        .op_version = {2},
        .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE | OPT_FLAG_DOC,
        .tags = {"replicate"},
        .description = "Choose a local subvolume (i.e. Brick) to read from"
                       " if read-subvolume is not explicitly set.",
    },
    {.key = {"background-self-heal-count"},
     .type = GF_OPTION_TYPE_INT,
     .min = 0,
     .max = 256,
     .default_value = "8",
     .validate = GF_OPT_VALIDATE_MIN,
     .op_version = {1},
     .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE | OPT_FLAG_DOC,
     .tags = {"replicate"},
     .description = "This specifies the number of per client self-heal "
                    "jobs that can perform parallel heals in the "
                    "background."},
    {.key = {"halo-shd-max-latency"},
     .type = GF_OPTION_TYPE_INT,
     .min = 1,
     .max = 99999,
     .default_value = "99999",
     .op_version = {GD_OP_VERSION_3_11_0},
     .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE | OPT_FLAG_DOC,
     .tags = {"replicate", "halo"},
     .description = "Maximum latency for shd halo replication in msec."},
    {.key = {"halo-enabled"},
     .type = GF_OPTION_TYPE_BOOL,
     .default_value = "False",
     .op_version = {GD_OP_VERSION_3_11_0},
     .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE | OPT_FLAG_DOC,
     .tags = {"replicate", "halo"},
     .description = "Enable Halo (geo) replication mode."},
    {.key = {"halo-nfsd-max-latency"},
     .type = GF_OPTION_TYPE_INT,
     .min = 1,
     .max = 99999,
     .default_value = "5",
     .op_version = {GD_OP_VERSION_3_11_0},
     .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE | OPT_FLAG_DOC,
     .tags = {"replicate", "halo"},
     .description = "Maximum latency for nfsd halo replication in msec."},
    {.key = {"halo-max-latency"},
     .type = GF_OPTION_TYPE_INT,
     .min = 1,
     .max = AFR_HALO_MAX_LATENCY,
     .default_value = "5",
     .op_version = {GD_OP_VERSION_3_11_0},
     .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE | OPT_FLAG_DOC,
     .tags = {"replicate", "halo"},
     .description = "Maximum latency for halo replication in msec."},
    {.key = {"halo-max-replicas"},
     .type = GF_OPTION_TYPE_INT,
     .min = 1,
     .max = 99999,
     .default_value = "99999",
     .op_version = {GD_OP_VERSION_3_11_0},
     .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE | OPT_FLAG_DOC,
     .tags = {"replicate", "halo"},
     .description = "The maximum number of halo replicas; replicas"
                    " beyond this value will be written asynchronously"
                    " via the SHD."},
    {.key = {"halo-min-replicas"},
     .type = GF_OPTION_TYPE_INT,
     .min = 1,
     .max = 99999,
     .default_value = "2",
     .op_version = {GD_OP_VERSION_3_11_0},
     .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE | OPT_FLAG_DOC,
     .tags = {"replicate", "halo"},
     .description = "The minimum number of halo replicas, before adding "
                    "out of region replicas."},
    {.key = {"heal-wait-queue-length"},
     .type = GF_OPTION_TYPE_INT,
     .min = 0,
     .max = 10000, /*Around 100MB with sizeof(afr_local_t)= 10496 bytes*/
     .default_value = "128",
     .validate = GF_OPT_VALIDATE_MIN,
     .op_version = {GD_OP_VERSION_3_7_10},
     .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE | OPT_FLAG_DOC,
     .tags = {"replicate"},
     .description = "This specifies the number of heals that can be queued"
                    " for the parallel background self heal jobs."},
    {.key = {"data-self-heal"},
     .type = GF_OPTION_TYPE_STR,
     .value = {"1", "on", "yes", "true", "enable", "0", "off", "no", "false",
               "disable", "open"},
     .default_value = "off",
     .op_version = {1},
     .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE | OPT_FLAG_DOC,
     .tags = {"replicate"},
     .description = "Using this option we can enable/disable data "
                    "self-heal on the file. \"open\" means data "
                    "self-heal action will only be triggered by file "
                    "open operations."},
    {.key = {"data-self-heal-algorithm"},
     .type = GF_OPTION_TYPE_STR,
     .op_version = {1},
     .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE | OPT_FLAG_DOC,
     .tags = {"replicate"},
     .description = "Select between \"full\", \"diff\". The "
                    "\"full\" algorithm copies the entire file from "
                    "source to sink. The \"diff\" algorithm copies to "
                    "sink only those blocks whose checksums don't match "
                    "with those of source. If no option is configured "
                    "the option is chosen dynamically as follows: "
                    "If the file does not exist on one of the sinks "
                    "or empty file exists or if the source file size is "
                    "about the same as page size the entire file will "
                    "be read and written i.e \"full\" algo, "
                    "otherwise \"diff\" algo is chosen.",
     .value = {"diff", "full"}},
    {.key = {"data-self-heal-window-size"},
     .type = GF_OPTION_TYPE_INT,
     .min = 1,
     .max = 1024,
     .default_value = "1",
     .op_version = {1},
     .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE | OPT_FLAG_DOC,
     .tags = {"replicate"},
     .description = "Maximum number blocks per file for which self-heal "
                    "process would be applied simultaneously."},
    {.key = {"metadata-self-heal"},
     .type = GF_OPTION_TYPE_BOOL,
     .default_value = "off",
     .op_version = {1},
     .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE | OPT_FLAG_DOC,
     .tags = {"replicate"},
     /*.validate_fn = validate_replica*/
     .description = "Using this option we can enable/disable metadata "
                    "i.e. Permissions, ownerships, xattrs self-heal on "
                    "the file/directory."},
    {.key = {"entry-self-heal"},
     .type = GF_OPTION_TYPE_BOOL,
     .default_value = "off",
     .op_version = {1},
     .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE | OPT_FLAG_DOC,
     .tags = {"replicate"},
     /*.validate_fn = validate_replica*/
     .description = "Using this option we can enable/disable entry "
                    "self-heal on the directory."},
    {.key = {"data-change-log"},
     .type = GF_OPTION_TYPE_BOOL,
     .default_value = "on",
     .op_version = {1},
     .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE | OPT_FLAG_DOC,
     .tags = {"replicate"},
     .description = "This option exists only for backward compatibility "
                    "and configuring it doesn't have any effect"},
    {.key = {"metadata-change-log"},
     .type = GF_OPTION_TYPE_BOOL,
     .default_value = "on",
     .op_version = {1},
     .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE | OPT_FLAG_DOC,
     .tags = {"replicate"},
     .description = "This option exists only for backward compatibility "
                    "and configuring it doesn't have any effect"},
    {.key = {"entry-change-log"},
     .type = GF_OPTION_TYPE_BOOL,
     .default_value = "on",
     .op_version = {1},
     .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE | OPT_FLAG_DOC,
     .tags = {"replicate"},
     .description = "This option exists only for backward compatibility "
                    "and configuring it doesn't have any effect"},
    {.key = {"optimistic-change-log"},
     .type = GF_OPTION_TYPE_BOOL,
     .default_value = "on",
     .description = "Entry/Metadata fops will not perform "
                    "pre fop changelog operations in afr transaction "
                    "if this option is enabled."},
    {.key = {"inodelk-trace"},
     .type = GF_OPTION_TYPE_BOOL,
     .default_value = "off",
     .description = "Enabling this option logs inode lock/unlocks"},
    {.key = {"entrylk-trace"},
     .type = GF_OPTION_TYPE_BOOL,
     .default_value = "off",
     .description = "Enabling this option logs entry lock/unlocks"},
    {.key = {"pre-op-compat"},
     .type = GF_OPTION_TYPE_BOOL,
     .default_value = "on",
     .description = "Use separate pre-op xattrop() FOP rather than "
                    "overloading xdata of the OP"},
    {.key = {"eager-lock"},
     .type = GF_OPTION_TYPE_BOOL,
     .default_value = "on",
     .op_version = {1},
     .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE | OPT_FLAG_DOC,
     .tags = {"replicate"},
     .description =
         "Enable/Disable eager lock for replica volume. "
         "Lock phase of a transaction has two sub-phases. "
         "First is an attempt to acquire locks in parallel by "
         "broadcasting non-blocking lock requests. If lock "
         "acquisition fails on any server, then the held locks "
         "are unlocked and we revert to a blocking locks mode "
         "sequentially on one server after another. If this "
         "option is enabled the initial broadcasting lock "
         "request attempts to acquire a full lock on the entire file. "
         "If this fails, we revert back to the sequential "
         "\"regional\" blocking locks as before. In the case "
         "where such an \"eager\" lock is granted in the "
         "non-blocking phase, it gives rise to an opportunity "
         "for optimization. i.e, if the next write transaction "
         "on the same FD arrives before the unlock phase of "
         "the first transaction, it \"takes over\" the full "
         "file lock. Similarly if yet another data transaction "
         "arrives before the unlock phase of the \"optimized\" "
         "transaction, that in turn \"takes over\" the lock as "
         "well. The actual unlock now happens at the end of "
         "the last \"optimized\" transaction."

    },
    {.key = {"self-heal-daemon"},
     .type = GF_OPTION_TYPE_BOOL,
     .default_value = "on",
     .op_version = {1},
     .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE,
     .tags = {"replicate"},
     /*.validate_fn = validate_replica_heal_enable_disable*/
     .description = "This option applies to only self-heal-daemon. "
                    "Index directory crawl and automatic healing of files "
                    "will not be performed if this option is turned off."},
    {.key = {"iam-self-heal-daemon"},
     .type = GF_OPTION_TYPE_BOOL,
     .default_value = "off",
     .description = "This option differentiates if the replicate "
                    "translator is running as part of self-heal-daemon "
                    "or not."},
    {.key = {"iam-nfs-daemon"},
     .type = GF_OPTION_TYPE_BOOL,
     .default_value = "off",
     .description = "This option differentiates if the replicate "
                    "translator is running as part of an NFS daemon "
                    "or not."},
    {
        .key = {"quorum-type"},
        .type = GF_OPTION_TYPE_STR,
        .value = {"none", "auto", "fixed"},
        .default_value = "none",
        .op_version = {1},
        .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE | OPT_FLAG_DOC,
        .tags = {"replicate"},
        /*.option = quorum-type*/
        .description = "If value is \"fixed\" only allow writes if "
                       "quorum-count bricks are present.  If value is "
                       "\"auto\" only allow writes if more than half of "
                       "bricks, or exactly half including the first, are "
                       "present.",
    },
    {
        .key = {"quorum-count"},
        .type = GF_OPTION_TYPE_INT,
        .min = 1,
        .max = INT_MAX,
        /* No default string: the count only matters for the "fixed"
         * quorum type, and 0 would be below .min anyway. */
        .default_value = NULL,
        .op_version = {1},
        .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE | OPT_FLAG_DOC,
        .tags = {"replicate"},
        /*.option = quorum-count*/
        /*.validate_fn = validate_quorum_count*/
        .description = "If quorum-type is \"fixed\" only allow writes if "
                       "this many bricks are present.  Other quorum types "
                       "will OVERWRITE this value.",
    },
    {
        .key = {"quorum-reads"},
        .type = GF_OPTION_TYPE_BOOL,
        .default_value = "no",
        .op_version = {GD_OP_VERSION_3_7_0},
        .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE | OPT_FLAG_DOC,
        .tags = {"replicate"},
        .description = "This option has been removed. Reads are not allowed "
                       "if quorum is not met.",
    },
    {
        .key = {"node-uuid"},
        .type = GF_OPTION_TYPE_STR,
        .description = "Local glusterd uuid string, used in starting "
                       "self-heal-daemon so that it can crawl only on "
                       "local index directories.",
    },
    {
        .key = {"post-op-delay-secs"},
        .type = GF_OPTION_TYPE_INT,
        .min = 0,
        .max = INT_MAX,
        .default_value = "1",
        .op_version = {2},
        .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE | OPT_FLAG_DOC,
        .tags = {"replicate"},
        .description = "Time interval induced artificially before "
                       "post-operation phase of the transaction to "
                       "enhance overlap of adjacent write operations.",
    },
    {
        .key = {AFR_SH_READDIR_SIZE_KEY},
        .type = GF_OPTION_TYPE_SIZET,
        .description = "readdirp size for performing entry self-heal",
        .min = 1024,
        .max = 131072,
        .op_version = {2},
        .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE,
        .tags = {"replicate"},
        .default_value = "1KB",
    },
    {
        .key = {"ensure-durability"},
        .type = GF_OPTION_TYPE_BOOL,
        .op_version = {3},
        .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE | OPT_FLAG_DOC,
        .tags = {"replicate"},
        .description = "Afr performs fsyncs for transactions if this "
                       "option is on to make sure the changelogs/data is "
                       "written to the disk",
        .default_value = "on",
    },
    {
        .key = {"afr-dirty-xattr"},
        .type = GF_OPTION_TYPE_STR,
        .default_value = AFR_DIRTY_DEFAULT,
    },
    {.key = {"afr-pending-xattr"},
     .type = GF_OPTION_TYPE_STR,
     .description = "Comma separated list of xattrs that are used to  "
                    "capture information on pending heals."},
    {
        .key = {"metadata-splitbrain-forced-heal"},
        .type = GF_OPTION_TYPE_BOOL,
        .default_value = "off",
    },
    {.key = {"heal-timeout"},
     .type = GF_OPTION_TYPE_INT,
     .min = 5,
     .max = INT_MAX,
     .default_value = "600",
     .op_version = {2},
     .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE | OPT_FLAG_DOC,
     .tags = {"replicate"},
     .description = "time interval for checking the need to self-heal "
                    "in self-heal-daemon"},
    {
        .key = {"consistent-metadata"},
        .type = GF_OPTION_TYPE_BOOL,
        .default_value = "no",
        .op_version = {GD_OP_VERSION_3_7_0},
        .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE | OPT_FLAG_DOC,
        .tags = {"replicate"},
        .description = "If this option is enabled, readdirp will force "
                       "lookups on those entries read whose read child is "
                       "not the same as that of the parent. This will "
                       "guarantee that all read operations on a file serve "
                       "attributes from the same subvol as long as it holds "
                       "a good copy of the file/dir.",
    },
    {.key = {"arbiter-count"},
     .type = GF_OPTION_TYPE_INT,
     .description = "subset of child_count. Has to be 0 or 1."},
    {
        .key = {"thin-arbiter"},
        .type = GF_OPTION_TYPE_STR,
        .op_version = {GD_OP_VERSION_4_1_0},
        .flags = OPT_FLAG_SETTABLE,
        .tags = {"replicate"},
        .description = "contains host:path of thin arbiter brick",
    },
    {.key = {"shd-max-threads"},
     .type = GF_OPTION_TYPE_INT,
     .min = 1,
     .max = 64,
     .default_value = "1",
     .op_version = {GD_OP_VERSION_3_7_12},
     .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE | OPT_FLAG_DOC,
     .tags = {"replicate"},
     .description = "Maximum number of parallel heals SHD can do per "
                    "local brick. This can substantially lower heal times"
                    ", but can also crush your bricks if you don't have "
                    "the storage hardware to support this."},
    {
        .key = {"shd-wait-qlength"},
        .type = GF_OPTION_TYPE_INT,
        .min = 1,
        /* NOTE(review): 655536 looks like a typo for 65536; left as is
         * because lowering the bound would reject values that are
         * accepted today -- confirm the intended limit. */
        .max = 655536,
        .default_value = "1024",
        .op_version = {GD_OP_VERSION_3_7_12},
        .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE | OPT_FLAG_DOC,
        .tags = {"replicate"},
        .description = "This option can be used to control number of heals"
                       " that can wait in SHD per subvolume",
    },
    {
        .key = {"locking-scheme"},
        .type = GF_OPTION_TYPE_STR,
        .value = {"full", "granular"},
        .default_value = "full",
        .op_version = {GD_OP_VERSION_3_7_12},
        .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE | OPT_FLAG_DOC,
        .tags = {"replicate"},
        .description = "If this option is set to granular, self-heal will "
                       "stop being compatible with afr-v1, which helps afr "
                       "be more granular while self-healing",
    },
    {.key = {"full-lock"},
     .type = GF_OPTION_TYPE_BOOL,
     .default_value = "yes",
     .op_version = {GD_OP_VERSION_3_13_2},
     .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE,
     .tags = {"replicate"},
     .description = "If this option is disabled, then the IOs will take "
                    "range locks same as versions till 3.13.1."},
    {
        .key = {"granular-entry-heal"},
        .type = GF_OPTION_TYPE_BOOL,
        .default_value = "no",
        .op_version = {GD_OP_VERSION_3_8_0},
        .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE | OPT_FLAG_DOC,
        .tags = {"replicate"},
        .description = "If this option is enabled, self-heal will resort to "
                       "granular way of recording changelogs and doing entry "
                       "self-heal.",
    },
    {
        .key = {"favorite-child-policy"},
        .type = GF_OPTION_TYPE_STR,
        .value = {"none", "size", "ctime", "mtime", "majority"},
        .default_value = "none",
        .op_version = {GD_OP_VERSION_3_7_12},
        .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE | OPT_FLAG_DOC,
        .tags = {"replicate"},
        .description = "This option can be used to automatically resolve "
                       "split-brains using various policies without user "
                       "intervention. \"size\" picks the file with the "
                       "biggest size as the source. \"ctime\" and \"mtime\" "
                       "pick the file with the latest ctime and mtime "
                       "respectively as the source. \"majority\" picks a file"
                       " with identical mtime and size in more than half the "
                       "number of bricks in the replica.",
    },
    {
        .key = {"consistent-io"},
        .type = GF_OPTION_TYPE_BOOL,
        .default_value = "no",
        .description = "If this option is enabled, i/o will fail even if "
                       "one of the bricks is down in the replicas",
    },
    {.key = {"use-compound-fops"},
     .type = GF_OPTION_TYPE_BOOL,
     .default_value = "no",
     .op_version = {GD_OP_VERSION_3_8_4},
     .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE | OPT_FLAG_DOC,
     .tags = {"replicate"},
     .description = "This option exists only for backward compatibility "
                    "and configuring it doesn't have any effect"},
    {.key = {NULL}},
};
/* Descriptor that bundles this translator's entry points (init, fini,
 * notify, reconfigure, mem_acct_init), its fop/callback/dump tables, its
 * option table and its identifying metadata. */
xlator_api_t xlator_api = {
.init = init,
.fini = fini,
.notify = notify,
.reconfigure = reconfigure,
.mem_acct_init = mem_acct_init,
.op_version = {1}, /* Present from the initial version */
.dumpops = &dumpops,
.fops = &fops,
.cbks = &cbks,
.options = options,
.identifier = "replicate",
.category = GF_MAINTAINED,
};