/*
  Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com>
  This file is part of GlusterFS.

  This file is licensed to you under your choice of the GNU Lesser
  General Public License, version 3 or any later version (LGPLv3 or
  later), or the GNU General Public License, version 2 (GPLv2), in all
  cases as published by the Free Software Foundation.
*/

#include <libgen.h>
#include <unistd.h>
#include <fnmatch.h>
#include <sys/time.h>
#include <stdlib.h>
#include <signal.h>

#include "afr-common.c"
#include "afr-messages.h"

struct volume_options options[];

static char *afr_favorite_child_policies[AFR_FAV_CHILD_POLICY_MAX + 1] = {
    [AFR_FAV_CHILD_NONE] = "none",
    [AFR_FAV_CHILD_BY_SIZE] = "size",
    [AFR_FAV_CHILD_BY_CTIME] = "ctime",
    [AFR_FAV_CHILD_BY_MTIME] = "mtime",
    [AFR_FAV_CHILD_BY_MAJORITY] = "majority",
    [AFR_FAV_CHILD_POLICY_MAX] = NULL,
};

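/* Event notification entry point: extracts the optional second dict_t
 * argument from the varargs and forwards everything to afr_notify(). */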
int32_t
notify(xlator_t *this, int32_t event, void *data, ...)
{
    int ret = -1;
    va_list ap;
    void *data2 = NULL;

    va_start(ap, data);
    data2 = va_arg(ap, dict_t *);
    va_end(ap);
    ret = afr_notify(this, event, data, data2);

    return ret;
}

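/* Register this translator's memory-accounting types (gf_afr_mt_*) with
 * the memory-accounting framework. */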
int32_t
mem_acct_init(xlator_t *this)
{
    int ret = -1;

    if (!this)
        return ret;

    ret = xlator_mem_acct_init(this, gf_afr_mt_end + 1);

    return ret;
}

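/* Return the 0-based index of 'subvol' among this xlator's children,
 * matching by pointer or by name; -1 if it is not a child. */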
int
xlator_subvolume_index(xlator_t *this, xlator_t *subvol)
{
    int index = -1;
    int i = 0;
    xlator_list_t *list = NULL;

    list = this->children;

    while (list) {
        if (subvol == list->xlator ||
            strcmp(subvol->name, list->xlator->name) == 0) {
            index = i;
            break;
        }
        list = list->next;
        i++;
    }

    return index;
}

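/* Reconcile quorum-type and quorum-count: default to "auto" for replica
 * counts above two when the user configured nothing, and warn when a
 * non-"fixed" quorum-type overrides an explicit quorum-count. */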
static void
fix_quorum_options(xlator_t *this, afr_private_t *priv, char *qtype,
                   dict_t *options)
{
    if (dict_get_sizen(options, "quorum-type") == NULL) {
        /* If the user hasn't configured anything, enable auto-quorum
         * when the replica has more than two subvolumes */
        if (priv->child_count > 2)
            qtype = "auto";
    }

    if (priv->quorum_count && strcmp(qtype, "fixed")) {
        gf_msg(this->name, GF_LOG_WARNING, 0, AFR_MSG_QUORUM_OVERRIDE,
               "quorum-type %s overriding quorum-count %u", qtype,
               priv->quorum_count);
    }

    if (!strcmp(qtype, "none")) {
        priv->quorum_count = 0;
    } else if (!strcmp(qtype, "auto")) {
        priv->quorum_count = AFR_QUORUM_AUTO;
    }
}

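/* Map a favorite-child policy name ("size", "ctime", ...) to its enum
 * value in priv->fav_child_policy; returns -1 for unknown policies. */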
int
afr_set_favorite_child_policy(afr_private_t *priv, char *policy)
{
    int index = -1;

    index = gf_get_index_by_elem(afr_favorite_child_policies, policy);
    if (index < 0 || index >= AFR_FAV_CHILD_POLICY_MAX)
        return -1;

    priv->fav_child_policy = index;

    return 0;
}

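/* Translate the data-self-heal-algorithm option string into the
 * corresponding enum; anything other than "full" or "diff" (including an
 * unset option) selects the dynamic algorithm. */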
static void
set_data_self_heal_algorithm(afr_private_t *priv, char *algo)
{
    if (!algo) {
        priv->data_self_heal_algorithm = AFR_SELFHEAL_DATA_DYNAMIC;
    } else if (strcmp(algo, "full") == 0) {
        priv->data_self_heal_algorithm = AFR_SELFHEAL_DATA_FULL;
    } else if (strcmp(algo, "diff") == 0) {
        priv->data_self_heal_algorithm = AFR_SELFHEAL_DATA_DIFF;
    } else {
        priv->data_self_heal_algorithm = AFR_SELFHEAL_DATA_DYNAMIC;
    }
}

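/* Re-read all tunables on a volume-set operation and update the private
 * structure in place. */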
int
reconfigure(xlator_t *this, dict_t *options)
{
    afr_private_t *priv = NULL;
    xlator_t *read_subvol = NULL;
    int read_subvol_index = -1;
    int timeout_old = 0;
    int ret = -1;
    int index = -1;
    char *qtype = NULL;
    char *fav_child_policy = NULL;
    char *data_self_heal = NULL;
    char *data_self_heal_algorithm = NULL;
    char *locking_scheme = NULL;
    gf_boolean_t consistent_io = _gf_false;
    gf_boolean_t choose_local_old = _gf_false;
    gf_boolean_t enabled_old = _gf_false;

    priv = this->private;

    GF_OPTION_RECONF("metadata-splitbrain-forced-heal",
                     priv->metadata_splitbrain_forced_heal, options, bool, out);

    GF_OPTION_RECONF("background-self-heal-count",
                     priv->background_self_heal_count, options, uint32, out);

    GF_OPTION_RECONF("heal-wait-queue-length", priv->heal_wait_qlen, options,
                     uint32, out);

    GF_OPTION_RECONF("metadata-self-heal", priv->metadata_self_heal, options,
                     bool, out);

    GF_OPTION_RECONF("data-self-heal", data_self_heal, options, str, out);
    gf_string2boolean(data_self_heal, &priv->data_self_heal);

    GF_OPTION_RECONF("entry-self-heal", priv->entry_self_heal, options, bool,
                     out);

    GF_OPTION_RECONF("data-self-heal-window-size",
                     priv->data_self_heal_window_size, options, uint32, out);

    GF_OPTION_RECONF("data-self-heal-algorithm", data_self_heal_algorithm,
                     options, str, out);
    set_data_self_heal_algorithm(priv, data_self_heal_algorithm);

    GF_OPTION_RECONF("halo-enabled", priv->halo_enabled, options, bool, out);

    GF_OPTION_RECONF("halo-shd-max-latency", priv->shd.halo_max_latency_msec,
                     options, uint32, out);

    GF_OPTION_RECONF("halo-nfsd-max-latency", priv->nfsd.halo_max_latency_msec,
                     options, uint32, out);

    GF_OPTION_RECONF("halo-max-latency", priv->halo_max_latency_msec, options,
                     uint32, out);

    GF_OPTION_RECONF("halo-max-replicas", priv->halo_max_replicas, options,
                     uint32, out);

    GF_OPTION_RECONF("halo-min-replicas", priv->halo_min_replicas, options,
                     uint32, out);

    GF_OPTION_RECONF("read-subvolume", read_subvol, options, xlator, out);

    choose_local_old = priv->choose_local;
    GF_OPTION_RECONF("choose-local", priv->choose_local, options, bool, out);

    if (choose_local_old != priv->choose_local) {
        priv->read_child = -1;
        if (choose_local_old == _gf_false)
            priv->did_discovery = _gf_false;
    }

    GF_OPTION_RECONF("read-hash-mode", priv->hash_mode, options, uint32, out);

    if (read_subvol) {
        index = xlator_subvolume_index(this, read_subvol);
        if (index == -1) {
            gf_msg(this->name, GF_LOG_ERROR, 0, AFR_MSG_INVALID_SUBVOL,
                   "%s not a subvolume", read_subvol->name);
            goto out;
        }
        priv->read_child = index;
    }

    GF_OPTION_RECONF("read-subvolume-index", read_subvol_index, options, int32,
                     out);

    if (read_subvol_index > -1) {
        index = read_subvol_index;
        if (index >= priv->child_count) {
            gf_msg(this->name, GF_LOG_ERROR, 0, AFR_MSG_INVALID_SUBVOL,
                   "%d not a subvolume-index", index);
            goto out;
        }
        priv->read_child = index;
    }

    GF_OPTION_RECONF("pre-op-compat", priv->pre_op_compat, options, bool, out);
    GF_OPTION_RECONF("locking-scheme", locking_scheme, options, str, out);
    priv->granular_locks = (strcmp(locking_scheme, "granular") == 0);
    GF_OPTION_RECONF("full-lock", priv->full_lock, options, bool, out);
    GF_OPTION_RECONF("granular-entry-heal", priv->esh_granular, options, bool,
                     out);

    GF_OPTION_RECONF("eager-lock", priv->eager_lock, options, bool, out);
    GF_OPTION_RECONF("quorum-type", qtype, options, str, out);
    GF_OPTION_RECONF("quorum-count", priv->quorum_count, options, uint32, out);
    fix_quorum_options(this, priv, qtype, options);
    if (priv->quorum_count && !afr_has_quorum(priv->child_up, this, NULL))
        gf_msg(this->name, GF_LOG_WARNING, 0, AFR_MSG_QUORUM_FAIL,
               "Client-quorum is not met");

    GF_OPTION_RECONF("post-op-delay-secs", priv->post_op_delay_secs, options,
                     uint32, out);

    GF_OPTION_RECONF(AFR_SH_READDIR_SIZE_KEY, priv->sh_readdir_size, options,
                     size_uint64, out);
    GF_OPTION_RECONF("ensure-durability", priv->ensure_durability, options,
                     bool, out);

    enabled_old = priv->shd.enabled;
    GF_OPTION_RECONF("self-heal-daemon", priv->shd.enabled, options, bool, out);

    GF_OPTION_RECONF("iam-self-heal-daemon", priv->shd.iamshd, options, bool,
                     out);

    timeout_old = priv->shd.timeout;
    GF_OPTION_RECONF("heal-timeout", priv->shd.timeout, options, int32, out);

    GF_OPTION_RECONF("consistent-metadata", priv->consistent_metadata, options,
                     bool, out);

    GF_OPTION_RECONF("shd-max-threads", priv->shd.max_threads, options, uint32,
                     out);

    GF_OPTION_RECONF("shd-wait-qlength", priv->shd.wait_qlength, options,
                     uint32, out);

    GF_OPTION_RECONF("favorite-child-policy", fav_child_policy, options, str,
                     out);
    if (afr_set_favorite_child_policy(priv, fav_child_policy) == -1)
        goto out;

    /* Reset this so we re-discover in case the topology changed. */
    priv->did_discovery = _gf_false;

    GF_OPTION_RECONF("consistent-io", consistent_io, options, bool, out);
    if (priv->quorum_count != 0)
        consistent_io = _gf_false;
    priv->consistent_io = consistent_io;

    if (priv->shd.enabled) {
        if ((priv->shd.enabled != enabled_old) ||
            (timeout_old != priv->shd.timeout))
            afr_selfheal_childup(this, priv);
    }

    ret = 0;
out:
    return ret;
}

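/* Build the per-child pending-xattr key list. If the volfile supplies
 * "afr-pending-xattr" (a comma-separated list), each entry is prefixed
 * with AFR_XATTR_PREFIX; otherwise the client translator names are used.
 * For example, assuming AFR_XATTR_PREFIX is "trusted.afr", a two-way
 * replica of volume "vol" would get the keys trusted.afr.vol-client-0
 * and trusted.afr.vol-client-1. */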
static int
afr_pending_xattrs_init(afr_private_t *priv, xlator_t *this)
{
    int ret = -1;
    int i = 0;
    char *ptr = NULL;
    char *ptr1 = NULL;
    char *xattrs_list = NULL;
    xlator_list_t *trav = NULL;
    int child_count = -1;

    trav = this->children;
    child_count = priv->child_count;
    if (priv->thin_arbiter_count) {
        /* priv->pending_key[THIN_ARBITER_BRICK_INDEX] is used as the
         * name of the thin arbiter file for persistence across add/
         * removal of DHT subvols.*/
        child_count++;
    }

    GF_OPTION_INIT("afr-pending-xattr", xattrs_list, str, out);
    priv->pending_key = GF_CALLOC(sizeof(*priv->pending_key), child_count,
                                  gf_afr_mt_char);
    if (!priv->pending_key) {
        ret = -ENOMEM;
        goto out;
    }
    if (!xattrs_list) {
        gf_msg(this->name, GF_LOG_WARNING, 0, AFR_MSG_NO_CHANGELOG,
               "Unable to fetch afr-pending-xattr option from volfile."
               " Falling back to using client translator names. ");

        while (i < child_count) {
            ret = gf_asprintf(&priv->pending_key[i], "%s.%s", AFR_XATTR_PREFIX,
                              trav->xlator->name);
            if (ret == -1) {
                ret = -ENOMEM;
                goto out;
            }
            trav = trav->next;
            i++;
        }
        ret = 0;
        goto out;
    }

    ptr = ptr1 = gf_strdup(xattrs_list);
    if (!ptr) {
        ret = -ENOMEM;
        goto out;
    }
    for (i = 0, ptr = strtok(ptr, ","); ptr; ptr = strtok(NULL, ",")) {
        ret = gf_asprintf(&priv->pending_key[i], "%s.%s", AFR_XATTR_PREFIX,
                          ptr);
        if (ret == -1) {
            ret = -ENOMEM;
            goto out;
        }
        i++;
    }
    ret = 0;

out:
    GF_FREE(ptr1);
    return ret;
}

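/* Initialize thin-arbiter state. The thin-arbiter brick is tracked
 * separately from the data children, so priv->child_count is decremented
 * to exclude it. */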
void
afr_ta_init(afr_private_t *priv)
{
    priv->thin_arbiter_count = 1;
    priv->child_count--;
    priv->ta_child_up = 0;
    priv->ta_bad_child_index = AFR_CHILD_UNKNOWN;
    priv->ta_notify_dom_lock_offset = 0;
    priv->ta_in_mem_txn_count = 0;
    priv->ta_on_wire_txn_count = 0;
    priv->release_ta_notify_dom_lock = _gf_false;
    INIT_LIST_HEAD(&priv->ta_waitq);
    INIT_LIST_HEAD(&priv->ta_onwireq);
    *priv->ta_gfid = 0;
}

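/* Translator initialization: validate the graph, allocate the private
 * structure and per-child arrays, and read every option's initial value. */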
int32_t
init(xlator_t *this)
{
    afr_private_t *priv = NULL;
    int child_count = 0;
    xlator_list_t *trav = NULL;
    int i = 0;
    int ret = -1;
    GF_UNUSED int op_errno = 0;
    xlator_t *read_subvol = NULL;
    int read_subvol_index = -1;
    char *qtype = NULL;
    char *fav_child_policy = NULL;
    char *thin_arbiter = NULL;
    char *data_self_heal = NULL;
    char *locking_scheme = NULL;
    char *data_self_heal_algorithm = NULL;

    if (!this->children) {
        gf_msg(this->name, GF_LOG_ERROR, 0, AFR_MSG_CHILD_MISCONFIGURED,
               "replicate translator needs more than one "
               "subvolume defined.");
        return -1;
    }

    if (!this->parents) {
        gf_msg(this->name, GF_LOG_WARNING, 0, AFR_MSG_VOL_MISCONFIGURED,
               "Volume is dangling.");
    }

    this->private = GF_CALLOC(1, sizeof(afr_private_t),
                              gf_afr_mt_afr_private_t);
    if (!this->private)
        goto out;

    priv = this->private;
    LOCK_INIT(&priv->lock);

    child_count = xlator_subvolume_count(this);

    priv->child_count = child_count;

    priv->read_child = -1;

    GF_OPTION_INIT("arbiter-count", priv->arbiter_count, uint32, out);
    GF_OPTION_INIT("thin-arbiter", thin_arbiter, str, out);
    if (thin_arbiter && strlen(thin_arbiter) > 0) {
        afr_ta_init(priv);
    }
    INIT_LIST_HEAD(&priv->healing);
    INIT_LIST_HEAD(&priv->heal_waiting);

    priv->spb_choice_timeout = AFR_DEFAULT_SPB_CHOICE_TIMEOUT;

    GF_OPTION_INIT("afr-dirty-xattr", priv->afr_dirty, str, out);

    GF_OPTION_INIT("metadata-splitbrain-forced-heal",
                   priv->metadata_splitbrain_forced_heal, bool, out);

    GF_OPTION_INIT("read-subvolume", read_subvol, xlator, out);
    if (read_subvol) {
        priv->read_child = xlator_subvolume_index(this, read_subvol);
        if (priv->read_child == -1) {
            gf_msg(this->name, GF_LOG_ERROR, 0, AFR_MSG_INVALID_SUBVOL,
                   "%s not a subvolume", read_subvol->name);
            goto out;
        }
    }
    GF_OPTION_INIT("read-subvolume-index", read_subvol_index, int32, out);
    if (read_subvol_index > -1) {
        if (read_subvol_index >= priv->child_count) {
            gf_msg(this->name, GF_LOG_ERROR, 0, AFR_MSG_INVALID_SUBVOL,
                   "%d not a subvolume-index", read_subvol_index);
            goto out;
        }
        priv->read_child = read_subvol_index;
    }
    GF_OPTION_INIT("choose-local", priv->choose_local, bool, out);

    priv->pending_reads = GF_CALLOC(sizeof(*priv->pending_reads),
                                    priv->child_count, gf_afr_mt_atomic_t);
    if (!priv->pending_reads) {
        ret = -ENOMEM;
        goto out;
    }

    GF_OPTION_INIT("read-hash-mode", priv->hash_mode, uint32, out);

    priv->favorite_child = -1;

    GF_OPTION_INIT("favorite-child-policy", fav_child_policy, str, out);
    if (afr_set_favorite_child_policy(priv, fav_child_policy) == -1)
        goto out;

    GF_OPTION_INIT("shd-max-threads", priv->shd.max_threads, uint32, out);

    GF_OPTION_INIT("shd-wait-qlength", priv->shd.wait_qlength, uint32, out);

    GF_OPTION_INIT("background-self-heal-count",
                   priv->background_self_heal_count, uint32, out);

    GF_OPTION_INIT("heal-wait-queue-length", priv->heal_wait_qlen, uint32, out);

    GF_OPTION_INIT("data-self-heal", data_self_heal, str, out);
    gf_string2boolean(data_self_heal, &priv->data_self_heal);

    GF_OPTION_INIT("data-self-heal-algorithm", data_self_heal_algorithm, str,
                   out);
    set_data_self_heal_algorithm(priv, data_self_heal_algorithm);

    GF_OPTION_INIT("data-self-heal-window-size",
                   priv->data_self_heal_window_size, uint32, out);

    GF_OPTION_INIT("metadata-self-heal", priv->metadata_self_heal, bool, out);

    GF_OPTION_INIT("entry-self-heal", priv->entry_self_heal, bool, out);

    GF_OPTION_INIT("halo-shd-max-latency", priv->shd.halo_max_latency_msec,
                   uint32, out);

    GF_OPTION_INIT("halo-max-latency", priv->halo_max_latency_msec, uint32,
                   out);
    GF_OPTION_INIT("halo-max-replicas", priv->halo_max_replicas, uint32, out);
    GF_OPTION_INIT("halo-min-replicas", priv->halo_min_replicas, uint32, out);

    GF_OPTION_INIT("halo-enabled", priv->halo_enabled, bool, out);

    GF_OPTION_INIT("halo-nfsd-max-latency", priv->nfsd.halo_max_latency_msec,
                   uint32, out);

    GF_OPTION_INIT("iam-nfs-daemon", priv->nfsd.iamnfsd, bool, out);

    GF_OPTION_INIT("optimistic-change-log", priv->optimistic_change_log, bool,
                   out);

    GF_OPTION_INIT("pre-op-compat", priv->pre_op_compat, bool, out);
    GF_OPTION_INIT("locking-scheme", locking_scheme, str, out);
    priv->granular_locks = (strcmp(locking_scheme, "granular") == 0);
    GF_OPTION_INIT("full-lock", priv->full_lock, bool, out);
    GF_OPTION_INIT("granular-entry-heal", priv->esh_granular, bool, out);

    GF_OPTION_INIT("eager-lock", priv->eager_lock, bool, out);
    GF_OPTION_INIT("quorum-type", qtype, str, out);
    GF_OPTION_INIT("quorum-count", priv->quorum_count, uint32, out);
    GF_OPTION_INIT(AFR_SH_READDIR_SIZE_KEY, priv->sh_readdir_size, size_uint64,
                   out);
    fix_quorum_options(this, priv, qtype, this->options);

    GF_OPTION_INIT("post-op-delay-secs", priv->post_op_delay_secs, uint32, out);
    GF_OPTION_INIT("ensure-durability", priv->ensure_durability, bool, out);

    GF_OPTION_INIT("self-heal-daemon", priv->shd.enabled, bool, out);

    GF_OPTION_INIT("iam-self-heal-daemon", priv->shd.iamshd, bool, out);
    GF_OPTION_INIT("heal-timeout", priv->shd.timeout, int32, out);

    GF_OPTION_INIT("consistent-metadata", priv->consistent_metadata, bool, out);
    GF_OPTION_INIT("consistent-io", priv->consistent_io, bool, out);

    if (priv->quorum_count != 0)
        priv->consistent_io = _gf_false;

    priv->wait_count = 1;

    priv->local = GF_CALLOC(sizeof(unsigned char), child_count, gf_afr_mt_char);
    if (!priv->local) {
        ret = -ENOMEM;
        goto out;
    }

    priv->child_up = GF_CALLOC(sizeof(unsigned char), child_count,
                               gf_afr_mt_char);

    priv->child_latency = GF_MALLOC(sizeof(*priv->child_latency) * child_count,
                                    gf_afr_mt_child_latency_t);

    if (!priv->child_up || !priv->child_latency) {
        ret = -ENOMEM;
        goto out;
    }
    /* Initialize to a negative ping latency so that a child is not
     * considered in child-up events until the first ping event arrives */
    for (i = 0; i < child_count; i++)
        priv->child_latency[i] = -1;

    priv->children = GF_CALLOC(sizeof(xlator_t *), child_count,
                               gf_afr_mt_xlator_t);
    if (!priv->children) {
        ret = -ENOMEM;
        goto out;
    }

    ret = afr_pending_xattrs_init(priv, this);
    if (ret)
        goto out;

    trav = this->children;
    i = 0;
    while (i < child_count) {
        priv->children[i] = trav->xlator;
        trav = trav->next;
        i++;
    }

    ret = gf_asprintf(&priv->sh_domain, AFR_SH_DATA_DOMAIN_FMT, this->name);
    if (-1 == ret) {
        ret = -ENOMEM;
        goto out;
    }

    priv->last_event = GF_CALLOC(child_count, sizeof(*priv->last_event),
                                 gf_afr_mt_int32_t);
    if (!priv->last_event) {
        ret = -ENOMEM;
        goto out;
    }

    this->itable = inode_table_new(SHD_INODE_LRU_LIMIT, this);
    if (!this->itable) {
        ret = -ENOMEM;
        goto out;
    }

    if (priv->shd.iamshd) {
        ret = afr_selfheal_daemon_init(this);
        if (ret) {
            ret = -ENOMEM;
            goto out;
        }
    }

    /* keep more locals here as we may need them for self-heal etc. */
    this->local_pool = mem_pool_new(afr_local_t, 512);
    if (!this->local_pool) {
        ret = -1;
        goto out;
    }

    priv->root_inode = NULL;

    ret = 0;
out:
    return ret;
}

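/* Translator teardown: cancel any pending timer under the lock, then
 * destroy the private structure and the inode table. */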
void
fini(xlator_t *this)
{
    afr_private_t *priv = NULL;

    priv = this->private;
    LOCK(&priv->lock);
    if (priv->timer != NULL) {
        gf_timer_call_cancel(this->ctx, priv->timer);
        priv->timer = NULL;
    }
    UNLOCK(&priv->lock);
    this->private = NULL;
    afr_priv_destroy(priv);
    if (this->itable) {
        inode_table_destroy(this->itable);
        this->itable = NULL;
    }

    return;
}

struct xlator_fops fops = {
    .lookup = afr_lookup,
    .lk = afr_lk,
    .flush = afr_flush,
    .statfs = afr_statfs,
    .fsyncdir = afr_fsyncdir,
    .inodelk = afr_inodelk,
    .finodelk = afr_finodelk,
    .entrylk = afr_entrylk,
    .fentrylk = afr_fentrylk,
    .ipc = afr_ipc,
    .lease = afr_lease,

    /* inode read */
    .access = afr_access,
    .stat = afr_stat,
    .fstat = afr_fstat,
    .readlink = afr_readlink,
    .getxattr = afr_getxattr,
    .fgetxattr = afr_fgetxattr,
    .readv = afr_readv,

    /* inode write */
    .writev = afr_writev,
    .truncate = afr_truncate,
    .ftruncate = afr_ftruncate,
    .setxattr = afr_setxattr,
    .fsetxattr = afr_fsetxattr,
    .setattr = afr_setattr,
    .fsetattr = afr_fsetattr,
    .removexattr = afr_removexattr,
    .fremovexattr = afr_fremovexattr,
    .fallocate = afr_fallocate,
    .discard = afr_discard,
    .zerofill = afr_zerofill,
    .xattrop = afr_xattrop,
    .fxattrop = afr_fxattrop,
    .fsync = afr_fsync,

    /*inode open*/
    .opendir = afr_opendir,
    .open = afr_open,

    /* dir read */
    .readdir = afr_readdir,
    .readdirp = afr_readdirp,

    /* dir write */
    .create = afr_create,
    .mknod = afr_mknod,
    .mkdir = afr_mkdir,
    .unlink = afr_unlink,
    .rmdir = afr_rmdir,
    .link = afr_link,
    .symlink = afr_symlink,
    .rename = afr_rename,
};

struct xlator_dumpops dumpops = {
    .priv = afr_priv_dump,
};

struct xlator_cbks cbks = {
    .release = afr_release,
    .releasedir = afr_releasedir,
    .forget = afr_forget,
};

struct volume_options options[] = {
    {.key = {"read-subvolume"},
     .type = GF_OPTION_TYPE_XLATOR,
     .op_version = {1},
     .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE | OPT_FLAG_DOC,
     .tags = {"replicate"},
     .description = "inode-read fops happen only on one of the bricks in "
                    "replicate. Afr will prefer the one specified using "
                    "this option if it is not stale. Option value must be "
                    "one of the xlator names of the children. "
                    "Ex: <volname>-client-0 till "
                    "<volname>-client-<number-of-bricks - 1>"},
    {.key = {"read-subvolume-index"},
     .type = GF_OPTION_TYPE_INT,
     .default_value = "-1",
     .op_version = {2},
     .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE | OPT_FLAG_DOC,
     .tags = {"replicate"},
     .description = "inode-read fops happen only on one of the bricks in "
                    "replicate. AFR will prefer the one specified using "
                    "this option if it is not stale. allowed options"
                    " include -1 till replica-count - 1"},
    {.key = {"read-hash-mode"},
     .type = GF_OPTION_TYPE_INT,
     .min = 0,
     .max = 3,
     .default_value = "1",
     .op_version = {2},
     .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE | OPT_FLAG_DOC,
     .tags = {"replicate"},
     .description =
         "inode-read fops happen only on one of the bricks in "
         "replicate. AFR will prefer the one computed using "
         "the method specified using this option.\n"
         "0 = first readable child of AFR, starting from 1st child.\n"
         "1 = hash by GFID of file (all clients use "
         "same subvolume).\n"
         "2 = hash by GFID of file and client PID.\n"
         "3 = brick having the least outstanding read requests."},
    {
        .key = {"choose-local"},
        .type = GF_OPTION_TYPE_BOOL,
        .default_value = "true",
        .op_version = {2},
        .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE | OPT_FLAG_DOC,
        .tags = {"replicate"},
        .description = "Choose a local subvolume (i.e. Brick) to read from"
                       " if read-subvolume is not explicitly set.",
    },
    {.key = {"background-self-heal-count"},
     .type = GF_OPTION_TYPE_INT,
     .min = 0,
     .max = 256,
     .default_value = "8",
     .validate = GF_OPT_VALIDATE_MIN,
     .op_version = {1},
     .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE | OPT_FLAG_DOC,
     .tags = {"replicate"},
     .description = "This specifies the number of per client self-heal "
                    "jobs that can perform parallel heals in the "
                    "background."},
    {.key = {"halo-shd-max-latency"},
     .type = GF_OPTION_TYPE_INT,
     .min = 1,
     .max = 99999,
     .default_value = "99999",
     .op_version = {GD_OP_VERSION_3_11_0},
     .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE | OPT_FLAG_DOC,
     .tags = {"replicate", "halo"},
     .description = "Maximum latency for shd halo replication in msec."},
    {.key = {"halo-enabled"},
     .type = GF_OPTION_TYPE_BOOL,
     .default_value = "False",
     .op_version = {GD_OP_VERSION_3_11_0},
     .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE | OPT_FLAG_DOC,
     .tags = {"replicate", "halo"},
     .description = "Enable Halo (geo) replication mode."},
    {.key = {"halo-nfsd-max-latency"},
     .type = GF_OPTION_TYPE_INT,
     .min = 1,
     .max = 99999,
     .default_value = "5",
     .op_version = {GD_OP_VERSION_3_11_0},
     .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE | OPT_FLAG_DOC,
     .tags = {"replicate", "halo"},
     .description = "Maximum latency for nfsd halo replication in msec."},
    {.key = {"halo-max-latency"},
     .type = GF_OPTION_TYPE_INT,
     .min = 1,
     .max = AFR_HALO_MAX_LATENCY,
     .default_value = "5",
     .op_version = {GD_OP_VERSION_3_11_0},
     .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE | OPT_FLAG_DOC,
     .tags = {"replicate", "halo"},
     .description = "Maximum latency for halo replication in msec."},
    {.key = {"halo-max-replicas"},
     .type = GF_OPTION_TYPE_INT,
     .min = 1,
     .max = 99999,
     .default_value = "99999",
     .op_version = {GD_OP_VERSION_3_11_0},
     .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE | OPT_FLAG_DOC,
     .tags = {"replicate", "halo"},
     .description = "The maximum number of halo replicas; replicas"
                    " beyond this value will be written asynchronously"
                    "via the SHD."},
    {.key = {"halo-min-replicas"},
     .type = GF_OPTION_TYPE_INT,
     .min = 1,
     .max = 99999,
     .default_value = "2",
     .op_version = {GD_OP_VERSION_3_11_0},
     .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE | OPT_FLAG_DOC,
     .tags = {"replicate", "halo"},
     .description = "The minimmum number of halo replicas, before adding "
                    "out of region replicas."},
    {.key = {"heal-wait-queue-length"},
     .type = GF_OPTION_TYPE_INT,
     .min = 0,
     .max = 10000, /*Around 100MB with sizeof(afr_local_t)= 10496 bytes*/
     .default_value = "128",
     .validate = GF_OPT_VALIDATE_MIN,
     .op_version = {GD_OP_VERSION_3_7_10},
     .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE | OPT_FLAG_DOC,
     .tags = {"replicate"},
     .description = "This specifies the number of heals that can be queued"
                    " for the parallel background self heal jobs."},
    {.key = {"data-self-heal"},
     .type = GF_OPTION_TYPE_STR,
     .value = {"1", "on", "yes", "true", "enable", "0", "off", "no", "false",
               "disable", "open"},
     .default_value = "off",
     .op_version = {1},
     .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE | OPT_FLAG_DOC,
     .tags = {"replicate"},
     .description = "Using this option we can enable/disable data "
                    "self-heal on the file. \"open\" means data "
                    "self-heal action will only be triggered by file "
                    "open operations."},
    {.key = {"data-self-heal-algorithm"},
     .type = GF_OPTION_TYPE_STR,
     .op_version = {1},
     .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE | OPT_FLAG_DOC,
     .tags = {"replicate"},
     .description = "Select between \"full\", \"diff\". The "
                    "\"full\" algorithm copies the entire file from "
                    "source to sink. The \"diff\" algorithm copies to "
                    "sink only those blocks whose checksums don't match "
                    "with those of source. If no option is configured "
                    "the option is chosen dynamically as follows: "
                    "If the file does not exist on one of the sinks "
                    "or empty file exists or if the source file size is "
                    "about the same as page size the entire file will "
                    "be read and written i.e \"full\" algo, "
                    "otherwise \"diff\" algo is chosen.",
     .value = {"diff", "full"}},
    {.key = {"data-self-heal-window-size"},
     .type = GF_OPTION_TYPE_INT,
     .min = 1,
     .max = 1024,
     .default_value = "1",
     .op_version = {1},
     .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE | OPT_FLAG_DOC,
     .tags = {"replicate"},
     .description = "Maximum number blocks per file for which self-heal "
                    "process would be applied simultaneously."},
    {.key = {"metadata-self-heal"},
     .type = GF_OPTION_TYPE_BOOL,
     .default_value = "off",
     .op_version = {1},
     .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE | OPT_FLAG_DOC,
     .tags = {"replicate"},
     /*.validate_fn = validate_replica*/
     .description = "Using this option we can enable/disable metadata "
                    "i.e. Permissions, ownerships, xattrs self-heal on "
                    "the file/directory."},
    {.key = {"entry-self-heal"},
     .type = GF_OPTION_TYPE_BOOL,
     .default_value = "off",
     .op_version = {1},
     .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE | OPT_FLAG_DOC,
     .tags = {"replicate"},
     /*.validate_fn = validate_replica*/
     .description = "Using this option we can enable/disable entry "
                    "self-heal on the directory."},
    {.key = {"data-change-log"},
     .type = GF_OPTION_TYPE_BOOL,
     .default_value = "on",
     .op_version = {1},
     .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE | OPT_FLAG_DOC,
     .tags = {"replicate"},
     .description = "This option exists only for backward compatibility "
                    "and configuring it doesn't have any effect"},
    {.key = {"metadata-change-log"},
     .type = GF_OPTION_TYPE_BOOL,
     .default_value = "on",
     .op_version = {1},
     .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE | OPT_FLAG_DOC,
     .tags = {"replicate"},
     .description = "This option exists only for backward compatibility "
                    "and configuring it doesn't have any effect"},
    {.key = {"entry-change-log"},
     .type = GF_OPTION_TYPE_BOOL,
     .default_value = "on",
     .op_version = {1},
     .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE | OPT_FLAG_DOC,
     .tags = {"replicate"},
     .description = "This option exists only for backward compatibility "
                    "and configuring it doesn't have any effect"},
    {.key = {"optimistic-change-log"},
     .type = GF_OPTION_TYPE_BOOL,
     .default_value = "on",
     .description = "Entry/Metadata fops will not perform "
                    "pre fop changelog operations in afr transaction "
                    "if this option is enabled."},
    {.key = {"inodelk-trace"},
     .type = GF_OPTION_TYPE_BOOL,
     .default_value = "off",
     .description = "Enabling this option logs inode lock/unlocks"},
    {.key = {"entrylk-trace"},
     .type = GF_OPTION_TYPE_BOOL,
     .default_value = "off",
     .description = "Enabling this option logs entry lock/unlocks"},
    {.key = {"pre-op-compat"},
     .type = GF_OPTION_TYPE_BOOL,
     .default_value = "on",
     .description = "Use separate pre-op xattrop() FOP rather than "
                    "overloading xdata of the OP"},
    {.key = {"eager-lock"},
     .type = GF_OPTION_TYPE_BOOL,
     .default_value = "on",
     .op_version = {1},
     .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE | OPT_FLAG_DOC,
     .tags = {"replicate"},
     .description =
         "Enable/disable eager lock for replica volumes. "
         "The lock phase of a transaction has two sub-phases: "
         "first, an attempt to acquire locks in parallel by "
         "broadcasting non-blocking lock requests. If lock "
         "acquisition fails on any server, the held locks "
         "are released and we revert to blocking locks taken "
         "sequentially on one server after another. If this "
         "option is enabled, the initial broadcast lock "
         "request attempts to acquire a full lock on the entire file. "
         "If this fails, we revert to the sequential "
         "\"regional\" blocking locks as before. When such an "
         "\"eager\" lock is granted in the non-blocking phase, "
         "it gives rise to an opportunity for optimization: "
         "if the next write transaction on the same FD arrives "
         "before the unlock phase of the first transaction, it "
         "\"takes over\" the full-file lock. Similarly, if yet "
         "another data transaction arrives before the unlock "
         "phase of the \"optimized\" transaction, that in turn "
         "\"takes over\" the lock as well. The actual unlock "
         "then happens at the end of the last \"optimized\" "
         "transaction."},
    {.key = {"self-heal-daemon"},
     .type = GF_OPTION_TYPE_BOOL,
     .default_value = "on",
     .op_version = {1},
     .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE,
     .tags = {"replicate"},
     /*.validate_fn   = validate_replica_heal_enable_disable*/
     .description = "This option applies to only self-heal-daemon. "
                    "Index directory crawl and automatic healing of files "
                    "will not be performed if this option is turned off."},
    {.key = {"iam-self-heal-daemon"},
     .type = GF_OPTION_TYPE_BOOL,
     .default_value = "off",
     .description = "This option differentiates if the replicate "
                    "translator is running as part of self-heal-daemon "
                    "or not."},
    {.key = {"iam-nfs-daemon"},
     .type = GF_OPTION_TYPE_BOOL,
     .default_value = "off",
     .description = "This option differentiates if the replicate "
                    "translator is running as part of an NFS daemon "
                    "or not."},
    {
        .key = {"quorum-type"},
        .type = GF_OPTION_TYPE_STR,
        .value = {"none", "auto", "fixed"},
        .default_value = "none",
        .op_version = {1},
        .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE | OPT_FLAG_DOC,
        .tags = {"replicate"},
        /*.option = quorum-type*/
        .description = "If value is \"fixed\" only allow writes if "
                       "quorum-count bricks are present.  If value is "
                       "\"auto\" only allow writes if more than half of "
                       "bricks, or exactly half including the first, are "
                       "present.",
    },
    {
        .key = {"quorum-count"},
        .type = GF_OPTION_TYPE_INT,
        .min = 1,
        .max = INT_MAX,
        .default_value = 0,
        .op_version = {1},
        .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE | OPT_FLAG_DOC,
        .tags = {"replicate"},
        /*.option = quorum-count*/
        /*.validate_fn = validate_quorum_count*/
        .description = "If quorum-type is \"fixed\" only allow writes if "
                       "this many bricks are present.  Other quorum types "
                       "will OVERWRITE this value.",
    },
    {
        .key = {"quorum-reads"},
        .type = GF_OPTION_TYPE_BOOL,
        .default_value = "no",
        .op_version = {GD_OP_VERSION_3_7_0},
        .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE | OPT_FLAG_DOC,
        .tags = {"replicate"},
        .description = "This option has been removed. Reads are not allowed "
                       "if quorum is not met.",
    },
    {
        .key = {"node-uuid"},
        .type = GF_OPTION_TYPE_STR,
        .description = "Local glusterd uuid string, used in starting "
                       "self-heal-daemon so that it can crawl only on "
                       "local index directories.",
    },
    {
        .key = {"post-op-delay-secs"},
        .type = GF_OPTION_TYPE_INT,
        .min = 0,
        .max = INT_MAX,
        .default_value = "1",
        .op_version = {2},
        .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE | OPT_FLAG_DOC,
        .tags = {"replicate"},
        .description = "Time interval induced artificially before "
                       "post-operation phase of the transaction to "
                       "enhance overlap of adjacent write operations.",
    },
    {
        .key = {AFR_SH_READDIR_SIZE_KEY},
        .type = GF_OPTION_TYPE_SIZET,
        .description = "readdirp size for performing entry self-heal",
        .min = 1024,
        .max = 131072,
        .op_version = {2},
        .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE,
        .tags = {"replicate"},
        .default_value = "1KB",
    },
    {
        .key = {"ensure-durability"},
        .type = GF_OPTION_TYPE_BOOL,
        .op_version = {3},
        .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE | OPT_FLAG_DOC,
        .tags = {"replicate"},
        .description = "Afr performs fsyncs for transactions if this "
                       "option is on to make sure the changelogs/data is "
                       "written to the disk",
        .default_value = "on",
    },
    {
        .key = {"afr-dirty-xattr"},
        .type = GF_OPTION_TYPE_STR,
        .default_value = AFR_DIRTY_DEFAULT,
    },
    {.key = {"afr-pending-xattr"},
     .type = GF_OPTION_TYPE_STR,
     .description = "Comma separated list of xattrs that are used to  "
                    "capture information on pending heals."},
    {
        .key = {"metadata-splitbrain-forced-heal"},
        .type = GF_OPTION_TYPE_BOOL,
        .default_value = "off",
    },
    {.key = {"heal-timeout"},
     .type = GF_OPTION_TYPE_INT,
     .min = 5,
     .max = INT_MAX,
     .default_value = "600",
     .op_version = {2},
     .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE | OPT_FLAG_DOC,
     .tags = {"replicate"},
     .description = "time interval for checking the need to self-heal "
                    "in self-heal-daemon"},
    {
        .key = {"consistent-metadata"},
        .type = GF_OPTION_TYPE_BOOL,
        .default_value = "no",
        .op_version = {GD_OP_VERSION_3_7_0},
        .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE | OPT_FLAG_DOC,
        .tags = {"replicate"},
        .description = "If this option is enabled, readdirp will force "
                       "lookups on those entries read whose read child is "
                       "not the same as that of the parent. This will "
                       "guarantee that all read operations on a file serve "
                       "attributes from the same subvol as long as it holds "
                       " a good copy of the file/dir.",
    },
    {.key = {"arbiter-count"},
     .type = GF_OPTION_TYPE_INT,
     .description = "subset of child_count. Has to be 0 or 1."},
    {
        .key = {"thin-arbiter"},
        .type = GF_OPTION_TYPE_STR,
        .op_version = {GD_OP_VERSION_4_1_0},
        .flags = OPT_FLAG_SETTABLE,
        .tags = {"replicate"},
        .description = "contains host:path of thin abriter brick",
    },
    {.key = {"shd-max-threads"},
     .type = GF_OPTION_TYPE_INT,
     .min = 1,
     .max = 64,
     .default_value = "1",
     .op_version = {GD_OP_VERSION_3_7_12},
     .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE | OPT_FLAG_DOC,
     .tags = {"replicate"},
     .description = "Maximum number of parallel heals SHD can do per "
                    "local brick. This can substantially lower heal times"
                    ", but can also crush your bricks if you don't have "
                    "the storage hardware to support this."},
    {
        .key = {"shd-wait-qlength"},
        .type = GF_OPTION_TYPE_INT,
        .min = 1,
        .max = 65536,
        .default_value = "1024",
        .op_version = {GD_OP_VERSION_3_7_12},
        .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE | OPT_FLAG_DOC,
        .tags = {"replicate"},
        .description = "This option can be used to control number of heals"
                       " that can wait in SHD per subvolume",
    },
    {
        .key = {"locking-scheme"},
        .type = GF_OPTION_TYPE_STR,
        .value = {"full", "granular"},
        .default_value = "full",
        .op_version = {GD_OP_VERSION_3_7_12},
        .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE | OPT_FLAG_DOC,
        .tags = {"replicate"},
        .description = "If this option is set to granular, self-heal will "
                       "stop being compatible with afr-v1, which helps afr "
                       "be more granular while self-healing",
    },
    {.key = {"full-lock"},
     .type = GF_OPTION_TYPE_BOOL,
     .default_value = "yes",
     .op_version = {GD_OP_VERSION_3_13_2},
     .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE,
     .tags = {"replicate"},
     .description = "If this option is disabled, then the IOs will take "
                    "range locks same as versions till 3.13.1."},
    {
        .key = {"granular-entry-heal"},
        .type = GF_OPTION_TYPE_BOOL,
        .default_value = "no",
        .op_version = {GD_OP_VERSION_3_8_0},
        .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE | OPT_FLAG_DOC,
        .tags = {"replicate"},
        .description = "If this option is enabled, self-heal will resort to "
                       "granular way of recording changelogs and doing entry "
                       "self-heal.",
    },
    {
        .key = {"favorite-child-policy"},
        .type = GF_OPTION_TYPE_STR,
        .value = {"none", "size", "ctime", "mtime", "majority"},
        .default_value = "none",
        .op_version = {GD_OP_VERSION_3_7_12},
        .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE | OPT_FLAG_DOC,
        .tags = {"replicate"},
        .description = "This option can be used to automatically resolve "
                       "split-brains using various policies without user "
                       "intervention. \"size\" picks the file with the "
                       "biggest size as the source. \"ctime\" and \"mtime\" "
                       "pick the file with the latest ctime and mtime "
                       "respectively as the source. \"majority\" picks a file"
                       " with identical mtime and size in more than half the "
                       "number of bricks in the replica.",
    },
    {
        .key = {"consistent-io"},
        .type = GF_OPTION_TYPE_BOOL,
        .default_value = "no",
        .description = "If this option is enabled, i/o will fail even if "
                       "one of the bricks is down in the replicas",
    },
    {.key = {"use-compound-fops"},
     .type = GF_OPTION_TYPE_BOOL,
     .default_value = "no",
     .op_version = {GD_OP_VERSION_3_8_4},
     .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE | OPT_FLAG_DOC,
     .tags = {"replicate"},
     .description = "This option exists only for backward compatibility "
                    "and configuring it doesn't have any effect"},
    {.key = {NULL}},
};

xlator_api_t xlator_api = {
    .init = init,
    .fini = fini,
    .notify = notify,
    .reconfigure = reconfigure,
    .mem_acct_init = mem_acct_init,
    .op_version = {1}, /* Present from the initial version */
    .dumpops = &dumpops,
    .fops = &fops,
    .cbks = &cbks,
    .options = options,
    .identifier = "replicate",
    .category = GF_MAINTAINED,
};