/*
Copyright (c) 2013 Red Hat, Inc. <http://www.redhat.com>
This file is part of GlusterFS.
This file is licensed to you under your choice of the GNU Lesser
General Public License, version 3 or any later version (LGPLv3 or
later), or the GNU General Public License, version 2 (GPLv2), in all
cases as published by the Free Software Foundation.
*/
#include "afr.h"
#include "afr-self-heal.h"
#include <glusterfs/byte-order.h>
#include "protocol-common.h"
#include <glusterfs/events.h>
#define AFR_HEAL_ATTR (GF_SET_ATTR_UID | GF_SET_ATTR_GID | GF_SET_ATTR_MODE)
static gf_boolean_t
_afr_ignorable_key_match(dict_t *d, char *k, data_t *val, void *mdata)
{
return afr_is_xattr_ignorable(k);
}
void
afr_delete_ignorable_xattrs(dict_t *xattr)
{
dict_foreach_match(xattr, _afr_ignorable_key_match, NULL,
dict_remove_foreach_fn, NULL);
}
int
__afr_selfheal_metadata_do(call_frame_t *frame, xlator_t *this, inode_t *inode,
int source, unsigned char *healed_sinks,
struct afr_reply *locked_replies)
{
int ret = -1;
loc_t loc = {
0,
};
dict_t *xattr = NULL;
dict_t *old_xattr = NULL;
afr_private_t *priv = NULL;
int i = 0;
priv = this->private;
loc.inode = inode_ref(inode);
gf_uuid_copy(loc.gfid, inode->gfid);
gf_msg(this->name, GF_LOG_INFO, 0, AFR_MSG_SELF_HEAL_INFO,
"performing metadata selfheal on %s", uuid_utoa(inode->gfid));
ret = syncop_getxattr(priv->children[source], &loc, &xattr, NULL, NULL,
NULL);
if (ret < 0) {
ret = -EIO;
goto out;
}
afr_delete_ignorable_xattrs(xattr);
for (i = 0; i < priv->child_count; i++) {
if (old_xattr) {
dict_unref(old_xattr);
old_xattr = NULL;
}
if (!healed_sinks[i])
continue;
ret = syncop_setattr(priv->children[i], &loc,
&locked_replies[source].poststat, AFR_HEAL_ATTR,
NULL, NULL, NULL, NULL);
if (ret)
healed_sinks[i] = 0;
ret = syncop_getxattr(priv->children[i], &loc, &old_xattr, 0, NULL,
NULL);
if (old_xattr) {
afr_delete_ignorable_xattrs(old_xattr);
ret = syncop_removexattr(priv->children[i], &loc, "", old_xattr,
NULL);
if (ret)
healed_sinks[i] = 0;
}
ret = syncop_setxattr(priv->children[i], &loc, xattr, 0, NULL, NULL);
if (ret)
healed_sinks[i] = 0;
}
ret = 0;
out:
loc_wipe(&loc);
if (xattr)
dict_unref(xattr);
if (old_xattr)
dict_unref(old_xattr);
return ret;
}
static uint64_t
mtime_ns(struct iatt *ia)
{
uint64_t ret;
ret = (((uint64_t)(ia->ia_mtime)) * 1000000000) +
(uint64_t)(ia->ia_mtime_nsec);
return ret;
}
/*
* When directory content is modified, [mc]time is updated. On
* Linux, the filesystem does it, while at least on NetBSD, the
* kernel file-system independent code does it. This means that
* when entries are added while bricks are down, the kernel sends
* a SETATTR [mc]time which will cause metadata split brain for
* the directory. In this case, clear the split brain by finding
* the source with the most recent modification date.
*/
static int
afr_dirtime_splitbrain_source(call_frame_t *frame, xlator_t *this,
struct afr_reply *replies,
unsigned char *locked_on)
{
afr_private_t *priv = NULL;
int source = -1;
struct iatt source_ia;
struct iatt child_ia;
uint64_t mtime = 0;
int i;
int ret = -1;
priv = this->private;
for (i = 0; i < priv->child_count; i++) {
if (!locked_on[i])
continue;
if (!replies[i].valid)
continue;
if (replies[i].op_ret != 0)
continue;
if (mtime_ns(&replies[i].poststat) <= mtime)
continue;
mtime = mtime_ns(&replies[i].poststat);
source = i;
}
if (source == -1)
goto out;
source_ia = replies[source].poststat;
if (source_ia.ia_type != IA_IFDIR)
goto out;
for (i = 0; i < priv->child_count; i++) {
if (i == source)
continue;
if (!replies[i].valid)
continue;
if (replies[i].op_ret != 0)
continue;
child_ia = replies[i].poststat;
if (!IA_EQUAL(source_ia, child_ia, gfid) ||
!IA_EQUAL(source_ia, child_ia, type) ||
!IA_EQUAL(source_ia, child_ia, prot) ||
!IA_EQUAL(source_ia, child_ia, uid) ||
!IA_EQUAL(source_ia, child_ia, gid) ||
!afr_xattrs_are_equal(replies[source].xdata, replies[i].xdata))
goto out;
}
/*
* Metadata split brain is just about [amc]time
* We return our source.
*/
ret = source;
out:
return ret;
}
/*
* Look for mismatching uid/gid or mode or user xattrs even if
* AFR xattrs don't say so, and pick one arbitrarily as winner. */
static int
__afr_selfheal_metadata_finalize_source(call_frame_t *frame, xlator_t *this,
inode_t *inode, unsigned char *sources,
unsigned char *sinks,
unsigned char *healed_sinks,
unsigned char *undid_pending,
unsigned char *locked_on,
struct afr_reply *replies)
{
int i = 0;
afr_private_t *priv = NULL;
struct iatt srcstat = {
0,
};
int source = -1;
int sources_count = 0;
priv = this->private;
sources_count = AFR_COUNT(sources, priv->child_count);
if ((AFR_CMP(locked_on, healed_sinks, priv->child_count) == 0) ||
!sources_count) {
source = afr_mark_split_brain_source_sinks(
frame, this, inode, sources, sinks, healed_sinks, locked_on,
replies, AFR_METADATA_TRANSACTION);
if (source >= 0) {
_afr_fav_child_reset_sink_xattrs(
frame, this, inode, source, healed_sinks, undid_pending,
AFR_METADATA_TRANSACTION, locked_on, replies);
goto out;
}
/* If this is a directory mtime/ctime only split brain
use the most recent */
source = afr_dirtime_splitbrain_source(frame, this, replies, locked_on);
if (source != -1) {
gf_msg(this->name, GF_LOG_INFO, 0, AFR_MSG_SPLIT_BRAIN,
"clear time "
"split brain on %s",
uuid_utoa(replies[source].poststat.ia_gfid));
sources[source] = 1;
healed_sinks[source] = 0;
goto out;
}
if (!priv->metadata_splitbrain_forced_heal) {
gf_event(EVENT_AFR_SPLIT_BRAIN,
"subvol=%s;"
"type=metadata;file=%s",
this->name, uuid_utoa(inode->gfid));
return -EIO;
}
/* Metadata split brain, select one subvol
arbitrarily */
for (i = 0; i < priv->child_count; i++) {
if (locked_on[i] && healed_sinks[i]) {
sources[i] = 1;
healed_sinks[i] = 0;
break;
}
}
}
/* No split brain at this point. If we were called from
* afr_heal_splitbrain_file(), abort.*/
if (afr_dict_contains_heal_op(frame))
return -EIO;
source = afr_choose_source_by_policy(priv, sources,
AFR_METADATA_TRANSACTION);
srcstat = replies[source].poststat;
for (i = 0; i < priv->child_count; i++) {
if (!sources[i] || i == source)
continue;
if (!IA_EQUAL(srcstat, replies[i].poststat, type) ||
!IA_EQUAL(srcstat, replies[i].poststat, uid) ||
!IA_EQUAL(srcstat, replies[i].poststat, gid) ||
!IA_EQUAL(srcstat, replies[i].poststat, prot)) {
gf_msg_debug(this->name, 0,
"%s: iatt mismatch "
"for source(%d) vs (%d)",
uuid_utoa(replies[source].poststat.ia_gfid), source,
i);
sources[i] = 0;
healed_sinks[i] = 1;
}
}
for (i = 0; i < priv->child_count; i++) {
if (!sources[i] || i == source)
continue;
if (!afr_xattrs_are_equal(replies[source].xdata, replies[i].xdata)) {
gf_msg_debug(this->name, 0,
"%s: xattr mismatch "
"for source(%d) vs (%d)",
uuid_utoa(replies[source].poststat.ia_gfid), source,
i);
sources[i] = 0;
healed_sinks[i] = 1;
}
}
out:
afr_mark_active_sinks(this, sources, locked_on, healed_sinks);
return source;
}
int
__afr_selfheal_metadata_prepare(call_frame_t *frame, xlator_t *this,
inode_t *inode, unsigned char *locked_on,
unsigned char *sources, unsigned char *sinks,
unsigned char *healed_sinks,
unsigned char *undid_pending,
struct afr_reply *replies, unsigned char *pflag)
{
int ret = -1;
int source = -1;
afr_private_t *priv = NULL;
int i = 0;
uint64_t *witness = NULL;
priv = this->private;
ret = afr_selfheal_unlocked_discover(frame, inode, inode->gfid, replies);
if (ret)
return ret;
witness = alloca0(sizeof(*witness) * priv->child_count);
ret = afr_selfheal_find_direction(frame, this, replies,
AFR_METADATA_TRANSACTION, locked_on,
sources, sinks, witness, pflag);
if (ret)
return ret;
/* Initialize the healed_sinks[] array optimistically to
the intersection of to-be-healed (i.e sinks[]) and
the list of servers which are up (i.e locked_on[]).
As we encounter failures in the healing process, we
will unmark the respective servers in the healed_sinks[]
array.
*/
AFR_INTERSECT(healed_sinks, sinks, locked_on, priv->child_count);
/* If any source has witness, pick first
* witness source and make everybody else sinks */
for (i = 0; i < priv->child_count; i++) {
if (sources[i] && witness[i]) {
source = i;
break;
}
}
if (source != -1) {
for (i = 0; i < priv->child_count; i++) {
if (i != source && sources[i]) {
sources[i] = 0;
healed_sinks[i] = 1;
}
}
}
source = __afr_selfheal_metadata_finalize_source(
frame, this, inode, sources, sinks, healed_sinks, undid_pending,
locked_on, replies);
if (source < 0)
return -EIO;
return source;
}
int
afr_selfheal_metadata(call_frame_t *frame, xlator_t *this, inode_t *inode)
{
afr_private_t *priv = NULL;
int ret = -1;
unsigned char *sources = NULL;
unsigned char *sinks = NULL;
unsigned char *data_lock = NULL;
unsigned char *healed_sinks = NULL;
unsigned char *undid_pending = NULL;
struct afr_reply *locked_replies = NULL;
gf_boolean_t did_sh = _gf_true;
int source = -1;
priv = this->private;
sources = alloca0(priv->child_count);
sinks = alloca0(priv->child_count);
healed_sinks = alloca0(priv->child_count);
undid_pending = alloca0(priv->child_count);
data_lock = alloca0(priv->child_count);
locked_replies = alloca0(sizeof(*locked_replies) * priv->child_count);
ret = afr_selfheal_inodelk(frame, this, inode, this->name, LLONG_MAX - 1, 0,
data_lock);
{
if (ret < AFR_SH_MIN_PARTICIPANTS) {
ret = -ENOTCONN;
goto unlock;
}
ret = __afr_selfheal_metadata_prepare(
frame, this, inode, data_lock, sources, sinks, healed_sinks,
undid_pending, locked_replies, NULL);
if (ret < 0)
goto unlock;
source = ret;
if (AFR_COUNT(healed_sinks, priv->child_count) == 0) {
did_sh = _gf_false;
goto unlock;
}
ret = __afr_selfheal_metadata_do(frame, this, inode, source,
healed_sinks, locked_replies);
if (ret)
goto unlock;
/* Restore atime/mtime for files that don't need data heal as
* restoring timestamps happens only as a part of data-heal.
*/
if (!IA_ISREG(locked_replies[source].poststat.ia_type))
afr_selfheal_restore_time(frame, this, inode, source, healed_sinks,
locked_replies);
ret = afr_selfheal_undo_pending(
frame, this, inode, sources, sinks, healed_sinks, undid_pending,
AFR_METADATA_TRANSACTION, locked_replies, data_lock);
}
unlock:
afr_selfheal_uninodelk(frame, this, inode, this->name, LLONG_MAX - 1, 0,
data_lock);
if (did_sh)
afr_log_selfheal(inode->gfid, this, ret, "metadata", source, sources,
healed_sinks);
else
ret = 1;
if (locked_replies)
afr_replies_wipe(locked_replies, priv->child_count);
return ret;
}
int
afr_selfheal_metadata_by_stbuf(xlator_t *this, struct iatt *stbuf)
{
inode_t *inode = NULL;
inode_t *link_inode = NULL;
call_frame_t *frame = NULL;
int ret = 0;
if (gf_uuid_is_null(stbuf->ia_gfid)) {
ret = -EINVAL;
goto out;
}
inode = inode_new(this->itable);
if (!inode) {
ret = -ENOMEM;
goto out;
}
link_inode = inode_link(inode, NULL, NULL, stbuf);
if (!link_inode) {
ret = -ENOMEM;
goto out;
}
frame = afr_frame_create(this, &ret);
if (!frame) {
ret = -ret;
goto out;
}
ret = afr_selfheal_metadata(frame, this, link_inode);
out:
if (inode)
inode_unref(inode);
if (link_inode)
inode_unref(link_inode);
if (frame)
AFR_STACK_DESTROY(frame);
return ret;
}