/*
Copyright (c) 2013 Red Hat, Inc. <http://www.redhat.com>
This file is part of GlusterFS.
This file is licensed to you under your choice of the GNU Lesser
General Public License, version 3 or any later version (LGPLv3 or
later), or the GNU General Public License, version 2 (GPLv2), in all
cases as published by the Free Software Foundation.
*/
#include "afr.h"
#include "afr-self-heal.h"
#include <glusterfs/byte-order.h>
#include "afr-transaction.h"
#include "afr-messages.h"
#include <glusterfs/syncop-utils.h>
#include <glusterfs/events.h>
/*
 * Expunge the entry @name of directory @dir from the brick at index @child.
 *
 * Nothing is sent to the brick unless its reply in @replies is valid and
 * reports success. When the reply describes a directory the entry is removed
 * with syncop_rmdir() (flags argument 1); every other type goes through
 * syncop_unlink().
 *
 * Returns 0 when there was nothing to remove, otherwise the value returned
 * by the syncop that was issued.
 */
static int
afr_selfheal_entry_delete(xlator_t *this, inode_t *dir, const char *name,
                          inode_t *inode, int child, struct afr_reply *replies)
{
    afr_private_t *priv = this->private;
    xlator_t *brick = priv->children[child];
    struct afr_reply *reply = &replies[child];
    char gfid_str[64];
    int op_status = 0;
    loc_t target = {
        0,
    };

    /* Describe <dir>/<name>; both inode references are dropped by loc_wipe. */
    target.parent = inode_ref(dir);
    gf_uuid_copy(target.pargfid, dir->gfid);
    target.name = name;
    target.inode = inode_ref(inode);

    if (!reply->valid || reply->op_ret != 0)
        goto done;

    if (reply->poststat.ia_type == IA_IFDIR) {
        gf_msg(this->name, GF_LOG_WARNING, 0, AFR_MSG_EXPUNGING_FILE_OR_DIR,
               "expunging dir %s/%s (%s) on %s", uuid_utoa(dir->gfid), name,
               uuid_utoa_r(reply->poststat.ia_gfid, gfid_str), brick->name);
        op_status = syncop_rmdir(brick, &target, 1, NULL, NULL);
    } else {
        gf_msg(this->name, GF_LOG_WARNING, 0, AFR_MSG_EXPUNGING_FILE_OR_DIR,
               "expunging file %s/%s (%s) on %s", uuid_utoa(dir->gfid), name,
               uuid_utoa_r(reply->poststat.ia_gfid, gfid_str), brick->name);
        op_status = syncop_unlink(brick, &target, NULL, NULL);
    }

done:
    loc_wipe(&target);
    return op_status;
}
/*
 * Re-create the entry @name of directory @dir on brick @dst so that it
 * matches what brick @source reported in @replies.
 *
 * Steps, in order:
 *   1. reject a source reply with an invalid type or a null gfid (-EINVAL);
 *   2. remove whatever @dst currently holds under @name;
 *   3. put the source gfid into xdata under "gfid-req";
 *   4. for directories, or when a gfid lookup on @dst returns -ENOENT or
 *      -ESTALE, call afr_selfheal_newentry_mark() before creating anything;
 *   5. create the entry: mkdir for directories; for symlinks either a hard
 *      link (when step 4 did not mark @dst) or readlink on @source followed
 *      by symlink on @dst; mknod for every other type.
 *
 * Returns 0 on success, -EINVAL / -ENOMEM for the local failures, otherwise
 * the value of the call that failed. A non-positive readlink result is
 * returned unchanged.
 */
int
afr_selfheal_recreate_entry(call_frame_t *frame, int dst, int source,
                            unsigned char *sources, inode_t *dir,
                            const char *name, inode_t *inode,
                            struct afr_reply *replies)
{
    int ret = 0;
    loc_t loc = {
        0,
    };
    loc_t srcloc = {
        0,
    };
    xlator_t *this = frame->this;
    afr_private_t *priv = NULL;
    dict_t *xdata = NULL;
    struct iatt *iatt = NULL;
    char *linkname = NULL;
    mode_t mode = 0;
    struct iatt newent = {
        0,
    };
    unsigned char *newentry = NULL;
    char gfid_str[64];

    priv = this->private;
    iatt = &replies[source].poststat;
    if (iatt->ia_type == IA_INVAL || gf_uuid_is_null(iatt->ia_gfid)) {
        /* Two UUIDs are printed by one call: the second one gets its own
         * buffer through uuid_utoa_r() so it does not share uuid_utoa()'s
         * buffer (same pattern as afr_selfheal_entry_delete()).
         */
        gf_msg(this->name, GF_LOG_ERROR, 0, AFR_MSG_SELF_HEAL_FAILED,
               "Invalid ia_type (%d) or gfid(%s). source brick=%d, "
               "pargfid=%s, name=%s",
               iatt->ia_type, uuid_utoa_r(iatt->ia_gfid, gfid_str), source,
               uuid_utoa(dir->gfid), name);
        ret = -EINVAL;
        goto out;
    }

    xdata = dict_new();
    if (!xdata)
        return -ENOMEM;

    /* One flag per child; only newentry[dst] is ever set below. */
    newentry = alloca0(priv->child_count);

    loc.parent = inode_ref(dir);
    gf_uuid_copy(loc.pargfid, dir->gfid);
    loc.name = name;
    loc.inode = inode_ref(inode);

    /* Clear the way on the destination brick first. */
    ret = afr_selfheal_entry_delete(this, dir, name, inode, dst, replies);
    if (ret)
        goto out;

    ret = dict_set_gfuuid(xdata, "gfid-req", replies[source].poststat.ia_gfid,
                          true);
    if (ret)
        goto out;

    /* srcloc addresses the inode by gfid only (no parent, no name). */
    srcloc.inode = inode_ref(inode);
    gf_uuid_copy(srcloc.gfid, iatt->ia_gfid);
    if (iatt->ia_type != IA_IFDIR)
        ret = syncop_lookup(priv->children[dst], &srcloc, 0, 0, 0, 0);
    if (iatt->ia_type == IA_IFDIR || ret == -ENOENT || ret == -ESTALE) {
        newentry[dst] = 1;
        ret = afr_selfheal_newentry_mark(frame, this, inode, source, replies,
                                         sources, newentry);
        if (ret)
            goto out;
    }

    mode = st_mode_from_ia(iatt->ia_prot, iatt->ia_type);

    switch (iatt->ia_type) {
        case IA_IFDIR:
            ret = syncop_mkdir(priv->children[dst], &loc, mode, 0, xdata, NULL);
            break;
        case IA_IFLNK:
            if (!newentry[dst]) {
                /* The gfid lookup on dst did not report ENOENT/ESTALE:
                 * link the name to that inode.
                 */
                ret = syncop_link(priv->children[dst], &srcloc, &loc, &newent,
                                  NULL, NULL);
            } else {
                ret = syncop_readlink(priv->children[source], &srcloc,
                                      &linkname, 4096, NULL, NULL);
                if (ret <= 0)
                    goto out;
                ret = syncop_symlink(priv->children[dst], &loc, linkname, NULL,
                                     xdata, NULL);
            }
            break;
        default:
            ret = dict_set_int32_sizen(xdata, GLUSTERFS_INTERNAL_FOP_KEY, 1);
            if (ret)
                goto out;
            ret = syncop_mknod(
                priv->children[dst], &loc, mode,
                makedev(ia_major(iatt->ia_rdev), ia_minor(iatt->ia_rdev)),
                &newent, xdata, NULL);
            break;
    }

out:
    if (xdata)
        dict_unref(xdata);
    GF_FREE(linkname);
    loc_wipe(&loc);
    loc_wipe(&srcloc);
    return ret;
}
/*
 * Heal one directory entry when a definite @source brick is known.
 *
 * - An invalid source reply yields -EIO.
 * - A source lookup that failed with anything other than ENOENT yields the
 *   negated errno; the entry is skipped.
 * - When the source has the entry, its gfid is healed first through
 *   afr_lookup_and_heal_gfid().
 * - Then, for every healed sink: if the source lacks the entry (ENOENT) the
 *   entry is deleted from the sink; otherwise the entry is re-created on the
 *   sink unless the sink's gfid already equals the source's.
 *
 * The loop stops at the first negative result. Returns the last result.
 */
static int
__afr_selfheal_heal_dirent(call_frame_t *frame, xlator_t *this, fd_t *fd,
                           char *name, inode_t *inode, int source,
                           unsigned char *sources, unsigned char *healed_sinks,
                           unsigned char *locked_on, struct afr_reply *replies)
{
    afr_private_t *priv = this->private;
    struct afr_reply *src_reply = &replies[source];
    int status = 0;
    int child = 0;

    if (!src_reply->valid)
        return -EIO;

    /* Lookup on the source failed for a reason other than "no such entry". */
    if (src_reply->op_ret < 0 && src_reply->op_errno != ENOENT)
        return -src_reply->op_errno;

    if (src_reply->op_ret == 0) {
        status = afr_lookup_and_heal_gfid(this, fd->inode, name, inode, replies,
                                          source, sources,
                                          &src_reply->poststat.ia_gfid, NULL);
        if (status)
            return status;
    }

    for (child = 0; child < priv->child_count; child++) {
        if (!healed_sinks[child])
            continue;

        if (src_reply->op_ret == -1 && src_reply->op_errno == ENOENT) {
            status = afr_selfheal_entry_delete(this, fd->inode, name, inode,
                                               child, replies);
        } else if (gf_uuid_compare(replies[child].poststat.ia_gfid,
                                   src_reply->poststat.ia_gfid)) {
            status = afr_selfheal_recreate_entry(frame, child, source, sources,
                                                 fd->inode, name, inode,
                                                 replies);
        } else {
            /* Sink already carries the source's gfid: nothing to do. */
            continue;
        }

        if (status < 0)
            break;
    }

    return status;
}
/*
 * Compare the lookup replies of all bricks against a reference reply and
 * detect gfid or file-type disagreement for <pargfid>/@bname.
 *
 * The reference starts as replies[@src_idx]. Replies that are invalid,
 * failed, carry a null gfid or an IA_INVAL type are ignored. If the current
 * reference itself has an IA_INVAL type or null gfid, the first usable reply
 * becomes the new reference.
 *
 * - Same type, different gfid: the decision is delegated to
 *   afr_gfid_split_brain_source(), whose result is returned (and logged when
 *   non-zero).
 * - Different type: logged, an EVENT_AFR_SPLIT_BRAIN event is emitted and -1
 *   is returned.
 *
 * Returns 0 when no disagreement was found.
 */
static int
afr_selfheal_detect_gfid_and_type_mismatch(xlator_t *this,
                                           struct afr_reply *replies,
                                           inode_t *inode, uuid_t pargfid,
                                           char *bname, int src_idx,
                                           unsigned char *locked_on, int *src)
{
    afr_private_t *priv = this->private;
    void *ref_gfid = &replies[src_idx].poststat.ia_gfid;
    ia_type_t ref_type = replies[src_idx].poststat.ia_type;
    int status = -1;
    int child = 0;

    for (child = 0; child < priv->child_count; child++) {
        /* Skip the reference itself and every reply that cannot be used. */
        if (child == src_idx || !replies[child].valid ||
            replies[child].op_ret != 0 ||
            gf_uuid_is_null(replies[child].poststat.ia_gfid) ||
            replies[child].poststat.ia_type == IA_INVAL)
            continue;

        /* Reference is unusable: adopt this reply as the reference. */
        if (ref_type == IA_INVAL || gf_uuid_is_null(ref_gfid)) {
            src_idx = child;
            ref_type = replies[src_idx].poststat.ia_type;
            ref_gfid = &replies[src_idx].poststat.ia_gfid;
            continue;
        }

        if (gf_uuid_compare(ref_gfid, replies[child].poststat.ia_gfid) &&
            (ref_type == replies[child].poststat.ia_type)) {
            status = afr_gfid_split_brain_source(this, replies, inode, pargfid,
                                                 bname, src_idx, child,
                                                 locked_on, src, NULL);
            if (status)
                gf_msg(this->name, GF_LOG_ERROR, 0, AFR_MSG_SPLIT_BRAIN,
                       "Skipping conservative merge on the "
                       "file.");
            return status;
        }

        if (ref_type != replies[child].poststat.ia_type) {
            gf_msg(this->name, GF_LOG_ERROR, 0, AFR_MSG_SPLIT_BRAIN,
                   "Type mismatch detected "
                   "for <gfid:%s>/%s>, %s on %s and %s on %s. "
                   "Skipping conservative merge on the file.",
                   uuid_utoa(pargfid), bname,
                   gf_inode_type_to_str(replies[child].poststat.ia_type),
                   priv->children[child]->name,
                   gf_inode_type_to_str(replies[src_idx].poststat.ia_type),
                   priv->children[src_idx]->name);
            gf_event(EVENT_AFR_SPLIT_BRAIN,
                     "client-pid=%d;"
                     "subvol=%s;type=file;"
                     "file=<gfid:%s>/%s>;count=2;child-%d=%s;type-"
                     "%d=%s;child-%d=%s;type-%d=%s",
                     this->ctx->cmd_args.client_pid, this->name,
                     uuid_utoa(pargfid), bname, child,
                     priv->children[child]->name, child,
                     gf_inode_type_to_str(replies[child].poststat.ia_type),
                     src_idx, priv->children[src_idx]->name, src_idx,
                     gf_inode_type_to_str(replies[src_idx].poststat.ia_type));
            return -1;
        }
    }

    return 0;
}
/*
 * Conservative merge of one directory entry, used when no single source
 * brick could be chosen for the parent directory.
 *
 * Every brick whose lookup of @name succeeded is marked in @sources and the
 * first of them acts as the working source. After healing the gfid and
 * checking for gfid/type mismatch, the entry is re-created on each healed
 * sink that needs it:
 *   - if mismatch resolution picked a brick (@src != -1), on sinks whose
 *     gfid differs from that brick's (bricks with a differing gfid are also
 *     removed from @sources);
 *   - otherwise on sinks whose reply carries op_errno ENOENT.
 *
 * Returns 0 if nobody has the entry, the failing step's result, or the
 * bitwise OR of the afr_selfheal_recreate_entry() results.
 */
static int
__afr_selfheal_merge_dirent(call_frame_t *frame, xlator_t *this, fd_t *fd,
                            char *name, inode_t *inode, unsigned char *sources,
                            unsigned char *healed_sinks,
                            unsigned char *locked_on, struct afr_reply *replies)
{
    afr_private_t *priv = this->private;
    gf_boolean_t recreate = _gf_false;
    int first_good = -1;
    int resolved = -1;
    int status = 0;
    int child = 0;

    /* Mark all good repliers as sources (otherwise newentry_mark won't be
     * set) and remember the first one.
     */
    for (child = 0; child < priv->child_count; child++) {
        if (!replies[child].valid || replies[child].op_ret != 0)
            continue;
        if (first_good == -1)
            first_good = child;
        sources[child] = 1;
    }

    /* Entry got deleted in the meantime? */
    if (first_good == -1)
        return 0;

    status = afr_lookup_and_heal_gfid(this, fd->inode, name, inode, replies,
                                      first_good, sources,
                                      &replies[first_good].poststat.ia_gfid,
                                      NULL);
    if (status)
        return status;

    /* A type mismatch, or a gfid mismatch that cannot be resolved, comes
     * back as a negative value.
     */
    status = afr_selfheal_detect_gfid_and_type_mismatch(
        this, replies, inode, fd->inode->gfid, name, first_good, locked_on,
        &resolved);
    if (status < 0)
        return status;

    if (resolved != -1) {
        first_good = resolved;
        for (child = 0; child < priv->child_count; child++) {
            if (child == resolved || !replies[child].valid)
                continue;
            if (gf_uuid_compare(replies[resolved].poststat.ia_gfid,
                                replies[child].poststat.ia_gfid))
                sources[child] = 0;
        }
    }

    for (child = 0; child < priv->child_count; child++) {
        if (child == first_good || !healed_sinks[child])
            continue;

        if (resolved != -1)
            recreate = gf_uuid_compare(replies[resolved].poststat.ia_gfid,
                                       replies[child].poststat.ia_gfid) != 0;
        else
            recreate = (replies[child].op_errno == ENOENT);

        if (!recreate)
            continue;

        status |= afr_selfheal_recreate_entry(frame, child, first_good, sources,
                                              fd->inode, name, inode, replies);
    }

    return status;
}
/*
 * Dispatch healing of a single entry: with a non-negative @source the entry
 * is healed from that brick, with a negative one a conservative merge is
 * performed. Returns the chosen helper's result.
 */
static int
__afr_selfheal_entry_dirent(call_frame_t *frame, xlator_t *this, fd_t *fd,
                            char *name, inode_t *inode, int source,
                            unsigned char *sources, unsigned char *healed_sinks,
                            unsigned char *locked_on, struct afr_reply *replies)
{
    if (source >= 0)
        return __afr_selfheal_heal_dirent(frame, this, fd, name, inode, source,
                                          sources, healed_sinks, locked_on,
                                          replies);

    return __afr_selfheal_merge_dirent(frame, this, fd, name, inode, sources,
                                       healed_sinks, locked_on, replies);
}
/*
 * Tell whether @xdata carries a non-zero counter at position @idx in the
 * value of any of priv->pending_key[].
 *
 * Each value is copied into a three-int array and the @idx-th element is
 * converted with ntoh32() before being tested. Keys that are absent or map
 * to a NULL pointer are skipped. A NULL @xdata yields _gf_false.
 */
static gf_boolean_t
is_full_heal_marker_present(xlator_t *this, dict_t *xdata, int idx)
{
    afr_private_t *priv = this->private;
    void *raw = NULL;
    int child = 0;
    int counters[3] = {
        0,
    };

    if (xdata == NULL)
        return _gf_false;

    for (child = 0; child < priv->child_count; child++) {
        if (dict_get_ptr(xdata, priv->pending_key[child], &raw) != 0 ||
            raw == NULL)
            continue;

        memcpy(counters, raw, sizeof(counters));
        if (ntoh32(counters[idx]) != 0)
            return _gf_true;
    }

    return _gf_false;
}
/*
 * Decide whether a full crawl is required.
 *
 * A full crawl is always required when priv->esh_granular is off or when
 * @type is not AFR_ENTRY_TRANSACTION. Otherwise it is required if the
 * full-heal marker (looked up at the index of AFR_DATA_TRANSACTION) is
 * present in the xdata of the @source reply, when there is a source, or in
 * the xdata of any healed sink. The sinks are examined whether or not a
 * source exists.
 */
static gf_boolean_t
afr_need_full_heal(xlator_t *this, struct afr_reply *replies, int source,
                   unsigned char *healed_sinks, afr_transaction_type type)
{
    afr_private_t *priv = this->private;
    int marker_idx = 0;
    int child = 0;

    if (!priv->esh_granular || type != AFR_ENTRY_TRANSACTION)
        return _gf_true;

    marker_idx = afr_index_for_transaction_type(AFR_DATA_TRANSACTION);

    if (source != -1 &&
        is_full_heal_marker_present(this, replies[source].xdata, marker_idx))
        return _gf_true;

    for (child = 0; child < priv->child_count; child++) {
        if (healed_sinks[child] &&
            is_full_heal_marker_present(this, replies[child].xdata,
                                        marker_idx))
            return _gf_true;
    }

    return _gf_false;
}
/*
 * Pick the source brick for an entry heal, or arrange for a conservative
 * merge.
 *
 * No source (-1) is reported, after clearing @sources and calling
 * afr_mark_active_sinks(), when every locked brick is a healed sink, when
 * there is no source at all, or when afr_does_witness_exist() says so.
 *
 * Otherwise the source comes from afr_choose_source_by_policy(). If a source
 * was chosen but there are no healed sinks, every locked brick is turned
 * into a sink and -1 is returned so that a conservative merge is triggered.
 *
 * Returns the chosen source index, or -1.
 */
static int
__afr_selfheal_entry_finalize_source(xlator_t *this, unsigned char *sources,
                                     unsigned char *healed_sinks,
                                     unsigned char *locked_on,
                                     struct afr_reply *replies,
                                     uint64_t *witness)
{
    afr_private_t *priv = this->private;
    int nsources = AFR_COUNT(sources, priv->child_count);
    int chosen = -1;
    int child = 0;

    if ((AFR_CMP(locked_on, healed_sinks, priv->child_count) == 0) ||
        !nsources || afr_does_witness_exist(this, witness)) {
        memset(sources, 0, sizeof(*sources) * priv->child_count);
        afr_mark_active_sinks(this, sources, locked_on, healed_sinks);
        return -1;
    }

    chosen = afr_choose_source_by_policy(priv, sources, AFR_ENTRY_TRANSACTION);
    if (chosen == -1 || AFR_COUNT(healed_sinks, priv->child_count) != 0)
        return chosen;

    /* The selected source blames no other brick: mark everything that is
     * locked as a sink to trigger a conservative merge.
     */
    for (child = 0; child < priv->child_count; child++) {
        if (!locked_on[child])
            continue;
        sources[child] = 0;
        healed_sinks[child] = 1;
    }

    return -1;
}
/*
 * Gather what an entry self-heal of @inode needs: replies from an unlocked
 * discover, the heal direction (@sources / @sinks) and the chosen source.
 *
 * @healed_sinks is initialised optimistically to the intersection of @sinks
 * and @locked_on (bricks that need healing and are up); it is narrowed later
 * as failures are encountered during the heal.
 *
 * *@source_p receives the source index, or a negative value when no source
 * could be picked (typically split-brain), in which case the caller performs
 * a conservative merge instead of erroring out.
 *
 * Returns non-zero only if discover or direction finding failed; in that
 * case *@source_p is left untouched.
 */
int
__afr_selfheal_entry_prepare(call_frame_t *frame, xlator_t *this,
                             inode_t *inode, unsigned char *locked_on,
                             unsigned char *sources, unsigned char *sinks,
                             unsigned char *healed_sinks,
                             struct afr_reply *replies, int *source_p,
                             unsigned char *pflag)
{
    afr_private_t *priv = this->private;
    uint64_t *witness = NULL;
    int status = -1;

    status = afr_selfheal_unlocked_discover(frame, inode, inode->gfid, replies);
    if (status)
        return status;

    witness = alloca0(sizeof(*witness) * priv->child_count);

    status = afr_selfheal_find_direction(frame, this, replies,
                                         AFR_ENTRY_TRANSACTION, locked_on,
                                         sources, sinks, witness, pflag);
    if (status)
        return status;

    AFR_INTERSECT(healed_sinks, sinks, locked_on, priv->child_count);

    *source_p = __afr_selfheal_entry_finalize_source(
        this, sources, healed_sinks, locked_on, replies, witness);

    return status;
}
/*
 * Heal the single entry @name of the directory open on @fd.
 *
 * The work is done while holding an entrylk on fd->inode in the this->name
 * domain; the lock must be granted on at least priv->child_count bricks,
 * otherwise the entry is skipped with -ENOTCONN.
 *
 * Under the lock the function:
 *   1. determines the heal direction of the parent directory
 *      (__afr_selfheal_entry_prepare, replies kept in par_replies);
 *   2. looks up @name on the locked bricks with GF_GFIDLESS_LOOKUP set;
 *   3. heals the entry through __afr_selfheal_entry_dirent();
 *   4. on success, when granular entry heal is enabled and
 *      @parent_idx_inode is non-NULL, purges the name index from @subvol.
 *
 * @full_crawl is not referenced in the body.
 *
 * Returns 0 on success, -ENOMEM / -ENOTCONN / -1 for the local failures,
 * otherwise the result of the failing helper.
 */
static int
afr_selfheal_entry_dirent(call_frame_t *frame, xlator_t *this, fd_t *fd,
char *name, inode_t *parent_idx_inode,
xlator_t *subvol, gf_boolean_t full_crawl)
{
int ret = 0;
int source = -1;
unsigned char *locked_on = NULL;
unsigned char *sources = NULL;
unsigned char *sinks = NULL;
unsigned char *healed_sinks = NULL;
inode_t *inode = NULL;
struct afr_reply *replies = NULL;
struct afr_reply *par_replies = NULL;
afr_private_t *priv = NULL;
dict_t *xattr = NULL;
priv = this->private;
xattr = dict_new();
if (!xattr)
return -ENOMEM;
/* Passed as xdata to the per-brick lookup of @name below. */
ret = dict_set_int32_sizen(xattr, GF_GFIDLESS_LOOKUP, 1);
if (ret) {
dict_unref(xattr);
return -1;
}
/* All per-child arrays are zero-filled and sized by priv->child_count. */
sources = alloca0(priv->child_count);
sinks = alloca0(priv->child_count);
healed_sinks = alloca0(priv->child_count);
locked_on = alloca0(priv->child_count);
replies = alloca0(priv->child_count * sizeof(*replies));
par_replies = alloca0(priv->child_count * sizeof(*par_replies));
/* ret is compared against child_count below, i.e. it is used as the number
 * of bricks on which the lock was obtained.
 */
ret = afr_selfheal_entrylk(frame, this, fd->inode, this->name, NULL,
locked_on);
{
if (ret < priv->child_count) {
gf_msg_debug(this->name, 0,
"%s: Skipping "
"entry self-heal as only %d sub-volumes "
" could be locked in %s domain",
uuid_utoa(fd->inode->gfid), ret, this->name);
ret = -ENOTCONN;
goto unlock;
}
/* Direction of the parent directory; fills sources/sinks/healed_sinks
 * and source.
 */
ret = __afr_selfheal_entry_prepare(frame, this, fd->inode, locked_on,
sources, sinks, healed_sinks,
par_replies, &source, NULL);
if (ret < 0)
goto unlock;
/* Per-brick lookup of the entry itself; results land in replies. */
inode = afr_selfheal_unlocked_lookup_on(frame, fd->inode, name, replies,
locked_on, xattr);
if (!inode) {
ret = -ENOMEM;
goto unlock;
}
ret = __afr_selfheal_entry_dirent(frame, this, fd, name, inode, source,
sources, healed_sinks, locked_on,
replies);
if ((ret == 0) && (priv->esh_granular) && parent_idx_inode) {
ret = afr_shd_index_purge(subvol, parent_idx_inode, name,
inode->ia_type);
/* Why is ret force-set to 0? We do not care about
 * index purge failing for full heal as it is quite
 * possible during replace-brick that not all files
 * and directories have their name indices present in
 * entry-changes/.
 */
ret = 0;
}
}
unlock:
/* Common exit: release the lock and everything acquired above. */
afr_selfheal_unentrylk(frame, this, fd->inode, this->name, NULL, locked_on,
NULL);
if (inode)
inode_unref(inode);
if (replies)
afr_replies_wipe(replies, priv->child_count);
if (par_replies)
afr_replies_wipe(par_replies, priv->child_count);
if (xattr)
dict_unref(xattr);
return ret;
}
/*
 * Resolve, on @subvol, the inode of the index directory named after
 * @pargfid that lives under the directory whose gfid is published in the
 * root's GF_XATTROP_ENTRY_CHANGES_GFID xattr.
 *
 * Steps: getxattr on the root to obtain the index gfid, then a lookup of
 * uuid_utoa(@pargfid) under that gfid, then inode_link() of the result.
 *
 * Returns the linked inode, or NULL on failure. On the failure paths handled
 * here errno is set to a non-zero value (callers derive their return value
 * from -errno).
 */
static inode_t *
afr_shd_entry_changes_index_inode(xlator_t *this, xlator_t *subvol,
                                  uuid_t pargfid)
{
    int ret = -1;
    void *index_gfid = NULL;
    loc_t rootloc = {
        0,
    };
    loc_t loc = {
        0,
    };
    dict_t *xattr = NULL;
    inode_t *inode = NULL;
    struct iatt iatt = {
        0,
    };

    rootloc.inode = inode_ref(this->itable->root);
    gf_uuid_copy(rootloc.gfid, rootloc.inode->gfid);

    ret = syncop_getxattr(subvol, &rootloc, &xattr,
                          GF_XATTROP_ENTRY_CHANGES_GFID, NULL, NULL);
    if (ret || !xattr) {
        /* A zero return without a dict is still a failure; never leave
         * errno at 0 or the caller would mistake it for success.
         */
        errno = ret ? -ret : EINVAL;
        goto out;
    }

    ret = dict_get_ptr(xattr, GF_XATTROP_ENTRY_CHANGES_GFID, &index_gfid);
    if (ret) {
        errno = EINVAL;
        goto out;
    }

    loc.inode = inode_new(this->itable);
    if (!loc.inode) {
        errno = ENOMEM;
        goto out;
    }

    /* Address the index as <index_gfid>/<pargfid-as-string>. */
    gf_uuid_copy(loc.pargfid, index_gfid);
    loc.name = gf_strdup(uuid_utoa(pargfid));
    if (!loc.name) {
        errno = ENOMEM;
        goto out;
    }

    ret = syncop_lookup(subvol, &loc, &iatt, NULL, NULL, NULL);
    if (ret < 0) {
        errno = -ret;
        goto out;
    }

    inode = inode_link(loc.inode, NULL, NULL, &iatt);

out:
    if (xattr)
        dict_unref(xattr);
    loc_wipe(&rootloc);
    /* loc.name was duplicated above, so it is released here explicitly. */
    GF_FREE((char *)loc.name);
    loc_wipe(&loc);
    return inode;
}
/*
 * Full crawl: read every entry of the directory open on @fd from brick
 * @child and heal each of them with afr_selfheal_entry_dirent().
 *
 * A copy of @frame is used for the per-entry heals and is reset after each
 * entry. "." and ".." are skipped, as is GF_REPLICATE_TRASH_DIR when the
 * directory is the root.
 *
 * A per-entry result of -1 (gfid or type mismatch) does not stop the crawl;
 * it is remembered and turned into a final return value of -1. Any other
 * non-zero per-entry result stops the crawl.
 *
 * Returns 0 on success, -ENOMEM if the frame copy failed, -1 if a mismatch
 * was seen, otherwise the value that stopped the crawl.
 */
static int
afr_selfheal_entry_do_subvol(call_frame_t *frame, xlator_t *this, fd_t *fd,
int child)
{
int ret = 0;
gf_dirent_t entries;
gf_dirent_t *entry = NULL;
off_t offset = 0;
call_frame_t *iter_frame = NULL;
xlator_t *subvol = NULL;
afr_private_t *priv = NULL;
gf_boolean_t mismatch = _gf_false;
afr_local_t *local = NULL;
loc_t loc = {
0,
};
priv = this->private;
subvol = priv->children[child];
INIT_LIST_HEAD(&entries.list);
local = frame->local;
iter_frame = afr_copy_frame(frame);
if (!iter_frame)
return -ENOMEM;
/* The result is not checked here: a NULL index inode is handed on as is,
 * and afr_selfheal_entry_dirent() only purges the index when it is
 * non-NULL.
 */
loc.inode = afr_shd_entry_changes_index_inode(this, subvol,
fd->inode->gfid);
/* Read the directory in batches (size argument 131072), resuming from the
 * d_off of the last entry seen. The loop ends when readdir returns 0.
 */
while ((ret = syncop_readdir(subvol, fd, 131072, offset, &entries, NULL,
NULL))) {
if (ret > 0)
ret = 0;
list_for_each_entry(entry, &entries.list, list)
{
offset = entry->d_off;
if (!strcmp(entry->d_name, ".") || !strcmp(entry->d_name, ".."))
continue;
if (__is_root_gfid(fd->inode->gfid) &&
!strcmp(entry->d_name, GF_REPLICATE_TRASH_DIR))
continue;
ret = afr_selfheal_entry_dirent(iter_frame, this, fd, entry->d_name,
loc.inode, subvol,
local->need_full_crawl);
AFR_STACK_RESET(iter_frame);
/* After the reset the frame must have a local again; without one the
 * crawl cannot continue.
 */
if (iter_frame->local == NULL) {
ret = -ENOTCONN;
break;
}
if (ret == -1) {
/* gfid or type mismatch. */
mismatch = _gf_true;
ret = 0;
}
if (ret)
break;
}
gf_dirent_free(&entries);
/* A negative readdir result, or an error from the inner loop, ends the
 * crawl.
 */
if (ret)
break;
}
loc_wipe(&loc);
AFR_STACK_DESTROY(iter_frame);
if (mismatch == _gf_true)
/* undo pending will be skipped */
ret = -1;
return ret;
}
/*
 * Per-entry callback handed to syncop_dir_scan() while walking the
 * entry-changes index of a directory: heal the directory entry whose name
 * index is @entry.
 *
 * @subvol is the brick whose index is being walked, @parent the loc of the
 * index directory and @data an afr_granular_esh_args_t.
 *
 * Return value:
 *   0        - entry healed, stale index removed, or a gfid/type mismatch
 *              was found (recorded in args->mismatch; the walk goes on,
 *              matching what afr_selfheal_entry_do_subvol() does);
 *   non-zero - any other failure, including the heal frame losing its
 *              local; returned to the caller so the failure is not
 *              silently dropped.
 */
static int
afr_selfheal_entry_granular_dirent(xlator_t *subvol, gf_dirent_t *entry,
                                   loc_t *parent, void *data)
{
    int ret = 0;
    loc_t loc = {
        0,
    };
    struct iatt iatt = {
        0,
    };
    afr_granular_esh_args_t *args = data;

    /* Look up the actual inode associated with entry. If the lookup returns
     * ESTALE or ENOENT, then it means we have a stale index. Remove it.
     * This is analogous to the check in afr_shd_index_heal() except that
     * here it is achieved through LOOKUP and in afr_shd_index_heal() through
     * a GETXATTR.
     */
    loc.inode = inode_new(args->xl->itable);
    if (!loc.inode) {
        ret = -ENOMEM;
        goto out;
    }
    loc.parent = inode_ref(args->heal_fd->inode);
    gf_uuid_copy(loc.pargfid, loc.parent->gfid);
    loc.name = entry->d_name;

    ret = syncop_lookup(args->xl, &loc, &iatt, NULL, NULL, NULL);
    if ((ret == -ENOENT) || (ret == -ESTALE)) {
        /* The name indices under the pgfid index dir are guaranteed
         * to be regular files. Hence the hardcoding.
         */
        afr_shd_index_purge(subvol, parent->inode, entry->d_name, IA_IFREG);
        ret = 0;
        goto out;
    }
    /* TBD: afr_shd_zero_xattrop? */

    ret = afr_selfheal_entry_dirent(args->frame, args->xl, args->heal_fd,
                                    entry->d_name, parent->inode, subvol,
                                    _gf_false);
    AFR_STACK_RESET(args->frame);
    if (args->frame->local == NULL) {
        /* The frame cannot be reused for further entries. */
        ret = -ENOTCONN;
        goto out;
    }

    if (ret == -1) {
        /* gfid or type mismatch: remember it and keep walking. */
        args->mismatch = _gf_true;
        ret = 0;
    }

out:
    loc_wipe(&loc);
    return ret;
}
/*
 * Granular entry heal: walk indices/entry-changes/<gfid of fd->inode> on
 * brick @subvol_idx and heal every entry named there through
 * afr_selfheal_entry_granular_dirent().
 *
 * @is_src tells whether the brick is the heal source. When the index inode
 * cannot be resolved, that is an error (-errno) for the source but not for
 * a sink.
 *
 * Returns 0 on success, -ENOMEM if the heal frame could not be copied, -1
 * if a gfid/type mismatch was recorded during the walk, otherwise the
 * result of syncop_dir_scan() or -errno as described above.
 */
static int
afr_selfheal_entry_granular(call_frame_t *frame, xlator_t *this, fd_t *fd,
                            int subvol_idx, gf_boolean_t is_src)
{
    int ret = 0;
    loc_t loc = {
        0,
    };
    xlator_t *subvol = NULL;
    afr_private_t *priv = NULL;
    afr_granular_esh_args_t args = {
        0,
    };

    priv = this->private;
    subvol = priv->children[subvol_idx];

    args.frame = afr_copy_frame(frame);
    if (!args.frame) {
        /* Nothing was healed; do not report success. */
        ret = -ENOMEM;
        goto out;
    }
    args.xl = this;
    /* args.heal_fd represents the fd associated with the original directory
     * on which entry heal is being attempted.
     */
    args.heal_fd = fd;

    /* @subvol here represents the subvolume of AFR where
     * indices/entry-changes/<pargfid> will be processed
     */
    loc.inode = afr_shd_entry_changes_index_inode(this, subvol,
                                                  fd->inode->gfid);
    if (!loc.inode) {
        /* If granular heal failed on the sink (as it might sometimes
         * because it is the src that would mostly contain the granular
         * changelogs and the sink's entry-changes would be empty),
         * do not treat heal as failure.
         */
        if (is_src)
            ret = -errno;
        else
            ret = 0;
        goto out;
    }

    ret = syncop_dir_scan(subvol, &loc, GF_CLIENT_PID_SELF_HEALD, &args,
                          afr_selfheal_entry_granular_dirent);

    loc_wipe(&loc);

    if (args.mismatch == _gf_true)
        ret = -1;

out:
    if (args.frame)
        AFR_STACK_DESTROY(args.frame);
    return ret;
}
/*
 * Run the entry heal of the directory open on @fd.
 *
 * Expunge phase: each healed sink is processed, either by a full crawl or,
 * when local->need_full_crawl is unset, by a granular heal with is_src set
 * to _gf_false. Impunge phase: if the first phase did not fail and a
 * @source exists, the source brick is processed the same way (granular heal
 * with is_src set to _gf_true).
 *
 * A result of -1 from a sink denotes a gfid or type mismatch; it does not
 * stop the remaining sinks but forces the final return value to -1, which
 * makes the caller skip undo-pending.
 *
 * Returns 0 on success, -1 on mismatch, otherwise the first other failure.
 */
static int
afr_selfheal_entry_do(call_frame_t *frame, xlator_t *this, fd_t *fd, int source,
                      unsigned char *sources, unsigned char *healed_sinks)
{
    afr_private_t *priv = this->private;
    afr_local_t *local = frame->local;
    gf_boolean_t saw_mismatch = _gf_false;
    int status = 0;
    int child = 0;

    gf_msg(this->name, GF_LOG_INFO, 0, AFR_MSG_SELF_HEAL_INFO,
           "performing entry selfheal on %s", uuid_utoa(fd->inode->gfid));

    /* Expunge */
    for (child = 0; child < priv->child_count; child++) {
        if (!healed_sinks[child])
            continue;

        if (local->need_full_crawl) {
            status = afr_selfheal_entry_do_subvol(frame, this, fd, child);
        } else {
            /* Why run the granular heal on a "healed sink" when it is the
             * source that holds the granular indices? If the index for this
             * directory is missing or empty on this brick (a clear sink),
             * the call returns early without failure. If the index is
             * non-empty and the brick is still a healed sink, that is due
             * to a split-brain, in which case the
             * indices/entry-changes/pargfid directory must be crawled
             * anyway.
             */
            status = afr_selfheal_entry_granular(frame, this, fd, child,
                                                 _gf_false);
        }

        if (status == -1) {
            /* gfid or type mismatch. */
            saw_mismatch = _gf_true;
            status = 0;
            continue;
        }

        if (status)
            break;
    }

    /* Impunge */
    if (status == 0 && source != -1) {
        if (local->need_full_crawl)
            status = afr_selfheal_entry_do_subvol(frame, this, fd, source);
        else
            status = afr_selfheal_entry_granular(frame, this, fd, source,
                                                 _gf_true);
    }

    /* undo pending will be skipped */
    if (saw_mismatch)
        status = -1;

    return status;
}
/*
 * Entry self-heal of the directory open on @fd, in three stages:
 *
 *   1. Under an entrylk in the this->name domain (held in data_lock):
 *      work out sources/sinks/healed_sinks and the source brick, and decide
 *      whether a full crawl is needed. The lock is then released.
 *   2. Without that lock: afr_selfheal_entry_do() heals the entries (each
 *      entry takes its own locks).
 *   3. Under a second entrylk in the same domain (postop_lock): restore
 *      times and undo the pending markers. This stage is skipped with
 *      -ENOTCONN if the set of locked bricks differs from stage 1.
 *
 * If stage 1 finds no healed sinks, no heal is attempted and 1 is returned.
 * Otherwise the outcome is logged via afr_log_selfheal() and returned:
 * 0 on success, a negative value on failure.
 *
 * @locked_on is not referenced in the body.
 */
static int
__afr_selfheal_entry(call_frame_t *frame, xlator_t *this, fd_t *fd,
unsigned char *locked_on)
{
int ret = -1;
int source = -1;
unsigned char *sources = NULL;
unsigned char *sinks = NULL;
unsigned char *data_lock = NULL;
unsigned char *postop_lock = NULL;
unsigned char *healed_sinks = NULL;
unsigned char *undid_pending = NULL;
struct afr_reply *locked_replies = NULL;
afr_local_t *local = NULL;
afr_private_t *priv = NULL;
gf_boolean_t did_sh = _gf_true;
priv = this->private;
local = frame->local;
/* Zero-filled per-child arrays. */
sources = alloca0(priv->child_count);
sinks = alloca0(priv->child_count);
healed_sinks = alloca0(priv->child_count);
undid_pending = alloca0(priv->child_count);
data_lock = alloca0(priv->child_count);
postop_lock = alloca0(priv->child_count);
locked_replies = alloca0(sizeof(*locked_replies) * priv->child_count);
/* Stage 1: the lock must be obtained on at least child_count bricks. */
ret = afr_selfheal_entrylk(frame, this, fd->inode, this->name, NULL,
data_lock);
{
if (ret < priv->child_count) {
gf_msg_debug(this->name, 0,
"%s: Skipping "
"entry self-heal as only %d sub-volumes could "
"be locked in %s domain",
uuid_utoa(fd->inode->gfid), ret, this->name);
ret = -ENOTCONN;
goto unlock;
}
ret = __afr_selfheal_entry_prepare(frame, this, fd->inode, data_lock,
sources, sinks, healed_sinks,
locked_replies, &source, NULL);
/* The sink count is tested before ret; with no sinks the function ends
 * up returning 1 at "out" whatever ret holds.
 */
if (AFR_COUNT(healed_sinks, priv->child_count) == 0) {
did_sh = _gf_false;
goto unlock;
}
local->need_full_crawl = afr_need_full_heal(
this, locked_replies, source, healed_sinks, AFR_ENTRY_TRANSACTION);
}
unlock:
afr_selfheal_unentrylk(frame, this, fd->inode, this->name, NULL, data_lock,
NULL);
if (ret < 0)
goto out;
if (!did_sh)
goto out;
/* Stage 2: heal the entries; any non-zero result skips the post-op. */
ret = afr_selfheal_entry_do(frame, this, fd, source, sources, healed_sinks);
if (ret)
goto out;
/* Take entrylks in xlator domain before doing post-op (undo-pending) in
 * entry self-heal. This is to prevent a parallel name self-heal on
 * an entry under @fd->inode from reading pending xattrs while it is
 * being modified by SHD after entry sh below, given that
 * name self-heal takes locks ONLY in xlator domain and is free to read
 * pending changelog in the absence of the following locking.
 */
ret = afr_selfheal_entrylk(frame, this, fd->inode, this->name, NULL,
postop_lock);
{
/* The post-op is only safe if exactly the same bricks are locked as
 * during stage 1.
 */
if (AFR_CMP(data_lock, postop_lock, priv->child_count) != 0) {
gf_msg_debug(this->name, 0,
"%s: Skipping "
"post-op after entry self-heal as %d "
"sub-volumes, as opposed to %d, "
"could be locked in %s domain",
uuid_utoa(fd->inode->gfid), ret,
AFR_COUNT(data_lock, priv->child_count), this->name);
ret = -ENOTCONN;
goto postop_unlock;
}
afr_selfheal_restore_time(frame, this, fd->inode, source, healed_sinks,
locked_replies);
ret = afr_selfheal_undo_pending(
frame, this, fd->inode, sources, sinks, healed_sinks, undid_pending,
AFR_ENTRY_TRANSACTION, locked_replies, postop_lock);
}
postop_unlock:
afr_selfheal_unentrylk(frame, this, fd->inode, this->name, NULL,
postop_lock, NULL);
out:
/* Log only when a heal was attempted; otherwise report 1. */
if (did_sh)
afr_log_selfheal(fd->inode->gfid, this, ret, "entry", source, sources,
healed_sinks);
else
ret = 1;
if (locked_replies)
afr_replies_wipe(locked_replies, priv->child_count);
return ret;
}
/*
 * Create an fd for @inode and open it as a directory through @this.
 *
 * The loc used for the opendir addresses the inode by gfid only. On success
 * the fd is bound with fd_bind() and returned; on failure the fd is
 * unreferenced and NULL is returned.
 */
static fd_t *
afr_selfheal_data_opendir(xlator_t *this, inode_t *inode)
{
    fd_t *dirfd = fd_create(inode, 0);
    loc_t dirloc = {
        0,
    };

    if (dirfd == NULL)
        return NULL;

    dirloc.inode = inode_ref(inode);
    gf_uuid_copy(dirloc.gfid, inode->gfid);

    if (syncop_opendir(this, &dirloc, dirfd, NULL, NULL) == 0) {
        fd_bind(dirfd);
    } else {
        fd_unref(dirfd);
        dirfd = NULL;
    }

    loc_wipe(&dirloc);
    return dirfd;
}
/*
 * Entry point of entry self-heal for directory @inode.
 *
 * Opens the directory, takes the tie-breaker entrylk in priv->sh_domain and,
 * provided the lock was obtained on at least priv->child_count bricks, runs
 * __afr_selfheal_entry(). The lock and the fd are released in every case.
 *
 * Returns -EIO if the directory could not be opened, -ENOTCONN if too few
 * bricks could be locked, otherwise the result of __afr_selfheal_entry().
 */
int
afr_selfheal_entry(call_frame_t *frame, xlator_t *this, inode_t *inode)
{
    afr_private_t *priv = this->private;
    unsigned char *locked_on = NULL;
    fd_t *dirfd = NULL;
    int status = 0;

    dirfd = afr_selfheal_data_opendir(this, inode);
    if (dirfd == NULL)
        return -EIO;

    locked_on = alloca0(priv->child_count);

    status = afr_selfheal_tie_breaker_entrylk(frame, this, inode,
                                              priv->sh_domain, NULL, locked_on);
    if (status >= priv->child_count) {
        status = __afr_selfheal_entry(frame, this, dirfd, locked_on);
    } else {
        gf_msg_debug(this->name, 0,
                     "%s: Skipping "
                     "entry self-heal as only %d sub-volumes could "
                     "be locked in %s domain",
                     uuid_utoa(dirfd->inode->gfid), status, priv->sh_domain);
        /* Either less than two subvols available, or another selfheal
         * (from another server) is in progress. Skip for now; in any case
         * there isn't anything to do.
         */
        status = -ENOTCONN;
    }

    afr_selfheal_unentrylk(frame, this, inode, priv->sh_domain, NULL, locked_on,
                           NULL);
    fd_unref(dirfd);

    return status;
}