/*
Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com>
This file is part of GlusterFS.
This file is licensed to you under your choice of the GNU Lesser
General Public License, version 3 or any later version (LGPLv3 or
later), or the GNU General Public License, version 2 (GPLv2), in all
cases as published by the Free Software Foundation.
*/
#include <glusterfs/glusterfs.h>
#include <glusterfs/xlator.h>
#include "dht-common.h"
#include <glusterfs/byte-order.h>
#include "dht-messages.h"
#include "unittest/unittest.h"
/* Size in bytes of the fixed part of a dht_layout_t, as reported by sizeof. */
#define layout_base_size (sizeof(dht_layout_t))
/* Size in bytes of a single element of the layout's list[] array. */
#define layout_entry_size (sizeof(((dht_layout_t *)NULL)->list[0]))
/* Bytes needed for a layout that carries cnt list[] entries. The argument is
 * parenthesised so that an expression such as layout_size(a + b) multiplies
 * the whole count by the entry size. */
#define layout_size(cnt) (layout_base_size + ((cnt) * layout_entry_size))
/*
 * Allocate a layout with room for cnt list[] entries.
 *
 * The returned layout has type DHT_HASH_TYPE_DM, its cnt field set to cnt and
 * a reference count of 1. When this->private is non-NULL, spread_cnt and gen
 * are copied from that configuration.
 *
 * Returns the new layout, or NULL when the allocation fails.
 */
dht_layout_t *
dht_layout_new(xlator_t *this, int cnt)
{
    dht_layout_t *layout = NULL;
    dht_conf_t *priv = NULL;

    REQUIRE(NULL != this);
    REQUIRE(cnt >= 0);

    priv = this->private;

    layout = GF_CALLOC(1, layout_size(cnt), gf_dht_mt_dht_layout_t);
    if (layout == NULL)
        return NULL;

    layout->type = DHT_HASH_TYPE_DM;
    layout->cnt = cnt;

    /* Configuration-derived fields are only filled in when a conf exists. */
    if (priv != NULL) {
        layout->spread_cnt = priv->dir_spread_cnt;
        layout->gen = priv->gen;
    }

    GF_ATOMIC_INIT(layout->ref, 1);

    ENSURE(NULL != layout);
    ENSURE(layout->type == DHT_HASH_TYPE_DM);
    ENSURE(layout->cnt == cnt);
    ENSURE(GF_ATOMIC_GET(layout->ref) == 1);

    return layout;
}
/*
 * Fetch the layout recorded for this xlator in the inode's context.
 *
 * When the lookup reports success (returns 0) and produced a non-NULL
 * layout, that layout's reference count is incremented before returning.
 *
 * Returns the pointer left behind by dht_inode_ctx_layout_get(), which
 * starts out as NULL.
 */
dht_layout_t *
dht_layout_get(xlator_t *this, inode_t *inode)
{
    dht_layout_t *found = NULL;

    if (dht_inode_ctx_layout_get(inode, this, &found) != 0)
        return found;

    if (found != NULL)
        GF_ATOMIC_INC(found->ref);

    return found;
}
/*
 * Store layout in the inode's context for this xlator.
 *
 * Under conf->layout_lock the currently stored layout is looked up, a
 * reference is taken on the new layout, and the new layout is stored. After
 * the lock is released, the previously stored layout is unreferenced if the
 * lookup of it succeeded, and the reference taken on the new layout is given
 * back if storing it failed.
 *
 * Returns -1 when this->private or layout is NULL; otherwise the return
 * value of dht_inode_ctx_layout_set().
 */
int
dht_layout_set(xlator_t *this, inode_t *inode, dht_layout_t *layout)
{
    dht_conf_t *conf = NULL;
    /* Initialised so the pointer is never indeterminate, even if the lookup
     * helper leaves it untouched. */
    dht_layout_t *old_layout = NULL;
    int oldret = -1;
    int ret = -1;

    conf = this->private;
    if (!conf || !layout)
        goto out;

    LOCK(&conf->layout_lock);
    {
        oldret = dht_inode_ctx_layout_get(inode, this, &old_layout);
        /* layout is known to be non-NULL here (checked above). */
        GF_ATOMIC_INC(layout->ref);
        ret = dht_inode_ctx_layout_set(inode, this, layout);
    }
    UNLOCK(&conf->layout_lock);

    if (!oldret) {
        dht_layout_unref(this, old_layout);
    }

    /* Storing failed: undo the reference taken inside the lock. */
    if (ret)
        GF_ATOMIC_DEC(layout->ref);

out:
    return ret;
}
/*
 * Drop one reference on layout and free it when the count reaches zero.
 *
 * Nothing happens when layout is NULL, when layout->preset is set, or when
 * this->private is NULL.
 */
void
dht_layout_unref(xlator_t *this, dht_layout_t *layout)
{
    int remaining = 0;

    if (layout == NULL)
        return;
    if (layout->preset)
        return;
    if (this->private == NULL)
        return;

    remaining = GF_ATOMIC_DEC(layout->ref);
    if (remaining == 0)
        GF_FREE(layout);
}
/*
 * Take one reference on layout and return it.
 *
 * The reference count is left untouched when layout is NULL, when
 * layout->preset is set, or when this->private is NULL; the same conditions
 * under which dht_layout_unref() does nothing.
 *
 * Returns the layout pointer that was passed in (possibly NULL).
 */
dht_layout_t *
dht_layout_ref(xlator_t *this, dht_layout_t *layout)
{
    /* Guard against NULL so ref and unref accept the same inputs. */
    if (!layout || layout->preset || !this->private)
        return layout;

    GF_ATOMIC_INC(layout->ref);

    return layout;
}
/*
 * Find the subvolume whose layout range contains the hash of name.
 *
 * The hash is computed with dht_hash_compute() using layout->type. The first
 * entry whose [start, stop] range (inclusive) contains the hash supplies the
 * result. A warning is logged when hashing fails or when no subvolume is
 * found.
 *
 * Returns the matching entry's xlator, or NULL.
 */
xlator_t *
dht_layout_search(xlator_t *this, dht_layout_t *layout, const char *name)
{
    uint32_t name_hash = 0;
    xlator_t *hashed = NULL;
    int idx = 0;

    if (dht_hash_compute(this, layout->type, name, &name_hash) != 0) {
        gf_msg(this->name, GF_LOG_WARNING, 0, DHT_MSG_COMPUTE_HASH_FAILED,
               "hash computation failed for type=%d name=%s", layout->type,
               name);
        return NULL;
    }

    for (idx = 0; idx < layout->cnt; idx++) {
        if (name_hash >= layout->list[idx].start &&
            name_hash <= layout->list[idx].stop) {
            hashed = layout->list[idx].xlator;
            break;
        }
    }

    if (hashed == NULL) {
        gf_msg(this->name, GF_LOG_WARNING, 0, DHT_MSG_HASHED_SUBVOL_GET_FAILED,
               "no subvolume for hash (value) = %u", name_hash);
    }

    return hashed;
}
/*
 * Return the entry of conf->file_layouts that sits at the same index as
 * subvol in conf->subvolumes.
 *
 * Returns NULL when this->private is NULL or subvol is not among the
 * configured subvolumes. No reference is taken on the returned layout.
 */
dht_layout_t *
dht_layout_for_subvol(xlator_t *this, xlator_t *subvol)
{
    dht_conf_t *priv = this->private;
    int idx = 0;

    if (priv == NULL)
        return NULL;

    for (idx = 0; idx < priv->subvolume_cnt; idx++) {
        if (priv->subvolumes[idx] == subvol)
            return priv->file_layouts[idx];
    }

    return NULL;
}
/*
 * Build conf->file_layouts: one single-entry layout per subvolume.
 *
 * Each layout is created with dht_layout_new(this, 1), marked preset, and
 * has list[0].xlator pointing at the corresponding subvolume.
 *
 * Returns 0 on success and -1 when conf is NULL or an allocation fails. On
 * failure nothing allocated so far is released here; conf->file_layouts and
 * any layouts already stored in it are left in place.
 */
int
dht_layouts_init(xlator_t *this, dht_conf_t *conf)
{
    dht_layout_t *entry = NULL;
    int idx = 0;

    if (conf == NULL)
        return -1;

    conf->file_layouts = GF_CALLOC(conf->subvolume_cnt, sizeof(dht_layout_t *),
                                   gf_dht_mt_dht_layout_t);
    if (conf->file_layouts == NULL)
        return -1;

    while (idx < conf->subvolume_cnt) {
        entry = dht_layout_new(this, 1);
        if (entry == NULL)
            return -1;

        entry->preset = 1;
        entry->list[0].xlator = conf->subvolumes[idx];
        conf->file_layouts[idx] = entry;

        idx++;
    }

    return 0;
}
/*
 * Serialise layout->list[pos] into a newly allocated array of 32-bit words,
 * each converted with hton32():
 *   [0] commit_hash, [1] layout type, [2] start, [3] stop.
 *
 * When disk_layout_p is non-NULL the array is handed to the caller through
 * it; otherwise the array is freed again before returning.
 *
 * Returns 0 on success, -1 when the allocation fails.
 */
int
dht_disk_layout_extract(xlator_t *this, dht_layout_t *layout, int pos,
                        int32_t **disk_layout_p)
{
    int32_t *buf = GF_CALLOC(5, sizeof(int), gf_dht_mt_int32_t);

    if (buf == NULL)
        return -1;

    buf[0] = hton32(layout->list[pos].commit_hash);
    buf[1] = hton32(layout->type);
    buf[2] = hton32(layout->list[pos].start);
    buf[3] = hton32(layout->list[pos].stop);

    if (disk_layout_p == NULL) {
        /* Nobody wants the result; release it. */
        GF_FREE(buf);
        return 0;
    }

    *disk_layout_p = buf;
    return 0;
}
/*
 * Locate the first layout entry whose xlator is subvol and serialise it with
 * dht_disk_layout_extract().
 *
 * Returns -1 when the scan reaches layout->cnt without finding subvol;
 * otherwise the return value of dht_disk_layout_extract().
 */
int
dht_disk_layout_extract_for_subvol(xlator_t *this, dht_layout_t *layout,
                                   xlator_t *subvol, int32_t **disk_layout_p)
{
    int idx = 0;

    while (idx < layout->cnt && layout->list[idx].xlator != subvol)
        idx++;

    if (idx == layout->cnt)
        return -1;

    return dht_disk_layout_extract(this, layout, idx, disk_layout_p);
}
/*
 * Decode a serialised layout (four 32-bit words, converted with ntoh32():
 * commit_hash, type, start, stop) and store it into layout->list[pos].
 *
 * layout->type is updated only when the decoded type is
 * DHT_HASH_TYPE_DM_USER. Any type other than DHT_HASH_TYPE_DM or
 * DHT_HASH_TYPE_DM_USER is rejected.
 *
 * Returns 0 on success; -1 when disk_layout_raw is NULL, when
 * disk_layout_len is not exactly the size of the four-word buffer, or when
 * the decoded type is unknown.
 */
int
dht_disk_layout_merge(xlator_t *this, dht_layout_t *layout, int pos,
                      void *disk_layout_raw, int disk_layout_len)
{
    int type = 0;
    /* Unsigned so that the "%u" conversions in the trace message match. */
    uint32_t start_off = 0;
    uint32_t stop_off = 0;
    int commit_hash = 0;
    int disk_layout[4];

    if (!disk_layout_raw) {
        gf_msg(this->name, GF_LOG_CRITICAL, 0, DHT_MSG_LAYOUT_MERGE_FAILED,
               "error no layout on disk for merge");
        return -1;
    }

    GF_ASSERT(disk_layout_len == sizeof(disk_layout));

    /* Enforce the length at run time as well: copying a longer value would
     * overflow the stack buffer, and a shorter one would leave part of it
     * uninitialised. */
    if (disk_layout_len != sizeof(disk_layout)) {
        gf_msg(this->name, GF_LOG_CRITICAL, 0, DHT_MSG_INVALID_DISK_LAYOUT,
               "Invalid disk layout: unexpected length %d (expected %d)",
               disk_layout_len, (int)sizeof(disk_layout));
        return -1;
    }

    memcpy(disk_layout, disk_layout_raw, sizeof(disk_layout));

    type = ntoh32(disk_layout[1]);
    switch (type) {
        case DHT_HASH_TYPE_DM_USER:
            gf_msg_debug(this->name, 0, "found user-set layout");
            layout->type = type;
            /* Fall through. */
        case DHT_HASH_TYPE_DM:
            break;
        default:
            gf_msg(this->name, GF_LOG_CRITICAL, 0, DHT_MSG_INVALID_DISK_LAYOUT,
                   "Invalid disk layout: "
                   "Catastrophic error layout with unknown type found %d",
                   disk_layout[1]);
            return -1;
    }

    commit_hash = ntoh32(disk_layout[0]);
    start_off = ntoh32(disk_layout[2]);
    stop_off = ntoh32(disk_layout[3]);

    layout->list[pos].commit_hash = commit_hash;
    layout->list[pos].start = start_off;
    layout->list[pos].stop = stop_off;

    /* Argument order follows the format string: type first, then hash. */
    gf_msg_trace(
        this->name, 0, "merged to layout: %u - %u (type %d, hash %d) from %s",
        start_off, stop_off, type, commit_hash, layout->list[pos].xlator->name);

    return 0;
}
/*
 * Record one subvolume's reply in the first unused slot of layout.
 *
 * The first entry whose xlator is NULL is claimed for subvol and its err is
 * set to op_errno when op_ret is non-zero, or to -1 otherwise. For a failed
 * reply (op_ret != 0) nothing more is done. For a successful reply the
 * on-disk layout is read from xattr under conf->xattr_name and merged into
 * the slot with dht_disk_layout_merge(); if it is absent (or xattr is NULL)
 * the slot's err is simply cleared.
 *
 * layout->commit_hash takes the slot's commit hash while it is still 0 and
 * becomes DHT_LAYOUT_HASH_INVALID once a differing hash is merged.
 *
 * Returns 0 on success (including failed replies and missing disk layouts);
 * -1 when layout is NULL, when a successful reply finds no unused slot, or
 * when dht_disk_layout_merge() fails.
 */
int
dht_layout_merge(xlator_t *this, dht_layout_t *layout, xlator_t *subvol,
                 int op_ret, int op_errno, dict_t *xattr)
{
    int i = 0;
    int ret = -1;
    int err = -1;
    void *disk_layout_raw = NULL;
    int disk_layout_len = 0;
    dht_conf_t *conf = this->private;

    if (op_ret != 0) {
        err = op_errno;
    }

    if (!layout)
        goto out;

    /* Claim the first slot that has no subvolume assigned yet. */
    for (i = 0; i < layout->cnt; i++) {
        if (layout->list[i].xlator == NULL) {
            layout->list[i].err = err;
            layout->list[i].xlator = subvol;
            break;
        }
    }

    if (op_ret != 0) {
        ret = 0;
        goto out;
    }

    /* Every slot was already taken: i indexes one past the end of list[],
     * so none of the accesses below may happen. */
    if (i >= layout->cnt) {
        gf_msg(this->name, GF_LOG_WARNING, 0, DHT_MSG_LAYOUT_MERGE_FAILED,
               "no free layout slot for subvolume %s", subvol->name);
        ret = -1;
        goto out;
    }

    if (xattr) {
        /* during lookup and not mkdir */
        ret = dict_get_ptr_and_len(xattr, conf->xattr_name, &disk_layout_raw,
                                   &disk_layout_len);
    }

    if (ret != 0) {
        layout->list[i].err = 0;
        gf_msg_trace(this->name, 0, "Missing disk layout on %s. err = %d",
                     subvol->name, err);
        ret = 0;
        goto out;
    }

    ret = dht_disk_layout_merge(this, layout, i, disk_layout_raw,
                                disk_layout_len);
    if (ret != 0) {
        gf_msg(this->name, GF_LOG_WARNING, 0, DHT_MSG_LAYOUT_MERGE_FAILED,
               "layout merge from subvolume %s failed", subvol->name);
        goto out;
    }

    if (layout->commit_hash == 0) {
        layout->commit_hash = layout->list[i].commit_hash;
    } else if (layout->commit_hash != layout->list[i].commit_hash) {
        layout->commit_hash = DHT_LAYOUT_HASH_INVALID;
    }

    layout->list[i].err = 0;

out:
    return ret;
}
/*
 * Exchange the start, stop, xlator, err and commit_hash fields of
 * layout->list[i] and layout->list[j].
 */
void
dht_layout_entry_swap(dht_layout_t *layout, int i, int j)
{
    /* Remember entry j, overwrite it with entry i, then restore into i. */
    uint32_t saved_start = layout->list[j].start;
    uint32_t saved_stop = layout->list[j].stop;
    xlator_t *saved_xlator = layout->list[j].xlator;
    int saved_err = layout->list[j].err;
    uint32_t saved_commit_hash = layout->list[j].commit_hash;

    layout->list[j].start = layout->list[i].start;
    layout->list[j].stop = layout->list[i].stop;
    layout->list[j].xlator = layout->list[i].xlator;
    layout->list[j].err = layout->list[i].err;
    layout->list[j].commit_hash = layout->list[i].commit_hash;

    layout->list[i].start = saved_start;
    layout->list[i].stop = saved_stop;
    layout->list[i].xlator = saved_xlator;
    layout->list[i].err = saved_err;
    layout->list[i].commit_hash = saved_commit_hash;
}
/*
 * Exchange only the start and stop fields of layout->list[i] and
 * layout->list[j]; all other fields of both entries stay where they are.
 */
void
dht_layout_range_swap(dht_layout_t *layout, int i, int j)
{
    uint32_t saved_start = layout->list[j].start;
    uint32_t saved_stop = layout->list[j].stop;

    layout->list[j].start = layout->list[i].start;
    layout->list[j].stop = layout->list[i].stop;

    layout->list[i].start = saved_start;
    layout->list[i].stop = saved_stop;
}
/*
 * Compare entries i and j by the name of their xlator.
 *
 * Returns the strcmp() result of the two names: negative, zero or positive.
 */
int64_t
dht_layout_entry_cmp_volname(dht_layout_t *layout, int i, int j)
{
    const char *name_i = layout->list[i].xlator->name;
    const char *name_j = layout->list[j].xlator->name;

    return strcmp(name_i, name_j);
}
/*
 * Report whether xlator takes part in layout with a non-empty range.
 *
 * Entries are matched by xlator name. Only the first entry with a matching
 * name is examined: the result is _gf_true when its start differs from its
 * stop, and _gf_false otherwise or when no entry matches.
 */
gf_boolean_t
dht_is_subvol_in_layout(dht_layout_t *layout, xlator_t *xlator)
{
    int idx = 0;

    for (idx = 0; idx < layout->cnt; idx++) {
        if (strcmp(layout->list[idx].xlator->name, xlator->name) != 0)
            continue;

        /* Name matches; the answer depends on whether the range is empty. */
        if (layout->list[idx].start != layout->list[idx].stop)
            return _gf_true;

        return _gf_false;
    }

    return _gf_false;
}
/*
 * Ordering function used by dht_layout_sort().
 *
 * When entry j has both start and stop equal to zero, the result is the
 * difference of the two stop values, which moves zeroed-out entries towards
 * the front. Otherwise the result is the difference of the start values.
 * Differences are computed in 64 bits.
 */
int64_t
dht_layout_entry_cmp(dht_layout_t *layout, int i, int j)
{
    if (layout->list[j].start == 0 && layout->list[j].stop == 0)
        return (int64_t)layout->list[i].stop - (int64_t)layout->list[j].stop;

    return (int64_t)layout->list[i].start - (int64_t)layout->list[j].start;
}
/*
 * Sort layout->list in place using dht_layout_entry_cmp().
 *
 * Every pair (outer, inner) with outer < inner is visited in order and the
 * two entries are swapped whenever the comparison is positive (O(n^2)).
 *
 * Always returns 0.
 */
int
dht_layout_sort(dht_layout_t *layout)
{
    int outer = 0;
    int inner = 0;

    while (outer < layout->cnt - 1) {
        inner = outer + 1;
        while (inner < layout->cnt) {
            if (dht_layout_entry_cmp(layout, outer, inner) > 0)
                dht_layout_entry_swap(layout, outer, inner);
            inner++;
        }
        outer++;
    }

    return 0;
}
/*
 * Sort layout->list in place by xlator name, using
 * dht_layout_entry_cmp_volname().
 *
 * Every pair (outer, inner) with outer < inner is visited in order and the
 * two entries are swapped whenever the comparison is positive (O(n^2)).
 *
 * Always returns 0.
 */
int
dht_layout_sort_volname(dht_layout_t *layout)
{
    int outer = 0;
    int inner = 0;

    while (outer < layout->cnt - 1) {
        inner = outer + 1;
        while (inner < layout->cnt) {
            if (dht_layout_entry_cmp_volname(layout, outer, inner) > 0)
                dht_layout_entry_swap(layout, outer, inner);
            inner++;
        }
        outer++;
    }

    return 0;
}
/*
 * Scan a layout and count its anomalies.
 *
 * The entries are expected to be sorted in ascending order before this is
 * called. Each output pointer may be NULL, in which case that counter is not
 * reported. The this and loc parameters are not used here.
 *
 *   holes_p     number of gaps between consecutive valid ranges, plus one if
 *               the ranges do not close the ring or no valid range exists
 *   overlaps_p  number of valid ranges starting before the previous one ended
 *   missing_p   entries with err -1, ENOENT or ESTALE
 *   down_p      entries with err ENOTCONN
 *   no_space_p  entries with err ENOSPC
 *   misc_p      entries with any other non-zero err
 */
void
dht_layout_anomalies(xlator_t *this, loc_t *loc, dht_layout_t *layout,
                     uint32_t *holes_p, uint32_t *overlaps_p,
                     uint32_t *missing_p, uint32_t *down_p, uint32_t *misc_p,
                     uint32_t *no_space_p)
{
    uint32_t missing = 0;
    uint32_t down = 0;
    uint32_t misc = 0;
    uint32_t no_space = 0;
    uint32_t hole_cnt = 0;
    uint32_t overlap_cnt = 0;
    uint32_t prev_stop = 0;
    uint32_t last_stop = 0;
    char is_virgin = 1;
    int i = 0;

    /* If the layout entry has err != 0
            then increment the corresponding anomaly.
       else
            if (start of the current layout entry > stop + 1 of previous
               non erroneous layout entry)
                    then it indicates a hole in the layout
            if (start of the current layout entry < stop + 1 of previous
               non erroneous layout entry)
                     then it indicates an overlap in the layout
    */

    /* The value just before the first entry's start is where the last valid
     * range is expected to stop. The arithmetic is unsigned and wraps, so a
     * first start of 0 yields the maximum 32-bit value. An empty layout has
     * no list[0] to read; it is reported as a single hole below because
     * is_virgin stays set. */
    if (layout->cnt > 0)
        last_stop = layout->list[0].start - 1;
    prev_stop = last_stop;

    for (i = 0; i < layout->cnt; i++) {
        switch (layout->list[i].err) {
            case -1:
            case ENOENT:
            case ESTALE:
                missing++;
                continue;
            case ENOTCONN:
                down++;
                continue;
            case ENOSPC:
                no_space++;
                continue;
            case 0:
                /* err == 0 with start == stop: the subvolume does not
                 * participate in the layout (spread-cnt), so it is skipped
                 * and not checked for anomalies. With start != stop the
                 * entry is a valid range and is checked below. */
                if (layout->list[i].start == layout->list[i].stop) {
                    continue;
                }
                break;
            default:
                misc++;
                continue;
        }

        is_virgin = 0;

        if ((prev_stop + 1) < layout->list[i].start) {
            hole_cnt++;
        }

        if ((prev_stop + 1) > layout->list[i].start) {
            overlap_cnt++;
        }

        prev_stop = layout->list[i].stop;
    }

    /* One more hole when the last valid range does not end where expected,
     * or when no valid range was seen at all. */
    if ((last_stop - prev_stop) || is_virgin)
        hole_cnt++;

    if (holes_p)
        *holes_p = hole_cnt;

    if (overlaps_p)
        *overlaps_p = overlap_cnt;

    if (missing_p)
        *missing_p = missing;

    if (down_p)
        *down_p = down;

    if (misc_p)
        *misc_p = misc;

    if (no_space_p)
        *no_space_p = no_space;
}
/*
 * Count the layout entries that indicate a missing directory: those with
 * err == ENOENT, and those with err == -1 whose start and stop are both 0.
 *
 * Returns the count, or 0 when layout is NULL.
 */
int
dht_layout_missing_dirs(dht_layout_t *layout)
{
    int idx = 0;
    int count = 0;

    if (layout == NULL)
        return 0;

    for (idx = 0; idx < layout->cnt; idx++) {
        if (layout->list[idx].err == ENOENT) {
            count++;
            continue;
        }

        if (layout->list[idx].err == -1 && layout->list[idx].start == 0 &&
            layout->list[idx].stop == 0)
            count++;
    }

    return count;
}
/*
 * Sort the layout and check it for anomalies.
 *
 * Returns -1 when sorting reports failure or when dht_layout_anomalies()
 * finds holes or overlaps. Otherwise returns the sort result plus the number
 * of missing directories reported by dht_layout_missing_dirs().
 */
int
dht_layout_normalize(xlator_t *this, loc_t *loc, dht_layout_t *layout)
{
    char gfid[GF_UUID_BUF_SIZE] = {0};
    uint32_t holes = 0;
    uint32_t overlaps = 0;
    uint32_t missing = 0;
    uint32_t down = 0;
    uint32_t misc = 0;
    uint32_t missing_dirs = 0;
    int ret = 0;

    ret = dht_layout_sort(layout);
    if (ret == -1) {
        gf_msg(this->name, GF_LOG_WARNING, 0, DHT_MSG_LAYOUT_SORT_FAILED,
               "sort failed?! how the ....");
        return ret;
    }

    gf_uuid_unparse(loc->gfid, gfid);

    dht_layout_anomalies(this, loc, layout, &holes, &overlaps, &missing, &down,
                         &misc, NULL);

    if (holes || overlaps) {
        /* Every entry missing is logged only at debug level, as a first
         * lookup of the directory. */
        if (missing == layout->cnt) {
            gf_msg_debug(this->name, 0,
                         "Directory %s looked up first time"
                         " gfid = %s",
                         loc->path, gfid);
        } else {
            gf_msg(this->name, GF_LOG_INFO, 0, DHT_MSG_ANOMALIES_INFO,
                   "Found anomalies in %s (gfid = %s). "
                   "Holes=%d overlaps=%d",
                   loc->path, gfid, holes, overlaps);
        }
        return -1;
    }

    if (ret < 0)
        return ret;

    /* TODO During DHT selfheal rewrite (almost) find a better place
     * to detect this - probably in dht_layout_anomalies()
     */
    missing_dirs = dht_layout_missing_dirs(layout);
    if (missing_dirs > 0)
        ret += missing_dirs;

    return ret;
}
/*
 * Look up key name in xattr with dict_get_ptr().
 *
 * Returns the dict_get_ptr() result unchanged; the value it fetches is
 * discarded.
 */
int
dht_dir_has_layout(dict_t *xattr, char *name)
{
    void *ignored_value = NULL;
    int rc = dict_get_ptr(xattr, name, &ignored_value);

    return rc;
}
/*
 * Compare the in-memory layout entry for subvol with the layout stored in
 * xattr under conf->xattr_name.
 *
 * Returns:
 *    1  subvol has no entry in layout, or start/stop/commit_hash differ
 *       between the in-memory entry and the on-disk value
 *    0  the values match; or xattr is NULL and the entry's err is non-zero;
 *       or the on-disk layout is absent and the entry either has a non-zero
 *       err or a stop of 0
 *   -1  xattr is NULL while the entry's err is 0; or the on-disk layout is
 *       absent while the entry's err is 0 and its stop is non-zero
 */
int
dht_layout_dir_mismatch(xlator_t *this, dht_layout_t *layout, xlator_t *subvol,
                        loc_t *loc, dict_t *xattr)
{
    dht_conf_t *priv = this->private;
    char gfid[GF_UUID_BUF_SIZE] = {0};
    int32_t on_disk[4];
    void *raw = NULL;
    uint32_t disk_start = 0;
    uint32_t disk_stop = 0;
    uint32_t disk_commit = 0;
    int entry_err = 0;
    int pos = 0;

    if (loc && loc->inode)
        gf_uuid_unparse(loc->inode->gfid, gfid);

    /* Locate the entry that belongs to subvol. */
    for (pos = 0; pos < layout->cnt; pos++) {
        if (layout->list[pos].xlator == subvol)
            break;
    }

    if (pos >= layout->cnt) {
        if (loc) {
            gf_msg_debug(this->name, 0, "%s - no layout info for subvolume %s",
                         loc ? loc->path : "path not found", subvol->name);
        }
        return 1;
    }

    entry_err = layout->list[pos].err;

    if (xattr == NULL) {
        if (entry_err != 0)
            return 0;

        if (loc) {
            gf_msg(this->name, GF_LOG_INFO, 0, DHT_MSG_DICT_GET_FAILED,
                   "%s: xattr dictionary is NULL", loc->path);
        } else {
            gf_msg(this->name, GF_LOG_INFO, 0, DHT_MSG_DICT_GET_FAILED,
                   "path not found: "
                   "xattr dictionary is NULL");
        }
        return -1;
    }

    if (dict_get_ptr(xattr, priv->xattr_name, &raw) < 0) {
        /* A missing on-disk layout only counts as an error for an entry
         * that has no error recorded and a non-zero stop. */
        if (entry_err != 0 || !layout->list[pos].stop)
            return 0;

        if (loc) {
            gf_msg(this->name, GF_LOG_INFO, 0, DHT_MSG_DISK_LAYOUT_MISSING,
                   "%s: Disk layout missing, gfid = %s", loc->path, gfid);
        } else {
            gf_msg(this->name, GF_LOG_INFO, 0, DHT_MSG_DISK_LAYOUT_MISSING,
                   "path not found: "
                   "Disk layout missing, gfid = %s",
                   gfid);
        }
        return -1;
    }

    memcpy(on_disk, raw, sizeof(on_disk));

    disk_start = ntoh32(on_disk[2]);
    disk_stop = ntoh32(on_disk[3]);
    disk_commit = ntoh32(on_disk[0]);

    if (layout->list[pos].start == disk_start &&
        layout->list[pos].stop == disk_stop &&
        layout->list[pos].commit_hash == disk_commit)
        return 0;

    gf_msg(this->name, GF_LOG_INFO, 0, DHT_MSG_LAYOUT_INFO,
           "subvol: %s; inode layout - %" PRIu32 " - %" PRIu32 " - %" PRIu32
           "; "
           "disk layout - %" PRIu32 " - %" PRIu32 " - %" PRIu32,
           layout->list[pos].xlator->name, layout->list[pos].start,
           layout->list[pos].stop, layout->list[pos].commit_hash, disk_start,
           disk_stop, disk_commit);

    return 1;
}
/*
 * Store the layout that dht_layout_for_subvol() returns for subvol in the
 * inode's context.
 *
 * The store happens under conf->layout_lock and the return value of
 * dht_inode_ctx_layout_set() is not examined.
 *
 * Returns 0 once the store has been attempted; -1 when this->private is NULL
 * or no layout exists for subvol.
 */
int
dht_layout_preset(xlator_t *this, xlator_t *subvol, inode_t *inode)
{
    dht_conf_t *priv = this->private;
    dht_layout_t *subvol_layout = NULL;

    if (priv == NULL)
        return -1;

    subvol_layout = dht_layout_for_subvol(this, subvol);
    if (subvol_layout == NULL) {
        gf_msg(this->name, GF_LOG_INFO, 0, DHT_MSG_SUBVOL_NO_LAYOUT_INFO,
               "no pre-set layout for subvolume %s",
               subvol ? subvol->name : "<nil>");
        return -1;
    }

    gf_msg_debug(this->name, 0, "file = %s, subvol = %s",
                 uuid_utoa(inode->gfid), subvol ? subvol->name : "<nil>");

    LOCK(&priv->layout_lock);
    {
        dht_inode_ctx_layout_set(inode, this, subvol_layout);
    }
    UNLOCK(&priv->layout_lock);

    return 0;
}
/*
 * Return the index of the first layout entry whose xlator is subvol, or -1
 * when there is none.
 */
int
dht_layout_index_for_subvol(dht_layout_t *layout, xlator_t *subvol)
{
    int idx = 0;

    while (idx < layout->cnt) {
        if (layout->list[idx].xlator == subvol)
            return idx;
        idx++;
    }

    return -1;
}