/*
Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com>
This file is part of GlusterFS.
This file is licensed to you under your choice of the GNU Lesser
General Public License, version 3 or any later version (LGPLv3 or
later), or the GNU General Public License, version 2 (GPLv2), in all
cases as published by the Free Software Foundation.
*/
/* TODO: add NS locking */
#include <glusterfs/statedump.h>
#include "dht-common.h"
#include "dht-messages.h"
/* Fallback MAX for builds where no previously included header defines it.
 * Each argument is evaluated twice, so avoid arguments with side effects. */
#ifndef MAX
#define MAX(a, b) (((a) > (b)) ? (a) : (b))
#endif
/*
 * Pick conf->defrag->recon_thread_count from the conf->dthrottle string
 * while holding conf->defrag->dfq_mutex:
 *   "lazy"       -> 1
 *   "normal"     -> throttle_count / 2
 *   "aggressive" -> throttle_count
 * where throttle_count is set to
 * MAX(sysconf(_SC_NPROCESSORS_ONLN) - 4, 4). Comparisons are
 * case-insensitive; any other dthrottle value leaves recon_thread_count
 * untouched.
 *
 * NOTE(review): the expansion is a bare { } block rather than
 * do { } while (0), `conf` is not parenthesized, and MAX() evaluates
 * sysconf() twice. The macro is not invoked anywhere in this file.
 */
#define GF_DECIDE_DEFRAG_THROTTLE_COUNT(throttle_count, conf) \
{ \
pthread_mutex_lock(&conf->defrag->dfq_mutex); \
\
if (!strcasecmp(conf->dthrottle, "lazy")) \
conf->defrag->recon_thread_count = 1; \
\
throttle_count = MAX((sysconf(_SC_NPROCESSORS_ONLN) - 4), 4); \
\
if (!strcasecmp(conf->dthrottle, "normal")) \
conf->defrag->recon_thread_count = (throttle_count / 2); \
\
if (!strcasecmp(conf->dthrottle, "aggressive")) \
conf->defrag->recon_thread_count = throttle_count; \
\
pthread_mutex_unlock(&conf->defrag->dfq_mutex); \
}
/* TODO:
- use volumename in xattr instead of "dht"
- use NS locks
- handle all cases in self heal layout reconstruction
- complete linkfile selfheal
*/
/* External dht_methods_t instance. It is only declared here; nothing else in
 * this file references it (dht_init_methods() fills conf->methods instead). */
extern dht_methods_t dht_methods;
/*
 * Emit a statedump description of @layout, with every key built from
 * @prefix.
 *
 * The counters (cnt, preset, gen) are always written; the inode type is
 * written unless it is IA_INVAL. Per-entry details (err, start, stop and,
 * when present, the owning xlator's type and name) are written only for
 * directory layouts. Does nothing when either argument is NULL.
 */
void
dht_layout_dump(dht_layout_t *layout, const char *prefix)
{
    char dump_key[GF_DUMP_MAX_BUF_LEN];
    int idx = 0;

    /* Both the layout and the key prefix are required. */
    if (layout == NULL || prefix == NULL)
        return;

    gf_proc_dump_build_key(dump_key, prefix, "cnt");
    gf_proc_dump_write(dump_key, "%d", layout->cnt);

    gf_proc_dump_build_key(dump_key, prefix, "preset");
    gf_proc_dump_write(dump_key, "%d", layout->preset);

    gf_proc_dump_build_key(dump_key, prefix, "gen");
    gf_proc_dump_write(dump_key, "%d", layout->gen);

    if (layout->type != IA_INVAL) {
        gf_proc_dump_build_key(dump_key, prefix, "inode type");
        gf_proc_dump_write(dump_key, "%d", layout->type);
    }

    /* The range list is only dumped for directories. */
    if (!IA_ISDIR(layout->type))
        return;

    for (idx = 0; idx < layout->cnt; idx++) {
        gf_proc_dump_build_key(dump_key, prefix, "list[%d].err", idx);
        gf_proc_dump_write(dump_key, "%d", layout->list[idx].err);

        gf_proc_dump_build_key(dump_key, prefix, "list[%d].start", idx);
        gf_proc_dump_write(dump_key, "%u", layout->list[idx].start);

        gf_proc_dump_build_key(dump_key, prefix, "list[%d].stop", idx);
        gf_proc_dump_write(dump_key, "%u", layout->list[idx].stop);

        /* Entries without an xlator have nothing further to report. */
        if (layout->list[idx].xlator == NULL)
            continue;

        gf_proc_dump_build_key(dump_key, prefix, "list[%d].xlator.type", idx);
        gf_proc_dump_write(dump_key, "%s", layout->list[idx].xlator->type);

        gf_proc_dump_build_key(dump_key, prefix, "list[%d].xlator.name", idx);
        gf_proc_dump_write(dump_key, "%s", layout->list[idx].xlator->name);
    }
}
/*
 * Statedump handler for the translator's private configuration.
 *
 * Writes the subvolume table (with any preset file/dir layouts and status
 * flags), a handful of tunables, the disk-usage statistics of subvolumes
 * whose status flag is set, and the time of the last stat fetch.
 *
 * The dump runs under conf->subvolume_lock, taken with TRY_LOCK so a busy
 * lock is reported to the caller instead of blocking.
 *
 * Returns -1 when @this or its private data is NULL, the non-zero TRY_LOCK
 * result when the lock could not be taken, and 0 otherwise.
 */
int32_t
dht_priv_dump(xlator_t *this)
{
    char key_prefix[GF_DUMP_MAX_BUF_LEN];
    char key[GF_DUMP_MAX_BUF_LEN];
    dht_conf_t *conf = NULL;
    int idx = 0;
    int ret = -1;

    if (this == NULL)
        return ret;

    conf = this->private;
    if (conf == NULL)
        return ret;

    ret = TRY_LOCK(&conf->subvolume_lock);
    if (ret != 0)
        return ret;

    gf_proc_dump_add_section("xlator.cluster.dht.%s.priv", this->name);
    /* key_prefix is built here but not consumed by the writes below. */
    gf_proc_dump_build_key(key_prefix, "xlator.cluster.dht", "%s.priv",
                           this->name);
    gf_proc_dump_write("subvol_cnt", "%d", conf->subvolume_cnt);

    for (idx = 0; idx < conf->subvolume_cnt; idx++) {
        snprintf(key, sizeof(key), "subvolumes[%d]", idx);
        gf_proc_dump_write(key, "%s.%s", conf->subvolumes[idx]->type,
                           conf->subvolumes[idx]->name);

        if (conf->file_layouts && conf->file_layouts[idx]) {
            snprintf(key, sizeof(key), "file_layouts[%d]", idx);
            dht_layout_dump(conf->file_layouts[idx], key);
        }

        if (conf->dir_layouts && conf->dir_layouts[idx]) {
            snprintf(key, sizeof(key), "dir_layouts[%d]", idx);
            dht_layout_dump(conf->dir_layouts[idx], key);
        }

        if (conf->subvolume_status) {
            snprintf(key, sizeof(key), "subvolume_status[%d]", idx);
            gf_proc_dump_write(key, "%d", (int)conf->subvolume_status[idx]);
        }
    }

    gf_proc_dump_write("search_unhashed", "%d", conf->search_unhashed);
    gf_proc_dump_write("gen", "%d", conf->gen);
    gf_proc_dump_write("min_free_disk", "%lf", conf->min_free_disk);
    gf_proc_dump_write("min_free_inodes", "%lf", conf->min_free_inodes);
    gf_proc_dump_write("disk_unit", "%c", conf->disk_unit);
    gf_proc_dump_write("refresh_interval", "%d", conf->refresh_interval);
    gf_proc_dump_write("unhashed_sticky_bit", "%d", conf->unhashed_sticky_bit);
    gf_proc_dump_write("use-readdirp", "%d", conf->use_readdirp);

    if (conf->du_stats && conf->subvolume_status) {
        for (idx = 0; idx < conf->subvolume_cnt; idx++) {
            /* Only subvolumes with a non-zero status flag are reported. */
            if (!conf->subvolume_status[idx])
                continue;

            snprintf(key, sizeof(key), "subvolumes[%d]", idx);
            gf_proc_dump_write(key, "%s", conf->subvolumes[idx]->name);

            snprintf(key, sizeof(key), "du_stats[%d].avail_percent", idx);
            gf_proc_dump_write(key, "%lf", conf->du_stats[idx].avail_percent);

            snprintf(key, sizeof(key), "du_stats[%d].avail_space", idx);
            gf_proc_dump_write(key, "%" PRIu64,
                               conf->du_stats[idx].avail_space);

            snprintf(key, sizeof(key), "du_stats[%d].avail_inodes", idx);
            gf_proc_dump_write(key, "%lf", conf->du_stats[idx].avail_inodes);

            snprintf(key, sizeof(key), "du_stats[%d].log", idx);
            gf_proc_dump_write(key, "%" PRIu32, conf->du_stats[idx].log);
        }
    }

    if (conf->last_stat_fetch.tv_sec) {
        gf_proc_dump_write("last_stat_fetch", "%s",
                           ctime(&conf->last_stat_fetch.tv_sec));
    }

    UNLOCK(&conf->subvolume_lock);

    return ret;
}
/*
 * Statedump handler for the DHT context of @inode.
 *
 * Looks up the layout stored on the inode and, when one is found, writes it
 * under an "xlator.cluster.dht.<name>.inode" section.
 *
 * Returns -1 when either argument is NULL; otherwise the result of
 * dht_inode_ctx_layout_get().
 */
int32_t
dht_inodectx_dump(xlator_t *this, inode_t *inode)
{
    dht_layout_t *layout = NULL;
    int ret = -1;

    if (this == NULL || inode == NULL)
        return ret;

    ret = dht_inode_ctx_layout_get(inode, this, &layout);
    if (ret == 0 && layout != NULL) {
        gf_proc_dump_add_section("xlator.cluster.dht.%s.inode", this->name);
        dht_layout_dump(layout, "layout");
    }

    return ret;
}
/*
 * Tear down the translator's private configuration.
 *
 * this->private is detached first, then everything hanging off the
 * configuration is released: preset file layouts, the leaf-to-subvolume
 * dict, the per-subvolume arrays, the derived xattr names, any compiled
 * regexes, the link lock and the lock pool, and finally the configuration
 * itself. A NULL @this or an already-cleared private pointer is a no-op.
 */
void
dht_fini(xlator_t *this)
{
    dht_conf_t *conf = NULL;
    int idx = 0;

    GF_VALIDATE_OR_GOTO("dht", this, out);

    conf = this->private;
    this->private = NULL;
    if (!conf)
        goto out;

    if (conf->file_layouts) {
        for (idx = 0; idx < conf->subvolume_cnt; idx++) {
            GF_FREE(conf->file_layouts[idx]);
        }
        GF_FREE(conf->file_layouts);
    }

    dict_unref(conf->leaf_to_subvol);

    /* allocated in dht_init_subvolumes() */
    GF_FREE(conf->subvolumes);
    GF_FREE(conf->subvolume_status);
    GF_FREE(conf->last_event);
    GF_FREE(conf->subvol_up_time);
    GF_FREE(conf->du_stats);
    GF_FREE(conf->decommissioned_bricks);

    /* allocated in dht_init() */
    GF_FREE(conf->mds_xattr_key);
    GF_FREE(conf->link_xattr_name);
    GF_FREE(conf->commithash_xattr_name);
    GF_FREE(conf->wild_xattr_name);

    /* allocated in dht_init_regex() */
    if (conf->rsync_regex_valid) {
        regfree(&conf->rsync_regex);
    }
    if (conf->extra_regex_valid) {
        regfree(&conf->extra_regex);
    }

    synclock_destroy(&conf->link_lock);

    if (conf->lock_pool) {
        mem_pool_destroy(conf->lock_pool);
    }

    GF_FREE(conf);

out:
    return;
}
/*
 * Set up memory accounting for this translator, sized for
 * gf_dht_mt_end + 1 memory types.
 *
 * Returns -1 when @this is NULL, otherwise the result of
 * xlator_mem_acct_init() (a failure is also logged).
 */
int32_t
mem_acct_init(xlator_t *this)
{
    int ret = -1;

    GF_VALIDATE_OR_GOTO("dht", this, out);

    ret = xlator_mem_acct_init(this, gf_dht_mt_end + 1);
    if (ret != 0) {
        gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_NO_MEMORY,
               "Memory accounting init failed");
    }

out:
    return ret;
}
/*
 * Mark the subvolumes named in @bricks as decommissioned.
 *
 * @bricks is a comma-separated list of subvolume names. Each name is matched
 * exactly against conf->subvolumes[]; a match records the subvolume in the
 * same slot of conf->decommissioned_bricks[]. On full success
 * conf->decommission_in_progress is set.
 *
 * conf->decommission_subvols_cnt is bumped only when a slot goes from empty
 * to set. This function is re-run on every reconfigure, and
 * dht_decommissioned_remove() decrements once per set slot, so counting an
 * already-marked brick again would leave the counter permanently inflated.
 *
 * Returns 0 on success. Returns -1 when @conf or @bricks is NULL, when the
 * list cannot be duplicated, or when a name matches no subvolume; in the
 * last case bricks matched earlier in the list remain marked.
 */
int
dht_parse_decommissioned_bricks(xlator_t *this, dht_conf_t *conf,
                                const char *bricks)
{
    int i = 0;
    int ret = -1;
    char *tmpstr = NULL;
    char *dup_brick = NULL;
    char *node = NULL;

    if (!conf || !bricks)
        goto out;

    /* strtok_r() writes into its input, so tokenize a private copy. */
    dup_brick = gf_strdup(bricks);
    if (dup_brick == NULL) {
        goto out;
    }

    node = strtok_r(dup_brick, ",", &tmpstr);
    while (node) {
        for (i = 0; i < conf->subvolume_cnt; i++) {
            if (strcmp(conf->subvolumes[i]->name, node) != 0)
                continue;

            /* Count each brick once, however often it is named. */
            if (!conf->decommissioned_bricks[i]) {
                conf->decommissioned_bricks[i] = conf->subvolumes[i];
                conf->decommission_subvols_cnt++;
            }
            gf_msg(this->name, GF_LOG_INFO, 0,
                   DHT_MSG_SUBVOL_DECOMMISSION_INFO,
                   "decommissioning subvolume %s",
                   conf->subvolumes[i]->name);
            break;
        }
        if (i == conf->subvolume_cnt) {
            /* Wrong node given. */
            goto out;
        }
        node = strtok_r(NULL, ",", &tmpstr);
    }

    ret = 0;
    conf->decommission_in_progress = 1;

out:
    GF_FREE(dup_brick);

    return ret;
}
/*
 * Clear every decommissioned-brick marker in @conf, decrementing
 * conf->decommission_subvols_cnt once per marker cleared.
 *
 * @this is not used. Returns 0, or -1 when @conf is NULL.
 */
int
dht_decommissioned_remove(xlator_t *this, dht_conf_t *conf)
{
    int idx = 0;

    if (conf == NULL)
        return -1;

    for (idx = 0; idx < conf->subvolume_cnt; idx++) {
        if (conf->decommissioned_bricks[idx] == NULL)
            continue;

        conf->decommissioned_bricks[idx] = NULL;
        conf->decommission_subvols_cnt--;
    }

    return 0;
}
/*
 * (Re)compile the hash regex configured under option @name into @re.
 *
 * The pattern is read from @odict. When the option is absent, only
 * "rsync-hash-regex" falls back to a built-in pattern; any other option is
 * left completely untouched. Under conf->lock a previously compiled regex is
 * released and *@re_valid cleared; then, unless the pattern is the literal
 * "none", it is compiled with REG_EXTENDED and *@re_valid is set on success.
 * A compile failure is logged as a warning and leaves *@re_valid false.
 */
void
dht_init_regex(xlator_t *this, dict_t *odict, char *name, regex_t *re,
               gf_boolean_t *re_valid, dht_conf_t *conf)
{
    char *pattern = NULL;

    if (dict_get_str(odict, name, &pattern) != 0) {
        if (strcmp(name, "rsync-hash-regex") != 0) {
            return;
        }
        pattern = "^\\.(.+)\\.[^.]+$";
    }

    LOCK(&conf->lock);
    {
        if (*re_valid) {
            regfree(re);
            *re_valid = _gf_false;
        }

        /* "none" disables the regex: leave it released and invalid. */
        if (strcmp(pattern, "none") != 0) {
            if (regcomp(re, pattern, REG_EXTENDED) != 0) {
                gf_msg(this->name, GF_LOG_WARNING, 0, DHT_MSG_REGEX_INFO,
                       "compiling regex %s failed", pattern);
            } else {
                gf_msg_debug(this->name, 0, "using regex %s = %s", name,
                             pattern);
                *re_valid = _gf_true;
            }
        }
    }
    UNLOCK(&conf->lock);
}
/*
 * Create conf->leaf_to_subvol and have glusterfs_reachable_leaves() fill it
 * for this translator.
 *
 * Returns -1 when the private configuration is missing or the dict cannot
 * be created; otherwise the result of glusterfs_reachable_leaves().
 */
int
dht_set_subvol_range(xlator_t *this)
{
    dht_conf_t *conf = this->private;

    if (conf == NULL)
        return -1;

    conf->leaf_to_subvol = dict_new();
    if (conf->leaf_to_subvol == NULL)
        return -1;

    return glusterfs_reachable_leaves(this, conf->leaf_to_subvol);
}
/*
 * Apply a "rebal-throttle" setting to conf->defrag->recon_thread_count.
 *
 * @temp_str is matched case-insensitively against:
 *   "lazy"       -> 1 thread
 *   "normal"     -> 2 threads
 *   "aggressive" -> MAX(MAX_REBAL_THREADS - 4, 4) threads
 * Anything else is parsed as an integer and accepted when it lies in
 * 1..MAX_REBAL_THREADS.
 *
 * The whole update runs under conf->defrag->dfq_mutex; the caller must
 * ensure conf->defrag is non-NULL.
 *
 * Returns 0 on success and -1 (thread count unchanged, error logged) when
 * the value is out of range or not recognised.
 */
int
dht_configure_throttle(xlator_t *this, dht_conf_t *conf, char *temp_str)
{
    int rebal_thread_count = 0;
    int ret = 0;

    pthread_mutex_lock(&conf->defrag->dfq_mutex);
    {
        if (!strcasecmp(temp_str, "lazy")) {
            conf->defrag->recon_thread_count = 1;
        } else if (!strcasecmp(temp_str, "normal")) {
            conf->defrag->recon_thread_count = 2;
        } else if (!strcasecmp(temp_str, "aggressive")) {
            conf->defrag->recon_thread_count = MAX(MAX_REBAL_THREADS - 4, 4);
        } else if ((gf_string2int(temp_str, &rebal_thread_count) == 0)) {
            if ((rebal_thread_count > 0) &&
                (rebal_thread_count <= MAX_REBAL_THREADS)) {
                gf_msg(this->name, GF_LOG_INFO, 0, 0,
                       "rebal thread count configured to %d",
                       rebal_thread_count);
                conf->defrag->recon_thread_count = rebal_thread_count;
            } else {
                gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_INVALID_OPTION,
                       "Invalid option: Reconfigure: "
                       "rebal-throttle should be "
                       "within range of 0 and maximum number of"
                       " cores available");
                ret = -1;
            }
        } else {
            /* Report the thread count that stays in effect. conf->dthrottle
             * is a string (see its strcasecmp() use elsewhere in this file)
             * and must not be handed to a %d conversion. */
            gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_INVALID_OPTION,
                   "Invalid option: Reconfigure: "
                   "rebal-throttle should be {lazy|normal|aggressive}"
                   " or a number up to the number of cores available,"
                   " not (%s), defaulting to (%d)",
                   temp_str, (int)conf->defrag->recon_thread_count);
            ret = -1;
        }
    }
    pthread_mutex_unlock(&conf->defrag->dfq_mutex);

    return ret;
}
/*
 * Apply changed volume options to a running translator.
 *
 * Handles, in order: lookup-unhashed, the generic tunables read through
 * GF_OPTION_RECONF, the rebalance-only settings (only when conf->defrag
 * exists), the decommissioned-bricks list (all markers are cleared when the
 * option is absent), both hash regexes, and finally weighted-rebalance and
 * use-readdirp.
 *
 * Returns 0 on success, or when the translator has no private data yet.
 * Returns -1 on a NULL argument or as soon as any option fails to validate;
 * options processed before the failure stay applied.
 */
int
dht_reconfigure(xlator_t *this, dict_t *options)
{
    dht_conf_t *conf = NULL;
    char *temp_str = NULL;
    gf_boolean_t search_unhashed;
    int ret = -1;

    GF_VALIDATE_OR_GOTO("dht", this, out);
    GF_VALIDATE_OR_GOTO("dht", options, out);

    conf = this->private;
    if (!conf)
        return 0;

    if (dict_get_str(options, "lookup-unhashed", &temp_str) == 0) {
        /* If option is not "auto", other options _should_ be boolean*/
        if (strcasecmp(temp_str, "auto")) {
            if (!gf_string2boolean(temp_str, &search_unhashed)) {
                gf_msg_debug(this->name, 0,
                             "Reconfigure: "
                             "lookup-unhashed reconfigured(%s)",
                             temp_str);
                /* Store the same symbolic values dht_init() uses rather
                 * than the raw boolean. */
                conf->search_unhashed = search_unhashed
                                            ? GF_DHT_LOOKUP_UNHASHED_ON
                                            : GF_DHT_LOOKUP_UNHASHED_OFF;
            } else {
                gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_INVALID_OPTION,
                       "Invalid option: Reconfigure: "
                       "lookup-unhashed should be boolean,"
                       " not (%s), defaulting to (%d)",
                       temp_str, conf->search_unhashed);
                ret = -1;
                goto out;
            }
        } else {
            gf_msg_debug(this->name, 0,
                         "Reconfigure:"
                         " lookup-unhashed reconfigured auto ");
            conf->search_unhashed = GF_DHT_LOOKUP_UNHASHED_AUTO;
        }
    }

    GF_OPTION_RECONF("lookup-optimize", conf->lookup_optimize, options, bool,
                     out);

    GF_OPTION_RECONF("min-free-disk", conf->min_free_disk, options,
                     percent_or_size, out);
    /* option can be any one of percent or bytes */
    conf->disk_unit = 0;
    if (conf->min_free_disk < 100.0)
        conf->disk_unit = 'p';

    GF_OPTION_RECONF("min-free-inodes", conf->min_free_inodes, options, percent,
                     out);

    GF_OPTION_RECONF("directory-layout-spread", conf->dir_spread_cnt, options,
                     uint32, out);

    GF_OPTION_RECONF("readdir-optimize", conf->readdir_optimize, options, bool,
                     out);
    GF_OPTION_RECONF("randomize-hash-range-by-gfid", conf->randomize_by_gfid,
                     options, bool, out);

    GF_OPTION_RECONF("lock-migration", conf->lock_migration_enabled, options,
                     bool, out);

    GF_OPTION_RECONF("force-migration", conf->force_migration, options, bool,
                     out);

    /* Rebalance-only settings; conf->defrag exists only in a rebalance
     * process (see dht_init()). */
    if (conf->defrag) {
        if (dict_get_str(options, "rebal-throttle", &temp_str) == 0) {
            ret = dht_configure_throttle(this, conf, temp_str);
            if (ret == -1)
                goto out;
        }

        conf->defrag->lock_migration_enabled = conf->lock_migration_enabled;

        GF_OPTION_RECONF("rebalance-stats", conf->defrag->stats, options, bool,
                         out);
    }

    if (dict_get_str(options, "decommissioned-bricks", &temp_str) == 0) {
        ret = dht_parse_decommissioned_bricks(this, conf, temp_str);
        if (ret == -1)
            goto out;
    } else {
        /* Option no longer set: drop every decommission marker. */
        ret = dht_decommissioned_remove(this, conf);
        if (ret == -1)
            goto out;
    }

    dht_init_regex(this, options, "rsync-hash-regex", &conf->rsync_regex,
                   &conf->rsync_regex_valid, conf);
    dht_init_regex(this, options, "extra-hash-regex", &conf->extra_regex,
                   &conf->extra_regex_valid, conf);

    GF_OPTION_RECONF("weighted-rebalance", conf->do_weighting, options, bool,
                     out);

    GF_OPTION_RECONF("use-readdirp", conf->use_readdirp, options, bool, out);

    ret = 0;
out:
    return ret;
}
/*
 * Parse a rebalance filter string into defrag->defrag_pattern.
 *
 * @data is a comma-separated list of "pattern[:size]" items, e.g.
 * "*avi, *pdf:10MB, *:1TB". It is tokenized in place with strtok_r(), so
 * the caller's buffer is modified. An item with no ":" that parses as a
 * byte size on its own is stored as that size with the pattern "*".
 * Each item is pushed onto the head of the list, so the list ends up in
 * reverse order of appearance.
 *
 * Returns 0 on success. Returns -1 on a NULL argument, allocation failure,
 * an unparsable size, or a pattern too long for the entry's path_pattern
 * buffer. Entries linked before the failure remain on the list.
 */
static int
gf_defrag_pattern_list_fill(xlator_t *this, gf_defrag_info_t *defrag,
                            char *data)
{
    int ret = -1;
    char *tmp_str = NULL;
    char *tmp_str1 = NULL;
    char *dup_str = NULL;
    char *num = NULL;
    char *pattern_str = NULL;
    char *pattern = NULL;
    size_t pattern_len = 0;
    gf_defrag_pattern_list_t *pattern_list = NULL;

    if (!this || !defrag || !data)
        goto out;

    /* Get the pattern for pattern list. "pattern:<optional-size>"
     * eg: *avi, *pdf:10MB, *:1TB
     */
    pattern_str = strtok_r(data, ",", &tmp_str);
    while (pattern_str) {
        dup_str = gf_strdup(pattern_str);
        if (!dup_str) {
            /* strtok_r(NULL, ...) with an unset save pointer is undefined. */
            goto out;
        }

        pattern_list = GF_CALLOC(1, sizeof(gf_defrag_pattern_list_t), 1);
        if (!pattern_list) {
            goto out;
        }

        pattern = strtok_r(dup_str, ":", &tmp_str1);
        num = strtok_r(NULL, ":", &tmp_str1);
        if (!pattern)
            goto out;

        if (!num) {
            /* A bare size means "every file of at least this size". */
            if (gf_string2bytesize_uint64(pattern, &pattern_list->size) == 0) {
                pattern = "*";
            }
        } else if (gf_string2bytesize_uint64(num, &pattern_list->size) != 0) {
            gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_INVALID_OPTION,
                   "Invalid option. Defrag pattern:"
                   " Invalid number format \"%s\"",
                   num);
            goto out;
        }

        /* Size the copy from the string actually being copied: `pattern`
         * may be the short literal "*" rather than dup_str. Also make sure
         * it fits, terminator included. */
        pattern_len = strlen(pattern);
        if (pattern_len >= sizeof(pattern_list->path_pattern)) {
            gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_INVALID_OPTION,
                   "Invalid option. Defrag pattern:"
                   " pattern \"%s\" is too long",
                   pattern);
            goto out;
        }
        memcpy(pattern_list->path_pattern, pattern, pattern_len + 1);

        /* Push the new entry onto the head of the list. */
        pattern_list->next = defrag->defrag_pattern;
        defrag->defrag_pattern = pattern_list;
        pattern_list = NULL;

        GF_FREE(dup_str);
        dup_str = NULL;

        pattern_str = strtok_r(NULL, ",", &tmp_str);
    }

    ret = 0;
out:
    /* pattern_list is non-NULL here only if it was never linked in. */
    if (ret)
        GF_FREE(pattern_list);
    GF_FREE(dup_str);

    return ret;
}
/*
 * Install the default DHT callbacks in the method table held in the
 * translator's private configuration.
 *
 * this->private must already point at the configuration. Returns 0 on
 * success, or -1 when @this is NULL.
 */
int
dht_init_methods(xlator_t *this)
{
    dht_methods_t *table = NULL;
    dht_conf_t *conf = NULL;
    int ret = -1;

    GF_VALIDATE_OR_GOTO("dht", this, err);

    conf = this->private;
    table = &(conf->methods);

    table->layout_search = dht_layout_search;
    table->migration_needed = dht_migration_needed;
    table->migration_get_dst_subvol = dht_migration_get_dst_subvol;
    /* No extra migration hook by default. */
    table->migration_other = NULL;

    ret = 0;
err:
    return ret;
}
/*
 * Translator init: build the private dht_conf_t from this->options.
 *
 * Steps, in order:
 *   1. Allocate the configuration and initialise its locks.
 *   2. If a non-zero "rebalance-cmd" is present, allocate and set up the
 *      rebalance (defrag) state; this requires a parsable "node-uuid".
 *   3. Read the lookup and placement options.
 *   4. Initialise the subvolume arrays (and local subvolumes when
 *      rebalancing), the decommissioned-brick list, the hash regexes and
 *      the layouts.
 *   5. Create the memory pools and the derived xattr names.
 *   6. Publish the configuration in this->private, then build the
 *      leaf-to-subvolume map and the method table.
 *
 * Returns 0 on success and -1 on any failure. On failure everything owned
 * by the configuration is released and this->private is reset, so no
 * later call can reach the freed configuration through it.
 */
int
dht_init(xlator_t *this)
{
    dht_conf_t *conf = NULL;
    char *temp_str = NULL;
    int ret = -1;
    int i = 0;
    gf_defrag_info_t *defrag = NULL;
    int cmd = 0;
    char *node_uuid = NULL;
    uint32_t commit_hash = 0;

    GF_VALIDATE_OR_GOTO("dht", this, err);

    if (!this->children) {
        gf_msg(this->name, GF_LOG_CRITICAL, 0, DHT_MSG_INVALID_CONFIGURATION,
               "Distribute needs more than one subvolume");
        return -1;
    }

    if (!this->parents) {
        gf_msg(this->name, GF_LOG_WARNING, 0, DHT_MSG_INVALID_CONFIGURATION,
               "dangling volume. check volfile");
    }

    conf = GF_CALLOC(1, sizeof(*conf), gf_dht_mt_dht_conf_t);
    if (!conf) {
        goto err;
    }

    LOCK_INIT(&conf->subvolume_lock);
    LOCK_INIT(&conf->layout_lock);
    LOCK_INIT(&conf->lock);
    synclock_init(&conf->link_lock, SYNC_LOCK_DEFAULT);

    /* We get the commit-hash to set only for rebalance process */
    if (dict_get_uint32(this->options, "commit-hash", &commit_hash) == 0) {
        gf_msg(this->name, GF_LOG_INFO, 0, DHT_MSG_COMMIT_HASH_INFO,
               "%s using commit hash %u", __func__, commit_hash);
        conf->vol_commit_hash = commit_hash;
        conf->vch_forced = _gf_true;
    }

    /* A missing option leaves cmd at 0, i.e. not a rebalance process; the
     * return value itself is not examined. */
    ret = dict_get_int32(this->options, "rebalance-cmd", &cmd);

    if (cmd) {
        defrag = GF_CALLOC(1, sizeof(gf_defrag_info_t), gf_defrag_info_mt);

        GF_VALIDATE_OR_GOTO(this->name, defrag, err);

        LOCK_INIT(&defrag->lock);

        defrag->is_exiting = 0;

        conf->defrag = defrag;
        defrag->this = this;

        ret = dict_get_str(this->options, "node-uuid", &node_uuid);
        if (ret) {
            gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_INVALID_CONFIGURATION,
                   "Invalid volume configuration: "
                   "node-uuid not specified");
            goto err;
        }

        if (gf_uuid_parse(node_uuid, defrag->node_uuid)) {
            gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_INVALID_OPTION,
                   "Invalid option:"
                   " Cannot parse glusterd node uuid");
            goto err;
        }

        defrag->cmd = cmd;

        defrag->stats = _gf_false;

        defrag->queue = NULL;

        defrag->crawl_done = 0;

        defrag->global_error = 0;

        defrag->q_entry_count = 0;

        defrag->wakeup_crawler = 0;

        pthread_mutex_init(&defrag->dfq_mutex, 0);
        pthread_cond_init(&defrag->parallel_migration_cond, 0);
        pthread_cond_init(&defrag->rebalance_crawler_alarm, 0);
        pthread_cond_init(&defrag->df_wakeup_thread, 0);

        pthread_mutex_init(&defrag->fc_mutex, 0);
        pthread_cond_init(&defrag->fc_wakeup_cond, 0);

        defrag->global_error = 0;
    }

    conf->use_fallocate = 1;

    conf->search_unhashed = GF_DHT_LOOKUP_UNHASHED_ON;
    if (dict_get_str(this->options, "lookup-unhashed", &temp_str) == 0) {
        /* If option is not "auto", other options _should_ be boolean */
        if (strcasecmp(temp_str, "auto")) {
            gf_boolean_t search_unhashed_bool;
            ret = gf_string2boolean(temp_str, &search_unhashed_bool);
            if (ret == -1) {
                goto err;
            }
            conf->search_unhashed = search_unhashed_bool
                                        ? GF_DHT_LOOKUP_UNHASHED_ON
                                        : GF_DHT_LOOKUP_UNHASHED_OFF;
        } else {
            conf->search_unhashed = GF_DHT_LOOKUP_UNHASHED_AUTO;
        }
    }

    GF_OPTION_INIT("lookup-optimize", conf->lookup_optimize, bool, err);

    GF_OPTION_INIT("unhashed-sticky-bit", conf->unhashed_sticky_bit, bool, err);

    GF_OPTION_INIT("use-readdirp", conf->use_readdirp, bool, err);

    GF_OPTION_INIT("min-free-disk", conf->min_free_disk, percent_or_size, err);

    GF_OPTION_INIT("min-free-inodes", conf->min_free_inodes, percent, err);

    /* NOTE(review): conf was zero-allocated above and nothing in this
     * function has set subvolume_cnt yet, so this seeds dir_spread_cnt with
     * 0 unless the option below overrides it -- confirm that is intended. */
    conf->dir_spread_cnt = conf->subvolume_cnt;
    GF_OPTION_INIT("directory-layout-spread", conf->dir_spread_cnt, uint32,
                   err);

    GF_OPTION_INIT("assert-no-child-down", conf->assert_no_child_down, bool,
                   err);

    GF_OPTION_INIT("readdir-optimize", conf->readdir_optimize, bool, err);

    GF_OPTION_INIT("lock-migration", conf->lock_migration_enabled, bool, err);

    GF_OPTION_INIT("force-migration", conf->force_migration, bool, err);

    if (defrag) {
        defrag->lock_migration_enabled = conf->lock_migration_enabled;

        GF_OPTION_INIT("rebalance-stats", defrag->stats, bool, err);
        if (dict_get_str(this->options, "rebalance-filter", &temp_str) == 0) {
            if (gf_defrag_pattern_list_fill(this, defrag, temp_str) == -1) {
                gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_INVALID_OPTION,
                       "Invalid option:"
                       " Cannot parse rebalance-filter (%s)",
                       temp_str);

                goto err;
            }
        }
    }

    /* option can be any one of percent or bytes */
    conf->disk_unit = 0;
    if (conf->min_free_disk < 100)
        conf->disk_unit = 'p';

    ret = dht_init_subvolumes(this, conf);
    if (ret == -1) {
        goto err;
    }

    if (cmd) {
        ret = dht_init_local_subvolumes(this, conf);
        if (ret) {
            gf_msg(this->name, GF_LOG_ERROR, 0,
                   DHT_MSG_INIT_LOCAL_SUBVOL_FAILED,
                   "dht_init_local_subvolumes failed");
            goto err;
        }
    }

    if (dict_get_str(this->options, "decommissioned-bricks", &temp_str) == 0) {
        ret = dht_parse_decommissioned_bricks(this, conf, temp_str);
        if (ret == -1)
            goto err;
    }

    dht_init_regex(this, this->options, "rsync-hash-regex", &conf->rsync_regex,
                   &conf->rsync_regex_valid, conf);
    dht_init_regex(this, this->options, "extra-hash-regex", &conf->extra_regex,
                   &conf->extra_regex_valid, conf);

    ret = dht_layouts_init(this, conf);
    if (ret == -1) {
        goto err;
    }

    conf->gen = 1;

    this->local_pool = mem_pool_new(dht_local_t, 512);
    if (!this->local_pool) {
        gf_msg(this->name, GF_LOG_ERROR, ENOMEM, DHT_MSG_NO_MEMORY,
               " DHT initialisation failed. "
               "failed to create local_t's memory pool");
        goto err;
    }

    GF_OPTION_INIT("randomize-hash-range-by-gfid", conf->randomize_by_gfid,
                   bool, err);

    if (defrag) {
        GF_OPTION_INIT("rebal-throttle", temp_str, str, err);
        if (temp_str) {
            ret = dht_configure_throttle(this, conf, temp_str);
            if (ret == -1)
                goto err;
        }
    }

    GF_OPTION_INIT("xattr-name", conf->xattr_name, str, err);
    gf_asprintf(&conf->mds_xattr_key, "%s." DHT_MDS_STR, conf->xattr_name);
    gf_asprintf(&conf->link_xattr_name, "%s." DHT_LINKFILE_STR,
                conf->xattr_name);
    gf_asprintf(&conf->commithash_xattr_name, "%s." DHT_COMMITHASH_STR,
                conf->xattr_name);
    gf_asprintf(&conf->wild_xattr_name, "%s*", conf->xattr_name);
    if (!conf->link_xattr_name || !conf->wild_xattr_name) {
        goto err;
    }

    GF_OPTION_INIT("weighted-rebalance", conf->do_weighting, bool, err);

    conf->lock_pool = mem_pool_new(dht_lock_t, 512);
    if (!conf->lock_pool) {
        gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_INIT_FAILED,
               "failed to create lock mem_pool, failing "
               "initialization");
        goto err;
    }

    /* Both helpers below read the configuration through this->private. */
    this->private = conf;

    if (dht_set_subvol_range(this))
        goto err;

    if (dht_init_methods(this))
        goto err;

    return 0;

err:
    if (conf) {
        /* The two helpers above can fail after conf was published; make
         * sure this->private never outlives the configuration. */
        if (this->private == conf)
            this->private = NULL;

        if (conf->file_layouts) {
            for (i = 0; i < conf->subvolume_cnt; i++) {
                GF_FREE(conf->file_layouts[i]);
            }
            GF_FREE(conf->file_layouts);
        }

        /* May have been created by dht_set_subvol_range(). */
        if (conf->leaf_to_subvol)
            dict_unref(conf->leaf_to_subvol);

        /* Same set of arrays that dht_fini() releases. */
        GF_FREE(conf->subvolumes);
        GF_FREE(conf->subvolume_status);
        GF_FREE(conf->last_event);
        GF_FREE(conf->subvol_up_time);
        GF_FREE(conf->du_stats);
        GF_FREE(conf->decommissioned_bricks);

        GF_FREE(conf->defrag);

        /* NOTE(review): dht_fini() does not free xattr_name; confirm who
         * owns the string returned by GF_OPTION_INIT before relying on
         * this free. Kept as in the original error path. */
        GF_FREE(conf->xattr_name);
        GF_FREE(conf->link_xattr_name);
        GF_FREE(conf->commithash_xattr_name);
        GF_FREE(conf->wild_xattr_name);
        GF_FREE(conf->mds_xattr_key);

        if (conf->rsync_regex_valid)
            regfree(&conf->rsync_regex);
        if (conf->extra_regex_valid)
            regfree(&conf->extra_regex);

        synclock_destroy(&conf->link_lock);

        if (conf->lock_pool)
            mem_pool_destroy(conf->lock_pool);

        GF_FREE(conf);
    }

    return -1;
}
/*
 * Option table for the distribute translator.
 *
 * Each entry names an option key and its value type, plus, where given, a
 * default, a description, the op-version that introduced it and its
 * visibility flags. Besides the options consumed in this file, the table
 * carries keys grouped below as NUFA, tier and switch options. The table is
 * terminated by an entry whose key is NULL.
 */
struct volume_options dht_options[] = {
{
.key = {"lookup-unhashed"},
.value = {"auto", "yes", "no", "enable", "disable", "1", "0", "on",
"off"},
.type = GF_OPTION_TYPE_STR,
.default_value = "on",
.description =
"This option if set to ON, does a lookup through "
"all the sub-volumes, in case a lookup didn't return any result "
"from the hash subvolume. If set to OFF, it does not do a lookup "
"on the remaining subvolumes.",
.op_version = {1},
.flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE,
.level = OPT_STATUS_BASIC,
},
{.key = {"lookup-optimize"},
.type = GF_OPTION_TYPE_BOOL,
.default_value = "on",
.description =
"This option if set to ON enables the optimization "
"of -ve lookups, by not doing a lookup on non-hashed subvolumes for "
"files, in case the hashed subvolume does not return any result. "
"This option disregards the lookup-unhashed setting, when enabled.",
.op_version = {GD_OP_VERSION_3_7_2},
.level = OPT_STATUS_ADVANCED,
.flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE | OPT_FLAG_DOC},
{.key = {"min-free-disk"},
.type = GF_OPTION_TYPE_PERCENT_OR_SIZET,
.default_value = "10%",
.description =
"Percentage/Size of disk space, after which the "
"process starts balancing out the cluster, and logs will appear "
"in log files",
.op_version = {1},
.level = OPT_STATUS_BASIC,
.flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE | OPT_FLAG_DOC},
{.key = {"min-free-inodes"},
.type = GF_OPTION_TYPE_PERCENT,
.default_value = "5%",
.description = "after system has only N% of inodes, warnings "
"starts to appear in log files",
.op_version = {1},
.level = OPT_STATUS_BASIC,
.flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE | OPT_FLAG_DOC},
{
.key = {"unhashed-sticky-bit"},
.type = GF_OPTION_TYPE_BOOL,
.default_value = "off",
},
{.key = {"use-readdirp"},
.type = GF_OPTION_TYPE_BOOL,
.default_value = "on",
.description = "This option if set to ON, forces the use of "
"readdirp, and hence also displays the stats of the files.",
.level = OPT_STATUS_ADVANCED,
.flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE | OPT_FLAG_DOC},
{.key = {"assert-no-child-down"},
.type = GF_OPTION_TYPE_BOOL,
.default_value = "off",
.description = "This option if set to ON, in the event of "
"CHILD_DOWN, will call exit."},
/* No default_value here: dht_init() seeds conf->dir_spread_cnt itself
 * before reading this option. */
{
.key = {"directory-layout-spread"},
.type = GF_OPTION_TYPE_INT,
.min = 1,
.validate = GF_OPT_VALIDATE_MIN,
.description = "Specifies the directory layout spread. Takes number "
"of subvolumes as default value.",
.op_version = {2},
},
/* Parsed by dht_parse_decommissioned_bricks() as a comma-separated list
 * of subvolume names. */
{
.key = {"decommissioned-bricks"},
.type = GF_OPTION_TYPE_ANY,
.description =
"This option if set to ON, decommissions "
"the brick, so that no new data is allowed to be created "
"on that brick.",
.level = OPT_STATUS_ADVANCED,
},
/* The next three keys are read directly from this->options in dht_init();
 * a non-zero rebalance-cmd makes dht_init() set up rebalance state. */
{
.key = {"rebalance-cmd"},
.type = GF_OPTION_TYPE_INT,
},
{
.key = {"commit-hash"},
.type = GF_OPTION_TYPE_INT,
},
{
.key = {"node-uuid"},
.type = GF_OPTION_TYPE_STR,
},
{
.key = {"rebalance-stats"},
.type = GF_OPTION_TYPE_BOOL,
.default_value = "off",
.description =
"This option if set to ON displays and logs the "
" time taken for migration of each file, during the rebalance "
"process. If set to OFF, the rebalance logs will only display the "
"time spent in each directory.",
.op_version = {2},
.level = OPT_STATUS_BASIC,
},
{.key = {"readdir-optimize"},
.type = GF_OPTION_TYPE_BOOL,
.default_value = "off",
.description =
"This option if set to ON enables the optimization "
"that allows DHT to requests non-first subvolumes to filter out "
"directory entries.",
.op_version = {1},
.level = OPT_STATUS_ADVANCED,
.flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE | OPT_FLAG_DOC},
{.key = {"rsync-hash-regex"},
.type = GF_OPTION_TYPE_STR,
/* Setting a default here doesn't work. See dht_init_regex. */
.description =
"Regular expression for stripping temporary-file "
"suffix and prefix used by rsync, to prevent relocation when the "
"file is renamed.",
.op_version = {3},
.level = OPT_STATUS_BASIC,
.flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE | OPT_FLAG_DOC},
{.key = {"extra-hash-regex"},
.type = GF_OPTION_TYPE_STR,
/* Setting a default here doesn't work. See dht_init_regex. */
.description =
"Regular expression for stripping temporary-file "
"suffix and prefix used by an application, to prevent relocation when "
"the file is renamed.",
.op_version = {3},
.level = OPT_STATUS_BASIC,
.flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE | OPT_FLAG_DOC},
/* Parsed by gf_defrag_pattern_list_fill() as "pattern[:size]" items. */
{
.key = {"rebalance-filter"},
.type = GF_OPTION_TYPE_STR,
},
{
.key = {"xattr-name"},
.type = GF_OPTION_TYPE_STR,
.default_value = "trusted.glusterfs.dht",
.description =
"Base for extended attributes used by this "
"translator instance, to avoid conflicts with others above or "
"below it.",
.op_version = {3},
},
{.key = {"weighted-rebalance"},
.type = GF_OPTION_TYPE_BOOL,
.default_value = "on",
.description =
"When enabled, files will be allocated to bricks "
"with a probability proportional to their size. Otherwise, all "
"bricks will have the same probability (legacy behavior).",
.op_version = {GD_OP_VERSION_RHS_3_0},
.level = OPT_STATUS_BASIC,
.flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE | OPT_FLAG_DOC},
/* NUFA option */
{.key = {"local-volume-name"}, .type = GF_OPTION_TYPE_XLATOR},
/* tier options */
{
.key = {"tier-pause"},
.type = GF_OPTION_TYPE_BOOL,
.default_value = "off",
},
{
.key = {"tier-promote-frequency"},
.type = GF_OPTION_TYPE_INT,
.default_value = "120",
},
{
.key = {"tier-demote-frequency"},
.type = GF_OPTION_TYPE_INT,
.default_value = "3600",
},
{
.key = {"write-freq-threshold"},
.type = GF_OPTION_TYPE_INT,
.default_value = "0",
},
{
.key = {"read-freq-threshold"},
.type = GF_OPTION_TYPE_INT,
.default_value = "0",
},
{
.key = {"watermark-hi"},
.type = GF_OPTION_TYPE_PERCENT,
.default_value = "90",
},
{
.key = {"watermark-low"},
.type = GF_OPTION_TYPE_PERCENT,
.default_value = "75",
},
{
.key = {"tier-mode"},
.type = GF_OPTION_TYPE_STR,
.default_value = "test",
},
{
.key = {"tier-compact"},
.type = GF_OPTION_TYPE_BOOL,
.default_value = "off",
},
{.key = {"tier-hot-compact-frequency"},
.type = GF_OPTION_TYPE_INT,
.default_value = "604800",
.description = "Frequency to compact DBs on hot tier in system"},
{.key = {"tier-cold-compact-frequency"},
.type = GF_OPTION_TYPE_INT,
.default_value = "604800",
.description = "Frequency to compact DBs on cold tier in system"},
{
.key = {"tier-max-mb"},
.type = GF_OPTION_TYPE_INT,
.default_value = "4000",
},
{
.key = {"tier-max-promote-file-size"},
.type = GF_OPTION_TYPE_INT,
.default_value = "0",
},
{
.key = {"tier-max-files"},
.type = GF_OPTION_TYPE_INT,
.default_value = "10000",
},
{
.key = {"tier-query-limit"},
.type = GF_OPTION_TYPE_INT,
.default_value = "100",
},
/* switch option */
{.key = {"pattern.switch.case"}, .type = GF_OPTION_TYPE_ANY},
{
.key = {"randomize-hash-range-by-gfid"},
.type = GF_OPTION_TYPE_BOOL,
.default_value = "off",
.description =
"Use gfid of directory to determine the subvolume "
"from which hash ranges are allocated starting with 0. "
"Note that we still use a directory/file's name to determine the "
"subvolume to which it hashes",
.op_version = {GD_OP_VERSION_RHS_3_0},
},
/* NOTE(review): the thread-count formulas in this description do not match
 * dht_configure_throttle() in this file, which uses 1 for lazy, 2 for
 * normal and MAX(MAX_REBAL_THREADS - 4, 4) for aggressive -- confirm which
 * one is intended and align the text. */
{.key = {"rebal-throttle"},
.type = GF_OPTION_TYPE_STR,
.default_value = "normal",
.description = " Sets the maximum number of parallel file migrations "
"allowed on a node during the rebalance operation. The"
" default value is normal and allows a max of "
"[($(processing units) - 4) / 2), 2] files to be "
"migrated at a time. Lazy will allow only one file to "
"be migrated at a time and aggressive will allow "
"max of [($(processing units) - 4) / 2), 4]",
.op_version = {GD_OP_VERSION_RHS_3_0},
.level = OPT_STATUS_BASIC,
.flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE | OPT_FLAG_DOC
},
{.key = {"lock-migration"},
.type = GF_OPTION_TYPE_BOOL,
.default_value = "off",
.description = " If enabled this feature will migrate the posix locks"
" associated with a file during rebalance",
.op_version = {GD_OP_VERSION_3_8_0},
.level = OPT_STATUS_ADVANCED,
.flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE | OPT_FLAG_DOC},
{.key = {"force-migration"},
.type = GF_OPTION_TYPE_BOOL,
.default_value = "off",
.description = "If disabled, rebalance will not migrate files that "
"are being written to by an application",
.op_version = {GD_OP_VERSION_4_0_0},
.level = OPT_STATUS_ADVANCED,
.flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE | OPT_FLAG_DOC},
/* Terminator entry. */
{.key = {NULL}},
};
/* Number of entries in dht_options[], including the NULL-key terminator. */
#define NUM_DHT_OPTIONS (sizeof(dht_options) / sizeof(dht_options[0]))
/* Make the table reachable under the symbol name "options" as well, using
 * the compiler's alias attribute so both names refer to the same array. */
extern struct volume_options options[NUM_DHT_OPTIONS]
__attribute__((alias("dht_options")));