/*
Copyright (c) 2006-2017 Red Hat, Inc. <http://www.redhat.com>
This file is part of GlusterFS.
This file is licensed to you under your choice of the GNU Lesser
General Public License, version 3 or any later version (LGPLv3 or
later), or the GNU General Public License, version 2 (GPLv2), in all
cases as published by the Free Software Foundation.
*/
#define __XOPEN_SOURCE 500
/* for SEEK_HOLE and SEEK_DATA */
#ifndef _GNU_SOURCE
#define _GNU_SOURCE
#endif
#include <openssl/md5.h>
#include <stdint.h>
#include <sys/time.h>
#include <sys/resource.h>
#include <errno.h>
#include <libgen.h>
#include <pthread.h>
#include <ftw.h>
#include <sys/stat.h>
#include <signal.h>
#include <sys/uio.h>
#include <unistd.h>
#include <ftw.h>
#ifndef GF_BSD_HOST_OS
#include <alloca.h>
#endif /* GF_BSD_HOST_OS */
#ifdef HAVE_LINKAT
#include <fcntl.h>
#endif /* HAVE_LINKAT */
#include <glusterfs/glusterfs.h>
#include <glusterfs/checksum.h>
#include <glusterfs/dict.h>
#include <glusterfs/logging.h>
#include "posix.h"
#include "posix-inode-handle.h"
#include <glusterfs/xlator.h>
#include <glusterfs/defaults.h>
#include <glusterfs/common-utils.h>
#include <glusterfs/compat-errno.h>
#include <glusterfs/compat.h>
#include <glusterfs/byte-order.h>
#include <glusterfs/syscall.h>
#include <glusterfs/statedump.h>
#include <glusterfs/locking.h>
#include <glusterfs/timer.h>
#include "glusterfs3-xdr.h"
#include <glusterfs/hashfn.h>
#include "posix-aio.h"
#include <glusterfs/glusterfs-acl.h>
#include "posix-messages.h"
#include <glusterfs/events.h>
#include "posix-gfid-path.h"
#include <glusterfs/compat-uuid.h>
#include "timer-wheel.h"
/* Array of strings defined in another translation unit. */
extern char *marker_xattrs[];
/* Alignment constant in bytes. NOTE(review): no use of it is visible in this
 * part of the file - confirm it is still needed. */
#define ALIGN_SIZE 4096
/*
 * Helpers for temporarily switching the filesystem uid/gid of the calling
 * thread around an operation:
 *   DECLARE_OLD_FS_ID_VAR - declares the locals that hold the previous ids
 *   SET_FS_ID(uid, gid)   - switches ids, remembering the previous ones
 *   SET_TO_OLD_FS_ID()    - restores the remembered ids
 *
 * HAVE_SET_FSID is unconditionally undefined on the next line, so the
 * #else branch is always the one compiled and all three macros expand to
 * nothing.
 */
#undef HAVE_SET_FSID
#ifdef HAVE_SET_FSID
#define DECLARE_OLD_FS_ID_VAR \
uid_t old_fsuid; \
gid_t old_fsgid;
#define SET_FS_ID(uid, gid) \
do { \
old_fsuid = setfsuid(uid); \
old_fsgid = setfsgid(gid); \
} while (0)
#define SET_TO_OLD_FS_ID() \
do { \
setfsuid(old_fsuid); \
setfsgid(old_fsgid); \
} while (0)
#else
/* No-op variants: callers compile unchanged but no id switching happens. */
#define DECLARE_OLD_FS_ID_VAR
#define SET_FS_ID(uid, gid)
#define SET_TO_OLD_FS_ID()
#endif
/* Setting microseconds or nanoseconds depending on what's supported:
   The passed in `tv` can be
       struct timespec
   if supported (better, because it supports nanosecond resolution) or
       struct timeval
   otherwise.

   Both macro arguments and the whole expansion are parenthesized so that an
   expression argument (e.g. `a + b`) is converted as a unit instead of only
   its last operand being divided by 1000. */
#if HAVE_UTIMENSAT
#define SET_TIMESPEC_NSEC_OR_TIMEVAL_USEC(tv, nanosecs)                        \
    ((tv).tv_nsec = (nanosecs))
#else
#define SET_TIMESPEC_NSEC_OR_TIMEVAL_USEC(tv, nanosecs)                        \
    ((tv).tv_usec = (nanosecs) / 1000)
#endif
/**
 * posix_priv - write this translator's private state to the statedump.
 *
 * Emits a section named "<type>.<name>" followed by the brick base path, its
 * length and the read/write counters.
 *
 * @this: translator instance; may be NULL, in which case nothing is dumped.
 *
 * Returns 0 in all cases.
 */
int32_t
posix_priv(xlator_t *this)
{
    struct posix_private *priv = NULL;
    char key_prefix[GF_DUMP_MAX_BUF_LEN];

    /* Must be checked before this->type / this->name are read below. */
    if (!this)
        return 0;

    (void)snprintf(key_prefix, GF_DUMP_MAX_BUF_LEN, "%s.%s", this->type,
                   this->name);
    gf_proc_dump_add_section("%s", key_prefix);

    priv = this->private;
    if (!priv)
        return 0;

    gf_proc_dump_write("base_path", "%s", priv->base_path);
    gf_proc_dump_write("base_path_length", "%d", priv->base_path_length);
    gf_proc_dump_write("max_read", "%" PRId64, GF_ATOMIC_GET(priv->read_value));
    gf_proc_dump_write("max_write", "%" PRId64,
                       GF_ATOMIC_GET(priv->write_value));

    return 0;
}
/**
 * posix_inode - inode statedump hook; this translator dumps nothing here.
 *
 * @this: translator instance (unused).
 *
 * Returns 0 unconditionally.
 */
int32_t
posix_inode(xlator_t *this)
{
    (void)this;

    return 0;
}
/**
 * notify - translate parent events into child events for the graph above.
 *
 * GF_EVENT_PARENT_UP:   answer with GF_EVENT_CHILD_UP via default_notify().
 * GF_EVENT_PARENT_DOWN: only acted upon when the xlator passed in `data`
 *                       has cleanup_starting set. The janitor timer is
 *                       removed, the function waits until priv->rel_fdcount
 *                       has dropped to zero, and then GF_EVENT_CHILD_DOWN is
 *                       sent to the first parent.
 * Any other event is ignored.
 *
 * @this:  this translator instance
 * @event: GF_EVENT_* code
 * @data:  event payload; read as an xlator_t * (the "victim") here
 *
 * Always returns 0.
 */
int32_t
posix_notify(xlator_t *this, int32_t event, void *data, ...)
{
xlator_t *victim = data;
struct posix_private *priv = this->private;
int ret = 0;
struct timespec sleep_till = {
0,
};
glusterfs_ctx_t *ctx = this->ctx;
switch (event) {
case GF_EVENT_PARENT_UP: {
/* The parent reported that it is up: report this xlator as up too. */
default_notify(this, GF_EVENT_CHILD_UP, data);
} break;
case GF_EVENT_PARENT_DOWN: {
/* Nothing to tear down unless cleanup of the victim has begun. */
if (!victim->cleanup_starting)
break;
if (priv->janitor) {
pthread_mutex_lock(&priv->janitor_mutex);
{
/* Request the janitor task to stop, then remove its timer. */
priv->janitor_task_stop = _gf_true;
ret = gf_tw_del_timer(this->ctx->tw->timer_wheel,
priv->janitor);
if (!ret) {
/* gf_tw_del_timer() returned 0: wait, in one-second timed
 * slices, until janitor_task_stop has been set back to
 * _gf_false (done by janitor_task_done, outside this
 * function). */
clock_gettime(CLOCK_REALTIME, &sleep_till);
sleep_till.tv_sec += 1;
while (priv->janitor_task_stop) {
(void)pthread_cond_timedwait(&priv->janitor_cond,
&priv->janitor_mutex,
&sleep_till);
/* Re-arm the absolute deadline for the next slice. */
clock_gettime(CLOCK_REALTIME, &sleep_till);
sleep_till.tv_sec += 1;
}
}
}
pthread_mutex_unlock(&priv->janitor_mutex);
GF_FREE(priv->janitor);
}
priv->janitor = NULL;
/* Block until rel_fdcount reaches zero; fd_cond is waited on under
 * the context-wide fd_lock. */
pthread_mutex_lock(&ctx->fd_lock);
{
while (priv->rel_fdcount > 0) {
pthread_cond_wait(&priv->fd_cond, &ctx->fd_lock);
}
}
pthread_mutex_unlock(&ctx->fd_lock);
gf_log(this->name, GF_LOG_INFO, "Sending CHILD_DOWN for brick %s",
victim->name);
/* NOTE(review): this->parents is dereferenced without a NULL check -
 * confirm PARENT_DOWN can only arrive when a parent exists. */
default_notify(this->parents->xlator, GF_EVENT_CHILD_DOWN, data);
} break;
default:
/* All other events are deliberately ignored. */
break;
}
return 0;
}
/**
 * mem_acct_init - set up memory accounting for this translator.
 *
 * @this: translator instance.
 *
 * Returns -1 when @this is NULL, otherwise the result of
 * xlator_mem_acct_init() for gf_posix_mt_end + 1 accounting types.
 */
int32_t
mem_acct_init(xlator_t *this)
{
    if (this == NULL)
        return -1;

    return xlator_mem_acct_init(this, gf_posix_mt_end + 1);
}
/**
 * posix_set_owner - make the brick root directory owned by uid:gid.
 *
 * @this: translator instance; this->private supplies base_path.
 * @uid:  wanted owner, or -1 to leave the owner alone.
 * @gid:  wanted group, or -1 to leave the group alone.
 *
 * The directory is lstat()ed first and chown() is skipped when both ids
 * already match (or are -1).
 *
 * Returns 0 on success or when nothing had to change, otherwise the non-zero
 * result of the failing sys_lstat()/sys_chown() call.
 */
static int
posix_set_owner(xlator_t *this, uid_t uid, gid_t gid)
{
    struct posix_private *priv = this->private;
    struct stat brick_st = {
        0,
    };
    int uid_ok = 0;
    int gid_ok = 0;
    int rc = -1;

    rc = sys_lstat(priv->base_path, &brick_st);
    if (rc) {
        gf_msg(this->name, GF_LOG_ERROR, errno, P_MSG_DIR_OPERATION_FAILED,
               "Failed to stat brick path %s", priv->base_path);
        return rc;
    }

    uid_ok = (uid == -1 || brick_st.st_uid == uid);
    gid_ok = (gid == -1 || brick_st.st_gid == gid);
    if (uid_ok && gid_ok)
        return 0;

    rc = sys_chown(priv->base_path, uid, gid);
    if (rc) {
        gf_msg(this->name, GF_LOG_ERROR, errno, P_MSG_DIR_OPERATION_FAILED,
               "Failed to set uid/gid for brick path %s", priv->base_path);
    }

    return rc;
}
/**
 * set_gfid2path_separator - store the gfid2path separator string.
 *
 * @priv: private state whose gfid2path_sep buffer is written.
 * @str:  separator; accepted only when its length is between 1 and 7.
 *
 * Returns 0 when the separator was copied, -1 when the length is out of
 * range (priv is left untouched in that case).
 */
static int
set_gfid2path_separator(struct posix_private *priv, const char *str)
{
    int sep_len = strlen(str);

    if (sep_len <= 0 || sep_len >= 8)
        return -1;

    strcpy(priv->gfid2path_sep, str);
    return 0;
}
/**
 * set_batch_fsync_mode - translate a mode name into priv->batch_fsync_mode.
 *
 * @priv: private state to update.
 * @str:  one of "none", "syncfs", "syncfs-single-fsync",
 *        "syncfs-reverse-fsync" or "reverse-fsync".
 *
 * Returns 0 when @str was recognised, -1 otherwise (priv is not modified).
 */
static int
set_batch_fsync_mode(struct posix_private *priv, const char *str)
{
    static const struct {
        const char *name;
        int mode;
    } known_modes[] = {
        {"none", BATCH_NONE},
        {"syncfs", BATCH_SYNCFS},
        {"syncfs-single-fsync", BATCH_SYNCFS_SINGLE_FSYNC},
        {"syncfs-reverse-fsync", BATCH_SYNCFS_REVERSE_FSYNC},
        {"reverse-fsync", BATCH_REVERSE_FSYNC},
    };
    size_t idx = 0;

    for (idx = 0; idx < sizeof(known_modes) / sizeof(known_modes[0]); idx++) {
        if (strcmp(str, known_modes[idx].name) == 0) {
            priv->batch_fsync_mode = known_modes[idx].mode;
            return 0;
        }
    }

    return -1;
}
#ifdef GF_DARWIN_HOST_OS
/**
 * set_xattr_user_namespace_mode - translate a mode name into
 * priv->xattr_user_namespace (compiled only when GF_DARWIN_HOST_OS is set).
 *
 * @priv: private state to update.
 * @str:  one of "none", "strip", "append" or "both".
 *
 * Returns 0 when @str was recognised, -1 otherwise (priv is not modified).
 */
static int
set_xattr_user_namespace_mode(struct posix_private *priv, const char *str)
{
    if (!strcmp(str, "none")) {
        priv->xattr_user_namespace = XATTR_NONE;
        return 0;
    }
    if (!strcmp(str, "strip")) {
        priv->xattr_user_namespace = XATTR_STRIP;
        return 0;
    }
    if (!strcmp(str, "append")) {
        priv->xattr_user_namespace = XATTR_APPEND;
        return 0;
    }
    if (!strcmp(str, "both")) {
        priv->xattr_user_namespace = XATTR_BOTH;
        return 0;
    }

    return -1;
}
#endif
/**
 * posix_reconfigure - apply changed volume options to a running brick.
 *
 * Reads each reconfigurable option from @options into this->private and
 * performs the follow-up action it needs (chown of the brick root, AIO
 * on/off, (re)spawning the disk-space and health-check threads).
 *
 * @this:    translator instance being reconfigured
 * @options: dictionary holding the new option values
 *
 * Returns 0 once every option has been processed. `ret` starts at -1 and is
 * returned as-is from the `out` label.
 *
 * NOTE(review): a successful posix_spawn_*_thread() call below overwrites
 * `ret` with 0. Whether GF_OPTION_RECONF sets `ret` before jumping to `out`
 * is not visible here - confirm that failures of the later options still
 * make this function return non-zero.
 */
int
posix_reconfigure(xlator_t *this, dict_t *options)
{
int ret = -1;
struct posix_private *priv = NULL;
int32_t uid = -1;
int32_t gid = -1;
char *batch_fsync_mode_str = NULL;
char *gfid2path_sep = NULL;
int32_t force_create_mode = -1;
int32_t force_directory_mode = -1;
int32_t create_mask = -1;
int32_t create_directory_mask = -1;
priv = this->private;
/* Brick ownership: -1 means "leave unchanged"; the result of the chown
 * attempt is not checked here. */
GF_OPTION_RECONF("brick-uid", uid, options, int32, out);
GF_OPTION_RECONF("brick-gid", gid, options, int32, out);
if (uid != -1 || gid != -1)
posix_set_owner(this, uid, gid);
/* Batched fsync tuning. */
GF_OPTION_RECONF("batch-fsync-delay-usec", priv->batch_fsync_delay_usec,
options, uint32, out);
GF_OPTION_RECONF("batch-fsync-mode", batch_fsync_mode_str, options, str,
out);
if (set_batch_fsync_mode(priv, batch_fsync_mode_str) != 0) {
gf_msg(this->name, GF_LOG_ERROR, 0, P_MSG_INVALID_ARGUMENT,
"Unknown mode string: %s", batch_fsync_mode_str);
goto out;
}
/* The separator is rejected when it is empty or longer than 7 chars. */
GF_OPTION_RECONF("gfid2path-separator", gfid2path_sep, options, str, out);
if (set_gfid2path_separator(priv, gfid2path_sep) != 0) {
gf_msg(this->name, GF_LOG_ERROR, 0, P_MSG_INVALID_ARGUMENT,
"Length of separator exceeds 7: %s", gfid2path_sep);
goto out;
}
#ifdef GF_DARWIN_HOST_OS
/* Darwin-only option selecting how the xattr user namespace is handled. */
char *xattr_user_namespace_mode_str = NULL;
GF_OPTION_RECONF("xattr-user-namespace-mode", xattr_user_namespace_mode_str,
options, str, out);
if (set_xattr_user_namespace_mode(priv, xattr_user_namespace_mode_str) !=
0) {
gf_msg(this->name, GF_LOG_ERROR, 0, P_MSG_UNKNOWN_ARGUMENT,
"Unknown xattr user namespace mode string: %s",
xattr_user_namespace_mode_str);
goto out;
}
#endif
/* Switch Linux AIO on or off to match the new setting. */
GF_OPTION_RECONF("linux-aio", priv->aio_configured, options, bool, out);
if (priv->aio_configured)
posix_aio_on(this);
else
posix_aio_off(this);
GF_OPTION_RECONF("update-link-count-parent", priv->update_pgfid_nlinks,
options, bool, out);
GF_OPTION_RECONF("gfid2path", priv->gfid2path, options, bool, out);
GF_OPTION_RECONF("node-uuid-pathinfo", priv->node_uuid_pathinfo, options,
bool, out);
if (priv->node_uuid_pathinfo && (gf_uuid_is_null(priv->glusterd_uuid))) {
gf_msg(this->name, GF_LOG_INFO, 0, P_MSG_UUID_NULL,
"glusterd uuid is NULL, pathinfo xattr would"
" fallback to <hostname>:<export>");
}
GF_OPTION_RECONF("reserve", priv->disk_reserve, options, percent_or_size,
out);
/* option can be any one of percent or bytes: values below 100 are
 * flagged as a percentage ('p'), anything else leaves disk_unit at 0 */
priv->disk_unit = 0;
if (priv->disk_reserve < 100.0)
priv->disk_unit = 'p';
/* A non-zero reserve needs the disk-space checker thread. */
if (priv->disk_reserve) {
ret = posix_spawn_disk_space_check_thread(this);
if (ret) {
gf_msg(this->name, GF_LOG_INFO, 0, P_MSG_DISK_SPACE_CHECK_FAILED,
"Getting disk space check from thread failed");
goto out;
}
}
GF_OPTION_RECONF("health-check-interval", priv->health_check_interval,
options, uint32, out);
GF_OPTION_RECONF("health-check-timeout", priv->health_check_timeout,
options, uint32, out);
/* An interval of 0 leaves the health checker alone. */
if (priv->health_check_interval) {
ret = posix_spawn_health_check_thread(this);
if (ret)
goto out;
}
GF_OPTION_RECONF("shared-brick-count", priv->shared_brick_count, options,
int32, out);
GF_OPTION_RECONF("disable-landfill-purge", priv->disable_landfill_purge,
options, bool, out);
if (priv->disable_landfill_purge) {
gf_log(this->name, GF_LOG_WARNING,
"Janitor WILL NOT purge the landfill directory. "
"Your landfill directory"
" may fill up this brick.");
} else {
gf_msg_debug(this->name, 0,
"Janitor will purge the landfill "
"directory, which is default behavior");
}
/* Mode/mask options are read into int32_t temporaries and then assigned
 * to the corresponding priv fields. */
GF_OPTION_RECONF("force-create-mode", force_create_mode, options, int32,
out);
priv->force_create_mode = force_create_mode;
GF_OPTION_RECONF("force-directory-mode", force_directory_mode, options,
int32, out);
priv->force_directory_mode = force_directory_mode;
GF_OPTION_RECONF("create-mask", create_mask, options, int32, out);
priv->create_mask = create_mask;
GF_OPTION_RECONF("create-directory-mask", create_directory_mask, options,
int32, out);
priv->create_directory_mask = create_directory_mask;
GF_OPTION_RECONF("max-hardlinks", priv->max_hardlinks, options, uint32,
out);
GF_OPTION_RECONF("fips-mode-rchecksum", priv->fips_mode_rchecksum, options,
bool, out);
GF_OPTION_RECONF("ctime", priv->ctime, options, bool, out);
ret = 0;
out:
return ret;
}
/**
 * posix_delete_unlink_entry - nftw() callback that removes one entry found
 * below the unlink directory.
 *
 * @fpath:    path of the entry being visited; NULL is ignored.
 * @sb:       stat data supplied by nftw() (unused).
 * @typeflag: FTW_* classification of the entry.
 * @ftwbuf:   walk position; level 0 is the directory the walk started at.
 *
 * Files, symlinks and un-stat-able entries are unlinked. Directories are
 * removed with rmdir, except the level-0 directory itself, which is kept.
 * A failed removal is logged as a warning.
 *
 * Always returns 0, so a failure on one entry does not abort the tree walk.
 */
int32_t
posix_delete_unlink_entry(const char *fpath, const struct stat *sb,
                          int typeflag, struct FTW *ftwbuf)
{
    int ret = 0;

    if (!fpath)
        return 0;

    switch (typeflag) {
        case FTW_SL:
        case FTW_NS:
        case FTW_F:
        case FTW_SLN:
            ret = sys_unlink(fpath);
            break;
        case FTW_D:
        case FTW_DP:
        case FTW_DNR:
            /* Keep the top-level directory; only its contents are purged. */
            if (ftwbuf->level != 0) {
                ret = sys_rmdir(fpath);
            }
            break;
        default:
            break;
    }

    if (ret) {
        /* The two literals used to concatenate without a separator
         * ("...failedPlease delete..."). */
        gf_msg("posix_delete_unlink_entry", GF_LOG_WARNING, errno,
               P_MSG_HANDLE_CREATE,
               "Deletion of entries %s failed. "
               "Please delete it manually",
               fpath);
    }

    return 0;
}
/**
 * posix_delete_unlink - remove everything below @unlink_path.
 *
 * Walks the tree depth-first without following symlinks
 * (FTW_DEPTH | FTW_PHYS), using at most 2 descriptors, and hands every entry
 * to posix_delete_unlink_entry().
 *
 * @unlink_path: directory whose contents are to be purged.
 *
 * Returns the nftw() result; a non-zero result is also logged.
 */
int32_t
posix_delete_unlink(const char *unlink_path)
{
    const int walk_flags = FTW_DEPTH | FTW_PHYS;
    int rc = nftw(unlink_path, posix_delete_unlink_entry, 2, walk_flags);

    if (rc != 0) {
        gf_msg("posix_delete_unlink", GF_LOG_ERROR, 0, P_MSG_HANDLE_CREATE,
               "Deleting files from %s failed", unlink_path);
    }

    return rc;
}
/**
 * posix_create_unlink_dir - make sure "<base_path>/GF_UNLINK_PATH" exists
 * and is empty.
 *
 * @this: translator instance; this->private supplies base_path.
 *
 * - If the path exists and is a directory, its contents are purged on a
 *   best-effort basis (failures are logged by posix_delete_unlink()).
 * - If it exists but is not a directory, or stat fails with anything other
 *   than ENOENT, an error is logged.
 * - Otherwise the directory is created with mode 0600.
 *
 * Returns 0 on success, -1 on failure.
 */
int32_t
posix_create_unlink_dir(xlator_t *this)
{
    struct posix_private *priv = this->private;
    struct stat stbuf;
    int ret = -1;
    char unlink_path[PATH_MAX] = {
        0,
    };

    (void)snprintf(unlink_path, sizeof(unlink_path), "%s/%s", priv->base_path,
                   GF_UNLINK_PATH);

    ret = sys_stat(unlink_path, &stbuf);
    if (ret == 0) {
        if (!S_ISDIR(stbuf.st_mode)) {
            gf_msg(this->name, GF_LOG_ERROR, 0, P_MSG_HANDLE_CREATE,
                   "Not a directory: %s", unlink_path);
            return -1;
        }
        /* Best effort: leftovers are reported by the callee and do not
         * fail the setup. */
        (void)posix_delete_unlink(unlink_path);
        return 0;
    }

    if (ret == -1 && errno != ENOENT) {
        gf_msg(this->name, GF_LOG_ERROR, errno, P_MSG_HANDLE_CREATE,
               "Checking for %s failed", unlink_path);
        return -1;
    }

    /* The directory does not exist yet: create it. */
    ret = sys_mkdir(unlink_path, 0600);
    if (ret) {
        gf_msg(this->name, GF_LOG_ERROR, errno, P_MSG_HANDLE_CREATE,
               "Creating directory %s failed", unlink_path);
        return -1;
    }

    return 0;
}
/**
 * init - bring up the storage/posix translator on its export directory.
 *
 * Steps, in order:
 *  1. sanity-check the graph position and the "directory" option;
 *  2. allocate the private structure, record base path and hostname;
 *  3. verify extended-attribute support, the volume-id and the root gfid
 *     on the export directory (setting the root gfid on first use);
 *  4. read the boolean/legacy options straight from this->options;
 *  5. hold the brick directory open, raise RLIMIT_NOFILE;
 *  6. set up the handle, landfill and unlink directories;
 *  7. read the remaining options and start the helper threads/timers.
 *
 * @this: translator instance being initialised.
 *
 * Returns 0 on success. On failure a non-zero value is returned, the
 * base_path/hostname/trash_path strings and the private structure are freed
 * and this->private is reset to NULL.
 */
int
posix_init(xlator_t *this)
{
    struct posix_private *_private = NULL;
    data_t *dir_data = NULL;
    data_t *tmp_data = NULL;
    struct stat buf = {
        0,
    };
    gf_boolean_t tmp_bool = 0;
    int ret = 0;
    int op_ret = -1;
    int op_errno = 0;
    ssize_t size = -1;
    uuid_t old_uuid = {
        0,
    };
    uuid_t dict_uuid = {
        0,
    };
    uuid_t gfid = {
        0,
    };
    uuid_t rootgfid = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1};
    char *guuid = NULL;
    char *hostname = NULL;
    int32_t uid = -1;
    int32_t gid = -1;
    char *batch_fsync_mode_str = NULL;
    char *gfid2path_sep = NULL;
    int force_create = -1;
    int force_directory = -1;
    int create_mask = -1;
    int create_directory_mask = -1;

    dir_data = dict_get(this->options, "directory");

    if (this->children) {
        gf_msg(this->name, GF_LOG_CRITICAL, 0, P_MSG_SUBVOLUME_ERROR,
               "FATAL: storage/posix cannot have subvolumes");
        ret = -1;
        goto out;
    }

    if (!this->parents) {
        gf_msg(this->name, GF_LOG_WARNING, 0, P_MSG_VOLUME_DANGLING,
               "Volume is dangling. Please check the volume file.");
    }

    if (!dir_data) {
        gf_msg(this->name, GF_LOG_CRITICAL, 0, P_MSG_EXPORT_DIR_MISSING,
               "Export directory not specified in volume file.");
        ret = -1;
        goto out;
    }

    umask(000);  // umask `masking' is done at the client side

    /* Check whether the specified directory exists, if not log it. */
    op_ret = sys_stat(dir_data->data, &buf);
    if ((op_ret != 0) || !S_ISDIR(buf.st_mode)) {
        gf_msg(this->name, GF_LOG_ERROR, 0, P_MSG_DIR_OPERATION_FAILED,
               "Directory '%s' doesn't exist, exiting.", dir_data->data);
        ret = -1;
        goto out;
    }

    _private = GF_CALLOC(1, sizeof(*_private), gf_posix_mt_posix_private);
    if (!_private) {
        ret = -1;
        goto out;
    }

    _private->base_path = gf_strdup(dir_data->data);
    if (!_private->base_path) {
        /* Without this check strlen() below would be handed NULL. */
        ret = -1;
        goto out;
    }
    _private->base_path_length = strlen(_private->base_path);

    ret = dict_get_str(this->options, "hostname", &hostname);
    if (ret == 0) {
        /* Keep a private copy: _private->hostname is released with
         * GF_FREE() both on the error path below and in posix_fini(), so
         * it must not alias the string stored in the options dict. */
        _private->hostname = gf_strdup(hostname);
        if (!_private->hostname) {
            ret = -1;
            goto out;
        }
    } else {
        _private->hostname = GF_CALLOC(256, sizeof(char), gf_common_mt_char);
        if (!_private->hostname) {
            ret = -1;
            goto out;
        }
        ret = gethostname(_private->hostname, 256);
        if (ret < 0) {
            gf_msg(this->name, GF_LOG_WARNING, errno, P_MSG_HOSTNAME_MISSING,
                   "could not find hostname ");
        }
    }

    /* Check for Extended attribute support, if not present, log it */
    op_ret = sys_lsetxattr(dir_data->data, "trusted.glusterfs.test", "working",
                           8, 0);
    if (op_ret != -1) {
        ret = sys_lremovexattr(dir_data->data, "trusted.glusterfs.test");
        if (ret) {
            gf_msg(this->name, GF_LOG_DEBUG, errno, P_MSG_INVALID_OPTION,
                   "failed to remove xattr: "
                   "trusted.glusterfs.test");
        }
    } else {
        tmp_data = dict_get(this->options, "mandate-attribute");
        if (tmp_data) {
            if (gf_string2boolean(tmp_data->data, &tmp_bool) == -1) {
                gf_msg(this->name, GF_LOG_ERROR, 0, P_MSG_INVALID_OPTION,
                       "wrong option provided for key "
                       "\"mandate-attribute\"");
                ret = -1;
                goto out;
            }
            if (!tmp_bool) {
                gf_msg(this->name, GF_LOG_WARNING, 0, P_MSG_XATTR_NOTSUP,
                       "Extended attribute not supported, "
                       "starting as per option");
            } else {
                gf_msg(this->name, GF_LOG_CRITICAL, 0, P_MSG_XATTR_NOTSUP,
                       "Extended attribute not supported, "
                       "exiting.");
                ret = -1;
                goto out;
            }
        } else {
            gf_msg(this->name, GF_LOG_CRITICAL, 0, P_MSG_XATTR_NOTSUP,
                   "Extended attribute not supported, exiting.");
            ret = -1;
            goto out;
        }
    }

    /* When a volume-id is configured it must match the one stored on the
     * export directory. */
    tmp_data = dict_get(this->options, "volume-id");
    if (tmp_data) {
        op_ret = gf_uuid_parse(tmp_data->data, dict_uuid);
        if (op_ret < 0) {
            gf_msg(this->name, GF_LOG_ERROR, 0, P_MSG_INVALID_VOLUME_ID,
                   "wrong volume-id (%s) set"
                   " in volume file",
                   tmp_data->data);
            ret = -1;
            goto out;
        }
        size = sys_lgetxattr(dir_data->data, "trusted.glusterfs.volume-id",
                             old_uuid, 16);
        if (size == 16) {
            if (gf_uuid_compare(old_uuid, dict_uuid)) {
                gf_msg(this->name, GF_LOG_ERROR, 0, P_MSG_INVALID_VOLUME_ID,
                       "mismatching volume-id (%s) received. "
                       "already is a part of volume %s ",
                       tmp_data->data, uuid_utoa(old_uuid));
                gf_event(EVENT_POSIX_ALREADY_PART_OF_VOLUME,
                         "volume-id=%s;brick=%s:%s", uuid_utoa(old_uuid),
                         _private->hostname, _private->base_path);
                ret = -1;
                goto out;
            }
        } else if ((size == -1) && (errno == ENODATA || errno == ENOATTR)) {
            gf_msg(this->name, GF_LOG_ERROR, errno, P_MSG_VOLUME_ID_ABSENT,
                   "Extended attribute trusted.glusterfs."
                   "volume-id is absent");
            gf_event(EVENT_POSIX_BRICK_NOT_IN_VOLUME, "brick=%s:%s",
                     _private->hostname, _private->base_path);
            ret = -1;
            goto out;
        } else if ((size == -1) && (errno != ENODATA) && (errno != ENOATTR)) {
            /* Wrong 'volume-id' is set, it should be error */
            gf_event(EVENT_POSIX_BRICK_VERIFICATION_FAILED, "brick=%s:%s",
                     _private->hostname, _private->base_path);
            gf_msg(this->name, GF_LOG_WARNING, errno,
                   P_MSG_VOLUME_ID_FETCH_FAILED,
                   "%s: failed to fetch volume-id", dir_data->data);
            ret = -1;
            goto out;
        } else {
            ret = -1;
            gf_event(EVENT_POSIX_BRICK_VERIFICATION_FAILED, "brick=%s:%s",
                     _private->hostname, _private->base_path);
            gf_msg(this->name, GF_LOG_ERROR, 0, P_MSG_VOLUME_ID_FETCH_FAILED,
                   "failed to fetch proper volume id from export");
            goto out;
        }
    }

    /* Now check if the export directory has some other 'gfid',
       other than that of root '/' */
    size = sys_lgetxattr(dir_data->data, "trusted.gfid", gfid, 16);
    if (size == 16) {
        if (!__is_root_gfid(gfid)) {
            gf_msg(this->name, GF_LOG_WARNING, errno, P_MSG_GFID_SET_FAILED,
                   "%s: gfid (%s) is not that of glusterfs '/' ",
                   dir_data->data, uuid_utoa(gfid));
            ret = -1;
            goto out;
        }
    } else if (size != -1) {
        /* Wrong 'gfid' is set, it should be error */
        gf_msg(this->name, GF_LOG_WARNING, errno, P_MSG_GFID_SET_FAILED,
               "%s: wrong value set as gfid", dir_data->data);
        ret = -1;
        goto out;
    } else if ((size == -1) && (errno != ENODATA) && (errno != ENOATTR)) {
        /* Wrong 'gfid' is set, it should be error */
        gf_msg(this->name, GF_LOG_WARNING, errno, P_MSG_GFID_SET_FAILED,
               "%s: failed to fetch gfid", dir_data->data);
        ret = -1;
        goto out;
    } else {
        /* First time volume, set the GFID */
        size = sys_lsetxattr(dir_data->data, "trusted.gfid", rootgfid, 16,
                             XATTR_CREATE);
        if (size == -1) {
            gf_msg(this->name, GF_LOG_ERROR, errno, P_MSG_GFID_SET_FAILED,
                   "%s: failed to set gfid", dir_data->data);
            ret = -1;
            goto out;
        }
    }

    ret = 0;

    /* Probe for POSIX ACL support; absence is only reported. */
    size = sys_lgetxattr(dir_data->data, POSIX_ACL_ACCESS_XATTR, NULL, 0);
    if ((size < 0) && (errno == ENOTSUP)) {
        gf_msg(this->name, GF_LOG_WARNING, errno, P_MSG_ACL_NOTSUP,
               "Posix access control list is not supported.");
        gf_event(EVENT_POSIX_ACL_NOT_SUPPORTED, "brick=%s:%s",
                 _private->hostname, _private->base_path);
    }

    /*
     * _XOPEN_PATH_MAX is the longest file path len we MUST
     * support according to POSIX standard. When prepended
     * by the brick base path it may exceed backed filesystem
     * capacity (which MAY be bigger than _XOPEN_PATH_MAX). If
     * this is the case, chdir() to the brick base path and
     * use relative paths when they are too long. See also
     * MAKE_REAL_PATH in posix-handle.h
     */
    _private->path_max = pathconf(_private->base_path, _PC_PATH_MAX);
    if (_private->path_max != -1 &&
        _XOPEN_PATH_MAX + _private->base_path_length > _private->path_max) {
        ret = chdir(_private->base_path);
        if (ret) {
            gf_msg(this->name, GF_LOG_ERROR, 0, P_MSG_BASEPATH_CHDIR_FAILED,
                   "chdir() to \"%s\" failed", _private->base_path);
            goto out;
        }
#ifdef __NetBSD__
        /*
         * At least on NetBSD, the chdir() above uncovers a
         * race condition which cause file lookup to fail
         * with ENODATA for a few seconds. The volume quickly
         * reaches a sane state, but regression tests are fast
         * enough to choke on it. The reason is obscure (as
         * often with race conditions), but sleeping here for
         * a second seems to workaround the problem.
         */
        sleep(1);
#endif
    }

    LOCK_INIT(&_private->lock);
    GF_ATOMIC_INIT(_private->read_value, 0);
    GF_ATOMIC_INIT(_private->write_value, 0);

    _private->export_statfs = 1;
    tmp_data = dict_get(this->options, "export-statfs-size");
    if (tmp_data) {
        if (gf_string2boolean(tmp_data->data, &_private->export_statfs) == -1) {
            ret = -1;
            gf_msg(this->name, GF_LOG_ERROR, 0, P_MSG_INVALID_OPTION_VAL,
                   "'export-statfs-size' takes only boolean "
                   "options");
            goto out;
        }
        if (!_private->export_statfs)
            gf_msg_debug(this->name, 0, "'statfs()' returns dummy size");
    }

    _private->background_unlink = 0;
    tmp_data = dict_get(this->options, "background-unlink");
    if (tmp_data) {
        if (gf_string2boolean(tmp_data->data, &_private->background_unlink) ==
            -1) {
            ret = -1;
            gf_msg(this->name, GF_LOG_ERROR, 0, P_MSG_INVALID_OPTION_VAL,
                   "'background-unlink'"
                   " takes only boolean options");
            goto out;
        }
        if (_private->background_unlink)
            gf_msg_debug(this->name, 0,
                         "unlinks will be performed in background");
    }

    tmp_data = dict_get(this->options, "o-direct");
    if (tmp_data) {
        if (gf_string2boolean(tmp_data->data, &_private->o_direct) == -1) {
            ret = -1;
            gf_msg(this->name, GF_LOG_ERROR, 0, P_MSG_INVALID_OPTION_VAL,
                   "wrong option provided for 'o-direct'");
            goto out;
        }
        if (_private->o_direct)
            gf_msg_debug(this->name, 0,
                         "o-direct mode is enabled"
                         " (O_DIRECT for every open)");
    }

    tmp_data = dict_get(this->options, "update-link-count-parent");
    if (tmp_data) {
        if (gf_string2boolean(tmp_data->data, &_private->update_pgfid_nlinks) ==
            -1) {
            ret = -1;
            gf_msg(this->name, GF_LOG_ERROR, 0, P_MSG_INVALID_OPTION,
                   "wrong value provided "
                   "for 'update-link-count-parent'");
            goto out;
        }
        if (_private->update_pgfid_nlinks)
            gf_msg_debug(this->name, 0,
                         "update-link-count-parent"
                         " is enabled. Thus for each file an "
                         "extended attribute representing the "
                         "number of hardlinks for that file "
                         "within the same parent directory is"
                         " set.");
    }

    ret = dict_get_str(this->options, "glusterd-uuid", &guuid);
    if (!ret) {
        if (gf_uuid_parse(guuid, _private->glusterd_uuid))
            gf_msg(this->name, GF_LOG_WARNING, 0, P_MSG_INVALID_NODE_UUID,
                   "Cannot parse "
                   "glusterd (node) UUID, node-uuid xattr "
                   "request would return - \"No such attribute\"");
    } else {
        gf_msg_debug(this->name, 0,
                     "No glusterd (node) UUID passed -"
                     " node-uuid xattr request will return \"No such"
                     " attribute\"");
    }
    ret = 0;

    GF_OPTION_INIT("janitor-sleep-duration", _private->janitor_sleep_duration,
                   int32, out);

    /* performing open dir on brick dir locks the brick dir
     * and prevents it from being unmounted
     */
    _private->mount_lock = sys_opendir(dir_data->data);
    if (!_private->mount_lock) {
        ret = -1;
        op_errno = errno;
        gf_msg(this->name, GF_LOG_ERROR, 0, P_MSG_DIR_OPERATION_FAILED,
               "Could not lock brick directory (%s)", strerror(op_errno));
        goto out;
    }

#ifndef GF_DARWIN_HOST_OS
    {
        /* Try for 1M open files, fall back to 64k; failure of both is
         * only a warning. */
        struct rlimit lim;

        lim.rlim_cur = 1048576;
        lim.rlim_max = 1048576;
        if (setrlimit(RLIMIT_NOFILE, &lim) == -1) {
            gf_msg(this->name, GF_LOG_WARNING, errno, P_MSG_SET_ULIMIT_FAILED,
                   "Failed to set 'ulimit -n "
                   " 1048576'");
            lim.rlim_cur = 65536;
            lim.rlim_max = 65536;
            if (setrlimit(RLIMIT_NOFILE, &lim) == -1) {
                gf_msg(this->name, GF_LOG_WARNING, errno,
                       P_MSG_SET_FILE_MAX_FAILED,
                       "Failed to set maximum allowed open "
                       "file descriptors to 64k");
            } else {
                gf_msg(this->name, GF_LOG_INFO, 0, P_MSG_MAX_FILE_OPEN,
                       "Maximum allowed "
                       "open file descriptors set to 65536");
            }
        }
    }
#endif

    _private->shared_brick_count = 1;
    ret = dict_get_int32(this->options, "shared-brick-count",
                         &_private->shared_brick_count);
    if (ret == -1) {
        gf_msg(this->name, GF_LOG_ERROR, 0, P_MSG_INVALID_OPTION_VAL,
               "'shared-brick-count' takes only integer "
               "values");
        goto out;
    }

    /* The helpers below reach the private data through this->private. */
    this->private = (void *)_private;

    op_ret = posix_handle_init(this);
    if (op_ret == -1) {
        gf_msg(this->name, GF_LOG_ERROR, 0, P_MSG_HANDLE_CREATE,
               "Posix handle setup failed");
        ret = -1;
        goto out;
    }

    op_ret = posix_handle_trash_init(this);
    if (op_ret < 0) {
        gf_msg(this->name, GF_LOG_ERROR, 0, P_MSG_HANDLE_CREATE_TRASH,
               "Posix landfill setup failed");
        ret = -1;
        goto out;
    }

    op_ret = posix_create_unlink_dir(this);
    if (op_ret == -1) {
        gf_msg(this->name, GF_LOG_ERROR, 0, P_MSG_HANDLE_CREATE,
               "Creation of unlink directory failed");
        ret = -1;
        goto out;
    }

    _private->aio_init_done = _gf_false;
    _private->aio_capable = _gf_false;

    GF_OPTION_INIT("brick-uid", uid, int32, out);
    GF_OPTION_INIT("brick-gid", gid, int32, out);
    if (uid != -1 || gid != -1)
        posix_set_owner(this, uid, gid);

    GF_OPTION_INIT("linux-aio", _private->aio_configured, bool, out);
    if (_private->aio_configured) {
        op_ret = posix_aio_on(this);
        if (op_ret == -1) {
            gf_msg(this->name, GF_LOG_ERROR, 0, P_MSG_POSIX_AIO,
                   "Posix AIO init failed");
            ret = -1;
            goto out;
        }
    }

    GF_OPTION_INIT("node-uuid-pathinfo", _private->node_uuid_pathinfo, bool,
                   out);
    if (_private->node_uuid_pathinfo &&
        (gf_uuid_is_null(_private->glusterd_uuid))) {
        gf_msg(this->name, GF_LOG_INFO, 0, P_MSG_UUID_NULL,
               "glusterd uuid is NULL, pathinfo xattr would"
               " fallback to <hostname>:<export>");
    }

    _private->disk_space_check_active = _gf_false;
    _private->disk_space_full = 0;
    GF_OPTION_INIT("reserve", _private->disk_reserve, percent_or_size, out);

    /* option can be any one of percent or bytes */
    _private->disk_unit = 0;
    if (_private->disk_reserve < 100.0)
        _private->disk_unit = 'p';

    if (_private->disk_reserve) {
        ret = posix_spawn_disk_space_check_thread(this);
        if (ret) {
            gf_msg(this->name, GF_LOG_INFO, 0, P_MSG_DISK_SPACE_CHECK_FAILED,
                   "Getting disk space check from thread failed ");
            goto out;
        }
    }

    _private->health_check_active = _gf_false;
    GF_OPTION_INIT("health-check-interval", _private->health_check_interval,
                   uint32, out);
    GF_OPTION_INIT("health-check-timeout", _private->health_check_timeout,
                   uint32, out);
    if (_private->health_check_interval) {
        ret = posix_spawn_health_check_thread(this);
        if (ret)
            goto out;
    }

    posix_janitor_timer_start(this);

    pthread_mutex_init(&_private->fsync_mutex, NULL);
    pthread_cond_init(&_private->fsync_cond, NULL);
    pthread_mutex_init(&_private->janitor_mutex, NULL);
    pthread_cond_init(&_private->janitor_cond, NULL);
    pthread_cond_init(&_private->fd_cond, NULL);
    INIT_LIST_HEAD(&_private->fsyncs);
    _private->rel_fdcount = 0;

    ret = posix_spawn_ctx_janitor_thread(this);
    if (ret)
        goto out;

    ret = gf_thread_create(&_private->fsyncer, NULL, posix_fsyncer, this,
                           "posixfsy");
    if (ret) {
        gf_msg(this->name, GF_LOG_ERROR, errno,
               P_MSG_FSYNCER_THREAD_CREATE_FAILED,
               "fsyncer thread creation failed");
        goto out;
    }

    GF_OPTION_INIT("batch-fsync-mode", batch_fsync_mode_str, str, out);
    if (set_batch_fsync_mode(_private, batch_fsync_mode_str) != 0) {
        gf_msg(this->name, GF_LOG_ERROR, 0, P_MSG_INVALID_ARGUMENT,
               "Unknown mode string: %s", batch_fsync_mode_str);
        /* ret is 0 here from the successful thread creation above; make
         * the rejected option an actual init failure. */
        ret = -1;
        goto out;
    }

    GF_OPTION_INIT("gfid2path", _private->gfid2path, bool, out);

    GF_OPTION_INIT("gfid2path-separator", gfid2path_sep, str, out);
    if (set_gfid2path_separator(_private, gfid2path_sep) != 0) {
        gf_msg(this->name, GF_LOG_ERROR, 0, P_MSG_INVALID_ARGUMENT,
               "Length of separator exceeds 7: %s", gfid2path_sep);
        ret = -1;
        goto out;
    }

#ifdef GF_DARWIN_HOST_OS
    char *xattr_user_namespace_mode_str = NULL;

    GF_OPTION_INIT("xattr-user-namespace-mode", xattr_user_namespace_mode_str,
                   str, out);
    if (set_xattr_user_namespace_mode(_private,
                                      xattr_user_namespace_mode_str) != 0) {
        gf_msg(this->name, GF_LOG_ERROR, 0, P_MSG_INVALID_ARGUMENT,
               "Unknown xattr user namespace mode string: %s",
               xattr_user_namespace_mode_str);
        ret = -1;
        goto out;
    }
#endif

    GF_OPTION_INIT("batch-fsync-delay-usec", _private->batch_fsync_delay_usec,
                   uint32, out);

    GF_OPTION_INIT("disable-landfill-purge", _private->disable_landfill_purge,
                   bool, out);
    if (_private->disable_landfill_purge) {
        gf_msg(this->name, GF_LOG_WARNING, 0, 0,
               "Janitor WILL NOT purge the landfill directory. "
               "Your landfill directory"
               " may fill up this brick.");
    }

    GF_OPTION_INIT("force-create-mode", force_create, int32, out);
    _private->force_create_mode = force_create;

    GF_OPTION_INIT("force-directory-mode", force_directory, int32, out);
    _private->force_directory_mode = force_directory;

    GF_OPTION_INIT("create-mask", create_mask, int32, out);
    _private->create_mask = create_mask;

    GF_OPTION_INIT("create-directory-mask", create_directory_mask, int32, out);
    _private->create_directory_mask = create_directory_mask;

    GF_OPTION_INIT("max-hardlinks", _private->max_hardlinks, uint32, out);

    GF_OPTION_INIT("fips-mode-rchecksum", _private->fips_mode_rchecksum, bool,
                   out);

    GF_OPTION_INIT("ctime", _private->ctime, bool, out);

out:
    if (ret) {
        if (_private) {
            GF_FREE(_private->base_path);
            GF_FREE(_private->hostname);
            GF_FREE(_private->trash_path);
            GF_FREE(_private);
        }
        this->private = NULL;
    }
    return ret;
}
/**
 * posix_fini - tear down the translator and release its private state.
 *
 * Stops the health-check and disk-space threads, removes the janitor timer,
 * drops this instance from ctx->pxl_count (joining the context-wide janitor
 * thread when the count reaches zero), stops the fsyncer thread, closes the
 * brick directory handle and finally destroys the synchronisation
 * primitives and frees the private structure.
 *
 * @this: translator instance; nothing is done when this->private is NULL.
 */
void
posix_fini(xlator_t *this)
{
    struct posix_private *priv = this->private;
    gf_boolean_t health_check = _gf_false;
    glusterfs_ctx_t *ctx = this->ctx;
    uint32_t count;
    int ret = 0;

    if (!priv)
        return;

    /* Read and clear the active flag atomically with respect to other
     * users of priv->lock. */
    LOCK(&priv->lock);
    {
        health_check = priv->health_check_active;
        priv->health_check_active = _gf_false;
    }
    UNLOCK(&priv->lock);

    if (health_check) {
        (void)gf_thread_cleanup_xint(priv->health_check);
        priv->health_check = 0;
    }

    if (priv->disk_space_check) {
        priv->disk_space_check_active = _gf_false;
        (void)gf_thread_cleanup_xint(priv->disk_space_check);
        priv->disk_space_check = 0;
    }

    if (priv->janitor) {
        /*TODO: Make sure the synctask is also complete */
        ret = gf_tw_del_timer(this->ctx->tw->timer_wheel, priv->janitor);
        if (ret < 0) {
            gf_msg(this->name, GF_LOG_ERROR, errno, P_MSG_TIMER_DELETE_FAILED,
                   "Failed to delete janitor timer");
        }
        GF_FREE(priv->janitor);
        priv->janitor = NULL;
    }

    /* Drop this instance from the context-wide count; when it reaches
     * zero, ctx->fd_cond is signalled and the janitor thread is joined. */
    pthread_mutex_lock(&ctx->fd_lock);
    {
        count = --ctx->pxl_count;
        if (count == 0) {
            pthread_cond_signal(&ctx->fd_cond);
        }
    }
    pthread_mutex_unlock(&ctx->fd_lock);

    if (count == 0) {
        pthread_join(ctx->janitor, NULL);
    }

    if (priv->fsyncer) {
        (void)gf_thread_cleanup_xint(priv->fsyncer);
        priv->fsyncer = 0;
    }

    /*unlock brick dir*/
    if (priv->mount_lock)
        (void)sys_closedir(priv->mount_lock);

    GF_FREE(priv->base_path);
    LOCK_DESTROY(&priv->lock);
    pthread_mutex_destroy(&priv->fsync_mutex);
    pthread_cond_destroy(&priv->fsync_cond);
    pthread_mutex_destroy(&priv->janitor_mutex);
    pthread_cond_destroy(&priv->janitor_cond);
    /* fd_cond is initialised in posix_init() together with the primitives
     * above and has to be released with them. */
    pthread_cond_destroy(&priv->fd_cond);
    GF_FREE(priv->hostname);
    GF_FREE(priv->trash_path);
    GF_FREE(priv);
    this->private = NULL;
}
struct volume_options posix_options[] = {
{.key = {"o-direct"}, .type = GF_OPTION_TYPE_BOOL},
{.key = {"directory"},
.type = GF_OPTION_TYPE_PATH,
.default_value = "{{brick.path}}"},
{.key = {"hostname"}, .type = GF_OPTION_TYPE_ANY},
{.key = {"export-statfs-size"},
.type = GF_OPTION_TYPE_BOOL,
.default_value = "on"},
{.key = {"mandate-attribute"}, .type = GF_OPTION_TYPE_BOOL},
{.key = {"background-unlink"}, .type = GF_OPTION_TYPE_BOOL},
{.key = {"janitor-sleep-duration"},
.type = GF_OPTION_TYPE_INT,
.min = 1,
.validate = GF_OPT_VALIDATE_MIN,
.default_value = "10",
.description = "Interval (in seconds) between times the internal "
"'landfill' directory is emptied."},
{.key = {"volume-id"},
.type = GF_OPTION_TYPE_ANY,
.default_value = "{{brick.volumeid}}"},
{.key = {"glusterd-uuid"}, .type = GF_OPTION_TYPE_STR},
{.key = {"linux-aio"},
.type = GF_OPTION_TYPE_BOOL,
.default_value = "off",
.description = "Support for native Linux AIO",
.op_version = {1},
.flags = OPT_FLAG_SETTABLE | OPT_FLAG_DOC},
{.key = {"brick-uid"},
.type = GF_OPTION_TYPE_INT,
.min = -1,
.validate = GF_OPT_VALIDATE_MIN,
.default_value = "-1",
.description = "Support for setting uid of brick's owner",
.op_version = {1},
.flags = OPT_FLAG_SETTABLE | OPT_FLAG_DOC},
{.key = {"brick-gid"},
.type = GF_OPTION_TYPE_INT,
.min = -1,
.validate = GF_OPT_VALIDATE_MIN,
.default_value = "-1",
.description = "Support for setting gid of brick's owner",
.op_version = {1},
.flags = OPT_FLAG_SETTABLE | OPT_FLAG_DOC},
{.key = {"node-uuid-pathinfo"},
.type = GF_OPTION_TYPE_BOOL,
.default_value = "off",
.description = "return glusterd's node-uuid in pathinfo xattr"
" string instead of hostname",
.op_version = {3},
.flags = OPT_FLAG_SETTABLE | OPT_FLAG_DOC},
{.key = {"health-check-interval"},
.type = GF_OPTION_TYPE_INT,
.min = 0,
.default_value = "30",
.validate = GF_OPT_VALIDATE_MIN,
.description = "Interval in seconds for a filesystem health check, "
"set to 0 to disable",
.op_version = {3},
.flags = OPT_FLAG_SETTABLE | OPT_FLAG_DOC},
{.key = {"health-check-timeout"},
.type = GF_OPTION_TYPE_INT,
.min = 0,
.default_value = "20",
.validate = GF_OPT_VALIDATE_MIN,
.description =
"Interval in seconds to wait aio_write finish for health check, "
"set to 0 to disable",
.op_version = {GD_OP_VERSION_4_0_0},
.flags = OPT_FLAG_SETTABLE | OPT_FLAG_DOC},
{.key = {"reserve"},
.type = GF_OPTION_TYPE_PERCENT_OR_SIZET,
.min = 0,
.default_value = "1",
.validate = GF_OPT_VALIDATE_MIN,
.description = "Percentage/Size of disk space to be reserved."
" Set to 0 to disable",
.op_version = {GD_OP_VERSION_3_13_0},
.flags = OPT_FLAG_SETTABLE | OPT_FLAG_DOC},
{.key = {"batch-fsync-mode"},
.type = GF_OPTION_TYPE_STR,
.default_value = "reverse-fsync",
.description =
"Possible values:\n"
"\t- syncfs: Perform one syncfs() on behalf oa batch"
"of fsyncs.\n"
"\t- syncfs-single-fsync: Perform one syncfs() on behalf of a batch"
" of fsyncs and one fsync() per batch.\n"
"\t- syncfs-reverse-fsync: Perform one syncfs() on behalf of a batch"
" of fsyncs and fsync() each file in the batch in reverse order.\n"
" in reverse order.\n"
"\t- reverse-fsync: Perform fsync() of each file in the batch in"
" reverse order.",
.op_version = {3},
.flags = OPT_FLAG_SETTABLE | OPT_FLAG_DOC},
{.key = {"batch-fsync-delay-usec"},
.type = GF_OPTION_TYPE_INT,
.default_value = "0",
.description = "Num of usecs to wait for aggregating fsync"
" requests",
.op_version = {3},
.flags = OPT_FLAG_SETTABLE | OPT_FLAG_DOC},
{.key = {"update-link-count-parent"},
.type = GF_OPTION_TYPE_BOOL,
.default_value = "off",
.description = "Enable placeholders for gfid to path conversion",
.op_version = {GD_OP_VERSION_RHS_3_0},
.flags = OPT_FLAG_SETTABLE | OPT_FLAG_DOC},
/* "gfid2path": boolean option whose default depends on the platform:
 * "off" when __NetBSD__ is defined, "on" otherwise. It is settable but,
 * unlike its neighbours, does not carry OPT_FLAG_DOC. */
{.key = {"gfid2path"},
.type = GF_OPTION_TYPE_BOOL,
#ifdef __NetBSD__
/*
* NetBSD storage of extended attributes for UFS1 badly
* scales when the list of extended attributes names rises.
* This option can add as many extended attributes names
* as we have files, hence we keep it disabled for performance
* sake.
*/
.default_value = "off",
#else
.default_value = "on",
#endif
.description = "Enable logging metadata for gfid to path conversion",
.op_version = {GD_OP_VERSION_3_12_0},
.flags = OPT_FLAG_SETTABLE},
/* "gfid2path-separator": string option, default ":". */
{
.key = {"gfid2path-separator"},
.type = GF_OPTION_TYPE_STR,
.default_value = ":",
.op_version = {GD_OP_VERSION_3_12_0},
.flags = OPT_FLAG_SETTABLE | OPT_FLAG_DOC,
.description = "Path separator for glusterfs.gfidtopath virt xattr",
},
#if GF_DARWIN_HOST_OS
/* "xattr-user-namespace-mode": string option, default "none". This entry
 * is only part of the table when GF_DARWIN_HOST_OS evaluates non-zero.
 * The description lists the modes "None" and "Strip". */
{.key = {"xattr-user-namespace-mode"},
.type = GF_OPTION_TYPE_STR,
.default_value = "none",
.description =
"Option to control XATTR user namespace on the raw filesystem: "
"\t- None: Will use the user namespace, so files will be exchangeable "
"with Linux.\n"
" The raw filesystem will not be compatible with OS X Finder.\n"
"\t- Strip: Will strip the user namespace before setting. The raw "
"filesystem will work in OS X.\n",
.op_version = {GD_OP_VERSION_RHS_3_0},
.flags = OPT_FLAG_SETTABLE | OPT_FLAG_DOC},
#endif
/* "shared-brick-count": integer option, default 1. No op_version or
 * range validation is set on this entry. */
{.key = {"shared-brick-count"},
.type = GF_OPTION_TYPE_INT,
.default_value = "1",
.description =
"Number of bricks sharing the same backend export."
" Useful for displaying the proper usable size through statvfs() "
"call (df command)",
.flags = OPT_FLAG_SETTABLE | OPT_FLAG_DOC},
/* "disable-landfill-purge": boolean option, off by default, tagged
 * "diagnosis". No .flags are set on this entry. */
{.key = {"disable-landfill-purge"},
.type = GF_OPTION_TYPE_BOOL,
.default_value = "off",
.op_version = {GD_OP_VERSION_4_0_0},
.tags = {"diagnosis"},
.description = "Disable glusterfs/landfill purges. "
"WARNING: This can fill up a brick."},
/* "force-create-mode": integer option bounded to [0000, 0777] (octal
 * literals), default "0000".
 * A single GF_OPT_VALIDATE_BOTH is used so that both bounds are enforced:
 * designating .validate twice makes the later initializer override the
 * earlier one, which would leave the minimum unchecked. */
{.key = {"force-create-mode"},
.type = GF_OPTION_TYPE_INT,
.min = 0000,
.max = 0777,
.default_value = "0000",
.validate = GF_OPT_VALIDATE_BOTH,
.description = "Mode bit permission that will always be set on a file."},
/* "force-directory-mode": integer option bounded to [0000, 0777] (octal
 * literals), default "0000".
 * A single GF_OPT_VALIDATE_BOTH is used so that both bounds are enforced:
 * designating .validate twice makes the later initializer override the
 * earlier one, which would leave the minimum unchecked. */
{.key = {"force-directory-mode"},
.type = GF_OPTION_TYPE_INT,
.min = 0000,
.max = 0777,
.default_value = "0000",
.validate = GF_OPT_VALIDATE_BOTH,
.description = "Mode bit permission that will be always set on directory"},
/* "create-mask": integer option bounded to [0000, 0777] (octal literals),
 * default "0777".
 * A single GF_OPT_VALIDATE_BOTH is used so that both bounds are enforced:
 * designating .validate twice makes the later initializer override the
 * earlier one, which would leave the minimum unchecked.
 * The description pieces are concatenated by the compiler, hence the
 * explicit leading space on the continuation literal. */
{.key = {"create-mask"},
.type = GF_OPTION_TYPE_INT,
.min = 0000,
.max = 0777,
.default_value = "0777",
.validate = GF_OPT_VALIDATE_BOTH,
.description = "Any bit not set here will be removed from the"
" modes set on a file when it is created"},
/* "create-directory-mask": integer option bounded to [0000, 0777] (octal
 * literals), default "0777".
 * A single GF_OPT_VALIDATE_BOTH is used so that both bounds are enforced:
 * designating .validate twice makes the later initializer override the
 * earlier one, which would leave the minimum unchecked.
 * The description pieces are concatenated by the compiler, hence the
 * explicit leading space on the continuation literal. */
{.key = {"create-directory-mask"},
.type = GF_OPTION_TYPE_INT,
.min = 0000,
.max = 0777,
.default_value = "0777",
.validate = GF_OPT_VALIDATE_BOTH,
.description = "Any bit not set here will be removed from the"
" modes set on a directory when it is created"},
/* "max-hardlinks": integer option, default 100, tagged "posix". Only the
 * lower bound (0) is validated; per the description, 0 means unlimited. */
{
.key = {"max-hardlinks"},
.type = GF_OPTION_TYPE_INT,
.default_value = "100",
.min = 0,
.validate = GF_OPT_VALIDATE_MIN,
.op_version = {GD_OP_VERSION_4_0_0},
.tags = {"posix"},
.flags = OPT_FLAG_SETTABLE | OPT_FLAG_DOC,
.description = "max number of hardlinks allowed on any one inode.\n"
"0 is unlimited, 1 prevents any hardlinking at all.",
},
/* "fips-mode-rchecksum": boolean option, off by default, tagged "posix".
 * The description pieces are concatenated by the compiler, hence the
 * explicit leading space on the continuation literal. */
{.key = {"fips-mode-rchecksum"},
.type = GF_OPTION_TYPE_BOOL,
.default_value = "off",
.op_version = {GD_OP_VERSION_4_0_0},
.flags = OPT_FLAG_SETTABLE,
.tags = {"posix"},
.description = "If enabled, posix_rchecksum uses the FIPS compliant"
" SHA256 checksum. MD5 otherwise."},
/* "ctime": boolean option, off by default, tagged "ctime". */
{
.key = {"ctime"},
.type = GF_OPTION_TYPE_BOOL,
.default_value = "off",
.op_version = {GD_OP_VERSION_4_1_0},
.tags = {"ctime"},
.flags = OPT_FLAG_SETTABLE | OPT_FLAG_DOC,
.description =
"When this option is enabled, time attributes (ctime,mtime,atime) "
"are stored in xattr to keep it consistent across replica and "
"distribute set. The time attributes stored at the backend are "
"not considered ",
},
{.key = {NULL}},
};