Blob Blame History Raw
/**
 * Enhanced Seccomp Filter DB
 *
 * Copyright (c) 2012,2016,2018 Red Hat <pmoore@redhat.com>
 * Copyright (c) 2019 Cisco Systems, Inc. <pmoore2@cisco.com>
 * Author: Paul Moore <paul@paul-moore.com>
 */

/*
 * This library is free software; you can redistribute it and/or modify it
 * under the terms of version 2.1 of the GNU Lesser General Public License as
 * published by the Free Software Foundation.
 *
 * This library is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public License
 * for more details.
 *
 * You should have received a copy of the GNU Lesser General Public License
 * along with this library; if not, see <http://www.gnu.org/licenses>.
 */

#include <assert.h>
#include <errno.h>
#include <inttypes.h>
#include <stdlib.h>
#include <string.h>
#include <stdarg.h>

#include <seccomp.h>

#include "arch.h"
#include "db.h"
#include "system.h"
#include "helper.h"

/* state values */
#define _DB_STA_VALID			0xA1B2C3D4
#define _DB_STA_FREED			0x1A2B3C4D

/* the priority field is fairly simple - without any user hints, or in the case
 * of a hint "tie", we give higher priority to syscalls with less chain nodes
 * (filter is easier to evaluate) */
#define _DB_PRI_MASK_CHAIN		0x0000FFFF
#define _DB_PRI_MASK_USER		0x00FF0000
#define _DB_PRI_USER(x)			(((x) << 16) & _DB_PRI_MASK_USER)

/* prove information about the sub-tree check results */
struct db_iter_state {
#define _DB_IST_NONE			0x00000000
#define _DB_IST_MATCH			0x00000001
#define _DB_IST_MATCH_ONCE		0x00000002
#define _DB_IST_X_FINISHED		0x00000010
#define _DB_IST_N_FINISHED		0x00000020
#define _DB_IST_X_PREFIX		0x00000100
#define _DB_IST_N_PREFIX		0x00000200
#define _DB_IST_M_MATCHSET		(_DB_IST_MATCH|_DB_IST_MATCH_ONCE)
#define _DB_IST_M_REDUNDANT		(_DB_IST_MATCH| \
					 _DB_IST_X_FINISHED| \
					 _DB_IST_N_PREFIX)
	unsigned int flags;
	uint32_t action;
	struct db_sys_list *sx;
};

static unsigned int _db_node_put(struct db_arg_chain_tree **node);

/**
 * Define the syscall argument priority for nodes on the same level of the tree
 * @param a tree node
 *
 * Prioritize the syscall argument value, taking into account hi/lo words.
 * Should only ever really be called by _db_chain_{lt,eq}().  Returns an
 * arbitrary value indicating priority.
 *
 */
static unsigned int __db_chain_arg_priority(const struct db_arg_chain_tree *a)
{
	return (a->arg << 1) + (a->arg_h_flg ? 1 : 0);
}

/**
 * Define the "op" priority for nodes on the same level of the tree
 * @param op the argument operator
 *
 * Prioritize the syscall argument comparison operator.  Should only ever
 * really be called by _db_chain_{lt,eq}().  Returns an arbitrary value
 * indicating priority.
 *
 */
static unsigned int __db_chain_op_priority(enum scmp_compare op)
{
	/* the distinction between LT/LT and GT/GE is mostly to make the
	 * ordering as repeatable as possible regardless of the order in which
	 * the rules are added */
	switch (op) {
	case SCMP_CMP_MASKED_EQ:
	case SCMP_CMP_EQ:
	case SCMP_CMP_NE:
		return 3;
	case SCMP_CMP_LE:
	case SCMP_CMP_LT:
		return 2;
	case SCMP_CMP_GE:
	case SCMP_CMP_GT:
		return 1;
	default:
		return 0;
	}
}

/**
 * Determine if node "a" is less than node "b"
 * @param a tree node
 * @param b tree node
 *
 * The logic is best explained by looking at the comparison code in the
 * function.
 *
 */
static bool _db_chain_lt(const struct db_arg_chain_tree *a,
			 const struct db_arg_chain_tree *b)
{
	unsigned int a_arg, b_arg;
	unsigned int a_op, b_op;

	a_arg = __db_chain_arg_priority(a);
	b_arg = __db_chain_arg_priority(b);
	if (a_arg < b_arg)
		return true;
	else if (a_arg > b_arg)
		return false;

	a_op = __db_chain_op_priority(a->op_orig);
	b_op = __db_chain_op_priority(b->op_orig);
	if (a_op < b_op)
		return true;
	else if (a_op > b_op)
		return false;

	/* NOTE: at this point the arg and op priorities are equal */

	switch (a->op_orig) {
	case SCMP_CMP_LE:
	case SCMP_CMP_LT:
		/* in order to ensure proper ordering for LT/LE comparisons we
		 * need to invert the argument value so smaller values come
		 * first */
		if (a->datum > b->datum)
			return true;
		break;
	default:
		if (a->datum < b->datum)
			return true;
		break;
	}

	return false;
}

/**
 * Determine if two nodes have equal argument datum values
 * @param a tree node
 * @param b tree node
 *
 * In order to return true the nodes must have the same datum and mask for the
 * same argument.
 *
 */
static bool _db_chain_eq(const struct db_arg_chain_tree *a,
			 const struct db_arg_chain_tree *b)
{
	unsigned int a_arg, b_arg;

	a_arg = __db_chain_arg_priority(a);
	b_arg = __db_chain_arg_priority(b);

	return ((a_arg == b_arg) && (a->op == b->op) &&
		(a->datum == b->datum) && (a->mask == b->mask));
}

/**
 * Determine if a given tree node is a leaf node
 * @param iter the node to test
 *
 * A leaf node is a node with no other nodes beneath it.
 *
 */
static bool _db_chain_leaf(const struct db_arg_chain_tree *iter)
{
	return (iter->nxt_t == NULL && iter->nxt_f == NULL);
}

/**
 * Determine if a given tree node is a zombie node
 * @param iter the node to test
 *
 * A zombie node is a leaf node that also has no true or false actions.
 *
 */
static bool _db_chain_zombie(const struct db_arg_chain_tree *iter)
{
	return (_db_chain_leaf(iter) &&
		!(iter->act_t_flg) && !(iter->act_f_flg));
}

/**
 * Get a node reference
 * @param node pointer to a node
 *
 * This function gets a reference to an individual node.  Returns a pointer
 * to the node.
 *
 */
static struct db_arg_chain_tree *_db_node_get(struct db_arg_chain_tree *node)
{
	if (node != NULL)
		node->refcnt++;
	return node;
}

/**
 * Garbage collect a level of the tree
 * @param node tree node
 *
 * Check the entire level on which @node resides, if there is no other part of
 * the tree which points to a node on this level, remove the entire level.
 * Returns the number of nodes removed.
 *
 */
static unsigned int _db_level_clean(struct db_arg_chain_tree *node)
{
	int cnt = 0;
	unsigned int links;
	struct db_arg_chain_tree *n = node;
	struct db_arg_chain_tree *start;

	while (n->lvl_prv)
		n = n->lvl_prv;
	start = n;

	while (n != NULL) {
		links = 0;
		if (n->lvl_prv)
			links++;
		if (n->lvl_nxt)
			links++;

		if (n->refcnt > links)
			return cnt;

		n = n->lvl_nxt;
	}

	n = start;
	while (n != NULL)
		cnt += _db_node_put(&n);

	return cnt;
}

/**
 * Free a syscall filter argument chain tree
 * @param tree the argument chain list
 *
 * This function drops a reference to the tree pointed to by @tree and garbage
 * collects the top level.  Returns the number of nodes removed.
 *
 */
static unsigned int _db_tree_put(struct db_arg_chain_tree **tree)
{
	unsigned int cnt;

	cnt = _db_node_put(tree);
	if (*tree)
		cnt += _db_level_clean(*tree);

	return cnt;
}

/**
 * Release a node reference
 * @param node pointer to a node
 *
 * This function drops a reference to an individual node, unless this is the
 * last reference in which the entire sub-tree is affected.  Returns the number
 * of nodes freed.
 *
 */
static unsigned int _db_node_put(struct db_arg_chain_tree **node)
{
	unsigned int cnt = 0;
	struct db_arg_chain_tree *n = *node;
	struct db_arg_chain_tree *lvl_p, *lvl_n, *nxt_t, *nxt_f;

	if (n == NULL)
		return 0;

	if (--(n->refcnt) == 0) {
		lvl_p = n->lvl_prv;
		lvl_n = n->lvl_nxt;
		nxt_t = n->nxt_t;
		nxt_f = n->nxt_f;

		/* split the current level */
		/* NOTE: we still hold a ref for both lvl_p and lvl_n */
		if (lvl_p)
			lvl_p->lvl_nxt = NULL;
		if (lvl_n)
			lvl_n->lvl_prv = NULL;

		/* drop refcnts on the current level */
		if (lvl_p)
			cnt += _db_node_put(&lvl_p);
		if (lvl_n)
			cnt += _db_node_put(&lvl_n);

		/* re-link current level if it still exists */
		if (lvl_p)
			lvl_p->lvl_nxt = _db_node_get(lvl_n);
		if (lvl_n)
			lvl_n->lvl_prv = _db_node_get(lvl_p);

		/* update caller's pointer */
		if (lvl_p)
			*node = lvl_p;
		else if (lvl_n)
			*node = lvl_n;
		else
			*node = NULL;

		/* drop the next level(s) */
		cnt += _db_tree_put(&nxt_t);
		cnt += _db_tree_put(&nxt_f);

		/* cleanup and accounting */
		free(n);
		cnt++;
	}

	return cnt;
}

/**
 * Remove a node from an argument chain tree
 * @param tree the pointer to the tree
 * @param node the node to remove
 *
 * This function searches the tree looking for the node and removes it as well
 * as any sub-trees beneath it.  Returns the number of nodes freed.
 *
 */
static unsigned int _db_tree_remove(struct db_arg_chain_tree **tree,
				    struct db_arg_chain_tree *node)
{
	int cnt = 0;
	struct db_arg_chain_tree *c_iter;

	if (tree == NULL || *tree == NULL || node == NULL)
		return 0;

	c_iter = *tree;
	while (c_iter->lvl_prv != NULL)
		c_iter = c_iter->lvl_prv;

	do {
		/* current node? */
		if (c_iter == node)
			goto remove;

		/* check the sub-trees */
		cnt += _db_tree_remove(&(c_iter->nxt_t), node);
		cnt += _db_tree_remove(&(c_iter->nxt_f), node);

		/* check for empty/zombie nodes */
		if (_db_chain_zombie(c_iter))
			goto remove;

		/* next node on this level */
		c_iter = c_iter->lvl_nxt;
	} while (c_iter != NULL && cnt == 0);

	return cnt;

remove:
	/* reset the tree pointer if needed */
	if (c_iter == *tree) {
		if (c_iter->lvl_prv != NULL)
			*tree = c_iter->lvl_prv;
		else
			*tree = c_iter->lvl_nxt;
	}

	/* remove the node from the current level */
	if (c_iter->lvl_prv)
		c_iter->lvl_prv->lvl_nxt = c_iter->lvl_nxt;
	if (c_iter->lvl_nxt)
		c_iter->lvl_nxt->lvl_prv = c_iter->lvl_prv;
	c_iter->lvl_prv = NULL;
	c_iter->lvl_nxt = NULL;

	/* free the node and any sub-trees */
	cnt += _db_node_put(&c_iter);

	return cnt;
}

/**
 * Traverse a tree checking the action values
 * @param tree the pointer to the tree
 * @param action the action
 *
 * Traverse the tree inspecting each action to see if it matches the given
 * action.  Returns zero if all actions match the given action, negative values
 * on failure.
 *
 */
static int _db_tree_act_check(struct db_arg_chain_tree *tree, uint32_t action)
{
	int rc;
	struct db_arg_chain_tree *c_iter;

	if (tree == NULL)
		return 0;

	c_iter = tree;
	while (c_iter->lvl_prv != NULL)
		c_iter = c_iter->lvl_prv;

	do {
		if (c_iter->act_t_flg && c_iter->act_t != action)
			return -EEXIST;
		if (c_iter->act_f_flg && c_iter->act_f != action)
			return -EEXIST;

		rc = _db_tree_act_check(c_iter->nxt_t, action);
		if (rc < 0)
			return rc;
		rc = _db_tree_act_check(c_iter->nxt_f, action);
		if (rc < 0)
			return rc;

		c_iter = c_iter->lvl_nxt;
	} while (c_iter != NULL);

	return 0;
}

/**
 * Checks for a sub-tree match in an existing tree and prunes the tree
 * @param existing pointer to the existing tree
 * @param new pointer to the new tree
 * @param state pointer to a state structure
 *
 * This function searches the existing tree trying to prune it based on the
 * new tree.  Returns the number of nodes removed from the tree on success,
 * zero if no changes were made.
 *
 */
static int _db_tree_prune(struct db_arg_chain_tree **existing,
			  struct db_arg_chain_tree *new,
			  struct db_iter_state *state)
{
	int cnt = 0;
	struct db_iter_state state_nxt;
	struct db_iter_state state_new = *state;
	struct db_arg_chain_tree *x_iter_next;
	struct db_arg_chain_tree *x_iter = *existing;
	struct db_arg_chain_tree *n_iter = new;

	/* check if either tree is finished */
	if (n_iter == NULL || x_iter == NULL)
		goto prune_return;

	/* bail out if we have a broken match */
	if ((state->flags & _DB_IST_M_MATCHSET) == _DB_IST_MATCH_ONCE)
		goto prune_return;

	/* get to the start of the existing level */
	while (x_iter->lvl_prv)
		x_iter = x_iter->lvl_prv;

	/* NOTE: a few comments on the code below ...
	 * 1) we need to take a reference before we go down a level in case
	 *    we end up dropping the sub-tree (see the _db_node_get() calls)
	 * 2) since the new tree really only has one branch, we can only ever
	 *    match on one branch in the existing tree, if we "hit" then we
	 *    can bail on the other branches */

	do {
		/* store this now in case we remove x_iter */
		x_iter_next = x_iter->lvl_nxt;

		/* compare the two nodes */
		if (_db_chain_eq(x_iter, n_iter)) {
			/* we have a match */
			state_new.flags |= _DB_IST_M_MATCHSET;

			/* check if either tree is finished */
			if (_db_chain_leaf(n_iter))
				state_new.flags |= _DB_IST_N_FINISHED;
			if (_db_chain_leaf(x_iter))
				state_new.flags |= _DB_IST_X_FINISHED;

			/* don't remove nodes if we have more actions/levels */
			if ((x_iter->act_t_flg || x_iter->nxt_t) &&
			    !(n_iter->act_t_flg || n_iter->nxt_t))
				goto prune_return;
			if ((x_iter->act_f_flg || x_iter->nxt_f) &&
			    !(n_iter->act_f_flg || n_iter->nxt_f))
				goto prune_return;

			/* if finished, compare actions */
			if ((state_new.flags & _DB_IST_N_FINISHED) &&
			    (state_new.flags & _DB_IST_X_FINISHED)) {
				if (n_iter->act_t_flg != x_iter->act_t_flg)
					goto prune_return;
				if (n_iter->act_t != x_iter->act_t)
					goto prune_return;

				if (n_iter->act_f_flg != x_iter->act_f_flg)
					goto prune_return;
				if (n_iter->act_f != x_iter->act_f)
					goto prune_return;
			}

			/* check next level */
			if (n_iter->nxt_t) {
				_db_node_get(x_iter);
				state_nxt = *state;
				state_nxt.flags |= _DB_IST_M_MATCHSET;
				cnt += _db_tree_prune(&x_iter->nxt_t,
						      n_iter->nxt_t,
						      &state_nxt);
				cnt += _db_node_put(&x_iter);
				if (state_nxt.flags & _DB_IST_MATCH) {
					state_new.flags |= state_nxt.flags;
					/* don't return yet, we need to check
					 * the current node */
				}
				if (x_iter == NULL)
					goto prune_next_node;
			}
			if (n_iter->nxt_f) {
				_db_node_get(x_iter);
				state_nxt = *state;
				state_nxt.flags |= _DB_IST_M_MATCHSET;
				cnt += _db_tree_prune(&x_iter->nxt_f,
						      n_iter->nxt_f,
						      &state_nxt);
				cnt += _db_node_put(&x_iter);
				if (state_nxt.flags & _DB_IST_MATCH) {
					state_new.flags |= state_nxt.flags;
					/* don't return yet, we need to check
					 * the current node */
				}
				if (x_iter == NULL)
					goto prune_next_node;
			}

			/* remove the node? */
			if (!_db_tree_act_check(x_iter, state_new.action) &&
			    (state_new.flags & _DB_IST_MATCH) &&
			    (state_new.flags & _DB_IST_N_FINISHED) &&
			    (state_new.flags & _DB_IST_X_PREFIX)) {
				/* yes - the new tree is "shorter" */
				cnt += _db_tree_remove(&state->sx->chains,
						       x_iter);
				if (state->sx->chains == NULL)
					goto prune_return;
			} else if (!_db_tree_act_check(x_iter, state_new.action)
				   && (state_new.flags & _DB_IST_MATCH) &&
				   (state_new.flags & _DB_IST_X_FINISHED) &&
				   (state_new.flags & _DB_IST_N_PREFIX)) {
				/* no - the new tree is "longer" */
				goto prune_return;
			}
		} else if (_db_chain_lt(x_iter, n_iter)) {
			/* bail if we have a prefix on the new tree */
			if (state->flags & _DB_IST_N_PREFIX)
				goto prune_return;

			/* check the next level in the existing tree */
			if (x_iter->nxt_t) {
				_db_node_get(x_iter);
				state_nxt = *state;
				state_nxt.flags &= ~_DB_IST_MATCH;
				state_nxt.flags |= _DB_IST_X_PREFIX;
				cnt += _db_tree_prune(&x_iter->nxt_t, n_iter,
						      &state_nxt);
				cnt += _db_node_put(&x_iter);
				if (state_nxt.flags & _DB_IST_MATCH) {
					state_new.flags |= state_nxt.flags;
					goto prune_return;
				}
				if (x_iter == NULL)
					goto prune_next_node;
			}
			if (x_iter->nxt_f) {
				_db_node_get(x_iter);
				state_nxt = *state;
				state_nxt.flags &= ~_DB_IST_MATCH;
				state_nxt.flags |= _DB_IST_X_PREFIX;
				cnt += _db_tree_prune(&x_iter->nxt_f, n_iter,
						      &state_nxt);
				cnt += _db_node_put(&x_iter);
				if (state_nxt.flags & _DB_IST_MATCH) {
					state_new.flags |= state_nxt.flags;
					goto prune_return;
				}
				if (x_iter == NULL)
					goto prune_next_node;
			}
		} else {
			/* bail if we have a prefix on the existing tree */
			if (state->flags & _DB_IST_X_PREFIX)
				goto prune_return;

			/* check the next level in the new tree */
			if (n_iter->nxt_t) {
				_db_node_get(x_iter);
				state_nxt = *state;
				state_nxt.flags &= ~_DB_IST_MATCH;
				state_nxt.flags |= _DB_IST_N_PREFIX;
				cnt += _db_tree_prune(&x_iter, n_iter->nxt_t,
						      &state_nxt);
				cnt += _db_node_put(&x_iter);
				if (state_nxt.flags & _DB_IST_MATCH) {
					state_new.flags |= state_nxt.flags;
					goto prune_return;
				}
				if (x_iter == NULL)
					goto prune_next_node;
			}
			if (n_iter->nxt_f) {
				_db_node_get(x_iter);
				state_nxt = *state;
				state_nxt.flags &= ~_DB_IST_MATCH;
				state_nxt.flags |= _DB_IST_N_PREFIX;
				cnt += _db_tree_prune(&x_iter, n_iter->nxt_f,
						      &state_nxt);
				cnt += _db_node_put(&x_iter);
				if (state_nxt.flags & _DB_IST_MATCH) {
					state_new.flags |= state_nxt.flags;
					goto prune_return;
				}
				if (x_iter == NULL)
					goto prune_next_node;
			}
		}

prune_next_node:
		/* check next node on this level */
		x_iter = x_iter_next;
	} while (x_iter);

	// if we are falling through, we clearly didn't match on anything
	state_new.flags &= ~_DB_IST_MATCH;

prune_return:
	/* no more nodes on this level, return to the level above */
	if (state_new.flags & _DB_IST_MATCH)
		state->flags |= state_new.flags;
	else
		state->flags &= ~_DB_IST_MATCH;
	return cnt;
}

/**
 * Add a new tree into an existing tree
 * @param existing pointer to the existing tree
 * @param new pointer to the new tree
 * @param state pointer to a state structure
 *
 * This function adds the new tree into the existing tree, fetching additional
 * references as necessary.  Returns zero on success, negative values on
 * failure.
 *
 */
static int _db_tree_add(struct db_arg_chain_tree **existing,
			struct db_arg_chain_tree *new,
			struct db_iter_state *state)
{
	int rc;
	struct db_arg_chain_tree *x_iter = *existing;
	struct db_arg_chain_tree *n_iter = new;

	do {
		if (_db_chain_eq(x_iter, n_iter)) {
			if (n_iter->act_t_flg) {
				if (!x_iter->act_t_flg) {
					/* new node has a true action */

					/* do the actions match? */
					rc = _db_tree_act_check(x_iter->nxt_t,
								n_iter->act_t);
					if (rc != 0)
						return rc;

					/* update with the new action */
					rc = _db_node_put(&x_iter->nxt_t);
					x_iter->nxt_t = NULL;
					x_iter->act_t = n_iter->act_t;
					x_iter->act_t_flg = true;
					state->sx->node_cnt -= rc;
				} else if (n_iter->act_t != x_iter->act_t) {
					/* if we are dealing with a 64-bit
					 * comparison, we need to adjust our
					 * action based on the full 64-bit
					 * value to ensure we handle GT/GE
					 * comparisons correctly */
					if (n_iter->arg_h_flg &&
					    (n_iter->datum_full >
					     x_iter->datum_full))
						x_iter->act_t = n_iter->act_t;
					if (_db_chain_leaf(x_iter) ||
					    _db_chain_leaf(n_iter))
						return -EEXIST;
				}
			}
			if (n_iter->act_f_flg) {
				if (!x_iter->act_f_flg) {
					/* new node has a false action */

					/* do the actions match? */
					rc = _db_tree_act_check(x_iter->nxt_f,
								n_iter->act_f);
					if (rc != 0)
						return rc;

					/* update with the new action */
					rc = _db_node_put(&x_iter->nxt_f);
					x_iter->nxt_f = NULL;
					x_iter->act_f = n_iter->act_f;
					x_iter->act_f_flg = true;
					state->sx->node_cnt -= rc;
				} else if (n_iter->act_f != x_iter->act_f) {
					/* if we are dealing with a 64-bit
					 * comparison, we need to adjust our
					 * action based on the full 64-bit
					 * value to ensure we handle LT/LE
					 * comparisons correctly */
					if (n_iter->arg_h_flg &&
					    (n_iter->datum_full <
					     x_iter->datum_full))
						x_iter->act_t = n_iter->act_t;
					if (_db_chain_leaf(x_iter) ||
					    _db_chain_leaf(n_iter))
						return -EEXIST;
				}
			}

			if (n_iter->nxt_t) {
				if (x_iter->nxt_t) {
					/* compare the next level */
					rc = _db_tree_add(&x_iter->nxt_t,
							  n_iter->nxt_t,
							  state);
					if (rc != 0)
						return rc;
				} else if (!x_iter->act_t_flg) {
					/* add a new sub-tree */
					x_iter->nxt_t = _db_node_get(n_iter->nxt_t);
				} else
					/* done - existing tree is "shorter" */
					return 0;
			}
			if (n_iter->nxt_f) {
				if (x_iter->nxt_f) {
					/* compare the next level */
					rc = _db_tree_add(&x_iter->nxt_f,
							  n_iter->nxt_f,
							  state);
					if (rc != 0)
						return rc;
				} else if (!x_iter->act_f_flg) {
					/* add a new sub-tree */
					x_iter->nxt_f = _db_node_get(n_iter->nxt_f);
				} else
					/* done - existing tree is "shorter" */
					return 0;
			}

			return 0;
		} else if (!_db_chain_lt(x_iter, n_iter)) {
			/* try to move along the current level */
			if (x_iter->lvl_nxt == NULL) {
				/* add to the end of this level */
				n_iter->lvl_prv = _db_node_get(x_iter);
				x_iter->lvl_nxt = _db_node_get(n_iter);
				return 0;
			} else
				/* next */
				x_iter = x_iter->lvl_nxt;
		} else {
			/* add before the existing node on this level*/
			if (x_iter->lvl_prv != NULL) {
				x_iter->lvl_prv->lvl_nxt = _db_node_get(n_iter);
				n_iter->lvl_prv = x_iter->lvl_prv;
				x_iter->lvl_prv = _db_node_get(n_iter);
				n_iter->lvl_nxt = x_iter;
			} else {
				x_iter->lvl_prv = _db_node_get(n_iter);
				n_iter->lvl_nxt = _db_node_get(x_iter);
			}
			if (*existing == x_iter) {
				*existing = _db_node_get(n_iter);
				_db_node_put(&x_iter);
			}
			return 0;
		}
	} while (x_iter);

	return 0;
}

/**
 * Free and reset the seccomp filter DB
 * @param db the seccomp filter DB
 *
 * This function frees any existing filters and resets the filter DB to a
 * default state; only the DB architecture is preserved.
 *
 */
static void _db_reset(struct db_filter *db)
{
	struct db_sys_list *s_iter;
	struct db_api_rule_list *r_iter;

	if (db == NULL)
		return;

	/* free any filters */
	if (db->syscalls != NULL) {
		s_iter = db->syscalls;
		while (s_iter != NULL) {
			db->syscalls = s_iter->next;
			_db_tree_put(&s_iter->chains);
			free(s_iter);
			s_iter = db->syscalls;
		}
		db->syscalls = NULL;
	}

	/* free any rules */
	if (db->rules != NULL) {
		/* split the loop first then loop and free */
		db->rules->prev->next = NULL;
		r_iter = db->rules;
		while (r_iter != NULL) {
			db->rules = r_iter->next;
			free(r_iter);
			r_iter = db->rules;
		}
		db->rules = NULL;
	}
}

/**
 * Intitalize a seccomp filter DB
 * @param arch the architecture definition
 *
 * This function initializes a seccomp filter DB and readies it for use.
 * Returns a pointer to the DB on success, NULL on failure.
 *
 */
static struct db_filter *_db_init(const struct arch_def *arch)
{
	struct db_filter *db;

	db = zmalloc(sizeof(*db));
	if (db == NULL)
		return NULL;

	/* set the arch and reset the DB to a known state */
	db->arch = arch;
	_db_reset(db);

	return db;
}

/**
 * Destroy a seccomp filter DB
 * @param db the seccomp filter DB
 *
 * This function destroys a seccomp filter DB.  After calling this function,
 * the filter should no longer be referenced.
 *
 */
static void _db_release(struct db_filter *db)
{
	if (db == NULL)
		return;

	/* free and reset the DB */
	_db_reset(db);
	free(db);
}

/**
 * Destroy a seccomp filter snapshot
 * @param snap the seccomp filter snapshot
 *
 * This function destroys a seccomp filter snapshot.  After calling this
 * function, the snapshot should no longer be referenced.
 *
 */
static void _db_snap_release(struct db_filter_snap *snap)
{
	unsigned int iter;

	if (snap->filter_cnt > 0) {
		for (iter = 0; iter < snap->filter_cnt; iter++) {
			if (snap->filters[iter])
				_db_release(snap->filters[iter]);
		}
		free(snap->filters);
	}
	free(snap);
}

/**
 * Update the user specified portion of the syscall priority
 * @param db the seccomp filter db
 * @param syscall the syscall number
 * @param priority the syscall priority
 *
 * This function sets, or updates, the syscall priority; the highest priority
 * value between the existing and specified value becomes the new syscall
 * priority.  If the syscall entry does not already exist, a new phantom
 * syscall entry is created as a placeholder.  Returns zero on success,
 * negative values on failure.
 *
 */
static int _db_syscall_priority(struct db_filter *db,
				int syscall, uint8_t priority)
{
	unsigned int sys_pri = _DB_PRI_USER(priority);
	struct db_sys_list *s_new, *s_iter, *s_prev = NULL;

	assert(db != NULL);

	s_iter = db->syscalls;
	while (s_iter != NULL && s_iter->num < syscall) {
		s_prev = s_iter;
		s_iter = s_iter->next;
	}

	/* matched an existing syscall entry */
	if (s_iter != NULL && s_iter->num == syscall) {
		if (sys_pri > (s_iter->priority & _DB_PRI_MASK_USER)) {
			s_iter->priority &= (~_DB_PRI_MASK_USER);
			s_iter->priority |= sys_pri;
		}
		return 0;
	}

	/* no existing syscall entry - create a phantom entry */
	s_new = zmalloc(sizeof(*s_new));
	if (s_new == NULL)
		return -ENOMEM;
	s_new->num = syscall;
	s_new->priority = sys_pri;
	s_new->valid = false;

	/* add it before s_iter */
	if (s_prev != NULL) {
		s_new->next = s_prev->next;
		s_prev->next = s_new;
	} else {
		s_new->next = db->syscalls;
		db->syscalls = s_new;
	}

	return 0;
}

/**
 * Create a new rule
 * @param strict the strict value
 * @param action the rule's action
 * @param syscall the syscall number
 * @param chain the syscall argument filter
 *
 * This function creates a new rule structure based on the given arguments.
 * Returns a pointer to the new rule on success, NULL on failure.
 *
 */
static struct db_api_rule_list *_db_rule_new(bool strict,
					     uint32_t action, int syscall,
					     struct db_api_arg *chain)
{
	struct db_api_rule_list *rule;

	rule = zmalloc(sizeof(*rule));
	if (rule == NULL)
		return NULL;
	rule->action = action;
	rule->syscall = syscall;
	rule->strict = strict;
	memcpy(rule->args, chain, sizeof(*chain) * ARG_COUNT_MAX);

	return rule;
}

/**
 * Duplicate an existing filter rule
 * @param src the rule to duplicate
 *
 * This function makes an exact copy of the given rule, but does not add it
 * to any lists.  Returns a pointer to the new rule on success, NULL on
 * failure.
 *
 */
struct db_api_rule_list *db_rule_dup(const struct db_api_rule_list *src)
{
	struct db_api_rule_list *dest;

	dest = malloc(sizeof(*dest));
	if (dest == NULL)
		return NULL;
	memcpy(dest, src, sizeof(*dest));
	dest->prev = NULL;
	dest->next = NULL;

	return dest;
}

/**
 * Free and reset the seccomp filter collection
 * @param col the seccomp filter collection
 * @param def_action the default filter action
 *
 * This function frees any existing filter DBs and resets the collection to a
 * default state.  In the case of failure the filter collection may be in an
 * unknown state and should be released.  Returns zero on success, negative
 * values on failure.
 *
 */
int db_col_reset(struct db_filter_col *col, uint32_t def_action)
{
	unsigned int iter;
	struct db_filter *db;
	struct db_filter_snap *snap;

	if (col == NULL)
		return -EINVAL;

	/* free any filters */
	for (iter = 0; iter < col->filter_cnt; iter++)
		_db_release(col->filters[iter]);
	col->filter_cnt = 0;
	if (col->filters)
		free(col->filters);
	col->filters = NULL;

	/* set the endianess to undefined */
	col->endian = 0;

	/* set the default attribute values */
	col->attr.act_default = def_action;
	col->attr.act_badarch = SCMP_ACT_KILL;
	col->attr.nnp_enable = 1;
	col->attr.tsync_enable = 0;
	col->attr.api_tskip = 0;
	col->attr.log_enable = 0;

	/* set the state */
	col->state = _DB_STA_VALID;

	/* reset the initial db */
	db = _db_init(arch_def_native);
	if (db == NULL)
		return -ENOMEM;
	if (db_col_db_add(col, db) < 0) {
		_db_release(db);
		return -ENOMEM;
	}

	/* reset the transactions */
	while (col->snapshots) {
		snap = col->snapshots;
		col->snapshots = snap->next;
		for (iter = 0; iter < snap->filter_cnt; iter++)
			_db_release(snap->filters[iter]);
		free(snap->filters);
		free(snap);
	}

	return 0;
}

/**
 * Intitalize a seccomp filter collection
 * @param def_action the default filter action
 *
 * This function initializes a seccomp filter collection and readies it for
 * use.  Returns a pointer to the collection on success, NULL on failure.
 *
 */
struct db_filter_col *db_col_init(uint32_t def_action)
{
	struct db_filter_col *col;

	col = zmalloc(sizeof(*col));
	if (col == NULL)
		return NULL;

	/* reset the DB to a known state */
	if (db_col_reset(col, def_action) < 0)
		goto init_failure;

	return col;

init_failure:
	db_col_release(col);
	return NULL;
}

/**
 * Destroy a seccomp filter collection
 * @param col the seccomp filter collection
 *
 * This function destroys a seccomp filter collection.  After calling this
 * function, the filter should no longer be referenced.
 *
 */
void db_col_release(struct db_filter_col *col)
{
	unsigned int iter;

	if (col == NULL)
		return;

	/* set the state, just in case */
	col->state = _DB_STA_FREED;

	/* free any filters */
	for (iter = 0; iter < col->filter_cnt; iter++)
		_db_release(col->filters[iter]);
	col->filter_cnt = 0;
	if (col->filters)
		free(col->filters);
	col->filters = NULL;

	/* free the collection */
	free(col);
}

/**
 * Validate the seccomp action
 * @param action the seccomp action
 *
 * Verify that the given action is a valid seccomp action; return zero if
 * valid, -EINVAL if invalid.
 */
int db_action_valid(uint32_t action)
{
	if (sys_chk_seccomp_action(action) == 1)
		return 0;
	return -EINVAL;
}

/**
 * Validate a filter collection
 * @param col the seccomp filter collection
 *
 * This function validates a seccomp filter collection.  Returns zero if the
 * collection is valid, negative values on failure.
 *
 */
int db_col_valid(struct db_filter_col *col)
{
	if (col != NULL && col->state == _DB_STA_VALID && col->filter_cnt > 0)
		return 0;
	return -EINVAL;
}

/**
 * Merge two filter collections
 * @param col_dst the destination filter collection
 * @param col_src the source filter collection
 *
 * This function merges two filter collections into the given destination
 * collection.  The source filter collection is no longer valid if the function
 * returns successfully.  Returns zero on success, negative values on failure.
 *
 */
int db_col_merge(struct db_filter_col *col_dst, struct db_filter_col *col_src)
{
	unsigned int iter_a, iter_b;
	struct db_filter **dbs;

	/* verify that the endianess is a match */
	if (col_dst->endian != col_src->endian)
		return -EDOM;

	/* make sure we don't have any arch/filter collisions */
	for (iter_a = 0; iter_a < col_dst->filter_cnt; iter_a++) {
		for (iter_b = 0; iter_b < col_src->filter_cnt; iter_b++) {
			if (col_dst->filters[iter_a]->arch->token ==
			    col_src->filters[iter_b]->arch->token)
				return -EEXIST;
		}
	}

	/* expand the destination */
	dbs = realloc(col_dst->filters,
		      sizeof(struct db_filter *) *
		      (col_dst->filter_cnt + col_src->filter_cnt));
	if (dbs == NULL)
		return -ENOMEM;
	col_dst->filters = dbs;

	/* transfer the architecture filters */
	for (iter_a = col_dst->filter_cnt, iter_b = 0;
	     iter_b < col_src->filter_cnt; iter_a++, iter_b++) {
		col_dst->filters[iter_a] = col_src->filters[iter_b];
		col_dst->filter_cnt++;
	}

	/* free the source */
	col_src->filter_cnt = 0;
	db_col_release(col_src);

	return 0;
}

/**
 * Check to see if an architecture filter exists in the filter collection
 * @param col the seccomp filter collection
 * @param arch_token the architecture token
 *
 * Iterate through the given filter collection checking to see if a filter
 * exists for the specified architecture.  Returns -EEXIST if a filter is found,
 * zero if a matching filter does not exist.
 *
 */
int db_col_arch_exist(struct db_filter_col *col, uint32_t arch_token)
{
	unsigned int iter;

	for (iter = 0; iter < col->filter_cnt; iter++)
		if (col->filters[iter]->arch->token == arch_token)
			return -EEXIST;

	return 0;
}

/**
 * Get a filter attribute
 * @param col the seccomp filter collection
 * @param attr the filter attribute
 * @param value the filter attribute value
 *
 * Get the requested filter attribute and provide it via @value.  Returns zero
 * on success, negative values on failure.
 *
 */
int db_col_attr_get(const struct db_filter_col *col,
		    enum scmp_filter_attr attr, uint32_t *value)
{
	int rc = 0;

	switch (attr) {
	case SCMP_FLTATR_ACT_DEFAULT:
		*value = col->attr.act_default;
		break;
	case SCMP_FLTATR_ACT_BADARCH:
		*value = col->attr.act_badarch;
		break;
	case SCMP_FLTATR_CTL_NNP:
		*value = col->attr.nnp_enable;
		break;
	case SCMP_FLTATR_CTL_TSYNC:
		*value = col->attr.tsync_enable;
		break;
	case SCMP_FLTATR_API_TSKIP:
		*value = col->attr.api_tskip;
		break;
	case SCMP_FLTATR_CTL_LOG:
		*value = col->attr.log_enable;
		break;
	default:
		rc = -EEXIST;
		break;
	}

	return rc;
}

/**
 * Set a filter attribute
 * @param col the seccomp filter collection
 * @param attr the filter attribute
 * @param value the filter attribute value
 *
 * Set the requested filter attribute with the given value.  Returns zero on
 * success, negative values on failure.
 *
 */
int db_col_attr_set(struct db_filter_col *col,
		    enum scmp_filter_attr attr, uint32_t value)
{
	int rc = 0;

	switch (attr) {
	case SCMP_FLTATR_ACT_DEFAULT:
		/* read only */
		return -EACCES;
		break;
	case SCMP_FLTATR_ACT_BADARCH:
		if (db_action_valid(value) == 0)
			col->attr.act_badarch = value;
		else
			return -EINVAL;
		break;
	case SCMP_FLTATR_CTL_NNP:
		col->attr.nnp_enable = (value ? 1 : 0);
		break;
	case SCMP_FLTATR_CTL_TSYNC:
		rc = sys_chk_seccomp_flag(SECCOMP_FILTER_FLAG_TSYNC);
		if (rc == 1) {
			/* supported */
			rc = 0;
			col->attr.tsync_enable = (value ? 1 : 0);
		} else if (rc == 0)
			/* unsupported */
			rc = -EOPNOTSUPP;
		break;
	case SCMP_FLTATR_API_TSKIP:
		col->attr.api_tskip = (value ? 1 : 0);
		break;
	case SCMP_FLTATR_CTL_LOG:
		rc = sys_chk_seccomp_flag(SECCOMP_FILTER_FLAG_LOG);
		if (rc == 1) {
			/* supported */
			rc = 0;
			col->attr.log_enable = (value ? 1 : 0);
		} else if (rc == 0) {
			/* unsupported */
			rc = -EOPNOTSUPP;
		}
		break;
	default:
		rc = -EEXIST;
		break;
	}

	return rc;
}

/**
 * Add a new architecture filter to a filter collection
 * @param col the seccomp filter collection
 * @param arch the architecture
 *
 * This function adds a new architecture filter DB to an existing seccomp
 * filter collection assuming there isn't a filter DB already present with the
 * same architecture.  Returns zero on success, negative values on failure.
 *
 */
int db_col_db_new(struct db_filter_col *col, const struct arch_def *arch)
{
	int rc;
	struct db_filter *db;

	db = _db_init(arch);
	if (db == NULL)
		return -ENOMEM;
	rc = db_col_db_add(col, db);
	if (rc < 0)
		_db_release(db);

	return rc;
}

/**
 * Add a new filter DB to a filter collection
 * @param col the seccomp filter collection
 * @param db the seccomp filter DB
 *
 * This function adds an existing seccomp filter DB to an existing seccomp
 * filter collection assuming there isn't a filter DB already present with the
 * same architecture.  Returns zero on success, negative values on failure.
 *
 */
int db_col_db_add(struct db_filter_col *col, struct db_filter *db)
{
	struct db_filter **dbs;

	if (col->endian != 0 && col->endian != db->arch->endian)
		return -EDOM;

	if (db_col_arch_exist(col, db->arch->token))
		return -EEXIST;

	dbs = realloc(col->filters,
		      sizeof(struct db_filter *) * (col->filter_cnt + 1));
	if (dbs == NULL)
		return -ENOMEM;
	col->filters = dbs;
	col->filter_cnt++;
	col->filters[col->filter_cnt - 1] = db;
	if (col->endian == 0)
		col->endian = db->arch->endian;

	return 0;
}

/**
 * Remove a filter DB from a filter collection
 * @param col the seccomp filter collection
 * @param arch_token the architecture token
 *
 * This function removes an existing seccomp filter DB from an existing seccomp
 * filter collection.  Returns zero on success, negative values on failure.
 *
 */
int db_col_db_remove(struct db_filter_col *col, uint32_t arch_token)
{
	unsigned int iter;
	unsigned int found;
	struct db_filter **dbs;

	if ((col->filter_cnt <= 0) || (db_col_arch_exist(col, arch_token) == 0))
		return -EINVAL;

	for (found = 0, iter = 0; iter < col->filter_cnt; iter++) {
		if (found)
			col->filters[iter - 1] = col->filters[iter];
		else if (col->filters[iter]->arch->token == arch_token) {
			_db_release(col->filters[iter]);
			found = 1;
		}
	}
	col->filters[--col->filter_cnt] = NULL;

	if (col->filter_cnt > 0) {
		/* NOTE: if we can't do the realloc it isn't fatal, we just
		 *       have some extra space allocated */
		dbs = realloc(col->filters,
			      sizeof(struct db_filter *) * col->filter_cnt);
		if (dbs != NULL)
			col->filters = dbs;
	} else {
		/* this was the last filter so free all the associated memory
		 * and reset the endian token */
		free(col->filters);
		col->filters = NULL;
		col->endian = 0;
	}

	return 0;
}

/**
 * Test if the argument filter can be skipped because it's a tautology
 * @param arg argument filter
 *
 * If this argument filter applied to the lower 32 bit can be skipped this
 * function returns false.
 *
 */
static bool _db_arg_cmp_need_lo(const struct db_api_arg *arg)
{
	if (arg->op == SCMP_CMP_MASKED_EQ && D64_LO(arg->mask) == 0)
		return false;

	return true;
}

/**
 * Test if the argument filter can be skipped because it's a tautology
 * @param arg argument filter
 *
 * If this argument filter applied to the upper 32 bit can be skipped this
 * function returns false.
 *
 */
static bool _db_arg_cmp_need_hi(const struct db_api_arg *arg)
{
	if (arg->op == SCMP_CMP_MASKED_EQ && D64_HI(arg->mask) == 0)
		return false;

	return true;
}

/**
 * Fixup the node based on the op/mask
 * @param node the chain node
 *
 * Ensure the datum is masked as well.
 *
 */
static void _db_node_mask_fixup(struct db_arg_chain_tree *node)
{
	node->datum &= node->mask;
}

/**
 * Generate a new filter rule for a 64 bit system
 * @param arch the architecture definition
 * @param rule the new filter rule
 *
 * This function generates a new syscall filter for a 64 bit system. Returns
 * zero on success, negative values on failure.
 *
 */
static struct db_sys_list *_db_rule_gen_64(const struct arch_def *arch,
					   const struct db_api_rule_list *rule)
{
	unsigned int iter;
	struct db_sys_list *s_new;
	const struct db_api_arg *chain = rule->args;
	struct db_arg_chain_tree *c_iter[3] = { NULL, NULL, NULL };
	struct db_arg_chain_tree *c_prev[3] = { NULL, NULL, NULL };
	enum scmp_compare op_prev = _SCMP_CMP_MIN;
	unsigned int arg;
	scmp_datum_t mask;
	scmp_datum_t datum;

	s_new = zmalloc(sizeof(*s_new));
	if (s_new == NULL)
		return NULL;
	s_new->num = rule->syscall;
	s_new->valid = true;
	/* run through the argument chain */
	for (iter = 0; iter < ARG_COUNT_MAX; iter++) {
		if (chain[iter].valid == 0)
			continue;

		/* TODO: handle the case were either hi or lo isn't needed */

		/* skip generating instruction which are no-ops */
		if (!_db_arg_cmp_need_hi(&chain[iter]) &&
		    !_db_arg_cmp_need_lo(&chain[iter]))
			continue;

		c_iter[0] = zmalloc(sizeof(*c_iter[0]));
		if (c_iter[0] == NULL)
			goto gen_64_failure;
		c_iter[1] = zmalloc(sizeof(*c_iter[1]));
		if (c_iter[1] == NULL) {
			free(c_iter[0]);
			goto gen_64_failure;
		}
		c_iter[2] = NULL;

		arg = chain[iter].arg;
		mask = chain[iter].mask;
		datum = chain[iter].datum;

		/* NOTE: with the idea that a picture is worth a thousand
		 *       words, i'm presenting the following diagrams which
		 *       show how we should compare 64-bit syscall arguments
		 *       using 32-bit comparisons.
		 *
		 *       in the diagrams below "A(x)" is the syscall argument
		 *       being evaluated and "R(x)" is the syscall argument
		 *       value specified in the libseccomp rule.  the "ACCEPT"
		 *       verdict indicates a rule match and processing should
		 *       continue on to the rest of the rule, or the final rule
		 *       action should be triggered.  the "REJECT" verdict
		 *       indicates that the rule does not match and processing
		 *       should continue to the next rule or the default
		 *       action.
		 *
		 * SCMP_CMP_GT:
		 *                   +------------------+
		 *                +--|  Ah(x) >  Rh(x)  |------+
		 *                |  +------------------+      |
		 *              FALSE                         TRUE     A
		 *                |                            |       C
		 *                +-----------+                +---->  C
		 *                            v                +---->  E
		 *                   +------------------+      |       P
		 *                +--|  Ah(x) == Rh(x)  |--+   |       T
		 *        R       |  +------------------+  |   |
		 *        E     FALSE                     TRUE |
		 *        J  <----+                        |   |
		 *        E  <----+           +------------+   |
		 *        C     FALSE         v                |
		 *        T       |  +------------------+      |
		 *                +--|  Al(x) >  Rl(x)  |------+
		 *                   +------------------+
		 *
		 * SCMP_CMP_GE:
		 *                   +------------------+
		 *                +--|  Ah(x) >  Rh(x)  |------+
		 *                |  +------------------+      |
		 *              FALSE                         TRUE     A
		 *                |                            |       C
		 *                +-----------+                +---->  C
		 *                            v                +---->  E
		 *                   +------------------+      |       P
		 *                +--|  Ah(x) == Rh(x)  |--+   |       T
		 *        R       |  +------------------+  |   |
		 *        E     FALSE                     TRUE |
		 *        J  <----+                        |   |
		 *        E  <----+           +------------+   |
		 *        C     FALSE         v                |
		 *        T       |  +------------------+      |
		 *                +--|  Al(x) >= Rl(x)  |------+
		 *                   +------------------+
		 *
		 * SCMP_CMP_LT:
		 *                   +------------------+
		 *                +--|  Ah(x) >  Rh(x)  |------+
		 *                |  +------------------+      |
		 *              FALSE                         TRUE     R
		 *                |                            |       E
		 *                +-----------+                +---->  J
		 *                            v                +---->  E
		 *                   +------------------+      |       C
		 *                +--|  Ah(x) == Rh(x)  |--+   |       T
		 *        A       |  +------------------+  |   |
		 *        C     FALSE                     TRUE |
		 *        C  <----+                        |   |
		 *        E  <----+           +------------+   |
		 *        P     FALSE         v                |
		 *        T       |  +------------------+      |
		 *                +--|  Al(x) >= Rl(x)  |------+
		 *                   +------------------+
		 *
		 * SCMP_CMP_LE:
		 *                   +------------------+
		 *                +--|  Ah(x) >  Rh(x)  |------+
		 *                |  +------------------+      |
		 *              FALSE                         TRUE     R
		 *                |                            |       E
		 *                +-----------+                +---->  J
		 *                            v                +---->  E
		 *                   +------------------+      |       C
		 *                +--|  Ah(x) == Rh(x)  |--+   |       T
		 *        A       |  +------------------+  |   |
		 *        C     FALSE                     TRUE |
		 *        C  <----+                        |   |
		 *        E  <----+           +------------+   |
		 *        P     FALSE         v                |
		 *        T       |  +------------------+      |
		 *                +--|  Al(x) >  Rl(x)  |------+
		 *                   +------------------+
		 *
		 * SCMP_CMP_EQ:
		 *                   +------------------+
		 *                +--|  Ah(x) == Rh(x)  |--+
		 *        R       |  +------------------+  |           A
		 *        E     FALSE                     TRUE         C
		 *        J  <----+                        |           C
		 *        E  <----+           +------------+   +---->  E
		 *        C     FALSE         v                |       P
		 *        T       |  +------------------+      |       T
		 *                +--|  Al(x) == Rl(x)  |------+
		 *                   +------------------+
		 *
		 * SCMP_CMP_NE:
		 *                   +------------------+
		 *                +--|  Ah(x) == Rh(x)  |--+
		 *        A       |  +------------------+  |           R
		 *        C     FALSE                     TRUE         E
		 *        C  <----+                        |           J
		 *        E  <----+           +------------+   +---->  E
		 *        P     FALSE         v                |       C
		 *        T       |  +------------------+      |       T
		 *                +--|  Al(x) == Rl(x)  |------+
		 *                   +------------------+
		 *
		 */

		/* setup the level */
		switch (chain[iter].op) {
		case SCMP_CMP_GT:
		case SCMP_CMP_GE:
		case SCMP_CMP_LE:
		case SCMP_CMP_LT:
			c_iter[2] = zmalloc(sizeof(*c_iter[2]));
			if (c_iter[2] == NULL) {
				free(c_iter[0]);
				free(c_iter[1]);
				goto gen_64_failure;
			}

			c_iter[0]->arg = arg;
			c_iter[1]->arg = arg;
			c_iter[2]->arg = arg;
			c_iter[0]->arg_h_flg = true;
			c_iter[1]->arg_h_flg = true;
			c_iter[2]->arg_h_flg = false;
			c_iter[0]->arg_offset = arch_arg_offset_hi(arch, arg);
			c_iter[1]->arg_offset = arch_arg_offset_hi(arch, arg);
			c_iter[2]->arg_offset = arch_arg_offset_lo(arch, arg);

			c_iter[0]->mask = D64_HI(mask);
			c_iter[1]->mask = D64_HI(mask);
			c_iter[2]->mask = D64_LO(mask);
			c_iter[0]->datum = D64_HI(datum);
			c_iter[1]->datum = D64_HI(datum);
			c_iter[2]->datum = D64_LO(datum);
			c_iter[0]->datum_full = datum;
			c_iter[1]->datum_full = datum;
			c_iter[2]->datum_full = datum;

			_db_node_mask_fixup(c_iter[0]);
			_db_node_mask_fixup(c_iter[1]);
			_db_node_mask_fixup(c_iter[2]);

			c_iter[0]->op = SCMP_CMP_GT;
			c_iter[1]->op = SCMP_CMP_EQ;
			switch (chain[iter].op) {
			case SCMP_CMP_GT:
			case SCMP_CMP_LE:
				c_iter[2]->op = SCMP_CMP_GT;
				break;
			case SCMP_CMP_GE:
			case SCMP_CMP_LT:
				c_iter[2]->op = SCMP_CMP_GE;
				break;
			default:
				/* we should never get here */
				goto gen_64_failure;
			}
			c_iter[0]->op_orig = chain[iter].op;
			c_iter[1]->op_orig = chain[iter].op;
			c_iter[2]->op_orig = chain[iter].op;

			c_iter[0]->nxt_f = _db_node_get(c_iter[1]);
			c_iter[1]->nxt_t = _db_node_get(c_iter[2]);
			break;
		case SCMP_CMP_EQ:
		case SCMP_CMP_MASKED_EQ:
		case SCMP_CMP_NE:
			c_iter[0]->arg = arg;
			c_iter[1]->arg = arg;
			c_iter[0]->arg_h_flg = true;
			c_iter[1]->arg_h_flg = false;
			c_iter[0]->arg_offset = arch_arg_offset_hi(arch, arg);
			c_iter[1]->arg_offset = arch_arg_offset_lo(arch, arg);

			c_iter[0]->mask = D64_HI(mask);
			c_iter[1]->mask = D64_LO(mask);
			c_iter[0]->datum = D64_HI(datum);
			c_iter[1]->datum = D64_LO(datum);
			c_iter[0]->datum_full = datum;
			c_iter[1]->datum_full = datum;

			_db_node_mask_fixup(c_iter[0]);
			_db_node_mask_fixup(c_iter[1]);

			switch (chain[iter].op) {
			case SCMP_CMP_MASKED_EQ:
				c_iter[0]->op = SCMP_CMP_MASKED_EQ;
				c_iter[1]->op = SCMP_CMP_MASKED_EQ;
				break;
			default:
				c_iter[0]->op = SCMP_CMP_EQ;
				c_iter[1]->op = SCMP_CMP_EQ;
			}
			c_iter[0]->op_orig = chain[iter].op;
			c_iter[1]->op_orig = chain[iter].op;

			c_iter[0]->nxt_t = _db_node_get(c_iter[1]);
			break;
		default:
			/* we should never get here */
			goto gen_64_failure;
		}

		/* link this level to the previous level */
		if (c_prev[0] != NULL) {
			switch (op_prev) {
			case SCMP_CMP_GT:
			case SCMP_CMP_GE:
				c_prev[0]->nxt_t = _db_node_get(c_iter[0]);
				c_prev[2]->nxt_t = _db_node_get(c_iter[0]);
				break;
			case SCMP_CMP_EQ:
			case SCMP_CMP_MASKED_EQ:
				c_prev[1]->nxt_t = _db_node_get(c_iter[0]);
				break;
			case SCMP_CMP_LE:
			case SCMP_CMP_LT:
				c_prev[1]->nxt_f = _db_node_get(c_iter[0]);
				c_prev[2]->nxt_f = _db_node_get(c_iter[0]);
				break;
			case SCMP_CMP_NE:
				c_prev[0]->nxt_f = _db_node_get(c_iter[0]);
				c_prev[1]->nxt_f = _db_node_get(c_iter[0]);
				break;
			default:
				/* we should never get here */
				goto gen_64_failure;
			}
		} else
			s_new->chains = _db_node_get(c_iter[0]);

		/* update the node count */
		switch (chain[iter].op) {
		case SCMP_CMP_NE:
		case SCMP_CMP_EQ:
		case SCMP_CMP_MASKED_EQ:
			s_new->node_cnt += 2;
			break;
		default:
			s_new->node_cnt += 3;
		}

		/* keep pointers to this level */
		c_prev[0] = c_iter[0];
		c_prev[1] = c_iter[1];
		c_prev[2] = c_iter[2];
		op_prev = chain[iter].op;
	}
	if (c_iter[0] != NULL) {
		/* set the actions on the last layer */
		switch (op_prev) {
		case SCMP_CMP_GT:
		case SCMP_CMP_GE:
			c_iter[0]->act_t_flg = true;
			c_iter[0]->act_t = rule->action;
			c_iter[2]->act_t_flg = true;
			c_iter[2]->act_t = rule->action;
			break;
		case SCMP_CMP_LE:
		case SCMP_CMP_LT:
			c_iter[1]->act_f_flg = true;
			c_iter[1]->act_f = rule->action;
			c_iter[2]->act_f_flg = true;
			c_iter[2]->act_f = rule->action;
			break;
		case SCMP_CMP_EQ:
		case SCMP_CMP_MASKED_EQ:
			c_iter[1]->act_t_flg = true;
			c_iter[1]->act_t = rule->action;
			break;
		case SCMP_CMP_NE:
			c_iter[0]->act_f_flg = true;
			c_iter[0]->act_f = rule->action;
			c_iter[1]->act_f_flg = true;
			c_iter[1]->act_f = rule->action;
			break;
		default:
			/* we should never get here */
			goto gen_64_failure;
		}
	} else
		s_new->action = rule->action;

	return s_new;

gen_64_failure:
	/* free the new chain and its syscall struct */
	_db_tree_put(&s_new->chains);
	free(s_new);
	return NULL;
}

/**
 * Generate a new filter rule for a 32 bit system
 * @param arch the architecture definition
 * @param rule the new filter rule
 *
 * This function generates a new syscall filter for a 32 bit system. Returns
 * zero on success, negative values on failure.
 *
 */
static struct db_sys_list *_db_rule_gen_32(const struct arch_def *arch,
					   const struct db_api_rule_list *rule)
{
	unsigned int iter;
	struct db_sys_list *s_new;
	const struct db_api_arg *chain = rule->args;
	struct db_arg_chain_tree *c_iter = NULL, *c_prev = NULL;
	bool tf_flag;

	s_new = zmalloc(sizeof(*s_new));
	if (s_new == NULL)
		return NULL;
	s_new->num = rule->syscall;
	s_new->valid = true;
	/* run through the argument chain */
	for (iter = 0; iter < ARG_COUNT_MAX; iter++) {
		if (chain[iter].valid == 0)
			continue;

		/* skip generating instructions which are no-ops */
		if (!_db_arg_cmp_need_lo(&chain[iter]))
			continue;

		c_iter = zmalloc(sizeof(*c_iter));
		if (c_iter == NULL)
			goto gen_32_failure;
		c_iter->arg = chain[iter].arg;
		c_iter->arg_h_flg = false;
		c_iter->arg_offset = arch_arg_offset(arch, c_iter->arg);
		c_iter->op = chain[iter].op;
		c_iter->op_orig = chain[iter].op;
		/* implicitly strips off the upper 32 bit */
		c_iter->mask = chain[iter].mask;
		c_iter->datum = chain[iter].datum;
		c_iter->datum_full = chain[iter].datum;

		/* link in the new node and update the chain */
		if (c_prev != NULL) {
			if (tf_flag)
				c_prev->nxt_t = _db_node_get(c_iter);
			else
				c_prev->nxt_f = _db_node_get(c_iter);
		} else
			s_new->chains = _db_node_get(c_iter);
		s_new->node_cnt++;

		/* rewrite the op to reduce the op/datum combos */
		switch (c_iter->op) {
		case SCMP_CMP_NE:
			c_iter->op = SCMP_CMP_EQ;
			tf_flag = false;
			break;
		case SCMP_CMP_LT:
			c_iter->op = SCMP_CMP_GE;
			tf_flag = false;
			break;
		case SCMP_CMP_LE:
			c_iter->op = SCMP_CMP_GT;
			tf_flag = false;
			break;
		default:
			tf_flag = true;
		}

		/* fixup the mask/datum */
		_db_node_mask_fixup(c_iter);

		c_prev = c_iter;
	}
	if (c_iter != NULL) {
		/* set the leaf node */
		if (tf_flag) {
			c_iter->act_t_flg = true;
			c_iter->act_t = rule->action;
		} else {
			c_iter->act_f_flg = true;
			c_iter->act_f = rule->action;
		}
	} else
		s_new->action = rule->action;

	return s_new;

gen_32_failure:
	/* free the new chain and its syscall struct */
	_db_tree_put(&s_new->chains);
	free(s_new);
	return NULL;
}

/**
 * Add a new rule to the seccomp filter DB
 * @param db the seccomp filter db
 * @param rule the filter rule
 *
 * This function adds a new syscall filter to the seccomp filter DB, adding to
 * the existing filters for the syscall, unless no argument specific filters
 * are present (filtering only on the syscall).  When adding new chains, the
 * shortest chain, or most inclusive filter match, will be entered into the
 * filter DB. Returns zero on success, negative values on failure.
 *
 * It is important to note that in the case of failure the db may be corrupted,
 * the caller must use the transaction mechanism if the db integrity is
 * important.
 *
 */
int db_rule_add(struct db_filter *db, const struct db_api_rule_list *rule)
{
	int rc = -ENOMEM;
	struct db_sys_list *s_new, *s_iter, *s_prev = NULL;
	struct db_iter_state state;
	bool rm_flag = false;

	assert(db != NULL);

	/* do all our possible memory allocation up front so we don't have to
	 * worry about failure once we get to the point where we start updating
	 * the filter db */
	if (db->arch->size == ARCH_SIZE_64)
		s_new = _db_rule_gen_64(db->arch, rule);
	else if (db->arch->size == ARCH_SIZE_32)
		s_new = _db_rule_gen_32(db->arch, rule);
	else
		return -EFAULT;
	if (s_new == NULL)
		return -ENOMEM;

	/* find a matching syscall/chain or insert a new one */
	s_iter = db->syscalls;
	while (s_iter != NULL && s_iter->num < rule->syscall) {
		s_prev = s_iter;
		s_iter = s_iter->next;
	}
	s_new->priority = _DB_PRI_MASK_CHAIN - s_new->node_cnt;
add_reset:
	if (s_iter == NULL || s_iter->num != rule->syscall) {
		/* new syscall, add before s_iter */
		if (s_prev != NULL) {
			s_new->next = s_prev->next;
			s_prev->next = s_new;
		} else {
			s_new->next = db->syscalls;
			db->syscalls = s_new;
		}
		return 0;
	} else if (s_iter->chains == NULL) {
		if (rm_flag || !s_iter->valid) {
			/* we are here because our previous pass cleared the
			 * entire syscall chain when searching for a subtree
			 * match or the existing syscall entry is a phantom,
			 * so either way add the new chain */
			s_iter->chains = s_new->chains;
			s_iter->action = s_new->action;
			s_iter->node_cnt = s_new->node_cnt;
			if (s_iter->valid)
				s_iter->priority = s_new->priority;
			s_iter->valid = true;
			free(s_new);
			rc = 0;
			goto add_priority_update;
		} else {
			/* syscall exists without any chains - existing filter
			 * is at least as large as the new entry so cleanup and
			 * exit */
			_db_tree_put(&s_new->chains);
			free(s_new);
			goto add_free_ok;
		}
	} else if (s_iter->chains != NULL && s_new->chains == NULL) {
		/* syscall exists with chains but the new filter has no chains
		 * so we need to clear the existing chains and exit */
		_db_tree_put(&s_iter->chains);
		s_iter->chains = NULL;
		s_iter->node_cnt = 0;
		s_iter->action = rule->action;

		/* cleanup the new tree and return */
		_db_tree_put(&s_new->chains);
		free(s_new);
		goto add_free_ok;
	}

	/* prune any sub-trees that are no longer required */
	memset(&state, 0, sizeof(state));
	state.sx = s_iter;
	state.action = rule->action;
	rc = _db_tree_prune(&s_iter->chains, s_new->chains, &state);
	if (rc > 0) {
		/* we pruned at least some of the existing tree */
		rm_flag = true;
		s_iter->node_cnt -= rc;
		if (s_iter->chains == NULL)
			/* we pruned the entire tree */
			goto add_reset;
	} else if ((state.flags & _DB_IST_M_REDUNDANT) == _DB_IST_M_REDUNDANT) {
		/* the existing tree is "shorter", drop the new one */
		_db_tree_put(&s_new->chains);
		free(s_new);
		goto add_free_ok;
	}

	/* add the new rule to the existing filter and cleanup */
	memset(&state, 0, sizeof(state));
	state.sx = s_iter;
	rc = _db_tree_add(&s_iter->chains, s_new->chains, &state);
	if (rc < 0)
		goto add_failure;
	s_iter->node_cnt += s_new->node_cnt;
	s_iter->node_cnt -= _db_tree_put(&s_new->chains);
	free(s_new);

add_free_ok:
	rc = 0;
add_priority_update:
	/* update the priority */
	if (s_iter != NULL) {
		s_iter->priority &= (~_DB_PRI_MASK_CHAIN);
		s_iter->priority |= (_DB_PRI_MASK_CHAIN - s_iter->node_cnt);
	}
	return rc;

add_failure:
	/* NOTE: another reminder that we don't do any db error recovery here,
	 * use the transaction mechanism as previously mentioned */
	_db_tree_put(&s_new->chains);
	free(s_new);
	return rc;
}

/**
 * Set the priority of a given syscall
 * @param col the filter collection
 * @param syscall the syscall number
 * @param priority priority value, higher value == higher priority
 *
 * This function sets the priority of the given syscall; this value is used
 * when generating the seccomp filter code such that higher priority syscalls
 * will incur less filter code overhead than the lower priority syscalls in the
 * filter.  Returns zero on success, negative values on failure.
 *
 */
int db_col_syscall_priority(struct db_filter_col *col,
			    int syscall, uint8_t priority)
{
	int rc = 0, rc_tmp;
	unsigned int iter;
	int sc_tmp;
	struct db_filter *filter;

	for (iter = 0; iter < col->filter_cnt; iter++) {
		filter = col->filters[iter];
		sc_tmp = syscall;

		rc_tmp = arch_syscall_translate(filter->arch, &sc_tmp);
		if (rc_tmp < 0)
			goto priority_failure;

		/* if this is a pseudo syscall then we need to rewrite the
		 * syscall for some arch specific reason, don't forget the
		 * special handling for syscall -1 */
		if (sc_tmp < -1) {
			/* we set this as a strict op - we don't really care
			 * since priorities are a "best effort" thing - as we
			 * want to catch the -EDOM error and bail on this
			 * architecture */
			rc_tmp = arch_syscall_rewrite(filter->arch, &sc_tmp);
			if (rc_tmp == -EDOM)
				continue;
			if (rc_tmp < 0)
				goto priority_failure;
		}

		rc_tmp = _db_syscall_priority(filter, sc_tmp, priority);

priority_failure:
		if (rc == 0 && rc_tmp < 0)
			rc = rc_tmp;
	}

	return rc;
}

/**
 * Add a new rule to the current filter
 * @param col the filter collection
 * @param strict the strict flag
 * @param action the filter action
 * @param syscall the syscall number
 * @param arg_cnt the number of argument filters in the argument filter chain
 * @param arg_array the argument filter chain, (uint, enum scmp_compare, ulong)
 *
 * This function adds a new argument/comparison/value to the seccomp filter for
 * a syscall; multiple arguments can be specified and they will be chained
 * together (essentially AND'd together) in the filter.  When the strict flag
 * is true the function will fail if the exact rule can not be added to the
 * filter, if the strict flag is false the function will not fail if the
 * function needs to adjust the rule due to architecture specifics.  Returns
 * zero on success, negative values on failure.
 *
 */
int db_col_rule_add(struct db_filter_col *col,
		    bool strict, uint32_t action, int syscall,
		    unsigned int arg_cnt, const struct scmp_arg_cmp *arg_array)
{
	int rc = 0, rc_tmp;
	unsigned int iter;
	unsigned int arg_num;
	size_t chain_size;
	struct db_api_arg *chain = NULL;
	struct scmp_arg_cmp arg_data;
	struct db_api_rule_list *rule, *rule_tmp;
	struct db_filter *db;

	/* collect the arguments for the filter rule */
	chain_size = sizeof(*chain) * ARG_COUNT_MAX;
	chain = zmalloc(chain_size);
	if (chain == NULL)
		return -ENOMEM;
	for (iter = 0; iter < arg_cnt; iter++) {
		arg_data = arg_array[iter];
		arg_num = arg_data.arg;
		if (arg_num < ARG_COUNT_MAX && chain[arg_num].valid == 0) {
			chain[arg_num].valid = 1;
			chain[arg_num].arg = arg_num;
			chain[arg_num].op = arg_data.op;
			/* TODO: we should check datum/mask size against the
			 *	 arch definition, e.g. 64 bit datum on x86 */
			switch (chain[arg_num].op) {
			case SCMP_CMP_NE:
			case SCMP_CMP_LT:
			case SCMP_CMP_LE:
			case SCMP_CMP_EQ:
			case SCMP_CMP_GE:
			case SCMP_CMP_GT:
				chain[arg_num].mask = DATUM_MAX;
				chain[arg_num].datum = arg_data.datum_a;
				break;
			case SCMP_CMP_MASKED_EQ:
				chain[arg_num].mask = arg_data.datum_a;
				chain[arg_num].datum = arg_data.datum_b;
				break;
			default:
				rc = -EINVAL;
				goto add_return;
			}
		} else {
			rc = -EINVAL;
			goto add_return;
		}
	}

	/* create a checkpoint */
	rc = db_col_transaction_start(col);
	if (rc != 0)
		goto add_return;

	/* add the rule to the different filters in the collection */
	for (iter = 0; iter < col->filter_cnt; iter++) {

		/* TODO: consolidate with db_col_transaction_start() */

		db = col->filters[iter];

		/* create the rule */
		rule = _db_rule_new(strict, action, syscall, chain);
		if (rule == NULL) {
			rc_tmp = -ENOMEM;
			goto add_arch_fail;
		}

		/* add the rule */
		rc_tmp = arch_filter_rule_add(db, rule);
		if (rc_tmp == 0) {
			/* insert the chain to the end of the rule list */
			rule_tmp = rule;
			while (rule_tmp->next)
				rule_tmp = rule_tmp->next;
			if (db->rules != NULL) {
				rule->prev = db->rules->prev;
				rule_tmp->next = db->rules;
				db->rules->prev->next = rule;
				db->rules->prev = rule_tmp;
			} else {
				rule->prev = rule_tmp;
				rule_tmp->next = rule;
				db->rules = rule;
			}
		} else
			free(rule);
add_arch_fail:
		if (rc_tmp != 0 && rc == 0)
			rc = rc_tmp;
	}

	/* commit the transaction or abort */
	if (rc == 0)
		db_col_transaction_commit(col);
	else
		db_col_transaction_abort(col);

add_return:
	if (chain != NULL)
		free(chain);
	return rc;
}

/**
 * Start a new seccomp filter transaction
 * @param col the filter collection
 *
 * This function starts a new seccomp filter transaction for the given filter
 * collection.  Returns zero on success, negative values on failure.
 *
 */
int db_col_transaction_start(struct db_filter_col *col)
{
	int rc;
	unsigned int iter;
	struct db_filter_snap *snap;
	struct db_filter *filter_o, *filter_s;
	struct db_api_rule_list *rule_o, *rule_s = NULL, *rule_tmp;

	/* allocate the snapshot */
	snap = zmalloc(sizeof(*snap));
	if (snap == NULL)
		return -ENOMEM;
	snap->filters = zmalloc(sizeof(struct db_filter *) * col->filter_cnt);
	if (snap->filters == NULL) {
		free(snap);
		return -ENOMEM;
	}
	snap->filter_cnt = col->filter_cnt;
	for (iter = 0; iter < snap->filter_cnt; iter++)
		snap->filters[iter] = NULL;
	snap->next = NULL;

	/* create a snapshot of the current filter state */
	for (iter = 0; iter < col->filter_cnt; iter++) {
		/* allocate a new filter */
		filter_o = col->filters[iter];
		filter_s = _db_init(filter_o->arch);
		if (filter_s == NULL)
			goto trans_start_failure;
		snap->filters[iter] = filter_s;

		/* create a filter snapshot from existing rules */
		rule_o = filter_o->rules;
		if (rule_o == NULL)
			continue;
		do {

			/* TODO: consolidate with db_col_rule_add() */

			/* duplicate the rule */
			rule_s = db_rule_dup(rule_o);
			if (rule_s == NULL)
				goto trans_start_failure;

			/* add the rule */
			rc = arch_filter_rule_add(filter_s, rule_s);
			if (rc != 0)
				goto trans_start_failure;

			/* insert the chain to the end of the rule list */
			rule_tmp = rule_s;
			while (rule_tmp->next)
				rule_tmp = rule_tmp->next;
			if (filter_s->rules != NULL) {
				rule_s->prev = filter_s->rules->prev;
				rule_tmp->next = filter_s->rules;
				filter_s->rules->prev->next = rule_s;
				filter_s->rules->prev = rule_tmp;
			} else {
				rule_s->prev = rule_tmp;
				rule_tmp->next = rule_s;
				filter_s->rules = rule_s;
			}
			rule_s = NULL;

			/* next rule */
			rule_o = rule_o->next;
		} while (rule_o != filter_o->rules);
	}

	/* add the snapshot to the list */
	snap->next = col->snapshots;
	col->snapshots = snap;

	return 0;

trans_start_failure:
	if (rule_s != NULL)
		free(rule_s);
	_db_snap_release(snap);
	return -ENOMEM;
}

/**
 * Abort the top most seccomp filter transaction
 * @param col the filter collection
 *
 * This function aborts the most recent seccomp filter transaction.
 *
 */
void db_col_transaction_abort(struct db_filter_col *col)
{
	int iter;
	unsigned int filter_cnt;
	struct db_filter **filters;
	struct db_filter_snap *snap;

	if (col->snapshots == NULL)
		return;

	/* replace the current filter with the last snapshot */
	snap = col->snapshots;
	col->snapshots = snap->next;
	filter_cnt = col->filter_cnt;
	filters = col->filters;
	col->filter_cnt = snap->filter_cnt;
	col->filters = snap->filters;
	free(snap);

	/* free the filter we swapped out */
	for (iter = 0; iter < filter_cnt; iter++)
		_db_release(filters[iter]);
	free(filters);
}

/**
 * Commit the top most seccomp filter transaction
 * @param col the filter collection
 *
 * This function commits the most recent seccomp filter transaction.
 *
 */
void db_col_transaction_commit(struct db_filter_col *col)
{
	struct db_filter_snap *snap;

	snap = col->snapshots;
	col->snapshots = snap->next;
	_db_snap_release(snap);
}