/*
 * IPv4 Address Conflict Detection
 *
 * This file implements the probe object. A probe is basically the
 * state-machine of a single ACD run. It takes an address to probe for, checks
 * for conflicts and then defends it once configured.
 */

#include <assert.h>
#include <c-rbtree.h>
#include <c-stdaux.h>
#include <endian.h>
#include <errno.h>
#include <inttypes.h>
#include <limits.h>
#include <netinet/if_ether.h>
#include <netinet/in.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/types.h>
#include <unistd.h>
#include "n-acd.h"
#include "n-acd-private.h"

/*
 * These parameters and timing intervals are specified in RFC-5227. The
 * original values are:
 *
 *     PROBE_NUM                                3
 *     PROBE_WAIT                               1s
 *     PROBE_MIN                                1s
 *     PROBE_MAX                                3s
 *     ANNOUNCE_NUM                             3
 *     ANNOUNCE_WAIT                            2s
 *     ANNOUNCE_INTERVAL                        2s
 *     MAX_CONFLICTS                            10
 *     RATE_LIMIT_INTERVAL                      60s
 *     DEFEND_INTERVAL                          10s
 *
 * If we assume a best-case and worst-case scenario for non-conflicted runs, we
 * end up with a runtime between 4s and 9s to finish the probe. Then it still
 * takes a fixed 4s to finish the announcements.
 *
 * RFC 5227 section 1.1:
 *     [...] (Note that the values listed here are fixed constants; they are
 *     not intended to be modifiable by implementers, operators, or end users.
 *     These constants are given symbolic names here to facilitate the writing
 *     of future standards that may want to reference this document with
 *     different values for these named constants; however, at the present time
 *     no such future standards exist.) [...]
 *
 * Unfortunately, no-one ever stepped up to write a "future standard" to revise
 * the timings. A 9s timeout for successful link setups is not acceptable today.
 * Hence, we will just go forward and ignore the proposed values. On both
 * wired and wireless local links round-trip latencies of below 3ms are common.
 * We require the caller to set a timeout multiplier, where 1 corresponds to a
 * total probe time between 0.5 ms and 1.0 ms. On modern networks a multiplier
 * of about 100 should be a reasonable default. To comply with the RFC select a
 * multiplier of 9000.
 */
#define N_ACD_RFC_PROBE_NUM                     (3)
#define N_ACD_RFC_PROBE_WAIT_NSEC               (UINT64_C(111111)) /* 1/9 ms */
#define N_ACD_RFC_PROBE_MIN_NSEC                (UINT64_C(111111)) /* 1/9 ms */
#define N_ACD_RFC_PROBE_MAX_NSEC                (UINT64_C(333333)) /* 3/9 ms */
#define N_ACD_RFC_ANNOUNCE_NUM                  (3)
#define N_ACD_RFC_ANNOUNCE_WAIT_NSEC            (UINT64_C(222222)) /* 2/9 ms */
#define N_ACD_RFC_ANNOUNCE_INTERVAL_NSEC        (UINT64_C(222222)) /* 2/9 ms */
#define N_ACD_RFC_MAX_CONFLICTS                 (10)
#define N_ACD_RFC_RATE_LIMIT_INTERVAL_NSEC      (UINT64_C(60000000000)) /* 60s */
#define N_ACD_RFC_DEFEND_INTERVAL_NSEC          (UINT64_C(10000000000)) /* 10s */

/**
 * n_acd_probe_config_new() - create probe configuration
 * @configp:                    output argument for new probe configuration
 *
 * This creates a new probe configuration. It will be returned in @configp to
 * the caller, which upon return fully owns the object.
 *
 * A probe configuration collects parameters for probes. It never validates the
 * input, but this is left to the consumer of the configuration to do.
 *
 * Return: 0 on success, negative error code on failure.
 */
_c_public_ int n_acd_probe_config_new(NAcdProbeConfig **configp) {
        _c_cleanup_(n_acd_probe_config_freep) NAcdProbeConfig *config = NULL;

        config = malloc(sizeof(*config));
        if (!config)
                return -ENOMEM;

        *config = (NAcdProbeConfig)N_ACD_PROBE_CONFIG_NULL(*config);

        *configp = config;
        config = NULL;
        return 0;
}

/**
 * n_acd_probe_config_free() - destroy probe configuration
 * @config:                     configuration to operate on, or NULL
 *
 * This destroys the probe configuration and all associated objects. If @config
 * is NULL, this is a no-op.
 *
 * Return: NULL is returned.
 */
_c_public_ NAcdProbeConfig *n_acd_probe_config_free(NAcdProbeConfig *config) {
        if (!config)
                return NULL;

        free(config);

        return NULL;
}

/**
 * n_acd_probe_config_set_ip() - set ip property
 * @config:                     configuration to operate on
 * @ip:                         ip to set
 *
 * This sets the IP property to the value `ip`. The address is copied into the
 * configuration object. No validation is performed.
 *
 * The IP property selects the IP address that a probe checks for. It is the
 * caller's responsibility to guarantee the address is valid and can be used.
 */
_c_public_ void n_acd_probe_config_set_ip(NAcdProbeConfig *config, struct in_addr ip) {
        config->ip = ip;
}

/**
 * n_acd_probe_config_set_timeout() - set timeout property
 * @config:                     configuration to operate on
 * @msecs:                      timeout to set, in milliseconds
 *
 * This sets the timeout to use for a conflict detection probe. The
 * specification default is provided as `N_ACD_TIMEOUT_RFC5227` and corresponds
 * to 9 seconds.
 *
 * If set to 0, conflict detection is skipped and the address is immediately
 * advertised and defended.
 *
 * Depending on the transport used, the API user should select a suitable
 * timeout. Since `ACD` only operates on the link layer, timeouts in the
 * hundreds of milliseconds range should be more than enough for any modern
 * network. Note that increasing this value directly affects the time it takes
 * to connect to a network, since an address should not be used unless conflict
 * detection finishes.
 *
 * Using the specification default is **discouraged**. It is way too slow and
 * not appropriate for modern networks.
 *
 * Default value is `N_ACD_TIMEOUT_RFC5227`.
 */
_c_public_ void n_acd_probe_config_set_timeout(NAcdProbeConfig *config, uint64_t msecs) {
        config->timeout_msecs = msecs;
}

static void n_acd_probe_schedule(NAcdProbe *probe, uint64_t n_timeout, unsigned int n_jitter) {
        uint64_t n_time;

        timer_now(&probe->acd->timer, &n_time);
        n_time += n_timeout;

        /*
         * ACD specifies jitter values to reduce packet storms on the local
         * link. This call accepts the maximum relative jitter value in
         * nanoseconds as @n_jitter. We then use rand_r(3p) to get a
         * pseudo-random jitter on top of the real timeout given as @n_timeout.
         */
        if (n_jitter) {
                uint64_t random;

                random = ((uint64_t)rand_r(&probe->acd->seed) << 32) | (uint64_t)rand_r(&probe->acd->seed);
                n_time += random % n_jitter;
        }

        timeout_schedule(&probe->timeout, &probe->acd->timer, n_time);
}

static void n_acd_probe_unschedule(NAcdProbe *probe) {
        timeout_unschedule(&probe->timeout);
}

static bool n_acd_probe_is_unique(NAcdProbe *probe) {
        NAcdProbe *sibling;

        if (!c_rbnode_is_linked(&probe->ip_node))
                return false;

        sibling = c_rbnode_entry(c_rbnode_next(&probe->ip_node), NAcdProbe, ip_node);
        if (sibling && sibling->ip.s_addr == probe->ip.s_addr)
                return false;

        sibling = c_rbnode_entry(c_rbnode_prev(&probe->ip_node), NAcdProbe, ip_node);
        if (sibling && sibling->ip.s_addr == probe->ip.s_addr)
                return false;

        return true;
}

static int n_acd_probe_link(NAcdProbe *probe) {
        int r;

        /*
         * Make sure the kernel bpf map has space for at least one more
         * entry.
         */
        r = n_acd_ensure_bpf_map_space(probe->acd);
        if (r)
                return r;

        /*
         * Link entry into context, indexed by its IP. Note that we allow
         * duplicates just fine. It is up to you to decide whether to avoid
         * duplicates, if you don't want them. Duplicates on the same context
         * do not conflict with each other, though.
         */
        {
                CRBNode **slot, *parent;
                NAcdProbe *other;

                slot = &probe->acd->ip_tree.root;
                parent = NULL;
                while (*slot) {
                        other = c_rbnode_entry(*slot, NAcdProbe, ip_node);
                        parent = *slot;
                        if (probe->ip.s_addr < other->ip.s_addr)
                                slot = &(*slot)->left;
                        else
                                slot = &(*slot)->right;
                }

                c_rbtree_add(&probe->acd->ip_tree, parent, slot, &probe->ip_node);
        }

        /*
         * Add the ip address to the map, if it is not already there.
         */
        if (n_acd_probe_is_unique(probe)) {
                r = n_acd_bpf_map_add(probe->acd->fd_bpf_map, &probe->ip);
                if (r) {
                        /*
                         * Make sure the IP address is linked in userspace iff
                         * it is linked in the kernel.
                         */
                        c_rbnode_unlink(&probe->ip_node);
                        return r;
                }
                ++probe->acd->n_bpf_map;
        }

        return 0;
}

static void n_acd_probe_unlink(NAcdProbe *probe) {
        int r;

        /*
         * If this is the only probe for a given IP, remove the IP from the
         * kernel BPF map.
         */
        if (n_acd_probe_is_unique(probe)) {
                r = n_acd_bpf_map_remove(probe->acd->fd_bpf_map, &probe->ip);
                c_assert(r >= 0);
                --probe->acd->n_bpf_map;
        }
        c_rbnode_unlink(&probe->ip_node);
}

int n_acd_probe_new(NAcdProbe **probep, NAcd *acd, NAcdProbeConfig *config) {
        _c_cleanup_(n_acd_probe_freep) NAcdProbe *probe = NULL;
        int r;

        if (!config->ip.s_addr)
                return N_ACD_E_INVALID_ARGUMENT;

        probe = malloc(sizeof(*probe));
        if (!probe)
                return -ENOMEM;

        *probe = (NAcdProbe)N_ACD_PROBE_NULL(*probe);
        probe->acd = n_acd_ref(acd);
        probe->ip = config->ip;

        /*
         * We use the provided timeout-length as multiplier for all our
         * timeouts. The provided timeout defines the maximum length of an
         * entire probe-interval until the first announcement. Given the
         * spec-provided parameters, this ends up as:
         *
         *     PROBE_WAIT + PROBE_MAX + PROBE_MAX + ANNOUNCE_WAIT
         *   =         1s +        3s +        3s +            2s
         *   = 9s
         *
         * Hence, the default value for this timeout is 9000ms, which just
         * ends up matching the spec-provided values.
         *
         * What we now semantically do is divide this timeout by 1ns/1000000.
         * This first turns it into nanoseconds, then strips the unit by
         * turning it into a multiplier. However, rather than performing the
         * division here, we multiplier all our timeouts by 1000000 statically
         * at compile time. Therefore, we can use the user-provided timeout as
         * unmodified multiplier. No conversion necessary.
         */
        probe->timeout_multiplier = config->timeout_msecs;

        r = n_acd_probe_link(probe);
        if (r)
                return r;

        /*
         * Now that everything is set up, we have to send the first probe. This
         * is done after ~PROBE_WAIT seconds, hence we schedule our timer.
         * In case no timeout-multiplier is set, we pretend we already sent all
         * probes successfully and schedule the timer so we proceed with the
         * announcements. We must schedule a fake timer there, since we are not
         * allowed to advance the state machine outside of n_acd_dispatch().
         */
        if (probe->timeout_multiplier) {
                probe->n_iteration = 0;
                n_acd_probe_schedule(probe,
                                     0,
                                     probe->timeout_multiplier * N_ACD_RFC_PROBE_WAIT_NSEC);
        } else {
                probe->n_iteration = N_ACD_RFC_PROBE_NUM;
                n_acd_probe_schedule(probe, 0, 0);
        }

        *probep = probe;
        probe = NULL;
        return 0;
}

/**
 * n_acd_probe_free() - destroy a probe
 * @probe:                      probe to operate on, or NULL
 *
 * This destroys the probe specified by @probe. All operations are immediately
 * ceded and all associated objects are released.
 *
 * If @probe is NULL, this is a no-op.
 *
 * This function will flush all events associated with @probe from the event
 * queue. That is, no events will be returned for this @probe anymore.
 *
 * Return: NULL is returned.
 */
_c_public_ NAcdProbe *n_acd_probe_free(NAcdProbe *probe) {
        NAcdEventNode *node, *t_node;

        if (!probe)
                return NULL;

        c_list_for_each_entry_safe(node, t_node, &probe->event_list, probe_link)
                n_acd_event_node_free(node);

        n_acd_probe_unschedule(probe);
        n_acd_probe_unlink(probe);
        probe->acd = n_acd_unref(probe->acd);
        free(probe);

        return NULL;
}

int n_acd_probe_raise(NAcdProbe *probe, NAcdEventNode **nodep, unsigned int event) {
        _c_cleanup_(n_acd_event_node_freep) NAcdEventNode *node = NULL;
        int r;

        r = n_acd_raise(probe->acd, &node, event);
        if (r)
                return r;

        switch (event) {
        case N_ACD_EVENT_READY:
                node->event.ready.probe = probe;
                break;
        case N_ACD_EVENT_USED:
                node->event.used.probe = probe;
                break;
        case N_ACD_EVENT_DEFENDED:
                node->event.defended.probe = probe;
                break;
        case N_ACD_EVENT_CONFLICT:
                node->event.conflict.probe = probe;
                break;
        default:
                c_assert(0);
                return -ENOTRECOVERABLE;
        }

        c_list_link_tail(&probe->event_list, &node->probe_link);

        if (nodep)
                *nodep = node;
        node = NULL;
        return 0;
}

int n_acd_probe_handle_timeout(NAcdProbe *probe) {
        int r;

        switch (probe->state) {
        case N_ACD_PROBE_STATE_PROBING:
                /*
                 * We are still PROBING. We send 3 probes with a random timeout
                 * scheduled between each. If, after a fixed timeout, we did
                 * not receive any conflict we consider the probing successful.
                 */
                if (probe->n_iteration < N_ACD_RFC_PROBE_NUM) {
                        /*
                         * We have not sent all 3 probes, yet. A timer fired,
                         * so we are ready to send the next probe. If this is
                         * the third probe, schedule a timer for ANNOUNCE_WAIT
                         * to give other peers a chance to answer. If this is
                         * not the third probe, wait between PROBE_MIN and
                         * PROBE_MAX for the next probe.
                         */

                        r = n_acd_send(probe->acd, &probe->ip, NULL);
                        if (r) {
                                if (r != -N_ACD_E_DROPPED)
                                        return r;

                                /*
                                 * Packet was dropped, and we know about it. It
                                 * never reached the network. Reasons are
                                 * manifold, and n_acd_send() raises events if
                                 * necessary.
                                 * From a probe-perspective, we simply pretend
                                 * we never sent the probe and schedule a
                                 * timeout for the next probe, effectively
                                 * doubling a single probe-interval.
                                 */
                        } else {
                                /* Successfully sent, so advance counter. */
                                ++probe->n_iteration;
                        }

                        if (probe->n_iteration < N_ACD_RFC_PROBE_NUM)
                                n_acd_probe_schedule(probe,
                                                     probe->timeout_multiplier * N_ACD_RFC_PROBE_MIN_NSEC,
                                                     probe->timeout_multiplier * (N_ACD_RFC_PROBE_MAX_NSEC - N_ACD_RFC_PROBE_MIN_NSEC));
                        else
                                n_acd_probe_schedule(probe,
                                                     probe->timeout_multiplier * N_ACD_RFC_ANNOUNCE_WAIT_NSEC,
                                                     0);
                } else {
                        /*
                         * All 3 probes succeeded and we waited enough to
                         * consider this address usable by now. Do not announce
                         * the address, yet. We must first give the caller a
                         * chance to configure the address (so they can answer
                         * ARP requests), before announcing it.
                         */
                        r = n_acd_probe_raise(probe, NULL, N_ACD_EVENT_READY);
                        if (r)
                                return r;

                        probe->state = N_ACD_PROBE_STATE_CONFIGURING;
                }

                break;

        case N_ACD_PROBE_STATE_ANNOUNCING:
                /*
                 * We are ANNOUNCING, meaning the caller configured the address
                 * on the interface and is actively using it. We send 3
                 * announcements out, in a short interval, and then just
                 * perform passive conflict detection.
                 * Note that once all 3 announcements are sent, we no longer
                 * schedule a timer, so this part should not trigger, anymore.
                 */

                r = n_acd_send(probe->acd, &probe->ip, &probe->ip);
                if (r) {
                        if (r != -N_ACD_E_DROPPED)
                                return r;

                        /*
                         * See above in STATE_PROBING for details. We know the
                         * packet was never sent, so we simply try again after
                         * extending the timer.
                         */
                } else {
                        /* Successfully sent, so advance counter. */
                        ++probe->n_iteration;
                }

                if (probe->n_iteration < N_ACD_RFC_ANNOUNCE_NUM) {
                        /*
                         * Announcements are always scheduled according to the
                         * time-intervals specified in the spec. We always use
                         * the RFC5227-mandated multiplier.
                         * If you reconsider this, note that timeout_multiplier
                         * might be 0 here.
                         */
                        n_acd_probe_schedule(probe,
                                             N_ACD_TIMEOUT_RFC5227 * N_ACD_RFC_ANNOUNCE_INTERVAL_NSEC,
                                             0);
                }

                break;

        case N_ACD_PROBE_STATE_CONFIGURING:
        case N_ACD_PROBE_STATE_FAILED:
        default:
                /*
                 * There are no timeouts in these states. If we trigger one,
                 * something is fishy.
                 */
                c_assert(0);
                return -ENOTRECOVERABLE;
        }

        return 0;
}

int n_acd_probe_handle_packet(NAcdProbe *probe, struct ether_arp *packet, bool hard_conflict) {
        NAcdEventNode *node;
        uint64_t now;
        int r;

        timer_now(&probe->acd->timer, &now);

        switch (probe->state) {
        case N_ACD_PROBE_STATE_PROBING:
                /*
                 * Regardless whether this is a hard or soft conflict, we must
                 * treat this as a probe failure. That is, notify the caller of
                 * the conflict and wait for further instructions. We do not
                 * react to this, until the caller tells us what to do, but we
                 * do stop sending further probes.
                 */
                r = n_acd_probe_raise(probe, &node, N_ACD_EVENT_USED);
                if (r)
                        return r;

                node->event.used.sender = node->sender;
                node->event.used.n_sender = ETH_ALEN;
                memcpy(node->sender, packet->arp_sha, ETH_ALEN);

                n_acd_probe_unschedule(probe);
                n_acd_probe_unlink(probe);
                probe->state = N_ACD_PROBE_STATE_FAILED;

                break;

        case N_ACD_PROBE_STATE_CONFIGURING:
                /*
                 * We are waiting for the caller to configure the interface and
                 * start ANNOUNCING. In this state, we cannot defend the
                 * address as that would indicate that it is ready to be used,
                 * and we cannot signal CONFLICT or USED as the caller may
                 * already have started to use the address (and may have
                 * configured the engine to always defend it, which means they
                 * should be able to rely on never losing it after READY).
                 * Simply drop the event, and rely on the anticipated ANNOUNCE
                 * to trigger it again.
                 */

                break;

        case N_ACD_PROBE_STATE_ANNOUNCING: {
                /*
                 * We were already instructed to announce the address, which
                 * means the address is configured and in use. Hence, the
                 * caller is responsible to serve regular ARP queries. Meaning,
                 * we can ignore any soft conflicts (other peers doing ACD).
                 *
                 * But if we see a hard-conflict, we either defend the address
                 * according to the caller's instructions, or we report the
                 * conflict and bail out.
                 */
                bool conflict = false, rate_limited = false;

                if (!hard_conflict)
                        break;

                rate_limited = now < probe->last_defend + N_ACD_RFC_DEFEND_INTERVAL_NSEC;

                switch (probe->defend) {
                case N_ACD_DEFEND_NEVER:
                        conflict = true;
                        break;
                case N_ACD_DEFEND_ONCE:
                        if (rate_limited) {
                                conflict = true;
                                break;
                        }

                        /* fallthrough */
                case N_ACD_DEFEND_ALWAYS:
                        if (!rate_limited) {
                                r = n_acd_send(probe->acd, &probe->ip, &probe->ip);
                                if (r) {
                                        if (r != -N_ACD_E_DROPPED)
                                                return r;

                                        if (probe->defend == N_ACD_DEFEND_ONCE) {
                                                conflict = true;
                                                break;
                                        }
                                }

                                if (r != -N_ACD_E_DROPPED)
                                        probe->last_defend = now;
                        }

                        r = n_acd_probe_raise(probe, &node, N_ACD_EVENT_DEFENDED);
                        if (r)
                                return r;

                        node->event.defended.sender = node->sender;
                        node->event.defended.n_sender = ETH_ALEN;
                        memcpy(node->sender, packet->arp_sha, ETH_ALEN);

                        break;
                }

                if (conflict) {
                        r = n_acd_probe_raise(probe, &node, N_ACD_EVENT_CONFLICT);
                        if (r)
                                return r;

                        node->event.conflict.sender = node->sender;
                        node->event.conflict.n_sender = ETH_ALEN;
                        memcpy(node->sender, packet->arp_sha, ETH_ALEN);

                        n_acd_probe_unschedule(probe);
                        n_acd_probe_unlink(probe);
                        probe->state = N_ACD_PROBE_STATE_FAILED;
                }

                break;
        }

        case N_ACD_PROBE_STATE_FAILED:
        default:
                /*
                 * We are not listening for packets in these states. If we receive one,
                 * something is fishy.
                 */
                c_assert(0);
                return -ENOTRECOVERABLE;
        }

        return 0;
}

/**
 * n_acd_probe_set_userdata - set userdata
 * @probe:                      probe to operate on
 * @userdata:                   userdata pointer
 *
 * This can be used to set a caller-controlled user-data pointer on @probe. The
 * value of the pointer is never inspected or used by `n-acd` and is fully
 * under control of the caller.
 *
 * The default value is NULL.
 */
_c_public_ void n_acd_probe_set_userdata(NAcdProbe *probe, void *userdata) {
        probe->userdata = userdata;
}

/**
 * n_acd_probe_get_userdata - get userdata
 * @probe:                      probe to operate on
 *
 * This queries the userdata pointer that was previously set through
 * n_acd_probe_set_userdata().
 *
 * The default value is NULL.
 *
 * Return: The stored userdata pointer is returned.
 */
_c_public_ void n_acd_probe_get_userdata(NAcdProbe *probe, void **userdatap) {
        *userdatap = probe->userdata;
}

/**
 * n_acd_probe_announce() - announce the configured IP address
 * @probe:                      probe to operate on
 * @defend:                     defense policy
 *
 * Announce the IP address on the local link, and start defending it according
 * to the given policy, which mut be one of N_ACD_DEFEND_ONCE,
 * N_ACD_DEFEND_NEVER, or N_ACD_DEFEND_ALWAYS.
 *
 * This must be called in response to an N_ACD_EVENT_READY event, and only
 * after the given address has been configured on the given network interface.
 *
 * Return: 0 on success, N_ACD_E_INVALID_ARGUMENT in case the defense policy
 *         is invalid, negative error code on failure.
 */
_c_public_ int n_acd_probe_announce(NAcdProbe *probe, unsigned int defend) {
        if (defend >= _N_ACD_DEFEND_N)
                return N_ACD_E_INVALID_ARGUMENT;

        probe->state = N_ACD_PROBE_STATE_ANNOUNCING;
        probe->defend = defend;
        probe->n_iteration = 0;

        /*
         * We must schedule a fake-timeout, since we are not allowed to
         * advance the state-machine outside of n_acd_dispatch().
         */
        n_acd_probe_schedule(probe, 0, 0);

        return 0;
}