Blob Blame History Raw
/*
 * regexp.c:
 *
 * Copyright (C) 2007-2016 David Lutterkort
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307  USA
 *
 * Author: David Lutterkort <dlutter@redhat.com>
 */

#include <config.h>
#include <regex.h>

#include "internal.h"
#include "syntax.h"
#include "memory.h"
#include "errcode.h"

static const struct string empty_pattern_string = {
    .ref = REF_MAX, .str = (char *) "()"
};

static const struct string *const empty_pattern = &empty_pattern_string;

char *regexp_escape(const struct regexp *r) {
    char *pat = NULL;

    if (r == NULL)
        return strdup("");

#if !HAVE_USELOCALE
    char *nre = NULL;
    int ret;
    size_t nre_len;

    /* Use a range with from > to to force conversion of ranges into
     * short form */
    ret = fa_restrict_alphabet(r->pattern->str, strlen(r->pattern->str),
                               &nre, &nre_len, 2, 1);
    if (ret == 0) {
        pat = escape(nre, nre_len, RX_ESCAPES);
        free(nre);
    }
#endif

    if (pat == NULL) {
        /* simplify the regexp by removing some artifacts of reserving
           chanaracters for internal purposes */
        if (index(r->pattern->str, RESERVED_FROM_CH)) {
            char *s = strdup(r->pattern->str);
            char *t = s;
            for (char *p = s; *p; p++) {
                if (STREQLEN(p, RESERVED_RANGE_RX, strlen(RESERVED_RANGE_RX))) {
                    /* Completely eliminate mentions of the reserved range */
                    p += strlen(RESERVED_RANGE_RX);
                } else if (STREQLEN(p,
                                    RESERVED_DOT_RX, strlen(RESERVED_DOT_RX))) {
                    /* Replace what amounts to a '.' by one */
                    p += strlen(RESERVED_DOT_RX);
                    *t++ = '.';
                }
                *t++ = *p;
            }
            *t = '\0';
            pat = escape(s, -1, RX_ESCAPES);
            free(s);
        } else {
            pat = escape(r->pattern->str, -1, RX_ESCAPES);
        }
    }

    if (pat == NULL)
        return NULL;

    /* Remove unneeded '()' from pat */
    for (int changed = 1; changed;) {
        changed = 0;
        for (char *p = pat; *p != '\0'; p++) {
            if (*p == '(' && p[1] == ')') {
                memmove(p, p+2, strlen(p+2)+1);
                changed = 1;
            }
        }
    }

    if (pat[0] == '(' && pat[strlen(pat)-1] == ')') {
        int level = 1;
        for (int i=1; i < strlen(pat)-1; i++) {
            if (pat[i] == '(')
                level += 1;
            if (pat[i] == ')')
                level -= 1;
            if (level == 0)
                break;
        }
        if (level == 1) {
            memmove(pat, pat+1, strlen(pat+1)+1);
            pat[strlen(pat)-1] = '\0';
        }
    }

    return pat;
}

void print_regexp(FILE *out, struct regexp *r) {
    if (r == NULL) {
        fprintf(out, "<NULL>");
        return;
    }

    fputc('/', out);
    if (r->pattern == NULL)
        fprintf(out, "%p", r);
    else {
        char *rx;
        size_t rx_len;
        fa_restrict_alphabet(r->pattern->str, strlen(r->pattern->str),
                             &rx, &rx_len, 2, 1);
        print_chars(out, rx, rx_len);
        FREE(rx);
    }
    fputc('/', out);
    if (r->nocase)
        fputc('i', out);
}

struct regexp *
make_regexp_unescape(struct info *info, const char *pat, int nocase) {
    char *p = unescape(pat, strlen(pat), NULL);

    if (p == NULL)
        return NULL;
    return make_regexp(info, p, nocase);
}

struct regexp *make_regexp(struct info *info, char *pat, int nocase) {
    struct regexp *regexp;

    make_ref(regexp);
    regexp->info = ref(info);

    make_ref(regexp->pattern);
    regexp->pattern->str = pat;
    regexp->nocase = nocase;
    return regexp;
}

/* Take a POSIX glob and turn it into a regexp. The regexp is constructed
 * by doing the following translations of characters in the string:
 *  * -> [^/]*
 *  ? -> [^/]
 *  leave characters escaped with a backslash alone
 *  escape any of ".|{}()+^$" with a backslash
 *
 * Note that that ignores some of the finer points of globs, like
 * complementation.
 */
struct regexp *make_regexp_from_glob(struct info *info, const char *glob) {
    static const char *const star = "[^/]*";
    static const char *const qmark = "[^/]";
    static const char *const special = ".|{}()+^$";
    int newlen = strlen(glob);
    char *pat = NULL;

    for (const char *s = glob; *s; s++) {
        if (*s == '\\' && *(s+1))
            s += 1;
        else if (*s == '*')
            newlen += strlen(star)-1;
        else if (*s == '?')
            newlen += strlen(qmark)-1;
        else if (strchr(special, *s) != NULL)
            newlen += 1;
    }

    if (ALLOC_N(pat, newlen + 1) < 0)
        return NULL;

    char *t = pat;
    for (const char *s = glob; *s; s++) {
        if (*s == '\\' && *(s+1)) {
            *t++ = *s++;
            *t++ = *s;
        } else if (*s == '*') {
            t = stpcpy(t, star);
        } else if (*s == '?') {
            t = stpcpy(t, qmark);
        } else if (strchr(special, *s) != NULL) {
            *t++ = '\\';
            *t++ = *s;
        } else {
            *t++ = *s;
        }
    }

    return make_regexp(info, pat, 0);
}

void free_regexp(struct regexp *regexp) {
    if (regexp == NULL)
        return;
    assert(regexp->ref == 0);
    unref(regexp->info, info);
    unref(regexp->pattern, string);
    if (regexp->re != NULL) {
        regfree(regexp->re);
        free(regexp->re);
    }
    free(regexp);
}

int regexp_is_empty_pattern(struct regexp *r) {
    for (char *s = r->pattern->str; *s; s++) {
        if (*s != '(' && *s != ')')
            return 0;
    }
    return 1;
}

struct regexp *make_regexp_literal(struct info *info, const char *text) {
    char *pattern, *p;

    /* Escape special characters in text since it should be taken
       literally */
    if (ALLOC_N(pattern, 2*strlen(text)+1) < 0)
        return NULL;
    p = pattern;
    for (const char *t = text; *t != '\0'; t++) {
        if ((*t == '\\') && t[1]) {
            *p++ = *t++;
            *p++ = *t;
        } else if (strchr(".|{}[]()+*?", *t) != NULL) {
            *p++ = '\\';
            *p++ = *t;
        } else {
            *p++ = *t;
        }
    }
    return make_regexp(info, pattern, 0);
}

struct regexp *
regexp_union(struct info *info, struct regexp *r1, struct regexp *r2) {
    struct regexp *r[2];

    r[0] = r1;
    r[1] = r2;
    return regexp_union_n(info, 2, r);
}

char *regexp_expand_nocase(struct regexp *r) {
    const char *p = r->pattern->str, *t;
    char *s = NULL;
    size_t len;
    int ret;
    int psub = 0, rsub = 0;

    if (! r->nocase)
        return strdup(p);

    ret = fa_expand_nocase(p, strlen(p), &s, &len);
    ERR_NOMEM(ret == REG_ESPACE, r->info);
    BUG_ON(ret != REG_NOERROR, r->info, NULL);

    /* Make sure that r->pattern->str and ret have the same number
     * of parentheses/groups, since our parser critically depends
     * on the fact that the regexp for a union/concat and those
     * of its children have groups that are in direct relation */
    for (t = p; *t; t++) if (*t == '(') psub += 1;
    for (t = s; *t; t++) if (*t == '(') rsub += 1;
    BUG_ON(psub < rsub, r->info, NULL);
    psub -= rsub;
    if (psub > 0) {
        char *adjusted = NULL, *a;
        if (ALLOC_N(adjusted, strlen(s) + 2*psub + 1) < 0)
            ERR_NOMEM(true, r->info);
        a = adjusted;
        for (int i=0; i < psub; i++) *a++ = '(';
        a = stpcpy(a, s);
        for (int i=0; i < psub; i++) *a++ = ')';
        free(s);
        s = adjusted;
    }
 error:
    return s;
}

static char *append_expanded(struct regexp *r, char **pat, char *p,
                             size_t *len) {
    char *expanded = NULL;
    size_t ofs = p - *pat;
    int ret;

    expanded = regexp_expand_nocase(r);
    ERR_BAIL(r->info);

    *len += strlen(expanded) - strlen(r->pattern->str);

    ret = REALLOC_N(*pat, *len);
    ERR_NOMEM(ret < 0, r->info);

    p = stpcpy(*pat + ofs, expanded);
 error:
    FREE(expanded);
    return p;
}

struct regexp *
regexp_union_n(struct info *info, int n, struct regexp **r) {
    size_t len = 0;
    char *pat = NULL, *p, *expanded = NULL;
    int nnocase = 0, npresent = 0;

    for (int i=0; i < n; i++)
        if (r[i] != NULL) {
            len += strlen(r[i]->pattern->str) + strlen("()|");
            npresent += 1;
            if (r[i]->nocase)
                nnocase += 1;
        }

    bool mixedcase = nnocase > 0 && nnocase < npresent;

    if (len == 0)
        return NULL;

    if (ALLOC_N(pat, len) < 0)
        return NULL;

    p = pat;
    int added = 0;
    for (int i=0; i < n; i++) {
        if (r[i] == NULL)
            continue;
        if (added > 0)
            *p++ = '|';
        *p++ = '(';
        if (mixedcase && r[i]->nocase) {
            p = append_expanded(r[i], &pat, p, &len);
            ERR_BAIL(r[i]->info);
        } else {
            p = stpcpy(p, r[i]->pattern->str);
        }
        *p++ = ')';
        added += 1;
    }
    *p = '\0';
    return make_regexp(info, pat, nnocase == npresent);
 error:
    FREE(expanded);
    FREE(pat);
    return NULL;
}

struct regexp *
regexp_concat(struct info *info, struct regexp *r1, struct regexp *r2) {
    struct regexp *r[2];

    r[0] = r1;
    r[1] = r2;
    return regexp_concat_n(info, 2, r);
}

struct regexp *
regexp_concat_n(struct info *info, int n, struct regexp **r) {
    size_t len = 0;
    char *pat = NULL, *p, *expanded = NULL;
    int nnocase = 0, npresent = 0;

    for (int i=0; i < n; i++)
        if (r[i] != NULL) {
            len += strlen(r[i]->pattern->str) + strlen("()");
            npresent += 1;
            if (r[i]->nocase)
                nnocase += 1;
        }

    bool mixedcase = nnocase > 0 && nnocase < npresent;

    if (len == 0)
        return NULL;

    len += 1;
    if (ALLOC_N(pat, len) < 0)
        return NULL;

    p = pat;
    for (int i=0; i < n; i++) {
        if (r[i] == NULL)
            continue;
        *p++ = '(';
        if (mixedcase && r[i]->nocase) {
            p = append_expanded(r[i], &pat, p, &len);
            ERR_BAIL(r[i]->info);
        } else {
            p = stpcpy(p, r[i]->pattern->str);
        }
        *p++ = ')';
    }
    *p = '\0';
    return make_regexp(info, pat, nnocase == npresent);
 error:
    FREE(expanded);
    FREE(pat);
    return NULL;
}

static struct fa *regexp_to_fa(struct regexp *r) {
    const char *p = r->pattern->str;
    int ret;
    struct fa *fa = NULL;

    ret = fa_compile(p, strlen(p), &fa);
    ERR_NOMEM(ret == REG_ESPACE, r->info);
    BUG_ON(ret != REG_NOERROR, r->info, NULL);

    if (r->nocase) {
        ret = fa_nocase(fa);
        ERR_NOMEM(ret < 0, r->info);
    }
    return fa;

 error:
    fa_free(fa);
    return NULL;
}

struct regexp *
regexp_minus(struct info *info, struct regexp *r1, struct regexp *r2) {
    struct regexp *result = NULL;
    struct fa *fa = NULL, *fa1 = NULL, *fa2 = NULL;
    int r;
    char *s = NULL;
    size_t s_len;

    fa1 = regexp_to_fa(r1);
    ERR_BAIL(r1->info);

    fa2 = regexp_to_fa(r2);
    ERR_BAIL(r2->info);

    fa = fa_minus(fa1, fa2);
    if (fa == NULL)
        goto error;

    r = fa_as_regexp(fa, &s, &s_len);
    if (r < 0)
        goto error;

    if (s == NULL) {
        /* FA is the empty set, which we can't represent as a regexp */
        goto error;
    }

    if (regexp_c_locale(&s, NULL) < 0)
        goto error;

    result = make_regexp(info, s, fa_is_nocase(fa));
    s = NULL;

 done:
    fa_free(fa);
    fa_free(fa1);
    fa_free(fa2);
    free(s);
    return result;
 error:
    unref(result, regexp);
    goto done;
}


struct regexp *
regexp_iter(struct info *info, struct regexp *r, int min, int max) {
    const char *p;
    char *s;
    int ret = 0;

    if (r == NULL)
        return NULL;

    p = r->pattern->str;
    if ((min == 0 || min == 1) && max == -1) {
        char q = (min == 0) ? '*' : '+';
        ret = asprintf(&s, "(%s)%c", p, q);
    } else if (min == max) {
        ret = asprintf(&s, "(%s){%d}", p, min);
    } else {
        ret = asprintf(&s, "(%s){%d,%d}", p, min, max);
    }
    return (ret == -1) ? NULL : make_regexp(info, s, r->nocase);
}

struct regexp *
regexp_maybe(struct info *info, struct regexp *r) {
    const char *p;
    char *s;
    int ret;

    if (r == NULL)
        return NULL;
    p = r->pattern->str;
    ret = asprintf(&s, "(%s)?", p);
    return (ret == -1) ? NULL : make_regexp(info, s, r->nocase);
}

struct regexp *regexp_make_empty(struct info *info) {
    struct regexp *regexp;

    make_ref(regexp);
    if (regexp != NULL) {
        regexp->info = ref(info);
        /* Casting away the CONST for EMPTY_PATTERN is ok since it
           is protected against changes because REF == REF_MAX */
        regexp->pattern = (struct string *) empty_pattern;
        regexp->nocase = 0;
    }
    return regexp;
}

static int regexp_compile_internal(struct regexp *r, const char **c) {
    /* See the GNU regex manual or regex.h in gnulib for
     * an explanation of these flags. They are set so that the regex
     * matcher interprets regular expressions the same way that libfa
     * does
     */
    static const reg_syntax_t syntax =
        RE_CONTEXT_INDEP_OPS|RE_CONTEXT_INVALID_OPS|RE_DOT_NOT_NULL
        |RE_INTERVALS|RE_NO_BK_BRACES|RE_NO_BK_PARENS|RE_NO_BK_REFS
        |RE_NO_BK_VBAR|RE_NO_EMPTY_RANGES
        |RE_NO_POSIX_BACKTRACKING|RE_CONTEXT_INVALID_DUP|RE_NO_GNU_OPS;
    reg_syntax_t old_syntax = re_syntax_options;

    *c = NULL;

    if (r->re == NULL) {
        if (ALLOC(r->re) < 0)
            return -1;
    }

    re_syntax_options = syntax;
    if (r->nocase)
        re_syntax_options |= RE_ICASE;
    *c = re_compile_pattern(r->pattern->str, strlen(r->pattern->str), r->re);
    re_syntax_options = old_syntax;

    r->re->regs_allocated = REGS_REALLOCATE;
    if (*c != NULL)
        return -1;
    return 0;
}

int regexp_compile(struct regexp *r) {
    const char *c;

    return regexp_compile_internal(r, &c);
}

int regexp_check(struct regexp *r, const char **msg) {
    return regexp_compile_internal(r, msg);
}

int regexp_match(struct regexp *r,
                 const char *string, const int size,
                 const int start, struct re_registers *regs) {
    if (r->re == NULL) {
        if (regexp_compile(r) == -1)
            return -3;
    }
    return re_match(r->re, string, size, start, regs);
}

int regexp_matches_empty(struct regexp *r) {
    return regexp_match(r, "", 0, 0, NULL) == 0;
}

int regexp_nsub(struct regexp *r) {
    if (r->re == NULL)
        if (regexp_compile(r) == -1)
            return -1;
    return r->re->re_nsub;
}

void regexp_release(struct regexp *regexp) {
    if (regexp != NULL && regexp->re != NULL) {
        regfree(regexp->re);
        FREE(regexp->re);
    }
}

/*
 * Local variables:
 *  indent-tabs-mode: nil
 *  c-indent-level: 4
 *  c-basic-offset: 4
 *  tab-width: 4
 * End:
 */