Blame src/terminal-regex.h

Packit d370c2
/*
Packit d370c2
 * Copyright © 2015 Egmont Koblinger
Packit d370c2
 *
Packit d370c2
 * This program is free software: you can redistribute it and/or modify
Packit d370c2
 * it under the terms of the GNU General Public License as published by
Packit d370c2
 * the Free Software Foundation, either version 3 of the License, or
Packit d370c2
 * (at your option) any later version.
Packit d370c2
 *
Packit d370c2
 * This program is distributed in the hope that it will be useful,
Packit d370c2
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
Packit d370c2
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
Packit d370c2
 * GNU General Public License for more details.
Packit d370c2
 *
Packit d370c2
 * You should have received a copy of the GNU General Public License
Packit d370c2
 * along with this program.  If not, see <http://www.gnu.org/licenses/>.
Packit d370c2
 */
Packit d370c2
Packit d370c2
/*
Packit d370c2
 * Mini style-guide:
Packit d370c2
 *
Packit d370c2
 * #define'd fragments should preferably have an outermost group, for the
Packit d370c2
 * exact same reason as why usually in C/C++ #define's the values are enclosed
Packit d370c2
 * in parentheses: that is, so that you don't get surprised when you use the
Packit d370c2
 * macro and append a quantifier.
Packit d370c2
 *
Packit d370c2
 * For repeated fragments prefer regex-style (?(DEFINE)(?<NAME>(...))) and use
Packit d370c2
 * as (?&NAME), so that the regex string and the compiled regex object is
Packit d370c2
 * smaller.
Packit d370c2
 *
Packit d370c2
 * Build small blocks, comment and unittest them heavily.
Packit d370c2
 *
Packit d370c2
 * Use free-spacing mode for improved readability. The hardest to read is
Packit d370c2
 * which additional characters belong to a "(?" prefix. To improve
Packit d370c2
 * readability, place a space after this, and for symmetry, before the closing
Packit d370c2
 * parenthesis. Also place a space around "|" characters. No space before
Packit d370c2
 * quantifiers. Try to be consistent with the existing style (yes I know the
Packit d370c2
 * existing style is not consistent either, but please do your best).
Packit d370c2
 *
Packit d370c2
 * See http://www.rexegg.com/regex-disambiguation.html for all the "(?"
Packit d370c2
 * syntaxes.
Packit d370c2
 */
Packit d370c2
Packit d370c2
#ifndef TERMINAL_REGEX_H
Packit d370c2
#define TERMINAL_REGEX_H
Packit d370c2
Packit d370c2
/* Lookbehind to see if there's a preceding apostrophe */
Packit d370c2
/* APOS_START is an optional named group wrapping a zero-width lookbehind, so it
 * consumes no input; the group takes part in the match only when the previous
 * character is an apostrophe. PATH_DEF tests it with (?(<APOS_START>)...). */
#define APOS_START_DEF "(?<APOS_START>(?<='))?"
Packit d370c2
Packit d370c2
/* URL scheme names: news, telnet, nntp, http(s), ftp(s), sftp or webcal.
 * Case-insensitive; free-spacing mode, so the blanks are not matched. */
#define SCHEME "(?ix: news | telnet | nntp | https? | ftps? | sftp | webcal )"
Packit d370c2
Packit d370c2
/* Characters allowed in a username, written as the inside of a bracket
 * expression (no enclosing brackets): dash, plus, dot and alphanumerics.
 * The dash comes first so that it is taken literally. */
#define USERCHARS "-+.[:alnum:]"
Packit d370c2
/* Nonempty username, e.g. "john.smith" */
Packit d370c2
/* One or more USERCHARS. Not wrapped in a group, so do not append a
 * quantifier directly after this fragment. */
#define USER "[" USERCHARS "]+"
Packit d370c2
Packit d370c2
/* Bracket expression matching one password character: dash, alphanumerics,
 * and the punctuation listed between \Q and \E (quoted so it is literal). */
#define PASSCHARS_CLASS "[-[:alnum:]\\Q,?;.:/!%$^*&~\"#'\\E]"
Packit d370c2
/* Optional colon-prefixed password. I guess empty password should be allowed, right? E.g. ":secret", ":", "" */
Packit d370c2
/* Free-spacing group: a literal colon followed by zero or more
 * PASSCHARS_CLASS characters; the trailing "?" makes the group optional. */
#define PASS "(?x: :" PASSCHARS_CLASS "* )?"
Packit d370c2
Packit d370c2
/* Optional at-terminated username (with perhaps a password too), e.g. "joe@", "pete:secret@", "" */
Packit d370c2
/* Non-capturing group: USER, then PASS, then a literal "@"; the trailing "?"
 * makes all of it optional. */
#define USERPASS "(?:" USER PASS "@)?"
Packit d370c2
Packit d370c2
/* S4: IPv4 segment (number between 0 and 255) with lookahead at the end so that we don't match "25" in the string "256".
Packit d370c2
   The lookahead could go to the last segment of IPv4 only but this construct allows nicer unittesting. */
Packit d370c2
/* Declares the S4 subpattern inside a (?(DEFINE)...) group; the fragment is
 * a definition only and the subpattern is invoked elsewhere as (?&S4). */
#define S4_DEF "(?(DEFINE)(?<S4>(?x: (?: [0-9] | [1-9][0-9] | 1[0-9]{2} | 2[0-4][0-9] | 25[0-5] ) (?! [0-9] ) )))"
Packit d370c2
Packit d370c2
/* IPV4: Decimal IPv4, e.g. "1.2.3.4", with lookahead (implemented in S4) at the end so that we don't match "192.168.1.123" in the string "192.168.1.1234". */
Packit d370c2
/* Starts with S4_DEF, so this one fragment declares both S4 and IPV4.
 * IPV4 is three "S4 dot" pairs followed by a final S4. */
#define IPV4_DEF S4_DEF "(?(DEFINE)(?<IPV4>(?x: (?: (?&S4) \\. ){3} (?&S4) )))"
Packit d370c2
Packit d370c2
/* IPv6, including embedded IPv4, e.g. "::1", "dead:beef::1.2.3.4".
Packit d370c2
 * Lookahead for the next char not being a dot, colon or hex digit, so it doesn't get stuck matching "dead:beef::1" in "dead:beef::1.2.3.4".
Packit d370c2
 * This is not required since the surrounding brackets would trigger backtracking, but it allows nicer unittesting.
Packit d370c2
 * TODO: more strict check (right number of colons, etc.)
Packit d370c2
 * TODO: add zone_id: RFC 4007 section 11, RFC 6874 */
Packit d370c2
Packit d370c2
/* S6: IPv6 segment, S6C: IPv6 segment followed by a colon, CS6: colon followed by an IPv6 segment */
Packit d370c2
/* One DEFINE group declaring three subpatterns: S6 is 1 to 4 hex digits,
 * CS6 is ":" followed by S6, and S6C is S6 followed by ":". */
#define S6_DEF "(?(DEFINE)(?<S6>[[:xdigit:]]{1,4})(?<CS6>:(?&S6))(?<S6C>(?&S6):))"
Packit d370c2
Packit d370c2
/* No :: shorthand */
Packit d370c2
#define IPV6_FULL  "(?x: (?&S6C){7} (?&S6) )"
Packit d370c2
/* Begins with :: */
Packit d370c2
/* A literal ":" and then 1 to 7 "colon segment" items, so the text starts
 * with two colons. */
#define IPV6_LEFT  "(?x: : (?&CS6){1,7} )"
Packit d370c2
/* :: somewhere in the middle - use negative lookahead to make sure there aren't too many colons in total */
Packit d370c2
/* The lookahead rejects input that begins with eight consecutive runs of
 * "zero or more hex digits, then a colon". After it come 1 to 6
 * "segment colon" items and 1 to 6 "colon segment" items, which meet at
 * the double colon. */
#define IPV6_MID   "(?x: (?! (?: [[:xdigit:]]*: ){8} ) (?&S6C){1,6} (?&CS6){1,6} )"
Packit d370c2
/* Ends with :: */
Packit d370c2
#define IPV6_RIGHT "(?x: (?&S6C){1,7} : )"
Packit d370c2
/* Is "::" and nothing more */
Packit d370c2
/* Exactly two colons. */
#define IPV6_NULL  "(?x: :: )"
Packit d370c2
Packit d370c2
/* The same ones for IPv4-embedded notation, without the actual IPv4 part */
Packit d370c2
/* Six "segment colon" items. The IPv4 part itself is appended where this
 * fragment is used (IP_DEF follows the IPV6V4_* group with (?&IPV4)). */
#define IPV6V4_FULL  "(?x: (?&S6C){6} )"
Packit d370c2
#define IPV6V4_LEFT  "(?x: :: (?&S6C){0,5} )"  /* includes "::<ipv4>" */
Packit d370c2
/* Double colon somewhere in the middle of the IPv6 part. The negative
 * lookahead rejects input that begins with seven consecutive runs of
 * "zero or more hex digits, then a colon". The final ":" is the separator
 * in front of the embedded IPv4; it is kept inside the outermost group so
 * that the fragment stays a single unit when it is combined with other
 * fragments or followed by a quantifier. */
#define IPV6V4_MID   "(?x: (?! (?: [[:xdigit:]]*: ){7} ) (?&S6C){1,4} (?&CS6){1,4} : )"
Packit d370c2
#define IPV6V4_RIGHT "(?x: (?&S6C){1,5} : )"
Packit d370c2
Packit d370c2
/* IPV6: An IPv6 address (possibly with an embedded IPv4).
Packit d370c2
 * This macro defines both IPV4 and IPV6, since the latter one requires the former. */
Packit d370c2
/* Starts with IPV4_DEF and S6_DEF, then declares IPV6. The alternation
 * lists the five pure-IPv6 forms first, then one of the four IPV6V4_*
 * prefixes followed by (?&IPV4). The closing negative lookahead refuses a
 * match that is directly followed by ".", ":" or a hex digit. */
#define IP_DEF IPV4_DEF S6_DEF "(?(DEFINE)(?<IPV6>(?x: (?: " IPV6_NULL " | " IPV6_LEFT " | " IPV6_MID " | " IPV6_RIGHT " | " IPV6_FULL " | (?: " IPV6V4_FULL " | " IPV6V4_LEFT " | " IPV6V4_MID " | " IPV6V4_RIGHT " ) (?&IPV4) ) (?! [.:[:xdigit:]] ) )))"
Packit d370c2
Packit d370c2
/* Either an alphanumeric character or dash; or if [negative lookahead] not ASCII
Packit d370c2
 * then any graphical Unicode character.
Packit d370c2
 * A segment can consist entirely of numbers.
Packit d370c2
 * (Note: PCRE doesn't support character class subtraction/intersection.) */
Packit d370c2
/* Despite the _CLASS suffix this is a group with two alternatives rather
 * than a single bracket expression; either alternative consumes exactly
 * one character. */
#define HOSTNAMESEGMENTCHARS_CLASS "(?x: [-[:alnum:]] | (?! [[:ascii:]] ) [[:graph:]] )"
Packit d370c2
Packit d370c2
/* A hostname of at least 1 component. The last component cannot be entirely numbers.
Packit d370c2
 * E.g. "foo", "example.com", "1234.com", but not "foo.123" */
Packit d370c2
/* Layout: zero or more "segment dot" prefixes, then the last segment. The
 * (?! [0-9] ) lookahead sits inside the last segment, which forces at
 * least one of its characters to be a non-digit. */
#define HOSTNAME1 "(?x: (?: " HOSTNAMESEGMENTCHARS_CLASS "+ \\. )* " HOSTNAMESEGMENTCHARS_CLASS "* (?! [0-9] ) " HOSTNAMESEGMENTCHARS_CLASS "+ )"
Packit d370c2
Packit d370c2
/* A hostname of at least 2 components. The last component cannot be entirely numbers.
Packit d370c2
 * E.g. "example.com", "1234.com", but not "1234.56" */
Packit d370c2
/* One or more "segment dot" prefixes followed by HOSTNAME1, which
 * guarantees at least one dot in the match. */
#define HOSTNAME2 "(?x: (?: " HOSTNAMESEGMENTCHARS_CLASS "+ \\.)+ " HOSTNAME1 " )"
Packit d370c2
Packit d370c2
/* For URL: Hostname, IPv4, or bracket-enclosed IPv6, e.g. "example.com", "1.2.3.4", "[::1]" */
Packit d370c2
/* Three alternatives. (?&IPV4) and (?&IPV6) are subpattern calls, so the
 * complete regex must also contain IP_DEF (DEFS includes it). */
#define URL_HOST "(?x: " HOSTNAME1 " | (?&IPV4) | \\[ (?&IPV6) \\] )"
Packit d370c2
Packit d370c2
/* For e-mail: Hostname of at least two segments, or bracket-enclosed IPv4 or IPv6, e.g. "example.com", "[1.2.3.4]", "[::1]".
Packit d370c2
 * Technically an e-mail with a single-component hostname might be valid on a local network, but let's avoid tons of false positives (e.g. in a typical shell prompt). */
Packit d370c2
/* Two alternatives: HOSTNAME2, or a bracketed (?&IPV4) / (?&IPV6). The
 * subpattern calls require IP_DEF in the complete regex (DEFS includes it). */
#define EMAIL_HOST "(?x: " HOSTNAME2 " | \\[ (?: (?&IPV4) | (?&IPV6) ) \\] )"
Packit d370c2
Packit d370c2
/* Number between 1 and 65535, with lookahead at the end so that we don't match "6789" in the string "67890",
Packit d370c2
   and in turn we don't eventually match "http://host:6789" in "http://host:67890". */
Packit d370c2
/* Alternatives by magnitude: 1-9999, 10000-59999, 60000-64999, 65000-65499,
 * 65500-65529 and 65530-65535. Every alternative starts with a nonzero
 * digit, so leading zeros are not accepted. */
#define N_1_65535 "(?x: (?: [1-9][0-9]{0,3} | [1-5][0-9]{4} | 6[0-4][0-9]{3} | 65[0-4][0-9]{2} | 655[0-2][0-9] | 6553[0-5] ) (?! [0-9] ) )"
Packit d370c2
Packit d370c2
/* Optional colon-prefixed port, e.g. ":1080", "" */
Packit d370c2
/* A literal colon (written escaped) followed by N_1_65535; the trailing "?"
 * makes the whole group optional. */
#define PORT "(?x: \\:" N_1_65535 " )?"
Packit d370c2
Packit d370c2
/* Omit the parentheses, see below */
Packit d370c2
/* Bracket expression for one character inside a URL path. Parentheses and
 * square brackets are absent here; PATH_INNER_DEF and PATH_DEF match them
 * separately, only as balanced pairs. */
#define PATHCHARS_CLASS "[-[:alnum:]\\Q_$.+!*,:;@&=?/~#|%'\\E]"
Packit d370c2
/* Chars to end a URL. Apostrophe only allowed if there wasn't one in front of the URL, see bug 448044 */
Packit d370c2
/* Same set as PATHCHARS_CLASS minus dot, exclamation mark, comma, semicolon
 * and question mark. */
#define PATHTERM_CLASS        "[-[:alnum:]\\Q_$+*:@&=/~#|%'\\E]"
Packit d370c2
/* PATHTERM_CLASS without the apostrophe. */
#define PATHTERM_NOAPOS_CLASS "[-[:alnum:]\\Q_$+*:@&=/~#|%\\E]"
Packit d370c2
Packit d370c2
/* Recursive definition of PATH that allows parentheses and square brackets only if balanced, see bug 763980. */
Packit d370c2
/* Shape: zero or more repetitions of "path characters, then a (...) or
 * [...] pair whose content recurses into PATH_INNER", followed by trailing
 * path characters. Every part is optional, so the empty string matches. */
#define PATH_INNER_DEF "(?(DEFINE)(?<PATH_INNER>(?x: (?: " PATHCHARS_CLASS "* (?: \\( (?&PATH_INNER) \\) | \\[ (?&PATH_INNER) \\] ) )* " PATHCHARS_CLASS "* )))"
Packit d370c2
/* Same as above, but the last character (if exists and is not a parenthesis) must be from PATHTERM_CLASS. */
Packit d370c2
/* The conditional (?(<APOS_START>)A|B) selects PATHTERM_NOAPOS_CLASS when
 * the APOS_START group took part in the match and PATHTERM_CLASS otherwise.
 * Needs PATH_INNER_DEF and APOS_START_DEF in the same regex. */
#define PATH_DEF "(?(DEFINE)(?<PATH>(?x: (?: " PATHCHARS_CLASS "* (?: \\( (?&PATH_INNER) \\) | \\[ (?&PATH_INNER) \\] ) )* (?: " PATHCHARS_CLASS "* (?(<APOS_START>)" PATHTERM_NOAPOS_CLASS "|" PATHTERM_CLASS ") )? )))"
Packit d370c2
Packit d370c2
/* Optional path: a slash followed by (?&PATH). */
#define URLPATH "(?x: /(?&PATH) )?"
Packit d370c2
/* Optional trailing part used by REGEX_URL_VOIP: ";" or "?" followed by
 * (?&PATH). */
#define VOIP_PATH "(?x: [;?](?&PATH) )?"
Packit d370c2
Packit d370c2
/* Now let's put these fragments together */
Packit d370c2
Packit d370c2
/* Every definition fragment in one string: the APOS_START group, the IP
 * subpatterns (S4, IPV4, S6, CS6, S6C, IPV6) and the two PATH subpatterns. */
#define DEFS APOS_START_DEF IP_DEF PATH_INNER_DEF PATH_DEF
Packit d370c2
Packit d370c2
/* URL with an explicit scheme:
 * scheme "://" [user[:pass]@] host [:port] [/path] */
#define REGEX_URL_AS_IS  DEFS SCHEME "://" USERPASS URL_HOST PORT URLPATH
Packit d370c2
/* TODO: also support file:/etc/passwd */
Packit d370c2
/* "file:/", then optionally "/" [hostname] "/", not followed by yet another
 * slash, then (?&PATH). So "file:/path", "file:///path" and
 * "file://host/path" all fit this shape. */
#define REGEX_URL_FILE   DEFS "(?ix: file:/ (?: / (?: " HOSTNAME1 " )? / )? (?! / ) )(?&PATH)"
Packit d370c2
/* Lookbehind so that we don't catch "abc.www.foo.bar", bug 739757. Lookahead for www/ftp for convenience (so that we can reuse HOSTNAME1). */
Packit d370c2
/* Scheme-less URL starting with "www" or "ftp" (case-insensitive).
 *  - Negative lookbehind: the match must not be preceded by a hostname
 *    character or a dot.
 *  - Positive lookahead: the text must begin with www or ftp; it consumes
 *    nothing, so HOSTNAME1 matches the whole hostname afterwards.
 * NOTE(review): the original line was cut off right after "(?"; this
 * definition is reconstructed from the accompanying description and the
 * shape of the other REGEX_URL_* macros - confirm against the pristine
 * file. */
#define REGEX_URL_HTTP   DEFS "(?<!(?:" HOSTNAMESEGMENTCHARS_CLASS "|[.]))(?=(?i:www|ftp))" HOSTNAME1 PORT URLPATH
Packit d370c2
/* VoIP URL: "h323:", "sip:" or "sips:" (case-insensitive), then
 * [user[:pass]@] host [:port] and an optional VOIP_PATH. */
#define REGEX_URL_VOIP   DEFS "(?i:h323:|sips?:)" USERPASS URL_HOST PORT VOIP_PATH
Packit d370c2
/* E-mail address with an optional case-insensitive "mailto:" prefix:
 * USER "@" EMAIL_HOST. */
#define REGEX_EMAIL      DEFS "(?i:mailto:)?" USER "@" EMAIL_HOST
Packit d370c2
/* "news:", "man:" or "info:" (case-insensitive) followed by one or more
 * characters from the listed set. Self-contained: it uses no (?&...) calls
 * and therefore is not prefixed with DEFS. */
#define REGEX_NEWS_MAN   "(?i:news:|man:|info:)[-[:alnum:]\\Q^_{|}~!\"#$%&'()*+,./;:=?`\\E]+"
Packit d370c2
Packit d370c2
#endif /* !TERMINAL_REGEX_H */