Blame src/terminal-regex.h

Packit Service 3bdf47
/*
Packit Service 3bdf47
 * Copyright © 2015 Egmont Koblinger
Packit Service 3bdf47
 *
Packit Service 3bdf47
 * This program is free software: you can redistribute it and/or modify
Packit Service 3bdf47
 * it under the terms of the GNU General Public License as published by
Packit Service 3bdf47
 * the Free Software Foundation, either version 3 of the License, or
Packit Service 3bdf47
 * (at your option) any later version.
Packit Service 3bdf47
 *
Packit Service 3bdf47
 * This program is distributed in the hope that it will be useful,
Packit Service 3bdf47
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
Packit Service 3bdf47
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
Packit Service 3bdf47
 * GNU General Public License for more details.
Packit Service 3bdf47
 *
Packit Service 3bdf47
 * You should have received a copy of the GNU General Public License
Packit Service 3bdf47
 * along with this program.  If not, see <http://www.gnu.org/licenses/>.
Packit Service 3bdf47
 */
Packit Service 3bdf47
Packit Service 3bdf47
/*
Packit Service 3bdf47
 * Mini style-guide:
Packit Service 3bdf47
 *
Packit Service 3bdf47
 * #define'd fragments should preferably have an outermost group, for the
Packit Service 3bdf47
 * exact same reason as why usually in C/C++ #define's the values are enclosed
Packit Service 3bdf47
 * in parentheses: that is, so that you don't get surprised when you use the
Packit Service 3bdf47
 * macro and append a quantifier.
Packit Service 3bdf47
 *
Packit Service 3bdf47
 * For repeated fragments prefer regex-style (?(DEFINE)(?<NAME>(...))) and use
Packit Service 3bdf47
 * as (?&NAME), so that the regex string and the compiled regex object are
Packit Service 3bdf47
 * smaller.
Packit Service 3bdf47
 *
Packit Service 3bdf47
 * Build small blocks, comment and unittest them heavily.
Packit Service 3bdf47
 *
Packit Service 3bdf47
 * Use free-spacing mode for improved readability. The hardest to read is
Packit Service 3bdf47
 * which additional characters belong to a "(?" prefix. To improve
Packit Service 3bdf47
 * readability, place a space after this, and for symmetry, before the closing
Packit Service 3bdf47
 * parenthesis. Also place a space around "|" characters. No space before
Packit Service 3bdf47
 * quantifiers. Try to be consistent with the existing style (yes I know the
Packit Service 3bdf47
 * existing style is not consistent either, but please do your best).
Packit Service 3bdf47
 *
Packit Service 3bdf47
 * See http://www.rexegg.com/regex-disambiguation.html for all the "(?"
Packit Service 3bdf47
 * syntaxes.
Packit Service 3bdf47
 */
Packit Service 3bdf47
Packit Service 3bdf47
#ifndef TERMINAL_REGEX_H
Packit Service 3bdf47
#define TERMINAL_REGEX_H
Packit Service 3bdf47
Packit Service 3bdf47
/* Lookbehind to see if there's a preceding apostrophe */
Packit Service 3bdf47
/* The named group APOS_START is optional: it participates in the match only
 * when the lookbehind finds an apostrophe. PATH_DEF tests it with
 * (?(<APOS_START>)...) to choose which terminator class to use. */
#define APOS_START_DEF "(?<APOS_START>(?<='))?"
Packit Service 3bdf47
Packit Service 3bdf47
/* URL scheme, matched case-insensitively (i): news, telnet, nntp, http(s),
 * ftp(s), sftp or webcal. Free-spacing mode (x) makes the blanks around "|"
 * insignificant. */
#define SCHEME "(?ix: news | telnet | nntp | https? | ftps? | sftp | webcal )"
Packit Service 3bdf47
Packit Service 3bdf47
/* Characters allowed in a username: dash, plus, dot and alphanumerics.
 * Has no surrounding brackets; USER wraps it into a character class. */
#define USERCHARS "-+.[:alnum:]"
Packit Service 3bdf47
/* Nonempty username, e.g. "john.smith" */
Packit Service 3bdf47
/* One or more characters from USERCHARS, bracketed into a character class here. */
#define USER "[" USERCHARS "]+"
Packit Service 3bdf47
Packit Service 3bdf47
/* Character class for a single password character: dash, alphanumerics and
 * the punctuation quoted literally between \Q and \E. */
#define PASSCHARS_CLASS "[-[:alnum:]\\Q,?;.:/!%$^*&~\"#'\\E]"
Packit Service 3bdf47
/* Optional colon-prefixed password. I guess empty password should be allowed, right? E.g. ":secret", ":", "" */
Packit Service 3bdf47
/* A colon followed by zero or more PASSCHARS_CLASS characters. The "*" is what
 * permits the empty password; the trailing "?" makes the whole group optional. */
#define PASS "(?x: :" PASSCHARS_CLASS "* )?"
Packit Service 3bdf47
Packit Service 3bdf47
/* Optional at-terminated username (with perhaps a password too), e.g. "joe@", "pete:secret@", "" */
Packit Service 3bdf47
/* USER, then the optional PASS, then "@". The "@" is inside the optional group,
 * so it is only consumed together with a username. */
#define USERPASS "(?:" USER PASS "@)?"
Packit Service 3bdf47
Packit Service 3bdf47
/* S4: IPv4 segment (number between 0 and 255) with lookahead at the end so that we don't match "25" in the string "256".
Packit Service 3bdf47
   The lookahead could go to the last segment of IPv4 only but this construct allows nicer unittesting. */
Packit Service 3bdf47
/* Wrapped in (?(DEFINE)...), so this fragment matches nothing by itself; it
 * only declares the named group, which is then referenced as (?&S4). */
#define S4_DEF "(?(DEFINE)(?<S4>(?x: (?: [0-9] | [1-9][0-9] | 1[0-9]{2} | 2[0-4][0-9] | 25[0-5] ) (?! [0-9] ) )))"
Packit Service 3bdf47
Packit Service 3bdf47
/* IPV4: Decimal IPv4, e.g. "1.2.3.4", with lookahead (implemented in S4) at the end so that we don't match "192.168.1.123" in the string "192.168.1.1234". */
Packit Service 3bdf47
/* Expands to S4_DEF followed by the IPV4 definition, so the (?&S4) references
 * are always resolvable. Three dot-terminated S4 segments plus a final S4;
 * reference as (?&IPV4). */
#define IPV4_DEF S4_DEF "(?(DEFINE)(?<IPV4>(?x: (?: (?&S4) \\. ){3} (?&S4) )))"
Packit Service 3bdf47
Packit Service 3bdf47
/* IPv6, including embedded IPv4, e.g. "::1", "dead:beef::1.2.3.4".
Packit Service 3bdf47
 * Lookahead for the next char not being a dot or digit, so it doesn't get stuck matching "dead:beef::1" in "dead:beef::1.2.3.4".
Packit Service 3bdf47
 * This is not required since the surrounding brackets would trigger backtracking, but it allows nicer unittesting.
Packit Service 3bdf47
 * TODO: more strict check (right number of colons, etc.)
Packit Service 3bdf47
 * TODO: add zone_id: RFC 4007 section 11, RFC 6874 */
Packit Service 3bdf47
Packit Service 3bdf47
/* S6: IPv6 segment (1 to 4 hex digits), S6C: IPv6 segment followed by a colon, CS6: colon followed by an IPv6 segment */
Packit Service 3bdf47
#define S6_DEF "(?(DEFINE)(?<S6>[[:xdigit:]]{1,4})(?<CS6>:(?&S6))(?<S6C>(?&S6):))"
Packit Service 3bdf47
Packit Service 3bdf47
/* No :: shorthand */
Packit Service 3bdf47
/* Seven colon-terminated segments followed by a final segment (eight in total). */
#define IPV6_FULL  "(?x: (?&S6C){7} (?&S6) )"
Packit Service 3bdf47
/* Begins with :: */
Packit Service 3bdf47
/* A leading colon followed by one to seven colon-prefixed segments; the first
 * CS6 supplies the second colon of the "::". */
#define IPV6_LEFT  "(?x: : (?&CS6){1,7} )"
Packit Service 3bdf47
/* :: somewhere in the middle - use negative lookahead to make sure there aren't too many colons in total */
Packit Service 3bdf47
/* One to six colon-terminated segments followed by one to six colon-prefixed
 * segments; the "::" arises where the two runs meet. The leading negative
 * lookahead rejects input that has eight consecutive "hex digits, colon" runs. */
#define IPV6_MID   "(?x: (?! (?: [[:xdigit:]]*: ){8} ) (?&S6C){1,6} (?&CS6){1,6} )"
Packit Service 3bdf47
/* Ends with :: */
Packit Service 3bdf47
/* One to seven colon-terminated segments followed by one more colon. */
#define IPV6_RIGHT "(?x: (?&S6C){1,7} : )"
Packit Service 3bdf47
/* Is "::" and nothing more */
Packit Service 3bdf47
/* Exactly two colons. */
#define IPV6_NULL  "(?x: :: )"
Packit Service 3bdf47
Packit Service 3bdf47
/* The same ones for IPv4-embedded notation, without the actual IPv4 part */
Packit Service 3bdf47
/* Six colon-terminated segments; IP_DEF appends (?&IPV4) after this prefix. */
#define IPV6V4_FULL  "(?x: (?&S6C){6} )"
Packit Service 3bdf47
#define IPV6V4_LEFT  "(?x: :: (?&S6C){0,5} )"  /* includes "::<ipv4>" */
Packit Service 3bdf47
/* One to four colon-terminated segments, one to four colon-prefixed segments,
 * then a colon before the IPv4 part.
 * NOTE: the trailing " :" sits outside this fragment's own (?x: ) group; the
 * blank is only ignored because IP_DEF embeds the fragment inside another
 * free-spacing group. */
#define IPV6V4_MID   "(?x: (?! (?: [[:xdigit:]]*: ){7} ) (?&S6C){1,4} (?&CS6){1,4} ) :"
Packit Service 3bdf47
/* One to five colon-terminated segments followed by one more colon. */
#define IPV6V4_RIGHT "(?x: (?&S6C){1,5} : )"
Packit Service 3bdf47
Packit Service 3bdf47
/* IPV6: An IPv6 address (possibly with an embedded IPv4).
Packit Service 3bdf47
 * This macro defines both IPV4 and IPV6, since the latter one requires the former. */
Packit Service 3bdf47
/* Expands to IPV4_DEF, S6_DEF and the IPV6 definition. The IPV6 alternatives
 * are tried in the order NULL, LEFT, MID, RIGHT, FULL, and finally one of the
 * IPV6V4_* prefixes followed by (?&IPV4). The closing negative lookahead
 * rejects a match that is directly followed by a dot, colon or hex digit. */
#define IP_DEF IPV4_DEF S6_DEF "(?(DEFINE)(?<IPV6>(?x: (?: " IPV6_NULL " | " IPV6_LEFT " | " IPV6_MID " | " IPV6_RIGHT " | " IPV6_FULL " | (?: " IPV6V4_FULL " | " IPV6V4_LEFT " | " IPV6V4_MID " | " IPV6V4_RIGHT " ) (?&IPV4) ) (?! [.:[:xdigit:]] ) )))"
Packit Service 3bdf47
Packit Service 3bdf47
/* Either an alphanumeric character or dash; or if [negative lookahead] not ASCII
Packit Service 3bdf47
 * then any graphical Unicode character.
Packit Service 3bdf47
 * A segment can consist entirely of numbers.
Packit Service 3bdf47
 * (Note: PCRE doesn't support character class subtraction/intersection.) */
Packit Service 3bdf47
/* Despite the _CLASS suffix this is a non-capturing group with two
 * alternatives, not a bracket expression, so it cannot be nested inside
 * "[...]". It matches exactly one character. */
#define HOSTNAMESEGMENTCHARS_CLASS "(?x: [-[:alnum:]] | (?! [[:ascii:]] ) [[:graph:]] )"
Packit Service 3bdf47
Packit Service 3bdf47
/* A hostname of at least 1 component. The last component cannot be entirely numbers.
Packit Service 3bdf47
 * E.g. "foo", "example.com", "1234.com", but not "foo.123" */
Packit Service 3bdf47
/* Zero or more dot-terminated segments, then the last segment. Within the last
 * segment the (?! [0-9] ) lookahead requires a position whose next character
 * is not a digit, which is how all-numeric last segments are excluded. */
#define HOSTNAME1 "(?x: (?: " HOSTNAMESEGMENTCHARS_CLASS "+ \\. )* " HOSTNAMESEGMENTCHARS_CLASS "* (?! [0-9] ) " HOSTNAMESEGMENTCHARS_CLASS "+ )"
Packit Service 3bdf47
Packit Service 3bdf47
/* A hostname of at least 2 components. The last component cannot be entirely numbers.
Packit Service 3bdf47
 * E.g. "example.com", "1234.com", but not "1234.56" */
Packit Service 3bdf47
/* One or more dot-terminated segments followed by HOSTNAME1. */
#define HOSTNAME2 "(?x: (?: " HOSTNAMESEGMENTCHARS_CLASS "+ \\.)+ " HOSTNAME1 " )"
Packit Service 3bdf47
Packit Service 3bdf47
/* For URL: Hostname, IPv4, or bracket-enclosed IPv6, e.g. "example.com", "1.2.3.4", "[::1]" */
Packit Service 3bdf47
/* References the IPV4 and IPV6 named groups, so IP_DEF must be part of the
 * final regex (DEFS includes it). */
#define URL_HOST "(?x: " HOSTNAME1 " | (?&IPV4) | \\[ (?&IPV6) \\] )"
Packit Service 3bdf47
Packit Service 3bdf47
/* For e-mail: Hostname of at least two segments, or bracket-enclosed IPv4 or IPv6, e.g. "example.com", "[1.2.3.4]", "[::1]".
Packit Service 3bdf47
 * Technically an e-mail with a single-component hostname might be valid on a local network, but let's avoid tons of false positives (e.g. in a typical shell prompt). */
Packit Service 3bdf47
/* References the IPV4 and IPV6 named groups, so IP_DEF must be part of the
 * final regex (DEFS includes it). */
#define EMAIL_HOST "(?x: " HOSTNAME2 " | \\[ (?: (?&IPV4) | (?&IPV6) ) \\] )"
Packit Service 3bdf47
Packit Service 3bdf47
/* Number between 1 and 65535, with lookahead at the end so that we don't match "6789" in the string "67890",
Packit Service 3bdf47
   and in turn we don't eventually match "http://host:6789" in "http://host:67890". */
Packit Service 3bdf47
/* Alternatives cover, in order: 1-9999, 10000-59999, 60000-64999, 65000-65499,
 * 65500-65529 and 65530-65535. */
#define N_1_65535 "(?x: (?: [1-9][0-9]{0,3} | [1-5][0-9]{4} | 6[0-4][0-9]{3} | 65[0-4][0-9]{2} | 655[0-2][0-9] | 6553[0-5] ) (?! [0-9] ) )"
Packit Service 3bdf47
Packit Service 3bdf47
/* Optional colon-prefixed port, e.g. ":1080", "" */
Packit Service 3bdf47
/* An escaped colon followed by N_1_65535; the trailing "?" makes the whole
 * group optional. */
#define PORT "(?x: \\:" N_1_65535 " )?"
Packit Service 3bdf47
Packit Service 3bdf47
/* Omit the parentheses, see below */
Packit Service 3bdf47
/* Characters allowed inside a path: dash, alphanumerics and the punctuation
 * quoted between \Q and \E. Parentheses and square brackets are not in the
 * class; PATH_INNER_DEF and PATH_DEF match them separately so that they can
 * require balancing. */
#define PATHCHARS_CLASS "[-[:alnum:]\\Q_$.+!*,:;@&=?/~#|%'\\E]"
Packit Service 3bdf47
/* Chars to end a URL. Apostrophe only allowed if there wasn't one in front of the URL, see bug 448044 */
Packit Service 3bdf47
/* PATHCHARS_CLASS without the dot, exclamation mark, comma, semicolon and
 * question mark. */
#define PATHTERM_CLASS        "[-[:alnum:]\\Q_$+*:@&=/~#|%'\\E]"
Packit Service 3bdf47
/* PATHTERM_CLASS without the apostrophe. */
#define PATHTERM_NOAPOS_CLASS "[-[:alnum:]\\Q_$+*:@&=/~#|%\\E]"
Packit Service 3bdf47
Packit Service 3bdf47
/* Recursive definition of PATH that allows parentheses and square brackets only if balanced, see bug 763980. */
Packit Service 3bdf47
/* Structure: any number of "path characters followed by a parenthesised or
 * bracketed PATH_INNER", then trailing path characters. Only declares the
 * group; reference it as (?&PATH_INNER). */
#define PATH_INNER_DEF "(?(DEFINE)(?<PATH_INNER>(?x: (?: " PATHCHARS_CLASS "* (?: \\( (?&PATH_INNER) \\) | \\[ (?&PATH_INNER) \\] ) )* " PATHCHARS_CLASS "* )))"
Packit Service 3bdf47
/* Same as above, but the last character (if exists and is not a parenthesis) must be from PATHTERM_CLASS. */
Packit Service 3bdf47
/* The conditional (?(<APOS_START>)...) selects PATHTERM_NOAPOS_CLASS when the
 * APOS_START group took part in the match and PATHTERM_CLASS otherwise.
 * Needs APOS_START_DEF and PATH_INNER_DEF in the same regex; reference as
 * (?&PATH). */
#define PATH_DEF "(?(DEFINE)(?<PATH>(?x: (?: " PATHCHARS_CLASS "* (?: \\( (?&PATH_INNER) \\) | \\[ (?&PATH_INNER) \\] ) )* (?: " PATHCHARS_CLASS "* (?(<APOS_START>)" PATHTERM_NOAPOS_CLASS "|" PATHTERM_CLASS ") )? )))"
Packit Service 3bdf47
Packit Service 3bdf47
/* Optional path: a slash followed by (?&PATH). */
#define URLPATH "(?x: /(?&PATH) )?"
Packit Service 3bdf47
/* Optional tail: a semicolon or question mark followed by (?&PATH). */
#define VOIP_PATH "(?x: [;?](?&PATH) )?"
Packit Service 3bdf47
Packit Service 3bdf47
/* Now let's put these fragments together */
Packit Service 3bdf47
Packit Service 3bdf47
/* All shared definitions: the optional APOS_START group first, then the IP and
 * PATH (?(DEFINE)...) groups. Prefixed to the REGEX_* macros that reference
 * these named groups. */
#define DEFS APOS_START_DEF IP_DEF PATH_INNER_DEF PATH_DEF
Packit Service 3bdf47
Packit Service 3bdf47
/* URL with an explicit scheme: SCHEME, "://", optional credentials, host,
 * optional port, optional path. */
#define REGEX_URL_AS_IS  DEFS SCHEME "://" USERPASS URL_HOST PORT URLPATH
Packit Service 3bdf47
/* TODO: also support file:/etc/passwd */
Packit Service 3bdf47
/* "file:/" (case-insensitive), optionally followed by "/", an optional
 * HOSTNAME1 and another "/"; the lookahead forbids a further slash; then
 * (?&PATH). */
#define REGEX_URL_FILE   DEFS "(?ix: file:/ (?: / (?: " HOSTNAME1 " )? / )? (?! / ) )(?&PATH)"
Packit Service 3bdf47
/* Lookbehind so that we don't catch "abc.www.foo.bar", bug 739757. Lookahead for www/ftp for convenience (so that we can reuse HOSTNAME1). */
Packit Service 3bdf47
/* Scheme-less URL starting with "www" or "ftp" (case-insensitive): must not be
 * preceded by a hostname character or a dot, then a hostname of at least two
 * components, optional port and optional path.
 * NOTE(review): the original line was cut off after "(?", leaving an
 * unterminated string literal. The remainder is reconstructed from the
 * description above (negative lookbehind, lookahead for www/ftp) -- verify
 * against upstream before relying on it. */
#define REGEX_URL_HTTP   DEFS "(?<!(?:" HOSTNAMESEGMENTCHARS_CLASS "|[.]))(?=(?i:www|ftp))" HOSTNAME2 PORT URLPATH
Packit Service 3bdf47
/* "h323:", "sip:" or "sips:" (case-insensitive), then optional credentials,
 * host, optional port and optional VOIP_PATH. */
#define REGEX_URL_VOIP   DEFS "(?i:h323:|sips?:)" USERPASS URL_HOST PORT VOIP_PATH
Packit Service 3bdf47
#define REGEX_EMAIL      DEFS "(?i:mailto:)?" USER "@" EMAIL_HOST
Packit Service 3bdf47
/* "news:", "man:" or "info:" (case-insensitive) followed by one or more
 * characters from the listed class. Not prefixed with DEFS: it references no
 * named groups. */
#define REGEX_NEWS_MAN   "(?i:news:|man:|info:)[-[:alnum:]\\Q^_{|}~!\"#$%&'()*+,./;:=?`\\E]+"
Packit Service 3bdf47
Packit Service 3bdf47
#endif /* !TERMINAL_REGEX_H */