|
Packit |
d370c2 |
/*
|
|
Packit |
d370c2 |
* Copyright © 2015 Egmont Koblinger
|
|
Packit |
d370c2 |
*
|
|
Packit |
d370c2 |
* This program is free software: you can redistribute it and/or modify
|
|
Packit |
d370c2 |
* it under the terms of the GNU General Public License as published by
|
|
Packit |
d370c2 |
* the Free Software Foundation, either version 3 of the License, or
|
|
Packit |
d370c2 |
* (at your option) any later version.
|
|
Packit |
d370c2 |
*
|
|
Packit |
d370c2 |
* This program is distributed in the hope that it will be useful,
|
|
Packit |
d370c2 |
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
Packit |
d370c2 |
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
Packit |
d370c2 |
* GNU General Public License for more details.
|
|
Packit |
d370c2 |
*
|
|
Packit |
d370c2 |
* You should have received a copy of the GNU General Public License
|
|
Packit |
d370c2 |
* along with this program. If not, see <http://www.gnu.org/licenses/>.
|
|
Packit |
d370c2 |
*/
|
|
Packit |
d370c2 |
|
|
Packit |
d370c2 |
/*
|
|
Packit |
d370c2 |
* Mini style-guide:
|
|
Packit |
d370c2 |
*
|
|
Packit |
d370c2 |
* #define'd fragments should preferably have an outermost group, for the
|
|
Packit |
d370c2 |
* exact same reason as why usually in C/C++ #define's the values are enclosed
|
|
Packit |
d370c2 |
* in parentheses: that is, so that you don't get surprised when you use the
|
|
Packit |
d370c2 |
* macro and append a quantifier.
|
|
Packit |
d370c2 |
*
|
|
Packit |
d370c2 |
* For repeated fragments prefer regex-style (?(DEFINE)(?<NAME>(...))) and use
|
|
Packit |
d370c2 |
* as (?&NAME), so that the regex string and the compiled regex object is
|
|
Packit |
d370c2 |
* smaller.
|
|
Packit |
d370c2 |
*
|
|
Packit |
d370c2 |
* Build small blocks, comment and unittest them heavily.
|
|
Packit |
d370c2 |
*
|
|
Packit |
d370c2 |
* Use free-spacing mode for improved readability. The hardest to read is
|
|
Packit |
d370c2 |
* which additional characters belong to a "(?" prefix. To improve
|
|
Packit |
d370c2 |
* readability, place a space after this, and for symmetry, before the closing
|
|
Packit |
d370c2 |
* parenthesis. Also place a space around "|" characters. No space before
|
|
Packit |
d370c2 |
* quantifiers. Try to be consistent with the existing style (yes I know the
|
|
Packit |
d370c2 |
* existing style is not consistent either, but please do your best).
|
|
Packit |
d370c2 |
*
|
|
Packit |
d370c2 |
* See http://www.rexegg.com/regex-disambiguation.html for all the "(?"
|
|
Packit |
d370c2 |
* syntaxes.
|
|
Packit |
d370c2 |
*/
|
|
Packit |
d370c2 |
|
|
Packit |
d370c2 |
#ifndef TERMINAL_REGEX_H
|
|
Packit |
d370c2 |
#define TERMINAL_REGEX_H
|
|
Packit |
d370c2 |
|
|
Packit |
d370c2 |
/* Lookbehind to see if there's a preceding apostrophe */
|
|
Packit |
d370c2 |
/* Optional named group APOS_START; it only takes part in the match when the
 * character right before the current position is an apostrophe. */
#define APOS_START_DEF \
  "(?<APOS_START>" "(?<=')" ")?"
|
|
Packit |
d370c2 |
|
|
Packit |
d370c2 |
/* Case-insensitive, free-spacing alternation of the recognized URL schemes. */
#define SCHEME \
  "(?ix:" \
  " news | telnet | nntp" \
  " | https? | ftps? | sftp" \
  " | webcal )"
|
|
Packit |
d370c2 |
|
|
Packit |
d370c2 |
/* Members of the username character set; written without the enclosing
 * square brackets, which the user of this macro has to supply. */
#define USERCHARS "-+." "[:alnum:]"
|
|
Packit |
d370c2 |
/* Nonempty username, e.g. "john.smith" */
|
|
Packit |
d370c2 |
/* One or more characters taken from USERCHARS. */
#define USER \
  "[" USERCHARS "]" "+"
|
|
Packit |
d370c2 |
|
|
Packit |
d370c2 |
/* Bracket expression matching a single password character: dash, alphanumerics
 * and the punctuation quoted literally between \Q and \E. */
#define PASSCHARS_CLASS \
  "[-[:alnum:]" \
  "\\Q,?;.:/!%$^*&~\"#'\\E" \
  "]"
|
|
Packit |
d370c2 |
/* Optional colon-prefixed password. I guess empty password should be allowed, right? E.g. ":secret", ":", "" */
|
|
Packit |
d370c2 |
/* Optional group: a colon followed by zero or more PASSCHARS_CLASS characters. */
#define PASS \
  "(?x: " ":" PASSCHARS_CLASS "*" " )?"
|
|
Packit |
d370c2 |
|
|
Packit |
d370c2 |
/* Optional at-terminated username (with perhaps a password too), e.g. "joe@", "pete:secret@", "" */
|
|
Packit |
d370c2 |
/* Optional group: USER, then PASS, then a literal "@". */
#define USERPASS \
  "(?:" USER PASS "@" ")?"
|
|
Packit |
d370c2 |
|
|
Packit |
d370c2 |
/* S4: IPv4 segment (number between 0 and 255) with lookahead at the end so that we don't match "25" in the string "256".
|
|
Packit |
d370c2 |
The lookahead could go to the last segment of IPv4 only but this construct allows nicer unittesting. */
|
|
Packit |
d370c2 |
/* Defines the S4 subroutine: a decimal number 0..255 that must not be
 * followed by another digit. */
#define S4_DEF \
  "(?(DEFINE)(?<S4>(?x: " \
    "(?: [0-9] | [1-9][0-9] | 1[0-9]{2} | 2[0-4][0-9] | 25[0-5] )" \
    " (?! [0-9] )" \
  " )))"
|
|
Packit |
d370c2 |
|
|
Packit |
d370c2 |
/* IPV4: Decimal IPv4, e.g. "1.2.3.4", with lookahead (implemented in S4) at the end so that we don't match "192.168.1.123" in the string "192.168.1.1234". */
|
|
Packit |
d370c2 |
/* Pulls in S4_DEF and defines the IPV4 subroutine: three dot-terminated S4
 * segments followed by a fourth S4 segment. */
#define IPV4_DEF \
  S4_DEF \
  "(?(DEFINE)(?<IPV4>(?x: " \
    "(?: (?&S4) \\. ){3}" \
    " (?&S4)" \
  " )))"
|
|
Packit |
d370c2 |
|
|
Packit |
d370c2 |
/* IPv6, including embedded IPv4, e.g. "::1", "dead:beef::1.2.3.4".
|
|
Packit |
d370c2 |
* Lookahead for the next char not being a dot or digit, so it doesn't get stuck matching "dead:beef::1" in "dead:beef::1.2.3.4".
|
|
Packit |
d370c2 |
* This is not required since the surrounding brackets would trigger backtracking, but it allows nicer unittesting.
|
|
Packit |
d370c2 |
* TODO: more strict check (right number of colons, etc.)
|
|
Packit |
d370c2 |
* TODO: add zone_id: RFC 4007 section 11, RFC 6874 */
|
|
Packit |
d370c2 |
|
|
Packit |
d370c2 |
/* S6: IPv6 segment, S6C: IPv6 segment followed by a colon, CS6: colon followed by an IPv6 segment */
|
|
Packit |
d370c2 |
/* Defines three subroutines:
 *   S6  - one to four hexadecimal digits,
 *   CS6 - a colon followed by S6,
 *   S6C - S6 followed by a colon. */
#define S6_DEF \
  "(?(DEFINE)" \
    "(?<S6>[[:xdigit:]]{1,4})" \
    "(?<CS6>:(?&S6))" \
    "(?<S6C>(?&S6):)" \
  ")"
|
|
Packit |
d370c2 |
|
|
Packit |
d370c2 |
/* No :: shorthand */
|
|
Packit |
d370c2 |
/* Seven colon-terminated segments followed by an eighth segment. */
#define IPV6_FULL \
  "(?x: " "(?&S6C){7}" " (?&S6)" " )"
|
|
Packit |
d370c2 |
/* Begins with :: */
|
|
Packit |
d370c2 |
/* A colon followed by one to seven colon-prefixed segments. */
#define IPV6_LEFT \
  "(?x: " ":" " (?&CS6){1,7}" " )"
|
|
Packit |
d370c2 |
/* :: somewhere in the middle - use negative lookahead to make sure there aren't too many colons in total */
|
|
Packit |
d370c2 |
/* One to six colon-terminated segments followed by one to six colon-prefixed
 * segments; the leading negative lookahead rejects input in which eight
 * "hex digits + colon" runs follow each other. */
#define IPV6_MID \
  "(?x: " \
    "(?! (?: [[:xdigit:]]*: ){8} )" \
    " (?&S6C){1,6}" \
    " (?&CS6){1,6}" \
  " )"
|
|
Packit |
d370c2 |
/* Ends with :: */
|
|
Packit |
d370c2 |
/* One to seven colon-terminated segments followed by one more colon. */
#define IPV6_RIGHT \
  "(?x: " "(?&S6C){1,7}" " :" " )"
|
|
Packit |
d370c2 |
/* Is "::" and nothing more */
|
|
Packit |
d370c2 |
/* Exactly two colons. */
#define IPV6_NULL \
  "(?x: " "::" " )"
|
|
Packit |
d370c2 |
|
|
Packit |
d370c2 |
/* The same ones for IPv4-embedded notation, without the actual IPv4 part */
|
|
Packit |
d370c2 |
/* Six colon-terminated segments (the IPv4 part is appended by the user). */
#define IPV6V4_FULL \
  "(?x: " "(?&S6C){6}" " )"
|
|
Packit |
d370c2 |
/* Two colons followed by zero to five colon-terminated segments.
 * Because zero segments are allowed, this includes "::<ipv4>". */
#define IPV6V4_LEFT \
  "(?x: " "::" " (?&S6C){0,5}" " )"
|
|
Packit |
d370c2 |
/* IPv4-embedded prefix with "::" somewhere in the middle: one to four
 * colon-terminated segments, one to four colon-prefixed segments and a final
 * colon. The negative lookahead rejects input in which seven
 * "hex digits + colon" runs follow each other.
 * The final colon is kept inside the outermost (?x: ) group so that the whole
 * fragment behaves as a single unit (a quantifier appended by a user applies
 * to all of it) and the separating space is always read in free-spacing mode. */
#define IPV6V4_MID "(?x: (?! (?: [[:xdigit:]]*: ){7} ) (?&S6C){1,4} (?&CS6){1,4} : )"
|
|
Packit |
d370c2 |
/* One to five colon-terminated segments followed by one more colon. */
#define IPV6V4_RIGHT \
  "(?x: " "(?&S6C){1,5}" " :" " )"
|
|
Packit |
d370c2 |
|
|
Packit |
d370c2 |
/* IPV6: An IPv6 address (possibly with an embedded IPv4).
|
|
Packit |
d370c2 |
* This macro defines both IPV4 and IPV6, since the latter one requires the former. */
|
|
Packit |
d370c2 |
/* Prepends IPV4_DEF and S6_DEF, then defines the IPV6 subroutine: either one
 * of the plain IPv6 forms (null, left, mid, right, full), or one of the
 * IPv4-embedded prefixes followed by (?&IPV4). The trailing negative
 * lookahead rejects a match that is immediately followed by a dot, a colon
 * or a hexadecimal digit. */
#define IP_DEF IPV4_DEF S6_DEF "(?(DEFINE)(?<IPV6>(?x: (?: " IPV6_NULL " | " IPV6_LEFT " | " IPV6_MID " | " IPV6_RIGHT " | " IPV6_FULL " | (?: " IPV6V4_FULL " | " IPV6V4_LEFT " | " IPV6V4_MID " | " IPV6V4_RIGHT " ) (?&IPV4) ) (?! [.:[:xdigit:]] ) )))"
|
|
Packit |
d370c2 |
|
|
Packit |
d370c2 |
/* Either an alphanumeric character or dash; or if [negative lookahead] not ASCII
|
|
Packit |
d370c2 |
* then any graphical Unicode character.
|
|
Packit |
d370c2 |
* A segment can consist entirely of numbers.
|
|
Packit |
d370c2 |
* (Note: PCRE doesn't support character class subtraction/intersection.) */
|
|
Packit |
d370c2 |
/* Matches one hostname character: a dash or alphanumeric, or else a
 * [[:graph:]] character that the negative lookahead has shown not to be
 * in [[:ascii:]]. */
#define HOSTNAMESEGMENTCHARS_CLASS \
  "(?x: " \
    "[-[:alnum:]]" \
    " | " \
    "(?! [[:ascii:]] ) [[:graph:]]" \
  " )"
|
|
Packit |
d370c2 |
|
|
Packit |
d370c2 |
/* A hostname of at least 1 component. The last component cannot be entirely numbers.
|
|
Packit |
d370c2 |
* E.g. "foo", "example.com", "1234.com", but not "foo.123" */
|
|
Packit |
d370c2 |
/* Zero or more dot-terminated segments, then a last segment that must contain
 * a position not followed by a digit (enforced by the negative lookahead
 * placed between its optional head and its mandatory tail). */
#define HOSTNAME1 \
  "(?x: " \
    "(?: " HOSTNAMESEGMENTCHARS_CLASS "+ \\. )* " \
    HOSTNAMESEGMENTCHARS_CLASS "* " \
    "(?! [0-9] ) " \
    HOSTNAMESEGMENTCHARS_CLASS "+" \
  " )"
|
|
Packit |
d370c2 |
|
|
Packit |
d370c2 |
/* A hostname of at least 2 components. The last component cannot be entirely numbers.
|
|
Packit |
d370c2 |
* E.g. "example.com", "1234.com", but not "1234.56" */
|
|
Packit |
d370c2 |
/* One or more dot-terminated segments in front of a HOSTNAME1. */
#define HOSTNAME2 \
  "(?x: " \
    "(?: " HOSTNAMESEGMENTCHARS_CLASS "+ \\.)+ " \
    HOSTNAME1 \
  " )"
|
|
Packit |
d370c2 |
|
|
Packit |
d370c2 |
/* For URL: Hostname, IPv4, or bracket-enclosed IPv6, e.g. "example.com", "1.2.3.4", "[::1]" */
|
|
Packit |
d370c2 |
/* Host part of a URL: HOSTNAME1, or an IPV4, or an IPV6 wrapped in literal
 * square brackets. */
#define URL_HOST \
  "(?x: " \
    HOSTNAME1 \
    " | (?&IPV4)" \
    " | \\[ (?&IPV6) \\]" \
  " )"
|
|
Packit |
d370c2 |
|
|
Packit |
d370c2 |
/* For e-mail: Hostname of at least two segments, or bracket-enclosed IPv4 or IPv6, e.g. "example.com", "[1.2.3.4]", "[::1]".
|
|
Packit |
d370c2 |
* Technically an e-mail with a single-component hostname might be valid on a local network, but let's avoid tons of false positives (e.g. in a typical shell prompt). */
|
|
Packit |
d370c2 |
/* Host part of an e-mail address: HOSTNAME2, or an IPV4 / IPV6 wrapped in
 * literal square brackets. */
#define EMAIL_HOST \
  "(?x: " \
    HOSTNAME2 \
    " | " \
    "\\[ (?: (?&IPV4) | (?&IPV6) ) \\]" \
  " )"
|
|
Packit |
d370c2 |
|
|
Packit |
d370c2 |
/* Number between 1 and 65535, with lookahead at the end so that we don't match "6789" in the string "67890",
|
|
Packit |
d370c2 |
and in turn we don't eventually match "http://host:6789" in "http://host:67890". */
|
|
Packit |
d370c2 |
/* Decimal number 1..65535 without leading zeros, one alternative per digit
 * range, which must not be followed by another digit. */
#define N_1_65535 \
  "(?x: (?: " \
    "[1-9][0-9]{0,3}" \
    " | [1-5][0-9]{4}" \
    " | 6[0-4][0-9]{3}" \
    " | 65[0-4][0-9]{2}" \
    " | 655[0-2][0-9]" \
    " | 6553[0-5]" \
  " ) (?! [0-9] ) )"
|
|
Packit |
d370c2 |
|
|
Packit |
d370c2 |
/* Optional colon-prefixed port, e.g. ":1080", "" */
|
|
Packit |
d370c2 |
/* Optional group: a literal colon followed by N_1_65535. */
#define PORT \
  "(?x: " "\\:" N_1_65535 " )?"
|
|
Packit |
d370c2 |
|
|
Packit |
d370c2 |
/* Omit the parentheses, see below */
|
|
Packit |
d370c2 |
/* Bracket expression matching a single path character: dash, alphanumerics
 * and the punctuation quoted literally between \Q and \E. Parentheses and
 * square brackets are deliberately not members. */
#define PATHCHARS_CLASS \
  "[-[:alnum:]" \
  "\\Q_$.+!*,:;@&=?/~#|%'\\E" \
  "]"
|
|
Packit |
d370c2 |
/* Chars to end a URL. Apostrophe only allowed if there wasn't one in front of the URL, see bug 448044 */
|
|
Packit |
d370c2 |
/* Bracket expression for the characters accepted as the last character of a
 * path, apostrophe included. */
#define PATHTERM_CLASS \
  "[-[:alnum:]" \
  "\\Q_$+*:@&=/~#|%'\\E" \
  "]"
|
|
Packit |
d370c2 |
/* Same set as PATHTERM_CLASS minus the apostrophe. */
#define PATHTERM_NOAPOS_CLASS \
  "[-[:alnum:]" \
  "\\Q_$+*:@&=/~#|%\\E" \
  "]"
|
|
Packit |
d370c2 |
|
|
Packit |
d370c2 |
/* Recursive definition of PATH that allows parentheses and square brackets only if balanced, see bug 763980. */
|
|
Packit |
d370c2 |
/* Defines the PATH_INNER subroutine: zero or more repetitions of "a run of
 * PATHCHARS_CLASS characters followed by a parenthesized or square-bracketed
 * group that recurses into PATH_INNER", then a final run of PATHCHARS_CLASS
 * characters. Every part is optional, so PATH_INNER can match the empty
 * string. */
#define PATH_INNER_DEF "(?(DEFINE)(?<PATH_INNER>(?x: (?: " PATHCHARS_CLASS "* (?: \\( (?&PATH_INNER) \\) | \\[ (?&PATH_INNER) \\] ) )* " PATHCHARS_CLASS "* )))"
|
|
Packit |
d370c2 |
/* Same as above, but the last character (if exists and is not a parenthesis) must be from PATHTERM_CLASS. */
|
|
Packit |
d370c2 |
/* Defines the PATH subroutine. The bracket-balancing prefix is the same as in
 * PATH_INNER, but the optional tail has to end in a terminating character:
 * the conditional on the APOS_START group selects PATHTERM_NOAPOS_CLASS when
 * that group took part in the match, and PATHTERM_CLASS otherwise. */
#define PATH_DEF "(?(DEFINE)(?<PATH>(?x: (?: " PATHCHARS_CLASS "* (?: \\( (?&PATH_INNER) \\) | \\[ (?&PATH_INNER) \\] ) )* (?: " PATHCHARS_CLASS "* (?(<APOS_START>)" PATHTERM_NOAPOS_CLASS "|" PATHTERM_CLASS ") )? )))"
|
|
Packit |
d370c2 |
|
|
Packit |
d370c2 |
/* Optional group: a slash followed by the PATH subroutine. */
#define URLPATH \
  "(?x: " "/(?&PATH)" " )?"
|
|
Packit |
d370c2 |
/* Optional group: a semicolon or question mark followed by the PATH
 * subroutine. */
#define VOIP_PATH \
  "(?x: " "[;?](?&PATH)" " )?"
|
|
Packit |
d370c2 |
|
|
Packit |
d370c2 |
/* Now let's put these fragments together */
|
|
Packit |
d370c2 |
|
|
Packit |
d370c2 |
/* All the definition fragments, concatenated in the order
 * APOS_START, IP, PATH_INNER, PATH. */
#define DEFS \
  APOS_START_DEF \
  IP_DEF \
  PATH_INNER_DEF \
  PATH_DEF
|
|
Packit |
d370c2 |
|
|
Packit |
d370c2 |
/* Full URL: scheme, "://", optional user/password, host, optional port,
 * optional path. */
#define REGEX_URL_AS_IS \
  DEFS \
  SCHEME "://" \
  USERPASS \
  URL_HOST \
  PORT \
  URLPATH
|
|
Packit |
d370c2 |
/* TODO: also support file:/etc/passwd */
|
|
Packit |
d370c2 |
/* "file:/" URL (case-insensitive): optionally "/", an optional HOSTNAME1 and
 * another "/", then a check that no further slash follows, then PATH. */
#define REGEX_URL_FILE \
  DEFS \
  "(?ix: file:/ " \
    "(?: / (?: " HOSTNAME1 " )? / )?" \
    " (?! / )" \
  " )" \
  "(?&PATH)"
|
|
Packit |
d370c2 |
/* Lookbehind so that we don't catch "abc.www.foo.bar", bug 739757. Lookahead for www/ftp for convenience (so that we can reuse HOSTNAME1). */
|
|
Packit |
d370c2 |
/* Scheme-less URL starting with "www" or "ftp". The negative lookbehind
 * refuses a start position preceded by a hostname character or a dot; the
 * lookahead requires the host to begin with www/ftp (case-insensitively) so
 * that HOSTNAME1 can be reused for the host itself.
 * NOTE(review): this definition was cut off right after "(?" (unterminated
 * string literal) in the text under review; the remainder was rebuilt from
 * the description of the intended lookbehind/lookahead -- confirm against
 * the canonical copy of this header. */
#define REGEX_URL_HTTP DEFS "(?<!(?:" HOSTNAMESEGMENTCHARS_CLASS "|[.]))(?=(?i:www|ftp))" HOSTNAME1 PORT URLPATH
|
|
Packit |
d370c2 |
/* VoIP URL: case-insensitive "h323:", "sip:" or "sips:" prefix, optional
 * user/password, host, optional port, optional VOIP_PATH. */
#define REGEX_URL_VOIP \
  DEFS \
  "(?i:h323:|sips?:)" \
  USERPASS \
  URL_HOST \
  PORT \
  VOIP_PATH
|
|
Packit |
d370c2 |
/* E-mail address: optional case-insensitive "mailto:" prefix, USER, "@",
 * EMAIL_HOST. */
#define REGEX_EMAIL \
  DEFS \
  "(?i:mailto:)?" \
  USER "@" EMAIL_HOST
|
|
Packit |
d370c2 |
/* Case-insensitive "news:", "man:" or "info:" prefix followed by one or more
 * characters from the bracket expression (dash, alphanumerics and the
 * punctuation quoted between \Q and \E). Uses none of the DEFS fragments. */
#define REGEX_NEWS_MAN \
  "(?i:news:|man:|info:)" \
  "[-[:alnum:]" \
  "\\Q^_{|}~!\"#$%&'()*+,./;:=?`\\E" \
  "]+"
|
|
Packit |
d370c2 |
|
|
Packit |
d370c2 |
#endif /* !TERMINAL_REGEX_H */
|