|
Packit |
b89d10 |
Oniguruma Regular Expressions Version 6.8.0 2018/04/13
|
|
Packit |
b89d10 |
|
|
Packit |
b89d10 |
syntax: ONIG_SYNTAX_ONIGURUMA (default)
|
|
Packit |
b89d10 |
|
|
Packit |
b89d10 |
|
|
Packit |
b89d10 |
1. Syntax elements
|
|
Packit |
b89d10 |
|
|
Packit |
b89d10 |
\ escape (enable or disable meta character)
|
|
Packit |
b89d10 |
| alternation
|
|
Packit |
b89d10 |
(...) group
|
|
Packit |
b89d10 |
[...] character class
|
|
Packit |
b89d10 |
|
|
Packit |
b89d10 |
|
|
Packit |
b89d10 |
2. Characters
|
|
Packit |
b89d10 |
|
|
Packit |
b89d10 |
\t horizontal tab (0x09)
|
|
Packit |
b89d10 |
\v vertical tab (0x0B)
|
|
Packit |
b89d10 |
\n newline (line feed) (0x0A)
|
|
Packit |
b89d10 |
\r carriage return (0x0D)
|
|
Packit |
b89d10 |
\b backspace (0x08)
|
|
Packit |
b89d10 |
\f form feed (0x0C)
|
|
Packit |
b89d10 |
\a bell (0x07)
|
|
Packit |
b89d10 |
\e escape (0x1B)
|
|
Packit |
b89d10 |
\nnn octal char (encoded byte value)
|
|
Packit |
b89d10 |
\o{17777777777} wide octal char (character code point value)
|
|
Packit |
b89d10 |
\uHHHH wide hexadecimal char (character code point value)
|
|
Packit |
b89d10 |
\xHH hexadecimal char (encoded byte value)
|
|
Packit |
b89d10 |
\x{7HHHHHHH} wide hexadecimal char (character code point value)
|
|
Packit |
b89d10 |
\cx control char (character code point value)
|
|
Packit |
b89d10 |
\C-x control char (character code point value)
|
|
Packit |
b89d10 |
\M-x meta (x|0x80) (character code point value)
|
|
Packit |
b89d10 |
\M-\C-x meta control char (character code point value)
|
|
Packit |
b89d10 |
|
|
Packit |
b89d10 |
(* \b as backspace is effective in character class only)
|
|
Packit |
b89d10 |
|
|
Packit |
b89d10 |
|
|
Packit |
b89d10 |
3. Character types
|
|
Packit |
b89d10 |
|
|
Packit |
b89d10 |
. any character (except newline)
|
|
Packit |
b89d10 |
|
|
Packit |
b89d10 |
\w word character
|
|
Packit |
b89d10 |
|
|
Packit |
b89d10 |
Not Unicode:
|
|
Packit |
b89d10 |
alphanumeric, "_" and multibyte char.
|
|
Packit |
b89d10 |
|
|
Packit |
b89d10 |
Unicode:
|
|
Packit |
b89d10 |
General_Category -- (Letter|Mark|Number|Connector_Punctuation)
|
|
Packit |
b89d10 |
|
|
Packit |
b89d10 |
\W non-word char
|
|
Packit |
b89d10 |
|
|
Packit |
b89d10 |
\s whitespace char
|
|
Packit |
b89d10 |
|
|
Packit |
b89d10 |
Not Unicode:
|
|
Packit |
b89d10 |
\t, \n, \v, \f, \r, \x20
|
|
Packit |
b89d10 |
|
|
Packit |
b89d10 |
Unicode case:
|
|
Packit |
b89d10 |
U+0009, U+000A, U+000B, U+000C, U+000D, U+0085(NEL),
|
|
Packit |
b89d10 |
General_Category -- Line_Separator
|
|
Packit |
b89d10 |
-- Paragraph_Separator
|
|
Packit |
b89d10 |
-- Space_Separator
|
|
Packit |
b89d10 |
|
|
Packit |
b89d10 |
\S non-whitespace char
|
|
Packit |
b89d10 |
|
|
Packit |
b89d10 |
\d decimal digit char
|
|
Packit |
b89d10 |
|
|
Packit |
b89d10 |
Unicode: General_Category -- Decimal_Number
|
|
Packit |
b89d10 |
|
|
Packit |
b89d10 |
\D non-decimal-digit char
|
|
Packit |
b89d10 |
|
|
Packit |
b89d10 |
\h hexadecimal digit char [0-9a-fA-F]
|
|
Packit |
b89d10 |
|
|
Packit |
b89d10 |
\H non-hexdigit char
|
|
Packit |
b89d10 |
|
|
Packit |
b89d10 |
\R general newline (* can't be used in character-class)
|
|
Packit |
b89d10 |
"\r\n" or \n,\v,\f,\r (* but doesn't backtrack from \r\n to \r)
|
|
Packit |
b89d10 |
|
|
Packit |
b89d10 |
Unicode case:
|
|
Packit |
b89d10 |
"\r\n" or \n,\v,\f,\r or U+0085, U+2028, U+2029
|
|
Packit |
b89d10 |
|
|
Packit |
b89d10 |
\N negative newline (?-m:.)
|
|
Packit |
b89d10 |
|
|
Packit |
b89d10 |
\O true anychar (?m:.) (* original function)
|
|
Packit |
b89d10 |
|
|
Packit |
b89d10 |
\X Extended Grapheme Cluster (?>\O(?:\Y\O)*)
|
|
Packit |
b89d10 |
|
|
Packit |
b89d10 |
\X doesn't check whether matching start position is boundary.
|
|
Packit |
b89d10 |
Write as \y\X if you want to ensure it.
|
|
Packit |
b89d10 |
|
|
Packit |
b89d10 |
Unicode case:
|
|
Packit |
b89d10 |
See [Unicode Standard Annex #29: http://unicode.org/reports/tr29/]
|
|
Packit |
b89d10 |
|
|
Packit |
b89d10 |
Not Unicode: (?>\r\n|\O)
|
|
Packit |
b89d10 |
|
|
Packit |
b89d10 |
|
|
Packit |
b89d10 |
Character Property
|
|
Packit |
b89d10 |
|
|
Packit |
b89d10 |
* \p{property-name}
|
|
Packit |
b89d10 |
* \p{^property-name} (negative)
|
|
Packit |
b89d10 |
* \P{property-name} (negative)
|
|
Packit |
b89d10 |
|
|
Packit |
b89d10 |
property-name:
|
|
Packit |
b89d10 |
|
|
Packit |
b89d10 |
+ works on all encodings
|
|
Packit |
b89d10 |
Alnum, Alpha, Blank, Cntrl, Digit, Graph, Lower,
|
|
Packit |
b89d10 |
Print, Punct, Space, Upper, XDigit, Word, ASCII
|
|
Packit |
b89d10 |
|
|
Packit |
b89d10 |
+ works on EUC_JP, Shift_JIS
|
|
Packit |
b89d10 |
Hiragana, Katakana
|
|
Packit |
b89d10 |
|
|
Packit |
b89d10 |
+ works on UTF8, UTF16, UTF32
|
|
Packit |
b89d10 |
See doc/UNICODE_PROPERTIES.
|
|
Packit |
b89d10 |
|
|
Packit |
b89d10 |
|
|
Packit |
b89d10 |
|
|
Packit |
b89d10 |
4. Quantifier
|
|
Packit |
b89d10 |
|
|
Packit |
b89d10 |
greedy
|
|
Packit |
b89d10 |
|
|
Packit |
b89d10 |
? 1 or 0 times
|
|
Packit |
b89d10 |
* 0 or more times
|
|
Packit |
b89d10 |
+ 1 or more times
|
|
Packit |
b89d10 |
{n,m} at least n but no more than m times
|
|
Packit |
b89d10 |
{n,} at least n times
|
|
Packit |
b89d10 |
{,n} at least 0 but no more than n times ({0,n})
|
|
Packit |
b89d10 |
{n} n times
|
|
Packit |
b89d10 |
|
|
Packit |
b89d10 |
reluctant
|
|
Packit |
b89d10 |
|
|
Packit |
b89d10 |
?? 1 or 0 times
|
|
Packit |
b89d10 |
*? 0 or more times
|
|
Packit |
b89d10 |
+? 1 or more times
|
|
Packit |
b89d10 |
{n,m}? at least n but not more than m times
|
|
Packit |
b89d10 |
{n,}? at least n times
|
|
Packit |
b89d10 |
{,n}? at least 0 but not more than n times (== {0,n}?)
|
|
Packit |
b89d10 |
|
|
Packit |
b89d10 |
possessive (greedy and does not backtrack once matched)
|
|
Packit |
b89d10 |
|
|
Packit |
b89d10 |
?+ 1 or 0 times
|
|
Packit |
b89d10 |
*+ 0 or more times
|
|
Packit |
b89d10 |
++ 1 or more times
|
|
Packit |
b89d10 |
|
|
Packit |
b89d10 |
({n,m}+, {n,}+, {n}+ are possessive op. in ONIG_SYNTAX_JAVA only)
|
|
Packit |
b89d10 |
|
|
Packit |
b89d10 |
ex. /a*+/ === /(?>a*)/
|
|
Packit |
b89d10 |
|
|
Packit |
b89d10 |
|
|
Packit |
b89d10 |
5. Anchors
|
|
Packit |
b89d10 |
|
|
Packit |
b89d10 |
^ beginning of the line
|
|
Packit |
b89d10 |
$ end of the line
|
|
Packit |
b89d10 |
\b word boundary
|
|
Packit |
b89d10 |
\B non-word boundary
|
|
Packit |
b89d10 |
\y Extended Grapheme Cluster boundary
|
|
Packit |
b89d10 |
\Y Extended Grapheme Cluster non-boundary
|
|
Packit |
b89d10 |
|
|
Packit |
b89d10 |
\A beginning of string
|
|
Packit |
b89d10 |
\Z end of string, or before newline at the end
|
|
Packit |
b89d10 |
\z end of string
|
|
Packit |
b89d10 |
\G where the current search attempt begins
|
|
Packit |
b89d10 |
\K keep (keep start position of the result string)
|
|
Packit |
b89d10 |
|
|
Packit |
b89d10 |
|
|
Packit |
b89d10 |
|
|
Packit |
b89d10 |
6. Character class
|
|
Packit |
b89d10 |
|
|
Packit |
b89d10 |
^... negative class (lowest precedence)
|
|
Packit |
b89d10 |
x-y range from x to y
|
|
Packit |
b89d10 |
[...] set (character class in character class)
|
|
Packit |
b89d10 |
..&&.. intersection (low precedence, only higher than ^)
|
|
Packit |
b89d10 |
|
|
Packit |
b89d10 |
ex. [a-w&&[^c-g]z] ==> ([a-w] AND ([^c-g] OR z)) ==> [abh-w]
|
|
Packit |
b89d10 |
|
|
Packit |
b89d10 |
* If you want to use '[', '-', or ']' as a normal character
|
|
Packit |
b89d10 |
in character class, you should escape them with '\'.
|
|
Packit |
b89d10 |
|
|
Packit |
b89d10 |
|
|
Packit |
b89d10 |
POSIX bracket ([:xxxxx:], negate [:^xxxxx:])
|
|
Packit |
b89d10 |
|
|
Packit |
b89d10 |
Not Unicode Case:
|
|
Packit |
b89d10 |
|
|
Packit |
b89d10 |
alnum alphabet or digit char
|
|
Packit |
b89d10 |
alpha alphabet
|
|
Packit |
b89d10 |
ascii code value: [0 - 127]
|
|
Packit |
b89d10 |
blank \t, \x20
|
|
Packit |
b89d10 |
cntrl
|
|
Packit |
b89d10 |
digit 0-9
|
|
Packit |
b89d10 |
graph include all of multibyte encoded characters
|
|
Packit |
b89d10 |
lower
|
|
Packit |
b89d10 |
print include all of multibyte encoded characters
|
|
Packit |
b89d10 |
punct
|
|
Packit |
b89d10 |
space \t, \n, \v, \f, \r, \x20
|
|
Packit |
b89d10 |
upper
|
|
Packit |
b89d10 |
xdigit 0-9, a-f, A-F
|
|
Packit |
b89d10 |
word alphanumeric, "_" and multibyte characters
|
|
Packit |
b89d10 |
|
|
Packit |
b89d10 |
|
|
Packit |
b89d10 |
Unicode Case:
|
|
Packit |
b89d10 |
|
|
Packit |
b89d10 |
alnum Letter | Mark | Decimal_Number
|
|
Packit |
b89d10 |
alpha Letter | Mark
|
|
Packit |
b89d10 |
ascii 0000 - 007F
|
|
Packit |
b89d10 |
blank Space_Separator | 0009
|
|
Packit |
b89d10 |
cntrl Control | Format | Unassigned | Private_Use | Surrogate
|
|
Packit |
b89d10 |
digit Decimal_Number
|
|
Packit |
b89d10 |
graph [[:^space:]] && ^Control && ^Unassigned && ^Surrogate
|
|
Packit |
b89d10 |
lower Lowercase_Letter
|
|
Packit |
b89d10 |
print [[:graph:]] | [[:space:]]
|
|
Packit |
b89d10 |
punct Connector_Punctuation | Dash_Punctuation | Close_Punctuation |
|
|
Packit |
b89d10 |
Final_Punctuation | Initial_Punctuation | Other_Punctuation |
|
|
Packit |
b89d10 |
Open_Punctuation
|
|
Packit |
b89d10 |
space Space_Separator | Line_Separator | Paragraph_Separator |
|
|
Packit |
b89d10 |
U+0009 | U+000A | U+000B | U+000C | U+000D | U+0085
|
|
Packit |
b89d10 |
upper Uppercase_Letter
|
|
Packit |
b89d10 |
xdigit U+0030 - U+0039 | U+0041 - U+0046 | U+0061 - U+0066
|
|
Packit |
b89d10 |
(0-9, a-f, A-F)
|
|
Packit |
b89d10 |
word Letter | Mark | Decimal_Number | Connector_Punctuation
|
|
Packit |
b89d10 |
|
|
Packit |
b89d10 |
|
|
Packit |
b89d10 |
|
|
Packit |
b89d10 |
7. Extended groups
|
|
Packit |
b89d10 |
|
|
Packit |
b89d10 |
(?#...) comment
|
|
Packit |
b89d10 |
|
|
Packit |
b89d10 |
(?imxWDSP-imxWDSP) option on/off
|
|
Packit |
b89d10 |
i: ignore case
|
|
Packit |
b89d10 |
m: multi-line (dot (.) also matches newline)
|
|
Packit |
b89d10 |
x: extended form
|
|
Packit |
b89d10 |
W: ASCII only word (\w, \p{Word}, [[:word:]])
|
|
Packit |
b89d10 |
ASCII only word bound (\b)
|
|
Packit |
b89d10 |
D: ASCII only digit (\d, \p{Digit}, [[:digit:]])
|
|
Packit |
b89d10 |
S: ASCII only space (\s, \p{Space}, [[:space:]])
|
|
Packit |
b89d10 |
P: ASCII only POSIX properties (includes W,D,S)
|
|
Packit |
b89d10 |
(alnum, alpha, blank, cntrl, digit, graph,
|
|
Packit |
b89d10 |
lower, print, punct, space, upper, xdigit, word)
|
|
Packit |
b89d10 |
|
|
Packit |
b89d10 |
(?imxWDSP-imxWDSP:subexp) option on/off for subexp
|
|
Packit |
b89d10 |
|
|
Packit |
b89d10 |
(?:subexp) non-capturing group
|
|
Packit |
b89d10 |
(subexp) capturing group
|
|
Packit |
b89d10 |
|
|
Packit |
b89d10 |
(?=subexp) look-ahead
|
|
Packit |
b89d10 |
(?!subexp) negative look-ahead
|
|
Packit |
b89d10 |
(?<=subexp) look-behind
|
|
Packit |
b89d10 |
(?<!subexp) negative look-behind
|
|
Packit |
b89d10 |
|
|
Packit |
b89d10 |
Subexp of look-behind must be fixed-width.
|
|
Packit |
b89d10 |
But top-level alternatives can be of various lengths.
|
|
Packit |
b89d10 |
ex. (?<=a|bc) is OK. (?<=aaa(?:b|cd)) is not allowed.
|
|
Packit |
b89d10 |
|
|
Packit |
b89d10 |
In negative look-behind, capturing group isn't allowed,
|
|
Packit |
b89d10 |
but non-capturing group (?:) is allowed.
|
|
Packit |
b89d10 |
|
|
Packit |
b89d10 |
(?>subexp) atomic group
|
|
Packit |
b89d10 |
no backtracks in subexp.
|
|
Packit |
b89d10 |
|
|
Packit |
b89d10 |
(?<name>subexp), (?'name'subexp)
|
|
Packit |
b89d10 |
define named group
|
|
Packit |
b89d10 |
(Each character of the name must be a word character.)
|
|
Packit |
b89d10 |
|
|
Packit |
b89d10 |
Not only a name but a number is assigned like a capturing
|
|
Packit |
b89d10 |
group.
|
|
Packit |
b89d10 |
|
|
Packit |
b89d10 |
Assigning the same name to two or more subexps is allowed.
|
|
Packit |
b89d10 |
|
|
Packit |
b89d10 |
|
|
Packit |
b89d10 |
<Callouts>
|
|
Packit |
b89d10 |
|
|
Packit |
b89d10 |
* Callouts of contents
|
|
Packit |
b89d10 |
(?{...contents...}) callout in progress
|
|
Packit |
b89d10 |
(?{...contents...}D) D is a direction flag char
|
|
Packit |
b89d10 |
D = 'X': in progress and retraction
|
|
Packit |
b89d10 |
'<': in retraction only
|
|
Packit |
b89d10 |
'>': in progress only
|
|
Packit |
b89d10 |
(?{...contents...}[tag]) tag assigned
|
|
Packit |
b89d10 |
(?{...contents...}[tag]D)
|
|
Packit |
b89d10 |
|
|
Packit |
b89d10 |
* Escape characters have no effects in contents.
|
|
Packit |
b89d10 |
* contents is not allowed to start with '{'.
|
|
Packit |
b89d10 |
|
|
Packit |
b89d10 |
(?{{{...contents...}}}) n times continuations '}' in contents is allowed in
|
|
Packit |
b89d10 |
(n+1) times continuations {{{...}}}.
|
|
Packit |
b89d10 |
|
|
Packit |
b89d10 |
Allowed tag string characters: _ A-Z a-z 0-9 (* first character: _ A-Z a-z)
|
|
Packit |
b89d10 |
|
|
Packit |
b89d10 |
|
|
Packit |
b89d10 |
* Callouts of name
|
|
Packit |
b89d10 |
(*name)
|
|
Packit |
b89d10 |
(*name{args...}) with args
|
|
Packit |
b89d10 |
(*name[tag]) tag assigned
|
|
Packit |
b89d10 |
(*name[tag]{args...})
|
|
Packit |
b89d10 |
|
|
Packit |
b89d10 |
Allowed name string characters: _ A-Z a-z 0-9 (* first character: _ A-Z a-z)
|
|
Packit |
b89d10 |
Allowed tag string characters: _ A-Z a-z 0-9 (* first character: _ A-Z a-z)
|
|
Packit |
b89d10 |
|
|
Packit |
b89d10 |
|
|
Packit |
b89d10 |
<Absent functions>
|
|
Packit |
b89d10 |
|
|
Packit |
b89d10 |
(?~absent) Absent repeater (* proposed by Tanaka Akira)
|
|
Packit |
b89d10 |
This works like .* (more precisely \O*), but it is
|
|
Packit |
b89d10 |
limited by the range that does not include the string
|
|
Packit |
b89d10 |
match with <absent>.
|
|
Packit |
b89d10 |
This is a written abbreviation of (?~|absent|\O*).
|
|
Packit |
b89d10 |
\O* is used as a repeater.
|
|
Packit |
b89d10 |
|
|
Packit |
b89d10 |
(?~|absent|exp) Absent expression (* original)
|
|
Packit |
b89d10 |
This works like "exp", but it is limited by the range
|
|
Packit |
b89d10 |
that does not include the string match with <absent>.
|
|
Packit |
b89d10 |
|
|
Packit |
b89d10 |
ex. (?~|345|\d*) "12345678" ==> "12", "1", ""
|
|
Packit |
b89d10 |
|
|
Packit |
b89d10 |
(?~|absent) Absent stopper (* original)
|
|
Packit |
b89d10 |
After passing this operator, the string's right range is limited
|
|
Packit |
b89d10 |
at the point that does not include the string match with
|
|
Packit |
b89d10 |
<absent>.
|
|
Packit |
b89d10 |
|
|
Packit |
b89d10 |
(?~|) Range clear
|
|
Packit |
b89d10 |
Clear the effects caused by Absent stoppers.
|
|
Packit |
b89d10 |
|
|
Packit |
b89d10 |
* Nested Absent functions are not supported and the behavior
|
|
Packit |
b89d10 |
is undefined.
|
|
Packit |
b89d10 |
|
|
Packit |
b89d10 |
|
|
Packit |
b89d10 |
<if-then-else>
|
|
Packit |
b89d10 |
|
|
Packit |
b89d10 |
(?(condition_exp)then_exp|else_exp) if-then-else
|
|
Packit |
b89d10 |
(?(condition_exp)then_exp) if-then
|
|
Packit |
b89d10 |
|
|
Packit |
b89d10 |
condition_exp can be a backreference number/name or a normal
|
|
Packit |
b89d10 |
regular expression.
|
|
Packit |
b89d10 |
When condition_exp is a backreference number/name, both then_exp and
|
|
Packit |
b89d10 |
else_exp can be omitted.
|
|
Packit |
b89d10 |
Then it works as a backreference validity checker.
|
|
Packit |
b89d10 |
|
|
Packit |
b89d10 |
[ backreference validity checker ] (* original)
|
|
Packit |
b89d10 |
|
|
Packit |
b89d10 |
(?(n)), (?(-n)), (?(+n)), (?(n+level)) ...
|
|
Packit |
b89d10 |
(?(<n>)), (?('-n')), (?(<+n>)) ...
|
|
Packit |
b89d10 |
(?(<name>)), (?('name')), (?(<name+level>)) ...
|
|
Packit |
b89d10 |
|
|
Packit |
b89d10 |
|
|
Packit |
b89d10 |
|
|
Packit |
b89d10 |
8. Backreferences
|
|
Packit |
b89d10 |
|
|
Packit |
b89d10 |
When we say "backreference a group," it actually means, "re-match the same
|
|
Packit |
b89d10 |
text matched by the subexp in that group."
|
|
Packit |
b89d10 |
|
|
Packit |
b89d10 |
\n \k<n> \k'n' (n >= 1) backreference the nth group in the regexp
|
|
Packit |
b89d10 |
\k<-n> \k'-n' (n >= 1) backreference the nth group counting
|
|
Packit |
b89d10 |
backwards from the referring position
|
|
Packit |
b89d10 |
\k<+n> \k'+n' (n >= 1) backreference the nth group counting
|
|
Packit |
b89d10 |
forwards from the referring position
|
|
Packit |
b89d10 |
\k<name> \k'name' backreference a group with the specified name
|
|
Packit |
b89d10 |
|
|
Packit |
b89d10 |
When backreferencing with a name that is assigned to more than one group,
|
|
Packit |
b89d10 |
the last group with the name is checked first, if not matched then the
|
|
Packit |
b89d10 |
previous one with the name, and so on, until there is a match.
|
|
Packit |
b89d10 |
|
|
Packit |
b89d10 |
* Backreference by number is forbidden if any named group is defined and
|
|
Packit |
b89d10 |
ONIG_OPTION_CAPTURE_GROUP is not set.
|
|
Packit |
b89d10 |
|
|
Packit |
b89d10 |
|
|
Packit |
b89d10 |
backreference with recursion level
|
|
Packit |
b89d10 |
|
|
Packit |
b89d10 |
(n >= 1, level >= 0)
|
|
Packit |
b89d10 |
|
|
Packit |
b89d10 |
\k<n+level> \k'n+level'
|
|
Packit |
b89d10 |
\k<n-level> \k'n-level'
|
|
Packit |
b89d10 |
|
|
Packit |
b89d10 |
\k<name+level> \k'name+level'
|
|
Packit |
b89d10 |
\k<name-level> \k'name-level'
|
|
Packit |
b89d10 |
|
|
Packit |
b89d10 |
Designate a group on the recursion level relative to the referring position.
|
|
Packit |
b89d10 |
|
|
Packit |
b89d10 |
ex 1.
|
|
Packit |
b89d10 |
|
|
Packit |
b89d10 |
/\A(?<a>|.|(?:(?<b>.)\g<a>\k<b>))\z/.match("reee")
|
|
Packit |
b89d10 |
/\A(?<a>|.|(?:(?<b>.)\g<a>\k<b+0>))\z/.match("reer")
|
|
Packit |
b89d10 |
|
|
Packit |
b89d10 |
\k<b+0> refers to the (?<b>.) on the same recursion level with it.
|
|
Packit |
b89d10 |
|
|
Packit |
b89d10 |
ex 2.
|
|
Packit |
b89d10 |
|
|
Packit |
b89d10 |
r = Regexp.compile(<<'__REGEXP__'.strip, Regexp::EXTENDED)
|
|
Packit |
b89d10 |
(?<element> \g<stag> \g<content>* \g<etag> ){0}
|
|
Packit |
b89d10 |
(?<stag> < \g<name> \s* > ){0}
|
|
Packit |
b89d10 |
(?<name> [a-zA-Z_:]+ ){0}
|
|
Packit |
b89d10 |
(?<content> [^<&]+ (\g<element> | [^<&]+)* ){0}
|
|
Packit |
b89d10 |
(?<etag> </ \k<name+1> >){0}
|
|
Packit |
b89d10 |
\g<element>
|
|
Packit |
b89d10 |
__REGEXP__
|
|
Packit |
b89d10 |
|
|
Packit |
b89d10 |
p r.match("<foo>f<bar>bbb</bar>f</foo>").captures
|
|
Packit |
b89d10 |
|
|
Packit |
b89d10 |
|
|
Packit |
b89d10 |
9. Subexp calls ("Tanaka Akira special") (* original function)
|
|
Packit |
b89d10 |
|
|
Packit |
b89d10 |
When we say "call a group," it actually means, "re-execute the subexp in
|
|
Packit |
b89d10 |
that group."
|
|
Packit |
b89d10 |
|
|
Packit |
b89d10 |
\g<n> \g'n' (n >= 1) call the nth group
|
|
Packit |
b89d10 |
\g<0> \g'0' call zero (call the total regexp)
|
|
Packit |
b89d10 |
\g<-n> \g'-n' (n >= 1) call the nth group counting backwards from
|
|
Packit |
b89d10 |
the calling position
|
|
Packit |
b89d10 |
\g<+n> \g'+n' (n >= 1) call the nth group counting forwards from
|
|
Packit |
b89d10 |
the calling position
|
|
Packit |
b89d10 |
\g<name> \g'name' call the group with the specified name
|
|
Packit |
b89d10 |
|
|
Packit |
b89d10 |
* Left-most recursive calls are not allowed.
|
|
Packit |
b89d10 |
|
|
Packit |
b89d10 |
ex. (?<name>a|\g<name>b) => error
|
|
Packit |
b89d10 |
(?<name>a|b\g<name>c) => OK
|
|
Packit |
b89d10 |
|
|
Packit |
b89d10 |
* Calls with a name that is assigned to more than one group are not
|
|
Packit |
b89d10 |
allowed.
|
|
Packit |
b89d10 |
|
|
Packit |
b89d10 |
* Call by number is forbidden if any named group is defined and
|
|
Packit |
b89d10 |
ONIG_OPTION_CAPTURE_GROUP is not set.
|
|
Packit |
b89d10 |
|
|
Packit |
b89d10 |
* The option status of the called group is always effective.
|
|
Packit |
b89d10 |
|
|
Packit |
b89d10 |
ex. /(?-i:\g<name>)(?i:(?<name>a)){0}/.match("A")
|
|
Packit |
b89d10 |
|
|
Packit |
b89d10 |
|
|
Packit |
b89d10 |
10. Captured group
|
|
Packit |
b89d10 |
|
|
Packit |
b89d10 |
Behavior of an unnamed group (...) changes with the following conditions.
|
|
Packit |
b89d10 |
(But named group is not changed.)
|
|
Packit |
b89d10 |
|
|
Packit |
b89d10 |
case 1. /.../ (named group is not used, no option)
|
|
Packit |
b89d10 |
|
|
Packit |
b89d10 |
(...) is treated as a capturing group.
|
|
Packit |
b89d10 |
|
|
Packit |
b89d10 |
case 2. /.../g (named group is not used, 'g' option)
|
|
Packit |
b89d10 |
|
|
Packit |
b89d10 |
(...) is treated as a non-capturing group (?:...).
|
|
Packit |
b89d10 |
|
|
Packit |
b89d10 |
case 3. /..(?<name>..)../ (named group is used, no option)
|
|
Packit |
b89d10 |
|
|
Packit |
b89d10 |
(...) is treated as a non-capturing group.
|
|
Packit |
b89d10 |
numbered-backref/call is not allowed.
|
|
Packit |
b89d10 |
|
|
Packit |
b89d10 |
case 4. /..(?<name>..)../G (named group is used, 'G' option)
|
|
Packit |
b89d10 |
|
|
Packit |
b89d10 |
(...) is treated as a capturing group.
|
|
Packit |
b89d10 |
numbered-backref/call is allowed.
|
|
Packit |
b89d10 |
|
|
Packit |
b89d10 |
where
|
|
Packit |
b89d10 |
g: ONIG_OPTION_DONT_CAPTURE_GROUP
|
|
Packit |
b89d10 |
G: ONIG_OPTION_CAPTURE_GROUP
|
|
Packit |
b89d10 |
|
|
Packit |
b89d10 |
('g' and 'G' options were discussed on the ruby-dev ML)
|
|
Packit |
b89d10 |
|
|
Packit |
b89d10 |
|
|
Packit |
b89d10 |
|
|
Packit |
b89d10 |
-----------------------------
|
|
Packit |
b89d10 |
A-1. Syntax-dependent options
|
|
Packit |
b89d10 |
|
|
Packit |
b89d10 |
+ ONIG_SYNTAX_ONIGURUMA
|
|
Packit |
b89d10 |
(?m): dot (.) also matches newline
|
|
Packit |
b89d10 |
|
|
Packit |
b89d10 |
+ ONIG_SYNTAX_PERL and ONIG_SYNTAX_JAVA
|
|
Packit |
b89d10 |
(?s): dot (.) also matches newline
|
|
Packit |
b89d10 |
(?m): ^ matches after newline, $ matches before newline
|
|
Packit |
b89d10 |
|
|
Packit |
b89d10 |
|
|
Packit |
b89d10 |
A-2. Original extensions
|
|
Packit |
b89d10 |
|
|
Packit |
b89d10 |
+ hexadecimal digit char type \h, \H
|
|
Packit |
b89d10 |
+ named group (?<name>...), (?'name'...)
|
|
Packit |
b89d10 |
+ named backref \k<name>
|
|
Packit |
b89d10 |
+ subexp call \g<name>, \g<group-num>
|
|
Packit |
b89d10 |
|
|
Packit |
b89d10 |
|
|
Packit |
b89d10 |
A-3. Missing features compared with perl 5.8.0
|
|
Packit |
b89d10 |
|
|
Packit |
b89d10 |
+ \N{name}
|
|
Packit |
b89d10 |
+ \l,\u,\L,\U,\C
|
|
Packit |
b89d10 |
+ (?{code})
|
|
Packit |
b89d10 |
+ (??{code})
|
|
Packit |
b89d10 |
|
|
Packit |
b89d10 |
* \Q...\E
|
|
Packit |
b89d10 |
This is effective on ONIG_SYNTAX_PERL and ONIG_SYNTAX_JAVA.
|
|
Packit |
b89d10 |
|
|
Packit |
b89d10 |
|
|
Packit |
b89d10 |
A-4. Differences with Japanized GNU regex(version 0.12) of Ruby 1.8
|
|
Packit |
b89d10 |
|
|
Packit |
b89d10 |
+ add character property (\p{property}, \P{property})
|
|
Packit |
b89d10 |
+ add hexadecimal digit char type (\h, \H)
|
|
Packit |
b89d10 |
+ add look-behind
|
|
Packit |
b89d10 |
(?<=fixed-width-pattern), (?<!fixed-width-pattern)
|
|
Packit |
b89d10 |
+ add possessive quantifier. ?+, *+, ++
|
|
Packit |
b89d10 |
+ add operations in character class. [], &&
|
|
Packit |
b89d10 |
('[' must be escaped to be used as a normal char in character class.)
|
|
Packit |
b89d10 |
+ add named group and subexp call.
|
|
Packit |
b89d10 |
+ octal or hexadecimal number sequence can be treated as
|
|
Packit |
b89d10 |
a multibyte code char in character class if multibyte encoding
|
|
Packit |
b89d10 |
is specified.
|
|
Packit |
b89d10 |
(ex. [\xa1\xa2], [\xa1\xa7-\xa4\xa1])
|
|
Packit |
b89d10 |
+ allow the range of single byte char and multibyte char in character
|
|
Packit |
b89d10 |
class.
|
|
Packit |
b89d10 |
ex. /[a-<<any EUC-JP character>>]/ in EUC-JP encoding.
|
|
Packit |
b89d10 |
+ effect range of isolated option is to next ')'.
|
|
Packit |
b89d10 |
ex. (?:(?i)a|b) is interpreted as (?:(?i:a|b)), not (?:(?i:a)|b).
|
|
Packit |
b89d10 |
+ isolated option is not transparent to previous pattern.
|
|
Packit |
b89d10 |
ex. a(?i)* is a syntax error pattern.
|
|
Packit |
b89d10 |
+ allowed unpaired left brace as a normal character.
|
|
Packit |
b89d10 |
ex. /{/, /({)/, /a{2,3/ etc...
|
|
Packit |
b89d10 |
+ negative POSIX bracket [:^xxxx:] is supported.
|
|
Packit |
b89d10 |
+ POSIX bracket [:ascii:] is added.
|
|
Packit |
b89d10 |
+ repeat of look-ahead is not allowed.
|
|
Packit |
b89d10 |
ex. /(?=a)*/, /(?!b){5}/
|
|
Packit |
b89d10 |
+ Ignore case option is effective to escape sequence.
|
|
Packit |
b89d10 |
ex. /\x61/i =~ "A"
|
|
Packit |
b89d10 |
+ In the range quantifier, the number of the minimum is optional.
|
|
Packit |
b89d10 |
/a{,n}/ == /a{0,n}/
|
|
Packit |
b89d10 |
The omission of both minimum and maximum values is not allowed.
|
|
Packit |
b89d10 |
/a{,}/
|
|
Packit |
b89d10 |
+ /{n}?/ is not a reluctant quantifier.
|
|
Packit |
b89d10 |
/a{n}?/ == /(?:a{n})?/
|
|
Packit |
b89d10 |
+ invalid back reference is checked and raises error.
|
|
Packit |
b89d10 |
/\1/, /(a)\2/
|
|
Packit |
b89d10 |
+ Zero-width match in an infinite loop stops the repeat,
|
|
Packit |
b89d10 |
then changes of the capture group status are checked as stop condition.
|
|
Packit |
b89d10 |
/(?:()|())*\1\2/ =~ ""
|
|
Packit |
b89d10 |
/(?:\1a|())*/ =~ "a"
|
|
Packit |
b89d10 |
|
|
Packit |
b89d10 |
|
|
Packit |
b89d10 |
A-5. Features disabled in default syntax
|
|
Packit |
b89d10 |
|
|
Packit |
b89d10 |
+ capture history
|
|
Packit |
b89d10 |
|
|
Packit |
b89d10 |
(?@...) and (?@<name>...)
|
|
Packit |
b89d10 |
|
|
Packit |
b89d10 |
ex. /(?@a)*/.match("aaa") ==> [<0-1>, <1-2>, <2-3>]
|
|
Packit |
b89d10 |
|
|
Packit |
b89d10 |
see sample/listcap.c file.
|
|
Packit |
b89d10 |
|
|
Packit |
b89d10 |
|
|
Packit |
b89d10 |
A-6. Problems
|
|
Packit |
b89d10 |
|
|
Packit |
b89d10 |
+ Invalid encoding byte sequence is not checked.
|
|
Packit |
b89d10 |
|
|
Packit |
b89d10 |
ex. UTF-8
|
|
Packit |
b89d10 |
|
|
Packit |
b89d10 |
* Invalid first byte is treated as a character.
|
|
Packit |
b89d10 |
/./u =~ "\xa3"
|
|
Packit |
b89d10 |
|
|
Packit |
b89d10 |
* Incomplete byte sequence is not checked.
|
|
Packit |
b89d10 |
/\w+/ =~ "a\xf3\x8ec"
|
|
Packit |
b89d10 |
|
|
Packit |
b89d10 |
// END
|