# This set of tests is for UTF-8 support and Unicode property support, with
# relevance only for the 8-bit library.

# The next 4 patterns have UTF-8 errors

/[Ã]/utf

/Ã/utf

/ÃÃÃxxx/utf

/‚‚‚‚‚‚‚Ã/utf

# Now test subjects

/badutf/utf
\= Expect UTF-8 errors
    X\xdf
    XX\xef
    XXX\xef\x80
    X\xf7
    XX\xf7\x80
    XXX\xf7\x80\x80
    \xfb
    \xfb\x80
    \xfb\x80\x80
    \xfb\x80\x80\x80
    \xfd
    \xfd\x80
    \xfd\x80\x80
    \xfd\x80\x80\x80
    \xfd\x80\x80\x80\x80
    \xdf\x7f
    \xef\x7f\x80
    \xef\x80\x7f
    \xf7\x7f\x80\x80
    \xf7\x80\x7f\x80
    \xf7\x80\x80\x7f
    \xfb\x7f\x80\x80\x80
    \xfb\x80\x7f\x80\x80
    \xfb\x80\x80\x7f\x80
    \xfb\x80\x80\x80\x7f
    \xfd\x7f\x80\x80\x80\x80
    \xfd\x80\x7f\x80\x80\x80
    \xfd\x80\x80\x7f\x80\x80
    \xfd\x80\x80\x80\x7f\x80
    \xfd\x80\x80\x80\x80\x7f
    \xed\xa0\x80
    \xc0\x8f
    \xe0\x80\x8f
    \xf0\x80\x80\x8f
    \xf8\x80\x80\x80\x8f
    \xfc\x80\x80\x80\x80\x8f
    \x80
    \xfe
    \xff

/badutf/utf
\= Expect UTF-8 errors
    XX\xfb\x80\x80\x80\x80
    XX\xfd\x80\x80\x80\x80\x80
    XX\xf7\xbf\xbf\xbf

/shortutf/utf
\= Expect UTF-8 errors
    XX\xdf\=ph
    XX\xef\=ph
    XX\xef\x80\=ph
    \xf7\=ph
    \xf7\x80\=ph
    \xf7\x80\x80\=ph
    \xfb\=ph
    \xfb\x80\=ph
    \xfb\x80\x80\=ph
    \xfb\x80\x80\x80\=ph
    \xfd\=ph
    \xfd\x80\=ph
    \xfd\x80\x80\=ph
    \xfd\x80\x80\x80\=ph
    \xfd\x80\x80\x80\x80\=ph

/anything/utf
\= Expect UTF-8 errors
    X\xc0\x80
    XX\xc1\x8f
    XXX\xe0\x9f\x80
    \xf0\x8f\x80\x80
    \xf8\x87\x80\x80\x80
    \xfc\x83\x80\x80\x80\x80
    \xfe\x80\x80\x80\x80\x80
    \xff\x80\x80\x80\x80\x80
    \xf8\x88\x80\x80\x80
    \xf9\x87\x80\x80\x80
    \xfc\x84\x80\x80\x80\x80
    \xfd\x83\x80\x80\x80\x80
\= Expect no match
    \xc3\x8f
    \xe0\xaf\x80
    \xe1\x80\x80
    \xf0\x9f\x80\x80
    \xf1\x8f\x80\x80
    \xf8\x88\x80\x80\x80\=no_utf_check
    \xf9\x87\x80\x80\x80\=no_utf_check
    \xfc\x84\x80\x80\x80\x80\=no_utf_check
    \xfd\x83\x80\x80\x80\x80\=no_utf_check
    
# Similar tests with offsets

/badutf/utf
\= Expect UTF-8 errors
    X\xdfabcd
    X\xdfabcd\=offset=1
\= Expect no match
    X\xdfabcd\=offset=2

/(?<=x)badutf/utf
\= Expect UTF-8 errors
    X\xdfabcd
    X\xdfabcd\=offset=1
    X\xdfabcd\=offset=2
    X\xdfabcd\xdf\=offset=3
\= Expect no match
    X\xdfabcd\=offset=3

/(?<=xx)badutf/utf
\= Expect UTF-8 errors
    X\xdfabcd
    X\xdfabcd\=offset=1
    X\xdfabcd\=offset=2
    X\xdfabcd\=offset=3

/(?<=xxxx)badutf/utf
\= Expect UTF-8 errors
    X\xdfabcd
    X\xdfabcd\=offset=1
    X\xdfabcd\=offset=2
    X\xdfabcd\=offset=3
    X\xdfabc\xdf\=offset=6
    X\xdfabc\xdf\=offset=7
\= Expect no match
    X\xdfabcd\=offset=6
 
/\x{100}/IB,utf

/\x{1000}/IB,utf

/\x{10000}/IB,utf

/\x{100000}/IB,utf

/\x{10ffff}/IB,utf

/[\x{ff}]/IB,utf

/[\x{100}]/IB,utf

/\x80/IB,utf

/\xff/IB,utf

/\x{D55c}\x{ad6d}\x{C5B4}/IB,utf
    \x{D55c}\x{ad6d}\x{C5B4}

/\x{65e5}\x{672c}\x{8a9e}/IB,utf
    \x{65e5}\x{672c}\x{8a9e}

/\x{80}/IB,utf

/\x{084}/IB,utf

/\x{104}/IB,utf

/\x{861}/IB,utf

/\x{212ab}/IB,utf

/[^ab\xC0-\xF0]/IB,utf
    \x{f1}
    \x{bf}
    \x{100}
    \x{1000}
\= Expect no match
    \x{c0}
    \x{f0}

/Ā{3,4}/IB,utf
  \x{100}\x{100}\x{100}\x{100\x{100}

/(\x{100}+|x)/IB,utf

/(\x{100}*a|x)/IB,utf

/(\x{100}{0,2}a|x)/IB,utf

/(\x{100}{1,2}a|x)/IB,utf

/\x{100}/IB,utf

/a\x{100}\x{101}*/IB,utf

/a\x{100}\x{101}+/IB,utf

/[^\x{c4}]/IB

/[\x{100}]/IB,utf
    \x{100}
    Z\x{100}
    \x{100}Z

/[\xff]/IB,utf
    >\x{ff}<

/[^\xff]/IB,utf

/\x{100}abc(xyz(?1))/IB,utf

/\777/I,utf
  \x{1ff}
  \777

/\x{100}+\x{200}/IB,utf

/\x{100}+X/IB,utf

/^[\QĀ\E-\QŐ\E/B,utf

# This tests the stricter UTF-8 check according to RFC 3629.

/X/utf
\= Expect UTF-8 errors
    \x{d800}
    \x{da00}
    \x{dfff}
    \x{110000}
    \x{2000000}
    \x{7fffffff}
\= Expect no match
    \x{d800}\=no_utf_check
    \x{da00}\=no_utf_check
    \x{dfff}\=no_utf_check
    \x{110000}\=no_utf_check
    \x{2000000}\=no_utf_check
    \x{7fffffff}\=no_utf_check

/(*UTF8)\x{1234}/
    abcd\x{1234}pqr

/(*CRLF)(*UTF)(*BSR_UNICODE)a\Rb/I

/\h/I,utf
    ABC\x{09}
    ABC\x{20}
    ABC\x{a0}
    ABC\x{1680}
    ABC\x{180e}
    ABC\x{2000}
    ABC\x{202f}
    ABC\x{205f}
    ABC\x{3000}

/\v/I,utf
    ABC\x{0a}
    ABC\x{0b}
    ABC\x{0c}
    ABC\x{0d}
    ABC\x{85}
    ABC\x{2028}

/\h*A/I,utf
    CDBABC

/\v+A/I,utf

/\s?xxx\s/I,utf

/\sxxx\s/I,utf,tables=2
    AB\x{85}xxx\x{a0}XYZ
    AB\x{a0}xxx\x{85}XYZ

/\S \S/I,utf,tables=2
    \x{a2} \x{84}
    A Z

/a+/utf
    a\x{123}aa\=offset=1
    a\x{123}aa\=offset=3
    a\x{123}aa\=offset=4
\= Expect bad offset value
    a\x{123}aa\=offset=6
\= Expect bad UTF-8 offset     
    a\x{123}aa\=offset=2
\= Expect no match
    a\x{123}aa\=offset=5

/\x{1234}+/Ii,utf

/\x{1234}+?/Ii,utf

/\x{1234}++/Ii,utf

/\x{1234}{2}/Ii,utf

/[^\x{c4}]/IB,utf

/X+\x{200}/IB,utf

/\R/I,utf

/\777/IB,utf

/\w+\x{C4}/B,utf
    a\x{C4}\x{C4}

/\w+\x{C4}/B,utf,tables=2
    a\x{C4}\x{C4}

/\W+\x{C4}/B,utf
    !\x{C4}

/\W+\x{C4}/B,utf,tables=2
    !\x{C4}

/\W+\x{A1}/B,utf
    !\x{A1}

/\W+\x{A1}/B,utf,tables=2
    !\x{A1}

/X\s+\x{A0}/B,utf
    X\x20\x{A0}\x{A0}

/X\s+\x{A0}/B,utf,tables=2
    X\x20\x{A0}\x{A0}

/\S+\x{A0}/B,utf
    X\x{A0}\x{A0}

/\S+\x{A0}/B,utf,tables=2
    X\x{A0}\x{A0}

/\x{a0}+\s!/B,utf
    \x{a0}\x20!

/\x{a0}+\s!/B,utf,tables=2
    \x{a0}\x20!

/A/utf
  \x{ff000041}
  \x{7f000041}

/(*UTF8)abc/never_utf

/abc/utf,never_utf

/A\x{391}\x{10427}\x{ff3a}\x{1fb0}/IBi,utf

/A\x{391}\x{10427}\x{ff3a}\x{1fb0}/IB,utf

/AB\x{1fb0}/IB,utf

/AB\x{1fb0}/IBi,utf

/\x{401}\x{420}\x{421}\x{422}\x{423}\x{424}\x{425}\x{426}\x{427}\x{428}\x{429}\x{42a}\x{42b}\x{42c}\x{42d}\x{42e}\x{42f}/Ii,utf
    \x{401}\x{420}\x{421}\x{422}\x{423}\x{424}\x{425}\x{426}\x{427}\x{428}\x{429}\x{42a}\x{42b}\x{42c}\x{42d}\x{42e}\x{42f}
    \x{451}\x{440}\x{441}\x{442}\x{443}\x{444}\x{445}\x{446}\x{447}\x{448}\x{449}\x{44a}\x{44b}\x{44c}\x{44d}\x{44e}\x{44f}

/[â±¥]/Bi,utf

/[^â±¥]/Bi,utf

/\h/I

/\v/I

/\R/I

/[[:blank:]]/B,ucp

/\x{212a}+/Ii,utf
    KKkk\x{212a}

/s+/Ii,utf
    SSss\x{17f}

/\x{100}*A/IB,utf
    A

/\x{100}*\d(?R)/IB,utf

/[Z\x{100}]/IB,utf
    Z\x{100}
    \x{100}
    \x{100}Z

/[z-\x{100}]/IB,utf

/[z\Qa-d]Ā\E]/IB,utf
    \x{100}
    Ā 

/[ab\x{100}]abc(xyz(?1))/IB,utf

/\x{100}*\s/IB,utf

/\x{100}*\d/IB,utf

/\x{100}*\w/IB,utf

/\x{100}*\D/IB,utf

/\x{100}*\S/IB,utf

/\x{100}*\W/IB,utf

/[\x{105}-\x{109}]/IBi,utf
    \x{104}
    \x{105}
    \x{109}  
\= Expect no match
    \x{100}
    \x{10a} 
    
/[z-\x{100}]/IBi,utf
    Z
    z
    \x{39c}
    \x{178}
    |
    \x{80}
    \x{ff}
    \x{100}
    \x{101} 
\= Expect no match
    \x{102}
    Y
    y           

/[z-\x{100}]/IBi,utf

/\x{3a3}B/IBi,utf

/abc/utf,replace=Ã
    abc

/(?<=(a)(?-1))x/I,utf
    a\x80zx\=offset=3

/[\W\p{Any}]/B
    abc
    123 

/[\W\pL]/B
    abc
\= Expect no match
    123     

/(*:*++++++++++++''''''''''''''''''''+''+++'+++x+++++++++++++++++++++++++++++++++++(++++++++++++++++++++:++++++%++:''''''''''''''''''''''''+++++++++++++++++++++++++++++++++++++++++++++++++++++-++++++++k+++++++''''+++'+++++++++++++++++++++++''''++++++++++++':Æ¿)/utf

/[\s[:^ascii:]]/B,ucp

# A special extra option allows escaped surrogate code points in 8-bit mode,
# but subjects containing them must not be UTF-checked.

/\x{d800}/I,utf,allow_surrogate_escapes
    \x{d800}\=no_utf_check

/\udfff\o{157401}/utf,alt_bsux,allow_surrogate_escapes
    \x{dfff}\x{df01}\=no_utf_check
    
# This has different starting code units in 8-bit mode. 

/^[^ab]/IB,utf
    c
    \x{ff}
    \x{100}
\= Expect no match
    aaa

# End of testinput10