#! /bin/sh
# Test whether \s matches SP and UTF-8 multi-byte white space characters.
#
# Copyright (C) 2013-2017 Free Software Foundation, Inc.
#
# Copying and distribution of this file, with or without modification,
# are permitted in any medium without royalty provided the copyright
# notice and this notice are preserved.

# Load the shared test harness; srcdir defaults to "." when it is unset.
# path_prepend_ comes from init.sh -- presumably it puts ../src first in
# PATH so the grep under test is the one that gets run (confirm in init.sh).
. "${srcdir=.}/init.sh"; path_prepend_ ../src

# Harness helper -- presumably skips the test when no en_US.UTF-8 locale
# is available (confirm in init.sh).
require_en_utf8_locale_

# Everything below runs in the UTF-8 locale.
LC_ALL=en_US.UTF-8
export LC_ALL

# It would have been nice to be able to use all UTF8 characters
# with the Unicode WSpace=Y character property,
# https://en.wikipedia.org/wiki/Whitespace_character, but that
# would currently cause distracting failures everywhere I've tried.
# Instead, I've listed each with an indicator column, telling what
# this test should do if the system's locale/tools produce the
# wrong answer.

# The values in that column:
#  X  required on all systems (fail if \s or \S fail to work as expected)
#  x  required on "modern enough" systems
#  O  optional: \s or \S misbehavior elicits a warning, but never failure

# Build a whitespace-separated list with one word per table row.  The sed
# script first deletes everything up to and including the colon and the
# blanks after it, then turns each remaining run of one or more blanks into
# a literal "\x".  A row such as "...: O  c2 a0" thus becomes "O\xc2\xa0":
# the indicator letter followed by the character's bytes as \xHH escapes.
# The run-of-blanks pattern is spelled "[ ][ ]*" so that it can never match
# the empty string, even if the spacing in this file gets normalised.
# NOTE(review): U+0085 is listed as the single byte 85, whereas its UTF-8
# encoding is c2 85 -- confirm whether the lone byte is intentional.
utf8_space_characters=$(sed 's/.*: *//;s/[ ][ ]*/\\x/g' <<\EOF
U+0009 Horizontal Tab:            X  09
U+000A Line feed:                 O  0a
U+000B Vertical Tab:              X  0b
U+000C Form feed:                 X  0c
U+000D Carriage return:           X  0d
U+0020 SPACE:                     X  20
U+0085 Next line:                 O  85
U+00A0 NO-BREAK SPACE:            O  c2 a0
U+1680 OGHAM SPACE MARK:          x  e1 9a 80
U+2000 EN QUAD:                   x  e2 80 80
U+2001 EM QUAD:                   x  e2 80 81
U+2002 EN SPACE:                  x  e2 80 82
U+2003 EM SPACE:                  x  e2 80 83
U+2004 THREE-PER-EM SPACE:        x  e2 80 84
U+2005 FOUR-PER-EM SPACE:         x  e2 80 85
U+2006 SIX-PER-EM SPACE:          x  e2 80 86
U+2007 FIGURE SPACE:              O  e2 80 87
U+2008 PUNCTUATION SPACE:         x  e2 80 88
U+2009 THIN SPACE:                x  e2 80 89
U+200A HAIR SPACE:                x  e2 80 8a
U+200B ZERO WIDTH SPACE:          O  e2 80 8b
U+202F NARROW NO-BREAK SPACE:     O  e2 80 af
U+205F MEDIUM MATHEMATICAL SPACE: x  e2 81 9f
U+3000 IDEOGRAPHIC SPACE:         x  e3 80 80
EOF
)

# Overall test status; set to 1 when a required check fails.
fail=0

# On systems that are not "modern enough," simply warn when an "x"-marked
# character is not classified as white space.  Too many systems
# have inadequate UTF-8 tables in this respect, and that lack should not
# discourage/confuse those who consider whether to install grep.

# As for what constitutes "modern enough", I've arbitrarily started
# with "Fedora 20 or newer".  Tested additions welcome.
# A missing or unreadable /etc/redhat-release just leaves the flag at 0;
# grep's output and diagnostics are discarded.
modern_enough=0
grep -iE 'fedora release [2-9][0-9]+\b' /etc/redhat-release >/dev/null 2>&1 \
  && modern_enough=1

# Check every table entry against both \s and \S.  Each word of
# $utf8_space_characters is an indicator letter (X, x or O) followed by
# the character's bytes written as \xHH escapes; the unquoted expansion is
# deliberate, since the list is split on white space.
for i in $utf8_space_characters; do
  # Default policy: a misclassification is an error that fails the test.
  eval 'fail() { fail=1; }'
  m=ERROR
  case $i in
    X*) ;;
    # "x" entries are only enforced on "modern enough" systems; elsewhere
    # fail() becomes a no-op and the diagnostic is labelled a warning.
    x*) test "$modern_enough" = 1 || { eval 'fail() { :; }'; m=warning; } ;;
    # "O" entries never fail the test.
    O*) m=warning; eval 'fail() { :; }' ;;
    *) warn_ "unexpected prefix: $i"; exit 1 ;;
  esac

  # Strip the prefix byte.
  i=${i#?}

  # hex_printf_, returns_ and warn_ come from init.sh.  Presumably
  # hex_printf_ emits the bytes named by the \xHH escapes and "returns_ 1"
  # succeeds only when the command exits with status 1 (confirm in init.sh).
  # The "|| { ... }" group runs in the current shell, so fail() can update
  # the script-level $fail.
  hex_printf_ "$i" | grep -q '^\s$' \
    || { warn_ " $m: \\s failed to match $i in the $LC_ALL locale"; fail; }
  hex_printf_ "$i" | returns_ 1 grep -q '\S' \
    || { warn_ " $m: \\S mistakenly matched $i in the $LC_ALL locale"; fail; }
done


# This is a separate test, only nominally related to \s.
# It is solely to get coverage of a code path (exercising dfa.c's
# match_mb_charset function) that would have otherwise been untouched.
# However, as of the change-set adding this new test, match_mb_charset
# is unreachable via grep.
# A lone NUL byte must not match '^\s?$': grep is expected to exit with
# status 1 and to print nothing on stdout or stderr (both go to "out").
printf '\0' | returns_ 1 grep -aE '^\s?$' > out 2>&1 || fail=1
# Record a failure if "out" is not empty.  This assumes compare (from
# init.sh) returns non-zero on a mismatch -- confirm in init.sh.
compare /dev/null out || fail=1

# Exit (capital E) is the harness's exit wrapper from init.sh.
Exit $fail