From 1d01107caefb021c224fbc5e1e6067dab8d8db2e Mon Sep 17 00:00:00 2001 From: Packit Service Date: Dec 10 2020 11:40:49 +0000 Subject: Apply patch diffutils-i18n.patch patch_name: diffutils-i18n.patch present_in_specfile: true --- diff --git a/src/diff.c b/src/diff.c index df3338c..60c6ce7 100644 --- a/src/diff.c +++ b/src/diff.c @@ -76,6 +76,8 @@ static void try_help (char const *, char const *) __attribute__((noreturn)); static void check_stdout (void); static void usage (void); +bool (*lines_differ) (char const *, size_t, char const *, size_t); + /* If comparing directories, compare their common subdirectories recursively. */ static bool recursive; @@ -298,6 +300,13 @@ main (int argc, char **argv) excluded = new_exclude (); presume_output_tty = false; +#ifdef HANDLE_MULTIBYTE + if (MB_CUR_MAX > 1) + lines_differ = lines_differ_multibyte; + else +#endif + lines_differ = lines_differ_singlebyte; + /* Decode the options. */ while ((c = getopt_long (argc, argv, shortopts, longopts, NULL)) != -1) diff --git a/src/diff.h b/src/diff.h index c8cf436..a438a8e 100644 --- a/src/diff.h +++ b/src/diff.h @@ -23,6 +23,17 @@ #include #include +/* For platforms which support the ISO C ammendment 1 functionality we + support user-defined character classes. */ +#if defined HAVE_WCTYPE_H && defined HAVE_WCHAR_H +/* Solaris 2.5 has a bug: must be included before . */ +# include +# include +# if defined (HAVE_MBRTOWC) +# define HANDLE_MULTIBYTE 1 +# endif +#endif + /* What kind of changes a hunk contains. */ enum changes { @@ -381,7 +392,11 @@ extern void print_sdiff_script (struct change *); extern char const change_letter[4]; extern char const pr_program[]; extern char *concat (char const *, char const *, char const *); -extern bool lines_differ (char const *, char const *) _GL_ATTRIBUTE_PURE; +extern bool (*lines_differ) (char const *, size_t, char const *, size_t) _GL_ATTRIBUTE_PURE; +extern bool lines_differ_singlebyte (char const *, size_t, char const *, size_t) _GL_ATTRIBUTE_PURE; +#ifdef HANDLE_MULTIBYTE +extern bool lines_differ_multibyte (char const *, size_t, char const *, size_t) _GL_ATTRIBUTE_PURE; +#endif extern lin translate_line_number (struct file_data const *, lin); extern struct change *find_change (struct change *); extern struct change *find_reverse_change (struct change *); diff --git a/src/io.c b/src/io.c index b4ef5dc..fce16ab 100644 --- a/src/io.c +++ b/src/io.c @@ -23,6 +23,7 @@ #include #include #include +#include /* Rotate an unsigned value to the left. */ #define ROL(v, n) ((v) << (n) | (v) >> (sizeof (v) * CHAR_BIT - (n))) @@ -215,6 +216,28 @@ slurp (struct file_data *current) /* Split the file into lines, simultaneously computing the equivalence class for each line. */ +#ifdef HANDLE_MULTIBYTE +# define MBC2WC(P, END, MBLENGTH, WC, STATE, CONVFAIL) \ +do \ + { \ + mbstate_t state_bak = STATE; \ + \ + CONVFAIL = 0; \ + MBLENGTH = mbrtowc (&WC, P, END - (char const *)P, &STATE); \ + \ + switch (MBLENGTH) \ + { \ + case (size_t)-2: \ + case (size_t)-1: \ + STATE = state_bak; \ + ++CONVFAIL; \ + /* Fall through. */ \ + case 0: \ + MBLENGTH = 1; \ + } \ + } \ + while (0) +#endif static void find_and_hash_each_line (struct file_data *current) @@ -241,12 +264,300 @@ find_and_hash_each_line (struct file_data *current) bool same_length_diff_contents_compare_anyway = diff_length_compare_anyway | ig_case; +#ifdef HANDLE_MULTIBYTE + wchar_t wc; + size_t mblength; + mbstate_t state; + int convfail; + + memset (&state, '\0', sizeof (mbstate_t)); +#endif + while (p < suffix_begin) { char const *ip = p; hash_value h = 0; unsigned char c; +#ifdef HANDLE_MULTIBYTE + if (MB_CUR_MAX > 1) + { + wchar_t lo_wc; + char mbc[MB_LEN_MAX]; + mbstate_t state_wc; + + /* Hash this line until we find a newline. */ + switch (ig_white_space) + { + case IGNORE_ALL_SPACE: + while (1) + { + if (*p == '\n') + { + ++p; + break; + } + + MBC2WC (p, suffix_begin, mblength, wc, state, convfail); + + if (convfail) + mbc[0] = *p++; + else if (!iswspace (wc)) + { + bool flag = 0; + + if (ig_case) + { + lo_wc = towlower (wc); + if (lo_wc != wc) + { + flag = 1; + + p += mblength; + memset (&state_wc, '\0', sizeof(mbstate_t)); + mblength = wcrtomb (mbc, lo_wc, &state_wc); + + assert (mblength != (size_t)-1 && + mblength != (size_t)-2); + + mblength = (mblength < 1) ? 1 : mblength; + } + } + + if (!flag) + { + for (i = 0; i < mblength; i++) + mbc[i] = *p++; + } + } + else + { + p += mblength; + continue; + } + + for (i = 0; i < mblength; i++) + h = HASH (h, mbc[i]); + } + break; + + case IGNORE_SPACE_CHANGE: + while (1) + { + if (*p == '\n') + { + ++p; + break; + } + + MBC2WC (p, suffix_begin, mblength, wc, state, convfail); + + if (!convfail && iswspace (wc)) + { + while (1) + { + if (*p == '\n') + { + ++p; + goto hashing_done; + } + + p += mblength; + MBC2WC (p, suffix_begin, mblength, wc, state, convfail); + if (convfail || !iswspace (wc)) + break; + } + h = HASH (h, ' '); + } + + /* WC is now the first non-space. */ + if (convfail) + mbc[0] = *p++; + else + { + bool flag = 0; + + if (ignore_case) + { + lo_wc = towlower (wc); + if (lo_wc != wc) + { + flag = 1; + + p += mblength; + memset (&state_wc, '\0', sizeof(mbstate_t)); + mblength = wcrtomb (mbc, lo_wc, &state_wc); + + assert (mblength != (size_t)-1 && + mblength != (size_t)-2); + + mblength = (mblength < 1) ? 1 : mblength; + } + } + + if (!flag) + { + for (i = 0; i < mblength; i++) + mbc[i] = *p++; + } + } + + for (i = 0; i < mblength; i++) + h = HASH (h, mbc[i]); + } + break; + + case IGNORE_TAB_EXPANSION: + case IGNORE_TAB_EXPANSION_AND_TRAILING_SPACE: + case IGNORE_TRAILING_SPACE: + { + size_t column = 0; + while (1) + { + if (*p == '\n') + { + ++p; + break; + } + + MBC2WC (p, suffix_begin, mblength, wc, state, convfail); + + if (!convfail + && ig_white_space & IGNORE_TRAILING_SPACE + && iswspace (wc)) + { + char const *p1 = p; + while (1) + { + if (*p1 == '\n') + { + p = p1 + 1; + goto hashing_done; + } + + p1 += mblength; + MBC2WC (p1, suffix_begin, mblength, wc, state, convfail); + if (convfail || !iswspace (wc)) + break; + } + } + + size_t repetitions = 1; + bool no_convert = 0; + + if (ig_white_space & IGNORE_TAB_EXPANSION) + { + if (convfail) + column++; + else + switch (wc) + { + case L'\b': + column -= 0 < column; + break; + + case L'\t': + mbc[0] = ' '; + mblength = 1; + no_convert = 1; + p++; + assert(mblength == 1); + repetitions = tabsize - column % tabsize; + column = (column + repetitions < column + ? 0 + : column + repetitions); + break; + + case L'\r': + column = 0; + break; + + default: + column += wcwidth (wc); + break; + } + } + + if (ig_case) + { + lo_wc = towlower (wc); + if (lo_wc != wc) + { + no_convert = 1; + p += mblength; + memset (&state_wc, '\0', sizeof(mbstate_t)); + mblength = wcrtomb (mbc, lo_wc, &state_wc); + + assert (mblength != (size_t)-1 && + mblength != (size_t)-2); + + mblength = (mblength < 1) ? 1 : mblength; + } + } + + if (!no_convert) + for (i = 0; i < mblength; i++) + mbc[i] = *p++; + + do + { + for (i = 0; i < mblength; i++) + h = HASH (h, mbc[i]); + } + while (--repetitions != 0); + } + } + break; + + default: + while (1) + { + if (*p == '\n') + { + ++p; + break; + } + + MBC2WC (p, suffix_begin, mblength, wc, state, convfail); + + if (convfail) + mbc[0] = *p++; + else + { + int flag = 0; + + if (ig_case) + { + lo_wc = towlower (wc); + if (lo_wc != wc) + { + flag = 1; + p += mblength; + memset (&state_wc, '\0', sizeof(mbstate_t)); + mblength = wcrtomb (mbc, lo_wc, &state_wc); + + assert (mblength != (size_t)-1 && + mblength != (size_t)-2); + + mblength = (mblength < 1) ? 1 : mblength; + } + } + + if (!flag) + { + for (i = 0; i < mblength; i++) + mbc[i] = *p++; + } + } + + for (i = 0; i < mblength; i++) + h = HASH (h, mbc[i]); + } + } + } + else +#endif + /* Hash this line until we find a newline. */ switch (ig_white_space) { @@ -397,7 +708,7 @@ find_and_hash_each_line (struct file_data *current) else if (!diff_length_compare_anyway) continue; - if (! lines_differ (eqline, ip)) + if (! lines_differ (eqline, eqs[i].length + 1, ip, length + 1)) break; } diff --git a/src/util.c b/src/util.c index 88955da..3de19f8 100644 --- a/src/util.c +++ b/src/util.c @@ -985,7 +985,8 @@ finish_output (void) Return nonzero if the lines differ. */ bool -lines_differ (char const *s1, char const *s2) +lines_differ_singlebyte (char const *s1, size_t s1len, + char const *s2, size_t s2len) { register char const *t1 = s1; register char const *t2 = s2; @@ -1141,6 +1142,354 @@ lines_differ (char const *s1, char const *s2) return true; } + +#ifdef HANDLE_MULTIBYTE +# define MBC2WC(T, END, MBLENGTH, WC, STATE, CONVFAIL) \ +do \ + { \ + mbstate_t bak = STATE; \ + \ + CONVFAIL = 0; \ + MBLENGTH = mbrtowc (&WC, T, END - T, &STATE); \ + \ + switch (MBLENGTH) \ + { \ + case (size_t)-2: \ + case (size_t)-1: \ + STATE = bak; \ + ++CONVFAIL; \ + /* Fall through. */ \ + case 0: \ + MBLENGTH = 1; \ + } \ + } \ + while (0) + +bool +lines_differ_multibyte (char const *s1, size_t s1len, + char const *s2, size_t s2len) +{ + char const *end1, *end2; + char c1, c2; + wchar_t wc1, wc2, wc1_bak, wc2_bak; + size_t mblen1, mblen2; + mbstate_t state1, state2, state1_bak, state2_bak; + int convfail1, convfail2, convfail1_bak, convfail2_bak; + + char const *t1 = s1; + char const *t2 = s2; + char const *t1_bak, *t2_bak; + size_t column = 0; + + if (ignore_white_space == IGNORE_NO_WHITE_SPACE && !ignore_case) + { + while (*t1 != '\n') + if (*t1++ != *t2++) + return 1; + return 0; + } + + end1 = t1 + s1len; + end2 = t2 + s2len; + + memset (&state1, '\0', sizeof (mbstate_t)); + memset (&state2, '\0', sizeof (mbstate_t)); + + while (1) + { + c1 = *t1; + c2 = *t2; + MBC2WC (t1, end1, mblen1, wc1, state1, convfail1); + MBC2WC (t2, end2, mblen2, wc2, state2, convfail2); + + /* Test for exact char equality first, since it's a common case. */ + if (convfail1 ^ convfail2) + break; + else if (convfail1 && convfail2 && c1 != c2) + break; + else if (!convfail1 && !convfail2 && wc1 != wc2) + { + switch (ignore_white_space) + { + case IGNORE_ALL_SPACE: + /* For -w, just skip past any white space. */ + while (1) + { + if (convfail1) + break; + else if (wc1 == L'\n' || !iswspace (wc1)) + break; + + t1 += mblen1; + c1 = *t1; + MBC2WC (t1, end1, mblen1, wc1, state1, convfail1); + } + + while (1) + { + if (convfail2) + break; + else if (wc2 == L'\n' || !iswspace (wc2)) + break; + + t2 += mblen2; + c2 = *t2; + MBC2WC (t2, end2, mblen2, wc2, state2, convfail2); + } + t1 += mblen1; + t2 += mblen2; + break; + + case IGNORE_SPACE_CHANGE: + /* For -b, advance past any sequence of white space in + line 1 and consider it just one space, or nothing at + all if it is at the end of the line. */ + if (wc1 != L'\n' && iswspace (wc1)) + { + size_t mblen_bak; + mbstate_t state_bak; + + do + { + t1 += mblen1; + mblen_bak = mblen1; + state_bak = state1; + MBC2WC (t1, end1, mblen1, wc1, state1, convfail1); + } + while (!convfail1 && (wc1 != L'\n' && iswspace (wc1))); + + state1 = state_bak; + mblen1 = mblen_bak; + t1 -= mblen1; + convfail1 = 0; + wc1 = L' '; + } + + /* Likewise for line 2. */ + if (wc2 != L'\n' && iswspace (wc2)) + { + size_t mblen_bak; + mbstate_t state_bak; + + do + { + t2 += mblen2; + mblen_bak = mblen2; + state_bak = state2; + MBC2WC (t2, end2, mblen2, wc2, state2, convfail2); + } + while (!convfail2 && (wc2 != L'\n' && iswspace (wc2))); + + state2 = state_bak; + mblen2 = mblen_bak; + t2 -= mblen2; + convfail2 = 0; + wc2 = L' '; + } + + if (wc1 != wc2) + { + /* If we went too far when doing the simple test for + equality, go back to the first non-whitespace + character in both sides and try again. */ + if (wc2 == L' ' && wc1 != L'\n' && + t1 > s1 && + !convfail1_bak && iswspace (wc1_bak)) + { + t1 = t1_bak; + wc1 = wc1_bak; + state1 = state1_bak; + convfail1 = convfail1_bak; + continue; + } + if (wc1 == L' ' && wc2 != L'\n' + && t2 > s2 + && !convfail2_bak && iswspace (wc2_bak)) + { + t2 = t2_bak; + wc2 = wc2_bak; + state2 = state2_bak; + convfail2 = convfail2_bak; + continue; + } + } + + t1_bak = t1; t2_bak = t2; + wc1_bak = wc1; wc2_bak = wc2; + state1_bak = state1; state2_bak = state2; + convfail1_bak = convfail1; convfail2_bak = convfail2; + + if (wc1 == L'\n') + wc1 = L' '; + else + t1 += mblen1; + + if (wc2 == L'\n') + wc2 = L' '; + else + t2 += mblen2; + + break; + + case IGNORE_TRAILING_SPACE: + case IGNORE_TAB_EXPANSION_AND_TRAILING_SPACE: + if (iswspace (wc1) && iswspace (wc2)) + { + char const *p; + wchar_t wc; + size_t mblength; + int convfail; + mbstate_t state; + bool just_whitespace_left = 1; + if (wc1 != L'\n') + { + mblength = mblen1; + p = t1; + memset (&state, '\0', sizeof(mbstate_t)); + while (p < end1) + { + if (*p == '\n') + break; + + p += mblength; + MBC2WC (p, end1, mblength, wc, state, convfail); + if (convfail || !iswspace (wc)) + { + just_whitespace_left = 0; + break; + } + } + } + if (just_whitespace_left && wc2 != L'\n') + { + mblength = mblen2; + p = t2; + memset (&state, '\0', sizeof(mbstate_t)); + while (p < end2) + { + if (*p == '\n') + break; + + p += mblength; + MBC2WC (p, end2, mblength, wc, state, convfail); + if (convfail || !iswspace (wc)) + { + just_whitespace_left = 0; + break; + } + } + } + + if (just_whitespace_left) + /* Both lines have nothing but whitespace left. */ + return false; + } + + if (ignore_white_space == IGNORE_TRAILING_SPACE) + break; + /* Fall through. */ + case IGNORE_TAB_EXPANSION: + if ((wc1 == L' ' && wc2 == L'\t') + || (wc1 == L'\t' && wc2 == L' ')) + { + size_t column2 = column; + + while (1) + { + if (convfail1) + { + ++t1; + break; + } + else if (wc1 == L' ') + column++; + else if (wc1 == L'\t') + column += tabsize - column % tabsize; + else + { + t1 += mblen1; + break; + } + + t1 += mblen1; + c1 = *t1; + MBC2WC (t1, end1, mblen1, wc1, state1, convfail1); + } + + while (1) + { + if (convfail2) + { + ++t2; + break; + } + else if (wc2 == L' ') + column2++; + else if (wc2 == L'\t') + column2 += tabsize - column2 % tabsize; + else + { + t2 += mblen2; + break; + } + + t2 += mblen2; + c2 = *t2; + MBC2WC (t2, end2, mblen2, wc2, state2, convfail2); + } + + if (column != column2) + return 1; + } + else + { + t1 += mblen1; + t2 += mblen2; + } + break; + + case IGNORE_NO_WHITE_SPACE: + t1 += mblen1; + t2 += mblen2; + break; + } + + /* Lowercase all letters if -i is specified. */ + if (ignore_case) + { + if (!convfail1) + wc1 = towlower (wc1); + if (!convfail2) + wc2 = towlower (wc2); + } + + if (convfail1 ^ convfail2) + break; + else if (convfail1 && convfail2 && c1 != c2) + break; + else if (!convfail1 && !convfail2 && wc1 != wc2) + break; + } + else + { + t1_bak = t1; t2_bak = t2; + wc1_bak = wc1; wc2_bak = wc2; + state1_bak = state1; state2_bak = state2; + convfail1_bak = convfail1; convfail2_bak = convfail2; + + t1 += mblen1; t2 += mblen2; + } + + if (!convfail1 && wc1 == L'\n') + return 0; + + column += convfail1 ? 1 : + (wc1 == L'\t') ? tabsize - column % tabsize : wcwidth (wc1); + } + + return 1; +} +#endif /* Find the consecutive changes at the start of the script START. Return the last link before the first gap. */