/* Pango
* testboundaries.c: Test text boundary algorithms
*
* Copyright (C) 1999-2000 Red Hat Software
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Library General Public
* License as published by the Free Software Foundation; either
* version 2 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Library General Public License for more details.
*
* You should have received a copy of the GNU Library General Public
* License along with this library; if not, write to the
* Free Software Foundation, Inc., 59 Temple Place - Suite 330,
* Boston, MA 02111-1307, USA.
*/
#include <string.h>
#include <stdlib.h>
#include <stdio.h>
#include <glib.h>
#include <pango/pango.h>
#ifndef G_OS_WIN32
#include <unistd.h>
#endif
/* printf conversion used when a Unicode code point is shown in a failure
 * message: hexadecimal in alternate form ("0x" prefix), zero-padded to a
 * minimum field width of 6.
 */
#define CHFORMAT "%0#6x"
/* FIXME for now this just tests that the breaking of some sample
* text conforms to certain rules and invariants. But eventually
* we should also have test-result pairs, i.e. a string and some
* encoding of the correct way to break the string, to check
* more precisely that things worked
*/
/* Position bookkeeping shared by log_attr_foreach(), which updates these
 * while walking the text, and fail(), which prints them.
 */
/* 0-based index of the current character within its line */
static int offset = 0;
/* line number; set to 1 when a walk starts, bumped after each '\n' */
static int line = 0;
/* the character currently being handed to the callback */
static gunichar current_wc = 0;
/* start of the current line inside the text being walked */
static const char *line_start = NULL;
/* one past the current character; fail() echoes line_start..line_end */
static const char *line_end = NULL;
/* Report a test failure and terminate the process.
 *
 * The printf-style message is formatted, prefixed with the position held in
 * the file-scope variables (line, offset, current_wc) and followed by the
 * text between line_start and line_end.  The process then exits with
 * status 1, so this function never returns.
 *
 * fail() may be called before any text has been walked (for example when
 * the sample file cannot be read).  In that case line_start and line_end are
 * still NULL; an empty line text is printed instead of subtracting NULL
 * pointers and passing a NULL string to a "%s" conversion.
 */
static void fail (const char *format, ...) G_GNUC_PRINTF (1, 2) G_GNUC_NORETURN;
static void
fail (const char *format, ...)
{
  char *str;
  char *line_text;
  va_list args;

  va_start (args, format);
  str = g_strdup_vprintf (format, args);
  va_end (args);

  if (line_start != NULL && line_end != NULL)
    line_text = g_strndup (line_start, line_end - line_start);
  else
    line_text = g_strdup ("");

  fprintf (stderr, "line %d offset %d char is " CHFORMAT ": %s\n (line is '%s')\n",
           line, offset, current_wc, str, line_text);

  g_free (str);
  g_free (line_text);

  exit (1);
}
/* Callback invoked by log_attr_foreach() once for every character.
 *
 * wc, type:           the current character and its g_unichar_type()
 * prev_wc, prev_type: the preceding character; for the first character
 *                     prev_wc is 0 and prev_type is (GUnicodeType) -1
 * next_wc, next_type: the following character; next_wc is 0 for the last
 *                     character, and next_type is only meaningful when
 *                     next_wc is nonzero
 * attr:               log attribute entry for the current character
 * prev_attr:          entry for the preceding character, NULL for the first
 * next_attr:          entry for the following character, NULL for the last
 * data:               user pointer passed through from log_attr_foreach()
 */
typedef void (* CharForeachFunc) (gunichar wc,
gunichar prev_wc,
gunichar next_wc,
GUnicodeType type,
GUnicodeType prev_type,
GUnicodeType next_type,
PangoLogAttr *attr,
PangoLogAttr *prev_attr,
PangoLogAttr *next_attr,
gpointer data);
/* Walk TEXT one UTF-8 character at a time and call FUNC for each one.
 *
 * text:  NUL-terminated UTF-8 string; nothing happens if it is empty
 * attrs: array indexed by character position; entry i belongs to the i-th
 *        character.  attrs[i + 1] is only referenced while a following
 *        character exists.
 * func:  callback receiving the current, previous and next character, their
 *        Unicode types and their attribute entries
 * data:  opaque pointer forwarded to FUNC
 *
 * While walking, the file-scope variables used by fail() (line, offset,
 * current_wc, line_start, line_end) are kept in sync with the current
 * position so that a failure raised from FUNC reports where it happened.
 *
 * At the edges of the text the missing neighbour is described by a
 * character value of 0, a type of (GUnicodeType) -1 and a NULL attribute
 * pointer.
 */
static void
log_attr_foreach (const char     *text,
                  PangoLogAttr   *attrs,
                  CharForeachFunc func,
                  gpointer        data)
{
  const gchar *next = text;
  /* Keep the length as a pointer offset; no narrowing to gint. */
  const gchar *end = text + strlen (text);
  gint i = 0;
  gunichar prev_wc;
  gunichar next_wc;
  GUnicodeType prev_type;
  GUnicodeType next_type;

  if (next == end)
    return;

  offset = 0;
  line = 1;

  prev_type = (GUnicodeType) -1;
  prev_wc = 0;

  next_wc = g_utf8_get_char (next);
  next_type = g_unichar_type (next_wc);

  line_start = text;
  line_end = text;

  while (next_wc != 0)
    {
      GUnicodeType type;
      gunichar wc;

      wc = next_wc;
      type = next_type;

      current_wc = wc;

      next = g_utf8_next_char (next);
      /* fail() prints the line up to and including the current character */
      line_end = next;

      if (next >= end)
        next_wc = 0;
      else
        next_wc = g_utf8_get_char (next);

      /* Past the last character there is no next type: use the same
       * sentinel as prev_type at the start instead of leaving the current
       * character's type in place.
       */
      if (next_wc)
        next_type = g_unichar_type (next_wc);
      else
        next_type = (GUnicodeType) -1;

      (* func) (wc, prev_wc, next_wc,
                type, prev_type, next_type,
                &attrs[i],
                i != 0 ? &attrs[i-1] : NULL,
                next_wc != 0 ? &attrs[i+1] : NULL,
                data);

      prev_type = type;
      prev_wc = wc;
      ++i;
      ++offset;

      if (wc == '\n')
        {
          /* A newline starts a fresh line for error reporting */
          ++line;
          offset = 0;
          line_start = next;
          line_end = next;
        }
    }
}
/* CharForeachFunc that checks the line-break flags of one character.
 *
 * Calls fail() (which does not return) on the first violated rule:
 *  - no line break between "\r" and "\n"
 *  - the character after "\n" must be a line break
 *  - the first character of the text is never a line break
 *  - no optional break before a space, unless a combining mark follows
 *  - a mandatory break must also be flagged as a line break
 *  - no optional break between two opening or two closing punctuation
 *    characters, nor between a letter and a following quotation mark
 *
 * type, prev_type, next_type and data are not used.
 */
static void
check_line_char (gunichar      wc,
                 gunichar      prev_wc,
                 gunichar      next_wc,
                 GUnicodeType  type,
                 GUnicodeType  prev_type,
                 GUnicodeType  next_type,
                 PangoLogAttr *attr,
                 PangoLogAttr *prev_attr,
                 PangoLogAttr *next_attr,
                 gpointer      data)
{
  const GUnicodeBreakType cur_class = g_unichar_break_type (wc);
  const GUnicodeBreakType prev_class =
    prev_wc != 0 ? g_unichar_break_type (prev_wc) : G_UNICODE_BREAK_UNKNOWN;
  /* A break opportunity that is allowed but not forced */
  const gboolean optional_break =
    attr->is_line_break && !attr->is_mandatory_break;

  if (wc == '\n')
    {
      if (prev_wc == '\r' && attr->is_line_break)
        fail ("line break between \\r and \\n");

      if (next_attr != NULL && !next_attr->is_line_break)
        fail ("no line break after \\n");
    }

  if (prev_wc == 0 && attr->is_line_break)
    fail ("first char in string should not be marked as a line break");

  if (cur_class == G_UNICODE_BREAK_SPACE && optional_break && prev_attr != NULL)
    {
      gboolean mark_follows =
        next_wc != 0 &&
        g_unichar_break_type (next_wc) == G_UNICODE_BREAK_COMBINING_MARK;

      if (!mark_follows)
        fail ("can't break lines before a space unless a mandatory break char precedes it or a combining mark follows; prev char was " CHFORMAT, prev_wc);
    }

  if (attr->is_mandatory_break && !attr->is_line_break)
    fail ("mandatory breaks must also be marked as regular breaks");

  /* FIXME use the break tables from break.c to automatically
   * check invariants for each cell in the table. Shouldn't
   * be that hard to do.
   */
  if (optional_break)
    {
      if (cur_class == G_UNICODE_BREAK_OPEN_PUNCTUATION &&
          prev_class == G_UNICODE_BREAK_OPEN_PUNCTUATION)
        fail ("can't break between two open punctuation chars");

      if (cur_class == G_UNICODE_BREAK_CLOSE_PUNCTUATION &&
          prev_class == G_UNICODE_BREAK_CLOSE_PUNCTUATION)
        fail ("can't break between two close punctuation chars");

      if (cur_class == G_UNICODE_BREAK_QUOTATION &&
          prev_class == G_UNICODE_BREAK_ALPHABETIC)
        fail ("can't break letter-quotemark sequence");
    }
}
/* Apply check_line_char() to every character of TEXT, using ATTRS as the
 * per-character log attributes.  No user data is passed to the callback.
 */
static void
check_line_invariants (const char   *text,
                       PangoLogAttr *attrs)
{
  CharForeachFunc per_char_check = check_line_char;

  log_attr_foreach (text, attrs, per_char_check, NULL);
}
/* Placeholder for word-boundary checks: the body is empty, so nothing is
 * verified and both arguments are ignored.
 */
static void
check_word_invariants (const char *text,
PangoLogAttr *attrs)
{
}
/* Placeholder for sentence-boundary checks: the body is empty, so nothing
 * is verified and both arguments are ignored.
 */
static void
check_sentence_invariants (const char *text,
PangoLogAttr *attrs)
{
}
/* Placeholder for grapheme-boundary checks: the body is empty, so nothing
 * is verified and both arguments are ignored.
 */
static void
check_grapheme_invariants (const char *text,
PangoLogAttr *attrs)
{
}
#if 0
/* Debugging aid: print TEXT to stdout, one line per chunk, cutting at every
 * character whose entry in ATTRS has is_sentence_boundary set.
 *
 * Output only happens when a boundary is met inside the loop, so the text
 * after the last boundary found is not printed.
 */
static void print_sentences (const char   *text,
                             PangoLogAttr *attrs);
static void
print_sentences (const char   *text,
                 PangoLogAttr *attrs)
{
  const char *segment_start = text;
  const char *cursor;
  int index;

  for (cursor = text, index = 0;
       *cursor;
       cursor = g_utf8_next_char (cursor), ++index)
    {
      char *segment;

      if (!attrs[index].is_sentence_boundary)
        continue;

      /* Emit everything since the previous boundary */
      segment = g_strndup (segment_start, cursor - segment_start);
      printf ("%s\n", segment);
      g_free (segment);

      segment_start = cursor;
    }
}
#endif
static void
check_invariants (const char *text)
{
int len;
PangoLogAttr *attrs;
if (!g_utf8_validate (text, -1, NULL))
fail ("Invalid UTF-8 in test text");
len = g_utf8_strlen (text, -1);
attrs = g_new0 (PangoLogAttr, len + 1);
pango_get_log_attrs (text,
-1,
0,
pango_language_from_string ("C"),
attrs,
len + 1);
check_line_invariants (text, attrs);
check_sentence_invariants (text, attrs);
check_grapheme_invariants (text, attrs);
check_word_invariants (text, attrs);
#if 0
print_sentences (text, attrs);
#endif
g_free (attrs);
}
static void
test_boundaries (void)
{
gchar *text;
const gchar *filename;
#if GLIB_CHECK_VERSION(2, 37, 2)
filename = g_test_get_filename (G_TEST_DIST, "boundaries.utf8", NULL);
#else
filename = SRCDIR "/boundaries.utf8";
#endif
g_print ("sample file: %s\n", filename);
if (!g_file_get_contents (filename, &text, NULL, NULL))
fail ("Couldn't open sample text file");
check_invariants (text);
g_free (text);
printf ("testboundaries passed\n");
}
/* Program entry point: initialise the GLib test framework, register the
 * boundary test under "/text/boundaries" and return the result of the run.
 */
int
main (int argc, char *argv[])
{
  int status;

  g_test_init (&argc, &argv, NULL);
  g_test_add_func ("/text/boundaries", test_boundaries);

  status = g_test_run ();

  return status;
}