|
Packit |
d0f5c2 |
/*
|
|
Packit |
d0f5c2 |
Data structures for encoding transformations.
|
|
Packit |
d0f5c2 |
|
|
Packit |
d0f5c2 |
Perl works internally in either a native 'byte' encoding or
|
|
Packit |
d0f5c2 |
in UTF-8 encoded Unicode. We have no immediate need for a "wchar_t"
|
|
Packit |
d0f5c2 |
representation. When we do we can use utf8_to_uv().
|
|
Packit |
d0f5c2 |
|
|
Packit |
d0f5c2 |
Most character encodings are either simple byte mappings or
|
|
Packit |
d0f5c2 |
variable length multi-byte encodings. UTF-8 can be viewed as a
|
|
Packit |
d0f5c2 |
rather extreme case of the latter.
|
|
Packit |
d0f5c2 |
|
|
Packit |
d0f5c2 |
So to solve an important part of perl's encode needs we need to solve the
|
|
Packit |
d0f5c2 |
"multi-byte -> multi-byte" case. The simple byte forms are then just degenerate
|
|
Packit |
d0f5c2 |
case. (Where one of multi-bytes will usually be UTF-8.)
|
|
Packit |
d0f5c2 |
|
|
Packit |
d0f5c2 |
The other type of encoding is a shift encoding where a prefix sequence
|
|
Packit |
d0f5c2 |
determines what subsequent bytes mean. Such encodings have state.
|
|
Packit |
d0f5c2 |
|
|
Packit |
d0f5c2 |
We also need to handle case where a character in one encoding has to be
|
|
Packit |
d0f5c2 |
represented as multiple characters in the other. e.g. letter+diacritic.
|
|
Packit |
d0f5c2 |
|
|
Packit |
d0f5c2 |
The process can be considered as pseudo perl:
|
|
Packit |
d0f5c2 |
|
|
Packit |
d0f5c2 |
my $dst = '';
|
|
Packit |
d0f5c2 |
while (length($src))
|
|
Packit |
d0f5c2 |
{
|
|
Packit |
d0f5c2 |
my $size = $count($src);
|
|
Packit |
d0f5c2 |
my $in_seq = substr($src,0,$size,'');
|
|
Packit |
d0f5c2 |
my $out_seq = $s2d_hash{$in_seq};
|
|
Packit |
d0f5c2 |
if (defined $out_seq)
|
|
Packit |
d0f5c2 |
{
|
|
Packit |
d0f5c2 |
$dst .= $out_seq;
|
|
Packit |
d0f5c2 |
}
|
|
Packit |
d0f5c2 |
else
|
|
Packit |
d0f5c2 |
{
|
|
Packit |
d0f5c2 |
# an error condition
|
|
Packit |
d0f5c2 |
}
|
|
Packit |
d0f5c2 |
}
|
|
Packit |
d0f5c2 |
return $dst;
|
|
Packit |
d0f5c2 |
|
|
Packit |
d0f5c2 |
That has the following components:
|
|
Packit |
d0f5c2 |
&src_count - a "rule" for how many bytes make up the next character in the
|
|
Packit |
d0f5c2 |
source.
|
|
Packit |
d0f5c2 |
%s2d_hash - a mapping from input sequences to output sequences
|
|
Packit |
d0f5c2 |
|
|
Packit |
d0f5c2 |
The problem with that scheme is that it does not allow the output
|
|
Packit |
d0f5c2 |
character repertoire to affect the characters considered from the
|
|
Packit |
d0f5c2 |
input.
|
|
Packit |
d0f5c2 |
|
|
Packit |
d0f5c2 |
So we use a "trie" representation which can also be considered
|
|
Packit |
d0f5c2 |
a state machine:
|
|
Packit |
d0f5c2 |
|
|
Packit |
d0f5c2 |
my $dst = '';
|
|
Packit |
d0f5c2 |
my $seq = \@s2d_seq;
|
|
Packit |
d0f5c2 |
my $next = \@s2d_next;
|
|
Packit |
d0f5c2 |
while (length($src))
|
|
Packit |
d0f5c2 |
{
|
|
Packit |
d0f5c2 |
my $byte = $substr($src,0,1,'');
|
|
Packit |
d0f5c2 |
my $out_seq = $seq->[$byte];
|
|
Packit |
d0f5c2 |
if (defined $out_seq)
|
|
Packit |
d0f5c2 |
{
|
|
Packit |
d0f5c2 |
$dst .= $out_seq;
|
|
Packit |
d0f5c2 |
}
|
|
Packit |
d0f5c2 |
else
|
|
Packit |
d0f5c2 |
{
|
|
Packit |
d0f5c2 |
# an error condition
|
|
Packit |
d0f5c2 |
}
|
|
Packit |
d0f5c2 |
($next,$seq) = @$next->[$byte] if $next;
|
|
Packit |
d0f5c2 |
}
|
|
Packit |
d0f5c2 |
return $dst;
|
|
Packit |
d0f5c2 |
|
|
Packit |
d0f5c2 |
There is now a pair of data structures to represent everything.
|
|
Packit |
d0f5c2 |
It is valid for output sequence at a particular point to
|
|
Packit |
d0f5c2 |
be defined but zero length, that just means "don't know yet".
|
|
Packit |
d0f5c2 |
For the single byte case there is no 'next' so new tables will be the same as
|
|
Packit |
d0f5c2 |
the original tables. For a multi-byte case a prefix byte will flip to the tables
|
|
Packit |
d0f5c2 |
for the next page (adding nothing to the output), then the tables for the page
|
|
Packit |
d0f5c2 |
will provide the actual output and set tables back to original base page.
|
|
Packit |
d0f5c2 |
|
|
Packit |
d0f5c2 |
This scheme can also handle shift encodings.
|
|
Packit |
d0f5c2 |
|
|
Packit |
d0f5c2 |
A slight enhancement to the scheme also allows for look-ahead - if
|
|
Packit |
d0f5c2 |
we add a flag to re-add the removed byte to the source we could handle
|
|
Packit |
d0f5c2 |
a" -> U+00E4 (LATIN SMALL LETTER A WITH DIAERESIS)
|
|
Packit |
d0f5c2 |
ab -> a (and take b back please)
|
|
Packit |
d0f5c2 |
|
|
Packit |
d0f5c2 |
*/
|
|
Packit |
d0f5c2 |
|
|
Packit |
d0f5c2 |
#define PERL_NO_GET_CONTEXT
|
|
Packit |
d0f5c2 |
#include <EXTERN.h>
|
|
Packit |
d0f5c2 |
#include <perl.h>
|
|
Packit |
d0f5c2 |
#include "encode.h"
|
|
Packit |
d0f5c2 |
|
|
Packit |
d0f5c2 |
int
|
|
Packit |
d0f5c2 |
do_encode(const encpage_t * enc, const U8 * src, STRLEN * slen, U8 * dst,
|
|
Packit |
d0f5c2 |
STRLEN dlen, STRLEN * dout, int approx, const U8 *term, STRLEN tlen)
|
|
Packit |
d0f5c2 |
{
|
|
Packit |
d0f5c2 |
const U8 *s = src;
|
|
Packit |
d0f5c2 |
const U8 *send = s + *slen;
|
|
Packit |
d0f5c2 |
const U8 *last = s;
|
|
Packit |
d0f5c2 |
U8 *d = dst;
|
|
Packit |
d0f5c2 |
U8 *dend = d + dlen, *dlast = d;
|
|
Packit |
d0f5c2 |
int code = 0;
|
|
Packit |
d0f5c2 |
while (s < send) {
|
|
Packit |
d0f5c2 |
const encpage_t *e = enc;
|
|
Packit |
d0f5c2 |
U8 byte = *s;
|
|
Packit |
d0f5c2 |
while (byte > e->max)
|
|
Packit |
d0f5c2 |
e++;
|
|
Packit |
d0f5c2 |
if (byte >= e->min && e->slen && (approx || !(e->slen & 0x80))) {
|
|
Packit |
d0f5c2 |
const U8 *cend = s + (e->slen & 0x7f);
|
|
Packit |
d0f5c2 |
if (cend <= send) {
|
|
Packit |
d0f5c2 |
STRLEN n;
|
|
Packit |
d0f5c2 |
if ((n = e->dlen)) {
|
|
Packit |
d0f5c2 |
const U8 *out = e->seq + n * (byte - e->min);
|
|
Packit |
d0f5c2 |
U8 *oend = d + n;
|
|
Packit |
d0f5c2 |
if (dst) {
|
|
Packit |
d0f5c2 |
if (oend <= dend) {
|
|
Packit |
d0f5c2 |
while (d < oend)
|
|
Packit |
d0f5c2 |
*d++ = *out++;
|
|
Packit |
d0f5c2 |
}
|
|
Packit |
d0f5c2 |
else {
|
|
Packit |
d0f5c2 |
/* Out of space */
|
|
Packit |
d0f5c2 |
code = ENCODE_NOSPACE;
|
|
Packit |
d0f5c2 |
break;
|
|
Packit |
d0f5c2 |
}
|
|
Packit |
d0f5c2 |
}
|
|
Packit |
d0f5c2 |
else
|
|
Packit |
d0f5c2 |
d = oend;
|
|
Packit |
d0f5c2 |
}
|
|
Packit |
d0f5c2 |
enc = e->next;
|
|
Packit |
d0f5c2 |
s++;
|
|
Packit |
d0f5c2 |
if (s == cend) {
|
|
Packit |
d0f5c2 |
if (approx && (e->slen & 0x80))
|
|
Packit |
d0f5c2 |
code = ENCODE_FALLBACK;
|
|
Packit |
d0f5c2 |
last = s;
|
|
Packit |
d0f5c2 |
if (term && (STRLEN)(d-dlast) == tlen && memEQ(dlast, term, tlen)) {
|
|
Packit |
d0f5c2 |
code = ENCODE_FOUND_TERM;
|
|
Packit |
d0f5c2 |
break;
|
|
Packit |
d0f5c2 |
}
|
|
Packit |
d0f5c2 |
dlast = d;
|
|
Packit |
d0f5c2 |
}
|
|
Packit |
d0f5c2 |
}
|
|
Packit |
d0f5c2 |
else {
|
|
Packit |
d0f5c2 |
/* partial source character */
|
|
Packit |
d0f5c2 |
code = ENCODE_PARTIAL;
|
|
Packit |
d0f5c2 |
break;
|
|
Packit |
d0f5c2 |
}
|
|
Packit |
d0f5c2 |
}
|
|
Packit |
d0f5c2 |
else {
|
|
Packit |
d0f5c2 |
/* Cannot represent */
|
|
Packit |
d0f5c2 |
code = ENCODE_NOREP;
|
|
Packit |
d0f5c2 |
break;
|
|
Packit |
d0f5c2 |
}
|
|
Packit |
d0f5c2 |
}
|
|
Packit |
d0f5c2 |
*slen = last - src;
|
|
Packit |
d0f5c2 |
*dout = d - dst;
|
|
Packit |
d0f5c2 |
return code;
|
|
Packit |
d0f5c2 |
}
|