Blame encengine.c

Packit d0f5c2
/*
Packit d0f5c2
Data structures for encoding transformations.
Packit d0f5c2
Packit d0f5c2
Perl works internally in either a native 'byte' encoding or
Packit d0f5c2
in UTF-8 encoded Unicode.  We have no immediate need for a "wchar_t"
Packit d0f5c2
representation. When we do we can use utf8_to_uv().
Packit d0f5c2
Packit d0f5c2
Most character encodings are either simple byte mappings or
Packit d0f5c2
variable length multi-byte encodings. UTF-8 can be viewed as a
Packit d0f5c2
rather extreme case of the latter.
Packit d0f5c2
Packit d0f5c2
So to solve an important part of perl's encode needs we need to solve the
Packit d0f5c2
"multi-byte -> multi-byte" case. The simple byte forms are then just degenerate
Packit d0f5c2
case. (Where one of multi-bytes will usually be UTF-8.)
Packit d0f5c2
Packit d0f5c2
The other type of encoding is a shift encoding where a prefix sequence
Packit d0f5c2
determines what subsequent bytes mean. Such encodings have state.
Packit d0f5c2
Packit d0f5c2
We also need to handle case where a character in one encoding has to be
Packit d0f5c2
represented as multiple characters in the other. e.g. letter+diacritic.
Packit d0f5c2
Packit d0f5c2
The process can be considered as pseudo perl:
Packit d0f5c2
Packit d0f5c2
my $dst = '';
Packit d0f5c2
while (length($src))
Packit d0f5c2
 {
Packit d0f5c2
  my $size    = $count($src);
Packit d0f5c2
  my $in_seq  = substr($src,0,$size,'');
Packit d0f5c2
  my $out_seq = $s2d_hash{$in_seq};
Packit d0f5c2
  if (defined $out_seq)
Packit d0f5c2
   {
Packit d0f5c2
    $dst .= $out_seq;
Packit d0f5c2
   }
Packit d0f5c2
  else
Packit d0f5c2
   {
Packit d0f5c2
    # an error condition
Packit d0f5c2
   }
Packit d0f5c2
 }
Packit d0f5c2
return $dst;
Packit d0f5c2
Packit d0f5c2
That has the following components:
Packit d0f5c2
 &src_count - a "rule" for how many bytes make up the next character in the
Packit d0f5c2
              source.
Packit d0f5c2
 %s2d_hash  - a mapping from input sequences to output sequences
Packit d0f5c2
Packit d0f5c2
The problem with that scheme is that it does not allow the output
Packit d0f5c2
character repertoire to affect the characters considered from the
Packit d0f5c2
input.
Packit d0f5c2
Packit d0f5c2
So we use a "trie" representation which can also be considered
Packit d0f5c2
a state machine:
Packit d0f5c2
Packit d0f5c2
my $dst   = '';
Packit d0f5c2
my $seq   = \@s2d_seq;
Packit d0f5c2
my $next  = \@s2d_next;
Packit d0f5c2
while (length($src))
Packit d0f5c2
 {
Packit d0f5c2
  my $byte    = $substr($src,0,1,'');
Packit d0f5c2
  my $out_seq = $seq->[$byte];
Packit d0f5c2
  if (defined $out_seq)
Packit d0f5c2
   {
Packit d0f5c2
    $dst .= $out_seq;
Packit d0f5c2
   }
Packit d0f5c2
  else
Packit d0f5c2
   {
Packit d0f5c2
    # an error condition
Packit d0f5c2
   }
Packit d0f5c2
  ($next,$seq) = @$next->[$byte] if $next;
Packit d0f5c2
 }
Packit d0f5c2
return $dst;
Packit d0f5c2
Packit d0f5c2
There is now a pair of data structures to represent everything.
Packit d0f5c2
It is valid for output sequence at a particular point to
Packit d0f5c2
be defined but zero length, that just means "don't know yet".
Packit d0f5c2
For the single byte case there is no 'next' so new tables will be the same as
Packit d0f5c2
the original tables. For a multi-byte case a prefix byte will flip to the tables
Packit d0f5c2
for  the next page (adding nothing to the output), then the tables for the page
Packit d0f5c2
will provide the actual output and set tables back to original base page.
Packit d0f5c2
Packit d0f5c2
This scheme can also handle shift encodings.
Packit d0f5c2
Packit d0f5c2
A slight enhancement to the scheme also allows for look-ahead - if
Packit d0f5c2
we add a flag to re-add the removed byte to the source we could handle
Packit d0f5c2
  a" -> U+00E4 (LATIN SMALL LETTER A WITH DIAERESIS)
Packit d0f5c2
  ab -> a (and take b back please)
Packit d0f5c2
Packit d0f5c2
*/
Packit d0f5c2
Packit d0f5c2
#define PERL_NO_GET_CONTEXT
Packit d0f5c2
#include <EXTERN.h>
Packit d0f5c2
#include <perl.h>
Packit d0f5c2
#include "encode.h"
Packit d0f5c2
Packit d0f5c2
int
Packit d0f5c2
do_encode(const encpage_t * enc, const U8 * src, STRLEN * slen, U8 * dst,
Packit d0f5c2
      STRLEN dlen, STRLEN * dout, int approx, const U8 *term, STRLEN tlen)
Packit d0f5c2
{
Packit d0f5c2
    const U8 *s = src;
Packit d0f5c2
    const U8 *send = s + *slen;
Packit d0f5c2
    const U8 *last = s;
Packit d0f5c2
    U8 *d = dst;
Packit d0f5c2
    U8 *dend = d + dlen, *dlast = d;
Packit d0f5c2
    int code = 0;
Packit d0f5c2
    while (s < send) {
Packit d0f5c2
        const encpage_t *e = enc;
Packit d0f5c2
        U8 byte = *s;
Packit d0f5c2
        while (byte > e->max)
Packit d0f5c2
            e++;
Packit d0f5c2
        if (byte >= e->min && e->slen && (approx || !(e->slen & 0x80))) {
Packit d0f5c2
            const U8 *cend = s + (e->slen & 0x7f);
Packit d0f5c2
            if (cend <= send) {
Packit d0f5c2
                STRLEN n;
Packit d0f5c2
                if ((n = e->dlen)) {
Packit d0f5c2
                    const U8 *out = e->seq + n * (byte - e->min);
Packit d0f5c2
                    U8 *oend = d + n;
Packit d0f5c2
                    if (dst) {
Packit d0f5c2
                        if (oend <= dend) {
Packit d0f5c2
                            while (d < oend)
Packit d0f5c2
                                *d++ = *out++;
Packit d0f5c2
                        }
Packit d0f5c2
                        else {
Packit d0f5c2
                            /* Out of space */
Packit d0f5c2
                            code = ENCODE_NOSPACE;
Packit d0f5c2
                            break;
Packit d0f5c2
                        }
Packit d0f5c2
                    }
Packit d0f5c2
                    else
Packit d0f5c2
                        d = oend;
Packit d0f5c2
                }
Packit d0f5c2
                enc = e->next;
Packit d0f5c2
                s++;
Packit d0f5c2
                if (s == cend) {
Packit d0f5c2
                    if (approx && (e->slen & 0x80))
Packit d0f5c2
                        code = ENCODE_FALLBACK;
Packit d0f5c2
                    last = s;
Packit d0f5c2
                    if (term && (STRLEN)(d-dlast) == tlen && memEQ(dlast, term, tlen)) {
Packit d0f5c2
                        code = ENCODE_FOUND_TERM;
Packit d0f5c2
                        break;
Packit d0f5c2
                    }
Packit d0f5c2
                    dlast = d;
Packit d0f5c2
                }
Packit d0f5c2
            }
Packit d0f5c2
            else {
Packit d0f5c2
                /* partial source character */
Packit d0f5c2
                code = ENCODE_PARTIAL;
Packit d0f5c2
                break;
Packit d0f5c2
            }
Packit d0f5c2
        }
Packit d0f5c2
        else {
Packit d0f5c2
            /* Cannot represent */
Packit d0f5c2
            code = ENCODE_NOREP;
Packit d0f5c2
            break;
Packit d0f5c2
        }
Packit d0f5c2
    }
Packit d0f5c2
    *slen = last - src;
Packit d0f5c2
    *dout = d - dst;
Packit d0f5c2
    return code;
Packit d0f5c2
}