/*
  make_hash.c v2003-01-24
  make encodings.c from encodings.dat

  Copyright (C) 2000-2003 David Necas (Yeti) <yeti@physics.muni.cz>

  This program is free software; you can redistribute it and/or modify it
  under the terms of version 2 of the GNU General Public License as published
  by the Free Software Foundation.

  This program is distributed in the hope that it will be useful, but WITHOUT
  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
  more details.

  You should have received a copy of the GNU General Public License along
  with this program; if not, write to the Free Software Foundation, Inc.,
  59 Temple Place, Suite 330, Boston, MA 02111-1307 USA.
*/
#ifdef HAVE_CONFIG_H
#  include "config.h"
#endif /* HAVE_CONFIG_H */

#include <stdlib.h>
#include <stdio.h>

#ifdef HAVE_STRING_H
#  include <string.h>
#else /* HAVE_STRING_H */
#  ifdef HAVE_STRINGS_H
#    include <strings.h>
#  endif /* HAVE_STRINGS_H */
#endif /* HAVE_STRING_H */

#ifdef HAVE_MEMORY_H
#  include <memory.h>
#endif /* HAVE_MEMORY_H */

#include <unistd.h>
#include <ctype.h>

/* PARR {{{ */
/*
 * Debugging helpers that print to stderr.  They are only defined when
 * __GNUC__ is defined, because they use __FUNCTION__ and, in the case of
 * PARR, a statement expression.
 *
 * PVAR(f, v):    print file, line, function, the source text of v and its
 *                value; f is the printf conversion for v written without
 *                the leading '%'.
 * PARR(f, v, n): print file, line, function, the source text of v and the
 *                first n elements of v, each formatted with conversion f.
 */
#ifdef __GNUC__
# define PVAR(f, v) fprintf(stderr, "%s:%u %s(): " \
                            #v " == %" #f "\n", __FILE__, __LINE__, __FUNCTION__, v)
# define PARR(f, v, n) ( { int _i; \
  fprintf(stderr, "%s:%u %s(): " #v " == { ", __FILE__, __LINE__, __FUNCTION__); \
  for (_i = 0; _i < n; _i++) fprintf(stderr, "%" #f ", ", (v)[_i]); \
  fputs("}\n", stderr); \
} )
#else /* __GNUC__ */
/* FIXME: PVAR and PARR are not defined for other compilers. */
#endif /* __GNUC__ */
/* }}} */

/* Size in bytes of the buffer used to read one line of input. */
#define LEN 4096

typedef struct {
  char *enca;
  char *rfc1345;
  char *cstocs;
  char *iconv;
  char *mime;
  int naliases;
  char **aliases;
  char *human;
  char *flags;
  char *nsurface;
} EncaCharsetRaw;

/*
 * One charset record in the form that is printed: the names are replaced
 * by their indices in the sorted alias list (see refine_data()).
 */
typedef struct {
  int enca;         /* alias-list index of the enca name */
  int rfc1345;      /* alias-list index of the RFC 1345 name */
  int cstocs;       /* alias-list index of the cstocs name, -1 if none */
  int iconv;        /* alias-list index of the iconv name, -1 if none */
  int mime;         /* alias-list index of the MIME name, -1 if none */
  char *human;      /* printed as a quoted string */
  char *flags;      /* printed verbatim (unquoted) */
  char *nsurface;   /* printed verbatim (unquoted) */
} EncaCharsetFine;

/* Empty record (all pointers NULL, no aliases) used to initialize every
   new EncaCharsetRaw before its fields are read. */
static EncaCharsetRaw RawNULL = {
  NULL, NULL, NULL, NULL, NULL, 0, NULL, NULL, NULL, NULL
};

/*
 * Normalize whitespace in a string, in place.
 *
 * Leading and trailing whitespace is removed and every inner run of
 * whitespace characters is replaced by one space.
 *
 * line: writable NUL-terminated string; NULL is accepted and returned
 *       unchanged.
 *
 * Returns line.
 */
static char*
fixspaces(char *line)
{
  char *p, *q;
  int qs = 0;  /* nonzero while a whitespace run is pending at *q */

  if (line == NULL) return NULL;

  /* <ctype.h> functions need values representable as unsigned char;
     a plain (possibly negative) char is undefined behavior. */
  for (p = line; isspace((unsigned char)*p); p++)
    ;
  for (q = line; *p != '\0'; p++) {
    if (isspace((unsigned char)*p)) {
      /* Tentative separator: it only becomes permanent (q advances) when
         another non-space character follows, so trailing space is lost. */
      *q = ' ';
      qs = 1;
    }
    else {
      if (qs) q++;
      *q++ = *p;
      qs = 0;
    }
  }
  *q = '\0';

  return line;
}

/*
 * Store the value of a "name:value" line into *item if the line starts
 * with the given key and the item has not been set yet.
 *
 * line: input line to examine.
 * name: key prefix to match at the start of line (e.g. "enca:").
 * item: destination; left untouched unless it is NULL and the key matches.
 *       On a match it receives a newly allocated, whitespace-normalized
 *       copy of the text following the key.
 *
 * Returns 1 if the line was consumed, 0 otherwise.  Terminates the program
 * with status 1 when memory runs out.
 */
static int
add_item(const char *line,
         const char *name,
         char **item)
{
  const size_t len = strlen(name);
  char *value;

  if (*item != NULL) return 0;
  if (strncmp(line, name, len) != 0) return 0;

  value = strdup(line + len);
  if (value == NULL) {
    fprintf(stderr, "Out of memory\n");
    exit(1);
  }
  *item = fixspaces(value);

  return 1;
}

/*
 * Make sure a name is present in an alias array, appending a copy of it
 * when it is missing.
 *
 * aliases: current array of *n strings (may be NULL when *n is 0).
 * n:       number of entries; incremented when the name is appended.
 * string:  name to look for; NULL and "" are ignored.
 *
 * Returns the (possibly reallocated) array.  Terminates the program with
 * status 1 when memory runs out.
 */
static char**
check_alias(char **aliases,
            int *n,
            char *string)
{
  char **grown;
  char *copy;
  size_t size;
  int i;

  if (string == NULL || string[0] == '\0') return aliases;
  for (i = 0; i < *n; i++)
    if (strcmp(aliases[i], string) == 0) return aliases;

  /* Allocate everything before touching *n so that the count never gets
     ahead of the array, and keep the old pointer until realloc succeeds. */
  size = strlen(string) + 1;
  copy = (char*)malloc(size);
  grown = (char**)realloc(aliases, ((size_t)*n + 1)*sizeof(char*));
  if (copy == NULL || grown == NULL) {
    fprintf(stderr, "Out of memory\n");
    exit(1);
  }
  memcpy(copy, string, size);
  grown[*n] = copy;
  (*n)++;

  return grown;
}

static EncaCharsetRaw*
read_raw_charset_data(FILE *stream,
                      int *rsize)
{
  char *line;
  EncaCharsetRaw *r, *raw;
  int rs;
  char *gl;

  line = (char*)malloc(LEN);
  r = raw = (EncaCharsetRaw*)malloc(sizeof(EncaCharsetRaw));
  *r = RawNULL;
  rs = 1;
  while (1) {
    gl = fgets(line, LEN, stream);
    if (r->enca && r->rfc1345 && r->cstocs && r->human && r->iconv && r->mime
        && r->flags && r->nsurface && r->aliases) {
      if (r->enca[0] == '\0') {
        fprintf(stderr, "Enca's charset name #%d empty\n", (int)(r - raw + 1));
        exit(1);
      }
      if (r->rfc1345[0] == '\0') {
        fprintf(stderr, "RFC-1345 charset name #%d empty\n", (int)(r - raw + 1));
        exit(1);
      }
      if (r->iconv[0] == '\0') r->iconv = NULL;
      if (r->cstocs[0] == '\0') r->cstocs = NULL;
      if (r->mime[0] == '\0') r->mime = NULL;
      if (r->nsurface[0] == '\0') r->nsurface = strdup("0");
      r->aliases = check_alias(r->aliases, &r->naliases, r->enca);
      r->aliases = check_alias(r->aliases, &r->naliases, r->iconv);
      r->aliases = check_alias(r->aliases, &r->naliases, r->rfc1345);
      r->aliases = check_alias(r->aliases, &r->naliases, r->mime);
      r->aliases = check_alias(r->aliases, &r->naliases, r->cstocs);
      if (!gl) break;
      rs++;
      {
        int d = r - raw;
        raw = (EncaCharsetRaw*)realloc(raw, rs*sizeof(EncaCharsetRaw));
        r = raw + d + 1;
      }
      *r = RawNULL;
    }
    line[LEN-1] = '\0';
    fixspaces(line);
    if (line[0] == '\0' || line[0] == '#') continue;
    if (add_item(line, "enca:", &r->enca)) continue;
    if (add_item(line, "rfc:", &r->rfc1345)) continue;
    if (add_item(line, "iconv:", &r->iconv)) continue;
    if (add_item(line, "mime:", &r->mime)) continue;
    if (add_item(line, "cstocs:", &r->cstocs)) continue;
    if (add_item(line, "human:", &r->human)) continue;
    if (add_item(line, "flags:", &r->flags)) continue;
    if (add_item(line, "nsurface:", &r->nsurface)) continue;
    if (strncmp(line, "aliases:", 8) == 0 && !r->aliases) {
      int i;
      char *next, *l = fixspaces(line+8);
      r->naliases = 1;
      while ((l = strchr(l, ' ')) != NULL) {
        r->naliases++;
        l++;
      }
      r->aliases = (char**)malloc((r->naliases)*sizeof(char*));
      l = line+8;
      for (i = 0; i < r->naliases; i++) {
        next = strchr(l, ' ');
        if (next) *next = '\0';
        r->aliases[i] = strdup(l);
        l = next+1;
      }
      continue;
    }
    fprintf(stderr, "Unexpected `%s'\n", line);
    exit(1);
  }

  *rsize = rs;
  return raw;
}

/*
 * Compare two strings looking only at their alphanumeric characters and
 * ignoring case, so that e.g. "ISO-8859-2" and "iso88592" are equal.
 *
 * Returns zero if the strings are equivalent, otherwise the difference of
 * the first pair of lower-cased alphanumeric characters that differ (the
 * end of a string counts as 0).
 */
static int
squeeze_compare(const char *x, const char *y)
{
  /* <ctype.h> functions need values representable as unsigned char, so
     the strings are walked through unsigned pointers. */
  const unsigned char *a = (const unsigned char*)x;
  const unsigned char *b = (const unsigned char*)y;

  while (*a != '\0' || *b != '\0') {
    while (*a != '\0' && !isalnum(*a)) a++;
    while (*b != '\0' && !isalnum(*b)) b++;
    if (tolower(*a) != tolower(*b))
      return tolower(*a) - tolower(*b);
    if (*a != '\0') a++;
    if (*b != '\0') b++;
  }
  return 0;
}

/*
 * qsort()-style comparison of two alias names.
 *
 * p, q: pointers to the (char *) strings to compare.
 *
 * Names are ordered by squeeze_compare(); names it considers equivalent
 * are further ordered by strcmp(), so that only identical strings compare
 * equal and the resulting order is fully determined.
 */
static int
stable_compare(const void *p, const void *q)
{
  const char *lhs = *(const char *const *)p;
  const char *rhs = *(const char *const *)q;
  const int loose = squeeze_compare(lhs, rhs);

  return loose != 0 ? loose : strcmp(lhs, rhs);
}

/*
 * Find a string in an alias list sorted with stable_compare().
 *
 * alist: sorted array of n strings.
 * n:     number of entries in alist.
 * s:     string to find; it must match an entry exactly.
 *
 * Returns the index of s in alist.  If s is not present (or the list is
 * empty), a message is printed to stderr and the program exits with
 * status 1, so that the failure is visible to the build.
 */
static int
bin_search(char **alist, const int n, const char *s)
{
  int i1 = 0;
  int i2 = n-1;
  int i;

  /* An empty list has no first or last element to compare against. */
  if (n <= 0) {
    fprintf(stderr, "Out of search range: `%s'\n", s);
    exit(1);
  }

  i = stable_compare(&s, &alist[i1]);
  if (i < 0) {
    fprintf(stderr, "Out of search range: `%s'\n", s);
    exit(1);
  }
  if (i == 0) return i1;

  i = stable_compare(&s, &alist[i2]);
  if (i > 0) {
    fprintf(stderr, "Out of search range: `%s'\n", s);
    exit(1);
  }
  if (i == 0) return i2;

  /* Invariant: alist[i1] < s < alist[i2]. */
  while (i1+1 < i2) {
    int im = i1 + (i2 - i1)/2;
    i = stable_compare(&s, &alist[im]);
    if (i == 0) return im;
    if (i > 0) i1 = im; else i2 = im;
  }
  if (stable_compare(&s, &alist[i1+1]) == 0) return i1+1;

  fprintf(stderr, "Not found: `%s'\n", s);
  exit(1);
}

/*
 * Build the sorted list of all distinct alias names of all charsets.
 *
 * raw:   array of ncs charset records.
 * ncs:   number of records.
 * total: receives the number of names in the returned list.
 *
 * The names are sorted with stable_compare() and identical strings are
 * reduced to one entry.  Names that differ only in case or in
 * non-alphanumeric characters are all kept.  Every removal and every such
 * equivalent pair is reported on stderr.
 *
 * Returns a newly allocated array of *total pointers; the strings
 * themselves are shared with raw.  Terminates the program with status 1
 * when memory runs out.
 */
static char**
build_alias_list(EncaCharsetRaw *raw, const int ncs, int *total)
{
  char **alist;
  int nn, i, j, k;

  for (i = nn = 0; i < ncs; i++) nn += raw[i].naliases;
  /* Never request zero bytes, so that NULL always means failure. */
  alist = (char**)malloc((size_t)(nn > 0 ? nn : 1)*sizeof(char*));
  if (alist == NULL) {
    fprintf(stderr, "Out of memory\n");
    exit(1);
  }
  for (i = j = 0; i < ncs; i++) {
    for (k = 0; k < raw[i].naliases; k++)
      alist[j++] = raw[i].aliases[k];
  }
  qsort(alist, nn, sizeof(char*), &stable_compare);

  /* Compact the sorted list in one pass; j is the number of names kept so
     far and alist[j-1] the last one kept.  Identical strings are adjacent
     after sorting. */
  j = nn > 0 ? 1 : 0;
  for (i = 1; i < nn; i++) {
    if (squeeze_compare(alist[i], alist[j-1]) == 0) {
      if (strcmp(alist[i], alist[j-1]) == 0) {
        fprintf(stderr, "Removing duplicate `%s'\n", alist[i]);
        continue;
      }
      fprintf(stderr, "Keeping equvialent `%s' and `%s'\n",
              alist[i], alist[j-1]);
    }
    alist[j++] = alist[i];
  }

  *total = j;
  return alist;
}

/*
 * Convert raw charset records to the printable form in which names are
 * replaced by their indices in the sorted alias list.
 *
 * raw:   array of ncs raw records.
 * ncs:   number of records.
 * alist: sorted alias list of nn names, as searched by bin_search().
 * nn:    number of names in alist.
 *
 * The enca and rfc1345 names are always looked up; iconv, cstocs and mime
 * get index -1 when the raw name is NULL.  The human, flags and nsurface
 * strings are shared with raw, not copied.
 *
 * Returns a newly allocated array of ncs records.  Terminates the program
 * with status 1 when memory runs out.
 */
static EncaCharsetFine*
refine_data(EncaCharsetRaw *raw, const int ncs, char **alist, const int nn)
{
  int i;
  EncaCharsetFine *fine;

  /* Never request zero bytes, so that NULL always means failure. */
  fine = (EncaCharsetFine*)malloc((size_t)(ncs > 0 ? ncs : 1)
                                  *sizeof(EncaCharsetFine));
  if (fine == NULL) {
    fprintf(stderr, "Out of memory\n");
    exit(1);
  }

  for (i = 0; i < ncs; i++) {
    fine[i].enca = bin_search(alist, nn, raw[i].enca);
    fine[i].rfc1345 = bin_search(alist, nn, raw[i].rfc1345);
    fine[i].iconv = raw[i].iconv ? bin_search(alist, nn, raw[i].iconv) : -1;
    fine[i].cstocs = raw[i].cstocs ? bin_search(alist, nn, raw[i].cstocs) : -1;
    fine[i].mime = raw[i].mime ? bin_search(alist, nn, raw[i].mime) : -1;
    fine[i].human = raw[i].human;
    fine[i].flags = raw[i].flags;
    fine[i].nsurface = raw[i].nsurface;
  }

  return fine;
}

/*
 * Create the table mapping each alias-list position to a charset.
 *
 * raw:   array of ncs raw records.
 * ncs:   number of records.
 * alist: sorted alias list of nn names, as searched by bin_search().
 * nn:    number of names in alist.
 *
 * For every alias of every charset, the slot at the alias's position in
 * alist receives the index of that charset in raw.  When the same alias
 * string belongs to several charsets, the last charset wins.
 *
 * Returns a newly allocated array of nn ints.  Terminates the program with
 * status 1 when memory runs out.
 */
static int*
create_index_list(EncaCharsetRaw *raw, const int ncs,
                  char **alist, const int nn)
{
  int i, k;
  int *ilist;

  /* Never request zero bytes, so that NULL always means failure. */
  ilist = (int*)malloc((size_t)(nn > 0 ? nn : 1)*sizeof(int));
  if (ilist == NULL) {
    fprintf(stderr, "Out of memory\n");
    exit(1);
  }

  for (i = 0; i < ncs; i++) {
    for (k = 0; k < raw[i].naliases; k++) {
      ilist[bin_search(alist, nn, raw[i].aliases[k])] = i;
    }
  }

  return ilist;
}

/*
 * Write the generated C tables to standard output.
 *
 * fine:  array of ncs refined charset records, printed as CHARSET_INFO.
 * ncs:   number of records.
 * ilist: array of nn charset indices, printed as INDEX_LIST, 16 per line.
 * alist: array of nn alias names, printed as ALIAS_LIST.
 * nn:    number of entries in ilist and alist.
 *
 * The human field is printed as a string literal; flags and nsurface are
 * printed verbatim.
 */
static void
print_fine_data(EncaCharsetFine *fine, const int ncs,
                int *ilist, char **alist, const int nn)
{
  static const char table_banner[] =
    "/* THIS IS A GENERATED TABLE, see tools/make_hash.c. */";
  int pos, row;

  puts("/****  THIS IS A GENERATED FILE.  DO NOT TOUCH!  *****/");

  /* Charset records. */
  puts(table_banner);
  puts("static const EncaCharsetInfo CHARSET_INFO[] = {");
  for (pos = 0; pos < ncs; pos++) {
    const EncaCharsetFine *cs = &fine[pos];

    printf("  {\n"
           "     %d, %d, %d, %d, %d,\n"
           "     \"%s\",\n"
           "     %s,\n"
           "     %s\n"
           "  },\n",
           cs->enca, cs->rfc1345, cs->cstocs, cs->iconv, cs->mime,
           cs->human, cs->flags, cs->nsurface);
  }
  puts("};\n");

  /* Alias names, one per line. */
  puts(table_banner);
  puts("static const char *ALIAS_LIST[] = {");
  for (pos = 0; pos < nn; pos++)
    printf("  \"%s\",\n", alist[pos]);
  puts("};\n");

  /* Charset index of each alias, in rows of at most 16 numbers. */
  puts(table_banner);
  puts("static const int INDEX_LIST[] = {");
  row = 0;
  while (row < nn) {
    const int end = (nn - row > 16) ? row + 16 : nn;

    fputs("  ", stdout);
    for (pos = row; pos < end; pos++)
      printf("%2d, ", ilist[pos]);
    putchar('\n');
    row = end;
  }
  puts("};\n");
}

/*
 * Read the charset descriptions from standard input and write the
 * generated tables to standard output.
 *
 * Returns 0 on success and 1 if the output could not be written; the
 * helper functions exit on their own on invalid input.
 */
int
main(void)
{
  EncaCharsetRaw *raw;
  EncaCharsetFine *fine;
  char **alist;
  int *ilist;
  int ncs, nn;

  raw = read_raw_charset_data(stdin, &ncs);
  alist = build_alias_list(raw, ncs, &nn);
  fine = refine_data(raw, ncs, alist, nn);
  ilist = create_index_list(raw, ncs, alist, nn);
  print_fine_data(fine, ncs, ilist, alist, nn);

  free(fine);
  free(ilist);
  /* Only the pointer array is released; the names belong to raw. */
  free(alist);

  /* The output is a source file used by the build: a truncated file must
     not be reported as success. */
  if (fflush(stdout) != 0 || ferror(stdout)) {
    fprintf(stderr, "Cannot write output\n");
    return 1;
  }

  return 0;
}