Blob Blame History Raw
/* "$Id: $"
 *
 * Author: Jean-Marc Lienher ( http://oksid.ch )
 * Copyright 2000-2003 by O'ksi'D.
 *
 * This library is free software. Distribution and use rights are outlined in
 * the file "COPYING" which should have been included with this file.  If this
 * file is missing or damaged, see the license at:
 *
 *     http://www.fltk.org/COPYING.php
 *
 * Please report all bugs and problems on the following page:
 *
 *     http://www.fltk.org/str.php
 */

/*
 * Unicode to UTF-8 conversion functions.
 */

#if !defined(WIN32) && !defined(__APPLE__)

#include "../Xutf8.h"

/*** NOTE : all functions are LIMITED to 24 bits Unicode values !!! ***/

/* 
 * Converts the first char of the UTF-8 string to an Unicode value 
 * Returns the byte length of the converted UTF-8 char 
 * Returns -1 if the UTF-8 string is not valid 
 */
int
XConvertUtf8ToUcs(const unsigned char     *buf,
		  int                     len,
		  unsigned int         	  *ucs) {

  if (buf[0] & 0x80) {
    if (buf[0] & 0x40) {
      if (buf[0] & 0x20) {
	if (buf[0] & 0x10) {
	  if (buf[0] & 0x08) {
	    if (buf[0] & 0x04) {
	      if (buf[0] & 0x02) {
		/* bad UTF-8 string */
	      } else {
		/* 0x04000000 - 0x7FFFFFFF */
	      }	
	    } else if (len > 4 
		       && (buf[1] & 0xC0) == 0x80
		       && (buf[2] & 0xC0) == 0x80
		       && (buf[3] & 0xC0) == 0x80
		       && (buf[4] & 0xC0) == 0x80) {
	      /* 0x00200000 - 0x03FFFFFF */
	      *ucs =  ((buf[0] & ~0xF8) << 24) +
		      ((buf[1] & ~0x80) << 18) +
		      ((buf[2] & ~0x80) << 12) +
		      ((buf[3] & ~0x80) << 6) +
		       (buf[4] & ~0x80);
	      if (*ucs > 0x001FFFFF && *ucs < 0x01000000) return 5;
	    }
	  } else if (len > 3 
		     && (buf[1] & 0xC0) == 0x80
		     && (buf[2] & 0xC0) == 0x80
		     && (buf[3] & 0xC0) == 0x80) {
	    /* 0x00010000 - 0x001FFFFF */
	    *ucs =  ((buf[0] & ~0xF0) << 18) +
		    ((buf[1] & ~0x80) << 12) +
		    ((buf[2] & ~0x80) << 6) +
		     (buf[3] & ~0x80);
	    if (*ucs > 0x0000FFFF) return 4;
	  }
	} else if (len > 2
	           && (buf[1] & 0xC0) == 0x80 
		   && (buf[2] & 0xC0) == 0x80) {
	  /* 0x00000800 - 0x0000FFFF */
	  *ucs =  ((buf[0] & ~0xE0) << 12) +
		  ((buf[1] & ~0x80) << 6) +
		   (buf[2] & ~0x80);
	  if (*ucs > 0x000007FF) return 3;
	}	
      } else if (len > 1 && (buf[1] & 0xC0) == 0x80) {
	/* 0x00000080 - 0x000007FF */
	*ucs = ((buf[0] & ~0xC0) << 6) +
		(buf[1] & ~0x80);
	if (*ucs > 0x0000007F) return 2;
      }
    }
  } else if (len > 0) {
    /* 0x00000000 - 0x0000007F */
    *ucs = buf[0];
    return 1;
  } 

  *ucs = (unsigned int) '?'; /* bad utf-8 string */
  return -1;
}

/* 
 * Converts an Unicode value to an UTF-8 string 
 * NOTE : the buffer (buf) must be at least 5 bytes long !!!  
 */
int 
XConvertUcsToUtf8(unsigned int 	ucs, 
		  char 		*buf) {

  if (ucs < 0x000080) {
    buf[0] = ucs;
    return 1;
  } else if (ucs < 0x000800) {
    buf[0] = 0xC0 | (ucs >> 6);
    buf[1] = 0x80 | (ucs & 0x3F);
    return 2;
  } else if (ucs < 0x010000) { 
    buf[0] = 0xE0 | (ucs >> 12);
    buf[1] = 0x80 | ((ucs >> 6) & 0x3F);
    buf[2] = 0x80 | (ucs & 0x3F);
    return 3;
  } else if (ucs < 0x00200000) {
    buf[0] = 0xF0 | (ucs >> 18);
    buf[1] = 0x80 | ((ucs >> 12) & 0x3F);
    buf[2] = 0x80 | ((ucs >> 6) & 0x3F);
    buf[3] = 0x80 | (ucs & 0x3F);
    return 4;
  } else if (ucs < 0x01000000) {
    buf[0] = 0xF8 | (ucs >> 24);
    buf[1] = 0x80 | ((ucs >> 18) & 0x3F);
    buf[2] = 0x80 | ((ucs >> 12) & 0x3F);
    buf[3] = 0x80 | ((ucs >> 6) & 0x3F);
    buf[4] = 0x80 | (ucs & 0x3F);
    return 5;
  }
  buf[0] = '?';
  return -1;
}

/* 
 * returns the byte length of the first UTF-8 char 
 * (returns -1 if not valid) 
 */
int
XUtf8CharByteLen(const unsigned char     *buf,
		 int                     len) {
  unsigned int ucs;
  return XConvertUtf8ToUcs(buf, len, &ucs);
}

/*
 * returns the quantity of Unicode chars in the UTF-8 string 
 */
int 
XCountUtf8Char(const unsigned char 	*buf, 
	       int 			len) {

  int i = 0;
  int nbc = 0;
  while (i < len) {
    int cl = XUtf8CharByteLen(buf + i, len - i);
    if (cl < 1) cl = 1;
    nbc++;
    i += cl;
  }
  return nbc;
}

/* 
 * Same as XConvertUtf8ToUcs but no sanity check is done.
 */
int
XFastConvertUtf8ToUcs(const unsigned char     *buf,
		      int                     len,
		      unsigned int            *ucs) {

  if (buf[0] & 0x80) {
    if (buf[0] & 0x40) {
      if (buf[0] & 0x20) {
	if (buf[0] & 0x10) {
	  if (buf[0] & 0x08) {
	    if (buf[0] & 0x04) {
	      if (buf[0] & 0x02) {
		/* bad UTF-8 string */
	      } else {
		/* 0x04000000 - 0x7FFFFFFF */
	      }	
	    } else if (len > 4) {
	      /* 0x00200000 - 0x03FFFFFF */
	      *ucs =  ((buf[0] & ~0xF8) << 24) +
		      ((buf[1] & ~0x80) << 18) +
		      ((buf[2] & ~0x80) << 12) +
		      ((buf[3] & ~0x80) << 6) +
		       (buf[4] & ~0x80);
	      return 5;
	    }
	  } else if (len > 3) {
	    /* 0x00010000 - 0x001FFFFF */
	    *ucs =  ((buf[0] & ~0xF0) << 18) +
		    ((buf[1] & ~0x80) << 12) +
		    ((buf[2] & ~0x80) << 6) +
		     (buf[3] & ~0x80);
	    return 4;
	  }
	} else if (len > 2) {
	  /* 0x00000800 - 0x0000FFFF */
	  *ucs =  ((buf[0] & ~0xE0) << 12) +
		  ((buf[1] & ~0x80) << 6) +
		   (buf[2] & ~0x80);
	  return 3;
	}	
      } else if (len > 1) {
	/* 0x00000080 - 0x000007FF */
	*ucs = ((buf[0] & ~0xC0) << 6) +
		(buf[1] & ~0x80);
	return 2;
      }
    }
  } else if (len > 0) {
    /* 0x00000000 - 0x0000007F */
    *ucs = buf[0];
    return 1;
  } 

  *ucs = (unsigned int) '?'; /* bad utf-8 string */
  return -1;
}

#endif /* X11 only */

/*
 * End of "$Id: $".
 */