//C- -*- C++ -*- //C- ------------------------------------------------------------------- //C- DjVuLibre-3.5 //C- Copyright (c) 2002 Leon Bottou and Yann Le Cun. //C- Copyright (c) 2001 AT&T //C- //C- This software is subject to, and may be distributed under, the //C- GNU General Public License, either Version 2 of the license, //C- or (at your option) any later version. The license should have //C- accompanied the software or you may obtain a copy of the license //C- from the Free Software Foundation at http://www.fsf.org . //C- //C- This program is distributed in the hope that it will be useful, //C- but WITHOUT ANY WARRANTY; without even the implied warranty of //C- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the //C- GNU General Public License for more details. //C- //C- DjVuLibre-3.5 is derived from the DjVu(r) Reference Library from //C- Lizardtech Software. Lizardtech Software has authorized us to //C- replace the original DjVu(r) Reference Library notice by the following //C- text (see doc/lizard2002.djvu and doc/lizardtech2007.djvu): //C- //C- ------------------------------------------------------------------ //C- | DjVu (r) Reference Library (v. 3.5) //C- | Copyright (c) 1999-2001 LizardTech, Inc. All Rights Reserved. //C- | The DjVu Reference Library is protected by U.S. Pat. No. //C- | 6,058,214 and patents pending. //C- | //C- | This software is subject to, and may be distributed under, the //C- | GNU General Public License, either Version 2 of the license, //C- | or (at your option) any later version. The license should have //C- | accompanied the software or you may obtain a copy of the license //C- | from the Free Software Foundation at http://www.fsf.org . //C- | //C- | The computer code originally released by LizardTech under this //C- | license and unmodified by other parties is deemed "the LIZARDTECH //C- | ORIGINAL CODE." Subject to any third party intellectual property //C- | claims, LizardTech grants recipient a worldwide, royalty-free, //C- | non-exclusive license to make, use, sell, or otherwise dispose of //C- | the LIZARDTECH ORIGINAL CODE or of programs derived from the //C- | LIZARDTECH ORIGINAL CODE in compliance with the terms of the GNU //C- | General Public License. This grant only confers the right to //C- | infringe patent claims underlying the LIZARDTECH ORIGINAL CODE to //C- | the extent such infringement is reasonably necessary to enable //C- | recipient to make, have made, practice, sell, or otherwise dispose //C- | of the LIZARDTECH ORIGINAL CODE (or portions thereof) and not to //C- | any greater extent that may be necessary to utilize further //C- | modifications or combinations. //C- | //C- | The LIZARDTECH ORIGINAL CODE is provided "AS IS" WITHOUT WARRANTY //C- | OF ANY KIND, EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED //C- | TO ANY WARRANTY OF NON-INFRINGEMENT, OR ANY IMPLIED WARRANTY OF //C- | MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE. //C- +------------------------------------------------------------------ // From: Leon Bottou, 1/31/2002 // This file has very little to do with my initial implementation. // It has been practically rewritten by Lizardtech for i18n changes. // My original implementation was very small in comparison // . // In my opinion, the duplication of the string classes is a failed // attempt to use the type system to enforce coding policies. // This could be fixed. But there are better things to do in djvulibre. #ifdef HAVE_CONFIG_H # include "config.h" #endif #if NEED_GNUG_PRAGMAS # pragma implementation #endif #include "GString.h" #include "GThreads.h" #include "debug.h" #include #include #include #include #if HAS_WCHAR # include # if !defined(AUTOCONF) || HAVE_WCHAR_H # include # endif # if HAS_WCTYPE # include # endif #endif #include #ifndef LC_NUMERIC //MingW # undef DO_CHANGELOCALE # define LC_NUMERIC 0 #endif #ifndef DO_CHANGELOCALE # define DO_CHANGELOCALE 0 #endif #ifdef HAVE_NAMESPACES namespace DJVU { # ifdef NOT_DEFINED // Just to fool emacs c++ mode } #endif #endif GBaseString::~GBaseString() {} GNativeString::~GNativeString() {} GUTF8String::~GUTF8String() {} #if !HAS_MBSTATE && HAS_WCHAR // Under some systems, wctomb() and mbtowc() are not thread // safe. In those cases, wcrtomb and mbrtowc are preferred. // For Solaris, wctomb() and mbtowc() are thread safe, and // wcrtomb() and mbrtowc() don't exist. #define wcrtomb MYwcrtomb #define mbrtowc MYmbrtowc #define mbrlen MYmbrlen static inline int wcrtomb(char *bytes,wchar_t w,mbstate_t *) { return wctomb(bytes,w); } static inline int mbrtowc(wchar_t *w,const char *source, size_t n, mbstate_t *) { return mbtowc(w,source,n); } static inline size_t mbrlen(const char *s, size_t n, mbstate_t *) { return mblen(s,n); } #endif // !HAS_MBSTATE || HAS_WCHAR GP GStringRep::upcase(void) const { return tocase(giswupper,gtowupper); } GP GStringRep::downcase(void) const { return tocase(giswlower,gtowlower); } GP GStringRep::UTF8::create(const unsigned int sz) { return GStringRep::create(sz,(GStringRep::UTF8 *)0); } GP GStringRep::UTF8::create(const char *s) { GStringRep::UTF8 dummy; return dummy.strdup(s); } GP GStringRep::UTF8::create(const GP &s1,const GP &s2) { GStringRep::UTF8 dummy; return dummy.concat(s1,s2); } GP GStringRep::UTF8::create( const GP &s1,const char *s2) { GStringRep::UTF8 dummy; return dummy.concat(s1,s2); } GP GStringRep::UTF8::create( const char *s1, const GP &s2) { GStringRep::UTF8 dummy; return dummy.concat(s1,s2); } GP GStringRep::UTF8::create( const char *s1,const char *s2) { GStringRep::UTF8 dummy; return dummy.concat(s1,s2); } GP GStringRep::UTF8::create(const char *s,const int start,const int length) { GStringRep::UTF8 dummy; return dummy.substr(s,start,length); } GP GStringRep::UTF8::create( const uint16_t *s,const int start,const int length) { GStringRep::UTF8 dummy; return dummy.substr(s,start,length); } GP GStringRep::UTF8::create( const uint32_t *s,const int start,const int length) { GStringRep::UTF8 dummy; return dummy.substr(s,start,length); } GP GStringRep::UTF8::blank(const unsigned int sz) const { return GStringRep::create(sz,(GStringRep::UTF8 *)0); } bool GStringRep::UTF8::isUTF8(void) const { return true; } GP GStringRep::UTF8::toThis( const GP &rep,const GP &) const { return rep?(rep->toUTF8(true)):rep; } GP GStringRep::UTF8::create(const char fmt[],va_list& args) { const GP s(create(fmt)); return (s?(s->vformat(args)):s); } #if !HAS_WCHAR #define NATIVE_CREATE(x) UTF8::create( x ); #ifdef LC_ALL #undef LC_ALL #endif #define LC_ALL 0 class GStringRep::ChangeLocale { public: ChangeLocale(const int,const char *) {} ~ChangeLocale() {}; }; GP GStringRep::NativeToUTF8( const char *s ) { return GStringRep::UTF8::create(s); } #else #define NATIVE_CREATE(x) Native::create( x ); // The declaration and implementation of GStringRep::ChangeLocale // Not used in WinCE class GStringRep::ChangeLocale { public: ChangeLocale(const int category,const char locale[]); ~ChangeLocale(); private: GUTF8String locale; int category; }; class GStringRep::Native : public GStringRep { public: // default constructor Native(void); // virtual destructor virtual ~Native(); // Other virtual methods. // Create an empty string. virtual GP blank(const unsigned int sz = 0) const; // Append a string. virtual GP append(const GP &s2) const; // Test if Native. virtual bool isNative(void) const; // Convert to Native. virtual GP toNative( const EscapeMode escape=UNKNOWN_ESCAPED) const; // Convert to UTF8. virtual GP toUTF8(const bool nothrow=false) const; // Convert to UTF8. virtual GP toThis( const GP &rep,const GP &) const; // Compare with #s2#. virtual int cmp(const GP &s2, const int len=(-1)) const; // Convert strings to numbers. virtual int toInt(void) const; virtual long toLong( const int pos, int &endpos, const int base=10) const; virtual unsigned long toULong( const int pos, int &endpos, const int base=10) const; virtual double toDouble( const int pos, int &endpos) const; // Create an empty string static GP create(const unsigned int sz = 0); // Create a strdup string. static GP create(const char *s); // Creates by appending to the current string // Creates with a concat operation. static GP create( const GP &s1,const GP &s2); static GP create( const GP &s1,const char *s2); static GP create( const char *s1, const GP &s2); static GP create(const char *s1,const char *s2); // Create with a strdup and substr operation. static GP create( const char *s,const int start,const int length=(-1)); static GP create( const uint16_t *s,const int start,const int length=(-1)); static GP create( const uint32_t *s,const int start,const int length=(-1)); // Create with an sprintf() static GP create_format(const char fmt[],...); static GP create(const char fmt[],va_list &args); virtual unsigned char *UCS4toString( const uint32_t w,unsigned char *ptr, mbstate_t *ps=0) const; // Tests if a string is legally encoded in the current character set. virtual bool is_valid(void) const; virtual int ncopy(wchar_t * const buf, const int buflen) const; friend class GBaseString; protected: // Return the next character and increment the source pointer. virtual uint32_t getValidUCS4(const char *&source) const; }; GP GStringRep::Native::create(const unsigned int sz) { return GStringRep::create(sz,(GStringRep::Native *)0); } // Create a strdup string. GP GStringRep::Native::create(const char *s) { GStringRep::Native dummy; return dummy.strdup(s); } GP GStringRep::Native::create(const GP &s1,const GP &s2) { GStringRep::Native dummy; return dummy.concat(s1,s2); } GP GStringRep::Native::create( const GP &s1,const char *s2) { GStringRep::Native dummy; return dummy.concat(s1,s2); } GP GStringRep::Native::create( const char *s1, const GP &s2) { GStringRep::Native dummy; return dummy.concat(s1,s2); } GP GStringRep::Native::create(const char *s1,const char *s2) { GStringRep::Native dummy; return dummy.concat(s1,s2); } GP GStringRep::Native::create( const char *s,const int start,const int length) { GStringRep::Native dummy; return dummy.substr(s,start,length); } GP GStringRep::Native::create( const uint16_t *s,const int start,const int length) { GStringRep::Native dummy; return dummy.substr(s,start,length); } GP GStringRep::Native::create( const uint32_t *s,const int start,const int length) { GStringRep::Native dummy; return dummy.substr(s,start,length); } GP GStringRep::Native::blank(const unsigned int sz) const { return GStringRep::create(sz,(GStringRep::Native *)0); } bool GStringRep::Native::isNative(void) const { return true; } GP GStringRep::Native::toThis( const GP &rep,const GP &) const { return rep?(rep->toNative(NOT_ESCAPED)):rep; } GP GStringRep::Native::create(const char fmt[],va_list &args) { const GP s(create(fmt)); return (s?(s->vformat(args)):s); } int GStringRep::Native::ncopy( wchar_t * const buf, const int buflen ) const { return toUTF8()->ncopy(buf,buflen); } GStringRep::ChangeLocale::ChangeLocale(const int xcategory, const char xlocale[] ) : category(xcategory) { #if DO_CHANGELOCALE // This is disabled under UNIX because // it does not play nice with MT. if(xlocale) { locale=setlocale(xcategory,0); if(locale.length() &&(locale!=xlocale)) { if(locale == setlocale(category,xlocale)) { locale.empty(); } } else { locale.empty(); } } #endif } GStringRep::ChangeLocale::~ChangeLocale() { #if DO_CHANGELOCALE if(locale.length()) { setlocale(category,(const char *)locale); } #endif } GNativeString & GNativeString::format(const char fmt[], ... ) { va_list args; va_start(args, fmt); return init(GStringRep::Native::create(fmt,args)); } // Gather the native implementations here. Not used in WinCE. GStringRep::Native::Native(void) {} GStringRep::Native::~Native() {} GP GStringRep::Native::append(const GP &s2) const { GP retval; if(s2) { if(s2->isUTF8()) { G_THROW( ERR_MSG("GStringRep.appendUTF8toNative") ); } retval=concat(data,s2->data); }else { retval=const_cast(this); } return retval; } GP GStringRep::Native::create_format(const char fmt[],...) { va_list args; va_start(args, fmt); return create(fmt,args); } unsigned char * GStringRep::Native::UCS4toString( const uint32_t w0,unsigned char *ptr, mbstate_t *ps) const { return UCS4toNative(w0,ptr,ps); } // Convert a UCS4 to a multibyte string in the value bytes. // The data pointed to by ptr should be long enough to contain // the results with a nill termination. (Normally 7 characters // is enough.) unsigned char * GStringRep::UCS4toNative(const uint32_t w0,unsigned char *ptr, mbstate_t *ps) { uint16_t w1; uint16_t w2=1; for(int count=(sizeof(wchar_t)==sizeof(w1)) ? UCS4toUTF16(w0,w1,w2) : 1; count; --count,w1=w2) { // wchar_t can be either UCS4 or UCS2 const wchar_t w=(sizeof(wchar_t) == sizeof(w1))?(wchar_t)w1:(wchar_t)w0; int i=wcrtomb((char *)ptr,w,ps); if(i<0) break; ptr[i]=0; ptr += i; } ptr[0]=0; return ptr; } GP GStringRep::Native::toNative(const EscapeMode escape) const { if(escape == UNKNOWN_ESCAPED) G_THROW( ERR_MSG("GStringRep.NativeToNative") ); return const_cast(this); } GP GStringRep::Native::toUTF8(const bool) const { unsigned char *buf; GPBuffer gbuf(buf,size*6+1); buf[0]=0; if(data && size) { size_t n=size; const char *source=data; mbstate_t ps; unsigned char *ptr=buf; //(void)mbrlen(source, n, &ps); memset(&ps,0,sizeof(mbstate_t)); int i=0; if(sizeof(wchar_t) == sizeof(uint32_t)) { wchar_t w = 0; for(;(n>0)&&((i=mbrtowc(&w,source,n,&ps))>=0); n-=i,source+=i) { ptr=UCS4toUTF8((uint32_t)w,ptr); } } else { wchar_t w = 0; for(;(n>0)&&((i=mbrtowc(&w,source,n,&ps))>=0);n-=i,source+=i) { uint16_t s[2]; s[0]=w; uint32_t w0; if(UTF16toUCS4(w0,s,s+1)<=0) { source+=i; n-=i; if((n>0)&&((i=mbrtowc(&w,source,n,&ps))>=0)) { s[1]=w; if(UTF16toUCS4(w0,s,s+2)<=0) { i=(-1); break; } } else { i=(-1); break; } } ptr=UCS4toUTF8(w0,ptr); } } if(i<0) { gbuf.resize(0); } else { ptr[0]=0; } } return GStringRep::UTF8::create((const char *)buf); } GNativeString GBaseString::UTF8ToNative( const bool currentlocale,const EscapeMode escape) const { const char *source=(*this); GP retval; if(source && source[0]) { #if DO_CHANGELOCALE GUTF8String lc_ctype(setlocale(LC_CTYPE,0)); bool repeat; for(repeat=!currentlocale;;repeat=false) { #endif retval=(*this)->toNative((GStringRep::EscapeMode)escape); #if DO_CHANGELOCALE if (!repeat || retval || (lc_ctype == setlocale(LC_CTYPE,""))) break; } if(!repeat) setlocale(LC_CTYPE,(const char *)lc_ctype); #endif } return GNativeString(retval); } /*MBCS*/ GNativeString GBaseString::getUTF82Native( EscapeMode escape ) const { //MBCS cvt GNativeString retval; // We don't want to convert this if it // already is known to be native... // if (isNative()) return *this; const size_t slen=length()+1; if(slen>1) { retval=UTF8ToNative(false,escape) ; if(!retval.length()) { retval=(const char*)*this; } } return retval; } GUTF8String GBaseString::NativeToUTF8(void) const { GP retval; if(length()) { const char *source=(*this); #if DO_CHANGELOCALE GUTF8String lc_ctype=setlocale(LC_CTYPE,0); bool repeat; for(repeat=true;;repeat=false) { #endif if( (retval=GStringRep::NativeToUTF8(source)) ) if(GStringRep::cmp(retval->toNative(),source)) retval=GStringRep::UTF8::create((unsigned int)0); #if DO_CHANGELOCALE if(!repeat || retval || (lc_ctype == setlocale(LC_CTYPE,""))) break; } if(!repeat) setlocale(LC_CTYPE,(const char *)lc_ctype); #endif } return GUTF8String(retval); } GUTF8String GBaseString::getNative2UTF8(void) const { //MBCS cvt // We don't want to do a transform this // if we already are in the given type. // if (isUTF8()) return *this; const size_t slen=length()+1; GUTF8String retval; if(slen > 1) { retval=NativeToUTF8(); if(!retval.length()) { retval=(const char *)(*this); } } return retval; } /*MBCS*/ int GStringRep::Native::cmp(const GP &s2,const int len) const { int retval; if(s2) { if(s2->isUTF8()) { const GP r(toUTF8(true)); if(r) { retval=GStringRep::cmp(r->data,s2->data,len); }else { retval=cmp(s2->toNative(NOT_ESCAPED),len); } }else { retval=GStringRep::cmp(data,s2->data,len); } }else { retval=GStringRep::cmp(data,0,len); } return retval; } int GStringRep::Native::toInt() const { return atoi(data); } long GStringRep::Native::toLong( const int pos, int &endpos, const int base) const { char *edata=0; const long retval=strtol(data+pos, &edata, base); if(edata) { endpos=(int)((size_t)edata-(size_t)data); }else { endpos=(-1); } return retval; } unsigned long GStringRep::Native::toULong( const int pos, int &endpos, const int base) const { char *edata=0; const unsigned long retval=strtoul(data+pos, &edata, base); if(edata) { endpos=(int)((size_t)edata-(size_t)data); }else { endpos=(-1); } return retval; } double GStringRep::Native::toDouble( const int pos, int &endpos) const { char *edata=0; const double retval=strtod(data+pos, &edata); if(edata) { endpos=(int)((size_t)edata-(size_t)data); }else { endpos=(-1); } return retval; } uint32_t GStringRep::Native::getValidUCS4(const char *&source) const { uint32_t retval=0; int n=(int)((size_t)size+(size_t)data-(size_t)source); if(source && (n > 0)) { mbstate_t ps; //(void)mbrlen(source, n, &ps); memset(&ps,0,sizeof(mbstate_t)); wchar_t wt; const int len=mbrtowc(&wt,source,n,&ps); if(len>=0) { if(sizeof(wchar_t) == sizeof(uint16_t)) { source+=len; uint16_t s[2]; s[0]=(uint16_t)wt; if(UTF16toUCS4(retval,s,s+1)<=0) { if((n-=len)>0) { const int len=mbrtowc(&wt,source,n,&ps); if(len>=0) { s[1]=(uint16_t)wt; uint32_t w; if(UTF16toUCS4(w,s,s+2)>0) { source+=len; retval=w; } } } } }else { retval=(uint32_t)wt; source++; } }else { source++; } } return retval; } // Tests if a string is legally encoded in the current character set. bool GStringRep::Native::is_valid(void) const { bool retval=true; if(data && size) { size_t n=size; const char *s=data; mbstate_t ps; //(void)mbrlen(s, n, &ps); memset(&ps,0,sizeof(mbstate_t)); do { size_t m=mbrlen(s,n,&ps); if(m > n) { retval=false; break; }else if(m) { s+=m; n-=m; }else { break; } } while(n); } return retval; } // These are dummy functions. void GStringRep::set_remainder(void const * const, const unsigned int, const EncodeType) {} void GStringRep::set_remainder(void const * const, const unsigned int, const GP &encoding) {} void GStringRep::set_remainder( const GP &) {} GP GStringRep::get_remainder( void ) const { return 0; } GNativeString::GNativeString(const char dat) { init(GStringRep::Native::create(&dat,0,1)); } GNativeString::GNativeString(const char *str) { init(GStringRep::Native::create(str)); } GNativeString::GNativeString(const unsigned char *str) { init(GStringRep::Native::create((const char *)str)); } GNativeString::GNativeString(const uint16_t *str) { init(GStringRep::Native::create(str,0,-1)); } GNativeString::GNativeString(const uint32_t *str) { init(GStringRep::Native::create(str,0,-1)); } GNativeString::GNativeString(const char *dat, unsigned int len) { init( GStringRep::Native::create(dat,0,((int)len<0)?(-1):(int)len)); } GNativeString::GNativeString(const uint16_t *dat, unsigned int len) { init( GStringRep::Native::create(dat,0,((int)len<0)?(-1):(int)len)); } GNativeString::GNativeString(const uint32_t *dat, unsigned int len) { init( GStringRep::Native::create(dat,0,((int)len<0)?(-1):(int)len)); } GNativeString::GNativeString(const GNativeString &str) { init(str); } GNativeString::GNativeString(const GBaseString &gs, int from, int len) { init( GStringRep::Native::create(gs,from,((int)len<0)?(-1):(int)len)); } GNativeString::GNativeString(const int number) { init(GStringRep::Native::create_format("%d",number)); } GNativeString::GNativeString(const double number) { init(GStringRep::Native::create_format("%f",number)); } GNativeString& GNativeString::operator= (const char str) { return init(GStringRep::Native::create(&str,0,1)); } GNativeString& GNativeString::operator= (const char *str) { return init(GStringRep::Native::create(str)); } GNativeString GBaseString::operator+(const GNativeString &s2) const { return GStringRep::Native::create(*this,s2); } GP GStringRep::NativeToUTF8( const char *s ) { return GStringRep::Native::create(s)->toUTF8(); } #endif // HAS_WCHAR template GP GStringRep::create(const unsigned int sz, TYPE *) { GP gaddr; if (sz > 0) { GStringRep *addr; gaddr=(addr=new TYPE); addr->data=(char *)(::operator new(sz+1)); addr->size = sz; addr->data[sz] = 0; } return gaddr; } GP GStringRep::strdup(const char *s) const { GP retval; const int length=s?strlen(s):0; if(length>0) { retval=blank(length); char const * const end=s+length; char *ptr=retval->data; for(;*s&&(s!=end);ptr++) { ptr[0]=s++[0]; } ptr[0]=0; } return retval; } GP GStringRep::substr(const char *s,const int start,const int len) const { GP retval; if(s && s[0]) { const unsigned int length=(start<0 || len<0)?(unsigned int)strlen(s):(unsigned int)(-1); const char *startptr, *endptr; if(start<0) { startptr=s+length+start; if(startptrstartptr) { retval=blank((size_t)(endptr-startptr)); char *data=retval->data; for(; (startptr GStringRep::substr(const uint16_t *s,const int start,const int len) const { GP retval; if(s && s[0]) { uint16_t const *eptr; if(len<0) { for(eptr=s;eptr[0];++eptr) EMPTY_LOOP; }else { eptr=&(s[len]); } s=&s[start]; if((size_t)s<(size_t)eptr) { mbstate_t ps; memset(&ps,0,sizeof(mbstate_t)); unsigned char *buf,*ptr; GPBuffer gbuf(buf,(((size_t)eptr-(size_t)s)/2)*3+7); for(ptr=buf;s[0];) { uint32_t w; int i=UTF16toUCS4(w,s,eptr); if(i<=0) break; s+=i; ptr=UCS4toString(w,ptr,&ps); } ptr[0]=0; retval = strdup( (const char *)buf ); } } return retval; } GP GStringRep::substr(const uint32_t *s,const int start,const int len) const { GP retval; if(s && s[0]) { uint32_t const *eptr; if(len<0) { for(eptr=s;eptr[0];++eptr) EMPTY_LOOP; }else { eptr=&(s[len]); } s=&s[start]; if((size_t)s<(size_t)eptr) { mbstate_t ps; memset(&ps,0,sizeof(mbstate_t)); unsigned char *buf,*ptr; GPBuffer gbuf(buf,((((size_t)eptr-(size_t)s))/4)*6+7); for(ptr=buf;s[0];++s) { ptr=UCS4toString(s[0],ptr,&ps); } ptr[0]=0; retval = strdup( (const char *)buf ); } } return retval; } GP GStringRep::append(const char *s2) const { GP retval; if(s2) { retval=concat(data,s2); }else { retval=const_cast(this); } return retval; } GP GStringRep::UTF8::append(const GP &s2) const { GP retval; if(s2) { if(s2->isNative()) { G_THROW( ERR_MSG("GStringRep.appendNativeToUTF8") ); } retval=concat(data,s2->data); }else { retval=const_cast(this); } return retval; } GP GStringRep::concat(const char *s1,const char *s2) const { const int length1=(s1?strlen(s1):0); const int length2=(s2?strlen(s2):0); const int length=length1+length2; GP retval; if(length>0) { retval=blank(length); GStringRep &r=*retval; if(length1) { strcpy(r.data,s1); if(length2) strcat(r.data,s2); }else { strcpy(r.data,s2); } } return retval; } const char *GBaseString::nullstr = ""; void GBaseString::empty( void ) { init(0); } GP GStringRep::getbuf(int n) const { GP retval; if(n< 0) n=strlen(data); if(n>0) { retval=blank(n); char *ndata=retval->data; strncpy(ndata,data,n); ndata[n]=0; } return retval; } const char * GStringRep::isCharType(bool (*xiswtest)(const unsigned long wc), const char *ptr, const bool reverse) const { const char *xptr = ptr; unsigned long w=getValidUCS4(xptr); if(ptr != xptr) { if (sizeof(wchar_t) == 2) w &= 0xffff; if (reverse ^ xiswtest(w)) ptr = xptr; } return ptr; } int GStringRep::nextCharType( bool (*xiswtest)(const unsigned long wc), const int from, const int len, const bool reverse) const { // We want to return the position of the next // non white space starting from the #from# // location. isspace should work in any locale // so we should only need to do this for the non- // native locales (UTF8) int retval; if(from GStringRep::tocase( bool (*xiswcase)(const unsigned long wc), unsigned long (*xtowcase)(const unsigned long wc)) const { GP retval; char const * const eptr=data+size; char const *ptr=data; while(ptr gbuf(buf,n+(1+size-n)*6); if(n>0) { strncpy((char *)buf,data,n); } unsigned char *buf_ptr=buf+n; for(char const *ptr=data+n;ptr(this); } return retval; } // Returns a copy of this string with characters used in XML escaped as follows: // '<' --> "<" // '>' --> ">" // '&' --> "&" // '\'' --> "'" // '\"' --> """ // Also escapes characters 0x00 through 0x1f and 0x7e through 0x7f. GP GStringRep::toEscaped( const bool tosevenbit ) const { bool modified=false; char *ret; GPBuffer gret(ret,size*7); ret[0]=0; char *retptr=ret; char const *start=data; char const *s=start; char const *last=s; GP special; for(unsigned long w;(w=getValidUCS4(s));last=s) // Whoever wrote this for statement should be __complete_here__ { char const *ss=0; switch(w) { case '<': ss="<"; break; case '>': ss=">"; break; case '&': ss="&"; break; case '\47': ss="'"; break; case '\42': ss="""; break; default: if((w<' ')||(w>=0x7e && (tosevenbit || (w < 0x80)))) { special=toThis(UTF8::create_format("&#%lu;",w)); ss=special->data; } break; } if(ss) { modified=true; if(s!=start) { size_t len=(size_t)last-(size_t)start; strncpy(retptr,start,len); retptr+=len; start=s; } if(ss[0]) { size_t len=strlen(ss); strcpy(retptr,ss); retptr+=len; } } } GP retval; if(modified) { strcpy(retptr,start); retval=strdup( ret ); }else { retval=const_cast(this); } // DEBUG_MSG( "Escaped string is '" << ret << "'\n" ); return retval; } static const GMap & BasicMap( void ) { static GMap Basic; if (! Basic.size()) { Basic["lt"] = GUTF8String('<'); Basic["gt"] = GUTF8String('>'); Basic["amp"] = GUTF8String('&'); Basic["apos"] = GUTF8String('\47'); Basic["quot"] = GUTF8String('\42'); } return Basic; } GUTF8String GUTF8String::fromEscaped( const GMap ConvMap ) const { GUTF8String ret; // Build output string here int start_locn = 0; // Beginning of substring to skip int amp_locn; // Location of a found ampersand while( (amp_locn = search( '&', start_locn )) > -1 ) { // Found the next apostrophe // Locate the closing semicolon const int semi_locn = search( ';', amp_locn ); // No closing semicolon, exit and copy // the rest of the string. if( semi_locn < 0 ) break; ret += substr( start_locn, amp_locn - start_locn ); int const len = semi_locn - amp_locn - 1; if(len) { GUTF8String key = substr( amp_locn+1, len); //DEBUG_MSG( "key = '" << key << "'\n" ); char const * s=key; if( s[0] == '#') { unsigned long value; char *ptr=0; if(s[1] == 'x' || s[1] == 'X') { value=strtoul((char const *)(s+2),&ptr,16); }else { value=strtoul((char const *)(s+1),&ptr,10); } if(ptr) { unsigned char utf8char[7]; unsigned char const * const end=GStringRep::UCS4toUTF8(value,utf8char); ret+=GUTF8String((char const *)utf8char,(size_t)end-(size_t)utf8char); }else { ret += substr( amp_locn, semi_locn - amp_locn + 1 ); } }else { GPosition map_entry = ConvMap.contains( key ); if( map_entry ) { // Found in the conversion map, substitute ret += ConvMap[map_entry]; } else { static const GMap &Basic = BasicMap(); GPosition map_entry = Basic.contains( key ); if ( map_entry ) { ret += Basic[map_entry]; }else { ret += substr( amp_locn, len+2 ); } } } }else { ret += substr( amp_locn, len+2 ); } start_locn = semi_locn + 1; // DEBUG_MSG( "ret = '" << ret << "'\n" ); } // Copy the end of the string to the output ret += substr( start_locn, length()-start_locn ); // DEBUG_MSG( "Unescaped string is '" << ret << "'\n" ); return (ret == *this)?(*this):ret; } GUTF8String GUTF8String::fromEscaped(void) const { const GMap nill; return fromEscaped(nill); } GP GStringRep::setat(int n, char ch) const { GP retval; if(n<0) n+=size; if (n < 0 || n>size) GBaseString::throw_illegal_subscript(); if(ch == data[n]) { retval=const_cast(this); }else if(!ch) { retval=getbuf(n); }else { retval=getbuf((ndata[n]=ch; if(n == size) retval->data[n+1]=0; } return retval; } #if defined(AUTOCONF) && defined(HAVE_VSNPRINTF) # define USE_VSNPRINTF vsnprintf #elif defined(_WIN32) && !defined(__CYGWIN32__) # define USE_VSNPRINTF _vsnprintf #elif defined(linux) # define USE_VSNPRINTF vsnprintf #endif GUTF8String & GUTF8String::format(const char fmt[], ... ) { va_list args; va_start(args, fmt); return init(GStringRep::UTF8::create(fmt,args)); } GP GStringRep::UTF8::create_format(const char fmt[],...) { va_list args; va_start(args, fmt); return create(fmt,args); } GP GStringRep::vformat(va_list args) const { GP retval; if(size) { char const * const fmt=data; int buflen=32768; char *buffer; GPBuffer gbuffer(buffer,buflen); ChangeLocale locale(LC_NUMERIC,(isNative()?0:"C")); // Format string #ifdef USE_VSNPRINTF while(USE_VSNPRINTF(buffer, buflen, fmt, args)<0) { gbuffer.resize(0); gbuffer.resize(buflen+32768); } va_end(args); #else buffer[buflen-1] = 0; vsprintf(buffer, fmt, args); va_end(args); if (buffer[buflen-1]) { // This isn't as fatal since it is on the stack, but we // definitely should stop the current operation. G_THROW( ERR_MSG("GString.overwrite") ); } #endif retval=strdup((const char *)buffer); } // Go altering the string return retval; } int GStringRep::search(char c, int from) const { if (from<0) from += size; int retval=(-1); if (from>=0 && from=0 && from=0) && (from= 0;++loc) retval=loc; return retval; } int GStringRep::contains(const char accept[],int from) const { if(from<0) { from+=size; if(from<0) G_THROW( ERR_MSG("GString.bad_subscript") ); } int retval=(-1); if (accept && accept[0] && from>=0 && from= 0) { retval=from++; } return retval; } bool GBaseString::is_int(void) const { bool isLong=!!ptr; if(isLong) { int endpos; (*this)->toLong(0,endpos); if(endpos>=0) { isLong=((*this)->nextNonSpace(endpos) == (int)length()); } } return isLong; } bool GBaseString::is_float(void) const { bool isDouble=!!ptr; if(isDouble) { int endpos; (*this)->toDouble(0,endpos); if(endpos>=0) { isDouble=((*this)->nextNonSpace(endpos) == (int)length()); } } return isDouble; } unsigned int hash(const GBaseString &str) { unsigned int x = 0; const char *s = (const char*)str; while (*s) x = x ^ (x<<6) ^ (unsigned char)(*s++); return x; } void GBaseString::throw_illegal_subscript() { G_THROW( ERR_MSG("GString.bad_subscript") ); } unsigned char * GStringRep::UTF8::UCS4toString( const uint32_t w0,unsigned char *ptr, mbstate_t *) const { return UCS4toUTF8(w0,ptr); } int GStringRep::UTF8::ncopy(wchar_t * const buf, const int buflen ) const { int retval=(-1); if(buf && buflen) { buf[0]=0; if(data[0]) { const size_t length=strlen(data); const unsigned char * const eptr=(const unsigned char *)(data+length); wchar_t *r=buf; wchar_t const * const rend=buf+buflen; for(const unsigned char *s=(const unsigned char *)data; (r GStringRep::UTF8::toNative(const EscapeMode escape) const { GP retval; if(data[0]) { const size_t length=strlen(data); const unsigned char * const eptr=(const unsigned char *)(data+length); unsigned char *buf; GPBuffer gbuf(buf,12*length+12); unsigned char *r=buf; mbstate_t ps; memset(&ps,0,sizeof(mbstate_t)); for(const unsigned char *s=(const unsigned char *)data;(s GStringRep::UTF8::toUTF8(const bool nothrow) const { if(!nothrow) G_THROW( ERR_MSG("GStringRep.UTF8ToUTF8") ); return const_cast(this); } // Tests if a string is legally encoded in the current character set. bool GStringRep::UTF8::is_valid(void) const { bool retval=true; if(data && size) { const unsigned char * const eptr=(const unsigned char *)(data+size); for(const unsigned char *s=(const unsigned char *)data;(s>6)|0xC0); *ptr++ = (unsigned char)((w|0x80)&0xBF); } else if(w <= 0xFFFF) { *ptr++ = (unsigned char)((w>>12)|0xE0); *ptr++ = (unsigned char)(((w>>6)|0x80)&0xBF); *ptr++ = (unsigned char)((w|0x80)&0xBF); } else if(w <= 0x1FFFFF) { *ptr++ = (unsigned char)((w>>18)|0xF0); *ptr++ = (unsigned char)(((w>>12)|0x80)&0xBF); *ptr++ = (unsigned char)(((w>>6)|0x80)&0xBF); *ptr++ = (unsigned char)((w|0x80)&0xBF); } else if(w <= 0x3FFFFFF) { *ptr++ = (unsigned char)((w>>24)|0xF8); *ptr++ = (unsigned char)(((w>>18)|0x80)&0xBF); *ptr++ = (unsigned char)(((w>>12)|0x80)&0xBF); *ptr++ = (unsigned char)(((w>>6)|0x80)&0xBF); *ptr++ = (unsigned char)((w|0x80)&0xBF); } else if(w <= 0x7FFFFFFF) { *ptr++ = (unsigned char)((w>>30)|0xFC); *ptr++ = (unsigned char)(((w>>24)|0x80)&0xBF); *ptr++ = (unsigned char)(((w>>18)|0x80)&0xBF); *ptr++ = (unsigned char)(((w>>12)|0x80)&0xBF); *ptr++ = (unsigned char)(((w>>6)|0x80)&0xBF); *ptr++ = (unsigned char)((w|0x80)&0xBF); } else { *ptr++ = '?'; } return ptr; } // Creates with a concat operation. GP GStringRep::concat( const char *s1, const GP &s2) const { GP retval; if(s2) { retval=toThis(s2); if(s1 && s1[0]) { if(retval) { retval=concat(s1,retval->data); }else { retval=strdup(s1); } } }else if(s1 && s1[0]) { retval=strdup(s1); } return retval; } // Creates with a concat operation. GP GStringRep::concat( const GP &s1,const char *s2) const { GP retval; if(s1) { retval=toThis(s1); if(s2 && s2[0]) { if(retval) { retval=retval->append(s2); }else { retval=strdup(s2); } } }else if(s2 && s2[0]) { retval=strdup(s2); } return retval; } GP GStringRep::concat(const GP &s1,const GP &s2) const { GP retval; if(s1) { retval=toThis(s1,s2); if(retval && s2) { retval=retval->append(toThis(s2)); } }else if(s2) { retval=toThis(s2); } return retval; } GStringRep::GStringRep(void) { size=0; data=0; } GStringRep::~GStringRep() { if(data) { data[0]=0; ::operator delete(data); } data=0; } GStringRep::UTF8::UTF8(void) {} GStringRep::UTF8::~UTF8() {} int GStringRep::cmp(const char *s1,const int len) const { return cmp(data,s1,len); } int GStringRep::cmp(const char *s1, const char *s2,const int len) { return (len ?((s1&&s1[0]) ?((s2&&s2[0]) ?((len>0) ?strncmp(s1,s2,len) :strcmp(s1,s2)) :1) :((s2&&s2[0])?(-1):0)) :0); } int GStringRep::cmp(const GP &s1, const GP &s2, const int len ) { return (s1?(s1->cmp(s2,len)):cmp(0,(s2?(s2->data):0),len)); } int GStringRep::cmp(const GP &s1, const char *s2, const int len ) { return cmp((s1?s1->data:0),s2,len); } int GStringRep::cmp(const char *s1, const GP &s2, const int len ) { return cmp(s1,(s2?(s2->data):0),len); } int GStringRep::UTF8::cmp(const GP &s2,const int len) const { int retval; if(s2) { if(s2->isNative()) { GP r(s2->toUTF8(true)); if(r) { retval=GStringRep::cmp(data,r->data,len); }else { retval=-(s2->cmp(toNative(NOT_ESCAPED),len)); } }else { retval=GStringRep::cmp(data,s2->data,len); } }else { retval=GStringRep::cmp(data,0,len); } return retval; } int GStringRep::UTF8::toInt() const { int endpos; return (int)toLong(0,endpos); } static inline long Cstrtol(char *data,char **edata, const int base) { GStringRep::ChangeLocale locale(LC_NUMERIC,"C"); while (data && *data==' ') data++; return strtol(data,edata,base); } long GStringRep::UTF8::toLong( const int pos, int &endpos, const int base) const { char *edata=0; long retval=Cstrtol(data+pos,&edata, base); if(edata) { endpos=edata-data; }else { GP ptr = GStringRep::UTF8::create(); endpos=(-1); ptr=ptr->strdup(data+pos); if(ptr) ptr=ptr->toNative(NOT_ESCAPED); if(ptr) { int xendpos; retval=ptr->toLong(0,xendpos,base); if(xendpos> 0) { endpos=(int)size; ptr=ptr->strdup(data+xendpos); if(ptr) { ptr=ptr->toUTF8(true); if(ptr) { endpos-=(int)(ptr->size); } } } } } return retval; } static inline unsigned long Cstrtoul(char *data,char **edata, const int base) { GStringRep::ChangeLocale locale(LC_NUMERIC,"C"); while (data && *data==' ') data++; return strtoul(data,edata,base); } unsigned long GStringRep::UTF8::toULong( const int pos, int &endpos, const int base) const { char *edata=0; unsigned long retval=Cstrtoul(data+pos,&edata, base); if(edata) { endpos=edata-data; }else { GP ptr = GStringRep::UTF8::create(); endpos=(-1); ptr=ptr->strdup(data+pos); if(ptr) ptr=ptr->toNative(NOT_ESCAPED); if(ptr) { int xendpos; retval=ptr->toULong(0,xendpos,base); if(xendpos> 0) { endpos=(int)size; ptr=ptr->strdup(data+xendpos); if(ptr) { ptr=ptr->toUTF8(true); if(ptr) { endpos-=(int)(ptr->size); } } } } } return retval; } static inline double Cstrtod(char *data,char **edata) { GStringRep::ChangeLocale locale(LC_NUMERIC,"C"); while (data && *data==' ') data++; return strtod(data,edata); } double GStringRep::UTF8::toDouble(const int pos, int &endpos) const { char *edata=0; double retval=Cstrtod(data+pos,&edata); if(edata) { endpos=edata-data; }else { GP ptr = GStringRep::UTF8::create(); endpos=(-1); ptr=ptr->strdup(data+pos); if(ptr) ptr=ptr->toNative(NOT_ESCAPED); if(ptr) { int xendpos; retval=ptr->toDouble(0,xendpos); if(xendpos >= 0) { endpos=(int)size; ptr=ptr->strdup(data+xendpos); if(ptr) { ptr=ptr->toUTF8(true); if(ptr) { endpos-=(int)(ptr->size); } } } } } return retval; } int GStringRep::getUCS4(uint32_t &w, const int from) const { int retval; if(from>=size) { w=0; retval=size; }else if(from<0) { w=(unsigned int)(-1); retval=(-1); }else { const char *source=data+from; w=getValidUCS4(source); retval=(int)((size_t)source-(size_t)data); } return retval; } uint32_t GStringRep::UTF8::getValidUCS4(const char *&source) const { return GStringRep::UTF8toUCS4((const unsigned char *&)source,data+size); } int GStringRep::nextNonSpace(const int from,const int len) const { return nextCharType(giswspace,from,len,true); } int GStringRep::nextSpace(const int from,const int len) const { return nextCharType(giswspace,from,len,false); } int GStringRep::nextChar(const int from) const { char const * xptr=data+from; (void)getValidUCS4(xptr); return (int)((size_t)xptr-(size_t)data); } int GStringRep::firstEndSpace(int from,const int len) const { const int xsize=(len<0)?size:(from+len); const int ysize=(size>10)&0x3ff)+0xD800); w2=(uint16_t)((w&0x3ff)+0xDC00); retval=2; } return retval; } int GStringRep::UTF16toUCS4( uint32_t &U,uint16_t const * const s,void const * const eptr) { int retval=0; U=0; uint16_t const * const r=s+1; if(r <= eptr) { uint32_t const W1=s[0]; if((W1<0xD800)||(W1>0xDFFF)) { if((U=W1)) { retval=1; } }else if(W1<=0xDBFF) { uint16_t const * const rr=r+1; if(rr <= eptr) { uint32_t const W2=s[1]; if(((W2>=0xDC00)||(W2<=0xDFFF))&&((U=(0x10000+((W1&0x3ff)<<10))|(W2&0x3ff)))) { retval=2; }else { retval=(-1); } } } } return retval; } //bcr GUTF8String& GUTF8String::operator+= (char ch) { return init( GStringRep::UTF8::create((const char*)*this, GStringRep::UTF8::create(&ch,0,1))); } GUTF8String& GUTF8String::operator+= (const char *str) { return init(GStringRep::UTF8::create(*this,str)); } GUTF8String& GUTF8String::operator+= (const GBaseString &str) { return init(GStringRep::UTF8::create(*this,str)); } GUTF8String GUTF8String::substr(int from, int len) const { return GUTF8String(*this, from, len); } GUTF8String GUTF8String::operator+(const GBaseString &s2) const { return GStringRep::UTF8::create(*this,s2); } GUTF8String GUTF8String::operator+(const GUTF8String &s2) const { return GStringRep::UTF8::create(*this,s2); } GUTF8String GUTF8String::operator+(const char *s2) const { return GStringRep::UTF8::create(*this,s2); } char * GUTF8String::getbuf(int n) { if(ptr) init((*this)->getbuf(n)); else if(n>0) init(GStringRep::UTF8::create(n)); else init(0); return ptr?((*this)->data):0; } void GUTF8String::setat(const int n, const char ch) { if((!n)&&(!ptr)) { init(GStringRep::UTF8::create(&ch,0,1)); }else { init((*this)->setat(CheckSubscript(n),ch)); } } GP GStringRep::UTF8ToNative( const char *s, const EscapeMode escape ) { return GStringRep::UTF8::create(s)->toNative(escape); } GUTF8String::GUTF8String(const char dat) { init(GStringRep::UTF8::create(&dat,0,1)); } GUTF8String::GUTF8String(const GUTF8String &fmt, va_list &args) { if (fmt.ptr) init(fmt->vformat(args)); else init(fmt); } GUTF8String::GUTF8String(const char *str) { init(GStringRep::UTF8::create(str)); } GUTF8String::GUTF8String(const unsigned char *str) { init(GStringRep::UTF8::create((const char *)str)); } GUTF8String::GUTF8String(const uint16_t *str) { init(GStringRep::UTF8::create(str,0,-1)); } GUTF8String::GUTF8String(const uint32_t *str) { init(GStringRep::UTF8::create(str,0,-1)); } GUTF8String::GUTF8String(const char *dat, unsigned int len) { init(GStringRep::UTF8::create(dat,0,((int)len<0)?(-1):(int)len)); } GUTF8String::GUTF8String(const uint16_t *dat, unsigned int len) { init(GStringRep::UTF8::create(dat,0,((int)len<0)?(-1):(int)len)); } GUTF8String::GUTF8String(const uint32_t *dat, unsigned int len) { init(GStringRep::UTF8::create(dat,0,((int)len<0)?(-1):(int)len)); } GUTF8String::GUTF8String(const GBaseString &gs, int from, int len) { init(GStringRep::UTF8::create(gs,from,((int)len<0)?(-1):(int)len)); } GUTF8String::GUTF8String(const int number) { init(GStringRep::UTF8::create_format("%d",number)); } GUTF8String::GUTF8String(const double number) { init(GStringRep::UTF8::create_format("%f",number)); } GUTF8String& GUTF8String::operator= (const char str) { return init(GStringRep::UTF8::create(&str,0,1)); } GUTF8String& GUTF8String::operator= (const char *str) { return init(GStringRep::UTF8::create(str)); } GUTF8String GBaseString::operator+(const GUTF8String &s2) const { return GStringRep::UTF8::create(*this,s2); } #if HAS_WCHAR GUTF8String GNativeString::operator+(const GUTF8String &s2) const { if (ptr) return GStringRep::UTF8::create((*this)->toUTF8(true),s2); else return GStringRep::UTF8::create((*this),s2); } #endif GUTF8String GUTF8String::operator+(const GNativeString &s2) const { GP g = s2; if (s2.ptr) g = s2->toUTF8(true); return GStringRep::UTF8::create(*this,g); } GUTF8String operator+(const char *s1, const GUTF8String &s2) { return GStringRep::UTF8::create(s1,s2); } #if HAS_WCHAR GNativeString operator+(const char *s1, const GNativeString &s2) { return GStringRep::Native::create(s1,s2); } GNativeString& GNativeString::operator+= (char ch) { char s[2]; s[0]=ch; s[1]=0; return init(GStringRep::Native::create((const char*)*this, s)); } GNativeString& GNativeString::operator+= (const char *str) { return init(GStringRep::Native::create(*this,str)); } GNativeString& GNativeString::operator+= (const GBaseString &str) { return init(GStringRep::Native::create(*this,str)); } GNativeString GNativeString::operator+(const GBaseString &s2) const { return GStringRep::Native::create(*this,s2); } GNativeString GNativeString::operator+(const GNativeString &s2) const { return GStringRep::Native::create(*this,s2); } GNativeString GNativeString::operator+(const char *s2) const { return GStringRep::Native::create(*this,s2); } char * GNativeString::getbuf(int n) { if(ptr) init((*this)->getbuf(n)); else if(n>0) init(GStringRep::Native::create(n)); else init(0); return ptr?((*this)->data):0; } void GNativeString::setat(const int n, const char ch) { if((!n)&&(!ptr)) { init(GStringRep::Native::create(&ch,0,1)); }else { init((*this)->setat(CheckSubscript(n),ch)); } } #endif #ifdef HAVE_NAMESPACES } # ifndef NOT_USING_DJVU_NAMESPACE using namespace DJVU; # endif #endif