convert_utf.h

Go to the documentation of this file.
00001 #ifndef UTF_CONVERSION_GROUP
00002 #define UTF_CONVERSION_GROUP
00003 
00004 /*****************************************************************************\
00005 *                                                                             *
00006 *  Name   : convert_utf                                                       *
00007 *  Author : Unicode, Inc. (C conversion functions)                            *
00008 *  Author : Chris Koeritz (C++ conversion classes)                            *
00009 *                                                                             *
00010 *******************************************************************************
00011 * Copyright (c) 2006-$now By Author.  This program is free software; you can  *
00012 * redistribute it and/or modify it under the terms of the GNU General Public  *
00013 * License as published by the Free Software Foundation; either version 2 of   *
00014 * the License or (at your option) any later version.  This is online at:      *
00015 *     http://www.fsf.org/copyleft/gpl.html                                    *
00016 * Please send any updates to: fred@gruntose.com                               *
00017 \*****************************************************************************/
00018 
00019 // original copyright notice still applies to low-level conversion code:
00020 /*
00021  * Copyright 2001-$now Unicode, Inc.
00022  * 
00023  * Disclaimer
00024  * 
00025  * This source code is provided as is by Unicode, Inc. No claims are
00026  * made as to fitness for any particular purpose. No warranties of any
00027  * kind are expressed or implied. The recipient agrees to determine
00028  * applicability of information provided. If this file has been
00029  * purchased on magnetic or optical media from Unicode, Inc., the
00030  * sole remedy for any claim will be exchange of defective media
00031  * within 90 days of receipt.
00032  * 
00033  * Limitations on Rights to Redistribute This Code
00034  * 
00035  * Unicode, Inc. hereby grants the right to freely use the information
00036  * supplied in this file in the creation of products supporting the
00037  * Unicode Standard, and to make copies of this file in any form
00038  * for internal or external distribution as long as this notice
00039  * remains attached.
00040  */
00041 
00043 
00049 /* ---------------------------------------------------------------------
00050 
00051     Conversions between UTF32, UTF-16, and UTF-8.  Header file.
00052 
00053     Several functions are included here, forming a complete set of
00054     conversions between the three formats.  UTF-7 is not included
00055     here, but is handled in a separate source file.
00056 
00057     Each of these routines takes pointers to input buffers and output
00058     buffers.  The input buffers are const.
00059 
00060     Each routine converts the text between *sourceStart and sourceEnd,
00061     putting the result into the buffer between *targetStart and
00062     targetEnd. Note: the end pointers are *after* the last item: e.g. 
00063     *(sourceEnd - 1) is the last item.
00064 
00065     The return result indicates whether the conversion was successful,
00066     and if not, whether the problem was in the source or target buffers.
00067     (Only the first encountered problem is indicated.)
00068 
00069     After the conversion, *sourceStart and *targetStart are both
00070     updated to point to the end of the last text successfully converted in
00071     the respective buffers.
00072 
00073     Input parameters:
00074         sourceStart - pointer to a pointer to the source buffer.
00075                 The contents of this are modified on return so that
00076                 it points at the next thing to be converted.
00077         targetStart - similarly, pointer to pointer to the target buffer.
00078         sourceEnd, targetEnd - respectively pointers to the ends of the
00079                 two buffers, for overflow checking only.
00080 
00081     These conversion functions take a ConversionFlags argument. When this
00082     flag is set to strict, both irregular sequences and isolated surrogates
00083     will cause an error.  When the flag is set to lenient, both irregular
00084     sequences and isolated surrogates are converted.
00085 
00086     Whether the flag is strict or lenient, all illegal sequences will cause
00087     an error return. This includes sequences such as: <F4 90 80 80>, <C0 80>,
00088     or <A0> in UTF-8, and values above 0x10FFFF in UTF-32. Conformant code
00089     must check for illegal sequences.
00090 
00091     When the flag is set to lenient, characters over 0x10FFFF are converted
00092     to the replacement character; otherwise (when the flag is set to strict)
00093     they constitute an error.
00094 
00095     Output parameters:
00096         The value "sourceIllegal" is returned from some routines if the input
00097         sequence is malformed.  When "sourceIllegal" is returned, the source
00098         value will point to the illegal value that caused the problem. E.g.,
00099         in UTF-8 when a sequence is malformed, it points to the start of the
00100         malformed sequence.  
00101 
00102     Author: Mark E. Davis, 1994.
00103     Rev History: Rick McGowan, fixes & updates May 2001.
00104         Fixes & updates, Sept 2001.
00105 
00106 ------------------------------------------------------------------------ */
00107 
00108 /* ---------------------------------------------------------------------
00109     The following 4 definitions are compiler-specific.
00110     The C standard does not guarantee that wchar_t has at least
00111     16 bits, so wchar_t is no less portable than unsigned short!
00112     All should be unsigned values to avoid sign extension during
00113     bit mask & shift operations.
00114 ------------------------------------------------------------------------ */
00115 
/* Code-unit and truth-value types shared by every conversion routine declared
 * in this header.  All are unsigned so that bit masks and shifts never
 * sign-extend. */
00116 typedef unsigned long UTF32;  /* at least 32 bits; NOTE(review): unsigned long may be wider (e.g. 64 bits on LP64 targets) -- confirm callers do not assume exactly 4 bytes */
00117 typedef unsigned short UTF16;  /* at least 16 bits */
00118 typedef unsigned char UTF8;  /* typically 8 bits */
00119 typedef unsigned char Booleano;  /* 0 or 1; the return type of isLegalUTF8Sequence */
00120 
00121 /* Some fundamental constants */
/* Code-point limits and the substitution character, each typed as UTF32.
 * NOTE(review): every expansion is a bare cast expression with no outer
 * parentheses; parenthesize at the use site when combining it with an
 * operator that binds tighter than a cast. */
00122 #define UNI_REPLACEMENT_CHAR (UTF32)0x0000FFFD  /* substituted for values over 0x10FFFF in lenient mode */
00123 #define UNI_MAX_BMP (UTF32)0x0000FFFF
00124 #define UNI_MAX_UTF16 (UTF32)0x0010FFFF
00125 #define UNI_MAX_UTF32 (UTF32)0x7FFFFFFF
00126 #define UNI_MAX_LEGAL_UTF32 (UTF32)0x0010FFFF  /* UTF-32 values above this are illegal */
00127 
/* Outcome of a Convert* call.  Only the first problem encountered is
 * reported. */
00128 typedef enum {
00129   conversionOK,     /* conversion successful */
00130   sourceExhausted,  /* partial character in source, but hit end */
00131   targetExhausted,  /* insuff. room in target for conversion */
00132   sourceIllegal  /* source sequence is illegal/malformed; the source pointer is left at the offending value */
00133 } ConversionResult;
00134 
/* Selects how irregular sequences and isolated surrogates are treated:
 * strictConversion makes them an error, lenientConversion converts them.
 * Illegal sequences are an error return under either setting. */
00135 typedef enum {
00136   strictConversion = 0,
00137   lenientConversion
00138 } ConversionFlags;
00139 
00140 /* C++ only: include definitions.h and give the routines below C linkage.  Skipped entirely when compiled as C. */
00141 #ifdef __cplusplus
00142 
00143 #include "definitions.h"
00144 
00145 extern "C" {
00146 #endif
00147 
/* Conversion entry points.  Each one converts the text from *sourceStart up
 * to (but not including) sourceEnd into the buffer from *targetStart up to
 * targetEnd.  On return *sourceStart and *targetStart have been advanced to
 * the end of the last text successfully converted, and the ConversionResult
 * names the first problem encountered, if any.  flags selects strict or
 * lenient handling of irregular sequences and isolated surrogates. */
00148 ConversionResult ConvertUTF8toUTF16 (const UTF8** sourceStart,
00149     const UTF8* sourceEnd, UTF16** targetStart, UTF16* targetEnd,
00150     ConversionFlags flags);
00151 
00152 ConversionResult ConvertUTF16toUTF8 (const UTF16** sourceStart,
00153     const UTF16* sourceEnd, UTF8** targetStart, UTF8* targetEnd,
00154     ConversionFlags flags);
00155 
00156 ConversionResult ConvertUTF8toUTF32 (const UTF8** sourceStart,
00157     const UTF8* sourceEnd, UTF32** targetStart, UTF32* targetEnd,
00158     ConversionFlags flags);
00159 
00160 ConversionResult ConvertUTF32toUTF8 (const UTF32** sourceStart,
00161     const UTF32* sourceEnd, UTF8** targetStart, UTF8* targetEnd,
00162     ConversionFlags flags);
00163 
00164 ConversionResult ConvertUTF16toUTF32 (const UTF16** sourceStart,
00165     const UTF16* sourceEnd, UTF32** targetStart, UTF32* targetEnd,
00166     ConversionFlags flags);
00167 
00168 ConversionResult ConvertUTF32toUTF16 (const UTF32** sourceStart,
00169     const UTF32* sourceEnd, UTF16** targetStart, UTF16* targetEnd,
00170     ConversionFlags flags);
00171 
/* NOTE(review): this header gives no description of the routine below.
 * Presumably it reports (as 0 or 1) whether the bytes from source up to
 * sourceEnd form a legal UTF-8 sequence -- confirm against the
 * implementation. */
00172 Booleano isLegalUTF8Sequence(const UTF8 *source, const UTF8 *sourceEnd);
00173 
00174 #ifdef __cplusplus
00175 } //extern
00176 #endif //cplusplus
00177 
00179 
00180 #ifdef __cplusplus
00181 
00182 // The following types and macros help to make it irrelevant what kind of
00183 // win32 build is being done.  They will adapt as needed to provide the
00184 // types used in system calls.  They are rendered harmless for other operating
00185 // systems or for non-Unicode builds; this is especially useful for POSIX
00186 // compliant functions that require Unicode in win32 but not in Unix systems.
00187 
// Transcoding macros.  The *_temp forms expand to an unnamed transcoder
// object built from s; the *_persist forms declare a transcoder variable
// called `name` built from s.  With UNICODE defined they use
// transcode_to_utf16 / transcode_to_utf8; otherwise all four use
// null_transcoder.
00188 #if defined(UNICODE)
00191 
00194   #define to_unicode_temp(s) transcode_to_utf16(s)
00196 
00198   #define from_unicode_temp(s) transcode_to_utf8(s)
00200 
00203   #define to_unicode_persist(name, s) transcode_to_utf16 name(s)
00205 
00206   #define from_unicode_persist(name, s) transcode_to_utf8 name(s)
00207 #else
00208   // these versions of the macros simply defang any conversions.
  // the temp forms pass make_own_copy = false, the persist forms pass true.
00209   #define to_unicode_temp(s) null_transcoder(s, false)
00210   #define from_unicode_temp(s) null_transcoder(s, false)
00211   #define to_unicode_persist(name, s) null_transcoder name(s, true)
00212   #define from_unicode_persist(name, s) null_transcoder name(s, true) 
00213 #endif
00214 
// Defined only when _MSC_VER is set: passes s through to_unicode_temp and
// hands the result to TRACE with a "%s" format.
00215 #ifdef _MSC_VER
00217   #define TRACE_PRINT(s) TRACE(_T("%s"), to_unicode_temp(s))
00218 #endif
00219 
00221 
00222 // The next two classes support converting a UTF-8 string into a UTF-16
00223 // string and vice-versa.  They hold onto the converted string and provide
00224 // operators that return it.
00225 
00227 
// Holds on to a UTF-16 rendering of a UTF-8 input string and hands it out
// through conversion operators.  Only the operators are defined inline here;
// the constructors, destructor and length() are implemented elsewhere.
// NOTE(review): the class keeps a raw pointer and declares a destructor but
// no copy constructor or assignment operator -- confirm that instances are
// never copied, or that copying is safe given the implementation.
00228 class transcode_to_utf16
00229 {
00230 public:
00231   transcode_to_utf16(const char *utf8_input);  // builds from a C string of UTF-8 text
00233 
00236   transcode_to_utf16(const istring &utf8_input);  // builds from an istring of UTF-8 text
00238 
00239   ~transcode_to_utf16();
00240 
00241   int length() const;  // NOTE(review): which length is reported is not visible here -- confirm
00243 
00244   operator const UTF16 * () const { return _converted; }  // read-only access to the held buffer
00246   operator UTF16 * () { return _converted; }  // writable access to the same buffer
00248   operator const flexichar * () const { return (const flexichar *)_converted; }  // same buffer, cast to flexichar
00250   operator flexichar * () { return (flexichar *)_converted; }  // writable flexichar view of the same buffer
00252 
00253 private:
00254   int _orig_length;  // NOTE(review): presumably the length of the original input -- confirm
00255   UTF16 *_converted;  // buffer returned by the conversion operators above
00256 };
00257 
00259 
00261 
// Holds on to a UTF-8 rendering of a UTF-16 input string and hands it out
// through conversion operators.  Only the pointer operators are defined
// inline here; everything else is implemented elsewhere.
// NOTE(review): the class keeps a raw pointer and declares a destructor but
// no copy constructor or assignment operator -- confirm that instances are
// never copied, or that copying is safe given the implementation.
00262 class transcode_to_utf8
00263 {
00264 public:
00265   transcode_to_utf8(const UTF16 *utf16_input);  // builds from a UTF16 string
00267 
00270   transcode_to_utf8(const wchar_t *utf16_input);  // builds from a wchar_t string
00272 
00273   ~transcode_to_utf8();
00274 
00275   int length() const;  // NOTE(review): whether this is _orig_length or _new_length is not visible here -- confirm
00277 
00278   operator const UTF8 * () const { return _converted; }  // read-only access to the held buffer
00280   operator UTF8 * () { return _converted; }  // writable access to the same buffer
00282 
00283   operator istring() const;  // conversion to istring; defined outside this header
00285 
00286 private:
00287   int _orig_length;  // NOTE(review): presumably the length of the original input -- confirm
00288   int _new_length;  // NOTE(review): presumably the length of the converted text -- confirm
00289   UTF8 *_converted;  // buffer returned by the pointer operators above
00290 };
00291 
00293 
00295 
// Stand-in transcoder used by the macro variants for builds without UNICODE:
// its operators expose the stored pointer as char* with a plain cast.  The
// constructors and length() are implemented elsewhere.
// NOTE(review): no copy constructor or assignment operator is declared; if an
// instance with _make_own_copy set is copied, both copies would delete the
// same buffer -- confirm instances are never copied.
00296 class null_transcoder
00297 {
00298 public:
00300   null_transcoder(const char *utf8_input, bool make_own_copy);
00302   null_transcoder(const istring &utf8_input, bool make_own_copy);
00303   ~null_transcoder() {
00304     if (_make_own_copy) delete [] _converted;  // released only when flagged as our own copy; NOTE(review): assumes the constructors allocate with new[] in that case
00305     _converted = NIL;
00306   }
00307 
00308   int length() const;
00309   operator char * () { return (char *)_converted; }  // casts away const; NOTE(review): confirm callers never write through this when no copy was made
00310   operator const char * () const { return (const char *)_converted; }
00311 
00312 private:
00313   bool _make_own_copy;  // checked by the destructor before deleting _converted
00314   const UTF8 *_converted;  // pointer returned by the operators above
00315 };
00316 
00317 #endif //cplusplus
00318 
00319 #endif // outer guard.
00320 

Generated on Sat Oct 11 04:28:37 2008 for HOOPLE Libraries by  doxygen 1.5.1