#ifndef UTF_CONVERSION_GROUP
#define UTF_CONVERSION_GROUP

/*****************************************************************************\
*                                                                             *
*  Name   : convert_utf                                                       *
*  Author : Unicode, Inc. (C conversion functions)                            *
*  Author : Chris Koeritz (C++ conversion classes)                            *
*                                                                             *
*******************************************************************************
* Copyright (c) 2006-$now By Author.  This program is free software; you can  *
* redistribute it and/or modify it under the terms of the GNU General Public  *
* License as published by the Free Software Foundation; either version 2 of   *
* the License or (at your option) any later version.  This is online at:      *
*     http://www.fsf.org/copyleft/gpl.html                                    *
* Please send any updates to: fred@gruntose.com                               *
\*****************************************************************************/

// original copyright notice still applies to low-level conversion code:
/*
 * Copyright 2001-$now Unicode, Inc.
 *
 * Disclaimer
 *
 * This source code is provided as is by Unicode, Inc. No claims are
 * made as to fitness for any particular purpose. No warranties of any
 * kind are expressed or implied. The recipient agrees to determine
 * applicability of information provided. If this file has been
 * purchased on magnetic or optical media from Unicode, Inc., the
 * sole remedy for any claim will be exchange of defective media
 * within 90 days of receipt.
 *
 * Limitations on Rights to Redistribute This Code
 *
 * Unicode, Inc. hereby grants the right to freely use the information
 * supplied in this file in the creation of products supporting the
 * Unicode Standard, and to make copies of this file in any form
 * for internal or external distribution as long as this notice
 * remains attached.
 */

/* ---------------------------------------------------------------------

    Conversions between UTF-32, UTF-16, and UTF-8.  Header file.

    Several functions are included here, forming a complete set of
    conversions between the three formats.  UTF-7 is not included
    here, but is handled in a separate source file.

    Each of these routines takes pointers to input buffers and output
    buffers.  The input buffers are const.

    Each routine converts the text between *sourceStart and sourceEnd,
    putting the result into the buffer between *targetStart and
    targetEnd.  Note: the end pointers are *after* the last item: e.g.
    *(sourceEnd - 1) is the last item.

    The return result indicates whether the conversion was successful,
    and if not, whether the problem was in the source or target buffers.
    (Only the first encountered problem is indicated.)

    After the conversion, *sourceStart and *targetStart are both
    updated to point to the end of last text successfully converted in
    the respective buffers.

    Input parameters:
        sourceStart - pointer to a pointer to the source buffer.
                The contents of this are modified on return so that
                it points at the next thing to be converted.
        targetStart - similarly, pointer to pointer to the target buffer.
        sourceEnd, targetEnd - respectively pointers to the ends of the
                two buffers, for overflow checking only.

    These conversion functions take a ConversionFlags argument.  When this
    flag is set to strict, both irregular sequences and isolated surrogates
    will cause an error.  When the flag is set to lenient, both irregular
    sequences and isolated surrogates are converted.

    Whether the flag is strict or lenient, all illegal sequences will cause
    an error return.  This includes sequences such as: <F4 90 80 80>, <C0 80>,
    or <A0> in UTF-8, and values above 0x10FFFF in UTF-32.  Conformant code
    must check for illegal sequences.

    When the flag is set to lenient, characters over 0x10FFFF are converted
    to the replacement character; otherwise (when the flag is set to strict)
    they constitute an error.

    Output parameters:
        The value "sourceIllegal" is returned from some routines if the input
        sequence is malformed.  When "sourceIllegal" is returned, the source
        value will point to the illegal value that caused the problem.  E.g.,
        in UTF-8 when a sequence is malformed, it points to the start of the
        malformed sequence.

    Author: Mark E. Davis, 1994.
    Rev History: Rick McGowan, fixes & updates May 2001.
                 Fixes & updates, Sept 2001.

------------------------------------------------------------------------ */

/* ---------------------------------------------------------------------
    The following 4 definitions are compiler-specific.
    The C standard does not guarantee that wchar_t has at least
    16 bits, so wchar_t is no less portable than unsigned short!
    All should be unsigned values to avoid sign extension during
    bit mask & shift operations.
------------------------------------------------------------------------ */

/* NOTE(review): unsigned long satisfies "at least 32 bits" but may be wider
   (e.g. 64 bits on LP64 platforms), so a UTF32 buffer is not necessarily
   4 bytes per element.  Left unchanged to keep the existing interface. */
typedef unsigned long UTF32;    /* at least 32 bits */
typedef unsigned short UTF16;   /* at least 16 bits */
typedef unsigned char UTF8;     /* typically 8 bits */
typedef unsigned char Booleano; /* 0 or 1 */

/* Some fundamental constants.  Each expansion is fully parenthesized so the
   cast cannot be split from its operand by a neighbouring operator (for
   example, "sizeof UNI_MAX_BMP" would otherwise fail to parse). */
#define UNI_REPLACEMENT_CHAR ((UTF32)0x0000FFFD)
#define UNI_MAX_BMP ((UTF32)0x0000FFFF)
#define UNI_MAX_UTF16 ((UTF32)0x0010FFFF)
#define UNI_MAX_UTF32 ((UTF32)0x7FFFFFFF)
#define UNI_MAX_LEGAL_UTF32 ((UTF32)0x0010FFFF)

/* Outcome of a conversion call; only the first problem found is reported. */
typedef enum {
  conversionOK,    /* conversion successful */
  sourceExhausted, /* partial character in source, but hit end */
  targetExhausted, /* insuff. room in target for conversion */
  sourceIllegal    /* source sequence is illegal/malformed */
} ConversionResult;

/* Selects strict or lenient handling, as described in the overview above. */
typedef enum {
  strictConversion = 0,
  lenientConversion
} ConversionFlags;

/* This is for C++ and does no harm in C */
#ifdef __cplusplus

#include "definitions.h"

extern "C" {
#endif

/* All six converters follow the contract in the overview comment: text from
   *sourceStart up to (not including) sourceEnd is converted into the buffer
   from *targetStart up to targetEnd, and both start pointers are advanced
   past whatever was successfully converted. */

ConversionResult ConvertUTF8toUTF16(const UTF8 **sourceStart,
    const UTF8 *sourceEnd, UTF16 **targetStart, UTF16 *targetEnd,
    ConversionFlags flags);

ConversionResult ConvertUTF16toUTF8(const UTF16 **sourceStart,
    const UTF16 *sourceEnd, UTF8 **targetStart, UTF8 *targetEnd,
    ConversionFlags flags);

ConversionResult ConvertUTF8toUTF32(const UTF8 **sourceStart,
    const UTF8 *sourceEnd, UTF32 **targetStart, UTF32 *targetEnd,
    ConversionFlags flags);

ConversionResult ConvertUTF32toUTF8(const UTF32 **sourceStart,
    const UTF32 *sourceEnd, UTF8 **targetStart, UTF8 *targetEnd,
    ConversionFlags flags);

ConversionResult ConvertUTF16toUTF32(const UTF16 **sourceStart,
    const UTF16 *sourceEnd, UTF32 **targetStart, UTF32 *targetEnd,
    ConversionFlags flags);

ConversionResult ConvertUTF32toUTF16(const UTF32 **sourceStart,
    const UTF32 *sourceEnd, UTF16 **targetStart, UTF16 *targetEnd,
    ConversionFlags flags);

/* Judging by its name and signature, reports (as 0 or 1) whether the UTF-8
   data starting at source and bounded by sourceEnd is legal; the
   implementation is not visible from this header -- TODO confirm. */
Booleano isLegalUTF8Sequence(const UTF8 *source, const UTF8 *sourceEnd);

#ifdef __cplusplus
}  //extern
#endif  //cplusplus

#ifdef __cplusplus

// The following types and macros help to make it irrelevant what kind of
// win32 build is being done.  They will adapt as needed to provide the
// types used in system calls.  They are rendered harmless for other operating
// systems or for non-Unicode builds; this is especially useful for POSIX
// compliant functions that required Unicode in win32 but not in Unix systems.

#if defined(UNICODE)
  // builds an unnamed transcode_to_utf16 temporary from "s".
  #define to_unicode_temp(s) transcode_to_utf16(s)
  // builds an unnamed transcode_to_utf8 temporary from "s".
  #define from_unicode_temp(s) transcode_to_utf8(s)
  // declares a transcode_to_utf16 variable called "name", built from "s".
  #define to_unicode_persist(name, s) transcode_to_utf16 name(s)
  // declares a transcode_to_utf8 variable called "name", built from "s".
  #define from_unicode_persist(name, s) transcode_to_utf8 name(s)
#else
  // these versions of the macros simply defang any conversions.
  // the "temp" forms pass make_own_copy = false and the "persist" forms pass
  // make_own_copy = true to null_transcoder.
  #define to_unicode_temp(s) null_transcoder(s, false)
  #define from_unicode_temp(s) null_transcoder(s, false)
  #define to_unicode_persist(name, s) null_transcoder name(s, true)
  #define from_unicode_persist(name, s) null_transcoder name(s, true)
#endif

#ifdef _MSC_VER
  // hands "s" to TRACE with a "%s" format after passing it through
  // to_unicode_temp.
  #define TRACE_PRINT(s) TRACE(_T("%s"), to_unicode_temp(s))
#endif

// The next two classes support converting a UTF-8 string into a UTF-16
// string and vice-versa.  They hold onto the converted string and provide
// operators that return it.

// holds a UTF-16 rendering of a UTF-8 input string.
// NOTE(review): the class keeps a raw pointer and declares a destructor but
// no copy constructor or assignment operator; if the destructor frees
// _converted (not visible here), copying an instance would be unsafe.
class transcode_to_utf16
{
public:
  transcode_to_utf16(const char *utf8_input);
    // constructs the object from a plain character string in "utf8_input".

  transcode_to_utf16(const istring &utf8_input);
    // constructs the object from the istring "utf8_input".

  ~transcode_to_utf16();

  int length() const;
    // reports a length for the held string; the units are not visible from
    // this header -- TODO confirm against the implementation.

  // accessors that expose the held buffer, as UTF16 or as flexichar.
  operator const UTF16 * () const { return _converted; }
  operator UTF16 * () { return _converted; }
  operator const flexichar * () const { return (const flexichar *)_converted; }
  operator flexichar * () { return (flexichar *)_converted; }

private:
  int _orig_length;
  UTF16 *_converted;
};

// holds a UTF-8 rendering of a UTF-16 input string.
// NOTE(review): same copy caveat as transcode_to_utf16.
class transcode_to_utf8
{
public:
  transcode_to_utf8(const UTF16 *utf16_input);
    // constructs the object from the UTF16 string in "utf16_input".

  transcode_to_utf8(const wchar_t *utf16_input);
    // constructs the object from the wide character string "utf16_input".

  ~transcode_to_utf8();

  int length() const;
    // reports a length for the held string; the units are not visible from
    // this header -- TODO confirm against the implementation.

  // accessors that expose the held buffer.
  operator const UTF8 * () const { return _converted; }
  operator UTF8 * () { return _converted; }

  operator istring() const;
    // provides the held string as an istring.

private:
  int _orig_length;
  int _new_length;
  UTF8 *_converted;
};

// stand-in used by the non-UNICODE macros above; it offers the same shape as
// the real transcoders but exposes its string as plain char pointers.
// NOTE(review): when _make_own_copy is true the destructor deletes
// _converted, and no copy constructor or assignment operator is declared, so
// copying such an instance would delete the same buffer twice.
class null_transcoder
{
public:
  null_transcoder(const char *utf8_input, bool make_own_copy);
  null_transcoder(const istring &utf8_input, bool make_own_copy);
  ~null_transcoder() {
    // the buffer is only released when this object was asked to own a copy.
    if (_make_own_copy) delete [] _converted;
    _converted = NIL;
  }

  int length() const;

  // NOTE(review): this accessor casts away the const on _converted.
  operator char * () { return (char *)_converted; }
  operator const char * () const { return (const char *)_converted; }

private:
  bool _make_own_copy;
  const UTF8 *_converted;
};

#endif  //cplusplus

#endif  // outer guard.
1.5.1