convert_utf.cpp

Go to the documentation of this file.
00001 #ifndef CONVERT_UTF_IMPLEMENTATION_FILE
00002 #define CONVERT_UTF_IMPLEMENTATION_FILE
00003 
00004 /*****************************************************************************\
00005 *                                                                             *
00006 *  Name   : convert_utf                                                       *
00007 *  Author : Unicode, Inc. (C conversion functions)                            *
00008 *  Author : Chris Koeritz (C++ conversion classes)                            *
00009 *                                                                             *
00010 *******************************************************************************
00011 * Copyright (c) 2006-$now By Author.  This program is free software; you can  *
00012 * redistribute it and/or modify it under the terms of the GNU General Public  *
00013 * License as published by the Free Software Foundation; either version 2 of   *
00014 * the License or (at your option) any later version.  This is online at:      *
00015 *     http://www.fsf.org/copyleft/gpl.html                                    *
00016 * Please send any updates to: fred@gruntose.com                               *
00017 \*****************************************************************************/
00018 
00019 //copyright below is relevant to UTF conversion methods only.
00020 /*
00021  * Copyright 2001-$now Unicode, Inc.
00022  * 
00023  * Disclaimer
00024  * 
00025  * This source code is provided as is by Unicode, Inc. No claims are
00026  * made as to fitness for any particular purpose. No warranties of any
00027  * kind are expressed or implied. The recipient agrees to determine
00028  * applicability of information provided. If this file has been
00029  * purchased on magnetic or optical media from Unicode, Inc., the
00030  * sole remedy for any claim will be exchange of defective media
00031  * within 90 days of receipt.
00032  * 
00033  * Limitations on Rights to Redistribute This Code
00034  * 
00035  * Unicode, Inc. hereby grants the right to freely use the information
00036  * supplied in this file in the creation of products supporting the
00037  * Unicode Standard, and to make copies of this file in any form
00038  * for internal or external distribution as long as this notice
00039  * remains attached.
00040  */
00041 
00042 /* ---------------------------------------------------------------------
00043 
00044     Conversions between UTF32, UTF-16, and UTF-8. Source code file.
00045     Author: Mark E. Davis, 1994.
00046     Rev History: Rick McGowan, fixes & updates May 2001.
00047     Sept 2001: fixed const & error conditions per
00048         mods suggested by S. Parent & A. Lillich.
00049     June 2002: Tim Dodd added detection and handling of incomplete
00050         source sequences, enhanced error detection, added casts
00051         to eliminate compiler warnings.
00052     July 2003: slight mods to back out aggressive FFFE detection.
00053     Jan 2004: updated switches in from-UTF8 conversions.
00054     Oct 2004: updated to use UNI_MAX_LEGAL_UTF32 in UTF-32 conversions.
00055 
00056     See the header file "ConvertUTF.h" for complete documentation.
00057 
00058 ------------------------------------------------------------------------ */
00059 
00060 #include "convert_utf.h"
00061 #ifdef CVTUTF_DEBUG
00062   #include <stdio.h>
00063 #endif
00064 #ifdef __cplusplus
00065   #include "istring.h"
00066   #include <string.h>
00067   #include <wchar.h>
00068 #endif
00069 
00070 static const int halfShift  = 10; /* used for shifting by 10 bits */
00071 
00072 static const UTF32 halfBase = 0x0010000UL;
00073 static const UTF32 halfMask = 0x3FFUL;
00074 
00075 #define UNI_SUR_HIGH_START  (UTF32)0xD800
00076 #define UNI_SUR_HIGH_END    (UTF32)0xDBFF
00077 #define UNI_SUR_LOW_START   (UTF32)0xDC00
00078 #define UNI_SUR_LOW_END     (UTF32)0xDFFF
00079 
00080 /* --------------------------------------------------------------------- */
00081 
00082 ConversionResult ConvertUTF32toUTF16 (
00083   const UTF32** sourceStart, const UTF32* sourceEnd, 
00084   UTF16** targetStart, UTF16* targetEnd, ConversionFlags flags) {
00085     ConversionResult result = conversionOK;
00086     const UTF32* source = *sourceStart;
00087     UTF16* target = *targetStart;
00088     while (source < sourceEnd) {
00089   UTF32 ch;
00090   if (target >= targetEnd) {
00091       result = targetExhausted; break;
00092   }
00093   ch = *source++;
00094   if (ch <= UNI_MAX_BMP) { /* Target is a character <= 0xFFFF */
00095       /* UTF-16 surrogate values are illegal in UTF-32; 0xffff or 0xfffe are both reserved values */
00096       if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) {
00097     if (flags == strictConversion) {
00098         --source; /* return to the illegal value itself */
00099         result = sourceIllegal;
00100         break;
00101     } else {
00102         *target++ = UNI_REPLACEMENT_CHAR;
00103     }
00104       } else {
00105     *target++ = (UTF16)ch; /* normal case */
00106       }
00107   } else if (ch > UNI_MAX_LEGAL_UTF32) {
00108       if (flags == strictConversion) {
00109     result = sourceIllegal;
00110       } else {
00111     *target++ = UNI_REPLACEMENT_CHAR;
00112       }
00113   } else {
00114       /* target is a character in range 0xFFFF - 0x10FFFF. */
00115       if (target + 1 >= targetEnd) {
00116     --source; /* Back up source pointer! */
00117     result = targetExhausted; break;
00118       }
00119       ch -= halfBase;
00120       *target++ = (UTF16)((ch >> halfShift) + UNI_SUR_HIGH_START);
00121       *target++ = (UTF16)((ch & halfMask) + UNI_SUR_LOW_START);
00122   }
00123     }
00124     *sourceStart = source;
00125     *targetStart = target;
00126     return result;
00127 }
00128 
00129 /* --------------------------------------------------------------------- */
00130 
00131 ConversionResult ConvertUTF16toUTF32 (
00132   const UTF16** sourceStart, const UTF16* sourceEnd, 
00133   UTF32** targetStart, UTF32* targetEnd, ConversionFlags flags) {
00134     ConversionResult result = conversionOK;
00135     const UTF16* source = *sourceStart;
00136     UTF32* target = *targetStart;
00137     UTF32 ch, ch2;
00138     while (source < sourceEnd) {
00139   const UTF16* oldSource = source; /*  In case we have to back up because of target overflow. */
00140   ch = *source++;
00141   /* If we have a surrogate pair, convert to UTF32 first. */
00142   if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END) {
00143       /* If the 16 bits following the high surrogate are in the source buffer... */
00144       if (source < sourceEnd) {
00145     ch2 = *source;
00146     /* If it's a low surrogate, convert to UTF32. */
00147     if (ch2 >= UNI_SUR_LOW_START && ch2 <= UNI_SUR_LOW_END) {
00148         ch = ((ch - UNI_SUR_HIGH_START) << halfShift)
00149       + (ch2 - UNI_SUR_LOW_START) + halfBase;
00150         ++source;
00151     } else if (flags == strictConversion) { /* it's an unpaired high surrogate */
00152         --source; /* return to the illegal value itself */
00153         result = sourceIllegal;
00154         break;
00155     }
00156       } else { /* We don't have the 16 bits following the high surrogate. */
00157     --source; /* return to the high surrogate */
00158     result = sourceExhausted;
00159     break;
00160       }
00161   } else if (flags == strictConversion) {
00162       /* UTF-16 surrogate values are illegal in UTF-32 */
00163       if (ch >= UNI_SUR_LOW_START && ch <= UNI_SUR_LOW_END) {
00164     --source; /* return to the illegal value itself */
00165     result = sourceIllegal;
00166     break;
00167       }
00168   }
00169   if (target >= targetEnd) {
00170       source = oldSource; /* Back up source pointer! */
00171       result = targetExhausted; break;
00172   }
00173   *target++ = ch;
00174     }
00175     *sourceStart = source;
00176     *targetStart = target;
00177 #ifdef CVTUTF_DEBUG
00178 if (result == sourceIllegal) {
00179     fprintf(stderr, "ConvertUTF16toUTF32 illegal seq 0x%04x,%04x\n", ch, ch2);
00180     fflush(stderr);
00181 }
00182 #endif
00183     return result;
00184 }
00185 
00186 /* --------------------------------------------------------------------- */
00187 
/*
 * Lookup table keyed by the first byte of a UTF-8 sequence; the entry is the
 * number of trailing bytes expected to follow that byte.
 * Note that *legal* UTF-8 never has 4 or 5 trailing bytes. Those entries are
 * kept for anyone who wants to handle the longer forms that earlier
 * algorithms allowed.
 */
static const char trailingBytesForUTF8[256] = {
    /* 0x00 - 0x0F */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0x10 - 0x1F */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0x20 - 0x2F */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0x30 - 0x3F */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0x40 - 0x4F */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0x50 - 0x5F */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0x60 - 0x6F */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0x70 - 0x7F */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0x80 - 0x8F */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0x90 - 0x9F */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0xA0 - 0xAF */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0xB0 - 0xBF */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0xC0 - 0xCF */ 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    /* 0xD0 - 0xDF */ 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    /* 0xE0 - 0xEF */ 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
    /* 0xF0 - 0xFF */ 3,3,3,3,3,3,3,3,4,4,4,4,5,5,5,5
};
00205 
00206 /*
00207  * Magic values subtracted from a buffer value during UTF8 conversion.
00208  * This table contains as many values as there might be trailing bytes
00209  * in a UTF-8 sequence.
00210  */
00211 static const UTF32 offsetsFromUTF8[6] = { 0x00000000UL, 0x00003080UL, 0x000E2080UL, 
00212          0x03C82080UL, 0xFA082080UL, 0x82082080UL };
00213 
00214 /*
00215  * Once the bits are split out into bytes of UTF-8, this is a mask OR-ed
00216  * into the first byte, depending on how many bytes follow.  There are
00217  * as many entries in this table as there are UTF-8 sequence types.
00218  * (I.e., one byte sequence, two byte... etc.). Remember that sequencs
00219  * for *legal* UTF-8 will be 4 or fewer bytes total.
00220  */
00221 static const UTF8 firstByteMark[7] = { 0x00, 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC };
00222 
00223 /* --------------------------------------------------------------------- */
00224 
00225 /* The interface converts a whole buffer to avoid function-call overhead.
00226  * Constants have been gathered. Loops & conditionals have been removed as
00227  * much as possible for efficiency, in favor of drop-through switches.
00228  * (See "Note A" at the bottom of the file for equivalent code.)
00229  * If your compiler supports it, the "isLegalUTF8" call can be turned
00230  * into an inline function.
00231  */
00232 
00233 /* --------------------------------------------------------------------- */
00234 
00235 ConversionResult ConvertUTF16toUTF8 (
00236   const UTF16** sourceStart, const UTF16* sourceEnd, 
00237   UTF8** targetStart, UTF8* targetEnd, ConversionFlags flags) {
00238     ConversionResult result = conversionOK;
00239     const UTF16* source = *sourceStart;
00240     UTF8* target = *targetStart;
00241     while (source < sourceEnd) {
00242   UTF32 ch;
00243   unsigned short bytesToWrite = 0;
00244   const UTF32 byteMask = 0xBF;
00245   const UTF32 byteMark = 0x80; 
00246   const UTF16* oldSource = source; /* In case we have to back up because of target overflow. */
00247   ch = *source++;
00248   /* If we have a surrogate pair, convert to UTF32 first. */
00249   if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END) {
00250       /* If the 16 bits following the high surrogate are in the source buffer... */
00251       if (source < sourceEnd) {
00252     UTF32 ch2 = *source;
00253     /* If it's a low surrogate, convert to UTF32. */
00254     if (ch2 >= UNI_SUR_LOW_START && ch2 <= UNI_SUR_LOW_END) {
00255         ch = ((ch - UNI_SUR_HIGH_START) << halfShift)
00256       + (ch2 - UNI_SUR_LOW_START) + halfBase;
00257         ++source;
00258     } else if (flags == strictConversion) { /* it's an unpaired high surrogate */
00259         --source; /* return to the illegal value itself */
00260         result = sourceIllegal;
00261         break;
00262     }
00263       } else { /* We don't have the 16 bits following the high surrogate. */
00264     --source; /* return to the high surrogate */
00265     result = sourceExhausted;
00266     break;
00267       }
00268   } else if (flags == strictConversion) {
00269       /* UTF-16 surrogate values are illegal in UTF-32 */
00270       if (ch >= UNI_SUR_LOW_START && ch <= UNI_SUR_LOW_END) {
00271     --source; /* return to the illegal value itself */
00272     result = sourceIllegal;
00273     break;
00274       }
00275   }
00276   /* Figure out how many bytes the result will require */
00277   if (ch < (UTF32)0x80) {       bytesToWrite = 1;
00278   } else if (ch < (UTF32)0x800) {     bytesToWrite = 2;
00279   } else if (ch < (UTF32)0x10000) {   bytesToWrite = 3;
00280   } else if (ch < (UTF32)0x110000) {  bytesToWrite = 4;
00281   } else {          bytesToWrite = 3;
00282               ch = UNI_REPLACEMENT_CHAR;
00283   }
00284 
00285   target += bytesToWrite;
00286   if (target > targetEnd) {
00287       source = oldSource; /* Back up source pointer! */
00288       target -= bytesToWrite; result = targetExhausted; break;
00289   }
00290   switch (bytesToWrite) { /* note: everything falls through. */
00291       case 4: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6;
00292       case 3: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6;
00293       case 2: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6;
00294       case 1: *--target =  (UTF8)(ch | firstByteMark[bytesToWrite]);
00295   }
00296   target += bytesToWrite;
00297     }
00298     *sourceStart = source;
00299     *targetStart = target;
00300     return result;
00301 }
00302 
00303 /* --------------------------------------------------------------------- */
00304 
00305 /*
00306  * Utility routine to tell whether a sequence of bytes is legal UTF-8.
00307  * This must be called with the length pre-determined by the first byte.
00308  * If not calling this from ConvertUTF8to*, then the length can be set by:
00309  *  length = trailingBytesForUTF8[*source]+1;
00310  * and the sequence is illegal right away if there aren't that many bytes
00311  * available.
00312  * If presented with a length > 4, this returns false.  The Unicode
00313  * definition of UTF-8 goes up to 4-byte sequences.
00314  */
00315 
00316 static Booleano isLegalUTF8(const UTF8 *source, int length) {
00317     UTF8 a;
00318     const UTF8 *srcptr = source+length;
00319     switch (length) {
00320     default: return false;
00321   /* Everything else falls through when "true"... */
00322     case 4: if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return false;
00323     case 3: if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return false;
00324     case 2: if ((a = (*--srcptr)) > 0xBF) return false;
00325 
00326   switch (*source) {
00327       /* no fall-through in this inner switch */
00328       case 0xE0: if (a < 0xA0) return false; break;
00329       case 0xED: if (a > 0x9F) return false; break;
00330       case 0xF0: if (a < 0x90) return false; break;
00331       case 0xF4: if (a > 0x8F) return false; break;
00332       default:   if (a < 0x80) return false;
00333   }
00334 
00335     case 1: if (*source >= 0x80 && *source < 0xC2) return false;
00336     }
00337     if (*source > 0xF4) return false;
00338     return true;
00339 }
00340 
00341 /* --------------------------------------------------------------------- */
00342 
00343 /*
00344  * Exported function to return whether a UTF-8 sequence is legal or not.
00345  * This is not used here; it's just exported.
00346  */
00347 Booleano isLegalUTF8Sequence(const UTF8 *source, const UTF8 *sourceEnd) {
00348     int length = trailingBytesForUTF8[*source]+1;
00349     if (source+length > sourceEnd) {
00350   return false;
00351     }
00352     return isLegalUTF8(source, length);
00353 }
00354 
00355 /* --------------------------------------------------------------------- */
00356 
00357 ConversionResult ConvertUTF8toUTF16 (
00358   const UTF8** sourceStart, const UTF8* sourceEnd, 
00359   UTF16** targetStart, UTF16* targetEnd, ConversionFlags flags) {
00360     ConversionResult result = conversionOK;
00361     const UTF8* source = *sourceStart;
00362     UTF16* target = *targetStart;
00363     while (source < sourceEnd) {
00364   UTF32 ch = 0;
00365   unsigned short extraBytesToRead = trailingBytesForUTF8[*source];
00366   if (source + extraBytesToRead >= sourceEnd) {
00367       result = sourceExhausted; break;
00368   }
00369   /* Do this check whether lenient or strict */
00370   if (! isLegalUTF8(source, extraBytesToRead+1)) {
00371       result = sourceIllegal;
00372       break;
00373   }
00374   /*
00375    * The cases all fall through. See "Note A" below.
00376    */
00377   switch (extraBytesToRead) {
00378       case 5: ch += *source++; ch <<= 6; /* remember, illegal UTF-8 */
00379       case 4: ch += *source++; ch <<= 6; /* remember, illegal UTF-8 */
00380       case 3: ch += *source++; ch <<= 6;
00381       case 2: ch += *source++; ch <<= 6;
00382       case 1: ch += *source++; ch <<= 6;
00383       case 0: ch += *source++;
00384   }
00385   ch -= offsetsFromUTF8[extraBytesToRead];
00386 
00387   if (target >= targetEnd) {
00388       source -= (extraBytesToRead+1); /* Back up source pointer! */
00389       result = targetExhausted; break;
00390   }
00391   if (ch <= UNI_MAX_BMP) { /* Target is a character <= 0xFFFF */
00392       /* UTF-16 surrogate values are illegal in UTF-32 */
00393       if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) {
00394     if (flags == strictConversion) {
00395         source -= (extraBytesToRead+1); /* return to the illegal value itself */
00396         result = sourceIllegal;
00397         break;
00398     } else {
00399         *target++ = UNI_REPLACEMENT_CHAR;
00400     }
00401       } else {
00402     *target++ = (UTF16)ch; /* normal case */
00403       }
00404   } else if (ch > UNI_MAX_UTF16) {
00405       if (flags == strictConversion) {
00406     result = sourceIllegal;
00407     source -= (extraBytesToRead+1); /* return to the start */
00408     break; /* Bail out; shouldn't continue */
00409       } else {
00410     *target++ = UNI_REPLACEMENT_CHAR;
00411       }
00412   } else {
00413       /* target is a character in range 0xFFFF - 0x10FFFF. */
00414       if (target + 1 >= targetEnd) {
00415     source -= (extraBytesToRead+1); /* Back up source pointer! */
00416     result = targetExhausted; break;
00417       }
00418       ch -= halfBase;
00419       *target++ = (UTF16)((ch >> halfShift) + UNI_SUR_HIGH_START);
00420       *target++ = (UTF16)((ch & halfMask) + UNI_SUR_LOW_START);
00421   }
00422     }
00423     *sourceStart = source;
00424     *targetStart = target;
00425     return result;
00426 }
00427 
00428 /* --------------------------------------------------------------------- */
00429 
00430 ConversionResult ConvertUTF32toUTF8 (
00431   const UTF32** sourceStart, const UTF32* sourceEnd, 
00432   UTF8** targetStart, UTF8* targetEnd, ConversionFlags flags) {
00433     ConversionResult result = conversionOK;
00434     const UTF32* source = *sourceStart;
00435     UTF8* target = *targetStart;
00436     while (source < sourceEnd) {
00437   UTF32 ch;
00438   unsigned short bytesToWrite = 0;
00439   const UTF32 byteMask = 0xBF;
00440   const UTF32 byteMark = 0x80; 
00441   ch = *source++;
00442   if (flags == strictConversion ) {
00443       /* UTF-16 surrogate values are illegal in UTF-32 */
00444       if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) {
00445     --source; /* return to the illegal value itself */
00446     result = sourceIllegal;
00447     break;
00448       }
00449   }
00450   /*
00451    * Figure out how many bytes the result will require. Turn any
00452    * illegally large UTF32 things (> Plane 17) into replacement chars.
00453    */
00454   if (ch < (UTF32)0x80) {       bytesToWrite = 1;
00455   } else if (ch < (UTF32)0x800) {     bytesToWrite = 2;
00456   } else if (ch < (UTF32)0x10000) {   bytesToWrite = 3;
00457   } else if (ch <= UNI_MAX_LEGAL_UTF32) {  bytesToWrite = 4;
00458   } else {          bytesToWrite = 3;
00459               ch = UNI_REPLACEMENT_CHAR;
00460               result = sourceIllegal;
00461   }
00462   
00463   target += bytesToWrite;
00464   if (target > targetEnd) {
00465       --source; /* Back up source pointer! */
00466       target -= bytesToWrite; result = targetExhausted; break;
00467   }
00468   switch (bytesToWrite) { /* note: everything falls through. */
00469       case 4: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6;
00470       case 3: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6;
00471       case 2: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6;
00472       case 1: *--target = (UTF8) (ch | firstByteMark[bytesToWrite]);
00473   }
00474   target += bytesToWrite;
00475     }
00476     *sourceStart = source;
00477     *targetStart = target;
00478     return result;
00479 }
00480 
00481 /* --------------------------------------------------------------------- */
00482 
00483 ConversionResult ConvertUTF8toUTF32 (
00484   const UTF8** sourceStart, const UTF8* sourceEnd, 
00485   UTF32** targetStart, UTF32* targetEnd, ConversionFlags flags) {
00486     ConversionResult result = conversionOK;
00487     const UTF8* source = *sourceStart;
00488     UTF32* target = *targetStart;
00489     while (source < sourceEnd) {
00490   UTF32 ch = 0;
00491   unsigned short extraBytesToRead = trailingBytesForUTF8[*source];
00492   if (source + extraBytesToRead >= sourceEnd) {
00493       result = sourceExhausted; break;
00494   }
00495   /* Do this check whether lenient or strict */
00496   if (! isLegalUTF8(source, extraBytesToRead+1)) {
00497       result = sourceIllegal;
00498       break;
00499   }
00500   /*
00501    * The cases all fall through. See "Note A" below.
00502    */
00503   switch (extraBytesToRead) {
00504       case 5: ch += *source++; ch <<= 6;
00505       case 4: ch += *source++; ch <<= 6;
00506       case 3: ch += *source++; ch <<= 6;
00507       case 2: ch += *source++; ch <<= 6;
00508       case 1: ch += *source++; ch <<= 6;
00509       case 0: ch += *source++;
00510   }
00511   ch -= offsetsFromUTF8[extraBytesToRead];
00512 
00513   if (target >= targetEnd) {
00514       source -= (extraBytesToRead+1); /* Back up the source pointer! */
00515       result = targetExhausted; break;
00516   }
00517   if (ch <= UNI_MAX_LEGAL_UTF32) {
00518       /*
00519        * UTF-16 surrogate values are illegal in UTF-32, and anything
00520        * over Plane 17 (> 0x10FFFF) is illegal.
00521        */
00522       if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) {
00523     if (flags == strictConversion) {
00524         source -= (extraBytesToRead+1); /* return to the illegal value itself */
00525         result = sourceIllegal;
00526         break;
00527     } else {
00528         *target++ = UNI_REPLACEMENT_CHAR;
00529     }
00530       } else {
00531     *target++ = ch;
00532       }
00533   } else { /* i.e., ch > UNI_MAX_LEGAL_UTF32 */
00534       result = sourceIllegal;
00535       *target++ = UNI_REPLACEMENT_CHAR;
00536   }
00537     }
00538     *sourceStart = source;
00539     *targetStart = target;
00540     return result;
00541 }
00542 
00543 /* ---------------------------------------------------------------------
00544 
00545     Note A.
00546     The fall-through switches in UTF-8 reading code save a
00547     temp variable, some decrements & conditionals.  The switches
00548     are equivalent to the following loop:
00549   {
00550       int tmpBytesToRead = extraBytesToRead+1;
00551       do {
00552     ch += *source++;
00553     --tmpBytesToRead;
00554     if (tmpBytesToRead) ch <<= 6;
00555       } while (tmpBytesToRead > 0);
00556   }
00557     In UTF-8 writing code, the switches on "bytesToWrite" are
00558     similarly unrolled loops.
00559 
00560    --------------------------------------------------------------------- */
00561 
00563 
00564 #ifdef __cplusplus
00565 
00566 transcode_to_utf16::transcode_to_utf16(const char *utf8_input)
00567 : _orig_length(int(strlen(utf8_input)) + 1),
00568   _converted(new UTF16[_orig_length + 1])
00569     // we don't ever expect the string to get longer going to the larger data
00570     // type, so the current length should be enough.
00571 {
00572   memset((byte *)_converted, 0, 2 * _orig_length);
00573   // we use these temporary pointers since the converter resets the source
00574   // and target pointers to the end of the conversion.  the same pattern
00575   // is used in the code below.
00576   const UTF8 *temp_in = (const UTF8 *)utf8_input;
00577   UTF16 *temp_out = _converted;
00578   ConvertUTF8toUTF16(&temp_in, temp_in + _orig_length,
00579       &temp_out, temp_out + _orig_length, lenientConversion);
00580 }
00581 
00582 transcode_to_utf16::transcode_to_utf16(const istring &utf8_input)
00583 : _orig_length(utf8_input.length() + 1),
00584   _converted(new UTF16[_orig_length])
00585 {
00586   memset((byte *)_converted, 0, 2 * _orig_length);
00587   const UTF8 *temp_in = (const UTF8 *)utf8_input.observe();
00588   UTF16 *temp_out = _converted;
00589   ConvertUTF8toUTF16(&temp_in, temp_in + _orig_length,
00590       &temp_out, temp_out + _orig_length, lenientConversion);
00591 }
00592 
00593 transcode_to_utf16::~transcode_to_utf16()
00594 {
00595   delete [] _converted;
00596   _converted = NIL;
00597 }
00598 
00599 int transcode_to_utf16::length() const
00600 { return int(wcslen((wchar_t *)_converted)); }
00601 
00603 
00604 transcode_to_utf8::transcode_to_utf8(const UTF16 *utf16_input)
00605 : _orig_length(int(wcslen((const wchar_t *)utf16_input))),
00606   _new_length(_orig_length * 2 + _orig_length / 2),
00607     // this is just an estimate.  it may be appropriate most of the time.
00608     // whatever doesn't fit will get truncated.
00609   _converted(new UTF8[_new_length])
00610 {
00611   memset(_converted, 0, _new_length);
00612   const UTF16 *temp_in = (const UTF16 *)utf16_input;
00613   UTF8 *temp_out = _converted;
00614   ConvertUTF16toUTF8(&temp_in, temp_in + _orig_length,
00615       &temp_out, temp_out + _new_length, lenientConversion);
00616 }
00617 
00618 transcode_to_utf8::transcode_to_utf8(const wchar_t *utf16_input)
00619 : _orig_length(int(wcslen(utf16_input))),
00620   _new_length(_orig_length * 2 + _orig_length / 2),
00621     // this is just an estimate.  it may be appropriate most of the time.
00622     // whatever doesn't fit will get truncated.
00623   _converted(new UTF8[_new_length])
00624 {
00625   memset(_converted, 0, _new_length);
00626   const UTF16 *temp_in = (const UTF16 *)utf16_input;
00627   UTF8 *temp_out = _converted;
00628   ConvertUTF16toUTF8(&temp_in, temp_in + _orig_length,
00629       &temp_out, temp_out + _new_length, lenientConversion);
00630 }
00631 
00632 transcode_to_utf8::~transcode_to_utf8()
00633 {
00634   delete [] _converted;
00635   _converted = NIL;
00636 }
00637 
00638 int transcode_to_utf8::length() const
00639 { return int(strlen((char *)_converted)); }
00640 
00641 transcode_to_utf8::operator istring() const
00642 { return istring((char *)_converted); }
00643 
00645 
00646 null_transcoder::null_transcoder(const char *utf8_input, bool make_own_copy)
00647 : _make_own_copy(make_own_copy),
00648   _converted(make_own_copy? new UTF8[strlen(utf8_input) + 1]
00649       : (const UTF8 *)utf8_input)
00650 {
00651   if (_make_own_copy) {
00652     strcpy((char *)_converted, utf8_input);
00653   }
00654 }
00655 
00656 null_transcoder::null_transcoder(const istring &utf8_input, bool make_own_copy)
00657 : _make_own_copy(make_own_copy),
00658   _converted(make_own_copy? new UTF8[utf8_input.length() + 1]
00659       : (const UTF8 *)utf8_input.s())
00660 {
00661   if (_make_own_copy) {
00662     strcpy((char *)_converted, utf8_input.s());
00663   }
00664 }
00665 
00666 int null_transcoder::length() const
00667 { return int(strlen((char *)_converted)); }
00668 
00669 #endif //_cplusplus
00670 
00671 
00672 #endif //CONVERT_UTF_IMPLEMENTATION_FILE
00673 

Generated on Fri Nov 28 04:29:07 2008 for HOOPLE Libraries by  doxygen 1.5.1