00001 #ifndef CONVERT_UTF_IMPLEMENTATION_FILE
00002 #define CONVERT_UTF_IMPLEMENTATION_FILE
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024
00025
00026
00027
00028
00029
00030
00031
00032
00033
00034
00035
00036
00037
00038
00039
00040
00041
00042
00043
00044
00045
00046
00047
00048
00049
00050
00051
00052
00053
00054
00055
00056
00057
00058
00059
00060 #include "convert_utf.h"
00061 #ifdef CVTUTF_DEBUG
00062 #include <stdio.h>
00063 #endif
00064 #ifdef __cplusplus
00065 #include "istring.h"
00066 #include <string.h>
00067 #include <wchar.h>
00068 #endif
00069
00070 static const int halfShift = 10;
00071
00072 static const UTF32 halfBase = 0x0010000UL;
00073 static const UTF32 halfMask = 0x3FFUL;
00074
00075 #define UNI_SUR_HIGH_START (UTF32)0xD800
00076 #define UNI_SUR_HIGH_END (UTF32)0xDBFF
00077 #define UNI_SUR_LOW_START (UTF32)0xDC00
00078 #define UNI_SUR_LOW_END (UTF32)0xDFFF
00079
00080
00081
00082 ConversionResult ConvertUTF32toUTF16 (
00083 const UTF32** sourceStart, const UTF32* sourceEnd,
00084 UTF16** targetStart, UTF16* targetEnd, ConversionFlags flags) {
00085 ConversionResult result = conversionOK;
00086 const UTF32* source = *sourceStart;
00087 UTF16* target = *targetStart;
00088 while (source < sourceEnd) {
00089 UTF32 ch;
00090 if (target >= targetEnd) {
00091 result = targetExhausted; break;
00092 }
00093 ch = *source++;
00094 if (ch <= UNI_MAX_BMP) {
00095
00096 if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) {
00097 if (flags == strictConversion) {
00098 --source;
00099 result = sourceIllegal;
00100 break;
00101 } else {
00102 *target++ = UNI_REPLACEMENT_CHAR;
00103 }
00104 } else {
00105 *target++ = (UTF16)ch;
00106 }
00107 } else if (ch > UNI_MAX_LEGAL_UTF32) {
00108 if (flags == strictConversion) {
00109 result = sourceIllegal;
00110 } else {
00111 *target++ = UNI_REPLACEMENT_CHAR;
00112 }
00113 } else {
00114
00115 if (target + 1 >= targetEnd) {
00116 --source;
00117 result = targetExhausted; break;
00118 }
00119 ch -= halfBase;
00120 *target++ = (UTF16)((ch >> halfShift) + UNI_SUR_HIGH_START);
00121 *target++ = (UTF16)((ch & halfMask) + UNI_SUR_LOW_START);
00122 }
00123 }
00124 *sourceStart = source;
00125 *targetStart = target;
00126 return result;
00127 }
00128
00129
00130
00131 ConversionResult ConvertUTF16toUTF32 (
00132 const UTF16** sourceStart, const UTF16* sourceEnd,
00133 UTF32** targetStart, UTF32* targetEnd, ConversionFlags flags) {
00134 ConversionResult result = conversionOK;
00135 const UTF16* source = *sourceStart;
00136 UTF32* target = *targetStart;
00137 UTF32 ch, ch2;
00138 while (source < sourceEnd) {
00139 const UTF16* oldSource = source;
00140 ch = *source++;
00141
00142 if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END) {
00143
00144 if (source < sourceEnd) {
00145 ch2 = *source;
00146
00147 if (ch2 >= UNI_SUR_LOW_START && ch2 <= UNI_SUR_LOW_END) {
00148 ch = ((ch - UNI_SUR_HIGH_START) << halfShift)
00149 + (ch2 - UNI_SUR_LOW_START) + halfBase;
00150 ++source;
00151 } else if (flags == strictConversion) {
00152 --source;
00153 result = sourceIllegal;
00154 break;
00155 }
00156 } else {
00157 --source;
00158 result = sourceExhausted;
00159 break;
00160 }
00161 } else if (flags == strictConversion) {
00162
00163 if (ch >= UNI_SUR_LOW_START && ch <= UNI_SUR_LOW_END) {
00164 --source;
00165 result = sourceIllegal;
00166 break;
00167 }
00168 }
00169 if (target >= targetEnd) {
00170 source = oldSource;
00171 result = targetExhausted; break;
00172 }
00173 *target++ = ch;
00174 }
00175 *sourceStart = source;
00176 *targetStart = target;
00177 #ifdef CVTUTF_DEBUG
00178 if (result == sourceIllegal) {
00179 fprintf(stderr, "ConvertUTF16toUTF32 illegal seq 0x%04x,%04x\n", ch, ch2);
00180 fflush(stderr);
00181 }
00182 #endif
00183 return result;
00184 }
00185
00186
00187
00188
00189
00190
00191
00192
00193
00194
/*
 * Indexed by the first byte of a UTF-8 sequence; yields how many further
 * bytes the converters in this file read for that sequence.  Sixteen
 * entries per row, labelled with the byte range each row covers.
 */
static const char trailingBytesForUTF8[256] = {
    /* 0x00-0x0F */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0x10-0x1F */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0x20-0x2F */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0x30-0x3F */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0x40-0x4F */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0x50-0x5F */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0x60-0x6F */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0x70-0x7F */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0x80-0x8F */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0x90-0x9F */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0xA0-0xAF */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0xB0-0xBF */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0xC0-0xCF */ 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    /* 0xD0-0xDF */ 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    /* 0xE0-0xEF */ 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
    /* 0xF0-0xFF */ 3,3,3,3,3,3,3,3,4,4,4,4,5,5,5,5
};
00205
00206
00207
00208
00209
00210
00211 static const UTF32 offsetsFromUTF8[6] = { 0x00000000UL, 0x00003080UL, 0x000E2080UL,
00212 0x03C82080UL, 0xFA082080UL, 0x82082080UL };
00213
00214
00215
00216
00217
00218
00219
00220
00221 static const UTF8 firstByteMark[7] = { 0x00, 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC };
00222
00223
00224
00225
00226
00227
00228
00229
00230
00231
00232
00233
00234
00235 ConversionResult ConvertUTF16toUTF8 (
00236 const UTF16** sourceStart, const UTF16* sourceEnd,
00237 UTF8** targetStart, UTF8* targetEnd, ConversionFlags flags) {
00238 ConversionResult result = conversionOK;
00239 const UTF16* source = *sourceStart;
00240 UTF8* target = *targetStart;
00241 while (source < sourceEnd) {
00242 UTF32 ch;
00243 unsigned short bytesToWrite = 0;
00244 const UTF32 byteMask = 0xBF;
00245 const UTF32 byteMark = 0x80;
00246 const UTF16* oldSource = source;
00247 ch = *source++;
00248
00249 if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END) {
00250
00251 if (source < sourceEnd) {
00252 UTF32 ch2 = *source;
00253
00254 if (ch2 >= UNI_SUR_LOW_START && ch2 <= UNI_SUR_LOW_END) {
00255 ch = ((ch - UNI_SUR_HIGH_START) << halfShift)
00256 + (ch2 - UNI_SUR_LOW_START) + halfBase;
00257 ++source;
00258 } else if (flags == strictConversion) {
00259 --source;
00260 result = sourceIllegal;
00261 break;
00262 }
00263 } else {
00264 --source;
00265 result = sourceExhausted;
00266 break;
00267 }
00268 } else if (flags == strictConversion) {
00269
00270 if (ch >= UNI_SUR_LOW_START && ch <= UNI_SUR_LOW_END) {
00271 --source;
00272 result = sourceIllegal;
00273 break;
00274 }
00275 }
00276
00277 if (ch < (UTF32)0x80) { bytesToWrite = 1;
00278 } else if (ch < (UTF32)0x800) { bytesToWrite = 2;
00279 } else if (ch < (UTF32)0x10000) { bytesToWrite = 3;
00280 } else if (ch < (UTF32)0x110000) { bytesToWrite = 4;
00281 } else { bytesToWrite = 3;
00282 ch = UNI_REPLACEMENT_CHAR;
00283 }
00284
00285 target += bytesToWrite;
00286 if (target > targetEnd) {
00287 source = oldSource;
00288 target -= bytesToWrite; result = targetExhausted; break;
00289 }
00290 switch (bytesToWrite) {
00291 case 4: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6;
00292 case 3: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6;
00293 case 2: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6;
00294 case 1: *--target = (UTF8)(ch | firstByteMark[bytesToWrite]);
00295 }
00296 target += bytesToWrite;
00297 }
00298 *sourceStart = source;
00299 *targetStart = target;
00300 return result;
00301 }
00302
00303
00304
00305
00306
00307
00308
00309
00310
00311
00312
00313
00314
00315
00316 static Booleano isLegalUTF8(const UTF8 *source, int length) {
00317 UTF8 a;
00318 const UTF8 *srcptr = source+length;
00319 switch (length) {
00320 default: return false;
00321
00322 case 4: if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return false;
00323 case 3: if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return false;
00324 case 2: if ((a = (*--srcptr)) > 0xBF) return false;
00325
00326 switch (*source) {
00327
00328 case 0xE0: if (a < 0xA0) return false; break;
00329 case 0xED: if (a > 0x9F) return false; break;
00330 case 0xF0: if (a < 0x90) return false; break;
00331 case 0xF4: if (a > 0x8F) return false; break;
00332 default: if (a < 0x80) return false;
00333 }
00334
00335 case 1: if (*source >= 0x80 && *source < 0xC2) return false;
00336 }
00337 if (*source > 0xF4) return false;
00338 return true;
00339 }
00340
00341
00342
00343
00344
00345
00346
00347 Booleano isLegalUTF8Sequence(const UTF8 *source, const UTF8 *sourceEnd) {
00348 int length = trailingBytesForUTF8[*source]+1;
00349 if (source+length > sourceEnd) {
00350 return false;
00351 }
00352 return isLegalUTF8(source, length);
00353 }
00354
00355
00356
00357 ConversionResult ConvertUTF8toUTF16 (
00358 const UTF8** sourceStart, const UTF8* sourceEnd,
00359 UTF16** targetStart, UTF16* targetEnd, ConversionFlags flags) {
00360 ConversionResult result = conversionOK;
00361 const UTF8* source = *sourceStart;
00362 UTF16* target = *targetStart;
00363 while (source < sourceEnd) {
00364 UTF32 ch = 0;
00365 unsigned short extraBytesToRead = trailingBytesForUTF8[*source];
00366 if (source + extraBytesToRead >= sourceEnd) {
00367 result = sourceExhausted; break;
00368 }
00369
00370 if (! isLegalUTF8(source, extraBytesToRead+1)) {
00371 result = sourceIllegal;
00372 break;
00373 }
00374
00375
00376
00377 switch (extraBytesToRead) {
00378 case 5: ch += *source++; ch <<= 6;
00379 case 4: ch += *source++; ch <<= 6;
00380 case 3: ch += *source++; ch <<= 6;
00381 case 2: ch += *source++; ch <<= 6;
00382 case 1: ch += *source++; ch <<= 6;
00383 case 0: ch += *source++;
00384 }
00385 ch -= offsetsFromUTF8[extraBytesToRead];
00386
00387 if (target >= targetEnd) {
00388 source -= (extraBytesToRead+1);
00389 result = targetExhausted; break;
00390 }
00391 if (ch <= UNI_MAX_BMP) {
00392
00393 if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) {
00394 if (flags == strictConversion) {
00395 source -= (extraBytesToRead+1);
00396 result = sourceIllegal;
00397 break;
00398 } else {
00399 *target++ = UNI_REPLACEMENT_CHAR;
00400 }
00401 } else {
00402 *target++ = (UTF16)ch;
00403 }
00404 } else if (ch > UNI_MAX_UTF16) {
00405 if (flags == strictConversion) {
00406 result = sourceIllegal;
00407 source -= (extraBytesToRead+1);
00408 break;
00409 } else {
00410 *target++ = UNI_REPLACEMENT_CHAR;
00411 }
00412 } else {
00413
00414 if (target + 1 >= targetEnd) {
00415 source -= (extraBytesToRead+1);
00416 result = targetExhausted; break;
00417 }
00418 ch -= halfBase;
00419 *target++ = (UTF16)((ch >> halfShift) + UNI_SUR_HIGH_START);
00420 *target++ = (UTF16)((ch & halfMask) + UNI_SUR_LOW_START);
00421 }
00422 }
00423 *sourceStart = source;
00424 *targetStart = target;
00425 return result;
00426 }
00427
00428
00429
00430 ConversionResult ConvertUTF32toUTF8 (
00431 const UTF32** sourceStart, const UTF32* sourceEnd,
00432 UTF8** targetStart, UTF8* targetEnd, ConversionFlags flags) {
00433 ConversionResult result = conversionOK;
00434 const UTF32* source = *sourceStart;
00435 UTF8* target = *targetStart;
00436 while (source < sourceEnd) {
00437 UTF32 ch;
00438 unsigned short bytesToWrite = 0;
00439 const UTF32 byteMask = 0xBF;
00440 const UTF32 byteMark = 0x80;
00441 ch = *source++;
00442 if (flags == strictConversion ) {
00443
00444 if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) {
00445 --source;
00446 result = sourceIllegal;
00447 break;
00448 }
00449 }
00450
00451
00452
00453
00454 if (ch < (UTF32)0x80) { bytesToWrite = 1;
00455 } else if (ch < (UTF32)0x800) { bytesToWrite = 2;
00456 } else if (ch < (UTF32)0x10000) { bytesToWrite = 3;
00457 } else if (ch <= UNI_MAX_LEGAL_UTF32) { bytesToWrite = 4;
00458 } else { bytesToWrite = 3;
00459 ch = UNI_REPLACEMENT_CHAR;
00460 result = sourceIllegal;
00461 }
00462
00463 target += bytesToWrite;
00464 if (target > targetEnd) {
00465 --source;
00466 target -= bytesToWrite; result = targetExhausted; break;
00467 }
00468 switch (bytesToWrite) {
00469 case 4: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6;
00470 case 3: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6;
00471 case 2: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6;
00472 case 1: *--target = (UTF8) (ch | firstByteMark[bytesToWrite]);
00473 }
00474 target += bytesToWrite;
00475 }
00476 *sourceStart = source;
00477 *targetStart = target;
00478 return result;
00479 }
00480
00481
00482
00483 ConversionResult ConvertUTF8toUTF32 (
00484 const UTF8** sourceStart, const UTF8* sourceEnd,
00485 UTF32** targetStart, UTF32* targetEnd, ConversionFlags flags) {
00486 ConversionResult result = conversionOK;
00487 const UTF8* source = *sourceStart;
00488 UTF32* target = *targetStart;
00489 while (source < sourceEnd) {
00490 UTF32 ch = 0;
00491 unsigned short extraBytesToRead = trailingBytesForUTF8[*source];
00492 if (source + extraBytesToRead >= sourceEnd) {
00493 result = sourceExhausted; break;
00494 }
00495
00496 if (! isLegalUTF8(source, extraBytesToRead+1)) {
00497 result = sourceIllegal;
00498 break;
00499 }
00500
00501
00502
00503 switch (extraBytesToRead) {
00504 case 5: ch += *source++; ch <<= 6;
00505 case 4: ch += *source++; ch <<= 6;
00506 case 3: ch += *source++; ch <<= 6;
00507 case 2: ch += *source++; ch <<= 6;
00508 case 1: ch += *source++; ch <<= 6;
00509 case 0: ch += *source++;
00510 }
00511 ch -= offsetsFromUTF8[extraBytesToRead];
00512
00513 if (target >= targetEnd) {
00514 source -= (extraBytesToRead+1);
00515 result = targetExhausted; break;
00516 }
00517 if (ch <= UNI_MAX_LEGAL_UTF32) {
00518
00519
00520
00521
00522 if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) {
00523 if (flags == strictConversion) {
00524 source -= (extraBytesToRead+1);
00525 result = sourceIllegal;
00526 break;
00527 } else {
00528 *target++ = UNI_REPLACEMENT_CHAR;
00529 }
00530 } else {
00531 *target++ = ch;
00532 }
00533 } else {
00534 result = sourceIllegal;
00535 *target++ = UNI_REPLACEMENT_CHAR;
00536 }
00537 }
00538 *sourceStart = source;
00539 *targetStart = target;
00540 return result;
00541 }
00542
00543
00544
00545
00546
00547
00548
00549
00550
00551
00552
00553
00554
00555
00556
00557
00558
00559
00560
00561
00563
00564 #ifdef __cplusplus
00565
00566 transcode_to_utf16::transcode_to_utf16(const char *utf8_input)
00567 : _orig_length(int(strlen(utf8_input)) + 1),
00568 _converted(new UTF16[_orig_length + 1])
00569
00570
00571 {
00572 memset((byte *)_converted, 0, 2 * _orig_length);
00573
00574
00575
00576 const UTF8 *temp_in = (const UTF8 *)utf8_input;
00577 UTF16 *temp_out = _converted;
00578 ConvertUTF8toUTF16(&temp_in, temp_in + _orig_length,
00579 &temp_out, temp_out + _orig_length, lenientConversion);
00580 }
00581
00582 transcode_to_utf16::transcode_to_utf16(const istring &utf8_input)
00583 : _orig_length(utf8_input.length() + 1),
00584 _converted(new UTF16[_orig_length])
00585 {
00586 memset((byte *)_converted, 0, 2 * _orig_length);
00587 const UTF8 *temp_in = (const UTF8 *)utf8_input.observe();
00588 UTF16 *temp_out = _converted;
00589 ConvertUTF8toUTF16(&temp_in, temp_in + _orig_length,
00590 &temp_out, temp_out + _orig_length, lenientConversion);
00591 }
00592
00593 transcode_to_utf16::~transcode_to_utf16()
00594 {
00595 delete [] _converted;
00596 _converted = NIL;
00597 }
00598
00599 int transcode_to_utf16::length() const
00600 { return int(wcslen((wchar_t *)_converted)); }
00601
00603
00604 transcode_to_utf8::transcode_to_utf8(const UTF16 *utf16_input)
00605 : _orig_length(int(wcslen((const wchar_t *)utf16_input))),
00606 _new_length(_orig_length * 2 + _orig_length / 2),
00607
00608
00609 _converted(new UTF8[_new_length])
00610 {
00611 memset(_converted, 0, _new_length);
00612 const UTF16 *temp_in = (const UTF16 *)utf16_input;
00613 UTF8 *temp_out = _converted;
00614 ConvertUTF16toUTF8(&temp_in, temp_in + _orig_length,
00615 &temp_out, temp_out + _new_length, lenientConversion);
00616 }
00617
00618 transcode_to_utf8::transcode_to_utf8(const wchar_t *utf16_input)
00619 : _orig_length(int(wcslen(utf16_input))),
00620 _new_length(_orig_length * 2 + _orig_length / 2),
00621
00622
00623 _converted(new UTF8[_new_length])
00624 {
00625 memset(_converted, 0, _new_length);
00626 const UTF16 *temp_in = (const UTF16 *)utf16_input;
00627 UTF8 *temp_out = _converted;
00628 ConvertUTF16toUTF8(&temp_in, temp_in + _orig_length,
00629 &temp_out, temp_out + _new_length, lenientConversion);
00630 }
00631
00632 transcode_to_utf8::~transcode_to_utf8()
00633 {
00634 delete [] _converted;
00635 _converted = NIL;
00636 }
00637
00638 int transcode_to_utf8::length() const
00639 { return int(strlen((char *)_converted)); }
00640
00641 transcode_to_utf8::operator istring() const
00642 { return istring((char *)_converted); }
00643
00645
00646 null_transcoder::null_transcoder(const char *utf8_input, bool make_own_copy)
00647 : _make_own_copy(make_own_copy),
00648 _converted(make_own_copy? new UTF8[strlen(utf8_input) + 1]
00649 : (const UTF8 *)utf8_input)
00650 {
00651 if (_make_own_copy) {
00652 strcpy((char *)_converted, utf8_input);
00653 }
00654 }
00655
00656 null_transcoder::null_transcoder(const istring &utf8_input, bool make_own_copy)
00657 : _make_own_copy(make_own_copy),
00658 _converted(make_own_copy? new UTF8[utf8_input.length() + 1]
00659 : (const UTF8 *)utf8_input.s())
00660 {
00661 if (_make_own_copy) {
00662 strcpy((char *)_converted, utf8_input.s());
00663 }
00664 }
00665
00666 int null_transcoder::length() const
00667 { return int(strlen((char *)_converted)); }
00668
00669 #endif //_cplusplus
00670
00671
00672 #endif //CONVERT_UTF_IMPLEMENTATION_FILE
00673