SNAP Library 2.0, Developer Reference
2013-05-13 16:33:57
SNAP, a general purpose, high performance system for analysis and manipulation of large networks
|
00001 // Unicode.cpp : Defines the entry point for the console application. 00002 // 00003 00005 // Includes 00006 //#include "unicode.h" 00007 00008 //----------------------------------------------------------------------------- 00009 // Private declarations of this module 00010 //----------------------------------------------------------------------------- 00011 00012 namespace { 00013 00014 class TVectorBuilder2 00015 { 00016 public: 00017 TIntV v; 00018 TVectorBuilder2(int i) { v.Add(i); } 00019 operator TIntV() const { return v; } 00020 TVectorBuilder2& operator ,(int i) { v.Add(i); return *this; } 00021 }; 00022 00023 class TVectorBuilder 00024 { 00025 public: 00026 operator TIntV() const { return TIntV(); } 00027 TVectorBuilder2 operator ,(int i) { return TVectorBuilder2(i); } 00028 }; 00029 00030 TVectorBuilder VB; 00031 00032 TStr CombinePath(const TStr& s, const TStr& t) 00033 { 00034 int n = s.Len(); if (n <= 0) return t; 00035 if (s[n - 1] == '\\' || s[n - 1] == '/' || s[n - 1] == ':') return s + t; 00036 return s + "\\" + t; 00037 } 00038 00039 void AssertEq(const TIntV& v1, const TIntV& v2, const TStr& explanation, FILE *f) 00040 { 00041 const int n = v1.Len(); 00042 bool ok = (n == v2.Len()); 00043 if (ok) for (int i = 0; i < n && ok; i++) ok = ok && (v1[i] == v2[i]); 00044 if (! ok) 00045 { 00046 if (! f) f = stderr; 00047 fprintf(f, "%s: [", explanation.CStr()); 00048 for (int i = 0; i < v1.Len(); i++) fprintf(f, "%s%04x", (i == 0 ? "" : " "), int(v1[i])); 00049 fprintf(f, "] != ["); 00050 for (int i = 0; i < v2.Len(); i++) fprintf(f, "%s%04x", (i == 0 ? "" : " "), int(v2[i])); 00051 fprintf(f, "]\n"); 00052 Fail; 00053 } 00054 } 00055 00056 }; 00057 00058 //----------------------------------------------------------------------------- 00059 // TUniCodec -- miscellaneous declarations 00060 //----------------------------------------------------------------------------- 00061 00062 uint TUniCodec::GetRndUint(TRnd& rnd) 00063 { 00064 uint u = rnd.GetUniDevUInt(256) & 0xff; 00065 u <<= 8; u |= (rnd.GetUniDevUInt(256) & 0xff); 00066 u <<= 8; u |= (rnd.GetUniDevUInt(256) & 0xff); 00067 u <<= 8; u |= (rnd.GetUniDevUInt(256) & 0xff); 00068 return u; 00069 } 00070 00071 uint TUniCodec::GetRndUint(TRnd& rnd, uint minVal, uint maxVal) 00072 { 00073 if (minVal == TUInt::Mn && maxVal == TUInt::Mx) return GetRndUint(rnd); 00074 uint range = maxVal - minVal + 1; 00075 if (range > (uint(1) << (8 * sizeof(uint) - 1))) 00076 while (true) { uint u = GetRndUint(rnd); if (u < range) return minVal + u; } 00077 uint mask = 1; 00078 while (mask < range) mask <<= 1; 00079 mask -= 1; 00080 while (true) { uint u = GetRndUint(rnd) & mask; if (u < range) return minVal + u; } 00081 } 00082 00083 bool TUniCodec::IsMachineLittleEndian() 00084 { 00085 static bool isLE, initialized = false; 00086 if (initialized) return isLE; 00087 int i = 0x0201; 00088 char *p = (char *) (&i); 00089 char c1, c2; 00090 memcpy(&c1, p, 1); memcpy(&c2, p + 1, 1); 00091 if (c1 == 1 && c2 == 2) isLE = true; 00092 else if (c1 == 2 && c2 == 1) isLE = false; 00093 else { 00094 FailR(("TUniCodec::IsMachineLittleEndian: c1 = " + TInt::GetStr(int(uchar(c1)), "%02x") + ", c2 = " + TInt::GetStr(int(uchar(c2)), "%02x") + ".").CStr()); 00095 isLE = true; } 00096 initialized = true; return isLE; 00097 } 00098 00099 //----------------------------------------------------------------------------- 00100 // TUniCodec -- UTF-8 test driver 00101 //----------------------------------------------------------------------------- 00102 00103 void TUniCodec::TestUtf8(bool decode, size_t expectedRetVal, bool expectedThrow, const TIntV& src, const TIntV& expectedDest, FILE *f) 00104 { 00105 TIntV dest; 00106 if (f) { 00107 fprintf(f, "Settings: %s %s %s replacementChar = %x\n", 00108 (errorHandling == uehAbort ? "abort" : errorHandling == uehThrow ? "throw" : errorHandling == uehIgnore ? "ignore" : errorHandling == uehReplace ? "replace" : "????"), 00109 (strict ? "STRICT" : ""), (skipBom ? "skipBom" : ""), uint(replacementChar)); 00110 fprintf(f, "src: "); for (int i = 0; i < src.Len(); i++) fprintf(f, (decode ? " %02x" : " %x"), uint(src[i])); } 00111 try 00112 { 00113 size_t retVal = (decode ? DecodeUtf8(src, 0, src.Len(), dest, true) : EncodeUtf8(src, 0, src.Len(), dest, true)); 00114 if (f) { 00115 fprintf(f, "\n -> dest: "); for (int i = 0; i < dest.Len(); i++) fprintf(f, (decode ? " %x" : " %02x"), uint(dest[i])); 00116 fprintf(f, "\n expDest "); for (int i = 0; i < expectedDest.Len(); i++) fprintf(f, (decode ? " %x" : " %02x"), uint(expectedDest[i])); 00117 fprintf(f, "\n retVal = %llu (expected %llu)\n", static_cast<long long unsigned int> (retVal), static_cast<long long unsigned int> (expectedRetVal)); } 00118 if (retVal != expectedRetVal) 00119 printf("!!!"); 00120 IAssert(retVal == expectedRetVal); IAssert(! expectedThrow); 00121 if (dest.Len() != expectedDest.Len()) 00122 printf("!!!"); 00123 IAssert(dest.Len() == expectedDest.Len()); 00124 for (int i = 0; i < dest.Len(); i++) IAssert(dest[i] == expectedDest[i]); 00125 } 00126 catch (TUnicodeException e) 00127 { 00128 if (f) { 00129 fprintf(f, "\n -> expDest "); for (int i = 0; i < expectedDest.Len(); i++) fprintf(f, " %x", uint(expectedDest[i])); 00130 fprintf(f, "\n exception \"%s\" at %d (char 0x%02x)\n", e.message.CStr(), int(e.srcIdx), uint(e.srcChar)); } 00131 IAssert(expectedThrow); 00132 } 00133 } 00134 00135 // Generates a random UTF-8-encoded stream according to the specifications in 'testCaseDesc', 00136 // then calls TestUtf8 to make sure that DecodeUtf8 reacts as expected. 00137 void TUniCodec::TestDecodeUtf8(TRnd& rnd, const TStr& testCaseDesc) 00138 { 00139 TIntV src; TIntV expectedDest; int expectedRetVal = 0; 00140 bool expectedAbort = false; 00141 FILE *f = 0; // stderr 00142 // testCaseDesc should consist of pairs or triples of characters, 'cd[e]', where: 00143 // - 'c' defines the range from which the codepoint should be taken ('A'..'H', 'X'..'Z'); 00144 // - 'd' defines how many bytes the codepoint should be encoded with ('1'..'6'); 00145 // - 'e' defines how many bytes will be removed from the end of the encoded sequence for this codepoint. 00146 // (absent = 0, 'a' = 1, 'b' = 2 and so on). 00147 for (int i = 0; i < testCaseDesc.Len(); ) 00148 { 00149 IAssert(i + 2 <= testCaseDesc.Len()); 00150 const char c = testCaseDesc[i], d = testCaseDesc[i + 1]; i += 2; 00151 uint cp = 0; int nBytes = -1, minBytes = -1; bool eighties = false; 00152 IAssert('1' <= d && d <= '6'); nBytes = d - '0'; 00153 if (c == 'A') { cp = GetRndUint(rnd, 0u, 0x7fu); minBytes = 1; } // 1 byte 00154 else if (c == 'B') { cp = GetRndUint(rnd, 0x80u, 0x7ffu); minBytes = 2; } // 2 bytes 00155 else if (c == 'C') { cp = GetRndUint(rnd, 0x800u, 0xffffu); minBytes = 3; } // 3 bytes 00156 else if (c == 'D') { cp = GetRndUint(rnd, 0x10000u, 0x10ffffu); minBytes = 4; } // 4 bytes, valid Unicode 00157 else if (c == 'E') { cp = GetRndUint(rnd, 0x110000u, 0x1fffffu); minBytes = 4; } // 4 bytes, invalid Unicode 00158 else if (c == 'F') { cp = GetRndUint(rnd, 0x200000u, 0x3ffffffu); minBytes = 5; } // 5 bytes 00159 else if (c == 'G') { cp = GetRndUint(rnd, 0x4000000u, 0x7fffffffu); minBytes = 6; } // 6 bytes, 31 bits 00160 else if (c == 'H') { cp = GetRndUint(rnd, 0x80000000u, 0xffffffffu); minBytes = 6; } // 6 bytes, 32 bits 00161 else if (c == 'X') { cp = 0xfffe; minBytes = 3; } 00162 else if (c == 'Y') { cp = 0xfeff; minBytes = 3; } 00163 else if (c == 'Z') { eighties = true; minBytes = 1; } // insert several random 10xxxxxx bytes (= 0x80 | random(0..0x3f)) 00164 else Fail; 00165 IAssert(nBytes >= minBytes); 00166 // Process 'e'. 00167 int nToDel = 0; 00168 if (i < testCaseDesc.Len()) { 00169 const char e = testCaseDesc[i]; 00170 if (e >= 'a' && e <= 'e') { i += 1; nToDel = e - 'a' + 1; }} 00171 IAssert(nToDel < nBytes); 00172 // Will an error occur during the decoding of this codepoint? 00173 bool errHere = false; 00174 if (eighties) errHere = true; 00175 else if (nToDel > 0) errHere = true; 00176 else if (strict && (cp >= 0x10ffff || nBytes > minBytes)) errHere = true; 00177 // Update 'expectedDest' and 'expetedRetVal'. 00178 if (! expectedAbort) { 00179 if (! errHere) { 00180 if (src.Len() == 0 && (cp == 0xfffe || cp == 0xfeff) && skipBom) { } 00181 else { expectedDest.Add(cp); expectedRetVal += 1; } } 00182 else if (errorHandling == uehReplace) { 00183 if (eighties) for (int j = 0; j < nBytes; j++) expectedDest.Add(replacementChar); 00184 else expectedDest.Add(replacementChar); } 00185 if (errHere && (errorHandling == uehAbort || errorHandling == uehThrow)) expectedAbort = true; } 00186 // Update 'src'. 00187 if (eighties) for (int j = 0; j < nBytes; j++) src.Add(GetRndUint(rnd, 0x80, 0xff)); 00188 else if (nBytes == 1) src.Add(cp); 00189 else { 00190 int mask = (1 << nBytes) - 1; mask <<= (8 - nBytes); 00191 src.Add(mask | (uint(cp) >> (6 * (nBytes - 1)))); 00192 for (int j = 1; j < nBytes - nToDel; j++) src.Add(0x80 | ((cp >> (6 * (nBytes - j - 1))) & _0011_1111)); } 00193 } 00194 if (f) fprintf(f, "Test case: \"%s\"\n", testCaseDesc.CStr()); 00195 TestUtf8(true, expectedRetVal, expectedAbort && (errorHandling == uehThrow), src, expectedDest, f); 00196 } 00197 00198 void TUniCodec::TestUtf8() 00199 { 00200 TIntV utf8ReplCh; EncodeUtf8((TVectorBuilder(), replacementChar).v, 0, 1, utf8ReplCh, true); 00201 for (int skipBom_ = 0; skipBom_ < 2; skipBom_++) 00202 for (int strict_ = 0; strict_ < 2; strict_++) 00203 for (int errMode_ = 0; errMode_ < 4; errMode_++) 00204 { 00205 strict = (strict_ == 1); errorHandling = TUnicodeErrorHandling(errMode_); skipBom = (skipBom_ == 1); 00206 TRnd rnd = TRnd(123); 00207 // Test DecodeUtf8 on various random UTF-8-encoded sequences. 00208 for (int i = 0; i < 10; i++) 00209 { 00210 TestDecodeUtf8(rnd, "X3A1A2A3A4A5A6B2B3B4B5B6C3C4C5C6D4D5D6E5E6F6G6"); 00211 TestDecodeUtf8(rnd, "X3A5dA6d"); 00212 TestDecodeUtf8(rnd, "X3A1B2C3D4E4F5A1G6H6Y3X3A1"); 00213 TestDecodeUtf8(rnd, "X3A1B2C3D4E4F5A2G6H6Y3X3A1"); 00214 TestDecodeUtf8(rnd, "Y3A1B2C3D4E4F5A1G6H6Y3X3A1"); 00215 TestDecodeUtf8(rnd, "A1B2C3D4E4F5A1G6H6Y3X3A1"); 00216 TestDecodeUtf8(rnd, "G6A1A1D4E4A1B2"); 00217 TestDecodeUtf8(rnd, "D4A1A1C3A1B2A1B2"); 00218 TestDecodeUtf8(rnd, "D4A1A1C3A1B2A1B2D4a"); 00219 TestDecodeUtf8(rnd, "X3A1B2C3D5E4F5A1G6H6Y3X3A1"); 00220 TestDecodeUtf8(rnd, "X3A1B2C3D4E5F5A1G6H6Y3X3A1"); 00221 TestDecodeUtf8(rnd, "X3A1B2C3D4aE4F5A1G6H6Y3X3A1"); 00222 TestDecodeUtf8(rnd, "X3A1B2C3D4bE4F5A1G6H6Y3X3A1"); 00223 TestDecodeUtf8(rnd, "X3A2aA3aA4aA5aA6aB2aB3aB4aB5aB6aC3aC4aC5aC6aD4aD5aD6aE5aE6aF6aG6a"); 00224 TestDecodeUtf8(rnd, "X3A3bA4bA5bA6aB3bB4bB5bB6bC3bC4bC5bC6bD4bD5bD6bE5bE6bF6bG6b"); 00225 TestDecodeUtf8(rnd, "X3A4cA5cA6cB4cB5cB6cC4cC5cC6cD4cD5cD6cE5cE6cF6cG6c"); 00226 TestDecodeUtf8(rnd, "X3A5dA6dB5dB6dC5dC6dD5dD6dE5dE6dF6dG6d"); 00227 TestDecodeUtf8(rnd, "X3A6eB6eC6eD6eE6eF6eG6e"); 00228 } 00229 // Test both DecodeUtf8 and EncodeUtf8 systematically on various characters 00230 // close to powers of 2. 00231 TIntV src, expectedDest, src2; 00232 expectedDest.Gen(1); src.Reserve(6); src2.Gen(1); 00233 for (int pow = 8; pow <= 32; pow++) 00234 { 00235 uint uFrom, uTo; 00236 if (pow == 8) uFrom = 0, uTo = 1u << pow; 00237 else if (pow == 32) uFrom = TUInt::Mx - (1u << 8), uTo = TUInt::Mx; 00238 else uFrom = (1u << pow) - (1u << 8), uTo = (1u << pow) + (1u << 8); 00239 printf("%u..%u \r", uFrom, uTo); 00240 for (uint u = uFrom; ; u++) 00241 { 00242 int nBytes = 0; 00243 if (u < (1u << 7)) nBytes = 1; 00244 else if (u < (1u << 11)) nBytes = 2; 00245 else if (u < (1u << 16)) nBytes = 3; 00246 else if (u < (1u << 21)) nBytes = 4; 00247 else if (u < (1u << 26)) nBytes = 5; 00248 else nBytes = 6; 00249 src.Gen(6, nBytes); 00250 if (nBytes == 1) src[0] = u; 00251 else { 00252 src[0] = (((1 << nBytes) - 1) << (8 - nBytes)) | (u >> (6 * (nBytes - 1))); 00253 for (int i = 1; i < nBytes; i++) src[i] = 0x80 | ((u >> (6 * (nBytes - i - 1))) & _0011_1111); } 00254 bool err = (strict && u > 0x10ffff); 00255 expectedDest.Reserve(1, 0); 00256 if (err && errorHandling == uehReplace) expectedDest.Add(replacementChar); 00257 else if (! err) expectedDest.Add(u); 00258 int erv = (err ? 0 : 1); 00259 if (skipBom && (u == 0xfeff || u == 0xfffe)) expectedDest.Clr(), erv = 0; 00260 TestUtf8(true, erv, (err && errorHandling == uehThrow), src, expectedDest, 0); 00261 // We can also test the UTF-8 encoder. 00262 src2[0] = u; 00263 if (err) { 00264 if (errorHandling == uehReplace) src = utf8ReplCh; 00265 else src.Clr(false); } 00266 TestUtf8(false, (err ? 0 : 1), (err && errorHandling == uehThrow), src2, src, 0); 00267 // 00268 if (u == uTo) break; 00269 } 00270 } 00271 } 00272 } 00273 00274 //----------------------------------------------------------------------------- 00275 // TUniCodec -- UTF-16 test driver 00276 //----------------------------------------------------------------------------- 00277 00278 void TUniCodec::WordsToBytes(const TIntV& src, TIntV& dest) 00279 { 00280 dest.Clr(); 00281 bool isLE = IsMachineLittleEndian(); 00282 for (int i = 0; i < src.Len(); i++) { 00283 int c = src[i] & 0xffff; 00284 if (isLE) { dest.Add(c & 0xff); dest.Add((c >> 8) & 0xff); } 00285 else { dest.Add((c >> 8) & 0xff); dest.Add(c & 0xff); } } 00286 } 00287 00288 void TUniCodec::TestUtf16(bool decode, size_t expectedRetVal, bool expectedThrow, const TIntV& src, const TIntV& expectedDest, 00289 const TUtf16BomHandling bomHandling, const TUniByteOrder defaultByteOrder, const bool insertBom, 00290 FILE *f) 00291 { 00292 TIntV srcBytes, expectedDestBytes; 00293 WordsToBytes(src, srcBytes); WordsToBytes(expectedDest, expectedDestBytes); 00294 TIntV dest; 00295 if (f) { 00296 fprintf(f, "Settings: %s %s %s %s %s replacementChar = %x \n", 00297 (errorHandling == uehAbort ? "abort" : errorHandling == uehThrow ? "throw" : errorHandling == uehIgnore ? "ignore" : errorHandling == uehReplace ? "replace" : "????"), 00298 (strict ? "STRICT" : ""), (decode ? (skipBom ? "skipBom" : "") : (insertBom ? "insrtBom" : "")), 00299 (bomHandling == bomAllowed ? "bomAllowed" : bomHandling == bomRequired ? "bomRequired" : "bomIgnored"), 00300 (defaultByteOrder == boBigEndian ? "boBigEndian" : defaultByteOrder == boLittleEndian ? "boLittleEndian" : "boMachineEndian"), 00301 uint(replacementChar)); 00302 fprintf(f, "src: "); for (int i = 0; i < src.Len(); i++) fprintf(f, (decode ? " %04x" : " %x"), uint(src[i])); } 00303 for (int useBytes = 0; useBytes < 2; useBytes++) 00304 { 00305 const char *fmt = (useBytes ? " %02x" : " %04x"); 00306 try 00307 { 00308 dest.Clr(); 00309 size_t retVal; 00310 if (! useBytes) { 00311 if (decode) retVal = DecodeUtf16FromWords(src, 0, src.Len(), dest, true, bomHandling, defaultByteOrder); 00312 else retVal = EncodeUtf16ToWords(src, 0, src.Len(), dest, true, insertBom, defaultByteOrder); } 00313 else { 00314 if (decode) retVal = DecodeUtf16FromBytes(srcBytes, 0, srcBytes.Len(), dest, true, bomHandling, defaultByteOrder); 00315 else retVal = EncodeUtf16ToBytes(src, 0, src.Len(), dest, true, insertBom, defaultByteOrder); } 00316 const TIntV& ed = (useBytes && ! decode ? expectedDestBytes : expectedDest); 00317 if (f) { 00318 fprintf(f, "\n -> dest: "); for (int i = 0; i < dest.Len(); i++) fprintf(f, (decode ? " %x" : fmt), uint(dest[i])); 00319 fprintf(f, "\n expDest "); for (int i = 0; i < ed.Len(); i++) fprintf(f, (decode ? " %x" : fmt), uint(ed[i])); 00320 fprintf(f, "\n retVal = %llu (expected %llu)\n", static_cast<long long unsigned int> (retVal), static_cast<long long unsigned int> (expectedRetVal)); } 00321 bool ok = true; 00322 if (retVal != expectedRetVal) ok = false; 00323 if (dest.Len() != ed.Len()) ok = false; 00324 if (ok) for (int i = 0; i < dest.Len(); i++) if (dest[i] != ed[i]) ok = false; 00325 if (! ok) 00326 { 00327 printf("!!!\n"); 00328 } 00329 IAssert(retVal == expectedRetVal); IAssert(! expectedThrow); 00330 IAssert(dest.Len() == ed.Len()); 00331 for (int i = 0; i < dest.Len(); i++) IAssert(dest[i] == ed[i]); 00332 } 00333 catch (TUnicodeException e) 00334 { 00335 if (f) { 00336 fprintf(f, "\n -> expDest "); for (int i = 0; i < expectedDest.Len(); i++) fprintf(f, (decode ? " %x" : fmt), uint(expectedDest[i])); 00337 fprintf(f, "\n exception \"%s\" at %d (char 0x%02x)\n", e.message.CStr(), int(e.srcIdx), uint(e.srcChar)); } 00338 IAssert(expectedThrow); 00339 } 00340 } 00341 } 00342 00343 // Generates a random UTF-16-encoded stream according to the specifications in 'testCaseDesc', 00344 // then calls TestUtf16 to make sure that DecodeUtf16 reacts as expected. 00345 void TUniCodec::TestDecodeUtf16(TRnd& rnd, const TStr& testCaseDesc, 00346 const TUtf16BomHandling bomHandling, 00347 const TUniByteOrder defaultByteOrder, 00348 const bool insertBom) 00349 { 00350 TIntV src; TIntV expectedDest; int expectedRetVal = 0; 00351 bool expectedAbort = false; 00352 FILE *f = 0; 00353 bool isMachineLe = IsMachineLittleEndian(); 00354 bool isDefaultLe = (defaultByteOrder == boLittleEndian || (defaultByteOrder == boMachineEndian && isMachineLe)); 00355 bool swap = (isMachineLe != isDefaultLe); 00356 if (insertBom) { 00357 src.Add(swap ? 0xfffe : 0xfeff); 00358 if (! skipBom) { expectedRetVal += 1; expectedDest.Add(0xfeff); } } 00359 else if (bomHandling == bomRequired) { 00360 expectedAbort = true; expectedRetVal = -1; } 00361 // testCaseDesc should consist single characters or pairs of characters, 'c[e]', where: 00362 // - 'c' defines the range from which the codepoint should be taken ('A'..'E', 'X'..'Y'); 00363 // - 'e' defines how many words will be removed from the end of the encoded sequence for this codepoint. 00364 // (absent = 0, 'a' = 1). 00365 for (int i = 0; i < testCaseDesc.Len(); ) 00366 { 00367 const char c = testCaseDesc[i++]; 00368 uint cp = 0; int nWords = -1; 00369 if (c == 'X' || c == 'Y') IAssert(i > 1); // if you want a BOM at the beginning of your data, use insertBom -- if we permit X and Y here, predicting the expectedDest and expectedRetVal gets more complicated 00370 if (c == 'A') { cp = GetRndUint(rnd, 0u, Utf16FirstSurrogate - 1); nWords = 1; } // characters below the first surrogate range 00371 else if (c == 'B') { cp = GetRndUint(rnd, Utf16FirstSurrogate, Utf16FirstSurrogate + 1023); nWords = 1; } // the first surrogate range 00372 else if (c == 'C') { cp = GetRndUint(rnd, Utf16SecondSurrogate, Utf16SecondSurrogate + 1023); nWords = 1; } // the second surrogate range 00373 else if (c == 'D') { do { cp = GetRndUint(rnd, Utf16SecondSurrogate + 1024, 0xffffu); } while (cp == 0xfffe || cp == 0xfeff); nWords = 1; } // above the second surrogate range, but still in the BMP 00374 else if (c == 'E') { cp = GetRndUint(rnd, 0x10000u, 0x10ffffu); nWords = 2; } // above the BMP, but still within the range for UTF-16 00375 else if (c == 'X') { cp = 0xfffe; nWords = 1; } 00376 else if (c == 'Y') { cp = 0xfeff; nWords = 1; } 00377 else Fail; 00378 if (c == 'B' && i < testCaseDesc.Len()) IAssert(testCaseDesc[i] != 'C'); 00379 // Process 'e'. 00380 int nToDel = 0; 00381 if (i < testCaseDesc.Len()) { 00382 const char e = testCaseDesc[i]; 00383 if (e >= 'a') { i += 1; nToDel = 1; }} 00384 IAssert((nWords == 1 && nToDel == 0) || (nWords == 2 && (nToDel == 0 || nToDel == 1))); 00385 if (nWords == 2 && nToDel == 1 && i < testCaseDesc.Len()) IAssert(testCaseDesc[i] != 'C'); 00386 // Will an error occur during the decoding of this codepoint? 00387 bool errHere = false; 00388 if (Utf16FirstSurrogate <= cp && cp <= Utf16FirstSurrogate + 1023) errHere = true; 00389 else if (cp > 0x10ffff) { Fail; errHere = true; } 00390 else if (nToDel > 0) errHere = true; 00391 else if (strict && (Utf16SecondSurrogate <= cp && cp <= Utf16SecondSurrogate + 1023)) errHere = true; 00392 // Update 'expectedDest' and 'expectedRetVal'. 00393 if (! expectedAbort) { 00394 if (! errHere) { 00395 if (src.Len() == 0 && (cp == 0xfffe || cp == 0xfeff) && skipBom) { } 00396 else { expectedDest.Add(cp); expectedRetVal += 1; } } 00397 else if (errorHandling == uehReplace) { 00398 expectedDest.Add(replacementChar); } 00399 if (errHere && (errorHandling == uehAbort || errorHandling == uehThrow)) expectedAbort = true; } 00400 // Update 'src'. 00401 if (nWords == 1) src.Add(swap ? SwapBytes(cp) : cp); 00402 else { 00403 int c1 = ((cp - 0x10000) >> 10) & 1023; c1 += Utf16FirstSurrogate; 00404 int c2 = (cp - 0x10000) & 1023; c2 += Utf16SecondSurrogate; 00405 src.Add(swap ? SwapBytes(c1) : c1); 00406 if (nToDel == 0) src.Add(swap ? SwapBytes(c2) : c2); } 00407 } 00408 if (f) fprintf(f, "Test case: \"%s\"\n", testCaseDesc.CStr()); 00409 TestUtf16(true, expectedRetVal, expectedAbort && (errorHandling == uehThrow), src, expectedDest, bomHandling, defaultByteOrder, false, f); 00410 } 00411 00412 void TUniCodec::TestUtf16() 00413 { 00414 TIntV utf16ReplCh; utf16ReplCh.Add(replacementChar); 00415 for (int skipBom_ = 0; skipBom_ < 2; skipBom_++) 00416 for (int strict_ = 0; strict_ < 2; strict_++) 00417 for (int errMode_ = 0; errMode_ < 4; errMode_++) 00418 for (int bomHandling_ = 0; bomHandling_ < 3; bomHandling_++) 00419 for (int byteOrder_ = 0; byteOrder_ < 3; byteOrder_++) 00420 for (int insertBom_ = 0; insertBom_ < 2; insertBom_++) 00421 { 00422 strict = (strict_ == 1); errorHandling = TUnicodeErrorHandling(errMode_); skipBom = (skipBom_ == 1); 00423 bool insertBom = (insertBom_ == 1); 00424 TUniByteOrder byteOrder = (TUniByteOrder) byteOrder_; 00425 TUtf16BomHandling bomHandling = (TUtf16BomHandling) bomHandling_; 00426 TRnd rnd = TRnd(123); 00427 // Test DecodeUtf16 on various random UTF-16-encoded sequences. 00428 for (int i = 0; i < 10; i++) 00429 { 00430 TestDecodeUtf16(rnd, "A", bomHandling, byteOrder, insertBom); 00431 TestDecodeUtf16(rnd, "AAA", bomHandling, byteOrder, insertBom); 00432 TestDecodeUtf16(rnd, "B", bomHandling, byteOrder, insertBom); 00433 TestDecodeUtf16(rnd, "DDAADADAAADDDAA", bomHandling, byteOrder, insertBom); 00434 TestDecodeUtf16(rnd, "DEEEDAAEEDADEEAAEEADEEDDAA", bomHandling, byteOrder, insertBom); 00435 TestDecodeUtf16(rnd, "DEaEaEDAAEaEDADEaEAAEEADEEDDAA", bomHandling, byteOrder, insertBom); 00436 TestDecodeUtf16(rnd, "CABDEBACCEaB", bomHandling, byteOrder, insertBom); 00437 TestDecodeUtf16(rnd, "EaEEEEaBBACABXABYXXEaYDDXBDCEA", bomHandling, byteOrder, insertBom); 00438 TestDecodeUtf16(rnd, "EaEEEEaBDCAAXADYXXEaYDDXDCEA", bomHandling, byteOrder, insertBom); 00439 } 00440 //continue; 00441 // Test both DecodeUtf16 and EncodeUtf16 systematically on various characters 00442 // close to powers of 2. 00443 TIntV src, expectedDest, src2; 00444 expectedDest.Gen(1); src.Reserve(6); src2.Gen(1); 00445 for (int pow = 8; pow <= 32; pow++) 00446 { 00447 uint uFrom, uTo; 00448 if (pow == 8) uFrom = 0, uTo = 1u << pow; 00449 else if (pow == 32) uFrom = TUInt::Mx - (1u << 8), uTo = TUInt::Mx; 00450 else uFrom = (1u << pow) - (1u << 8), uTo = (1u << pow) + (1u << 8); 00451 printf("%u..%u \r", uFrom, uTo); 00452 for (uint u = uFrom; ; u++) 00453 { 00454 int nWords = 0; 00455 if (u < 0x10000) nWords = 1; 00456 else nWords = 2; 00457 bool isMachineLe = IsMachineLittleEndian(), isDestLe = (byteOrder == boLittleEndian || (byteOrder == boMachineEndian && isMachineLe)); 00458 bool swap = (isMachineLe != isDestLe); 00459 bool err = (u > 0x10ffff) || (Utf16FirstSurrogate <= u && u <= Utf16FirstSurrogate + 1023) || (strict && Utf16SecondSurrogate <= u && u <= Utf16SecondSurrogate + 1023); 00460 src.Gen(3, (err ? 0 : nWords) + (insertBom ? 1 : 0)); 00461 if (insertBom) src[0] = (swap ? 0xfffe : 0xfeff); 00462 if (! ((u > 0x10ffff) || (Utf16FirstSurrogate <= u && u <= Utf16FirstSurrogate + 1023))) 00463 { 00464 // Try to encode 'u' and see if it gets decoded correctly. 00465 if (nWords == 1) src[insertBom ? 1 : 0] = (swap ? SwapBytes(u) : u); 00466 else { 00467 int u1 = Utf16FirstSurrogate + (((u - 0x10000) >> 10) & 1023); 00468 int u2 = Utf16SecondSurrogate + ((u - 0x10000) & 1023); 00469 src[insertBom ? 1 : 0] = (swap ? SwapBytes(u1) : u1); 00470 src[insertBom ? 2 : 1] = (swap ? SwapBytes(u2) : u2); } 00471 if (! ((u == 0xfffe || u == 0xfeff) && bomHandling == bomAllowed && ! insertBom)) // this will just create a mess when decoding 00472 { 00473 expectedDest.Reserve(2, 0); 00474 if (insertBom && ! skipBom) expectedDest.Add(0xfeff); 00475 if (err && errorHandling == uehReplace) expectedDest.Add(replacementChar); 00476 else if (! err) expectedDest.Add(u); 00477 int erv = (err ? 0 : expectedDest.Len()); 00478 if (skipBom && (u == 0xfeff || u == 0xfffe) && ! insertBom) expectedDest.Clr(), erv = 0; 00479 bool errD = err; 00480 if (bomHandling == bomRequired && ! insertBom) { 00481 expectedDest.Clr(false); 00482 if (u == 0xfeff || u == 0xfffe) { erv = (skipBom ? 0 : 1); if (! skipBom) expectedDest.Add(0xfeff); } 00483 else { erv = -1; errD = true; 00484 /*if (errorHandling == uehReplace) expectedDest.Add(replacementChar);*/ }} 00485 TestUtf16(true, erv, (errD && errorHandling == uehThrow), src, expectedDest, bomHandling, byteOrder, insertBom, 0); 00486 } 00487 } 00488 // We can also test the UTF-16 encoder. 00489 src2[0] = u; 00490 if (err) { 00491 src.Clr(false); if (insertBom) src.Add(swap ? 0xfffe : 0xfeff); 00492 if (errorHandling == uehReplace) { 00493 src.Add(swap ? SwapBytes(replacementChar) : replacementChar); 00494 /*if (byteOrder == boBigEndian || (byteOrder == boMachineEndian && ! TUniCodec::IsMachineLittleEndian())) { src.Add((replacementChar >> 8) & 0xff); src.Add(replacementChar & 0xff); } 00495 else { src.Add(replacementChar & 0xff); src.Add((replacementChar >> 8) & 0xff); } */ 00496 }} 00497 TestUtf16(false, (err ? 0 : 1) + (insertBom ? 1 : 0), (err && errorHandling == uehThrow), src2, src, bomHandling, byteOrder, insertBom, 0); 00498 // 00499 if (u == uTo) break; 00500 } 00501 } 00502 } 00503 } 00504 00505 //----------------------------------------------------------------------------- 00506 // TUniCaseFolding 00507 //----------------------------------------------------------------------------- 00508 00509 void TUniCaseFolding::LoadTxt(const TStr& fileName) 00510 { 00511 Clr(); 00512 TUniChDb::TUcdFileReader reader; reader.Open(fileName); 00513 TStrV fields; 00514 while (reader.GetNextLine(fields)) 00515 { 00516 int cp = reader.ParseCodePoint(fields[0]); 00517 const TStr status = fields[1], mapsTo = fields[2]; 00518 if (status == "C" || status == "S" || status == "T") { 00519 TIntH &dest = (status == "C" ? cfCommon : status == "S" ? cfSimple : cfTurkic); 00520 IAssert(! dest.IsKey(cp)); 00521 int cp2 = reader.ParseCodePoint(mapsTo); 00522 dest.AddDat(cp, cp2); } 00523 else if (status == "F") { 00524 TIntIntVH &dest = cfFull; 00525 IAssert(! dest.IsKey(cp)); 00526 TIntV cps; reader.ParseCodePointList(mapsTo, cps); IAssert(cps.Len() > 0); 00527 dest.AddDat(cp, cps); } 00528 else 00529 FailR(status.CStr()); 00530 } 00531 printf("TUniCaseFolding(\"%s\"): %d common, %d simple, %d full, %d Turkic.\n", 00532 fileName.CStr(), cfCommon.Len(), cfSimple.Len(), cfFull.Len(), cfTurkic.Len()); 00533 } 00534 00535 void TUniCaseFolding::Test(const TIntV& src, const TIntV& expectedDest, const bool full, const bool turkic, FILE *f) 00536 { 00537 fprintf(f, "TUniCaseFolding(%s%s): ", (full ? "full" : "simple"), (turkic ? ", turkic" : "")); 00538 for (int i = 0; i < src.Len(); i++) fprintf(f, " %04x", int(src[i])); 00539 TIntV dest; Fold(src, 0, src.Len(), dest, true, full, turkic); 00540 fprintf(f, "\n -> "); 00541 for (int i = 0; i < dest.Len(); i++) fprintf(f, " %04x", int(dest[i])); 00542 fprintf(f, "\n"); 00543 IAssert(dest.Len() == expectedDest.Len()); 00544 for (int i = 0; i < dest.Len(); i++) IAssert(dest[i] == expectedDest[i]); 00545 } 00546 00547 /* 00548 void TUniCaseFolding::Test(const TIntV& src, FILE *f) { 00549 Test(src, false, false, f); Test(src, false, true, f); 00550 Test(src, true, false, f); Test(src, true, true, f); } 00551 */ 00552 00553 void TUniCaseFolding::Test() 00554 { 00555 FILE *f = stderr; 00556 TVectorBuilder VB; 00557 // simple 00558 Test((VB, 0x41, 0x62, 0x49, 0x43, 0xdf), (VB, 0x61, 0x62, 0x69, 0x63, 0xdf), false, false, f); 00559 // simple + turkic 00560 Test((VB, 0x41, 0x62, 0x49, 0x43, 0xdf), (VB, 0x61, 0x62, 0x131, 0x63, 0xdf), false, true, f); 00561 // full 00562 Test((VB, 0x41, 0x62, 0x49, 0x43, 0xdf), (VB, 0x61, 0x62, 0x69, 0x63, 0x73, 0x73), true, false, f); 00563 // full + turkic 00564 Test((VB, 0x41, 0x62, 0x49, 0x43, 0xdf), (VB, 0x61, 0x62, 0x131, 0x63, 0x73, 0x73), true, true, f); 00565 } 00566 00567 //----------------------------------------------------------------------------- 00568 // TUniChInfo 00569 //----------------------------------------------------------------------------- 00570 00571 // UAX #14 00572 const ushort TUniChInfo::LineBreak_Unknown = TUniChInfo::GetLineBreakCode('X', 'X'); 00573 const ushort TUniChInfo::LineBreak_ComplexContext = TUniChInfo::GetLineBreakCode('S', 'A'); 00574 const ushort TUniChInfo::LineBreak_Numeric = TUniChInfo::GetLineBreakCode('N', 'U'); 00575 const ushort TUniChInfo::LineBreak_InfixNumeric = TUniChInfo::GetLineBreakCode('I', 'S'); 00576 const ushort TUniChInfo::LineBreak_Quotation = TUniChInfo::GetLineBreakCode('Q', 'U'); 00577 00578 //----------------------------------------------------------------------------- 00579 // TUniChDb -- word breaking 00580 //----------------------------------------------------------------------------- 00581 00582 // Test driver for WbFind*NonIgnored. 00583 void TUniChDb::TestWbFindNonIgnored(const TIntV& src) const 00584 { 00585 int n = src.Len(); 00586 TBoolV isIgnored; isIgnored.Gen(n); 00587 for (int i = 0; i < n; i++) isIgnored[i] = IsWbIgnored(src[i]); 00588 TIntV prevNonIgnored, nextNonIgnored, curOrNextNonIgnored; 00589 prevNonIgnored.Gen(n); nextNonIgnored.Gen(n); curOrNextNonIgnored.Gen(n); 00590 FILE *f = 0; // stderr; 00591 for (int srcIdx = 0; srcIdx < n; srcIdx++) for (int srcLen = 1; srcLen < n - srcIdx; srcLen++) 00592 { 00593 int prev = -1; 00594 for (int i = 0; i < srcLen; i++) { 00595 prevNonIgnored[i] = prev; 00596 if (! isIgnored[srcIdx + i]) prev = srcIdx + i; } 00597 int next = srcIdx + srcLen; 00598 for (int i = srcLen - 1; i >= 0; i--) { 00599 nextNonIgnored[i] = next; 00600 if (! isIgnored[srcIdx + i]) next = srcIdx + i; 00601 curOrNextNonIgnored[i] = next; } 00602 if (f) { 00603 fprintf(f, "\nIndex: "); for (int i = 0; i < srcLen; i++) fprintf(f, " %2d", srcIdx + i); 00604 fprintf(f, "\nNonIgn: "); for (int i = 0; i < srcLen; i++) fprintf(f, " %s", (isIgnored[srcIdx + i] ? " ." : " Y")); 00605 fprintf(f, "\nPrevNI: "); for (int i = 0; i < srcLen; i++) fprintf(f, " %2d", int(prevNonIgnored[i])); 00606 fprintf(f, "\nNextNI: "); for (int i = 0; i < srcLen; i++) fprintf(f, " %2d", int(nextNonIgnored[i])); 00607 fprintf(f, "\nCurNextNI: "); for (int i = 0; i < srcLen; i++) fprintf(f, " %2d", int(curOrNextNonIgnored[i])); 00608 fprintf(f, "\n"); } 00609 for (int i = 0; i < srcLen; i++) 00610 { 00611 size_t s; 00612 s = size_t(srcIdx + i); WbFindNextNonIgnored(src, s, size_t(srcIdx + srcLen)); 00613 IAssert(s == size_t(nextNonIgnored[i])); 00614 s = size_t(srcIdx + i); WbFindCurOrNextNonIgnored(src, s, size_t(srcIdx + srcLen)); 00615 IAssert(s == size_t(curOrNextNonIgnored[i])); 00616 s = size_t(srcIdx + i); bool ok = WbFindPrevNonIgnored(src, size_t(srcIdx), s); 00617 if (prevNonIgnored[i] < 0) { IAssert(! ok); IAssert(s == size_t(srcIdx)); } 00618 else { IAssert(ok); IAssert(s == size_t(prevNonIgnored[i])); } 00619 } 00620 } 00621 } 00622 00623 void TUniChDb::TestWbFindNonIgnored() const 00624 { 00625 TIntV chIgnored, chNonIgnored; 00626 FILE *f = 0; // stderr; 00627 for (int i = h.FFirstKeyId(); h.FNextKeyId(i); ) { 00628 const int cp = h.GetKey(i); const TUniChInfo& ci = h[i]; 00629 if (f) fprintf(f, "%04x: flags %08x props %08x %08x script \"%s\"\n", cp, 00630 ci.flags, ci.properties, ci.propertiesX, GetScriptName(ci.script).CStr()); 00631 (IsWbIgnored(h[i]) ? chIgnored : chNonIgnored).Add(h.GetKey(i)); 00632 } 00633 chIgnored.Sort(); chNonIgnored.Sort(); 00634 printf("TUniChDb::TestWbNonIgnored: %d ignored, %d nonignored chars.\n", chIgnored.Len(), chNonIgnored.Len()); 00635 TRnd rnd = TRnd(123); 00636 for (int iter = 0; iter <= 50; iter++) 00637 { 00638 int percIgnored = 2 * iter; 00639 for (int n = 0; n <= 20; n++) 00640 { 00641 // Prepare a random sequence of 'n' codepoints. 00642 TIntV v; v.Gen(n); 00643 for (int i = 0; i < n; i++) { 00644 TIntV& chars = (rnd.GetUniDevInt(100) < percIgnored) ? chIgnored : chNonIgnored; 00645 int j = rnd.GetUniDevInt(chars.Len()); 00646 v.Add(chars[j]); } 00647 // Run the tests with this sequence. 00648 TestWbFindNonIgnored(v); 00649 } 00650 } 00651 } 00652 00653 void TUniChDb::TestFindNextWordOrSentenceBoundary(const TStr& basePath, bool sentence) 00654 { 00655 TUcdFileReader reader; TStrV fields; 00656 reader.Open(CombinePath(CombinePath(basePath, GetAuxiliaryDir()), (sentence ? GetSentenceBreakTestFn() : GetWordBreakTestFn()))); 00657 int nLines = 0; TRnd rnd = TRnd(123); 00658 while (reader.GetNextLine(fields)) 00659 { 00660 nLines += 1; 00661 IAssert(fields.Len() == 1); 00662 TStrV parts; fields[0].SplitOnWs(parts); 00663 const int n = parts.Len(); IAssert((n % 2) == 1); 00664 TIntV chars; TBoolV isBreak, isPredicted, isPredicted2; 00665 // Each line is a sequence of codepoints, with a \times or \div in between each 00666 // pair of codepoints (as well as at the beginning and the end of the sequence) to 00667 // indicate whether a boundary exists there or not. 00668 for (int i = 0; i < n; i++) 00669 { 00670 const TStr& s = parts[i]; 00671 if ((i % 2) == 0) { 00672 if (s == "\xc3\x97") // multiplication sign (U+00D7) in UTF-8 00673 isBreak.Add(false); 00674 else if (s == "\xc3\xb7") // division sign (U+00F7) in UTF-8 00675 isBreak.Add(true); 00676 else FailR(s.CStr()); } 00677 else chars.Add(reader.ParseCodePoint(s)); 00678 } 00679 const int m = n / 2; IAssert(chars.Len() == m); IAssert(isBreak.Len() == m + 1); 00680 IAssert(isBreak[0]); IAssert(isBreak[m]); 00681 isPredicted.Gen(m + 1); isPredicted.PutAll(false); 00682 if (AlwaysFalse()) { printf("%3d", nLines); for (int i = 0; i < m; i++) printf(" %04x", int(chars[i])); printf("\n"); } 00683 // We'll insert a few random characters at the beginning of the sequence 00684 // so that srcPos doesn't always begin at 0. 00685 for (int nBefore = 0; nBefore < 5; nBefore++) 00686 { 00687 TIntV chars2; for (int i = 0; i < nBefore; i++) chars2.Add(0, rnd.GetUniDevInt(0x10ffff + 1)); 00688 chars2.AddV(chars); 00689 // Use FindNextBoundary to find all the word boundaries. 00690 size_t position = (nBefore > 0 ? nBefore - 1 : nBefore); size_t prevPosition = position; 00691 while (sentence ? FindNextSentenceBoundary(chars2, nBefore, m, position) : FindNextWordBoundary(chars2, nBefore, m, position)) 00692 { 00693 IAssert(prevPosition < position); 00694 IAssert(position <= size_t(nBefore + m)); 00695 isPredicted[int(position) - nBefore] = true; 00696 prevPosition = position; 00697 } 00698 IAssert(position == size_t(nBefore + m)); 00699 if (sentence) FindSentenceBoundaries(chars2, nBefore, m, isPredicted2); 00700 else FindWordBoundaries(chars2, nBefore, m, isPredicted2); 00701 IAssert(isPredicted2.Len() == m + 1); 00702 bool ok = true; 00703 // If we start at 0, the word boundary at the beginning of the sequence was 00704 // not found explicitly, so we'll add it now. 00705 if (nBefore == 0) isPredicted[0] = true; 00706 // Compare the predicted and the true boundaries. 00707 for (int i = 0; i <= m; i++) { 00708 if (isBreak[i] != isPredicted[i]) ok = false; 00709 IAssert(isPredicted2[i] == isPredicted[i]); } 00710 FILE *f = stderr; 00711 if (! ok) 00712 { 00713 fprintf(f, "\nError in line %d:\n", nLines); 00714 fprintf(f, "True: "); 00715 for (int i = 0; i <= m; i++) { 00716 fprintf(f, "%s ", (isBreak[i] ? "|" : ".")); 00717 if (i < m) fprintf(f, "%04x ", int(chars[i + nBefore])); } 00718 fprintf(f, "\nPredicted: "); 00719 for (int i = 0; i <= m; i++) { 00720 fprintf(f, "%s ", (isPredicted[i] ? "|" : ".")); 00721 if (i < m) { 00722 const int cp = chars[i + nBefore]; 00723 TStr s = sentence ? TUniChInfo::GetSbFlagsStr(GetSbFlags(cp)) : TUniChInfo::GetWbFlagsStr(GetWbFlags(cp)); 00724 if (IsWbIgnored(cp)) s = "*" + s; 00725 fprintf(f, "%4s ", s.CStr()); }} 00726 fprintf(f, "\n"); 00727 Fail; 00728 } 00729 // Test FindNextBoundary if we start in the middle of the sequence, 00730 // i.e. not at an existing boundary. 00731 for (int i = 0; i < m; i++) { 00732 position = i + nBefore; bool ok = sentence ? FindNextSentenceBoundary(chars2, nBefore, m, position) : FindNextWordBoundary(chars2, nBefore, m, position); 00733 IAssert(ok); // at the very least, there should be the 'boundary' at nBefore + m 00734 IAssert(size_t(i + nBefore) < position); IAssert(position <= size_t(nBefore + m)); 00735 position -= nBefore; 00736 for (int j = i + 1; j < int(position); j++) 00737 IAssert(! isBreak[j]); 00738 IAssert(isBreak[int(position)]); } 00739 } 00740 } 00741 reader.Close(); 00742 printf("TUniChDb::TestFindNext%sBoundary: %d lines processed.\n", (sentence ? "Sentence" : "Word"), nLines); 00743 } 00744 00745 //----------------------------------------------------------------------------- 00746 // TUniChDb -- composition and decomposition 00747 //----------------------------------------------------------------------------- 00748 00749 void TUniChDb::TestComposition(const TStr& basePath) 00750 { 00751 TUcdFileReader reader; TStrV fields; int nLines = 0; 00752 reader.Open(CombinePath(basePath, GetNormalizationTestFn())); 00753 bool inPart1 = false; TIntH testedInPart1; 00754 while (reader.GetNextLine(fields)) 00755 { 00756 nLines += 1; 00757 if (fields.Len() == 1) { 00758 IAssert(fields[0].IsPrefix("@Part")); 00759 inPart1 = (fields[0] == "@Part1"); continue; } 00760 IAssert(fields.Len() == 6); 00761 IAssert(fields[5].Len() == 0); 00762 TIntV c1, c2, c3, c4, c5; 00763 reader.ParseCodePointList(fields[0], c1); 00764 reader.ParseCodePointList(fields[1], c2); 00765 reader.ParseCodePointList(fields[2], c3); 00766 reader.ParseCodePointList(fields[3], c4); 00767 reader.ParseCodePointList(fields[4], c5); 00768 TIntV v; 00769 #define AssE_(v1, v2, expl) AssertEq(v1, v2, TStr(expl) + " (line " + TInt::GetStr(nLines) + ")", 0) 00770 #define NFC_(cmpWith, operand) DecomposeAndCompose(operand, 0, operand.Len(), v, false); AssE_(cmpWith, v, #cmpWith " == NFC(" #operand ")") 00771 #define NFD_(cmpWith, operand) Decompose(operand, 0, operand.Len(), v, false); AssE_(cmpWith, v, #cmpWith " == NFD(" #operand ")") 00772 #define NFKC_(cmpWith, operand) DecomposeAndCompose(operand, 0, operand.Len(), v, true); AssE_(cmpWith, v, #cmpWith " == NFKC(" #operand ")") 00773 #define NFKD_(cmpWith, operand) Decompose(operand, 0, operand.Len(), v, true); AssE_(cmpWith, v, #cmpWith " == NFKD(" #operand ")") 00774 // NFD: 00775 NFD_(c3, c1); // c3 == NFD(c1) 00776 NFD_(c3, c2); // c3 == NFD(c2) 00777 NFD_(c3, c3); // c3 == NFD(c3) 00778 NFD_(c5, c4); // c5 == NFD(c4) 00779 NFD_(c5, c5); // c5 == NFD(c5) 00780 // NFC: 00781 NFC_(c2, c1); // c2 == NFC(c1) 00782 NFC_(c2, c2); // c2 == NFC(c2) 00783 NFC_(c2, c3); // c2 == NFC(c3) 00784 NFC_(c4, c4); // c4 == NFC(c4) 00785 NFC_(c4, c5); // c4 == NFC(c5) 00786 // NFKD: 00787 NFKD_(c5, c1); // c5 == NFKD(c1) 00788 NFKD_(c5, c2); // c5 == NFKD(c2) 00789 NFKD_(c5, c3); // c5 == NFKD(c3) 00790 NFKD_(c5, c4); // c5 == NFKD(c4) 00791 NFKD_(c5, c5); // c5 == NFKD(c5) 00792 // NFKC: 00793 NFKC_(c4, c1); // c4 == NFKC(c1) 00794 NFKC_(c4, c2); // c4 == NFKC(c2) 00795 NFKC_(c4, c3); // c4 == NFKC(c3) 00796 NFKC_(c4, c4); // c4 == NFKC(c4) 00797 NFKC_(c4, c5); // c4 == NFKC(c5) 00798 // 00799 if (inPart1) { 00800 IAssert(c1.Len() == 1); 00801 testedInPart1.AddKey(c1[0]); } 00802 } 00803 reader.Close(); 00804 // Test other individual codepoints that were not mentioned in part 1. 00805 int nOther = 0; 00806 for (int i = h.FFirstKeyId(); h.FNextKeyId(i); ) 00807 { 00808 const int cp = h.GetKey(i), nLines = -1; 00809 if (testedInPart1.IsKey(cp)) continue; 00810 TIntV x, v; x.Add(cp); 00811 NFC_(x, x); // x == NFC(x) 00812 NFD_(x, x); // x == NFD(x) 00813 NFKC_(x, x); // x == NFKC(x) 00814 NFKD_(x, x); // x == NFKD(x) 00815 nOther += 1; 00816 } 00817 #undef AssE_ 00818 #undef NFC_ 00819 #undef NFD_ 00820 #undef NFKC_ 00821 #undef NFKD_ 00822 printf("TUniChDb::TestComposition: %d lines processed + %d other individual codepoints.\n", nLines, nOther); 00823 } 00824 00825 //----------------------------------------------------------------------------- 00826 // TUniChDb -- case conversion tests 00827 //----------------------------------------------------------------------------- 00828 00829 void TUniChDb::TestCaseConversion(const TStr& source, const TStr& trueLc, 00830 const TStr& trueTc, const TStr& trueUc, 00831 bool turkic, bool lithuanian) 00832 { 00833 TIntV src; 00834 TUcdFileReader::ParseCodePointList(source, src); 00835 FILE *f = stderr; 00836 for (int i = 0; i < 3; i++) 00837 { 00838 TCaseConversion how = (i == 0) ? ccLower : (i == 1) ? ccTitle : ccUpper; 00839 const TStr &trueDestS = (how == ccLower ? trueLc : how == ccTitle ? trueTc : trueUc); 00840 TIntV trueDest; TUcdFileReader::ParseCodePointList(trueDestS, trueDest); 00841 TIntV dest; 00842 GetCaseConverted(src, 0, src.Len(), dest, true, how, turkic, lithuanian); 00843 bool ok = (dest.Len() == trueDest.Len()); 00844 if (ok) for (int i = 0; i < dest.Len() && ok; i++) ok = ok && (dest[i] == trueDest[i]); 00845 if (ok) continue; 00846 fprintf(f, "%s(", (how == ccLower ? "toLowercase" : how == ccTitle ? "toTitlecase" : "toUppercase")); 00847 for (int i = 0; i < src.Len(); i++) fprintf(f, "%s%04x", (i == 0 ? "" : " "), int(src[i])); 00848 fprintf(f, ")\nCorrect: ("); 00849 for (int i = 0; i < trueDest.Len(); i++) fprintf(f, "%s%04x", (i == 0 ? "" : " "), int(trueDest[i])); 00850 fprintf(f, ")\nOur output:("); 00851 for (int i = 0; i < dest.Len(); i++) fprintf(f, "%s%04x", (i == 0 ? "" : " "), int(dest[i])); 00852 fprintf(f, ")\n"); 00853 IAssert(ok); 00854 } 00855 } 00856 00857 void TUniChDb::TestCaseConversions() 00858 { 00859 // Because no thorough case-conversion test files have been provided as part 00860 // of the Unicode standard, we'll have to test things on a few test cases of our own. 00861 // - First, test some unconditional special mappings, such as 'ss', 'ffl', 'dz', etc. 00862 const TStr F = "0046 ", L = "004C ", S = "0053 ", T = "0054 ", W = "0057 "; 00863 const TStr f = "0066 ", l = "006c ", s = "0073 ", t = "0074 ", w = "0077 "; 00864 const TStr ss = "00df ", ffl = "fb04 ", longs = "017f ", longst = "fb05 ", wRing = "1e98 ", Ring = "030a "; 00865 const TStr DZ = "01c4 ", Dz = "01c5 ", dz = "01c6 "; 00866 const TStr space = "0020 ", Grave = "0300 "; 00867 TestCaseConversion( 00868 F + L + s + t + space + Dz + w + T + ss + wRing + space + longs + DZ + space + dz + longst, // source 00869 f + l + s + t + space + dz + w + t + ss + wRing + space + longs + dz + space + dz + longst, // lowercase 00870 F + l + s + t + space + Dz + w + t + ss + wRing + space + S + dz + space + Dz + longst, // titlecase 00871 F + L + S + T + space + DZ + W + T + S + S + W + Ring + space + S + DZ + space + DZ + S + T, // uppercase 00872 false, false); 00873 // - Dotted I, dotless i, etc., but with turkic == false. 00874 const TStr I = "0049 ", J = "004a ", i = "0069 ", j = "006a ", iDotless = "0131 ", IDot = "0130 ", DotA = "0307 "; 00875 TestCaseConversion( 00876 s + I + t + i + w + iDotless + f + IDot + l + space + iDotless + DotA + f + I + DotA + s, // source 00877 s + i + t + i + w + iDotless + f + i + DotA + l + space + iDotless + DotA + f + i + DotA + s, // lowercase 00878 S + i + t + i + w + iDotless + f + i + DotA + l + space + I + DotA + f + i + DotA + s, // titlecase 00879 S + I + T + I + W + I + F + IDot + L + space + I + DotA + F + I + DotA + S, // uppercase 00880 false, false); 00881 // - Sigma (final vs. non-final forms). 00882 const TStr Sigma = "03a3 ", sigma = "03c3 ", fsigma = "03c2 "; 00883 TestCaseConversion( 00884 Sigma + s + space + s + Sigma + space + s + Sigma + s + space + Sigma + S + Sigma + space + Sigma, // source 00885 sigma + s + space + s + fsigma + space + s + sigma + s + space + sigma + s + fsigma + space + sigma, // lowercase 00886 Sigma + s + space + S + fsigma + space + S + sigma + s + space + Sigma + s + fsigma + space + Sigma, // titlecase 00887 Sigma + S + space + S + Sigma + space + S + Sigma + S + space + Sigma + S + Sigma + space + Sigma, // uppercase 00888 false, false); 00889 TestCaseConversion( 00890 sigma + s + space + s + sigma + space + s + sigma + s + space + sigma + S + sigma + space + sigma, // source 00891 sigma + s + space + s + sigma + space + s + sigma + s + space + sigma + s + sigma + space + sigma, // lowercase 00892 Sigma + s + space + S + sigma + space + S + sigma + s + space + Sigma + s + sigma + space + Sigma, // titlecase 00893 Sigma + S + space + S + Sigma + space + S + Sigma + S + space + Sigma + S + Sigma + space + Sigma, // uppercase 00894 false, false); 00895 TestCaseConversion( 00896 fsigma + s + space + s + fsigma + space + s + fsigma + s + space + fsigma + S + fsigma + space + fsigma, // source 00897 fsigma + s + space + s + fsigma + space + s + fsigma + s + space + fsigma + s + fsigma + space + fsigma, // lowercase 00898 Sigma + s + space + S + fsigma + space + S + fsigma + s + space + Sigma + s + fsigma + space + Sigma, // titlecase 00899 Sigma + S + space + S + Sigma + space + S + Sigma + S + space + Sigma + S + Sigma + space + Sigma, // uppercase 00900 false, false); 00901 const TStr nonSA = "0315 0321 0322 "; // characters that are neither ccStarter nor ccAbove 00902 // Special case mappings for Turkic languages: 00903 // - After_I 00904 TestCaseConversion( 00905 s + I + t + i + w + iDotless + f + IDot + l + space + iDotless + DotA + f + I + DotA + J + DotA + I + Grave + DotA + I + DotA + DotA + I + nonSA + DotA + s, // source 00906 s + iDotless + t + i + w + iDotless + f + i + l + space + iDotless + DotA + f + i + j + DotA + iDotless + Grave + DotA + i + DotA + i + nonSA + s, // lowercase 00907 S + iDotless + t + i + w + iDotless + f + i + l + space + I + DotA + f + i + j + DotA + iDotless + Grave + DotA + i + DotA + i + nonSA + s, // titlecase 00908 S + I + T + IDot + W + I + F + IDot + L + space + I + DotA + F + I + DotA + J + DotA + I + Grave + DotA + I + DotA + DotA + I + nonSA + DotA + S, // uppercase 00909 true, false); // turkic 00910 // - Not_Before_Dot 00911 TestCaseConversion( 00912 I + Grave + t + I + DotA + f + I + nonSA + DotA + j + space + I + nonSA + DotA + space + I + Grave + t, // source 00913 iDotless + Grave + t + i + f + i + nonSA + j + space + i + nonSA + space + iDotless + Grave + t, // lowercase 00914 I + Grave + t + i + f + i + nonSA + j + space + I + nonSA + DotA + space + I + Grave + t, // titlecase 00915 I + Grave + T + I + DotA + F + I + nonSA + DotA + J + space + I + nonSA + DotA + space + I + Grave + T, // uppercase 00916 true, false); // turkic 00917 // Special case mappings for Lithuanian: 00918 // - After_Soft_Dotted [note: I + DotA turns into i + DotA + DotA when lowercasing due to More_Above] 00919 TestCaseConversion( 00920 i + DotA + t + i + Grave + DotA + f + i + DotA + DotA + f + i + nonSA + DotA + I + DotA + t + DotA + i + DotA + Grave, // source 00921 i + DotA + t + i + Grave + DotA + f + i + DotA + DotA + f + i + nonSA + DotA + i + DotA + DotA + t + DotA + i + DotA + Grave, // lowercase 00922 I + t + i + Grave + DotA + f + i + DotA + DotA + f + i + nonSA + DotA + i + DotA + DotA + t + DotA + i + DotA + Grave, // titlecase 00923 I + T + I + Grave + DotA + F + I + DotA + F + I + nonSA + I + DotA + T + DotA + I + Grave, // uppercase 00924 false, true); // lithuanian 00925 // - More_Above [note: j + DotA turns into just J when uppercasing due to After_Soft_Dotted] 00926 TestCaseConversion( 00927 J + Grave + space + J + nonSA + DotA + space + j + Grave + space + j + DotA + space + J + nonSA + J + nonSA + Grave + space + j + nonSA, // source 00928 j + DotA + Grave + space + j + DotA + nonSA + DotA + space + j + Grave + space + j + DotA + space + j + nonSA + j + DotA + nonSA + Grave + space + j + nonSA, // lowercase 00929 J + Grave + space + J + nonSA + DotA + space + J + Grave + space + J + space + J + nonSA + j + DotA + nonSA + Grave + space + J + nonSA, // titlecase 00930 J + Grave + space + J + nonSA + DotA + space + J + Grave + space + J + space + J + nonSA + J + nonSA + Grave + space + J + nonSA, // uppercase 00931 false, true); // lithuanian 00932 // SoftDotted [^ Starter Above]* 0307 --(uc,tc)--> brez 0307 00933 // SoftDotted [^ Starter Above]* 0307 --( 00934 //TestCaseConversion("", "", "", "", false, false); 00935 } 00936 00937 //----------------------------------------------------------------------------- 00938 // TUniChDb -- initialization from the text files 00939 //----------------------------------------------------------------------------- 00940 00941 void TUniChDb::LoadTxt_ProcessDecomposition(TUniChInfo& ci, TStr s) 00942 { 00943 if (s.Empty()) return; 00944 if (s[0] == '<') { 00945 int i = s.SearchCh('>'); IAssert(i > 0); 00946 ci.flags |= ucfCompatibilityDecomposition; 00947 s = s.GetSubStr(i + 1, s.Len() - 1); s.ToTrunc(); } 00948 TIntV dec; TUcdFileReader::ParseCodePointList(s, dec); 00949 IAssert(dec.Len() > 0); 00950 ci.decompOffset = decompositions.Len(); 00951 decompositions.AddV(dec); decompositions.Add(-1); 00952 } 00953 00954 void TUniChDb::InitPropList(const TStr& basePath) 00955 { 00956 TUcdFileReader reader; TStrV fields; int nCps = 0, nLines = 0; 00957 reader.Open(CombinePath(basePath, GetPropListFn())); 00958 TSubcatHelper helper(*this); 00959 while (reader.GetNextLine(fields)) 00960 { 00961 IAssert(fields.Len() == 2); 00962 int from, to; reader.ParseCodePointRange(fields[0], from, to); 00963 TStr s = fields[1]; 00964 TUniChProperties prop = TUniChProperties(0); TUniChPropertiesX propx = TUniChPropertiesX(0); 00965 if (s == "White_Space") prop = ucfPrWhiteSpace; 00966 else if (s == "Bidi_Control") prop = ucfPrBidiControl; 00967 else if (s == "Join_Control") prop = ucfPrJoinControl; 00968 else if (s == "Dash") prop = ucfPrDash; 00969 else if (s == "Hyphen") prop = ucfPrHyphen; 00970 else if (s == "Quotation_Mark") prop = ucfPrQuotationMark; 00971 else if (s == "Terminal_Punctuation") prop = ucfPrTerminalPunctuation; 00972 else if (s == "Other_Math") propx = ucfPxOtherMath; 00973 else if (s == "Hex_Digit") prop = ucfPrHexDigit; 00974 else if (s == "ASCII_Hex_Digit") prop = ucfPrAsciiHexDigit; 00975 else if (s == "Other_Alphabetic") propx = ucfPxOtherAlphabetic; 00976 else if (s == "Ideographic") prop = ucfPrIdeographic; 00977 else if (s == "Diacritic") prop = ucfPrDiacritic; 00978 else if (s == "Extender") prop = ucfPrExtender; 00979 else if (s == "Other_Lowercase") propx = ucfPxOtherLowercase; 00980 else if (s == "Other_Uppercase") propx = ucfPxOtherUppercase; 00981 else if (s == "Noncharacter_Code_Point") prop = ucfPrNoncharacterCodePoint; 00982 else if (s == "Other_Grapheme_Extend") propx = ucfPxOtherGraphemeExtend; 00983 else if (s == "IDS_Binary_Operator") propx = ucfPxIdsBinaryOperator; 00984 else if (s == "IDS_Trinary_Operator") propx = ucfPxIdsTrinaryOperator; 00985 else if (s == "Radical") propx = ucfPxRadical; 00986 else if (s == "Unified_Ideograph") propx = ucfPxUnifiedIdeograph; 00987 else if (s == "Other_Default_Ignorable_Code_Point") propx = ucfPxOtherDefaultIgnorableCodePoint; 00988 else if (s == "Deprecated") prop = ucfPrDeprecated; 00989 else if (s == "Soft_Dotted") prop = ucfPrSoftDotted; 00990 else if (s == "Logical_Order_Exception") prop = ucfPrLogicalOrderException; 00991 else if (s == "Other_ID_Start") propx = ucfPxOtherIdStart; 00992 else if (s == "Other_ID_Continue") propx = ucfPxOtherIdContinue; 00993 else if (s == "STerm") prop = ucfPrSTerm; 00994 else if (s == "Variation_Selector") prop = ucfPrVariationSelector; 00995 else if (s == "Pattern_White_Space") prop = ucfPrPatternWhiteSpace; 00996 else if (s == "Pattern_Syntax") prop = ucfPrPatternSyntax; 00997 else FailR(s.CStr()); 00998 helper.ProcessComment(reader); 00999 for (int cp = from; cp <= to; cp++) { 01000 int i = h.GetKeyId(cp); if (i < 0) { i = h.AddKey(cp); helper.SetCat(cp); } 01001 TUniChInfo &ci = h[i]; helper.TestCat(cp); 01002 if (prop) { IAssert(! ci.IsProperty(prop)); ci.SetProperty(prop); } 01003 if (propx) { IAssert(! ci.IsPropertyX(propx)); ci.SetPropertyX(propx); } 01004 nCps++; } 01005 nLines++; 01006 } 01007 reader.Close(); 01008 printf("TUniChDb::InitPropList: %d lines, %d code points.\n", nLines, nCps); 01009 } 01010 01011 void TUniChDb::InitDerivedCoreProperties(const TStr& basePath) 01012 { 01013 TUcdFileReader reader; TStrV fields; int nCps = 0, nLines = 0; 01014 reader.Open(CombinePath(basePath, GetDerivedCorePropsFn())); 01015 TSubcatHelper helper(*this); 01016 while (reader.GetNextLine(fields)) 01017 { 01018 IAssert(fields.Len() == 2); 01019 int from, to; reader.ParseCodePointRange(fields[0], from, to); 01020 TStr s = fields[1]; 01021 TUniChFlags flag = ucfCompatibilityDecomposition; 01022 if (s == "Math") flag = ucfDcpMath; 01023 else if (s == "Alphabetic") flag = ucfDcpAlphabetic; 01024 else if (s == "Lowercase") flag = ucfDcpLowercase; 01025 else if (s == "Uppercase") flag = ucfDcpUppercase; 01026 else if (s == "ID_Start") flag = ucfDcpIdStart; 01027 else if (s == "ID_Continue") flag = ucfDcpIdContinue; 01028 else if (s == "XID_Start") flag = ucfDcpXidStart; 01029 else if (s == "XID_Continue") flag = ucfDcpXidContinue; 01030 else if (s == "Default_Ignorable_Code_Point") flag = ucfDcpDefaultIgnorableCodePoint; 01031 else if (s == "Grapheme_Extend") flag = ucfDcpGraphemeExtend; 01032 else if (s == "Grapheme_Base") flag = ucfDcpGraphemeBase; 01033 else if (s == "Grapheme_Link") continue; // this flag is deprecated; test for combClass == Virama instead 01034 else FailR(s.CStr()); 01035 // If we add new codepoints to the hash table, we should also set their category. 01036 // This is supposed to be provided in the comment, e.g. "# Cf SOFT HYPHEN". 01037 helper.ProcessComment(reader); 01038 // 01039 for (int cp = from; cp <= to; cp++) { 01040 int i = h.GetKeyId(cp); if (i < 0) { i = h.AddKey(cp); helper.SetCat(cp); } 01041 helper.TestCat(cp); 01042 TUniChInfo &ci = h[i]; IAssert(! ci.IsDcpFlag(flag)); 01043 ci.SetDcpFlag(flag); nCps++; } 01044 nLines++; 01045 } 01046 reader.Close(); 01047 printf("TUniChDb::InitDerivedCoreProperties: %d lines, %d code points.\n", nLines, nCps); 01048 } 01049 01050 void TUniChDb::InitLineBreaks(const TStr& basePath) 01051 { 01052 // Clear old linebreak values. 01053 ushort xx = TUniChInfo::LineBreak_Unknown; 01054 for (int i = h.FFirstKeyId(); h.FNextKeyId(i); ) h[i].lineBreak = xx; 01055 // Read LineBreak.txt. 01056 TUcdFileReader reader; TStrV fields; 01057 reader.Open(CombinePath(basePath, GetLineBreakFn())); 01058 int nLines = 0, nCps = 0; 01059 while (reader.GetNextLine(fields)) 01060 { 01061 IAssert(fields.Len() == 2); 01062 int from, to; reader.ParseCodePointRange(fields[0], from, to); 01063 TStr s = fields[1]; IAssert(s.Len() == 2); 01064 ushort us = TUniChInfo::GetLineBreakCode(s[0], s[1]); 01065 if (us == xx) continue; 01066 for (int cp = from; cp <= to; cp++) { 01067 int i = h.GetKeyId(cp); if (i < 0) { i = h.AddKey(cp); 01068 printf("TUniChDb::InitLineBreaks: warning, adding codepoint %d, its category will remain unknown.\n", cp); } 01069 IAssert(h[i].lineBreak == xx); 01070 h[i].lineBreak = us; nCps++; } 01071 nLines++; 01072 } 01073 reader.Close(); 01074 printf("TUniChDb::InitLineBreaks: %d lines, %d codepoints processed (excluding \'xx\' values).\n", nLines, nCps); 01075 } 01076 01077 void TUniChDb::InitScripts(const TStr& basePath) 01078 { 01079 TUcdFileReader reader; TStrV fields; 01080 reader.Open(CombinePath(basePath, GetScriptsFn())); 01081 TSubcatHelper helper(*this); 01082 while (reader.GetNextLine(fields)) 01083 { 01084 int from, to; reader.ParseCodePointRange(fields[0], from, to); 01085 TStr scriptName = fields[1]; 01086 int scriptNo = scripts.GetKeyId(scriptName); 01087 if (scriptNo < 0) { scriptNo = scripts.AddKey(scriptName); scripts[scriptNo] = 0; } 01088 IAssert(scriptNo >= 0 && scriptNo < SCHAR_MAX); // because TUniChInfo.script is a signed char 01089 scripts[scriptNo] += 1; 01090 helper.ProcessComment(reader); 01091 for (int cp = from; cp <= to; cp++) { 01092 int i = h.GetKeyId(cp); if (i < 0) { i = h.AddKey(cp); helper.SetCat(cp); } 01093 helper.TestCat(cp); 01094 TUniChInfo &ci = h[i]; ci.script = scriptNo; } 01095 } 01096 reader.Close(); 01097 scripts.AddDat(GetScriptNameUnknown()) = 0; 01098 printf("TUniChDb::InitScripts: %d scripts: ", scripts.Len()); 01099 if (AlwaysFalse()) for (int i = scripts.FFirstKeyId(); scripts.FNextKeyId(i); ) 01100 printf(" %d:%s (%d)", i, scripts.GetKey(i).CStr(), int(scripts[i])); 01101 printf("\n"); 01102 } 01103 01104 void TUniChDb::InitWordAndSentenceBoundaryFlags(const TStr& basePath) 01105 { 01106 // UAX #29, sec. 4.1 and 5.1. 01107 // Note: these flags can also be initialized from auxiliary\\WordBreakProperty.txt. 01108 int katakana = GetScriptByName(GetScriptNameKatakana()); IAssert(katakana >= 0); 01109 int hiragana = GetScriptByName(GetScriptNameHiragana()); IAssert(hiragana >= 0); 01110 // Clear any existing word-boundary flags and initialize them again. 01111 for (int i = h.FFirstKeyId(); h.FNextKeyId(i); ) 01112 { 01113 const int cp = h.GetKey(i); TUniChInfo& ci = h[i]; 01114 ci.ClrWbAndSbFlags(); 01115 // Word-boundary flags. 01116 if (ci.subCat == ucOtherFormat && cp != 0x200c && cp != 0x200d) ci.SetWbFlag(ucfWbFormat); 01117 if (ci.script == katakana) ci.SetWbFlag(ucfWbKatakana); 01118 if (ci.lineBreak == TUniChInfo::LineBreak_InfixNumeric && cp != 0x3a) ci.SetWbFlag(ucfWbMidNum); 01119 if (ci.lineBreak == TUniChInfo::LineBreak_Numeric) ci.SetWbFlag(ucfWbNumeric); 01120 if (ci.subCat == ucPunctuationConnector) ci.SetWbFlag(ucfWbExtendNumLet); 01121 // Sentence-boundary flags. Some are identical to some word-boundary flags. 01122 if (cp == 0xa || cp == 0xd || cp == 0x85 || cp == 0x2028 || cp == 0x2029) ci.SetSbFlag(ucfSbSep); 01123 if (ci.subCat == ucOtherFormat && cp != 0x200c && cp != 0x200d) ci.SetSbFlag(ucfSbFormat); 01124 if (ci.IsWhiteSpace() && ! ci.IsSbFlag(ucfSbSep) && cp != 0xa0) ci.SetSbFlag(ucfSbSp); 01125 if (ci.IsLowercase() && ! ci.IsGraphemeExtend()) ci.SetSbFlag(ucfSbLower); 01126 if (ci.IsUppercase() || ci.subCat == ucLetterTitlecase) ci.SetSbFlag(ucfSbUpper); 01127 if ((ci.IsAlphabetic() || cp == 0xa0 || cp == 0x5f3) && ! ci.IsSbFlag(ucfSbLower) && ! ci.IsSbFlag(ucfSbUpper) && ! ci.IsGraphemeExtend()) ci.SetSbFlag(ucfSbOLetter); 01128 if (ci.lineBreak == TUniChInfo::LineBreak_Numeric) ci.SetSbFlag(ucfSbNumeric); 01129 if (cp == 0x2e) ci.SetSbFlag(ucfSbATerm); 01130 // Note: UAX #29 says that if the property STerm = true, then the character should belong to the STerm class for 01131 // the purposes of sentence-boundary detection. Now in PropList.txt there is no doubt that 002E has the STerm 01132 // property; thus, it should also belong to the STerm sentence-boundary class. However, in 01133 // SentenceBreakProperty.txt, 002E is only listed in the ATerm class, but not in the STerm class. 01134 if (ci.IsSTerminal() && cp != 0x2e) ci.SetSbFlag(ucfSbSTerm); 01135 if ((ci.subCat == ucPunctuationOpen || ci.subCat == ucPunctuationClose || ci.lineBreak == TUniChInfo::LineBreak_Quotation) && cp != 0x5f3 && ! ci.IsSbFlag(ucfSbATerm) && ! ci.IsSbFlag(ucfSbSTerm)) ci.SetSbFlag(ucfSbClose); 01136 } 01137 // Some additional characters for Katakana and MidLetter. 01138 TIntV v = (VB, 0x3031, 0x3032, 0x3033, 0x3034, 0x3035, 0x309b, 0x309c, 0x30a0, 0x30fc, 0xff70, 0xff9e, 0xff9f); 01139 for (int i = 0; i < v.Len(); i++) h.GetDat(v[i]).SetWbFlag(ucfWbKatakana); 01140 v = (VB, 0x27, 0xb7, 0x5f4, 0x2019, 0x2027, 0x3a); 01141 for (int i = 0; i < v.Len(); i++) h.GetDat(v[i]).SetWbFlag(ucfWbMidLetter); 01142 // WbALetter depends on Katakana, so it cannot be initialized earlier. 01143 for (int i = h.FFirstKeyId(); h.FNextKeyId(i); ) 01144 { 01145 const int cp = h.GetKey(i); TUniChInfo& ci = h[i]; 01146 if ((ci.IsAlphabetic() || cp == 0x5f3) && ! ci.IsIdeographic() && ! ci.IsWbFlag(ucfWbKatakana) && ci.lineBreak != TUniChInfo::LineBreak_ComplexContext && ci.script != hiragana && ! ci.IsGraphemeExtend()) 01147 ci.SetWbFlag(ucfWbALetter); 01148 } 01149 // An alternative is to extract the flags from WordBreakProperty.txt. 01150 // The results should be the same. 01151 {TUcdFileReader reader; TStrV fields; 01152 reader.Open(CombinePath(CombinePath(basePath, GetAuxiliaryDir()), GetWordBreakPropertyFn())); 01153 THash<TInt, TInt> hh; 01154 while (reader.GetNextLine(fields)) 01155 { 01156 IAssert(fields.Len() == 2); 01157 int from, to; reader.ParseCodePointRange(fields[0], from, to); 01158 TStr s = fields[1]; 01159 TUniChFlags flag = ucfCompatibilityDecomposition; 01160 if (s == "Format") flag = ucfWbFormat; 01161 else if (s == "Katakana") flag = ucfWbKatakana; 01162 else if (s == "ALetter") flag = ucfWbALetter; 01163 else if (s == "MidLetter") flag = ucfWbMidLetter; 01164 else if (s == "MidNum") flag = ucfWbMidNum; 01165 else if (s == "Numeric") flag = ucfWbNumeric; 01166 else if (s == "ExtendNumLet") flag = ucfWbExtendNumLet; 01167 else FailR(s.CStr()); 01168 for (int c = from; c <= to; c++) { 01169 int i = hh.GetKeyId(c); if (i < 0) hh.AddDat(c, flag); 01170 else hh[i].Val |= flag; } 01171 } 01172 reader.Close(); 01173 TIntV cps; for (int i = h.FFirstKeyId(); h.FNextKeyId(i); ) cps.Add(h.GetKey(i)); 01174 for (int i = hh.FFirstKeyId(); hh.FNextKeyId(i); ) cps.Add(hh.GetKey(i)); 01175 cps.Sort(); cps.Merge(); 01176 for (int i = 0; i < cps.Len(); i++) 01177 { 01178 int cp = cps[i]; 01179 int flags1 = 0; if (h.IsKey(cp)) flags1 = h.GetDat(cp).GetWbFlags(); 01180 int flags2 = 0; if (hh.IsKey(cp)) flags2 = hh.GetDat(cp); 01181 flags1 &= ~ucfSbSep; flags2 &= ~ucfSbSep; 01182 if (flags1 != flags2) { 01183 printf("cp = %04x: flags1 = %08x flags2 = %08x xor = %08x\n", cp, flags1, flags2, flags1 ^ flags2); 01184 Fail; } 01185 }} 01186 // Likewise, for sentence boundary flags we have SentenceBreakProperty.txt. 01187 {TUcdFileReader reader; TStrV fields; 01188 reader.Open(CombinePath(CombinePath(basePath, GetAuxiliaryDir()), GetSentenceBreakPropertyFn())); 01189 THash<TInt, TInt> hh; 01190 while (reader.GetNextLine(fields)) 01191 { 01192 IAssert(fields.Len() == 2); 01193 int from, to; reader.ParseCodePointRange(fields[0], from, to); 01194 TStr s = fields[1]; 01195 TUniChFlags flag = ucfCompatibilityDecomposition; 01196 if (s == "Sep") flag = ucfSbSep; 01197 else if (s == "Format") flag = ucfSbFormat; 01198 else if (s == "Sp") flag = ucfSbSp; 01199 else if (s == "Lower") flag = ucfSbLower; 01200 else if (s == "Upper") flag = ucfSbUpper; 01201 else if (s == "OLetter") flag = ucfSbOLetter; 01202 else if (s == "Numeric") flag = ucfSbNumeric; 01203 else if (s == "ATerm") flag = ucfSbATerm; 01204 else if (s == "STerm") flag = ucfSbSTerm; 01205 else if (s == "Close") flag = ucfSbClose; 01206 else FailR(s.CStr()); 01207 for (int c = from; c <= to; c++) { 01208 int i = hh.GetKeyId(c); if (i < 0) hh.AddDat(c, flag); 01209 else hh[i].Val |= flag; } 01210 } 01211 reader.Close(); 01212 TIntV cps; for (int i = h.FFirstKeyId(); h.FNextKeyId(i); ) cps.Add(h.GetKey(i)); 01213 for (int i = hh.FFirstKeyId(); hh.FNextKeyId(i); ) cps.Add(hh.GetKey(i)); 01214 cps.Sort(); cps.Merge(); 01215 for (int i = 0; i < cps.Len(); i++) 01216 { 01217 int cp = cps[i]; 01218 int flags1 = 0; if (h.IsKey(cp)) flags1 = h.GetDat(cp).GetSbFlags(); 01219 int flags2 = 0; if (hh.IsKey(cp)) flags2 = hh.GetDat(cp); 01220 if (flags1 != flags2) { 01221 printf("cp = %04x: flags1 = %08x [%s] flags2 = %08x [%s] xor = %08x\n", cp, 01222 flags1, TUniChInfo::GetSbFlagsStr(flags1).CStr(), 01223 flags2, TUniChInfo::GetSbFlagsStr(flags2).CStr(), 01224 flags1 ^ flags2); 01225 Fail; } 01226 }} 01227 } 01228 01229 void TUniChDb::InitSpecialCasing(const TStr& basePath) 01230 { 01231 TUcdFileReader reader; TStrV fields; 01232 reader.Open(CombinePath(basePath, GetSpecialCasingFn())); 01233 while (reader.GetNextLine(fields)) 01234 { 01235 IAssert(fields.Len() == 5 || fields.Len() == 6); 01236 IAssert(fields.Last().Empty()); 01237 // Skip conditional mappings -- they will be hardcoded in the GetCaseConverted method. 01238 TStr conditions = ""; 01239 if (fields.Len() == 6) conditions = fields[4]; 01240 conditions.ToTrunc(); if (! conditions.Empty()) continue; 01241 // Keep the other mappings. 01242 const int cp = reader.ParseCodePoint(fields[0]); 01243 TIntV v; reader.ParseCodePointList(fields[1], v); 01244 specialCasingLower.AddDat(cp, v); 01245 reader.ParseCodePointList(fields[2], v); 01246 specialCasingTitle.AddDat(cp, v); 01247 reader.ParseCodePointList(fields[3], v); 01248 specialCasingUpper.AddDat(cp, v); 01249 } 01250 reader.Close(); 01251 } 01252 01253 void TUniChDb::LoadTxt(const TStr& basePath) 01254 { 01255 Clr(); 01256 // Set up a hash table with enough ports that there will be more or less no chains longer than 1 element. 01257 h = THash<TInt, TUniChInfo>(196613, true); 01258 // 01259 caseFolding.LoadTxt(CombinePath(basePath, GetCaseFoldingFn())); 01260 // 01261 TUcdFileReader reader; TStrV fields; TIntH seen; 01262 reader.Open(CombinePath(basePath, GetUnicodeDataFn())); 01263 while (reader.GetNextLine(fields)) 01264 { 01265 // Codepoint. 01266 int cp = reader.ParseCodePoint(fields[0]); 01267 IAssert(! seen.IsKey(cp)); seen.AddKey(cp); 01268 TUniChInfo& ci = h.AddDat(cp); 01269 // Name. 01270 ci.nameOffset = charNames.AddStr(fields[1]); 01271 // Category. 01272 TStr& s = fields[2]; IAssert(s.Len() == 2); 01273 ci.chCat = s[0]; ci.chSubCat = s[1]; 01274 // Canonical combining class. 01275 s = fields[3]; IAssert(s.Len() > 0); 01276 int i; bool ok = s.IsInt(true, TUCh::Mn, TUCh::Mx, i); IAssertR(ok, s); 01277 ci.combClass = (uchar) i; 01278 // Decomposition type and mapping. 01279 LoadTxt_ProcessDecomposition(ci, fields[5]); 01280 // Simple case mappings. 01281 s = fields[12]; ci.simpleUpperCaseMapping = (! s.Empty() ? reader.ParseCodePoint(s) : -1); 01282 s = fields[13]; ci.simpleLowerCaseMapping = (! s.Empty() ? reader.ParseCodePoint(s) : -1); 01283 s = fields[14]; ci.simpleTitleCaseMapping = (! s.Empty() ? reader.ParseCodePoint(s) : -1); 01284 // 01285 ci.InitAfterLoad(); // initializes ci.cat, ci.subCat 01286 } 01287 reader.Close(); 01288 // 01289 InitScripts(basePath); 01290 // 01291 InitPropList(basePath); 01292 InitDerivedCoreProperties(basePath); 01293 InitLineBreaks(basePath); 01294 InitSpecialCasing(basePath); 01295 // Process the composition exclusions (UAX #15, sec. 6). 01296 for (int i = h.FFirstKeyId(); h.FNextKeyId(i); ) 01297 { 01298 TUniChInfo& ci = h[i]; 01299 int ofs = ci.decompOffset; if (ofs < 0) continue; 01300 int n = 0; while (decompositions[ofs + n] >= 0) n++; 01301 IAssert(n > 0); 01302 // Singleton decompositions. 01303 if (n == 1) { ci.flags |= ucfCompositionExclusion; continue; } 01304 // Non-starter decompositions. 01305 int cp1 = decompositions[ofs]; 01306 IAssert(h.IsKey(cp1)); 01307 uchar ccc = h.GetDat(cp1).combClass; 01308 if (ccc != TUniChInfo::ccStarter) { ci.flags |= ucfCompositionExclusion; continue; } 01309 } 01310 // Process the composition exclusion table. 01311 reader.Open(CombinePath(basePath, GetCompositionExclusionsFn())); 01312 int nExclusionTable = 0; 01313 while (reader.GetNextLine(fields)) 01314 { 01315 IAssert(fields.Len() == 1); 01316 int cp = reader.ParseCodePoint(fields[0]); 01317 int i = h.GetKeyId(cp); IAssert(i >= 0); 01318 h[i].flags |= ucfCompositionExclusion; 01319 nExclusionTable++; 01320 } 01321 reader.Close(); 01322 // Prepare the inverted index for composition pairs. 01323 for (int i = h.FFirstKeyId(); h.FNextKeyId(i); ) 01324 { 01325 int cp = h.GetKey(i); 01326 TUniChInfo& ci = h[i]; 01327 int ofs = ci.decompOffset; if (ofs < 0) continue; 01328 if (ci.IsCompositionExclusion()) continue; 01329 if (ci.IsCompatibilityDecomposition()) continue; 01330 int n = 0; while (decompositions[ofs + n] >= 0) n++; 01331 if (n != 2) continue; 01332 TIntPr pr = TIntPr(decompositions[ofs], decompositions[ofs + 1]); 01333 IAssert(! inverseDec.IsKey(pr)); 01334 IAssert(ci.combClass == TUniChInfo::ccStarter); 01335 inverseDec.AddDat(pr, cp); 01336 } 01337 printf("TUniChDb(%s): %d chars in h, %d in decomp inverse index; %d in decomp vector; %d in exclusion table\n", 01338 basePath.CStr(), h.Len(), inverseDec.Len(), decompositions.Len(), nExclusionTable); 01339 // Before calling InitWordBoundaryFlags(), scripts must have been initialized, as well as 01340 // flags such as Alphabetic, Word_Break, and Grapheme_Extend. 01341 InitWordAndSentenceBoundaryFlags(basePath); // Note: scripts must have been initialized by this point. 01342 // Make sure that Hangul combined characters are treated as stareters. 01343 for (int cp = HangulSBase; cp < HangulSBase + HangulSCount; cp++) 01344 { 01345 int j = h.GetKeyId(cp); if (j < 0) continue; 01346 TUniChInfo& ci = h[j]; 01347 if (ci.combClass == TUniChInfo::ccInvalid) ci.combClass = TUniChInfo::ccStarter; 01348 IAssert(ci.combClass == TUniChInfo::ccStarter); 01349 } 01350 // There should be no more additions to 'h' beyond this point. 01351 const int oldHLen = h.Len(); 01352 // Provide default (identity) case mappings if any were missing from UnicodeData.txt 01353 // (or if any entirely new characters were added later, e.g. while reading LineBreaks.txt). 01354 int scriptUnknown = GetScriptByName(GetScriptNameUnknown()); 01355 for (int i = h.FFirstKeyId(); h.FNextKeyId(i); ) 01356 { 01357 int cp = h.GetKey(i); TUniChInfo &ci = h[i]; 01358 if (ci.simpleLowerCaseMapping < 0) ci.simpleLowerCaseMapping = cp; 01359 if (ci.simpleUpperCaseMapping < 0) ci.simpleUpperCaseMapping = cp; 01360 if (ci.simpleTitleCaseMapping < 0) ci.simpleTitleCaseMapping = cp; 01361 if (ci.script < 0) ci.script = scriptUnknown; 01362 } 01363 IAssert(h.Len() == oldHLen); 01364 } 01365 01366 void TUniChDb::SaveBin(const TStr& fnBinUcd) 01367 { 01368 PSOut SOut=TFOut::New(fnBinUcd); 01369 Save(*SOut); 01370 } 01371 01372 void TUniChDb::InitAfterLoad() 01373 { 01374 scriptUnknown = GetScriptByName(GetScriptNameUnknown()); IAssert(scriptUnknown >= 0); 01375 } 01376 01377 //----------------------------------------------------------------------------- 01378 // TUniChDb -- main test driver 01379 //----------------------------------------------------------------------------- 01380 01381 void TUniChDb::Test(const TStr& basePath) 01382 { 01383 TStr fnBin = CombinePath(basePath, GetBinFn()); 01384 if (true || ! TFile::Exists(fnBin)) 01385 { 01386 // Test LoadTxt. 01387 LoadTxt(basePath); 01388 // Test Save. 01389 {PSOut SOut = TFOut::New(fnBin); 01390 Save(*SOut);} 01391 } 01392 // Test Load. 01393 this->~TUniChDb(); 01394 new(this) TUniChDb(); 01395 {PSIn SIn = TFIn::New(fnBin); 01396 Load(*SIn);} 01397 // Test the case folding. 01398 caseFolding.Test(); 01399 // Test the word breaking. 01400 TestWbFindNonIgnored(); 01401 // Test the sentence breaking. 01402 TestFindNextWordOrSentenceBoundary(basePath, true); 01403 TestFindNextWordOrSentenceBoundary(basePath, false); 01404 // Test composition and decomposition. 01405 TestComposition(basePath); 01406 // Test the case conversions. 01407 TestCaseConversions(); 01408 } 01409 01410 //----------------------------------------------------------------------------- 01411 // T8BitCodec -- a class for converting between 8-bit encodings and Unicode 01412 //----------------------------------------------------------------------------- 01413 01414 //----------------------------------------------------------------------------- 01415 // ISO-8859-2 01416 //----------------------------------------------------------------------------- 01417 01418 const int TEncoding_ISO8859_2::toUnicodeTable[6 * 16] = 01419 { 01420 /* 0xa0 */ 0x00a0, 0x0104, 0x02d8, 0x0141, 0x00a4, 0x013d, 0x015a, 0x00a7, 0x00a8, 0x0160, 0x015e, 0x0164, 0x0179, 0x00ad, 0x017d, 0x017b, 01421 /* 0xb0 */ 0x00b0, 0x0105, 0x02db, 0x0142, 0x00b4, 0x013e, 0x015b, 0x02c7, 0x00b8, 0x0161, 0x015f, 0x0165, 0x017a, 0x02dd, 0x017e, 0x017c, 01422 /* 0xc0 */ 0x0154, 0x00c1, 0x00c2, 0x0102, 0x00c4, 0x0139, 0x0106, 0x00c7, 0x010c, 0x00c9, 0x0118, 0x00cb, 0x011a, 0x00cd, 0x00ce, 0x010e, 01423 /* 0xd0 */ 0x0110, 0x0143, 0x0147, 0x00d3, 0x00d4, 0x0150, 0x00d6, 0x00d7, 0x0158, 0x016e, 0x00da, 0x0170, 0x00dc, 0x00dd, 0x0162, 0x00df, 01424 /* 0xe0 */ 0x0155, 0x00e1, 0x00e2, 0x0103, 0x00e4, 0x013a, 0x0107, 0x00e7, 0x010d, 0x00e9, 0x0119, 0x00eb, 0x011b, 0x00ed, 0x00ee, 0x010f, 01425 /* 0xf0 */ 0x0111, 0x0144, 0x0148, 0x00f3, 0x00f4, 0x0151, 0x00f6, 0x00f7, 0x0159, 0x016f, 0x00fa, 0x0171, 0x00fc, 0x00fd, 0x0163, 0x02d9 01426 }; 01427 01428 const int TEncoding_ISO8859_2::fromUnicodeTable1[14 * 16] = { 01429 /* U+00a0 */ 0x00a0, -1, -1, -1, 0x00a4, -1, -1, 0x00a7, 0x00a8, -1, -1, -1, -1, 0x00ad, -1, -1, 01430 /* U+00b0 */ 0x00b0, -1, -1, -1, 0x00b4, -1, -1, -1, 0x00b8, -1, -1, -1, -1, -1, -1, -1, 01431 /* U+00c0 */ -1, 0x00c1, 0x00c2, -1, 0x00c4, -1, -1, 0x00c7, -1, 0x00c9, -1, 0x00cb, -1, 0x00cd, 0x00ce, -1, 01432 /* U+00d0 */ -1, -1, -1, 0x00d3, 0x00d4, -1, 0x00d6, 0x00d7, -1, -1, 0x00da, -1, 0x00dc, 0x00dd, -1, 0x00df, 01433 /* U+00e0 */ -1, 0x00e1, 0x00e2, -1, 0x00e4, -1, -1, 0x00e7, -1, 0x00e9, -1, 0x00eb, -1, 0x00ed, 0x00ee, -1, 01434 /* U+00f0 */ -1, -1, -1, 0x00f3, 0x00f4, -1, 0x00f6, 0x00f7, -1, -1, 0x00fa, -1, 0x00fc, 0x00fd, -1, -1, 01435 /* U+0100 */ -1, -1, 0x00c3, 0x00e3, 0x00a1, 0x00b1, 0x00c6, 0x00e6, -1, -1, -1, -1, 0x00c8, 0x00e8, 0x00cf, 0x00ef, 01436 /* U+0110 */ 0x00d0, 0x00f0, -1, -1, -1, -1, -1, -1, 0x00ca, 0x00ea, 0x00cc, 0x00ec, -1, -1, -1, -1, 01437 /* U+0120 */ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 /* blank */, 01438 /* U+0130 */ -1, -1, -1, -1, -1, -1, -1, -1, -1, 0x00c5, 0x00e5, -1, -1, 0x00a5, 0x00b5, -1, 01439 /* U+0140 */ -1, 0x00a3, 0x00b3, 0x00d1, 0x00f1, -1, -1, 0x00d2, 0x00f2, -1, -1, -1, -1, -1, -1, -1, 01440 /* U+0150 */ 0x00d5, 0x00f5, -1, -1, 0x00c0, 0x00e0, -1, -1, 0x00d8, 0x00f8, 0x00a6, 0x00b6, -1, -1, 0x00aa, 0x00ba, 01441 /* U+0160 */ 0x00a9, 0x00b9, 0x00de, 0x00fe, 0x00ab, 0x00bb, -1, -1, -1, -1, -1, -1, -1, -1, 0x00d9, 0x00f9, 01442 /* U+0170 */ 0x00db, 0x00fb, -1, -1, -1, -1, -1, -1, -1, 0x00ac, 0x00bc, 0x00af, 0x00bf, 0x00ae, 0x00be, -1 01443 }; 01444 01445 const int TEncoding_ISO8859_2::fromUnicodeTable2[2 * 16] = { 01446 /* U+02c0 */ -1, -1, -1, -1, -1, -1, -1, 0x00b7, -1, -1, -1, -1, -1, -1, -1, -1, 01447 /* U+02d0 */ -1, -1, -1, -1, -1, -1, -1, -1, 0x00a2, 0x00ff, -1, 0x00b2, -1, 0x00bd, -1, -1 01448 }; 01449 01450 //----------------------------------------------------------------------------- 01451 // ISO-8859-3 01452 //----------------------------------------------------------------------------- 01453 01454 const int TEncoding_ISO8859_3::toUnicodeTable[6 * 16] = { 01455 /* 0xa0 */ 0x00a0, 0x0126, 0x02d8, 0x00a3, 0x00a4, -1, 0x0124, 0x00a7, 0x00a8, 0x0130, 0x015e, 0x011e, 0x0134, 0x00ad, -1, 0x017b, 01456 /* 0xb0 */ 0x00b0, 0x0127, 0x00b2, 0x00b3, 0x00b4, 0x00b5, 0x0125, 0x00b7, 0x00b8, 0x0131, 0x015f, 0x011f, 0x0135, 0x00bd, -1, 0x017c, 01457 /* 0xc0 */ 0x00c0, 0x00c1, 0x00c2, -1, 0x00c4, 0x010a, 0x0108, 0x00c7, 0x00c8, 0x00c9, 0x00ca, 0x00cb, 0x00cc, 0x00cd, 0x00ce, 0x00cf, 01458 /* 0xd0 */ -1, 0x00d1, 0x00d2, 0x00d3, 0x00d4, 0x0120, 0x00d6, 0x00d7, 0x011c, 0x00d9, 0x00da, 0x00db, 0x00dc, 0x016c, 0x015c, 0x00df, 01459 /* 0xe0 */ 0x00e0, 0x00e1, 0x00e2, -1, 0x00e4, 0x010b, 0x0109, 0x00e7, 0x00e8, 0x00e9, 0x00ea, 0x00eb, 0x00ec, 0x00ed, 0x00ee, 0x00ef, 01460 /* 0xf0 */ -1, 0x00f1, 0x00f2, 0x00f3, 0x00f4, 0x0121, 0x00f6, 0x00f7, 0x011d, 0x00f9, 0x00fa, 0x00fb, 0x00fc, 0x016d, 0x015d, 0x02d9 01461 }; 01462 01463 const int TEncoding_ISO8859_3::fromUnicodeTable1[14 * 16] = { 01464 /* U+00a0 */ 0x00a0, -1, -1, 0x00a3, 0x00a4, -1, -1, 0x00a7, 0x00a8, -1, -1, -1, -1, 0x00ad, -1, -1, 01465 /* U+00b0 */ 0x00b0, -1, 0x00b2, 0x00b3, 0x00b4, 0x00b5, -1, 0x00b7, 0x00b8, -1, -1, -1, -1, 0x00bd, -1, -1, 01466 /* U+00c0 */ 0x00c0, 0x00c1, 0x00c2, -1, 0x00c4, -1, -1, 0x00c7, 0x00c8, 0x00c9, 0x00ca, 0x00cb, 0x00cc, 0x00cd, 0x00ce, 0x00cf, 01467 /* U+00d0 */ -1, 0x00d1, 0x00d2, 0x00d3, 0x00d4, -1, 0x00d6, 0x00d7, -1, 0x00d9, 0x00da, 0x00db, 0x00dc, -1, -1, 0x00df, 01468 /* U+00e0 */ 0x00e0, 0x00e1, 0x00e2, -1, 0x00e4, -1, -1, 0x00e7, 0x00e8, 0x00e9, 0x00ea, 0x00eb, 0x00ec, 0x00ed, 0x00ee, 0x00ef, 01469 /* U+00f0 */ -1, 0x00f1, 0x00f2, 0x00f3, 0x00f4, -1, 0x00f6, 0x00f7, -1, 0x00f9, 0x00fa, 0x00fb, 0x00fc, -1, -1, -1, 01470 /* U+0100 */ -1, -1, -1, -1, -1, -1, -1, -1, 0x00c6, 0x00e6, 0x00c5, 0x00e5, -1, -1, -1, -1, 01471 /* U+0110 */ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0x00d8, 0x00f8, 0x00ab, 0x00bb, 01472 /* U+0120 */ 0x00d5, 0x00f5, -1, -1, 0x00a6, 0x00b6, 0x00a1, 0x00b1, -1, -1, -1, -1, -1, -1, -1, -1, 01473 /* U+0130 */ 0x00a9, 0x00b9, -1, -1, 0x00ac, 0x00bc, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 01474 /* U+0140 */ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 /* blank */, 01475 /* U+0150 */ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0x00de, 0x00fe, 0x00aa, 0x00ba, 01476 /* U+0160 */ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0x00dd, 0x00fd, -1, -1, 01477 /* U+0170 */ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0x00af, 0x00bf, -1, -1, -1, 01478 }; 01479 const int TEncoding_ISO8859_3::fromUnicodeTable2[2] = { 01480 /* U+02d8 */ 0x00a2, 0x00ff 01481 }; 01482 01483 //----------------------------------------------------------------------------- 01484 // ISO-8859-4 01485 //----------------------------------------------------------------------------- 01486 01487 const int TEncoding_ISO8859_4::toUnicodeTable[6 * 16] = { 01488 /* 0xa0 */ 0x00a0, 0x0104, 0x0138, 0x0156, 0x00a4, 0x0128, 0x013b, 0x00a7, 0x00a8, 0x0160, 0x0112, 0x0122, 0x0166, 0x00ad, 0x017d, 0x00af, 01489 /* 0xb0 */ 0x00b0, 0x0105, 0x02db, 0x0157, 0x00b4, 0x0129, 0x013c, 0x02c7, 0x00b8, 0x0161, 0x0113, 0x0123, 0x0167, 0x014a, 0x017e, 0x014b, 01490 /* 0xc0 */ 0x0100, 0x00c1, 0x00c2, 0x00c3, 0x00c4, 0x00c5, 0x00c6, 0x012e, 0x010c, 0x00c9, 0x0118, 0x00cb, 0x0116, 0x00cd, 0x00ce, 0x012a, 01491 /* 0xd0 */ 0x0110, 0x0145, 0x014c, 0x0136, 0x00d4, 0x00d5, 0x00d6, 0x00d7, 0x00d8, 0x0172, 0x00da, 0x00db, 0x00dc, 0x0168, 0x016a, 0x00df, 01492 /* 0xe0 */ 0x0101, 0x00e1, 0x00e2, 0x00e3, 0x00e4, 0x00e5, 0x00e6, 0x012f, 0x010d, 0x00e9, 0x0119, 0x00eb, 0x0117, 0x00ed, 0x00ee, 0x012b, 01493 /* 0xf0 */ 0x0111, 0x0146, 0x014d, 0x0137, 0x00f4, 0x00f5, 0x00f6, 0x00f7, 0x00f8, 0x0173, 0x00fa, 0x00fb, 0x00fc, 0x0169, 0x016b, 0x02d9 01494 }; 01495 01496 const int TEncoding_ISO8859_4::fromUnicodeTable1[14 * 16] = { 01497 /* U+00a0 */ 0x00a0, -1, -1, -1, 0x00a4, -1, -1, 0x00a7, 0x00a8, -1, -1, -1, -1, 0x00ad, -1, 0x00af, 01498 /* U+00b0 */ 0x00b0, -1, -1, -1, 0x00b4, -1, -1, -1, 0x00b8, -1, -1, -1, -1, -1, -1, -1, 01499 /* U+00c0 */ -1, 0x00c1, 0x00c2, 0x00c3, 0x00c4, 0x00c5, 0x00c6, -1, -1, 0x00c9, -1, 0x00cb, -1, 0x00cd, 0x00ce, -1, 01500 /* U+00d0 */ -1, -1, -1, -1, 0x00d4, 0x00d5, 0x00d6, 0x00d7, 0x00d8, -1, 0x00da, 0x00db, 0x00dc, -1, -1, 0x00df, 01501 /* U+00e0 */ -1, 0x00e1, 0x00e2, 0x00e3, 0x00e4, 0x00e5, 0x00e6, -1, -1, 0x00e9, -1, 0x00eb, -1, 0x00ed, 0x00ee, -1, 01502 /* U+00f0 */ -1, -1, -1, -1, 0x00f4, 0x00f5, 0x00f6, 0x00f7, 0x00f8, -1, 0x00fa, 0x00fb, 0x00fc, -1, -1, -1, 01503 /* U+0100 */ 0x00c0, 0x00e0, -1, -1, 0x00a1, 0x00b1, -1, -1, -1, -1, -1, -1, 0x00c8, 0x00e8, -1, -1, 01504 /* U+0110 */ 0x00d0, 0x00f0, 0x00aa, 0x00ba, -1, -1, 0x00cc, 0x00ec, 0x00ca, 0x00ea, -1, -1, -1, -1, -1, -1, 01505 /* U+0120 */ -1, -1, 0x00ab, 0x00bb, -1, -1, -1, -1, 0x00a5, 0x00b5, 0x00cf, 0x00ef, -1, -1, 0x00c7, 0x00e7, 01506 /* U+0130 */ -1, -1, -1, -1, -1, -1, 0x00d3, 0x00f3, 0x00a2, -1, -1, 0x00a6, 0x00b6, -1, -1, -1, 01507 /* U+0140 */ -1, -1, -1, -1, -1, 0x00d1, 0x00f1, -1, -1, -1, 0x00bd, 0x00bf, 0x00d2, 0x00f2, -1, -1, 01508 /* U+0150 */ -1, -1, -1, -1, -1, -1, 0x00a3, 0x00b3, -1, -1, -1, -1, -1, -1, -1, -1, 01509 /* U+0160 */ 0x00a9, 0x00b9, -1, -1, -1, -1, 0x00ac, 0x00bc, 0x00dd, 0x00fd, 0x00de, 0x00fe, -1, -1, -1, -1, 01510 /* U+0170 */ -1, -1, 0x00d9, 0x00f9, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0x00ae, 0x00be, -1, 01511 }; 01512 01513 const int TEncoding_ISO8859_4::fromUnicodeTable2[2 * 16] = { 01514 /* U+02c0 */ -1, -1, -1, -1, -1, -1, -1, 0x00b7, -1, -1, -1, -1, -1, -1, -1, -1, 01515 /* U+02d0 */ -1, -1, -1, -1, -1, -1, -1, -1, -1, 0x00ff, -1, 0x00b2, -1, -1, -1, -1 01516 }; 01517 01518 //----------------------------------------------------------------------------- 01519 // CP 437 01520 //----------------------------------------------------------------------------- 01521 01522 const int TEncoding_CP437::toUnicodeTable[8 * 16] = { 01523 /* 0x80 */ 0x00c7, 0x00fc, 0x00e9, 0x00e2, 0x00e4, 0x00e0, 0x00e5, 0x00e7, 0x00ea, 0x00eb, 0x00e8, 0x00ef, 0x00ee, 0x00ec, 0x00c4, 0x00c5, 01524 /* 0x90 */ 0x00c9, 0x00e6, 0x00c6, 0x00f4, 0x00f6, 0x00f2, 0x00fb, 0x00f9, 0x00ff, 0x00d6, 0x00dc, 0x00a2, 0x00a3, 0x00a5, 0x20a7, 0x0192, 01525 /* 0xa0 */ 0x00e1, 0x00ed, 0x00f3, 0x00fa, 0x00f1, 0x00d1, 0x00aa, 0x00ba, 0x00bf, 0x2310, 0x00ac, 0x00bd, 0x00bc, 0x00a1, 0x00ab, 0x00bb, 01526 /* 0xb0 */ 0x2591, 0x2592, 0x2593, 0x2502, 0x2524, 0x2561, 0x2562, 0x2556, 0x2555, 0x2563, 0x2551, 0x2557, 0x255d, 0x255c, 0x255b, 0x2510, 01527 /* 0xc0 */ 0x2514, 0x2534, 0x252c, 0x251c, 0x2500, 0x253c, 0x255e, 0x255f, 0x255a, 0x2554, 0x2569, 0x2566, 0x2560, 0x2550, 0x256c, 0x2567, 01528 /* 0xd0 */ 0x2568, 0x2564, 0x2565, 0x2559, 0x2558, 0x2552, 0x2553, 0x256b, 0x256a, 0x2518, 0x250c, 0x2588, 0x2584, 0x258c, 0x2590, 0x2580, 01529 /* 0xe0 */ 0x03b1, 0x00df, 0x0393, 0x03c0, 0x03a3, 0x03c3, 0x00b5, 0x03c4, 0x03a6, 0x0398, 0x03a9, 0x03b4, 0x221e, 0x03c6, 0x03b5, 0x2229, 01530 /* 0xf0 */ 0x2261, 0x00b1, 0x2265, 0x2264, 0x2320, 0x2321, 0x00f7, 0x2248, 0x00b0, 0x2219, 0x00b7, 0x221a, 0x207f, 0x00b2, 0x25a0, 0x00a0 01531 }; 01532 01533 const int TEncoding_CP437::fromUnicodeTable1[6 * 16] = { 01534 /* U+00a0 */ 0x00ff, 0x00ad, 0x009b, 0x009c, -1, 0x009d, -1, -1, -1, -1, 0x00a6, 0x00ae, 0x00aa, -1, -1, -1, 01535 /* U+00b0 */ 0x00f8, 0x00f1, 0x00fd, -1, -1, 0x00e6, -1, 0x00fa, -1, -1, 0x00a7, 0x00af, 0x00ac, 0x00ab, -1, 0x00a8, 01536 /* U+00c0 */ -1, -1, -1, -1, 0x008e, 0x008f, 0x0092, 0x0080, -1, 0x0090, -1, -1, -1, -1, -1, -1, 01537 /* U+00d0 */ -1, 0x00a5, -1, -1, -1, -1, 0x0099, -1, -1, -1, -1, -1, 0x009a, -1, -1, 0x00e1, 01538 /* U+00e0 */ 0x0085, 0x00a0, 0x0083, -1, 0x0084, 0x0086, 0x0091, 0x0087, 0x008a, 0x0082, 0x0088, 0x0089, 0x008d, 0x00a1, 0x008c, 0x008b, 01539 /* U+00f0 */ -1, 0x00a4, 0x0095, 0x00a2, 0x0093, -1, 0x0094, 0x00f6, -1, 0x0097, 0x00a3, 0x0096, 0x0081, -1, -1, 0x0098, 01540 }; 01541 01542 const int TEncoding_CP437::fromUnicodeTable2[4 * 16] = { 01543 /* U+0390 */ -1, -1, -1, 0x00e2, -1, -1, -1, -1, 0x00e9, -1, -1, -1, -1, -1, -1, -1, 01544 /* U+03a0 */ -1, -1, -1, 0x00e4, -1, -1, 0x00e8, -1, -1, 0x00ea, -1, -1, -1, -1, -1, -1, 01545 /* U+03b0 */ -1, 0x00e0, -1, -1, 0x00eb, 0x00ee, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 01546 /* U+03c0 */ 0x00e3, -1, -1, 0x00e5, 0x00e7, -1, 0x00ed, -1, -1, -1, -1, -1, -1, -1, -1, -1, 01547 }; 01548 01549 const int TEncoding_CP437::fromUnicodeTable3[6 * 16] = { 01550 /* U+2210 */ -1, -1, -1, -1, -1, -1, -1, -1, -1, 0x00f9, 0x00fb, -1, -1, -1, 0x00ec, -1, 01551 /* U+2220 */ -1, -1, -1, -1, -1, -1, -1, -1, -1, 0x00ef, -1, -1, -1, -1, -1, -1, 01552 /* U+2230 */ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 /* blank */, 01553 /* U+2240 */ -1, -1, -1, -1, -1, -1, -1, -1, 0x00f7, -1, -1, -1, -1, -1, -1, -1, 01554 /* U+2250 */ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 /* blank */, 01555 /* U+2260 */ -1, 0x00f0, -1, -1, 0x00f3, 0x00f2, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 01556 }; 01557 01558 const int TEncoding_CP437::fromUnicodeTable4[11 * 16] = { 01559 /* U+2500 */ 0x00c4, -1, 0x00b3, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0x00da, -1, -1, -1, 01560 /* U+2510 */ 0x00bf, -1, -1, -1, 0x00c0, -1, -1, -1, 0x00d9, -1, -1, -1, 0x00c3, -1, -1, -1, 01561 /* U+2520 */ -1, -1, -1, -1, 0x00b4, -1, -1, -1, -1, -1, -1, -1, 0x00c2, -1, -1, -1, 01562 /* U+2530 */ -1, -1, -1, -1, 0x00c1, -1, -1, -1, -1, -1, -1, -1, 0x00c5, -1, -1, -1, 01563 /* U+2540 */ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 /* blank */, 01564 /* U+2550 */ 0x00cd, 0x00ba, 0x00d5, 0x00d6, 0x00c9, 0x00b8, 0x00b7, 0x00bb, 0x00d4, 0x00d3, 0x00c8, 0x00be, 0x00bd, 0x00bc, 0x00c6, 0x00c7, 01565 /* U+2560 */ 0x00cc, 0x00b5, 0x00b6, 0x00b9, 0x00d1, 0x00d2, 0x00cb, 0x00cf, 0x00d0, 0x00ca, 0x00d8, 0x00d7, 0x00ce, -1, -1, -1, 01566 /* U+2570 */ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 /* blank */, 01567 /* U+2580 */ 0x00df, -1, -1, -1, 0x00dc, -1, -1, -1, 0x00db, -1, -1, -1, 0x00dd, -1, -1, -1, 01568 /* U+2590 */ 0x00de, 0x00b0, 0x00b1, 0x00b2, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 01569 /* U+25a0 */ 0x00fe, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 01570 }; 01571 // /* U+0190 */ -1, -1, 0x009f, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 01572 // /* U+2070 */ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0x00fc, 01573 // /* U+20a0 */ -1, -1, -1, -1, -1, -1, -1, 0x009e, -1, -1, -1, -1, -1, -1, -1, -1, 01574 // /* U+2310 */ 0x00a9, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 01575 // /* U+2320 */ 0x00f4, 0x00f5, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 01576 01577 //----------------------------------------------------------------------------- 01578 // CP 852 01579 //----------------------------------------------------------------------------- 01580 01581 const int TEncoding_CP852::toUnicodeTable[8 * 16] = { 01582 /* 0x80 */ 0x00c7, 0x00fc, 0x00e9, 0x00e2, 0x00e4, 0x016f, 0x0107, 0x00e7, 0x0142, 0x00eb, 0x0150, 0x0151, 0x00ee, 0x0179, 0x00c4, 0x0106, 01583 /* 0x90 */ 0x00c9, 0x0139, 0x013a, 0x00f4, 0x00f6, 0x013d, 0x013e, 0x015a, 0x015b, 0x00d6, 0x00dc, 0x0164, 0x0165, 0x0141, 0x00d7, 0x010d, 01584 /* 0xa0 */ 0x00e1, 0x00ed, 0x00f3, 0x00fa, 0x0104, 0x0105, 0x017d, 0x017e, 0x0118, 0x0119, 0x00ac, 0x017a, 0x010c, 0x015f, 0x00ab, 0x00bb, 01585 /* 0xb0 */ 0x2591, 0x2592, 0x2593, 0x2502, 0x2524, 0x00c1, 0x00c2, 0x011a, 0x015e, 0x2563, 0x2551, 0x2557, 0x255d, 0x017b, 0x017c, 0x2510, 01586 /* 0xc0 */ 0x2514, 0x2534, 0x252c, 0x251c, 0x2500, 0x253c, 0x0102, 0x0103, 0x255a, 0x2554, 0x2569, 0x2566, 0x2560, 0x2550, 0x256c, 0x00a4, 01587 /* 0xd0 */ 0x0111, 0x0110, 0x010e, 0x00cb, 0x010f, 0x0147, 0x00cd, 0x00ce, 0x011b, 0x2518, 0x250c, 0x2588, 0x2584, 0x0162, 0x016e, 0x2580, 01588 /* 0xe0 */ 0x00d3, 0x00df, 0x00d4, 0x0143, 0x0144, 0x0148, 0x0160, 0x0161, 0x0154, 0x00da, 0x0155, 0x0170, 0x00fd, 0x00dd, 0x0163, 0x00b4, 01589 /* 0xf0 */ 0x00ad, 0x02dd, 0x02db, 0x02c7, 0x02d8, 0x00a7, 0x00f7, 0x00b8, 0x00b0, 0x00a8, 0x02d9, 0x0171, 0x0158, 0x0159, 0x25a0, 0x00a0 01590 }; 01591 01592 const int TEncoding_CP852::fromUnicodeTable1[14 * 16] = { 01593 /* U+00a0 */ 0x00ff, -1, -1, -1, 0x00cf, -1, -1, 0x00f5, 0x00f9, -1, -1, 0x00ae, 0x00aa, 0x00f0, -1, -1, 01594 /* U+00b0 */ 0x00f8, -1, -1, -1, 0x00ef, -1, -1, -1, 0x00f7, -1, -1, 0x00af, -1, -1, -1, -1, 01595 /* U+00c0 */ -1, 0x00b5, 0x00b6, -1, 0x008e, -1, -1, 0x0080, -1, 0x0090, -1, 0x00d3, -1, 0x00d6, 0x00d7, -1, 01596 /* U+00d0 */ -1, -1, -1, 0x00e0, 0x00e2, -1, 0x0099, 0x009e, -1, -1, 0x00e9, -1, 0x009a, 0x00ed, -1, 0x00e1, 01597 /* U+00e0 */ -1, 0x00a0, 0x0083, -1, 0x0084, -1, -1, 0x0087, -1, 0x0082, -1, 0x0089, -1, 0x00a1, 0x008c, -1, 01598 /* U+00f0 */ -1, -1, -1, 0x00a2, 0x0093, -1, 0x0094, 0x00f6, -1, -1, 0x00a3, -1, 0x0081, 0x00ec, -1, -1, 01599 /* U+0100 */ -1, -1, 0x00c6, 0x00c7, 0x00a4, 0x00a5, 0x008f, 0x0086, -1, -1, -1, -1, 0x00ac, 0x009f, 0x00d2, 0x00d4, 01600 /* U+0110 */ 0x00d1, 0x00d0, -1, -1, -1, -1, -1, -1, 0x00a8, 0x00a9, 0x00b7, 0x00d8, -1, -1, -1, -1, 01601 /* U+0120 */ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 /* blank */, 01602 /* U+0130 */ -1, -1, -1, -1, -1, -1, -1, -1, -1, 0x0091, 0x0092, -1, -1, 0x0095, 0x0096, -1, 01603 /* U+0140 */ -1, 0x009d, 0x0088, 0x00e3, 0x00e4, -1, -1, 0x00d5, 0x00e5, -1, -1, -1, -1, -1, -1, -1, 01604 /* U+0150 */ 0x008a, 0x008b, -1, -1, 0x00e8, 0x00ea, -1, -1, 0x00fc, 0x00fd, 0x0097, 0x0098, -1, -1, 0x00b8, 0x00ad, 01605 /* U+0160 */ 0x00e6, 0x00e7, 0x00dd, 0x00ee, 0x009b, 0x009c, -1, -1, -1, -1, -1, -1, -1, -1, 0x00de, 0x0085, 01606 /* U+0170 */ 0x00eb, 0x00fb, -1, -1, -1, -1, -1, -1, -1, 0x008d, 0x00ab, 0x00bd, 0x00be, 0x00a6, 0x00a7, -1 01607 }; 01608 01609 const int TEncoding_CP852::fromUnicodeTable2[2* 16] = { 01610 /* U+02c0 */ -1, -1, -1, -1, -1, -1, -1, 0x00f3, -1, -1, -1, -1, -1, -1, -1, -1, 01611 /* U+02d0 */ -1, -1, -1, -1, -1, -1, -1, -1, 0x00f4, 0x00fa, -1, 0x00f2, -1, 0x00f1, -1, -1 01612 }; 01613 01614 const int TEncoding_CP852::fromUnicodeTable3[11 * 16] = { 01615 /* U+2500 */ 0x00c4, -1, 0x00b3, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0x00da, -1, -1, -1, 01616 /* U+2510 */ 0x00bf, -1, -1, -1, 0x00c0, -1, -1, -1, 0x00d9, -1, -1, -1, 0x00c3, -1, -1, -1, 01617 /* U+2520 */ -1, -1, -1, -1, 0x00b4, -1, -1, -1, -1, -1, -1, -1, 0x00c2, -1, -1, -1, 01618 /* U+2530 */ -1, -1, -1, -1, 0x00c1, -1, -1, -1, -1, -1, -1, -1, 0x00c5, -1, -1, -1, 01619 /* U+2540 */ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 /* blank */, 01620 /* U+2550 */ 0x00cd, 0x00ba, -1, -1, 0x00c9, -1, -1, 0x00bb, -1, -1, 0x00c8, -1, -1, 0x00bc, -1, -1, 01621 /* U+2560 */ 0x00cc, -1, -1, 0x00b9, -1, -1, 0x00cb, -1, -1, 0x00ca, -1, -1, 0x00ce, -1, -1, -1, 01622 /* U+2570 */ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 /* blank */, 01623 /* U+2580 */ 0x00df, -1, -1, -1, 0x00dc, -1, -1, -1, 0x00db, -1, -1, -1, -1, -1, -1, -1, 01624 /* U+2590 */ -1, 0x00b0, 0x00b1, 0x00b2, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 01625 /* U+25a0 */ 0x00fe, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 01626 }; 01627 01628 //----------------------------------------------------------------------------- 01629 // Windows-1250 01630 //----------------------------------------------------------------------------- 01631 01632 const int TEncoding_CP1250::toUnicodeTable[8 * 16] = { 01633 /* 0x80 */ 0x20ac, -1, 0x201a, -1, 0x201e, 0x2026, 0x2020, 0x2021, -1, 0x2030, 0x0160, 0x2039, 0x015a, 0x0164, 0x017d, 0x0179, 01634 /* 0x90 */ -1, 0x2018, 0x2019, 0x201c, 0x201d, 0x2022, 0x2013, 0x2014, -1, 0x2122, 0x0161, 0x203a, 0x015b, 0x0165, 0x017e, 0x017a, 01635 /* 0xa0 */ 0x00a0, 0x02c7, 0x02d8, 0x0141, 0x00a4, 0x0104, 0x00a6, 0x00a7, 0x00a8, 0x00a9, 0x015e, 0x00ab, 0x00ac, 0x00ad, 0x00ae, 0x017b, 01636 /* 0xb0 */ 0x00b0, 0x00b1, 0x02db, 0x0142, 0x00b4, 0x00b5, 0x00b6, 0x00b7, 0x00b8, 0x0105, 0x015f, 0x00bb, 0x013d, 0x02dd, 0x013e, 0x017c, 01637 /* 0xc0 */ 0x0154, 0x00c1, 0x00c2, 0x0102, 0x00c4, 0x0139, 0x0106, 0x00c7, 0x010c, 0x00c9, 0x0118, 0x00cb, 0x011a, 0x00cd, 0x00ce, 0x010e, 01638 /* 0xd0 */ 0x0110, 0x0143, 0x0147, 0x00d3, 0x00d4, 0x0150, 0x00d6, 0x00d7, 0x0158, 0x016e, 0x00da, 0x0170, 0x00dc, 0x00dd, 0x0162, 0x00df, 01639 /* 0xe0 */ 0x0155, 0x00e1, 0x00e2, 0x0103, 0x00e4, 0x013a, 0x0107, 0x00e7, 0x010d, 0x00e9, 0x0119, 0x00eb, 0x011b, 0x00ed, 0x00ee, 0x010f, 01640 /* 0xf0 */ 0x0111, 0x0144, 0x0148, 0x00f3, 0x00f4, 0x0151, 0x00f6, 0x00f7, 0x0159, 0x016f, 0x00fa, 0x0171, 0x00fc, 0x00fd, 0x0163, 0x02d9 01641 }; 01642 01643 const int TEncoding_CP1250::fromUnicodeTable1[14 * 16] = { 01644 /* U+00a0 */ 0x00a0, -1, -1, -1, 0x00a4, -1, 0x00a6, 0x00a7, 0x00a8, 0x00a9, -1, 0x00ab, 0x00ac, 0x00ad, 0x00ae, -1, 01645 /* U+00b0 */ 0x00b0, 0x00b1, -1, -1, 0x00b4, 0x00b5, 0x00b6, 0x00b7, 0x00b8, -1, -1, 0x00bb, -1, -1, -1, -1, 01646 /* U+00c0 */ -1, 0x00c1, 0x00c2, -1, 0x00c4, -1, -1, 0x00c7, -1, 0x00c9, -1, 0x00cb, -1, 0x00cd, 0x00ce, -1, 01647 /* U+00d0 */ -1, -1, -1, 0x00d3, 0x00d4, -1, 0x00d6, 0x00d7, -1, -1, 0x00da, -1, 0x00dc, 0x00dd, -1, 0x00df, 01648 /* U+00e0 */ -1, 0x00e1, 0x00e2, -1, 0x00e4, -1, -1, 0x00e7, -1, 0x00e9, -1, 0x00eb, -1, 0x00ed, 0x00ee, -1, 01649 /* U+00f0 */ -1, -1, -1, 0x00f3, 0x00f4, -1, 0x00f6, 0x00f7, -1, -1, 0x00fa, -1, 0x00fc, 0x00fd, -1, -1, 01650 /* U+0100 */ -1, -1, 0x00c3, 0x00e3, 0x00a5, 0x00b9, 0x00c6, 0x00e6, -1, -1, -1, -1, 0x00c8, 0x00e8, 0x00cf, 0x00ef, 01651 /* U+0110 */ 0x00d0, 0x00f0, -1, -1, -1, -1, -1, -1, 0x00ca, 0x00ea, 0x00cc, 0x00ec, -1, -1, -1, -1, 01652 /* U+0120 */ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 /* blank */, 01653 /* U+0130 */ -1, -1, -1, -1, -1, -1, -1, -1, -1, 0x00c5, 0x00e5, -1, -1, 0x00bc, 0x00be, -1, 01654 /* U+0140 */ -1, 0x00a3, 0x00b3, 0x00d1, 0x00f1, -1, -1, 0x00d2, 0x00f2, -1, -1, -1, -1, -1, -1, -1, 01655 /* U+0150 */ 0x00d5, 0x00f5, -1, -1, 0x00c0, 0x00e0, -1, -1, 0x00d8, 0x00f8, 0x008c, 0x009c, -1, -1, 0x00aa, 0x00ba, 01656 /* U+0160 */ 0x008a, 0x009a, 0x00de, 0x00fe, 0x008d, 0x009d, -1, -1, -1, -1, -1, -1, -1, -1, 0x00d9, 0x00f9, 01657 /* U+0170 */ 0x00db, 0x00fb, -1, -1, -1, -1, -1, -1, -1, 0x008f, 0x009f, 0x00af, 0x00bf, 0x008e, 0x009e, -1, 01658 }; 01659 01660 const int TEncoding_CP1250::fromUnicodeTable2[2 * 16] = { 01661 /* U+02c0 */ -1, -1, -1, -1, -1, -1, -1, 0x00a1, -1, -1, -1, -1, -1, -1, -1, -1, 01662 /* U+02d0 */ -1, -1, -1, -1, -1, -1, -1, -1, 0x00a2, 0x00ff, -1, 0x00b2, -1, 0x00bd, -1, -1, 01663 }; 01664 01665 const int TEncoding_CP1250::fromUnicodeTable3[3 * 16] = { 01666 /* U+2010 */ -1, -1, -1, 0x0096, 0x0097, -1, -1, -1, 0x0091, 0x0092, 0x0082, -1, 0x0093, 0x0094, 0x0084, -1, 01667 /* U+2020 */ 0x0086, 0x0087, 0x0095, -1, -1, -1, 0x0085, -1, -1, -1, -1, -1, -1, -1, -1, -1, 01668 /* U+2030 */ 0x0089, -1, -1, -1, -1, -1, -1, -1, -1, 0x008b, 0x009b, -1, -1, -1, -1, -1, 01669 }; 01670 // /* U+20a0 */ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0x0080, -1, -1, -1, 01671 // /* U+2120 */ -1, -1, 0x0099, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 01672 01673 //----------------------------------------------------------------------------- 01674 // YU-ASCII 01675 //----------------------------------------------------------------------------- 01676 01677 // C acute c acute C caron c caron S caron s caron Z caron z caron D stroke d stroke 01678 const int TEncoding_YuAscii::uniChars[10] = { 0x106, 0x107, 0x10c, 0x10d, 0x160, 0x161, 0x17d, 0x17e, 0x110, 0x111 }; 01679 const int TEncoding_YuAscii::yuAsciiChars[10] = { 0x5d, 0x7d, 0x5e, 0x7e, 0x5b, 0x7b, 0x40, 0x60, 0x5c, 0x7c }; 01680 // ']' '}' '^' '~' '[' '{' '@' '`' '\\' '|' 01681 01682 01683 //----------------------------------------------------------------------------- 01684 // TUnicode - codec registry 01685 //----------------------------------------------------------------------------- 01686 01687 void TUnicode::InitCodecs() 01688 { 01689 ClrCodecs(); 01690 RegisterCodec("ISO-8859-1 ISO_8859-1 ISO_8859-1:1987 ISO-IR-100 CP819 IBM819 LATIN1 L1 csISOLatin1 ISO8859-1 ISO8859_1 CP28591", TCodecBase::New<TCodec_ISO8859_1>()); 01691 RegisterCodec("ISO-8859-2 ISO_8859-2 ISO_8859-2:1987 ISO-IR-101 LATIN2 L2 csISOLatin2 ISO8859-2 ISO8859_2 CP28592", TCodecBase::New<TCodec_ISO8859_2>()); 01692 RegisterCodec("ISO-8859-3 ISO_8859-3 ISO_8859-3:1988 ISO-IR-109 LATIN3 L3 csISOLatin3 ISO8859-3 ISO8859_3 CP28593", TCodecBase::New<TCodec_ISO8859_3>()); 01693 RegisterCodec("ISO-8859-4 ISO_8859-4 ISO_8859-4:1988 ISO-IR-110 LATIN4 L4 csISOLatin4 ISO8859-4 ISO8859_4 CP28594", TCodecBase::New<TCodec_ISO8859_4>()); 01694 RegisterCodec("YUASCII YU-ASCII YU_ASCII", TCodecBase::New<TCodec_YuAscii>()); 01695 RegisterCodec("CP1250 Windows-1250 MS-EE", TCodecBase::New<TCodec_CP1250>()); 01696 RegisterCodec("CP852 cp852_DOSLatin2 DOSLatin2", TCodecBase::New<TCodec_CP852>()); 01697 RegisterCodec("CP437 cp437_DOSLatinUS DOSLatinUS", TCodecBase::New<TCodec_CP437>()); 01698 } 01699 01700 void TUnicode::EncodeUtf8(const uint& c, TChA& dest) { 01701 if (c > 0x10ffff) { 01702 throw TExcept::New(TStr::Fmt("Unkown Unicode character %u", c)); } 01703 if (c < 0x80u) 01704 dest.AddCh(char(c & 0xffu)); 01705 else if (c < 0x800u) { 01706 dest.AddCh(char(TUniCodec::_1100_0000 | ((c >> 6) & TUniCodec::_0001_1111))); 01707 dest.AddCh(char(TUniCodec::_1000_0000 | (c & TUniCodec::_0011_1111))); } 01708 else if (c < 0x10000u) { 01709 dest.AddCh(char(TUniCodec::_1110_0000 | ((c >> 12) & TUniCodec::_0000_1111))); 01710 dest.AddCh(char(TUniCodec::_1000_0000 | ((c >> 6) & TUniCodec::_0011_1111))); 01711 dest.AddCh(char(TUniCodec::_1000_0000 | (c & TUniCodec::_0011_1111))); } 01712 else if (c < 0x200000u) { 01713 dest.AddCh(char(TUniCodec::_1111_0000 | ((c >> 18) & TUniCodec::_0000_0111))); 01714 dest.AddCh(char(TUniCodec::_1000_0000 | ((c >> 12) & TUniCodec::_0011_1111))); 01715 dest.AddCh(char(TUniCodec::_1000_0000 | ((c >> 6) & TUniCodec::_0011_1111))); 01716 dest.AddCh(char(TUniCodec::_1000_0000 | (c & TUniCodec::_0011_1111))); } 01717 else if (c < 0x4000000u) { 01718 dest.AddCh(char(TUniCodec::_1111_1000 | ((c >> 24) & TUniCodec::_0000_0011))); 01719 dest.AddCh(char(TUniCodec::_1000_0000 | ((c >> 18) & TUniCodec::_0011_1111))); 01720 dest.AddCh(char(TUniCodec::_1000_0000 | ((c >> 12) & TUniCodec::_0011_1111))); 01721 dest.AddCh(char(TUniCodec::_1000_0000 | ((c >> 6) & TUniCodec::_0011_1111))); 01722 dest.AddCh(char(TUniCodec::_1000_0000 | (c & TUniCodec::_0011_1111))); } 01723 else { 01724 dest.AddCh(char(TUniCodec::_1111_1100 | ((c >> 30) & TUniCodec::_0000_0011))); 01725 dest.AddCh(char(TUniCodec::_1000_0000 | ((c >> 24) & TUniCodec::_0011_1111))); 01726 dest.AddCh(char(TUniCodec::_1000_0000 | ((c >> 18) & TUniCodec::_0011_1111))); 01727 dest.AddCh(char(TUniCodec::_1000_0000 | ((c >> 12) & TUniCodec::_0011_1111))); 01728 dest.AddCh(char(TUniCodec::_1000_0000 | ((c >> 6) & TUniCodec::_0011_1111))); 01729 dest.AddCh(char(TUniCodec::_1000_0000 | (c & TUniCodec::_0011_1111))); } 01730 } 01731 01732 TStr TUnicode::EncodeUtf8(const uint& Ch) { 01733 TChA ChA; EncodeUtf8(Ch, ChA); return ChA; 01734 }