SNAP Library 2.0, Developer Reference  2013-05-13 16:33:57
SNAP, a general purpose, high performance system for analysis and manipulation of large networks
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Defines
unicode.cpp
Go to the documentation of this file.
00001 // Unicode.cpp : Defines the entry point for the console application.
00002 //
00003 
00005 // Includes
00006 //#include "unicode.h"
00007 
00008 //-----------------------------------------------------------------------------
00009 // Private declarations of this module
00010 //-----------------------------------------------------------------------------
00011 
00012 namespace {
00013 
00014 class TVectorBuilder2
00015 {
00016 public:
00017         TIntV v;
00018         TVectorBuilder2(int i) { v.Add(i); }
00019         operator TIntV() const { return v; }
00020         TVectorBuilder2& operator ,(int i) { v.Add(i); return *this; }
00021 };
00022 
00023 class TVectorBuilder
00024 {
00025 public:
00026         operator TIntV() const { return TIntV(); }
00027         TVectorBuilder2 operator ,(int i) { return TVectorBuilder2(i); }
00028 };
00029 
00030 TVectorBuilder VB;
00031 
00032 TStr CombinePath(const TStr& s, const TStr& t)
00033 {
00034         int n = s.Len(); if (n <= 0) return t;
00035         if (s[n - 1] == '\\' || s[n - 1] == '/' || s[n - 1] == ':') return s + t;
00036         return s + "\\" + t;
00037 }
00038 
00039 void AssertEq(const TIntV& v1, const TIntV& v2, const TStr& explanation, FILE *f)
00040 {
00041         const int n = v1.Len();
00042         bool ok = (n == v2.Len());
00043         if (ok) for (int i = 0; i < n && ok; i++) ok = ok && (v1[i] == v2[i]);
00044         if (! ok)
00045         {
00046                 if (! f) f = stderr;
00047                 fprintf(f, "%s: [", explanation.CStr());
00048                 for (int i = 0; i < v1.Len(); i++) fprintf(f, "%s%04x", (i == 0 ? "" : " "), int(v1[i]));
00049                 fprintf(f, "] != [");
00050                 for (int i = 0; i < v2.Len(); i++) fprintf(f, "%s%04x", (i == 0 ? "" : " "), int(v2[i]));
00051                 fprintf(f, "]\n");
00052                 Fail;
00053         }
00054 }
00055 
00056 };
00057 
00058 //-----------------------------------------------------------------------------
00059 // TUniCodec -- miscellaneous declarations
00060 //-----------------------------------------------------------------------------
00061 
00062 uint TUniCodec::GetRndUint(TRnd& rnd)
00063 {
00064         uint u = rnd.GetUniDevUInt(256) & 0xff;
00065         u <<= 8; u |= (rnd.GetUniDevUInt(256) & 0xff);
00066         u <<= 8; u |= (rnd.GetUniDevUInt(256) & 0xff);
00067         u <<= 8; u |= (rnd.GetUniDevUInt(256) & 0xff);
00068         return u;
00069 }
00070 
00071 uint TUniCodec::GetRndUint(TRnd& rnd, uint minVal, uint maxVal)
00072 {
00073         if (minVal == TUInt::Mn && maxVal == TUInt::Mx) return GetRndUint(rnd);
00074         uint range = maxVal - minVal + 1;
00075         if (range > (uint(1) << (8 * sizeof(uint) - 1)))
00076                 while (true) { uint u = GetRndUint(rnd); if (u < range) return minVal + u; }
00077         uint mask = 1;
00078         while (mask < range) mask <<= 1;
00079         mask -= 1;
00080         while (true) { uint u = GetRndUint(rnd) & mask; if (u < range) return minVal + u; }
00081 }
00082 
00083 bool TUniCodec::IsMachineLittleEndian()
00084 {
00085         static bool isLE, initialized = false;
00086         if (initialized) return isLE;
00087         int i = 0x0201;
00088         char *p = (char *) (&i);
00089         char c1, c2;
00090         memcpy(&c1, p, 1); memcpy(&c2, p + 1, 1);
00091         if (c1 == 1 && c2 == 2) isLE = true;
00092         else if (c1 == 2 && c2 == 1) isLE = false;
00093         else {
00094                 FailR(("TUniCodec::IsMachineLittleEndian: c1 = " + TInt::GetStr(int(uchar(c1)), "%02x") + ", c2 = " + TInt::GetStr(int(uchar(c2)), "%02x") + ".").CStr());
00095                 isLE = true; }
00096         initialized = true; return isLE;
00097 }
00098 
00099 //-----------------------------------------------------------------------------
00100 // TUniCodec -- UTF-8 test driver
00101 //-----------------------------------------------------------------------------
00102 
00103 void TUniCodec::TestUtf8(bool decode, size_t expectedRetVal, bool expectedThrow, const TIntV& src, const TIntV& expectedDest, FILE *f)
00104 {
00105         TIntV dest;
00106         if (f) {
00107                 fprintf(f, "Settings: %s  %s  %s   replacementChar = %x\n",
00108                         (errorHandling == uehAbort ? "abort" : errorHandling == uehThrow ? "throw" : errorHandling == uehIgnore ? "ignore" : errorHandling == uehReplace ? "replace" : "????"),
00109                         (strict ? "STRICT" : ""), (skipBom ? "skipBom" : ""), uint(replacementChar));
00110                 fprintf(f, "src: "); for (int i = 0; i < src.Len(); i++) fprintf(f, (decode ? " %02x" : " %x"), uint(src[i])); }
00111         try
00112         {
00113                 size_t retVal = (decode ? DecodeUtf8(src, 0, src.Len(), dest, true) : EncodeUtf8(src, 0, src.Len(), dest, true));
00114                 if (f) {
00115                         fprintf(f, "\n -> dest:    "); for (int i = 0; i < dest.Len(); i++) fprintf(f, (decode ? " %x" :  " %02x"), uint(dest[i]));
00116                         fprintf(f, "\n    expDest  "); for (int i = 0; i < expectedDest.Len(); i++) fprintf(f, (decode ? " %x" :  " %02x"), uint(expectedDest[i]));
00117                         fprintf(f, "\n    retVal = %llu (expected %llu)\n", static_cast<long long unsigned int> (retVal), static_cast<long long unsigned int> (expectedRetVal)); }
00118                 if (retVal != expectedRetVal)
00119                         printf("!!!");
00120                 IAssert(retVal == expectedRetVal); IAssert(! expectedThrow);
00121                 if (dest.Len() != expectedDest.Len())
00122                         printf("!!!");
00123                 IAssert(dest.Len() == expectedDest.Len());
00124                 for (int i = 0; i < dest.Len(); i++) IAssert(dest[i] == expectedDest[i]);
00125         }
00126         catch (TUnicodeException e)
00127         {
00128                 if (f) {
00129                         fprintf(f, "\n -> expDest  "); for (int i = 0; i < expectedDest.Len(); i++) fprintf(f, " %x", uint(expectedDest[i]));
00130                         fprintf(f, "\n    exception \"%s\" at %d (char 0x%02x)\n", e.message.CStr(), int(e.srcIdx), uint(e.srcChar)); }
00131                 IAssert(expectedThrow);
00132         }
00133 }
00134 
00135 // Generates a random UTF-8-encoded stream according to the specifications in 'testCaseDesc',
00136 // then calls TestUtf8 to make sure that DecodeUtf8 reacts as expected.
      // 'rnd' supplies the random codepoints and filler bytes. The expected output and the
      // expected return value are predicted from the codec's current 'strict', 'skipBom',
      // 'errorHandling' and 'replacementChar' settings.
00137 void TUniCodec::TestDecodeUtf8(TRnd& rnd, const TStr& testCaseDesc)
00138 {
00139         TIntV src; TIntV expectedDest; int expectedRetVal = 0;
              // Becomes true once an error is expected under uehAbort or uehThrow; from then on
              // nothing more is added to 'expectedDest' or counted in 'expectedRetVal'.
00140         bool expectedAbort = false;
00141         FILE *f = 0; // logging is off; point this at stderr to trace the test
00142         // testCaseDesc should consist of pairs or triples of characters, 'cd[e]', where:
00143         // - 'c' defines the range from which the codepoint should be taken ('A'..'H', 'X'..'Z');
00144         // - 'd' defines how many bytes the codepoint should be encoded with ('1'..'6');
00145         // - 'e' defines how many bytes will be removed from the end of the encoded sequence for this codepoint.
00146         //   (absent = 0, 'a' = 1, 'b' = 2 and so on).
00147         for (int i = 0; i < testCaseDesc.Len(); )
00148         {
00149                 IAssert(i + 2 <= testCaseDesc.Len());
00150                 const char c = testCaseDesc[i], d = testCaseDesc[i + 1]; i += 2;
                      // minBytes is the shortest encoding length for the chosen range; asking for
                      // more bytes than that (nBytes > minBytes) produces an overlong encoding.
00151                 uint cp = 0; int nBytes = -1, minBytes = -1; bool eighties = false;
00152                 IAssert('1' <= d && d <= '6'); nBytes = d - '0';
00153                 if (c == 'A') { cp = GetRndUint(rnd, 0u, 0x7fu); minBytes = 1; } // 1 byte
00154                 else if (c == 'B') { cp = GetRndUint(rnd, 0x80u, 0x7ffu); minBytes = 2; } // 2 bytes
00155                 else if (c == 'C') { cp = GetRndUint(rnd, 0x800u, 0xffffu); minBytes = 3; } // 3 bytes
00156                 else if (c == 'D') { cp = GetRndUint(rnd, 0x10000u, 0x10ffffu); minBytes = 4; } // 4 bytes, valid Unicode
00157                 else if (c == 'E') { cp = GetRndUint(rnd, 0x110000u, 0x1fffffu); minBytes = 4; } // 4 bytes, invalid Unicode
00158                 else if (c == 'F') { cp = GetRndUint(rnd, 0x200000u, 0x3ffffffu); minBytes = 5; } // 5 bytes
00159                 else if (c == 'G') { cp = GetRndUint(rnd, 0x4000000u, 0x7fffffffu); minBytes = 6; } // 6 bytes, 31 bits
00160                 else if (c == 'H') { cp = GetRndUint(rnd, 0x80000000u, 0xffffffffu); minBytes = 6; } // 6 bytes, 32 bits
00161                 else if (c == 'X') { cp = 0xfffe; minBytes = 3; }
00162                 else if (c == 'Y') { cp = 0xfeff; minBytes = 3; }
00163                 else if (c == 'Z') { eighties = true; minBytes = 1; } // insert several random 10xxxxxx bytes (= 0x80 | random(0..0x3f))
00164                 else Fail;
00165                 IAssert(nBytes >= minBytes);
00166                 // Process 'e'.
00167                 int nToDel = 0;
00168                 if (i < testCaseDesc.Len()) {
00169                         const char e = testCaseDesc[i];
00170                         if (e >= 'a' && e <= 'e') { i += 1; nToDel = e - 'a' + 1; }}
                      // The lead byte is always kept, so fewer than nBytes bytes may be removed.
00171                 IAssert(nToDel < nBytes);
00172                 // Will an error occur during the decoding of this codepoint?
00173                 bool errHere = false;
00174                 if (eighties) errHere = true;
00175                 else if (nToDel > 0) errHere = true;
                      // NOTE(review): 'cp >= 0x10ffff' also flags the value 0x10ffff itself, whereas the
                      // systematic loop in TestUtf8() uses 'u > 0x10ffff' -- confirm which one matches
                      // what DecodeUtf8 does in strict mode.
00176                 else if (strict && (cp >= 0x10ffff || nBytes > minBytes)) errHere = true;
00177                 // Update 'expectedDest' and 'expectedRetVal'.
00178                 if (! expectedAbort) {
00179                         if (! errHere) {
                                      // 0xfffe or 0xfeff at the very start of the stream is expected to be
                                      // dropped when skipBom is set.
00180                                 if (src.Len() == 0 && (cp == 0xfffe || cp == 0xfeff) && skipBom) { }
00181                                 else { expectedDest.Add(cp); expectedRetVal += 1; } }
00182                         else if (errorHandling == uehReplace) {
                                      // In replace mode each of the nBytes stray bytes is expected to yield its
                                      // own replacement character; any other bad sequence yields just one.
                                      // Replacement characters are not counted in 'expectedRetVal'.
00183                                 if (eighties) for (int j = 0; j < nBytes; j++) expectedDest.Add(replacementChar);
00184                                 else expectedDest.Add(replacementChar); }
00185                         if (errHere && (errorHandling == uehAbort || errorHandling == uehThrow)) expectedAbort = true; }
00186                 // Update 'src'.
                      // NOTE(review): the stray bytes are drawn from 0x80..0xff, which is wider than the
                      // 10xxxxxx range mentioned in the comment where 'Z' is parsed -- confirm the intent.
00187                 if (eighties) for (int j = 0; j < nBytes; j++) src.Add(GetRndUint(rnd, 0x80, 0xff));
00188                 else if (nBytes == 1) src.Add(cp);
00189                 else {
                              // Lead byte: the top nBytes bits are set, followed by the highest bits of cp.
00190                         int mask = (1 << nBytes) - 1; mask <<= (8 - nBytes);
00191                         src.Add(mask | (uint(cp) >> (6 * (nBytes - 1))));
                              // Continuation bytes, six bits of cp each; the last nToDel of them are left out.
                              // (_0011_1111 is presumably the constant 0x3f -- it is defined outside this function.)
00192                         for (int j = 1; j < nBytes - nToDel; j++) src.Add(0x80 | ((cp >> (6 * (nBytes - j - 1))) & _0011_1111)); }
00193         }
00194         if (f) fprintf(f, "Test case: \"%s\"\n", testCaseDesc.CStr());
              // An exception is expected only if an error was predicted and the codec is in uehThrow mode.
00195         TestUtf8(true, expectedRetVal, expectedAbort && (errorHandling == uehThrow), src, expectedDest, f);
00196 }
00197 
      // Self-test of the UTF-8 decoder and encoder. For every combination of skipBom, strict
      // and the four error-handling modes it first decodes a set of randomly generated
      // sequences (via TestDecodeUtf8) and then decodes and encodes codepoints close to
      // powers of two. The codec's 'strict', 'errorHandling' and 'skipBom' members are
      // overwritten by this function and are not restored afterwards.
00198 void TUniCodec::TestUtf8()
00199 {
              // UTF-8 encoding of replacementChar, computed once with the settings in effect on
              // entry; it is the expected encoder output for invalid codepoints in uehReplace mode.
              // The one-element input vector is built with the comma operator of TVectorBuilder.
00200         TIntV utf8ReplCh; EncodeUtf8((TVectorBuilder(), replacementChar).v, 0, 1, utf8ReplCh, true);
00201         for (int skipBom_ = 0; skipBom_ < 2; skipBom_++)
00202         for (int strict_ = 0; strict_ < 2; strict_++)
00203         for (int errMode_ = 0; errMode_ < 4; errMode_++)
00204         {
00205                 strict = (strict_ == 1); errorHandling = TUnicodeErrorHandling(errMode_); skipBom = (skipBom_ == 1);
                      // The generator is re-created with the same seed for every combination of settings.
00206                 TRnd rnd = TRnd(123);
00207                 // Test DecodeUtf8 on various random UTF-8-encoded sequences.
00208                 for (int i = 0; i < 10; i++)
00209                 {
00210                         TestDecodeUtf8(rnd, "X3A1A2A3A4A5A6B2B3B4B5B6C3C4C5C6D4D5D6E5E6F6G6");
00211                         TestDecodeUtf8(rnd, "X3A5dA6d");
00212                         TestDecodeUtf8(rnd, "X3A1B2C3D4E4F5A1G6H6Y3X3A1");
00213                         TestDecodeUtf8(rnd, "X3A1B2C3D4E4F5A2G6H6Y3X3A1");
00214                         TestDecodeUtf8(rnd, "Y3A1B2C3D4E4F5A1G6H6Y3X3A1");
00215                         TestDecodeUtf8(rnd, "A1B2C3D4E4F5A1G6H6Y3X3A1");
00216                         TestDecodeUtf8(rnd, "G6A1A1D4E4A1B2");
00217                         TestDecodeUtf8(rnd, "D4A1A1C3A1B2A1B2");
00218                         TestDecodeUtf8(rnd, "D4A1A1C3A1B2A1B2D4a");
00219                         TestDecodeUtf8(rnd, "X3A1B2C3D5E4F5A1G6H6Y3X3A1");
00220                         TestDecodeUtf8(rnd, "X3A1B2C3D4E5F5A1G6H6Y3X3A1");
00221                         TestDecodeUtf8(rnd, "X3A1B2C3D4aE4F5A1G6H6Y3X3A1");
00222                         TestDecodeUtf8(rnd, "X3A1B2C3D4bE4F5A1G6H6Y3X3A1");
00223                         TestDecodeUtf8(rnd, "X3A2aA3aA4aA5aA6aB2aB3aB4aB5aB6aC3aC4aC5aC6aD4aD5aD6aE5aE6aF6aG6a");
00224                         TestDecodeUtf8(rnd, "X3A3bA4bA5bA6aB3bB4bB5bB6bC3bC4bC5bC6bD4bD5bD6bE5bE6bF6bG6b");
00225                         TestDecodeUtf8(rnd, "X3A4cA5cA6cB4cB5cB6cC4cC5cC6cD4cD5cD6cE5cE6cF6cG6c");
00226                         TestDecodeUtf8(rnd, "X3A5dA6dB5dB6dC5dC6dD5dD6dE5dE6dF6dG6d");
00227                         TestDecodeUtf8(rnd, "X3A6eB6eC6eD6eE6eF6eG6e");
00228                 }
00229                 // Test both DecodeUtf8 and EncodeUtf8 systematically on various characters
00230                 // close to powers of 2.
                      // 'src' holds the UTF-8 bytes of the codepoint under test, 'src2' the codepoint itself
                      // (as a one-element vector) and 'expectedDest' the expected decoder output.
00231                 TIntV src, expectedDest, src2;
00232                 expectedDest.Gen(1); src.Reserve(6); src2.Gen(1);
00233                 for (int pow = 8; pow <= 32; pow++)
00234                 {
                              // The tested range is 2^pow - 256 .. 2^pow + 256, except that the first range
                              // starts at 0 and the last one ends at the largest uint.
00235                         uint uFrom, uTo;
00236                         if (pow == 8) uFrom = 0, uTo = 1u << pow;
00237                         else if (pow == 32) uFrom = TUInt::Mx - (1u << 8), uTo = TUInt::Mx;
00238                         else uFrom = (1u << pow) - (1u << 8), uTo = (1u << pow) + (1u << 8);
                              // Progress indicator; '\r' makes successive ranges overwrite each other on the console.
00239                         printf("%u..%u          \r", uFrom, uTo);
                              // The loop has no condition and ends with the 'u == uTo' check at the bottom, so it
                              // also terminates when uTo is the largest uint (where 'u <= uTo' would always hold).
00240                         for (uint u = uFrom; ; u++)
00241                         {
                                      // Number of bytes needed to encode 'u' in (extended, up to 6-byte) UTF-8.
00242                                 int nBytes = 0;
00243                                 if (u < (1u << 7)) nBytes = 1;
00244                                 else if (u < (1u << 11)) nBytes = 2;
00245                                 else if (u < (1u << 16)) nBytes = 3;
00246                                 else if (u < (1u << 21)) nBytes = 4;
00247                                 else if (u < (1u << 26)) nBytes = 5;
00248                                 else nBytes = 6;
                                      // Presumably sizes 'src' to nBytes elements with room for 6 -- confirm against TVec::Gen.
00249                                 src.Gen(6, nBytes);
00250                                 if (nBytes == 1) src[0] = u;
00251                                 else {
                                              // Lead byte with the top nBytes bits set, then six payload bits per continuation byte.
00252                                         src[0] = (((1 << nBytes) - 1) << (8 - nBytes)) | (u >> (6 * (nBytes - 1)));
00253                                         for (int i = 1; i < nBytes; i++) src[i] = 0x80 | ((u >> (6 * (nBytes - i - 1))) & _0011_1111); }
                                      // Only values above 0x10ffff in strict mode are expected to be rejected here.
00254                                 bool err = (strict && u > 0x10ffff);
                                      // Presumably empties 'expectedDest' while keeping room for one element -- confirm against TVec::Reserve.
00255                                 expectedDest.Reserve(1, 0);
00256                                 if (err && errorHandling == uehReplace) expectedDest.Add(replacementChar);
00257                                 else if (! err) expectedDest.Add(u);
                                      // Expected return value of the decoder: 1 for a valid codepoint, 0 otherwise.
00258                                 int erv = (err ? 0 : 1);
                                      // With skipBom, 0xfeff and 0xfffe are expected to be dropped from the output.
00259                                 if (skipBom && (u == 0xfeff || u == 0xfffe)) expectedDest.Clr(), erv = 0;
00260                                 TestUtf8(true, erv, (err && errorHandling == uehThrow), src, expectedDest, 0);
00261                                 // We can also test the UTF-8 encoder.
                                      // The encoder input is the single codepoint in 'src2'; the expected output is 'src',
                                      // i.e. the bytes built above, or -- for a rejected codepoint -- the encoded
                                      // replacement character in uehReplace mode and nothing in the other modes.
00262                                 src2[0] = u;
00263                                 if (err) {
00264                                         if (errorHandling == uehReplace) src = utf8ReplCh;
00265                                         else src.Clr(false); }
00266                                 TestUtf8(false, (err ? 0 : 1), (err && errorHandling == uehThrow), src2, src, 0);
00267                                 //
00268                                 if (u == uTo) break;
00269                         }
00270                 }
00271         }
00272 }
00273 
00274 //-----------------------------------------------------------------------------
00275 // TUniCodec -- UTF-16 test driver
00276 //-----------------------------------------------------------------------------
00277 
00278 void TUniCodec::WordsToBytes(const TIntV& src, TIntV& dest)
00279 {
00280         dest.Clr();
00281         bool isLE = IsMachineLittleEndian();
00282         for (int i = 0; i < src.Len(); i++) {
00283                 int c = src[i] & 0xffff;
00284                 if (isLE) { dest.Add(c & 0xff); dest.Add((c >> 8) & 0xff); }
00285                 else { dest.Add((c >> 8) & 0xff); dest.Add(c & 0xff); } }
00286 }
00287 
00288 void TUniCodec::TestUtf16(bool decode, size_t expectedRetVal, bool expectedThrow, const TIntV& src, const TIntV& expectedDest,
00289         const TUtf16BomHandling bomHandling, const TUniByteOrder defaultByteOrder, const bool insertBom,
00290         FILE *f)
00291 {
00292         TIntV srcBytes, expectedDestBytes;
00293         WordsToBytes(src, srcBytes); WordsToBytes(expectedDest, expectedDestBytes);
00294         TIntV dest;
00295         if (f) {
00296                 fprintf(f, "Settings: %s  %s  %s  %s  %s replacementChar = %x  \n",
00297                         (errorHandling == uehAbort ? "abort" : errorHandling == uehThrow ? "throw" : errorHandling == uehIgnore ? "ignore" : errorHandling == uehReplace ? "replace" : "????"),
00298                         (strict ? "STRICT" : ""), (decode ? (skipBom ? "skipBom" : "") : (insertBom ? "insrtBom" : "")),
00299                         (bomHandling == bomAllowed ? "bomAllowed" : bomHandling == bomRequired ? "bomRequired" : "bomIgnored"),
00300                         (defaultByteOrder == boBigEndian ? "boBigEndian" : defaultByteOrder == boLittleEndian ? "boLittleEndian" : "boMachineEndian"),
00301                         uint(replacementChar));
00302                 fprintf(f, "src: "); for (int i = 0; i < src.Len(); i++) fprintf(f, (decode ? " %04x" : " %x"), uint(src[i])); }
00303         for (int useBytes = 0; useBytes < 2; useBytes++)
00304         {
00305                 const char *fmt = (useBytes ? " %02x" : " %04x");
00306                 try
00307                 {
00308                         dest.Clr();
00309                         size_t retVal;
00310                         if (! useBytes) {
00311                                 if (decode) retVal = DecodeUtf16FromWords(src, 0, src.Len(), dest, true, bomHandling, defaultByteOrder);
00312                                 else retVal = EncodeUtf16ToWords(src, 0, src.Len(), dest, true, insertBom, defaultByteOrder); }
00313                         else {
00314                                 if (decode) retVal = DecodeUtf16FromBytes(srcBytes, 0, srcBytes.Len(), dest, true, bomHandling, defaultByteOrder);
00315                                 else retVal = EncodeUtf16ToBytes(src, 0, src.Len(), dest, true, insertBom, defaultByteOrder); }
00316                         const TIntV& ed = (useBytes && ! decode ? expectedDestBytes : expectedDest);
00317                         if (f) {
00318                                 fprintf(f, "\n -> dest:    "); for (int i = 0; i < dest.Len(); i++) fprintf(f, (decode ? " %x" :  fmt), uint(dest[i]));
00319                                 fprintf(f, "\n    expDest  "); for (int i = 0; i < ed.Len(); i++) fprintf(f, (decode ? " %x" :  fmt), uint(ed[i]));
00320                                 fprintf(f, "\n    retVal = %llu (expected %llu)\n", static_cast<long long unsigned int> (retVal), static_cast<long long unsigned int> (expectedRetVal)); }
00321                         bool ok = true;
00322                         if (retVal != expectedRetVal) ok = false;
00323                         if (dest.Len() != ed.Len()) ok = false;
00324                         if (ok) for (int i = 0; i < dest.Len(); i++) if (dest[i] != ed[i]) ok = false;
00325                         if (! ok)
00326                         {
00327                                 printf("!!!\n");
00328                         }
00329                         IAssert(retVal == expectedRetVal); IAssert(! expectedThrow);
00330                         IAssert(dest.Len() == ed.Len());
00331                         for (int i = 0; i < dest.Len(); i++) IAssert(dest[i] == ed[i]);
00332                 }
00333                 catch (TUnicodeException e)
00334                 {
00335                         if (f) {
00336                                 fprintf(f, "\n -> expDest  "); for (int i = 0; i < expectedDest.Len(); i++) fprintf(f, (decode ? " %x" : fmt), uint(expectedDest[i]));
00337                                 fprintf(f, "\n    exception \"%s\" at %d (char 0x%02x)\n", e.message.CStr(), int(e.srcIdx), uint(e.srcChar)); }
00338                         IAssert(expectedThrow);
00339                 }
00340         }
00341 }
00342 
00343 // Generates a random UTF-16-encoded stream according to the specifications in 'testCaseDesc',
00344 // then calls TestUtf16 to make sure that DecodeUtf16 reacts as expected.
      // 'rnd' supplies the random codepoints; 'bomHandling' and 'defaultByteOrder' are handed on
      // to the decoder; 'insertBom' decides whether a byte-order mark is placed at the start of
      // the generated stream. The expected output and return value are predicted from the
      // codec's current 'strict', 'skipBom', 'errorHandling' and 'replacementChar' settings.
00345 void TUniCodec::TestDecodeUtf16(TRnd& rnd, const TStr& testCaseDesc,
00346         const TUtf16BomHandling bomHandling,
00347         const TUniByteOrder defaultByteOrder,
00348         const bool insertBom)
00349 {
00350         TIntV src; TIntV expectedDest; int expectedRetVal = 0;
              // Becomes true once an error is expected under uehAbort or uehThrow; from then on
              // nothing more is added to 'expectedDest' or counted in 'expectedRetVal'.
00351         bool expectedAbort = false;
              // Logging is off; TestUtf16 writes nothing when it receives a null stream.
00352         FILE *f = 0;
00353         bool isMachineLe = IsMachineLittleEndian();
00354         bool isDefaultLe = (defaultByteOrder == boLittleEndian || (defaultByteOrder == boMachineEndian && isMachineLe));
              // True when the stream's byte order differs from the machine's, in which case every
              // generated word is byte-swapped before being added to 'src'.
00355         bool swap = (isMachineLe != isDefaultLe);
00356         if (insertBom) {
                      // 0xfffe is the byte-swapped form of the mark 0xfeff.
00357                 src.Add(swap ? 0xfffe : 0xfeff);
00358                 if (! skipBom) { expectedRetVal += 1; expectedDest.Add(0xfeff); } }
00359         else if (bomHandling == bomRequired) {
                      // A required mark is missing: the decoder is expected to give up at once. The -1 is
                      // converted to the largest size_t value when it is passed to TestUtf16 below.
00360                 expectedAbort = true; expectedRetVal = -1; }
00361         // testCaseDesc should consist of single characters or pairs of characters, 'c[e]', where:
00362         // - 'c' defines the range from which the codepoint should be taken ('A'..'E', 'X'..'Y');
00363         // - 'e' defines how many words will be removed from the end of the encoded sequence for this codepoint.
00364         //   (absent = 0, 'a' = 1).
00365         for (int i = 0; i < testCaseDesc.Len(); )
00366         {
00367                 const char c = testCaseDesc[i++];
00368                 uint cp = 0; int nWords = -1;
00369                 if (c == 'X' || c == 'Y') IAssert(i > 1); // if you want a BOM at the beginning of your data, use insertBom -- if we permit X and Y here, predicting the expectedDest and expectedRetVal gets more complicated
00370                 if (c == 'A') { cp = GetRndUint(rnd, 0u, Utf16FirstSurrogate - 1); nWords = 1; } // characters below the first surrogate range
00371                 else if (c == 'B') { cp = GetRndUint(rnd, Utf16FirstSurrogate, Utf16FirstSurrogate + 1023); nWords = 1; } // the first surrogate range
00372                 else if (c == 'C') { cp = GetRndUint(rnd, Utf16SecondSurrogate, Utf16SecondSurrogate + 1023); nWords = 1; } // the second surrogate range
00373                 else if (c == 'D') { do { cp = GetRndUint(rnd, Utf16SecondSurrogate + 1024, 0xffffu); } while (cp == 0xfffe || cp == 0xfeff); nWords = 1; } // above the second surrogate range, but still in the BMP
00374                 else if (c == 'E') { cp = GetRndUint(rnd, 0x10000u, 0x10ffffu); nWords = 2; } // above the BMP, but still within the range for UTF-16
00375                 else if (c == 'X') { cp = 0xfffe; nWords = 1; }
00376                 else if (c == 'Y') { cp = 0xfeff; nWords = 1; }
00377                 else Fail;
                      // A lone first surrogate must not be followed by a second surrogate, because the two
                      // words would then form a valid pair instead of the intended error.
00378                 if (c == 'B' && i < testCaseDesc.Len()) IAssert(testCaseDesc[i] != 'C');
00379                 // Process 'e'.
00380                 int nToDel = 0;
00381                 if (i < testCaseDesc.Len()) {
00382                         const char e = testCaseDesc[i];
                              // Any character from 'a' upwards is consumed here and means "remove one word".
00383                         if (e >= 'a') { i += 1; nToDel = 1; }}
                      // Only two-word codepoints ('E') may have a word removed.
00384                 IAssert((nWords == 1 && nToDel == 0) || (nWords == 2 && (nToDel == 0 || nToDel == 1)));
                      // Same reasoning as above: a truncated pair leaves a lone first surrogate in the stream.
00385                 if (nWords == 2 && nToDel == 1 && i < testCaseDesc.Len()) IAssert(testCaseDesc[i] != 'C');
00386                 // Will an error occur during the decoding of this codepoint?
00387                 bool errHere = false;
00388                 if (Utf16FirstSurrogate <= cp && cp <= Utf16FirstSurrogate + 1023) errHere = true;
                      // None of the ranges above produces a value beyond 0x10ffff, hence the Fail.
00389                 else if (cp > 0x10ffff) { Fail; errHere = true; }
00390                 else if (nToDel > 0) errHere = true;
                      // A lone second surrogate is expected to be an error in strict mode only.
00391                 else if (strict && (Utf16SecondSurrogate <= cp && cp <= Utf16SecondSurrogate + 1023)) errHere = true;
00392                 // Update 'expectedDest' and 'expectedRetVal'.
00393                 if (! expectedAbort) {
00394                         if (! errHere) {
                                      // 0xfffe or 0xfeff at the very start of the stream is expected to be
                                      // dropped when skipBom is set.
00395                                 if (src.Len() == 0 && (cp == 0xfffe || cp == 0xfeff) && skipBom) { }
00396                                 else { expectedDest.Add(cp); expectedRetVal += 1; } }
00397                         else if (errorHandling == uehReplace) {
                                      // Replacement characters are not counted in 'expectedRetVal'.
00398                                 expectedDest.Add(replacementChar); }
00399                         if (errHere && (errorHandling == uehAbort || errorHandling == uehThrow)) expectedAbort = true; }
00400                 // Update 'src'.
00401                 if (nWords == 1) src.Add(swap ? SwapBytes(cp) : cp);
00402                 else {
                              // Split the codepoint into a surrogate pair: ten bits of (cp - 0x10000) per word.
00403                         int c1 = ((cp - 0x10000) >> 10) & 1023; c1 += Utf16FirstSurrogate;
00404                         int c2 = (cp - 0x10000) & 1023; c2 += Utf16SecondSurrogate;
00405                         src.Add(swap ? SwapBytes(c1) : c1);
                              // The second word is the one that gets left out when a word is to be removed.
00406                         if (nToDel == 0) src.Add(swap ? SwapBytes(c2) : c2); }
00407         }
00408         if (f) fprintf(f, "Test case: \"%s\"\n", testCaseDesc.CStr());
              // An exception is expected only if an error was predicted and the codec is in uehThrow mode.
              // 'false' is passed for TestUtf16's insertBom argument: the mark, if any, is already in 'src',
              // and in the decoding path TestUtf16 uses that argument for its log output only.
00409         TestUtf16(true, expectedRetVal, expectedAbort && (errorHandling == uehThrow), src, expectedDest, bomHandling, defaultByteOrder, false, f);
00410 }
00411 
00412 void TUniCodec::TestUtf16()
00413 {
00414         TIntV utf16ReplCh; utf16ReplCh.Add(replacementChar);
00415         for (int skipBom_ = 0; skipBom_ < 2; skipBom_++)
00416         for (int strict_ = 0; strict_ < 2; strict_++)
00417         for (int errMode_ = 0; errMode_ < 4; errMode_++)
00418         for (int bomHandling_ = 0; bomHandling_ < 3; bomHandling_++)
00419         for (int byteOrder_ = 0; byteOrder_ < 3; byteOrder_++)
00420         for (int insertBom_ = 0; insertBom_ < 2; insertBom_++)
00421         {
00422                 strict = (strict_ == 1); errorHandling = TUnicodeErrorHandling(errMode_); skipBom = (skipBom_ == 1);
00423                 bool insertBom = (insertBom_ == 1);
00424                 TUniByteOrder byteOrder = (TUniByteOrder) byteOrder_;
00425                 TUtf16BomHandling bomHandling = (TUtf16BomHandling) bomHandling_;
00426                 TRnd rnd = TRnd(123);
00427                 // Test DecodeUtf16 on various random UTF-16-encoded sequences.
00428                 for (int i = 0; i < 10; i++)
00429                 {
00430                         TestDecodeUtf16(rnd, "A", bomHandling, byteOrder, insertBom);
00431                         TestDecodeUtf16(rnd, "AAA", bomHandling, byteOrder, insertBom);
00432                         TestDecodeUtf16(rnd, "B", bomHandling, byteOrder, insertBom);
00433                         TestDecodeUtf16(rnd, "DDAADADAAADDDAA", bomHandling, byteOrder, insertBom);
00434                         TestDecodeUtf16(rnd, "DEEEDAAEEDADEEAAEEADEEDDAA", bomHandling, byteOrder, insertBom);
00435                         TestDecodeUtf16(rnd, "DEaEaEDAAEaEDADEaEAAEEADEEDDAA", bomHandling, byteOrder, insertBom);
00436                         TestDecodeUtf16(rnd, "CABDEBACCEaB", bomHandling, byteOrder, insertBom);
00437                         TestDecodeUtf16(rnd, "EaEEEEaBBACABXABYXXEaYDDXBDCEA", bomHandling, byteOrder, insertBom);
00438                         TestDecodeUtf16(rnd, "EaEEEEaBDCAAXADYXXEaYDDXDCEA", bomHandling, byteOrder, insertBom);
00439                 }
00440                 //continue;
00441                 // Test both DecodeUtf16 and EncodeUtf16 systematically on various characters
00442                 // close to powers of 2.
00443                 TIntV src, expectedDest, src2;
00444                 expectedDest.Gen(1); src.Reserve(6); src2.Gen(1);
00445                 for (int pow = 8; pow <= 32; pow++)
00446                 {
00447                         uint uFrom, uTo;
00448                         if (pow == 8) uFrom = 0, uTo = 1u << pow;
00449                         else if (pow == 32) uFrom = TUInt::Mx - (1u << 8), uTo = TUInt::Mx;
00450                         else uFrom = (1u << pow) - (1u << 8), uTo = (1u << pow) + (1u << 8);
00451                         printf("%u..%u          \r", uFrom, uTo);
00452                         for (uint u = uFrom; ; u++)
00453                         {
00454                                 int nWords = 0;
00455                                 if (u < 0x10000) nWords = 1;
00456                                 else nWords = 2;
00457                                 bool isMachineLe = IsMachineLittleEndian(), isDestLe = (byteOrder == boLittleEndian || (byteOrder == boMachineEndian && isMachineLe));
00458                                 bool swap = (isMachineLe != isDestLe);
00459                                 bool err = (u > 0x10ffff) || (Utf16FirstSurrogate <= u && u <= Utf16FirstSurrogate + 1023) || (strict && Utf16SecondSurrogate <= u && u <= Utf16SecondSurrogate + 1023);
00460                                 src.Gen(3, (err ? 0 : nWords) + (insertBom ? 1 : 0));
00461                                 if (insertBom) src[0] = (swap ? 0xfffe : 0xfeff);
00462                                 if (! ((u > 0x10ffff) || (Utf16FirstSurrogate <= u && u <= Utf16FirstSurrogate + 1023)))
00463                                 {
00464                                         // Try to encode 'u' and see if it gets decoded correctly.
00465                                         if (nWords == 1) src[insertBom ? 1 : 0] = (swap ? SwapBytes(u) : u);
00466                                         else {
00467                                                 int u1 = Utf16FirstSurrogate + (((u - 0x10000) >> 10) & 1023);
00468                                                 int u2 = Utf16SecondSurrogate + ((u - 0x10000) & 1023);
00469                                                 src[insertBom ? 1 : 0] = (swap ? SwapBytes(u1) : u1);
00470                                                 src[insertBom ? 2 : 1] = (swap ? SwapBytes(u2) : u2); }
00471                                         if (! ((u == 0xfffe || u == 0xfeff) && bomHandling == bomAllowed && ! insertBom)) // this will just create a mess when decoding
00472                                         {
00473                                                 expectedDest.Reserve(2, 0);
00474                                                 if (insertBom && ! skipBom) expectedDest.Add(0xfeff);
00475                                                 if (err && errorHandling == uehReplace) expectedDest.Add(replacementChar);
00476                                                 else if (! err) expectedDest.Add(u);
00477                                                 int erv = (err ? 0 : expectedDest.Len());
00478                                                 if (skipBom && (u == 0xfeff || u == 0xfffe) && ! insertBom) expectedDest.Clr(), erv = 0;
00479                                                 bool errD = err;
00480                                                 if (bomHandling == bomRequired && ! insertBom) {
00481                                                         expectedDest.Clr(false);
00482                                                         if (u == 0xfeff || u == 0xfffe) { erv = (skipBom ? 0 : 1); if (! skipBom) expectedDest.Add(0xfeff); }
00483                                                         else { erv = -1; errD = true;
00484                                                                 /*if (errorHandling == uehReplace) expectedDest.Add(replacementChar);*/ }}
00485                                                 TestUtf16(true, erv, (errD && errorHandling == uehThrow), src, expectedDest, bomHandling, byteOrder, insertBom, 0);
00486                                         }
00487                                 }
00488                                 // We can also test the UTF-16 encoder.
00489                                 src2[0] = u;
00490                                 if (err) {
00491                                         src.Clr(false); if (insertBom) src.Add(swap ? 0xfffe : 0xfeff);
00492                                         if (errorHandling == uehReplace) {
00493                                                 src.Add(swap ? SwapBytes(replacementChar) : replacementChar);
00494                                                 /*if (byteOrder == boBigEndian || (byteOrder == boMachineEndian && ! TUniCodec::IsMachineLittleEndian())) { src.Add((replacementChar >> 8) & 0xff); src.Add(replacementChar & 0xff); }
00495                                                 else { src.Add(replacementChar & 0xff); src.Add((replacementChar >> 8) & 0xff); } */
00496                                         }}
00497                                 TestUtf16(false, (err ? 0 : 1) + (insertBom ? 1 : 0), (err && errorHandling == uehThrow), src2, src, bomHandling, byteOrder, insertBom, 0);
00498                                 //
00499                                 if (u == uTo) break;
00500                         }
00501                 }
00502         }
00503 }
00504 
00505 //-----------------------------------------------------------------------------
00506 // TUniCaseFolding
00507 //-----------------------------------------------------------------------------
00508 
00509 void TUniCaseFolding::LoadTxt(const TStr& fileName)
00510 {
00511         Clr();
00512         TUniChDb::TUcdFileReader reader; reader.Open(fileName);
00513         TStrV fields;
00514         while (reader.GetNextLine(fields))
00515         {
00516                 int cp = reader.ParseCodePoint(fields[0]);
00517                 const TStr status = fields[1], mapsTo = fields[2];
00518                 if (status == "C" || status == "S" || status == "T") {
00519                         TIntH &dest = (status == "C" ? cfCommon : status == "S" ? cfSimple : cfTurkic);
00520                         IAssert(! dest.IsKey(cp));
00521                         int cp2 = reader.ParseCodePoint(mapsTo);
00522                         dest.AddDat(cp, cp2); }
00523                 else if (status == "F") {
00524                         TIntIntVH &dest = cfFull;
00525                         IAssert(! dest.IsKey(cp));
00526                         TIntV cps; reader.ParseCodePointList(mapsTo, cps); IAssert(cps.Len() > 0);
00527                         dest.AddDat(cp, cps); }
00528                 else
00529                         FailR(status.CStr());
00530         }
00531         printf("TUniCaseFolding(\"%s\"): %d common, %d simple, %d full, %d Turkic.\n",
00532                 fileName.CStr(), cfCommon.Len(), cfSimple.Len(), cfFull.Len(), cfTurkic.Len());
00533 }
00534 
00535 void TUniCaseFolding::Test(const TIntV& src, const TIntV& expectedDest, const bool full, const bool turkic, FILE *f)
00536 {
00537         fprintf(f, "TUniCaseFolding(%s%s): ", (full ? "full" : "simple"), (turkic ? ", turkic" : ""));
00538         for (int i = 0; i < src.Len(); i++) fprintf(f, " %04x", int(src[i]));
00539         TIntV dest; Fold(src, 0, src.Len(), dest, true, full, turkic);
00540         fprintf(f, "\n  -> ");
00541         for (int i = 0; i < dest.Len(); i++) fprintf(f, " %04x", int(dest[i]));
00542         fprintf(f, "\n");
00543         IAssert(dest.Len() == expectedDest.Len());
00544         for (int i = 0; i < dest.Len(); i++) IAssert(dest[i] == expectedDest[i]);
00545 }
00546 
00547 /*
00548 void TUniCaseFolding::Test(const TIntV& src, FILE *f) {
00549         Test(src, false, false, f); Test(src, false, true, f);
00550         Test(src, true, false, f); Test(src, true, true, f); }
00551 */
00552 
00553 void TUniCaseFolding::Test()
00554 {
00555         FILE *f = stderr;
00556         TVectorBuilder VB;
00557         // simple
00558         Test((VB, 0x41, 0x62, 0x49, 0x43, 0xdf), (VB, 0x61, 0x62, 0x69, 0x63, 0xdf), false, false, f);
00559         // simple + turkic
00560         Test((VB, 0x41, 0x62, 0x49, 0x43, 0xdf), (VB, 0x61, 0x62, 0x131, 0x63, 0xdf), false, true, f);
00561         // full
00562         Test((VB, 0x41, 0x62, 0x49, 0x43, 0xdf), (VB, 0x61, 0x62, 0x69, 0x63, 0x73, 0x73), true, false, f);
00563         // full + turkic
00564         Test((VB, 0x41, 0x62, 0x49, 0x43, 0xdf), (VB, 0x61, 0x62, 0x131, 0x63, 0x73, 0x73), true, true, f);
00565 }
00566 
00567 //-----------------------------------------------------------------------------
00568 // TUniChInfo
00569 //-----------------------------------------------------------------------------
00570 
00571 // UAX #14
00572 const ushort TUniChInfo::LineBreak_Unknown = TUniChInfo::GetLineBreakCode('X', 'X');
00573 const ushort TUniChInfo::LineBreak_ComplexContext = TUniChInfo::GetLineBreakCode('S', 'A');
00574 const ushort TUniChInfo::LineBreak_Numeric = TUniChInfo::GetLineBreakCode('N', 'U');
00575 const ushort TUniChInfo::LineBreak_InfixNumeric = TUniChInfo::GetLineBreakCode('I', 'S');
00576 const ushort TUniChInfo::LineBreak_Quotation = TUniChInfo::GetLineBreakCode('Q', 'U');
00577 
00578 //-----------------------------------------------------------------------------
00579 // TUniChDb -- word breaking
00580 //-----------------------------------------------------------------------------
00581 
00582 // Test driver for WbFind*NonIgnored.
00583 void TUniChDb::TestWbFindNonIgnored(const TIntV& src) const
00584 {
00585         int n = src.Len();
00586         TBoolV isIgnored; isIgnored.Gen(n);
00587         for (int i = 0; i < n; i++) isIgnored[i] = IsWbIgnored(src[i]);
00588         TIntV prevNonIgnored, nextNonIgnored, curOrNextNonIgnored;
00589         prevNonIgnored.Gen(n); nextNonIgnored.Gen(n); curOrNextNonIgnored.Gen(n);
00590         FILE *f = 0; // stderr;
00591         for (int srcIdx = 0; srcIdx < n; srcIdx++) for (int srcLen = 1; srcLen < n - srcIdx; srcLen++)
00592         {
00593                 int prev = -1;
00594                 for (int i = 0; i < srcLen; i++) {
00595                         prevNonIgnored[i] = prev;
00596                         if (! isIgnored[srcIdx + i]) prev = srcIdx + i; }
00597                 int next = srcIdx + srcLen;
00598                 for (int i = srcLen - 1; i >= 0; i--) {
00599                         nextNonIgnored[i] = next;
00600                         if (! isIgnored[srcIdx + i]) next = srcIdx + i;
00601                         curOrNextNonIgnored[i] = next; }
00602                 if (f) {
00603                         fprintf(f, "\nIndex:     "); for (int i = 0; i < srcLen; i++) fprintf(f, " %2d", srcIdx + i);
00604                         fprintf(f, "\nNonIgn:    "); for (int i = 0; i < srcLen; i++) fprintf(f, " %s", (isIgnored[srcIdx + i] ? " ." : " Y"));
00605                         fprintf(f, "\nPrevNI:    "); for (int i = 0; i < srcLen; i++) fprintf(f, " %2d", int(prevNonIgnored[i]));
00606                         fprintf(f, "\nNextNI:    "); for (int i = 0; i < srcLen; i++) fprintf(f, " %2d", int(nextNonIgnored[i]));
00607                         fprintf(f, "\nCurNextNI: "); for (int i = 0; i < srcLen; i++) fprintf(f, " %2d", int(curOrNextNonIgnored[i]));
00608                         fprintf(f, "\n"); }
00609                 for (int i = 0; i < srcLen; i++)
00610                 {
00611                         size_t s;
00612                         s = size_t(srcIdx + i); WbFindNextNonIgnored(src, s, size_t(srcIdx + srcLen));
00613                         IAssert(s == size_t(nextNonIgnored[i]));
00614                         s = size_t(srcIdx + i); WbFindCurOrNextNonIgnored(src, s, size_t(srcIdx + srcLen));
00615                         IAssert(s == size_t(curOrNextNonIgnored[i]));
00616                         s = size_t(srcIdx + i); bool ok = WbFindPrevNonIgnored(src, size_t(srcIdx), s);
00617                         if (prevNonIgnored[i] < 0) { IAssert(! ok); IAssert(s == size_t(srcIdx)); }
00618                         else { IAssert(ok); IAssert(s == size_t(prevNonIgnored[i])); }
00619                 }
00620         }
00621 }
00622 
00623 void TUniChDb::TestWbFindNonIgnored() const
00624 {
00625         TIntV chIgnored, chNonIgnored;
00626         FILE *f = 0; // stderr;
00627         for (int i = h.FFirstKeyId(); h.FNextKeyId(i); ) {
00628                 const int cp = h.GetKey(i); const TUniChInfo& ci = h[i];
00629                 if (f) fprintf(f, "%04x: flags %08x props %08x %08x script \"%s\"\n", cp,
00630                         ci.flags, ci.properties, ci.propertiesX, GetScriptName(ci.script).CStr());
00631                 (IsWbIgnored(h[i]) ? chIgnored : chNonIgnored).Add(h.GetKey(i));
00632         }
00633         chIgnored.Sort(); chNonIgnored.Sort();
00634         printf("TUniChDb::TestWbNonIgnored: %d ignored, %d nonignored chars.\n", chIgnored.Len(), chNonIgnored.Len());
00635         TRnd rnd = TRnd(123);
00636         for (int iter = 0; iter <= 50; iter++)
00637         {
00638                 int percIgnored = 2 * iter;
00639                 for (int n = 0; n <= 20; n++)
00640                 {
00641                         // Prepare a random sequence of 'n' codepoints.
00642                         TIntV v; v.Gen(n);
00643                         for (int i = 0; i < n; i++) {
00644                                 TIntV& chars = (rnd.GetUniDevInt(100) < percIgnored) ? chIgnored : chNonIgnored;
00645                                 int j = rnd.GetUniDevInt(chars.Len());
00646                                 v.Add(chars[j]); }
00647                         // Run the tests with this sequence.
00648                         TestWbFindNonIgnored(v);
00649                 }
00650         }
00651 }
00652 
00653 void TUniChDb::TestFindNextWordOrSentenceBoundary(const TStr& basePath, bool sentence)
00654 {
00655         TUcdFileReader reader; TStrV fields;
00656         reader.Open(CombinePath(CombinePath(basePath, GetAuxiliaryDir()), (sentence ? GetSentenceBreakTestFn() : GetWordBreakTestFn())));
00657         int nLines = 0; TRnd rnd = TRnd(123);
00658         while (reader.GetNextLine(fields))
00659         {
00660                 nLines += 1;
00661                 IAssert(fields.Len() == 1);
00662                 TStrV parts; fields[0].SplitOnWs(parts);
00663                 const int n = parts.Len(); IAssert((n % 2) == 1);
00664                 TIntV chars; TBoolV isBreak, isPredicted, isPredicted2;
00665                 // Each line is a sequence of codepoints, with a \times or \div in between each
00666                 // pair of codepoints (as well as at the beginning and the end of the sequence) to
00667                 // indicate whether a boundary exists there or not.
00668                 for (int i = 0; i < n; i++)
00669                 {
00670                         const TStr& s = parts[i];
00671                         if ((i % 2) == 0) {
00672                                 if (s == "\xc3\x97") // multiplication sign (U+00D7) in UTF-8
00673                                         isBreak.Add(false);
00674                                 else if (s == "\xc3\xb7") // division sign (U+00F7) in UTF-8
00675                                         isBreak.Add(true);
00676                                 else FailR(s.CStr()); }
00677                         else chars.Add(reader.ParseCodePoint(s));
00678                 }
00679                 const int m = n / 2; IAssert(chars.Len() == m); IAssert(isBreak.Len() == m + 1);
00680                 IAssert(isBreak[0]); IAssert(isBreak[m]);
00681                 isPredicted.Gen(m + 1); isPredicted.PutAll(false);
00682                 if (AlwaysFalse()) { printf("%3d", nLines); for (int i = 0; i < m; i++) printf(" %04x", int(chars[i])); printf("\n"); }
00683                 // We'll insert a few random characters at the beginning of the sequence
00684                 // so that srcPos doesn't always begin at 0.
00685                 for (int nBefore = 0; nBefore < 5; nBefore++)
00686                 {
00687                         TIntV chars2; for (int i = 0; i < nBefore; i++) chars2.Add(0, rnd.GetUniDevInt(0x10ffff + 1));
00688                         chars2.AddV(chars);
00689                         // Use FindNextBoundary to find all the word boundaries.
00690                         size_t position = (nBefore > 0 ? nBefore - 1 : nBefore); size_t prevPosition = position;
00691                         while (sentence ? FindNextSentenceBoundary(chars2, nBefore, m, position) : FindNextWordBoundary(chars2, nBefore, m, position))
00692                         {
00693                                 IAssert(prevPosition < position);
00694                                 IAssert(position <= size_t(nBefore + m));
00695                                 isPredicted[int(position) - nBefore] = true;
00696                                 prevPosition = position;
00697                         }
00698                         IAssert(position == size_t(nBefore + m));
00699                         if (sentence) FindSentenceBoundaries(chars2, nBefore, m, isPredicted2);
00700                         else FindWordBoundaries(chars2, nBefore, m, isPredicted2);
00701                         IAssert(isPredicted2.Len() == m + 1);
00702                         bool ok = true;
00703                         // If we start at 0, the word boundary at the beginning of the sequence was
00704                         // not found explicitly, so we'll add it now.
00705                         if (nBefore == 0) isPredicted[0] = true;
00706                         // Compare the predicted and the true boundaries.
00707                         for (int i = 0; i <= m; i++) {
00708                                 if (isBreak[i] != isPredicted[i]) ok = false;
00709                                 IAssert(isPredicted2[i] == isPredicted[i]); }
00710                         FILE *f = stderr;
00711                         if (! ok)
00712                         {
00713                                 fprintf(f, "\nError in line %d:\n", nLines);
00714                                 fprintf(f, "True:      ");
00715                                 for (int i = 0; i <= m; i++) {
00716                                         fprintf(f, "%s ", (isBreak[i] ? "|" : "."));
00717                                         if (i < m) fprintf(f, "%04x ", int(chars[i + nBefore])); }
00718                                 fprintf(f, "\nPredicted: ");
00719                                 for (int i = 0; i <= m; i++) {
00720                                         fprintf(f, "%s ", (isPredicted[i] ? "|" : "."));
00721                                         if (i < m) {
00722                                                 const int cp = chars[i + nBefore];
00723                                                 TStr s = sentence ? TUniChInfo::GetSbFlagsStr(GetSbFlags(cp)) : TUniChInfo::GetWbFlagsStr(GetWbFlags(cp));
00724                                                 if (IsWbIgnored(cp)) s = "*" + s;
00725                                                 fprintf(f, "%4s ", s.CStr()); }}
00726                                 fprintf(f, "\n");
00727                                 Fail;
00728                         }
00729                         // Test FindNextBoundary if we start in the middle of the sequence,
00730                         // i.e. not at an existing boundary.
00731                         for (int i = 0; i < m; i++) {
00732                                 position = i + nBefore; bool ok = sentence ? FindNextSentenceBoundary(chars2, nBefore, m, position) : FindNextWordBoundary(chars2, nBefore, m, position);
00733                                 IAssert(ok); // at the very least, there should be the 'boundary' at nBefore + m
00734                                 IAssert(size_t(i + nBefore) < position); IAssert(position <= size_t(nBefore + m));
00735                                 position -= nBefore;
00736                                 for (int j = i + 1; j < int(position); j++)
00737                                         IAssert(! isBreak[j]);
00738                                 IAssert(isBreak[int(position)]); }
00739                 }
00740         }
00741         reader.Close();
00742         printf("TUniChDb::TestFindNext%sBoundary: %d lines processed.\n", (sentence ? "Sentence" : "Word"), nLines);
00743 }
00744 
00745 //-----------------------------------------------------------------------------
00746 // TUniChDb -- composition and decomposition
00747 //-----------------------------------------------------------------------------
00748 
00749 void TUniChDb::TestComposition(const TStr& basePath)
00750 {
00751         TUcdFileReader reader; TStrV fields; int nLines = 0;
00752         reader.Open(CombinePath(basePath, GetNormalizationTestFn()));
00753         bool inPart1 = false; TIntH testedInPart1;
00754         while (reader.GetNextLine(fields))
00755         {
00756                 nLines += 1;
00757                 if (fields.Len() == 1) {
00758                         IAssert(fields[0].IsPrefix("@Part"));
00759                         inPart1 = (fields[0] == "@Part1"); continue; }
00760                 IAssert(fields.Len() == 6);
00761                 IAssert(fields[5].Len() == 0);
00762                 TIntV c1, c2, c3, c4, c5;
00763                 reader.ParseCodePointList(fields[0], c1);
00764                 reader.ParseCodePointList(fields[1], c2);
00765                 reader.ParseCodePointList(fields[2], c3);
00766                 reader.ParseCodePointList(fields[3], c4);
00767                 reader.ParseCodePointList(fields[4], c5);
00768                 TIntV v;
00769 #define AssE_(v1, v2, expl) AssertEq(v1, v2, TStr(expl) + " (line " + TInt::GetStr(nLines) + ")", 0)
00770 #define NFC_(cmpWith, operand) DecomposeAndCompose(operand, 0, operand.Len(), v, false); AssE_(cmpWith, v, #cmpWith " == NFC(" #operand ")")
00771 #define NFD_(cmpWith, operand) Decompose(operand, 0, operand.Len(), v, false); AssE_(cmpWith, v, #cmpWith " == NFD(" #operand ")")
00772 #define NFKC_(cmpWith, operand) DecomposeAndCompose(operand, 0, operand.Len(), v, true); AssE_(cmpWith, v, #cmpWith " == NFKC(" #operand ")")
00773 #define NFKD_(cmpWith, operand) Decompose(operand, 0, operand.Len(), v, true); AssE_(cmpWith, v, #cmpWith " == NFKD(" #operand ")")
00774                 // NFD:
00775                 NFD_(c3, c1);   // c3 == NFD(c1)
00776                 NFD_(c3, c2);   // c3 == NFD(c2)
00777                 NFD_(c3, c3);   // c3 == NFD(c3)
00778                 NFD_(c5, c4);   // c5 == NFD(c4)
00779                 NFD_(c5, c5);   // c5 == NFD(c5)
00780                 // NFC:
00781                 NFC_(c2, c1);   // c2 == NFC(c1)
00782                 NFC_(c2, c2);   // c2 == NFC(c2)
00783                 NFC_(c2, c3);   // c2 == NFC(c3)
00784                 NFC_(c4, c4);   // c4 == NFC(c4)
00785                 NFC_(c4, c5);   // c4 == NFC(c5)
00786                 // NFKD:
00787                 NFKD_(c5, c1);   // c5 == NFKD(c1)
00788                 NFKD_(c5, c2);   // c5 == NFKD(c2)
00789                 NFKD_(c5, c3);   // c5 == NFKD(c3)
00790                 NFKD_(c5, c4);   // c5 == NFKD(c4)
00791                 NFKD_(c5, c5);   // c5 == NFKD(c5)
00792                 // NFKC:
00793                 NFKC_(c4, c1);   // c4 == NFKC(c1)
00794                 NFKC_(c4, c2);   // c4 == NFKC(c2)
00795                 NFKC_(c4, c3);   // c4 == NFKC(c3)
00796                 NFKC_(c4, c4);   // c4 == NFKC(c4)
00797                 NFKC_(c4, c5);   // c4 == NFKC(c5)
00798                 //
00799                 if (inPart1) {
00800                         IAssert(c1.Len() == 1);
00801                         testedInPart1.AddKey(c1[0]); }
00802         }
00803         reader.Close();
00804         // Test other individual codepoints that were not mentioned in part 1.
00805         int nOther = 0;
00806         for (int i = h.FFirstKeyId(); h.FNextKeyId(i); )
00807         {
00808                 const int cp = h.GetKey(i), nLines = -1;
00809                 if (testedInPart1.IsKey(cp)) continue;
00810                 TIntV x, v; x.Add(cp);
00811                 NFC_(x, x);    // x == NFC(x)
00812                 NFD_(x, x);    // x == NFD(x)
00813                 NFKC_(x, x);   // x == NFKC(x)
00814                 NFKD_(x, x);   // x == NFKD(x)
00815                 nOther += 1;
00816         }
00817 #undef AssE_
00818 #undef NFC_
00819 #undef NFD_
00820 #undef NFKC_
00821 #undef NFKD_
00822         printf("TUniChDb::TestComposition: %d lines processed + %d other individual codepoints.\n", nLines, nOther);
00823 }
00824 
00825 //-----------------------------------------------------------------------------
00826 // TUniChDb -- case conversion tests
00827 //-----------------------------------------------------------------------------
00828 
00829 void TUniChDb::TestCaseConversion(const TStr& source, const TStr& trueLc,
00830                                                                   const TStr& trueTc, const TStr& trueUc,
00831                                                                   bool turkic, bool lithuanian)
00832 {
00833         TIntV src;
00834         TUcdFileReader::ParseCodePointList(source, src);
00835         FILE *f = stderr;
00836         for (int i = 0; i < 3; i++)
00837         {
00838                 TCaseConversion how = (i == 0) ? ccLower : (i == 1) ? ccTitle : ccUpper;
00839                 const TStr &trueDestS = (how == ccLower ? trueLc : how == ccTitle ? trueTc : trueUc);
00840                 TIntV trueDest; TUcdFileReader::ParseCodePointList(trueDestS, trueDest);
00841                 TIntV dest;
00842                 GetCaseConverted(src, 0, src.Len(), dest, true, how, turkic, lithuanian);
00843                 bool ok = (dest.Len() == trueDest.Len());
00844                 if (ok) for (int i = 0; i < dest.Len() && ok; i++) ok = ok && (dest[i] == trueDest[i]);
00845                 if (ok) continue;
00846                 fprintf(f, "%s(", (how == ccLower ? "toLowercase" : how == ccTitle ? "toTitlecase" : "toUppercase"));
00847                 for (int i = 0; i < src.Len(); i++) fprintf(f, "%s%04x", (i == 0 ? "" : " "), int(src[i]));
00848                 fprintf(f, ")\nCorrect:   (");
00849                 for (int i = 0; i < trueDest.Len(); i++) fprintf(f, "%s%04x", (i == 0 ? "" : " "), int(trueDest[i]));
00850                 fprintf(f, ")\nOur output:(");
00851                 for (int i = 0; i < dest.Len(); i++) fprintf(f, "%s%04x", (i == 0 ? "" : " "), int(dest[i]));
00852                 fprintf(f, ")\n");
00853                 IAssert(ok);
00854         }
00855 }
00856 
00857 void TUniChDb::TestCaseConversions()
00858 {
00859         // Because no thorough case-conversion test files have been provided as part
00860         // of the Unicode standard, we'll have to test things on a few test cases of our own.
00861         // - First, test some unconditional special mappings, such as 'ss', 'ffl', 'dz', etc.
00862         const TStr F = "0046 ", L = "004C ", S = "0053 ", T = "0054 ", W = "0057 ";
00863         const TStr f = "0066 ", l = "006c ", s = "0073 ", t = "0074 ", w = "0077 ";
00864         const TStr ss = "00df ", ffl = "fb04 ", longs = "017f ", longst = "fb05 ", wRing = "1e98 ", Ring = "030a ";
00865         const TStr DZ = "01c4 ", Dz = "01c5 ", dz = "01c6 ";
00866         const TStr space = "0020 ", Grave = "0300 ";
00867         TestCaseConversion(
00868                 F + L + s + t + space + Dz + w + T + ss + wRing + space + longs + DZ + space + dz + longst,  // source
00869                 f + l + s + t + space + dz + w + t + ss + wRing + space + longs + dz + space + dz + longst,  // lowercase
00870                 F + l + s + t + space + Dz + w + t + ss + wRing + space + S + dz + space + Dz + longst,      // titlecase
00871                 F + L + S + T + space + DZ + W + T + S + S + W + Ring + space + S + DZ + space + DZ + S + T, // uppercase
00872                 false, false);
00873         // - Dotted I, dotless i, etc., but with turkic == false.
00874         const TStr I = "0049 ", J = "004a ", i = "0069 ", j = "006a ", iDotless = "0131 ", IDot = "0130 ", DotA = "0307 ";
00875         TestCaseConversion(
00876                 s + I + t + i + w + iDotless + f + IDot + l + space + iDotless + DotA + f + I + DotA + s, // source
00877                 s + i + t + i + w + iDotless + f + i + DotA + l + space + iDotless + DotA + f + i + DotA + s, // lowercase
00878                 S + i + t + i + w + iDotless + f + i + DotA + l + space + I + DotA + f + i + DotA + s, // titlecase
00879                 S + I + T + I + W + I + F + IDot + L + space + I + DotA + F + I + DotA + S, // uppercase
00880                 false, false);
00881         // - Sigma (final vs. non-final forms).
00882         const TStr Sigma = "03a3 ", sigma = "03c3 ", fsigma = "03c2 ";
00883         TestCaseConversion(
00884                 Sigma + s + space + s + Sigma  + space + s + Sigma + s + space + Sigma + S + Sigma  + space + Sigma, // source
00885                 sigma + s + space + s + fsigma + space + s + sigma + s + space + sigma + s + fsigma + space + sigma, // lowercase
00886                 Sigma + s + space + S + fsigma + space + S + sigma + s + space + Sigma + s + fsigma + space + Sigma, // titlecase
00887                 Sigma + S + space + S + Sigma  + space + S + Sigma + S + space + Sigma + S + Sigma  + space + Sigma, // uppercase
00888                 false, false);
00889         TestCaseConversion(
00890                 sigma + s + space + s + sigma  + space + s + sigma + s + space + sigma + S + sigma  + space + sigma, // source
00891                 sigma + s + space + s + sigma  + space + s + sigma + s + space + sigma + s + sigma  + space + sigma, // lowercase
00892                 Sigma + s + space + S + sigma  + space + S + sigma + s + space + Sigma + s + sigma  + space + Sigma, // titlecase
00893                 Sigma + S + space + S + Sigma  + space + S + Sigma + S + space + Sigma + S + Sigma  + space + Sigma, // uppercase
00894                 false, false);
00895         TestCaseConversion(
00896                 fsigma + s + space + s + fsigma + space + s + fsigma + s + space + fsigma + S + fsigma  + space + fsigma, // source
00897                 fsigma + s + space + s + fsigma + space + s + fsigma + s + space + fsigma + s + fsigma  + space + fsigma, // lowercase
00898                 Sigma  + s + space + S + fsigma + space + S + fsigma + s + space + Sigma  + s + fsigma  + space + Sigma, // titlecase
00899                 Sigma  + S + space + S + Sigma  + space + S + Sigma  + S + space + Sigma  + S + Sigma   + space + Sigma, // uppercase
00900                 false, false);
00901         const TStr nonSA = "0315 0321 0322 "; // characters that are neither ccStarter nor ccAbove
00902         // Special case mappings for Turkic languages:
00903         // - After_I
00904         TestCaseConversion(
00905                 s + I + t + i + w + iDotless + f + IDot + l + space + iDotless + DotA + f + I + DotA + J + DotA + I + Grave + DotA + I + DotA + DotA + I + nonSA + DotA + s, // source
00906                 s + iDotless + t + i + w + iDotless + f + i + l + space + iDotless + DotA + f + i + j + DotA + iDotless + Grave + DotA + i + DotA + i + nonSA + s, // lowercase
00907                 S + iDotless + t + i + w + iDotless + f + i + l + space + I + DotA + f + i + j + DotA + iDotless + Grave + DotA + i + DotA + i + nonSA + s, // titlecase
00908                 S + I + T + IDot + W + I + F + IDot + L + space + I + DotA + F + I + DotA + J + DotA + I + Grave + DotA + I + DotA + DotA + I + nonSA + DotA + S, // uppercase
00909                 true, false); // turkic
00910         // - Not_Before_Dot
00911         TestCaseConversion(
00912                 I + Grave + t + I + DotA + f + I + nonSA + DotA + j + space + I + nonSA + DotA + space + I + Grave + t, // source
00913                 iDotless + Grave + t + i + f + i + nonSA + j + space + i + nonSA + space + iDotless + Grave + t, // lowercase
00914                 I + Grave + t + i + f + i + nonSA + j + space + I + nonSA + DotA + space + I + Grave + t, // titlecase
00915                 I + Grave + T + I + DotA + F + I + nonSA + DotA + J + space + I + nonSA + DotA + space + I + Grave + T, // uppercase
00916                 true, false); // turkic
00917         // Special case mappings for Lithuanian:
00918         // - After_Soft_Dotted  [note: I + DotA turns into i + DotA + DotA when lowercasing due to More_Above]
00919         TestCaseConversion(
00920                 i + DotA + t + i + Grave + DotA + f + i + DotA + DotA + f + i + nonSA + DotA + I + DotA + t + DotA + i + DotA + Grave, // source
00921                 i + DotA + t + i + Grave + DotA + f + i + DotA + DotA + f + i + nonSA + DotA + i + DotA + DotA + t + DotA + i + DotA + Grave, // lowercase
00922                 I + t + i + Grave + DotA + f + i + DotA + DotA + f + i + nonSA + DotA + i + DotA + DotA + t + DotA + i + DotA + Grave, // titlecase
00923                 I + T + I + Grave + DotA + F + I + DotA + F + I + nonSA + I + DotA + T + DotA + I + Grave, // uppercase
00924                 false, true); // lithuanian
00925         // - More_Above  [note: j + DotA turns into just J when uppercasing due to After_Soft_Dotted]
00926         TestCaseConversion(
00927                 J +        Grave + space + J +        nonSA + DotA + space + j + Grave + space + j + DotA + space + J + nonSA + J +        nonSA + Grave + space + j + nonSA, // source
00928                 j + DotA + Grave + space + j + DotA + nonSA + DotA + space + j + Grave + space + j + DotA + space + j + nonSA + j + DotA + nonSA + Grave + space + j + nonSA, // lowercase
00929                 J +        Grave + space + J +        nonSA + DotA + space + J + Grave + space + J +        space + J + nonSA + j + DotA + nonSA + Grave + space + J + nonSA, // titlecase
00930                 J +        Grave + space + J +        nonSA + DotA + space + J + Grave + space + J +        space + J + nonSA + J +        nonSA + Grave + space + J + nonSA, // uppercase
00931                 false, true); // lithuanian
00932         // SoftDotted [^ Starter Above]* 0307   --(uc,tc)-->  brez 0307
00933         // SoftDotted [^ Starter Above]* 0307   --(
00934         //TestCaseConversion("", "", "", "", false, false);
00935 }
00936 
00937 //-----------------------------------------------------------------------------
00938 // TUniChDb -- initialization from the text files
00939 //-----------------------------------------------------------------------------
00940 
00941 void TUniChDb::LoadTxt_ProcessDecomposition(TUniChInfo& ci, TStr s)
00942 {
00943         if (s.Empty()) return;
00944         if (s[0] == '<') {
00945                 int i = s.SearchCh('>'); IAssert(i > 0);
00946                 ci.flags |= ucfCompatibilityDecomposition;
00947                 s = s.GetSubStr(i + 1, s.Len() - 1); s.ToTrunc(); }
00948         TIntV dec; TUcdFileReader::ParseCodePointList(s, dec);
00949         IAssert(dec.Len() > 0);
00950         ci.decompOffset = decompositions.Len();
00951         decompositions.AddV(dec); decompositions.Add(-1);
00952 }
00953 
00954 void TUniChDb::InitPropList(const TStr& basePath)
00955 {
00956         TUcdFileReader reader; TStrV fields; int nCps = 0, nLines = 0;
00957         reader.Open(CombinePath(basePath, GetPropListFn()));
00958         TSubcatHelper helper(*this);
00959         while (reader.GetNextLine(fields))
00960         {
00961                 IAssert(fields.Len() == 2);
00962                 int from, to; reader.ParseCodePointRange(fields[0], from, to);
00963                 TStr s = fields[1];
00964                 TUniChProperties prop = TUniChProperties(0); TUniChPropertiesX propx = TUniChPropertiesX(0);
00965                 if (s == "White_Space") prop = ucfPrWhiteSpace;
00966                 else if (s == "Bidi_Control") prop = ucfPrBidiControl;
00967                 else if (s == "Join_Control") prop = ucfPrJoinControl;
00968                 else if (s == "Dash") prop = ucfPrDash;
00969                 else if (s == "Hyphen") prop = ucfPrHyphen;
00970                 else if (s == "Quotation_Mark") prop = ucfPrQuotationMark;
00971                 else if (s == "Terminal_Punctuation") prop = ucfPrTerminalPunctuation;
00972                 else if (s == "Other_Math") propx = ucfPxOtherMath;
00973                 else if (s == "Hex_Digit") prop = ucfPrHexDigit;
00974                 else if (s == "ASCII_Hex_Digit") prop = ucfPrAsciiHexDigit;
00975                 else if (s == "Other_Alphabetic") propx = ucfPxOtherAlphabetic;
00976                 else if (s == "Ideographic") prop = ucfPrIdeographic;
00977                 else if (s == "Diacritic") prop = ucfPrDiacritic;
00978                 else if (s == "Extender") prop = ucfPrExtender;
00979                 else if (s == "Other_Lowercase") propx = ucfPxOtherLowercase;
00980                 else if (s == "Other_Uppercase") propx = ucfPxOtherUppercase;
00981                 else if (s == "Noncharacter_Code_Point") prop = ucfPrNoncharacterCodePoint;
00982                 else if (s == "Other_Grapheme_Extend") propx = ucfPxOtherGraphemeExtend;
00983                 else if (s == "IDS_Binary_Operator") propx = ucfPxIdsBinaryOperator;
00984                 else if (s == "IDS_Trinary_Operator") propx = ucfPxIdsTrinaryOperator;
00985                 else if (s == "Radical") propx = ucfPxRadical;
00986                 else if (s == "Unified_Ideograph") propx = ucfPxUnifiedIdeograph;
00987                 else if (s == "Other_Default_Ignorable_Code_Point") propx = ucfPxOtherDefaultIgnorableCodePoint;
00988                 else if (s == "Deprecated") prop = ucfPrDeprecated;
00989                 else if (s == "Soft_Dotted") prop = ucfPrSoftDotted;
00990                 else if (s == "Logical_Order_Exception") prop = ucfPrLogicalOrderException;
00991                 else if (s == "Other_ID_Start") propx = ucfPxOtherIdStart;
00992                 else if (s == "Other_ID_Continue") propx = ucfPxOtherIdContinue;
00993                 else if (s == "STerm") prop = ucfPrSTerm;
00994                 else if (s == "Variation_Selector") prop = ucfPrVariationSelector;
00995                 else if (s == "Pattern_White_Space") prop = ucfPrPatternWhiteSpace;
00996                 else if (s == "Pattern_Syntax") prop = ucfPrPatternSyntax;
00997                 else FailR(s.CStr());
00998                 helper.ProcessComment(reader);
00999                 for (int cp = from; cp <= to; cp++) {
01000                         int i = h.GetKeyId(cp); if (i < 0) { i = h.AddKey(cp); helper.SetCat(cp); }
01001                         TUniChInfo &ci = h[i]; helper.TestCat(cp);
01002                         if (prop) { IAssert(! ci.IsProperty(prop)); ci.SetProperty(prop); }
01003                         if (propx) { IAssert(! ci.IsPropertyX(propx)); ci.SetPropertyX(propx); }
01004                         nCps++; }
01005                 nLines++;
01006         }
01007         reader.Close();
01008         printf("TUniChDb::InitPropList: %d lines, %d code points.\n", nLines, nCps);
01009 }
01010 
01011 void TUniChDb::InitDerivedCoreProperties(const TStr& basePath)
01012 {
01013         TUcdFileReader reader; TStrV fields; int nCps = 0, nLines = 0;
01014         reader.Open(CombinePath(basePath, GetDerivedCorePropsFn()));
01015         TSubcatHelper helper(*this);
01016         while (reader.GetNextLine(fields))
01017         {
01018                 IAssert(fields.Len() == 2);
01019                 int from, to; reader.ParseCodePointRange(fields[0], from, to);
01020                 TStr s = fields[1];
01021                 TUniChFlags flag = ucfCompatibilityDecomposition;
01022                 if (s == "Math") flag = ucfDcpMath;
01023                 else if (s == "Alphabetic") flag = ucfDcpAlphabetic;
01024                 else if (s == "Lowercase") flag = ucfDcpLowercase;
01025                 else if (s == "Uppercase") flag = ucfDcpUppercase;
01026                 else if (s == "ID_Start") flag = ucfDcpIdStart;
01027                 else if (s == "ID_Continue") flag = ucfDcpIdContinue;
01028                 else if (s == "XID_Start") flag = ucfDcpXidStart;
01029                 else if (s == "XID_Continue") flag = ucfDcpXidContinue;
01030                 else if (s == "Default_Ignorable_Code_Point") flag = ucfDcpDefaultIgnorableCodePoint;
01031                 else if (s == "Grapheme_Extend") flag = ucfDcpGraphemeExtend;
01032                 else if (s == "Grapheme_Base") flag = ucfDcpGraphemeBase;
01033                 else if (s == "Grapheme_Link") continue; // this flag is deprecated; test for combClass == Virama instead
01034                 else FailR(s.CStr());
01035                 // If we add new codepoints to the hash table, we should also set their category.
01036                 // This is supposed to be provided in the comment, e.g. "# Cf       SOFT HYPHEN".
01037                 helper.ProcessComment(reader);
01038                 //
01039                 for (int cp = from; cp <= to; cp++) {
01040                         int i = h.GetKeyId(cp); if (i < 0) { i = h.AddKey(cp); helper.SetCat(cp); }
01041                         helper.TestCat(cp);
01042                         TUniChInfo &ci = h[i]; IAssert(! ci.IsDcpFlag(flag));
01043                         ci.SetDcpFlag(flag); nCps++; }
01044                 nLines++;
01045         }
01046         reader.Close();
01047         printf("TUniChDb::InitDerivedCoreProperties: %d lines, %d code points.\n", nLines, nCps);
01048 }
01049 
01050 void TUniChDb::InitLineBreaks(const TStr& basePath)
01051 {
01052         // Clear old linebreak values.
01053         ushort xx = TUniChInfo::LineBreak_Unknown;
01054         for (int i = h.FFirstKeyId(); h.FNextKeyId(i); ) h[i].lineBreak = xx;
01055         // Read LineBreak.txt.
01056         TUcdFileReader reader; TStrV fields;
01057         reader.Open(CombinePath(basePath, GetLineBreakFn()));
01058         int nLines = 0, nCps = 0;
01059         while (reader.GetNextLine(fields))
01060         {
01061                 IAssert(fields.Len() == 2);
01062                 int from, to; reader.ParseCodePointRange(fields[0], from, to);
01063                 TStr s = fields[1]; IAssert(s.Len() == 2);
01064                 ushort us = TUniChInfo::GetLineBreakCode(s[0], s[1]);
01065                 if (us == xx) continue;
01066                 for (int cp = from; cp <= to; cp++) {
01067                         int i = h.GetKeyId(cp); if (i < 0) { i = h.AddKey(cp);
01068                                 printf("TUniChDb::InitLineBreaks: warning, adding codepoint %d, its category will remain unknown.\n", cp); }
01069                         IAssert(h[i].lineBreak == xx);
01070                         h[i].lineBreak = us; nCps++; }
01071                 nLines++;
01072         }
01073         reader.Close();
01074         printf("TUniChDb::InitLineBreaks: %d lines, %d codepoints processed (excluding \'xx\' values).\n", nLines, nCps);
01075 }
01076 
01077 void TUniChDb::InitScripts(const TStr& basePath)
01078 {
01079         TUcdFileReader reader; TStrV fields;
01080         reader.Open(CombinePath(basePath, GetScriptsFn()));
01081         TSubcatHelper helper(*this);
01082         while (reader.GetNextLine(fields))
01083         {
01084                 int from, to; reader.ParseCodePointRange(fields[0], from, to);
01085                 TStr scriptName = fields[1];
01086                 int scriptNo = scripts.GetKeyId(scriptName);
01087                 if (scriptNo < 0) { scriptNo = scripts.AddKey(scriptName); scripts[scriptNo] = 0; }
01088                 IAssert(scriptNo >= 0 && scriptNo < SCHAR_MAX); // because TUniChInfo.script is a signed char
01089                 scripts[scriptNo] += 1;
01090                 helper.ProcessComment(reader);
01091                 for (int cp = from; cp <= to; cp++) {
01092                         int i = h.GetKeyId(cp); if (i < 0) { i = h.AddKey(cp); helper.SetCat(cp); }
01093                         helper.TestCat(cp);
01094                         TUniChInfo &ci = h[i]; ci.script = scriptNo; }
01095         }
01096         reader.Close();
01097         scripts.AddDat(GetScriptNameUnknown()) = 0;
01098         printf("TUniChDb::InitScripts: %d scripts: ", scripts.Len());
01099         if (AlwaysFalse()) for (int i = scripts.FFirstKeyId(); scripts.FNextKeyId(i); )
01100                 printf("  %d:%s (%d)", i, scripts.GetKey(i).CStr(), int(scripts[i]));
01101         printf("\n");
01102 }
01103 
01104 void TUniChDb::InitWordAndSentenceBoundaryFlags(const TStr& basePath)
01105 {
01106         // UAX #29, sec. 4.1 and 5.1.
01107         // Note: these flags can also be initialized from auxiliary\\WordBreakProperty.txt.
01108         int katakana = GetScriptByName(GetScriptNameKatakana()); IAssert(katakana >= 0);
01109         int hiragana = GetScriptByName(GetScriptNameHiragana()); IAssert(hiragana >= 0);
01110         // Clear any existing word-boundary flags and initialize them again.
01111         for (int i = h.FFirstKeyId(); h.FNextKeyId(i); )
01112         {
01113                 const int cp = h.GetKey(i); TUniChInfo& ci = h[i];
01114                 ci.ClrWbAndSbFlags();
01115                 // Word-boundary flags.
01116                 if (ci.subCat  == ucOtherFormat && cp != 0x200c && cp != 0x200d) ci.SetWbFlag(ucfWbFormat);
01117                 if (ci.script == katakana) ci.SetWbFlag(ucfWbKatakana);
01118                 if (ci.lineBreak == TUniChInfo::LineBreak_InfixNumeric && cp != 0x3a) ci.SetWbFlag(ucfWbMidNum);
01119                 if (ci.lineBreak == TUniChInfo::LineBreak_Numeric) ci.SetWbFlag(ucfWbNumeric);
01120                 if (ci.subCat == ucPunctuationConnector) ci.SetWbFlag(ucfWbExtendNumLet);
01121                 // Sentence-boundary flags.  Some are identical to some word-boundary flags.
01122                 if (cp == 0xa || cp == 0xd || cp == 0x85 || cp == 0x2028 || cp == 0x2029) ci.SetSbFlag(ucfSbSep);
01123                 if (ci.subCat == ucOtherFormat && cp != 0x200c && cp != 0x200d) ci.SetSbFlag(ucfSbFormat);
01124                 if (ci.IsWhiteSpace() && ! ci.IsSbFlag(ucfSbSep) && cp != 0xa0) ci.SetSbFlag(ucfSbSp);
01125                 if (ci.IsLowercase() && ! ci.IsGraphemeExtend()) ci.SetSbFlag(ucfSbLower);
01126                 if (ci.IsUppercase() || ci.subCat == ucLetterTitlecase) ci.SetSbFlag(ucfSbUpper);
01127                 if ((ci.IsAlphabetic() || cp == 0xa0 || cp == 0x5f3) && ! ci.IsSbFlag(ucfSbLower) && ! ci.IsSbFlag(ucfSbUpper) && ! ci.IsGraphemeExtend()) ci.SetSbFlag(ucfSbOLetter);
01128                 if (ci.lineBreak == TUniChInfo::LineBreak_Numeric) ci.SetSbFlag(ucfSbNumeric);
01129                 if (cp == 0x2e) ci.SetSbFlag(ucfSbATerm);
01130                 // Note: UAX #29 says that if the property STerm = true, then the character should belong to the STerm class for
01131                 // the purposes of sentence-boundary detection.  Now in PropList.txt there is no doubt that 002E has the STerm
01132                 // property; thus, it should also belong to the STerm sentence-boundary class.  However, in
01133                 // SentenceBreakProperty.txt, 002E is only listed in the ATerm class, but not in the STerm class.
01134                 if (ci.IsSTerminal() && cp != 0x2e) ci.SetSbFlag(ucfSbSTerm);
01135                 if ((ci.subCat == ucPunctuationOpen || ci.subCat == ucPunctuationClose || ci.lineBreak == TUniChInfo::LineBreak_Quotation) && cp != 0x5f3 && ! ci.IsSbFlag(ucfSbATerm) && ! ci.IsSbFlag(ucfSbSTerm)) ci.SetSbFlag(ucfSbClose);
01136         }
01137         // Some additional characters for Katakana and MidLetter.
01138         TIntV v = (VB, 0x3031, 0x3032, 0x3033, 0x3034, 0x3035, 0x309b, 0x309c, 0x30a0, 0x30fc, 0xff70, 0xff9e, 0xff9f);
01139         for (int i = 0; i < v.Len(); i++) h.GetDat(v[i]).SetWbFlag(ucfWbKatakana);
01140         v = (VB, 0x27, 0xb7, 0x5f4, 0x2019, 0x2027, 0x3a);
01141         for (int i = 0; i < v.Len(); i++) h.GetDat(v[i]).SetWbFlag(ucfWbMidLetter);
01142         // WbALetter depends on Katakana, so it cannot be initialized earlier.
01143         for (int i = h.FFirstKeyId(); h.FNextKeyId(i); )
01144         {
01145                 const int cp = h.GetKey(i); TUniChInfo& ci = h[i];
01146                 if ((ci.IsAlphabetic() || cp == 0x5f3) && ! ci.IsIdeographic() && ! ci.IsWbFlag(ucfWbKatakana) && ci.lineBreak != TUniChInfo::LineBreak_ComplexContext && ci.script != hiragana && ! ci.IsGraphemeExtend())
01147                         ci.SetWbFlag(ucfWbALetter);
01148         }
01149         // An alternative is to extract the flags from WordBreakProperty.txt.
01150         // The results should be the same.
01151         {TUcdFileReader reader; TStrV fields;
01152         reader.Open(CombinePath(CombinePath(basePath, GetAuxiliaryDir()), GetWordBreakPropertyFn()));
01153         THash<TInt, TInt> hh;
01154         while (reader.GetNextLine(fields))
01155         {
01156                 IAssert(fields.Len() == 2);
01157                 int from, to; reader.ParseCodePointRange(fields[0], from, to);
01158                 TStr s = fields[1];
01159                 TUniChFlags flag = ucfCompatibilityDecomposition;
01160                 if (s == "Format") flag = ucfWbFormat;
01161                 else if (s == "Katakana") flag = ucfWbKatakana;
01162                 else if (s == "ALetter") flag = ucfWbALetter;
01163                 else if (s == "MidLetter") flag = ucfWbMidLetter;
01164                 else if (s == "MidNum") flag = ucfWbMidNum;
01165                 else if (s == "Numeric") flag = ucfWbNumeric;
01166                 else if (s == "ExtendNumLet") flag = ucfWbExtendNumLet;
01167                 else FailR(s.CStr());
01168                 for (int c = from; c <= to; c++) {
01169                         int i = hh.GetKeyId(c); if (i < 0) hh.AddDat(c, flag);
01170                         else hh[i].Val |= flag; }
01171         }
01172         reader.Close();
01173         TIntV cps; for (int i = h.FFirstKeyId(); h.FNextKeyId(i); ) cps.Add(h.GetKey(i));
01174         for (int i = hh.FFirstKeyId(); hh.FNextKeyId(i); ) cps.Add(hh.GetKey(i));
01175         cps.Sort(); cps.Merge();
01176         for (int i = 0; i < cps.Len(); i++)
01177         {
01178                 int cp = cps[i];
01179                 int flags1 = 0; if (h.IsKey(cp)) flags1 = h.GetDat(cp).GetWbFlags();
01180                 int flags2 = 0; if (hh.IsKey(cp)) flags2 = hh.GetDat(cp);
01181                 flags1 &= ~ucfSbSep; flags2 &= ~ucfSbSep;
01182                 if (flags1 != flags2) {
01183                         printf("cp = %04x: flags1 = %08x flags2 = %08x xor = %08x\n", cp, flags1, flags2, flags1 ^ flags2);
01184                         Fail; }
01185         }}
01186         // Likewise, for sentence boundary flags we have SentenceBreakProperty.txt.
01187         {TUcdFileReader reader; TStrV fields;
01188         reader.Open(CombinePath(CombinePath(basePath, GetAuxiliaryDir()), GetSentenceBreakPropertyFn()));
01189         THash<TInt, TInt> hh;
01190         while (reader.GetNextLine(fields))
01191         {
01192                 IAssert(fields.Len() == 2);
01193                 int from, to; reader.ParseCodePointRange(fields[0], from, to);
01194                 TStr s = fields[1];
01195                 TUniChFlags flag = ucfCompatibilityDecomposition;
01196                 if (s == "Sep") flag = ucfSbSep;
01197                 else if (s == "Format") flag = ucfSbFormat;
01198                 else if (s == "Sp") flag = ucfSbSp;
01199                 else if (s == "Lower") flag = ucfSbLower;
01200                 else if (s == "Upper") flag = ucfSbUpper;
01201                 else if (s == "OLetter") flag = ucfSbOLetter;
01202                 else if (s == "Numeric") flag = ucfSbNumeric;
01203                 else if (s == "ATerm") flag = ucfSbATerm;
01204                 else if (s == "STerm") flag = ucfSbSTerm;
01205                 else if (s == "Close") flag = ucfSbClose;
01206                 else FailR(s.CStr());
01207                 for (int c = from; c <= to; c++) {
01208                         int i = hh.GetKeyId(c); if (i < 0) hh.AddDat(c, flag);
01209                         else hh[i].Val |= flag; }
01210         }
01211         reader.Close();
01212         TIntV cps; for (int i = h.FFirstKeyId(); h.FNextKeyId(i); ) cps.Add(h.GetKey(i));
01213         for (int i = hh.FFirstKeyId(); hh.FNextKeyId(i); ) cps.Add(hh.GetKey(i));
01214         cps.Sort(); cps.Merge();
01215         for (int i = 0; i < cps.Len(); i++)
01216         {
01217                 int cp = cps[i];
01218                 int flags1 = 0; if (h.IsKey(cp)) flags1 = h.GetDat(cp).GetSbFlags();
01219                 int flags2 = 0; if (hh.IsKey(cp)) flags2 = hh.GetDat(cp);
01220                 if (flags1 != flags2) {
01221                         printf("cp = %04x: flags1 = %08x [%s] flags2 = %08x [%s] xor = %08x\n", cp,
01222                                 flags1, TUniChInfo::GetSbFlagsStr(flags1).CStr(),
01223                                 flags2, TUniChInfo::GetSbFlagsStr(flags2).CStr(),
01224                                 flags1 ^ flags2);
01225                         Fail; }
01226         }}
01227 }
01228 
01229 void TUniChDb::InitSpecialCasing(const TStr& basePath)
01230 {
01231         TUcdFileReader reader; TStrV fields;
01232         reader.Open(CombinePath(basePath, GetSpecialCasingFn()));
01233         while (reader.GetNextLine(fields))
01234         {
01235                 IAssert(fields.Len() == 5 || fields.Len() == 6);
01236                 IAssert(fields.Last().Empty());
01237                 // Skip conditional mappings -- they will be hardcoded in the GetCaseConverted method.
01238                 TStr conditions = "";
01239                 if (fields.Len() == 6) conditions = fields[4];
01240                 conditions.ToTrunc(); if (! conditions.Empty()) continue;
01241                 // Keep the other mappings.
01242                 const int cp = reader.ParseCodePoint(fields[0]);
01243                 TIntV v; reader.ParseCodePointList(fields[1], v);
01244                 specialCasingLower.AddDat(cp, v);
01245                 reader.ParseCodePointList(fields[2], v);
01246                 specialCasingTitle.AddDat(cp, v);
01247                 reader.ParseCodePointList(fields[3], v);
01248                 specialCasingUpper.AddDat(cp, v);
01249         }
01250         reader.Close();
01251 }
01252 
01253 void TUniChDb::LoadTxt(const TStr& basePath)
01254 {
01255         Clr();
01256         // Set up a hash table with enough ports that there will be more or less no chains longer than 1 element.
01257         h = THash<TInt, TUniChInfo>(196613, true);
01258         //
01259         caseFolding.LoadTxt(CombinePath(basePath, GetCaseFoldingFn()));
01260         //
01261         TUcdFileReader reader; TStrV fields; TIntH seen;
01262         reader.Open(CombinePath(basePath, GetUnicodeDataFn()));
01263         while (reader.GetNextLine(fields))
01264         {
01265                 // Codepoint.
01266                 int cp = reader.ParseCodePoint(fields[0]);
01267                 IAssert(! seen.IsKey(cp)); seen.AddKey(cp);
01268                 TUniChInfo& ci = h.AddDat(cp);
01269                 // Name.
01270                 ci.nameOffset = charNames.AddStr(fields[1]);
01271                 // Category.
01272                 TStr& s = fields[2]; IAssert(s.Len() == 2);
01273                 ci.chCat = s[0]; ci.chSubCat = s[1];
01274                 // Canonical combining class.
01275                 s = fields[3]; IAssert(s.Len() > 0);
01276                 int i; bool ok = s.IsInt(true, TUCh::Mn, TUCh::Mx, i); IAssertR(ok, s);
01277                 ci.combClass = (uchar) i;
01278                 // Decomposition type and mapping.
01279                 LoadTxt_ProcessDecomposition(ci, fields[5]);
01280                 // Simple case mappings.
01281                 s = fields[12]; ci.simpleUpperCaseMapping = (! s.Empty() ? reader.ParseCodePoint(s) : -1);
01282                 s = fields[13]; ci.simpleLowerCaseMapping = (! s.Empty() ? reader.ParseCodePoint(s) : -1);
01283                 s = fields[14]; ci.simpleTitleCaseMapping = (! s.Empty() ? reader.ParseCodePoint(s) : -1);
01284                 //
01285                 ci.InitAfterLoad(); // initializes ci.cat, ci.subCat
01286         }
01287         reader.Close();
01288         //
01289         InitScripts(basePath);
01290         //
01291         InitPropList(basePath);
01292         InitDerivedCoreProperties(basePath);
01293         InitLineBreaks(basePath);
01294         InitSpecialCasing(basePath);
01295         // Process the composition exclusions (UAX #15, sec. 6).
01296         for (int i = h.FFirstKeyId(); h.FNextKeyId(i); )
01297         {
01298                 TUniChInfo& ci = h[i];
01299                 int ofs = ci.decompOffset; if (ofs < 0) continue;
01300                 int n = 0; while (decompositions[ofs + n] >= 0) n++;
01301                 IAssert(n > 0);
01302                 // Singleton decompositions.
01303                 if (n == 1) { ci.flags |= ucfCompositionExclusion; continue; }
01304                 // Non-starter decompositions.
01305                 int cp1 = decompositions[ofs];
01306                 IAssert(h.IsKey(cp1));
01307                 uchar ccc = h.GetDat(cp1).combClass;
01308                 if (ccc != TUniChInfo::ccStarter) { ci.flags |= ucfCompositionExclusion; continue; }
01309         }
01310         // Process the composition exclusion table.
01311         reader.Open(CombinePath(basePath, GetCompositionExclusionsFn()));
01312         int nExclusionTable = 0;
01313         while (reader.GetNextLine(fields))
01314         {
01315                 IAssert(fields.Len() == 1);
01316                 int cp = reader.ParseCodePoint(fields[0]);
01317                 int i = h.GetKeyId(cp); IAssert(i >= 0);
01318                 h[i].flags |= ucfCompositionExclusion;
01319                 nExclusionTable++;
01320         }
01321         reader.Close();
01322         // Prepare the inverted index for composition pairs.
01323         for (int i = h.FFirstKeyId(); h.FNextKeyId(i); )
01324         {
01325                 int cp = h.GetKey(i);
01326                 TUniChInfo& ci = h[i];
01327                 int ofs = ci.decompOffset; if (ofs < 0) continue;
01328                 if (ci.IsCompositionExclusion()) continue;
01329                 if (ci.IsCompatibilityDecomposition()) continue;
01330                 int n = 0; while (decompositions[ofs + n] >= 0) n++;
01331                 if (n != 2) continue;
01332                 TIntPr pr = TIntPr(decompositions[ofs], decompositions[ofs + 1]);
01333                 IAssert(! inverseDec.IsKey(pr));
01334                 IAssert(ci.combClass == TUniChInfo::ccStarter);
01335                 inverseDec.AddDat(pr, cp);
01336         }
01337         printf("TUniChDb(%s): %d chars in h, %d in decomp inverse index; %d in decomp vector; %d in exclusion table\n",
01338                 basePath.CStr(), h.Len(), inverseDec.Len(), decompositions.Len(), nExclusionTable);
01339         // Before calling InitWordBoundaryFlags(), scripts must have been initialized, as well as
01340         // flags such as Alphabetic, Word_Break, and Grapheme_Extend.
01341         InitWordAndSentenceBoundaryFlags(basePath); // Note: scripts must have been initialized by this point.
01342         // Make sure that Hangul combined characters are treated as stareters.
01343         for (int cp = HangulSBase; cp < HangulSBase + HangulSCount; cp++)
01344         {
01345                 int j = h.GetKeyId(cp); if (j < 0) continue;
01346                 TUniChInfo& ci = h[j];
01347                 if (ci.combClass == TUniChInfo::ccInvalid) ci.combClass = TUniChInfo::ccStarter;
01348                 IAssert(ci.combClass == TUniChInfo::ccStarter);
01349         }
01350         // There should be no more additions to 'h' beyond this point.
01351         const int oldHLen = h.Len();
01352         // Provide default (identity) case mappings if any were missing from UnicodeData.txt
01353         // (or if any entirely new characters were added later, e.g. while reading LineBreaks.txt).
01354         int scriptUnknown = GetScriptByName(GetScriptNameUnknown());
01355         for (int i = h.FFirstKeyId(); h.FNextKeyId(i); )
01356         {
01357                 int cp = h.GetKey(i); TUniChInfo &ci = h[i];
01358                 if (ci.simpleLowerCaseMapping < 0) ci.simpleLowerCaseMapping = cp;
01359                 if (ci.simpleUpperCaseMapping < 0) ci.simpleUpperCaseMapping = cp;
01360                 if (ci.simpleTitleCaseMapping < 0) ci.simpleTitleCaseMapping = cp;
01361                 if (ci.script < 0) ci.script = scriptUnknown;
01362         }
01363         IAssert(h.Len() == oldHLen);
01364 }
01365 
01366 void TUniChDb::SaveBin(const TStr& fnBinUcd)
01367 {
01368         PSOut SOut=TFOut::New(fnBinUcd);
01369         Save(*SOut);
01370 }
01371 
01372 void TUniChDb::InitAfterLoad()
01373 {
01374         scriptUnknown = GetScriptByName(GetScriptNameUnknown()); IAssert(scriptUnknown >= 0);
01375 }
01376 
01377 //-----------------------------------------------------------------------------
01378 // TUniChDb -- main test driver
01379 //-----------------------------------------------------------------------------
01380 
01381 void TUniChDb::Test(const TStr& basePath)
01382 {
01383         TStr fnBin = CombinePath(basePath, GetBinFn());
01384         if (true || ! TFile::Exists(fnBin))
01385         {
01386                 // Test LoadTxt.
01387                 LoadTxt(basePath);
01388                 // Test Save.
01389                 {PSOut SOut = TFOut::New(fnBin);
01390                 Save(*SOut);}
01391         }
01392         // Test Load.
01393         this->~TUniChDb();
01394         new(this) TUniChDb();
01395         {PSIn SIn = TFIn::New(fnBin);
01396         Load(*SIn);}
01397         // Test the case folding.
01398         caseFolding.Test();
01399         // Test the word breaking.
01400         TestWbFindNonIgnored();
01401         // Test the sentence breaking.
01402         TestFindNextWordOrSentenceBoundary(basePath, true);
01403         TestFindNextWordOrSentenceBoundary(basePath, false);
01404         // Test composition and decomposition.
01405         TestComposition(basePath);
01406         // Test the case conversions.
01407         TestCaseConversions();
01408 }
01409 
01410 //-----------------------------------------------------------------------------
01411 // T8BitCodec -- a class for converting between 8-bit encodings and Unicode
01412 //-----------------------------------------------------------------------------
01413 
01414 //-----------------------------------------------------------------------------
01415 // ISO-8859-2
01416 //-----------------------------------------------------------------------------
01417 
01418 const int TEncoding_ISO8859_2::toUnicodeTable[6 * 16] =
01419 {
01420         /* 0xa0 */ 0x00a0, 0x0104, 0x02d8, 0x0141, 0x00a4, 0x013d, 0x015a, 0x00a7, 0x00a8, 0x0160, 0x015e, 0x0164, 0x0179, 0x00ad, 0x017d, 0x017b,
01421         /* 0xb0 */ 0x00b0, 0x0105, 0x02db, 0x0142, 0x00b4, 0x013e, 0x015b, 0x02c7, 0x00b8, 0x0161, 0x015f, 0x0165, 0x017a, 0x02dd, 0x017e, 0x017c,
01422         /* 0xc0 */ 0x0154, 0x00c1, 0x00c2, 0x0102, 0x00c4, 0x0139, 0x0106, 0x00c7, 0x010c, 0x00c9, 0x0118, 0x00cb, 0x011a, 0x00cd, 0x00ce, 0x010e,
01423         /* 0xd0 */ 0x0110, 0x0143, 0x0147, 0x00d3, 0x00d4, 0x0150, 0x00d6, 0x00d7, 0x0158, 0x016e, 0x00da, 0x0170, 0x00dc, 0x00dd, 0x0162, 0x00df,
01424         /* 0xe0 */ 0x0155, 0x00e1, 0x00e2, 0x0103, 0x00e4, 0x013a, 0x0107, 0x00e7, 0x010d, 0x00e9, 0x0119, 0x00eb, 0x011b, 0x00ed, 0x00ee, 0x010f,
01425         /* 0xf0 */ 0x0111, 0x0144, 0x0148, 0x00f3, 0x00f4, 0x0151, 0x00f6, 0x00f7, 0x0159, 0x016f, 0x00fa, 0x0171, 0x00fc, 0x00fd, 0x0163, 0x02d9
01426 };
01427 
01428 const int TEncoding_ISO8859_2::fromUnicodeTable1[14 * 16] = {
01429         /* U+00a0 */ 0x00a0,     -1,     -1,     -1, 0x00a4,     -1,     -1, 0x00a7, 0x00a8,     -1,     -1,     -1,     -1, 0x00ad,     -1,     -1,
01430         /* U+00b0 */ 0x00b0,     -1,     -1,     -1, 0x00b4,     -1,     -1,     -1, 0x00b8,     -1,     -1,     -1,     -1,     -1,     -1,     -1,
01431         /* U+00c0 */     -1, 0x00c1, 0x00c2,     -1, 0x00c4,     -1,     -1, 0x00c7,     -1, 0x00c9,     -1, 0x00cb,     -1, 0x00cd, 0x00ce,     -1,
01432         /* U+00d0 */     -1,     -1,     -1, 0x00d3, 0x00d4,     -1, 0x00d6, 0x00d7,     -1,     -1, 0x00da,     -1, 0x00dc, 0x00dd,     -1, 0x00df,
01433         /* U+00e0 */     -1, 0x00e1, 0x00e2,     -1, 0x00e4,     -1,     -1, 0x00e7,     -1, 0x00e9,     -1, 0x00eb,     -1, 0x00ed, 0x00ee,     -1,
01434         /* U+00f0 */     -1,     -1,     -1, 0x00f3, 0x00f4,     -1, 0x00f6, 0x00f7,     -1,     -1, 0x00fa,     -1, 0x00fc, 0x00fd,     -1,     -1,
01435         /* U+0100 */     -1,     -1, 0x00c3, 0x00e3, 0x00a1, 0x00b1, 0x00c6, 0x00e6,     -1,     -1,     -1,     -1, 0x00c8, 0x00e8, 0x00cf, 0x00ef,
01436         /* U+0110 */ 0x00d0, 0x00f0,     -1,     -1,     -1,     -1,     -1,     -1, 0x00ca, 0x00ea, 0x00cc, 0x00ec,     -1,     -1,     -1,     -1,
01437         /* U+0120 */     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1 /* blank */,
01438         /* U+0130 */     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1, 0x00c5, 0x00e5,     -1,     -1, 0x00a5, 0x00b5,     -1,
01439         /* U+0140 */     -1, 0x00a3, 0x00b3, 0x00d1, 0x00f1,     -1,     -1, 0x00d2, 0x00f2,     -1,     -1,     -1,     -1,     -1,     -1,     -1,
01440         /* U+0150 */ 0x00d5, 0x00f5,     -1,     -1, 0x00c0, 0x00e0,     -1,     -1, 0x00d8, 0x00f8, 0x00a6, 0x00b6,     -1,     -1, 0x00aa, 0x00ba,
01441         /* U+0160 */ 0x00a9, 0x00b9, 0x00de, 0x00fe, 0x00ab, 0x00bb,     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1, 0x00d9, 0x00f9,
01442         /* U+0170 */ 0x00db, 0x00fb,     -1,     -1,     -1,     -1,     -1,     -1,     -1, 0x00ac, 0x00bc, 0x00af, 0x00bf, 0x00ae, 0x00be,     -1
01443 };
01444 
01445 const int TEncoding_ISO8859_2::fromUnicodeTable2[2 * 16] = { // Unicode -> ISO-8859-2 byte for U+02c0..U+02df, 16 code points per row; -1 presumably marks "no mapping" (confirm in the codec)
01446         /* U+02c0 */     -1,     -1,     -1,     -1,     -1,     -1,     -1, 0x00b7,     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1,
01447         /* U+02d0 */     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1, 0x00a2, 0x00ff,     -1, 0x00b2,     -1, 0x00bd,     -1,     -1
01448 };
01449 
01450 //-----------------------------------------------------------------------------
01451 // ISO-8859-3
01452 //-----------------------------------------------------------------------------
01453 
01454 const int TEncoding_ISO8859_3::toUnicodeTable[6 * 16] = { // Unicode code point for each ISO-8859-3 byte 0xa0..0xff (row comment = first byte of the row); -1 presumably marks a byte with no assigned character
01455         /* 0xa0 */ 0x00a0, 0x0126, 0x02d8, 0x00a3, 0x00a4,     -1, 0x0124, 0x00a7, 0x00a8, 0x0130, 0x015e, 0x011e, 0x0134, 0x00ad,     -1, 0x017b,
01456         /* 0xb0 */ 0x00b0, 0x0127, 0x00b2, 0x00b3, 0x00b4, 0x00b5, 0x0125, 0x00b7, 0x00b8, 0x0131, 0x015f, 0x011f, 0x0135, 0x00bd,     -1, 0x017c,
01457         /* 0xc0 */ 0x00c0, 0x00c1, 0x00c2,     -1, 0x00c4, 0x010a, 0x0108, 0x00c7, 0x00c8, 0x00c9, 0x00ca, 0x00cb, 0x00cc, 0x00cd, 0x00ce, 0x00cf,
01458         /* 0xd0 */     -1, 0x00d1, 0x00d2, 0x00d3, 0x00d4, 0x0120, 0x00d6, 0x00d7, 0x011c, 0x00d9, 0x00da, 0x00db, 0x00dc, 0x016c, 0x015c, 0x00df,
01459         /* 0xe0 */ 0x00e0, 0x00e1, 0x00e2,     -1, 0x00e4, 0x010b, 0x0109, 0x00e7, 0x00e8, 0x00e9, 0x00ea, 0x00eb, 0x00ec, 0x00ed, 0x00ee, 0x00ef,
01460         /* 0xf0 */     -1, 0x00f1, 0x00f2, 0x00f3, 0x00f4, 0x0121, 0x00f6, 0x00f7, 0x011d, 0x00f9, 0x00fa, 0x00fb, 0x00fc, 0x016d, 0x015d, 0x02d9
01461 };
01462 
01463 const int TEncoding_ISO8859_3::fromUnicodeTable1[14 * 16] = { // Unicode -> ISO-8859-3 byte for U+00a0..U+017f (row comment = first code point of the row); -1 presumably marks "no mapping"
01464         /* U+00a0 */ 0x00a0,     -1,     -1, 0x00a3, 0x00a4,     -1,     -1, 0x00a7, 0x00a8,     -1,     -1,     -1,     -1, 0x00ad,     -1,     -1,
01465         /* U+00b0 */ 0x00b0,     -1, 0x00b2, 0x00b3, 0x00b4, 0x00b5,     -1, 0x00b7, 0x00b8,     -1,     -1,     -1,     -1, 0x00bd,     -1,     -1,
01466         /* U+00c0 */ 0x00c0, 0x00c1, 0x00c2,     -1, 0x00c4,     -1,     -1, 0x00c7, 0x00c8, 0x00c9, 0x00ca, 0x00cb, 0x00cc, 0x00cd, 0x00ce, 0x00cf,
01467         /* U+00d0 */     -1, 0x00d1, 0x00d2, 0x00d3, 0x00d4,     -1, 0x00d6, 0x00d7,     -1, 0x00d9, 0x00da, 0x00db, 0x00dc,     -1,     -1, 0x00df,
01468         /* U+00e0 */ 0x00e0, 0x00e1, 0x00e2,     -1, 0x00e4,     -1,     -1, 0x00e7, 0x00e8, 0x00e9, 0x00ea, 0x00eb, 0x00ec, 0x00ed, 0x00ee, 0x00ef,
01469         /* U+00f0 */     -1, 0x00f1, 0x00f2, 0x00f3, 0x00f4,     -1, 0x00f6, 0x00f7,     -1, 0x00f9, 0x00fa, 0x00fb, 0x00fc,     -1,     -1,     -1,
01470         /* U+0100 */     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1, 0x00c6, 0x00e6, 0x00c5, 0x00e5,     -1,     -1,     -1,     -1,
01471         /* U+0110 */     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1, 0x00d8, 0x00f8, 0x00ab, 0x00bb,
01472         /* U+0120 */ 0x00d5, 0x00f5,     -1,     -1, 0x00a6, 0x00b6, 0x00a1, 0x00b1,     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1,
01473         /* U+0130 */ 0x00a9, 0x00b9,     -1,     -1, 0x00ac, 0x00bc,     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1,
01474         /* U+0140 */     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1 /* blank */,
01475         /* U+0150 */     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1, 0x00de, 0x00fe, 0x00aa, 0x00ba,
01476         /* U+0160 */     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1, 0x00dd, 0x00fd,     -1,     -1,
01477         /* U+0170 */     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1, 0x00af, 0x00bf,     -1,     -1,     -1,
01478 };
01479 const int TEncoding_ISO8859_3::fromUnicodeTable2[2] = { // Only two entries, starting at U+02d8: U+02d8 -> 0xa2 and (by position) U+02d9 -> 0xff, the inverse of toUnicodeTable's 0xa2 and 0xff slots
01480         /* U+02d8 */ 0x00a2, 0x00ff
01481 };
01482 
01483 //-----------------------------------------------------------------------------
01484 // ISO-8859-4
01485 //-----------------------------------------------------------------------------
01486 
01487 const int TEncoding_ISO8859_4::toUnicodeTable[6 * 16] = { // Unicode code point for each ISO-8859-4 byte 0xa0..0xff (row comment = first byte of the row); every slot in this table is assigned
01488         /* 0xa0 */ 0x00a0, 0x0104, 0x0138, 0x0156, 0x00a4, 0x0128, 0x013b, 0x00a7, 0x00a8, 0x0160, 0x0112, 0x0122, 0x0166, 0x00ad, 0x017d, 0x00af,
01489         /* 0xb0 */ 0x00b0, 0x0105, 0x02db, 0x0157, 0x00b4, 0x0129, 0x013c, 0x02c7, 0x00b8, 0x0161, 0x0113, 0x0123, 0x0167, 0x014a, 0x017e, 0x014b,
01490         /* 0xc0 */ 0x0100, 0x00c1, 0x00c2, 0x00c3, 0x00c4, 0x00c5, 0x00c6, 0x012e, 0x010c, 0x00c9, 0x0118, 0x00cb, 0x0116, 0x00cd, 0x00ce, 0x012a,
01491         /* 0xd0 */ 0x0110, 0x0145, 0x014c, 0x0136, 0x00d4, 0x00d5, 0x00d6, 0x00d7, 0x00d8, 0x0172, 0x00da, 0x00db, 0x00dc, 0x0168, 0x016a, 0x00df,
01492         /* 0xe0 */ 0x0101, 0x00e1, 0x00e2, 0x00e3, 0x00e4, 0x00e5, 0x00e6, 0x012f, 0x010d, 0x00e9, 0x0119, 0x00eb, 0x0117, 0x00ed, 0x00ee, 0x012b,
01493         /* 0xf0 */ 0x0111, 0x0146, 0x014d, 0x0137, 0x00f4, 0x00f5, 0x00f6, 0x00f7, 0x00f8, 0x0173, 0x00fa, 0x00fb, 0x00fc, 0x0169, 0x016b, 0x02d9
01494 };
01495 
01496 const int TEncoding_ISO8859_4::fromUnicodeTable1[14 * 16] = { // Unicode -> ISO-8859-4 byte for U+00a0..U+017f (row comment = first code point of the row); -1 presumably marks "no mapping"
01497         /* U+00a0 */ 0x00a0,     -1,     -1,     -1, 0x00a4,     -1,     -1, 0x00a7, 0x00a8,     -1,     -1,     -1,     -1, 0x00ad,     -1, 0x00af,
01498         /* U+00b0 */ 0x00b0,     -1,     -1,     -1, 0x00b4,     -1,     -1,     -1, 0x00b8,     -1,     -1,     -1,     -1,     -1,     -1,     -1,
01499         /* U+00c0 */     -1, 0x00c1, 0x00c2, 0x00c3, 0x00c4, 0x00c5, 0x00c6,     -1,     -1, 0x00c9,     -1, 0x00cb,     -1, 0x00cd, 0x00ce,     -1,
01500         /* U+00d0 */     -1,     -1,     -1,     -1, 0x00d4, 0x00d5, 0x00d6, 0x00d7, 0x00d8,     -1, 0x00da, 0x00db, 0x00dc,     -1,     -1, 0x00df,
01501         /* U+00e0 */     -1, 0x00e1, 0x00e2, 0x00e3, 0x00e4, 0x00e5, 0x00e6,     -1,     -1, 0x00e9,     -1, 0x00eb,     -1, 0x00ed, 0x00ee,     -1,
01502         /* U+00f0 */     -1,     -1,     -1,     -1, 0x00f4, 0x00f5, 0x00f6, 0x00f7, 0x00f8,     -1, 0x00fa, 0x00fb, 0x00fc,     -1,     -1,     -1,
01503         /* U+0100 */ 0x00c0, 0x00e0,     -1,     -1, 0x00a1, 0x00b1,     -1,     -1,     -1,     -1,     -1,     -1, 0x00c8, 0x00e8,     -1,     -1,
01504         /* U+0110 */ 0x00d0, 0x00f0, 0x00aa, 0x00ba,     -1,     -1, 0x00cc, 0x00ec, 0x00ca, 0x00ea,     -1,     -1,     -1,     -1,     -1,     -1,
01505         /* U+0120 */     -1,     -1, 0x00ab, 0x00bb,     -1,     -1,     -1,     -1, 0x00a5, 0x00b5, 0x00cf, 0x00ef,     -1,     -1, 0x00c7, 0x00e7,
01506         /* U+0130 */     -1,     -1,     -1,     -1,     -1,     -1, 0x00d3, 0x00f3, 0x00a2,     -1,     -1, 0x00a6, 0x00b6,     -1,     -1,     -1,
01507         /* U+0140 */     -1,     -1,     -1,     -1,     -1, 0x00d1, 0x00f1,     -1,     -1,     -1, 0x00bd, 0x00bf, 0x00d2, 0x00f2,     -1,     -1,
01508         /* U+0150 */     -1,     -1,     -1,     -1,     -1,     -1, 0x00a3, 0x00b3,     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1,
01509         /* U+0160 */ 0x00a9, 0x00b9,     -1,     -1,     -1,     -1, 0x00ac, 0x00bc, 0x00dd, 0x00fd, 0x00de, 0x00fe,     -1,     -1,     -1,     -1,
01510         /* U+0170 */     -1,     -1, 0x00d9, 0x00f9,     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1, 0x00ae, 0x00be,     -1,
01511 };
01512 
01513 const int TEncoding_ISO8859_4::fromUnicodeTable2[2 * 16] = { // Unicode -> ISO-8859-4 byte for U+02c0..U+02df, 16 code points per row; -1 presumably marks "no mapping"
01514         /* U+02c0 */     -1,     -1,     -1,     -1,     -1,     -1,     -1, 0x00b7,     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1,
01515         /* U+02d0 */     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1, 0x00ff,     -1, 0x00b2,     -1,     -1,     -1,     -1
01516 };
01517 
01518 //-----------------------------------------------------------------------------
01519 // CP 437
01520 //-----------------------------------------------------------------------------
01521 
01522 const int TEncoding_CP437::toUnicodeTable[8 * 16] = { // Unicode code point for each CP437 byte 0x80..0xff (row comment = first byte of the row); every slot in this table is assigned
01523         /* 0x80 */ 0x00c7, 0x00fc, 0x00e9, 0x00e2, 0x00e4, 0x00e0, 0x00e5, 0x00e7, 0x00ea, 0x00eb, 0x00e8, 0x00ef, 0x00ee, 0x00ec, 0x00c4, 0x00c5,
01524         /* 0x90 */ 0x00c9, 0x00e6, 0x00c6, 0x00f4, 0x00f6, 0x00f2, 0x00fb, 0x00f9, 0x00ff, 0x00d6, 0x00dc, 0x00a2, 0x00a3, 0x00a5, 0x20a7, 0x0192,
01525         /* 0xa0 */ 0x00e1, 0x00ed, 0x00f3, 0x00fa, 0x00f1, 0x00d1, 0x00aa, 0x00ba, 0x00bf, 0x2310, 0x00ac, 0x00bd, 0x00bc, 0x00a1, 0x00ab, 0x00bb,
01526         /* 0xb0 */ 0x2591, 0x2592, 0x2593, 0x2502, 0x2524, 0x2561, 0x2562, 0x2556, 0x2555, 0x2563, 0x2551, 0x2557, 0x255d, 0x255c, 0x255b, 0x2510,
01527         /* 0xc0 */ 0x2514, 0x2534, 0x252c, 0x251c, 0x2500, 0x253c, 0x255e, 0x255f, 0x255a, 0x2554, 0x2569, 0x2566, 0x2560, 0x2550, 0x256c, 0x2567,
01528         /* 0xd0 */ 0x2568, 0x2564, 0x2565, 0x2559, 0x2558, 0x2552, 0x2553, 0x256b, 0x256a, 0x2518, 0x250c, 0x2588, 0x2584, 0x258c, 0x2590, 0x2580,
01529         /* 0xe0 */ 0x03b1, 0x00df, 0x0393, 0x03c0, 0x03a3, 0x03c3, 0x00b5, 0x03c4, 0x03a6, 0x0398, 0x03a9, 0x03b4, 0x221e, 0x03c6, 0x03b5, 0x2229,
01530         /* 0xf0 */ 0x2261, 0x00b1, 0x2265, 0x2264, 0x2320, 0x2321, 0x00f7, 0x2248, 0x00b0, 0x2219, 0x00b7, 0x221a, 0x207f, 0x00b2, 0x25a0, 0x00a0
01531 };
01532 
01533 const int TEncoding_CP437::fromUnicodeTable1[6 * 16] = { // Unicode -> CP437 byte for U+00a0..U+00ff (row comment = first code point of the row); -1 presumably marks "no mapping"
01534         /* U+00a0 */ 0x00ff, 0x00ad, 0x009b, 0x009c,     -1, 0x009d,     -1,     -1,     -1,     -1, 0x00a6, 0x00ae, 0x00aa,     -1,     -1,     -1,
01535         /* U+00b0 */ 0x00f8, 0x00f1, 0x00fd,     -1,     -1, 0x00e6,     -1, 0x00fa,     -1,     -1, 0x00a7, 0x00af, 0x00ac, 0x00ab,     -1, 0x00a8,
01536         /* U+00c0 */     -1,     -1,     -1,     -1, 0x008e, 0x008f, 0x0092, 0x0080,     -1, 0x0090,     -1,     -1,     -1,     -1,     -1,     -1,
01537         /* U+00d0 */     -1, 0x00a5,     -1,     -1,     -1,     -1, 0x0099,     -1,     -1,     -1,     -1,     -1, 0x009a,     -1,     -1, 0x00e1,
01538         /* U+00e0 */ 0x0085, 0x00a0, 0x0083,     -1, 0x0084, 0x0086, 0x0091, 0x0087, 0x008a, 0x0082, 0x0088, 0x0089, 0x008d, 0x00a1, 0x008c, 0x008b,
01539         /* U+00f0 */     -1, 0x00a4, 0x0095, 0x00a2, 0x0093,     -1, 0x0094, 0x00f6,     -1, 0x0097, 0x00a3, 0x0096, 0x0081,     -1,     -1, 0x0098,
01540 };
01541 
01542 const int TEncoding_CP437::fromUnicodeTable2[4 * 16] = { // Unicode -> CP437 byte for U+0390..U+03cf (the Greek letters that appear in toUnicodeTable's 0xe0 row); -1 presumably marks "no mapping"
01543         /* U+0390 */     -1,     -1,     -1, 0x00e2,     -1,     -1,     -1,     -1, 0x00e9,     -1,     -1,     -1,     -1,     -1,     -1,     -1,
01544         /* U+03a0 */     -1,     -1,     -1, 0x00e4,     -1,     -1, 0x00e8,     -1,     -1, 0x00ea,     -1,     -1,     -1,     -1,     -1,     -1,
01545         /* U+03b0 */     -1, 0x00e0,     -1,     -1, 0x00eb, 0x00ee,     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1,
01546         /* U+03c0 */ 0x00e3,     -1,     -1, 0x00e5, 0x00e7,     -1, 0x00ed,     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1,
01547 };
01548 
01549 const int TEncoding_CP437::fromUnicodeTable3[6 * 16] = { // Unicode -> CP437 byte for U+2210..U+226f; rows tagged /* blank */ contain no mapping at all; -1 presumably marks "no mapping"
01550         /* U+2210 */     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1, 0x00f9, 0x00fb,     -1,     -1,     -1, 0x00ec,     -1,
01551         /* U+2220 */     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1, 0x00ef,     -1,     -1,     -1,     -1,     -1,     -1,
01552         /* U+2230 */     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1 /* blank */,
01553         /* U+2240 */     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1, 0x00f7,     -1,     -1,     -1,     -1,     -1,     -1,     -1,
01554         /* U+2250 */     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1 /* blank */,
01555         /* U+2260 */     -1, 0x00f0,     -1,     -1, 0x00f3, 0x00f2,     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1,
01556 };
01557 
01558 const int TEncoding_CP437::fromUnicodeTable4[11 * 16] = { // Unicode -> CP437 byte for U+2500..U+25af; rows tagged /* blank */ contain no mapping at all; -1 presumably marks "no mapping"
01559         /* U+2500 */ 0x00c4,     -1, 0x00b3,     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1, 0x00da,     -1,     -1,     -1,
01560         /* U+2510 */ 0x00bf,     -1,     -1,     -1, 0x00c0,     -1,     -1,     -1, 0x00d9,     -1,     -1,     -1, 0x00c3,     -1,     -1,     -1,
01561         /* U+2520 */     -1,     -1,     -1,     -1, 0x00b4,     -1,     -1,     -1,     -1,     -1,     -1,     -1, 0x00c2,     -1,     -1,     -1,
01562         /* U+2530 */     -1,     -1,     -1,     -1, 0x00c1,     -1,     -1,     -1,     -1,     -1,     -1,     -1, 0x00c5,     -1,     -1,     -1,
01563         /* U+2540 */     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1 /* blank */,
01564         /* U+2550 */ 0x00cd, 0x00ba, 0x00d5, 0x00d6, 0x00c9, 0x00b8, 0x00b7, 0x00bb, 0x00d4, 0x00d3, 0x00c8, 0x00be, 0x00bd, 0x00bc, 0x00c6, 0x00c7,
01565         /* U+2560 */ 0x00cc, 0x00b5, 0x00b6, 0x00b9, 0x00d1, 0x00d2, 0x00cb, 0x00cf, 0x00d0, 0x00ca, 0x00d8, 0x00d7, 0x00ce,     -1,     -1,     -1,
01566         /* U+2570 */     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1 /* blank */,
01567         /* U+2580 */ 0x00df,     -1,     -1,     -1, 0x00dc,     -1,     -1,     -1, 0x00db,     -1,     -1,     -1, 0x00dd,     -1,     -1,     -1,
01568         /* U+2590 */ 0x00de, 0x00b0, 0x00b1, 0x00b2,     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1,
01569         /* U+25a0 */ 0x00fe,     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1
01570 }; // NOTE(review): the commented-out rows that follow list CP437 mappings outside tables 1-4 (U+0192, U+207f, U+20a7, U+2310, U+2320, U+2321); presumably handled individually elsewhere - confirm
01571 //      /* U+0190 */     -1,     -1, 0x009f,     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1,
01572 //      /* U+2070 */     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1, 0x00fc,
01573 //      /* U+20a0 */     -1,     -1,     -1,     -1,     -1,     -1,     -1, 0x009e,     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1,
01574 //      /* U+2310 */ 0x00a9,     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1,
01575 //      /* U+2320 */ 0x00f4, 0x00f5,     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1,
01576 
01577 //-----------------------------------------------------------------------------
01578 // CP 852
01579 //-----------------------------------------------------------------------------
01580 
01581 const int TEncoding_CP852::toUnicodeTable[8 * 16] = { // Unicode code point for each CP852 byte 0x80..0xff (row comment = first byte of the row); every slot in this table is assigned
01582         /* 0x80 */ 0x00c7, 0x00fc, 0x00e9, 0x00e2, 0x00e4, 0x016f, 0x0107, 0x00e7, 0x0142, 0x00eb, 0x0150, 0x0151, 0x00ee, 0x0179, 0x00c4, 0x0106,
01583         /* 0x90 */ 0x00c9, 0x0139, 0x013a, 0x00f4, 0x00f6, 0x013d, 0x013e, 0x015a, 0x015b, 0x00d6, 0x00dc, 0x0164, 0x0165, 0x0141, 0x00d7, 0x010d,
01584         /* 0xa0 */ 0x00e1, 0x00ed, 0x00f3, 0x00fa, 0x0104, 0x0105, 0x017d, 0x017e, 0x0118, 0x0119, 0x00ac, 0x017a, 0x010c, 0x015f, 0x00ab, 0x00bb,
01585         /* 0xb0 */ 0x2591, 0x2592, 0x2593, 0x2502, 0x2524, 0x00c1, 0x00c2, 0x011a, 0x015e, 0x2563, 0x2551, 0x2557, 0x255d, 0x017b, 0x017c, 0x2510,
01586         /* 0xc0 */ 0x2514, 0x2534, 0x252c, 0x251c, 0x2500, 0x253c, 0x0102, 0x0103, 0x255a, 0x2554, 0x2569, 0x2566, 0x2560, 0x2550, 0x256c, 0x00a4,
01587         /* 0xd0 */ 0x0111, 0x0110, 0x010e, 0x00cb, 0x010f, 0x0147, 0x00cd, 0x00ce, 0x011b, 0x2518, 0x250c, 0x2588, 0x2584, 0x0162, 0x016e, 0x2580,
01588         /* 0xe0 */ 0x00d3, 0x00df, 0x00d4, 0x0143, 0x0144, 0x0148, 0x0160, 0x0161, 0x0154, 0x00da, 0x0155, 0x0170, 0x00fd, 0x00dd, 0x0163, 0x00b4,
01589         /* 0xf0 */ 0x00ad, 0x02dd, 0x02db, 0x02c7, 0x02d8, 0x00a7, 0x00f7, 0x00b8, 0x00b0, 0x00a8, 0x02d9, 0x0171, 0x0158, 0x0159, 0x25a0, 0x00a0
01590 };
01591 
01592 const int TEncoding_CP852::fromUnicodeTable1[14 * 16] = { // Unicode -> CP852 byte for U+00a0..U+017f (row comment = first code point of the row); -1 presumably marks "no mapping"
01593         /* U+00a0 */ 0x00ff,     -1,     -1,     -1, 0x00cf,     -1,     -1, 0x00f5, 0x00f9,     -1,     -1, 0x00ae, 0x00aa, 0x00f0,     -1,     -1,
01594         /* U+00b0 */ 0x00f8,     -1,     -1,     -1, 0x00ef,     -1,     -1,     -1, 0x00f7,     -1,     -1, 0x00af,     -1,     -1,     -1,     -1,
01595         /* U+00c0 */     -1, 0x00b5, 0x00b6,     -1, 0x008e,     -1,     -1, 0x0080,     -1, 0x0090,     -1, 0x00d3,     -1, 0x00d6, 0x00d7,     -1,
01596         /* U+00d0 */     -1,     -1,     -1, 0x00e0, 0x00e2,     -1, 0x0099, 0x009e,     -1,     -1, 0x00e9,     -1, 0x009a, 0x00ed,     -1, 0x00e1,
01597         /* U+00e0 */     -1, 0x00a0, 0x0083,     -1, 0x0084,     -1,     -1, 0x0087,     -1, 0x0082,     -1, 0x0089,     -1, 0x00a1, 0x008c,     -1,
01598         /* U+00f0 */     -1,     -1,     -1, 0x00a2, 0x0093,     -1, 0x0094, 0x00f6,     -1,     -1, 0x00a3,     -1, 0x0081, 0x00ec,     -1,     -1,
01599         /* U+0100 */     -1,     -1, 0x00c6, 0x00c7, 0x00a4, 0x00a5, 0x008f, 0x0086,     -1,     -1,     -1,     -1, 0x00ac, 0x009f, 0x00d2, 0x00d4,
01600         /* U+0110 */ 0x00d1, 0x00d0,     -1,     -1,     -1,     -1,     -1,     -1, 0x00a8, 0x00a9, 0x00b7, 0x00d8,     -1,     -1,     -1,     -1,
01601         /* U+0120 */     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1 /* blank */,
01602         /* U+0130 */     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1, 0x0091, 0x0092,     -1,     -1, 0x0095, 0x0096,     -1,
01603         /* U+0140 */     -1, 0x009d, 0x0088, 0x00e3, 0x00e4,     -1,     -1, 0x00d5, 0x00e5,     -1,     -1,     -1,     -1,     -1,     -1,     -1,
01604         /* U+0150 */ 0x008a, 0x008b,     -1,     -1, 0x00e8, 0x00ea,     -1,     -1, 0x00fc, 0x00fd, 0x0097, 0x0098,     -1,     -1, 0x00b8, 0x00ad,
01605         /* U+0160 */ 0x00e6, 0x00e7, 0x00dd, 0x00ee, 0x009b, 0x009c,     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1, 0x00de, 0x0085,
01606         /* U+0170 */ 0x00eb, 0x00fb,     -1,     -1,     -1,     -1,     -1,     -1,     -1, 0x008d, 0x00ab, 0x00bd, 0x00be, 0x00a6, 0x00a7,     -1
01607 };
01608 
01609 const int TEncoding_CP852::fromUnicodeTable2[2* 16] = { // Unicode -> CP852 byte for U+02c0..U+02df, 16 code points per row; -1 presumably marks "no mapping"
01610         /* U+02c0 */     -1,     -1,     -1,     -1,     -1,     -1,     -1, 0x00f3,     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1,
01611         /* U+02d0 */     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1, 0x00f4, 0x00fa,     -1, 0x00f2,     -1, 0x00f1,     -1,     -1
01612 };
01613 
01614 const int TEncoding_CP852::fromUnicodeTable3[11 * 16] = { // Unicode -> CP852 byte for U+2500..U+25af; rows tagged /* blank */ contain no mapping at all; -1 presumably marks "no mapping"
01615         /* U+2500 */ 0x00c4,     -1, 0x00b3,     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1, 0x00da,     -1,     -1,     -1,
01616         /* U+2510 */ 0x00bf,     -1,     -1,     -1, 0x00c0,     -1,     -1,     -1, 0x00d9,     -1,     -1,     -1, 0x00c3,     -1,     -1,     -1,
01617         /* U+2520 */     -1,     -1,     -1,     -1, 0x00b4,     -1,     -1,     -1,     -1,     -1,     -1,     -1, 0x00c2,     -1,     -1,     -1,
01618         /* U+2530 */     -1,     -1,     -1,     -1, 0x00c1,     -1,     -1,     -1,     -1,     -1,     -1,     -1, 0x00c5,     -1,     -1,     -1,
01619         /* U+2540 */     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1 /* blank */,
01620         /* U+2550 */ 0x00cd, 0x00ba,     -1,     -1, 0x00c9,     -1,     -1, 0x00bb,     -1,     -1, 0x00c8,     -1,     -1, 0x00bc,     -1,     -1,
01621         /* U+2560 */ 0x00cc,     -1,     -1, 0x00b9,     -1,     -1, 0x00cb,     -1,     -1, 0x00ca,     -1,     -1, 0x00ce,     -1,     -1,     -1,
01622         /* U+2570 */     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1 /* blank */,
01623         /* U+2580 */ 0x00df,     -1,     -1,     -1, 0x00dc,     -1,     -1,     -1, 0x00db,     -1,     -1,     -1,     -1,     -1,     -1,     -1,
01624         /* U+2590 */     -1, 0x00b0, 0x00b1, 0x00b2,     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1,
01625         /* U+25a0 */ 0x00fe,     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1
01626 };
01627 
01628 //-----------------------------------------------------------------------------
01629 // Windows-1250
01630 //-----------------------------------------------------------------------------
01631 
01632 const int TEncoding_CP1250::toUnicodeTable[8 * 16] = { // Unicode code point for each Windows-1250 byte 0x80..0xff (row comment = first byte of the row); -1 presumably marks a byte with no assigned character
01633         /* 0x80 */ 0x20ac,     -1, 0x201a,     -1, 0x201e, 0x2026, 0x2020, 0x2021,     -1, 0x2030, 0x0160, 0x2039, 0x015a, 0x0164, 0x017d, 0x0179,
01634         /* 0x90 */     -1, 0x2018, 0x2019, 0x201c, 0x201d, 0x2022, 0x2013, 0x2014,     -1, 0x2122, 0x0161, 0x203a, 0x015b, 0x0165, 0x017e, 0x017a,
01635         /* 0xa0 */ 0x00a0, 0x02c7, 0x02d8, 0x0141, 0x00a4, 0x0104, 0x00a6, 0x00a7, 0x00a8, 0x00a9, 0x015e, 0x00ab, 0x00ac, 0x00ad, 0x00ae, 0x017b,
01636         /* 0xb0 */ 0x00b0, 0x00b1, 0x02db, 0x0142, 0x00b4, 0x00b5, 0x00b6, 0x00b7, 0x00b8, 0x0105, 0x015f, 0x00bb, 0x013d, 0x02dd, 0x013e, 0x017c,
01637         /* 0xc0 */ 0x0154, 0x00c1, 0x00c2, 0x0102, 0x00c4, 0x0139, 0x0106, 0x00c7, 0x010c, 0x00c9, 0x0118, 0x00cb, 0x011a, 0x00cd, 0x00ce, 0x010e,
01638         /* 0xd0 */ 0x0110, 0x0143, 0x0147, 0x00d3, 0x00d4, 0x0150, 0x00d6, 0x00d7, 0x0158, 0x016e, 0x00da, 0x0170, 0x00dc, 0x00dd, 0x0162, 0x00df,
01639         /* 0xe0 */ 0x0155, 0x00e1, 0x00e2, 0x0103, 0x00e4, 0x013a, 0x0107, 0x00e7, 0x010d, 0x00e9, 0x0119, 0x00eb, 0x011b, 0x00ed, 0x00ee, 0x010f,
01640         /* 0xf0 */ 0x0111, 0x0144, 0x0148, 0x00f3, 0x00f4, 0x0151, 0x00f6, 0x00f7, 0x0159, 0x016f, 0x00fa, 0x0171, 0x00fc, 0x00fd, 0x0163, 0x02d9
01641 };
01642 
01643 const int TEncoding_CP1250::fromUnicodeTable1[14 * 16] = { // Unicode -> Windows-1250 byte for U+00a0..U+017f (row comment = first code point of the row); -1 presumably marks "no mapping"
01644         /* U+00a0 */ 0x00a0,     -1,     -1,     -1, 0x00a4,     -1, 0x00a6, 0x00a7, 0x00a8, 0x00a9,     -1, 0x00ab, 0x00ac, 0x00ad, 0x00ae,     -1,
01645         /* U+00b0 */ 0x00b0, 0x00b1,     -1,     -1, 0x00b4, 0x00b5, 0x00b6, 0x00b7, 0x00b8,     -1,     -1, 0x00bb,     -1,     -1,     -1,     -1,
01646         /* U+00c0 */     -1, 0x00c1, 0x00c2,     -1, 0x00c4,     -1,     -1, 0x00c7,     -1, 0x00c9,     -1, 0x00cb,     -1, 0x00cd, 0x00ce,     -1,
01647         /* U+00d0 */     -1,     -1,     -1, 0x00d3, 0x00d4,     -1, 0x00d6, 0x00d7,     -1,     -1, 0x00da,     -1, 0x00dc, 0x00dd,     -1, 0x00df,
01648         /* U+00e0 */     -1, 0x00e1, 0x00e2,     -1, 0x00e4,     -1,     -1, 0x00e7,     -1, 0x00e9,     -1, 0x00eb,     -1, 0x00ed, 0x00ee,     -1,
01649         /* U+00f0 */     -1,     -1,     -1, 0x00f3, 0x00f4,     -1, 0x00f6, 0x00f7,     -1,     -1, 0x00fa,     -1, 0x00fc, 0x00fd,     -1,     -1,
01650         /* U+0100 */     -1,     -1, 0x00c3, 0x00e3, 0x00a5, 0x00b9, 0x00c6, 0x00e6,     -1,     -1,     -1,     -1, 0x00c8, 0x00e8, 0x00cf, 0x00ef,
01651         /* U+0110 */ 0x00d0, 0x00f0,     -1,     -1,     -1,     -1,     -1,     -1, 0x00ca, 0x00ea, 0x00cc, 0x00ec,     -1,     -1,     -1,     -1,
01652         /* U+0120 */     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1 /* blank */,
01653         /* U+0130 */     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1, 0x00c5, 0x00e5,     -1,     -1, 0x00bc, 0x00be,     -1,
01654         /* U+0140 */     -1, 0x00a3, 0x00b3, 0x00d1, 0x00f1,     -1,     -1, 0x00d2, 0x00f2,     -1,     -1,     -1,     -1,     -1,     -1,     -1,
01655         /* U+0150 */ 0x00d5, 0x00f5,     -1,     -1, 0x00c0, 0x00e0,     -1,     -1, 0x00d8, 0x00f8, 0x008c, 0x009c,     -1,     -1, 0x00aa, 0x00ba,
01656         /* U+0160 */ 0x008a, 0x009a, 0x00de, 0x00fe, 0x008d, 0x009d,     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1, 0x00d9, 0x00f9,
01657         /* U+0170 */ 0x00db, 0x00fb,     -1,     -1,     -1,     -1,     -1,     -1,     -1, 0x008f, 0x009f, 0x00af, 0x00bf, 0x008e, 0x009e,     -1,
01658 };
01659 
01660 const int TEncoding_CP1250::fromUnicodeTable2[2 * 16] = { // Unicode -> Windows-1250 byte for U+02c0..U+02df, 16 code points per row; -1 presumably marks "no mapping"
01661         /* U+02c0 */     -1,     -1,     -1,     -1,     -1,     -1,     -1, 0x00a1,     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1,
01662         /* U+02d0 */     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1, 0x00a2, 0x00ff,     -1, 0x00b2,     -1, 0x00bd,     -1,     -1,
01663 };
01664 
01665 const int TEncoding_CP1250::fromUnicodeTable3[3 * 16] = { // Unicode -> Windows-1250 byte for U+2010..U+203f (the punctuation found in toUnicodeTable's 0x80 and 0x90 rows); -1 presumably marks "no mapping"
01666         /* U+2010 */     -1,     -1,     -1, 0x0096, 0x0097,     -1,     -1,     -1, 0x0091, 0x0092, 0x0082,     -1, 0x0093, 0x0094, 0x0084,     -1,
01667         /* U+2020 */ 0x0086, 0x0087, 0x0095,     -1,     -1,     -1, 0x0085,     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1,
01668         /* U+2030 */ 0x0089,     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1, 0x008b, 0x009b,     -1,     -1,     -1,     -1,     -1,
01669 }; // NOTE(review): the commented-out rows that follow list the two mappings outside tables 1-3 (U+20ac -> 0x80, U+2122 -> 0x99); presumably handled individually elsewhere - confirm
01670 //      /* U+20a0 */     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1, 0x0080,     -1,     -1,     -1,
01671 //      /* U+2120 */     -1,     -1, 0x0099,     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1
01672 
01673 //-----------------------------------------------------------------------------
01674 // YU-ASCII
01675 //-----------------------------------------------------------------------------
01676 
01677 //                                                C acute c acute C caron c caron S caron s caron Z caron z caron D stroke d stroke
01678 const int TEncoding_YuAscii::uniChars[10] =     {  0x106,  0x107,  0x10c,  0x10d,  0x160,  0x161,  0x17d,  0x17e,   0x110,  0x111  }; // Unicode code points of the ten letters named, column by column, in the comment line above
01679 const int TEncoding_YuAscii::yuAsciiChars[10] = {   0x5d,   0x7d,   0x5e,   0x7e,   0x5b,   0x7b,   0x40,   0x60,    0x5c,   0x7c  }; // ASCII codes in the same column order (characters shown in the comment line below); presumably yuAsciiChars[i] stands in for uniChars[i] - confirm in TEncoding_YuAscii
01680 //                                                   ']'     '}'     '^'    '~'     '['     '{'     '@'     '`'      '\\'    '|'
01681 
01682 
01683 //-----------------------------------------------------------------------------
01684 // TUnicode - codec registry
01685 //-----------------------------------------------------------------------------
01686 
01687 void TUnicode::InitCodecs()
01688 {
01689         ClrCodecs();
01690         RegisterCodec("ISO-8859-1 ISO_8859-1 ISO_8859-1:1987 ISO-IR-100 CP819 IBM819 LATIN1 L1 csISOLatin1 ISO8859-1 ISO8859_1 CP28591", TCodecBase::New<TCodec_ISO8859_1>());
01691         RegisterCodec("ISO-8859-2 ISO_8859-2 ISO_8859-2:1987 ISO-IR-101 LATIN2 L2 csISOLatin2 ISO8859-2 ISO8859_2 CP28592", TCodecBase::New<TCodec_ISO8859_2>());
01692         RegisterCodec("ISO-8859-3 ISO_8859-3 ISO_8859-3:1988 ISO-IR-109 LATIN3 L3 csISOLatin3 ISO8859-3 ISO8859_3 CP28593", TCodecBase::New<TCodec_ISO8859_3>());
01693         RegisterCodec("ISO-8859-4 ISO_8859-4 ISO_8859-4:1988 ISO-IR-110 LATIN4 L4 csISOLatin4 ISO8859-4 ISO8859_4 CP28594", TCodecBase::New<TCodec_ISO8859_4>());
01694         RegisterCodec("YUASCII YU-ASCII YU_ASCII", TCodecBase::New<TCodec_YuAscii>());
01695         RegisterCodec("CP1250 Windows-1250 MS-EE", TCodecBase::New<TCodec_CP1250>());
01696         RegisterCodec("CP852 cp852_DOSLatin2 DOSLatin2", TCodecBase::New<TCodec_CP852>());
01697         RegisterCodec("CP437 cp437_DOSLatinUS DOSLatinUS", TCodecBase::New<TCodec_CP437>());
01698 }
01699 
01700 void TUnicode::EncodeUtf8(const uint& c, TChA& dest) {
01701         if (c > 0x10ffff) {
01702                 throw TExcept::New(TStr::Fmt("Unkown Unicode character %u", c)); }
01703         if (c < 0x80u)
01704                 dest.AddCh(char(c & 0xffu));
01705         else if (c < 0x800u) {
01706                 dest.AddCh(char(TUniCodec::_1100_0000 | ((c >> 6) & TUniCodec::_0001_1111)));
01707                 dest.AddCh(char(TUniCodec::_1000_0000 | (c & TUniCodec::_0011_1111))); }
01708         else if (c < 0x10000u) {
01709                 dest.AddCh(char(TUniCodec::_1110_0000 | ((c >> 12) & TUniCodec::_0000_1111)));
01710                 dest.AddCh(char(TUniCodec::_1000_0000 | ((c >> 6) & TUniCodec::_0011_1111)));
01711                 dest.AddCh(char(TUniCodec::_1000_0000 | (c & TUniCodec::_0011_1111))); }
01712         else if (c < 0x200000u) {
01713                 dest.AddCh(char(TUniCodec::_1111_0000 | ((c >> 18) & TUniCodec::_0000_0111)));
01714                 dest.AddCh(char(TUniCodec::_1000_0000 | ((c >> 12) & TUniCodec::_0011_1111)));
01715                 dest.AddCh(char(TUniCodec::_1000_0000 | ((c >> 6) & TUniCodec::_0011_1111)));
01716                 dest.AddCh(char(TUniCodec::_1000_0000 | (c & TUniCodec::_0011_1111))); }
01717         else if (c < 0x4000000u) {
01718                 dest.AddCh(char(TUniCodec::_1111_1000 | ((c >> 24) & TUniCodec::_0000_0011)));
01719                 dest.AddCh(char(TUniCodec::_1000_0000 | ((c >> 18) & TUniCodec::_0011_1111)));
01720                 dest.AddCh(char(TUniCodec::_1000_0000 | ((c >> 12) & TUniCodec::_0011_1111)));
01721                 dest.AddCh(char(TUniCodec::_1000_0000 | ((c >> 6) & TUniCodec::_0011_1111)));
01722                 dest.AddCh(char(TUniCodec::_1000_0000 | (c & TUniCodec::_0011_1111))); }
01723         else {
01724                 dest.AddCh(char(TUniCodec::_1111_1100 | ((c >> 30) & TUniCodec::_0000_0011)));
01725                 dest.AddCh(char(TUniCodec::_1000_0000 | ((c >> 24) & TUniCodec::_0011_1111)));
01726                 dest.AddCh(char(TUniCodec::_1000_0000 | ((c >> 18) & TUniCodec::_0011_1111)));
01727                 dest.AddCh(char(TUniCodec::_1000_0000 | ((c >> 12) & TUniCodec::_0011_1111)));
01728                 dest.AddCh(char(TUniCodec::_1000_0000 | ((c >> 6) & TUniCodec::_0011_1111)));
01729                 dest.AddCh(char(TUniCodec::_1000_0000 | (c & TUniCodec::_0011_1111))); }
01730 }
01731 
01732 TStr TUnicode::EncodeUtf8(const uint& Ch) {
01733         TChA ChA; EncodeUtf8(Ch, ChA); return ChA;
01734 }