SNAP Library 2.1, Developer Reference
2013-09-25 10:47:25
SNAP, a general purpose, high performance system for analysis and manipulation of large networks
#include "bd.h"

//#ifndef unicode_h
//#define unicode_h

// Includes
//#include "base.h"
#include <new>

typedef int TUniVecIdx;

//-----------------------------------------------------------------------------
// TUniCodec -- a UTF-8 and UTF-16 Encoder/Decoder
//-----------------------------------------------------------------------------

// Error handling modes for the TUniCodec class.
typedef enum TUnicodeErrorHandling_
{
  // What happens when an error occurs:
  uehIgnore = 0,  // - it is silently ignored (nothing is added to the output vector)
  uehThrow = 1,   // - an exception is thrown (TUnicodeException)
  uehReplace = 2, // - the replacement character is added to the output vector
  uehAbort = 3    // - the encoding/decoding process stops immediately
}
TUnicodeErrorHandling;

class TUnicodeException
{
public:
  TStr message;  // error message
  size_t srcIdx; // the position in the source vector where the error occurred
  int srcChar;   // the source character at the position srcIdx
  TUnicodeException(size_t srcIdx_, int srcChar_, const TStr& message_) :
    message(message_), srcIdx(srcIdx_), srcChar(srcChar_) { }
};

typedef enum TUniByteOrder_
{
  boMachineEndian = 0,
  boLittleEndian = 1,
  boBigEndian = 2
}
TUniByteOrder;

typedef enum TUtf16BomHandling_
{
  bomAllowed = 0,  // if a BOM is present, it is used to determine the byte order; otherwise, the default byte order is used
  bomRequired = 1, // if a BOM is present, it is used to determine the byte order; otherwise, an error is reported
  bomIgnored = 2   // the default byte order is used; if a BOM is present, it is treated like any other character
}
TUtf16BomHandling;

class TUniCodec
{
public:
  // 0xfffd is defined as the replacement character by the Unicode standard.
  // By default, it is rendered as a question mark inside a diamond: "<?>".
  enum { DefaultReplacementChar = 0xfffd };

  // The replacement character is inserted into the destination vector
  // if an error occurs in the source vector. By default, this is set
  // to DefaultReplacementChar.
  int replacementChar;
  // The error handling mode.
  TUnicodeErrorHandling errorHandling;
  // There are a number of situations where there is strictly speaking an error in
  // the source data although it can still be decoded in a reasonably meaningful way.
  // If strict == true, these situations are treated as errors. Examples:
  // - when decoding UTF-8:
  //   - a codepoint represented by more bytes than necessary (e.g. one of the characters 0..127
  //     encoded as a two-byte sequence)
  //   - a codepoint > 0x10ffff
  // - when decoding UTF-16:
  //   - a codepoint from the range reserved for the second character of a surrogate pair
  //     is not preceded by a codepoint from the range reserved for the first character of a surrogate pair
  // - when encoding UTF-8:
  //   - a codepoint > 0x10ffff
  // - when encoding UTF-16:
  //   - a codepoint from the range reserved for the second character of a surrogate pair
  //     [note that a codepoint > 0x10ffff, or from the range reserved for the first character of a
  //     surrogate pair, is always an error, even with strict == false]
  bool strict;
  // skipBom == true means: If a byte-order-mark (0xfffe or 0xfeff) occurs at the beginning
  // of the source vector, it is skipped (when decoding).
  // - Note: a BOM is not really useful in UTF-8 encoded data. However, the .NET UTF8Encoding
  //   emits 0xfeff by default as a kind of preamble. It gets encoded as 3 bytes, ef bb bf,
  //   and can be helpful to make the data easier to recognize as UTF-8 encoded data.
  bool skipBom;

  TUniCodec() : replacementChar(DefaultReplacementChar), errorHandling(uehIgnore), strict(false), skipBom(true)
  {
  }

  TUniCodec(TUnicodeErrorHandling errorHandling_, bool strict_, int replacementChar_, bool skipBom_) :
    replacementChar(replacementChar_), errorHandling(errorHandling_), strict(strict_), skipBom(skipBom_)
  {
  }

protected:
  enum {
#define DefineByte(b7, b6, b5, b4, b3, b2, b1, b0) _ ## b7 ## b6 ## b5 ## b4 ## _ ## b3 ## b2 ## b1 ## b0 = (b7 << 7) | (b6 << 6) | (b5 << 5) | (b4 << 4) | (b3 << 3) | (b2 << 2) | (b1 << 1) | b0
    DefineByte(1, 0, 0, 0, 0, 0, 0, 0),
    DefineByte(1, 1, 0, 0, 0, 0, 0, 0),
    DefineByte(1, 1, 1, 0, 0, 0, 0, 0),
    DefineByte(1, 1, 1, 1, 0, 0, 0, 0),
    DefineByte(1, 1, 1, 1, 1, 0, 0, 0),
    DefineByte(1, 1, 1, 1, 1, 1, 0, 0),
    DefineByte(1, 1, 1, 1, 1, 1, 1, 0),
    DefineByte(0, 0, 1, 1, 1, 1, 1, 1),
    DefineByte(0, 0, 0, 1, 1, 1, 1, 1),
    DefineByte(0, 0, 0, 0, 1, 1, 1, 1),
    DefineByte(0, 0, 0, 0, 0, 1, 1, 1),
    DefineByte(0, 0, 0, 0, 0, 0, 1, 1)
#undef DefineByte
  };

  typedef TUniVecIdx TVecIdx;
  //friend class TUniChDb;
  friend class TUniCaseFolding;
  friend class TUnicode;

public:

  //-----------------------------------------------------------------------
  // UTF-8
  //-----------------------------------------------------------------------

  // Returns the number of characters that have been successfully decoded.
  // This does not include any replacement characters that may have been inserted into 'dest'.
  template<typename TSrcVec, typename TDestCh>
  size_t DecodeUtf8(
    const TSrcVec& src, size_t srcIdx, const size_t srcCount,
    TVec<TDestCh>& dest, const bool clrDest = true) const;
  template<typename TSrcVec, typename TDestCh>
  size_t DecodeUtf8(const TSrcVec& src, TVec<TDestCh>& dest, const bool clrDest = true) const { return DecodeUtf8(src, 0, src.Len(), dest, clrDest); }

  // Returns the number of characters that have been successfully encoded.
  // This does not include any replacement characters that may have been inserted into 'dest'.
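// Example (illustrative sketch, not part of the original header): a typical round
// trip through the UTF-8 codec declared above, with malformed input replaced by
// U+FFFD instead of being silently dropped. The variable names are hypothetical.
//
//   TUniCodec codec(uehReplace, /*strict=*/false, TUniCodec::DefaultReplacementChar, /*skipBom=*/true);
//   TVec<char> utf8In;      // raw UTF-8 bytes, e.g. read from a file
//   TIntV codepoints;
//   size_t nOk = codec.DecodeUtf8(utf8In, codepoints);  // bytes -> codepoints; nOk counts valid characters
//   TVec<char> utf8Out;
//   codec.EncodeUtf8(codepoints, utf8Out);              // codepoints -> bytes (EncodeUtf8 is declared just below)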
00140 template<typename TSrcVec, typename TDestCh> 00141 size_t EncodeUtf8( 00142 const TSrcVec& src, size_t srcIdx, const size_t srcCount, 00143 TVec<TDestCh>& dest, const bool clrDest = true) const; 00144 template<typename TSrcVec, typename TDestCh> 00145 size_t EncodeUtf8(const TSrcVec& src, TVec<TDestCh>& dest, const bool clrDest = true) const { return EncodeUtf8(src, 0, src.Len(), dest, clrDest); } 00146 00147 // The following wrappers around the UTF-8 encoder return a TStr containing 00148 // the UTF-8-encoded version of the input string. 00149 template<typename TSrcVec> TStr EncodeUtf8Str(const TSrcVec& src, size_t srcIdx, const size_t srcCount) const { TVec<char> temp; EncodeUtf8(src, srcIdx, srcCount, temp); TStr retVal = &(temp[0]); return retVal; } 00150 template<typename TSrcVec> TStr EncodeUtf8Str(const TSrcVec& src) const { TVec<char> temp; EncodeUtf8(src, temp); temp.Add(0); TStr retVal = &(temp[0]); return retVal; } 00151 00152 //----------------------------------------------------------------------- 00153 // UTF-16 Decoder 00154 //----------------------------------------------------------------------- 00155 00156 protected: 00157 enum { 00158 Utf16FirstSurrogate = 0xd800, 00159 Utf16SecondSurrogate = 0xdc00 00160 }; 00161 00162 static bool IsMachineLittleEndian(); 00163 00164 public: 00165 00166 // Returns the number of characters that have been successfully decoded. 00167 // This does not include any replacement characters that may have been inserted into 'dest'. 00168 // Each element of 'src' is assumed to contain one byte of data. 00169 // srcCount must be even (though srcIdx doesn't need to be). 00170 template<typename TSrcVec, typename TDestCh> 00171 size_t DecodeUtf16FromBytes( 00172 const TSrcVec& src, size_t srcIdx, const size_t srcCount, 00173 TVec<TDestCh>& dest, const bool clrDest, 00174 const TUtf16BomHandling bomHandling = bomAllowed, 00175 const TUniByteOrder defaultByteOrder = boMachineEndian) const; 00176 00177 // Here, each element of 'src' is treated as a 16-bit word. The byte-order settings 00178 // are used to determine if the two bytes of each word should be swapped before further 00179 // processing. For example, if a BOM is present, it must have the value 0xfeff; if it 00180 // actually has the value 0xfffe, this means that the two bytes of each word must be swapped. 00181 // Basically, the combination of the byteOrder parameter and the byte order mark (if present) at the 00182 // beginning of the source data is used to determine the "original" byte order of the data; 00183 // if this doesn't match the byte order of the local machine, the two bytes of each word will 00184 // be swapped during the decoding process. 00185 template<typename TSrcVec, typename TDestCh> 00186 size_t DecodeUtf16FromWords( 00187 const TSrcVec& src, size_t srcIdx, const size_t srcCount, 00188 TVec<TDestCh>& dest, bool clrDest, 00189 const TUtf16BomHandling bomHandling = bomAllowed, 00190 const TUniByteOrder defaultByteOrder = boMachineEndian) const; 00191 00192 //----------------------------------------------------------------------- 00193 // UTF-16 Encoder 00194 //----------------------------------------------------------------------- 00195 00196 // Returns the number of characters that have been successfully encoded. 00197 // This does not include any replacement characters that may have been inserted into 'dest'. 
00198 // 00199 // Notes: 00200 // - UTF-16 *cannot* encode characters above 0x10ffff, so their presence is always 00201 // treated as an error, regardless of the value of 'strict'. 00202 // - Characters from the range Utf16FirstSurrogate through Utf16FirstSurrogate + 1023 00203 // cannot be encoded by UTF-16 either, as they would be misinterpreted during decoding 00204 // as the first character of a surrogate pair. 00205 // - Characters from the range Utf16SecondSurrogate through Utf16SecondSurrogate + 1023 00206 // can be encoded in principle; however, if strict == true, they are treated as errors. 00207 template<typename TSrcVec, typename TDestCh> 00208 size_t EncodeUtf16ToWords( 00209 const TSrcVec& src, size_t srcIdx, const size_t srcCount, 00210 TVec<TDestCh>& dest, const bool clrDest, const bool insertBom, 00211 const TUniByteOrder destByteOrder = boMachineEndian) const; 00212 00213 template<typename TSrcVec, typename TDestCh> 00214 size_t EncodeUtf16ToBytes( 00215 const TSrcVec& src, size_t srcIdx, const size_t srcCount, 00216 TVec<TDestCh>& dest, const bool clrDest, const bool insertBom, 00217 const TUniByteOrder destByteOrder = boMachineEndian) const; 00218 00219 //----------------------------------------------------------------------- 00220 // Helper declarations for the test drivers 00221 //----------------------------------------------------------------------- 00222 00223 protected: 00224 00225 static uint GetRndUint(TRnd& rnd); 00226 static uint GetRndUint(TRnd& rnd, uint minVal, uint maxVal); 00227 00228 //----------------------------------------------------------------------- 00229 // UTF-8 Test Driver 00230 //----------------------------------------------------------------------- 00231 00232 protected: 00233 void TestUtf8(bool decode, size_t expectedRetVal, bool expectedThrow, const TIntV& src, const TIntV& expectedDest, FILE *f); 00234 // Generates a random UTF-8-encoded stream according to the specifications in 'testCaseDesc', 00235 // then calls TestUtf8 to make sure that DecodeUtf8 reacts as expected. 00236 void TestDecodeUtf8(TRnd& rnd, const TStr& testCaseDesc); 00237 public: 00238 void TestUtf8(); 00239 00240 //----------------------------------------------------------------------- 00241 // UTF-16 Test Driver 00242 //----------------------------------------------------------------------- 00243 00244 protected: 00245 void WordsToBytes(const TIntV& src, TIntV& dest); 00246 void TestUtf16(bool decode, size_t expectedRetVal, bool expectedThrow, const TIntV& src, const TIntV& expectedDest, 00247 // Note: insertBom is only used with the encoder. When encoding, 'defaultByteOrder' is used as the destination byte order. 00248 const TUtf16BomHandling bomHandling, const TUniByteOrder defaultByteOrder, const bool insertBom, 00249 FILE *f); 00250 static inline int SwapBytes(int x) { 00251 return ((x >> 8) & 0xff) | ((x & 0xff) << 8); } 00252 // Generates a random UTF-16-encoded stream according to the specifications in 'testCaseDesc', 00253 // then calls TestUtf16 to make sure that DecodeUtf16 reacts as expected. 
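// Example (illustrative sketch, not part of the original header): decoding a UTF-16
// byte stream whose byte order is taken from the BOM when one is present, and
// re-encoding the codepoints as little-endian bytes with a BOM. Names are hypothetical.
//
//   TUniCodec codec;
//   TIntV utf16Bytes;       // one byte per element; srcCount must be even
//   TIntV codepoints;
//   codec.DecodeUtf16FromBytes(utf16Bytes, 0, utf16Bytes.Len(), codepoints, true,
//                              bomAllowed, boMachineEndian);
//   TIntV bytesOut;
//   codec.EncodeUtf16ToBytes(codepoints, 0, codepoints.Len(), bytesOut, true,
//                            /*insertBom=*/true, boLittleEndian);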
00254 void TestDecodeUtf16(TRnd& rnd, const TStr& testCaseDesc, 00255 const TUtf16BomHandling bomHandling, 00256 const TUniByteOrder defaultByteOrder, 00257 const bool insertBom); 00258 public: 00259 void TestUtf16(); 00260 00261 }; 00262 00263 //----------------------------------------------------------------------------- 00264 // Case folding 00265 //----------------------------------------------------------------------------- 00266 // Note: there's no need to access this class directly. 00267 // Use TUniChDb::GetCaseFolded() instead. 00268 00269 typedef THash<TInt, TIntV> TIntIntVH; 00270 00271 class TUniCaseFolding 00272 { 00273 protected: 00274 TIntH cfCommon, cfSimple, cfTurkic; 00275 TIntIntVH cfFull; 00276 00277 template<typename TSrcDat, typename TDestDat> 00278 inline static void AppendVector(const TVec<TSrcDat>& src, TVec<TDestDat>& dest) { 00279 for (int i = 0; i < src.Len(); i++) dest.Add(src[i]); } 00280 friend class TUniChDb; 00281 typedef TUniVecIdx TVecIdx; 00282 00283 public: 00284 TUniCaseFolding() { } 00285 explicit TUniCaseFolding(TSIn& SIn) : cfCommon(SIn), cfSimple(SIn), cfTurkic(SIn), cfFull(SIn) { SIn.LoadCs(); } 00286 void Load(TSIn& SIn) { cfCommon.Load(SIn); cfSimple.Load(SIn); cfFull.Load(SIn); cfTurkic.Load(SIn); SIn.LoadCs(); } 00287 void Save(TSOut& SOut) const { cfCommon.Save(SOut); cfSimple.Save(SOut); cfFull.Save(SOut); cfTurkic.Save(SOut); SOut.SaveCs(); } 00288 void Clr() { cfCommon.Clr(); cfSimple.Clr(); cfFull.Clr(); cfTurkic.Clr(); } 00289 void LoadTxt(const TStr& fileName); 00290 00291 // Use 'turkic' when processing text in a Turkic language (tr, az). This only affects the uppercase I and I-with-dot-above. 00292 template<typename TSrcVec, typename TDestCh> 00293 void Fold(const TSrcVec& src, size_t srcIdx, const size_t srcCount, 00294 TVec<TDestCh>& dest, const bool clrDest, const bool full, const bool turkic) const 00295 { 00296 for (const size_t srcEnd = srcIdx + srcCount; srcIdx < srcEnd; ) 00297 { 00298 int c = src[TVecIdx(srcIdx)], i; srcIdx++; 00299 if (turkic && ((i = cfTurkic.GetKeyId(c)) >= 0)) { dest.Add(cfTurkic[i]); continue; } 00300 if (full && ((i = cfFull.GetKeyId(c)) >= 0)) { AppendVector(cfFull[i], dest); continue; } 00301 if ((! 
full) && ((i = cfSimple.GetKeyId(c)) >= 0)) { dest.Add(cfSimple[i]); continue; } 00302 i = cfCommon.GetKeyId(c); if (i >= 0) dest.Add(cfCommon[i]); else dest.Add(c); 00303 } 00304 } 00305 00306 template<typename TSrcVec> 00307 void FoldInPlace(TSrcVec& src, size_t srcIdx, const size_t srcCount, const bool turkic) const 00308 { 00309 for (const size_t srcEnd = srcIdx + srcCount; srcIdx < srcEnd; srcIdx++) 00310 { 00311 int c = src[TVecIdx(srcIdx)], i; 00312 if (turkic && ((i = cfTurkic.GetKeyId(c)) >= 0)) { src[TVecIdx(srcIdx)] = cfTurkic[i]; continue; } 00313 if ((i = cfSimple.GetKeyId(c)) >= 0) { src[TVecIdx(srcIdx)] = cfSimple[i]; continue; } 00314 i = cfCommon.GetKeyId(c); if (i >= 0) src[TVecIdx(srcIdx)] = cfCommon[i]; 00315 } 00316 } 00317 00318 protected: 00319 void Test(const TIntV& src, const TIntV& expectedDest, const bool full, const bool turkic, FILE *f); 00320 public: 00321 void Test(); 00322 }; 00323 00324 //----------------------------------------------------------------------------- 00325 // TCodecBase -- an abstract base class for codecs 00326 //----------------------------------------------------------------------------- 00327 00328 class TCodecBase; 00329 typedef TPt<TCodecBase> PCodecBase; 00330 typedef TVec<PCodecBase> TCodecBaseV; 00331 00332 class TCodecBase 00333 { 00334 protected: 00335 TCRef CRef; 00336 friend class TPt<TCodecBase>; 00337 public: 00338 virtual ~TCodecBase() { } 00339 00340 template<class TCodecImpl> 00341 static PCodecBase New(); /* { 00342 return new TCodecWrapper<TCodecImpl>(); } */ 00343 00344 virtual TStr GetName() const = 0; 00345 virtual void Test() const { } 00346 00347 // Returns the number of characters that have been successfully decoded. 00348 // This does not include any replacement characters that may have been inserted into 'dest'. 00349 virtual size_t ToUnicode(const TIntV& src, size_t srcIdx, const size_t srcCount, TIntV& dest, const bool clrDest = true) const = 0; 00350 virtual size_t ToUnicode(const TStr& src, size_t srcIdx, const size_t srcCount, TIntV& dest, const bool clrDest = true) const = 0; 00351 00352 size_t ToUnicode(const TIntV& src, TIntV& dest, const bool clrDest = true) const { return ToUnicode(src, 0, src.Len(), dest, clrDest); } 00353 size_t ToUnicode(const TStr& src, TIntV& dest, const bool clrDest = true) const { return ToUnicode(src, 0, src.Len(), dest, clrDest); } 00354 00355 // Returns the number of characters that have been successfully encoded. 00356 // This does not include any replacement characters that may have been inserted into 'dest'. 
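// Example (illustrative sketch, not part of the original header): the 8-bit codecs
// declared further below (e.g. T8BitCodec<TEncoding_ISO8859_2>) can be used through
// this polymorphic interface when the encoding is only known at run time. Names are
// hypothetical.
//
//   PCodecBase codec = TCodecBase::New<T8BitCodec<TEncoding_ISO8859_2> >();
//   TIntV codepoints;
//   codec->ToUnicode(TStr("8-bit input"), codepoints);  // bytes -> codepoints
//   TStr back;
//   codec->FromUnicode(codepoints, back);               // codepoints -> bytes
//   // What happens to unencodable codepoints depends on the codec's errorHandling
//   // setting (ignore, throw, replace or abort).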
00357 virtual size_t FromUnicode(const TIntV& src, size_t srcIdx, const size_t srcCount, TIntV& dest, const bool clrDest = true) const = 0; 00358 virtual size_t FromUnicode(const TIntV& src, size_t srcIdx, const size_t srcCount, TChA& dest, const bool clrDest = true) const = 0; 00359 virtual size_t FromUnicode(const TIntV& src, size_t srcIdx, const size_t srcCount, TStr& dest, const bool clrDest = true) const = 0; 00360 00361 size_t FromUnicode(const TIntV& src, TIntV& dest, const bool clrDest = true) const { return FromUnicode(src, 0, src.Len(), dest, clrDest); } 00362 size_t FromUnicode(const TIntV& src, TChA& dest, const bool clrDest = true) const { return FromUnicode(src, 0, src.Len(), dest, clrDest); } 00363 size_t FromUnicode(const TIntV& src, TStr& dest, const bool clrDest = true) const { return FromUnicode(src, 0, src.Len(), dest, clrDest); } 00364 }; 00365 00366 //----------------------------------------------------------------------------- 00367 // TCodecWrapper -- a descendant of TCodecBase; relies on a template 00368 // parameter class for the actual implementation of the codec. 00369 //----------------------------------------------------------------------------- 00370 // Thus, if you know in advance that you'll need ISO-8859-2, just use 00371 // T8BitCodec<TEncoding_ISO8859_2>. If you don't know the encoding 00372 // in advance, use a PCodecBase pointing to a suitable specialization 00373 // of TCodecWrapper<...>. You can TUnicode::GetCodec(TStr& name) 00374 // to obtain a suitable pointer. 00375 00376 template<class TCodecImpl_> 00377 class TCodecWrapper : public TCodecBase 00378 { 00379 public: 00380 typedef TCodecImpl_ TCodecImpl; 00381 TCodecImpl impl; 00382 public: 00383 00384 virtual TStr GetName() const { return impl.GetName(); } 00385 00386 virtual void Test() const { impl.Test(); } 00387 00388 virtual size_t ToUnicode(const TIntV& src, size_t srcIdx, const size_t srcCount, TIntV& dest, const bool clrDest = true) const { 00389 return impl.ToUnicode(src, srcIdx, srcCount, dest, clrDest); } 00390 virtual size_t ToUnicode(const TStr& src, size_t srcIdx, const size_t srcCount, TIntV& dest, const bool clrDest = true) const { 00391 return impl.ToUnicode(src, srcIdx, srcCount, dest, clrDest); } 00392 00393 virtual size_t FromUnicode(const TIntV& src, size_t srcIdx, const size_t srcCount, TIntV& dest, const bool clrDest = true) const { 00394 return impl.FromUnicode(src, srcIdx, srcCount, dest, clrDest); } 00395 virtual size_t FromUnicode(const TIntV& src, size_t srcIdx, const size_t srcCount, TChA& dest, const bool clrDest = true) const { 00396 return impl.FromUnicode(src, srcIdx, srcCount, dest, clrDest); } 00397 virtual size_t FromUnicode(const TIntV& src, size_t srcIdx, const size_t srcCount, TStr& dest, const bool clrDest = true) const { 00398 TChA buf; size_t retVal = impl.FromUnicode(src, srcIdx, srcCount, buf, false); 00399 if (clrDest) dest += buf.CStr(); else dest = buf.CStr(); 00400 return retVal; } 00401 }; 00402 00403 template<class TCodecImpl> 00404 PCodecBase TCodecBase::New() { 00405 return new TCodecWrapper<TCodecImpl>(); 00406 } 00407 00408 //----------------------------------------------------------------------------- 00409 // TVecElt -- a template for determining the type of a vector's elements 00410 //----------------------------------------------------------------------------- 00411 00412 template<class TVector_> 00413 class TVecElt 00414 { 00415 }; 00416 00417 template<class TDat> 00418 class TVecElt<TVec<TDat> > 00419 { 00420 public: 00421 typedef 
TVec<TDat> TVector; 00422 typedef TDat TElement; 00423 static inline void Add(TVector& vector, const TElement& element) { vector.Add(element); } 00424 }; 00425 00426 template<> 00427 class TVecElt<TChA> 00428 { 00429 public: 00430 typedef TChA TVector; 00431 typedef char TElement; 00432 static inline void Add(TVector& vector, const TElement& element) { vector += element; } 00433 }; 00434 00435 00436 //----------------------------------------------------------------------------- 00437 // T8BitCodec -- a class for converting between 8-bit encodings and Unicode 00438 //----------------------------------------------------------------------------- 00439 00440 class TEncoding_ISO8859_1 00441 { 00442 public: 00443 static inline TStr GetName() { return "ISO-8859-1"; } 00444 static int ToUnicode(int c) { Assert(0 <= c && c <= 255); return c; } 00445 static int FromUnicode(int c) { if (0 <= c && c <= 255) return c; else return -1; } 00446 }; 00447 00448 class TEncoding_ISO8859_2 // ISO Latin 2 00449 { 00450 public: 00451 static inline TStr GetName() { return "ISO-8859-2"; } 00452 static const int toUnicodeTable[6 * 16], fromUnicodeTable1[14 * 16], fromUnicodeTable2[2 * 16]; 00453 static int ToUnicode(int c) { Assert(0 <= c && c <= 255); 00454 if (c < 0xa0) return c; else return toUnicodeTable[c - 0xa0]; } 00455 static int FromUnicode(int c) { 00456 if (0 <= c && c < 0xa0) return c; 00457 else if (0xa0 <= c && c < 0x180) return fromUnicodeTable1[c - 0xa0]; 00458 else if (0x2c0 <= c && c < 0x2e0) return fromUnicodeTable2[c - 0x2c0]; 00459 else return -1; } 00460 }; 00461 00462 class TEncoding_ISO8859_3 00463 { 00464 public: 00465 static inline TStr GetName() { return "ISO-8859-3"; } 00466 static const int toUnicodeTable[6 * 16], fromUnicodeTable1[14 * 16], fromUnicodeTable2[2]; 00467 static int ToUnicode(int c) { Assert(0 <= c && c <= 255); 00468 if (c < 0xa0) return c; else return toUnicodeTable[c - 0xa0]; } 00469 static int FromUnicode(int c) { 00470 if (0 <= c && c < 0xa0) return c; 00471 else if (0xa0 <= c && c < 0x180) return fromUnicodeTable1[c - 0xa0]; 00472 else if (0x2d8 <= c && c < 0x2da) return fromUnicodeTable2[c - 0x2d8]; 00473 else return -1; } 00474 }; 00475 00476 class TEncoding_ISO8859_4 00477 { 00478 public: 00479 static inline TStr GetName() { return "ISO-8859-4"; } 00480 static const int toUnicodeTable[6 * 16], fromUnicodeTable1[14 * 16], fromUnicodeTable2[2 * 16]; 00481 static int ToUnicode(int c) { Assert(0 <= c && c <= 255); 00482 if (c < 0xa0) return c; else return toUnicodeTable[c - 0xa0]; } 00483 static int FromUnicode(int c) { 00484 if (0 <= c && c < 0xa0) return c; 00485 else if (0xa0 <= c && c < 0x180) return fromUnicodeTable1[c - 0xa0]; 00486 else if (0x2c0 <= c && c < 0x2e0) return fromUnicodeTable2[c - 0x2c0]; 00487 else return -1; } 00488 }; 00489 00490 class TEncoding_YuAscii 00491 { 00492 public: 00493 static const int uniChars[10], yuAsciiChars[10]; 00494 static inline TStr GetName() { return "YU-ASCII"; } 00495 static int ToUnicode(int c) { Assert(0 <= c && c <= 255); 00496 for (int i = 0; i < int(sizeof(yuAsciiChars) / sizeof(yuAsciiChars[0])); i++) 00497 if (c == yuAsciiChars[i]) return uniChars[i]; 00498 return c; } 00499 static int FromUnicode(int c) { 00500 for (int i = 0; i < int(sizeof(uniChars) / sizeof(uniChars[0])); i++) 00501 if (c == uniChars[i]) return yuAsciiChars[i]; 00502 else if(c == yuAsciiChars[i]) return -1; 00503 if (0 <= c && c <= 255) return c; else return -1; } 00504 }; 00505 00506 class TEncoding_CP437 // DOS US 00507 { 00508 public: 00509 
static inline TStr GetName() { return "CP437"; } 00510 static const int toUnicodeTable[8 * 16], fromUnicodeTable1[6 * 16], fromUnicodeTable2[4 * 16], fromUnicodeTable3[6 * 16], fromUnicodeTable4[11 * 16]; 00511 static int ToUnicode(int c) { Assert(0 <= c && c <= 255); 00512 if (c < 0x80) return c; else return toUnicodeTable[c - 0x80]; } 00513 static int FromUnicode(int c) { 00514 if (0 <= c && c < 0x80) return c; 00515 else if (0xa0 <= c && c < 0x100) return fromUnicodeTable1[c - 0xa0]; 00516 else if (0x390 <= c && c < 0x3d0) return fromUnicodeTable2[c - 0x390]; 00517 else if (0x2210 <= c && c < 0x2270) return fromUnicodeTable3[c - 0x2210]; 00518 else if (0x2500 <= c && c < 0x25b0) return fromUnicodeTable4[c - 0x2500]; 00519 else if (c == 0x192) return 0x9f; 00520 else if (c == 0x207f) return 0xfc; 00521 else if (c == 0x20a7) return 0x9e; 00522 else if (c == 0x2310) return 0xa9; 00523 else if (c == 0x2320) return 0xf4; 00524 else if (c == 0x2321) return 0xf5; 00525 else return -1; } 00526 }; 00527 00528 class TEncoding_CP852 // DOS Latin 2 00529 { 00530 public: 00531 static inline TStr GetName() { return "CP852"; } 00532 static const int toUnicodeTable[8 * 16], fromUnicodeTable1[14 * 16], fromUnicodeTable2[2 * 16], fromUnicodeTable3[11 * 16]; 00533 static int ToUnicode(int c) { Assert(0 <= c && c <= 255); 00534 if (c < 0x80) return c; else return toUnicodeTable[c - 0x80]; } 00535 static int FromUnicode(int c) { 00536 if (0 <= c && c < 0x80) return c; 00537 else if (0xa0 <= c && c < 0x180) return fromUnicodeTable1[c - 0xa0]; 00538 else if (0x2c0 <= c && c < 0x2e0) return fromUnicodeTable2[c - 0x2c0]; 00539 else if (0x2500 <= c && c < 0x25b0) return fromUnicodeTable3[c - 0x2500]; 00540 else return -1; } 00541 }; 00542 00543 class TEncoding_CP1250 // Windows-1250, similar to ISO Latin 2 00544 { 00545 public: 00546 static inline TStr GetName() { return "CP1250"; } 00547 static const int toUnicodeTable[8 * 16], fromUnicodeTable1[14 * 16], fromUnicodeTable2[2 * 16], fromUnicodeTable3[3 * 16]; 00548 static int ToUnicode(int c) { Assert(0 <= c && c <= 255); 00549 if (c < 0x80) return c; else return toUnicodeTable[c - 0x80]; } 00550 static int FromUnicode(int c) { 00551 if (0 <= c && c < 0x80) return c; 00552 else if (0xa0 <= c && c < 0x180) return fromUnicodeTable1[c - 0xa0]; 00553 else if (0x2c0 <= c && c < 0x2e0) return fromUnicodeTable2[c - 0x2c0]; 00554 else if (0x2010 <= c && c < 0x2040) return fromUnicodeTable3[c - 0x2010]; 00555 else if (c == 0x20ac) return 0x80; 00556 else if (c == 0x2122) return 0x99; 00557 else return -1; } 00558 }; 00559 00560 template<class TEncoding_> 00561 class T8BitCodec 00562 { 00563 protected: 00564 typedef TUniVecIdx TVecIdx; 00565 public: 00566 typedef TEncoding_ TEncoding; 00567 TUnicodeErrorHandling errorHandling; 00568 int replacementChar; 00569 00570 T8BitCodec() : errorHandling(uehIgnore), replacementChar(TUniCodec::DefaultReplacementChar) { } 00571 T8BitCodec(TUnicodeErrorHandling errorHandling_, int replacementChar_ = TUniCodec::DefaultReplacementChar) : 00572 errorHandling(errorHandling_), replacementChar(replacementChar_) { } 00573 static TStr GetName() { return TEncoding::GetName(); } 00574 00575 void Test() const 00576 { 00577 int nDecoded = 0; 00578 for (int c = 0; c <= 255; c++) { 00579 int cu = TEncoding::ToUnicode(c); if (cu == -1) continue; 00580 nDecoded++; 00581 IAssert(0 <= cu && cu < 0x110000); 00582 int c2 = TEncoding::FromUnicode(cu); 00583 IAssert(c2 == c); } 00584 int nEncoded = 0; 00585 for (int cu = 0; cu < 0x110000; cu++) { 00586 int 
c = TEncoding::FromUnicode(cu); if (c == -1) continue; 00587 nEncoded++; 00588 IAssert(0 <= c && c <= 255); 00589 int cu2 = TEncoding::ToUnicode(c); 00590 IAssert(cu2 == cu); } 00591 IAssert(nDecoded == nEncoded); 00592 } 00593 00594 // Returns the number of characters that have been successfully decoded. 00595 // This does not include any replacement characters that may have been inserted into 'dest'. 00596 template<typename TSrcVec, typename TDestCh> 00597 size_t ToUnicode( 00598 const TSrcVec& src, size_t srcIdx, const size_t srcCount, 00599 TVec<TDestCh>& dest, const bool clrDest = true) const 00600 { 00601 if (clrDest) dest.Clr(); 00602 size_t toDo = srcCount; 00603 while (toDo-- > 0) { 00604 int chSrc = ((int) src[TVecIdx(srcIdx)]) & 0xff; srcIdx++; 00605 int chDest = TEncoding::ToUnicode(chSrc); 00606 dest.Add(chDest); } 00607 return srcCount; 00608 } 00609 template<typename TSrcVec, typename TDestCh> 00610 size_t ToUnicode(const TSrcVec& src, TVec<TDestCh>& dest, const bool clrDest = true) const { return ToUnicode(src, 0, src.Len(), dest, clrDest); } 00611 00612 size_t ToUnicode(const TIntV& src, TIntV& dest, const bool clrDest = true) const { return ToUnicode(src, 0, src.Len(), dest, clrDest); } 00613 size_t ToUnicode(const TStr& src, TIntV& dest, const bool clrDest = true) const { return ToUnicode(src, 0, src.Len(), dest, clrDest); } 00614 00615 // Returns the number of characters that have been successfully encoded. 00616 // This does not include any replacement characters that may have been inserted into 'dest'. 00617 template<typename TSrcVec, typename TDestVec> 00618 size_t FromUnicode( 00619 const TSrcVec& src, size_t srcIdx, const size_t srcCount, 00620 TDestVec& dest, const bool clrDest = true) const 00621 { 00622 typedef typename TVecElt<TDestVec>::TElement TDestCh; 00623 if (clrDest) dest.Clr(); 00624 size_t toDo = srcCount, nEncoded = 0; 00625 while (toDo-- > 0) { 00626 int chSrc = (int) src[TVecIdx(srcIdx)]; srcIdx++; 00627 int chDest = TEncoding::FromUnicode(chSrc); 00628 if (chDest < 0) { 00629 switch (errorHandling) { 00630 case uehThrow: throw TUnicodeException(srcIdx - 1, chSrc, "Invalid character for encoding into " + GetName() + "."); 00631 case uehAbort: return nEncoded; 00632 case uehReplace: TVecElt<TDestVec>::Add(dest, TDestCh(replacementChar)); continue; 00633 case uehIgnore: continue; 00634 default: Fail; } } 00635 TVecElt<TDestVec>::Add(dest, TDestCh(chDest)); nEncoded++; } 00636 return nEncoded; 00637 } 00638 00639 template<typename TSrcVec, typename TDestVec> 00640 size_t FromUnicode(const TSrcVec& src, TDestVec& dest, const bool clrDest = true) const { return FromUnicode(src, 0, src.Len(), dest, clrDest); } 00641 00642 size_t UniToStr(const TIntV& src, size_t srcIdx, const size_t srcCount, TStr& dest, const bool clrDest = true) const { 00643 TChA buf; size_t retVal = FromUnicode(src, srcIdx, srcCount, buf, false); 00644 if (clrDest) dest += buf.CStr(); else dest = buf.CStr(); 00645 return retVal; } 00646 size_t UniToStr(const TIntV& src, TStr& dest, const bool clrDest = true) const { return UniToStr(src, 0, src.Len(), dest, clrDest); } 00647 }; 00648 00649 typedef T8BitCodec<TEncoding_ISO8859_1> TCodec_ISO8859_1; 00650 typedef T8BitCodec<TEncoding_ISO8859_2> TCodec_ISO8859_2; 00651 typedef T8BitCodec<TEncoding_ISO8859_3> TCodec_ISO8859_3; 00652 typedef T8BitCodec<TEncoding_ISO8859_4> TCodec_ISO8859_4; 00653 typedef T8BitCodec<TEncoding_CP852> TCodec_CP852; 00654 typedef T8BitCodec<TEncoding_CP437> TCodec_CP437; 00655 typedef 
T8BitCodec<TEncoding_CP1250> TCodec_CP1250; 00656 typedef T8BitCodec<TEncoding_YuAscii> TCodec_YuAscii; 00657 00658 //----------------------------------------------------------------------------- 00659 // Various declarations used by the Unicode Character Database 00660 //----------------------------------------------------------------------------- 00661 00662 typedef enum TUniChCategory_ 00663 { 00664 #define DefineUniCat(cat, c) uc ## cat = (int(uchar(c)) & 0xff) 00665 DefineUniCat(Letter, 'L'), // ucLetter 00666 DefineUniCat(Mark, 'M'), 00667 DefineUniCat(Number, 'N'), 00668 DefineUniCat(Punctuation, 'P'), 00669 DefineUniCat(Symbol, 'S'), 00670 DefineUniCat(Separator, 'Z'), 00671 DefineUniCat(Other, 'C') 00672 #undef DefineUniCat 00673 } 00674 TUniChCategory; 00675 00676 typedef enum TUniChSubCategory_ 00677 { 00678 #define DefineUniSubCat(cat, subCat, c) uc ## cat ## subCat = ((uc ## cat) << 8) | (int(uchar(c)) & 0xff) 00679 DefineUniSubCat(Letter, Uppercase, 'u'), // ucLetterUppercase 00680 DefineUniSubCat(Letter, Lowercase, 'l'), 00681 DefineUniSubCat(Letter, Titlecase, 't'), 00682 DefineUniSubCat(Letter, Modifier, 'm'), 00683 DefineUniSubCat(Letter, Other, 'o'), 00684 DefineUniSubCat(Mark, Nonspacing, 'n'), 00685 DefineUniSubCat(Mark, SpacingCombining, 'c'), 00686 DefineUniSubCat(Mark, Enclosing, 'e'), 00687 DefineUniSubCat(Number, DecimalDigit, 'd'), 00688 DefineUniSubCat(Number, Letter, 'l'), 00689 DefineUniSubCat(Number, Other, 'o'), 00690 DefineUniSubCat(Punctuation, Connector, 'c'), 00691 DefineUniSubCat(Punctuation, Dash, 'd'), 00692 DefineUniSubCat(Punctuation, Open, 's'), 00693 DefineUniSubCat(Punctuation, Close, 'e'), 00694 DefineUniSubCat(Punctuation, InitialQuote, 'i'), 00695 DefineUniSubCat(Punctuation, FinalQuote, 'f'), 00696 DefineUniSubCat(Punctuation, Other, 'o'), 00697 DefineUniSubCat(Symbol, Math, 'm'), 00698 DefineUniSubCat(Symbol, Currency, 'c'), 00699 DefineUniSubCat(Symbol, Modifier, 'k'), 00700 DefineUniSubCat(Symbol, Other, 'o'), 00701 DefineUniSubCat(Separator, Space, 's'), 00702 DefineUniSubCat(Separator, Line, 'l'), 00703 DefineUniSubCat(Separator, Paragraph, 'p'), 00704 DefineUniSubCat(Other, Control, 'c'), 00705 DefineUniSubCat(Other, Format, 'f'), 00706 DefineUniSubCat(Other, Surrogate, 's'), 00707 DefineUniSubCat(Other, PrivateUse, 'o'), 00708 DefineUniSubCat(Other, NotAssigned, 'n') 00709 } 00710 TUniChSubCategory; 00711 00712 typedef enum TUniChFlags_ 00713 { 00714 ucfCompatibilityDecomposition = 1, // if this flag is not set, the decomposition is canonical 00715 ucfCompositionExclusion = 1 << 1, // from CompositionExclusions.txt 00716 // Flags used when searching for word boundaries. See UAX #29. 00717 ucfWbFormat = 1 << 2, 00718 ucfWbKatakana = 1 << 3, 00719 ucfWbALetter = 1 << 4, 00720 ucfWbMidLetter = 1 << 5, 00721 ucfWbMidNum = 1 << 6, 00722 ucfWbNumeric = 1 << 7, 00723 ucfWbExtendNumLet = 1 << 8, 00724 // Flags used with sentence boundaries (Sep is also used with word boundaries). See UAX #29. 
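// Illustrative note (not part of the original header): the category constants above
// pack the UnicodeData.txt letters directly, e.g. ucLetter == 'L' and
// ucLetterUppercase == (('L' << 8) | 'u'), so the general category of a subcategory
// value is simply (subCat >> 8) & 0xff.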
00725 ucfSbSep = 1 << 9, 00726 ucfSbFormat = 1 << 10, 00727 ucfSbSp = 1 << 11, 00728 ucfSbLower = 1 << 12, 00729 ucfSbUpper = 1 << 13, 00730 ucfSbOLetter = 1 << 14, 00731 ucfSbNumeric = 1 << 15, 00732 ucfSbATerm = 1 << 16, 00733 ucfSbSTerm = 1 << 17, 00734 ucfSbClose = 1 << 18, 00735 ucfSbMask = ucfSbSep | ucfSbFormat | ucfSbSp | ucfSbLower | ucfSbUpper | ucfSbOLetter | ucfSbNumeric | ucfSbATerm | ucfSbSTerm | ucfSbClose, 00736 ucfWbMask = ucfWbFormat | ucfWbKatakana | ucfWbALetter | ucfWbMidLetter | ucfWbMidNum | ucfWbNumeric | ucfWbExtendNumLet | ucfSbSep, 00737 // Flags from DerivedCoreProperties.txt. 00738 // [The comments are from UCD.html.] 00739 // - Characters with the Alphabetic property. For more information, see Chapter 4 in [Unicode]. 00740 // Generated from: Other_Alphabetic + Lu + Ll + Lt + Lm + Lo + Nl 00741 ucfDcpAlphabetic = 1 << 19, 00742 // - For programmatic determination of default-ignorable code points. 00743 // New characters that should be ignored in processing (unless explicitly supported) 00744 // will be assigned in these ranges, permitting programs to correctly handle the default 00745 // behavior of such characters when not otherwise supported. For more information, see 00746 // UAX #29: Text Boundaries [Breaks]. 00747 // Generated from Other_Default_Ignorable_Code_Point + Cf + Cc + Cs + Noncharacters - White_Space - annotation characters 00748 // [Examples: soft hyphen, zero-width space, noncharacters (e.g. U+fffe, U+ffff, U+1fffe, U+1ffff, etc.), surrogates, language tags, variation selectors] 00749 ucfDcpDefaultIgnorableCodePoint = 1 << 20, 00750 // - Characters with the Lowercase property. For more information, see Chapter 4 in [Unicode]. 00751 // Generated from: Other_Lowercase + Ll 00752 ucfDcpLowercase = 1 << 21, 00753 // - For programmatic determination of grapheme cluster boundaries. 00754 // For more information, see UAX #29: Text Boundaries [Breaks]. 00755 // Generated from: [0..10FFFF] - Cc - Cf - Cs - Co - Cn - Zl - Zp - Grapheme_Extend 00756 ucfDcpGraphemeBase = 1 << 22, 00757 // - For programmatic determination of grapheme cluster boundaries. 00758 // For more information, see UAX #29: Text Boundaries [Breaks]. 00759 // Generated from: Other_Grapheme_Extend + Me + Mn 00760 // Note: depending on an application's interpretation of Co (private use), they may be either 00761 // in Grapheme_Base, or in Grapheme_Extend, or in neither. 00762 ucfDcpGraphemeExtend = 1 << 23, 00763 // - Used to determine programming identifiers, as described in UAX #31: Identifier and Pattern Syntax. 00764 ucfDcpIdStart = 1 << 24, 00765 ucfDcpIdContinue = 1 << 25, 00766 // - Characters with the Math property. For more information, see Chapter 4 in [Unicode]. 00767 // Generated from: Sm + Other_Math 00768 ucfDcpMath = 1 << 26, 00769 // - Characters with the Uppercase property. For more information, see Chapter 4 in [Unicode]. 00770 // Generated from: Lu + Other_Uppercase 00771 ucfDcpUppercase = 1 << 27, 00772 // - Used to determine programming identifiers, as described in UAX #31: Identifier and Pattern Syntax. 00773 ucfDcpXidStart = 1 << 28, 00774 ucfDcpXidContinue = 1 << 29, 00775 ucfDcpMask = ucfDcpAlphabetic | ucfDcpDefaultIgnorableCodePoint | ucfDcpLowercase | ucfDcpGraphemeBase | ucfDcpGraphemeExtend | 00776 ucfDcpIdStart | ucfDcpIdContinue | ucfDcpMath | ucfDcpUppercase | ucfDcpXidStart | ucfDcpXidContinue, 00777 } 00778 TUniChFlags; 00779 00780 typedef enum TUniChProperties_ 00781 { 00782 // The flags from PropList.txt. 00783 // [The comments are from UCD.html.] 
00784 // - ASCII characters commonly used for the representation of hexadecimal numbers. 00785 // [= 0123456789abcdefABCDEF] 00786 ucfPrAsciiHexDigit = 1, 00787 // - Those format control characters which have specific functions in the Bidirectional Algorithm. 00788 ucfPrBidiControl = 2, 00789 // - Those punctuation characters explicitly called out as dashes in the Unicode Standard, 00790 // plus compatibility equivalents to those. Most of these have the Pd General Category, 00791 // but some have the Sm General Category because of their use in mathematics. 00792 // U+0002d HYPHEN-MINUS 00793 // U+0058a ARMENIAN HYPHEN 00794 // U+005be HEBREW PUNCTUATION MAQAF 00795 // U+01806 MONGOLIAN TODO SOFT HYPHEN 00796 // U+02010 HYPHEN 00797 // U+02011 NON-BREAKING HYPHEN 00798 // U+02012 FIGURE DASH 00799 // U+02013 EN DASH 00800 // U+02014 EM DASH 00801 // U+02015 HORIZONTAL BAR 00802 // U+02053 SWUNG DASH 00803 // U+0207b SUPERSCRIPT MINUS 00804 // U+0208b SUBSCRIPT MINUS 00805 // U+02212 MINUS SIGN 00806 // U+02e17 DOUBLE OBLIQUE HYPHEN 00807 // U+0301c WAVE DASH 00808 // U+03030 WAVY DASH 00809 // U+030a0 KATAKANA-HIRAGANA DOUBLE HYPHEN 00810 // U+0fe31 PRESENTATION FORM FOR VERTICAL EM DASH 00811 // U+0fe32 PRESENTATION FORM FOR VERTICAL EN DASH 00812 // U+0fe58 SMALL EM DASH 00813 // U+0fe63 SMALL HYPHEN-MINUS 00814 // U+0ff0d FULLWIDTH HYPHEN-MINUS 00815 ucfPrDash = 4, 00816 // - For a machine-readable list of deprecated characters. No characters will ever be removed 00817 // from the standard, but the usage of deprecated characters is strongly discouraged. 00818 ucfPrDeprecated = 8, 00819 // - Characters that linguistically modify the meaning of another character to which they apply. 00820 // Some diacritics are not combining characters, and some combining characters are not diacritics. 00821 ucfPrDiacritic = 0x10, 00822 // - Characters whose principal function is to extend the value or shape of a preceding alphabetic 00823 // character. Typical of these are length and iteration marks. 00824 ucfPrExtender = 0x20, 00825 // - Used in determining default grapheme cluster boundaries. For more information, see UAX #29: Text Boundaries. 00826 ucfPrGraphemeLink = 0x40, 00827 // - Characters commonly used for the representation of hexadecimal numbers, plus their compatibility equivalents. 00828 // [= AsciiHexDigit + fullwidth digit {0..9} + fullwidth latin {small|capital} letter {a..f}] 00829 ucfPrHexDigit = 0x80, 00830 // - Those dashes used to mark connections between pieces of words, plus the Katakana middle dot. 00831 // The Katakana middle dot functions like a hyphen, but is shaped like a dot rather than a dash. 00832 // U+0002d HYPHEN-MINUS 00833 // U+000ad SOFT HYPHEN 00834 // U+0058a ARMENIAN HYPHEN 00835 // U+01806 MONGOLIAN TODO SOFT HYPHEN 00836 // U+02010 HYPHEN 00837 // U+02011 NON-BREAKING HYPHEN 00838 // U+02e17 DOUBLE OBLIQUE HYPHEN 00839 // U+030fb KATAKANA MIDDLE DOT 00840 // U+0fe63 SMALL HYPHEN-MINUS 00841 // U+0ff0d FULLWIDTH HYPHEN-MINUS 00842 // U+0ff65 HALFWIDTH KATAKANA MIDDLE DOT 00843 ucfPrHyphen = 0x100, 00844 // - Characters considered to be CJKV (Chinese, Japanese, Korean, and Vietnamese) ideographs. 00845 ucfPrIdeographic = 0x200, 00846 // - Those format control characters which have specific functions for control of cursive joining and ligation. 00847 ucfPrJoinControl = 0x400, 00848 // - There are a small number of characters that do not use logical order. 00849 // These characters require special handling in most processing. 
00850 ucfPrLogicalOrderException = 0x800, 00851 // - Code points that are permanently reserved for internal use. 00852 ucfPrNoncharacterCodePoint = 0x1000, 00853 // - Used for pattern syntax as described in UAX #31: Identifier and Pattern Syntax. 00854 ucfPrPatternSyntax = 0x2000, 00855 ucfPrPatternWhiteSpace = 0x4000, 00856 // - Those punctuation characters that function as quotation marks. 00857 // U+00022 QUOTATION MARK 00858 // U+00027 APOSTROPHE 00859 // U+000ab LEFT-POINTING DOUBLE ANGLE QUOTATION MARK 00860 // U+000bb RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK 00861 // U+02018 LEFT SINGLE QUOTATION MARK 00862 // U+02019 RIGHT SINGLE QUOTATION MARK 00863 // U+0201a SINGLE LOW-9 QUOTATION MARK 00864 // U+0201b SINGLE HIGH-REVERSED-9 QUOTATION MARK 00865 // U+0201c LEFT DOUBLE QUOTATION MARK 00866 // U+0201d RIGHT DOUBLE QUOTATION MARK 00867 // U+0201e DOUBLE LOW-9 QUOTATION MARK 00868 // U+0201f DOUBLE HIGH-REVERSED-9 QUOTATION MARK 00869 // U+02039 SINGLE LEFT-POINTING ANGLE QUOTATION MARK 00870 // U+0203a SINGLE RIGHT-POINTING ANGLE QUOTATION MARK 00871 // U+0300c LEFT CORNER BRACKET 00872 // U+0300d RIGHT CORNER BRACKET 00873 // U+0300e LEFT WHITE CORNER BRACKET 00874 // U+0300f RIGHT WHITE CORNER BRACKET 00875 // U+0301d REVERSED DOUBLE PRIME QUOTATION MARK 00876 // U+0301e DOUBLE PRIME QUOTATION MARK 00877 // U+0301f LOW DOUBLE PRIME QUOTATION MARK 00878 // U+0fe41 PRESENTATION FORM FOR VERTICAL LEFT CORNER BRACKET 00879 // U+0fe42 PRESENTATION FORM FOR VERTICAL RIGHT CORNER BRACKET 00880 // U+0fe43 PRESENTATION FORM FOR VERTICAL LEFT WHITE CORNER BRACKET 00881 // U+0fe44 PRESENTATION FORM FOR VERTICAL RIGHT WHITE CORNER BRACKET 00882 // U+0ff02 FULLWIDTH QUOTATION MARK 00883 // U+0ff07 FULLWIDTH APOSTROPHE 00884 // U+0ff62 HALFWIDTH LEFT CORNER BRACKET 00885 // U+0ff63 HALFWIDTH RIGHT CORNER BRACKET 00886 ucfPrQuotationMark = 0x8000, 00887 // - Characters with a "soft dot", like i or j. An accent placed on these characters causes the dot to disappear. 00888 // An explicit _dot above_ can be added where required, such as in Lithuanian. 00889 ucfPrSoftDotted = 0x10000, 00890 // - Sentence Terminal. Used in UAX #29: Text Boundaries. 00891 // U+00021 EXCLAMATION MARK 00892 // U+0002e FULL STOP 00893 // U+0003f QUESTION MARK 00894 // U+0203c DOUBLE EXCLAMATION MARK 00895 // U+0203d INTERROBANG 00896 // U+02047 DOUBLE QUESTION MARK 00897 // U+02048 QUESTION EXCLAMATION MARK 00898 // U+02049 EXCLAMATION QUESTION MARK 00899 // U+03002 IDEOGRAPHIC FULL STOP 00900 // [plus many characters from other writing systems] 00901 ucfPrSTerm = 0x20000, 00902 // - Those punctuation characters that generally mark the end of textual units. 00903 // [JB note: this set contains more character than STerm. For example, it contains 00904 // the comma, colon and semicolon, whereas STerm doesn't.] 00905 // U+00021 EXCLAMATION MARK 00906 // U+0002c COMMA 00907 // U+0002e FULL STOP 00908 // U+0003a COLON 00909 // U+0003b SEMICOLON 00910 // U+0003f QUESTION MARK 00911 // U+0203c DOUBLE EXCLAMATION MARK 00912 // U+0203d INTERROBANG 00913 // U+02047 DOUBLE QUESTION MARK 00914 // U+02048 QUESTION EXCLAMATION MARK 00915 // U+02049 EXCLAMATION QUESTION MARK 00916 // [plus *lots* of charcters from other writing systems] 00917 ucfPrTerminalPunctuation = 0x40000, 00918 // - Indicates all those characters that qualify as Variation Selectors. 00919 // For details on the behavior of these characters, see StandardizedVariants.html and 00920 // Section 16.4, Variation Selectors in [Unicode]. 
00921 ucfPrVariationSelector = 0x80000, 00922 // - Those separator characters and control characters which should be treated by 00923 // programming languages as "white space" for the purpose of parsing elements. 00924 // Note: ZERO WIDTH SPACE and ZERO WIDTH NO-BREAK SPACE are not included, 00925 // since their functions are restricted to line-break control. 00926 // Their names are unfortunately misleading in this respect. 00927 // Note: There are other senses of "whitespace" that encompass a different set of characters. 00928 // [JB note: e.g. there's a BIDI class for whitespace ('WS') in UnicodeData.txt. 00929 // There's also a "Sp" class in the sentence boundary algorithm, see UAX #29, sec. 5.1.] 00930 // This includes the following characters: 00931 // U+0009 <control> 00932 // U+000a <control> 00933 // U+000b <control> 00934 // U+000c <control> 00935 // U+000d <control> 00936 // U+0020 SPACE 00937 // U+0085 <control> 00938 // U+00a0 NO-BREAK SPACE 00939 // U+1680 OGHAM SPACE MARK 00940 // U+180e MONGOLIAN VOWEL SEPARATOR 00941 // U+2000 EN QUAD 00942 // U+2001 EM QUAD 00943 // U+2002 EN SPACE 00944 // U+2003 EM SPACE 00945 // U+2004 THREE-PER-EM SPACE 00946 // U+2005 FOUR-PER-EM SPACE 00947 // U+2006 SIX-PER-EM SPACE 00948 // U+2007 FIGURE SPACE 00949 // U+2008 PUNCTUATION SPACE 00950 // U+2009 THIN SPACE 00951 // U+200a HAIR SPACE 00952 // U+2028 LINE SEPARATOR 00953 // U+2029 PARAGRAPH SEPARATOR 00954 // U+202f NARROW NO-BREAK SPACE 00955 // U+205f MEDIUM MATHEMATICAL SPACE 00956 // U+3000 IDEOGRAPHIC SPACE 00957 ucfPrWhiteSpace = 0x100000 00958 } 00959 TUniChProperties; 00960 00961 typedef enum TUniChPropertiesX_ 00962 { 00963 // More properties from PropList.txt. 00964 // - Used to derive the properties in DerivedCoreProperties.txt. 00965 ucfPxOtherAlphabetic = 1, 00966 ucfPxOtherDefaultIgnorableCodePoint = 2, 00967 ucfPxOtherGraphemeExtend = 4, 00968 ucfPxOtherIdContinue = 8, 00969 ucfPxOtherIdStart = 0x10, 00970 ucfPxOtherLowercase = 0x20, 00971 ucfPxOtherMath = 0x40, 00972 ucfPxOtherUppercase = 0x80, 00973 // - Used in ideographic description sequences. 
00974 ucfPxIdsBinaryOperator = 0x100, 00975 ucfPxIdsTrinaryOperator = 0x200, 00976 ucfPxRadical = 0x400, 00977 ucfPxUnifiedIdeograph = 0x800 00978 } 00979 TUniChPropertiesX; 00980 00981 //----------------------------------------------------------------------------- 00982 // TUniChInfo -- contains information about a single Unicode codepoint 00983 //----------------------------------------------------------------------------- 00984 00985 class TUniChInfo 00986 { 00987 public: 00988 enum { // combining classes (for 'combClass'); from UnicodeData.txt 00989 ccStarter = 0, // 0: Spacing, split, enclosing, reordrant, and Tibetan subjoined 00990 ccOverlaysAndInterior = 1, 00991 ccNuktas = 7, 00992 ccHiraganaKatakanaVoicingMarks = 8, 00993 ccViramas = 9, 00994 ccFixedPositionStart = 10, // Start of fixed position classes 00995 ccFixedPositionEnd = 199, // End of fixed position classes 00996 ccBelowLeftAttached = 200, 00997 ccBelowAttached = 202, 00998 ccBelowRightAttached = 204, 00999 ccLeftAttached = 208, // Left attached (reordrant around single base character) 01000 ccRightAttached = 210, 01001 ccAboveLeftAttached = 212, 01002 ccAboveAttached = 214, 01003 ccAboveRightAttached = 216, 01004 ccBelowLeft = 218, 01005 ccBelow = 220, 01006 ccBelowRight = 222, 01007 ccLeft = 224, // Left (reordrant around single base character) 01008 ccRight = 226, 01009 ccAboveLeft = 228, 01010 ccAbove = 230, 01011 ccAboveRight = 232, 01012 ccDoubleBelow = 233, 01013 ccDoubleAbove = 234, 01014 ccBelowIotaSubscript = 240, // Below (iota subscript) 01015 ccInvalid = 255 // not defined by Unicode 01016 }; 01017 char chCat, chSubCat; // chCat + chSubCat together comprise the general category (from UnicodeData.txt) 01018 uchar combClass; // canonical combining class 01019 TUniChCategory cat; // = TUniChCategory(chCat) 01020 TUniChSubCategory subCat; // = TUniChSubCategory(cat << 8 | subCat) 01021 signed char script; // keyId into 'TUniChDb.scriptNames'; -1 if unknown 01022 int simpleUpperCaseMapping, simpleLowerCaseMapping, simpleTitleCaseMapping; // from UnicodeData.txt 01023 int decompOffset; // offset into 'TUniChDb.decompositions'; or -1 if the character doesn't change during decomposition 01024 int nameOffset; // offset into 'TUniChDb.charNames' 01025 int flags; // a combination of TUniChFlags 01026 int properties; // a combination of TUniChProperties 01027 int propertiesX; // a combination of TUniChPropertiesX 01028 ushort lineBreak; // from LineBreak.txt 01029 01030 // Converts a 2-letter linebreak code into a 16-bit integer. 01031 static inline ushort GetLineBreakCode(char c1, char c2) { return ((static_cast<ushort>(static_cast<uchar>(c1)) & 0xff) << 8) | ((static_cast<ushort>(static_cast<uchar>(c2)) & 0xff)); } 01032 static const ushort LineBreak_Unknown, LineBreak_ComplexContext, LineBreak_Numeric, LineBreak_InfixNumeric, LineBreak_Quotation; 01033 01034 public: 01035 void InitAfterLoad() { 01036 cat = (TUniChCategory) chCat; 01037 subCat = (TUniChSubCategory) (((static_cast<int>(static_cast<uchar>(chCat)) & 0xff) << 8) | (static_cast<int>(static_cast<uchar>(chSubCat)) & 0xff)); } 01038 void SetCatAndSubCat(const TUniChSubCategory catAndSubCat) { 01039 cat = (TUniChCategory) ((int(catAndSubCat) >> 8) & 0xff); 01040 subCat = catAndSubCat; 01041 chCat = (char) cat; chSubCat = (char) (int(subCat) & 0xff); } 01042 friend class TUniChDb; 01043 01044 // Inexplicably missing from TSIn/TSOut... 
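// Illustrative note (not part of the original header): GetLineBreakCode packs a
// two-letter line-breaking class from LineBreak.txt into a ushort, e.g.
// GetLineBreakCode('N', 'U') == 0x4e55; the LineBreak_* constants above presumably
// hold such packed codes.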
01045 static inline void LoadUShort(TSIn& SIn, ushort& u) { SIn.LoadBf(&u, sizeof(u)); } 01046 static inline void LoadSChar(TSIn& SIn, signed char& u) { SIn.LoadBf(&u, sizeof(u)); } 01047 static inline void SaveUShort(TSOut& SOut, ushort u) { SOut.SaveBf(&u, sizeof(u)); } 01048 static inline void SaveSChar(TSOut& SOut, signed char u) { SOut.SaveBf(&u, sizeof(u)); } 01049 01050 public: 01051 void Save(TSOut& SOut) const { 01052 SOut.Save(chCat); SOut.Save(chSubCat); SOut.Save(combClass); SaveSChar(SOut, script); 01053 SOut.Save(simpleUpperCaseMapping); SOut.Save(simpleLowerCaseMapping); SOut.Save(simpleTitleCaseMapping); 01054 SOut.Save(decompOffset); SOut.Save(nameOffset); 01055 SOut.Save(flags); SOut.Save(properties); SOut.Save(propertiesX); SaveUShort(SOut, lineBreak); } 01056 void Load(TSIn& SIn) { 01057 SIn.Load(chCat); SIn.Load(chSubCat); SIn.Load(combClass); LoadSChar(SIn, script); 01058 SIn.Load(simpleUpperCaseMapping); SIn.Load(simpleLowerCaseMapping); SIn.Load(simpleTitleCaseMapping); 01059 SIn.Load(decompOffset); SIn.Load(nameOffset); 01060 SIn.Load(flags); SIn.Load(properties); SIn.Load(propertiesX); LoadUShort(SIn, lineBreak); InitAfterLoad(); } 01061 explicit TUniChInfo(TSIn& SIn) { Load(SIn); } 01062 TUniChInfo() : chCat(char(ucOther)), chSubCat(char(ucOtherNotAssigned & 0xff)), combClass(ccInvalid), 01063 script(-1),simpleUpperCaseMapping(-1), simpleLowerCaseMapping(-1), simpleTitleCaseMapping(-1), 01064 decompOffset(-1), nameOffset(-1), flags(0), properties(0), propertiesX(0), lineBreak(LineBreak_Unknown) { 01065 InitAfterLoad(); } 01066 01067 // DerivedCoreProperties flags. 01068 bool IsDcpFlag(const TUniChFlags flag) const { Assert((flag & ucfDcpMask) == flag); return (flags & flag) == flag; } 01069 void ClrDcpFlags() { flags = flags & ~ucfDcpMask; } 01070 void SetDcpFlag(const TUniChFlags flag) { Assert((flag & ucfDcpMask) == flag); flags |= flag; } 01071 bool IsAlphabetic() const { return IsDcpFlag(ucfDcpAlphabetic); } 01072 bool IsUppercase() const { return IsDcpFlag(ucfDcpUppercase); } 01073 bool IsLowercase() const { return IsDcpFlag(ucfDcpLowercase); } 01074 bool IsMath() const { return IsDcpFlag(ucfDcpMath); } 01075 bool IsDefaultIgnorable() const { return IsDcpFlag(ucfDcpDefaultIgnorableCodePoint); } 01076 bool IsGraphemeBase() const { return IsDcpFlag(ucfDcpGraphemeBase); } 01077 bool IsGraphemeExtend() const { return IsDcpFlag(ucfDcpGraphemeExtend); } 01078 bool IsIdStart() const { return IsDcpFlag(ucfDcpIdStart); } 01079 bool IsIdContinue() const { return IsDcpFlag(ucfDcpIdContinue); } 01080 bool IsXidStart() const { return IsDcpFlag(ucfDcpXidStart); } 01081 bool IsXidContinue() const { return IsDcpFlag(ucfDcpXidContinue); } 01082 01083 // PropList.txt flags. 
01084 bool IsProperty(const TUniChProperties flag) const { return (properties & flag) == flag; } 01085 void SetProperty(const TUniChProperties flag) { properties |= flag; } 01086 bool IsAsciiHexDigit() const { return IsProperty(ucfPrAsciiHexDigit); } 01087 bool IsBidiControl() const { return IsProperty(ucfPrBidiControl); } 01088 bool IsDash() const { return IsProperty(ucfPrDash); } 01089 bool IsDeprecated() const { return IsProperty(ucfPrDeprecated); } 01090 bool IsDiacritic() const { return IsProperty(ucfPrDiacritic); } 01091 bool IsExtender() const { return IsProperty(ucfPrExtender); } 01092 bool IsGraphemeLink() const { return IsProperty(ucfPrGraphemeLink); } 01093 bool IsHexDigit() const { return IsProperty(ucfPrHexDigit); } 01094 bool IsHyphen() const { return IsProperty(ucfPrHyphen); } 01095 bool IsIdeographic() const { return IsProperty(ucfPrIdeographic); } 01096 bool IsJoinControl() const { return IsProperty(ucfPrJoinControl); } 01097 bool IsLogicalOrderException() const { return IsProperty(ucfPrLogicalOrderException); } 01098 bool IsNoncharacter() const { return IsProperty(ucfPrNoncharacterCodePoint); } 01099 bool IsQuotationMark() const { return IsProperty(ucfPrQuotationMark); } 01100 bool IsSoftDotted() const { return IsProperty(ucfPrSoftDotted); } 01101 bool IsSTerminal() const { return IsProperty(ucfPrSTerm); } 01102 bool IsTerminalPunctuation() const { return IsProperty(ucfPrTerminalPunctuation); } 01103 bool IsVariationSelector() const { return IsProperty(ucfPrVariationSelector); } 01104 bool IsWhiteSpace() const { return IsProperty(ucfPrWhiteSpace); } 01105 01106 // Additional PropList.txt flags. 01107 bool IsPropertyX(const TUniChPropertiesX flag) const { return (propertiesX & flag) == flag; } 01108 void SetPropertyX(const TUniChPropertiesX flag) { propertiesX |= flag; } 01109 01110 // Miscellaneous flags. 01111 bool IsCompositionExclusion() const { return (flags & ucfCompositionExclusion) == ucfCompositionExclusion; } 01112 bool IsCompatibilityDecomposition() const { return (flags & ucfCompatibilityDecomposition) == ucfCompatibilityDecomposition; } 01113 01114 // Word-boundary flags. 01115 bool IsWbFlag(const TUniChFlags flag) const { Assert((flag & ucfWbMask) == flag); return (flags & flag) == flag; } 01116 void ClrWbAndSbFlags() { flags = flags & ~(ucfWbMask | ucfSbMask); } 01117 void SetWbFlag(const TUniChFlags flag) { Assert((flag & ucfWbMask) == flag); flags |= flag; } 01118 int GetWbFlags() const { return flags & ucfWbMask; } 01119 bool IsWbFormat() const { return IsWbFlag(ucfWbFormat); } 01120 TStr GetWbFlagsStr() const { return GetWbFlagsStr(GetWbFlags()); } 01121 static TStr GetWbFlagsStr(const int flags) { return TStr("") + (flags & ucfWbALetter ? "A" : "") + 01122 (flags & ucfWbFormat ? "F" : "") + (flags & ucfWbKatakana ? "K" : "") + (flags & ucfWbMidLetter ? "M" : "") + 01123 (flags & ucfWbMidNum ? "m" : "") + (flags & ucfWbNumeric ? "N" : "") + (flags & ucfWbExtendNumLet ? "E" : ""); } 01124 01125 // Sentence-boundary flags. 01126 bool IsSbFlag(const TUniChFlags flag) const { Assert((flag & ucfSbMask) == flag); return (flags & flag) == flag; } 01127 void SetSbFlag(const TUniChFlags flag) { Assert((flag & ucfSbMask) == flag); flags |= flag; } 01128 int GetSbFlags() const { return flags & ucfSbMask; } 01129 bool IsSbFormat() const { return IsSbFlag(ucfSbFormat); } 01130 TStr GetSbFlagsStr() const { return GetSbFlagsStr(GetSbFlags()); } 01131 static TStr GetSbFlagsStr(const int flags) { return TStr("") + (flags & ucfSbSep ? 
"S" : "") + 01132 (flags & ucfSbFormat ? "F" : "") + (flags & ucfSbSp ? "_" : "") + (flags & ucfSbLower ? "L" : "") + 01133 (flags & ucfSbUpper ? "U" : "") + (flags & ucfSbOLetter ? "O" : "") + (flags & ucfSbNumeric ? "N" : "") + 01134 (flags & ucfSbATerm ? "A" : "") + (flags & ucfSbSTerm ? "T" : "") + (flags & ucfSbClose ? "C" : ""); } 01135 01136 bool IsSbSep() const { return (flags & ucfSbSep) == ucfSbSep; } 01137 01138 // Grapheme-boundary flags. 01139 bool IsGbExtend() const { return IsGraphemeExtend(); } 01140 01141 // Sec. 3.13, D47: C is cased iff it is uppercase, lowercase, or general_category == titlecase_letter. 01142 bool IsCased() const { return IsUppercase() || IsLowercase() || (subCat == ucLetterTitlecase); } 01143 01144 // Character categories. 01145 TUniChCategory GetCat() const { return (TUniChCategory) cat; } 01146 TUniChSubCategory GetSubCat() const { return (TUniChSubCategory) subCat; } 01147 // The following characters belong to the 'symbol/currency' subcategory: 01148 // U+00024 DOLLAR SIGN 01149 // U+000a2 CENT SIGN 01150 // U+000a3 POUND SIGN 01151 // U+000a4 CURRENCY SIGN 01152 // U+000a5 YEN SIGN 01153 // U+020a3 FRENCH FRANC SIGN 01154 // U+020a4 LIRA SIGN 01155 // U+020ac EURO SIGN 01156 // [and plenty of others] 01157 bool IsCurrency() const { return subCat == ucSymbolCurrency; } 01158 // Note: most private-use and surrogate characters aren't listed explicitly in UnicodeData.txt. 01159 // Thus, it's better to call TUniChDb's versions of these methods, which are aware of 01160 // the full ranges of private-use and surrogate characters. 01161 bool IsPrivateUse() const { return subCat == ucOtherPrivateUse; } 01162 bool IsSurrogate() const { return subCat == ucOtherSurrogate; } 01163 01164 inline static bool IsValidSubCat(const char chCat, const char chSubCat) { 01165 static const char s[] = "LuLlLtLmLoMnMcMeNdNlNoPcPdPsPePiPfPoSmScSkSoZsZlZpCcCfCsCoCn"; 01166 for (const char *p = s; *p; p += 2) 01167 if (chCat == p[0] && chSubCat == p[1]) return true; 01168 return false; } 01169 }; 01170 01171 //----------------------------------------------------------------------------- 01172 // TUniTrie -- a trie for suffixes that should not appear at the end 01173 // of a sentence 01174 //----------------------------------------------------------------------------- 01175 01176 template<typename TItem_> 01177 class TUniTrie 01178 { 01179 public: 01180 typedef TItem_ TItem; 01181 protected: 01182 class TNode { 01183 public: 01184 TItem item; 01185 int child, sib; 01186 bool terminal; 01187 TNode() : child(-1), sib(-1), terminal(false) { } 01188 TNode(const TItem& item_, const int child_, const int sib_, const bool terminal_) : item(item_), child(child_), sib(sib_), terminal(terminal_) { } 01189 }; 01190 typedef TVec<TNode> TNodeV; 01191 typedef TPair<TItem, TItem> TItemPr; 01192 typedef TTriple<TItem, TItem, TItem> TItemTr; 01193 typedef TUniVecIdx TVecIdx; 01194 THash<TItem, TVoid> singles; // 01195 THash<TItemPr, TVoid> pairs; 01196 THash<TItemTr, TInt> roots; 01197 TNodeV nodes; 01198 public: 01199 TUniTrie() { } 01200 void Clr() { singles.Clr(); pairs.Clr(); roots.Clr(); nodes.Clr(); } 01201 01202 bool Empty() const { return singles.Empty() && pairs.Empty() && roots.Empty(); } 01203 01204 bool Has1Gram(const TItem& item) const { return singles.IsKey(item); } 01205 bool Has2Gram(const TItem& last, const TItem& butLast) const { return pairs.IsKey(TItemPr(last, butLast)); } 01206 int Get3GramRoot(const TItem& last, const TItem& butLast, const TItem& butButLast) const { 01207 
int keyId = roots.GetKeyId(TItemTr(last, butLast, butButLast)); 01208 if (keyId < 0) return 0; else return roots[keyId]; } 01209 int GetChild(const int parentIdx, const TItem& item) const { 01210 for (int childIdx = nodes[parentIdx].child; childIdx >= 0; ) { 01211 const TNode &node = nodes[childIdx]; 01212 if (node.item == item) return childIdx; 01213 childIdx = node.sib; } 01214 return -1; } 01215 bool IsNodeTerminal(const int nodeIdx) const { return nodes[nodeIdx].terminal; } 01216 01217 // Adds a new string to the trie. Note that the last characters appear 01218 // closer to the root of the trie. 01219 template<typename TSrcVec> 01220 void Add(const TSrcVec& src, const size_t srcIdx, const size_t srcCount) 01221 { 01222 IAssert(srcCount > 0); 01223 if (srcCount == 1) { singles.AddKey(TItem(src[TVecIdx(srcIdx)])); return; } 01224 if (srcCount == 2) { pairs.AddKey(TItemPr(TItem(src[TVecIdx(srcIdx + 1)]), TItem(src[TVecIdx(srcIdx)]))); return; } 01225 size_t srcLast = srcIdx + (srcCount - 1); 01226 TItemTr tr = TItemTr(TItem(src[TVecIdx(srcLast)]), TItem(src[TVecIdx(srcLast - 1)]), TItem(src[TVecIdx(srcLast - 2)])); 01227 int keyId = roots.GetKeyId(tr), curNodeIdx = -1; 01228 if (keyId >= 0) curNodeIdx = roots[keyId]; 01229 else { curNodeIdx = nodes.Add(TNode(TItem(0), -1, -1, false)); roots.AddDat(tr, curNodeIdx); } 01230 // 01231 if (srcCount > 3) for (size_t srcPos = srcLast - 3; ; ) 01232 { 01233 const TItem curItem = src[TVecIdx(srcPos)]; 01234 int childNodeIdx = nodes[curNodeIdx].child; 01235 while (childNodeIdx >= 0) { 01236 TNode &childNode = nodes[childNodeIdx]; 01237 if (childNode.item == curItem) break; 01238 childNodeIdx = childNode.sib; } 01239 if (childNodeIdx < 0) { 01240 childNodeIdx = nodes.Add(TNode(curItem, -1, nodes[curNodeIdx].child, false)); 01241 nodes[curNodeIdx].child = childNodeIdx; } 01242 curNodeIdx = childNodeIdx; 01243 if (srcPos == srcIdx) break; else srcPos--; 01244 } 01245 nodes[curNodeIdx].terminal = true; 01246 } 01247 01248 template<typename TSrcVec> 01249 void Add(const TSrcVec& src) { Add(src, 0, (size_t) src.Len()); } 01250 }; 01251 01252 //----------------------------------------------------------------------------- 01253 // TUniChDb -- provides access to the Unicode Character Database 01254 //----------------------------------------------------------------------------- 01255 01256 class TUniChDb 01257 { 01258 protected: 01259 void InitAfterLoad(); 01260 typedef TUniVecIdx TVecIdx; 01261 01262 public: 01263 THash<TInt, TUniChInfo> h; // key: codepoint 01264 TStrPool charNames; 01265 TStrIntH scripts; // keyID = used in TUniChInfo.script; key = script name; dat = number of characters (informative only) 01266 TIntV decompositions; 01267 THash<TIntPr, TInt> inverseDec; 01268 TUniCaseFolding caseFolding; 01269 // These hash tables contain only the unconditional mappings from SpecialCasing.txt. 01270 // The conditional mappings are hardcoded into GetCaseConverted(). 
01271 TIntIntVH specialCasingLower, specialCasingUpper, specialCasingTitle; 01272 int scriptUnknown; // = scripts.GetKey("Unknown") 01273 01274 TUniChDb() : scriptUnknown(-1) { } 01275 explicit TUniChDb(TSIn& SIn) { Load(SIn); } 01276 void Clr() { 01277 h.Clr(); charNames.Clr(); decompositions.Clr(); inverseDec.Clr(); caseFolding.Clr(); 01278 specialCasingLower.Clr(); specialCasingUpper.Clr(); specialCasingTitle.Clr(); 01279 scripts.Clr(); } 01280 void Save(TSOut& SOut) const { 01281 h.Save(SOut); charNames.Save(SOut); decompositions.Save(SOut); 01282 inverseDec.Save(SOut); caseFolding.Save(SOut); scripts.Save(SOut); 01283 specialCasingLower.Save(SOut); specialCasingUpper.Save(SOut); specialCasingTitle.Save(SOut); 01284 SOut.SaveCs(); } 01285 void Load(TSIn& SIn) { 01286 h.Load(SIn); charNames.~TStrPool(); new (&charNames) TStrPool(SIn); 01287 decompositions.Load(SIn); 01288 inverseDec.Load(SIn); caseFolding.Load(SIn); scripts.Load(SIn); 01289 specialCasingLower.Load(SIn); specialCasingUpper.Load(SIn); specialCasingTitle.Load(SIn); 01290 SIn.LoadCs(); InitAfterLoad(); } 01291 void LoadBin(const TStr& fnBin) { 01292 PSIn SIn = TFIn::New(fnBin); Load(*SIn); } 01293 void Test(const TStr& basePath); 01294 01295 // File names used by LoadTxt() and its subroutines. 01296 static TStr GetCaseFoldingFn() { return "CaseFolding.txt"; } 01297 static TStr GetSpecialCasingFn() { return "SpecialCasing.txt"; } 01298 static TStr GetUnicodeDataFn() { return "UnicodeData.txt"; } 01299 static TStr GetCompositionExclusionsFn() { return "CompositionExclusions.txt"; } 01300 static TStr GetScriptsFn() { return "Scripts.txt"; } 01301 static TStr GetDerivedCorePropsFn() { return "DerivedCoreProperties.txt"; } 01302 static TStr GetLineBreakFn() { return "LineBreak.txt"; } 01303 static TStr GetPropListFn() { return "PropList.txt"; } 01304 static TStr GetAuxiliaryDir() { return "auxiliary"; } 01305 static TStr GetWordBreakTestFn() { return "WordBreakTest.txt"; } 01306 static TStr GetWordBreakPropertyFn() { return "WordBreakProperty.txt"; } 01307 static TStr GetSentenceBreakTestFn() { return "SentenceBreakTest.txt"; } 01308 static TStr GetSentenceBreakPropertyFn() { return "SentenceBreakProperty.txt"; } 01309 static TStr GetNormalizationTestFn() { return "NormalizationTest.txt"; } 01310 static TStr GetBinFn() { return "UniChDb.bin"; } // used only by Test() 01311 01312 //------------------------------------------------------------------------- 01313 // Script names 01314 //------------------------------------------------------------------------- 01315 01316 // These constants are used when initializing from the text files. 
01317 static TStr GetScriptNameUnknown() { return "Unknown"; } 01318 static TStr GetScriptNameKatakana() { return "Katakana"; } 01319 static TStr GetScriptNameHiragana() { return "Hiragana"; } 01320 // 01321 const TStr& GetScriptName(const int scriptId) const { return scripts.GetKey(scriptId); } 01322 int GetScriptByName(const TStr& scriptName) const { return scripts.GetKeyId(scriptName); } 01323 int GetScript(const TUniChInfo& ci) const { int s = ci.script; if (s < 0) s = scriptUnknown; return s; } 01324 int GetScript(const int cp) const { int i = h.GetKeyId(cp); if (i < 0) return scriptUnknown; else return GetScript(h[i]); } 01325 01326 //------------------------------------------------------------------------- 01327 // Character names 01328 //------------------------------------------------------------------------- 01329 01330 // GetCharName returns 0 if the name is unknown; GetCharNameS returns a string of the form "U+1234". 01331 const char *GetCharName(const int cp) const { int i = h.GetKeyId(cp); if (i < 0) return 0; int ofs = h[i].nameOffset; return ofs < 0 ? 0 : charNames.GetCStr(ofs); } 01332 TStr GetCharNameS(const int cp) const { 01333 // ToDo: Add special processing for precomposed Hangul syllables (UAX #15, sec. 16). 01334 const char *p = GetCharName(cp); if (p) return p; 01335 char buf[20]; sprintf(buf, "U+%04x", cp); return TStr(buf); } 01336 template<class TSrcVec> void PrintCharNames(FILE *f, const TSrcVec& src, size_t srcIdx, const size_t srcCount, const TStr& prefix) const { 01337 if (! f) f = stdout; 01338 for (const size_t srcEnd = srcIdx + srcCount; srcIdx < srcEnd; srcIdx++) { 01339 fprintf(f, "%s", prefix.CStr()); 01340 int cp = src[TVecIdx(srcIdx)]; fprintf(f, (cp >= 0x10000 ? "U+%05x" : "U+%04x "), cp); 01341 fprintf(f, " %s\n", GetCharNameS(cp).CStr()); }} 01342 template<class TSrcVec> void PrintCharNames(FILE *f, const TSrcVec& src, const TStr& prefix) const { PrintCharNames(f, src, 0, src.Len(), prefix); } 01343 01344 //------------------------------------------------------------------------- 01345 // Character information 01346 //------------------------------------------------------------------------- 01347 // These methods provide access to a subset of the functionality 01348 // available in TUniChInfo.
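// Usage sketch (illustrative; not part of unicode.h): looking up names, scripts and
// properties of a codepoint. It assumes a TUniChDb whose data has been loaded, e.g.
// from the binary file named by GetBinFn().
//   TUniChDb db; db.LoadBin("UniChDb.bin");
//   const int cp = 0x20ac;                       // EURO SIGN (subcategory Sc)
//   printf("%s\n", db.GetCharNameS(cp).CStr());  // "EURO SIGN"
//   if (db.IsCurrency(cp))
//     printf("script: %s\n", db.GetScriptName(db.GetScript(cp)).CStr());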
01349 01350 bool IsGetChInfo(const int cp, TUniChInfo& ChInfo) { 01351 int i = h.GetKeyId(cp); 01352 if (i < 0) return false; else { ChInfo=h[i]; return true; }} 01353 TUniChCategory GetCat(const int cp) const { int i = h.GetKeyId(cp); if (i < 0) return ucOther; else return h[i].cat; } 01354 TUniChSubCategory GetSubCat(const int cp) const { int i = h.GetKeyId(cp); if (i < 0) return ucOtherNotAssigned; else return h[i].subCat; } 01355 01356 bool IsWbFlag(const int cp, const TUniChFlags flag) const { int i = h.GetKeyId(cp); if (i < 0) return false; else return h[i].IsWbFlag(flag); } 01357 int GetWbFlags(const int cp) const { int i = h.GetKeyId(cp); if (i < 0) return 0; else return h[i].GetWbFlags(); } 01358 bool IsSbFlag(const int cp, const TUniChFlags flag) const { int i = h.GetKeyId(cp); if (i < 0) return false; else return h[i].IsSbFlag(flag); } 01359 int GetSbFlags(const int cp) const { int i = h.GetKeyId(cp); if (i < 0) return 0; else return h[i].GetSbFlags(); } 01360 01361 #define ___UniFwd1(name) bool name(const int cp) const { int i = h.GetKeyId(cp); if (i < 0) return false; else return h[i].name(); } 01362 #define ___UniFwd2(name1, name2) ___UniFwd1(name1) ___UniFwd1(name2) 01363 #define ___UniFwd3(name1, name2, name3) ___UniFwd2(name1, name2) ___UniFwd1(name3) 01364 #define ___UniFwd4(name1, name2, name3, name4) ___UniFwd3(name1, name2, name3) ___UniFwd1(name4) 01365 #define ___UniFwd5(name1, name2, name3, name4, name5) ___UniFwd4(name1, name2, name3, name4) ___UniFwd1(name5) 01366 01367 #define DECLARE_FORWARDED_PROPERTY_METHODS \ 01368 ___UniFwd5(IsAsciiHexDigit, IsBidiControl, IsDash, IsDeprecated, IsDiacritic) \ 01369 ___UniFwd5(IsExtender, IsGraphemeLink, IsHexDigit, IsHyphen, IsIdeographic) \ 01370 ___UniFwd5(IsJoinControl, IsLogicalOrderException, IsNoncharacter, IsQuotationMark, IsSoftDotted) \ 01371 ___UniFwd4(IsSTerminal, IsTerminalPunctuation, IsVariationSelector, IsWhiteSpace) \ 01372 ___UniFwd5(IsAlphabetic, IsUppercase, IsLowercase, IsMath, IsDefaultIgnorable) \ 01373 ___UniFwd4(IsGraphemeBase, IsGraphemeExtend, IsIdStart, IsIdContinue) \ 01374 ___UniFwd2(IsXidStart, IsXidContinue) \ 01375 ___UniFwd3(IsCompositionExclusion, IsCompatibilityDecomposition, IsSbSep) \ 01376 ___UniFwd1(IsGbExtend) \ 01377 ___UniFwd2(IsCased, IsCurrency) 01378 01379 DECLARE_FORWARDED_PROPERTY_METHODS 01380 01381 #undef ___UniFwd1 01382 01383 bool IsPrivateUse(const int cp) const { 01384 int i = h.GetKeyId(cp); if (i >= 0) return h[i].IsPrivateUse(); 01385 return (0xe000 <= cp && cp <= 0xf8ff) || // plane 0 private-use area 01386 // Planes 15 and 16 are entirely for private use. 01387 (0xf0000 <= cp && cp <= 0xffffd) || (0x100000 <= cp && cp <= 0x10fffd); } 01388 // Note: d800..dbff are high surrogates, dc00..dfff are low surrogates. 01389 // For db80..dbff it is clear that the surrogate pair containing this high surrogate 01390 // will refer to a private-use codepoint, but IsPrivateUse nevertheless returns false 01391 // for db80..dbff. This is consistent with the category codes assigned in UnicodeData.txt. 01392 bool IsSurrogate(const int cp) const { 01393 int i = h.GetKeyId(cp); if (i >= 0) return h[i].IsSurrogate(); 01394 return 0xd800 <= cp && cp <= 0xdcff; } 01395 01396 // Note: in particular, all Hangul characters (HangulLBase..HangulLBase + HangulLCount - 1 01397 // and HangulSBase..HangulSBase + HangulSCount - 1) should be treated as starters 01398 // for composition to work correctly. 
01399 int GetCombiningClass(const int cp) const { int i = h.GetKeyId(cp); if (i < 0) return TUniChInfo::ccStarter; else return h[i].combClass; } 01400 01401 //------------------------------------------------------------------------- 01402 // Hangul constants 01403 //------------------------------------------------------------------------- 01404 01405 enum { 01406 HangulSBase = 0xAC00, HangulLBase = 0x1100, HangulVBase = 0x1161, HangulTBase = 0x11A7, 01407 HangulLCount = 19, HangulVCount = 21, HangulTCount = 28, 01408 HangulNCount = HangulVCount * HangulTCount, // 588 01409 HangulSCount = HangulLCount * HangulNCount // 11172 01410 }; 01411 01412 //------------------------------------------------------------------------- 01413 // Word boundaries (UAX #29) 01414 //------------------------------------------------------------------------- 01415 01416 protected: 01417 // UAX #29, rule WB3: ignore Format and Extend characters. 01418 // [Note: rule SB5 for sentence boundaries is identical, and thus these methods will also be used for sentence-boundary detection.] 01419 static bool IsWbIgnored(const TUniChInfo& ci) { return ci.IsGbExtend() || ci.IsWbFormat(); } 01420 bool IsWbIgnored(const int cp) const { int i = h.GetKeyId(cp); if (i < 0) return false; else return IsWbIgnored(h[i]); } 01421 // Sets 'position' to the smallest index from 'position..srcEnd-1' that contains a non-ignored character. 01422 template<typename TSrcVec> void WbFindCurOrNextNonIgnored(const TSrcVec& src, size_t& position, const size_t srcEnd) const { 01423 while (position < srcEnd && IsWbIgnored(src[TVecIdx(position)])) position++; } 01424 // Sets 'position' to the smallest index from 'position+1..srcEnd-1' that contains a non-ignored character. 01425 template<typename TSrcVec> void WbFindNextNonIgnored(const TSrcVec& src, size_t& position, const size_t srcEnd) const { 01426 if (position >= srcEnd) return; 01427 position++; while (position < srcEnd && IsWbIgnored(src[TVecIdx(position)])) position++; } 01428 // Sets 'position' to the smallest index from 'position+1..srcEnd-1' that contains a non-ignored character. 01429 template<typename TSrcVec> void WbFindNextNonIgnoredS(const TSrcVec& src, size_t& position, const size_t srcEnd) const { 01430 if (position >= srcEnd) return; 01431 if (IsSbSep(src[TVecIdx(position)])) { position++; return; } 01432 position++; while (position < srcEnd && IsWbIgnored(src[TVecIdx(position)])) position++; } 01433 // Sets 'position' to the largest index from 'srcStart..position-1' that contains a non-ignored character. 01434 template<typename TSrcVec> bool WbFindPrevNonIgnored(const TSrcVec& src, const size_t srcStart, size_t& position) const { 01435 if (position <= srcStart) return false; 01436 while (position > srcStart) { 01437 position--; if (! IsWbIgnored(src[TVecIdx(position)])) return true; } 01438 return false; } 01439 // Test driver for WbFind*NonIgnored. 01440 void TestWbFindNonIgnored(const TIntV& src) const; 01441 void TestWbFindNonIgnored() const; 01442 public: 01443 // Finds the next word boundary strictly after 'position'. 01444 // Note that there is a valid word boundary at 'srcIdx + srcCount'. 01445 // If there is no such word boundary, it returns 'false' and sets 'position' to 'srcIdx + srcCount'. 
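// Usage sketch (illustrative; not part of unicode.h): iterating over the word
// boundaries of a codepoint vector 'text', assuming a loaded TUniChDb 'db'.
//   size_t pos = 0;
//   while (db.FindNextWordBoundary(text, 0, text.Len(), pos))
//     printf("word boundary before index %d\n", int(pos));
//   // Alternatively, FindWordBoundaries (below) fills a TBoolV with one flag per position.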
01446 template<typename TSrcVec> 01447 bool FindNextWordBoundary(const TSrcVec& src, const size_t srcIdx, const size_t srcCount, size_t &position) const; 01448 // Creates, in 'dest', a vector of 'srcCount + 1' elements, where 'dest[i]' tells if there is a word 01449 // boundary between 'src[srcIdx + i - 1]' and 'src[srcIdx + i]'. Note that 'dest[0]' and 'dest[srcCount]' are 01450 // always set to 'true'. 01451 template<typename TSrcVec> 01452 void FindWordBoundaries(const TSrcVec& src, const size_t srcIdx, const size_t srcCount, TBoolV& dest) const; 01453 protected: 01454 void TestFindNextWordOrSentenceBoundary(const TStr& basePath, bool sentence); 01455 01456 //------------------------------------------------------------------------- 01457 // Sentence boundaries (UAX #29) 01458 //------------------------------------------------------------------------- 01459 01460 protected: 01461 TUniTrie<TInt> sbExTrie; 01462 01463 // Checks whether a sentence that ended at src[position - 1] 01464 // would end in one of the suffixes from sbExTrie. 01465 template<typename TSrcVec> 01466 bool CanSentenceEndHere(const TSrcVec& src, const size_t srcIdx, const size_t position) const; 01467 01468 public: 01469 // Finds the next sentence boundary strictly after 'position'. 01470 // Note that there is a valid sentence boundary at 'srcIdx + srcCount'. 01471 // If there is no such sentence boundary, it returns 'false' and sets 'position' to 'srcIdx + srcCount'. 01472 template<typename TSrcVec> 01473 bool FindNextSentenceBoundary(const TSrcVec& src, const size_t srcIdx, const size_t srcCount, size_t &position) const; 01474 // Creates, in 'dest', a vector of 'srcCount + 1' elements, where 'dest[i]' tells if there is a sentence 01475 // boundary between 'src[srcIdx + i - 1]' and 'src[srcIdx + i]'. Note that 'dest[0]' and 'dest[srcCount]' are 01476 // always set to 'true'. 01477 template<typename TSrcVec> 01478 void FindSentenceBoundaries(const TSrcVec& src, const size_t srcIdx, const size_t srcCount, TBoolV& dest) const; 01479 01480 // These methods allow the user to define a set of sentence boundary exceptions. 01481 // This is a set of strings, stored in 'sbExTrie'. If the Unicode rules require 01482 // a sentence boundary in a position that would cause the sentence to end with 01483 // 'x (STerm | ATerm) Close* Sp* Sep?', where 'x' is a word from 'sbExTrie', 01484 // we will *not* place a sentence boundary there. 01485 // 01486 // NOTE: sbExTrie is not saved or loaded by the Save() and Load() methods. 01487 // By default, it is empty. Use SbEx_Clr() to clear it, and SbEx_SetStdEnglish() to obtain 01488 // a standard set of English-language exceptions. 
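// Usage sketch (illustrative; not part of unicode.h): sentence splitting with the
// standard English abbreviation exceptions, for a codepoint vector 'text' and a
// loaded TUniChDb 'db'.
//   db.SbEx_SetStdEnglish();     // e.g. no sentence break right after "Dr." or "e.g."
//   TBoolV bounds;
//   db.FindSentenceBoundaries(text, 0, text.Len(), bounds);
//   // bounds[i] is true iff a sentence boundary lies between text[i - 1] and text[i].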
01489 void SbEx_Clr() { sbExTrie.Clr(); } 01490 template<class TSrcVec> void SbEx_Add(const TSrcVec& v) { sbExTrie.Add(v); } 01491 // template<> void SbEx_Add(const TStr& s) { 01492 void SbEx_Add(const TStr& s) { 01493 TIntV v; int n = s.Len(); v.Gen(n); for (int i = 0; i < n; i++) v[i] = int(uchar(s[i])); SbEx_Add(v); } 01494 void SbEx_AddUtf8(const TStr& s) { TUniCodec codec; TIntV v; codec.DecodeUtf8(s, v); SbEx_Add(v); } 01495 int SbEx_AddMulti(const TStr& words, const bool wordsAreUtf8 = true) { TStrV vec; words.SplitOnAllCh('|', vec); 01496 for (int i = 0; i < vec.Len(); i++) if (wordsAreUtf8) SbEx_AddUtf8(vec[i]); else SbEx_Add(vec[i]); 01497 return vec.Len(); } 01498 void SbEx_Set(const TUniTrie<TInt>& newTrie) { sbExTrie = newTrie; } 01499 int SbEx_SetStdEnglish() { 01500 static const TStr data = "Ms|Mrs|Mr|Rev|Dr|Prof|Gov|Sen|Rep|Gen|Brig|Col|Capt|Lieut|Lt|Sgt|Pvt|Cmdr|Adm|Corp|St|Mt|Ft|e.g|e. g.|i.e|i. e|ib|ibid|s.v|s. v|s.vv|s. vv"; 01501 SbEx_Clr(); return SbEx_AddMulti(data, false); } 01502 01503 //------------------------------------------------------------------------- 01504 // Normalization, decomposition, etc. (UAX #15) 01505 //------------------------------------------------------------------------- 01506 01507 protected: 01508 // Adds, to 'dest', the decomposition of 'codePoint' (calling itself recursively if necessary). 01509 // If 'compatibility == false', only canonical decompositions are used. 01510 template<typename TDestCh> 01511 void AddDecomposition(const int codePoint, TVec<TDestCh>& dest, const bool compatibility) const; 01512 public: 01513 // This appends, to 'dest', the decomposed form of the source string. 01514 // - for normalization form D (NFD), i.e. canonical decomposition: use compatibility == false; 01515 // - for normalization form KD (NFKD), i.e. compatibility decomposition: use compatibility == true. 01516 template<typename TSrcVec, typename TDestCh> 01517 void Decompose(const TSrcVec& src, size_t srcIdx, const size_t srcCount, 01518 TVec<TDestCh>& dest, bool compatibility, bool clrDest = true) const; 01519 template<typename TSrcVec, typename TDestCh> 01520 void Decompose(const TSrcVec& src, TVec<TDestCh>& dest, bool compatibility, bool clrDest = true) const { 01521 Decompose(src, 0, src.Len(), dest, compatibility, clrDest); } 01522 // This performs canonical composition on the source string, and appends 01523 // the result to the destination string. The source string should be the 01524 // result of a (canonical or compatibility) decomposition; if this is the 01525 // case, the composition will lead to a normalization form C (NFC) or 01526 // normalization form KC (NFKC), depending on whether canonical or compatibility 01527 // decomposition was used. 01528 template<typename TSrcVec, typename TDestCh> 01529 void Compose(const TSrcVec& src, size_t srcIdx, const size_t srcCount, 01530 TVec<TDestCh>& dest, bool clrDest = true) const; 01531 template<typename TSrcVec, typename TDestCh> 01532 void Compose(const TSrcVec& src, TVec<TDestCh>& dest, bool clrDest = true) const { 01533 Compose(src, 0, src.Len(), dest, clrDest); } 01534 // Calls Decompose, followed by Compose; thus the result is the NFC (if 01535 // compatibility == false) or NFKC (if compatibility == true) of the source string. 01536 // A temporary TIntV is used to contain the intermediate NF(K)D form of the 01537 // source string. 
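// Usage sketch (illustrative; not part of unicode.h): normalizing a codepoint
// vector 'src' with a loaded TUniChDb 'db'.
//   TIntV nfd, nfc;
//   db.Decompose(src, nfd, false);            // NFD (canonical decomposition)
//   db.DecomposeAndCompose(src, nfc, false);  // NFC
//   db.ExtractStarters(nfd);                  // keep starters only, e.g. to strip diacritics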
01538 template<typename TSrcVec, typename TDestCh> 01539 void DecomposeAndCompose(const TSrcVec& src, size_t srcIdx, const size_t srcCount, 01540 TVec<TDestCh>& dest, bool compatibility, bool clrDest = true) const; 01541 template<typename TSrcVec, typename TDestCh> 01542 void DecomposeAndCompose(const TSrcVec& src, TVec<TDestCh>& dest, bool compatibility, bool clrDest = true) const { 01543 DecomposeAndCompose(src, 0, src.Len(), dest, compatibility, clrDest); } 01544 // Copies the starter characters from 'src' to 'dest'; the other 01545 // characters are skipped. 'src' should already have been decomposed. 01546 // Returns the number of characters extracted. 01547 template<typename TSrcVec, typename TDestCh> 01548 size_t ExtractStarters(const TSrcVec& src, size_t srcIdx, const size_t srcCount, 01549 TVec<TDestCh>& dest, bool clrDest = true) const; 01550 template<typename TSrcVec, typename TDestCh> 01551 size_t ExtractStarters(const TSrcVec& src, TVec<TDestCh>& dest, bool clrDest = true) const { 01552 return ExtractStarters(src, 0, src.Len(), dest, clrDest); } 01553 // Extracts the starters into a temporary vector and then copies it into 'src'. 01554 template<typename TSrcVec> 01555 size_t ExtractStarters(TSrcVec& src) const { 01556 TIntV temp; size_t retVal = ExtractStarters(src, temp); 01557 src.Clr(); for (int i = 0; i < temp.Len(); i++) src.Add(temp[i]); 01558 return retVal; } 01559 01560 protected: 01561 void TestComposition(const TStr& basePath); 01562 01563 //------------------------------------------------------------------------- 01564 // Initialization from the text files 01565 //------------------------------------------------------------------------- 01566 01567 protected: 01568 void InitWordAndSentenceBoundaryFlags(const TStr& basePath); 01569 void InitScripts(const TStr& basePath); 01570 void InitLineBreaks(const TStr& basePath); 01571 void InitDerivedCoreProperties(const TStr& basePath); 01572 void InitPropList(const TStr& basePath); 01573 void InitSpecialCasing(const TStr& basePath); 01574 void LoadTxt_ProcessDecomposition(TUniChInfo& ci, TStr s); 01575 public: 01576 void LoadTxt(const TStr& basePath); 01577 void SaveBin(const TStr& fnBinUcd); 01578 01579 //------------------------------------------------------------------------- 01580 // Case conversions 01581 //------------------------------------------------------------------------- 01582 01583 public: 01584 typedef enum TCaseConversion_ { ccLower = 0, ccUpper = 1, ccTitle = 2, ccMax = 3 } TCaseConversion; 01585 // Appends the case-converted form of 'src' to 'dest'. 01586 // 'how' defines what kind of case conversion is required. 01587 // 'turkic' should be set to true iff the text is in Turkish ('tr') or Azeri ('az'). 01588 // 'lithuanian' should be set to true iff the text is in Lithuanian ('lt').
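// Usage sketch (illustrative; not part of unicode.h): full lowercasing of a codepoint
// vector 'src' with a loaded TUniChDb 'db'. With turkic == true, e.g. U+0049 (I) is
// lowercased to the dotless U+0131 rather than to U+0069.
//   TIntV dest;
//   db.GetLowerCase(src, dest);                          // default behaviour
//   db.GetLowerCase(src, dest, true, /*turkic=*/ true);  // Turkish/Azeri mapping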
01589 template<typename TSrcVec, typename TDestCh> void GetCaseConverted(const TSrcVec& src, size_t srcIdx, const size_t srcCount, TVec<TDestCh>& dest, const bool clrDest, const TCaseConversion how, const bool turkic, const bool lithuanian) const; 01590 template<typename TSrcVec, typename TDestCh> void GetLowerCase(const TSrcVec& src, size_t srcIdx, const size_t srcCount, TVec<TDestCh>& dest, const bool clrDest = true, const bool turkic = false, const bool lithuanian = false) const { GetCaseConverted(src, srcIdx, srcCount, dest, clrDest, ccLower, turkic, lithuanian); } 01591 template<typename TSrcVec, typename TDestCh> void GetUpperCase(const TSrcVec& src, size_t srcIdx, const size_t srcCount, TVec<TDestCh>& dest, const bool clrDest = true, const bool turkic = false, const bool lithuanian = false) const { GetCaseConverted(src, srcIdx, srcCount, dest, clrDest, ccUpper, turkic, lithuanian); } 01592 template<typename TSrcVec, typename TDestCh> void GetTitleCase(const TSrcVec& src, size_t srcIdx, const size_t srcCount, TVec<TDestCh>& dest, const bool clrDest = true, const bool turkic = false, const bool lithuanian = false) const { GetCaseConverted(src, srcIdx, srcCount, dest, clrDest, ccTitle, turkic, lithuanian); } 01593 template<typename TSrcVec, typename TDestCh> void GetLowerCase(const TSrcVec& src, TVec<TDestCh>& dest, const bool clrDest = true, const bool turkic = false, const bool lithuanian = false) const { GetLowerCase(src, 0, src.Len(), dest, clrDest, turkic, lithuanian); } 01594 template<typename TSrcVec, typename TDestCh> void GetUpperCase(const TSrcVec& src, TVec<TDestCh>& dest, const bool clrDest = true, const bool turkic = false, const bool lithuanian = false) const { GetUpperCase(src, 0, src.Len(), dest, clrDest, turkic, lithuanian); } 01595 template<typename TSrcVec, typename TDestCh> void GetTitleCase(const TSrcVec& src, TVec<TDestCh>& dest, const bool clrDest = true, const bool turkic = false, const bool lithuanian = false) const { GetTitleCase(src, 0, src.Len(), dest, clrDest, turkic, lithuanian); } 01596 01597 // GetSimpleCaseConverted uses only the simple case mappings (from UnicodeData.txt). 01598 // This is simpler and faster. Since each character now maps into exactly one 01599 // character, case conversion can also be done in place (see ToSimpleCaseConverted, etc.). 
01600 template<typename TSrcVec, typename TDestCh> void GetSimpleCaseConverted(const TSrcVec& src, size_t srcIdx, const size_t srcCount, TVec<TDestCh>& dest, const bool clrDest, const TCaseConversion how) const; 01601 template<typename TSrcVec, typename TDestCh> void GetSimpleLowerCase(const TSrcVec& src, size_t srcIdx, const size_t srcCount, TVec<TDestCh>& dest, const bool clrDest = true) const { GetSimpleCaseConverted(src, srcIdx, srcCount, dest, clrDest, ccLower); } 01602 template<typename TSrcVec, typename TDestCh> void GetSimpleUpperCase(const TSrcVec& src, size_t srcIdx, const size_t srcCount, TVec<TDestCh>& dest, const bool clrDest = true) const { GetSimpleCaseConverted(src, srcIdx, srcCount, dest, clrDest, ccUpper); } 01603 template<typename TSrcVec, typename TDestCh> void GetSimpleTitleCase(const TSrcVec& src, size_t srcIdx, const size_t srcCount, TVec<TDestCh>& dest, const bool clrDest = true) const { GetSimpleCaseConverted(src, srcIdx, srcCount, dest, clrDest, ccTitle); } 01604 template<typename TSrcVec, typename TDestCh> void GetSimpleLowerCase(const TSrcVec& src, TVec<TDestCh>& dest, const bool clrDest = true) const { GetSimpleLowerCase(src, 0, src.Len(), dest, clrDest); } 01605 template<typename TSrcVec, typename TDestCh> void GetSimpleUpperCase(const TSrcVec& src, TVec<TDestCh>& dest, const bool clrDest = true) const { GetSimpleUpperCase(src, 0, src.Len(), dest, clrDest); } 01606 template<typename TSrcVec, typename TDestCh> void GetSimpleTitleCase(const TSrcVec& src, TVec<TDestCh>& dest, const bool clrDest = true) const { GetSimpleTitleCase(src, 0, src.Len(), dest, clrDest); } 01607 01608 template<typename TSrcVec> void ToSimpleCaseConverted(TSrcVec& src, size_t srcIdx, const size_t srcCount, const TCaseConversion how) const; 01609 template<typename TSrcVec> void ToSimpleUpperCase(TSrcVec& src, size_t srcIdx, const size_t srcCount) const { ToSimpleCaseConverted(src, srcIdx, srcCount, ccUpper); } 01610 template<typename TSrcVec> void ToSimpleLowerCase(TSrcVec& src, size_t srcIdx, const size_t srcCount) const { ToSimpleCaseConverted(src, srcIdx, srcCount, ccLower); } 01611 template<typename TSrcVec> void ToSimpleTitleCase(TSrcVec& src, size_t srcIdx, const size_t srcCount) const { ToSimpleCaseConverted(src, srcIdx, srcCount, ccTitle); } 01612 template<typename TSrcVec> void ToSimpleUpperCase(TSrcVec& src) const { ToSimpleUpperCase(src, 0, src.Len()); } 01613 template<typename TSrcVec> void ToSimpleLowerCase(TSrcVec& src) const { ToSimpleLowerCase(src, 0, src.Len()); } 01614 template<typename TSrcVec> void ToSimpleTitleCase(TSrcVec& src) const { ToSimpleTitleCase(src, 0, src.Len()); } 01615 01616 public: 01617 friend class TUniCaseFolding; 01618 01619 // Case folding is an alternative to the above functions. It is intended primarily 01620 // to produce strings that are suitable for comparisons. For example, 01621 // ToLowerCase(sigma) = sigma, ToLowerCase(final-sigma) = final-sigma; 01622 // but ToCaseFolder(sigma) = sigma, ToCaseFolded(final-sigma) = sigma. 01623 // - 'turkic' enables special processing for Turkic languages (I-dot and i-dotless). 01624 // - 'full' enables full case mappings -- i.e. sometimes a character may be mapped 01625 // into a string of two or more characters. 01626 // - Note: For best results, perform NFD(CaseFold(NFD(x)) or NFKD(CaseFold(NFKD(x)) on 01627 // each string before comparing them (see sec. 3.13 of the standard). 
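// Usage sketch (illustrative; not part of unicode.h): caseless comparison of two
// codepoint vectors 'a' and 'b' with a loaded TUniChDb 'db' (for best results, also
// normalize the folded strings as recommended above).
//   TIntV fa, fb;
//   db.GetCaseFolded(a, fa);
//   db.GetCaseFolded(b, fb);
//   bool sameIgnoringCase = (fa == fb);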
01628 template<typename TSrcVec, typename TDestCh> 01629 void GetCaseFolded(const TSrcVec& src, size_t srcIdx, const size_t srcCount, 01630 TVec<TDestCh>& dest, const bool clrDest, const bool full, const bool turkic = false) const { caseFolding.Fold(src, srcIdx, srcCount, dest, clrDest, full, turkic); } 01631 template<typename TSrcVec, typename TDestCh> 01632 void GetCaseFolded(const TSrcVec& src, TVec<TDestCh>& dest, const bool clrDest = true, const bool full = true, const bool turkic = false) const { 01633 GetCaseFolded(src, 0, src.Len(), dest, clrDest, full, turkic); } 01634 // ToCaseFolded folds the string in place. However, this means that only the simple 01635 // case foldings can be used (the full ones could increase the length of the string). 01636 template<typename TSrcVec> void ToCaseFolded(TSrcVec& src, size_t srcIdx, const size_t srcCount, const bool turkic = false) const { caseFolding.FoldInPlace(src, srcIdx, srcCount, turkic); } 01637 template<typename TSrcVec> void ToCaseFolded(TSrcVec& src, const bool turkic = false) const { ToCaseFolded(src, 0, src.Len(), turkic); } 01638 01639 protected: 01640 void TestCaseConversion(const TStr& source, const TStr& trueLc, const TStr& trueTc, const TStr& trueUc, bool turkic, bool lithuanian); 01641 void TestCaseConversions(); 01642 01643 //------------------------------------------------------------------------- 01644 // Text file reader for the Unicode character database 01645 //------------------------------------------------------------------------- 01646 01647 protected: 01648 01649 class TUcdFileReader 01650 { 01651 protected: 01652 TChA buf; 01653 public: 01654 TChA comment; // contains '#' and everything after it 01655 protected: 01656 FILE *f; 01657 int putBackCh; 01658 int GetCh() { 01659 if (putBackCh >= 0) { int c = putBackCh; putBackCh = EOF; return c; } 01660 return fgetc(f); } 01661 void PutBack(int c) { Assert(putBackCh == EOF); putBackCh = c; } 01662 // Returns 'false' iff the EOF was encountered before anything was read. 01663 bool ReadNextLine() { 01664 buf.Clr(); comment.Clr(); 01665 bool inComment = false, first = true; 01666 while (true) { 01667 int c = GetCh(); 01668 if (c == EOF) return ! first; 01669 else if (c == 13) { 01670 c = GetCh(); if (c != 10) PutBack(c); 01671 return true; } 01672 else if (c == 10) return true; 01673 else if (c == '#') inComment = true; 01674 if (! inComment) buf += char(c); 01675 else comment += char(c); } 01676 /*first = false;*/} 01677 private: 01678 TUcdFileReader& operator = (const TUcdFileReader& r) { Fail; return *((TUcdFileReader *) 0); } 01679 TUcdFileReader(const TUcdFileReader& r) { Fail; } 01680 public: 01681 TUcdFileReader() : f(0) { } 01682 TUcdFileReader(const TStr& fileName) : f(0), putBackCh(EOF) { Open(fileName); } 01683 void Open(const TStr& fileName) { Close(); f = fopen(fileName.CStr(), "rt"); IAssertR(f, fileName); putBackCh = EOF; } 01684 void Close() { putBackCh = EOF; if (f) { fclose(f); f = 0; }} 01685 ~TUcdFileReader() { Close(); } 01686 bool GetNextLine(TStrV& dest) { 01687 dest.Clr(); 01688 while (true) { 01689 if (! 
ReadNextLine()) return false; 01690 TStr line = buf; line.ToTrunc(); 01691 if (line.Len() <= 0) continue; 01692 line.SplitOnAllCh(';', dest, false); 01693 for (int i = 0; i < dest.Len(); i++) dest[i].ToTrunc(); 01694 return true; }} 01695 static int ParseCodePoint(const TStr& s) { 01696 int c; bool ok = s.IsHexInt(true, 0, 0x10ffff, c); IAssertR(ok, s); return c; } 01697 static void ParseCodePointList(const TStr& s, TIntV& dest, bool ClrDestP = true) { // space-separated list 01698 if (ClrDestP) dest.Clr(); 01699 TStrV parts; s.SplitOnWs(parts); 01700 for (int i = 0; i < parts.Len(); i++) { 01701 int c; bool ok = parts[i].IsHexInt(true, 0, 0x10ffff, c); IAssertR(ok, s); 01702 dest.Add(c); } } 01703 static void ParseCodePointRange(const TStr& s, int& from, int &to) { // xxxx or xxxx..yyyy 01704 int i = s.SearchStr(".."); if (i < 0) { from = ParseCodePoint(s); to = from; return; } 01705 from = ParseCodePoint(s.GetSubStr(0, i - 1)); 01706 to = ParseCodePoint(s.GetSubStr(i + 2, s.Len() - 1)); } 01707 }; 01708 01709 //------------------------------------------------------------------------- 01710 // Helper class for processing the text files 01711 //------------------------------------------------------------------------- 01712 // Files such as DerivedCoreProps.txt often refer to ranges of codepoints, 01713 // and not all codepoints from the range have also been listed in 01714 // UnicodeData.txt. Thus, new TUniChInfo instances will be created 01715 // when processing DerivedCoreProps.txt and similar files. 01716 // To assign the correct (sub)categories to these new codepoints, 01717 // the following class will extract the subcategory info from the 01718 // comments in DerivedCoreProps.txt and similar files. 01719 01720 class TSubcatHelper 01721 { 01722 public: 01723 bool hasCat; TUniChSubCategory subCat; 01724 TStrH invalidCatCodes; 01725 TUniChDb &owner; 01726 01727 TSubcatHelper(TUniChDb &owner_) : owner(owner_) { } 01728 01729 void ProcessComment(TUniChDb::TUcdFileReader &reader) 01730 { 01731 hasCat = false; subCat = ucOtherNotAssigned; 01732 if (reader.comment.Len() > 3) 01733 { 01734 IAssert(reader.comment[0] == '#'); 01735 IAssert(reader.comment[1] == ' '); 01736 char chCat = reader.comment[2], chSubCat = reader.comment[3]; 01737 if (reader.comment.Len() > 4) IAssert(isspace(uchar(reader.comment[4]))); 01738 if (TUniChInfo::IsValidSubCat(chCat, chSubCat)) { 01739 hasCat = true; subCat = (TUniChSubCategory) ((int(uchar(chCat)) << 8) | (int(uchar(chSubCat)))); } 01740 else invalidCatCodes.AddKey(TStr(chCat) + TStr(chSubCat)); 01741 } 01742 } 01743 01744 void SetCat(const int cp) { 01745 int i = owner.h.GetKeyId(cp); IAssert(i >= 0); 01746 IAssert(owner.h[i].subCat == ucOtherNotAssigned); 01747 IAssert(hasCat); 01748 owner.h[i].SetCatAndSubCat(subCat); } 01749 void TestCat(const int cp) { 01750 if (! hasCat) return; 01751 int i = owner.h.GetKeyId(cp); IAssert(i >= 0); 01752 IAssert(owner.h[i].subCat == subCat); } 01753 01754 ~TSubcatHelper() 01755 { 01756 if (invalidCatCodes.IsKey("L&")) invalidCatCodes.DelKey("L&"); 01757 // Output any unexpected ones (there shouldn't be any). 01758 if (! 
invalidCatCodes.Empty()) { 01759 printf("Invalid cat code(s) in the comments: "); 01760 for (int i = invalidCatCodes.FFirstKeyId(); invalidCatCodes.FNextKeyId(i); ) 01761 printf(" \"%s\"", invalidCatCodes.GetKey(i).CStr()); 01762 printf("\n"); } 01763 } 01764 }; 01765 }; 01766 01767 //----------------------------------------------------------------------------- 01768 // TUnicode -- a sadly emasculated wrapper around TUniCodec and TUniChDb 01769 //----------------------------------------------------------------------------- 01770 01771 class TUnicode 01772 { 01773 public: 01774 TUniCodec codec; 01775 TUniChDb ucd; 01776 01777 TUnicode() { Init(); } 01778 explicit TUnicode(const TStr& fnBinUcd) { ucd.LoadBin(fnBinUcd); Init(); } 01779 void Init() { InitCodecs(); } 01780 01781 //----------------------------------------------------------------------- 01782 // UTF-8 01783 //----------------------------------------------------------------------- 01784 01785 // Returns the number of characters that have been successfully decoded. 01786 // This does not include any replacement characters that may have been inserted into 'dest'. 01787 int DecodeUtf8(const TIntV& src, TIntV& dest) const { return (int) codec.DecodeUtf8(src, dest); } 01788 int DecodeUtf8(const TStr& src, TIntV& dest) const { return (int) codec.DecodeUtf8(src, dest); } 01789 01790 // Returns the number of characters that have been successfully encoded. 01791 // This does not include any replacement characters that may have been inserted into 'dest'. 01792 int EncodeUtf8(const TIntV& src, TIntV& dest) const { return (int) codec.EncodeUtf8(src, dest); } 01793 01794 // The following wrapper around the UTF-8 encoder returns a TStr containing 01795 // the UTF-8-encoded version of the input string. 01796 TStr EncodeUtf8Str(const TIntV& src) const { return codec.EncodeUtf8Str(src); } 01797 01798 // encoding one character to UTF8 01799 static void EncodeUtf8(const uint& Ch, TChA& Dest); 01800 static TStr EncodeUtf8(const uint& Ch); 01801 01802 //----------------------------------------------------------------------- 01803 // UTF-16 Decoder 01804 //----------------------------------------------------------------------- 01805 01806 // Returns the number of characters that have been successfully decoded. 01807 // This does not include any replacement characters that may have been inserted into 'dest'. 01808 // Each element of 'src' is assumed to contain one byte of data. 01809 // srcCount must be even (though srcIdx doesn't need to be). 01810 int DecodeUtf16FromBytes(const TIntV& src, TIntV& dest, 01811 const TUtf16BomHandling bomHandling = bomAllowed, 01812 const TUniByteOrder defaultByteOrder = boMachineEndian) const { 01813 return (int) codec.DecodeUtf16FromBytes(src, 0, src.Len(), dest, true, bomHandling, defaultByteOrder); } 01814 01815 // Here, each element of 'src' is treated as a 16-bit word. The byte-order settings 01816 // are used to determine if the two bytes of each word should be swapped before further 01817 // processing. For example, if a BOM is present, it must have the value 0xfeff; if it 01818 // actually has the value 0xfffe, this means that the two bytes of each word must be swapped. 
01819 // Basically, the combination of the byteOrder parameter and the byte order mark (if present) at the 01820 // beginning of the source data is used to determine the "original" byte order of the data; 01821 // if this doesn't match the byte order of the local machine, the two bytes of each word will 01822 // be swapped during the decoding process. 01823 int DecodeUtf16FromWords(const TIntV& src, TIntV& dest, 01824 const TUtf16BomHandling bomHandling = bomAllowed, 01825 const TUniByteOrder defaultByteOrder = boMachineEndian) const { 01826 return (int) codec.DecodeUtf16FromWords(src, 0, src.Len(), dest, true, bomHandling, defaultByteOrder); } 01827 01828 //----------------------------------------------------------------------- 01829 // UTF-16 Encoder 01830 //----------------------------------------------------------------------- 01831 01832 // Returns the number of characters that have been successfully encoded. 01833 // This does not include any replacement characters that may have been inserted into 'dest'. 01834 int EncodeUtf16ToWords(const TIntV& src, TIntV& dest, const bool insertBom, 01835 const TUniByteOrder destByteOrder = boMachineEndian) const { 01836 return (int) codec.EncodeUtf16ToWords(src, 0, src.Len(), dest, true, insertBom, destByteOrder); } 01837 01838 int EncodeUtf16ToBytes(const TIntV& src, TIntV& dest, const bool insertBom, 01839 const TUniByteOrder destByteOrder = boMachineEndian) const { 01840 return (int) codec.EncodeUtf16ToBytes(src, 0, src.Len(), dest, true, insertBom, destByteOrder); } 01841 01842 //----------------------------------------------------------------------- 01843 // 8-bit codecs 01844 //----------------------------------------------------------------------- 01845 01846 T8BitCodec<TEncoding_ISO8859_1> iso8859_1; 01847 T8BitCodec<TEncoding_ISO8859_2> iso8859_2; 01848 T8BitCodec<TEncoding_ISO8859_3> iso8859_3; 01849 T8BitCodec<TEncoding_ISO8859_4> iso8859_4; 01850 T8BitCodec<TEncoding_YuAscii> yuAscii; 01851 T8BitCodec<TEncoding_CP1250> cp1250; 01852 T8BitCodec<TEncoding_CP852> cp852; 01853 T8BitCodec<TEncoding_CP437> cp437; 01854 01855 //----------------------------------------------------------------------- 01856 // Codec registry 01857 //----------------------------------------------------------------------- 01858 // If you know you'll need ISO-8859-2, just use 01859 // TUnicode unicode; 01860 // unicode.iso8859_2.Encode(...); 01861 // If you don't know what you'll need, use: 01862 // TUnicode unicode; 01863 // PCodecBase myCodec = unicode.GetCodec(myCodecName); 01864 // myCodec->Encode(...); 01865 // Note that the first approach is slightly more efficient because there 01866 // aren't any virtual method calls involved. 01867 01868 protected: 01869 THash<TStr, PCodecBase> codecs; 01870 static inline TStr NormalizeCodecName(const TStr& name) { 01871 TStr s = name.GetLc(); s.ChangeStrAll("_", ""); s.ChangeStrAll("-", ""); return s; } 01872 public: 01873 void RegisterCodec(const TStr& nameList, const PCodecBase& codec) { 01874 TStrV names; nameList.SplitOnWs(names); 01875 for (int i = 0; i < names.Len(); i++) 01876 codecs.AddDat(NormalizeCodecName(names[i]), codec); } 01877 void UnregisterCodec(const TStr& nameList) { 01878 TStrV names; nameList.SplitOnWs(names); 01879 for (int i = 0; i < names.Len(); i++) 01880 codecs.DelKey(NormalizeCodecName(names[i])); } 01881 void ClrCodecs() { codecs.Clr(); } 01882 void InitCodecs(); 01883 PCodecBase GetCodec(const TStr& name) const { 01884 TStr s = NormalizeCodecName(name); 01885 PCodecBase p; if (! 
codecs.IsKeyGetDat(s, p)) p.Clr(); 01886 return p; } 01887 void GetAllCodecs(TCodecBaseV& dest) const { 01888 dest.Clr(); 01889 for (int i = codecs.FFirstKeyId(); codecs.FNextKeyId(i); ) { 01890 PCodecBase codec = codecs[i]; bool found = false; 01891 for (int j = 0; j < dest.Len(); j++) if (dest[j]() == codec()) { found = true; break; } 01892 if (! found) dest.Add(codec); }} 01893 01894 //------------------------------------------------------------------------- 01895 // Word boundaries (UAX #29) 01896 //------------------------------------------------------------------------- 01897 01898 // Finds the next word boundary strictly after 'position'. 01899 // Note that there are valid word boundaries at 0 and at 'src.Len()'. 01900 // If there is no such word boundary, it returns 'false' and sets 'position' to 'src.Len()'. 01901 bool FindNextWordBoundary(const TIntV& src, int &position) const { 01902 if (position < 0) { position = 0; return true; } 01903 size_t position_; bool retVal = ucd.FindNextWordBoundary(src, 0, src.Len(), position_); position = int(position_); return retVal; } 01904 // Creates, in 'dest', a vector of 'src.Len() + 1' elements, where 'dest[i]' tells if there is a word 01905 // boundary between 'src[i - 1]' and 'src[i]'. Note that 'dest[0]' and 'dest[src.Len()]' are 01906 // always set to 'true'. 01907 void FindWordBoundaries(const TIntV& src, TBoolV& dest) const { ucd.FindWordBoundaries(src, 0, src.Len(), dest); } 01908 01909 //------------------------------------------------------------------------- 01910 // Sentence boundaries (UAX #29) 01911 //------------------------------------------------------------------------- 01912 01913 // Finds the next sentence boundary strictly after 'position'. 01914 // Note that there are valid sentence boundaries at 0 and at 'src.Len()'. 01915 // If there is no such sentence boundary, it returns 'false' and sets 'position' to 'src.Len()'. 01916 bool FindNextSentenceBoundary(const TIntV& src, int &position) const { 01917 if (position < 0) { position = 0; return true; } 01918 size_t position_; bool retVal = ucd.FindNextSentenceBoundary(src, 0, src.Len(), position_); position = int(position_); return retVal; } 01919 // Creates, in 'dest', a vector of 'src.Len() + 1' elements, where 'dest[i]' tells if there is a sentence 01920 // boundary between 'src[i - 1]' and 'src[i]'. Note that 'dest[0]' and 'dest[src.Len()]' are 01921 // always set to 'true'. 01922 void FindSentenceBoundaries(const TIntV& src, TBoolV& dest) const { ucd.FindSentenceBoundaries(src, 0, src.Len(), dest); } 01923 01924 void ClrSentenceBoundaryExceptions() { ucd.SbEx_Clr(); } 01925 void UseEnglishSentenceBoundaryExceptions() { ucd.SbEx_SetStdEnglish(); } 01926 01927 //------------------------------------------------------------------------- 01928 // Normalization, decomposition, etc. (UAX #15) 01929 //------------------------------------------------------------------------- 01930 01931 // This sets 'dest' to the decomposed form of the source string. 01932 // - for normalization form D (NFD), i.e. canonical decomposition: use compatibility == false; 01933 // - for normalization form KD (NFKD), i.e. compatibility decomposition: use compatibility == true. 01934 void Decompose(const TIntV& src, TIntV& dest, bool compatibility) const { ucd.Decompose(src, dest, compatibility, true); } 01935 // This performs canonical composition on the source string, and stores 01936 // the result in the destination vector. 
The source string should be the 01937 // result of a (canonical or compatibility) decomposition; if this is the 01938 // case, the composition will lead to a normalization form C (NFC) or 01939 // normalization form KC (NFKC), depending on whether canonical or compatibility 01940 // decomposition was used. 01941 void Compose(const TIntV& src, TIntV& dest) const { return ucd.Compose(src, dest, true); } 01942 // Calls Decompose, followed by Compose; thus the result is the NFC (if 01943 // compatibility == false) or NFKC (if compatibility == true) of the source string. 01944 // A temporary TIntV is used to contain the intermediate NF(K)D form of the 01945 // source string. 01946 void DecomposeAndCompose(const TIntV& src, TIntV& dest, bool compatibility) const { return ucd.DecomposeAndCompose(src, dest, compatibility); } 01947 // Copies the starter characters from 'src' to 'dest'; the other 01948 // characters are skipped. 'src' should already have been decomposed. 01949 // Returns the number of characters extracted. This function can be 01950 // used to remove diacritical marks from a string (after it has been decomposed!). 01951 int ExtractStarters(const TIntV& src, TIntV& dest) const { return (int) ucd.ExtractStarters(src, dest); } 01952 // Extracts the starters into a temporary vector and then copies it into 'src'. 01953 int ExtractStarters(TIntV& src) const { return (int) ucd.ExtractStarters(src); } 01954 01955 //------------------------------------------------------------------------- 01956 // Case conversions 01957 //------------------------------------------------------------------------- 01958 // NOTE: if you will be dealing with Turkish, Azeri or Lithuanian text, 01959 // use the case-conversion methods in TUniChDb, which allow the caller 01960 // to request language-specific case mappings for these languages. 01961 01962 public: 01963 typedef TUniChDb::TCaseConversion TCaseConversion; 01964 // Sets 'dest' to the case-converted form of 'src'. 01965 void GetLowerCase(const TIntV& src, TIntV& dest) const { ucd.GetLowerCase(src, dest, true, false, false); } 01966 void GetUpperCase(const TIntV& src, TIntV& dest) const { ucd.GetUpperCase(src, dest, true, false, false); } 01967 void GetTitleCase(const TIntV& src, TIntV& dest) const { ucd.GetTitleCase(src, dest, true, false, false); } 01968 01969 // GetSimpleCaseConverted uses only the simple case mappings (from UnicodeData.txt). 01970 // This is simpler and faster. Since each character now maps into exactly one 01971 // character, case conversion can also be done in place (see ToSimpleCaseConverted, etc.). 01972 void GetSimpleLowerCase(const TIntV& src, TIntV& dest) const { ucd.GetSimpleLowerCase(src, dest, true); } 01973 void GetSimpleUpperCase(const TIntV& src, TIntV& dest) const { ucd.GetSimpleUpperCase(src, dest, true); } 01974 void GetSimpleTitleCase(const TIntV& src, TIntV& dest) const { ucd.GetSimpleTitleCase(src, dest, true); } 01975 01976 // These functions perform simple case-conversions in-place. 01977 void ToSimpleUpperCase(TIntV& src) const { ucd.ToSimpleUpperCase(src); } 01978 void ToSimpleLowerCase(TIntV& src) const { ucd.ToSimpleLowerCase(src); } 01979 void ToSimpleTitleCase(TIntV& src) const { ucd.ToSimpleTitleCase(src); } 01980 01981 // Case folding is an alternative to the above functions. It is intended primarily 01982 // to produce strings that are suitable for comparisons. 
For example, 01983 // ToLowerCase(sigma) = sigma, ToLowerCase(final-sigma) = final-sigma; 01984 // but ToCaseFolded(sigma) = sigma, ToCaseFolded(final-sigma) = sigma. 01985 // - 'full' enables full case mappings -- i.e. sometimes a character may be mapped 01986 // into a string of two or more characters. 01987 // - Note: For best results, perform NFD(CaseFold(NFD(x)) or NFKD(CaseFold(NFKD(x)) on 01988 // each string before comparing them (see sec. 3.13 of the standard). 01989 void GetCaseFolded(const TIntV& src, TIntV& dest, const bool full = true) const { return ucd.GetCaseFolded(src, dest, true, full, false); } 01990 // ToCaseFolded folds the string in place. However, this means that only the simple 01991 // case foldings can be used (the full ones could increase the length of the string). 01992 void ToCaseFolded(TIntV& src) const { return ucd.ToCaseFolded(src, false); } 01993 01994 TStr GetUtf8CaseFolded(const TStr& s) const { 01995 bool isAscii = true; 01996 for (int i = 0, n = s.Len(); i < n; i++) if (uchar(s[i]) >= 128) { isAscii = false; break; } 01997 if (isAscii) return s.GetLc(); 01998 TIntV src; DecodeUtf8(s, src); 01999 TIntV dest; GetCaseFolded(src, dest); 02000 return EncodeUtf8Str(dest); } 02001 02002 //------------------------------------------------------------------------- 02003 // Character properties 02004 //------------------------------------------------------------------------- 02005 // These methods simply call the corresponding TUniChDb method 02006 // (which typically calls the corresponding method of TUniChInfo). 02007 // See the declaration for DECLARE_FORWARDED_PROPERTY_METHODS for a complete list. 02008 // They are all of the form bool IsXxxx(const int cp) const 02009 // Some of the more notable ones include: 02010 // - IsAlphabetic, IsUppercase, IsLowercase, IsMath, IsAsciiHexDigit 02011 // IsDash, IsDeprecated, IsDiacritic, IsHexDigit, IsHyphen, IsIdeographic 02012 // IsNoncharacter, IsQuotationMark, IsSoftDotted, IsTerminalPunctuation, IsWhiteSpace 02013 02014 #define ___UniFwd1(name) bool name(const int cp) const { return ucd.name(cp); } 02015 DECLARE_FORWARDED_PROPERTY_METHODS 02016 #undef DECLARE_FORWARDED_PROPERTY_METHODS 02017 #undef __UniFwd1 02018 ___UniFwd2(IsPrivateUse, IsSurrogate) 02019 02020 TUniChCategory GetCat(const int cp) const { return ucd.GetCat(cp); } 02021 TUniChSubCategory GetSubCat(const int cp) const { return ucd.GetSubCat(cp); } 02022 02023 // GetCharName returns 0 if the name is unknown; GetCharNameS returns a string of the form "U+1234". 02024 const char *GetCharName(const int cp) const { return ucd.GetCharName(cp); } 02025 TStr GetCharNameS(const int cp) const { return ucd.GetCharNameS(cp); } 02026 02027 }; 02028 02029 //----------------------------------------------------------------------------- 02030 // TUniCodec -- UTF-8 Decoder 02031 //----------------------------------------------------------------------------- 02032 02033 // Returns the number of characters that have been successfully decoded. 02034 // This does not include any replacement characters that may have been inserted into 'dest'. 
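// Worked example (illustrative): the byte pair c3 a9 decodes to U+00E9 (e with acute).
// In the decoder below, the lead byte 0xc3 = 1100_0011 matches 110xxxxx, so nMoreBytes = 1,
// nBits = 5 and cOut starts as 0xc3 & 0x1f = 0x03; the continuation byte 0xa9 = 1010_1001
// contributes its six low bits, giving cOut = (0x03 << 6) | 0x29 = 0xe9, which is >= minVal
// (0x80), i.e. the sequence is not an overlong encoding.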
02035 template<typename TSrcVec, typename TDestCh> 02036 size_t TUniCodec::DecodeUtf8( 02037 const TSrcVec& src, size_t srcIdx, const size_t srcCount, 02038 TVec<TDestCh>& dest, const bool clrDest) const 02039 { 02040 size_t nDecoded = 0; 02041 if (clrDest) dest.Clr(); 02042 const size_t origSrcIdx = srcIdx; 02043 const size_t srcEnd = srcIdx + srcCount; 02044 while (srcIdx < srcEnd) 02045 { 02046 const size_t charSrcIdx = srcIdx; 02047 uint c = src[TVecIdx(srcIdx)] & 0xff; srcIdx++; 02048 if ((c & _1000_0000) == 0) { 02049 // c is one of the characters 0..0x7f, encoded as a single byte. 02050 dest.Add(TDestCh(c)); nDecoded++; continue; } 02051 else if ((c & _1100_0000) == _1000_0000) { 02052 // No character in a valid UTF-8-encoded string should begin with a byte of the form 10xxxxxx. 02053 // We must have been thrown into the middle of a multi-byte character. 02054 switch (errorHandling) { 02055 case uehThrow: throw TUnicodeException(charSrcIdx, c, "Invalid character: 10xxxxxx."); 02056 case uehAbort: return nDecoded; 02057 case uehReplace: dest.Add(TDestCh(replacementChar)); continue; 02058 case uehIgnore: continue; 02059 default: Fail; } } 02060 else 02061 { 02062 // c introduces a sequence of 2..6 bytes, depending on how many 02063 // of the most significant bits of c are set. 02064 uint nMoreBytes = 0, nBits = 0, minVal = 0; 02065 if ((c & _1110_0000) == _1100_0000) nMoreBytes = 1, nBits = 5, minVal = 0x80; 02066 else if ((c & _1111_0000) == _1110_0000) nMoreBytes = 2, nBits = 4, minVal = 0x800; 02067 else if ((c & _1111_1000) == _1111_0000) nMoreBytes = 3, nBits = 3, minVal = 0x10000; 02068 else if ((c & _1111_1100) == _1111_1000) nMoreBytes = 4, nBits = 2, minVal = 0x200000; 02069 else if ((c & _1111_1110) == _1111_1100) nMoreBytes = 5, nBits = 1, minVal = 0x4000000; 02070 else { 02071 // c is of the form 1111111x, which is invalid even in the early definitions of UTF-8 02072 // (which allowed the encoding of codepoints up to 2^31 - 1). However, in principle this 02073 // could be used to encode 32-bit integers with the msb set: 1aaabbbbccccddddeeeeffffgggghhhh 02074 // could be encoded as 1111111a 10aabbbb 10ccccdd 10ddeeee 10ffffgg 10gghhhh. 02075 if (strict) { 02076 switch (errorHandling) { 02077 case uehThrow: throw TUnicodeException(charSrcIdx, c, "Invalid character: 1111111x."); 02078 case uehAbort: return nDecoded; 02079 // In the case of uehReplace and uehIgnore, we'll read the next 5 bytes 02080 // and try to decode the character. Then, since 'strict' is true and 02081 // the codepoint is clearly >= 2^31, we'll notice this as an error later 02082 // and (in the case of uehReplace) insert a replacement character then. 02083 // This is probably better than inserting a replacement character right 02084 // away and then trying to read the next byte as if a new character 02085 // was beginning there -- if the current byte is really followed by five 02086 // 10xxxxxx bytes, we'll just get six replacement characters in a row. 02087 case uehReplace: break; //dest.Add(TDestCh(replacementChar)); continue; 02088 case uehIgnore: break; // continue; 02089 default: Fail; } } 02090 nMoreBytes = 5; nBits = 2; minVal = 0x80000000u; } 02091 // Decode this multi-byte sequence. 02092 uint cOut = c & ((1 << nBits) - 1); // First extract the nBits least significant bits from c. 02093 bool cancel = false; 02094 for (uint i = 0; i < nMoreBytes && ! cancel; i++) { 02095 // See if there are enough bytes left in the source vector. 02096 if (! 
(srcIdx < srcEnd)) { 02097 switch (errorHandling) { 02098 case uehThrow: throw TUnicodeException(charSrcIdx, c, TInt::GetStr(nMoreBytes) + " more bytes expected, only " + TInt::GetStr(int(srcEnd - charSrcIdx - 1)) + " available."); 02099 case uehAbort: return nDecoded; 02100 case uehReplace: dest.Add(TDestCh(replacementChar)); cancel = true; continue; 02101 case uehIgnore: cancel = true; continue; 02102 default: Fail; } } 02103 // Read the next byte. 02104 c = src[TVecIdx(srcIdx)] & 0xff; srcIdx++; 02105 if ((c & _1100_0000) != _1000_0000) { // Each subsequent byte should be of the form 10xxxxxx. 02106 switch (errorHandling) { 02107 case uehThrow: throw TUnicodeException(charSrcIdx, c, "Byte " + TInt::GetStr(i) + " of " + TInt::GetStr(nMoreBytes) + " extra bytes should begin with 10xxxxxx."); 02108 case uehAbort: return nDecoded; 02109 case uehReplace: dest.Add(TDestCh(replacementChar)); srcIdx--; cancel = true; continue; 02110 case uehIgnore: srcIdx--; cancel = true; continue; 02111 default: Fail; } } 02112 cOut <<= 6; cOut |= (c & _0011_1111); } 02113 if (cancel) continue; 02114 if (strict) { 02115 // err1: This codepoint has been represented by more bytes than it should have been. 02116 // For example, cOut in the range 0..127 should be represented by a single byte, 02117 // not by two or more bytes. 02118 // - For example, this may happen in the "modified UTF-8" sometimes used for Java 02119 // serialization, where the codepoint 0 is encoded as 11000000 10000000 to avoid 02120 // the appearance of null bytes in the encoded stream. 02121 bool err1 = (cOut < minVal); 02122 // err2: Early definitions of UTF-8 allowed any 31-bit integer to be encoded, using up to 6 bytes. 02123 // However, later this was restricted to the codepoints 0..0x10ffff only, because only these 02124 // are valid Unicode codepoints. Thus, no more than 4 bytes are ever necessary. 02125 bool err2 = (nMoreBytes > 3 || (nMoreBytes == 3 && cOut > 0x10ffff)); 02126 if (err1 || err2) switch (errorHandling) { 02127 case uehThrow: 02128 if (err1) throw TUnicodeException(charSrcIdx, c, "The codepoint 0x" + TInt::GetStr(cOut, "%08x") + " has been represented by too many bytes (" + TInt::GetStr(nMoreBytes + 1) + ")."); 02129 else if (err2) throw TUnicodeException(charSrcIdx, c, "Invalid multibyte sequence: it decodes into 0x" + TInt::GetStr(cOut, "%08x") + ", but only codepoints 0..0x10ffff are valid."); 02130 else { Fail; break; } 02131 case uehAbort: return nDecoded; 02132 case uehReplace: dest.Add(TDestCh(replacementChar)); continue; 02133 case uehIgnore: continue; 02134 default: Fail; } } 02135 // Add the decoded codepoint to the destination vector. 02136 // If this is the first decoded character, and it's one of the byte-order marks 02137 // (0xfffe and 0xfeff), we will skip it (unless skipBom is false). 02138 if (! (skipBom && (cOut == 0xfffe || cOut == 0xfeff) && charSrcIdx == origSrcIdx)) { 02139 dest.Add(cOut); nDecoded++; } 02140 } // else (multi-byte sequence) 02141 } // while 02142 return nDecoded; 02143 } 02144 02145 //----------------------------------------------------------------------- 02146 // TUniCodec -- UTF-8 Encoder 02147 //----------------------------------------------------------------------- 02148 02149 // Returns the number of characters that have been successfully encoded. 02150 // This does not include any replacement characters that may have been inserted into 'dest'. 
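A matching sketch for the encoding direction (again an editorial addition, not part of unicode.h); it assumes only the five-argument EncodeUtf8 defined below and stores one byte per TIntV element, mirroring how the decoder reads its input.

// Editorial sketch: encode a vector of codepoints into UTF-8 (one byte per output element).
inline void ExampleEncodeUtf8(const TIntV& codepoints)
{
  TUniCodec codec;   // defaults: uehIgnore, non-strict, skipBom == true
  TIntV utf8Bytes;
  size_t nOk = codec.EncodeUtf8(codepoints, 0, codepoints.Len(), utf8Bytes, /*clrDest=*/true);
  // A valid codepoint becomes 1..4 bytes (the obsolete forms below also emit 5 or 6 bytes
  // for values above 0x10ffff when 'strict' is false).
  printf("%d codepoints -> %d bytes (%d encoded successfully)\n",
    codepoints.Len(), utf8Bytes.Len(), int(nOk));
}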
02151 template&lt;typename TSrcVec, typename TDestCh&gt; 02152 size_t TUniCodec::EncodeUtf8( 02153 const TSrcVec& src, size_t srcIdx, const size_t srcCount, 02154 TVec&lt;TDestCh&gt;& dest, const bool clrDest) const 02155 { 02156 size_t nEncoded = 0; 02157 for (const size_t srcEnd = srcIdx + srcCount; srcIdx < srcEnd; srcIdx++) 02158 { 02159 uint c = uint(src[TVecIdx(srcIdx)]); 02160 bool err = false; 02161 if (strict && c > 0x10ffff) { 02162 err = true; 02163 switch (errorHandling) { 02164 case uehThrow: throw TUnicodeException(srcIdx, c, "Invalid character (0x" + TInt::GetStr(c, "%x") + "; only characters in the range 0..0x10ffff are allowed)."); 02165 case uehAbort: return nEncoded; 02166 case uehReplace: c = replacementChar; break; 02167 case uehIgnore: continue; 02168 default: Fail; } } 02169 if (c < 0x80u) 02170 dest.Add(TDestCh(c & 0xffu)); 02171 else if (c < 0x800u) { 02172 dest.Add(TDestCh(_1100_0000 | ((c >> 6) & _0001_1111))); 02173 dest.Add(TDestCh(_1000_0000 | (c & _0011_1111))); } 02174 else if (c < 0x10000u) { 02175 dest.Add(TDestCh(_1110_0000 | ((c >> 12) & _0000_1111))); 02176 dest.Add(TDestCh(_1000_0000 | ((c >> 6) & _0011_1111))); 02177 dest.Add(TDestCh(_1000_0000 | (c & _0011_1111))); } 02178 else if (c < 0x200000u) { 02179 dest.Add(TDestCh(_1111_0000 | ((c >> 18) & _0000_0111))); 02180 dest.Add(TDestCh(_1000_0000 | ((c >> 12) & _0011_1111))); 02181 dest.Add(TDestCh(_1000_0000 | ((c >> 6) & _0011_1111))); 02182 dest.Add(TDestCh(_1000_0000 | (c & _0011_1111))); } 02183 else if (c < 0x4000000u) { 02184 dest.Add(TDestCh(_1111_1000 | ((c >> 24) & _0000_0011))); 02185 dest.Add(TDestCh(_1000_0000 | ((c >> 18) & _0011_1111))); 02186 dest.Add(TDestCh(_1000_0000 | ((c >> 12) & _0011_1111))); 02187 dest.Add(TDestCh(_1000_0000 | ((c >> 6) & _0011_1111))); 02188 dest.Add(TDestCh(_1000_0000 | (c & _0011_1111))); } 02189 else { 02190 dest.Add(TDestCh(_1111_1100 | ((c >> 30) & _0000_0011))); 02191 dest.Add(TDestCh(_1000_0000 | ((c >> 24) & _0011_1111))); 02192 dest.Add(TDestCh(_1000_0000 | ((c >> 18) & _0011_1111))); 02193 dest.Add(TDestCh(_1000_0000 | ((c >> 12) & _0011_1111))); 02194 dest.Add(TDestCh(_1000_0000 | ((c >> 6) & _0011_1111))); 02195 dest.Add(TDestCh(_1000_0000 | (c & _0011_1111))); } 02196 if (! err) nEncoded++; 02197 } 02198 return nEncoded; 02199 } 02200 02201 //----------------------------------------------------------------------- 02202 // TUniCodec -- UTF-16 Decoder 02203 //----------------------------------------------------------------------- 02204 02205 // Returns the number of characters that have been successfully decoded. 02206 // This does not include any replacement characters that may have been inserted into 'dest'. 02207 // Each element of 'src' is assumed to contain one byte of data. 02208 // srcCount must be even (though srcIdx doesn't need to be).
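The sketch below (an editorial addition, not part of the original source) shows a typical call with bomAllowed, so that a leading BOM, if present, selects the byte order and the defaultByteOrder is used otherwise.

// Editorial sketch: decode UTF-16 supplied as raw bytes, one byte per element of 'bytes'.
inline void ExampleDecodeUtf16Bytes(const TIntV& bytes)
{
  TUniCodec codec(uehReplace, /*strict=*/false, TUniCodec::DefaultReplacementChar, /*skipBom=*/true);
  TIntV codepoints;
  IAssert(bytes.Len() % 2 == 0);   // srcCount must be even
  size_t nOk = codec.DecodeUtf16FromBytes(bytes, 0, bytes.Len(), codepoints,
    /*clrDest=*/true, bomAllowed, boMachineEndian);
  printf("%d codepoints decoded\n", int(nOk));
}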
02209 template<typename TSrcVec, typename TDestCh> 02210 size_t TUniCodec::DecodeUtf16FromBytes( 02211 const TSrcVec& src, size_t srcIdx, const size_t srcCount, 02212 TVec<TDestCh>& dest, const bool clrDest, 02213 const TUtf16BomHandling bomHandling, 02214 const TUniByteOrder defaultByteOrder) const 02215 { 02216 IAssert(srcCount % 2 == 0); 02217 IAssert(bomHandling == bomAllowed || bomHandling == bomRequired || bomHandling == bomIgnored); 02218 IAssert(defaultByteOrder == boMachineEndian || defaultByteOrder == boBigEndian || defaultByteOrder == boLittleEndian); 02219 if (clrDest) dest.Clr(); 02220 size_t nDecoded = 0; 02221 if (srcCount <= 0) return nDecoded; 02222 const size_t origSrcIdx = srcIdx, srcEnd = srcIdx + srcCount; 02223 bool littleEndian = false; 02224 bool leDefault = (defaultByteOrder == boLittleEndian || (defaultByteOrder == boMachineEndian && IsMachineLittleEndian())); 02225 if (bomHandling == bomIgnored) littleEndian = leDefault; 02226 else if (bomHandling == bomAllowed || bomHandling == bomRequired) 02227 { 02228 int byte1 = uint(src[TVecIdx(srcIdx)]) & 0xff, byte2 = uint(src[TVecIdx(srcIdx + 1)]) & 0xff; 02229 if (byte1 == 0xfe && byte2 == 0xff) { littleEndian = false; if (skipBom) srcIdx += 2; } 02230 else if (byte1 == 0xff && byte2 == 0xfe) { littleEndian = true; if (skipBom) srcIdx += 2; } 02231 else if (bomHandling == bomAllowed) littleEndian = leDefault; 02232 else { // Report an error. 02233 switch (errorHandling) { 02234 case uehThrow: throw TUnicodeException(srcIdx, byte1, "BOM expected at the beginning of the input vector (" + TInt::GetStr(byte1, "%02x") + " " + TInt::GetStr(byte2, "%02x") + " found instead)."); 02235 case uehAbort: case uehReplace: case uehIgnore: return size_t(-1); 02236 default: Fail; } } 02237 } 02238 else Fail; 02239 while (srcIdx < srcEnd) 02240 { 02241 const size_t charSrcIdx = srcIdx; 02242 uint byte1 = uint(src[TVecIdx(srcIdx)]) & 0xff, byte2 = uint(src[TVecIdx(srcIdx + 1)]) & 0xff; srcIdx += 2; 02243 uint c = littleEndian ? (byte1 | (byte2 << 8)) : (byte2 | (byte1 << 8)); 02244 if (Utf16FirstSurrogate <= c && c <= Utf16FirstSurrogate + 1023) 02245 { 02246 // c is the first character in a surrogate pair. Read the next character. 02247 if (! (srcIdx + 2 <= srcEnd)) { 02248 switch (errorHandling) { 02249 case uehThrow: throw TUnicodeException(charSrcIdx, c, "The second character of a surrogate pair is missing."); 02250 case uehAbort: return nDecoded; 02251 case uehReplace: dest.Add(TDestCh(replacementChar)); continue; 02252 case uehIgnore: continue; 02253 default: Fail; } } 02254 uint byte1 = uint(src[TVecIdx(srcIdx)]) & 0xff, byte2 = uint(src[TVecIdx(srcIdx + 1)]) & 0xff; srcIdx += 2; 02255 uint c2 = littleEndian ? (byte1 | (byte2 << 8)) : (byte2 | (byte1 << 8)); 02256 // c2 should be the second character of the surrogate pair. 02257 if (c2 < Utf16SecondSurrogate || Utf16SecondSurrogate + 1023 < c2) { 02258 switch (errorHandling) { 02259 case uehThrow: throw TUnicodeException(charSrcIdx + 2, c2, "The second character of a surrogate pair should be in the range " + TInt::GetStr(Utf16SecondSurrogate, "%04x") + ".." 
+ TInt::GetStr(Utf16SecondSurrogate + 1023, "%04x") + ", not " + TInt::GetStr(c2, "%04x") + "."); 02260 case uehAbort: return nDecoded; 02261 // with uehReplace and uehIgnore, we'll just skip the first character of the surrogate pair; we'll process the second one during the next iteration, this time as an ordinary character 02262 case uehReplace: dest.Add(TDestCh(replacementChar)); srcIdx -= 2; continue; 02263 case uehIgnore: srcIdx -= 2; continue; 02264 default: Fail; } } 02265 // c and c2 each contain 10 bits of information. 02266 uint cc = ((c - Utf16FirstSurrogate) << 10) | (c2 - Utf16SecondSurrogate); 02267 cc += 0x10000; 02268 dest.Add(TDestCh(cc)); nDecoded++; continue; 02269 } 02270 else if (strict && Utf16SecondSurrogate <= c && c <= Utf16SecondSurrogate + 1023) { 02271 switch (errorHandling) { 02272 case uehThrow: throw TUnicodeException(charSrcIdx, c, "This 16-bit value should be used only as the second character of a surrogate pair."); 02273 case uehAbort: return nDecoded; 02274 case uehReplace: dest.Add(TDestCh(replacementChar)); continue; 02275 case uehIgnore: continue; 02276 default: Fail; } } 02277 // If 'c' is the first character in the input stream, and it's a BOM, we might have to skip it. 02278 if (charSrcIdx == origSrcIdx && (c == 0xfffeu || c == 0xfeffu) && skipBom) continue; 02279 // Otherwise, store 'c' to the destination vector. 02280 dest.Add(TDestCh(c)); nDecoded++; 02281 } 02282 return nDecoded; 02283 } 02284 02285 // Here, each element of 'src' is treated as a 16-bit word. The byte-order settings 02286 // are used to determine if the two bytes of each word should be swapped before further 02287 // processing. For example, if a BOM is present, it must have the value 0xfeff; if it 02288 // actually has the value 0xfffe, this means that the two bytes of each word must be swapped. 02289 // Basically, the combination of the byteOrder parameter and the byte order mark (if present) at the 02290 // beginning of the source data is used to determine the "original" byte order of the data; 02291 // if this doesn't match the byte order of the local machine, the two bytes of each word will 02292 // be swapped during the decoding process. 02293 template&lt;typename TSrcVec, typename TDestCh&gt; 02294 size_t TUniCodec::DecodeUtf16FromWords( 02295 const TSrcVec& src, size_t srcIdx, const size_t srcCount, 02296 TVec&lt;TDestCh&gt;& dest, bool clrDest, 02297 const TUtf16BomHandling bomHandling, 02298 const TUniByteOrder defaultByteOrder) const 02299 { 02300 IAssert(bomHandling == bomAllowed || bomHandling == bomRequired || bomHandling == bomIgnored); 02301 IAssert(defaultByteOrder == boMachineEndian || defaultByteOrder == boBigEndian || defaultByteOrder == boLittleEndian); 02302 if (clrDest) dest.Clr(); 02303 size_t nDecoded = 0; 02304 if (srcCount <= 0) return nDecoded; 02305 const size_t origSrcIdx = srcIdx, srcEnd = srcIdx + srcCount; 02306 bool swap = false; 02307 bool isMachineLe = IsMachineLittleEndian(); 02308 bool isDefaultLe = (defaultByteOrder == boLittleEndian || (defaultByteOrder == boMachineEndian && isMachineLe)); 02309 if (bomHandling == bomIgnored) swap = (isDefaultLe != isMachineLe); 02310 else if (bomHandling == bomAllowed || bomHandling == bomRequired) 02311 { 02312 int c = uint(src[TVecIdx(srcIdx)]) & 0xffff; 02313 if (c == 0xfeff) { swap = false; if (skipBom) srcIdx += 1; } 02314 else if (c == 0xfffe) { swap = true; if (skipBom) srcIdx += 1; } 02315 else if (bomHandling == bomAllowed) swap = (isMachineLe != isDefaultLe); 02316 else { // Report an error. 
02317 switch (errorHandling) { 02318 case uehThrow: throw TUnicodeException(srcIdx, c, "BOM expected at the beginning of the input vector (" + TInt::GetStr(c, "%04x") + " found instead)."); 02319 case uehAbort: case uehReplace: case uehIgnore: return size_t(-1); 02320 default: Fail; } } 02321 } 02322 else Fail; 02323 while (srcIdx < srcEnd) 02324 { 02325 const size_t charSrcIdx = srcIdx; 02326 uint c = uint(src[TVecIdx(srcIdx)]) & 0xffffu; srcIdx++; 02327 if (swap) c = ((c >> 8) & 0xff) | ((c & 0xff) << 8); 02328 if (Utf16FirstSurrogate <= c && c <= Utf16FirstSurrogate + 1023) 02329 { 02330 // c is the first character in a surrogate pair. Read the next character. 02331 if (! (srcIdx < srcEnd)) { 02332 switch (errorHandling) { 02333 case uehThrow: throw TUnicodeException(charSrcIdx, c, "The second character of a surrogate pair is missing."); 02334 case uehAbort: return nDecoded; 02335 case uehReplace: dest.Add(TDestCh(replacementChar)); continue; 02336 case uehIgnore: continue; 02337 default: Fail; } } 02338 uint c2 = uint(src[TVecIdx(srcIdx)]) & 0xffffu; srcIdx++; 02339 if (swap) c2 = ((c2 >> 8) & 0xff) | ((c2 & 0xff) << 8); 02340 // c2 should be the second character of the surrogate pair. 02341 if (c2 < Utf16SecondSurrogate || Utf16SecondSurrogate + 1023 < c2) { 02342 switch (errorHandling) { 02343 case uehThrow: throw TUnicodeException(charSrcIdx + 1, c2, "The second character of a surrogate pair should be in the range " + TInt::GetStr(Utf16SecondSurrogate, "%04x") + ".." + TInt::GetStr(Utf16SecondSurrogate + 1023, "%04x") + ", not " + TInt::GetStr(c2, "%04x") + "."); 02344 case uehAbort: return nDecoded; 02345 // with uehReplace and uehIgnore, we'll just skip the first character of the surrogate pair; we'll process the second one during the next iteration, this time as an ordinary character 02346 case uehReplace: dest.Add(TDestCh(replacementChar)); srcIdx -= 1; continue; 02347 case uehIgnore: srcIdx -= 1; continue; 02348 default: Fail; } } 02349 // c and c2 each contain 10 bits of information. 02350 uint cc = ((c - Utf16FirstSurrogate) << 10) | (c2 - Utf16SecondSurrogate); 02351 cc += 0x10000; 02352 dest.Add(TDestCh(cc)); nDecoded++; continue; 02353 } 02354 else if (strict && Utf16SecondSurrogate <= c && c <= Utf16SecondSurrogate + 1023) { 02355 switch (errorHandling) { 02356 case uehThrow: throw TUnicodeException(charSrcIdx, c, "This 16-bit value should be used only as the second character of a surrogate pair."); 02357 case uehAbort: return nDecoded; 02358 case uehReplace: dest.Add(TDestCh(replacementChar)); continue; 02359 case uehIgnore: continue; 02360 default: Fail; } } 02361 // If 'c' is the first character in the input stream, and it's a BOM, we might have to skip it. 02362 if (charSrcIdx == origSrcIdx && (c == 0xfffeu || c == 0xfeffu) && skipBom) continue; 02363 // Otherwise, store 'c' to the destination vector. 02364 dest.Add(TDestCh(c)); nDecoded++; 02365 } 02366 return nDecoded; 02367 } 02368 02369 //----------------------------------------------------------------------- 02370 // TUniCodec -- UTF-16 Encoder 02371 //----------------------------------------------------------------------- 02372 02373 // Returns the number of characters that have been successfully encoded. 02374 // This does not include any replacement characters that may have been inserted into 'dest'.
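As with UTF-8, a short editorial sketch (not part of unicode.h) of the word-oriented encoder defined below; note that codepoints above 0xffff are written as surrogate pairs, so the output may contain more words than characters.

// Editorial sketch: encode codepoints into UTF-16 words, prepending a BOM.
inline void ExampleEncodeUtf16(const TIntV& codepoints)
{
  TUniCodec codec;
  TIntV words;   // one 16-bit word per element
  size_t nOk = codec.EncodeUtf16ToWords(codepoints, 0, codepoints.Len(), words,
    /*clrDest=*/true, /*insertBom=*/true, boLittleEndian);
  printf("%d characters written (the BOM counts as one), %d words in total\n",
    int(nOk), words.Len());
}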
02375 template&lt;typename TSrcVec, typename TDestCh&gt; 02376 size_t TUniCodec::EncodeUtf16ToWords( 02377 const TSrcVec& src, size_t srcIdx, const size_t srcCount, 02378 TVec&lt;TDestCh&gt;& dest, const bool clrDest, const bool insertBom, 02379 const TUniByteOrder destByteOrder) const 02380 { 02381 bool isMachineLe = IsMachineLittleEndian(); 02382 bool swap = (destByteOrder == boLittleEndian && ! isMachineLe) || (destByteOrder == boBigEndian && isMachineLe); 02383 size_t nEncoded = 0, srcEnd = srcIdx + srcCount; if (clrDest) dest.Clr(); 02384 if (insertBom) { dest.Add(TDestCh(swap ? 0xfffeu : 0xfeffu)); nEncoded++; } 02385 while (srcIdx < srcEnd) 02386 { 02387 uint c = uint(src[TVecIdx(srcIdx)]); srcIdx++; 02388 if (! (c <= 0x10ffffu)) { 02389 switch (errorHandling) { 02390 case uehThrow: throw TUnicodeException(srcIdx - 1, c, "UTF-16 only supports characters in the range 0..10ffff (not " + TUInt::GetStr(c, "%08x") + ")."); 02391 case uehAbort: return nEncoded; 02392 case uehReplace: dest.Add(TDestCh(swap ? SwapBytes(replacementChar) : replacementChar)); continue; 02393 case uehIgnore: continue; 02394 default: Fail; } } 02395 if (Utf16FirstSurrogate <= c && c < Utf16FirstSurrogate + 1023) { 02396 switch (errorHandling) { 02397 case uehThrow: throw TUnicodeException(srcIdx - 1, c, "UTF-16 cannot encode " + TUInt::GetStr(c, "%04x") + " as it belongs to the first surrogate range (" + TUInt::GetStr(Utf16FirstSurrogate, "%04x") + ".." + TUInt::GetStr(Utf16FirstSurrogate + 1023, "%04x") + ")."); 02398 case uehAbort: return nEncoded; 02399 case uehReplace: dest.Add(TDestCh(swap ? SwapBytes(replacementChar) : replacementChar)); continue; 02400 case uehIgnore: continue; 02401 default: Fail; } } 02402 if (Utf16SecondSurrogate <= c && c < Utf16SecondSurrogate + 1023) { 02403 switch (errorHandling) { 02404 case uehThrow: throw TUnicodeException(srcIdx - 1, c, "The character " + TUInt::GetStr(c, "%04x") + " belongs to the second surrogate range (" + TUInt::GetStr(Utf16SecondSurrogate, "%04x") + ".." + TUInt::GetStr(Utf16SecondSurrogate + 1023, "%04x") + "), which is not allowed with strict == true."); 02405 case uehAbort: return nEncoded; 02406 case uehReplace: dest.Add(TDestCh(swap ? SwapBytes(replacementChar) : replacementChar)); continue; 02407 case uehIgnore: continue; 02408 default: Fail; } } 02409 // If c is <= 0xffff, it can be stored directly. 02410 if (c <= 0xffffu) { 02411 if (swap) c = ((c >> 8) & 0xff) | ((c & 0xff) << 8); 02412 dest.Add(TDestCh(c)); nEncoded++; continue; } 02413 // Otherwise, represent c by a pair of surrogate characters. 02414 c -= 0x10000u; IAssert(/*0 <= c &&*/ c <= 0xfffffu); 02415 uint c1 = (c >> 10) & 1023, c2 = c & 1023; 02416 c1 += Utf16FirstSurrogate; c2 += Utf16SecondSurrogate; 02417 if (swap) { 02418 c1 = ((c1 >> 8) & 0xff) | ((c1 & 0xff) << 8); 02419 c2 = ((c2 >> 8) & 0xff) | ((c2 & 0xff) << 8); } 02420 dest.Add(TDestCh(c1)); 02421 dest.Add(TDestCh(c2)); 02422 nEncoded++; continue; 02423 } 02424 return nEncoded; 02425 } 02426 02427 template&lt;typename TSrcVec, typename TDestCh&gt; 02428 size_t TUniCodec::EncodeUtf16ToBytes( 02429 const TSrcVec& src, size_t srcIdx, const size_t srcCount, 02430 TVec&lt;TDestCh&gt;& dest, const bool clrDest, const bool insertBom, 02431 const TUniByteOrder destByteOrder) const 02432 { 02433 bool isDestLe = (destByteOrder == boLittleEndian || (destByteOrder == boMachineEndian && IsMachineLittleEndian())); 02434 size_t nEncoded = 0, srcEnd = srcIdx + srcCount; if (clrDest) dest.Clr(); 02435 if (insertBom) { dest.Add(isDestLe ? 0xff : 0xfe); dest.Add(isDestLe ? 
0xfe : 0xff); nEncoded++; } 02436 while (srcIdx < srcEnd) 02437 { 02438 uint c = uint(src[TVecIdx(srcIdx)]); srcIdx++; 02439 if (! (c <= 0x10ffffu)) { 02440 switch (errorHandling) { 02441 case uehThrow: throw TUnicodeException(srcIdx - 1, c, "UTF-16 only supports characters in the range 0..10ffff (not " + TUInt::GetStr(c, "%08x") + ")."); 02442 case uehAbort: return nEncoded; 02443 #define ___OutRepl if (isDestLe) { dest.Add(replacementChar & 0xff); dest.Add((replacementChar >> 8) & 0xff); } else { dest.Add((replacementChar >> 8) & 0xff); dest.Add(replacementChar & 0xff); } 02444 case uehReplace: ___OutRepl; continue; 02445 case uehIgnore: continue; 02446 default: Fail; } } 02447 if (Utf16FirstSurrogate <= c && c < Utf16FirstSurrogate + 1023) { 02448 switch (errorHandling) { 02449 case uehThrow: throw TUnicodeException(srcIdx - 1, c, "UTF-16 cannot encode " + TUInt::GetStr(c, "%04x") + " as it belongs to the first surrogate range (" + TUInt::GetStr(Utf16FirstSurrogate, "%04x") + ".." + TUInt::GetStr(Utf16FirstSurrogate + 1023, "%04x") + ")."); 02450 case uehAbort: return nEncoded; 02451 case uehReplace: ___OutRepl; continue; 02452 case uehIgnore: continue; 02453 default: Fail; } } 02454 if (Utf16SecondSurrogate <= c && c < Utf16SecondSurrogate + 1023) { 02455 switch (errorHandling) { 02456 case uehThrow: throw TUnicodeException(srcIdx - 1, c, "The character " + TUInt::GetStr(c, "%04x") + " belongs to the second surrogate range (" + TUInt::GetStr(Utf16SecondSurrogate, "%04x") + ".." + TUInt::GetStr(Utf16SecondSurrogate + 1023, "%04x") + "), which is not allowed with strict == true."); 02457 case uehAbort: return nEncoded; 02458 case uehReplace: ___OutRepl; continue; 02459 case uehIgnore: continue; 02460 default: Fail; } } 02461 #undef ___OutRepl 02462 // If c is <= 0xffff, it can be stored directly. 02463 if (c <= 0xffffu) { 02464 if (isDestLe) { dest.Add(c & 0xff); dest.Add((c >> 8) & 0xff); } 02465 else { dest.Add((c >> 8) & 0xff); dest.Add(c & 0xff); } 02466 nEncoded++; continue; } 02467 // Otherwise, represent c by a pair of surrogate characters. 02468 c -= 0x10000u; IAssert(/*0 <= c &&*/ c <= 0xfffffu); 02469 uint c1 = (c >> 10) & 1023, c2 = c & 1023; 02470 c1 += Utf16FirstSurrogate; c2 += Utf16SecondSurrogate; 02471 if (isDestLe) { dest.Add(c1 & 0xff); dest.Add((c1 >> 8) & 0xff); dest.Add(c2 & 0xff); dest.Add((c2 >> 8) & 0xff); } 02472 else { dest.Add((c1 >> 8) & 0xff); dest.Add(c1 & 0xff); dest.Add((c2 >> 8) & 0xff); dest.Add(c2 & 0xff); } 02473 nEncoded++; continue; 02474 } 02475 return nEncoded; 02476 } 02477 02478 //----------------------------------------------------------------------------- 02479 // TUniChDb -- word boundaries 02480 //----------------------------------------------------------------------------- 02481 02482 template&lt;typename TSrcVec&gt; 02483 bool TUniChDb::FindNextWordBoundary(const TSrcVec& src, const size_t srcIdx, const size_t srcCount, size_t &position) const 02484 { 02485 // WB1. Break at the start of text. 02486 if (position < srcIdx) { position = srcIdx; return true; } 02487 // If we are beyond the end of the text, there aren't any word breaks left. 02488 const size_t srcEnd = srcIdx + srcCount; 02489 if (position >= srcEnd) return false; 02490 // If 'position' is currently at an ignored character, move it back to the last nonignored character. 02491 size_t origPos = position; 02492 if (IsWbIgnored(src[TVecIdx(position)])) { 02493 if (! 
WbFindPrevNonIgnored(src, srcIdx, position)) 02494 position = origPos; 02495 } 02496 // Determine the previous nonignored character (before 'position'). 02497 size_t posPrev = position; 02498 if (! WbFindPrevNonIgnored(src, srcIdx, posPrev)) posPrev = position; 02499 // Sec 6.2. Allow a break between Sep and an ignored character. 02500 if (position == origPos && position + 1 < srcEnd && IsSbSep(src[TVecIdx(position)]) && IsWbIgnored(src[TVecIdx(position + 1)])) { position += 1; return true; } 02501 // Determine the next nonignored character (after 'position'). 02502 size_t posNext = position; WbFindNextNonIgnored(src, posNext, srcEnd); 02503 size_t posNext2; 02504 int cPrev = (posPrev < position ? (int) src[TVecIdx(posPrev)] : -1), cCur = (position < srcEnd ? (int) src[TVecIdx(position)] : -1); 02505 int cNext = (position < posNext && posNext < srcEnd ? (int) src[TVecIdx(posNext)] : -1); 02506 int wbfPrev = GetWbFlags(cPrev), wbfCur = GetWbFlags(cCur), wbfNext = GetWbFlags(cNext); 02507 int cNext2, wbfNext2; 02508 // 02509 for ( ; position < srcEnd; posPrev = position, position = posNext, posNext = posNext2, 02510 cPrev = cCur, cCur = cNext, cNext = cNext2, 02511 wbfPrev = wbfCur, wbfCur = wbfNext, wbfNext = wbfNext2) 02512 { 02513 // Should there be a word boundary between 'position' and 'posNext' (or, more accurately, 02514 // between src[posNext - 1] and src[posNext] --- any ignored characters between 'position' 02515 // and 'posNext' are considered to belong to the previous character ('position'), not to the next one)? 02516 posNext2 = posNext; WbFindNextNonIgnored(src, posNext2, srcEnd); 02517 cNext2 = (posNext < posNext2 && posNext2 < srcEnd ? (int) src[TVecIdx(posNext2)] : -1); 02518 wbfNext2 = GetWbFlags(cNext2); 02519 #define TestCurNext(curFlag, nextFlag) if ((wbfCur & curFlag) == curFlag && (wbfNext & nextFlag) == nextFlag) continue 02520 #define TestCurNext2(curFlag, nextFlag, next2Flag) if ((wbfCur & curFlag) == curFlag && (wbfNext & nextFlag) == nextFlag && (wbfNext2 & next2Flag) == next2Flag) continue 02521 #define TestPrevCurNext(prevFlag, curFlag, nextFlag) if ((wbfPrev & prevFlag) == prevFlag && (wbfCur & curFlag) == curFlag && (wbfNext & nextFlag) == nextFlag) continue 02522 // WB3. Do not break within CRLF. 02523 if (cCur == 13 && cNext == 10) continue; 02524 // WB5. Do not break between most letters. 02525 TestCurNext(ucfWbALetter, ucfWbALetter); 02526 // WB6. Do not break letters across certain punctuation. 02527 TestCurNext2(ucfWbALetter, ucfWbMidLetter, ucfWbALetter); 02528 // WB7. Do not break letters across certain punctuation. 02529 TestPrevCurNext(ucfWbALetter, ucfWbMidLetter, ucfWbALetter); 02530 // WB8. Do not break within sequences of digits, or digits adjacent to letters. 02531 TestCurNext(ucfWbNumeric, ucfWbNumeric); 02532 // WB9. Do not break within sequences of digits, or digits adjacent to letters. 02533 TestCurNext(ucfWbALetter, ucfWbNumeric); 02534 // WB10. Do not break within sequences of digits, or digits adjacent to letters. 02535 TestCurNext(ucfWbNumeric, ucfWbALetter); 02536 // WB11. Do not break within sequences, such as "3.2" or "3.456,789". 02537 TestPrevCurNext(ucfWbNumeric, ucfWbMidNum, ucfWbNumeric); 02538 // WB12. Do not break within sequences, such as "3.2" or "3.456,789". 02539 TestCurNext2(ucfWbNumeric, ucfWbMidNum, ucfWbNumeric); 02540 // WB13. Do not break between Katakana. 02541 TestCurNext(ucfWbKatakana, ucfWbKatakana); 02542 // WB13a. Do not break from extenders. 
02543 if ((wbfCur & (ucfWbALetter | ucfWbNumeric | ucfWbKatakana | ucfWbExtendNumLet)) != 0 && 02544 (wbfNext & ucfWbExtendNumLet) == ucfWbExtendNumLet) continue; 02545 // WB13b. Do not break from extenders. 02546 if ((wbfCur & ucfWbExtendNumLet) == ucfWbExtendNumLet && 02547 (wbfNext & (ucfWbALetter | ucfWbNumeric | ucfWbKatakana)) != 0) continue; 02548 // WB14. Otherwise, break everywhere. 02549 position = posNext; return true; 02550 #undef TestCurNext 02551 #undef TestCurNext2 02552 #undef TestPrevCurNext 02553 } 02554 // WB2. Break at the end of text. 02555 IAssert(position == srcEnd); 02556 return true; 02557 } 02558 02559 // ToDo: provide a more efficient implementation of this. 02560 template&lt;typename TSrcVec&gt; 02561 void TUniChDb::FindWordBoundaries(const TSrcVec& src, const size_t srcIdx, const size_t srcCount, TBoolV& dest) const 02562 { 02563 if (size_t(dest.Len()) != srcCount + 1) dest.Gen(TVecIdx(srcCount + 1)); 02564 dest.PutAll(false); 02565 size_t position = srcIdx; 02566 dest[TVecIdx(position - srcIdx)] = true; 02567 while (position < srcIdx + srcCount) 02568 { 02569 size_t oldPos = position; 02570 FindNextWordBoundary(src, srcIdx, srcCount, position); 02571 Assert(oldPos < position); Assert(position <= srcIdx + srcCount); 02572 dest[TVecIdx(position - srcIdx)] = true; 02573 } 02574 Assert(dest[TVecIdx(srcCount)]); 02575 } 02576 02577 //----------------------------------------------------------------------------- 02578 // TUniChDb -- sentence boundaries 02579 //----------------------------------------------------------------------------- 02580 02581 template&lt;typename TSrcVec&gt; 02582 bool TUniChDb::CanSentenceEndHere(const TSrcVec& src, const size_t srcIdx, const size_t position) const 02583 { 02584 if (sbExTrie.Empty()) return true; 02585 // We'll move back from the position where a sentence-boundary is being considered. 02586 size_t pos = position; 02587 if (! WbFindPrevNonIgnored(src, srcIdx, pos)) return true; 02588 int c = (int) src[TVecIdx(pos)]; int sfb = GetSbFlags(c); 02589 // - Skip the Sep, if there is one. 02590 if ((sfb & ucfSbSep) == ucfSbSep) { 02591 if (! WbFindPrevNonIgnored(src, srcIdx, pos)) return true; 02592 c = (int) src[TVecIdx(pos)]; sfb = GetSbFlags(c); } 02593 // - Skip any Sp characters. 02594 while ((sfb & ucfSbSp) == ucfSbSp) { 02595 if (! WbFindPrevNonIgnored(src, srcIdx, pos)) return true; 02596 c = (int) src[TVecIdx(pos)]; sfb = GetSbFlags(c); } 02597 // - Skip any Close characters. 02598 while ((sfb & ucfSbClose) == ucfSbClose) { 02599 if (! WbFindPrevNonIgnored(src, srcIdx, pos)) return true; 02600 c = (int) src[TVecIdx(pos)]; sfb = GetSbFlags(c); } 02601 // - Skip any ATerm | STerm characters. 02602 while ((sfb & (ucfSbATerm | ucfSbSTerm)) != 0) { 02603 if (! WbFindPrevNonIgnored(src, srcIdx, pos)) return true; 02604 c = (int) src[TVecIdx(pos)]; sfb = GetSbFlags(c); } 02605 // Now start moving through the trie. 02606 int cLast = c, cButLast = -1, cButButLast = -1, len = 1, node = -1; 02607 while (true) 02608 { 02609 bool atEnd = (! WbFindPrevNonIgnored(src, srcIdx, pos)); 02610 c = (atEnd ? -1 : (int) src[TVecIdx(pos)]); 02611 TUniChCategory cat = GetCat(c); 02612 if (atEnd || ! (cat == ucLetter || cat == ucNumber || cat == ucSymbol)) { 02613 // Check if the suffix we've read so far is one of those that appear in the trie. 02614 if (len == 1) return ! sbExTrie.Has1Gram(cLast); 02615 if (len == 2) return ! 
sbExTrie.Has2Gram(cLast, cButLast); 02616 IAssert(len >= 3); IAssert(node >= 0); 02617 if (sbExTrie.IsNodeTerminal(node)) return false; 02618 if (atEnd) return true; } 02619 if (len == 1) { cButLast = c; len++; } 02620 else if (len == 2) { cButButLast = c; len++; 02621 // Now we have read the last three characters; start descending the suitable subtrie. 02622 node = sbExTrie.Get3GramRoot(cLast, cButLast, cButButLast); 02623 if (node < 0) return true; } 02624 else { 02625 // Descend down the trie. 02626 node = sbExTrie.GetChild(node, c); 02627 if (node < 0) return true; } 02628 } 02629 //return true; 02630 } 02631 02632 template&lt;typename TSrcVec&gt; 02633 bool TUniChDb::FindNextSentenceBoundary(const TSrcVec& src, const size_t srcIdx, const size_t srcCount, size_t &position) const 02634 { 02635 // SB1. Break at the start of text. 02636 if (position < srcIdx) { position = srcIdx; return true; } 02637 // If we are beyond the end of the text, there aren't any sentence breaks left. 02638 const size_t srcEnd = srcIdx + srcCount; 02639 if (position >= srcEnd) return false; 02640 // If 'position' is currently at an ignored character, move it back to the last nonignored character. 02641 size_t origPos = position; 02642 if (IsWbIgnored(src[TVecIdx(position)])) { 02643 if (! WbFindPrevNonIgnored(src, srcIdx, position)) 02644 position = origPos; 02645 } 02646 // Determine the previous nonignored character (before 'position'). 02647 size_t posPrev = position; 02648 if (! WbFindPrevNonIgnored(src, srcIdx, posPrev)) posPrev = position; 02649 // Sec 6.2. Allow a break between Sep and an ignored character. 02650 if (position == origPos && position + 1 < srcEnd && IsSbSep(src[TVecIdx(position)]) && IsWbIgnored(src[TVecIdx(position + 1)])) { position += 1; return true; } 02651 // Determine the next nonignored character (after 'position'). 02652 size_t posNext = position; WbFindNextNonIgnored(src, posNext, srcEnd); 02653 size_t posNext2; 02654 int cPrev = (posPrev < position ? (int) src[TVecIdx(posPrev)] : -1), cCur = (position < srcEnd ? (int) src[TVecIdx(position)] : -1); 02655 int cNext = (position < posNext && posNext < srcEnd ? (int) src[TVecIdx(posNext)] : -1); 02656 int sbfPrev = GetSbFlags(cPrev), sbfCur = GetSbFlags(cCur), sbfNext = GetSbFlags(cNext); 02657 int cNext2, sbfNext2; 02658 // Initialize the state of the peek-back automaton. 02659 typedef enum { stInit, stATerm, stATermSp, stATermSep, stSTerm, stSTermSp, stSTermSep } TPeekBackState; 02660 TPeekBackState backState; 02661 { 02662 size_t pos = position; 02663 bool wasSep = false, wasSp = false, wasATerm = false, wasSTerm = false; 02664 while (true) 02665 { 02666 if (! WbFindPrevNonIgnored(src, srcIdx, pos)) break; 02667 // Skip at most one Sep. 02668 int cp = (int) src[TVecIdx(pos)]; int sbf = GetSbFlags(cp); 02669 if ((sbf & ucfSbSep) == ucfSbSep) { 02670 wasSep = true; 02671 if (! WbFindPrevNonIgnored(src, srcIdx, pos)) break; 02672 cp = (int) src[TVecIdx(pos)]; sbf = GetSbFlags(cp); } 02673 // Skip zero or more Sp's. 02674 bool stop = false; 02675 while ((sbf & ucfSbSp) == ucfSbSp) { 02676 wasSp = true; 02677 if (! WbFindPrevNonIgnored(src, srcIdx, pos)) { stop = true; break; } 02678 cp = (int) src[TVecIdx(pos)]; sbf = GetSbFlags(cp); } 02679 if (stop) break; 02680 // Skip zero or more Close's. 02681 while ((sbf & ucfSbClose) == ucfSbClose) { 02682 if (! WbFindPrevNonIgnored(src, srcIdx, pos)) { stop = true; break; } 02683 cp = (int) src[TVecIdx(pos)]; sbf = GetSbFlags(cp); } 02684 if (stop) break; 02685 // Process an ATerm or STerm. 
02686 wasATerm = ((sbf & ucfSbATerm) == ucfSbATerm); 02687 wasSTerm = ((sbf & ucfSbSTerm) == ucfSbSTerm); 02688 break; 02689 } 02690 if (wasATerm) backState = (wasSep ? stATermSep : wasSp ? stATermSp : stATerm); 02691 else if (wasSTerm) backState = (wasSep ? stSTermSep : wasSp ? stSTermSp : stSTerm); 02692 else backState = stInit; 02693 } 02694 // Initialize the state of the peek-ahead automaton. This state tells us what follows 02695 // after we skip all contiguous characters from the complement of the set {OLetter, Upper, Lower, Sep, STerm, ATerm}. 02696 // Thus, the next character is either OLetter, Upper, Lower, Sep, STerm, ATerm, or the end of the input string. 02697 // Our peek-ahead automaton must tell us whether it is Lower or something else. 02698 typedef enum { stUnknown, stLower, stNotLower } TPeekAheadState; 02699 TPeekAheadState aheadState = stUnknown; 02700 // 02701 for ( ; position < srcEnd; posPrev = position, position = posNext, posNext = posNext2, 02702 cPrev = cCur, cCur = cNext, cNext = cNext2, 02703 sbfPrev = sbfCur, sbfCur = sbfNext, sbfNext = sbfNext2) 02704 { 02705 // Should there be a sentence boundary between 'position' and 'posNext' (or, more accurately, 02706 // between src[posNext - 1] and src[posNext] --- any ignored characters between 'position' 02707 // and 'posNext' are considered to belong to the previous character ('position'), not to the next one)? 02708 posNext2 = posNext; WbFindNextNonIgnored(src, posNext2, srcEnd); 02709 cNext2 = (posNext < posNext2 && posNext2 < srcEnd ? (int) src[TVecIdx(posNext2)] : -1); 02710 sbfNext2 = GetSbFlags(cNext2); 02711 // Update the peek-back automaton. 02712 #define TestCur(curFlag) ((sbfCur & ucfSb##curFlag) == ucfSb##curFlag) 02713 #define Trans(curFlag, newState) if (TestCur(curFlag)) { backState = st##newState; break; } 02714 switch (backState) { 02715 case stInit: Trans(ATerm, ATerm); Trans(STerm, STerm); break; 02716 case stATerm: Trans(Sp, ATermSp); Trans(Sep, ATermSep); Trans(ATerm, ATerm); Trans(STerm, STerm); Trans(Close, ATerm); backState = stInit; break; 02717 case stSTerm: Trans(Sp, STermSp); Trans(Sep, STermSep); Trans(ATerm, ATerm); Trans(STerm, STerm); Trans(Close, STerm); backState = stInit; break; 02718 case stATermSp: Trans(Sp, ATermSp); Trans(Sep, ATermSep); Trans(ATerm, ATerm); Trans(STerm, STerm); backState = stInit; break; 02719 case stSTermSp: Trans(Sp, STermSp); Trans(Sep, STermSep); Trans(ATerm, ATerm); Trans(STerm, STerm); backState = stInit; break; 02720 case stATermSep: Trans(ATerm, ATerm); Trans(STerm, STerm); backState = stInit; break; 02721 case stSTermSep: Trans(ATerm, ATerm); Trans(STerm, STerm); backState = stInit; break; 02722 default: IAssert(false); } 02723 #undef Trans 02724 #undef TestCur 02725 // Update the peek-ahead automaton. 02726 #define IsPeekAheadSkippable(sbf) ((sbf & (ucfSbOLetter | ucfSbUpper | ucfSbLower | ucfSbSep | ucfSbSTerm | ucfSbATerm)) == 0) 02727 if (! IsPeekAheadSkippable(sbfCur)) { 02728 bool isLower = ((sbfCur & ucfSbLower) == ucfSbLower); 02729 if (aheadState == stLower) IAssert(isLower); 02730 else if (aheadState == stNotLower) IAssert(! isLower); 02731 // We haven't peeked ahead farther than this so far -- invalidate the state. 02732 aheadState = stUnknown; } 02733 if (aheadState == stUnknown) 02734 { 02735 // Peek ahead to the next non-peekahead-skippable character. 02736 size_t pos = posNext; 02737 while (pos < srcEnd) { 02738 int cp = (int) src[TVecIdx(pos)]; int sbf = GetSbFlags(cp); 02739 if (! 
IsPeekAheadSkippable(sbf)) { 02740 if ((sbf & ucfSbLower) == ucfSbLower) aheadState = stLower; 02741 else aheadState = stNotLower; 02742 break; } 02743 WbFindNextNonIgnored(src, pos, srcEnd); } 02744 if (! (pos < srcEnd)) aheadState = stNotLower; 02745 } 02746 #undef IsPeekAheadSkippable 02747 // 02748 #define TestCurNext(curFlag, nextFlag) if ((sbfCur & curFlag) == curFlag && (sbfNext & nextFlag) == nextFlag) continue 02749 #define TestCurNext2(curFlag, nextFlag, next2Flag) if ((sbfCur & curFlag) == curFlag && (sbfNext & nextFlag) == nextFlag && (sbfNext2 & next2Flag) == next2Flag) continue 02750 #define TestPrevCurNext(prevFlag, curFlag, nextFlag) if ((sbfPrev & prevFlag) == prevFlag && (sbfCur & curFlag) == curFlag && (sbfNext & nextFlag) == nextFlag) continue 02751 // SB3. Do not break within CRLF. 02752 if (cCur == 13 && cNext == 10) continue; 02753 // SB4. Break after paragraph separators. 02754 if ((sbfCur & ucfSbSep) == ucfSbSep) { 02755 if (! CanSentenceEndHere(src, srcIdx, position)) continue; 02756 position = posNext; return true; } 02757 // Do not break after ambiguous terminators like period, if they are immediately followed by a number 02758 // or lowercase letter, if they are between uppercase letters, or if the first following letter 02759 // (optionally after certain punctuation) is lowercase. For example, a period may be an abbreviation 02760 // or numeric period, and thus may not mark the end of a sentence. 02761 TestCurNext(ucfSbATerm, ucfSbNumeric); // SB6 02762 TestPrevCurNext(ucfSbUpper, ucfSbATerm, ucfSbUpper); // SB7 02763 // SB8a. (STerm | ATerm) Close* Sp* [do not break] (STerm | ATerm) 02764 if ((backState == stATerm || backState == stATermSp || backState == stSTerm || backState == stSTermSp) && 02765 (sbfNext & (ucfSbSTerm | ucfSbATerm)) != 0) continue; 02766 // SB8*. ATerm Close* Sp* [do not break] ( ! (OLetter | Upper | Lower | Sep | STerm | ATerm) )* Lower 02767 if ((backState == stATerm || backState == stATermSp) && aheadState == stLower) continue; 02768 // Break after sentence terminators, but include closing punctuation, trailing spaces, and a paragraph separator (if present). 02769 // SB9. ( STerm | ATerm ) Close* [do not break] ( Close | Sp | Sep ) 02770 if ((backState == stATerm || backState == stSTerm) && (sbfNext & (ucfSbClose | ucfSbSp | ucfSbSep)) != 0) continue; 02771 // SB10. ( STerm | ATerm ) Close* Sp* [do not break] ( Sp | Sep ) 02772 // SB11*. ( STerm | ATerm ) Close* Sp* Sep? [do break] 02773 if (backState == stATerm || backState == stATermSp || backState == stATermSep || backState == stSTerm || backState == stSTermSp || backState == stSTermSep) { 02774 if ((sbfNext & (ucfSbSp | ucfSbSep)) != 0) continue; // SB10 02775 if (! CanSentenceEndHere(src, srcIdx, position)) continue; 02776 position = posNext; return true; } // SB11 02777 // SB12. Otherwise, do not break. 02778 continue; 02779 #undef TestCurNext 02780 #undef TestCurNext2 02781 #undef TestPrevCurNext 02782 } 02783 // SB2. Break at the end of text. 02784 IAssert(position == srcEnd); 02785 return true; 02786 } 02787 02788 // ToDo: provide a more efficient implementation of this. 
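The ToDo above refers to the FindSentenceBoundaries helper that follows; before it, a small editorial sketch (not part of the original source) of the usual calling pattern for FindNextSentenceBoundary, which is also what that helper does internally.

// Editorial sketch: print the sentence-boundary offsets of a decoded codepoint vector.
// 'ucd' stands for an already initialised TUniChDb; loading the database is not shown here.
inline void ExampleSentenceOffsets(const TUniChDb& ucd, const TIntV& codepoints)
{
  size_t position = 0;   // SB1: there is always a boundary at the start of the text
  printf("boundary at 0\n");
  while (position < size_t(codepoints.Len())) {
    ucd.FindNextSentenceBoundary(codepoints, 0, codepoints.Len(), position);
    printf("boundary at %d\n", int(position));   // offsets are in codepoints, not bytes
  }
}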
02789 template<typename TSrcVec> 02790 void TUniChDb::FindSentenceBoundaries(const TSrcVec& src, const size_t srcIdx, const size_t srcCount, TBoolV& dest) const 02791 { 02792 if (size_t(dest.Len()) != srcCount + 1) dest.Gen(TVecIdx(srcCount + 1)); 02793 dest.PutAll(false); 02794 size_t position = srcIdx; 02795 dest[TVecIdx(position - srcIdx)] = true; 02796 while (position < srcIdx + srcCount) 02797 { 02798 size_t oldPos = position; 02799 FindNextSentenceBoundary(src, srcIdx, srcCount, position); 02800 Assert(oldPos < position); Assert(position <= srcIdx + srcCount); 02801 dest[TVecIdx(position - srcIdx)] = true; 02802 } 02803 Assert(dest[TVecIdx(srcCount)]); 02804 } 02805 02806 //----------------------------------------------------------------------------- 02807 // TUniChDb -- case conversions 02808 //----------------------------------------------------------------------------- 02809 02810 template<typename TSrcVec, typename TDestCh> 02811 void TUniChDb::GetCaseConverted(const TSrcVec& src, size_t srcIdx, const size_t srcCount, 02812 TVec<TDestCh>& dest, const bool clrDest, 02813 const TUniChDb::TCaseConversion how, 02814 const bool turkic, const bool lithuanian) const 02815 { 02816 const TIntIntVH &specials = (how == ccUpper ? specialCasingUpper : how == ccLower ? specialCasingLower : how == ccTitle ? specialCasingTitle : *((TIntIntVH *) 0)); 02817 if (clrDest) dest.Clr(); 02818 enum { 02819 GreekCapitalLetterSigma = 0x3a3, 02820 GreekSmallLetterSigma = 0x3c3, 02821 GreekSmallLetterFinalSigma = 0x3c2, 02822 LatinCapitalLetterI = 0x49, 02823 LatinCapitalLetterJ = 0x4a, 02824 LatinCapitalLetterIWithOgonek = 0x12e, 02825 LatinCapitalLetterIWithGrave = 0xcc, 02826 LatinCapitalLetterIWithAcute = 0xcd, 02827 LatinCapitalLetterIWithTilde = 0x128, 02828 LatinCapitalLetterIWithDotAbove = 0x130, 02829 LatinSmallLetterI = 0x69, 02830 CombiningDotAbove = 0x307 02831 }; 02832 // 02833 bool seenCased = false, seenTwoCased = false; int cpFirstCased = -1; 02834 size_t nextWordBoundary = srcIdx; 02835 TBoolV wordBoundaries; bool wbsKnown = false; 02836 for (const size_t origSrcIdx = srcIdx, srcEnd = srcIdx + srcCount; srcIdx < srcEnd; ) 02837 { 02838 int cp = src[TVecIdx(srcIdx)]; srcIdx++; 02839 //if (turkic && cp == 0x130 && how == ccLower) printf("!"); 02840 // For conversion to titlecase, the first cased character of each word 02841 // must be converted to titlecase; everything else must be converted 02842 // to lowercase. 02843 TUniChDb::TCaseConversion howHere; 02844 if (how != ccTitle) howHere = how; 02845 else { 02846 if (srcIdx - 1 == nextWordBoundary) { // A word starts/ends here. 02847 seenCased = false; seenTwoCased = false; cpFirstCased = -1; 02848 size_t next = nextWordBoundary; FindNextWordBoundary(src, origSrcIdx, srcCount, next); 02849 IAssert(next > nextWordBoundary); nextWordBoundary = next; } 02850 bool isCased = IsCased(cp); 02851 if (isCased && ! seenCased) { howHere = ccTitle; seenCased = true; cpFirstCased = cp; } 02852 else { howHere = ccLower; 02853 if (isCased && seenCased) seenTwoCased = true; } 02854 } 02855 // First, process the conditional mappings from SpecialCasing.txt. 02856 // These will be processed in code -- they were ignored while 02857 // we were reading SpecialCasing.txt itself. 02858 if (cp == GreekCapitalLetterSigma && howHere == ccLower) 02859 { 02860 // SpecialCasing.txt mentions the 'FinalSigma' condition, but sec. 3.13 of 02861 // the standard doesn't define it. We'll use FinalCased instead. 
02862 // FinalCased: within the closest word boundaries containing C, 02863 // there is a cased letter before C, and there is no cased letter after C. 02864 //size_t nextBoundary = srcIdx - 1; FindNextWordBoundary(src, srcIdx, srcCount, nextBoundary); 02865 if (! wbsKnown) { FindWordBoundaries(src, origSrcIdx, srcCount, wordBoundaries); wbsKnown = true; } 02866 size_t srcIdx2 = srcIdx; bool casedAfter = false; 02867 if (how == ccTitle) 02868 printf("!"); 02869 //while (srcIdx2 < nextBoundary) 02870 while (! wordBoundaries[TVecIdx(srcIdx2 - origSrcIdx)]) 02871 { 02872 int cp2 = src[TVecIdx(srcIdx2)]; srcIdx2++; 02873 if (IsCased(cp2)) { casedAfter = true; break; } 02874 } 02875 if (! casedAfter) 02876 { 02877 //size_t prevBoundary = srcIdx - 1; 02878 //FindPreviousWordBoundary(src, srcIdx, srcCount, prevBoundary); 02879 srcIdx2 = srcIdx - 1; bool casedBefore = false; 02880 //while (prevBoundary < srcIdx2) 02881 while (! wordBoundaries[TVecIdx(srcIdx2 - origSrcIdx)]) 02882 { 02883 --srcIdx2; int cp2 = src[TVecIdx(srcIdx2)]; 02884 if (IsCased(cp2)) { casedBefore = true; break; } 02885 } 02886 if (casedBefore) { 02887 // Now we have a FinalCased character. 02888 dest.Add(GreekSmallLetterFinalSigma); Assert(howHere == ccLower); continue; } 02889 } 02890 // If we got here, add a non-final sigma. 02891 dest.Add(GreekSmallLetterSigma); continue; 02892 } 02893 else if (lithuanian) 02894 { 02895 if (howHere == ccLower) 02896 { 02897 if (cp == LatinCapitalLetterI || cp == LatinCapitalLetterJ || cp == LatinCapitalLetterIWithOgonek) 02898 { 02899 bool moreAbove = false; 02900 for (size_t srcIdx2 = srcIdx; srcIdx2 < srcEnd; ) 02901 { 02902 const int cp2 = src[TVecIdx(srcIdx2)]; srcIdx2++; 02903 const int cc2 = GetCombiningClass(cp2); 02904 if (cc2 == TUniChInfo::ccStarter) break; 02905 if (cc2 == TUniChInfo::ccAbove) { moreAbove = true; break; } 02906 } 02907 if (moreAbove) 02908 { 02909 if (cp == LatinCapitalLetterI) { dest.Add(0x69); dest.Add(0x307); continue; } 02910 if (cp == LatinCapitalLetterJ) { dest.Add(0x6a); dest.Add(0x307); continue; } 02911 if (cp == LatinCapitalLetterIWithOgonek) { dest.Add(0x12f); dest.Add(0x307); continue; } 02912 } 02913 } 02914 else if (cp == LatinCapitalLetterIWithGrave) { dest.Add(0x69); dest.Add(0x307); dest.Add(0x300); continue; } 02915 else if (cp == LatinCapitalLetterIWithAcute) { dest.Add(0x69); dest.Add(0x307); dest.Add(0x301); continue; } 02916 else if (cp == LatinCapitalLetterIWithTilde) { dest.Add(0x69); dest.Add(0x307); dest.Add(0x303); continue; } 02917 } 02918 if (cp == CombiningDotAbove) 02919 { 02920 // Lithuanian, howHere != ccLower. 02921 // AfterSoftDotted := the last preceding character with a combining class 02922 // of zero before C was Soft_Dotted, and there is no intervening combining 02923 // character class 230 (ABOVE). 02924 bool afterSoftDotted = false; 02925 size_t srcIdx2 = srcIdx - 1; // now srcIdx2 is the index from which we got 'cp' 02926 while (origSrcIdx < srcIdx2) 02927 { 02928 --srcIdx2; int cp2 = src[TVecIdx(srcIdx2)]; 02929 int cc2 = GetCombiningClass(cp2); 02930 if (cc2 == TUniChInfo::ccAbove) break; 02931 if (cc2 == TUniChInfo::ccStarter) { 02932 afterSoftDotted = IsSoftDotted(cp2); break; } 02933 } 02934 if (afterSoftDotted) 02935 { 02936 Assert(lithuanian); 02937 // Remove DOT ABOVE after "i" with upper or titlecase. 02938 // - Note: but this must only be done if that "i" was actually placed into uppercase (if how == ccTitle, 02939 // the "i" may have been kept lowercase and thus we shouldn't remove the dot). 
02940 if (how == ccLower) { dest.Add(0x307); continue; } 02941 if (how == ccUpper) continue; 02942 Assert(how == ccTitle); 02943 Assert(howHere == ccLower); // because CombiningDotAbove is not a cased character 02944 if (seenCased && ! seenTwoCased) continue; // The "i" has been placed into uppercase; thus, remove the dot. 02945 dest.Add(0x307); continue; 02946 } 02947 } 02948 } 02949 else if (turkic) // language code 'tr' (Turkish) and 'az' (Azeri) 02950 { 02951 // I and i-dotless; I-dot and i are case pairs in Turkish and Azeri 02952 // The following rules handle those cases. 02953 if (cp == LatinCapitalLetterIWithDotAbove) { 02954 dest.Add(howHere == ccLower ? 0x69 : 0x130); continue; } 02955 // When lowercasing, remove dot_above in the sequence I + dot_above, 02956 // which will turn into i. This matches the behavior of the 02957 // canonically equivalent I-dot_above. 02958 else if (cp == CombiningDotAbove) 02959 { 02960 // AfterI: the last preceding base character was an uppercase I, 02961 // and there is no intervening combining character class 230 (ABOVE). 02962 bool afterI = false; 02963 size_t srcIdx2 = srcIdx - 1; // now srcIdx2 is the index from which we got 'cp' 02964 while (origSrcIdx < srcIdx2) 02965 { 02966 --srcIdx2; int cp2 = src[TVecIdx(srcIdx2)]; 02967 if (cp2 == LatinCapitalLetterI) { afterI = true; break; } 02968 int cc2 = GetCombiningClass(cp2); 02969 if (cc2 == TUniChInfo::ccAbove || cc2 == TUniChInfo::ccStarter) break; 02970 } 02971 if (afterI) { 02972 if (how == ccTitle && seenCased && ! seenTwoCased) { 02973 // Sec. 3.13 defines title-casing in an unusual way: find the first cased character in each word; 02974 // if found, map it to titlecase; otherwise, map all characters in that word to lowercase. 02975 // This suggests that if a cased character is found, others in that word should be left alone. 02976 // This seems unusual; we map all other characters to lowercase instead. 02977 // But this leads to problems with e.g. I followed by dot-above (U+0307): since the dot-above 02978 // is not the first cased character (it isn't even cased), we attempt to set it to lowercase; 02979 // but since afterI is also true here, this would mean deleting it. Thus our titlecased 02980 // form of "I followed by dot-above" would be just "I", which is clearly wrong. 02981 // So we treat this as a special case here. 02982 IAssert(cpFirstCased == LatinCapitalLetterI); 02983 dest.Add(0x307); continue; } 02984 if (howHere != ccLower) dest.Add(0x307); 02985 continue; } 02986 } 02987 // When lowercasing, unless an I is before a dot_above, 02988 // it turns into a dotless i. 02989 else if (cp == LatinCapitalLetterI) 02990 { 02991 // BeforeDot: C is followed by U+0307 (combining dot above). 02992 // Any sequence of characters with a combining class that is 02993 // neither 0 nor 230 may intervene between the current character 02994 // and the combining dot above. 02995 bool beforeDot = false; 02996 for (size_t srcIdx2 = srcIdx; srcIdx2 < srcEnd; ) 02997 { 02998 const int cp2 = src[TVecIdx(srcIdx2)]; srcIdx2++; 02999 if (cp2 == 0x307) { beforeDot = true; break; } 03000 const int cc2 = GetCombiningClass(cp2); 03001 if (cc2 == TUniChInfo::ccStarter || cc2 == TUniChInfo::ccAbove) break; 03002 } 03003 if (! beforeDot) { 03004 dest.Add(howHere == ccLower ? 0x131 : 0x49); continue; } 03005 } 03006 // When uppercasing, i turns into a dotted capital I. 03007 else if (cp == LatinSmallLetterI) 03008 { 03009 dest.Add(howHere == ccLower ? 
0x69 : 0x130); continue; 03010 } 03011 } 03012 // Try to use the unconditional mappings. 03013 const TIntIntVH &specHere = ( 03014 howHere == how ? specials : 03015 howHere == ccLower ? specialCasingLower : 03016 howHere == ccTitle ? specialCasingTitle : 03017 howHere == ccUpper ? specialCasingUpper : *((TIntIntVH *) 0)); 03018 int i = specHere.GetKeyId(cp); 03019 if (i >= 0) { TUniCaseFolding::AppendVector(specHere[i], dest); continue; } 03020 // Try to use the simple (one-character) mappings. 03021 i = h.GetKeyId(cp); 03022 if (i >= 0) { 03023 const TUniChInfo &ci = h[i]; 03024 int cpNew = ( 03025 howHere == ccLower ? ci.simpleLowerCaseMapping : 03026 howHere == ccUpper ? ci.simpleUpperCaseMapping : 03027 ci.simpleTitleCaseMapping); 03028 if (cpNew < 0) cpNew = cp; 03029 dest.Add(cpNew); continue; } 03030 // As a final resort, leave 'cp' unchanged. 03031 dest.Add(cp); 03032 } 03033 } 03034 03035 template<typename TSrcVec, typename TDestCh> 03036 void TUniChDb::GetSimpleCaseConverted(const TSrcVec& src, size_t srcIdx, const size_t srcCount, 03037 TVec<TDestCh>& dest, const bool clrDest, const TCaseConversion how) const 03038 { 03039 if (clrDest) dest.Clr(); 03040 bool seenCased = false; size_t nextWordBoundary = srcIdx; 03041 for (const size_t origSrcIdx = srcIdx, srcEnd = srcIdx + srcCount; srcIdx < srcEnd; ) 03042 { 03043 const int cp = src[TVecIdx(srcIdx)]; srcIdx++; 03044 int i = h.GetKeyId(cp); if (i < 0) { dest.Add(cp); continue; } 03045 const TUniChInfo &ci = h[i]; 03046 // With titlecasing, the first cased character of each word must be put into titlecase, 03047 // all others into lowercase. This is what the howHere variable is for. 03048 TUniChDb::TCaseConversion howHere; 03049 if (how != ccTitle) howHere = how; 03050 else { 03051 if (srcIdx - 1 == nextWordBoundary) { // A word starts/ends here. 03052 seenCased = false; 03053 size_t next = nextWordBoundary; FindNextWordBoundary(src, origSrcIdx, srcCount, next); 03054 IAssert(next > nextWordBoundary); nextWordBoundary = next; } 03055 bool isCased = IsCased(cp); 03056 if (isCased && ! seenCased) { howHere = ccTitle; seenCased = true; } 03057 else howHere = ccLower; 03058 } 03059 int cpNew = (howHere == ccTitle ? ci.simpleTitleCaseMapping : howHere == ccUpper ? ci.simpleUpperCaseMapping : ci.simpleLowerCaseMapping); 03060 if (cpNew < 0) cpNew = cp; 03061 dest.Add(cpNew); 03062 } 03063 } 03064 03065 template<typename TSrcVec> 03066 void TUniChDb::ToSimpleCaseConverted(TSrcVec& src, size_t srcIdx, const size_t srcCount, const TCaseConversion how) const 03067 { 03068 bool seenCased = false; size_t nextWordBoundary = srcIdx; 03069 for (const size_t origSrcIdx = srcIdx, srcEnd = srcIdx + srcCount; srcIdx < srcEnd; srcIdx++) 03070 { 03071 const int cp = src[TVecIdx(srcIdx)]; 03072 int i = h.GetKeyId(cp); if (i < 0) continue; 03073 const TUniChInfo &ci = h[i]; 03074 // With titlecasing, the first cased character of each word must be put into titlecase, 03075 // all others into lowercase. This is what the howHere variable is for. 03076 TUniChDb::TCaseConversion howHere; 03077 if (how != ccTitle) howHere = how; 03078 else { 03079 if (srcIdx == nextWordBoundary) { // A word starts/ends here. 03080 seenCased = false; 03081 size_t next = nextWordBoundary; FindNextWordBoundary(src, origSrcIdx, srcCount, next); 03082 IAssert(next > nextWordBoundary); nextWordBoundary = next; } 03083 bool isCased = IsCased(cp); 03084 if (isCased && ! 
seenCased) { howHere = ccTitle; seenCased = true; } 03085 else howHere = ccLower; 03086 } 03087 int cpNew = (howHere == ccTitle ? ci.simpleTitleCaseMapping : howHere == ccUpper ? ci.simpleUpperCaseMapping : ci.simpleLowerCaseMapping); 03088 if (cpNew >= 0) src[TVecIdx(srcIdx)] = cpNew; 03089 } 03090 } 03091 03092 //----------------------------------------------------------------------------- 03093 // TUniChDb -- composition, decomposition, normal forms 03094 //----------------------------------------------------------------------------- 03095 03096 template<typename TDestCh> 03097 void TUniChDb::AddDecomposition(const int codePoint, TVec<TDestCh>& dest, const bool compatibility) const 03098 { 03099 if (HangulSBase <= codePoint && codePoint < HangulSBase + HangulSCount) 03100 { 03101 // UAX #15, sec. 16: Hangul decomposition 03102 const int SIndex = codePoint - HangulSBase; 03103 const int L = HangulLBase + SIndex / HangulNCount; 03104 const int V = HangulVBase + (SIndex % HangulNCount) / HangulTCount; 03105 const int T = HangulTBase + (SIndex % HangulTCount); 03106 dest.Add(L); dest.Add(V); 03107 if (T != HangulTBase) dest.Add(T); 03108 return; 03109 } 03110 int i = h.GetKeyId(codePoint); if (i < 0) { dest.Add(codePoint); return; } 03111 const TUniChInfo &ci = h[i]; 03112 int ofs = ci.decompOffset; if (ofs < 0) { dest.Add(codePoint); return; } 03113 if ((! compatibility) && ci.IsCompatibilityDecomposition()) { dest.Add(codePoint); return; } 03114 while (true) { 03115 int cp = decompositions[ofs++]; if (cp < 0) return; 03116 AddDecomposition(cp, dest, compatibility); } 03117 } 03118 03119 template<typename TSrcVec, typename TDestCh> 03120 void TUniChDb::Decompose(const TSrcVec& src, size_t srcIdx, const size_t srcCount, 03121 TVec<TDestCh>& dest, const bool compatibility, bool clrDest) const 03122 { 03123 if (clrDest) dest.Clr(); 03124 const size_t destStart = dest.Len()/*, srcEnd = srcIdx + srcCount*/; 03125 // Decompose the string. 03126 while (srcIdx < srcCount) { 03127 AddDecomposition(src[TVecIdx(srcIdx)], dest, compatibility); srcIdx++; } 03128 // Rearrange the decomposed string into canonical order. 03129 for (size_t destIdx = destStart, destEnd = dest.Len(); destIdx < destEnd; ) 03130 { 03131 size_t j = destIdx; 03132 int cp = dest[TVecIdx(destIdx)]; destIdx++; 03133 int cpCls = GetCombiningClass(cp); 03134 if (cpCls == TUniChInfo::ccStarter) continue; 03135 while (destStart < j && GetCombiningClass(dest[TVecIdx(j - 1)]) > cpCls) { 03136 dest[TVecIdx(j)] = dest[TVecIdx(j - 1)]; j--; } 03137 dest[TVecIdx(j)] = cp; 03138 } 03139 } 03140 03141 template<typename TSrcVec, typename TDestCh> 03142 void TUniChDb::DecomposeAndCompose(const TSrcVec& src, size_t srcIdx, const size_t srcCount, 03143 TVec<TDestCh>& dest, bool compatibility, bool clrDest) const 03144 { 03145 if (clrDest) dest.Clr(); 03146 TIntV temp; 03147 Decompose(src, srcIdx, srcCount, temp, compatibility); 03148 Compose(temp, 0, temp.Len(), dest, clrDest); 03149 } 03150 03151 template<typename TSrcVec, typename TDestCh> 03152 void TUniChDb::Compose(const TSrcVec& src, size_t srcIdx, const size_t srcCount, 03153 TVec<TDestCh>& dest, bool clrDest) const 03154 { 03155 if (clrDest) dest.Clr(); 03156 bool lastStarterKnown = false; // has a starter been encountered yet? 03157 size_t lastStarterPos = size_t(-1); // the index (in 'dest') of the last starter 03158 int cpLastStarter = -1; // the codepoint of the last starter (i.e. 
cpLastStarter == dest[lastStarterPos]) 03159 const size_t srcEnd = srcIdx + srcCount; 03160 int ccMax = -1; // The highest combining class among the characters since the last starter. 03161 while (srcIdx < srcEnd) 03162 { 03163 const int cp = src[TVecIdx(srcIdx)]; srcIdx++; 03164 const int cpClass = GetCombiningClass(cp); 03165 //int cpCombined = -1; 03166 // If there is a starter with which 'cp' can be combined, and from which it is not blocked 03167 // by some intermediate character, we can try to combine them. 03168 if (lastStarterKnown && ccMax < cpClass) 03169 { 03170 int j = inverseDec.GetKeyId(TIntPr(cpLastStarter, cp)); 03171 int cpCombined = -1; 03172 do { 03173 // Try to look up a composition in the inverseDec table. 03174 if (j >= 0) { cpCombined = inverseDec[j]; break; } 03175 // UAX #15, sec. 16: Hangul composition 03176 // - Try to combine L and V. 03177 const int LIndex = cpLastStarter - HangulLBase; 03178 if (0 <= LIndex && LIndex < HangulLCount) { 03179 const int VIndex = cp - HangulVBase; 03180 if (0 <= VIndex && VIndex < HangulVCount) { 03181 cpCombined = HangulSBase + (LIndex * HangulVCount + VIndex) * HangulTCount; 03182 break; } } 03183 // - Try to combine LV and T. 03184 const int SIndex = cpLastStarter - HangulSBase; 03185 if (0 <= SIndex && SIndex < HangulSCount && (SIndex % HangulTCount) == 0) 03186 { 03187 const int TIndex = cp - HangulTBase; 03188 if (0 <= TIndex && TIndex < HangulTCount) { 03189 cpCombined = cpLastStarter + TIndex; 03190 break; } 03191 } 03192 } while (false); 03193 // If a combining character has been found, use it to replace the old cpStarter. 03194 if (cpCombined >= 0) { 03195 dest[TVecIdx(lastStarterPos)] = cpCombined; 03196 Assert(GetCombiningClass(cpCombined) == TUniChInfo::ccStarter); 03197 // if (cpCombined is not a starter) { starterKnown = false; lastStarterPos = size_t(01); cpLastStarter = -1; } else 03198 cpLastStarter = cpCombined; continue; } 03199 } 03200 if (cpClass == TUniChInfo::ccStarter) { // 'cp' is a starter, remember it for later. Set ccMax to -1 so that this starter can be combined with another starter. 03201 lastStarterKnown = true; lastStarterPos = dest.Len(); cpLastStarter = cp; ccMax = cpClass - 1; } 03202 else if (cpClass > ccMax) // Remember cp's class as the new maximum class since the last starter (for blocking). 03203 ccMax = cpClass; 03204 dest.Add(cp); 03205 } 03206 } 03207 03208 template<typename TSrcVec, typename TDestCh> 03209 size_t TUniChDb::ExtractStarters(const TSrcVec& src, size_t srcIdx, const size_t srcCount, 03210 TVec<TDestCh>& dest, bool clrDest) const 03211 { 03212 if (clrDest) dest.Clr(); 03213 size_t retVal = 0; 03214 for (const size_t srcEnd = srcIdx + srcCount; srcIdx < srcEnd; srcIdx++) { 03215 const int cp = src[TVecIdx(srcIdx)]; 03216 if (GetCombiningClass(cp) == TUniChInfo::ccStarter) 03217 { dest.Add(cp); retVal++; } } 03218 return retVal; 03219 } 03220 03221 inline bool AlwaysFalse() 03222 { 03223 int sum = 0; 03224 for (int i = 0; i < 5; i++) sum += i; 03225 return sum > 100; 03226 } 03227 03228 inline bool AlwaysTrue() 03229 { 03230 int sum = 0; 03231 for (int i = 0; i < 5; i++) sum += i; 03232 return sum < 100; 03233 } 03234 03235 /* 03236 03237 Notes on decomposition: 03238 03239 - In UnicodeData.txt, there is a field with the decomposition mapping. 03240 This field may also include a tag, <...>. 03241 If there is a tag, this is a compatibility mapping. 03242 Otherwise it is a canonical mapping. 
03243 - Canonical decomposition uses only canonical mappings, 03244 compatibility decomposition uses both canonical and compatibility mappings. 03245 - Decomposition: 03246 1. Apply the decomposition mappings (canonical or canonical+compatibility), recursively. 03247 2. Put the string into canonical order, which means: 03248 while there exists a pair of characters, A immediately followed by B, 03249 such that combiningclass(A) > combiningclass(B) > 0 [an "exchangeable pair"]: 03250 swap A and B; 03251 This results in NFD (normalized form D, after canonical decomposition) 03252 or NFKD (normalized form KD, after compatibility decomposition). 03253 - Canonical composition: 03254 1. Before composition, the string should have been decomposed 03255 (using either canonical or compatibility decomposition). 03256 2. For each character C (from left to right): 03257 2.1. Find the last starter S before C (if not found, continue). 03258 2.2. If there is, between S and C, some character with a combining class greater than or equal to that of C, then continue. 03259 2.3. If there exists a character L for which the canonical decomposition is S+L 03260 and L is not in the composition exclusion table [i.e. L is a "primary composite"], 03261 then replace S by L, and remove C. 03262 This results in NFC (normalized form C, with canonical decomposition followed by canonical composition) 03263 or NFKC (normalized form KC, with compatibility decomposition followed by canonical composition). 03264 - Composition exclusion table: 03265 - Anything in CompositionExclusions.txt. 03266 - Singletons: characters whose canonical decomposition is a single character. 03267 - Non-starter decompositions: characters whose canonical decomposition begins with a non-starter. 03268 03269 Example: 03270 E-grave (00c8; combining class 0; canonical decomposition: 0045 0300) 03271 E-macron (0112; combining class 0; 0045 0304) 03272 grave (0300; combining class 230) 03273 macron (0304; combining class 230) 03274 source string: 00c8 0304 03275 after canonical decomposition (or compatibility decomposition, they would be the same here): 0045 0300 0304 03276 after canonical composition: 00c8 0304 03277 03278 cc(horn) = 216 03279 cc(dot below) = 220 03280 cc(dot above) = 230 03281 03282 ToDos: 03283 - case folding - it is intended primarily for comparing strings obtained in this way. 03284 The function f(s) = NFC(toCaseFold(s)) is idempotent. 03285 The function g(s) = NFKC(toCaseFold(s)) is not -- if we want that, we have to take 03286 a few additional mappings into account during folding (see 5.18, last paragraph; DerivedNormalizationProps.txt). 03287 - It seems that CaseFolding.txt is essentially just an ordinary folding to lowercase. 03288 Since we also want the other foldings, we should rather look at SpecialCasing.txt 03289 (+ the simple case mappings in UnicodeData.txt). 03290 I suggest that, when reading SpecialCasing.txt, we simply ignore the conditional mappings 03291 and then handle them separately in the source code of our programs [for 03292 the detailed definition of the conditions, see table 3.13]. 03293 - Postscript: still, it seems to me that CaseFolding.txt is slightly different from a plain lowercase mapping. 03294 For example, for the small final sigma 03c2 it specifies a mapping to the ordinary small sigma 03c3. 03295 This follows neither from UnicodeData.txt nor from SpecialCasing.txt, even though UCD.html says 03296 that CaseFolding.txt is derived from them. The main purpose of CaseFolding.txt is supposedly 03297 "locale-independent case folding" (table 4.1 and sec. 5.18).
03298 - Before starting to work on the case conversions, have a look at section 3.13, 03299 and especially p. 90. 03300 - See p. 91 about the combination N[K]FD + caseFold + N[K]FD. 03301 - The definition of cased etc. is on p. 89. 03302 - isIdentifierStart(c), isIdentifierEnd(c) -- sec. 5.15. 03303 See DerivedCoreProperties.txt, where a number of similar things are defined in a similar way, 03304 among them isLowerCase and isUpperCase; isLetter, isAlphabetic etc. are there as well (sec. 4.9). 03305 These are probably best added among the flags of each individual character. 03306 - general category: sec. 4.5 03307 - motivation for titlecase: 5.18 03308 - Compare our current computation of compositionExclusion with what is computed in DerivedNormalizationProps.txt 03309 under Full_Composition_Exclusion. 03310 - script names: Scripts.txt and UAX #24. 03311 - block names: Blocks.txt 03312 - space characters: table 6.2 and reportedly also UCD.html 03313 - dash characters: table 6.3 03314 */ 03315 03316 //#endif 03317
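The following is a minimal usage sketch of the normalization and case-conversion routines defined above. It is not part of the library source: it assumes a TUniChDb instance 'db' whose Unicode tables have already been loaded (the initialization functions are not shown in this section), that the TCaseConversion values are reachable as TUniChDb::ccLower and TUniChDb::ccTitle, and that TIntV is an acceptable source/destination vector, as it is in DecomposeAndCompose above. The function name NormalizationSketch is purely illustrative; the expected results in the comments come from the E-grave example in the notes.

// Sketch only: 'db' is assumed to be a fully loaded TUniChDb; in a real program this
// would be compiled in a translation unit that includes the SNAP headers providing
// TUniChDb and TIntV.
void NormalizationSketch(const TUniChDb& db)
{
  // The example from the notes above: E-grave (U+00C8) followed by a combining macron (U+0304).
  TIntV src; src.Add(0x00c8); src.Add(0x0304);

  // Canonical decomposition into canonical order (NFD); the notes give 0045 0300 0304.
  TIntV nfd; db.Decompose(src, 0, src.Len(), nfd, /*compatibility=*/false, /*clrDest=*/true);

  // Canonical decomposition followed by canonical composition (NFC); the notes give 00c8 0304.
  TIntV nfc; db.DecomposeAndCompose(src, 0, src.Len(), nfc, /*compatibility=*/false, /*clrDest=*/true);

  // Simple (one-character) lowercase mapping of the NFC string.
  TIntV lower; db.GetSimpleCaseConverted(nfc, 0, nfc.Len(), lower, /*clrDest=*/true, TUniChDb::ccLower);

  // In-place simple titlecasing of the original buffer.
  db.ToSimpleCaseConverted(src, 0, src.Len(), TUniChDb::ccTitle);

  // Keep only the starters (combining class 0); for 'nfd' this leaves just 0045.
  TIntV starters;
  size_t nStarters = db.ExtractStarters(nfd, 0, nfd.Len(), starters, /*clrDest=*/true);
  (void) nStarters;
}

For compatibility normalization (NFKD/NFKC), the same Decompose and DecomposeAndCompose calls would be made with compatibility = true.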