SNAP Library, Developer Reference
2013-01-07 14:03:36
SNAP, a general purpose, high performance system for analysis and manipulation of large networks
#include "bd.h"

//#ifndef unicode_h
//#define unicode_h

// Includes
//#include "base.h"
#include <new>

typedef int TUniVecIdx;

//-----------------------------------------------------------------------------
// TUniCodec -- a UTF-8 and UTF-16 Encoder/Decoder
//-----------------------------------------------------------------------------

// Error handling modes for the TUniCodec class.
typedef enum TUnicodeErrorHandling_
{
  // What happens when an error occurs:
  uehIgnore = 0,  // - it is silently ignored (nothing is added to the output vector)
  uehThrow = 1,   // - an exception is thrown (TUnicodeException)
  uehReplace = 2, // - the replacement character is added to the output vector
  uehAbort = 3    // - the encoding/decoding process stops immediately
}
TUnicodeErrorHandling;

class TUnicodeException
{
public:
  TStr message;   // error message
  size_t srcIdx;  // the position in the source vector where the error occurred
  int srcChar;    // the source character at the position srcIdx
  TUnicodeException(size_t srcIdx_, int srcChar_, const TStr& message_) :
    message(message_), srcIdx(srcIdx_), srcChar(srcChar_) { }
};

typedef enum TUniByteOrder_
{
  boMachineEndian = 0,
  boLittleEndian = 1,
  boBigEndian = 2
}
TUniByteOrder;

typedef enum TUtf16BomHandling_
{
  bomAllowed = 0,  // if a BOM is present, it is used to determine the byte order; otherwise, the default byte order is used
  bomRequired = 1, // if a BOM is present, it is used to determine the byte order; otherwise, an error is reported
  bomIgnored = 2   // the default byte order is used; if a BOM is present, it is treated like any other character
}
TUtf16BomHandling;

class TUniCodec
{
public:
  // 0xfffd is defined as the replacement character by the Unicode standard.
  // By default, it is rendered as a question mark inside a diamond: "<?>".
  enum { DefaultReplacementChar = 0xfffd };

  // The replacement character is inserted into the destination vector
  // if an error occurs in the source vector. By default, this is set
  // to DefaultReplacementChar.
  int replacementChar;
  // The error handling mode.
  TUnicodeErrorHandling errorHandling;
  // There are a number of situations where there is, strictly speaking, an error in
  // the source data although it can still be decoded in a reasonably meaningful way.
  // If strict == true, these situations are treated as errors. Examples:
  // - when decoding UTF-8:
  //   - a codepoint represented by more bytes than necessary (e.g. one of the characters 0..127
  //     encoded as a two-byte sequence)
  //   - a codepoint > 0x10ffff
  // - when decoding UTF-16:
  //   - a codepoint from the range reserved for the second character of a surrogate pair
  //     is not preceded by a codepoint from the range reserved for the first character of a surrogate pair
  // - when encoding UTF-8:
  //   - a codepoint > 0x10ffff
  // - when encoding UTF-16:
  //   - a codepoint from the range reserved for the second character of a surrogate pair
  //     [note that a codepoint > 0x10ffff, or from the range reserved for the first character of a
  //     surrogate pair, is always an error, even with strict == false]
  bool strict;
  // skipBom == true means: If a byte-order mark (0xfffe or 0xfeff) occurs at the beginning
  // of the source vector, it is skipped (when decoding).
  // - Note: a BOM is not really useful in UTF-8 encoded data. However, the .NET UTF8Encoding
  //   emits 0xfeff by default as a kind of preamble. It gets encoded as 3 bytes, ef bb bf,
  //   and can be helpful to make the data easier to recognize as UTF-8 encoded data.
  bool skipBom;

  TUniCodec() : replacementChar(DefaultReplacementChar), errorHandling(uehIgnore), strict(false), skipBom(true)
  {
  }

  TUniCodec(TUnicodeErrorHandling errorHandling_, bool strict_, int replacementChar_, bool skipBom_) :
    replacementChar(replacementChar_), errorHandling(errorHandling_), strict(strict_), skipBom(skipBom_)
  {
  }

protected:
  enum {
#define DefineByte(b7, b6, b5, b4, b3, b2, b1, b0) _ ## b7 ## b6 ## b5 ## b4 ## _ ## b3 ## b2 ## b1 ## b0 = (b7 << 7) | (b6 << 6) | (b5 << 5) | (b4 << 4) | (b3 << 3) | (b2 << 2) | (b1 << 1) | b0
    DefineByte(1, 0, 0, 0, 0, 0, 0, 0),
    DefineByte(1, 1, 0, 0, 0, 0, 0, 0),
    DefineByte(1, 1, 1, 0, 0, 0, 0, 0),
    DefineByte(1, 1, 1, 1, 0, 0, 0, 0),
    DefineByte(1, 1, 1, 1, 1, 0, 0, 0),
    DefineByte(1, 1, 1, 1, 1, 1, 0, 0),
    DefineByte(1, 1, 1, 1, 1, 1, 1, 0),
    DefineByte(0, 0, 1, 1, 1, 1, 1, 1),
    DefineByte(0, 0, 0, 1, 1, 1, 1, 1),
    DefineByte(0, 0, 0, 0, 1, 1, 1, 1),
    DefineByte(0, 0, 0, 0, 0, 1, 1, 1),
    DefineByte(0, 0, 0, 0, 0, 0, 1, 1)
#undef DefineByte
  };

  typedef TUniVecIdx TVecIdx;
  //friend class TUniChDb;
  friend class TUniCaseFolding;

public:

  //-----------------------------------------------------------------------
  // UTF-8
  //-----------------------------------------------------------------------

  // Returns the number of characters that have been successfully decoded.
  // This does not include any replacement characters that may have been inserted into 'dest'.
  template<typename TSrcVec, typename TDestCh>
  size_t DecodeUtf8(
    const TSrcVec& src, size_t srcIdx, const size_t srcCount,
    TVec<TDestCh>& dest, const bool clrDest = true) const;
  template<typename TSrcVec, typename TDestCh>
  size_t DecodeUtf8(const TSrcVec& src, TVec<TDestCh>& dest, const bool clrDest = true) const { return DecodeUtf8(src, 0, src.Len(), dest, clrDest); }

  // Returns the number of characters that have been successfully encoded.
  // This does not include any replacement characters that may have been inserted into 'dest'.
  template<typename TSrcVec, typename TDestCh>
  size_t EncodeUtf8(
    const TSrcVec& src, size_t srcIdx, const size_t srcCount,
    TVec<TDestCh>& dest, const bool clrDest = true) const;
  template<typename TSrcVec, typename TDestCh>
  size_t EncodeUtf8(const TSrcVec& src, TVec<TDestCh>& dest, const bool clrDest = true) const { return EncodeUtf8(src, 0, src.Len(), dest, clrDest); }

  // The following wrappers around the UTF-8 encoder return a TStr containing
  // the UTF-8-encoded version of the input string.
  template<typename TSrcVec> TStr EncodeUtf8Str(const TSrcVec& src, size_t srcIdx, const size_t srcCount) const { TVec<char> temp; EncodeUtf8(src, srcIdx, srcCount, temp); temp.Add(0); TStr retVal = &(temp[0]); return retVal; }
  template<typename TSrcVec> TStr EncodeUtf8Str(const TSrcVec& src) const { TVec<char> temp; EncodeUtf8(src, temp); temp.Add(0); TStr retVal = &(temp[0]); return retVal; }

  //-----------------------------------------------------------------------
  // UTF-16 Decoder
  //-----------------------------------------------------------------------

protected:
  enum {
    Utf16FirstSurrogate = 0xd800,
    Utf16SecondSurrogate = 0xdc00
  };

  static bool IsMachineLittleEndian();

public:

  // Returns the number of characters that have been successfully decoded.
  // This does not include any replacement characters that may have been inserted into 'dest'.
  // Each element of 'src' is assumed to contain one byte of data.
  // srcCount must be even (though srcIdx doesn't need to be).
  template<typename TSrcVec, typename TDestCh>
  size_t DecodeUtf16FromBytes(
    const TSrcVec& src, size_t srcIdx, const size_t srcCount,
    TVec<TDestCh>& dest, const bool clrDest,
    const TUtf16BomHandling bomHandling = bomAllowed,
    const TUniByteOrder defaultByteOrder = boMachineEndian) const;

  // Here, each element of 'src' is treated as a 16-bit word. The byte-order settings
  // are used to determine if the two bytes of each word should be swapped before further
  // processing. For example, if a BOM is present, it must have the value 0xfeff; if it
  // actually has the value 0xfffe, this means that the two bytes of each word must be swapped.
  // Basically, the combination of the byteOrder parameter and the byte-order mark (if present) at the
  // beginning of the source data is used to determine the "original" byte order of the data;
  // if this doesn't match the byte order of the local machine, the two bytes of each word will
  // be swapped during the decoding process.
  template<typename TSrcVec, typename TDestCh>
  size_t DecodeUtf16FromWords(
    const TSrcVec& src, size_t srcIdx, const size_t srcCount,
    TVec<TDestCh>& dest, bool clrDest,
    const TUtf16BomHandling bomHandling = bomAllowed,
    const TUniByteOrder defaultByteOrder = boMachineEndian) const;

  //-----------------------------------------------------------------------
  // UTF-16 Encoder
  //-----------------------------------------------------------------------

  // Returns the number of characters that have been successfully encoded.
  // This does not include any replacement characters that may have been inserted into 'dest'.
  //
  // Notes:
  // - UTF-16 *cannot* encode characters above 0x10ffff, so their presence is always
  //   treated as an error, regardless of the value of 'strict'.
  // - Characters from the range Utf16FirstSurrogate through Utf16FirstSurrogate + 1023
  //   cannot be encoded by UTF-16 either, as they would be misinterpreted during decoding
  //   as the first character of a surrogate pair.
  // - Characters from the range Utf16SecondSurrogate through Utf16SecondSurrogate + 1023
  //   can be encoded in principle; however, if strict == true, they are treated as errors.
  template<typename TSrcVec, typename TDestCh>
  size_t EncodeUtf16ToWords(
    const TSrcVec& src, size_t srcIdx, const size_t srcCount,
    TVec<TDestCh>& dest, const bool clrDest, const bool insertBom,
    const TUniByteOrder destByteOrder = boMachineEndian) const;

  template<typename TSrcVec, typename TDestCh>
  size_t EncodeUtf16ToBytes(
    const TSrcVec& src, size_t srcIdx, const size_t srcCount,
    TVec<TDestCh>& dest, const bool clrDest, const bool insertBom,
    const TUniByteOrder destByteOrder = boMachineEndian) const;

  //-----------------------------------------------------------------------
  // Helper declarations for the test drivers
  //-----------------------------------------------------------------------

protected:

  static uint GetRndUint(TRnd& rnd);
  static uint GetRndUint(TRnd& rnd, uint minVal, uint maxVal);

  //-----------------------------------------------------------------------
  // UTF-8 Test Driver
  //-----------------------------------------------------------------------

protected:
  void TestUtf8(bool decode, size_t expectedRetVal, bool expectedThrow, const TIntV& src, const TIntV& expectedDest, FILE *f);
  // Generates a random UTF-8-encoded stream according to the specifications in 'testCaseDesc',
  // then calls TestUtf8 to make sure that DecodeUtf8 reacts as expected.
  void TestDecodeUtf8(TRnd& rnd, const TStr& testCaseDesc);
public:
  void TestUtf8();

  //-----------------------------------------------------------------------
  // UTF-16 Test Driver
  //-----------------------------------------------------------------------

protected:
  void WordsToBytes(const TIntV& src, TIntV& dest);
  void TestUtf16(bool decode, size_t expectedRetVal, bool expectedThrow, const TIntV& src, const TIntV& expectedDest,
    // Note: insertBom is only used with the encoder. When encoding, 'defaultByteOrder' is used as the destination byte order.
    const TUtf16BomHandling bomHandling, const TUniByteOrder defaultByteOrder, const bool insertBom,
    FILE *f);
  static inline int SwapBytes(int x) {
    return ((x >> 8) & 0xff) | ((x & 0xff) << 8); }
  // Generates a random UTF-16-encoded stream according to the specifications in 'testCaseDesc',
  // then calls TestUtf16 to make sure that DecodeUtf16 reacts as expected.
  void TestDecodeUtf16(TRnd& rnd, const TStr& testCaseDesc,
    const TUtf16BomHandling bomHandling,
    const TUniByteOrder defaultByteOrder,
    const bool insertBom);
public:
  void TestUtf16();

};
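Usage sketch for the TUniCodec interface declared above. This is illustrative only and not part of the header; it assumes the GLib base header is on the include path, and the sample data and function names are made up for the example.

#include "base.h"

// Decode a UTF-8 byte sequence into codepoints and re-encode it.
void Utf8RoundTripSketch() {
  TUniCodec codec;                                   // defaults: uehIgnore, strict = false, skipBom = true
  TIntV utf8Bytes;                                   // UTF-8 bytes for "naive" with i-diaeresis: 6e 61 c3 af 76 65
  utf8Bytes.Add(0x6e); utf8Bytes.Add(0x61); utf8Bytes.Add(0xc3);
  utf8Bytes.Add(0xaf); utf8Bytes.Add(0x76); utf8Bytes.Add(0x65);
  TIntV codepoints;
  codec.DecodeUtf8(utf8Bytes, codepoints);           // expect 5 codepoints; c3 af decodes to U+00EF
  TStr reencoded = codec.EncodeUtf8Str(codepoints);  // back to the original byte sequence, as a TStr
}

// Decode little-endian UTF-16 bytes (with a BOM) into codepoints.
void Utf16DecodeSketch(const TUniCodec& codec) {
  TIntV bytes;                                       // ff fe | 41 00 | 3d d8 3d de  (BOM, 'A', U+1F63D as a surrogate pair)
  bytes.Add(0xff); bytes.Add(0xfe); bytes.Add(0x41); bytes.Add(0x00);
  bytes.Add(0x3d); bytes.Add(0xd8); bytes.Add(0x3d); bytes.Add(0xde);
  TIntV codepoints;
  codec.DecodeUtf16FromBytes(bytes, 0, bytes.Len(), codepoints, /*clrDest=*/true,
                             bomAllowed, boLittleEndian);
  // With the default skipBom == true, the BOM is consumed and codepoints should hold { 0x41, 0x1f63d }.
}

// Report decoding errors through exceptions instead of silently ignoring them.
void Utf8StrictDecodeSketch() {
  TUniCodec codec(uehThrow, /*strict=*/true, TUniCodec::DefaultReplacementChar, /*skipBom=*/true);
  TIntV badUtf8; badUtf8.Add(0xc0); badUtf8.Add(0xaf);  // over-long (hence invalid) encoding of '/'
  TIntV codepoints;
  try { codec.DecodeUtf8(badUtf8, codepoints); }
  catch (const TUnicodeException& e) {
    printf("decode error at %d: %s\n", int(e.srcIdx), e.message.CStr());
  }
  // With uehReplace, the output would receive codec.replacementChar instead;
  // with uehIgnore (the default), the offending sequence would simply be skipped.
}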

//-----------------------------------------------------------------------------
// Case folding
//-----------------------------------------------------------------------------
// Note: there's no need to access this class directly.
// Use TUniChDb::GetCaseFolded() instead.

typedef THash<TInt, TIntV> TIntIntVH;

class TUniCaseFolding
{
protected:
  TIntH cfCommon, cfSimple, cfTurkic;
  TIntIntVH cfFull;

  template<typename TSrcDat, typename TDestDat>
  inline static void AppendVector(const TVec<TSrcDat>& src, TVec<TDestDat>& dest) {
    for (int i = 0; i < src.Len(); i++) dest.Add(src[i]); }
  friend class TUniChDb;
  typedef TUniVecIdx TVecIdx;

public:
  TUniCaseFolding() { }
  explicit TUniCaseFolding(TSIn& SIn) : cfCommon(SIn), cfSimple(SIn), cfTurkic(SIn), cfFull(SIn) { SIn.LoadCs(); }
  void Load(TSIn& SIn) { cfCommon.Load(SIn); cfSimple.Load(SIn); cfFull.Load(SIn); cfTurkic.Load(SIn); SIn.LoadCs(); }
  void Save(TSOut& SOut) const { cfCommon.Save(SOut); cfSimple.Save(SOut); cfFull.Save(SOut); cfTurkic.Save(SOut); SOut.SaveCs(); }
  void Clr() { cfCommon.Clr(); cfSimple.Clr(); cfFull.Clr(); cfTurkic.Clr(); }
  void LoadTxt(const TStr& fileName);

  // Use 'turkic' when processing text in a Turkic language (tr, az). This only affects the uppercase I and I-with-dot-above.
  template<typename TSrcVec, typename TDestCh>
  void Fold(const TSrcVec& src, size_t srcIdx, const size_t srcCount,
    TVec<TDestCh>& dest, const bool clrDest, const bool full, const bool turkic) const
  {
    for (const size_t srcEnd = srcIdx + srcCount; srcIdx < srcEnd; )
    {
      int c = src[TVecIdx(srcIdx)], i; srcIdx++;
      if (turkic && ((i = cfTurkic.GetKeyId(c)) >= 0)) { dest.Add(cfTurkic[i]); continue; }
      if (full && ((i = cfFull.GetKeyId(c)) >= 0)) { AppendVector(cfFull[i], dest); continue; }
      if ((!full) && ((i = cfSimple.GetKeyId(c)) >= 0)) { dest.Add(cfSimple[i]); continue; }
      i = cfCommon.GetKeyId(c); if (i >= 0) dest.Add(cfCommon[i]); else dest.Add(c);
    }
  }

  template<typename TSrcVec>
  void FoldInPlace(TSrcVec& src, size_t srcIdx, const size_t srcCount, const bool turkic) const
  {
    for (const size_t srcEnd = srcIdx + srcCount; srcIdx < srcEnd; srcIdx++)
    {
      int c = src[TVecIdx(srcIdx)], i;
      if (turkic && ((i = cfTurkic.GetKeyId(c)) >= 0)) { src[TVecIdx(srcIdx)] = cfTurkic[i]; continue; }
      if ((i = cfSimple.GetKeyId(c)) >= 0) { src[TVecIdx(srcIdx)] = cfSimple[i]; continue; }
      i = cfCommon.GetKeyId(c); if (i >= 0) src[TVecIdx(srcIdx)] = cfCommon[i];
    }
  }

protected:
  void Test(const TIntV& src, const TIntV& expectedDest, const bool full, const bool turkic, FILE *f);
public:
  void Test();
};
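A minimal sketch of calling Fold directly (illustrative only; as the note above says, TUniChDb::GetCaseFolded() is the intended entry point, and the mapping tables must first be populated, presumably from the UCD CaseFolding.txt data via LoadTxt or from a saved binary via Load):

// Example sketch: fold a vector of codepoints.
void CaseFoldSketch(const TUniCaseFolding& folding, const TIntV& codepoints) {
  TIntV folded;
  // Lookup order inside Fold: the Turkic table (if 'turkic'), then the full or simple table,
  // then the common table; codepoints without any mapping are copied through unchanged.
  folding.Fold(codepoints, 0, codepoints.Len(), folded, /*clrDest=*/true, /*full=*/true, /*turkic=*/false);
}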

//-----------------------------------------------------------------------------
// TCodecBase -- an abstract base class for codecs
//-----------------------------------------------------------------------------

class TCodecBase;
typedef TPt<TCodecBase> PCodecBase;
typedef TVec<PCodecBase> TCodecBaseV;

class TCodecBase
{
protected:
  TCRef CRef;
  friend class TPt<TCodecBase>;
public:
  virtual ~TCodecBase() { }

  template<class TCodecImpl>
  static PCodecBase New(); /* {
    return new TCodecWrapper<TCodecImpl>(); } */

  virtual TStr GetName() const = 0;
  virtual void Test() const { }

  // Returns the number of characters that have been successfully decoded.
  // This does not include any replacement characters that may have been inserted into 'dest'.
  virtual size_t ToUnicode(const TIntV& src, size_t srcIdx, const size_t srcCount, TIntV& dest, const bool clrDest = true) const = 0;
  virtual size_t ToUnicode(const TStr& src, size_t srcIdx, const size_t srcCount, TIntV& dest, const bool clrDest = true) const = 0;

  size_t ToUnicode(const TIntV& src, TIntV& dest, const bool clrDest = true) const { return ToUnicode(src, 0, src.Len(), dest, clrDest); }
  size_t ToUnicode(const TStr& src, TIntV& dest, const bool clrDest = true) const { return ToUnicode(src, 0, src.Len(), dest, clrDest); }

  // Returns the number of characters that have been successfully encoded.
  // This does not include any replacement characters that may have been inserted into 'dest'.
  virtual size_t FromUnicode(const TIntV& src, size_t srcIdx, const size_t srcCount, TIntV& dest, const bool clrDest = true) const = 0;
  virtual size_t FromUnicode(const TIntV& src, size_t srcIdx, const size_t srcCount, TChA& dest, const bool clrDest = true) const = 0;
  virtual size_t FromUnicode(const TIntV& src, size_t srcIdx, const size_t srcCount, TStr& dest, const bool clrDest = true) const = 0;

  size_t FromUnicode(const TIntV& src, TIntV& dest, const bool clrDest = true) const { return FromUnicode(src, 0, src.Len(), dest, clrDest); }
  size_t FromUnicode(const TIntV& src, TChA& dest, const bool clrDest = true) const { return FromUnicode(src, 0, src.Len(), dest, clrDest); }
  size_t FromUnicode(const TIntV& src, TStr& dest, const bool clrDest = true) const { return FromUnicode(src, 0, src.Len(), dest, clrDest); }
};

//-----------------------------------------------------------------------------
// TCodecWrapper -- a descendant of TCodecBase; relies on a template
// parameter class for the actual implementation of the codec.
//-----------------------------------------------------------------------------
// Thus, if you know in advance that you'll need ISO-8859-2, just use
// T8BitCodec<TEncoding_ISO8859_2>. If you don't know the encoding
// in advance, use a PCodecBase pointing to a suitable specialization
// of TCodecWrapper<...>. You can use TUnicode::GetCodec(TStr& name)
// to obtain a suitable pointer.

template<class TCodecImpl_>
class TCodecWrapper : public TCodecBase
{
public:
  typedef TCodecImpl_ TCodecImpl;
  TCodecImpl impl;
public:

  virtual TStr GetName() const { return impl.GetName(); }

  virtual void Test() const { impl.Test(); }

  virtual size_t ToUnicode(const TIntV& src, size_t srcIdx, const size_t srcCount, TIntV& dest, const bool clrDest = true) const {
    return impl.ToUnicode(src, srcIdx, srcCount, dest, clrDest); }
  virtual size_t ToUnicode(const TStr& src, size_t srcIdx, const size_t srcCount, TIntV& dest, const bool clrDest = true) const {
    return impl.ToUnicode(src, srcIdx, srcCount, dest, clrDest); }

  virtual size_t FromUnicode(const TIntV& src, size_t srcIdx, const size_t srcCount, TIntV& dest, const bool clrDest = true) const {
    return impl.FromUnicode(src, srcIdx, srcCount, dest, clrDest); }
  virtual size_t FromUnicode(const TIntV& src, size_t srcIdx, const size_t srcCount, TChA& dest, const bool clrDest = true) const {
    return impl.FromUnicode(src, srcIdx, srcCount, dest, clrDest); }
  virtual size_t FromUnicode(const TIntV& src, size_t srcIdx, const size_t srcCount, TStr& dest, const bool clrDest = true) const {
    TChA buf; size_t retVal = impl.FromUnicode(src, srcIdx, srcCount, buf, false);
    if (clrDest) dest = buf.CStr(); else dest += buf.CStr();
    return retVal; }
};

template<class TCodecImpl>
PCodecBase TCodecBase::New() {
  return new TCodecWrapper<TCodecImpl>();
}
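Illustrative sketch of choosing a codec at run time through the reference-counted PCodecBase interface. The concrete typedefs such as TCodec_ISO8859_2 are declared further below in this header; the function and its arguments are assumptions made for the example.

// Example sketch: hide the concrete encoding behind the abstract TCodecBase interface.
void RuntimeCodecSketch(const TStr& latin2Text) {
  PCodecBase codec = TCodecBase::New<TCodec_ISO8859_2>();  // wraps T8BitCodec<TEncoding_ISO8859_2>
  TIntV codepoints;
  codec->ToUnicode(latin2Text, codepoints);      // 8-bit input -> Unicode codepoints
  TStr roundTripped;
  codec->FromUnicode(codepoints, roundTripped);  // and back again
}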

//-----------------------------------------------------------------------------
// TVecElt -- a template for determining the type of a vector's elements
//-----------------------------------------------------------------------------

template<class TVector_>
class TVecElt
{
};

template<class TDat>
class TVecElt<TVec<TDat> >
{
public:
  typedef TVec<TDat> TVector;
  typedef TDat TElement;
  static inline void Add(TVector& vector, const TElement& element) { vector.Add(element); }
};

template<>
class TVecElt<TChA>
{
public:
  typedef TChA TVector;
  typedef char TElement;
  static inline void Add(TVector& vector, const TElement& element) { vector += element; }
};


//-----------------------------------------------------------------------------
// T8BitCodec -- a class for converting between 8-bit encodings and Unicode
//-----------------------------------------------------------------------------

class TEncoding_ISO8859_1
{
public:
  static inline TStr GetName() { return "ISO-8859-1"; }
  static int ToUnicode(int c) { Assert(0 <= c && c <= 255); return c; }
  static int FromUnicode(int c) { if (0 <= c && c <= 255) return c; else return -1; }
};

class TEncoding_ISO8859_2 // ISO Latin 2
{
public:
  static inline TStr GetName() { return "ISO-8859-2"; }
  static const int toUnicodeTable[6 * 16], fromUnicodeTable1[14 * 16], fromUnicodeTable2[2 * 16];
  static int ToUnicode(int c) { Assert(0 <= c && c <= 255);
    if (c < 0xa0) return c; else return toUnicodeTable[c - 0xa0]; }
  static int FromUnicode(int c) {
    if (0 <= c && c < 0xa0) return c;
    else if (0xa0 <= c && c < 0x180) return fromUnicodeTable1[c - 0xa0];
    else if (0x2c0 <= c && c < 0x2e0) return fromUnicodeTable2[c - 0x2c0];
    else return -1; }
};

class TEncoding_ISO8859_3
{
public:
  static inline TStr GetName() { return "ISO-8859-3"; }
  static const int toUnicodeTable[6 * 16], fromUnicodeTable1[14 * 16], fromUnicodeTable2[2];
  static int ToUnicode(int c) { Assert(0 <= c && c <= 255);
    if (c < 0xa0) return c; else return toUnicodeTable[c - 0xa0]; }
  static int FromUnicode(int c) {
    if (0 <= c && c < 0xa0) return c;
    else if (0xa0 <= c && c < 0x180) return fromUnicodeTable1[c - 0xa0];
    else if (0x2d8 <= c && c < 0x2da) return fromUnicodeTable2[c - 0x2d8];
    else return -1; }
};

class TEncoding_ISO8859_4
{
public:
  static inline TStr GetName() { return "ISO-8859-4"; }
  static const int toUnicodeTable[6 * 16], fromUnicodeTable1[14 * 16], fromUnicodeTable2[2 * 16];
  static int ToUnicode(int c) { Assert(0 <= c && c <= 255);
    if (c < 0xa0) return c; else return toUnicodeTable[c - 0xa0]; }
  static int FromUnicode(int c) {
    if (0 <= c && c < 0xa0) return c;
    else if (0xa0 <= c && c < 0x180) return fromUnicodeTable1[c - 0xa0];
    else if (0x2c0 <= c && c < 0x2e0) return fromUnicodeTable2[c - 0x2c0];
    else return -1; }
};

class TEncoding_YuAscii
{
public:
  static const int uniChars[10], yuAsciiChars[10];
  static inline TStr GetName() { return "YU-ASCII"; }
  static int ToUnicode(int c) { Assert(0 <= c && c <= 255);
    for (int i = 0; i < int(sizeof(yuAsciiChars) / sizeof(yuAsciiChars[0])); i++)
      if (c == yuAsciiChars[i]) return uniChars[i];
    return c; }
  static int FromUnicode(int c) {
    for (int i = 0; i < int(sizeof(uniChars) / sizeof(uniChars[0])); i++)
      if (c == uniChars[i]) return yuAsciiChars[i];
      else if (c == yuAsciiChars[i]) return -1;
    if (0 <= c && c <= 255) return c; else return -1; }
};
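The TEncoding_* classes above all follow the same implicit protocol: a static GetName(), a static ToUnicode(int) mapping a byte value to a codepoint, and a static FromUnicode(int) returning -1 for codepoints the encoding cannot represent. A class written to that protocol can be plugged into T8BitCodec (declared below); the following encoding is hypothetical and only shown for illustration (T8BitCodec::Test() skips byte values whose ToUnicode is -1):

// Hypothetical example: plain US-ASCII, which rejects byte values >= 0x80.
class TEncoding_UsAscii
{
public:
  static inline TStr GetName() { return "US-ASCII"; }
  static int ToUnicode(int c) { Assert(0 <= c && c <= 255);
    return (c < 0x80) ? c : -1; }              // -1: not a defined US-ASCII byte
  static int FromUnicode(int c) {
    return (0 <= c && c < 0x80) ? c : -1; }    // -1: not representable in US-ASCII
};
// typedef T8BitCodec<TEncoding_UsAscii> TCodec_UsAscii;  // would be usable like the typedefs below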

class TEncoding_CP437 // DOS US
{
public:
  static inline TStr GetName() { return "CP437"; }
  static const int toUnicodeTable[8 * 16], fromUnicodeTable1[6 * 16], fromUnicodeTable2[4 * 16], fromUnicodeTable3[6 * 16], fromUnicodeTable4[11 * 16];
  static int ToUnicode(int c) { Assert(0 <= c && c <= 255);
    if (c < 0x80) return c; else return toUnicodeTable[c - 0x80]; }
  static int FromUnicode(int c) {
    if (0 <= c && c < 0x80) return c;
    else if (0xa0 <= c && c < 0x100) return fromUnicodeTable1[c - 0xa0];
    else if (0x390 <= c && c < 0x3d0) return fromUnicodeTable2[c - 0x390];
    else if (0x2210 <= c && c < 0x2270) return fromUnicodeTable3[c - 0x2210];
    else if (0x2500 <= c && c < 0x25b0) return fromUnicodeTable4[c - 0x2500];
    else if (c == 0x192) return 0x9f;
    else if (c == 0x207f) return 0xfc;
    else if (c == 0x20a7) return 0x9e;
    else if (c == 0x2310) return 0xa9;
    else if (c == 0x2320) return 0xf4;
    else if (c == 0x2321) return 0xf5;
    else return -1; }
};

class TEncoding_CP852 // DOS Latin 2
{
public:
  static inline TStr GetName() { return "CP852"; }
  static const int toUnicodeTable[8 * 16], fromUnicodeTable1[14 * 16], fromUnicodeTable2[2 * 16], fromUnicodeTable3[11 * 16];
  static int ToUnicode(int c) { Assert(0 <= c && c <= 255);
    if (c < 0x80) return c; else return toUnicodeTable[c - 0x80]; }
  static int FromUnicode(int c) {
    if (0 <= c && c < 0x80) return c;
    else if (0xa0 <= c && c < 0x180) return fromUnicodeTable1[c - 0xa0];
    else if (0x2c0 <= c && c < 0x2e0) return fromUnicodeTable2[c - 0x2c0];
    else if (0x2500 <= c && c < 0x25b0) return fromUnicodeTable3[c - 0x2500];
    else return -1; }
};

class TEncoding_CP1250 // Windows-1250, similar to ISO Latin 2
{
public:
  static inline TStr GetName() { return "CP1250"; }
  static const int toUnicodeTable[8 * 16], fromUnicodeTable1[14 * 16], fromUnicodeTable2[2 * 16], fromUnicodeTable3[3 * 16];
  static int ToUnicode(int c) { Assert(0 <= c && c <= 255);
    if (c < 0x80) return c; else return toUnicodeTable[c - 0x80]; }
  static int FromUnicode(int c) {
    if (0 <= c && c < 0x80) return c;
    else if (0xa0 <= c && c < 0x180) return fromUnicodeTable1[c - 0xa0];
    else if (0x2c0 <= c && c < 0x2e0) return fromUnicodeTable2[c - 0x2c0];
    else if (0x2010 <= c && c < 0x2040) return fromUnicodeTable3[c - 0x2010];
    else if (c == 0x20ac) return 0x80;
    else if (c == 0x2122) return 0x99;
    else return -1; }
};

template<class TEncoding_>
class T8BitCodec
{
protected:
  typedef TUniVecIdx TVecIdx;
public:
  typedef TEncoding_ TEncoding;
  TUnicodeErrorHandling errorHandling;
  int replacementChar;

  T8BitCodec() : errorHandling(uehIgnore), replacementChar(TUniCodec::DefaultReplacementChar) { }
  T8BitCodec(TUnicodeErrorHandling errorHandling_, int replacementChar_ = TUniCodec::DefaultReplacementChar) :
    errorHandling(errorHandling_), replacementChar(replacementChar_) { }
  static TStr GetName() { return TEncoding::GetName(); }

  void Test() const
  {
    int nDecoded = 0;
    for (int c = 0; c <= 255; c++) {
      int cu = TEncoding::ToUnicode(c); if (cu == -1) continue;
      nDecoded++;
      IAssert(0 <= cu && cu < 0x110000);
      int c2 = TEncoding::FromUnicode(cu);
      IAssert(c2 == c); }
    int nEncoded = 0;
    for (int cu = 0; cu < 0x110000; cu++) {
      int c = TEncoding::FromUnicode(cu); if (c == -1) continue;
      nEncoded++;
      IAssert(0 <= c && c <= 255);
      int cu2 = TEncoding::ToUnicode(c);
      IAssert(cu2 == cu); }
    IAssert(nDecoded == nEncoded);
  }

  // Returns the number of characters that have been successfully decoded.
  // This does not include any replacement characters that may have been inserted into 'dest'.
  template<typename TSrcVec, typename TDestCh>
  size_t ToUnicode(
    const TSrcVec& src, size_t srcIdx, const size_t srcCount,
    TVec<TDestCh>& dest, const bool clrDest = true) const
  {
    if (clrDest) dest.Clr();
    size_t toDo = srcCount;
    while (toDo-- > 0) {
      int chSrc = ((int) src[TVecIdx(srcIdx)]) & 0xff; srcIdx++;
      int chDest = TEncoding::ToUnicode(chSrc);
      dest.Add(chDest); }
    return srcCount;
  }
  template<typename TSrcVec, typename TDestCh>
  size_t ToUnicode(const TSrcVec& src, TVec<TDestCh>& dest, const bool clrDest = true) const { return ToUnicode(src, 0, src.Len(), dest, clrDest); }

  size_t ToUnicode(const TIntV& src, TIntV& dest, const bool clrDest = true) const { return ToUnicode(src, 0, src.Len(), dest, clrDest); }
  size_t ToUnicode(const TStr& src, TIntV& dest, const bool clrDest = true) const { return ToUnicode(src, 0, src.Len(), dest, clrDest); }

  // Returns the number of characters that have been successfully encoded.
  // This does not include any replacement characters that may have been inserted into 'dest'.
  template<typename TSrcVec, typename TDestVec>
  size_t FromUnicode(
    const TSrcVec& src, size_t srcIdx, const size_t srcCount,
    TDestVec& dest, const bool clrDest = true) const
  {
    typedef typename TVecElt<TDestVec>::TElement TDestCh;
    if (clrDest) dest.Clr();
    size_t toDo = srcCount, nEncoded = 0;
    while (toDo-- > 0) {
      int chSrc = (int) src[TVecIdx(srcIdx)]; srcIdx++;
      int chDest = TEncoding::FromUnicode(chSrc);
      if (chDest < 0) {
        switch (errorHandling) {
        case uehThrow: throw TUnicodeException(srcIdx - 1, chSrc, "Invalid character for encoding into " + GetName() + ".");
        case uehAbort: return nEncoded;
        case uehReplace: TVecElt<TDestVec>::Add(dest, TDestCh(replacementChar)); continue;
        case uehIgnore: continue;
        default: Fail; } }
      TVecElt<TDestVec>::Add(dest, TDestCh(chDest)); nEncoded++; }
    return nEncoded;
  }

  template<typename TSrcVec, typename TDestVec>
  size_t FromUnicode(const TSrcVec& src, TDestVec& dest, const bool clrDest = true) const { return FromUnicode(src, 0, src.Len(), dest, clrDest); }

  size_t UniToStr(const TIntV& src, size_t srcIdx, const size_t srcCount, TStr& dest, const bool clrDest = true) const {
    TChA buf; size_t retVal = FromUnicode(src, srcIdx, srcCount, buf, false);
    if (clrDest) dest = buf.CStr(); else dest += buf.CStr();
    return retVal; }
  size_t UniToStr(const TIntV& src, TStr& dest, const bool clrDest = true) const { return UniToStr(src, 0, src.Len(), dest, clrDest); }
};

typedef T8BitCodec<TEncoding_ISO8859_1> TCodec_ISO8859_1;
typedef T8BitCodec<TEncoding_ISO8859_2> TCodec_ISO8859_2;
typedef T8BitCodec<TEncoding_ISO8859_3> TCodec_ISO8859_3;
typedef T8BitCodec<TEncoding_ISO8859_4> TCodec_ISO8859_4;
typedef T8BitCodec<TEncoding_CP852> TCodec_CP852;
typedef T8BitCodec<TEncoding_CP437> TCodec_CP437;
typedef T8BitCodec<TEncoding_CP1250> TCodec_CP1250;
typedef T8BitCodec<TEncoding_YuAscii> TCodec_YuAscii;
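Illustrative sketch of converting between an 8-bit string and Unicode codepoints with one of the concrete codec typedefs above (the sample text and function name are assumptions for the example):

// Example sketch: ISO-8859-2 text -> codepoints -> ISO-8859-2 text.
void Latin2Sketch(const TStr& latin2Text) {
  TCodec_ISO8859_2 codec(uehReplace);         // unmappable codepoints become the replacement character
  TIntV codepoints;
  codec.ToUnicode(latin2Text, codepoints);    // byte values -> Unicode codepoints
  TStr backAgain;
  codec.UniToStr(codepoints, backAgain);      // codepoints -> ISO-8859-2 bytes, returned as a TStr
}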

//-----------------------------------------------------------------------------
// Various declarations used by the Unicode Character Database
//-----------------------------------------------------------------------------

typedef enum TUniChCategory_
{
#define DefineUniCat(cat, c) uc ## cat = (int(uchar(c)) & 0xff)
  DefineUniCat(Letter, 'L'),       // ucLetter
  DefineUniCat(Mark, 'M'),
  DefineUniCat(Number, 'N'),
  DefineUniCat(Punctuation, 'P'),
  DefineUniCat(Symbol, 'S'),
  DefineUniCat(Separator, 'Z'),
  DefineUniCat(Other, 'C')
#undef DefineUniCat
}
TUniChCategory;

typedef enum TUniChSubCategory_
{
#define DefineUniSubCat(cat, subCat, c) uc ## cat ## subCat = ((uc ## cat) << 8) | (int(uchar(c)) & 0xff)
  DefineUniSubCat(Letter, Uppercase, 'u'),   // ucLetterUppercase
  DefineUniSubCat(Letter, Lowercase, 'l'),
  DefineUniSubCat(Letter, Titlecase, 't'),
  DefineUniSubCat(Letter, Modifier, 'm'),
  DefineUniSubCat(Letter, Other, 'o'),
  DefineUniSubCat(Mark, Nonspacing, 'n'),
  DefineUniSubCat(Mark, SpacingCombining, 'c'),
  DefineUniSubCat(Mark, Enclosing, 'e'),
  DefineUniSubCat(Number, DecimalDigit, 'd'),
  DefineUniSubCat(Number, Letter, 'l'),
  DefineUniSubCat(Number, Other, 'o'),
  DefineUniSubCat(Punctuation, Connector, 'c'),
  DefineUniSubCat(Punctuation, Dash, 'd'),
  DefineUniSubCat(Punctuation, Open, 's'),
  DefineUniSubCat(Punctuation, Close, 'e'),
  DefineUniSubCat(Punctuation, InitialQuote, 'i'),
  DefineUniSubCat(Punctuation, FinalQuote, 'f'),
  DefineUniSubCat(Punctuation, Other, 'o'),
  DefineUniSubCat(Symbol, Math, 'm'),
  DefineUniSubCat(Symbol, Currency, 'c'),
  DefineUniSubCat(Symbol, Modifier, 'k'),
  DefineUniSubCat(Symbol, Other, 'o'),
  DefineUniSubCat(Separator, Space, 's'),
  DefineUniSubCat(Separator, Line, 'l'),
  DefineUniSubCat(Separator, Paragraph, 'p'),
  DefineUniSubCat(Other, Control, 'c'),
  DefineUniSubCat(Other, Format, 'f'),
  DefineUniSubCat(Other, Surrogate, 's'),
  DefineUniSubCat(Other, PrivateUse, 'o'),
  DefineUniSubCat(Other, NotAssigned, 'n')
}
TUniChSubCategory;
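The two macros above pack the UnicodeData.txt general-category letters into integers: the major category is the ASCII code of its letter, and the subcategory value is that code shifted left by 8 bits, combined with the ASCII code of the second letter. A short worked example:

// Worked example of the packing (follows directly from the macro definitions above):
//   ucLetter          == int('L')              == 0x4c
//   ucLetterUppercase == (int('L') << 8) | 'u' == 0x4c75   // the pair "Lu" from UnicodeData.txt
//   ucSymbolCurrency  == (int('S') << 8) | 'c' == 0x5363   // the pair "Sc"
// TUniChInfo::InitAfterLoad() and SetCatAndSubCat() further below rely on exactly this layout
// (high byte = category letter, low byte = subcategory letter).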

typedef enum TUniChFlags_
{
  ucfCompatibilityDecomposition = 1,   // if this flag is not set, the decomposition is canonical
  ucfCompositionExclusion = 1 << 1,    // from CompositionExclusions.txt
  // Flags used when searching for word boundaries. See UAX #29.
  ucfWbFormat = 1 << 2,
  ucfWbKatakana = 1 << 3,
  ucfWbALetter = 1 << 4,
  ucfWbMidLetter = 1 << 5,
  ucfWbMidNum = 1 << 6,
  ucfWbNumeric = 1 << 7,
  ucfWbExtendNumLet = 1 << 8,
  // Flags used with sentence boundaries (Sep is also used with word boundaries). See UAX #29.
  ucfSbSep = 1 << 9,
  ucfSbFormat = 1 << 10,
  ucfSbSp = 1 << 11,
  ucfSbLower = 1 << 12,
  ucfSbUpper = 1 << 13,
  ucfSbOLetter = 1 << 14,
  ucfSbNumeric = 1 << 15,
  ucfSbATerm = 1 << 16,
  ucfSbSTerm = 1 << 17,
  ucfSbClose = 1 << 18,
  ucfSbMask = ucfSbSep | ucfSbFormat | ucfSbSp | ucfSbLower | ucfSbUpper | ucfSbOLetter | ucfSbNumeric | ucfSbATerm | ucfSbSTerm | ucfSbClose,
  ucfWbMask = ucfWbFormat | ucfWbKatakana | ucfWbALetter | ucfWbMidLetter | ucfWbMidNum | ucfWbNumeric | ucfWbExtendNumLet | ucfSbSep,
  // Flags from DerivedCoreProperties.txt.
  // [The comments are from UCD.html.]
  // - Characters with the Alphabetic property. For more information, see Chapter 4 in [Unicode].
  //   Generated from: Other_Alphabetic + Lu + Ll + Lt + Lm + Lo + Nl
  ucfDcpAlphabetic = 1 << 19,
  // - For programmatic determination of default-ignorable code points.
  //   New characters that should be ignored in processing (unless explicitly supported)
  //   will be assigned in these ranges, permitting programs to correctly handle the default
  //   behavior of such characters when not otherwise supported. For more information, see
  //   UAX #29: Text Boundaries [Breaks].
  //   Generated from: Other_Default_Ignorable_Code_Point + Cf + Cc + Cs + Noncharacters - White_Space - annotation characters
  //   [Examples: soft hyphen, zero-width space, noncharacters (e.g. U+fffe, U+ffff, U+1fffe, U+1ffff, etc.), surrogates, language tags, variation selectors]
  ucfDcpDefaultIgnorableCodePoint = 1 << 20,
  // - Characters with the Lowercase property. For more information, see Chapter 4 in [Unicode].
  //   Generated from: Other_Lowercase + Ll
  ucfDcpLowercase = 1 << 21,
  // - For programmatic determination of grapheme cluster boundaries.
  //   For more information, see UAX #29: Text Boundaries [Breaks].
  //   Generated from: [0..10FFFF] - Cc - Cf - Cs - Co - Cn - Zl - Zp - Grapheme_Extend
  ucfDcpGraphemeBase = 1 << 22,
  // - For programmatic determination of grapheme cluster boundaries.
  //   For more information, see UAX #29: Text Boundaries [Breaks].
  //   Generated from: Other_Grapheme_Extend + Me + Mn
  //   Note: depending on an application's interpretation of Co (private use), they may be either
  //   in Grapheme_Base, or in Grapheme_Extend, or in neither.
  ucfDcpGraphemeExtend = 1 << 23,
  // - Used to determine programming identifiers, as described in UAX #31: Identifier and Pattern Syntax.
  ucfDcpIdStart = 1 << 24,
  ucfDcpIdContinue = 1 << 25,
  // - Characters with the Math property. For more information, see Chapter 4 in [Unicode].
  //   Generated from: Sm + Other_Math
  ucfDcpMath = 1 << 26,
  // - Characters with the Uppercase property. For more information, see Chapter 4 in [Unicode].
  //   Generated from: Lu + Other_Uppercase
  ucfDcpUppercase = 1 << 27,
  // - Used to determine programming identifiers, as described in UAX #31: Identifier and Pattern Syntax.
  ucfDcpXidStart = 1 << 28,
  ucfDcpXidContinue = 1 << 29,
  ucfDcpMask = ucfDcpAlphabetic | ucfDcpDefaultIgnorableCodePoint | ucfDcpLowercase | ucfDcpGraphemeBase | ucfDcpGraphemeExtend |
    ucfDcpIdStart | ucfDcpIdContinue | ucfDcpMath | ucfDcpUppercase | ucfDcpXidStart | ucfDcpXidContinue,
}
TUniChFlags;

typedef enum TUniChProperties_
{
  // The flags from PropList.txt.
  // [The comments are from UCD.html.]
  // - ASCII characters commonly used for the representation of hexadecimal numbers.
  //   [= 0123456789abcdefABCDEF]
  ucfPrAsciiHexDigit = 1,
  // - Those format control characters which have specific functions in the Bidirectional Algorithm.
  ucfPrBidiControl = 2,
  // - Those punctuation characters explicitly called out as dashes in the Unicode Standard,
  //   plus compatibility equivalents to those. Most of these have the Pd General Category,
  //   but some have the Sm General Category because of their use in mathematics.
  //   U+0002d HYPHEN-MINUS
  //   U+0058a ARMENIAN HYPHEN
  //   U+005be HEBREW PUNCTUATION MAQAF
  //   U+01806 MONGOLIAN TODO SOFT HYPHEN
  //   U+02010 HYPHEN
  //   U+02011 NON-BREAKING HYPHEN
  //   U+02012 FIGURE DASH
  //   U+02013 EN DASH
  //   U+02014 EM DASH
  //   U+02015 HORIZONTAL BAR
  //   U+02053 SWUNG DASH
  //   U+0207b SUPERSCRIPT MINUS
  //   U+0208b SUBSCRIPT MINUS
  //   U+02212 MINUS SIGN
  //   U+02e17 DOUBLE OBLIQUE HYPHEN
  //   U+0301c WAVE DASH
  //   U+03030 WAVY DASH
  //   U+030a0 KATAKANA-HIRAGANA DOUBLE HYPHEN
  //   U+0fe31 PRESENTATION FORM FOR VERTICAL EM DASH
  //   U+0fe32 PRESENTATION FORM FOR VERTICAL EN DASH
  //   U+0fe58 SMALL EM DASH
  //   U+0fe63 SMALL HYPHEN-MINUS
  //   U+0ff0d FULLWIDTH HYPHEN-MINUS
  ucfPrDash = 4,
  // - For a machine-readable list of deprecated characters. No characters will ever be removed
  //   from the standard, but the usage of deprecated characters is strongly discouraged.
  ucfPrDeprecated = 8,
  // - Characters that linguistically modify the meaning of another character to which they apply.
  //   Some diacritics are not combining characters, and some combining characters are not diacritics.
  ucfPrDiacritic = 0x10,
  // - Characters whose principal function is to extend the value or shape of a preceding alphabetic
  //   character. Typical of these are length and iteration marks.
  ucfPrExtender = 0x20,
  // - Used in determining default grapheme cluster boundaries. For more information, see UAX #29: Text Boundaries.
  ucfPrGraphemeLink = 0x40,
  // - Characters commonly used for the representation of hexadecimal numbers, plus their compatibility equivalents.
  //   [= AsciiHexDigit + fullwidth digit {0..9} + fullwidth latin {small|capital} letter {a..f}]
  ucfPrHexDigit = 0x80,
  // - Those dashes used to mark connections between pieces of words, plus the Katakana middle dot.
  //   The Katakana middle dot functions like a hyphen, but is shaped like a dot rather than a dash.
  //   U+0002d HYPHEN-MINUS
  //   U+000ad SOFT HYPHEN
  //   U+0058a ARMENIAN HYPHEN
  //   U+01806 MONGOLIAN TODO SOFT HYPHEN
  //   U+02010 HYPHEN
  //   U+02011 NON-BREAKING HYPHEN
  //   U+02e17 DOUBLE OBLIQUE HYPHEN
  //   U+030fb KATAKANA MIDDLE DOT
  //   U+0fe63 SMALL HYPHEN-MINUS
  //   U+0ff0d FULLWIDTH HYPHEN-MINUS
  //   U+0ff65 HALFWIDTH KATAKANA MIDDLE DOT
  ucfPrHyphen = 0x100,
  // - Characters considered to be CJKV (Chinese, Japanese, Korean, and Vietnamese) ideographs.
  ucfPrIdeographic = 0x200,
  // - Those format control characters which have specific functions for control of cursive joining and ligation.
  ucfPrJoinControl = 0x400,
  // - There are a small number of characters that do not use logical order.
  //   These characters require special handling in most processing.
  ucfPrLogicalOrderException = 0x800,
  // - Code points that are permanently reserved for internal use.
  ucfPrNoncharacterCodePoint = 0x1000,
  // - Used for pattern syntax as described in UAX #31: Identifier and Pattern Syntax.
  ucfPrPatternSyntax = 0x2000,
  ucfPrPatternWhiteSpace = 0x4000,
  // - Those punctuation characters that function as quotation marks.
  //   U+00022 QUOTATION MARK
  //   U+00027 APOSTROPHE
  //   U+000ab LEFT-POINTING DOUBLE ANGLE QUOTATION MARK
  //   U+000bb RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK
  //   U+02018 LEFT SINGLE QUOTATION MARK
  //   U+02019 RIGHT SINGLE QUOTATION MARK
  //   U+0201a SINGLE LOW-9 QUOTATION MARK
  //   U+0201b SINGLE HIGH-REVERSED-9 QUOTATION MARK
  //   U+0201c LEFT DOUBLE QUOTATION MARK
  //   U+0201d RIGHT DOUBLE QUOTATION MARK
  //   U+0201e DOUBLE LOW-9 QUOTATION MARK
  //   U+0201f DOUBLE HIGH-REVERSED-9 QUOTATION MARK
  //   U+02039 SINGLE LEFT-POINTING ANGLE QUOTATION MARK
  //   U+0203a SINGLE RIGHT-POINTING ANGLE QUOTATION MARK
  //   U+0300c LEFT CORNER BRACKET
  //   U+0300d RIGHT CORNER BRACKET
  //   U+0300e LEFT WHITE CORNER BRACKET
  //   U+0300f RIGHT WHITE CORNER BRACKET
  //   U+0301d REVERSED DOUBLE PRIME QUOTATION MARK
  //   U+0301e DOUBLE PRIME QUOTATION MARK
  //   U+0301f LOW DOUBLE PRIME QUOTATION MARK
  //   U+0fe41 PRESENTATION FORM FOR VERTICAL LEFT CORNER BRACKET
  //   U+0fe42 PRESENTATION FORM FOR VERTICAL RIGHT CORNER BRACKET
  //   U+0fe43 PRESENTATION FORM FOR VERTICAL LEFT WHITE CORNER BRACKET
  //   U+0fe44 PRESENTATION FORM FOR VERTICAL RIGHT WHITE CORNER BRACKET
  //   U+0ff02 FULLWIDTH QUOTATION MARK
  //   U+0ff07 FULLWIDTH APOSTROPHE
  //   U+0ff62 HALFWIDTH LEFT CORNER BRACKET
  //   U+0ff63 HALFWIDTH RIGHT CORNER BRACKET
  ucfPrQuotationMark = 0x8000,
  // - Characters with a "soft dot", like i or j. An accent placed on these characters causes the dot to disappear.
  //   An explicit _dot above_ can be added where required, such as in Lithuanian.
  ucfPrSoftDotted = 0x10000,
  // - Sentence Terminal. Used in UAX #29: Text Boundaries.
  //   U+00021 EXCLAMATION MARK
  //   U+0002e FULL STOP
  //   U+0003f QUESTION MARK
  //   U+0203c DOUBLE EXCLAMATION MARK
  //   U+0203d INTERROBANG
  //   U+02047 DOUBLE QUESTION MARK
  //   U+02048 QUESTION EXCLAMATION MARK
  //   U+02049 EXCLAMATION QUESTION MARK
  //   U+03002 IDEOGRAPHIC FULL STOP
  //   [plus many characters from other writing systems]
  ucfPrSTerm = 0x20000,
  // - Those punctuation characters that generally mark the end of textual units.
  //   [JB note: this set contains more characters than STerm. For example, it contains
  //   the comma, colon and semicolon, whereas STerm doesn't.]
  //   U+00021 EXCLAMATION MARK
  //   U+0002c COMMA
  //   U+0002e FULL STOP
  //   U+0003a COLON
  //   U+0003b SEMICOLON
  //   U+0003f QUESTION MARK
  //   U+0203c DOUBLE EXCLAMATION MARK
  //   U+0203d INTERROBANG
  //   U+02047 DOUBLE QUESTION MARK
  //   U+02048 QUESTION EXCLAMATION MARK
  //   U+02049 EXCLAMATION QUESTION MARK
  //   [plus *lots* of characters from other writing systems]
  ucfPrTerminalPunctuation = 0x40000,
  // - Indicates all those characters that qualify as Variation Selectors.
  //   For details on the behavior of these characters, see StandardizedVariants.html and
  //   Section 16.4, Variation Selectors in [Unicode].
  ucfPrVariationSelector = 0x80000,
  // - Those separator characters and control characters which should be treated by
  //   programming languages as "white space" for the purpose of parsing elements.
  //   Note: ZERO WIDTH SPACE and ZERO WIDTH NO-BREAK SPACE are not included,
  //   since their functions are restricted to line-break control.
  //   Their names are unfortunately misleading in this respect.
  //   Note: There are other senses of "whitespace" that encompass a different set of characters.
  //   [JB note: e.g. there's a BIDI class for whitespace ('WS') in UnicodeData.txt.
  //   There's also a "Sp" class in the sentence boundary algorithm, see UAX #29, sec. 5.1.]
  //   This includes the following characters:
  //   U+0009 <control>
  //   U+000a <control>
  //   U+000b <control>
  //   U+000c <control>
  //   U+000d <control>
  //   U+0020 SPACE
  //   U+0085 <control>
  //   U+00a0 NO-BREAK SPACE
  //   U+1680 OGHAM SPACE MARK
  //   U+180e MONGOLIAN VOWEL SEPARATOR
  //   U+2000 EN QUAD
  //   U+2001 EM QUAD
  //   U+2002 EN SPACE
  //   U+2003 EM SPACE
  //   U+2004 THREE-PER-EM SPACE
  //   U+2005 FOUR-PER-EM SPACE
  //   U+2006 SIX-PER-EM SPACE
  //   U+2007 FIGURE SPACE
  //   U+2008 PUNCTUATION SPACE
  //   U+2009 THIN SPACE
  //   U+200a HAIR SPACE
  //   U+2028 LINE SEPARATOR
  //   U+2029 PARAGRAPH SEPARATOR
  //   U+202f NARROW NO-BREAK SPACE
  //   U+205f MEDIUM MATHEMATICAL SPACE
  //   U+3000 IDEOGRAPHIC SPACE
  ucfPrWhiteSpace = 0x100000
}
TUniChProperties;

typedef enum TUniChPropertiesX_
{
  // More properties from PropList.txt.
  // - Used to derive the properties in DerivedCoreProperties.txt.
  ucfPxOtherAlphabetic = 1,
  ucfPxOtherDefaultIgnorableCodePoint = 2,
  ucfPxOtherGraphemeExtend = 4,
  ucfPxOtherIdContinue = 8,
  ucfPxOtherIdStart = 0x10,
  ucfPxOtherLowercase = 0x20,
  ucfPxOtherMath = 0x40,
  ucfPxOtherUppercase = 0x80,
  // - Used in ideographic description sequences.
  ucfPxIdsBinaryOperator = 0x100,
  ucfPxIdsTrinaryOperator = 0x200,
  ucfPxRadical = 0x400,
  ucfPxUnifiedIdeograph = 0x800
}
TUniChPropertiesX;

//-----------------------------------------------------------------------------
// TUniChInfo -- contains information about a single Unicode codepoint
//-----------------------------------------------------------------------------

class TUniChInfo
{
public:
  enum { // combining classes (for 'combClass'); from UnicodeData.txt
    ccStarter = 0,                       // 0: Spacing, split, enclosing, reordrant, and Tibetan subjoined
    ccOverlaysAndInterior = 1,
    ccNuktas = 7,
    ccHiraganaKatakanaVoicingMarks = 8,
    ccViramas = 9,
    ccFixedPositionStart = 10,           // Start of fixed position classes
    ccFixedPositionEnd = 199,            // End of fixed position classes
    ccBelowLeftAttached = 200,
    ccBelowAttached = 202,
    ccBelowRightAttached = 204,
    ccLeftAttached = 208,                // Left attached (reordrant around single base character)
    ccRightAttached = 210,
    ccAboveLeftAttached = 212,
    ccAboveAttached = 214,
    ccAboveRightAttached = 216,
    ccBelowLeft = 218,
    ccBelow = 220,
    ccBelowRight = 222,
    ccLeft = 224,                        // Left (reordrant around single base character)
    ccRight = 226,
    ccAboveLeft = 228,
    ccAbove = 230,
    ccAboveRight = 232,
    ccDoubleBelow = 233,
    ccDoubleAbove = 234,
    ccBelowIotaSubscript = 240,          // Below (iota subscript)
    ccInvalid = 255                      // not defined by Unicode
  };
  char chCat, chSubCat;      // chCat + chSubCat together comprise the general category (from UnicodeData.txt)
  uchar combClass;           // canonical combining class
  TUniChCategory cat;        // = TUniChCategory(chCat)
  TUniChSubCategory subCat;  // = TUniChSubCategory(cat << 8 | subCat)
  signed char script;        // keyId into 'TUniChDb.scriptNames'; -1 if unknown
  int simpleUpperCaseMapping, simpleLowerCaseMapping, simpleTitleCaseMapping;  // from UnicodeData.txt
  int decompOffset;          // offset into 'TUniChDb.decompositions'; or -1 if the character doesn't change during decomposition
  int nameOffset;            // offset into 'TUniChDb.charNames'
  int flags;                 // a combination of TUniChFlags
  int properties;            // a combination of TUniChProperties
  int propertiesX;           // a combination of TUniChPropertiesX
  ushort lineBreak;          // from LineBreak.txt

  // Converts a 2-letter linebreak code into a 16-bit integer.
  static inline ushort GetLineBreakCode(char c1, char c2) { return ((ushort(uchar(c1)) & 0xff) << 8) | ((ushort(uchar(c2)) & 0xff)); }
  static const ushort LineBreak_Unknown, LineBreak_ComplexContext, LineBreak_Numeric, LineBreak_InfixNumeric, LineBreak_Quotation;

public:
  void InitAfterLoad() {
    cat = (TUniChCategory) chCat;
    subCat = (TUniChSubCategory) (((int(uchar(chCat)) & 0xff) << 8) | (int(uchar(chSubCat)) & 0xff)); }
  void SetCatAndSubCat(const TUniChSubCategory catAndSubCat) {
    cat = (TUniChCategory) ((int(catAndSubCat) >> 8) & 0xff);
    subCat = catAndSubCat;
    chCat = (char) cat; chSubCat = (char) (int(subCat) & 0xff); }
  friend class TUniChDb;

  // Inexplicably missing from TSIn/TSOut...
  static inline void LoadUShort(TSIn& SIn, ushort& u) { SIn.LoadBf(&u, sizeof(u)); }
  static inline void LoadSChar(TSIn& SIn, signed char& u) { SIn.LoadBf(&u, sizeof(u)); }
  static inline void SaveUShort(TSOut& SOut, ushort u) { SOut.SaveBf(&u, sizeof(u)); }
  static inline void SaveSChar(TSOut& SOut, signed char u) { SOut.SaveBf(&u, sizeof(u)); }

public:
  void Save(TSOut& SOut) const {
    SOut.Save(chCat); SOut.Save(chSubCat); SOut.Save(combClass); SaveSChar(SOut, script);
    SOut.Save(simpleUpperCaseMapping); SOut.Save(simpleLowerCaseMapping); SOut.Save(simpleTitleCaseMapping);
    SOut.Save(decompOffset); SOut.Save(nameOffset);
    SOut.Save(flags); SOut.Save(properties); SOut.Save(propertiesX); SaveUShort(SOut, lineBreak); }
  void Load(TSIn& SIn) {
    SIn.Load(chCat); SIn.Load(chSubCat); SIn.Load(combClass); LoadSChar(SIn, script);
    SIn.Load(simpleUpperCaseMapping); SIn.Load(simpleLowerCaseMapping); SIn.Load(simpleTitleCaseMapping);
    SIn.Load(decompOffset); SIn.Load(nameOffset);
    SIn.Load(flags); SIn.Load(properties); SIn.Load(propertiesX); LoadUShort(SIn, lineBreak); InitAfterLoad(); }
  explicit TUniChInfo(TSIn& SIn) { Load(SIn); }
  TUniChInfo() : chCat(char(ucOther)), chSubCat(char(ucOtherNotAssigned & 0xff)), combClass(ccInvalid),
    script(-1), simpleUpperCaseMapping(-1), simpleLowerCaseMapping(-1), simpleTitleCaseMapping(-1),
    decompOffset(-1), nameOffset(-1), flags(0), properties(0), propertiesX(0), lineBreak(LineBreak_Unknown) {
    InitAfterLoad(); }

  // DerivedCoreProperties flags.
  bool IsDcpFlag(const TUniChFlags flag) const { Assert((flag & ucfDcpMask) == flag); return (flags & flag) == flag; }
  void ClrDcpFlags() { flags = flags & ~ucfDcpMask; }
  void SetDcpFlag(const TUniChFlags flag) { Assert((flag & ucfDcpMask) == flag); flags |= flag; }
  bool IsAlphabetic() const { return IsDcpFlag(ucfDcpAlphabetic); }
  bool IsUppercase() const { return IsDcpFlag(ucfDcpUppercase); }
  bool IsLowercase() const { return IsDcpFlag(ucfDcpLowercase); }
  bool IsMath() const { return IsDcpFlag(ucfDcpMath); }
  bool IsDefaultIgnorable() const { return IsDcpFlag(ucfDcpDefaultIgnorableCodePoint); }
  bool IsGraphemeBase() const { return IsDcpFlag(ucfDcpGraphemeBase); }
  bool IsGraphemeExtend() const { return IsDcpFlag(ucfDcpGraphemeExtend); }
  bool IsIdStart() const { return IsDcpFlag(ucfDcpIdStart); }
  bool IsIdContinue() const { return IsDcpFlag(ucfDcpIdContinue); }
  bool IsXidStart() const { return IsDcpFlag(ucfDcpXidStart); }
  bool IsXidContinue() const { return IsDcpFlag(ucfDcpXidContinue); }

  // PropList.txt flags.
  bool IsProperty(const TUniChProperties flag) const { return (properties & flag) == flag; }
  void SetProperty(const TUniChProperties flag) { properties |= flag; }
  bool IsAsciiHexDigit() const { return IsProperty(ucfPrAsciiHexDigit); }
  bool IsBidiControl() const { return IsProperty(ucfPrBidiControl); }
  bool IsDash() const { return IsProperty(ucfPrDash); }
  bool IsDeprecated() const { return IsProperty(ucfPrDeprecated); }
  bool IsDiacritic() const { return IsProperty(ucfPrDiacritic); }
  bool IsExtender() const { return IsProperty(ucfPrExtender); }
  bool IsGraphemeLink() const { return IsProperty(ucfPrGraphemeLink); }
  bool IsHexDigit() const { return IsProperty(ucfPrHexDigit); }
  bool IsHyphen() const { return IsProperty(ucfPrHyphen); }
  bool IsIdeographic() const { return IsProperty(ucfPrIdeographic); }
  bool IsJoinControl() const { return IsProperty(ucfPrJoinControl); }
  bool IsLogicalOrderException() const { return IsProperty(ucfPrLogicalOrderException); }
  bool IsNoncharacter() const { return IsProperty(ucfPrNoncharacterCodePoint); }
  bool IsQuotationMark() const { return IsProperty(ucfPrQuotationMark); }
  bool IsSoftDotted() const { return IsProperty(ucfPrSoftDotted); }
  bool IsSTerminal() const { return IsProperty(ucfPrSTerm); }
  bool IsTerminalPunctuation() const { return IsProperty(ucfPrTerminalPunctuation); }
  bool IsVariationSelector() const { return IsProperty(ucfPrVariationSelector); }
  bool IsWhiteSpace() const { return IsProperty(ucfPrWhiteSpace); }

  // Additional PropList.txt flags.
  bool IsPropertyX(const TUniChPropertiesX flag) const { return (propertiesX & flag) == flag; }
  void SetPropertyX(const TUniChPropertiesX flag) { propertiesX |= flag; }

  // Miscellaneous flags.
  bool IsCompositionExclusion() const { return (flags & ucfCompositionExclusion) == ucfCompositionExclusion; }
  bool IsCompatibilityDecomposition() const { return (flags & ucfCompatibilityDecomposition) == ucfCompatibilityDecomposition; }

  // Word-boundary flags.
  bool IsWbFlag(const TUniChFlags flag) const { Assert((flag & ucfWbMask) == flag); return (flags & flag) == flag; }
  void ClrWbAndSbFlags() { flags = flags & ~(ucfWbMask | ucfSbMask); }
  void SetWbFlag(const TUniChFlags flag) { Assert((flag & ucfWbMask) == flag); flags |= flag; }
  int GetWbFlags() const { return flags & ucfWbMask; }
  bool IsWbFormat() const { return IsWbFlag(ucfWbFormat); }
  TStr GetWbFlagsStr() const { return GetWbFlagsStr(GetWbFlags()); }
  static TStr GetWbFlagsStr(const int flags) { return TStr("") + (flags & ucfWbALetter ? "A" : "") +
    (flags & ucfWbFormat ? "F" : "") + (flags & ucfWbKatakana ? "K" : "") + (flags & ucfWbMidLetter ? "M" : "") +
    (flags & ucfWbMidNum ? "m" : "") + (flags & ucfWbNumeric ? "N" : "") + (flags & ucfWbExtendNumLet ? "E" : ""); }

  // Sentence-boundary flags.
  bool IsSbFlag(const TUniChFlags flag) const { Assert((flag & ucfSbMask) == flag); return (flags & flag) == flag; }
  void SetSbFlag(const TUniChFlags flag) { Assert((flag & ucfSbMask) == flag); flags |= flag; }
  int GetSbFlags() const { return flags & ucfSbMask; }
  bool IsSbFormat() const { return IsSbFlag(ucfSbFormat); }
  TStr GetSbFlagsStr() const { return GetSbFlagsStr(GetSbFlags()); }
  static TStr GetSbFlagsStr(const int flags) { return TStr("") + (flags & ucfSbSep ? "S" : "") +
    (flags & ucfSbFormat ? "F" : "") + (flags & ucfSbSp ? "_" : "") + (flags & ucfSbLower ? "L" : "") +
    (flags & ucfSbUpper ? "U" : "") + (flags & ucfSbOLetter ? "O" : "") + (flags & ucfSbNumeric ? "N" : "") +
    (flags & ucfSbATerm ? "A" : "") + (flags & ucfSbSTerm ? "T" : "") + (flags & ucfSbClose ? "C" : ""); }

  bool IsSbSep() const { return (flags & ucfSbSep) == ucfSbSep; }

  // Grapheme-boundary flags.
  bool IsGbExtend() const { return IsGraphemeExtend(); }

  // Sec. 3.13, D47: C is cased iff it is uppercase, lowercase, or general_category == titlecase_letter.
  bool IsCased() const { return IsUppercase() || IsLowercase() || (subCat == ucLetterTitlecase); }

  // Character categories.
  TUniChCategory GetCat() const { return (TUniChCategory) cat; }
  TUniChSubCategory GetSubCat() const { return (TUniChSubCategory) subCat; }
  // The following characters belong to the 'symbol/currency' subcategory:
  //   U+00024 DOLLAR SIGN
  //   U+000a2 CENT SIGN
  //   U+000a3 POUND SIGN
  //   U+000a4 CURRENCY SIGN
  //   U+000a5 YEN SIGN
  //   U+020a3 FRENCH FRANC SIGN
  //   U+020a4 LIRA SIGN
  //   U+020ac EURO SIGN
  //   [and plenty of others]
  bool IsCurrency() const { return subCat == ucSymbolCurrency; }
  // Note: most private-use and surrogate characters aren't listed explicitly in UnicodeData.txt.
  // Thus, it's better to call TUniChDb's versions of these methods, which are aware of
  // the full ranges of private-use and surrogate characters.
  bool IsPrivateUse() const { return subCat == ucOtherPrivateUse; }
  bool IsSurrogate() const { return subCat == ucOtherSurrogate; }

  inline static bool IsValidSubCat(const char chCat, const char chSubCat) {
    static const char s[] = "LuLlLtLmLoMnMcMeNdNlNoPcPdPsPePiPfPoSmScSkSoZsZlZpCcCfCsCoCn";
    for (const char *p = s; *p; p += 2)
      if (chCat == p[0] && chSubCat == p[1]) return true;
    return false; }
};

//-----------------------------------------------------------------------------
// TUniTrie -- a trie for suffixes that should not appear at the end
// of a sentence
//-----------------------------------------------------------------------------

template<typename TItem_>
class TUniTrie
{
public:
  typedef TItem_ TItem;
protected:
  class TNode {
  public:
    TItem item;
    int child, sib;
    bool terminal;
    TNode() : child(-1), sib(-1), terminal(false) { }
    TNode(const TItem& item_, const int child_, const int sib_, const bool terminal_) : item(item_), child(child_), sib(sib_), terminal(terminal_) { }
  };
  typedef TVec<TNode> TNodeV;
  typedef TPair<TItem, TItem> TItemPr;
  typedef TTriple<TItem, TItem, TItem> TItemTr;
  typedef TUniVecIdx TVecIdx;
  THash<TItem, TVoid> singles;
  THash<TItemPr, TVoid> pairs;
  THash<TItemTr, TInt> roots;
  TNodeV nodes;
public:
  TUniTrie() { }
  void Clr() { singles.Clr(); pairs.Clr(); roots.Clr(); nodes.Clr(); }

  bool Empty() const { return singles.Empty() && pairs.Empty() && roots.Empty(); }

  bool Has1Gram(const TItem& item) const { return singles.IsKey(item); }
  bool Has2Gram(const TItem& last, const TItem& butLast) const { return pairs.IsKey(TItemPr(last, butLast)); }
  int Get3GramRoot(const TItem& last, const TItem& butLast, const TItem& butButLast) const {
    int keyId = roots.GetKeyId(TItemTr(last, butLast, butButLast));
    if (keyId < 0) return 0; else return roots[keyId]; }
  int GetChild(const int parentIdx, const TItem& item) const {
    for (int childIdx = nodes[parentIdx].child; childIdx >= 0; ) {
      const TNode &node = nodes[childIdx];
      if (node.item == item) return childIdx;
      childIdx = node.sib; }
    return -1; }
  bool IsNodeTerminal(const int nodeIdx) const { return nodes[nodeIdx].terminal; }

  // Adds a new string to the trie. Note that the last characters appear
  // closer to the root of the trie.
  template<typename TSrcVec>
  void Add(const TSrcVec& src, const size_t srcIdx, const size_t srcCount)
  {
    IAssert(srcCount > 0);
    if (srcCount == 1) { singles.AddKey(TItem(src[TVecIdx(srcIdx)])); return; }
    if (srcCount == 2) { pairs.AddKey(TItemPr(TItem(src[TVecIdx(srcIdx + 1)]), TItem(src[TVecIdx(srcIdx)]))); return; }
    size_t srcLast = srcIdx + (srcCount - 1);
    TItemTr tr = TItemTr(TItem(src[TVecIdx(srcLast)]), TItem(src[TVecIdx(srcLast - 1)]), TItem(src[TVecIdx(srcLast - 2)]));
    int keyId = roots.GetKeyId(tr), curNodeIdx = -1;
    if (keyId >= 0) curNodeIdx = roots[keyId];
    else { curNodeIdx = nodes.Add(TNode(TItem(0), -1, -1, false)); roots.AddDat(tr, curNodeIdx); }
    if (srcCount > 3) for (size_t srcPos = srcLast - 3; ; )
    {
      const TItem curItem = src[TVecIdx(srcPos)];
      int childNodeIdx = nodes[curNodeIdx].child;
      while (childNodeIdx >= 0) {
        TNode &childNode = nodes[childNodeIdx];
        if (childNode.item == curItem) break;
        childNodeIdx = childNode.sib; }
      if (childNodeIdx < 0) {
        childNodeIdx = nodes.Add(TNode(curItem, -1, nodes[curNodeIdx].child, false));
        nodes[curNodeIdx].child = childNodeIdx; }
      curNodeIdx = childNodeIdx;
      if (srcPos == srcIdx) break; else srcPos--;
    }
    nodes[curNodeIdx].terminal = true;
  }

  template<typename TSrcVec>
  void Add(const TSrcVec& src) { Add(src, 0, (size_t) src.Len()); }
};
int keyId = roots.GetKeyId(TItemTr(last, butLast, butButLast)); 01207 if (keyId < 0) return 0; else return roots[keyId]; } 01208 int GetChild(const int parentIdx, const TItem& item) const { 01209 for (int childIdx = nodes[parentIdx].child; childIdx >= 0; ) { 01210 const TNode &node = nodes[childIdx]; 01211 if (node.item == item) return childIdx; 01212 childIdx = node.sib; } 01213 return -1; } 01214 bool IsNodeTerminal(const int nodeIdx) const { return nodes[nodeIdx].terminal; } 01215 01216 // Adds a new string to the trie. Note that the last characters appear 01217 // closer to the root of the trie. 01218 template<typename TSrcVec> 01219 void Add(const TSrcVec& src, const size_t srcIdx, const size_t srcCount) 01220 { 01221 IAssert(srcCount > 0); 01222 if (srcCount == 1) { singles.AddKey(TItem(src[TVecIdx(srcIdx)])); return; } 01223 if (srcCount == 2) { pairs.AddKey(TItemPr(TItem(src[TVecIdx(srcIdx + 1)]), TItem(src[TVecIdx(srcIdx)]))); return; } 01224 size_t srcLast = srcIdx + (srcCount - 1); 01225 TItemTr tr = TItemTr(TItem(src[TVecIdx(srcLast)]), TItem(src[TVecIdx(srcLast - 1)]), TItem(src[TVecIdx(srcLast - 2)])); 01226 int keyId = roots.GetKeyId(tr), curNodeIdx = -1; 01227 if (keyId >= 0) curNodeIdx = roots[keyId]; 01228 else { curNodeIdx = nodes.Add(TNode(TItem(0), -1, -1, false)); roots.AddDat(tr, curNodeIdx); } 01229 // 01230 if (srcCount > 3) for (size_t srcPos = srcLast - 3; ; ) 01231 { 01232 const TItem curItem = src[TVecIdx(srcPos)]; 01233 int childNodeIdx = nodes[curNodeIdx].child; 01234 while (childNodeIdx >= 0) { 01235 TNode &childNode = nodes[childNodeIdx]; 01236 if (childNode.item == curItem) break; 01237 childNodeIdx = childNode.sib; } 01238 if (childNodeIdx < 0) { 01239 childNodeIdx = nodes.Add(TNode(curItem, -1, nodes[curNodeIdx].child, false)); 01240 nodes[curNodeIdx].child = childNodeIdx; } 01241 curNodeIdx = childNodeIdx; 01242 if (srcPos == srcIdx) break; else srcPos--; 01243 } 01244 nodes[curNodeIdx].terminal = true; 01245 } 01246 01247 template<typename TSrcVec> 01248 void Add(const TSrcVec& src) { Add(src, 0, (size_t) src.Len()); } 01249 }; 01250 01251 //----------------------------------------------------------------------------- 01252 // TUniChDb -- provides access to the Unicode Character Database 01253 //----------------------------------------------------------------------------- 01254 01255 class TUniChDb 01256 { 01257 protected: 01258 void InitAfterLoad(); 01259 typedef TUniVecIdx TVecIdx; 01260 01261 public: 01262 THash<TInt, TUniChInfo> h; // key: codepoint 01263 TStrPool charNames; 01264 TStrIntH scripts; // keyID = used in TUniChInfo.script; key = script name; dat = number of characters (informative only) 01265 TIntV decompositions; 01266 THash<TIntPr, TInt> inverseDec; 01267 TUniCaseFolding caseFolding; 01268 // These hash tables contain only the unconditional mappings from SpecialCasing.txt. 01269 // The conditional mappings are hardcoded into GetCaseConverted(). 
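// Usage sketch for the TUniTrie defined above. Strings are stored reversed (the *last*
// character sits closest to the root), and 1- and 2-character strings go into separate
// hash tables; the values below are illustrative only:
//   TUniTrie<TInt> trie;
//   TIntV abbrev; abbrev.Add('D'); abbrev.Add('r');   // the string "Dr"
//   trie.Add(abbrev);                                 // stored as the pair ('r', 'D')
//   bool hit = trie.Has2Gram(TInt('r'), TInt('D'));   // true
// For longer strings, Get3GramRoot() returns the subtree for the last three characters,
// which can then be walked with GetChild() and tested with IsNodeTerminal().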
01270 TIntIntVH specialCasingLower, specialCasingUpper, specialCasingTitle; 01271 int scriptUnknown; // = scripts.GetKey("Unknown") 01272 01273 TUniChDb() : scriptUnknown(-1) { } 01274 explicit TUniChDb(TSIn& SIn) { Load(SIn); } 01275 void Clr() { 01276 h.Clr(); charNames.Clr(); decompositions.Clr(); inverseDec.Clr(); caseFolding.Clr(); 01277 specialCasingLower.Clr(); specialCasingUpper.Clr(); specialCasingTitle.Clr(); 01278 scripts.Clr(); } 01279 void Save(TSOut& SOut) const { 01280 h.Save(SOut); charNames.Save(SOut); decompositions.Save(SOut); 01281 inverseDec.Save(SOut); caseFolding.Save(SOut); scripts.Save(SOut); 01282 specialCasingLower.Save(SOut); specialCasingUpper.Save(SOut); specialCasingTitle.Save(SOut); 01283 SOut.SaveCs(); } 01284 void Load(TSIn& SIn) { 01285 h.Load(SIn); charNames.~TStrPool(); new (&charNames) TStrPool(SIn); 01286 decompositions.Load(SIn); 01287 inverseDec.Load(SIn); caseFolding.Load(SIn); scripts.Load(SIn); 01288 specialCasingLower.Load(SIn); specialCasingUpper.Load(SIn); specialCasingTitle.Load(SIn); 01289 SIn.LoadCs(); InitAfterLoad(); } 01290 void LoadBin(const TStr& fnBin) { 01291 PSIn SIn = TFIn::New(fnBin); Load(*SIn); } 01292 void Test(const TStr& basePath); 01293 01294 // File names used by LoadTxt() and its subroutines. 01295 static TStr GetCaseFoldingFn() { return "CaseFolding.txt"; } 01296 static TStr GetSpecialCasingFn() { return "SpecialCasing.txt"; } 01297 static TStr GetUnicodeDataFn() { return "UnicodeData.txt"; } 01298 static TStr GetCompositionExclusionsFn() { return "CompositionExclusions.txt"; } 01299 static TStr GetScriptsFn() { return "Scripts.txt"; } 01300 static TStr GetDerivedCorePropsFn() { return "DerivedCoreProperties.txt"; } 01301 static TStr GetLineBreakFn() { return "LineBreak.txt"; } 01302 static TStr GetPropListFn() { return "PropList.txt"; } 01303 static TStr GetAuxiliaryDir() { return "auxiliary"; } 01304 static TStr GetWordBreakTestFn() { return "WordBreakTest.txt"; } 01305 static TStr GetWordBreakPropertyFn() { return "WordBreakProperty.txt"; } 01306 static TStr GetSentenceBreakTestFn() { return "SentenceBreakTest.txt"; } 01307 static TStr GetSentenceBreakPropertyFn() { return "SentenceBreakProperty.txt"; } 01308 static TStr GetNormalizationTestFn() { return "NormalizationTest.txt"; } 01309 static TStr GetBinFn() { return "UniChDb.bin"; } // used only by Test() 01310 01311 //------------------------------------------------------------------------- 01312 // Script names 01313 //------------------------------------------------------------------------- 01314 01315 // These constants are used when initializing from the text files. 
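// A typical initialization pattern: parse the original Unicode text files once with
// LoadTxt() (declared further below) and dump the result with SaveBin(); later runs can
// then call LoadBin(), which is much faster. The directory name is illustrative only:
//   TUniChDb ucd;
//   ucd.LoadTxt("data/ucd");        // directory containing UnicodeData.txt, CaseFolding.txt, ...
//   ucd.SaveBin("UniChDb.bin");
//   // ... later, or in another process:
//   TUniChDb ucd2;
//   ucd2.LoadBin("UniChDb.bin");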
01316 static TStr GetScriptNameUnknown() { return "Unknown"; } 01317 static TStr GetScriptNameKatakana() { return "Katakana"; } 01318 static TStr GetScriptNameHiragana() { return "Hiragana"; } 01319 // 01320 const TStr& GetScriptName(const int scriptId) const { return scripts.GetKey(scriptId); } 01321 int GetScriptByName(const TStr& scriptName) const { return scripts.GetKeyId(scriptName); } 01322 int GetScript(const TUniChInfo& ci) const { int s = ci.script; if (s < 0) s = scriptUnknown; return s; } 01323 int GetScript(const int cp) const { int i = h.GetKeyId(cp); if (i < 0) return scriptUnknown; else return GetScript(h[i]); } 01324 01325 //------------------------------------------------------------------------- 01326 // Character names 01327 //------------------------------------------------------------------------- 01328 01329 // GetCharName returns 0 if the name is unknown; GetCharNameS returns the name, or a string of the form "U+1234" if the name is unknown. 01330 const char *GetCharName(const int cp) const { int i = h.GetKeyId(cp); if (i < 0) return 0; int ofs = h[i].nameOffset; return ofs < 0 ? 0 : charNames.GetCStr(ofs); } 01331 TStr GetCharNameS(const int cp) const { 01332 // ToDo: Add special processing for precomposed Hangul syllables (UAX #15, sec. 16). 01333 const char *p = GetCharName(cp); if (p) return p; 01334 char buf[20]; sprintf(buf, "U+%04x", cp); return TStr(buf); } 01335 template<class TSrcVec> void PrintCharNames(FILE *f, const TSrcVec& src, size_t srcIdx, const size_t srcCount, const TStr& prefix) const { 01336 if (! f) f = stdout; 01337 for (const size_t srcEnd = srcIdx + srcCount; srcIdx < srcEnd; srcIdx++) { 01338 fprintf(f, "%s", prefix.CStr()); 01339 int cp = src[TVecIdx(srcIdx)]; fprintf(f, (cp >= 0x10000 ? "U+%05x" : "U+%04x "), cp); 01340 fprintf(f, " %s\n", GetCharNameS(cp).CStr()); }} 01341 template<class TSrcVec> void PrintCharNames(FILE *f, const TSrcVec& src, const TStr& prefix) const { PrintCharNames(f, src, 0, src.Len(), prefix); } 01342 01343 //------------------------------------------------------------------------- 01344 // Character information 01345 //------------------------------------------------------------------------- 01346 // These methods provide access to a subset of the functionality 01347 // available in TUniChInfo.
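// Name and script lookups in action; a minimal sketch, assuming 'ucd' is an
// already-loaded TUniChDb (the codepoints are illustrative only):
//   printf("%s\n", ucd.GetCharNameS(0x20ac).CStr());   // "EURO SIGN"
//   printf("%s\n", ucd.GetCharNameS(0xe000).CStr());   // no name in the database -> "U+e000"
//   int katakana = ucd.GetScriptByName(TUniChDb::GetScriptNameKatakana());
//   if (ucd.GetScript(0x30a2) == katakana) { /* U+30A2 KATAKANA LETTER A */ }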
01348 01349 bool IsGetChInfo(const int cp, TUniChInfo& ChInfo) { 01350 int i = h.GetKeyId(cp); 01351 if (i < 0) return false; else { ChInfo=h[i]; return true; }} 01352 TUniChCategory GetCat(const int cp) const { int i = h.GetKeyId(cp); if (i < 0) return ucOther; else return h[i].cat; } 01353 TUniChSubCategory GetSubCat(const int cp) const { int i = h.GetKeyId(cp); if (i < 0) return ucOtherNotAssigned; else return h[i].subCat; } 01354 01355 bool IsWbFlag(const int cp, const TUniChFlags flag) const { int i = h.GetKeyId(cp); if (i < 0) return false; else return h[i].IsWbFlag(flag); } 01356 int GetWbFlags(const int cp) const { int i = h.GetKeyId(cp); if (i < 0) return 0; else return h[i].GetWbFlags(); } 01357 bool IsSbFlag(const int cp, const TUniChFlags flag) const { int i = h.GetKeyId(cp); if (i < 0) return false; else return h[i].IsSbFlag(flag); } 01358 int GetSbFlags(const int cp) const { int i = h.GetKeyId(cp); if (i < 0) return 0; else return h[i].GetSbFlags(); } 01359 01360 #define ___UniFwd1(name) bool name(const int cp) const { int i = h.GetKeyId(cp); if (i < 0) return false; else return h[i].name(); } 01361 #define ___UniFwd2(name1, name2) ___UniFwd1(name1) ___UniFwd1(name2) 01362 #define ___UniFwd3(name1, name2, name3) ___UniFwd2(name1, name2) ___UniFwd1(name3) 01363 #define ___UniFwd4(name1, name2, name3, name4) ___UniFwd3(name1, name2, name3) ___UniFwd1(name4) 01364 #define ___UniFwd5(name1, name2, name3, name4, name5) ___UniFwd4(name1, name2, name3, name4) ___UniFwd1(name5) 01365 01366 #define DECLARE_FORWARDED_PROPERTY_METHODS \ 01367 ___UniFwd5(IsAsciiHexDigit, IsBidiControl, IsDash, IsDeprecated, IsDiacritic) \ 01368 ___UniFwd5(IsExtender, IsGraphemeLink, IsHexDigit, IsHyphen, IsIdeographic) \ 01369 ___UniFwd5(IsJoinControl, IsLogicalOrderException, IsNoncharacter, IsQuotationMark, IsSoftDotted) \ 01370 ___UniFwd4(IsSTerminal, IsTerminalPunctuation, IsVariationSelector, IsWhiteSpace) \ 01371 ___UniFwd5(IsAlphabetic, IsUppercase, IsLowercase, IsMath, IsDefaultIgnorable) \ 01372 ___UniFwd4(IsGraphemeBase, IsGraphemeExtend, IsIdStart, IsIdContinue) \ 01373 ___UniFwd2(IsXidStart, IsXidContinue) \ 01374 ___UniFwd3(IsCompositionExclusion, IsCompatibilityDecomposition, IsSbSep) \ 01375 ___UniFwd1(IsGbExtend) \ 01376 ___UniFwd2(IsCased, IsCurrency) 01377 01378 DECLARE_FORWARDED_PROPERTY_METHODS 01379 01380 #undef ___UniFwd1 01381 01382 bool IsPrivateUse(const int cp) const { 01383 int i = h.GetKeyId(cp); if (i >= 0) return h[i].IsPrivateUse(); 01384 return (0xe000 <= cp && cp <= 0xf8ff) || // plane 0 private-use area 01385 // Planes 15 and 16 are entirely for private use. 01386 (0xf0000 <= cp && cp <= 0xffffd) || (0x100000 <= cp && cp <= 0x10fffd); } 01387 // Note: d800..dbff are high surrogates, dc00..dfff are low surrogates. 01388 // For db80..dbff it is clear that the surrogate pair containing this high surrogate 01389 // will refer to a private-use codepoint, but IsPrivateUse nevertheless returns false 01390 // for db80..dbff. This is consistent with the category codes assigned in UnicodeData.txt. 01391 bool IsSurrogate(const int cp) const { 01392 int i = h.GetKeyId(cp); if (i >= 0) return h[i].IsSurrogate(); 01393 return 0xd800 <= cp && cp <= 0xdcff; } 01394 01395 // Note: in particular, all Hangul characters (HangulLBase..HangulLBase + HangulLCount - 1 01396 // and HangulSBase..HangulSBase + HangulSCount - 1) should be treated as starters 01397 // for composition to work correctly. 
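// The Hangul constants defined just below follow the standard arithmetic decomposition
// of precomposed syllables. A worked example for U+AC01 (HANGUL SYLLABLE GAG):
//   int s = 0xac01 - HangulSBase;                            // 1
//   int l = HangulLBase + s / HangulNCount;                  // 0x1100, HANGUL CHOSEONG KIYEOK
//   int v = HangulVBase + (s % HangulNCount) / HangulTCount; // 0x1161, HANGUL JUNGSEONG A
//   int t = HangulTBase + s % HangulTCount;                  // 0x11a8, HANGUL JONGSEONG KIYEOK
// i.e. U+AC01 decomposes canonically to <U+1100, U+1161, U+11A8>; a trailing consonant
// is present only when s % HangulTCount != 0.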
01398 int GetCombiningClass(const int cp) const { int i = h.GetKeyId(cp); if (i < 0) return TUniChInfo::ccStarter; else return h[i].combClass; } 01399 01400 //------------------------------------------------------------------------- 01401 // Hangul constants 01402 //------------------------------------------------------------------------- 01403 01404 enum { 01405 HangulSBase = 0xAC00, HangulLBase = 0x1100, HangulVBase = 0x1161, HangulTBase = 0x11A7, 01406 HangulLCount = 19, HangulVCount = 21, HangulTCount = 28, 01407 HangulNCount = HangulVCount * HangulTCount, // 588 01408 HangulSCount = HangulLCount * HangulNCount // 11172 01409 }; 01410 01411 //------------------------------------------------------------------------- 01412 // Word boundaries (UAX #29) 01413 //------------------------------------------------------------------------- 01414 01415 protected: 01416 // UAX #29, rule WB3: ignore Format and Extend characters. 01417 // [Note: rule SB5 for sentence boundaries is identical, and thus these methods will also be used for sentence-boundary detection.] 01418 static bool IsWbIgnored(const TUniChInfo& ci) { return ci.IsGbExtend() || ci.IsWbFormat(); } 01419 bool IsWbIgnored(const int cp) const { int i = h.GetKeyId(cp); if (i < 0) return false; else return IsWbIgnored(h[i]); } 01420 // Sets 'position' to the smallest index from 'position..srcEnd-1' that contains a non-ignored character. 01421 template<typename TSrcVec> void WbFindCurOrNextNonIgnored(const TSrcVec& src, size_t& position, const size_t srcEnd) const { 01422 while (position < srcEnd && IsWbIgnored(src[TVecIdx(position)])) position++; } 01423 // Sets 'position' to the smallest index from 'position+1..srcEnd-1' that contains a non-ignored character. 01424 template<typename TSrcVec> void WbFindNextNonIgnored(const TSrcVec& src, size_t& position, const size_t srcEnd) const { 01425 if (position >= srcEnd) return; 01426 position++; while (position < srcEnd && IsWbIgnored(src[TVecIdx(position)])) position++; } 01427 // Sets 'position' to the smallest index from 'position+1..srcEnd-1' that contains a non-ignored character. 01428 template<typename TSrcVec> void WbFindNextNonIgnoredS(const TSrcVec& src, size_t& position, const size_t srcEnd) const { 01429 if (position >= srcEnd) return; 01430 if (IsSbSep(src[TVecIdx(position)])) { position++; return; } 01431 position++; while (position < srcEnd && IsWbIgnored(src[TVecIdx(position)])) position++; } 01432 // Sets 'position' to the largest index from 'srcStart..position-1' that contains a non-ignored character. 01433 template<typename TSrcVec> bool WbFindPrevNonIgnored(const TSrcVec& src, const size_t srcStart, size_t& position) const { 01434 if (position <= srcStart) return false; 01435 while (position > srcStart) { 01436 position--; if (! IsWbIgnored(src[TVecIdx(position)])) return true; } 01437 return false; } 01438 // Test driver for WbFind*NonIgnored. 01439 void TestWbFindNonIgnored(const TIntV& src) const; 01440 void TestWbFindNonIgnored() const; 01441 public: 01442 // Finds the next word boundary strictly after 'position'. 01443 // Note that there is a valid word boundary at 'srcIdx + srcCount'. 01444 // If there is no such word boundary, it returns 'false' and sets 'position' to 'srcIdx + srcCount'. 
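// A minimal sketch of both word-boundary entry points declared below, assuming 'ucd' is a
// loaded TUniChDb and 'text' is a TIntV of codepoints (e.g. from TUniCodec::DecodeUtf8):
//   TBoolV bounds;
//   ucd.FindWordBoundaries(text, 0, text.Len(), bounds);   // bounds.Len() == text.Len() + 1
//   for (int i = 1; i < text.Len(); i++)
//     if (bounds[i]) { /* boundary between text[i-1] and text[i] */ }
//   size_t pos = 0;                                         // or enumerate them one by one:
//   while (ucd.FindNextWordBoundary(text, 0, text.Len(), pos))
//     printf("boundary at %d\n", int(pos));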
01445 template<typename TSrcVec> 01446 bool FindNextWordBoundary(const TSrcVec& src, const size_t srcIdx, const size_t srcCount, size_t &position) const; 01447 // Creates, in 'dest', a vector of 'srcCount + 1' elements, where 'dest[i]' tells if there is a word 01448 // boundary between 'src[srcIdx + i - 1]' and 'src[srcIdx + i]'. Note that 'dest[0]' and 'dest[srcCount]' are 01449 // always set to 'true'. 01450 template<typename TSrcVec> 01451 void FindWordBoundaries(const TSrcVec& src, const size_t srcIdx, const size_t srcCount, TBoolV& dest) const; 01452 protected: 01453 void TestFindNextWordOrSentenceBoundary(const TStr& basePath, bool sentence); 01454 01455 //------------------------------------------------------------------------- 01456 // Sentence boundaries (UAX #29) 01457 //------------------------------------------------------------------------- 01458 01459 protected: 01460 TUniTrie<TInt> sbExTrie; 01461 01462 // Checks whether a sentence that ended at src[position - 1] 01463 // would end in one of the suffixes from sbExTrie. 01464 template<typename TSrcVec> 01465 bool CanSentenceEndHere(const TSrcVec& src, const size_t srcIdx, const size_t position) const; 01466 01467 public: 01468 // Finds the next sentence boundary strictly after 'position'. 01469 // Note that there is a valid sentence boundary at 'srcIdx + srcCount'. 01470 // If there is no such sentence boundary, it returns 'false' and sets 'position' to 'srcIdx + srcCount'. 01471 template<typename TSrcVec> 01472 bool FindNextSentenceBoundary(const TSrcVec& src, const size_t srcIdx, const size_t srcCount, size_t &position) const; 01473 // Creates, in 'dest', a vector of 'srcCount + 1' elements, where 'dest[i]' tells if there is a sentence 01474 // boundary between 'src[srcIdx + i - 1]' and 'src[srcIdx + i]'. Note that 'dest[0]' and 'dest[srcCount]' are 01475 // always set to 'true'. 01476 template<typename TSrcVec> 01477 void FindSentenceBoundaries(const TSrcVec& src, const size_t srcIdx, const size_t srcCount, TBoolV& dest) const; 01478 01479 // These methods allow the user to define a set of sentence boundary exceptions. 01480 // This is a set of strings, stored in 'sbExTrie'. If the Unicode rules require 01481 // a sentence boundary in a position that would cause the sentence to end with 01482 // 'x (STerm | ATerm) Close* Sp* Sep?', where 'x' is a word from 'sbExTrie', 01483 // we will *not* place a sentence boundary there. 01484 // 01485 // NOTE: sbExTrie is not saved or loaded by the Save() and Load() methods. 01486 // By default, it is empty. Use SbEx_Clr() to clear it, and SbEx_SetStdEnglish() to obtain 01487 // a standard set of English-language exceptions. 
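// A minimal sketch that enables the standard English abbreviation exceptions (declared
// just below) and then finds sentence boundaries; 'ucd' and 'text' are as in the
// word-boundary example above:
//   ucd.SbEx_SetStdEnglish();        // e.g. "Dr." will no longer terminate a sentence
//   TBoolV sb;
//   ucd.FindSentenceBoundaries(text, 0, text.Len(), sb);   // sb.Len() == text.Len() + 1
//   for (int i = 1; i < text.Len(); i++)
//     if (sb[i]) { /* a sentence boundary lies between text[i-1] and text[i] */ }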
01488 void SbEx_Clr() { sbExTrie.Clr(); } 01489 template<class TSrcVec> void SbEx_Add(const TSrcVec& v) { sbExTrie.Add(v); } 01490 // template<> void SbEx_Add(const TStr& s) { 01491 void SbEx_Add(const TStr& s) { 01492 TIntV v; int n = s.Len(); v.Gen(n); for (int i = 0; i < n; i++) v[i] = int(uchar(s[i])); SbEx_Add(v); } 01493 void SbEx_AddUtf8(const TStr& s) { TUniCodec codec; TIntV v; codec.DecodeUtf8(s, v); SbEx_Add(v); } 01494 int SbEx_AddMulti(const TStr& words, const bool wordsAreUtf8 = true) { TStrV vec; words.SplitOnAllCh('|', vec); 01495 for (int i = 0; i < vec.Len(); i++) if (wordsAreUtf8) SbEx_AddUtf8(vec[i]); else SbEx_Add(vec[i]); 01496 return vec.Len(); } 01497 void SbEx_Set(const TUniTrie<TInt>& newTrie) { sbExTrie = newTrie; } 01498 int SbEx_SetStdEnglish() { 01499 static const TStr data = "Ms|Mrs|Mr|Rev|Dr|Prof|Gov|Sen|Rep|Gen|Brig|Col|Capt|Lieut|Lt|Sgt|Pvt|Cmdr|Adm|Corp|St|Mt|Ft|e.g|e. g.|i.e|i. e|ib|ibid|s.v|s. v|s.vv|s. vv"; 01500 SbEx_Clr(); return SbEx_AddMulti(data, false); } 01501 01502 //------------------------------------------------------------------------- 01503 // Normalization, decomposition, etc. (UAX #15) 01504 //------------------------------------------------------------------------- 01505 01506 protected: 01507 // Adds, to 'dest', the decomposition of 'codePoint' (calling itself recursively if necessary). 01508 // If 'compatibility == false', only canonical decompositions are used. 01509 template<typename TDestCh> 01510 void AddDecomposition(const int codePoint, TVec<TDestCh>& dest, const bool compatibility) const; 01511 public: 01512 // This appends, to 'dest', the decomposed form of the source string. 01513 // - for normalization form D (NFD), i.e. canonical decomposition: use compatibility == false; 01514 // - for normalization form KD (NFKD), i.e. compatibility decomposition: use compatibility == true. 01515 template<typename TSrcVec, typename TDestCh> 01516 void Decompose(const TSrcVec& src, size_t srcIdx, const size_t srcCount, 01517 TVec<TDestCh>& dest, bool compatibility, bool clrDest = true) const; 01518 template<typename TSrcVec, typename TDestCh> 01519 void Decompose(const TSrcVec& src, TVec<TDestCh>& dest, bool compatibility, bool clrDest = true) const { 01520 Decompose(src, 0, src.Len(), dest, compatibility, clrDest); } 01521 // This performs canonical composition on the source string, and appends 01522 // the result to the destination string. The source string should be the 01523 // result of a (canonical or compatibility) decomposition; if this is the 01524 // case, the composition will lead to a normalization form C (NFC) or 01525 // normalization form KC (NFKC), depending on whether canonical or compatibility 01526 // decomposition was used. 01527 template<typename TSrcVec, typename TDestCh> 01528 void Compose(const TSrcVec& src, size_t srcIdx, const size_t srcCount, 01529 TVec<TDestCh>& dest, bool clrDest = true) const; 01530 template<typename TSrcVec, typename TDestCh> 01531 void Compose(const TSrcVec& src, TVec<TDestCh>& dest, bool clrDest = true) const { 01532 Compose(src, 0, src.Len(), dest, clrDest); } 01533 // Calls Decompose, followed by Compose; thus the result is the NFC (if 01534 // compatibility == false) or NFKC (if compatibility == true) of the source string. 01535 // A temporary TIntV is used to contain the intermediate NF(K)D form of the 01536 // source string. 
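// A minimal sketch of producing the four normalization forms with the methods declared
// here ('ucd' is a loaded TUniChDb, 'src' a TIntV of codepoints):
//   TIntV nfd, nfkd, nfc, nfkc;
//   ucd.Decompose(src, nfd, false);            // NFD
//   ucd.Decompose(src, nfkd, true);            // NFKD
//   ucd.DecomposeAndCompose(src, nfc, false);  // NFC
//   ucd.DecomposeAndCompose(src, nfkc, true);  // NFKC
// For example, U+00C0 (LATIN CAPITAL LETTER A WITH GRAVE) has NFD <U+0041, U+0300>
// and composes back to <U+00C0> in NFC.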
01537 template<typename TSrcVec, typename TDestCh> 01538 void DecomposeAndCompose(const TSrcVec& src, size_t srcIdx, const size_t srcCount, 01539 TVec<TDestCh>& dest, bool compatibility, bool clrDest = true) const; 01540 template<typename TSrcVec, typename TDestCh> 01541 void DecomposeAndCompose(const TSrcVec& src, TVec<TDestCh>& dest, bool compatibility, bool clrDest = true) const { 01542 DecomposeAndCompose(src, 0, src.Len(), dest, compatibility, clrDest); } 01543 // Copies the starter characters from 'src' to 'dest'; the other 01544 // characters are skipped. 'src' should already have been decomposed. 01545 // Returns the number of characters extracted. 01546 template<typename TSrcVec, typename TDestCh> 01547 size_t ExtractStarters(const TSrcVec& src, size_t srcIdx, const size_t srcCount, 01548 TVec<TDestCh>& dest, bool clrDest = true) const; 01549 template<typename TSrcVec, typename TDestCh> 01550 size_t ExtractStarters(const TSrcVec& src, TVec<TDestCh>& dest, bool clrDest = true) const { 01551 return ExtractStarters(src, 0, src.Len(), dest, clrDest); } 01552 // Extracts the starters into a temporary vector and then copies it into 'src'. 01553 template<typename TSrcVec> 01554 size_t ExtractStarters(TSrcVec& src) const { 01555 TIntV temp; size_t retVal = ExtractStarters(src, temp); 01556 src.Clr(); for (int i = 0; i < temp.Len(); i++) src.Add(temp[i]); 01557 return retVal; } 01558 01559 protected: 01560 void TestComposition(const TStr& basePath); 01561 01562 //------------------------------------------------------------------------- 01563 // Initialization from the text files 01564 //------------------------------------------------------------------------- 01565 01566 protected: 01567 void InitWordAndSentenceBoundaryFlags(const TStr& basePath); 01568 void InitScripts(const TStr& basePath); 01569 void InitLineBreaks(const TStr& basePath); 01570 void InitDerivedCoreProperties(const TStr& basePath); 01571 void InitPropList(const TStr& basePath); 01572 void InitSpecialCasing(const TStr& basePath); 01573 void LoadTxt_ProcessDecomposition(TUniChInfo& ci, TStr s); 01574 public: 01575 void LoadTxt(const TStr& basePath); 01576 void SaveBin(const TStr& fnBinUcd); 01577 01578 //------------------------------------------------------------------------- 01579 // Case conversions 01580 //------------------------------------------------------------------------- 01581 01582 public: 01583 typedef enum TCaseConversion_ { ccLower = 0, ccUpper = 1, ccTitle = 2, ccMax = 3 } TCaseConversion; 01584 // Appends the case-converted form of 'src' to 'dest'. 01585 // 'how' defines what kind of case conversion is required. 01586 // 'turkic' should be set to true iff the text is in Turkish ('tr') or Azeri ('az'). 01587 // 'lithuanian' should be set to true iff the text is in Lithuanian ('lt').
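// A minimal sketch of the language-aware conversions declared below ('ucd' is a loaded
// TUniChDb, 'src' a TIntV of codepoints; the trailing flags are all optional):
//   TIntV lower, upper, title;
//   ucd.GetLowerCase(src, lower);                        // clrDest = true, turkic = false, lithuanian = false
//   ucd.GetUpperCase(src, upper, true, /*turkic=*/true); // e.g. maps U+0069 'i' to U+0130 'I with dot above'
//   ucd.GetTitleCase(src, title);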
01588 template<typename TSrcVec, typename TDestCh> void GetCaseConverted(const TSrcVec& src, size_t srcIdx, const size_t srcCount, TVec<TDestCh>& dest, const bool clrDest, const TCaseConversion how, const bool turkic, const bool lithuanian) const; 01589 template<typename TSrcVec, typename TDestCh> void GetLowerCase(const TSrcVec& src, size_t srcIdx, const size_t srcCount, TVec<TDestCh>& dest, const bool clrDest = true, const bool turkic = false, const bool lithuanian = false) const { GetCaseConverted(src, srcIdx, srcCount, dest, clrDest, ccLower, turkic, lithuanian); } 01590 template<typename TSrcVec, typename TDestCh> void GetUpperCase(const TSrcVec& src, size_t srcIdx, const size_t srcCount, TVec<TDestCh>& dest, const bool clrDest = true, const bool turkic = false, const bool lithuanian = false) const { GetCaseConverted(src, srcIdx, srcCount, dest, clrDest, ccUpper, turkic, lithuanian); } 01591 template<typename TSrcVec, typename TDestCh> void GetTitleCase(const TSrcVec& src, size_t srcIdx, const size_t srcCount, TVec<TDestCh>& dest, const bool clrDest = true, const bool turkic = false, const bool lithuanian = false) const { GetCaseConverted(src, srcIdx, srcCount, dest, clrDest, ccTitle, turkic, lithuanian); } 01592 template<typename TSrcVec, typename TDestCh> void GetLowerCase(const TSrcVec& src, TVec<TDestCh>& dest, const bool clrDest = true, const bool turkic = false, const bool lithuanian = false) const { GetLowerCase(src, 0, src.Len(), dest, clrDest, turkic, lithuanian); } 01593 template<typename TSrcVec, typename TDestCh> void GetUpperCase(const TSrcVec& src, TVec<TDestCh>& dest, const bool clrDest = true, const bool turkic = false, const bool lithuanian = false) const { GetUpperCase(src, 0, src.Len(), dest, clrDest, turkic, lithuanian); } 01594 template<typename TSrcVec, typename TDestCh> void GetTitleCase(const TSrcVec& src, TVec<TDestCh>& dest, const bool clrDest = true, const bool turkic = false, const bool lithuanian = false) const { GetTitleCase(src, 0, src.Len(), dest, clrDest, turkic, lithuanian); } 01595 01596 // GetSimpleCaseConverted uses only the simple case mappings (from UnicodeData.txt). 01597 // This is simpler and faster. Since each character now maps into exactly one 01598 // character, case conversion can also be done in place (see ToSimpleCaseConverted, etc.). 
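// Because the simple mappings are 1:1, the in-place variants need no destination
// vector. A minimal sketch ('ucd' and 'src' as in the examples above):
//   TIntV buf = src;
//   ucd.ToSimpleUpperCase(buf);                      // each codepoint replaced via its simple mapping
//   TIntV dest; ucd.GetSimpleTitleCase(src, dest);   // copying variant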
01599 template<typename TSrcVec, typename TDestCh> void GetSimpleCaseConverted(const TSrcVec& src, size_t srcIdx, const size_t srcCount, TVec<TDestCh>& dest, const bool clrDest, const TCaseConversion how) const; 01600 template<typename TSrcVec, typename TDestCh> void GetSimpleLowerCase(const TSrcVec& src, size_t srcIdx, const size_t srcCount, TVec<TDestCh>& dest, const bool clrDest = true) const { GetSimpleCaseConverted(src, srcIdx, srcCount, dest, clrDest, ccLower); } 01601 template<typename TSrcVec, typename TDestCh> void GetSimpleUpperCase(const TSrcVec& src, size_t srcIdx, const size_t srcCount, TVec<TDestCh>& dest, const bool clrDest = true) const { GetSimpleCaseConverted(src, srcIdx, srcCount, dest, clrDest, ccUpper); } 01602 template<typename TSrcVec, typename TDestCh> void GetSimpleTitleCase(const TSrcVec& src, size_t srcIdx, const size_t srcCount, TVec<TDestCh>& dest, const bool clrDest = true) const { GetSimpleCaseConverted(src, srcIdx, srcCount, dest, clrDest, ccTitle); } 01603 template<typename TSrcVec, typename TDestCh> void GetSimpleLowerCase(const TSrcVec& src, TVec<TDestCh>& dest, const bool clrDest = true) const { GetSimpleLowerCase(src, 0, src.Len(), dest, clrDest); } 01604 template<typename TSrcVec, typename TDestCh> void GetSimpleUpperCase(const TSrcVec& src, TVec<TDestCh>& dest, const bool clrDest = true) const { GetSimpleUpperCase(src, 0, src.Len(), dest, clrDest); } 01605 template<typename TSrcVec, typename TDestCh> void GetSimpleTitleCase(const TSrcVec& src, TVec<TDestCh>& dest, const bool clrDest = true) const { GetSimpleTitleCase(src, 0, src.Len(), dest, clrDest); } 01606 01607 template<typename TSrcVec> void ToSimpleCaseConverted(TSrcVec& src, size_t srcIdx, const size_t srcCount, const TCaseConversion how) const; 01608 template<typename TSrcVec> void ToSimpleUpperCase(TSrcVec& src, size_t srcIdx, const size_t srcCount) const { ToSimpleCaseConverted(src, srcIdx, srcCount, ccUpper); } 01609 template<typename TSrcVec> void ToSimpleLowerCase(TSrcVec& src, size_t srcIdx, const size_t srcCount) const { ToSimpleCaseConverted(src, srcIdx, srcCount, ccLower); } 01610 template<typename TSrcVec> void ToSimpleTitleCase(TSrcVec& src, size_t srcIdx, const size_t srcCount) const { ToSimpleCaseConverted(src, srcIdx, srcCount, ccTitle); } 01611 template<typename TSrcVec> void ToSimpleUpperCase(TSrcVec& src) const { ToSimpleUpperCase(src, 0, src.Len()); } 01612 template<typename TSrcVec> void ToSimpleLowerCase(TSrcVec& src) const { ToSimpleLowerCase(src, 0, src.Len()); } 01613 template<typename TSrcVec> void ToSimpleTitleCase(TSrcVec& src) const { ToSimpleTitleCase(src, 0, src.Len()); } 01614 01615 public: 01616 friend class TUniCaseFolding; 01617 01618 // Case folding is an alternative to the above functions. It is intended primarily 01619 // to produce strings that are suitable for comparisons. For example, 01620 // ToLowerCase(sigma) = sigma, ToLowerCase(final-sigma) = final-sigma; 01621 // but ToCaseFolded(sigma) = sigma, ToCaseFolded(final-sigma) = sigma. 01622 // - 'turkic' enables special processing for Turkic languages (I-dot and i-dotless). 01623 // - 'full' enables full case mappings -- i.e. sometimes a character may be mapped 01624 // into a string of two or more characters. 01625 // - Note: For best results, perform NFD(CaseFold(NFD(x))) or NFKD(CaseFold(NFKD(x))) on 01626 // each string before comparing them (see sec. 3.13 of the standard).
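// A minimal sketch of a caseless comparison that follows the NFD(CaseFold(NFD(x)))
// recipe from the note above ('ucd' is a loaded TUniChDb; 'a' and 'b' are TIntVs):
//   TIntV ta, fa, na, tb, fb, nb;
//   ucd.Decompose(a, ta, false); ucd.GetCaseFolded(ta, fa); ucd.Decompose(fa, na, false);
//   ucd.Decompose(b, tb, false); ucd.GetCaseFolded(tb, fb); ucd.Decompose(fb, nb, false);
//   bool equivalent = (na == nb);   // true iff a and b are canonically equivalent ignoring case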
01627 template<typename TSrcVec, typename TDestCh> 01628 void GetCaseFolded(const TSrcVec& src, size_t srcIdx, const size_t srcCount, 01629 TVec<TDestCh>& dest, const bool clrDest, const bool full, const bool turkic = false) const { caseFolding.Fold(src, srcIdx, srcCount, dest, clrDest, full, turkic); } 01630 template<typename TSrcVec, typename TDestCh> 01631 void GetCaseFolded(const TSrcVec& src, TVec<TDestCh>& dest, const bool clrDest = true, const bool full = true, const bool turkic = false) const { 01632 GetCaseFolded(src, 0, src.Len(), dest, clrDest, full, turkic); } 01633 // ToCaseFolded folds the string in place. However, this means that only the simple 01634 // case foldings can be used (the full ones could increase the length of the string). 01635 template<typename TSrcVec> void ToCaseFolded(TSrcVec& src, size_t srcIdx, const size_t srcCount, const bool turkic = false) const { caseFolding.FoldInPlace(src, srcIdx, srcCount, turkic); } 01636 template<typename TSrcVec> void ToCaseFolded(TSrcVec& src, const bool turkic = false) const { ToCaseFolded(src, 0, src.Len(), turkic); } 01637 01638 protected: 01639 void TestCaseConversion(const TStr& source, const TStr& trueLc, const TStr& trueTc, const TStr& trueUc, bool turkic, bool lithuanian); 01640 void TestCaseConversions(); 01641 01642 //------------------------------------------------------------------------- 01643 // Text file reader for the Unicode character database 01644 //------------------------------------------------------------------------- 01645 01646 protected: 01647 01648 class TUcdFileReader 01649 { 01650 protected: 01651 TChA buf; 01652 public: 01653 TChA comment; // contains '#' and everything after it 01654 protected: 01655 FILE *f; 01656 int putBackCh; 01657 int GetCh() { 01658 if (putBackCh >= 0) { int c = putBackCh; putBackCh = EOF; return c; } 01659 return fgetc(f); } 01660 void PutBack(int c) { Assert(putBackCh == EOF); putBackCh = c; } 01661 // Returns 'false' iff the EOF was encountered before anything was read. 01662 bool ReadNextLine() { 01663 buf.Clr(); comment.Clr(); 01664 bool inComment = false, first = true; 01665 while (true) { 01666 int c = GetCh(); 01667 if (c == EOF) return ! first; 01668 else if (c == 13) { 01669 c = GetCh(); if (c != 10) PutBack(c); 01670 return true; } 01671 else if (c == 10) return true; 01672 else if (c == '#') inComment = true; 01673 if (! inComment) buf += char(c); 01674 else comment += char(c); } 01675 /*first = false;*/} 01676 private: 01677 TUcdFileReader& operator = (const TUcdFileReader& r) { Fail; return *((TUcdFileReader *) 0); } 01678 TUcdFileReader(const TUcdFileReader& r) { Fail; } 01679 public: 01680 TUcdFileReader() : f(0) { } 01681 TUcdFileReader(const TStr& fileName) : f(0), putBackCh(EOF) { Open(fileName); } 01682 void Open(const TStr& fileName) { Close(); f = fopen(fileName.CStr(), "rt"); IAssertR(f, fileName); putBackCh = EOF; } 01683 void Close() { putBackCh = EOF; if (f) { fclose(f); f = 0; }} 01684 ~TUcdFileReader() { Close(); } 01685 bool GetNextLine(TStrV& dest) { 01686 dest.Clr(); 01687 while (true) { 01688 if (! 
ReadNextLine()) return false; 01689 TStr line = buf; line.ToTrunc(); 01690 if (line.Len() <= 0) continue; 01691 line.SplitOnAllCh(';', dest, false); 01692 for (int i = 0; i < dest.Len(); i++) dest[i].ToTrunc(); 01693 return true; }} 01694 static int ParseCodePoint(const TStr& s) { 01695 int c; bool ok = s.IsHexInt(true, 0, 0x10ffff, c); IAssertR(ok, s); return c; } 01696 static void ParseCodePointList(const TStr& s, TIntV& dest, bool ClrDestP = true) { // space-separated list 01697 if (ClrDestP) dest.Clr(); 01698 TStrV parts; s.SplitOnWs(parts); 01699 for (int i = 0; i < parts.Len(); i++) { 01700 int c; bool ok = parts[i].IsHexInt(true, 0, 0x10ffff, c); IAssertR(ok, s); 01701 dest.Add(c); } } 01702 static void ParseCodePointRange(const TStr& s, int& from, int &to) { // xxxx or xxxx..yyyy 01703 int i = s.SearchStr(".."); if (i < 0) { from = ParseCodePoint(s); to = from; return; } 01704 from = ParseCodePoint(s.GetSubStr(0, i - 1)); 01705 to = ParseCodePoint(s.GetSubStr(i + 2, s.Len() - 1)); } 01706 }; 01707 01708 //------------------------------------------------------------------------- 01709 // Helper class for processing the text files 01710 //------------------------------------------------------------------------- 01711 // Files such as DerivedCoreProps.txt often refer to ranges of codepoints, 01712 // and not all codepoints from the range have also been listed in 01713 // UnicodeData.txt. Thus, new TUniChInfo instances will be created 01714 // when processing DerivedCoreProps.txt and similar files. 01715 // To assign the correct (sub)categories to these new codepoints, 01716 // the following class will extract the subcategory info from the 01717 // comments in DerivedCoreProps.txt and similar files. 01718 01719 class TSubcatHelper 01720 { 01721 public: 01722 bool hasCat; TUniChSubCategory subCat; 01723 TStrH invalidCatCodes; 01724 TUniChDb &owner; 01725 01726 TSubcatHelper(TUniChDb &owner_) : owner(owner_) { } 01727 01728 void ProcessComment(TUniChDb::TUcdFileReader &reader) 01729 { 01730 hasCat = false; subCat = ucOtherNotAssigned; 01731 if (reader.comment.Len() > 3) 01732 { 01733 IAssert(reader.comment[0] == '#'); 01734 IAssert(reader.comment[1] == ' '); 01735 char chCat = reader.comment[2], chSubCat = reader.comment[3]; 01736 if (reader.comment.Len() > 4) IAssert(isspace(uchar(reader.comment[4]))); 01737 if (TUniChInfo::IsValidSubCat(chCat, chSubCat)) { 01738 hasCat = true; subCat = (TUniChSubCategory) ((int(uchar(chCat)) << 8) | (int(uchar(chSubCat)))); } 01739 else invalidCatCodes.AddKey(TStr(chCat) + TStr(chSubCat)); 01740 } 01741 } 01742 01743 void SetCat(const int cp) { 01744 int i = owner.h.GetKeyId(cp); IAssert(i >= 0); 01745 IAssert(owner.h[i].subCat == ucOtherNotAssigned); 01746 IAssert(hasCat); 01747 owner.h[i].SetCatAndSubCat(subCat); } 01748 void TestCat(const int cp) { 01749 if (! hasCat) return; 01750 int i = owner.h.GetKeyId(cp); IAssert(i >= 0); 01751 IAssert(owner.h[i].subCat == subCat); } 01752 01753 ~TSubcatHelper() 01754 { 01755 if (invalidCatCodes.IsKey("L&")) invalidCatCodes.DelKey("L&"); 01756 // Output any unexpected ones (there shouldn't be any). 01757 if (! 
invalidCatCodes.Empty()) { 01758 printf("Invalid cat code(s) in the comments: "); 01759 for (int i = invalidCatCodes.FFirstKeyId(); invalidCatCodes.FNextKeyId(i); ) 01760 printf(" \"%s\"", invalidCatCodes.GetKey(i).CStr()); 01761 printf("\n"); } 01762 } 01763 }; 01764 }; 01765 01766 //----------------------------------------------------------------------------- 01767 // TUnicode -- a sadly emasculated wrapper around TUniCodec and TUniChDb 01768 //----------------------------------------------------------------------------- 01769 01770 class TUnicode 01771 { 01772 public: 01773 TUniCodec codec; 01774 TUniChDb ucd; 01775 01776 TUnicode() { Init(); } 01777 explicit TUnicode(const TStr& fnBinUcd) { ucd.LoadBin(fnBinUcd); Init(); } 01778 void Init() { InitCodecs(); } 01779 01780 //----------------------------------------------------------------------- 01781 // UTF-8 01782 //----------------------------------------------------------------------- 01783 01784 // Returns the number of characters that have been successfully decoded. 01785 // This does not include any replacement characters that may have been inserted into 'dest'. 01786 int DecodeUtf8(const TIntV& src, TIntV& dest) const { return (int) codec.DecodeUtf8(src, dest); } 01787 int DecodeUtf8(const TStr& src, TIntV& dest) const { return (int) codec.DecodeUtf8(src, dest); } 01788 01789 // Returns the number of characters that have been successfully encoded. 01790 // This does not include any replacement characters that may have been inserted into 'dest'. 01791 int EncodeUtf8(const TIntV& src, TIntV& dest) const { return (int) codec.EncodeUtf8(src, dest); } 01792 01793 // The following wrapper around the UTF-8 encoder returns a TStr containing 01794 // the UTF-8-encoded version of the input string. 01795 TStr EncodeUtf8Str(const TIntV& src) const { return codec.EncodeUtf8Str(src); } 01796 01797 //----------------------------------------------------------------------- 01798 // UTF-16 Decoder 01799 //----------------------------------------------------------------------- 01800 01801 // Returns the number of characters that have been successfully decoded. 01802 // This does not include any replacement characters that may have been inserted into 'dest'. 01803 // Each element of 'src' is assumed to contain one byte of data. 01804 // srcCount must be even (though srcIdx doesn't need to be). 01805 int DecodeUtf16FromBytes(const TIntV& src, TIntV& dest, 01806 const TUtf16BomHandling bomHandling = bomAllowed, 01807 const TUniByteOrder defaultByteOrder = boMachineEndian) const { 01808 return (int) codec.DecodeUtf16FromBytes(src, 0, src.Len(), dest, true, bomHandling, defaultByteOrder); } 01809 01810 // Here, each element of 'src' is treated as a 16-bit word. The byte-order settings 01811 // are used to determine if the two bytes of each word should be swapped before further 01812 // processing. For example, if a BOM is present, it must have the value 0xfeff; if it 01813 // actually has the value 0xfffe, this means that the two bytes of each word must be swapped. 01814 // Basically, the combination of the byteOrder parameter and the byte order mark (if present) at the 01815 // beginning of the source data is used to determine the "original" byte order of the data; 01816 // if this doesn't match the byte order of the local machine, the two bytes of each word will 01817 // be swapped during the decoding process. 
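// A minimal end-to-end sketch of the UTF-8/UTF-16 wrappers ('UniChDb.bin' and the sample
// bytes are illustrative only):
//   TUnicode unicode("UniChDb.bin");
//   TIntV cps; unicode.DecodeUtf8(TStr("na\xc3\xafve"), cps);     // "naïve" -> 5 codepoints
//   TStr utf8 = unicode.EncodeUtf8Str(cps);                       // round-trips to the same bytes
//   TIntV words; unicode.EncodeUtf16ToWords(cps, words, true);    // insertBom = true, machine endian
//   TIntV cps2; unicode.DecodeUtf16FromWords(words, cps2);        // BOM is detected and skipped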
01818 int DecodeUtf16FromWords(const TIntV& src, TIntV& dest, 01819 const TUtf16BomHandling bomHandling = bomAllowed, 01820 const TUniByteOrder defaultByteOrder = boMachineEndian) const { 01821 return (int) codec.DecodeUtf16FromWords(src, 0, src.Len(), dest, true, bomHandling, defaultByteOrder); } 01822 01823 //----------------------------------------------------------------------- 01824 // UTF-16 Encoder 01825 //----------------------------------------------------------------------- 01826 01827 // Returns the number of characters that have been successfully encoded. 01828 // This does not include any replacement characters that may have been inserted into 'dest'. 01829 int EncodeUtf16ToWords(const TIntV& src, TIntV& dest, const bool insertBom, 01830 const TUniByteOrder destByteOrder = boMachineEndian) const { 01831 return (int) codec.EncodeUtf16ToWords(src, 0, src.Len(), dest, true, insertBom, destByteOrder); } 01832 01833 int EncodeUtf16ToBytes(const TIntV& src, TIntV& dest, const bool insertBom, 01834 const TUniByteOrder destByteOrder = boMachineEndian) const { 01835 return (int) codec.EncodeUtf16ToBytes(src, 0, src.Len(), dest, true, insertBom, destByteOrder); } 01836 01837 //----------------------------------------------------------------------- 01838 // 8-bit codecs 01839 //----------------------------------------------------------------------- 01840 01841 T8BitCodec<TEncoding_ISO8859_1> iso8859_1; 01842 T8BitCodec<TEncoding_ISO8859_2> iso8859_2; 01843 T8BitCodec<TEncoding_ISO8859_3> iso8859_3; 01844 T8BitCodec<TEncoding_ISO8859_4> iso8859_4; 01845 T8BitCodec<TEncoding_YuAscii> yuAscii; 01846 T8BitCodec<TEncoding_CP1250> cp1250; 01847 T8BitCodec<TEncoding_CP852> cp852; 01848 T8BitCodec<TEncoding_CP437> cp437; 01849 01850 //----------------------------------------------------------------------- 01851 // Codec registry 01852 //----------------------------------------------------------------------- 01853 // If you know you'll need ISO-8859-2, just use 01854 // TUnicode unicode; 01855 // unicode.iso8859_2.Encode(...); 01856 // If you don't know what you'll need, use: 01857 // TUnicode unicode; 01858 // PCodecBase myCodec = unicode.GetCodec(myCodecName); 01859 // myCodec->Encode(...); 01860 // Note that the first approach is slightly more efficient because there 01861 // aren't any virtual method calls involved. 01862 01863 protected: 01864 THash<TStr, PCodecBase> codecs; 01865 static inline TStr NormalizeCodecName(const TStr& name) { 01866 TStr s = name.GetLc(); s.ChangeStrAll("_", ""); s.ChangeStrAll("-", ""); return s; } 01867 public: 01868 void RegisterCodec(const TStr& nameList, const PCodecBase& codec) { 01869 TStrV names; nameList.SplitOnWs(names); 01870 for (int i = 0; i < names.Len(); i++) 01871 codecs.AddDat(NormalizeCodecName(names[i]), codec); } 01872 void UnregisterCodec(const TStr& nameList) { 01873 TStrV names; nameList.SplitOnWs(names); 01874 for (int i = 0; i < names.Len(); i++) 01875 codecs.DelKey(NormalizeCodecName(names[i])); } 01876 void ClrCodecs() { codecs.Clr(); } 01877 void InitCodecs(); 01878 PCodecBase GetCodec(const TStr& name) const { 01879 TStr s = NormalizeCodecName(name); 01880 PCodecBase p; if (! codecs.IsKeyGetDat(s, p)) p.Clr(); 01881 return p; } 01882 void GetAllCodecs(TCodecBaseV& dest) const { 01883 dest.Clr(); 01884 for (int i = codecs.FFirstKeyId(); codecs.FNextKeyId(i); ) { 01885 PCodecBase codec = codecs[i]; bool found = false; 01886 for (int j = 0; j < dest.Len(); j++) if (dest[j]() == codec()) { found = true; break; } 01887 if (! 
found) dest.Add(codec); }} 01888 01889 //------------------------------------------------------------------------- 01890 // Word boundaries (UAX #29) 01891 //------------------------------------------------------------------------- 01892 01893 // Finds the next word boundary strictly after 'position'. 01894 // Note that there are valid word boundaries at 0 and at 'src.Len()'. 01895 // If there is no such word boundary, it returns 'false' and sets 'position' to 'src.Len()'. 01896 bool FindNextWordBoundary(const TIntV& src, int &position) const { 01897 if (position < 0) { position = 0; return true; } 01898 size_t position_; bool retVal = ucd.FindNextWordBoundary(src, 0, src.Len(), position_); position = int(position_); return retVal; } 01899 // Creates, in 'dest', a vector of 'src.Len() + 1' elements, where 'dest[i]' tells if there is a word 01900 // boundary between 'src[i - 1]' and 'src[i]'. Note that 'dest[0]' and 'dest[src.Len()]' are 01901 // always set to 'true'. 01902 void FindWordBoundaries(const TIntV& src, TBoolV& dest) const { ucd.FindWordBoundaries(src, 0, src.Len(), dest); } 01903 01904 //------------------------------------------------------------------------- 01905 // Sentence boundaries (UAX #29) 01906 //------------------------------------------------------------------------- 01907 01908 // Finds the next sentence boundary strictly after 'position'. 01909 // Note that there are valid sentence boundaries at 0 and at 'src.Len()'. 01910 // If there is no such sentence boundary, it returns 'false' and sets 'position' to 'src.Len()'. 01911 bool FindNextSentenceBoundary(const TIntV& src, int &position) const { 01912 if (position < 0) { position = 0; return true; } 01913 size_t position_; bool retVal = ucd.FindNextSentenceBoundary(src, 0, src.Len(), position_); position = int(position_); return retVal; } 01914 // Creates, in 'dest', a vector of 'src.Len() + 1' elements, where 'dest[i]' tells if there is a sentence 01915 // boundary between 'src[i - 1]' and 'src[i]'. Note that 'dest[0]' and 'dest[src.Len()]' are 01916 // always set to 'true'. 01917 void FindSentenceBoundaries(const TIntV& src, TBoolV& dest) const { ucd.FindSentenceBoundaries(src, 0, src.Len(), dest); } 01918 01919 void ClrSentenceBoundaryExceptions() { ucd.SbEx_Clr(); } 01920 void UseEnglishSentenceBoundaryExceptions() { ucd.SbEx_SetStdEnglish(); } 01921 01922 //------------------------------------------------------------------------- 01923 // Normalization, decomposition, etc. (UAX #15) 01924 //------------------------------------------------------------------------- 01925 01926 // This sets 'dest' to the decomposed form of the source string. 01927 // - for normalization form D (NFD), i.e. canonical decomposition: use compatibility == false; 01928 // - for normalization form KD (NFKD), i.e. compatibility decomposition: use compatibility == true. 01929 void Decompose(const TIntV& src, TIntV& dest, bool compatibility) const { ucd.Decompose(src, dest, compatibility, true); } 01930 // This performs canonical composition on the source string, and stores 01931 // the result in the destination vector. The source string should be the 01932 // result of a (canonical or compatibility) decomposition; if this is the 01933 // case, the composition will lead to a normalization form C (NFC) or 01934 // normalization form KC (NFKC), depending on whether canonical or compatibility 01935 // decomposition was used. 
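// The decomposition methods above combine with ExtractStarters (declared just below)
// to strip diacritical marks: decompose first, then keep only the starters.
// A minimal sketch ('unicode' is a TUnicode with a loaded character database):
//   TIntV cps; unicode.DecodeUtf8(TStr("na\xc3\xafve"), cps);
//   TIntV nfd; unicode.Decompose(cps, nfd, false);   // U+00EF -> U+0069 + U+0308
//   unicode.ExtractStarters(nfd);                    // drops the combining diaeresis, in place
//   TStr plain = unicode.EncodeUtf8Str(nfd);         // "naive"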
01936 void Compose(const TIntV& src, TIntV& dest) const { return ucd.Compose(src, dest, true); } 01937 // Calls Decompose, followed by Compose; thus the result is the NFC (if 01938 // compatibility == false) or NFKC (if compatibility == true) of the source string. 01939 // A temporary TIntV is used to contain the intermediate NF(K)D form of the 01940 // source string. 01941 void DecomposeAndCompose(const TIntV& src, TIntV& dest, bool compatibility) const { return ucd.DecomposeAndCompose(src, dest, compatibility); } 01942 // Copies the starter characters from 'src' to 'dest'; the other 01943 // characters are skipped. 'src' should already have been decomposed. 01944 // Returns the number of characters extracted. This function can be 01945 // used to remove diacritical marks from a string (after it has been decomposed!). 01946 int ExtractStarters(const TIntV& src, TIntV& dest) const { return (int) ucd.ExtractStarters(src, dest); } 01947 // Extracts the starters into a temporary vector and then copies it into 'src'. 01948 int ExtractStarters(TIntV& src) const { return (int) ucd.ExtractStarters(src); } 01949 01950 //------------------------------------------------------------------------- 01951 // Case conversions 01952 //------------------------------------------------------------------------- 01953 // NOTE: if you will be dealing with Turkish, Azeri or Lithuanian text, 01954 // use the case-conversion methods in TUniChDb, which allow the caller 01955 // to request language-specific case mappings for these languages. 01956 01957 public: 01958 typedef TUniChDb::TCaseConversion TCaseConversion; 01959 // Sets 'dest' to the case-converted form of 'src'. 01960 void GetLowerCase(const TIntV& src, TIntV& dest) const { ucd.GetLowerCase(src, dest, true, false, false); } 01961 void GetUpperCase(const TIntV& src, TIntV& dest) const { ucd.GetUpperCase(src, dest, true, false, false); } 01962 void GetTitleCase(const TIntV& src, TIntV& dest) const { ucd.GetTitleCase(src, dest, true, false, false); } 01963 01964 // GetSimpleCaseConverted uses only the simple case mappings (from UnicodeData.txt). 01965 // This is simpler and faster. Since each character now maps into exactly one 01966 // character, case conversion can also be done in place (see ToSimpleCaseConverted, etc.). 01967 void GetSimpleLowerCase(const TIntV& src, TIntV& dest) const { ucd.GetSimpleLowerCase(src, dest, true); } 01968 void GetSimpleUpperCase(const TIntV& src, TIntV& dest) const { ucd.GetSimpleUpperCase(src, dest, true); } 01969 void GetSimpleTitleCase(const TIntV& src, TIntV& dest) const { ucd.GetSimpleTitleCase(src, dest, true); } 01970 01971 // These functions perform simple case-conversions in-place. 01972 void ToSimpleUpperCase(TIntV& src) const { ucd.ToSimpleUpperCase(src); } 01973 void ToSimpleLowerCase(TIntV& src) const { ucd.ToSimpleLowerCase(src); } 01974 void ToSimpleTitleCase(TIntV& src) const { ucd.ToSimpleTitleCase(src); } 01975 01976 // Case folding is an alternative to the above functions. It is intended primarily 01977 // to produce strings that are suitable for comparisons. For example, 01978 // ToLowerCase(sigma) = sigma, ToLowerCase(final-sigma) = final-sigma; 01979 // but ToCaseFolded(sigma) = sigma, ToCaseFolded(final-sigma) = sigma. 01980 // - 'full' enables full case mappings -- i.e. sometimes a character may be mapped 01981 // into a string of two or more characters. 01982 // - Note: For best results, perform NFD(CaseFold(NFD(x)) or NFKD(CaseFold(NFKD(x)) on 01983 // each string before comparing them (see sec. 
3.13 of the standard). 01984 void GetCaseFolded(const TIntV& src, TIntV& dest, const bool full = true) const { return ucd.GetCaseFolded(src, dest, true, full, false); } 01985 // ToCaseFolded folds the string in place. However, this means that only the simple 01986 // case foldings can be used (the full ones could increase the length of the string). 01987 void ToCaseFolded(TIntV& src) const { return ucd.ToCaseFolded(src, false); } 01988 01989 TStr GetUtf8CaseFolded(const TStr& s) const { 01990 bool isAscii = true; 01991 for (int i = 0, n = s.Len(); i < n; i++) if (uchar(s[i]) >= 128) { isAscii = false; break; } 01992 if (isAscii) return s.GetLc(); 01993 TIntV src; DecodeUtf8(s, src); 01994 TIntV dest; GetCaseFolded(src, dest); 01995 return EncodeUtf8Str(dest); } 01996 01997 //------------------------------------------------------------------------- 01998 // Character properties 01999 //------------------------------------------------------------------------- 02000 // These methods simply call the corresponding TUniChDb method 02001 // (which typically calls the corresponding method of TUniChInfo). 02002 // See the declaration for DECLARE_FORWARDED_PROPERTY_METHODS for a complete list. 02003 // They are all of the form bool IsXxxx(const int cp) const 02004 // Some of the more notable ones include: 02005 // - IsAlphabetic, IsUppercase, IsLowercase, IsMath, IsAsciiHexDigit 02006 // IsDash, IsDeprecated, IsDiacritic, IsHexDigit, IsHyphen, IsIdeographic 02007 // IsNoncharacter, IsQuotationMark, IsSoftDotted, IsTerminalPunctuation, IsWhiteSpace 02008 02009 #define ___UniFwd1(name) bool name(const int cp) const { return ucd.name(cp); } 02010 DECLARE_FORWARDED_PROPERTY_METHODS 02011 #undef DECLARE_FORWARDED_PROPERTY_METHODS 02012 #undef __UniFwd1 02013 ___UniFwd2(IsPrivateUse, IsSurrogate) 02014 02015 TUniChCategory GetCat(const int cp) const { return ucd.GetCat(cp); } 02016 TUniChSubCategory GetSubCat(const int cp) const { return ucd.GetSubCat(cp); } 02017 02018 // GetCharName returns 0 if the name is unknown; GetCharNameS returns a string of the form "U+1234". 02019 const char *GetCharName(const int cp) const { return ucd.GetCharName(cp); } 02020 TStr GetCharNameS(const int cp) const { return ucd.GetCharNameS(cp); } 02021 02022 }; 02023 02024 //----------------------------------------------------------------------------- 02025 // TUniCodec -- UTF-8 Decoder 02026 //----------------------------------------------------------------------------- 02027 02028 // Returns the number of characters that have been successfully decoded. 02029 // This does not include any replacement characters that may have been inserted into 'dest'. 02030 template<typename TSrcVec, typename TDestCh> 02031 size_t TUniCodec::DecodeUtf8( 02032 const TSrcVec& src, size_t srcIdx, const size_t srcCount, 02033 TVec<TDestCh>& dest, const bool clrDest) const 02034 { 02035 size_t nDecoded = 0; 02036 if (clrDest) dest.Clr(); 02037 const size_t origSrcIdx = srcIdx; 02038 const size_t srcEnd = srcIdx + srcCount; 02039 while (srcIdx < srcEnd) 02040 { 02041 const size_t charSrcIdx = srcIdx; 02042 uint c = src[TVecIdx(srcIdx)] & 0xff; srcIdx++; 02043 if ((c & _1000_0000) == 0) { 02044 // c is one of the characters 0..0x7f, encoded as a single byte. 02045 dest.Add(TDestCh(c)); nDecoded++; continue; } 02046 else if ((c & _1100_0000) == _1000_0000) { 02047 // No character in a valid UTF-8-encoded string should begin with a byte of the form 10xxxxxx. 02048 // We must have been thrown into the middle of a multi-byte character. 
02049 switch (errorHandling) { 02050 case uehThrow: throw TUnicodeException(charSrcIdx, c, "Invalid character: 10xxxxxx."); 02051 case uehAbort: return nDecoded; 02052 case uehReplace: dest.Add(TDestCh(replacementChar)); continue; 02053 case uehIgnore: continue; 02054 default: Fail; } } 02055 else 02056 { 02057 // c introduces a sequence of 2..6 bytes, depending on how many 02058 // of the most significant bits of c are set. 02059 uint nMoreBytes = 0, nBits = 0, minVal = 0; 02060 if ((c & _1110_0000) == _1100_0000) nMoreBytes = 1, nBits = 5, minVal = 0x80; 02061 else if ((c & _1111_0000) == _1110_0000) nMoreBytes = 2, nBits = 4, minVal = 0x800; 02062 else if ((c & _1111_1000) == _1111_0000) nMoreBytes = 3, nBits = 3, minVal = 0x10000; 02063 else if ((c & _1111_1100) == _1111_1000) nMoreBytes = 4, nBits = 2, minVal = 0x200000; 02064 else if ((c & _1111_1110) == _1111_1100) nMoreBytes = 5, nBits = 1, minVal = 0x4000000; 02065 else { 02066 // c is of the form 1111111x, which is invalid even in the early definitions of UTF-8 02067 // (which allowed the encoding of codepoints up to 2^31 - 1). However, in principle this 02068 // could be used to encode 32-bit integers with the msb set: 1aaabbbbccccddddeeeeffffgggghhhh 02069 // could be encoded as 1111111a 10aabbbb 10ccccdd 10ddeeee 10ffffgg 10gghhhh. 02070 if (strict) { 02071 switch (errorHandling) { 02072 case uehThrow: throw TUnicodeException(charSrcIdx, c, "Invalid character: 1111111x."); 02073 case uehAbort: return nDecoded; 02074 // In the case of uehReplace and uehIgnore, we'll read the next 5 bytes 02075 // and try to decode the character. Then, since 'strict' is true and 02076 // the codepoint is clearly >= 2^31, we'll notice this as an error later 02077 // and (in the case of uehReplace) insert a replacement character then. 02078 // This is probably better than inserting a replacement character right 02079 // away and then trying to read the next byte as if a new character 02080 // was beginning there -- if the current byte is really followed by five 02081 // 10xxxxxx bytes, we'll just get six replacement characters in a row. 02082 case uehReplace: break; //dest.Add(TDestCh(replacementChar)); continue; 02083 case uehIgnore: break; // continue; 02084 default: Fail; } } 02085 nMoreBytes = 5; nBits = 2; minVal = 0x80000000u; } 02086 // Decode this multi-byte sequence. 02087 uint cOut = c & ((1 << nBits) - 1); // First extract the nBits least significant bits from c. 02088 bool cancel = false; 02089 for (uint i = 0; i < nMoreBytes && ! cancel; i++) { 02090 // See if there are enough bytes left in the source vector. 02091 if (! (srcIdx < srcEnd)) { 02092 switch (errorHandling) { 02093 case uehThrow: throw TUnicodeException(charSrcIdx, c, TInt::GetStr(nMoreBytes) + " more bytes expected, only " + TInt::GetStr(int(srcEnd - charSrcIdx - 1)) + " available."); 02094 case uehAbort: return nDecoded; 02095 case uehReplace: dest.Add(TDestCh(replacementChar)); cancel = true; continue; 02096 case uehIgnore: cancel = true; continue; 02097 default: Fail; } } 02098 // Read the next byte. 02099 c = src[TVecIdx(srcIdx)] & 0xff; srcIdx++; 02100 if ((c & _1100_0000) != _1000_0000) { // Each subsequent byte should be of the form 10xxxxxx. 
02101 switch (errorHandling) { 02102 case uehThrow: throw TUnicodeException(charSrcIdx, c, "Byte " + TInt::GetStr(i) + " of " + TInt::GetStr(nMoreBytes) + " extra bytes should begin with 10xxxxxx."); 02103 case uehAbort: return nDecoded; 02104 case uehReplace: dest.Add(TDestCh(replacementChar)); srcIdx--; cancel = true; continue; 02105 case uehIgnore: srcIdx--; cancel = true; continue; 02106 default: Fail; } } 02107 cOut <<= 6; cOut |= (c & _0011_1111); } 02108 if (cancel) continue; 02109 if (strict) { 02110 // err1: This codepoint has been represented by more bytes than it should have been. 02111 // For example, cOut in the range 0..127 should be represented by a single byte, 02112 // not by two or more bytes. 02113 // - For example, this may happen in the "modified UTF-8" sometimes used for Java 02114 // serialization, where the codepoint 0 is encoded as 11000000 10000000 to avoid 02115 // the appearance of null bytes in the encoded stream. 02116 bool err1 = (cOut < minVal); 02117 // err2: Early definitions of UTF-8 allowed any 31-bit integer to be encoded, using up to 6 bytes. 02118 // However, later this was restricted to the codepoints 0..0x10ffff only, because only these 02119 // are valid Unicode codepoints. Thus, no more than 4 bytes are ever necessary. 02120 bool err2 = (nMoreBytes > 3 || (nMoreBytes == 3 && cOut > 0x10ffff)); 02121 if (err1 || err2) switch (errorHandling) { 02122 case uehThrow: 02123 if (err1) throw TUnicodeException(charSrcIdx, c, "The codepoint 0x" + TInt::GetStr(cOut, "%08x") + " has been represented by too many bytes (" + TInt::GetStr(nMoreBytes + 1) + ")."); 02124 else if (err2) throw TUnicodeException(charSrcIdx, c, "Invalid multibyte sequence: it decodes into 0x" + TInt::GetStr(cOut, "%08x") + ", but only codepoints 0..0x10ffff are valid."); 02125 else { Fail; break; } 02126 case uehAbort: return nDecoded; 02127 case uehReplace: dest.Add(TDestCh(replacementChar)); continue; 02128 case uehIgnore: continue; 02129 default: Fail; } } 02130 // Add the decoded codepoint to the destination vector. 02131 // If this is the first decoded character, and it's one of the byte-order marks 02132 // (0xfffe and 0xfeff), we will skip it (unless skipBom is false). 02133 if (! (skipBom && (cOut == 0xfffe || cOut == 0xfeff) && charSrcIdx == origSrcIdx)) { 02134 dest.Add(cOut); nDecoded++; } 02135 } // else (multi-byte sequence) 02136 } // while 02137 return nDecoded; 02138 } 02139 02140 //----------------------------------------------------------------------- 02141 // TUniCodec -- UTF-8 Encoder 02142 //----------------------------------------------------------------------- 02143 02144 // Returns the number of characters that have been successfully encoded. 02145 // This does not include any replacement characters that may have been inserted into 'dest'. 
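Before moving on to the encoder below, the decoding steps documented above (count the leading 1-bits of the first byte to get the number of continuation bytes, take 6 payload bits from each continuation byte, and reject overlong encodings via the minVal check) can be summarized in a minimal, self-contained sketch. It is plain standalone C++, independent of TUniCodec and its error-handling modes, and covers only the standard 1..4-byte forms, not the legacy 5- and 6-byte sequences handled above.

#include <cstddef>
#include <cstdio>
#include <vector>

// Decodes one UTF-8 sequence starting at buf[pos]; returns the codepoint,
// or -1 on error. Mirrors the leading-bit tests and the overlong-encoding
// check (cOut < minVal) used by TUniCodec::DecodeUtf8 above.
long DecodeOneUtf8(const std::vector<unsigned char>& buf, size_t& pos)
{
    unsigned c = buf[pos++];
    if ((c & 0x80) == 0) return c;                  // 0xxxxxxx: ASCII, one byte
    unsigned nMore, minVal;
    if      ((c & 0xe0) == 0xc0) { nMore = 1; minVal = 0x80;    c &= 0x1f; }
    else if ((c & 0xf0) == 0xe0) { nMore = 2; minVal = 0x800;   c &= 0x0f; }
    else if ((c & 0xf8) == 0xf0) { nMore = 3; minVal = 0x10000; c &= 0x07; }
    else return -1;                                 // 10xxxxxx or legacy 5/6-byte lead: rejected here
    unsigned long cOut = c;
    for (unsigned i = 0; i < nMore; i++) {
        if (pos >= buf.size()) return -1;           // truncated sequence
        unsigned cc = buf[pos++];
        if ((cc & 0xc0) != 0x80) return -1;         // continuation byte must be 10xxxxxx
        cOut = (cOut << 6) | (cc & 0x3f);           // 6 payload bits per continuation byte
    }
    if (cOut < minVal) return -1;                   // overlong encoding (e.g. 0 encoded as c0 80)
    if (cOut > 0x10ffff) return -1;                 // outside the Unicode range
    return (long)cOut;
}

int main()
{
    std::vector<unsigned char> buf = { 0xe2, 0x82, 0xac };  // U+20AC (euro sign)
    size_t pos = 0;
    long cp = DecodeOneUtf8(buf, pos);
    printf("U+%04lX\n", (unsigned long)cp);                 // prints U+20AC
    return 0;
}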
02146 template<typename TSrcVec, typename TDestCh>
02147 size_t TUniCodec::EncodeUtf8(
02148 const TSrcVec& src, size_t srcIdx, const size_t srcCount,
02149 TVec<TDestCh>& dest, const bool clrDest) const
02150 {
02151 size_t nEncoded = 0;
02152 for (const size_t srcEnd = srcIdx + srcCount; srcIdx < srcEnd; srcIdx++)
02153 {
02154 uint c = uint(src[TVecIdx(srcIdx)]);
02155 bool err = false;
02156 if (strict && c > 0x10ffff) {
02157 err = true;
02158 switch (errorHandling) {
02159 case uehThrow: throw TUnicodeException(srcIdx, c, "Invalid character (0x" + TInt::GetStr(c, "%x") + "; only characters in the range 0..0x10ffff are allowed).");
02160 case uehAbort: return nEncoded;
02161 case uehReplace: c = replacementChar; break;
02162 case uehIgnore: continue;
02163 default: Fail; } }
02164 if (c < 0x80u)
02165 dest.Add(TDestCh(c & 0xffu));
02166 else if (c < 0x800u) {
02167 dest.Add(TDestCh(_1100_0000 | ((c >> 6) & _0001_1111)));
02168 dest.Add(TDestCh(_1000_0000 | (c & _0011_1111))); }
02169 else if (c < 0x10000u) {
02170 dest.Add(TDestCh(_1110_0000 | ((c >> 12) & _0000_1111)));
02171 dest.Add(TDestCh(_1000_0000 | ((c >> 6) & _0011_1111)));
02172 dest.Add(TDestCh(_1000_0000 | (c & _0011_1111))); }
02173 else if (c < 0x200000u) {
02174 dest.Add(TDestCh(_1111_0000 | ((c >> 18) & _0000_0111)));
02175 dest.Add(TDestCh(_1000_0000 | ((c >> 12) & _0011_1111)));
02176 dest.Add(TDestCh(_1000_0000 | ((c >> 6) & _0011_1111)));
02177 dest.Add(TDestCh(_1000_0000 | (c & _0011_1111))); }
02178 else if (c < 0x4000000u) {
02179 dest.Add(TDestCh(_1111_1000 | ((c >> 24) & _0000_0011)));
02180 dest.Add(TDestCh(_1000_0000 | ((c >> 18) & _0011_1111)));
02181 dest.Add(TDestCh(_1000_0000 | ((c >> 12) & _0011_1111)));
02182 dest.Add(TDestCh(_1000_0000 | ((c >> 6) & _0011_1111)));
02183 dest.Add(TDestCh(_1000_0000 | (c & _0011_1111))); }
02184 else {
02185 dest.Add(TDestCh(_1111_1100 | ((c >> 30) & _0000_0011)));
02186 dest.Add(TDestCh(_1000_0000 | ((c >> 24) & _0011_1111)));
02187 dest.Add(TDestCh(_1000_0000 | ((c >> 18) & _0011_1111)));
02188 dest.Add(TDestCh(_1000_0000 | ((c >> 12) & _0011_1111)));
02189 dest.Add(TDestCh(_1000_0000 | ((c >> 6) & _0011_1111)));
02190 dest.Add(TDestCh(_1000_0000 | (c & _0011_1111))); }
02191 if (! err) nEncoded++;
02192 }
02193 return nEncoded;
02194 }
02195 
02196 //-----------------------------------------------------------------------
02197 // TUniCodec -- UTF-16 Decoder
02198 //-----------------------------------------------------------------------
02199 
02200 // Returns the number of characters that have been successfully decoded.
02201 // This does not include any replacement characters that may have been inserted into 'dest'.
02202 // Each element of 'src' is assumed to contain one byte of data.
02203 // srcCount must be even (though srcIdx doesn't need to be).
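EncodeUtf8 above selects the sequence length from the standard range boundaries (0x80, 0x800, 0x10000, and, only for out-of-range values when strict == false, the legacy 5- and 6-byte forms). A standalone sketch of the same byte layout, restricted to valid Unicode codepoints:

#include <cstdio>
#include <string>

// Appends the UTF-8 encoding of 'cp' to 'out', using the same range
// boundaries (0x80, 0x800, 0x10000) as TUniCodec::EncodeUtf8 above.
// Unlike the codec, this sketch simply rejects values above 0x10ffff
// instead of emitting the legacy 5- and 6-byte forms.
bool EncodeOneUtf8(unsigned long cp, std::string& out)
{
    if (cp < 0x80) { out += char(cp); return true; }
    if (cp < 0x800) {
        out += char(0xc0 | (cp >> 6));
        out += char(0x80 | (cp & 0x3f));
        return true; }
    if (cp < 0x10000) {
        out += char(0xe0 | (cp >> 12));
        out += char(0x80 | ((cp >> 6) & 0x3f));
        out += char(0x80 | (cp & 0x3f));
        return true; }
    if (cp <= 0x10ffff) {
        out += char(0xf0 | (cp >> 18));
        out += char(0x80 | ((cp >> 12) & 0x3f));
        out += char(0x80 | ((cp >> 6) & 0x3f));
        out += char(0x80 | (cp & 0x3f));
        return true; }
    return false;  // beyond the Unicode range
}

int main()
{
    std::string s;
    EncodeOneUtf8(0x20ac, s);   // U+20AC -> e2 82 ac
    for (size_t i = 0; i < s.size(); i++) printf("%02x ", (unsigned char)s[i]);
    printf("\n");
    return 0;
}

Encoding U+20AC yields e2 82 ac, which round-trips through the decoding sketch shown earlier.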
02204 template<typename TSrcVec, typename TDestCh> 02205 size_t TUniCodec::DecodeUtf16FromBytes( 02206 const TSrcVec& src, size_t srcIdx, const size_t srcCount, 02207 TVec<TDestCh>& dest, const bool clrDest, 02208 const TUtf16BomHandling bomHandling, 02209 const TUniByteOrder defaultByteOrder) const 02210 { 02211 IAssert(srcCount % 2 == 0); 02212 IAssert(bomHandling == bomAllowed || bomHandling == bomRequired || bomHandling == bomIgnored); 02213 IAssert(defaultByteOrder == boMachineEndian || defaultByteOrder == boBigEndian || defaultByteOrder == boLittleEndian); 02214 if (clrDest) dest.Clr(); 02215 size_t nDecoded = 0; 02216 if (srcCount <= 0) return nDecoded; 02217 const size_t origSrcIdx = srcIdx, srcEnd = srcIdx + srcCount; 02218 bool littleEndian = false; 02219 bool leDefault = (defaultByteOrder == boLittleEndian || (defaultByteOrder == boMachineEndian && IsMachineLittleEndian())); 02220 if (bomHandling == bomIgnored) littleEndian = leDefault; 02221 else if (bomHandling == bomAllowed || bomHandling == bomRequired) 02222 { 02223 int byte1 = uint(src[TVecIdx(srcIdx)]) & 0xff, byte2 = uint(src[TVecIdx(srcIdx + 1)]) & 0xff; 02224 if (byte1 == 0xfe && byte2 == 0xff) { littleEndian = false; if (skipBom) srcIdx += 2; } 02225 else if (byte1 == 0xff && byte2 == 0xfe) { littleEndian = true; if (skipBom) srcIdx += 2; } 02226 else if (bomHandling == bomAllowed) littleEndian = leDefault; 02227 else { // Report an error. 02228 switch (errorHandling) { 02229 case uehThrow: throw TUnicodeException(srcIdx, byte1, "BOM expected at the beginning of the input vector (" + TInt::GetStr(byte1, "%02x") + " " + TInt::GetStr(byte2, "%02x") + " found instead)."); 02230 case uehAbort: case uehReplace: case uehIgnore: return size_t(-1); 02231 default: Fail; } } 02232 } 02233 else Fail; 02234 while (srcIdx < srcEnd) 02235 { 02236 const size_t charSrcIdx = srcIdx; 02237 uint byte1 = uint(src[TVecIdx(srcIdx)]) & 0xff, byte2 = uint(src[TVecIdx(srcIdx + 1)]) & 0xff; srcIdx += 2; 02238 uint c = littleEndian ? (byte1 | (byte2 << 8)) : (byte2 | (byte1 << 8)); 02239 if (Utf16FirstSurrogate <= c && c <= Utf16FirstSurrogate + 1023) 02240 { 02241 // c is the first character in a surrogate pair. Read the next character. 02242 if (! (srcIdx + 2 <= srcEnd)) { 02243 switch (errorHandling) { 02244 case uehThrow: throw TUnicodeException(charSrcIdx, c, "The second character of a surrogate pair is missing."); 02245 case uehAbort: return nDecoded; 02246 case uehReplace: dest.Add(TDestCh(replacementChar)); continue; 02247 case uehIgnore: continue; 02248 default: Fail; } } 02249 uint byte1 = uint(src[TVecIdx(srcIdx)]) & 0xff, byte2 = uint(src[TVecIdx(srcIdx + 1)]) & 0xff; srcIdx += 2; 02250 uint c2 = littleEndian ? (byte1 | (byte2 << 8)) : (byte2 | (byte1 << 8)); 02251 // c2 should be the second character of the surrogate pair. 02252 if (c2 < Utf16SecondSurrogate || Utf16SecondSurrogate + 1023 < c2) { 02253 switch (errorHandling) { 02254 case uehThrow: throw TUnicodeException(charSrcIdx + 2, c2, "The second character of a surrogate pair should be in the range " + TInt::GetStr(Utf16SecondSurrogate, "%04x") + ".." 
+ TInt::GetStr(Utf16SecondSurrogate + 1023, "%04x") + ", not " + TInt::GetStr(c2, "04x") + "."); 02255 case uehAbort: return nDecoded; 02256 // with uehReplace and uehIgnore, we'll just skip the first character of the surrogate pair; we'll process the second one during the next iteration, this time as an ordinary character 02257 case uehReplace: dest.Add(TDestCh(replacementChar)); srcIdx -= 2; continue; 02258 case uehIgnore: srcIdx -= 2; continue; 02259 default: Fail; } } 02260 // c and c2 each contain 10 bits of information. 02261 uint cc = ((c - Utf16FirstSurrogate) << 10) | (c2 - Utf16SecondSurrogate); 02262 cc += 0x10000; 02263 dest.Add(TDestCh(cc)); nDecoded++; continue; 02264 } 02265 else if (strict && Utf16SecondSurrogate <= c && c <= Utf16SecondSurrogate + 1023) { 02266 switch (errorHandling) { 02267 case uehThrow: throw TUnicodeException(charSrcIdx, c, "This 16-bit value should be used only as the second character of a surrogate pair."); 02268 case uehAbort: return nDecoded; 02269 case uehReplace: dest.Add(TDestCh(replacementChar)); continue; 02270 case uehIgnore: continue; 02271 default: Fail; } } 02272 // If 'c' is the first character in the input stream, and it's a BOM, we might have to skip it. 02273 if (charSrcIdx == origSrcIdx && (c == 0xfffeu || c == 0xfeffu) && skipBom) continue; 02274 // Otherwise, store 'c' to the destination vector. 02275 dest.Add(TDestCh(c)); nDecoded++; 02276 } 02277 return nDecoded; 02278 } 02279 02280 // Here, each element of 'src' is treated as a 16-bit word. The byte-order settings 02281 // are used to determine if the two bytes of each word should be swapped before further 02282 // processing. For example, if a BOM is present, it must have the value 0xfeff; if it 02283 // actually has the value 0xfffe, this means that the two bytes of each word must be swapped. 02284 // Basically, the combination of the byteOrder parameter and the byte order mark (if present) at the 02285 // beginning of the source data is used to determine the "original" byte order of the data; 02286 // if this doesn't match the byte order of the local machine, the two bytes of each word will 02287 // be swapped during the decoding process. 02288 template<typename TSrcVec, typename TDestCh> 02289 size_t TUniCodec::DecodeUtf16FromWords( 02290 const TSrcVec& src, size_t srcIdx, const size_t srcCount, 02291 TVec<TDestCh>& dest, bool clrDest, 02292 const TUtf16BomHandling bomHandling, 02293 const TUniByteOrder defaultByteOrder) const 02294 { 02295 IAssert(bomHandling == bomAllowed || bomHandling == bomRequired || bomHandling == bomIgnored); 02296 IAssert(defaultByteOrder == boMachineEndian || defaultByteOrder == boBigEndian || defaultByteOrder == boLittleEndian); 02297 if (clrDest) dest.Clr(); 02298 size_t nDecoded = 0; 02299 if (srcCount <= 0) return nDecoded; 02300 const size_t origSrcIdx = srcIdx, srcEnd = srcIdx + srcCount; 02301 bool swap = false; 02302 bool isMachineLe = IsMachineLittleEndian(); 02303 bool isDefaultLe = (defaultByteOrder == boLittleEndian || (defaultByteOrder == boMachineEndian && isMachineLe)); 02304 if (bomHandling == bomIgnored) swap = (isDefaultLe != isMachineLe); 02305 else if (bomHandling == bomAllowed || bomHandling == bomRequired) 02306 { 02307 int c = uint(src[TVecIdx(srcIdx)]) & 0xffff; 02308 if (c == 0xfeff) { swap = false; if (skipBom) srcIdx += 1; } 02309 else if (c == 0xfffe) { swap = true; if (skipBom) srcIdx += 1; } 02310 else if (bomHandling == bomAllowed) swap = (isMachineLe != isDefaultLe); 02311 else { // Report an error. 
02312 switch (errorHandling) { 02313 case uehThrow: throw TUnicodeException(srcIdx, c, "BOM expected at the beginning of the input vector (" + TInt::GetStr(c, "%04x") + " found instead)."); 02314 case uehAbort: case uehReplace: case uehIgnore: return size_t(-1); 02315 default: Fail; } } 02316 } 02317 else Fail; 02318 while (srcIdx < srcEnd) 02319 { 02320 const size_t charSrcIdx = srcIdx; 02321 uint c = uint(src[TVecIdx(srcIdx)]) & 0xffffu; srcIdx++; 02322 if (swap) c = ((c >> 8) & 0xff) | ((c & 0xff) << 8); 02323 if (Utf16FirstSurrogate <= c && c <= Utf16FirstSurrogate + 1023) 02324 { 02325 // c is the first character in a surrogate pair. Read the next character. 02326 if (! (srcIdx < srcEnd)) { 02327 switch (errorHandling) { 02328 case uehThrow: throw TUnicodeException(charSrcIdx, c, "The second character of a surrogate pair is missing."); 02329 case uehAbort: return nDecoded; 02330 case uehReplace: dest.Add(TDestCh(replacementChar)); continue; 02331 case uehIgnore: continue; 02332 default: Fail; } } 02333 uint c2 = uint(src[TVecIdx(srcIdx)]) & 0xffffu; srcIdx++; 02334 if (swap) c2 = ((c2 >> 8) & 0xff) | ((c2 & 0xff) << 8); 02335 // c2 should be the second character of the surrogate pair. 02336 if (c2 < Utf16SecondSurrogate || Utf16SecondSurrogate + 1023 < c2) { 02337 switch (errorHandling) { 02338 case uehThrow: throw TUnicodeException(charSrcIdx + 1, c2, "The second character of a surrogate pair should be in the range " + TInt::GetStr(Utf16SecondSurrogate, "%04x") + ".." + TInt::GetStr(Utf16SecondSurrogate + 1023, "%04x") + ", not " + TInt::GetStr(c2, "04x") + "."); 02339 case uehAbort: return nDecoded; 02340 // with uehReplace and uehIgnore, we'll just skip the first character of the surrogate pair; we'll process the second one during the next iteration, this time as an ordinary character 02341 case uehReplace: dest.Add(TDestCh(replacementChar)); srcIdx -= 1; continue; 02342 case uehIgnore: srcIdx -= 1; continue; 02343 default: Fail; } } 02344 // c and c2 each contain 10 bits of information. 02345 uint cc = ((c - Utf16FirstSurrogate) << 10) | (c2 - Utf16SecondSurrogate); 02346 cc += 0x10000; 02347 dest.Add(TDestCh(cc)); nDecoded++; continue; 02348 } 02349 else if (strict && Utf16SecondSurrogate <= c && c <= Utf16SecondSurrogate + 1023) { 02350 switch (errorHandling) { 02351 case uehThrow: throw TUnicodeException(charSrcIdx, c, "This 16-bit value should be used only as the second character of a surrogate pair."); 02352 case uehAbort: return nDecoded; 02353 case uehReplace: dest.Add(TDestCh(replacementChar)); continue; 02354 case uehIgnore: continue; 02355 default: Fail; } } 02356 // If 'c' is the first character in the input stream, and it's a BOM, we might have to skip it. 02357 if (charSrcIdx == origSrcIdx && (c == 0xfffeu || c == 0xfeffu) && skipBom) continue; 02358 // Otherwise, store 'c' to the destination vector. 02359 dest.Add(TDestCh(c)); nDecoded++; 02360 } 02361 return nDecoded; 02362 } 02363 02364 //----------------------------------------------------------------------- 02365 // TUniCodec -- UTF-16 Encoder 02366 //----------------------------------------------------------------------- 02367 02368 // Returns the number of characters that have been successfully encoded. 02369 // This does not include any replacement characters that may have been inserted into 'dest'. 
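The UTF-16 decoders above and the encoders below share the same surrogate-pair arithmetic: subtract 0x10000, split the remaining 20 bits into two 10-bit halves, and add the surrogate base values (and the reverse when decoding). A short standalone check of that arithmetic, assuming the standard base values 0xD800 and 0xDC00, which is what Utf16FirstSurrogate and Utf16SecondSurrogate are expected to hold:

#include <cassert>
#include <cstdio>

// Splits a supplementary-plane codepoint (0x10000..0x10FFFF) into a UTF-16
// surrogate pair and recombines it, mirroring the arithmetic used by
// DecodeUtf16FromWords/EncodeUtf16ToWords. The base values 0xD800 and 0xDC00
// are the standard first/second surrogate ranges (assumed here to match
// Utf16FirstSurrogate and Utf16SecondSurrogate).
int main()
{
    const unsigned first = 0xD800, second = 0xDC00;
    unsigned cp = 0x1F600;                       // a codepoint above 0xFFFF
    // Encode: subtract 0x10000, then split the remaining 20 bits into 10 + 10.
    unsigned t = cp - 0x10000;
    unsigned hi = first + ((t >> 10) & 1023);    // first (high) surrogate
    unsigned lo = second + (t & 1023);           // second (low) surrogate
    printf("%04X -> %04X %04X\n", cp, hi, lo);   // 1F600 -> D83D DE00
    // Decode: the inverse of the above.
    unsigned back = (((hi - first) << 10) | (lo - second)) + 0x10000;
    assert(back == cp);
    return 0;
}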
02370 template<typename TSrcVec, typename TDestCh> 02371 size_t TUniCodec::EncodeUtf16ToWords( 02372 const TSrcVec& src, size_t srcIdx, const size_t srcCount, 02373 TVec<TDestCh>& dest, const bool clrDest, const bool insertBom, 02374 const TUniByteOrder destByteOrder) const 02375 { 02376 bool isMachineLe = IsMachineLittleEndian(); 02377 bool swap = (destByteOrder == boLittleEndian && ! isMachineLe) || (destByteOrder == boBigEndian && isMachineLe); 02378 size_t nEncoded = 0, srcEnd = srcIdx + srcCount; 02379 if (insertBom) { dest.Add(TDestCh(swap ? 0xfffeu : 0xfeffu)); nEncoded++; } 02380 while (srcIdx < srcEnd) 02381 { 02382 uint c = uint(src[TVecIdx(srcIdx)]); srcIdx++; 02383 if (! (c <= 0x10ffffu)) { 02384 switch (errorHandling) { 02385 case uehThrow: throw TUnicodeException(srcIdx - 1, c, "UTF-16 only supports characters in the range 0..10ffff (not " + TUInt::GetStr(c, "%08x") + ")."); 02386 case uehAbort: return nEncoded; 02387 case uehReplace: dest.Add(TDestCh(swap ? SwapBytes(replacementChar) : replacementChar)); continue; 02388 case uehIgnore: continue; 02389 default: Fail; } } 02390 if (Utf16FirstSurrogate <= c && c < Utf16FirstSurrogate + 1023) { 02391 switch (errorHandling) { 02392 case uehThrow: throw TUnicodeException(srcIdx - 1, c, "UTF-16 cannot encode " + TUInt::GetStr(c, "%04x") + " as it belongs to the first surrogate range (" + TUInt::GetStr(Utf16FirstSurrogate, "%04x") + ".." + TUInt::GetStr(Utf16FirstSurrogate + 1023, "%04x") + ")."); 02393 case uehAbort: return nEncoded; 02394 case uehReplace: dest.Add(TDestCh(swap ? SwapBytes(replacementChar) : replacementChar)); continue; 02395 case uehIgnore: continue; 02396 default: Fail; } } 02397 if (Utf16SecondSurrogate <= c && c < Utf16SecondSurrogate + 1023) { 02398 switch (errorHandling) { 02399 case uehThrow: throw TUnicodeException(srcIdx - 1, c, "The character " + TUInt::GetStr(c, "%04x") + " belongs to the second surrogate range (" + TUInt::GetStr(Utf16FirstSurrogate, "%04x") + ".." + TUInt::GetStr(Utf16FirstSurrogate + 1023, "%04x") + "), which is not allowed with strict == true."); 02400 case uehAbort: return nEncoded; 02401 case uehReplace: dest.Add(TDestCh(swap ? SwapBytes(replacementChar) : replacementChar)); continue; 02402 case uehIgnore: continue; 02403 default: Fail; } } 02404 // If c is <= 0xffff, it can be stored directly. 02405 if (c <= 0xffffu) { 02406 if (swap) c = ((c >> 8) & 0xff) | ((c & 0xff) << 8); 02407 dest.Add(TDestCh(c)); nEncoded++; continue; } 02408 // Otherwise, represent c by a pair of surrogate characters. 02409 c -= 0x10000u; IAssert(/*0 <= c &&*/ c <= 0xfffffu); 02410 uint c1 = (c >> 10) & 1023, c2 = c & 1023; 02411 c1 += Utf16FirstSurrogate; c2 += Utf16SecondSurrogate; 02412 if (swap) { 02413 c1 = ((c1 >> 8) & 0xff) | ((c1 & 0xff) << 8); 02414 c2 = ((c2 >> 8) & 0xff) | ((c2 & 0xff) << 8); } 02415 dest.Add(TDestCh(c1)); 02416 dest.Add(TDestCh(c2)); 02417 nEncoded++; continue; 02418 } 02419 return nEncoded; 02420 } 02421 02422 template<typename TSrcVec, typename TDestCh> 02423 size_t TUniCodec::EncodeUtf16ToBytes( 02424 const TSrcVec& src, size_t srcIdx, const size_t srcCount, 02425 TVec<TDestCh>& dest, const bool clrDest, const bool insertBom, 02426 const TUniByteOrder destByteOrder) const 02427 { 02428 bool isDestLe = (destByteOrder == boLittleEndian || (destByteOrder == boMachineEndian && IsMachineLittleEndian())); 02429 size_t nEncoded = 0, srcEnd = srcIdx + srcCount; 02430 if (insertBom) { dest.Add(isDestLe ? 0xff : 0xfe); dest.Add(isDestLe ? 
0xfe : 0xff); nEncoded++; } 02431 while (srcIdx < srcEnd) 02432 { 02433 uint c = uint(src[TVecIdx(srcIdx)]); srcIdx++; 02434 if (! (c <= 0x10ffffu)) { 02435 switch (errorHandling) { 02436 case uehThrow: throw TUnicodeException(srcIdx - 1, c, "UTF-16 only supports characters in the range 0..10ffff (not " + TUInt::GetStr(c, "%08x") + ")."); 02437 case uehAbort: return nEncoded; 02438 #define ___OutRepl if (isDestLe) { dest.Add(replacementChar & 0xff); dest.Add((replacementChar >> 8) & 0xff); } else { dest.Add((replacementChar >> 8) & 0xff); dest.Add(replacementChar & 0xff); } 02439 case uehReplace: ___OutRepl; continue; 02440 case uehIgnore: continue; 02441 default: Fail; } } 02442 if (Utf16FirstSurrogate <= c && c < Utf16FirstSurrogate + 1023) { 02443 switch (errorHandling) { 02444 case uehThrow: throw TUnicodeException(srcIdx - 1, c, "UTF-16 cannot encode " + TUInt::GetStr(c, "%04x") + " as it belongs to the first surrogate range (" + TUInt::GetStr(Utf16FirstSurrogate, "%04x") + ".." + TUInt::GetStr(Utf16FirstSurrogate + 1023, "%04x") + ")."); 02445 case uehAbort: return nEncoded; 02446 case uehReplace: ___OutRepl; continue; 02447 case uehIgnore: continue; 02448 default: Fail; } } 02449 if (Utf16SecondSurrogate <= c && c < Utf16SecondSurrogate + 1023) { 02450 switch (errorHandling) { 02451 case uehThrow: throw TUnicodeException(srcIdx - 1, c, "The character " + TUInt::GetStr(c, "%04x") + " belongs to the second surrogate range (" + TUInt::GetStr(Utf16FirstSurrogate, "%04x") + ".." + TUInt::GetStr(Utf16FirstSurrogate + 1023, "%04x") + "), which is not allowed with strict == true."); 02452 case uehAbort: return nEncoded; 02453 case uehReplace: ___OutRepl; continue; 02454 case uehIgnore: continue; 02455 default: Fail; } } 02456 #undef ___OutRepl 02457 // If c is <= 0xffff, it can be stored directly. 02458 if (c <= 0xffffu) { 02459 if (isDestLe) { dest.Add(c & 0xff); dest.Add((c >> 8) & 0xff); } 02460 else { dest.Add((c >> 8) & 0xff); dest.Add(c & 0xff); } 02461 nEncoded++; continue; } 02462 // Otherwise, represent c by a pair of surrogate characters. 02463 c -= 0x10000u; IAssert(/*0 <= c &&*/ c <= 0xfffffu); 02464 uint c1 = (c >> 10) & 1023, c2 = c & 1023; 02465 c1 += Utf16FirstSurrogate; c2 += Utf16SecondSurrogate; 02466 if (isDestLe) { dest.Add(c1 & 0xff); dest.Add((c1 >> 8) & 0xff); dest.Add(c2 & 0xff); dest.Add((c2 >> 8) & 0xff); } 02467 else { dest.Add((c1 >> 8) & 0xff); dest.Add(c1 & 0xff); dest.Add((c2 >> 8) & 0xff); dest.Add(c2 & 0xff); } 02468 nEncoded++; continue; 02469 } 02470 return nEncoded; 02471 } 02472 02473 //----------------------------------------------------------------------------- 02474 // TUniChDb -- word boundaries 02475 //----------------------------------------------------------------------------- 02476 02477 template<typename TSrcVec> 02478 bool TUniChDb::FindNextWordBoundary(const TSrcVec& src, const size_t srcIdx, const size_t srcCount, size_t &position) const 02479 { 02480 // WB1. Break at the start of text. 02481 if (position < srcIdx) { position = srcIdx; return true; } 02482 // If we are beyond the end of the text, there aren't any word breaks left. 02483 const size_t srcEnd = srcIdx + srcCount; 02484 if (position >= srcEnd) return false; 02485 // If 'position' is currently at an ignored character, move it back to the last nonignored character. 02486 size_t origPos = position; 02487 if (IsWbIgnored(src[TVecIdx(position)])) { 02488 if (! 
WbFindPrevNonIgnored(src, srcIdx, position)) 02489 position = origPos; 02490 } 02491 // Determine the previous nonignored character (before 'position'). 02492 size_t posPrev = position; 02493 if (! WbFindPrevNonIgnored(src, srcIdx, posPrev)) posPrev = position; 02494 // Sec 6.2. Allow a break between Sep and an ignored character. 02495 if (position == origPos && position + 1 < srcEnd && IsSbSep(src[TVecIdx(position)]) && IsWbIgnored(src[TVecIdx(position + 1)])) { position += 1; return true; } 02496 // Determine the next nonignored character (after 'position'). 02497 size_t posNext = position; WbFindNextNonIgnored(src, posNext, srcEnd); 02498 size_t posNext2; 02499 int cPrev = (posPrev < position ? (int) src[TVecIdx(posPrev)] : -1), cCur = (position < srcEnd ? (int) src[TVecIdx(position)] : -1); 02500 int cNext = (position < posNext && posNext < srcEnd ? (int) src[TVecIdx(posNext)] : -1); 02501 int wbfPrev = GetWbFlags(cPrev), wbfCur = GetWbFlags(cCur), wbfNext = GetWbFlags(cNext); 02502 int cNext2, wbfNext2; 02503 // 02504 for ( ; position < srcEnd; posPrev = position, position = posNext, posNext = posNext2, 02505 cPrev = cCur, cCur = cNext, cNext = cNext2, 02506 wbfPrev = wbfCur, wbfCur = wbfNext, wbfNext = wbfNext2) 02507 { 02508 // Should there be a word boundary between 'position' and 'posNext' (or, more accurately, 02509 // between src[posNext - 1] and src[posNext] --- any ignored characters between 'position' 02510 // and 'posNext' are considered to belong to the previous character ('position'), not to the next one)? 02511 posNext2 = posNext; WbFindNextNonIgnored(src, posNext2, srcEnd); 02512 cNext2 = (posNext < posNext2 && posNext2 < srcEnd ? (int) src[TVecIdx(posNext2)] : -1); 02513 wbfNext2 = GetWbFlags(cNext2); 02514 #define TestCurNext(curFlag, nextFlag) if ((wbfCur & curFlag) == curFlag && (wbfNext & nextFlag) == nextFlag) continue 02515 #define TestCurNext2(curFlag, nextFlag, next2Flag) if ((wbfCur & curFlag) == curFlag && (wbfNext & nextFlag) == nextFlag && (wbfNext2 & next2Flag) == next2Flag) continue 02516 #define TestPrevCurNext(prevFlag, curFlag, nextFlag) if ((wbfPrev & prevFlag) == prevFlag && (wbfCur & curFlag) == curFlag && (wbfNext & nextFlag) == nextFlag) continue 02517 // WB3. Do not break within CRLF. 02518 if (cCur == 13 && cNext == 10) continue; 02519 // WB5. Do not break between most letters. 02520 TestCurNext(ucfWbALetter, ucfWbALetter); 02521 // WB6. Do not break letters across certain punctuation. 02522 TestCurNext2(ucfWbALetter, ucfWbMidLetter, ucfWbALetter); 02523 // WB7. Do not break letters across certain punctuation. 02524 TestPrevCurNext(ucfWbALetter, ucfWbMidLetter, ucfWbALetter); 02525 // WB8. Do not break within sequences of digits, or digits adjacent to letters. 02526 TestCurNext(ucfWbNumeric, ucfWbNumeric); 02527 // WB9. Do not break within sequences of digits, or digits adjacent to letters. 02528 TestCurNext(ucfWbALetter, ucfWbNumeric); 02529 // WB10. Do not break within sequences of digits, or digits adjacent to letters. 02530 TestCurNext(ucfWbNumeric, ucfWbALetter); 02531 // WB11. Do not break within sequences, such as "3.2" or "3.456,789". 02532 TestPrevCurNext(ucfWbNumeric, ucfWbMidNum, ucfWbNumeric); 02533 // WB12. Do not break within sequences, such as "3.2" or "3.456,789". 02534 TestCurNext2(ucfWbNumeric, ucfWbMidNum, ucfWbNumeric); 02535 // WB13. Do not break between Katakana. 02536 TestCurNext(ucfWbKatakana, ucfWbKatakana); 02537 // WB13a. Do not break from extenders. 
02538 if ((wbfCur & (ucfWbALetter | ucfWbNumeric | ucfWbKatakana | ucfWbExtendNumLet)) != 0 && 02539 (wbfNext & ucfWbExtendNumLet) == ucfWbExtendNumLet) continue; 02540 // WB13b. Do not break from extenders. 02541 if ((wbfCur & ucfWbExtendNumLet) == ucfWbExtendNumLet && 02542 (wbfNext & (ucfWbALetter | ucfWbNumeric | ucfWbKatakana)) != 0) continue; 02543 // WB14. Otherwise, break everywhere. 02544 position = posNext; return true; 02545 #undef TestCurNext 02546 #undef TestCurNext2 02547 #undef TestPrevCurNext 02548 } 02549 // WB2. Break at the end of text. 02550 IAssert(position == srcEnd); 02551 return true; 02552 } 02553 02554 // ToDo: provide a more efficient implementation of this. 02555 template<typename TSrcVec> 02556 void TUniChDb::FindWordBoundaries(const TSrcVec& src, const size_t srcIdx, const size_t srcCount, TBoolV& dest) const 02557 { 02558 if (size_t(dest.Len()) != srcCount + 1) dest.Gen(TVecIdx(srcCount + 1)); 02559 dest.PutAll(false); 02560 size_t position = srcIdx; 02561 dest[TVecIdx(position - srcIdx)] = true; 02562 while (position < srcIdx + srcCount) 02563 { 02564 size_t oldPos = position; 02565 FindNextWordBoundary(src, srcIdx, srcCount, position); 02566 Assert(oldPos < position); Assert(position <= srcIdx + srcCount); 02567 dest[TVecIdx(position - srcIdx)] = true; 02568 } 02569 Assert(dest[TVecIdx(srcCount)]); 02570 } 02571 02572 //----------------------------------------------------------------------------- 02573 // TUniChDb -- sentence boundaries 02574 //----------------------------------------------------------------------------- 02575 02576 template<typename TSrcVec> 02577 bool TUniChDb::CanSentenceEndHere(const TSrcVec& src, const size_t srcIdx, const size_t position) const 02578 { 02579 if (sbExTrie.Empty()) return true; 02580 // We'll move back from the position where a sentence-boundary is being considered. 02581 size_t pos = position; 02582 if (! WbFindPrevNonIgnored(src, srcIdx, pos)) return true; 02583 int c = (int) src[TVecIdx(pos)]; int sfb = GetSbFlags(c); 02584 // - Skip the Sep, if there is one. 02585 if ((c & ucfSbSep) == ucfSbSep) { 02586 if (! WbFindPrevNonIgnored(src, srcIdx, pos)) return true; 02587 c = (int) src[TVecIdx(pos)]; sfb = GetSbFlags(c); } 02588 // - Skip any Sp characters. 02589 while ((sfb & ucfSbSp) == ucfSbSp) { 02590 if (! WbFindPrevNonIgnored(src, srcIdx, pos)) return true; 02591 c = (int) src[TVecIdx(pos)]; sfb = GetSbFlags(c); } 02592 // - Skip any Close characters. 02593 while ((sfb & ucfSbSp) == ucfSbSp) { 02594 if (! WbFindPrevNonIgnored(src, srcIdx, pos)) return true; 02595 c = (int) src[TVecIdx(pos)]; sfb = GetSbFlags(c); } 02596 // - Skip any ATerm | STerm characters. 02597 while ((sfb & (ucfSbATerm | ucfSbSTerm)) != 0) { 02598 if (! WbFindPrevNonIgnored(src, srcIdx, pos)) return true; 02599 c = (int) src[TVecIdx(pos)]; sfb = GetSbFlags(c); } 02600 // Now start moving through the trie. 02601 int cLast = c, cButLast = -1, cButButLast = -1, len = 1, node = -1; 02602 while (true) 02603 { 02604 bool atEnd = (! WbFindPrevNonIgnored(src, srcIdx, pos)); 02605 c = (atEnd ? -1 : (int) src[TVecIdx(pos)]); 02606 TUniChCategory cat = GetCat(c); 02607 if (atEnd || ! (cat == ucLetter || cat == ucNumber || cat == ucSymbol)) { 02608 // Check if the suffix we've read so far is one of those that appear in the trie. 02609 if (len == 1) return ! sbExTrie.Has1Gram(cLast); 02610 if (len == 2) return ! 
sbExTrie.Has2Gram(cLast, cButLast); 02611 IAssert(len >= 3); IAssert(node >= 0); 02612 if (sbExTrie.IsNodeTerminal(node)) return false; 02613 if (atEnd) return true; } 02614 if (len == 1) { cButLast = c; len++; } 02615 else if (len == 2) { cButButLast = c; len++; 02616 // Now we have read the last three characters; start descending the suitable subtrie. 02617 node = sbExTrie.Get3GramRoot(cLast, cButLast, cButButLast); 02618 if (node < 0) return true; } 02619 else { 02620 // Descend down the trie. 02621 node = sbExTrie.GetChild(node, c); 02622 if (node < 0) return true; } 02623 } 02624 //return true; 02625 } 02626 02627 template<typename TSrcVec> 02628 bool TUniChDb::FindNextSentenceBoundary(const TSrcVec& src, const size_t srcIdx, const size_t srcCount, size_t &position) const 02629 { 02630 // SB1. Break at the start of text. 02631 if (position < srcIdx) { position = srcIdx; return true; } 02632 // If we are beyond the end of the text, there aren't any word breaks left. 02633 const size_t srcEnd = srcIdx + srcCount; 02634 if (position >= srcEnd) return false; 02635 // If 'position' is currently at an ignored character, move it back to the last nonignored character. 02636 size_t origPos = position; 02637 if (IsWbIgnored(src[TVecIdx(position)])) { 02638 if (! WbFindPrevNonIgnored(src, srcIdx, position)) 02639 position = origPos; 02640 } 02641 // Determine the previous nonignored character (before 'position'). 02642 size_t posPrev = position; 02643 if (! WbFindPrevNonIgnored(src, srcIdx, posPrev)) posPrev = position; 02644 // Sec 6.2. Allow a break between Sep and an ignored character. 02645 if (position == origPos && position + 1 < srcEnd && IsSbSep(src[TVecIdx(position)]) && IsWbIgnored(src[TVecIdx(position + 1)])) { position += 1; return true; } 02646 // Determine the next nonignored character (after 'position'). 02647 size_t posNext = position; WbFindNextNonIgnored(src, posNext, srcEnd); 02648 size_t posNext2; 02649 int cPrev = (posPrev < position ? (int) src[TVecIdx(posPrev)] : -1), cCur = (position < srcEnd ? (int) src[TVecIdx(position)] : -1); 02650 int cNext = (position < posNext && posNext < srcEnd ? (int) src[TVecIdx(posNext)] : -1); 02651 int sbfPrev = GetSbFlags(cPrev), sbfCur = GetSbFlags(cCur), sbfNext = GetSbFlags(cNext); 02652 int cNext2, sbfNext2; 02653 // Initialize the state of the peek-back automaton. 02654 typedef enum { stInit, stATerm, stATermSp, stATermSep, stSTerm, stSTermSp, stSTermSep } TPeekBackState; 02655 TPeekBackState backState; 02656 { 02657 size_t pos = position; 02658 bool wasSep = false, wasSp = false, wasATerm = false, wasSTerm = false; 02659 while (true) 02660 { 02661 if (! WbFindPrevNonIgnored(src, srcIdx, pos)) break; 02662 // Skip at most one Sep. 02663 int cp = (int) src[TVecIdx(pos)]; int sbf = GetSbFlags(cp); 02664 if ((sbf & ucfSbSep) == ucfSbSep) { 02665 wasSep = true; 02666 if (! WbFindPrevNonIgnored(src, srcIdx, pos)) break; 02667 cp = (int) src[TVecIdx(pos)]; sbf = GetSbFlags(cp); } 02668 // Skip zero or more Sp's. 02669 bool stop = false; 02670 while ((sbf & ucfSbSp) == ucfSbSp) { 02671 wasSp = true; 02672 if (! WbFindPrevNonIgnored(src, srcIdx, pos)) { stop = true; break; } 02673 cp = (int) src[TVecIdx(pos)]; sbf = GetSbFlags(cp); } 02674 if (stop) break; 02675 // Skip zero or more Close's. 02676 while ((sbf & ucfSbClose) == ucfSbClose) { 02677 if (! WbFindPrevNonIgnored(src, srcIdx, pos)) { stop = true; break; } 02678 cp = (int) src[TVecIdx(pos)]; sbf = GetSbFlags(cp); } 02679 if (stop) break; 02680 // Process an ATerm or STerm. 
02681 wasATerm = ((sbf & ucfSbATerm) == ucfSbATerm); 02682 wasSTerm = ((sbf & ucfSbSTerm) == ucfSbSTerm); 02683 break; 02684 } 02685 if (wasATerm) backState = (wasSep ? stATermSep : wasSp ? stATermSp : stATerm); 02686 else if (wasSTerm) backState = (wasSep ? stSTermSep : wasSp ? stSTermSp : stSTerm); 02687 else backState = stInit; 02688 } 02689 // Initialize the state of the peek-ahead automaton. This state tells us what follows 02690 // after we skip all contiguous characters from the complement of the set {OLetter, Upper, Lower, Sep, STerm, ATerm}. 02691 // Thus, the next character is either OLetter, Upper, Lower, Sep, STerm, ATerm, or the end of the input string. 02692 // Our peek-ahead automaton must tell us whether it is Lower or something else. 02693 typedef enum { stUnknown, stLower, stNotLower } TPeekAheadState; 02694 TPeekAheadState aheadState = stUnknown; 02695 // 02696 for ( ; position < srcEnd; posPrev = position, position = posNext, posNext = posNext2, 02697 cPrev = cCur, cCur = cNext, cNext = cNext2, 02698 sbfPrev = sbfCur, sbfCur = sbfNext, sbfNext = sbfNext2) 02699 { 02700 // Should there be a word boundary between 'position' and 'posNext' (or, more accurately, 02701 // between src[posNext - 1] and src[posNext] --- any ignored characters between 'position' 02702 // and 'posNext' are considered to belong to the previous character ('position'), not to the next one)? 02703 posNext2 = posNext; WbFindNextNonIgnored(src, posNext2, srcEnd); 02704 cNext2 = (posNext < posNext2 && posNext2 < srcEnd ? (int) src[TVecIdx(posNext2)] : -1); 02705 sbfNext2 = GetSbFlags(cNext2); 02706 // Update the peek-back automaton. 02707 #define TestCur(curFlag) ((sbfCur & ucfSb##curFlag) == ucfSb##curFlag) 02708 #define Trans(curFlag, newState) if (TestCur(curFlag)) { backState = st##newState; break; } 02709 switch (backState) { 02710 case stInit: Trans(ATerm, ATerm); Trans(STerm, STerm); break; 02711 case stATerm: Trans(Sp, ATermSp); Trans(Sep, ATermSep); Trans(ATerm, ATerm); Trans(STerm, STerm); Trans(Close, ATerm); backState = stInit; break; 02712 case stSTerm: Trans(Sp, STermSp); Trans(Sep, STermSep); Trans(ATerm, ATerm); Trans(STerm, STerm); Trans(Close, STerm); backState = stInit; break; 02713 case stATermSp: Trans(Sp, ATermSp); Trans(Sep, ATermSep); Trans(ATerm, ATerm); Trans(STerm, STerm); backState = stInit; break; 02714 case stSTermSp: Trans(Sp, STermSp); Trans(Sep, STermSep); Trans(ATerm, ATerm); Trans(STerm, STerm); backState = stInit; break; 02715 case stATermSep: Trans(ATerm, ATerm); Trans(STerm, STerm); backState = stInit; break; 02716 case stSTermSep: Trans(ATerm, ATerm); Trans(STerm, STerm); backState = stInit; break; 02717 default: IAssert(false); } 02718 #undef Trans 02719 #undef TestCur 02720 // Update the peek-ahead automaton. 02721 #define IsPeekAheadSkippable(sbf) ((sbf & (ucfSbOLetter | ucfSbUpper | ucfSbLower | ucfSbSep | ucfSbSTerm | ucfSbATerm)) == 0) 02722 if (! IsPeekAheadSkippable(sbfCur)) { 02723 bool isLower = ((sbfCur & ucfSbLower) == ucfSbLower); 02724 if (aheadState == stLower) IAssert(isLower); 02725 else if (aheadState == stNotLower) IAssert(! isLower); 02726 // We haven't peaked ahead farther than this so far -- invalidate the state. 02727 aheadState = stUnknown; } 02728 if (aheadState == stUnknown) 02729 { 02730 // Peak ahead to the next non-peekahead-skippable character. 02731 size_t pos = posNext; 02732 while (pos < srcEnd) { 02733 int cp = (int) src[TVecIdx(pos)]; int sbf = GetSbFlags(cp); 02734 if (! 
IsPeekAheadSkippable(sbf)) { 02735 if ((sbf & ucfSbLower) == ucfSbLower) aheadState = stLower; 02736 else aheadState = stNotLower; 02737 break; } 02738 WbFindNextNonIgnored(src, pos, srcEnd); } 02739 if (! (pos < srcEnd)) aheadState = stNotLower; 02740 } 02741 #undef IsPeekAheadSkippable 02742 // 02743 #define TestCurNext(curFlag, nextFlag) if ((sbfCur & curFlag) == curFlag && (sbfNext & nextFlag) == nextFlag) continue 02744 #define TestCurNext2(curFlag, nextFlag, next2Flag) if ((sbfCur & curFlag) == curFlag && (sbfNext & nextFlag) == nextFlag && (sbfNext2 & next2Flag) == next2Flag) continue 02745 #define TestPrevCurNext(prevFlag, curFlag, nextFlag) if ((sbfPrev & prevFlag) == prevFlag && (sbfCur & curFlag) == curFlag && (sbfNext & nextFlag) == nextFlag) continue 02746 // SB3. Do not break within CRLF. 02747 if (cCur == 13 && cNext == 10) continue; 02748 // SB4. Break ater paragraph separators. 02749 if ((sbfCur & ucfSbSep) == ucfSbSep) { 02750 if (! CanSentenceEndHere(src, srcIdx, position)) continue; 02751 position = posNext; return true; } 02752 // Do not break after ambiguous terminators like period, if they are immediately followed by a number 02753 // or lowercase letter, if they are between uppercase letters, or if the first following letter 02754 // (optionally after certain punctuation) is lowercase. For example, a period may be an abbreviation 02755 // or numeric period, and thus may not mark the end of a sentence. 02756 TestCurNext(ucfSbATerm, ucfSbNumeric); // SB6 02757 TestPrevCurNext(ucfSbUpper, ucfSbATerm, ucfSbUpper); // SB7 02758 // SB8a. (STerm | ATerm) Close* Sp* [do not break] (STerm | ATerm) 02759 if ((backState == stATerm || backState == stATermSp || backState == stSTerm || backState == stSTermSp) && 02760 (sbfNext & (ucfSbSTerm | ucfSbATerm)) != 0) continue; 02761 // SB8*. ATerm Close* Sp* [do not break] ( ! (OLetter | Upper | Lower | Sep | STerm | ATerm) )* Lower 02762 if ((backState == stATerm || backState == stATermSp) && aheadState == stLower) continue; 02763 // Break after sentence terminators, but include closing punctuation, trailing spaces, and a paragraph separator (if present). 02764 // SB9. ( STerm | ATerm ) Close* [do not break] ( Close | Sp | Sep ) 02765 if ((backState == stATerm || backState == stSTerm) && (sbfNext & (ucfSbClose | ucfSbSp | ucfSbSep)) != 0) continue; 02766 // SB10. ( STerm | ATerm ) Close* Sp* [do not break] ( Sp | Sep ) 02767 // SB11*. ( STerm | ATerm ) Close* Sp* Sep? [do break] 02768 if (backState == stATerm || backState == stATermSp || backState == stATermSep || backState == stSTerm || backState == stSTermSp || backState == stSTermSep) { 02769 if ((sbfNext & (ucfSbSp | ucfSbSep)) != 0) continue; // SB10 02770 if (! CanSentenceEndHere(src, srcIdx, position)) continue; 02771 position = posNext; return true; } // SB11 02772 // WB12. Otherwise, do not break. 02773 continue; 02774 #undef TestCurNext 02775 #undef TestCurNext2 02776 #undef TestPrevCurNext 02777 } 02778 // WB2. Break at the end of text. 02779 IAssert(position == srcEnd); 02780 return true; 02781 } 02782 02783 // ToDo: provide a more efficient implementation of this. 
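FindSentenceBoundaries below simply fills a TBoolV by calling FindNextSentenceBoundary in a loop; callers that only need to walk the boundaries can use the same pattern directly. A minimal sketch, assuming the SNAP Glib headers are available and ucd is an already-initialized TUniChDb (PrintSentenceBoundaries is a hypothetical helper, not part of the library):

// Walks the sentence boundaries of 'src' (a vector of codepoints) and prints
// their positions; mirrors the loop used by FindSentenceBoundaries below.
void PrintSentenceBoundaries(const TUniChDb& ucd, const TIntV& src)
{
    printf("boundary at 0\n");  // the start of the text is itself a boundary (SB1)
    size_t pos = 0;
    while (pos < (size_t) src.Len()) {
        ucd.FindNextSentenceBoundary(src, 0, src.Len(), pos);  // advances 'pos'
        printf("boundary at %d\n", (int) pos);
    }
}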
02784 template<typename TSrcVec> 02785 void TUniChDb::FindSentenceBoundaries(const TSrcVec& src, const size_t srcIdx, const size_t srcCount, TBoolV& dest) const 02786 { 02787 if (size_t(dest.Len()) != srcCount + 1) dest.Gen(TVecIdx(srcCount + 1)); 02788 dest.PutAll(false); 02789 size_t position = srcIdx; 02790 dest[TVecIdx(position - srcIdx)] = true; 02791 while (position < srcIdx + srcCount) 02792 { 02793 size_t oldPos = position; 02794 FindNextSentenceBoundary(src, srcIdx, srcCount, position); 02795 Assert(oldPos < position); Assert(position <= srcIdx + srcCount); 02796 dest[TVecIdx(position - srcIdx)] = true; 02797 } 02798 Assert(dest[TVecIdx(srcCount)]); 02799 } 02800 02801 //----------------------------------------------------------------------------- 02802 // TUniChDb -- case conversions 02803 //----------------------------------------------------------------------------- 02804 02805 template<typename TSrcVec, typename TDestCh> 02806 void TUniChDb::GetCaseConverted(const TSrcVec& src, size_t srcIdx, const size_t srcCount, 02807 TVec<TDestCh>& dest, const bool clrDest, 02808 const TUniChDb::TCaseConversion how, 02809 const bool turkic, const bool lithuanian) const 02810 { 02811 const TIntIntVH &specials = (how == ccUpper ? specialCasingUpper : how == ccLower ? specialCasingLower : how == ccTitle ? specialCasingTitle : *((TIntIntVH *) 0)); 02812 if (clrDest) dest.Clr(); 02813 enum { 02814 GreekCapitalLetterSigma = 0x3a3, 02815 GreekSmallLetterSigma = 0x3c3, 02816 GreekSmallLetterFinalSigma = 0x3c2, 02817 LatinCapitalLetterI = 0x49, 02818 LatinCapitalLetterJ = 0x4a, 02819 LatinCapitalLetterIWithOgonek = 0x12e, 02820 LatinCapitalLetterIWithGrave = 0xcc, 02821 LatinCapitalLetterIWithAcute = 0xcd, 02822 LatinCapitalLetterIWithTilde = 0x128, 02823 LatinCapitalLetterIWithDotAbove = 0x130, 02824 LatinSmallLetterI = 0x69, 02825 CombiningDotAbove = 0x307 02826 }; 02827 // 02828 bool seenCased = false, seenTwoCased = false; int cpFirstCased = -1; 02829 size_t nextWordBoundary = srcIdx; 02830 TBoolV wordBoundaries; bool wbsKnown = false; 02831 for (const size_t origSrcIdx = srcIdx, srcEnd = srcIdx + srcCount; srcIdx < srcEnd; ) 02832 { 02833 int cp = src[TVecIdx(srcIdx)]; srcIdx++; 02834 //if (turkic && cp == 0x130 && how == ccLower) printf("!"); 02835 // For conversion to titlecase, the first cased character of each word 02836 // must be converted to titlecase; everything else must be converted 02837 // to lowercase. 02838 TUniChDb::TCaseConversion howHere; 02839 if (how != ccTitle) howHere = how; 02840 else { 02841 if (srcIdx - 1 == nextWordBoundary) { // A word starts/ends here. 02842 seenCased = false; seenTwoCased = false; cpFirstCased = -1; 02843 size_t next = nextWordBoundary; FindNextWordBoundary(src, origSrcIdx, srcCount, next); 02844 IAssert(next > nextWordBoundary); nextWordBoundary = next; } 02845 bool isCased = IsCased(cp); 02846 if (isCased && ! seenCased) { howHere = ccTitle; seenCased = true; cpFirstCased = cp; } 02847 else { howHere = ccLower; 02848 if (isCased && seenCased) seenTwoCased = true; } 02849 } 02850 // First, process the conditional mappings from SpecialCasing.txt. 02851 // These will be processed in code -- they were ignored while 02852 // we were reading SpecialCasing.txt itself. 02853 if (cp == GreekCapitalLetterSigma && howHere == ccLower) 02854 { 02855 // SpecialCasing.txt mentions the 'FinalSigma' condition, but sec. 3.13 of 02856 // the standard doesn't define it. We'll use FinalCased instead. 
02857 // FinalCased: within the closest word boundaries containing C, 02858 // there is a cased letter before C, and there is no cased letter after C. 02859 //size_t nextBoundary = srcIdx - 1; FindNextWordBoundary(src, srcIdx, srcCount, nextBoundary); 02860 if (! wbsKnown) { FindWordBoundaries(src, origSrcIdx, srcCount, wordBoundaries); wbsKnown = true; } 02861 size_t srcIdx2 = srcIdx; bool casedAfter = false; 02862 if (how == ccTitle) 02863 printf("!"); 02864 //while (srcIdx2 < nextBoundary) 02865 while (! wordBoundaries[TVecIdx(srcIdx2 - origSrcIdx)]) 02866 { 02867 int cp2 = src[TVecIdx(srcIdx2)]; srcIdx2++; 02868 if (IsCased(cp2)) { casedAfter = true; break; } 02869 } 02870 if (! casedAfter) 02871 { 02872 //size_t prevBoundary = srcIdx - 1; 02873 //FindPreviousWordBoundary(src, srcIdx, srcCount, prevBoundary); 02874 srcIdx2 = srcIdx - 1; bool casedBefore = false; 02875 //while (prevBoundary < srcIdx2) 02876 while (! wordBoundaries[TVecIdx(srcIdx2 - origSrcIdx)]) 02877 { 02878 --srcIdx2; int cp2 = src[TVecIdx(srcIdx2)]; 02879 if (IsCased(cp2)) { casedBefore = true; break; } 02880 } 02881 if (casedBefore) { 02882 // Now we have a FinalCased character. 02883 dest.Add(GreekSmallLetterFinalSigma); Assert(howHere == ccLower); continue; } 02884 } 02885 // If we got here, add a non-final sigma. 02886 dest.Add(GreekSmallLetterSigma); continue; 02887 } 02888 else if (lithuanian) 02889 { 02890 if (howHere == ccLower) 02891 { 02892 if (cp == LatinCapitalLetterI || cp == LatinCapitalLetterJ || cp == LatinCapitalLetterIWithOgonek) 02893 { 02894 bool moreAbove = false; 02895 for (size_t srcIdx2 = srcIdx; srcIdx2 < srcEnd; ) 02896 { 02897 const int cp2 = src[TVecIdx(srcIdx2)]; srcIdx2++; 02898 const int cc2 = GetCombiningClass(cp2); 02899 if (cc2 == TUniChInfo::ccStarter) break; 02900 if (cc2 == TUniChInfo::ccAbove) { moreAbove = true; break; } 02901 } 02902 if (moreAbove) 02903 { 02904 if (cp == LatinCapitalLetterI) { dest.Add(0x69); dest.Add(0x307); continue; } 02905 if (cp == LatinCapitalLetterJ) { dest.Add(0x6a); dest.Add(0x307); continue; } 02906 if (cp == LatinCapitalLetterIWithOgonek) { dest.Add(0x12f); dest.Add(0x307); continue; } 02907 } 02908 } 02909 else if (cp == LatinCapitalLetterIWithGrave) { dest.Add(0x69); dest.Add(0x307); dest.Add(0x300); continue; } 02910 else if (cp == LatinCapitalLetterIWithAcute) { dest.Add(0x69); dest.Add(0x307); dest.Add(0x301); continue; } 02911 else if (cp == LatinCapitalLetterIWithTilde) { dest.Add(0x69); dest.Add(0x307); dest.Add(0x303); continue; } 02912 } 02913 if (cp == CombiningDotAbove) 02914 { 02915 // Lithuanian, howHere != ccLower. 02916 // AfterSoftDotted := the last preceding character with a combining class 02917 // of zero before C was Soft_Dotted, and there is no intervening combining 02918 // character class 230 (ABOVE). 02919 bool afterSoftDotted = false; 02920 size_t srcIdx2 = srcIdx - 1; // now srcIdx2 is the index from which we got 'cp' 02921 while (origSrcIdx < srcIdx2) 02922 { 02923 --srcIdx2; int cp2 = src[TVecIdx(srcIdx2)]; 02924 int cc2 = GetCombiningClass(cp2); 02925 if (cc2 == TUniChInfo::ccAbove) break; 02926 if (cc2 == TUniChInfo::ccStarter) { 02927 afterSoftDotted = IsSoftDotted(cp2); break; } 02928 } 02929 if (afterSoftDotted) 02930 { 02931 Assert(lithuanian); 02932 // Remove DOT ABOVE after "i" with upper or titlecase. 02933 // - Note: but this must only be done if that "i" was actually placed into uppercase (if how == ccTitle, 02934 // the "i" may have been kept lowercase and thus we shouldn't remove the dot). 
02935 if (how == ccLower) { dest.Add(0x307); continue; } 02936 if (how == ccUpper) continue; 02937 Assert(how == ccTitle); 02938 Assert(howHere == ccLower); // because CombiningDotAbove is not a cased character 02939 if (seenCased && ! seenTwoCased) continue; // The "i" has been placed into uppercase; thus, remove the dot. 02940 dest.Add(0x307); continue; 02941 } 02942 } 02943 } 02944 else if (turkic) // language code 'tr' (Turkish) and 'az' (Azeri) 02945 { 02946 // I and i-dotless; I-dot and i are case pairs in Turkish and Azeri 02947 // The following rules handle those cases. 02948 if (cp == LatinCapitalLetterIWithDotAbove) { 02949 dest.Add(howHere == ccLower ? 0x69 : 0x130); continue; } 02950 // When lowercasing, remove dot_above in the sequence I + dot_above, 02951 // which will turn into i. This matches the behavior of the 02952 // canonically equivalent I-dot_above. 02953 else if (cp == CombiningDotAbove) 02954 { 02955 // AfterI: the last preceding base character was an uppercase I, 02956 // and there is no intervening combining character class 230 (ABOVE). 02957 bool afterI = false; 02958 size_t srcIdx2 = srcIdx - 1; // now srcIdx2 is the index from which we got 'cp' 02959 while (origSrcIdx < srcIdx2) 02960 { 02961 --srcIdx2; int cp2 = src[TVecIdx(srcIdx2)]; 02962 if (cp2 == LatinCapitalLetterI) { afterI = true; break; } 02963 int cc2 = GetCombiningClass(cp2); 02964 if (cc2 == TUniChInfo::ccAbove || cc2 == TUniChInfo::ccStarter) break; 02965 } 02966 if (afterI) { 02967 if (how == ccTitle && seenCased && ! seenTwoCased) { 02968 // Sec. 3.13 defines title-casing in an unusual way: find the first cased character in each word; 02969 // if found, map it to titlecase; otherwise, map all characters in that word to lowercase. 02970 // This suggests that if a cased character is found, others in that word should be left alone. 02971 // This seems unusual; we map all other characters to lowercase instead. 02972 // But this leads to problems with e.g. I followed by dot-above (U+0307): since the dot-above 02973 // is not the first cased character (it isn't even cased), we attempt to set it to lowercase; 02974 // but since afterI is also true here, this would mean deleting it. Thus our titlecased 02975 // form of "I followed by dot-above" would be just "I", which is clearly wrong. 02976 // So we treat this as a special case here. 02977 IAssert(cpFirstCased == LatinCapitalLetterI); 02978 dest.Add(0x307); continue; } 02979 if (howHere != ccLower) dest.Add(0x307); 02980 continue; } 02981 } 02982 // When lowercasing, unless an I is before a dot_above, 02983 // it turns into a dotless i. 02984 else if (cp == LatinCapitalLetterI) 02985 { 02986 // BeforeDot: C is followed by U+0307 (combining dot above). 02987 // Any sequence of characters with a combining class that is 02988 // neither 0 nor 230 may intervene between the current character 02989 // and the combining dot above. 02990 bool beforeDot = false; 02991 for (size_t srcIdx2 = srcIdx; srcIdx2 < srcEnd; ) 02992 { 02993 const int cp2 = src[TVecIdx(srcIdx2)]; srcIdx2++; 02994 if (cp2 == 0x307) { beforeDot = true; break; } 02995 const int cc2 = GetCombiningClass(cp2); 02996 if (cc2 == TUniChInfo::ccStarter || cc2 == TUniChInfo::ccAbove) break; 02997 } 02998 if (! beforeDot) { 02999 dest.Add(howHere == ccLower ? 0x131 : 0x49); continue; } 03000 } 03001 // When uppercasing, i turns into a dotted capital I. 03002 else if (cp == LatinSmallLetterI) 03003 { 03004 dest.Add(howHere == ccLower ? 
0x69 : 0x130); continue; 03005 } 03006 } 03007 // Try to use the unconditional mappings. 03008 const TIntIntVH &specHere = ( 03009 howHere == how ? specials : 03010 howHere == ccLower ? specialCasingLower : 03011 howHere == ccTitle ? specialCasingTitle : 03012 howHere == ccUpper ? specialCasingUpper : *((TIntIntVH *) 0)); 03013 int i = specHere.GetKeyId(cp); 03014 if (i >= 0) { TUniCaseFolding::AppendVector(specHere[i], dest); continue; } 03015 // Try to use the simple (one-character) mappings. 03016 i = h.GetKeyId(cp); 03017 if (i >= 0) { 03018 const TUniChInfo &ci = h[i]; 03019 int cpNew = ( 03020 howHere == ccLower ? ci.simpleLowerCaseMapping : 03021 howHere == ccUpper ? ci.simpleUpperCaseMapping : 03022 ci.simpleTitleCaseMapping); 03023 if (cpNew < 0) cpNew = cp; 03024 dest.Add(cpNew); continue; } 03025 // As a final resort, leave 'cp' unchanged. 03026 dest.Add(cp); 03027 } 03028 } 03029 03030 template<typename TSrcVec, typename TDestCh> 03031 void TUniChDb::GetSimpleCaseConverted(const TSrcVec& src, size_t srcIdx, const size_t srcCount, 03032 TVec<TDestCh>& dest, const bool clrDest, const TCaseConversion how) const 03033 { 03034 if (clrDest) dest.Clr(); 03035 bool seenCased = false; size_t nextWordBoundary = srcIdx; 03036 for (const size_t origSrcIdx = srcIdx, srcEnd = srcIdx + srcCount; srcIdx < srcEnd; ) 03037 { 03038 const int cp = src[TVecIdx(srcIdx)]; srcIdx++; 03039 int i = h.GetKeyId(cp); if (i < 0) { dest.Add(cp); continue; } 03040 const TUniChInfo &ci = h[i]; 03041 // With titlecasing, the first cased character of each word must be put into titlecase, 03042 // all others into lowercase. This is what the howHere variable is for. 03043 TUniChDb::TCaseConversion howHere; 03044 if (how != ccTitle) howHere = how; 03045 else { 03046 if (srcIdx - 1 == nextWordBoundary) { // A word starts/ends here. 03047 seenCased = false; 03048 size_t next = nextWordBoundary; FindNextWordBoundary(src, origSrcIdx, srcCount, next); 03049 IAssert(next > nextWordBoundary); nextWordBoundary = next; } 03050 bool isCased = IsCased(cp); 03051 if (isCased && ! seenCased) { howHere = ccTitle; seenCased = true; } 03052 else howHere = ccLower; 03053 } 03054 int cpNew = (howHere == ccTitle ? ci.simpleTitleCaseMapping : howHere == ccUpper ? ci.simpleUpperCaseMapping : ci.simpleLowerCaseMapping); 03055 if (cpNew < 0) cpNew = cp; 03056 dest.Add(cpNew); 03057 } 03058 } 03059 03060 template<typename TSrcVec> 03061 void TUniChDb::ToSimpleCaseConverted(TSrcVec& src, size_t srcIdx, const size_t srcCount, const TCaseConversion how) const 03062 { 03063 bool seenCased = false; size_t nextWordBoundary = srcIdx; 03064 for (const size_t origSrcIdx = srcIdx, srcEnd = srcIdx + srcCount; srcIdx < srcEnd; srcIdx++) 03065 { 03066 const int cp = src[TVecIdx(srcIdx)]; 03067 int i = h.GetKeyId(cp); if (i < 0) continue; 03068 const TUniChInfo &ci = h[i]; 03069 // With titlecasing, the first cased character of each word must be put into titlecase, 03070 // all others into lowercase. This is what the howHere variable is for. 03071 TUniChDb::TCaseConversion howHere; 03072 if (how != ccTitle) howHere = how; 03073 else { 03074 if (srcIdx == nextWordBoundary) { // A word starts/ends here. 03075 seenCased = false; 03076 size_t next = nextWordBoundary; FindNextWordBoundary(src, origSrcIdx, srcCount, next); 03077 IAssert(next > nextWordBoundary); nextWordBoundary = next; } 03078 bool isCased = IsCased(cp); 03079 if (isCased && ! 
seenCased) { howHere = ccTitle; seenCased = true; } 03080 else howHere = ccLower; 03081 } 03082 int cpNew = (howHere == ccTitle ? ci.simpleTitleCaseMapping : howHere == ccUpper ? ci.simpleUpperCaseMapping : ci.simpleLowerCaseMapping); 03083 if (cpNew >= 0) src[TVecIdx(srcIdx)] = cpNew; 03084 } 03085 } 03086 03087 //----------------------------------------------------------------------------- 03088 // TUniChDb -- composition, decomposition, normal forms 03089 //----------------------------------------------------------------------------- 03090 03091 template<typename TDestCh> 03092 void TUniChDb::AddDecomposition(const int codePoint, TVec<TDestCh>& dest, const bool compatibility) const 03093 { 03094 if (HangulSBase <= codePoint && codePoint < HangulSBase + HangulSCount) 03095 { 03096 // UAX #15, sec. 16: Hangul decomposition 03097 const int SIndex = codePoint - HangulSBase; 03098 const int L = HangulLBase + SIndex / HangulNCount; 03099 const int V = HangulVBase + (SIndex % HangulNCount) / HangulTCount; 03100 const int T = HangulTBase + (SIndex % HangulTCount); 03101 dest.Add(L); dest.Add(V); 03102 if (T != HangulTBase) dest.Add(T); 03103 return; 03104 } 03105 int i = h.GetKeyId(codePoint); if (i < 0) { dest.Add(codePoint); return; } 03106 const TUniChInfo &ci = h[i]; 03107 int ofs = ci.decompOffset; if (ofs < 0) { dest.Add(codePoint); return; } 03108 if ((! compatibility) && ci.IsCompatibilityDecomposition()) { dest.Add(codePoint); return; } 03109 while (true) { 03110 int cp = decompositions[ofs++]; if (cp < 0) return; 03111 AddDecomposition(cp, dest, compatibility); } 03112 } 03113 03114 template<typename TSrcVec, typename TDestCh> 03115 void TUniChDb::Decompose(const TSrcVec& src, size_t srcIdx, const size_t srcCount, 03116 TVec<TDestCh>& dest, const bool compatibility, bool clrDest) const 03117 { 03118 if (clrDest) dest.Clr(); 03119 const size_t destStart = dest.Len()/*, srcEnd = srcIdx + srcCount*/; 03120 // Decompose the string. 03121 while (srcIdx < srcCount) { 03122 AddDecomposition(src[TVecIdx(srcIdx)], dest, compatibility); srcIdx++; } 03123 // Rearrange the decomposed string into canonical order. 03124 for (size_t destIdx = destStart, destEnd = dest.Len(); destIdx < destEnd; ) 03125 { 03126 size_t j = destIdx; 03127 int cp = dest[TVecIdx(destIdx)]; destIdx++; 03128 int cpCls = GetCombiningClass(cp); 03129 if (cpCls == TUniChInfo::ccStarter) continue; 03130 while (destStart < j && GetCombiningClass(dest[TVecIdx(j - 1)]) > cpCls) { 03131 dest[TVecIdx(j)] = dest[TVecIdx(j - 1)]; j--; } 03132 dest[TVecIdx(j)] = cp; 03133 } 03134 } 03135 03136 template<typename TSrcVec, typename TDestCh> 03137 void TUniChDb::DecomposeAndCompose(const TSrcVec& src, size_t srcIdx, const size_t srcCount, 03138 TVec<TDestCh>& dest, bool compatibility, bool clrDest) const 03139 { 03140 if (clrDest) dest.Clr(); 03141 TIntV temp; 03142 Decompose(src, srcIdx, srcCount, temp, compatibility); 03143 Compose(temp, 0, temp.Len(), dest, clrDest); 03144 } 03145 03146 template<typename TSrcVec, typename TDestCh> 03147 void TUniChDb::Compose(const TSrcVec& src, size_t srcIdx, const size_t srcCount, 03148 TVec<TDestCh>& dest, bool clrDest) const 03149 { 03150 if (clrDest) dest.Clr(); 03151 bool lastStarterKnown = false; // has a starter been encountered yet? 03152 size_t lastStarterPos = size_t(-1); // the index (in 'dest') of the last starter 03153 int cpLastStarter = -1; // the codepoint of the last starter (i.e. 
cpLastStarter == dest[lastStarterPos]) 03154 const size_t srcEnd = srcIdx + srcCount; 03155 int ccMax = -1; // The highest combining class among the characters since the last starter. 03156 while (srcIdx < srcEnd) 03157 { 03158 const int cp = src[TVecIdx(srcIdx)]; srcIdx++; 03159 const int cpClass = GetCombiningClass(cp); 03160 //int cpCombined = -1; 03161 // If there is a starter with which 'cp' can be combined, and from which it is not blocked 03162 // by some intermediate character, we can try to combine them. 03163 if (lastStarterKnown && ccMax < cpClass) 03164 { 03165 int j = inverseDec.GetKeyId(TIntPr(cpLastStarter, cp)); 03166 int cpCombined = -1; 03167 do { 03168 // Try to look up a composition in the inverseDec table. 03169 if (j >= 0) { cpCombined = inverseDec[j]; break; } 03170 // UAX #15, sec. 16: Hangul composition 03171 // - Try to combine L and V. 03172 const int LIndex = cpLastStarter - HangulLBase; 03173 if (0 <= LIndex && LIndex < HangulLCount) { 03174 const int VIndex = cp - HangulVBase; 03175 if (0 <= VIndex && VIndex < HangulVCount) { 03176 cpCombined = HangulSBase + (LIndex * HangulVCount + VIndex) * HangulTCount; 03177 break; } } 03178 // - Try to combine LV and T. 03179 const int SIndex = cpLastStarter - HangulSBase; 03180 if (0 <= SIndex && SIndex < HangulSCount && (SIndex % HangulTCount) == 0) 03181 { 03182 const int TIndex = cp - HangulTBase; 03183 if (0 <= TIndex && TIndex < HangulTCount) { 03184 cpCombined = cpLastStarter + TIndex; 03185 break; } 03186 } 03187 } while (false); 03188 // If a composed character has been found, use it to replace the old starter (cpLastStarter). 03189 if (cpCombined >= 0) { 03190 dest[TVecIdx(lastStarterPos)] = cpCombined; 03191 Assert(GetCombiningClass(cpCombined) == TUniChInfo::ccStarter); 03192 // if (cpCombined is not a starter) { starterKnown = false; lastStarterPos = size_t(-1); cpLastStarter = -1; } else 03193 cpLastStarter = cpCombined; continue; } 03194 } 03195 if (cpClass == TUniChInfo::ccStarter) { // 'cp' is a starter; remember it for later. Set ccMax to -1 so that this starter can be combined with another starter. 03196 lastStarterKnown = true; lastStarterPos = dest.Len(); cpLastStarter = cp; ccMax = cpClass - 1; } 03197 else if (cpClass > ccMax) // Remember cp's class as the new maximum class since the last starter (for blocking). 03198 ccMax = cpClass; 03199 dest.Add(cp); 03200 } 03201 } 03202 03203 template<typename TSrcVec, typename TDestCh> 03204 size_t TUniChDb::ExtractStarters(const TSrcVec& src, size_t srcIdx, const size_t srcCount, 03205 TVec<TDestCh>& dest, bool clrDest) const 03206 { 03207 if (clrDest) dest.Clr(); 03208 size_t retVal = 0; 03209 for (const size_t srcEnd = srcIdx + srcCount; srcIdx < srcEnd; srcIdx++) { 03210 const int cp = src[TVecIdx(srcIdx)]; 03211 if (GetCombiningClass(cp) == TUniChInfo::ccStarter) 03212 { dest.Add(cp); retVal++; } } 03213 return retVal; 03214 } 03215 03216 inline bool AlwaysFalse() 03217 { 03218 int sum = 0; 03219 for (int i = 0; i < 5; i++) sum += i; 03220 return sum > 100; 03221 } 03222 03223 inline bool AlwaysTrue() 03224 { 03225 int sum = 0; 03226 for (int i = 0; i < 5; i++) sum += i; 03227 return sum < 100; 03228 } 03229 03230 /* 03231 03232 Notes on decomposition: 03233 03234 - In UnicodeData.txt, there is a field with the decomposition mapping. 03235 This field may also include a tag, <...>. 03236 If there is a tag, this is a compatibility mapping. 03237 Otherwise it is a canonical mapping.
03238 - Canonical decomposition uses only canonical mappings, 03239 compatibility decomposition uses both canonical and compatibility mappings. 03240 - Decomposition: 03241 1. Apply the decomposition mappings (canonical or canonical+compatibility), recursively. 03242 2. Put the string into canonical order, which means: 03243 while there exists a pair of characters, A immediately followed by B, 03244 such that combiningclass(A) > combiningclass(B) > 0 [an "exchangeable pair"]: 03245 swap A and B [a standalone sketch of this reordering step appears at the end of this listing]; 03246 This results in NFD (normalized form D, after canonical decomposition) 03247 or NFKD (normalized form KD, after compatibility decomposition). 03248 - Canonical composition: 03249 1. Before composition, the string should have been decomposed 03250 (using either canonical or compatibility decomposition). 03251 2. For each character C (from left to right): 03252 2.1. Find the last starter S before C (if not found, continue). 03253 2.2. If there is, between S and C, some character with a combining class greater than or equal to that of C, then continue. 03254 2.3. If there exists a character L for which the canonical decomposition is S+C 03255 and L is not in the composition exclusion table [i.e. L is a "primary composite"], 03256 then replace S by L, and remove C. 03257 This results in NFC (normalized form C, with canonical decomposition followed by canonical composition) 03258 or NFKC (normalized form KC, with compatibility decomposition followed by canonical composition). 03259 - Composition exclusion table: 03260 - Anything in CompositionExclusions.txt. 03261 - Singletons: characters whose canonical decomposition is a single character. 03262 - Non-starter decompositions: characters whose canonical decomposition begins with a non-starter. 03263 03264 Example: 03265 E-grave (00c8; combining class 0; canonical decomposition: 0045 0300) 03266 E-macron (0112; combining class 0; 0045 0304) 03267 grave (0300; combining class 230) 03268 macron (0304; combining class 230) 03269 source string: 00c8 0304 03270 after canonical decomposition (or compatibility decomposition, they would be the same here): 0045 0300 0304 03271 after canonical composition: 00c8 0304 03272 03273 cc(horn) = 216 03274 cc(dot below) = 220 03275 cc(dot above) = 230 03276 03277 ToDos: 03278 - case folding - is intended primarily for comparing the strings obtained this way. 03279 The function f(s) = NFC(toCaseFold(s)) is idempotent. 03280 The function g(s) = NFKC(toCaseFold(s)) is not -- if we want that, we have to take into account 03281 a few additional mappings during folding (see 5.18, last paragraph; DerivedNormalizationProps.txt). 03282 - It seems that CaseFolding.txt is essentially just plain folding to lowercase. 03283 Since we also want to have the other foldings, we should rather look at SpecialCasing.txt 03284 (+ the simple case mappings in UnicodeData.txt). 03285 I suggest that, when reading SpecialCasing.txt, we simply ignore the conditional mappings 03286 and then handle them separately, directly in the source code of our programs [for 03287 a detailed definition of the conditions, see table 3.13]. 03288 - Postscript: still, it seems to me that CaseFolding.txt is slightly different from plain lowercasing. 03289 For example, for the small final sigma 03c2 it says that it should be changed into the ordinary small sigma 03c3. 03290 This follows neither from UnicodeData.txt nor from SpecialCasing.txt, even though UCD.html says 03291 that CaseFolding.txt is derived from them. The main purpose of CaseFolding.txt is supposedly 03292 to support "locale-independent case folding" (table 4.1 and sec. 5.18).
03293 - Before you start dealing with case conversions, have a look at section 3.13 03294 and especially p. 90. 03295 - See p. 91 on the combination N[K]FD + caseFold + N[K]FD 03296 - the definition of cased etc. on p. 89 03297 - isIdentifierStart(c), isIdentifierEnd(c) -- sec. 5.15 03298 See DerivedCoreProperties.txt, where a whole set of similar 03299 things is defined in a similar way, among them isLowerCase and isUpperCase. It also contains isLetter, isAlphabetic etc. (sec. 4.9). 03300 These are best added among the flags of each individual character. 03301 - general category: sec. 4.5 03302 - motivation for titlecase: 5.18 03303 - compare our current computation of compositionExclusion with what is computed in DerivedNormalizationProps.txt 03304 under Full_Composition_Exclusion 03305 - script names: Scripts.txt and UAX #24. 03306 - block names: Blocks.txt 03307 - space characters: table 6.2 and reportedly also UCD.html 03308 - dash characters: table 6.3 03309 */ 03310 03311 //#endif 03312
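
//-----------------------------------------------------------------------------
// Illustrative sketches -- standalone examples added for exposition.
// They are NOT part of the SNAP library itself; all names defined below
// (TitlecaseAsciiSketch, HangulSketch, CanonicalOrderSketch,
// NormalizationUsageSketch) are introduced here only to illustrate the code
// and notes above.
//-----------------------------------------------------------------------------

// Sketch 1: the titlecasing bookkeeping used by GetCaseConverted and
// GetSimpleCaseConverted above (the first cased character of each word is
// titlecased, every other cased character is lowercased), reduced to ASCII so
// the 'seenCased' / word-boundary logic can be read in isolation.  Word
// boundaries are approximated here by isalpha() runs; the real code asks
// FindNextWordBoundary and IsCased instead, so this only sketches the control
// flow, not the library's behaviour on full Unicode input.
#include <cctype>
#include <string>

inline std::string TitlecaseAsciiSketch(const std::string& src)
{
  std::string dest; dest.reserve(src.size());
  bool seenCased = false;                    // cased letter already seen in the current word?
  for (size_t i = 0; i < src.size(); i++) {
    const unsigned char c = static_cast<unsigned char>(src[i]);
    if (! std::isalpha(c)) {                 // a (very crude) word boundary
      seenCased = false; dest.push_back(static_cast<char>(c)); continue; }
    if (! seenCased) {                       // first cased character of the word -> titlecase
      dest.push_back(static_cast<char>(std::toupper(c))); seenCased = true; }
    else                                     // every other cased character -> lowercase
      dest.push_back(static_cast<char>(std::tolower(c)));
  }
  return dest;
}
// For example, TitlecaseAsciiSketch("hello WORLD") returns "Hello World".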
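
// Sketch 2: the algorithmic Hangul decomposition/composition that
// TUniChDb::AddDecomposition and TUniChDb::Compose implement above, written
// out with the standard UAX #15 constants and plain ints so it can be
// compiled and checked on its own.
#include <vector>

namespace HangulSketch {
  // Standard constants for Hangul syllables (Unicode chapter 3 / UAX #15).
  const int SBase = 0xAC00, LBase = 0x1100, VBase = 0x1161, TBase = 0x11A7;
  const int LCount = 19, VCount = 21, TCount = 28;
  const int NCount = VCount * TCount;   // 588
  const int SCount = LCount * NCount;   // 11172

  // Decompose a precomposed syllable into its L, V (and optional T) jamo;
  // anything outside the syllable range is passed through unchanged.
  inline void Decompose(int cp, std::vector<int>& dest) {
    const int SIndex = cp - SBase;
    if (SIndex < 0 || SIndex >= SCount) { dest.push_back(cp); return; }
    dest.push_back(LBase + SIndex / NCount);                // leading consonant
    dest.push_back(VBase + (SIndex % NCount) / TCount);     // vowel
    const int T = TBase + SIndex % TCount;
    if (T != TBase) dest.push_back(T);                      // trailing consonant, if any
  }

  // Compose L+V into an LV syllable, or LV+T into an LVT syllable;
  // returns -1 if the pair does not combine.
  inline int Compose(int first, int second) {
    const int LIndex = first - LBase, VIndex = second - VBase;
    if (0 <= LIndex && LIndex < LCount && 0 <= VIndex && VIndex < VCount)
      return SBase + (LIndex * VCount + VIndex) * TCount;
    const int SIndex = first - SBase, TIndex = second - TBase;
    if (0 <= SIndex && SIndex < SCount && SIndex % TCount == 0 && 0 < TIndex && TIndex < TCount)
      return first + TIndex;
    return -1;
  }
}
// For example, HangulSketch::Decompose(0xD4DB, jamo) yields 0x1111 0x1171 0x11B6,
// and HangulSketch::Compose(HangulSketch::Compose(0x1111, 0x1171), 0x11B6) == 0xD4DB.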
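
// Sketch 3: the "canonical ordering" step described in the notes above -- keep
// swapping adjacent exchangeable pairs (combiningClass(A) > combiningClass(B) > 0)
// until none remain.  This is the same stable insertion sort that
// TUniChDb::Decompose performs on 'dest'; here it is written over std::vector
// with a caller-supplied combining-class lookup, so it can be tried out
// without a loaded character database.
#include <functional>
#include <vector>

inline void CanonicalOrderSketch(std::vector<int>& cps,
                                 const std::function<int(int)>& combiningClass)
{
  for (size_t idx = 0; idx < cps.size(); idx++) {
    const int cp = cps[idx];
    const int cls = combiningClass(cp);
    if (cls == 0) continue;                  // starters never move
    size_t j = idx;
    // Shift earlier characters with a strictly greater class to the right;
    // a starter (class 0) stops the scan, and equal classes are never swapped,
    // so the reordering is stable.
    while (j > 0 && combiningClass(cps[j - 1]) > cls) { cps[j] = cps[j - 1]; j--; }
    cps[j] = cp;
  }
}
// With the classes from the example above (grave = macron = 230), the sequence
// 0045 0300 0304 is already in canonical order; a class-220 mark written after
// a class-230 mark, however, would be moved in front of it.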
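
// Sketch 4: how the member templates above fit together.  This assumes a
// TUniChDb instance that has already been loaded with the Unicode character
// data and a TIntV of codepoints as input (both are the caller's
// responsibility), and it assumes the TCaseConversion enumerator ccTitle is
// publicly accessible as TUniChDb::ccTitle, as the member definitions above
// suggest.  The signatures themselves are the ones defined in this header.
inline void NormalizationUsageSketch(const TUniChDb& db, const TIntV& src)
{
  TIntV nfd, nfkd, nfc, titled;
  // NFD: canonical decomposition (compatibility == false) plus canonical ordering.
  db.Decompose(src, 0, src.Len(), nfd, /*compatibility=*/ false, /*clrDest=*/ true);
  // NFKD: the same, but compatibility mappings are applied as well.
  db.Decompose(src, 0, src.Len(), nfkd, /*compatibility=*/ true, /*clrDest=*/ true);
  // NFC: canonical decomposition followed by canonical composition.
  db.DecomposeAndCompose(src, 0, src.Len(), nfc, /*compatibility=*/ false, /*clrDest=*/ true);
  // Simple (one-to-one) titlecasing: the first cased character of each word is
  // titlecased, the remaining cased characters are lowercased.
  db.GetSimpleCaseConverted(src, 0, src.Len(), titled, /*clrDest=*/ true, TUniChDb::ccTitle);
}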