SNAP Library 2.1, Developer Reference  2013-09-25 10:47:25
SNAP, a general purpose, high performance system for analysis and manipulation of large networks
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Defines
unicode.h
Go to the documentation of this file.
00001 #include "bd.h"
00002 
00003 //#ifndef unicode_h
00004 //#define unicode_h
00005 
00007 // Includes
00008 //#include "base.h"
00009 #include <new>
00010 
00011 typedef int TUniVecIdx;
00012 
00013 //-----------------------------------------------------------------------------
00014 // TUniCodec -- a UTF-8 and UTF-16 Encoder/Decoder
00015 //-----------------------------------------------------------------------------
00016 
00017 // Error handling modes for the TUniCodec class.
00018 enum TUnicodeErrorHandling_
00019 {
00020         // Policy applied when the codec meets an error in its input:
00021         uehIgnore = 0,  // skip it silently; nothing is appended to the output vector
00022         uehThrow = 1,   // throw a TUnicodeException
00023         uehReplace = 2, // append the replacement character to the output vector
00024         uehAbort = 3    // stop the encoding/decoding process immediately
00025 };
00026 typedef TUnicodeErrorHandling_ TUnicodeErrorHandling;
00027 
00028 class TUnicodeException
00029 {
      // Plain value object describing one encoding/decoding error; every field is
      // public and is simply copied from the constructor arguments.
00030 public:
00031         TStr message;  // human-readable description of the failure
00032         size_t srcIdx; // index into the source vector at which the error was found
00033         int srcChar;   // the source character located at srcIdx
00034         TUnicodeException(size_t srcIdx_, int srcChar_, const TStr& message_)
00035                 : message(message_), srcIdx(srcIdx_), srcChar(srcChar_) { }
00036 };
00037 
00038 enum TUniByteOrder_
00039 {
      // Byte-order selector taken by the UTF-16 entry points of TUniCodec
      // (their defaultByteOrder / destByteOrder parameters).
00040         boMachineEndian = 0, // the default value of those parameters
00041         boLittleEndian = 1,
00042         boBigEndian = 2
00043 };
00044 typedef TUniByteOrder_ TUniByteOrder;
00045 
00046 typedef enum TUtf16BomHandling_
00047 {
00048         bomAllowed = 0,   // if a BOM is present, it is used to determine the byte order; otherwise, the default byte order is used
00049         bomRequired = 1,  // if a BOM is present, it is used to determine the byte order; otherwise, an error is reported
00050         bomIgnored = 2    // the default byte order is used; if a BOM is present, it is treated like any other character
00051 }
00052 TUtf16BomHandling;
00053 
00054 class TUniCodec
00055 {
00056 public:
00057         // 0xfffd is defined as the replacement character by the Unicode standard.
00058         // By default, it is rendered as a question mark inside a diamond: "<?>".
00059         enum { DefaultReplacementChar = 0xfffd };
00060 
00061         // The replacement character is inserted into the destination vector
00062         // if an error occurs in the source vector.  By default, this is set
00063         // to DefaultReplacementChar.
00064         int replacementChar;
00065         // The error handling mode.
00066         TUnicodeErrorHandling errorHandling;
00067         // There are a number of situations where there is strictly speaking an error in
00068         // the source data although it can still be decoded in a reasonably meaningful way.
00069         // If strict == true, these situations are treated as errors.  Examples:
00070         // - when decoding UTF-8:
00071         //   - a codepoint represented by more bytes than necessary (e.g. one of the characters 0..127
00072         //     encoded as a two-byte sequence)
00073         //   - a codepoint > 0x10ffff
00074         // - when decoding UTF-16:
00075         //   - a codepoint from the range reserved for the second character of a surrogate pair
00076         //     is not preceded by a codepoint from the range reserved for the first character of a surrogate pair
00077         // - when encoding UTF-8:
00078         //   - a codepoint > 0x10ffff
00079         // - when encoding UTF-16:
00080         //   - a codepoint from the range reserved for the second character of a surrogate pair
00081         //     [note that a codepoint > 0x10ffff, or from the range reserved for the first character of a
00082         //     surrogate pair, is always an error, even with strict == false]
00083         bool strict;
00084         // skipBom == true means: If a byte-order-mark (0xfffe or 0xfeff) occurs at the beginning
00085         // of the source vector, it is skipped (when decoding).
00086         // - Note: a BOM is not really useful in UTF-8 encoded data.  However, the .NET UTF8Encoding
00087         //   emits 0xfeff by default as a kind of preamble.  It gets encoded as 3 bytes, ef bb bf,
00088         //   and can be helpful to make the data easier to recognize as UTF-8 encoded data.
00089         bool skipBom;
00090 
00091         TUniCodec() : replacementChar(DefaultReplacementChar), errorHandling(uehIgnore), strict(false), skipBom(true)
00092         {
00093         }
00094 
00095         TUniCodec(TUnicodeErrorHandling errorHandling_, bool strict_, int replacementChar_, bool skipBom_) :
00096                 replacementChar(replacementChar_), errorHandling(errorHandling_), strict(strict_), skipBom(skipBom_)
00097         {
00098         }
00099 
00100 protected:
00101         enum {
00102 #define DefineByte(b7, b6, b5, b4, b3, b2, b1, b0) _ ## b7 ## b6 ## b5 ## b4 ## _ ## b3 ## b2 ## b1 ## b0 = (b7 << 7) | (b6 << 6) | (b5 << 5) | (b4 << 4) | (b3 << 3) | (b2 << 2) | (b1 << 1) | b0
00103                 DefineByte(1, 0, 0, 0, 0, 0, 0, 0),
00104                 DefineByte(1, 1, 0, 0, 0, 0, 0, 0),
00105                 DefineByte(1, 1, 1, 0, 0, 0, 0, 0),
00106                 DefineByte(1, 1, 1, 1, 0, 0, 0, 0),
00107                 DefineByte(1, 1, 1, 1, 1, 0, 0, 0),
00108                 DefineByte(1, 1, 1, 1, 1, 1, 0, 0),
00109                 DefineByte(1, 1, 1, 1, 1, 1, 1, 0),
00110                 DefineByte(0, 0, 1, 1, 1, 1, 1, 1),
00111                 DefineByte(0, 0, 0, 1, 1, 1, 1, 1),
00112                 DefineByte(0, 0, 0, 0, 1, 1, 1, 1),
00113                 DefineByte(0, 0, 0, 0, 0, 1, 1, 1),
00114                 DefineByte(0, 0, 0, 0, 0, 0, 1, 1)
00115 #undef DefineByte
00116         };
00117 
00118         typedef TUniVecIdx TVecIdx;
00119         //friend class TUniChDb;
00120         friend class TUniCaseFolding;
00121         friend class TUnicode;
00122 
00123 public:
00124 
00125         //-----------------------------------------------------------------------
00126         // UTF-8
00127         //-----------------------------------------------------------------------
00128 
00129         // Returns the number of characters that have been successfully decoded.
00130         // This does not include any replacement characters that may have been inserted into 'dest'.
00131         template<typename TSrcVec, typename TDestCh>
00132         size_t DecodeUtf8(
00133                 const TSrcVec& src, size_t srcIdx, const size_t srcCount,
00134                 TVec<TDestCh>& dest, const bool clrDest = true) const;
00135         template<typename TSrcVec, typename TDestCh>
00136         size_t DecodeUtf8(const TSrcVec& src, TVec<TDestCh>& dest, const bool clrDest = true) const { return DecodeUtf8(src, 0, src.Len(), dest, clrDest); }
00137 
00138         // Returns the number of characters that have been successfully encoded.
00139         // This does not include any replacement characters that may have been inserted into 'dest'.
00140         template<typename TSrcVec, typename TDestCh>
00141         size_t EncodeUtf8(
00142                 const TSrcVec& src, size_t srcIdx, const size_t srcCount,
00143                 TVec<TDestCh>& dest, const bool clrDest = true) const;
00144         template<typename TSrcVec, typename TDestCh>
00145         size_t EncodeUtf8(const TSrcVec& src, TVec<TDestCh>& dest, const bool clrDest = true) const { return EncodeUtf8(src, 0, src.Len(), dest, clrDest); }
00146 
00147         // The following wrappers around the UTF-8 encoder return a TStr containing
00148         // the UTF-8-encoded version of the input string.
              // Both overloads encode into a temporary TVec<char> and append a terminating 0
              // before handing &temp[0] to TStr as a C string.  The terminator also guarantees
              // that temp[0] exists when nothing at all was encoded.
              //   src      - source vector of codepoints
              //   srcIdx   - index of the first element of 'src' to encode (first overload only)
              //   srcCount - number of elements to encode (first overload only)
00149         template<typename TSrcVec> TStr EncodeUtf8Str(const TSrcVec& src, size_t srcIdx, const size_t srcCount) const { TVec<char> temp; EncodeUtf8(src, srcIdx, srcCount, temp); temp.Add(0); TStr retVal = &(temp[0]); return retVal; }
00150         template<typename TSrcVec> TStr EncodeUtf8Str(const TSrcVec& src) const { TVec<char> temp; EncodeUtf8(src, temp); temp.Add(0); TStr retVal = &(temp[0]); return retVal; }
00151 
00152         //-----------------------------------------------------------------------
00153         // UTF-16 Decoder
00154         //-----------------------------------------------------------------------
00155 
00156 protected:
00157         enum {
00158                 Utf16FirstSurrogate = 0xd800,
00159                 Utf16SecondSurrogate = 0xdc00
00160         };
00161 
00162         static bool IsMachineLittleEndian();
00163 
00164 public:
00165 
00166         // Returns the number of characters that have been successfully decoded.
00167         // This does not include any replacement characters that may have been inserted into 'dest'.
00168         // Each element of 'src' is assumed to contain one byte of data.
00169         // srcCount must be even (though srcIdx doesn't need to be).
00170         template<typename TSrcVec, typename TDestCh>
00171         size_t DecodeUtf16FromBytes(
00172                 const TSrcVec& src, size_t srcIdx, const size_t srcCount,
00173                 TVec<TDestCh>& dest, const bool clrDest,
00174                 const TUtf16BomHandling bomHandling = bomAllowed,
00175                 const TUniByteOrder defaultByteOrder = boMachineEndian) const;
00176 
00177         // Here, each element of 'src' is treated as a 16-bit word.  The byte-order settings
00178         // are used to determine if the two bytes of each word should be swapped before further
00179         // processing.  For example, if a BOM is present, it must have the value 0xfeff; if it
00180         // actually has the value 0xfffe, this means that the two bytes of each word must be swapped.
00181         // Basically, the combination of the byteOrder parameter and the byte order mark (if present) at the
00182         // beginning of the source data is used to determine the "original" byte order of the data;
00183         // if this doesn't match the byte order of the local machine, the two bytes of each word will
00184         // be swapped during the decoding process.
00185         template<typename TSrcVec, typename TDestCh>
00186         size_t DecodeUtf16FromWords(
00187                 const TSrcVec& src, size_t srcIdx, const size_t srcCount,
00188                 TVec<TDestCh>& dest, bool clrDest,
00189                 const TUtf16BomHandling bomHandling = bomAllowed,
00190                 const TUniByteOrder defaultByteOrder = boMachineEndian) const;
00191 
00192         //-----------------------------------------------------------------------
00193         // UTF-16 Encoder
00194         //-----------------------------------------------------------------------
00195 
00196         // Returns the number of characters that have been successfully encoded.
00197         // This does not include any replacement characters that may have been inserted into 'dest'.
00198         //
00199         // Notes:
00200         // - UTF-16 *cannot* encode characters above 0x10ffff, so their presence is always
00201         //   treated as an error, regardless of the value of 'strict'.
00202         // - Characters from the range Utf16FirstSurrogate through Utf16FirstSurrogate + 1023
00203         //   cannot be encoded by UTF-16 either, as they would be misinterpreted during decoding
00204         //   as the first character of a surrogate pair.
00205         // - Characters from the range Utf16SecondSurrogate through Utf16SecondSurrogate + 1023
00206         //   can be encoded in principle; however, if strict == true, they are treated as errors.
00207         template<typename TSrcVec, typename TDestCh>
00208         size_t EncodeUtf16ToWords(
00209                 const TSrcVec& src, size_t srcIdx, const size_t srcCount,
00210                 TVec<TDestCh>& dest, const bool clrDest, const bool insertBom,
00211                 const TUniByteOrder destByteOrder = boMachineEndian) const;
00212 
00213         template<typename TSrcVec, typename TDestCh>
00214         size_t EncodeUtf16ToBytes(
00215                 const TSrcVec& src, size_t srcIdx, const size_t srcCount,
00216                 TVec<TDestCh>& dest, const bool clrDest, const bool insertBom,
00217                 const TUniByteOrder destByteOrder = boMachineEndian) const;
00218 
00219         //-----------------------------------------------------------------------
00220         // Helper declarations for the test drivers
00221         //-----------------------------------------------------------------------
00222 
00223 protected:
00224 
00225         static uint GetRndUint(TRnd& rnd);
00226         static uint GetRndUint(TRnd& rnd, uint minVal, uint maxVal);
00227 
00228         //-----------------------------------------------------------------------
00229         // UTF-8 Test Driver
00230         //-----------------------------------------------------------------------
00231 
00232 protected:
00233         void TestUtf8(bool decode, size_t expectedRetVal, bool expectedThrow, const TIntV& src, const TIntV& expectedDest, FILE *f);
00234         // Generates a random UTF-8-encoded stream according to the specifications in 'testCaseDesc',
00235         // then calls TestUtf8 to make sure that DecodeUtf8 reacts as expected.
00236         void TestDecodeUtf8(TRnd& rnd, const TStr& testCaseDesc);
00237 public:
00238         void TestUtf8();
00239 
00240         //-----------------------------------------------------------------------
00241         // UTF-16 Test Driver
00242         //-----------------------------------------------------------------------
00243 
00244 protected:
00245         void WordsToBytes(const TIntV& src, TIntV& dest);
00246         void TestUtf16(bool decode, size_t expectedRetVal, bool expectedThrow, const TIntV& src, const TIntV& expectedDest,
00247                 // Note: insertBom is only used with the encoder.  When encoding, 'defaultByteOrder' is used as the destination byte order.
00248                 const TUtf16BomHandling bomHandling, const TUniByteOrder defaultByteOrder, const bool insertBom,
00249                 FILE *f);
00250         static inline int SwapBytes(int x) {
00251                 return ((x >> 8) & 0xff) | ((x & 0xff) << 8); }
00252         // Generates a random UTF-16-encoded stream according to the specifications in 'testCaseDesc',
00253         // then calls TestUtf16 to make sure that DecodeUtf16 reacts as expected.
00254         void TestDecodeUtf16(TRnd& rnd, const TStr& testCaseDesc,
00255                 const TUtf16BomHandling bomHandling,
00256                 const TUniByteOrder defaultByteOrder,
00257                 const bool insertBom);
00258 public:
00259         void TestUtf16();
00260 
00261 };
00262 
00263 //-----------------------------------------------------------------------------
00264 // Case folding
00265 //-----------------------------------------------------------------------------
00266 // Note: there's no need to access this class directly.
00267 // Use TUniChDb::GetCaseFolded() instead.
00268 
00269 typedef THash<TInt, TIntV> TIntIntVH;
00270 
00271 class TUniCaseFolding
00272 {
      // Holds the case-folding lookup tables and applies them to vectors of codepoints.
00273 protected:
              // All tables are keyed by the source codepoint.  cfCommon, cfSimple and cfTurkic
              // yield a single folded codepoint; cfFull yields a vector of codepoints.
00274         TIntH cfCommon, cfSimple, cfTurkic;
00275         TIntIntVH cfFull;
00276 
              // Appends every element of 'src', in order, to the end of 'dest'.
00277         template<typename TSrcDat, typename TDestDat>
00278         inline static void AppendVector(const TVec<TSrcDat>& src, TVec<TDestDat>& dest) {
00279                 for (int i = 0; i < src.Len(); i++) dest.Add(src[i]); }
00280         friend class TUniChDb;
00281         typedef TUniVecIdx TVecIdx;
00282 
00283 public:
00284         TUniCaseFolding() { }
              // Deserializes from 'SIn'.  This delegates to Load() so that the tables are read
              // in exactly the order Save() writes them (cfCommon, cfSimple, cfFull, cfTurkic).
              // A member-initializer list cannot express that order: members are always
              // initialized in declaration order, which would read cfTurkic before cfFull.
00285         explicit TUniCaseFolding(TSIn& SIn) { Load(SIn); }
              // Load() and Save() must keep the same table order; both end with the checksum call.
00286         void Load(TSIn& SIn) { cfCommon.Load(SIn); cfSimple.Load(SIn); cfFull.Load(SIn); cfTurkic.Load(SIn); SIn.LoadCs(); }
00287         void Save(TSOut& SOut) const { cfCommon.Save(SOut); cfSimple.Save(SOut); cfFull.Save(SOut); cfTurkic.Save(SOut); SOut.SaveCs(); }
              // Empties all four tables.
00288         void Clr() { cfCommon.Clr(); cfSimple.Clr(); cfFull.Clr(); cfTurkic.Clr(); }
00289         void LoadTxt(const TStr& fileName);
00290 
              // Case-folds src[srcIdx .. srcIdx + srcCount - 1] into 'dest'.
              //   clrDest - if true, 'dest' is emptied first; otherwise the output is appended to it
              //   full    - if true, cfFull is consulted (one codepoint may fold to several);
              //             if false, cfSimple is consulted instead (always one codepoint)
              //   turkic  - if true, cfTurkic takes precedence over the other tables
              // Codepoints found in none of the applicable tables fall back to cfCommon and,
              // failing that, are copied unchanged.
00291         // Use 'turkic' when processing text in a Turkic language (tr, az).  This only affects the uppercase I and I-with-dot-above.
00292         template<typename TSrcVec, typename TDestCh>
00293         void Fold(const TSrcVec& src, size_t srcIdx, const size_t srcCount,
00294                 TVec<TDestCh>& dest, const bool clrDest, const bool full, const bool turkic) const
00295         {
                      // Honour clrDest; without this the parameter would be silently ignored.
                      if (clrDest) dest.Clr();
00296                 for (const size_t srcEnd = srcIdx + srcCount; srcIdx < srcEnd; )
00297                 {
00298                         int c = src[TVecIdx(srcIdx)], i; srcIdx++;
00299                         if (turkic && ((i = cfTurkic.GetKeyId(c)) >= 0)) { dest.Add(cfTurkic[i]); continue; }
00300                         if (full && ((i = cfFull.GetKeyId(c)) >= 0)) { AppendVector(cfFull[i], dest); continue; }
00301                         if ((! full) && ((i = cfSimple.GetKeyId(c)) >= 0)) { dest.Add(cfSimple[i]); continue; }
00302                         i = cfCommon.GetKeyId(c); if (i >= 0) dest.Add(cfCommon[i]); else dest.Add(c);
00303                 }
00304         }
00305 
              // Case-folds src[srcIdx .. srcIdx + srcCount - 1] by overwriting each element.
              // Only one-to-one tables are used (cfTurkic when 'turkic' is set, then cfSimple,
              // then cfCommon), so the length of 'src' never changes; elements found in no
              // table are left as they are.
00306         template<typename TSrcVec>
00307         void FoldInPlace(TSrcVec& src, size_t srcIdx, const size_t srcCount, const bool turkic) const
00308         {
00309                 for (const size_t srcEnd = srcIdx + srcCount; srcIdx < srcEnd; srcIdx++)
00310                 {
00311                         int c = src[TVecIdx(srcIdx)], i;
00312                         if (turkic && ((i = cfTurkic.GetKeyId(c)) >= 0)) { src[TVecIdx(srcIdx)] = cfTurkic[i]; continue; }
00313                         if ((i = cfSimple.GetKeyId(c)) >= 0) { src[TVecIdx(srcIdx)] = cfSimple[i]; continue; }
00314                         i = cfCommon.GetKeyId(c); if (i >= 0) src[TVecIdx(srcIdx)] = cfCommon[i];
00315                 }
00316         }
00317 
00318 protected:
00319         void Test(const TIntV& src, const TIntV& expectedDest, const bool full, const bool turkic, FILE *f);
00320 public:
00321         void Test();
00322 };
00323 
00324 //-----------------------------------------------------------------------------
00325 // TCodecBase -- an abstract base class for codecs
00326 //-----------------------------------------------------------------------------
00327 
00328 class TCodecBase;
00329 typedef TPt<TCodecBase> PCodecBase;
00330 typedef TVec<PCodecBase> TCodecBaseV;
00331 
00332 class TCodecBase
00333 {
      // Abstract interface shared by all codecs: conversion of encoded data to
      // Unicode codepoints (ToUnicode) and back (FromUnicode).
00334 protected:
              // NOTE(review): presumably the reference count managed by TPt<TCodecBase>,
              // which is why TPt is befriended -- confirm against the TPt/TCRef definitions.
00335         TCRef CRef;
00336         friend class TPt<TCodecBase>;
00337 public:
00338         virtual ~TCodecBase() { }
00339 
              // Factory: returns a TCodecWrapper<TCodecImpl> behind a PCodecBase.  Only declared
              // here; the definition follows the TCodecWrapper class further down in this file.
00340         template<class TCodecImpl>
00341         static PCodecBase New(); /* {
00342                 return new TCodecWrapper<TCodecImpl>(); } */
00343 
00344         virtual TStr GetName() const = 0;
              // Optional self-test hook; does nothing unless overridden.
00345         virtual void Test() const { }
00346 
00347         // Returns the number of characters that have been successfully decoded.
00348         // This does not include any replacement characters that may have been inserted into 'dest'.
00349         virtual size_t ToUnicode(const TIntV& src, size_t srcIdx, const size_t srcCount, TIntV& dest, const bool clrDest = true) const = 0;
00350         virtual size_t ToUnicode(const TStr& src, size_t srcIdx, const size_t srcCount, TIntV& dest, const bool clrDest = true) const = 0;
00351 
              // Whole-input convenience forms: forward with srcIdx = 0 and srcCount = src.Len().
00352         size_t ToUnicode(const TIntV& src, TIntV& dest, const bool clrDest = true) const { return ToUnicode(src, 0, src.Len(), dest, clrDest); }
00353         size_t ToUnicode(const TStr& src, TIntV& dest, const bool clrDest = true) const { return ToUnicode(src, 0, src.Len(), dest, clrDest); }
00354 
00355         // Returns the number of characters that have been successfully encoded.
00356         // This does not include any replacement characters that may have been inserted into 'dest'.
00357         virtual size_t FromUnicode(const TIntV& src, size_t srcIdx, const size_t srcCount, TIntV& dest, const bool clrDest = true) const = 0;
00358         virtual size_t FromUnicode(const TIntV& src, size_t srcIdx, const size_t srcCount, TChA& dest, const bool clrDest = true) const = 0;
00359         virtual size_t FromUnicode(const TIntV& src, size_t srcIdx, const size_t srcCount, TStr& dest, const bool clrDest = true) const = 0;
00360 
              // Whole-input convenience forms: forward with srcIdx = 0 and srcCount = src.Len().
00361         size_t FromUnicode(const TIntV& src, TIntV& dest, const bool clrDest = true) const { return FromUnicode(src, 0, src.Len(), dest, clrDest); }
00362         size_t FromUnicode(const TIntV& src, TChA& dest, const bool clrDest = true) const { return FromUnicode(src, 0, src.Len(), dest, clrDest); }
00363         size_t FromUnicode(const TIntV& src, TStr& dest, const bool clrDest = true) const { return FromUnicode(src, 0, src.Len(), dest, clrDest); }
00364 };
00365 
00366 //-----------------------------------------------------------------------------
00367 // TCodecWrapper -- a descendant of TCodecBase; relies on a template
00368 // parameter class for the actual implementation of the codec.
00369 //-----------------------------------------------------------------------------
00370 // Thus, if you know in advance that you'll need ISO-8859-2, just use
00371 // T8BitCodec<TEncoding_ISO8859_2>.  If you don't know the encoding
00372 // in advance, use a PCodecBase pointing to a suitable specialization
00373 // of TCodecWrapper<...>.  You can call TUnicode::GetCodec(TStr& name)
00374 // to obtain a suitable pointer.
00375 
00376 template<class TCodecImpl_>
00377 class TCodecWrapper : public TCodecBase
00378 {
      // Adapts a concrete codec implementation (TCodecImpl_) to the virtual TCodecBase
      // interface; every override simply forwards to the 'impl' member.
00379 public:
00380         typedef TCodecImpl_ TCodecImpl;
00381         TCodecImpl impl;
00382 public:
00383 
00384         virtual TStr GetName() const { return impl.GetName(); }
00385 
00386         virtual void Test() const { impl.Test(); }
00387 
              // Decoding: returns whatever impl.ToUnicode returns for the same arguments.
00388         virtual size_t ToUnicode(const TIntV& src, size_t srcIdx, const size_t srcCount, TIntV& dest, const bool clrDest = true) const {
00389                 return impl.ToUnicode(src, srcIdx, srcCount, dest, clrDest); }
00390         virtual size_t ToUnicode(const TStr& src, size_t srcIdx, const size_t srcCount, TIntV& dest, const bool clrDest = true) const {
00391                 return impl.ToUnicode(src, srcIdx, srcCount, dest, clrDest); }
00392 
              // Encoding: returns whatever impl.FromUnicode returns for the same arguments.
00393         virtual size_t FromUnicode(const TIntV& src, size_t srcIdx, const size_t srcCount, TIntV& dest, const bool clrDest = true) const {
00394                 return impl.FromUnicode(src, srcIdx, srcCount, dest, clrDest); }
00395         virtual size_t FromUnicode(const TIntV& src, size_t srcIdx, const size_t srcCount, TChA& dest, const bool clrDest = true) const {
00396                 return impl.FromUnicode(src, srcIdx, srcCount, dest, clrDest); }
              // TStr destination: encode into a scratch TChA, then replace 'dest' with the
              // result when clrDest is true, or append the result to 'dest' when it is false.
00397         virtual size_t FromUnicode(const TIntV& src, size_t srcIdx, const size_t srcCount, TStr& dest, const bool clrDest = true) const {
00398                 TChA buf; size_t retVal = impl.FromUnicode(src, srcIdx, srcCount, buf, false);
00399                 if (clrDest) dest = buf.CStr(); else dest += buf.CStr();
00400                 return retVal; }
00401 };
00402 
00403 template<class TCodecImpl>
00404 PCodecBase TCodecBase::New() {
00405   return PCodecBase(new TCodecWrapper<TCodecImpl>());  // wrap a freshly allocated adapter for TCodecImpl
00406 }
00407 
00408 //-----------------------------------------------------------------------------
00409 // TVecElt -- a template for determining the type of a vector's elements
00410 //-----------------------------------------------------------------------------
00411 
00412 template<class TVector_>
00413 class TVecElt
00414 {
      // The primary template has no members; only the specializations below
      // provide TVector, TElement and Add().
00415 };
00416 
00417 template<class TDat>
00418 class TVecElt<TVec<TDat> >
00419 {
00420 public:
00421         typedef TVec<TDat> TVector;
00422         typedef TDat TElement;
              // Appends one element through TVec::Add.
00423         static inline void Add(TVec<TDat>& vector, const TDat& element) { vector.Add(element); }
00424 };
00425 
00426 template<>
00427 class TVecElt<TChA>
00428 {
00429 public:
00430         typedef TChA TVector;
00431         typedef char TElement;
              // Appends one character through TChA's operator +=.
00432         static inline void Add(TChA& vector, const char& element) { vector += element; }
00433 };
00434 
00435 
00436 //-----------------------------------------------------------------------------
00437 // T8BitCodec -- a class for converting between 8-bit encodings and Unicode
00438 //-----------------------------------------------------------------------------
00439 
00440 class TEncoding_ISO8859_1
00441 {
      // Identity mapping: byte values 0..255 equal the codepoints 0..255.
00442 public:
00443         static inline TStr GetName() { return "ISO-8859-1"; }
              // Byte -> codepoint; 'c' must lie in 0..255.
00444         static int ToUnicode(int c) { Assert(0 <= c && c <= 255); return c; }
              // Codepoint -> byte, or -1 when the codepoint is outside 0..255.
00445         static int FromUnicode(int c) { return (0 <= c && c <= 255) ? c : -1; }
00446 };
00447 
00448 class TEncoding_ISO8859_2 // ISO Latin 2
00449 {
00450 public:
00451         static inline TStr GetName() { return "ISO-8859-2"; }
              // toUnicodeTable is indexed by (byte - 0xa0); fromUnicodeTable1 by (codepoint - 0xa0)
              // for codepoints 0xa0..0x17f; fromUnicodeTable2 by (codepoint - 0x2c0) for 0x2c0..0x2df.
00452         static const int toUnicodeTable[6 * 16], fromUnicodeTable1[14 * 16], fromUnicodeTable2[2 * 16];
              // Byte -> codepoint; bytes below 0xa0 map to themselves.
00453         static int ToUnicode(int c) { Assert(0 <= c && c <= 255);
00454                 return (c < 0xa0) ? c : toUnicodeTable[c - 0xa0]; }
              // Codepoint -> table entry for the covered ranges; -1 for every other codepoint.
00455         static int FromUnicode(int c) {
00456                 if (0 <= c && c < 0xa0) return c;
00457                 if (0xa0 <= c && c < 0x180) return fromUnicodeTable1[c - 0xa0];
00458                 if (0x2c0 <= c && c < 0x2e0) return fromUnicodeTable2[c - 0x2c0];
00459                 return -1; }
00460 };
00461 
00462 class TEncoding_ISO8859_3
00463 {
00464 public:
00465         static inline TStr GetName() { return "ISO-8859-3"; }
              // toUnicodeTable is indexed by (byte - 0xa0); fromUnicodeTable1 by (codepoint - 0xa0)
              // for codepoints 0xa0..0x17f; fromUnicodeTable2 holds just the two codepoints 0x2d8 and 0x2d9.
00466         static const int toUnicodeTable[6 * 16], fromUnicodeTable1[14 * 16], fromUnicodeTable2[2];
              // Byte -> codepoint; bytes below 0xa0 map to themselves.
00467         static int ToUnicode(int c) { Assert(0 <= c && c <= 255);
00468                 return (c < 0xa0) ? c : toUnicodeTable[c - 0xa0]; }
              // Codepoint -> table entry for the covered ranges; -1 for every other codepoint.
00469         static int FromUnicode(int c) {
00470                 if (0 <= c && c < 0xa0) return c;
00471                 if (0xa0 <= c && c < 0x180) return fromUnicodeTable1[c - 0xa0];
00472                 if (0x2d8 <= c && c < 0x2da) return fromUnicodeTable2[c - 0x2d8];
00473                 return -1; }
00474 };
00475 
00476 class TEncoding_ISO8859_4
00477 {
00478 public:
00479         static inline TStr GetName() { return "ISO-8859-4"; }
              // toUnicodeTable is indexed by (byte - 0xa0); fromUnicodeTable1 by (codepoint - 0xa0)
              // for codepoints 0xa0..0x17f; fromUnicodeTable2 by (codepoint - 0x2c0) for 0x2c0..0x2df.
00480         static const int toUnicodeTable[6 * 16], fromUnicodeTable1[14 * 16], fromUnicodeTable2[2 * 16];
              // Byte -> codepoint; bytes below 0xa0 map to themselves.
00481         static int ToUnicode(int c) { Assert(0 <= c && c <= 255);
00482                 return (c < 0xa0) ? c : toUnicodeTable[c - 0xa0]; }
              // Codepoint -> table entry for the covered ranges; -1 for every other codepoint.
00483         static int FromUnicode(int c) {
00484                 if (0 <= c && c < 0xa0) return c;
00485                 if (0xa0 <= c && c < 0x180) return fromUnicodeTable1[c - 0xa0];
00486                 if (0x2c0 <= c && c < 0x2e0) return fromUnicodeTable2[c - 0x2c0];
00487                 return -1; }
00488 };
00489 
00490 class TEncoding_YuAscii
00491 {
00492 public:
              // Parallel arrays: yuAsciiChars[k] is the byte value that stands for codepoint uniChars[k].
00493         static const int uniChars[10], yuAsciiChars[10];
00494         static inline TStr GetName() { return "YU-ASCII"; }
              // Byte -> codepoint: remapped bytes give their uniChars partner, all others map to themselves.
00495         static int ToUnicode(int c) { Assert(0 <= c && c <= 255);
00496                 for (int k = 0, n = int(sizeof(yuAsciiChars) / sizeof(yuAsciiChars[0])); k < n; k++)
00497                         if (yuAsciiChars[k] == c) return uniChars[k];
00498                 return c; }
              // Codepoint -> byte.  A codepoint equal to one of the remapped byte values yields -1,
              // as does anything outside 0..255 that is not listed in uniChars.
00499         static int FromUnicode(int c) {
00500                 for (int k = 0, n = int(sizeof(uniChars) / sizeof(uniChars[0])); k < n; k++) {
00501                         if (uniChars[k] == c) return yuAsciiChars[k];
00502                         if (yuAsciiChars[k] == c) return -1; }
00503                 return (0 <= c && c <= 255) ? c : -1; }
00504 };
00505 
00506 class TEncoding_CP437 // DOS US
00507 {
00508 public:
00509         static inline TStr GetName() { return "CP437"; }
              // toUnicodeTable is indexed by (byte - 0x80).  The fromUnicodeTables cover the codepoint
              // ranges 0xa0..0xff, 0x390..0x3cf, 0x2210..0x226f and 0x2500..0x25af respectively.
00510         static const int toUnicodeTable[8 * 16], fromUnicodeTable1[6 * 16], fromUnicodeTable2[4 * 16], fromUnicodeTable3[6 * 16], fromUnicodeTable4[11 * 16];
              // Byte -> codepoint; bytes below 0x80 map to themselves.
00511         static int ToUnicode(int c) { Assert(0 <= c && c <= 255);
00512                 return (c < 0x80) ? c : toUnicodeTable[c - 0x80]; }
              // Codepoint -> table entry or fixed byte; -1 for every codepoint not handled below.
00513         static int FromUnicode(int c) {
00514                 if (0 <= c && c < 0x80) return c;
00515                 if (0xa0 <= c && c < 0x100) return fromUnicodeTable1[c - 0xa0];
00516                 if (0x390 <= c && c < 0x3d0) return fromUnicodeTable2[c - 0x390];
00517                 if (0x2210 <= c && c < 0x2270) return fromUnicodeTable3[c - 0x2210];
00518                 if (0x2500 <= c && c < 0x25b0) return fromUnicodeTable4[c - 0x2500];
                      // Isolated codepoints that lie outside all of the table ranges.
                      switch (c) {
00519                 case 0x192: return 0x9f;
00520                 case 0x207f: return 0xfc;
00521                 case 0x20a7: return 0x9e;
00522                 case 0x2310: return 0xa9;
00523                 case 0x2320: return 0xf4;
00524                 case 0x2321: return 0xf5;
00525                 default: return -1; } }
00526 };
00527 
00528 class TEncoding_CP852 // DOS Latin 2
00529 {
00530 public:
00531         static inline TStr GetName() { return "CP852"; }
              // toUnicodeTable is indexed by (byte - 0x80).  The fromUnicodeTables cover the codepoint
              // ranges 0xa0..0x17f, 0x2c0..0x2df and 0x2500..0x25af respectively.
00532         static const int toUnicodeTable[8 * 16], fromUnicodeTable1[14 * 16], fromUnicodeTable2[2 * 16], fromUnicodeTable3[11 * 16];
              // Byte -> codepoint; bytes below 0x80 map to themselves.
00533         static int ToUnicode(int c) { Assert(0 <= c && c <= 255);
00534                 return (c < 0x80) ? c : toUnicodeTable[c - 0x80]; }
              // Codepoint -> table entry for the covered ranges; -1 for every other codepoint.
00535         static int FromUnicode(int c) {
00536                 if (0 <= c && c < 0x80) return c;
00537                 if (0xa0 <= c && c < 0x180) return fromUnicodeTable1[c - 0xa0];
00538                 if (0x2c0 <= c && c < 0x2e0) return fromUnicodeTable2[c - 0x2c0];
00539                 if (0x2500 <= c && c < 0x25b0) return fromUnicodeTable3[c - 0x2500];
00540                 return -1; }
00541 };
00542 
00543 class TEncoding_CP1250 // Windows-1250, similar to ISO Latin 2
00544 {
00545 public:
00546         static inline TStr GetName() { return "CP1250"; }
              // toUnicodeTable is indexed by (byte - 0x80).  The fromUnicodeTables cover the codepoint
              // ranges 0xa0..0x17f, 0x2c0..0x2df and 0x2010..0x203f respectively.
00547         static const int toUnicodeTable[8 * 16], fromUnicodeTable1[14 * 16], fromUnicodeTable2[2 * 16], fromUnicodeTable3[3 * 16];
              // Byte -> codepoint; bytes below 0x80 map to themselves.
00548         static int ToUnicode(int c) { Assert(0 <= c && c <= 255);
00549                 return (c < 0x80) ? c : toUnicodeTable[c - 0x80]; }
              // Codepoint -> table entry or fixed byte; -1 for every codepoint not handled below.
00550         static int FromUnicode(int c) {
00551                 if (0 <= c && c < 0x80) return c;
00552                 if (0xa0 <= c && c < 0x180) return fromUnicodeTable1[c - 0xa0];
00553                 if (0x2c0 <= c && c < 0x2e0) return fromUnicodeTable2[c - 0x2c0];
00554                 if (0x2010 <= c && c < 0x2040) return fromUnicodeTable3[c - 0x2010];
00555                 if (c == 0x20ac) return 0x80;
00556                 if (c == 0x2122) return 0x99;
00557                 return -1; }
00558 };
00559 
00560 template<class TEncoding_>
00561 class T8BitCodec
00562 {
00563 protected:
00564         typedef TUniVecIdx TVecIdx;
00565 public:
00566         typedef TEncoding_ TEncoding;
00567         TUnicodeErrorHandling errorHandling;
00568         int replacementChar;
00569 
00570         T8BitCodec() : errorHandling(uehIgnore), replacementChar(TUniCodec::DefaultReplacementChar) { }
00571         T8BitCodec(TUnicodeErrorHandling errorHandling_, int replacementChar_ = TUniCodec::DefaultReplacementChar) :
00572                 errorHandling(errorHandling_), replacementChar(replacementChar_) { }
00573         static TStr GetName() { return TEncoding::GetName(); }
00574 
00575         void Test() const
00576         {
00577                 int nDecoded = 0;
00578                 for (int c = 0; c <= 255; c++) {
00579                         int cu = TEncoding::ToUnicode(c); if (cu == -1) continue;
00580                         nDecoded++;
00581                         IAssert(0 <= cu && cu < 0x110000);
00582                         int c2 = TEncoding::FromUnicode(cu);
00583                         IAssert(c2 == c); }
00584                 int nEncoded = 0;
00585                 for (int cu = 0; cu < 0x110000; cu++) {
00586                         int c = TEncoding::FromUnicode(cu); if (c == -1) continue;
00587                         nEncoded++;
00588                         IAssert(0 <= c && c <= 255);
00589                         int cu2 = TEncoding::ToUnicode(c);
00590                         IAssert(cu2 == cu); }
00591                 IAssert(nDecoded == nEncoded);
00592         }
00593 
00594         // Returns the number of characters that have been successfully decoded.
00595         // This does not include any replacement characters that may have been inserted into 'dest'.
00596         template<typename TSrcVec, typename TDestCh>
00597         size_t ToUnicode(
00598                 const TSrcVec& src, size_t srcIdx, const size_t srcCount,
00599                 TVec<TDestCh>& dest, const bool clrDest = true) const
00600         {
00601                 if (clrDest) dest.Clr();
00602                 size_t toDo = srcCount;
00603                 while (toDo-- > 0) {
00604                         int chSrc = ((int) src[TVecIdx(srcIdx)]) & 0xff; srcIdx++;
00605                         int chDest = TEncoding::ToUnicode(chSrc);
00606                         dest.Add(chDest); }
00607                 return srcCount;
00608         }
00609         template<typename TSrcVec, typename TDestCh>
00610         size_t ToUnicode(const TSrcVec& src, TVec<TDestCh>& dest, const bool clrDest = true) const { return ToUnicode(src, 0, src.Len(), dest, clrDest); }
00611 
00612         size_t ToUnicode(const TIntV& src, TIntV& dest, const bool clrDest = true) const { return ToUnicode(src, 0, src.Len(), dest, clrDest); }
00613         size_t ToUnicode(const TStr& src, TIntV& dest, const bool clrDest = true) const { return ToUnicode(src, 0, src.Len(), dest, clrDest); }
00614 
00615         // Returns the number of characters that have been successfully encoded.
00616         // This does not include any replacement characters that may have been inserted into 'dest'.
00617         template<typename TSrcVec, typename TDestVec>
00618         size_t FromUnicode(
00619                 const TSrcVec& src, size_t srcIdx, const size_t srcCount,
00620                 TDestVec& dest, const bool clrDest = true) const
00621         {
00622                 typedef typename TVecElt<TDestVec>::TElement TDestCh;
00623                 if (clrDest) dest.Clr();
00624                 size_t toDo = srcCount, nEncoded = 0;
00625                 while (toDo-- > 0) {
00626                         int chSrc = (int) src[TVecIdx(srcIdx)]; srcIdx++;
00627                         int chDest = TEncoding::FromUnicode(chSrc);
00628                         if (chDest < 0) {
00629                                 switch (errorHandling) {
00630                                 case uehThrow: throw TUnicodeException(srcIdx - 1, chSrc, "Invalid character for encoding into " + GetName() + ".");
00631                                 case uehAbort: return nEncoded;
00632                                 case uehReplace: TVecElt<TDestVec>::Add(dest, TDestCh(replacementChar)); continue;
00633                                 case uehIgnore: continue;
00634                                 default: Fail; } }
00635                         TVecElt<TDestVec>::Add(dest, TDestCh(chDest)); nEncoded++; }
00636                 return nEncoded;
00637         }
00638 
00639         template<typename TSrcVec, typename TDestVec>
00640         size_t FromUnicode(const TSrcVec& src, TDestVec& dest, const bool clrDest = true) const { return FromUnicode(src, 0, src.Len(), dest, clrDest); }
00641 
00642         size_t UniToStr(const TIntV& src, size_t srcIdx, const size_t srcCount, TStr& dest, const bool clrDest = true) const {
00643                 TChA buf; size_t retVal = FromUnicode(src, srcIdx, srcCount, buf, false);
00644                 if (clrDest) dest += buf.CStr(); else dest = buf.CStr();
00645                 return retVal; }
00646         size_t UniToStr(const TIntV& src, TStr& dest, const bool clrDest = true) const { return UniToStr(src, 0, src.Len(), dest, clrDest); }
00647 };
00648 
00649 typedef T8BitCodec<TEncoding_ISO8859_1> TCodec_ISO8859_1;
00650 typedef T8BitCodec<TEncoding_ISO8859_2> TCodec_ISO8859_2;
00651 typedef T8BitCodec<TEncoding_ISO8859_3> TCodec_ISO8859_3;
00652 typedef T8BitCodec<TEncoding_ISO8859_4> TCodec_ISO8859_4;
00653 typedef T8BitCodec<TEncoding_CP852> TCodec_CP852;
00654 typedef T8BitCodec<TEncoding_CP437> TCodec_CP437;
00655 typedef T8BitCodec<TEncoding_CP1250> TCodec_CP1250;
00656 typedef T8BitCodec<TEncoding_YuAscii> TCodec_YuAscii;
00657 
00658 //-----------------------------------------------------------------------------
00659 // Various declarations used by the Unicode Character Database
00660 //-----------------------------------------------------------------------------
00661 
// Major general category of a character.  Each enumerator's value is the
// character code of the category's one-letter abbreviation.
typedef enum TUniChCategory_
{
        ucLetter      = 'L',
        ucMark        = 'M',
        ucNumber      = 'N',
        ucPunctuation = 'P',
        ucSymbol      = 'S',
        ucSeparator   = 'Z',
        ucOther       = 'C'
}
TUniChCategory;
00675 
00676 typedef enum TUniChSubCategory_
00677 {
00678 #define DefineUniSubCat(cat, subCat, c) uc ## cat ## subCat = ((uc ## cat) << 8) | (int(uchar(c)) & 0xff)
00679         DefineUniSubCat(Letter, Uppercase, 'u'),            // ucLetterUppercase
00680         DefineUniSubCat(Letter, Lowercase, 'l'),
00681         DefineUniSubCat(Letter, Titlecase, 't'),
00682         DefineUniSubCat(Letter, Modifier, 'm'),
00683         DefineUniSubCat(Letter, Other, 'o'),
00684         DefineUniSubCat(Mark, Nonspacing, 'n'),
00685         DefineUniSubCat(Mark, SpacingCombining, 'c'),
00686         DefineUniSubCat(Mark, Enclosing, 'e'),
00687         DefineUniSubCat(Number, DecimalDigit, 'd'),
00688         DefineUniSubCat(Number, Letter, 'l'),
00689         DefineUniSubCat(Number, Other, 'o'),
00690         DefineUniSubCat(Punctuation, Connector, 'c'),
00691         DefineUniSubCat(Punctuation, Dash, 'd'),
00692         DefineUniSubCat(Punctuation, Open, 's'),
00693         DefineUniSubCat(Punctuation, Close, 'e'),
00694         DefineUniSubCat(Punctuation, InitialQuote, 'i'),
00695         DefineUniSubCat(Punctuation, FinalQuote, 'f'),
00696         DefineUniSubCat(Punctuation, Other, 'o'),
00697         DefineUniSubCat(Symbol, Math, 'm'),
00698         DefineUniSubCat(Symbol, Currency, 'c'),
00699         DefineUniSubCat(Symbol, Modifier, 'k'),
00700         DefineUniSubCat(Symbol, Other, 'o'),
00701         DefineUniSubCat(Separator, Space, 's'),
00702         DefineUniSubCat(Separator, Line, 'l'),
00703         DefineUniSubCat(Separator, Paragraph, 'p'),
00704         DefineUniSubCat(Other, Control, 'c'),
00705         DefineUniSubCat(Other, Format, 'f'),
00706         DefineUniSubCat(Other, Surrogate, 's'),
00707         DefineUniSubCat(Other, PrivateUse, 'o'),
00708         DefineUniSubCat(Other, NotAssigned, 'n')
00709 }
00710 TUniChSubCategory;
00711 
// Bit flags describing a character: decomposition flags, word- and
// sentence-boundary classes, and derived core properties.  Each flag occupies
// its own bit, so values can be OR-ed together; the *Mask enumerators select
// all flags of one group.
typedef enum TUniChFlags_
{
        ucfCompatibilityDecomposition = 1, // if this flag is not set, the decomposition is canonical
        ucfCompositionExclusion = 1 << 1,       // from CompositionExclusions.txt
        // Flags used when searching for word boundaries.  See UAX #29.
        ucfWbFormat = 1 << 2,
        ucfWbKatakana = 1 << 3,
        ucfWbALetter = 1 << 4,
        ucfWbMidLetter = 1 << 5,
        ucfWbMidNum = 1 << 6,
        ucfWbNumeric = 1 << 7,
        ucfWbExtendNumLet = 1 << 8,
        // Flags used with sentence boundaries (Sep is also used with word boundaries).  See UAX #29.
        ucfSbSep = 1 << 9,
        ucfSbFormat = 1 << 10,
        ucfSbSp = 1 << 11,
        ucfSbLower = 1 << 12,
        ucfSbUpper = 1 << 13,
        ucfSbOLetter = 1 << 14,
        ucfSbNumeric = 1 << 15,
        ucfSbATerm = 1 << 16,
        ucfSbSTerm = 1 << 17,
        ucfSbClose = 1 << 18,
        // All sentence-boundary flags.
        ucfSbMask = ucfSbSep | ucfSbFormat | ucfSbSp | ucfSbLower | ucfSbUpper | ucfSbOLetter | ucfSbNumeric | ucfSbATerm | ucfSbSTerm | ucfSbClose,
        // All word-boundary flags; ucfSbSep is deliberately shared with the sentence-boundary group.
        ucfWbMask = ucfWbFormat | ucfWbKatakana | ucfWbALetter | ucfWbMidLetter | ucfWbMidNum | ucfWbNumeric | ucfWbExtendNumLet | ucfSbSep,
        // Flags from DerivedCoreProperties.txt.
        // [The comments are from UCD.html.]
        // - Characters with the Alphabetic property. For more information, see Chapter 4 in [Unicode].
        //   Generated from: Other_Alphabetic + Lu + Ll + Lt + Lm + Lo + Nl
        ucfDcpAlphabetic = 1 << 19,
        // - For programmatic determination of default-ignorable code points.
        //   New characters that should be ignored in processing (unless explicitly supported)
        //   will be assigned in these ranges, permitting programs to correctly handle the default
        //   behavior of such characters when not otherwise supported.  For more information, see
        //   UAX #29: Text Boundaries [Breaks].
        //   Generated from Other_Default_Ignorable_Code_Point + Cf + Cc + Cs + Noncharacters - White_Space - annotation characters
        //   [Examples: soft hyphen, zero-width space, noncharacters (e.g. U+fffe, U+ffff, U+1fffe, U+1ffff, etc.), surrogates, language tags, variation selectors]
        ucfDcpDefaultIgnorableCodePoint = 1 << 20,
        // - Characters with the Lowercase property.  For more information, see Chapter 4 in [Unicode].
        //   Generated from: Other_Lowercase + Ll
        ucfDcpLowercase = 1 << 21,
        // - For programmatic determination of grapheme cluster boundaries.
        //   For more information, see UAX #29: Text Boundaries [Breaks].
        //   Generated from: [0..10FFFF] - Cc - Cf - Cs - Co - Cn - Zl - Zp - Grapheme_Extend
        ucfDcpGraphemeBase = 1 << 22,
        // - For programmatic determination of grapheme cluster boundaries.
        //   For more information, see UAX #29: Text Boundaries [Breaks].
        //   Generated from: Other_Grapheme_Extend + Me + Mn
        //   Note: depending on an application's interpretation of Co (private use), they may be either
        //         in Grapheme_Base, or in Grapheme_Extend, or in neither.
        ucfDcpGraphemeExtend = 1 << 23,
        // - Used to determine programming identifiers, as described in UAX #31: Identifier and Pattern Syntax.
        ucfDcpIdStart = 1 << 24,
        ucfDcpIdContinue = 1 << 25,
        // - Characters with the Math property. For more information, see Chapter 4 in [Unicode].
        //   Generated from: Sm + Other_Math
        ucfDcpMath = 1 << 26,
        // - Characters with the Uppercase property. For more information, see Chapter 4 in [Unicode].
        //   Generated from: Lu + Other_Uppercase
        ucfDcpUppercase = 1 << 27,
        // - Used to determine programming identifiers, as described in UAX #31: Identifier and Pattern Syntax.
        ucfDcpXidStart = 1 << 28,
        ucfDcpXidContinue = 1 << 29,
        // All DerivedCoreProperties flags.
        // (No comma after the last enumerator: a trailing comma is not valid before C++11.)
        ucfDcpMask = ucfDcpAlphabetic | ucfDcpDefaultIgnorableCodePoint | ucfDcpLowercase | ucfDcpGraphemeBase | ucfDcpGraphemeExtend |
                ucfDcpIdStart | ucfDcpIdContinue | ucfDcpMath | ucfDcpUppercase | ucfDcpXidStart | ucfDcpXidContinue
}
TUniChFlags;
00779 
// Binary character properties taken from PropList.txt.  Each property occupies
// one bit so that several can be combined in a single integer.
typedef enum TUniChProperties_
{
        // The flags from PropList.txt.
        // [The comments are from UCD.html.]
        // - ASCII characters commonly used for the representation of hexadecimal numbers.
        //   [= 0123456789abcdefABCDEF]
        ucfPrAsciiHexDigit = 1 << 0,
        // - Those format control characters which have specific functions in the Bidirectional Algorithm.
        ucfPrBidiControl = 1 << 1,
        // - Those punctuation characters explicitly called out as dashes in the Unicode Standard,
        //   plus compatibility equivalents to those. Most of these have the Pd General Category,
        //   but some have the Sm General Category because of their use in mathematics.
        //     U+0002d  HYPHEN-MINUS
        //     U+0058a  ARMENIAN HYPHEN
        //     U+005be  HEBREW PUNCTUATION MAQAF
        //     U+01806  MONGOLIAN TODO SOFT HYPHEN
        //     U+02010  HYPHEN
        //     U+02011  NON-BREAKING HYPHEN
        //     U+02012  FIGURE DASH
        //     U+02013  EN DASH
        //     U+02014  EM DASH
        //     U+02015  HORIZONTAL BAR
        //     U+02053  SWUNG DASH
        //     U+0207b  SUPERSCRIPT MINUS
        //     U+0208b  SUBSCRIPT MINUS
        //     U+02212  MINUS SIGN
        //     U+02e17  DOUBLE OBLIQUE HYPHEN
        //     U+0301c  WAVE DASH
        //     U+03030  WAVY DASH
        //     U+030a0  KATAKANA-HIRAGANA DOUBLE HYPHEN
        //     U+0fe31  PRESENTATION FORM FOR VERTICAL EM DASH
        //     U+0fe32  PRESENTATION FORM FOR VERTICAL EN DASH
        //     U+0fe58  SMALL EM DASH
        //     U+0fe63  SMALL HYPHEN-MINUS
        //     U+0ff0d  FULLWIDTH HYPHEN-MINUS
        ucfPrDash = 1 << 2,
        // - For a machine-readable list of deprecated characters.  No characters will ever be removed
        //   from the standard, but the usage of deprecated characters is strongly discouraged.
        ucfPrDeprecated = 1 << 3,
        // - Characters that linguistically modify the meaning of another character to which they apply.
        //   Some diacritics are not combining characters, and some combining characters are not diacritics.
        ucfPrDiacritic = 1 << 4,
        // - Characters whose principal function is to extend the value or shape of a preceding alphabetic
        //   character.  Typical of these are length and iteration marks.
        ucfPrExtender = 1 << 5,
        // - Used in determining default grapheme cluster boundaries.  For more information, see UAX #29: Text Boundaries.
        ucfPrGraphemeLink = 1 << 6,
        // - Characters commonly used for the representation of hexadecimal numbers, plus their compatibility equivalents.
        //   [= AsciiHexDigit + fullwidth digit {0..9} + fullwidth latin {small|capital} letter {a..f}]
        ucfPrHexDigit = 1 << 7,
        // - Those dashes used to mark connections between pieces of words, plus the Katakana middle dot.
        //   The Katakana middle dot functions like a hyphen, but is shaped like a dot rather than a dash.
        //     U+0002d  HYPHEN-MINUS
        //     U+000ad  SOFT HYPHEN
        //     U+0058a  ARMENIAN HYPHEN
        //     U+01806  MONGOLIAN TODO SOFT HYPHEN
        //     U+02010  HYPHEN
        //     U+02011  NON-BREAKING HYPHEN
        //     U+02e17  DOUBLE OBLIQUE HYPHEN
        //     U+030fb  KATAKANA MIDDLE DOT
        //     U+0fe63  SMALL HYPHEN-MINUS
        //     U+0ff0d  FULLWIDTH HYPHEN-MINUS
        //     U+0ff65  HALFWIDTH KATAKANA MIDDLE DOT
        ucfPrHyphen = 1 << 8,
        // - Characters considered to be CJKV (Chinese, Japanese, Korean, and Vietnamese) ideographs.
        ucfPrIdeographic = 1 << 9,
        // - Those format control characters which have specific functions for control of cursive joining and ligation.
        ucfPrJoinControl = 1 << 10,
        // - There are a small number of characters that do not use logical order.
        //   These characters require special handling in most processing.
        ucfPrLogicalOrderException = 1 << 11,
        // - Code points that are permanently reserved for internal use.
        ucfPrNoncharacterCodePoint = 1 << 12,
        // - Used for pattern syntax as described in UAX #31: Identifier and Pattern Syntax.
        ucfPrPatternSyntax = 1 << 13,
        ucfPrPatternWhiteSpace = 1 << 14,
        // - Those punctuation characters that function as quotation marks.
        //     U+00022  QUOTATION MARK
        //     U+00027  APOSTROPHE
        //     U+000ab  LEFT-POINTING DOUBLE ANGLE QUOTATION MARK
        //     U+000bb  RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK
        //     U+02018  LEFT SINGLE QUOTATION MARK
        //     U+02019  RIGHT SINGLE QUOTATION MARK
        //     U+0201a  SINGLE LOW-9 QUOTATION MARK
        //     U+0201b  SINGLE HIGH-REVERSED-9 QUOTATION MARK
        //     U+0201c  LEFT DOUBLE QUOTATION MARK
        //     U+0201d  RIGHT DOUBLE QUOTATION MARK
        //     U+0201e  DOUBLE LOW-9 QUOTATION MARK
        //     U+0201f  DOUBLE HIGH-REVERSED-9 QUOTATION MARK
        //     U+02039  SINGLE LEFT-POINTING ANGLE QUOTATION MARK
        //     U+0203a  SINGLE RIGHT-POINTING ANGLE QUOTATION MARK
        //     U+0300c  LEFT CORNER BRACKET
        //     U+0300d  RIGHT CORNER BRACKET
        //     U+0300e  LEFT WHITE CORNER BRACKET
        //     U+0300f  RIGHT WHITE CORNER BRACKET
        //     U+0301d  REVERSED DOUBLE PRIME QUOTATION MARK
        //     U+0301e  DOUBLE PRIME QUOTATION MARK
        //     U+0301f  LOW DOUBLE PRIME QUOTATION MARK
        //     U+0fe41  PRESENTATION FORM FOR VERTICAL LEFT CORNER BRACKET
        //     U+0fe42  PRESENTATION FORM FOR VERTICAL RIGHT CORNER BRACKET
        //     U+0fe43  PRESENTATION FORM FOR VERTICAL LEFT WHITE CORNER BRACKET
        //     U+0fe44  PRESENTATION FORM FOR VERTICAL RIGHT WHITE CORNER BRACKET
        //     U+0ff02  FULLWIDTH QUOTATION MARK
        //     U+0ff07  FULLWIDTH APOSTROPHE
        //     U+0ff62  HALFWIDTH LEFT CORNER BRACKET
        //     U+0ff63  HALFWIDTH RIGHT CORNER BRACKET
        ucfPrQuotationMark = 1 << 15,
        // - Characters with a "soft dot", like i or j. An accent placed on these characters causes the dot to disappear.
        //   An explicit _dot above_ can be added where required, such as in Lithuanian.
        ucfPrSoftDotted = 1 << 16,
        // - Sentence Terminal. Used in UAX #29: Text Boundaries.
        //     U+00021  EXCLAMATION MARK
        //     U+0002e  FULL STOP
        //     U+0003f  QUESTION MARK
        //     U+0203c  DOUBLE EXCLAMATION MARK
        //     U+0203d  INTERROBANG
        //     U+02047  DOUBLE QUESTION MARK
        //     U+02048  QUESTION EXCLAMATION MARK
        //     U+02049  EXCLAMATION QUESTION MARK
        //     U+03002  IDEOGRAPHIC FULL STOP
        //     [plus many characters from other writing systems]
        ucfPrSTerm = 1 << 17,
        // - Those punctuation characters that generally mark the end of textual units.
        //   [JB note: this set contains more characters than STerm.  For example, it contains
        //   the comma, colon and semicolon, whereas STerm doesn't.]
        //     U+00021  EXCLAMATION MARK
        //     U+0002c  COMMA
        //     U+0002e  FULL STOP
        //     U+0003a  COLON
        //     U+0003b  SEMICOLON
        //     U+0003f  QUESTION MARK
        //     U+0203c  DOUBLE EXCLAMATION MARK
        //     U+0203d  INTERROBANG
        //     U+02047  DOUBLE QUESTION MARK
        //     U+02048  QUESTION EXCLAMATION MARK
        //     U+02049  EXCLAMATION QUESTION MARK
        //     [plus *lots* of characters from other writing systems]
        ucfPrTerminalPunctuation = 1 << 18,
        // - Indicates all those characters that qualify as Variation Selectors.
        //   For details on the behavior of these characters, see StandardizedVariants.html and
        //   Section 16.4, Variation Selectors in [Unicode].
        ucfPrVariationSelector = 1 << 19,
        // - Those separator characters and control characters which should be treated by
        //   programming languages as "white space" for the purpose of parsing elements.
        //   Note: ZERO WIDTH SPACE and ZERO WIDTH NO-BREAK SPACE are not included,
        //         since their functions are restricted to line-break control.
        //         Their names are unfortunately misleading in this respect.
        //   Note: There are other senses of "whitespace" that encompass a different set of characters.
        //         [JB note: e.g. there's a BIDI class for whitespace ('WS') in UnicodeData.txt.
        //         There's also a "Sp" class in the sentence boundary algorithm, see UAX #29, sec. 5.1.]
        //   This includes the following characters:
        //     U+0009  <control>
        //     U+000a  <control>
        //     U+000b  <control>
        //     U+000c  <control>
        //     U+000d  <control>
        //     U+0020  SPACE
        //     U+0085  <control>
        //     U+00a0  NO-BREAK SPACE
        //     U+1680  OGHAM SPACE MARK
        //     U+180e  MONGOLIAN VOWEL SEPARATOR
        //     U+2000  EN QUAD
        //     U+2001  EM QUAD
        //     U+2002  EN SPACE
        //     U+2003  EM SPACE
        //     U+2004  THREE-PER-EM SPACE
        //     U+2005  FOUR-PER-EM SPACE
        //     U+2006  SIX-PER-EM SPACE
        //     U+2007  FIGURE SPACE
        //     U+2008  PUNCTUATION SPACE
        //     U+2009  THIN SPACE
        //     U+200a  HAIR SPACE
        //     U+2028  LINE SEPARATOR
        //     U+2029  PARAGRAPH SEPARATOR
        //     U+202f  NARROW NO-BREAK SPACE
        //     U+205f  MEDIUM MATHEMATICAL SPACE
        //     U+3000  IDEOGRAPHIC SPACE
        ucfPrWhiteSpace = 1 << 20
}
TUniChProperties;
00960 
// Additional one-bit-per-property flags from PropList.txt.
typedef enum TUniChPropertiesX_
{
        // More properties from PropList.txt.
        // - Used to derive the properties in DerivedCoreProperties.txt.
        ucfPxOtherAlphabetic                = 1 << 0,
        ucfPxOtherDefaultIgnorableCodePoint = 1 << 1,
        ucfPxOtherGraphemeExtend            = 1 << 2,
        ucfPxOtherIdContinue                = 1 << 3,
        ucfPxOtherIdStart                   = 1 << 4,
        ucfPxOtherLowercase                 = 1 << 5,
        ucfPxOtherMath                      = 1 << 6,
        ucfPxOtherUppercase                 = 1 << 7,
        // - Used in ideographic description sequences.
        ucfPxIdsBinaryOperator              = 1 << 8,
        ucfPxIdsTrinaryOperator             = 1 << 9,
        ucfPxRadical                        = 1 << 10,
        ucfPxUnifiedIdeograph               = 1 << 11
}
TUniChPropertiesX;
00980 
00981 //-----------------------------------------------------------------------------
00982 // TUniChInfo -- contains information about a single Unicode codepoint
00983 //-----------------------------------------------------------------------------
00984 
00985 class TUniChInfo
00986 {
00987 public:
00988         enum { // combining classes (for 'combClass'); from UnicodeData.txt
00989                 ccStarter = 0, // 0: Spacing, split, enclosing, reordrant, and Tibetan subjoined
00990                 ccOverlaysAndInterior = 1,
00991                 ccNuktas = 7,
00992                 ccHiraganaKatakanaVoicingMarks = 8,
00993                 ccViramas = 9,
00994                 ccFixedPositionStart = 10, // Start of fixed position classes
00995                 ccFixedPositionEnd = 199, // End of fixed position classes
00996                 ccBelowLeftAttached = 200,
00997                 ccBelowAttached = 202,
00998                 ccBelowRightAttached = 204,
00999                 ccLeftAttached = 208, // Left attached (reordrant around single base character)
01000                 ccRightAttached = 210,
01001                 ccAboveLeftAttached = 212,
01002                 ccAboveAttached = 214,
01003                 ccAboveRightAttached = 216,
01004                 ccBelowLeft = 218,
01005                 ccBelow = 220,
01006                 ccBelowRight = 222,
01007                 ccLeft = 224, // Left (reordrant around single base character)
01008                 ccRight = 226,
01009                 ccAboveLeft = 228,
01010                 ccAbove = 230,
01011                 ccAboveRight = 232,
01012                 ccDoubleBelow = 233,
01013                 ccDoubleAbove = 234,
01014                 ccBelowIotaSubscript = 240, // Below (iota subscript)
01015                 ccInvalid = 255 // not defined by Unicode
01016         };
01017         char chCat, chSubCat; // chCat + chSubCat together comprise the general category (from UnicodeData.txt)
01018         uchar combClass; // canonical combining class
01019         TUniChCategory cat; // = TUniChCategory(chCat)
01020         TUniChSubCategory subCat; // = TUniChSubCategory(cat << 8 | subCat)
01021         signed char script; // keyId into 'TUniChDb.scriptNames'; -1 if unknown
01022         int simpleUpperCaseMapping, simpleLowerCaseMapping, simpleTitleCaseMapping; // from UnicodeData.txt
01023         int decompOffset; // offset into 'TUniChDb.decompositions'; or -1 if the character doesn't change during decomposition
01024         int nameOffset; // offset into 'TUniChDb.charNames'
01025         int flags; // a combination of TUniChFlags
01026         int properties; // a combination of TUniChProperties
01027         int propertiesX; // a combination of TUniChPropertiesX
01028         ushort lineBreak; // from LineBreak.txt
01029 
01030         // Converts a 2-letter linebreak code into a 16-bit integer.
01031         static inline ushort GetLineBreakCode(char c1, char c2) { return ((static_cast<ushort>(static_cast<uchar>(c1)) & 0xff) << 8) | ((static_cast<ushort>(static_cast<uchar>(c2)) & 0xff)); }
01032         static const ushort LineBreak_Unknown, LineBreak_ComplexContext, LineBreak_Numeric, LineBreak_InfixNumeric, LineBreak_Quotation;
01033 
01034 public:
01035         void InitAfterLoad() {
01036                 cat = (TUniChCategory) chCat;
01037                 subCat = (TUniChSubCategory) (((static_cast<int>(static_cast<uchar>(chCat)) & 0xff) << 8) | (static_cast<int>(static_cast<uchar>(chSubCat)) & 0xff)); }
01038         void SetCatAndSubCat(const TUniChSubCategory catAndSubCat) {
01039                 cat = (TUniChCategory) ((int(catAndSubCat) >> 8) & 0xff);
01040                 subCat = catAndSubCat;
01041                 chCat = (char) cat; chSubCat = (char) (int(subCat) & 0xff); }
01042         friend class TUniChDb;
01043 
01044         // Inexplicably missing from TSIn/TSOut...
01045         static inline void LoadUShort(TSIn& SIn, ushort& u) { SIn.LoadBf(&u, sizeof(u)); }
01046         static inline void LoadSChar(TSIn& SIn, signed char& u) { SIn.LoadBf(&u, sizeof(u)); }
01047         static inline void SaveUShort(TSOut& SOut, ushort u) { SOut.SaveBf(&u, sizeof(u)); }
01048         static inline void SaveSChar(TSOut& SOut, signed char u) { SOut.SaveBf(&u, sizeof(u)); }
01049 
01050 public:
01051         void Save(TSOut& SOut) const {
01052                 SOut.Save(chCat); SOut.Save(chSubCat); SOut.Save(combClass); SaveSChar(SOut, script);
01053                 SOut.Save(simpleUpperCaseMapping); SOut.Save(simpleLowerCaseMapping); SOut.Save(simpleTitleCaseMapping);
01054                 SOut.Save(decompOffset); SOut.Save(nameOffset);
01055                 SOut.Save(flags); SOut.Save(properties); SOut.Save(propertiesX); SaveUShort(SOut, lineBreak); }
01056         void Load(TSIn& SIn) {
01057                 SIn.Load(chCat); SIn.Load(chSubCat); SIn.Load(combClass); LoadSChar(SIn, script);
01058                 SIn.Load(simpleUpperCaseMapping); SIn.Load(simpleLowerCaseMapping); SIn.Load(simpleTitleCaseMapping);
01059                 SIn.Load(decompOffset); SIn.Load(nameOffset);
01060                 SIn.Load(flags); SIn.Load(properties); SIn.Load(propertiesX); LoadUShort(SIn, lineBreak); InitAfterLoad(); }
01061         explicit TUniChInfo(TSIn& SIn) { Load(SIn); }
01062         TUniChInfo() : chCat(char(ucOther)), chSubCat(char(ucOtherNotAssigned & 0xff)), combClass(ccInvalid),
01063                 script(-1),simpleUpperCaseMapping(-1), simpleLowerCaseMapping(-1), simpleTitleCaseMapping(-1),
01064                 decompOffset(-1), nameOffset(-1), flags(0), properties(0), propertiesX(0), lineBreak(LineBreak_Unknown) {
01065                 InitAfterLoad(); }
01066 
01067         // DerivedCoreProperties flags.
01068         bool IsDcpFlag(const TUniChFlags flag) const { Assert((flag & ucfDcpMask) == flag); return (flags & flag) == flag; }
01069         void ClrDcpFlags() { flags = flags & ~ucfDcpMask; }
01070         void SetDcpFlag(const TUniChFlags flag) { Assert((flag & ucfDcpMask) == flag); flags |= flag; }
01071         bool IsAlphabetic() const { return IsDcpFlag(ucfDcpAlphabetic); }
01072         bool IsUppercase() const { return IsDcpFlag(ucfDcpUppercase); }
01073         bool IsLowercase() const { return IsDcpFlag(ucfDcpLowercase); }
01074         bool IsMath() const { return IsDcpFlag(ucfDcpMath); }
01075         bool IsDefaultIgnorable() const { return IsDcpFlag(ucfDcpDefaultIgnorableCodePoint); }
01076         bool IsGraphemeBase() const { return IsDcpFlag(ucfDcpGraphemeBase); }
01077         bool IsGraphemeExtend() const { return IsDcpFlag(ucfDcpGraphemeExtend); }
01078         bool IsIdStart() const { return IsDcpFlag(ucfDcpIdStart); }
01079         bool IsIdContinue() const { return IsDcpFlag(ucfDcpIdContinue); }
01080         bool IsXidStart() const { return IsDcpFlag(ucfDcpXidStart); }
01081         bool IsXidContinue() const { return IsDcpFlag(ucfDcpXidContinue); }
01082 
01083         // PropList.txt flags.
              // IsProperty() is true only if every bit of 'flag' is set in 'properties';
              // SetProperty() ORs 'flag' into 'properties'.
01084         bool IsProperty(const TUniChProperties flag) const { return (properties & flag) == flag; }
01085         void SetProperty(const TUniChProperties flag) { properties |= flag; }
              // Shorthand predicates: each one passes a single ucfPr* constant to IsProperty().
01086         bool IsAsciiHexDigit() const { return IsProperty(ucfPrAsciiHexDigit); }
01087         bool IsBidiControl() const { return IsProperty(ucfPrBidiControl); }
01088         bool IsDash() const { return IsProperty(ucfPrDash); }
01089         bool IsDeprecated() const { return IsProperty(ucfPrDeprecated); }
01090         bool IsDiacritic() const { return IsProperty(ucfPrDiacritic); }
01091         bool IsExtender() const { return IsProperty(ucfPrExtender); }
01092         bool IsGraphemeLink() const { return IsProperty(ucfPrGraphemeLink); }
01093         bool IsHexDigit() const { return IsProperty(ucfPrHexDigit); }
01094         bool IsHyphen() const { return IsProperty(ucfPrHyphen); }
01095         bool IsIdeographic() const { return IsProperty(ucfPrIdeographic); }
01096         bool IsJoinControl() const { return IsProperty(ucfPrJoinControl); }
01097         bool IsLogicalOrderException() const { return IsProperty(ucfPrLogicalOrderException); }
01098         bool IsNoncharacter() const { return IsProperty(ucfPrNoncharacterCodePoint); }
01099         bool IsQuotationMark() const { return IsProperty(ucfPrQuotationMark); }
01100         bool IsSoftDotted() const { return IsProperty(ucfPrSoftDotted); }
01101         bool IsSTerminal() const { return IsProperty(ucfPrSTerm); }
01102         bool IsTerminalPunctuation() const { return IsProperty(ucfPrTerminalPunctuation); }
01103         bool IsVariationSelector() const { return IsProperty(ucfPrVariationSelector); }
01104         bool IsWhiteSpace() const { return IsProperty(ucfPrWhiteSpace); }
01105 
01106         // Additional PropList.txt flags; these are kept in the separate 'propertiesX' field.
              // IsPropertyX() requires every bit of 'flag' to be set; SetPropertyX() ORs 'flag' in.
01107         bool IsPropertyX(const TUniChPropertiesX flag) const { return (propertiesX & flag) == flag; }
01108         void SetPropertyX(const TUniChPropertiesX flag) { propertiesX |= flag; }
01109 
01110         // Miscellaneous flags (tested directly against the 'flags' field).
01111         bool IsCompositionExclusion() const { return (flags & ucfCompositionExclusion) == ucfCompositionExclusion; }
01112         bool IsCompatibilityDecomposition() const { return (flags & ucfCompatibilityDecomposition) == ucfCompatibilityDecomposition; }
01113 
01114         // Word-boundary flags: the bits of 'flags' selected by 'ucfWbMask'.
01115         bool IsWbFlag(const TUniChFlags flag) const {
                      Assert((flag & ucfWbMask) == flag);
                      return (flags & flag) == flag; }
              // Clears the word-boundary and the sentence-boundary bits in one go.
01116         void ClrWbAndSbFlags() {
                      flags = flags & ~(ucfWbMask | ucfSbMask); }
01117         void SetWbFlag(const TUniChFlags flag) {
                      Assert((flag & ucfWbMask) == flag);
                      flags |= flag; }
01118         int GetWbFlags() const { return flags & ucfWbMask; }
01119         bool IsWbFormat() const { return IsWbFlag(ucfWbFormat); }
01120         TStr GetWbFlagsStr() const { return GetWbFlagsStr(GetWbFlags()); }
              // Renders a set of word-boundary flags as text, one letter per set bit, always
              // in the order A(Letter) F(ormat) K(atakana) M(idLetter) m(idNum) N(umeric) E(xtendNumLet).
01121         static TStr GetWbFlagsStr(const int flags) {
                      TStr str("");
                      if (flags & ucfWbALetter) str = str + "A";
                      if (flags & ucfWbFormat) str = str + "F";
                      if (flags & ucfWbKatakana) str = str + "K";
                      if (flags & ucfWbMidLetter) str = str + "M";
                      if (flags & ucfWbMidNum) str = str + "m";
                      if (flags & ucfWbNumeric) str = str + "N";
                      if (flags & ucfWbExtendNumLet) str = str + "E";
                      return str; }
01124 
01125         // Sentence-boundary flags: the bits of 'flags' selected by 'ucfSbMask'.
01126         bool IsSbFlag(const TUniChFlags flag) const {
                      Assert((flag & ucfSbMask) == flag);
                      return (flags & flag) == flag; }
01127         void SetSbFlag(const TUniChFlags flag) {
                      Assert((flag & ucfSbMask) == flag);
                      flags |= flag; }
01128         int GetSbFlags() const { return flags & ucfSbMask; }
01129         bool IsSbFormat() const { return IsSbFlag(ucfSbFormat); }
01130         TStr GetSbFlagsStr() const { return GetSbFlagsStr(GetSbFlags()); }
              // Renders a set of sentence-boundary flags as text, one character per set bit, always
              // in the order S(ep) F(ormat) _(Sp) L(ower) U(pper) O(Letter) N(umeric) A(Term) T(=STerm) C(lose).
01131         static TStr GetSbFlagsStr(const int flags) {
                      TStr str("");
                      if (flags & ucfSbSep) str = str + "S";
                      if (flags & ucfSbFormat) str = str + "F";
                      if (flags & ucfSbSp) str = str + "_";
                      if (flags & ucfSbLower) str = str + "L";
                      if (flags & ucfSbUpper) str = str + "U";
                      if (flags & ucfSbOLetter) str = str + "O";
                      if (flags & ucfSbNumeric) str = str + "N";
                      if (flags & ucfSbATerm) str = str + "A";
                      if (flags & ucfSbSTerm) str = str + "T";
                      if (flags & ucfSbClose) str = str + "C";
                      return str; }
01135 
              // Same test as IsSbFlag(ucfSbSep), but without the mask assertion.
01136         bool IsSbSep() const {
                      return (flags & ucfSbSep) == ucfSbSep; }
01137 
01138         // Grapheme-boundary flags: "Extend" is simply the Grapheme_Extend property.
01139         bool IsGbExtend() const {
                      return IsGraphemeExtend(); }
01140 
01141         // Sec. 3.13, D47: C is cased iff it is uppercase, lowercase, or general_category == titlecase_letter.
01142         bool IsCased() const {
                      if (IsUppercase()) return true;
                      if (IsLowercase()) return true;
                      return subCat == ucLetterTitlecase; }
01143 
01144         // Character categories.  The accessors return the stored 'cat' / 'subCat'
              // fields cast to their enum types.
01145         TUniChCategory GetCat() const { return (TUniChCategory) cat; }
01146         TUniChSubCategory GetSubCat() const { return (TUniChSubCategory) subCat; }
01147         // The following characters belong to the 'symbol/currency' subcategory:
01148         //     U+00024  DOLLAR SIGN
01149         //     U+000a2  CENT SIGN
01150         //     U+000a3  POUND SIGN
01151         //     U+000a4  CURRENCY SIGN
01152         //     U+000a5  YEN SIGN
01153         //     U+020a3  FRENCH FRANC SIGN
01154         //     U+020a4  LIRA SIGN
01155         //     U+020ac  EURO SIGN
01156         //     [and plenty of others]
01157         bool IsCurrency() const { return subCat == ucSymbolCurrency; }
01158         // Note: most private-use and surrogate characters aren't listed explicitly in UnicodeData.txt.
01159         // Thus, it's better to call TUniChDb's versions of these methods, which are aware of
01160         // the full ranges of private-use and surrogate characters.
              // The two methods below look only at this record's 'subCat'.
01161         bool IsPrivateUse() const { return subCat == ucOtherPrivateUse; }
01162         bool IsSurrogate() const { return subCat == ucOtherSurrogate; }
01163 
01164         inline static bool IsValidSubCat(const char chCat, const char chSubCat) {
                      // Returns true iff the two characters, taken together, are one of the
                      // two-letter general-category codes listed below.  The table is a flat
                      // string of 2-character entries, so only even offsets are compared.
01165                 static const char codes[] = "LuLlLtLmLoMnMcMeNdNlNoPcPdPsPePiPfPoSmScSkSoZsZlZpCcCfCsCoCn";
                      const size_t nCodes = (sizeof(codes) - 1) / 2;
01166                 for (size_t k = 0; k < nCodes; k++) {
                              const char *entry = codes + 2 * k;
01167                         if (entry[0] == chCat && entry[1] == chSubCat) return true; }
01168                 return false; }
01169 };
01170 
01171 //-----------------------------------------------------------------------------
01172 // TUniTrie -- a trie for suffixes that should not appear at the end
01173 // of a sentence
      //
      // Strings are stored back to front.  Strings of length 1 and 2 are kept in
      // the hash sets 'singles' and 'pairs'.  For longer strings the last three
      // items form a key in 'roots'; the value is the index (in 'nodes') of the
      // node from which the remaining items, read right to left, descend.
      // Throughout the class, -1 is the "no such node" index.
01174 //-----------------------------------------------------------------------------
01175 
01176 template<typename TItem_>
01177 class TUniTrie
01178 {
01179 public:
01180         typedef TItem_ TItem;
01181 protected:
01182         class TNode {
01183         public:
01184                 TItem item;
01185                 int child, sib; // index in 'nodes' of the first child / next sibling; -1 = none
01186                 bool terminal;  // true if one of the stored strings ends at this node
01187                 TNode() : child(-1), sib(-1), terminal(false) { }
01188                 TNode(const TItem& item_, const int child_, const int sib_, const bool terminal_) : item(item_), child(child_), sib(sib_), terminal(terminal_) { }
01189         };
01190         typedef TVec<TNode> TNodeV;
01191         typedef TPair<TItem, TItem> TItemPr;
01192         typedef TTriple<TItem, TItem, TItem> TItemTr;
01193         typedef TUniVecIdx TVecIdx;
01194         THash<TItem, TVoid> singles; // strings of length 1
01195         THash<TItemPr, TVoid> pairs; // strings of length 2, keyed as (last, butLast)
01196         THash<TItemTr, TInt> roots;  // (last, butLast, butButLast) -> index of the root node in 'nodes'
01197         TNodeV nodes;
01198 public:
01199         TUniTrie() { }
01200         void Clr() { singles.Clr(); pairs.Clr(); roots.Clr(); nodes.Clr(); }
01201 
01202         bool Empty() const { return singles.Empty() && pairs.Empty() && roots.Empty(); }
01203 
01204         bool Has1Gram(const TItem& item) const { return singles.IsKey(item); }
01205         bool Has2Gram(const TItem& last, const TItem& butLast) const { return pairs.IsKey(TItemPr(last, butLast)); }
              // Returns the index of the root node for strings ending in
              // 'butButLast butLast last', or -1 if no stored string has that ending.
              // (0 cannot serve as the "not found" value: it is the index of the first
              // root that Add() creates.)
01206         int Get3GramRoot(const TItem& last, const TItem& butLast, const TItem& butButLast) const {
01207                 int keyId = roots.GetKeyId(TItemTr(last, butLast, butButLast));
01208                 if (keyId < 0) return -1; else return roots[keyId]; }
              // Returns the index of the child of 'parentIdx' that carries 'item', or -1
              // if there is none.  Children are scanned through the sibling list.
01209         int GetChild(const int parentIdx, const TItem& item) const {
01210                 for (int childIdx = nodes[parentIdx].child; childIdx >= 0; ) {
01211                         const TNode &node = nodes[childIdx];
01212                         if (node.item == item) return childIdx;
01213                         childIdx = node.sib; }
01214                 return -1; }
01215         bool IsNodeTerminal(const int nodeIdx) const { return nodes[nodeIdx].terminal; }
01216 
01217         // Adds a new string to the trie.  Note that the last characters appear
01218         // closer to the root of the trie.
              // The string is src[srcIdx .. srcIdx + srcCount - 1]; srcCount must be > 0.
01219         template<typename TSrcVec>
01220         void Add(const TSrcVec& src, const size_t srcIdx, const size_t srcCount)
01221         {
01222                 IAssert(srcCount > 0);
01223                 if (srcCount == 1) { singles.AddKey(TItem(src[TVecIdx(srcIdx)])); return; }
01224                 if (srcCount == 2) { pairs.AddKey(TItemPr(TItem(src[TVecIdx(srcIdx + 1)]), TItem(src[TVecIdx(srcIdx)]))); return; }
01225                 size_t srcLast = srcIdx + (srcCount - 1);
01226                 TItemTr tr = TItemTr(TItem(src[TVecIdx(srcLast)]), TItem(src[TVecIdx(srcLast - 1)]), TItem(src[TVecIdx(srcLast - 2)]));
                      // Find the root for the last three items, creating it if necessary.
01227                 int keyId = roots.GetKeyId(tr), curNodeIdx = -1;
01228                 if (keyId >= 0) curNodeIdx = roots[keyId];
01229                 else { curNodeIdx = nodes.Add(TNode(TItem(0), -1, -1, false)); roots.AddDat(tr, curNodeIdx); }
01230                 // Walk the remaining items from right to left, descending one level per item.
                      // The loop exits through the 'break' because srcPos is unsigned.
01231                 if (srcCount > 3) for (size_t srcPos = srcLast - 3; ; )
01232                 {
01233                         const TItem curItem = src[TVecIdx(srcPos)];
01234                         int childNodeIdx = nodes[curNodeIdx].child;
01235                         while (childNodeIdx >= 0) {
01236                                 TNode &childNode = nodes[childNodeIdx];
01237                                 if (childNode.item == curItem) break;
01238                                 childNodeIdx = childNode.sib; }
01239                         if (childNodeIdx < 0) {
                                      // No such child yet: prepend a new node to the sibling list.
01240                                 childNodeIdx = nodes.Add(TNode(curItem, -1, nodes[curNodeIdx].child, false));
01241                                 nodes[curNodeIdx].child = childNodeIdx; }
01242                         curNodeIdx = childNodeIdx;
01243                         if (srcPos == srcIdx) break; else srcPos--;
01244                 }
01245                 nodes[curNodeIdx].terminal = true;
01246         }
01247 
              // Adds the whole of 'src'.
01248         template<typename TSrcVec>
01249         void Add(const TSrcVec& src) { Add(src, 0, (size_t) src.Len()); }
01250 };
01251 
01252 //-----------------------------------------------------------------------------
01253 // TUniChDb -- provides access to the Unicode Character Database
01254 //-----------------------------------------------------------------------------
01255 
01256 class TUniChDb
01257 {
01258 protected:
01259         void InitAfterLoad(); // called at the end of Load(); defined elsewhere
01260         typedef TUniVecIdx TVecIdx;
01261 
01262 public:
01263         THash<TInt, TUniChInfo> h; // key: codepoint
01264         TStrPool charNames; // pool of character names; TUniChInfo::nameOffset is an offset into it (see GetCharName)
01265         TStrIntH scripts; // keyID = used in TUniChInfo.script; key = script name; dat = number of characters (informative only)
01266         TIntV decompositions; // NOTE(review): presumably the concatenated decomposition sequences -- confirm
01267         THash<TIntPr, TInt> inverseDec; // NOTE(review): presumably (first, second) -> composed code point -- confirm
01268         TUniCaseFolding caseFolding;
01269         // These hash tables contain only the unconditional mappings from SpecialCasing.txt.
01270         // The conditional mappings are hardcoded into GetCaseConverted().
01271         TIntIntVH specialCasingLower, specialCasingUpper, specialCasingTitle;
01272         int scriptUnknown; // key id of the script named "Unknown" in 'scripts'; returned by GetScript() as the fallback
01273 
01274         TUniChDb() : scriptUnknown(-1) { }
              // Constructs the database from a binary stream.  'scriptUnknown' starts at -1,
              // as in the default constructor, so it never holds an indeterminate value
              // while Load() runs.
01275         explicit TUniChDb(TSIn& SIn) : scriptUnknown(-1) { Load(SIn); }
01276         void Clr() {
                      // Empties every table that Save()/Load() handle.  'sbExTrie' is left alone;
                      // it has its own SbEx_Clr().
01277                 h.Clr(); charNames.Clr(); decompositions.Clr(); inverseDec.Clr(); caseFolding.Clr();
01278                 specialCasingLower.Clr(); specialCasingUpper.Clr(); specialCasingTitle.Clr();
                      // 'scripts' is emptied here, so a previously cached key id would be stale:
                      // go back to the default-constructed value.
01279                 scripts.Clr(); scriptUnknown = -1; }
01280         void Save(TSOut& SOut) const {
                      // Writes the tables in a fixed order, followed by a checksum (SaveCs).
                      // Load() below reads them back in exactly the same order.
01281                 h.Save(SOut); charNames.Save(SOut); decompositions.Save(SOut);
01282                 inverseDec.Save(SOut); caseFolding.Save(SOut); scripts.Save(SOut);
01283                 specialCasingLower.Save(SOut); specialCasingUpper.Save(SOut); specialCasingTitle.Save(SOut);
01284                 SOut.SaveCs(); }
01285         void Load(TSIn& SIn) {
                      // 'charNames' is rebuilt in place: its destructor is called explicitly and a
                      // new TStrPool is constructed from the stream at the same address.
01286                 h.Load(SIn); charNames.~TStrPool(); new (&charNames) TStrPool(SIn);
01287                 decompositions.Load(SIn);
01288                 inverseDec.Load(SIn); caseFolding.Load(SIn); scripts.Load(SIn);
01289                 specialCasingLower.Load(SIn); specialCasingUpper.Load(SIn); specialCasingTitle.Load(SIn);
01290                 SIn.LoadCs(); InitAfterLoad(); }
              // Opens the file 'fnBin' and loads the database from it.
01291         void LoadBin(const TStr& fnBin) {
01292                 PSIn SIn = TFIn::New(fnBin); Load(*SIn); }
01293         void Test(const TStr& basePath);
01294 
01295         // File names used by LoadTxt() and its subroutines.
              // Each getter returns a fixed name with no directory part.
01296         static TStr GetCaseFoldingFn() { return "CaseFolding.txt"; }
01297         static TStr GetSpecialCasingFn() { return "SpecialCasing.txt"; }
01298         static TStr GetUnicodeDataFn() { return "UnicodeData.txt"; }
01299         static TStr GetCompositionExclusionsFn() { return "CompositionExclusions.txt"; }
01300         static TStr GetScriptsFn() { return "Scripts.txt"; }
01301         static TStr GetDerivedCorePropsFn() { return "DerivedCoreProperties.txt"; }
01302         static TStr GetLineBreakFn() { return "LineBreak.txt"; }
01303         static TStr GetPropListFn() { return "PropList.txt"; }
01304         static TStr GetAuxiliaryDir() { return "auxiliary"; } // a directory name, not a file name
01305         static TStr GetWordBreakTestFn() { return "WordBreakTest.txt"; }
01306         static TStr GetWordBreakPropertyFn() { return "WordBreakProperty.txt"; }
01307         static TStr GetSentenceBreakTestFn() { return "SentenceBreakTest.txt"; }
01308         static TStr GetSentenceBreakPropertyFn() { return "SentenceBreakProperty.txt"; }
01309         static TStr GetNormalizationTestFn() { return "NormalizationTest.txt"; }
01310         static TStr GetBinFn() { return "UniChDb.bin"; } // used only by Test()
01311 
01312         //-------------------------------------------------------------------------
01313         // Script names
01314         //-------------------------------------------------------------------------
01315 
01316         // These constants are used when initializing from the text files.
01317         static TStr GetScriptNameUnknown() { return "Unknown"; }
01318         static TStr GetScriptNameKatakana() { return "Katakana"; }
01319         static TStr GetScriptNameHiragana() { return "Hiragana"; }
01320         // Script ids are key ids of the 'scripts' hash table.
01321         const TStr& GetScriptName(const int scriptId) const {
                      return scripts.GetKey(scriptId); }
01322         int GetScriptByName(const TStr& scriptName) const {
                      return scripts.GetKeyId(scriptName); }
              // Script of a character record; a negative stored value maps to 'scriptUnknown'.
01323         int GetScript(const TUniChInfo& ci) const {
                      const int scriptId = ci.script;
                      return (scriptId < 0) ? scriptUnknown : scriptId; }
              // Script of a code point; code points missing from 'h' map to 'scriptUnknown'.
01324         int GetScript(const int cp) const {
                      const int keyId = h.GetKeyId(cp);
                      if (keyId < 0) return scriptUnknown;
                      return GetScript(h[keyId]); }
01325 
01326         //-------------------------------------------------------------------------
01327         // Character names
01328         //-------------------------------------------------------------------------
01329 
01330         // GetCharName returns 0 if the name is unknown (code point not in 'h', or a negative
              // name offset); GetCharNameS then falls back to a string of the form "U+1234".
01331         const char *GetCharName(const int cp) const {
                      const int keyId = h.GetKeyId(cp);
                      if (keyId < 0) return 0;
                      const int nameOfs = h[keyId].nameOffset;
                      if (nameOfs < 0) return 0;
                      return charNames.GetCStr(nameOfs); }
01332         TStr GetCharNameS(const int cp) const {
01333                 // ToDo: Add special processing for precomposed Hangul syllables (UAX #15, sec. 16).
01334                 const char *name = GetCharName(cp);
                      if (name != 0) return name;
                      // No name on record: format the code point as lowercase hex, at least 4 digits.
01335                 char buf[20]; sprintf(buf, "U+%04x", cp);
                      return TStr(buf); }
01336         template<class TSrcVec> void PrintCharNames(FILE *f, const TSrcVec& src, size_t srcIdx, const size_t srcCount, const TStr& prefix) const {
                      // Prints one line per code point of src[srcIdx .. srcIdx + srcCount - 1]:
                      // the prefix, the code point in hex, and its name.  A null 'f' means stdout.
01337                 if (f == 0) f = stdout;
                      const size_t srcEnd = srcIdx + srcCount;
01338                 while (srcIdx < srcEnd) {
01339                         fprintf(f, "%s", prefix.CStr());
01340                         const int cp = src[TVecIdx(srcIdx)];
                              // The 4-digit form carries a trailing space, so both forms are 7 characters wide.
                              const char *cpFormat = (cp >= 0x10000) ? "U+%05x" : "U+%04x ";
                              fprintf(f, cpFormat, cp);
01341                         fprintf(f, " %s\n", GetCharNameS(cp).CStr());
                              srcIdx++; }}
              // Same, for the whole of 'src'.
01342         template<class TSrcVec> void PrintCharNames(FILE *f, const TSrcVec& src, const TStr& prefix) const {
                      PrintCharNames(f, src, 0, src.Len(), prefix); }
01343 
01344         //-------------------------------------------------------------------------
01345         // Character information
01346         //-------------------------------------------------------------------------
01347         // These methods provide access to a subset of the functionality
01348         // available in TUniChInfo.
01349 
01350         bool IsGetChInfo(const int cp, TUniChInfo& ChInfo) const {
                      // Copies the record of 'cp' into 'ChInfo' and returns true; returns false and
                      // leaves 'ChInfo' untouched if 'cp' is not in 'h'.  The method only reads the
                      // table, so it is const like the other accessors.
01351                 int i = h.GetKeyId(cp);
01352                 if (i < 0) return false; else { ChInfo=h[i]; return true; }}
01353         TUniChCategory GetCat(const int cp) const {
                      // Code points that are not in 'h' are reported as 'ucOther'.
                      const int keyId = h.GetKeyId(cp);
                      if (keyId < 0) return ucOther;
                      return h[keyId].cat; }
01354         TUniChSubCategory GetSubCat(const int cp) const {
                      // Code points that are not in 'h' are reported as 'ucOtherNotAssigned'.
                      const int keyId = h.GetKeyId(cp);
                      if (keyId < 0) return ucOtherNotAssigned;
                      return h[keyId].subCat; }
01355 
              // Word- and sentence-boundary flags of a code point; an unknown code point has none.
01356         bool IsWbFlag(const int cp, const TUniChFlags flag) const {
                      const int keyId = h.GetKeyId(cp);
                      if (keyId < 0) return false;
                      return h[keyId].IsWbFlag(flag); }
01357         int GetWbFlags(const int cp) const {
                      const int keyId = h.GetKeyId(cp);
                      if (keyId < 0) return 0;
                      return h[keyId].GetWbFlags(); }
01358         bool IsSbFlag(const int cp, const TUniChFlags flag) const {
                      const int keyId = h.GetKeyId(cp);
                      if (keyId < 0) return false;
                      return h[keyId].IsSbFlag(flag); }
01359         int GetSbFlags(const int cp) const {
                      const int keyId = h.GetKeyId(cp);
                      if (keyId < 0) return 0;
                      return h[keyId].GetSbFlags(); }
01360 
01361 #define ___UniFwd1(name) bool name(const int cp) const { int i = h.GetKeyId(cp); if (i < 0) return false; else return h[i].name(); }
      // ___UniFwd1(name), defined on the line above, expands to a member 'bool name(cp) const'
      // that looks 'cp' up in 'h' and forwards to TUniChInfo::name(); a code point
      // that is not in the table yields false.
      // ___UniFwd2 .. ___UniFwd5 just apply ___UniFwd1 to each of their arguments.
01362 #define ___UniFwd2(name1, name2) ___UniFwd1(name1) ___UniFwd1(name2)
01363 #define ___UniFwd3(name1, name2, name3) ___UniFwd2(name1, name2) ___UniFwd1(name3)
01364 #define ___UniFwd4(name1, name2, name3, name4) ___UniFwd3(name1, name2, name3) ___UniFwd1(name4)
01365 #define ___UniFwd5(name1, name2, name3, name4, name5) ___UniFwd4(name1, name2, name3, name4) ___UniFwd1(name5)
01366 
      // The list of TUniChInfo predicates to forward.  Its expansion depends on
      // whatever ___UniFwd1 is defined as at the point of use.
01367 #define DECLARE_FORWARDED_PROPERTY_METHODS \
01368         ___UniFwd5(IsAsciiHexDigit, IsBidiControl, IsDash, IsDeprecated, IsDiacritic) \
01369         ___UniFwd5(IsExtender, IsGraphemeLink, IsHexDigit, IsHyphen, IsIdeographic)  \
01370         ___UniFwd5(IsJoinControl, IsLogicalOrderException, IsNoncharacter, IsQuotationMark, IsSoftDotted)  \
01371         ___UniFwd4(IsSTerminal, IsTerminalPunctuation, IsVariationSelector, IsWhiteSpace)  \
01372         ___UniFwd5(IsAlphabetic, IsUppercase, IsLowercase, IsMath, IsDefaultIgnorable)  \
01373         ___UniFwd4(IsGraphemeBase, IsGraphemeExtend, IsIdStart, IsIdContinue)  \
01374         ___UniFwd2(IsXidStart, IsXidContinue)  \
01375         ___UniFwd3(IsCompositionExclusion, IsCompatibilityDecomposition, IsSbSep)  \
01376         ___UniFwd1(IsGbExtend)  \
01377         ___UniFwd2(IsCased, IsCurrency)
01378 
01379         DECLARE_FORWARDED_PROPERTY_METHODS
01380 
      // Only ___UniFwd1 is undefined here; ___UniFwd2..5 and
      // DECLARE_FORWARDED_PROPERTY_METHODS remain defined after this point.
      // NOTE(review): presumably so the list can be expanded again with a different
      // ___UniFwd1 later in the file -- confirm before adding further #undefs.
01381 #undef ___UniFwd1
01382 
01383         bool IsPrivateUse(const int cp) const {
                      // A code point listed in 'h' is judged by its own record.
01384                 const int keyId = h.GetKeyId(cp);
                      if (keyId >= 0) return h[keyId].IsPrivateUse();
                      // Otherwise fall back on the fixed private-use ranges.
01385                 const bool inPlane0Area = (0xe000 <= cp && cp <= 0xf8ff); // plane 0 private-use area
01386                 // Planes 15 and 16 are entirely for private use.
                      const bool inPlane15 = (0xf0000 <= cp && cp <= 0xffffd);
                      const bool inPlane16 = (0x100000 <= cp && cp <= 0x10fffd);
01387                 return inPlane0Area || inPlane15 || inPlane16; }
01388         // Note: d800..dbff are high surrogates, dc00..dfff are low surrogates.
01389         // For db80..dbff it is clear that the surrogate pair containing this high surrogate
01390         // will refer to a private-use codepoint, but IsPrivateUse nevertheless returns false
01391         // for db80..dbff.  This is consistent with the category codes assigned in UnicodeData.txt.
01392         bool IsSurrogate(const int cp) const {
                      // A code point listed in 'h' is judged by its own record.
01393                 int i = h.GetKeyId(cp); if (i >= 0) return h[i].IsSurrogate();
                      // Otherwise test the whole surrogate block, high and low halves alike.
01394                 return 0xd800 <= cp && cp <= 0xdfff; }
01395 
01396         // Note: in particular, all Hangul characters (HangulLBase..HangulLBase + HangulLCount - 1
01397         // and HangulSBase..HangulSBase + HangulSCount - 1) should be treated as starters
01398         // for composition to work correctly.
              // Code points that are not in 'h' are reported as starters.
01399         int GetCombiningClass(const int cp) const {
                      const int keyId = h.GetKeyId(cp);
                      if (keyId < 0) return TUniChInfo::ccStarter;
                      return h[keyId].combClass; }
01400 
01401         //-------------------------------------------------------------------------
01402         // Hangul constants
01403         //-------------------------------------------------------------------------
01404 
01405         enum {
              // S = precomposed syllables, L = leading consonants, V = vowels, T = trailing consonants.
              // *Base is the first code point of a range, *Count the number of entries in it.
              // NOTE(review): presumably consumed by the Hangul branches of Decompose()/Compose() -- confirm.
01406         HangulSBase = 0xAC00, HangulLBase = 0x1100, HangulVBase = 0x1161, HangulTBase = 0x11A7,
01407         HangulLCount = 19, HangulVCount = 21, HangulTCount = 28,
01408         HangulNCount = HangulVCount * HangulTCount,   // 588
01409         HangulSCount = HangulLCount * HangulNCount   // 11172
01410         };
01411 
01412         //-------------------------------------------------------------------------
01413         // Word boundaries (UAX #29)
01414         //-------------------------------------------------------------------------
01415 
01416 protected:
01417         // UAX #29, rule WB3: ignore Format and Extend characters.
01418         // [Note: rule SB5 for sentence boundaries is identical, and thus these methods will also be used for sentence-boundary detection.]
01419         static bool IsWbIgnored(const TUniChInfo& ci) {
                      if (ci.IsGbExtend()) return true;
                      return ci.IsWbFormat(); }
              // A code point that is not in 'h' is never ignored.
01420         bool IsWbIgnored(const int cp) const {
                      const int keyId = h.GetKeyId(cp);
                      if (keyId < 0) return false;
                      return IsWbIgnored(h[keyId]); }
01421         // Sets 'position' to the smallest index from 'position..srcEnd-1' that contains a non-ignored character
              // (or to 'srcEnd' if there is none).
01422         template<typename TSrcVec> void WbFindCurOrNextNonIgnored(const TSrcVec& src, size_t& position, const size_t srcEnd) const {
01423                 for (; position < srcEnd; position++)
                              if (! IsWbIgnored(src[TVecIdx(position)])) break; }
01424         // Sets 'position' to the smallest index from 'position+1..srcEnd-1' that contains a non-ignored character
              // (or to 'srcEnd' if there is none).  Does nothing if 'position' is already at or past 'srcEnd'.
01425         template<typename TSrcVec> void WbFindNextNonIgnored(const TSrcVec& src, size_t& position, const size_t srcEnd) const {
01426                 if (position >= srcEnd) return;
01427                 for (position++; position < srcEnd; position++)
                              if (! IsWbIgnored(src[TVecIdx(position)])) break; }
01428         // Like WbFindNextNonIgnored, except that if the character at 'position' is a sentence
              // separator (IsSbSep), 'position' advances by exactly one and nothing is skipped after it.
01429         template<typename TSrcVec> void WbFindNextNonIgnoredS(const TSrcVec& src, size_t& position, const size_t srcEnd) const {
01430                 if (position >= srcEnd) return;
01431                 const bool startsAtSep = IsSbSep(src[TVecIdx(position)]);
                      position++;
                      if (startsAtSep) return;
01432                 while (position < srcEnd && IsWbIgnored(src[TVecIdx(position)])) position++; }
01433         // Sets 'position' to the largest index from 'srcStart..position-1' that contains a non-ignored character
              // and returns true.  If there is no such index, returns false with 'position' moved
              // back to 'srcStart' (or left unchanged if it was not beyond 'srcStart').
01434         template<typename TSrcVec> bool WbFindPrevNonIgnored(const TSrcVec& src, const size_t srcStart, size_t& position) const {
01436                 while (position > srcStart) {
01437                         position--;
                              if (! IsWbIgnored(src[TVecIdx(position)])) return true; }
01438                 return false; }
01439         // Test driver for WbFind*NonIgnored.
01440         void TestWbFindNonIgnored(const TIntV& src) const;
01441         void TestWbFindNonIgnored() const;
01442 public:
01443         // Finds the next word boundary strictly after 'position'.
01444         // Note that there is a valid word boundary at 'srcIdx + srcCount'.
01445         // If there is no such word boundary, it returns 'false' and sets 'position' to 'srcIdx + srcCount'.
              // 'position' is passed by reference and is updated in place.
01446         template<typename TSrcVec>
01447         bool FindNextWordBoundary(const TSrcVec& src, const size_t srcIdx, const size_t srcCount, size_t &position) const;
01448         // Creates, in 'dest', a vector of 'srcCount + 1' elements, where 'dest[i]' tells if there is a word
01449         // boundary between 'src[srcIdx + i - 1]' and 'src[srcIdx + i]'.  Note that 'dest[0]' and 'dest[srcCount]' are
01450         // always set to 'true'.
01451         template<typename TSrcVec>
01452         void FindWordBoundaries(const TSrcVec& src, const size_t srcIdx, const size_t srcCount, TBoolV& dest) const;
01453 protected:
              // Shared test driver: 'sentence' selects sentence-boundary rather than word-boundary testing.
              // NOTE(review): inferred from the parameter name and from GetWordBreakTestFn() /
              // GetSentenceBreakTestFn(); the definition is not in this part of the file -- confirm.
01454         void TestFindNextWordOrSentenceBoundary(const TStr& basePath, bool sentence);
01455 
01456         //-------------------------------------------------------------------------
01457         // Sentence boundaries (UAX #29)
01458         //-------------------------------------------------------------------------
01459 
01460 protected:
01461         TUniTrie<TInt> sbExTrie; // sentence-boundary exceptions; maintained through the SbEx_* methods
01462 
01463         // Checks whether a sentence that ended at src[position - 1]
01464         // would end in one of the suffixes from sbExTrie.
01465         template<typename TSrcVec>
01466         bool CanSentenceEndHere(const TSrcVec& src, const size_t srcIdx, const size_t position) const;
01467 
01468 public:
01469         // Finds the next sentence boundary strictly after 'position'.
01470         // Note that there is a valid sentence boundary at 'srcIdx + srcCount'.
01471         // If there is no such sentence boundary, it returns 'false' and sets 'position' to 'srcIdx + srcCount'.
              // 'position' is passed by reference and is updated in place.
01472         template<typename TSrcVec>
01473         bool FindNextSentenceBoundary(const TSrcVec& src, const size_t srcIdx, const size_t srcCount, size_t &position) const;
01474         // Creates, in 'dest', a vector of 'srcCount + 1' elements, where 'dest[i]' tells if there is a sentence
01475         // boundary between 'src[srcIdx + i - 1]' and 'src[srcIdx + i]'.  Note that 'dest[0]' and 'dest[srcCount]' are
01476         // always set to 'true'.
01477         template<typename TSrcVec>
01478         void FindSentenceBoundaries(const TSrcVec& src, const size_t srcIdx, const size_t srcCount, TBoolV& dest) const;
01479 
01480         // These methods allow the user to define a set of sentence boundary exceptions.
01481         // This is a set of strings, stored in 'sbExTrie'.  If the Unicode rules require
01482         // a sentence boundary in a position that would cause the sentence to end with
01483         // 'x (STerm | ATerm) Close* Sp* Sep?', where 'x' is a word from 'sbExTrie',
01484         // we will *not* place a sentence boundary there.
01485         //
01486         // NOTE: sbExTrie is not saved or loaded by the Save() and Load() methods.
01487         // By default, it is empty.  Use SbEx_Clr() to clear it, and SbEx_SetStdEnglish() to obtain
01488         // a standard set of English-language exceptions.
01489         void SbEx_Clr() {
                      sbExTrie.Clr(); }
              // Adds one exception given as a vector of code points.
01490         template<class TSrcVec> void SbEx_Add(const TSrcVec& v) {
                      sbExTrie.Add(v); }
              // Overload for TStr (a plain overload, not a specialization of the template above):
              // every char of 's' is taken as one code point, using its unsigned value.
01492         void SbEx_Add(const TStr& s) {
                      const int len = s.Len();
01493                 TIntV items; items.Gen(len);
                      for (int k = 0; k < len; k++)
                              items[k] = int(uchar(s[k]));
                      SbEx_Add(items); }
              // As above, but 's' is first decoded from UTF-8 into code points.
01494         void SbEx_AddUtf8(const TStr& s) {
                      TUniCodec codec; TIntV codePoints;
                      codec.DecodeUtf8(s, codePoints);
                      SbEx_Add(codePoints); }
              // Adds every '|'-separated piece of 'words' and returns the number of pieces.
01495         int SbEx_AddMulti(const TStr& words, const bool wordsAreUtf8 = true) {
                      TStrV pieces; words.SplitOnAllCh('|', pieces);
                      const int nPieces = pieces.Len();
01496                 for (int k = 0; k < nPieces; k++) {
                              if (wordsAreUtf8) SbEx_AddUtf8(pieces[k]);
                              else SbEx_Add(pieces[k]); }
01497                 return nPieces; }
              // Replaces the whole exception set with a copy of 'newTrie'.
01498         void SbEx_Set(const TUniTrie<TInt>& newTrie) {
                      sbExTrie = newTrie; }
01499         int SbEx_SetStdEnglish() {
                      // Replaces the exception set with a list of common English abbreviations and
                      // returns the number of entries.  Entries are written WITHOUT their final
                      // period: the terminator is not part of the stored word (it is the
                      // 'STerm | ATerm' that follows 'x' in the pattern the exceptions are matched
                      // against), so an entry ending in '.' could never match.
                      // The list is plain ASCII, hence wordsAreUtf8 = false.
01500                 static const TStr data = "Ms|Mrs|Mr|Rev|Dr|Prof|Gov|Sen|Rep|Gen|Brig|Col|Capt|Lieut|Lt|Sgt|Pvt|Cmdr|Adm|Corp|St|Mt|Ft|e.g|e. g|i.e|i. e|ib|ibid|s.v|s. v|s.vv|s. vv";
01501                 SbEx_Clr(); return SbEx_AddMulti(data, false); }
01502 
01503         //-------------------------------------------------------------------------
01504         // Normalization, decomposition, etc. (UAX #15)
01505         //-------------------------------------------------------------------------
01506 
01507 protected:
01508         // Adds, to 'dest', the decomposition of 'codePoint' (calling itself recursively if necessary).
01509         // If 'compatibility == false', only canonical decompositions are used.
01510         template<typename TDestCh>
01511         void AddDecomposition(const int codePoint, TVec<TDestCh>& dest, const bool compatibility) const;
01512 public:
01513         // This appends, to 'dest', the decomposed form of the source string.
01514         // - for normalization form D (NFD), i.e. canonical decomposition: use compatibility == false;
01515         // - for normalization form KD (NFKD), i.e. compatibility decomposition: use compatibility == true.
01516         template<typename TSrcVec, typename TDestCh>
01517         void Decompose(const TSrcVec& src, size_t srcIdx, const size_t srcCount,
01518                         TVec<TDestCh>& dest, bool compatibility, bool clrDest = true) const;
              // Convenience overload: decomposes the whole of 'src'.
01519         template<typename TSrcVec, typename TDestCh>
01520         void Decompose(const TSrcVec& src, TVec<TDestCh>& dest, bool compatibility, bool clrDest = true) const {
01521                 Decompose(src, 0, src.Len(), dest, compatibility, clrDest); }
01522         // This performs canonical composition on the source string, and appends
01523         // the result to the destination string.  The source string should be the
01524         // result of a (canonical or compatibility) decomposition; if this is the
01525         // case, the composition will lead to a normalization form C (NFC) or
01526         // normalization form KC (NFKC), depending on whether canonical or compatibility
01527         // decomposition was used.
01528         template<typename TSrcVec, typename TDestCh>
01529         void Compose(const TSrcVec& src, size_t srcIdx, const size_t srcCount,
01530                         TVec<TDestCh>& dest, bool clrDest = true) const;
              // Convenience overload: composes the whole of 'src'.
01531         template<typename TSrcVec, typename TDestCh>
01532         void Compose(const TSrcVec& src, TVec<TDestCh>& dest, bool clrDest = true) const {
01533                 Compose(src, 0, src.Len(), dest, clrDest); }
01534         // Calls Decompose, followed by Compose; thus the result is the NFC (if
01535         // compatibility == false) or NFKC (if compatibility == true) of the source string.
01536         // A temporary TIntV is used to contain the intermediate NF(K)D form of the
01537         // source string.
01538         template<typename TSrcVec, typename TDestCh>
01539         void DecomposeAndCompose(const TSrcVec& src, size_t srcIdx, const size_t srcCount,
01540                         TVec<TDestCh>& dest, bool compatibility, bool clrDest = true) const;
              // Convenience overload: normalizes the whole of 'src'.
01541         template<typename TSrcVec, typename TDestCh>
01542         void DecomposeAndCompose(const TSrcVec& src, TVec<TDestCh>& dest, bool compatibility, bool clrDest = true) const {
01543                 DecomposeAndCompose(src, 0, src.Len(), dest, compatibility, clrDest); }
01544         // Copies the starter characters from 'src' to 'dest'; the other
01545         // characters are skipped.  'src' should already have been decomposed.
01546         // Returns the number of characters extracted.
01547         template<typename TSrcVec, typename TDestCh>
01548         size_t ExtractStarters(const TSrcVec& src, size_t srcIdx, const size_t srcCount,
01549                         TVec<TDestCh>& dest, bool clrDest = true) const;
              // Convenience overload: extracts from the whole of 'src'.
01550         template<typename TSrcVec, typename TDestCh>
01551         size_t ExtractStarters(const TSrcVec& src, TVec<TDestCh>& dest, bool clrDest = true) const {
01552                 return ExtractStarters(src, 0, src.Len(), dest, clrDest); }
01553         // Extracts the starters into a temporary vector and then copies it into 'src'.
01554         template<typename TSrcVec>
01555         size_t ExtractStarters(TSrcVec& src) const {
01556                 TIntV temp; size_t retVal = ExtractStarters(src, temp);
01557                 src.Clr(); for (int i = 0; i < temp.Len(); i++) src.Add(temp[i]);
01558                 return retVal; }
01559 
01560 protected:
01561         void TestComposition(const TStr& basePath);
01562 
01563         //-------------------------------------------------------------------------
01564         // Initialization from the text files
01565         //-------------------------------------------------------------------------
01566 
01567 protected:
01568         void InitWordAndSentenceBoundaryFlags(const TStr& basePath);
01569         void InitScripts(const TStr& basePath);
01570         void InitLineBreaks(const TStr& basePath);
01571         void InitDerivedCoreProperties(const TStr& basePath);
01572         void InitPropList(const TStr& basePath);
01573         void InitSpecialCasing(const TStr& basePath);
01574         void LoadTxt_ProcessDecomposition(TUniChInfo& ci, TStr s);
01575 public:
01576         void LoadTxt(const TStr& basePath);
01577         void SaveBin(const TStr& fnBinUcd);
01578 
01579         //-------------------------------------------------------------------------
01580         // Case conversions
01581         //-------------------------------------------------------------------------
01582 
01583 public:
// Kinds of case conversion; ccMax is one past the last real conversion kind.
typedef enum TCaseConversion_ {
    ccLower = 0,
    ccUpper = 1,
    ccTitle = 2,
    ccMax = 3
} TCaseConversion;
01585         // Appends the case-converted form of 'src' to 'dest'.
01586         // 'how' defines what kind of case conversion is required.
01587         // 'turkic' should be set to true iff the text is in Turkic ('tr') or Azeri ('az').
01588         // 'lithuanian' should be set to true iff the text is in Lithuanian ('lt').
01589         template<typename TSrcVec, typename TDestCh> void GetCaseConverted(const TSrcVec& src, size_t srcIdx, const size_t srcCount, TVec<TDestCh>& dest, const bool clrDest, const TCaseConversion how, const bool turkic, const bool lithuanian) const;
01590         template<typename TSrcVec, typename TDestCh> void GetLowerCase(const TSrcVec& src, size_t srcIdx, const size_t srcCount, TVec<TDestCh>& dest, const bool clrDest = true, const bool turkic = false, const bool lithuanian = false) const { GetCaseConverted(src, srcIdx, srcCount, dest, clrDest, ccLower, turkic, lithuanian); }
01591         template<typename TSrcVec, typename TDestCh> void GetUpperCase(const TSrcVec& src, size_t srcIdx, const size_t srcCount, TVec<TDestCh>& dest, const bool clrDest = true, const bool turkic = false, const bool lithuanian = false) const { GetCaseConverted(src, srcIdx, srcCount, dest, clrDest, ccUpper, turkic, lithuanian); }
01592         template<typename TSrcVec, typename TDestCh> void GetTitleCase(const TSrcVec& src, size_t srcIdx, const size_t srcCount, TVec<TDestCh>& dest, const bool clrDest = true, const bool turkic = false, const bool lithuanian = false) const { GetCaseConverted(src, srcIdx, srcCount, dest, clrDest, ccTitle, turkic, lithuanian); }
01593         template<typename TSrcVec, typename TDestCh> void GetLowerCase(const TSrcVec& src, TVec<TDestCh>& dest, const bool clrDest = true, const bool turkic = false, const bool lithuanian = false) const { GetLowerCase(src, 0, src.Len(), dest, clrDest, turkic, lithuanian); }
01594         template<typename TSrcVec, typename TDestCh> void GetUpperCase(const TSrcVec& src, TVec<TDestCh>& dest, const bool clrDest = true, const bool turkic = false, const bool lithuanian = false) const { GetUpperCase(src, 0, src.Len(), dest, clrDest, turkic, lithuanian); }
01595         template<typename TSrcVec, typename TDestCh> void GetTitleCase(const TSrcVec& src, TVec<TDestCh>& dest, const bool clrDest = true, const bool turkic = false, const bool lithuanian = false) const { GetTitleCase(src, 0, src.Len(), dest, clrDest, turkic, lithuanian); }
01596 
01597         // GetSimpleCaseConverted uses only the simple case mappings (from UnicodeData.txt).
01598         // This is simpler and faster.  Since each character now maps into exactly one
01599         // character, case conversion can also be done in place (see ToSimpleCaseConverted, etc.).
01600         template<typename TSrcVec, typename TDestCh> void GetSimpleCaseConverted(const TSrcVec& src, size_t srcIdx, const size_t srcCount, TVec<TDestCh>& dest, const bool clrDest, const TCaseConversion how) const;
01601         template<typename TSrcVec, typename TDestCh> void GetSimpleLowerCase(const TSrcVec& src, size_t srcIdx, const size_t srcCount, TVec<TDestCh>& dest, const bool clrDest = true) const { GetSimpleCaseConverted(src, srcIdx, srcCount, dest, clrDest, ccLower); }
01602         template<typename TSrcVec, typename TDestCh> void GetSimpleUpperCase(const TSrcVec& src, size_t srcIdx, const size_t srcCount, TVec<TDestCh>& dest, const bool clrDest = true) const { GetSimpleCaseConverted(src, srcIdx, srcCount, dest, clrDest, ccUpper); }
01603         template<typename TSrcVec, typename TDestCh> void GetSimpleTitleCase(const TSrcVec& src, size_t srcIdx, const size_t srcCount, TVec<TDestCh>& dest, const bool clrDest = true) const { GetSimpleCaseConverted(src, srcIdx, srcCount, dest, clrDest, ccTitle); }
01604         template<typename TSrcVec, typename TDestCh> void GetSimpleLowerCase(const TSrcVec& src, TVec<TDestCh>& dest, const bool clrDest = true) const { GetSimpleLowerCase(src, 0, src.Len(), dest, clrDest); }
01605         template<typename TSrcVec, typename TDestCh> void GetSimpleUpperCase(const TSrcVec& src, TVec<TDestCh>& dest, const bool clrDest = true) const { GetSimpleUpperCase(src, 0, src.Len(), dest, clrDest); }
01606         template<typename TSrcVec, typename TDestCh> void GetSimpleTitleCase(const TSrcVec& src, TVec<TDestCh>& dest, const bool clrDest = true) const { GetSimpleTitleCase(src, 0, src.Len(), dest, clrDest); }
01607 
01608         template<typename TSrcVec> void ToSimpleCaseConverted(TSrcVec& src, size_t srcIdx, const size_t srcCount, const TCaseConversion how) const;
01609         template<typename TSrcVec> void ToSimpleUpperCase(TSrcVec& src, size_t srcIdx, const size_t srcCount) const { ToSimpleCaseConverted(src, srcIdx, srcCount, ccUpper); }
01610         template<typename TSrcVec> void ToSimpleLowerCase(TSrcVec& src, size_t srcIdx, const size_t srcCount) const { ToSimpleCaseConverted(src, srcIdx, srcCount, ccLower); }
01611         template<typename TSrcVec> void ToSimpleTitleCase(TSrcVec& src, size_t srcIdx, const size_t srcCount) const { ToSimpleCaseConverted(src, srcIdx, srcCount, ccTitle); }
01612         template<typename TSrcVec> void ToSimpleUpperCase(TSrcVec& src) const { ToSimpleUpperCase(src, 0, src.Len()); }
01613         template<typename TSrcVec> void ToSimpleLowerCase(TSrcVec& src) const { ToSimpleLowerCase(src, 0, src.Len()); }
01614         template<typename TSrcVec> void ToSimpleTitleCase(TSrcVec& src) const { ToSimpleTitleCase(src, 0, src.Len()); }
01615 
01616 public:
01617         friend class TUniCaseFolding;
01618 
01619         // Case folding is an alternative to the above functions.  It is intended primarily
01620         // to produce strings that are suitable for comparisons.  For example,
01621         // ToLowerCase(sigma) = sigma, ToLowerCase(final-sigma) = final-sigma;
01622         // but ToCaseFolded(sigma) = sigma, ToCaseFolded(final-sigma) = sigma.
01623         // - 'turkic' enables special processing for Turkic languages (I-dot and i-dotless).
01624         // - 'full' enables full case mappings -- i.e. sometimes a character may be mapped
01625         //   into a string of two or more characters.
01626         // - Note: For best results, perform NFD(CaseFold(NFD(x))) or NFKD(CaseFold(NFKD(x))) on
01627         //   each string before comparing them (see sec. 3.13 of the standard).
01628         template<typename TSrcVec, typename TDestCh>
01629         void GetCaseFolded(const TSrcVec& src, size_t srcIdx, const size_t srcCount,
01630                 TVec<TDestCh>& dest, const bool clrDest, const bool full, const bool turkic = false) const { caseFolding.Fold(src, srcIdx, srcCount, dest, clrDest, full, turkic); }
01631         template<typename TSrcVec, typename TDestCh>
01632         void GetCaseFolded(const TSrcVec& src, TVec<TDestCh>& dest, const bool clrDest = true, const bool full = true, const bool turkic = false) const {
01633                 GetCaseFolded(src, 0, src.Len(), dest, clrDest, full, turkic); }
01634         // ToCaseFolded folds the string in place.  However, this means that only the simple
01635         // case foldings can be used (the full ones could increase the length of the string).
01636         template<typename TSrcVec> void ToCaseFolded(TSrcVec& src, size_t srcIdx, const size_t srcCount, const bool turkic = false) const { caseFolding.FoldInPlace(src, srcIdx, srcCount, turkic); }
01637         template<typename TSrcVec> void ToCaseFolded(TSrcVec& src, const bool turkic = false) const { ToCaseFolded(src, 0, src.Len(), turkic); }
01638 
01639 protected:
01640         void TestCaseConversion(const TStr& source, const TStr& trueLc, const TStr& trueTc, const TStr& trueUc, bool turkic, bool lithuanian);
01641         void TestCaseConversions();
01642 
01643         //-------------------------------------------------------------------------
01644         // Text file reader for the Unicode character database
01645         //-------------------------------------------------------------------------
01646 
01647 protected:
01648 
01649         class TUcdFileReader
01650         {
01651         protected:
01652                 TChA buf;
01653         public:
01654                 TChA comment; // contains '#' and everything after it
01655         protected:
01656                 FILE *f;
01657                 int putBackCh;
01658                 int GetCh() {
01659                         if (putBackCh >= 0) { int c = putBackCh; putBackCh = EOF; return c; }
01660                         return fgetc(f); }
01661                 void PutBack(int c) { Assert(putBackCh == EOF); putBackCh = c; }
01662                 // Returns 'false' iff the EOF was encountered before anything was read.
01663                 bool ReadNextLine() {
01664                         buf.Clr(); comment.Clr();
01665                         bool inComment = false, first = true;
01666                         while (true) {
01667                                 int c = GetCh();
01668                                 if (c == EOF) return ! first;
01669                                 else if (c == 13) {
01670                                         c = GetCh(); if (c != 10) PutBack(c);
01671                                         return true; }
01672                                 else if (c == 10) return true;
01673                                 else if (c == '#') inComment = true;
01674                                 if (! inComment) buf += char(c);
01675                                 else comment += char(c); }
01676                                 /*first = false;*/}
01677         private:
01678                 TUcdFileReader& operator = (const TUcdFileReader& r) { Fail; return *((TUcdFileReader *) 0); }
01679                 TUcdFileReader(const TUcdFileReader& r) { Fail; }
01680         public:
01681                 TUcdFileReader() : f(0) { }
01682                 TUcdFileReader(const TStr& fileName) : f(0), putBackCh(EOF) { Open(fileName); }
01683                 void Open(const TStr& fileName) { Close(); f = fopen(fileName.CStr(), "rt"); IAssertR(f, fileName); putBackCh = EOF; }
01684                 void Close() { putBackCh = EOF; if (f) { fclose(f); f = 0; }}
01685                 ~TUcdFileReader() { Close(); }
01686                 bool GetNextLine(TStrV& dest) {
01687                         dest.Clr();
01688                         while (true) {
01689                                 if (! ReadNextLine()) return false;
01690                                 TStr line = buf; line.ToTrunc();
01691                                 if (line.Len() <= 0) continue;
01692                                 line.SplitOnAllCh(';', dest, false);
01693                                 for (int i = 0; i < dest.Len(); i++) dest[i].ToTrunc();
01694                                 return true; }}
01695                 static int ParseCodePoint(const TStr& s) {
01696                         int c; bool ok = s.IsHexInt(true, 0, 0x10ffff, c); IAssertR(ok, s); return c; }
01697                 static void ParseCodePointList(const TStr& s, TIntV& dest, bool ClrDestP = true) { // space-separated list
01698                         if (ClrDestP) dest.Clr();
01699                         TStrV parts; s.SplitOnWs(parts);
01700                         for (int i = 0; i < parts.Len(); i++) {
01701                                 int c; bool ok = parts[i].IsHexInt(true, 0, 0x10ffff, c); IAssertR(ok, s);
01702                                 dest.Add(c); } }
01703                 static void ParseCodePointRange(const TStr& s, int& from, int &to) { // xxxx or xxxx..yyyy
01704                         int i = s.SearchStr(".."); if (i < 0) { from = ParseCodePoint(s); to = from; return; }
01705                         from = ParseCodePoint(s.GetSubStr(0, i - 1));
01706                         to = ParseCodePoint(s.GetSubStr(i + 2, s.Len() - 1)); }
01707         };
01708 
01709         //-------------------------------------------------------------------------
01710         // Helper class for processing the text files
01711         //-------------------------------------------------------------------------
01712         // Files such as DerivedCoreProps.txt often refer to ranges of codepoints,
01713         // and not all codepoints from the range have also been listed in
01714         // UnicodeData.txt.  Thus, new TUniChInfo instances will be created
01715         // when processing DerivedCoreProps.txt and similar files.
01716         // To assign the correct (sub)categories to these new codepoints,
01717         // the following class will extract the subcategory info from the
01718         // comments in DerivedCoreProps.txt and similar files.
01719 
01720         class TSubcatHelper
01721         {
01722         public:
01723                 bool hasCat; TUniChSubCategory subCat;
01724                 TStrH invalidCatCodes;
01725                 TUniChDb &owner;
01726 
01727                 TSubcatHelper(TUniChDb &owner_) : owner(owner_) { }
01728 
01729                 void ProcessComment(TUniChDb::TUcdFileReader &reader)
01730                 {
01731                         hasCat = false; subCat = ucOtherNotAssigned;
01732                         if (reader.comment.Len() > 3)
01733                         {
01734                                 IAssert(reader.comment[0] == '#');
01735                                 IAssert(reader.comment[1] == ' ');
01736                                 char chCat = reader.comment[2], chSubCat = reader.comment[3];
01737                                 if (reader.comment.Len() > 4) IAssert(isspace(uchar(reader.comment[4])));
01738                                 if (TUniChInfo::IsValidSubCat(chCat, chSubCat)) {
01739                                         hasCat = true; subCat = (TUniChSubCategory) ((int(uchar(chCat)) << 8) | (int(uchar(chSubCat)))); }
01740                                 else invalidCatCodes.AddKey(TStr(chCat) + TStr(chSubCat));
01741                         }
01742                 }
01743 
01744                 void SetCat(const int cp) {
01745                         int i = owner.h.GetKeyId(cp); IAssert(i >= 0);
01746                         IAssert(owner.h[i].subCat == ucOtherNotAssigned);
01747                         IAssert(hasCat);
01748                         owner.h[i].SetCatAndSubCat(subCat); }
01749                 void TestCat(const int cp) {
01750                         if (! hasCat) return;
01751                         int i = owner.h.GetKeyId(cp); IAssert(i >= 0);
01752                         IAssert(owner.h[i].subCat == subCat); }
01753 
01754                 ~TSubcatHelper()
01755                 {
01756                         if (invalidCatCodes.IsKey("L&")) invalidCatCodes.DelKey("L&");
01757                         // Output any unexpected ones (there shouldn't be any).
01758                         if (! invalidCatCodes.Empty()) {
01759                                 printf("Invalid cat code(s) in the comments: ");
01760                                 for (int i = invalidCatCodes.FFirstKeyId(); invalidCatCodes.FNextKeyId(i); )
01761                                         printf(" \"%s\"", invalidCatCodes.GetKey(i).CStr());
01762                                 printf("\n"); }
01763                 }
01764         };
01765 };
01766 
01767 //-----------------------------------------------------------------------------
01768 // TUnicode -- a sadly emasculated wrapper around TUniCodec and TUniChDb
01769 //-----------------------------------------------------------------------------
01770 
01771 class TUnicode
01772 {
01773 public:
01774         TUniCodec codec;
01775         TUniChDb ucd;
01776 
01777         TUnicode() { Init(); }
01778         explicit TUnicode(const TStr& fnBinUcd) { ucd.LoadBin(fnBinUcd); Init(); }
01779         void Init() { InitCodecs(); }
01780 
01781         //-----------------------------------------------------------------------
01782         // UTF-8
01783         //-----------------------------------------------------------------------
01784 
01785         // Returns the number of characters that have been successfully decoded.
01786         // This does not include any replacement characters that may have been inserted into 'dest'.
01787         int DecodeUtf8(const TIntV& src, TIntV& dest) const { return (int) codec.DecodeUtf8(src, dest); }
01788         int DecodeUtf8(const TStr& src, TIntV& dest) const { return (int) codec.DecodeUtf8(src, dest); }
01789 
01790         // Returns the number of characters that have been successfully encoded.
01791         // This does not include any replacement characters that may have been inserted into 'dest'.
01792         int EncodeUtf8(const TIntV& src, TIntV& dest) const { return (int) codec.EncodeUtf8(src, dest); }
01793 
01794         // The following wrapper around the UTF-8 encoder returns a TStr containing
01795         // the UTF-8-encoded version of the input string.
01796         TStr EncodeUtf8Str(const TIntV& src) const { return codec.EncodeUtf8Str(src); }
01797 
01798         // encoding one character to UTF8
01799         static void EncodeUtf8(const uint& Ch, TChA& Dest);
01800         static TStr EncodeUtf8(const uint& Ch);
01801 
01802         //-----------------------------------------------------------------------
01803         // UTF-16 Decoder
01804         //-----------------------------------------------------------------------
01805 
01806         // Returns the number of characters that have been successfully decoded.
01807         // This does not include any replacement characters that may have been inserted into 'dest'.
01808         // Each element of 'src' is assumed to contain one byte of data.
01809         // srcCount must be even (though srcIdx doesn't need to be).
01810         int DecodeUtf16FromBytes(const TIntV& src, TIntV& dest,
01811                 const TUtf16BomHandling bomHandling = bomAllowed,
01812                 const TUniByteOrder defaultByteOrder = boMachineEndian) const {
01813                         return (int) codec.DecodeUtf16FromBytes(src, 0, src.Len(), dest, true, bomHandling, defaultByteOrder); }
01814 
01815         // Here, each element of 'src' is treated as a 16-bit word.  The byte-order settings
01816         // are used to determine if the two bytes of each word should be swapped before further
01817         // processing.  For example, if a BOM is present, it must have the value 0xfeff; if it
01818         // actually has the value 0xfffe, this means that the two bytes of each word must be swapped.
01819         // Basically, the combination of the byteOrder parameter and the byte order mark (if present) at the
01820         // beginning of the source data is used to determine the "original" byte order of the data;
01821         // if this doesn't match the byte order of the local machine, the two bytes of each word will
01822         // be swapped during the decoding process.
01823         int DecodeUtf16FromWords(const TIntV& src, TIntV& dest,
01824                 const TUtf16BomHandling bomHandling = bomAllowed,
01825                 const TUniByteOrder defaultByteOrder = boMachineEndian) const {
01826                         return (int) codec.DecodeUtf16FromWords(src, 0, src.Len(), dest, true, bomHandling, defaultByteOrder); }
01827 
01828         //-----------------------------------------------------------------------
01829         // UTF-16 Encoder
01830         //-----------------------------------------------------------------------
01831 
01832         // Returns the number of characters that have been successfully encoded.
01833         // This does not include any replacement characters that may have been inserted into 'dest'.
01834         int EncodeUtf16ToWords(const TIntV& src, TIntV& dest, const bool insertBom,
01835                 const TUniByteOrder destByteOrder = boMachineEndian) const {
01836                         return (int) codec.EncodeUtf16ToWords(src, 0, src.Len(), dest, true, insertBom, destByteOrder); }
01837 
01838         int EncodeUtf16ToBytes(const TIntV& src, TIntV& dest, const bool insertBom,
01839                 const TUniByteOrder destByteOrder = boMachineEndian) const {
01840                         return (int) codec.EncodeUtf16ToBytes(src, 0, src.Len(), dest, true, insertBom, destByteOrder); }
01841 
01842         //-----------------------------------------------------------------------
01843         // 8-bit codecs
01844         //-----------------------------------------------------------------------
01845 
01846         T8BitCodec<TEncoding_ISO8859_1> iso8859_1;
01847         T8BitCodec<TEncoding_ISO8859_2> iso8859_2;
01848         T8BitCodec<TEncoding_ISO8859_3> iso8859_3;
01849         T8BitCodec<TEncoding_ISO8859_4> iso8859_4;
01850         T8BitCodec<TEncoding_YuAscii> yuAscii;
01851         T8BitCodec<TEncoding_CP1250> cp1250;
01852         T8BitCodec<TEncoding_CP852> cp852;
01853         T8BitCodec<TEncoding_CP437> cp437;
01854 
01855         //-----------------------------------------------------------------------
01856         // Codec registry
01857         //-----------------------------------------------------------------------
01858         // If you know you'll need ISO-8859-2, just use
01859         //   TUnicode unicode;
01860         //   unicode.iso8859_2.Encode(...);
01861         // If you don't know what you'll need, use:
01862         //   TUnicode unicode;
01863         //   PCodecBase myCodec = unicode.GetCodec(myCodecName);
01864         //   myCodec->Encode(...);
01865         // Note that the first approach is slightly more efficient because there
01866         // aren't any virtual method calls involved.
01867 
01868 protected:
01869         THash<TStr, PCodecBase> codecs;
01870         static inline TStr NormalizeCodecName(const TStr& name) {
01871                 TStr s = name.GetLc(); s.ChangeStrAll("_", ""); s.ChangeStrAll("-", ""); return s; }
01872 public:
01873         void RegisterCodec(const TStr& nameList, const PCodecBase& codec) {
01874                 TStrV names; nameList.SplitOnWs(names);
01875                 for (int i = 0; i < names.Len(); i++)
01876                         codecs.AddDat(NormalizeCodecName(names[i]), codec); }
01877         void UnregisterCodec(const TStr& nameList) {
01878                 TStrV names; nameList.SplitOnWs(names);
01879                 for (int i = 0; i < names.Len(); i++)
01880                         codecs.DelKey(NormalizeCodecName(names[i])); }
01881         void ClrCodecs() { codecs.Clr(); }
01882         void InitCodecs();
01883         PCodecBase GetCodec(const TStr& name) const {
01884                 TStr s = NormalizeCodecName(name);
01885                 PCodecBase p; if (! codecs.IsKeyGetDat(s, p)) p.Clr();
01886                 return p; }
01887         void GetAllCodecs(TCodecBaseV& dest) const {
01888                 dest.Clr();
01889                 for (int i = codecs.FFirstKeyId(); codecs.FNextKeyId(i); ) {
01890                         PCodecBase codec = codecs[i]; bool found = false;
01891                         for (int j = 0; j < dest.Len(); j++) if (dest[j]() == codec()) { found = true; break; }
01892                         if (! found) dest.Add(codec); }}
01893 
01894         //-------------------------------------------------------------------------
01895         // Word boundaries (UAX #29)
01896         //-------------------------------------------------------------------------
01897 
01898         // Finds the next word boundary strictly after 'position'.
01899         // Note that there are valid word boundaries at 0 and at 'src.Len()'.
01900         // If there is no such word boundary, it returns 'false' and sets 'position' to 'src.Len()'.
01901         bool FindNextWordBoundary(const TIntV& src, int &position) const {
01902                 if (position < 0) { position = 0; return true; }
01903                 size_t position_; bool retVal = ucd.FindNextWordBoundary(src, 0, src.Len(), position_); position = int(position_); return retVal; }
01904         // Creates, in 'dest', a vector of 'src.Len() + 1' elements, where 'dest[i]' tells if there is a word
01905         // boundary between 'src[i - 1]' and 'src[i]'.  Note that 'dest[0]' and 'dest[src.Len()]' are
01906         // always set to 'true'.
01907         void FindWordBoundaries(const TIntV& src, TBoolV& dest) const { ucd.FindWordBoundaries(src, 0, src.Len(), dest); }
01908 
01909         //-------------------------------------------------------------------------
01910         // Sentence boundaries (UAX #29)
01911         //-------------------------------------------------------------------------
01912 
01913         // Finds the next sentence boundary strictly after 'position'.
01914         // Note that there are valid sentence boundaries at 0 and at 'src.Len()'.
01915         // If there is no such sentence boundary, it returns 'false' and sets 'position' to 'src.Len()'.
01916         bool FindNextSentenceBoundary(const TIntV& src, int &position) const {
01917                 if (position < 0) { position = 0; return true; }
01918                 size_t position_; bool retVal = ucd.FindNextSentenceBoundary(src, 0, src.Len(), position_); position = int(position_); return retVal; }
01919         // Creates, in 'dest', a vector of 'src.Len() + 1' elements, where 'dest[i]' tells if there is a sentence
01920         // boundary between 'src[i - 1]' and 'src[i]'.  Note that 'dest[0]' and 'dest[src.Len()]' are
01921         // always set to 'true'.
01922         void FindSentenceBoundaries(const TIntV& src, TBoolV& dest) const { ucd.FindSentenceBoundaries(src, 0, src.Len(), dest); }
01923 
01924         void ClrSentenceBoundaryExceptions() { ucd.SbEx_Clr(); }
01925         void UseEnglishSentenceBoundaryExceptions() { ucd.SbEx_SetStdEnglish(); }
01926 
01927         //-------------------------------------------------------------------------
01928         // Normalization, decomposition, etc. (UAX #15)
01929         //-------------------------------------------------------------------------
01930 
01931         // This sets 'dest' to the decomposed form of the source string.
01932         // - for normalization form D (NFD), i.e. canonical decomposition: use compatibility == false;
01933         // - for normalization form KD (NFKD), i.e. compatibility decomposition: use compatibility == true.
01934         void Decompose(const TIntV& src, TIntV& dest, bool compatibility) const { ucd.Decompose(src, dest, compatibility, true); }
01935         // This performs canonical composition on the source string, and stores
01936         // the result in the destination vector.  The source string should be the
01937         // result of a (canonical or compatibility) decomposition; if this is the
01938         // case, the composition will lead to a normalization form C (NFC) or
01939         // normalization form KC (NFKC), depending on whether canonical or compatibility
01940         // decomposition was used.
01941         void Compose(const TIntV& src, TIntV& dest) const { return ucd.Compose(src, dest, true); }
01942         // Calls Decompose, followed by Compose; thus the result is the NFC (if
01943         // compatibility == false) or NFKC (if compatibility == true) of the source string.
01944         // A temporary TIntV is used to contain the intermediate NF(K)D form of the
01945         // source string.
01946         void DecomposeAndCompose(const TIntV& src, TIntV& dest, bool compatibility) const { return ucd.DecomposeAndCompose(src, dest, compatibility); }
01947         // Copies the starter characters from 'src' to 'dest'; the other
01948         // characters are skipped.  'src' should already have been decomposed.
01949         // Returns the number of characters extracted.  This function can be
01950         // used to remove diacritical marks from a string (after it has been decomposed!).
01951         int ExtractStarters(const TIntV& src, TIntV& dest) const { return (int) ucd.ExtractStarters(src, dest); }
01952         // Extracts the starters into a temporary vector and then copies it into 'src'.
01953         int ExtractStarters(TIntV& src) const { return (int) ucd.ExtractStarters(src); }
01954 
01955         //-------------------------------------------------------------------------
01956         // Case conversions
01957         //-------------------------------------------------------------------------
01958         // NOTE: if you will be dealing with Turkish, Azeri or Lithuanian text,
01959         // use the case-conversion methods in TUniChDb, which allow the caller
01960         // to request language-specific case mappings for these languages.
01961 
01962 public:
01963         typedef TUniChDb::TCaseConversion TCaseConversion;
01964         // Sets 'dest' to the case-converted form of 'src'.
01965         void GetLowerCase(const TIntV& src, TIntV& dest) const { ucd.GetLowerCase(src, dest, true, false, false); }
01966         void GetUpperCase(const TIntV& src, TIntV& dest) const { ucd.GetUpperCase(src, dest, true, false, false); }
01967         void GetTitleCase(const TIntV& src, TIntV& dest) const { ucd.GetTitleCase(src, dest, true, false, false); }
01968 
01969         // GetSimpleCaseConverted uses only the simple case mappings (from UnicodeData.txt).
01970         // This is simpler and faster.  Since each character now maps into exactly one
01971         // character, case conversion can also be done in place (see ToSimpleCaseConverted, etc.).
01972         void GetSimpleLowerCase(const TIntV& src, TIntV& dest) const { ucd.GetSimpleLowerCase(src, dest, true); }
01973         void GetSimpleUpperCase(const TIntV& src, TIntV& dest) const { ucd.GetSimpleUpperCase(src, dest, true); }
01974         void GetSimpleTitleCase(const TIntV& src, TIntV& dest) const { ucd.GetSimpleTitleCase(src, dest, true); }
01975 
01976         // These functions perform simple case-conversions in-place.
01977         void ToSimpleUpperCase(TIntV& src) const { ucd.ToSimpleUpperCase(src); }
01978         void ToSimpleLowerCase(TIntV& src) const { ucd.ToSimpleLowerCase(src); }
01979         void ToSimpleTitleCase(TIntV& src) const { ucd.ToSimpleTitleCase(src); }
01980 
01981         // Case folding is an alternative to the above functions.  It is intended primarily
01982         // to produce strings that are suitable for comparisons.  For example,
01983         // ToLowerCase(sigma) = sigma, ToLowerCase(final-sigma) = final-sigma;
01984         // but ToCaseFolded(sigma) = sigma, ToCaseFolded(final-sigma) = sigma.
01985         // - 'full' enables full case mappings -- i.e. sometimes a character may be mapped
01986         //   into a string of two or more characters.
01987         // - Note: For best results, perform NFD(CaseFold(NFD(x))) or NFKD(CaseFold(NFKD(x))) on
01988         //   each string before comparing them (see sec. 3.13 of the standard).
              // - This wrapper forwards to ucd.GetCaseFolded(src, dest, true, full, false); the first and
              //   last of those flags are fixed here (NOTE(review): presumably clrDest and a
              //   language-specific switch -- confirm against TUniChDb::GetCaseFolded).
01989         void GetCaseFolded(const TIntV& src, TIntV& dest, const bool full = true) const { return ucd.GetCaseFolded(src, dest, true, full, false); }
01990         // ToCaseFolded folds the string in place.  However, this means that only the simple
01991         // case foldings can be used (the full ones could increase the length of the string).
              // Forwards to ucd.ToCaseFolded with the second argument fixed to false.
01992         void ToCaseFolded(TIntV& src) const { return ucd.ToCaseFolded(src, false); }
01993 
01994         TStr GetUtf8CaseFolded(const TStr& s) const {
              // Returns the case-folded form of the UTF-8 string 's', again as UTF-8.
              // Strings consisting only of bytes below 128 take a shortcut through TStr::GetLc();
              // everything else is decoded to codepoints, folded and re-encoded.
01995                 const int len = s.Len(); int pos = 0;
01996                 while (pos < len && uchar(s[pos]) < 128) pos++; // stop at the first non-ASCII byte, if any
01997                 if (pos == len) return s.GetLc(); // no byte >= 128 was found
01998                 TIntV codepoints; DecodeUtf8(s, codepoints);
01999                 TIntV folded; GetCaseFolded(codepoints, folded);
02000                 return EncodeUtf8Str(folded); }
02001 
02002         //-------------------------------------------------------------------------
02003         // Character properties
02004         //-------------------------------------------------------------------------
02005         // These methods simply call the corresponding TUniChDb method
02006         // (which typically calls the corresponding method of TUniChInfo).
02007         // See the declaration for DECLARE_FORWARDED_PROPERTY_METHODS for a complete list.
02008         // They are all of the form        bool IsXxxx(const int cp) const
02009         // Some of the more notable ones include:
02010         // - IsAlphabetic, IsUppercase, IsLowercase, IsMath, IsAsciiHexDigit
02011         //   IsDash, IsDeprecated, IsDiacritic, IsHexDigit, IsHyphen, IsIdeographic
02012         //   IsNoncharacter, IsQuotationMark, IsSoftDotted, IsTerminalPunctuation, IsWhiteSpace
02013 
02014 #define ___UniFwd1(name) bool name(const int cp) const { return ucd.name(cp); }
      // ___UniFwd1 (three leading underscores) expands to a one-line method that forwards name(cp) to ucd.
      // DECLARE_FORWARDED_PROPERTY_METHODS is defined outside this section; it is expanded here and then undefined.
02015         DECLARE_FORWARDED_PROPERTY_METHODS
02016 #undef DECLARE_FORWARDED_PROPERTY_METHODS
      // NOTE(review): the #undef below names '__UniFwd1' (two underscores), which is not the macro defined
      // above ('___UniFwd1', three underscores), so ___UniFwd1 remains defined after this point.
      // Do not simply correct the spelling in place: ___UniFwd2 (used on the following line) is defined
      // outside this section and presumably expands to ___UniFwd1 -- confirm, and if so place the
      // corrected #undef after the ___UniFwd2 line instead.
02017 #undef __UniFwd1
02018         ___UniFwd2(IsPrivateUse, IsSurrogate)
02019 
02020         TUniChCategory GetCat(const int cp) const { return ucd.GetCat(cp); } // Forwards to ucd.GetCat for the codepoint 'cp'.
02021         TUniChSubCategory GetSubCat(const int cp) const { return ucd.GetSubCat(cp); } // Forwards to ucd.GetSubCat for the codepoint 'cp'.
02022 
02023         // GetCharName returns 0 if the name is unknown; GetCharNameS returns a string of the form "U+1234".
              // Both are thin wrappers around the methods of the same name in ucd.
02024         const char *GetCharName(const int cp) const { return ucd.GetCharName(cp); }
02025         TStr GetCharNameS(const int cp) const { return ucd.GetCharNameS(cp); } // Forwards to ucd.GetCharNameS; returns a TStr rather than a raw pointer.
02026 
02027 };
02028 
02029 //-----------------------------------------------------------------------------
02030 // TUniCodec -- UTF-8 Decoder
02031 //-----------------------------------------------------------------------------
02032 
02033 // Decodes the UTF-8 bytes src[srcIdx .. srcIdx + srcCount - 1] into codepoints and appends them to 'dest'
      // (which is emptied first if 'clrDest' is true).  Only the low 8 bits of each source element are used.
      // Malformed input is handled according to the 'errorHandling' member: uehThrow throws a TUnicodeException,
      // uehAbort returns at once with the count so far, uehReplace appends 'replacementChar' and carries on,
      // uehIgnore carries on without adding anything.  With 'strict' set, overlong encodings, sequences of
      // more than four bytes and values above 0x10ffff are also treated as errors.
      // If 'skipBom' is set, a 0xfeff / 0xfffe decoded from the very first position is neither stored nor counted.
      // Returns the number of characters that have been successfully decoded.
02034 // This does not include any replacement characters that may have been inserted into 'dest'.
02035 template<typename TSrcVec, typename TDestCh>
02036 size_t TUniCodec::DecodeUtf8(
02037         const TSrcVec& src, size_t srcIdx, const size_t srcCount,
02038         TVec<TDestCh>& dest, const bool clrDest) const
02039 {
02040         size_t nDecoded = 0;
02041         if (clrDest) dest.Clr();
02042         const size_t origSrcIdx = srcIdx;
02043         const size_t srcEnd = srcIdx + srcCount;
02044         while (srcIdx < srcEnd)
02045         {
02046                 const size_t charSrcIdx = srcIdx; // where the current character starts (used in error reports and for the BOM test)
02047                 uint c = src[TVecIdx(srcIdx)] & 0xff; srcIdx++; // keep only the low 8 bits of the source element
02048                 if ((c & _1000_0000) == 0) {
02049                         // c is one of the characters 0..0x7f, encoded as a single byte.
02050                         dest.Add(TDestCh(c)); nDecoded++; continue; }
02051                 else if ((c & _1100_0000) == _1000_0000) {
02052                         // No character in a valid UTF-8-encoded string should begin with a byte of the form 10xxxxxx.
02053                         // We must have been thrown into the middle of a multi-byte character.
02054                         switch (errorHandling) {
02055                         case uehThrow: throw TUnicodeException(charSrcIdx, c, "Invalid character: 10xxxxxx.");
02056                         case uehAbort: return nDecoded;
02057                         case uehReplace: dest.Add(TDestCh(replacementChar)); continue;
02058                         case uehIgnore: continue;
02059                         default: Fail; } }
02060                 else
02061                 {
02062                         // c introduces a sequence of 2..6 bytes, depending on how many
02063                         // of the most significant bits of c are set.
                              // nMoreBytes = number of continuation bytes to read, nBits = number of payload bits
                              // in the lead byte, minVal = smallest codepoint that needs a sequence of this length.
02064                         uint nMoreBytes = 0, nBits = 0, minVal = 0;
02065                         if ((c & _1110_0000) == _1100_0000) nMoreBytes = 1, nBits = 5, minVal = 0x80;
02066                         else if ((c & _1111_0000) == _1110_0000) nMoreBytes = 2, nBits = 4, minVal = 0x800;
02067                         else if ((c & _1111_1000) == _1111_0000) nMoreBytes = 3, nBits = 3, minVal = 0x10000;
02068                         else if ((c & _1111_1100) == _1111_1000) nMoreBytes = 4, nBits = 2, minVal = 0x200000;
02069                         else if ((c & _1111_1110) == _1111_1100) nMoreBytes = 5, nBits = 1, minVal = 0x4000000;
02070                         else {
02071                                 // c is of the form 1111111x, which is invalid even in the early definitions of UTF-8
02072                                 // (which allowed the encoding of codepoints up to 2^31 - 1).  However, in principle this
02073                                 // could be used to encode 32-bit integers with the msb set: 1aaabbbbccccddddeeeeffffgggghhhh
02074                                 // could be encoded as 1111111a 10aabbbb 10ccccdd 10ddeeee 10ffffgg 10gghhhh.
02075                                 if (strict)  {
02076                                         switch (errorHandling) {
02077                                         case uehThrow: throw TUnicodeException(charSrcIdx, c, "Invalid character: 1111111x.");
02078                                         case uehAbort: return nDecoded;
02079                                         // In the case of uehReplace and uehIgnore, we'll read the next 5 bytes
02080                                         // and try to decode the character.  Then, since 'strict' is true and
02081                                         // the codepoint is clearly >= 2^31, we'll notice this as an error later
02082                                         // and (in the case of uehReplace) insert a replacement character then.
02083                                         // This is probably better than inserting a replacement character right
02084                                         // away and then trying to read the next byte as if a new character
02085                                         // was beginning there -- if the current byte is really followed by five
02086                                         // 10xxxxxx bytes, we'll just get six replacement characters in a row.
02087                                         case uehReplace: break; //dest.Add(TDestCh(replacementChar)); continue;
02088                                         case uehIgnore: break; // continue;
02089                                         default: Fail; } }
02090                                 nMoreBytes = 5; nBits = 2; minVal = 0x80000000u; } // nBits = 2: the lowest '1' of the 1111111 prefix becomes the msb of the result, followed by the 'a' bit; 5 * 6 more bits give 32 in total
02091                         // Decode this multi-byte sequence.
02092                         uint cOut = c & ((1 << nBits) - 1); // First extract the nBits least significant bits from c.
02093                         bool cancel = false; // set when the sequence is abandoned under uehReplace / uehIgnore
02094                         for (uint i = 0; i < nMoreBytes && ! cancel; i++) {
02095                                 // See if there are enough bytes left in the source vector.
02096                                 if (! (srcIdx < srcEnd)) {
02097                                         switch (errorHandling) {
02098                                         case uehThrow: throw TUnicodeException(charSrcIdx, c, TInt::GetStr(nMoreBytes) + " more bytes expected, only " + TInt::GetStr(int(srcEnd - charSrcIdx - 1)) + " available.");
02099                                         case uehAbort: return nDecoded;
02100                                         case uehReplace: dest.Add(TDestCh(replacementChar)); cancel = true; continue;
02101                                         case uehIgnore: cancel = true; continue;
02102                                         default: Fail; } }
02103                                 // Read the next byte.
02104                                 c = src[TVecIdx(srcIdx)] & 0xff; srcIdx++;
02105                                 if ((c & _1100_0000) != _1000_0000) { // Each subsequent byte should be of the form 10xxxxxx.
02106                                         switch (errorHandling) {
02107                                         case uehThrow: throw TUnicodeException(charSrcIdx, c, "Byte " + TInt::GetStr(i) + " of " + TInt::GetStr(nMoreBytes) + " extra bytes should begin with 10xxxxxx.");
02108                                         case uehAbort: return nDecoded;
02109                                         case uehReplace: dest.Add(TDestCh(replacementChar)); srcIdx--; cancel = true; continue; // srcIdx--: un-read the offending byte so that the outer loop examines it again as the start of a character
02110                                         case uehIgnore: srcIdx--; cancel = true; continue;
02111                                         default: Fail; } }
02112                                 cOut <<= 6; cOut |= (c & _0011_1111); } // append the 6 payload bits of this continuation byte
02113                         if (cancel) continue; // the sequence was abandoned; resume the outer loop at srcIdx
02114                         if (strict) {
02115                                 // err1: This codepoint has been represented by more bytes than it should have been.
02116                                 // For example, cOut in the range 0..127 should be represented by a single byte,
02117                                 // not by two or more bytes.
02118                                 // - For example, this may happen in the "modified UTF-8" sometimes used for Java
02119                                 // serialization, where the codepoint 0 is encoded as 11000000 10000000 to avoid
02120                                 // the appearance of null bytes in the encoded stream.
02121                                 bool err1 = (cOut < minVal);
02122                                 // err2: Early definitions of UTF-8 allowed any 31-bit integer to be encoded, using up to 6 bytes.
02123                                 // However, later this was restricted to the codepoints 0..0x10ffff only, because only these
02124                                 // are valid Unicode codepoints.  Thus, no more than 4 bytes are ever necessary.
02125                                 bool err2 = (nMoreBytes > 3 || (nMoreBytes == 3 && cOut > 0x10ffff));
02126                                 if (err1 || err2) switch (errorHandling) {
02127                                         case uehThrow:
02128                                                 if (err1) throw TUnicodeException(charSrcIdx, c, "The codepoint 0x" + TInt::GetStr(cOut, "%08x") + " has been represented by too many bytes (" + TInt::GetStr(nMoreBytes + 1) + ").");
02129                                                 else if (err2) throw TUnicodeException(charSrcIdx, c, "Invalid multibyte sequence: it decodes into 0x" + TInt::GetStr(cOut, "%08x") + ", but only codepoints 0..0x10ffff are valid.");
02130                                                 else { Fail; break; }
02131                                         case uehAbort: return nDecoded;
02132                                         case uehReplace: dest.Add(TDestCh(replacementChar)); continue;
02133                                         case uehIgnore: continue;
02134                                         default: Fail; } }
02135                         // Add the decoded codepoint to the destination vector.
02136                         // If this is the first decoded character, and it's one of the byte-order marks
02137                         // (0xfffe and 0xfeff), we will skip it (unless skipBom is false).
02138                         if (! (skipBom && (cOut == 0xfffe || cOut == 0xfeff) && charSrcIdx == origSrcIdx)) {
02139                                 dest.Add(cOut); nDecoded++; }
02140                 } // else (multi-byte sequence)
02141         } // while
02142         return nDecoded;
02143 }
02144 
02145 //-----------------------------------------------------------------------
02146 // TUniCodec -- UTF-8 Encoder
02147 //-----------------------------------------------------------------------
02148 
02149 // Encodes the codepoints src[srcIdx .. srcIdx + srcCount - 1] as UTF-8 and appends the result to 'dest',
      // one element per output byte.  If 'clrDest' is true, 'dest' is emptied first (as DecodeUtf8 does);
      // otherwise the output is appended to whatever 'dest' already holds.
      // With 'strict' set, values above 0x10ffff are errors and are handled according to 'errorHandling':
      // uehThrow throws a TUnicodeException, uehAbort returns the count so far, uehReplace encodes
      // 'replacementChar' instead, uehIgnore skips the value.  Without 'strict', any 32-bit value is
      // encoded, using up to six bytes.
      // Returns the number of characters that have been successfully encoded.
02150 // This does not include any replacement characters that may have been inserted into 'dest'.
02151 template<typename TSrcVec, typename TDestCh>
02152 size_t TUniCodec::EncodeUtf8(
02153         const TSrcVec& src, size_t srcIdx, const size_t srcCount,
02154         TVec<TDestCh>& dest, const bool clrDest) const
02155 {
              if (clrDest) dest.Clr(); // honour 'clrDest'; it used to be accepted but silently ignored
02156         size_t nEncoded = 0;
02157         for (const size_t srcEnd = srcIdx + srcCount; srcIdx < srcEnd; srcIdx++)
02158         {
02159                 uint c = uint(src[TVecIdx(srcIdx)]);
02160                 bool err = false; // true if 'c' was replaced, so that it is encoded but not counted
02161                 if (strict && c > 0x10ffff) {
02162                         err = true;
02163                         switch (errorHandling) {
02164                         case uehThrow: throw TUnicodeException(srcIdx, c, "Invalid character (0x" + TInt::GetStr(c, "%x") + "; only characters in the range 0..0x10ffff are allowed).");
02165                         case uehAbort: return nEncoded;
02166                         case uehReplace: c = replacementChar; break; // fall out of the switch and encode the replacement below
02167                         case uehIgnore: continue;
02168                         default: Fail; } }
                      // Choose the shortest form that can hold 'c': 7, 11, 16, 21, 26 or more bits.
02169                 if (c < 0x80u)
02170                         dest.Add(TDestCh(c & 0xffu));
02171                 else if (c < 0x800u) {
02172                         dest.Add(TDestCh(_1100_0000 | ((c >> 6) & _0001_1111)));
02173                         dest.Add(TDestCh(_1000_0000 | (c & _0011_1111))); }
02174                 else if (c < 0x10000u) {
02175                         dest.Add(TDestCh(_1110_0000 | ((c >> 12) & _0000_1111)));
02176                         dest.Add(TDestCh(_1000_0000 | ((c >> 6) & _0011_1111)));
02177                         dest.Add(TDestCh(_1000_0000 | (c & _0011_1111))); }
02178                 else if (c < 0x200000u) {
02179                         dest.Add(TDestCh(_1111_0000 | ((c >> 18) & _0000_0111)));
02180                         dest.Add(TDestCh(_1000_0000 | ((c >> 12) & _0011_1111)));
02181                         dest.Add(TDestCh(_1000_0000 | ((c >> 6) & _0011_1111)));
02182                         dest.Add(TDestCh(_1000_0000 | (c & _0011_1111))); }
02183                 else if (c < 0x4000000u) {
02184                         dest.Add(TDestCh(_1111_1000 | ((c >> 24) & _0000_0011)));
02185                         dest.Add(TDestCh(_1000_0000 | ((c >> 18) & _0011_1111)));
02186                         dest.Add(TDestCh(_1000_0000 | ((c >> 12) & _0011_1111)));
02187                         dest.Add(TDestCh(_1000_0000 | ((c >> 6) & _0011_1111)));
02188                         dest.Add(TDestCh(_1000_0000 | (c & _0011_1111))); }
02189                 else {
                              // Two bits are kept from (c >> 30), so values with the msb set turn the lead byte into 1111111x.
02190                         dest.Add(TDestCh(_1111_1100 | ((c >> 30) & _0000_0011)));
02191                         dest.Add(TDestCh(_1000_0000 | ((c >> 24) & _0011_1111)));
02192                         dest.Add(TDestCh(_1000_0000 | ((c >> 18) & _0011_1111)));
02193                         dest.Add(TDestCh(_1000_0000 | ((c >> 12) & _0011_1111)));
02194                         dest.Add(TDestCh(_1000_0000 | ((c >> 6) & _0011_1111)));
02195                         dest.Add(TDestCh(_1000_0000 | (c & _0011_1111))); }
02196                 if (! err) nEncoded++;
02197         }
02198         return nEncoded;
02199 }
02200 
02201 //-----------------------------------------------------------------------
02202 // TUniCodec -- UTF-16 Decoder
02203 //-----------------------------------------------------------------------
02204 
02205 // Decodes UTF-16 given as a sequence of bytes, src[srcIdx .. srcIdx + srcCount - 1], and appends the
      // resulting codepoints to 'dest' (which is emptied first if 'clrDest' is true).
      // - 'bomHandling' says whether the first two bytes are examined for a byte-order mark (bomAllowed,
      //   bomRequired) or not (bomIgnored); without a BOM, 'defaultByteOrder' decides the byte order.
      // - Malformed surrogate pairs are handled according to 'errorHandling'; with 'strict' set, a
      //   second-surrogate value that does not follow a first surrogate is an error as well.
      // - If bomHandling == bomRequired and no BOM is found, an exception is thrown under uehThrow;
      //   under every other error-handling mode the function returns size_t(-1).
      // Returns the number of characters that have been successfully decoded.
02206 // This does not include any replacement characters that may have been inserted into 'dest'.
02207 // Each element of 'src' is assumed to contain one byte of data.
02208 // srcCount must be even (though srcIdx doesn't need to be).
02209 template<typename TSrcVec, typename TDestCh>
02210 size_t TUniCodec::DecodeUtf16FromBytes(
02211         const TSrcVec& src, size_t srcIdx, const size_t srcCount,
02212         TVec<TDestCh>& dest, const bool clrDest,
02213         const TUtf16BomHandling bomHandling,
02214         const TUniByteOrder defaultByteOrder) const
02215 {
02216         IAssert(srcCount % 2 == 0);
02217         IAssert(bomHandling == bomAllowed || bomHandling == bomRequired || bomHandling == bomIgnored);
02218         IAssert(defaultByteOrder == boMachineEndian || defaultByteOrder == boBigEndian || defaultByteOrder == boLittleEndian);
02219         if (clrDest) dest.Clr();
02220         size_t nDecoded = 0;
02221         if (srcCount <= 0) return nDecoded;
02222         const size_t origSrcIdx = srcIdx, srcEnd = srcIdx + srcCount;
02223         bool littleEndian = false;
02224         bool leDefault = (defaultByteOrder == boLittleEndian || (defaultByteOrder == boMachineEndian && IsMachineLittleEndian()));
02225         if (bomHandling == bomIgnored) littleEndian = leDefault;
02226         else if (bomHandling == bomAllowed || bomHandling == bomRequired)
02227         {
                      // Look at the first two bytes: fe ff means big-endian, ff fe means little-endian.
02228                 int byte1 = uint(src[TVecIdx(srcIdx)]) & 0xff, byte2 = uint(src[TVecIdx(srcIdx + 1)]) & 0xff;
02229                 if (byte1 == 0xfe && byte2 == 0xff) { littleEndian = false; if (skipBom) srcIdx += 2; }
02230                 else if (byte1 == 0xff && byte2 == 0xfe) { littleEndian = true; if (skipBom) srcIdx += 2; }
02231                 else if (bomHandling == bomAllowed) littleEndian = leDefault;
02232                 else { // Report an error.
02233                         switch (errorHandling) {
02234                         case uehThrow: throw TUnicodeException(srcIdx, byte1, "BOM expected at the beginning of the input vector (" + TInt::GetStr(byte1, "%02x") + " " + TInt::GetStr(byte2, "%02x") + " found instead).");
02235                         case uehAbort: case uehReplace: case uehIgnore: return size_t(-1);
02236                         default: Fail; } }
02237         }
02238         else Fail;
02239         while (srcIdx < srcEnd)
02240         {
02241                 const size_t charSrcIdx = srcIdx;
02242                 uint byte1 = uint(src[TVecIdx(srcIdx)]) & 0xff, byte2 = uint(src[TVecIdx(srcIdx + 1)]) & 0xff; srcIdx += 2;
02243                 uint c = littleEndian ? (byte1 | (byte2 << 8)) : (byte2 | (byte1 << 8));
02244                 if (Utf16FirstSurrogate <= c && c <= Utf16FirstSurrogate + 1023)
02245                 {
02246                         // c is the first character in a surrogate pair.  Read the next character.
02247                         if (! (srcIdx + 2 <= srcEnd)) {
02248                                 switch (errorHandling) {
02249                                 case uehThrow: throw TUnicodeException(charSrcIdx, c, "The second character of a surrogate pair is missing.");
02250                                 case uehAbort: return nDecoded;
02251                                 case uehReplace: dest.Add(TDestCh(replacementChar)); continue;
02252                                 case uehIgnore: continue;
02253                                 default: Fail; } }
02254                         uint byte1 = uint(src[TVecIdx(srcIdx)]) & 0xff, byte2 = uint(src[TVecIdx(srcIdx + 1)]) & 0xff; srcIdx += 2;
02255                         uint c2 = littleEndian ? (byte1 | (byte2 << 8)) : (byte2 | (byte1 << 8));
02256                         // c2 should be the second character of the surrogate pair.
02257                         if (c2 < Utf16SecondSurrogate || Utf16SecondSurrogate + 1023 < c2) {
02258                                 switch (errorHandling) {
                                      // The last format string used to be "04x" (no '%'), so the message showed the literal text instead of c2.
02259                                 case uehThrow: throw TUnicodeException(charSrcIdx + 2, c2, "The second character of a surrogate pair should be in the range " + TInt::GetStr(Utf16SecondSurrogate, "%04x") + ".." + TInt::GetStr(Utf16SecondSurrogate + 1023, "%04x") + ", not " + TInt::GetStr(c2, "%04x") + ".");
02260                                 case uehAbort: return nDecoded;
02261                                 // with uehReplace and uehIgnore, we'll just skip the first character of the surrogate pair; we'll process the second one during the next iteration, this time as an ordinary character
02262                                 case uehReplace: dest.Add(TDestCh(replacementChar)); srcIdx -= 2; continue;
02263                                 case uehIgnore: srcIdx -= 2; continue;
02264                                 default: Fail; } }
02265                         // c and c2 each contain 10 bits of information.
02266                         uint cc = ((c - Utf16FirstSurrogate) << 10) | (c2 - Utf16SecondSurrogate);
02267                         cc += 0x10000;
02268                         dest.Add(TDestCh(cc)); nDecoded++; continue;
02269                 }
02270                 else if (strict && Utf16SecondSurrogate <= c && c <= Utf16SecondSurrogate + 1023) {
02271                         switch (errorHandling) {
02272                         case uehThrow: throw TUnicodeException(charSrcIdx, c, "This 16-bit value should be used only as the second character of a surrogate pair.");
02273                         case uehAbort: return nDecoded;
02274                         case uehReplace: dest.Add(TDestCh(replacementChar)); continue;
02275                         case uehIgnore: continue;
02276                         default: Fail; } }
02277                 // If 'c' is the first character in the input stream, and it's a BOM, we might have to skip it.
02278                 if (charSrcIdx == origSrcIdx && (c == 0xfffeu || c == 0xfeffu) && skipBom) continue;
02279                 // Otherwise, store 'c' to the destination vector.
02280                 dest.Add(TDestCh(c)); nDecoded++;
02281         }
02282         return nDecoded;
02283 }
02284 
02285 // Here, each element of 'src' is treated as a 16-bit word.  The byte-order settings
02286 // are used to determine if the two bytes of each word should be swapped before further
02287 // processing.  For example, if a BOM is present, it must have the value 0xfeff; if it
02288 // actually has the value 0xfffe, this means that the two bytes of each word must be swapped.
02289 // Basically, the combination of the byteOrder parameter and the byte order mark (if present) at the
02290 // beginning of the source data is used to determine the "original" byte order of the data;
02291 // if this doesn't match the byte order of the local machine, the two bytes of each word will
02292 // be swapped during the decoding process.
      // The words src[srcIdx .. srcIdx + srcCount - 1] are decoded and the resulting codepoints are appended
      // to 'dest' (which is emptied first if 'clrDest' is true).  Only the low 16 bits of each element are used.
      // Returns the number of characters successfully decoded (replacement characters are not counted).
      // If bomHandling == bomRequired and no BOM is found, an exception is thrown under uehThrow;
      // under every other error-handling mode the function returns size_t(-1).
02293 template<typename TSrcVec, typename TDestCh>
02294 size_t TUniCodec::DecodeUtf16FromWords(
02295         const TSrcVec& src, size_t srcIdx, const size_t srcCount,
02296         TVec<TDestCh>& dest, bool clrDest,
02297         const TUtf16BomHandling bomHandling,
02298         const TUniByteOrder defaultByteOrder) const
02299 {
02300         IAssert(bomHandling == bomAllowed || bomHandling == bomRequired || bomHandling == bomIgnored);
02301         IAssert(defaultByteOrder == boMachineEndian || defaultByteOrder == boBigEndian || defaultByteOrder == boLittleEndian);
02302         if (clrDest) dest.Clr();
02303         size_t nDecoded = 0;
02304         if (srcCount <= 0) return nDecoded;
02305         const size_t origSrcIdx = srcIdx, srcEnd = srcIdx + srcCount;
02306         bool swap = false; // true if the two bytes of every word must be exchanged before use
02307         bool isMachineLe = IsMachineLittleEndian();
02308         bool isDefaultLe = (defaultByteOrder == boLittleEndian || (defaultByteOrder == boMachineEndian && isMachineLe));
02309         if (bomHandling == bomIgnored) swap = (isDefaultLe != isMachineLe);
02310         else if (bomHandling == bomAllowed || bomHandling == bomRequired)
02311         {
02312                 int c = uint(src[TVecIdx(srcIdx)]) & 0xffff;
02313                 if (c == 0xfeff) { swap = false; if (skipBom) srcIdx += 1; }
02314                 else if (c == 0xfffe) { swap = true; if (skipBom) srcIdx += 1; }
02315                 else if (bomHandling == bomAllowed) swap = (isMachineLe != isDefaultLe);
02316                 else { // Report an error.
02317                         switch (errorHandling) {
02318                         case uehThrow: throw TUnicodeException(srcIdx, c, "BOM expected at the beginning of the input vector (" + TInt::GetStr(c, "%04x") + " found instead).");
02319                         case uehAbort: case uehReplace: case uehIgnore: return size_t(-1);
02320                         default: Fail; } }
02321         }
02322         else Fail;
02323         while (srcIdx < srcEnd)
02324         {
02325                 const size_t charSrcIdx = srcIdx;
02326                 uint c = uint(src[TVecIdx(srcIdx)]) & 0xffffu; srcIdx++;
02327                 if (swap) c = ((c >> 8) & 0xff) | ((c & 0xff) << 8);
02328                 if (Utf16FirstSurrogate <= c && c <= Utf16FirstSurrogate + 1023)
02329                 {
02330                         // c is the first character in a surrogate pair.  Read the next character.
02331                         if (! (srcIdx < srcEnd)) {
02332                                 switch (errorHandling) {
02333                                 case uehThrow: throw TUnicodeException(charSrcIdx, c, "The second character of a surrogate pair is missing.");
02334                                 case uehAbort: return nDecoded;
02335                                 case uehReplace: dest.Add(TDestCh(replacementChar)); continue;
02336                                 case uehIgnore: continue;
02337                                 default: Fail; } }
02338                         uint c2 = uint(src[TVecIdx(srcIdx)]) & 0xffffu; srcIdx++;
02339                         if (swap) c2 = ((c2 >> 8) & 0xff) | ((c2 & 0xff) << 8);
02340                         // c2 should be the second character of the surrogate pair.
02341                         if (c2 < Utf16SecondSurrogate || Utf16SecondSurrogate + 1023 < c2) {
02342                                 switch (errorHandling) {
                                      // The last format string used to be "04x" (no '%'), so the message showed the literal text instead of c2.
02343                                 case uehThrow: throw TUnicodeException(charSrcIdx + 1, c2, "The second character of a surrogate pair should be in the range " + TInt::GetStr(Utf16SecondSurrogate, "%04x") + ".." + TInt::GetStr(Utf16SecondSurrogate + 1023, "%04x") + ", not " + TInt::GetStr(c2, "%04x") + ".");
02344                                 case uehAbort: return nDecoded;
02345                                 // with uehReplace and uehIgnore, we'll just skip the first character of the surrogate pair; we'll process the second one during the next iteration, this time as an ordinary character
02346                                 case uehReplace: dest.Add(TDestCh(replacementChar)); srcIdx -= 1; continue;
02347                                 case uehIgnore: srcIdx -= 1; continue;
02348                                 default: Fail; } }
02349                         // c and c2 each contain 10 bits of information.
02350                         uint cc = ((c - Utf16FirstSurrogate) << 10) | (c2 - Utf16SecondSurrogate);
02351                         cc += 0x10000;
02352                         dest.Add(TDestCh(cc)); nDecoded++; continue;
02353                 }
02354                 else if (strict && Utf16SecondSurrogate <= c && c <= Utf16SecondSurrogate + 1023) {
02355                         switch (errorHandling) {
02356                         case uehThrow: throw TUnicodeException(charSrcIdx, c, "This 16-bit value should be used only as the second character of a surrogate pair.");
02357                         case uehAbort: return nDecoded;
02358                         case uehReplace: dest.Add(TDestCh(replacementChar)); continue;
02359                         case uehIgnore: continue;
02360                         default: Fail; } }
02361                 // If 'c' is the first character in the input stream, and it's a BOM, we might have to skip it.
02362                 if (charSrcIdx == origSrcIdx && (c == 0xfffeu || c == 0xfeffu) && skipBom) continue;
02363                 // Otherwise, store 'c' to the destination vector.
02364                 dest.Add(TDestCh(c)); nDecoded++;
02365         }
02366         return nDecoded;
02367 }
02368 
02369 //-----------------------------------------------------------------------
02370 // TUniCodec -- UTF-16 Encoder
02371 //-----------------------------------------------------------------------
02372 
02373 // Encodes the codepoints src[srcIdx .. srcIdx + srcCount - 1] as UTF-16 and appends the result to 'dest',
      // one element per 16-bit word.
      // - If 'clrDest' is true, 'dest' is emptied first; otherwise the output is appended to its current contents.
      // - If 'insertBom' is true, a byte-order mark is written first; note that it is included in the returned count.
      // - 'destByteOrder' selects the byte order of the output: if it is boLittleEndian or boBigEndian and differs
      //   from the byte order of the machine, the two bytes of every emitted word are swapped.
      // - Values above 0x10ffff and values in either surrogate range (base .. base + 1023, inclusive) cannot be
      //   encoded and are handled according to 'errorHandling'.
      // Returns the number of characters that have been successfully encoded.
02374 // This does not include any replacement characters that may have been inserted into 'dest'.
02375 template<typename TSrcVec, typename TDestCh>
02376 size_t TUniCodec::EncodeUtf16ToWords(
02377         const TSrcVec& src, size_t srcIdx, const size_t srcCount,
02378         TVec<TDestCh>& dest, const bool clrDest, const bool insertBom,
02379         const TUniByteOrder destByteOrder) const
02380 {
              if (clrDest) dest.Clr(); // honour 'clrDest'; it used to be accepted but silently ignored
02381         bool isMachineLe = IsMachineLittleEndian();
02382         bool swap = (destByteOrder == boLittleEndian && ! isMachineLe) || (destByteOrder == boBigEndian && isMachineLe);
02383         size_t nEncoded = 0, srcEnd = srcIdx + srcCount;
02384         if (insertBom) { dest.Add(TDestCh(swap ? 0xfffeu : 0xfeffu)); nEncoded++; }
02385         while (srcIdx < srcEnd)
02386         {
02387                 uint c = uint(src[TVecIdx(srcIdx)]); srcIdx++;
02388                 if (! (c <= 0x10ffffu)) {
02389                         switch (errorHandling) {
02390                         case uehThrow: throw TUnicodeException(srcIdx - 1, c, "UTF-16 only supports characters in the range 0..10ffff (not " + TUInt::GetStr(c, "%08x") + ").");
02391                         case uehAbort: return nEncoded;
02392                         case uehReplace: dest.Add(TDestCh(swap ? SwapBytes(replacementChar) : replacementChar)); continue;
02393                         case uehIgnore: continue;
02394                         default: Fail; } }
                      // The surrogate ranges are inclusive at both ends (1024 values each); the upper bound used to be
                      // tested with '<', which let the last value of each range through as a lone surrogate.
02395                 if (Utf16FirstSurrogate <= c && c <= Utf16FirstSurrogate + 1023) {
02396                         switch (errorHandling) {
02397                         case uehThrow: throw TUnicodeException(srcIdx - 1, c, "UTF-16 cannot encode " + TUInt::GetStr(c, "%04x") + " as it belongs to the first surrogate range (" + TUInt::GetStr(Utf16FirstSurrogate, "%04x") + ".." + TUInt::GetStr(Utf16FirstSurrogate + 1023, "%04x") + ").");
02398                         case uehAbort: return nEncoded;
02399                         case uehReplace: dest.Add(TDestCh(swap ? SwapBytes(replacementChar) : replacementChar)); continue;
02400                         case uehIgnore: continue;
02401                         default: Fail; } }
                      // NOTE(review): the message below mentions 'strict', but this test is performed regardless of it
                      // (the decoders only reject a lone second surrogate when 'strict' is set) -- confirm the intent.
02402                 if (Utf16SecondSurrogate <= c && c <= Utf16SecondSurrogate + 1023) {
02403                         switch (errorHandling) {
02404                         case uehThrow: throw TUnicodeException(srcIdx - 1, c, "The character " + TUInt::GetStr(c, "%04x") + " belongs to the second surrogate range (" + TUInt::GetStr(Utf16SecondSurrogate, "%04x") + ".." + TUInt::GetStr(Utf16SecondSurrogate + 1023, "%04x") + "), which is not allowed with strict == true.");
02405                         case uehAbort: return nEncoded;
02406                         case uehReplace: dest.Add(TDestCh(swap ? SwapBytes(replacementChar) : replacementChar)); continue;
02407                         case uehIgnore: continue;
02408                         default: Fail; } }
02409                 // If c is <= 0xffff, it can be stored directly.
02410                 if (c <= 0xffffu) {
02411                         if (swap) c = ((c >> 8) & 0xff) | ((c & 0xff) << 8);
02412                         dest.Add(TDestCh(c)); nEncoded++; continue; }
02413                 // Otherwise, represent c by a pair of surrogate characters.
02414                 c -= 0x10000u; IAssert(/*0 <= c &&*/ c <= 0xfffffu);
02415                 uint c1 = (c >> 10) & 1023, c2 = c & 1023; // upper and lower 10 bits of the 20-bit offset
02416                 c1 += Utf16FirstSurrogate; c2 += Utf16SecondSurrogate;
02417                 if (swap) {
02418                         c1 = ((c1 >> 8) & 0xff) | ((c1 & 0xff) << 8);
02419                         c2 = ((c2 >> 8) & 0xff) | ((c2 & 0xff) << 8); }
02420                 dest.Add(TDestCh(c1));
02421                 dest.Add(TDestCh(c2));
02422                 nEncoded++; continue;
02423         }
02424         return nEncoded;
02425 }
02426 
02427 template<typename TSrcVec, typename TDestCh>
02428 size_t TUniCodec::EncodeUtf16ToBytes(
02429         const TSrcVec& src, size_t srcIdx, const size_t srcCount,
02430         TVec<TDestCh>& dest, const bool clrDest, const bool insertBom,
02431         const TUniByteOrder destByteOrder) const
02432 {
02433         bool isDestLe = (destByteOrder == boLittleEndian || (destByteOrder == boMachineEndian && IsMachineLittleEndian()));
02434         size_t nEncoded = 0, srcEnd = srcIdx + srcCount;
02435         if (insertBom) { dest.Add(isDestLe ? 0xff : 0xfe); dest.Add(isDestLe ? 0xfe : 0xff); nEncoded++; }
02436         while (srcIdx < srcEnd)
02437         {
02438                 uint c = uint(src[TVecIdx(srcIdx)]); srcIdx++;
02439                 if (! (c <= 0x10ffffu)) {
02440                         switch (errorHandling) {
02441                         case uehThrow: throw TUnicodeException(srcIdx - 1, c, "UTF-16 only supports characters in the range 0..10ffff (not " + TUInt::GetStr(c, "%08x") + ").");
02442                         case uehAbort: return nEncoded;
02443 #define ___OutRepl if (isDestLe) { dest.Add(replacementChar & 0xff); dest.Add((replacementChar >> 8) & 0xff); } else { dest.Add((replacementChar >> 8) & 0xff); dest.Add(replacementChar & 0xff); }
02444                         case uehReplace: ___OutRepl; continue;
02445                         case uehIgnore: continue;
02446                         default: Fail; } }
02447                 if (Utf16FirstSurrogate <= c && c < Utf16FirstSurrogate + 1023) {
02448                         switch (errorHandling) {
02449                         case uehThrow: throw TUnicodeException(srcIdx - 1, c, "UTF-16 cannot encode " + TUInt::GetStr(c, "%04x") + " as it belongs to the first surrogate range (" + TUInt::GetStr(Utf16FirstSurrogate, "%04x") + ".." + TUInt::GetStr(Utf16FirstSurrogate + 1023, "%04x") + ").");
02450                         case uehAbort: return nEncoded;
02451                         case uehReplace: ___OutRepl; continue;
02452                         case uehIgnore: continue;
02453                         default: Fail; } }
02454                 if (Utf16SecondSurrogate <= c && c < Utf16SecondSurrogate + 1023) {
02455                         switch (errorHandling) {
02456                         case uehThrow: throw TUnicodeException(srcIdx - 1, c, "The character " + TUInt::GetStr(c, "%04x") + " belongs to the second surrogate range (" + TUInt::GetStr(Utf16FirstSurrogate, "%04x") + ".." + TUInt::GetStr(Utf16FirstSurrogate + 1023, "%04x") + "), which is not allowed with strict == true.");
02457                         case uehAbort: return nEncoded;
02458                         case uehReplace: ___OutRepl; continue;
02459                         case uehIgnore: continue;
02460                         default: Fail; } }
02461 #undef ___OutRepl
02462                 // If c is <= 0xffff, it can be stored directly.
02463                 if (c <= 0xffffu) {
02464                         if (isDestLe) { dest.Add(c & 0xff); dest.Add((c >> 8) & 0xff); }
02465                         else { dest.Add((c >> 8) & 0xff); dest.Add(c & 0xff); }
02466                         nEncoded++; continue; }
02467                 // Otherwise, represent c by a pair of surrogate characters.
02468                 c -= 0x10000u; IAssert(/*0 <= c &&*/ c <= 0xfffffu);
02469                 uint c1 = (c >> 10) & 1023, c2 = c & 1023;
02470                 c1 += Utf16FirstSurrogate; c2 += Utf16SecondSurrogate;
02471                 if (isDestLe) { dest.Add(c1 & 0xff); dest.Add((c1 >> 8) & 0xff); dest.Add(c2 & 0xff); dest.Add((c2 >> 8) & 0xff); }
02472                 else { dest.Add((c1 >> 8) & 0xff); dest.Add(c1 & 0xff); dest.Add((c2 >> 8) & 0xff); dest.Add(c2 & 0xff); }
02473                 nEncoded++; continue;
02474         }
02475         return nEncoded;
02476 }
02477 
02478 //-----------------------------------------------------------------------------
02479 // TUniChDb -- word boundaries
02480 //-----------------------------------------------------------------------------
02481 
// Finds the next word boundary after 'position' in the text src[srcIdx .. srcIdx + srcCount - 1].
//  - If 'position' lies before srcIdx, it is set to srcIdx and true is returned (WB1).
//  - If 'position' is at or beyond the end of the text, false is returned and 'position' is left unchanged.
//  - Otherwise 'position' is advanced to the next boundary (at the latest the end of the text, WB2)
//    and true is returned.
// The "WBn" labels name the individual word-boundary rules being applied
// (presumably those of Unicode Standard Annex #29 -- confirm which revision is targeted).
// NOTE(review): WbFindPrevNonIgnored / WbFindNextNonIgnored are defined elsewhere; judging from their
// use here they move an index to the neighbouring non-ignored character, and the former returns
// false when there is no such character.
02482 template<typename TSrcVec>
02483 bool TUniChDb::FindNextWordBoundary(const TSrcVec& src, const size_t srcIdx, const size_t srcCount, size_t &position) const
02484 {
02485         // WB1.  Break at the start of text.
02486         if (position < srcIdx) { position = srcIdx; return true; }
02487         // If we are beyond the end of the text, there aren't any word breaks left.
02488         const size_t srcEnd = srcIdx + srcCount;
02489         if (position >= srcEnd) return false;
02490         // If 'position' is currently at an ignored character, move it back to the last nonignored character.
              // (If there is no earlier nonignored character, 'position' is restored to where it was.)
02491         size_t origPos = position;
02492         if (IsWbIgnored(src[TVecIdx(position)])) {
02493                 if (! WbFindPrevNonIgnored(src, srcIdx, position))
02494                         position = origPos;
02495         }
02496         // Determine the previous nonignored character (before 'position').
              // posPrev == position afterwards means "there is no previous character".
02497         size_t posPrev = position;
02498         if (! WbFindPrevNonIgnored(src, srcIdx, posPrev)) posPrev = position;
02499         // Sec 6.2.  Allow a break between Sep and an ignored character.
02500         if (position == origPos && position + 1 < srcEnd && IsSbSep(src[TVecIdx(position)]) && IsWbIgnored(src[TVecIdx(position + 1)])) { position += 1; return true; }
02501         // Determine the next nonignored character (after 'position').
02502         size_t posNext = position; WbFindNextNonIgnored(src, posNext, srcEnd);
02503         size_t posNext2;
              // cPrev / cCur / cNext hold the code points at posPrev / position / posNext, or -1 where
              // there is no such character; wbf* hold the corresponding GetWbFlags values.
02504         int cPrev = (posPrev < position ? (int) src[TVecIdx(posPrev)] : -1), cCur = (position < srcEnd ? (int) src[TVecIdx(position)] : -1);
02505         int cNext = (position < posNext && posNext < srcEnd ? (int) src[TVecIdx(posNext)] : -1);
02506         int wbfPrev = GetWbFlags(cPrev), wbfCur = GetWbFlags(cCur), wbfNext = GetWbFlags(cNext);
02507         int cNext2, wbfNext2;
02508         // Slide a four-character window (prev, cur, next, next2) over the nonignored characters;
              // the increment part of the 'for' shifts the window by one character.
02509         for ( ; position < srcEnd; posPrev = position, position = posNext, posNext = posNext2,
02510                                                            cPrev = cCur, cCur = cNext, cNext = cNext2,
02511                                                            wbfPrev = wbfCur, wbfCur = wbfNext, wbfNext = wbfNext2)
02512         {
02513                 // Should there be a word boundary between 'position' and 'posNext' (or, more accurately,
02514                 // between src[posNext - 1] and src[posNext] --- any ignored characters between 'position'
02515                 // and 'posNext' are considered to belong to the previous character ('position'), not to the next one)?
02516                 posNext2 = posNext; WbFindNextNonIgnored(src, posNext2, srcEnd);
02517                 cNext2 = (posNext < posNext2 && posNext2 < srcEnd ? (int) src[TVecIdx(posNext2)] : -1);
02518                 wbfNext2 = GetWbFlags(cNext2);
                      // Each Test* macro executes 'continue' (i.e. decides "no break here") when all the
                      // named characters of the window carry the given flags.
02519 #define TestCurNext(curFlag, nextFlag) if ((wbfCur & curFlag) == curFlag && (wbfNext & nextFlag) == nextFlag) continue
02520 #define TestCurNext2(curFlag, nextFlag, next2Flag) if ((wbfCur & curFlag) == curFlag && (wbfNext & nextFlag) == nextFlag && (wbfNext2 & next2Flag) == next2Flag) continue
02521 #define TestPrevCurNext(prevFlag, curFlag, nextFlag) if ((wbfPrev & prevFlag) == prevFlag && (wbfCur & curFlag) == curFlag && (wbfNext & nextFlag) == nextFlag) continue
02522                 // WB3.  Do not break within CRLF.
02523                 if (cCur == 13 && cNext == 10) continue;
02524                 // WB5.  Do not break between most letters.
02525                 TestCurNext(ucfWbALetter, ucfWbALetter);
02526                 // WB6.  Do not break letters across certain punctuation.
02527                 TestCurNext2(ucfWbALetter, ucfWbMidLetter, ucfWbALetter);
02528                 // WB7.  Do not break letters across certain punctuation.
02529                 TestPrevCurNext(ucfWbALetter, ucfWbMidLetter, ucfWbALetter);
02530                 // WB8.  Do not break within sequences of digits, or digits adjacent to letters.
02531                 TestCurNext(ucfWbNumeric, ucfWbNumeric);
02532                 // WB9.  Do not break within sequences of digits, or digits adjacent to letters.
02533                 TestCurNext(ucfWbALetter, ucfWbNumeric);
02534                 // WB10.  Do not break within sequences of digits, or digits adjacent to letters.
02535                 TestCurNext(ucfWbNumeric, ucfWbALetter);
02536                 // WB11.  Do not break within sequences, such as "3.2" or "3.456,789".
02537                 TestPrevCurNext(ucfWbNumeric, ucfWbMidNum, ucfWbNumeric);
02538                 // WB12.  Do not break within sequences, such as "3.2" or "3.456,789".
02539                 TestCurNext2(ucfWbNumeric, ucfWbMidNum, ucfWbNumeric);
02540                 // WB13.  Do not break between Katakana.
02541                 TestCurNext(ucfWbKatakana, ucfWbKatakana);
02542                 // WB13a.  Do not break from extenders.
02543                 if ((wbfCur & (ucfWbALetter | ucfWbNumeric | ucfWbKatakana | ucfWbExtendNumLet)) != 0 &&
02544                         (wbfNext & ucfWbExtendNumLet) == ucfWbExtendNumLet) continue;
02545                 // WB13b.  Do not break from extenders.
02546                 if ((wbfCur & ucfWbExtendNumLet) == ucfWbExtendNumLet &&
02547                         (wbfNext & (ucfWbALetter | ucfWbNumeric | ucfWbKatakana)) != 0) continue;
02548                 // WB14.  Otherwise, break everywhere.
02549                 position = posNext; return true;
02550 #undef TestCurNext
02551 #undef TestCurNext2
02552 #undef TestPrevCurNext
02553         }
02554         // WB2.  Break at the end of text.
02555         IAssert(position == srcEnd);
02556         return true;
02557 }
02558 
02559 // ToDo: provide a more efficient implementation of this.
02560 template<typename TSrcVec>
02561 void TUniChDb::FindWordBoundaries(const TSrcVec& src, const size_t srcIdx, const size_t srcCount, TBoolV& dest) const
02562 {
02563         if (size_t(dest.Len()) != srcCount + 1) dest.Gen(TVecIdx(srcCount + 1));
02564         dest.PutAll(false);
02565         size_t position = srcIdx;
02566         dest[TVecIdx(position - srcIdx)] = true;
02567         while (position < srcIdx + srcCount)
02568         {
02569                 size_t oldPos = position;
02570                 FindNextWordBoundary(src, srcIdx, srcCount, position);
02571                 Assert(oldPos < position); Assert(position <= srcIdx + srcCount);
02572                 dest[TVecIdx(position - srcIdx)] = true;
02573         }
02574         Assert(dest[TVecIdx(srcCount)]);
02575 }
02576 
02577 //-----------------------------------------------------------------------------
02578 // TUniChDb -- sentence boundaries
02579 //-----------------------------------------------------------------------------
02580 
02581 template<typename TSrcVec>
02582 bool TUniChDb::CanSentenceEndHere(const TSrcVec& src, const size_t srcIdx, const size_t position) const
02583 {
02584         if (sbExTrie.Empty()) return true;
02585         // We'll move back from the position where a sentence-boundary is being considered.
02586         size_t pos = position;
02587         if (! WbFindPrevNonIgnored(src, srcIdx, pos)) return true;
02588         int c = (int) src[TVecIdx(pos)]; int sfb = GetSbFlags(c);
02589         // - Skip the Sep, if there is one.
02590         if ((c & ucfSbSep) == ucfSbSep) {
02591                 if (! WbFindPrevNonIgnored(src, srcIdx, pos)) return true;
02592                 c = (int) src[TVecIdx(pos)]; sfb = GetSbFlags(c); }
02593         // - Skip any Sp characters.
02594         while ((sfb & ucfSbSp) == ucfSbSp) {
02595                 if (! WbFindPrevNonIgnored(src, srcIdx, pos)) return true;
02596                 c = (int) src[TVecIdx(pos)]; sfb = GetSbFlags(c); }
02597         // - Skip any Close characters.
02598         while ((sfb & ucfSbSp) == ucfSbSp) {
02599                 if (! WbFindPrevNonIgnored(src, srcIdx, pos)) return true;
02600                 c = (int) src[TVecIdx(pos)]; sfb = GetSbFlags(c); }
02601         // - Skip any ATerm | STerm characters.
02602         while ((sfb & (ucfSbATerm | ucfSbSTerm)) != 0) {
02603                 if (! WbFindPrevNonIgnored(src, srcIdx, pos)) return true;
02604                 c = (int) src[TVecIdx(pos)]; sfb = GetSbFlags(c); }
02605         // Now start moving through the trie.
02606         int cLast = c, cButLast = -1, cButButLast = -1, len = 1, node = -1;
02607         while (true)
02608         {
02609                 bool atEnd = (! WbFindPrevNonIgnored(src, srcIdx, pos));
02610                 c = (atEnd ? -1 : (int) src[TVecIdx(pos)]);
02611                 TUniChCategory cat = GetCat(c);
02612                 if (atEnd || ! (cat == ucLetter || cat == ucNumber || cat == ucSymbol)) {
02613                         // Check if the suffix we've read so far is one of those that appear in the trie.
02614                         if (len == 1) return ! sbExTrie.Has1Gram(cLast);
02615                         if (len == 2) return ! sbExTrie.Has2Gram(cLast, cButLast);
02616                         IAssert(len >= 3); IAssert(node >= 0);
02617                         if (sbExTrie.IsNodeTerminal(node)) return false;
02618                         if (atEnd) return true; }
02619                 if (len == 1) { cButLast = c; len++; }
02620                 else if (len == 2) { cButButLast = c; len++;
02621                         // Now we have read the last three characters; start descending the suitable subtrie.
02622                         node = sbExTrie.Get3GramRoot(cLast, cButLast, cButButLast);
02623                         if (node < 0) return true; }
02624                 else {
02625                         // Descend down the trie.
02626                         node = sbExTrie.GetChild(node, c);
02627                         if (node < 0) return true; }
02628         }
02629         //return true;
02630 }
02631 
// Finds the next sentence boundary after 'position' in the text src[srcIdx .. srcIdx + srcCount - 1].
//  - If 'position' lies before srcIdx, it is set to srcIdx and true is returned (SB1).
//  - If 'position' is at or beyond the end of the text, false is returned and 'position' is left unchanged.
//  - Otherwise 'position' is advanced to the next boundary (at the latest the end of the text, SB2)
//    and true is returned.
// Two small automata are maintained while scanning: a "peek-back" state recording whether the text up to
// the current character ends with (STerm | ATerm) Close* Sp* Sep?, and a "peek-ahead" state recording
// whether the next character that is OLetter/Upper/Lower/Sep/STerm/ATerm is a Lower.
// A candidate boundary is additionally vetoed when CanSentenceEndHere returns false.
// The "SBn" labels name the individual sentence-boundary rules being applied
// (presumably those of Unicode Standard Annex #29 -- confirm which revision is targeted).
// NOTE(review): WbFindPrevNonIgnored / WbFindNextNonIgnored are defined elsewhere; judging from their
// use here they move an index to the neighbouring non-ignored character, and the former returns
// false when there is no such character.
02632 template<typename TSrcVec>
02633 bool TUniChDb::FindNextSentenceBoundary(const TSrcVec& src, const size_t srcIdx, const size_t srcCount, size_t &position) const
02634 {
02635         // SB1.  Break at the start of text.
02636         if (position < srcIdx) { position = srcIdx; return true; }
02637         // If we are beyond the end of the text, there aren't any sentence breaks left.
02638         const size_t srcEnd = srcIdx + srcCount;
02639         if (position >= srcEnd) return false;
02640         // If 'position' is currently at an ignored character, move it back to the last nonignored character.
02641         size_t origPos = position;
02642         if (IsWbIgnored(src[TVecIdx(position)])) {
02643                 if (! WbFindPrevNonIgnored(src, srcIdx, position))
02644                         position = origPos;
02645         }
02646         // Determine the previous nonignored character (before 'position').
02647         size_t posPrev = position;
02648         if (! WbFindPrevNonIgnored(src, srcIdx, posPrev)) posPrev = position;
02649         // Sec 6.2.  Allow a break between Sep and an ignored character.
02650         if (position == origPos && position + 1 < srcEnd && IsSbSep(src[TVecIdx(position)]) && IsWbIgnored(src[TVecIdx(position + 1)])) { position += 1; return true; }
02651         // Determine the next nonignored character (after 'position').
02652         size_t posNext = position; WbFindNextNonIgnored(src, posNext, srcEnd);
02653         size_t posNext2;
              // cPrev / cCur / cNext hold the code points at posPrev / position / posNext, or -1 where
              // there is no such character; sbf* hold the corresponding GetSbFlags values.
02654         int cPrev = (posPrev < position ? (int) src[TVecIdx(posPrev)] : -1), cCur = (position < srcEnd ? (int) src[TVecIdx(position)] : -1);
02655         int cNext = (position < posNext && posNext < srcEnd ? (int) src[TVecIdx(posNext)] : -1);
02656         int sbfPrev = GetSbFlags(cPrev), sbfCur = GetSbFlags(cCur), sbfNext = GetSbFlags(cNext);
02657         int cNext2, sbfNext2;
02658         // Initialize the state of the peek-back automaton.
              // This is done by scanning backwards from 'position' over Sep? Sp* Close* and checking
              // whether an ATerm or STerm precedes them.
02659         typedef enum { stInit, stATerm, stATermSp, stATermSep, stSTerm, stSTermSp, stSTermSep } TPeekBackState;
02660         TPeekBackState backState;
02661         {
02662                 size_t pos = position;
02663                 bool wasSep = false, wasSp = false, wasATerm = false, wasSTerm = false;
                      // This 'while' runs at most once; it only exists so that 'break' can leave the scan early.
02664                 while (true)
02665                 {
02666                         if (! WbFindPrevNonIgnored(src, srcIdx, pos)) break;
02667                         // Skip at most one Sep.
02668                         int cp = (int) src[TVecIdx(pos)]; int sbf = GetSbFlags(cp);
02669                         if ((sbf & ucfSbSep) == ucfSbSep) {
02670                                 wasSep = true;
02671                                 if (! WbFindPrevNonIgnored(src, srcIdx, pos)) break;
02672                                 cp = (int) src[TVecIdx(pos)]; sbf = GetSbFlags(cp); }
02673                         // Skip zero or more Sp's.
02674                         bool stop = false;
02675                         while ((sbf & ucfSbSp) == ucfSbSp) {
02676                                 wasSp = true;
02677                                 if (! WbFindPrevNonIgnored(src, srcIdx, pos)) { stop = true; break; }
02678                                 cp = (int) src[TVecIdx(pos)]; sbf = GetSbFlags(cp); }
02679                         if (stop) break;
02680                         // Skip zero or more Close's.
02681                         while ((sbf & ucfSbClose) == ucfSbClose) {
02682                                 if (! WbFindPrevNonIgnored(src, srcIdx, pos)) { stop = true; break; }
02683                                 cp = (int) src[TVecIdx(pos)]; sbf = GetSbFlags(cp); }
02684                         if (stop) break;
02685                         // Process an ATerm or STerm.
02686                         wasATerm = ((sbf & ucfSbATerm) == ucfSbATerm);
02687                         wasSTerm = ((sbf & ucfSbSTerm) == ucfSbSTerm);
02688                         break;
02689                 }
02690                 if (wasATerm) backState = (wasSep ? stATermSep : wasSp ? stATermSp : stATerm);
02691                 else if (wasSTerm) backState = (wasSep ? stSTermSep : wasSp ? stSTermSp : stSTerm);
02692                 else backState = stInit;
02693         }
02694         // Initialize the state of the peek-ahead automaton.  This state tells us what follows
02695         // after we skip all contiguous characters from the complement of the set {OLetter, Upper, Lower, Sep, STerm, ATerm}.
02696         // Thus, the next character is either OLetter, Upper, Lower, Sep, STerm, ATerm, or the end of the input string.
02697         // Our peek-ahead automaton must tell us whether it is Lower or something else.
02698         typedef enum { stUnknown, stLower, stNotLower } TPeekAheadState;
02699         TPeekAheadState aheadState = stUnknown;
02700         // Slide a four-character window (prev, cur, next, next2) over the nonignored characters;
              // the increment part of the 'for' shifts the window by one character.
02701         for ( ; position < srcEnd; posPrev = position, position = posNext, posNext = posNext2,
02702                                                            cPrev = cCur, cCur = cNext, cNext = cNext2,
02703                                                            sbfPrev = sbfCur, sbfCur = sbfNext, sbfNext = sbfNext2)
02704         {
02705                 // Should there be a sentence boundary between 'position' and 'posNext' (or, more accurately,
02706                 // between src[posNext - 1] and src[posNext] --- any ignored characters between 'position'
02707                 // and 'posNext' are considered to belong to the previous character ('position'), not to the next one)?
02708                 posNext2 = posNext; WbFindNextNonIgnored(src, posNext2, srcEnd);
02709                 cNext2 = (posNext < posNext2 && posNext2 < srcEnd ? (int) src[TVecIdx(posNext2)] : -1);
02710                 sbfNext2 = GetSbFlags(cNext2);
02711                 // Update the peek-back automaton.
                      // Trans(X, S) moves to state stS if the current character has flag ucfSbX; the first
                      // matching Trans in a case wins, and if none matches the state falls back to stInit.
02712 #define TestCur(curFlag) ((sbfCur & ucfSb##curFlag) == ucfSb##curFlag)
02713 #define Trans(curFlag, newState) if (TestCur(curFlag)) { backState = st##newState; break; }
02714                 switch (backState) {
02715                         case stInit: Trans(ATerm, ATerm); Trans(STerm, STerm); break;
02716                         case stATerm: Trans(Sp, ATermSp); Trans(Sep, ATermSep); Trans(ATerm, ATerm); Trans(STerm, STerm); Trans(Close, ATerm); backState = stInit; break;
02717                         case stSTerm: Trans(Sp, STermSp); Trans(Sep, STermSep); Trans(ATerm, ATerm); Trans(STerm, STerm); Trans(Close, STerm); backState = stInit; break;
02718                         case stATermSp: Trans(Sp, ATermSp); Trans(Sep, ATermSep); Trans(ATerm, ATerm); Trans(STerm, STerm); backState = stInit; break;
02719                         case stSTermSp: Trans(Sp, STermSp); Trans(Sep, STermSep); Trans(ATerm, ATerm); Trans(STerm, STerm); backState = stInit; break;
02720                         case stATermSep: Trans(ATerm, ATerm); Trans(STerm, STerm); backState = stInit; break;
02721                         case stSTermSep: Trans(ATerm, ATerm); Trans(STerm, STerm); backState = stInit; break;
02722                         default: IAssert(false); }
02723 #undef Trans
02724 #undef TestCur
02725                 // Update the peek-ahead automaton.
02726 #define IsPeekAheadSkippable(sbf) ((sbf & (ucfSbOLetter | ucfSbUpper | ucfSbLower | ucfSbSep | ucfSbSTerm | ucfSbATerm)) == 0)
02727                 if (! IsPeekAheadSkippable(sbfCur)) {
02728                         bool isLower = ((sbfCur & ucfSbLower) == ucfSbLower);
02729                         if (aheadState == stLower) IAssert(isLower);
02730                         else if (aheadState == stNotLower) IAssert(! isLower);
02731                         // We haven't peeked ahead farther than this so far -- invalidate the state.
02732                         aheadState = stUnknown; }
02733                 if (aheadState == stUnknown)
02734                 {
02735                         // Peek ahead to the next non-peekahead-skippable character.
02736                         size_t pos = posNext;
02737                         while (pos < srcEnd) {
02738                                 int cp = (int) src[TVecIdx(pos)]; int sbf = GetSbFlags(cp);
02739                                 if (! IsPeekAheadSkippable(sbf)) {
02740                                         if ((sbf & ucfSbLower) == ucfSbLower) aheadState = stLower;
02741                                         else aheadState = stNotLower;
02742                                         break; }
02743                                 WbFindNextNonIgnored(src, pos, srcEnd); }
                              // Reaching the end of the text counts as "not followed by Lower".
02744                         if (! (pos < srcEnd)) aheadState = stNotLower;
02745                 }
02746 #undef IsPeekAheadSkippable
02747                 // The Test* macros execute 'continue' (i.e. decide "no break here") when all the named
                      // characters of the window carry the given flags.
                      // NOTE(review): TestCurNext2 is defined but not used in this loop.
02748 #define TestCurNext(curFlag, nextFlag) if ((sbfCur & curFlag) == curFlag && (sbfNext & nextFlag) == nextFlag) continue
02749 #define TestCurNext2(curFlag, nextFlag, next2Flag) if ((sbfCur & curFlag) == curFlag && (sbfNext & nextFlag) == nextFlag && (sbfNext2 & next2Flag) == next2Flag) continue
02750 #define TestPrevCurNext(prevFlag, curFlag, nextFlag) if ((sbfPrev & prevFlag) == prevFlag && (sbfCur & curFlag) == curFlag && (sbfNext & nextFlag) == nextFlag) continue
02751                 // SB3.  Do not break within CRLF.
02752                 if (cCur == 13 && cNext == 10) continue;
02753                 // SB4.  Break after paragraph separators.
02754                 if ((sbfCur & ucfSbSep) == ucfSbSep) {
02755                         if (! CanSentenceEndHere(src, srcIdx, position)) continue;
02756                         position = posNext; return true; }
02757                 // Do not break after ambiguous terminators like period, if they are immediately followed by a number
02758                 // or lowercase letter, if they are between uppercase letters, or if the first following letter
02759                 // (optionally after certain punctuation) is lowercase.  For example, a period may be an abbreviation
02760                 // or numeric period, and thus may not mark the end of a sentence.
02761                 TestCurNext(ucfSbATerm, ucfSbNumeric); // SB6
02762                 TestPrevCurNext(ucfSbUpper, ucfSbATerm, ucfSbUpper); // SB7
02763                 // SB8a.  (STerm | ATerm) Close* Sp* [do not break] (STerm | ATerm)
02764                 if ((backState == stATerm || backState == stATermSp || backState == stSTerm || backState == stSTermSp) &&
02765                         (sbfNext & (ucfSbSTerm | ucfSbATerm)) != 0) continue;
02766                 // SB8*.  ATerm Close* Sp* [do not break] ( ! (OLetter | Upper | Lower | Sep | STerm | ATerm) )* Lower
02767                 if ((backState == stATerm || backState == stATermSp) && aheadState == stLower) continue;
02768                 // Break after sentence terminators, but include closing punctuation, trailing spaces, and a paragraph separator (if present).
02769                 // SB9. ( STerm | ATerm ) Close* [do not break] ( Close | Sp | Sep )
02770                 if ((backState == stATerm || backState == stSTerm) && (sbfNext & (ucfSbClose | ucfSbSp | ucfSbSep)) != 0) continue;
02771                 // SB10. ( STerm | ATerm ) Close* Sp* [do not break] ( Sp | Sep )
02772                 // SB11*. ( STerm | ATerm ) Close* Sp* Sep? [do break]
02773                 if (backState == stATerm || backState == stATermSp || backState == stATermSep || backState == stSTerm || backState == stSTermSp || backState == stSTermSep) {
02774                         if ((sbfNext & (ucfSbSp | ucfSbSep)) != 0) continue; // SB10
02775                         if (! CanSentenceEndHere(src, srcIdx, position)) continue;
02776                         position = posNext; return true; } // SB11
02777                 // SB12.  Otherwise, do not break.
02778                 continue;
02779 #undef TestCurNext
02780 #undef TestCurNext2
02781 #undef TestPrevCurNext
02782         }
02783         // SB2.  Break at the end of text.
02784         IAssert(position == srcEnd);
02785         return true;
02786 }
02787 
02788 // ToDo: provide a more efficient implementation of this.
02789 template<typename TSrcVec>
02790 void TUniChDb::FindSentenceBoundaries(const TSrcVec& src, const size_t srcIdx, const size_t srcCount, TBoolV& dest) const
02791 {
02792         if (size_t(dest.Len()) != srcCount + 1) dest.Gen(TVecIdx(srcCount + 1));
02793         dest.PutAll(false);
02794         size_t position = srcIdx;
02795         dest[TVecIdx(position - srcIdx)] = true;
02796         while (position < srcIdx + srcCount)
02797         {
02798                 size_t oldPos = position;
02799                 FindNextSentenceBoundary(src, srcIdx, srcCount, position);
02800                 Assert(oldPos < position); Assert(position <= srcIdx + srcCount);
02801                 dest[TVecIdx(position - srcIdx)] = true;
02802         }
02803         Assert(dest[TVecIdx(srcCount)]);
02804 }
02805 
02806 //-----------------------------------------------------------------------------
02807 // TUniChDb -- case conversions
02808 //-----------------------------------------------------------------------------
02809 
02810 template<typename TSrcVec, typename TDestCh>
02811 void TUniChDb::GetCaseConverted(const TSrcVec& src, size_t srcIdx, const size_t srcCount,
02812                                                                 TVec<TDestCh>& dest, const bool clrDest,
02813                                                                 const TUniChDb::TCaseConversion how,
02814                                                                 const bool turkic, const bool lithuanian) const
02815 {
02816         const TIntIntVH &specials = (how == ccUpper ? specialCasingUpper : how == ccLower ? specialCasingLower : how == ccTitle ? specialCasingTitle : *((TIntIntVH *) 0));
02817         if (clrDest) dest.Clr();
02818         enum {
02819                 GreekCapitalLetterSigma = 0x3a3,
02820                 GreekSmallLetterSigma = 0x3c3,
02821                 GreekSmallLetterFinalSigma = 0x3c2,
02822                 LatinCapitalLetterI = 0x49,
02823                 LatinCapitalLetterJ = 0x4a,
02824                 LatinCapitalLetterIWithOgonek = 0x12e,
02825                 LatinCapitalLetterIWithGrave = 0xcc,
02826                 LatinCapitalLetterIWithAcute = 0xcd,
02827                 LatinCapitalLetterIWithTilde = 0x128,
02828                 LatinCapitalLetterIWithDotAbove = 0x130,
02829                 LatinSmallLetterI = 0x69,
02830                 CombiningDotAbove = 0x307
02831         };
02832         //
02833         bool seenCased = false, seenTwoCased = false; int cpFirstCased = -1;
02834         size_t nextWordBoundary = srcIdx;
02835         TBoolV wordBoundaries; bool wbsKnown = false;
02836         for (const size_t origSrcIdx = srcIdx, srcEnd = srcIdx + srcCount; srcIdx < srcEnd; )
02837         {
02838                 int cp = src[TVecIdx(srcIdx)]; srcIdx++;
02839                 //if (turkic && cp == 0x130 && how == ccLower) printf("!");
02840                 // For conversion to titlecase, the first cased character of each word
02841                 // must be converted to titlecase; everything else must be converted
02842                 // to lowercase.
02843                 TUniChDb::TCaseConversion howHere;
02844                 if (how != ccTitle) howHere = how;
02845                 else {
02846                         if (srcIdx - 1 == nextWordBoundary) { // A word starts/ends here.
02847                                 seenCased = false; seenTwoCased = false; cpFirstCased = -1;
02848                                 size_t next = nextWordBoundary; FindNextWordBoundary(src, origSrcIdx, srcCount, next);
02849                                 IAssert(next > nextWordBoundary); nextWordBoundary = next; }
02850                         bool isCased = IsCased(cp);
02851                         if (isCased && ! seenCased) { howHere = ccTitle; seenCased = true; cpFirstCased = cp; }
02852                         else { howHere = ccLower;
02853                                 if (isCased && seenCased) seenTwoCased = true; }
02854                 }
02855                 // First, process the conditional mappings from SpecialCasing.txt.
02856                 // These will be processed in code -- they were ignored while
02857                 // we were reading SpecialCasing.txt itself.
02858                 if (cp == GreekCapitalLetterSigma && howHere == ccLower)
02859                 {
02860                         // SpecialCasing.txt mentions the 'FinalSigma' condition, but sec. 3.13 of
02861                         // the standard doesn't define it.  We'll use FinalCased instead.
02862                         // FinalCased: within the closest word boundaries containing C,
02863                         // there is a cased letter before C, and there is no cased letter after C.
02864                         //size_t nextBoundary = srcIdx - 1; FindNextWordBoundary(src, srcIdx, srcCount, nextBoundary);
02865                         if (! wbsKnown) { FindWordBoundaries(src, origSrcIdx, srcCount, wordBoundaries); wbsKnown = true; }
02866                         size_t srcIdx2 = srcIdx; bool casedAfter = false;
02867                         if (how == ccTitle)
02868                                 printf("!");
02869                         //while (srcIdx2 < nextBoundary)
02870                         while (! wordBoundaries[TVecIdx(srcIdx2 - origSrcIdx)])
02871                         {
02872                                 int cp2 = src[TVecIdx(srcIdx2)]; srcIdx2++;
02873                                 if (IsCased(cp2)) { casedAfter = true; break; }
02874                         }
02875                         if (! casedAfter)
02876                         {
02877                                 //size_t prevBoundary = srcIdx - 1;
02878                                 //FindPreviousWordBoundary(src, srcIdx, srcCount, prevBoundary);
02879                                 srcIdx2 = srcIdx - 1; bool casedBefore = false;
02880                                 //while (prevBoundary < srcIdx2)
02881                                 while (! wordBoundaries[TVecIdx(srcIdx2 - origSrcIdx)])
02882                                 {
02883                                         --srcIdx2; int cp2 = src[TVecIdx(srcIdx2)];
02884                                         if (IsCased(cp2)) { casedBefore = true; break; }
02885                                 }
02886                                 if (casedBefore) {
02887                                         // Now we have a FinalCased character.
02888                                         dest.Add(GreekSmallLetterFinalSigma); Assert(howHere == ccLower); continue; }
02889                         }
02890                         // If we got here, add a non-final sigma.
02891                         dest.Add(GreekSmallLetterSigma); continue;
02892                 }
02893                 else if (lithuanian)
02894                 {
02895                         if (howHere == ccLower)
02896                         {
02897                                 if (cp == LatinCapitalLetterI || cp == LatinCapitalLetterJ || cp == LatinCapitalLetterIWithOgonek)
02898                                 {
02899                                         bool moreAbove = false;
02900                                         for (size_t srcIdx2 = srcIdx; srcIdx2 < srcEnd; )
02901                                         {
02902                                                 const int cp2 = src[TVecIdx(srcIdx2)]; srcIdx2++;
02903                                                 const int cc2 = GetCombiningClass(cp2);
02904                                                 if (cc2 == TUniChInfo::ccStarter) break;
02905                                                 if (cc2 == TUniChInfo::ccAbove) { moreAbove = true; break; }
02906                                         }
02907                                         if (moreAbove)
02908                                         {
02909                                                 if (cp == LatinCapitalLetterI) { dest.Add(0x69); dest.Add(0x307); continue; }
02910                                                 if (cp == LatinCapitalLetterJ) { dest.Add(0x6a); dest.Add(0x307); continue; }
02911                                                 if (cp == LatinCapitalLetterIWithOgonek) { dest.Add(0x12f); dest.Add(0x307); continue; }
02912                                         }
02913                                 }
02914                                 else if (cp == LatinCapitalLetterIWithGrave) { dest.Add(0x69); dest.Add(0x307); dest.Add(0x300); continue; }
02915                                 else if (cp == LatinCapitalLetterIWithAcute) { dest.Add(0x69); dest.Add(0x307); dest.Add(0x301); continue; }
02916                                 else if (cp == LatinCapitalLetterIWithTilde) { dest.Add(0x69); dest.Add(0x307); dest.Add(0x303); continue; }
02917                         }
02918                         if (cp == CombiningDotAbove)
02919                         {
02920                                 // Lithuanian, howHere != ccLower.
02921                                 // AfterSoftDotted := the last preceding character with a combining class
02922                                 // of zero before C was Soft_Dotted, and there is no intervening combining
02923                                 // character class 230 (ABOVE).
02924                                 bool afterSoftDotted = false;
02925                                 size_t srcIdx2 = srcIdx - 1; // now srcIdx2 is the index from which we got 'cp'
02926                                 while (origSrcIdx < srcIdx2)
02927                                 {
02928                                         --srcIdx2; int cp2 = src[TVecIdx(srcIdx2)];
02929                                         int cc2 = GetCombiningClass(cp2);
02930                                         if (cc2 == TUniChInfo::ccAbove) break;
02931                                         if (cc2 == TUniChInfo::ccStarter) {
02932                                                 afterSoftDotted = IsSoftDotted(cp2); break; }
02933                                 }
02934                                 if (afterSoftDotted)
02935                                 {
02936                                         Assert(lithuanian);
02937                                         // Remove DOT ABOVE after "i" with upper or titlecase.
02938                                         // - Note: but this must only be done if that "i" was actually placed into uppercase (if how == ccTitle,
02939                                         //   the "i" may have been kept lowercase and thus we shouldn't remove the dot).
02940                                         if (how == ccLower) { dest.Add(0x307); continue; }
02941                                         if (how == ccUpper) continue;
02942                                         Assert(how == ccTitle);
02943                                         Assert(howHere == ccLower); // because CombiningDotAbove is not a cased character
02944                                         if (seenCased && ! seenTwoCased) continue; // The "i" has been placed into uppercase; thus, remove the dot.
02945                                         dest.Add(0x307); continue;
02946                                 }
02947                         }
02948                 }
02949                 else if (turkic) // language code 'tr' (Turkish) and 'az' (Azeri)
02950                 {
02951                         // I and i-dotless; I-dot and i are case pairs in Turkish and Azeri
02952                         // The following rules handle those cases.
02953                         if (cp == LatinCapitalLetterIWithDotAbove) {
02954                                 dest.Add(howHere == ccLower ? 0x69 : 0x130); continue; }
02955                         // When lowercasing, remove dot_above in the sequence I + dot_above,
02956                         // which will turn into i.  This matches the behavior of the
02957                         // canonically equivalent I-dot_above.
02958                         else if (cp == CombiningDotAbove)
02959                         {
02960                                 // AfterI: the last preceding base character was an uppercase I,
02961                                 // and there is no intervening combining character class 230 (ABOVE).
02962                                 bool afterI = false;
02963                                 size_t srcIdx2 = srcIdx - 1; // now srcIdx2 is the index from which we got 'cp'
02964                                 while (origSrcIdx < srcIdx2)
02965                                 {
02966                                         --srcIdx2; int cp2 = src[TVecIdx(srcIdx2)];
02967                                         if (cp2 == LatinCapitalLetterI) { afterI = true; break; }
02968                                         int cc2 = GetCombiningClass(cp2);
02969                                         if (cc2 == TUniChInfo::ccAbove || cc2 == TUniChInfo::ccStarter) break;
02970                                 }
02971                                 if (afterI) {
02972                                         if (how == ccTitle && seenCased && ! seenTwoCased) {
02973                                                 // Sec. 3.13 defines title-casing in an unusual way: find the first cased character in each word;
02974                                                 // if found, map it to titlecase; otherwise, map all characters in that word to lowercase.
02975                                                 // This suggests that if a cased character is found, others in that word should be left alone.
02976                                                 // This seems unusual; we map all other characters to lowercase instead.
02977                                                 // But this leads to problems with e.g. I followed by dot-above (U+0307): since the dot-above
02978                                                 // is not the first cased character (it isn't even cased), we attempt to set it to lowercase;
02979                                                 // but since afterI is also true here, this would mean deleting it.  Thus our titlecased
02980                                                 // form of "I followed by dot-above" would be just "I", which is clearly wrong.
02981                                                 // So we treat this as a special case here.
02982                                                 IAssert(cpFirstCased == LatinCapitalLetterI);
02983                                                 dest.Add(0x307); continue; }
02984                                         if (howHere != ccLower) dest.Add(0x307);
02985                                         continue; }
02986                         }
02987                         // When lowercasing, unless an I is before a dot_above,
02988                         // it turns into a dotless i.
02989                         else if (cp == LatinCapitalLetterI)
02990                         {
02991                                 // BeforeDot: C is followed by U+0307 (combining dot above).
02992                                 // Any sequence of characters with a combining class that is
02993                                 // neither 0 nor 230 may intervene between the current character
02994                                 // and the combining dot above.
02995                                 bool beforeDot = false;
02996                                 for (size_t srcIdx2 = srcIdx; srcIdx2 < srcEnd; )
02997                                 {
02998                                         const int cp2 = src[TVecIdx(srcIdx2)]; srcIdx2++;
02999                                         if (cp2 == 0x307) { beforeDot = true; break; }
03000                                         const int cc2 = GetCombiningClass(cp2);
03001                                         if (cc2 == TUniChInfo::ccStarter || cc2 == TUniChInfo::ccAbove) break;
03002                                 }
03003                                 if (! beforeDot) {
03004                                         dest.Add(howHere == ccLower ? 0x131 : 0x49); continue; }
03005                         }
03006                         // When uppercasing, i turns into a dotted capital I.
03007                         else if (cp == LatinSmallLetterI)
03008                         {
03009                                 dest.Add(howHere == ccLower ? 0x69 : 0x130); continue;
03010                         }
03011                 }
03012                 // Try to use the unconditional mappings.
03013                 const TIntIntVH &specHere = (
03014                         howHere == how ? specials :
03015                         howHere == ccLower ? specialCasingLower :
03016                         howHere == ccTitle ? specialCasingTitle :
03017                         howHere == ccUpper ? specialCasingUpper : *((TIntIntVH *) 0));
03018                 int i = specHere.GetKeyId(cp);
03019                 if (i >= 0) { TUniCaseFolding::AppendVector(specHere[i], dest); continue; }
03020                 // Try to use the simple (one-character) mappings.
03021                 i = h.GetKeyId(cp);
03022                 if (i >= 0) {
03023                         const TUniChInfo &ci = h[i];
03024                         int cpNew = (
03025                                 howHere == ccLower ? ci.simpleLowerCaseMapping :
03026                                 howHere == ccUpper ? ci.simpleUpperCaseMapping :
03027                                                                          ci.simpleTitleCaseMapping);
03028                         if (cpNew < 0) cpNew = cp;
03029                         dest.Add(cpNew); continue; }
03030                 // As a final resort, leave 'cp' unchanged.
03031                 dest.Add(cp);
03032         }
03033 }
03034 
03035 template<typename TSrcVec, typename TDestCh>
03036 void TUniChDb::GetSimpleCaseConverted(const TSrcVec& src, size_t srcIdx, const size_t srcCount,
03037         TVec<TDestCh>& dest, const bool clrDest, const TCaseConversion how) const
03038 {
03039         if (clrDest) dest.Clr();
03040         bool seenCased = false; size_t nextWordBoundary = srcIdx;
03041         for (const size_t origSrcIdx = srcIdx, srcEnd = srcIdx + srcCount; srcIdx < srcEnd; )
03042         {
03043                 const int cp = src[TVecIdx(srcIdx)]; srcIdx++;
03044                 int i = h.GetKeyId(cp); if (i < 0) { dest.Add(cp); continue; }
03045                 const TUniChInfo &ci = h[i];
03046                 // With titlecasing, the first cased character of each word must be put into titlecase,
03047                 // all others into lowercase.  This is what the howHere variable is for.
03048                 TUniChDb::TCaseConversion howHere;
03049                 if (how != ccTitle) howHere = how;
03050                 else {
03051                         if (srcIdx - 1 == nextWordBoundary) { // A word starts/ends here.
03052                                 seenCased = false;
03053                                 size_t next = nextWordBoundary; FindNextWordBoundary(src, origSrcIdx, srcCount, next);
03054                                 IAssert(next > nextWordBoundary); nextWordBoundary = next; }
03055                         bool isCased = IsCased(cp);
03056                         if (isCased && ! seenCased) { howHere = ccTitle; seenCased = true; }
03057                         else howHere = ccLower;
03058                 }
03059                 int cpNew = (howHere == ccTitle ? ci.simpleTitleCaseMapping : howHere == ccUpper ? ci.simpleUpperCaseMapping : ci.simpleLowerCaseMapping);
03060                 if (cpNew < 0) cpNew = cp;
03061                 dest.Add(cpNew);
03062         }
03063 }
03064 
03065 template<typename TSrcVec>
03066 void TUniChDb::ToSimpleCaseConverted(TSrcVec& src, size_t srcIdx, const size_t srcCount, const TCaseConversion how) const
03067 {
03068         bool seenCased = false; size_t nextWordBoundary = srcIdx;
03069         for (const size_t origSrcIdx = srcIdx, srcEnd = srcIdx + srcCount; srcIdx < srcEnd; srcIdx++)
03070         {
03071                 const int cp = src[TVecIdx(srcIdx)];
03072                 int i = h.GetKeyId(cp); if (i < 0) continue;
03073                 const TUniChInfo &ci = h[i];
03074                 // With titlecasing, the first cased character of each word must be put into titlecase,
03075                 // all others into lowercase.  This is what the howHere variable is for.
03076                 TUniChDb::TCaseConversion howHere;
03077                 if (how != ccTitle) howHere = how;
03078                 else {
03079                         if (srcIdx == nextWordBoundary) { // A word starts/ends here.
03080                                 seenCased = false;
03081                                 size_t next = nextWordBoundary; FindNextWordBoundary(src, origSrcIdx, srcCount, next);
03082                                 IAssert(next > nextWordBoundary); nextWordBoundary = next; }
03083                         bool isCased = IsCased(cp);
03084                         if (isCased && ! seenCased) { howHere = ccTitle; seenCased = true; }
03085                         else howHere = ccLower;
03086                 }
03087                 int cpNew = (howHere == ccTitle ? ci.simpleTitleCaseMapping : howHere == ccUpper ? ci.simpleUpperCaseMapping : ci.simpleLowerCaseMapping);
03088                 if (cpNew >= 0) src[TVecIdx(srcIdx)] = cpNew;
03089         }
03090 }
03091 
03092 //-----------------------------------------------------------------------------
03093 // TUniChDb -- composition, decomposition, normal forms
03094 //-----------------------------------------------------------------------------
03095 
03096 template<typename TDestCh>
03097 void TUniChDb::AddDecomposition(const int codePoint, TVec<TDestCh>& dest, const bool compatibility) const
03098 {
03099         if (HangulSBase <= codePoint && codePoint < HangulSBase + HangulSCount)
03100         {
03101                 // UAX #15, sec. 16: Hangul decomposition
03102                 const int SIndex = codePoint - HangulSBase;
03103                 const int L = HangulLBase + SIndex / HangulNCount;
03104                 const int V = HangulVBase + (SIndex % HangulNCount) / HangulTCount;
03105                 const int T = HangulTBase + (SIndex % HangulTCount);
03106                 dest.Add(L); dest.Add(V);
03107                 if (T != HangulTBase) dest.Add(T);
03108                 return;
03109         }
03110         int i = h.GetKeyId(codePoint); if (i < 0) { dest.Add(codePoint); return; }
03111         const TUniChInfo &ci = h[i];
03112         int ofs = ci.decompOffset; if (ofs < 0) { dest.Add(codePoint); return; }
03113         if ((! compatibility) && ci.IsCompatibilityDecomposition()) { dest.Add(codePoint); return; }
03114         while (true) {
03115                 int cp = decompositions[ofs++]; if (cp < 0) return;
03116                 AddDecomposition(cp, dest, compatibility); }
03117 }
03118 
03119 template<typename TSrcVec, typename TDestCh>
03120 void TUniChDb::Decompose(const TSrcVec& src, size_t srcIdx, const size_t srcCount,
03121                 TVec<TDestCh>& dest, const bool compatibility, bool clrDest) const
03122 {
03123         if (clrDest) dest.Clr();
03124         const size_t destStart = dest.Len()/*, srcEnd = srcIdx + srcCount*/;
03125         // Decompose the string.
03126         while (srcIdx < srcCount) {
03127                 AddDecomposition(src[TVecIdx(srcIdx)], dest, compatibility); srcIdx++; }
03128         // Rearrange the decomposed string into canonical order.
03129         for (size_t destIdx = destStart, destEnd = dest.Len(); destIdx < destEnd; )
03130         {
03131                 size_t j = destIdx;
03132                 int cp = dest[TVecIdx(destIdx)]; destIdx++;
03133                 int cpCls = GetCombiningClass(cp);
03134                 if (cpCls == TUniChInfo::ccStarter) continue;
03135                 while (destStart < j && GetCombiningClass(dest[TVecIdx(j - 1)]) > cpCls) {
03136                         dest[TVecIdx(j)] = dest[TVecIdx(j - 1)]; j--; }
03137                 dest[TVecIdx(j)] = cp;
03138         }
03139 }
03140 
03141 template<typename TSrcVec, typename TDestCh>
03142 void TUniChDb::DecomposeAndCompose(const TSrcVec& src, size_t srcIdx, const size_t srcCount,
03143                 TVec<TDestCh>& dest, bool compatibility, bool clrDest) const
03144 {
03145         if (clrDest) dest.Clr();
03146         TIntV temp;
03147         Decompose(src, srcIdx, srcCount, temp, compatibility);
03148         Compose(temp, 0, temp.Len(), dest, clrDest);
03149 }
03150 
03151 template<typename TSrcVec, typename TDestCh>
03152 void TUniChDb::Compose(const TSrcVec& src, size_t srcIdx, const size_t srcCount,
03153                 TVec<TDestCh>& dest, bool clrDest) const
03154 {
03155         if (clrDest) dest.Clr();
03156         bool lastStarterKnown = false; // has a starter been encountered yet?
03157         size_t lastStarterPos = size_t(-1);  // the index (in 'dest') of the last starter
03158         int cpLastStarter = -1; // the codepoint of the last starter (i.e. cpLastStarter == dest[lastStarterPos])
03159         const size_t srcEnd = srcIdx + srcCount;
03160         int ccMax = -1; // The highest combining class among the characters since the last starter.
03161         while (srcIdx < srcEnd)
03162         {
03163                 const int cp = src[TVecIdx(srcIdx)]; srcIdx++;
03164                 const int cpClass = GetCombiningClass(cp);
03165                 //int cpCombined = -1;
03166                 // If there is a starter with which 'cp' can be combined, and from which it is not blocked
03167                 // by some intermediate character, we can try to combine them.
03168                 if (lastStarterKnown && ccMax < cpClass)
03169                 {
03170                         int j = inverseDec.GetKeyId(TIntPr(cpLastStarter, cp));
03171                         int cpCombined = -1;
03172                         do {
03173                                 // Try to look up a composition in the inverseDec table.
03174                                 if (j >= 0) { cpCombined = inverseDec[j]; break; }
03175                                 // UAX #15, sec. 16: Hangul composition
03176                                 // - Try to combine L and V.
03177                                 const int LIndex = cpLastStarter - HangulLBase;
03178                                 if (0 <= LIndex && LIndex < HangulLCount) {
03179                                         const int VIndex = cp - HangulVBase;
03180                                         if (0 <= VIndex && VIndex < HangulVCount) {
03181                                                 cpCombined = HangulSBase + (LIndex * HangulVCount + VIndex) * HangulTCount;
03182                                                 break; } }
03183                                 // - Try to combine LV and T.
03184                                 const int SIndex = cpLastStarter - HangulSBase;
03185                                 if (0 <= SIndex && SIndex < HangulSCount && (SIndex % HangulTCount) == 0)
03186                                 {
03187                                         const int TIndex = cp - HangulTBase;
03188                                         if (0 <= TIndex && TIndex < HangulTCount) {
03189                                                 cpCombined = cpLastStarter + TIndex;
03190                                                 break; }
03191                                 }
03192                         } while (false);
03193                         // If a combining character has been found, use it to replace the old cpStarter.
03194                         if (cpCombined >= 0) {
03195                                 dest[TVecIdx(lastStarterPos)] = cpCombined;
03196                                 Assert(GetCombiningClass(cpCombined) == TUniChInfo::ccStarter);
03197                                 // if (cpCombined is not a starter) { starterKnown = false; lastStarterPos = size_t(01); cpLastStarter = -1; } else
03198                                 cpLastStarter = cpCombined; continue; }
03199                 }
03200                 if (cpClass == TUniChInfo::ccStarter) { // 'cp' is a starter, remember it for later.  Set ccMax to -1 so that this starter can be combined with another starter.
03201                         lastStarterKnown = true; lastStarterPos = dest.Len(); cpLastStarter = cp; ccMax = cpClass - 1; }
03202                 else if (cpClass > ccMax) // Remember cp's class as the new maximum class since the last starter (for blocking).
03203                         ccMax = cpClass;
03204                 dest.Add(cp);
03205         }
03206 }
03207 
03208 template<typename TSrcVec, typename TDestCh>
03209 size_t TUniChDb::ExtractStarters(const TSrcVec& src, size_t srcIdx, const size_t srcCount,
03210                 TVec<TDestCh>& dest, bool clrDest) const
03211 {
03212         if (clrDest) dest.Clr();
03213         size_t retVal = 0;
03214         for (const size_t srcEnd = srcIdx + srcCount; srcIdx < srcEnd; srcIdx++) {
03215                 const int cp = src[TVecIdx(srcIdx)];
03216                 if (GetCombiningClass(cp) == TUniChInfo::ccStarter)
03217                         { dest.Add(cp); retVal++; } }
03218         return retVal;
03219 }
03220 
// Returns false.  The result is obtained by comparing a small computed sum
// (0 + 1 + 2 + 3 + 4 = 10) with 100 rather than by returning a literal.
// NOTE(review): presumably this keeps the value from looking like a
// compile-time constant at the call sites -- confirm against the callers.
inline bool AlwaysFalse()
{
	int total = 0;
	int k = 0;
	while (k < 5) {
		total += k;
		++k;
	}
	return total > 100;
}
03227 
// Returns true.  The result is obtained by comparing a small computed sum
// (0 + 1 + 2 + 3 + 4 = 10) with 100 rather than by returning a literal.
// NOTE(review): presumably this keeps the value from looking like a
// compile-time constant at the call sites -- confirm against the callers.
inline bool AlwaysTrue()
{
	int total = 0;
	int k = 0;
	while (k < 5) {
		total += k;
		++k;
	}
	return total < 100;
}
03234 
03235 /*
03236 
03237 Notes on decomposition:
03238 
03239 - In UnicodeData.txt, there is a field with the decomposition mapping.
03240   This field may also include a tag, <...>.
03241   If there is a tag, this is a compatibility mapping.
03242   Otherwise it is a canonical mapping.
03243 - Canonical decomposition uses only canonical mappings,
03244   compatibility decomposition uses both canonical and compatibility mappings.
03245 - Decomposition:
03246   1. Apply the decomposition mappings (canonical or canonical+compatibility), recursively.
03247   2. Put the string into canonical order, which means:
03248      while there exists a pair of characters, A immediately followed by B,
03249          such that combiningclass(A) > combiningclass(B) > 0  [an "exchangeable pair"]:
03250            swap A and B;
03251   This results in NFD (normalized form D, after canonical decomposition)
03252   or NFKD (normalized form KD, after compatibility decomposition).
03253 - Canonical composition:
03254   1. Before composition, the string should have been decomposed
03255      (using either canonical or compatibility decomposition).
03256   2. For each character C (from left to right):
03257      2.1.  Find the last starter S before C (if not found, continue).
03258          2.2.  If there is, between S and C, some character with a combining class >= that of C, then continue.
03259          2.3.  If there exists a character L for which the canonical decomposition is S+L
03260                and L is not in the composition exclusion table [i.e. L is a "primary composite"],
03261                    then replace S by L, and remove C.
03262   This results in NFC (normalized form C, with canonical decomposition followed by canonical composition)
03263   or NFKC (normalized form KC, with compatibility decomposition followed by canonical composition).
03264 - Composition exclusion table:
03265   - Anything in CompositionExclusions.txt.
03266   - Singletons: characters whose canonical decomposition is a single character.
03267   - Non-starter decompositions: characters whose canonical decomposition begins with a non-starter.
03268 
03269 Example:
03270                  E-grave  (00c8; composition class 0; canonical decomposition: 0045 0300)
03271                                  E-macron (0112; composition class 0;                          0045 0304)
03272                                  grave   (0300; composition class 230)
03273                  macron  (0304; composition class 230)
03274   source string: 00c8 0304
03275   after canonical decomposition (or compatibility decomposition, they would be the same here): 0045 0300 0304
03276   after canonical composition: 00c8 0304
03277 
03278   cc(horn) = 216
03279   cc(dot below) = 220
03280   cc(dot above) = 230
03281 
03282 ToDos:
03283 - case folding - je misljen predvsem za primerjanje tako dobljenih nizov.
03284   Funkcija f(s) = NFC(toCaseFold(s)) je idempotentna.
03285   Funkcija g(s) = NFKC(toCaseFold(s)) pa ni -- ce hocemo to, moramo pri foldingu
03286   upostevati se nekaj dodatnih mappingov (glej 5.18, zadnji odstavek; DerivedNormalizationProps.txt).
03287 - Zdi se, da je CaseFolding.txt v bistvu cisto navaden folding v lowercase.
03288   Ker hocemo imeti tudi ostale foldinge, glejmo raje SpecialCasing.txt
03289   (+ simple case mappinge v UnicodeData.txt).
03290   Predlagam, da pri branju SpecialCasing.txt conditional mappinge kar ignoriramo
03291   in jih potem upostevamo posebej kar v source kodi nasih programov [za
03292   podrobno definicijo pogojev pa glej tabelo 3.13].
03293   - Pripis: vseeno se mi zdi, da je CaseFolding.txt nekaj malo drugacnega od navadnega lowercase.
03294     Na primer, za small final sigma 03c2 je tam navedeno, naj se spremeni v navadno small sigma 03c3.
03295         To ne sledi niti iz UnicodeData.txt niti iz SpecialCasing.txt, pa ceprav v UCD.html pise,
03296         da je CaseFolding.txt izpeljan iz njiju.  Glavni namen CaseFolding.txt naj bi bil za
03297         potrebe "locale-independent case folding" (table 4.1 in sec. 5.18).
03298   - Preden se zacnes ubadati s case conversioni, si oglej razdelek 3.13
03299     in se posebej str. 90.
03300   - Glej str. 91 o kombinaciji N[K]FD + caseFold + N[K]FD
03301   - definicija cased ipd. na str. 89
03302 - isIdentifierStart(c), isIdentifierEnd(c) -- sec. 5.15
03303   Glej DerivedCoreProperties.txt, kjer je na podoben nacin definiranih se kup podobnih
03304   stvari, med drugim isLowerCase in isUpperCase.  Tam je tudi isLetter, isAlphabetic itd. (sec. 4.9).
03305   To je se najbolje dodati med flagse posameznega characterja.
03306 - general category: sec. 4.5
03307 - motivacija za titlecase: 5.18
03308 - primerjaj nas dosedanji izracun compositionExclusion s tistim, kar je naracunano v DerivedNormalizationProps.txt
03309   pod Full_Composition_Exclusion
03310 - script names: Scripts.txt in UAX #24.
03311 - block names: Blocks.txt
03312 - space characters: table 6.2 in baje tudi UCD.html
03313 - dash characters: table 6.3
03314 */
03315 
03316 //#endif
03317