#include "bd.h"
#include <new>

Include dependency graph for unicode.h:

This graph shows which files directly or indirectly include this file:

Classes
class	TUnicodeException
class	TUniCodec
class	TUniCaseFolding
class	TCodecBase
class	TCodecWrapper< TCodecImpl_ >
class	TVecElt< TVector_ >
class	TVecElt< TVec< TDat > >
class	TVecElt< TChA >
class	TEncoding_ISO8859_1
class	TEncoding_ISO8859_2
class	TEncoding_ISO8859_3
class	TEncoding_ISO8859_4
class	TEncoding_YuAscii
class	TEncoding_CP437
class	TEncoding_CP852
class	TEncoding_CP1250
class	T8BitCodec< TEncoding_ >
class	TUniChInfo
class	TUniTrie< TItem_ >
class	TUniTrie< TItem_ >::TNode
class	TUniChDb
class	TUniChDb::TUcdFileReader
class	TUniChDb::TSubcatHelper
class	TUnicode
Defines
#define	DefineByte(b7, b6, b5, b4, b3, b2, b1, b0) _ ## b7 ## b6 ## b5 ## b4 ## _ ## b3 ## b2 ## b1 ## b0 = (b7 << 7) | (b6 << 6) | (b5 << 5) | (b4 << 4) | (b3 << 3) | (b2 << 2) | (b1 << 1) | b0
#define	DefineUniCat(cat, c) uc ## cat = (int(uchar(c)) & 0xff)
#define	DefineUniSubCat(cat, subCat, c) uc ## cat ## subCat = ((uc ## cat) << 8) | (int(uchar(c)) & 0xff)
#define	___UniFwd1(name) bool name(const int cp) const { int i = h.GetKeyId(cp); if (i < 0) return false; else return h[i].name(); }
#define	___UniFwd2(name1, name2) ___UniFwd1(name1) ___UniFwd1(name2)
#define	___UniFwd3(name1, name2, name3) ___UniFwd2(name1, name2) ___UniFwd1(name3)
#define	___UniFwd4(name1, name2, name3, name4) ___UniFwd3(name1, name2, name3) ___UniFwd1(name4)
#define	___UniFwd5(name1, name2, name3, name4, name5) ___UniFwd4(name1, name2, name3, name4) ___UniFwd1(name5)
#define	DECLARE_FORWARDED_PROPERTY_METHODS
#define	___UniFwd1(name) bool name(const int cp) const { return ucd.name(cp); }
#define	___OutRepl if (isDestLe) { dest.Add(replacementChar & 0xff); dest.Add((replacementChar >> 8) & 0xff); } else { dest.Add((replacementChar >> 8) & 0xff); dest.Add(replacementChar & 0xff); }
#define	TestCurNext(curFlag, nextFlag) if ((wbfCur & curFlag) == curFlag && (wbfNext & nextFlag) == nextFlag) continue
#define	TestCurNext2(curFlag, nextFlag, next2Flag) if ((wbfCur & curFlag) == curFlag && (wbfNext & nextFlag) == nextFlag && (wbfNext2 & next2Flag) == next2Flag) continue
#define	TestPrevCurNext(prevFlag, curFlag, nextFlag) if ((wbfPrev & prevFlag) == prevFlag && (wbfCur & curFlag) == curFlag && (wbfNext & nextFlag) == nextFlag) continue
#define	TestCur(curFlag) ((sbfCur & ucfSb##curFlag) == ucfSb##curFlag)
#define	Trans(curFlag, newState) if (TestCur(curFlag)) { backState = st##newState; break; }
#define	IsPeekAheadSkippable(sbf) ((sbf & (ucfSbOLetter | ucfSbUpper | ucfSbLower | ucfSbSep | ucfSbSTerm | ucfSbATerm)) == 0)
#define	TestCurNext(curFlag, nextFlag) if ((sbfCur & curFlag) == curFlag && (sbfNext & nextFlag) == nextFlag) continue
#define	TestCurNext2(curFlag, nextFlag, next2Flag) if ((sbfCur & curFlag) == curFlag && (sbfNext & nextFlag) == nextFlag && (sbfNext2 & next2Flag) == next2Flag) continue
#define	TestPrevCurNext(prevFlag, curFlag, nextFlag) if ((sbfPrev & prevFlag) == prevFlag && (sbfCur & curFlag) == curFlag && (sbfNext & nextFlag) == nextFlag) continue
Typedefs
typedef int	TUniVecIdx
typedef enum TUnicodeErrorHandling_	TUnicodeErrorHandling
typedef enum TUniByteOrder_	TUniByteOrder
typedef enum TUtf16BomHandling_	TUtf16BomHandling
typedef THash< TInt, TIntV >	TIntIntVH
typedef TPt< TCodecBase >	PCodecBase
typedef TVec< PCodecBase >	TCodecBaseV
typedef T8BitCodec < TEncoding_ISO8859_1 >	TCodec_ISO8859_1
typedef T8BitCodec < TEncoding_ISO8859_2 >	TCodec_ISO8859_2
typedef T8BitCodec < TEncoding_ISO8859_3 >	TCodec_ISO8859_3
typedef T8BitCodec < TEncoding_ISO8859_4 >	TCodec_ISO8859_4
typedef T8BitCodec < TEncoding_CP852 >	TCodec_CP852
typedef T8BitCodec < TEncoding_CP437 >	TCodec_CP437
typedef T8BitCodec < TEncoding_CP1250 >	TCodec_CP1250
typedef T8BitCodec < TEncoding_YuAscii >	TCodec_YuAscii
typedef enum TUniChCategory_	TUniChCategory
typedef enum TUniChSubCategory_	TUniChSubCategory
typedef enum TUniChFlags_	TUniChFlags
typedef enum TUniChProperties_	TUniChProperties
typedef enum TUniChPropertiesX_	TUniChPropertiesX
Enumerations
enum	TUnicodeErrorHandling_ { uehIgnore = 0, uehThrow = 1, uehReplace = 2, uehAbort = 3 }
enum	TUniByteOrder_ { boMachineEndian = 0, boLittleEndian = 1, boBigEndian = 2 }
enum	TUtf16BomHandling_ { bomAllowed = 0, bomRequired = 1, bomIgnored = 2 }
enum	TUniChCategory_ { DefineUniCat(Letter, 'L'), DefineUniCat(Mark, 'M'), DefineUniCat(Number, 'N'), DefineUniCat(Punctuation, 'P'), DefineUniCat(Symbol, 'S'), DefineUniCat(Separator, 'Z'), DefineUniCat(Other, 'C') }
enum	TUniChSubCategory_ { DefineUniSubCat(Letter, Uppercase, 'u'), DefineUniSubCat(Letter, Lowercase, 'l'), DefineUniSubCat(Letter, Titlecase, 't'), DefineUniSubCat(Letter, Modifier, 'm'), DefineUniSubCat(Letter, Other, 'o'), DefineUniSubCat(Mark, Nonspacing, 'n'), DefineUniSubCat(Mark, SpacingCombining, 'c'), DefineUniSubCat(Mark, Enclosing, 'e'), DefineUniSubCat(Number, DecimalDigit, 'd'), DefineUniSubCat(Number, Letter, 'l'), DefineUniSubCat(Number, Other, 'o'), DefineUniSubCat(Punctuation, Connector, 'c'), DefineUniSubCat(Punctuation, Dash, 'd'), DefineUniSubCat(Punctuation, Open, 's'), DefineUniSubCat(Punctuation, Close, 'e'), DefineUniSubCat(Punctuation, InitialQuote, 'i'), DefineUniSubCat(Punctuation, FinalQuote, 'f'), DefineUniSubCat(Punctuation, Other, 'o'), DefineUniSubCat(Symbol, Math, 'm'), DefineUniSubCat(Symbol, Currency, 'c'), DefineUniSubCat(Symbol, Modifier, 'k'), DefineUniSubCat(Symbol, Other, 'o'), DefineUniSubCat(Separator, Space, 's'), DefineUniSubCat(Separator, Line, 'l'), DefineUniSubCat(Separator, Paragraph, 'p'), DefineUniSubCat(Other, Control, 'c'), DefineUniSubCat(Other, Format, 'f'), DefineUniSubCat(Other, Surrogate, 's'), DefineUniSubCat(Other, PrivateUse, 'o'), DefineUniSubCat(Other, NotAssigned, 'n') }
enum	TUniChFlags_ { ucfCompatibilityDecomposition = 1, ucfCompositionExclusion = 1 << 1, ucfWbFormat = 1 << 2, ucfWbKatakana = 1 << 3, ucfWbALetter = 1 << 4, ucfWbMidLetter = 1 << 5, ucfWbMidNum = 1 << 6, ucfWbNumeric = 1 << 7, ucfWbExtendNumLet = 1 << 8, ucfSbSep = 1 << 9, ucfSbFormat = 1 << 10, ucfSbSp = 1 << 11, ucfSbLower = 1 << 12, ucfSbUpper = 1 << 13, ucfSbOLetter = 1 << 14, ucfSbNumeric = 1 << 15, ucfSbATerm = 1 << 16, ucfSbSTerm = 1 << 17, ucfSbClose = 1 << 18, ucfSbMask = ucfSbSep | ucfSbFormat | ucfSbSp | ucfSbLower | ucfSbUpper | ucfSbOLetter | ucfSbNumeric | ucfSbATerm | ucfSbSTerm | ucfSbClose, ucfWbMask = ucfWbFormat | ucfWbKatakana | ucfWbALetter | ucfWbMidLetter | ucfWbMidNum | ucfWbNumeric | ucfWbExtendNumLet | ucfSbSep, ucfDcpAlphabetic = 1 << 19, ucfDcpDefaultIgnorableCodePoint = 1 << 20, ucfDcpLowercase = 1 << 21, ucfDcpGraphemeBase = 1 << 22, ucfDcpGraphemeExtend = 1 << 23, ucfDcpIdStart = 1 << 24, ucfDcpIdContinue = 1 << 25, ucfDcpMath = 1 << 26, ucfDcpUppercase = 1 << 27, ucfDcpXidStart = 1 << 28, ucfDcpXidContinue = 1 << 29, ucfDcpMask = ucfDcpAlphabetic | ucfDcpDefaultIgnorableCodePoint | ucfDcpLowercase | ucfDcpGraphemeBase | ucfDcpGraphemeExtend | ucfDcpIdStart | ucfDcpIdContinue | ucfDcpMath | ucfDcpUppercase | ucfDcpXidStart | ucfDcpXidContinue }
enum	TUniChProperties_ { ucfPrAsciiHexDigit = 1, ucfPrBidiControl = 2, ucfPrDash = 4, ucfPrDeprecated = 8, ucfPrDiacritic = 0x10, ucfPrExtender = 0x20, ucfPrGraphemeLink = 0x40, ucfPrHexDigit = 0x80, ucfPrHyphen = 0x100, ucfPrIdeographic = 0x200, ucfPrJoinControl = 0x400, ucfPrLogicalOrderException = 0x800, ucfPrNoncharacterCodePoint = 0x1000, ucfPrPatternSyntax = 0x2000, ucfPrPatternWhiteSpace = 0x4000, ucfPrQuotationMark = 0x8000, ucfPrSoftDotted = 0x10000, ucfPrSTerm = 0x20000, ucfPrTerminalPunctuation = 0x40000, ucfPrVariationSelector = 0x80000, ucfPrWhiteSpace = 0x100000 }
enum	TUniChPropertiesX_ { ucfPxOtherAlphabetic = 1, ucfPxOtherDefaultIgnorableCodePoint = 2, ucfPxOtherGraphemeExtend = 4, ucfPxOtherIdContinue = 8, ucfPxOtherIdStart = 0x10, ucfPxOtherLowercase = 0x20, ucfPxOtherMath = 0x40, ucfPxOtherUppercase = 0x80, ucfPxIdsBinaryOperator = 0x100, ucfPxIdsTrinaryOperator = 0x200, ucfPxRadical = 0x400, ucfPxUnifiedIdeograph = 0x800 }
Functions
bool	AlwaysFalse ()
bool	AlwaysTrue ()

Define Documentation

#define ___OutRepl if (isDestLe) { dest.Add(replacementChar & 0xff); dest.Add((replacementChar >> 8) & 0xff); } else { dest.Add((replacementChar >> 8) & 0xff); dest.Add(replacementChar & 0xff); }

Referenced by TUniCodec::EncodeUtf16ToBytes().

#define ___UniFwd1 ( name ) bool name(const int cp) const { int i = h.GetKeyId(cp); if (i < 0) return false; else return h[i].name(); }

Definition at line 2014 of file unicode.h.

#define ___UniFwd1 ( name ) bool name(const int cp) const { return ucd.name(cp); }

Definition at line 2014 of file unicode.h.

#define ___UniFwd2	(	name1,
		name2
	)	___UniFwd1(name1) ___UniFwd1(name2)

Definition at line 1362 of file unicode.h.

#define ___UniFwd3	(	name1,
		name2,
		name3
	)	___UniFwd2(name1, name2) ___UniFwd1(name3)

Definition at line 1363 of file unicode.h.

#define ___UniFwd4	(	name1,
		name2,
		name3,
		name4
	)	___UniFwd3(name1, name2, name3) ___UniFwd1(name4)

Definition at line 1364 of file unicode.h.

#define ___UniFwd5	(	name1,
		name2,
		name3,
		name4,
		name5
	)	___UniFwd4(name1, name2, name3, name4) ___UniFwd1(name5)

Definition at line 1365 of file unicode.h.

#define DECLARE_FORWARDED_PROPERTY_METHODS

Value:

___UniFwd5(IsAsciiHexDigit, IsBidiControl, IsDash, IsDeprecated, IsDiacritic) \
        ___UniFwd5(IsExtender, IsGraphemeLink, IsHexDigit, IsHyphen, IsIdeographic)  \
        ___UniFwd5(IsJoinControl, IsLogicalOrderException, IsNoncharacter, IsQuotationMark, IsSoftDotted)  \
        ___UniFwd4(IsSTerminal, IsTerminalPunctuation, IsVariationSelector, IsWhiteSpace)  \
        ___UniFwd5(IsAlphabetic, IsUppercase, IsLowercase, IsMath, IsDefaultIgnorable)  \
        ___UniFwd4(IsGraphemeBase, IsGraphemeExtend, IsIdStart, IsIdContinue)  \
        ___UniFwd2(IsXidStart, IsXidContinue)  \
        ___UniFwd3(IsCompositionExclusion, IsCompatibilityDecomposition, IsSbSep)  \
        ___UniFwd1(IsGbExtend)  \
        ___UniFwd2(IsCased, IsCurrency)

Definition at line 1367 of file unicode.h.

#define DefineByte	(	b7,
		b6,
		b5,
		b4,
		b3,
		b2,
		b1,
		b0
	)	_ ## b7 ## b6 ## b5 ## b4 ## _ ## b3 ## b2 ## b1 ## b0 = (b7 << 7) | (b6 << 6) | (b5 << 5) | (b4 << 4) | (b3 << 3) | (b2 << 2) | (b1 << 1) | b0

Definition at line 102 of file unicode.h.

#define DefineUniCat	(	cat,
		c
	)	uc ## cat = (int(uchar(c)) & 0xff)

Definition at line 664 of file unicode.h.

#define DefineUniSubCat	(	cat,
		subCat,
		c
	)	uc ## cat ## subCat = ((uc ## cat) << 8) | (int(uchar(c)) & 0xff)

Definition at line 678 of file unicode.h.

Referenced by TUniChDb::FindNextSentenceBoundary().

#define TestCur ( curFlag ) ((sbfCur & ucfSb##curFlag) == ucfSb##curFlag)

#define TestCurNext	(	curFlag,
		nextFlag
	)	if ((wbfCur & curFlag) == curFlag && (wbfNext & nextFlag) == nextFlag) continue

Referenced by TUniChDb::FindNextSentenceBoundary(), and TUniChDb::FindNextWordBoundary().

#define TestCurNext	(	curFlag,
		nextFlag
	)	if ((sbfCur & curFlag) == curFlag && (sbfNext & nextFlag) == nextFlag) continue

#define TestCurNext2	(	curFlag,
		nextFlag,
		next2Flag
	)	if ((wbfCur & curFlag) == curFlag && (wbfNext & nextFlag) == nextFlag && (wbfNext2 & next2Flag) == next2Flag) continue

Referenced by TUniChDb::FindNextWordBoundary().

#define TestCurNext2	(	curFlag,
		nextFlag,
		next2Flag
	)	if ((sbfCur & curFlag) == curFlag && (sbfNext & nextFlag) == nextFlag && (sbfNext2 & next2Flag) == next2Flag) continue

#define TestPrevCurNext	(	prevFlag,
		curFlag,
		nextFlag
	)	if ((wbfPrev & prevFlag) == prevFlag && (wbfCur & curFlag) == curFlag && (wbfNext & nextFlag) == nextFlag) continue

Referenced by TUniChDb::FindNextSentenceBoundary(), and TUniChDb::FindNextWordBoundary().

#define TestPrevCurNext	(	prevFlag,
		curFlag,
		nextFlag
	)	if ((sbfPrev & prevFlag) == prevFlag && (sbfCur & curFlag) == curFlag && (sbfNext & nextFlag) == nextFlag) continue

#define Trans	(	curFlag,
		newState
	)	if (TestCur(curFlag)) { backState = st##newState; break; }

Referenced by TUniChDb::FindNextSentenceBoundary().

Typedef Documentation

typedef TPt<TCodecBase> PCodecBase

Definition at line 328 of file unicode.h.

typedef T8BitCodec<TEncoding_CP1250> TCodec_CP1250

Definition at line 655 of file unicode.h.

typedef T8BitCodec<TEncoding_CP437> TCodec_CP437

Definition at line 654 of file unicode.h.

typedef T8BitCodec<TEncoding_CP852> TCodec_CP852

Definition at line 653 of file unicode.h.

typedef T8BitCodec<TEncoding_ISO8859_1> TCodec_ISO8859_1

Definition at line 649 of file unicode.h.

typedef T8BitCodec<TEncoding_ISO8859_2> TCodec_ISO8859_2

Definition at line 650 of file unicode.h.

typedef T8BitCodec<TEncoding_ISO8859_3> TCodec_ISO8859_3

Definition at line 651 of file unicode.h.

typedef T8BitCodec<TEncoding_ISO8859_4> TCodec_ISO8859_4

Definition at line 652 of file unicode.h.

typedef T8BitCodec<TEncoding_YuAscii> TCodec_YuAscii

Definition at line 656 of file unicode.h.

typedef TVec<PCodecBase> TCodecBaseV

Definition at line 330 of file unicode.h.

typedef THash<TInt, TIntV> TIntIntVH

Definition at line 269 of file unicode.h.

typedef enum TUniByteOrder_ TUniByteOrder

typedef enum TUniChCategory_ TUniChCategory

typedef enum TUniChFlags_ TUniChFlags

typedef enum TUniChProperties_ TUniChProperties

typedef enum TUniChPropertiesX_ TUniChPropertiesX

typedef enum TUniChSubCategory_ TUniChSubCategory

typedef enum TUnicodeErrorHandling_ TUnicodeErrorHandling

typedef int TUniVecIdx

Definition at line 11 of file unicode.h.

typedef enum TUtf16BomHandling_ TUtf16BomHandling

Enumeration Type Documentation

enum TUniByteOrder_

Enumerator:

boMachineEndian
boLittleEndian
boBigEndian

Definition at line 38 of file unicode.h.

{
        boMachineEndian = 0,
        boLittleEndian = 1,
        boBigEndian = 2
}

enum TUniChCategory_

Enumerator:

ucLetter
ucMark
ucNumber
ucPunctuation
ucSymbol
ucSeparator
ucOther

Definition at line 662 of file unicode.h.

{
#define DefineUniCat(cat, c) uc ## cat = (int(uchar(c)) & 0xff)
        DefineUniCat(Letter, 'L'),             // ucLetter
        DefineUniCat(Mark, 'M'),
        DefineUniCat(Number, 'N'),
        DefineUniCat(Punctuation, 'P'),
        DefineUniCat(Symbol, 'S'),
        DefineUniCat(Separator, 'Z'),
        DefineUniCat(Other, 'C')
#undef DefineUniCat
}

enum TUniChFlags_

Enumerator:

ucfCompatibilityDecomposition
ucfCompositionExclusion
ucfWbFormat
ucfWbKatakana
ucfWbALetter
ucfWbMidLetter
ucfWbMidNum
ucfWbNumeric
ucfWbExtendNumLet
ucfSbSep
ucfSbFormat
ucfSbSp
ucfSbLower
ucfSbUpper
ucfSbOLetter
ucfSbNumeric
ucfSbATerm
ucfSbSTerm
ucfSbClose
ucfSbMask
ucfWbMask
ucfDcpAlphabetic
ucfDcpDefaultIgnorableCodePoint
ucfDcpLowercase
ucfDcpGraphemeBase
ucfDcpGraphemeExtend
ucfDcpIdStart
ucfDcpIdContinue
ucfDcpMath
ucfDcpUppercase
ucfDcpXidStart
ucfDcpXidContinue
ucfDcpMask

Definition at line 712 of file unicode.h.

{
        ucfCompatibilityDecomposition = 1, // if this flag is not set, the decomposition is canonical
        ucfCompositionExclusion = 1 << 1,       // from CompositionExclusions.txt
        // Flags used when searching for word boundaries.  See UAX #29.
        ucfWbFormat = 1 << 2,
        ucfWbKatakana = 1 << 3,
        ucfWbALetter = 1 << 4,
        ucfWbMidLetter = 1 << 5,
        ucfWbMidNum = 1 << 6,
        ucfWbNumeric = 1 << 7,
        ucfWbExtendNumLet = 1 << 8,
        // Flags used with sentence boundaries (Sep is also used with word boundaries).  See UAX #29.
        ucfSbSep = 1 << 9,
        ucfSbFormat = 1 << 10,
        ucfSbSp = 1 << 11,
        ucfSbLower = 1 << 12,
        ucfSbUpper = 1 << 13,
        ucfSbOLetter = 1 << 14,
        ucfSbNumeric = 1 << 15,
        ucfSbATerm = 1 << 16,
        ucfSbSTerm = 1 << 17,
        ucfSbClose = 1 << 18,
        ucfSbMask = ucfSbSep | ucfSbFormat | ucfSbSp | ucfSbLower | ucfSbUpper | ucfSbOLetter | ucfSbNumeric | ucfSbATerm | ucfSbSTerm | ucfSbClose,
        ucfWbMask = ucfWbFormat | ucfWbKatakana | ucfWbALetter | ucfWbMidLetter | ucfWbMidNum | ucfWbNumeric | ucfWbExtendNumLet | ucfSbSep,
        // Flags from DerivedCoreProperties.txt.
        // [The comments are from UCD.html.]
        // - Characters with the Alphabetic property. For more information, see Chapter 4 in [Unicode].
        //   Generated from: Other_Alphabetic + Lu + Ll + Lt + Lm + Lo + Nl
        ucfDcpAlphabetic = 1 << 19,
        // - For programmatic determination of default-ignorable code points.
        //   New characters that should be ignored in processing (unless explicitly supported)
        //   will be assigned in these ranges, permitting programs to correctly handle the default
        //   behavior of such characters when not otherwise supported.  For more information, see
        //   UAX #29: Text Boundaries [Breaks].
        //   Generated from Other_Default_Ignorable_Code_Point + Cf + Cc + Cs + Noncharacters - White_Space - annotation characters
        //   [Examples: soft hyphen, zero-width space, noncharacters (e.g. U+fffe, U+ffff, U+1fffe, U+1ffff, etc.), surrogates, language tags, variation selectors]
        ucfDcpDefaultIgnorableCodePoint = 1 << 20,
        // - Characters with the Lowercase property.  For more information, see Chapter 4 in [Unicode].
        //   Generated from: Other_Lowercase + Ll
        ucfDcpLowercase = 1 << 21,
        // - For programmatic determination of grapheme cluster boundaries.
        //   For more information, see UAX #29: Text Boundaries [Breaks].
        //   Generated from: [0..10FFFF] - Cc - Cf - Cs - Co - Cn - Zl - Zp - Grapheme_Extend
        ucfDcpGraphemeBase = 1 << 22,
        // - For programmatic determination of grapheme cluster boundaries.
        //   For more information, see UAX #29: Text Boundaries [Breaks].
        //   Generated from: Other_Grapheme_Extend + Me + Mn
        //   Note: depending on an application's interpretation of Co (private use), they may be either
        //         in Grapheme_Base, or in Grapheme_Extend, or in neither.
        ucfDcpGraphemeExtend = 1 << 23,
        // - Used to determine programming identifiers, as described in UAX #31: Identifier and Pattern Syntax.
        ucfDcpIdStart = 1 << 24,
        ucfDcpIdContinue = 1 << 25,
        // - Characters with the Math property. For more information, see Chapter 4 in [Unicode].
        //   Generated from: Sm + Other_Math
        ucfDcpMath = 1 << 26,
        // - Characters with the Uppercase property. For more information, see Chapter 4 in [Unicode].
        //   Generated from: Lu + Other_Uppercase
        ucfDcpUppercase = 1 << 27,
        // - Used to determine programming identifiers, as described in UAX #31: Identifier and Pattern Syntax.
        ucfDcpXidStart = 1 << 28,
        ucfDcpXidContinue = 1 << 29,
        ucfDcpMask = ucfDcpAlphabetic | ucfDcpDefaultIgnorableCodePoint | ucfDcpLowercase | ucfDcpGraphemeBase | ucfDcpGraphemeExtend |
                ucfDcpIdStart | ucfDcpIdContinue | ucfDcpMath | ucfDcpUppercase | ucfDcpXidStart | ucfDcpXidContinue,
}

enum TUniChProperties_

Enumerator:

ucfPrAsciiHexDigit
ucfPrBidiControl
ucfPrDash
ucfPrDeprecated
ucfPrDiacritic
ucfPrExtender
ucfPrGraphemeLink
ucfPrHexDigit
ucfPrHyphen
ucfPrIdeographic
ucfPrJoinControl
ucfPrLogicalOrderException
ucfPrNoncharacterCodePoint
ucfPrPatternSyntax
ucfPrPatternWhiteSpace
ucfPrQuotationMark
ucfPrSoftDotted
ucfPrSTerm
ucfPrTerminalPunctuation
ucfPrVariationSelector
ucfPrWhiteSpace

Definition at line 780 of file unicode.h.

{
        // The flags from PropList.txt.
        // [The comments are from UCD.html.]
        // - ASCII characters commonly used for the representation of hexadecimal numbers.
        //   [= 0123456789abcdefABCDEF]
        ucfPrAsciiHexDigit = 1,
        // - Those format control characters which have specific functions in the Bidirectional Algorithm.
        ucfPrBidiControl = 2,
        // - Those punctuation characters explicitly called out as dashes in the Unicode Standard,
        //   plus compatibility equivalents to those. Most of these have the Pd General Category,
        //   but some have the Sm General Category because of their use in mathematics.
        //     U+0002d  HYPHEN-MINUS
        //     U+0058a  ARMENIAN HYPHEN
        //     U+005be  HEBREW PUNCTUATION MAQAF
        //     U+01806  MONGOLIAN TODO SOFT HYPHEN
        //     U+02010  HYPHEN
        //     U+02011  NON-BREAKING HYPHEN
        //     U+02012  FIGURE DASH
        //     U+02013  EN DASH
        //     U+02014  EM DASH
        //     U+02015  HORIZONTAL BAR
        //     U+02053  SWUNG DASH
        //     U+0207b  SUPERSCRIPT MINUS
        //     U+0208b  SUBSCRIPT MINUS
        //     U+02212  MINUS SIGN
        //     U+02e17  DOUBLE OBLIQUE HYPHEN
        //     U+0301c  WAVE DASH
        //     U+03030  WAVY DASH
        //     U+030a0  KATAKANA-HIRAGANA DOUBLE HYPHEN
        //     U+0fe31  PRESENTATION FORM FOR VERTICAL EM DASH
        //     U+0fe32  PRESENTATION FORM FOR VERTICAL EN DASH
        //     U+0fe58  SMALL EM DASH
        //     U+0fe63  SMALL HYPHEN-MINUS
        //     U+0ff0d  FULLWIDTH HYPHEN-MINUS
        ucfPrDash = 4,
        // - For a machine-readable list of deprecated characters.  No characters will ever be removed
        //   from the standard, but the usage of deprecated characters is strongly discouraged.
        ucfPrDeprecated = 8,
        // - Characters that linguistically modify the meaning of another character to which they apply.
        //   Some diacritics are not combining characters, and some combining characters are not diacritics.
        ucfPrDiacritic = 0x10,
        // - Characters whose principal function is to extend the value or shape of a preceding alphabetic
        //   character.  Typical of these are length and iteration marks.
        ucfPrExtender = 0x20,
        // - Used in determining default grapheme cluster boundaries.  For more information, see UAX #29: Text Boundaries.
        ucfPrGraphemeLink = 0x40,
        // - Characters commonly used for the representation of hexadecimal numbers, plus their compatibility equivalents.
        //   [= AsciiHexDigit + fullwidth digit {0..9} + fullwidth latin {small|capital} letter {a..f}]
        ucfPrHexDigit = 0x80,
        // - Those dashes used to mark connections between pieces of words, plus the Katakana middle dot.
        //   The Katakana middle dot functions like a hyphen, but is shaped like a dot rather than a dash.
        //     U+0002d  HYPHEN-MINUS
        //     U+000ad  SOFT HYPHEN
        //     U+0058a  ARMENIAN HYPHEN
        //     U+01806  MONGOLIAN TODO SOFT HYPHEN
        //     U+02010  HYPHEN
        //     U+02011  NON-BREAKING HYPHEN
        //     U+02e17  DOUBLE OBLIQUE HYPHEN
        //     U+030fb  KATAKANA MIDDLE DOT
        //     U+0fe63  SMALL HYPHEN-MINUS
        //     U+0ff0d  FULLWIDTH HYPHEN-MINUS
        //     U+0ff65  HALFWIDTH KATAKANA MIDDLE DOT
        ucfPrHyphen = 0x100,
        // - Characters considered to be CJKV (Chinese, Japanese, Korean, and Vietnamese) ideographs.
        ucfPrIdeographic = 0x200,
        // - Those format control characters which have specific functions for control of cursive joining and ligation.
        ucfPrJoinControl = 0x400,
        // - There are a small number of characters that do not use logical order.
        //   These characters require special handling in most processing.
        ucfPrLogicalOrderException = 0x800,
        // - Code points that are permanently reserved for internal use.
        ucfPrNoncharacterCodePoint = 0x1000,
        // - Used for pattern syntax as described in UAX #31: Identifier and Pattern Syntax.
        ucfPrPatternSyntax = 0x2000,
        ucfPrPatternWhiteSpace = 0x4000,
        // - Those punctuation characters that function as quotation marks.
        //     U+00022  QUOTATION MARK
        //     U+00027  APOSTROPHE
        //     U+000ab  LEFT-POINTING DOUBLE ANGLE QUOTATION MARK
        //     U+000bb  RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK
        //     U+02018  LEFT SINGLE QUOTATION MARK
        //     U+02019  RIGHT SINGLE QUOTATION MARK
        //     U+0201a  SINGLE LOW-9 QUOTATION MARK
        //     U+0201b  SINGLE HIGH-REVERSED-9 QUOTATION MARK
        //     U+0201c  LEFT DOUBLE QUOTATION MARK
        //     U+0201d  RIGHT DOUBLE QUOTATION MARK
        //     U+0201e  DOUBLE LOW-9 QUOTATION MARK
        //     U+0201f  DOUBLE HIGH-REVERSED-9 QUOTATION MARK
        //     U+02039  SINGLE LEFT-POINTING ANGLE QUOTATION MARK
        //     U+0203a  SINGLE RIGHT-POINTING ANGLE QUOTATION MARK
        //     U+0300c  LEFT CORNER BRACKET
        //     U+0300d  RIGHT CORNER BRACKET
        //     U+0300e  LEFT WHITE CORNER BRACKET
        //     U+0300f  RIGHT WHITE CORNER BRACKET
        //     U+0301d  REVERSED DOUBLE PRIME QUOTATION MARK
        //     U+0301e  DOUBLE PRIME QUOTATION MARK
        //     U+0301f  LOW DOUBLE PRIME QUOTATION MARK
        //     U+0fe41  PRESENTATION FORM FOR VERTICAL LEFT CORNER BRACKET
        //     U+0fe42  PRESENTATION FORM FOR VERTICAL RIGHT CORNER BRACKET
        //     U+0fe43  PRESENTATION FORM FOR VERTICAL LEFT WHITE CORNER BRACKET
        //     U+0fe44  PRESENTATION FORM FOR VERTICAL RIGHT WHITE CORNER BRACKET
        //     U+0ff02  FULLWIDTH QUOTATION MARK
        //     U+0ff07  FULLWIDTH APOSTROPHE
        //     U+0ff62  HALFWIDTH LEFT CORNER BRACKET
        //     U+0ff63  HALFWIDTH RIGHT CORNER BRACKET
        ucfPrQuotationMark = 0x8000,
        // - Characters with a "soft dot", like i or j. An accent placed on these characters causes the dot to disappear.
        //   An explicit _dot above_ can be added where required, such as in Lithuanian.
        ucfPrSoftDotted = 0x10000,
        // - Sentence Terminal. Used in UAX #29: Text Boundaries.
        //     U+00021  EXCLAMATION MARK
        //     U+0002e  FULL STOP
        //     U+0003f  QUESTION MARK
        //     U+0203c  DOUBLE EXCLAMATION MARK
        //     U+0203d  INTERROBANG
        //     U+02047  DOUBLE QUESTION MARK
        //     U+02048  QUESTION EXCLAMATION MARK
        //     U+02049  EXCLAMATION QUESTION MARK
        //     U+03002  IDEOGRAPHIC FULL STOP
        //     [plus many characters from other writing systems]
        ucfPrSTerm = 0x20000,
        // - Those punctuation characters that generally mark the end of textual units.
        //   [JB note: this set contains more characters than STerm.  For example, it contains
        //   the comma, colon and semicolon, whereas STerm doesn't.]
        //     U+00021  EXCLAMATION MARK
        //     U+0002c  COMMA
        //     U+0002e  FULL STOP
        //     U+0003a  COLON
        //     U+0003b  SEMICOLON
        //     U+0003f  QUESTION MARK
        //     U+0203c  DOUBLE EXCLAMATION MARK
        //     U+0203d  INTERROBANG
        //     U+02047  DOUBLE QUESTION MARK
        //     U+02048  QUESTION EXCLAMATION MARK
        //     U+02049  EXCLAMATION QUESTION MARK
        //     [plus *lots* of characters from other writing systems]
        ucfPrTerminalPunctuation = 0x40000,
        // - Indicates all those characters that qualify as Variation Selectors.
        //   For details on the behavior of these characters, see StandardizedVariants.html and
        //   Section 16.4, Variation Selectors in [Unicode].
        ucfPrVariationSelector = 0x80000,
        // - Those separator characters and control characters which should be treated by
        //   programming languages as "white space" for the purpose of parsing elements.
        //   Note: ZERO WIDTH SPACE and ZERO WIDTH NO-BREAK SPACE are not included,
        //         since their functions are restricted to line-break control.
        //         Their names are unfortunately misleading in this respect.
        //   Note: There are other senses of "whitespace" that encompass a different set of characters.
        //         [JB note: e.g. there's a BIDI class for whitespace ('WS') in UnicodeData.txt.
        //         There's also a "Sp" class in the sentence boundary algorithm, see UAX #29, sec. 5.1.]
        //   This includes the following characters:
        //     U+0009  <control>
        //     U+000a  <control>
        //     U+000b  <control>
        //     U+000c  <control>
        //     U+000d  <control>
        //     U+0020  SPACE
        //     U+0085  <control>
        //     U+00a0  NO-BREAK SPACE
        //     U+1680  OGHAM SPACE MARK
        //     U+180e  MONGOLIAN VOWEL SEPARATOR
        //     U+2000  EN QUAD
        //     U+2001  EM QUAD
        //     U+2002  EN SPACE
        //     U+2003  EM SPACE
        //     U+2004  THREE-PER-EM SPACE
        //     U+2005  FOUR-PER-EM SPACE
        //     U+2006  SIX-PER-EM SPACE
        //     U+2007  FIGURE SPACE
        //     U+2008  PUNCTUATION SPACE
        //     U+2009  THIN SPACE
        //     U+200a  HAIR SPACE
        //     U+2028  LINE SEPARATOR
        //     U+2029  PARAGRAPH SEPARATOR
        //     U+202f  NARROW NO-BREAK SPACE
        //     U+205f  MEDIUM MATHEMATICAL SPACE
        //     U+3000  IDEOGRAPHIC SPACE
        ucfPrWhiteSpace = 0x100000
}

enum TUniChPropertiesX_

Enumerator:

ucfPxOtherAlphabetic
ucfPxOtherDefaultIgnorableCodePoint
ucfPxOtherGraphemeExtend
ucfPxOtherIdContinue
ucfPxOtherIdStart
ucfPxOtherLowercase
ucfPxOtherMath
ucfPxOtherUppercase
ucfPxIdsBinaryOperator
ucfPxIdsTrinaryOperator
ucfPxRadical
ucfPxUnifiedIdeograph

Definition at line 961 of file unicode.h.

{
        // More properties from PropList.txt.
        // - Used to derive the properties in DerivedCoreProperties.txt.
        ucfPxOtherAlphabetic = 1,
        ucfPxOtherDefaultIgnorableCodePoint = 2,
        ucfPxOtherGraphemeExtend = 4,
        ucfPxOtherIdContinue = 8,
        ucfPxOtherIdStart = 0x10,
        ucfPxOtherLowercase = 0x20,
        ucfPxOtherMath = 0x40,
        ucfPxOtherUppercase = 0x80,
        // - Used in ideographic description sequences.
        ucfPxIdsBinaryOperator = 0x100,
        ucfPxIdsTrinaryOperator = 0x200,
        ucfPxRadical = 0x400,
        ucfPxUnifiedIdeograph = 0x800
}

enum TUniChSubCategory_

Enumerator:

ucLetterUppercase
ucLetterLowercase
ucLetterTitlecase
ucLetterModifier
ucLetterOther
ucMarkNonspacing
ucMarkSpacingCombining
ucMarkEnclosing
ucNumberDecimalDigit
ucNumberLetter
ucNumberOther
ucPunctuationConnector
ucPunctuationDash
ucPunctuationOpen
ucPunctuationClose
ucPunctuationInitialQuote
ucPunctuationFinalQuote
ucPunctuationOther
ucSymbolMath
ucSymbolCurrency
ucSymbolModifier
ucSymbolOther
ucSeparatorSpace
ucSeparatorLine
ucSeparatorParagraph
ucOtherControl
ucOtherFormat
ucOtherSurrogate
ucOtherPrivateUse
ucOtherNotAssigned

Definition at line 676 of file unicode.h.

{
#define DefineUniSubCat(cat, subCat, c) uc ## cat ## subCat = ((uc ## cat) << 8) | (int(uchar(c)) & 0xff)
        DefineUniSubCat(Letter, Uppercase, 'u'),            // ucLetterUppercase
        DefineUniSubCat(Letter, Lowercase, 'l'),
        DefineUniSubCat(Letter, Titlecase, 't'),
        DefineUniSubCat(Letter, Modifier, 'm'),
        DefineUniSubCat(Letter, Other, 'o'),
        DefineUniSubCat(Mark, Nonspacing, 'n'),
        DefineUniSubCat(Mark, SpacingCombining, 'c'),
        DefineUniSubCat(Mark, Enclosing, 'e'),
        DefineUniSubCat(Number, DecimalDigit, 'd'),
        DefineUniSubCat(Number, Letter, 'l'),
        DefineUniSubCat(Number, Other, 'o'),
        DefineUniSubCat(Punctuation, Connector, 'c'),
        DefineUniSubCat(Punctuation, Dash, 'd'),
        DefineUniSubCat(Punctuation, Open, 's'),
        DefineUniSubCat(Punctuation, Close, 'e'),
        DefineUniSubCat(Punctuation, InitialQuote, 'i'),
        DefineUniSubCat(Punctuation, FinalQuote, 'f'),
        DefineUniSubCat(Punctuation, Other, 'o'),
        DefineUniSubCat(Symbol, Math, 'm'),
        DefineUniSubCat(Symbol, Currency, 'c'),
        DefineUniSubCat(Symbol, Modifier, 'k'),
        DefineUniSubCat(Symbol, Other, 'o'),
        DefineUniSubCat(Separator, Space, 's'),
        DefineUniSubCat(Separator, Line, 'l'),
        DefineUniSubCat(Separator, Paragraph, 'p'),
        DefineUniSubCat(Other, Control, 'c'),
        DefineUniSubCat(Other, Format, 'f'),
        DefineUniSubCat(Other, Surrogate, 's'),
        DefineUniSubCat(Other, PrivateUse, 'o'),
        DefineUniSubCat(Other, NotAssigned, 'n')
}

enum TUnicodeErrorHandling_

Enumerator:

uehIgnore
uehThrow
uehReplace
uehAbort

Definition at line 18 of file unicode.h.

{
        // What happens when an error occurs:
        uehIgnore = 0,  // - it is silently ignored (nothing is added to the output vector)
        uehThrow = 1,   // - an exception is thrown (TUnicodeException)
        uehReplace = 2, // - the replacement character is added to the output vector
        uehAbort = 3    // - the encoding/decoding process stops immediately
}

enum TUtf16BomHandling_

Enumerator:

bomAllowed
bomRequired
bomIgnored

Definition at line 46 of file unicode.h.

{
        bomAllowed = 0,   // if a BOM is present, it is used to determine the byte order; otherwise, the default byte order is used
        bomRequired = 1,  // if a BOM is present, it is used to determine the byte order; otherwise, an error is reported
        bomIgnored = 2    // the default byte order is used; if a BOM is present, it is treated like any other character
}

Function Documentation

bool AlwaysFalse ( ) [inline]

Definition at line 3221 of file unicode.h.

Referenced by TUniChDb::InitScripts(), and TUniChDb::TestFindNextWordOrSentenceBoundary().

{
        int sum = 0;
        for (int i = 0; i < 5; i++) sum += i;
        return sum > 100;
}

Here is the caller graph for this function:

bool AlwaysTrue ( ) [inline]

Definition at line 3228 of file unicode.h.

{
        int sum = 0;
        for (int i = 0; i < 5; i++) sum += i;
        return sum < 100;
}

Classes

Defines

Typedefs

Enumerations

Functions

Define Documentation

Typedef Documentation

Enumeration Type Documentation

Function Documentation