SNAP Library , Developer Reference
2013-01-07 14:03:36
SNAP, a general purpose, high performance system for analysis and manipulation of large networks
|
Go to the source code of this file.
Classes | |
class | TUnicodeException |
class | TUniCodec |
class | TUniCaseFolding |
class | TCodecBase |
class | TCodecWrapper< TCodecImpl_ > |
class | TVecElt< TVector_ > |
class | TVecElt< TVec< TDat > > |
class | TVecElt< TChA > |
class | TEncoding_ISO8859_1 |
class | TEncoding_ISO8859_2 |
class | TEncoding_ISO8859_3 |
class | TEncoding_ISO8859_4 |
class | TEncoding_YuAscii |
class | TEncoding_CP437 |
class | TEncoding_CP852 |
class | TEncoding_CP1250 |
class | T8BitCodec< TEncoding_ > |
class | TUniChInfo |
class | TUniTrie< TItem_ > |
class | TUniTrie< TItem_ >::TNode |
class | TUniChDb |
class | TUniChDb::TUcdFileReader |
class | TUniChDb::TSubcatHelper |
class | TUnicode |
Defines | |
#define | DefineByte(b7, b6, b5, b4, b3, b2, b1, b0) _ ## b7 ## b6 ## b5 ## b4 ## _ ## b3 ## b2 ## b1 ## b0 = (b7 << 7) | (b6 << 6) | (b5 << 5) | (b4 << 4) | (b3 << 3) | (b2 << 2) | (b1 << 1) | b0 |
#define | DefineUniCat(cat, c) uc ## cat = (int(uchar(c)) & 0xff) |
#define | DefineUniSubCat(cat, subCat, c) uc ## cat ## subCat = ((uc ## cat) << 8) | (int(uchar(c)) & 0xff) |
#define | ___UniFwd1(name) bool name(const int cp) const { int i = h.GetKeyId(cp); if (i < 0) return false; else return h[i].name(); } |
#define | ___UniFwd2(name1, name2) ___UniFwd1(name1) ___UniFwd1(name2) |
#define | ___UniFwd3(name1, name2, name3) ___UniFwd2(name1, name2) ___UniFwd1(name3) |
#define | ___UniFwd4(name1, name2, name3, name4) ___UniFwd3(name1, name2, name3) ___UniFwd1(name4) |
#define | ___UniFwd5(name1, name2, name3, name4, name5) ___UniFwd4(name1, name2, name3, name4) ___UniFwd1(name5) |
#define | DECLARE_FORWARDED_PROPERTY_METHODS |
#define | ___UniFwd1(name) bool name(const int cp) const { return ucd.name(cp); } |
#define | ___OutRepl if (isDestLe) { dest.Add(replacementChar & 0xff); dest.Add((replacementChar >> 8) & 0xff); } else { dest.Add((replacementChar >> 8) & 0xff); dest.Add(replacementChar & 0xff); } |
#define | TestCurNext(curFlag, nextFlag) if ((wbfCur & curFlag) == curFlag && (wbfNext & nextFlag) == nextFlag) continue |
#define | TestCurNext2(curFlag, nextFlag, next2Flag) if ((wbfCur & curFlag) == curFlag && (wbfNext & nextFlag) == nextFlag && (wbfNext2 & next2Flag) == next2Flag) continue |
#define | TestPrevCurNext(prevFlag, curFlag, nextFlag) if ((wbfPrev & prevFlag) == prevFlag && (wbfCur & curFlag) == curFlag && (wbfNext & nextFlag) == nextFlag) continue |
#define | TestCur(curFlag) ((sbfCur & ucfSb##curFlag) == ucfSb##curFlag) |
#define | Trans(curFlag, newState) if (TestCur(curFlag)) { backState = st##newState; break; } |
#define | IsPeekAheadSkippable(sbf) ((sbf & (ucfSbOLetter | ucfSbUpper | ucfSbLower | ucfSbSep | ucfSbSTerm | ucfSbATerm)) == 0) |
#define | TestCurNext(curFlag, nextFlag) if ((sbfCur & curFlag) == curFlag && (sbfNext & nextFlag) == nextFlag) continue |
#define | TestCurNext2(curFlag, nextFlag, next2Flag) if ((sbfCur & curFlag) == curFlag && (sbfNext & nextFlag) == nextFlag && (sbfNext2 & next2Flag) == next2Flag) continue |
#define | TestPrevCurNext(prevFlag, curFlag, nextFlag) if ((sbfPrev & prevFlag) == prevFlag && (sbfCur & curFlag) == curFlag && (sbfNext & nextFlag) == nextFlag) continue |
Typedefs | |
typedef int | TUniVecIdx |
typedef enum TUnicodeErrorHandling_ | TUnicodeErrorHandling |
typedef enum TUniByteOrder_ | TUniByteOrder |
typedef enum TUtf16BomHandling_ | TUtf16BomHandling |
typedef THash< TInt, TIntV > | TIntIntVH |
typedef TPt< TCodecBase > | PCodecBase |
typedef TVec< PCodecBase > | TCodecBaseV |
typedef T8BitCodec < TEncoding_ISO8859_1 > | TCodec_ISO8859_1 |
typedef T8BitCodec < TEncoding_ISO8859_2 > | TCodec_ISO8859_2 |
typedef T8BitCodec < TEncoding_ISO8859_3 > | TCodec_ISO8859_3 |
typedef T8BitCodec < TEncoding_ISO8859_4 > | TCodec_ISO8859_4 |
typedef T8BitCodec < TEncoding_CP852 > | TCodec_CP852 |
typedef T8BitCodec < TEncoding_CP437 > | TCodec_CP437 |
typedef T8BitCodec < TEncoding_CP1250 > | TCodec_CP1250 |
typedef T8BitCodec < TEncoding_YuAscii > | TCodec_YuAscii |
typedef enum TUniChCategory_ | TUniChCategory |
typedef enum TUniChSubCategory_ | TUniChSubCategory |
typedef enum TUniChFlags_ | TUniChFlags |
typedef enum TUniChProperties_ | TUniChProperties |
typedef enum TUniChPropertiesX_ | TUniChPropertiesX |
Enumerations | |
enum | TUnicodeErrorHandling_ { uehIgnore = 0, uehThrow = 1, uehReplace = 2, uehAbort = 3 } |
enum | TUniByteOrder_ { boMachineEndian = 0, boLittleEndian = 1, boBigEndian = 2 } |
enum | TUtf16BomHandling_ { bomAllowed = 0, bomRequired = 1, bomIgnored = 2 } |
enum | TUniChCategory_ { DefineUniCat = (Letter, 'L'), DefineUniCat = (Letter, 'L'), DefineUniCat = (Letter, 'L'), DefineUniCat = (Letter, 'L'), DefineUniCat = (Letter, 'L'), DefineUniCat = (Letter, 'L'), DefineUniCat = (Letter, 'L'), DefineUniCat = (Letter, 'L') } |
enum | TUniChSubCategory_ { DefineUniSubCat = (Letter, Uppercase, 'u'), DefineUniSubCat = (Letter, Uppercase, 'u'), DefineUniSubCat = (Letter, Uppercase, 'u'), DefineUniSubCat = (Letter, Uppercase, 'u'), DefineUniSubCat = (Letter, Uppercase, 'u'), DefineUniSubCat = (Letter, Uppercase, 'u'), DefineUniSubCat = (Letter, Uppercase, 'u'), DefineUniSubCat = (Letter, Uppercase, 'u'), DefineUniSubCat = (Letter, Uppercase, 'u'), DefineUniSubCat = (Letter, Uppercase, 'u'), DefineUniSubCat = (Letter, Uppercase, 'u'), DefineUniSubCat = (Letter, Uppercase, 'u'), DefineUniSubCat = (Letter, Uppercase, 'u'), DefineUniSubCat = (Letter, Uppercase, 'u'), DefineUniSubCat = (Letter, Uppercase, 'u'), DefineUniSubCat = (Letter, Uppercase, 'u'), DefineUniSubCat = (Letter, Uppercase, 'u'), DefineUniSubCat = (Letter, Uppercase, 'u'), DefineUniSubCat = (Letter, Uppercase, 'u'), DefineUniSubCat = (Letter, Uppercase, 'u'), DefineUniSubCat = (Letter, Uppercase, 'u'), DefineUniSubCat = (Letter, Uppercase, 'u'), DefineUniSubCat = (Letter, Uppercase, 'u'), DefineUniSubCat = (Letter, Uppercase, 'u'), DefineUniSubCat = (Letter, Uppercase, 'u'), DefineUniSubCat = (Letter, Uppercase, 'u'), DefineUniSubCat = (Letter, Uppercase, 'u'), DefineUniSubCat = (Letter, Uppercase, 'u'), DefineUniSubCat = (Letter, Uppercase, 'u'), DefineUniSubCat = (Letter, Uppercase, 'u'), DefineUniSubCat = (Letter, Uppercase, 'u') } |
enum | TUniChFlags_ { ucfCompatibilityDecomposition = 1, ucfCompositionExclusion = 1 << 1, ucfWbFormat = 1 << 2, ucfWbKatakana = 1 << 3, ucfWbALetter = 1 << 4, ucfWbMidLetter = 1 << 5, ucfWbMidNum = 1 << 6, ucfWbNumeric = 1 << 7, ucfWbExtendNumLet = 1 << 8, ucfSbSep = 1 << 9, ucfSbFormat = 1 << 10, ucfSbSp = 1 << 11, ucfSbLower = 1 << 12, ucfSbUpper = 1 << 13, ucfSbOLetter = 1 << 14, ucfSbNumeric = 1 << 15, ucfSbATerm = 1 << 16, ucfSbSTerm = 1 << 17, ucfSbClose = 1 << 18, ucfSbMask = ucfSbSep | ucfSbFormat | ucfSbSp | ucfSbLower | ucfSbUpper | ucfSbOLetter | ucfSbNumeric | ucfSbATerm | ucfSbSTerm | ucfSbClose, ucfWbMask = ucfWbFormat | ucfWbKatakana | ucfWbALetter | ucfWbMidLetter | ucfWbMidNum | ucfWbNumeric | ucfWbExtendNumLet | ucfSbSep, ucfDcpAlphabetic = 1 << 19, ucfDcpDefaultIgnorableCodePoint = 1 << 20, ucfDcpLowercase = 1 << 21, ucfDcpGraphemeBase = 1 << 22, ucfDcpGraphemeExtend = 1 << 23, ucfDcpIdStart = 1 << 24, ucfDcpIdContinue = 1 << 25, ucfDcpMath = 1 << 26, ucfDcpUppercase = 1 << 27, ucfDcpXidStart = 1 << 28, ucfDcpXidContinue = 1 << 29, ucfDcpMask } |
enum | TUniChProperties_ { ucfPrAsciiHexDigit = 1, ucfPrBidiControl = 2, ucfPrDash = 4, ucfPrDeprecated = 8, ucfPrDiacritic = 0x10, ucfPrExtender = 0x20, ucfPrGraphemeLink = 0x40, ucfPrHexDigit = 0x80, ucfPrHyphen = 0x100, ucfPrIdeographic = 0x200, ucfPrJoinControl = 0x400, ucfPrLogicalOrderException = 0x800, ucfPrNoncharacterCodePoint = 0x1000, ucfPrPatternSyntax = 0x2000, ucfPrPatternWhiteSpace = 0x4000, ucfPrQuotationMark = 0x8000, ucfPrSoftDotted = 0x10000, ucfPrSTerm = 0x20000, ucfPrTerminalPunctuation = 0x40000, ucfPrVariationSelector = 0x80000, ucfPrWhiteSpace = 0x100000 } |
enum | TUniChPropertiesX_ { ucfPxOtherAlphabetic = 1, ucfPxOtherDefaultIgnorableCodePoint = 2, ucfPxOtherGraphemeExtend = 4, ucfPxOtherIdContinue = 8, ucfPxOtherIdStart = 0x10, ucfPxOtherLowercase = 0x20, ucfPxOtherMath = 0x40, ucfPxOtherUppercase = 0x80, ucfPxIdsBinaryOperator = 0x100, ucfPxIdsTrinaryOperator = 0x200, ucfPxRadical = 0x400, ucfPxUnifiedIdeograph = 0x800 } |
Functions | |
bool | AlwaysFalse () |
bool | AlwaysTrue () |
#define ___OutRepl if (isDestLe) { dest.Add(replacementChar & 0xff); dest.Add((replacementChar >> 8) & 0xff); } else { dest.Add((replacementChar >> 8) & 0xff); dest.Add(replacementChar & 0xff); } |
Referenced by TUniCodec::EncodeUtf16ToBytes().
#define ___UniFwd1 | ( | name | ) | bool name(const int cp) const { int i = h.GetKeyId(cp); if (i < 0) return false; else return h[i].name(); } |
#define ___UniFwd1 | ( | name | ) | bool name(const int cp) const { return ucd.name(cp); } |
#define ___UniFwd2 | ( | name1, | |
name2 | |||
) | ___UniFwd1(name1) ___UniFwd1(name2) |
#define ___UniFwd3 | ( | name1, | |
name2, | |||
name3 | |||
) | ___UniFwd2(name1, name2) ___UniFwd1(name3) |
#define ___UniFwd4 | ( | name1, | |
name2, | |||
name3, | |||
name4 | |||
) | ___UniFwd3(name1, name2, name3) ___UniFwd1(name4) |
#define ___UniFwd5 | ( | name1, | |
name2, | |||
name3, | |||
name4, | |||
name5 | |||
) | ___UniFwd4(name1, name2, name3, name4) ___UniFwd1(name5) |
___UniFwd5(IsAsciiHexDigit, IsBidiControl, IsDash, IsDeprecated, IsDiacritic) \ ___UniFwd5(IsExtender, IsGraphemeLink, IsHexDigit, IsHyphen, IsIdeographic) \ ___UniFwd5(IsJoinControl, IsLogicalOrderException, IsNoncharacter, IsQuotationMark, IsSoftDotted) \ ___UniFwd4(IsSTerminal, IsTerminalPunctuation, IsVariationSelector, IsWhiteSpace) \ ___UniFwd5(IsAlphabetic, IsUppercase, IsLowercase, IsMath, IsDefaultIgnorable) \ ___UniFwd4(IsGraphemeBase, IsGraphemeExtend, IsIdStart, IsIdContinue) \ ___UniFwd2(IsXidStart, IsXidContinue) \ ___UniFwd3(IsCompositionExclusion, IsCompatibilityDecomposition, IsSbSep) \ ___UniFwd1(IsGbExtend) \ ___UniFwd2(IsCased, IsCurrency)
#define DefineByte | ( | b7, | |
b6, | |||
b5, | |||
b4, | |||
b3, | |||
b2, | |||
b1, | |||
b0 | |||
) | _ ## b7 ## b6 ## b5 ## b4 ## _ ## b3 ## b2 ## b1 ## b0 = (b7 << 7) | (b6 << 6) | (b5 << 5) | (b4 << 4) | (b3 << 3) | (b2 << 2) | (b1 << 1) | b0 |
#define DefineUniCat | ( | cat, | |
c | |||
) | uc ## cat = (int(uchar(c)) & 0xff) |
#define DefineUniSubCat | ( | cat, | |
subCat, | |||
c | |||
) | uc ## cat ## subCat = ((uc ## cat) << 8) | (int(uchar(c)) & 0xff) |
#define IsPeekAheadSkippable | ( | sbf | ) | ((sbf & (ucfSbOLetter | ucfSbUpper | ucfSbLower | ucfSbSep | ucfSbSTerm | ucfSbATerm)) == 0) |
Referenced by TUniChDb::FindNextSentenceBoundary().
#define TestCur | ( | curFlag | ) | ((sbfCur & ucfSb##curFlag) == ucfSb##curFlag) |
#define TestCurNext | ( | curFlag, | |
nextFlag | |||
) | if ((wbfCur & curFlag) == curFlag && (wbfNext & nextFlag) == nextFlag) continue |
Referenced by TUniChDb::FindNextSentenceBoundary(), and TUniChDb::FindNextWordBoundary().
#define TestCurNext | ( | curFlag, | |
nextFlag | |||
) | if ((sbfCur & curFlag) == curFlag && (sbfNext & nextFlag) == nextFlag) continue |
#define TestCurNext2 | ( | curFlag, | |
nextFlag, | |||
next2Flag | |||
) | if ((wbfCur & curFlag) == curFlag && (wbfNext & nextFlag) == nextFlag && (wbfNext2 & next2Flag) == next2Flag) continue |
Referenced by TUniChDb::FindNextWordBoundary().
#define TestCurNext2 | ( | curFlag, | |
nextFlag, | |||
next2Flag | |||
) | if ((sbfCur & curFlag) == curFlag && (sbfNext & nextFlag) == nextFlag && (sbfNext2 & next2Flag) == next2Flag) continue |
#define TestPrevCurNext | ( | prevFlag, | |
curFlag, | |||
nextFlag | |||
) | if ((wbfPrev & prevFlag) == prevFlag && (wbfCur & curFlag) == curFlag && (wbfNext & nextFlag) == nextFlag) continue |
Referenced by TUniChDb::FindNextSentenceBoundary(), and TUniChDb::FindNextWordBoundary().
#define TestPrevCurNext | ( | prevFlag, | |
curFlag, | |||
nextFlag | |||
) | if ((sbfPrev & prevFlag) == prevFlag && (sbfCur & curFlag) == curFlag && (sbfNext & nextFlag) == nextFlag) continue |
Referenced by TUniChDb::FindNextSentenceBoundary().
typedef TPt<TCodecBase> PCodecBase |
typedef T8BitCodec<TEncoding_CP1250> TCodec_CP1250 |
typedef T8BitCodec<TEncoding_CP437> TCodec_CP437 |
typedef T8BitCodec<TEncoding_CP852> TCodec_CP852 |
typedef T8BitCodec<TEncoding_YuAscii> TCodec_YuAscii |
typedef TVec<PCodecBase> TCodecBaseV |
typedef enum TUniByteOrder_ TUniByteOrder |
typedef enum TUniChCategory_ TUniChCategory |
typedef enum TUniChFlags_ TUniChFlags |
typedef enum TUniChProperties_ TUniChProperties |
typedef enum TUniChPropertiesX_ TUniChPropertiesX |
typedef enum TUniChSubCategory_ TUniChSubCategory |
typedef enum TUnicodeErrorHandling_ TUnicodeErrorHandling |
typedef int TUniVecIdx |
typedef enum TUtf16BomHandling_ TUtf16BomHandling |
enum TUniByteOrder_ |
Definition at line 38 of file unicode.h.
{ boMachineEndian = 0, boLittleEndian = 1, boBigEndian = 2 }
enum TUniChCategory_ |
DefineUniCat | |
DefineUniCat | |
DefineUniCat | |
DefineUniCat | |
DefineUniCat | |
DefineUniCat | |
DefineUniCat | |
DefineUniCat |
Definition at line 661 of file unicode.h.
{ #define DefineUniCat(cat, c) uc ## cat = (int(uchar(c)) & 0xff) DefineUniCat(Letter, 'L'), // ucLetter DefineUniCat(Mark, 'M'), DefineUniCat(Number, 'N'), DefineUniCat(Punctuation, 'P'), DefineUniCat(Symbol, 'S'), DefineUniCat(Separator, 'Z'), DefineUniCat(Other, 'C') #undef DefineUniCat }
enum TUniChFlags_ |
Definition at line 711 of file unicode.h.
{ ucfCompatibilityDecomposition = 1, // if this flag is not set, the decomposition is canonical ucfCompositionExclusion = 1 << 1, // from CompositionExclusions.txt // Flags used when searching for word boundaries. See UAX #29. ucfWbFormat = 1 << 2, ucfWbKatakana = 1 << 3, ucfWbALetter = 1 << 4, ucfWbMidLetter = 1 << 5, ucfWbMidNum = 1 << 6, ucfWbNumeric = 1 << 7, ucfWbExtendNumLet = 1 << 8, // Flags used with sentence boundaries (Sep is also used with word boundaries). See UAX #29. ucfSbSep = 1 << 9, ucfSbFormat = 1 << 10, ucfSbSp = 1 << 11, ucfSbLower = 1 << 12, ucfSbUpper = 1 << 13, ucfSbOLetter = 1 << 14, ucfSbNumeric = 1 << 15, ucfSbATerm = 1 << 16, ucfSbSTerm = 1 << 17, ucfSbClose = 1 << 18, ucfSbMask = ucfSbSep | ucfSbFormat | ucfSbSp | ucfSbLower | ucfSbUpper | ucfSbOLetter | ucfSbNumeric | ucfSbATerm | ucfSbSTerm | ucfSbClose, ucfWbMask = ucfWbFormat | ucfWbKatakana | ucfWbALetter | ucfWbMidLetter | ucfWbMidNum | ucfWbNumeric | ucfWbExtendNumLet | ucfSbSep, // Flags from DerivedCoreProperties.txt. // [The comments are from UCD.html.] // - Characters with the Alphabetic property. For more information, see Chapter 4 in [Unicode]. // Generated from: Other_Alphabetic + Lu + Ll + Lt + Lm + Lo + Nl ucfDcpAlphabetic = 1 << 19, // - For programmatic determination of default-ignorable code points. // New characters that should be ignored in processing (unless explicitly supported) // will be assigned in these ranges, permitting programs to correctly handle the default // behavior of such characters when not otherwise supported. For more information, see // UAX #29: Text Boundaries [Breaks]. // Generated from Other_Default_Ignorable_Code_Point + Cf + Cc + Cs + Noncharacters - White_Space - annotation characters // [Examples: soft hyphen, zero-width space, noncharacters (e.g. U+fffe, U+ffff, U+1fffe, U+1ffff, etc.), surrogates, language tags, variation selectors] ucfDcpDefaultIgnorableCodePoint = 1 << 20, // - Characters with the Lowercase property. For more information, see Chapter 4 in [Unicode]. // Generated from: Other_Lowercase + Ll ucfDcpLowercase = 1 << 21, // - For programmatic determination of grapheme cluster boundaries. // For more information, see UAX #29: Text Boundaries [Breaks]. // Generated from: [0..10FFFF] - Cc - Cf - Cs - Co - Cn - Zl - Zp - Grapheme_Extend ucfDcpGraphemeBase = 1 << 22, // - For programmatic determination of grapheme cluster boundaries. // For more information, see UAX #29: Text Boundaries [Breaks]. // Generated from: Other_Grapheme_Extend + Me + Mn // Note: depending on an application's interpretation of Co (private use), they may be either // in Grapheme_Base, or in Grapheme_Extend, or in neither. ucfDcpGraphemeExtend = 1 << 23, // - Used to determine programming identifiers, as described in UAX #31: Identifier and Pattern Syntax. ucfDcpIdStart = 1 << 24, ucfDcpIdContinue = 1 << 25, // - Characters with the Math property. For more information, see Chapter 4 in [Unicode]. // Generated from: Sm + Other_Math ucfDcpMath = 1 << 26, // - Characters with the Uppercase property. For more information, see Chapter 4 in [Unicode]. // Generated from: Lu + Other_Uppercase ucfDcpUppercase = 1 << 27, // - Used to determine programming identifiers, as described in UAX #31: Identifier and Pattern Syntax. ucfDcpXidStart = 1 << 28, ucfDcpXidContinue = 1 << 29, ucfDcpMask = ucfDcpAlphabetic | ucfDcpDefaultIgnorableCodePoint | ucfDcpLowercase | ucfDcpGraphemeBase | ucfDcpGraphemeExtend | ucfDcpIdStart | ucfDcpIdContinue | ucfDcpMath | ucfDcpUppercase | ucfDcpXidStart | ucfDcpXidContinue, }
enum TUniChProperties_ |
Definition at line 779 of file unicode.h.
{ // The flags from PropList.txt. // [The comments are from UCD.html.] // - ASCII characters commonly used for the representation of hexadecimal numbers. // [= 0123456789abcdefABCDEF] ucfPrAsciiHexDigit = 1, // - Those format control characters which have specific functions in the Bidirectional Algorithm. ucfPrBidiControl = 2, // - Those punctuation characters explicitly called out as dashes in the Unicode Standard, // plus compatibility equivalents to those. Most of these have the Pd General Category, // but some have the Sm General Category because of their use in mathematics. // U+0002d HYPHEN-MINUS // U+0058a ARMENIAN HYPHEN // U+005be HEBREW PUNCTUATION MAQAF // U+01806 MONGOLIAN TODO SOFT HYPHEN // U+02010 HYPHEN // U+02011 NON-BREAKING HYPHEN // U+02012 FIGURE DASH // U+02013 EN DASH // U+02014 EM DASH // U+02015 HORIZONTAL BAR // U+02053 SWUNG DASH // U+0207b SUPERSCRIPT MINUS // U+0208b SUBSCRIPT MINUS // U+02212 MINUS SIGN // U+02e17 DOUBLE OBLIQUE HYPHEN // U+0301c WAVE DASH // U+03030 WAVY DASH // U+030a0 KATAKANA-HIRAGANA DOUBLE HYPHEN // U+0fe31 PRESENTATION FORM FOR VERTICAL EM DASH // U+0fe32 PRESENTATION FORM FOR VERTICAL EN DASH // U+0fe58 SMALL EM DASH // U+0fe63 SMALL HYPHEN-MINUS // U+0ff0d FULLWIDTH HYPHEN-MINUS ucfPrDash = 4, // - For a machine-readable list of deprecated characters. No characters will ever be removed // from the standard, but the usage of deprecated characters is strongly discouraged. ucfPrDeprecated = 8, // - Characters that linguistically modify the meaning of another character to which they apply. // Some diacritics are not combining characters, and some combining characters are not diacritics. ucfPrDiacritic = 0x10, // - Characters whose principal function is to extend the value or shape of a preceding alphabetic // character. Typical of these are length and iteration marks. ucfPrExtender = 0x20, // - Used in determining default grapheme cluster boundaries. For more information, see UAX #29: Text Boundaries. ucfPrGraphemeLink = 0x40, // - Characters commonly used for the representation of hexadecimal numbers, plus their compatibility equivalents. // [= AsciiHexDigit + fullwidth digit {0..9} + fullwidth latin {small|capital} letter {a..f}] ucfPrHexDigit = 0x80, // - Those dashes used to mark connections between pieces of words, plus the Katakana middle dot. // The Katakana middle dot functions like a hyphen, but is shaped like a dot rather than a dash. // U+0002d HYPHEN-MINUS // U+000ad SOFT HYPHEN // U+0058a ARMENIAN HYPHEN // U+01806 MONGOLIAN TODO SOFT HYPHEN // U+02010 HYPHEN // U+02011 NON-BREAKING HYPHEN // U+02e17 DOUBLE OBLIQUE HYPHEN // U+030fb KATAKANA MIDDLE DOT // U+0fe63 SMALL HYPHEN-MINUS // U+0ff0d FULLWIDTH HYPHEN-MINUS // U+0ff65 HALFWIDTH KATAKANA MIDDLE DOT ucfPrHyphen = 0x100, // - Characters considered to be CJKV (Chinese, Japanese, Korean, and Vietnamese) ideographs. ucfPrIdeographic = 0x200, // - Those format control characters which have specific functions for control of cursive joining and ligation. ucfPrJoinControl = 0x400, // - There are a small number of characters that do not use logical order. // These characters require special handling in most processing. ucfPrLogicalOrderException = 0x800, // - Code points that are permanently reserved for internal use. ucfPrNoncharacterCodePoint = 0x1000, // - Used for pattern syntax as described in UAX #31: Identifier and Pattern Syntax. ucfPrPatternSyntax = 0x2000, ucfPrPatternWhiteSpace = 0x4000, // - Those punctuation characters that function as quotation marks. // U+00022 QUOTATION MARK // U+00027 APOSTROPHE // U+000ab LEFT-POINTING DOUBLE ANGLE QUOTATION MARK // U+000bb RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK // U+02018 LEFT SINGLE QUOTATION MARK // U+02019 RIGHT SINGLE QUOTATION MARK // U+0201a SINGLE LOW-9 QUOTATION MARK // U+0201b SINGLE HIGH-REVERSED-9 QUOTATION MARK // U+0201c LEFT DOUBLE QUOTATION MARK // U+0201d RIGHT DOUBLE QUOTATION MARK // U+0201e DOUBLE LOW-9 QUOTATION MARK // U+0201f DOUBLE HIGH-REVERSED-9 QUOTATION MARK // U+02039 SINGLE LEFT-POINTING ANGLE QUOTATION MARK // U+0203a SINGLE RIGHT-POINTING ANGLE QUOTATION MARK // U+0300c LEFT CORNER BRACKET // U+0300d RIGHT CORNER BRACKET // U+0300e LEFT WHITE CORNER BRACKET // U+0300f RIGHT WHITE CORNER BRACKET // U+0301d REVERSED DOUBLE PRIME QUOTATION MARK // U+0301e DOUBLE PRIME QUOTATION MARK // U+0301f LOW DOUBLE PRIME QUOTATION MARK // U+0fe41 PRESENTATION FORM FOR VERTICAL LEFT CORNER BRACKET // U+0fe42 PRESENTATION FORM FOR VERTICAL RIGHT CORNER BRACKET // U+0fe43 PRESENTATION FORM FOR VERTICAL LEFT WHITE CORNER BRACKET // U+0fe44 PRESENTATION FORM FOR VERTICAL RIGHT WHITE CORNER BRACKET // U+0ff02 FULLWIDTH QUOTATION MARK // U+0ff07 FULLWIDTH APOSTROPHE // U+0ff62 HALFWIDTH LEFT CORNER BRACKET // U+0ff63 HALFWIDTH RIGHT CORNER BRACKET ucfPrQuotationMark = 0x8000, // - Characters with a "soft dot", like i or j. An accent placed on these characters causes the dot to disappear. // An explicit _dot above_ can be added where required, such as in Lithuanian. ucfPrSoftDotted = 0x10000, // - Sentence Terminal. Used in UAX #29: Text Boundaries. // U+00021 EXCLAMATION MARK // U+0002e FULL STOP // U+0003f QUESTION MARK // U+0203c DOUBLE EXCLAMATION MARK // U+0203d INTERROBANG // U+02047 DOUBLE QUESTION MARK // U+02048 QUESTION EXCLAMATION MARK // U+02049 EXCLAMATION QUESTION MARK // U+03002 IDEOGRAPHIC FULL STOP // [plus many characters from other writing systems] ucfPrSTerm = 0x20000, // - Those punctuation characters that generally mark the end of textual units. // [JB note: this set contains more character than STerm. For example, it contains // the comma, colon and semicolon, whereas STerm doesn't.] // U+00021 EXCLAMATION MARK // U+0002c COMMA // U+0002e FULL STOP // U+0003a COLON // U+0003b SEMICOLON // U+0003f QUESTION MARK // U+0203c DOUBLE EXCLAMATION MARK // U+0203d INTERROBANG // U+02047 DOUBLE QUESTION MARK // U+02048 QUESTION EXCLAMATION MARK // U+02049 EXCLAMATION QUESTION MARK // [plus *lots* of charcters from other writing systems] ucfPrTerminalPunctuation = 0x40000, // - Indicates all those characters that qualify as Variation Selectors. // For details on the behavior of these characters, see StandardizedVariants.html and // Section 16.4, Variation Selectors in [Unicode]. ucfPrVariationSelector = 0x80000, // - Those separator characters and control characters which should be treated by // programming languages as "white space" for the purpose of parsing elements. // Note: ZERO WIDTH SPACE and ZERO WIDTH NO-BREAK SPACE are not included, // since their functions are restricted to line-break control. // Their names are unfortunately misleading in this respect. // Note: There are other senses of "whitespace" that encompass a different set of characters. // [JB note: e.g. there's a BIDI class for whitespace ('WS') in UnicodeData.txt. // There's also a "Sp" class in the sentence boundary algorithm, see UAX #29, sec. 5.1.] // This includes the following characters: // U+0009 <control> // U+000a <control> // U+000b <control> // U+000c <control> // U+000d <control> // U+0020 SPACE // U+0085 <control> // U+00a0 NO-BREAK SPACE // U+1680 OGHAM SPACE MARK // U+180e MONGOLIAN VOWEL SEPARATOR // U+2000 EN QUAD // U+2001 EM QUAD // U+2002 EN SPACE // U+2003 EM SPACE // U+2004 THREE-PER-EM SPACE // U+2005 FOUR-PER-EM SPACE // U+2006 SIX-PER-EM SPACE // U+2007 FIGURE SPACE // U+2008 PUNCTUATION SPACE // U+2009 THIN SPACE // U+200a HAIR SPACE // U+2028 LINE SEPARATOR // U+2029 PARAGRAPH SEPARATOR // U+202f NARROW NO-BREAK SPACE // U+205f MEDIUM MATHEMATICAL SPACE // U+3000 IDEOGRAPHIC SPACE ucfPrWhiteSpace = 0x100000 }
enum TUniChPropertiesX_ |
Definition at line 960 of file unicode.h.
{ // More properties from PropList.txt. // - Used to derive the properties in DerivedCoreProperties.txt. ucfPxOtherAlphabetic = 1, ucfPxOtherDefaultIgnorableCodePoint = 2, ucfPxOtherGraphemeExtend = 4, ucfPxOtherIdContinue = 8, ucfPxOtherIdStart = 0x10, ucfPxOtherLowercase = 0x20, ucfPxOtherMath = 0x40, ucfPxOtherUppercase = 0x80, // - Used in ideographic description sequences. ucfPxIdsBinaryOperator = 0x100, ucfPxIdsTrinaryOperator = 0x200, ucfPxRadical = 0x400, ucfPxUnifiedIdeograph = 0x800 }
enum TUniChSubCategory_ |
Definition at line 675 of file unicode.h.
{ #define DefineUniSubCat(cat, subCat, c) uc ## cat ## subCat = ((uc ## cat) << 8) | (int(uchar(c)) & 0xff) DefineUniSubCat(Letter, Uppercase, 'u'), // ucLetterUppercase DefineUniSubCat(Letter, Lowercase, 'l'), DefineUniSubCat(Letter, Titlecase, 't'), DefineUniSubCat(Letter, Modifier, 'm'), DefineUniSubCat(Letter, Other, 'o'), DefineUniSubCat(Mark, Nonspacing, 'n'), DefineUniSubCat(Mark, SpacingCombining, 'c'), DefineUniSubCat(Mark, Enclosing, 'e'), DefineUniSubCat(Number, DecimalDigit, 'd'), DefineUniSubCat(Number, Letter, 'l'), DefineUniSubCat(Number, Other, 'o'), DefineUniSubCat(Punctuation, Connector, 'c'), DefineUniSubCat(Punctuation, Dash, 'd'), DefineUniSubCat(Punctuation, Open, 's'), DefineUniSubCat(Punctuation, Close, 'e'), DefineUniSubCat(Punctuation, InitialQuote, 'i'), DefineUniSubCat(Punctuation, FinalQuote, 'f'), DefineUniSubCat(Punctuation, Other, 'o'), DefineUniSubCat(Symbol, Math, 'm'), DefineUniSubCat(Symbol, Currency, 'c'), DefineUniSubCat(Symbol, Modifier, 'k'), DefineUniSubCat(Symbol, Other, 'o'), DefineUniSubCat(Separator, Space, 's'), DefineUniSubCat(Separator, Line, 'l'), DefineUniSubCat(Separator, Paragraph, 'p'), DefineUniSubCat(Other, Control, 'c'), DefineUniSubCat(Other, Format, 'f'), DefineUniSubCat(Other, Surrogate, 's'), DefineUniSubCat(Other, PrivateUse, 'o'), DefineUniSubCat(Other, NotAssigned, 'n') }
Definition at line 18 of file unicode.h.
{ // What happens when an error occurs: uehIgnore = 0, // - it is silently ignored (nothing is added to the output vector) uehThrow = 1, // - an exception is thrown (TUnicodeException) uehReplace = 2, // - the replacement character is added to the output vector uehAbort = 3 // - the encoding/decoding process stops immediately }
enum TUtf16BomHandling_ |
Definition at line 46 of file unicode.h.
{ bomAllowed = 0, // if a BOM is present, it is used to determine the byte order; otherwise, the default byte order is used bomRequired = 1, // if a BOM is present, it is used to determine the byte order; otherwise, an error is reported bomIgnored = 2 // the default byte order is used; if a BOM is present, it is treated like any other character }
bool AlwaysFalse | ( | ) | [inline] |
Definition at line 3216 of file unicode.h.
Referenced by TUniChDb::InitScripts(), and TUniChDb::TestFindNextWordOrSentenceBoundary().
{ int sum = 0; for (int i = 0; i < 5; i++) sum += i; return sum > 100; }
bool AlwaysTrue | ( | ) | [inline] |