SNAP Library 2.0, User Reference
2013-05-13 16:33:57
SNAP, a general purpose, high performance system for analysis and manipulation of large networks
|
#include <unicode.h>
Public Types | |
typedef TUniChDb::TCaseConversion | TCaseConversion |
Public Member Functions | |
TUnicode () | |
TUnicode (const TStr &fnBinUcd) | |
void | Init () |
int | DecodeUtf8 (const TIntV &src, TIntV &dest) const |
int | DecodeUtf8 (const TStr &src, TIntV &dest) const |
int | EncodeUtf8 (const TIntV &src, TIntV &dest) const |
TStr | EncodeUtf8Str (const TIntV &src) const |
int | DecodeUtf16FromBytes (const TIntV &src, TIntV &dest, const TUtf16BomHandling bomHandling=bomAllowed, const TUniByteOrder defaultByteOrder=boMachineEndian) const |
int | DecodeUtf16FromWords (const TIntV &src, TIntV &dest, const TUtf16BomHandling bomHandling=bomAllowed, const TUniByteOrder defaultByteOrder=boMachineEndian) const |
int | EncodeUtf16ToWords (const TIntV &src, TIntV &dest, const bool insertBom, const TUniByteOrder destByteOrder=boMachineEndian) const |
int | EncodeUtf16ToBytes (const TIntV &src, TIntV &dest, const bool insertBom, const TUniByteOrder destByteOrder=boMachineEndian) const |
void | RegisterCodec (const TStr &nameList, const PCodecBase &codec) |
void | UnregisterCodec (const TStr &nameList) |
void | ClrCodecs () |
void | InitCodecs () |
PCodecBase | GetCodec (const TStr &name) const |
void | GetAllCodecs (TCodecBaseV &dest) const |
bool | FindNextWordBoundary (const TIntV &src, int &position) const |
void | FindWordBoundaries (const TIntV &src, TBoolV &dest) const |
bool | FindNextSentenceBoundary (const TIntV &src, int &position) const |
void | FindSentenceBoundaries (const TIntV &src, TBoolV &dest) const |
void | ClrSentenceBoundaryExceptions () |
void | UseEnglishSentenceBoundaryExceptions () |
void | Decompose (const TIntV &src, TIntV &dest, bool compatibility) const |
void | Compose (const TIntV &src, TIntV &dest) const |
void | DecomposeAndCompose (const TIntV &src, TIntV &dest, bool compatibility) const |
int | ExtractStarters (const TIntV &src, TIntV &dest) const |
int | ExtractStarters (TIntV &src) const |
void | GetLowerCase (const TIntV &src, TIntV &dest) const |
void | GetUpperCase (const TIntV &src, TIntV &dest) const |
void | GetTitleCase (const TIntV &src, TIntV &dest) const |
void | GetSimpleLowerCase (const TIntV &src, TIntV &dest) const |
void | GetSimpleUpperCase (const TIntV &src, TIntV &dest) const |
void | GetSimpleTitleCase (const TIntV &src, TIntV &dest) const |
void | ToSimpleUpperCase (TIntV &src) const |
void | ToSimpleLowerCase (TIntV &src) const |
void | ToSimpleTitleCase (TIntV &src) const |
void | GetCaseFolded (const TIntV &src, TIntV &dest, const bool full=true) const |
void | ToCaseFolded (TIntV &src) const |
TStr | GetUtf8CaseFolded (const TStr &s) const |
DECLARE_FORWARDED_PROPERTY_METHODS | ___UniFwd2 (IsPrivateUse, IsSurrogate) |
TUniChCategory | GetCat (const int cp) const |
TUniChSubCategory | GetSubCat (const int cp) const |
const char * | GetCharName (const int cp) const |
TStr | GetCharNameS (const int cp) const |
Static Public Member Functions | |
static void | EncodeUtf8 (const uint &Ch, TChA &Dest) |
static TStr | EncodeUtf8 (const uint &Ch) |
Public Attributes | |
TUniCodec | codec |
TUniChDb | ucd |
T8BitCodec< TEncoding_ISO8859_1 > | iso8859_1 |
T8BitCodec< TEncoding_ISO8859_2 > | iso8859_2 |
T8BitCodec< TEncoding_ISO8859_3 > | iso8859_3 |
T8BitCodec< TEncoding_ISO8859_4 > | iso8859_4 |
T8BitCodec< TEncoding_YuAscii > | yuAscii |
T8BitCodec< TEncoding_CP1250 > | cp1250 |
T8BitCodec< TEncoding_CP852 > | cp852 |
T8BitCodec< TEncoding_CP437 > | cp437 |
Static Protected Member Functions | |
static TStr | NormalizeCodecName (const TStr &name) |
Protected Attributes | |
THash< TStr, PCodecBase > | codecs |
TUnicode::TUnicode | ( | ) | [inline] |
TUnicode::TUnicode | ( | const TStr & | fnBinUcd | ) | [inline, explicit] |
DECLARE_FORWARDED_PROPERTY_METHODS TUnicode::___UniFwd2 | ( | IsPrivateUse | , |
IsSurrogate | |||
) | const [inline] |
void TUnicode::ClrCodecs | ( | ) | [inline] |
void TUnicode::ClrSentenceBoundaryExceptions | ( | ) | [inline] |
void TUnicode::Compose | ( | const TIntV & | src, |
TIntV & | dest | ||
) | const [inline] |
int TUnicode::DecodeUtf16FromBytes | ( | const TIntV & | src, |
TIntV & | dest, | ||
const TUtf16BomHandling | bomHandling = bomAllowed, | ||
const TUniByteOrder | defaultByteOrder = boMachineEndian | ||
) | const [inline] |
Definition at line 1810 of file unicode.h.
{ return (int) codec.DecodeUtf16FromBytes(src, 0, src.Len(), dest, true, bomHandling, defaultByteOrder); }
int TUnicode::DecodeUtf16FromWords | ( | const TIntV & | src, |
TIntV & | dest, | ||
const TUtf16BomHandling | bomHandling = bomAllowed, | ||
const TUniByteOrder | defaultByteOrder = boMachineEndian | ||
) | const [inline] |
Definition at line 1823 of file unicode.h.
{ return (int) codec.DecodeUtf16FromWords(src, 0, src.Len(), dest, true, bomHandling, defaultByteOrder); }
int TUnicode::DecodeUtf8 | ( | const TIntV & | src, |
TIntV & | dest | ||
) | const [inline] |
Definition at line 1787 of file unicode.h.
{ return (int) codec.DecodeUtf8(src, dest); }
int TUnicode::DecodeUtf8 | ( | const TStr & | src, |
TIntV & | dest | ||
) | const [inline] |
Definition at line 1788 of file unicode.h.
{ return (int) codec.DecodeUtf8(src, dest); }
void TUnicode::Decompose | ( | const TIntV & | src, |
TIntV & | dest, | ||
bool | compatibility | ||
) | const [inline] |
void TUnicode::DecomposeAndCompose | ( | const TIntV & | src, |
TIntV & | dest, | ||
bool | compatibility | ||
) | const [inline] |
Definition at line 1946 of file unicode.h.
{ return ucd.DecomposeAndCompose(src, dest, compatibility); }
int TUnicode::EncodeUtf16ToBytes | ( | const TIntV & | src, |
TIntV & | dest, | ||
const bool | insertBom, | ||
const TUniByteOrder | destByteOrder = boMachineEndian | ||
) | const [inline] |
Definition at line 1838 of file unicode.h.
{ return (int) codec.EncodeUtf16ToBytes(src, 0, src.Len(), dest, true, insertBom, destByteOrder); }
int TUnicode::EncodeUtf16ToWords | ( | const TIntV & | src, |
TIntV & | dest, | ||
const bool | insertBom, | ||
const TUniByteOrder | destByteOrder = boMachineEndian | ||
) | const [inline] |
Definition at line 1834 of file unicode.h.
{ return (int) codec.EncodeUtf16ToWords(src, 0, src.Len(), dest, true, insertBom, destByteOrder); }
int TUnicode::EncodeUtf8 | ( | const TIntV & | src, |
TIntV & | dest | ||
) | const [inline] |
Definition at line 1792 of file unicode.h.
{ return (int) codec.EncodeUtf8(src, dest); }
void TUnicode::EncodeUtf8 | ( | const uint & | Ch, |
TChA & | Dest | ||
) | [static] |
Definition at line 1700 of file unicode.cpp.
{ if (c > 0x10ffff) { throw TExcept::New(TStr::Fmt("Unkown Unicode character %u", c)); } if (c < 0x80u) dest.AddCh(char(c & 0xffu)); else if (c < 0x800u) { dest.AddCh(char(TUniCodec::_1100_0000 | ((c >> 6) & TUniCodec::_0001_1111))); dest.AddCh(char(TUniCodec::_1000_0000 | (c & TUniCodec::_0011_1111))); } else if (c < 0x10000u) { dest.AddCh(char(TUniCodec::_1110_0000 | ((c >> 12) & TUniCodec::_0000_1111))); dest.AddCh(char(TUniCodec::_1000_0000 | ((c >> 6) & TUniCodec::_0011_1111))); dest.AddCh(char(TUniCodec::_1000_0000 | (c & TUniCodec::_0011_1111))); } else if (c < 0x200000u) { dest.AddCh(char(TUniCodec::_1111_0000 | ((c >> 18) & TUniCodec::_0000_0111))); dest.AddCh(char(TUniCodec::_1000_0000 | ((c >> 12) & TUniCodec::_0011_1111))); dest.AddCh(char(TUniCodec::_1000_0000 | ((c >> 6) & TUniCodec::_0011_1111))); dest.AddCh(char(TUniCodec::_1000_0000 | (c & TUniCodec::_0011_1111))); } else if (c < 0x4000000u) { dest.AddCh(char(TUniCodec::_1111_1000 | ((c >> 24) & TUniCodec::_0000_0011))); dest.AddCh(char(TUniCodec::_1000_0000 | ((c >> 18) & TUniCodec::_0011_1111))); dest.AddCh(char(TUniCodec::_1000_0000 | ((c >> 12) & TUniCodec::_0011_1111))); dest.AddCh(char(TUniCodec::_1000_0000 | ((c >> 6) & TUniCodec::_0011_1111))); dest.AddCh(char(TUniCodec::_1000_0000 | (c & TUniCodec::_0011_1111))); } else { dest.AddCh(char(TUniCodec::_1111_1100 | ((c >> 30) & TUniCodec::_0000_0011))); dest.AddCh(char(TUniCodec::_1000_0000 | ((c >> 24) & TUniCodec::_0011_1111))); dest.AddCh(char(TUniCodec::_1000_0000 | ((c >> 18) & TUniCodec::_0011_1111))); dest.AddCh(char(TUniCodec::_1000_0000 | ((c >> 12) & TUniCodec::_0011_1111))); dest.AddCh(char(TUniCodec::_1000_0000 | ((c >> 6) & TUniCodec::_0011_1111))); dest.AddCh(char(TUniCodec::_1000_0000 | (c & TUniCodec::_0011_1111))); } }
TStr TUnicode::EncodeUtf8 | ( | const uint & | Ch | ) | [static] |
Definition at line 1732 of file unicode.cpp.
{ TChA ChA; EncodeUtf8(Ch, ChA); return ChA; }
TStr TUnicode::EncodeUtf8Str | ( | const TIntV & | src | ) | const [inline] |
Definition at line 1796 of file unicode.h.
{ return codec.EncodeUtf8Str(src); }
int TUnicode::ExtractStarters | ( | const TIntV & | src, |
TIntV & | dest | ||
) | const [inline] |
Definition at line 1951 of file unicode.h.
{ return (int) ucd.ExtractStarters(src, dest); }
int TUnicode::ExtractStarters | ( | TIntV & | src | ) | const [inline] |
Definition at line 1953 of file unicode.h.
{ return (int) ucd.ExtractStarters(src); }
bool TUnicode::FindNextSentenceBoundary | ( | const TIntV & | src, |
int & | position | ||
) | const [inline] |
Definition at line 1916 of file unicode.h.
{ if (position < 0) { position = 0; return true; } size_t position_; bool retVal = ucd.FindNextSentenceBoundary(src, 0, src.Len(), position_); position = int(position_); return retVal; }
bool TUnicode::FindNextWordBoundary | ( | const TIntV & | src, |
int & | position | ||
) | const [inline] |
Definition at line 1901 of file unicode.h.
{ if (position < 0) { position = 0; return true; } size_t position_; bool retVal = ucd.FindNextWordBoundary(src, 0, src.Len(), position_); position = int(position_); return retVal; }
void TUnicode::FindSentenceBoundaries | ( | const TIntV & | src, |
TBoolV & | dest | ||
) | const [inline] |
Definition at line 1922 of file unicode.h.
{ ucd.FindSentenceBoundaries(src, 0, src.Len(), dest); }
void TUnicode::FindWordBoundaries | ( | const TIntV & | src, |
TBoolV & | dest | ||
) | const [inline] |
Definition at line 1907 of file unicode.h.
{ ucd.FindWordBoundaries(src, 0, src.Len(), dest); }
void TUnicode::GetAllCodecs | ( | TCodecBaseV & | dest | ) | const [inline] |
Definition at line 1887 of file unicode.h.
{ dest.Clr(); for (int i = codecs.FFirstKeyId(); codecs.FNextKeyId(i); ) { PCodecBase codec = codecs[i]; bool found = false; for (int j = 0; j < dest.Len(); j++) if (dest[j]() == codec()) { found = true; break; } if (! found) dest.Add(codec); }}
void TUnicode::GetCaseFolded | ( | const TIntV & | src, |
TIntV & | dest, | ||
const bool | full = true | ||
) | const [inline] |
Definition at line 1989 of file unicode.h.
{ return ucd.GetCaseFolded(src, dest, true, full, false); }
const char* TUnicode::GetCharName | ( | const int | cp | ) | const [inline] |
Definition at line 2024 of file unicode.h.
{ return ucd.GetCharName(cp); }
TStr TUnicode::GetCharNameS | ( | const int | cp | ) | const [inline] |
Definition at line 2025 of file unicode.h.
{ return ucd.GetCharNameS(cp); }
PCodecBase TUnicode::GetCodec | ( | const TStr & | name | ) | const [inline] |
Definition at line 1883 of file unicode.h.
{ TStr s = NormalizeCodecName(name); PCodecBase p; if (! codecs.IsKeyGetDat(s, p)) p.Clr(); return p; }
void TUnicode::GetLowerCase | ( | const TIntV & | src, |
TIntV & | dest | ||
) | const [inline] |
Definition at line 1965 of file unicode.h.
{ ucd.GetLowerCase(src, dest, true, false, false); }
void TUnicode::GetSimpleLowerCase | ( | const TIntV & | src, |
TIntV & | dest | ||
) | const [inline] |
Definition at line 1972 of file unicode.h.
{ ucd.GetSimpleLowerCase(src, dest, true); }
void TUnicode::GetSimpleTitleCase | ( | const TIntV & | src, |
TIntV & | dest | ||
) | const [inline] |
Definition at line 1974 of file unicode.h.
{ ucd.GetSimpleTitleCase(src, dest, true); }
void TUnicode::GetSimpleUpperCase | ( | const TIntV & | src, |
TIntV & | dest | ||
) | const [inline] |
Definition at line 1973 of file unicode.h.
{ ucd.GetSimpleUpperCase(src, dest, true); }
TUniChSubCategory TUnicode::GetSubCat | ( | const int | cp | ) | const [inline] |
void TUnicode::GetTitleCase | ( | const TIntV & | src, |
TIntV & | dest | ||
) | const [inline] |
Definition at line 1967 of file unicode.h.
{ ucd.GetTitleCase(src, dest, true, false, false); }
void TUnicode::GetUpperCase | ( | const TIntV & | src, |
TIntV & | dest | ||
) | const [inline] |
Definition at line 1966 of file unicode.h.
{ ucd.GetUpperCase(src, dest, true, false, false); }
TStr TUnicode::GetUtf8CaseFolded | ( | const TStr & | s | ) | const [inline] |
Definition at line 1994 of file unicode.h.
{ bool isAscii = true; for (int i = 0, n = s.Len(); i < n; i++) if (uchar(s[i]) >= 128) { isAscii = false; break; } if (isAscii) return s.GetLc(); TIntV src; DecodeUtf8(s, src); TIntV dest; GetCaseFolded(src, dest); return EncodeUtf8Str(dest); }
void TUnicode::Init | ( | ) | [inline] |
Definition at line 1779 of file unicode.h.
{ InitCodecs(); }
void TUnicode::InitCodecs | ( | ) |
Definition at line 1687 of file unicode.cpp.
{ ClrCodecs(); RegisterCodec("ISO-8859-1 ISO_8859-1 ISO_8859-1:1987 ISO-IR-100 CP819 IBM819 LATIN1 L1 csISOLatin1 ISO8859-1 ISO8859_1 CP28591", TCodecBase::New<TCodec_ISO8859_1>()); RegisterCodec("ISO-8859-2 ISO_8859-2 ISO_8859-2:1987 ISO-IR-101 LATIN2 L2 csISOLatin2 ISO8859-2 ISO8859_2 CP28592", TCodecBase::New<TCodec_ISO8859_2>()); RegisterCodec("ISO-8859-3 ISO_8859-3 ISO_8859-3:1988 ISO-IR-109 LATIN3 L3 csISOLatin3 ISO8859-3 ISO8859_3 CP28593", TCodecBase::New<TCodec_ISO8859_3>()); RegisterCodec("ISO-8859-4 ISO_8859-4 ISO_8859-4:1988 ISO-IR-110 LATIN4 L4 csISOLatin4 ISO8859-4 ISO8859_4 CP28594", TCodecBase::New<TCodec_ISO8859_4>()); RegisterCodec("YUASCII YU-ASCII YU_ASCII", TCodecBase::New<TCodec_YuAscii>()); RegisterCodec("CP1250 Windows-1250 MS-EE", TCodecBase::New<TCodec_CP1250>()); RegisterCodec("CP852 cp852_DOSLatin2 DOSLatin2", TCodecBase::New<TCodec_CP852>()); RegisterCodec("CP437 cp437_DOSLatinUS DOSLatinUS", TCodecBase::New<TCodec_CP437>()); }
static TStr TUnicode::NormalizeCodecName | ( | const TStr & | name | ) | [inline, static, protected] |
Definition at line 1870 of file unicode.h.
{ TStr s = name.GetLc(); s.ChangeStrAll("_", ""); s.ChangeStrAll("-", ""); return s; }
void TUnicode::RegisterCodec | ( | const TStr & | nameList, |
const PCodecBase & | codec | ||
) | [inline] |
void TUnicode::ToCaseFolded | ( | TIntV & | src | ) | const [inline] |
Definition at line 1992 of file unicode.h.
{ return ucd.ToCaseFolded(src, false); }
void TUnicode::ToSimpleLowerCase | ( | TIntV & | src | ) | const [inline] |
Definition at line 1978 of file unicode.h.
{ ucd.ToSimpleLowerCase(src); }
void TUnicode::ToSimpleTitleCase | ( | TIntV & | src | ) | const [inline] |
Definition at line 1979 of file unicode.h.
{ ucd.ToSimpleTitleCase(src); }
void TUnicode::ToSimpleUpperCase | ( | TIntV & | src | ) | const [inline] |
Definition at line 1977 of file unicode.h.
{ ucd.ToSimpleUpperCase(src); }
void TUnicode::UnregisterCodec | ( | const TStr & | nameList | ) | [inline] |
void TUnicode::UseEnglishSentenceBoundaryExceptions | ( | ) | [inline] |
Definition at line 1925 of file unicode.h.
{ ucd.SbEx_SetStdEnglish(); }
THash<TStr, PCodecBase> TUnicode::codecs [protected] |