|
SNAP Library, Developer Reference
2013-01-07 14:03:36
SNAP, a general-purpose, high-performance system for analysis and manipulation of large networks
|
#include <unicode.h>

Public Types | |
| typedef TUniChDb::TCaseConversion | TCaseConversion |
Public Member Functions | |
| TUnicode () | |
| TUnicode (const TStr &fnBinUcd) | |
| void | Init () |
| int | DecodeUtf8 (const TIntV &src, TIntV &dest) const |
| int | DecodeUtf8 (const TStr &src, TIntV &dest) const |
| int | EncodeUtf8 (const TIntV &src, TIntV &dest) const |
| TStr | EncodeUtf8Str (const TIntV &src) const |
| int | DecodeUtf16FromBytes (const TIntV &src, TIntV &dest, const TUtf16BomHandling bomHandling=bomAllowed, const TUniByteOrder defaultByteOrder=boMachineEndian) const |
| int | DecodeUtf16FromWords (const TIntV &src, TIntV &dest, const TUtf16BomHandling bomHandling=bomAllowed, const TUniByteOrder defaultByteOrder=boMachineEndian) const |
| int | EncodeUtf16ToWords (const TIntV &src, TIntV &dest, const bool insertBom, const TUniByteOrder destByteOrder=boMachineEndian) const |
| int | EncodeUtf16ToBytes (const TIntV &src, TIntV &dest, const bool insertBom, const TUniByteOrder destByteOrder=boMachineEndian) const |
| void | RegisterCodec (const TStr &nameList, const PCodecBase &codec) |
| void | UnregisterCodec (const TStr &nameList) |
| void | ClrCodecs () |
| void | InitCodecs () |
| PCodecBase | GetCodec (const TStr &name) const |
| void | GetAllCodecs (TCodecBaseV &dest) const |
| bool | FindNextWordBoundary (const TIntV &src, int &position) const |
| void | FindWordBoundaries (const TIntV &src, TBoolV &dest) const |
| bool | FindNextSentenceBoundary (const TIntV &src, int &position) const |
| void | FindSentenceBoundaries (const TIntV &src, TBoolV &dest) const |
| void | ClrSentenceBoundaryExceptions () |
| void | UseEnglishSentenceBoundaryExceptions () |
| void | Decompose (const TIntV &src, TIntV &dest, bool compatibility) const |
| void | Compose (const TIntV &src, TIntV &dest) const |
| void | DecomposeAndCompose (const TIntV &src, TIntV &dest, bool compatibility) const |
| int | ExtractStarters (const TIntV &src, TIntV &dest) const |
| int | ExtractStarters (TIntV &src) const |
| void | GetLowerCase (const TIntV &src, TIntV &dest) const |
| void | GetUpperCase (const TIntV &src, TIntV &dest) const |
| void | GetTitleCase (const TIntV &src, TIntV &dest) const |
| void | GetSimpleLowerCase (const TIntV &src, TIntV &dest) const |
| void | GetSimpleUpperCase (const TIntV &src, TIntV &dest) const |
| void | GetSimpleTitleCase (const TIntV &src, TIntV &dest) const |
| void | ToSimpleUpperCase (TIntV &src) const |
| void | ToSimpleLowerCase (TIntV &src) const |
| void | ToSimpleTitleCase (TIntV &src) const |
| void | GetCaseFolded (const TIntV &src, TIntV &dest, const bool full=true) const |
| void | ToCaseFolded (TIntV &src) const |
| TStr | GetUtf8CaseFolded (const TStr &s) const |
| DECLARE_FORWARDED_PROPERTY_METHODS | ___UniFwd2 (IsPrivateUse, IsSurrogate) |
| TUniChCategory | GetCat (const int cp) const |
| TUniChSubCategory | GetSubCat (const int cp) const |
| const char * | GetCharName (const int cp) const |
| TStr | GetCharNameS (const int cp) const |
Public Attributes | |
| TUniCodec | codec |
| TUniChDb | ucd |
| T8BitCodec< TEncoding_ISO8859_1 > | iso8859_1 |
| T8BitCodec< TEncoding_ISO8859_2 > | iso8859_2 |
| T8BitCodec< TEncoding_ISO8859_3 > | iso8859_3 |
| T8BitCodec< TEncoding_ISO8859_4 > | iso8859_4 |
| T8BitCodec< TEncoding_YuAscii > | yuAscii |
| T8BitCodec< TEncoding_CP1250 > | cp1250 |
| T8BitCodec< TEncoding_CP852 > | cp852 |
| T8BitCodec< TEncoding_CP437 > | cp437 |
Static Protected Member Functions | |
| static TStr | NormalizeCodecName (const TStr &name) |
Protected Attributes | |
| THash< TStr, PCodecBase > | codecs |
| TUnicode::TUnicode | ( | ) | [inline] |
| TUnicode::TUnicode | ( | const TStr & | fnBinUcd | ) | [inline, explicit] |
| DECLARE_FORWARDED_PROPERTY_METHODS TUnicode::___UniFwd2 | ( | IsPrivateUse | , |
| IsSurrogate | |||
| ) | const [inline] |
| void TUnicode::ClrCodecs | ( | ) | [inline] |
Definition at line 1876 of file unicode.h.
References THash< TKey, TDat, THashFunc >::Clr(), and codecs.
Referenced by InitCodecs().


| void TUnicode::ClrSentenceBoundaryExceptions | ( | ) | [inline] |
| void TUnicode::Compose | ( | const TIntV & | src, |
| TIntV & | dest | ||
| ) | const [inline] |
| int TUnicode::DecodeUtf16FromBytes | ( | const TIntV & | src, |
| TIntV & | dest, | ||
| const TUtf16BomHandling | bomHandling = bomAllowed, |
||
| const TUniByteOrder | defaultByteOrder = boMachineEndian |
||
| ) | const [inline] |
Definition at line 1805 of file unicode.h.
References codec, TUniCodec::DecodeUtf16FromBytes(), and TVec< TVal >::Len().
{
return (int) codec.DecodeUtf16FromBytes(src, 0, src.Len(), dest, true, bomHandling, defaultByteOrder); }

| int TUnicode::DecodeUtf16FromWords | ( | const TIntV & | src, |
| TIntV & | dest, | ||
| const TUtf16BomHandling | bomHandling = bomAllowed, |
||
| const TUniByteOrder | defaultByteOrder = boMachineEndian |
||
| ) | const [inline] |
Definition at line 1818 of file unicode.h.
References codec, TUniCodec::DecodeUtf16FromWords(), and TVec< TVal >::Len().
{
return (int) codec.DecodeUtf16FromWords(src, 0, src.Len(), dest, true, bomHandling, defaultByteOrder); }

| int TUnicode::DecodeUtf8 | ( | const TIntV & | src, |
| TIntV & | dest | ||
| ) | const [inline] |
Definition at line 1786 of file unicode.h.
References codec, and TUniCodec::DecodeUtf8().
Referenced by TJsonVal::AddEscapeChAFromStr(), GetUtf8CaseFolded(), and TUStr::TUStr().
{ return (int) codec.DecodeUtf8(src, dest); }


| int TUnicode::DecodeUtf8 | ( | const TStr & | src, |
| TIntV & | dest | ||
| ) | const [inline] |
Definition at line 1787 of file unicode.h.
References codec, and TUniCodec::DecodeUtf8().
{ return (int) codec.DecodeUtf8(src, dest); }

| void TUnicode::Decompose | ( | const TIntV & | src, |
| TIntV & | dest, | ||
| bool | compatibility | ||
| ) | const [inline] |
Definition at line 1929 of file unicode.h.
References TUniChDb::Decompose(), and ucd.
Referenced by TUStr::GetStarterLowerCaseStr(), TUStr::GetStarterStr(), TUStr::ToStarterCase(), and TUStr::TUStr().


| void TUnicode::DecomposeAndCompose | ( | const TIntV & | src, |
| TIntV & | dest, | ||
| bool | compatibility | ||
| ) | const [inline] |
Definition at line 1941 of file unicode.h.
References TUniChDb::DecomposeAndCompose(), and ucd.
{ return ucd.DecomposeAndCompose(src, dest, compatibility); }

| int TUnicode::EncodeUtf16ToBytes | ( | const TIntV & | src, |
| TIntV & | dest, | ||
| const bool | insertBom, | ||
| const TUniByteOrder | destByteOrder = boMachineEndian |
||
| ) | const [inline] |
Definition at line 1833 of file unicode.h.
References codec, TUniCodec::EncodeUtf16ToBytes(), and TVec< TVal >::Len().
{
return (int) codec.EncodeUtf16ToBytes(src, 0, src.Len(), dest, true, insertBom, destByteOrder); }

| int TUnicode::EncodeUtf16ToWords | ( | const TIntV & | src, |
| TIntV & | dest, | ||
| const bool | insertBom, | ||
| const TUniByteOrder | destByteOrder = boMachineEndian |
||
| ) | const [inline] |
Definition at line 1829 of file unicode.h.
References codec, TUniCodec::EncodeUtf16ToWords(), and TVec< TVal >::Len().
{
return (int) codec.EncodeUtf16ToWords(src, 0, src.Len(), dest, true, insertBom, destByteOrder); }

| int TUnicode::EncodeUtf8 | ( | const TIntV & | src, |
| TIntV & | dest | ||
| ) | const [inline] |
Definition at line 1791 of file unicode.h.
References codec, and TUniCodec::EncodeUtf8().
{ return (int) codec.EncodeUtf8(src, dest); }

| TStr TUnicode::EncodeUtf8Str | ( | const TIntV & | src | ) | const [inline] |
Definition at line 1795 of file unicode.h.
References codec, and TUniCodec::EncodeUtf8Str().
Referenced by TUStr::EncodeUtf8(), TUStr::GetStarterLowerCaseStr(), TUStr::GetStarterStr(), TUStr::GetStr(), and GetUtf8CaseFolded().
{ return codec.EncodeUtf8Str(src); }


| int TUnicode::ExtractStarters | ( | const TIntV & | src, |
| TIntV & | dest | ||
| ) | const [inline] |
Definition at line 1946 of file unicode.h.
References TUniChDb::ExtractStarters(), and ucd.
Referenced by TUStr::GetStarterLowerCaseStr(), TUStr::GetStarterStr(), and TUStr::ToStarterCase().
{ return (int) ucd.ExtractStarters(src, dest); }


| int TUnicode::ExtractStarters | ( | TIntV & | src | ) | const [inline] |
Definition at line 1948 of file unicode.h.
References TUniChDb::ExtractStarters(), and ucd.
{ return (int) ucd.ExtractStarters(src); }

| bool TUnicode::FindNextSentenceBoundary | ( | const TIntV & | src, |
| int & | position | ||
| ) | const [inline] |
Definition at line 1911 of file unicode.h.
References TUniChDb::FindNextSentenceBoundary(), TVec< TVal >::Len(), and ucd.
{
if (position < 0) { position = 0; return true; }
size_t position_; bool retVal = ucd.FindNextSentenceBoundary(src, 0, src.Len(), position_); position = int(position_); return retVal; }

| bool TUnicode::FindNextWordBoundary | ( | const TIntV & | src, |
| int & | position | ||
| ) | const [inline] |
Definition at line 1896 of file unicode.h.
References TUniChDb::FindNextWordBoundary(), TVec< TVal >::Len(), and ucd.
{
if (position < 0) { position = 0; return true; }
size_t position_; bool retVal = ucd.FindNextWordBoundary(src, 0, src.Len(), position_); position = int(position_); return retVal; }

| void TUnicode::FindSentenceBoundaries | ( | const TIntV & | src, |
| TBoolV & | dest | ||
| ) | const [inline] |
Definition at line 1917 of file unicode.h.
References TUniChDb::FindSentenceBoundaries(), TVec< TVal >::Len(), and ucd.
{ ucd.FindSentenceBoundaries(src, 0, src.Len(), dest); }

| void TUnicode::FindWordBoundaries | ( | const TIntV & | src, |
| TBoolV & | dest | ||
| ) | const [inline] |
Definition at line 1902 of file unicode.h.
References TUniChDb::FindWordBoundaries(), TVec< TVal >::Len(), and ucd.
Referenced by TUStr::GetWordBoundPV().
{ ucd.FindWordBoundaries(src, 0, src.Len(), dest); }


| void TUnicode::GetAllCodecs | ( | TCodecBaseV & | dest | ) | const [inline] |
Definition at line 1882 of file unicode.h.
References TVec< TVal >::Add(), TVec< TVal >::Clr(), codec, codecs, THash< TKey, TDat, THashFunc >::FFirstKeyId(), THash< TKey, TDat, THashFunc >::FNextKeyId(), and TVec< TVal >::Len().
{
dest.Clr();
for (int i = codecs.FFirstKeyId(); codecs.FNextKeyId(i); ) {
PCodecBase codec = codecs[i]; bool found = false;
for (int j = 0; j < dest.Len(); j++) if (dest[j]() == codec()) { found = true; break; }
if (! found) dest.Add(codec); }}

| void TUnicode::GetCaseFolded | ( | const TIntV & | src, |
| TIntV & | dest, | ||
| const bool | full = true |
||
| ) | const [inline] |
Definition at line 1984 of file unicode.h.
References TUniChDb::GetCaseFolded(), and ucd.
Referenced by GetUtf8CaseFolded().
{ return ucd.GetCaseFolded(src, dest, true, full, false); }


| const char* TUnicode::GetCharName | ( | const int | cp | ) | const [inline] |
Definition at line 2019 of file unicode.h.
References TUniChDb::GetCharName(), and ucd.
{ return ucd.GetCharName(cp); }

| TStr TUnicode::GetCharNameS | ( | const int | cp | ) | const [inline] |
Definition at line 2020 of file unicode.h.
References TUniChDb::GetCharNameS(), and ucd.
Referenced by TUStr::GetChNm().
{ return ucd.GetCharNameS(cp); }


| PCodecBase TUnicode::GetCodec | ( | const TStr & | name | ) | const [inline] |
Definition at line 1878 of file unicode.h.
References TPt< TRec >::Clr(), codecs, THash< TKey, TDat, THashFunc >::IsKeyGetDat(), and NormalizeCodecName().
{
TStr s = NormalizeCodecName(name);
PCodecBase p; if (! codecs.IsKeyGetDat(s, p)) p.Clr();
return p; }

| void TUnicode::GetLowerCase | ( | const TIntV & | src, |
| TIntV & | dest | ||
| ) | const [inline] |
Definition at line 1960 of file unicode.h.
References TUniChDb::GetLowerCase(), and ucd.
{ ucd.GetLowerCase(src, dest, true, false, false); }

| void TUnicode::GetSimpleLowerCase | ( | const TIntV & | src, |
| TIntV & | dest | ||
| ) | const [inline] |
Definition at line 1967 of file unicode.h.
References TUniChDb::GetSimpleLowerCase(), and ucd.
Referenced by TUStr::GetStarterLowerCaseStr().
{ ucd.GetSimpleLowerCase(src, dest, true); }


| void TUnicode::GetSimpleTitleCase | ( | const TIntV & | src, |
| TIntV & | dest | ||
| ) | const [inline] |
Definition at line 1969 of file unicode.h.
References TUniChDb::GetSimpleTitleCase(), and ucd.
{ ucd.GetSimpleTitleCase(src, dest, true); }

| void TUnicode::GetSimpleUpperCase | ( | const TIntV & | src, |
| TIntV & | dest | ||
| ) | const [inline] |
Definition at line 1968 of file unicode.h.
References TUniChDb::GetSimpleUpperCase(), and ucd.
{ ucd.GetSimpleUpperCase(src, dest, true); }

| TUniChSubCategory TUnicode::GetSubCat | ( | const int | cp | ) | const [inline] |
| void TUnicode::GetTitleCase | ( | const TIntV & | src, |
| TIntV & | dest | ||
| ) | const [inline] |
Definition at line 1962 of file unicode.h.
References TUniChDb::GetTitleCase(), and ucd.
{ ucd.GetTitleCase(src, dest, true, false, false); }

| void TUnicode::GetUpperCase | ( | const TIntV & | src, |
| TIntV & | dest | ||
| ) | const [inline] |
Definition at line 1961 of file unicode.h.
References TUniChDb::GetUpperCase(), and ucd.
{ ucd.GetUpperCase(src, dest, true, false, false); }

| TStr TUnicode::GetUtf8CaseFolded | ( | const TStr & | s | ) | const [inline] |
Definition at line 1989 of file unicode.h.
References DecodeUtf8(), EncodeUtf8Str(), GetCaseFolded(), TStr::GetLc(), and TStr::Len().
{
bool isAscii = true;
for (int i = 0, n = s.Len(); i < n; i++) if (uchar(s[i]) >= 128) { isAscii = false; break; }
if (isAscii) return s.GetLc();
TIntV src; DecodeUtf8(s, src);
TIntV dest; GetCaseFolded(src, dest);
return EncodeUtf8Str(dest); }

| void TUnicode::Init | ( | ) | [inline] |
Definition at line 1778 of file unicode.h.
References InitCodecs().
Referenced by TUnicode().
{ InitCodecs(); }


| void TUnicode::InitCodecs | ( | ) |
Definition at line 1687 of file unicode.cpp.
References ClrCodecs(), and RegisterCodec().
Referenced by Init().
{
ClrCodecs();
RegisterCodec("ISO-8859-1 ISO_8859-1 ISO_8859-1:1987 ISO-IR-100 CP819 IBM819 LATIN1 L1 csISOLatin1 ISO8859-1 ISO8859_1 CP28591", TCodecBase::New<TCodec_ISO8859_1>());
RegisterCodec("ISO-8859-2 ISO_8859-2 ISO_8859-2:1987 ISO-IR-101 LATIN2 L2 csISOLatin2 ISO8859-2 ISO8859_2 CP28592", TCodecBase::New<TCodec_ISO8859_2>());
RegisterCodec("ISO-8859-3 ISO_8859-3 ISO_8859-3:1988 ISO-IR-109 LATIN3 L3 csISOLatin3 ISO8859-3 ISO8859_3 CP28593", TCodecBase::New<TCodec_ISO8859_3>());
RegisterCodec("ISO-8859-4 ISO_8859-4 ISO_8859-4:1988 ISO-IR-110 LATIN4 L4 csISOLatin4 ISO8859-4 ISO8859_4 CP28594", TCodecBase::New<TCodec_ISO8859_4>());
RegisterCodec("YUASCII YU-ASCII YU_ASCII", TCodecBase::New<TCodec_YuAscii>());
RegisterCodec("CP1250 Windows-1250 MS-EE", TCodecBase::New<TCodec_CP1250>());
RegisterCodec("CP852 cp852_DOSLatin2 DOSLatin2", TCodecBase::New<TCodec_CP852>());
RegisterCodec("CP437 cp437_DOSLatinUS DOSLatinUS", TCodecBase::New<TCodec_CP437>());
}


| static TStr TUnicode::NormalizeCodecName | ( | const TStr & | name | ) | [inline, static, protected] |
Definition at line 1865 of file unicode.h.
References TStr::ChangeStrAll(), and TStr::GetLc().
Referenced by GetCodec(), RegisterCodec(), and UnregisterCodec().
{
TStr s = name.GetLc(); s.ChangeStrAll("_", ""); s.ChangeStrAll("-", ""); return s; }


| void TUnicode::RegisterCodec | ( | const TStr & | nameList, |
| const PCodecBase & | codec | ||
| ) | [inline] |
Definition at line 1868 of file unicode.h.
References THash< TKey, TDat, THashFunc >::AddDat(), codecs, TVec< TVal >::Len(), NormalizeCodecName(), and TStr::SplitOnWs().
Referenced by InitCodecs().
{
TStrV names; nameList.SplitOnWs(names);
for (int i = 0; i < names.Len(); i++)
codecs.AddDat(NormalizeCodecName(names[i]), codec); }


| void TUnicode::ToCaseFolded | ( | TIntV & | src | ) | const [inline] |
Definition at line 1987 of file unicode.h.
References TUniChDb::ToCaseFolded(), and ucd.
{ return ucd.ToCaseFolded(src, false); }

| void TUnicode::ToSimpleLowerCase | ( | TIntV & | src | ) | const [inline] |
Definition at line 1973 of file unicode.h.
References TUniChDb::ToSimpleLowerCase(), and ucd.
Referenced by TUStr::ToLowerCase().
{ ucd.ToSimpleLowerCase(src); }


| void TUnicode::ToSimpleTitleCase | ( | TIntV & | src | ) | const [inline] |
Definition at line 1974 of file unicode.h.
References TUniChDb::ToSimpleTitleCase(), and ucd.
{ ucd.ToSimpleTitleCase(src); }

| void TUnicode::ToSimpleUpperCase | ( | TIntV & | src | ) | const [inline] |
Definition at line 1972 of file unicode.h.
References TUniChDb::ToSimpleUpperCase(), and ucd.
Referenced by TUStr::ToUpperCase().
{ ucd.ToSimpleUpperCase(src); }


| void TUnicode::UnregisterCodec | ( | const TStr & | nameList | ) | [inline] |
Definition at line 1872 of file unicode.h.
References codecs, THash< TKey, TDat, THashFunc >::DelKey(), TVec< TVal >::Len(), NormalizeCodecName(), and TStr::SplitOnWs().
{
TStrV names; nameList.SplitOnWs(names);
for (int i = 0; i < names.Len(); i++)
codecs.DelKey(NormalizeCodecName(names[i])); }

| void TUnicode::UseEnglishSentenceBoundaryExceptions | ( | ) | [inline] |
Definition at line 1920 of file unicode.h.
References TUniChDb::SbEx_SetStdEnglish(), and ucd.
{ ucd.SbEx_SetStdEnglish(); }

TUniCodec TUnicode::codec |
Definition at line 1773 of file unicode.h.
Referenced by DecodeUtf16FromBytes(), DecodeUtf16FromWords(), DecodeUtf8(), EncodeUtf16ToBytes(), EncodeUtf16ToWords(), EncodeUtf8(), EncodeUtf8Str(), and GetAllCodecs().
THash<TStr, PCodecBase> TUnicode::codecs [protected] |
Definition at line 1864 of file unicode.h.
Referenced by ClrCodecs(), GetAllCodecs(), GetCodec(), RegisterCodec(), and UnregisterCodec().
TUniChDb TUnicode::ucd |
Definition at line 1774 of file unicode.h.
Referenced by ___UniFwd2(), ClrSentenceBoundaryExceptions(), Compose(), Decompose(), DecomposeAndCompose(), ExtractStarters(), FindNextSentenceBoundary(), FindNextWordBoundary(), FindSentenceBoundaries(), FindWordBoundaries(), GetCaseFolded(), GetCharName(), GetCharNameS(), TUStr::GetChScriptId(), GetLowerCase(), TUStr::GetScriptId(), TUStr::GetScriptNm(), GetSimpleLowerCase(), GetSimpleTitleCase(), GetSimpleUpperCase(), GetSubCat(), GetTitleCase(), GetUpperCase(), ToCaseFolded(), ToSimpleLowerCase(), ToSimpleTitleCase(), ToSimpleUpperCase(), TUnicode(), and UseEnglishSentenceBoundaryExceptions().