SNAP Library 2.0, User Reference  2013-05-13 16:33:57
SNAP, a general purpose, high performance system for analysis and manipulation of large networks
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Defines
TUnicode Class Reference

#include <unicode.h>

List of all members.

Public Types

typedef TUniChDb::TCaseConversion TCaseConversion

Public Member Functions

 TUnicode ()
 TUnicode (const TStr &fnBinUcd)
void Init ()
int DecodeUtf8 (const TIntV &src, TIntV &dest) const
int DecodeUtf8 (const TStr &src, TIntV &dest) const
int EncodeUtf8 (const TIntV &src, TIntV &dest) const
TStr EncodeUtf8Str (const TIntV &src) const
int DecodeUtf16FromBytes (const TIntV &src, TIntV &dest, const TUtf16BomHandling bomHandling=bomAllowed, const TUniByteOrder defaultByteOrder=boMachineEndian) const
int DecodeUtf16FromWords (const TIntV &src, TIntV &dest, const TUtf16BomHandling bomHandling=bomAllowed, const TUniByteOrder defaultByteOrder=boMachineEndian) const
int EncodeUtf16ToWords (const TIntV &src, TIntV &dest, const bool insertBom, const TUniByteOrder destByteOrder=boMachineEndian) const
int EncodeUtf16ToBytes (const TIntV &src, TIntV &dest, const bool insertBom, const TUniByteOrder destByteOrder=boMachineEndian) const
void RegisterCodec (const TStr &nameList, const PCodecBase &codec)
void UnregisterCodec (const TStr &nameList)
void ClrCodecs ()
void InitCodecs ()
PCodecBase GetCodec (const TStr &name) const
void GetAllCodecs (TCodecBaseV &dest) const
bool FindNextWordBoundary (const TIntV &src, int &position) const
void FindWordBoundaries (const TIntV &src, TBoolV &dest) const
bool FindNextSentenceBoundary (const TIntV &src, int &position) const
void FindSentenceBoundaries (const TIntV &src, TBoolV &dest) const
void ClrSentenceBoundaryExceptions ()
void UseEnglishSentenceBoundaryExceptions ()
void Decompose (const TIntV &src, TIntV &dest, bool compatibility) const
void Compose (const TIntV &src, TIntV &dest) const
void DecomposeAndCompose (const TIntV &src, TIntV &dest, bool compatibility) const
int ExtractStarters (const TIntV &src, TIntV &dest) const
int ExtractStarters (TIntV &src) const
void GetLowerCase (const TIntV &src, TIntV &dest) const
void GetUpperCase (const TIntV &src, TIntV &dest) const
void GetTitleCase (const TIntV &src, TIntV &dest) const
void GetSimpleLowerCase (const TIntV &src, TIntV &dest) const
void GetSimpleUpperCase (const TIntV &src, TIntV &dest) const
void GetSimpleTitleCase (const TIntV &src, TIntV &dest) const
void ToSimpleUpperCase (TIntV &src) const
void ToSimpleLowerCase (TIntV &src) const
void ToSimpleTitleCase (TIntV &src) const
void GetCaseFolded (const TIntV &src, TIntV &dest, const bool full=true) const
void ToCaseFolded (TIntV &src) const
TStr GetUtf8CaseFolded (const TStr &s) const
DECLARE_FORWARDED_PROPERTY_METHODS ___UniFwd2 (IsPrivateUse, IsSurrogate)
TUniChCategory GetCat (const int cp) const
TUniChSubCategory GetSubCat (const int cp) const
const char * GetCharName (const int cp) const
TStr GetCharNameS (const int cp) const

Static Public Member Functions

static void EncodeUtf8 (const uint &Ch, TChA &Dest)
static TStr EncodeUtf8 (const uint &Ch)

Public Attributes

TUniCodec codec
TUniChDb ucd
T8BitCodec< TEncoding_ISO8859_1 > iso8859_1
T8BitCodec< TEncoding_ISO8859_2 > iso8859_2
T8BitCodec< TEncoding_ISO8859_3 > iso8859_3
T8BitCodec< TEncoding_ISO8859_4 > iso8859_4
T8BitCodec< TEncoding_YuAscii > yuAscii
T8BitCodec< TEncoding_CP1250 > cp1250
T8BitCodec< TEncoding_CP852 > cp852
T8BitCodec< TEncoding_CP437 > cp437

Static Protected Member Functions

static TStr NormalizeCodecName (const TStr &name)

Protected Attributes

THash< TStr, PCodecBase > codecs

Detailed Description

Definition at line 1771 of file unicode.h.


Member Typedef Documentation


Constructor & Destructor Documentation

TUnicode::TUnicode ( ) [inline]

Definition at line 1777 of file unicode.h.

// Default construction does nothing except call Init().
{ Init(); }
TUnicode::TUnicode ( const TStr & fnBinUcd) [inline, explicit]

Definition at line 1778 of file unicode.h.

// Calls ucd.LoadBin(fnBinUcd) first, then Init().
{ ucd.LoadBin(fnBinUcd); Init(); }

Member Function Documentation

DECLARE_FORWARDED_PROPERTY_METHODS TUnicode::___UniFwd2 ( IsPrivateUse  ,
IsSurrogate   
) const [inline]

Definition at line 2018 of file unicode.h.

                                                  // Returns ucd.GetCat(cp).
                                                  // NOTE(review): the heading above names the ___UniFwd2 macro, but this body presumably belongs to GetCat(const int cp) — confirm against unicode.h.
                                                  { return ucd.GetCat(cp); }
void TUnicode::ClrCodecs ( ) [inline]

Definition at line 1881 of file unicode.h.

// Empties the codecs table by calling codecs.Clr().
{ codecs.Clr(); }

void TUnicode::ClrSentenceBoundaryExceptions ( ) [inline]

Definition at line 1924 of file unicode.h.

// Forwards to ucd.SbEx_Clr().
{ ucd.SbEx_Clr(); }
void TUnicode::Compose ( const TIntV & src,
TIntV & dest
) const [inline]

Definition at line 1941 of file unicode.h.

// Forwards to ucd.Compose(src, dest, true); the meaning of the fixed third argument is defined by TUniChDb and is not visible here.
{ return ucd.Compose(src, dest, true); }
int TUnicode::DecodeUtf16FromBytes ( const TIntV & src,
TIntV & dest,
const TUtf16BomHandling  bomHandling = bomAllowed,
const TUniByteOrder  defaultByteOrder = boMachineEndian
) const [inline]

Definition at line 1810 of file unicode.h.

                                                                              {
                        // Forwards to codec.DecodeUtf16FromBytes with 0 and src.Len() (presumably start index and count — confirm in TUniCodec)
                        // and casts the result to int. The literal `true` is fixed here; its meaning is not visible in this excerpt.
                        return (int) codec.DecodeUtf16FromBytes(src, 0, src.Len(), dest, true, bomHandling, defaultByteOrder); }
int TUnicode::DecodeUtf16FromWords ( const TIntV & src,
TIntV & dest,
const TUtf16BomHandling  bomHandling = bomAllowed,
const TUniByteOrder  defaultByteOrder = boMachineEndian
) const [inline]

Definition at line 1823 of file unicode.h.

                                                                              {
                        // Forwards to codec.DecodeUtf16FromWords with 0 and src.Len() (presumably start index and count — confirm in TUniCodec)
                        // and casts the result to int. The literal `true` is fixed here; its meaning is not visible in this excerpt.
                        return (int) codec.DecodeUtf16FromWords(src, 0, src.Len(), dest, true, bomHandling, defaultByteOrder); }
int TUnicode::DecodeUtf8 ( const TIntV & src,
TIntV & dest
) const [inline]

Definition at line 1787 of file unicode.h.

// Forwards to codec.DecodeUtf8(src, dest) and casts the result to int.
{ return (int) codec.DecodeUtf8(src, dest); }
int TUnicode::DecodeUtf8 ( const TStr & src,
TIntV & dest
) const [inline]

Definition at line 1788 of file unicode.h.

// TStr overload: forwards to codec.DecodeUtf8(src, dest) and casts the result to int.
{ return (int) codec.DecodeUtf8(src, dest); }
void TUnicode::Decompose ( const TIntV & src,
TIntV & dest,
bool  compatibility
) const [inline]

Definition at line 1934 of file unicode.h.

// Forwards to ucd.Decompose with a trailing argument fixed to true (its meaning is defined by TUniChDb and not visible here).
{ ucd.Decompose(src, dest, compatibility, true); }
void TUnicode::DecomposeAndCompose ( const TIntV & src,
TIntV & dest,
bool  compatibility
) const [inline]

Definition at line 1946 of file unicode.h.

// Forwards all three arguments unchanged to ucd.DecomposeAndCompose.
{ return ucd.DecomposeAndCompose(src, dest, compatibility); }
int TUnicode::EncodeUtf16ToBytes ( const TIntV & src,
TIntV & dest,
const bool  insertBom,
const TUniByteOrder  destByteOrder = boMachineEndian
) const [inline]

Definition at line 1838 of file unicode.h.

                                                                           {
                        // Forwards to codec.EncodeUtf16ToBytes with 0 and src.Len() (presumably start index and count — confirm in TUniCodec)
                        // and casts the result to int. The literal `true` is fixed here; its meaning is not visible in this excerpt.
                        return (int) codec.EncodeUtf16ToBytes(src, 0, src.Len(), dest, true, insertBom, destByteOrder); }
int TUnicode::EncodeUtf16ToWords ( const TIntV & src,
TIntV & dest,
const bool  insertBom,
const TUniByteOrder  destByteOrder = boMachineEndian
) const [inline]

Definition at line 1834 of file unicode.h.

                                                                           {
                        // Forwards to codec.EncodeUtf16ToWords with 0 and src.Len() (presumably start index and count — confirm in TUniCodec)
                        // and casts the result to int. The literal `true` is fixed here; its meaning is not visible in this excerpt.
                        return (int) codec.EncodeUtf16ToWords(src, 0, src.Len(), dest, true, insertBom, destByteOrder); }
int TUnicode::EncodeUtf8 ( const TIntV & src,
TIntV & dest
) const [inline]

Definition at line 1792 of file unicode.h.

// Forwards to codec.EncodeUtf8(src, dest) and casts the result to int.
{ return (int) codec.EncodeUtf8(src, dest); }
void TUnicode::EncodeUtf8 ( const uint & Ch,
TChA & Dest
) [static]

Definition at line 1700 of file unicode.cpp.

                                                   {
        // Appends the UTF-8 encoding of the code point c to dest.
        // Values above 0x10ffff are rejected with an exception before anything is appended.
        if (c > 0x10ffff) {
                throw TExcept::New(TStr::Fmt("Unkown Unicode character %u", c)); }
        if (c < 0x80u) {
                // One byte: the value itself.
                dest.AddCh(char(c & 0xffu)); }
        else if (c < 0x800u) {
                // Two bytes: lead byte carries bits 6 and up, continuation byte the low 6 bits.
                dest.AddCh(char(TUniCodec::_1100_0000 | ((c >> 6) & TUniCodec::_0001_1111)));
                dest.AddCh(char(TUniCodec::_1000_0000 | (c & TUniCodec::_0011_1111))); }
        else if (c < 0x10000u) {
                // Three bytes.
                dest.AddCh(char(TUniCodec::_1110_0000 | ((c >> 12) & TUniCodec::_0000_1111)));
                dest.AddCh(char(TUniCodec::_1000_0000 | ((c >> 6) & TUniCodec::_0011_1111)));
                dest.AddCh(char(TUniCodec::_1000_0000 | (c & TUniCodec::_0011_1111))); }
        else {
                // Four bytes. The range check at the top guarantees c <= 0x10ffff here, so the
                // former five- and six-byte branches could never execute and have been removed.
                dest.AddCh(char(TUniCodec::_1111_0000 | ((c >> 18) & TUniCodec::_0000_0111)));
                dest.AddCh(char(TUniCodec::_1000_0000 | ((c >> 12) & TUniCodec::_0011_1111)));
                dest.AddCh(char(TUniCodec::_1000_0000 | ((c >> 6) & TUniCodec::_0011_1111)));
                dest.AddCh(char(TUniCodec::_1000_0000 | (c & TUniCodec::_0011_1111))); }
}
TStr TUnicode::EncodeUtf8 ( const uint & Ch) [static]

Definition at line 1732 of file unicode.cpp.

                                        {
        // Encode into a temporary TChA via the two-argument overload, then return it
        // (converted to the TStr return type).
        TChA encoded;
        EncodeUtf8(Ch, encoded);
        return encoded;
}
TStr TUnicode::EncodeUtf8Str ( const TIntV & src) const [inline]

Definition at line 1796 of file unicode.h.

// Forwards to codec.EncodeUtf8Str(src) and returns its result.
{ return codec.EncodeUtf8Str(src); }
int TUnicode::ExtractStarters ( const TIntV & src,
TIntV & dest
) const [inline]

Definition at line 1951 of file unicode.h.

// Two-vector overload: forwards to ucd.ExtractStarters(src, dest) and casts the result to int.
{ return (int) ucd.ExtractStarters(src, dest); }
int TUnicode::ExtractStarters ( TIntV & src) const [inline]

Definition at line 1953 of file unicode.h.

// Single-vector overload: forwards to ucd.ExtractStarters(src) and casts the result to int.
{ return (int) ucd.ExtractStarters(src); }
bool TUnicode::FindNextSentenceBoundary ( const TIntV & src,
int &  position
) const [inline]

Definition at line 1916 of file unicode.h.

                                                                             {
                // A negative position is reset to 0 and reported as a boundary (returns true) without consulting ucd.
                if (position < 0) { position = 0; return true; }
                // Seed the size_t cursor from the caller's position. It was previously left
                // uninitialized, so the caller's starting point was ignored and ucd could read
                // an indeterminate value. position is known to be >= 0 here.
                size_t position_ = size_t(position);
                const bool retVal = ucd.FindNextSentenceBoundary(src, 0, src.Len(), position_);
                // Copy the cursor back into the caller's int.
                position = int(position_);
                return retVal; }
bool TUnicode::FindNextWordBoundary ( const TIntV & src,
int &  position
) const [inline]

Definition at line 1901 of file unicode.h.

                                                                         {
                // A negative position is reset to 0 and reported as a boundary (returns true) without consulting ucd.
                if (position < 0) { position = 0; return true; }
                // Seed the size_t cursor from the caller's position. It was previously left
                // uninitialized, so the caller's starting point was ignored and ucd could read
                // an indeterminate value. position is known to be >= 0 here.
                size_t position_ = size_t(position);
                const bool retVal = ucd.FindNextWordBoundary(src, 0, src.Len(), position_);
                // Copy the cursor back into the caller's int.
                position = int(position_);
                return retVal; }
void TUnicode::FindSentenceBoundaries ( const TIntV & src,
TBoolV & dest
) const [inline]

Definition at line 1922 of file unicode.h.

// Forwards to ucd.FindSentenceBoundaries with 0 and src.Len() (presumably start index and count — confirm in TUniChDb).
{ ucd.FindSentenceBoundaries(src, 0, src.Len(), dest); }
void TUnicode::FindWordBoundaries ( const TIntV & src,
TBoolV & dest
) const [inline]

Definition at line 1907 of file unicode.h.

// Forwards to ucd.FindWordBoundaries with 0 and src.Len() (presumably start index and count — confirm in TUniChDb).
{ ucd.FindWordBoundaries(src, 0, src.Len(), dest); }
void TUnicode::GetAllCodecs ( TCodecBaseV & dest) const [inline]

Definition at line 1887 of file unicode.h.

                                                   {
                // Fill dest with each distinct codec object stored in `codecs`
                // (RegisterCodec stores the same codec under several names, so duplicates are skipped).
                dest.Clr();
                int keyId = codecs.FFirstKeyId();
                while (codecs.FNextKeyId(keyId)) {
                        PCodecBase candidate = codecs[keyId];
                        // Linear scan of what has been collected so far, comparing the values returned by operator().
                        int pos = 0;
                        while (pos < dest.Len() && !(dest[pos]() == candidate())) { pos++; }
                        if (pos == dest.Len()) { dest.Add(candidate); } } }
void TUnicode::GetCaseFolded ( const TIntV & src,
TIntV & dest,
const bool  full = true
) const [inline]

Definition at line 1989 of file unicode.h.

// Forwards to ucd.GetCaseFolded; only `full` comes from the caller, the other two flags are fixed to true and false here.
{ return ucd.GetCaseFolded(src, dest, true, full, false); }
const char* TUnicode::GetCharName ( const int  cp) const [inline]

Definition at line 2024 of file unicode.h.

// Forwards to ucd.GetCharName(cp) and returns its result.
{ return ucd.GetCharName(cp); }
TStr TUnicode::GetCharNameS ( const int  cp) const [inline]

Definition at line 2025 of file unicode.h.

// Forwards to ucd.GetCharNameS(cp) and returns its result.
{ return ucd.GetCharNameS(cp); }
PCodecBase TUnicode::GetCodec ( const TStr & name) const [inline]

Definition at line 1883 of file unicode.h.

                                                    {
                // Look up the codec stored under the normalized form of `name`;
                // when no such key exists the returned handle is explicitly cleared.
                PCodecBase result;
                const bool known = codecs.IsKeyGetDat(NormalizeCodecName(name), result);
                if (!known) { result.Clr(); }
                return result; }
void TUnicode::GetLowerCase ( const TIntV & src,
TIntV & dest
) const [inline]

Definition at line 1965 of file unicode.h.

// Forwards to ucd.GetLowerCase with three flags fixed to (true, false, false); their meanings are defined by TUniChDb and not visible here.
{ ucd.GetLowerCase(src, dest, true, false, false); }
void TUnicode::GetSimpleLowerCase ( const TIntV & src,
TIntV & dest
) const [inline]

Definition at line 1972 of file unicode.h.

// Forwards to ucd.GetSimpleLowerCase with the third argument fixed to true (meaning not visible here).
{ ucd.GetSimpleLowerCase(src, dest, true); }
void TUnicode::GetSimpleTitleCase ( const TIntV & src,
TIntV & dest
) const [inline]

Definition at line 1974 of file unicode.h.

// Forwards to ucd.GetSimpleTitleCase with the third argument fixed to true (meaning not visible here).
{ ucd.GetSimpleTitleCase(src, dest, true); }
void TUnicode::GetSimpleUpperCase ( const TIntV & src,
TIntV & dest
) const [inline]

Definition at line 1973 of file unicode.h.

// Forwards to ucd.GetSimpleUpperCase with the third argument fixed to true (meaning not visible here).
{ ucd.GetSimpleUpperCase(src, dest, true); }
TUniChSubCategory TUnicode::GetSubCat ( const int  cp) const [inline]

Definition at line 2021 of file unicode.h.

// Forwards to ucd.GetSubCat(cp) and returns its result.
{ return ucd.GetSubCat(cp); }
void TUnicode::GetTitleCase ( const TIntV & src,
TIntV & dest
) const [inline]

Definition at line 1967 of file unicode.h.

// Forwards to ucd.GetTitleCase with three flags fixed to (true, false, false); their meanings are defined by TUniChDb and not visible here.
{ ucd.GetTitleCase(src, dest, true, false, false); }
void TUnicode::GetUpperCase ( const TIntV & src,
TIntV & dest
) const [inline]

Definition at line 1966 of file unicode.h.

// Forwards to ucd.GetUpperCase with three flags fixed to (true, false, false); their meanings are defined by TUniChDb and not visible here.
{ ucd.GetUpperCase(src, dest, true, false, false); }
TStr TUnicode::GetUtf8CaseFolded ( const TStr & s) const [inline]

Definition at line 1994 of file unicode.h.

                                                    {
                // Fast path: when every byte of s is below 128, the result is simply s.GetLc().
                const int len = s.Len();
                int firstNonAscii = 0;
                while (firstNonAscii < len && uchar(s[firstNonAscii]) < 128) { firstNonAscii++; }
                if (firstNonAscii == len) { return s.GetLc(); }
                // Otherwise decode to code points, case-fold them, and re-encode as UTF-8.
                TIntV codePoints;
                DecodeUtf8(s, codePoints);
                TIntV folded;
                GetCaseFolded(codePoints, folded);
                return EncodeUtf8Str(folded); }
void TUnicode::Init ( ) [inline]

Definition at line 1779 of file unicode.h.

// Init() does nothing except call InitCodecs().
{ InitCodecs(); }

void TUnicode::InitCodecs ( )

Definition at line 1687 of file unicode.cpp.

{
        // Start from an empty table, then register each codec type under a
        // whitespace-separated list of alias names (one RegisterCodec call per codec).
        ClrCodecs();
        RegisterCodec("ISO-8859-1 ISO_8859-1 ISO_8859-1:1987 ISO-IR-100 CP819 IBM819 LATIN1 L1 csISOLatin1 ISO8859-1 ISO8859_1 CP28591", TCodecBase::New<TCodec_ISO8859_1>());
        RegisterCodec("ISO-8859-2 ISO_8859-2 ISO_8859-2:1987 ISO-IR-101 LATIN2 L2 csISOLatin2 ISO8859-2 ISO8859_2 CP28592", TCodecBase::New<TCodec_ISO8859_2>());
        RegisterCodec("ISO-8859-3 ISO_8859-3 ISO_8859-3:1988 ISO-IR-109 LATIN3 L3 csISOLatin3 ISO8859-3 ISO8859_3 CP28593", TCodecBase::New<TCodec_ISO8859_3>());
        RegisterCodec("ISO-8859-4 ISO_8859-4 ISO_8859-4:1988 ISO-IR-110 LATIN4 L4 csISOLatin4 ISO8859-4 ISO8859_4 CP28594", TCodecBase::New<TCodec_ISO8859_4>());
        RegisterCodec("YUASCII YU-ASCII YU_ASCII", TCodecBase::New<TCodec_YuAscii>());
        RegisterCodec("CP1250 Windows-1250 MS-EE", TCodecBase::New<TCodec_CP1250>());
        RegisterCodec("CP852 cp852_DOSLatin2 DOSLatin2", TCodecBase::New<TCodec_CP852>());
        RegisterCodec("CP437 cp437_DOSLatinUS DOSLatinUS", TCodecBase::New<TCodec_CP437>());
}
static TStr TUnicode::NormalizeCodecName ( const TStr & name) [inline, static, protected]

Definition at line 1870 of file unicode.h.

                                                                {
                // Canonical lookup key: the lower-cased name with every "-" and "_" removed.
                TStr normalized = name.GetLc();
                normalized.ChangeStrAll("-", "");
                normalized.ChangeStrAll("_", "");
                return normalized; }
void TUnicode::RegisterCodec ( const TStr & nameList,
const PCodecBase & codec
) [inline]

Definition at line 1873 of file unicode.h.

                                                                          {
                // Store `codec` under every whitespace-separated name in nameList, keyed by the normalized name.
                TStrV aliases;
                nameList.SplitOnWs(aliases);
                const int aliasCount = aliases.Len();
                for (int idx = 0; idx < aliasCount; ++idx) {
                        const TStr key = NormalizeCodecName(aliases[idx]);
                        codecs.AddDat(key, codec); } }
void TUnicode::ToCaseFolded ( TIntV & src) const [inline]

Definition at line 1992 of file unicode.h.

// Forwards to ucd.ToCaseFolded(src, false); the second argument is fixed to false here (meaning not visible in this excerpt).
{ return ucd.ToCaseFolded(src, false); }
void TUnicode::ToSimpleLowerCase ( TIntV & src) const [inline]

Definition at line 1978 of file unicode.h.

void TUnicode::ToSimpleTitleCase ( TIntV & src) const [inline]

Definition at line 1979 of file unicode.h.

void TUnicode::ToSimpleUpperCase ( TIntV & src) const [inline]

Definition at line 1977 of file unicode.h.

void TUnicode::UnregisterCodec ( const TStr & nameList) [inline]

Definition at line 1877 of file unicode.h.

                                                   {
                // Remove the entry for every whitespace-separated name in nameList, using the normalized name as the key.
                TStrV aliases;
                nameList.SplitOnWs(aliases);
                const int aliasCount = aliases.Len();
                for (int idx = 0; idx < aliasCount; ++idx) {
                        const TStr key = NormalizeCodecName(aliases[idx]);
                        codecs.DelKey(key); } }

void TUnicode::UseEnglishSentenceBoundaryExceptions ( ) [inline]

Definition at line 1925 of file unicode.h.


Member Data Documentation

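TUniCodec TUnicode::codec

Definition at line 1774 of file unicode.h.

THash<TStr, PCodecBase> TUnicode::codecs [protected]

Definition at line 1869 of file unicode.h.

TUniChDb TUnicode::ucd

Definition at line 1775 of file unicode.h.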


The documentation for this class was generated from the following files: