SNAP Library 2.0, User Reference
2013-05-13 16:33:57
SNAP, a general purpose, high performance system for analysis and manipulation of large networks
#include <unicode.h>
Classes
| class | TSubcatHelper |
| class | TUcdFileReader |
Public Types
| enum | { HangulSBase = 0xAC00, HangulLBase = 0x1100, HangulVBase = 0x1161, HangulTBase = 0x11A7, HangulLCount = 19, HangulVCount = 21, HangulTCount = 28, HangulNCount = HangulVCount * HangulTCount, HangulSCount = HangulLCount * HangulNCount } |
| enum | TCaseConversion_ { ccLower = 0, ccUpper = 1, ccTitle = 2, ccMax = 3 } |
| typedef enum TUniChDb::TCaseConversion_ | TCaseConversion |
Public Member Functions
| TUniChDb () | |
| TUniChDb (TSIn &SIn) | |
| void | Clr () |
| void | Save (TSOut &SOut) const |
| void | Load (TSIn &SIn) |
| void | LoadBin (const TStr &fnBin) |
| void | Test (const TStr &basePath) |
| const TStr & | GetScriptName (const int scriptId) const |
| int | GetScriptByName (const TStr &scriptName) const |
| int | GetScript (const TUniChInfo &ci) const |
| int | GetScript (const int cp) const |
| const char * | GetCharName (const int cp) const |
| TStr | GetCharNameS (const int cp) const |
| template<class TSrcVec > | |
| void | PrintCharNames (FILE *f, const TSrcVec &src, size_t srcIdx, const size_t srcCount, const TStr &prefix) const |
| template<class TSrcVec > | |
| void | PrintCharNames (FILE *f, const TSrcVec &src, const TStr &prefix) const |
| bool | IsGetChInfo (const int cp, TUniChInfo &ChInfo) |
| TUniChCategory | GetCat (const int cp) const |
| TUniChSubCategory | GetSubCat (const int cp) const |
| bool | IsWbFlag (const int cp, const TUniChFlags flag) const |
| int | GetWbFlags (const int cp) const |
| bool | IsSbFlag (const int cp, const TUniChFlags flag) const |
| int | GetSbFlags (const int cp) const |
| DECLARE_FORWARDED_PROPERTY_METHODS bool | IsPrivateUse (const int cp) const |
| bool | IsSurrogate (const int cp) const |
| int | GetCombiningClass (const int cp) const |
| template<typename TSrcVec > | |
| bool | FindNextWordBoundary (const TSrcVec &src, const size_t srcIdx, const size_t srcCount, size_t &position) const |
| template<typename TSrcVec > | |
| void | FindWordBoundaries (const TSrcVec &src, const size_t srcIdx, const size_t srcCount, TBoolV &dest) const |
| template<typename TSrcVec > | |
| bool | FindNextSentenceBoundary (const TSrcVec &src, const size_t srcIdx, const size_t srcCount, size_t &position) const |
| template<typename TSrcVec > | |
| void | FindSentenceBoundaries (const TSrcVec &src, const size_t srcIdx, const size_t srcCount, TBoolV &dest) const |
| void | SbEx_Clr () |
| template<class TSrcVec > | |
| void | SbEx_Add (const TSrcVec &v) |
| void | SbEx_Add (const TStr &s) |
| void | SbEx_AddUtf8 (const TStr &s) |
| int | SbEx_AddMulti (const TStr &words, const bool wordsAreUtf8=true) |
| void | SbEx_Set (const TUniTrie< TInt > &newTrie) |
| int | SbEx_SetStdEnglish () |
| template<typename TSrcVec , typename TDestCh > | |
| void | Decompose (const TSrcVec &src, size_t srcIdx, const size_t srcCount, TVec< TDestCh > &dest, bool compatibility, bool clrDest=true) const |
| template<typename TSrcVec , typename TDestCh > | |
| void | Decompose (const TSrcVec &src, TVec< TDestCh > &dest, bool compatibility, bool clrDest=true) const |
| template<typename TSrcVec , typename TDestCh > | |
| void | Compose (const TSrcVec &src, size_t srcIdx, const size_t srcCount, TVec< TDestCh > &dest, bool clrDest=true) const |
| template<typename TSrcVec , typename TDestCh > | |
| void | Compose (const TSrcVec &src, TVec< TDestCh > &dest, bool clrDest=true) const |
| template<typename TSrcVec , typename TDestCh > | |
| void | DecomposeAndCompose (const TSrcVec &src, size_t srcIdx, const size_t srcCount, TVec< TDestCh > &dest, bool compatibility, bool clrDest=true) const |
| template<typename TSrcVec , typename TDestCh > | |
| void | DecomposeAndCompose (const TSrcVec &src, TVec< TDestCh > &dest, bool compatibility, bool clrDest=true) const |
| template<typename TSrcVec , typename TDestCh > | |
| size_t | ExtractStarters (const TSrcVec &src, size_t srcIdx, const size_t srcCount, TVec< TDestCh > &dest, bool clrDest=true) const |
| template<typename TSrcVec , typename TDestCh > | |
| size_t | ExtractStarters (const TSrcVec &src, TVec< TDestCh > &dest, bool clrDest=true) const |
| template<typename TSrcVec > | |
| size_t | ExtractStarters (TSrcVec &src) const |
| void | LoadTxt (const TStr &basePath) |
| void | SaveBin (const TStr &fnBinUcd) |
| template<typename TSrcVec , typename TDestCh > | |
| void | GetCaseConverted (const TSrcVec &src, size_t srcIdx, const size_t srcCount, TVec< TDestCh > &dest, const bool clrDest, const TCaseConversion how, const bool turkic, const bool lithuanian) const |
| template<typename TSrcVec , typename TDestCh > | |
| void | GetLowerCase (const TSrcVec &src, size_t srcIdx, const size_t srcCount, TVec< TDestCh > &dest, const bool clrDest=true, const bool turkic=false, const bool lithuanian=false) const |
| template<typename TSrcVec , typename TDestCh > | |
| void | GetUpperCase (const TSrcVec &src, size_t srcIdx, const size_t srcCount, TVec< TDestCh > &dest, const bool clrDest=true, const bool turkic=false, const bool lithuanian=false) const |
| template<typename TSrcVec , typename TDestCh > | |
| void | GetTitleCase (const TSrcVec &src, size_t srcIdx, const size_t srcCount, TVec< TDestCh > &dest, const bool clrDest=true, const bool turkic=false, const bool lithuanian=false) const |
| template<typename TSrcVec , typename TDestCh > | |
| void | GetLowerCase (const TSrcVec &src, TVec< TDestCh > &dest, const bool clrDest=true, const bool turkic=false, const bool lithuanian=false) const |
| template<typename TSrcVec , typename TDestCh > | |
| void | GetUpperCase (const TSrcVec &src, TVec< TDestCh > &dest, const bool clrDest=true, const bool turkic=false, const bool lithuanian=false) const |
| template<typename TSrcVec , typename TDestCh > | |
| void | GetTitleCase (const TSrcVec &src, TVec< TDestCh > &dest, const bool clrDest=true, const bool turkic=false, const bool lithuanian=false) const |
| template<typename TSrcVec , typename TDestCh > | |
| void | GetSimpleCaseConverted (const TSrcVec &src, size_t srcIdx, const size_t srcCount, TVec< TDestCh > &dest, const bool clrDest, const TCaseConversion how) const |
| template<typename TSrcVec , typename TDestCh > | |
| void | GetSimpleLowerCase (const TSrcVec &src, size_t srcIdx, const size_t srcCount, TVec< TDestCh > &dest, const bool clrDest=true) const |
| template<typename TSrcVec , typename TDestCh > | |
| void | GetSimpleUpperCase (const TSrcVec &src, size_t srcIdx, const size_t srcCount, TVec< TDestCh > &dest, const bool clrDest=true) const |
| template<typename TSrcVec , typename TDestCh > | |
| void | GetSimpleTitleCase (const TSrcVec &src, size_t srcIdx, const size_t srcCount, TVec< TDestCh > &dest, const bool clrDest=true) const |
| template<typename TSrcVec , typename TDestCh > | |
| void | GetSimpleLowerCase (const TSrcVec &src, TVec< TDestCh > &dest, const bool clrDest=true) const |
| template<typename TSrcVec , typename TDestCh > | |
| void | GetSimpleUpperCase (const TSrcVec &src, TVec< TDestCh > &dest, const bool clrDest=true) const |
| template<typename TSrcVec , typename TDestCh > | |
| void | GetSimpleTitleCase (const TSrcVec &src, TVec< TDestCh > &dest, const bool clrDest=true) const |
| template<typename TSrcVec > | |
| void | ToSimpleCaseConverted (TSrcVec &src, size_t srcIdx, const size_t srcCount, const TCaseConversion how) const |
| template<typename TSrcVec > | |
| void | ToSimpleUpperCase (TSrcVec &src, size_t srcIdx, const size_t srcCount) const |
| template<typename TSrcVec > | |
| void | ToSimpleLowerCase (TSrcVec &src, size_t srcIdx, const size_t srcCount) const |
| template<typename TSrcVec > | |
| void | ToSimpleTitleCase (TSrcVec &src, size_t srcIdx, const size_t srcCount) const |
| template<typename TSrcVec > | |
| void | ToSimpleUpperCase (TSrcVec &src) const |
| template<typename TSrcVec > | |
| void | ToSimpleLowerCase (TSrcVec &src) const |
| template<typename TSrcVec > | |
| void | ToSimpleTitleCase (TSrcVec &src) const |
| template<typename TSrcVec , typename TDestCh > | |
| void | GetCaseFolded (const TSrcVec &src, size_t srcIdx, const size_t srcCount, TVec< TDestCh > &dest, const bool clrDest, const bool full, const bool turkic=false) const |
| template<typename TSrcVec , typename TDestCh > | |
| void | GetCaseFolded (const TSrcVec &src, TVec< TDestCh > &dest, const bool clrDest=true, const bool full=true, const bool turkic=false) const |
| template<typename TSrcVec > | |
| void | ToCaseFolded (TSrcVec &src, size_t srcIdx, const size_t srcCount, const bool turkic=false) const |
| template<typename TSrcVec > | |
| void | ToCaseFolded (TSrcVec &src, const bool turkic=false) const |
Static Public Member Functions
| static TStr | GetCaseFoldingFn () |
| static TStr | GetSpecialCasingFn () |
| static TStr | GetUnicodeDataFn () |
| static TStr | GetCompositionExclusionsFn () |
| static TStr | GetScriptsFn () |
| static TStr | GetDerivedCorePropsFn () |
| static TStr | GetLineBreakFn () |
| static TStr | GetPropListFn () |
| static TStr | GetAuxiliaryDir () |
| static TStr | GetWordBreakTestFn () |
| static TStr | GetWordBreakPropertyFn () |
| static TStr | GetSentenceBreakTestFn () |
| static TStr | GetSentenceBreakPropertyFn () |
| static TStr | GetNormalizationTestFn () |
| static TStr | GetBinFn () |
| static TStr | GetScriptNameUnknown () |
| static TStr | GetScriptNameKatakana () |
| static TStr | GetScriptNameHiragana () |
Public Attributes
| THash< TInt, TUniChInfo > | h |
| TStrPool | charNames |
| TStrIntH | scripts |
| TIntV | decompositions |
| THash< TIntPr, TInt > | inverseDec |
| TUniCaseFolding | caseFolding |
| TIntIntVH | specialCasingLower |
| TIntIntVH | specialCasingUpper |
| TIntIntVH | specialCasingTitle |
| int | scriptUnknown |
Protected Types
| typedef TUniVecIdx | TVecIdx |
Protected Member Functions
| void | InitAfterLoad () |
| bool | IsWbIgnored (const int cp) const |
| template<typename TSrcVec > | |
| void | WbFindCurOrNextNonIgnored (const TSrcVec &src, size_t &position, const size_t srcEnd) const |
| template<typename TSrcVec > | |
| void | WbFindNextNonIgnored (const TSrcVec &src, size_t &position, const size_t srcEnd) const |
| template<typename TSrcVec > | |
| void | WbFindNextNonIgnoredS (const TSrcVec &src, size_t &position, const size_t srcEnd) const |
| template<typename TSrcVec > | |
| bool | WbFindPrevNonIgnored (const TSrcVec &src, const size_t srcStart, size_t &position) const |
| void | TestWbFindNonIgnored (const TIntV &src) const |
| void | TestWbFindNonIgnored () const |
| void | TestFindNextWordOrSentenceBoundary (const TStr &basePath, bool sentence) |
| template<typename TSrcVec > | |
| bool | CanSentenceEndHere (const TSrcVec &src, const size_t srcIdx, const size_t position) const |
| template<typename TDestCh > | |
| void | AddDecomposition (const int codePoint, TVec< TDestCh > &dest, const bool compatibility) const |
| void | TestComposition (const TStr &basePath) |
| void | InitWordAndSentenceBoundaryFlags (const TStr &basePath) |
| void | InitScripts (const TStr &basePath) |
| void | InitLineBreaks (const TStr &basePath) |
| void | InitDerivedCoreProperties (const TStr &basePath) |
| void | InitPropList (const TStr &basePath) |
| void | InitSpecialCasing (const TStr &basePath) |
| void | LoadTxt_ProcessDecomposition (TUniChInfo &ci, TStr s) |
| void | TestCaseConversion (const TStr &source, const TStr &trueLc, const TStr &trueTc, const TStr &trueUc, bool turkic, bool lithuanian) |
| void | TestCaseConversions () |
Static Protected Member Functions
| static bool | IsWbIgnored (const TUniChInfo &ci) |
Protected Attributes
| TUniTrie< TInt > | sbExTrie |
Friends
| class | TUniCaseFolding |
typedef enum TUniChDb::TCaseConversion_ TUniChDb::TCaseConversion
typedef TUniVecIdx TUniChDb::TVecIdx [protected] |
anonymous enum
Enumerators: HangulSBase, HangulLBase, HangulVBase, HangulTBase, HangulLCount, HangulVCount, HangulTCount, HangulNCount, HangulSCount
Definition at line 1405 of file unicode.h.
{
HangulSBase = 0xAC00, HangulLBase = 0x1100, HangulVBase = 0x1161, HangulTBase = 0x11A7,
HangulLCount = 19, HangulVCount = 21, HangulTCount = 28,
HangulNCount = HangulVCount * HangulTCount, // 588
HangulSCount = HangulLCount * HangulNCount // 11172
};
| TUniChDb::TUniChDb | ( | ) | [inline] |
Definition at line 1274 of file unicode.h.
: scriptUnknown(-1) { }
| TUniChDb::TUniChDb | ( | TSIn & | SIn | ) | [inline, explicit] |
| void TUniChDb::AddDecomposition | ( | const int | codePoint, |
| TVec< TDestCh > & | dest, | ||
| const bool | compatibility | ||
| ) | const [protected] |
Definition at line 3097 of file unicode.h.
{
if (HangulSBase <= codePoint && codePoint < HangulSBase + HangulSCount)
{
// UAX #15, sec. 16: Hangul decomposition
const int SIndex = codePoint - HangulSBase;
const int L = HangulLBase + SIndex / HangulNCount;
const int V = HangulVBase + (SIndex % HangulNCount) / HangulTCount;
const int T = HangulTBase + (SIndex % HangulTCount);
dest.Add(L); dest.Add(V);
if (T != HangulTBase) dest.Add(T);
return;
}
int i = h.GetKeyId(codePoint); if (i < 0) { dest.Add(codePoint); return; }
const TUniChInfo &ci = h[i];
int ofs = ci.decompOffset; if (ofs < 0) { dest.Add(codePoint); return; }
if ((! compatibility) && ci.IsCompatibilityDecomposition()) { dest.Add(codePoint); return; }
while (true) {
int cp = decompositions[ofs++]; if (cp < 0) return;
AddDecomposition(cp, dest, compatibility); }
}
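The Hangul branch above is pure arithmetic over the constants from the anonymous enum; no table lookups are involved. A minimal standalone sketch (the constants are restated locally, so it does not depend on SNAP at all) that decomposes one precomposed syllable the same way:
#include <cstdio>
int main() {
    // Same constants as in the anonymous enum documented above.
    const int SBase = 0xAC00, LBase = 0x1100, VBase = 0x1161, TBase = 0x11A7;
    const int VCount = 21, TCount = 28, NCount = VCount * TCount; // 588
    const int cp = 0xAC01;                             // HANGUL SYLLABLE GAG
    const int SIndex = cp - SBase;
    const int L = LBase + SIndex / NCount;             // U+1100, choseong kiyeok
    const int V = VBase + (SIndex % NCount) / TCount;  // U+1161, jungseong a
    const int T = TBase + (SIndex % TCount);           // U+11A8, jongseong kiyeok
    printf("U+%04X -> U+%04X U+%04X", cp, L, V);
    if (T != TBase) printf(" U+%04X", T);              // trailing consonant present
    printf("\n");
    return 0; }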
| bool TUniChDb::CanSentenceEndHere | ( | const TSrcVec & | src, |
| const size_t | srcIdx, | ||
| const size_t | position | ||
| ) | const [protected] |
Definition at line 2582 of file unicode.h.
{
if (sbExTrie.Empty()) return true;
// We'll move back from the position where a sentence-boundary is being considered.
size_t pos = position;
if (! WbFindPrevNonIgnored(src, srcIdx, pos)) return true;
int c = (int) src[TVecIdx(pos)]; int sfb = GetSbFlags(c);
// - Skip the Sep, if there is one.
if ((c & ucfSbSep) == ucfSbSep) {
if (! WbFindPrevNonIgnored(src, srcIdx, pos)) return true;
c = (int) src[TVecIdx(pos)]; sfb = GetSbFlags(c); }
// - Skip any Sp characters.
while ((sfb & ucfSbSp) == ucfSbSp) {
if (! WbFindPrevNonIgnored(src, srcIdx, pos)) return true;
c = (int) src[TVecIdx(pos)]; sfb = GetSbFlags(c); }
// - Skip any Close characters.
while ((sfb & ucfSbClose) == ucfSbClose) {
if (! WbFindPrevNonIgnored(src, srcIdx, pos)) return true;
c = (int) src[TVecIdx(pos)]; sfb = GetSbFlags(c); }
// - Skip any ATerm | STerm characters.
while ((sfb & (ucfSbATerm | ucfSbSTerm)) != 0) {
if (! WbFindPrevNonIgnored(src, srcIdx, pos)) return true;
c = (int) src[TVecIdx(pos)]; sfb = GetSbFlags(c); }
// Now start moving through the trie.
int cLast = c, cButLast = -1, cButButLast = -1, len = 1, node = -1;
while (true)
{
bool atEnd = (! WbFindPrevNonIgnored(src, srcIdx, pos));
c = (atEnd ? -1 : (int) src[TVecIdx(pos)]);
TUniChCategory cat = GetCat(c);
if (atEnd || ! (cat == ucLetter || cat == ucNumber || cat == ucSymbol)) {
// Check if the suffix we've read so far is one of those that appear in the trie.
if (len == 1) return ! sbExTrie.Has1Gram(cLast);
if (len == 2) return ! sbExTrie.Has2Gram(cLast, cButLast);
IAssert(len >= 3); IAssert(node >= 0);
if (sbExTrie.IsNodeTerminal(node)) return false;
if (atEnd) return true; }
if (len == 1) { cButLast = c; len++; }
else if (len == 2) { cButButLast = c; len++;
// Now we have read the last three characters; start descending the suitable subtrie.
node = sbExTrie.Get3GramRoot(cLast, cButLast, cButButLast);
if (node < 0) return true; }
else {
// Descend down the trie.
node = sbExTrie.GetChild(node, c);
if (node < 0) return true; }
}
//return true;
}
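A usage sketch (not taken from the library documentation) of the sentence-boundary exception trie that CanSentenceEndHere consults: load the database from a precompiled binary (the file name below is hypothetical), install the standard English exception list, and scan a vector of UTF-32 code points. Assumes unicode.h and the rest of glib are available.
TUniChDb db;
db.LoadBin("UnicodeDefs.bin");    // hypothetical path to the precompiled database
db.SbEx_SetStdEnglish();          // presumably a list of common English abbreviations ("Mr.", "Dr.", ...)
TIntV src;                        // UTF-32 code points of the text to segment
// ... fill 'src' ...
size_t pos = 0;
while (db.FindNextSentenceBoundary(src, 0, src.Len(), pos))
    printf("sentence boundary at position %d\n", (int) pos);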
| void TUniChDb::Clr | ( | ) | [inline] |
Definition at line 1276 of file unicode.h.
{
h.Clr(); charNames.Clr(); decompositions.Clr(); inverseDec.Clr(); caseFolding.Clr();
specialCasingLower.Clr(); specialCasingUpper.Clr(); specialCasingTitle.Clr();
scripts.Clr(); }
| void TUniChDb::Compose | ( | const TSrcVec & | src, |
| size_t | srcIdx, | ||
| const size_t | srcCount, | ||
| TVec< TDestCh > & | dest, | ||
| bool | clrDest = true |
||
| ) | const |
Definition at line 3152 of file unicode.h.
{
if (clrDest) dest.Clr();
bool lastStarterKnown = false; // has a starter been encountered yet?
size_t lastStarterPos = size_t(-1); // the index (in 'dest') of the last starter
int cpLastStarter = -1; // the codepoint of the last starter (i.e. cpLastStarter == dest[lastStarterPos])
const size_t srcEnd = srcIdx + srcCount;
int ccMax = -1; // The highest combining class among the characters since the last starter.
while (srcIdx < srcEnd)
{
const int cp = src[TVecIdx(srcIdx)]; srcIdx++;
const int cpClass = GetCombiningClass(cp);
//int cpCombined = -1;
// If there is a starter with which 'cp' can be combined, and from which it is not blocked
// by some intermediate character, we can try to combine them.
if (lastStarterKnown && ccMax < cpClass)
{
int j = inverseDec.GetKeyId(TIntPr(cpLastStarter, cp));
int cpCombined = -1;
do {
// Try to look up a composition in the inverseDec table.
if (j >= 0) { cpCombined = inverseDec[j]; break; }
// UAX #15, sec. 16: Hangul composition
// - Try to combine L and V.
const int LIndex = cpLastStarter - HangulLBase;
if (0 <= LIndex && LIndex < HangulLCount) {
const int VIndex = cp - HangulVBase;
if (0 <= VIndex && VIndex < HangulVCount) {
cpCombined = HangulSBase + (LIndex * HangulVCount + VIndex) * HangulTCount;
break; } }
// - Try to combine LV and T.
const int SIndex = cpLastStarter - HangulSBase;
if (0 <= SIndex && SIndex < HangulSCount && (SIndex % HangulTCount) == 0)
{
const int TIndex = cp - HangulTBase;
if (0 <= TIndex && TIndex < HangulTCount) {
cpCombined = cpLastStarter + TIndex;
break; }
}
} while (false);
// If a composed character has been found, use it to replace the old starter (cpLastStarter) in 'dest'.
if (cpCombined >= 0) {
dest[TVecIdx(lastStarterPos)] = cpCombined;
Assert(GetCombiningClass(cpCombined) == TUniChInfo::ccStarter);
// if (cpCombined is not a starter) { lastStarterKnown = false; lastStarterPos = size_t(-1); cpLastStarter = -1; } else
cpLastStarter = cpCombined; continue; }
}
if (cpClass == TUniChInfo::ccStarter) { // 'cp' is a starter, remember it for later. Set ccMax to -1 so that this starter can be combined with another starter.
lastStarterKnown = true; lastStarterPos = dest.Len(); cpLastStarter = cp; ccMax = cpClass - 1; }
else if (cpClass > ccMax) // Remember cp's class as the new maximum class since the last starter (for blocking).
ccMax = cpClass;
dest.Add(cp);
}
}
| void TUniChDb::Compose | ( | const TSrcVec & | src, |
| TVec< TDestCh > & | dest, | ||
| bool | clrDest = true |
||
| ) | const [inline] |
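A brief, hedged usage sketch of this inline overload: canonical composition of an already-decomposed (and canonically ordered) pair into a single precomposed character. The database path is hypothetical, as in the other sketches on this page.
TUniChDb db; db.LoadBin("UnicodeDefs.bin");       // hypothetical path
TIntV decomposed, composed;
decomposed.Add(0x0065); decomposed.Add(0x0301);   // 'e' + COMBINING ACUTE ACCENT
db.Compose(decomposed, composed);                 // expected result: the single code point U+00E9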
| void TUniChDb::Decompose | ( | const TSrcVec & | src, |
| size_t | srcIdx, | ||
| const size_t | srcCount, | ||
| TVec< TDestCh > & | dest, | ||
| bool | compatibility, | ||
| bool | clrDest = true |
||
| ) | const |
Definition at line 3120 of file unicode.h.
{
if (clrDest) dest.Clr();
const size_t destStart = dest.Len(), srcEnd = srcIdx + srcCount;
// Decompose the string.
while (srcIdx < srcEnd) {
AddDecomposition(src[TVecIdx(srcIdx)], dest, compatibility); srcIdx++; }
// Rearrange the decomposed string into canonical order.
for (size_t destIdx = destStart, destEnd = dest.Len(); destIdx < destEnd; )
{
size_t j = destIdx;
int cp = dest[TVecIdx(destIdx)]; destIdx++;
int cpCls = GetCombiningClass(cp);
if (cpCls == TUniChInfo::ccStarter) continue;
while (destStart < j && GetCombiningClass(dest[TVecIdx(j - 1)]) > cpCls) {
dest[TVecIdx(j)] = dest[TVecIdx(j - 1)]; j--; }
dest[TVecIdx(j)] = cp;
}
}
| void TUniChDb::Decompose | ( | const TSrcVec & | src, |
| TVec< TDestCh > & | dest, | ||
| bool | compatibility, | ||
| bool | clrDest = true |
||
| ) | const [inline] |
| void TUniChDb::DecomposeAndCompose | ( | const TSrcVec & | src, |
| size_t | srcIdx, | ||
| const size_t | srcCount, | ||
| TVec< TDestCh > & | dest, | ||
| bool | compatibility, | ||
| bool | clrDest = true |
||
| ) | const |
| void TUniChDb::DecomposeAndCompose | ( | const TSrcVec & | src, |
| TVec< TDestCh > & | dest, | ||
| bool | compatibility, | ||
| bool | clrDest = true |
||
| ) | const [inline] |
Definition at line 1542 of file unicode.h.
{
DecomposeAndCompose(src, 0, src.Len(), dest, compatibility, clrDest); }
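Between them, Decompose and DecomposeAndCompose cover the four Unicode normalization forms, with the compatibility flag selecting the K variants; correct results also depend on the composition exclusions loaded into the database. A hedged sketch with a hypothetical database path:
// NFD  : Decompose(..., compatibility = false)
// NFKD : Decompose(..., compatibility = true)
// NFC  : DecomposeAndCompose(..., compatibility = false)
// NFKC : DecomposeAndCompose(..., compatibility = true)
TUniChDb db; db.LoadBin("UnicodeDefs.bin");   // hypothetical path
TIntV src, nfd, nfc;
src.Add(0x00E9);                              // U+00E9 LATIN SMALL LETTER E WITH ACUTE
db.Decompose(src, nfd, false);                // expected: U+0065 U+0301
db.DecomposeAndCompose(src, nfc, false);      // expected: U+00E9 again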
| size_t TUniChDb::ExtractStarters | ( | const TSrcVec & | src, |
| size_t | srcIdx, | ||
| const size_t | srcCount, | ||
| TVec< TDestCh > & | dest, | ||
| bool | clrDest = true |
||
| ) | const |
Definition at line 3209 of file unicode.h.
{
if (clrDest) dest.Clr();
size_t retVal = 0;
for (const size_t srcEnd = srcIdx + srcCount; srcIdx < srcEnd; srcIdx++) {
const int cp = src[TVecIdx(srcIdx)];
if (GetCombiningClass(cp) == TUniChInfo::ccStarter)
{ dest.Add(cp); retVal++; } }
return retVal;
}
| size_t TUniChDb::ExtractStarters | ( | const TSrcVec & | src, |
| TVec< TDestCh > & | dest, | ||
| bool | clrDest = true |
||
| ) | const [inline] |
Definition at line 1551 of file unicode.h.
{
return ExtractStarters(src, 0, src.Len(), dest, clrDest); }
| size_t TUniChDb::ExtractStarters | ( | TSrcVec & | src | ) | const [inline] |
Definition at line 1555 of file unicode.h.
{
TIntV temp; size_t retVal = ExtractStarters(src, temp);
src.Clr(); for (int i = 0; i < temp.Len(); i++) src.Add(temp[i]);
return retVal; }
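One hedged example of what ExtractStarters is good for: crude diacritic stripping, by decomposing first and then keeping only the starters (combining class 0). Hypothetical database path, as above.
TUniChDb db; db.LoadBin("UnicodeDefs.bin");              // hypothetical path
TIntV src, decomposed, starters;
src.Add(0x00E9); src.Add(0x006E);                        // "én"
db.Decompose(src, decomposed, false);                    // U+0065 U+0301 U+006E
size_t kept = db.ExtractStarters(decomposed, starters);  // starters: U+0065 U+006E
printf("%d of %d code points kept\n", (int) kept, decomposed.Len());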
| bool TUniChDb::FindNextSentenceBoundary | ( | const TSrcVec & | src, |
| const size_t | srcIdx, | ||
| const size_t | srcCount, | ||
| size_t & | position | ||
| ) | const |
Definition at line 2633 of file unicode.h.
{
// SB1. Break at the start of text.
if (position < srcIdx) { position = srcIdx; return true; }
// If we are beyond the end of the text, there aren't any sentence breaks left.
const size_t srcEnd = srcIdx + srcCount;
if (position >= srcEnd) return false;
// If 'position' is currently at an ignored character, move it back to the last nonignored character.
size_t origPos = position;
if (IsWbIgnored(src[TVecIdx(position)])) {
if (! WbFindPrevNonIgnored(src, srcIdx, position))
position = origPos;
}
// Determine the previous nonignored character (before 'position').
size_t posPrev = position;
if (! WbFindPrevNonIgnored(src, srcIdx, posPrev)) posPrev = position;
// Sec 6.2. Allow a break between Sep and an ignored character.
if (position == origPos && position + 1 < srcEnd && IsSbSep(src[TVecIdx(position)]) && IsWbIgnored(src[TVecIdx(position + 1)])) { position += 1; return true; }
// Determine the next nonignored character (after 'position').
size_t posNext = position; WbFindNextNonIgnored(src, posNext, srcEnd);
size_t posNext2;
int cPrev = (posPrev < position ? (int) src[TVecIdx(posPrev)] : -1), cCur = (position < srcEnd ? (int) src[TVecIdx(position)] : -1);
int cNext = (position < posNext && posNext < srcEnd ? (int) src[TVecIdx(posNext)] : -1);
int sbfPrev = GetSbFlags(cPrev), sbfCur = GetSbFlags(cCur), sbfNext = GetSbFlags(cNext);
int cNext2, sbfNext2;
// Initialize the state of the peek-back automaton.
typedef enum { stInit, stATerm, stATermSp, stATermSep, stSTerm, stSTermSp, stSTermSep } TPeekBackState;
TPeekBackState backState;
{
size_t pos = position;
bool wasSep = false, wasSp = false, wasATerm = false, wasSTerm = false;
while (true)
{
if (! WbFindPrevNonIgnored(src, srcIdx, pos)) break;
// Skip at most one Sep.
int cp = (int) src[TVecIdx(pos)]; int sbf = GetSbFlags(cp);
if ((sbf & ucfSbSep) == ucfSbSep) {
wasSep = true;
if (! WbFindPrevNonIgnored(src, srcIdx, pos)) break;
cp = (int) src[TVecIdx(pos)]; sbf = GetSbFlags(cp); }
// Skip zero or more Sp's.
bool stop = false;
while ((sbf & ucfSbSp) == ucfSbSp) {
wasSp = true;
if (! WbFindPrevNonIgnored(src, srcIdx, pos)) { stop = true; break; }
cp = (int) src[TVecIdx(pos)]; sbf = GetSbFlags(cp); }
if (stop) break;
// Skip zero or more Close's.
while ((sbf & ucfSbClose) == ucfSbClose) {
if (! WbFindPrevNonIgnored(src, srcIdx, pos)) { stop = true; break; }
cp = (int) src[TVecIdx(pos)]; sbf = GetSbFlags(cp); }
if (stop) break;
// Process an ATerm or STerm.
wasATerm = ((sbf & ucfSbATerm) == ucfSbATerm);
wasSTerm = ((sbf & ucfSbSTerm) == ucfSbSTerm);
break;
}
if (wasATerm) backState = (wasSep ? stATermSep : wasSp ? stATermSp : stATerm);
else if (wasSTerm) backState = (wasSep ? stSTermSep : wasSp ? stSTermSp : stSTerm);
else backState = stInit;
}
// Initialize the state of the peek-ahead automaton. This state tells us what follows
// after we skip all contiguous characters from the complement of the set {OLetter, Upper, Lower, Sep, STerm, ATerm}.
// Thus, the next character is either OLetter, Upper, Lower, Sep, STerm, ATerm, or the end of the input string.
// Our peek-ahead automaton must tell us whether it is Lower or something else.
typedef enum { stUnknown, stLower, stNotLower } TPeekAheadState;
TPeekAheadState aheadState = stUnknown;
//
for ( ; position < srcEnd; posPrev = position, position = posNext, posNext = posNext2,
cPrev = cCur, cCur = cNext, cNext = cNext2,
sbfPrev = sbfCur, sbfCur = sbfNext, sbfNext = sbfNext2)
{
// Should there be a sentence boundary between 'position' and 'posNext' (or, more accurately,
// between src[posNext - 1] and src[posNext] --- any ignored characters between 'position'
// and 'posNext' are considered to belong to the previous character ('position'), not to the next one)?
posNext2 = posNext; WbFindNextNonIgnored(src, posNext2, srcEnd);
cNext2 = (posNext < posNext2 && posNext2 < srcEnd ? (int) src[TVecIdx(posNext2)] : -1);
sbfNext2 = GetSbFlags(cNext2);
// Update the peek-back automaton.
#define TestCur(curFlag) ((sbfCur & ucfSb##curFlag) == ucfSb##curFlag)
#define Trans(curFlag, newState) if (TestCur(curFlag)) { backState = st##newState; break; }
switch (backState) {
case stInit: Trans(ATerm, ATerm); Trans(STerm, STerm); break;
case stATerm: Trans(Sp, ATermSp); Trans(Sep, ATermSep); Trans(ATerm, ATerm); Trans(STerm, STerm); Trans(Close, ATerm); backState = stInit; break;
case stSTerm: Trans(Sp, STermSp); Trans(Sep, STermSep); Trans(ATerm, ATerm); Trans(STerm, STerm); Trans(Close, STerm); backState = stInit; break;
case stATermSp: Trans(Sp, ATermSp); Trans(Sep, ATermSep); Trans(ATerm, ATerm); Trans(STerm, STerm); backState = stInit; break;
case stSTermSp: Trans(Sp, STermSp); Trans(Sep, STermSep); Trans(ATerm, ATerm); Trans(STerm, STerm); backState = stInit; break;
case stATermSep: Trans(ATerm, ATerm); Trans(STerm, STerm); backState = stInit; break;
case stSTermSep: Trans(ATerm, ATerm); Trans(STerm, STerm); backState = stInit; break;
default: IAssert(false); }
#undef Trans
#undef TestCur
// Update the peek-ahead automaton.
#define IsPeekAheadSkippable(sbf) ((sbf & (ucfSbOLetter | ucfSbUpper | ucfSbLower | ucfSbSep | ucfSbSTerm | ucfSbATerm)) == 0)
if (! IsPeekAheadSkippable(sbfCur)) {
bool isLower = ((sbfCur & ucfSbLower) == ucfSbLower);
if (aheadState == stLower) IAssert(isLower);
else if (aheadState == stNotLower) IAssert(! isLower);
// We haven't peeked ahead farther than this so far -- invalidate the state.
aheadState = stUnknown; }
if (aheadState == stUnknown)
{
// Peek ahead to the next non-peekahead-skippable character.
size_t pos = posNext;
while (pos < srcEnd) {
int cp = (int) src[TVecIdx(pos)]; int sbf = GetSbFlags(cp);
if (! IsPeekAheadSkippable(sbf)) {
if ((sbf & ucfSbLower) == ucfSbLower) aheadState = stLower;
else aheadState = stNotLower;
break; }
WbFindNextNonIgnored(src, pos, srcEnd); }
if (! (pos < srcEnd)) aheadState = stNotLower;
}
#undef IsPeekAheadSkippable
//
#define TestCurNext(curFlag, nextFlag) if ((sbfCur & curFlag) == curFlag && (sbfNext & nextFlag) == nextFlag) continue
#define TestCurNext2(curFlag, nextFlag, next2Flag) if ((sbfCur & curFlag) == curFlag && (sbfNext & nextFlag) == nextFlag && (sbfNext2 & next2Flag) == next2Flag) continue
#define TestPrevCurNext(prevFlag, curFlag, nextFlag) if ((sbfPrev & prevFlag) == prevFlag && (sbfCur & curFlag) == curFlag && (sbfNext & nextFlag) == nextFlag) continue
// SB3. Do not break within CRLF.
if (cCur == 13 && cNext == 10) continue;
// SB4. Break after paragraph separators.
if ((sbfCur & ucfSbSep) == ucfSbSep) {
if (! CanSentenceEndHere(src, srcIdx, position)) continue;
position = posNext; return true; }
// Do not break after ambiguous terminators like period, if they are immediately followed by a number
// or lowercase letter, if they are between uppercase letters, or if the first following letter
// (optionally after certain punctuation) is lowercase. For example, a period may be an abbreviation
// or numeric period, and thus may not mark the end of a sentence.
TestCurNext(ucfSbATerm, ucfSbNumeric); // SB6
TestPrevCurNext(ucfSbUpper, ucfSbATerm, ucfSbUpper); // SB7
// SB8a. (STerm | ATerm) Close* Sp* [do not break] (STerm | ATerm)
if ((backState == stATerm || backState == stATermSp || backState == stSTerm || backState == stSTermSp) &&
(sbfNext & (ucfSbSTerm | ucfSbATerm)) != 0) continue;
// SB8*. ATerm Close* Sp* [do not break] ( ! (OLetter | Upper | Lower | Sep | STerm | ATerm) )* Lower
if ((backState == stATerm || backState == stATermSp) && aheadState == stLower) continue;
// Break after sentence terminators, but include closing punctuation, trailing spaces, and a paragraph separator (if present).
// SB9. ( STerm | ATerm ) Close* [do not break] ( Close | Sp | Sep )
if ((backState == stATerm || backState == stSTerm) && (sbfNext & (ucfSbClose | ucfSbSp | ucfSbSep)) != 0) continue;
// SB10. ( STerm | ATerm ) Close* Sp* [do not break] ( Sp | Sep )
// SB11*. ( STerm | ATerm ) Close* Sp* Sep? [do break]
if (backState == stATerm || backState == stATermSp || backState == stATermSep || backState == stSTerm || backState == stSTermSp || backState == stSTermSep) {
if ((sbfNext & (ucfSbSp | ucfSbSep)) != 0) continue; // SB10
if (! CanSentenceEndHere(src, srcIdx, position)) continue;
position = posNext; return true; } // SB11
// SB12. Otherwise, do not break.
continue;
#undef TestCurNext
#undef TestCurNext2
#undef TestPrevCurNext
}
// SB2. Break at the end of text.
IAssert(position == srcEnd);
return true;
}
| bool TUniChDb::FindNextWordBoundary | ( | const TSrcVec & | src, |
| const size_t | srcIdx, | ||
| const size_t | srcCount, | ||
| size_t & | position | ||
| ) | const |
Definition at line 2483 of file unicode.h.
{
// WB1. Break at the start of text.
if (position < srcIdx) { position = srcIdx; return true; }
// If we are beyond the end of the text, there aren't any word breaks left.
const size_t srcEnd = srcIdx + srcCount;
if (position >= srcEnd) return false;
// If 'position' is currently at an ignored character, move it back to the last nonignored character.
size_t origPos = position;
if (IsWbIgnored(src[TVecIdx(position)])) {
if (! WbFindPrevNonIgnored(src, srcIdx, position))
position = origPos;
}
// Determine the previous nonignored character (before 'position').
size_t posPrev = position;
if (! WbFindPrevNonIgnored(src, srcIdx, posPrev)) posPrev = position;
// Sec 6.2. Allow a break between Sep and an ignored character.
if (position == origPos && position + 1 < srcEnd && IsSbSep(src[TVecIdx(position)]) && IsWbIgnored(src[TVecIdx(position + 1)])) { position += 1; return true; }
// Determine the next nonignored character (after 'position').
size_t posNext = position; WbFindNextNonIgnored(src, posNext, srcEnd);
size_t posNext2;
int cPrev = (posPrev < position ? (int) src[TVecIdx(posPrev)] : -1), cCur = (position < srcEnd ? (int) src[TVecIdx(position)] : -1);
int cNext = (position < posNext && posNext < srcEnd ? (int) src[TVecIdx(posNext)] : -1);
int wbfPrev = GetWbFlags(cPrev), wbfCur = GetWbFlags(cCur), wbfNext = GetWbFlags(cNext);
int cNext2, wbfNext2;
//
for ( ; position < srcEnd; posPrev = position, position = posNext, posNext = posNext2,
cPrev = cCur, cCur = cNext, cNext = cNext2,
wbfPrev = wbfCur, wbfCur = wbfNext, wbfNext = wbfNext2)
{
// Should there be a word boundary between 'position' and 'posNext' (or, more accurately,
// between src[posNext - 1] and src[posNext] --- any ignored characters between 'position'
// and 'posNext' are considered to belong to the previous character ('position'), not to the next one)?
posNext2 = posNext; WbFindNextNonIgnored(src, posNext2, srcEnd);
cNext2 = (posNext < posNext2 && posNext2 < srcEnd ? (int) src[TVecIdx(posNext2)] : -1);
wbfNext2 = GetWbFlags(cNext2);
#define TestCurNext(curFlag, nextFlag) if ((wbfCur & curFlag) == curFlag && (wbfNext & nextFlag) == nextFlag) continue
#define TestCurNext2(curFlag, nextFlag, next2Flag) if ((wbfCur & curFlag) == curFlag && (wbfNext & nextFlag) == nextFlag && (wbfNext2 & next2Flag) == next2Flag) continue
#define TestPrevCurNext(prevFlag, curFlag, nextFlag) if ((wbfPrev & prevFlag) == prevFlag && (wbfCur & curFlag) == curFlag && (wbfNext & nextFlag) == nextFlag) continue
// WB3. Do not break within CRLF.
if (cCur == 13 && cNext == 10) continue;
// WB5. Do not break between most letters.
TestCurNext(ucfWbALetter, ucfWbALetter);
// WB6. Do not break letters across certain punctuation.
TestCurNext2(ucfWbALetter, ucfWbMidLetter, ucfWbALetter);
// WB7. Do not break letters across certain punctuation.
TestPrevCurNext(ucfWbALetter, ucfWbMidLetter, ucfWbALetter);
// WB8. Do not break within sequences of digits, or digits adjacent to letters.
TestCurNext(ucfWbNumeric, ucfWbNumeric);
// WB9. Do not break within sequences of digits, or digits adjacent to letters.
TestCurNext(ucfWbALetter, ucfWbNumeric);
// WB10. Do not break within sequences of digits, or digits adjacent to letters.
TestCurNext(ucfWbNumeric, ucfWbALetter);
// WB11. Do not break within sequences, such as "3.2" or "3.456,789".
TestPrevCurNext(ucfWbNumeric, ucfWbMidNum, ucfWbNumeric);
// WB12. Do not break within sequences, such as "3.2" or "3.456,789".
TestCurNext2(ucfWbNumeric, ucfWbMidNum, ucfWbNumeric);
// WB13. Do not break between Katakana.
TestCurNext(ucfWbKatakana, ucfWbKatakana);
// WB13a. Do not break from extenders.
if ((wbfCur & (ucfWbALetter | ucfWbNumeric | ucfWbKatakana | ucfWbExtendNumLet)) != 0 &&
(wbfNext & ucfWbExtendNumLet) == ucfWbExtendNumLet) continue;
// WB13b. Do not break from extenders.
if ((wbfCur & ucfWbExtendNumLet) == ucfWbExtendNumLet &&
(wbfNext & (ucfWbALetter | ucfWbNumeric | ucfWbKatakana)) != 0) continue;
// WB14. Otherwise, break everywhere.
position = posNext; return true;
#undef TestCurNext
#undef TestCurNext2
#undef TestPrevCurNext
}
// WB2. Break at the end of text.
IAssert(position == srcEnd);
return true;
}
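A minimal iteration sketch over the segments delimited by FindNextWordBoundary; note that UAX #29 places boundaries on both sides of spaces and punctuation, so the segments include such runs as well. The database path and input are placeholders.
TUniChDb db; db.LoadBin("UnicodeDefs.bin");   // hypothetical path
TIntV src;                                    // UTF-32 code points of the text
// ... fill 'src' ...
size_t prev = 0, pos = 0;
while (db.FindNextWordBoundary(src, 0, src.Len(), pos)) {
    printf("segment [%d, %d)\n", (int) prev, (int) pos);
    prev = pos; }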
| void TUniChDb::FindSentenceBoundaries | ( | const TSrcVec & | src, |
| const size_t | srcIdx, | ||
| const size_t | srcCount, | ||
| TBoolV & | dest | ||
| ) | const |
Definition at line 2790 of file unicode.h.
{
if (size_t(dest.Len()) != srcCount + 1) dest.Gen(TVecIdx(srcCount + 1));
dest.PutAll(false);
size_t position = srcIdx;
dest[TVecIdx(position - srcIdx)] = true;
while (position < srcIdx + srcCount)
{
size_t oldPos = position;
FindNextSentenceBoundary(src, srcIdx, srcCount, position);
Assert(oldPos < position); Assert(position <= srcIdx + srcCount);
dest[TVecIdx(position - srcIdx)] = true;
}
Assert(dest[TVecIdx(srcCount)]);
}
| void TUniChDb::FindWordBoundaries | ( | const TSrcVec & | src, |
| const size_t | srcIdx, | ||
| const size_t | srcCount, | ||
| TBoolV & | dest | ||
| ) | const |
Definition at line 2561 of file unicode.h.
{
if (size_t(dest.Len()) != srcCount + 1) dest.Gen(TVecIdx(srcCount + 1));
dest.PutAll(false);
size_t position = srcIdx;
dest[TVecIdx(position - srcIdx)] = true;
while (position < srcIdx + srcCount)
{
size_t oldPos = position;
FindNextWordBoundary(src, srcIdx, srcCount, position);
Assert(oldPos < position); Assert(position <= srcIdx + srcCount);
dest[TVecIdx(position - srcIdx)] = true;
}
Assert(dest[TVecIdx(srcCount)]);
}
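The batch form fills dest with srcCount + 1 flags, where dest[i] is true iff there is a boundary immediately before src[srcIdx + i]; the first and last entries are always true. A short sketch that turns the flag vector into segment indices ('db' and 'src' set up as in the previous sketch):
TBoolV isBoundary;
db.FindWordBoundaries(src, 0, src.Len(), isBoundary);  // isBoundary.Len() == src.Len() + 1
int segStart = 0;
for (int i = 1; i <= src.Len(); i++)
    if (isBoundary[i]) {                               // boundary just before src[i]
        printf("segment [%d, %d)\n", segStart, i);
        segStart = i; }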
| static TStr TUniChDb::GetAuxiliaryDir | ( | ) | [inline, static] |
| static TStr TUniChDb::GetBinFn | ( | ) | [inline, static] |
| void TUniChDb::GetCaseConverted | ( | const TSrcVec & | src, |
| size_t | srcIdx, | ||
| const size_t | srcCount, | ||
| TVec< TDestCh > & | dest, | ||
| const bool | clrDest, | ||
| const TCaseConversion | how, | ||
| const bool | turkic, | ||
| const bool | lithuanian | ||
| ) | const |
Definition at line 2811 of file unicode.h.
{
const TIntIntVH &specials = (how == ccUpper ? specialCasingUpper : how == ccLower ? specialCasingLower : how == ccTitle ? specialCasingTitle : *((TIntIntVH *) 0));
if (clrDest) dest.Clr();
enum {
GreekCapitalLetterSigma = 0x3a3,
GreekSmallLetterSigma = 0x3c3,
GreekSmallLetterFinalSigma = 0x3c2,
LatinCapitalLetterI = 0x49,
LatinCapitalLetterJ = 0x4a,
LatinCapitalLetterIWithOgonek = 0x12e,
LatinCapitalLetterIWithGrave = 0xcc,
LatinCapitalLetterIWithAcute = 0xcd,
LatinCapitalLetterIWithTilde = 0x128,
LatinCapitalLetterIWithDotAbove = 0x130,
LatinSmallLetterI = 0x69,
CombiningDotAbove = 0x307
};
//
bool seenCased = false, seenTwoCased = false; int cpFirstCased = -1;
size_t nextWordBoundary = srcIdx;
TBoolV wordBoundaries; bool wbsKnown = false;
for (const size_t origSrcIdx = srcIdx, srcEnd = srcIdx + srcCount; srcIdx < srcEnd; )
{
int cp = src[TVecIdx(srcIdx)]; srcIdx++;
//if (turkic && cp == 0x130 && how == ccLower) printf("!");
// For conversion to titlecase, the first cased character of each word
// must be converted to titlecase; everything else must be converted
// to lowercase.
TUniChDb::TCaseConversion howHere;
if (how != ccTitle) howHere = how;
else {
if (srcIdx - 1 == nextWordBoundary) { // A word starts/ends here.
seenCased = false; seenTwoCased = false; cpFirstCased = -1;
size_t next = nextWordBoundary; FindNextWordBoundary(src, origSrcIdx, srcCount, next);
IAssert(next > nextWordBoundary); nextWordBoundary = next; }
bool isCased = IsCased(cp);
if (isCased && ! seenCased) { howHere = ccTitle; seenCased = true; cpFirstCased = cp; }
else { howHere = ccLower;
if (isCased && seenCased) seenTwoCased = true; }
}
// First, process the conditional mappings from SpecialCasing.txt.
// These will be processed in code -- they were ignored while
// we were reading SpecialCasing.txt itself.
if (cp == GreekCapitalLetterSigma && howHere == ccLower)
{
// SpecialCasing.txt mentions the 'FinalSigma' condition, but sec. 3.13 of
// the standard doesn't define it. We'll use FinalCased instead.
// FinalCased: within the closest word boundaries containing C,
// there is a cased letter before C, and there is no cased letter after C.
//size_t nextBoundary = srcIdx - 1; FindNextWordBoundary(src, srcIdx, srcCount, nextBoundary);
if (! wbsKnown) { FindWordBoundaries(src, origSrcIdx, srcCount, wordBoundaries); wbsKnown = true; }
size_t srcIdx2 = srcIdx; bool casedAfter = false;
if (how == ccTitle)
printf("!");
//while (srcIdx2 < nextBoundary)
while (! wordBoundaries[TVecIdx(srcIdx2 - origSrcIdx)])
{
int cp2 = src[TVecIdx(srcIdx2)]; srcIdx2++;
if (IsCased(cp2)) { casedAfter = true; break; }
}
if (! casedAfter)
{
//size_t prevBoundary = srcIdx - 1;
//FindPreviousWordBoundary(src, srcIdx, srcCount, prevBoundary);
srcIdx2 = srcIdx - 1; bool casedBefore = false;
//while (prevBoundary < srcIdx2)
while (! wordBoundaries[TVecIdx(srcIdx2 - origSrcIdx)])
{
--srcIdx2; int cp2 = src[TVecIdx(srcIdx2)];
if (IsCased(cp2)) { casedBefore = true; break; }
}
if (casedBefore) {
// Now we have a FinalCased character.
dest.Add(GreekSmallLetterFinalSigma); Assert(howHere == ccLower); continue; }
}
// If we got here, add a non-final sigma.
dest.Add(GreekSmallLetterSigma); continue;
}
else if (lithuanian)
{
if (howHere == ccLower)
{
if (cp == LatinCapitalLetterI || cp == LatinCapitalLetterJ || cp == LatinCapitalLetterIWithOgonek)
{
bool moreAbove = false;
for (size_t srcIdx2 = srcIdx; srcIdx2 < srcEnd; )
{
const int cp2 = src[TVecIdx(srcIdx2)]; srcIdx2++;
const int cc2 = GetCombiningClass(cp2);
if (cc2 == TUniChInfo::ccStarter) break;
if (cc2 == TUniChInfo::ccAbove) { moreAbove = true; break; }
}
if (moreAbove)
{
if (cp == LatinCapitalLetterI) { dest.Add(0x69); dest.Add(0x307); continue; }
if (cp == LatinCapitalLetterJ) { dest.Add(0x6a); dest.Add(0x307); continue; }
if (cp == LatinCapitalLetterIWithOgonek) { dest.Add(0x12f); dest.Add(0x307); continue; }
}
}
else if (cp == LatinCapitalLetterIWithGrave) { dest.Add(0x69); dest.Add(0x307); dest.Add(0x300); continue; }
else if (cp == LatinCapitalLetterIWithAcute) { dest.Add(0x69); dest.Add(0x307); dest.Add(0x301); continue; }
else if (cp == LatinCapitalLetterIWithTilde) { dest.Add(0x69); dest.Add(0x307); dest.Add(0x303); continue; }
}
if (cp == CombiningDotAbove)
{
// Lithuanian, howHere != ccLower.
// AfterSoftDotted := the last preceding character with a combining class
// of zero before C was Soft_Dotted, and there is no intervening combining
// character class 230 (ABOVE).
bool afterSoftDotted = false;
size_t srcIdx2 = srcIdx - 1; // now srcIdx2 is the index from which we got 'cp'
while (origSrcIdx < srcIdx2)
{
--srcIdx2; int cp2 = src[TVecIdx(srcIdx2)];
int cc2 = GetCombiningClass(cp2);
if (cc2 == TUniChInfo::ccAbove) break;
if (cc2 == TUniChInfo::ccStarter) {
afterSoftDotted = IsSoftDotted(cp2); break; }
}
if (afterSoftDotted)
{
Assert(lithuanian);
// Remove DOT ABOVE after "i" with upper or titlecase.
// - Note: but this must only be done if that "i" was actually placed into uppercase (if how == ccTitle,
// the "i" may have been kept lowercase and thus we shouldn't remove the dot).
if (how == ccLower) { dest.Add(0x307); continue; }
if (how == ccUpper) continue;
Assert(how == ccTitle);
Assert(howHere == ccLower); // because CombiningDotAbove is not a cased character
if (seenCased && ! seenTwoCased) continue; // The "i" has been placed into uppercase; thus, remove the dot.
dest.Add(0x307); continue;
}
}
}
else if (turkic) // language code 'tr' (Turkish) and 'az' (Azeri)
{
// I and i-dotless; I-dot and i are case pairs in Turkish and Azeri
// The following rules handle those cases.
if (cp == LatinCapitalLetterIWithDotAbove) {
dest.Add(howHere == ccLower ? 0x69 : 0x130); continue; }
// When lowercasing, remove dot_above in the sequence I + dot_above,
// which will turn into i. This matches the behavior of the
// canonically equivalent I-dot_above.
else if (cp == CombiningDotAbove)
{
// AfterI: the last preceding base character was an uppercase I,
// and there is no intervening combining character class 230 (ABOVE).
bool afterI = false;
size_t srcIdx2 = srcIdx - 1; // now srcIdx2 is the index from which we got 'cp'
while (origSrcIdx < srcIdx2)
{
--srcIdx2; int cp2 = src[TVecIdx(srcIdx2)];
if (cp2 == LatinCapitalLetterI) { afterI = true; break; }
int cc2 = GetCombiningClass(cp2);
if (cc2 == TUniChInfo::ccAbove || cc2 == TUniChInfo::ccStarter) break;
}
if (afterI) {
if (how == ccTitle && seenCased && ! seenTwoCased) {
// Sec. 3.13 defines title-casing in an unusual way: find the first cased character in each word;
// if found, map it to titlecase; otherwise, map all characters in that word to lowercase.
// This suggests that if a cased character is found, others in that word should be left alone.
// This seems unusual; we map all other characters to lowercase instead.
// But this leads to problems with e.g. I followed by dot-above (U+0307): since the dot-above
// is not the first cased character (it isn't even cased), we attempt to set it to lowercase;
// but since afterI is also true here, this would mean deleting it. Thus our titlecased
// form of "I followed by dot-above" would be just "I", which is clearly wrong.
// So we treat this as a special case here.
IAssert(cpFirstCased == LatinCapitalLetterI);
dest.Add(0x307); continue; }
if (howHere != ccLower) dest.Add(0x307);
continue; }
}
// When lowercasing, unless an I is before a dot_above,
// it turns into a dotless i.
else if (cp == LatinCapitalLetterI)
{
// BeforeDot: C is followed by U+0307 (combining dot above).
// Any sequence of characters with a combining class that is
// neither 0 nor 230 may intervene between the current character
// and the combining dot above.
bool beforeDot = false;
for (size_t srcIdx2 = srcIdx; srcIdx2 < srcEnd; )
{
const int cp2 = src[TVecIdx(srcIdx2)]; srcIdx2++;
if (cp2 == 0x307) { beforeDot = true; break; }
const int cc2 = GetCombiningClass(cp2);
if (cc2 == TUniChInfo::ccStarter || cc2 == TUniChInfo::ccAbove) break;
}
if (! beforeDot) {
dest.Add(howHere == ccLower ? 0x131 : 0x49); continue; }
}
// When uppercasing, i turns into a dotted capital I.
else if (cp == LatinSmallLetterI)
{
dest.Add(howHere == ccLower ? 0x69 : 0x130); continue;
}
}
// Try to use the unconditional mappings.
const TIntIntVH &specHere = (
howHere == how ? specials :
howHere == ccLower ? specialCasingLower :
howHere == ccTitle ? specialCasingTitle :
howHere == ccUpper ? specialCasingUpper : *((TIntIntVH *) 0));
int i = specHere.GetKeyId(cp);
if (i >= 0) { TUniCaseFolding::AppendVector(specHere[i], dest); continue; }
// Try to use the simple (one-character) mappings.
i = h.GetKeyId(cp);
if (i >= 0) {
const TUniChInfo &ci = h[i];
int cpNew = (
howHere == ccLower ? ci.simpleLowerCaseMapping :
howHere == ccUpper ? ci.simpleUpperCaseMapping :
ci.simpleTitleCaseMapping);
if (cpNew < 0) cpNew = cp;
dest.Add(cpNew); continue; }
// As a final resort, leave 'cp' unchanged.
dest.Add(cp);
}
}
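A hedged sketch of the language-sensitive behaviour: the turkic flag switches on the dotted/dotless-i rules handled above, so an uppercase I lowercases to U+0131 rather than U+0069. Hypothetical database path.
TUniChDb db; db.LoadBin("UnicodeDefs.bin");            // hypothetical path
TIntV src, plain, turkish;
src.Add(0x0049);                                       // 'I'
db.GetLowerCase(src, plain, true, /*turkic=*/false);   // expected: U+0069 ('i')
db.GetLowerCase(src, turkish, true, /*turkic=*/true);  // expected: U+0131 (dotless i)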
| void TUniChDb::GetCaseFolded | ( | const TSrcVec & | src, |
| size_t | srcIdx, | ||
| const size_t | srcCount, | ||
| TVec< TDestCh > & | dest, | ||
| const bool | clrDest, | ||
| const bool | full, | ||
| const bool | turkic = false |
||
| ) | const [inline] |
Definition at line 1629 of file unicode.h.
{ caseFolding.Fold(src, srcIdx, srcCount, dest, clrDest, full, turkic); }
| void TUniChDb::GetCaseFolded | ( | const TSrcVec & | src, |
| TVec< TDestCh > & | dest, | ||
| const bool | clrDest = true, |
||
| const bool | full = true, |
||
| const bool | turkic = false |
||
| ) | const [inline] |
Definition at line 1632 of file unicode.h.
{
GetCaseFolded(src, 0, src.Len(), dest, clrDest, full, turkic); }
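Case folding is meant for caseless comparison rather than for display; a brief sketch that compares two code-point vectors after folding (hypothetical database path, inputs left as placeholders):
TUniChDb db; db.LoadBin("UnicodeDefs.bin");  // hypothetical path
TIntV a, b, fa, fb;
// ... fill 'a' and 'b' with the code points of the two strings ...
db.GetCaseFolded(a, fa);                     // defaults: clrDest = true, full = true, turkic = false
db.GetCaseFolded(b, fb);
bool equal = (fa.Len() == fb.Len());
for (int i = 0; equal && i < fa.Len(); i++) equal = (fa[i] == fb[i]);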
| static TStr TUniChDb::GetCaseFoldingFn | ( | ) | [inline, static] |
| TUniChCategory TUniChDb::GetCat | ( | const int | cp | ) | const [inline] |
| const char* TUniChDb::GetCharName | ( | const int | cp | ) | const [inline] |
| TStr TUniChDb::GetCharNameS | ( | const int | cp | ) | const [inline] |
Definition at line 1332 of file unicode.h.
{
// ToDo: Add special processing for precomposed Hangul syllables (UAX #15, sec. 16).
const char *p = GetCharName(cp); if (p) return p;
char buf[20]; sprintf(buf, "U+%04x", cp); return TStr(buf); }
| int TUniChDb::GetCombiningClass | ( | const int | cp | ) | const [inline] |
| static TStr TUniChDb::GetCompositionExclusionsFn | ( | ) | [inline, static] |
| static TStr TUniChDb::GetDerivedCorePropsFn | ( | ) | [inline, static] |
| static TStr TUniChDb::GetLineBreakFn | ( | ) | [inline, static] |
| void TUniChDb::GetLowerCase | ( | const TSrcVec & | src, |
| size_t | srcIdx, | ||
| const size_t | srcCount, | ||
| TVec< TDestCh > & | dest, | ||
| const bool | clrDest = true, |
||
| const bool | turkic = false, |
||
| const bool | lithuanian = false |
||
| ) | const [inline] |
Definition at line 1590 of file unicode.h.
{ GetCaseConverted(src, srcIdx, srcCount, dest, clrDest, ccLower, turkic, lithuanian); }
| void TUniChDb::GetLowerCase | ( | const TSrcVec & | src, |
| TVec< TDestCh > & | dest, | ||
| const bool | clrDest = true, |
||
| const bool | turkic = false, |
||
| const bool | lithuanian = false |
||
| ) | const [inline] |
Definition at line 1593 of file unicode.h.
{ GetLowerCase(src, 0, src.Len(), dest, clrDest, turkic, lithuanian); }
| static TStr TUniChDb::GetNormalizationTestFn | ( | ) | [inline, static] |
| static TStr TUniChDb::GetPropListFn | ( | ) | [inline, static] |
| int TUniChDb::GetSbFlags | ( | const int | cp | ) | const [inline] |
| int TUniChDb::GetScript | ( | const TUniChInfo & | ci | ) | const [inline] |
Definition at line 1323 of file unicode.h.
{ int s = ci.script; if (s < 0) s = scriptUnknown; return s; }
| int TUniChDb::GetScript | ( | const int | cp | ) | const [inline] |
| int TUniChDb::GetScriptByName | ( | const TStr & | scriptName | ) | const [inline] |
| const TStr& TUniChDb::GetScriptName | ( | const int | scriptId | ) | const [inline] |
| static TStr TUniChDb::GetScriptNameHiragana | ( | ) | [inline, static] |
| static TStr TUniChDb::GetScriptNameKatakana | ( | ) | [inline, static] |
| static TStr TUniChDb::GetScriptNameUnknown | ( | ) | [inline, static] |
| static TStr TUniChDb::GetScriptsFn | ( | ) | [inline, static] |
| static TStr TUniChDb::GetSentenceBreakPropertyFn | ( | ) | [inline, static] |
| static TStr TUniChDb::GetSentenceBreakTestFn | ( | ) | [inline, static] |
| void TUniChDb::GetSimpleCaseConverted | ( | const TSrcVec & | src, |
| size_t | srcIdx, | ||
| const size_t | srcCount, | ||
| TVec< TDestCh > & | dest, | ||
| const bool | clrDest, | ||
| const TCaseConversion | how | ||
| ) | const |
Definition at line 3036 of file unicode.h.
{
if (clrDest) dest.Clr();
bool seenCased = false; size_t nextWordBoundary = srcIdx;
for (const size_t origSrcIdx = srcIdx, srcEnd = srcIdx + srcCount; srcIdx < srcEnd; )
{
const int cp = src[TVecIdx(srcIdx)]; srcIdx++;
int i = h.GetKeyId(cp); if (i < 0) { dest.Add(cp); continue; }
const TUniChInfo &ci = h[i];
// With titlecasing, the first cased character of each word must be put into titlecase,
// all others into lowercase. This is what the howHere variable is for.
TUniChDb::TCaseConversion howHere;
if (how != ccTitle) howHere = how;
else {
if (srcIdx - 1 == nextWordBoundary) { // A word starts/ends here.
seenCased = false;
size_t next = nextWordBoundary; FindNextWordBoundary(src, origSrcIdx, srcCount, next);
IAssert(next > nextWordBoundary); nextWordBoundary = next; }
bool isCased = IsCased(cp);
if (isCased && ! seenCased) { howHere = ccTitle; seenCased = true; }
else howHere = ccLower;
}
int cpNew = (howHere == ccTitle ? ci.simpleTitleCaseMapping : howHere == ccUpper ? ci.simpleUpperCaseMapping : ci.simpleLowerCaseMapping);
if (cpNew < 0) cpNew = cp;
dest.Add(cpNew);
}
}
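The simple conversions always map one code point to one code point, so they cannot expand a character the way the full conversions (GetCaseConverted and its wrappers) can. The classic difference is U+00DF LATIN SMALL LETTER SHARP S; a hedged sketch with a hypothetical database path:
TUniChDb db; db.LoadBin("UnicodeDefs.bin");  // hypothetical path
TIntV src, simple, full;
src.Add(0x00DF);                             // 'ß'
db.GetSimpleUpperCase(src, simple);          // no one-to-one uppercase mapping in UnicodeData, so U+00DF is kept
db.GetUpperCase(src, full);                  // the full mapping expands to "SS": U+0053 U+0053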
| void TUniChDb::GetSimpleLowerCase | ( | const TSrcVec & | src, |
| size_t | srcIdx, | ||
| const size_t | srcCount, | ||
| TVec< TDestCh > & | dest, | ||
| const bool | clrDest = true |
||
| ) | const [inline] |
Definition at line 1601 of file unicode.h.
{ GetSimpleCaseConverted(src, srcIdx, srcCount, dest, clrDest, ccLower); }
| void TUniChDb::GetSimpleLowerCase | ( | const TSrcVec & | src, |
| TVec< TDestCh > & | dest, | ||
| const bool | clrDest = true |
||
| ) | const [inline] |
Definition at line 1604 of file unicode.h.
{ GetSimpleLowerCase(src, 0, src.Len(), dest, clrDest); }
| void TUniChDb::GetSimpleTitleCase | ( | const TSrcVec & | src, |
| size_t | srcIdx, | ||
| const size_t | srcCount, | ||
| TVec< TDestCh > & | dest, | ||
| const bool | clrDest = true |
||
| ) | const [inline] |
Definition at line 1603 of file unicode.h.
{ GetSimpleCaseConverted(src, srcIdx, srcCount, dest, clrDest, ccTitle); }
| void TUniChDb::GetSimpleTitleCase | ( | const TSrcVec & | src, |
| TVec< TDestCh > & | dest, | ||
| const bool | clrDest = true |
||
| ) | const [inline] |
Definition at line 1606 of file unicode.h.
{ GetSimpleTitleCase(src, 0, src.Len(), dest, clrDest); }
| void TUniChDb::GetSimpleUpperCase | ( | const TSrcVec & | src, |
| size_t | srcIdx, | ||
| const size_t | srcCount, | ||
| TVec< TDestCh > & | dest, | ||
| const bool | clrDest = true |
||
| ) | const [inline] |
Definition at line 1602 of file unicode.h.
{ GetSimpleCaseConverted(src, srcIdx, srcCount, dest, clrDest, ccUpper); }
| void TUniChDb::GetSimpleUpperCase | ( | const TSrcVec & | src, |
| TVec< TDestCh > & | dest, | ||
| const bool | clrDest = true |
||
| ) | const [inline] |
Definition at line 1605 of file unicode.h.
{ GetSimpleUpperCase(src, 0, src.Len(), dest, clrDest); }
| static TStr TUniChDb::GetSpecialCasingFn | ( | ) | [inline, static] |
| TUniChSubCategory TUniChDb::GetSubCat | ( | const int | cp | ) | const [inline] |
| void TUniChDb::GetTitleCase | ( | const TSrcVec & | src, |
| size_t | srcIdx, | ||
| const size_t | srcCount, | ||
| TVec< TDestCh > & | dest, | ||
| const bool | clrDest = true, |
||
| const bool | turkic = false, |
||
| const bool | lithuanian = false |
||
| ) | const [inline] |
Definition at line 1592 of file unicode.h.
{ GetCaseConverted(src, srcIdx, srcCount, dest, clrDest, ccTitle, turkic, lithuanian); }
| void TUniChDb::GetTitleCase | ( | const TSrcVec & | src, |
| TVec< TDestCh > & | dest, | ||
| const bool | clrDest = true, |
||
| const bool | turkic = false, |
||
| const bool | lithuanian = false |
||
| ) | const [inline] |
Definition at line 1595 of file unicode.h.
{ GetTitleCase(src, 0, src.Len(), dest, clrDest, turkic, lithuanian); }
| static TStr TUniChDb::GetUnicodeDataFn | ( | ) | [inline, static] |
| void TUniChDb::GetUpperCase | ( | const TSrcVec & | src, |
| size_t | srcIdx, | ||
| const size_t | srcCount, | ||
| TVec< TDestCh > & | dest, | ||
| const bool | clrDest = true, |
||
| const bool | turkic = false, |
||
| const bool | lithuanian = false |
||
| ) | const [inline] |
Definition at line 1591 of file unicode.h.
{ GetCaseConverted(src, srcIdx, srcCount, dest, clrDest, ccUpper, turkic, lithuanian); }
| void TUniChDb::GetUpperCase | ( | const TSrcVec & | src, |
| TVec< TDestCh > & | dest, | ||
| const bool | clrDest = true, |
||
| const bool | turkic = false, |
||
| const bool | lithuanian = false |
||
| ) | const [inline] |
Definition at line 1594 of file unicode.h.
{ GetUpperCase(src, 0, src.Len(), dest, clrDest, turkic, lithuanian); }
| int TUniChDb::GetWbFlags | ( | const int | cp | ) | const [inline] |
| static TStr TUniChDb::GetWordBreakPropertyFn | ( | ) | [inline, static] |
| static TStr TUniChDb::GetWordBreakTestFn | ( | ) | [inline, static] |
| void TUniChDb::InitAfterLoad | ( | ) | [protected] |
Definition at line 1372 of file unicode.cpp.
{
scriptUnknown = GetScriptByName(GetScriptNameUnknown()); IAssert(scriptUnknown >= 0);
}
| void TUniChDb::InitDerivedCoreProperties | ( | const TStr & | basePath | ) | [protected] |
Definition at line 1011 of file unicode.cpp.
{
TUcdFileReader reader; TStrV fields; int nCps = 0, nLines = 0;
reader.Open(CombinePath(basePath, GetDerivedCorePropsFn()));
TSubcatHelper helper(*this);
while (reader.GetNextLine(fields))
{
IAssert(fields.Len() == 2);
int from, to; reader.ParseCodePointRange(fields[0], from, to);
TStr s = fields[1];
TUniChFlags flag = ucfCompatibilityDecomposition;
if (s == "Math") flag = ucfDcpMath;
else if (s == "Alphabetic") flag = ucfDcpAlphabetic;
else if (s == "Lowercase") flag = ucfDcpLowercase;
else if (s == "Uppercase") flag = ucfDcpUppercase;
else if (s == "ID_Start") flag = ucfDcpIdStart;
else if (s == "ID_Continue") flag = ucfDcpIdContinue;
else if (s == "XID_Start") flag = ucfDcpXidStart;
else if (s == "XID_Continue") flag = ucfDcpXidContinue;
else if (s == "Default_Ignorable_Code_Point") flag = ucfDcpDefaultIgnorableCodePoint;
else if (s == "Grapheme_Extend") flag = ucfDcpGraphemeExtend;
else if (s == "Grapheme_Base") flag = ucfDcpGraphemeBase;
else if (s == "Grapheme_Link") continue; // this flag is deprecated; test for combClass == Virama instead
else FailR(s.CStr());
// If we add new codepoints to the hash table, we should also set their category.
// This is supposed to be provided in the comment, e.g. "# Cf SOFT HYPHEN".
helper.ProcessComment(reader);
//
for (int cp = from; cp <= to; cp++) {
int i = h.GetKeyId(cp); if (i < 0) { i = h.AddKey(cp); helper.SetCat(cp); }
helper.TestCat(cp);
TUniChInfo &ci = h[i]; IAssert(! ci.IsDcpFlag(flag));
ci.SetDcpFlag(flag); nCps++; }
nLines++;
}
reader.Close();
printf("TUniChDb::InitDerivedCoreProperties: %d lines, %d code points.\n", nLines, nCps);
}
| void TUniChDb::InitLineBreaks | ( | const TStr & | basePath | ) | [protected] |
Definition at line 1050 of file unicode.cpp.
{
// Clear old linebreak values.
ushort xx = TUniChInfo::LineBreak_Unknown;
for (int i = h.FFirstKeyId(); h.FNextKeyId(i); ) h[i].lineBreak = xx;
// Read LineBreak.txt.
TUcdFileReader reader; TStrV fields;
reader.Open(CombinePath(basePath, GetLineBreakFn()));
int nLines = 0, nCps = 0;
while (reader.GetNextLine(fields))
{
IAssert(fields.Len() == 2);
int from, to; reader.ParseCodePointRange(fields[0], from, to);
TStr s = fields[1]; IAssert(s.Len() == 2);
ushort us = TUniChInfo::GetLineBreakCode(s[0], s[1]);
if (us == xx) continue;
for (int cp = from; cp <= to; cp++) {
int i = h.GetKeyId(cp); if (i < 0) { i = h.AddKey(cp);
printf("TUniChDb::InitLineBreaks: warning, adding codepoint %d, its category will remain unknown.\n", cp); }
IAssert(h[i].lineBreak == xx);
h[i].lineBreak = us; nCps++; }
nLines++;
}
reader.Close();
printf("TUniChDb::InitLineBreaks: %d lines, %d codepoints processed (excluding \'xx\' values).\n", nLines, nCps);
}
| void TUniChDb::InitPropList | ( | const TStr & | basePath | ) | [protected] |
Definition at line 954 of file unicode.cpp.
{
TUcdFileReader reader; TStrV fields; int nCps = 0, nLines = 0;
reader.Open(CombinePath(basePath, GetPropListFn()));
TSubcatHelper helper(*this);
while (reader.GetNextLine(fields))
{
IAssert(fields.Len() == 2);
int from, to; reader.ParseCodePointRange(fields[0], from, to);
TStr s = fields[1];
TUniChProperties prop = TUniChProperties(0); TUniChPropertiesX propx = TUniChPropertiesX(0);
if (s == "White_Space") prop = ucfPrWhiteSpace;
else if (s == "Bidi_Control") prop = ucfPrBidiControl;
else if (s == "Join_Control") prop = ucfPrJoinControl;
else if (s == "Dash") prop = ucfPrDash;
else if (s == "Hyphen") prop = ucfPrHyphen;
else if (s == "Quotation_Mark") prop = ucfPrQuotationMark;
else if (s == "Terminal_Punctuation") prop = ucfPrTerminalPunctuation;
else if (s == "Other_Math") propx = ucfPxOtherMath;
else if (s == "Hex_Digit") prop = ucfPrHexDigit;
else if (s == "ASCII_Hex_Digit") prop = ucfPrAsciiHexDigit;
else if (s == "Other_Alphabetic") propx = ucfPxOtherAlphabetic;
else if (s == "Ideographic") prop = ucfPrIdeographic;
else if (s == "Diacritic") prop = ucfPrDiacritic;
else if (s == "Extender") prop = ucfPrExtender;
else if (s == "Other_Lowercase") propx = ucfPxOtherLowercase;
else if (s == "Other_Uppercase") propx = ucfPxOtherUppercase;
else if (s == "Noncharacter_Code_Point") prop = ucfPrNoncharacterCodePoint;
else if (s == "Other_Grapheme_Extend") propx = ucfPxOtherGraphemeExtend;
else if (s == "IDS_Binary_Operator") propx = ucfPxIdsBinaryOperator;
else if (s == "IDS_Trinary_Operator") propx = ucfPxIdsTrinaryOperator;
else if (s == "Radical") propx = ucfPxRadical;
else if (s == "Unified_Ideograph") propx = ucfPxUnifiedIdeograph;
else if (s == "Other_Default_Ignorable_Code_Point") propx = ucfPxOtherDefaultIgnorableCodePoint;
else if (s == "Deprecated") prop = ucfPrDeprecated;
else if (s == "Soft_Dotted") prop = ucfPrSoftDotted;
else if (s == "Logical_Order_Exception") prop = ucfPrLogicalOrderException;
else if (s == "Other_ID_Start") propx = ucfPxOtherIdStart;
else if (s == "Other_ID_Continue") propx = ucfPxOtherIdContinue;
else if (s == "STerm") prop = ucfPrSTerm;
else if (s == "Variation_Selector") prop = ucfPrVariationSelector;
else if (s == "Pattern_White_Space") prop = ucfPrPatternWhiteSpace;
else if (s == "Pattern_Syntax") prop = ucfPrPatternSyntax;
else FailR(s.CStr());
helper.ProcessComment(reader);
for (int cp = from; cp <= to; cp++) {
int i = h.GetKeyId(cp); if (i < 0) { i = h.AddKey(cp); helper.SetCat(cp); }
TUniChInfo &ci = h[i]; helper.TestCat(cp);
if (prop) { IAssert(! ci.IsProperty(prop)); ci.SetProperty(prop); }
if (propx) { IAssert(! ci.IsPropertyX(propx)); ci.SetPropertyX(propx); }
nCps++; }
nLines++;
}
reader.Close();
printf("TUniChDb::InitPropList: %d lines, %d code points.\n", nLines, nCps);
}
| void TUniChDb::InitScripts | ( | const TStr & | basePath | ) | [protected] |
Definition at line 1077 of file unicode.cpp.
{
TUcdFileReader reader; TStrV fields;
reader.Open(CombinePath(basePath, GetScriptsFn()));
TSubcatHelper helper(*this);
while (reader.GetNextLine(fields))
{
int from, to; reader.ParseCodePointRange(fields[0], from, to);
TStr scriptName = fields[1];
int scriptNo = scripts.GetKeyId(scriptName);
if (scriptNo < 0) { scriptNo = scripts.AddKey(scriptName); scripts[scriptNo] = 0; }
IAssert(scriptNo >= 0 && scriptNo < SCHAR_MAX); // because TUniChInfo.script is a signed char
scripts[scriptNo] += 1;
helper.ProcessComment(reader);
for (int cp = from; cp <= to; cp++) {
int i = h.GetKeyId(cp); if (i < 0) { i = h.AddKey(cp); helper.SetCat(cp); }
helper.TestCat(cp);
TUniChInfo &ci = h[i]; ci.script = scriptNo; }
}
reader.Close();
scripts.AddDat(GetScriptNameUnknown()) = 0;
printf("TUniChDb::InitScripts: %d scripts: ", scripts.Len());
if (AlwaysFalse()) for (int i = scripts.FFirstKeyId(); scripts.FNextKeyId(i); )
printf(" %d:%s (%d)", i, scripts.GetKey(i).CStr(), int(scripts[i]));
printf("\n");
}
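Once the script assignments are in place (whether built here or loaded from the binary dump), they are queried through the public script accessors. A minimal sketch, assuming a fully loaded TUniChDb instance named db; the codepoint is just an example:

  int cp = 0x30A2;                          // KATAKANA LETTER A
  int scriptId = db.GetScript(cp);          // falls back to the "Unknown" script if unassigned
  printf("U+%04x -> %s\n", cp, db.GetScriptName(scriptId).CStr());
  int latin = db.GetScriptByName("Latin");  // negative if no script of that name was read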
| void TUniChDb::InitSpecialCasing | ( | const TStr & | basePath | ) | [protected] |
Definition at line 1229 of file unicode.cpp.
{
TUcdFileReader reader; TStrV fields;
reader.Open(CombinePath(basePath, GetSpecialCasingFn()));
while (reader.GetNextLine(fields))
{
IAssert(fields.Len() == 5 || fields.Len() == 6);
IAssert(fields.Last().Empty());
// Skip conditional mappings -- they will be hardcoded in the GetCaseConverted method.
TStr conditions = "";
if (fields.Len() == 6) conditions = fields[4];
conditions.ToTrunc(); if (! conditions.Empty()) continue;
// Keep the other mappings.
const int cp = reader.ParseCodePoint(fields[0]);
TIntV v; reader.ParseCodePointList(fields[1], v);
specialCasingLower.AddDat(cp, v);
reader.ParseCodePointList(fields[2], v);
specialCasingTitle.AddDat(cp, v);
reader.ParseCodePointList(fields[3], v);
specialCasingUpper.AddDat(cp, v);
}
reader.Close();
}
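The fields read here follow the SpecialCasing.txt convention: codepoint; lowercase mapping; titlecase mapping; uppercase mapping; optional condition list; and an empty field after the final semicolon. Only the unconditional entries are stored; an illustrative line of the kind that is kept:

  00DF; 00DF; 0053 0073; 0053 0053; # LATIN SMALL LETTER SHARP S

Entries whose condition field is non-empty (e.g. a language code such as tr, or a context like After_I) are skipped here and handled directly in GetCaseConverted.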
| void TUniChDb::InitWordAndSentenceBoundaryFlags | ( | const TStr & | basePath | ) | [protected] |
Definition at line 1104 of file unicode.cpp.
{
// UAX #29, sec. 4.1 and 5.1.
// Note: these flags can also be initialized from auxiliary\\WordBreakProperty.txt.
int katakana = GetScriptByName(GetScriptNameKatakana()); IAssert(katakana >= 0);
int hiragana = GetScriptByName(GetScriptNameHiragana()); IAssert(hiragana >= 0);
// Clear any existing word-boundary flags and initialize them again.
for (int i = h.FFirstKeyId(); h.FNextKeyId(i); )
{
const int cp = h.GetKey(i); TUniChInfo& ci = h[i];
ci.ClrWbAndSbFlags();
// Word-boundary flags.
if (ci.subCat == ucOtherFormat && cp != 0x200c && cp != 0x200d) ci.SetWbFlag(ucfWbFormat);
if (ci.script == katakana) ci.SetWbFlag(ucfWbKatakana);
if (ci.lineBreak == TUniChInfo::LineBreak_InfixNumeric && cp != 0x3a) ci.SetWbFlag(ucfWbMidNum);
if (ci.lineBreak == TUniChInfo::LineBreak_Numeric) ci.SetWbFlag(ucfWbNumeric);
if (ci.subCat == ucPunctuationConnector) ci.SetWbFlag(ucfWbExtendNumLet);
// Sentence-boundary flags. Some are identical to some word-boundary flags.
if (cp == 0xa || cp == 0xd || cp == 0x85 || cp == 0x2028 || cp == 0x2029) ci.SetSbFlag(ucfSbSep);
if (ci.subCat == ucOtherFormat && cp != 0x200c && cp != 0x200d) ci.SetSbFlag(ucfSbFormat);
if (ci.IsWhiteSpace() && ! ci.IsSbFlag(ucfSbSep) && cp != 0xa0) ci.SetSbFlag(ucfSbSp);
if (ci.IsLowercase() && ! ci.IsGraphemeExtend()) ci.SetSbFlag(ucfSbLower);
if (ci.IsUppercase() || ci.subCat == ucLetterTitlecase) ci.SetSbFlag(ucfSbUpper);
if ((ci.IsAlphabetic() || cp == 0xa0 || cp == 0x5f3) && ! ci.IsSbFlag(ucfSbLower) && ! ci.IsSbFlag(ucfSbUpper) && ! ci.IsGraphemeExtend()) ci.SetSbFlag(ucfSbOLetter);
if (ci.lineBreak == TUniChInfo::LineBreak_Numeric) ci.SetSbFlag(ucfSbNumeric);
if (cp == 0x2e) ci.SetSbFlag(ucfSbATerm);
// Note: UAX #29 says that if the property STerm = true, then the character should belong to the STerm class for
// the purposes of sentence-boundary detection. Now in PropList.txt there is no doubt that 002E has the STerm
// property; thus, it should also belong to the STerm sentence-boundary class. However, in
// SentenceBreakProperty.txt, 002E is only listed in the ATerm class, but not in the STerm class.
if (ci.IsSTerminal() && cp != 0x2e) ci.SetSbFlag(ucfSbSTerm);
if ((ci.subCat == ucPunctuationOpen || ci.subCat == ucPunctuationClose || ci.lineBreak == TUniChInfo::LineBreak_Quotation) && cp != 0x5f3 && ! ci.IsSbFlag(ucfSbATerm) && ! ci.IsSbFlag(ucfSbSTerm)) ci.SetSbFlag(ucfSbClose);
}
// Some additional characters for Katakana and MidLetter.
TIntV v = (VB, 0x3031, 0x3032, 0x3033, 0x3034, 0x3035, 0x309b, 0x309c, 0x30a0, 0x30fc, 0xff70, 0xff9e, 0xff9f);
for (int i = 0; i < v.Len(); i++) h.GetDat(v[i]).SetWbFlag(ucfWbKatakana);
v = (VB, 0x27, 0xb7, 0x5f4, 0x2019, 0x2027, 0x3a);
for (int i = 0; i < v.Len(); i++) h.GetDat(v[i]).SetWbFlag(ucfWbMidLetter);
// WbALetter depends on Katakana, so it cannot be initialized earlier.
for (int i = h.FFirstKeyId(); h.FNextKeyId(i); )
{
const int cp = h.GetKey(i); TUniChInfo& ci = h[i];
if ((ci.IsAlphabetic() || cp == 0x5f3) && ! ci.IsIdeographic() && ! ci.IsWbFlag(ucfWbKatakana) && ci.lineBreak != TUniChInfo::LineBreak_ComplexContext && ci.script != hiragana && ! ci.IsGraphemeExtend())
ci.SetWbFlag(ucfWbALetter);
}
// An alternative is to extract the flags from WordBreakProperty.txt.
// The results should be the same.
{TUcdFileReader reader; TStrV fields;
reader.Open(CombinePath(CombinePath(basePath, GetAuxiliaryDir()), GetWordBreakPropertyFn()));
THash<TInt, TInt> hh;
while (reader.GetNextLine(fields))
{
IAssert(fields.Len() == 2);
int from, to; reader.ParseCodePointRange(fields[0], from, to);
TStr s = fields[1];
TUniChFlags flag = ucfCompatibilityDecomposition;
if (s == "Format") flag = ucfWbFormat;
else if (s == "Katakana") flag = ucfWbKatakana;
else if (s == "ALetter") flag = ucfWbALetter;
else if (s == "MidLetter") flag = ucfWbMidLetter;
else if (s == "MidNum") flag = ucfWbMidNum;
else if (s == "Numeric") flag = ucfWbNumeric;
else if (s == "ExtendNumLet") flag = ucfWbExtendNumLet;
else FailR(s.CStr());
for (int c = from; c <= to; c++) {
int i = hh.GetKeyId(c); if (i < 0) hh.AddDat(c, flag);
else hh[i].Val |= flag; }
}
reader.Close();
TIntV cps; for (int i = h.FFirstKeyId(); h.FNextKeyId(i); ) cps.Add(h.GetKey(i));
for (int i = hh.FFirstKeyId(); hh.FNextKeyId(i); ) cps.Add(hh.GetKey(i));
cps.Sort(); cps.Merge();
for (int i = 0; i < cps.Len(); i++)
{
int cp = cps[i];
int flags1 = 0; if (h.IsKey(cp)) flags1 = h.GetDat(cp).GetWbFlags();
int flags2 = 0; if (hh.IsKey(cp)) flags2 = hh.GetDat(cp);
flags1 &= ~ucfSbSep; flags2 &= ~ucfSbSep;
if (flags1 != flags2) {
printf("cp = %04x: flags1 = %08x flags2 = %08x xor = %08x\n", cp, flags1, flags2, flags1 ^ flags2);
Fail; }
}}
// Likewise, for sentence boundary flags we have SentenceBreakProperty.txt.
{TUcdFileReader reader; TStrV fields;
reader.Open(CombinePath(CombinePath(basePath, GetAuxiliaryDir()), GetSentenceBreakPropertyFn()));
THash<TInt, TInt> hh;
while (reader.GetNextLine(fields))
{
IAssert(fields.Len() == 2);
int from, to; reader.ParseCodePointRange(fields[0], from, to);
TStr s = fields[1];
TUniChFlags flag = ucfCompatibilityDecomposition;
if (s == "Sep") flag = ucfSbSep;
else if (s == "Format") flag = ucfSbFormat;
else if (s == "Sp") flag = ucfSbSp;
else if (s == "Lower") flag = ucfSbLower;
else if (s == "Upper") flag = ucfSbUpper;
else if (s == "OLetter") flag = ucfSbOLetter;
else if (s == "Numeric") flag = ucfSbNumeric;
else if (s == "ATerm") flag = ucfSbATerm;
else if (s == "STerm") flag = ucfSbSTerm;
else if (s == "Close") flag = ucfSbClose;
else FailR(s.CStr());
for (int c = from; c <= to; c++) {
int i = hh.GetKeyId(c); if (i < 0) hh.AddDat(c, flag);
else hh[i].Val |= flag; }
}
reader.Close();
TIntV cps; for (int i = h.FFirstKeyId(); h.FNextKeyId(i); ) cps.Add(h.GetKey(i));
for (int i = hh.FFirstKeyId(); hh.FNextKeyId(i); ) cps.Add(hh.GetKey(i));
cps.Sort(); cps.Merge();
for (int i = 0; i < cps.Len(); i++)
{
int cp = cps[i];
int flags1 = 0; if (h.IsKey(cp)) flags1 = h.GetDat(cp).GetSbFlags();
int flags2 = 0; if (hh.IsKey(cp)) flags2 = hh.GetDat(cp);
if (flags1 != flags2) {
printf("cp = %04x: flags1 = %08x [%s] flags2 = %08x [%s] xor = %08x\n", cp,
flags1, TUniChInfo::GetSbFlagsStr(flags1).CStr(),
flags2, TUniChInfo::GetSbFlagsStr(flags2).CStr(),
flags1 ^ flags2);
Fail; }
}}
}
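The flags assigned here are what the public word- and sentence-boundary queries report. A minimal sketch of inspecting them on a loaded database db, using the same flag enumerators as the code above; the codepoint is just an example:

  int cp = 0x2E;   // FULL STOP
  printf("WB flags: %s\n", TUniChInfo::GetWbFlagsStr(db.GetWbFlags(cp)).CStr());
  printf("SB flags: %s\n", TUniChInfo::GetSbFlagsStr(db.GetSbFlags(cp)).CStr());
  bool isATerm = db.IsSbFlag(cp, ucfSbATerm);   // true for '.', as set above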
| bool TUniChDb::IsGetChInfo | ( | const int | cp, |
| TUniChInfo & | ChInfo | ||
| ) | [inline] |
| DECLARE_FORWARDED_PROPERTY_METHODS bool TUniChDb::IsPrivateUse | ( | const int | cp | ) | const [inline] |
| bool TUniChDb::IsSbFlag | ( | const int | cp, |
| const TUniChFlags | flag | ||
| ) | const [inline] |
| bool TUniChDb::IsSurrogate | ( | const int | cp | ) | const [inline] |
| bool TUniChDb::IsWbFlag | ( | const int | cp, |
| const TUniChFlags | flag | ||
| ) | const [inline] |
| static bool TUniChDb::IsWbIgnored | ( | const TUniChInfo & | ci | ) | [inline, static, protected] |
Definition at line 1419 of file unicode.h.
{ return ci.IsGbExtend() || ci.IsWbFormat(); }
| bool TUniChDb::IsWbIgnored | ( | const int | cp | ) | const [inline, protected] |
| void TUniChDb::Load | ( | TSIn & | SIn | ) | [inline] |
Definition at line 1285 of file unicode.h.
{
h.Load(SIn); charNames.~TStrPool(); new (&charNames) TStrPool(SIn);
decompositions.Load(SIn);
inverseDec.Load(SIn); caseFolding.Load(SIn); scripts.Load(SIn);
specialCasingLower.Load(SIn); specialCasingUpper.Load(SIn); specialCasingTitle.Load(SIn);
SIn.LoadCs(); InitAfterLoad(); }
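In typical use the database is deserialized from a previously generated binary dump rather than rebuilt from the UCD text files. A minimal sketch; the file name is a placeholder:

  TUniChDb db;
  { PSIn SIn = TFIn::New("UnicodeDefs.bin"); db.Load(*SIn); }
  // or, equivalently, via the convenience wrapper:
  // db.LoadBin("UnicodeDefs.bin");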
| void TUniChDb::LoadBin | ( | const TStr & | fnBin | ) | [inline] |
| void TUniChDb::LoadTxt | ( | const TStr & | basePath | ) |
Definition at line 1253 of file unicode.cpp.
{
Clr();
// Set up a hash table with enough slots that there will be more or less no chains longer than 1 element.
h = THash<TInt, TUniChInfo>(196613, true);
//
caseFolding.LoadTxt(CombinePath(basePath, GetCaseFoldingFn()));
//
TUcdFileReader reader; TStrV fields; TIntH seen;
reader.Open(CombinePath(basePath, GetUnicodeDataFn()));
while (reader.GetNextLine(fields))
{
// Codepoint.
int cp = reader.ParseCodePoint(fields[0]);
IAssert(! seen.IsKey(cp)); seen.AddKey(cp);
TUniChInfo& ci = h.AddDat(cp);
// Name.
ci.nameOffset = charNames.AddStr(fields[1]);
// Category.
TStr& s = fields[2]; IAssert(s.Len() == 2);
ci.chCat = s[0]; ci.chSubCat = s[1];
// Canonical combining class.
s = fields[3]; IAssert(s.Len() > 0);
int i; bool ok = s.IsInt(true, TUCh::Mn, TUCh::Mx, i); IAssertR(ok, s);
ci.combClass = (uchar) i;
// Decomposition type and mapping.
LoadTxt_ProcessDecomposition(ci, fields[5]);
// Simple case mappings.
s = fields[12]; ci.simpleUpperCaseMapping = (! s.Empty() ? reader.ParseCodePoint(s) : -1);
s = fields[13]; ci.simpleLowerCaseMapping = (! s.Empty() ? reader.ParseCodePoint(s) : -1);
s = fields[14]; ci.simpleTitleCaseMapping = (! s.Empty() ? reader.ParseCodePoint(s) : -1);
//
ci.InitAfterLoad(); // initializes ci.cat, ci.subCat
}
reader.Close();
//
InitScripts(basePath);
//
InitPropList(basePath);
InitDerivedCoreProperties(basePath);
InitLineBreaks(basePath);
InitSpecialCasing(basePath);
// Process the composition exclusions (UAX #15, sec. 6).
for (int i = h.FFirstKeyId(); h.FNextKeyId(i); )
{
TUniChInfo& ci = h[i];
int ofs = ci.decompOffset; if (ofs < 0) continue;
int n = 0; while (decompositions[ofs + n] >= 0) n++;
IAssert(n > 0);
// Singleton decompositions.
if (n == 1) { ci.flags |= ucfCompositionExclusion; continue; }
// Non-starter decompositions.
int cp1 = decompositions[ofs];
IAssert(h.IsKey(cp1));
uchar ccc = h.GetDat(cp1).combClass;
if (ccc != TUniChInfo::ccStarter) { ci.flags |= ucfCompositionExclusion; continue; }
}
// Process the composition exclusion table.
reader.Open(CombinePath(basePath, GetCompositionExclusionsFn()));
int nExclusionTable = 0;
while (reader.GetNextLine(fields))
{
IAssert(fields.Len() == 1);
int cp = reader.ParseCodePoint(fields[0]);
int i = h.GetKeyId(cp); IAssert(i >= 0);
h[i].flags |= ucfCompositionExclusion;
nExclusionTable++;
}
reader.Close();
// Prepare the inverted index for composition pairs.
for (int i = h.FFirstKeyId(); h.FNextKeyId(i); )
{
int cp = h.GetKey(i);
TUniChInfo& ci = h[i];
int ofs = ci.decompOffset; if (ofs < 0) continue;
if (ci.IsCompositionExclusion()) continue;
if (ci.IsCompatibilityDecomposition()) continue;
int n = 0; while (decompositions[ofs + n] >= 0) n++;
if (n != 2) continue;
TIntPr pr = TIntPr(decompositions[ofs], decompositions[ofs + 1]);
IAssert(! inverseDec.IsKey(pr));
IAssert(ci.combClass == TUniChInfo::ccStarter);
inverseDec.AddDat(pr, cp);
}
printf("TUniChDb(%s): %d chars in h, %d in decomp inverse index; %d in decomp vector; %d in exclusion table\n",
basePath.CStr(), h.Len(), inverseDec.Len(), decompositions.Len(), nExclusionTable);
// Before calling InitWordAndSentenceBoundaryFlags(), scripts must have been initialized, as well as
// flags such as Alphabetic, Word_Break, and Grapheme_Extend.
InitWordAndSentenceBoundaryFlags(basePath);
// Make sure that Hangul combined characters are treated as starters.
for (int cp = HangulSBase; cp < HangulSBase + HangulSCount; cp++)
{
int j = h.GetKeyId(cp); if (j < 0) continue;
TUniChInfo& ci = h[j];
if (ci.combClass == TUniChInfo::ccInvalid) ci.combClass = TUniChInfo::ccStarter;
IAssert(ci.combClass == TUniChInfo::ccStarter);
}
// There should be no more additions to 'h' beyond this point.
const int oldHLen = h.Len();
// Provide default (identity) case mappings if any were missing from UnicodeData.txt
// (or if any entirely new characters were added later, e.g. while reading LineBreak.txt).
int scriptUnknown = GetScriptByName(GetScriptNameUnknown());
for (int i = h.FFirstKeyId(); h.FNextKeyId(i); )
{
int cp = h.GetKey(i); TUniChInfo &ci = h[i];
if (ci.simpleLowerCaseMapping < 0) ci.simpleLowerCaseMapping = cp;
if (ci.simpleUpperCaseMapping < 0) ci.simpleUpperCaseMapping = cp;
if (ci.simpleTitleCaseMapping < 0) ci.simpleTitleCaseMapping = cp;
if (ci.script < 0) ci.script = scriptUnknown;
}
IAssert(h.Len() == oldHLen);
}
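LoadTxt() is the expensive one-time path: it parses the UCD text files found under basePath and is typically followed by SaveBin() so that later runs can use the much faster binary loader. A minimal sketch; the directory and file names are placeholders:

  TUniChDb db;
  db.LoadTxt("path/to/ucd");        // reads UnicodeData.txt, Scripts.txt, PropList.txt, ...
  db.SaveBin("UnicodeDefs.bin");    // later runs can simply call LoadBin()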
| void TUniChDb::LoadTxt_ProcessDecomposition | ( | TUniChInfo & | ci, |
| TStr | s | ||
| ) | [protected] |
Definition at line 941 of file unicode.cpp.
{
if (s.Empty()) return;
if (s[0] == '<') {
int i = s.SearchCh('>'); IAssert(i > 0);
ci.flags |= ucfCompatibilityDecomposition;
s = s.GetSubStr(i + 1, s.Len() - 1); s.ToTrunc(); }
TIntV dec; TUcdFileReader::ParseCodePointList(s, dec);
IAssert(dec.Len() > 0);
ci.decompOffset = decompositions.Len();
decompositions.AddV(dec); decompositions.Add(-1);
}
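The string passed in is the decomposition field of UnicodeData.txt; a leading tag in angle brackets marks a compatibility decomposition, otherwise the mapping is canonical. Illustrative values of that field:

  0041 0300                   (canonical, e.g. for U+00C0: A + COMBINING GRAVE ACCENT)
  <compat> 0066 0066 006C     (compatibility, e.g. for U+FB04, LATIN SMALL LIGATURE FFL)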
| void TUniChDb::PrintCharNames | ( | FILE * | f, |
| const TSrcVec & | src, | ||
| size_t | srcIdx, | ||
| const size_t | srcCount, | ||
| const TStr & | prefix | ||
| ) | const [inline] |
Definition at line 1336 of file unicode.h.
{
if (! f) f = stdout;
for (const size_t srcEnd = srcIdx + srcCount; srcIdx < srcEnd; srcIdx++) {
fprintf(f, "%s", prefix.CStr());
int cp = src[TVecIdx(srcIdx)]; fprintf(f, (cp >= 0x10000 ? "U+%05x" : "U+%04x "), cp);
fprintf(f, " %s\n", GetCharNameS(cp).CStr()); }}
| void TUniChDb::PrintCharNames | ( | FILE * | f, |
| const TSrcVec & | src, | ||
| const TStr & | prefix | ||
| ) | const [inline] |
Definition at line 1342 of file unicode.h.
{ PrintCharNames(f, src, 0, src.Len(), prefix); }
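A minimal usage sketch for this overload, assuming a loaded database db; the codepoints are arbitrary examples:

  TIntV v; v.Add(0x41); v.Add(0xDF); v.Add(0x30A2);
  db.PrintCharNames(stdout, v, "  ");   // one "U+xxxx NAME" line per codepoint, prefixed by two spaces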
| void TUniChDb::Save | ( | TSOut & | SOut | ) | const [inline] |
Definition at line 1280 of file unicode.h.
{
h.Save(SOut); charNames.Save(SOut); decompositions.Save(SOut);
inverseDec.Save(SOut); caseFolding.Save(SOut); scripts.Save(SOut);
specialCasingLower.Save(SOut); specialCasingUpper.Save(SOut); specialCasingTitle.Save(SOut);
SOut.SaveCs(); }
| void TUniChDb::SaveBin | ( | const TStr & | fnBinUcd | ) |
Definition at line 1366 of file unicode.cpp.
{
PSOut SOut=TFOut::New(fnBinUcd);
Save(*SOut);
}
| void TUniChDb::SbEx_Add | ( | const TSrcVec & | v | ) | [inline] |
| void TUniChDb::SbEx_Add | ( | const TStr & | s | ) | [inline] |
| int TUniChDb::SbEx_AddMulti | ( | const TStr & | words, |
| const bool | wordsAreUtf8 = true |
||
| ) | [inline] |
Definition at line 1495 of file unicode.h.
{ TStrV vec; words.SplitOnAllCh('|', vec);
for (int i = 0; i < vec.Len(); i++) if (wordsAreUtf8) SbEx_AddUtf8(vec[i]); else SbEx_Add(vec[i]);
return vec.Len(); }
| void TUniChDb::SbEx_AddUtf8 | ( | const TStr & | s | ) | [inline] |
| void TUniChDb::SbEx_Clr | ( | ) | [inline] |
| void TUniChDb::SbEx_Set | ( | const TUniTrie< TInt > & | newTrie | ) | [inline] |
| int TUniChDb::SbEx_SetStdEnglish | ( | ) | [inline] |
Definition at line 1499 of file unicode.h.
{
static const TStr data = "Ms|Mrs|Mr|Rev|Dr|Prof|Gov|Sen|Rep|Gen|Brig|Col|Capt|Lieut|Lt|Sgt|Pvt|Cmdr|Adm|Corp|St|Mt|Ft|e.g|e. g.|i.e|i. e|ib|ibid|s.v|s. v|s.vv|s. vv";
SbEx_Clr(); return SbEx_AddMulti(data, false); }
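The sentence-boundary exception trie suppresses sentence breaks after the listed abbreviations. A minimal sketch, assuming a loaded database db; the extra abbreviations are arbitrary examples:

  db.SbEx_Clr();
  db.SbEx_SetStdEnglish();                     // the built-in English abbreviations shown above
  db.SbEx_AddMulti("approx|resp|viz", false);  // add a few more, '|'-separated, plain (non-UTF-8) strings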
| void TUniChDb::Test | ( | const TStr & | basePath | ) |
Definition at line 1381 of file unicode.cpp.
{
TStr fnBin = CombinePath(basePath, GetBinFn());
if (true || ! TFile::Exists(fnBin))
{
// Test LoadTxt.
LoadTxt(basePath);
// Test Save.
{PSOut SOut = TFOut::New(fnBin);
Save(*SOut);}
}
// Test Load.
this->~TUniChDb();
new(this) TUniChDb();
{PSIn SIn = TFIn::New(fnBin);
Load(*SIn);}
// Test the case folding.
caseFolding.Test();
// Test the word breaking.
TestWbFindNonIgnored();
// Test the sentence breaking.
TestFindNextWordOrSentenceBoundary(basePath, true);
TestFindNextWordOrSentenceBoundary(basePath, false);
// Test composition and decomposition.
TestComposition(basePath);
// Test the case conversions.
TestCaseConversions();
}
| void TUniChDb::TestCaseConversion | ( | const TStr & | source, |
| const TStr & | trueLc, | ||
| const TStr & | trueTc, | ||
| const TStr & | trueUc, | ||
| bool | turkic, | ||
| bool | lithuanian | ||
| ) | [protected] |
Definition at line 829 of file unicode.cpp.
{
TIntV src;
TUcdFileReader::ParseCodePointList(source, src);
FILE *f = stderr;
for (int i = 0; i < 3; i++)
{
TCaseConversion how = (i == 0) ? ccLower : (i == 1) ? ccTitle : ccUpper;
const TStr &trueDestS = (how == ccLower ? trueLc : how == ccTitle ? trueTc : trueUc);
TIntV trueDest; TUcdFileReader::ParseCodePointList(trueDestS, trueDest);
TIntV dest;
GetCaseConverted(src, 0, src.Len(), dest, true, how, turkic, lithuanian);
bool ok = (dest.Len() == trueDest.Len());
if (ok) for (int i = 0; i < dest.Len() && ok; i++) ok = ok && (dest[i] == trueDest[i]);
if (ok) continue;
fprintf(f, "%s(", (how == ccLower ? "toLowercase" : how == ccTitle ? "toTitlecase" : "toUppercase"));
for (int i = 0; i < src.Len(); i++) fprintf(f, "%s%04x", (i == 0 ? "" : " "), int(src[i]));
fprintf(f, ")\nCorrect: (");
for (int i = 0; i < trueDest.Len(); i++) fprintf(f, "%s%04x", (i == 0 ? "" : " "), int(trueDest[i]));
fprintf(f, ")\nOur output:(");
for (int i = 0; i < dest.Len(); i++) fprintf(f, "%s%04x", (i == 0 ? "" : " "), int(dest[i]));
fprintf(f, ")\n");
IAssert(ok);
}
}
| void TUniChDb::TestCaseConversions | ( | ) | [protected] |
Definition at line 857 of file unicode.cpp.
{
// Because no thorough case-conversion test files have been provided as part
// of the Unicode standard, we'll have to test things on a few test cases of our own.
// - First, test some unconditional special mappings, such as 'ss', 'ffl', 'dz', etc.
const TStr F = "0046 ", L = "004C ", S = "0053 ", T = "0054 ", W = "0057 ";
const TStr f = "0066 ", l = "006c ", s = "0073 ", t = "0074 ", w = "0077 ";
const TStr ss = "00df ", ffl = "fb04 ", longs = "017f ", longst = "fb05 ", wRing = "1e98 ", Ring = "030a ";
const TStr DZ = "01c4 ", Dz = "01c5 ", dz = "01c6 ";
const TStr space = "0020 ", Grave = "0300 ";
TestCaseConversion(
F + L + s + t + space + Dz + w + T + ss + wRing + space + longs + DZ + space + dz + longst, // source
f + l + s + t + space + dz + w + t + ss + wRing + space + longs + dz + space + dz + longst, // lowercase
F + l + s + t + space + Dz + w + t + ss + wRing + space + S + dz + space + Dz + longst, // titlecase
F + L + S + T + space + DZ + W + T + S + S + W + Ring + space + S + DZ + space + DZ + S + T, // uppercase
false, false);
// - Dotted I, dotless i, etc., but with turkic == false.
const TStr I = "0049 ", J = "004a ", i = "0069 ", j = "006a ", iDotless = "0131 ", IDot = "0130 ", DotA = "0307 ";
TestCaseConversion(
s + I + t + i + w + iDotless + f + IDot + l + space + iDotless + DotA + f + I + DotA + s, // source
s + i + t + i + w + iDotless + f + i + DotA + l + space + iDotless + DotA + f + i + DotA + s, // lowercase
S + i + t + i + w + iDotless + f + i + DotA + l + space + I + DotA + f + i + DotA + s, // titlecase
S + I + T + I + W + I + F + IDot + L + space + I + DotA + F + I + DotA + S, // uppercase
false, false);
// - Sigma (final vs. non-final forms).
const TStr Sigma = "03a3 ", sigma = "03c3 ", fsigma = "03c2 ";
TestCaseConversion(
Sigma + s + space + s + Sigma + space + s + Sigma + s + space + Sigma + S + Sigma + space + Sigma, // source
sigma + s + space + s + fsigma + space + s + sigma + s + space + sigma + s + fsigma + space + sigma, // lowercase
Sigma + s + space + S + fsigma + space + S + sigma + s + space + Sigma + s + fsigma + space + Sigma, // titlecase
Sigma + S + space + S + Sigma + space + S + Sigma + S + space + Sigma + S + Sigma + space + Sigma, // uppercase
false, false);
TestCaseConversion(
sigma + s + space + s + sigma + space + s + sigma + s + space + sigma + S + sigma + space + sigma, // source
sigma + s + space + s + sigma + space + s + sigma + s + space + sigma + s + sigma + space + sigma, // lowercase
Sigma + s + space + S + sigma + space + S + sigma + s + space + Sigma + s + sigma + space + Sigma, // titlecase
Sigma + S + space + S + Sigma + space + S + Sigma + S + space + Sigma + S + Sigma + space + Sigma, // uppercase
false, false);
TestCaseConversion(
fsigma + s + space + s + fsigma + space + s + fsigma + s + space + fsigma + S + fsigma + space + fsigma, // source
fsigma + s + space + s + fsigma + space + s + fsigma + s + space + fsigma + s + fsigma + space + fsigma, // lowercase
Sigma + s + space + S + fsigma + space + S + fsigma + s + space + Sigma + s + fsigma + space + Sigma, // titlecase
Sigma + S + space + S + Sigma + space + S + Sigma + S + space + Sigma + S + Sigma + space + Sigma, // uppercase
false, false);
const TStr nonSA = "0315 0321 0322 "; // characters that are neither ccStarter nor ccAbove
// Special case mappings for Turkic languages:
// - After_I
TestCaseConversion(
s + I + t + i + w + iDotless + f + IDot + l + space + iDotless + DotA + f + I + DotA + J + DotA + I + Grave + DotA + I + DotA + DotA + I + nonSA + DotA + s, // source
s + iDotless + t + i + w + iDotless + f + i + l + space + iDotless + DotA + f + i + j + DotA + iDotless + Grave + DotA + i + DotA + i + nonSA + s, // lowercase
S + iDotless + t + i + w + iDotless + f + i + l + space + I + DotA + f + i + j + DotA + iDotless + Grave + DotA + i + DotA + i + nonSA + s, // titlecase
S + I + T + IDot + W + I + F + IDot + L + space + I + DotA + F + I + DotA + J + DotA + I + Grave + DotA + I + DotA + DotA + I + nonSA + DotA + S, // uppercase
true, false); // turkic
// - Not_Before_Dot
TestCaseConversion(
I + Grave + t + I + DotA + f + I + nonSA + DotA + j + space + I + nonSA + DotA + space + I + Grave + t, // source
iDotless + Grave + t + i + f + i + nonSA + j + space + i + nonSA + space + iDotless + Grave + t, // lowercase
I + Grave + t + i + f + i + nonSA + j + space + I + nonSA + DotA + space + I + Grave + t, // titlecase
I + Grave + T + I + DotA + F + I + nonSA + DotA + J + space + I + nonSA + DotA + space + I + Grave + T, // uppercase
true, false); // turkic
// Special case mappings for Lithuanian:
// - After_Soft_Dotted [note: I + DotA turns into i + DotA + DotA when lowercasing due to More_Above]
TestCaseConversion(
i + DotA + t + i + Grave + DotA + f + i + DotA + DotA + f + i + nonSA + DotA + I + DotA + t + DotA + i + DotA + Grave, // source
i + DotA + t + i + Grave + DotA + f + i + DotA + DotA + f + i + nonSA + DotA + i + DotA + DotA + t + DotA + i + DotA + Grave, // lowercase
I + t + i + Grave + DotA + f + i + DotA + DotA + f + i + nonSA + DotA + i + DotA + DotA + t + DotA + i + DotA + Grave, // titlecase
I + T + I + Grave + DotA + F + I + DotA + F + I + nonSA + I + DotA + T + DotA + I + Grave, // uppercase
false, true); // lithuanian
// - More_Above [note: j + DotA turns into just J when uppercasing due to After_Soft_Dotted]
TestCaseConversion(
J + Grave + space + J + nonSA + DotA + space + j + Grave + space + j + DotA + space + J + nonSA + J + nonSA + Grave + space + j + nonSA, // source
j + DotA + Grave + space + j + DotA + nonSA + DotA + space + j + Grave + space + j + DotA + space + j + nonSA + j + DotA + nonSA + Grave + space + j + nonSA, // lowercase
J + Grave + space + J + nonSA + DotA + space + J + Grave + space + J + space + J + nonSA + j + DotA + nonSA + Grave + space + J + nonSA, // titlecase
J + Grave + space + J + nonSA + DotA + space + J + Grave + space + J + space + J + nonSA + J + nonSA + Grave + space + J + nonSA, // uppercase
false, true); // lithuanian
// SoftDotted [^ Starter Above]* 0307 --(uc,tc)--> without 0307
//TestCaseConversion("", "", "", "", false, false);
}
| void TUniChDb::TestComposition | ( | const TStr & | basePath | ) | [protected] |
Definition at line 749 of file unicode.cpp.
{
TUcdFileReader reader; TStrV fields; int nLines = 0;
reader.Open(CombinePath(basePath, GetNormalizationTestFn()));
bool inPart1 = false; TIntH testedInPart1;
while (reader.GetNextLine(fields))
{
nLines += 1;
if (fields.Len() == 1) {
IAssert(fields[0].IsPrefix("@Part"));
inPart1 = (fields[0] == "@Part1"); continue; }
IAssert(fields.Len() == 6);
IAssert(fields[5].Len() == 0);
TIntV c1, c2, c3, c4, c5;
reader.ParseCodePointList(fields[0], c1);
reader.ParseCodePointList(fields[1], c2);
reader.ParseCodePointList(fields[2], c3);
reader.ParseCodePointList(fields[3], c4);
reader.ParseCodePointList(fields[4], c5);
TIntV v;
#define AssE_(v1, v2, expl) AssertEq(v1, v2, TStr(expl) + " (line " + TInt::GetStr(nLines) + ")", 0)
#define NFC_(cmpWith, operand) DecomposeAndCompose(operand, 0, operand.Len(), v, false); AssE_(cmpWith, v, #cmpWith " == NFC(" #operand ")")
#define NFD_(cmpWith, operand) Decompose(operand, 0, operand.Len(), v, false); AssE_(cmpWith, v, #cmpWith " == NFD(" #operand ")")
#define NFKC_(cmpWith, operand) DecomposeAndCompose(operand, 0, operand.Len(), v, true); AssE_(cmpWith, v, #cmpWith " == NFKC(" #operand ")")
#define NFKD_(cmpWith, operand) Decompose(operand, 0, operand.Len(), v, true); AssE_(cmpWith, v, #cmpWith " == NFKD(" #operand ")")
// NFD:
NFD_(c3, c1); // c3 == NFD(c1)
NFD_(c3, c2); // c3 == NFD(c2)
NFD_(c3, c3); // c3 == NFD(c3)
NFD_(c5, c4); // c5 == NFD(c4)
NFD_(c5, c5); // c5 == NFD(c5)
// NFC:
NFC_(c2, c1); // c2 == NFC(c1)
NFC_(c2, c2); // c2 == NFC(c2)
NFC_(c2, c3); // c2 == NFC(c3)
NFC_(c4, c4); // c4 == NFC(c4)
NFC_(c4, c5); // c4 == NFC(c5)
// NFKD:
NFKD_(c5, c1); // c5 == NFKD(c1)
NFKD_(c5, c2); // c5 == NFKD(c2)
NFKD_(c5, c3); // c5 == NFKD(c3)
NFKD_(c5, c4); // c5 == NFKD(c4)
NFKD_(c5, c5); // c5 == NFKD(c5)
// NFKC:
NFKC_(c4, c1); // c4 == NFKC(c1)
NFKC_(c4, c2); // c4 == NFKC(c2)
NFKC_(c4, c3); // c4 == NFKC(c3)
NFKC_(c4, c4); // c4 == NFKC(c4)
NFKC_(c4, c5); // c4 == NFKC(c5)
//
if (inPart1) {
IAssert(c1.Len() == 1);
testedInPart1.AddKey(c1[0]); }
}
reader.Close();
// Test other individual codepoints that were not mentioned in part 1.
int nOther = 0;
for (int i = h.FFirstKeyId(); h.FNextKeyId(i); )
{
const int cp = h.GetKey(i), nLines = -1;
if (testedInPart1.IsKey(cp)) continue;
TIntV x, v; x.Add(cp);
NFC_(x, x); // x == NFC(x)
NFD_(x, x); // x == NFD(x)
NFKC_(x, x); // x == NFKC(x)
NFKD_(x, x); // x == NFKD(x)
nOther += 1;
}
#undef AssE_
#undef NFC_
#undef NFD_
#undef NFKC_
#undef NFKD_
printf("TUniChDb::TestComposition: %d lines processed + %d other individual codepoints.\n", nLines, nOther);
}
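The normalization entry points exercised by this test can also be called directly. A minimal sketch on a loaded database db: Decompose() with compatibility set to false or true yields NFD or NFKD, and (assuming the DecomposeAndCompose() member used in the macros above is public) the composed forms NFC and NFKC:

  TIntV src; src.Add(0xC0);      // U+00C0 LATIN CAPITAL LETTER A WITH GRAVE
  TIntV nfd, nfc;
  db.Decompose(src, nfd, false);                            // NFD: 0041 0300
  db.DecomposeAndCompose(src, 0, src.Len(), nfc, false);    // NFC: 00C0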
| void TUniChDb::TestFindNextWordOrSentenceBoundary | ( | const TStr & | basePath, |
| bool | sentence | ||
| ) | [protected] |
Definition at line 653 of file unicode.cpp.
{
TUcdFileReader reader; TStrV fields;
reader.Open(CombinePath(CombinePath(basePath, GetAuxiliaryDir()), (sentence ? GetSentenceBreakTestFn() : GetWordBreakTestFn())));
int nLines = 0; TRnd rnd = TRnd(123);
while (reader.GetNextLine(fields))
{
nLines += 1;
IAssert(fields.Len() == 1);
TStrV parts; fields[0].SplitOnWs(parts);
const int n = parts.Len(); IAssert((n % 2) == 1);
TIntV chars; TBoolV isBreak, isPredicted, isPredicted2;
// Each line is a sequence of codepoints, with a \times or \div in between each
// pair of codepoints (as well as at the beginning and the end of the sequence) to
// indicate whether a boundary exists there or not.
for (int i = 0; i < n; i++)
{
const TStr& s = parts[i];
if ((i % 2) == 0) {
if (s == "\xc3\x97") // multiplication sign (U+00D7) in UTF-8
isBreak.Add(false);
else if (s == "\xc3\xb7") // division sign (U+00F7) in UTF-8
isBreak.Add(true);
else FailR(s.CStr()); }
else chars.Add(reader.ParseCodePoint(s));
}
const int m = n / 2; IAssert(chars.Len() == m); IAssert(isBreak.Len() == m + 1);
IAssert(isBreak[0]); IAssert(isBreak[m]);
isPredicted.Gen(m + 1); isPredicted.PutAll(false);
if (AlwaysFalse()) { printf("%3d", nLines); for (int i = 0; i < m; i++) printf(" %04x", int(chars[i])); printf("\n"); }
// We'll insert a few random characters at the beginning of the sequence
// so that srcPos doesn't always begin at 0.
for (int nBefore = 0; nBefore < 5; nBefore++)
{
TIntV chars2; for (int i = 0; i < nBefore; i++) chars2.Add(0, rnd.GetUniDevInt(0x10ffff + 1));
chars2.AddV(chars);
// Use FindNextBoundary to find all the word boundaries.
size_t position = (nBefore > 0 ? nBefore - 1 : nBefore); size_t prevPosition = position;
while (sentence ? FindNextSentenceBoundary(chars2, nBefore, m, position) : FindNextWordBoundary(chars2, nBefore, m, position))
{
IAssert(prevPosition < position);
IAssert(position <= size_t(nBefore + m));
isPredicted[int(position) - nBefore] = true;
prevPosition = position;
}
IAssert(position == size_t(nBefore + m));
if (sentence) FindSentenceBoundaries(chars2, nBefore, m, isPredicted2);
else FindWordBoundaries(chars2, nBefore, m, isPredicted2);
IAssert(isPredicted2.Len() == m + 1);
bool ok = true;
// If we start at 0, the word boundary at the beginning of the sequence was
// not found explicitly, so we'll add it now.
if (nBefore == 0) isPredicted[0] = true;
// Compare the predicted and the true boundaries.
for (int i = 0; i <= m; i++) {
if (isBreak[i] != isPredicted[i]) ok = false;
IAssert(isPredicted2[i] == isPredicted[i]); }
FILE *f = stderr;
if (! ok)
{
fprintf(f, "\nError in line %d:\n", nLines);
fprintf(f, "True: ");
for (int i = 0; i <= m; i++) {
fprintf(f, "%s ", (isBreak[i] ? "|" : "."));
if (i < m) fprintf(f, "%04x ", int(chars[i + nBefore])); }
fprintf(f, "\nPredicted: ");
for (int i = 0; i <= m; i++) {
fprintf(f, "%s ", (isPredicted[i] ? "|" : "."));
if (i < m) {
const int cp = chars[i + nBefore];
TStr s = sentence ? TUniChInfo::GetSbFlagsStr(GetSbFlags(cp)) : TUniChInfo::GetWbFlagsStr(GetWbFlags(cp));
if (IsWbIgnored(cp)) s = "*" + s;
fprintf(f, "%4s ", s.CStr()); }}
fprintf(f, "\n");
Fail;
}
// Test FindNextBoundary if we start in the middle of the sequence,
// i.e. not at an existing boundary.
for (int i = 0; i < m; i++) {
position = i + nBefore; bool ok = sentence ? FindNextSentenceBoundary(chars2, nBefore, m, position) : FindNextWordBoundary(chars2, nBefore, m, position);
IAssert(ok); // at the very least, there should be the 'boundary' at nBefore + m
IAssert(size_t(i + nBefore) < position); IAssert(position <= size_t(nBefore + m));
position -= nBefore;
for (int j = i + 1; j < int(position); j++)
IAssert(! isBreak[j]);
IAssert(isBreak[int(position)]); }
}
}
reader.Close();
printf("TUniChDb::TestFindNext%sBoundary: %d lines processed.\n", (sentence ? "Sentence" : "Word"), nLines);
}
| void TUniChDb::TestWbFindNonIgnored | ( | const TIntV & | src | ) | const [protected] |
Definition at line 583 of file unicode.cpp.
{
int n = src.Len();
TBoolV isIgnored; isIgnored.Gen(n);
for (int i = 0; i < n; i++) isIgnored[i] = IsWbIgnored(src[i]);
TIntV prevNonIgnored, nextNonIgnored, curOrNextNonIgnored;
prevNonIgnored.Gen(n); nextNonIgnored.Gen(n); curOrNextNonIgnored.Gen(n);
FILE *f = 0; // stderr;
for (int srcIdx = 0; srcIdx < n; srcIdx++) for (int srcLen = 1; srcLen < n - srcIdx; srcLen++)
{
int prev = -1;
for (int i = 0; i < srcLen; i++) {
prevNonIgnored[i] = prev;
if (! isIgnored[srcIdx + i]) prev = srcIdx + i; }
int next = srcIdx + srcLen;
for (int i = srcLen - 1; i >= 0; i--) {
nextNonIgnored[i] = next;
if (! isIgnored[srcIdx + i]) next = srcIdx + i;
curOrNextNonIgnored[i] = next; }
if (f) {
fprintf(f, "\nIndex: "); for (int i = 0; i < srcLen; i++) fprintf(f, " %2d", srcIdx + i);
fprintf(f, "\nNonIgn: "); for (int i = 0; i < srcLen; i++) fprintf(f, " %s", (isIgnored[srcIdx + i] ? " ." : " Y"));
fprintf(f, "\nPrevNI: "); for (int i = 0; i < srcLen; i++) fprintf(f, " %2d", int(prevNonIgnored[i]));
fprintf(f, "\nNextNI: "); for (int i = 0; i < srcLen; i++) fprintf(f, " %2d", int(nextNonIgnored[i]));
fprintf(f, "\nCurNextNI: "); for (int i = 0; i < srcLen; i++) fprintf(f, " %2d", int(curOrNextNonIgnored[i]));
fprintf(f, "\n"); }
for (int i = 0; i < srcLen; i++)
{
size_t s;
s = size_t(srcIdx + i); WbFindNextNonIgnored(src, s, size_t(srcIdx + srcLen));
IAssert(s == size_t(nextNonIgnored[i]));
s = size_t(srcIdx + i); WbFindCurOrNextNonIgnored(src, s, size_t(srcIdx + srcLen));
IAssert(s == size_t(curOrNextNonIgnored[i]));
s = size_t(srcIdx + i); bool ok = WbFindPrevNonIgnored(src, size_t(srcIdx), s);
if (prevNonIgnored[i] < 0) { IAssert(! ok); IAssert(s == size_t(srcIdx)); }
else { IAssert(ok); IAssert(s == size_t(prevNonIgnored[i])); }
}
}
}
| void TUniChDb::TestWbFindNonIgnored | ( | ) | const [protected] |
Definition at line 623 of file unicode.cpp.
{
TIntV chIgnored, chNonIgnored;
FILE *f = 0; // stderr;
for (int i = h.FFirstKeyId(); h.FNextKeyId(i); ) {
const int cp = h.GetKey(i); const TUniChInfo& ci = h[i];
if (f) fprintf(f, "%04x: flags %08x props %08x %08x script \"%s\"\n", cp,
ci.flags, ci.properties, ci.propertiesX, GetScriptName(ci.script).CStr());
(IsWbIgnored(h[i]) ? chIgnored : chNonIgnored).Add(h.GetKey(i));
}
chIgnored.Sort(); chNonIgnored.Sort();
printf("TUniChDb::TestWbNonIgnored: %d ignored, %d nonignored chars.\n", chIgnored.Len(), chNonIgnored.Len());
TRnd rnd = TRnd(123);
for (int iter = 0; iter <= 50; iter++)
{
int percIgnored = 2 * iter;
for (int n = 0; n <= 20; n++)
{
// Prepare a random sequence of 'n' codepoints.
TIntV v; v.Gen(n, 0); // reserve space for 'n' codepoints
for (int i = 0; i < n; i++) {
TIntV& chars = (rnd.GetUniDevInt(100) < percIgnored) ? chIgnored : chNonIgnored;
int j = rnd.GetUniDevInt(chars.Len());
v.Add(chars[j]); }
// Run the tests with this sequence.
TestWbFindNonIgnored(v);
}
}
}
| void TUniChDb::ToCaseFolded | ( | TSrcVec & | src, |
| size_t | srcIdx, | ||
| const size_t | srcCount, | ||
| const bool | turkic = false |
||
| ) | const [inline] |
Definition at line 1636 of file unicode.h.
{ caseFolding.FoldInPlace(src, srcIdx, srcCount, turkic); }
| void TUniChDb::ToCaseFolded | ( | TSrcVec & | src, |
| const bool | turkic = false |
||
| ) | const [inline] |
Definition at line 1637 of file unicode.h.
{ ToCaseFolded(src, 0, src.Len(), turkic); }
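Case folding is intended for case-insensitive comparison and lookup keys; it rewrites the vector in place. A minimal sketch on a loaded database db:

  TIntV a; a.Add(0x49); a.Add(0x53); a.Add(0x53); a.Add(0x4E);   // "ISSN"
  TIntV b; b.Add(0x69); b.Add(0x73); b.Add(0x73); b.Add(0x6E);   // "issn"
  db.ToCaseFolded(a); db.ToCaseFolded(b);
  bool sameIgnoringCase = (a == b);   // both vectors now hold the folded form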
| void TUniChDb::ToSimpleCaseConverted | ( | TSrcVec & | src, |
| size_t | srcIdx, | ||
| const size_t | srcCount, | ||
| const TCaseConversion | how | ||
| ) | const |
Definition at line 3066 of file unicode.h.
{
bool seenCased = false; size_t nextWordBoundary = srcIdx;
for (const size_t origSrcIdx = srcIdx, srcEnd = srcIdx + srcCount; srcIdx < srcEnd; srcIdx++)
{
const int cp = src[TVecIdx(srcIdx)];
int i = h.GetKeyId(cp); if (i < 0) continue;
const TUniChInfo &ci = h[i];
// With titlecasing, the first cased character of each word must be put into titlecase,
// all others into lowercase. This is what the howHere variable is for.
TUniChDb::TCaseConversion howHere;
if (how != ccTitle) howHere = how;
else {
if (srcIdx == nextWordBoundary) { // A word starts/ends here.
seenCased = false;
size_t next = nextWordBoundary; FindNextWordBoundary(src, origSrcIdx, srcCount, next);
IAssert(next > nextWordBoundary); nextWordBoundary = next; }
bool isCased = IsCased(cp);
if (isCased && ! seenCased) { howHere = ccTitle; seenCased = true; }
else howHere = ccLower;
}
int cpNew = (howHere == ccTitle ? ci.simpleTitleCaseMapping : howHere == ccUpper ? ci.simpleUpperCaseMapping : ci.simpleLowerCaseMapping);
if (cpNew >= 0) src[TVecIdx(srcIdx)] = cpNew;
}
}
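The simple conversions use only the one-to-one mappings from UnicodeData.txt and rewrite the vector in place; with ccTitle, FindNextWordBoundary() is used, as shown above, to titlecase the first cased letter of each word and lowercase the rest. A minimal sketch on a loaded database db; the input is an arbitrary ASCII example:

  TIntV v;
  for (const char *p = "hello wORLD"; *p; p++) v.Add((int)(uchar)*p);
  db.ToSimpleTitleCase(v);   // "Hello World"
  db.ToSimpleUpperCase(v);   // "HELLO WORLD"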
| void TUniChDb::ToSimpleLowerCase | ( | TSrcVec & | src, |
| size_t | srcIdx, | ||
| const size_t | srcCount | ||
| ) | const [inline] |
Definition at line 1610 of file unicode.h.
{ ToSimpleCaseConverted(src, srcIdx, srcCount, ccLower); }
| void TUniChDb::ToSimpleLowerCase | ( | TSrcVec & | src | ) | const [inline] |
Definition at line 1613 of file unicode.h.
{ ToSimpleLowerCase(src, 0, src.Len()); }
| void TUniChDb::ToSimpleTitleCase | ( | TSrcVec & | src, |
| size_t | srcIdx, | ||
| const size_t | srcCount | ||
| ) | const [inline] |
Definition at line 1611 of file unicode.h.
{ ToSimpleCaseConverted(src, srcIdx, srcCount, ccTitle); }
| void TUniChDb::ToSimpleTitleCase | ( | TSrcVec & | src | ) | const [inline] |
Definition at line 1614 of file unicode.h.
{ ToSimpleTitleCase(src, 0, src.Len()); }
| void TUniChDb::ToSimpleUpperCase | ( | TSrcVec & | src, |
| size_t | srcIdx, | ||
| const size_t | srcCount | ||
| ) | const [inline] |
Definition at line 1609 of file unicode.h.
{ ToSimpleCaseConverted(src, srcIdx, srcCount, ccUpper); }
| void TUniChDb::ToSimpleUpperCase | ( | TSrcVec & | src | ) | const [inline] |
Definition at line 1612 of file unicode.h.
{ ToSimpleUpperCase(src, 0, src.Len()); }
| void TUniChDb::WbFindCurOrNextNonIgnored | ( | const TSrcVec & | src, |
| size_t & | position, | ||
| const size_t | srcEnd | ||
| ) | const [inline, protected] |
Definition at line 1422 of file unicode.h.
{
while (position < srcEnd && IsWbIgnored(src[TVecIdx(position)])) position++; }
| void TUniChDb::WbFindNextNonIgnored | ( | const TSrcVec & | src, |
| size_t & | position, | ||
| const size_t | srcEnd | ||
| ) | const [inline, protected] |
Definition at line 1425 of file unicode.h.
{
if (position >= srcEnd) return;
position++; while (position < srcEnd && IsWbIgnored(src[TVecIdx(position)])) position++; }
| void TUniChDb::WbFindNextNonIgnoredS | ( | const TSrcVec & | src, |
| size_t & | position, | ||
| const size_t | srcEnd | ||
| ) | const [inline, protected] |
Definition at line 1429 of file unicode.h.
{
if (position >= srcEnd) return;
if (IsSbSep(src[TVecIdx(position)])) { position++; return; }
position++; while (position < srcEnd && IsWbIgnored(src[TVecIdx(position)])) position++; }
| bool TUniChDb::WbFindPrevNonIgnored | ( | const TSrcVec & | src, |
| const size_t | srcStart, | ||
| size_t & | position | ||
| ) | const [inline, protected] |
Definition at line 1434 of file unicode.h.
{
if (position <= srcStart) return false;
while (position > srcStart) {
position--; if (! IsWbIgnored(src[TVecIdx(position)])) return true; }
return false; }
friend class TUniCaseFolding [friend] |
TUniTrie<TInt> TUniChDb::sbExTrie [protected] |