SNAP Library 2.0, User Reference
2013-05-13 16:33:57
SNAP, a general purpose, high performance system for analysis and manipulation of large networks
#include <unicode.h>
Classes
| class | TSubcatHelper |
| class | TUcdFileReader |
Public Types
| enum | { HangulSBase = 0xAC00, HangulLBase = 0x1100, HangulVBase = 0x1161, HangulTBase = 0x11A7, HangulLCount = 19, HangulVCount = 21, HangulTCount = 28, HangulNCount = HangulVCount * HangulTCount, HangulSCount = HangulLCount * HangulNCount } |
| enum | TCaseConversion_ { ccLower = 0, ccUpper = 1, ccTitle = 2, ccMax = 3 } |
| typedef enum TUniChDb::TCaseConversion_ | TCaseConversion |
Public Member Functions
| TUniChDb () | |
| TUniChDb (TSIn &SIn) | |
| void | Clr () |
| void | Save (TSOut &SOut) const |
| void | Load (TSIn &SIn) |
| void | LoadBin (const TStr &fnBin) |
| void | Test (const TStr &basePath) |
| const TStr & | GetScriptName (const int scriptId) const |
| int | GetScriptByName (const TStr &scriptName) const |
| int | GetScript (const TUniChInfo &ci) const |
| int | GetScript (const int cp) const |
| const char * | GetCharName (const int cp) const |
| TStr | GetCharNameS (const int cp) const |
| template<class TSrcVec > | |
| void | PrintCharNames (FILE *f, const TSrcVec &src, size_t srcIdx, const size_t srcCount, const TStr &prefix) const |
| template<class TSrcVec > | |
| void | PrintCharNames (FILE *f, const TSrcVec &src, const TStr &prefix) const |
| bool | IsGetChInfo (const int cp, TUniChInfo &ChInfo) |
| TUniChCategory | GetCat (const int cp) const |
| TUniChSubCategory | GetSubCat (const int cp) const |
| bool | IsWbFlag (const int cp, const TUniChFlags flag) const |
| int | GetWbFlags (const int cp) const |
| bool | IsSbFlag (const int cp, const TUniChFlags flag) const |
| int | GetSbFlags (const int cp) const |
| DECLARE_FORWARDED_PROPERTY_METHODS bool | IsPrivateUse (const int cp) const |
| bool | IsSurrogate (const int cp) const |
| int | GetCombiningClass (const int cp) const |
| template<typename TSrcVec > | |
| bool | FindNextWordBoundary (const TSrcVec &src, const size_t srcIdx, const size_t srcCount, size_t &position) const |
| template<typename TSrcVec > | |
| void | FindWordBoundaries (const TSrcVec &src, const size_t srcIdx, const size_t srcCount, TBoolV &dest) const |
| template<typename TSrcVec > | |
| bool | FindNextSentenceBoundary (const TSrcVec &src, const size_t srcIdx, const size_t srcCount, size_t &position) const |
| template<typename TSrcVec > | |
| void | FindSentenceBoundaries (const TSrcVec &src, const size_t srcIdx, const size_t srcCount, TBoolV &dest) const |
| void | SbEx_Clr () |
| template<class TSrcVec > | |
| void | SbEx_Add (const TSrcVec &v) |
| void | SbEx_Add (const TStr &s) |
| void | SbEx_AddUtf8 (const TStr &s) |
| int | SbEx_AddMulti (const TStr &words, const bool wordsAreUtf8=true) |
| void | SbEx_Set (const TUniTrie< TInt > &newTrie) |
| int | SbEx_SetStdEnglish () |
| template<typename TSrcVec , typename TDestCh > | |
| void | Decompose (const TSrcVec &src, size_t srcIdx, const size_t srcCount, TVec< TDestCh > &dest, bool compatibility, bool clrDest=true) const |
| template<typename TSrcVec , typename TDestCh > | |
| void | Decompose (const TSrcVec &src, TVec< TDestCh > &dest, bool compatibility, bool clrDest=true) const |
| template<typename TSrcVec , typename TDestCh > | |
| void | Compose (const TSrcVec &src, size_t srcIdx, const size_t srcCount, TVec< TDestCh > &dest, bool clrDest=true) const |
| template<typename TSrcVec , typename TDestCh > | |
| void | Compose (const TSrcVec &src, TVec< TDestCh > &dest, bool clrDest=true) const |
| template<typename TSrcVec , typename TDestCh > | |
| void | DecomposeAndCompose (const TSrcVec &src, size_t srcIdx, const size_t srcCount, TVec< TDestCh > &dest, bool compatibility, bool clrDest=true) const |
| template<typename TSrcVec , typename TDestCh > | |
| void | DecomposeAndCompose (const TSrcVec &src, TVec< TDestCh > &dest, bool compatibility, bool clrDest=true) const |
| template<typename TSrcVec , typename TDestCh > | |
| size_t | ExtractStarters (const TSrcVec &src, size_t srcIdx, const size_t srcCount, TVec< TDestCh > &dest, bool clrDest=true) const |
| template<typename TSrcVec , typename TDestCh > | |
| size_t | ExtractStarters (const TSrcVec &src, TVec< TDestCh > &dest, bool clrDest=true) const |
| template<typename TSrcVec > | |
| size_t | ExtractStarters (TSrcVec &src) const |
| void | LoadTxt (const TStr &basePath) |
| void | SaveBin (const TStr &fnBinUcd) |
| template<typename TSrcVec , typename TDestCh > | |
| void | GetCaseConverted (const TSrcVec &src, size_t srcIdx, const size_t srcCount, TVec< TDestCh > &dest, const bool clrDest, const TCaseConversion how, const bool turkic, const bool lithuanian) const |
| template<typename TSrcVec , typename TDestCh > | |
| void | GetLowerCase (const TSrcVec &src, size_t srcIdx, const size_t srcCount, TVec< TDestCh > &dest, const bool clrDest=true, const bool turkic=false, const bool lithuanian=false) const |
| template<typename TSrcVec , typename TDestCh > | |
| void | GetUpperCase (const TSrcVec &src, size_t srcIdx, const size_t srcCount, TVec< TDestCh > &dest, const bool clrDest=true, const bool turkic=false, const bool lithuanian=false) const |
| template<typename TSrcVec , typename TDestCh > | |
| void | GetTitleCase (const TSrcVec &src, size_t srcIdx, const size_t srcCount, TVec< TDestCh > &dest, const bool clrDest=true, const bool turkic=false, const bool lithuanian=false) const |
| template<typename TSrcVec , typename TDestCh > | |
| void | GetLowerCase (const TSrcVec &src, TVec< TDestCh > &dest, const bool clrDest=true, const bool turkic=false, const bool lithuanian=false) const |
| template<typename TSrcVec , typename TDestCh > | |
| void | GetUpperCase (const TSrcVec &src, TVec< TDestCh > &dest, const bool clrDest=true, const bool turkic=false, const bool lithuanian=false) const |
| template<typename TSrcVec , typename TDestCh > | |
| void | GetTitleCase (const TSrcVec &src, TVec< TDestCh > &dest, const bool clrDest=true, const bool turkic=false, const bool lithuanian=false) const |
| template<typename TSrcVec , typename TDestCh > | |
| void | GetSimpleCaseConverted (const TSrcVec &src, size_t srcIdx, const size_t srcCount, TVec< TDestCh > &dest, const bool clrDest, const TCaseConversion how) const |
| template<typename TSrcVec , typename TDestCh > | |
| void | GetSimpleLowerCase (const TSrcVec &src, size_t srcIdx, const size_t srcCount, TVec< TDestCh > &dest, const bool clrDest=true) const |
| template<typename TSrcVec , typename TDestCh > | |
| void | GetSimpleUpperCase (const TSrcVec &src, size_t srcIdx, const size_t srcCount, TVec< TDestCh > &dest, const bool clrDest=true) const |
| template<typename TSrcVec , typename TDestCh > | |
| void | GetSimpleTitleCase (const TSrcVec &src, size_t srcIdx, const size_t srcCount, TVec< TDestCh > &dest, const bool clrDest=true) const |
| template<typename TSrcVec , typename TDestCh > | |
| void | GetSimpleLowerCase (const TSrcVec &src, TVec< TDestCh > &dest, const bool clrDest=true) const |
| template<typename TSrcVec , typename TDestCh > | |
| void | GetSimpleUpperCase (const TSrcVec &src, TVec< TDestCh > &dest, const bool clrDest=true) const |
| template<typename TSrcVec , typename TDestCh > | |
| void | GetSimpleTitleCase (const TSrcVec &src, TVec< TDestCh > &dest, const bool clrDest=true) const |
| template<typename TSrcVec > | |
| void | ToSimpleCaseConverted (TSrcVec &src, size_t srcIdx, const size_t srcCount, const TCaseConversion how) const |
| template<typename TSrcVec > | |
| void | ToSimpleUpperCase (TSrcVec &src, size_t srcIdx, const size_t srcCount) const |
| template<typename TSrcVec > | |
| void | ToSimpleLowerCase (TSrcVec &src, size_t srcIdx, const size_t srcCount) const |
| template<typename TSrcVec > | |
| void | ToSimpleTitleCase (TSrcVec &src, size_t srcIdx, const size_t srcCount) const |
| template<typename TSrcVec > | |
| void | ToSimpleUpperCase (TSrcVec &src) const |
| template<typename TSrcVec > | |
| void | ToSimpleLowerCase (TSrcVec &src) const |
| template<typename TSrcVec > | |
| void | ToSimpleTitleCase (TSrcVec &src) const |
| template<typename TSrcVec , typename TDestCh > | |
| void | GetCaseFolded (const TSrcVec &src, size_t srcIdx, const size_t srcCount, TVec< TDestCh > &dest, const bool clrDest, const bool full, const bool turkic=false) const |
| template<typename TSrcVec , typename TDestCh > | |
| void | GetCaseFolded (const TSrcVec &src, TVec< TDestCh > &dest, const bool clrDest=true, const bool full=true, const bool turkic=false) const |
| template<typename TSrcVec > | |
| void | ToCaseFolded (TSrcVec &src, size_t srcIdx, const size_t srcCount, const bool turkic=false) const |
| template<typename TSrcVec > | |
| void | ToCaseFolded (TSrcVec &src, const bool turkic=false) const |
Static Public Member Functions
| static TStr | GetCaseFoldingFn () |
| static TStr | GetSpecialCasingFn () |
| static TStr | GetUnicodeDataFn () |
| static TStr | GetCompositionExclusionsFn () |
| static TStr | GetScriptsFn () |
| static TStr | GetDerivedCorePropsFn () |
| static TStr | GetLineBreakFn () |
| static TStr | GetPropListFn () |
| static TStr | GetAuxiliaryDir () |
| static TStr | GetWordBreakTestFn () |
| static TStr | GetWordBreakPropertyFn () |
| static TStr | GetSentenceBreakTestFn () |
| static TStr | GetSentenceBreakPropertyFn () |
| static TStr | GetNormalizationTestFn () |
| static TStr | GetBinFn () |
| static TStr | GetScriptNameUnknown () |
| static TStr | GetScriptNameKatakana () |
| static TStr | GetScriptNameHiragana () |
Public Attributes
| THash< TInt, TUniChInfo > | h |
| TStrPool | charNames |
| TStrIntH | scripts |
| TIntV | decompositions |
| THash< TIntPr, TInt > | inverseDec |
| TUniCaseFolding | caseFolding |
| TIntIntVH | specialCasingLower |
| TIntIntVH | specialCasingUpper |
| TIntIntVH | specialCasingTitle |
| int | scriptUnknown |
Protected Types
| typedef TUniVecIdx | TVecIdx |
Protected Member Functions
| void | InitAfterLoad () |
| bool | IsWbIgnored (const int cp) const |
| template<typename TSrcVec > | |
| void | WbFindCurOrNextNonIgnored (const TSrcVec &src, size_t &position, const size_t srcEnd) const |
| template<typename TSrcVec > | |
| void | WbFindNextNonIgnored (const TSrcVec &src, size_t &position, const size_t srcEnd) const |
| template<typename TSrcVec > | |
| void | WbFindNextNonIgnoredS (const TSrcVec &src, size_t &position, const size_t srcEnd) const |
| template<typename TSrcVec > | |
| bool | WbFindPrevNonIgnored (const TSrcVec &src, const size_t srcStart, size_t &position) const |
| void | TestWbFindNonIgnored (const TIntV &src) const |
| void | TestWbFindNonIgnored () const |
| void | TestFindNextWordOrSentenceBoundary (const TStr &basePath, bool sentence) |
| template<typename TSrcVec > | |
| bool | CanSentenceEndHere (const TSrcVec &src, const size_t srcIdx, const size_t position) const |
| template<typename TDestCh > | |
| void | AddDecomposition (const int codePoint, TVec< TDestCh > &dest, const bool compatibility) const |
| void | TestComposition (const TStr &basePath) |
| void | InitWordAndSentenceBoundaryFlags (const TStr &basePath) |
| void | InitScripts (const TStr &basePath) |
| void | InitLineBreaks (const TStr &basePath) |
| void | InitDerivedCoreProperties (const TStr &basePath) |
| void | InitPropList (const TStr &basePath) |
| void | InitSpecialCasing (const TStr &basePath) |
| void | LoadTxt_ProcessDecomposition (TUniChInfo &ci, TStr s) |
| void | TestCaseConversion (const TStr &source, const TStr &trueLc, const TStr &trueTc, const TStr &trueUc, bool turkic, bool lithuanian) |
| void | TestCaseConversions () |
Static Protected Member Functions
| static bool | IsWbIgnored (const TUniChInfo &ci) |
Protected Attributes
| TUniTrie< TInt > | sbExTrie |
Friends
| class | TUniCaseFolding |
typedef enum TUniChDb::TCaseConversion_ TUniChDb::TCaseConversion
typedef TUniVecIdx TUniChDb::TVecIdx [protected] |
anonymous enum
Enumerators: HangulSBase, HangulLBase, HangulVBase, HangulTBase, HangulLCount, HangulVCount, HangulTCount, HangulNCount, HangulSCount
Definition at line 1405 of file unicode.h.
{
HangulSBase = 0xAC00, HangulLBase = 0x1100, HangulVBase = 0x1161, HangulTBase = 0x11A7,
HangulLCount = 19, HangulVCount = 21, HangulTCount = 28,
HangulNCount = HangulVCount * HangulTCount, // 588
HangulSCount = HangulLCount * HangulNCount // 11172
};
| TUniChDb::TUniChDb | ( | ) | [inline] |
Definition at line 1274 of file unicode.h.
: scriptUnknown(-1) { }
| TUniChDb::TUniChDb | ( | TSIn & | SIn | ) | [inline, explicit] |
| void TUniChDb::AddDecomposition | ( | const int | codePoint, |
| TVec< TDestCh > & | dest, | ||
| const bool | compatibility | ||
| ) | const [protected] |
Definition at line 3097 of file unicode.h.
{
if (HangulSBase <= codePoint && codePoint < HangulSBase + HangulSCount)
{
// UAX #15, sec. 16: Hangul decomposition
const int SIndex = codePoint - HangulSBase;
const int L = HangulLBase + SIndex / HangulNCount;
const int V = HangulVBase + (SIndex % HangulNCount) / HangulTCount;
const int T = HangulTBase + (SIndex % HangulTCount);
dest.Add(L); dest.Add(V);
if (T != HangulTBase) dest.Add(T);
return;
}
int i = h.GetKeyId(codePoint); if (i < 0) { dest.Add(codePoint); return; }
const TUniChInfo &ci = h[i];
int ofs = ci.decompOffset; if (ofs < 0) { dest.Add(codePoint); return; }
if ((! compatibility) && ci.IsCompatibilityDecomposition()) { dest.Add(codePoint); return; }
while (true) {
int cp = decompositions[ofs++]; if (cp < 0) return;
AddDecomposition(cp, dest, compatibility); }
}
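The Hangul branch above is pure arithmetic over the constants from the anonymous enum; no table lookups are involved. A minimal standalone sketch (the constants are restated locally, so it does not depend on SNAP at all) that decomposes one precomposed syllable the same way:
#include <cstdio>
int main() {
    // Same constants as in the anonymous enum documented above.
    const int SBase = 0xAC00, LBase = 0x1100, VBase = 0x1161, TBase = 0x11A7;
    const int VCount = 21, TCount = 28, NCount = VCount * TCount; // 588
    const int cp = 0xAC01;                             // HANGUL SYLLABLE GAG
    const int SIndex = cp - SBase;
    const int L = LBase + SIndex / NCount;             // U+1100, choseong kiyeok
    const int V = VBase + (SIndex % NCount) / TCount;  // U+1161, jungseong a
    const int T = TBase + (SIndex % TCount);           // U+11A8, jongseong kiyeok
    printf("U+%04X -> U+%04X U+%04X", cp, L, V);
    if (T != TBase) printf(" U+%04X", T);              // trailing consonant present
    printf("\n");
    return 0; }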
| bool TUniChDb::CanSentenceEndHere | ( | const TSrcVec & | src, |
| const size_t | srcIdx, | ||
| const size_t | position | ||
| ) | const [protected] |
Definition at line 2582 of file unicode.h.
{
if (sbExTrie.Empty()) return true;
// We'll move back from the position where a sentence-boundary is being considered.
size_t pos = position;
if (! WbFindPrevNonIgnored(src, srcIdx, pos)) return true;
int c = (int) src[TVecIdx(pos)]; int sfb = GetSbFlags(c);
// - Skip the Sep, if there is one.
if ((c & ucfSbSep) == ucfSbSep) {
if (! WbFindPrevNonIgnored(src, srcIdx, pos)) return true;
c = (int) src[TVecIdx(pos)]; sfb = GetSbFlags(c); }
// - Skip any Sp characters.
while ((sfb & ucfSbSp) == ucfSbSp) {
if (! WbFindPrevNonIgnored(src, srcIdx, pos)) return true;
c = (int) src[TVecIdx(pos)]; sfb = GetSbFlags(c); }
// - Skip any Close characters.
while ((sfb & ucfSbClose) == ucfSbClose) {
if (! WbFindPrevNonIgnored(src, srcIdx, pos)) return true;
c = (int) src[TVecIdx(pos)]; sfb = GetSbFlags(c); }
// - Skip any ATerm | STerm characters.
while ((sfb & (ucfSbATerm | ucfSbSTerm)) != 0) {
if (! WbFindPrevNonIgnored(src, srcIdx, pos)) return true;
c = (int) src[TVecIdx(pos)]; sfb = GetSbFlags(c); }
// Now start moving through the trie.
int cLast = c, cButLast = -1, cButButLast = -1, len = 1, node = -1;
while (true)
{
bool atEnd = (! WbFindPrevNonIgnored(src, srcIdx, pos));
c = (atEnd ? -1 : (int) src[TVecIdx(pos)]);
TUniChCategory cat = GetCat(c);
if (atEnd || ! (cat == ucLetter || cat == ucNumber || cat == ucSymbol)) {
// Check if the suffix we've read so far is one of those that appear in the trie.
if (len == 1) return ! sbExTrie.Has1Gram(cLast);
if (len == 2) return ! sbExTrie.Has2Gram(cLast, cButLast);
IAssert(len >= 3); IAssert(node >= 0);
if (sbExTrie.IsNodeTerminal(node)) return false;
if (atEnd) return true; }
if (len == 1) { cButLast = c; len++; }
else if (len == 2) { cButButLast = c; len++;
// Now we have read the last three characters; start descending the suitable subtrie.
node = sbExTrie.Get3GramRoot(cLast, cButLast, cButButLast);
if (node < 0) return true; }
else {
// Descend down the trie.
node = sbExTrie.GetChild(node, c);
if (node < 0) return true; }
}
//return true;
}
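A usage sketch (not taken from the library documentation) of the sentence-boundary exception trie that CanSentenceEndHere consults: load the database from a precompiled binary (the file name below is hypothetical), install the standard English exception list, and scan a vector of UTF-32 code points. Assumes unicode.h and the rest of glib are available.
TUniChDb db;
db.LoadBin("UnicodeDefs.bin");    // hypothetical path to the precompiled database
db.SbEx_SetStdEnglish();          // presumably a list of common English abbreviations ("Mr.", "Dr.", ...)
TIntV src;                        // UTF-32 code points of the text to segment
// ... fill 'src' ...
size_t pos = 0;
while (db.FindNextSentenceBoundary(src, 0, src.Len(), pos))
    printf("sentence boundary at position %d\n", (int) pos);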
| void TUniChDb::Clr | ( | ) | [inline] |
Definition at line 1276 of file unicode.h.
{
h.Clr(); charNames.Clr(); decompositions.Clr(); inverseDec.Clr(); caseFolding.Clr();
specialCasingLower.Clr(); specialCasingUpper.Clr(); specialCasingTitle.Clr();
scripts.Clr(); }
| void TUniChDb::Compose | ( | const TSrcVec & | src, |
| size_t | srcIdx, | ||
| const size_t | srcCount, | ||
| TVec< TDestCh > & | dest, | ||
| bool | clrDest = true |
||
| ) | const |
Definition at line 3152 of file unicode.h.
{
if (clrDest) dest.Clr();
bool lastStarterKnown = false; // has a starter been encountered yet?
size_t lastStarterPos = size_t(-1); // the index (in 'dest') of the last starter
int cpLastStarter = -1; // the codepoint of the last starter (i.e. cpLastStarter == dest[lastStarterPos])
const size_t srcEnd = srcIdx + srcCount;
int ccMax = -1; // The highest combining class among the characters since the last starter.
while (srcIdx < srcEnd)
{
const int cp = src[TVecIdx(srcIdx)]; srcIdx++;
const int cpClass = GetCombiningClass(cp);
//int cpCombined = -1;
// If there is a starter with which 'cp' can be combined, and from which it is not blocked
// by some intermediate character, we can try to combine them.
if (lastStarterKnown && ccMax < cpClass)
{
int j = inverseDec.GetKeyId(TIntPr(cpLastStarter, cp));
int cpCombined = -1;
do {
// Try to look up a composition in the inverseDec table.
if (j >= 0) { cpCombined = inverseDec[j]; break; }
// UAX #15, sec. 16: Hangul composition
// - Try to combine L and V.
const int LIndex = cpLastStarter - HangulLBase;
if (0 <= LIndex && LIndex < HangulLCount) {
const int VIndex = cp - HangulVBase;
if (0 <= VIndex && VIndex < HangulVCount) {
cpCombined = HangulSBase + (LIndex * HangulVCount + VIndex) * HangulTCount;
break; } }
// - Try to combine LV and T.
const int SIndex = cpLastStarter - HangulSBase;
if (0 <= SIndex && SIndex < HangulSCount && (SIndex % HangulTCount) == 0)
{
const int TIndex = cp - HangulTBase;
if (0 <= TIndex && TIndex < HangulTCount) {
cpCombined = cpLastStarter + TIndex;
break; }
}
} while (false);
// If a composed character has been found, use it to replace the old starter (cpLastStarter) in 'dest'.
if (cpCombined >= 0) {
dest[TVecIdx(lastStarterPos)] = cpCombined;
Assert(GetCombiningClass(cpCombined) == TUniChInfo::ccStarter);
// if (cpCombined is not a starter) { lastStarterKnown = false; lastStarterPos = size_t(-1); cpLastStarter = -1; } else
cpLastStarter = cpCombined; continue; }
}
if (cpClass == TUniChInfo::ccStarter) { // 'cp' is a starter, remember it for later. Set ccMax to -1 so that this starter can be combined with another starter.
lastStarterKnown = true; lastStarterPos = dest.Len(); cpLastStarter = cp; ccMax = cpClass - 1; }
else if (cpClass > ccMax) // Remember cp's class as the new maximum class since the last starter (for blocking).
ccMax = cpClass;
dest.Add(cp);
}
}
| void TUniChDb::Compose | ( | const TSrcVec & | src, |
| TVec< TDestCh > & | dest, | ||
| bool | clrDest = true |
||
| ) | const [inline] |
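A brief, hedged usage sketch of this inline overload: canonical composition of an already-decomposed (and canonically ordered) pair into a single precomposed character. The database path is hypothetical, as in the other sketches on this page.
TUniChDb db; db.LoadBin("UnicodeDefs.bin");       // hypothetical path
TIntV decomposed, composed;
decomposed.Add(0x0065); decomposed.Add(0x0301);   // 'e' + COMBINING ACUTE ACCENT
db.Compose(decomposed, composed);                 // expected result: the single code point U+00E9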
| void TUniChDb::Decompose | ( | const TSrcVec & | src, |
| size_t | srcIdx, | ||
| const size_t | srcCount, | ||
| TVec< TDestCh > & | dest, | ||
| bool | compatibility, | ||
| bool | clrDest = true |
||
| ) | const |
Definition at line 3120 of file unicode.h.
{
if (clrDest) dest.Clr();
const size_t destStart = dest.Len(), srcEnd = srcIdx + srcCount;
// Decompose the string.
while (srcIdx < srcEnd) {
AddDecomposition(src[TVecIdx(srcIdx)], dest, compatibility); srcIdx++; }
// Rearrange the decomposed string into canonical order.
for (size_t destIdx = destStart, destEnd = dest.Len(); destIdx < destEnd; )
{
size_t j = destIdx;
int cp = dest[TVecIdx(destIdx)]; destIdx++;
int cpCls = GetCombiningClass(cp);
if (cpCls == TUniChInfo::ccStarter) continue;
while (destStart < j && GetCombiningClass(dest[TVecIdx(j - 1)]) > cpCls) {
dest[TVecIdx(j)] = dest[TVecIdx(j - 1)]; j--; }
dest[TVecIdx(j)] = cp;
}
}
| void TUniChDb::Decompose | ( | const TSrcVec & | src, |
| TVec< TDestCh > & | dest, | ||
| bool | compatibility, | ||
| bool | clrDest = true |
||
| ) | const [inline] |
| void TUniChDb::DecomposeAndCompose | ( | const TSrcVec & | src, |
| size_t | srcIdx, | ||
| const size_t | srcCount, | ||
| TVec< TDestCh > & | dest, | ||
| bool | compatibility, | ||
| bool | clrDest = true |
||
| ) | const |
| void TUniChDb::DecomposeAndCompose | ( | const TSrcVec & | src, |
| TVec< TDestCh > & | dest, | ||
| bool | compatibility, | ||
| bool | clrDest = true |
||
| ) | const [inline] |
Definition at line 1542 of file unicode.h.
{
DecomposeAndCompose(src, 0, src.Len(), dest, compatibility, clrDest); }
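Between them, Decompose and DecomposeAndCompose cover the four Unicode normalization forms, with the compatibility flag selecting the K variants; correct results also depend on the composition exclusions loaded into the database. A hedged sketch with a hypothetical database path:
// NFD  : Decompose(..., compatibility = false)
// NFKD : Decompose(..., compatibility = true)
// NFC  : DecomposeAndCompose(..., compatibility = false)
// NFKC : DecomposeAndCompose(..., compatibility = true)
TUniChDb db; db.LoadBin("UnicodeDefs.bin");   // hypothetical path
TIntV src, nfd, nfc;
src.Add(0x00E9);                              // U+00E9 LATIN SMALL LETTER E WITH ACUTE
db.Decompose(src, nfd, false);                // expected: U+0065 U+0301
db.DecomposeAndCompose(src, nfc, false);      // expected: U+00E9 again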
| size_t TUniChDb::ExtractStarters | ( | const TSrcVec & | src, |
| size_t | srcIdx, | ||
| const size_t | srcCount, | ||
| TVec< TDestCh > & | dest, | ||
| bool | clrDest = true |
||
| ) | const |
Definition at line 3209 of file unicode.h.
{
if (clrDest) dest.Clr();
size_t retVal = 0;
for (const size_t srcEnd = srcIdx + srcCount; srcIdx < srcEnd; srcIdx++) {
const int cp = src[TVecIdx(srcIdx)];
if (GetCombiningClass(cp) == TUniChInfo::ccStarter)
{ dest.Add(cp); retVal++; } }
return retVal;
}
| size_t TUniChDb::ExtractStarters | ( | const TSrcVec & | src, |
| TVec< TDestCh > & | dest, | ||
| bool | clrDest = true |
||
| ) | const [inline] |
Definition at line 1551 of file unicode.h.
{
return ExtractStarters(src, 0, src.Len(), dest, clrDest); }
| size_t TUniChDb::ExtractStarters | ( | TSrcVec & | src | ) | const [inline] |
Definition at line 1555 of file unicode.h.
{
TIntV temp; size_t retVal = ExtractStarters(src, temp);
src.Clr(); for (int i = 0; i < temp.Len(); i++) src.Add(temp[i]);
return retVal; }
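One hedged example of what ExtractStarters is good for: crude diacritic stripping, by decomposing first and then keeping only the starters (combining class 0). Hypothetical database path, as above.
TUniChDb db; db.LoadBin("UnicodeDefs.bin");              // hypothetical path
TIntV src, decomposed, starters;
src.Add(0x00E9); src.Add(0x006E);                        // "én"
db.Decompose(src, decomposed, false);                    // U+0065 U+0301 U+006E
size_t kept = db.ExtractStarters(decomposed, starters);  // starters: U+0065 U+006E
printf("%d of %d code points kept\n", (int) kept, decomposed.Len());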
| bool TUniChDb::FindNextSentenceBoundary | ( | const TSrcVec & | src, |
| const size_t | srcIdx, | ||
| const size_t | srcCount, | ||
| size_t & | position | ||
| ) | const |
Definition at line 2633 of file unicode.h.
{
// SB1. Break at the start of text.
if (position < srcIdx) { position = srcIdx; return true; }
// If we are beyond the end of the text, there aren't any sentence breaks left.
const size_t srcEnd = srcIdx + srcCount;
if (position >= srcEnd) return false;
// If 'position' is currently at an ignored character, move it back to the last nonignored character.
size_t origPos = position;
if (IsWbIgnored(src[TVecIdx(position)])) {
if (! WbFindPrevNonIgnored(src, srcIdx, position))
position = origPos;
}
// Determine the previous nonignored character (before 'position').
size_t posPrev = position;
if (! WbFindPrevNonIgnored(src, srcIdx, posPrev)) posPrev = position;
// Sec 6.2. Allow a break between Sep and an ignored character.
if (position == origPos && position + 1 < srcEnd && IsSbSep(src[TVecIdx(position)]) && IsWbIgnored(src[TVecIdx(position + 1)])) { position += 1; return true; }
// Determine the next nonignored character (after 'position').
size_t posNext = position; WbFindNextNonIgnored(src, posNext, srcEnd);
size_t posNext2;
int cPrev = (posPrev < position ? (int) src[TVecIdx(posPrev)] : -1), cCur = (position < srcEnd ? (int) src[TVecIdx(position)] : -1);
int cNext = (position < posNext && posNext < srcEnd ? (int) src[TVecIdx(posNext)] : -1);
int sbfPrev = GetSbFlags(cPrev), sbfCur = GetSbFlags(cCur), sbfNext = GetSbFlags(cNext);
int cNext2, sbfNext2;
// Initialize the state of the peek-back automaton.
typedef enum { stInit, stATerm, stATermSp, stATermSep, stSTerm, stSTermSp, stSTermSep } TPeekBackState;
TPeekBackState backState;
{
size_t pos = position;
bool wasSep = false, wasSp = false, wasATerm = false, wasSTerm = false;
while (true)
{
if (! WbFindPrevNonIgnored(src, srcIdx, pos)) break;
// Skip at most one Sep.
int cp = (int) src[TVecIdx(pos)]; int sbf = GetSbFlags(cp);
if ((sbf & ucfSbSep) == ucfSbSep) {
wasSep = true;
if (! WbFindPrevNonIgnored(src, srcIdx, pos)) break;
cp = (int) src[TVecIdx(pos)]; sbf = GetSbFlags(cp); }
// Skip zero or more Sp's.
bool stop = false;
while ((sbf & ucfSbSp) == ucfSbSp) {
wasSp = true;
if (! WbFindPrevNonIgnored(src, srcIdx, pos)) { stop = true; break; }
cp = (int) src[TVecIdx(pos)]; sbf = GetSbFlags(cp); }
if (stop) break;
// Skip zero or more Close's.
while ((sbf & ucfSbClose) == ucfSbClose) {
if (! WbFindPrevNonIgnored(src, srcIdx, pos)) { stop = true; break; }
cp = (int) src[TVecIdx(pos)]; sbf = GetSbFlags(cp); }
if (stop) break;
// Process an ATerm or STerm.
wasATerm = ((sbf & ucfSbATerm) == ucfSbATerm);
wasSTerm = ((sbf & ucfSbSTerm) == ucfSbSTerm);
break;
}
if (wasATerm) backState = (wasSep ? stATermSep : wasSp ? stATermSp : stATerm);
else if (wasSTerm) backState = (wasSep ? stSTermSep : wasSp ? stSTermSp : stSTerm);
else backState = stInit;
}
// Initialize the state of the peek-ahead automaton. This state tells us what follows
// after we skip all contiguous characters from the complement of the set {OLetter, Upper, Lower, Sep, STerm, ATerm}.
// Thus, the next character is either OLetter, Upper, Lower, Sep, STerm, ATerm, or the end of the input string.
// Our peek-ahead automaton must tell us whether it is Lower or something else.
typedef enum { stUnknown, stLower, stNotLower } TPeekAheadState;
TPeekAheadState aheadState = stUnknown;
//
for ( ; position < srcEnd; posPrev = position, position = posNext, posNext = posNext2,
cPrev = cCur, cCur = cNext, cNext = cNext2,
sbfPrev = sbfCur, sbfCur = sbfNext, sbfNext = sbfNext2)
{
// Should there be a sentence boundary between 'position' and 'posNext' (or, more accurately,
// between src[posNext - 1] and src[posNext] --- any ignored characters between 'position'
// and 'posNext' are considered to belong to the previous character ('position'), not to the next one)?
posNext2 = posNext; WbFindNextNonIgnored(src, posNext2, srcEnd);
cNext2 = (posNext < posNext2 && posNext2 < srcEnd ? (int) src[TVecIdx(posNext2)] : -1);
sbfNext2 = GetSbFlags(cNext2);
// Update the peek-back automaton.
#define TestCur(curFlag) ((sbfCur & ucfSb##curFlag) == ucfSb##curFlag)
#define Trans(curFlag, newState) if (TestCur(curFlag)) { backState = st##newState; break; }
switch (backState) {
case stInit: Trans(ATerm, ATerm); Trans(STerm, STerm); break;
case stATerm: Trans(Sp, ATermSp); Trans(Sep, ATermSep); Trans(ATerm, ATerm); Trans(STerm, STerm); Trans(Close, ATerm); backState = stInit; break;
case stSTerm: Trans(Sp, STermSp); Trans(Sep, STermSep); Trans(ATerm, ATerm); Trans(STerm, STerm); Trans(Close, STerm); backState = stInit; break;
case stATermSp: Trans(Sp, ATermSp); Trans(Sep, ATermSep); Trans(ATerm, ATerm); Trans(STerm, STerm); backState = stInit; break;
case stSTermSp: Trans(Sp, STermSp); Trans(Sep, STermSep); Trans(ATerm, ATerm); Trans(STerm, STerm); backState = stInit; break;
case stATermSep: Trans(ATerm, ATerm); Trans(STerm, STerm); backState = stInit; break;
case stSTermSep: Trans(ATerm, ATerm); Trans(STerm, STerm); backState = stInit; break;
default: IAssert(false); }
#undef Trans
#undef TestCur
// Update the peek-ahead automaton.
#define IsPeekAheadSkippable(sbf) ((sbf & (ucfSbOLetter | ucfSbUpper | ucfSbLower | ucfSbSep | ucfSbSTerm | ucfSbATerm)) == 0)
if (! IsPeekAheadSkippable(sbfCur)) {
bool isLower = ((sbfCur & ucfSbLower) == ucfSbLower);
if (aheadState == stLower) IAssert(isLower);
else if (aheadState == stNotLower) IAssert(! isLower);
// We haven't peeked ahead farther than this so far -- invalidate the state.
aheadState = stUnknown; }
if (aheadState == stUnknown)
{
// Peek ahead to the next non-peekahead-skippable character.
size_t pos = posNext;
while (pos < srcEnd) {
int cp = (int) src[TVecIdx(pos)]; int sbf = GetSbFlags(cp);
if (! IsPeekAheadSkippable(sbf)) {
if ((sbf & ucfSbLower) == ucfSbLower) aheadState = stLower;
else aheadState = stNotLower;
break; }
WbFindNextNonIgnored(src, pos, srcEnd); }
if (! (pos < srcEnd)) aheadState = stNotLower;
}
#undef IsPeekAheadSkippable
//
#define TestCurNext(curFlag, nextFlag) if ((sbfCur & curFlag) == curFlag && (sbfNext & nextFlag) == nextFlag) continue
#define TestCurNext2(curFlag, nextFlag, next2Flag) if ((sbfCur & curFlag) == curFlag && (sbfNext & nextFlag) == nextFlag && (sbfNext2 & next2Flag) == next2Flag) continue
#define TestPrevCurNext(prevFlag, curFlag, nextFlag) if ((sbfPrev & prevFlag) == prevFlag && (sbfCur & curFlag) == curFlag && (sbfNext & nextFlag) == nextFlag) continue
// SB3. Do not break within CRLF.
if (cCur == 13 && cNext == 10) continue;
// SB4. Break after paragraph separators.
if ((sbfCur & ucfSbSep) == ucfSbSep) {
if (! CanSentenceEndHere(src, srcIdx, position)) continue;
position = posNext; return true; }
// Do not break after ambiguous terminators like period, if they are immediately followed by a number
// or lowercase letter, if they are between uppercase letters, or if the first following letter
// (optionally after certain punctuation) is lowercase. For example, a period may be an abbreviation
// or numeric period, and thus may not mark the end of a sentence.
TestCurNext(ucfSbATerm, ucfSbNumeric); // SB6
TestPrevCurNext(ucfSbUpper, ucfSbATerm, ucfSbUpper); // SB7
// SB8a. (STerm | ATerm) Close* Sp* [do not break] (STerm | ATerm)
if ((backState == stATerm || backState == stATermSp || backState == stSTerm || backState == stSTermSp) &&
(sbfNext & (ucfSbSTerm | ucfSbATerm)) != 0) continue;
// SB8*. ATerm Close* Sp* [do not break] ( ! (OLetter | Upper | Lower | Sep | STerm | ATerm) )* Lower
if ((backState == stATerm || backState == stATermSp) && aheadState == stLower) continue;
// Break after sentence terminators, but include closing punctuation, trailing spaces, and a paragraph separator (if present).
// SB9. ( STerm | ATerm ) Close* [do not break] ( Close | Sp | Sep )
if ((backState == stATerm || backState == stSTerm) && (sbfNext & (ucfSbClose | ucfSbSp | ucfSbSep)) != 0) continue;
// SB10. ( STerm | ATerm ) Close* Sp* [do not break] ( Sp | Sep )
// SB11*. ( STerm | ATerm ) Close* Sp* Sep? [do break]
if (backState == stATerm || backState == stATermSp || backState == stATermSep || backState == stSTerm || backState == stSTermSp || backState == stSTermSep) {
if ((sbfNext & (ucfSbSp | ucfSbSep)) != 0) continue; // SB10
if (! CanSentenceEndHere(src, srcIdx, position)) continue;
position = posNext; return true; } // SB11
// SB12. Otherwise, do not break.
continue;
#undef TestCurNext
#undef TestCurNext2
#undef TestPrevCurNext
}
// SB2. Break at the end of text.
IAssert(position == srcEnd);
return true;
}
| bool TUniChDb::FindNextWordBoundary | ( | const TSrcVec & | src, |
| const size_t | srcIdx, | ||
| const size_t | srcCount, | ||
| size_t & | position | ||
| ) | const |
Definition at line 2483 of file unicode.h.
{
// WB1. Break at the start of text.
if (position < srcIdx) { position = srcIdx; return true; }
// If we are beyond the end of the text, there aren't any word breaks left.
const size_t srcEnd = srcIdx + srcCount;
if (position >= srcEnd) return false;
// If 'position' is currently at an ignored character, move it back to the last nonignored character.
size_t origPos = position;
if (IsWbIgnored(src[TVecIdx(position)])) {
if (! WbFindPrevNonIgnored(src, srcIdx, position))
position = origPos;
}
// Determine the previous nonignored character (before 'position').
size_t posPrev = position;
if (! WbFindPrevNonIgnored(src, srcIdx, posPrev)) posPrev = position;
// Sec 6.2. Allow a break between Sep and an ignored character.
if (position == origPos && position + 1 < srcEnd && IsSbSep(src[TVecIdx(position)]) && IsWbIgnored(src[TVecIdx(position + 1)])) { position += 1; return true; }
// Determine the next nonignored character (after 'position').
size_t posNext = position; WbFindNextNonIgnored(src, posNext, srcEnd);
size_t posNext2;
int cPrev = (posPrev < position ? (int) src[TVecIdx(posPrev)] : -1), cCur = (position < srcEnd ? (int) src[TVecIdx(position)] : -1);
int cNext = (position < posNext && posNext < srcEnd ? (int) src[TVecIdx(posNext)] : -1);
int wbfPrev = GetWbFlags(cPrev), wbfCur = GetWbFlags(cCur), wbfNext = GetWbFlags(cNext);
int cNext2, wbfNext2;
//
for ( ; position < srcEnd; posPrev = position, position = posNext, posNext = posNext2,
cPrev = cCur, cCur = cNext, cNext = cNext2,
wbfPrev = wbfCur, wbfCur = wbfNext, wbfNext = wbfNext2)
{
// Should there be a word boundary between 'position' and 'posNext' (or, more accurately,
// between src[posNext - 1] and src[posNext] --- any ignored characters between 'position'
// and 'posNext' are considered to belong to the previous character ('position'), not to the next one)?
posNext2 = posNext; WbFindNextNonIgnored(src, posNext2, srcEnd);
cNext2 = (posNext < posNext2 && posNext2 < srcEnd ? (int) src[TVecIdx(posNext2)] : -1);
wbfNext2 = GetWbFlags(cNext2);
#define TestCurNext(curFlag, nextFlag) if ((wbfCur & curFlag) == curFlag && (wbfNext & nextFlag) == nextFlag) continue
#define TestCurNext2(curFlag, nextFlag, next2Flag) if ((wbfCur & curFlag) == curFlag && (wbfNext & nextFlag) == nextFlag && (wbfNext2 & next2Flag) == next2Flag) continue
#define TestPrevCurNext(prevFlag, curFlag, nextFlag) if ((wbfPrev & prevFlag) == prevFlag && (wbfCur & curFlag) == curFlag && (wbfNext & nextFlag) == nextFlag) continue
// WB3. Do not break within CRLF.
if (cCur == 13 && cNext == 10) continue;
// WB5. Do not break between most letters.
TestCurNext(ucfWbALetter, ucfWbALetter);
// WB6. Do not break letters across certain punctuation.
TestCurNext2(ucfWbALetter, ucfWbMidLetter, ucfWbALetter);
// WB7. Do not break letters across certain punctuation.
TestPrevCurNext(ucfWbALetter, ucfWbMidLetter, ucfWbALetter);
// WB8. Do not break within sequences of digits, or digits adjacent to letters.
TestCurNext(ucfWbNumeric, ucfWbNumeric);
// WB9. Do not break within sequences of digits, or digits adjacent to letters.
TestCurNext(ucfWbALetter, ucfWbNumeric);
// WB10. Do not break within sequences of digits, or digits adjacent to letters.
TestCurNext(ucfWbNumeric, ucfWbALetter);
// WB11. Do not break within sequences, such as "3.2" or "3.456,789".
TestPrevCurNext(ucfWbNumeric, ucfWbMidNum, ucfWbNumeric);
// WB12. Do not break within sequences, such as "3.2" or "3.456,789".
TestCurNext2(ucfWbNumeric, ucfWbMidNum, ucfWbNumeric);
// WB13. Do not break between Katakana.
TestCurNext(ucfWbKatakana, ucfWbKatakana);
// WB13a. Do not break from extenders.
if ((wbfCur & (ucfWbALetter | ucfWbNumeric | ucfWbKatakana | ucfWbExtendNumLet)) != 0 &&
(wbfNext & ucfWbExtendNumLet) == ucfWbExtendNumLet) continue;
// WB13b. Do not break from extenders.
if ((wbfCur & ucfWbExtendNumLet) == ucfWbExtendNumLet &&
(wbfNext & (ucfWbALetter | ucfWbNumeric | ucfWbKatakana)) != 0) continue;
// WB14. Otherwise, break everywhere.
position = posNext; return true;
#undef TestCurNext
#undef TestCurNext2
#undef TestPrevCurNext
}
// WB2. Break at the end of text.
IAssert(position == srcEnd);
return true;
}
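A minimal iteration sketch over the segments delimited by FindNextWordBoundary; note that UAX #29 places boundaries on both sides of spaces and punctuation, so the segments include such runs as well. The database path and input are placeholders.
TUniChDb db; db.LoadBin("UnicodeDefs.bin");   // hypothetical path
TIntV src;                                    // UTF-32 code points of the text
// ... fill 'src' ...
size_t prev = 0, pos = 0;
while (db.FindNextWordBoundary(src, 0, src.Len(), pos)) {
    printf("segment [%d, %d)\n", (int) prev, (int) pos);
    prev = pos; }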
| void TUniChDb::FindSentenceBoundaries | ( | const TSrcVec & | src, |
| const size_t | srcIdx, | ||
| const size_t | srcCount, | ||
| TBoolV & | dest | ||
| ) | const |
Definition at line 2790 of file unicode.h.
{
if (size_t(dest.Len()) != srcCount + 1) dest.Gen(TVecIdx(srcCount + 1));
dest.PutAll(false);
size_t position = srcIdx;
dest[TVecIdx(position - srcIdx)] = true;
while (position < srcIdx + srcCount)
{
size_t oldPos = position;
FindNextSentenceBoundary(src, srcIdx, srcCount, position);
Assert(oldPos < position); Assert(position <= srcIdx + srcCount);
dest[TVecIdx(position - srcIdx)] = true;
}
Assert(dest[TVecIdx(srcCount)]);
}
| void TUniChDb::FindWordBoundaries | ( | const TSrcVec & | src, |
| const size_t | srcIdx, | ||
| const size_t | srcCount, | ||
| TBoolV & | dest | ||
| ) | const |
Definition at line 2561 of file unicode.h.
{
if (size_t(dest.Len()) != srcCount + 1) dest.Gen(TVecIdx(srcCount + 1));
dest.PutAll(false);
size_t position = srcIdx;
dest[TVecIdx(position - srcIdx)] = true;
while (position < srcIdx + srcCount)
{
size_t oldPos = position;
FindNextWordBoundary(src, srcIdx, srcCount, position);
Assert(oldPos < position); Assert(position <= srcIdx + srcCount);
dest[TVecIdx(position - srcIdx)] = true;
}
Assert(dest[TVecIdx(srcCount)]);
}
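The batch form fills dest with srcCount + 1 flags, where dest[i] is true iff there is a boundary immediately before src[srcIdx + i]; the first and last entries are always true. A short sketch that turns the flag vector into segment indices ('db' and 'src' set up as in the previous sketch):
TBoolV isBoundary;
db.FindWordBoundaries(src, 0, src.Len(), isBoundary);  // isBoundary.Len() == src.Len() + 1
int segStart = 0;
for (int i = 1; i <= src.Len(); i++)
    if (isBoundary[i]) {                               // boundary just before src[i]
        printf("segment [%d, %d)\n", segStart, i);
        segStart = i; }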
| static TStr TUniChDb::GetAuxiliaryDir | ( | ) | [inline, static] |
| static TStr TUniChDb::GetBinFn | ( | ) | [inline, static] |
| void TUniChDb::GetCaseConverted | ( | const TSrcVec & | src, |
| size_t | srcIdx, | ||
| const size_t | srcCount, | ||
| TVec< TDestCh > & | dest, | ||
| const bool | clrDest, | ||
| const TCaseConversion | how, | ||
| const bool | turkic, | ||
| const bool | lithuanian | ||
| ) | const |
Definition at line 2811 of file unicode.h.
{
const TIntIntVH &specials = (how == ccUpper ? specialCasingUpper : how == ccLower ? specialCasingLower : how == ccTitle ? specialCasingTitle : *((TIntIntVH *) 0));
if (clrDest) dest.Clr();
enum {
GreekCapitalLetterSigma = 0x3a3,
GreekSmallLetterSigma = 0x3c3,
GreekSmallLetterFinalSigma = 0x3c2,
LatinCapitalLetterI = 0x49,
LatinCapitalLetterJ = 0x4a,
LatinCapitalLetterIWithOgonek = 0x12e,
LatinCapitalLetterIWithGrave = 0xcc,
LatinCapitalLetterIWithAcute = 0xcd,
LatinCapitalLetterIWithTilde = 0x128,
LatinCapitalLetterIWithDotAbove = 0x130,
LatinSmallLetterI = 0x69,
CombiningDotAbove = 0x307
};
//
bool seenCased = false, seenTwoCased = false; int cpFirstCased = -1;
size_t nextWordBoundary = srcIdx;
TBoolV wordBoundaries; bool wbsKnown = false;
for (const size_t origSrcIdx = srcIdx, srcEnd = srcIdx + srcCount; srcIdx < srcEnd; )
{
int cp = src[TVecIdx(srcIdx)]; srcIdx++;
//if (turkic && cp == 0x130 && how == ccLower) printf("!");
// For conversion to titlecase, the first cased character of each word
// must be converted to titlecase; everything else must be converted
// to lowercase.
TUniChDb::TCaseConversion howHere;
if (how != ccTitle) howHere = how;
else {
if (srcIdx - 1 == nextWordBoundary) { // A word starts/ends here.
seenCased = false; seenTwoCased = false; cpFirstCased = -1;
size_t next = nextWordBoundary; FindNextWordBoundary(src, origSrcIdx, srcCount, next);
IAssert(next > nextWordBoundary); nextWordBoundary = next; }
bool isCased = IsCased(cp);
if (isCased && ! seenCased) { howHere = ccTitle; seenCased = true; cpFirstCased = cp; }
else { howHere = ccLower;
if (isCased && seenCased) seenTwoCased = true; }
}
// First, process the conditional mappings from SpecialCasing.txt.
// These will be processed in code -- they were ignored while
// we were reading SpecialCasing.txt itself.
if (cp == GreekCapitalLetterSigma && howHere == ccLower)
{
// SpecialCasing.txt mentions the 'FinalSigma' condition, but sec. 3.13 of
// the standard doesn't define it. We'll use FinalCased instead.
// FinalCased: within the closest word boundaries containing C,
// there is a cased letter before C, and there is no cased letter after C.
//size_t nextBoundary = srcIdx - 1; FindNextWordBoundary(src, srcIdx, srcCount, nextBoundary);
if (! wbsKnown) { FindWordBoundaries(src, origSrcIdx, srcCount, wordBoundaries); wbsKnown = true; }
size_t srcIdx2 = srcIdx; bool casedAfter = false;
if (how == ccTitle)
printf("!");
//while (srcIdx2 < nextBoundary)
while (! wordBoundaries[TVecIdx(srcIdx2 - origSrcIdx)])
{
int cp2 = src[TVecIdx(srcIdx2)]; srcIdx2++;
if (IsCased(cp2)) { casedAfter = true; break; }
}
if (! casedAfter)
{
//size_t prevBoundary = srcIdx - 1;
//FindPreviousWordBoundary(src, srcIdx, srcCount, prevBoundary);
srcIdx2 = srcIdx - 1; bool casedBefore = false;
//while (prevBoundary < srcIdx2)
while (! wordBoundaries[TVecIdx(srcIdx2 - origSrcIdx)])
{
--srcIdx2; int cp2 = src[TVecIdx(srcIdx2)];
if (IsCased(cp2)) { casedBefore = true; break; }
}
if (casedBefore) {
// Now we have a FinalCased character.
dest.Add(GreekSmallLetterFinalSigma); Assert(howHere == ccLower); continue; }
}
// If we got here, add a non-final sigma.
dest.Add(GreekSmallLetterSigma); continue;
}
else if (lithuanian)
{
if (howHere == ccLower)
{
if (cp == LatinCapitalLetterI || cp == LatinCapitalLetterJ || cp == LatinCapitalLetterIWithOgonek)
{
bool moreAbove = false;
for (size_t srcIdx2 = srcIdx; srcIdx2 < srcEnd; )
{
const int cp2 = src[TVecIdx(srcIdx2)]; srcIdx2++;
const int cc2 = GetCombiningClass(cp2);
if (cc2 == TUniChInfo::ccStarter) break;
if (cc2 == TUniChInfo::ccAbove) { moreAbove = true; break; }
}
if (moreAbove)
{
if (cp == LatinCapitalLetterI) { dest.Add(0x69); dest.Add(0x307); continue; }
if (cp == LatinCapitalLetterJ) { dest.Add(0x6a); dest.Add(0x307); continue; }
if (cp == LatinCapitalLetterIWithOgonek) { dest.Add(0x12f); dest.Add(0x307); continue; }
}
}
else if (cp == LatinCapitalLetterIWithGrave) { dest.Add(0x69); dest.Add(0x307); dest.Add(0x300); continue; }
else if (cp == LatinCapitalLetterIWithAcute) { dest.Add(0x69); dest.Add(0x307); dest.Add(0x301); continue; }
else if (cp == LatinCapitalLetterIWithTilde) { dest.Add(0x69); dest.Add(0x307); dest.Add(0x303); continue; }
}
if (cp == CombiningDotAbove)
{
// Lithuanian, howHere != ccLower.
// AfterSoftDotted := the last preceding character with a combining class
// of zero before C was Soft_Dotted, and there is no intervening combining
// character class 230 (ABOVE).
bool afterSoftDotted = false;
size_t srcIdx2 = srcIdx - 1; // now srcIdx2 is the index from which we got 'cp'
while (origSrcIdx < srcIdx2)
{
--srcIdx2; int cp2 = src[TVecIdx(srcIdx2)];
int cc2 = GetCombiningClass(cp2);
if (cc2 == TUniChInfo::ccAbove) break;
if (cc2 == TUniChInfo::ccStarter) {
afterSoftDotted = IsSoftDotted(cp2); break; }
}
if (afterSoftDotted)
{
Assert(lithuanian);
// Remove DOT ABOVE after "i" with upper or titlecase.
// - Note: but this must only be done if that "i" was actually placed into uppercase (if how == ccTitle,
// the "i" may have been kept lowercase and thus we shouldn't remove the dot).
if (how == ccLower) { dest.Add(0x307); continue; }
if (how == ccUpper) continue;
Assert(how == ccTitle);
Assert(howHere == ccLower); // because CombiningDotAbove is not a cased character
if (seenCased && ! seenTwoCased) continue; // The "i" has been placed into uppercase; thus, remove the dot.
dest.Add(0x307); continue;
}
}
}
else if (turkic) // language code 'tr' (Turkish) and 'az' (Azeri)
{
// I and i-dotless; I-dot and i are case pairs in Turkish and Azeri
// The following rules handle those cases.
if (cp == LatinCapitalLetterIWithDotAbove) {
dest.Add(howHere == ccLower ? 0x69 : 0x130); continue; }
// When lowercasing, remove dot_above in the sequence I + dot_above,
// which will turn into i. This matches the behavior of the
// canonically equivalent I-dot_above.
else if (cp == CombiningDotAbove)
{
// AfterI: the last preceding base character was an uppercase I,
// and there is no intervening combining character class 230 (ABOVE).
bool afterI = false;
size_t srcIdx2 = srcIdx - 1; // now srcIdx2 is the index from which we got 'cp'
while (origSrcIdx < srcIdx2)
{
--srcIdx2; int cp2 = src[TVecIdx(srcIdx2)];
if (cp2 == LatinCapitalLetterI) { afterI = true; break; }
int cc2 = GetCombiningClass(cp2);
if (cc2 == TUniChInfo::ccAbove || cc2 == TUniChInfo::ccStarter) break;
}
if (afterI) {
if (how == ccTitle && seenCased && ! seenTwoCased) {
// Sec. 3.13 defines title-casing in an unusual way: find the first cased character in each word;
// if found, map it to titlecase; otherwise, map all characters in that word to lowercase.
// This suggests that if a cased character is found, others in that word should be left alone.
// This seems unusual; we map all other characters to lowercase instead.
// But this leads to problems with e.g. I followed by dot-above (U+0307): since the dot-above
// is not the first cased character (it isn't even cased), we attempt to set it to lowercase;
// but since afterI is also true here, this would mean deleting it. Thus our titlecased
// form of "I followed by dot-above" would be just "I", which is clearly wrong.
// So we treat this as a special case here.
IAssert(cpFirstCased == LatinCapitalLetterI);
dest.Add(0x307); continue; }
if (howHere != ccLower) dest.Add(0x307);
continue; }
}
// When lowercasing, unless an I is before a dot_above,
// it turns into a dotless i.
else if (cp == LatinCapitalLetterI)
{
// BeforeDot: C is followed by U+0307 (combining dot above).
// Any sequence of characters with a combining class that is
// neither 0 nor 230 may intervene between the current character
// and the combining dot above.
bool beforeDot = false;
for (size_t srcIdx2 = srcIdx; srcIdx2 < srcEnd; )
{
const int cp2 = src[TVecIdx(srcIdx2)]; srcIdx2++;
if (cp2 == 0x307) { beforeDot = true; break; }
const int cc2 = GetCombiningClass(cp2);
if (cc2 == TUniChInfo::ccStarter || cc2 == TUniChInfo::ccAbove) break;
}
if (! beforeDot) {
dest.Add(howHere == ccLower ? 0x131 : 0x49); continue; }
}
// When uppercasing, i turns into a dotted capital I.
else if (cp == LatinSmallLetterI)
{
dest.Add(howHere == ccLower ? 0x69 : 0x130); continue;
}
}
// Try to use the unconditional mappings.
const TIntIntVH &specHere = (
howHere == how ? specials :
howHere == ccLower ? specialCasingLower :
howHere == ccTitle ? specialCasingTitle :
howHere == ccUpper ? specialCasingUpper : *((TIntIntVH *) 0));
int i = specHere.GetKeyId(cp);
if (i >= 0) { TUniCaseFolding::AppendVector(specHere[i], dest); continue; }
// Try to use the simple (one-character) mappings.
i = h.GetKeyId(cp);
if (i >= 0) {
const TUniChInfo &ci = h[i];
int cpNew = (
howHere == ccLower ? ci.simpleLowerCaseMapping :
howHere == ccUpper ? ci.simpleUpperCaseMapping :
ci.simpleTitleCaseMapping);
if (cpNew < 0) cpNew = cp;
dest.Add(cpNew); continue; }
// As a final resort, leave 'cp' unchanged.
dest.Add(cp);
}
}
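A hedged sketch of the language-sensitive behaviour: the turkic flag switches on the dotted/dotless-i rules handled above, so an uppercase I lowercases to U+0131 rather than U+0069. Hypothetical database path.
TUniChDb db; db.LoadBin("UnicodeDefs.bin");            // hypothetical path
TIntV src, plain, turkish;
src.Add(0x0049);                                       // 'I'
db.GetLowerCase(src, plain, true, /*turkic=*/false);   // expected: U+0069 ('i')
db.GetLowerCase(src, turkish, true, /*turkic=*/true);  // expected: U+0131 (dotless i)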
| void TUniChDb::GetCaseFolded | ( | const TSrcVec & | src, |
| size_t | srcIdx, | ||
| const size_t | srcCount, | ||
| TVec< TDestCh > & | dest, | ||
| const bool | clrDest, | ||
| const bool | full, | ||
| const bool | turkic = false |
||
| ) | const [inline] |
Definition at line 1629 of file unicode.h.
{ caseFolding.Fold(src, srcIdx, srcCount, dest, clrDest, full, turkic); }
| void TUniChDb::GetCaseFolded | ( | const TSrcVec & | src, |
| TVec< TDestCh > & | dest, | ||
| const bool | clrDest = true, |
||
| const bool | full = true, |
||
| const bool | turkic = false |
||
| ) | const [inline] |
Definition at line 1632 of file unicode.h.
{
GetCaseFolded(src, 0, src.Len(), dest, clrDest, full, turkic); }
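Case folding is meant for caseless comparison rather than for display; a brief sketch that compares two code-point vectors after folding (hypothetical database path, inputs left as placeholders):
TUniChDb db; db.LoadBin("UnicodeDefs.bin");  // hypothetical path
TIntV a, b, fa, fb;
// ... fill 'a' and 'b' with the code points of the two strings ...
db.GetCaseFolded(a, fa);                     // defaults: clrDest = true, full = true, turkic = false
db.GetCaseFolded(b, fb);
bool equal = (fa.Len() == fb.Len());
for (int i = 0; equal && i < fa.Len(); i++) equal = (fa[i] == fb[i]);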
| static TStr TUniChDb::GetCaseFoldingFn | ( | ) | [inline, static] |
| TUniChCategory TUniChDb::GetCat | ( | const int | cp | ) | const [inline] |
| const char* TUniChDb::GetCharName | ( | const int | cp | ) | const [inline] |
| TStr TUniChDb::GetCharNameS | ( | const int | cp | ) | const [inline] |
Definition at line 1332 of file unicode.h.
{
// ToDo: Add special processing for precomposed Hangul syllables (UAX #15, sec. 16).
const char *p = GetCharName(cp); if (p) return p;
char buf[20]; sprintf(buf, "U+%04x", cp); return TStr(buf); }
| int TUniChDb::GetCombiningClass | ( | const int | cp | ) | const [inline] |
| static TStr TUniChDb::GetCompositionExclusionsFn | ( | ) | [inline, static] |
| static TStr TUniChDb::GetDerivedCorePropsFn | ( | ) | [inline, static] |
| static TStr TUniChDb::GetLineBreakFn | ( | ) | [inline, static] |
| void TUniChDb::GetLowerCase | ( | const TSrcVec & | src, |
| size_t | srcIdx, | ||
| const size_t | srcCount, | ||
| TVec< TDestCh > & | dest, | ||
| const bool | clrDest = true, |
||
| const bool | turkic = false, |
||
| const bool | lithuanian = false |
||
| ) | const [inline] |
Definition at line 1590 of file unicode.h.
{ GetCaseConverted(src, srcIdx, srcCount, dest, clrDest, ccLower, turkic, lithuanian); }
| void TUniChDb::GetLowerCase | ( | const TSrcVec & | src, |
| TVec< TDestCh > & | dest, | ||
| const bool | clrDest = true, |
||
| const bool | turkic = false, |
||
| const bool | lithuanian = false |
||
| ) | const [inline] |
Definition at line 1593 of file unicode.h.
{ GetLowerCase(src, 0, src.Len(), dest, clrDest, turkic, lithuanian); }
| static TStr TUniChDb::GetNormalizationTestFn | ( | ) | [inline, static] |
| static TStr TUniChDb::GetPropListFn | ( | ) | [inline, static] |
| int TUniChDb::GetSbFlags | ( | const int | cp | ) | const [inline] |
| int TUniChDb::GetScript | ( | const TUniChInfo & | ci | ) | const [inline] |
Definition at line 1323 of file unicode.h.
{ int s = ci.script; if (s < 0) s = scriptUnknown; return s; }
| int TUniChDb::GetScript | ( | const int | cp | ) | const [inline] |
| int TUniChDb::GetScriptByName | ( | const TStr & | scriptName | ) | const [inline] |
| const TStr& TUniChDb::GetScriptName | ( | const int | scriptId | ) | const [inline] |
| static TStr TUniChDb::GetScriptNameHiragana | ( | ) | [inline, static] |
| static TStr TUniChDb::GetScriptNameKatakana | ( | ) | [inline, static] |
| static TStr TUniChDb::GetScriptNameUnknown | ( | ) | [inline, static] |
| static TStr TUniChDb::GetScriptsFn | ( | ) | [inline, static] |
| static TStr TUniChDb::GetSentenceBreakPropertyFn | ( | ) | [inline, static] |
| static TStr TUniChDb::GetSentenceBreakTestFn | ( | ) | [inline, static] |
| void TUniChDb::GetSimpleCaseConverted | ( | const TSrcVec & | src, |
| size_t | srcIdx, | ||
| const size_t | srcCount, | ||
| TVec< TDestCh > & | dest, | ||
| const bool | clrDest, | ||
| const TCaseConversion | how | ||
| ) | const |
Definition at line 3036 of file unicode.h.
{
if (clrDest) dest.Clr();
bool seenCased = false; size_t nextWordBoundary = srcIdx;
for (const size_t origSrcIdx = srcIdx, srcEnd = srcIdx + srcCount; srcIdx < srcEnd; )
{
const int cp = src[TVecIdx(srcIdx)]; srcIdx++;
int i = h.GetKeyId(cp); if (i < 0) { dest.Add(cp); continue; }
const TUniChInfo &ci = h[i];
// With titlecasing, the first cased character of each word must be put into titlecase,
// all others into lowercase. This is what the howHere variable is for.
TUniChDb::TCaseConversion howHere;
if (how != ccTitle) howHere = how;
else {
if (srcIdx - 1 == nextWordBoundary) { // A word starts/ends here.
seenCased = false;
size_t next = nextWordBoundary; FindNextWordBoundary(src, origSrcIdx, srcCount, next);
IAssert(next > nextWordBoundary); nextWordBoundary = next; }
bool isCased = IsCased(cp);
if (isCased && ! seenCased) { howHere = ccTitle; seenCased = true; }
else howHere = ccLower;
}
int cpNew = (howHere == ccTitle ? ci.simpleTitleCaseMapping : howHere == ccUpper ? ci.simpleUpperCaseMapping : ci.simpleLowerCaseMapping);
if (cpNew < 0) cpNew = cp;
dest.Add(cpNew);
}
}
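The simple conversions always map one code point to one code point, so they cannot expand a character the way the full conversions (GetCaseConverted and its wrappers) can. The classic difference is U+00DF LATIN SMALL LETTER SHARP S; a hedged sketch with a hypothetical database path:
TUniChDb db; db.LoadBin("UnicodeDefs.bin");  // hypothetical path
TIntV src, simple, full;
src.Add(0x00DF);                             // 'ß'
db.GetSimpleUpperCase(src, simple);          // no one-to-one uppercase mapping in UnicodeData, so U+00DF is kept
db.GetUpperCase(src, full);                  // the full mapping expands to "SS": U+0053 U+0053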
| void TUniChDb::GetSimpleLowerCase | ( | const TSrcVec & | src, |
| size_t | srcIdx, | ||
| const size_t | srcCount, | ||
| TVec< TDestCh > & | dest, | ||
| const bool | clrDest = true |
||
| ) | const [inline] |
Definition at line 1601 of file unicode.h.
{ GetSimpleCaseConverted(src, srcIdx, srcCount, dest, clrDest, ccLower); }
| void TUniChDb::GetSimpleLowerCase | ( | const TSrcVec & | src, |
| TVec< TDestCh > & | dest, | ||
| const bool | clrDest = true |
||
| ) | const [inline] |
Definition at line 1604 of file unicode.h.
{ GetSimpleLowerCase(src, 0, src.Len(), dest, clrDest); }
| void TUniChDb::GetSimpleTitleCase | ( | const TSrcVec & | src, |
| size_t | srcIdx, | ||
| const size_t | srcCount, | ||
| TVec< TDestCh > & | dest, | ||
| const bool | clrDest = true |
||
| ) | const [inline] |
Definition at line 1603 of file unicode.h.
{ GetSimpleCaseConverted(src, srcIdx, srcCount, dest, clrDest, ccTitle); }
| void TUniChDb::GetSimpleTitleCase | ( | const TSrcVec & | src, |
| TVec< TDestCh > & | dest, | ||
| const bool | clrDest = true |
||
| ) | const [inline] |
Definition at line 1606 of file unicode.h.
{ GetSimpleTitleCase(src, 0, src.Len(), dest, clrDest); }
| void TUniChDb::GetSimpleUpperCase | ( | const TSrcVec & | src, |
| size_t | srcIdx, | ||
| const size_t | srcCount, | ||
| TVec< TDestCh > & | dest, | ||
| const bool | clrDest = true |
||
| ) | const [inline] |
Definition at line 1602 of file unicode.h.
{ GetSimpleCaseConverted(src, srcIdx, srcCount, dest, clrDest, ccUpper); }
| void TUniChDb::GetSimpleUpperCase | ( | const TSrcVec & | src, |
| TVec< TDestCh > & | dest, | ||
| const bool | clrDest = true |
||
| ) | const [inline] |
Definition at line 1605 of file unicode.h.
{ GetSimpleUpperCase(src, 0, src.Len(), dest, clrDest); }
| static TStr TUniChDb::GetSpecialCasingFn | ( | ) | [inline, static] |
| TUniChSubCategory TUniChDb::GetSubCat | ( | const int | cp | ) | const [inline] |
| void TUniChDb::GetTitleCase | ( | const TSrcVec & | src, |
| size_t | srcIdx, | ||
| const size_t | srcCount, | ||
| TVec< TDestCh > & | dest, | ||
| const bool | clrDest = true, |
||
| const bool | turkic = false, |
||
| const bool | lithuanian = false |
||
| ) | const [inline] |
Definition at line 1592 of file unicode.h.
{ GetCaseConverted(src, srcIdx, srcCount, dest, clrDest, ccTitle, turkic, lithuanian); }
| void TUniChDb::GetTitleCase | ( | const TSrcVec & | src, |
| TVec< TDestCh > & | dest, | ||
| const bool | clrDest = true, |
||
| const bool | turkic = false, |
||
| const bool | lithuanian = false |
||
| ) | const [inline] |
Definition at line 1595 of file unicode.h.
{ GetTitleCase(src, 0, src.Len(), dest, clrDest, turkic, lithuanian); }
| static TStr TUniChDb::GetUnicodeDataFn | ( | ) | [inline, static] |
| void TUniChDb::GetUpperCase | ( | const TSrcVec & | src, |
| size_t | srcIdx, | ||
| const size_t | srcCount, | ||
| TVec< TDestCh > & | dest, | ||
| const bool | clrDest = true, |
||
| const bool | turkic = false, |
||
| const bool | lithuanian = false |
||
| ) | const [inline] |
Definition at line 1591 of file unicode.h.
{ GetCaseConverted(src, srcIdx, srcCount, dest, clrDest, ccUpper, turkic, lithuanian); }
| void TUniChDb::GetUpperCase | ( | const TSrcVec & | src, |
| TVec< TDestCh > & | dest, | ||
| const bool | clrDest = true, |
||
| const bool | turkic = false, |
||
| const bool | lithuanian = false |
||
| ) | const [inline] |
Definition at line 1594 of file unicode.h.
{ GetUpperCase(src, 0, src.Len(), dest, clrDest, turkic, lithuanian); }
| int TUniChDb::GetWbFlags | ( | const int | cp | ) | const [inline] |
| static TStr TUniChDb::GetWordBreakPropertyFn | ( | ) | [inline, static] |
| static TStr TUniChDb::GetWordBreakTestFn | ( | ) | [inline, static] |
| void TUniChDb::InitAfterLoad | ( | ) | [protected] |
Definition at line 1372 of file unicode.cpp.
{
scriptUnknown = GetScriptByName(GetScriptNameUnknown()); IAssert(scriptUnknown >= 0);
}
| void TUniChDb::InitDerivedCoreProperties | ( | const TStr & | basePath | ) | [protected] |
Definition at line 1011 of file unicode.cpp.
{
TUcdFileReader reader; TStrV fields; int nCps = 0, nLines = 0;
reader.Open(CombinePath(basePath, GetDerivedCorePropsFn()));
TSubcatHelper helper(*this);
while (reader.GetNextLine(fields))
{
IAssert(fields.Len() == 2);
int from, to; reader.ParseCodePointRange(fields[0], from, to);
TStr s = fields[1];
TUniChFlags flag = ucfCompatibilityDecomposition;
if (s == "Math") flag = ucfDcpMath;
else if (s == "Alphabetic") flag = ucfDcpAlphabetic;
else if (s == "Lowercase") flag = ucfDcpLowercase;
else if (s == "Uppercase") flag = ucfDcpUppercase;
else if (s == "ID_Start") flag = ucfDcpIdStart;
else if (s == "ID_Continue") flag = ucfDcpIdContinue;
else if (s == "XID_Start") flag = ucfDcpXidStart;
else if (s == "XID_Continue") flag = ucfDcpXidContinue;
else if (s == "Default_Ignorable_Code_Point") flag = ucfDcpDefaultIgnorableCodePoint;
else if (s == "Grapheme_Extend") flag = ucfDcpGraphemeExtend;
else if (s == "Grapheme_Base") flag = ucfDcpGraphemeBase;
else if (s == "Grapheme_Link") continue; // this flag is deprecated; test for combClass == Virama instead
else FailR(s.CStr());
// If we add new codepoints to the hash table, we should also set their category.
// This is supposed to be provided in the comment, e.g. "# Cf SOFT HYPHEN".
helper.ProcessComment(reader);
//
for (int cp = from; cp <= to; cp++) {
int i = h.GetKeyId(cp); if (i < 0) { i = h.AddKey(cp); helper.SetCat(cp); }
helper.TestCat(cp);
TUniChInfo &ci = h[i]; IAssert(! ci.IsDcpFlag(flag));
ci.SetDcpFlag(flag); nCps++; }
nLines++;
}
reader.Close();
printf("TUniChDb::InitDerivedCoreProperties: %d lines, %d code points.\n", nLines, nCps);
}
| void TUniChDb::InitLineBreaks | ( | const TStr & | basePath | ) | [protected] |
Definition at line 1050 of file unicode.cpp.
{
// Clear old linebreak values.
ushort xx = TUniChInfo::LineBreak_Unknown;
for (int i = h.FFirstKeyId(); h.FNextKeyId(i); ) h[i].lineBreak = xx;
// Read LineBreak.txt.
TUcdFileReader reader; TStrV fields;
reader.Open(CombinePath(basePath, GetLineBreakFn()));
int nLines = 0, nCps = 0;
while (reader.GetNextLine(fields))
{
IAssert(fields.Len() == 2);
int from, to; reader.ParseCodePointRange(fields[0], from, to);
TStr s = fields[1]; IAssert(s.Len() == 2);
ushort us = TUniChInfo::GetLineBreakCode(s[0], s[1]);
if (us == xx) continue;
for (int cp = from; cp <= to; cp++) {
int i = h.GetKeyId(cp); if (i < 0) { i = h.AddKey(cp);
printf("TUniChDb::InitLineBreaks: warning, adding codepoint %d, its category will remain unknown.\n", cp); }
IAssert(h[i].lineBreak == xx);
h[i].lineBreak = us; nCps++; }
nLines++;
}
reader.Close();
printf("TUniChDb::InitLineBreaks: %d lines, %d codepoints processed (excluding \'xx\' values).\n", nLines, nCps);
}
| void TUniChDb::InitPropList | ( | const TStr & | basePath | ) | [protected] |
Definition at line 954 of file unicode.cpp.
{
TUcdFileReader reader; TStrV fields; int nCps = 0, nLines = 0;
reader.Open(CombinePath(basePath, GetPropListFn()));
TSubcatHelper helper(*this);
while (reader.GetNextLine(fields))
{
IAssert(fields.Len() == 2);
int from, to; reader.ParseCodePointRange(fields[0], from, to);
TStr s = fields[1];
TUniChProperties prop = TUniChProperties(0); TUniChPropertiesX propx = TUniChPropertiesX(0);
if (s == "White_Space") prop = ucfPrWhiteSpace;
else if (s == "Bidi_Control") prop = ucfPrBidiControl;
else if (s == "Join_Control") prop = ucfPrJoinControl;
else if (s == "Dash") prop = ucfPrDash;
else if (s == "Hyphen") prop = ucfPrHyphen;
else if (s == "Quotation_Mark") prop = ucfPrQuotationMark;
else if (s == "Terminal_Punctuation") prop = ucfPrTerminalPunctuation;
else if (s == "Other_Math") propx = ucfPxOtherMath;
else if (s == "Hex_Digit") prop = ucfPrHexDigit;
else if (s == "ASCII_Hex_Digit") prop = ucfPrAsciiHexDigit;
else if (s == "Other_Alphabetic") propx = ucfPxOtherAlphabetic;
else if (s == "Ideographic") prop = ucfPrIdeographic;
else if (s == "Diacritic") prop = ucfPrDiacritic;
else if (s == "Extender") prop = ucfPrExtender;
else if (s == "Other_Lowercase") propx = ucfPxOtherLowercase;
else if (s == "Other_Uppercase") propx = ucfPxOtherUppercase;
else if (s == "Noncharacter_Code_Point") prop = ucfPrNoncharacterCodePoint;
else if (s == "Other_Grapheme_Extend") propx = ucfPxOtherGraphemeExtend;
else if (s == "IDS_Binary_Operator") propx = ucfPxIdsBinaryOperator;
else if (s == "IDS_Trinary_Operator") propx = ucfPxIdsTrinaryOperator;
else if (s == "Radical") propx = ucfPxRadical;
else if (s == "Unified_Ideograph") propx = ucfPxUnifiedIdeograph;
else if (s == "Other_Default_Ignorable_Code_Point") propx = ucfPxOtherDefaultIgnorableCodePoint;
else if (s == "Deprecated") prop = ucfPrDeprecated;
else if (s == "Soft_Dotted") prop = ucfPrSoftDotted;
else if (s == "Logical_Order_Exception") prop = ucfPrLogicalOrderException;
else if (s == "Other_ID_Start") propx = ucfPxOtherIdStart;
else if (s == "Other_ID_Continue") propx = ucfPxOtherIdContinue;
else if (s == "STerm") prop = ucfPrSTerm;
else if (s == "Variation_Selector") prop = ucfPrVariationSelector;
else if (s == "Pattern_White_Space") prop = ucfPrPatternWhiteSpace;
else if (s == "Pattern_Syntax") prop = ucfPrPatternSyntax;
else FailR(s.CStr());
helper.ProcessComment(reader);
for (int cp = from; cp <= to; cp++) {
int i = h.GetKeyId(cp); if (i < 0) { i = h.AddKey(cp); helper.SetCat(cp); }
TUniChInfo &ci = h[i]; helper.TestCat(cp);
if (prop) { IAssert(! ci.IsProperty(prop)); ci.SetProperty(prop); }
if (propx) { IAssert(! ci.IsPropertyX(propx)); ci.SetPropertyX(propx); }
nCps++; }
nLines++;
}
reader.Close();
printf("TUniChDb::InitPropList: %d lines, %d code points.\n", nLines, nCps);
}
| void TUniChDb::InitScripts | ( | const TStr & | basePath | ) | [protected] |
Definition at line 1077 of file unicode.cpp.
{
TUcdFileReader reader; TStrV fields;
reader.Open(CombinePath(basePath, GetScriptsFn()));
TSubcatHelper helper(*this);
while (reader.GetNextLine(fields))
{
int from, to; reader.ParseCodePointRange(fields[0], from, to);
TStr scriptName = fields[1];
int scriptNo = scripts.GetKeyId(scriptName);
if (scriptNo < 0) { scriptNo = scripts.AddKey(scriptName); scripts[scriptNo] = 0; }
IAssert(scriptNo >= 0 && scriptNo < SCHAR_MAX); // because TUniChInfo.script is a signed char
scripts[scriptNo] += 1;
helper.ProcessComment(reader);
for (int cp = from; cp <= to; cp++) {
int i = h.GetKeyId(cp); if (i < 0) { i = h.AddKey(cp); helper.SetCat(cp); }
helper.TestCat(cp);
TUniChInfo &ci = h[i]; ci.script = scriptNo; }
}
reader.Close();
scripts.AddDat(GetScriptNameUnknown()) = 0;
printf("TUniChDb::InitScripts: %d scripts: ", scripts.Len());
if (AlwaysFalse()) for (int i = scripts.FFirstKeyId(); scripts.FNextKeyId(i); )
printf(" %d:%s (%d)", i, scripts.GetKey(i).CStr(), int(scripts[i]));
printf("\n");
}
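Once the script assignments are in place (whether built here or loaded from the binary dump), they are queried through the public script accessors. A minimal sketch, assuming a fully loaded TUniChDb instance named db; the codepoint is just an example:

  int cp = 0x30A2;                          // KATAKANA LETTER A
  int scriptId = db.GetScript(cp);          // falls back to the "Unknown" script if unassigned
  printf("U+%04x -> %s\n", cp, db.GetScriptName(scriptId).CStr());
  int latin = db.GetScriptByName("Latin");  // negative if no script of that name was read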
| void TUniChDb::InitSpecialCasing | ( | const TStr & | basePath | ) | [protected] |
Definition at line 1229 of file unicode.cpp.
{
TUcdFileReader reader; TStrV fields;
reader.Open(CombinePath(basePath, GetSpecialCasingFn()));
while (reader.GetNextLine(fields))
{
IAssert(fields.Len() == 5 || fields.Len() == 6);
IAssert(fields.Last().Empty());
// Skip conditional mappings -- they will be hardcoded in the GetCaseConverted method.
TStr conditions = "";
if (fields.Len() == 6) conditions = fields[4];
conditions.ToTrunc(); if (! conditions.Empty()) continue;
// Keep the other mappings.
const int cp = reader.ParseCodePoint(fields[0]);
TIntV v; reader.ParseCodePointList(fields[1], v);
specialCasingLower.AddDat(cp, v);
reader.ParseCodePointList(fields[2], v);
specialCasingTitle.AddDat(cp, v);
reader.ParseCodePointList(fields[3], v);
specialCasingUpper.AddDat(cp, v);
}
reader.Close();
}
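The fields read here follow the SpecialCasing.txt convention: codepoint; lowercase mapping; titlecase mapping; uppercase mapping; optional condition list; and an empty field after the final semicolon. Only the unconditional entries are stored; an illustrative line of the kind that is kept:

  00DF; 00DF; 0053 0073; 0053 0053; # LATIN SMALL LETTER SHARP S

Entries whose condition field is non-empty (e.g. a language code such as tr, or a context like After_I) are skipped here and handled directly in GetCaseConverted.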
| void TUniChDb::InitWordAndSentenceBoundaryFlags | ( | const TStr & | basePath | ) | [protected] |
Definition at line 1104 of file unicode.cpp.
{
// UAX #29, sec. 4.1 and 5.1.
// Note: these flags can also be initialized from auxiliary\\WordBreakProperty.txt.
int katakana = GetScriptByName(GetScriptNameKatakana()); IAssert(katakana >= 0);
int hiragana = GetScriptByName(GetScriptNameHiragana()); IAssert(hiragana >= 0);
// Clear any existing word-boundary flags and initialize them again.
for (int i = h.FFirstKeyId(); h.FNextKeyId(i); )
{
const int cp = h.GetKey(i); TUniChInfo& ci = h[i];
ci.ClrWbAndSbFlags();
// Word-boundary flags.
if (ci.subCat == ucOtherFormat && cp != 0x200c && cp != 0x200d) ci.SetWbFlag(ucfWbFormat);
if (ci.script == katakana) ci.SetWbFlag(ucfWbKatakana);
if (ci.lineBreak == TUniChInfo::LineBreak_InfixNumeric && cp != 0x3a) ci.SetWbFlag(ucfWbMidNum);
if (ci.lineBreak == TUniChInfo::LineBreak_Numeric) ci.SetWbFlag(ucfWbNumeric);
if (ci.subCat == ucPunctuationConnector) ci.SetWbFlag(ucfWbExtendNumLet);
// Sentence-boundary flags. Some are identical to some word-boundary flags.
if (cp == 0xa || cp == 0xd || cp == 0x85 || cp == 0x2028 || cp == 0x2029) ci.SetSbFlag(ucfSbSep);
if (ci.subCat == ucOtherFormat && cp != 0x200c && cp != 0x200d) ci.SetSbFlag(ucfSbFormat);
if (ci.IsWhiteSpace() && ! ci.IsSbFlag(ucfSbSep) && cp != 0xa0) ci.SetSbFlag(ucfSbSp);
if (ci.IsLowercase() && ! ci.IsGraphemeExtend()) ci.SetSbFlag(ucfSbLower);
if (ci.IsUppercase() || ci.subCat == ucLetterTitlecase) ci.SetSbFlag(ucfSbUpper);
if ((ci.IsAlphabetic() || cp == 0xa0 || cp == 0x5f3) && ! ci.IsSbFlag(ucfSbLower) && ! ci.IsSbFlag(ucfSbUpper) && ! ci.IsGraphemeExtend()) ci.SetSbFlag(ucfSbOLetter);
if (ci.lineBreak == TUniChInfo::LineBreak_Numeric) ci.SetSbFlag(ucfSbNumeric);
if (cp == 0x2e) ci.SetSbFlag(ucfSbATerm);
// Note: UAX #29 says that if the property STerm = true, then the character should belong to the STerm class for
// the purposes of sentence-boundary detection. Now in PropList.txt there is no doubt that 002E has the STerm
// property; thus, it should also belong to the STerm sentence-boundary class. However, in
// SentenceBreakProperty.txt, 002E is only listed in the ATerm class, but not in the STerm class.
if (ci.IsSTerminal() && cp != 0x2e) ci.SetSbFlag(ucfSbSTerm);
if ((ci.subCat == ucPunctuationOpen || ci.subCat == ucPunctuationClose || ci.lineBreak == TUniChInfo::LineBreak_Quotation) && cp != 0x5f3 && ! ci.IsSbFlag(ucfSbATerm) && ! ci.IsSbFlag(ucfSbSTerm)) ci.SetSbFlag(ucfSbClose);
}
// Some additional characters for Katakana and MidLetter.
TIntV v = (VB, 0x3031, 0x3032, 0x3033, 0x3034, 0x3035, 0x309b, 0x309c, 0x30a0, 0x30fc, 0xff70, 0xff9e, 0xff9f);
for (int i = 0; i < v.Len(); i++) h.GetDat(v[i]).SetWbFlag(ucfWbKatakana);
v = (VB, 0x27, 0xb7, 0x5f4, 0x2019, 0x2027, 0x3a);
for (int i = 0; i < v.Len(); i++) h.GetDat(v[i]).SetWbFlag(ucfWbMidLetter);
// WbALetter depends on Katakana, so it cannot be initialized earlier.
for (int i = h.FFirstKeyId(); h.FNextKeyId(i); )
{
const int cp = h.GetKey(i); TUniChInfo& ci = h[i];
if ((ci.IsAlphabetic() || cp == 0x5f3) && ! ci.IsIdeographic() && ! ci.IsWbFlag(ucfWbKatakana) && ci.lineBreak != TUniChInfo::LineBreak_ComplexContext && ci.script != hiragana && ! ci.IsGraphemeExtend())
ci.SetWbFlag(ucfWbALetter);
}
// An alternative is to extract the flags from WordBreakProperty.txt.
// The results should be the same.
{TUcdFileReader reader; TStrV fields;
reader.Open(CombinePath(CombinePath(basePath, GetAuxiliaryDir()), GetWordBreakPropertyFn()));
THash<TInt, TInt> hh;
while (reader.GetNextLine(fields))
{
IAssert(fields.Len() == 2);
int from, to; reader.ParseCodePointRange(fields[0], from, to);
TStr s = fields[1];
TUniChFlags flag = ucfCompatibilityDecomposition;
if (s == "Format") flag = ucfWbFormat;
else if (s == "Katakana") flag = ucfWbKatakana;
else if (s == "ALetter") flag = ucfWbALetter;
else if (s == "MidLetter") flag = ucfWbMidLetter;
else if (s == "MidNum") flag = ucfWbMidNum;
else if (s == "Numeric") flag = ucfWbNumeric;
else if (s == "ExtendNumLet") flag = ucfWbExtendNumLet;
else FailR(s.CStr());
for (int c = from; c <= to; c++) {
int i = hh.GetKeyId(c); if (i < 0) hh.AddDat(c, flag);
else hh[i].Val |= flag; }
}
reader.Close();
TIntV cps; for (int i = h.FFirstKeyId(); h.FNextKeyId(i); ) cps.Add(h.GetKey(i));
for (int i = hh.FFirstKeyId(); hh.FNextKeyId(i); ) cps.Add(hh.GetKey(i));
cps.Sort(); cps.Merge();
for (int i = 0; i < cps.Len(); i++)
{
int cp = cps[i];
int flags1 = 0; if (h.IsKey(cp)) flags1 = h.GetDat(cp).GetWbFlags();
int flags2 = 0; if (hh.IsKey(cp)) flags2 = hh.GetDat(cp);
flags1 &= ~ucfSbSep; flags2 &= ~ucfSbSep;
if (flags1 != flags2) {
printf("cp = %04x: flags1 = %08x flags2 = %08x xor = %08x\n", cp, flags1, flags2, flags1 ^ flags2);
Fail; }
}}
// Likewise, for sentence boundary flags we have SentenceBreakProperty.txt.
{TUcdFileReader reader; TStrV fields;
reader.Open(CombinePath(CombinePath(basePath, GetAuxiliaryDir()), GetSentenceBreakPropertyFn()));
THash<TInt, TInt> hh;
while (reader.GetNextLine(fields))
{
IAssert(fields.Len() == 2);
int from, to; reader.ParseCodePointRange(fields[0], from, to);
TStr s = fields[1];
TUniChFlags flag = ucfCompatibilityDecomposition;
if (s == "Sep") flag = ucfSbSep;
else if (s == "Format") flag = ucfSbFormat;
else if (s == "Sp") flag = ucfSbSp;
else if (s == "Lower") flag = ucfSbLower;
else if (s == "Upper") flag = ucfSbUpper;
else if (s == "OLetter") flag = ucfSbOLetter;
else if (s == "Numeric") flag = ucfSbNumeric;
else if (s == "ATerm") flag = ucfSbATerm;
else if (s == "STerm") flag = ucfSbSTerm;
else if (s == "Close") flag = ucfSbClose;
else FailR(s.CStr());
for (int c = from; c <= to; c++) {
int i = hh.GetKeyId(c); if (i < 0) hh.AddDat(c, flag);
else hh[i].Val |= flag; }
}
reader.Close();
TIntV cps; for (int i = h.FFirstKeyId(); h.FNextKeyId(i); ) cps.Add(h.GetKey(i));
for (int i = hh.FFirstKeyId(); hh.FNextKeyId(i); ) cps.Add(hh.GetKey(i));
cps.Sort(); cps.Merge();
for (int i = 0; i < cps.Len(); i++)
{
int cp = cps[i];
int flags1 = 0; if (h.IsKey(cp)) flags1 = h.GetDat(cp).GetSbFlags();
int flags2 = 0; if (hh.IsKey(cp)) flags2 = hh.GetDat(cp);
if (flags1 != flags2) {
printf("cp = %04x: flags1 = %08x [%s] flags2 = %08x [%s] xor = %08x\n", cp,
flags1, TUniChInfo::GetSbFlagsStr(flags1).CStr(),
flags2, TUniChInfo::GetSbFlagsStr(flags2).CStr(),
flags1 ^ flags2);
Fail; }
}}
}
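The flags assigned here are what the public word- and sentence-boundary queries report. A minimal sketch of inspecting them on a loaded database db, using the same flag enumerators as the code above; the codepoint is just an example:

  int cp = 0x2E;   // FULL STOP
  printf("WB flags: %s\n", TUniChInfo::GetWbFlagsStr(db.GetWbFlags(cp)).CStr());
  printf("SB flags: %s\n", TUniChInfo::GetSbFlagsStr(db.GetSbFlags(cp)).CStr());
  bool isATerm = db.IsSbFlag(cp, ucfSbATerm);   // true for '.', as set above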
| bool TUniChDb::IsGetChInfo | ( | const int | cp, |
| TUniChInfo & | ChInfo | ||
| ) | [inline] |
| DECLARE_FORWARDED_PROPERTY_METHODS bool TUniChDb::IsPrivateUse | ( | const int | cp | ) | const [inline] |
| bool TUniChDb::IsSbFlag | ( | const int | cp, |
| const TUniChFlags | flag | ||
| ) | const [inline] |
| bool TUniChDb::IsSurrogate | ( | const int | cp | ) | const [inline] |
| bool TUniChDb::IsWbFlag | ( | const int | cp, |
| const TUniChFlags | flag | ||
| ) | const [inline] |
| static bool TUniChDb::IsWbIgnored | ( | const TUniChInfo & | ci | ) | [inline, static, protected] |
Definition at line 1419 of file unicode.h.
{ return ci.IsGbExtend() || ci.IsWbFormat(); }
| bool TUniChDb::IsWbIgnored | ( | const int | cp | ) | const [inline, protected] |
| void TUniChDb::Load | ( | TSIn & | SIn | ) | [inline] |
Definition at line 1285 of file unicode.h.
{
h.Load(SIn); charNames.~TStrPool(); new (&charNames) TStrPool(SIn);
decompositions.Load(SIn);
inverseDec.Load(SIn); caseFolding.Load(SIn); scripts.Load(SIn);
specialCasingLower.Load(SIn); specialCasingUpper.Load(SIn); specialCasingTitle.Load(SIn);
SIn.LoadCs(); InitAfterLoad(); }
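In typical use the database is deserialized from a previously generated binary dump rather than rebuilt from the UCD text files. A minimal sketch; the file name is a placeholder:

  TUniChDb db;
  { PSIn SIn = TFIn::New("UnicodeDefs.bin"); db.Load(*SIn); }
  // or, equivalently, via the convenience wrapper:
  // db.LoadBin("UnicodeDefs.bin");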
| void TUniChDb::LoadBin | ( | const TStr & | fnBin | ) | [inline] |
| void TUniChDb::LoadTxt | ( | const TStr & | basePath | ) |
Definition at line 1253 of file unicode.cpp.
{
Clr();
// Set up a hash table with enough slots that there will be more or less no chains longer than 1 element.
h = THash<TInt, TUniChInfo>(196613, true);
//
caseFolding.LoadTxt(CombinePath(basePath, GetCaseFoldingFn()));
//
TUcdFileReader reader; TStrV fields; TIntH seen;
reader.Open(CombinePath(basePath, GetUnicodeDataFn()));
while (reader.GetNextLine(fields))
{
// Codepoint.
int cp = reader.ParseCodePoint(fields[0]);
IAssert(! seen.IsKey(cp)); seen.AddKey(cp);
TUniChInfo& ci = h.AddDat(cp);
// Name.
ci.nameOffset = charNames.AddStr(fields[1]);
// Category.
TStr& s = fields[2]; IAssert(s.Len() == 2);
ci.chCat = s[0]; ci.chSubCat = s[1];
// Canonical combining class.
s = fields[3]; IAssert(s.Len() > 0);
int i; bool ok = s.IsInt(true, TUCh::Mn, TUCh::Mx, i); IAssertR(ok, s);
ci.combClass = (uchar) i;
// Decomposition type and mapping.
LoadTxt_ProcessDecomposition(ci, fields[5]);
// Simple case mappings.
s = fields[12]; ci.simpleUpperCaseMapping = (! s.Empty() ? reader.ParseCodePoint(s) : -1);
s = fields[13]; ci.simpleLowerCaseMapping = (! s.Empty() ? reader.ParseCodePoint(s) : -1);
s = fields[14]; ci.simpleTitleCaseMapping = (! s.Empty() ? reader.ParseCodePoint(s) : -1);
//
ci.InitAfterLoad(); // initializes ci.cat, ci.subCat
}
reader.Close();
//
InitScripts(basePath);
//
InitPropList(basePath);
InitDerivedCoreProperties(basePath);
InitLineBreaks(basePath);
InitSpecialCasing(basePath);
// Process the composition exclusions (UAX #15, sec. 6).
for (int i = h.FFirstKeyId(); h.FNextKeyId(i); )
{
TUniChInfo& ci = h[i];
int ofs = ci.decompOffset; if (ofs < 0) continue;
int n = 0; while (decompositions[ofs + n] >= 0) n++;
IAssert(n > 0);
// Singleton decompositions.
if (n == 1) { ci.flags |= ucfCompositionExclusion; continue; }
// Non-starter decompositions.
int cp1 = decompositions[ofs];
IAssert(h.IsKey(cp1));
uchar ccc = h.GetDat(cp1).combClass;
if (ccc != TUniChInfo::ccStarter) { ci.flags |= ucfCompositionExclusion; continue; }
}
// Process the composition exclusion table.
reader.Open(CombinePath(basePath, GetCompositionExclusionsFn()));
int nExclusionTable = 0;
while (reader.GetNextLine(fields))
{
IAssert(fields.Len() == 1);
int cp = reader.ParseCodePoint(fields[0]);
int i = h.GetKeyId(cp); IAssert(i >= 0);
h[i].flags |= ucfCompositionExclusion;
nExclusionTable++;
}
reader.Close();
// Prepare the inverted index for composition pairs.
for (int i = h.FFirstKeyId(); h.FNextKeyId(i); )
{
int cp = h.GetKey(i);
TUniChInfo& ci = h[i];
int ofs = ci.decompOffset; if (ofs < 0) continue;
if (ci.IsCompositionExclusion()) continue;
if (ci.IsCompatibilityDecomposition()) continue;
int n = 0; while (decompositions[ofs + n] >= 0) n++;
if (n != 2) continue;
TIntPr pr = TIntPr(decompositions[ofs], decompositions[ofs + 1]);
IAssert(! inverseDec.IsKey(pr));
IAssert(ci.combClass == TUniChInfo::ccStarter);
inverseDec.AddDat(pr, cp);
}
printf("TUniChDb(%s): %d chars in h, %d in decomp inverse index; %d in decomp vector; %d in exclusion table\n",
basePath.CStr(), h.Len(), inverseDec.Len(), decompositions.Len(), nExclusionTable);
// Before calling InitWordAndSentenceBoundaryFlags(), scripts must have been initialized, as well as
// flags such as Alphabetic, Word_Break, and Grapheme_Extend.
InitWordAndSentenceBoundaryFlags(basePath);
// Make sure that Hangul combined characters are treated as starters.
for (int cp = HangulSBase; cp < HangulSBase + HangulSCount; cp++)
{
int j = h.GetKeyId(cp); if (j < 0) continue;
TUniChInfo& ci = h[j];
if (ci.combClass == TUniChInfo::ccInvalid) ci.combClass = TUniChInfo::ccStarter;
IAssert(ci.combClass == TUniChInfo::ccStarter);
}
// There should be no more additions to 'h' beyond this point.
const int oldHLen = h.Len();
// Provide default (identity) case mappings if any were missing from UnicodeData.txt
// (or if any entirely new characters were added later, e.g. while reading LineBreak.txt).
int scriptUnknown = GetScriptByName(GetScriptNameUnknown());
for (int i = h.FFirstKeyId(); h.FNextKeyId(i); )
{
int cp = h.GetKey(i); TUniChInfo &ci = h[i];
if (ci.simpleLowerCaseMapping < 0) ci.simpleLowerCaseMapping = cp;
if (ci.simpleUpperCaseMapping < 0) ci.simpleUpperCaseMapping = cp;
if (ci.simpleTitleCaseMapping < 0) ci.simpleTitleCaseMapping = cp;
if (ci.script < 0) ci.script = scriptUnknown;
}
IAssert(h.Len() == oldHLen);
}
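LoadTxt() is the expensive one-time path: it parses the UCD text files found under basePath and is typically followed by SaveBin() so that later runs can use the much faster binary loader. A minimal sketch; the directory and file names are placeholders:

  TUniChDb db;
  db.LoadTxt("path/to/ucd");        // reads UnicodeData.txt, Scripts.txt, PropList.txt, ...
  db.SaveBin("UnicodeDefs.bin");    // later runs can simply call LoadBin()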
| void TUniChDb::LoadTxt_ProcessDecomposition | ( | TUniChInfo & | ci, |
| TStr | s | ||
| ) | [protected] |
Definition at line 941 of file unicode.cpp.
{
if (s.Empty()) return;
if (s[0] == '<') {
int i = s.SearchCh('>'); IAssert(i > 0);
ci.flags |= ucfCompatibilityDecomposition;
s = s.GetSubStr(i + 1, s.Len() - 1); s.ToTrunc(); }
TIntV dec; TUcdFileReader::ParseCodePointList(s, dec);
IAssert(dec.Len() > 0);
ci.decompOffset = decompositions.Len();
decompositions.AddV(dec); decompositions.Add(-1);
}
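The string passed in is the decomposition field of UnicodeData.txt; a leading tag in angle brackets marks a compatibility decomposition, otherwise the mapping is canonical. Illustrative values of that field:

  0041 0300                   (canonical, e.g. for U+00C0: A + COMBINING GRAVE ACCENT)
  <compat> 0066 0066 006C     (compatibility, e.g. for U+FB04, LATIN SMALL LIGATURE FFL)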
| void TUniChDb::PrintCharNames | ( | FILE * | f, |
| const TSrcVec & | src, | ||
| size_t | srcIdx, | ||
| const size_t | srcCount, | ||
| const TStr & | prefix | ||
| ) | const [inline] |
Definition at line 1336 of file unicode.h.
{
if (! f) f = stdout;
for (const size_t srcEnd = srcIdx + srcCount; srcIdx < srcEnd; srcIdx++) {
fprintf(f, "%s", prefix.CStr());
int cp = src[TVecIdx(srcIdx)]; fprintf(f, (cp >= 0x10000 ? "U+%05x" : "U+%04x "), cp);
fprintf(f, " %s\n", GetCharNameS(cp).CStr()); }}
| void TUniChDb::PrintCharNames | ( | FILE * | f, |
| const TSrcVec & | src, | ||
| const TStr & | prefix | ||
| ) | const [inline] |
Definition at line 1342 of file unicode.h.
{ PrintCharNames(f, src, 0, src.Len(), prefix); }
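A minimal usage sketch for this overload, assuming a loaded database db; the codepoints are arbitrary examples:

  TIntV v; v.Add(0x41); v.Add(0xDF); v.Add(0x30A2);
  db.PrintCharNames(stdout, v, "  ");   // one "U+xxxx NAME" line per codepoint, prefixed by two spaces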
| void TUniChDb::Save | ( | TSOut & | SOut | ) | const [inline] |
Definition at line 1280 of file unicode.h.
{
h.Save(SOut); charNames.Save(SOut); decompositions.Save(SOut);
inverseDec.Save(SOut); caseFolding.Save(SOut); scripts.Save(SOut);
specialCasingLower.Save(SOut); specialCasingUpper.Save(SOut); specialCasingTitle.Save(SOut);
SOut.SaveCs(); }
| void TUniChDb::SaveBin | ( | const TStr & | fnBinUcd | ) |
Definition at line 1366 of file unicode.cpp.
{
PSOut SOut=TFOut::New(fnBinUcd);
Save(*SOut);
}
| void TUniChDb::SbEx_Add | ( | const TSrcVec & | v | ) | [inline] |
| void TUniChDb::SbEx_Add | ( | const TStr & | s | ) | [inline] |
| int TUniChDb::SbEx_AddMulti | ( | const TStr & | words, |
| const bool | wordsAreUtf8 = true |
||
| ) | [inline] |
Definition at line 1495 of file unicode.h.
{ TStrV vec; words.SplitOnAllCh('|', vec);
for (int i = 0; i < vec.Len(); i++) if (wordsAreUtf8) SbEx_AddUtf8(vec[i]); else SbEx_Add(vec[i]);
return vec.Len(); }
| void TUniChDb::SbEx_AddUtf8 | ( | const TStr & | s | ) | [inline] |
| void TUniChDb::SbEx_Clr | ( | ) | [inline] |
| void TUniChDb::SbEx_Set | ( | const TUniTrie< TInt > & | newTrie | ) | [inline] |
| int TUniChDb::SbEx_SetStdEnglish | ( | ) | [inline] |
Definition at line 1499 of file unicode.h.
{
static const TStr data = "Ms|Mrs|Mr|Rev|Dr|Prof|Gov|Sen|Rep|Gen|Brig|Col|Capt|Lieut|Lt|Sgt|Pvt|Cmdr|Adm|Corp|St|Mt|Ft|e.g|e. g.|i.e|i. e|ib|ibid|s.v|s. v|s.vv|s. vv";
SbEx_Clr(); return SbEx_AddMulti(data, false); }
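The sentence-boundary exception trie suppresses sentence breaks after the listed abbreviations. A minimal sketch, assuming a loaded database db; the extra abbreviations are arbitrary examples:

  db.SbEx_Clr();
  db.SbEx_SetStdEnglish();                     // the built-in English abbreviations shown above
  db.SbEx_AddMulti("approx|resp|viz", false);  // add a few more, '|'-separated, plain (non-UTF-8) strings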
| void TUniChDb::Test | ( | const TStr & | basePath | ) |
Definition at line 1381 of file unicode.cpp.
{
TStr fnBin = CombinePath(basePath, GetBinFn());
if (true || ! TFile::Exists(fnBin))
{
// Test LoadTxt.
LoadTxt(basePath);
// Test Save.
{PSOut SOut = TFOut::New(fnBin);
Save(*SOut);}
}
// Test Load.
this->~TUniChDb();
new(this) TUniChDb();
{PSIn SIn = TFIn::New(fnBin);
Load(*SIn);}
// Test the case folding.
caseFolding.Test();
// Test the word breaking.
TestWbFindNonIgnored();
// Test the sentence breaking.
TestFindNextWordOrSentenceBoundary(basePath, true);
TestFindNextWordOrSentenceBoundary(basePath, false);
// Test composition and decomposition.
TestComposition(basePath);
// Test the case conversions.
TestCaseConversions();
}
| void TUniChDb::TestCaseConversion | ( | const TStr & | source, |
| const TStr & | trueLc, | ||
| const TStr & | trueTc, | ||
| const TStr & | trueUc, | ||
| bool | turkic, | ||
| bool | lithuanian | ||
| ) | [protected] |
Definition at line 829 of file unicode.cpp.
{
TIntV src;
TUcdFileReader::ParseCodePointList(source, src);
FILE *f = stderr;
for (int i = 0; i < 3; i++)
{
TCaseConversion how = (i == 0) ? ccLower : (i == 1) ? ccTitle : ccUpper;
const TStr &trueDestS = (how == ccLower ? trueLc : how == ccTitle ? trueTc : trueUc);
TIntV trueDest; TUcdFileReader::ParseCodePointList(trueDestS, trueDest);
TIntV dest;
GetCaseConverted(src, 0, src.Len(), dest, true, how, turkic, lithuanian);
bool ok = (dest.Len() == trueDest.Len());
if (ok) for (int i = 0; i < dest.Len() && ok; i++) ok = ok && (dest[i] == trueDest[i]);
if (ok) continue;
fprintf(f, "%s(", (how == ccLower ? "toLowercase" : how == ccTitle ? "toTitlecase" : "toUppercase"));
for (int i = 0; i < src.Len(); i++) fprintf(f, "%s%04x", (i == 0 ? "" : " "), int(src[i]));
fprintf(f, ")\nCorrect: (");
for (int i = 0; i < trueDest.Len(); i++) fprintf(f, "%s%04x", (i == 0 ? "" : " "), int(trueDest[i]));
fprintf(f, ")\nOur output:(");
for (int i = 0; i < dest.Len(); i++) fprintf(f, "%s%04x", (i == 0 ? "" : " "), int(dest[i]));
fprintf(f, ")\n");
IAssert(ok);
}
}
| void TUniChDb::TestCaseConversions | ( | ) | [protected] |
Definition at line 857 of file unicode.cpp.
{
// Because no thorough case-conversion test files have been provided as part
// of the Unicode standard, we'll have to test things on a few test cases of our own.
// - First, test some unconditional special mappings, such as 'ss', 'ffl', 'dz', etc.
const TStr F = "0046 ", L = "004C ", S = "0053 ", T = "0054 ", W = "0057 ";
const TStr f = "0066 ", l = "006c ", s = "0073 ", t = "0074 ", w = "0077 ";
const TStr ss = "00df ", ffl = "fb04 ", longs = "017f ", longst = "fb05 ", wRing = "1e98 ", Ring = "030a ";
const TStr DZ = "01c4 ", Dz = "01c5 ", dz = "01c6 ";
const TStr space = "0020 ", Grave = "0300 ";
TestCaseConversion(
F + L + s + t + space + Dz + w + T + ss + wRing + space + longs + DZ + space + dz + longst, // source
f + l + s + t + space + dz + w + t + ss + wRing + space + longs + dz + space + dz + longst, // lowercase
F + l + s + t + space + Dz + w + t + ss + wRing + space + S + dz + space + Dz + longst, // titlecase
F + L + S + T + space + DZ + W + T + S + S + W + Ring + space + S + DZ + space + DZ + S + T, // uppercase
false, false);
// - Dotted I, dotless i, etc., but with turkic == false.
const TStr I = "0049 ", J = "004a ", i = "0069 ", j = "006a ", iDotless = "0131 ", IDot = "0130 ", DotA = "0307 ";
TestCaseConversion(
s + I + t + i + w + iDotless + f + IDot + l + space + iDotless + DotA + f + I + DotA + s, // source
s + i + t + i + w + iDotless + f + i + DotA + l + space + iDotless + DotA + f + i + DotA + s, // lowercase
S + i + t + i + w + iDotless + f + i + DotA + l + space + I + DotA + f + i + DotA + s, // titlecase
S + I + T + I + W + I + F + IDot + L + space + I + DotA + F + I + DotA + S, // uppercase
false, false);
// - Sigma (final vs. non-final forms).
const TStr Sigma = "03a3 ", sigma = "03c3 ", fsigma = "03c2 ";
TestCaseConversion(
Sigma + s + space + s + Sigma + space + s + Sigma + s + space + Sigma + S + Sigma + space + Sigma, // source
sigma + s + space + s + fsigma + space + s + sigma + s + space + sigma + s + fsigma + space + sigma, // lowercase
Sigma + s + space + S + fsigma + space + S + sigma + s + space + Sigma + s + fsigma + space + Sigma, // titlecase
Sigma + S + space + S + Sigma + space + S + Sigma + S + space + Sigma + S + Sigma + space + Sigma, // uppercase
false, false);
TestCaseConversion(
sigma + s + space + s + sigma + space + s + sigma + s + space + sigma + S + sigma + space + sigma, // source
sigma + s + space + s + sigma + space + s + sigma + s + space + sigma + s + sigma + space + sigma, // lowercase
Sigma + s + space + S + sigma + space + S + sigma + s + space + Sigma + s + sigma + space + Sigma, // titlecase
Sigma + S + space + S + Sigma + space + S + Sigma + S + space + Sigma + S + Sigma + space + Sigma, // uppercase
false, false);
TestCaseConversion(
fsigma + s + space + s + fsigma + space + s + fsigma + s + space + fsigma + S + fsigma + space + fsigma, // source
fsigma + s + space + s + fsigma + space + s + fsigma + s + space + fsigma + s + fsigma + space + fsigma, // lowercase
Sigma + s + space + S + fsigma + space + S + fsigma + s + space + Sigma + s + fsigma + space + Sigma, // titlecase
Sigma + S + space + S + Sigma + space + S + Sigma + S + space + Sigma + S + Sigma + space + Sigma, // uppercase
false, false);
const TStr nonSA = "0315 0321 0322 "; // characters that are neither ccStarter nor ccAbove
// Special case mappings for Turkic languages:
// - After_I
TestCaseConversion(
s + I + t + i + w + iDotless + f + IDot + l + space + iDotless + DotA + f + I + DotA + J + DotA + I + Grave + DotA + I + DotA + DotA + I + nonSA + DotA + s, // source
s + iDotless + t + i + w + iDotless + f + i + l + space + iDotless + DotA + f + i + j + DotA + iDotless + Grave + DotA + i + DotA + i + nonSA + s, // lowercase
S + iDotless + t + i + w + iDotless + f + i + l + space + I + DotA + f + i + j + DotA + iDotless + Grave + DotA + i + DotA + i + nonSA + s, // titlecase
S + I + T + IDot + W + I + F + IDot + L + space + I + DotA + F + I + DotA + J + DotA + I + Grave + DotA + I + DotA + DotA + I + nonSA + DotA + S, // uppercase
true, false); // turkic
// - Not_Before_Dot
TestCaseConversion(
I + Grave + t + I + DotA + f + I + nonSA + DotA + j + space + I + nonSA + DotA + space + I + Grave + t, // source
iDotless + Grave + t + i + f + i + nonSA + j + space + i + nonSA + space + iDotless + Grave + t, // lowercase
I + Grave + t + i + f + i + nonSA + j + space + I + nonSA + DotA + space + I + Grave + t, // titlecase
I + Grave + T + I + DotA + F + I + nonSA + DotA + J + space + I + nonSA + DotA + space + I + Grave + T, // uppercase
true, false); // turkic
// Special case mappings for Lithuanian:
// - After_Soft_Dotted [note: I + DotA turns into i + DotA + DotA when lowercasing due to More_Above]
TestCaseConversion(
i + DotA + t + i + Grave + DotA + f + i + DotA + DotA + f + i + nonSA + DotA + I + DotA + t + DotA + i + DotA + Grave, // source
i + DotA + t + i + Grave + DotA + f + i + DotA + DotA + f + i + nonSA + DotA + i + DotA + DotA + t + DotA + i + DotA + Grave, // lowercase
I + t + i + Grave + DotA + f + i + DotA + DotA + f + i + nonSA + DotA + i + DotA + DotA + t + DotA + i + DotA + Grave, // titlecase
I + T + I + Grave + DotA + F + I + DotA + F + I + nonSA + I + DotA + T + DotA + I + Grave, // uppercase
false, true); // lithuanian
// - More_Above [note: j + DotA turns into just J when uppercasing due to After_Soft_Dotted]
TestCaseConversion(
J + Grave + space + J + nonSA + DotA + space + j + Grave + space + j + DotA + space + J + nonSA + J + nonSA + Grave + space + j + nonSA, // source
j + DotA + Grave + space + j + DotA + nonSA + DotA + space + j + Grave + space + j + DotA + space + j + nonSA + j + DotA + nonSA + Grave + space + j + nonSA, // lowercase
J + Grave + space + J + nonSA + DotA + space + J + Grave + space + J + space + J + nonSA + j + DotA + nonSA + Grave + space + J + nonSA, // titlecase
J + Grave + space + J + nonSA + DotA + space + J + Grave + space + J + space + J + nonSA + J + nonSA + Grave + space + J + nonSA, // uppercase
false, true); // lithuanian
// SoftDotted [^ Starter Above]* 0307 --(uc,tc)--> without 0307
//TestCaseConversion("", "", "", "", false, false);
}
| void TUniChDb::TestComposition | ( | const TStr & | basePath | ) | [protected] |
Definition at line 749 of file unicode.cpp.
{
TUcdFileReader reader; TStrV fields; int nLines = 0;
reader.Open(CombinePath(basePath, GetNormalizationTestFn()));
bool inPart1 = false; TIntH testedInPart1;
while (reader.GetNextLine(fields))
{
nLines += 1;
if (fields.Len() == 1) {
IAssert(fields[0].IsPrefix("@Part"));
inPart1 = (fields[0] == "@Part1"); continue; }
IAssert(fields.Len() == 6);
IAssert(fields[5].Len() == 0);
TIntV c1, c2, c3, c4, c5;
reader.ParseCodePointList(fields[0], c1);
reader.ParseCodePointList(fields[1], c2);
reader.ParseCodePointList(fields[2], c3);
reader.ParseCodePointList(fields[3], c4);
reader.ParseCodePointList(fields[4], c5);
TIntV v;
#define AssE_(v1, v2, expl) AssertEq(v1, v2, TStr(expl) + " (line " + TInt::GetStr(nLines) + ")", 0)
#define NFC_(cmpWith, operand) DecomposeAndCompose(operand, 0, operand.Len(), v, false); AssE_(cmpWith, v, #cmpWith " == NFC(" #operand ")")
#define NFD_(cmpWith, operand) Decompose(operand, 0, operand.Len(), v, false); AssE_(cmpWith, v, #cmpWith " == NFD(" #operand ")")
#define NFKC_(cmpWith, operand) DecomposeAndCompose(operand, 0, operand.Len(), v, true); AssE_(cmpWith, v, #cmpWith " == NFKC(" #operand ")")
#define NFKD_(cmpWith, operand) Decompose(operand, 0, operand.Len(), v, true); AssE_(cmpWith, v, #cmpWith " == NFKD(" #operand ")")
// NFD:
NFD_(c3, c1); // c3 == NFD(c1)
NFD_(c3, c2); // c3 == NFD(c2)
NFD_(c3, c3); // c3 == NFD(c3)
NFD_(c5, c4); // c5 == NFD(c4)
NFD_(c5, c5); // c5 == NFD(c5)
// NFC:
NFC_(c2, c1); // c2 == NFC(c1)
NFC_(c2, c2); // c2 == NFC(c2)
NFC_(c2, c3); // c2 == NFC(c3)
NFC_(c4, c4); // c4 == NFC(c4)
NFC_(c4, c5); // c4 == NFC(c5)
// NFKD:
NFKD_(c5, c1); // c5 == NFKD(c1)
NFKD_(c5, c2); // c5 == NFKD(c2)
NFKD_(c5, c3); // c5 == NFKD(c3)
NFKD_(c5, c4); // c5 == NFKD(c4)
NFKD_(c5, c5); // c5 == NFKD(c5)
// NFKC:
NFKC_(c4, c1); // c4 == NFKC(c1)
NFKC_(c4, c2); // c4 == NFKC(c2)
NFKC_(c4, c3); // c4 == NFKC(c3)
NFKC_(c4, c4); // c4 == NFKC(c4)
NFKC_(c4, c5); // c4 == NFKC(c5)
//
if (inPart1) {
IAssert(c1.Len() == 1);
testedInPart1.AddKey(c1[0]); }
}
reader.Close();
// Test other individual codepoints that were not mentioned in part 1.
int nOther = 0;
for (int i = h.FFirstKeyId(); h.FNextKeyId(i); )
{
const int cp = h.GetKey(i), nLines = -1;
if (testedInPart1.IsKey(cp)) continue;
TIntV x, v; x.Add(cp);
NFC_(x, x); // x == NFC(x)
NFD_(x, x); // x == NFD(x)
NFKC_(x, x); // x == NFKC(x)
NFKD_(x, x); // x == NFKD(x)
nOther += 1;
}
#undef AssE_
#undef NFC_
#undef NFD_
#undef NFKC_
#undef NFKD_
printf("TUniChDb::TestComposition: %d lines processed + %d other individual codepoints.\n", nLines, nOther);
}
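The normalization entry points exercised by this test can also be called directly. A minimal sketch on a loaded database db: Decompose() with compatibility set to false or true yields NFD or NFKD, and (assuming the DecomposeAndCompose() member used in the macros above is public) the composed forms NFC and NFKC:

  TIntV src; src.Add(0xC0);      // U+00C0 LATIN CAPITAL LETTER A WITH GRAVE
  TIntV nfd, nfc;
  db.Decompose(src, nfd, false);                            // NFD: 0041 0300
  db.DecomposeAndCompose(src, 0, src.Len(), nfc, false);    // NFC: 00C0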
| void TUniChDb::TestFindNextWordOrSentenceBoundary | ( | const TStr & | basePath, |
| bool | sentence | ||
| ) | [protected] |
Definition at line 653 of file unicode.cpp.
{
TUcdFileReader reader; TStrV fields;
reader.Open(CombinePath(CombinePath(basePath, GetAuxiliaryDir()), (sentence ? GetSentenceBreakTestFn() : GetWordBreakTestFn())));
int nLines = 0; TRnd rnd = TRnd(123);
while (reader.GetNextLine(fields))
{
nLines += 1;
IAssert(fields.Len() == 1);
TStrV parts; fields[0].SplitOnWs(parts);
const int n = parts.Len(); IAssert((n % 2) == 1);
TIntV chars; TBoolV isBreak, isPredicted, isPredicted2;
// Each line is a sequence of codepoints, with a \times or \div in between each
// pair of codepoints (as well as at the beginning and the end of the sequence) to
// indicate whether a boundary exists there or not.
for (int i = 0; i < n; i++)
{
const TStr& s = parts[i];
if ((i % 2) == 0) {
if (s == "\xc3\x97") // multiplication sign (U+00D7) in UTF-8
isBreak.Add(false);
else if (s == "\xc3\xb7") // division sign (U+00F7) in UTF-8
isBreak.Add(true);
else FailR(s.CStr()); }
else chars.Add(reader.ParseCodePoint(s));
}
const int m = n / 2; IAssert(chars.Len() == m); IAssert(isBreak.Len() == m + 1);
IAssert(isBreak[0]); IAssert(isBreak[m]);
isPredicted.Gen(m + 1); isPredicted.PutAll(false);
if (AlwaysFalse()) { printf("%3d", nLines); for (int i = 0; i < m; i++) printf(" %04x", int(chars[i])); printf("\n"); }
// We'll insert a few random characters at the beginning of the sequence
// so that srcPos doesn't always begin at 0.
for (int nBefore = 0; nBefore < 5; nBefore++)
{
TIntV chars2; for (int i = 0; i < nBefore; i++) chars2.Add(0, rnd.GetUniDevInt(0x10ffff + 1));
chars2.AddV(chars);
// Use FindNextBoundary to find all the word boundaries.
size_t position = (nBefore > 0 ? nBefore - 1 : nBefore); size_t prevPosition = position;
while (sentence ? FindNextSentenceBoundary(chars2, nBefore, m, position) : FindNextWordBoundary(chars2, nBefore, m, position))
{
IAssert(prevPosition < position);
IAssert(position <= size_t(nBefore + m));
isPredicted[int(position) - nBefore] = true;
prevPosition = position;
}
IAssert(position == size_t(nBefore + m));
if (sentence) FindSentenceBoundaries(chars2, nBefore, m, isPredicted2);
else FindWordBoundaries(chars2, nBefore, m, isPredicted2);
IAssert(isPredicted2.Len() == m + 1);
bool ok = true;
// If we start at 0, the word boundary at the beginning of the sequence was
// not found explicitly, so we'll add it now.
if (nBefore == 0) isPredicted[0] = true;
// Compare the predicted and the true boundaries.
for (int i = 0; i <= m; i++) {
if (isBreak[i] != isPredicted[i]) ok = false;
IAssert(isPredicted2[i] == isPredicted[i]); }
FILE *f = stderr;
if (! ok)
{
fprintf(f, "\nError in line %d:\n", nLines);
fprintf(f, "True: ");
for (int i = 0; i <= m; i++) {
fprintf(f, "%s ", (isBreak[i] ? "|" : "."));
if (i < m) fprintf(f, "%04x ", int(chars[i + nBefore])); }
fprintf(f, "\nPredicted: ");
for (int i = 0; i <= m; i++) {
fprintf(f, "%s ", (isPredicted[i] ? "|" : "."));
if (i < m) {
const int cp = chars[i + nBefore];
TStr s = sentence ? TUniChInfo::GetSbFlagsStr(GetSbFlags(cp)) : TUniChInfo::GetWbFlagsStr(GetWbFlags(cp));
if (IsWbIgnored(cp)) s = "*" + s;
fprintf(f, "%4s ", s.CStr()); }}
fprintf(f, "\n");
Fail;
}
// Test FindNextBoundary if we start in the middle of the sequence,
// i.e. not at an existing boundary.
for (int i = 0; i < m; i++) {
position = i + nBefore; bool ok = sentence ? FindNextSentenceBoundary(chars2, nBefore, m, position) : FindNextWordBoundary(chars2, nBefore, m, position);
IAssert(ok); // at the very least, there should be the 'boundary' at nBefore + m
IAssert(size_t(i + nBefore) < position); IAssert(position <= size_t(nBefore + m));
position -= nBefore;
for (int j = i + 1; j < int(position); j++)
IAssert(! isBreak[j]);
IAssert(isBreak[int(position)]); }
}
}
reader.Close();
printf("TUniChDb::TestFindNext%sBoundary: %d lines processed.\n", (sentence ? "Sentence" : "Word"), nLines);
}
| void TUniChDb::TestWbFindNonIgnored | ( | const TIntV & | src | ) | const [protected] |
Definition at line 583 of file unicode.cpp.
{
int n = src.Len();
TBoolV isIgnored; isIgnored.Gen(n);
for (int i = 0; i < n; i++) isIgnored[i] = IsWbIgnored(src[i]);
TIntV prevNonIgnored, nextNonIgnored, curOrNextNonIgnored;
prevNonIgnored.Gen(n); nextNonIgnored.Gen(n); curOrNextNonIgnored.Gen(n);
FILE *f = 0; // stderr;
for (int srcIdx = 0; srcIdx < n; srcIdx++) for (int srcLen = 1; srcLen < n - srcIdx; srcLen++)
{
int prev = -1;
for (int i = 0; i < srcLen; i++) {
prevNonIgnored[i] = prev;
if (! isIgnored[srcIdx + i]) prev = srcIdx + i; }
int next = srcIdx + srcLen;
for (int i = srcLen - 1; i >= 0; i--) {
nextNonIgnored[i] = next;
if (! isIgnored[srcIdx + i]) next = srcIdx + i;
curOrNextNonIgnored[i] = next; }
if (f) {
fprintf(f, "\nIndex: "); for (int i = 0; i < srcLen; i++) fprintf(f, " %2d", srcIdx + i);
fprintf(f, "\nNonIgn: "); for (int i = 0; i < srcLen; i++) fprintf(f, " %s", (isIgnored[srcIdx + i] ? " ." : " Y"));
fprintf(f, "\nPrevNI: "); for (int i = 0; i < srcLen; i++) fprintf(f, " %2d", int(prevNonIgnored[i]));
fprintf(f, "\nNextNI: "); for (int i = 0; i < srcLen; i++) fprintf(f, " %2d", int(nextNonIgnored[i]));
fprintf(f, "\nCurNextNI: "); for (int i = 0; i < srcLen; i++) fprintf(f, " %2d", int(curOrNextNonIgnored[i]));
fprintf(f, "\n"); }
for (int i = 0; i < srcLen; i++)
{
size_t s;
s = size_t(srcIdx + i); WbFindNextNonIgnored(src, s, size_t(srcIdx + srcLen));
IAssert(s == size_t(nextNonIgnored[i]));
s = size_t(srcIdx + i); WbFindCurOrNextNonIgnored(src, s, size_t(srcIdx + srcLen));
IAssert(s == size_t(curOrNextNonIgnored[i]));
s = size_t(srcIdx + i); bool ok = WbFindPrevNonIgnored(src, size_t(srcIdx), s);
if (prevNonIgnored[i] < 0) { IAssert(! ok); IAssert(s == size_t(srcIdx)); }
else { IAssert(ok); IAssert(s == size_t(prevNonIgnored[i])); }
}
}
}
| void TUniChDb::TestWbFindNonIgnored | ( | ) | const [protected] |
Definition at line 623 of file unicode.cpp.
{
TIntV chIgnored, chNonIgnored;
FILE *f = 0; // stderr;
for (int i = h.FFirstKeyId(); h.FNextKeyId(i); ) {
const int cp = h.GetKey(i); const TUniChInfo& ci = h[i];
if (f) fprintf(f, "%04x: flags %08x props %08x %08x script \"%s\"\n", cp,
ci.flags, ci.properties, ci.propertiesX, GetScriptName(ci.script).CStr());
(IsWbIgnored(h[i]) ? chIgnored : chNonIgnored).Add(h.GetKey(i));
}
chIgnored.Sort(); chNonIgnored.Sort();
printf("TUniChDb::TestWbNonIgnored: %d ignored, %d nonignored chars.\n", chIgnored.Len(), chNonIgnored.Len());
TRnd rnd = TRnd(123);
for (int iter = 0; iter <= 50; iter++)
{
int percIgnored = 2 * iter;
for (int n = 0; n <= 20; n++)
{
// Prepare a random sequence of 'n' codepoints.
TIntV v; v.Gen(n, 0); // reserve space for 'n' codepoints
for (int i = 0; i < n; i++) {
TIntV& chars = (rnd.GetUniDevInt(100) < percIgnored) ? chIgnored : chNonIgnored;
int j = rnd.GetUniDevInt(chars.Len());
v.Add(chars[j]); }
// Run the tests with this sequence.
TestWbFindNonIgnored(v);
}
}
}
| void TUniChDb::ToCaseFolded | ( | TSrcVec & | src, |
| size_t | srcIdx, | ||
| const size_t | srcCount, | ||
| const bool | turkic = false |
||
| ) | const [inline] |
Definition at line 1636 of file unicode.h.
{ caseFolding.FoldInPlace(src, srcIdx, srcCount, turkic); }
| void TUniChDb::ToCaseFolded | ( | TSrcVec & | src, |
| const bool | turkic = false |
||
| ) | const [inline] |
Definition at line 1637 of file unicode.h.
{ ToCaseFolded(src, 0, src.Len(), turkic); }
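Case folding is intended for case-insensitive comparison and lookup keys; it rewrites the vector in place. A minimal sketch on a loaded database db:

  TIntV a; a.Add(0x49); a.Add(0x53); a.Add(0x53); a.Add(0x4E);   // "ISSN"
  TIntV b; b.Add(0x69); b.Add(0x73); b.Add(0x73); b.Add(0x6E);   // "issn"
  db.ToCaseFolded(a); db.ToCaseFolded(b);
  bool sameIgnoringCase = (a == b);   // both vectors now hold the folded form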
| void TUniChDb::ToSimpleCaseConverted | ( | TSrcVec & | src, |
| size_t | srcIdx, | ||
| const size_t | srcCount, | ||
| const TCaseConversion | how | ||
| ) | const |
Definition at line 3066 of file unicode.h.
{
bool seenCased = false; size_t nextWordBoundary = srcIdx;
for (const size_t origSrcIdx = srcIdx, srcEnd = srcIdx + srcCount; srcIdx < srcEnd; srcIdx++)
{
const int cp = src[TVecIdx(srcIdx)];
int i = h.GetKeyId(cp); if (i < 0) continue;
const TUniChInfo &ci = h[i];
// With titlecasing, the first cased character of each word must be put into titlecase,
// all others into lowercase. This is what the howHere variable is for.
TUniChDb::TCaseConversion howHere;
if (how != ccTitle) howHere = how;
else {
if (srcIdx == nextWordBoundary) { // A word starts/ends here.
seenCased = false;
size_t next = nextWordBoundary; FindNextWordBoundary(src, origSrcIdx, srcCount, next);
IAssert(next > nextWordBoundary); nextWordBoundary = next; }
bool isCased = IsCased(cp);
if (isCased && ! seenCased) { howHere = ccTitle; seenCased = true; }
else howHere = ccLower;
}
int cpNew = (howHere == ccTitle ? ci.simpleTitleCaseMapping : howHere == ccUpper ? ci.simpleUpperCaseMapping : ci.simpleLowerCaseMapping);
if (cpNew >= 0) src[TVecIdx(srcIdx)] = cpNew;
}
}
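The simple conversions use only the one-to-one mappings from UnicodeData.txt and rewrite the vector in place; with ccTitle, FindNextWordBoundary() is used, as shown above, to titlecase the first cased letter of each word and lowercase the rest. A minimal sketch on a loaded database db; the input is an arbitrary ASCII example:

  TIntV v;
  for (const char *p = "hello wORLD"; *p; p++) v.Add((int)(uchar)*p);
  db.ToSimpleTitleCase(v);   // "Hello World"
  db.ToSimpleUpperCase(v);   // "HELLO WORLD"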
| void TUniChDb::ToSimpleLowerCase | ( | TSrcVec & | src, |
| size_t | srcIdx, | ||
| const size_t | srcCount | ||
| ) | const [inline] |
Definition at line 1610 of file unicode.h.
{ ToSimpleCaseConverted(src, srcIdx, srcCount, ccLower); }
| void TUniChDb::ToSimpleLowerCase | ( | TSrcVec & | src | ) | const [inline] |
Definition at line 1613 of file unicode.h.
{ ToSimpleLowerCase(src, 0, src.Len()); }
| void TUniChDb::ToSimpleTitleCase | ( | TSrcVec & | src, |
| size_t | srcIdx, | ||
| const size_t | srcCount | ||
| ) | const [inline] |
Definition at line 1611 of file unicode.h.
{ ToSimpleCaseConverted(src, srcIdx, srcCount, ccTitle); }
| void TUniChDb::ToSimpleTitleCase | ( | TSrcVec & | src | ) | const [inline] |
Definition at line 1614 of file unicode.h.
{ ToSimpleTitleCase(src, 0, src.Len()); }
| void TUniChDb::ToSimpleUpperCase | ( | TSrcVec & | src, |
| size_t | srcIdx, | ||
| const size_t | srcCount | ||
| ) | const [inline] |
Definition at line 1609 of file unicode.h.
{ ToSimpleCaseConverted(src, srcIdx, srcCount, ccUpper); }
| void TUniChDb::ToSimpleUpperCase | ( | TSrcVec & | src | ) | const [inline] |
Definition at line 1612 of file unicode.h.
{ ToSimpleUpperCase(src, 0, src.Len()); }
| void TUniChDb::WbFindCurOrNextNonIgnored | ( | const TSrcVec & | src, |
| size_t & | position, | ||
| const size_t | srcEnd | ||
| ) | const [inline, protected] |
Definition at line 1422 of file unicode.h.
{
while (position < srcEnd && IsWbIgnored(src[TVecIdx(position)])) position++; }
| void TUniChDb::WbFindNextNonIgnored | ( | const TSrcVec & | src, |
| size_t & | position, | ||
| const size_t | srcEnd | ||
| ) | const [inline, protected] |
Definition at line 1425 of file unicode.h.
{
if (position >= srcEnd) return;
position++; while (position < srcEnd && IsWbIgnored(src[TVecIdx(position)])) position++; }
| void TUniChDb::WbFindNextNonIgnoredS | ( | const TSrcVec & | src, |
| size_t & | position, | ||
| const size_t | srcEnd | ||
| ) | const [inline, protected] |
Definition at line 1429 of file unicode.h.
{
if (position >= srcEnd) return;
if (IsSbSep(src[TVecIdx(position)])) { position++; return; }
position++; while (position < srcEnd && IsWbIgnored(src[TVecIdx(position)])) position++; }
| bool TUniChDb::WbFindPrevNonIgnored | ( | const TSrcVec & | src, |
| const size_t | srcStart, | ||
| size_t & | position | ||
| ) | const [inline, protected] |
Definition at line 1434 of file unicode.h.
{
if (position <= srcStart) return false;
while (position > srcStart) {
position--; if (! IsWbIgnored(src[TVecIdx(position)])) return true; }
return false; }
friend class TUniCaseFolding [friend] |
TUniTrie<TInt> TUniChDb::sbExTrie [protected] |