SNAP Library 2.3, Developer Reference: TUniChDb Class Reference

TUniChDb::TCaseConversion

Definition: unicode.h:1584

enum TUniChDb::TCaseConversion_ TCaseConversion

TUniChDb::ccMax

Definition: unicode.h:1584

Definition: unicode.h:1584

TUniChDb::DecomposeAndCompose

Definition: unicode.h:1584

Constructor & Destructor Documentation

TUniChDb::TUniChDb ( )

inline

Definition at line 1274 of file unicode.h.

Referenced by Test().

1274 : scriptUnknown(-1) { }

TUniChDb::scriptUnknown

int scriptUnknown

Definition: unicode.h:1272

Here is the caller graph for this function:

TUniChDb::TUniChDb ( TSIn & SIn )

inline explicit

Definition at line 1275 of file unicode.h.

References Load().

1275 { Load(SIn); }

TUniChDb::Load

void Load(TSIn &SIn)

Definition: unicode.h:1285

Here is the call graph for this function:

Member Function Documentation

template<typename TDestCh >

void TUniChDb::AddDecomposition	(	const int	codePoint,
		TVec< TDestCh > &	dest,
		const bool	compatibility
	)		const

protected

Definition at line 3103 of file unicode.h.

References TVec< TVal, TSizeTy >::Add(), TUniChInfo::decompOffset, decompositions, THash< TKey, TDat, THashFunc >::GetKeyId(), h, HangulLBase, HangulNCount, HangulSBase, HangulSCount, HangulTBase, HangulTCount, HangulVBase, and TUniChInfo::IsCompatibilityDecomposition().

Referenced by Decompose().

 {
         if (HangulSBase <= codePoint && codePoint < HangulSBase + HangulSCount)
         {
                 // UAX #15, sec. 16: Hangul decomposition
                 const int SIndex = codePoint - HangulSBase;
                 const int L = HangulLBase + SIndex / HangulNCount;
                 const int V = HangulVBase + (SIndex % HangulNCount) / HangulTCount;
                 const int T = HangulTBase + (SIndex % HangulTCount);
                 dest.Add(L); dest.Add(V);
                 if (T != HangulTBase) dest.Add(T);
                 return;
         }
         int i = h.GetKeyId(codePoint); if (i < 0) { dest.Add(codePoint); return; }
         const TUniChInfo &ci = h[i];
         int ofs = ci.decompOffset; if (ofs < 0) { dest.Add(codePoint); return; }
         if ((! compatibility) && ci.IsCompatibilityDecomposition()) { dest.Add(codePoint); return; }
         while (true) {
                 int cp = decompositions[ofs++]; if (cp < 0) return;
                 AddDecomposition(cp, dest, compatibility); }
 }

Here is the call graph for this function:

Here is the caller graph for this function:

template<typename TSrcVec >

bool TUniChDb::CanSentenceEndHere	(	const TSrcVec &	src,
		const size_t	srcIdx,
		const size_t	position
	)		const

protected

Definition at line 2585 of file unicode.h.

References TUniTrie< TItem_ >::Empty(), TUniTrie< TItem_ >::Get3GramRoot(), GetCat(), TUniTrie< TItem_ >::GetChild(), GetSbFlags(), TUniTrie< TItem_ >::Has1Gram(), TUniTrie< TItem_ >::Has2Gram(), IAssert, TUniTrie< TItem_ >::IsNodeTerminal(), sbExTrie, ucfSbATerm, ucfSbSep, ucfSbSp, ucfSbSTerm, and WbFindPrevNonIgnored().

Referenced by FindNextSentenceBoundary().

 {
         if (sbExTrie.Empty()) return true;
         // We'll move back from the position where a sentence-boundary is being considered.
         size_t pos = position;
         if (! WbFindPrevNonIgnored(src, srcIdx, pos)) return true;
         int c = (int) src[TVecIdx(pos)]; int sfb = GetSbFlags(c);
         // - Skip the Sep, if there is one.
         if ((c & ucfSbSep) == ucfSbSep) {
                 if (! WbFindPrevNonIgnored(src, srcIdx, pos)) return true;
                 c = (int) src[TVecIdx(pos)]; sfb = GetSbFlags(c); }
         // - Skip any Sp characters.
         while ((sfb & ucfSbSp) == ucfSbSp) {
                 if (! WbFindPrevNonIgnored(src, srcIdx, pos)) return true;
                 c = (int) src[TVecIdx(pos)]; sfb = GetSbFlags(c); }
         // - Skip any Close characters.
         while ((sfb & ucfSbSp) == ucfSbSp) {
                 if (! WbFindPrevNonIgnored(src, srcIdx, pos)) return true;
                 c = (int) src[TVecIdx(pos)]; sfb = GetSbFlags(c); }
         // - Skip any ATerm | STerm characters.
         while ((sfb & (ucfSbATerm | ucfSbSTerm)) != 0) {
                 if (! WbFindPrevNonIgnored(src, srcIdx, pos)) return true;
                 c = (int) src[TVecIdx(pos)]; sfb = GetSbFlags(c); }
         // Now start moving through the trie.
         int cLast = c, cButLast = -1, cButButLast = -1, len = 1, node = -1;
         while (true)
         {
                 bool atEnd = (! WbFindPrevNonIgnored(src, srcIdx, pos));
                 c = (atEnd ? -1 : (int) src[TVecIdx(pos)]);
                 TUniChCategory cat = GetCat(c);
                 if (atEnd || ! (cat == ucLetter || cat == ucNumber || cat == ucSymbol)) {
                         // Check if the suffix we've read so far is one of those that appear in the trie.
                         if (len == 1) return ! sbExTrie.Has1Gram(cLast);
                         if (len == 2) return ! sbExTrie.Has2Gram(cLast, cButLast);
                         IAssert(len >= 3); IAssert(node >= 0);
                         if (sbExTrie.IsNodeTerminal(node)) return false;
                         if (atEnd) return true; }
                 if (len == 1) { cButLast = c; len++; }
                 else if (len == 2) { cButButLast = c; len++;
                         // Now we have read the last three characters; start descending the suitable subtrie.
                         node = sbExTrie.Get3GramRoot(cLast, cButLast, cButButLast);
                         if (node < 0) return true; }
                 else {
                         // Descend down the trie.
                         node = sbExTrie.GetChild(node, c);
                         if (node < 0) return true; }
         }
         //return true;
 }

Here is the call graph for this function:

Here is the caller graph for this function:

void TUniChDb::Clr ( )

inline

Definition at line 1276 of file unicode.h.

References caseFolding, charNames, THash< TKey, TDat, THashFunc >::Clr(), TUniCaseFolding::Clr(), TVec< TVal, TSizeTy >::Clr(), TStrPool::Clr(), decompositions, h, inverseDec, scripts, specialCasingLower, specialCasingTitle, and specialCasingUpper.

Referenced by LoadTxt().

                    {
                 h.Clr(); charNames.Clr(); decompositions.Clr(); inverseDec.Clr(); caseFolding.Clr();
                 specialCasingLower.Clr(); specialCasingUpper.Clr(); specialCasingTitle.Clr();
                 scripts.Clr(); }

Here is the call graph for this function:

Here is the caller graph for this function:

template<typename TSrcVec , typename TDestCh >

void TUniChDb::Compose	(	const TSrcVec &	src,
		size_t	srcIdx,
		const size_t	srcCount,
		TVec< TDestCh > &	dest,
		bool	clrDest = `true`
	)		const

Definition at line 3158 of file unicode.h.

References TVec< TVal, TSizeTy >::Add(), Assert, ccMax, TUniChInfo::ccStarter, TVec< TVal, TSizeTy >::Clr(), GetCombiningClass(), HangulLBase, HangulLCount, HangulSBase, HangulSCount, HangulTBase, HangulTCount, HangulVBase, HangulVCount, inverseDec, and TVec< TVal, TSizeTy >::Len().

Referenced by Compose(), TUnicode::Compose(), and DecomposeAndCompose().

 {
         // Composes the characters src[srcIdx], ..., src[srcIdx + srcCount - 1] and appends the
         // result to 'dest' (which is cleared first if 'clrDest' is true).  Each character is
         // combined with the most recent starter when a composition exists -- either in the
         // 'inverseDec' table or through the arithmetic Hangul rules -- and when no character
         // seen since that starter blocks it; otherwise the character is copied to 'dest'.
         if (clrDest) dest.Clr();
         bool lastStarterKnown = false; // has a starter been encountered yet?
         size_t lastStarterPos = size_t(-1);  // the index (in 'dest') of the last starter
         int cpLastStarter = -1; // the codepoint of the last starter (i.e. cpLastStarter == dest[lastStarterPos])
         const size_t srcEnd = srcIdx + srcCount;
         // NOTE(review): this local has the same name as TUniChDb::ccMax listed in the class
         // reference and presumably shadows it inside this function -- confirm.
         int ccMax = -1; // The highest combining class among the characters since the last starter.
         while (srcIdx < srcEnd)
         {
                 const int cp = src[TVecIdx(srcIdx)]; srcIdx++;
                 const int cpClass = GetCombiningClass(cp);
                 //int cpCombined = -1;
                 // If there is a starter with which 'cp' can be combined, and from which it is not blocked
                 // by some intermediate character, we can try to combine them.
                 if (lastStarterKnown && ccMax < cpClass)
                 {
                         int j = inverseDec.GetKeyId(TIntPr(cpLastStarter, cp));
                         int cpCombined = -1;
                         // The do/while(false) runs exactly once; it exists so that 'break' can
                         // leave the chain of attempts as soon as one of them succeeds.
                         do {
                                 // Try to look up a composition in the inverseDec table.
                                 if (j >= 0) { cpCombined = inverseDec[j]; break; }
                                 // UAX #15, sec. 16: Hangul composition
                                 // - Try to combine L and V.
                                 const int LIndex = cpLastStarter - HangulLBase;
                                 if (0 <= LIndex && LIndex < HangulLCount) {
                                         const int VIndex = cp - HangulVBase;
                                         if (0 <= VIndex && VIndex < HangulVCount) {
                                                 cpCombined = HangulSBase + (LIndex * HangulVCount + VIndex) * HangulTCount;
                                                 break; } }
                                 // - Try to combine LV and T.  The starter must be a syllable without a
                                 //   trailing part, i.e. its index must be a multiple of HangulTCount.
                                 const int SIndex = cpLastStarter - HangulSBase;
                                 if (0 <= SIndex && SIndex < HangulSCount && (SIndex % HangulTCount) == 0)
                                 {
                                         const int TIndex = cp - HangulTBase;
                                         if (0 <= TIndex && TIndex < HangulTCount) {
                                                 cpCombined = cpLastStarter + TIndex;
                                                 break; }
                                 }
                         } while (false);
                         // If a combined character has been found, use it to replace the old starter in 'dest';
                         // 'cp' itself is then not added to 'dest' (note the 'continue').
                         if (cpCombined >= 0) {
                                 dest[TVecIdx(lastStarterPos)] = cpCombined;
                                 Assert(GetCombiningClass(cpCombined) == TUniChInfo::ccStarter);
                                 // if (cpCombined is not a starter) { starterKnown = false; lastStarterPos = size_t(-1); cpLastStarter = -1; } else
                                 cpLastStarter = cpCombined; continue; }
                 }
                 if (cpClass == TUniChInfo::ccStarter) { // 'cp' is a starter, remember it for later.  ccMax becomes cpClass - 1 so that this starter can be combined with another starter.
                         lastStarterKnown = true; lastStarterPos = dest.Len(); cpLastStarter = cp; ccMax = cpClass - 1; }
                 else if (cpClass > ccMax) // Remember cp's class as the new maximum class since the last starter (for blocking).
                         ccMax = cpClass;
                 dest.Add(cp);
         }
 }

Here is the call graph for this function:

Here is the caller graph for this function:

template<typename TSrcVec , typename TDestCh >

void TUniChDb::Compose	(	const TSrcVec &	src,
		TVec< TDestCh > &	dest,
		bool	clrDest = `true`
	)		const

inline

Definition at line 1532 of file unicode.h.

References Compose().

1532 {

1533 Compose(src, 0, src.Len(), dest, clrDest); }

TUniChDb::Compose

void Compose(const TSrcVec &src, size_t srcIdx, const size_t srcCount, TVec< TDestCh > &dest, bool clrDest=true) const

Definition: unicode.h:3158

Here is the call graph for this function:

template<typename TSrcVec , typename TDestCh >

void TUniChDb::Decompose	(	const TSrcVec &	src,
		size_t	srcIdx,
		const size_t	srcCount,
		TVec< TDestCh > &	dest,
		bool	compatibility,
		bool	clrDest = `true`
	)		const

Definition at line 3126 of file unicode.h.

References AddDecomposition(), TUniChInfo::ccStarter, TVec< TVal, TSizeTy >::Clr(), GetCombiningClass(), and TVec< TVal, TSizeTy >::Len().

Referenced by Decompose(), TUnicode::Decompose(), and DecomposeAndCompose().

 {
         if (clrDest) dest.Clr();
         const size_t destStart = dest.Len()/*, srcEnd = srcIdx + srcCount*/;
         // Decompose the string.
         while (srcIdx < srcCount) {
                 AddDecomposition(src[TVecIdx(srcIdx)], dest, compatibility); srcIdx++; }
         // Rearrange the decomposed string into canonical order.
         for (size_t destIdx = destStart, destEnd = dest.Len(); destIdx < destEnd; )
         {
                 size_t j = destIdx;
                 int cp = dest[TVecIdx(destIdx)]; destIdx++;
                 int cpCls = GetCombiningClass(cp);
                 if (cpCls == TUniChInfo::ccStarter) continue;
                 while (destStart < j && GetCombiningClass(dest[TVecIdx(j - 1)]) > cpCls) {
                         dest[TVecIdx(j)] = dest[TVecIdx(j - 1)]; j--; }
                 dest[TVecIdx(j)] = cp;
         }
 }

Here is the call graph for this function:

Here is the caller graph for this function:

template<typename TSrcVec , typename TDestCh >

void TUniChDb::Decompose	(	const TSrcVec &	src,
		TVec< TDestCh > &	dest,
		bool	compatibility,
		bool	clrDest = `true`
	)		const

inline

Definition at line 1520 of file unicode.h.

References Decompose().

1520 {

1521 Decompose(src, 0, src.Len(), dest, compatibility, clrDest); }

TUniChDb::Decompose

void Decompose(const TSrcVec &src, size_t srcIdx, const size_t srcCount, TVec< TDestCh > &dest, bool compatibility, bool clrDest=true) const

Definition: unicode.h:3126

Here is the call graph for this function:

template<typename TSrcVec , typename TDestCh >

void TUniChDb::DecomposeAndCompose	(	const TSrcVec &	src,
		size_t	srcIdx,
		const size_t	srcCount,
		TVec< TDestCh > &	dest,
		bool	compatibility,
		bool	clrDest = `true`
	)		const

Definition at line 3148 of file unicode.h.

References TVec< TVal, TSizeTy >::Clr(), Compose(), Decompose(), and TVec< TVal, TSizeTy >::Len().

Referenced by DecomposeAndCompose(), and TUnicode::DecomposeAndCompose().

 {
         if (clrDest) dest.Clr();
         TIntV temp;
         Decompose(src, srcIdx, srcCount, temp, compatibility);
         Compose(temp, 0, temp.Len(), dest, clrDest);
 }

Here is the call graph for this function:

Here is the caller graph for this function:

template<typename TSrcVec , typename TDestCh >

void TUniChDb::DecomposeAndCompose	(	const TSrcVec &	src,
		TVec< TDestCh > &	dest,
		bool	compatibility,
		bool	clrDest = `true`
	)		const

inline

Definition at line 1542 of file unicode.h.

References DecomposeAndCompose().

1542 {

1543 DecomposeAndCompose(src, 0, src.Len(), dest, compatibility, clrDest); }

void DecomposeAndCompose(const TSrcVec &src, size_t srcIdx, const size_t srcCount, TVec< TDestCh > &dest, bool compatibility, bool clrDest=true) const

Definition: unicode.h:3148

Here is the call graph for this function:

template<typename TSrcVec , typename TDestCh >

size_t TUniChDb::ExtractStarters	(	const TSrcVec &	src,
		size_t	srcIdx,
		const size_t	srcCount,
		TVec< TDestCh > &	dest,
		bool	clrDest = `true`
	)		const

Definition at line 3215 of file unicode.h.

References TVec< TVal, TSizeTy >::Add(), TUniChInfo::ccStarter, TVec< TVal, TSizeTy >::Clr(), and GetCombiningClass().

Referenced by ExtractStarters(), and TUnicode::ExtractStarters().

 {
         if (clrDest) dest.Clr();
         size_t retVal = 0;
         for (const size_t srcEnd = srcIdx + srcCount; srcIdx < srcEnd; srcIdx++) {
                 const int cp = src[TVecIdx(srcIdx)];
                 if (GetCombiningClass(cp) == TUniChInfo::ccStarter)
                         { dest.Add(cp); retVal++; } }
         return retVal;
 }

Here is the call graph for this function:

Here is the caller graph for this function:

template<typename TSrcVec , typename TDestCh >

size_t TUniChDb::ExtractStarters	(	const TSrcVec &	src,
		TVec< TDestCh > &	dest,
		bool	clrDest = `true`
	)		const

inline

Definition at line 1551 of file unicode.h.

References ExtractStarters().

1551 {

1552 return ExtractStarters(src, 0, src.Len(), dest, clrDest); }

TUniChDb::ExtractStarters

size_t ExtractStarters(const TSrcVec &src, size_t srcIdx, const size_t srcCount, TVec< TDestCh > &dest, bool clrDest=true) const

Definition: unicode.h:3215

Here is the call graph for this function:

template<typename TSrcVec >

size_t TUniChDb::ExtractStarters ( TSrcVec & src ) const

inline

Definition at line 1555 of file unicode.h.

References ExtractStarters(), and TVec< TVal, TSizeTy >::Len().

                                                    {
                 TIntV temp; size_t retVal = ExtractStarters(src, temp);
                 src.Clr(); for (int i = 0; i < temp.Len(); i++) src.Add(temp[i]);
                 return retVal; }

Here is the call graph for this function:

template<typename TSrcVec >

bool TUniChDb::FindNextSentenceBoundary	(	const TSrcVec &	src,
		const size_t	srcIdx,
		const size_t	srcCount,
		size_t &	position
	)		const

Definition at line 2636 of file unicode.h.

References CanSentenceEndHere(), GetSbFlags(), IAssert, IsPeekAheadSkippable, IsWbIgnored(), TestCurNext, TestPrevCurNext, Trans, ucfSbATerm, ucfSbClose, ucfSbLower, ucfSbNumeric, ucfSbSep, ucfSbSp, ucfSbSTerm, ucfSbUpper, WbFindNextNonIgnored(), and WbFindPrevNonIgnored().

Referenced by TUnicode::FindNextSentenceBoundary(), FindSentenceBoundaries(), and TestFindNextWordOrSentenceBoundary().

 {
         // Finds the next sentence boundary after 'position' in the text
         // src[srcIdx], ..., src[srcIdx + srcCount - 1].  On success, 'position' is updated to the
         // boundary and true is returned; false is returned only if 'position' is already at or
         // beyond the end of the text.  The rule labels in the comments below (SB1, SB3, ...) refer
         // to the sentence-boundary rules of the Unicode text segmentation specification.
         // SB1.  Break at the start of text.
         if (position < srcIdx) { position = srcIdx; return true; }
         // If we are beyond the end of the text, there aren't any sentence breaks left.
         const size_t srcEnd = srcIdx + srcCount;
         if (position >= srcEnd) return false;
         // If 'position' is currently at an ignored character, move it back to the last nonignored character.
         size_t origPos = position;
         if (IsWbIgnored(src[TVecIdx(position)])) {
                 if (! WbFindPrevNonIgnored(src, srcIdx, position))
                         position = origPos;
         }
         // Determine the previous nonignored character (before 'position').
         size_t posPrev = position;
         if (! WbFindPrevNonIgnored(src, srcIdx, posPrev)) posPrev = position;
         // Sec 6.2.  Allow a break between Sep and an ignored character.
         if (position == origPos && position + 1 < srcEnd && IsSbSep(src[TVecIdx(position)]) && IsWbIgnored(src[TVecIdx(position + 1)])) { position += 1; return true; }
         // Determine the next nonignored character (after 'position').
         size_t posNext = position; WbFindNextNonIgnored(src, posNext, srcEnd);
         size_t posNext2;
         // A value of -1 stands for "no such character" (start or end of the text).
         int cPrev = (posPrev < position ? (int) src[TVecIdx(posPrev)] : -1), cCur = (position < srcEnd ? (int) src[TVecIdx(position)] : -1);
         int cNext = (position < posNext && posNext < srcEnd ? (int) src[TVecIdx(posNext)] : -1);
         int sbfPrev = GetSbFlags(cPrev), sbfCur = GetSbFlags(cCur), sbfNext = GetSbFlags(cNext);
         int cNext2, sbfNext2;
         // Initialize the state of the peek-back automaton.  The state records whether the text
         // before 'position' ends in an ATerm or STerm, optionally followed by Close's, Sp's and a Sep.
         typedef enum { stInit, stATerm, stATermSp, stATermSep, stSTerm, stSTermSp, stSTermSep } TPeekBackState;
         TPeekBackState backState;
         {
                 size_t pos = position;
                 bool wasSep = false, wasSp = false, wasATerm = false, wasSTerm = false;
                 // This loop runs at most once; it is used so that 'break' can abandon the scan
                 // as soon as the start of the text is reached.
                 while (true)
                 {
                         if (! WbFindPrevNonIgnored(src, srcIdx, pos)) break;
                         // Skip at most one Sep.
                         int cp = (int) src[TVecIdx(pos)]; int sbf = GetSbFlags(cp);
                         if ((sbf & ucfSbSep) == ucfSbSep) {
                                 wasSep = true;
                                 if (! WbFindPrevNonIgnored(src, srcIdx, pos)) break;
                                 cp = (int) src[TVecIdx(pos)]; sbf = GetSbFlags(cp); }
                         // Skip zero or more Sp's.
                         bool stop = false;
                         while ((sbf & ucfSbSp) == ucfSbSp) {
                                 wasSp = true;
                                 if (! WbFindPrevNonIgnored(src, srcIdx, pos)) { stop = true; break; }
                                 cp = (int) src[TVecIdx(pos)]; sbf = GetSbFlags(cp); }
                         if (stop) break;
                         // Skip zero or more Close's.
                         while ((sbf & ucfSbClose) == ucfSbClose) {
                                 if (! WbFindPrevNonIgnored(src, srcIdx, pos)) { stop = true; break; }
                                 cp = (int) src[TVecIdx(pos)]; sbf = GetSbFlags(cp); }
                         if (stop) break;
                         // Process an ATerm or STerm.
                         wasATerm = ((sbf & ucfSbATerm) == ucfSbATerm);
                         wasSTerm = ((sbf & ucfSbSTerm) == ucfSbSTerm);
                         break;
                 }
                 if (wasATerm) backState = (wasSep ? stATermSep : wasSp ? stATermSp : stATerm);
                 else if (wasSTerm) backState = (wasSep ? stSTermSep : wasSp ? stSTermSp : stSTerm);
                 else backState = stInit;
         }
         // Initialize the state of the peek-ahead automaton.  This state tells us what follows
         // after we skip all contiguous characters from the complement of the set {OLetter, Upper, Lower, Sep, STerm, ATerm}.
         // Thus, the next character is either OLetter, Upper, Lower, Sep, STerm, ATerm, or the end of the input string.
         // Our peek-ahead automaton must tell us whether it is Lower or something else.
         typedef enum { stUnknown, stLower, stNotLower } TPeekAheadState;
         TPeekAheadState aheadState = stUnknown;
         // Main loop.  Every 'continue' below (including those hidden inside the Test... macros)
         // executes the increment expression of this 'for', which shifts the window
         // prev/cur/next/next2 one nonignored character forward.
         for ( ; position < srcEnd; posPrev = position, position = posNext, posNext = posNext2,
                                                            cPrev = cCur, cCur = cNext, cNext = cNext2,
                                                            sbfPrev = sbfCur, sbfCur = sbfNext, sbfNext = sbfNext2)
         {
                 // Should there be a sentence boundary between 'position' and 'posNext' (or, more accurately,
                 // between src[posNext - 1] and src[posNext] --- any ignored characters between 'position'
                 // and 'posNext' are considered to belong to the previous character ('position'), not to the next one)?
                 posNext2 = posNext; WbFindNextNonIgnored(src, posNext2, srcEnd);
                 cNext2 = (posNext < posNext2 && posNext2 < srcEnd ? (int) src[TVecIdx(posNext2)] : -1);
                 sbfNext2 = GetSbFlags(cNext2);
                 // Update the peek-back automaton.  Trans() moves to the given state and leaves the
                 // 'switch' if the current character has the given flag; the first matching Trans() wins.
 #define TestCur(curFlag) ((sbfCur & ucfSb##curFlag) == ucfSb##curFlag)
 #define Trans(curFlag, newState) if (TestCur(curFlag)) { backState = st##newState; break; }
                 switch (backState) {
                         case stInit: Trans(ATerm, ATerm); Trans(STerm, STerm); break;
                         case stATerm: Trans(Sp, ATermSp); Trans(Sep, ATermSep); Trans(ATerm, ATerm); Trans(STerm, STerm); Trans(Close, ATerm); backState = stInit; break;
                         case stSTerm: Trans(Sp, STermSp); Trans(Sep, STermSep); Trans(ATerm, ATerm); Trans(STerm, STerm); Trans(Close, STerm); backState = stInit; break;
                         case stATermSp: Trans(Sp, ATermSp); Trans(Sep, ATermSep); Trans(ATerm, ATerm); Trans(STerm, STerm); backState = stInit; break;
                         case stSTermSp: Trans(Sp, STermSp); Trans(Sep, STermSep); Trans(ATerm, ATerm); Trans(STerm, STerm); backState = stInit; break;
                         case stATermSep: Trans(ATerm, ATerm); Trans(STerm, STerm); backState = stInit; break;
                         case stSTermSep: Trans(ATerm, ATerm); Trans(STerm, STerm); backState = stInit; break;
                         default: IAssert(false); }
 #undef Trans
 #undef TestCur
                 // Update the peek-ahead automaton.
 #define IsPeekAheadSkippable(sbf) ((sbf & (ucfSbOLetter | ucfSbUpper | ucfSbLower | ucfSbSep | ucfSbSTerm | ucfSbATerm)) == 0)
                 if (! IsPeekAheadSkippable(sbfCur)) {
                         bool isLower = ((sbfCur & ucfSbLower) == ucfSbLower);
                         if (aheadState == stLower) IAssert(isLower);
                         else if (aheadState == stNotLower) IAssert(! isLower);
                         // We haven't peeked ahead farther than this so far -- invalidate the state.
                         aheadState = stUnknown; }
                 if (aheadState == stUnknown)
                 {
                         // Peek ahead to the next non-peekahead-skippable character.
                         size_t pos = posNext;
                         while (pos < srcEnd) {
                                 int cp = (int) src[TVecIdx(pos)]; int sbf = GetSbFlags(cp);
                                 if (! IsPeekAheadSkippable(sbf)) {
                                         if ((sbf & ucfSbLower) == ucfSbLower) aheadState = stLower;
                                         else aheadState = stNotLower;
                                         break; }
                                 WbFindNextNonIgnored(src, pos, srcEnd); }
                         // Reaching the end of the text counts as "not followed by a lowercase letter".
                         if (! (pos < srcEnd)) aheadState = stNotLower;
                 }
 #undef IsPeekAheadSkippable
                 // Helper macros for the rules below; each one skips to the next iteration (no break
                 // here) if the flags match.  TestCurNext2 is defined but not used in this function.
 #define TestCurNext(curFlag, nextFlag) if ((sbfCur & curFlag) == curFlag && (sbfNext & nextFlag) == nextFlag) continue
 #define TestCurNext2(curFlag, nextFlag, next2Flag) if ((sbfCur & curFlag) == curFlag && (sbfNext & nextFlag) == nextFlag && (sbfNext2 & next2Flag) == next2Flag) continue
 #define TestPrevCurNext(prevFlag, curFlag, nextFlag) if ((sbfPrev & prevFlag) == prevFlag && (sbfCur & curFlag) == curFlag && (sbfNext & nextFlag) == nextFlag) continue
                 // SB3.  Do not break within CRLF.
                 if (cCur == 13 && cNext == 10) continue;
                 // SB4.  Break after paragraph separators.
                 if ((sbfCur & ucfSbSep) == ucfSbSep) {
                         if (! CanSentenceEndHere(src, srcIdx, position)) continue;
                         position = posNext; return true; }
                 // Do not break after ambiguous terminators like period, if they are immediately followed by a number
                 // or lowercase letter, if they are between uppercase letters, or if the first following letter
                 // (optionally after certain punctuation) is lowercase.  For example, a period may be an abbreviation
                 // or numeric period, and thus may not mark the end of a sentence.
                 TestCurNext(ucfSbATerm, ucfSbNumeric); // SB6
                 TestPrevCurNext(ucfSbUpper, ucfSbATerm, ucfSbUpper); // SB7
                 // SB8a.  (STerm | ATerm) Close* Sp* [do not break] (STerm | ATerm)
                 if ((backState == stATerm || backState == stATermSp || backState == stSTerm || backState == stSTermSp) &&
                         (sbfNext & (ucfSbSTerm | ucfSbATerm)) != 0) continue;
                 // SB8*.  ATerm Close* Sp* [do not break] ( ! (OLetter | Upper | Lower | Sep | STerm | ATerm) )* Lower
                 if ((backState == stATerm || backState == stATermSp) && aheadState == stLower) continue;
                 // Break after sentence terminators, but include closing punctuation, trailing spaces, and a paragraph separator (if present).
                 // SB9. ( STerm | ATerm ) Close* [do not break] ( Close | Sp | Sep )
                 if ((backState == stATerm || backState == stSTerm) && (sbfNext & (ucfSbClose | ucfSbSp | ucfSbSep)) != 0) continue;
                 // SB10. ( STerm | ATerm ) Close* Sp* [do not break] ( Sp | Sep )
                 // SB11*. ( STerm | ATerm ) Close* Sp* Sep? [do break]
                 if (backState == stATerm || backState == stATermSp || backState == stATermSep || backState == stSTerm || backState == stSTermSp || backState == stSTermSep) {
                         if ((sbfNext & (ucfSbSp | ucfSbSep)) != 0) continue; // SB10
                         if (! CanSentenceEndHere(src, srcIdx, position)) continue;
                         position = posNext; return true; } // SB11
                 // SB12.  Otherwise, do not break.
                 continue;
 #undef TestCurNext
 #undef TestCurNext2
 #undef TestPrevCurNext
         }
         // SB2.  Break at the end of text.
         IAssert(position == srcEnd);
         return true;
 }

Here is the call graph for this function:

Here is the caller graph for this function:

template<typename TSrcVec >

bool TUniChDb::FindNextWordBoundary	(	const TSrcVec &	src,
		const size_t	srcIdx,
		const size_t	srcCount,
		size_t &	position
	)		const

Definition at line 2483 of file unicode.h.

References GetWbFlags(), IAssert, IsWbIgnored(), TestCurNext, TestCurNext2, TestPrevCurNext, ucfWbALetter, ucfWbExtendNumLet, ucfWbKatakana, ucfWbMidLetter, ucfWbMidNum, ucfWbNumeric, WbFindNextNonIgnored(), and WbFindPrevNonIgnored().

Referenced by TUnicode::FindNextWordBoundary(), FindWordBoundaries(), GetCaseConverted(), GetSimpleCaseConverted(), TestFindNextWordOrSentenceBoundary(), and ToSimpleCaseConverted().

 {
         // Finds the next word boundary after 'position' in the text
         // src[srcIdx], ..., src[srcIdx + srcCount - 1].  On success, 'position' is updated to the
         // boundary and true is returned; false is returned only if 'position' is already at or
         // beyond the end of the text.  The rule labels in the comments below (WB1, WB3, ...) refer
         // to the word-boundary rules of the Unicode text segmentation specification.
         // WB1.  Break at the start of text.
         if (position < srcIdx) { position = srcIdx; return true; }
         // If we are beyond the end of the text, there aren't any word breaks left.
         const size_t srcEnd = srcIdx + srcCount;
         if (position >= srcEnd) return false;
         // If 'position' is currently at an ignored character, move it back to the last nonignored character.
         size_t origPos = position;
         if (IsWbIgnored(src[TVecIdx(position)])) {
                 if (! WbFindPrevNonIgnored(src, srcIdx, position))
                         position = origPos;
         }
         // Determine the previous nonignored character (before 'position').
         size_t posPrev = position;
         if (! WbFindPrevNonIgnored(src, srcIdx, posPrev)) posPrev = position;
         // Sec 6.2.  Allow a break between Sep and an ignored character.
         // NOTE(review): this uses the sentence-break predicate IsSbSep in word-boundary code,
         // exactly as FindNextSentenceBoundary does -- confirm that this is intended.
         if (position == origPos && position + 1 < srcEnd && IsSbSep(src[TVecIdx(position)]) && IsWbIgnored(src[TVecIdx(position + 1)])) { position += 1; return true; }
         // Determine the next nonignored character (after 'position').
         size_t posNext = position; WbFindNextNonIgnored(src, posNext, srcEnd);
         size_t posNext2;
         // A value of -1 stands for "no such character" (start or end of the text).
         int cPrev = (posPrev < position ? (int) src[TVecIdx(posPrev)] : -1), cCur = (position < srcEnd ? (int) src[TVecIdx(position)] : -1);
         int cNext = (position < posNext && posNext < srcEnd ? (int) src[TVecIdx(posNext)] : -1);
         int wbfPrev = GetWbFlags(cPrev), wbfCur = GetWbFlags(cCur), wbfNext = GetWbFlags(cNext);
         int cNext2, wbfNext2;
         // Main loop.  Every 'continue' below (including those hidden inside the Test... macros)
         // executes the increment expression of this 'for', which shifts the window
         // prev/cur/next/next2 one nonignored character forward.
         for ( ; position < srcEnd; posPrev = position, position = posNext, posNext = posNext2,
                                                            cPrev = cCur, cCur = cNext, cNext = cNext2,
                                                            wbfPrev = wbfCur, wbfCur = wbfNext, wbfNext = wbfNext2)
         {
                 // Should there be a word boundary between 'position' and 'posNext' (or, more accurately,
                 // between src[posNext - 1] and src[posNext] --- any ignored characters between 'position'
                 // and 'posNext' are considered to belong to the previous character ('position'), not to the next one)?
                 posNext2 = posNext; WbFindNextNonIgnored(src, posNext2, srcEnd);
                 cNext2 = (posNext < posNext2 && posNext2 < srcEnd ? (int) src[TVecIdx(posNext2)] : -1);
                 wbfNext2 = GetWbFlags(cNext2);
                 // Helper macros for the rules below; each one skips to the next iteration
                 // (i.e. "do not break here") if all the given flags match.
 #define TestCurNext(curFlag, nextFlag) if ((wbfCur & curFlag) == curFlag && (wbfNext & nextFlag) == nextFlag) continue
 #define TestCurNext2(curFlag, nextFlag, next2Flag) if ((wbfCur & curFlag) == curFlag && (wbfNext & nextFlag) == nextFlag && (wbfNext2 & next2Flag) == next2Flag) continue
 #define TestPrevCurNext(prevFlag, curFlag, nextFlag) if ((wbfPrev & prevFlag) == prevFlag && (wbfCur & curFlag) == curFlag && (wbfNext & nextFlag) == nextFlag) continue
                 // WB3.  Do not break within CRLF.
                 if (cCur == 13 && cNext == 10) continue;
                 // WB5.  Do not break between most letters.
                 TestCurNext(ucfWbALetter, ucfWbALetter);
                 // WB6.  Do not break letters across certain punctuation (letter, then MidLetter followed by a letter).
                 TestCurNext2(ucfWbALetter, ucfWbMidLetter, ucfWbALetter);
                 // WB7.  Do not break letters across certain punctuation (MidLetter between two letters).
                 TestPrevCurNext(ucfWbALetter, ucfWbMidLetter, ucfWbALetter);
                 // WB8.  Do not break within sequences of digits, or digits adjacent to letters.
                 TestCurNext(ucfWbNumeric, ucfWbNumeric);
                 // WB9.  Do not break within sequences of digits, or digits adjacent to letters.
                 TestCurNext(ucfWbALetter, ucfWbNumeric);
                 // WB10.  Do not break within sequences of digits, or digits adjacent to letters.
                 TestCurNext(ucfWbNumeric, ucfWbALetter);
                 // WB11.  Do not break within sequences, such as "3.2" or "3.456,789".
                 TestPrevCurNext(ucfWbNumeric, ucfWbMidNum, ucfWbNumeric);
                 // WB12.  Do not break within sequences, such as "3.2" or "3.456,789".
                 TestCurNext2(ucfWbNumeric, ucfWbMidNum, ucfWbNumeric);
                 // WB13.  Do not break between Katakana.
                 TestCurNext(ucfWbKatakana, ucfWbKatakana);
                 // WB13a.  Do not break from extenders.
                 if ((wbfCur & (ucfWbALetter | ucfWbNumeric | ucfWbKatakana | ucfWbExtendNumLet)) != 0 &&
                         (wbfNext & ucfWbExtendNumLet) == ucfWbExtendNumLet) continue;
                 // WB13b.  Do not break from extenders.
                 if ((wbfCur & ucfWbExtendNumLet) == ucfWbExtendNumLet &&
                         (wbfNext & (ucfWbALetter | ucfWbNumeric | ucfWbKatakana)) != 0) continue;
                 // WB14.  Otherwise, break everywhere.
                 position = posNext; return true;
 #undef TestCurNext
 #undef TestCurNext2
 #undef TestPrevCurNext
         }
         // WB2.  Break at the end of text.
         IAssert(position == srcEnd);
         return true;
 }

Here is the call graph for this function:

Here is the caller graph for this function:

template<typename TSrcVec >

void TUniChDb::FindSentenceBoundaries	(	const TSrcVec &	src,
		const size_t	srcIdx,
		const size_t	srcCount,
		TBoolV &	dest
	)		const

Definition at line 2793 of file unicode.h.

References Assert, FindNextSentenceBoundary(), TVec< TVal, TSizeTy >::Gen(), TVec< TVal, TSizeTy >::Len(), and TVec< TVal, TSizeTy >::PutAll().

Referenced by TUnicode::FindSentenceBoundaries(), and TestFindNextWordOrSentenceBoundary().

 {
         if (size_t(dest.Len()) != srcCount + 1) dest.Gen(TVecIdx(srcCount + 1));
         dest.PutAll(false);
         size_t position = srcIdx;
         dest[TVecIdx(position - srcIdx)] = true;
         while (position < srcIdx + srcCount)
         {
                 size_t oldPos = position;
                 FindNextSentenceBoundary(src, srcIdx, srcCount, position);
     if (oldPos < position) {
                   Assert(oldPos < position);
     }
     Assert(position <= srcIdx + srcCount);
                 dest[TVecIdx(position - srcIdx)] = true;
         }
         Assert(dest[TVecIdx(srcCount)]);
 }

Here is the call graph for this function:

Here is the caller graph for this function:

template<typename TSrcVec >

void TUniChDb::FindWordBoundaries	(	const TSrcVec &	src,
		const size_t	srcIdx,
		const size_t	srcCount,
		TBoolV &	dest
	)		const

Definition at line 2561 of file unicode.h.

References Assert, FindNextWordBoundary(), TVec< TVal, TSizeTy >::Gen(), TVec< TVal, TSizeTy >::Len(), and TVec< TVal, TSizeTy >::PutAll().

Referenced by TUnicode::FindWordBoundaries(), GetCaseConverted(), and TestFindNextWordOrSentenceBoundary().

 {
         if (size_t(dest.Len()) != srcCount + 1) dest.Gen(TVecIdx(srcCount + 1));
         dest.PutAll(false);
         size_t position = srcIdx;
         dest[TVecIdx(position - srcIdx)] = true;
         while (position < srcIdx + srcCount)
         {
                 size_t oldPos = position;
                 FindNextWordBoundary(src, srcIdx, srcCount, position);
     if (oldPos < position) {
                   Assert(oldPos < position);
     }
     Assert(position <= srcIdx + srcCount);
                 dest[TVecIdx(position - srcIdx)] = true;
         }
         Assert(dest[TVecIdx(srcCount)]);
 }

Here is the call graph for this function:

Here is the caller graph for this function:

static TStr TUniChDb::GetAuxiliaryDir ( )

inlinestatic

Definition at line 1304 of file unicode.h.

Referenced by InitWordAndSentenceBoundaryFlags(), and TestFindNextWordOrSentenceBoundary().

1304 { return "auxiliary"; }

Here is the caller graph for this function:

static TStr TUniChDb::GetBinFn ( )

inlinestatic

Definition at line 1310 of file unicode.h.

Referenced by Test().

1310 { return "UniChDb.bin"; } // used only by Test()

Here is the caller graph for this function:

template<typename TSrcVec , typename TDestCh >

void TUniChDb::GetCaseConverted	(	const TSrcVec &	src,
		size_t	srcIdx,
		const size_t	srcCount,
		TVec< TDestCh > &	dest,
		const bool	clrDest,
		const TCaseConversion	how,
		const bool	turkic,
		const bool	lithuanian
	)		const

Definition at line 2817 of file unicode.h.

References TVec< TVal, TSizeTy >::Add(), TUniCaseFolding::AppendVector(), Assert, TUniChInfo::ccAbove, ccLower, TUniChInfo::ccStarter, ccTitle, ccUpper, TVec< TVal, TSizeTy >::Clr(), FindNextWordBoundary(), FindWordBoundaries(), GetCombiningClass(), THash< TKey, TDat, THashFunc >::GetKeyId(), h, IAssert, TUniChInfo::simpleLowerCaseMapping, TUniChInfo::simpleTitleCaseMapping, TUniChInfo::simpleUpperCaseMapping, specialCasingLower, specialCasingTitle, and specialCasingUpper.

Referenced by GetLowerCase(), GetTitleCase(), GetUpperCase(), and TestCaseConversion().

 {
         const TIntIntVH &specials = (how == ccUpper ? specialCasingUpper : how == ccLower ? specialCasingLower : how == ccTitle ? specialCasingTitle : *((TIntIntVH *) 0));
         if (clrDest) dest.Clr();
         enum {
                 GreekCapitalLetterSigma = 0x3a3,
                 GreekSmallLetterSigma = 0x3c3,
                 GreekSmallLetterFinalSigma = 0x3c2,
                 LatinCapitalLetterI = 0x49,
                 LatinCapitalLetterJ = 0x4a,
                 LatinCapitalLetterIWithOgonek = 0x12e,
                 LatinCapitalLetterIWithGrave = 0xcc,
                 LatinCapitalLetterIWithAcute = 0xcd,
                 LatinCapitalLetterIWithTilde = 0x128,
                 LatinCapitalLetterIWithDotAbove = 0x130,
                 LatinSmallLetterI = 0x69,
                 CombiningDotAbove = 0x307
         };
         //
         bool seenCased = false, seenTwoCased = false; int cpFirstCased = -1;
         size_t nextWordBoundary = srcIdx;
         TBoolV wordBoundaries; bool wbsKnown = false;
         for (const size_t origSrcIdx = srcIdx, srcEnd = srcIdx + srcCount; srcIdx < srcEnd; )
         {
                 int cp = src[TVecIdx(srcIdx)]; srcIdx++;
                 //if (turkic && cp == 0x130 && how == ccLower) printf("!");
                 // For conversion to titlecase, the first cased character of each word
                 // must be converted to titlecase; everything else must be converted
                 // to lowercase.
                 TUniChDb::TCaseConversion howHere;
                 if (how != ccTitle) howHere = how;
                 else {
                         if (srcIdx - 1 == nextWordBoundary) { // A word starts/ends here.
                                 seenCased = false; seenTwoCased = false; cpFirstCased = -1;
                                 size_t next = nextWordBoundary; FindNextWordBoundary(src, origSrcIdx, srcCount, next);
                                 IAssert(next > nextWordBoundary); nextWordBoundary = next; }
                         bool isCased = IsCased(cp);
                         if (isCased && ! seenCased) { howHere = ccTitle; seenCased = true; cpFirstCased = cp; }
                         else { howHere = ccLower;
                                 if (isCased && seenCased) seenTwoCased = true; }
                 }
                 // First, process the conditional mappings from SpecialCasing.txt.
                 // These will be processed in code -- they were ignored while
                 // we were reading SpecialCasing.txt itself.
                 if (cp == GreekCapitalLetterSigma && howHere == ccLower)
                 {
                         // SpecialCasing.txt mentions the 'FinalSigma' condition, but sec. 3.13 of
                         // the standard doesn't define it.  We'll use FinalCased instead.
                         // FinalCased: within the closest word boundaries containing C,
                         // there is a cased letter before C, and there is no cased letter after C.
                         //size_t nextBoundary = srcIdx - 1; FindNextWordBoundary(src, srcIdx, srcCount, nextBoundary);
                         if (! wbsKnown) { FindWordBoundaries(src, origSrcIdx, srcCount, wordBoundaries); wbsKnown = true; }
                         size_t srcIdx2 = srcIdx; bool casedAfter = false;
                         if (how == ccTitle)
                                 printf("!");
                         //while (srcIdx2 < nextBoundary)
                         while (! wordBoundaries[TVecIdx(srcIdx2 - origSrcIdx)])
                         {
                                 int cp2 = src[TVecIdx(srcIdx2)]; srcIdx2++;
                                 if (IsCased(cp2)) { casedAfter = true; break; }
                         }
                         if (! casedAfter)
                         {
                                 //size_t prevBoundary = srcIdx - 1;
                                 //FindPreviousWordBoundary(src, srcIdx, srcCount, prevBoundary);
                                 srcIdx2 = srcIdx - 1; bool casedBefore = false;
                                 //while (prevBoundary < srcIdx2)
                                 while (! wordBoundaries[TVecIdx(srcIdx2 - origSrcIdx)])
                                 {
                                         --srcIdx2; int cp2 = src[TVecIdx(srcIdx2)];
                                         if (IsCased(cp2)) { casedBefore = true; break; }
                                 }
                                 if (casedBefore) {
                                         // Now we have a FinalCased character.
                                         dest.Add(GreekSmallLetterFinalSigma); Assert(howHere == ccLower); continue; }
                         }
                         // If we got here, add a non-final sigma.
                         dest.Add(GreekSmallLetterSigma); continue;
                 }
                 else if (lithuanian)
                 {
                         if (howHere == ccLower)
                         {
                                 if (cp == LatinCapitalLetterI || cp == LatinCapitalLetterJ || cp == LatinCapitalLetterIWithOgonek)
                                 {
                                         bool moreAbove = false;
                                         for (size_t srcIdx2 = srcIdx; srcIdx2 < srcEnd; )
                                         {
                                                 const int cp2 = src[TVecIdx(srcIdx2)]; srcIdx2++;
                                                 const int cc2 = GetCombiningClass(cp2);
                                                 if (cc2 == TUniChInfo::ccStarter) break;
                                                 if (cc2 == TUniChInfo::ccAbove) { moreAbove = true; break; }
                                         }
                                         if (moreAbove)
                                         {
                                                 if (cp == LatinCapitalLetterI) { dest.Add(0x69); dest.Add(0x307); continue; }
                                                 if (cp == LatinCapitalLetterJ) { dest.Add(0x6a); dest.Add(0x307); continue; }
                                                 if (cp == LatinCapitalLetterIWithOgonek) { dest.Add(0x12f); dest.Add(0x307); continue; }
                                         }
                                 }
                                 else if (cp == LatinCapitalLetterIWithGrave) { dest.Add(0x69); dest.Add(0x307); dest.Add(0x300); continue; }
                                 else if (cp == LatinCapitalLetterIWithAcute) { dest.Add(0x69); dest.Add(0x307); dest.Add(0x301); continue; }
                                 else if (cp == LatinCapitalLetterIWithTilde) { dest.Add(0x69); dest.Add(0x307); dest.Add(0x303); continue; }
                         }
                         if (cp == CombiningDotAbove)
                         {
                                 // Lithuanian, howHere != ccLower.
                                 // AfterSoftDotted := the last preceding character with a combining class
                                 // of zero before C was Soft_Dotted, and there is no intervening combining
                                 // character class 230 (ABOVE).
                                 bool afterSoftDotted = false;
                                 size_t srcIdx2 = srcIdx - 1; // now srcIdx2 is the index from which we got 'cp'
                                 while (origSrcIdx < srcIdx2)
                                 {
                                         --srcIdx2; int cp2 = src[TVecIdx(srcIdx2)];
                                         int cc2 = GetCombiningClass(cp2);
                                         if (cc2 == TUniChInfo::ccAbove) break;
                                         if (cc2 == TUniChInfo::ccStarter) {
                                                 afterSoftDotted = IsSoftDotted(cp2); break; }
                                 }
                                 if (afterSoftDotted)
                                 {
                                         Assert(lithuanian);
                                         // Remove DOT ABOVE after "i" with upper or titlecase.
                                         // - Note: but this must only be done if that "i" was actually placed into uppercase (if how == ccTitle,
                                         //   the "i" may have been kept lowercase and thus we shouldn't remove the dot).
                                         if (how == ccLower) { dest.Add(0x307); continue; }
                                         if (how == ccUpper) continue;
                                         Assert(how == ccTitle);
                                         Assert(howHere == ccLower); // because CombiningDotAbove is not a cased character
                                         if (seenCased && ! seenTwoCased) continue; // The "i" has been placed into uppercase; thus, remove the dot.
                                         dest.Add(0x307); continue;
                                 }
                         }
                 }
                 else if (turkic) // language code 'tr' (Turkish) and 'az' (Azeri)
                 {
                         // I and i-dotless; I-dot and i are case pairs in Turkish and Azeri
                         // The following rules handle those cases.
                         if (cp == LatinCapitalLetterIWithDotAbove) {
                                 dest.Add(howHere == ccLower ? 0x69 : 0x130); continue; }
                         // When lowercasing, remove dot_above in the sequence I + dot_above,
                         // which will turn into i.  This matches the behavior of the
                         // canonically equivalent I-dot_above.
                         else if (cp == CombiningDotAbove)
                         {
                                 // AfterI: the last preceding base character was an uppercase I,
                                 // and there is no intervening combining character class 230 (ABOVE).
                                 bool afterI = false;
                                 size_t srcIdx2 = srcIdx - 1; // now srcIdx2 is the index from which we got 'cp'
                                 while (origSrcIdx < srcIdx2)
                                 {
                                         --srcIdx2; int cp2 = src[TVecIdx(srcIdx2)];
                                         if (cp2 == LatinCapitalLetterI) { afterI = true; break; }
                                         int cc2 = GetCombiningClass(cp2);
                                         if (cc2 == TUniChInfo::ccAbove || cc2 == TUniChInfo::ccStarter) break;
                                 }
                                 if (afterI) {
                                         if (how == ccTitle && seenCased && ! seenTwoCased) {
                                                 // Sec. 3.13 defines title-casing in an unusual way: find the first cased character in each word;
                                                 // if found, map it to titlecase; otherwise, map all characters in that word to lowercase.
                                                 // This suggests that if a cased character is found, others in that word should be left alone.
                                                 // This seems unusual; we map all other characters to lowercase instead.
                                                 // But this leads to problems with e.g. I followed by dot-above (U+0307): since the dot-above
                                                 // is not the first cased character (it isn't even cased), we attempt to set it to lowercase;
                                                 // but since afterI is also true here, this would mean deleting it.  Thus our titlecased
                                                 // form of "I followed by dot-above" would be just "I", which is clearly wrong.
                                                 // So we treat this as a special case here.
                                                 IAssert(cpFirstCased == LatinCapitalLetterI);
                                                 dest.Add(0x307); continue; }
                                         if (howHere != ccLower) dest.Add(0x307);
                                         continue; }
                         }
                         // When lowercasing, unless an I is before a dot_above,
                         // it turns into a dotless i.
                         else if (cp == LatinCapitalLetterI)
                         {
                                 // BeforeDot: C is followed by U+0307 (combining dot above).
                                 // Any sequence of characters with a combining class that is
                                 // neither 0 nor 230 may intervene between the current character
                                 // and the combining dot above.
                                 bool beforeDot = false;
                                 for (size_t srcIdx2 = srcIdx; srcIdx2 < srcEnd; )
                                 {
                                         const int cp2 = src[TVecIdx(srcIdx2)]; srcIdx2++;
                                         if (cp2 == 0x307) { beforeDot = true; break; }
                                         const int cc2 = GetCombiningClass(cp2);
                                         if (cc2 == TUniChInfo::ccStarter || cc2 == TUniChInfo::ccAbove) break;
                                 }
                                 if (! beforeDot) {
                                         dest.Add(howHere == ccLower ? 0x131 : 0x49); continue; }
                         }
                         // When uppercasing, i turns into a dotted capital I.
                         else if (cp == LatinSmallLetterI)
                         {
                                 dest.Add(howHere == ccLower ? 0x69 : 0x130); continue;
                         }
                 }
                 // Try to use the unconditional mappings.
                 const TIntIntVH &specHere = (
                         howHere == how ? specials :
                         howHere == ccLower ? specialCasingLower :
                         howHere == ccTitle ? specialCasingTitle :
                         howHere == ccUpper ? specialCasingUpper : *((TIntIntVH *) 0));
                 int i = specHere.GetKeyId(cp);
                 if (i >= 0) { TUniCaseFolding::AppendVector(specHere[i], dest); continue; }
                 // Try to use the simple (one-character) mappings.
                 i = h.GetKeyId(cp);
                 if (i >= 0) {
                         const TUniChInfo &ci = h[i];
                         int cpNew = (
                                 howHere == ccLower ? ci.simpleLowerCaseMapping :
                                 howHere == ccUpper ? ci.simpleUpperCaseMapping :
                                                                          ci.simpleTitleCaseMapping);
                         if (cpNew < 0) cpNew = cp;
                         dest.Add(cpNew); continue; }
                 // As a final resort, leave 'cp' unchanged.
                 dest.Add(cp);
         }
 }

Here is the call graph for this function:

Here is the caller graph for this function:

template<typename TSrcVec , typename TDestCh >

void TUniChDb::GetCaseFolded	(	const TSrcVec &	src,
		size_t	srcIdx,
		const size_t	srcCount,
		TVec< TDestCh > &	dest,
		const bool	clrDest,
		const bool	full,
		const bool	turkic = `false`
	)		const

inline

Definition at line 1629 of file unicode.h.

References caseFolding, and TUniCaseFolding::Fold().

Referenced by GetCaseFolded(), and TUnicode::GetCaseFolded().

1630 { caseFolding.Fold(src, srcIdx, srcCount, dest, clrDest, full, turkic); }

TUniCaseFolding::Fold

void Fold(const TSrcVec &src, size_t srcIdx, const size_t srcCount, TVec< TDestCh > &dest, const bool clrDest, const bool full, const bool turkic) const

Definition: unicode.h:293

TUniChDb::caseFolding

TUniCaseFolding caseFolding

Definition: unicode.h:1268

Here is the call graph for this function:

Here is the caller graph for this function:

template<typename TSrcVec , typename TDestCh >

void TUniChDb::GetCaseFolded	(	const TSrcVec &	src,
		TVec< TDestCh > &	dest,
		const bool	clrDest = `true`,
		const bool	full = `true`,
		const bool	turkic = `false`
	)		const

inline

Definition at line 1632 of file unicode.h.

References GetCaseFolded().

1632 {

1633 GetCaseFolded(src, 0, src.Len(), dest, clrDest, full, turkic); }

TUniChDb::GetCaseFolded

void GetCaseFolded(const TSrcVec &src, size_t srcIdx, const size_t srcCount, TVec< TDestCh > &dest, const bool clrDest, const bool full, const bool turkic=false) const

Definition: unicode.h:1629

Here is the call graph for this function:

static TStr TUniChDb::GetCaseFoldingFn ( )

inlinestatic

Definition at line 1296 of file unicode.h.

Referenced by LoadTxt().

1296 { return "CaseFolding.txt"; }

Here is the caller graph for this function:

TUniChCategory TUniChDb::GetCat ( const int cp ) const

inline

Definition at line 1353 of file unicode.h.

References THash< TKey, TDat, THashFunc >::GetKeyId(), and h.

Referenced by TUnicode::___UniFwd2(), and CanSentenceEndHere().

1353 { int i = h.GetKeyId(cp); if (i < 0) return ucOther; else return h[i].cat; }

THash< TInt, TUniChInfo > h

Definition: unicode.h:1263

int GetKeyId(const TKey &Key) const

Definition: hash.h:420

Here is the call graph for this function:

Here is the caller graph for this function:

const char* TUniChDb::GetCharName ( const int cp ) const

inline

Definition at line 1331 of file unicode.h.

References charNames, TStrPool::GetCStr(), THash< TKey, TDat, THashFunc >::GetKeyId(), and h.

Referenced by TUnicode::GetCharName(), and GetCharNameS().

1331 { int i = h.GetKeyId(cp); if (i < 0) return 0; int ofs = h[i].nameOffset; return ofs < 0 ? 0 : charNames.GetCStr(ofs); }

TStrPool::GetCStr

const char * GetCStr(const uint &Offset) const

Definition: dt.h:811

THash< TInt, TUniChInfo > h

Definition: unicode.h:1263

int GetKeyId(const TKey &Key) const

Definition: hash.h:420

TUniChDb::charNames

TStrPool charNames

Definition: unicode.h:1264

Here is the call graph for this function:

Here is the caller graph for this function:

TStr TUniChDb::GetCharNameS ( const int cp ) const

inline

Definition at line 1332 of file unicode.h.

References GetCharName().

Referenced by TUnicode::GetCharNameS(), and PrintCharNames().

                                               {
                 // ToDo: Add special processing for precomposed Hangul syllables (UAX #15, sec. 16).
                 const char *p = GetCharName(cp); if (p) return p;
                 char buf[20]; sprintf(buf, "U+%04x", cp); return TStr(buf); }

Here is the call graph for this function:

Here is the caller graph for this function:

int TUniChDb::GetCombiningClass ( const int cp ) const

inline

Definition at line 1399 of file unicode.h.

References TUniChInfo::ccStarter, THash< TKey, TDat, THashFunc >::GetKeyId(), and h.

Referenced by Compose(), Decompose(), ExtractStarters(), and GetCaseConverted().

1399 { int i = h.GetKeyId(cp); if (i < 0) return TUniChInfo::ccStarter; else return h[i].combClass; }

TUniChInfo::ccStarter

Definition: unicode.h:989

THash< TInt, TUniChInfo > h

Definition: unicode.h:1263

int GetKeyId(const TKey &Key) const

Definition: hash.h:420

Here is the call graph for this function:

Here is the caller graph for this function:

static TStr TUniChDb::GetCompositionExclusionsFn ( )

inlinestatic

Definition at line 1299 of file unicode.h.

Referenced by LoadTxt().

1299 { return "CompositionExclusions.txt"; }

Here is the caller graph for this function:

static TStr TUniChDb::GetDerivedCorePropsFn ( )

inlinestatic

Definition at line 1301 of file unicode.h.

Referenced by InitDerivedCoreProperties().

1301 { return "DerivedCoreProperties.txt"; }

Here is the caller graph for this function:

static TStr TUniChDb::GetLineBreakFn ( )

inlinestatic

Definition at line 1302 of file unicode.h.

Referenced by InitLineBreaks().

1302 { return "LineBreak.txt"; }

Here is the caller graph for this function:

template<typename TSrcVec , typename TDestCh >

void TUniChDb::GetLowerCase	(	const TSrcVec &	src,
		size_t	srcIdx,
		const size_t	srcCount,
		TVec< TDestCh > &	dest,
		const bool	clrDest = `true`,
		const bool	turkic = `false`,
		const bool	lithuanian = `false`
	)		const

inline

Definition at line 1590 of file unicode.h.

References ccLower, and GetCaseConverted().

Referenced by TUnicode::GetLowerCase().

1590 { GetCaseConverted(src, srcIdx, srcCount, dest, clrDest, ccLower, turkic, lithuanian); }

TUniChDb::GetCaseConverted

Definition: unicode.h:1584

void GetCaseConverted(const TSrcVec &src, size_t srcIdx, const size_t srcCount, TVec< TDestCh > &dest, const bool clrDest, const TCaseConversion how, const bool turkic, const bool lithuanian) const

Definition: unicode.h:2817

Here is the call graph for this function:

Here is the caller graph for this function:

template<typename TSrcVec , typename TDestCh >

void TUniChDb::GetLowerCase	(	const TSrcVec &	src,
		TVec< TDestCh > &	dest,
		const bool	clrDest = `true`,
		const bool	turkic = `false`,
		const bool	lithuanian = `false`
	)		const

inline

Definition at line 1593 of file unicode.h.

References GetLowerCase().

Referenced by GetLowerCase().

1593 { GetLowerCase(src, 0, src.Len(), dest, clrDest, turkic, lithuanian); }

TUniChDb::GetLowerCase

void GetLowerCase(const TSrcVec &src, size_t srcIdx, const size_t srcCount, TVec< TDestCh > &dest, const bool clrDest=true, const bool turkic=false, const bool lithuanian=false) const

Definition: unicode.h:1590

Here is the call graph for this function:

Here is the caller graph for this function:

static TStr TUniChDb::GetNormalizationTestFn ( )

inlinestatic

Definition at line 1309 of file unicode.h.

Referenced by TestComposition().

1309 { return "NormalizationTest.txt"; }

Here is the caller graph for this function:

static TStr TUniChDb::GetPropListFn ( )

inlinestatic

Definition at line 1303 of file unicode.h.

Referenced by InitPropList().

1303 { return "PropList.txt"; }

Here is the caller graph for this function:

int TUniChDb::GetSbFlags ( const int cp ) const

inline

Definition at line 1359 of file unicode.h.

References THash< TKey, TDat, THashFunc >::GetKeyId(), and h.

Referenced by CanSentenceEndHere(), FindNextSentenceBoundary(), and TestFindNextWordOrSentenceBoundary().

1359 { int i = h.GetKeyId(cp); if (i < 0) return 0; else return h[i].GetSbFlags(); }

THash< TInt, TUniChInfo > h

Definition: unicode.h:1263

int GetKeyId(const TKey &Key) const

Definition: hash.h:420

Here is the call graph for this function:

Here is the caller graph for this function:

int TUniChDb::GetScript ( const TUniChInfo & ci ) const

inline

Definition at line 1323 of file unicode.h.

References TUniChInfo::script, and scriptUnknown.

Referenced by TUStr::GetChScriptId().

1323 { int s = ci.script; if (s < 0) s = scriptUnknown; return s; }

TUniChInfo::script

signed char script

Definition: unicode.h:1021

TUniChDb::scriptUnknown

int scriptUnknown

Definition: unicode.h:1272

Here is the caller graph for this function:

int TUniChDb::GetScript ( const int cp ) const

inline

Definition at line 1324 of file unicode.h.

References THash< TKey, TDat, THashFunc >::GetKeyId(), GetScript(), h, and scriptUnknown.

Referenced by GetScript().

1324 { int i = h.GetKeyId(cp); if (i < 0) return scriptUnknown; else return GetScript(h[i]); }

TUniChDb::GetScript

int GetScript(const TUniChInfo &ci) const

Definition: unicode.h:1323

THash< TInt, TUniChInfo > h

Definition: unicode.h:1263

int GetKeyId(const TKey &Key) const

Definition: hash.h:420

TUniChDb::scriptUnknown

int scriptUnknown

Definition: unicode.h:1272

Here is the call graph for this function:

Here is the caller graph for this function:

int TUniChDb::GetScriptByName ( const TStr & scriptName ) const

inline

Definition at line 1322 of file unicode.h.

References THash< TKey, TDat, THashFunc >::GetKeyId(), and scripts.

Referenced by TUStr::GetScriptId(), InitAfterLoad(), InitWordAndSentenceBoundaryFlags(), and LoadTxt().

1322 { return scripts.GetKeyId(scriptName); }

THash::GetKeyId

int GetKeyId(const TKey &Key) const

Definition: hash.h:420

TUniChDb::scripts

TStrIntH scripts

Definition: unicode.h:1265

Here is the call graph for this function:

Here is the caller graph for this function:

const TStr& TUniChDb::GetScriptName ( const int scriptId ) const

inline

Definition at line 1321 of file unicode.h.

References THash< TKey, TDat, THashFunc >::GetKey(), and scripts.

Referenced by TUStr::GetScriptNm(), and TestWbFindNonIgnored().

1321 { return scripts.GetKey(scriptId); }

TUniChDb::scripts

TStrIntH scripts

Definition: unicode.h:1265

THash::GetKey

const TKey & GetKey(const int &KeyId) const

Definition: hash.h:210

Here is the call graph for this function:

Here is the caller graph for this function:

static TStr TUniChDb::GetScriptNameHiragana ( )

inlinestatic

Definition at line 1319 of file unicode.h.

Referenced by InitWordAndSentenceBoundaryFlags().

1319 { return "Hiragana"; }

Here is the caller graph for this function:

static TStr TUniChDb::GetScriptNameKatakana ( )

inlinestatic

Definition at line 1318 of file unicode.h.

Referenced by InitWordAndSentenceBoundaryFlags().

1318 { return "Katakana"; }

Here is the caller graph for this function:

static TStr TUniChDb::GetScriptNameUnknown ( )

inlinestatic

Definition at line 1317 of file unicode.h.

Referenced by InitAfterLoad(), InitScripts(), and LoadTxt().

1317 { return "Unknown"; }

Here is the caller graph for this function:

static TStr TUniChDb::GetScriptsFn ( )

inlinestatic

Definition at line 1300 of file unicode.h.

Referenced by InitScripts().

1300 { return "Scripts.txt"; }

Here is the caller graph for this function:

static TStr TUniChDb::GetSentenceBreakPropertyFn ( )

inlinestatic

Definition at line 1308 of file unicode.h.

Referenced by InitWordAndSentenceBoundaryFlags().

1308 { return "SentenceBreakProperty.txt"; }

Here is the caller graph for this function:

static TStr TUniChDb::GetSentenceBreakTestFn ( )

inlinestatic

Definition at line 1307 of file unicode.h.

Referenced by TestFindNextWordOrSentenceBoundary().

1307 { return "SentenceBreakTest.txt"; }

Here is the caller graph for this function:

template<typename TSrcVec , typename TDestCh >

void TUniChDb::GetSimpleCaseConverted	(	const TSrcVec &	src,
		size_t	srcIdx,
		const size_t	srcCount,
		TVec< TDestCh > &	dest,
		const bool	clrDest,
		const TCaseConversion	how
	)		const

Definition at line 3042 of file unicode.h.

References TVec< TVal, TSizeTy >::Add(), ccLower, ccTitle, ccUpper, TVec< TVal, TSizeTy >::Clr(), FindNextWordBoundary(), THash< TKey, TDat, THashFunc >::GetKeyId(), h, IAssert, TUniChInfo::simpleLowerCaseMapping, TUniChInfo::simpleTitleCaseMapping, and TUniChInfo::simpleUpperCaseMapping.

Referenced by GetSimpleLowerCase(), GetSimpleTitleCase(), and GetSimpleUpperCase().

 {
         if (clrDest) dest.Clr();
         bool seenCased = false; size_t nextWordBoundary = srcIdx;
         for (const size_t origSrcIdx = srcIdx, srcEnd = srcIdx + srcCount; srcIdx < srcEnd; )
         {
                 const int cp = src[TVecIdx(srcIdx)]; srcIdx++;
                 int i = h.GetKeyId(cp); if (i < 0) { dest.Add(cp); continue; }
                 const TUniChInfo &ci = h[i];
                 // With titlecasing, the first cased character of each word must be put into titlecase,
                 // all others into lowercase.  This is what the howHere variable is for.
                 TUniChDb::TCaseConversion howHere;
                 if (how != ccTitle) howHere = how;
                 else {
                         if (srcIdx - 1 == nextWordBoundary) { // A word starts/ends here.
                                 seenCased = false;
                                 size_t next = nextWordBoundary; FindNextWordBoundary(src, origSrcIdx, srcCount, next);
                                 IAssert(next > nextWordBoundary); nextWordBoundary = next; }
                         bool isCased = IsCased(cp);
                         if (isCased && ! seenCased) { howHere = ccTitle; seenCased = true; }
                         else howHere = ccLower;
                 }
                 int cpNew = (howHere == ccTitle ? ci.simpleTitleCaseMapping : howHere == ccUpper ? ci.simpleUpperCaseMapping : ci.simpleLowerCaseMapping);
                 if (cpNew < 0) cpNew = cp;
                 dest.Add(cpNew);
         }
 }

Here is the call graph for this function:

Here is the caller graph for this function:

template<typename TSrcVec , typename TDestCh >

void TUniChDb::GetSimpleLowerCase	(	const TSrcVec &	src,
		size_t	srcIdx,
		const size_t	srcCount,
		TVec< TDestCh > &	dest,
		const bool	clrDest = `true`
	)		const

inline

Definition at line 1601 of file unicode.h.

References ccLower, and GetSimpleCaseConverted().

Referenced by TUnicode::GetSimpleLowerCase().

1601 { GetSimpleCaseConverted(src, srcIdx, srcCount, dest, clrDest, ccLower); }

void GetSimpleCaseConverted(const TSrcVec &src, size_t srcIdx, const size_t srcCount, TVec< TDestCh > &dest, const bool clrDest, const TCaseConversion how) const

Definition: unicode.h:3042

TUniChDb::GetSimpleLowerCase

Definition: unicode.h:1584

Here is the call graph for this function:

Here is the caller graph for this function:

template<typename TSrcVec , typename TDestCh >

void TUniChDb::GetSimpleLowerCase	(	const TSrcVec &	src,
		TVec< TDestCh > &	dest,
		const bool	clrDest = `true`
	)		const

inline

Definition at line 1604 of file unicode.h.

References GetSimpleLowerCase().

Referenced by GetSimpleLowerCase().

1604 { GetSimpleLowerCase(src, 0, src.Len(), dest, clrDest); }

void GetSimpleLowerCase(const TSrcVec &src, size_t srcIdx, const size_t srcCount, TVec< TDestCh > &dest, const bool clrDest=true) const

Definition: unicode.h:1601

Here is the call graph for this function:

Here is the caller graph for this function:

template<typename TSrcVec , typename TDestCh >

void TUniChDb::GetSimpleTitleCase	(	const TSrcVec &	src,
		size_t	srcIdx,
		const size_t	srcCount,
		TVec< TDestCh > &	dest,
		const bool	clrDest = `true`
	)		const

inline

Definition at line 1603 of file unicode.h.

References ccTitle, and GetSimpleCaseConverted().

Referenced by TUnicode::GetSimpleTitleCase().

1603 { GetSimpleCaseConverted(src, srcIdx, srcCount, dest, clrDest, ccTitle); }

TUniChDb::GetSimpleCaseConverted

void GetSimpleCaseConverted(const TSrcVec &src, size_t srcIdx, const size_t srcCount, TVec< TDestCh > &dest, const bool clrDest, const TCaseConversion how) const

Definition: unicode.h:3042

TUniChDb::GetSimpleTitleCase

Definition: unicode.h:1584

Here is the call graph for this function:

Here is the caller graph for this function:

template<typename TSrcVec , typename TDestCh >

void TUniChDb::GetSimpleTitleCase	(	const TSrcVec &	src,
		TVec< TDestCh > &	dest,
		const bool	clrDest = `true`
	)		const

inline

Definition at line 1606 of file unicode.h.

References GetSimpleTitleCase().

Referenced by GetSimpleTitleCase().

1606 { GetSimpleTitleCase(src, 0, src.Len(), dest, clrDest); }

void GetSimpleTitleCase(const TSrcVec &src, size_t srcIdx, const size_t srcCount, TVec< TDestCh > &dest, const bool clrDest=true) const

Definition: unicode.h:1603

Here is the call graph for this function:

Here is the caller graph for this function:

template<typename TSrcVec , typename TDestCh >

void TUniChDb::GetSimpleUpperCase	(	const TSrcVec &	src,
		size_t	srcIdx,
		const size_t	srcCount,
		TVec< TDestCh > &	dest,
		const bool	clrDest = `true`
	)		const

inline

Definition at line 1602 of file unicode.h.

References ccUpper, and GetSimpleCaseConverted().

Referenced by TUnicode::GetSimpleUpperCase().

1602 { GetSimpleCaseConverted(src, srcIdx, srcCount, dest, clrDest, ccUpper); }

TUniChDb::GetSimpleCaseConverted

Definition: unicode.h:1584

void GetSimpleCaseConverted(const TSrcVec &src, size_t srcIdx, const size_t srcCount, TVec< TDestCh > &dest, const bool clrDest, const TCaseConversion how) const

Definition: unicode.h:3042

Here is the call graph for this function:

Here is the caller graph for this function:

template<typename TSrcVec , typename TDestCh >

void TUniChDb::GetSimpleUpperCase	(	const TSrcVec &	src,
		TVec< TDestCh > &	dest,
		const bool	clrDest = `true`
	)		const

inline

Definition at line 1605 of file unicode.h.

References GetSimpleUpperCase().

Referenced by GetSimpleUpperCase().

1605 { GetSimpleUpperCase(src, 0, src.Len(), dest, clrDest); }

TUniChDb::GetSimpleUpperCase

void GetSimpleUpperCase(const TSrcVec &src, size_t srcIdx, const size_t srcCount, TVec< TDestCh > &dest, const bool clrDest=true) const

Definition: unicode.h:1602

Here is the call graph for this function:

Here is the caller graph for this function:

static TStr TUniChDb::GetSpecialCasingFn ( )

inline static

Definition at line 1297 of file unicode.h.

Referenced by InitSpecialCasing().

1297 { return "SpecialCasing.txt"; }

Here is the caller graph for this function:

TUniChSubCategory TUniChDb::GetSubCat ( const int cp ) const

inline

Definition at line 1354 of file unicode.h.

References THash< TKey, TDat, THashFunc >::GetKeyId(), and h.

Referenced by TUnicode::GetSubCat().

1354 { int i = h.GetKeyId(cp); if (i < 0) return ucOtherNotAssigned; else return h[i].subCat; }

THash< TInt, TUniChInfo > h

Definition: unicode.h:1263

int GetKeyId(const TKey &Key) const

Definition: hash.h:420

Here is the call graph for this function:

Here is the caller graph for this function:

template<typename TSrcVec , typename TDestCh >

void TUniChDb::GetTitleCase	(	const TSrcVec &	src,
		size_t	srcIdx,
		const size_t	srcCount,
		TVec< TDestCh > &	dest,
		const bool	clrDest = `true`,
		const bool	turkic = `false`,
		const bool	lithuanian = `false`
	)		const

inline

Definition at line 1592 of file unicode.h.

References ccTitle, and GetCaseConverted().

Referenced by TUnicode::GetTitleCase().

1592 { GetCaseConverted(src, srcIdx, srcCount, dest, clrDest, ccTitle, turkic, lithuanian); }

TUniChDb::GetCaseConverted

Definition: unicode.h:1584

void GetCaseConverted(const TSrcVec &src, size_t srcIdx, const size_t srcCount, TVec< TDestCh > &dest, const bool clrDest, const TCaseConversion how, const bool turkic, const bool lithuanian) const

Definition: unicode.h:2817

Here is the call graph for this function:

Here is the caller graph for this function:

template<typename TSrcVec , typename TDestCh >

void TUniChDb::GetTitleCase	(	const TSrcVec &	src,
		TVec< TDestCh > &	dest,
		const bool	clrDest = `true`,
		const bool	turkic = `false`,
		const bool	lithuanian = `false`
	)		const

inline

Definition at line 1595 of file unicode.h.

References GetTitleCase().

Referenced by GetTitleCase().

1595 { GetTitleCase(src, 0, src.Len(), dest, clrDest, turkic, lithuanian); }

TUniChDb::GetTitleCase

void GetTitleCase(const TSrcVec &src, size_t srcIdx, const size_t srcCount, TVec< TDestCh > &dest, const bool clrDest=true, const bool turkic=false, const bool lithuanian=false) const

Definition: unicode.h:1592

Here is the call graph for this function:

Here is the caller graph for this function:

static TStr TUniChDb::GetUnicodeDataFn ( )

inline static

Definition at line 1298 of file unicode.h.

Referenced by LoadTxt().

1298 { return "UnicodeData.txt"; }

Here is the caller graph for this function:

template<typename TSrcVec , typename TDestCh >

void TUniChDb::GetUpperCase	(	const TSrcVec &	src,
		size_t	srcIdx,
		const size_t	srcCount,
		TVec< TDestCh > &	dest,
		const bool	clrDest = `true`,
		const bool	turkic = `false`,
		const bool	lithuanian = `false`
	)		const

inline

Definition at line 1591 of file unicode.h.

References ccUpper, and GetCaseConverted().

Referenced by TUnicode::GetUpperCase().

1591 { GetCaseConverted(src, srcIdx, srcCount, dest, clrDest, ccUpper, turkic, lithuanian); }

TUniChDb::GetCaseConverted

Definition: unicode.h:1584

void GetCaseConverted(const TSrcVec &src, size_t srcIdx, const size_t srcCount, TVec< TDestCh > &dest, const bool clrDest, const TCaseConversion how, const bool turkic, const bool lithuanian) const

Definition: unicode.h:2817

Here is the call graph for this function:

Here is the caller graph for this function:

template<typename TSrcVec , typename TDestCh >

void TUniChDb::GetUpperCase	(	const TSrcVec &	src,
		TVec< TDestCh > &	dest,
		const bool	clrDest = `true`,
		const bool	turkic = `false`,
		const bool	lithuanian = `false`
	)		const

inline

Definition at line 1594 of file unicode.h.

References GetUpperCase().

Referenced by GetUpperCase().

1594 { GetUpperCase(src, 0, src.Len(), dest, clrDest, turkic, lithuanian); }

TUniChDb::GetUpperCase

void GetUpperCase(const TSrcVec &src, size_t srcIdx, const size_t srcCount, TVec< TDestCh > &dest, const bool clrDest=true, const bool turkic=false, const bool lithuanian=false) const

Definition: unicode.h:1591

Here is the call graph for this function:

Here is the caller graph for this function:

int TUniChDb::GetWbFlags ( const int cp ) const

inline

Definition at line 1357 of file unicode.h.

References THash< TKey, TDat, THashFunc >::GetKeyId(), and h.

Referenced by FindNextWordBoundary(), and TestFindNextWordOrSentenceBoundary().

1357 { int i = h.GetKeyId(cp); if (i < 0) return 0; else return h[i].GetWbFlags(); }

THash< TInt, TUniChInfo > h

Definition: unicode.h:1263

References THash< TKey, TDat, THashFunc >::AddKey(), TUniChDb::TUcdFileReader::Close(), anonymous_namespace{unicode.cpp}::CombinePath(), TStr::CStr(), FailR, GetDerivedCorePropsFn(), THash< TKey, TDat, THashFunc >::GetKeyId(), TUniChDb::TUcdFileReader::GetNextLine(), h, IAssert, TUniChInfo::IsDcpFlag(), TVec< TVal, TSizeTy >::Len(), TUniChDb::TUcdFileReader::Open(), TUniChDb::TUcdFileReader::ParseCodePointRange(), TUniChDb::TSubcatHelper::ProcessComment(), TUniChDb::TSubcatHelper::SetCat(), TUniChInfo::SetDcpFlag(), TUniChDb::TSubcatHelper::TestCat(), ucfCompatibilityDecomposition, ucfDcpAlphabetic, ucfDcpDefaultIgnorableCodePoint, ucfDcpGraphemeBase, ucfDcpGraphemeExtend, ucfDcpIdContinue, ucfDcpIdStart, ucfDcpLowercase, ucfDcpMath, ucfDcpUppercase, ucfDcpXidContinue, and ucfDcpXidStart.

int GetKeyId(const TKey &Key) const

Definition: hash.h:420

Here is the call graph for this function:

Here is the caller graph for this function:

static TStr TUniChDb::GetWordBreakPropertyFn ( )

inline static

Definition at line 1306 of file unicode.h.

Referenced by InitWordAndSentenceBoundaryFlags().

1306 { return "WordBreakProperty.txt"; }

Here is the caller graph for this function:

static TStr TUniChDb::GetWordBreakTestFn ( )

inline static

Definition at line 1305 of file unicode.h.

Referenced by TestFindNextWordOrSentenceBoundary().

1305 { return "WordBreakTest.txt"; }

Here is the caller graph for this function:

void TUniChDb::InitAfterLoad ( )

protected

Definition at line 1368 of file unicode.cpp.

References GetScriptByName(), GetScriptNameUnknown(), IAssert, and scriptUnknown.

Referenced by Load().

 {
         scriptUnknown = GetScriptByName(GetScriptNameUnknown()); IAssert(scriptUnknown >= 0);
 }

Here is the call graph for this function:

Here is the caller graph for this function:

void TUniChDb::InitDerivedCoreProperties ( const TStr & basePath )

protected

Definition at line 1007 of file unicode.cpp.

Referenced by LoadTxt().

 {
         TUcdFileReader reader; TStrV fields; int nCps = 0, nLines = 0;
         reader.Open(CombinePath(basePath, GetDerivedCorePropsFn()));
         TSubcatHelper helper(*this);
         while (reader.GetNextLine(fields))
         {
                 IAssert(fields.Len() == 2);
                 int from, to; reader.ParseCodePointRange(fields[0], from, to);
                 TStr s = fields[1];
                 TUniChFlags flag = ucfCompatibilityDecomposition;
                 if (s == "Math") flag = ucfDcpMath;
                 else if (s == "Alphabetic") flag = ucfDcpAlphabetic;
                 else if (s == "Lowercase") flag = ucfDcpLowercase;
                 else if (s == "Uppercase") flag = ucfDcpUppercase;
                 else if (s == "ID_Start") flag = ucfDcpIdStart;
                 else if (s == "ID_Continue") flag = ucfDcpIdContinue;
                 else if (s == "XID_Start") flag = ucfDcpXidStart;
                 else if (s == "XID_Continue") flag = ucfDcpXidContinue;
                 else if (s == "Default_Ignorable_Code_Point") flag = ucfDcpDefaultIgnorableCodePoint;
                 else if (s == "Grapheme_Extend") flag = ucfDcpGraphemeExtend;
                 else if (s == "Grapheme_Base") flag = ucfDcpGraphemeBase;
                 else if (s == "Grapheme_Link") continue; // this flag is deprecated; test for combClass == Virama instead
                 else FailR(s.CStr());
                 // If we add new codepoints to the hash table, we should also set their category.
                 // This is supposed to be provided in the comment, e.g. "# Cf       SOFT HYPHEN".
                 helper.ProcessComment(reader);
                 //
                 for (int cp = from; cp <= to; cp++) {
                         int i = h.GetKeyId(cp); if (i < 0) { i = h.AddKey(cp); helper.SetCat(cp); }
                         helper.TestCat(cp);
                         TUniChInfo &ci = h[i]; IAssert(! ci.IsDcpFlag(flag));
                         ci.SetDcpFlag(flag); nCps++; }
                 nLines++;
         }
         reader.Close();
         printf("TUniChDb::InitDerivedCoreProperties: %d lines, %d code points.\n", nLines, nCps);
 }

Here is the call graph for this function:

Here is the caller graph for this function:

void TUniChDb::InitLineBreaks ( const TStr & basePath )

protected

Definition at line 1046 of file unicode.cpp.

References THash< TKey, TDat, THashFunc >::AddKey(), TUniChDb::TUcdFileReader::Close(), anonymous_namespace{unicode.cpp}::CombinePath(), THash< TKey, TDat, THashFunc >::FFirstKeyId(), THash< TKey, TDat, THashFunc >::FNextKeyId(), THash< TKey, TDat, THashFunc >::GetKeyId(), TUniChInfo::GetLineBreakCode(), GetLineBreakFn(), TUniChDb::TUcdFileReader::GetNextLine(), h, IAssert, TStr::Len(), TVec< TVal, TSizeTy >::Len(), TUniChInfo::LineBreak_Unknown, TUniChDb::TUcdFileReader::Open(), and TUniChDb::TUcdFileReader::ParseCodePointRange().

Referenced by LoadTxt().

 {
         // Clear old linebreak values.
         ushort xx = TUniChInfo::LineBreak_Unknown;
         for (int i = h.FFirstKeyId(); h.FNextKeyId(i); ) h[i].lineBreak = xx;
         // Read LineBreak.txt.
         TUcdFileReader reader; TStrV fields;
         reader.Open(CombinePath(basePath, GetLineBreakFn()));
         int nLines = 0, nCps = 0;
         while (reader.GetNextLine(fields))
         {
                 IAssert(fields.Len() == 2);
                 int from, to; reader.ParseCodePointRange(fields[0], from, to);
                 TStr s = fields[1]; IAssert(s.Len() == 2);
                 ushort us = TUniChInfo::GetLineBreakCode(s[0], s[1]);
                 if (us == xx) continue;
                 for (int cp = from; cp <= to; cp++) {
                         int i = h.GetKeyId(cp); if (i < 0) { i = h.AddKey(cp);
                                 printf("TUniChDb::InitLineBreaks: warning, adding codepoint %d, its category will remain unknown.\n", cp); }
                         IAssert(h[i].lineBreak == xx);
                         h[i].lineBreak = us; nCps++; }
                 nLines++;
         }
         reader.Close();
         printf("TUniChDb::InitLineBreaks: %d lines, %d codepoints processed (excluding \'xx\' values).\n", nLines, nCps);
 }

Here is the call graph for this function:

Here is the caller graph for this function:

void TUniChDb::InitPropList ( const TStr & basePath )

protected

Definition at line 950 of file unicode.cpp.

References THash< TKey, TDat, THashFunc >::AddKey(), TUniChDb::TUcdFileReader::Close(), anonymous_namespace{unicode.cpp}::CombinePath(), TStr::CStr(), FailR, THash< TKey, TDat, THashFunc >::GetKeyId(), TUniChDb::TUcdFileReader::GetNextLine(), GetPropListFn(), h, IAssert, TUniChInfo::IsProperty(), TUniChInfo::IsPropertyX(), TVec< TVal, TSizeTy >::Len(), TUniChDb::TUcdFileReader::Open(), TUniChDb::TUcdFileReader::ParseCodePointRange(), TUniChDb::TSubcatHelper::ProcessComment(), TUniChDb::TSubcatHelper::SetCat(), TUniChInfo::SetProperty(), TUniChInfo::SetPropertyX(), TUniChDb::TSubcatHelper::TestCat(), ucfPrAsciiHexDigit, ucfPrBidiControl, ucfPrDash, ucfPrDeprecated, ucfPrDiacritic, ucfPrExtender, ucfPrHexDigit, ucfPrHyphen, ucfPrIdeographic, ucfPrJoinControl, ucfPrLogicalOrderException, ucfPrNoncharacterCodePoint, ucfPrPatternSyntax, ucfPrPatternWhiteSpace, ucfPrQuotationMark, ucfPrSoftDotted, ucfPrSTerm, ucfPrTerminalPunctuation, ucfPrVariationSelector, ucfPrWhiteSpace, ucfPxIdsBinaryOperator, ucfPxIdsTrinaryOperator, ucfPxOtherAlphabetic, ucfPxOtherDefaultIgnorableCodePoint, ucfPxOtherGraphemeExtend, ucfPxOtherIdContinue, ucfPxOtherIdStart, ucfPxOtherLowercase, ucfPxOtherMath, ucfPxOtherUppercase, ucfPxRadical, and ucfPxUnifiedIdeograph.

Referenced by LoadTxt().

 {
         TUcdFileReader reader; TStrV fields; int nCps = 0, nLines = 0;
         reader.Open(CombinePath(basePath, GetPropListFn()));
         TSubcatHelper helper(*this);
         while (reader.GetNextLine(fields))
         {
                 IAssert(fields.Len() == 2);
                 int from, to; reader.ParseCodePointRange(fields[0], from, to);
                 TStr s = fields[1];
                 TUniChProperties prop = TUniChProperties(0); TUniChPropertiesX propx = TUniChPropertiesX(0);
                 if (s == "White_Space") prop = ucfPrWhiteSpace;
                 else if (s == "Bidi_Control") prop = ucfPrBidiControl;
                 else if (s == "Join_Control") prop = ucfPrJoinControl;
                 else if (s == "Dash") prop = ucfPrDash;
                 else if (s == "Hyphen") prop = ucfPrHyphen;
                 else if (s == "Quotation_Mark") prop = ucfPrQuotationMark;
                 else if (s == "Terminal_Punctuation") prop = ucfPrTerminalPunctuation;
                 else if (s == "Other_Math") propx = ucfPxOtherMath;
                 else if (s == "Hex_Digit") prop = ucfPrHexDigit;
                 else if (s == "ASCII_Hex_Digit") prop = ucfPrAsciiHexDigit;
                 else if (s == "Other_Alphabetic") propx = ucfPxOtherAlphabetic;
                 else if (s == "Ideographic") prop = ucfPrIdeographic;
                 else if (s == "Diacritic") prop = ucfPrDiacritic;
                 else if (s == "Extender") prop = ucfPrExtender;
                 else if (s == "Other_Lowercase") propx = ucfPxOtherLowercase;
                 else if (s == "Other_Uppercase") propx = ucfPxOtherUppercase;
                 else if (s == "Noncharacter_Code_Point") prop = ucfPrNoncharacterCodePoint;
                 else if (s == "Other_Grapheme_Extend") propx = ucfPxOtherGraphemeExtend;
                 else if (s == "IDS_Binary_Operator") propx = ucfPxIdsBinaryOperator;
                 else if (s == "IDS_Trinary_Operator") propx = ucfPxIdsTrinaryOperator;
                 else if (s == "Radical") propx = ucfPxRadical;
                 else if (s == "Unified_Ideograph") propx = ucfPxUnifiedIdeograph;
                 else if (s == "Other_Default_Ignorable_Code_Point") propx = ucfPxOtherDefaultIgnorableCodePoint;
                 else if (s == "Deprecated") prop = ucfPrDeprecated;
                 else if (s == "Soft_Dotted") prop = ucfPrSoftDotted;
                 else if (s == "Logical_Order_Exception") prop = ucfPrLogicalOrderException;
                 else if (s == "Other_ID_Start") propx = ucfPxOtherIdStart;
                 else if (s == "Other_ID_Continue") propx = ucfPxOtherIdContinue;
                 else if (s == "STerm") prop = ucfPrSTerm;
                 else if (s == "Variation_Selector") prop = ucfPrVariationSelector;
                 else if (s == "Pattern_White_Space") prop = ucfPrPatternWhiteSpace;
                 else if (s == "Pattern_Syntax") prop = ucfPrPatternSyntax;
                 else FailR(s.CStr());
                 helper.ProcessComment(reader);
                 for (int cp = from; cp <= to; cp++) {
                         int i = h.GetKeyId(cp); if (i < 0) { i = h.AddKey(cp); helper.SetCat(cp); }
                         TUniChInfo &ci = h[i]; helper.TestCat(cp);
                         if (prop) { IAssert(! ci.IsProperty(prop)); ci.SetProperty(prop); }
                         if (propx) { IAssert(! ci.IsPropertyX(propx)); ci.SetPropertyX(propx); }
                         nCps++; }
                 nLines++;
         }
         reader.Close();
         printf("TUniChDb::InitPropList: %d lines, %d code points.\n", nLines, nCps);
 }

Here is the call graph for this function:

Here is the caller graph for this function:

void TUniChDb::InitScripts ( const TStr & basePath )

protected

Definition at line 1073 of file unicode.cpp.

References THash< TKey, TDat, THashFunc >::AddDat(), THash< TKey, TDat, THashFunc >::AddKey(), AlwaysFalse(), TUniChDb::TUcdFileReader::Close(), anonymous_namespace{unicode.cpp}::CombinePath(), TStr::CStr(), THash< TKey, TDat, THashFunc >::FFirstKeyId(), THash< TKey, TDat, THashFunc >::FNextKeyId(), THash< TKey, TDat, THashFunc >::GetKey(), THash< TKey, TDat, THashFunc >::GetKeyId(), TUniChDb::TUcdFileReader::GetNextLine(), GetScriptNameUnknown(), GetScriptsFn(), h, IAssert, THash< TKey, TDat, THashFunc >::Len(), TUniChDb::TUcdFileReader::Open(), TUniChDb::TUcdFileReader::ParseCodePointRange(), TUniChDb::TSubcatHelper::ProcessComment(), TUniChInfo::script, scripts, TUniChDb::TSubcatHelper::SetCat(), and TUniChDb::TSubcatHelper::TestCat().

Referenced by LoadTxt().

 {
         TUcdFileReader reader; TStrV fields;
         reader.Open(CombinePath(basePath, GetScriptsFn()));
         TSubcatHelper helper(*this);
         while (reader.GetNextLine(fields))
         {
                 int from, to; reader.ParseCodePointRange(fields[0], from, to);
                 TStr scriptName = fields[1];
                 int scriptNo = scripts.GetKeyId(scriptName);
                 if (scriptNo < 0) { scriptNo = scripts.AddKey(scriptName); scripts[scriptNo] = 0; }
                 IAssert(scriptNo >= 0 && scriptNo < SCHAR_MAX); // because TUniChInfo.script is a signed char
                 scripts[scriptNo] += 1;
                 helper.ProcessComment(reader);
                 for (int cp = from; cp <= to; cp++) {
                         int i = h.GetKeyId(cp); if (i < 0) { i = h.AddKey(cp); helper.SetCat(cp); }
                         helper.TestCat(cp);
                         TUniChInfo &ci = h[i]; ci.script = scriptNo; }
         }
         reader.Close();
         scripts.AddDat(GetScriptNameUnknown()) = 0;
         printf("TUniChDb::InitScripts: %d scripts: ", scripts.Len());
         if (AlwaysFalse()) for (int i = scripts.FFirstKeyId(); scripts.FNextKeyId(i); )
                 printf("  %d:%s (%d)", i, scripts.GetKey(i).CStr(), int(scripts[i]));
         printf("\n");
 }

Here is the call graph for this function:

Here is the caller graph for this function:

void TUniChDb::InitSpecialCasing ( const TStr & basePath )

protected

Definition at line 1225 of file unicode.cpp.

References THash< TKey, TDat, THashFunc >::AddDat(), TUniChDb::TUcdFileReader::Close(), anonymous_namespace{unicode.cpp}::CombinePath(), TStr::Empty(), TUniChDb::TUcdFileReader::GetNextLine(), GetSpecialCasingFn(), IAssert, TVec< TVal, TSizeTy >::Last(), TVec< TVal, TSizeTy >::Len(), TUniChDb::TUcdFileReader::Open(), TUniChDb::TUcdFileReader::ParseCodePoint(), TUniChDb::TUcdFileReader::ParseCodePointList(), specialCasingLower, specialCasingTitle, and specialCasingUpper.

Referenced by LoadTxt().

 {
         TUcdFileReader reader; TStrV fields;
         reader.Open(CombinePath(basePath, GetSpecialCasingFn()));
         while (reader.GetNextLine(fields))
         {
                 IAssert(fields.Len() == 5 || fields.Len() == 6);
                 IAssert(fields.Last().Empty());
                 // Skip conditional mappings -- they will be hardcoded in the GetCaseConverted method.
                 TStr conditions = "";
                 if (fields.Len() == 6) conditions = fields[4];
                 conditions.ToTrunc(); if (! conditions.Empty()) continue;
                 // Keep the other mappings.
                 const int cp = reader.ParseCodePoint(fields[0]);
                 TIntV v; reader.ParseCodePointList(fields[1], v);
                 specialCasingLower.AddDat(cp, v);
                 reader.ParseCodePointList(fields[2], v);
                 specialCasingTitle.AddDat(cp, v);
                 reader.ParseCodePointList(fields[3], v);
                 specialCasingUpper.AddDat(cp, v);
         }
         reader.Close();
 }

Here is the call graph for this function:

Here is the caller graph for this function:

void TUniChDb::InitWordAndSentenceBoundaryFlags ( const TStr & basePath )

protected

Definition at line 1100 of file unicode.cpp.

References TVec< TVal, TSizeTy >::Add(), THash< TKey, TDat, THashFunc >::AddDat(), TUniChDb::TUcdFileReader::Close(), TUniChInfo::ClrWbAndSbFlags(), anonymous_namespace{unicode.cpp}::CombinePath(), TStr::CStr(), Fail, FailR, THash< TKey, TDat, THashFunc >::FFirstKeyId(), THash< TKey, TDat, THashFunc >::FNextKeyId(), GetAuxiliaryDir(), THash< TKey, TDat, THashFunc >::GetDat(), THash< TKey, TDat, THashFunc >::GetKey(), THash< TKey, TDat, THashFunc >::GetKeyId(), TUniChDb::TUcdFileReader::GetNextLine(), TUniChInfo::GetSbFlags(), TUniChInfo::GetSbFlagsStr(), GetScriptByName(), GetScriptNameHiragana(), GetScriptNameKatakana(), GetSentenceBreakPropertyFn(), TUniChInfo::GetWbFlags(), GetWordBreakPropertyFn(), h, IAssert, TUniChInfo::IsAlphabetic(), TUniChInfo::IsGraphemeExtend(), TUniChInfo::IsIdeographic(), THash< TKey, TDat, THashFunc >::IsKey(), TUniChInfo::IsLowercase(), TUniChInfo::IsSbFlag(), TUniChInfo::IsSTerminal(), TUniChInfo::IsUppercase(), TUniChInfo::IsWbFlag(), TUniChInfo::IsWhiteSpace(), TVec< TVal, TSizeTy >::Len(), TUniChInfo::lineBreak, TUniChInfo::LineBreak_ComplexContext, TUniChInfo::LineBreak_InfixNumeric, TUniChInfo::LineBreak_Numeric, TUniChInfo::LineBreak_Quotation, TVec< TVal, TSizeTy >::Merge(), TUniChDb::TUcdFileReader::Open(), TUniChDb::TUcdFileReader::ParseCodePointRange(), TUniChInfo::script, TUniChInfo::SetSbFlag(), TUniChInfo::SetWbFlag(), TVec< TVal, TSizeTy >::Sort(), TUniChInfo::subCat, ucfCompatibilityDecomposition, ucfSbATerm, ucfSbClose, ucfSbFormat, ucfSbLower, ucfSbNumeric, ucfSbOLetter, ucfSbSep, ucfSbSp, ucfSbSTerm, ucfSbUpper, ucfWbALetter, ucfWbExtendNumLet, ucfWbFormat, ucfWbKatakana, ucfWbMidLetter, ucfWbMidNum, ucfWbNumeric, and anonymous_namespace{unicode.cpp}::VB.

Referenced by LoadTxt().

 {
         // Recomputes the word-boundary (ucfWb*) and sentence-boundary (ucfSb*) flags of every
         // code point in h from data that is already loaded (subcategory, script, line-break
         // class and properties), and then cross-checks the result against
         // WordBreakProperty.txt and SentenceBreakProperty.txt from the auxiliary directory
         // under basePath.  Any disagreement is printed and ends in Fail.
         // UAX #29, sec. 4.1 and 5.1.
         // Note: these flags can also be initialized from auxiliary\\WordBreakProperty.txt.
         int katakana = GetScriptByName(GetScriptNameKatakana()); IAssert(katakana >= 0);
         int hiragana = GetScriptByName(GetScriptNameHiragana()); IAssert(hiragana >= 0);
         // Clear any existing word-boundary flags and initialize them again.
         for (int i = h.FFirstKeyId(); h.FNextKeyId(i); )
         {
                 const int cp = h.GetKey(i); TUniChInfo& ci = h[i];
                 ci.ClrWbAndSbFlags();
                 // Word-boundary flags.
                 if (ci.subCat  == ucOtherFormat && cp != 0x200c && cp != 0x200d) ci.SetWbFlag(ucfWbFormat);
                 if (ci.script == katakana) ci.SetWbFlag(ucfWbKatakana);
                 if (ci.lineBreak == TUniChInfo::LineBreak_InfixNumeric && cp != 0x3a) ci.SetWbFlag(ucfWbMidNum);
                 if (ci.lineBreak == TUniChInfo::LineBreak_Numeric) ci.SetWbFlag(ucfWbNumeric);
                 if (ci.subCat == ucPunctuationConnector) ci.SetWbFlag(ucfWbExtendNumLet);
                 // Sentence-boundary flags.  Some are identical to some word-boundary flags.
                 if (cp == 0xa || cp == 0xd || cp == 0x85 || cp == 0x2028 || cp == 0x2029) ci.SetSbFlag(ucfSbSep);
                 if (ci.subCat == ucOtherFormat && cp != 0x200c && cp != 0x200d) ci.SetSbFlag(ucfSbFormat);
                 if (ci.IsWhiteSpace() && ! ci.IsSbFlag(ucfSbSep) && cp != 0xa0) ci.SetSbFlag(ucfSbSp);
                 if (ci.IsLowercase() && ! ci.IsGraphemeExtend()) ci.SetSbFlag(ucfSbLower);
                 if (ci.IsUppercase() || ci.subCat == ucLetterTitlecase) ci.SetSbFlag(ucfSbUpper);
                 // OLetter must be tested after Lower and Upper, because it excludes both of them.
                 if ((ci.IsAlphabetic() || cp == 0xa0 || cp == 0x5f3) && ! ci.IsSbFlag(ucfSbLower) && ! ci.IsSbFlag(ucfSbUpper) && ! ci.IsGraphemeExtend()) ci.SetSbFlag(ucfSbOLetter);
                 if (ci.lineBreak == TUniChInfo::LineBreak_Numeric) ci.SetSbFlag(ucfSbNumeric);
                 if (cp == 0x2e) ci.SetSbFlag(ucfSbATerm);
                 // Note: UAX #29 says that if the property STerm = true, then the character should belong to the STerm class for
                 // the purposes of sentence-boundary detection.  Now in PropList.txt there is no doubt that 002E has the STerm
                 // property; thus, it should also belong to the STerm sentence-boundary class.  However, in
                 // SentenceBreakProperty.txt, 002E is only listed in the ATerm class, but not in the STerm class.
                 if (ci.IsSTerminal() && cp != 0x2e) ci.SetSbFlag(ucfSbSTerm);
                 // Close must be tested after ATerm and STerm, because it excludes both of them.
                 if ((ci.subCat == ucPunctuationOpen || ci.subCat == ucPunctuationClose || ci.lineBreak == TUniChInfo::LineBreak_Quotation) && cp != 0x5f3 && ! ci.IsSbFlag(ucfSbATerm) && ! ci.IsSbFlag(ucfSbSTerm)) ci.SetSbFlag(ucfSbClose);
         }
         // Some additional characters for Katakana and MidLetter.
         // NOTE(review): VB comes from the anonymous namespace in unicode.cpp and is presumably
         // a helper for building a vector from a comma-separated list -- confirm there.
         TIntV v = (VB, 0x3031, 0x3032, 0x3033, 0x3034, 0x3035, 0x309b, 0x309c, 0x30a0, 0x30fc, 0xff70, 0xff9e, 0xff9f);
         for (int i = 0; i < v.Len(); i++) h.GetDat(v[i]).SetWbFlag(ucfWbKatakana);
         v = (VB, 0x27, 0xb7, 0x5f4, 0x2019, 0x2027, 0x3a);
         for (int i = 0; i < v.Len(); i++) h.GetDat(v[i]).SetWbFlag(ucfWbMidLetter);
         // WbALetter depends on Katakana, so it cannot be initialized earlier.
         for (int i = h.FFirstKeyId(); h.FNextKeyId(i); )
         {
                 const int cp = h.GetKey(i); TUniChInfo& ci = h[i];
                 if ((ci.IsAlphabetic() || cp == 0x5f3) && ! ci.IsIdeographic() && ! ci.IsWbFlag(ucfWbKatakana) && ci.lineBreak != TUniChInfo::LineBreak_ComplexContext && ci.script != hiragana && ! ci.IsGraphemeExtend())
                         ci.SetWbFlag(ucfWbALetter);
         }
         // An alternative is to extract the flags from WordBreakProperty.txt.
         // The results should be the same.
         {TUcdFileReader reader; TStrV fields;
         reader.Open(CombinePath(CombinePath(basePath, GetAuxiliaryDir()), GetWordBreakPropertyFn()));
         // hh maps each code point listed in the file to the OR of the flags listed for it.
         THash<TInt, TInt> hh;
         while (reader.GetNextLine(fields))
         {
                 IAssert(fields.Len() == 2);
                 int from, to; reader.ParseCodePointRange(fields[0], from, to);
                 TStr s = fields[1];
                 // Placeholder initial value; every recognized name overwrites it, and any
                 // other name goes to FailR.
                 TUniChFlags flag = ucfCompatibilityDecomposition;
                 if (s == "Format") flag = ucfWbFormat;
                 else if (s == "Katakana") flag = ucfWbKatakana;
                 else if (s == "ALetter") flag = ucfWbALetter;
                 else if (s == "MidLetter") flag = ucfWbMidLetter;
                 else if (s == "MidNum") flag = ucfWbMidNum;
                 else if (s == "Numeric") flag = ucfWbNumeric;
                 else if (s == "ExtendNumLet") flag = ucfWbExtendNumLet;
                 else FailR(s.CStr());
                 for (int c = from; c <= to; c++) {
                         int i = hh.GetKeyId(c); if (i < 0) hh.AddDat(c, flag);
                         else hh[i].Val |= flag; }
         }
         reader.Close();
         // Compare over the union of the code points known to h and those listed in the file.
         TIntV cps; for (int i = h.FFirstKeyId(); h.FNextKeyId(i); ) cps.Add(h.GetKey(i));
         for (int i = hh.FFirstKeyId(); hh.FNextKeyId(i); ) cps.Add(hh.GetKey(i));
         cps.Sort(); cps.Merge();
         for (int i = 0; i < cps.Len(); i++)
         {
                 int cp = cps[i];
                 int flags1 = 0; if (h.IsKey(cp)) flags1 = h.GetDat(cp).GetWbFlags();
                 int flags2 = 0; if (hh.IsKey(cp)) flags2 = hh.GetDat(cp);
                 // The ucfSbSep bit is masked out on both sides before the comparison.
                 flags1 &= ~ucfSbSep; flags2 &= ~ucfSbSep;
                 if (flags1 != flags2) {
                         printf("cp = %04x: flags1 = %08x flags2 = %08x xor = %08x\n", cp, flags1, flags2, flags1 ^ flags2);
                         Fail; }
         }}
         // Likewise, for sentence boundary flags we have SentenceBreakProperty.txt.
         {TUcdFileReader reader; TStrV fields;
         reader.Open(CombinePath(CombinePath(basePath, GetAuxiliaryDir()), GetSentenceBreakPropertyFn()));
         // hh maps each code point listed in the file to the OR of the flags listed for it.
         THash<TInt, TInt> hh;
         while (reader.GetNextLine(fields))
         {
                 IAssert(fields.Len() == 2);
                 int from, to; reader.ParseCodePointRange(fields[0], from, to);
                 TStr s = fields[1];
                 // Placeholder initial value; every recognized name overwrites it, and any
                 // other name goes to FailR.
                 TUniChFlags flag = ucfCompatibilityDecomposition;
                 if (s == "Sep") flag = ucfSbSep;
                 else if (s == "Format") flag = ucfSbFormat;
                 else if (s == "Sp") flag = ucfSbSp;
                 else if (s == "Lower") flag = ucfSbLower;
                 else if (s == "Upper") flag = ucfSbUpper;
                 else if (s == "OLetter") flag = ucfSbOLetter;
                 else if (s == "Numeric") flag = ucfSbNumeric;
                 else if (s == "ATerm") flag = ucfSbATerm;
                 else if (s == "STerm") flag = ucfSbSTerm;
                 else if (s == "Close") flag = ucfSbClose;
                 else FailR(s.CStr());
                 for (int c = from; c <= to; c++) {
                         int i = hh.GetKeyId(c); if (i < 0) hh.AddDat(c, flag);
                         else hh[i].Val |= flag; }
         }
         reader.Close();
         // Compare over the union of the code points known to h and those listed in the file.
         TIntV cps; for (int i = h.FFirstKeyId(); h.FNextKeyId(i); ) cps.Add(h.GetKey(i));
         for (int i = hh.FFirstKeyId(); hh.FNextKeyId(i); ) cps.Add(hh.GetKey(i));
         cps.Sort(); cps.Merge();
         for (int i = 0; i < cps.Len(); i++)
         {
                 int cp = cps[i];
                 int flags1 = 0; if (h.IsKey(cp)) flags1 = h.GetDat(cp).GetSbFlags();
                 int flags2 = 0; if (hh.IsKey(cp)) flags2 = hh.GetDat(cp);
                 if (flags1 != flags2) {
                         printf("cp = %04x: flags1 = %08x [%s] flags2 = %08x [%s] xor = %08x\n", cp,
                                 flags1, TUniChInfo::GetSbFlagsStr(flags1).CStr(),
                                 flags2, TUniChInfo::GetSbFlagsStr(flags2).CStr(),
                                 flags1 ^ flags2);
                         Fail; }
         }}
 }

Here is the call graph for this function:

Here is the caller graph for this function:

bool TUniChDb::IsGetChInfo	(	const int	cp,
		TUniChInfo &	ChInfo
	)

inline

Definition at line 1350 of file unicode.h.

References THash< TKey, TDat, THashFunc >::GetKeyId(), and h.

                                                            {
                 int i = h.GetKeyId(cp);
                 if (i < 0) return false; else { ChInfo=h[i]; return true; }}

Here is the call graph for this function:

DECLARE_FORWARDED_PROPERTY_METHODS bool TUniChDb::IsPrivateUse ( const int cp ) const

inline

Definition at line 1383 of file unicode.h.

References THash< TKey, TDat, THashFunc >::GetKeyId(), and h.

                                               {
                 int i = h.GetKeyId(cp); if (i >= 0) return h[i].IsPrivateUse();
                 return (0xe000 <= cp && cp <= 0xf8ff) ||  // plane 0 private-use area
                         // Planes 15 and 16 are entirely for private use.
                         (0xf0000 <= cp && cp <= 0xffffd) || (0x100000 <= cp && cp <= 0x10fffd); }

Here is the call graph for this function:

bool TUniChDb::IsSbFlag	(	const int	cp,
		const TUniChFlags	flag
	)		const

inline

Definition at line 1358 of file unicode.h.

References THash< TKey, TDat, THashFunc >::GetKeyId(), and h.

1358 { const int keyId = h.GetKeyId(cp); return (keyId >= 0) && h[keyId].IsSbFlag(flag); } // false when cp is not a key of h

THash< TInt, TUniChInfo > h

Definition: unicode.h:1263

int GetKeyId(const TKey &Key) const

Definition: hash.h:420

Here is the call graph for this function:

bool TUniChDb::IsSurrogate ( const int cp ) const

inline

Definition at line 1392 of file unicode.h.

References THash< TKey, TDat, THashFunc >::GetKeyId(), and h.

                                              {
                 int i = h.GetKeyId(cp); if (i >= 0) return h[i].IsSurrogate();
                 return 0xd800 <= cp && cp <= 0xdcff; }

Here is the call graph for this function:

bool TUniChDb::IsWbFlag	(	const int	cp,
		const TUniChFlags	flag
	)		const

inline

Definition at line 1356 of file unicode.h.

References THash< TKey, TDat, THashFunc >::GetKeyId(), and h.

1356 { const int keyId = h.GetKeyId(cp); return (keyId >= 0) && h[keyId].IsWbFlag(flag); } // false when cp is not a key of h

THash< TInt, TUniChInfo > h

Definition: unicode.h:1263

int GetKeyId(const TKey &Key) const

Definition: hash.h:420

Here is the call graph for this function:

static bool TUniChDb::IsWbIgnored ( const TUniChInfo & ci )

inline, static, protected

Definition at line 1419 of file unicode.h.

References TUniChInfo::IsGbExtend(), and TUniChInfo::IsWbFormat().

Referenced by FindNextSentenceBoundary(), FindNextWordBoundary(), TestFindNextWordOrSentenceBoundary(), TestWbFindNonIgnored(), WbFindCurOrNextNonIgnored(), WbFindNextNonIgnored(), WbFindNextNonIgnoredS(), and WbFindPrevNonIgnored().

1419 { if (ci.IsGbExtend()) return true; return ci.IsWbFormat(); } // ignored when either property holds

TUniChInfo::IsGbExtend

bool IsGbExtend() const

Definition: unicode.h:1139

TUniChInfo::IsWbFormat

bool IsWbFormat() const

Definition: unicode.h:1119

Here is the call graph for this function:

Here is the caller graph for this function:

bool TUniChDb::IsWbIgnored ( const int cp ) const

inline, protected

Definition at line 1420 of file unicode.h.

References THash< TKey, TDat, THashFunc >::GetKeyId(), h, and IsWbIgnored().

Referenced by IsWbIgnored().

1420 { const int keyId = h.GetKeyId(cp); return (keyId >= 0) && IsWbIgnored(h[keyId]); } // code points missing from h are never ignored

THash< TInt, TUniChInfo > h

Definition: unicode.h:1263

TUniChDb::IsWbIgnored

static bool IsWbIgnored(const TUniChInfo &ci)

Definition: unicode.h:1419

References TVec< TVal, TSizeTy >::Add(), THash< TKey, TDat, THashFunc >::AddKey(), TUniChDb::TUcdFileReader::Close(), anonymous_namespace{unicode.cpp}::CombinePath(), THash< TKey, TDat, THashFunc >::FFirstKeyId(), THash< TKey, TDat, THashFunc >::FNextKeyId(), THash< TKey, TDat, THashFunc >::GetKey(), TUniChDb::TUcdFileReader::GetNextLine(), GetNormalizationTestFn(), h, IAssert, THash< TKey, TDat, THashFunc >::IsKey(), TVec< TVal, TSizeTy >::Len(), NFC_, NFD_, NFKC_, NFKD_, TUniChDb::TUcdFileReader::Open(), and TUniChDb::TUcdFileReader::ParseCodePointList().

int GetKeyId(const TKey &Key) const

Definition: hash.h:420

Here is the call graph for this function:

Here is the caller graph for this function:

void TUniChDb::Load ( TSIn & SIn )

inline

Definition at line 1285 of file unicode.h.

References caseFolding, charNames, decompositions, h, InitAfterLoad(), inverseDec, THash< TKey, TDat, THashFunc >::Load(), TUniCaseFolding::Load(), TVec< TVal, TSizeTy >::Load(), TSIn::LoadCs(), scripts, specialCasingLower, specialCasingTitle, specialCasingUpper, and TStrPool::~TStrPool().

Referenced by LoadBin(), Test(), and TUniChDb().

                              {
                 h.Load(SIn); charNames.~TStrPool(); new (&charNames) TStrPool(SIn);
                 decompositions.Load(SIn);
                 inverseDec.Load(SIn); caseFolding.Load(SIn); scripts.Load(SIn);
                 specialCasingLower.Load(SIn); specialCasingUpper.Load(SIn); specialCasingTitle.Load(SIn);
                 SIn.LoadCs(); InitAfterLoad(); }

Here is the call graph for this function:

Here is the caller graph for this function:

void TUniChDb::LoadBin ( const TStr & fnBin )

inline

Definition at line 1291 of file unicode.h.

References Load(), and TFIn::New().

Referenced by TUnicode::TUnicode().

1291 { // Open fnBin as an input stream and load the database from it.

1292 PSIn binIn = TFIn::New(fnBin); Load(*binIn); }

TFIn::New

static PSIn New(const TStr &FNm)

Definition: fl.cpp:290

TUniChDb::Load

void Load(TSIn &SIn)

Definition: unicode.h:1285

TPt

Definition: bd.h:196

Here is the call graph for this function:

Here is the caller graph for this function:

void TUniChDb::LoadTxt ( const TStr & basePath )

Definition at line 1249 of file unicode.cpp.

Referenced by Test().

 {
         // Rebuilds the database from the text data files located under basePath.
         // Stages, in order: case folding, the main per-character file, scripts and property
         // files, composition exclusions, the inverse decomposition index, word/sentence
         // boundary flags, and finally default case mappings and scripts.
         Clr();
         // Set up a hash table with enough ports that there will be more or less no chains longer than 1 element.
         h = THash<TInt, TUniChInfo>(196613, true);
         //
         caseFolding.LoadTxt(CombinePath(basePath, GetCaseFoldingFn()));
         //
         TUcdFileReader reader; TStrV fields; TIntH seen;
         reader.Open(CombinePath(basePath, GetUnicodeDataFn()));
         while (reader.GetNextLine(fields))
         {
                 // Codepoint.
                 int cp = reader.ParseCodePoint(fields[0]);
                 IAssert(! seen.IsKey(cp)); seen.AddKey(cp);
                 TUniChInfo& ci = h.AddDat(cp);
                 // Name.
                 ci.nameOffset = charNames.AddStr(fields[1]);
                 // Category.
                 // Note: 's' is a reference bound to fields[2]; every later 's = fields[k]' therefore
                 // copies fields[k] into fields[2] rather than rebinding the reference.
                 TStr& s = fields[2]; IAssert(s.Len() == 2);
                 ci.chCat = s[0]; ci.chSubCat = s[1];
                 // Canonical combining class.
                 s = fields[3]; IAssert(s.Len() > 0);
                 int i; bool ok = s.IsInt(true, TUCh::Mn, TUCh::Mx, i); IAssertR(ok, s);
                 ci.combClass = (uchar) i;
                 // Decomposition type and mapping.
                 LoadTxt_ProcessDecomposition(ci, fields[5]);
                 // Simple case mappings.
                 // An empty field is stored as -1; the final loop of this function replaces it with the identity mapping.
                 s = fields[12]; ci.simpleUpperCaseMapping = (! s.Empty() ? reader.ParseCodePoint(s) : -1);
                 s = fields[13]; ci.simpleLowerCaseMapping = (! s.Empty() ? reader.ParseCodePoint(s) : -1);
                 s = fields[14]; ci.simpleTitleCaseMapping = (! s.Empty() ? reader.ParseCodePoint(s) : -1);
                 //
                 ci.InitAfterLoad(); // initializes ci.cat, ci.subCat
         }
         reader.Close();
         //
         InitScripts(basePath);
         //
         InitPropList(basePath);
         InitDerivedCoreProperties(basePath);
         InitLineBreaks(basePath);
         InitSpecialCasing(basePath);
         // Process the composition exclusions (UAX #15, sec. 6).
         for (int i = h.FFirstKeyId(); h.FNextKeyId(i); )
         {
                 TUniChInfo& ci = h[i];
                 int ofs = ci.decompOffset; if (ofs < 0) continue;
                 // Each decomposition is stored as a run of code points terminated by a negative value.
                 int n = 0; while (decompositions[ofs + n] >= 0) n++;
                 IAssert(n > 0);
                 // Singleton decompositions.
                 if (n == 1) { ci.flags |= ucfCompositionExclusion; continue; }
                 // Non-starter decompositions.
                 int cp1 = decompositions[ofs];
                 IAssert(h.IsKey(cp1));
                 uchar ccc = h.GetDat(cp1).combClass;
                 if (ccc != TUniChInfo::ccStarter) { ci.flags |= ucfCompositionExclusion; continue; }
         }
         // Process the composition exclusion table.
         reader.Open(CombinePath(basePath, GetCompositionExclusionsFn()));
         int nExclusionTable = 0;
         while (reader.GetNextLine(fields))
         {
                 IAssert(fields.Len() == 1);
                 int cp = reader.ParseCodePoint(fields[0]);
                 int i = h.GetKeyId(cp); IAssert(i >= 0);
                 h[i].flags |= ucfCompositionExclusion;
                 nExclusionTable++;
         }
         reader.Close();
         // Prepare the inverted index for composition pairs.
         // Only non-excluded, non-compatibility decompositions of exactly two code points are indexed.
         for (int i = h.FFirstKeyId(); h.FNextKeyId(i); )
         {
                 int cp = h.GetKey(i);
                 TUniChInfo& ci = h[i];
                 int ofs = ci.decompOffset; if (ofs < 0) continue;
                 if (ci.IsCompositionExclusion()) continue;
                 if (ci.IsCompatibilityDecomposition()) continue;
                 int n = 0; while (decompositions[ofs + n] >= 0) n++;
                 if (n != 2) continue;
                 TIntPr pr = TIntPr(decompositions[ofs], decompositions[ofs + 1]);
                 IAssert(! inverseDec.IsKey(pr));
                 IAssert(ci.combClass == TUniChInfo::ccStarter);
                 inverseDec.AddDat(pr, cp);
         }
         printf("TUniChDb(%s): %d chars in h, %d in decomp inverse index; %d in decomp vector; %d in exclusion table\n",
                 basePath.CStr(), h.Len(), inverseDec.Len(), decompositions.Len(), nExclusionTable);
         // Before calling InitWordAndSentenceBoundaryFlags(), scripts must have been initialized, as well as
         // flags such as Alphabetic, Word_Break, and Grapheme_Extend.
         InitWordAndSentenceBoundaryFlags(basePath); // Note: scripts must have been initialized by this point.
         // Make sure that Hangul combined characters are treated as starters.
         for (int cp = HangulSBase; cp < HangulSBase + HangulSCount; cp++)
         {
                 int j = h.GetKeyId(cp); if (j < 0) continue;
                 TUniChInfo& ci = h[j];
                 if (ci.combClass == TUniChInfo::ccInvalid) ci.combClass = TUniChInfo::ccStarter;
                 IAssert(ci.combClass == TUniChInfo::ccStarter);
         }
         // There should be no more additions to 'h' beyond this point.
         const int oldHLen = h.Len();
         // Provide default (identity) case mappings if any were missing from UnicodeData.txt
         // (or if any entirely new characters were added later, e.g. while reading LineBreaks.txt).
         // NOTE(review): this local has the same name as the member 'scriptUnknown' and hides it here;
         // the member itself is not assigned in this function -- confirm that this is intended.
         int scriptUnknown = GetScriptByName(GetScriptNameUnknown());
         for (int i = h.FFirstKeyId(); h.FNextKeyId(i); )
         {
                 int cp = h.GetKey(i); TUniChInfo &ci = h[i];
                 if (ci.simpleLowerCaseMapping < 0) ci.simpleLowerCaseMapping = cp;
                 if (ci.simpleUpperCaseMapping < 0) ci.simpleUpperCaseMapping = cp;
                 if (ci.simpleTitleCaseMapping < 0) ci.simpleTitleCaseMapping = cp;
                 if (ci.script < 0) ci.script = scriptUnknown;
         }
         IAssert(h.Len() == oldHLen);
 }

Here is the call graph for this function:

Here is the caller graph for this function:

void TUniChDb::LoadTxt_ProcessDecomposition	(	TUniChInfo &	ci,
		TStr	s
	)

protected

Definition at line 937 of file unicode.cpp.

References TVec< TVal, TSizeTy >::Add(), TVec< TVal, TSizeTy >::AddV(), TUniChInfo::decompOffset, decompositions, TStr::Empty(), TUniChInfo::flags, TStr::GetSubStr(), IAssert, TStr::Len(), TVec< TVal, TSizeTy >::Len(), TUniChDb::TUcdFileReader::ParseCodePointList(), TStr::SearchCh(), TStr::ToTrunc(), and ucfCompatibilityDecomposition.

Referenced by LoadTxt().

 {
         if (s.Empty()) return;
         if (s[0] == '<') {
                 int i = s.SearchCh('>'); IAssert(i > 0);
                 ci.flags |= ucfCompatibilityDecomposition;
                 s = s.GetSubStr(i + 1, s.Len() - 1); s.ToTrunc(); }
         TIntV dec; TUcdFileReader::ParseCodePointList(s, dec);
         IAssert(dec.Len() > 0);
         ci.decompOffset = decompositions.Len();
         decompositions.AddV(dec); decompositions.Add(-1);
 }

Here is the call graph for this function:

Here is the caller graph for this function:

template<class TSrcVec >

void TUniChDb::PrintCharNames	(	FILE *	f,
		const TSrcVec &	src,
		size_t	srcIdx,
		const size_t	srcCount,
		const TStr &	prefix
	)		const

inline

Definition at line 1336 of file unicode.h.

References TStr::CStr(), and GetCharNameS().

                                                                                                                                                  {
                 if (! f) f = stdout;
                 for (const size_t srcEnd = srcIdx + srcCount; srcIdx < srcEnd; srcIdx++) {
                         fprintf(f, "%s", prefix.CStr());
                         int cp = src[TVecIdx(srcIdx)]; fprintf(f, (cp >= 0x10000 ? "U+%05x" : "U+%04x "), cp);
                         fprintf(f, " %s\n", GetCharNameS(cp).CStr()); }}

Here is the call graph for this function:

template<class TSrcVec >

void TUniChDb::PrintCharNames	(	FILE *	f,
		const TSrcVec &	src,
		const TStr &	prefix
	)		const

inline

Definition at line 1342 of file unicode.h.

References PrintCharNames().

Referenced by PrintCharNames().

1342 { PrintCharNames(f, src, 0, src.Len(), prefix); } // forwards to the five-argument overload, covering all of src

TUniChDb::PrintCharNames

void PrintCharNames(FILE *f, const TSrcVec &src, size_t srcIdx, const size_t srcCount, const TStr &prefix) const

Definition: unicode.h:1336

Here is the call graph for this function:

Here is the caller graph for this function:

void TUniChDb::Save ( TSOut & SOut ) const

inline

Definition at line 1280 of file unicode.h.

References caseFolding, charNames, decompositions, h, inverseDec, THash< TKey, TDat, THashFunc >::Save(), TUniCaseFolding::Save(), TVec< TVal, TSizeTy >::Save(), TStrPool::Save(), TSOut::SaveCs(), scripts, specialCasingLower, specialCasingTitle, and specialCasingUpper.

Referenced by SaveBin(), and Test().

                                      {
                 h.Save(SOut); charNames.Save(SOut); decompositions.Save(SOut);
                 inverseDec.Save(SOut); caseFolding.Save(SOut); scripts.Save(SOut);
                 specialCasingLower.Save(SOut); specialCasingUpper.Save(SOut); specialCasingTitle.Save(SOut);
                 SOut.SaveCs(); }

Here is the call graph for this function:

Here is the caller graph for this function:

void TUniChDb::SaveBin ( const TStr & fnBinUcd )

Definition at line 1362 of file unicode.cpp.

References TFOut::New(), and Save().

 {
         PSOut SOut=TFOut::New(fnBinUcd);
         Save(*SOut);
 }

Here is the call graph for this function:

template<class TSrcVec >

void TUniChDb::SbEx_Add ( const TSrcVec & v )

inline

Definition at line 1490 of file unicode.h.

References TUniTrie< TItem_ >::Add(), and sbExTrie.

Referenced by SbEx_Add(), SbEx_AddMulti(), and SbEx_AddUtf8().

1490 { sbExTrie.Add(v); } // adds the sequence v to the sentence-boundary exception trie

TUniChDb::sbExTrie

TUniTrie< TInt > sbExTrie

Definition: unicode.h:1461

TUniTrie::Add

void Add(const TSrcVec &src, const size_t srcIdx, const size_t srcCount)

Definition: unicode.h:1220

Here is the call graph for this function:

Here is the caller graph for this function:

void TUniChDb::SbEx_Add ( const TStr & s )

inline

Definition at line 1492 of file unicode.h.

References TVec< TVal, TSizeTy >::Gen(), TStr::Len(), and SbEx_Add().

1492 { // Each char of s is widened through uchar, so the stored values are in 0..255.

1493 const int len = s.Len(); TIntV codes; codes.Gen(len); for (int pos = 0; pos < len; pos++) codes[pos] = int(uchar(s[pos])); SbEx_Add(codes); }

TStr::Len

int Len() const

Definition: dt.h:487

uchar

unsigned char uchar

Definition: bd.h:10

TUniChDb::SbEx_Add

void SbEx_Add(const TSrcVec &v)

Definition: unicode.h:1490

TVec::Gen

void Gen(const TSizeTy &_Vals)

Constructs a vector (an array) of _Vals elements.

Definition: ds.h:486

TVec< TInt >

Here is the call graph for this function:

int TUniChDb::SbEx_AddMulti	(	const TStr &	words,
		const bool	wordsAreUtf8 = `true`
	)

inline

Definition at line 1495 of file unicode.h.

References TVec< TVal, TSizeTy >::Len(), SbEx_Add(), SbEx_AddUtf8(), and TStr::SplitOnAllCh().

Referenced by SbEx_SetStdEnglish().

                                                                              { TStrV vec; words.SplitOnAllCh('|', vec);
                 for (int i = 0; i < vec.Len(); i++) if (wordsAreUtf8) SbEx_AddUtf8(vec[i]); else SbEx_Add(vec[i]);
                 return vec.Len(); }

Here is the call graph for this function:

Here is the caller graph for this function:

void TUniChDb::SbEx_AddUtf8 ( const TStr & s )

inline

Definition at line 1494 of file unicode.h.

References TUniCodec::DecodeUtf8(), and SbEx_Add().

Referenced by SbEx_AddMulti().

1494 { TUniCodec utf8Codec; TIntV decoded; utf8Codec.DecodeUtf8(s, decoded); SbEx_Add(decoded); } // decode s as UTF-8, then add the decoded sequence

TUniCodec

Definition: unicode.h:54

TUniChDb::SbEx_Add

void SbEx_Add(const TSrcVec &v)

Definition: unicode.h:1490

TUniCodec::DecodeUtf8

size_t DecodeUtf8(const TSrcVec &src, size_t srcIdx, const size_t srcCount, TVec< TDestCh > &dest, const bool clrDest=true) const

Definition: unicode.h:2036

TVec< TInt >

Here is the call graph for this function:

Here is the caller graph for this function:

void TUniChDb::SbEx_Clr ( )

inline

Definition at line 1489 of file unicode.h.

References TUniTrie< TItem_ >::Clr(), and sbExTrie.

Referenced by TUnicode::ClrSentenceBoundaryExceptions(), and SbEx_SetStdEnglish().

1489 { sbExTrie.Clr(); } // clears the sentence-boundary exception trie

TUniTrie::Clr

void Clr()

Definition: unicode.h:1200

TUniChDb::sbExTrie

TUniTrie< TInt > sbExTrie

Definition: unicode.h:1461

Here is the call graph for this function:

Here is the caller graph for this function:

void TUniChDb::SbEx_Set ( const TUniTrie< TInt > & newTrie )

inline

Definition at line 1498 of file unicode.h.

References sbExTrie.

1498 { sbExTrie = newTrie; } // replaces the current exception trie with a copy of newTrie

TUniChDb::sbExTrie

TUniTrie< TInt > sbExTrie

Definition: unicode.h:1461

int TUniChDb::SbEx_SetStdEnglish ( )

inline

Definition at line 1499 of file unicode.h.

References SbEx_AddMulti(), and SbEx_Clr().

Referenced by TUnicode::UseEnglishSentenceBoundaryExceptions().

                                  {
                 static const TStr data = "Ms|Mrs|Mr|Rev|Dr|Prof|Gov|Sen|Rep|Gen|Brig|Col|Capt|Lieut|Lt|Sgt|Pvt|Cmdr|Adm|Corp|St|Mt|Ft|e.g|e. g.|i.e|i. e|ib|ibid|s.v|s. v|s.vv|s. vv";
                 SbEx_Clr(); return SbEx_AddMulti(data, false); }

Here is the call graph for this function:

Here is the caller graph for this function:

void TUniChDb::Test ( const TStr & basePath )

Definition at line 1377 of file unicode.cpp.

References caseFolding, anonymous_namespace{unicode.cpp}::CombinePath(), TFile::Exists(), GetBinFn(), Load(), LoadTxt(), TFIn::New(), TFOut::New(), Save(), TUniCaseFolding::Test(), TestCaseConversions(), TestComposition(), TestFindNextWordOrSentenceBoundary(), TestWbFindNonIgnored(), and TUniChDb().

 {
         // Self-test driver: builds the database from the text files under basePath, round-trips it
         // through the binary file, and then runs the individual test routines on the reloaded copy.
         TStr fnBin = CombinePath(basePath, GetBinFn());
         // The 'true ||' makes this branch unconditional, so the binary file is regenerated on every run.
         if (true || ! TFile::Exists(fnBin))
         {
                 // Test LoadTxt.
                 LoadTxt(basePath);
                 // Test Save.
                 {PSOut SOut = TFOut::New(fnBin);
                 Save(*SOut);}
         }
         // Test Load.
         // The object is destroyed and re-constructed in place so that Load starts from a default-constructed state.
         this->~TUniChDb();
         new(this) TUniChDb();
         {PSIn SIn = TFIn::New(fnBin);
         Load(*SIn);}
         // Test the case folding.
         caseFolding.Test();
         // Test the word breaking.
         TestWbFindNonIgnored();
         // Test the sentence breaking.
         // (The second argument selects sentence (true) or word (false) boundaries.)
         TestFindNextWordOrSentenceBoundary(basePath, true);
         TestFindNextWordOrSentenceBoundary(basePath, false);
         // Test composition and decomposition.
         TestComposition(basePath);
         // Test the case conversions.
         TestCaseConversions();
 }

Here is the call graph for this function:

void TUniChDb::TestCaseConversion	(	const TStr &	source,
		const TStr &	trueLc,
		const TStr &	trueTc,
		const TStr &	trueUc,
		bool	turkic,
		bool	lithuanian
	)

protected

Definition at line 825 of file unicode.cpp.

References ccLower, ccTitle, ccUpper, GetCaseConverted(), IAssert, TVec< TVal, TSizeTy >::Len(), and TUniChDb::TUcdFileReader::ParseCodePointList().

Referenced by TestCaseConversions().

 {
         TIntV src;
         TUcdFileReader::ParseCodePointList(source, src);
         FILE *f = stderr;
         for (int i = 0; i < 3; i++)
         {
                 TCaseConversion how = (i == 0) ? ccLower : (i == 1) ? ccTitle : ccUpper;
                 const TStr &trueDestS = (how == ccLower ? trueLc : how == ccTitle ? trueTc : trueUc);
                 TIntV trueDest; TUcdFileReader::ParseCodePointList(trueDestS, trueDest);
                 TIntV dest;
                 GetCaseConverted(src, 0, src.Len(), dest, true, how, turkic, lithuanian);
                 bool ok = (dest.Len() == trueDest.Len());
                 if (ok) for (int i = 0; i < dest.Len() && ok; i++) ok = ok && (dest[i] == trueDest[i]);
                 if (ok) continue;
                 fprintf(f, "%s(", (how == ccLower ? "toLowercase" : how == ccTitle ? "toTitlecase" : "toUppercase"));
                 for (int i = 0; i < src.Len(); i++) fprintf(f, "%s%04x", (i == 0 ? "" : " "), int(src[i]));
                 fprintf(f, ")\nCorrect:   (");
                 for (int i = 0; i < trueDest.Len(); i++) fprintf(f, "%s%04x", (i == 0 ? "" : " "), int(trueDest[i]));
                 fprintf(f, ")\nOur output:(");
                 for (int i = 0; i < dest.Len(); i++) fprintf(f, "%s%04x", (i == 0 ? "" : " "), int(dest[i]));
                 fprintf(f, ")\n");
                 IAssert(ok);
         }
 }

Here is the call graph for this function:

Here is the caller graph for this function:

void TUniChDb::TestCaseConversions ( )

protected

Definition at line 853 of file unicode.cpp.

References TestCaseConversion().

Referenced by Test().

 {
         // Each TestCaseConversion call below takes, in order: the source string, the expected
         // lowercase, titlecase and uppercase results, and the 'turkic' and 'lithuanian' switches.
         // All strings are built by concatenating the space-terminated code point constants defined here.
         //
         // Because no thorough case-conversion test files have been provided as part
         // of the Unicode standard, we'll have to test things on a few test cases of our own.
         // - First, test some unconditional special mappings, such as 'ss', 'ffl', 'dz', etc.
         const TStr F = "0046 ", L = "004C ", S = "0053 ", T = "0054 ", W = "0057 ";
         const TStr f = "0066 ", l = "006c ", s = "0073 ", t = "0074 ", w = "0077 ";
         const TStr ss = "00df ", ffl = "fb04 ", longs = "017f ", longst = "fb05 ", wRing = "1e98 ", Ring = "030a ";
         const TStr DZ = "01c4 ", Dz = "01c5 ", dz = "01c6 ";
         const TStr space = "0020 ", Grave = "0300 ";
         TestCaseConversion(
                 F + L + s + t + space + Dz + w + T + ss + wRing + space + longs + DZ + space + dz + longst,  // source
                 f + l + s + t + space + dz + w + t + ss + wRing + space + longs + dz + space + dz + longst,  // lowercase
                 F + l + s + t + space + Dz + w + t + ss + wRing + space + S + dz + space + Dz + longst,      // titlecase
                 F + L + S + T + space + DZ + W + T + S + S + W + Ring + space + S + DZ + space + DZ + S + T, // uppercase
                 false, false);
         // - Dotted I, dotless i, etc., but with turkic == false.
         const TStr I = "0049 ", J = "004a ", i = "0069 ", j = "006a ", iDotless = "0131 ", IDot = "0130 ", DotA = "0307 ";
         TestCaseConversion(
                 s + I + t + i + w + iDotless + f + IDot + l + space + iDotless + DotA + f + I + DotA + s, // source
                 s + i + t + i + w + iDotless + f + i + DotA + l + space + iDotless + DotA + f + i + DotA + s, // lowercase
                 S + i + t + i + w + iDotless + f + i + DotA + l + space + I + DotA + f + i + DotA + s, // titlecase
                 S + I + T + I + W + I + F + IDot + L + space + I + DotA + F + I + DotA + S, // uppercase
                 false, false);
         // - Sigma (final vs. non-final forms).
         const TStr Sigma = "03a3 ", sigma = "03c3 ", fsigma = "03c2 ";
         TestCaseConversion(
                 Sigma + s + space + s + Sigma  + space + s + Sigma + s + space + Sigma + S + Sigma  + space + Sigma, // source
                 sigma + s + space + s + fsigma + space + s + sigma + s + space + sigma + s + fsigma + space + sigma, // lowercase
                 Sigma + s + space + S + fsigma + space + S + sigma + s + space + Sigma + s + fsigma + space + Sigma, // titlecase
                 Sigma + S + space + S + Sigma  + space + S + Sigma + S + space + Sigma + S + Sigma  + space + Sigma, // uppercase
                 false, false);
         TestCaseConversion(
                 sigma + s + space + s + sigma  + space + s + sigma + s + space + sigma + S + sigma  + space + sigma, // source
                 sigma + s + space + s + sigma  + space + s + sigma + s + space + sigma + s + sigma  + space + sigma, // lowercase
                 Sigma + s + space + S + sigma  + space + S + sigma + s + space + Sigma + s + sigma  + space + Sigma, // titlecase
                 Sigma + S + space + S + Sigma  + space + S + Sigma + S + space + Sigma + S + Sigma  + space + Sigma, // uppercase
                 false, false);
         TestCaseConversion(
                 fsigma + s + space + s + fsigma + space + s + fsigma + s + space + fsigma + S + fsigma  + space + fsigma, // source
                 fsigma + s + space + s + fsigma + space + s + fsigma + s + space + fsigma + s + fsigma  + space + fsigma, // lowercase
                 Sigma  + s + space + S + fsigma + space + S + fsigma + s + space + Sigma  + s + fsigma  + space + Sigma, // titlecase
                 Sigma  + S + space + S + Sigma  + space + S + Sigma  + S + space + Sigma  + S + Sigma   + space + Sigma, // uppercase
                 false, false);
         const TStr nonSA = "0315 0321 0322 "; // characters that are neither ccStarter nor ccAbove
         // Special case mappings for Turkic languages:
         // - After_I
         TestCaseConversion(
                 s + I + t + i + w + iDotless + f + IDot + l + space + iDotless + DotA + f + I + DotA + J + DotA + I + Grave + DotA + I + DotA + DotA + I + nonSA + DotA + s, // source
                 s + iDotless + t + i + w + iDotless + f + i + l + space + iDotless + DotA + f + i + j + DotA + iDotless + Grave + DotA + i + DotA + i + nonSA + s, // lowercase
                 S + iDotless + t + i + w + iDotless + f + i + l + space + I + DotA + f + i + j + DotA + iDotless + Grave + DotA + i + DotA + i + nonSA + s, // titlecase
                 S + I + T + IDot + W + I + F + IDot + L + space + I + DotA + F + I + DotA + J + DotA + I + Grave + DotA + I + DotA + DotA + I + nonSA + DotA + S, // uppercase
                 true, false); // turkic
         // - Not_Before_Dot
         TestCaseConversion(
                 I + Grave + t + I + DotA + f + I + nonSA + DotA + j + space + I + nonSA + DotA + space + I + Grave + t, // source
                 iDotless + Grave + t + i + f + i + nonSA + j + space + i + nonSA + space + iDotless + Grave + t, // lowercase
                 I + Grave + t + i + f + i + nonSA + j + space + I + nonSA + DotA + space + I + Grave + t, // titlecase
                 I + Grave + T + I + DotA + F + I + nonSA + DotA + J + space + I + nonSA + DotA + space + I + Grave + T, // uppercase
                 true, false); // turkic
         // Special case mappings for Lithuanian:
         // - After_Soft_Dotted  [note: I + DotA turns into i + DotA + DotA when lowercasing due to More_Above]
         TestCaseConversion(
                 i + DotA + t + i + Grave + DotA + f + i + DotA + DotA + f + i + nonSA + DotA + I + DotA + t + DotA + i + DotA + Grave, // source
                 i + DotA + t + i + Grave + DotA + f + i + DotA + DotA + f + i + nonSA + DotA + i + DotA + DotA + t + DotA + i + DotA + Grave, // lowercase
                 I + t + i + Grave + DotA + f + i + DotA + DotA + f + i + nonSA + DotA + i + DotA + DotA + t + DotA + i + DotA + Grave, // titlecase
                 I + T + I + Grave + DotA + F + I + DotA + F + I + nonSA + I + DotA + T + DotA + I + Grave, // uppercase
                 false, true); // lithuanian
         // - More_Above  [note: j + DotA turns into just J when uppercasing due to After_Soft_Dotted]
         TestCaseConversion(
                 J +        Grave + space + J +        nonSA + DotA + space + j + Grave + space + j + DotA + space + J + nonSA + J +        nonSA + Grave + space + j + nonSA, // source
                 j + DotA + Grave + space + j + DotA + nonSA + DotA + space + j + Grave + space + j + DotA + space + j + nonSA + j + DotA + nonSA + Grave + space + j + nonSA, // lowercase
                 J +        Grave + space + J +        nonSA + DotA + space + J + Grave + space + J +        space + J + nonSA + j + DotA + nonSA + Grave + space + J + nonSA, // titlecase
                 J +        Grave + space + J +        nonSA + DotA + space + J + Grave + space + J +        space + J + nonSA + J +        nonSA + Grave + space + J + nonSA, // uppercase
                 false, true); // lithuanian
         // SoftDotted [^ Starter Above]* 0307   --(uc,tc)-->  without 0307
         // SoftDotted [^ Starter Above]* 0307   --(   [NOTE(review): this rule was left unfinished in the original comment]
         //TestCaseConversion("", "", "", "", false, false);
 }

Here is the call graph for this function:

Here is the caller graph for this function:

void TUniChDb::TestComposition ( const TStr & basePath )

protected

Definition at line 745 of file unicode.cpp.

Referenced by Test().

 {
         // Checks Decompose / DecomposeAndCompose against the normalization test file under basePath,
         // then against every code point in 'h' that did not appear in the file's "@Part1" section.
         TUcdFileReader reader; TStrV fields; int nLines = 0;
         reader.Open(CombinePath(basePath, GetNormalizationTestFn()));
         bool inPart1 = false; TIntH testedInPart1;
         while (reader.GetNextLine(fields))
         {
                 nLines += 1;
                 // A single-field line is a section header of the form "@Part...".
                 if (fields.Len() == 1) {
                         IAssert(fields[0].IsPrefix("@Part"));
                         inPart1 = (fields[0] == "@Part1"); continue; }
                 // A data line carries five code point lists (c1..c5) followed by an empty sixth field.
                 IAssert(fields.Len() == 6);
                 IAssert(fields[5].Len() == 0);
                 TIntV c1, c2, c3, c4, c5;
                 reader.ParseCodePointList(fields[0], c1);
                 reader.ParseCodePointList(fields[1], c2);
                 reader.ParseCodePointList(fields[2], c3);
                 reader.ParseCodePointList(fields[3], c4);
                 reader.ParseCodePointList(fields[4], c5);
                 TIntV v;
                 // Helper macros: each one normalizes 'operand' into the local vector 'v' and asserts that the
                 // result equals 'cmpWith'. They rely on the names 'v' and 'nLines' being visible at the point of use.
 #define AssE_(v1, v2, expl) AssertEq(v1, v2, TStr(expl) + " (line " + TInt::GetStr(nLines) + ")", 0)
 #define NFC_(cmpWith, operand) DecomposeAndCompose(operand, 0, operand.Len(), v, false); AssE_(cmpWith, v, #cmpWith " == NFC(" #operand ")")
 #define NFD_(cmpWith, operand) Decompose(operand, 0, operand.Len(), v, false); AssE_(cmpWith, v, #cmpWith " == NFD(" #operand ")")
 #define NFKC_(cmpWith, operand) DecomposeAndCompose(operand, 0, operand.Len(), v, true); AssE_(cmpWith, v, #cmpWith " == NFKC(" #operand ")")
 #define NFKD_(cmpWith, operand) Decompose(operand, 0, operand.Len(), v, true); AssE_(cmpWith, v, #cmpWith " == NFKD(" #operand ")")
                 // NFD:
                 NFD_(c3, c1);   // c3 == NFD(c1)
                 NFD_(c3, c2);   // c3 == NFD(c2)
                 NFD_(c3, c3);   // c3 == NFD(c3)
                 NFD_(c5, c4);   // c5 == NFD(c4)
                 NFD_(c5, c5);   // c5 == NFD(c5)
                 // NFC:
                 NFC_(c2, c1);   // c2 == NFC(c1)
                 NFC_(c2, c2);   // c2 == NFC(c2)
                 NFC_(c2, c3);   // c2 == NFC(c3)
                 NFC_(c4, c4);   // c4 == NFC(c4)
                 NFC_(c4, c5);   // c4 == NFC(c5)
                 // NFKD:
                 NFKD_(c5, c1);   // c5 == NFKD(c1)
                 NFKD_(c5, c2);   // c5 == NFKD(c2)
                 NFKD_(c5, c3);   // c5 == NFKD(c3)
                 NFKD_(c5, c4);   // c5 == NFKD(c4)
                 NFKD_(c5, c5);   // c5 == NFKD(c5)
                 // NFKC:
                 NFKC_(c4, c1);   // c4 == NFKC(c1)
                 NFKC_(c4, c2);   // c4 == NFKC(c2)
                 NFKC_(c4, c3);   // c4 == NFKC(c3)
                 NFKC_(c4, c4);   // c4 == NFKC(c4)
                 NFKC_(c4, c5);   // c4 == NFKC(c5)
                 //
                 // Remember which single code points part 1 has covered.
                 if (inPart1) {
                         IAssert(c1.Len() == 1);
                         testedInPart1.AddKey(c1[0]); }
         }
         reader.Close();
         // Test other individual codepoints that were not mentioned in part 1.
         int nOther = 0;
         for (int i = h.FFirstKeyId(); h.FNextKeyId(i); )
         {
                 // The inner 'nLines' deliberately hides the outer counter: the macros read 'nLines'
                 // when building their message, so failures in this loop are reported as line -1.
                 const int cp = h.GetKey(i), nLines = -1;
                 if (testedInPart1.IsKey(cp)) continue;
                 TIntV x, v; x.Add(cp);
                 NFC_(x, x);    // x == NFC(x)
                 NFD_(x, x);    // x == NFD(x)
                 NFKC_(x, x);   // x == NFKC(x)
                 NFKD_(x, x);   // x == NFKD(x)
                 nOther += 1;
         }
 #undef AssE_
 #undef NFC_
 #undef NFD_
 #undef NFKC_
 #undef NFKD_
         printf("TUniChDb::TestComposition: %d lines processed + %d other individual codepoints.\n", nLines, nOther);
 }

Here is the call graph for this function:

Here is the caller graph for this function:

void TUniChDb::TestFindNextWordOrSentenceBoundary	(	const TStr &	basePath,
		bool	sentence
	)

protected

Definition at line 649 of file unicode.cpp.

References TVec< TVal, TSizeTy >::Add(), TVec< TVal, TSizeTy >::AddV(), AlwaysFalse(), TUniChDb::TUcdFileReader::Close(), anonymous_namespace{unicode.cpp}::CombinePath(), TStr::CStr(), Fail, FailR, FindNextSentenceBoundary(), FindNextWordBoundary(), FindSentenceBoundaries(), FindWordBoundaries(), TVec< TVal, TSizeTy >::Gen(), GetAuxiliaryDir(), TUniChDb::TUcdFileReader::GetNextLine(), GetSbFlags(), TUniChInfo::GetSbFlagsStr(), GetSentenceBreakTestFn(), TRnd::GetUniDevInt(), GetWbFlags(), TUniChInfo::GetWbFlagsStr(), GetWordBreakTestFn(), IAssert, IsWbIgnored(), TVec< TVal, TSizeTy >::Len(), TUniChDb::TUcdFileReader::Open(), TUniChDb::TUcdFileReader::ParseCodePoint(), and TVec< TVal, TSizeTy >::PutAll().

Referenced by Test().

 {
         TUcdFileReader reader; TStrV fields;
         reader.Open(CombinePath(CombinePath(basePath, GetAuxiliaryDir()), (sentence ? GetSentenceBreakTestFn() : GetWordBreakTestFn())));
         int nLines = 0; TRnd rnd = TRnd(123);
         while (reader.GetNextLine(fields))
         {
                 nLines += 1;
                 IAssert(fields.Len() == 1);
                 TStrV parts; fields[0].SplitOnWs(parts);
                 const int n = parts.Len(); IAssert((n % 2) == 1);
                 TIntV chars; TBoolV isBreak, isPredicted, isPredicted2;
                 // Each line is a sequence of codepoints, with a \times or \div in between each
                 // pair of codepoints (as well as at the beginning and the end of the sequence) to
                 // indicate whether a boundary exists there or not.
                 for (int i = 0; i < n; i++)
                 {
                         const TStr& s = parts[i];
                         if ((i % 2) == 0) {
                                 if (s == "\xc3\x97") // multiplication sign (U+00D7) in UTF-8
                                         isBreak.Add(false);
                                 else if (s == "\xc3\xb7") // division sign (U+00F7) in UTF-8
                                         isBreak.Add(true);
                                 else FailR(s.CStr()); }
                         else chars.Add(reader.ParseCodePoint(s));
                 }
                 const int m = n / 2; IAssert(chars.Len() == m); IAssert(isBreak.Len() == m + 1);
                 IAssert(isBreak[0]); IAssert(isBreak[m]);
                 isPredicted.Gen(m + 1); isPredicted.PutAll(false);
                 if (AlwaysFalse()) { printf("%3d", nLines); for (int i = 0; i < m; i++) printf(" %04x", int(chars[i])); printf("\n"); }
                 // We'll insert a few random characters at the beginning of the sequence
                 // so that srcPos doesn't always begin at 0.
                 for (int nBefore = 0; nBefore < 5; nBefore++)
                 {
                         TIntV chars2; for (int i = 0; i < nBefore; i++) chars2.Add(0, rnd.GetUniDevInt(0x10ffff + 1));
                         chars2.AddV(chars);
                         // Use FindNextBoundary to find all the word boundaries.
                         size_t position = (nBefore > 0 ? nBefore - 1 : nBefore); size_t prevPosition = position;
                         while (sentence ? FindNextSentenceBoundary(chars2, nBefore, m, position) : FindNextWordBoundary(chars2, nBefore, m, position))
                         {
                                 IAssert(prevPosition < position);
                                 IAssert(position <= size_t(nBefore + m));
                                 isPredicted[int(position) - nBefore] = true;
                                 prevPosition = position;
                         }
                         IAssert(position == size_t(nBefore + m));
                         if (sentence) FindSentenceBoundaries(chars2, nBefore, m, isPredicted2);
                         else FindWordBoundaries(chars2, nBefore, m, isPredicted2);
                         IAssert(isPredicted2.Len() == m + 1);
                         bool ok = true;
                         // If we start at 0, the word boundary at the beginning of the sequence was
                         // not found explicitly, so we'll add it now.
                         if (nBefore == 0) isPredicted[0] = true;
                         // Compare the predicted and the true boundaries.
                         for (int i = 0; i <= m; i++) {
                                 if (isBreak[i] != isPredicted[i]) ok = false;
                                 IAssert(isPredicted2[i] == isPredicted[i]); }
                         FILE *f = stderr;
                         if (! ok)
                         {
                                 fprintf(f, "\nError in line %d:\n", nLines);
                                 fprintf(f, "True:      ");
                                 for (int i = 0; i <= m; i++) {
                                         fprintf(f, "%s ", (isBreak[i] ? "|" : "."));
                                         if (i < m) fprintf(f, "%04x ", int(chars[i + nBefore])); }
                                 fprintf(f, "\nPredicted: ");
                                 for (int i = 0; i <= m; i++) {
                                         fprintf(f, "%s ", (isPredicted[i] ? "|" : "."));
                                         if (i < m) {
                                                 const int cp = chars[i + nBefore];
                                                 TStr s = sentence ? TUniChInfo::GetSbFlagsStr(GetSbFlags(cp)) : TUniChInfo::GetWbFlagsStr(GetWbFlags(cp));
                                                 if (IsWbIgnored(cp)) s = "*" + s;
                                                 fprintf(f, "%4s ", s.CStr()); }}
                                 fprintf(f, "\n");
                                 Fail;
                         }
                         // Test FindNextBoundary if we start in the middle of the sequence,
                         // i.e. not at an existing boundary.
                         for (int i = 0; i < m; i++) {
                                 position = i + nBefore; bool ok = sentence ? FindNextSentenceBoundary(chars2, nBefore, m, position) : FindNextWordBoundary(chars2, nBefore, m, position);
                                 IAssert(ok); // at the very least, there should be the 'boundary' at nBefore + m
                                 IAssert(size_t(i + nBefore) < position); IAssert(position <= size_t(nBefore + m));
                                 position -= nBefore;
                                 for (int j = i + 1; j < int(position); j++)
                                         IAssert(! isBreak[j]);
                                 IAssert(isBreak[int(position)]); }
                 }
         }
         reader.Close();
         printf("TUniChDb::TestFindNext%sBoundary: %d lines processed.\n", (sentence ? "Sentence" : "Word"), nLines);
 }

Here is the call graph for this function:

Here is the caller graph for this function:

void TUniChDb::TestWbFindNonIgnored ( const TIntV & src ) const

protected

Definition at line 579 of file unicode.cpp.

References TVec< TVal, TSizeTy >::Gen(), IAssert, IsWbIgnored(), TVec< TVal, TSizeTy >::Len(), WbFindCurOrNextNonIgnored(), WbFindNextNonIgnored(), and WbFindPrevNonIgnored().

 {
         int n = src.Len();
         TBoolV isIgnored; isIgnored.Gen(n);
         for (int i = 0; i < n; i++) isIgnored[i] = IsWbIgnored(src[i]);
         TIntV prevNonIgnored, nextNonIgnored, curOrNextNonIgnored;
         prevNonIgnored.Gen(n); nextNonIgnored.Gen(n); curOrNextNonIgnored.Gen(n);
         FILE *f = 0; // stderr;
         for (int srcIdx = 0; srcIdx < n; srcIdx++) for (int srcLen = 1; srcLen < n - srcIdx; srcLen++)
         {
                 int prev = -1;
                 for (int i = 0; i < srcLen; i++) {
                         prevNonIgnored[i] = prev;
                         if (! isIgnored[srcIdx + i]) prev = srcIdx + i; }
                 int next = srcIdx + srcLen;
                 for (int i = srcLen - 1; i >= 0; i--) {
                         nextNonIgnored[i] = next;
                         if (! isIgnored[srcIdx + i]) next = srcIdx + i;
                         curOrNextNonIgnored[i] = next; }
                 if (f) {
                         fprintf(f, "\nIndex:     "); for (int i = 0; i < srcLen; i++) fprintf(f, " %2d", srcIdx + i);
                         fprintf(f, "\nNonIgn:    "); for (int i = 0; i < srcLen; i++) fprintf(f, " %s", (isIgnored[srcIdx + i] ? " ." : " Y"));
                         fprintf(f, "\nPrevNI:    "); for (int i = 0; i < srcLen; i++) fprintf(f, " %2d", int(prevNonIgnored[i]));
                         fprintf(f, "\nNextNI:    "); for (int i = 0; i < srcLen; i++) fprintf(f, " %2d", int(nextNonIgnored[i]));
                         fprintf(f, "\nCurNextNI: "); for (int i = 0; i < srcLen; i++) fprintf(f, " %2d", int(curOrNextNonIgnored[i]));
                         fprintf(f, "\n"); }
                 for (int i = 0; i < srcLen; i++)
                 {
                         size_t s;
                         s = size_t(srcIdx + i); WbFindNextNonIgnored(src, s, size_t(srcIdx + srcLen));
                         IAssert(s == size_t(nextNonIgnored[i]));
                         s = size_t(srcIdx + i); WbFindCurOrNextNonIgnored(src, s, size_t(srcIdx + srcLen));
                         IAssert(s == size_t(curOrNextNonIgnored[i]));
                         s = size_t(srcIdx + i); bool ok = WbFindPrevNonIgnored(src, size_t(srcIdx), s);
                         if (prevNonIgnored[i] < 0) { IAssert(! ok); IAssert(s == size_t(srcIdx)); }
                         else { IAssert(ok); IAssert(s == size_t(prevNonIgnored[i])); }
                 }
         }
 }

Here is the call graph for this function:

void TUniChDb::TestWbFindNonIgnored ( ) const

protected

Definition at line 619 of file unicode.cpp.

References TVec< TVal, TSizeTy >::Add(), TStr::CStr(), THash< TKey, TDat, THashFunc >::FFirstKeyId(), TUniChInfo::flags, THash< TKey, TDat, THashFunc >::FNextKeyId(), TVec< TVal, TSizeTy >::Gen(), THash< TKey, TDat, THashFunc >::GetKey(), GetScriptName(), h, IsWbIgnored(), TVec< TVal, TSizeTy >::Len(), TUniChInfo::properties, TUniChInfo::propertiesX, TUniChInfo::script, and TVec< TVal, TSizeTy >::Sort().

Referenced by Test().

 {
         TIntV chIgnored, chNonIgnored;
         FILE *f = 0; // stderr;
         for (int i = h.FFirstKeyId(); h.FNextKeyId(i); ) {
                 const int cp = h.GetKey(i); const TUniChInfo& ci = h[i];
                 if (f) fprintf(f, "%04x: flags %08x props %08x %08x script \"%s\"\n", cp,
                         ci.flags, ci.properties, ci.propertiesX, GetScriptName(ci.script).CStr());
                 (IsWbIgnored(h[i]) ? chIgnored : chNonIgnored).Add(h.GetKey(i));
         }
         chIgnored.Sort(); chNonIgnored.Sort();
         printf("TUniChDb::TestWbNonIgnored: %d ignored, %d nonignored chars.\n", chIgnored.Len(), chNonIgnored.Len());
         TRnd rnd = TRnd(123);
         for (int iter = 0; iter <= 50; iter++)
         {
                 int percIgnored = 2 * iter;
                 for (int n = 0; n <= 20; n++)
                 {
                         // Prepare a random sequence of 'n' codepoints.
                         TIntV v; v.Gen(n);
                         for (int i = 0; i < n; i++) {
                                 TIntV& chars = (rnd.GetUniDevInt(100) < percIgnored) ? chIgnored : chNonIgnored;
                                 int j = rnd.GetUniDevInt(chars.Len());
                                 v.Add(chars[j]); }
                         // Run the tests with this sequence.
                         TestWbFindNonIgnored(v);
                 }
         }
 }

Here is the call graph for this function:

Here is the caller graph for this function:

template<typename TSrcVec >

void TUniChDb::ToCaseFolded	(	TSrcVec &	src,
		size_t	srcIdx,
		const size_t	srcCount,
		const bool	turkic = `false`
	)		const

inline

Definition at line 1636 of file unicode.h.

References caseFolding, and TUniCaseFolding::FoldInPlace().

Referenced by TUnicode::ToCaseFolded().

1636 { caseFolding.FoldInPlace(src, srcIdx, srcCount, turkic); }

TUniChDb::caseFolding

TUniCaseFolding caseFolding

Definition: unicode.h:1268

TUniCaseFolding::FoldInPlace

void FoldInPlace(TSrcVec &src, size_t srcIdx, const size_t srcCount, const bool turkic) const

Definition: unicode.h:307

Here is the call graph for this function:

Here is the caller graph for this function:

template<typename TSrcVec >

void TUniChDb::ToCaseFolded	(	TSrcVec &	src,
		const bool	turkic = `false`
	)		const

inline

Definition at line 1637 of file unicode.h.

References ToCaseFolded().

Referenced by ToCaseFolded().

1637 { ToCaseFolded(src, 0, src.Len(), turkic); }

TUniChDb::ToCaseFolded

void ToCaseFolded(TSrcVec &src, size_t srcIdx, const size_t srcCount, const bool turkic=false) const

Definition: unicode.h:1636

Here is the call graph for this function:

Here is the caller graph for this function:

template<typename TSrcVec >

void TUniChDb::ToSimpleCaseConverted	(	TSrcVec &	src,
		size_t	srcIdx,
		const size_t	srcCount,
		const TCaseConversion	how
	)		const

Definition at line 3072 of file unicode.h.

References ccLower, ccTitle, ccUpper, FindNextWordBoundary(), THash< TKey, TDat, THashFunc >::GetKeyId(), h, IAssert, TUniChInfo::simpleLowerCaseMapping, TUniChInfo::simpleTitleCaseMapping, and TUniChInfo::simpleUpperCaseMapping.

Referenced by ToSimpleLowerCase(), ToSimpleTitleCase(), and ToSimpleUpperCase().

 {
         bool seenCased = false; size_t nextWordBoundary = srcIdx;
         for (const size_t origSrcIdx = srcIdx, srcEnd = srcIdx + srcCount; srcIdx < srcEnd; srcIdx++)
         {
                 const int cp = src[TVecIdx(srcIdx)];
                 int i = h.GetKeyId(cp); if (i < 0) continue;
                 const TUniChInfo &ci = h[i];
                 // With titlecasing, the first cased character of each word must be put into titlecase,
                 // all others into lowercase.  This is what the howHere variable is for.
                 TUniChDb::TCaseConversion howHere;
                 if (how != ccTitle) howHere = how;
                 else {
                         if (srcIdx == nextWordBoundary) { // A word starts/ends here.
                                 seenCased = false;
                                 size_t next = nextWordBoundary; FindNextWordBoundary(src, origSrcIdx, srcCount, next);
                                 IAssert(next > nextWordBoundary); nextWordBoundary = next; }
                         bool isCased = IsCased(cp);
                         if (isCased && ! seenCased) { howHere = ccTitle; seenCased = true; }
                         else howHere = ccLower;
                 }
                 int cpNew = (howHere == ccTitle ? ci.simpleTitleCaseMapping : howHere == ccUpper ? ci.simpleUpperCaseMapping : ci.simpleLowerCaseMapping);
                 if (cpNew >= 0) src[TVecIdx(srcIdx)] = cpNew;
         }
 }

Here is the call graph for this function:

Here is the caller graph for this function:

template<typename TSrcVec >

void TUniChDb::ToSimpleLowerCase	(	TSrcVec &	src,
		size_t	srcIdx,
		const size_t	srcCount
	)		const

inline

Definition at line 1610 of file unicode.h.

References ccLower, and ToSimpleCaseConverted().

Referenced by TUnicode::ToSimpleLowerCase().

1610 { ToSimpleCaseConverted(src, srcIdx, srcCount, ccLower); }

TUniChDb::ToSimpleCaseConverted

void ToSimpleCaseConverted(TSrcVec &src, size_t srcIdx, const size_t srcCount, const TCaseConversion how) const

Definition: unicode.h:3072

TUniChDb::ToSimpleLowerCase

Definition: unicode.h:1584

Here is the call graph for this function:

Here is the caller graph for this function:

template<typename TSrcVec >

void TUniChDb::ToSimpleLowerCase ( TSrcVec & src ) const

inline

Definition at line 1613 of file unicode.h.

References ToSimpleLowerCase().

Referenced by ToSimpleLowerCase().

1613 { ToSimpleLowerCase(src, 0, src.Len()); }

void ToSimpleLowerCase(TSrcVec &src, size_t srcIdx, const size_t srcCount) const

Definition: unicode.h:1610

Here is the call graph for this function:

Here is the caller graph for this function:

template<typename TSrcVec >

void TUniChDb::ToSimpleTitleCase	(	TSrcVec &	src,
		size_t	srcIdx,
		const size_t	srcCount
	)		const

inline

Definition at line 1611 of file unicode.h.

References ccTitle, and ToSimpleCaseConverted().

Referenced by TUnicode::ToSimpleTitleCase().

1611 { ToSimpleCaseConverted(src, srcIdx, srcCount, ccTitle); }

TUniChDb::ToSimpleCaseConverted

void ToSimpleCaseConverted(TSrcVec &src, size_t srcIdx, const size_t srcCount, const TCaseConversion how) const

Definition: unicode.h:3072

TUniChDb::ToSimpleTitleCase

Definition: unicode.h:1584

Here is the call graph for this function:

Here is the caller graph for this function:

template<typename TSrcVec >

void TUniChDb::ToSimpleTitleCase ( TSrcVec & src ) const

inline

Definition at line 1614 of file unicode.h.

References ToSimpleTitleCase().

Referenced by ToSimpleTitleCase().

1614 { ToSimpleTitleCase(src, 0, src.Len()); }

void ToSimpleTitleCase(TSrcVec &src, size_t srcIdx, const size_t srcCount) const

Definition: unicode.h:1611

Here is the call graph for this function:

Here is the caller graph for this function:

template<typename TSrcVec >

void TUniChDb::ToSimpleUpperCase	(	TSrcVec &	src,
		size_t	srcIdx,
		const size_t	srcCount
	)		const

inline

Definition at line 1609 of file unicode.h.

References ccUpper, and ToSimpleCaseConverted().

Referenced by TUnicode::ToSimpleUpperCase().

1609 { ToSimpleCaseConverted(src, srcIdx, srcCount, ccUpper); }