SNAP Library 2.1, Developer Reference
2013-09-25 10:47:25
SNAP, a general purpose, high performance system for analysis and manipulation of large networks
|
#include <unicode.h>
Public Types | |
enum | { DefaultReplacementChar = 0xfffd } |
Public Member Functions | |
TUniCodec () | |
TUniCodec (TUnicodeErrorHandling errorHandling_, bool strict_, int replacementChar_, bool skipBom_) | |
template<typename TSrcVec , typename TDestCh > | |
size_t | DecodeUtf8 (const TSrcVec &src, size_t srcIdx, const size_t srcCount, TVec< TDestCh > &dest, const bool clrDest=true) const |
template<typename TSrcVec , typename TDestCh > | |
size_t | DecodeUtf8 (const TSrcVec &src, TVec< TDestCh > &dest, const bool clrDest=true) const |
template<typename TSrcVec , typename TDestCh > | |
size_t | EncodeUtf8 (const TSrcVec &src, size_t srcIdx, const size_t srcCount, TVec< TDestCh > &dest, const bool clrDest=true) const |
template<typename TSrcVec , typename TDestCh > | |
size_t | EncodeUtf8 (const TSrcVec &src, TVec< TDestCh > &dest, const bool clrDest=true) const |
template<typename TSrcVec > | |
TStr | EncodeUtf8Str (const TSrcVec &src, size_t srcIdx, const size_t srcCount) const |
template<typename TSrcVec > | |
TStr | EncodeUtf8Str (const TSrcVec &src) const |
template<typename TSrcVec , typename TDestCh > | |
size_t | DecodeUtf16FromBytes (const TSrcVec &src, size_t srcIdx, const size_t srcCount, TVec< TDestCh > &dest, const bool clrDest, const TUtf16BomHandling bomHandling=bomAllowed, const TUniByteOrder defaultByteOrder=boMachineEndian) const |
template<typename TSrcVec , typename TDestCh > | |
size_t | DecodeUtf16FromWords (const TSrcVec &src, size_t srcIdx, const size_t srcCount, TVec< TDestCh > &dest, bool clrDest, const TUtf16BomHandling bomHandling=bomAllowed, const TUniByteOrder defaultByteOrder=boMachineEndian) const |
template<typename TSrcVec , typename TDestCh > | |
size_t | EncodeUtf16ToWords (const TSrcVec &src, size_t srcIdx, const size_t srcCount, TVec< TDestCh > &dest, const bool clrDest, const bool insertBom, const TUniByteOrder destByteOrder=boMachineEndian) const |
template<typename TSrcVec , typename TDestCh > | |
size_t | EncodeUtf16ToBytes (const TSrcVec &src, size_t srcIdx, const size_t srcCount, TVec< TDestCh > &dest, const bool clrDest, const bool insertBom, const TUniByteOrder destByteOrder=boMachineEndian) const |
void | TestUtf8 () |
void | TestUtf16 () |
Public Attributes | |
int | replacementChar |
TUnicodeErrorHandling | errorHandling |
bool | strict |
bool | skipBom |
Protected Types | |
enum | { DefineByte = (1, 0, 0, 0, 0, 0, 0, 0), DefineByte = (1, 0, 0, 0, 0, 0, 0, 0), DefineByte = (1, 0, 0, 0, 0, 0, 0, 0), DefineByte = (1, 0, 0, 0, 0, 0, 0, 0), DefineByte = (1, 0, 0, 0, 0, 0, 0, 0), DefineByte = (1, 0, 0, 0, 0, 0, 0, 0), DefineByte = (1, 0, 0, 0, 0, 0, 0, 0), DefineByte = (1, 0, 0, 0, 0, 0, 0, 0), DefineByte = (1, 0, 0, 0, 0, 0, 0, 0), DefineByte = (1, 0, 0, 0, 0, 0, 0, 0), DefineByte = (1, 0, 0, 0, 0, 0, 0, 0), DefineByte = (1, 0, 0, 0, 0, 0, 0, 0), DefineByte = (1, 0, 0, 0, 0, 0, 0, 0) } |
enum | { Utf16FirstSurrogate = 0xd800, Utf16SecondSurrogate = 0xdc00 } |
typedef TUniVecIdx | TVecIdx |
Protected Member Functions | |
void | TestUtf8 (bool decode, size_t expectedRetVal, bool expectedThrow, const TIntV &src, const TIntV &expectedDest, FILE *f) |
void | TestDecodeUtf8 (TRnd &rnd, const TStr &testCaseDesc) |
void | WordsToBytes (const TIntV &src, TIntV &dest) |
void | TestUtf16 (bool decode, size_t expectedRetVal, bool expectedThrow, const TIntV &src, const TIntV &expectedDest, const TUtf16BomHandling bomHandling, const TUniByteOrder defaultByteOrder, const bool insertBom, FILE *f) |
void | TestDecodeUtf16 (TRnd &rnd, const TStr &testCaseDesc, const TUtf16BomHandling bomHandling, const TUniByteOrder defaultByteOrder, const bool insertBom) |
Static Protected Member Functions | |
static bool | IsMachineLittleEndian () |
static uint | GetRndUint (TRnd &rnd) |
static uint | GetRndUint (TRnd &rnd, uint minVal, uint maxVal) |
static int | SwapBytes (int x) |
Friends | |
class | TUniCaseFolding |
class | TUnicode |
typedef TUniVecIdx TUniCodec::TVecIdx [protected] |
anonymous enum |
Definition at line 59 of file unicode.h.
{ DefaultReplacementChar = 0xfffd };
anonymous enum [protected] |
DefineByte | |
DefineByte | |
DefineByte | |
DefineByte | |
DefineByte | |
DefineByte | |
DefineByte | |
DefineByte | |
DefineByte | |
DefineByte | |
DefineByte | |
DefineByte | |
DefineByte |
Definition at line 101 of file unicode.h.
{ #define DefineByte(b7, b6, b5, b4, b3, b2, b1, b0) _ ## b7 ## b6 ## b5 ## b4 ## _ ## b3 ## b2 ## b1 ## b0 = (b7 << 7) | (b6 << 6) | (b5 << 5) | (b4 << 4) | (b3 << 3) | (b2 << 2) | (b1 << 1) | b0 DefineByte(1, 0, 0, 0, 0, 0, 0, 0), DefineByte(1, 1, 0, 0, 0, 0, 0, 0), DefineByte(1, 1, 1, 0, 0, 0, 0, 0), DefineByte(1, 1, 1, 1, 0, 0, 0, 0), DefineByte(1, 1, 1, 1, 1, 0, 0, 0), DefineByte(1, 1, 1, 1, 1, 1, 0, 0), DefineByte(1, 1, 1, 1, 1, 1, 1, 0), DefineByte(0, 0, 1, 1, 1, 1, 1, 1), DefineByte(0, 0, 0, 1, 1, 1, 1, 1), DefineByte(0, 0, 0, 0, 1, 1, 1, 1), DefineByte(0, 0, 0, 0, 0, 1, 1, 1), DefineByte(0, 0, 0, 0, 0, 0, 1, 1) #undef DefineByte };
anonymous enum [protected] |
Definition at line 157 of file unicode.h.
{ Utf16FirstSurrogate = 0xd800, Utf16SecondSurrogate = 0xdc00 };
TUniCodec::TUniCodec | ( | ) | [inline] |
Definition at line 91 of file unicode.h.
: replacementChar(DefaultReplacementChar), errorHandling(uehIgnore), strict(false), skipBom(true) { }
TUniCodec::TUniCodec | ( | TUnicodeErrorHandling | errorHandling_, |
bool | strict_, | ||
int | replacementChar_, | ||
bool | skipBom_ | ||
) | [inline] |
Definition at line 95 of file unicode.h.
: replacementChar(replacementChar_), errorHandling(errorHandling_), strict(strict_), skipBom(skipBom_) { }
size_t TUniCodec::DecodeUtf16FromBytes | ( | const TSrcVec & | src, |
size_t | srcIdx, | ||
const size_t | srcCount, | ||
TVec< TDestCh > & | dest, | ||
const bool | clrDest, | ||
const TUtf16BomHandling | bomHandling = bomAllowed , |
||
const TUniByteOrder | defaultByteOrder = boMachineEndian |
||
) | const |
Definition at line 2210 of file unicode.h.
References TVec< TVal, TSizeTy >::Add(), boBigEndian, boLittleEndian, boMachineEndian, bomAllowed, bomIgnored, bomRequired, TVec< TVal, TSizeTy >::Clr(), errorHandling, Fail, TInt::GetStr(), IAssert, IsMachineLittleEndian(), replacementChar, skipBom, strict, uehAbort, uehIgnore, uehReplace, uehThrow, Utf16FirstSurrogate, and Utf16SecondSurrogate.
Referenced by TUnicode::DecodeUtf16FromBytes(), and TestUtf16().
{ IAssert(srcCount % 2 == 0); IAssert(bomHandling == bomAllowed || bomHandling == bomRequired || bomHandling == bomIgnored); IAssert(defaultByteOrder == boMachineEndian || defaultByteOrder == boBigEndian || defaultByteOrder == boLittleEndian); if (clrDest) dest.Clr(); size_t nDecoded = 0; if (srcCount <= 0) return nDecoded; const size_t origSrcIdx = srcIdx, srcEnd = srcIdx + srcCount; bool littleEndian = false; bool leDefault = (defaultByteOrder == boLittleEndian || (defaultByteOrder == boMachineEndian && IsMachineLittleEndian())); if (bomHandling == bomIgnored) littleEndian = leDefault; else if (bomHandling == bomAllowed || bomHandling == bomRequired) { int byte1 = uint(src[TVecIdx(srcIdx)]) & 0xff, byte2 = uint(src[TVecIdx(srcIdx + 1)]) & 0xff; if (byte1 == 0xfe && byte2 == 0xff) { littleEndian = false; if (skipBom) srcIdx += 2; } else if (byte1 == 0xff && byte2 == 0xfe) { littleEndian = true; if (skipBom) srcIdx += 2; } else if (bomHandling == bomAllowed) littleEndian = leDefault; else { // Report an error. switch (errorHandling) { case uehThrow: throw TUnicodeException(srcIdx, byte1, "BOM expected at the beginning of the input vector (" + TInt::GetStr(byte1, "%02x") + " " + TInt::GetStr(byte2, "%02x") + " found instead)."); case uehAbort: case uehReplace: case uehIgnore: return size_t(-1); default: Fail; } } } else Fail; while (srcIdx < srcEnd) { const size_t charSrcIdx = srcIdx; uint byte1 = uint(src[TVecIdx(srcIdx)]) & 0xff, byte2 = uint(src[TVecIdx(srcIdx + 1)]) & 0xff; srcIdx += 2; uint c = littleEndian ? (byte1 | (byte2 << 8)) : (byte2 | (byte1 << 8)); if (Utf16FirstSurrogate <= c && c <= Utf16FirstSurrogate + 1023) { // c is the first character in a surrogate pair. Read the next character. if (! (srcIdx + 2 <= srcEnd)) { switch (errorHandling) { case uehThrow: throw TUnicodeException(charSrcIdx, c, "The second character of a surrogate pair is missing."); case uehAbort: return nDecoded; case uehReplace: dest.Add(TDestCh(replacementChar)); continue; case uehIgnore: continue; default: Fail; } } uint byte1 = uint(src[TVecIdx(srcIdx)]) & 0xff, byte2 = uint(src[TVecIdx(srcIdx + 1)]) & 0xff; srcIdx += 2; uint c2 = littleEndian ? (byte1 | (byte2 << 8)) : (byte2 | (byte1 << 8)); // c2 should be the second character of the surrogate pair. if (c2 < Utf16SecondSurrogate || Utf16SecondSurrogate + 1023 < c2) { switch (errorHandling) { case uehThrow: throw TUnicodeException(charSrcIdx + 2, c2, "The second character of a surrogate pair should be in the range " + TInt::GetStr(Utf16SecondSurrogate, "%04x") + ".." + TInt::GetStr(Utf16SecondSurrogate + 1023, "%04x") + ", not " + TInt::GetStr(c2, "04x") + "."); case uehAbort: return nDecoded; // with uehReplace and uehIgnore, we'll just skip the first character of the surrogate pair; we'll process the second one during the next iteration, this time as an ordinary character case uehReplace: dest.Add(TDestCh(replacementChar)); srcIdx -= 2; continue; case uehIgnore: srcIdx -= 2; continue; default: Fail; } } // c and c2 each contain 10 bits of information. uint cc = ((c - Utf16FirstSurrogate) << 10) | (c2 - Utf16SecondSurrogate); cc += 0x10000; dest.Add(TDestCh(cc)); nDecoded++; continue; } else if (strict && Utf16SecondSurrogate <= c && c <= Utf16SecondSurrogate + 1023) { switch (errorHandling) { case uehThrow: throw TUnicodeException(charSrcIdx, c, "This 16-bit value should be used only as the second character of a surrogate pair."); case uehAbort: return nDecoded; case uehReplace: dest.Add(TDestCh(replacementChar)); continue; case uehIgnore: continue; default: Fail; } } // If 'c' is the first character in the input stream, and it's a BOM, we might have to skip it. if (charSrcIdx == origSrcIdx && (c == 0xfffeu || c == 0xfeffu) && skipBom) continue; // Otherwise, store 'c' to the destination vector. dest.Add(TDestCh(c)); nDecoded++; } return nDecoded; }
size_t TUniCodec::DecodeUtf16FromWords | ( | const TSrcVec & | src, |
size_t | srcIdx, | ||
const size_t | srcCount, | ||
TVec< TDestCh > & | dest, | ||
bool | clrDest, | ||
const TUtf16BomHandling | bomHandling = bomAllowed , |
||
const TUniByteOrder | defaultByteOrder = boMachineEndian |
||
) | const |
Definition at line 2294 of file unicode.h.
References TVec< TVal, TSizeTy >::Add(), boBigEndian, boLittleEndian, boMachineEndian, bomAllowed, bomIgnored, bomRequired, TVec< TVal, TSizeTy >::Clr(), errorHandling, Fail, TInt::GetStr(), IAssert, IsMachineLittleEndian(), replacementChar, skipBom, strict, uehAbort, uehIgnore, uehReplace, uehThrow, Utf16FirstSurrogate, and Utf16SecondSurrogate.
Referenced by TUnicode::DecodeUtf16FromWords(), and TestUtf16().
{ IAssert(bomHandling == bomAllowed || bomHandling == bomRequired || bomHandling == bomIgnored); IAssert(defaultByteOrder == boMachineEndian || defaultByteOrder == boBigEndian || defaultByteOrder == boLittleEndian); if (clrDest) dest.Clr(); size_t nDecoded = 0; if (srcCount <= 0) return nDecoded; const size_t origSrcIdx = srcIdx, srcEnd = srcIdx + srcCount; bool swap = false; bool isMachineLe = IsMachineLittleEndian(); bool isDefaultLe = (defaultByteOrder == boLittleEndian || (defaultByteOrder == boMachineEndian && isMachineLe)); if (bomHandling == bomIgnored) swap = (isDefaultLe != isMachineLe); else if (bomHandling == bomAllowed || bomHandling == bomRequired) { int c = uint(src[TVecIdx(srcIdx)]) & 0xffff; if (c == 0xfeff) { swap = false; if (skipBom) srcIdx += 1; } else if (c == 0xfffe) { swap = true; if (skipBom) srcIdx += 1; } else if (bomHandling == bomAllowed) swap = (isMachineLe != isDefaultLe); else { // Report an error. switch (errorHandling) { case uehThrow: throw TUnicodeException(srcIdx, c, "BOM expected at the beginning of the input vector (" + TInt::GetStr(c, "%04x") + " found instead)."); case uehAbort: case uehReplace: case uehIgnore: return size_t(-1); default: Fail; } } } else Fail; while (srcIdx < srcEnd) { const size_t charSrcIdx = srcIdx; uint c = uint(src[TVecIdx(srcIdx)]) & 0xffffu; srcIdx++; if (swap) c = ((c >> 8) & 0xff) | ((c & 0xff) << 8); if (Utf16FirstSurrogate <= c && c <= Utf16FirstSurrogate + 1023) { // c is the first character in a surrogate pair. Read the next character. if (! (srcIdx < srcEnd)) { switch (errorHandling) { case uehThrow: throw TUnicodeException(charSrcIdx, c, "The second character of a surrogate pair is missing."); case uehAbort: return nDecoded; case uehReplace: dest.Add(TDestCh(replacementChar)); continue; case uehIgnore: continue; default: Fail; } } uint c2 = uint(src[TVecIdx(srcIdx)]) & 0xffffu; srcIdx++; if (swap) c2 = ((c2 >> 8) & 0xff) | ((c2 & 0xff) << 8); // c2 should be the second character of the surrogate pair. if (c2 < Utf16SecondSurrogate || Utf16SecondSurrogate + 1023 < c2) { switch (errorHandling) { case uehThrow: throw TUnicodeException(charSrcIdx + 1, c2, "The second character of a surrogate pair should be in the range " + TInt::GetStr(Utf16SecondSurrogate, "%04x") + ".." + TInt::GetStr(Utf16SecondSurrogate + 1023, "%04x") + ", not " + TInt::GetStr(c2, "04x") + "."); case uehAbort: return nDecoded; // with uehReplace and uehIgnore, we'll just skip the first character of the surrogate pair; we'll process the second one during the next iteration, this time as an ordinary character case uehReplace: dest.Add(TDestCh(replacementChar)); srcIdx -= 1; continue; case uehIgnore: srcIdx -= 1; continue; default: Fail; } } // c and c2 each contain 10 bits of information. uint cc = ((c - Utf16FirstSurrogate) << 10) | (c2 - Utf16SecondSurrogate); cc += 0x10000; dest.Add(TDestCh(cc)); nDecoded++; continue; } else if (strict && Utf16SecondSurrogate <= c && c <= Utf16SecondSurrogate + 1023) { switch (errorHandling) { case uehThrow: throw TUnicodeException(charSrcIdx, c, "This 16-bit value should be used only as the second character of a surrogate pair."); case uehAbort: return nDecoded; case uehReplace: dest.Add(TDestCh(replacementChar)); continue; case uehIgnore: continue; default: Fail; } } // If 'c' is the first character in the input stream, and it's a BOM, we might have to skip it. if (charSrcIdx == origSrcIdx && (c == 0xfffeu || c == 0xfeffu) && skipBom) continue; // Otherwise, store 'c' to the destination vector. dest.Add(TDestCh(c)); nDecoded++; } return nDecoded; }
size_t TUniCodec::DecodeUtf8 | ( | const TSrcVec & | src, |
size_t | srcIdx, | ||
const size_t | srcCount, | ||
TVec< TDestCh > & | dest, | ||
const bool | clrDest = true |
||
) | const |
Definition at line 2036 of file unicode.h.
References TVec< TVal, TSizeTy >::Add(), TVec< TVal, TSizeTy >::Clr(), errorHandling, Fail, TInt::GetStr(), replacementChar, skipBom, strict, uehAbort, uehIgnore, uehReplace, and uehThrow.
Referenced by TUnicode::DecodeUtf8(), TUniChDb::SbEx_AddUtf8(), and TestUtf8().
{ size_t nDecoded = 0; if (clrDest) dest.Clr(); const size_t origSrcIdx = srcIdx; const size_t srcEnd = srcIdx + srcCount; while (srcIdx < srcEnd) { const size_t charSrcIdx = srcIdx; uint c = src[TVecIdx(srcIdx)] & 0xff; srcIdx++; if ((c & _1000_0000) == 0) { // c is one of the characters 0..0x7f, encoded as a single byte. dest.Add(TDestCh(c)); nDecoded++; continue; } else if ((c & _1100_0000) == _1000_0000) { // No character in a valid UTF-8-encoded string should begin with a byte of the form 10xxxxxx. // We must have been thrown into the middle of a multi-byte character. switch (errorHandling) { case uehThrow: throw TUnicodeException(charSrcIdx, c, "Invalid character: 10xxxxxx."); case uehAbort: return nDecoded; case uehReplace: dest.Add(TDestCh(replacementChar)); continue; case uehIgnore: continue; default: Fail; } } else { // c introduces a sequence of 2..6 bytes, depending on how many // of the most significant bits of c are set. uint nMoreBytes = 0, nBits = 0, minVal = 0; if ((c & _1110_0000) == _1100_0000) nMoreBytes = 1, nBits = 5, minVal = 0x80; else if ((c & _1111_0000) == _1110_0000) nMoreBytes = 2, nBits = 4, minVal = 0x800; else if ((c & _1111_1000) == _1111_0000) nMoreBytes = 3, nBits = 3, minVal = 0x10000; else if ((c & _1111_1100) == _1111_1000) nMoreBytes = 4, nBits = 2, minVal = 0x200000; else if ((c & _1111_1110) == _1111_1100) nMoreBytes = 5, nBits = 1, minVal = 0x4000000; else { // c is of the form 1111111x, which is invalid even in the early definitions of UTF-8 // (which allowed the encoding of codepoints up to 2^31 - 1). However, in principle this // could be used to encode 32-bit integers with the msb set: 1aaabbbbccccddddeeeeffffgggghhhh // could be encoded as 1111111a 10aabbbb 10ccccdd 10ddeeee 10ffffgg 10gghhhh. if (strict) { switch (errorHandling) { case uehThrow: throw TUnicodeException(charSrcIdx, c, "Invalid character: 1111111x."); case uehAbort: return nDecoded; // In the case of uehReplace and uehIgnore, we'll read the next 5 bytes // and try to decode the character. Then, since 'strict' is true and // the codepoint is clearly >= 2^31, we'll notice this as an error later // and (in the case of uehReplace) insert a replacement character then. // This is probably better than inserting a replacement character right // away and then trying to read the next byte as if a new character // was beginning there -- if the current byte is really followed by five // 10xxxxxx bytes, we'll just get six replacement characters in a row. case uehReplace: break; //dest.Add(TDestCh(replacementChar)); continue; case uehIgnore: break; // continue; default: Fail; } } nMoreBytes = 5; nBits = 2; minVal = 0x80000000u; } // Decode this multi-byte sequence. uint cOut = c & ((1 << nBits) - 1); // First extract the nBits least significant bits from c. bool cancel = false; for (uint i = 0; i < nMoreBytes && ! cancel; i++) { // See if there are enough bytes left in the source vector. if (! (srcIdx < srcEnd)) { switch (errorHandling) { case uehThrow: throw TUnicodeException(charSrcIdx, c, TInt::GetStr(nMoreBytes) + " more bytes expected, only " + TInt::GetStr(int(srcEnd - charSrcIdx - 1)) + " available."); case uehAbort: return nDecoded; case uehReplace: dest.Add(TDestCh(replacementChar)); cancel = true; continue; case uehIgnore: cancel = true; continue; default: Fail; } } // Read the next byte. c = src[TVecIdx(srcIdx)] & 0xff; srcIdx++; if ((c & _1100_0000) != _1000_0000) { // Each subsequent byte should be of the form 10xxxxxx. switch (errorHandling) { case uehThrow: throw TUnicodeException(charSrcIdx, c, "Byte " + TInt::GetStr(i) + " of " + TInt::GetStr(nMoreBytes) + " extra bytes should begin with 10xxxxxx."); case uehAbort: return nDecoded; case uehReplace: dest.Add(TDestCh(replacementChar)); srcIdx--; cancel = true; continue; case uehIgnore: srcIdx--; cancel = true; continue; default: Fail; } } cOut <<= 6; cOut |= (c & _0011_1111); } if (cancel) continue; if (strict) { // err1: This codepoint has been represented by more bytes than it should have been. // For example, cOut in the range 0..127 should be represented by a single byte, // not by two or more bytes. // - For example, this may happen in the "modified UTF-8" sometimes used for Java // serialization, where the codepoint 0 is encoded as 11000000 10000000 to avoid // the appearance of null bytes in the encoded stream. bool err1 = (cOut < minVal); // err2: Early definitions of UTF-8 allowed any 31-bit integer to be encoded, using up to 6 bytes. // However, later this was restricted to the codepoints 0..0x10ffff only, because only these // are valid Unicode codepoints. Thus, no more than 4 bytes are ever necessary. bool err2 = (nMoreBytes > 3 || (nMoreBytes == 3 && cOut > 0x10ffff)); if (err1 || err2) switch (errorHandling) { case uehThrow: if (err1) throw TUnicodeException(charSrcIdx, c, "The codepoint 0x" + TInt::GetStr(cOut, "%08x") + " has been represented by too many bytes (" + TInt::GetStr(nMoreBytes + 1) + ")."); else if (err2) throw TUnicodeException(charSrcIdx, c, "Invalid multibyte sequence: it decodes into 0x" + TInt::GetStr(cOut, "%08x") + ", but only codepoints 0..0x10ffff are valid."); else { Fail; break; } case uehAbort: return nDecoded; case uehReplace: dest.Add(TDestCh(replacementChar)); continue; case uehIgnore: continue; default: Fail; } } // Add the decoded codepoint to the destination vector. // If this is the first decoded character, and it's one of the byte-order marks // (0xfffe and 0xfeff), we will skip it (unless skipBom is false). if (! (skipBom && (cOut == 0xfffe || cOut == 0xfeff) && charSrcIdx == origSrcIdx)) { dest.Add(cOut); nDecoded++; } } // else (multi-byte sequence) } // while return nDecoded; }
size_t TUniCodec::DecodeUtf8 | ( | const TSrcVec & | src, |
TVec< TDestCh > & | dest, | ||
const bool | clrDest = true |
||
) | const [inline] |
Definition at line 136 of file unicode.h.
References DecodeUtf8().
Referenced by DecodeUtf8().
{ return DecodeUtf8(src, 0, src.Len(), dest, clrDest); }
size_t TUniCodec::EncodeUtf16ToBytes | ( | const TSrcVec & | src, |
size_t | srcIdx, | ||
const size_t | srcCount, | ||
TVec< TDestCh > & | dest, | ||
const bool | clrDest, | ||
const bool | insertBom, | ||
const TUniByteOrder | destByteOrder = boMachineEndian |
||
) | const |
Definition at line 2428 of file unicode.h.
References ___OutRepl, TVec< TVal, TSizeTy >::Add(), boLittleEndian, boMachineEndian, errorHandling, Fail, TUInt::GetStr(), IAssert, IsMachineLittleEndian(), uehAbort, uehIgnore, uehReplace, uehThrow, Utf16FirstSurrogate, and Utf16SecondSurrogate.
Referenced by TUnicode::EncodeUtf16ToBytes(), and TestUtf16().
{ bool isDestLe = (destByteOrder == boLittleEndian || (destByteOrder == boMachineEndian && IsMachineLittleEndian())); size_t nEncoded = 0, srcEnd = srcIdx + srcCount; if (insertBom) { dest.Add(isDestLe ? 0xff : 0xfe); dest.Add(isDestLe ? 0xfe : 0xff); nEncoded++; } while (srcIdx < srcEnd) { uint c = uint(src[TVecIdx(srcIdx)]); srcIdx++; if (! (c <= 0x10ffffu)) { switch (errorHandling) { case uehThrow: throw TUnicodeException(srcIdx - 1, c, "UTF-16 only supports characters in the range 0..10ffff (not " + TUInt::GetStr(c, "%08x") + ")."); case uehAbort: return nEncoded; #define ___OutRepl if (isDestLe) { dest.Add(replacementChar & 0xff); dest.Add((replacementChar >> 8) & 0xff); } else { dest.Add((replacementChar >> 8) & 0xff); dest.Add(replacementChar & 0xff); } case uehReplace: ___OutRepl; continue; case uehIgnore: continue; default: Fail; } } if (Utf16FirstSurrogate <= c && c < Utf16FirstSurrogate + 1023) { switch (errorHandling) { case uehThrow: throw TUnicodeException(srcIdx - 1, c, "UTF-16 cannot encode " + TUInt::GetStr(c, "%04x") + " as it belongs to the first surrogate range (" + TUInt::GetStr(Utf16FirstSurrogate, "%04x") + ".." + TUInt::GetStr(Utf16FirstSurrogate + 1023, "%04x") + ")."); case uehAbort: return nEncoded; case uehReplace: ___OutRepl; continue; case uehIgnore: continue; default: Fail; } } if (Utf16SecondSurrogate <= c && c < Utf16SecondSurrogate + 1023) { switch (errorHandling) { case uehThrow: throw TUnicodeException(srcIdx - 1, c, "The character " + TUInt::GetStr(c, "%04x") + " belongs to the second surrogate range (" + TUInt::GetStr(Utf16FirstSurrogate, "%04x") + ".." + TUInt::GetStr(Utf16FirstSurrogate + 1023, "%04x") + "), which is not allowed with strict == true."); case uehAbort: return nEncoded; case uehReplace: ___OutRepl; continue; case uehIgnore: continue; default: Fail; } } #undef ___OutRepl // If c is <= 0xffff, it can be stored directly. if (c <= 0xffffu) { if (isDestLe) { dest.Add(c & 0xff); dest.Add((c >> 8) & 0xff); } else { dest.Add((c >> 8) & 0xff); dest.Add(c & 0xff); } nEncoded++; continue; } // Otherwise, represent c by a pair of surrogate characters. c -= 0x10000u; IAssert(/*0 <= c &&*/ c <= 0xfffffu); uint c1 = (c >> 10) & 1023, c2 = c & 1023; c1 += Utf16FirstSurrogate; c2 += Utf16SecondSurrogate; if (isDestLe) { dest.Add(c1 & 0xff); dest.Add((c1 >> 8) & 0xff); dest.Add(c2 & 0xff); dest.Add((c2 >> 8) & 0xff); } else { dest.Add((c1 >> 8) & 0xff); dest.Add(c1 & 0xff); dest.Add((c2 >> 8) & 0xff); dest.Add(c2 & 0xff); } nEncoded++; continue; } return nEncoded; }
size_t TUniCodec::EncodeUtf16ToWords | ( | const TSrcVec & | src, |
size_t | srcIdx, | ||
const size_t | srcCount, | ||
TVec< TDestCh > & | dest, | ||
const bool | clrDest, | ||
const bool | insertBom, | ||
const TUniByteOrder | destByteOrder = boMachineEndian |
||
) | const |
Definition at line 2376 of file unicode.h.
References TVec< TVal, TSizeTy >::Add(), boBigEndian, boLittleEndian, errorHandling, Fail, TUInt::GetStr(), IAssert, IsMachineLittleEndian(), replacementChar, SwapBytes(), uehAbort, uehIgnore, uehReplace, uehThrow, Utf16FirstSurrogate, and Utf16SecondSurrogate.
Referenced by TUnicode::EncodeUtf16ToWords(), and TestUtf16().
{ bool isMachineLe = IsMachineLittleEndian(); bool swap = (destByteOrder == boLittleEndian && ! isMachineLe) || (destByteOrder == boBigEndian && isMachineLe); size_t nEncoded = 0, srcEnd = srcIdx + srcCount; if (insertBom) { dest.Add(TDestCh(swap ? 0xfffeu : 0xfeffu)); nEncoded++; } while (srcIdx < srcEnd) { uint c = uint(src[TVecIdx(srcIdx)]); srcIdx++; if (! (c <= 0x10ffffu)) { switch (errorHandling) { case uehThrow: throw TUnicodeException(srcIdx - 1, c, "UTF-16 only supports characters in the range 0..10ffff (not " + TUInt::GetStr(c, "%08x") + ")."); case uehAbort: return nEncoded; case uehReplace: dest.Add(TDestCh(swap ? SwapBytes(replacementChar) : replacementChar)); continue; case uehIgnore: continue; default: Fail; } } if (Utf16FirstSurrogate <= c && c < Utf16FirstSurrogate + 1023) { switch (errorHandling) { case uehThrow: throw TUnicodeException(srcIdx - 1, c, "UTF-16 cannot encode " + TUInt::GetStr(c, "%04x") + " as it belongs to the first surrogate range (" + TUInt::GetStr(Utf16FirstSurrogate, "%04x") + ".." + TUInt::GetStr(Utf16FirstSurrogate + 1023, "%04x") + ")."); case uehAbort: return nEncoded; case uehReplace: dest.Add(TDestCh(swap ? SwapBytes(replacementChar) : replacementChar)); continue; case uehIgnore: continue; default: Fail; } } if (Utf16SecondSurrogate <= c && c < Utf16SecondSurrogate + 1023) { switch (errorHandling) { case uehThrow: throw TUnicodeException(srcIdx - 1, c, "The character " + TUInt::GetStr(c, "%04x") + " belongs to the second surrogate range (" + TUInt::GetStr(Utf16FirstSurrogate, "%04x") + ".." + TUInt::GetStr(Utf16FirstSurrogate + 1023, "%04x") + "), which is not allowed with strict == true."); case uehAbort: return nEncoded; case uehReplace: dest.Add(TDestCh(swap ? SwapBytes(replacementChar) : replacementChar)); continue; case uehIgnore: continue; default: Fail; } } // If c is <= 0xffff, it can be stored directly. if (c <= 0xffffu) { if (swap) c = ((c >> 8) & 0xff) | ((c & 0xff) << 8); dest.Add(TDestCh(c)); nEncoded++; continue; } // Otherwise, represent c by a pair of surrogate characters. c -= 0x10000u; IAssert(/*0 <= c &&*/ c <= 0xfffffu); uint c1 = (c >> 10) & 1023, c2 = c & 1023; c1 += Utf16FirstSurrogate; c2 += Utf16SecondSurrogate; if (swap) { c1 = ((c1 >> 8) & 0xff) | ((c1 & 0xff) << 8); c2 = ((c2 >> 8) & 0xff) | ((c2 & 0xff) << 8); } dest.Add(TDestCh(c1)); dest.Add(TDestCh(c2)); nEncoded++; continue; } return nEncoded; }
size_t TUniCodec::EncodeUtf8 | ( | const TSrcVec & | src, |
size_t | srcIdx, | ||
const size_t | srcCount, | ||
TVec< TDestCh > & | dest, | ||
const bool | clrDest = true |
||
) | const |
Definition at line 2152 of file unicode.h.
References TVec< TVal, TSizeTy >::Add(), errorHandling, Fail, TInt::GetStr(), replacementChar, strict, uehAbort, uehIgnore, uehReplace, and uehThrow.
Referenced by TUnicode::EncodeUtf8(), EncodeUtf8Str(), and TestUtf8().
{ size_t nEncoded = 0; for (const size_t srcEnd = srcIdx + srcCount; srcIdx < srcEnd; srcIdx++) { uint c = uint(src[TVecIdx(srcIdx)]); bool err = false; if (strict && c > 0x10ffff) { err = true; switch (errorHandling) { case uehThrow: throw TUnicodeException(srcIdx, c, "Invalid character (0x" + TInt::GetStr(c, "%x") + "; only characters in the range 0..0x10ffff are allowed)."); case uehAbort: return nEncoded; case uehReplace: c = replacementChar; break; case uehIgnore: continue; default: Fail; } } if (c < 0x80u) dest.Add(TDestCh(c & 0xffu)); else if (c < 0x800u) { dest.Add(TDestCh(_1100_0000 | ((c >> 6) & _0001_1111))); dest.Add(TDestCh(_1000_0000 | (c & _0011_1111))); } else if (c < 0x10000u) { dest.Add(TDestCh(_1110_0000 | ((c >> 12) & _0000_1111))); dest.Add(TDestCh(_1000_0000 | ((c >> 6) & _0011_1111))); dest.Add(TDestCh(_1000_0000 | (c & _0011_1111))); } else if (c < 0x200000u) { dest.Add(TDestCh(_1111_0000 | ((c >> 18) & _0000_0111))); dest.Add(TDestCh(_1000_0000 | ((c >> 12) & _0011_1111))); dest.Add(TDestCh(_1000_0000 | ((c >> 6) & _0011_1111))); dest.Add(TDestCh(_1000_0000 | (c & _0011_1111))); } else if (c < 0x4000000u) { dest.Add(TDestCh(_1111_1000 | ((c >> 24) & _0000_0011))); dest.Add(TDestCh(_1000_0000 | ((c >> 18) & _0011_1111))); dest.Add(TDestCh(_1000_0000 | ((c >> 12) & _0011_1111))); dest.Add(TDestCh(_1000_0000 | ((c >> 6) & _0011_1111))); dest.Add(TDestCh(_1000_0000 | (c & _0011_1111))); } else { dest.Add(TDestCh(_1111_1100 | ((c >> 30) & _0000_0011))); dest.Add(TDestCh(_1000_0000 | ((c >> 24) & _0011_1111))); dest.Add(TDestCh(_1000_0000 | ((c >> 18) & _0011_1111))); dest.Add(TDestCh(_1000_0000 | ((c >> 12) & _0011_1111))); dest.Add(TDestCh(_1000_0000 | ((c >> 6) & _0011_1111))); dest.Add(TDestCh(_1000_0000 | (c & _0011_1111))); } if (! err) nEncoded++; } return nEncoded; }
size_t TUniCodec::EncodeUtf8 | ( | const TSrcVec & | src, |
TVec< TDestCh > & | dest, | ||
const bool | clrDest = true |
||
) | const [inline] |
Definition at line 145 of file unicode.h.
References EncodeUtf8().
Referenced by EncodeUtf8().
{ return EncodeUtf8(src, 0, src.Len(), dest, clrDest); }
TStr TUniCodec::EncodeUtf8Str | ( | const TSrcVec & | src, |
size_t | srcIdx, | ||
const size_t | srcCount | ||
) | const [inline] |
Definition at line 149 of file unicode.h.
References EncodeUtf8().
Referenced by TUnicode::EncodeUtf8Str().
{ TVec<char> temp; EncodeUtf8(src, srcIdx, srcCount, temp); TStr retVal = &(temp[0]); return retVal; }
TStr TUniCodec::EncodeUtf8Str | ( | const TSrcVec & | src | ) | const [inline] |
Definition at line 150 of file unicode.h.
References TVec< TVal, TSizeTy >::Add(), and EncodeUtf8().
{ TVec<char> temp; EncodeUtf8(src, temp); temp.Add(0); TStr retVal = &(temp[0]); return retVal; }
uint TUniCodec::GetRndUint | ( | TRnd & | rnd | ) | [static, protected] |
Definition at line 62 of file unicode.cpp.
References TRnd::GetUniDevUInt().
Referenced by GetRndUint(), TestDecodeUtf16(), and TestDecodeUtf8().
{ uint u = rnd.GetUniDevUInt(256) & 0xff; u <<= 8; u |= (rnd.GetUniDevUInt(256) & 0xff); u <<= 8; u |= (rnd.GetUniDevUInt(256) & 0xff); u <<= 8; u |= (rnd.GetUniDevUInt(256) & 0xff); return u; }
uint TUniCodec::GetRndUint | ( | TRnd & | rnd, |
uint | minVal, | ||
uint | maxVal | ||
) | [static, protected] |
Definition at line 71 of file unicode.cpp.
References GetRndUint(), TUInt::Mn, and TUInt::Mx.
{ if (minVal == TUInt::Mn && maxVal == TUInt::Mx) return GetRndUint(rnd); uint range = maxVal - minVal + 1; if (range > (uint(1) << (8 * sizeof(uint) - 1))) while (true) { uint u = GetRndUint(rnd); if (u < range) return minVal + u; } uint mask = 1; while (mask < range) mask <<= 1; mask -= 1; while (true) { uint u = GetRndUint(rnd) & mask; if (u < range) return minVal + u; } }
bool TUniCodec::IsMachineLittleEndian | ( | ) | [static, protected] |
Definition at line 83 of file unicode.cpp.
Referenced by DecodeUtf16FromBytes(), DecodeUtf16FromWords(), EncodeUtf16ToBytes(), EncodeUtf16ToWords(), TestDecodeUtf16(), TestUtf16(), and WordsToBytes().
{ static bool isLE, initialized = false; if (initialized) return isLE; int i = 1; if(*(char *)&i == 1) isLE = true; else isLE = false; initialized = true; return isLE; }
static int TUniCodec::SwapBytes | ( | int | x | ) | [inline, static, protected] |
Definition at line 250 of file unicode.h.
Referenced by EncodeUtf16ToWords(), TestDecodeUtf16(), and TestUtf16().
{
return ((x >> 8) & 0xff) | ((x & 0xff) << 8); }
void TUniCodec::TestDecodeUtf16 | ( | TRnd & | rnd, |
const TStr & | testCaseDesc, | ||
const TUtf16BomHandling | bomHandling, | ||
const TUniByteOrder | defaultByteOrder, | ||
const bool | insertBom | ||
) | [protected] |
Definition at line 341 of file unicode.cpp.
References TVec< TVal, TSizeTy >::Add(), boLittleEndian, boMachineEndian, bomRequired, TStr::CStr(), errorHandling, Fail, GetRndUint(), IAssert, IsMachineLittleEndian(), TStr::Len(), TVec< TVal, TSizeTy >::Len(), replacementChar, skipBom, strict, SwapBytes(), TestUtf16(), uehAbort, uehReplace, uehThrow, Utf16FirstSurrogate, and Utf16SecondSurrogate.
Referenced by TestUtf16().
{ TIntV src; TIntV expectedDest; int expectedRetVal = 0; bool expectedAbort = false; FILE *f = 0; bool isMachineLe = IsMachineLittleEndian(); bool isDefaultLe = (defaultByteOrder == boLittleEndian || (defaultByteOrder == boMachineEndian && isMachineLe)); bool swap = (isMachineLe != isDefaultLe); if (insertBom) { src.Add(swap ? 0xfffe : 0xfeff); if (! skipBom) { expectedRetVal += 1; expectedDest.Add(0xfeff); } } else if (bomHandling == bomRequired) { expectedAbort = true; expectedRetVal = -1; } // testCaseDesc should consist single characters or pairs of characters, 'c[e]', where: // - 'c' defines the range from which the codepoint should be taken ('A'..'E', 'X'..'Y'); // - 'e' defines how many words will be removed from the end of the encoded sequence for this codepoint. // (absent = 0, 'a' = 1). for (int i = 0; i < testCaseDesc.Len(); ) { const char c = testCaseDesc[i++]; uint cp = 0; int nWords = -1; if (c == 'X' || c == 'Y') IAssert(i > 1); // if you want a BOM at the beginning of your data, use insertBom -- if we permit X and Y here, predicting the expectedDest and expectedRetVal gets more complicated if (c == 'A') { cp = GetRndUint(rnd, 0u, Utf16FirstSurrogate - 1); nWords = 1; } // characters below the first surrogate range else if (c == 'B') { cp = GetRndUint(rnd, Utf16FirstSurrogate, Utf16FirstSurrogate + 1023); nWords = 1; } // the first surrogate range else if (c == 'C') { cp = GetRndUint(rnd, Utf16SecondSurrogate, Utf16SecondSurrogate + 1023); nWords = 1; } // the second surrogate range else if (c == 'D') { do { cp = GetRndUint(rnd, Utf16SecondSurrogate + 1024, 0xffffu); } while (cp == 0xfffe || cp == 0xfeff); nWords = 1; } // above the second surrogate range, but still in the BMP else if (c == 'E') { cp = GetRndUint(rnd, 0x10000u, 0x10ffffu); nWords = 2; } // above the BMP, but still within the range for UTF-16 else if (c == 'X') { cp = 0xfffe; nWords = 1; } else if (c == 'Y') { cp = 0xfeff; nWords = 1; } else Fail; if (c == 'B' && i < testCaseDesc.Len()) IAssert(testCaseDesc[i] != 'C'); // Process 'e'. int nToDel = 0; if (i < testCaseDesc.Len()) { const char e = testCaseDesc[i]; if (e >= 'a') { i += 1; nToDel = 1; }} IAssert((nWords == 1 && nToDel == 0) || (nWords == 2 && (nToDel == 0 || nToDel == 1))); if (nWords == 2 && nToDel == 1 && i < testCaseDesc.Len()) IAssert(testCaseDesc[i] != 'C'); // Will an error occur during the decoding of this codepoint? bool errHere = false; if (Utf16FirstSurrogate <= cp && cp <= Utf16FirstSurrogate + 1023) errHere = true; else if (cp > 0x10ffff) { Fail; errHere = true; } else if (nToDel > 0) errHere = true; else if (strict && (Utf16SecondSurrogate <= cp && cp <= Utf16SecondSurrogate + 1023)) errHere = true; // Update 'expectedDest' and 'expectedRetVal'. if (! expectedAbort) { if (! errHere) { if (src.Len() == 0 && (cp == 0xfffe || cp == 0xfeff) && skipBom) { } else { expectedDest.Add(cp); expectedRetVal += 1; } } else if (errorHandling == uehReplace) { expectedDest.Add(replacementChar); } if (errHere && (errorHandling == uehAbort || errorHandling == uehThrow)) expectedAbort = true; } // Update 'src'. if (nWords == 1) src.Add(swap ? SwapBytes(cp) : cp); else { int c1 = ((cp - 0x10000) >> 10) & 1023; c1 += Utf16FirstSurrogate; int c2 = (cp - 0x10000) & 1023; c2 += Utf16SecondSurrogate; src.Add(swap ? SwapBytes(c1) : c1); if (nToDel == 0) src.Add(swap ? SwapBytes(c2) : c2); } } if (f) fprintf(f, "Test case: \"%s\"\n", testCaseDesc.CStr()); TestUtf16(true, expectedRetVal, expectedAbort && (errorHandling == uehThrow), src, expectedDest, bomHandling, defaultByteOrder, false, f); }
void TUniCodec::TestDecodeUtf8 | ( | TRnd & | rnd, |
const TStr & | testCaseDesc | ||
) | [protected] |
Definition at line 133 of file unicode.cpp.
References TVec< TVal, TSizeTy >::Add(), TStr::CStr(), errorHandling, Fail, GetRndUint(), IAssert, TStr::Len(), TVec< TVal, TSizeTy >::Len(), replacementChar, skipBom, strict, TestUtf8(), uehAbort, uehReplace, and uehThrow.
Referenced by TestUtf8().
{ TIntV src; TIntV expectedDest; int expectedRetVal = 0; bool expectedAbort = false; FILE *f = 0; // stderr // testCaseDesc should consist of pairs or triples of characters, 'cd[e]', where: // - 'c' defines the range from which the codepoint should be taken ('A'..'H', 'X'..'Z'); // - 'd' defines how many bytes the codepoint should be encoded with ('1'..'6'); // - 'e' defines how many bytes will be removed from the end of the encoded sequence for this codepoint. // (absent = 0, 'a' = 1, 'b' = 2 and so on). for (int i = 0; i < testCaseDesc.Len(); ) { IAssert(i + 2 <= testCaseDesc.Len()); const char c = testCaseDesc[i], d = testCaseDesc[i + 1]; i += 2; uint cp = 0; int nBytes = -1, minBytes = -1; bool eighties = false; IAssert('1' <= d && d <= '6'); nBytes = d - '0'; if (c == 'A') { cp = GetRndUint(rnd, 0u, 0x7fu); minBytes = 1; } // 1 byte else if (c == 'B') { cp = GetRndUint(rnd, 0x80u, 0x7ffu); minBytes = 2; } // 2 bytes else if (c == 'C') { cp = GetRndUint(rnd, 0x800u, 0xffffu); minBytes = 3; } // 3 bytes else if (c == 'D') { cp = GetRndUint(rnd, 0x10000u, 0x10ffffu); minBytes = 4; } // 4 bytes, valid Unicode else if (c == 'E') { cp = GetRndUint(rnd, 0x110000u, 0x1fffffu); minBytes = 4; } // 4 bytes, invalid Unicode else if (c == 'F') { cp = GetRndUint(rnd, 0x200000u, 0x3ffffffu); minBytes = 5; } // 5 bytes else if (c == 'G') { cp = GetRndUint(rnd, 0x4000000u, 0x7fffffffu); minBytes = 6; } // 6 bytes, 31 bits else if (c == 'H') { cp = GetRndUint(rnd, 0x80000000u, 0xffffffffu); minBytes = 6; } // 6 bytes, 32 bits else if (c == 'X') { cp = 0xfffe; minBytes = 3; } else if (c == 'Y') { cp = 0xfeff; minBytes = 3; } else if (c == 'Z') { eighties = true; minBytes = 1; } // insert several random 10xxxxxx bytes (= 0x80 | random(0..0x3f)) else Fail; IAssert(nBytes >= minBytes); // Process 'e'. int nToDel = 0; if (i < testCaseDesc.Len()) { const char e = testCaseDesc[i]; if (e >= 'a' && e <= 'e') { i += 1; nToDel = e - 'a' + 1; }} IAssert(nToDel < nBytes); // Will an error occur during the decoding of this codepoint? bool errHere = false; if (eighties) errHere = true; else if (nToDel > 0) errHere = true; else if (strict && (cp >= 0x10ffff || nBytes > minBytes)) errHere = true; // Update 'expectedDest' and 'expetedRetVal'. if (! expectedAbort) { if (! errHere) { if (src.Len() == 0 && (cp == 0xfffe || cp == 0xfeff) && skipBom) { } else { expectedDest.Add(cp); expectedRetVal += 1; } } else if (errorHandling == uehReplace) { if (eighties) for (int j = 0; j < nBytes; j++) expectedDest.Add(replacementChar); else expectedDest.Add(replacementChar); } if (errHere && (errorHandling == uehAbort || errorHandling == uehThrow)) expectedAbort = true; } // Update 'src'. if (eighties) for (int j = 0; j < nBytes; j++) src.Add(GetRndUint(rnd, 0x80, 0xff)); else if (nBytes == 1) src.Add(cp); else { int mask = (1 << nBytes) - 1; mask <<= (8 - nBytes); src.Add(mask | (uint(cp) >> (6 * (nBytes - 1)))); for (int j = 1; j < nBytes - nToDel; j++) src.Add(0x80 | ((cp >> (6 * (nBytes - j - 1))) & _0011_1111)); } } if (f) fprintf(f, "Test case: \"%s\"\n", testCaseDesc.CStr()); TestUtf8(true, expectedRetVal, expectedAbort && (errorHandling == uehThrow), src, expectedDest, f); }
void TUniCodec::TestUtf16 | ( | bool | decode, |
size_t | expectedRetVal, | ||
bool | expectedThrow, | ||
const TIntV & | src, | ||
const TIntV & | expectedDest, | ||
const TUtf16BomHandling | bomHandling, | ||
const TUniByteOrder | defaultByteOrder, | ||
const bool | insertBom, | ||
FILE * | f | ||
) | [protected] |
Definition at line 284 of file unicode.cpp.
References boBigEndian, boLittleEndian, bomAllowed, bomRequired, TVec< TVal, TSizeTy >::Clr(), TStr::CStr(), DecodeUtf16FromBytes(), DecodeUtf16FromWords(), EncodeUtf16ToBytes(), EncodeUtf16ToWords(), errorHandling, IAssert, TVec< TVal, TSizeTy >::Len(), TUnicodeException::message, replacementChar, skipBom, TUnicodeException::srcChar, TUnicodeException::srcIdx, strict, uehAbort, uehIgnore, uehReplace, uehThrow, and WordsToBytes().
{ TIntV srcBytes, expectedDestBytes; WordsToBytes(src, srcBytes); WordsToBytes(expectedDest, expectedDestBytes); TIntV dest; if (f) { fprintf(f, "Settings: %s %s %s %s %s replacementChar = %x \n", (errorHandling == uehAbort ? "abort" : errorHandling == uehThrow ? "throw" : errorHandling == uehIgnore ? "ignore" : errorHandling == uehReplace ? "replace" : "????"), (strict ? "STRICT" : ""), (decode ? (skipBom ? "skipBom" : "") : (insertBom ? "insrtBom" : "")), (bomHandling == bomAllowed ? "bomAllowed" : bomHandling == bomRequired ? "bomRequired" : "bomIgnored"), (defaultByteOrder == boBigEndian ? "boBigEndian" : defaultByteOrder == boLittleEndian ? "boLittleEndian" : "boMachineEndian"), uint(replacementChar)); fprintf(f, "src: "); for (int i = 0; i < src.Len(); i++) fprintf(f, (decode ? " %04x" : " %x"), uint(src[i])); } for (int useBytes = 0; useBytes < 2; useBytes++) { const char *fmt = (useBytes ? " %02x" : " %04x"); try { dest.Clr(); size_t retVal; if (! useBytes) { if (decode) retVal = DecodeUtf16FromWords(src, 0, src.Len(), dest, true, bomHandling, defaultByteOrder); else retVal = EncodeUtf16ToWords(src, 0, src.Len(), dest, true, insertBom, defaultByteOrder); } else { if (decode) retVal = DecodeUtf16FromBytes(srcBytes, 0, srcBytes.Len(), dest, true, bomHandling, defaultByteOrder); else retVal = EncodeUtf16ToBytes(src, 0, src.Len(), dest, true, insertBom, defaultByteOrder); } const TIntV& ed = (useBytes && ! decode ? expectedDestBytes : expectedDest); if (f) { fprintf(f, "\n -> dest: "); for (int i = 0; i < dest.Len(); i++) fprintf(f, (decode ? " %x" : fmt), uint(dest[i])); fprintf(f, "\n expDest "); for (int i = 0; i < ed.Len(); i++) fprintf(f, (decode ? " %x" : fmt), uint(ed[i])); fprintf(f, "\n retVal = %llu (expected %llu)\n", static_cast<long long unsigned int> (retVal), static_cast<long long unsigned int> (expectedRetVal)); } bool ok = true; if (retVal != expectedRetVal) ok = false; if (dest.Len() != ed.Len()) ok = false; if (ok) for (int i = 0; i < dest.Len(); i++) if (dest[i] != ed[i]) ok = false; if (! ok) { printf("!!!\n"); } IAssert(retVal == expectedRetVal); IAssert(! expectedThrow); IAssert(dest.Len() == ed.Len()); for (int i = 0; i < dest.Len(); i++) IAssert(dest[i] == ed[i]); } catch (TUnicodeException e) { if (f) { fprintf(f, "\n -> expDest "); for (int i = 0; i < expectedDest.Len(); i++) fprintf(f, (decode ? " %x" : fmt), uint(expectedDest[i])); fprintf(f, "\n exception \"%s\" at %d (char 0x%02x)\n", e.message.CStr(), int(e.srcIdx), uint(e.srcChar)); } IAssert(expectedThrow); } } }
void TUniCodec::TestUtf16 | ( | ) |
Definition at line 408 of file unicode.cpp.
References TVec< TVal, TSizeTy >::Add(), boLittleEndian, boMachineEndian, bomAllowed, bomRequired, TVec< TVal, TSizeTy >::Clr(), errorHandling, TVec< TVal, TSizeTy >::Gen(), IsMachineLittleEndian(), TVec< TVal, TSizeTy >::Len(), TUInt::Mx, replacementChar, TVec< TVal, TSizeTy >::Reserve(), skipBom, strict, SwapBytes(), TestDecodeUtf16(), uehReplace, uehThrow, Utf16FirstSurrogate, and Utf16SecondSurrogate.
Referenced by TestDecodeUtf16().
{ TIntV utf16ReplCh; utf16ReplCh.Add(replacementChar); for (int skipBom_ = 0; skipBom_ < 2; skipBom_++) for (int strict_ = 0; strict_ < 2; strict_++) for (int errMode_ = 0; errMode_ < 4; errMode_++) for (int bomHandling_ = 0; bomHandling_ < 3; bomHandling_++) for (int byteOrder_ = 0; byteOrder_ < 3; byteOrder_++) for (int insertBom_ = 0; insertBom_ < 2; insertBom_++) { strict = (strict_ == 1); errorHandling = TUnicodeErrorHandling(errMode_); skipBom = (skipBom_ == 1); bool insertBom = (insertBom_ == 1); TUniByteOrder byteOrder = (TUniByteOrder) byteOrder_; TUtf16BomHandling bomHandling = (TUtf16BomHandling) bomHandling_; TRnd rnd = TRnd(123); // Test DecodeUtf16 on various random UTF-16-encoded sequences. for (int i = 0; i < 10; i++) { TestDecodeUtf16(rnd, "A", bomHandling, byteOrder, insertBom); TestDecodeUtf16(rnd, "AAA", bomHandling, byteOrder, insertBom); TestDecodeUtf16(rnd, "B", bomHandling, byteOrder, insertBom); TestDecodeUtf16(rnd, "DDAADADAAADDDAA", bomHandling, byteOrder, insertBom); TestDecodeUtf16(rnd, "DEEEDAAEEDADEEAAEEADEEDDAA", bomHandling, byteOrder, insertBom); TestDecodeUtf16(rnd, "DEaEaEDAAEaEDADEaEAAEEADEEDDAA", bomHandling, byteOrder, insertBom); TestDecodeUtf16(rnd, "CABDEBACCEaB", bomHandling, byteOrder, insertBom); TestDecodeUtf16(rnd, "EaEEEEaBBACABXABYXXEaYDDXBDCEA", bomHandling, byteOrder, insertBom); TestDecodeUtf16(rnd, "EaEEEEaBDCAAXADYXXEaYDDXDCEA", bomHandling, byteOrder, insertBom); } //continue; // Test both DecodeUtf16 and EncodeUtf16 systematically on various characters // close to powers of 2. TIntV src, expectedDest, src2; expectedDest.Gen(1); src.Reserve(6); src2.Gen(1); for (int pow = 8; pow <= 32; pow++) { uint uFrom, uTo; if (pow == 8) uFrom = 0, uTo = 1u << pow; else if (pow == 32) uFrom = TUInt::Mx - (1u << 8), uTo = TUInt::Mx; else uFrom = (1u << pow) - (1u << 8), uTo = (1u << pow) + (1u << 8); printf("%u..%u \r", uFrom, uTo); for (uint u = uFrom; ; u++) { int nWords = 0; if (u < 0x10000) nWords = 1; else nWords = 2; bool isMachineLe = IsMachineLittleEndian(), isDestLe = (byteOrder == boLittleEndian || (byteOrder == boMachineEndian && isMachineLe)); bool swap = (isMachineLe != isDestLe); bool err = (u > 0x10ffff) || (Utf16FirstSurrogate <= u && u <= Utf16FirstSurrogate + 1023) || (strict && Utf16SecondSurrogate <= u && u <= Utf16SecondSurrogate + 1023); src.Gen(3, (err ? 0 : nWords) + (insertBom ? 1 : 0)); if (insertBom) src[0] = (swap ? 0xfffe : 0xfeff); if (! ((u > 0x10ffff) || (Utf16FirstSurrogate <= u && u <= Utf16FirstSurrogate + 1023))) { // Try to encode 'u' and see if it gets decoded correctly. if (nWords == 1) src[insertBom ? 1 : 0] = (swap ? SwapBytes(u) : u); else { int u1 = Utf16FirstSurrogate + (((u - 0x10000) >> 10) & 1023); int u2 = Utf16SecondSurrogate + ((u - 0x10000) & 1023); src[insertBom ? 1 : 0] = (swap ? SwapBytes(u1) : u1); src[insertBom ? 2 : 1] = (swap ? SwapBytes(u2) : u2); } if (! ((u == 0xfffe || u == 0xfeff) && bomHandling == bomAllowed && ! insertBom)) // this will just create a mess when decoding { expectedDest.Reserve(2, 0); if (insertBom && ! skipBom) expectedDest.Add(0xfeff); if (err && errorHandling == uehReplace) expectedDest.Add(replacementChar); else if (! err) expectedDest.Add(u); int erv = (err ? 0 : expectedDest.Len()); if (skipBom && (u == 0xfeff || u == 0xfffe) && ! insertBom) expectedDest.Clr(), erv = 0; bool errD = err; if (bomHandling == bomRequired && ! insertBom) { expectedDest.Clr(false); if (u == 0xfeff || u == 0xfffe) { erv = (skipBom ? 0 : 1); if (! skipBom) expectedDest.Add(0xfeff); } else { erv = -1; errD = true; /*if (errorHandling == uehReplace) expectedDest.Add(replacementChar);*/ }} TestUtf16(true, erv, (errD && errorHandling == uehThrow), src, expectedDest, bomHandling, byteOrder, insertBom, 0); } } // We can also test the UTF-16 encoder. src2[0] = u; if (err) { src.Clr(false); if (insertBom) src.Add(swap ? 0xfffe : 0xfeff); if (errorHandling == uehReplace) { src.Add(swap ? SwapBytes(replacementChar) : replacementChar); /*if (byteOrder == boBigEndian || (byteOrder == boMachineEndian && ! TUniCodec::IsMachineLittleEndian())) { src.Add((replacementChar >> 8) & 0xff); src.Add(replacementChar & 0xff); } else { src.Add(replacementChar & 0xff); src.Add((replacementChar >> 8) & 0xff); } */ }} TestUtf16(false, (err ? 0 : 1) + (insertBom ? 1 : 0), (err && errorHandling == uehThrow), src2, src, bomHandling, byteOrder, insertBom, 0); // if (u == uTo) break; } } } }
void TUniCodec::TestUtf8 | ( | bool | decode, |
size_t | expectedRetVal, | ||
bool | expectedThrow, | ||
const TIntV & | src, | ||
const TIntV & | expectedDest, | ||
FILE * | f | ||
) | [protected] |
Definition at line 99 of file unicode.cpp.
References TStr::CStr(), DecodeUtf8(), EncodeUtf8(), errorHandling, IAssert, TVec< TVal, TSizeTy >::Len(), TUnicodeException::message, replacementChar, skipBom, TUnicodeException::srcChar, TUnicodeException::srcIdx, strict, uehAbort, uehIgnore, uehReplace, and uehThrow.
{ TIntV dest; if (f) { fprintf(f, "Settings: %s %s %s replacementChar = %x\n", (errorHandling == uehAbort ? "abort" : errorHandling == uehThrow ? "throw" : errorHandling == uehIgnore ? "ignore" : errorHandling == uehReplace ? "replace" : "????"), (strict ? "STRICT" : ""), (skipBom ? "skipBom" : ""), uint(replacementChar)); fprintf(f, "src: "); for (int i = 0; i < src.Len(); i++) fprintf(f, (decode ? " %02x" : " %x"), uint(src[i])); } try { size_t retVal = (decode ? DecodeUtf8(src, 0, src.Len(), dest, true) : EncodeUtf8(src, 0, src.Len(), dest, true)); if (f) { fprintf(f, "\n -> dest: "); for (int i = 0; i < dest.Len(); i++) fprintf(f, (decode ? " %x" : " %02x"), uint(dest[i])); fprintf(f, "\n expDest "); for (int i = 0; i < expectedDest.Len(); i++) fprintf(f, (decode ? " %x" : " %02x"), uint(expectedDest[i])); fprintf(f, "\n retVal = %llu (expected %llu)\n", static_cast<long long unsigned int> (retVal), static_cast<long long unsigned int> (expectedRetVal)); } if (retVal != expectedRetVal) printf("!!!"); IAssert(retVal == expectedRetVal); IAssert(! expectedThrow); if (dest.Len() != expectedDest.Len()) printf("!!!"); IAssert(dest.Len() == expectedDest.Len()); for (int i = 0; i < dest.Len(); i++) IAssert(dest[i] == expectedDest[i]); } catch (TUnicodeException e) { if (f) { fprintf(f, "\n -> expDest "); for (int i = 0; i < expectedDest.Len(); i++) fprintf(f, " %x", uint(expectedDest[i])); fprintf(f, "\n exception \"%s\" at %d (char 0x%02x)\n", e.message.CStr(), int(e.srcIdx), uint(e.srcChar)); } IAssert(expectedThrow); } }
void TUniCodec::TestUtf8 | ( | ) |
Definition at line 194 of file unicode.cpp.
References TVec< TVal, TSizeTy >::Add(), TVec< TVal, TSizeTy >::Clr(), EncodeUtf8(), errorHandling, TVec< TVal, TSizeTy >::Gen(), TUInt::Mx, replacementChar, TVec< TVal, TSizeTy >::Reserve(), skipBom, strict, TestDecodeUtf8(), uehReplace, and uehThrow.
Referenced by TestDecodeUtf8().
{ TIntV utf8ReplCh; EncodeUtf8((TVectorBuilder(), replacementChar).v, 0, 1, utf8ReplCh, true); for (int skipBom_ = 0; skipBom_ < 2; skipBom_++) for (int strict_ = 0; strict_ < 2; strict_++) for (int errMode_ = 0; errMode_ < 4; errMode_++) { strict = (strict_ == 1); errorHandling = TUnicodeErrorHandling(errMode_); skipBom = (skipBom_ == 1); TRnd rnd = TRnd(123); // Test DecodeUtf8 on various random UTF-8-encoded sequences. for (int i = 0; i < 10; i++) { TestDecodeUtf8(rnd, "X3A1A2A3A4A5A6B2B3B4B5B6C3C4C5C6D4D5D6E5E6F6G6"); TestDecodeUtf8(rnd, "X3A5dA6d"); TestDecodeUtf8(rnd, "X3A1B2C3D4E4F5A1G6H6Y3X3A1"); TestDecodeUtf8(rnd, "X3A1B2C3D4E4F5A2G6H6Y3X3A1"); TestDecodeUtf8(rnd, "Y3A1B2C3D4E4F5A1G6H6Y3X3A1"); TestDecodeUtf8(rnd, "A1B2C3D4E4F5A1G6H6Y3X3A1"); TestDecodeUtf8(rnd, "G6A1A1D4E4A1B2"); TestDecodeUtf8(rnd, "D4A1A1C3A1B2A1B2"); TestDecodeUtf8(rnd, "D4A1A1C3A1B2A1B2D4a"); TestDecodeUtf8(rnd, "X3A1B2C3D5E4F5A1G6H6Y3X3A1"); TestDecodeUtf8(rnd, "X3A1B2C3D4E5F5A1G6H6Y3X3A1"); TestDecodeUtf8(rnd, "X3A1B2C3D4aE4F5A1G6H6Y3X3A1"); TestDecodeUtf8(rnd, "X3A1B2C3D4bE4F5A1G6H6Y3X3A1"); TestDecodeUtf8(rnd, "X3A2aA3aA4aA5aA6aB2aB3aB4aB5aB6aC3aC4aC5aC6aD4aD5aD6aE5aE6aF6aG6a"); TestDecodeUtf8(rnd, "X3A3bA4bA5bA6aB3bB4bB5bB6bC3bC4bC5bC6bD4bD5bD6bE5bE6bF6bG6b"); TestDecodeUtf8(rnd, "X3A4cA5cA6cB4cB5cB6cC4cC5cC6cD4cD5cD6cE5cE6cF6cG6c"); TestDecodeUtf8(rnd, "X3A5dA6dB5dB6dC5dC6dD5dD6dE5dE6dF6dG6d"); TestDecodeUtf8(rnd, "X3A6eB6eC6eD6eE6eF6eG6e"); } // Test both DecodeUtf8 and EncodeUtf8 systematically on various characters // close to powers of 2. TIntV src, expectedDest, src2; expectedDest.Gen(1); src.Reserve(6); src2.Gen(1); for (int pow = 8; pow <= 32; pow++) { uint uFrom, uTo; if (pow == 8) uFrom = 0, uTo = 1u << pow; else if (pow == 32) uFrom = TUInt::Mx - (1u << 8), uTo = TUInt::Mx; else uFrom = (1u << pow) - (1u << 8), uTo = (1u << pow) + (1u << 8); printf("%u..%u \r", uFrom, uTo); for (uint u = uFrom; ; u++) { int nBytes = 0; if (u < (1u << 7)) nBytes = 1; else if (u < (1u << 11)) nBytes = 2; else if (u < (1u << 16)) nBytes = 3; else if (u < (1u << 21)) nBytes = 4; else if (u < (1u << 26)) nBytes = 5; else nBytes = 6; src.Gen(6, nBytes); if (nBytes == 1) src[0] = u; else { src[0] = (((1 << nBytes) - 1) << (8 - nBytes)) | (u >> (6 * (nBytes - 1))); for (int i = 1; i < nBytes; i++) src[i] = 0x80 | ((u >> (6 * (nBytes - i - 1))) & _0011_1111); } bool err = (strict && u > 0x10ffff); expectedDest.Reserve(1, 0); if (err && errorHandling == uehReplace) expectedDest.Add(replacementChar); else if (! err) expectedDest.Add(u); int erv = (err ? 0 : 1); if (skipBom && (u == 0xfeff || u == 0xfffe)) expectedDest.Clr(), erv = 0; TestUtf8(true, erv, (err && errorHandling == uehThrow), src, expectedDest, 0); // We can also test the UTF-8 encoder. src2[0] = u; if (err) { if (errorHandling == uehReplace) src = utf8ReplCh; else src.Clr(false); } TestUtf8(false, (err ? 0 : 1), (err && errorHandling == uehThrow), src2, src, 0); // if (u == uTo) break; } } } }
void TUniCodec::WordsToBytes | ( | const TIntV & | src, |
TIntV & | dest | ||
) | [protected] |
Definition at line 274 of file unicode.cpp.
References TVec< TVal, TSizeTy >::Add(), TVec< TVal, TSizeTy >::Clr(), IsMachineLittleEndian(), and TVec< TVal, TSizeTy >::Len().
Referenced by TestUtf16().
{ dest.Clr(); bool isLE = IsMachineLittleEndian(); for (int i = 0; i < src.Len(); i++) { int c = src[i] & 0xffff; if (isLE) { dest.Add(c & 0xff); dest.Add((c >> 8) & 0xff); } else { dest.Add((c >> 8) & 0xff); dest.Add(c & 0xff); } } }
friend class TUniCaseFolding [friend] |
Definition at line 66 of file unicode.h.
Referenced by DecodeUtf16FromBytes(), DecodeUtf16FromWords(), DecodeUtf8(), EncodeUtf16ToBytes(), EncodeUtf16ToWords(), EncodeUtf8(), TestDecodeUtf16(), TestDecodeUtf8(), TestUtf16(), and TestUtf8().
Definition at line 64 of file unicode.h.
Referenced by DecodeUtf16FromBytes(), DecodeUtf16FromWords(), DecodeUtf8(), EncodeUtf16ToWords(), EncodeUtf8(), TestDecodeUtf16(), TestDecodeUtf8(), TestUtf16(), and TestUtf8().
bool TUniCodec::skipBom |
Definition at line 89 of file unicode.h.
Referenced by DecodeUtf16FromBytes(), DecodeUtf16FromWords(), DecodeUtf8(), TestDecodeUtf16(), TestDecodeUtf8(), TestUtf16(), and TestUtf8().
bool TUniCodec::strict |
Definition at line 83 of file unicode.h.
Referenced by DecodeUtf16FromBytes(), DecodeUtf16FromWords(), DecodeUtf8(), EncodeUtf8(), TestDecodeUtf16(), TestDecodeUtf8(), TestUtf16(), and TestUtf8().