|
SNAP Library, User Reference
2012-10-15 15:06:59
SNAP, a general purpose network analysis and graph mining library
|
#include <unicode.h>
Public Types | |
| enum | { DefaultReplacementChar = 0xfffd } |
Public Member Functions | |
| TUniCodec () | |
| TUniCodec (TUnicodeErrorHandling errorHandling_, bool strict_, int replacementChar_, bool skipBom_) | |
| template<typename TSrcVec , typename TDestCh > | |
| size_t | DecodeUtf8 (const TSrcVec &src, size_t srcIdx, const size_t srcCount, TVec< TDestCh > &dest, const bool clrDest=true) const |
| template<typename TSrcVec , typename TDestCh > | |
| size_t | DecodeUtf8 (const TSrcVec &src, TVec< TDestCh > &dest, const bool clrDest=true) const |
| template<typename TSrcVec , typename TDestCh > | |
| size_t | EncodeUtf8 (const TSrcVec &src, size_t srcIdx, const size_t srcCount, TVec< TDestCh > &dest, const bool clrDest=true) const |
| template<typename TSrcVec , typename TDestCh > | |
| size_t | EncodeUtf8 (const TSrcVec &src, TVec< TDestCh > &dest, const bool clrDest=true) const |
| template<typename TSrcVec > | |
| TStr | EncodeUtf8Str (const TSrcVec &src, size_t srcIdx, const size_t srcCount) const |
| template<typename TSrcVec > | |
| TStr | EncodeUtf8Str (const TSrcVec &src) const |
| template<typename TSrcVec , typename TDestCh > | |
| size_t | DecodeUtf16FromBytes (const TSrcVec &src, size_t srcIdx, const size_t srcCount, TVec< TDestCh > &dest, const bool clrDest, const TUtf16BomHandling bomHandling=bomAllowed, const TUniByteOrder defaultByteOrder=boMachineEndian) const |
| template<typename TSrcVec , typename TDestCh > | |
| size_t | DecodeUtf16FromWords (const TSrcVec &src, size_t srcIdx, const size_t srcCount, TVec< TDestCh > &dest, bool clrDest, const TUtf16BomHandling bomHandling=bomAllowed, const TUniByteOrder defaultByteOrder=boMachineEndian) const |
| template<typename TSrcVec , typename TDestCh > | |
| size_t | EncodeUtf16ToWords (const TSrcVec &src, size_t srcIdx, const size_t srcCount, TVec< TDestCh > &dest, const bool clrDest, const bool insertBom, const TUniByteOrder destByteOrder=boMachineEndian) const |
| template<typename TSrcVec , typename TDestCh > | |
| size_t | EncodeUtf16ToBytes (const TSrcVec &src, size_t srcIdx, const size_t srcCount, TVec< TDestCh > &dest, const bool clrDest, const bool insertBom, const TUniByteOrder destByteOrder=boMachineEndian) const |
| void | TestUtf8 () |
| void | TestUtf16 () |
Public Attributes | |
| int | replacementChar |
| TUnicodeErrorHandling | errorHandling |
| bool | strict |
| bool | skipBom |
Protected Types | |
| enum | { DefineByte = (1, 0, 0, 0, 0, 0, 0, 0), DefineByte = (1, 0, 0, 0, 0, 0, 0, 0), DefineByte = (1, 0, 0, 0, 0, 0, 0, 0), DefineByte = (1, 0, 0, 0, 0, 0, 0, 0), DefineByte = (1, 0, 0, 0, 0, 0, 0, 0), DefineByte = (1, 0, 0, 0, 0, 0, 0, 0), DefineByte = (1, 0, 0, 0, 0, 0, 0, 0), DefineByte = (1, 0, 0, 0, 0, 0, 0, 0), DefineByte = (1, 0, 0, 0, 0, 0, 0, 0), DefineByte = (1, 0, 0, 0, 0, 0, 0, 0), DefineByte = (1, 0, 0, 0, 0, 0, 0, 0), DefineByte = (1, 0, 0, 0, 0, 0, 0, 0), DefineByte = (1, 0, 0, 0, 0, 0, 0, 0) } |
| enum | { Utf16FirstSurrogate = 0xd800, Utf16SecondSurrogate = 0xdc00 } |
| typedef TUniVecIdx | TVecIdx |
Protected Member Functions | |
| void | TestUtf8 (bool decode, size_t expectedRetVal, bool expectedThrow, const TIntV &src, const TIntV &expectedDest, FILE *f) |
| void | TestDecodeUtf8 (TRnd &rnd, const TStr &testCaseDesc) |
| void | WordsToBytes (const TIntV &src, TIntV &dest) |
| void | TestUtf16 (bool decode, size_t expectedRetVal, bool expectedThrow, const TIntV &src, const TIntV &expectedDest, const TUtf16BomHandling bomHandling, const TUniByteOrder defaultByteOrder, const bool insertBom, FILE *f) |
| void | TestDecodeUtf16 (TRnd &rnd, const TStr &testCaseDesc, const TUtf16BomHandling bomHandling, const TUniByteOrder defaultByteOrder, const bool insertBom) |
Static Protected Member Functions | |
| static bool | IsMachineLittleEndian () |
| static uint | GetRndUint (TRnd &rnd) |
| static uint | GetRndUint (TRnd &rnd, uint minVal, uint maxVal) |
| static int | SwapBytes (int x) |
Friends | |
| class | TUniCaseFolding |
typedef TUniVecIdx TUniCodec::TVecIdx [protected] |
| anonymous enum |
Definition at line 57 of file unicode.h.
{ DefaultReplacementChar = 0xfffd };
anonymous enum [protected] |
| DefineByte | |
| DefineByte | |
| DefineByte | |
| DefineByte | |
| DefineByte | |
| DefineByte | |
| DefineByte | |
| DefineByte | |
| DefineByte | |
| DefineByte | |
| DefineByte | |
| DefineByte | |
| DefineByte |
Definition at line 99 of file unicode.h.
{
#define DefineByte(b7, b6, b5, b4, b3, b2, b1, b0) _ ## b7 ## b6 ## b5 ## b4 ## _ ## b3 ## b2 ## b1 ## b0 = (b7 << 7) | (b6 << 6) | (b5 << 5) | (b4 << 4) | (b3 << 3) | (b2 << 2) | (b1 << 1) | b0
DefineByte(1, 0, 0, 0, 0, 0, 0, 0),
DefineByte(1, 1, 0, 0, 0, 0, 0, 0),
DefineByte(1, 1, 1, 0, 0, 0, 0, 0),
DefineByte(1, 1, 1, 1, 0, 0, 0, 0),
DefineByte(1, 1, 1, 1, 1, 0, 0, 0),
DefineByte(1, 1, 1, 1, 1, 1, 0, 0),
DefineByte(1, 1, 1, 1, 1, 1, 1, 0),
DefineByte(0, 0, 1, 1, 1, 1, 1, 1),
DefineByte(0, 0, 0, 1, 1, 1, 1, 1),
DefineByte(0, 0, 0, 0, 1, 1, 1, 1),
DefineByte(0, 0, 0, 0, 0, 1, 1, 1),
DefineByte(0, 0, 0, 0, 0, 0, 1, 1)
#undef DefineByte
};
anonymous enum [protected] |
Definition at line 154 of file unicode.h.
{
Utf16FirstSurrogate = 0xd800,
Utf16SecondSurrogate = 0xdc00
};
| TUniCodec::TUniCodec | ( | ) | [inline] |
Definition at line 89 of file unicode.h.
: replacementChar(DefaultReplacementChar), errorHandling(uehIgnore), strict(false), skipBom(true) { }
| TUniCodec::TUniCodec | ( | TUnicodeErrorHandling | errorHandling_, |
| bool | strict_, | ||
| int | replacementChar_, | ||
| bool | skipBom_ | ||
| ) | [inline] |
Definition at line 93 of file unicode.h.
:
replacementChar(replacementChar_), errorHandling(errorHandling_), strict(strict_), skipBom(skipBom_)
{
}
| size_t TUniCodec::DecodeUtf16FromBytes | ( | const TSrcVec & | src, |
| size_t | srcIdx, | ||
| const size_t | srcCount, | ||
| TVec< TDestCh > & | dest, | ||
| const bool | clrDest, | ||
| const TUtf16BomHandling | bomHandling = bomAllowed, |
||
| const TUniByteOrder | defaultByteOrder = boMachineEndian |
||
| ) | const |
Definition at line 2203 of file unicode.h.
{
IAssert(srcCount % 2 == 0);
IAssert(bomHandling == bomAllowed || bomHandling == bomRequired || bomHandling == bomIgnored);
IAssert(defaultByteOrder == boMachineEndian || defaultByteOrder == boBigEndian || defaultByteOrder == boLittleEndian);
if (clrDest) dest.Clr();
size_t nDecoded = 0;
if (srcCount <= 0) return nDecoded;
const size_t origSrcIdx = srcIdx, srcEnd = srcIdx + srcCount;
bool littleEndian = false;
bool leDefault = (defaultByteOrder == boLittleEndian || (defaultByteOrder == boMachineEndian && IsMachineLittleEndian()));
if (bomHandling == bomIgnored) littleEndian = leDefault;
else if (bomHandling == bomAllowed || bomHandling == bomRequired)
{
int byte1 = uint(src[TVecIdx(srcIdx)]) & 0xff, byte2 = uint(src[TVecIdx(srcIdx + 1)]) & 0xff;
if (byte1 == 0xfe && byte2 == 0xff) { littleEndian = false; if (skipBom) srcIdx += 2; }
else if (byte1 == 0xff && byte2 == 0xfe) { littleEndian = true; if (skipBom) srcIdx += 2; }
else if (bomHandling == bomAllowed) littleEndian = leDefault;
else { // Report an error.
switch (errorHandling) {
case uehThrow: throw TUnicodeException(srcIdx, byte1, "BOM expected at the beginning of the input vector (" + TInt::GetStr(byte1, "%02x") + " " + TInt::GetStr(byte2, "%02x") + " found instead).");
case uehAbort: case uehReplace: case uehIgnore: return size_t(-1);
default: Fail; } }
}
else Fail;
while (srcIdx < srcEnd)
{
const size_t charSrcIdx = srcIdx;
uint byte1 = uint(src[TVecIdx(srcIdx)]) & 0xff, byte2 = uint(src[TVecIdx(srcIdx + 1)]) & 0xff; srcIdx += 2;
uint c = littleEndian ? (byte1 | (byte2 << 8)) : (byte2 | (byte1 << 8));
if (Utf16FirstSurrogate <= c && c <= Utf16FirstSurrogate + 1023)
{
// c is the first character in a surrogate pair. Read the next character.
if (! (srcIdx + 2 <= srcEnd)) {
switch (errorHandling) {
case uehThrow: throw TUnicodeException(charSrcIdx, c, "The second character of a surrogate pair is missing.");
case uehAbort: return nDecoded;
case uehReplace: dest.Add(TDestCh(replacementChar)); continue;
case uehIgnore: continue;
default: Fail; } }
uint byte1 = uint(src[TVecIdx(srcIdx)]) & 0xff, byte2 = uint(src[TVecIdx(srcIdx + 1)]) & 0xff; srcIdx += 2;
uint c2 = littleEndian ? (byte1 | (byte2 << 8)) : (byte2 | (byte1 << 8));
// c2 should be the second character of the surrogate pair.
if (c2 < Utf16SecondSurrogate || Utf16SecondSurrogate + 1023 < c2) {
switch (errorHandling) {
case uehThrow: throw TUnicodeException(charSrcIdx + 2, c2, "The second character of a surrogate pair should be in the range " + TInt::GetStr(Utf16SecondSurrogate, "%04x") + ".." + TInt::GetStr(Utf16SecondSurrogate + 1023, "%04x") + ", not " + TInt::GetStr(c2, "04x") + ".");
case uehAbort: return nDecoded;
// with uehReplace and uehIgnore, we'll just skip the first character of the surrogate pair; we'll process the second one during the next iteration, this time as an ordinary character
case uehReplace: dest.Add(TDestCh(replacementChar)); srcIdx -= 2; continue;
case uehIgnore: srcIdx -= 2; continue;
default: Fail; } }
// c and c2 each contain 10 bits of information.
uint cc = ((c - Utf16FirstSurrogate) << 10) | (c2 - Utf16SecondSurrogate);
cc += 0x10000;
dest.Add(TDestCh(cc)); nDecoded++; continue;
}
else if (strict && Utf16SecondSurrogate <= c && c <= Utf16SecondSurrogate + 1023) {
switch (errorHandling) {
case uehThrow: throw TUnicodeException(charSrcIdx, c, "This 16-bit value should be used only as the second character of a surrogate pair.");
case uehAbort: return nDecoded;
case uehReplace: dest.Add(TDestCh(replacementChar)); continue;
case uehIgnore: continue;
default: Fail; } }
// If 'c' is the first character in the input stream, and it's a BOM, we might have to skip it.
if (charSrcIdx == origSrcIdx && (c == 0xfffeu || c == 0xfeffu) && skipBom) continue;
// Otherwise, store 'c' to the destination vector.
dest.Add(TDestCh(c)); nDecoded++;
}
return nDecoded;
}
| size_t TUniCodec::DecodeUtf16FromWords | ( | const TSrcVec & | src, |
| size_t | srcIdx, | ||
| const size_t | srcCount, | ||
| TVec< TDestCh > & | dest, | ||
| bool | clrDest, | ||
| const TUtf16BomHandling | bomHandling = bomAllowed, |
||
| const TUniByteOrder | defaultByteOrder = boMachineEndian |
||
| ) | const |
Definition at line 2287 of file unicode.h.
{
IAssert(bomHandling == bomAllowed || bomHandling == bomRequired || bomHandling == bomIgnored);
IAssert(defaultByteOrder == boMachineEndian || defaultByteOrder == boBigEndian || defaultByteOrder == boLittleEndian);
if (clrDest) dest.Clr();
size_t nDecoded = 0;
if (srcCount <= 0) return nDecoded;
const size_t origSrcIdx = srcIdx, srcEnd = srcIdx + srcCount;
bool swap = false;
bool isMachineLe = IsMachineLittleEndian();
bool isDefaultLe = (defaultByteOrder == boLittleEndian || (defaultByteOrder == boMachineEndian && isMachineLe));
if (bomHandling == bomIgnored) swap = (isDefaultLe != isMachineLe);
else if (bomHandling == bomAllowed || bomHandling == bomRequired)
{
int c = uint(src[TVecIdx(srcIdx)]) & 0xffff;
if (c == 0xfeff) { swap = false; if (skipBom) srcIdx += 1; }
else if (c == 0xfffe) { swap = true; if (skipBom) srcIdx += 1; }
else if (bomHandling == bomAllowed) swap = (isMachineLe != isDefaultLe);
else { // Report an error.
switch (errorHandling) {
case uehThrow: throw TUnicodeException(srcIdx, c, "BOM expected at the beginning of the input vector (" + TInt::GetStr(c, "%04x") + " found instead).");
case uehAbort: case uehReplace: case uehIgnore: return size_t(-1);
default: Fail; } }
}
else Fail;
while (srcIdx < srcEnd)
{
const size_t charSrcIdx = srcIdx;
uint c = uint(src[TVecIdx(srcIdx)]) & 0xffffu; srcIdx++;
if (swap) c = ((c >> 8) & 0xff) | ((c & 0xff) << 8);
if (Utf16FirstSurrogate <= c && c <= Utf16FirstSurrogate + 1023)
{
// c is the first character in a surrogate pair. Read the next character.
if (! (srcIdx < srcEnd)) {
switch (errorHandling) {
case uehThrow: throw TUnicodeException(charSrcIdx, c, "The second character of a surrogate pair is missing.");
case uehAbort: return nDecoded;
case uehReplace: dest.Add(TDestCh(replacementChar)); continue;
case uehIgnore: continue;
default: Fail; } }
uint c2 = uint(src[TVecIdx(srcIdx)]) & 0xffffu; srcIdx++;
if (swap) c2 = ((c2 >> 8) & 0xff) | ((c2 & 0xff) << 8);
// c2 should be the second character of the surrogate pair.
if (c2 < Utf16SecondSurrogate || Utf16SecondSurrogate + 1023 < c2) {
switch (errorHandling) {
case uehThrow: throw TUnicodeException(charSrcIdx + 1, c2, "The second character of a surrogate pair should be in the range " + TInt::GetStr(Utf16SecondSurrogate, "%04x") + ".." + TInt::GetStr(Utf16SecondSurrogate + 1023, "%04x") + ", not " + TInt::GetStr(c2, "04x") + ".");
case uehAbort: return nDecoded;
// with uehReplace and uehIgnore, we'll just skip the first character of the surrogate pair; we'll process the second one during the next iteration, this time as an ordinary character
case uehReplace: dest.Add(TDestCh(replacementChar)); srcIdx -= 1; continue;
case uehIgnore: srcIdx -= 1; continue;
default: Fail; } }
// c and c2 each contain 10 bits of information.
uint cc = ((c - Utf16FirstSurrogate) << 10) | (c2 - Utf16SecondSurrogate);
cc += 0x10000;
dest.Add(TDestCh(cc)); nDecoded++; continue;
}
else if (strict && Utf16SecondSurrogate <= c && c <= Utf16SecondSurrogate + 1023) {
switch (errorHandling) {
case uehThrow: throw TUnicodeException(charSrcIdx, c, "This 16-bit value should be used only as the second character of a surrogate pair.");
case uehAbort: return nDecoded;
case uehReplace: dest.Add(TDestCh(replacementChar)); continue;
case uehIgnore: continue;
default: Fail; } }
// If 'c' is the first character in the input stream, and it's a BOM, we might have to skip it.
if (charSrcIdx == origSrcIdx && (c == 0xfffeu || c == 0xfeffu) && skipBom) continue;
// Otherwise, store 'c' to the destination vector.
dest.Add(TDestCh(c)); nDecoded++;
}
return nDecoded;
}
| size_t TUniCodec::DecodeUtf8 | ( | const TSrcVec & | src, |
| size_t | srcIdx, | ||
| const size_t | srcCount, | ||
| TVec< TDestCh > & | dest, | ||
| const bool | clrDest = true |
||
| ) | const |
Definition at line 2029 of file unicode.h.
{
size_t nDecoded = 0;
if (clrDest) dest.Clr();
const size_t origSrcIdx = srcIdx;
const size_t srcEnd = srcIdx + srcCount;
while (srcIdx < srcEnd)
{
const size_t charSrcIdx = srcIdx;
uint c = src[TVecIdx(srcIdx)] & 0xff; srcIdx++;
if ((c & _1000_0000) == 0) {
// c is one of the characters 0..0x7f, encoded as a single byte.
dest.Add(TDestCh(c)); nDecoded++; continue; }
else if ((c & _1100_0000) == _1000_0000) {
// No character in a valid UTF-8-encoded string should begin with a byte of the form 10xxxxxx.
// We must have been thrown into the middle of a multi-byte character.
switch (errorHandling) {
case uehThrow: throw TUnicodeException(charSrcIdx, c, "Invalid character: 10xxxxxx.");
case uehAbort: return nDecoded;
case uehReplace: dest.Add(TDestCh(replacementChar)); continue;
case uehIgnore: continue;
default: Fail; } }
else
{
// c introduces a sequence of 2..6 bytes, depending on how many
// of the most significant bits of c are set.
uint nMoreBytes = 0, nBits = 0, minVal = 0;
if ((c & _1110_0000) == _1100_0000) nMoreBytes = 1, nBits = 5, minVal = 0x80;
else if ((c & _1111_0000) == _1110_0000) nMoreBytes = 2, nBits = 4, minVal = 0x800;
else if ((c & _1111_1000) == _1111_0000) nMoreBytes = 3, nBits = 3, minVal = 0x10000;
else if ((c & _1111_1100) == _1111_1000) nMoreBytes = 4, nBits = 2, minVal = 0x200000;
else if ((c & _1111_1110) == _1111_1100) nMoreBytes = 5, nBits = 1, minVal = 0x4000000;
else {
// c is of the form 1111111x, which is invalid even in the early definitions of UTF-8
// (which allowed the encoding of codepoints up to 2^31 - 1). However, in principle this
// could be used to encode 32-bit integers with the msb set: 1aaabbbbccccddddeeeeffffgggghhhh
// could be encoded as 1111111a 10aabbbb 10ccccdd 10ddeeee 10ffffgg 10gghhhh.
if (strict) {
switch (errorHandling) {
case uehThrow: throw TUnicodeException(charSrcIdx, c, "Invalid character: 1111111x.");
case uehAbort: return nDecoded;
// In the case of uehReplace and uehIgnore, we'll read the next 5 bytes
// and try to decode the character. Then, since 'strict' is true and
// the codepoint is clearly >= 2^31, we'll notice this as an error later
// and (in the case of uehReplace) insert a replacement character then.
// This is probably better than inserting a replacement character right
// away and then trying to read the next byte as if a new character
// was beginning there -- if the current byte is really followed by five
// 10xxxxxx bytes, we'll just get six replacement characters in a row.
case uehReplace: break; //dest.Add(TDestCh(replacementChar)); continue;
case uehIgnore: break; // continue;
default: Fail; } }
nMoreBytes = 5; nBits = 2; minVal = 0x80000000u; }
// Decode this multi-byte sequence.
uint cOut = c & ((1 << nBits) - 1); // First extract the nBits least significant bits from c.
bool cancel = false;
for (uint i = 0; i < nMoreBytes && ! cancel; i++) {
// See if there are enough bytes left in the source vector.
if (! (srcIdx < srcEnd)) {
switch (errorHandling) {
case uehThrow: throw TUnicodeException(charSrcIdx, c, TInt::GetStr(nMoreBytes) + " more bytes expected, only " + TInt::GetStr(int(srcEnd - charSrcIdx - 1)) + " available.");
case uehAbort: return nDecoded;
case uehReplace: dest.Add(TDestCh(replacementChar)); cancel = true; continue;
case uehIgnore: cancel = true; continue;
default: Fail; } }
// Read the next byte.
c = src[TVecIdx(srcIdx)] & 0xff; srcIdx++;
if ((c & _1100_0000) != _1000_0000) { // Each subsequent byte should be of the form 10xxxxxx.
switch (errorHandling) {
case uehThrow: throw TUnicodeException(charSrcIdx, c, "Byte " + TInt::GetStr(i) + " of " + TInt::GetStr(nMoreBytes) + " extra bytes should begin with 10xxxxxx.");
case uehAbort: return nDecoded;
case uehReplace: dest.Add(TDestCh(replacementChar)); srcIdx--; cancel = true; continue;
case uehIgnore: srcIdx--; cancel = true; continue;
default: Fail; } }
cOut <<= 6; cOut |= (c & _0011_1111); }
if (cancel) continue;
if (strict) {
// err1: This codepoint has been represented by more bytes than it should have been.
// For example, cOut in the range 0..127 should be represented by a single byte,
// not by two or more bytes.
// - For example, this may happen in the "modified UTF-8" sometimes used for Java
// serialization, where the codepoint 0 is encoded as 11000000 10000000 to avoid
// the appearance of null bytes in the encoded stream.
bool err1 = (cOut < minVal);
// err2: Early definitions of UTF-8 allowed any 31-bit integer to be encoded, using up to 6 bytes.
// However, later this was restricted to the codepoints 0..0x10ffff only, because only these
// are valid Unicode codepoints. Thus, no more than 4 bytes are ever necessary.
bool err2 = (nMoreBytes > 3 || (nMoreBytes == 3 && cOut > 0x10ffff));
if (err1 || err2) switch (errorHandling) {
case uehThrow:
if (err1) throw TUnicodeException(charSrcIdx, c, "The codepoint 0x" + TInt::GetStr(cOut, "%08x") + " has been represented by too many bytes (" + TInt::GetStr(nMoreBytes + 1) + ").");
else if (err2) throw TUnicodeException(charSrcIdx, c, "Invalid multibyte sequence: it decodes into 0x" + TInt::GetStr(cOut, "%08x") + ", but only codepoints 0..0x10ffff are valid.");
else { Fail; break; }
case uehAbort: return nDecoded;
case uehReplace: dest.Add(TDestCh(replacementChar)); continue;
case uehIgnore: continue;
default: Fail; } }
// Add the decoded codepoint to the destination vector.
// If this is the first decoded character, and it's one of the byte-order marks
// (0xfffe and 0xfeff), we will skip it (unless skipBom is false).
if (! (skipBom && (cOut == 0xfffe || cOut == 0xfeff) && charSrcIdx == origSrcIdx)) {
dest.Add(cOut); nDecoded++; }
} // else (multi-byte sequence)
} // while
return nDecoded;
}
| size_t TUniCodec::DecodeUtf8 | ( | const TSrcVec & | src, |
| TVec< TDestCh > & | dest, | ||
| const bool | clrDest = true |
||
| ) | const [inline] |
Definition at line 133 of file unicode.h.
{ return DecodeUtf8(src, 0, src.Len(), dest, clrDest); }
| size_t TUniCodec::EncodeUtf16ToBytes | ( | const TSrcVec & | src, |
| size_t | srcIdx, | ||
| const size_t | srcCount, | ||
| TVec< TDestCh > & | dest, | ||
| const bool | clrDest, | ||
| const bool | insertBom, | ||
| const TUniByteOrder | destByteOrder = boMachineEndian |
||
| ) | const |
Definition at line 2421 of file unicode.h.
{
bool isDestLe = (destByteOrder == boLittleEndian || (destByteOrder == boMachineEndian && IsMachineLittleEndian()));
size_t nEncoded = 0, srcEnd = srcIdx + srcCount;
if (insertBom) { dest.Add(isDestLe ? 0xff : 0xfe); dest.Add(isDestLe ? 0xfe : 0xff); nEncoded++; }
while (srcIdx < srcEnd)
{
uint c = uint(src[TVecIdx(srcIdx)]); srcIdx++;
if (! (c <= 0x10ffffu)) {
switch (errorHandling) {
case uehThrow: throw TUnicodeException(srcIdx - 1, c, "UTF-16 only supports characters in the range 0..10ffff (not " + TUInt::GetStr(c, "%08x") + ").");
case uehAbort: return nEncoded;
#define ___OutRepl if (isDestLe) { dest.Add(replacementChar & 0xff); dest.Add((replacementChar >> 8) & 0xff); } else { dest.Add((replacementChar >> 8) & 0xff); dest.Add(replacementChar & 0xff); }
case uehReplace: ___OutRepl; continue;
case uehIgnore: continue;
default: Fail; } }
if (Utf16FirstSurrogate <= c && c < Utf16FirstSurrogate + 1023) {
switch (errorHandling) {
case uehThrow: throw TUnicodeException(srcIdx - 1, c, "UTF-16 cannot encode " + TUInt::GetStr(c, "%04x") + " as it belongs to the first surrogate range (" + TUInt::GetStr(Utf16FirstSurrogate, "%04x") + ".." + TUInt::GetStr(Utf16FirstSurrogate + 1023, "%04x") + ").");
case uehAbort: return nEncoded;
case uehReplace: ___OutRepl; continue;
case uehIgnore: continue;
default: Fail; } }
if (Utf16SecondSurrogate <= c && c < Utf16SecondSurrogate + 1023) {
switch (errorHandling) {
case uehThrow: throw TUnicodeException(srcIdx - 1, c, "The character " + TUInt::GetStr(c, "%04x") + " belongs to the second surrogate range (" + TUInt::GetStr(Utf16FirstSurrogate, "%04x") + ".." + TUInt::GetStr(Utf16FirstSurrogate + 1023, "%04x") + "), which is not allowed with strict == true.");
case uehAbort: return nEncoded;
case uehReplace: ___OutRepl; continue;
case uehIgnore: continue;
default: Fail; } }
#undef ___OutRepl
// If c is <= 0xffff, it can be stored directly.
if (c <= 0xffffu) {
if (isDestLe) { dest.Add(c & 0xff); dest.Add((c >> 8) & 0xff); }
else { dest.Add((c >> 8) & 0xff); dest.Add(c & 0xff); }
nEncoded++; continue; }
// Otherwise, represent c by a pair of surrogate characters.
c -= 0x10000u; IAssert(/*0 <= c &&*/ c <= 0xfffffu);
uint c1 = (c >> 10) & 1023, c2 = c & 1023;
c1 += Utf16FirstSurrogate; c2 += Utf16SecondSurrogate;
if (isDestLe) { dest.Add(c1 & 0xff); dest.Add((c1 >> 8) & 0xff); dest.Add(c2 & 0xff); dest.Add((c2 >> 8) & 0xff); }
else { dest.Add((c1 >> 8) & 0xff); dest.Add(c1 & 0xff); dest.Add((c2 >> 8) & 0xff); dest.Add(c2 & 0xff); }
nEncoded++; continue;
}
return nEncoded;
}
| size_t TUniCodec::EncodeUtf16ToWords | ( | const TSrcVec & | src, |
| size_t | srcIdx, | ||
| const size_t | srcCount, | ||
| TVec< TDestCh > & | dest, | ||
| const bool | clrDest, | ||
| const bool | insertBom, | ||
| const TUniByteOrder | destByteOrder = boMachineEndian |
||
| ) | const |
Definition at line 2369 of file unicode.h.
{
bool isMachineLe = IsMachineLittleEndian();
bool swap = (destByteOrder == boLittleEndian && ! isMachineLe) || (destByteOrder == boBigEndian && isMachineLe);
size_t nEncoded = 0, srcEnd = srcIdx + srcCount;
if (insertBom) { dest.Add(TDestCh(swap ? 0xfffeu : 0xfeffu)); nEncoded++; }
while (srcIdx < srcEnd)
{
uint c = uint(src[TVecIdx(srcIdx)]); srcIdx++;
if (! (c <= 0x10ffffu)) {
switch (errorHandling) {
case uehThrow: throw TUnicodeException(srcIdx - 1, c, "UTF-16 only supports characters in the range 0..10ffff (not " + TUInt::GetStr(c, "%08x") + ").");
case uehAbort: return nEncoded;
case uehReplace: dest.Add(TDestCh(swap ? SwapBytes(replacementChar) : replacementChar)); continue;
case uehIgnore: continue;
default: Fail; } }
if (Utf16FirstSurrogate <= c && c < Utf16FirstSurrogate + 1023) {
switch (errorHandling) {
case uehThrow: throw TUnicodeException(srcIdx - 1, c, "UTF-16 cannot encode " + TUInt::GetStr(c, "%04x") + " as it belongs to the first surrogate range (" + TUInt::GetStr(Utf16FirstSurrogate, "%04x") + ".." + TUInt::GetStr(Utf16FirstSurrogate + 1023, "%04x") + ").");
case uehAbort: return nEncoded;
case uehReplace: dest.Add(TDestCh(swap ? SwapBytes(replacementChar) : replacementChar)); continue;
case uehIgnore: continue;
default: Fail; } }
if (Utf16SecondSurrogate <= c && c < Utf16SecondSurrogate + 1023) {
switch (errorHandling) {
case uehThrow: throw TUnicodeException(srcIdx - 1, c, "The character " + TUInt::GetStr(c, "%04x") + " belongs to the second surrogate range (" + TUInt::GetStr(Utf16FirstSurrogate, "%04x") + ".." + TUInt::GetStr(Utf16FirstSurrogate + 1023, "%04x") + "), which is not allowed with strict == true.");
case uehAbort: return nEncoded;
case uehReplace: dest.Add(TDestCh(swap ? SwapBytes(replacementChar) : replacementChar)); continue;
case uehIgnore: continue;
default: Fail; } }
// If c is <= 0xffff, it can be stored directly.
if (c <= 0xffffu) {
if (swap) c = ((c >> 8) & 0xff) | ((c & 0xff) << 8);
dest.Add(TDestCh(c)); nEncoded++; continue; }
// Otherwise, represent c by a pair of surrogate characters.
c -= 0x10000u; IAssert(/*0 <= c &&*/ c <= 0xfffffu);
uint c1 = (c >> 10) & 1023, c2 = c & 1023;
c1 += Utf16FirstSurrogate; c2 += Utf16SecondSurrogate;
if (swap) {
c1 = ((c1 >> 8) & 0xff) | ((c1 & 0xff) << 8);
c2 = ((c2 >> 8) & 0xff) | ((c2 & 0xff) << 8); }
dest.Add(TDestCh(c1));
dest.Add(TDestCh(c2));
nEncoded++; continue;
}
return nEncoded;
}
| size_t TUniCodec::EncodeUtf8 | ( | const TSrcVec & | src, |
| size_t | srcIdx, | ||
| const size_t | srcCount, | ||
| TVec< TDestCh > & | dest, | ||
| const bool | clrDest = true |
||
| ) | const |
Definition at line 2145 of file unicode.h.
{
size_t nEncoded = 0;
for (const size_t srcEnd = srcIdx + srcCount; srcIdx < srcEnd; srcIdx++)
{
uint c = uint(src[TVecIdx(srcIdx)]);
bool err = false;
if (strict && c > 0x10ffff) {
err = true;
switch (errorHandling) {
case uehThrow: throw TUnicodeException(srcIdx, c, "Invalid character (0x" + TInt::GetStr(c, "%x") + "; only characters in the range 0..0x10ffff are allowed).");
case uehAbort: return nEncoded;
case uehReplace: c = replacementChar; break;
case uehIgnore: continue;
default: Fail; } }
if (c < 0x80u)
dest.Add(TDestCh(c & 0xffu));
else if (c < 0x800u) {
dest.Add(TDestCh(_1100_0000 | ((c >> 6) & _0001_1111)));
dest.Add(TDestCh(_1000_0000 | (c & _0011_1111))); }
else if (c < 0x10000u) {
dest.Add(TDestCh(_1110_0000 | ((c >> 12) & _0000_1111)));
dest.Add(TDestCh(_1000_0000 | ((c >> 6) & _0011_1111)));
dest.Add(TDestCh(_1000_0000 | (c & _0011_1111))); }
else if (c < 0x200000u) {
dest.Add(TDestCh(_1111_0000 | ((c >> 18) & _0000_0111)));
dest.Add(TDestCh(_1000_0000 | ((c >> 12) & _0011_1111)));
dest.Add(TDestCh(_1000_0000 | ((c >> 6) & _0011_1111)));
dest.Add(TDestCh(_1000_0000 | (c & _0011_1111))); }
else if (c < 0x4000000u) {
dest.Add(TDestCh(_1111_1000 | ((c >> 24) & _0000_0011)));
dest.Add(TDestCh(_1000_0000 | ((c >> 18) & _0011_1111)));
dest.Add(TDestCh(_1000_0000 | ((c >> 12) & _0011_1111)));
dest.Add(TDestCh(_1000_0000 | ((c >> 6) & _0011_1111)));
dest.Add(TDestCh(_1000_0000 | (c & _0011_1111))); }
else {
dest.Add(TDestCh(_1111_1100 | ((c >> 30) & _0000_0011)));
dest.Add(TDestCh(_1000_0000 | ((c >> 24) & _0011_1111)));
dest.Add(TDestCh(_1000_0000 | ((c >> 18) & _0011_1111)));
dest.Add(TDestCh(_1000_0000 | ((c >> 12) & _0011_1111)));
dest.Add(TDestCh(_1000_0000 | ((c >> 6) & _0011_1111)));
dest.Add(TDestCh(_1000_0000 | (c & _0011_1111))); }
if (! err) nEncoded++;
}
return nEncoded;
}
| size_t TUniCodec::EncodeUtf8 | ( | const TSrcVec & | src, |
| TVec< TDestCh > & | dest, | ||
| const bool | clrDest = true |
||
| ) | const [inline] |
Definition at line 142 of file unicode.h.
{ return EncodeUtf8(src, 0, src.Len(), dest, clrDest); }
| TStr TUniCodec::EncodeUtf8Str | ( | const TSrcVec & | src, |
| size_t | srcIdx, | ||
| const size_t | srcCount | ||
| ) | const [inline] |
Definition at line 146 of file unicode.h.
{ TVec<char> temp; EncodeUtf8(src, srcIdx, srcCount, temp); TStr retVal = &(temp[0]); return retVal; }
| TStr TUniCodec::EncodeUtf8Str | ( | const TSrcVec & | src | ) | const [inline] |
Definition at line 147 of file unicode.h.
{ TVec<char> temp; EncodeUtf8(src, temp); temp.Add(0); TStr retVal = &(temp[0]); return retVal; }
| uint TUniCodec::GetRndUint | ( | TRnd & | rnd | ) | [static, protected] |
Definition at line 62 of file unicode.cpp.
{
uint u = rnd.GetUniDevUInt(256) & 0xff;
u <<= 8; u |= (rnd.GetUniDevUInt(256) & 0xff);
u <<= 8; u |= (rnd.GetUniDevUInt(256) & 0xff);
u <<= 8; u |= (rnd.GetUniDevUInt(256) & 0xff);
return u;
}
| uint TUniCodec::GetRndUint | ( | TRnd & | rnd, |
| uint | minVal, | ||
| uint | maxVal | ||
| ) | [static, protected] |
Definition at line 71 of file unicode.cpp.
{
if (minVal == TUInt::Mn && maxVal == TUInt::Mx) return GetRndUint(rnd);
uint range = maxVal - minVal + 1;
if (range > (uint(1) << (8 * sizeof(uint) - 1)))
while (true) { uint u = GetRndUint(rnd); if (u < range) return minVal + u; }
uint mask = 1;
while (mask < range) mask <<= 1;
mask -= 1;
while (true) { uint u = GetRndUint(rnd) & mask; if (u < range) return minVal + u; }
}
| bool TUniCodec::IsMachineLittleEndian | ( | ) | [static, protected] |
Definition at line 83 of file unicode.cpp.
{
static bool isLE, initialized = false;
if (initialized) return isLE;
int i = 0x0201;
char *p = (char *) (&i);
char c1, c2;
memcpy(&c1, p, 1); memcpy(&c2, p + 1, 1);
if (c1 == 1 && c2 == 2) isLE = true;
else if (c1 == 2 && c2 == 1) isLE = false;
else {
FailR(("TUniCodec::IsMachineLittleEndian: c1 = " + TInt::GetStr(int(uchar(c1)), "%02x") + ", c2 = " + TInt::GetStr(int(uchar(c2)), "%02x") + ".").CStr());
isLE = true; }
initialized = true; return isLE;
}
| static int TUniCodec::SwapBytes | ( | int | x | ) | [inline, static, protected] |
| void TUniCodec::TestDecodeUtf16 | ( | TRnd & | rnd, |
| const TStr & | testCaseDesc, | ||
| const TUtf16BomHandling | bomHandling, | ||
| const TUniByteOrder | defaultByteOrder, | ||
| const bool | insertBom | ||
| ) | [protected] |
Definition at line 345 of file unicode.cpp.
{
TIntV src; TIntV expectedDest; int expectedRetVal = 0;
bool expectedAbort = false;
FILE *f = 0;
bool isMachineLe = IsMachineLittleEndian();
bool isDefaultLe = (defaultByteOrder == boLittleEndian || (defaultByteOrder == boMachineEndian && isMachineLe));
bool swap = (isMachineLe != isDefaultLe);
if (insertBom) {
src.Add(swap ? 0xfffe : 0xfeff);
if (! skipBom) { expectedRetVal += 1; expectedDest.Add(0xfeff); } }
else if (bomHandling == bomRequired) {
expectedAbort = true; expectedRetVal = -1; }
// testCaseDesc should consist single characters or pairs of characters, 'c[e]', where:
// - 'c' defines the range from which the codepoint should be taken ('A'..'E', 'X'..'Y');
// - 'e' defines how many words will be removed from the end of the encoded sequence for this codepoint.
// (absent = 0, 'a' = 1).
for (int i = 0; i < testCaseDesc.Len(); )
{
const char c = testCaseDesc[i++];
uint cp = 0; int nWords = -1;
if (c == 'X' || c == 'Y') IAssert(i > 1); // if you want a BOM at the beginning of your data, use insertBom -- if we permit X and Y here, predicting the expectedDest and expectedRetVal gets more complicated
if (c == 'A') { cp = GetRndUint(rnd, 0u, Utf16FirstSurrogate - 1); nWords = 1; } // characters below the first surrogate range
else if (c == 'B') { cp = GetRndUint(rnd, Utf16FirstSurrogate, Utf16FirstSurrogate + 1023); nWords = 1; } // the first surrogate range
else if (c == 'C') { cp = GetRndUint(rnd, Utf16SecondSurrogate, Utf16SecondSurrogate + 1023); nWords = 1; } // the second surrogate range
else if (c == 'D') { do { cp = GetRndUint(rnd, Utf16SecondSurrogate + 1024, 0xffffu); } while (cp == 0xfffe || cp == 0xfeff); nWords = 1; } // above the second surrogate range, but still in the BMP
else if (c == 'E') { cp = GetRndUint(rnd, 0x10000u, 0x10ffffu); nWords = 2; } // above the BMP, but still within the range for UTF-16
else if (c == 'X') { cp = 0xfffe; nWords = 1; }
else if (c == 'Y') { cp = 0xfeff; nWords = 1; }
else Fail;
if (c == 'B' && i < testCaseDesc.Len()) IAssert(testCaseDesc[i] != 'C');
// Process 'e'.
int nToDel = 0;
if (i < testCaseDesc.Len()) {
const char e = testCaseDesc[i];
if (e >= 'a') { i += 1; nToDel = 1; }}
IAssert((nWords == 1 && nToDel == 0) || (nWords == 2 && (nToDel == 0 || nToDel == 1)));
if (nWords == 2 && nToDel == 1 && i < testCaseDesc.Len()) IAssert(testCaseDesc[i] != 'C');
// Will an error occur during the decoding of this codepoint?
bool errHere = false;
if (Utf16FirstSurrogate <= cp && cp <= Utf16FirstSurrogate + 1023) errHere = true;
else if (cp > 0x10ffff) { Fail; errHere = true; }
else if (nToDel > 0) errHere = true;
else if (strict && (Utf16SecondSurrogate <= cp && cp <= Utf16SecondSurrogate + 1023)) errHere = true;
// Update 'expectedDest' and 'expectedRetVal'.
if (! expectedAbort) {
if (! errHere) {
if (src.Len() == 0 && (cp == 0xfffe || cp == 0xfeff) && skipBom) { }
else { expectedDest.Add(cp); expectedRetVal += 1; } }
else if (errorHandling == uehReplace) {
expectedDest.Add(replacementChar); }
if (errHere && (errorHandling == uehAbort || errorHandling == uehThrow)) expectedAbort = true; }
// Update 'src'.
if (nWords == 1) src.Add(swap ? SwapBytes(cp) : cp);
else {
int c1 = ((cp - 0x10000) >> 10) & 1023; c1 += Utf16FirstSurrogate;
int c2 = (cp - 0x10000) & 1023; c2 += Utf16SecondSurrogate;
src.Add(swap ? SwapBytes(c1) : c1);
if (nToDel == 0) src.Add(swap ? SwapBytes(c2) : c2); }
}
if (f) fprintf(f, "Test case: \"%s\"\n", testCaseDesc.CStr());
TestUtf16(true, expectedRetVal, expectedAbort && (errorHandling == uehThrow), src, expectedDest, bomHandling, defaultByteOrder, false, f);
}
| void TUniCodec::TestDecodeUtf8 | ( | TRnd & | rnd, |
| const TStr & | testCaseDesc | ||
| ) | [protected] |
Definition at line 137 of file unicode.cpp.
{
TIntV src; TIntV expectedDest; int expectedRetVal = 0;
bool expectedAbort = false;
FILE *f = 0; // stderr
// testCaseDesc should consist of pairs or triples of characters, 'cd[e]', where:
// - 'c' defines the range from which the codepoint should be taken ('A'..'H', 'X'..'Z');
// - 'd' defines how many bytes the codepoint should be encoded with ('1'..'6');
// - 'e' defines how many bytes will be removed from the end of the encoded sequence for this codepoint.
// (absent = 0, 'a' = 1, 'b' = 2 and so on).
for (int i = 0; i < testCaseDesc.Len(); )
{
IAssert(i + 2 <= testCaseDesc.Len());
const char c = testCaseDesc[i], d = testCaseDesc[i + 1]; i += 2;
uint cp = 0; int nBytes = -1, minBytes = -1; bool eighties = false;
IAssert('1' <= d && d <= '6'); nBytes = d - '0';
if (c == 'A') { cp = GetRndUint(rnd, 0u, 0x7fu); minBytes = 1; } // 1 byte
else if (c == 'B') { cp = GetRndUint(rnd, 0x80u, 0x7ffu); minBytes = 2; } // 2 bytes
else if (c == 'C') { cp = GetRndUint(rnd, 0x800u, 0xffffu); minBytes = 3; } // 3 bytes
else if (c == 'D') { cp = GetRndUint(rnd, 0x10000u, 0x10ffffu); minBytes = 4; } // 4 bytes, valid Unicode
else if (c == 'E') { cp = GetRndUint(rnd, 0x110000u, 0x1fffffu); minBytes = 4; } // 4 bytes, invalid Unicode
else if (c == 'F') { cp = GetRndUint(rnd, 0x200000u, 0x3ffffffu); minBytes = 5; } // 5 bytes
else if (c == 'G') { cp = GetRndUint(rnd, 0x4000000u, 0x7fffffffu); minBytes = 6; } // 6 bytes, 31 bits
else if (c == 'H') { cp = GetRndUint(rnd, 0x80000000u, 0xffffffffu); minBytes = 6; } // 6 bytes, 32 bits
else if (c == 'X') { cp = 0xfffe; minBytes = 3; }
else if (c == 'Y') { cp = 0xfeff; minBytes = 3; }
else if (c == 'Z') { eighties = true; minBytes = 1; } // insert several random 10xxxxxx bytes (= 0x80 | random(0..0x3f))
else Fail;
IAssert(nBytes >= minBytes);
// Process 'e'.
int nToDel = 0;
if (i < testCaseDesc.Len()) {
const char e = testCaseDesc[i];
if (e >= 'a' && e <= 'e') { i += 1; nToDel = e - 'a' + 1; }}
IAssert(nToDel < nBytes);
// Will an error occur during the decoding of this codepoint?
bool errHere = false;
if (eighties) errHere = true;
else if (nToDel > 0) errHere = true;
else if (strict && (cp >= 0x10ffff || nBytes > minBytes)) errHere = true;
// Update 'expectedDest' and 'expetedRetVal'.
if (! expectedAbort) {
if (! errHere) {
if (src.Len() == 0 && (cp == 0xfffe || cp == 0xfeff) && skipBom) { }
else { expectedDest.Add(cp); expectedRetVal += 1; } }
else if (errorHandling == uehReplace) {
if (eighties) for (int j = 0; j < nBytes; j++) expectedDest.Add(replacementChar);
else expectedDest.Add(replacementChar); }
if (errHere && (errorHandling == uehAbort || errorHandling == uehThrow)) expectedAbort = true; }
// Update 'src'.
if (eighties) for (int j = 0; j < nBytes; j++) src.Add(GetRndUint(rnd, 0x80, 0xff));
else if (nBytes == 1) src.Add(cp);
else {
int mask = (1 << nBytes) - 1; mask <<= (8 - nBytes);
src.Add(mask | (uint(cp) >> (6 * (nBytes - 1))));
for (int j = 1; j < nBytes - nToDel; j++) src.Add(0x80 | ((cp >> (6 * (nBytes - j - 1))) & _0011_1111)); }
}
if (f) fprintf(f, "Test case: \"%s\"\n", testCaseDesc.CStr());
TestUtf8(true, expectedRetVal, expectedAbort && (errorHandling == uehThrow), src, expectedDest, f);
}
| void TUniCodec::TestUtf16 | ( | bool | decode, |
| size_t | expectedRetVal, | ||
| bool | expectedThrow, | ||
| const TIntV & | src, | ||
| const TIntV & | expectedDest, | ||
| const TUtf16BomHandling | bomHandling, | ||
| const TUniByteOrder | defaultByteOrder, | ||
| const bool | insertBom, | ||
| FILE * | f | ||
| ) | [protected] |
Definition at line 288 of file unicode.cpp.
{
TIntV srcBytes, expectedDestBytes;
WordsToBytes(src, srcBytes); WordsToBytes(expectedDest, expectedDestBytes);
TIntV dest;
if (f) {
fprintf(f, "Settings: %s %s %s %s %s replacementChar = %x \n",
(errorHandling == uehAbort ? "abort" : errorHandling == uehThrow ? "throw" : errorHandling == uehIgnore ? "ignore" : errorHandling == uehReplace ? "replace" : "????"),
(strict ? "STRICT" : ""), (decode ? (skipBom ? "skipBom" : "") : (insertBom ? "insrtBom" : "")),
(bomHandling == bomAllowed ? "bomAllowed" : bomHandling == bomRequired ? "bomRequired" : "bomIgnored"),
(defaultByteOrder == boBigEndian ? "boBigEndian" : defaultByteOrder == boLittleEndian ? "boLittleEndian" : "boMachineEndian"),
uint(replacementChar));
fprintf(f, "src: "); for (int i = 0; i < src.Len(); i++) fprintf(f, (decode ? " %04x" : " %x"), uint(src[i])); }
for (int useBytes = 0; useBytes < 2; useBytes++)
{
const char *fmt = (useBytes ? " %02x" : " %04x");
try
{
dest.Clr();
size_t retVal;
if (! useBytes) {
if (decode) retVal = DecodeUtf16FromWords(src, 0, src.Len(), dest, true, bomHandling, defaultByteOrder);
else retVal = EncodeUtf16ToWords(src, 0, src.Len(), dest, true, insertBom, defaultByteOrder); }
else {
if (decode) retVal = DecodeUtf16FromBytes(srcBytes, 0, srcBytes.Len(), dest, true, bomHandling, defaultByteOrder);
else retVal = EncodeUtf16ToBytes(src, 0, src.Len(), dest, true, insertBom, defaultByteOrder); }
const TIntV& ed = (useBytes && ! decode ? expectedDestBytes : expectedDest);
if (f) {
fprintf(f, "\n -> dest: "); for (int i = 0; i < dest.Len(); i++) fprintf(f, (decode ? " %x" : fmt), uint(dest[i]));
fprintf(f, "\n expDest "); for (int i = 0; i < ed.Len(); i++) fprintf(f, (decode ? " %x" : fmt), uint(ed[i]));
fprintf(f, "\n retVal = %llu (expected %llu)\n", static_cast<long long unsigned int> (retVal), static_cast<long long unsigned int> (expectedRetVal)); }
bool ok = true;
if (retVal != expectedRetVal) ok = false;
if (dest.Len() != ed.Len()) ok = false;
if (ok) for (int i = 0; i < dest.Len(); i++) if (dest[i] != ed[i]) ok = false;
if (! ok)
{
printf("!!!\n");
}
IAssert(retVal == expectedRetVal); IAssert(! expectedThrow);
IAssert(dest.Len() == ed.Len());
for (int i = 0; i < dest.Len(); i++) IAssert(dest[i] == ed[i]);
}
catch (TUnicodeException e)
{
if (f) {
fprintf(f, "\n -> expDest "); for (int i = 0; i < expectedDest.Len(); i++) fprintf(f, (decode ? " %x" : fmt), uint(expectedDest[i]));
fprintf(f, "\n exception \"%s\" at %d (char 0x%02x)\n", e.message.CStr(), int(e.srcIdx), uint(e.srcChar)); }
IAssert(expectedThrow);
}
}
}
| void TUniCodec::TestUtf16 | ( | ) |
Definition at line 412 of file unicode.cpp.
{
TIntV utf16ReplCh; utf16ReplCh.Add(replacementChar);
for (int skipBom_ = 0; skipBom_ < 2; skipBom_++)
for (int strict_ = 0; strict_ < 2; strict_++)
for (int errMode_ = 0; errMode_ < 4; errMode_++)
for (int bomHandling_ = 0; bomHandling_ < 3; bomHandling_++)
for (int byteOrder_ = 0; byteOrder_ < 3; byteOrder_++)
for (int insertBom_ = 0; insertBom_ < 2; insertBom_++)
{
strict = (strict_ == 1); errorHandling = TUnicodeErrorHandling(errMode_); skipBom = (skipBom_ == 1);
bool insertBom = (insertBom_ == 1);
TUniByteOrder byteOrder = (TUniByteOrder) byteOrder_;
TUtf16BomHandling bomHandling = (TUtf16BomHandling) bomHandling_;
TRnd rnd = TRnd(123);
// Test DecodeUtf16 on various random UTF-16-encoded sequences.
for (int i = 0; i < 10; i++)
{
TestDecodeUtf16(rnd, "A", bomHandling, byteOrder, insertBom);
TestDecodeUtf16(rnd, "AAA", bomHandling, byteOrder, insertBom);
TestDecodeUtf16(rnd, "B", bomHandling, byteOrder, insertBom);
TestDecodeUtf16(rnd, "DDAADADAAADDDAA", bomHandling, byteOrder, insertBom);
TestDecodeUtf16(rnd, "DEEEDAAEEDADEEAAEEADEEDDAA", bomHandling, byteOrder, insertBom);
TestDecodeUtf16(rnd, "DEaEaEDAAEaEDADEaEAAEEADEEDDAA", bomHandling, byteOrder, insertBom);
TestDecodeUtf16(rnd, "CABDEBACCEaB", bomHandling, byteOrder, insertBom);
TestDecodeUtf16(rnd, "EaEEEEaBBACABXABYXXEaYDDXBDCEA", bomHandling, byteOrder, insertBom);
TestDecodeUtf16(rnd, "EaEEEEaBDCAAXADYXXEaYDDXDCEA", bomHandling, byteOrder, insertBom);
}
//continue;
// Test both DecodeUtf16 and EncodeUtf16 systematically on various characters
// close to powers of 2.
TIntV src, expectedDest, src2;
expectedDest.Gen(1); src.Reserve(6); src2.Gen(1);
for (int pow = 8; pow <= 32; pow++)
{
uint uFrom, uTo;
if (pow == 8) uFrom = 0, uTo = 1u << pow;
else if (pow == 32) uFrom = TUInt::Mx - (1u << 8), uTo = TUInt::Mx;
else uFrom = (1u << pow) - (1u << 8), uTo = (1u << pow) + (1u << 8);
printf("%u..%u \r", uFrom, uTo);
for (uint u = uFrom; ; u++)
{
int nWords = 0;
if (u < 0x10000) nWords = 1;
else nWords = 2;
bool isMachineLe = IsMachineLittleEndian(), isDestLe = (byteOrder == boLittleEndian || (byteOrder == boMachineEndian && isMachineLe));
bool swap = (isMachineLe != isDestLe);
bool err = (u > 0x10ffff) || (Utf16FirstSurrogate <= u && u <= Utf16FirstSurrogate + 1023) || (strict && Utf16SecondSurrogate <= u && u <= Utf16SecondSurrogate + 1023);
src.Gen(3, (err ? 0 : nWords) + (insertBom ? 1 : 0));
if (insertBom) src[0] = (swap ? 0xfffe : 0xfeff);
if (! ((u > 0x10ffff) || (Utf16FirstSurrogate <= u && u <= Utf16FirstSurrogate + 1023)))
{
// Try to encode 'u' and see if it gets decoded correctly.
if (nWords == 1) src[insertBom ? 1 : 0] = (swap ? SwapBytes(u) : u);
else {
int u1 = Utf16FirstSurrogate + (((u - 0x10000) >> 10) & 1023);
int u2 = Utf16SecondSurrogate + ((u - 0x10000) & 1023);
src[insertBom ? 1 : 0] = (swap ? SwapBytes(u1) : u1);
src[insertBom ? 2 : 1] = (swap ? SwapBytes(u2) : u2); }
if (! ((u == 0xfffe || u == 0xfeff) && bomHandling == bomAllowed && ! insertBom)) // this will just create a mess when decoding
{
expectedDest.Reserve(2, 0);
if (insertBom && ! skipBom) expectedDest.Add(0xfeff);
if (err && errorHandling == uehReplace) expectedDest.Add(replacementChar);
else if (! err) expectedDest.Add(u);
int erv = (err ? 0 : expectedDest.Len());
if (skipBom && (u == 0xfeff || u == 0xfffe) && ! insertBom) expectedDest.Clr(), erv = 0;
bool errD = err;
if (bomHandling == bomRequired && ! insertBom) {
expectedDest.Clr(false);
if (u == 0xfeff || u == 0xfffe) { erv = (skipBom ? 0 : 1); if (! skipBom) expectedDest.Add(0xfeff); }
else { erv = -1; errD = true;
/*if (errorHandling == uehReplace) expectedDest.Add(replacementChar);*/ }}
TestUtf16(true, erv, (errD && errorHandling == uehThrow), src, expectedDest, bomHandling, byteOrder, insertBom, 0);
}
}
// We can also test the UTF-16 encoder.
src2[0] = u;
if (err) {
src.Clr(false); if (insertBom) src.Add(swap ? 0xfffe : 0xfeff);
if (errorHandling == uehReplace) {
src.Add(swap ? SwapBytes(replacementChar) : replacementChar);
/*if (byteOrder == boBigEndian || (byteOrder == boMachineEndian && ! TUniCodec::IsMachineLittleEndian())) { src.Add((replacementChar >> 8) & 0xff); src.Add(replacementChar & 0xff); }
else { src.Add(replacementChar & 0xff); src.Add((replacementChar >> 8) & 0xff); } */
}}
TestUtf16(false, (err ? 0 : 1) + (insertBom ? 1 : 0), (err && errorHandling == uehThrow), src2, src, bomHandling, byteOrder, insertBom, 0);
//
if (u == uTo) break;
}
}
}
}
| void TUniCodec::TestUtf8 | ( | bool | decode, |
| size_t | expectedRetVal, | ||
| bool | expectedThrow, | ||
| const TIntV & | src, | ||
| const TIntV & | expectedDest, | ||
| FILE * | f | ||
| ) | [protected] |
Definition at line 103 of file unicode.cpp.
{
TIntV dest;
if (f) {
fprintf(f, "Settings: %s %s %s replacementChar = %x\n",
(errorHandling == uehAbort ? "abort" : errorHandling == uehThrow ? "throw" : errorHandling == uehIgnore ? "ignore" : errorHandling == uehReplace ? "replace" : "????"),
(strict ? "STRICT" : ""), (skipBom ? "skipBom" : ""), uint(replacementChar));
fprintf(f, "src: "); for (int i = 0; i < src.Len(); i++) fprintf(f, (decode ? " %02x" : " %x"), uint(src[i])); }
try
{
size_t retVal = (decode ? DecodeUtf8(src, 0, src.Len(), dest, true) : EncodeUtf8(src, 0, src.Len(), dest, true));
if (f) {
fprintf(f, "\n -> dest: "); for (int i = 0; i < dest.Len(); i++) fprintf(f, (decode ? " %x" : " %02x"), uint(dest[i]));
fprintf(f, "\n expDest "); for (int i = 0; i < expectedDest.Len(); i++) fprintf(f, (decode ? " %x" : " %02x"), uint(expectedDest[i]));
fprintf(f, "\n retVal = %llu (expected %llu)\n", static_cast<long long unsigned int> (retVal), static_cast<long long unsigned int> (expectedRetVal)); }
if (retVal != expectedRetVal)
printf("!!!");
IAssert(retVal == expectedRetVal); IAssert(! expectedThrow);
if (dest.Len() != expectedDest.Len())
printf("!!!");
IAssert(dest.Len() == expectedDest.Len());
for (int i = 0; i < dest.Len(); i++) IAssert(dest[i] == expectedDest[i]);
}
catch (TUnicodeException e)
{
if (f) {
fprintf(f, "\n -> expDest "); for (int i = 0; i < expectedDest.Len(); i++) fprintf(f, " %x", uint(expectedDest[i]));
fprintf(f, "\n exception \"%s\" at %d (char 0x%02x)\n", e.message.CStr(), int(e.srcIdx), uint(e.srcChar)); }
IAssert(expectedThrow);
}
}
| void TUniCodec::TestUtf8 | ( | ) |
Definition at line 198 of file unicode.cpp.
{
TIntV utf8ReplCh; EncodeUtf8((TVectorBuilder(), replacementChar).v, 0, 1, utf8ReplCh, true);
for (int skipBom_ = 0; skipBom_ < 2; skipBom_++)
for (int strict_ = 0; strict_ < 2; strict_++)
for (int errMode_ = 0; errMode_ < 4; errMode_++)
{
strict = (strict_ == 1); errorHandling = TUnicodeErrorHandling(errMode_); skipBom = (skipBom_ == 1);
TRnd rnd = TRnd(123);
// Test DecodeUtf8 on various random UTF-8-encoded sequences.
for (int i = 0; i < 10; i++)
{
TestDecodeUtf8(rnd, "X3A1A2A3A4A5A6B2B3B4B5B6C3C4C5C6D4D5D6E5E6F6G6");
TestDecodeUtf8(rnd, "X3A5dA6d");
TestDecodeUtf8(rnd, "X3A1B2C3D4E4F5A1G6H6Y3X3A1");
TestDecodeUtf8(rnd, "X3A1B2C3D4E4F5A2G6H6Y3X3A1");
TestDecodeUtf8(rnd, "Y3A1B2C3D4E4F5A1G6H6Y3X3A1");
TestDecodeUtf8(rnd, "A1B2C3D4E4F5A1G6H6Y3X3A1");
TestDecodeUtf8(rnd, "G6A1A1D4E4A1B2");
TestDecodeUtf8(rnd, "D4A1A1C3A1B2A1B2");
TestDecodeUtf8(rnd, "D4A1A1C3A1B2A1B2D4a");
TestDecodeUtf8(rnd, "X3A1B2C3D5E4F5A1G6H6Y3X3A1");
TestDecodeUtf8(rnd, "X3A1B2C3D4E5F5A1G6H6Y3X3A1");
TestDecodeUtf8(rnd, "X3A1B2C3D4aE4F5A1G6H6Y3X3A1");
TestDecodeUtf8(rnd, "X3A1B2C3D4bE4F5A1G6H6Y3X3A1");
TestDecodeUtf8(rnd, "X3A2aA3aA4aA5aA6aB2aB3aB4aB5aB6aC3aC4aC5aC6aD4aD5aD6aE5aE6aF6aG6a");
TestDecodeUtf8(rnd, "X3A3bA4bA5bA6aB3bB4bB5bB6bC3bC4bC5bC6bD4bD5bD6bE5bE6bF6bG6b");
TestDecodeUtf8(rnd, "X3A4cA5cA6cB4cB5cB6cC4cC5cC6cD4cD5cD6cE5cE6cF6cG6c");
TestDecodeUtf8(rnd, "X3A5dA6dB5dB6dC5dC6dD5dD6dE5dE6dF6dG6d");
TestDecodeUtf8(rnd, "X3A6eB6eC6eD6eE6eF6eG6e");
}
// Test both DecodeUtf8 and EncodeUtf8 systematically on various characters
// close to powers of 2.
TIntV src, expectedDest, src2;
expectedDest.Gen(1); src.Reserve(6); src2.Gen(1);
for (int pow = 8; pow <= 32; pow++)
{
uint uFrom, uTo;
if (pow == 8) uFrom = 0, uTo = 1u << pow;
else if (pow == 32) uFrom = TUInt::Mx - (1u << 8), uTo = TUInt::Mx;
else uFrom = (1u << pow) - (1u << 8), uTo = (1u << pow) + (1u << 8);
printf("%u..%u \r", uFrom, uTo);
for (uint u = uFrom; ; u++)
{
int nBytes = 0;
if (u < (1u << 7)) nBytes = 1;
else if (u < (1u << 11)) nBytes = 2;
else if (u < (1u << 16)) nBytes = 3;
else if (u < (1u << 21)) nBytes = 4;
else if (u < (1u << 26)) nBytes = 5;
else nBytes = 6;
src.Gen(6, nBytes);
if (nBytes == 1) src[0] = u;
else {
src[0] = (((1 << nBytes) - 1) << (8 - nBytes)) | (u >> (6 * (nBytes - 1)));
for (int i = 1; i < nBytes; i++) src[i] = 0x80 | ((u >> (6 * (nBytes - i - 1))) & _0011_1111); }
bool err = (strict && u > 0x10ffff);
expectedDest.Reserve(1, 0);
if (err && errorHandling == uehReplace) expectedDest.Add(replacementChar);
else if (! err) expectedDest.Add(u);
int erv = (err ? 0 : 1);
if (skipBom && (u == 0xfeff || u == 0xfffe)) expectedDest.Clr(), erv = 0;
TestUtf8(true, erv, (err && errorHandling == uehThrow), src, expectedDest, 0);
// We can also test the UTF-8 encoder.
src2[0] = u;
if (err) {
if (errorHandling == uehReplace) src = utf8ReplCh;
else src.Clr(false); }
TestUtf8(false, (err ? 0 : 1), (err && errorHandling == uehThrow), src2, src, 0);
//
if (u == uTo) break;
}
}
}
}
| void TUniCodec::WordsToBytes | ( | const TIntV & | src, |
| TIntV & | dest | ||
| ) | [protected] |
Definition at line 278 of file unicode.cpp.
friend class TUniCaseFolding [friend] |
| bool TUniCodec::skipBom |
| bool TUniCodec::strict |