diff options
Diffstat (limited to 'src/lib/base/Unicode.cpp')
| -rw-r--r-- | src/lib/base/Unicode.cpp | 784 |
1 files changed, 784 insertions, 0 deletions
diff --git a/src/lib/base/Unicode.cpp b/src/lib/base/Unicode.cpp new file mode 100644 index 0000000..6a077e7 --- /dev/null +++ b/src/lib/base/Unicode.cpp @@ -0,0 +1,784 @@ +/* + * barrier -- mouse and keyboard sharing utility + * Copyright (C) 2012-2016 Symless Ltd. + * Copyright (C) 2002 Chris Schoeneman + * + * This package is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * found in the file LICENSE that should have accompanied this file. + * + * This package is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. + */ + +#include "arch/Arch.h" +#include "base/Unicode.h" + +#include <cstring> + +// +// local utility functions +// + +inline +static +UInt16 +decode16(const UInt8* n, bool byteSwapped) +{ + union x16 { + UInt8 n8[2]; + UInt16 n16; + } c; + if (byteSwapped) { + c.n8[0] = n[1]; + c.n8[1] = n[0]; + } + else { + c.n8[0] = n[0]; + c.n8[1] = n[1]; + } + return c.n16; +} + +inline +static +UInt32 +decode32(const UInt8* n, bool byteSwapped) +{ + union x32 { + UInt8 n8[4]; + UInt32 n32; + } c; + if (byteSwapped) { + c.n8[0] = n[3]; + c.n8[1] = n[2]; + c.n8[2] = n[1]; + c.n8[3] = n[0]; + } + else { + c.n8[0] = n[0]; + c.n8[1] = n[1]; + c.n8[2] = n[2]; + c.n8[3] = n[3]; + } + return c.n32; +} + +inline +static +void +resetError(bool* errors) +{ + if (errors != NULL) { + *errors = false; + } +} + +inline +static +void +setError(bool* errors) +{ + if (errors != NULL) { + *errors = true; + } +} + + +// +// Unicode +// + +UInt32 Unicode::s_invalid = 0x0000ffff; +UInt32 Unicode::s_replacement = 0x0000fffd; + +bool +Unicode::isUTF8(const String& src) +{ + // convert and test each character + const UInt8* data = reinterpret_cast<const UInt8*>(src.c_str()); + for (UInt32 n = (UInt32)src.size(); n > 0; ) { + if (fromUTF8(data, n) == s_invalid) { + return false; + } + } + return true; +} + +String +Unicode::UTF8ToUCS2(const String& src, bool* errors) +{ + // default to success + resetError(errors); + + // get size of input string and reserve some space in output + UInt32 n = (UInt32)src.size(); + String dst; + dst.reserve(2 * n); + + // convert each character + const UInt8* data = reinterpret_cast<const UInt8*>(src.c_str()); + while (n > 0) { + UInt32 c = fromUTF8(data, n); + if (c == s_invalid) { + c = s_replacement; + } + else if (c >= 0x00010000) { + setError(errors); + c = s_replacement; + } + UInt16 ucs2 = static_cast<UInt16>(c); + dst.append(reinterpret_cast<const char*>(&ucs2), 2); + } + + return dst; +} + +String +Unicode::UTF8ToUCS4(const String& src, bool* errors) +{ + // default to success + resetError(errors); + + // get size of input string and reserve some space in output + UInt32 n = (UInt32)src.size(); + String dst; + dst.reserve(4 * n); + + // convert each character + const UInt8* data = reinterpret_cast<const UInt8*>(src.c_str()); + while (n > 0) { + UInt32 c = fromUTF8(data, n); + if (c == s_invalid) { + c = s_replacement; + } + dst.append(reinterpret_cast<const char*>(&c), 4); + } + + return dst; +} + +String +Unicode::UTF8ToUTF16(const String& src, bool* errors) +{ + // default to success + resetError(errors); + + // get size of input string and reserve some space in output + UInt32 n = (UInt32)src.size(); + String dst; + dst.reserve(2 * n); + + // convert each character + const UInt8* data = reinterpret_cast<const UInt8*>(src.c_str()); + while (n > 0) { + UInt32 c = fromUTF8(data, n); + if (c == s_invalid) { + c = s_replacement; + } + else if (c >= 0x00110000) { + setError(errors); + c = s_replacement; + } + if (c < 0x00010000) { + UInt16 ucs2 = static_cast<UInt16>(c); + dst.append(reinterpret_cast<const char*>(&ucs2), 2); + } + else { + c -= 0x00010000; + UInt16 utf16h = static_cast<UInt16>((c >> 10) + 0xd800); + UInt16 utf16l = static_cast<UInt16>((c & 0x03ff) + 0xdc00); + dst.append(reinterpret_cast<const char*>(&utf16h), 2); + dst.append(reinterpret_cast<const char*>(&utf16l), 2); + } + } + + return dst; +} + +String +Unicode::UTF8ToUTF32(const String& src, bool* errors) +{ + // default to success + resetError(errors); + + // get size of input string and reserve some space in output + UInt32 n = (UInt32)src.size(); + String dst; + dst.reserve(4 * n); + + // convert each character + const UInt8* data = reinterpret_cast<const UInt8*>(src.c_str()); + while (n > 0) { + UInt32 c = fromUTF8(data, n); + if (c == s_invalid) { + c = s_replacement; + } + else if (c >= 0x00110000) { + setError(errors); + c = s_replacement; + } + dst.append(reinterpret_cast<const char*>(&c), 4); + } + + return dst; +} + +String +Unicode::UTF8ToText(const String& src, bool* errors) +{ + // default to success + resetError(errors); + + // convert to wide char + UInt32 size; + wchar_t* tmp = UTF8ToWideChar(src, size, errors); + + // convert string to multibyte + int len = ARCH->convStringWCToMB(NULL, tmp, size, errors); + char* mbs = new char[len + 1]; + ARCH->convStringWCToMB(mbs, tmp, size, errors); + String text(mbs, len); + + // clean up + delete[] mbs; + delete[] tmp; + + return text; +} + +String +Unicode::UCS2ToUTF8(const String& src, bool* errors) +{ + // default to success + resetError(errors); + + // convert + UInt32 n = (UInt32)src.size() >> 1; + return doUCS2ToUTF8(reinterpret_cast<const UInt8*>(src.data()), n, errors); +} + +String +Unicode::UCS4ToUTF8(const String& src, bool* errors) +{ + // default to success + resetError(errors); + + // convert + UInt32 n = (UInt32)src.size() >> 2; + return doUCS4ToUTF8(reinterpret_cast<const UInt8*>(src.data()), n, errors); +} + +String +Unicode::UTF16ToUTF8(const String& src, bool* errors) +{ + // default to success + resetError(errors); + + // convert + UInt32 n = (UInt32)src.size() >> 1; + return doUTF16ToUTF8(reinterpret_cast<const UInt8*>(src.data()), n, errors); +} + +String +Unicode::UTF32ToUTF8(const String& src, bool* errors) +{ + // default to success + resetError(errors); + + // convert + UInt32 n = (UInt32)src.size() >> 2; + return doUTF32ToUTF8(reinterpret_cast<const UInt8*>(src.data()), n, errors); +} + +String +Unicode::textToUTF8(const String& src, bool* errors) +{ + // default to success + resetError(errors); + + // convert string to wide characters + UInt32 n = (UInt32)src.size(); + int len = ARCH->convStringMBToWC(NULL, src.c_str(), n, errors); + wchar_t* wcs = new wchar_t[len + 1]; + ARCH->convStringMBToWC(wcs, src.c_str(), n, errors); + + // convert to UTF8 + String utf8 = wideCharToUTF8(wcs, len, errors); + + // clean up + delete[] wcs; + + return utf8; +} + +wchar_t* +Unicode::UTF8ToWideChar(const String& src, UInt32& size, bool* errors) +{ + // convert to platform's wide character encoding + String tmp; + switch (ARCH->getWideCharEncoding()) { + case IArchString::kUCS2: + tmp = UTF8ToUCS2(src, errors); + size = (UInt32)tmp.size() >> 1; + break; + + case IArchString::kUCS4: + tmp = UTF8ToUCS4(src, errors); + size = (UInt32)tmp.size() >> 2; + break; + + case IArchString::kUTF16: + tmp = UTF8ToUTF16(src, errors); + size = (UInt32)tmp.size() >> 1; + break; + + case IArchString::kUTF32: + tmp = UTF8ToUTF32(src, errors); + size = (UInt32)tmp.size() >> 2; + break; + + default: + assert(0 && "unknown wide character encoding"); + } + + // copy to a wchar_t array + wchar_t* dst = new wchar_t[size]; + ::memcpy(dst, tmp.data(), sizeof(wchar_t) * size); + return dst; +} + +String +Unicode::wideCharToUTF8(const wchar_t* src, UInt32 size, bool* errors) +{ + // convert from platform's wide character encoding. + // note -- this must include a wide nul character (independent of + // the String's nul character). + switch (ARCH->getWideCharEncoding()) { + case IArchString::kUCS2: + return doUCS2ToUTF8(reinterpret_cast<const UInt8*>(src), size, errors); + + case IArchString::kUCS4: + return doUCS4ToUTF8(reinterpret_cast<const UInt8*>(src), size, errors); + + case IArchString::kUTF16: + return doUTF16ToUTF8(reinterpret_cast<const UInt8*>(src), size, errors); + + case IArchString::kUTF32: + return doUTF32ToUTF8(reinterpret_cast<const UInt8*>(src), size, errors); + + default: + assert(0 && "unknown wide character encoding"); + return String(); + } +} + +String +Unicode::doUCS2ToUTF8(const UInt8* data, UInt32 n, bool* errors) +{ + // make some space + String dst; + dst.reserve(n); + + // check if first character is 0xfffe or 0xfeff + bool byteSwapped = false; + if (n >= 1) { + switch (decode16(data, false)) { + case 0x0000feff: + data += 2; + --n; + break; + + case 0x0000fffe: + byteSwapped = true; + data += 2; + --n; + break; + + default: + break; + } + } + + // convert each character + for (; n > 0; data += 2, --n) { + UInt32 c = decode16(data, byteSwapped); + toUTF8(dst, c, errors); + } + + return dst; +} + +String +Unicode::doUCS4ToUTF8(const UInt8* data, UInt32 n, bool* errors) +{ + // make some space + String dst; + dst.reserve(n); + + // check if first character is 0xfffe or 0xfeff + bool byteSwapped = false; + if (n >= 1) { + switch (decode32(data, false)) { + case 0x0000feff: + data += 4; + --n; + break; + + case 0x0000fffe: + byteSwapped = true; + data += 4; + --n; + break; + + default: + break; + } + } + + // convert each character + for (; n > 0; data += 4, --n) { + UInt32 c = decode32(data, byteSwapped); + toUTF8(dst, c, errors); + } + + return dst; +} + +String +Unicode::doUTF16ToUTF8(const UInt8* data, UInt32 n, bool* errors) +{ + // make some space + String dst; + dst.reserve(n); + + // check if first character is 0xfffe or 0xfeff + bool byteSwapped = false; + if (n >= 1) { + switch (decode16(data, false)) { + case 0x0000feff: + data += 2; + --n; + break; + + case 0x0000fffe: + byteSwapped = true; + data += 2; + --n; + break; + + default: + break; + } + } + + // convert each character + for (; n > 0; data += 2, --n) { + UInt32 c = decode16(data, byteSwapped); + if (c < 0x0000d800 || c > 0x0000dfff) { + toUTF8(dst, c, errors); + } + else if (n == 1) { + // error -- missing second word + setError(errors); + toUTF8(dst, s_replacement, NULL); + } + else if (c >= 0x0000d800 && c <= 0x0000dbff) { + UInt32 c2 = decode16(data, byteSwapped); + data += 2; + --n; + if (c2 < 0x0000dc00 || c2 > 0x0000dfff) { + // error -- [d800,dbff] not followed by [dc00,dfff] + setError(errors); + toUTF8(dst, s_replacement, NULL); + } + else { + c = (((c - 0x0000d800) << 10) | (c2 - 0x0000dc00)) + 0x00010000; + toUTF8(dst, c, errors); + } + } + else { + // error -- [dc00,dfff] without leading [d800,dbff] + setError(errors); + toUTF8(dst, s_replacement, NULL); + } + } + + return dst; +} + +String +Unicode::doUTF32ToUTF8(const UInt8* data, UInt32 n, bool* errors) +{ + // make some space + String dst; + dst.reserve(n); + + // check if first character is 0xfffe or 0xfeff + bool byteSwapped = false; + if (n >= 1) { + switch (decode32(data, false)) { + case 0x0000feff: + data += 4; + --n; + break; + + case 0x0000fffe: + byteSwapped = true; + data += 4; + --n; + break; + + default: + break; + } + } + + // convert each character + for (; n > 0; data += 4, --n) { + UInt32 c = decode32(data, byteSwapped); + if (c >= 0x00110000) { + setError(errors); + c = s_replacement; + } + toUTF8(dst, c, errors); + } + + return dst; +} + +UInt32 +Unicode::fromUTF8(const UInt8*& data, UInt32& n) +{ + assert(data != NULL); + assert(n != 0); + + // compute character encoding length, checking for overlong + // sequences (i.e. characters that don't use the shortest + // possible encoding). + UInt32 size; + if (data[0] < 0x80) { + // 0xxxxxxx + size = 1; + } + else if (data[0] < 0xc0) { + // 10xxxxxx -- in the middle of a multibyte character. counts + // as one invalid character. + --n; + ++data; + return s_invalid; + } + else if (data[0] < 0xe0) { + // 110xxxxx + size = 2; + } + else if (data[0] < 0xf0) { + // 1110xxxx + size = 3; + } + else if (data[0] < 0xf8) { + // 11110xxx + size = 4; + } + else if (data[0] < 0xfc) { + // 111110xx + size = 5; + } + else if (data[0] < 0xfe) { + // 1111110x + size = 6; + } + else { + // invalid sequence. dunno how many bytes to skip so skip one. + --n; + ++data; + return s_invalid; + } + + // make sure we have enough data + if (size > n) { + data += n; + n = 0; + return s_invalid; + } + + // extract character + UInt32 c; + switch (size) { + case 1: + c = static_cast<UInt32>(data[0]); + break; + + case 2: + c = ((static_cast<UInt32>(data[0]) & 0x1f) << 6) | + ((static_cast<UInt32>(data[1]) & 0x3f) ); + break; + + case 3: + c = ((static_cast<UInt32>(data[0]) & 0x0f) << 12) | + ((static_cast<UInt32>(data[1]) & 0x3f) << 6) | + ((static_cast<UInt32>(data[2]) & 0x3f) ); + break; + + case 4: + c = ((static_cast<UInt32>(data[0]) & 0x07) << 18) | + ((static_cast<UInt32>(data[1]) & 0x3f) << 12) | + ((static_cast<UInt32>(data[1]) & 0x3f) << 6) | + ((static_cast<UInt32>(data[1]) & 0x3f) ); + break; + + case 5: + c = ((static_cast<UInt32>(data[0]) & 0x03) << 24) | + ((static_cast<UInt32>(data[1]) & 0x3f) << 18) | + ((static_cast<UInt32>(data[1]) & 0x3f) << 12) | + ((static_cast<UInt32>(data[1]) & 0x3f) << 6) | + ((static_cast<UInt32>(data[1]) & 0x3f) ); + break; + + case 6: + c = ((static_cast<UInt32>(data[0]) & 0x01) << 30) | + ((static_cast<UInt32>(data[1]) & 0x3f) << 24) | + ((static_cast<UInt32>(data[1]) & 0x3f) << 18) | + ((static_cast<UInt32>(data[1]) & 0x3f) << 12) | + ((static_cast<UInt32>(data[1]) & 0x3f) << 6) | + ((static_cast<UInt32>(data[1]) & 0x3f) ); + break; + + default: + assert(0 && "invalid size"); + return s_invalid; + } + + // check that all bytes after the first have the pattern 10xxxxxx. + // truncated sequences are treated as a single malformed character. + bool truncated = false; + switch (size) { + case 6: + if ((data[5] & 0xc0) != 0x80) { + truncated = true; + size = 5; + } + // fall through + + case 5: + if ((data[4] & 0xc0) != 0x80) { + truncated = true; + size = 4; + } + // fall through + + case 4: + if ((data[3] & 0xc0) != 0x80) { + truncated = true; + size = 3; + } + // fall through + + case 3: + if ((data[2] & 0xc0) != 0x80) { + truncated = true; + size = 2; + } + // fall through + + case 2: + if ((data[1] & 0xc0) != 0x80) { + truncated = true; + size = 1; + } + } + + // update parameters + data += size; + n -= size; + + // invalid if sequence was truncated + if (truncated) { + return s_invalid; + } + + // check for characters that didn't use the smallest possible encoding + static UInt32 s_minChar[] = { + 0, + 0x00000000, + 0x00000080, + 0x00000800, + 0x00010000, + 0x00200000, + 0x04000000 + }; + if (c < s_minChar[size]) { + return s_invalid; + } + + // check for characters not in ISO-10646 + if (c >= 0x0000d800 && c <= 0x0000dfff) { + return s_invalid; + } + if (c >= 0x0000fffe && c <= 0x0000ffff) { + return s_invalid; + } + + return c; +} + +void +Unicode::toUTF8(String& dst, UInt32 c, bool* errors) +{ + UInt8 data[6]; + + // handle characters outside the valid range + if ((c >= 0x0000d800 && c <= 0x0000dfff) || c >= 0x80000000) { + setError(errors); + c = s_replacement; + } + + // convert to UTF-8 + if (c < 0x00000080) { + data[0] = static_cast<UInt8>(c); + dst.append(reinterpret_cast<char*>(data), 1); + } + else if (c < 0x00000800) { + data[0] = static_cast<UInt8>(((c >> 6) & 0x0000001f) + 0xc0); + data[1] = static_cast<UInt8>((c & 0x0000003f) + 0x80); + dst.append(reinterpret_cast<char*>(data), 2); + } + else if (c < 0x00010000) { + data[0] = static_cast<UInt8>(((c >> 12) & 0x0000000f) + 0xe0); + data[1] = static_cast<UInt8>(((c >> 6) & 0x0000003f) + 0x80); + data[2] = static_cast<UInt8>((c & 0x0000003f) + 0x80); + dst.append(reinterpret_cast<char*>(data), 3); + } + else if (c < 0x00200000) { + data[0] = static_cast<UInt8>(((c >> 18) & 0x00000007) + 0xf0); + data[1] = static_cast<UInt8>(((c >> 12) & 0x0000003f) + 0x80); + data[2] = static_cast<UInt8>(((c >> 6) & 0x0000003f) + 0x80); + data[3] = static_cast<UInt8>((c & 0x0000003f) + 0x80); + dst.append(reinterpret_cast<char*>(data), 4); + } + else if (c < 0x04000000) { + data[0] = static_cast<UInt8>(((c >> 24) & 0x00000003) + 0xf8); + data[1] = static_cast<UInt8>(((c >> 18) & 0x0000003f) + 0x80); + data[2] = static_cast<UInt8>(((c >> 12) & 0x0000003f) + 0x80); + data[3] = static_cast<UInt8>(((c >> 6) & 0x0000003f) + 0x80); + data[4] = static_cast<UInt8>((c & 0x0000003f) + 0x80); + dst.append(reinterpret_cast<char*>(data), 5); + } + else if (c < 0x80000000) { + data[0] = static_cast<UInt8>(((c >> 30) & 0x00000001) + 0xfc); + data[1] = static_cast<UInt8>(((c >> 24) & 0x0000003f) + 0x80); + data[2] = static_cast<UInt8>(((c >> 18) & 0x0000003f) + 0x80); + data[3] = static_cast<UInt8>(((c >> 12) & 0x0000003f) + 0x80); + data[4] = static_cast<UInt8>(((c >> 6) & 0x0000003f) + 0x80); + data[5] = static_cast<UInt8>((c & 0x0000003f) + 0x80); + dst.append(reinterpret_cast<char*>(data), 6); + } + else { + assert(0 && "character out of range"); + } +} |
