aboutsummaryrefslogtreecommitdiffstats
path: root/src/lib/base/Unicode.cpp
diff options
context:
space:
mode:
authorLibravatarUnit 193 <unit193@ubuntu.com>2018-04-25 18:07:30 -0400
committerLibravatarUnit 193 <unit193@ubuntu.com>2018-04-25 18:07:30 -0400
commit9b1b081cfdb1c0fb6457278775e0823f8bc10f62 (patch)
treece8840148d8445055ba9e4f12263b2208f234c16 /src/lib/base/Unicode.cpp
Import Upstream version 2.0.0+dfsgupstream/2.0.0+dfsg
Diffstat (limited to 'src/lib/base/Unicode.cpp')
-rw-r--r--src/lib/base/Unicode.cpp784
1 files changed, 784 insertions, 0 deletions
diff --git a/src/lib/base/Unicode.cpp b/src/lib/base/Unicode.cpp
new file mode 100644
index 0000000..6a077e7
--- /dev/null
+++ b/src/lib/base/Unicode.cpp
@@ -0,0 +1,784 @@
+/*
+ * barrier -- mouse and keyboard sharing utility
+ * Copyright (C) 2012-2016 Symless Ltd.
+ * Copyright (C) 2002 Chris Schoeneman
+ *
+ * This package is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * found in the file LICENSE that should have accompanied this file.
+ *
+ * This package is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "arch/Arch.h"
+#include "base/Unicode.h"
+
+#include <cstring>
+
+//
+// local utility functions
+//
+
+inline
+static
+UInt16
+decode16(const UInt8* n, bool byteSwapped)
+{
+ union x16 {
+ UInt8 n8[2];
+ UInt16 n16;
+ } c;
+ if (byteSwapped) {
+ c.n8[0] = n[1];
+ c.n8[1] = n[0];
+ }
+ else {
+ c.n8[0] = n[0];
+ c.n8[1] = n[1];
+ }
+ return c.n16;
+}
+
+inline
+static
+UInt32
+decode32(const UInt8* n, bool byteSwapped)
+{
+ union x32 {
+ UInt8 n8[4];
+ UInt32 n32;
+ } c;
+ if (byteSwapped) {
+ c.n8[0] = n[3];
+ c.n8[1] = n[2];
+ c.n8[2] = n[1];
+ c.n8[3] = n[0];
+ }
+ else {
+ c.n8[0] = n[0];
+ c.n8[1] = n[1];
+ c.n8[2] = n[2];
+ c.n8[3] = n[3];
+ }
+ return c.n32;
+}
+
+inline
+static
+void
+resetError(bool* errors)
+{
+ if (errors != NULL) {
+ *errors = false;
+ }
+}
+
+inline
+static
+void
+setError(bool* errors)
+{
+ if (errors != NULL) {
+ *errors = true;
+ }
+}
+
+
+//
+// Unicode
+//
+
+UInt32 Unicode::s_invalid = 0x0000ffff;
+UInt32 Unicode::s_replacement = 0x0000fffd;
+
+bool
+Unicode::isUTF8(const String& src)
+{
+ // convert and test each character
+ const UInt8* data = reinterpret_cast<const UInt8*>(src.c_str());
+ for (UInt32 n = (UInt32)src.size(); n > 0; ) {
+ if (fromUTF8(data, n) == s_invalid) {
+ return false;
+ }
+ }
+ return true;
+}
+
+String
+Unicode::UTF8ToUCS2(const String& src, bool* errors)
+{
+ // default to success
+ resetError(errors);
+
+ // get size of input string and reserve some space in output
+ UInt32 n = (UInt32)src.size();
+ String dst;
+ dst.reserve(2 * n);
+
+ // convert each character
+ const UInt8* data = reinterpret_cast<const UInt8*>(src.c_str());
+ while (n > 0) {
+ UInt32 c = fromUTF8(data, n);
+ if (c == s_invalid) {
+ c = s_replacement;
+ }
+ else if (c >= 0x00010000) {
+ setError(errors);
+ c = s_replacement;
+ }
+ UInt16 ucs2 = static_cast<UInt16>(c);
+ dst.append(reinterpret_cast<const char*>(&ucs2), 2);
+ }
+
+ return dst;
+}
+
+String
+Unicode::UTF8ToUCS4(const String& src, bool* errors)
+{
+ // default to success
+ resetError(errors);
+
+ // get size of input string and reserve some space in output
+ UInt32 n = (UInt32)src.size();
+ String dst;
+ dst.reserve(4 * n);
+
+ // convert each character
+ const UInt8* data = reinterpret_cast<const UInt8*>(src.c_str());
+ while (n > 0) {
+ UInt32 c = fromUTF8(data, n);
+ if (c == s_invalid) {
+ c = s_replacement;
+ }
+ dst.append(reinterpret_cast<const char*>(&c), 4);
+ }
+
+ return dst;
+}
+
+String
+Unicode::UTF8ToUTF16(const String& src, bool* errors)
+{
+ // default to success
+ resetError(errors);
+
+ // get size of input string and reserve some space in output
+ UInt32 n = (UInt32)src.size();
+ String dst;
+ dst.reserve(2 * n);
+
+ // convert each character
+ const UInt8* data = reinterpret_cast<const UInt8*>(src.c_str());
+ while (n > 0) {
+ UInt32 c = fromUTF8(data, n);
+ if (c == s_invalid) {
+ c = s_replacement;
+ }
+ else if (c >= 0x00110000) {
+ setError(errors);
+ c = s_replacement;
+ }
+ if (c < 0x00010000) {
+ UInt16 ucs2 = static_cast<UInt16>(c);
+ dst.append(reinterpret_cast<const char*>(&ucs2), 2);
+ }
+ else {
+ c -= 0x00010000;
+ UInt16 utf16h = static_cast<UInt16>((c >> 10) + 0xd800);
+ UInt16 utf16l = static_cast<UInt16>((c & 0x03ff) + 0xdc00);
+ dst.append(reinterpret_cast<const char*>(&utf16h), 2);
+ dst.append(reinterpret_cast<const char*>(&utf16l), 2);
+ }
+ }
+
+ return dst;
+}
+
+String
+Unicode::UTF8ToUTF32(const String& src, bool* errors)
+{
+ // default to success
+ resetError(errors);
+
+ // get size of input string and reserve some space in output
+ UInt32 n = (UInt32)src.size();
+ String dst;
+ dst.reserve(4 * n);
+
+ // convert each character
+ const UInt8* data = reinterpret_cast<const UInt8*>(src.c_str());
+ while (n > 0) {
+ UInt32 c = fromUTF8(data, n);
+ if (c == s_invalid) {
+ c = s_replacement;
+ }
+ else if (c >= 0x00110000) {
+ setError(errors);
+ c = s_replacement;
+ }
+ dst.append(reinterpret_cast<const char*>(&c), 4);
+ }
+
+ return dst;
+}
+
+String
+Unicode::UTF8ToText(const String& src, bool* errors)
+{
+ // default to success
+ resetError(errors);
+
+ // convert to wide char
+ UInt32 size;
+ wchar_t* tmp = UTF8ToWideChar(src, size, errors);
+
+ // convert string to multibyte
+ int len = ARCH->convStringWCToMB(NULL, tmp, size, errors);
+ char* mbs = new char[len + 1];
+ ARCH->convStringWCToMB(mbs, tmp, size, errors);
+ String text(mbs, len);
+
+ // clean up
+ delete[] mbs;
+ delete[] tmp;
+
+ return text;
+}
+
+String
+Unicode::UCS2ToUTF8(const String& src, bool* errors)
+{
+ // default to success
+ resetError(errors);
+
+ // convert
+ UInt32 n = (UInt32)src.size() >> 1;
+ return doUCS2ToUTF8(reinterpret_cast<const UInt8*>(src.data()), n, errors);
+}
+
+String
+Unicode::UCS4ToUTF8(const String& src, bool* errors)
+{
+ // default to success
+ resetError(errors);
+
+ // convert
+ UInt32 n = (UInt32)src.size() >> 2;
+ return doUCS4ToUTF8(reinterpret_cast<const UInt8*>(src.data()), n, errors);
+}
+
+String
+Unicode::UTF16ToUTF8(const String& src, bool* errors)
+{
+ // default to success
+ resetError(errors);
+
+ // convert
+ UInt32 n = (UInt32)src.size() >> 1;
+ return doUTF16ToUTF8(reinterpret_cast<const UInt8*>(src.data()), n, errors);
+}
+
+String
+Unicode::UTF32ToUTF8(const String& src, bool* errors)
+{
+ // default to success
+ resetError(errors);
+
+ // convert
+ UInt32 n = (UInt32)src.size() >> 2;
+ return doUTF32ToUTF8(reinterpret_cast<const UInt8*>(src.data()), n, errors);
+}
+
+String
+Unicode::textToUTF8(const String& src, bool* errors)
+{
+ // default to success
+ resetError(errors);
+
+ // convert string to wide characters
+ UInt32 n = (UInt32)src.size();
+ int len = ARCH->convStringMBToWC(NULL, src.c_str(), n, errors);
+ wchar_t* wcs = new wchar_t[len + 1];
+ ARCH->convStringMBToWC(wcs, src.c_str(), n, errors);
+
+ // convert to UTF8
+ String utf8 = wideCharToUTF8(wcs, len, errors);
+
+ // clean up
+ delete[] wcs;
+
+ return utf8;
+}
+
+wchar_t*
+Unicode::UTF8ToWideChar(const String& src, UInt32& size, bool* errors)
+{
+ // convert to platform's wide character encoding
+ String tmp;
+ switch (ARCH->getWideCharEncoding()) {
+ case IArchString::kUCS2:
+ tmp = UTF8ToUCS2(src, errors);
+ size = (UInt32)tmp.size() >> 1;
+ break;
+
+ case IArchString::kUCS4:
+ tmp = UTF8ToUCS4(src, errors);
+ size = (UInt32)tmp.size() >> 2;
+ break;
+
+ case IArchString::kUTF16:
+ tmp = UTF8ToUTF16(src, errors);
+ size = (UInt32)tmp.size() >> 1;
+ break;
+
+ case IArchString::kUTF32:
+ tmp = UTF8ToUTF32(src, errors);
+ size = (UInt32)tmp.size() >> 2;
+ break;
+
+ default:
+ assert(0 && "unknown wide character encoding");
+ }
+
+ // copy to a wchar_t array
+ wchar_t* dst = new wchar_t[size];
+ ::memcpy(dst, tmp.data(), sizeof(wchar_t) * size);
+ return dst;
+}
+
+String
+Unicode::wideCharToUTF8(const wchar_t* src, UInt32 size, bool* errors)
+{
+ // convert from platform's wide character encoding.
+ // note -- this must include a wide nul character (independent of
+ // the String's nul character).
+ switch (ARCH->getWideCharEncoding()) {
+ case IArchString::kUCS2:
+ return doUCS2ToUTF8(reinterpret_cast<const UInt8*>(src), size, errors);
+
+ case IArchString::kUCS4:
+ return doUCS4ToUTF8(reinterpret_cast<const UInt8*>(src), size, errors);
+
+ case IArchString::kUTF16:
+ return doUTF16ToUTF8(reinterpret_cast<const UInt8*>(src), size, errors);
+
+ case IArchString::kUTF32:
+ return doUTF32ToUTF8(reinterpret_cast<const UInt8*>(src), size, errors);
+
+ default:
+ assert(0 && "unknown wide character encoding");
+ return String();
+ }
+}
+
+String
+Unicode::doUCS2ToUTF8(const UInt8* data, UInt32 n, bool* errors)
+{
+ // make some space
+ String dst;
+ dst.reserve(n);
+
+ // check if first character is 0xfffe or 0xfeff
+ bool byteSwapped = false;
+ if (n >= 1) {
+ switch (decode16(data, false)) {
+ case 0x0000feff:
+ data += 2;
+ --n;
+ break;
+
+ case 0x0000fffe:
+ byteSwapped = true;
+ data += 2;
+ --n;
+ break;
+
+ default:
+ break;
+ }
+ }
+
+ // convert each character
+ for (; n > 0; data += 2, --n) {
+ UInt32 c = decode16(data, byteSwapped);
+ toUTF8(dst, c, errors);
+ }
+
+ return dst;
+}
+
+String
+Unicode::doUCS4ToUTF8(const UInt8* data, UInt32 n, bool* errors)
+{
+ // make some space
+ String dst;
+ dst.reserve(n);
+
+ // check if first character is 0xfffe or 0xfeff
+ bool byteSwapped = false;
+ if (n >= 1) {
+ switch (decode32(data, false)) {
+ case 0x0000feff:
+ data += 4;
+ --n;
+ break;
+
+ case 0x0000fffe:
+ byteSwapped = true;
+ data += 4;
+ --n;
+ break;
+
+ default:
+ break;
+ }
+ }
+
+ // convert each character
+ for (; n > 0; data += 4, --n) {
+ UInt32 c = decode32(data, byteSwapped);
+ toUTF8(dst, c, errors);
+ }
+
+ return dst;
+}
+
+String
+Unicode::doUTF16ToUTF8(const UInt8* data, UInt32 n, bool* errors)
+{
+ // make some space
+ String dst;
+ dst.reserve(n);
+
+ // check if first character is 0xfffe or 0xfeff
+ bool byteSwapped = false;
+ if (n >= 1) {
+ switch (decode16(data, false)) {
+ case 0x0000feff:
+ data += 2;
+ --n;
+ break;
+
+ case 0x0000fffe:
+ byteSwapped = true;
+ data += 2;
+ --n;
+ break;
+
+ default:
+ break;
+ }
+ }
+
+ // convert each character
+ for (; n > 0; data += 2, --n) {
+ UInt32 c = decode16(data, byteSwapped);
+ if (c < 0x0000d800 || c > 0x0000dfff) {
+ toUTF8(dst, c, errors);
+ }
+ else if (n == 1) {
+ // error -- missing second word
+ setError(errors);
+ toUTF8(dst, s_replacement, NULL);
+ }
+ else if (c >= 0x0000d800 && c <= 0x0000dbff) {
+ UInt32 c2 = decode16(data, byteSwapped);
+ data += 2;
+ --n;
+ if (c2 < 0x0000dc00 || c2 > 0x0000dfff) {
+ // error -- [d800,dbff] not followed by [dc00,dfff]
+ setError(errors);
+ toUTF8(dst, s_replacement, NULL);
+ }
+ else {
+ c = (((c - 0x0000d800) << 10) | (c2 - 0x0000dc00)) + 0x00010000;
+ toUTF8(dst, c, errors);
+ }
+ }
+ else {
+ // error -- [dc00,dfff] without leading [d800,dbff]
+ setError(errors);
+ toUTF8(dst, s_replacement, NULL);
+ }
+ }
+
+ return dst;
+}
+
+String
+Unicode::doUTF32ToUTF8(const UInt8* data, UInt32 n, bool* errors)
+{
+ // make some space
+ String dst;
+ dst.reserve(n);
+
+ // check if first character is 0xfffe or 0xfeff
+ bool byteSwapped = false;
+ if (n >= 1) {
+ switch (decode32(data, false)) {
+ case 0x0000feff:
+ data += 4;
+ --n;
+ break;
+
+ case 0x0000fffe:
+ byteSwapped = true;
+ data += 4;
+ --n;
+ break;
+
+ default:
+ break;
+ }
+ }
+
+ // convert each character
+ for (; n > 0; data += 4, --n) {
+ UInt32 c = decode32(data, byteSwapped);
+ if (c >= 0x00110000) {
+ setError(errors);
+ c = s_replacement;
+ }
+ toUTF8(dst, c, errors);
+ }
+
+ return dst;
+}
+
+UInt32
+Unicode::fromUTF8(const UInt8*& data, UInt32& n)
+{
+ assert(data != NULL);
+ assert(n != 0);
+
+ // compute character encoding length, checking for overlong
+ // sequences (i.e. characters that don't use the shortest
+ // possible encoding).
+ UInt32 size;
+ if (data[0] < 0x80) {
+ // 0xxxxxxx
+ size = 1;
+ }
+ else if (data[0] < 0xc0) {
+ // 10xxxxxx -- in the middle of a multibyte character. counts
+ // as one invalid character.
+ --n;
+ ++data;
+ return s_invalid;
+ }
+ else if (data[0] < 0xe0) {
+ // 110xxxxx
+ size = 2;
+ }
+ else if (data[0] < 0xf0) {
+ // 1110xxxx
+ size = 3;
+ }
+ else if (data[0] < 0xf8) {
+ // 11110xxx
+ size = 4;
+ }
+ else if (data[0] < 0xfc) {
+ // 111110xx
+ size = 5;
+ }
+ else if (data[0] < 0xfe) {
+ // 1111110x
+ size = 6;
+ }
+ else {
+ // invalid sequence. dunno how many bytes to skip so skip one.
+ --n;
+ ++data;
+ return s_invalid;
+ }
+
+ // make sure we have enough data
+ if (size > n) {
+ data += n;
+ n = 0;
+ return s_invalid;
+ }
+
+ // extract character
+ UInt32 c;
+ switch (size) {
+ case 1:
+ c = static_cast<UInt32>(data[0]);
+ break;
+
+ case 2:
+ c = ((static_cast<UInt32>(data[0]) & 0x1f) << 6) |
+ ((static_cast<UInt32>(data[1]) & 0x3f) );
+ break;
+
+ case 3:
+ c = ((static_cast<UInt32>(data[0]) & 0x0f) << 12) |
+ ((static_cast<UInt32>(data[1]) & 0x3f) << 6) |
+ ((static_cast<UInt32>(data[2]) & 0x3f) );
+ break;
+
+ case 4:
+ c = ((static_cast<UInt32>(data[0]) & 0x07) << 18) |
+ ((static_cast<UInt32>(data[1]) & 0x3f) << 12) |
+ ((static_cast<UInt32>(data[1]) & 0x3f) << 6) |
+ ((static_cast<UInt32>(data[1]) & 0x3f) );
+ break;
+
+ case 5:
+ c = ((static_cast<UInt32>(data[0]) & 0x03) << 24) |
+ ((static_cast<UInt32>(data[1]) & 0x3f) << 18) |
+ ((static_cast<UInt32>(data[1]) & 0x3f) << 12) |
+ ((static_cast<UInt32>(data[1]) & 0x3f) << 6) |
+ ((static_cast<UInt32>(data[1]) & 0x3f) );
+ break;
+
+ case 6:
+ c = ((static_cast<UInt32>(data[0]) & 0x01) << 30) |
+ ((static_cast<UInt32>(data[1]) & 0x3f) << 24) |
+ ((static_cast<UInt32>(data[1]) & 0x3f) << 18) |
+ ((static_cast<UInt32>(data[1]) & 0x3f) << 12) |
+ ((static_cast<UInt32>(data[1]) & 0x3f) << 6) |
+ ((static_cast<UInt32>(data[1]) & 0x3f) );
+ break;
+
+ default:
+ assert(0 && "invalid size");
+ return s_invalid;
+ }
+
+ // check that all bytes after the first have the pattern 10xxxxxx.
+ // truncated sequences are treated as a single malformed character.
+ bool truncated = false;
+ switch (size) {
+ case 6:
+ if ((data[5] & 0xc0) != 0x80) {
+ truncated = true;
+ size = 5;
+ }
+ // fall through
+
+ case 5:
+ if ((data[4] & 0xc0) != 0x80) {
+ truncated = true;
+ size = 4;
+ }
+ // fall through
+
+ case 4:
+ if ((data[3] & 0xc0) != 0x80) {
+ truncated = true;
+ size = 3;
+ }
+ // fall through
+
+ case 3:
+ if ((data[2] & 0xc0) != 0x80) {
+ truncated = true;
+ size = 2;
+ }
+ // fall through
+
+ case 2:
+ if ((data[1] & 0xc0) != 0x80) {
+ truncated = true;
+ size = 1;
+ }
+ }
+
+ // update parameters
+ data += size;
+ n -= size;
+
+ // invalid if sequence was truncated
+ if (truncated) {
+ return s_invalid;
+ }
+
+ // check for characters that didn't use the smallest possible encoding
+ static UInt32 s_minChar[] = {
+ 0,
+ 0x00000000,
+ 0x00000080,
+ 0x00000800,
+ 0x00010000,
+ 0x00200000,
+ 0x04000000
+ };
+ if (c < s_minChar[size]) {
+ return s_invalid;
+ }
+
+ // check for characters not in ISO-10646
+ if (c >= 0x0000d800 && c <= 0x0000dfff) {
+ return s_invalid;
+ }
+ if (c >= 0x0000fffe && c <= 0x0000ffff) {
+ return s_invalid;
+ }
+
+ return c;
+}
+
+void
+Unicode::toUTF8(String& dst, UInt32 c, bool* errors)
+{
+ UInt8 data[6];
+
+ // handle characters outside the valid range
+ if ((c >= 0x0000d800 && c <= 0x0000dfff) || c >= 0x80000000) {
+ setError(errors);
+ c = s_replacement;
+ }
+
+ // convert to UTF-8
+ if (c < 0x00000080) {
+ data[0] = static_cast<UInt8>(c);
+ dst.append(reinterpret_cast<char*>(data), 1);
+ }
+ else if (c < 0x00000800) {
+ data[0] = static_cast<UInt8>(((c >> 6) & 0x0000001f) + 0xc0);
+ data[1] = static_cast<UInt8>((c & 0x0000003f) + 0x80);
+ dst.append(reinterpret_cast<char*>(data), 2);
+ }
+ else if (c < 0x00010000) {
+ data[0] = static_cast<UInt8>(((c >> 12) & 0x0000000f) + 0xe0);
+ data[1] = static_cast<UInt8>(((c >> 6) & 0x0000003f) + 0x80);
+ data[2] = static_cast<UInt8>((c & 0x0000003f) + 0x80);
+ dst.append(reinterpret_cast<char*>(data), 3);
+ }
+ else if (c < 0x00200000) {
+ data[0] = static_cast<UInt8>(((c >> 18) & 0x00000007) + 0xf0);
+ data[1] = static_cast<UInt8>(((c >> 12) & 0x0000003f) + 0x80);
+ data[2] = static_cast<UInt8>(((c >> 6) & 0x0000003f) + 0x80);
+ data[3] = static_cast<UInt8>((c & 0x0000003f) + 0x80);
+ dst.append(reinterpret_cast<char*>(data), 4);
+ }
+ else if (c < 0x04000000) {
+ data[0] = static_cast<UInt8>(((c >> 24) & 0x00000003) + 0xf8);
+ data[1] = static_cast<UInt8>(((c >> 18) & 0x0000003f) + 0x80);
+ data[2] = static_cast<UInt8>(((c >> 12) & 0x0000003f) + 0x80);
+ data[3] = static_cast<UInt8>(((c >> 6) & 0x0000003f) + 0x80);
+ data[4] = static_cast<UInt8>((c & 0x0000003f) + 0x80);
+ dst.append(reinterpret_cast<char*>(data), 5);
+ }
+ else if (c < 0x80000000) {
+ data[0] = static_cast<UInt8>(((c >> 30) & 0x00000001) + 0xfc);
+ data[1] = static_cast<UInt8>(((c >> 24) & 0x0000003f) + 0x80);
+ data[2] = static_cast<UInt8>(((c >> 18) & 0x0000003f) + 0x80);
+ data[3] = static_cast<UInt8>(((c >> 12) & 0x0000003f) + 0x80);
+ data[4] = static_cast<UInt8>(((c >> 6) & 0x0000003f) + 0x80);
+ data[5] = static_cast<UInt8>((c & 0x0000003f) + 0x80);
+ dst.append(reinterpret_cast<char*>(data), 6);
+ }
+ else {
+ assert(0 && "character out of range");
+ }
+}