utf8.hpp 15 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380
  1. /*************************************************************************
  2. *
  3. * Copyright 2016 Realm Inc.
  4. *
  5. * Licensed under the Apache License, Version 2.0 (the "License");
  6. * you may not use this file except in compliance with the License.
  7. * You may obtain a copy of the License at
  8. *
  9. * http://www.apache.org/licenses/LICENSE-2.0
  10. *
  11. * Unless required by applicable law or agreed to in writing, software
  12. * distributed under the License is distributed on an "AS IS" BASIS,
  13. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  14. * See the License for the specific language governing permissions and
  15. * limitations under the License.
  16. *
  17. **************************************************************************/
  18. #ifndef REALM_UTIL_UTF8_HPP
  19. #define REALM_UTIL_UTF8_HPP
  20. #include <cstdint>
  21. #include <string>
  22. #include <realm/util/safe_int_ops.hpp>
  23. #include <realm/string_data.hpp>
  24. #include <realm/util/features.h>
  25. #include <realm/utilities.hpp>
  26. namespace realm {
  27. namespace util {
  /// Transcoder between UTF-8 and UTF-16 encoded text.
  ///
  /// Every member is static; the struct only binds the UTF-16 code unit type
  /// and the traits used to convert that type to and from integers.
  ///
  /// \tparam Char16 Code unit type for UTF-16. Must be an integral type with
  /// at least 16 bits.
  ///
  /// \tparam Traits16 Character traits for \a Char16. Must define
  /// to_int_type() and to_char_type().
  template <class Char16, class Traits16 = std::char_traits<Char16>>
  struct Utf8x16 {
      /// Convert as much of the UTF-8 input as possible to UTF-16.
      ///
      /// On return, \a in_begin and \a out_begin have been advanced to the
      /// position at which conversion stopped.
      ///
      /// \return True if the whole input was converted, or if conversion
      /// stopped because the next character did not fit into the output
      /// buffer. False if conversion stopped because of invalid input. Which
      /// of the two is returned when invalid input and a full output buffer
      /// coincide is unspecified.
      ///
      /// Throws only if Traits16::to_char_type() throws.
      static bool to_utf16(const char*& in_begin, const char* in_end,
                           Char16*& out_begin, Char16* out_end);

      /// The reverse of to_utf16(): convert as much of the UTF-16 input as
      /// possible to UTF-8, with the same conventions for the return value
      /// and for how \a in_begin and \a out_begin are advanced.
      ///
      /// Throws only if Traits16::to_int_type() throws.
      static bool to_utf8(const Char16*& in_begin, const Char16* in_end,
                          char*& out_begin, char* out_end);

      /// Count the UTF-16 elements required to hold the transcoded form of
      /// the given UTF-8 string.
      ///
      /// If \a in_begin != \a in_end on return, counting stopped at invalid
      /// UTF-8 input, and the result covers only the part of the input that
      /// was examined.
      ///
      /// Only a few kinds of invalid UTF-8 are detected, so this function is
      /// not a general UTF-8 validator.
      static size_t find_utf16_buf_size(const char*& in_begin, const char* in_end);

      /// Count the UTF-8 bytes required to hold the transcoded form of the
      /// given UTF-16 string.
      ///
      /// If \a in_begin != \a in_end on return, counting stopped either at
      /// invalid UTF-16 input or to keep the returned \c size_t from
      /// overflowing, and the result covers only the part of the input that
      /// was examined.
      ///
      /// Only a few kinds of invalid UTF-16 are detected, so this function is
      /// not a general UTF-16 validator.
      static size_t find_utf8_buf_size(const Char16*& in_begin, const Char16* in_end);
  };
  69. // Implementation:
  70. // Adapted from reference implementation.
  71. // http://www.unicode.org/resources/utf8.html
  72. // http://www.bsdua.org/files/unicode.tar.gz
  73. template <class Char16, class Traits16>
  74. inline bool Utf8x16<Char16, Traits16>::to_utf16(const char*& in_begin, const char* const in_end, Char16*& out_begin,
  75. Char16* const out_end)
  76. {
  77. typedef std::char_traits<char> traits8;
  78. bool invalid = false;
  79. const char* in = in_begin;
  80. Char16* out = out_begin;
  81. while (in != in_end) {
  82. if (REALM_UNLIKELY(out == out_end)) {
  83. break; // Need space in output buffer
  84. }
  85. REALM_ASSERT(&in[0] >= in_begin && &in[0] < in_end);
  86. uint_fast16_t v1 = uint_fast16_t(traits8::to_int_type(in[0]));
  87. if (REALM_LIKELY(v1 < 0x80)) { // One byte
  88. // UTF-8 layout: 0xxxxxxx
  89. *out++ = Traits16::to_char_type(v1);
  90. in += 1;
  91. continue;
  92. }
  93. if (REALM_UNLIKELY(v1 < 0xC0)) {
  94. invalid = true;
  95. break; // Invalid first byte of UTF-8 sequence
  96. }
  97. if (REALM_LIKELY(v1 < 0xE0)) { // Two bytes
  98. if (REALM_UNLIKELY(in_end - in < 2)) {
  99. invalid = true;
  100. break; // Incomplete UTF-8 sequence
  101. }
  102. REALM_ASSERT(&in[1] >= in_begin && &in[1] < in_end);
  103. uint_fast16_t v2 = uint_fast16_t(traits8::to_int_type(in[1]));
  104. // UTF-8 layout: 110xxxxx 10xxxxxx
  105. if (REALM_UNLIKELY((v2 & 0xC0) != 0x80)) {
  106. invalid = true;
  107. break; // Invalid continuation byte
  108. }
  109. uint_fast16_t v = uint_fast16_t(((v1 & 0x1F) << 6) | ((v2 & 0x3F) << 0));
  110. if (REALM_UNLIKELY(v < 0x80)) {
  111. invalid = true;
  112. break; // Overlong encoding is invalid
  113. }
  114. *out++ = Traits16::to_char_type(v);
  115. in += 2;
  116. continue;
  117. }
  118. if (REALM_LIKELY(v1 < 0xF0)) { // Three bytes
  119. if (REALM_UNLIKELY(in_end - in < 3)) {
  120. invalid = true;
  121. break; // Incomplete UTF-8 sequence
  122. }
  123. REALM_ASSERT(&in[1] >= in_begin && &in[2] < in_end);
  124. uint_fast16_t v2 = uint_fast16_t(traits8::to_int_type(in[1]));
  125. uint_fast16_t v3 = uint_fast16_t(traits8::to_int_type(in[2]));
  126. // UTF-8 layout: 1110xxxx 10xxxxxx 10xxxxxx
  127. if (REALM_UNLIKELY((v2 & 0xC0) != 0x80 || (v3 & 0xC0) != 0x80)) {
  128. invalid = true;
  129. break; // Invalid continuation byte
  130. }
  131. uint_fast16_t v = uint_fast16_t(((v1 & 0x0F) << 12) | ((v2 & 0x3F) << 6) | ((v3 & 0x3F) << 0));
  132. if (REALM_UNLIKELY(v < 0x800)) {
  133. invalid = true;
  134. break; // Overlong encoding is invalid
  135. }
  136. if (REALM_UNLIKELY(0xD800 <= v && v < 0xE000)) {
  137. invalid = true;
  138. break; // Illegal code point range (reserved for UTF-16 surrogate pairs)
  139. }
  140. *out++ = Traits16::to_char_type(v);
  141. in += 3;
  142. continue;
  143. }
  144. if (REALM_UNLIKELY(out + 1 == out_end)) {
  145. break; // Need space in output buffer for surrogate pair
  146. }
  147. if (REALM_LIKELY(v1 < 0xF8)) { // Four bytes
  148. if (REALM_UNLIKELY(in_end - in < 4)) {
  149. invalid = true;
  150. break; // Incomplete UTF-8 sequence
  151. }
  152. uint_fast32_t w1 = uint_fast32_t(v1); // 16 bit -> 32 bit
  153. REALM_ASSERT(&in[1] >= in_begin && &in[3] < in_end);
  154. uint_fast32_t v2 = uint_fast32_t(traits8::to_int_type(in[1])); // 32 bit intended
  155. uint_fast16_t v3 = uint_fast16_t(traits8::to_int_type(in[2])); // 16 bit intended
  156. uint_fast16_t v4 = uint_fast16_t(traits8::to_int_type(in[3])); // 16 bit intended
  157. // UTF-8 layout: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
  158. if (REALM_UNLIKELY((v2 & 0xC0) != 0x80 || (v3 & 0xC0) != 0x80 || (v4 & 0xC0) != 0x80)) {
  159. invalid = true;
  160. break; // Invalid continuation byte
  161. }
  162. uint_fast32_t v = uint_fast32_t(((w1 & 0x07) << 18) | // Parenthesis is 32 bit partial result
  163. ((v2 & 0x3F) << 12) | // Parenthesis is 32 bit partial result
  164. ((v3 & 0x3F) << 6) | // Parenthesis is 16 bit partial result
  165. ((v4 & 0x3F) << 0)); // Parenthesis is 16 bit partial result
  166. if (REALM_UNLIKELY(v < 0x10000)) {
  167. invalid = true;
  168. break; // Overlong encoding is invalid
  169. }
  170. if (REALM_UNLIKELY(0x110000 <= v)) {
  171. invalid = true;
  172. break; // Code point too big for UTF-16
  173. }
  174. v -= 0x10000l;
  175. *out++ = Traits16::to_char_type(0xD800 + (v / 0x400));
  176. *out++ = Traits16::to_char_type(0xDC00 + (v % 0x400));
  177. in += 4;
  178. continue;
  179. }
  180. // Invalid first byte of UTF-8 sequence, or code point too big for UTF-16
  181. invalid = true;
  182. break;
  183. }
  184. REALM_ASSERT(in >= in_begin && in <= in_end);
  185. REALM_ASSERT(out >= out_begin && out <= out_end);
  186. in_begin = in;
  187. out_begin = out;
  188. return !invalid;
  189. }
  190. template <class Char16, class Traits16>
  191. inline size_t Utf8x16<Char16, Traits16>::find_utf16_buf_size(const char*& in_begin, const char* const in_end)
  192. {
  193. typedef std::char_traits<char> traits8;
  194. size_t num_out = 0;
  195. const char* in = in_begin;
  196. while (in != in_end) {
  197. REALM_ASSERT(&in[0] >= in_begin && &in[0] < in_end);
  198. uint_fast16_t v1 = uint_fast16_t(traits8::to_int_type(in[0]));
  199. if (REALM_LIKELY(v1 < 0x80)) { // One byte
  200. num_out += 1;
  201. in += 1;
  202. continue;
  203. }
  204. if (REALM_UNLIKELY(v1 < 0xC0)) {
  205. break; // Invalid first byte of UTF-8 sequence
  206. }
  207. if (REALM_LIKELY(v1 < 0xE0)) { // Two bytes
  208. if (REALM_UNLIKELY(in_end - in < 2)) {
  209. break; // Incomplete UTF-8 sequence
  210. }
  211. num_out += 1;
  212. in += 2;
  213. continue;
  214. }
  215. if (REALM_LIKELY(v1 < 0xF0)) { // Three bytes
  216. if (REALM_UNLIKELY(in_end - in < 3)) {
  217. break; // Incomplete UTF-8 sequence
  218. }
  219. num_out += 1;
  220. in += 3;
  221. continue;
  222. }
  223. if (REALM_LIKELY(v1 < 0xF8)) { // Four bytes
  224. if (REALM_UNLIKELY(in_end - in < 4)) {
  225. break; // Incomplete UTF-8 sequence
  226. }
  227. num_out += 2; // Surrogate pair
  228. in += 4;
  229. continue;
  230. }
  231. // Invalid first byte of UTF-8 sequence, or code point too big for UTF-16
  232. break;
  233. }
  234. REALM_ASSERT(in >= in_begin && in <= in_end);
  235. in_begin = in;
  236. return num_out;
  237. }
  238. // Adapted from reference implementation.
  239. // http://www.unicode.org/resources/utf8.html
  240. // http://www.bsdua.org/files/unicode.tar.gz
  241. template <class Char16, class Traits16>
  242. inline bool Utf8x16<Char16, Traits16>::to_utf8(const Char16*& in_begin, const Char16* const in_end, char*& out_begin,
  243. char* const out_end)
  244. {
  245. typedef std::char_traits<char> traits8;
  246. typedef typename traits8::int_type traits8_int_type;
  247. bool invalid = false;
  248. const Char16* in = in_begin;
  249. char* out = out_begin;
  250. while (in != in_end) {
  251. REALM_ASSERT(&in[0] >= in_begin && &in[0] < in_end);
  252. uint_fast16_t v1 = uint_fast16_t(Traits16::to_int_type(in[0]));
  253. if (REALM_LIKELY(v1 < 0x80)) {
  254. if (REALM_UNLIKELY(out == out_end)) {
  255. break; // Not enough output buffer space
  256. }
  257. // UTF-8 layout: 0xxxxxxx
  258. REALM_ASSERT(out >= out_begin && out < out_end);
  259. *out++ = traits8::to_char_type(traits8_int_type(v1));
  260. in += 1;
  261. continue;
  262. }
  263. if (REALM_LIKELY(v1 < 0x800)) {
  264. if (REALM_UNLIKELY(out_end - out < 2)) {
  265. break; // Not enough output buffer space
  266. }
  267. // UTF-8 layout: 110xxxxx 10xxxxxx
  268. *out++ = traits8::to_char_type(traits8_int_type(0xC0 + v1 / 0x40));
  269. REALM_ASSERT(out >= out_begin && out < out_end);
  270. *out++ = traits8::to_char_type(traits8_int_type(0x80 + v1 % 0x40));
  271. in += 1;
  272. continue;
  273. }
  274. if (REALM_LIKELY(v1 < 0xD800 || 0xE000 <= v1)) {
  275. if (REALM_UNLIKELY(out_end - out < 3)) {
  276. break; // Not enough output buffer space
  277. }
  278. // UTF-8 layout: 1110xxxx 10xxxxxx 10xxxxxx
  279. REALM_ASSERT(out >= out_begin && out + 2 < out_end);
  280. *out++ = traits8::to_char_type(traits8_int_type(0xE0 + v1 / 0x1000));
  281. *out++ = traits8::to_char_type(traits8_int_type(0x80 + v1 / 0x40 % 0x40));
  282. *out++ = traits8::to_char_type(traits8_int_type(0x80 + v1 % 0x40));
  283. in += 1;
  284. continue;
  285. }
  286. // Surrogate pair
  287. if (REALM_UNLIKELY(out_end - out < 4)) {
  288. break; // Not enough output buffer space
  289. }
  290. if (REALM_UNLIKELY(0xDC00 <= v1)) {
  291. invalid = true;
  292. break; // Invalid first half of surrogate pair
  293. }
  294. if (REALM_UNLIKELY(in + 1 == in_end)) {
  295. invalid = true;
  296. break; // Incomplete surrogate pair
  297. }
  298. REALM_ASSERT(&in[1] >= in_begin && &in[1] < in_end);
  299. uint_fast16_t v2 = uint_fast16_t(Traits16::to_int_type(in[1]));
  300. if (REALM_UNLIKELY(v2 < 0xDC00 || 0xE000 <= v2)) {
  301. invalid = true;
  302. break; // Invalid second half of surrogate pair
  303. }
  304. uint_fast32_t v = 0x10000l + (uint_fast32_t(v1 - 0xD800) * 0x400 + (v2 - 0xDC00));
  305. // UTF-8 layout: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
  306. REALM_ASSERT(out >= out_begin && out + 3 < out_end);
  307. *out++ = traits8::to_char_type(traits8_int_type(0xF0 + v / 0x40000));
  308. *out++ = traits8::to_char_type(traits8_int_type(0x80 + v / 0x1000 % 0x40));
  309. *out++ = traits8::to_char_type(traits8_int_type(0x80 + v / 0x40 % 0x40));
  310. *out++ = traits8::to_char_type(traits8_int_type(0x80 + v % 0x40));
  311. in += 2;
  312. }
  313. REALM_ASSERT(in >= in_begin && in <= in_end);
  314. REALM_ASSERT(out >= out_begin && out <= out_end);
  315. in_begin = in;
  316. out_begin = out;
  317. return !invalid;
  318. }
  319. template <class Char16, class Traits16>
  320. inline size_t Utf8x16<Char16, Traits16>::find_utf8_buf_size(const Char16*& in_begin, const Char16* const in_end)
  321. {
  322. size_t num_out = 0;
  323. const Char16* in = in_begin;
  324. while (in != in_end) {
  325. REALM_ASSERT(&in[0] >= in_begin && &in[0] < in_end);
  326. uint_fast16_t v = uint_fast16_t(Traits16::to_int_type(in[0]));
  327. if (REALM_LIKELY(v < 0x80)) {
  328. if (REALM_UNLIKELY(int_add_with_overflow_detect(num_out, 1)))
  329. break; // Avoid overflow
  330. in += 1;
  331. }
  332. else if (REALM_LIKELY(v < 0x800)) {
  333. if (REALM_UNLIKELY(int_add_with_overflow_detect(num_out, 2)))
  334. break; // Avoid overflow
  335. in += 1;
  336. }
  337. else if (REALM_LIKELY(v < 0xD800 || 0xE000 <= v)) {
  338. if (REALM_UNLIKELY(int_add_with_overflow_detect(num_out, 3)))
  339. break; // Avoid overflow
  340. in += 1;
  341. }
  342. else {
  343. if (REALM_UNLIKELY(in + 1 == in_end)) {
  344. break; // Incomplete surrogate pair
  345. }
  346. if (REALM_UNLIKELY(int_add_with_overflow_detect(num_out, 4)))
  347. break; // Avoid overflow
  348. in += 2;
  349. }
  350. }
  351. REALM_ASSERT(in >= in_begin && in <= in_end);
  352. in_begin = in;
  353. return num_out;
  354. }
  355. } // namespace util
  356. } // namespace realm
  357. #endif // REALM_UTIL_UTF8_HPP