/*************************************************************************
 *
 * Copyright 2016 Realm Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 **************************************************************************/
  18. #ifndef REALM_UNICODE_HPP
  19. #define REALM_UNICODE_HPP
#include <array>
#include <cstddef>
#include <cstdint>
#include <locale>
#include <string>

#include <realm/string_data.hpp>
#include <realm/util/features.h>
#include <realm/utilities.hpp>
  26. namespace realm {
  27. enum string_compare_method_t {
  28. STRING_COMPARE_CORE,
  29. STRING_COMPARE_CPP11,
  30. STRING_COMPARE_CALLBACK,
  31. STRING_COMPARE_CORE_SIMILAR
  32. };
  33. extern StringCompareCallback string_compare_callback;
  34. extern string_compare_method_t string_compare_method;
  35. // Description for set_string_compare_method():
  36. //
  37. // Short summary: iOS language binding: call
  38. // set_string_compare_method() for fast but slightly inaccurate sort in some countries, or
  39. // set_string_compare_method(2, callbackptr) for slow but precise sort (see callbackptr below)
  40. //
  41. // Different countries ('locales') have different sorting order for strings and letters. Because there unfortunatly
  42. // doesn't exist any unified standardized way to compare strings in C++ on multiple platforms, we need this method.
  43. //
  44. // It determins how sorting a TableView by a String column must take place. The 'method' argument can be:
  45. //
  46. // 0: Fast core-only compare (no OS/framework calls). LIMITATIONS: Works only upto 'Latin Extended 2' (unicodes
  47. // 0...591). Also, sorting order is according to 'en_US' so it may be slightly inaccurate for some countries.
  48. // 'callback' argument is ignored.
  49. //
  50. // Return value: Always 'true'
  51. //
  52. // 1: Native C++11 method if core is compiled as C++11. Gives precise sorting according
  53. // to user's current locale. LIMITATIONS: Currently works only on Windows and on Linux with clang. Does NOT work on
  54. // iOS (due to only 'C' locale being available in CoreFoundation, which puts 'Z' before 'a'). Unknown if works on
  55. // Windows Phone / Android. Furthermore it does NOT work on Linux with gcc 4.7 or 4.8 (lack of c++11 feature that
  56. // can convert utf8->wstring without calls to setlocale()).
  57. //
  58. // Return value: 'true' if supported, otherwise 'false' (if so, then previous setting, if any, is preserved).
  59. //
  60. // 2: Callback method. Language binding / C++ user must provide a utf-8 callback method of prototype:
  61. // bool callback(const char* string1, const char* string2) where 'callback' must return bool(string1 < string2).
  62. //
  63. // Return value: Always 'true'
  64. //
  65. // Default is method = 0 if the function is never called
  66. //
  67. // NOT THREAD SAFE! Call once during initialization or make sure it's not called simultaneously with different
  68. // arguments. The setting is remembered per-process; it does NOT need to be called prior to each sort
  69. bool set_string_compare_method(string_compare_method_t method, StringCompareCallback callback);
  70. // Return size in bytes of utf8 character. No error checking
  71. size_t sequence_length(char lead);
// Limitations for case insensitive string search
// Case insensitive search (equal, begins_with, ends_with, like and contains)
// only works for unicodes 0...0x7f which is the same as the 0...127
// ASCII character set (letters a-z and A-Z).
// It does *not* work for the 0...255 ANSI character set that contains
// characters from many European countries like Germany, France, Denmark,
// etc.
// It also does not work for characters from non-western countries like
// Japan, Russia, Arabia, etc.
// If there exist characters outside the ASCII range either in the text
// to be searched for, or in the Realm string column which is searched
// in, then the compare yields a random result such that the row may or
// may not be included in the result set.
  85. // Return bool(string1 < string2)
  86. bool utf8_compare(StringData string1, StringData string2);
  87. // Return unicode value of character.
  88. uint32_t utf8value(const char* character);
  89. inline bool equal_sequence(const char*& begin, const char* end, const char* begin2);
// FIXME: The current approach to case insensitive comparison requires
// that case mappings can be done in a way that does not change the
// number of bytes used to encode the individual Unicode
// character. This is not generally the case, so, as far as I can see,
// this approach has no future.
//
// FIXME: The current approach to case insensitive comparison relies
// on checking each "haystack" character against the corresponding
// character in both a lower cased and an upper cased version of the
// "needle". While this leads to efficient comparison, it ignores the
// fact that "case folding" is the only correct approach to case
// insensitive comparison in a locale agnostic Unicode
// environment.
//
// See
// http://www.w3.org/International/wiki/Case_folding
// http://userguide.icu-project.org/transforms/casemappings#TOC-Case-Folding.
//
// The ideal API would probably be something like this:
//
// case_fold: utf_8 -> case_folded
// equal_case_fold: (needle_case_folded, single_haystack_entry_utf_8) -> found
// search_case_fold: (needle_case_folded, huge_haystack_string_utf_8) -> found_at_position
//
// The case folded form would probably be using UTF-32 or UTF-16.
  115. /// If successful, returns a string of the same size as \a source.
  116. /// Returns none if invalid UTF-8 encoding was encountered.
  117. util::Optional<std::string> case_map(StringData source, bool upper);
  118. enum IgnoreErrorsTag { IgnoreErrors };
  119. std::string case_map(StringData source, bool upper, IgnoreErrorsTag);
  120. /// Assumes that the sizes of \a needle_upper and \a needle_lower are
  121. /// identical to the size of \a haystack. Returns false if the needle
  122. /// is different from the haystack.
  123. bool equal_case_fold(StringData haystack, const char* needle_upper, const char* needle_lower);
  124. /// Assumes that the sizes of \a needle_upper and \a needle_lower are
  125. /// both equal to \a needle_size. Returns haystack.size() if the
  126. /// needle was not found.
  127. size_t search_case_fold(StringData haystack, const char* needle_upper, const char* needle_lower, size_t needle_size);
  128. /// Assumes that the sizes of \a needle_upper and \a needle_lower are
  129. /// both equal to \a needle_size. Returns false if the
  130. /// needle was not found.
  131. bool contains_ins(StringData haystack, const char* needle_upper, const char* needle_lower, size_t needle_size, const std::array<uint8_t, 256> &charmap);
  132. /// Case insensitive wildcard matching ('?' for single char, '*' for zero or more chars)
  133. bool string_like_ins(StringData text, StringData pattern) noexcept;
  134. bool string_like_ins(StringData text, StringData upper, StringData lower) noexcept;
  135. } // namespace realm
  136. #endif // REALM_UNICODE_HPP