string_data.hpp 13 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410
  1. /*************************************************************************
  2. *
  3. * Copyright 2016 Realm Inc.
  4. *
  5. * Licensed under the Apache License, Version 2.0 (the "License");
  6. * you may not use this file except in compliance with the License.
  7. * You may obtain a copy of the License at
  8. *
  9. * http://www.apache.org/licenses/LICENSE-2.0
  10. *
  11. * Unless required by applicable law or agreed to in writing, software
  12. * distributed under the License is distributed on an "AS IS" BASIS,
  13. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  14. * See the License for the specific language governing permissions and
  15. * limitations under the License.
  16. *
  17. **************************************************************************/
  18. #ifndef REALM_STRING_HPP
  19. #define REALM_STRING_HPP
  20. #include <realm/null.hpp>
  21. #include <realm/util/features.h>
  22. #include <realm/util/optional.hpp>
  23. #include <algorithm>
  24. #include <array>
  25. #include <cfloat>
  26. #include <cmath>
  27. #include <cstddef>
  28. #include <cstring>
  29. #include <ostream>
  30. #include <string>
  31. namespace realm {
  32. /// Selects CityHash64 on 64-bit platforms, and Murmur2 on 32-bit platforms.
  33. /// This is what libc++ does, and it is a good general choice for a
  34. /// non-cryptographic hash function (suitable for std::unordered_map etc.).
  35. size_t murmur2_or_cityhash(const unsigned char* data, size_t len) noexcept;
  36. uint_least32_t murmur2_32(const unsigned char* data, size_t len) noexcept;
  37. uint_least64_t cityhash_64(const unsigned char* data, size_t len) noexcept;
  38. /// A reference to a chunk of character data.
  39. ///
  40. /// An instance of this class can be thought of as a type tag on a region of
  41. /// memory. It does not own the referenced memory, nor does it in any other way
  42. /// attempt to manage the lifetime of it.
  43. ///
  44. /// A null character inside the referenced region is considered a part of the
  45. /// string by Realm.
  46. ///
  47. /// For compatibility with C-style strings, when a string is stored in a Realm
  48. /// database, it is always followed by a terminating null character, regardless
  49. /// of whether the string itself has internal null characters. This means that
  50. /// when a StringData object is extracted from Realm, the referenced region is
  51. /// guaranteed to be followed immediately by an extra null character, but that
  52. /// null character is not inside the referenced region. Therefore, all of the
  53. /// following forms are guaranteed to return a pointer to a null-terminated
  54. /// string:
  55. ///
  56. /// \code{.cpp}
  57. ///
  58. /// group.get_table_name(...).data()
  59. /// table.get_column_name().data()
  60. /// table.get_string(...).data()
  61. /// table.get_mixed(...).get_string().data()
  62. ///
  63. /// \endcode
  64. ///
  65. /// Note that in general, no assumptions can be made about what follows a string
  66. /// that is referenced by a StringData object, or whether anything follows it at
  67. /// all. In particular, the receiver of a StringData object cannot assume that
  68. /// the referenced string is followed by a null character unless there is an
  69. /// externally provided guarantee.
  70. ///
  71. /// This class makes it possible to distinguish between a 'null' reference and a
  72. /// reference to the empty string (see is_null()).
  73. ///
  74. /// \sa BinaryData
  75. /// \sa Mixed
  76. class StringData {
  77. public:
  78. /// Construct a null reference.
  79. constexpr StringData() noexcept = default;
  80. /// If \a external_data is 'null', \a data_size must be zero.
  81. constexpr StringData(const char* external_data, size_t data_size) noexcept;
  82. template <class T, class A>
  83. constexpr StringData(const std::basic_string<char, T, A>&);
  84. template <class T, class A>
  85. operator std::basic_string<char, T, A>() const;
  86. template <class T, class A>
  87. constexpr StringData(const util::Optional<std::basic_string<char, T, A>>&);
  88. constexpr StringData(std::string_view sv);
  89. constexpr StringData(const null&) noexcept {}
  90. /// Initialize from a zero terminated C style string. Pass null to construct
  91. /// a null reference.
  92. constexpr StringData(const char* c_str) noexcept;
  93. constexpr char operator[](size_t i) const noexcept;
  94. constexpr const char* data() const noexcept;
  95. constexpr size_t size() const noexcept;
  96. /// Is this a null reference?
  97. ///
  98. /// An instance of StringData is a null reference when, and only when the
  99. /// stored size is zero (size()) and the stored pointer is the null pointer
  100. /// (data()).
  101. ///
  102. /// In the case of the empty string, the stored size is still zero, but the
  103. /// stored pointer is **not** the null pointer. It could for example point
  104. /// to the empty string literal. Note that the actual value of the pointer
  105. /// is immaterial in this case (as long as it is not zero), because when the
  106. /// size is zero, it is an error to dereference the pointer.
  107. ///
  108. /// Conversion of a StringData object to `bool` yields the logical negation
  109. /// of the result of calling this function. In other words, a StringData
  110. /// object is converted to true if it is not the null reference, otherwise
  111. /// it is converted to false.
  112. constexpr bool is_null() const noexcept;
  113. friend constexpr bool operator==(const StringData&, const StringData&) noexcept;
  114. friend constexpr bool operator!=(const StringData&, const StringData&) noexcept;
  115. //@{
  116. /// Trivial bytewise lexicographical comparison.
  117. friend bool operator<(const StringData&, const StringData&) noexcept;
  118. friend bool operator>(const StringData&, const StringData&) noexcept;
  119. friend bool operator<=(const StringData&, const StringData&) noexcept;
  120. friend bool operator>=(const StringData&, const StringData&) noexcept;
  121. //@}
  122. constexpr bool begins_with(StringData) const noexcept;
  123. constexpr bool ends_with(StringData) const noexcept;
  124. constexpr bool contains(StringData) const noexcept;
  125. bool contains(StringData d, const std::array<uint8_t, 256> &charmap) const noexcept;
  126. // Wildcard matching ('?' for single char, '*' for zero or more chars)
  127. // case insensitive version in unicode.hpp
  128. bool like(StringData) const noexcept;
  129. //@{
  130. /// Undefined behavior if \a n, \a i, or <tt>i+n</tt> is greater than
  131. /// size().
  132. constexpr StringData prefix(size_t n) const noexcept;
  133. constexpr StringData suffix(size_t n) const noexcept;
  134. constexpr StringData substr(size_t i, size_t n) const noexcept;
  135. constexpr StringData substr(size_t i) const noexcept;
  136. //@}
  137. template <class C, class T>
  138. friend std::basic_ostream<C, T>& operator<<(std::basic_ostream<C, T>&, const StringData&);
  139. constexpr explicit operator bool() const noexcept;
  140. constexpr explicit operator std::string_view() const noexcept
  141. {
  142. return std::string_view(m_data, m_size);
  143. }
  144. /// If the StringData is NULL, the hash is 0. Otherwise, the function
  145. /// `murmur2_or_cityhash()` is called on the data.
  146. size_t hash() const noexcept;
  147. private:
  148. const char* m_data = nullptr;
  149. size_t m_size = 0;
  150. static bool matchlike(const StringData& text, const StringData& pattern) noexcept;
  151. static bool matchlike_ins(const StringData& text, const StringData& pattern_upper,
  152. const StringData& pattern_lower) noexcept;
  153. friend bool string_like_ins(StringData, StringData) noexcept;
  154. friend bool string_like_ins(StringData, StringData, StringData) noexcept;
  155. };
  156. // Implementation:
  157. constexpr inline StringData::StringData(const char* external_data, size_t data_size) noexcept
  158. : m_data(external_data)
  159. , m_size(data_size)
  160. {
  161. REALM_ASSERT_DEBUG(external_data || data_size == 0);
  162. }
  163. constexpr inline StringData::StringData(std::string_view sv)
  164. : m_data(sv.data())
  165. , m_size(sv.size())
  166. {
  167. }
  168. template <class T, class A>
  169. constexpr inline StringData::StringData(const std::basic_string<char, T, A>& s)
  170. : m_data(s.data())
  171. , m_size(s.size())
  172. {
  173. }
  174. template <class T, class A>
  175. inline StringData::operator std::basic_string<char, T, A>() const
  176. {
  177. return std::basic_string<char, T, A>(m_data, m_size);
  178. }
  179. template <class T, class A>
  180. constexpr inline StringData::StringData(const util::Optional<std::basic_string<char, T, A>>& s)
  181. : m_data(s ? s->data() : nullptr)
  182. , m_size(s ? s->size() : 0)
  183. {
  184. }
  185. constexpr inline StringData::StringData(const char* c_str) noexcept
  186. : m_data(c_str)
  187. {
  188. if (c_str)
  189. m_size = std::char_traits<char>::length(c_str);
  190. }
  191. constexpr inline char StringData::operator[](size_t i) const noexcept
  192. {
  193. return m_data[i];
  194. }
  195. constexpr inline const char* StringData::data() const noexcept
  196. {
  197. return m_data;
  198. }
  199. constexpr inline size_t StringData::size() const noexcept
  200. {
  201. return m_size;
  202. }
  203. constexpr inline bool StringData::is_null() const noexcept
  204. {
  205. return !m_data;
  206. }
  207. constexpr inline bool operator==(const StringData& a, const StringData& b) noexcept
  208. {
  209. return a.m_size == b.m_size && a.is_null() == b.is_null() && safe_equal(a.m_data, a.m_data + a.m_size, b.m_data);
  210. }
  211. constexpr inline bool operator!=(const StringData& a, const StringData& b) noexcept
  212. {
  213. return !(a == b);
  214. }
  215. inline bool operator<(const StringData& a, const StringData& b) noexcept
  216. {
  217. if (a.is_null() && !b.is_null()) {
  218. // Null strings are smaller than all other strings, and not
  219. // equal to empty strings.
  220. return true;
  221. }
  222. return std::lexicographical_compare(a.m_data, a.m_data + a.m_size, b.m_data, b.m_data + b.m_size);
  223. }
  224. inline bool operator>(const StringData& a, const StringData& b) noexcept
  225. {
  226. return b < a;
  227. }
  228. inline bool operator<=(const StringData& a, const StringData& b) noexcept
  229. {
  230. return !(b < a);
  231. }
  232. inline bool operator>=(const StringData& a, const StringData& b) noexcept
  233. {
  234. return !(a < b);
  235. }
  236. constexpr inline bool StringData::begins_with(StringData d) const noexcept
  237. {
  238. if (is_null() && !d.is_null())
  239. return false;
  240. return d.m_size <= m_size && safe_equal(m_data, m_data + d.m_size, d.m_data);
  241. }
  242. constexpr inline bool StringData::ends_with(StringData d) const noexcept
  243. {
  244. if (is_null() && !d.is_null())
  245. return false;
  246. return d.m_size <= m_size && safe_equal(m_data + m_size - d.m_size, m_data + m_size, d.m_data);
  247. }
  248. constexpr inline bool StringData::contains(StringData d) const noexcept
  249. {
  250. if (is_null() && !d.is_null())
  251. return false;
  252. return d.m_size == 0 || std::search(m_data, m_data + m_size, d.m_data, d.m_data + d.m_size) != m_data + m_size;
  253. }
  254. /// This method takes an array that maps chars to distance that can be moved (and zero for chars not in needle),
  255. /// allowing the method to apply Boyer-Moore for quick substring search
  256. /// The map is calculated in the StringNode<Contains> class (so it can be reused across searches)
  257. inline bool StringData::contains(StringData d, const std::array<uint8_t, 256> &charmap) const noexcept
  258. {
  259. if (is_null() && !d.is_null())
  260. return false;
  261. size_t needle_size = d.size();
  262. if (needle_size == 0)
  263. return true;
  264. // Prepare vars to avoid lookups in loop
  265. size_t last_char_pos = d.size()-1;
  266. unsigned char lastChar = d[last_char_pos];
  267. // Do Boyer-Moore search
  268. size_t p = last_char_pos;
  269. while (p < m_size) {
  270. unsigned char c = m_data[p]; // Get candidate for last char
  271. if (c == lastChar) {
  272. StringData candidate = substr(p-needle_size+1, needle_size);
  273. if (candidate == d)
  274. return true; // text found!
  275. }
  276. // If we don't have a match, see how far we can move char_pos
  277. if (charmap[c] == 0)
  278. p += needle_size; // char was not present in search string
  279. else
  280. p += charmap[c];
  281. }
  282. return false;
  283. }
  284. inline bool StringData::like(StringData d) const noexcept
  285. {
  286. if (is_null() || d.is_null()) {
  287. return (is_null() && d.is_null());
  288. }
  289. return matchlike(*this, d);
  290. }
  291. constexpr inline StringData StringData::prefix(size_t n) const noexcept
  292. {
  293. return substr(0, n);
  294. }
  295. constexpr inline StringData StringData::suffix(size_t n) const noexcept
  296. {
  297. return substr(m_size - n);
  298. }
  299. constexpr inline StringData StringData::substr(size_t i, size_t n) const noexcept
  300. {
  301. return StringData(m_data + i, n);
  302. }
  303. constexpr inline StringData StringData::substr(size_t i) const noexcept
  304. {
  305. return substr(i, m_size - i);
  306. }
  307. template <class C, class T>
  308. inline std::basic_ostream<C, T>& operator<<(std::basic_ostream<C, T>& out, const StringData& d)
  309. {
  310. if (d.is_null()) {
  311. out << "<null>";
  312. }
  313. else {
  314. for (const char* i = d.m_data; i != d.m_data + d.m_size; ++i)
  315. out << *i;
  316. }
  317. return out;
  318. }
  319. constexpr inline StringData::operator bool() const noexcept
  320. {
  321. return !is_null();
  322. }
  323. inline size_t StringData::hash() const noexcept
  324. {
  325. if (is_null())
  326. return 0;
  327. auto unsigned_data = reinterpret_cast<const unsigned char*>(m_data);
  328. return murmur2_or_cityhash(unsigned_data, m_size);
  329. }
  330. } // namespace realm
  331. namespace std {
  332. template <>
  333. struct hash<::realm::StringData> {
  334. inline size_t operator()(const ::realm::StringData& str) const noexcept
  335. {
  336. return str.hash();
  337. }
  338. };
  339. } // namespace std
  340. #endif // REALM_STRING_HPP