string_data.hpp 12 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413
  1. /*************************************************************************
  2. *
  3. * Copyright 2016 Realm Inc.
  4. *
  5. * Licensed under the Apache License, Version 2.0 (the "License");
  6. * you may not use this file except in compliance with the License.
  7. * You may obtain a copy of the License at
  8. *
  9. * http://www.apache.org/licenses/LICENSE-2.0
  10. *
  11. * Unless required by applicable law or agreed to in writing, software
  12. * distributed under the License is distributed on an "AS IS" BASIS,
  13. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  14. * See the License for the specific language governing permissions and
  15. * limitations under the License.
  16. *
  17. **************************************************************************/
  18. #ifndef REALM_STRING_HPP
  19. #define REALM_STRING_HPP
  20. #include <realm/null.hpp>
  21. #include <realm/util/features.h>
  22. #include <realm/util/optional.hpp>
  23. #include <algorithm>
  24. #include <array>
  25. #include <cfloat>
  26. #include <cmath>
  27. #include <cstddef>
  28. #include <cstring>
  29. #include <ostream>
  30. #include <string>
  31. namespace realm {
  32. /// Selects CityHash64 on 64-bit platforms, and Murmur2 on 32-bit platforms.
  33. /// This is what libc++ does, and it is a good general choice for a
  34. /// non-cryptographic hash function (suitable for std::unordered_map etc.).
  35. size_t murmur2_or_cityhash(const unsigned char* data, size_t len) noexcept;
  36. uint_least32_t murmur2_32(const unsigned char* data, size_t len) noexcept;
  37. uint_least64_t cityhash_64(const unsigned char* data, size_t len) noexcept;
  38. /// A reference to a chunk of character data.
  39. ///
  40. /// An instance of this class can be thought of as a type tag on a region of
  41. /// memory. It does not own the referenced memory, nor does it in any other way
  42. /// attempt to manage the lifetime of it.
  43. ///
  44. /// A null character inside the referenced region is considered a part of the
  45. /// string by Realm.
  46. ///
  47. /// For compatibility with C-style strings, when a string is stored in a Realm
  48. /// database, it is always followed by a terminating null character, regardless
  49. /// of whether the string itself has internal null characters. This means that
  50. /// when a StringData object is extracted from Realm, the referenced region is
  51. /// guaranteed to be followed immediately by an extra null character, but that
  52. /// null character is not inside the referenced region. Therefore, all of the
  53. /// following forms are guaranteed to return a pointer to a null-terminated
  54. /// string:
  55. ///
  56. /// \code{.cpp}
  57. ///
  58. /// group.get_table_name(...).data()
  59. /// table.get_column_name().data()
  60. /// table.get_string(...).data()
  61. /// table.get_mixed(...).get_string().data()
  62. ///
  63. /// \endcode
  64. ///
  65. /// Note that in general, no assumptions can be made about what follows a string
  66. /// that is referenced by a StringData object, or whether anything follows it at
  67. /// all. In particular, the receiver of a StringData object cannot assume that
  68. /// the referenced string is followed by a null character unless there is an
  69. /// externally provided guarantee.
  70. ///
  71. /// This class makes it possible to distinguish between a 'null' reference and a
  72. /// reference to the empty string (see is_null()).
  73. ///
  74. /// \sa BinaryData
  75. /// \sa Mixed
  76. class StringData {
  77. public:
  78. /// Construct a null reference.
  79. StringData() noexcept;
  80. StringData(int) = delete;
  81. /// If \a external_data is 'null', \a data_size must be zero.
  82. StringData(const char* external_data, size_t data_size) noexcept;
  83. template <class T, class A>
  84. StringData(const std::basic_string<char, T, A>&);
  85. template <class T, class A>
  86. operator std::basic_string<char, T, A>() const;
  87. template <class T, class A>
  88. StringData(const util::Optional<std::basic_string<char, T, A>>&);
  89. StringData(const null&) noexcept;
  90. /// Initialize from a zero terminated C style string. Pass null to construct
  91. /// a null reference.
  92. StringData(const char* c_str) noexcept;
  93. char operator[](size_t i) const noexcept;
  94. const char* data() const noexcept;
  95. size_t size() const noexcept;
  96. /// Is this a null reference?
  97. ///
  98. /// An instance of StringData is a null reference when, and only when the
  99. /// stored size is zero (size()) and the stored pointer is the null pointer
  100. /// (data()).
  101. ///
  102. /// In the case of the empty string, the stored size is still zero, but the
  103. /// stored pointer is **not** the null pointer. It could for example point
  104. /// to the empty string literal. Note that the actual value of the pointer
  105. /// is immaterial in this case (as long as it is not zero), because when the
  106. /// size is zero, it is an error to dereference the pointer.
  107. ///
  108. /// Conversion of a StringData object to `bool` yields the logical negation
  109. /// of the result of calling this function. In other words, a StringData
  110. /// object is converted to true if it is not the null reference, otherwise
  111. /// it is converted to false.
  112. bool is_null() const noexcept;
  113. friend bool operator==(const StringData&, const StringData&) noexcept;
  114. friend bool operator!=(const StringData&, const StringData&) noexcept;
  115. //@{
  116. /// Trivial bytewise lexicographical comparison.
  117. friend bool operator<(const StringData&, const StringData&) noexcept;
  118. friend bool operator>(const StringData&, const StringData&) noexcept;
  119. friend bool operator<=(const StringData&, const StringData&) noexcept;
  120. friend bool operator>=(const StringData&, const StringData&) noexcept;
  121. //@}
  122. bool begins_with(StringData) const noexcept;
  123. bool ends_with(StringData) const noexcept;
  124. bool contains(StringData) const noexcept;
  125. bool contains(StringData d, const std::array<uint8_t, 256> &charmap) const noexcept;
  126. // Wildcard matching ('?' for single char, '*' for zero or more chars)
  127. // case insensitive version in unicode.hpp
  128. bool like(StringData) const noexcept;
  129. //@{
  130. /// Undefined behavior if \a n, \a i, or <tt>i+n</tt> is greater than
  131. /// size().
  132. StringData prefix(size_t n) const noexcept;
  133. StringData suffix(size_t n) const noexcept;
  134. StringData substr(size_t i, size_t n) const noexcept;
  135. StringData substr(size_t i) const noexcept;
  136. //@}
  137. template <class C, class T>
  138. friend std::basic_ostream<C, T>& operator<<(std::basic_ostream<C, T>&, const StringData&);
  139. explicit operator bool() const noexcept;
  140. /// If the StringData is NULL, the hash is 0. Otherwise, the function
  141. /// `murmur2_or_cityhash()` is called on the data.
  142. size_t hash() const noexcept;
  143. private:
  144. const char* m_data;
  145. size_t m_size;
  146. static bool matchlike(const StringData& text, const StringData& pattern) noexcept;
  147. static bool matchlike_ins(const StringData& text, const StringData& pattern_upper,
  148. const StringData& pattern_lower) noexcept;
  149. friend bool string_like_ins(StringData, StringData) noexcept;
  150. friend bool string_like_ins(StringData, StringData, StringData) noexcept;
  151. };
  152. // Implementation:
  153. inline StringData::StringData() noexcept
  154. : m_data(nullptr)
  155. , m_size(0)
  156. {
  157. }
  158. inline StringData::StringData(const char* external_data, size_t data_size) noexcept
  159. : m_data(external_data)
  160. , m_size(data_size)
  161. {
  162. REALM_ASSERT_DEBUG(external_data || data_size == 0);
  163. }
  164. template <class T, class A>
  165. inline StringData::StringData(const std::basic_string<char, T, A>& s)
  166. : m_data(s.data())
  167. , m_size(s.size())
  168. {
  169. }
  170. template <class T, class A>
  171. inline StringData::operator std::basic_string<char, T, A>() const
  172. {
  173. return std::basic_string<char, T, A>(m_data, m_size);
  174. }
  175. template <class T, class A>
  176. inline StringData::StringData(const util::Optional<std::basic_string<char, T, A>>& s)
  177. : m_data(s ? s->data() : nullptr)
  178. , m_size(s ? s->size() : 0)
  179. {
  180. }
  181. inline StringData::StringData(const null&) noexcept
  182. : m_data(nullptr)
  183. , m_size(0)
  184. {
  185. }
  186. inline StringData::StringData(const char* c_str) noexcept
  187. : m_data(c_str)
  188. , m_size(0)
  189. {
  190. if (c_str)
  191. m_size = std::char_traits<char>::length(c_str);
  192. }
  193. inline char StringData::operator[](size_t i) const noexcept
  194. {
  195. return m_data[i];
  196. }
  197. inline const char* StringData::data() const noexcept
  198. {
  199. return m_data;
  200. }
  201. inline size_t StringData::size() const noexcept
  202. {
  203. return m_size;
  204. }
  205. inline bool StringData::is_null() const noexcept
  206. {
  207. return !m_data;
  208. }
  209. inline bool operator==(const StringData& a, const StringData& b) noexcept
  210. {
  211. return a.m_size == b.m_size && a.is_null() == b.is_null() && safe_equal(a.m_data, a.m_data + a.m_size, b.m_data);
  212. }
  213. inline bool operator!=(const StringData& a, const StringData& b) noexcept
  214. {
  215. return !(a == b);
  216. }
  217. inline bool operator<(const StringData& a, const StringData& b) noexcept
  218. {
  219. if (a.is_null() && !b.is_null()) {
  220. // Null strings are smaller than all other strings, and not
  221. // equal to empty strings.
  222. return true;
  223. }
  224. return std::lexicographical_compare(a.m_data, a.m_data + a.m_size, b.m_data, b.m_data + b.m_size);
  225. }
  226. inline bool operator>(const StringData& a, const StringData& b) noexcept
  227. {
  228. return b < a;
  229. }
  230. inline bool operator<=(const StringData& a, const StringData& b) noexcept
  231. {
  232. return !(b < a);
  233. }
  234. inline bool operator>=(const StringData& a, const StringData& b) noexcept
  235. {
  236. return !(a < b);
  237. }
  238. inline bool StringData::begins_with(StringData d) const noexcept
  239. {
  240. if (is_null() && !d.is_null())
  241. return false;
  242. return d.m_size <= m_size && safe_equal(m_data, m_data + d.m_size, d.m_data);
  243. }
  244. inline bool StringData::ends_with(StringData d) const noexcept
  245. {
  246. if (is_null() && !d.is_null())
  247. return false;
  248. return d.m_size <= m_size && safe_equal(m_data + m_size - d.m_size, m_data + m_size, d.m_data);
  249. }
  250. inline bool StringData::contains(StringData d) const noexcept
  251. {
  252. if (is_null() && !d.is_null())
  253. return false;
  254. return d.m_size == 0 || std::search(m_data, m_data + m_size, d.m_data, d.m_data + d.m_size) != m_data + m_size;
  255. }
  256. /// This method takes an array that maps chars to distance that can be moved (and zero for chars not in needle),
  257. /// allowing the method to apply Boyer-Moore for quick substring search
  258. /// The map is calculated in the StringNode<Contains> class (so it can be reused across searches)
  259. inline bool StringData::contains(StringData d, const std::array<uint8_t, 256> &charmap) const noexcept
  260. {
  261. if (is_null() && !d.is_null())
  262. return false;
  263. size_t needle_size = d.size();
  264. if (needle_size == 0)
  265. return true;
  266. // Prepare vars to avoid lookups in loop
  267. size_t last_char_pos = d.size()-1;
  268. unsigned char lastChar = d[last_char_pos];
  269. // Do Boyer-Moore search
  270. size_t p = last_char_pos;
  271. while (p < m_size) {
  272. unsigned char c = m_data[p]; // Get candidate for last char
  273. if (c == lastChar) {
  274. StringData candidate = substr(p-needle_size+1, needle_size);
  275. if (candidate == d)
  276. return true; // text found!
  277. }
  278. // If we don't have a match, see how far we can move char_pos
  279. if (charmap[c] == 0)
  280. p += needle_size; // char was not present in search string
  281. else
  282. p += charmap[c];
  283. }
  284. return false;
  285. }
  286. inline bool StringData::like(StringData d) const noexcept
  287. {
  288. if (is_null() || d.is_null()) {
  289. return (is_null() && d.is_null());
  290. }
  291. return matchlike(*this, d);
  292. }
  293. inline StringData StringData::prefix(size_t n) const noexcept
  294. {
  295. return substr(0, n);
  296. }
  297. inline StringData StringData::suffix(size_t n) const noexcept
  298. {
  299. return substr(m_size - n);
  300. }
  301. inline StringData StringData::substr(size_t i, size_t n) const noexcept
  302. {
  303. return StringData(m_data + i, n);
  304. }
  305. inline StringData StringData::substr(size_t i) const noexcept
  306. {
  307. return substr(i, m_size - i);
  308. }
  309. template <class C, class T>
  310. inline std::basic_ostream<C, T>& operator<<(std::basic_ostream<C, T>& out, const StringData& d)
  311. {
  312. if (d.is_null()) {
  313. out << "<null>";
  314. }
  315. else {
  316. for (const char* i = d.m_data; i != d.m_data + d.m_size; ++i)
  317. out << *i;
  318. }
  319. return out;
  320. }
  321. inline StringData::operator bool() const noexcept
  322. {
  323. return !is_null();
  324. }
  325. inline size_t StringData::hash() const noexcept
  326. {
  327. if (is_null())
  328. return 0;
  329. auto unsigned_data = reinterpret_cast<const unsigned char*>(m_data);
  330. return murmur2_or_cityhash(unsigned_data, m_size);
  331. }
  332. } // namespace realm
  333. namespace std {
  334. template <>
  335. struct hash<::realm::StringData> {
  336. inline size_t operator()(const ::realm::StringData& str) const noexcept
  337. {
  338. return str.hash();
  339. }
  340. };
  341. } // namespace std
  342. #endif // REALM_STRING_HPP