index_string.hpp 19 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558
  1. /*************************************************************************
  2. *
  3. * Copyright 2016 Realm Inc.
  4. *
  5. * Licensed under the Apache License, Version 2.0 (the "License");
  6. * you may not use this file except in compliance with the License.
  7. * You may obtain a copy of the License at
  8. *
  9. * http://www.apache.org/licenses/LICENSE-2.0
  10. *
  11. * Unless required by applicable law or agreed to in writing, software
  12. * distributed under the License is distributed on an "AS IS" BASIS,
  13. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  14. * See the License for the specific language governing permissions and
  15. * limitations under the License.
  16. *
  17. **************************************************************************/
  18. #ifndef REALM_INDEX_STRING_HPP
  19. #define REALM_INDEX_STRING_HPP
  20. #include <cstring>
  21. #include <memory>
  22. #include <array>
  23. #include <set>
  24. #include <realm/array.hpp>
  25. #include <realm/cluster_tree.hpp>
  26. /*
  27. The StringIndex class is used for both type_String and all integral types, such as type_Bool, type_Timestamp and
  28. type_Int. When used for integral types, the 64-bit integer is simply casted to a string of 8 bytes through a
  29. pretty simple "wrapper layer" in all public methods.
  30. The StringIndex data structure is like an "inversed" B+ tree where the leafs contain row indexes and the non-leafs
  31. contain 4-byte chunks of payload. Imagine a table with following strings:
  32. hello, kitty, kitten, foobar, kitty, foobar
  33. The topmost level of the index tree contains prefixes of the payload strings of length <= 4. The next level contains
  34. prefixes of the remaining parts of the strings. Unnecessary levels of the tree are optimized away; the prefix "foob"
  35. is shared only by rows that are identical ("foobar"), so "ar" is not needed to be stored in the tree.
  36. hell kitt foob
  37. | /\ |
  38. 0 en y {3, 5}
  39. | \
  40. {1, 4} 2
  41. Each non-leafs consists of two integer arrays of the same length, one containing payload and the other containing
  42. references to the sublevel nodes.
  43. The leafs can be either a single value or a Column. If the reference in its parent node has its least significant
  44. bit set, then the remaining upper bits specify the row index at which the string is stored. If the bit is clear,
  45. it must be interpreted as a reference to a Column that stores the row indexes at which the string is stored.
  46. If a Column is used, then all row indexes are guaranteed to be sorted increasingly, which means you an search in it
  47. using our binary search functions such as upper_bound() and lower_bound(). Each duplicate value will be stored in
  48. the same Column, but Columns may contain more than just duplicates if the depth of the tree exceeds the value
  49. `s_max_offset` This is to avoid stack overflow problems with many of our recursive functions if we have two very
  50. long strings that have a long common prefix but differ in the last couple bytes. If a Column stores more than just
  51. duplicates, then the list is kept sorted in ascending order by string value and within the groups of common
  52. strings, the rows are sorted in ascending order.
  53. */
  54. namespace realm {
  55. class Spec;
  56. class Timestamp;
  57. class ClusterColumn;
  58. template <class T>
  59. class BPlusTree;
  60. /// Each StringIndex node contains an array of this type
  61. class IndexArray : public Array {
  62. public:
  63. IndexArray(Allocator& allocator)
  64. : Array(allocator)
  65. {
  66. }
  67. ObjKey index_string_find_first(Mixed value, const ClusterColumn& column) const;
  68. void index_string_find_all(std::vector<ObjKey>& result, Mixed value, const ClusterColumn& column,
  69. bool case_insensitive = false) const;
  70. FindRes index_string_find_all_no_copy(Mixed value, const ClusterColumn& column, InternalFindResult& result) const;
  71. size_t index_string_count(Mixed value, const ClusterColumn& column) const;
  72. private:
  73. template <IndexMethod>
  74. int64_t from_list(Mixed value, InternalFindResult& result_ref, const IntegerColumn& key_values,
  75. const ClusterColumn& column) const;
  76. void from_list_all(Mixed value, std::vector<ObjKey>& result, const IntegerColumn& rows,
  77. const ClusterColumn& column) const;
  78. void from_list_all_ins(StringData value, std::vector<ObjKey>& result, const IntegerColumn& rows,
  79. const ClusterColumn& column) const;
  80. template <IndexMethod method>
  81. int64_t index_string(Mixed value, InternalFindResult& result_ref, const ClusterColumn& column) const;
  82. void index_string_all(Mixed value, std::vector<ObjKey>& result, const ClusterColumn& column) const;
  83. void index_string_all_ins(StringData value, std::vector<ObjKey>& result, const ClusterColumn& column) const;
  84. };
  85. // 16 is the biggest element size of any non-string/binary Realm type
  86. constexpr size_t string_conversion_buffer_size = 16;
  87. using StringConversionBuffer = std::array<char, string_conversion_buffer_size>;
  88. static_assert(sizeof(UUID::UUIDBytes) <= string_conversion_buffer_size,
  89. "if you change the size of a UUID then also change the string index buffer space");
  90. // The purpose of this class is to get easy access to fields in a specific column in the
  91. // cluster. When you have an object like this, you can get a string version of the relevant
  92. // field based on the key for the object.
  93. class ClusterColumn {
  94. public:
  95. ClusterColumn(const ClusterTree* cluster_tree, ColKey column_key, IndexType type)
  96. : m_cluster_tree(cluster_tree)
  97. , m_column_key(column_key)
  98. , m_type(type)
  99. {
  100. }
  101. size_t size() const
  102. {
  103. return m_cluster_tree->size();
  104. }
  105. ClusterTree::Iterator begin() const
  106. {
  107. return ClusterTree::Iterator(*m_cluster_tree, 0);
  108. }
  109. ClusterTree::Iterator end() const
  110. {
  111. return ClusterTree::Iterator(*m_cluster_tree, size());
  112. }
  113. DataType get_data_type() const;
  114. ColKey get_column_key() const
  115. {
  116. return m_column_key;
  117. }
  118. bool is_nullable() const
  119. {
  120. return m_column_key.is_nullable();
  121. }
  122. bool is_fulltext() const
  123. {
  124. return m_type == IndexType::Fulltext;
  125. }
  126. Mixed get_value(ObjKey key) const;
  127. private:
  128. const ClusterTree* m_cluster_tree;
  129. ColKey m_column_key;
  130. IndexType m_type;
  131. };
  132. class StringIndex {
  133. public:
  134. StringIndex(const ClusterColumn& target_column, Allocator&);
  135. StringIndex(ref_type, ArrayParent*, size_t ndx_in_parent, const ClusterColumn& target_column, Allocator&);
  136. ~StringIndex() noexcept
  137. {
  138. }
  139. ColKey get_column_key() const
  140. {
  141. return m_target_column.get_column_key();
  142. }
  143. static bool type_supported(realm::DataType type)
  144. {
  145. return (type == type_Int || type == type_String || type == type_Bool || type == type_Timestamp ||
  146. type == type_ObjectId || type == type_Mixed || type == type_UUID);
  147. }
  148. void set_target(const ClusterColumn& target_column) noexcept;
  149. // Accessor concept:
  150. Allocator& get_alloc() const noexcept;
  151. void destroy() noexcept;
  152. void detach();
  153. bool is_attached() const noexcept;
  154. void set_parent(ArrayParent* parent, size_t ndx_in_parent) noexcept;
  155. size_t get_ndx_in_parent() const noexcept;
  156. void set_ndx_in_parent(size_t ndx_in_parent) noexcept;
  157. void update_from_parent() noexcept;
  158. void refresh_accessor_tree(const ClusterColumn& target_column);
  159. ref_type get_ref() const noexcept;
  160. // StringIndex interface:
  161. bool is_empty() const;
  162. bool is_fulltext_index() const
  163. {
  164. return this->m_target_column.is_fulltext();
  165. }
  166. template <class T>
  167. void insert(ObjKey key, T value);
  168. template <class T>
  169. void insert(ObjKey key, util::Optional<T> value);
  170. template <class T>
  171. void set(ObjKey key, T new_value);
  172. template <class T>
  173. void set(ObjKey key, util::Optional<T> new_value);
  174. void erase(ObjKey key);
  175. template <class T>
  176. ObjKey find_first(T value) const;
  177. template <class T>
  178. void find_all(std::vector<ObjKey>& result, T value, bool case_insensitive = false) const;
  179. template <class T>
  180. FindRes find_all_no_copy(T value, InternalFindResult& result) const;
  181. template <class T>
  182. size_t count(T value) const;
  183. void find_all_fulltext(std::vector<ObjKey>& result, StringData value) const;
  184. void clear();
  185. bool has_duplicate_values() const noexcept;
  186. void verify() const;
  187. #ifdef REALM_DEBUG
  188. template <class T>
  189. void verify_entries(const ClusterColumn& column) const;
  190. void do_dump_node_structure(std::ostream&, int) const;
  191. #endif
  192. typedef int32_t key_type;
  193. // s_max_offset specifies the number of levels of recursive string indexes
  194. // allowed before storing everything in lists. This is to avoid nesting
  195. // to too deep of a level. Since every SubStringIndex stores 4 bytes, this
  196. // means that a StringIndex is helpful for strings of a common prefix up to
  197. // 4 times this limit (200 bytes shared). Lists are stored in sorted order,
  198. // so strings sharing a common prefix of more than this limit will use a
  199. // binary search of approximate complexity log2(n) from `std::lower_bound`.
  200. static const size_t s_max_offset = 200; // max depth * s_index_key_length
  201. static const size_t s_index_key_length = 4;
  202. static key_type create_key(StringData) noexcept;
  203. static key_type create_key(StringData, size_t) noexcept;
  204. private:
  205. // m_array is a compact representation for storing the children of this StringIndex.
  206. // Children can be:
  207. // 1) a row number
  208. // 2) a reference to a list which stores row numbers (for duplicate strings).
  209. // 3) a reference to a sub-index
  210. // m_array[0] is always a reference to a values array which stores the 4 byte chunk
  211. // of payload data for quick string chunk comparisons. The array stored
  212. // at m_array[0] lines up with the indices of values in m_array[1] so for example
  213. // starting with an empty StringIndex:
  214. // StringColumn::insert(target_row_ndx=42, value="test_string") would result with
  215. // get_array_from_ref(m_array[0])[0] == create_key("test") and
  216. // m_array[1] == 42
  217. // In this way, m_array which stores one child has a size of two.
  218. // Children are type 1 (row number) if the LSB of the value is set.
  219. // To get the actual row value, shift value down by one.
  220. // If the LSB of the value is 0 then the value is a reference and can be either
  221. // type 2, or type 3 (no shifting in either case).
  222. // References point to a list if the context header flag is NOT set.
  223. // If the header flag is set, references point to a sub-StringIndex (nesting).
  224. std::unique_ptr<IndexArray> m_array;
  225. ClusterColumn m_target_column;
  226. struct inner_node_tag {
  227. };
  228. StringIndex(inner_node_tag, Allocator&);
  229. static std::unique_ptr<IndexArray> create_node(Allocator&, bool is_leaf);
  230. void insert_with_offset(ObjKey key, StringData index_data, const Mixed& value, size_t offset);
  231. void insert_row_list(size_t ref, size_t offset, StringData value);
  232. void insert_to_existing_list(ObjKey key, Mixed value, IntegerColumn& list);
  233. void insert_to_existing_list_at_lower(ObjKey key, Mixed value, IntegerColumn& list,
  234. const IntegerColumnIterator& lower);
  235. key_type get_last_key() const;
  236. struct NodeChange {
  237. size_t ref1;
  238. size_t ref2;
  239. enum ChangeType { change_None, change_InsertBefore, change_InsertAfter, change_Split } type;
  240. NodeChange(ChangeType t, size_t r1 = 0, size_t r2 = 0)
  241. : ref1(r1)
  242. , ref2(r2)
  243. , type(t)
  244. {
  245. }
  246. NodeChange()
  247. : ref1(0)
  248. , ref2(0)
  249. , type(change_None)
  250. {
  251. }
  252. };
  253. // B-Tree functions
  254. void TreeInsert(ObjKey obj_key, key_type, size_t offset, StringData index_data, const Mixed& value);
  255. NodeChange do_insert(ObjKey, key_type, size_t offset, StringData index_data, const Mixed& value);
  256. /// Returns true if there is room or it can join existing entries
  257. bool leaf_insert(ObjKey obj_key, key_type, size_t offset, StringData index_data, const Mixed& value,
  258. bool noextend = false);
  259. void node_insert_split(size_t ndx, size_t new_ref);
  260. void node_insert(size_t ndx, size_t ref);
  261. // Erase without getting value from parent column (useful when string stored
  262. // does not directly match string in parent, like with full-text indexing)
  263. void erase_string(ObjKey key, StringData value);
  264. void do_delete(ObjKey key, StringData, size_t offset);
  265. Mixed get(ObjKey key) const;
  266. void node_add_key(ref_type ref);
  267. #ifdef REALM_DEBUG
  268. static void dump_node_structure(const Array& node, std::ostream&, int level);
  269. #endif
  270. };
  271. class SortedListComparator {
  272. public:
  273. SortedListComparator(const ClusterTree* cluster_tree, ColKey column_key, IndexType type)
  274. : m_column(cluster_tree, column_key, type)
  275. {
  276. }
  277. SortedListComparator(const ClusterColumn& column)
  278. : m_column(column)
  279. {
  280. }
  281. bool operator()(int64_t key_value, Mixed needle);
  282. bool operator()(Mixed needle, int64_t key_value);
  283. private:
  284. const ClusterColumn m_column;
  285. };
  286. // Implementation:
  287. inline StringIndex::StringIndex(const ClusterColumn& target_column, Allocator& alloc)
  288. : m_array(create_node(alloc, true)) // Throws
  289. , m_target_column(target_column)
  290. {
  291. }
  292. inline StringIndex::StringIndex(ref_type ref, ArrayParent* parent, size_t ndx_in_parent,
  293. const ClusterColumn& target_column, Allocator& alloc)
  294. : m_array(new IndexArray(alloc))
  295. , m_target_column(target_column)
  296. {
  297. REALM_ASSERT_EX(Array::get_context_flag_from_header(alloc.translate(ref)), ref, size_t(alloc.translate(ref)));
  298. m_array->init_from_ref(ref);
  299. set_parent(parent, ndx_in_parent);
  300. }
  301. inline StringIndex::StringIndex(inner_node_tag, Allocator& alloc)
  302. : m_array(create_node(alloc, false)) // Throws
  303. , m_target_column(ClusterColumn(nullptr, {}, IndexType::General))
  304. {
  305. }
  306. // Byte order of the key is *reversed*, so that for the integer index, the least significant
  307. // byte comes first, so that it fits little-endian machines. That way we can perform fast
  308. // range-lookups and iterate in order, etc, as future features. This, however, makes the same
  309. // features slower for string indexes. Todo, we should reverse the order conditionally, depending
  310. // on the column type.
  311. inline StringIndex::key_type StringIndex::create_key(StringData str) noexcept
  312. {
  313. key_type key = 0;
  314. if (str.size() >= 4)
  315. goto four;
  316. if (str.size() < 2) {
  317. if (str.size() == 0)
  318. goto none;
  319. goto one;
  320. }
  321. if (str.size() == 2)
  322. goto two;
  323. goto three;
  324. // Create 4 byte index key
  325. // (encoded like this to allow literal comparisons
  326. // independently of endianness)
  327. four:
  328. key |= (key_type(static_cast<unsigned char>(str[3])) << 0);
  329. three:
  330. key |= (key_type(static_cast<unsigned char>(str[2])) << 8);
  331. two:
  332. key |= (key_type(static_cast<unsigned char>(str[1])) << 16);
  333. one:
  334. key |= (key_type(static_cast<unsigned char>(str[0])) << 24);
  335. none:
  336. return key;
  337. }
  338. // Index works as follows: All non-NULL values are stored as if they had appended an 'X' character at the end. So
  339. // "foo" is stored as if it was "fooX", and "" (empty string) is stored as "X". And NULLs are stored as empty strings.
  340. inline StringIndex::key_type StringIndex::create_key(StringData str, size_t offset) noexcept
  341. {
  342. if (str.is_null())
  343. return 0;
  344. if (offset > str.size())
  345. return 0;
  346. // for very short strings
  347. size_t tail = str.size() - offset;
  348. if (tail <= sizeof(key_type) - 1) {
  349. char buf[sizeof(key_type)];
  350. memset(buf, 0, sizeof(key_type));
  351. buf[tail] = 'X';
  352. memcpy(buf, str.data() + offset, tail);
  353. return create_key(StringData(buf, tail + 1));
  354. }
  355. // else fallback
  356. return create_key(str.substr(offset));
  357. }
  358. template <class T>
  359. void StringIndex::insert(ObjKey key, T value)
  360. {
  361. StringConversionBuffer buffer;
  362. Mixed m(value);
  363. size_t offset = 0; // First key from beginning of string
  364. insert_with_offset(key, m.get_index_data(buffer), m, offset); // Throws
  365. }
  366. template <>
  367. void StringIndex::insert<StringData>(ObjKey key, StringData value);
  368. template <class T>
  369. void StringIndex::insert(ObjKey key, util::Optional<T> value)
  370. {
  371. if (value) {
  372. insert(key, *value);
  373. }
  374. else {
  375. insert(key, null{});
  376. }
  377. }
  378. template <class T>
  379. void StringIndex::set(ObjKey key, T new_value)
  380. {
  381. Mixed old_value = get(key);
  382. Mixed new_value2 = Mixed(new_value);
  383. // Note that insert_with_offset() throws UniqueConstraintViolation.
  384. if (REALM_LIKELY(new_value2 != old_value)) {
  385. // We must erase this row first because erase uses find_first which
  386. // might find the duplicate if we insert before erasing.
  387. erase(key); // Throws
  388. StringConversionBuffer buffer;
  389. size_t offset = 0; // First key from beginning of string
  390. auto index_data = new_value2.get_index_data(buffer);
  391. insert_with_offset(key, index_data, new_value2, offset); // Throws
  392. }
  393. }
  394. template <>
  395. void StringIndex::set<StringData>(ObjKey key, StringData new_value);
  396. template <class T>
  397. void StringIndex::set(ObjKey key, util::Optional<T> new_value)
  398. {
  399. if (new_value) {
  400. set(key, *new_value);
  401. }
  402. else {
  403. set(key, null{});
  404. }
  405. }
  406. template <class T>
  407. ObjKey StringIndex::find_first(T value) const
  408. {
  409. // Use direct access method
  410. return m_array->index_string_find_first(Mixed(value), m_target_column);
  411. }
  412. template <class T>
  413. void StringIndex::find_all(std::vector<ObjKey>& result, T value, bool case_insensitive) const
  414. {
  415. // Use direct access method
  416. return m_array->index_string_find_all(result, Mixed(value), m_target_column, case_insensitive);
  417. }
  418. template <class T>
  419. FindRes StringIndex::find_all_no_copy(T value, InternalFindResult& result) const
  420. {
  421. return m_array->index_string_find_all_no_copy(Mixed(value), m_target_column, result);
  422. }
  423. template <class T>
  424. size_t StringIndex::count(T value) const
  425. {
  426. // Use direct access method
  427. return m_array->index_string_count(Mixed(value), m_target_column);
  428. }
  429. inline Allocator& StringIndex::get_alloc() const noexcept
  430. {
  431. return m_array->get_alloc();
  432. }
  433. inline void StringIndex::destroy() noexcept
  434. {
  435. return m_array->destroy_deep();
  436. }
  437. inline bool StringIndex::is_attached() const noexcept
  438. {
  439. return m_array->is_attached();
  440. }
  441. inline void StringIndex::refresh_accessor_tree(const ClusterColumn& target_column)
  442. {
  443. m_array->init_from_parent();
  444. m_target_column = target_column;
  445. }
  446. inline ref_type StringIndex::get_ref() const noexcept
  447. {
  448. return m_array->get_ref();
  449. }
  450. inline void StringIndex::set_parent(ArrayParent* parent, size_t ndx_in_parent) noexcept
  451. {
  452. m_array->set_parent(parent, ndx_in_parent);
  453. }
  454. inline size_t StringIndex::get_ndx_in_parent() const noexcept
  455. {
  456. return m_array->get_ndx_in_parent();
  457. }
  458. inline void StringIndex::set_ndx_in_parent(size_t ndx_in_parent) noexcept
  459. {
  460. m_array->set_ndx_in_parent(ndx_in_parent);
  461. }
  462. inline void StringIndex::update_from_parent() noexcept
  463. {
  464. m_array->update_from_parent();
  465. }
  466. } // namespace realm
  467. #endif // REALM_INDEX_STRING_HPP