index_string.hpp 18 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530
  1. /*************************************************************************
  2. *
  3. * Copyright 2016 Realm Inc.
  4. *
  5. * Licensed under the Apache License, Version 2.0 (the "License");
  6. * you may not use this file except in compliance with the License.
  7. * You may obtain a copy of the License at
  8. *
  9. * http://www.apache.org/licenses/LICENSE-2.0
  10. *
  11. * Unless required by applicable law or agreed to in writing, software
  12. * distributed under the License is distributed on an "AS IS" BASIS,
  13. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  14. * See the License for the specific language governing permissions and
  15. * limitations under the License.
  16. *
  17. **************************************************************************/
  18. #ifndef REALM_INDEX_STRING_HPP
  19. #define REALM_INDEX_STRING_HPP
  20. #include <cstring>
  21. #include <memory>
  22. #include <array>
  23. #include <realm/array.hpp>
  24. #include <realm/table_cluster_tree.hpp>
  25. /*
  26. The StringIndex class is used for both type_String and all integral types, such as type_Bool, type_Timestamp and
  27. type_Int. When used for integral types, the 64-bit integer is simply casted to a string of 8 bytes through a
  28. pretty simple "wrapper layer" in all public methods.
  29. The StringIndex data structure is like an "inversed" B+ tree where the leafs contain row indexes and the non-leafs
  30. contain 4-byte chunks of payload. Imagine a table with following strings:
  31. hello, kitty, kitten, foobar, kitty, foobar
  32. The topmost level of the index tree contains prefixes of the payload strings of length <= 4. The next level contains
  33. prefixes of the remaining parts of the strings. Unnecessary levels of the tree are optimized away; the prefix "foob"
  34. is shared only by rows that are identical ("foobar"), so "ar" is not needed to be stored in the tree.
  35. hell kitt foob
  36. | /\ |
  37. 0 en y {3, 5}
  38. | \
  39. {1, 4} 2
  40. Each non-leafs consists of two integer arrays of the same length, one containing payload and the other containing
  41. references to the sublevel nodes.
  42. The leafs can be either a single value or a Column. If the reference in its parent node has its least significant
  43. bit set, then the remaining upper bits specify the row index at which the string is stored. If the bit is clear,
  44. it must be interpreted as a reference to a Column that stores the row indexes at which the string is stored.
  45. If a Column is used, then all row indexes are guaranteed to be sorted increasingly, which means you an search in it
  46. using our binary search functions such as upper_bound() and lower_bound(). Each duplicate value will be stored in
  47. the same Column, but Columns may contain more than just duplicates if the depth of the tree exceeds the value
  48. `s_max_offset` This is to avoid stack overflow problems with many of our recursive functions if we have two very
  49. long strings that have a long common prefix but differ in the last couple bytes. If a Column stores more than just
  50. duplicates, then the list is kept sorted in ascending order by string value and within the groups of common
  51. strings, the rows are sorted in ascending order.
  52. */
  53. namespace realm {
  54. class Spec;
  55. class Timestamp;
  56. class ClusterColumn;
  57. template <class T>
  58. class BPlusTree;
  59. /// Each StringIndex node contains an array of this type
  60. class IndexArray : public Array {
  61. public:
  62. IndexArray(Allocator& allocator)
  63. : Array(allocator)
  64. {
  65. }
  66. ObjKey index_string_find_first(Mixed value, const ClusterColumn& column) const;
  67. void index_string_find_all(std::vector<ObjKey>& result, Mixed value, const ClusterColumn& column,
  68. bool case_insensitive = false) const;
  69. FindRes index_string_find_all_no_copy(Mixed value, const ClusterColumn& column, InternalFindResult& result) const;
  70. size_t index_string_count(Mixed value, const ClusterColumn& column) const;
  71. private:
  72. template <IndexMethod>
  73. int64_t from_list(Mixed value, InternalFindResult& result_ref, const IntegerColumn& key_values,
  74. const ClusterColumn& column) const;
  75. void from_list_all(Mixed value, std::vector<ObjKey>& result, const IntegerColumn& rows,
  76. const ClusterColumn& column) const;
  77. void from_list_all_ins(StringData value, std::vector<ObjKey>& result, const IntegerColumn& rows,
  78. const ClusterColumn& column) const;
  79. template <IndexMethod method>
  80. int64_t index_string(Mixed value, InternalFindResult& result_ref, const ClusterColumn& column) const;
  81. void index_string_all(Mixed value, std::vector<ObjKey>& result, const ClusterColumn& column) const;
  82. void index_string_all_ins(StringData value, std::vector<ObjKey>& result, const ClusterColumn& column) const;
  83. };
  84. // 16 is the biggest element size of any non-string/binary Realm type
  85. constexpr size_t string_conversion_buffer_size = 16;
  86. using StringConversionBuffer = std::array<char, string_conversion_buffer_size>;
  87. static_assert(sizeof(UUID::UUIDBytes) <= string_conversion_buffer_size,
  88. "if you change the size of a UUID then also change the string index buffer space");
  89. // The purpose of this class is to get easy access to fields in a specific column in the
  90. // cluster. When you have an object like this, you can get a string version of the relevant
  91. // field based on the key for the object.
  92. class ClusterColumn {
  93. public:
  94. ClusterColumn(const TableClusterTree* cluster_tree, ColKey column_key)
  95. : m_cluster_tree(cluster_tree)
  96. , m_column_key(column_key)
  97. {
  98. }
  99. size_t size() const
  100. {
  101. return m_cluster_tree->size();
  102. }
  103. TableClusterTree::Iterator begin() const
  104. {
  105. return TableClusterTree::Iterator(*m_cluster_tree, 0);
  106. }
  107. TableClusterTree::Iterator end() const
  108. {
  109. return TableClusterTree::Iterator(*m_cluster_tree, size());
  110. }
  111. DataType get_data_type() const;
  112. ColKey get_column_key() const
  113. {
  114. return m_column_key;
  115. }
  116. bool is_nullable() const;
  117. Mixed get_value(ObjKey key) const;
  118. private:
  119. const TableClusterTree* m_cluster_tree;
  120. ColKey m_column_key;
  121. };
  122. class StringIndex {
  123. public:
  124. StringIndex(const ClusterColumn& target_column, Allocator&);
  125. StringIndex(ref_type, ArrayParent*, size_t ndx_in_parent, const ClusterColumn& target_column, Allocator&);
  126. ~StringIndex() noexcept
  127. {
  128. }
  129. ColKey get_column_key() const
  130. {
  131. return m_target_column.get_column_key();
  132. }
  133. static bool type_supported(realm::DataType type)
  134. {
  135. return (type == type_Int || type == type_String || type == type_Bool || type == type_Timestamp ||
  136. type == type_ObjectId || type == type_Mixed || type == type_UUID);
  137. }
  138. static ref_type create_empty(Allocator& alloc);
  139. void set_target(const ClusterColumn& target_column) noexcept;
  140. // Accessor concept:
  141. Allocator& get_alloc() const noexcept;
  142. void destroy() noexcept;
  143. void detach();
  144. bool is_attached() const noexcept;
  145. void set_parent(ArrayParent* parent, size_t ndx_in_parent) noexcept;
  146. size_t get_ndx_in_parent() const noexcept;
  147. void set_ndx_in_parent(size_t ndx_in_parent) noexcept;
  148. void update_from_parent() noexcept;
  149. void refresh_accessor_tree(const ClusterColumn& target_column);
  150. ref_type get_ref() const noexcept;
  151. // StringIndex interface:
  152. bool is_empty() const;
  153. template <class T>
  154. void insert(ObjKey key, T value);
  155. template <class T>
  156. void insert(ObjKey key, util::Optional<T> value);
  157. template <class T>
  158. void set(ObjKey key, T new_value);
  159. template <class T>
  160. void set(ObjKey key, util::Optional<T> new_value);
  161. void erase(ObjKey key);
  162. template <class T>
  163. ObjKey find_first(T value) const;
  164. template <class T>
  165. void find_all(std::vector<ObjKey>& result, T value, bool case_insensitive = false) const;
  166. template <class T>
  167. FindRes find_all_no_copy(T value, InternalFindResult& result) const;
  168. template <class T>
  169. size_t count(T value) const;
  170. void clear();
  171. bool has_duplicate_values() const noexcept;
  172. void verify() const;
  173. #ifdef REALM_DEBUG
  174. template <class T>
  175. void verify_entries(const ClusterColumn& column) const;
  176. void do_dump_node_structure(std::ostream&, int) const;
  177. #endif
  178. typedef int32_t key_type;
  179. // s_max_offset specifies the number of levels of recursive string indexes
  180. // allowed before storing everything in lists. This is to avoid nesting
  181. // to too deep of a level. Since every SubStringIndex stores 4 bytes, this
  182. // means that a StringIndex is helpful for strings of a common prefix up to
  183. // 4 times this limit (200 bytes shared). Lists are stored in sorted order,
  184. // so strings sharing a common prefix of more than this limit will use a
  185. // binary search of approximate complexity log2(n) from `std::lower_bound`.
  186. static const size_t s_max_offset = 200; // max depth * s_index_key_length
  187. static const size_t s_index_key_length = 4;
  188. static key_type create_key(StringData) noexcept;
  189. static key_type create_key(StringData, size_t) noexcept;
  190. private:
  191. // m_array is a compact representation for storing the children of this StringIndex.
  192. // Children can be:
  193. // 1) a row number
  194. // 2) a reference to a list which stores row numbers (for duplicate strings).
  195. // 3) a reference to a sub-index
  196. // m_array[0] is always a reference to a values array which stores the 4 byte chunk
  197. // of payload data for quick string chunk comparisons. The array stored
  198. // at m_array[0] lines up with the indices of values in m_array[1] so for example
  199. // starting with an empty StringIndex:
  200. // StringColumn::insert(target_row_ndx=42, value="test_string") would result with
  201. // get_array_from_ref(m_array[0])[0] == create_key("test") and
  202. // m_array[1] == 42
  203. // In this way, m_array which stores one child has a size of two.
  204. // Children are type 1 (row number) if the LSB of the value is set.
  205. // To get the actual row value, shift value down by one.
  206. // If the LSB of the value is 0 then the value is a reference and can be either
  207. // type 2, or type 3 (no shifting in either case).
  208. // References point to a list if the context header flag is NOT set.
  209. // If the header flag is set, references point to a sub-StringIndex (nesting).
  210. std::unique_ptr<IndexArray> m_array;
  211. ClusterColumn m_target_column;
  212. struct inner_node_tag {
  213. };
  214. StringIndex(inner_node_tag, Allocator&);
  215. static IndexArray* create_node(Allocator&, bool is_leaf);
  216. void insert_with_offset(ObjKey key, StringData index_data, const Mixed& value, size_t offset);
  217. void insert_row_list(size_t ref, size_t offset, StringData value);
  218. void insert_to_existing_list(ObjKey key, Mixed value, IntegerColumn& list);
  219. void insert_to_existing_list_at_lower(ObjKey key, Mixed value, IntegerColumn& list,
  220. const IntegerColumnIterator& lower);
  221. key_type get_last_key() const;
  222. struct NodeChange {
  223. size_t ref1;
  224. size_t ref2;
  225. enum ChangeType { change_None, change_InsertBefore, change_InsertAfter, change_Split } type;
  226. NodeChange(ChangeType t, size_t r1 = 0, size_t r2 = 0)
  227. : ref1(r1)
  228. , ref2(r2)
  229. , type(t)
  230. {
  231. }
  232. NodeChange()
  233. : ref1(0)
  234. , ref2(0)
  235. , type(change_None)
  236. {
  237. }
  238. };
  239. // B-Tree functions
  240. void TreeInsert(ObjKey obj_key, key_type, size_t offset, StringData index_data, const Mixed& value);
  241. NodeChange do_insert(ObjKey, key_type, size_t offset, StringData index_data, const Mixed& value);
  242. /// Returns true if there is room or it can join existing entries
  243. bool leaf_insert(ObjKey obj_key, key_type, size_t offset, StringData index_data, const Mixed& value,
  244. bool noextend = false);
  245. void node_insert_split(size_t ndx, size_t new_ref);
  246. void node_insert(size_t ndx, size_t ref);
  247. void do_delete(ObjKey key, StringData, size_t offset);
  248. Mixed get(ObjKey key) const;
  249. void node_add_key(ref_type ref);
  250. #ifdef REALM_DEBUG
  251. static void dump_node_structure(const Array& node, std::ostream&, int level);
  252. #endif
  253. };
  254. class SortedListComparator {
  255. public:
  256. SortedListComparator(const TableClusterTree* cluster_tree, ColKey column_key)
  257. : m_column(cluster_tree, column_key)
  258. {
  259. }
  260. SortedListComparator(const ClusterColumn& column)
  261. : m_column(column)
  262. {
  263. }
  264. bool operator()(int64_t key_value, Mixed needle);
  265. bool operator()(Mixed needle, int64_t key_value);
  266. private:
  267. const ClusterColumn m_column;
  268. };
  269. // Implementation:
  270. inline StringIndex::StringIndex(const ClusterColumn& target_column, Allocator& alloc)
  271. : m_array(create_node(alloc, true)) // Throws
  272. , m_target_column(target_column)
  273. {
  274. }
  275. inline StringIndex::StringIndex(ref_type ref, ArrayParent* parent, size_t ndx_in_parent,
  276. const ClusterColumn& target_column, Allocator& alloc)
  277. : m_array(new IndexArray(alloc))
  278. , m_target_column(target_column)
  279. {
  280. REALM_ASSERT_EX(Array::get_context_flag_from_header(alloc.translate(ref)), ref, size_t(alloc.translate(ref)));
  281. m_array->init_from_ref(ref);
  282. set_parent(parent, ndx_in_parent);
  283. }
  284. inline StringIndex::StringIndex(inner_node_tag, Allocator& alloc)
  285. : m_array(create_node(alloc, false)) // Throws
  286. , m_target_column(ClusterColumn(nullptr, {}))
  287. {
  288. }
  289. // Byte order of the key is *reversed*, so that for the integer index, the least significant
  290. // byte comes first, so that it fits little-endian machines. That way we can perform fast
  291. // range-lookups and iterate in order, etc, as future features. This, however, makes the same
  292. // features slower for string indexes. Todo, we should reverse the order conditionally, depending
  293. // on the column type.
  294. inline StringIndex::key_type StringIndex::create_key(StringData str) noexcept
  295. {
  296. key_type key = 0;
  297. if (str.size() >= 4)
  298. goto four;
  299. if (str.size() < 2) {
  300. if (str.size() == 0)
  301. goto none;
  302. goto one;
  303. }
  304. if (str.size() == 2)
  305. goto two;
  306. goto three;
  307. // Create 4 byte index key
  308. // (encoded like this to allow literal comparisons
  309. // independently of endianness)
  310. four:
  311. key |= (key_type(static_cast<unsigned char>(str[3])) << 0);
  312. three:
  313. key |= (key_type(static_cast<unsigned char>(str[2])) << 8);
  314. two:
  315. key |= (key_type(static_cast<unsigned char>(str[1])) << 16);
  316. one:
  317. key |= (key_type(static_cast<unsigned char>(str[0])) << 24);
  318. none:
  319. return key;
  320. }
  321. // Index works as follows: All non-NULL values are stored as if they had appended an 'X' character at the end. So
  322. // "foo" is stored as if it was "fooX", and "" (empty string) is stored as "X". And NULLs are stored as empty strings.
  323. inline StringIndex::key_type StringIndex::create_key(StringData str, size_t offset) noexcept
  324. {
  325. if (str.is_null())
  326. return 0;
  327. if (offset > str.size())
  328. return 0;
  329. // for very short strings
  330. size_t tail = str.size() - offset;
  331. if (tail <= sizeof(key_type) - 1) {
  332. char buf[sizeof(key_type)];
  333. memset(buf, 0, sizeof(key_type));
  334. buf[tail] = 'X';
  335. memcpy(buf, str.data() + offset, tail);
  336. return create_key(StringData(buf, tail + 1));
  337. }
  338. // else fallback
  339. return create_key(str.substr(offset));
  340. }
  341. template <class T>
  342. void StringIndex::insert(ObjKey key, T value)
  343. {
  344. StringConversionBuffer buffer;
  345. Mixed m(value);
  346. size_t offset = 0; // First key from beginning of string
  347. insert_with_offset(key, m.get_index_data(buffer), m, offset); // Throws
  348. }
  349. template <class T>
  350. void StringIndex::insert(ObjKey key, util::Optional<T> value)
  351. {
  352. if (value) {
  353. insert(key, *value);
  354. }
  355. else {
  356. insert(key, null{});
  357. }
  358. }
  359. template <class T>
  360. void StringIndex::set(ObjKey key, T new_value)
  361. {
  362. Mixed old_value = get(key);
  363. Mixed new_value2 = Mixed(new_value);
  364. // Note that insert_with_offset() throws UniqueConstraintViolation.
  365. if (REALM_LIKELY(new_value2 != old_value)) {
  366. // We must erase this row first because erase uses find_first which
  367. // might find the duplicate if we insert before erasing.
  368. erase(key); // Throws
  369. StringConversionBuffer buffer;
  370. size_t offset = 0; // First key from beginning of string
  371. auto index_data = new_value2.get_index_data(buffer);
  372. insert_with_offset(key, index_data, new_value2, offset); // Throws
  373. }
  374. }
  375. template <class T>
  376. void StringIndex::set(ObjKey key, util::Optional<T> new_value)
  377. {
  378. if (new_value) {
  379. set(key, *new_value);
  380. }
  381. else {
  382. set(key, null{});
  383. }
  384. }
  385. template <class T>
  386. ObjKey StringIndex::find_first(T value) const
  387. {
  388. // Use direct access method
  389. return m_array->index_string_find_first(Mixed(value), m_target_column);
  390. }
  391. template <class T>
  392. void StringIndex::find_all(std::vector<ObjKey>& result, T value, bool case_insensitive) const
  393. {
  394. // Use direct access method
  395. return m_array->index_string_find_all(result, Mixed(value), m_target_column, case_insensitive);
  396. }
  397. template <class T>
  398. FindRes StringIndex::find_all_no_copy(T value, InternalFindResult& result) const
  399. {
  400. return m_array->index_string_find_all_no_copy(Mixed(value), m_target_column, result);
  401. }
  402. template <class T>
  403. size_t StringIndex::count(T value) const
  404. {
  405. // Use direct access method
  406. return m_array->index_string_count(Mixed(value), m_target_column);
  407. }
  408. inline void StringIndex::destroy() noexcept
  409. {
  410. return m_array->destroy_deep();
  411. }
  412. inline bool StringIndex::is_attached() const noexcept
  413. {
  414. return m_array->is_attached();
  415. }
  416. inline void StringIndex::refresh_accessor_tree(const ClusterColumn& target_column)
  417. {
  418. m_array->init_from_parent();
  419. m_target_column = target_column;
  420. }
  421. inline ref_type StringIndex::get_ref() const noexcept
  422. {
  423. return m_array->get_ref();
  424. }
  425. inline void StringIndex::set_parent(ArrayParent* parent, size_t ndx_in_parent) noexcept
  426. {
  427. m_array->set_parent(parent, ndx_in_parent);
  428. }
  429. inline size_t StringIndex::get_ndx_in_parent() const noexcept
  430. {
  431. return m_array->get_ndx_in_parent();
  432. }
  433. inline void StringIndex::set_ndx_in_parent(size_t ndx_in_parent) noexcept
  434. {
  435. m_array->set_ndx_in_parent(ndx_in_parent);
  436. }
  437. inline void StringIndex::update_from_parent() noexcept
  438. {
  439. m_array->update_from_parent();
  440. }
  441. } // namespace realm
  442. #endif // REALM_INDEX_STRING_HPP