b-tree-plus-alpha
Loading...
Searching...
No Matches
dynamic_wavelet_tree.hpp
1#pragma once
3
4namespace stool
5{
6 namespace bptree
7 {
8
14 {
16 std::vector<std::vector<BIT_SEQUENCE>> bits_seq;
17 std::vector<int64_t> char_rank_vec;
18 std::vector<uint8_t> alphabet;
19 uint64_t rank_bit_size = 0;
20
21 public:
23 {
24 this->alphabet.clear();
25 };
26 DynamicWaveletTree(const std::vector<uint8_t> &_alphabet)
27 {
28 this->set_alphabet(_alphabet);
29 };
31 {
32 this->initialize();
33 };
34 DynamicWaveletTree &operator=(const DynamicWaveletTree &) = delete;
35 DynamicWaveletTree(DynamicWaveletTree &&) noexcept = default;
36 DynamicWaveletTree &operator=(DynamicWaveletTree &&) noexcept = default;
37
38 private:
39
40 void build_bits(const std::vector<uint8_t> &_text, uint64_t h, uint64_t i)
41 {
42 uint64_t bit_idx = this->rank_bit_size - h - 1;
43 std::vector<bool> bits;
44 bits.resize(_text.size(), false);
45 uint64_t counter = 0;
48
49 for (uint8_t c : _text)
50 {
51 int64_t c_rank = this->char_rank_vec[c];
52 bool b = stool::LSBByte::get_bit(c_rank, bit_idx);
53 bits[counter++] = b;
54 if (b)
55 {
57 }
58 else
59 {
61 }
62 }
64 this->bits_seq[h][i].swap(dbs);
65 if (h + 1 < rank_bit_size)
66 {
67 uint64_t next_i = i * 2;
68 {
69 std::vector<uint8_t> left;
71 counter = 0;
72
73 for (uint8_t c : _text)
74 {
75 int64_t c_rank = this->char_rank_vec[c];
76 bool b = stool::LSBByte::get_bit(c_rank, bit_idx);
77 if (!b)
78 {
79 left[counter++] = c;
80 }
81 }
82 this->build_bits(left, h + 1, next_i);
83 }
84 {
85 std::vector<uint8_t> right;
87 counter = 0;
88
89 for (uint8_t c : _text)
90 {
91 int64_t c_rank = this->char_rank_vec[c];
92 bool b = stool::LSBByte::get_bit(c_rank, bit_idx);
93 if (b)
94 {
95 right[counter++] = c;
96 }
97 }
98 this->build_bits(right, h + 1, next_i + 1);
99 }
100 }
101 }
102
103 public:
104 void build_from_text(const std::vector<uint8_t> &_text, const std::vector<uint8_t> &_alphabet){
105 DynamicWaveletTree dwt = DynamicWaveletTree::build(_text, _alphabet);
106 this->swap(dwt);
107 }
108 static DynamicWaveletTree build(const std::vector<uint8_t> &_text, const std::vector<uint8_t> &_alphabet)
109 {
111 dwt.build_bits(_text, 0, 0);
112 return dwt;
113 }
114
115 void swap(DynamicWaveletTree &item)
116 {
117 this->bits_seq.swap(item.bits_seq);
118
119 this->char_rank_vec.swap(item.char_rank_vec);
120 this->alphabet.swap(item.alphabet);
121 std::swap(this->rank_bit_size, item.rank_bit_size);
122 }
123
124 private:
125 void initialize()
126 {
127 for (uint64_t i = 0; i < this->bits_seq.size(); i++)
128 {
129 for (uint64_t j = 0; j < this->bits_seq[i].size(); j++)
130 {
131 this->bits_seq[i][j].clear();
132 // delete this->bits_seq[i][j];
133 }
134 }
135 this->bits_seq.clear();
136 this->char_rank_vec.clear();
137 this->alphabet.clear();
138 }
139
140 public:
141 void set_alphabet(const std::vector<uint8_t> &_alphabet)
142 {
143 this->initialize();
144
145 for (auto c : _alphabet)
146 {
147 this->alphabet.push_back(c);
148 }
149 std::sort(this->alphabet.begin(), this->alphabet.end());
150
151 this->char_rank_vec.resize(256, -1);
152 for (uint64_t i = 0; i < alphabet.size(); i++)
153 {
154 this->char_rank_vec[alphabet[i]] = i;
155 }
156 this->rank_bit_size = stool::LSBByte::get_code_length(alphabet.size() - 1);
157
158 for (uint64_t i = 0; i < rank_bit_size; i++)
159 {
160 this->bits_seq.push_back(std::vector<BIT_SEQUENCE>());
161 if (i == 0)
162 {
163 this->bits_seq[i].push_back(BIT_SEQUENCE());
164 }
165 else
166 {
167 uint64_t x = 1ULL << i;
168 for (uint64_t j = 0; j < x; j++)
169 {
170 this->bits_seq[i].push_back(BIT_SEQUENCE());
171 }
172 }
173 }
174 }
175 uint64_t get_alphabet_size() const
176 {
177 return this->alphabet.size();
178 }
179
180 void clear()
181 {
182 for (uint64_t i = 0; i < this->bits_seq.size(); i++)
183 {
184 for (uint64_t j = 0; j < this->bits_seq[i].size(); j++)
185 {
186 this->bits_seq[i][j].clear();
187 }
188 }
189 }
190
191 uint64_t size() const
192 {
193 if (!this->has_empty_alphabet())
194 {
195 return this->bits_seq[0][0].size();
196 }
197 else
198 {
199 return 0;
200 }
201 }
202 uint64_t height() const
203 {
204 return this->bits_seq.size();
205 }
206
207 private:
208 void push_back_sub(uint64_t c_rank)
209 {
210#ifdef DEBUG
211 uint64_t _size = this->size();
212#endif
213 uint64_t pos = 0;
214 for (uint64_t i = 0; i < this->rank_bit_size; i++)
215 {
216 bool b = stool::LSBByte::get_bit(c_rank, this->rank_bit_size - 1 - i);
217 this->bits_seq[i][pos].push_back(b ? 1 : 0);
218 pos = (pos * 2) + (b ? 1 : 0);
219 }
220
221#ifdef DEBUG
222 assert(this->size() == _size + 1);
223#endif
224 }
225 int64_t rank_sub(uint64_t i, uint64_t c_rank) const
226 {
227 bool b1 = stool::LSBByte::get_bit(c_rank, this->rank_bit_size - 1);
228 uint64_t _rank = this->bits_seq[0][0].rank(i + 1, b1);
229 if (_rank == 0)
230 {
231 return 0;
232 }
233 else
234 {
235 uint64_t nth = _rank - 1;
236 uint64_t next = b1 ? 1 : 0;
237
238 for (uint64_t i = 1; i < this->rank_bit_size; i++)
239 {
240 bool bx = stool::LSBByte::get_bit(c_rank, this->rank_bit_size - 1 - i);
241
242 if (nth >= this->bits_seq[i][next].size())
243 {
244 assert(false);
245 throw std::range_error("Error: DynamicSequence::rank_sub()");
246 }
247
248 uint64_t _rankx = this->bits_seq[i][next].rank(nth + 1, bx);
249 if (_rankx == 0)
250 {
251 return 0;
252 }
253 else
254 {
255 nth = _rankx - 1;
256 next = (next * 2) + (bx ? 1 : 0);
257 }
258 }
259 return nth + 1;
260 }
261 }
262 int64_t select_sub(uint64_t nth, uint64_t c_rank) const
263 {
264 int64_t depth = this->bits_seq.size();
265 assert(depth > 0);
266 int64_t h = 0;
267 int64_t j = c_rank / 2;
268 bool hbit = stool::LSBByte::get_bit(c_rank, h);
269 assert(j < (int64_t)this->bits_seq[depth - h - 1].size());
270 uint64_t hbit_rank = this->bits_seq[depth - h - 1][j].count_c(hbit);
271
272 if (nth <= hbit_rank)
273 {
274 int64_t hbit_select = this->bits_seq[depth - h - 1][j].select(nth - 1, hbit);
275
276 assert(hbit_select >= 0);
277
279 j = j / 2;
280 h++;
281 while (h < (int64_t)this->rank_bit_size)
282 {
283 hbit = stool::LSBByte::get_bit(c_rank, h);
284 assert(next_nth <= this->bits_seq[depth - h - 1][j].count_c(hbit));
285 hbit_select = this->bits_seq[depth - h - 1][j].select(next_nth - 1, hbit);
286
287 assert(hbit_select >= 0);
288 next_nth = hbit_select + 1;
289 j = j / 2;
290 h++;
291 }
292 assert(next_nth > 0);
293 return next_nth - 1;
294 }
295 else
296 {
297 return -1;
298 }
299 }
300 void insert_sub(uint64_t nth, uint8_t c_rank)
301 {
302 if (nth <= this->size())
303 {
304
306 uint64_t j = 0;
307 for (int64_t h = 0; h < (int64_t)this->height(); h++)
308 {
309 bool hbit = stool::LSBByte::get_bit(c_rank, this->rank_bit_size - h - 1);
310 assert(next_nth <= this->bits_seq[h][j].size());
311 this->bits_seq[h][j].insert(next_nth, hbit);
312 next_nth = this->bits_seq[h][j].rank(next_nth, hbit);
313 j = (j * 2) + (hbit ? 1 : 0);
314 }
315 }
316 else
317 {
318 throw std::range_error("Error: DynamicSequence::insert(nth, c_rank)");
319 }
320 }
321
322 public:
323 bool has_empty_alphabet() const
324 {
325 return this->rank_bit_size == 0;
326 }
327 int64_t rank(uint64_t i, uint64_t c) const
328 {
329 if (!this->has_empty_alphabet())
330 {
331 assert(c < char_rank_vec.size());
332 int64_t c_rank = this->char_rank_vec[c];
333 assert(c_rank <= (int64_t)this->alphabet.size());
334
335 if (c_rank == -1 || i == 0)
336 {
337 return 0;
338 }
339 else
340 {
341 return this->rank_sub(i - 1, c_rank);
342 }
343 }
344 else
345 {
346 return 0;
347 }
348 }
349
350 int64_t select(uint64_t i, uint64_t c) const
351 {
352 if (!this->has_empty_alphabet())
353 {
354 assert(c < char_rank_vec.size());
355 int64_t c_rank = this->char_rank_vec[c];
356 assert(c_rank <= (int64_t)this->alphabet.size());
357
358 if (c_rank == -1)
359 {
360 return -1;
361 }
362 else
363 {
364 return this->select_sub(i + 1, c_rank);
365 }
366 }
367 else
368 {
369 return -1;
370 }
371 }
372 void push_many(const std::vector<uint8_t> &str)
373 {
374 for (auto c : str)
375 {
376 this->push_back(c);
377 }
378 }
379 void push_back(uint8_t c)
380 {
381 assert(!this->has_empty_alphabet());
382 if (!this->has_empty_alphabet())
383 {
384 int64_t c_rank = this->char_rank_vec[c];
385 if (c_rank == -1)
386 {
387 assert(false);
388 throw std::runtime_error("Error: DynamicSequence::push_back()");
389 }
390 else
391 {
392 this->push_back_sub(c_rank);
393 }
394 }
395 else
396 {
397 throw std::runtime_error("Error: DynamicSequence::push_back()");
398 }
399 }
400 uint64_t get_nth_char_rank(uint64_t nth) const
401 {
402 assert(!this->has_empty_alphabet());
403 if (!this->has_empty_alphabet())
404 {
405 if (nth < this->size())
406 {
408 uint64_t j = 0;
409 for (int64_t h = 0; h < (int64_t)this->height(); h++)
410 {
411 bool b = this->bits_seq[h][j].at(next_nth);
412 next_nth = this->bits_seq[h][j].rank(next_nth, b);
413 j = (j * 2) + (b ? 1 : 0);
414 }
415 return j;
416 }
417 else
418 {
419 throw std::range_error("Error: DynamicSequence::get_nth_char_rank(nth)");
420 }
421 }
422 else
423 {
424 throw std::range_error("Error: DynamicSequence::get_nth_char_rank(nth)");
425 }
426 }
427 uint64_t at(uint64_t nth) const
428 {
429 assert(!this->has_empty_alphabet());
430 if (!this->has_empty_alphabet())
431 {
432 return this->alphabet[this->get_nth_char_rank(nth)];
433 }
434 else
435 {
436 throw std::range_error("Error: DynamicSequence::at(nth)");
437 }
438 }
439
440 void remove(uint64_t nth)
441 {
442 if (nth < this->size())
443 {
445 uint64_t j = 0;
446 for (int64_t h = 0; h < (int64_t)this->height(); h++)
447 {
449 bool b = this->bits_seq[h][j].at(current_nth);
450 next_nth = this->bits_seq[h][j].rank(current_nth, b);
451 this->bits_seq[h][j].remove(current_nth);
452 j = (j * 2) + (b ? 1 : 0);
453 }
454 }
455 else
456 {
457 throw std::range_error("Error: DynamicSequence::remove(nth)");
458 }
459 }
460
461 void insert(uint64_t nth, uint8_t c)
462 {
463 assert(!this->has_empty_alphabet());
464 if (!this->has_empty_alphabet())
465 {
466 int64_t c_rank = this->char_rank_vec[c];
467 if (c_rank == -1)
468 {
469 assert(false);
470 throw std::runtime_error("Error: DynamicSequence::insert(nth, c)");
471 }
472 else
473 {
474 this->insert_sub(nth, c_rank);
475 }
476 }
477 else
478 {
479 throw std::runtime_error("Error: DynamicSequence::insert(nth, c)");
480 }
481 }
482 std::string to_string() const
483 {
484 std::string s;
485 s.resize(this->size());
486 for (uint64_t i = 0; i < this->size(); i++)
487 {
488 s[i] = this->at(i);
489 }
490 return s;
491 }
492 std::vector<uint8_t> to_uint8_str() const
493 {
494 std::vector<uint8_t> s;
495 s.resize(this->size());
496 for (uint64_t i = 0; i < this->size(); i++)
497 {
498 s[i] = this->at(i);
499 }
500 return s;
501 }
502 int64_t get_rank_of_character_in_alphabet(uint8_t c) const
503 {
504 return this->char_rank_vec[c];
505 }
506
507 void print() const
508 {
509 std::cout << "================DynamicSequence==================" << std::endl;
510 stool::DebugPrinter::print_characters(this->alphabet, "Alphabet");
511 stool::DebugPrinter::print_integers(this->char_rank_vec, "Char_rank_vec");
512 std::cout << "Rank_bit_size: " << this->rank_bit_size << std::endl;
513
514 for (uint64_t i = 0; i < this->rank_bit_size; i++)
515 {
516 std::cout << "level: " << i << " / count: " << this->bits_seq[i].size() << std::endl;
517 for (uint64_t j = 0; j < this->bits_seq[i].size(); j++)
518 {
519 std::cout << this->bits_seq[i][j].to_string() << std::endl;
520 }
521 }
522 std::cout << "==================================" << std::endl;
523 }
524
525 uint8_t operator[](uint64_t n) const
526 {
527 return this->at(n);
528 }
529
530 uint64_t get_smallest_character_in_alphabet() const
531 {
532 assert(this->alphabet.size() > 0);
533 return this->alphabet[0];
534 }
535 uint64_t count_c(uint8_t c) const
536 {
537 int64_t c_rank = this->get_rank_of_character_in_alphabet(c);
538 if (c_rank == -1)
539 {
540 return 0;
541 }
542 else
543 {
544 uint64_t idx = (c_rank / 2);
545 uint64_t bit_idx = 0;
546 bool b = stool::LSBByte::get_bit(c_rank, bit_idx);
547 return this->bits_seq[this->bits_seq.size() - 1][idx].count_c(b);
548 }
549 }
550
551 static void store_to_file(DynamicWaveletTree &item, std::ofstream &os)
552 {
553 uint64_t alphabet_size = item.alphabet.size();
554 os.write(reinterpret_cast<const char *>(&alphabet_size), sizeof(uint64_t));
555 os.write(reinterpret_cast<const char *>(item.alphabet.data()), alphabet_size * sizeof(uint8_t));
556 for (auto &it : item.bits_seq)
557 {
558 for (auto &it2 : it)
559 {
561 // it2->save(os);
562 }
563 }
564 }
565 static DynamicWaveletTree load_from_file(std::ifstream &ifs)
566 {
569 ifs.read(reinterpret_cast<char *>(&_alphabet_size), sizeof(uint64_t));
570
571 std::vector<uint8_t> _alphabet;
573 ifs.read(reinterpret_cast<char *>(_alphabet.data()), _alphabet_size * sizeof(uint8_t));
574
575 r.set_alphabet(_alphabet);
576 for (auto &it : r.bits_seq)
577 {
578 for (auto &it2 : it)
579 {
581 it2.swap(bits);
582 }
583 }
584 return r;
585 }
586
587 uint64_t size_in_bytes() const
588 {
590 for (uint64_t h = 0; h < this->bits_seq.size(); h++)
591 {
592 for (const BIT_SEQUENCE &seq : this->bits_seq[h])
593 {
595 }
596 }
597 return total_size_in_bytes;
598 }
599 void print_statistics(int message_paragraph = stool::Message::SHOW_MESSAGE) const
600 {
601 std::cout << stool::Message::get_paragraph_string(message_paragraph) << "Statistics(DynamicWaveletTree):" << std::endl;
602 std::cout << stool::Message::get_paragraph_string(message_paragraph + 1) << "Alphabet size: " << this->alphabet.size() << std::endl;
603 std::cout << stool::Message::get_paragraph_string(message_paragraph + 1) << "Text length: " << this->size() << std::endl;
604 std::cout << stool::Message::get_paragraph_string(message_paragraph + 1) << "Estimated memory usage: " << this->size_in_bytes() << " bytes" << std::endl;
605 std::cout << stool::Message::get_paragraph_string(message_paragraph) << "[END]" << std::endl;
606 }
607 std::vector<std::string> get_memory_usage_info(int message_paragraph = stool::Message::SHOW_MESSAGE) const
608 {
609 std::vector<std::string> r;
610 r.push_back(stool::Message::get_paragraph_string(message_paragraph) + "=DynamicWaveletTree: " + std::to_string(this->size_in_bytes()) + " bytes =");
611 uint64_t counter = 0;
612 for (uint64_t h = 0; h < this->bits_seq.size(); h++)
613 {
614 for (const BIT_SEQUENCE &seq : this->bits_seq[h])
615 {
616 if (counter < 5)
617 {
618 std::vector<std::string> log1 = seq.get_memory_usage_info(message_paragraph + 1);
619 for (auto s : log1)
620 {
621 r.push_back(s);
622 }
623 }
624 counter++;
625 }
626 }
627 if(counter > 5){
628 r.push_back(stool::Message::get_paragraph_string(message_paragraph) + "Omitted details of " + std::to_string(counter - 5) + " dynamic bit sequences.");
629 }
630
631 r.push_back(stool::Message::get_paragraph_string(message_paragraph) + "==");
632 return r;
633 }
634 void print_content(int message_paragraph = stool::Message::SHOW_MESSAGE) const
635 {
636 std::string s = this->to_string();
637 std::cout << stool::Message::get_paragraph_string(message_paragraph) << "Content(DynamicWaveletTree):" << std::endl;
638 std::cout << stool::Message::get_paragraph_string(message_paragraph+1) << "String: " << s << "(" << s.size() << ")" << std::endl;
639 std::cout << stool::Message::get_paragraph_string(message_paragraph) << "[END]" << std::endl;
640 }
641
642 void print_memory_usage(int message_paragraph = stool::Message::SHOW_MESSAGE) const
643 {
644 std::vector<std::string> log = this->get_memory_usage_info(message_paragraph);
645 for (std::string &s : log)
646 {
647 std::cout << s << std::endl;
648 }
649 }
650 };
651
652 // template <typename T>
653 // using VLCDequeSeq = DynamicSequence<VLCDeque, T>;
654 }
655
656}
An implementation of B+-tree [Unchecked AI's Comment].
Definition bp_tree.hpp:24
void build(const std::vector< VALUE > &_values)
Builds a complete B+ tree from a vector of values.
Definition bp_tree.hpp:2858
uint64_t size_in_bytes(bool only_extra_bytes=false) const
Returns the total memory size of the B+ tree.
Definition bp_tree.hpp:930
void push_back(VALUE value)
Pushes a single value to the B+ tree.
Definition bp_tree.hpp:1851
std::vector< std::string > get_memory_usage_info(int message_paragraph=stool::Message::SHOW_MESSAGE) const
Returns a vector of strings containing memory usage information about the B+ tree.
Definition bp_tree.hpp:1070
uint64_t size() const
Return the number of values stored in this tree.
Definition bp_tree.hpp:316
VALUE at(uint64_t pos) const
Returns the value at position pos in the B+ tree.
Definition bp_tree.hpp:574
void remove(uint64_t pos)
Removes a value at a specific position in the B+ tree.
Definition bp_tree.hpp:2088
void swap(BPTree &_tree, bool swap_linked_tree)
Swaps the contents of this B+ tree with another B+ tree.
Definition bp_tree.hpp:178
void clear()
Clears all contents of the B+ tree and resets it to an empty state.
Definition bp_tree.hpp:219
void load_from_file(std::ifstream &ifs)
Builds a B+ tree from serialized data in a file stream.
Definition bp_tree.hpp:2993
void resize(uint64_t _size, VALUE default_value)
Resizes the B+ tree to contain a specified number of elements.
Definition bp_tree.hpp:2194
static void store_to_file(BPTree &bp, std::ofstream &os)
Saves the B+ tree structure to a file stream.
Definition bp_tree.hpp:2945
A dynamic data structure supporting rank and select queries on a string. [Unchecked AI's Comment].
Definition dynamic_wavelet_tree.hpp:14
A dynamic data structure supporting rank and select queries on a bit sequence.