b-tree-plus-alpha
Loading...
Searching...
No Matches
dynamic_wavelet_tree.hpp
1#pragma once
3
4namespace stool
5{
6 namespace bptree
7 {
8
14 {
16 std::vector<std::vector<BIT_SEQUENCE>> bits_seq;
17 std::vector<int64_t> char_rank_vec;
18 std::vector<uint8_t> alphabet;
19 uint64_t rank_bit_size = 0;
20
21 public:
23 {
24 this->alphabet.clear();
25 };
26 DynamicWaveletTree(const std::vector<uint8_t> &_alphabet)
27 {
28 this->set_alphabet(_alphabet);
29 };
31 {
32 this->initialize();
33 };
34 DynamicWaveletTree &operator=(const DynamicWaveletTree &) = delete;
35 DynamicWaveletTree(DynamicWaveletTree &&) noexcept = default;
36 DynamicWaveletTree &operator=(DynamicWaveletTree &&) noexcept = default;
37
38 private:
39 void build_bits(const std::vector<uint8_t> &_text, uint64_t h, uint64_t i)
40 {
41 uint64_t bit_idx = this->rank_bit_size - h - 1;
42 std::vector<bool> bits;
43 bits.resize(_text.size(), false);
44 uint64_t counter = 0;
47
48 for (uint8_t c : _text)
49 {
50 int64_t c_rank = this->char_rank_vec[c];
51 bool b = stool::LSBByte::get_bit(c_rank, bit_idx);
52 bits[counter++] = b;
53 if (b)
54 {
56 }
57 else
58 {
60 }
61 }
63 this->bits_seq[h][i].swap(dbs);
64 if (h + 1 < rank_bit_size)
65 {
66 uint64_t next_i = i * 2;
67 {
68 std::vector<uint8_t> left;
70 counter = 0;
71
72 for (uint8_t c : _text)
73 {
74 int64_t c_rank = this->char_rank_vec[c];
75 bool b = stool::LSBByte::get_bit(c_rank, bit_idx);
76 if (!b)
77 {
78 left[counter++] = c;
79 }
80 }
81 this->build_bits(left, h + 1, next_i);
82 }
83 {
84 std::vector<uint8_t> right;
86 counter = 0;
87
88 for (uint8_t c : _text)
89 {
90 int64_t c_rank = this->char_rank_vec[c];
91 bool b = stool::LSBByte::get_bit(c_rank, bit_idx);
92 if (b)
93 {
94 right[counter++] = c;
95 }
96 }
97 this->build_bits(right, h + 1, next_i + 1);
98 }
99 }
100 }
101
102 public:
103 static DynamicWaveletTree build(const std::vector<uint8_t> &_text, const std::vector<uint8_t> &_alphabet)
104 {
106 dwt.build_bits(_text, 0, 0);
107 return dwt;
108 }
109
110 void swap(DynamicWaveletTree &item)
111 {
112 this->bits_seq.swap(item.bits_seq);
113
114 this->char_rank_vec.swap(item.char_rank_vec);
115 this->alphabet.swap(item.alphabet);
116 std::swap(this->rank_bit_size, item.rank_bit_size);
117 }
118
119 private:
120 void initialize()
121 {
122 for (uint64_t i = 0; i < this->bits_seq.size(); i++)
123 {
124 for (uint64_t j = 0; j < this->bits_seq[i].size(); j++)
125 {
126 this->bits_seq[i][j].clear();
127 // delete this->bits_seq[i][j];
128 }
129 }
130 this->bits_seq.clear();
131 this->char_rank_vec.clear();
132 this->alphabet.clear();
133 }
134
135 public:
136 void set_alphabet(const std::vector<uint8_t> &_alphabet)
137 {
138 this->initialize();
139
140 for (auto c : _alphabet)
141 {
142 this->alphabet.push_back(c);
143 }
144 std::sort(this->alphabet.begin(), this->alphabet.end());
145
146 this->char_rank_vec.resize(256, -1);
147 for (uint64_t i = 0; i < alphabet.size(); i++)
148 {
149 this->char_rank_vec[alphabet[i]] = i;
150 }
151 this->rank_bit_size = stool::LSBByte::get_code_length(alphabet.size() - 1);
152
153 for (uint64_t i = 0; i < rank_bit_size; i++)
154 {
155 this->bits_seq.push_back(std::vector<BIT_SEQUENCE>());
156 if (i == 0)
157 {
158 this->bits_seq[i].push_back(BIT_SEQUENCE());
159 }
160 else
161 {
162 uint64_t x = 1ULL << i;
163 for (uint64_t j = 0; j < x; j++)
164 {
165 this->bits_seq[i].push_back(BIT_SEQUENCE());
166 }
167 }
168 }
169 }
170 uint64_t get_alphabet_size() const
171 {
172 return this->alphabet.size();
173 }
174
175 void clear()
176 {
177 for (uint64_t i = 0; i < this->bits_seq.size(); i++)
178 {
179 for (uint64_t j = 0; j < this->bits_seq[i].size(); j++)
180 {
181 this->bits_seq[i][j].clear();
182 }
183 }
184 }
185
186 uint64_t size() const
187 {
188 if (!this->has_empty_alphabet())
189 {
190 return this->bits_seq[0][0].size();
191 }
192 else
193 {
194 return 0;
195 }
196 }
197 uint64_t height() const
198 {
199 return this->bits_seq.size();
200 }
201
202 private:
203 void push_back_sub(uint64_t c_rank)
204 {
205#ifdef DEBUG
206 uint64_t _size = this->size();
207#endif
208 uint64_t pos = 0;
209 for (uint64_t i = 0; i < this->rank_bit_size; i++)
210 {
211 bool b = stool::LSBByte::get_bit(c_rank, this->rank_bit_size - 1 - i);
212 this->bits_seq[i][pos].push_back(b ? 1 : 0);
213 pos = (pos * 2) + (b ? 1 : 0);
214 }
215
216#ifdef DEBUG
217 assert(this->size() == _size + 1);
218#endif
219 }
220 int64_t rank_sub(uint64_t i, uint64_t c_rank) const
221 {
222 bool b1 = stool::LSBByte::get_bit(c_rank, this->rank_bit_size - 1);
223 uint64_t _rank = this->bits_seq[0][0].rank(i + 1, b1);
224 if (_rank == 0)
225 {
226 return 0;
227 }
228 else
229 {
230 uint64_t nth = _rank - 1;
231 uint64_t next = b1 ? 1 : 0;
232
233 for (uint64_t i = 1; i < this->rank_bit_size; i++)
234 {
235 bool bx = stool::LSBByte::get_bit(c_rank, this->rank_bit_size - 1 - i);
236
237 if (nth >= this->bits_seq[i][next].size())
238 {
239 assert(false);
240 throw std::range_error("Error: DynamicSequence::rank_sub()");
241 }
242
243 uint64_t _rankx = this->bits_seq[i][next].rank(nth + 1, bx);
244 if (_rankx == 0)
245 {
246 return 0;
247 }
248 else
249 {
250 nth = _rankx - 1;
251 next = (next * 2) + (bx ? 1 : 0);
252 }
253 }
254 return nth + 1;
255 }
256 }
257 int64_t select_sub(uint64_t nth, uint64_t c_rank) const
258 {
259 int64_t depth = this->bits_seq.size();
260 assert(depth > 0);
261 int64_t h = 0;
262 int64_t j = c_rank / 2;
263 bool hbit = stool::LSBByte::get_bit(c_rank, h);
264 assert(j < (int64_t)this->bits_seq[depth - h - 1].size());
265 uint64_t hbit_rank = this->bits_seq[depth - h - 1][j].count_c(hbit);
266
267 if (nth <= hbit_rank)
268 {
269 int64_t hbit_select = this->bits_seq[depth - h - 1][j].select(nth - 1, hbit);
270
271 assert(hbit_select >= 0);
272
274 j = j / 2;
275 h++;
276 while (h < (int64_t)this->rank_bit_size)
277 {
278 hbit = stool::LSBByte::get_bit(c_rank, h);
279 assert(next_nth <= this->bits_seq[depth - h - 1][j].count_c(hbit));
280 hbit_select = this->bits_seq[depth - h - 1][j].select(next_nth - 1, hbit);
281
282 assert(hbit_select >= 0);
283 next_nth = hbit_select + 1;
284 j = j / 2;
285 h++;
286 }
287 assert(next_nth > 0);
288 return next_nth - 1;
289 }
290 else
291 {
292 return -1;
293 }
294 }
295 void insert_sub(uint64_t nth, uint8_t c_rank)
296 {
297 if (nth <= this->size())
298 {
299
301 uint64_t j = 0;
302 for (int64_t h = 0; h < (int64_t)this->height(); h++)
303 {
304 bool hbit = stool::LSBByte::get_bit(c_rank, this->rank_bit_size - h - 1);
305 assert(next_nth <= this->bits_seq[h][j].size());
306 this->bits_seq[h][j].insert(next_nth, hbit);
307 next_nth = this->bits_seq[h][j].rank(next_nth, hbit);
308 j = (j * 2) + (hbit ? 1 : 0);
309 }
310 }
311 else
312 {
313 throw std::range_error("Error: DynamicSequence::insert(nth, c_rank)");
314 }
315 }
316
317 public:
318 bool has_empty_alphabet() const
319 {
320 return this->rank_bit_size == 0;
321 }
322 int64_t rank(uint64_t i, uint64_t c) const
323 {
324 if (!this->has_empty_alphabet())
325 {
326 assert(c < char_rank_vec.size());
327 int64_t c_rank = this->char_rank_vec[c];
328 assert(c_rank <= (int64_t)this->alphabet.size());
329
330 if (c_rank == -1 || i == 0)
331 {
332 return 0;
333 }
334 else
335 {
336 return this->rank_sub(i - 1, c_rank);
337 }
338 }
339 else
340 {
341 return 0;
342 }
343 }
344
345 int64_t select(uint64_t i, uint64_t c) const
346 {
347 if (!this->has_empty_alphabet())
348 {
349 assert(c < char_rank_vec.size());
350 int64_t c_rank = this->char_rank_vec[c];
351 assert(c_rank <= (int64_t)this->alphabet.size());
352
353 if (c_rank == -1)
354 {
355 return -1;
356 }
357 else
358 {
359 return this->select_sub(i + 1, c_rank);
360 }
361 }
362 else
363 {
364 return -1;
365 }
366 }
367 void push_many(const std::vector<uint8_t> &str)
368 {
369 for (auto c : str)
370 {
371 this->push_back(c);
372 }
373 }
374 void push_back(uint8_t c)
375 {
376 assert(!this->has_empty_alphabet());
377 if (!this->has_empty_alphabet())
378 {
379 int64_t c_rank = this->char_rank_vec[c];
380 if (c_rank == -1)
381 {
382 assert(false);
383 throw std::runtime_error("Error: DynamicSequence::push_back()");
384 }
385 else
386 {
387 this->push_back_sub(c_rank);
388 }
389 }
390 else
391 {
392 throw std::runtime_error("Error: DynamicSequence::push_back()");
393 }
394 }
395 uint64_t get_nth_char_rank(uint64_t nth) const
396 {
397 assert(!this->has_empty_alphabet());
398 if (!this->has_empty_alphabet())
399 {
400 if (nth < this->size())
401 {
403 uint64_t j = 0;
404 for (int64_t h = 0; h < (int64_t)this->height(); h++)
405 {
406 bool b = this->bits_seq[h][j].at(next_nth);
407 next_nth = this->bits_seq[h][j].rank(next_nth, b);
408 j = (j * 2) + (b ? 1 : 0);
409 }
410 return j;
411 }
412 else
413 {
414 throw std::range_error("Error: DynamicSequence::get_nth_char_rank(nth)");
415 }
416 }
417 else
418 {
419 throw std::range_error("Error: DynamicSequence::get_nth_char_rank(nth)");
420 }
421 }
422 uint64_t at(uint64_t nth) const
423 {
424 assert(!this->has_empty_alphabet());
425 if (!this->has_empty_alphabet())
426 {
427 return this->alphabet[this->get_nth_char_rank(nth)];
428 }
429 else
430 {
431 throw std::range_error("Error: DynamicSequence::at(nth)");
432 }
433 }
434
435 void remove(uint64_t nth)
436 {
437 if (nth < this->size())
438 {
440 uint64_t j = 0;
441 for (int64_t h = 0; h < (int64_t)this->height(); h++)
442 {
444 bool b = this->bits_seq[h][j].at(current_nth);
445 next_nth = this->bits_seq[h][j].rank(current_nth, b);
446 this->bits_seq[h][j].remove(current_nth);
447 j = (j * 2) + (b ? 1 : 0);
448 }
449 }
450 else
451 {
452 throw std::range_error("Error: DynamicSequence::remove(nth)");
453 }
454 }
455
456 void insert(uint64_t nth, uint8_t c)
457 {
458 assert(!this->has_empty_alphabet());
459 if (!this->has_empty_alphabet())
460 {
461 int64_t c_rank = this->char_rank_vec[c];
462 if (c_rank == -1)
463 {
464 assert(false);
465 throw std::runtime_error("Error: DynamicSequence::insert(nth, c)");
466 }
467 else
468 {
469 this->insert_sub(nth, c_rank);
470 }
471 }
472 else
473 {
474 throw std::runtime_error("Error: DynamicSequence::insert(nth, c)");
475 }
476 }
477 std::string to_string() const
478 {
479 std::string s;
480 s.resize(this->size());
481 for (uint64_t i = 0; i < this->size(); i++)
482 {
483 s[i] = this->at(i);
484 }
485 return s;
486 }
487 std::vector<uint8_t> to_uint8_str() const
488 {
489 std::vector<uint8_t> s;
490 s.resize(this->size());
491 for (uint64_t i = 0; i < this->size(); i++)
492 {
493 s[i] = this->at(i);
494 }
495 return s;
496 }
497 int64_t get_rank_of_character_in_alphabet(uint8_t c) const
498 {
499 return this->char_rank_vec[c];
500 }
501
502 void print() const
503 {
504 std::cout << "================DynamicSequence==================" << std::endl;
505 stool::DebugPrinter::print_characters(this->alphabet, "Alphabet");
506 stool::DebugPrinter::print_integers(this->char_rank_vec, "Char_rank_vec");
507 std::cout << "Rank_bit_size: " << this->rank_bit_size << std::endl;
508
509 for (uint64_t i = 0; i < this->rank_bit_size; i++)
510 {
511 std::cout << "level: " << i << " / count: " << this->bits_seq[i].size() << std::endl;
512 for (uint64_t j = 0; j < this->bits_seq[i].size(); j++)
513 {
514 std::cout << this->bits_seq[i][j].to_string() << std::endl;
515 }
516 }
517 std::cout << "==================================" << std::endl;
518 }
519
520 uint8_t operator[](uint64_t n) const
521 {
522 return this->at(n);
523 }
524
525 uint64_t get_smallest_character_in_alphabet() const
526 {
527 assert(this->alphabet.size() > 0);
528 return this->alphabet[0];
529 }
530 uint64_t count_c(uint8_t c) const
531 {
532 int64_t c_rank = this->get_rank_of_character_in_alphabet(c);
533 if (c_rank == -1)
534 {
535 return 0;
536 }
537 else
538 {
539 uint64_t idx = (c_rank / 2);
540 uint64_t bit_idx = 0;
541 bool b = stool::LSBByte::get_bit(c_rank, bit_idx);
542 return this->bits_seq[this->bits_seq.size() - 1][idx].count_c(b);
543 }
544 }
545
546 static void save(DynamicWaveletTree &item, std::ofstream &os)
547 {
548 uint64_t alphabet_size = item.alphabet.size();
549 os.write(reinterpret_cast<const char *>(&alphabet_size), sizeof(uint64_t));
550 os.write(reinterpret_cast<const char *>(item.alphabet.data()), alphabet_size * sizeof(uint8_t));
551 for (auto &it : item.bits_seq)
552 {
553 for (auto &it2 : it)
554 {
556 // it2->save(os);
557 }
558 }
559 }
560 static DynamicWaveletTree build_from_data(std::ifstream &ifs)
561 {
564 ifs.read(reinterpret_cast<char *>(&_alphabet_size), sizeof(uint64_t));
565
566 std::vector<uint8_t> _alphabet;
568 ifs.read(reinterpret_cast<char *>(_alphabet.data()), _alphabet_size * sizeof(uint8_t));
569
570 r.set_alphabet(_alphabet);
571 for (auto &it : r.bits_seq)
572 {
573 for (auto &it2 : it)
574 {
576 it2.swap(bits);
577 }
578 }
579 return r;
580 }
581
582 uint64_t size_in_bytes() const
583 {
585 for (uint64_t h = 0; h < this->bits_seq.size(); h++)
586 {
587 for (const BIT_SEQUENCE &seq : this->bits_seq[h])
588 {
590 }
591 }
592 return total_size_in_bytes;
593 }
594 void print_statistics(int message_paragraph = stool::Message::SHOW_MESSAGE) const
595 {
596 std::cout << stool::Message::get_paragraph_string(message_paragraph) << "Statistics(DynamicWaveletTree):" << std::endl;
597 std::cout << stool::Message::get_paragraph_string(message_paragraph + 1) << "Alphabet size: " << this->alphabet.size() << std::endl;
598 std::cout << stool::Message::get_paragraph_string(message_paragraph + 1) << "Text length: " << this->size() << std::endl;
599 std::cout << stool::Message::get_paragraph_string(message_paragraph + 1) << "Estimated memory usage: " << this->size_in_bytes() << " bytes" << std::endl;
600 std::cout << stool::Message::get_paragraph_string(message_paragraph) << "[END]" << std::endl;
601 }
602 std::vector<std::string> get_memory_usage_info(int message_paragraph = stool::Message::SHOW_MESSAGE) const
603 {
604 std::vector<std::string> r;
605 r.push_back(stool::Message::get_paragraph_string(message_paragraph) + "=DynamicWaveletTree: " + std::to_string(this->size_in_bytes()) + " bytes =");
606 uint64_t counter = 0;
607 for (uint64_t h = 0; h < this->bits_seq.size(); h++)
608 {
609 for (const BIT_SEQUENCE &seq : this->bits_seq[h])
610 {
611 if (counter < 5)
612 {
613 std::vector<std::string> log1 = seq.get_memory_usage_info(message_paragraph + 1);
614 for (auto s : log1)
615 {
616 r.push_back(s);
617 }
618 }
619 counter++;
620 }
621 }
622 if(counter > 5){
623 r.push_back(stool::Message::get_paragraph_string(message_paragraph) + "Omitted details of " + std::to_string(counter - 5) + " dynamic bit sequences.");
624 }
625
626 r.push_back(stool::Message::get_paragraph_string(message_paragraph) + "==");
627 return r;
628 }
629 void print_content(int message_paragraph = stool::Message::SHOW_MESSAGE) const
630 {
631 std::cout << stool::Message::get_paragraph_string(message_paragraph) << "Content(DynamicWaveletTree):" << std::endl;
632 std::cout << stool::Message::get_paragraph_string(message_paragraph+1) << "String: " << this->to_string() << std::endl;
633 std::cout << stool::Message::get_paragraph_string(message_paragraph) << "[END]" << std::endl;
634 }
635
636 void print_memory_usage(int message_paragraph = stool::Message::SHOW_MESSAGE) const
637 {
638 std::vector<std::string> log = this->get_memory_usage_info(message_paragraph);
639 for (std::string &s : log)
640 {
641 std::cout << s << std::endl;
642 }
643 }
644 };
645
646 // template <typename T>
647 // using VLCDequeSeq = DynamicSequence<VLCDeque, T>;
648 }
649
650}
An implementation of B+-tree.
Definition bp_tree.hpp:24
void swap(BPTree &_tree, bool swap_linked_tree)
Swaps the contents of this B+ tree with another B+ tree.
Definition bp_tree.hpp:178
VALUE at(uint64_t pos) const
Returns the value at position pos in the B+ tree.
Definition bp_tree.hpp:574
void resize(uint64_t _size, VALUE default_value)
Resizes the B+ tree to contain a specified number of elements.
Definition bp_tree.hpp:2190
uint64_t size() const
Return the number of values stored in this tree.
Definition bp_tree.hpp:316
void build(const std::vector< VALUE > &_values)
Builds a complete B+ tree from a vector of values.
Definition bp_tree.hpp:2857
void push_back(VALUE value)
Pushes a single value to the B+ tree.
Definition bp_tree.hpp:1849
void remove(uint64_t pos)
Removes a value at a specific position in the B+ tree.
Definition bp_tree.hpp:2086
void save(std::vector< uint8_t > &output, uint64_t &pos)
Saves the B+ tree structure to a byte vector.
Definition bp_tree.hpp:2912
void clear()
Clears all contents of the B+ tree and resets it to an empty state.
Definition bp_tree.hpp:219
void build_from_data(const std::vector< uint8_t > &data, uint64_t &pos)
Builds a B+ tree from serialized data in a byte vector.
Definition bp_tree.hpp:2968
uint64_t size_in_bytes() const
Returns the total memory size of the B+ tree.
Definition bp_tree.hpp:931
std::vector< std::string > get_memory_usage_info(int message_paragraph=stool::Message::SHOW_MESSAGE) const
Returns a vector of strings containing memory usage information about the B+ tree.
Definition bp_tree.hpp:1068
A dynamic data structure supporting rank and select queries on a string.
Definition dynamic_wavelet_tree.hpp:14
A dynamic data structure supporting rank and select queries on a bit sequence.