Lower RAM implementation of slice_columns for BRWT#226
Conversation
DBGMode used in build_anno_graph
karasikov
left a comment
There was a problem hiding this comment.
What are the results of the benchmarks?
| const ColumnCallback &callback) const { | ||
| uint64_t nrows = num_rows(); | ||
| sdsl::bit_vector code_column(reduced_matrix_.num_rows()); | ||
| reduced_matrix_.slice_columns(columns, [&](Column j, bitmap&& rows) { |
There was a problem hiding this comment.
| reduced_matrix_.slice_columns(columns, [&](Column j, bitmap&& rows) { | |
| reduced_matrix_.slice_columns(columns, [&](Column j, bitmap&& reduced_column) { |
| rows.add_to(&code_column); | ||
|
|
||
| callback(j, bitmap_generator([&](const auto &index_callback) { | ||
| for (uint64_t i = 0; i < nrows; ++i) { |
There was a problem hiding this comment.
This will take forever. Make it parallel
| for (uint64_t i = 0; i < nrows; ++i) { | |
| #pragma parallel num_threads(get_num_threads()) | |
| for (uint64_t i = 0; i < nrows; ++i) { |
| void slice_columns(const std::vector<Column> &columns, | ||
| const ColumnCallback &callback) const override; |
| void BRWT::slice_columns(const std::vector<Column> &column_ids, | ||
| const ColumnCallback &callback) const { |
| if (column_ids.empty()) | ||
| return; | ||
|
|
||
| auto num_nonzero_rows = nonzero_rows_->num_set_bits(); |
There was a problem hiding this comment.
| auto num_nonzero_rows = nonzero_rows_->num_set_bits(); | |
| uint64_t num_nonzero_rows = nonzero_rows_->num_set_bits(); |
| // check if the column is empty | ||
| if (!num_nonzero_rows) | ||
| return; |
There was a problem hiding this comment.
Even if they are empty, you still need to call them. Add unit tests?
| if (!child_nodes_.size()) { | ||
| // return the index column | ||
| for (size_t k = 0; k < column_ids.size(); ++k) { | ||
| callback(column_ids[k], std::move(*nonzero_rows_->copy())); |
There was a problem hiding this comment.
Better call a const reference, so the column can be copied by the caller if it's needed, and otherwise, there is no overhead.
| callback(column_ids[k], std::move(*nonzero_rows_->copy())); | |
| callback(column_ids[k], *nonzero_rows_); |
| for (size_t k = 0; k < column_ids.size(); ++k) { | ||
| callback(column_ids[k], std::move(*nonzero_rows_->copy())); |
There was a problem hiding this comment.
Why not range-based loop?
for (size_t col_id : column_ids) {
...
}
| tsl::hopscotch_map<uint32_t, std::vector<Column>> child_columns_map; | ||
| for (size_t i = 0; i < column_ids.size(); ++i) { | ||
| assert(column_ids[i] < num_columns()); | ||
| auto child_node = assignments_.group(column_ids[i]); | ||
| auto child_column = assignments_.rank(column_ids[i]); | ||
|
|
||
| auto it = child_columns_map.find(child_node); | ||
| if (it == child_columns_map.end()) | ||
| it = child_columns_map.emplace(child_node, std::vector<Column>{}).first; | ||
|
|
||
| it.value().push_back(child_column); | ||
| } | ||
|
|
||
| auto process = [&](auto child_node, auto *child_columns_ptr) { | ||
| if (num_nonzero_rows == nonzero_rows_->size()) { | ||
| child_nodes_[child_node]->slice_columns(*child_columns_ptr, | ||
| [&](Column j, bitmap&& rows) { | ||
| callback(assignments_.get(child_node, j), std::move(rows)); | ||
| } | ||
| ); | ||
| } else { | ||
| const BRWT *child_node_brwt = dynamic_cast<const BRWT*>( | ||
| child_nodes_[child_node].get() | ||
| ); | ||
| if (child_node_brwt | ||
| && child_columns_ptr->size() > 1 | ||
| && !child_node_brwt->child_nodes_.size()) { | ||
| // if there are multiple column ids corresponding to the same leaf | ||
| // node, then this branch avoids doing redundant select1 calls | ||
| const auto *nonzero_rows = child_node_brwt->nonzero_rows_.get(); | ||
| size_t num_nonzero_rows = nonzero_rows->num_set_bits(); | ||
| if (num_nonzero_rows) { | ||
| std::vector<uint64_t> set_bits; | ||
| set_bits.reserve(num_nonzero_rows); | ||
| nonzero_rows->call_ones([&](auto i) { | ||
| set_bits.push_back(nonzero_rows->select1(i + 1)); | ||
| }); | ||
|
|
||
| for (size_t k = 0; k < child_columns_ptr->size() - 1; ++k) { | ||
| callback(assignments_.get(child_node, (*child_columns_ptr)[k]), | ||
| bitmap_generator(std::move(set_bits), num_rows())); | ||
| } | ||
|
|
||
| callback(assignments_.get(child_node, child_columns_ptr->back()), | ||
| bitmap_generator(std::move(set_bits), num_rows())); | ||
| } | ||
| } else { |
There was a problem hiding this comment.
Could you add some comments to explain why this is going to make things faster than the basic call?
No description provided.