diff --git a/metagraph/src/annotation/binary_matrix/base/binary_matrix.cpp b/metagraph/src/annotation/binary_matrix/base/binary_matrix.cpp index 8b7ec4ea06..fb32ce927b 100644 --- a/metagraph/src/annotation/binary_matrix/base/binary_matrix.cpp +++ b/metagraph/src/annotation/binary_matrix/base/binary_matrix.cpp @@ -1,6 +1,7 @@ #include "binary_matrix.hpp" #include +#include #include "common/vectors/bitmap.hpp" #include "common/serialization.hpp" @@ -38,11 +39,14 @@ BinaryMatrix::slice_rows(const std::vector &row_ids) const { } void BinaryMatrix::call_columns(const std::vector &column_ids, - const std::function &callback, + const std::function &callback, size_t num_threads) const { + ProgressBar progress_bar(column_ids.size(), "Parsing columns", + std::cerr, !common::get_verbose()); #pragma omp parallel for num_threads(num_threads) schedule(dynamic) for (size_t k = 0; k < column_ids.size(); ++k) { callback(k, bitmap_generator(get_column(column_ids[k]), num_rows())); + ++progress_bar; } } diff --git a/metagraph/src/annotation/binary_matrix/base/binary_matrix.hpp b/metagraph/src/annotation/binary_matrix/base/binary_matrix.hpp index b9d4815bc2..f5f8a166ce 100644 --- a/metagraph/src/annotation/binary_matrix/base/binary_matrix.hpp +++ b/metagraph/src/annotation/binary_matrix/base/binary_matrix.hpp @@ -9,7 +9,7 @@ #include "common/vector.hpp" -class bitmap; +class bitmap_generator; namespace mtg { namespace annot { @@ -41,7 +41,7 @@ class BinaryMatrix { // For each column id in columns, run callback on its respective index in columns // and a bitmap represnting the column virtual void call_columns(const std::vector &columns, - const std::function &callback, + const std::function &callback, size_t num_threads = 1) const; virtual bool load(std::istream &in) = 0; diff --git a/metagraph/src/annotation/binary_matrix/column_sparse/column_major.cpp b/metagraph/src/annotation/binary_matrix/column_sparse/column_major.cpp index acf930a484..30f4bf4a13 100644 --- a/metagraph/src/annotation/binary_matrix/column_sparse/column_major.cpp +++ b/metagraph/src/annotation/binary_matrix/column_sparse/column_major.cpp @@ -104,7 +104,7 @@ ColumnMajor::slice_rows(const std::vector &row_ids) const { } void ColumnMajor::call_columns(const std::vector &columns, - const std::function &callback, + const std::function &callback, size_t num_threads) const { #pragma omp parallel for num_threads(num_threads) schedule(dynamic) for (size_t j = 0; j < columns.size(); ++j) { diff --git a/metagraph/src/annotation/binary_matrix/column_sparse/column_major.hpp b/metagraph/src/annotation/binary_matrix/column_sparse/column_major.hpp index a473e6f1d0..4d29932c2b 100644 --- a/metagraph/src/annotation/binary_matrix/column_sparse/column_major.hpp +++ b/metagraph/src/annotation/binary_matrix/column_sparse/column_major.hpp @@ -32,7 +32,7 @@ class ColumnMajor : public BinaryMatrix { std::vector slice_rows(const std::vector &rows) const override; void call_columns(const std::vector &columns, - const std::function &callback, + const std::function &callback, size_t num_threads = 1) const override; bool load(std::istream &in) override; diff --git a/metagraph/src/cli/assemble.cpp b/metagraph/src/cli/assemble.cpp index d2e60bfd8d..6cc7deefc5 100644 --- a/metagraph/src/cli/assemble.cpp +++ b/metagraph/src/cli/assemble.cpp @@ -11,6 +11,7 @@ #include "seq_io/sequence_io.hpp" #include "config/config.hpp" #include "load/load_graph.hpp" +#include "load/load_annotation.hpp" #include "load/load_annotated_graph.hpp" #include "graph/annotated_graph_algorithm.hpp" #include "graph/representation/masked_graph.hpp" @@ -41,10 +42,8 @@ void check_labels(const tsl::hopscotch_set &label_set, exit(1); } -DifferentialAssemblyConfig diff_assembly_config(const Json::Value &experiment, - const DeBruijnGraph &graph) { +DifferentialAssemblyConfig diff_assembly_config(const Json::Value &experiment) { DifferentialAssemblyConfig diff_config; - diff_config.add_complement = graph.get_mode() == DeBruijnGraph::CANONICAL; diff_config.label_mask_in_kmer_fraction = experiment.get("in_min_fraction", 1.0).asDouble(); diff_config.label_mask_in_unitig_fraction = experiment.get("unitig_in_min_fraction", 0.0).asDouble(); diff_config.label_mask_out_kmer_fraction = experiment.get("out_max_fraction", 0.0).asDouble(); @@ -81,16 +80,11 @@ void call_masked_graphs(const AnnotatedDBG &anno_graph, Json::Value diff_json; fin >> diff_json; - tsl::hopscotch_set foreground_labels; - tsl::hopscotch_set background_labels; - tsl::hopscotch_set shared_foreground_labels; - tsl::hopscotch_set shared_background_labels; - for (const Json::Value &group : diff_json["groups"]) { - if (group["shared_labels"]) { - shared_foreground_labels.clear(); - shared_background_labels.clear(); + tsl::hopscotch_set shared_foreground_labels; + tsl::hopscotch_set shared_background_labels; + if (group["shared_labels"]) { for (const Json::Value &in_label : group["shared_labels"]["in"]) { shared_foreground_labels.emplace(in_label.asString()); } @@ -107,12 +101,13 @@ void call_masked_graphs(const AnnotatedDBG &anno_graph, throw std::runtime_error("Missing experiments in group"); for (const Json::Value &experiment : group["experiments"]) { - DifferentialAssemblyConfig diff_config = diff_assembly_config( - experiment, anno_graph.get_graph() - ); + tsl::hopscotch_set foreground_labels; + tsl::hopscotch_set background_labels; - foreground_labels.clear(); - background_labels.clear(); + DifferentialAssemblyConfig diff_config = diff_assembly_config(experiment); + + std::string exp_name = experiment["name"].asString() + + (config->enumerate_out_sequences ? "." : ""); for (const Json::Value &in_label : experiment["in"]) { foreground_labels.emplace(in_label.asString()); @@ -125,9 +120,6 @@ void call_masked_graphs(const AnnotatedDBG &anno_graph, check_labels(foreground_labels, anno_graph); check_labels(background_labels, anno_graph); - std::string exp_name = experiment["name"].asString() - + (config->enumerate_out_sequences ? "." : ""); - logger->trace("Running assembly: {}", exp_name); callback(*mask_nodes_by_label(anno_graph, @@ -135,7 +127,65 @@ void call_masked_graphs(const AnnotatedDBG &anno_graph, background_labels, shared_foreground_labels, shared_background_labels, - diff_config, num_threads), exp_name); + diff_config, num_threads, + config->parallel_nodes), exp_name); + } + } +} + +void call_masked_graphs(std::shared_ptr graph_ptr, + Config *config, + const CallMaskedGraphHeader &callback) { + assert(!config->assembly_config_file.empty()); + + std::ifstream fin(config->assembly_config_file); + if (!fin.good()) + throw std::iostream::failure("Failed to read assembly config JSON from " + config->assembly_config_file); + + for (const auto &name : config->fnames) { + if (parse_annotation_type(name) != Config::ColumnCompressed) { + throw std::runtime_error("Multiple annotation files must be ColumnCompressed"); + } + } + + size_t num_threads = std::max(1u, get_num_threads()); + + Json::Value diff_json; + fin >> diff_json; + + for (const Json::Value &group : diff_json["groups"]) { + if (group["shared_labels"]) { + throw std::runtime_error("Shared labels not supported with multiple annotation files"); + } + + if (!group["experiments"]) + throw std::runtime_error("Missing experiments in group"); + + for (const Json::Value &experiment : group["experiments"]) { + tsl::hopscotch_set foreground_labels; + tsl::hopscotch_set background_labels; + + DifferentialAssemblyConfig diff_config = diff_assembly_config(experiment); + + std::string exp_name = experiment["name"].asString() + + (config->enumerate_out_sequences ? "." : ""); + + for (const Json::Value &in_label : experiment["in"]) { + foreground_labels.emplace(in_label.asString()); + } + + for (const Json::Value &out_label : experiment["out"]) { + background_labels.emplace(out_label.asString()); + } + + logger->trace("Running assembly: {}", exp_name); + + callback(*mask_nodes_by_label(graph_ptr, + config->fnames, + foreground_labels, + background_labels, + diff_config, num_threads, + config->parallel_nodes), exp_name); } } } @@ -143,9 +193,17 @@ void call_masked_graphs(const AnnotatedDBG &anno_graph, int assemble(Config *config) { assert(config); + if (config->infbase_annotators.size()) { + assert(config->assembly_config_file.size()); + if (!std::filesystem::exists(config->assembly_config_file)) { + logger->error("Differential assembly config does not exist\n{}", + config->assembly_config_file); + exit(1); + } + } + const auto &files = config->fnames; - assert(files.size() == 1); assert(config->outfbase.size()); Timer timer; @@ -156,48 +214,49 @@ int assemble(Config *config) { logger->trace("Graph loaded in {} sec", timer.elapsed()); if (config->infbase_annotators.size()) { - config->infbase = files.at(0); - - assert(config->assembly_config_file.size()); - auto anno_graph = initialize_annotated_dbg(graph, *config); - logger->trace("Generating masked graphs..."); + std::mutex write_mutex; + size_t num_threads = std::max(1u, get_num_threads()); std::filesystem::remove( utils::remove_suffix(config->outfbase, ".gz", ".fasta") + ".fasta.gz" ); - std::mutex write_mutex; - - size_t num_threads = std::max(1u, get_num_threads()); - - call_masked_graphs(*anno_graph, config, - [&](const graph::MaskedDeBruijnGraph &graph, const std::string &header) { - seq_io::FastaWriter writer(config->outfbase, header, - config->enumerate_out_sequences, - num_threads > 1, /* async write */ - "a" /* append mode */); - - if (config->unitigs || config->min_tip_size > 1) { - graph.call_unitigs([&](const std::string &unitig, auto&&) { - std::lock_guard lock(write_mutex); - writer.write(unitig); - }, - num_threads, config->min_tip_size, - config->kmers_in_single_form); - } else { - graph.call_sequences([&](const std::string &seq, auto&&) { - std::lock_guard lock(write_mutex); - writer.write(seq); - }, - num_threads, config->kmers_in_single_form); - } + auto graph_callback = [&](const graph::MaskedDeBruijnGraph &graph, const std::string &header) { + seq_io::FastaWriter writer(config->outfbase, header, + config->enumerate_out_sequences, + num_threads > 1, /* async write */ + "a" /* append mode */); + + if (config->unitigs || config->min_tip_size > 1) { + graph.call_unitigs([&](const std::string &unitig, auto&&) { + std::lock_guard lock(write_mutex); + writer.write(unitig); + }, + num_threads, config->min_tip_size, + config->kmers_in_single_form); + } else { + graph.call_sequences([&](const std::string &seq, auto&&) { + std::lock_guard lock(write_mutex); + writer.write(seq); + }, + num_threads, config->kmers_in_single_form); } - ); - + }; + + if (files.size() > 1) { + config->fnames.erase(config->fnames.begin(), config->fnames.begin() + 1); + call_masked_graphs(graph, config, graph_callback); + } else { + config->infbase = files.at(0); + auto anno_graph = initialize_annotated_dbg(graph, *config); + call_masked_graphs(*anno_graph, config, graph_callback); + } return 0; } + assert(files.size() == 1); + logger->trace("Extracting sequences from graph..."); timer.reset(); diff --git a/metagraph/src/common/vectors/bitmap.cpp b/metagraph/src/common/vectors/bitmap.cpp index f4888735e4..6d31c5f7c8 100644 --- a/metagraph/src/common/vectors/bitmap.cpp +++ b/metagraph/src/common/vectors/bitmap.cpp @@ -356,6 +356,13 @@ uint64_t bitmap_lazy::num_set_bits() const { return num_set_bits_; } +std::optional bitmap_lazy::try_get_num_set_bits() const { + if (num_set_bits_ == static_cast(-1)) + return std::nullopt; + + return num_set_bits_; +} + void bitmap_lazy::add_to(sdsl::bit_vector *other) const { assert(other); assert(other->size() == size()); diff --git a/metagraph/src/common/vectors/bitmap.hpp b/metagraph/src/common/vectors/bitmap.hpp index b5b855a5eb..9f8d68438a 100644 --- a/metagraph/src/common/vectors/bitmap.hpp +++ b/metagraph/src/common/vectors/bitmap.hpp @@ -176,6 +176,7 @@ class bitmap_lazy : public bitmap { uint64_t size() const { return size_; } uint64_t num_set_bits() const; + std::optional try_get_num_set_bits() const; void add_to(sdsl::bit_vector *other) const; void call_ones_in_range(uint64_t begin, uint64_t end, @@ -197,12 +198,18 @@ class bitmap_generator : public bitmap { std::for_each(s.begin(), s.end(), callback); }) {} + bitmap_generator(const bitmap &bits) noexcept + : size_(bits.size()), num_set_bits_(bits.num_set_bits()), + generator_([&bits](const VoidCall &callback) { + bits.call_ones(callback); + }) {} + bitmap_generator(std::function&)>&& generator, size_t size, size_t num_set_bits = -1) noexcept; - bool operator[](uint64_t) const { throw std::runtime_error("Not implemented"); } - uint64_t get_int(uint64_t, uint32_t) const { throw std::runtime_error("Not implemented"); } + bool operator[](uint64_t) const { throw std::runtime_error("operator[] not implemented for bitmap_generator"); } + uint64_t get_int(uint64_t, uint32_t) const { throw std::runtime_error("get_int not implemented for bitmap_generator"); } uint64_t size() const { return size_; } uint64_t num_set_bits() const { return num_set_bits_; } diff --git a/metagraph/src/graph/annotated_graph_algorithm.cpp b/metagraph/src/graph/annotated_graph_algorithm.cpp index 60dc814919..0a5ee8d1f7 100644 --- a/metagraph/src/graph/annotated_graph_algorithm.cpp +++ b/metagraph/src/graph/annotated_graph_algorithm.cpp @@ -4,6 +4,7 @@ #include "common/vectors/vector_algorithm.hpp" #include "common/vectors/bitmap.hpp" #include "graph/representation/masked_graph.hpp" +#include "annotation/representation/column_compressed/annotate_column_compressed.hpp" namespace mtg { @@ -12,7 +13,9 @@ namespace graph { using mtg::common::logger; typedef AnnotatedDBG::node_index node_index; +typedef AnnotatedDBG::Annotator Annotator; typedef AnnotatedDBG::Annotator::Label Label; +using Column = annot::binmat::BinaryMatrix::Column; typedef std::function LabelCountCallback; @@ -20,21 +23,36 @@ constexpr std::memory_order MO_RELAXED = std::memory_order_relaxed; /** - * Return an int_vector<>, bit_vector pair, of lengths anno_graph.get_graph().max_index() * 2 - * and anno_graph.get_graph().max_index(), respectively. + * Return an int_vector<>, bit_vector of lengths anno_graph.get_graph().max_index() * 2 + * and anno_graph.get_graph().max_index(), respectively, and a bit_vector of length + * equal to the total number of labels. * For an index i, the int_vector at indices 2*i and 2*i + 1 represent the * number of labels in labels_in and labels_out which the k-mer of index i is * annotated with, respectively. The width of the int_vector<> is computed to be * wide enough to contain counts up to num_labels. * The returned bit_vector is a k-mer mask indicating those k-mers annotated - * with at least one in-label or out-label. + * with at least one in-label. If add_out_labels_to_mask is true, then it also indicates + * which those k-mers with at least one out-label. + * The second bit vector indicates which of the labels correspond to an "other" + * label (i.e., not in- or out-). */ -std::pair, sdsl::bit_vector> -construct_diff_label_count_vector(const AnnotatedDBG &anno_graph, +std::tuple, sdsl::bit_vector, sdsl::bit_vector> +construct_diff_label_count_vector(const Annotator &annotator, const tsl::hopscotch_set