Needle
An application for fast and efficient searches of NGS data.
|
#include <chrono>
#include <deque>
#include <iostream>
#include <math.h>
#include <numeric>
#include <omp.h>
#include <string>
#include <algorithm>
#include <filesystem>
#include <ranges>
#include <robin_hood.h>
#include <seqan3/alphabet/container/concatenated_sequences.hpp>
#include <seqan3/alphabet/nucleotide/dna4.hpp>
#include <seqan3/core/concept/cereal.hpp>
#include <seqan3/io/sequence_file/all.hpp>
#include <seqan3/io/stream/detail/fast_istreambuf_iterator.hpp>
#include <seqan3/utility/container/dynamic_bitset.hpp>
#include "ibf.h"
#include "shared.h"
Functions | |
void | get_include_set_table (min_arguments const &args, std::filesystem::path const include_file, robin_hood::unordered_set< uint64_t > &include_table) |
bool | check_for_fasta_format (std::vector< std::string > const &valid_extensions, std::string const &file_path) |
uint8_t | calculate_cutoff (std::filesystem::path sequence_file, int samples) |
void | fill_hash_table (min_arguments const &args, seqan3::sequence_file_input< my_traits, seqan3::fields< seqan3::field::seq >> &fin, robin_hood::unordered_node_map< uint64_t, uint16_t > &hash_table, robin_hood::unordered_node_map< uint64_t, uint8_t > &cutoff_table, robin_hood::unordered_set< uint64_t > const &include_set_table, robin_hood::unordered_set< uint64_t > const &exclude_set_table, bool const only_include=false, uint8_t cutoff=0) |
void | count (min_arguments const &args, std::vector< std::filesystem::path > sequence_files, std::filesystem::path include_file, std::filesystem::path exclude_file, bool paired) |
Get the concrete expression values (= median of all counts of one transcript) for given experiments. This function can be used to estimate how good the median approach can be, if all count values are available. More... | |
void | read_binary (std::filesystem::path filename, robin_hood::unordered_node_map< uint64_t, uint16_t > &hash_table) |
Reads a binary file that needle minimiser creates. More... | |
void | read_binary_start (min_arguments &args, std::filesystem::path filename, uint64_t &num_of_minimisers, uint8_t &cutoff) |
Reads the beginning of a binary file that needle minimiser creates. More... | |
void | check_expression (std::vector< uint16_t > &expression_thresholds, uint8_t &number_expression_thresholds, std::filesystem::path const expression_by_genome_file) |
void | check_cutoffs_samples (std::vector< std::filesystem::path > const &sequence_files, bool const paired, std::vector< int > &samples, std::vector< uint8_t > &cutoffs) |
void | check_fpr (uint8_t const number_expression_thresholds, std::vector< double > &fprs) |
void | get_expression_thresholds (uint8_t const number_expression_thresholds, robin_hood::unordered_node_map< uint64_t, uint16_t > const &hash_table, std::vector< uint16_t > &expression_thresholds, std::vector< uint64_t > &sizes, robin_hood::unordered_set< uint64_t > const &genome, uint8_t cutoff, bool all=true) |
void | get_filsize_per_expression_level (std::filesystem::path filename, uint8_t const number_expression_thresholds, std::vector< uint16_t > const &expression_thresholds, std::vector< uint64_t > &sizes, robin_hood::unordered_set< uint64_t > const &genome, bool all=true) |
template<bool samplewise, bool minimiser_files_given = true> | |
void | ibf_helper (std::vector< std::filesystem::path > const &minimiser_files, std::vector< double > const &fprs, estimate_ibf_arguments &ibf_args, std::vector< uint8_t > &cutoffs={}, size_t num_hash=1, std::filesystem::path expression_by_genome_file="", minimiser_arguments const &minimiser_args={}) |
std::vector< uint16_t > | ibf (std::vector< std::filesystem::path > const &sequence_files, estimate_ibf_arguments &ibf_args, minimiser_arguments &minimiser_args, std::vector< double > &fpr, std::vector< uint8_t > &cutoffs, std::filesystem::path const expression_by_genome_file, size_t num_hash) |
Creates IBFs. More... | |
std::vector< uint16_t > | ibf (std::vector< std::filesystem::path > const &minimiser_files, estimate_ibf_arguments &ibf_args, std::vector< double > &fpr, std::filesystem::path const expression_by_genome_file, size_t num_hash) |
Creates IBFs based on the minimiser files. More... | |
void | calculate_minimiser (std::vector< std::filesystem::path > const &sequence_files, robin_hood::unordered_set< uint64_t > const &include_set_table, robin_hood::unordered_set< uint64_t > const &exclude_set_table, min_arguments const &args, minimiser_arguments const &minimiser_args, unsigned const i, std::vector< uint8_t > &cutoffs) |
void | minimiser (std::vector< std::filesystem::path > const &sequence_files, min_arguments const &args, minimiser_arguments &minimiser_args, std::vector< uint8_t > &cutoffs) |
Create minimiser and header files. More... | |
uint8_t calculate_cutoff | ( | std::filesystem::path | sequence_file, |
int | samples | ||
) |
void calculate_minimiser | ( | std::vector< std::filesystem::path > const & | sequence_files, |
robin_hood::unordered_set< uint64_t > const & | include_set_table, | ||
robin_hood::unordered_set< uint64_t > const & | exclude_set_table, | ||
min_arguments const & | args, | ||
minimiser_arguments const & | minimiser_args, | ||
unsigned const | i, | ||
std::vector< uint8_t > & | cutoffs | ||
) |
void check_cutoffs_samples | ( | std::vector< std::filesystem::path > const & | sequence_files, |
bool const | paired, | ||
std::vector< int > & | samples, | ||
std::vector< uint8_t > & | cutoffs | ||
) |
void check_expression | ( | std::vector< uint16_t > & | expression_thresholds, |
uint8_t & | number_expression_thresholds, | ||
std::filesystem::path const | expression_by_genome_file | ||
) |
|
inline |
void check_fpr | ( | uint8_t const | number_expression_thresholds, |
std::vector< double > & | fprs | ||
) |
void count | ( | min_arguments const & | args, |
std::vector< std::filesystem::path > | sequence_files, | ||
std::filesystem::path | genome_file, | ||
std::filesystem::path | exclude_file, | ||
bool | paired | ||
) |
Get the concrete expression values (= median of all counts of one transcript) for given experiments. This function can be used to estimate how good the median approach can be, if all count values are available.
args | The minimiser arguments to use (seed, shape, window size). |
sequence_files | The sequence files, which contains the reads. |
genome_file | A file containing the transcripts whose expression values should be determined. |
exclude_file | A file containing minimisers which should be ignored. |
paired | Flag to indicate if input data is paired or not. |
void fill_hash_table | ( | min_arguments const & | args, |
seqan3::sequence_file_input< my_traits, seqan3::fields< seqan3::field::seq >> & | fin, | ||
robin_hood::unordered_node_map< uint64_t, uint16_t > & | hash_table, | ||
robin_hood::unordered_node_map< uint64_t, uint8_t > & | cutoff_table, | ||
robin_hood::unordered_set< uint64_t > const & | include_set_table, | ||
robin_hood::unordered_set< uint64_t > const & | exclude_set_table, | ||
bool const | only_include = false , |
||
uint8_t | cutoff = 0 |
||
) |
void get_expression_thresholds | ( | uint8_t const | number_expression_thresholds, |
robin_hood::unordered_node_map< uint64_t, uint16_t > const & | hash_table, | ||
std::vector< uint16_t > & | expression_thresholds, | ||
std::vector< uint64_t > & | sizes, | ||
robin_hood::unordered_set< uint64_t > const & | genome, | ||
uint8_t | cutoff, | ||
bool | all = true |
||
) |
void get_filsize_per_expression_level | ( | std::filesystem::path | filename, |
uint8_t const | number_expression_thresholds, | ||
std::vector< uint16_t > const & | expression_thresholds, | ||
std::vector< uint64_t > & | sizes, | ||
robin_hood::unordered_set< uint64_t > const & | genome, | ||
bool | all = true |
||
) |
void get_include_set_table | ( | min_arguments const & | args, |
std::filesystem::path const | include_file, | ||
robin_hood::unordered_set< uint64_t > & | include_table | ||
) |
std::vector<uint16_t> ibf | ( | std::vector< std::filesystem::path > const & | minimiser_files, |
estimate_ibf_arguments & | ibf_args, | ||
std::vector< double > & | fpr, | ||
std::filesystem::path const | expression_by_genome_file = "" , |
||
size_t | num_hash = 1 |
||
) |
Creates IBFs based on the minimiser files.
minimiser_files | A vector of minimiser file paths. |
ibf_args | The IBF specific arguments to use (bin size, number of hash functions, ...). See struct ibf_arguments. |
fpr | The average false positive rate that should be used. |
expression_by_genome_file | File that contains the only minimisers that should be considered for the determination of the expression thresholds. |
num_hash | The number of hash functions to use. |
std::vector<uint16_t> ibf | ( | std::vector< std::filesystem::path > const & | sequence_files, |
estimate_ibf_arguments & | ibf_args, | ||
minimiser_arguments & | minimiser_args, | ||
std::vector< double > & | fpr, | ||
std::vector< uint8_t > & | cutoffs, | ||
std::filesystem::path const | expression_by_genome_file = "" , |
||
size_t | num_hash = 1 |
||
) |
Creates IBFs.
sequence_files | A vector of sequence file paths. |
ibf_args | The IBF specific arguments to use (bin size, number of hash functions, ...). See struct ibf_arguments. |
minimiser_args | The minimiser specific arguments to use. |
fpr | The average false positive rate that should be used. |
cutoffs | List of cutoffs. |
expression_by_genome_file | File that contains the only minimisers that should be considered for the determination of the expression thresholds. |
num_hash | The number of hash functions to use. |
void ibf_helper | ( | std::vector< std::filesystem::path > const & | minimiser_files, |
std::vector< double > const & | fprs, | ||
estimate_ibf_arguments & | ibf_args, | ||
std::vector< uint8_t > & | cutoffs = {} , |
||
size_t | num_hash = 1 , |
||
std::filesystem::path | expression_by_genome_file = "" , |
||
minimiser_arguments const & | minimiser_args = {} |
||
) |
void minimiser | ( | std::vector< std::filesystem::path > const & | sequence_files, |
min_arguments const & | args, | ||
minimiser_arguments & | minimiser_args, | ||
std::vector< uint8_t > & | cutoffs | ||
) |
Create minimiser and header files.
sequence_files | A vector of sequence file paths. |
args | The minimiser arguments to use (seed, shape, window size). |
minimiser_args | The minimiser specific arguments to use. |
cutoffs | List of cutoffs. |
void read_binary | ( | std::filesystem::path | filename, |
robin_hood::unordered_node_map< uint64_t, uint16_t > & | hash_table | ||
) |
Reads a binary file that needle minimiser creates.
filename | The filename of the binary file. |
hash_table | The hash table to store minimisers into. |
void read_binary_start | ( | min_arguments & | args, |
std::filesystem::path | filename, | ||
uint64_t & | num_of_minimisers, | ||
uint8_t & | cutoff | ||
) |
Reads the beginning of a binary file that needle minimiser creates.
args | Min arguments. |
filename | The filename of the binary file. |
num_of_minimisers | Variable in which the number of minimisers should be stored. |
cutoff | cutoff value. |