Needle
An application for fast and efficient searches of NGS data.
Functions
ibf.cpp File Reference
#include <chrono>
#include <deque>
#include <iostream>
#include <math.h>
#include <numeric>
#include <omp.h>
#include <string>
#include <algorithm>
#include <filesystem>
#include <ranges>
#include <robin_hood.h>
#include <seqan3/alphabet/container/concatenated_sequences.hpp>
#include <seqan3/alphabet/nucleotide/dna4.hpp>
#include <seqan3/core/concept/cereal.hpp>
#include <seqan3/io/sequence_file/all.hpp>
#include <seqan3/io/stream/detail/fast_istreambuf_iterator.hpp>
#include <seqan3/utility/container/dynamic_bitset.hpp>
#include "ibf.h"
#include "shared.h"

Functions

void get_include_set_table (min_arguments const &args, std::filesystem::path const include_file, robin_hood::unordered_set< uint64_t > &include_table)
 
bool check_for_fasta_format (std::vector< std::string > const &valid_extensions, std::string const &file_path)
 
uint8_t calculate_cutoff (std::filesystem::path sequence_file, int samples)
 
void fill_hash_table (min_arguments const &args, seqan3::sequence_file_input< my_traits, seqan3::fields< seqan3::field::seq >> &fin, robin_hood::unordered_node_map< uint64_t, uint16_t > &hash_table, robin_hood::unordered_node_map< uint64_t, uint8_t > &cutoff_table, robin_hood::unordered_set< uint64_t > const &include_set_table, robin_hood::unordered_set< uint64_t > const &exclude_set_table, bool const only_include=false, uint8_t cutoff=0)
 
void count (min_arguments const &args, std::vector< std::filesystem::path > sequence_files, std::filesystem::path include_file, std::filesystem::path exclude_file, bool paired)
 Get the concrete expression values (= median of all counts of one transcript) for given experiments. This function can be used to estimate how good the median approach can be, if all count values are available. More...
 
void read_binary (std::filesystem::path filename, robin_hood::unordered_node_map< uint64_t, uint16_t > &hash_table)
 Reads a binary file that needle minimiser creates. More...
 
void read_binary_start (min_arguments &args, std::filesystem::path filename, uint64_t &num_of_minimisers, uint8_t &cutoff)
 Reads the beginning of a binary file that needle minimiser creates. More...
 
void check_expression (std::vector< uint16_t > &expression_thresholds, uint8_t &number_expression_thresholds, std::filesystem::path const expression_by_genome_file)
 
void check_cutoffs_samples (std::vector< std::filesystem::path > const &sequence_files, bool const paired, std::vector< int > &samples, std::vector< uint8_t > &cutoffs)
 
void check_fpr (uint8_t const number_expression_thresholds, std::vector< double > &fprs)
 
void get_expression_thresholds (uint8_t const number_expression_thresholds, robin_hood::unordered_node_map< uint64_t, uint16_t > const &hash_table, std::vector< uint16_t > &expression_thresholds, std::vector< uint64_t > &sizes, robin_hood::unordered_set< uint64_t > const &genome, uint8_t cutoff, bool all=true)
 
void get_filsize_per_expression_level (std::filesystem::path filename, uint8_t const number_expression_thresholds, std::vector< uint16_t > const &expression_thresholds, std::vector< uint64_t > &sizes, robin_hood::unordered_set< uint64_t > const &genome, bool all=true)
 
template<bool samplewise, bool minimiser_files_given = true>
void ibf_helper (std::vector< std::filesystem::path > const &minimiser_files, std::vector< double > const &fprs, estimate_ibf_arguments &ibf_args, std::vector< uint8_t > &cutoffs={}, size_t num_hash=1, std::filesystem::path expression_by_genome_file="", minimiser_arguments const &minimiser_args={})
 
std::vector< uint16_t > ibf (std::vector< std::filesystem::path > const &sequence_files, estimate_ibf_arguments &ibf_args, minimiser_arguments &minimiser_args, std::vector< double > &fpr, std::vector< uint8_t > &cutoffs, std::filesystem::path const expression_by_genome_file, size_t num_hash)
 Creates IBFs. More...
 
std::vector< uint16_t > ibf (std::vector< std::filesystem::path > const &minimiser_files, estimate_ibf_arguments &ibf_args, std::vector< double > &fpr, std::filesystem::path const expression_by_genome_file, size_t num_hash)
 Creates IBFs based on the minimiser files. More...
 
void calculate_minimiser (std::vector< std::filesystem::path > const &sequence_files, robin_hood::unordered_set< uint64_t > const &include_set_table, robin_hood::unordered_set< uint64_t > const &exclude_set_table, min_arguments const &args, minimiser_arguments const &minimiser_args, unsigned const i, std::vector< uint8_t > &cutoffs)
 
void minimiser (std::vector< std::filesystem::path > const &sequence_files, min_arguments const &args, minimiser_arguments &minimiser_args, std::vector< uint8_t > &cutoffs)
 Create minimiser and header files. More...
 

Function Documentation

◆ calculate_cutoff()

uint8_t calculate_cutoff ( std::filesystem::path  sequence_file,
int  samples 
)

◆ calculate_minimiser()

void calculate_minimiser ( std::vector< std::filesystem::path > const &  sequence_files,
robin_hood::unordered_set< uint64_t > const &  include_set_table,
robin_hood::unordered_set< uint64_t > const &  exclude_set_table,
min_arguments const &  args,
minimiser_arguments const &  minimiser_args,
unsigned const  i,
std::vector< uint8_t > &  cutoffs 
)

◆ check_cutoffs_samples()

void check_cutoffs_samples ( std::vector< std::filesystem::path > const &  sequence_files,
bool const  paired,
std::vector< int > &  samples,
std::vector< uint8_t > &  cutoffs 
)

◆ check_expression()

void check_expression ( std::vector< uint16_t > &  expression_thresholds,
uint8_t &  number_expression_thresholds,
std::filesystem::path const  expression_by_genome_file 
)

◆ check_for_fasta_format()

bool check_for_fasta_format ( std::vector< std::string > const &  valid_extensions,
std::string const &  file_path 
)
inline

◆ check_fpr()

void check_fpr ( uint8_t const  number_expression_thresholds,
std::vector< double > &  fprs 
)

◆ count()

void count ( min_arguments const &  args,
std::vector< std::filesystem::path >  sequence_files,
std::filesystem::path  genome_file,
std::filesystem::path  exclude_file,
bool  paired 
)

Get the concrete expression values (= median of all counts of one transcript) for given experiments. This function can be used to estimate how good the median approach can be, if all count values are available.

Parameters
args: The minimiser arguments to use (seed, shape, window size).
sequence_files: The sequence files, which contain the reads.
genome_file: A file containing the transcripts whose expression values should be determined.
exclude_file: A file containing minimisers which should be ignored.
paired: Flag to indicate if input data is paired or not.

◆ fill_hash_table()

void fill_hash_table ( min_arguments const &  args,
seqan3::sequence_file_input< my_traits, seqan3::fields< seqan3::field::seq >> &  fin,
robin_hood::unordered_node_map< uint64_t, uint16_t > &  hash_table,
robin_hood::unordered_node_map< uint64_t, uint8_t > &  cutoff_table,
robin_hood::unordered_set< uint64_t > const &  include_set_table,
robin_hood::unordered_set< uint64_t > const &  exclude_set_table,
bool const  only_include = false,
uint8_t  cutoff = 0 
)

◆ get_expression_thresholds()

void get_expression_thresholds ( uint8_t const  number_expression_thresholds,
robin_hood::unordered_node_map< uint64_t, uint16_t > const &  hash_table,
std::vector< uint16_t > &  expression_thresholds,
std::vector< uint64_t > &  sizes,
robin_hood::unordered_set< uint64_t > const &  genome,
uint8_t  cutoff,
bool  all = true 
)

◆ get_filsize_per_expression_level()

void get_filsize_per_expression_level ( std::filesystem::path  filename,
uint8_t const  number_expression_thresholds,
std::vector< uint16_t > const &  expression_thresholds,
std::vector< uint64_t > &  sizes,
robin_hood::unordered_set< uint64_t > const &  genome,
bool  all = true 
)

◆ get_include_set_table()

void get_include_set_table ( min_arguments const &  args,
std::filesystem::path const  include_file,
robin_hood::unordered_set< uint64_t > &  include_table 
)

◆ ibf() [1/2]

std::vector<uint16_t> ibf ( std::vector< std::filesystem::path > const &  minimiser_files,
estimate_ibf_arguments ibf_args,
std::vector< double > &  fpr,
std::filesystem::path const  expression_by_genome_file = "",
size_t  num_hash = 1 
)

Creates IBFs based on the minimiser files.

Parameters
minimiser_files: A vector of minimiser file paths.
ibf_args: The IBF specific arguments to use (bin size, number of hash functions, ...). See struct ibf_arguments.
fpr: The average false positive rate that should be used.
expression_by_genome_file: File that contains the only minimisers that should be considered for the determination of the expression_thresholds.
num_hash: The number of hash functions to use.
Returns
The expression thresholds per experiment.

◆ ibf() [2/2]

std::vector<uint16_t> ibf ( std::vector< std::filesystem::path > const &  sequence_files,
estimate_ibf_arguments ibf_args,
minimiser_arguments minimiser_args,
std::vector< double > &  fpr,
std::vector< uint8_t > &  cutoffs,
std::filesystem::path const  expression_by_genome_file = "",
size_t  num_hash = 1 
)

Creates IBFs.

Parameters
sequence_files: A vector of sequence file paths.
ibf_args: The IBF specific arguments to use (bin size, number of hash functions, ...). See struct ibf_arguments.
minimiser_args: The minimiser specific arguments to use.
fpr: The average false positive rate that should be used.
cutoffs: List of cutoffs.
expression_by_genome_file: File that contains the only minimisers that should be considered for the determination of the expression thresholds.
num_hash: The number of hash functions to use.
Returns
The expression thresholds per experiment.

◆ ibf_helper()

template<bool samplewise, bool minimiser_files_given = true>
void ibf_helper ( std::vector< std::filesystem::path > const &  minimiser_files,
std::vector< double > const &  fprs,
estimate_ibf_arguments ibf_args,
std::vector< uint8_t > &  cutoffs = {},
size_t  num_hash = 1,
std::filesystem::path  expression_by_genome_file = "",
minimiser_arguments const &  minimiser_args = {} 
)

◆ minimiser()

void minimiser ( std::vector< std::filesystem::path > const &  sequence_files,
min_arguments const &  args,
minimiser_arguments minimiser_args,
std::vector< uint8_t > &  cutoffs 
)

Create minimiser and header files.

Parameters
sequence_files: A vector of sequence file paths.
args: The minimiser arguments to use (seed, shape, window size).
minimiser_args: The minimiser specific arguments to use.
cutoffs: List of cutoffs.

◆ read_binary()

void read_binary ( std::filesystem::path  filename,
robin_hood::unordered_node_map< uint64_t, uint16_t > &  hash_table 
)

Reads a binary file that needle minimiser creates.

Parameters
filename: The filename of the binary file.
hash_table: The hash table to store minimisers into.

◆ read_binary_start()

void read_binary_start ( min_arguments args,
std::filesystem::path  filename,
uint64_t &  num_of_minimisers,
uint8_t &  cutoff 
)

Reads the beginning of a binary file that needle minimiser creates.

Parameters
args: Min arguments.
filename: The filename of the binary file.
num_of_minimisers: Variable in which the number of minimisers should be stored.
cutoff: The cutoff value.