Needle
An application for fast and efficient searches of NGS data.
Loading...
Searching...
No Matches
Classes | Functions
ibf.h File Reference
#include <iostream>
#include <math.h>
#include <numeric>
#include <string>
#include <seqan3/alphabet/container/concatenated_sequences.hpp>
#include <seqan3/alphabet/nucleotide/dna4.hpp>
#include <filesystem>
#include "shared.h"

Go to the source code of this file.

Classes

struct  minimiser_arguments
 
struct  RandomGenerator
 Generates a random integer not greater than a given maximum. More...
 

Functions

void count (min_arguments const &args, std::vector< std::filesystem::path > sequence_files, std::filesystem::path include_file, std::filesystem::path genome_file, bool paired)
 Get the concrete expression values (= median of all counts of one transcript) for given experiments. This function can be used to estimate how good the median approach can be, if all count values are available.
 
void count_genome (min_arguments const &args, std::filesystem::path include_file, std::filesystem::path exclude_file)
 Creates a set of minimizers to ignore, which should be used as an input to count.
 
void read_binary (std::filesystem::path filename, robin_hood::unordered_node_map< uint64_t, uint16_t > &hash_table)
 Reads a binary file that needle minimiser creates.
 
void read_binary_start (min_arguments &args, std::filesystem::path filename, uint64_t &num_of_minimisers, uint8_t &cutoff)
 Reads the beginning of a binary file that needle minimiser creates.
 
std::vector< uint16_t > ibf (std::vector< std::filesystem::path > const &sequence_files, estimate_ibf_arguments &ibf_args, minimiser_arguments &minimiser_args, std::vector< double > &fpr, std::vector< uint8_t > &cutoffs, std::filesystem::path const expression_by_genome_file="", size_t num_hash=1)
 Creates IBFs.
 
std::vector< uint16_t > ibf (std::vector< std::filesystem::path > const &minimiser_files, estimate_ibf_arguments &ibf_args, std::vector< double > &fpr, std::filesystem::path const expression_by_genome_file="", size_t num_hash=1)
 Creates IBFs based on the minimiser files.
 
void minimiser (std::vector< std::filesystem::path > const &sequence_files, min_arguments const &args, minimiser_arguments &minimiser_args, std::vector< uint8_t > &cutoffs)
 Create minimiser and header files.
 
std::vector< uint16_t > insert (std::vector< std::filesystem::path > const &sequence_files, estimate_ibf_arguments &ibf_args, minimiser_arguments &minimiser_args, std::vector< uint8_t > &cutoffs, std::filesystem::path const expression_by_genome_file, std::filesystem::path path_in, bool samplewise)
 Insert into IBFs.
 
std::vector< uint16_t > insert (std::vector< std::filesystem::path > const &minimiser_files, estimate_ibf_arguments &ibf_args, std::filesystem::path const expression_by_genome_file, std::filesystem::path path_in, bool samplewise)
 Insert into IBFs based on the minimiser files.
 
void delete_bin (std::vector< uint64_t > const &delete_files, estimate_ibf_arguments &ibf_args, std::filesystem::path path_in, bool samplewise)
 Delete bins from ibfs.
 

Function Documentation

◆ count()

void count ( min_arguments const &  args,
std::vector< std::filesystem::path >  sequence_files,
std::filesystem::path  include_file,
std::filesystem::path  genome_file,
bool  paired 
)

Get the concrete expression values (= median of all counts of one transcript) for given experiments. This function can be used to estimate how good the median approach can be, if all count values are available.

Parameters
args: The minimiser arguments to use (seed, shape, window size).
sequence_files: The sequence files, which contain the reads.
include_file: A file containing the transcripts whose expression values should be determined.
genome_file: A "*.genome" file constructed with the command genome.
paired: Flag to indicate if input data is paired or not.

◆ count_genome()

void count_genome ( min_arguments const &  args,
std::filesystem::path  include_file,
std::filesystem::path  exclude_file 
)

Creates a set of minimizers to ignore, which should be used as an input to count.

Parameters
args: The minimiser arguments to use (seed, shape, window size).
include_file: A file containing the transcripts whose expression values should be determined.
exclude_file: A file containing minimizers which should be ignored.

◆ delete_bin()

void delete_bin ( std::vector< uint64_t > const &  delete_files,
estimate_ibf_arguments &  ibf_args,
std::filesystem::path  path_in,
bool  samplewise 
)

Delete bins from ibfs.

Parameters
delete_files: A vector of integers specifying the bins to delete.
ibf_args: The IBF specific arguments to use (bin size, number of hash functions, ...). See struct ibf_arguments.
path_in: Input directory.
samplewise: True if expression levels were set beforehand.

◆ ibf() [1/2]

std::vector< uint16_t > ibf ( std::vector< std::filesystem::path > const &  minimiser_files,
estimate_ibf_arguments &  ibf_args,
std::vector< double > &  fpr,
std::filesystem::path const  expression_by_genome_file = "",
size_t  num_hash = 1 
)

Creates IBFs based on the minimiser files.

Parameters
minimiser_files: A vector of minimiser file paths.
ibf_args: The IBF specific arguments to use (bin size, number of hash functions, ...). See struct ibf_arguments.
fpr: The average false positive rate that should be used.
expression_by_genome_file: File that contains the only minimisers that should be considered for the determination of the expression thresholds.
num_hash: The number of hash functions to use.
Returns
The expression thresholds per experiment.

◆ ibf() [2/2]

std::vector< uint16_t > ibf ( std::vector< std::filesystem::path > const &  sequence_files,
estimate_ibf_arguments &  ibf_args,
minimiser_arguments &  minimiser_args,
std::vector< double > &  fpr,
std::vector< uint8_t > &  cutoffs,
std::filesystem::path const  expression_by_genome_file = "",
size_t  num_hash = 1 
)

Creates IBFs.

Parameters
sequence_files: A vector of sequence file paths.
ibf_args: The IBF specific arguments to use (bin size, number of hash functions, ...). See struct ibf_arguments.
minimiser_args: The minimiser specific arguments to use.
fpr: The average false positive rate that should be used.
cutoffs: List of cutoffs.
expression_by_genome_file: File that contains the only minimisers that should be considered for the determination of the expression thresholds.
num_hash: The number of hash functions to use.
Returns
The expression thresholds per experiment.

◆ insert() [1/2]

std::vector< uint16_t > insert ( std::vector< std::filesystem::path > const &  minimiser_files,
estimate_ibf_arguments &  ibf_args,
std::filesystem::path const  expression_by_genome_file,
std::filesystem::path  path_in,
bool  samplewise 
)

Insert into IBFs based on the minimiser files.

Parameters
minimiser_files: A vector of minimiser file paths.
ibf_args: The IBF specific arguments to use (bin size, number of hash functions, ...). See struct ibf_arguments.
expression_by_genome_file: File that contains the only minimisers that should be considered for the determination of the expression thresholds.
path_in: Input directory.
samplewise: True if expression levels were set beforehand.
Returns
The expression thresholds per experiment.

◆ insert() [2/2]

std::vector< uint16_t > insert ( std::vector< std::filesystem::path > const &  sequence_files,
estimate_ibf_arguments &  ibf_args,
minimiser_arguments &  minimiser_args,
std::vector< uint8_t > &  cutoffs,
std::filesystem::path const  expression_by_genome_file,
std::filesystem::path  path_in,
bool  samplewise 
)

Insert into IBFs.

Parameters
sequence_files: A vector of sequence file paths.
ibf_args: The IBF specific arguments to use (bin size, number of hash functions, ...). See struct ibf_arguments.
minimiser_args: The minimiser specific arguments to use.
cutoffs: List of cutoffs.
expression_by_genome_file: File that contains the only minimisers that should be considered for the determination of the expression thresholds.
path_in: Input directory.
samplewise: True if expression levels were set beforehand.
Returns
The expression thresholds per experiment.

◆ minimiser()

void minimiser ( std::vector< std::filesystem::path > const &  sequence_files,
min_arguments const &  args,
minimiser_arguments &  minimiser_args,
std::vector< uint8_t > &  cutoffs 
)

Create minimiser and header files.

Parameters
sequence_files: A vector of sequence file paths.
args: The minimiser arguments to use (seed, shape, window size).
minimiser_args: The minimiser specific arguments to use.
cutoffs: List of cutoffs.

◆ read_binary()

void read_binary ( std::filesystem::path  filename,
robin_hood::unordered_node_map< uint64_t, uint16_t > &  hash_table 
)

Reads a binary file that needle minimiser creates.

Parameters
filename: The filename of the binary file.
hash_table: The hash table to store minimisers into.

◆ read_binary_start()

void read_binary_start ( min_arguments args,
std::filesystem::path  filename,
uint64_t &  num_of_minimisers,
uint8_t &  cutoff 
)

Reads the beginning of a binary file that needle minimiser creates.

Parameters
args: Min arguments.
filename: The filename of the binary file.
num_of_minimisers: Variable in which the number of minimisers should be stored.
cutoff: Cutoff value.