Augustus 3.4.0
Loading...
Searching...
No Matches
geneMSA.hh
1/*
2 * geneMSA.hh
3 *
4 * License: Artistic License, see file LICENSE.TXT or
5 * https://opensource.org/licenses/artistic-license-1.0
6 */
7
8#ifndef _GENEMSA
9#define _GENEMSA
10
11// project includes
12#include "alignment.hh"
13#include "exoncand.hh"
14#include "orthoexon.hh"
15#include "randseqaccess.hh"
16#include "speciesgraph.hh"
17#include "contTimeMC.hh"
18#include<unordered_map>
19#include<boost/functional/hash.hpp>
20
21//forward declarations
22
28class OrthoGraph;
29string printRFC(vector<int>);
30
37class GeneMSA {
38public:
40 ~GeneMSA();
41
42 /*
43 * get and set functions
44 */
45 // start and end position (0-based, inclusive) of the gene range for the given species
46 // start <= end refer to the FORWARD strand of the sequence (different from class 'alignment')
47 // start = end = -1 if the species is absent from the alignment
48 string getSeqID(int speciesIdx);
49 Strand getStrand(int speciesIdx);
50 int getStart(int speciesIdx){ return starts[speciesIdx]; }
51 int getEnd(int speciesIdx){ return ends[speciesIdx]; }
52 list<ExonCandidate*>* getExonCands(int speciesIdx){ return exoncands.at(speciesIdx); }
53 //list<OrthoExon> getOrthoExons() { return orthoExonsList; }
54 Alignment * getAlignment() {return alignment;}
55 vector<int> getOffsets() {return offsets;}
56
57 map<string,ExonCandidate*>* getECHash(list<ExonCandidate*> *ec); // adds the keys to the map function
58
59 /*
60 * assmotifqthresh, assqthresh, dssqthresh are between 0 and 1 and thresholds for the inclusion of
61 * acceptor/donor splice sites based on the pattern probability
62 * assqthresh=0.05 means that only acceptor ss are considered
63 * that have a pattern, such that 5% of true splice site patterns have lower probability.
64 * The default threshold of 0 means that all splice site patterns are considered.
65 */
66 void createExonCands(int s, const char *dna, map<int_fast64_t, ExonCandidate*> &ecs, map<int_fast64_t, ExonCandidate*> &addECs);
67 void setExonCands(vector<map<int_fast64_t, ExonCandidate*> > &ecs);
68
79 void createOrthoExons(list<OrthoExon> &orthoExonsList, map<int_fast64_t, list<pair<int,ExonCandidate*> > > &alignedECs, Evo *evo, float consThres = 0.0, int minAvLen = 0);
80 void printStats(); // to stdout
81 void printGeneRanges();
82 void printExonCands();
83 void printOrthoExons(list<OrthoExon> &orthoExonsList);
84 void computeOmegas(list<OrthoExon> &orthoExonsList, vector<AnnoSequence*> const &seqRanges, PhyloTree *ctree);
85 void computeOmegasEff(list<OrthoExon> &orthoExonsList, vector<AnnoSequence*> const &seqRanges, PhyloTree *ctree, ofstream *codonAli);
93 void computeClamsaEff(list<OrthoExon> &orthoExonsList, vector<AnnoSequence*> const &seqRanges, PhyloTree *ctree, ofstream *codonAli);
94 void computeClamsa(list<OrthoExon> &orthoExonsList, vector<AnnoSequence*> const &seqRanges, PhyloTree *ctree, ofstream *codonAli);
95 vector<string> pruneToBV(vector<string> *cs, bit_vector bv); // prune codon strings to bit_vector
96 vector<int> pruneToBV(vector<int> *rfc, bit_vector bv); // prune RFC to bit_vector
97 double omegaForCodonTuple(vector<double> *loglik);
98 void printOmegaForCodon(string outdir);
99 void printCumOmega();
100 void comparativeSignalScoring(list<OrthoExon> &orthoExonsList);
101 // Charlotte Janas playground
102 LocusTree *constructTree(); // creates, stores are returns the locus tree for the sequence tuple
103
104 // calculate a columnwise conservation score and output it (for each species) in wiggle format
105 void calcConsScore(list<OrthoExon> &orthoExonsList, vector<AnnoSequence*> const &seqRanges, string outdir);
106 double calcColumnScore(int a, int c, int t, int g); // input: number of a,c,t and g's in one alignment column
107 void consToWig(vector<double> &consScore, string outdir);
108
109 // static functions
110 static void setTree(PhyloTree *t){tree = t;}
111 static void setCodonEvo(CodonEvo *c){ codonevo = c; }
112 static void setCodonEvoDiscr(CodonEvoDiscr *c){ codonevodiscr = c; }
113 static int numSpecies(){ return tree->numSpecies(); }
114 static void openOutputFiles(string outdir);
115 static void closeOutputFiles();
116
117 // static data members
118 static int padding; // add this many bases to the region before and after the aligned region
119 static int orthoExonID; // ID for exons of different species which are orthologous to each other
120 static int geneRangeID; // stores an ID for the possible gene ranges of the different species which belong together
121 static vector<int> exonCandID; // IDs for exon candidates of different species
122 static unordered_map< bit_vector, PhyloTree*, boost::hash<bit_vector>> topologies;
123 // pointers to the output files
124 static vector< ofstream* > exonCands_outfiles, orthoExons_outfiles, geneRanges_outfiles_bed, geneRanges_outfiles_gff, omega_outfiles;
125 // map that stored all codon combinations on which fitch and pruning algorithm already have been run (for calculation of omega and number of substitutions)
126 static map<vector<string>, pair<vector<double>, int> > computedCumValues;
127
128 void printSingleOrthoExon(OrthoExon const &oe, bool files = true);
129 void collect_features(int species, list<OrthoExon> *hects, SpeciesGraph *speciesgraph);
130
140 void getAllOEMsas(int species, list<OrthoExon> *hects, unordered_map<string,int> *ref_class,
141 vector<AnnoSequence*> const &seqRanges);
142
152 StringAlignment getMsa(OrthoExon const &oe, vector<AnnoSequence*> const &seqRanges, size_t flanking = 0);
153private:
173 vector<string> getCodonAlignment(OrthoExon const &oe, vector<AnnoSequence*> const &seqRanges,
174 const vector<vector<fragment>::const_iterator > &froms,
175 map<unsigned, vector<int> > *alignedCodons = NULL,
176 bool generateString = true,
177 ofstream *codonAli = NULL);
197 vector<string> getCodonAlignment2(OrthoExon const &oe, vector<AnnoSequence*> const &seqRanges,
198 const vector<vector<fragment>::const_iterator > &froms);
199 void cutIncompleteCodons(OrthoExon &oe);
200 cumValues* findCumValues(bit_vector bv, vector<int> rfc);
201 static PhyloTree *tree;
202 LocusTree *ltree;
203 static CodonEvo *codonevo;
204 static CodonEvoDiscr *codonevodiscr;
205 vector<int> starts, ends; // gene ranges for each species
206 vector<int> offsets; // this many bases are upstream from the region
207 RandSeqAccess *rsa;
208 Alignment* alignment; // alignment of regions which possibly belong to a gene
209 vector< list<ExonCandidate*>* > exoncands; // exon candidates found in the different species in a gene segment
210 // store cumulative omega values for every reading frame combination and every bitvector that exist
211 unordered_map<bit_vector, vector<pair<vector<int>, cumValues> >, boost::hash<bit_vector> > cumOmega;
212 map<bit_vector, map<vector<int>, vector<double> > > codonOmega; // deprecated
213
214 #ifdef TESTING
215 /*
216 * added by Giovanna Migliorelli 14.05.2020
217 * code responsible for serialization
218 * aknowledgement : https://www.boost.org/doc/libs/1_70_0/libs/serialization/doc/tutorial.html
219 */
220
221 friend class boost::serialization::access;
222 template<class Archive>
223 void serialize(Archive & ar, const unsigned int version)
224 {
225 ar & starts;
226 ar & alignment;
227
228 }
229
230 /* added by Giovanna Migliorelli 14.05.2020
231 * default consttruct to allow for serialization
232 */
233
234 GeneMSA(){};
235
236 #endif
237};
238
239
240
241#endif // _GENEMSA
global multiple sequence alignment with efficiently stored long gaps.
Definition alignment.hh:160
class CodonEvoDiscr
Definition codonevodiscr.hh:35
class CodonEvo
Definition codonevo.hh:30
abstract base class for codon and exon evolution
Definition contTimeMC.hh:31
multiple sequence alignment of genomes for comparative gene prediction
Definition geneMSA.hh:37
void computeClamsaEff(list< OrthoExon > &orthoExonsList, vector< AnnoSequence * > const &seqRanges, PhyloTree *ctree, ofstream *codonAli)
Definition geneMSA.cc:1724
void createOrthoExons(list< OrthoExon > &orthoExonsList, map< int_fast64_t, list< pair< int, ExonCandidate * > > > &alignedECs, Evo *evo, float consThres=0.0, int minAvLen=0)
Definition geneMSA.cc:223
StringAlignment getMsa(OrthoExon const &oe, vector< AnnoSequence * > const &seqRanges, size_t flanking=0)
Definition geneMSA.cc:783
void getAllOEMsas(int species, list< OrthoExon > *hects, unordered_map< string, int > *ref_class, vector< AnnoSequence * > const &seqRanges)
Definition geneMSA.cc:738
Definition phylotree.hh:171
maintains orthologous exons for comparative gene prediction
Definition orthoexon.hh:36
orthologous graphs for comparative gene prediction
Definition orthograph.hh:34
Definition phylotree.hh:104
abstract class for quick access to an arbitrary sequence segment in genomes needed for comparative ge...
Definition randseqaccess.hh:62
builds a directed acyclic graph from a set of sampled genes.
Definition speciesgraph.hh:32
global multiple sequence alignment in (standard) string representation
Definition alignment.hh:351
Definition types.hh:903