Augustus 3.4.0
Loading...
Searching...
No Matches
genomicMSA.hh
1/*
2 * genomicMSA.hh
3 *
4 * License: Artistic License, see file LICENSE.TXT or
5 * https://opensource.org/licenses/artistic-license-1.0
6 */
7
8#ifndef _GENOMICMSA
9#define _GENOMICMSA
10
11#include "alignment.hh"
12#include "exoncand.hh"
13#include "geneMSA.hh"
14#include "randseqaccess.hh"
15#include <boost/graph/adjacency_list.hpp>
16#include <boost/graph/depth_first_search.hpp>
17
18#define NUMCOLNAMES 32 // number of entries in the colornames table below
19const string colornames[NUMCOLNAMES] = {"aquamarine", "darksalmon", "gainsboro", "gold", "cadetblue", "yellowgreen", // table of color-name strings; NOTE(review): presumably used to color graph output such as writeDot -- confirm
20 "crimson", "peru", "cadetblue1", "hotpink1", "lightcyan", "magenta", "mediumseagreen",
21 "mintcream", "olivedrab3", "violetred", "grey44", "peachpuff", "chartreuse3",
22 "aquamarine2", "darkorange1", "forestgreen", "gray66", "khaki", "olivedrab1",
23 "skyblue4", "maroon2", "grey40", "darkturquoise", "brown4", "seagreen1", "royalblue3"};
24
29struct AliNode { // per-vertex data bundled into AlignmentGraph (used as its vertex property type)
30 Alignment* a; // alignment attached to this node; nothing in this struct allocates or frees it
31 int id;
32 int weight; // NOTE(review): presumably the node weight computed by GenomicMSA::weight(a, sig) -- confirm
33 bool covered; // NOTE(review): presumably set once the node is part of an already chosen path -- confirm
34 int pred; // NOTE(review): presumably the predecessor node index used when tracing back a best path -- confirm
35 int topoIdx; // NOTE(review): presumably this node's position in AliGraph::topo -- confirm
36};
37
42struct AliEdge { // per-edge data of AlignmentGraph, chained behind the boost::edge_weight_t property
43 int weight; // NOTE(review): presumably the edge weight computed by GenomicMSA::weight(a, b, sig) -- confirm
44};
45
50struct AliGraph { // graph-wide data of AlignmentGraph (used as its graph property type)
51 vector<size_t> topo; // topological order
52 int maxWexceptCov; // NOTE(review): name suggests a maximum weight that excludes covered nodes -- confirm
53};
54
55// typedef for an AlignmentGraph: alignments with edges for possible neighborhood in the parental alignment
56typedef boost::adjacency_list<boost::setS, // disallow parallel edges
57 boost::vecS, // random access to nodes (deletion inefficient)
58 boost::bidirectionalS, // directed graph with access to both out-edges and in-edges
59 AliNode, // vertex property
60 boost::property < boost::edge_weight_t, int, AliEdge >, // edge property: int edge weight plus AliEdge
61 AliGraph> AlignmentGraph; // graph property
62
67class AliPath { // a simple path through an AlignmentGraph together with the signature it was built for
68public:
69 int maxSeqRange(AlignmentGraph &g); // defined elsewhere; takes the graph that the indices in path refer to
70 list<int> path; // the node indices on the simple path through the alignment graph
71 const MsaSignature *sig; // signature used to construct the graph (some paths differ only through sig)
72 set<string> ranges;
73 int weights; // NOTE(review): presumably the summed weight of the path -- confirm
74 friend ostream& operator<< (ostream& strm, const AliPath &p); // stream output of a path, defined elsewhere
75};
76
77typedef AlignmentGraph::vertex_descriptor vertex_descriptor; // shorthand for the vertex handle type of AlignmentGraph
78typedef AlignmentGraph::edge_descriptor edge_descriptor; // shorthand for the edge handle type of AlignmentGraph
79
84class dfs_time_visitor: public boost::default_dfs_visitor { // DFS visitor that writes each finished vertex into a caller-supplied array, filling it from the back
85 // typedef typename property_traits < size_t* >::value_type T;
86public:
87 dfs_time_visitor(size_t *fmap, size_t n) : m_ftimemap(fmap), t(n) { } // fmap: output array, n: start value of the write counter (assumes fmap has at least n slots -- TODO confirm)
88 template < typename Vertex, typename Graph >
89 void finish_vertex(Vertex u, const Graph & g) { m_ftimemap[--t] = u;} // decrement t, then store u there, so the first finished vertex lands at index n-1; g is unused
90 size_t *m_ftimemap; // output array; this class neither allocates nor frees it
91 size_t t; // one past the index of the next write
92};
93
94
95
96// use function overloading on this as the STL list cannot delete from normal and reverse iterators in the same way
97void eraseListRange(list<int> L, list<int>::reverse_iterator from, list<int>::reverse_iterator to); // NOTE(review): L is passed by value, so the callee works on a copy of the list while from/to come from the caller -- confirm against the definition that this is intended
98void eraseListRange(list<int> L, list<int>::iterator from, list<int>::iterator to); // NOTE(review): same by-value L as in the overload above -- confirm
99
106public:
107 GenomicMSA(RandSeqAccess *rsa_) : rsa(rsa_){} // stores the given pointer in rsa; no other member is initialized here
108 ~GenomicMSA(){} // empty: the Alignment pointers held in the alignment list are not deleted here -- NOTE(review): confirm who frees them
109
110 void readAlignment(string alignFilename); // reads a multiple species alignment from a *.maf file
111 void printAlignment(string outFname); // print alignment in .maf format, to stdout if outFname is empty string
112 int numAlignments() { return alignment.size(); } // number of entries in the alignment list (size_t converted to int)
113
114 GeneMSA *getNextGene(); // defined elsewhere; NOTE(review): ownership of the returned GeneMSA is not visible here -- confirm
119 vector<map<string,int>> chrLen; // NOTE(review): presumably one map per species from sequence name to its length -- confirm
124 void compactify(); // defined elsewhere (genomicMSA.cc)
129 void findGeneRanges(); // defined elsewhere (genomicMSA.cc)
133 void writeDot(AlignmentGraph const &g, string fname, MsaSignature const *superSig = NULL); // g is read-only here; superSig is optional and defaults to NULL
134
135 void project(AlignmentGraph &g, const MsaSignature *sig); // g is passed by non-const reference, sig is read-only
136 AliPath getBestConsensus(AlignmentGraph &g, const MsaSignature *sig, int &numNewCovered); // returns an AliPath by value; numNewCovered is a non-const reference (presumably an output -- confirm)
137 int findBestPath(AlignmentGraph &g);
141 //void chunkyFyPaths(vector<AliPath> &allPaths, AlignmentGraph &g);
145 bool prunePaths(vector<AliPath> &allPaths, AlignmentGraph &g); // allPaths and g are passed by non-const reference
146 template< class Iterator > // NOTE(review): Iterator is presumably list<int>::iterator or list<int>::reverse_iterator, matching the forward flag -- confirm
147 bool prunePathWrt2Other(AliPath &p, Iterator pstart, Iterator pend,
148 AliPath &other, Iterator ostart, Iterator oend,
149 AlignmentGraph &g, bool forward);
150 bool deletePathWrt2Other(AliPath &p, AliPath &other, AlignmentGraph &g);
151
152 static int weight(const Alignment *a, const MsaSignature *sig); // node weight when projecting a to sig
153 static int weight(const Alignment *a, const Alignment *b, const MsaSignature *sig); // edge weight (between a and b) after projection to sig
154
155 #ifdef TESTING
156 // temporarily added to work around some problem with boost string serialization
157 int seqID2seqIDarhiveConversion(int species, string seqID){return seqID2seqIDarhive[species][seqID];} // map operator[]: an unknown seqID is inserted with value 0; species is not range-checked
158 string seqIDarhive2seqIDConversion(int species, int seqIDarchive){return seqIDarhive2seqID[species][seqIDarchive];} // map operator[]: an unknown id is inserted with an empty string; species is not range-checked
159 void readNameDB(string dir);
160 Alignment* getNextAlignment();
161 #endif
162
163private:
164 list<Alignment*> alignment; // the alignments held by this object; its size is what numAlignments() reports
165 int numSpecies; // not set by the constructor
166 RandSeqAccess *rsa; // the actual data is managed in CompGenePred
167 map<string, MsaSignature> signatures; // MsaSignature objects keyed by a string
168 static int maxIntronLen; // static members: declared here, defined outside this header
169 static int minGeneLen;
170 static int maxGeneLen;
171
172 #ifdef TESTING
173 // temporarily added to work around some problem with boost string serialization
174 vector<map<string, int> > seqID2seqIDarhive; // indexed by species, then seqID string -> integer id (see seqID2seqIDarhiveConversion)
175 vector<map<int, string> > seqIDarhive2seqID; // indexed by species, then integer id -> seqID string (see seqIDarhive2seqIDConversion)
176 #endif
177};
178
179#endif // _GENOMICMSA
Definition genomicMSA.hh:67
global multiple sequence alignment with efficiently stored long gaps.
Definition alignment.hh:160
multiple sequence alignment of genomes for comparative gene prediction
Definition geneMSA.hh:37
multiple sequence alignment of genomes for comparative gene prediction
Definition genomicMSA.hh:105
void writeDot(AlignmentGraph const &g, string fname, MsaSignature const *superSig=NULL)
Definition genomicMSA.cc:1066
vector< map< string, int > > chrLen
Definition genomicMSA.hh:119
bool prunePaths(vector< AliPath > &allPaths, AlignmentGraph &g)
Definition genomicMSA.cc:693
void findGeneRanges()
Definition genomicMSA.cc:352
void compactify()
Definition genomicMSA.cc:317
Definition graph.hh:170
MsaSignature is a summary of the seqId/strand combinations of the alignment.
Definition alignment.hh:275
abstract class for quick access to an arbitrary sequence segment in genomes needed for comparative ge...
Definition randseqaccess.hh:62
Definition genomicMSA.hh:84
Definition genomicMSA.hh:42
Definition genomicMSA.hh:50
Definition genomicMSA.hh:29