Augustus 3.4.0
Loading...
Searching...
No Matches
alignment.hh
1/*
2 * alignment.hh
3 *
4 * License: Artistic License, see file LICENSE.TXT or
5 * https://opensource.org/licenses/artistic-license-1.0
6 */
7
8#ifndef _ALIGNMENT
9#define _ALIGNMENT
10
11#include "types.hh"
12
13#include <vector>
14#include <list>
15#include <iostream>
16#include <exception>
17#include <climits>
18
19class MsaSignature; // forward declaration
20
26class fragment {
27public:
28 fragment(int chrPos, int aliPos, int len) : chrPos(chrPos), aliPos(aliPos), len(len) {}
29 fragment() {}
30 int chrEnd() const { return chrPos + len - 1; }
31 int chrPos; // chromosomal start position of fragment, 0-based
32 int aliPos; // start position of fragment in alignment, 0-based
33 int len; // fragment length
34
35 /*
36 * added by Giovanna Migliorelli 14.05.2020
37 * code responsible for serialization
38 * aknowledgement : https://www.boost.org/doc/libs/1_70_0/libs/serialization/doc/tutorial.html
39 */
40
41 #ifdef TESTING
42 friend class boost::serialization::access;
43 template<class Archive>
44 void serialize(Archive & ar, const unsigned int version)
45 {
46 ar & chrPos;
47 ar & aliPos;
48 ar & len;
49 }
50 #endif
51};
52
62public:
63 AlignmentRow() : cumFragLen(0) {}
64 AlignmentRow(string seqID, int chrPos, Strand strand, string rowbuffer);
66
67 int chrStart() const;
68 int chrEnd() const;
69 int aliEnd() const;
70 int getSeqLen() const { return chrEnd() - chrStart() + 1; }
71 int getCumFragLen() const { return cumFragLen; }
72 void setCumFragLen(int len) { cumFragLen = len;} // use with care to ensure consistency
73 int gapLenAfterFrag(size_t i) const {
74 if (i+1 >= frags.size())
75 return 0;
76 return frags[i+1].chrPos - frags[i].chrPos - frags[i].len;
77 }
78 void addFragment(int chrPos, int aliPos, int len);
79 void addFragment(fragment &f) { addFragment(f.chrPos, f.aliPos, f.len); }
80 string getSignature() const {return seqID + ((strand == minusstrand)? "-" : "+");}
81 void pack();
82 friend ostream& operator<< (ostream& strm, const AlignmentRow &row);
83 friend void appendRow(AlignmentRow **r1, const AlignmentRow *r2, int aliLen1, string sigstr);
84
90 int getAliPos(int chrPos, vector<fragment>::const_iterator from);
91 int getAliPos(int chrPos, vector<fragment>::const_iterator *from); // variant from Patrick Balmerth
92
93 int getAliPos(int chrPos) { return getAliPos(chrPos, frags.begin()); }
94
95 // convert from alignment to chromosomal position (inverse function of getAliPos())
96 int getChrPos(int aliPos, vector<fragment>::const_iterator from);
97 int getChrPos(int aliPos) { return getChrPos(aliPos, frags.begin()); }
98
99 // data members
100 string seqID; // e.g. chr21
101
102 #ifdef TESTING
103 int seqIDarchive; // GM temporarily added to come around some problem with string serialization
104 int chrLen; // GM added to properly set rsa chr lengths during deserialization
105 #endif
106
107 Strand strand;
108 vector<fragment> frags; // fragments are sorted by alignment positions AND by chromosomal positions (assumption for now)
109private:
110 int cumFragLen; // total length of all fragments, becomes a nonredundant attribute after merging of alignments
111
112 /*
113 * added by Giovanna Migliorelli 14.05.2020
114 * code responsible for serialization
115 * Acknowledegment : https://www.boost.org/doc/libs/1_70_0/libs/serialization/doc/tutorial.html
116 */
117
118 #ifdef TESTING
119 friend class boost::serialization::access;
120 template<class Archive>
121 void serialize(Archive & ar, const unsigned int version)
122 {
123 ar & cumFragLen;
124 ar & seqIDarchive;
125 ar & strand;
126 ar & frags;
127 ar & chrLen;
128 }
129 #endif
130};
131
132
133
161public:
162 Alignment(size_t k) : aliLen(0), rows(k, NULL) {} // initialize with NULL, which stand for missing AlignmentRows
163 ~Alignment(){
164 for (unsigned i=0; i<rows.size(); i++)
165 delete rows[i];
166 }
167 friend bool mergeable (Alignment *a1, Alignment *a2, int maxGapLen, float mergeableFrac, bool strong);
168 friend ostream& operator<< (ostream& strm, const Alignment &a);
169 void printTextGraph(ostream& strm);
170 void merge(Alignment *other, const MsaSignature *sig = NULL); // append 'other' Alignment to this
171 friend Alignment* mergeAliList(list<Alignment*> alis, const MsaSignature *sig);
172 friend void capAliSize(list<Alignment*> &alis, int maxRange);
173 friend void reduceOvlpRanges(list<Alignment*> &alis, size_t maxCov, float covPen);
174 int maxRange(); // chromosomal range, maximized over rows
175 int numRows() const { return rows.size(); }
176 int numFilledRows() const; // number of nonempty rows
177 int getCumFragLen() const; // total length of all fragments
178 int getMaxSeqIdLen() const;
179 friend int medianChrStartEndDiff(Alignment *a1, Alignment *a2);
180 string getSignature() const;
181 int numFitting(const MsaSignature *sig) const;
182 void shiftAliPositions(int offset);
183 void pack(); // merge pairs of fragments without gap into one fragment making the alignment representation more compact
184
185 #ifdef TESTING
186 void convertAlignment(int r, int start, int end);
187 #endif
188
189public: // should rather be private
190 int aliLen; // all aligned sequences are this long when gaps are included
191 vector<AlignmentRow*> rows;
192
193private:
194 /*
195 * added by Giovanna Migliorelli 14.05.2020
196 * code responsible for serialization and default constructor required in serialization
197 * aknowledgement : https://www.boost.org/doc/libs/1_70_0/libs/serialization/doc/tutorial.html
198 */
199
200 #ifdef TESTING
201 friend class boost::serialization::access;
202 template<class Archive>
203 void serialize(Archive & ar, const unsigned int version)
204 {
205 ar & aliLen;
206 ar & rows;
207 }
208
209 Alignment(){};
210 #endif
211};
212
213int medianChrStartEndDiff(Alignment *a1, Alignment *a2);
214
215// sorting operator, with respect to a given species index
217 SortCriterion(size_t speciesIdx) : s(speciesIdx) {};
218 bool operator() (Alignment* const a1, Alignment* const a2){
219 // alignments, where row s is missing come last
220 if (a2->rows[s] == NULL)
221 return true;
222 if (a1->rows[s] == NULL)
223 return false;
224 if (a1->rows[s]->seqID < a2->rows[s]->seqID)
225 return true;
226 if (a1->rows[s]->seqID > a2->rows[s]->seqID)
227 return false;
228 // same sequence, compare chromosomal start positions
229 return (a1->rows[s]->chrStart() < a2->rows[s]->chrStart());
230 }
231 size_t s;
232};
233
234
235// sorting operator, with respect to a given species index, index version
236// for sorting a vector or list of indices to another vector that holds the Alignments
238 IdxSortCriterion(vector<Alignment*> const &a_, size_t speciesIdx) : s(speciesIdx), a(a_) {};
239 bool operator() (int i, int j){
240 // alignments, where row s is missing come last
241 if (a[j]->rows[s] == NULL && a[i]->rows[s]) // this conjunction makes sorting stable where the sequence is missing
242 return true;
243 if (a[i]->rows[s] == NULL)
244 return false;
245 if (a[i]->rows[s]->seqID < a[j]->rows[s]->seqID)
246 return true;
247 if (a[i]->rows[s]->seqID > a[j]->rows[s]->seqID)
248 return false;
249 // same sequence, compare chromosomal start positions
250 return (a[i]->rows[s]->chrStart() < a[j]->rows[s]->chrStart());
251 }
252 size_t s;
253 vector<Alignment*> const &a;
254};
255
256/*
257 * b1 and b2 can be merged in that order because they are very similar and right next to each other.
258 * In at least 'mergeableFrac' of the alignment rows the aligned sequenes are
259 * strong=false: - refer to the same terget sequence, are on the same strand and satisfy 0 <= gaplen <= maxGapLen
260 * if both are present
261 * strong=true: - refer to the same terget sequence, are on the same strand and satisfy 0 <= gaplen <= maxGapLen
262 * if at least one is present
263 */
264bool mergeable (Alignment *b1, Alignment *b2, int maxGapLen, float mergeableFrac, bool strong);
265
266inline bool isGap(char c){
267 return (c == '-' || c == '.');
268}
269
276public:
277 string sigstr() const{
278 string str;
279 for (unsigned s = 0; s < sigrows.size(); ++s)
280 if (sigrows[s] != "")
281 str += itoa(s) + ":" + sigrows[s];
282 return str;
283 }
284 vector<string> sigrows; // each row contains seqID and strand, e.g. chr21+
285 int numAli;
286 int sumAliLen;
287 int sumCumFragLen;
288 int depth;
289 int color;
290 bool operator< (const MsaSignature &other) const {
291 return (sumCumFragLen > other.sumCumFragLen);
292 // before: sorting by 1) number of species and 2) sumAliLen
293 // return (depth > other.depth || (depth == other.depth && sumAliLen > other.sumAliLen));
294 }
295 bool fits(const Alignment &a, size_t s) const {
296 return a.rows[s] && (sigrows[s] == a.rows[s]->seqID + strandChar(a.rows[s]->strand));
297 }
298 static int maxSigStrLen;
299};
300
301bool cmpSigPtr(const MsaSignature *s1, const MsaSignature *s2);
302
303typedef pair<size_t, int> BoundaryFragment; // (s, chrPos), where s= species index
304
310public:
311 bool operator()(BoundaryFragment& bf1, BoundaryFragment& bf2) {
312 return bf1.second > bf2.second; // => sort by increasing chromosomal position
313 }
314};
315
316
321public:
322 MsaInsertion(size_t s, size_t insertpos, string insert): s(s), insertpos(insertpos), insert(insert){ }
323 MsaInsertion() { }
328 bool operator<(const MsaInsertion& other) const {
329 if (insertpos > other.insertpos)
330 return true;
331 if (insertpos < other.insertpos)
332 return false;
333 if (insert.length() > other.insert.length())
334 return true;
335 if (insert.length() < other.insert.length())
336 return false;
337 if (s < other.s)
338 return true;
339 return false;
340 }
341 size_t s;
342 size_t insertpos;
343 string insert;
344};
345
346
352public:
353 StringAlignment(size_t numrows) : rows(numrows, ""), k(numrows), len(0) {} // initialize with empty strings
354 ~StringAlignment(){len = 0;}
355
367 void insert(std::list<MsaInsertion> &insList, int maxInsertLen = INT_MAX);
368
379 size_t removeGapOnlyCols();
380
381 bool isGapOnlyCol(size_t col){
382 for (size_t s = 0; s < k; ++s)
383 if (!rows[s].empty() && rows[s].at(col) != '-')
384 return false;
385 return true;
386 }
387
392 size_t m = 0, rowlen;
393 for (size_t s = 0; s < k; ++s){
394 rowlen = rows[s].length();
395 if (rowlen > 0){
396 if (m == 0)
397 m = rowlen;
398 else if (m != rowlen)
399 throw length_error("StringAlignment with rows of differing lengths");
400 }
401 }
402 len = m;
403 }
404
405 friend ostream& operator<< (ostream& strm, const StringAlignment &msa);
406
407// data members
408 vector<string> rows;
409 size_t k;
410 size_t len;
411};
412
413
414#endif // _ALIGNMENT
global multiple sequence alignment
Definition alignment.hh:61
int getAliPos(int chrPos, vector< fragment >::const_iterator from)
Definition alignment.cc:101
friend void appendRow(AlignmentRow **r1, const AlignmentRow *r2, int aliLen1, string sigstr)
Definition alignment.cc:147
global multiple sequence alignment with efficiently stored long gaps.
Definition alignment.hh:160
friend void capAliSize(list< Alignment * > &alis, int maxRange)
Definition alignment.cc:348
Definition alignment.hh:309
Definition alignment.hh:320
bool operator<(const MsaInsertion &other) const
Definition alignment.hh:328
size_t insertpos
index of species
Definition alignment.hh:342
string insert
at which position to insert
Definition alignment.hh:343
MsaSignature is a summary of the seqId/strand combinations of the alignment.
Definition alignment.hh:275
global multiple sequence alignment in (standard) string representation
Definition alignment.hh:351
void insert(std::list< MsaInsertion > &insList, int maxInsertLen=INT_MAX)
Definition alignment.cc:924
void computeLen()
Definition alignment.hh:391
size_t removeGapOnlyCols()
Definition alignment.cc:954
gapless alignment fragment
Definition alignment.hh:26
Definition alignment.hh:237
Definition alignment.hh:216