Augustus 3.4.0
Loading...
Searching...
No Matches
liftover.hh
1/*
2 * liftover.hh
3 *
4 * License: Artistic License, see file LICENSE.TXT or
5 * https://opensource.org/licenses/artistic-license-1.0
6 */
7
8#ifndef _LIFTOVER_HH
9#define _LIFTOVER_HH
10
11// project includes
12#include "alignment.hh"
13
27
28public:
29 SeqIntKey(int_fast64_t start, int_fast64_t len, int_fast64_t addInfo){
30 key = (start << 22) // 42 bits
31 + (len << 7 ) // 15 bits
32 + (addInfo); // 7 bits
33 }
34 SeqIntKey(int_fast64_t k) : key(k) {}
35 ~SeqIntKey() {}
36 inline int_fast64_t getStart() const {return key >> 22;}
37 inline int_fast64_t getLen() const {return ((key>>7) & 0x7FFF);} // 0x7FFF bit mask for retrieving bits 1-15
38 inline int_fast64_t getAddInfo() const {return key & 0x7F;} // 0x7F bit mask for retrieving bits 1-7
39 inline int_fast64_t getKey() const {return key;}
40
41private:
42 int_fast64_t key;
43};
44
60class LiftOver {
61
62private:
63 Alignment* alignment;
64 vector<int> &offsets;
65
66public:
67 LiftOver(Alignment* a, vector<int> &o) : alignment(a), offsets(o) {}
68 ~LiftOver() {}
69
70 /*
71 * mapping sequence intervals from positions in the genome to positions in the alignment
72 * input: a vector of hashes (keys encode start and length of the SIs in the genome, value are pointers to the SIs)
73 * ( i-th position in the vector points to the hash of SIs of species i)
74 * output: a hash that stores for each sequence interval T in alignment space (key encodes start and length of the SI in the alignment)
75 * a list of (speciesIdx, T*), e.g.
76 * alignedSIs["100:200"] = {(0, si0), (3, si3)}
77 */
78 template <typename T> void projectToAli(vector< map<int_fast64_t,T*> > &seqints, // input
79 map<int_fast64_t, list<pair<int,T*> > > &alignedSIs) { // output
80
81 int k = alignment->rows.size();
82 int_fast64_t aliStart, aliEnd;
83 int chrStart, chrEnd;
84
85 typename map<int_fast64_t, list<pair<int,T*> > >::iterator asi;
86 /*
87 * map all SIs from genome coordinates to alignment coordinates where possible
88 * this search in LINEAR in the length of all sequence intervals
89 * + the number of all alignment fragments
90 */
91 for (size_t s=0; s<k; s++){
92 if (alignment->rows[s] == NULL)
93 continue;
94 int offset = offsets[s];
95 AlignmentRow *row = alignment->rows[s];
96 vector<fragment>::const_iterator from = row->frags.begin();
97 for(typename map<int_fast64_t,T*>::iterator siit = seqints[s].begin(); siit != seqints[s].end(); ++siit){
98
99 // go the the first fragment that may contain the SI start
100 while (from != row->frags.end() && from->chrPos + from->len - 1 < siit->second->getStart() + offset)
101 ++from;
102 if (from == row->frags.end())
103 break; // have searched beyond the last alignment fragment => finished
104 SeqIntKey chrKey(siit->first); // key encodes SI start position and length in the genome
105 chrStart = chrKey.getStart() + offset;
106 aliStart = row->getAliPos(chrStart, from);
107 if (aliStart >= 0){ // left sequence boundary mappable
108 chrEnd = chrKey.getLen() + chrStart;
109 aliEnd = row->getAliPos(chrEnd, from);
110 if (aliEnd >= 0){
111 // both sequence boundaries were mappable
112 // store the SI in the hash
113 SeqIntKey aliKey(aliStart,aliEnd-aliStart,chrKey.getAddInfo());
114 asi = alignedSIs.find(aliKey.getKey()); // key encodes SI start position and length in the alignment
115 if (asi == alignedSIs.end()){ // insert new list
116 list<pair<int,T*> > si;
117 si.push_back(pair<int,T*> (s, siit->second));
118 alignedSIs.insert(pair<int_fast64_t,list<pair<int,T*> > >(aliKey.getKey(), si));
119 } else {// append new entry to existing list
120 asi->second.push_back(pair<int,T*> (s, siit->second));
121 }
122 }
123 }
124 }
125 }
126 }
127 /*
128 * mapping sequence intervals from positions in the alignment to positions in the genomes
129 * input: a hash that stores for each sequence interval T in alignment space a list of (speciesIdx, T*)
130 * a vector of AnnoSeqs (necessary to verify whether an si is mappable to the genome)
131 * a vector of hashes of SIs.
132 *
133 * version 1) 'insertMissingSIs'=true
134 *
135 * - tuple (i,*si) is inserted both in alignment space ('aligendSIs') and in genome space ('seqInts').
136 * if an SI is mappable to species i (e.g. both boundaries are aligned to i and boundary signals are present).
137 * - tuple (i,NULL) is inserted in alignment space, if an SI is not mappable, but aligned to species i
138 * - (e.g. both boundaries are aligned to i, but a boundary signal is missing in i)
139 *
140 * version 2) 'insertMissingSIs'=false
141 *
142 * - no new tuples (i,*si) are created
143 * - tuple (i,NULL) is inserted in alignment space, if an SI is aligned to species i (e.g. both boundaries are aligned to i,
144 * boundary signal can be present or not.)
145 *
146 */
147 template <typename T> void projectToGenome(map<int_fast64_t, list<pair<int,T*> > > &alignedSIs, vector<AnnoSequence*> const &seqRanges,
148 vector< map<int_fast64_t,T*> > &seqints, bool insertMissingSIs=true) {
149
150 int k = alignment->rows.size();
151 vector<vector<fragment>::const_iterator > fragsit(k);
152
153 // move fragment iterators to the start
154 for (size_t s=0; s < k; s++){
155 if (alignment->rows[s])
156 fragsit[s] = alignment->rows[s]->frags.begin();
157 }
158
159 typename map<int_fast64_t, list<pair<int,T*> > >::iterator asi;
160 /*
161 * map all SIs from alignment coordinates to genome coordinates where possible
162 * this search in LINEAR in the length of all sequence intervals
163 * + the number of all alignment fragments
164 */
165 for(asi = alignedSIs.begin(); asi != alignedSIs.end(); asi++){
166 SeqIntKey aliKey(asi->first); // key encodes SI start position and length in the alignment
167 int_fast64_t aliStart = aliKey.getStart();
168 int_fast64_t aliEnd = aliKey.getLen() + aliStart;
169
170 typename list<pair<int,T*> >::iterator siit = asi->second.begin();
171
172 for(size_t s=0; s<k; s++){
173 if(siit != asi->second.end() && siit->first == s){ // SI is already in the hash
174 ++siit;
175 continue;
176 }
177 if (alignment->rows[s] == NULL)
178 continue;
179 AlignmentRow *row = alignment->rows[s];
180 // go the the first fragment that may contain aliStart
181 while (fragsit[s] != row->frags.end() && fragsit[s]->aliPos + fragsit[s]->len - 1 < aliStart){
182 ++fragsit[s];
183 }
184 if (fragsit[s] == row->frags.end()){
185 continue; // searched beyond the last fragment
186 }
187 int_fast64_t chrStart = row->getChrPos(aliStart,fragsit[s]) - offsets[s];
188 int_fast64_t chrEnd = row->getChrPos(aliEnd, fragsit[s]) - offsets[s];
189
190 if (chrStart >= 0 && chrEnd >= 0){ // both sequence boundaries are aligned to s
191 T* si = NULL;
192 if(insertMissingSIs){ // check if SI exists in species s (e.g. check if boundary signals are present)
193 SeqIntKey chrKey(chrStart,chrEnd-chrStart,aliKey.getAddInfo()); // key encodes SI start position and length in the genome
194 si = create(chrKey.getKey(), seqRanges[s]->sequence, seqRanges[s]->length);
195 if(si){
196 // insert mapped SI in genome space
197 seqints[s].insert(pair<int_fast64_t,T*>(chrKey.getKey(), si));
198 }
199 }
200 // insert mapped/aligned SI in alignment space
201 asi->second.insert(siit,pair<int,T*> (s, si));
202 }
203 }
204 }
205 }
206};
207
208#endif // _LIFTOVER_HH
global multiple sequence alignment
Definition alignment.hh:61
int getAliPos(int chrPos, vector< fragment >::const_iterator from)
Definition alignment.cc:101
global multiple sequence alignment with efficiently stored long gaps.
Definition alignment.hh:160
Mapping of sequence intervals from positions in the genome to positions in the alignment and vice ver...
Definition liftover.hh:60
SeqIntKey encodes the start position and length of a sequence interval in a 64 bit integer.
Definition liftover.hh:26