Augustus 3.4.0
Loading...
Searching...
No Matches
pp_fastBlockSearcher.hh
1/*
2 * pp_fastBlockSearcher.hh
3 *
4 * License: Artistic License, see file LICENSE.TXT or
5 * https://opensource.org/licenses/artistic-license-1.0
6 *
7 * Author : Oliver Keller
8 * Project : Gene Prediction with Protein Family Patterns
9 *
10 * Description: A fast block search class to determine sequence parts
11 * relevant for the ppx extension
12 */
13
14#ifndef __PP_BLOCKSEARCHER_HH
15#define __PP_BLOCKSEARCHER_HH
16
17// project includes
18#include "pp_profile.hh"
19
20// standard C/C++ includes
21#include <deque>
22#include <vector>
23#include <map>
24#include <ostream>
25#include <iostream>
26#include <iomanip>
27#include <cstdlib>
28
29using namespace std;
30
31namespace PP {
32
33 /*
34 * FsSeedCollection: Contains, for each triplet of residues, a
35 * set of positions that this tuple is likely
36 * to appear as part of a block.
37 * The decision whether a triplet is considered
38 * a seed for a certain position depends on two
39 * parameters:
40 * - the maximum number of seeds (chosen
41 * as a subset of 8000 possible triplets) at a
42 * certain position, reversely proportional to
43 * the length of the block (so that the total
44 * number of seeds used for a block is independent
45 * of its length)
46 * - if, according to the profile, a high probability
47 * that a given triplet is a seed can be reached with
48 * a smaller number of seeds, this reduced number is
49 * used.
50 * Given this number, the most likely (i.e. highest
51 * scoring) triplets are taken into the collection of
52 * seeds.
53 *
54 * The use of a seed collection has the advantage that it can be calculated
55 * once and used for filtering during the sequence analysis just by looking
56 * at each 9-tuples of nucleotides in the sequence without calculating scores
57 * at each point. Furthermore, the decision whether a 9-tuple translates to
58 * a seed can be made simultaneously for all offsets of all blocks at once.
59 *
60 */
61
63// multimap<string, Position> seeds;
64 vector<int> vindex;
65 vector<Position> vseeds;
66
67 static int expSeedCount; // maximal number of seeds per block
68 static double maxCoverage; // position-specific coverage considered sufficient
69// multimap<string, Position>::iterator current; // used internally in getNext()
70 int current;
71 int currend;
72
73#ifdef DEBUG
74 static double blockP(const PP::Block& blk, string s, int from) {
75 double result = 1;
76 for (int i=0; i<s.length(); i++)
77 result *= blk[from+i][GeneticCode::get_aa_from_symbol(s[i])];
78 return result;
79 }
80 static double blockQ(const Block& blk, string s, int from) {
81 double result = 1;
82 for (int i=0; i<s.length(); i++)
83 result *= blk[from+i].Q(GeneticCode::get_aa_from_symbol(s[i])).doubleValue();
84 return result;
85 }
86#endif
87
88 static double blockP(const PP::Block& blk, int aas, int from) {
89
90 double result = 1;
91 for (int i=from+2; i>=from; i--) {
92 result *= blk[i][aas % 20];
93 aas /= 20;
94 }
95 return result;
96 }
97 static double blockQ(const Block& blk, int aas, int from) {
98 double result = 1;
99 for (int i=from+2; i>=from; i--) {
100 result *= blk[i].Q(aas % 20).doubleValue();
101 aas /= 20;
102 }
103 return result;
104 }
105
106 public:
107 // create a seed collection
108 FsSeedCollection(const PP::Profile& prfl) :
109 vindex(1,0), current(0), currend(0)
110 {
111 multimap<int, Position> seeds;
112 multimap<int, Position>::iterator it;
113
114
115 for (int b = 0; b<prfl.blockCount(); b++) {
116 const Block& blk = prfl[b];
117#ifdef VERBOSE_SEEDS
118 cerr << "Determining seeds for block " << b << " (" << blk.id << ", " << blk.size() << " columns).\n";
119#endif
120 int maxcount = expSeedCount / blk.size();
121
122 for (int i=0; i<=blk.size()-3; i++) {
123 multimap<double,int> currtriples;
124 string s(3, 0);
125 int val=0;
126 for (int t1 = 0; t1<NUM_AA; t1++) {
127 s[0] = GeneticCode::aa_symbols[t1];
128 for (int t2 = 0; t2<NUM_AA; t2++) {
129 s[1] = GeneticCode::aa_symbols[t2];
130 for (int t3 = 0; t3<NUM_AA; t3++) {
131 s[2] = GeneticCode::aa_symbols[t3];
132 double q_val = blockQ(blk, val, i);
133 currtriples.insert(make_pair(q_val, val));
134 val++;
135 }
136 }
137 }
138 int count = 0;
139 double p = 0.0;
140 map<double,int>::reverse_iterator it = currtriples.rbegin();
141 for (count = 0; count < maxcount && p < maxCoverage; count++) {
142 p += blockP(blk, it->second, i);
143 seeds.insert(make_pair(it->second, Position(b,i)));
144 ++it;
145 }
146#ifdef DEBUG
147 cerr << "[" << i << ".." << (i+2) << "]: coverage=" << p << " minBlockQ=" << (--it)->first << " count=" << count << endl;
148#endif
149 }
150 }
151 vseeds.reserve(seeds.size());
152 vindex.reserve(8001);
153 for (it=seeds.begin(); it != seeds.end(); ++it) {
154 while (vindex.size() <= it->first)
155 vindex.push_back(vseeds.size());
156 vseeds.push_back(it->second);
157 }
158 while (vindex.size()<=8000)
159 vindex.push_back(vseeds.size());
160 }
161
162 // return positions for which s is a seed (or null if there are none)
163 // the first
164 void setFirst(int patt) {
165 current = vindex[patt];
166 currend = vindex[patt+1];
167 }
168 bool hasNext() {
169 return current < currend;
170 }
171 // the next
172 PP::Position& getNext() {
173 return vseeds[current++];
174 }
175
176 int size() {
177 return vseeds.size();
178 }
179 }; // PP::FsSeedCollection
180
181 /*
182 * FsHitType
183 */
184 struct FsHitType;
185 typedef deque<FsHitType*> HitQueue;
186
187 struct FsHitType {
188 FsHitType(int p, int b, bool r, PartScoreType sc) :
189 pos(p),
190 head(this),
191 blockNo(b),
192 reverse(r),
193 score(sc.score),
194 blockfrom(sc.from),
195 blockto(sc.to),
196 pathScore(sc.score),
197 predecessor(0) {}
198 int pos;
199 int start() {
200 return head->pos;
201 }
202 FsHitType* head;
203 int blockNo;
204 bool reverse;
205 double score;
206 int blockfrom;
207 int blockto;
208 double pathScore;
209 FsHitType* predecessor;
210 void linkTo(HitQueue& queue) {
211 while(!queue.empty() && queue.front()->pos < pos - maxIntronLen)
212 queue.pop_front();
213 if (!queue.empty()) {
214 predecessor = queue.front();
215 head = predecessor->head;
216 pathScore = predecessor->pathScore - intronMalus*(pos - predecessor->pos) + score;
217 }
218 }
219 void pushOn(HitQueue& queue) {
220 while (!queue.empty()) {
221 FsHitType* ht = queue.back();
222 if (ht->pathScore < pathScore + intronMalus*(pos - ht->pos))
223 queue.pop_back();
224 else
225 break;
226 }
227 queue.push_back(this);
228 }
229 static int maxIntronLen;
230 static double intronMalus;
231 }; // PP::FsHitType
232
233 /*
234 * FsHitCollection
235 */
237 vector<HitQueue> pendingHits[2];
238 vector<FsHitType*> finalResult;
239 vector<FsHitType*> allHits;
240 int size;
241
242 public:
243 FsHitCollection(int n) : size(n) {
244 pendingHits[0].resize(n);
245 pendingHits[1].resize(n);
246 }
248 for (int i=0; i<allHits.size(); i++)
249 delete allHits[i];
250 }
251 int allHitCount() {
252 return allHits.size();
253 }
254 int resultCount() {
255 return finalResult.size();
256 }
257
258 void newHit(FsHitType*);
259 void storeBestResults(multimap<double, FsHitType>&, int mincount, double threshold);
260 }; // PP::FsHitCollection
261
262
263
264 /*
265 * PP::CandidateCollection: collects "candidates" for block hits
266 *
267 * Each substring of the input sequence is a potential candidate.
268 * The collection is initialized with the profile (sequence of blocks)
269 * and the seed collection (computed once in advance) containing the
270 * seeds (see above)
271 *
272 */
273
274 // this type represents the results for a particular part of the
275 // input sequence when translated and aligned to a block
276 // - how many residues of the translation are part of a seed
277 // (with respect to the corresponding block position)
278 // - which is the rightmost residue that is part of seed
280 ScoringCandidate() : inSeedCount(0), lastOffset(-10) {}
281 int inSeedCount;
282 int lastOffset;
283 };
284
285 struct twomer {
286 signed char v[2];
287 twomer() { v[0]=v[1]=-1; }
288 signed char& operator[] (int n) {
289 return v[n];
290 }
291 int val() {
292 return (v[0]<0 || v[1]<0) ? -1 : (v[0]*20 + v[1]);
293 }
294 };
295
296
298 typedef deque<ScoringCandidate> SeedCountsQueue;
299 // we have, for both strands and each block a queue
300 // of scoring candidates; the index for the queue is
301 // given by the distance of the block match start to
302 // the last observed nucleotide
303 vector<SeedCountsQueue> theCandidates[2];
304 FsSeedCollection& seedColl;
305 const Profile& prfl;
306
307 vector<int> maxseedcount; // maximum of inSeedCounts
308 vector<int> seedhitsizes; // sum of inSeedCounts
309 vector<int> candcounts; // for how many locations bestPartialLogScore is called
310 vector<int> realhitcounts; // ...and successful
311
312 // pending 2-mers of amino acids, one waiting for each frame/strand
313 deque<twomer> patterns;
314
315
317 seedColl(coll),
318 prfl(p),
319 maxseedcount(p.blockCount(), 0),
320 seedhitsizes(p.blockCount(), 0),
321 candcounts(p.blockCount(), 0),
322 realhitcounts(p.blockCount(), 0),
323 patterns(6, twomer())
324 {
325 // start the computation with a queue of emtpy scoring candidates
326 for (int b=0; b<p.blockCount(); b++)
327 theCandidates[0].push_back(SeedCountsQueue(p.blockSize(b) * 3 -8,
329 theCandidates[1] = theCandidates[0];
330 }
331
332 // put SeedsToEntries: given a 3-aminoacid-motif, add it to those scoring
333 // candidates for which it represents a seed at the corresponding position
334 void putSeedsToEntries(int currpat, bool reverse) {
335 seedColl.setFirst(currpat);
336 while (seedColl.hasNext()) {
337 PP::Position& pos = seedColl.getNext();
338 int index = (reverse ? prfl.blockSize(pos.b) - pos.i - 3 : pos.i) * 3;
339
340 ScoringCandidate& pf = theCandidates[reverse][pos.b][index];
341#ifdef DEBUG
342 if (pf.lastOffset >= 0 && (pf.lastOffset == pos.i || reverse == (pf.lastOffset < pos.i)))
343 cerr << "ERROR: last offset (" << pf.lastOffset << ") " << (reverse ? "<" : ">")
344 << " current offset (" << pos.i << ") !" << endl;
345#endif
346 int range = abs(pf.lastOffset-pos.i);
347 pf.inSeedCount += range < 3 ? range : 3;
348 pf.lastOffset = pos.i;
349 }
350 }
351
352 // pushSeeds: move forward the sequence: for each strand, the next pending
353 // pair of aminoacids is concatenated to the next residue found and putSeedsToEntries
354 // is called with the resulting triple; and new pairs are added to the pending queue
355 void pushSeeds(int t) {
356 signed char aa, aab;
357 twomer nextPattern = patterns.back();
358 patterns.pop_back();
359 twomer revPattern = patterns.back();
360 patterns.pop_back();
361 try {
362 aa = GeneticCode::map[Seq2Int(3)(PP::DNA::sequence + t-2)];
363 aab = GeneticCode::map[Seq2Int(3).rc(PP::DNA::sequence + t-2)];
364 int forval = nextPattern.val()*20;
365 int revval = revPattern.val();
366 if (aa >= 0 && forval >= 0)
367 putSeedsToEntries(forval + aa, false);
368 if (aab >= 0 && revval >= 0)
369 putSeedsToEntries(revval + 400*aab, true);
370 } catch (InvalidNucleotideError &e) {
371 aa = aab = -2;
372 }
373 nextPattern[0] = nextPattern[1];
374 nextPattern[1] = aa;
375 patterns.push_front(nextPattern);
376 revPattern[1] = revPattern[0];
377 revPattern[0] = aab;
378 patterns.push_front(revPattern);
379 }
380
381 // takeCandidates: move forward the candidate queues (one for each block and strand)
382 // if a scoring candidate is complete (fully evaluated) and its seed count
383 // is above a threshold (here: around 25% of the block width), call it a hit candidate,
384 // compute log scores for it, and if there is one non-negative, call it a hit, and
385 // add it to the FsHitCollection target
386 void takeCandidates(int t, FsHitCollection& target, bool reverse) {
387 for (int b=0; b < prfl.blockCount(); b++) {
388 theCandidates[reverse][b].push_front(ScoringCandidate());
389 int hc = theCandidates[reverse][b].back().inSeedCount;
390 theCandidates[reverse][b].pop_back();
391 seedhitsizes[b] += hc;
392 if (hc < 0)
393 cerr << "ERROR: seedhitcount < 0!\n";
394 if (maxseedcount[b] < hc)
395 maxseedcount[b] = hc;
396 if (hc > 4 + prfl.blockSize(b)/4) {
397 int blstart = t - prfl.blockSize(b) * 3;
398 if (blstart < 0)
399 continue;
400 candcounts[b]++;
402 prfl[b].bestPartialLogScore(reverse, blstart, pt);
403 int width = pt.to - pt.from;
404 if (pt.score >= 0 &&
405 width >= MIN_BLOCKSIZE &&
406 width >= 0.3*prfl.blockSize(b)) {
407 FsHitType* newHit = new FsHitType(blstart, b, reverse, pt);
408 target.newHit(newHit);
409 realhitcounts[b]++;
410 }
411 }
412 }
413 }
414
415 // this is the entry point to the candidate collection
416 // calls takeCandidates and pushSeeds
417 void proceed(int t, FsHitCollection& target) {
418 takeCandidates(t, target, false);
419 takeCandidates(t, target, true);
420 pushSeeds(t);
421 }
422
423 void debugging_output() {
424 for (int b=0; b<prfl.blockCount(); b++) {
425 cerr << "Block " << b << " (" << prfl[b].id << ")" << endl;
426 cerr << " len: " << prfl.blockSize(b) << endl;
427 cerr << " av c: " << double(seedhitsizes[b])/DNA::len;
428 cerr << " max c: " << maxseedcount[b] << endl;
429 cerr << " % c>t: " << fixed << setw(4) << setprecision(2) << double(candcounts[b]*100)/DNA::len << endl;
430 }
431 }
432 }; // class PP::CandidateCollection
433} // namespace PP
434#endif // __PP_BLOCKSEARCHER_HH
Definition types.hh:480
Definition pp_profile.hh:412
Definition pp_fastBlockSearcher.hh:236
Definition pp_fastBlockSearcher.hh:62
Definition pp_profile.hh:665
a class for converting sequence into integer replacing Base4Int
Definition geneticcode.hh:163
Definition pp_fastBlockSearcher.hh:297
Definition pp_fastBlockSearcher.hh:187
Definition pp_profile.hh:407
Definition pp_profile.hh:346
Definition pp_fastBlockSearcher.hh:279
Definition pp_fastBlockSearcher.hh:285