Augustus 3.4.0
Loading...
Searching...
No Matches
pp_scoring.hh
1/*
2 * pp_scoring.hh
3 *
4 * License: Artistic License, see file LICENSE.TXT or
5 * https://opensource.org/licenses/artistic-license-1.0
6 *
7 * Project : Gene Prediction with Protein Family Patterns
8 * Description: Classes that perform the evaluation of ProteinProfile
9 * emission probabilities
10 */
11
12#ifndef __PP_SCORING_HH
13#define __PP_SCORING_HH
14
15// project includes
16#include "vitmatrix.hh"
17#include "pp_hitseq.hh"
18#ifdef DEBUG
19#include "matrix.hh"
20extern int globalThreshUsedCount;
21#endif
22
23
24namespace PP {
25 /*
26 * storing maximal bonusProbs for various right sides
27 * a bonusProb is the predProb (predecessor viterbi * transEmiProb)
28 * multiplied with the bonus factor of a hit sequence if any
29 */
30
32 Double value; // current maximal bonusProb for the blockpos
33 Double prefixFactor; // the factor for the block part [0..i]
34 int blockpos; // the corresponding i for the right side
35 ViterbiSubmapType* argmaxMap;
36 SubstateId argmaxId;
37 BonusProbEntry(int i, Double factor) :
38 value(0),
39 prefixFactor(factor), blockpos(i),
40 argmaxMap(0), argmaxId() {}
41#ifdef DEBUG
42 bool operator==(const BonusProbEntry& other) const {
43 return value==other.value
44 && blockpos==other.blockpos
45 && prefixFactor==other.prefixFactor;
46 }
47#endif
48 Double fullProb() const {
49 return value * prefixFactor;
50 }
51 void set(const Double& p, ViterbiSubmapType* source, SubstateId leftPos) {
52 value = p;
53 argmaxMap = source;
54 argmaxId = leftPos;
55 }
56 }; // struct PP::BonusProbEntry
57
58
60 // BonusProbType(const BlockScoreType&, int base, int offset, bool complement);
61#ifdef DEBUG
62 string debug_info();
63#endif
64 void push_back(int i, const Double& factor) {
65 while (indexTable.size() <= i)
66 indexTable.push_back(v.size());
67 BonusProbEntry newval(i, factor);
68 v.push_back(newval);
69 }
70 void init() {
71 }
72 Double& value(int j) {
73 return v[j].value;
74 }
75 const Double& value(int j) const {
76 return v[j].value;
77 }
78 void update(const Double& p, int begin, int end,
79 ViterbiSubmapType* source, SubstateId leftId);
80
81 void mergeIntoSubmap(int blockindex,
82 ViterbiSubmapType& target,
83 SubstateIdMap* );
84
85 // the BonusProbEntry, one for each valid right side
86 // v[j].blockpos is the corresponding i
87 vector<BonusProbEntry> v;
88
89 // for each block pos i, j = indexTable[i] is:
90 // - the index of the BonusProbEntry having blockpos=i, or
91 // - the index of the first BonusProbEntry having blockpos>i.
92 // if neither exists, indexTable has size <= i
93 vector<int> indexTable;
94#ifdef DEBUG
95 bool operator==(const BonusProbType& other) const {
96 return v==other.v && indexTable==other.indexTable;
97 }
98#endif
99 }; // struct PP::BonusProbType
100
101
102 struct Match {
103 Match (string id, int blno, int f, int l, Double d, int off, bool comp) :
104 blockId(id), blockno(blno), firstBase(f), lastBase(l), score(d),
105 offset(off), complement(comp) {}
106 // shift is implemented as a function _object_
107 // since we need it in for_each
108 struct shift {
109 shift(int d) : offset(d) {}
110 void operator() (Match& m) const {
111 m.firstBase += offset;
112 m.lastBase += offset;
113 }
114 int offset;
115 };
116 int firstCodon() const { // numbering starts with 0
117 return offset/3;
118 }
119 int lastCodon() const { // equals firstCodon of next match if codon is split
120 return (offset + lastBase - firstBase)/3;
121 }
122 int phase() const {
123 return mod3(-offset);
124 }
125
126 string blockId;
127 int blockno;
128 int firstBase;
129 int lastBase;
130 Double score;
131 int offset;
132 bool complement;
133 };
134
135
138 void initScores() {
139 for (int strand=0; strand<2; strand++) {
140 blockScores[strand].clear();
141 for (int b=0; b<blockCount(); b++)
142 blockScores[strand].push_back(BlockScoreType( getBlock(b,strand) ));
143 }
144 }
145 void initHitSequences() {
146 // initialize hit sequence collections
147 for (int f=0; f<3; f++) {
148 hitSeqColl[0][f].reset(blockCount());
149 hitSeqColl[1][f].reset(blockCount());
150 }
151 }
152 void advanceScores(int base);
153 void scoreAssOrRDss(ViterbiSubmapType& submap, bool complement, int firstBase, const Double& maxProb);
154 void clearLowScoring(ViterbiSubmapType& submap);
155 void clearLowScoringSubstates(ViterbiColumnType& vit) {
156 vit.foreachSubstate(this, &SubstateModel::clearLowScoring);
157 }
158 double getIntronBonus(int b, int i, int frame, bool comp) {
159 const Block& blk = getBlock(b, comp);
160 if (!blk.hasIP())
161 return 1.0;
162 double freq;
163 if (comp) {
164 if (frame)
165 freq = blk.getIntronFreq(blk.size()-i-1, 3-frame);
166 else
167 freq = blk.getIntronFreq(blk.size()-i, 0);
168 } else
169 freq = blk.getIntronFreq(i, frame);
170 freq *= fullIntronBonus;
171 return freq > minIntronFactor ? freq : minIntronFactor;
172 }
173 Profile& prfl;
174 int maxBlockSize;
175 vector<BlockScoreType> blockScores[2]; //index of blockScores is b, not blockNo!!!
176 HitSequenceCollection hitSeqColl[2][3];
177 list<Match> currentResult;
178 bool allow_truncated; // allow right-truncated block-scoring exons
179 bool exhaustive_substates; // do not use a relaxation heuristic to clear substates
180 // see SubstateModel::clearLowScoring
181
182 /*---- methods evaluating positions in the profile ----*/
183
184 // the global nucleotide coordinate of a position
185 // static int getId(int b, int i) {
186 // return Position(b,i).id();
187 // }
188 // SubstateId toCopyId(SubstateId id) const {
189 // return SubstateId(id.slot + MAX_BLOCKCOUNT);
190 // }
191 // int fromCopyId(int id) const {
192 // return id + copyOffset;
193 // }
194 // int getCopyId(Position ppos) const {
195 // return toCopyId(ppos.id());
196 // }
197 // int getCopyId(int b, int i) const {
198 // return toCopyId(getId(b,i));
199 // }
200
201// Position getPosOfId(int id) const {
202// Position result = Position::fromId(id);
203// #ifdef DEBUG
204// if (result.b > blockCount())
205// cout << "getB() > #blocks\n";
206// #endif
207// return result;
208// }
209// Position getPosOfCopyId(int id) const {
210// return getPosOfId(fromCopyId(id));
211// }
212
213#ifdef DEBUG
214 void checkInRange(Position pos, bool complement) {
215 if (pos.b > blockCount() || pos.b < 0 ||
216 (pos.i>0 && pos.i > blockSize(pos.b, complement)))
217 throw ProjectError("block position out of range");
218 }
219#endif
220 int blockCount() const {
221 return prfl.blockCount();
222 }
223 int blockNoOfB(int b, bool comp) const {
224 if (b == blockCount())
225 throw ProjectError("Invalid block no. in SubstateModel::blockNoOfB");
226 return comp ? blockCount() - 1 - b : b;
227 }
228 int iBlockOfB(int b, bool comp) const {
229 return comp ? blockCount() - b : b;
230 }
231 DistanceType interBlockDist(int b, bool comp) const {
232 return prfl.interBlockDist(iBlockOfB(b, comp));
233 }
234 int blockSize(int b, bool comp) const {
235 return b == blockCount() ? 0 : prfl.blockSize(blockNoOfB(b, comp));
236 }
237 const Block& getBlock(int b, bool comp) const {
238 return prfl[blockNoOfB(b, comp)];
239 }
240
241 void addHitSeqMatch(const HitSequence* hs, bool comp) {
242 if (hs)
243 for (int b = hs->last(); b >= hs->first(); b--) {
244 int blockno = blockNoOfB(b, comp);
245 int firstBase = (*hs)[b];
246 int lastBase = firstBase + prfl.blockSize(blockno)*3 - 1;
247 Double score = blockScores[comp][b].savedScore(firstBase).getRoot(prfl.blockSize(blockno));
248 currentResult.push_front(Match(prfl[blockno].id, blockno, firstBase, lastBase, score, 0, comp));
249 }
250 }
251 void addPrefixMatch(Position ppos,
252 int endOfLastCodon, int endOfBioExon,
253 bool comp) {
254 int blockno = blockNoOfB(ppos.b, comp);
255#ifdef DEBUG
256 checkInRange(ppos, comp);
257#endif
258 int startOfMatch = endOfLastCodon - 3*ppos.i + 1;
259 if (startOfMatch <= endOfBioExon) {
260 Double score = ppos.i==0 ? 1 :
261 prfl[blockno].scoreFromScratch(comp, startOfMatch, 0, ppos.i).getRoot(ppos.i);
262 int offset = comp ?
263 3*(prfl.blockSize(blockno) - ppos.i)-endOfBioExon + endOfLastCodon : 0;
264 currentResult.push_front(Match(prfl[blockno].id, blockno,
265 startOfMatch, endOfBioExon,
266 score, offset, comp));
267 }
268#ifdef DEBUG
269 else throw ProjectError("Called addPrefixMatch with invalid value!");
270#endif
271 }
272
273 void addSuffixMatch(Position ppos,
274 int beginOfFirstCodon, int beginOfBioExon,
275 bool comp) {
276 int blockno = blockNoOfB(ppos.b, comp);
277#ifdef DEBUG
278 checkInRange(ppos, comp);
279#endif
280 int len = prfl.blockSize(blockno) - ppos.i;
281 int endOfMatch = beginOfFirstCodon + 3*len - 1;
282 if (beginOfBioExon <= endOfMatch) {
283 Double score = len==0 ? 1 :
284 prfl[blockno].scoreFromScratch(comp, beginOfFirstCodon, ppos.i).getRoot(len);
285 int offset = comp ? 0 :
286 3*ppos.i-(beginOfFirstCodon-beginOfBioExon);
287 currentResult.push_front(Match(prfl[blockno].id, blockno,
288 beginOfBioExon, endOfMatch,
289 score, offset, comp));
290 }
291#ifdef DEBUG
292 else throw ProjectError("Called addSuffixMatch with invalid value!");
293#endif
294 }
295
296 void addInternMatch(Position ppos,
297 int beginOfBioExon, int endOfLastCodon, int endOfBioExon,
298 bool comp) {
299 int blockno = blockNoOfB(ppos.b, comp);
300 int len = (endOfLastCodon - beginOfBioExon)/3;
301 Double score = len == 0 ? 1 :
302 prfl[blockno].scoreFromScratch(comp, endOfLastCodon-3*len+1, ppos.i-len, len).getRoot(len);
303 int offset = comp ?
304 3*(prfl.blockSize(blockno) - ppos.i)-endOfBioExon + endOfLastCodon :
305 3*ppos.i - endOfLastCodon + beginOfBioExon;
306 currentResult.push_front(Match(prfl[blockno].id, blockno,
307 beginOfBioExon, endOfBioExon,
308 score, offset, comp));
309 }
310
311 void appendMatchesTo(list<Match>& target) {
312 target.splice(target.end(), currentResult);
313 }
314
315#ifdef DEBUG
316 void outputFullBlocks(int offset=0) {
317 multimap< int, string > output;
318 for (int b=0; b<prfl.blockCount(); b++)
319 for (int f=0; f<6; f++) {
320 bool complement = f%2;
321 const map<int, Double>& hits = blockScores[complement][b].savedHits(f%3);
322 int blockno = blockNoOfB(b, complement);
323 for (map<int, Double>::const_iterator it = hits.begin();
324 it != hits.end(); ++it) {
325 ostringstream strm;
326 strm << it->first + offset << "\t"
327 << it->first + offset + 3*prfl.blockSize(blockno) << "\t"
328 << it->second << "\t"
329 << (complement ? '-' : '+') << "\t0\t"
330 << prfl[blockno].id << "\n";
331 output.insert(make_pair(it->first, strm.str()));
332 }
333 }
334 multimap<int, string>::iterator it;
335 for (it = output.begin(); it != output.end(); ++it)
336 cerr << "\tAUGUSTUS\tblock_hit\t" << it->second ;
337 }
338 void printGlobalThresh(Double p) {
339 for (int b=0; b<blockCount(); b++) {
340 Double p1 = prfl.getGlobalThresh(false, Position(b,0));
341 Double p2 = prfl.getGlobalThresh(true, Position(b,0));
342 cerr << "(thresh " << b << ") (+):" << p * p1 << " (-):" << p * p2 << "\n";
343 }
344 cerr << "( backward strand states are:";
345 for (int i=0; i<stateStrands.size(); i++)
346 if (stateStrands[i])
347 cerr << " " << i;
348 }
349#endif
350 static void setStateStrands(const vector<StateType>& stateMap) {
351 stateStrands.resize(stateMap.size());
352 for (int i=0; i<stateMap.size(); i++)
353 stateStrands[i] = !isOnFStrand(stateMap[i]);
354 }
355
356 private:
357 // int copyOffset;
358 static vector<bool> stateStrands;
359 static double fullIntronBonus;
360 static double minIntronFactor;
361
362 }; // struct PP::SubstateModel
363
364
366 public:
367 virtual ~ExonScorer() {
368#ifdef DEBUG
369 if (backward_mode)
370 delete currentHSColl;
371#endif
372 }
373 void newFirstCodon(int beginOfBioExon) {
374 beginOfFirstCodon = beginOfBioExon + mod3(endOfLastCodon - beginOfBioExon +1);
375 aa_count = (endOfLastCodon - beginOfFirstCodon + 1)/3;
376 }
377
378 // score initial / rterminal exons
379 virtual void scoreFirst(ViterbiColumnType& col, int predState, const Double& transEmiProb) = 0;
380 // score internal / rinternal exons
381 virtual void scoreInternal(ViterbiColumnType& col, int predState, const Double& transEmiProb) = 0;
382 void score(ViterbiColumnType& col, int predState, const Double& transEmiProb) {
383 (this->*scoreFunPtr)(col, predState, transEmiProb);
384 }
385 virtual void postProcessing(const Double& maxProb) = 0;
386 virtual void exportSubstates(ViterbiSubmapType&) {}
387 virtual SubstateId getPredSubstate() {
388 return SubstateId();
389 }
390 virtual bool hasNewMax() {
391 return false;
392 }
393 virtual void addMatches(int, int) {}
394 bool validSize() {
395 return aa_count > 0;
396 }
397#ifdef DEBUG
398 virtual Double getMaxProb() {
399 return 0;
400 }
401 static Matrix<Double>* transitions;
402#endif
403 protected:
404 ExonScorer(SubstateModel& model, int endOfBioExon, StateType type) :
405 scoreFunPtr(0),
406 complement(!isOnFStrand(type)),
407 mdl(model),
408 blockScores(model.blockScores[complement]),
409 beginOfFirstCodon(-1),
410 endOfLastCodon(isOnFStrand(type) ?
411 endOfBioExon - stateReadingFrames[type] :
412 endOfBioExon - (2 - stateReadingFrames[type])),
413 currentHSColl(&model.hitSeqColl[complement][mod3(endOfLastCodon+1)]),
414 is_active(true),
415#ifdef DEBUG
416 is_first(isFirstExon(type)),
417 is_last(isLastExon(type)),
418#endif
419 backward_mode(false)
420 {
421 if (isFirstExon(type))
422 scoreFunPtr = &ExonScorer::scoreFirst;
423 else
424 scoreFunPtr = &ExonScorer::scoreInternal;
425
426 switch (type) {
427 case terminal: case singleG: // subtract the stop codon
428 endStateOffset=-1; break;
429 case initial1: case initial2: // add the split codon
430 case internal1: case internal2:
431 case rinternal0: case rinternal1:
432 case rterminal1: case rterminal0:
433 endStateOffset=1; break;
434 case initial0: case internal0: // splicesite at codon end
435 case rinternal2: case rinitial:
436 case rterminal2: case rsingleG: // reverse gene start at exon end
437 endStateOffset=0; break;
438 default:
439 throw ProjectError("Internal error: State type " + itoa(type) + " unsuitable for exon scorer.");
440 }
441 }
442
443
444 // accessing prfl
445 int blockCount() const {
446 return mdl.blockCount();
447 }
448 int blockSize(int b) const {
449 return mdl.blockSize(b,complement);
450 }
451 const Block& getBlock(int b) const {
452 return mdl.getBlock(b, complement);
453 }
454 bool exceedsBlock(Position ppos) const {
455 return ppos.i > blockSize(ppos.b);
456 }
457 DistanceType interBlockDist(int b) const {
458 return mdl.interBlockDist(b,complement);
459 }
460 DistanceType interBlockFullDist(int b) const {
461 if (b==0)
462 return interBlockDist(b);
463 else
464 return interBlockDist(b) + Range(blockSize(b-1));
465 }
466 Range interBlockRange(int b) const {
467 return interBlockDist(b).r;
468 }
469 Range interBlockFullRange(int b) const {
470 return interBlockFullDist(b).r;
471 }
472 Position asNextPos(Position ppos) {
473 ppos.i -= blockSize(ppos.b);
474 ppos.i -= interBlockRange(++ppos.b).max;
475 return ppos;
476 }
477
478 // which scoring function to use
479 void (ExonScorer::*scoreFunPtr)(ViterbiColumnType&, int predState, const Double&);
480
481 // internal data fields
482 bool complement;
483 SubstateModel& mdl;
484 vector<BlockScoreType>& blockScores;
485
486 /*
487 * codon coordinates of the currently observed exon
488 *
489 * endOfLastCodon is the position of the rightmost nucleotide
490 * with reading frame 2 (forward) resp. 0 (backward) in the
491 * biological exon
492 * endOfLastCodon is the position until which profile scores
493 * are calculated for exons no matter where "base" is in this case
494 */
495 int aa_count, beginOfFirstCodon, endOfLastCodon;
496
497 // these are the block hit sequences
498 HitSequenceCollection* currentHSColl;
499
500 bool is_active;
501#ifdef DEBUG
502 bool is_first, is_last;
503#endif
504 int endStateOffset; // in reading frames != 0, the state id
505 // refers to the end of the split codon,
506 // and not to the last full codon in the exon
507 // we set endStateOffset:=1 in these cases
508
509 bool backward_mode; // true if called from sampling or backtracking
510 }; // class PP::ExonScorer
511
512
514 public:
515 SingleTargetExonScorer(SubstateModel& model, StateType etype, int endOfBioExon);
516 SingleTargetExonScorer(SubstateModel& model, StateType etype, SubstateId substate,
517 int endofBioExon, int ORFleft);
518 void calcAllowedDist(bool is_last) {
519 allowedDist.has_max = interBlockDist(rightPos.b).has_max || rightPos.i<0;
520 allowedDist.r =
521 (rightPos.i > 0 || is_last) ?
522 interBlockRange(rightPos.b).alignRight(rightPos.i) : rightPos.i;
523 }
524 void createHitSequences(int ORFleft);
525
526 // score single, backtrack initial/rterminal/single exons
527 virtual void scoreFirst(ViterbiColumnType& col, int predState, const Double& transEmiProb);
528
529 // score terminal/rinitial, backtrack terminal/rinitial/internal/rinternal
530 virtual void scoreInternal(ViterbiColumnType& col, int predState, const Double& transEmiProb);
531
532 virtual void postProcessing(const Double& mainProb);
533 virtual void exportSubstates(ViterbiSubmapType&);
534 virtual SubstateId getPredSubstate() {
535 return predSubstate;
536 }
537 virtual bool hasNewMax() {
538 return newmax;
539 }
540 virtual void addMatches(int beginOfBioExon, int endOfBioExon) {
541 newFirstCodon(beginOfBioExon);
542 if (rightPos.i - endStateOffset > aa_count) {
543 mdl.addInternMatch(rightPos - endStateOffset, beginOfBioExon, endOfLastCodon, endOfBioExon, complement);
544 return;
545 }
546 if (rightPos.i > 0)
547 mdl.addPrefixMatch(rightPos - endStateOffset, endOfLastCodon, endOfBioExon, complement);
548 mdl.addHitSeqMatch(bestHitSeq, complement);
549 if (predSubstate.slot>=0) {
550 Position ppos = predSubstate;
551 if (ppos.i > 0)
552 mdl.addSuffixMatch(ppos, beginOfFirstCodon, beginOfBioExon, complement);
553 }
554 }
555 // SubstateId rightId() const {
556 // return rightPos.id();
557 // }
558#ifdef DEBUG
559 Double getFactor();
560 Double getPrefixFactor() const {
561 return prefixFactor;
562 }
563 virtual Double getMaxProb() {
564 return maxProb;
565 }
566 void checkResults(const OptionListItem& oli, const ViterbiMatrixType& viterbi,
567 int, int, Double endPartProb, Double notEndPartProb);
568#endif
569
570 private:
571 Double bestFittingBonusFactor(int b, int maxBlockStart, const HitSequence*& argmax);
572
573 Position rightPos;
574 DistanceType allowedDist; // this is the range of values for targetPos.i compatible with rightPos
575 SubstateId predSubstate;
576 Double maxProb;
577 bool newmax;
578 ViterbiSubmapType* bestSource;
579 const HitSequence* bestHitSeq;
580 Double prefixFactor;
581 }; // class PP::SingleTargetExonScorer
582
583
585 public:
587 StateType etype, int endOfBioExon);
588 // no destructor needed:
589 // bestLeftId is owned by the ViterbiSubmap, source just linked to one
590 // ~MultiTargetExonScorer() {}
591
592 private:
593 void initBonusProbs(BonusProbType&, int b, int frame);
594 void updateBonusProbs(Position targetPos, const Double& bonusProb);
595 bool scoreSourceEntry(int b, int maxBlockStart, Double bonusProb);
596
597 // score initial/rterminal exons
598 virtual void scoreFirst(ViterbiColumnType& source, int predState, const Double& transEmiProb);
599
600 // score internal/rinternal exons
601 virtual void scoreInternal(ViterbiColumnType& source, int predState, const Double& transEmiProb);
602
603 virtual void postProcessing(const Double& maxProb);
604
605 ViterbiSubmapType& targetVit;
606 vector<BonusProbType> bonusProbs;
607 ViterbiSubmapType* source;
608 SubstateId leftId;
609 SubstateIdMap* bestLeftId;
610 }; // class PP::MultiTargetExonScorer
611
612} // namespace PP
613#endif
This class implements a double object with a very large range.
Definition lldouble.hh:31
A simple matrix class. Base class for all mathematical matrix objects.
Definition matrix.hh:27
Options lists are used for sampling; items also in backtracking.
Definition vitmatrix.hh:748
Definition pp_profile.hh:566
Definition pp_profile.hh:412
Definition pp_scoring.hh:365
Definition pp_hitseq.hh:362
Definition pp_scoring.hh:584
Definition pp_profile.hh:665
Definition pp_scoring.hh:513
Definition types.hh:449
Definition vitmatrix.hh:491
An array of Viterbi columns.
Definition vitmatrix.hh:687
Definition vitmatrix.hh:209
Definition pp_scoring.hh:31
Definition pp_scoring.hh:59
Definition pp_profile.hh:227
Definition pp_hitseq.hh:27
Definition pp_scoring.hh:108
Definition pp_scoring.hh:102
Definition pp_profile.hh:346
Definition pp_profile.hh:190
Definition pp_scoring.hh:136
Definition vitmatrix.hh:33