Augustus 3.4.0
Loading...
Searching...
No Matches
pp_simscore.hh
1/*
2 * Name: pp_simscore.hh
3 *
4 * License: Artistic License, see file LICENSE.TXT or
5 * https://opensource.org/licenses/artistic-license-1.0
6 *
7 * Project: similarity-score algorithm for block-profile and protein sequence
8 * with intron information
9 * Author: Lars Gabriel
10 *
11 */
12
13#ifndef __PP_SIMSCORE_HH
14#define __PP_SIMSCORE_HH
15
16#include <iostream>
17#include <fstream>
18#include <limits>
19#include <math.h>
20#include <string>
21#include "properties.hh"
22#include "pp_profile.hh"
23
24 namespace SS {
42 struct Cell {
43 double score;
44 std::vector < std::tuple <int, int, char > > prev;
45 };
46
67 class Row {
68 private:
69 int len;
70 public:
71 Cell* row;
72 int position;
73 Row (int length, int position);
74 Cell& operator[] (int n);
75 int length();
76 };
77
107 private:
108 std::vector<Row> matrix;
109 public:
110 Row& operator[] (int n);
111 void addRow (int l, int p);
112 int length ();
113 bool empty ();
115 };
116
150 private:
151 char* sequence = NULL;
152 std::map < int, int > introns;
153 std::vector < int > num_prev_introns;
154 int len;
155 char* name;
156 public:
157 ProteinSequence (const char* fileName);
159 char& operator[] (int n);
160 int length();
161 int intronAt(int i);
162 int intronsInRange (int start, int end);
163 };
164
175 //amino acid sequence that has the highest likelihood to belong
176 //to the profile
177 std::string argmax_aminoacids;
178 //for each column the frequency of the aligned amino acid from
179 //the protein sequence in the profile
180 std::vector < double > match_prob;
181 };
182
201 int protSeq_pos;
202 int block_pos;
203 int col_pos;
204 int length;
205 char type;
206 };
207
215 int i;
216 int j;
217 int j_prev;
218 int block_count;
219 int col_count;
220 int col_posStart;
221 AlignmentElement align_element;
222 std::vector <AlignmentElement > alignment;
223 };
224
251 private:
252 //Similarity matrix:
253 //one column represents one entry of the sequence
254 //one row represents an entry of the blockprofile
256 //data structure for the protein sequence P
257 ProteinSequence* seq;
258 //data structure for the block profile B
259 PP::Profile* prfl;
260 //parameter for the gap costs
261 double gap_cost_inter;
262 double gap_cost_intra;
263 double gap_cost_intron;
264 //parameter for the intron weights
265 double intron_weight_inter;
266 double intron_weight_intra;
267 //parameter for the pseudocount
268 //the pseudocount is added to a relative frequency (v/w) of an
269 //intron position with (v+epsi_intron)/(w+epsi_intron+epsi_noIntron)
270 double epsi_intron;
271 double epsi_noIntron;
272
273 //number of protein sequences used to create the
274 //intron profile of the block profile
275 int intronPrfl_noProt;
276
277 //background frequency of an introns at an intrablock intron position,
278 //estimated from the intron positions of 15799 protein sequences
279 const double intronIntraBlock_bfreq = 1.706e-3;
280 //background frequency of the number of introns in
281 //an intrablock intron position, estimated from the intron positions
282 //of 15799 protein sequences in inter-block section
283 //the length an inter-block section that was used for this estimation
284 //is the mean of the inter-block distance interval
285 const double intronAvgInterBlock_bfreq = 9.599e-3;
286
287 //computes the intra-block intron score for a match of intron positions
288 //with f residual nucleotides, if an intron of P is aligned to
289 //block column s of block k
290 //if f=-1: ths score of a mismatch in intron position
291 //for f in {0,1,2} is computed
292 double intraBlock_iscore (int k, int s, int f, int intron_frame);
293 //Poisson distribution, which is used as the background distribution
294 //of k intron in an inter-block section
295 //lambda is estimated for I=[d_min, d_max] as
296 //3 * ((d_min+d_max)/2 +1) * (intronAvgInterBlock_bfreq/intronPrfl_noProt)
297 double PoiDist (int k, double lambda);
298 //method for comparison, whether a score is the new best score
299 //for a cell
300 //and stores the information of the current best score for backtracking
301 double compareScores(double new_score, double old_score, int current_i,\
302 int current_j, int prev_i, int prev_j, char score_type);
303 //calculates the intron score
304 double intronScore (double q, double q_b);
305
306 public:
307 SimilarityScore (double g, double b, double g_i, double iw1,\
308 double iw2, double e_i, double e_n);
310 //prints the inter block distance intervals of B to stdout
311 void printInterBlock();
312 bool SimilarityMatrix_empty();
313 //prints an optimal alignment as the coordinates of connected
314 //alignment sections of the same alignment type
315 //Format: List of AlignmentElement
316 void printAlignmentDB();
317 //prints a list of a translations from the indice of a block to
318 //the number of the block in the .prfl file.
319 void printBlockPos ();
320 //prints the average of the argmax of the block columns for
321 //the complete profile
322 pair<double, double> avgArgMaxProb();
323 //read files:
324 //sequence input file has to be in FASTA
325 //intron positions are optional as '[Introns]' section
326 //in the file format (see description of class ProteinSequence)
327 //blockprofile input file has to be in .prfl
328 //intron information are optional in the file format
329 //(see README file)
330 void readFiles (const char* seqFileName, const char* profileFileName);
331
332 //algorithm to fill the similarity matrix and compute
333 //the final similarity score
334 void fillSimilarityMatrix ();
335
336 //traces back the alignments corresponding to the final score
337 //alignment are stored as list of AlignmentElements
338 void backtrackingDB (int number_Alignments);
339
340 //print final similarity score
341 double score();
342
343 //final alignments of the sequence and the profile
344 std::vector <std::vector <AlignmentElement > > alignments;
345
346 //datastructure for a readable alignment, which is used
347 //for printing an alignment
348 std::vector < pair <std::string, PrflAlignment > > alignmentsReadable;
349 //prints similarity matrix and alignments with cout
350 void printSimilarityMatrix ();
351
352 //print readable alignment to stdout
353 //FORMAT: - Alignment representation of P (AminoAcid, gap symbol
354 // or number of amino acids in inter-block)
355 // - Alignemnt representation of argmax of B
356 // (argmax AminoAcid for aligned block column,
357 // gap symbol or inter-block length)
358 // - frequency of amino acid of P in aligned block column of B,
359 // if alignment type is a match
360 void printAlignment ();
361 };
362}
363//namespace SS
364#endif
Definition pp_profile.hh:665
class representing a protein sequence with optional informations about intron postions
Definition pp_simscore.hh:149
class representing one row of a SimilarityMatrix
Definition pp_simscore.hh:67
class representing a similarity matrix, for the SimilarityScore algorithm
Definition pp_simscore.hh:106
algorithm to compare the similarity of a protein sequence and a protein profile and compute an optima...
Definition pp_simscore.hh:250
structure representing one element of a final alignment produced by the backtracking algorithm
Definition pp_simscore.hh:200
structure representing all information internally needed for backtracking one final alignment
Definition pp_simscore.hh:214
structure representing one cell of a similarity matrix
Definition pp_simscore.hh:42
Information of the representation of the profile in an alignment for a readable output to stdout.
Definition pp_simscore.hh:174