Augustus 3.4.0
Loading...
Searching...
No Matches
types.hh
1/*
2 * types.hh
3 *
4 * License: Artistic License, see file LICENSE.TXT or
5 * https://opensource.org/licenses/artistic-license-1.0
6 */
7
8#ifndef _TYPES_HH
9#define _TYPES_HH
10
11// project includes
12#include "lldouble.hh"
13
14
15#ifdef TESTING
16#include <boost/archive/text_oarchive.hpp>
17#include <boost/archive/text_iarchive.hpp>
18#include <boost/archive/impl/basic_text_oarchive.ipp>
19#include <boost/archive/impl/text_oarchive_impl.ipp>
20#include <boost/serialization/list.hpp>
21#include <boost/serialization/vector.hpp>
22
23/*
24 * author Stefanie Koenig (included here and adapted by Giovanna Migliorelli on 14.05.2020)
25 * append slash to the end of a directory name
26 */
27void expandDir(string& filename);
28#endif
29
30// standard C/C++ includes
31#include <string>
32#include <exception>
33#include <sstream>
34#include <cstring>
35#include <cstdlib>
36#include <vector>
37#include <limits>
38#include <algorithm> // for std:sort
39#include <map>
40#ifdef COMPGENEPRED
41#include <unordered_map>
42#endif
43
44using namespace std;
45
50enum Strand {STRAND_UNKNOWN=-1, plusstrand, minusstrand, bothstrands};
51char strandChar (Strand s);
52ostream& operator<< (ostream& strm, const Strand s);
53
59#define ASS_MIDDLE 2
60#define DSS_MIDDLE 2
61#define STARTCODON_LEN 3
62#define STOPCODON_LEN 3
63#define OCHRECODON "taa"
64#define OPALCODON "tga"
65#define AMBERCODON "tag"
66#define RCOCHRECODON "tta"
67#define RCOPALCODON "tca"
68#define RCAMBERCODON "cta"
69#define TRUNC_LEFT 1
70#define TRUNC_RIGHT 2
71
72#define SPECIES_SUBDIR "species/"
73#define MODEL_SUBDIR "model/"
74#define EXTRINSIC_SUBDIR "extrinsic/"
75
76#define VERSION "3.5.0"
77
78#define PREAMBLE "# This output was generated with AUGUSTUS (version " VERSION ").\n\
79# AUGUSTUS is a gene prediction tool written by M. Stanke (mario.stanke@uni-greifswald.de),\n\
80# O. Keller, S. König, L. Gerischer, L. Romoth, Katharina Hoff, Henry Mehlan and Daniel Honsel.\n\
81# Please cite: Mario Stanke, Mark Diekhans, Robert Baertsch, David Haussler (2008),\n\
82# Using native and syntenically mapped cDNA alignments to improve de novo gene finding\n\
83# Bioinformatics 24: 637-644, doi 10.1093/bioinformatics/btn013"
84
85#define GREETING "AUGUSTUS (" VERSION ") is a gene prediction tool.\n\
86Sources and documentation at https://github.com/Gaius-Augustus/Augustus"
87
88#define SPECIES_LIST "usage:\n\
89augustus [parameters] --species=SPECIES queryfilename\n\
90\n\
91where SPECIES is one of the following identifiers\n\
92\n\
93identifier | species\n\
94-----------------------------------------|----------------------\n\
95pea_aphid | Acyrthosiphon pisum\n\
96aedes | Aedes aegypti\n\
97amphimedon | Amphimedon queenslandica\n\
98ancylostoma_ceylanicum | Ancylostoma ceylanicum\n\
99adorsata | Apis dorsata\n\
100honeybee1 | Apis mellifera\n\
101arabidopsis | Arabidopsis thaliana\n\
102aspergillus_fumigatus | Aspergillus fumigatus\n\
103aspergillus_nidulans | Aspergillus nidulans\n\
104(anidulans) | Aspergillus nidulans\n\
105aspergillus_oryzae | Aspergillus oryzae\n\
106aspergillus_terreus | Aspergillus terreus\n\
107bombus_impatiens1 | Bombus impatiens\n\
108bombus_terrestris2 | Bombus terrestris\n\
109botrytis_cinerea | Botrytis cinerea\n\
110brugia | Brugia malayi\n\
111b_pseudomallei | Burkholderia pseudomallei\n\
112caenorhabditis | Caenorhabditis elegans\n\
113(c_elegans_trsk) | Caenorhabditis elegans\n\
114(elegans) | Caenorhabditis elegans\n\
115elephant_shark | Callorhinchus milii\n\
116camponotus_floridanus | Camponotus floridanus\n\
117candida_albicans | Candida albicans\n\
118candida_guilliermondii | Candida guilliermondii\n\
119candida_tropicalis | Candida tropicalis\n\
120chaetomium_globosum | Chaetomium globosum\n\
121chiloscyllium | Chiloscyllium punctatum\n\
122chlamy2011 | Chlamydomonas reinhardtii\n\
123(chlamydomonas) | Chlamydomonas reinhardtii\n\
124chlorella | Chlorella sp.\n\
125ciona | Ciona intestinalis\n\
126coccidioides_immitis | Coccidioides immitis\n\
127coccidioides_immitis | Coccidioides immitis\n\
128Conidiobolus_coronatus | Conidiobolus coronatus\n\
129coprinus_cinereus | Coprinus cinereus\n\
130coprinus_cinereus | Coprinus cinereus\n\
131(coprinus) | Coprinus cinereus\n\
132coprinus | Coprinus cinereus\n\
133cryptococcus_neoformans_gattii | Cryptococcus neoformans gattii\n\
134cryptococcus_neoformans_neoformans_B | Cryptococcus neoformans neoformans\n\
135(cryptococcus) | Cryptococcus neoformans\n\
136culex | Culex pipiens\n\
137zebrafish | Danio rerio\n\
138debaryomyces_hansenii | Debaryomyces hansenii\n\
139fly | Drosophila melanogaster\n\
140(fly_exp) | Drosophila melanogaster\n\
141encephalitozoon_cuniculi_GB | Encephalitozoon cuniculi\n\
142eremothecium_gossypii | Eremothecium gossypii\n\
143E_coli_K12 | Escherichia coli K12\n\
144fusarium_graminearum | Fusarium graminearum\n\
145(fusarium) | Fusarium graminearum\n\
146galdieria | Galdieria sulphuraria\n\
147chicken | Gallus gallus domesticus\n\
148heliconius_melpomene1 | Heliconius melpomene\n\
149histoplasma_capsulatum | Histoplasma capsulatum\n\
150(histoplasma) | Histoplasma capsulatum\n\
151human | Homo sapiens\n\
152kluyveromyces_lactis | Kluyveromyces lactis\n\
153laccaria_bicolor | Laccaria bicolor\n\
154leishmania_tarentolae | Leishmania tarentolae\n\
155japaneselamprey | Lethenteron camtschaticum\n\
156lodderomyces_elongisporus | Lodderomyces elongisporus\n\
157magnaporthe_grisea | Magnaporthe grisea\n\
158mnemiopsis_leidyi | Mnemiopsis leidyi\n\
159nasonia | Nasonia vitripennis\n\
160nematostella_vectensis | Nematostella vectensis\n\
161neurospora_crassa | Neurospora crassa\n\
162(neurospora) | Neurospora crassa\n\
163coyote_tobacco | Nicotiana attenuata\n\
164rice | Oryza sativa\n\
165parasteatoda | Parasteatoda sp.\n\
166sealamprey | Petromyzon marinus\n\
167phanerochaete_chrysosporium | Phanerochaete chrysosporium\n\
168(pchrysosporium) | Phanerochaete chrysosporium\n\
169pichia_stipitis | Pichia stipitis\n\
170pisaster | Pisaster ochraceus\n\
171pfalciparum | Plasmodium falciparum\n\
172pneumocystis | Pneumocystis jirovecii\n\
173rhincodon | Rhincodon typus\n\
174rhizopus_oryzae | Rhizopus oryzae\n\
175rhodnius | Rhodnius prolixus\n\
176saccharomyces_cerevisiae_rm11-1a_1 | Saccharomyces cerevisiae\n\
177saccharomyces_cerevisiae_S288C | Saccharomyces cerevisiae\n\
178(saccharomyces) | Saccharomyces cerevisiae\n\
179schistosoma | Schistosoma mansoni\n\
180(schistosoma2) | Schistosoma mansoni\n\
181schizosaccharomyces_pombe | Schizosaccharomyces pombe\n\
182scyliorhinus | Scyliorhinus torazame\n\
183tomato | Solanum lycopersicum\n\
184s_aureus | Staphylococcus aureus\n\
185s_pneumoniae | Streptococcus pneumoniae\n\
186strongylocentrotus_purpuratus | Strongylocentrotus purpuratus\n\
187sulfolobus_solfataricus | Sulfolobus solfataricus\n\
188tetrahymena | Tetrahymena thermophila\n\
189cacao | Theobroma cacao\n\
190thermoanaerobacter_tengcongensis | Thermoanaerobacter tengcongensis\n\
191toxoplasma | Toxoplasma gondii\n\
192tribolium2012 | Tribolium castaneum\n\
193trichinella | Trichinella sp.\n\
194wheat | Triticum sp.\n\
195ustilago_maydis | Ustilago maydis\n\
196(ustilago) | Ustilago maydis\n\
197verticillium_albo_atrum1 | Verticillium albo atrum\n\
198verticillium_longisporum1 | Verticillium longisporum\n\
199volvox | Volvox sp.\n\
200Xiphophorus_maculatus | Xiphophorus maculatus\n\
201yarrowia_lipolytica | Yarrowia lipolytica\n\
202maize | Zea mays\n\
203(maize5) | Zea mays\n\
204\n"
205
206#define HELP_USAGE "usage:\n\
207augustus [parameters] --species=SPECIES queryfilename\n\
208\n\
209'queryfilename' is the filename (including relative path) to the file containing the query sequence(s)\n\
210in fasta format.\n\
211\n\
212SPECIES is an identifier for the species. Use --species=help to see a list.\n\
213\n\
214parameters:\n\
215--strand=both, --strand=forward or --strand=backward\n\
216--genemodel=partial, --genemodel=intronless, --genemodel=complete, --genemodel=atleastone or --genemodel=exactlyone\n\
217 partial : allow prediction of incomplete genes at the sequence boundaries (default)\n\
218 intronless : only predict single-exon genes like in prokaryotes and some eukaryotes\n\
219 complete : only predict complete genes\n\
220 atleastone : predict at least one complete gene\n\
221 exactlyone : predict exactly one complete gene\n\
222--singlestrand=true\n\
223 predict genes independently on each strand, allow overlapping genes on opposite strands\n\
224 This option is turned off by default.\n\
225--hintsfile=hintsfilename\n\
226 When this option is used the prediction considering hints (extrinsic information) is turned on.\n\
227 hintsfilename contains the hints in gff format.\n\
228--AUGUSTUS_CONFIG_PATH=path\n\
229 path to config directory (if not specified as environment variable)\n\
230--alternatives-from-evidence=true/false\n\
231 report alternative transcripts when they are suggested by hints\n\
232--alternatives-from-sampling=true/false\n\
233 report alternative transcripts generated through probabilistic sampling\n\
234--sample=n\n\
235--minexonintronprob=p\n\
236--minmeanexonintronprob=p\n\
237--maxtracks=n\n\
238 For a description of these parameters see section 2 of RUNNING-AUGUSTUS.md.\n\
239--proteinprofile=filename\n\
240 When this option is used the prediction will consider the protein profile provided as parameter.\n\
241 The protein profile extension is described in section 5 of RUNNING-AUGUSTUS.md.\n\
242--progress=true\n\
243 show a progressmeter\n\
244--gff3=on/off\n\
245 output in gff3 format\n\
246--predictionStart=A, --predictionEnd=B\n\
247 A and B define the range of the sequence for which predictions should be found.\n\
248--UTR=on/off\n\
249 predict the untranslated regions in addition to the coding sequence. This currently works only for a subset of species.\n\
250--noInFrameStop=true/false\n\
251 Do not report transcripts with in-frame stop codons. Otherwise, intron-spanning stop codons could occur. Default: false\n\
252--noprediction=true/false\n\
253 If true and input is in genbank format, no prediction is made. Useful for getting the annotated protein sequences.\n\
254--uniqueGeneId=true/false\n\
255 If true, output gene identifyers like this: seqname.gN\n\
256\n\
257For a complete list of parameters, type \"augustus --paramlist\". A description of the important ones can be found in the file RUNNING-AUGUSTUS.md.\n"
258
259#define HELP_USAGE_ETRAINING "usage:\n\
260etraining --species=SPECIES trainfilename\n\
261\n\
262'trainfilename' is the filename (including relative path) to the file in genbank format containing the training sequences. These can be multi-gene sequences.\n\
263SPECIES is a name for your species.\n\
264\n\
265parameters:\n\
266--/genbank/verbosity=n\n\
267 Choose one of 0,1,2 or 3. The larger the verbosity, the more (error) messages you get.\n"
268
269
270#define POWER4TOTHE(x) ((int) (pow((double) 4,(double) (x)) + 0.1))
271
272/*
273 * bonus for (almost sure) hint with negative score
274 * the user sets an (almost sure) anchor
275 */
276
277#define BONUSSUREHINT 1e100
278
282typedef long int Integer;
286typedef int Short;
290typedef bool Boolean;
294typedef LLDouble Double;
298typedef double Float;
299
304class Constant {
305public:
306 static void init();
307 static string fullSpeciesPath() {return configPath + speciesDir;}
308 static string modelPath() {return configPath + MODEL_SUBDIR;}
309 static string extrinsicPath() {return configPath + EXTRINSIC_SUBDIR;}
310
311 // size of splicesite patterns
312 static int ass_size() { return ass_start + ass_end; } // excluding "AG"
313 static int ass_whole_size() { return ass_size() + ASS_MIDDLE; } // including "AG"
314 static int dss_size() { return dss_start + dss_end; } // excluding "GT"
315 static int dss_whole_size() { return dss_size() + DSS_MIDDLE; } // including "GT"
316
317 static string configPath;
318 static string speciesDir;
319 static int decomp_num_at;
320 static int decomp_num_gc;
321 static int decomp_num_steps;
322 static int trans_init_window;
323 static int tis_maxbinsize;
324 static int ass_upwindow_size;
325 static int init_coding_len;
326 static int et_coding_len;
327 static int ass_start; // size of ASS pattern before "AG"
328 static int ass_end; // after "AG"
329 static int dss_start; // size of DSS pattern before "GT"
330 static int dss_end; // after "GT"
331 static int dss_maxbinsize;
332 static int ass_maxbinsize;
333 static int tss_upwindow_size; // win from start of model up to tss
334 static int tss_start;
335 static int min_coding_len;
336 static int max_exon_len;
337 static Integer d_polyasig_cleavage;
338 static bool keep_viterbi;
339 static double gc_range_min;
340 static double gc_range_max;
341 static double probNinCoding;
342 static double opalprob;
343 static double amberprob;
344 static double ochreprob;
345 static bool utr_option_on;
346 static bool nc_option_on;
347 static Integer augustus_verbosity;
348 static bool alternatives_from_evidence;
349 static double subopt_transcript_threshold;
350 static Integer almost_identical_maxdiff; // maximum allowed difference for the ends of the transcript
351 static bool uniqueGeneId;
352 static double max_contra_supp_ratio;
353 static bool reportUtrOnlyGenes;
354 static bool useCRFtraining;
355 static bool CRFtrainTIS;
356 static bool CRFtrainSS;
357 static bool CRFtrainIntron;
358 static bool CRFtrainIgenic;
359 static bool CRFtrainCDS;
360 static bool CRFtrainUTR;
361 static bool dss_gc_allowed;
362 static Boolean tieIgenicIntron; // whether to tie igenic model parameters to intron model parameters, i.e. use just one content model, that of the intron
363 static Boolean exoncands;
364 static Boolean proteinOutput;
365 static Boolean codSeqOutput;
366 static Boolean contentmodels; // whether to use content models, default: true
367 static Integer min_intron_len; // minimal intron length (for hints)
368 static bool MultSpeciesMode; // whether we do comparative gene prediction in multiple species
369 static string treefile; // file name in which a tree is specified in Newick format
370 static string speciesfilenames; // file name to file which contains the names of species and the corresponding file names
371 static string dbaccess; // comma separated string with database access (hostname, database name, table name, user, passwd
372 static string alnfile; // name of file that contains MSA of genomes
373 static string codonalnfile; // name of file that contains MSA of codon sequences
374 static bool overlapmode; // whether overlapping exons are allowed in Viterbi algorithm
375 static Boolean printEvidence; // "evidence for and against"
376 static Boolean printExonCandsMSA; // print the OE nucleotide alignments
377 static Boolean printOEs; // output ortho exons to file
378 static Boolean printHints;
379 static Boolean printMEA; // output .mea files (base genes) during CGP
380 static Boolean printSampled; // output .sampled_GFs during CGP
381 static Boolean printGeneRangesBED;
382 static Boolean printGeneRangesGFF;
383 static Integer maxOvlp; // parameters for overlapping coding regions in bacteria
384 static vector<Double> head2tail_ovlp;
385 static vector<Double> head2head_ovlp;
386 static vector<Double> tail2tail_ovlp;
387 static unsigned temperature; // heating the distribution for sampling, 0=cold, 7=hottest, take probs to the power of (8-temperature)/8
388 static bool softmasking; // if true, lower-case character regions
389 // give rise to nonexonpart hints
390 static bool softmasking_explicitly_requested; // i.e. it was not
391 // left at default value
392 static bool dbhints;
393 // scores from logistic regression
394 static bool logreg;
395 static string referenceFile;
396 static string refSpecies;
397 static double GD_stepsize;
398 static bool rLogReg;
399 static double label_flip_prob;
400 static double lambda;
401 #ifdef COMPGENEPRED
402 static unordered_map<string, pair<int, vector<double> > > logReg_feature;
403 #endif
404 static vector<double> ex_sc;
405 static vector<double> in_sc;
406 static int oeExtensionWidth;
407 static vector<double> lg_es;
408 static bool computeNumSubs;
409 static bool useAArates;
410 static bool useNonCodingModel;
411 static bool rescaleBoni;
412};
413
414
415extern bool inCRFTraining;
416
417extern const int power2id[31];
418
419#define A_SET_FLAG(x) power2id[x]
420
425struct Bitmask {
426 Bitmask (int n=0) : value(n) {}
427 Bitmask (const Bitmask& other) : value(other.value) {}
428 bool operator[] (int n) const {
429 return value & A_SET_FLAG(n);
430 }
431 void set(int n) {
432 value |= A_SET_FLAG(n);
433 }
434 void unset(int n) {
435 value &= ~A_SET_FLAG(n);
436 }
437 static Bitmask any() {
438 return Bitmask(-1);
439 }
440 int value;
441};
442
443
449class ProjectError : public exception {
450public:
455 ProjectError( string msg ) throw () : exception(), message( msg ) {
456 }
460 ProjectError( ) throw () : exception() { }
461
465 ~ProjectError() throw() {}
466
471 string getMessage( ) { return message; }
472private:
473 string message;
474};
475
481public:
482 InvalidNucleotideError( char t ) : ProjectError( "Invalid nucleotide '" + string(1,t) + "' encountered." ) { }
483};
484
485
486#define NUM_TYPES 86
487
492enum StateType{TYPE_UNKNOWN = -1, igenic,
493 // forward strand
494 singleG, initial0, initial1, initial2, internal0, internal1, internal2, terminal,
495 lessD0, longdss0, equalD0, geometric0, longass0, // The five intron states for frame 0
496 lessD1, longdss1, equalD1, geometric1, longass1, // The five intron states for frame 1
497 lessD2, longdss2, equalD2, geometric2, longass2, // The five intron states for frame 2
498 utr5single, utr5init, utr5intron, utr5intronvar, utr5internal, utr5term, // 5'UTR states
499 utr3single, utr3init, utr3intron, utr3intronvar, utr3internal, utr3term, // 3'UTR states
500 // reverse strand
501 rsingleG, rinitial, rinternal0, rinternal1, rinternal2, rterminal0, rterminal1, rterminal2,
502 rlessD0, rlongdss0, requalD0, rgeometric0, rlongass0, // The five intron states for frame 0
503 rlessD1, rlongdss1, requalD1, rgeometric1, rlongass1, // The five intron states for frame 1
504 rlessD2, rlongdss2, requalD2, rgeometric2, rlongass2, // The five intron states for frame 2
505 rutr5single, rutr5init, rutr5intron, rutr5intronvar, rutr5internal, rutr5term, // 5'UTR states
506 rutr3single, rutr3init, rutr3intron, rutr3intronvar, rutr3internal, rutr3term, // 3'UTR states
507
508 intron_type, rintron_type, exon_type,
509 // non-protein-coding states
510 ncsingle, ncinit, ncintron, ncintronvar, ncinternal, ncterm, // forward strand
511 rncsingle, rncinit, rncintron, rncintronvar, rncinternal, rncterm // reverse strand
512};
513
514extern const char* stateTypeNames[NUM_TYPES];
515extern const char* stateTypeIdentifiers[NUM_TYPES];
516extern const int stateReadingFrames[NUM_TYPES];
517
518
519
520/*
521 * mod3, returns the number in {0,1,2} being congruent to the argument modulo 3
522 */
523inline int mod3 (int k) {
524 return (k>=0)? k%3 : (k%3+3)%3;
525}
526
527inline int modm(int k, int m) {
528 return (k>=0)? k%m : (k%m+m)%m;
529}
530
531
532/*
533 * state type functions
534 */
535
536inline StateType toStateType( const char* str ){
537 int i;
538 for (i=0; i<NUM_TYPES; i++)
539 if (strcmp(str, stateTypeIdentifiers[i]) == 0)
540 return (StateType) i;
541 return TYPE_UNKNOWN;
542}
543
544inline Boolean isInitialExon(StateType type){
545 return (type==initial0 || type==initial1 || type==initial2);
546}
547
548inline Boolean isInternalExon(StateType type){
549 return (type==internal0 || type==internal1 || type==internal2);
550}
551
552inline Boolean isRTerminalExon(StateType type){
553 return (type==rterminal0 || type==rterminal1 || type==rterminal2);
554}
555
556inline Boolean isRInternalExon(StateType type){
557 return (type==rinternal0 || type==rinternal1 || type==rinternal2);
558}
559
560inline Boolean isFirstExon(StateType type){
561 return (isInitialExon(type) || isRTerminalExon(type) || type==singleG || type==rsingleG);
562}
563
564inline Boolean isLastExon(StateType type){
565 return (type==terminal || type==rinitial || type==singleG || type == rsingleG);
566}
567
568inline Boolean is5UTR(StateType type){
569 return (utr5single <= type && type <= utr5term)
570 || (rutr5single <= type && type <= rutr5term);
571}
572
573inline Boolean is5UTRIntron(StateType type){
574 return (type == utr5intron || type == utr5intronvar
575 || type == rutr5intron || type == rutr5intronvar);
576}
577
578inline Boolean is5UTRExon(StateType type){
579 return is5UTR(type) && !is5UTRIntron(type);
580}
581
582inline Boolean is3UTR(StateType type){
583 return (utr3single <= type && type <= utr3term)
584 || (rutr3single <= type && type <= rutr3term);
585}
586
587inline Boolean is3UTRIntron(StateType type){
588 return (type == utr3intron || type == utr3intronvar
589 || type == rutr3intron || type == rutr3intronvar);
590}
591
592inline Boolean is3UTRExon(StateType type){
593 return is3UTR(type) && !is3UTRIntron(type);
594}
595
596inline Boolean isFirstUTRExon(StateType type){
597 return (type == utr5single || type == utr5init || type == rutr3single || type == rutr3term);
598}
599
600inline Boolean isLastUTRExon(StateType type){
601 return (type == utr3single || type == utr3term || type == rutr5single || type == rutr5init);
602}
603
604inline Boolean isLongUTRIntron(StateType type){
605 return (type == utr5intronvar || type == utr3intronvar || type == rutr5intronvar || type == rutr3intronvar);
606}
607
608inline Boolean isCodingExon(StateType type) {
609 return (singleG <= type && type <= terminal)
610 || (rsingleG <= type && type <= rterminal2);
611}
612
613inline Boolean isNcIntron(StateType type) {
614 return (type == ncintron || type == ncintronvar || type == rncintron || type == rncintronvar);
615}
616
617inline Boolean isNcExon(StateType type) {
618 return (ncsingle <= type && !isNcIntron(type));
619}
620
621inline Boolean isNc(StateType type) {
622 return (ncsingle <= type);
623}
624
625inline Boolean isExon(StateType type) {
626 return (isCodingExon(type) || is5UTRExon(type) || is3UTRExon(type) || isNcExon(type));
627}
628
629inline Boolean isCodingIntron(StateType type) {
630 return (lessD0 <= type && type <= longass2)
631 || (rlessD0 <= type && type <= rlongass2);
632}
633
634inline Boolean isIntron(StateType type) {
635 return (isCodingIntron(type) || is5UTRIntron(type) || is3UTRIntron(type)
636 || type == intron_type || type == rintron_type || isNcIntron(type));
637}
638
639inline Boolean isGeometricIntron(StateType type){
640 return (type==geometric0 || type==geometric1 || type==geometric2);
641}
642
643inline Boolean isRGeometricIntron(StateType type){
644 return (type==rgeometric0 || type==rgeometric1 || type==rgeometric2);
645}
646
647inline Boolean isLongAssIntron(StateType type){
648 return (type==longass0 || type== longass1 || type==longass2);
649}
650
651inline Boolean isRLongDssIntron(StateType type){
652 return (type==rlongdss0 || type== rlongdss1 || type==rlongdss2);
653}
654
655inline Boolean isLongDssIntron(StateType type){
656 return (type==longdss0 || type== longdss1 || type==longdss2);
657}
658
659inline Boolean isOnFStrand(StateType type){
660 return ((type >= singleG && type < rsingleG) || (type >= ncsingle && type < rncsingle));
661}
662
663inline StateType initialExon(int frame){
664 frame = mod3(frame);
665 if (frame == 0)
666 return initial0;
667 if (frame == 1)
668 return initial1;
669 if (frame == 2)
670 return initial2;
671 return TYPE_UNKNOWN;
672}
673
674inline StateType internalExon(int frame){
675 frame = mod3(frame);
676 if (frame == 0)
677 return internal0;
678 if (frame == 1)
679 return internal1;
680 if (frame == 2)
681 return internal2;
682 return TYPE_UNKNOWN;
683}
684
685inline StateType lessDIntron(int frame){
686 frame = mod3(frame);
687 if (frame == 0)
688 return lessD0;
689 if (frame == 1)
690 return lessD1;
691 if (frame == 2)
692 return lessD2;
693 return TYPE_UNKNOWN;
694}
695
696
697inline StateType geometricIntron(int frame){
698 frame = mod3(frame);
699 if (frame == 0)
700 return geometric0;
701 if (frame == 1)
702 return geometric1;
703 if (frame == 2)
704 return geometric2;
705 return TYPE_UNKNOWN;
706}
707
708inline StateType longdssIntron(int frame){
709 frame = mod3(frame);
710 if (frame == 0)
711 return longdss0;
712 if (frame == 1)
713 return longdss1;
714 if (frame == 2)
715 return longdss2;
716 return TYPE_UNKNOWN;
717}
718
719inline StateType longassIntron(int frame){
720 frame = mod3(frame);
721 if (frame == 0)
722 return longass0;
723 if (frame == 1)
724 return longass1;
725 if (frame == 2)
726 return longass2;
727 return TYPE_UNKNOWN;
728}
729
730inline StateType equalDIntron(int frame){
731 frame = mod3(frame);
732 if (frame == 0)
733 return equalD0;
734 if (frame == 1)
735 return equalD1;
736 if (frame == 2)
737 return equalD2;
738 return TYPE_UNKNOWN;
739}
740
741inline StateType rterminalExon(int frame){
742 frame = mod3(frame);
743 if (frame == 0)
744 return rterminal0;
745 if (frame == 1)
746 return rterminal1;
747 if (frame == 2)
748 return rterminal2;
749 return TYPE_UNKNOWN;
750}
751
752inline StateType rinternalExon(int frame){
753 frame = mod3(frame);
754 if (frame == 0)
755 return rinternal0;
756 if (frame == 1)
757 return rinternal1;
758 if (frame == 2)
759 return rinternal2;
760 return TYPE_UNKNOWN;
761}
762inline StateType rlongdssIntron(int frame){
763 frame = mod3(frame);
764 if (frame == 0)
765 return rlongdss0;
766 if (frame == 1)
767 return rlongdss1;
768 if (frame == 2)
769 return rlongdss2;
770 return TYPE_UNKNOWN;
771}
772
773inline StateType rlongassIntron(int frame){
774 frame = mod3(frame);
775 if (frame == 0)
776 return rlongass0;
777 if (frame == 1)
778 return rlongass1;
779 if (frame == 2)
780 return rlongass2;
781 return TYPE_UNKNOWN;
782}
783
784inline StateType rlessDIntron(int frame){
785 frame = mod3(frame);
786 if (frame == 0)
787 return rlessD0;
788 if (frame == 1)
789 return rlessD1;
790 if (frame == 2)
791 return rlessD2;
792 return TYPE_UNKNOWN;
793}
794
795inline StateType rgeometricIntron(int frame){
796 frame = mod3(frame);
797 if (frame == 0)
798 return rgeometric0;
799 if (frame == 1)
800 return rgeometric1;
801 if (frame == 2)
802 return rgeometric2;
803 return TYPE_UNKNOWN;
804}
805
806inline StateType requalDIntron(int frame){
807 frame = mod3(frame);
808 if (frame == 0)
809 return requalD0;
810 if (frame == 1)
811 return requalD1;
812 if (frame == 2)
813 return requalD2;
814 return TYPE_UNKNOWN;
815}
816
817int howOftenOccursIt(const char* haystack, const char* needle, const char* endhaystack=NULL);
818bool containsJustNonNucs(const char *dna, int dnalen);
819bool isNuc(const char *dna);
820
821/*
822 * expand the ~ to the $Home directory
823 */
824string expandHome(string filename);
825
826/*
827 * convert int to string
828 */
829inline string itoa(int n) {
830 ostringstream strm;
831 strm << n;
832 return strm.str();
833}
834
835/*
836 * convert double to string
837 */
838inline string ftoa(double n) {
839 ostringstream strm;
840 strm << n;
841 return strm.str();
842}
843
844/*
845 * copy string to newly allocated character array
846 */
847inline char* newstrcpy(const char* s, int len) {
848 char* result = new char[len + 1];
849 result[len] = '\0';
850 return strncpy(result, s, len);
851}
852inline char* newstrcpy(const char* s) {
853 int len = strlen(s);
854 char* result = new char[len + 1];
855 return strcpy(result, s);
856}
857
858inline char* newstrcpy(const string& s, int len) {
859 char* result = new char[len + 1];
860 s.copy(result, len);
861 result[len] = '\0';
862 return result;
863}
864inline char* newstrcpy(const string& s) {
865 return newstrcpy(s, s.length());
866}
867
868inline void strip_newlines(char* p) {
869 char* q = p;
870 while (p != 0 && *p != '\0') {
871 if (*p == '\n') {
872 p++;
873 *q = *p;
874 }
875 else {
876 *q++ = *p++;
877 }
878 }
879 *q = '\0';
880}
881
882char *getRandomDNA(int len);
883
884/* for 0 <= q <=1 get the q-th quantile of the values store in
885 * vector v.
886 */
887Double quantile(const vector<Double> &v, float q);
888int quantile(const vector<int> &v, float q);
889
890/* obtain a "hashtable" from a list of unique names, e.g.
891 * [human, mouse, dog] becomes
892 * human => 0, mouse => 1, dog => 2
893 * Mario: This could become an unordered_map (expected constant time) once we commit to C++11,
894 * rather than a logarithmic tree data structure
895 */
896map<string, size_t> *getMap (vector<string> names);
897
898/* Structure to store and accumulate a vector of doubles.
899 * Used for storing cumulative sums of log likelihoods for each rate matrix (e.g. for each omega)
900 * (optional: cumulative sum of number of substitutions)
901 * we create an object of cumValues for each bit_vector and reading frame combination
902 */
904 vector<double> logliks;
905 int numSubs;
906 int id;
907 int numSites; // number of codon alignment columns
908 vector<string> rows; // for debugging, temporary
909
910 cumValues(int i=0, int s=-1):
911 numSubs(s), id(i), numSites(0) {};
912
913 void addLogliks(vector<double>* ll){
914 if (logliks.size() == 0){
915 logliks.resize(ll->size(), 0.0);
916 }
917 for (unsigned u = 0; u < ll->size(); u++){
918 logliks[u] += (*ll)[u];
919 }
920 numSites++;
921 }
922 void addNumSubs(int subs){
923 numSubs += subs;
924 }
925 void addRows(vector<string> &cs) {
926 if (rows.size() == 0)
927 rows.assign(cs.size(), "");
928 for (unsigned i=0; i < cs.size(); i++){
929 rows[i].append(cs[i]);
930 }
931 }
932};
933
934/*
935 * functions used in earlier versions
936
937 *
938 * sort an array increasingly p from index a to index b (included).
939 */
940// void QuickSort(Double *p, int a, int b);
941
942/*
943 * compute the unbiased empirical standard deviation of a distribution given by an array of doubles of size n
944 * a distribution on the numbers {0,1, ..., n-1}
945 */
946// double variance(double *p, int n);
947
948/*
949 * compute the chi-square statistics for
950 * a multinomial distribution on the numbers {0,1, ..., r-1} given by p
951 * reference: uniform distribution
952 */
953// double chiSquareUniform(int *p, int n);
954
955// double chisquare(int a[4][4]);
956// int branchPointPosition(char *seq, int asspos);
957
958#endif // _TYPES_HH
Definition types.hh:304
Definition types.hh:480
This class implements a double object with a very large range.
Definition lldouble.hh:31
Definition types.hh:449
ProjectError(string msg)
Definition types.hh:455
string getMessage()
Definition types.hh:471
~ProjectError()
Definition types.hh:465
ProjectError()
Definition types.hh:460
Definition types.hh:425
Definition types.hh:903