PTreeGenerator  1.0
Simple phylogenetic tree generation from multiple sequence alignment.
 All Classes Namespaces Files Functions Variables
computation.py
Go to the documentation of this file.
1 ## @package computation
2 # Contains just the ptreegen::computation::Computation class.
3 #
4 
5 from Bio import AlignIO
6 from Bio.Alphabet import generic_protein, generic_dna, generic_rna
7 from Bio.Align import MultipleSeqAlignment
8 from Bio.SeqRecord import SeqRecord
9 
10 from enums import *
11 import distance_functions as dfuncs
12 from neighbor_joining import NeighborJoining
13 from parsimony import LargeParsimony
14 import visualization
15 
16 
17 
18 
19 
20 ##
21 # Parses user specified options and delegetes
22 # appropriate actions to other modules. Also serves as
23 # a data storage of computed results.
25 
26  ##
27  # Constructor initializes the object variables
28  # and calls other modules to do so.
29  #
30  # @param options computation options in the form of a dictionary-like object
31  def __init__(self, options):
32  self.algorithm = None
33  self.gapPenalty = None
34  self.includeGaps = None
35  self.removePoor = None
36  self.gapCutoff = None
37  self.pairCutoff = None
38  self.seqType = None
39  self.distFunction = None
40  self.parsIterCnt = None
41  self.options = options
42  self.parseOptions(self.options)
44  self.checkAlignment()
45  self.distanceMatrix = None
46  self.tree = self.computeTree()
48 
49  ##
50  # Parses the options passed to the constructor
51  #
52  # @param options computation options in the form of a dictionary-like object
53  def parseOptions(self, options):
54  self.algorithm = options["method"]
55  if self.algorithm not in (TreeBuildAlgorithms.NJ, TreeBuildAlgorithms.PARSIMONY):
56  raise RuntimeError("Unknown method: " + self.algorithm)
57  self.parsIterCnt = options["pars_tree_count"]
58  self.gapPenalty = options["gap_penalty"]
59  if self.gapPenalty < 0 or self.gapPenalty > 1:
60  raise RuntimeError("Bad gap penalty value. Must be between 0 and 1. Got: " + self.gapPenalty)
61  self.includeGaps = not options["no_gaps"]
62  self.removePoor = not options["no_cleaning"]
63  self.gapCutoff = options["gap_cutoff"]
64  if self.gapCutoff < 0 or self.gapCutoff > 1:
65  raise RuntimeError("Bad gap cutoff value. Must be between 0 and 1. Got: " + self.gapCutoff)
66  self.pairCutoff = options["pair_cutoff"]
67  self.seqType = options["sequence_type"]
68  if self.seqType == SeqTypes.AA:
69  self.alignment = AlignIO.read(options["alignment_file"], "fasta", alphabet=generic_protein)
70  elif self.seqType == SeqTypes.DNA:
71  self.alignment = AlignIO.read(options["alignment_file"], "fasta", alphabet=generic_dna)
72  elif self.seqType == SeqTypes.RNA:
73  self.alignment = AlignIO.read(options["alignment_file"], "fasta", alphabet=generic_rna)
74  else:
75  raise RuntimeError("Unknown sequence type: " + self.seqType)
76  if options["dist_measure"] == DistMeasures.P_DISTANCE:
77  self.distFunction = dfuncs.p_distance
78  elif options["dist_measure"] == DistMeasures.POISSON_CORRECTED:
79  self.distFunction = dfuncs.poisson_corrected
80  elif options["dist_measure"] == DistMeasures.JUKES_CANTOR:
81  self.distFunction = dfuncs.jukes_cantor
82  else:
83  raise RuntimeError("Unknown distance measure: " + options["dist_measure"])
84 
85  ##
86  # Checks the input alignment for duplicate
87  # taxa id strings.
88  #
89  def checkAlignment(self):
90  found = set()
91  for x in self.alignment:
92  if x.id in found:
93  raise RuntimeError("Duplicate taxa identification strings found: " + x.id)
94  else:
95  found.add(x.id)
96 
97  ##
98  # Method that delegates tree computation
99  # to the appropriate module.
100  #
101  # @return reference to the computed tree object
102  def computeTree(self):
103  if self.algorithm == TreeBuildAlgorithms.NJ:
105  return NeighborJoining(self.distanceMatrix, self.alignment).tree
106  elif self.algorithm == TreeBuildAlgorithms.PARSIMONY:
107  return LargeParsimony(self.alignment, steps=self.parsIterCnt).tree
108  else:
109  raise RuntimeError(self.algorithm + " not implemented.")
110 
111  ##
112  # Method that can be used to update the results,
113  # if the Computation class changes.
114  #
115  def update(self):
116  self.alignment = self.cleanAlignment(self.alignment)
118  self.tree = self.computeTree()
119 
120  ##
121  # Computes the distance matrix from the alignment.
122  #
123  # @param alignment the mutliple sequence alignment instance
124  # @param distFunction the distance measure used, can be one
125  # of the functions in ptreegen::distance_functions.
126  # @return distance matrix as a tuple
127  def computeDistanceMatrix(self, alignment, distFunction):
128  dist_matrix = []
129  for i,record_i in enumerate(alignment):
130  seq_i = record_i.seq
131  distances = []
132  for j,record_j in enumerate(alignment):
133  if j > i:
134  seq_j = record_j.seq
135  distances.append(distFunction(seq_i, seq_j, **self.options))
136  elif i == j:
137  distances.append(0)
138  else:
139  distances.append(dist_matrix[j][i])
140  dist_matrix.append(tuple(distances))
141  return tuple(dist_matrix)
142 
143  ##
144  # Method responsible for the correct alignment cleaning.
145  #
146  # It removes badly conserved regions and/or regions with too many gaps,
147  # if requested by the user.
148  #
149  # @param alignment the mutliple sequence alignment instance
150  def cleanAlignment(self, alignment):
151  to_remove = set()
152  for col_idx in range(alignment.get_alignment_length()):
153  column = alignment[:,col_idx]
154 
155  # removal of columns with gaps
156  if not self.includeGaps:
157  if column.find("-") != -1:
158  to_remove.add(col_idx)
159 
160  # poorly conserved regions removal
161  if self.removePoor and (col_idx not in to_remove):
162  # remove column if it contains too many gaps
163  nongap_ratio = float(len(column) - column.count("-")) / len(column)
164  if nongap_ratio < self.gapCutoff:
165  to_remove.add(col_idx)
166  continue
167 
168  # remove column if there are not enough identical residues
169  all_pairs = (len(column) - column.count("-")) * ((len(column) - column.count("-")) - 1) / 2.0
170  identical_pairs = 0
171  for res in set(column):
172  if res != "-":
173  res_count = column.count(res)
174  if res_count > 1:
175  identical_pairs += res_count * (res_count - 1) / 2.0
176  identical_pairs_ratio = identical_pairs / all_pairs
177  if identical_pairs_ratio < self.pairCutoff:
178  to_remove.add(col_idx)
179  continue
180 
181  cleaned_alignment = MultipleSeqAlignment([])
182  for record in alignment:
183  seq = record.seq.tomutable()
184  counter = 0
185  for idx in to_remove:
186  seq.pop(idx - counter)
187  counter+=1
188  cleaned_alignment.append(SeqRecord(seq.toseq(), id=record.id))
189 
190  return cleaned_alignment
191 
192  def showResults(self):
193  self.visualization.show()
194 
195  ##
196  # @var algorithm
197  # The methodology used to build the tree as one of those in ptreegen::enums.
198  # @var gapPenalty
199  # Cost of gaps when they are included in the distance computation.
200  # @var includeGaps
201  # Specifies if columns with gaps should be left in the alignment or deleted.
202  # @var removePoor
203  # Specifies if poorly conserved regions should be removed from the alignment.
204  # @var gapCutoff
205  # If at least x% in column are not gaps, then the column is left in the alignment.
206  # @var pairCutoff
207  # If at least x% of pairs in column are identical, then the column is left in the alignment.
208  # @var seqType
209  # The type of the input sequence as one of ptreegen::enums.
210  # @var distFunction
211  # Pointer to function used to compute distances between a two sequences.
212  # @var options
213  # Reference to the dictionary like object.
214  # @var alignment
215  # Reference to an object representing the multiple sequence alignment.
216  # @var distanceMatrix
217  # The distance matrix for the alignment.
218  # @var tree
219  # The generated tree.