PTreeGenerator  1.0
Simple phylogenetic tree generation from multiple sequence alignment.
•All Classes Namespaces Files Functions Variables
parsimony.py
Go to the documentation of this file.
1 ## @package parsimony
2 # Contains two classes (ptreegen::parsimony::SmallParsimony
3 # and ptreegen::parsimony::LargeParsimony) that implement
4 # the basic steps of the parsimony approach to tree building.
5 #
6 
7 from random import shuffle
8 
9 from ete2 import Tree
10 from Bio.Align import MultipleSeqAlignment
11 
12 from utilities import *
13 
14 ##
15 # Represents a solution to the small parsimony problem
16 # (tree is known and we are intersted
17 # in the parsimony score of the tree).
18 #
19 # It implements the Fitch's algorithm to score the tree,
20 # therefore the input tree should be a rooted binary tree,
21 # even though this implementation is extended to work with any
22 # type of tree.
23 #
24 # \sa LargeParsimony
25 #
27 
28  ##
29  # Constructor takes tree and a corresponding alignment as input
30  # and saves references to them as SmallParsimony::_tree
31  # and SmallParsimony::_alignment.
32  #
33  # @param tree the tree to be scored
34  # @param alignment alignment corresponding to the input tree
35  def __init__(self, tree, alignment):
36  self._tree = tree
37  self._alignment = alignment
38  self._sequencesDict = {x.id : x.seq for x in list(alignment)}
39  leaf_names = set(tree.get_leaf_names())
40  if len(self._sequencesDict) != len(leaf_names):
41  raise RuntimeError("Number of sequnces in alignment:\n" + repr(alignment)
42  + "\ndoes not match the number of leaves in tree:\n" + repr(tree))
43  for seq_id in self._sequencesDict:
44  if seq_id not in leaf_names:
45  raise RuntimeError("Sequence ID (" + seq_id
46  + ") does not match any leaf in tree:\n" + repr(tree))
47 
48  self._treeCharacterDict = dict()
49  self._cost = float('inf')
50 
51  ##
52  # A getter for the cost value. Calls the
53  # SmallParsimony::_solve() method to compute it.
54  #
55  # @return parsimony score as a single value
56  @property
57  def cost(self):
58  self._cost = 0
59  self._solve()
60  return self._cost
61 
62  ##
63  # Iteraterates over each column of the alignment
64  # and computes the parsimony score for each.
65  # It calls the SmallParsimony::_assign() method
66  # to score each column and add the score to
67  # the SmallParsimony::_cost member.
68  #
69  def _solve(self):
70  for col_idx in range(self._alignment.get_alignment_length()):
71  self._treeCharacterDict = dict()
72  self._assign(self._tree, col_idx)
73 
74  ##
75  # Recursive method implementing the Fitch's algorithm.
76  # It traverses the tree from an arbitrary node (preferably root)
77  # and computes the overall parsimony score.
78  #
79  # @param node the node to start traversing the tree from (preferably root)
80  # @param site_idx index of the column in the alignment that is being processed
81  def _assign(self, node, site_idx):
82  if node.is_leaf():
83  self._treeCharacterDict[node] = set(self._sequencesDict[node.name][site_idx])
84  else:
85  for child in node.children:
86  self._assign(child, site_idx)
87 
88  character_set = set(self._treeCharacterDict[node.children[0]])
89  for child in node.children:
90  character_set.intersection_update(self._treeCharacterDict[child])
91 
92  if character_set:
93  self._treeCharacterDict[node] = character_set
94  else:
95  for child in node.children:
96  character_set.update(self._treeCharacterDict[child])
97  self._treeCharacterDict[node] = character_set
98  self._cost += 1
99  ##
100  # @var _tree
101  # Refernce to the tree being scored.
102  # @var _alignment
103  # Refernce to the coresponding alignment.
104  # @var _sequencesDict
105  # Associates sequence id string with a sequence instance.
106  # @var _treeCharacterDict
107  # A dictionary that stores the character sets asigned to each node
108  # everytime the SmallParsimony::_assign() method is executed
109  # for each column of the alignment.
110  # @var _cost
111  # Variable to store the intermediate results.
112  #
113 
114 ##
115 # Represents a solution to the large parsimony problem
116 # (the tree toplogy is unknown as oposed to the
117 # small parsimony problem).
118 #
119 # It implements the quartet puzzling heuristic to find
120 # a set of possible tree topologies and it then builds
121 # a consensus tree employing the 50% majority rule
122 # (implemented in ptreegen::utilities::tree_utils::findConsensusTree()).
123 #
124 # For more information on the quartet puzzling algorithm see: <br />
125 # Strimmer, Korbinian, and Arndt Von Haeseler.
126 # <em>Quartet puzzling: a quartet maximum-likelihood method
127 # for reconstructing tree topologies.</em>
128 # Molecular Biology and Evolution 13.7 (1996): 964-969.<br />
129 # or the appropriate chapter in: <br />
130 # Bockenhauer, H.; Bongartz, D. <em>Algorithmic Aspects of
131 # Bioinformatics</em>, 1st ed.; Springer: Berlin, 1998.
132 #
133 # \sa SmallParsimony
134 # \sa LargeParsimony::quartetPuzzling()
135 #
137 
138  ##
139  # Initializes the members used during the computation.
140  # It also generates the optimal quartets by first
141  # computing the parsimony cost of all possible unique quartet
142  # combinations and then selecting the quartet with minimum
143  # parsimony cost for every four taxa.
144  #
145  # @param alignment multiple sequence alignment
146  # @param steps number of possible tree topologies that will be generated
147  def __init__(self, alignment, steps=1000):
148  self._qpSteps = int(steps)
149  self._alignment = alignment
150  self._sequencesDict = {x.id : x.seq for x in list(self._alignment)}
151  quartet_combinations = []
152  for combination in combination_utils.uniqueCombinationsGenerator(self._sequencesDict.keys(), 4):
153  quartet_combinations.append(combination)
154  self._optimalQuartets = self.getOptimalQuartets(quartet_combinations)
155  self._tree = None
156 
157  ##
158  # A getter for the obtained consensus tree.
159  #
160  # @return reference to the tree instance
161  @property
162  def tree(self):
163  if self._tree:
164  return self._tree
165  else:
166  self._tree = self.quartetPuzzling(self._qpSteps)
167  self._tree.get_tree_root().unroot()
168  return self._tree
169 
170  ##
171  # A getter for the parsimony score of the
172  # computed tree.
173  #
174  # @return parsimony cost of the tree as a single value
175  @property
176  def cost(self):
177  return SmallParsimony(self.tree, self._alignment).cost
178 
179  ##
180  # Iteratively applies the quartet puzzling
181  # heuristic to obtain intermediate tree topologies.
182  #
183  # Implemented as described in: <br />
184  # Bockenhauer, H.; Bongartz, D. <em>Algorithmic Aspects of
185  # Bioinformatics</em>, 1st ed.; Springer: Berlin, 1998.
186  #
187  # @param steps number of puzzling steps (equals the number of intermediate
188  # tree topologies)
189  # @return consensus tree according to the 50% majority rule
190  def quartetPuzzling(self, steps):
191  seq_ids = self._sequencesDict.keys()
192  if len(seq_ids) < 4:
193  tree = Tree()
194  for seq_id in seq_ids:
195  tree.add_child(name=seq_id)
196  return tree
197 
198  trees = []
199  for step in range(steps):
200  shuffle(seq_ids)
201  first_quartet = self._optimalQuartets[self.getQuartetID(seq_ids[0:4])]["topology"]
202  rooted_tree = self.treeFromQuartet(first_quartet)
203  tree = rooted_tree.children[0]
204  tree.add_child(rooted_tree.children[1])
205  # tree.show()
206 
207  for i in range(4,len(seq_ids)):
208  tree_utils.initEdgeLengths(tree, 0)
209 
210  quartets = []
211  for triplet in combination_utils.combinationsGenerator(seq_ids[0:i], 3):
212  triplet.append(seq_ids[i])
213  quartets.append(tuple(triplet))
214 
215  qt_topos_found = set()
216  for quartet in quartets:
217  optimal_qt_topo_id = self._optimalQuartets[self.getQuartetID(quartet)]["topology_id"]
218  qt_topo_id = self.getTopologyID(quartet)
219  if qt_topo_id == optimal_qt_topo_id and qt_topo_id not in qt_topos_found:
220  qt_topos_found.add(qt_topo_id)
221  self.increaseCostOnPath(tree, quartet[0], quartet[1])
222 
223  # choose edge with minimum cost, delete it and add new leaf seq_ids[i]
224  shortest_edge = tree_utils.findShortestEdge(tree)
225  # new_node = Tree(name=shortest_edge[0].name + "_" + shortest_edge[1].name)
226  new_node = Tree()
227  new_node.add_child(name=seq_ids[i])
228  detached = shortest_edge[1].detach()
229  shortest_edge[0].add_child(new_node)
230  new_node.add_child(detached)
231  # tree.show()
232 
233  tree_utils.initEdgeLengths(tree, 1)
234  trees.append(tree)
235 
236  # find consensus tree
237  return tree_utils.findConsensusTree(trees)
238 
239  ##
240  # Finds the optimal topologies for every four taxa.
241  #
242  # @param quartets all possible unique combinations of taxa of size four
243  # @return a dictionary that associates every for taxa with the optimal topology
244  def getOptimalQuartets(self, quartets):
245  optimal_quartets = dict()
246  for quartet in quartets:
247  quartet_id = self.getQuartetID(quartet)
248  assert quartet_id not in optimal_quartets
249 
250  trees = {tuple(quartet) : self.treeFromQuartet(quartet)}
251  for i in range(0,2):
252  temp = quartet[i]
253  quartet[i] = quartet[2]
254  quartet[2] = temp
255  trees[tuple(quartet)] = self.treeFromQuartet(quartet)
256 
257  min_cost = float("inf")
258  for quartet_key, tree in trees.iteritems():
259  alignment = MultipleSeqAlignment([])
260  for record in self._alignment:
261  if record.id in quartet:
262  alignment.append(record)
263  small_parsimony = SmallParsimony(tree, alignment)
264  if small_parsimony.cost < min_cost:
265  min_cost = small_parsimony.cost
266  optimal_quartets[quartet_id] = {
267  "topology" : quartet_key
268  , "topology_id" : self.getTopologyID(quartet_key)
269  }
270  return optimal_quartets
271 
272  ##
273  # Provides a unique identification
274  # of the quartet by sorting the
275  # names of the taxa in the quartet
276  # alphabetically and then concatenating
277  # their identification strings, hence
278  # allowing quartets to be identified with their
279  # respective taxa.
280  #
281  # Note that the taxa identification
282  # strings should be unique in order
283  # to eliminate collisions.
284  #
285  # @param quartet quartet to be identified
286  @staticmethod
287  def getQuartetID(quartet):
288  return "".join(sorted(quartet))
289 
290  ##
291  # Provides a string that uniquely
292  # identifies a quartet topology.
293  #
294  # Quartet topology does not depend
295  # on the order of the first two and
296  # the last two elements in the quartet.
297  # This method is needed to distinguish
298  # two quartets of the same four taxa
299  # that have different topologies.
300  #
301  # It also allows to have the first two
302  # and last two elements of the quartet
303  # in an arbitrary order.
304  #
305  # @param quartet quartet to be identified
306  @staticmethod
307  def getTopologyID(quartet):
308  return LargeParsimony.getQuartetID(quartet[0:2]) + LargeParsimony.getQuartetID(quartet[2:4])
309 
310  ##
311  # Constructs a tree from the provided quartet.
312  # The tree is rooted so that it can be used
313  # in the Fitch's algorithm for solving the small
314  # parsimony problem.
315  #
316  # @param quartet quartet to be transformed
317  # @return a reference to the root of the tree
318  @staticmethod
319  def treeFromQuartet(quartet):
320  root = Tree()
321  root.name = "root"
322  left = root.add_child(name="Left")
323  left.add_child(name=quartet[0])
324  left.add_child(name=quartet[1])
325  right = root.add_child(name="Right")
326  right.add_child(name=quartet[2])
327  right.add_child(name=quartet[3])
328  for desc in root.iter_descendants():
329  desc.dist = 0
330  return root
331 
332  ##
333  # Finds a path between two vertices
334  # and then increases the weight of all edges
335  # on that path by one.
336  #
337  # @param tree reference to any vertex of the processed tree
338  # @param start reference to the start node
339  # @param dest reference to the end node
340  @staticmethod
341  def increaseCostOnPath(tree, start, dest):
342  start_node = tree.search_nodes(name=start)[0]
343  end_node = tree.search_nodes(name=dest)[0]
344  node_from = dict()
345  to_visit = set()
346  visited = set()
347 
348  to_visit.add(start_node)
349  node_from[start_node] = None
350  path_found = False
351  while to_visit and not path_found:
352  current = to_visit.pop()
353  neighbors = set(current.children)
354  if current.up:
355  neighbors.add(current.up)
356  for neighbor in neighbors:
357  if neighbor not in visited:
358  node_from[neighbor] = current
359  if neighbor == end_node:
360  path_found = True
361  break
362  else:
363  to_visit.add(neighbor)
364  visited.add(current)
365 
366  node = end_node
367  while node:
368  if node.name != "Left":
369  node.dist += 1
370  node = node_from[node]
371 
372  ##
373  # @var _qpSteps
374  # The number of quartet puzzling steps to take.
375  # @var _alignment
376  # Reference to the multiple sequence alignment instance
377  # @var _sequencesDict
378  # Associates sequence id string with a sequence instance.
379  # @var _optimalQuartets
380  # Associates every four taxa with their optimal quartet.
381  # @var _tree
382  # Reference to the tree that was computed in LargeParsimony::quartetPuzzling().