PTreeGenerator  1.0
Simple phylogenetic tree generation from multiple sequence alignment.
 All Classes Namespaces Files Functions Variables
neighbor_joining.py
Go to the documentation of this file.
1 ## @package neighbor_joining
2 # Contains just the ptreegen::neighbor_joining::NeighborJoining class
3 #
4 
5 from ete2 import Tree
6 
7 from distance_matrix import DistanceMatrix
8 
9 ##
10 # Contains an implementation of the Neighbor-Joining algorithm.
11 #
12 # The computed tree is stored as an internal member NeighborJoining::tree.
13 #
15 
16  ##
17  # The constructor just saves the data for the execution.
18  #
19  # @param distMatrtix the distance matrix
20  # @param alignment should be specified when the names parameter is not present
21  # @param names the names of the taxa in the distance matrix
22  def __init__(self, distMatrtix, alignment=None, names=None):
23  self._alignment = alignment
24  self._names = names
25  if self._alignment:
26  self._distMatrix = DistanceMatrix(distMatrtix, [x.id for x in self._alignment])
27  elif self._names:
28  self._distMatrix = DistanceMatrix(distMatrtix, self._names)
29  else:
30  raise RuntimeError("You must pass either an alignment or a list of names.")
31 
32  ##
33  # Neighbor-Joining implementation.
34  #
35  # @return constructed tree with edges weighted by distances
36  @property
37  def tree(self):
38  L = self._distMatrix.columnNames
39  tree = Tree()
40  tree.name = "root"
41  tree.dist = 0
42  for seq in L:
43  tree.add_child(name=seq, dist=0)
44 
45  iter_count = 1
46  while len(L) > 2:
47  nearest_nbs = self._distMatrix.getNearestNeigbors()
48  node_i = tree.search_nodes(name=nearest_nbs[0])[0]
49  node_j = tree.search_nodes(name=nearest_nbs[1])[0]
50  L.remove(nearest_nbs[0])
51  L.remove(nearest_nbs[1])
52 
53  node_k = Tree()
54  node_k.dist = 0
55  node_k.name = "X" + str(iter_count)
56  d_ij = self._distMatrix.getDistance(node_i.name, node_j.name)
57  assert d_ij > 0
58  d_ik = 0.5 * d_ij + 0.5 * (self._distMatrix.getSeparation(node_i.name) - self._distMatrix.getSeparation(node_j.name))
59  d_jk = 0.5 * d_ij + 0.5 * (self._distMatrix.getSeparation(node_j.name) - self._distMatrix.getSeparation(node_i.name))
60 
61  tree.remove_child(node_i)
62  tree.remove_child(node_j)
63  node_k.add_child(node_i, dist=d_ik)
64  node_k.add_child(node_j, dist=d_jk)
65  tree.add_child(node_k)
66 
67  d_km = []
68  for node_m in L:
69  d_km.append(0.5 * (self._distMatrix.getDistance(node_i.name, node_m) + self._distMatrix.getDistance(node_j.name, node_m) - d_ij) )
70  assert d_km > 0
71 
72  self._distMatrix.removeData((node_i.name, node_j.name))
73  self._distMatrix.appendData(d_km, node_k.name)
74 
75  iter_count+=1
76  L = self._distMatrix.columnNames
77 
78  last_nodes = tree.get_children()
79  d_ij = self._distMatrix.getDistance(last_nodes[0].name, last_nodes[1].name)
80  leaf = None
81  new_root = None
82  for node in last_nodes:
83  if node.is_leaf():
84  node.dist = d_ij
85  leaf = node.detach()
86  else:
87  new_root = node.detach()
88  if not leaf:
89  leaf = last_nodes[0]
90  leaf.dist = d_ij
91  new_root.add_child(leaf)
92 
93  return new_root
94 
95  ##
96  # @var _distMatrix
97  # the distance matrix in more or less arbitrary form
98  # @var _names
99  # taxa identification strings
100  # @var _alignment
101  # multiple sequence alignment