PTreeGenerator  1.0
Simple phylogenetic tree generation from multiple sequence alignment.
 All Classes Namespaces Files Functions Variables
distance_matrix.py
Go to the documentation of this file.
1 ## @package distance_matrix
2 # Contains just the ptreegen::distance_matrix::DistanceMatrix class.
3 #
4 
5 import numpy as np
6 
7 ##
8 # Basically a wrapper around a numpy array object
9 # representing the alignment distance matrix.
10 #
11 # Also provides some additional operations useful for tree building.
12 #
14 
15  ##
16  # Takes any "matrix-like" object and tries to convert it to a numpy array.
17  #
18  # @param matrix a "matrix-like" object
19  # @param names optional parameter with column and row names (the taxa names)
20  #
def __init__(self, matrix, names=None):
    """Wrap a square "matrix-like" object as a float distance matrix.

    Args:
        matrix: any object ``np.array`` can convert to a two-dimensional,
            square array of floats.
        names: optional iterable with one name per row/column (the taxa
            names).  When omitted or empty, the names ``AUTOGEN_1``,
            ``AUTOGEN_2``, ... are generated.

    Raises:
        AssertionError: if the matrix is not two-dimensional and square,
            or if the number of names differs from the matrix size.
    """
    self._distMatrix = np.array(matrix, float)

    # Raised explicitly (instead of via ``assert``) so the checks are not
    # stripped when Python runs with -O; the exception type is unchanged.
    if len(self._distMatrix.shape) != 2:
        raise AssertionError("distance matrix must be two-dimensional")
    row_count, col_count = self._distMatrix.shape
    if row_count != col_count:
        raise AssertionError("distance matrix must be square")

    # Materialise the names before truth-testing them: ``not names`` on a
    # numpy array with more than one element raises ValueError.
    labels = [] if names is None else list(names)
    if not labels:
        labels = ["AUTOGEN_" + str(i + 1) for i in range(row_count)]
    self._columnNames = labels

    if len(self._columnNames) != row_count:
        raise AssertionError(
            "number of names does not match the matrix size")
35 
36  ##
37  # A getter for the matrix size (number of columns/taxa).
38  #
@property
def size(self):
    """Number of rows (and therefore columns/taxa) in the matrix."""
    row_count = self._distMatrix.shape[0]
    return row_count
42 
43  ##
44  # A getter for a copy of the whole distance matrix.
45  #
@property
def distMatrix(self):
    """Independent float copy of the whole distance matrix."""
    # np.array copies its input by default, so callers cannot modify
    # the internal matrix through the returned object.
    snapshot = np.array(self._distMatrix, dtype=float)
    return snapshot
49 
50  ##
51  # A getter for a list of column/taxa names.
52  #
@property
def columnNames(self):
    """New list holding the column/taxa names in matrix order."""
    names_copy = list(self._columnNames)
    return names_copy
56 
57  ##
58  # Returns the separation value used in the Neighbor-joining algorithm.
59  #
60  # It can be computed for one sequence only (parameter name)
61  # or for all sequences (no parameter).
62  #
63  # The separation value is computed as follows:
64  # <em>sum(d_ik) / (L - 2)</em>, where <em>sum(d_ik)</em> is the
65  # sum of distances from one sequence to
66  # all the other sequences and L is the total number of sequences.
67  #
68  # @param name identification of one sequence
69  # @return returns separation values for all sequences as a list
70  # or one value for one sequence with the specified name
def getSeparation(self, name=None):
    """Return the Neighbor-joining separation value(s).

    The separation of a sequence is ``sum(d_ik) / (L - 2)``: the sum of
    its row of the distance matrix divided by the number of sequences
    minus two.

    Args:
        name: identification of one sequence, or None (the default) to
            compute the value for every sequence.

    Returns:
        A single number when ``name`` is given, otherwise a numpy array
        with one value per sequence, in matrix order.

    Raises:
        ValueError: if ``name`` is not one of the column names.

    Note:
        With exactly two sequences the divisor ``L - 2`` is zero; that
        case is not handled here.
    """
    if name is None:
        # Row sums, so that entry i always equals the value returned for
        # the i-th name by the branch below (for a symmetric matrix row
        # and column sums coincide anyway).
        dist_sum = self._distMatrix.sum(axis=1)
    else:
        # Compare against None rather than truthiness so that falsy
        # names such as "" or 0 are still looked up.
        idx = self._columnNames.index(name)
        dist_sum = self._distMatrix[idx].sum()
    return dist_sum / (self.size - 2)
79 
80  ##
81  # Finds the pair of nearest sequences.
82  #
83  # Finds the pair of closest sequences according to the rule
84  # from the Neighbor-Joining algorithm.
85  #
86  # @return tuple of size two that contains the names of two nearest sequences
def getNearestNeigbors(self):
    """Find the pair of sequences that Neighbor-joining should join next.

    For every pair ``i < j`` the objective value
    ``d_ij - separation_i - separation_j`` is computed, using the values
    from ``self.getSeparation()``; the pair with the smallest value wins.
    On ties the first pair in row-major order is kept.

    Returns:
        A tuple with the names of the two nearest sequences, or an empty
        tuple when the matrix holds fewer than two sequences.
    """
    separation = self.getSeparation()
    count = self.size

    min_obj_value = None
    nearest_nbrs = tuple()
    for i in range(count):
        # Only the upper triangle is scanned: each pair is visited once.
        for j in range(i + 1, count):
            obj_value = self._distMatrix[i, j] - separation[i] - separation[j]
            # "is None" rather than truthiness: a current minimum of
            # exactly 0.0 is a valid value and must not be overwritten
            # by a larger one.
            if min_obj_value is None or obj_value < min_obj_value:
                min_obj_value = obj_value
                nearest_nbrs = (self._columnNames[i], self._columnNames[j])
    return nearest_nbrs
99 
100  ##
101  # Returns the distance from one sequence to another.
102  #
103  # Based on the value from the distance matrix.
104  #
105  # @return one single number
def getDistance(self, name_i, name_j):
    """Look up the matrix entry for the pair (name_i, name_j).

    Args:
        name_i: name selecting the row.
        name_j: name selecting the column.

    Returns:
        The single matrix value at that row and column.
    """
    labels = self._columnNames
    row = labels.index(name_i)
    col = labels.index(name_j)
    return self._distMatrix[row, col]
108 
109  ##
110  # Finds the position of a sequence in the distance matrix.
111  #
112  # @param name the identification of the sequence
113  # @return index of a column in the matrix as a single number
def getIdx(self, name):
    """Return the row/column index at which ``name`` is stored.

    Raises ValueError (from ``list.index``) when the name is unknown.
    """
    position = self._columnNames.index(name)
    return position
116 
117  ##
118  # Finds the name of a sequence based on its position in the matrix.
119  #
120  # @param idx the position in the matrix
121  # @return the identification of the sequence as a string
def getName(self, idx):
    """Return the sequence name stored at position ``idx``.

    Raises IndexError (from list indexing) when ``idx`` is out of range.
    """
    label = self._columnNames[idx]
    return label
124 
125  ##
126  # Removes rows and columns for the specified sequences.
127  #
128  # @param names the identifications of the sequences as an iterable
def removeData(self, names):
    """Remove the rows and columns of the given sequences.

    All names are resolved before anything is modified, so an unknown
    name leaves both the matrix and the name list untouched.

    Args:
        names: iterable with the identifications of the sequences to
            remove.

    Raises:
        ValueError: if a name is not (or is no longer) among the column
            names; nothing is removed in that case.
    """
    # Work on copies.  ``positions`` runs parallel to ``labels`` and maps
    # each remaining label back to its index in the current matrix, so
    # lookups behave exactly like sequential removal would.
    labels = list(self._columnNames)
    positions = list(range(len(labels)))
    removed = []
    for name in names:
        idx = labels.index(name)
        labels.pop(idx)
        removed.append(positions.pop(idx))

    if not removed:
        return

    # One deletion per axis instead of two array copies per name.
    trimmed = np.delete(self._distMatrix, removed, axis=0)
    self._distMatrix = np.delete(trimmed, removed, axis=1)
    # Update the existing list object in place rather than rebinding it.
    self._columnNames[:] = labels
135 
136  ##
137  # Adds a row and a column for the specified sequence.
138  #
139  # @param name the identification of the sequence
140  # @param data data to be appended as an iterable
def appendData(self, data, name):
    """Grow the matrix by one sequence.

    Args:
        data: iterable with the distances from the new sequence to each
            sequence already in the matrix, in matrix order.
        name: the identification of the new sequence.
    """
    # New bottom row: the supplied distances as a 1 x n float array.
    bottom_row = np.array([data], float)
    self._distMatrix = np.append(self._distMatrix, bottom_row, axis=0)

    # New right-hand column: the same distances followed by a zero for
    # the new sequence's own diagonal cell.
    column_cells = [[value] for value in data]
    column_cells.append([0])
    right_column = np.array(column_cells, float)
    self._distMatrix = np.append(self._distMatrix, right_column, axis=1)

    self._columnNames.append(name)
151 
152  ##
153  # @var _distMatrix
154  # Distance matrix as a numpy array object.
155  # @var _columnNames
156  # List of column names (the identification strings of the sequences).
157  #