PTreeGenerator  1.0
Simple phylogenetic tree generation from multiple sequence alignment.
 All Classes Namespaces Files Functions Variables
distance_functions.py
Go to the documentation of this file.
1 ## @package distance_functions
2 # Contains implementations of possible distance measures.
3 #
4 
5 import math
6 
7 from enums import SeqTypes
8 
9 ##
10 # Computation of the uncorrected p-distance.
11 #
12 # The formula is similar to the one used in EMBOSS
13 # (see http://emboss.sourceforge.net/apps/release/6.6/emboss/apps/distmat.html).
14 #
15 # @param seq1 first sequence
16 # @param seq2 second sequence
17 # @param *args positional arguments
18 # @param **kwargs keyword arguments
19 # (the "gap_penalty" argument is used to determine the gap penalty)
20 #
21 # @return distance as a single value
22 def p_distance(seq1, seq2, *args, **kwargs):
23  assert len(seq1) == len(seq2)
24  gap_penalty = 0
25  if kwargs.has_key("gap_penalty"):
26  gap_penalty = kwargs["gap_penalty"]
27  positions_all = len(seq1)
28  matches = 0
29  gaps = 0
30  for idx in range(positions_all):
31  res1 = seq1[idx]
32  res2 = seq2[idx]
33  if res1 == res2 and not(res1 == "-" and res2 == "-"):
34  matches+=1
35  elif res1 == "-" or res2 == "-" and not(res1 == "-" and res2 == "-"):
36  gaps+=1
37  return 1 - float(matches) / ((positions_all - gaps) + gaps * gap_penalty)
38 
39 ##
40 # Poisson corrected p-distance.
41 #
42 # For more info see: http://goo.gl/upr3wR
43 #
44 # @param seq1 first sequence
45 # @param seq2 second sequence
46 # @param *args positional arguments
47 # @param **kwargs keyword arguments
48 #
49 # @return distance as a single value
50 def poisson_corrected(seq1, seq2, *args, **kwargs):
51  p_dist = p_distance(seq1, seq2, *args, **kwargs)
52  return -1 * math.log(1 - p_dist)
53 
54 ##
55 # Distance according to the Jukes-Cantor model.
56 #
57 # For more info see: http://goo.gl/upr3wR
58 #
59 # @param seq1 first sequence
60 # @param seq2 second sequence
61 # @param *args positional arguments
62 # @param **kwargs keyword arguments
63 # ("sequence_type" option is used to determine the parameters for the formula)
64 #
65 # @return distance as a single value
66 def jukes_cantor(seq1, seq2, *args, **kwargs):
67  p_dist = p_distance(seq1, seq2, *args, **kwargs)
68  if kwargs["sequence_type"] == SeqTypes.AA:
69  return (-19.0/20.0) * math.log(1 - (20.0/19.0) * p_dist)
70  elif kwargs["sequence_type"] == SeqTypes.DNA or kwargs["sequence_type"] == SeqTypes.RNA:
71  return (-3.0/4.0) * math.log(1 - (4.0/3.0) * p_dist)
72  else:
73  assert False