5 from Bio
import AlignIO
6 from Bio.Alphabet
import generic_protein, generic_dna, generic_rna
7 from Bio.Align
import MultipleSeqAlignment
8 from Bio.SeqRecord
import SeqRecord
11 import distance_functions
as dfuncs
12 from neighbor_joining
import NeighborJoining
13 from parsimony
import LargeParsimony
55 if self.
algorithm not in (TreeBuildAlgorithms.NJ, TreeBuildAlgorithms.PARSIMONY):
56 raise RuntimeError(
"Unknown method: " + self.
algorithm)
60 raise RuntimeError(
"Bad gap penalty value. Must be between 0 and 1. Got: " + self.
gapPenalty)
65 raise RuntimeError(
"Bad gap cutoff value. Must be between 0 and 1. Got: " + self.
gapCutoff)
67 self.
seqType = options[
"sequence_type"]
69 self.
alignment = AlignIO.read(options[
"alignment_file"],
"fasta", alphabet=generic_protein)
70 elif self.
seqType == SeqTypes.DNA:
71 self.
alignment = AlignIO.read(options[
"alignment_file"],
"fasta", alphabet=generic_dna)
72 elif self.
seqType == SeqTypes.RNA:
73 self.
alignment = AlignIO.read(options[
"alignment_file"],
"fasta", alphabet=generic_rna)
75 raise RuntimeError(
"Unknown sequence type: " + self.
seqType)
76 if options[
"dist_measure"] == DistMeasures.P_DISTANCE:
78 elif options[
"dist_measure"] == DistMeasures.POISSON_CORRECTED:
80 elif options[
"dist_measure"] == DistMeasures.JUKES_CANTOR:
83 raise RuntimeError(
"Unknown distance measure: " + options[
"dist_measure"])
93 raise RuntimeError(
"Duplicate taxa identification strings found: " + x.id)
103 if self.
algorithm == TreeBuildAlgorithms.NJ:
106 elif self.
algorithm == TreeBuildAlgorithms.PARSIMONY:
109 raise RuntimeError(self.
algorithm +
" not implemented.")
129 for i,record_i
in enumerate(alignment):
132 for j,record_j
in enumerate(alignment):
139 distances.append(dist_matrix[j][i])
140 dist_matrix.append(tuple(distances))
141 return tuple(dist_matrix)
152 for col_idx
in range(alignment.get_alignment_length()):
153 column = alignment[:,col_idx]
157 if column.find(
"-") != -1:
158 to_remove.add(col_idx)
161 if self.
removePoor and (col_idx
not in to_remove):
163 nongap_ratio = float(len(column) - column.count(
"-")) / len(column)
165 to_remove.add(col_idx)
169 all_pairs = (len(column) - column.count(
"-")) * ((len(column) - column.count(
"-")) - 1) / 2.0
171 for res
in set(column):
173 res_count = column.count(res)
175 identical_pairs += res_count * (res_count - 1) / 2.0
176 identical_pairs_ratio = identical_pairs / all_pairs
178 to_remove.add(col_idx)
181 cleaned_alignment = MultipleSeqAlignment([])
182 for record
in alignment:
183 seq = record.seq.tomutable()
185 for idx
in to_remove:
186 seq.pop(idx - counter)
188 cleaned_alignment.append(SeqRecord(seq.toseq(), id=record.id))
190 return cleaned_alignment
193 self.visualization.show()