Disembl binaries and their dependencies, e.g. minimized BioPython distribution and sovgo...
[jabaws.git] / binaries / src / disembl / biopython-1.50 / Bio / Data / IUPACData.py
diff --git a/binaries/src/disembl/biopython-1.50/Bio/Data/IUPACData.py b/binaries/src/disembl/biopython-1.50/Bio/Data/IUPACData.py
new file mode 100644 (file)
index 0000000..ebd5a12
--- /dev/null
@@ -0,0 +1,209 @@
+# Information about the IUPAC alphabets
+#
+# The constants below are plain strings; each character is one letter of the
+# corresponding alphabet.
+
+# The 20 amino acid letters; these are exactly the keys of protein_weights.
+protein_letters = "ACDEFGHIKLMNPQRSTVWY"
+# protein_letters followed by the six additional codes explained below.
+extended_protein_letters = "ACDEFGHIKLMNPQRSTVWYBXZJUO"
+#   B = "Asx";  aspartic acid or asparagine (D or N)
+#   X = "Xxx";  unknown or 'other' amino acid
+#   Z = "Glx";  glutamic acid or glutamine (E or Q)
+#   J = "Xle";  leucine or isoleucine (L or I, used in mass-spec)
+#   U = "Sec";  selenocysteine
+#   O = "Pyl";  pyrrolysine
+
+# The "ambiguous" strings are the four unambiguous bases followed by the
+# ambiguity codes; the bases each code can stand for are listed in
+# ambiguous_dna_values / ambiguous_rna_values.
+# NOTE: "X" is a key of those tables but is not part of these letter strings.
+ambiguous_dna_letters = "GATCRYWSMKHBVDN"
+unambiguous_dna_letters = "GATC"
+ambiguous_rna_letters = "GAUCRYWSMKHBVDN"
+unambiguous_rna_letters = "GAUC"
+
+# The four DNA bases plus four extra letters with the meanings given here.
+# NOTE: B, D, S and W are also ambiguity codes in ambiguous_dna_letters,
+# where they have different meanings.
+#   B == 5-bromouridine
+#   D == 5,6-dihydrouridine
+#   S == thiouridine
+#   W == wyosine
+extended_dna_letters = "GATCBDSW"
+
+# are there extended forms?
+#extended_rna_letters = "GAUCBDSW"
+
+# For each DNA code, the string of unambiguous bases it can stand for.
+# Plain bases map to themselves; "X" and "N" both stand for any of the four.
+ambiguous_dna_values = {
+    "A": "A",
+    "C": "C",
+    "G": "G",
+    "T": "T",
+    "M": "AC",
+    "R": "AG",
+    "W": "AT",
+    "S": "CG",
+    "Y": "CT",
+    "K": "GT",
+    "V": "ACG",
+    "H": "ACT",
+    "D": "AGT",
+    "B": "CGT",
+    "X": "GATC",
+    "N": "GATC",
+    }
+# The same table for RNA: identical entries with U in place of T.
+ambiguous_rna_values = {
+    "A": "A",
+    "C": "C",
+    "G": "G",
+    "U": "U",
+    "M": "AC",
+    "R": "AG",
+    "W": "AU",
+    "S": "CG",
+    "Y": "CU",
+    "K": "GU",
+    "V": "ACG",
+    "H": "ACU",
+    "D": "AGU",
+    "B": "CGU",
+    "X": "GAUC",
+    "N": "GAUC",
+    }
+
+# For each DNA code, the code that stands for the complements of its bases,
+# e.g. "M" (A or C) maps to "K" (G or T).  "W", "S", "X" and "N" map to
+# themselves.
+ambiguous_dna_complement = {
+    "A": "T",
+    "C": "G",
+    "G": "C",
+    "T": "A",
+    "M": "K",
+    "R": "Y",
+    "W": "W",
+    "S": "S",
+    "Y": "R",
+    "K": "M",
+    "V": "B",
+    "H": "D",
+    "D": "H",
+    "B": "V",
+    "X": "X",
+    "N": "N",
+    }
+
+# The RNA counterpart: same as the DNA table except that A pairs with U
+# and the key "T" is replaced by "U".
+ambiguous_rna_complement = {
+    "A": "U",
+    "C": "G",
+    "G": "C",
+    "U": "A",
+    "M": "K",
+    "R": "Y",
+    "W": "W",
+    "S": "S",
+    "Y": "R",
+    "K": "M",
+    "V": "B",
+    "H": "D",
+    "D": "H",
+    "B": "V",
+    "X": "X",
+    "N": "N",
+    }
+
+
+def _make_ranges(dict):
+    """Return a new mapping from each key of dict to the pair (value, value).
+
+    A single value is expressed as a degenerate (low, high) pair whose two
+    ends are equal.  The input mapping is not modified.
+    """
+    # The parameter shadows the builtin dict, so the result is filled in
+    # with a plain loop rather than by calling dict(...).
+    pairs = {}
+    for name in dict:
+        single = dict[name]
+        pairs[name] = (single, single)
+    return pairs
+
+# From bioperl's SeqStats.pm
+# Weight of each unambiguous DNA letter.
+# NOTE(review): units are not stated here, presumably Daltons - confirm
+# against SeqStats.pm.
+unambiguous_dna_weights = {
+    "A": 347.,
+    "C": 323.,
+    "G": 363.,
+    "T": 322.,
+    }
+# Same letters mapped to (weight, weight) pairs.
+unambiguous_dna_weight_ranges = _make_ranges(unambiguous_dna_weights)
+
+# RNA weights: A, C and G are derived from the DNA values by adding 16;
+# U is given directly.
+unambiguous_rna_weights = {
+    "A": unambiguous_dna_weights["A"] + 16.,  # 16 for the oxygen
+    "C": unambiguous_dna_weights["C"] + 16.,
+    "G": unambiguous_dna_weights["G"] + 16.,
+    "U": 340.,
+}
+# Same letters mapped to (weight, weight) pairs.
+unambiguous_rna_weight_ranges = _make_ranges(unambiguous_rna_weights)
+
+def _make_ambiguous_ranges(dict, weight_table):
+    """Build weight ranges and average weights for ambiguous letters.
+
+    Arguments:
+     - dict - mapping from each letter to the string of unambiguous letters
+       it can stand for.  (The name shadows the builtin; it is kept so the
+       signature stays unchanged.)
+     - weight_table - mapping from unambiguous letter to its weight.
+
+    Returns a tuple (range_d, avg_d).  range_d maps each letter to
+    (lowest weight, highest weight) over the letters it can stand for, and
+    avg_d maps it to the arithmetic mean of those weights.  A letter that
+    stands only for itself and has no entry in weight_table is left out of
+    both results.
+    """
+    range_d = {}
+    avg_d = {}
+    for letter, values in dict.items():
+        #Following line is a quick hack to skip undefined weights for U and O
+        if len(values) == 1 and values[0] not in weight_table:
+            continue
+        # Build a real list: on Python 3 map() returns a one-shot iterator,
+        # which min() would exhaust before max() and len() could use it.
+        weights = [weight_table.get(value) for value in values]
+        range_d[letter] = (min(weights), max(weights))
+        # Start from 0.0 so the average is a float even for integer weights.
+        avg_d[letter] = sum(weights, 0.0) / len(weights)
+    return range_d, avg_d
+
+# Weight (low, high) ranges and average weights for every DNA code,
+# derived from the ambiguity table and the unambiguous weights.
+(ambiguous_dna_weight_ranges,
+ avg_ambiguous_dna_weights) = _make_ambiguous_ranges(ambiguous_dna_values,
+                                                     unambiguous_dna_weights)
+
+# The same two tables for the RNA codes.
+(ambiguous_rna_weight_ranges,
+ avg_ambiguous_rna_weights) = _make_ambiguous_ranges(ambiguous_rna_values,
+                                                     unambiguous_rna_weights)
+
+# Weight of each of the 20 letters in protein_letters.
+# "O" and "U" have no entry yet (see the commented-out lines), which is why
+# _make_ambiguous_ranges skips letters that lack a weight.
+# NOTE(review): units and convention are not stated here - the values look
+# like molecular weights of the free amino acids; confirm before relying
+# on that.
+protein_weights = {
+    "A": 89.09,
+    "C": 121.16,
+    "D": 133.10,
+    "E": 147.13,
+    "F": 165.19,
+    "G": 75.07,
+    "H": 155.16,
+    "I": 131.18,
+    "K": 146.19,
+    "L": 131.18,
+    "M": 149.21,
+    "N": 132.12,
+    #"O": 0.0, # Needs to be recorded!
+    "P": 115.13,
+    "Q": 146.15,
+    "R": 174.20,
+    "S": 105.09,
+    "T": 119.12,
+    #"U": 168.05, # To be confirmed
+    "V": 117.15,
+    "W": 204.23,
+    "Y": 181.19
+    }
+
+# For each letter of the extended protein alphabet, the string of letters it
+# can stand for.  Only "B", "J", "Z" and "X" are ambiguous; every other
+# letter, including "O" and "U", maps to itself.
+extended_protein_values = {
+    "A": "A",
+    "B": "ND",
+    "C": "C",
+    "D": "D",
+    "E": "E",
+    "F": "F",
+    "G": "G",
+    "H": "H",
+    "I": "I",
+    "J": "IL",
+    "K": "K",
+    "L": "L",
+    "M": "M",
+    "N": "N",
+    "O": "O",
+    "P": "P",
+    "Q": "Q",
+    "R": "R",
+    "S": "S",
+    "T": "T",
+    "U": "U",
+    "V": "V",
+    "W": "W",
+    "X": "ACDEFGHIKLMNPQRSTVWY",
+    #TODO - Include U and O in the possible values of X?
+    #This could alter the extended_protein_weight_ranges ...
+    "Y": "Y",
+    "Z": "QE",
+}
+    
+# Each protein letter mapped to a (weight, weight) pair.
+protein_weight_ranges = _make_ranges(protein_weights)
+
+# Weight (low, high) ranges and average weights for the extended protein
+# letters, derived from extended_protein_values and protein_weights.
+(extended_protein_weight_ranges,
+ avg_extended_protein_weights) = _make_ambiguous_ranges(
+    extended_protein_values, protein_weights)
+
+
+