new services are registered in wsbuild

[jabaws.git] / datamodel / compbio / data / sequence / SequenceUtil.java
diff --git a/datamodel/compbio/data/sequence/SequenceUtil.java b/datamodel/compbio/data/sequence/SequenceUtil.java

index 99a8147..c4e1def 100644 (file)
--- a/datamodel/compbio/data/sequence/SequenceUtil.java
+++ b/datamodel/compbio/data/sequence/SequenceUtil.java
@@ -1,22 +1,15 @@
-/* \r
- * @(#)SequenceUtil.java 1.0 September 2009\r
- * \r
- * Copyright (c) 2009 Peter Troshin\r
- *  \r
- * Jalview Web Services version: 2.0     \r
- * \r
- *  This library is free software; you can redistribute it and/or modify it under the terms of the\r
- *  Apache License version 2 as published by the Apache Software Foundation\r
- * \r
- *  This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without\r
- *  even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the Apache \r
- *  License for more details.\r
- * \r
- *  A copy of the license is in apache_license.txt. It is also available here:\r
- * see: http://www.apache.org/licenses/LICENSE-2.0.txt\r
- * \r
- * Any republication or derived work distributed in source code form\r
- * must include this copyright and license notice.\r
+/*\r
+ * @(#)SequenceUtil.java 1.0 September 2009 Copyright (c) 2009 Peter Troshin\r
+ * Jalview Web Services version: 2.0 This library is free software; you can\r
+ * redistribute it and/or modify it under the terms of the Apache License\r
+ * version 2 as published by the Apache Software Foundation This library is\r
+ * distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;\r
+ * without even the implied warranty of MERCHANTABILITY or FITNESS FOR A\r
+ * PARTICULAR PURPOSE. See the Apache License for more details. A copy of the\r
+ * license is in apache_license.txt. It is also available here: see:\r
+ * http://www.apache.org/licenses/LICENSE-2.0.txt Any republication or derived\r
+ * work distributed in source code form must include this copyright and license\r
+ * notice.\r
   */\r
  \r
  package compbio.data.sequence;\r
@@ -32,11 +25,18 @@ import java.io.InputStreamReader;
  import java.io.OutputStream;\r
  import java.io.OutputStreamWriter;\r
  import java.util.ArrayList;\r
+import java.util.HashMap;\r
+import java.util.HashSet;\r
  import java.util.List;\r
+import java.util.Map;\r
+import java.util.Scanner;\r
+import java.util.TreeSet;\r
  import java.util.logging.Level;\r
  import java.util.regex.Matcher;\r
  import java.util.regex.Pattern;\r
  \r
+import compbio.util.Util;\r
+\r
  /**\r
   * Utility class for operations on sequences\r
   * \r
@@ -45,362 +45,664 @@ import java.util.regex.Pattern;
   */\r
  public final class SequenceUtil {\r
  \r
-    /**\r
-     * A whitespace character: [\t\n\x0B\f\r]\r
-     */\r
-    public static final Pattern WHITE_SPACE = Pattern.compile("\\s");\r
-\r
-    /**\r
-     * A digit\r
-     */\r
-    public static final Pattern DIGIT = Pattern.compile("\\d");\r
-\r
-    /**\r
-     * Non word\r
-     */\r
-    public static final Pattern NONWORD = Pattern.compile("\\W");\r
-\r
-    /**\r
-     * Valid Amino acids\r
-     */\r
-    public static final Pattern AA = Pattern.compile("[ARNDCQEGHILKMFPSTWYV]+",\r
-           Pattern.CASE_INSENSITIVE);\r
-\r
-    /**\r
-     * inversion of AA pattern\r
-     */\r
-    public static final Pattern NON_AA = Pattern.compile(\r
-           "[^ARNDCQEGHILKMFPSTWYV]+", Pattern.CASE_INSENSITIVE);\r
-\r
-    /**\r
-     * Same as AA pattern but with two additional letters - XU\r
-     */\r
-    public static final Pattern AMBIGUOUS_AA = Pattern.compile(\r
-           "[ARNDCQEGHILKMFPSTWYVXU]+", Pattern.CASE_INSENSITIVE);\r
-\r
-    /**\r
-     * Nucleotides a, t, g, c, u\r
-     */\r
-    public static final Pattern NUCLEOTIDE = Pattern.compile("[AGTCU]+",\r
-           Pattern.CASE_INSENSITIVE);\r
-\r
-    /**\r
-     * Ambiguous nucleotide\r
-     */\r
-    public static final Pattern AMBIGUOUS_NUCLEOTIDE = Pattern.compile(\r
-           "[AGTCRYMKSWHBVDNU]+", Pattern.CASE_INSENSITIVE); // see IUPAC\r
-    /**\r
-     * Non nucleotide\r
-     */\r
-    public static final Pattern NON_NUCLEOTIDE = Pattern.compile("[^AGTCU]+",\r
-           Pattern.CASE_INSENSITIVE);\r
-\r
-    private SequenceUtil() {\r
-    } // utility class, no instantiation\r
-\r
-    /*\r
-     * public static void write_PirSeq(OutputStream os, FastaSequence seq)\r
-     * throws IOException { BufferedWriter pir_out = new BufferedWriter(new\r
-     * OutputStreamWriter(os)); pir_out.write(">P1;" + seq.getId() +\r
-     * SysPrefs.newlinechar); pir_out.write(seq.getSequence() +\r
-     * SysPrefs.newlinechar); pir_out.close(); }\r
-     * \r
-     * public static void write_FastaSeq(OutputStream os, FastaSequence seq)\r
-     * throws IOException { BufferedWriter fasta_out = new BufferedWriter( new\r
-     * OutputStreamWriter(os)); fasta_out.write(">" + seq.getId() +\r
-     * SysPrefs.newlinechar); fasta_out.write(seq.getSequence() +\r
-     * SysPrefs.newlinechar); fasta_out.close(); }\r
-     */\r
-\r
-    /**\r
-     * @return true is the sequence contains only letters a,c, t, g, u\r
-     */\r
-    public static boolean isNucleotideSequence(final FastaSequence s) {\r
-       return SequenceUtil.isNonAmbNucleotideSequence(s.getSequence());\r
-    }\r
-\r
-    /**\r
-     * Ambiguous DNA chars : AGTCRYMKSWHBVDN // differs from protein in only one\r
-     * (!) - B char\r
-     */\r
-    public static boolean isNonAmbNucleotideSequence(String sequence) {\r
-       sequence = SequenceUtil.cleanSequence(sequence);\r
-       if (SequenceUtil.DIGIT.matcher(sequence).find()) {\r
-           return false;\r
+       /**\r
+        * Matches a single whitespace character (regex {@code \s}).\r
+        */\r
+       public static final Pattern WHITE_SPACE = Pattern.compile("\\s");\r
+\r
+       /**\r
+        * Matches a single decimal digit (regex {@code \d}).\r
+        */\r
+       public static final Pattern DIGIT = Pattern.compile("\\d");\r
+\r
+       /**\r
+        * Matches a single non-word character (regex {@code \W}), i.e. anything\r
+        * other than a letter, a digit or an underscore.\r
+        */\r
+       public static final Pattern NONWORD = Pattern.compile("\\W");\r
+\r
+       /**\r
+        * Matches a run of one or more of the twenty amino acid letters listed in\r
+        * the character class, ignoring case.\r
+        */\r
+       public static final Pattern AA = Pattern.compile("[ARNDCQEGHILKMFPSTWYV]+",\r
+                       Pattern.CASE_INSENSITIVE);\r
+\r
+       /**\r
+        * Inversion of the {@link #AA} pattern: matches a run of one or more\r
+        * characters that are NOT among the amino acid letters, ignoring case.\r
+        */\r
+       public static final Pattern NON_AA = Pattern.compile(\r
+                       "[^ARNDCQEGHILKMFPSTWYV]+", Pattern.CASE_INSENSITIVE);\r
+\r
+       /**\r
+        * Same as the {@link #AA} pattern but with two additional letters, X and\r
+        * U, accepted.\r
+        */\r
+       public static final Pattern AMBIGUOUS_AA = Pattern.compile(\r
+                       "[ARNDCQEGHILKMFPSTWYVXU]+", Pattern.CASE_INSENSITIVE);\r
+\r
+       /**\r
+        * Matches a run of one or more of the nucleotide letters A, G, T, C, U,\r
+        * ignoring case.\r
+        */\r
+       public static final Pattern NUCLEOTIDE = Pattern.compile("[AGTCU]+",\r
+                       Pattern.CASE_INSENSITIVE);\r
+\r
+       /**\r
+        * Matches a run of one or more nucleotide letters including the ambiguity\r
+        * codes listed in the character class, ignoring case.\r
+        */\r
+       public static final Pattern AMBIGUOUS_NUCLEOTIDE = Pattern.compile(\r
+                       "[AGTCRYMKSWHBVDNU]+", Pattern.CASE_INSENSITIVE); // see IUPAC\r
+       /**\r
+        * Inversion of the {@link #NUCLEOTIDE} pattern: matches a run of one or\r
+        * more characters other than A, G, T, C, U, ignoring case.\r
+        */\r
+       public static final Pattern NON_NUCLEOTIDE = Pattern.compile("[^AGTCU]+",\r
+                       Pattern.CASE_INSENSITIVE);\r
+\r
+       private SequenceUtil() {\r
+       } // utility class, no instantiation\r
+\r
+       /*\r
+        * public static void write_PirSeq(OutputStream os, FastaSequence seq)\r
+        * throws IOException { BufferedWriter pir_out = new BufferedWriter(new\r
+        * OutputStreamWriter(os)); pir_out.write(">P1;" + seq.getId() +\r
+        * SysPrefs.newlinechar); pir_out.write(seq.getSequence() +\r
+        * SysPrefs.newlinechar); pir_out.close(); } public static void\r
+        * write_FastaSeq(OutputStream os, FastaSequence seq) throws IOException {\r
+        * BufferedWriter fasta_out = new BufferedWriter( new\r
+        * OutputStreamWriter(os)); fasta_out.write(">" + seq.getId() +\r
+        * SysPrefs.newlinechar); fasta_out.write(seq.getSequence() +\r
+        * SysPrefs.newlinechar); fasta_out.close(); }\r
+        */\r
+\r
+       /**\r
+        * Checks whether the sequence held by the given FastaSequence is a\r
+        * non-ambiguous nucleotide sequence.\r
+        * \r
+        * @param s\r
+        *            the sequence to test\r
+        * @return true if the sequence contains only the letters a, c, t, g, u\r
+        */\r
+       public static boolean isNucleotideSequence(final FastaSequence s) {\r
+               final String residues = s.getSequence();\r
+               return SequenceUtil.isNonAmbNucleotideSequence(residues);\r
+       }\r
+\r
+       /**\r
+        * Tests whether the sequence, once whitespace has been removed, is made\r
+        * up solely of the non-ambiguous nucleotide letters A, G, T, C and U (in\r
+        * any case).\r
+        * \r
+        * @param sequence\r
+        *            the sequence to test\r
+        * @return true if the cleaned sequence is non-empty and contains nothing\r
+        *         but non-ambiguous nucleotide letters, false otherwise\r
+        */\r
+       public static boolean isNonAmbNucleotideSequence(String sequence) {\r
+               final String cleaned = SequenceUtil.cleanSequence(sequence);\r
+               final boolean hasDigit = SequenceUtil.DIGIT.matcher(cleaned).find();\r
+               final boolean hasForeignChar = SequenceUtil.NON_NUCLEOTIDE.matcher(\r
+                               cleaned).find();\r
+               if (hasDigit || hasForeignChar) {\r
+                       return false;\r
+               }\r
+               // an empty string has no foreign characters but is not a sequence\r
+               return SequenceUtil.NUCLEOTIDE.matcher(cleaned).find();\r
+       }\r
+\r
+       /**\r
+        * Removes all whitespace characters from the sequence string and converts\r
+        * the result to upper case.\r
+        * \r
+        * @param sequence\r
+        *            the sequence to clean, must not be null\r
+        * @return the sequence in upper case with all whitespace removed\r
+        */\r
+       public static String cleanSequence(String sequence) {\r
+               assert sequence != null;\r
+               final Matcher m = SequenceUtil.WHITE_SPACE.matcher(sequence);\r
+               // Locale.ROOT keeps the mapping locale independent: with the default\r
+               // locale set to Turkish "i" would become a dotted capital I, which\r
+               // none of the residue patterns of this class match.\r
+               sequence = m.replaceAll("").toUpperCase(java.util.Locale.ROOT);\r
+               return sequence;\r
+       }\r
+\r
+       /**\r
+        * Strips everything that is not a letter from the sequence: whitespace,\r
+        * digits, non-word characters, underscores and dashes. The result is in\r
+        * upper case because the cleaning starts with\r
+        * {@link #cleanSequence(String)}.\r
+        * \r
+        * @param sequence\r
+        *            the sequence to clean\r
+        * @return cleaned up sequence\r
+        */\r
+       public static String deepCleanSequence(String sequence) {\r
+               final String noWhitespace = SequenceUtil.cleanSequence(sequence);\r
+               final String noDigits = SequenceUtil.DIGIT.matcher(noWhitespace)\r
+                               .replaceAll("");\r
+               final String wordCharsOnly = SequenceUtil.NONWORD.matcher(noDigits)\r
+                               .replaceAll("");\r
+               // the underscore counts as a word character, so it is removed here\r
+               return Pattern.compile("[_-]+").matcher(wordCharsOnly).replaceAll("");\r
+       }\r
+\r
+       /**\r
+        * Tests whether the sequence is a non-ambiguous protein sequence.\r
+        * Whitespace is ignored. A sequence that qualifies as a non-ambiguous\r
+        * nucleotide sequence is not reported as protein.\r
+        * \r
+        * @param sequence\r
+        *            the sequence to test\r
+        * @return true if the sequence is a protein sequence, false otherwise\r
+        */\r
+       public static boolean isProteinSequence(String sequence) {\r
+               final String cleaned = SequenceUtil.cleanSequence(sequence);\r
+               final boolean rejected = SequenceUtil\r
+                               .isNonAmbNucleotideSequence(cleaned)\r
+                               || SequenceUtil.DIGIT.matcher(cleaned).find()\r
+                               || SequenceUtil.NON_AA.matcher(cleaned).find();\r
+               return !rejected && SequenceUtil.AA.matcher(cleaned).find();\r
+       }\r
+\r
+       /**\r
+        * Checks whether the sequence conforms to an ambiguous protein sequence,\r
+        * i.e. it consists of amino acid letters only and at least one of them is\r
+        * one of the additional letters (X or U) accepted by\r
+        * {@link #AMBIGUOUS_AA}. Whitespace is ignored.\r
+        * \r
+        * @param sequence\r
+        *            the sequence to test\r
+        * @return true only if the sequence is an ambiguous protein sequence.\r
+        *         False otherwise, e.g. if the sequence is a non-ambiguous\r
+        *         protein or DNA, is empty, or contains digits or other\r
+        *         characters\r
+        */\r
+       public static boolean isAmbiguosProtein(String sequence) {\r
+               sequence = SequenceUtil.cleanSequence(sequence);\r
+               if (SequenceUtil.isNonAmbNucleotideSequence(sequence)) {\r
+                       return false;\r
+               }\r
+               if (SequenceUtil.DIGIT.matcher(sequence).find()) {\r
+                       return false;\r
+               }\r
+               // Every character has to belong to the extended alphabet; matches()\r
+               // also rejects the empty string.\r
+               if (!SequenceUtil.AMBIGUOUS_AA.matcher(sequence).matches()) {\r
+                       return false;\r
+               }\r
+               // The sequence is ambiguous only if it holds at least one letter\r
+               // outside the standard alphabet. The previous implementation\r
+               // rejected exactly these sequences and therefore never returned true.\r
+               return SequenceUtil.NON_AA.matcher(sequence).find();\r
+       }\r
+\r
+       /**\r
+        * Writes the list of FastaSequences into the outstream, formatting each\r
+        * sequence so that it contains at most width chars on each line. The\r
+        * outstream is closed afterwards, even if the writing fails.\r
+        * \r
+        * @param outstream\r
+        *            the stream to write to, closed by this method\r
+        * @param sequences\r
+        *            the sequences to write\r
+        * @param width\r
+        *            - the maximum number of characters to write in one line\r
+        * @throws IOException\r
+        *             if the sequences cannot be written or the stream cannot be\r
+        *             closed\r
+        */\r
+       public static void writeFasta(final OutputStream outstream,\r
+                       final List<FastaSequence> sequences, final int width)\r
+                       throws IOException {\r
+               try {\r
+                       writeFastaKeepTheStream(outstream, sequences, width);\r
+               } finally {\r
+                       // do not leak the stream when the writing fails\r
+                       outstream.close();\r
+               }\r
+       }\r
+\r
+       /**\r
+        * Writes the list of FastaSequences into the outstream, formatting each\r
+        * sequence so that it contains width chars on each line. In contrast to\r
+        * {@link #writeFasta(OutputStream, List, int)} the stream is only flushed\r
+        * and is left open for the caller.\r
+        * \r
+        * @param outstream\r
+        *            the stream to write to, it is not closed\r
+        * @param sequences\r
+        *            the sequences to write\r
+        * @param width\r
+        *            the maximum number of characters to write in one line\r
+        * @throws IOException\r
+        *             if the sequences cannot be written\r
+        */\r
+       public static void writeFastaKeepTheStream(final OutputStream outstream,\r
+                       final List<FastaSequence> sequences, final int width)\r
+                       throws IOException {\r
+               final OutputStreamWriter encoder = new OutputStreamWriter(outstream);\r
+               final BufferedWriter out = new BufferedWriter(encoder);\r
+               for (final FastaSequence record : sequences) {\r
+                       final String header = ">" + record.getId() + "\n";\r
+                       out.write(header);\r
+                       out.write(record.getFormatedSequence(width));\r
+                       out.write("\n");\r
+               }\r
+               // push everything down to the caller's stream without closing it\r
+               out.flush();\r
+               encoder.flush();\r
         }\r
-       if (SequenceUtil.NON_NUCLEOTIDE.matcher(sequence).find()) {\r
-           return false;\r
-           /*\r
-            * System.out.format("I found the text starting at " +\r
-            * "index %d and ending at index %d.%n", nonDNAmatcher .start(),\r
-            * nonDNAmatcher.end());\r
-            */\r
+\r
+       /**\r
+        * Reads fasta sequences from inStream into the list of FastaSequence\r
+        * objects. A line starting with &gt; begins a new record, the rest of that\r
+        * line is the sequence name. All following lines up to the next header\r
+        * are joined, with whitespace removed, into the sequence. Lines in front\r
+        * of the first header are ignored. The stream is closed by this method.\r
+        * \r
+        * @param inStream\r
+        *            the stream to read the UTF-8 encoded fasta records from\r
+        * @return list of FastaSequence objects in the order they were read\r
+        * @throws IOException\r
+        *             if the stream cannot be read\r
+        */\r
+       public static List<FastaSequence> readFasta(final InputStream inStream)\r
+                       throws IOException {\r
+               final List<FastaSequence> seqs = new ArrayList<FastaSequence>();\r
+\r
+               final BufferedReader infasta = new BufferedReader(\r
+                               new InputStreamReader(inStream, "UTF8"), 16000);\r
+               // was "//s+", which matches two slashes followed by 's' and thus\r
+               // never removed any whitespace\r
+               final Pattern whitespace = Pattern.compile("\\s+");\r
+\r
+               try {\r
+                       String sname = null;\r
+                       StringBuilder seqstr = new StringBuilder();\r
+                       String line;\r
+                       while ((line = infasta.readLine()) != null) {\r
+                               if (line.startsWith(">")) {\r
+                                       if (sname != null) {\r
+                                               seqs.add(new FastaSequence(sname, seqstr.toString()));\r
+                                       }\r
+                                       sname = line.substring(1); // remove >\r
+                                       seqstr = new StringBuilder();\r
+                               } else if (sname != null) {\r
+                                       seqstr.append(whitespace.matcher(line).replaceAll(""));\r
+                               }\r
+                               // anything in front of the first header belongs to no record\r
+                       }\r
+                       if (sname != null) {\r
+                               // the last record is terminated by the end of the stream\r
+                               seqs.add(new FastaSequence(sname, seqstr.toString()));\r
+                       }\r
+               } finally {\r
+                       infasta.close();\r
+               }\r
+               return seqs;\r
         }\r
-       final Matcher DNAmatcher = SequenceUtil.NUCLEOTIDE.matcher(sequence);\r
-       return DNAmatcher.find();\r
-    }\r
-\r
-    /**\r
-     * Removes all whitespace chars in the sequence string\r
-     * \r
-     * @param sequence\r
-     * @return cleaned up sequence\r
-     */\r
-    public static String cleanSequence(String sequence) {\r
-       assert sequence != null;\r
-       final Matcher m = SequenceUtil.WHITE_SPACE.matcher(sequence);\r
-       sequence = m.replaceAll("").toUpperCase();\r
-       return sequence;\r
-    }\r
-\r
-    /**\r
-     * Removes all special characters and digits as well as whitespace chars\r
-     * from the sequence\r
-     * \r
-     * @param sequence\r
-     * @return cleaned up sequence\r
-     */\r
-    public static String deepCleanSequence(String sequence) {\r
-       sequence = SequenceUtil.cleanSequence(sequence);\r
-       sequence = SequenceUtil.DIGIT.matcher(sequence).replaceAll("");\r
-       sequence = SequenceUtil.NONWORD.matcher(sequence).replaceAll("");\r
-       final Pattern othernonSeqChars = Pattern.compile("[_-]+");\r
-       sequence = othernonSeqChars.matcher(sequence).replaceAll("");\r
-       return sequence;\r
-    }\r
-\r
-    /**\r
-     * \r
-     * @param sequence\r
-     * @return true is the sequence is a protein sequence, false overwise\r
-     */\r
-    public static boolean isProteinSequence(String sequence) {\r
-       sequence = SequenceUtil.cleanSequence(sequence);\r
-       if (SequenceUtil.isNonAmbNucleotideSequence(sequence)) {\r
-           return false;\r
+\r
+       /**\r
+        * Writes the FastaSequences into the stream, each sequence is written in\r
+        * its one line fasta form. The stream is closed afterwards, even if the\r
+        * writing fails.\r
+        * \r
+        * @param os\r
+        *            the stream to write to, closed by this method\r
+        * @param sequences\r
+        *            the sequences to write\r
+        * @throws IOException\r
+        *             if the sequences cannot be written or the stream cannot be\r
+        *             closed\r
+        */\r
+       public static void writeFasta(final OutputStream os,\r
+                       final List<FastaSequence> sequences) throws IOException {\r
+               final OutputStreamWriter outWriter = new OutputStreamWriter(os);\r
+               final BufferedWriter fasta_out = new BufferedWriter(outWriter);\r
+               try {\r
+                       for (final FastaSequence fs : sequences) {\r
+                               fasta_out.write(fs.getOnelineFasta());\r
+                       }\r
+               } finally {\r
+                       // closing the outermost writer flushes it and closes the\r
+                       // wrapped writer and stream as well\r
+                       fasta_out.close();\r
+               }\r
         }\r
-       if (SequenceUtil.DIGIT.matcher(sequence).find()) {\r
-           return false;\r
+\r
+       /**\r
+        * Reads a JRonn horizontally formatted output file, see\r
+        * {@link #readJRonn(InputStream)} for the format.\r
+        * \r
+        * @param result\r
+        *            the JRonn output file\r
+        * @return the scores read from the file keyed by the sequence name\r
+        * @throws IOException\r
+        *             if the file cannot be opened or read\r
+        * @throws UnknownFileFormatException\r
+        *             if the file is not a JRonn output file\r
+        */\r
+       public static Map<String, Score> readJRonn(final File result)\r
+                       throws IOException, UnknownFileFormatException {\r
+               final InputStream input = new FileInputStream(result);\r
+               try {\r
+                       return readJRonn(input);\r
+               } finally {\r
+                       // release the file handle when the parsing fails, too\r
+                       input.close();\r
+               }\r
         }\r
-       if (SequenceUtil.NON_AA.matcher(sequence).find()) {\r
-           return false;\r
+\r
+       /**\r
+        * Reader for the JRonn horizontal file format. Each record consists of\r
+        * three consecutive lines: the sequence name, the residues and the\r
+        * disorder values, e.g.\r
+        * \r
+        * <pre>\r
+        * &gt;Foobar\r
+        * M G D T T A G\r
+        * 0.48 0.42 0.42 0.48 0.52 0.53 0.54\r
+        * </pre>\r
+        * \r
+        * where the residues and the values are tab delimited. Empty lines and\r
+        * lines outside of a record are skipped. The stream is closed by this\r
+        * method.\r
+        * \r
+        * @param inStream\r
+        *            the InputStream connected to the JRonn output file\r
+        * @return the JRonn scores keyed by the sequence name\r
+        * @throws IOException\r
+        *             is thrown if the inStream has problems accessing the data\r
+        * @throws UnknownFileFormatException\r
+        *             is thrown if the inStream represents an unknown source of\r
+        *             data, i.e. not a JRonn output\r
+        */\r
+       public static Map<String, Score> readJRonn(final InputStream inStream)\r
+                       throws IOException, UnknownFileFormatException {\r
+               final Map<String, Score> seqs = new HashMap<String, Score>();\r
+\r
+               final BufferedReader infasta = new BufferedReader(\r
+                               new InputStreamReader(inStream, "UTF8"), 16000);\r
+\r
+               try {\r
+                       String line;\r
+                       while ((line = infasta.readLine()) != null) {\r
+                               if (!line.startsWith(">")) {\r
+                                       // skip empty lines and anything outside of a record\r
+                                       continue;\r
+                               }\r
+                               final String sname = line.trim().substring(1);\r
+                               final String seqLine = infasta.readLine();\r
+                               final String annotLine = infasta.readLine();\r
+                               if (seqLine == null || annotLine == null) {\r
+                                       // the stream ended in the middle of a record\r
+                                       throw new UnknownFileFormatException(\r
+                                                       "File does not look like Jronn horizontally formatted output file!\n"\r
+                                                                       + JRONN_WRONG_FORMAT_MESSAGE);\r
+                               }\r
+                               final String sequence = seqLine.replace("\t", "");\r
+                               final float[] annotation = convertToNumber(annotLine\r
+                                               .split("\t"));\r
+                               if (annotation.length != sequence.length()) {\r
+                                       throw new UnknownFileFormatException(\r
+                                                       "File does not look like Jronn horizontally formatted output file!\n"\r
+                                                                       + JRONN_WRONG_FORMAT_MESSAGE);\r
+                               }\r
+                               seqs.put(sname, new Score(DisorderMethod.JRonn, annotation));\r
+                       }\r
+               } finally {\r
+                       infasta.close();\r
+               }\r
+               return seqs;\r
         }\r
-       final Matcher protmatcher = SequenceUtil.AA.matcher(sequence);\r
-       return protmatcher.find();\r
-    }\r
-\r
-    /**\r
-     * Check whether the sequence confirms to amboguous protein sequence\r
-     * \r
-     * @param sequence\r
-     * @return return true only if the sequence if ambiguous protein sequence\r
-     *         Return false otherwise. e.g. if the sequence is non-ambiguous\r
-     *         protein or DNA\r
-     */\r
-    public static boolean isAmbiguosProtein(String sequence) {\r
-       sequence = SequenceUtil.cleanSequence(sequence);\r
-       if (SequenceUtil.isNonAmbNucleotideSequence(sequence)) {\r
-           return false;\r
+\r
+       /**\r
+        * Converts the textual disorder values into numbers.\r
+        * \r
+        * @param annotValues\r
+        *            the values as read from the file\r
+        * @return the parsed values in the same order\r
+        * @throws UnknownFileFormatException\r
+        *             if any of the values is not a number\r
+        */\r
+       private static float[] convertToNumber(String[] annotValues)\r
+                       throws UnknownFileFormatException {\r
+               float[] annotation = new float[annotValues.length];\r
+               try {\r
+                       for (int i = 0; i < annotation.length; i++) {\r
+                               annotation[i] = Float.parseFloat(annotValues[i]);\r
+                       }\r
+               } catch (NumberFormatException e) {\r
+                       // chain the exception itself, its own cause is always null\r
+                       throw new UnknownFileFormatException(JRONN_WRONG_FORMAT_MESSAGE,\r
+                                       e);\r
+               }\r
+               return annotation;\r
         }\r
-       if (SequenceUtil.DIGIT.matcher(sequence).find()) {\r
-           return false;\r
+\r
+       /**\r
+        * Description of the expected JRonn file layout, used as the message (or\r
+        * a part of it) of the UnknownFileFormatException thrown by the JRonn\r
+        * reader.\r
+        */\r
+       private static final String JRONN_WRONG_FORMAT_MESSAGE = "Jronn file must be in the following format:\n"\r
+                       + ">sequence_name\n "\r
+                       + "M    V       S\n"\r
+                       + "0.43 0.22    0.65\n"\r
+                       + "Where first line is the sequence name,\n"\r
+                       + "second line is the tab delimited sequence,\n"\r
+                       + "third line contains tab delimited disorder prediction values.\n"\r
+                       + "No lines are allowed between these three. Additionally, the number of  "\r
+                       + "sequence residues must be equal to the number of the disorder values.";\r
+\r
+       /**\r
+        * Closes the Closeable and logs the exception, if any, instead of\r
+        * propagating it. Does nothing if the stream is null.\r
+        * \r
+        * @param log\r
+        *            the logger the failure to close is reported to\r
+        * @param stream\r
+        *            the resource to close, may be null\r
+        */\r
+       public final static void closeSilently(java.util.logging.Logger log,\r
+                       Closeable stream) {\r
+               if (stream != null) {\r
+                       try {\r
+                               stream.close();\r
+                       } catch (IOException e) {\r
+                               // log the exception itself: its cause is usually null, which\r
+                               // would drop the stack trace from the log record\r
+                               log.log(Level.WARNING, e.getLocalizedMessage(), e);\r
+                       }\r
+               }\r
         }\r
-       if (SequenceUtil.NON_AA.matcher(sequence).find()) {\r
-           return false;\r
+\r
+       /**\r
+        * Reads the Disembl output. Each sequence record is expected to look\r
+        * like:\r
+        * \r
+        * <pre>\r
+        * &gt; Foobar_dundeefriends\r
+        * # COILS 34-41, 50-58, 83-91, 118-127, 160-169, 191-220, 243-252, 287-343\r
+        * # REM465 355-368\r
+        * # HOTLOOPS 190-204\r
+        * # RESIDUE COILS REM465 HOTLOOPS\r
+        * M 0.86010 0.88512 0.37094\r
+        * T 0.79983 0.85864 0.44331\r
+        * &gt;Next Sequence name\r
+        * </pre>\r
+        * \r
+        * The input stream is closed by this method once the input has been\r
+        * parsed.\r
+        * \r
+        * @param input\r
+        *            the stream connected to the Disembl output\r
+        * @return map from each sequence to its COILS, HOTLOOPS and REM465 scores\r
+        * @throws IOException\r
+        *             if the input stream cannot be closed\r
+        * @throws UnknownFileFormatException\r
+        *             if the input does not contain any sequence record\r
+        */\r
+       public static HashMap<FastaSequence, HashSet<Score>> readDisembl(\r
+                       final InputStream input) throws IOException,\r
+                       UnknownFileFormatException {\r
+               Scanner scan = new Scanner(input);\r
+               scan.useDelimiter(">");\r
+               if (!scan.hasNext()) {\r
+                       throw new UnknownFileFormatException(\r
+                                       "In Disembl score format each sequence score is expected "\r
+                                                       + "to start from the line: >Sequence name "\r
+                                                       + " No such line was found!");\r
+               }\r
+\r
+               HashMap<FastaSequence, HashSet<Score>> results = new HashMap<FastaSequence, HashSet<Score>>();\r
+               while (scan.hasNext()) {\r
+                       String singleSeq = scan.next();\r
+                       Scanner scansingle = new Scanner(singleSeq);\r
+                       if (!scansingle.hasNextLine()) {\r
+                               throw new RuntimeException(\r
+                                               "The input looks like an incomplete disembl file - cannot parse!");\r
+                       }\r
+\r
+                       StringBuilder seqbuffer = new StringBuilder();\r
+                       ArrayList<Float> coils = new ArrayList<Float>();\r
+                       ArrayList<Float> rem = new ArrayList<Float>();\r
+                       ArrayList<Float> hotloops = new ArrayList<Float>();\r
+\r
+                       String sequenceName = scansingle.nextLine().trim();\r
+                       TreeSet<Range> coilsR = parseRanges(DisemblResult.COILS,\r
+                                       scansingle.nextLine());\r
+                       TreeSet<Range> rem465R = parseRanges(DisemblResult.REM465,\r
+                                       scansingle.nextLine());\r
+                       TreeSet<Range> loopsR = parseRanges(DisemblResult.HOTLOOPS,\r
+                                       scansingle.nextLine());\r
+\r
+                       String title = scansingle.nextLine();\r
+                       assert title.startsWith("# RESIDUE COILS REM465 HOTLOOPS") : ">Sequence_name must follow column title: # RESIDUE COILS REM465 HOTLOOPS!";\r
+\r
+                       // one residue per row followed by its three values, in the order\r
+                       // given by the column title\r
+                       while (scansingle.hasNext()) {\r
+                               seqbuffer.append(scansingle.next());\r
+                               coils.add(scansingle.nextFloat());\r
+                               rem.add(scansingle.nextFloat());\r
+                               hotloops.add(scansingle.nextFloat());\r
+                       }\r
+                       FastaSequence fs = new FastaSequence(sequenceName,\r
+                                       seqbuffer.toString());\r
+                       HashSet<Score> scores = new HashSet<Score>();\r
+                       scores.add(new Score(DisemblResult.COILS, coils, coilsR));\r
+                       // each score gets the ranges parsed from its own header line;\r
+                       // the HOTLOOPS and REM465 ranges used to be swapped here\r
+                       scores.add(new Score(DisemblResult.HOTLOOPS, hotloops, loopsR));\r
+                       scores.add(new Score(DisemblResult.REM465, rem, rem465R));\r
+                       results.put(fs, scores);\r
+\r
+                       scansingle.close();\r
+               }\r
+               scan.close();\r
+               input.close();\r
+               return results;\r
         }\r
-       if (SequenceUtil.AA.matcher(sequence).find()) {\r
-           return false;\r
+\r
+       /**\r
+        * Parses one range line of the Disembl output, e.g.\r
+        * \r
+        * <pre>\r
+        * # COILS 34-41, 50-58, 83-91, 118-127, 160-169, 191-220, 243-252, 287-343\r
+        * # REM465 355-368\r
+        * # HOTLOOPS 190-204\r
+        * </pre>\r
+        * \r
+        * The leading # and the result type name are skipped, the remainder is\r
+        * split on commas and every non-empty item is turned into a Range.\r
+        * \r
+        * @param resultType\r
+        *            the result type the line is expected to describe, only used\r
+        *            in assertions\r
+        * @param lines\r
+        *            the line to parse\r
+        * @return the ranges found in the line\r
+        */\r
+       private static TreeSet<Range> parseRanges(Enum resultType, String lines) {\r
+               final Scanner tokens = new Scanner(lines);\r
+\r
+               assert tokens.hasNext();\r
+               final String hash = tokens.next();\r
+               assert "#".equals(hash); // pass delimiter #\r
+               final String label = tokens.next(); // pass enum name e.g. COILS\r
+               assert resultType.toString().equalsIgnoreCase(label) : "Unknown result type: "\r
+                               + resultType.toString();\r
+\r
+               // what is left of the line is the comma separated list of ranges\r
+               tokens.useDelimiter(",");\r
+               final TreeSet<Range> parsed = new TreeSet<Range>();\r
+               while (tokens.hasNext()) {\r
+                       final String item = tokens.next();\r
+                       if (Util.isEmpty(item)) {\r
+                               continue;\r
+                       }\r
+                       parsed.add(new Range(item.split("-")));\r
+               }\r
+               return parsed;\r
         }\r
-       final Matcher amb_prot = SequenceUtil.AMBIGUOUS_AA.matcher(sequence);\r
-       return amb_prot.find();\r
-    }\r
-\r
-    /**\r
-     * Writes list of FastaSequeces into the outstream formatting the sequence\r
-     * so that it contains width chars on each line\r
-     * \r
-     * @param outstream\r
-     * @param sequences\r
-     * @param width\r
-     *            - the maximum number of characters to write in one line\r
-     * @throws IOException\r
-     */\r
-    public static void writeFasta(final OutputStream outstream,\r
-           final List<FastaSequence> sequences, final int width)\r
-           throws IOException {\r
-       final OutputStreamWriter writer = new OutputStreamWriter(outstream);\r
-       final BufferedWriter fastawriter = new BufferedWriter(writer);\r
-       for (final FastaSequence fs : sequences) {\r
-           fastawriter.write(fs.getFormatedSequence(width));\r
+\r
+       /**\r
+        * Re-keys the Disembl results by sequence name, dropping the sequences\r
+        * themselves.\r
+        * \r
+        * @param disemblResults\r
+        *            the scores keyed by the FastaSequence\r
+        * @return the same score sets keyed by the id of their sequence\r
+        */\r
+       public static HashMap<String, HashSet<Score>> removeSequences(\r
+                       HashMap<FastaSequence, HashSet<Score>> disemblResults) {\r
+               final HashMap<String, HashSet<Score>> byName = new HashMap<String, HashSet<Score>>();\r
+               for (final Map.Entry<FastaSequence, HashSet<Score>> entry : disemblResults\r
+                               .entrySet()) {\r
+                       final String name = entry.getKey().getId();\r
+                       final HashSet<Score> scores = entry.getValue();\r
+                       byName.put(name, scores);\r
+               }\r
+               return byName;\r
         }\r
-       outstream.flush();\r
-       fastawriter.close();\r
-       writer.close();\r
-    }\r
-\r
-    /**\r
-     * Reads fasta sequences from inStream into the list of FastaSequence\r
-     * objects\r
-     * \r
-     * @param inStream\r
-     *            from\r
-     * @return list of FastaSequence objects\r
-     * @throws IOException\r
-     */\r
-    public static List<FastaSequence> readFasta(final InputStream inStream)\r
-           throws IOException {\r
-       final List<FastaSequence> seqs = new ArrayList<FastaSequence>();\r
-\r
-       final BufferedReader infasta = new BufferedReader(\r
-               new InputStreamReader(inStream, "UTF8"), 16000);\r
-       final Pattern pattern = Pattern.compile("//s+");\r
-\r
-       String line;\r
-       String sname = "", seqstr = null;\r
-       do {\r
-           line = infasta.readLine();\r
-           if ((line == null) || line.startsWith(">")) {\r
-               if (seqstr != null) {\r
-                   seqs.add(new FastaSequence(sname.substring(1), seqstr));\r
+\r
+       /**\r
+        * Reads GlobPlot output from the stream. The input is split into records\r
+        * on the '>' character and each record is consumed in this order: the\r
+        * sequence name line, a line of ranges stored as GlobProtResult.GlobDoms,\r
+        * a line of ranges stored as GlobProtResult.Disorder, a column title line\r
+        * expected to start with "# RESIDUE" followed by "DYDX", and then\r
+        * whitespace separated rows made of one residue token followed by three\r
+        * float values (dydx, raw and smoothed scores).\r
+        * \r
+        * NOTE(review): the sample below shows COILS / REM465 / HOTLOOPS columns,\r
+        * which do not match the GlobDoms / Disorder / DYDX layout the code\r
+        * parses; it looks carried over from another reader - confirm and replace\r
+        * it with a real GlobPlot sample.\r
+        * \r
+        * > Foobar_dundeefriends\r
+        * \r
+        * # COILS 34-41, 50-58, 83-91, 118-127, 160-169, 191-220, 243-252, 287-343\r
+        * \r
+        * # REM465 355-368\r
+        * \r
+        * # HOTLOOPS 190-204\r
+        * \r
+        * # RESIDUE COILS REM465 HOTLOOPS\r
+        * \r
+        * M 0.86010 0.88512 0.37094\r
+        * \r
+        * T 0.79983 0.85864 0.44331\r
+        * \r
+        * >Next Sequence name\r
+        * \r
+        * A record that contains no line at all is rejected with a\r
+        * RuntimeException.\r
+        * \r
+        * @param input\r
+        *            the stream to read from; it is closed by this method once\r
+        *            all records have been parsed\r
+        * @return map of each parsed FastaSequence to the set of its Score objects\r
+        *         (Disorder, GlobDoms, Dydx, RawScore and SmoothedScore)\r
+        * @throws IOException\r
+        *             e.g. if closing the input stream fails\r
+        * @throws UnknownFileFormatException\r
+        *             if the scanner finds no record in the input\r
+        */\r
+       public static HashMap<FastaSequence, HashSet<Score>> readGlobPlot(\r
+                       final InputStream input) throws IOException,\r
+                       UnknownFileFormatException {\r
+               Scanner scan = new Scanner(input);\r
+               scan.useDelimiter(">");\r
+               if (!scan.hasNext()) {\r
+                       throw new UnknownFileFormatException(\r
+                                       "In GlobPlot score format each sequence score is expected "\r
+                                                       + "to start from the line: >Sequence name "\r
+                                                       + " No such line was found!");\r
+               }\r
+\r
+               HashMap<FastaSequence, HashSet<Score>> results = new HashMap<FastaSequence, HashSet<Score>>();\r
+               // NOTE(review): seqCounter is incremented but never read in this method\r
+               int seqCounter = 0;\r
+               while (scan.hasNext()) {\r
+                       seqCounter++;\r
+                       String singleSeq = scan.next();\r
+                       Scanner scansingle = new Scanner(singleSeq);\r
+                       if (!scansingle.hasNextLine()) {\r
+                               throw new RuntimeException(\r
+                                               "The input looks like an incomplete GlobPlot file - cannot parse!");\r
+                       }\r
+\r
+                       StringBuffer seqbuffer = new StringBuffer();\r
+                       ArrayList<Float> dydxScore = new ArrayList<Float>();\r
+                       ArrayList<Float> rawScore = new ArrayList<Float>();\r
+                       ArrayList<Float> smoothedScore = new ArrayList<Float>();\r
+\r
+                       String sequenceName = scansingle.nextLine().trim();\r
+                       TreeSet<Range> domsR = parseRanges(GlobProtResult.GlobDoms,\r
+                                       scansingle.nextLine());\r
+                       TreeSet<Range> disorderR = parseRanges(GlobProtResult.Disorder,\r
+                                       scansingle.nextLine());\r
+\r
+                       String title = scansingle.nextLine();\r
+                       // The title line is verified only when assertions are enabled (-ea)\r
+                       assert title.startsWith("# RESIDUE      DYDX") : ">Sequence_name must follow column title: # RESIDUE DYDX RAW SMOOTHED!";\r
+\r
+                       // Each row: residue token, then the dydx, raw and smoothed values\r
+                       while (scansingle.hasNext()) {\r
+                               seqbuffer.append(scansingle.next());\r
+                               dydxScore.add(scansingle.nextFloat());\r
+                               rawScore.add(scansingle.nextFloat());\r
+                               smoothedScore.add(scansingle.nextFloat());\r
+                       }\r
+                       FastaSequence fs = new FastaSequence(sequenceName,\r
+                                       seqbuffer.toString());\r
+                       HashSet<Score> scores = new HashSet<Score>();\r
+                       scores.add(new Score(GlobProtResult.Disorder, disorderR));\r
+                       scores.add(new Score(GlobProtResult.GlobDoms, domsR));\r
+                       scores.add(new Score(GlobProtResult.Dydx, dydxScore));\r
+                       scores.add(new Score(GlobProtResult.RawScore, rawScore));\r
+                       scores.add(new Score(GlobProtResult.SmoothedScore, smoothedScore));\r
+                       results.put(fs, scores);\r
+\r
+                       scansingle.close();\r
                 }\r
-               sname = line; // remove >\r
-               seqstr = "";\r
-           } else {\r
-               final String subseq = pattern.matcher(line).replaceAll("");\r
-               seqstr += subseq;\r
-           }\r
-       } while (line != null);\r
-\r
-       infasta.close();\r
-       return seqs;\r
-    }\r
-\r
-    /**\r
-     * Writes FastaSequence in the file, each sequence will take one line only\r
-     * \r
-     * @param os\r
-     * @param sequences\r
-     * @throws IOException\r
-     */\r
-    public static void writeFasta(final OutputStream os,\r
-           final List<FastaSequence> sequences) throws IOException {\r
-       final OutputStreamWriter outWriter = new OutputStreamWriter(os);\r
-       final BufferedWriter fasta_out = new BufferedWriter(outWriter);\r
-       for (final FastaSequence fs : sequences) {\r
-           fasta_out.write(fs.getOnelineFasta());\r
+               scan.close();\r
+               input.close();\r
+               return results;\r
         }\r
-       fasta_out.close();\r
-       outWriter.close();\r
-    }\r
-\r
-    public static List<AnnotatedSequence> readJRonn(final File result)\r
-           throws IOException, UnknownFileFormatException {\r
-       InputStream input = new FileInputStream(result);\r
-       List<AnnotatedSequence> sequences = readJRonn(input);\r
-       input.close();\r
-       return sequences;\r
-    }\r
-\r
-    /**\r
-     * Reader for JRonn horizontal file format\r
-     * \r
-     * >Foobar\r
-     * \r
-     * M G D T T A G\r
-     * \r
-     * 0.48 0.42 0.42 0.48 0.52 0.53 0.54\r
-     * \r
-     * All values are tab delimited\r
-     * \r
-     * @param inStream\r
-     * @return\r
-     * @throws IOException\r
-     * @throws UnknownFileFormatException\r
-     */\r
-    public static List<AnnotatedSequence> readJRonn(final InputStream inStream)\r
-           throws IOException, UnknownFileFormatException {\r
-       final List<AnnotatedSequence> seqs = new ArrayList<AnnotatedSequence>();\r
-\r
-       final BufferedReader infasta = new BufferedReader(\r
-               new InputStreamReader(inStream, "UTF8"), 16000);\r
-\r
-       String line;\r
-       String sname = "";\r
-       do {\r
-           line = infasta.readLine();\r
-           if (line == null || line.isEmpty()) {\r
-               // skip empty lines\r
-               continue;\r
-           }\r
-           if (line.startsWith(">")) {\r
-               // read name\r
-               sname = line.trim().substring(1);\r
-               // read sequence line\r
-               line = infasta.readLine();\r
-               final String sequence = line.replace("\t", "");\r
-               // read annotation line\r
-               line = infasta.readLine();\r
-               String[] annotValues = line.split("\t");\r
-               float[] annotation = convertToNumber(annotValues);\r
-               if (annotation.length != sequence.length()) {\r
-                   throw new UnknownFileFormatException(\r
-                           "File does not look like Jronn horizontally formatted output file!\n"\r
-                                   + JRONN_WRONG_FORMAT_MESSAGE);\r
+       /**\r
+        * Reads AACon results which contain no alignment. The input is split into\r
+        * records on the '#' character; in each record the text before the first\r
+        * space is taken as the conservation method name and the numbers that\r
+        * follow it as the values for that method. This method leaves the\r
+        * incoming InputStream open!\r
+        * \r
+        * @param results\r
+        *            output file of AAConservation\r
+        * @return set with a {@link Score} (method and its values) for each record\r
+        *         read\r
+        * @throws NullPointerException\r
+        *             if results is null\r
+        */\r
+       public static HashSet<Score> readAAConResults(InputStream results) {\r
+               if (results == null) {\r
+                       throw new NullPointerException(\r
+                                       "InputStream with results must be provided");\r
+               }\r
+               HashSet<Score> annotations = new HashSet<Score>();\r
+               Scanner sc = new Scanner(results);\r
+               sc.useDelimiter("#");\r
+               while (sc.hasNext()) {\r
+                       String line = sc.next();\r
+                       int spacePos = line.indexOf(" ");\r
+                       // NOTE(review): checked only with assertions enabled (-ea); without\r
+                       // them a record with no space reaches substring(0, -1) below and\r
+                       // fails there\r
+                       assert spacePos > 0 : "Space is expected as delimited between method "\r
+                                       + "name and values!";\r
+                       String methodLine = line.substring(0, spacePos);\r
+                       ConservationMethod method = ConservationMethod\r
+                                       .getMethod(methodLine);\r
+                       assert method != null : "Method " + methodLine\r
+                                       + " is not recognized! ";\r
+                       Scanner valuesScanner = new Scanner(line.substring(spacePos));\r
+                       ArrayList<Float> values = new ArrayList<Float>();\r
+                       // Values are read as doubles and narrowed to float for storage\r
+                       while (valuesScanner.hasNextDouble()) {\r
+                               Double value = valuesScanner.nextDouble();\r
+                               values.add(value.floatValue());\r
+                       }\r
+                       annotations.add(new Score(method, values));\r
                 }\r
-               seqs.add(new AnnotatedSequence(sname, sequence, annotation));\r
-           }\r
-       } while (line != null);\r
-\r
-       infasta.close();\r
-       return seqs;\r
-    }\r
-\r
-    private static float[] convertToNumber(String[] annotValues)\r
-           throws UnknownFileFormatException {\r
-       float[] annotation = new float[annotValues.length];\r
-       try {\r
-           for (int i = 0; i < annotation.length; i++) {\r
-               annotation[i] = Float.parseFloat(annotValues[i]);\r
-           }\r
-       } catch (NumberFormatException e) {\r
-           throw new UnknownFileFormatException(JRONN_WRONG_FORMAT_MESSAGE, e\r
-                   .getCause());\r
+               return annotations;\r
         }\r
-       return annotation;\r
-    }\r
-\r
-    private static final String JRONN_WRONG_FORMAT_MESSAGE = "Jronn file must be in the following format:\n"\r
-           + ">sequence_name\n "\r
-           + "M        V       S\n"\r
-           + "0.43     0.22    0.65\n"\r
-           + "Where first line is the sequence name,\n"\r
-           + "second line is the tab delimited sequence,\n"\r
-           + "third line contains tab delimited disorder prediction values.\n"\r
-           + "No lines are allowed between these three. Additionally, the number of  "\r
-           + "sequence residues must be equal to the number of the disorder values.";\r
-\r
-    /**\r
-     * Closes the Closable and logs the exception if any\r
-     * \r
-     * @param log\r
-     * @param stream\r
-     */\r
-    public final static void closeSilently(java.util.logging.Logger log,\r
-           Closeable stream) {\r
-       if (stream != null) {\r
-           try {\r
-               stream.close();\r
-           } catch (IOException e) {\r
-               log.log(Level.WARNING, e.getLocalizedMessage(), e.getCause());\r
-           }\r
+\r
+       /**\r
+        * Reads and parses a Fasta or Clustal formatted file into a list of\r
+        * FastaSequence objects. The file is opened twice: one stream is used to\r
+        * test whether the content is a valid Clustal file, the other one to read\r
+        * the sequences with the matching reader.\r
+        * \r
+        * @param inFilePath\r
+        *            the path to the input file\r
+        * @throws IOException\r
+        *             if the file denoted by inFilePath cannot be read\r
+        * @throws UnknownFileFormatException\r
+        *             if the inFilePath points to a file whose format cannot be\r
+        *             recognised\r
+        * @return the List of FastaSequence objects\r
+        * \r
+        */\r
+       public static List<FastaSequence> openInputStream(String inFilePath)\r
+                       throws IOException, UnknownFileFormatException {\r
+\r
+               // NOTE(review): nothing in this method closes either stream; closing\r
+               // relies on the callees as stated below - confirm that they also close\r
+               // the streams when they fail\r
+               // This stream gets closed in isValidClustalFile method\r
+               InputStream inStrForValidation = new FileInputStream(inFilePath);\r
+               // This stream is closed in the calling methods\r
+               // (presumably the readers called below: readClustalFile / readFasta)\r
+               InputStream inStr = new FileInputStream(inFilePath);\r
+               List<FastaSequence> fastaSeqs = null;\r
+               if (ClustalAlignmentUtil.isValidClustalFile(inStrForValidation)) {\r
+                       Alignment al = ClustalAlignmentUtil.readClustalFile(inStr);\r
+                       // alignment cannot be null see\r
+                       // ClustalAlignmentUtil.readClustalFile(inStr);\r
+                       fastaSeqs = al.getSequences();\r
+               } else {\r
+                       fastaSeqs = SequenceUtil.readFasta(inStr);\r
+               }\r
+               return fastaSeqs;\r
         }\r
-    }\r
-\r
-    public static List<AnnotatedSequence> readDisembl(final File result)\r
-           throws IOException, UnknownFileFormatException {\r
-       InputStream input = new FileInputStream(result);\r
-       List<AnnotatedSequence> sequences = readJRonn(input);\r
-       input.close();\r
-       return sequences;\r
-    }\r
+\r
+}\r
+\r
+enum DisemblResult {\r
+       /** These contain ranges and scores */\r
+       COILS, REM465, HOTLOOPS\r
  }\r
+enum GlobProtResult {\r
+       /** This is a range with no scores */\r
+       GlobDoms,\r
+       /** This is a range with no scores */\r
+       Disorder,\r
+       /** This is a score with no range */\r
+       Dydx,\r
+       /** This is a score with no range */\r
+       SmoothedScore,\r
+       /** This is a score with no range */\r
+       RawScore\r
+}
\ No newline at end of file