Add JRonn runner, tester, methods to parse jronn output files.

[jabaws.git] / datamodel / compbio / data / sequence / SequenceUtil.java
diff --git a/datamodel/compbio/data/sequence/SequenceUtil.java b/datamodel/compbio/data/sequence/SequenceUtil.java

index 1a3ce5b..f7c923a 100644 (file)
--- a/datamodel/compbio/data/sequence/SequenceUtil.java
+++ b/datamodel/compbio/data/sequence/SequenceUtil.java
@@ -1,6 +1,9 @@
-/* Copyright (c) 2009 Peter Troshin\r
+/* \r
+ * @(#)SequenceUtil.java 1.0 September 2009\r
+ * \r
+ * Copyright (c) 2009 Peter Troshin\r
   *  \r
- *  JAva Bioinformatics Analysis Web Services (JABAWS) @version: 1.0\r
+ * Jalview Web Services version: 2.0     \r
   * \r
   *  This library is free software; you can redistribute it and/or modify it under the terms of the\r
   *  Apache License version 2 as published by the Apache Software Foundation\r
@@ -10,7 +13,7 @@
   *  License for more details.\r
   * \r
   *  A copy of the license is in apache_license.txt. It is also available here:\r
- * @see: http://www.apache.org/licenses/LICENSE-2.0.txt\r
+ * see: http://www.apache.org/licenses/LICENSE-2.0.txt\r
   * \r
   * Any republication or derived work distributed in source code form\r
   * must include this copyright and license notice.\r
@@ -21,6 +24,8 @@ package compbio.data.sequence;
  import java.io.BufferedReader;\r
  import java.io.BufferedWriter;\r
  import java.io.Closeable;\r
+import java.io.File;\r
+import java.io.FileInputStream;\r
  import java.io.IOException;\r
  import java.io.InputStream;\r
  import java.io.InputStreamReader;\r
@@ -35,9 +40,8 @@ import java.util.regex.Pattern;
  /**\r
   * Utility class for operations on sequences\r
   * \r
- * @author pvtroshin\r
- * \r
- *         Date September 2009\r
+ * @author Petr Troshin\r
+ * @version 1.0\r
   */\r
  public final class SequenceUtil {\r
  \r
@@ -111,8 +115,8 @@ public final class SequenceUtil {
      /**\r
       * @return true is the sequence contains only letters a,c, t, g, u\r
       */\r
-    public static boolean isNucleotideSequence(FastaSequence s) {\r
-       return isNonAmbNucleotideSequence(s.getSequence());\r
+    public static boolean isNucleotideSequence(final FastaSequence s) {\r
+       return SequenceUtil.isNonAmbNucleotideSequence(s.getSequence());\r
      }\r
  \r
      /**\r
@@ -120,11 +124,11 @@ public final class SequenceUtil {
       * (!) - B char\r
       */\r
      public static boolean isNonAmbNucleotideSequence(String sequence) {\r
-       sequence = cleanSequence(sequence);\r
-       if (DIGIT.matcher(sequence).find()) {\r
+       sequence = SequenceUtil.cleanSequence(sequence);\r
+       if (SequenceUtil.DIGIT.matcher(sequence).find()) {\r
             return false;\r
         }\r
-       if (NON_NUCLEOTIDE.matcher(sequence).find()) {\r
+       if (SequenceUtil.NON_NUCLEOTIDE.matcher(sequence).find()) {\r
             return false;\r
             /*\r
              * System.out.format("I found the text starting at " +\r
@@ -132,7 +136,7 @@ public final class SequenceUtil {
              * nonDNAmatcher.end());\r
              */\r
         }\r
-       Matcher DNAmatcher = NUCLEOTIDE.matcher(sequence);\r
+       final Matcher DNAmatcher = SequenceUtil.NUCLEOTIDE.matcher(sequence);\r
         return DNAmatcher.find();\r
      }\r
  \r
@@ -144,7 +148,7 @@ public final class SequenceUtil {
       */\r
      public static String cleanSequence(String sequence) {\r
         assert sequence != null;\r
-       final Matcher m = WHITE_SPACE.matcher(sequence);\r
+       final Matcher m = SequenceUtil.WHITE_SPACE.matcher(sequence);\r
         sequence = m.replaceAll("").toUpperCase();\r
         return sequence;\r
      }\r
@@ -157,10 +161,10 @@ public final class SequenceUtil {
       * @return cleaned up sequence\r
       */\r
      public static String deepCleanSequence(String sequence) {\r
-       sequence = cleanSequence(sequence);\r
-       sequence = DIGIT.matcher(sequence).replaceAll("");\r
-       sequence = NONWORD.matcher(sequence).replaceAll("");\r
-       Pattern othernonSeqChars = Pattern.compile("[_-]+");\r
+       sequence = SequenceUtil.cleanSequence(sequence);\r
+       sequence = SequenceUtil.DIGIT.matcher(sequence).replaceAll("");\r
+       sequence = SequenceUtil.NONWORD.matcher(sequence).replaceAll("");\r
+       final Pattern othernonSeqChars = Pattern.compile("[_-]+");\r
         sequence = othernonSeqChars.matcher(sequence).replaceAll("");\r
         return sequence;\r
      }\r
@@ -171,17 +175,17 @@ public final class SequenceUtil {
       * @return true is the sequence is a protein sequence, false overwise\r
       */\r
      public static boolean isProteinSequence(String sequence) {\r
-       sequence = cleanSequence(sequence);\r
-       if (isNonAmbNucleotideSequence(sequence)) {\r
+       sequence = SequenceUtil.cleanSequence(sequence);\r
+       if (SequenceUtil.isNonAmbNucleotideSequence(sequence)) {\r
             return false;\r
         }\r
-       if (DIGIT.matcher(sequence).find()) {\r
+       if (SequenceUtil.DIGIT.matcher(sequence).find()) {\r
             return false;\r
         }\r
-       if (NON_AA.matcher(sequence).find()) {\r
+       if (SequenceUtil.NON_AA.matcher(sequence).find()) {\r
             return false;\r
         }\r
-       Matcher protmatcher = AA.matcher(sequence);\r
+       final Matcher protmatcher = SequenceUtil.AA.matcher(sequence);\r
         return protmatcher.find();\r
      }\r
  \r
@@ -194,20 +198,20 @@ public final class SequenceUtil {
       *         protein or DNA\r
       */\r
      public static boolean isAmbiguosProtein(String sequence) {\r
-       sequence = cleanSequence(sequence);\r
-       if (isNonAmbNucleotideSequence(sequence)) {\r
+       sequence = SequenceUtil.cleanSequence(sequence);\r
+       if (SequenceUtil.isNonAmbNucleotideSequence(sequence)) {\r
             return false;\r
         }\r
-       if (DIGIT.matcher(sequence).find()) {\r
+       if (SequenceUtil.DIGIT.matcher(sequence).find()) {\r
             return false;\r
         }\r
-       if (NON_AA.matcher(sequence).find()) {\r
+       if (SequenceUtil.NON_AA.matcher(sequence).find()) {\r
             return false;\r
         }\r
-       if (AA.matcher(sequence).find()) {\r
+       if (SequenceUtil.AA.matcher(sequence).find()) {\r
             return false;\r
         }\r
-       Matcher amb_prot = AMBIGUOUS_AA.matcher(sequence);\r
+       final Matcher amb_prot = SequenceUtil.AMBIGUOUS_AA.matcher(sequence);\r
         return amb_prot.find();\r
      }\r
  \r
@@ -221,12 +225,13 @@ public final class SequenceUtil {
       *            - the maximum number of characters to write in one line\r
       * @throws IOException\r
       */\r
-    public static void writeFasta(OutputStream outstream,\r
-           List<FastaSequence> sequences, int width) throws IOException {\r
-       OutputStreamWriter writer = new OutputStreamWriter(outstream);\r
-       BufferedWriter fastawriter = new BufferedWriter(writer);\r
-       for (FastaSequence fs : sequences) {\r
-           fastawriter.write(fs.getOnelineFasta());\r
+    public static void writeFasta(final OutputStream outstream,\r
+           final List<FastaSequence> sequences, final int width)\r
+           throws IOException {\r
+       final OutputStreamWriter writer = new OutputStreamWriter(outstream);\r
+       final BufferedWriter fastawriter = new BufferedWriter(writer);\r
+       for (final FastaSequence fs : sequences) {\r
+           fastawriter.write(fs.getFormatedSequence(width));\r
         }\r
         outstream.flush();\r
         fastawriter.close();\r
@@ -242,28 +247,30 @@ public final class SequenceUtil {
       * @return list of FastaSequence objects\r
       * @throws IOException\r
       */\r
-    public static List<FastaSequence> readFasta(InputStream inStream)\r
+    public static List<FastaSequence> readFasta(final InputStream inStream)\r
             throws IOException {\r
-       List<FastaSequence> seqs = new ArrayList<FastaSequence>();\r
-       InputStreamReader inReader = new InputStreamReader(inStream);\r
-       BufferedReader infasta = new BufferedReader(inReader);\r
-       Pattern pattern = Pattern.compile("//s+");\r
+       final List<FastaSequence> seqs = new ArrayList<FastaSequence>();\r
+\r
+       final BufferedReader infasta = new BufferedReader(\r
+               new InputStreamReader(inStream, "UTF8"), 16000);\r
+       final Pattern pattern = Pattern.compile("\\s+"); // whitespace inside sequence lines\r
  \r
         String line;\r
         String sname = "", seqstr = null;\r
         do {\r
             line = infasta.readLine();\r
-           if (line == null || line.startsWith(">")) {\r
-               if (seqstr != null)\r
+           if ((line == null) || line.startsWith(">")) {\r
+               if (seqstr != null) {\r
                     seqs.add(new FastaSequence(sname.substring(1), seqstr));\r
+               }\r
                 sname = line; // remove >\r
                 seqstr = "";\r
             } else {\r
-               String subseq = pattern.matcher(line).replaceAll("");\r
+               final String subseq = pattern.matcher(line).replaceAll("");\r
                 seqstr += subseq;\r
             }\r
         } while (line != null);\r
-       inReader.close();\r
+\r
         infasta.close();\r
         return seqs;\r
      }\r
@@ -275,17 +282,103 @@ public final class SequenceUtil {
       * @param sequences\r
       * @throws IOException\r
       */\r
-    public static void writeFasta(OutputStream os, List<FastaSequence> sequences)\r
-           throws IOException {\r
-       OutputStreamWriter outWriter = new OutputStreamWriter(os);\r
-       BufferedWriter fasta_out = new BufferedWriter(outWriter);\r
-       for (FastaSequence fs : sequences) {\r
+    public static void writeFasta(final OutputStream os,\r
+           final List<FastaSequence> sequences) throws IOException {\r
+       final OutputStreamWriter outWriter = new OutputStreamWriter(os);\r
+       final BufferedWriter fasta_out = new BufferedWriter(outWriter);\r
+       for (final FastaSequence fs : sequences) {\r
             fasta_out.write(fs.getOnelineFasta());\r
         }\r
         fasta_out.close();\r
         outWriter.close();\r
      }\r
  \r
+    /**\r
+     * Reads JRonn horizontally formatted results from a file.\r
+     * \r
+     * @param result\r
+     *            the JRonn output file to read\r
+     * @return the list of annotated sequences parsed from the file\r
+     * @throws IOException\r
+     *             if the file cannot be opened or read\r
+     * @throws UnknownFileFormatException\r
+     *             if the content is not in the JRonn horizontal format\r
+     */\r
+    public static List<AnnotatedSequence> readJRonn(final File result)\r
+           throws IOException, UnknownFileFormatException {\r
+       final InputStream input = new FileInputStream(result);\r
+       try {\r
+           return readJRonn(input);\r
+       } finally {\r
+           // close even when parsing fails, so the file handle is not leaked\r
+           input.close();\r
+       }\r
+    }\r
+\r
+    /**\r
+     * Reads JRonn results in the horizontal file format. Each record consists\r
+     * of three consecutive lines, for example:\r
+     * \r
+     * <pre>\r
+     * &gt;Foobar\r
+     * M G D T T A G\r
+     * 0.48 0.42 0.42 0.48 0.52 0.53 0.54\r
+     * </pre>\r
+     * \r
+     * The values on the second and third line are tab delimited. Empty lines\r
+     * and lines that do not start a record are skipped. The reader wrapping\r
+     * the stream is closed before this method returns, whether or not parsing\r
+     * succeeded.\r
+     * \r
+     * @param inStream\r
+     *            the stream to read the JRonn output from, decoded as UTF-8\r
+     * @return the list of annotated sequences, in the order they were read\r
+     * @throws IOException\r
+     *             if reading from the stream fails\r
+     * @throws UnknownFileFormatException\r
+     *             if a record is incomplete, an annotation value is not a\r
+     *             number, or the number of values differs from the number of\r
+     *             residues\r
+     */\r
+    public static List<AnnotatedSequence> readJRonn(final InputStream inStream)\r
+           throws IOException, UnknownFileFormatException {\r
+       final List<AnnotatedSequence> seqs = new ArrayList<AnnotatedSequence>();\r
+\r
+       final BufferedReader infasta = new BufferedReader(\r
+               new InputStreamReader(inStream, "UTF8"), 16000);\r
+       try {\r
+           String line;\r
+           while ((line = infasta.readLine()) != null) {\r
+               if (!line.startsWith(">")) {\r
+                   // skip empty lines and anything that is not a record header\r
+                   continue;\r
+               }\r
+               // name line, without the leading '>'\r
+               final String sname = line.trim().substring(1);\r
+               final String sequenceLine = infasta.readLine();\r
+               final String annotationLine = infasta.readLine();\r
+               if ((sequenceLine == null) || (annotationLine == null)) {\r
+                   // the input ended in the middle of a record\r
+                   throw new UnknownFileFormatException(\r
+                           "File does not look like Jronn horizontally formatted output file!\n"\r
+                                   + JRONN_WRONG_FORMAT_MESSAGE);\r
+               }\r
+               final String sequence = sequenceLine.replace("\t", "");\r
+               final float[] annotation = convertToNumber(annotationLine\r
+                       .split("\t"));\r
+               if (annotation.length != sequence.length()) {\r
+                   throw new UnknownFileFormatException(\r
+                           "File does not look like Jronn horizontally formatted output file!\n"\r
+                                   + JRONN_WRONG_FORMAT_MESSAGE);\r
+               }\r
+               seqs.add(new AnnotatedSequence(sname, sequence, annotation));\r
+           }\r
+       } finally {\r
+           // release the reader (and the wrapped stream) on failure as well\r
+           infasta.close();\r
+       }\r
+       return seqs;\r
+    }\r
+\r
+    /**\r
+     * Converts the textual annotation values into numbers.\r
+     * \r
+     * @param annotValues\r
+     *            the annotation values, one string per value\r
+     * @return the parsed values, in the same order as the input\r
+     * @throws UnknownFileFormatException\r
+     *             if any of the values cannot be parsed as a float\r
+     */\r
+    private static float[] convertToNumber(String[] annotValues)\r
+           throws UnknownFileFormatException {\r
+       float[] annotation = new float[annotValues.length];\r
+       try {\r
+           for (int i = 0; i < annotation.length; i++) {\r
+               annotation[i] = Float.parseFloat(annotValues[i]);\r
+           }\r
+       } catch (NumberFormatException e) {\r
+           // chain the exception itself: its getCause() is null, so passing\r
+           // the cause would drop the information about the offending value\r
+           throw new UnknownFileFormatException(JRONN_WRONG_FORMAT_MESSAGE, e);\r
+       }\r
+       return annotation;\r
+    }\r
+\r
+    /**\r
+     * Description of the expected JRonn horizontal format. Used as the\r
+     * message, or the tail of the message, of the UnknownFileFormatException\r
+     * thrown when JRonn output cannot be parsed.\r
+     * \r
+     * NOTE(review): the name line of the example ends with a newline followed\r
+     * by a space, so the example sequence line starts with a space - confirm\r
+     * whether that is intended.\r
+     */\r
+    private static final String JRONN_WRONG_FORMAT_MESSAGE = "Jronn file must be in the following format:\n"\r
+           + ">sequence_name\n "\r
+           + "M        V       S\n"\r
+           + "0.43     0.22    0.65\n"\r
+           + "Where first line is the sequence name,\n"\r
+           + "second line is the tab delimited sequence,\n"\r
+           + "third line contains tab delimited disorder prediction values.\n"\r
+           + "No lines are allowed between these three. Additionally, the number of  "\r
+           + "sequence residues must be equal to the number of the disorder values.";\r
+\r
      /**\r
       * Closes the Closable and logs the exception if any\r
       * \r