Add JRonn runner, tester, methods to parse jronn output files.
[jabaws.git] / datamodel / compbio / data / sequence / SequenceUtil.java
index 1a3ce5b..f7c923a 100644 (file)
@@ -1,6 +1,9 @@
-/* Copyright (c) 2009 Peter Troshin\r
+/* \r
+ * @(#)SequenceUtil.java 1.0 September 2009\r
+ * \r
+ * Copyright (c) 2009 Peter Troshin\r
  *  \r
- *  JAva Bioinformatics Analysis Web Services (JABAWS) @version: 1.0\r
+ * Jalview Web Services version: 2.0     \r
  * \r
  *  This library is free software; you can redistribute it and/or modify it under the terms of the\r
  *  Apache License version 2 as published by the Apache Software Foundation\r
@@ -10,7 +13,7 @@
  *  License for more details.\r
  * \r
  *  A copy of the license is in apache_license.txt. It is also available here:\r
- * @see: http://www.apache.org/licenses/LICENSE-2.0.txt\r
+ * see: http://www.apache.org/licenses/LICENSE-2.0.txt\r
  * \r
  * Any republication or derived work distributed in source code form\r
  * must include this copyright and license notice.\r
@@ -21,6 +24,8 @@ package compbio.data.sequence;
 import java.io.BufferedReader;\r
 import java.io.BufferedWriter;\r
 import java.io.Closeable;\r
+import java.io.File;\r
+import java.io.FileInputStream;\r
 import java.io.IOException;\r
 import java.io.InputStream;\r
 import java.io.InputStreamReader;\r
@@ -35,9 +40,8 @@ import java.util.regex.Pattern;
 /**\r
  * Utility class for operations on sequences\r
  * \r
- * @author pvtroshin\r
- * \r
- *         Date September 2009\r
+ * @author Petr Troshin\r
+ * @version 1.0\r
  */\r
 public final class SequenceUtil {\r
 \r
@@ -111,8 +115,8 @@ public final class SequenceUtil {
     /**\r
      * @return true if the sequence contains only letters a, c, t, g, u\r
      */\r
-    public static boolean isNucleotideSequence(FastaSequence s) {\r
-       return isNonAmbNucleotideSequence(s.getSequence());\r
+    public static boolean isNucleotideSequence(final FastaSequence s) {\r
+       return SequenceUtil.isNonAmbNucleotideSequence(s.getSequence());\r
     }\r
 \r
     /**\r
@@ -120,11 +124,11 @@ public final class SequenceUtil {
      * (!) - B char\r
      */\r
     public static boolean isNonAmbNucleotideSequence(String sequence) {\r
-       sequence = cleanSequence(sequence);\r
-       if (DIGIT.matcher(sequence).find()) {\r
+       sequence = SequenceUtil.cleanSequence(sequence);\r
+       if (SequenceUtil.DIGIT.matcher(sequence).find()) {\r
            return false;\r
        }\r
-       if (NON_NUCLEOTIDE.matcher(sequence).find()) {\r
+       if (SequenceUtil.NON_NUCLEOTIDE.matcher(sequence).find()) {\r
            return false;\r
            /*\r
             * System.out.format("I found the text starting at " +\r
@@ -132,7 +136,7 @@ public final class SequenceUtil {
             * nonDNAmatcher.end());\r
             */\r
        }\r
-       Matcher DNAmatcher = NUCLEOTIDE.matcher(sequence);\r
+       final Matcher DNAmatcher = SequenceUtil.NUCLEOTIDE.matcher(sequence);\r
        return DNAmatcher.find();\r
     }\r
 \r
@@ -144,7 +148,7 @@ public final class SequenceUtil {
      */\r
     public static String cleanSequence(String sequence) {\r
        assert sequence != null;\r
-       final Matcher m = WHITE_SPACE.matcher(sequence);\r
+       final Matcher m = SequenceUtil.WHITE_SPACE.matcher(sequence);\r
        sequence = m.replaceAll("").toUpperCase();\r
        return sequence;\r
     }\r
@@ -157,10 +161,10 @@ public final class SequenceUtil {
      * @return cleaned up sequence\r
      */\r
     public static String deepCleanSequence(String sequence) {\r
-       sequence = cleanSequence(sequence);\r
-       sequence = DIGIT.matcher(sequence).replaceAll("");\r
-       sequence = NONWORD.matcher(sequence).replaceAll("");\r
-       Pattern othernonSeqChars = Pattern.compile("[_-]+");\r
+       sequence = SequenceUtil.cleanSequence(sequence);\r
+       sequence = SequenceUtil.DIGIT.matcher(sequence).replaceAll("");\r
+       sequence = SequenceUtil.NONWORD.matcher(sequence).replaceAll("");\r
+       final Pattern othernonSeqChars = Pattern.compile("[_-]+");\r
        sequence = othernonSeqChars.matcher(sequence).replaceAll("");\r
        return sequence;\r
     }\r
@@ -171,17 +175,17 @@ public final class SequenceUtil {
      * @return true if the sequence is a protein sequence, false otherwise\r
      */\r
     public static boolean isProteinSequence(String sequence) {\r
-       sequence = cleanSequence(sequence);\r
-       if (isNonAmbNucleotideSequence(sequence)) {\r
+       sequence = SequenceUtil.cleanSequence(sequence);\r
+       if (SequenceUtil.isNonAmbNucleotideSequence(sequence)) {\r
            return false;\r
        }\r
-       if (DIGIT.matcher(sequence).find()) {\r
+       if (SequenceUtil.DIGIT.matcher(sequence).find()) {\r
            return false;\r
        }\r
-       if (NON_AA.matcher(sequence).find()) {\r
+       if (SequenceUtil.NON_AA.matcher(sequence).find()) {\r
            return false;\r
        }\r
-       Matcher protmatcher = AA.matcher(sequence);\r
+       final Matcher protmatcher = SequenceUtil.AA.matcher(sequence);\r
        return protmatcher.find();\r
     }\r
 \r
@@ -194,20 +198,20 @@ public final class SequenceUtil {
      *         protein or DNA\r
      */\r
     public static boolean isAmbiguosProtein(String sequence) {\r
-       sequence = cleanSequence(sequence);\r
-       if (isNonAmbNucleotideSequence(sequence)) {\r
+       sequence = SequenceUtil.cleanSequence(sequence);\r
+       if (SequenceUtil.isNonAmbNucleotideSequence(sequence)) {\r
            return false;\r
        }\r
-       if (DIGIT.matcher(sequence).find()) {\r
+       if (SequenceUtil.DIGIT.matcher(sequence).find()) {\r
            return false;\r
        }\r
-       if (NON_AA.matcher(sequence).find()) {\r
+       if (SequenceUtil.NON_AA.matcher(sequence).find()) {\r
            return false;\r
        }\r
-       if (AA.matcher(sequence).find()) {\r
+       if (SequenceUtil.AA.matcher(sequence).find()) {\r
            return false;\r
        }\r
-       Matcher amb_prot = AMBIGUOUS_AA.matcher(sequence);\r
+       final Matcher amb_prot = SequenceUtil.AMBIGUOUS_AA.matcher(sequence);\r
        return amb_prot.find();\r
     }\r
 \r
@@ -221,12 +225,13 @@ public final class SequenceUtil {
      *            - the maximum number of characters to write in one line\r
      * @throws IOException\r
      */\r
-    public static void writeFasta(OutputStream outstream,\r
-           List<FastaSequence> sequences, int width) throws IOException {\r
-       OutputStreamWriter writer = new OutputStreamWriter(outstream);\r
-       BufferedWriter fastawriter = new BufferedWriter(writer);\r
-       for (FastaSequence fs : sequences) {\r
-           fastawriter.write(fs.getOnelineFasta());\r
+    public static void writeFasta(final OutputStream outstream,\r
+           final List<FastaSequence> sequences, final int width)\r
+           throws IOException {\r
+       final OutputStreamWriter writer = new OutputStreamWriter(outstream);\r
+       final BufferedWriter fastawriter = new BufferedWriter(writer);\r
+       for (final FastaSequence fs : sequences) {\r
+           fastawriter.write(fs.getFormatedSequence(width));\r
        }\r
        outstream.flush();\r
        fastawriter.close();\r
@@ -242,28 +247,30 @@ public final class SequenceUtil {
      * @return list of FastaSequence objects\r
      * @throws IOException\r
      */\r
-    public static List<FastaSequence> readFasta(InputStream inStream)\r
+    public static List<FastaSequence> readFasta(final InputStream inStream)\r
            throws IOException {\r
-       List<FastaSequence> seqs = new ArrayList<FastaSequence>();\r
-       InputStreamReader inReader = new InputStreamReader(inStream);\r
-       BufferedReader infasta = new BufferedReader(inReader);\r
-       Pattern pattern = Pattern.compile("//s+");\r
+       final List<FastaSequence> seqs = new ArrayList<FastaSequence>();\r
+\r
+       final BufferedReader infasta = new BufferedReader(\r
+               new InputStreamReader(inStream, "UTF8"), 16000);\r
+       // "\\s+" matches runs of whitespace; the former "//s+" only matched\r
+       // two slashes followed by 's' characters, so nothing was ever stripped\r
+       final Pattern pattern = Pattern.compile("\\s+");\r
 \r
        String line;\r
        String sname = "", seqstr = null;\r
        do {\r
            line = infasta.readLine();\r
-           if (line == null || line.startsWith(">")) {\r
-               if (seqstr != null)\r
+           if ((line == null) || line.startsWith(">")) {\r
+               if (seqstr != null) {\r
                    seqs.add(new FastaSequence(sname.substring(1), seqstr));\r
+               }\r
                sname = line; // remove >\r
                seqstr = "";\r
            } else {\r
-               String subseq = pattern.matcher(line).replaceAll("");\r
+               final String subseq = pattern.matcher(line).replaceAll("");\r
                seqstr += subseq;\r
            }\r
        } while (line != null);\r
-       inReader.close();\r
+\r
        infasta.close();\r
        return seqs;\r
     }\r
@@ -275,17 +282,103 @@ public final class SequenceUtil {
      * @param sequences\r
      * @throws IOException\r
      */\r
-    public static void writeFasta(OutputStream os, List<FastaSequence> sequences)\r
-           throws IOException {\r
-       OutputStreamWriter outWriter = new OutputStreamWriter(os);\r
-       BufferedWriter fasta_out = new BufferedWriter(outWriter);\r
-       for (FastaSequence fs : sequences) {\r
+    public static void writeFasta(final OutputStream os,\r
+           final List<FastaSequence> sequences) throws IOException {\r
+       final OutputStreamWriter outWriter = new OutputStreamWriter(os);\r
+       final BufferedWriter fasta_out = new BufferedWriter(outWriter);\r
+       for (final FastaSequence fs : sequences) {\r
            fasta_out.write(fs.getOnelineFasta());\r
        }\r
        fasta_out.close();\r
        outWriter.close();\r
     }\r
 \r
+    /**\r
+     * Reads a JRonn horizontally formatted result file.\r
+     * \r
+     * @param result\r
+     *            the JRonn output file to parse\r
+     * @return the sequences read from the file together with their annotation\r
+     *         values\r
+     * @throws IOException\r
+     *             if the file cannot be opened or read\r
+     * @throws UnknownFileFormatException\r
+     *             if the file content is not in the JRonn horizontal format\r
+     */\r
+    public static List<AnnotatedSequence> readJRonn(final File result)\r
+           throws IOException, UnknownFileFormatException {\r
+       final InputStream input = new FileInputStream(result);\r
+       try {\r
+           return SequenceUtil.readJRonn(input);\r
+       } finally {\r
+           // release the file handle even when parsing fails\r
+           input.close();\r
+       }\r
+    }\r
+\r
+    /**\r
+     * Reader for JRonn horizontal file format\r
+     * \r
+     * <pre>\r
+     * &gt;Foobar\r
+     * M G D T T A G\r
+     * 0.48 0.42 0.42 0.48 0.52 0.53 0.54\r
+     * </pre>\r
+     * \r
+     * All values are tab delimited. Each record consists of a name line\r
+     * starting with '&gt;', immediately followed by the sequence line and the\r
+     * annotation line. Empty lines and lines outside a record are skipped.\r
+     * The reader (and therefore the stream) is closed before returning,\r
+     * whether or not parsing succeeds.\r
+     * \r
+     * @param inStream\r
+     *            the stream to read the JRonn output from, decoded as UTF-8\r
+     * @return the list of sequences with their annotation values, in the\r
+     *         order they appear in the stream\r
+     * @throws IOException\r
+     *             if reading from the stream fails\r
+     * @throws UnknownFileFormatException\r
+     *             if a record is truncated, an annotation value is not a\r
+     *             number, or the number of annotation values differs from\r
+     *             the number of residues\r
+     */\r
+    public static List<AnnotatedSequence> readJRonn(final InputStream inStream)\r
+           throws IOException, UnknownFileFormatException {\r
+       final List<AnnotatedSequence> seqs = new ArrayList<AnnotatedSequence>();\r
+\r
+       final BufferedReader infasta = new BufferedReader(\r
+               new InputStreamReader(inStream, "UTF8"), 16000);\r
+       try {\r
+           String line;\r
+           while ((line = infasta.readLine()) != null) {\r
+               if (!line.startsWith(">")) {\r
+                   // skip empty lines and anything that is not a record start\r
+                   continue;\r
+               }\r
+               // read name\r
+               final String sname = line.trim().substring(1);\r
+               // the sequence and annotation lines must follow immediately\r
+               final String sequenceLine = infasta.readLine();\r
+               final String annotationLine = infasta.readLine();\r
+               if ((sequenceLine == null) || (annotationLine == null)) {\r
+                   // readLine() returns null at end of stream: truncated record\r
+                   throw new UnknownFileFormatException(\r
+                           "File does not look like Jronn horizontally formatted output file!\n"\r
+                                   + JRONN_WRONG_FORMAT_MESSAGE);\r
+               }\r
+               final String sequence = sequenceLine.replace("\t", "");\r
+               final float[] annotation = convertToNumber(annotationLine\r
+                       .split("\t"));\r
+               if (annotation.length != sequence.length()) {\r
+                   throw new UnknownFileFormatException(\r
+                           "File does not look like Jronn horizontally formatted output file!\n"\r
+                                   + JRONN_WRONG_FORMAT_MESSAGE);\r
+               }\r
+               seqs.add(new AnnotatedSequence(sname, sequence, annotation));\r
+           }\r
+       } finally {\r
+           // close the reader even when the format check fails\r
+           infasta.close();\r
+       }\r
+       return seqs;\r
+    }\r
+\r
+    /**\r
+     * Converts the textual annotation values into floats.\r
+     * \r
+     * @param annotValues\r
+     *            the annotation values as read from the file\r
+     * @return an array of the same length holding the parsed values\r
+     * @throws UnknownFileFormatException\r
+     *             if any of the values cannot be parsed as a float\r
+     */\r
+    private static float[] convertToNumber(String[] annotValues)\r
+           throws UnknownFileFormatException {\r
+       float[] annotation = new float[annotValues.length];\r
+       try {\r
+           for (int i = 0; i < annotation.length; i++) {\r
+               annotation[i] = Float.parseFloat(annotValues[i]);\r
+           }\r
+       } catch (NumberFormatException e) {\r
+           // chain the exception itself; its own cause is normally null,\r
+           // which would discard the information about the offending value\r
+           throw new UnknownFileFormatException(JRONN_WRONG_FORMAT_MESSAGE, e);\r
+       }\r
+       return annotation;\r
+    }\r
+\r
+    /**\r
+     * Description of the expected JRonn horizontal format, used in the\r
+     * messages of the UnknownFileFormatException thrown by the JRonn reader.\r
+     * The sample rows use explicit tab escapes and no leading space so that\r
+     * the example itself is a valid record.\r
+     */\r
+    private static final String JRONN_WRONG_FORMAT_MESSAGE = "Jronn file must be in the following format:\n"\r
+           + ">sequence_name\n"\r
+           + "M\tV\tS\n"\r
+           + "0.43\t0.22\t0.65\n"\r
+           + "Where first line is the sequence name,\n"\r
+           + "second line is the tab delimited sequence,\n"\r
+           + "third line contains tab delimited disorder prediction values.\n"\r
+           + "No lines are allowed between these three. Additionally, the number of  "\r
+           + "sequence residues must be equal to the number of the disorder values.";\r
+\r
     /**\r
      * Closes the Closable and logs the exception if any\r
      * \r