Parser for DisEMBL results and finished runner, few test cases. DisemblTester has...
[jabaws.git] / datamodel / compbio / data / sequence / SequenceUtil.java
index 85ae95f..a636c3a 100644 (file)
@@ -406,12 +406,15 @@ public final class SequenceUtil {
         * \r
         * TODO complete!\r
         * \r
+        * >Sequence name\r
+        * \r
         * RESIDUE COILS REM465 HOTLOOPS\r
         * \r
         * M 0.86010 0.88512 0.37094\r
         * \r
         * T 0.79983 0.85864 0.44331 ....\r
-        * \r
+\r
+        * >Next Sequence name \r
         * RESIDUE COILS REM465 HOTLOOPS\r
         * \r
         * M 0.86010 0.88512 0.37094\r
@@ -422,14 +425,14 @@ public final class SequenceUtil {
         * @throws IOException\r
         * @throws UnknownFileFormatException\r
         */\r
-       static Map<FastaSequence, Set<Score>> readDisembl(final InputStream input)\r
+       public static Map<FastaSequence, Set<Score>> readDisembl(final InputStream input)\r
                        throws IOException, UnknownFileFormatException {\r
                Scanner scan = new Scanner(input);\r
-               scan.useDelimiter("# RESIDUE COILS REM465 HOTLOOPS\n");\r
+               scan.useDelimiter(">");\r
                if (!scan.hasNext()) {\r
                        throw new UnknownFileFormatException(\r
-                                       "In Disembl score format each seqeunce score is expected to start from the line: "\r
-                                                       + "'# RESIDUE COILS REM465 HOTLOOPS\\n'."\r
+                                       "In Disembl score format each sequence score is expected " +\r
+                                       "to start from the line: >Sequence name "\r
                                                        + " No such line was found!");\r
                }\r
 \r
@@ -438,22 +441,27 @@ public final class SequenceUtil {
                while (scan.hasNext()) {\r
                        seqCounter++;\r
                        String singleSeq = scan.next();\r
-                       Scanner scansingle = new Scanner(singleSeq);\r
+                       Scanner scansingle = new Scanner(singleSeq);\r
+                       if(!scansingle.hasNextLine()) {\r
+                               throw new RuntimeException("The input looks like an incomplete disembl file - cannot parse!");\r
+                       }\r
+                       \r
                        StringBuffer seqbuffer = new StringBuffer();\r
                        ArrayList<Float> coils = new ArrayList<Float>();\r
                        ArrayList<Float> rem = new ArrayList<Float>();\r
                        ArrayList<Float> hotloops = new ArrayList<Float>();\r
-                       FastaSequence fs = new FastaSequence(Integer.toString(seqCounter),\r
-                                       singleSeq);\r
-                       while (scansingle.hasNextLine()) {\r
-                               String valueLine = scansingle.nextLine();\r
-                               Scanner values = new Scanner(valueLine);\r
-                               seqbuffer.append(values.next());\r
-                               coils.add(values.nextFloat());\r
-                               rem.add(values.nextFloat());\r
-                               hotloops.add(values.nextFloat());\r
-                               values.close();\r
+\r
+                       String sequenceName = scansingle.nextLine().trim();\r
+                       String title =  scansingle.nextLine();\r
+                       assert title.startsWith("# RESIDUE COILS REM465 HOTLOOPS") : ">Sequence_name must follow column title: # RESIDUE COILS REM465 HOTLOOPS!";\r
+                       \r
+                       while (scansingle.hasNext()) {\r
+                               seqbuffer.append(scansingle.next());\r
+                               coils.add(scansingle.nextFloat());\r
+                               rem.add(scansingle.nextFloat());\r
+                               hotloops.add(scansingle.nextFloat());\r
                        }\r
+                       FastaSequence fs = new FastaSequence(sequenceName,seqbuffer.toString());\r
                        Set<Score> scores = new HashSet<Score>();\r
                        scores.add(new Score(DisemblResultAnnot.COILS, coils));\r
                        scores.add(new Score(DisemblResultAnnot.HOTLOOPS, hotloops));\r
@@ -466,6 +474,15 @@ public final class SequenceUtil {
                input.close();\r
                return results;\r
        }\r
+       \r
+       /**\r
+        * Re-keys DisEMBL results by sequence id instead of by sequence object.\r
+        * \r
+        * Each key's {@code getId()} value becomes the key of the returned map and\r
+        * is mapped to the very same score set instance (the sets are not copied).\r
+        * The input map is only read. If two keys report equal ids, the entry\r
+        * visited later replaces the earlier one.\r
+        * \r
+        * @param disemblResults\r
+        *            scores keyed by sequence\r
+        * @return a new HashMap from sequence id to that sequence's score set\r
+        */\r
+       public static  Map<String, Set<Score>> removeSequences(Map<FastaSequence, Set<Score>> disemblResults) { \r
+               Map<String, Set<Score>> scoresById = new HashMap<String, Set<Score>>();\r
+               for (Map.Entry<FastaSequence, Set<Score>> entry : disemblResults.entrySet()) {\r
+                       String sequenceId = entry.getKey().getId();\r
+                       scoresById.put(sequenceId, entry.getValue());\r
+               }\r
+               return scoresById;\r
+       }\r
+       \r
        /**\r
         * Read AACon result with no alignment files. This method leaves incoming\r
         * the InputStream results open!\r