Any character that is not an amino acid or nucleotide is a space
[jalview.git] / src / jalview / io / FastaFile.java
index adfa967..684f867 100755 (executable)
@@ -1,6 +1,6 @@
 /*\r
 * Jalview - A Sequence Alignment Editor and Viewer\r
-* Copyright (C) 2005 AM Waterhouse, J Procter, G Barton, M Clamp, S Searle\r
+* Copyright (C) 2006 AM Waterhouse, J Procter, G Barton, M Clamp, S Searle\r
 *\r
 * This program is free software; you can redistribute it and/or\r
 * modify it under the terms of the GNU General Public License\r
@@ -18,8 +18,6 @@
 */\r
 package jalview.io;\r
 \r
-import jalview.analysis.*;\r
-\r
 import jalview.datamodel.*;\r
 \r
 import java.io.*;\r
@@ -27,150 +25,169 @@ import java.io.*;
 import java.util.*;\r
 \r
 \r
-public class FastaFile extends AlignFile {\r
-    public FastaFile() {\r
-    }\r
-\r
-    public FastaFile(String inStr) {\r
-        super(inStr);\r
+/**\r
+ * Parses and prints sequences in FASTA format.\r
+ *\r
+ * @author $author$\r
+ * @version $Revision$\r
+ */\r
+public class FastaFile extends AlignFile\r
+{\r
+  /**\r
+   * Maximum length of each sequence line written by print(SequenceI[])\r
+   */\r
+  int len = 72;\r
+\r
+  StringBuffer out;\r
+\r
+    /**\r
+     * Creates a new FastaFile object.\r
+     */\r
+    public FastaFile()\r
+    {\r
     }\r
 \r
-    public FastaFile(String inFile, String type) throws IOException {\r
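+    /**\r
+     * Creates a new FastaFile object by passing both arguments to AlignFile.\r
+     *\r
+     * @param inFile forwarded to AlignFile; presumably the file or data to read (confirm)\r
+     * @param type forwarded to AlignFile; presumably the kind of source (confirm)\r
+     *\r
+     * @throws IOException if the AlignFile constructor throws it\r
+     */\r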
+    public FastaFile(String inFile, String type) throws IOException\r
+    {\r
         super(inFile, type);\r
     }\r
 \r
-    public void parse() throws IOException {\r
-        String id = "";\r
-        StringBuffer seq = new StringBuffer();\r
-        int count = 0;\r
-        boolean flag = false;\r
-\r
-        int sstart = 0;\r
-        int send = 0;\r
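+    /**\r
+     * Reads '>' records from nextLine() into seqs; '>#_' records become annotations.\r
+     * NOTE(review): a '>#_' record directly followed by a plain '>' record lands in seqs.\r
+     * @throws IOException declared by this method; presumably raised by nextLine() (confirm)\r
+     */\r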
+    public void parse() throws IOException\r
+    {\r
+        StringBuffer sb = new StringBuffer();\r
+        boolean firstLine = true;\r
 \r
         String line;\r
-\r
-        while ((line = nextLine()) != null) {\r
-            if (line.length() > 0) {\r
-                // Do we have an id line?\r
-                // JBPNote - this code needs to be standardised to EBI/whatever for the\r
-                // >dbref/dbref/dbref|refid1|refid2|refid3 'human-readable' style of naming (should it really exist)\r
-\r
-                if (line.substring(0, 1).equals(">")) {\r
-                    if (count != 0) {\r
-                        if (sstart != 0) {\r
-                            seqs.addElement(new Sequence(id,\r
-                                    seq.toString().toUpperCase(), sstart, send));\r
-                        } else {\r
-                            seqs.addElement(new Sequence(id,\r
-                                    seq.toString().toUpperCase(), 1,\r
-                                    seq.length()));\r
-                        }\r
-                    }\r
-\r
-                    count++;\r
-\r
-                    StringTokenizer str = new StringTokenizer(line, " ");\r
-\r
-                    id = str.nextToken();\r
-                    id = id.substring(1);\r
-\r
-                    com.stevesoft.pat.Regex dbId = new com.stevesoft.pat.Regex(\r
-                            "[A-Za-z-]+/?[A-Za-z-]+\\|(\\w+)\\|(.+)");\r
-                    // JBPNote At the moment - we don't get rid of the friendly names but this\r
-                    // behaviour is probably wrong in the long run.\r
-                    if (dbId.search(id)) {\r
-                        String dbid = dbId.stringMatched(1);\r
-                        String idname = dbId.stringMatched(2);\r
-                        if ( (idname.length() > 0) &&\r
-                             (idname.indexOf("_") > -1)) {\r
-                          id = idname; // use the friendly name - apparently no dbid\r
-                        } else\r
-                        if (dbid.length()>1) {\r
-                            id = dbid; // ignore the friendly name - we lose uniprot accession ID otherwise\r
-                        }\r
+        Sequence seq = null;\r
+\r
+        boolean annotation = false;\r
+\r
+        while ((line = nextLine()) != null)\r
+        {\r
+            line = line.trim();\r
+            if (line.length() > 0)\r
+            {\r
+              if (line.charAt(0)=='>')\r
+                {\r
+                  if (line.startsWith(">#_"))\r
+                  {\r
+                    if (annotation)\r
+                    {\r
+                      Annotation[] anots = new Annotation[sb.length()];\r
+                      String anotString = sb.toString();\r
+                      for (int i = 0; i < sb.length(); i++)\r
+                      {\r
+                        anots[i] = new Annotation(anotString.substring(i, i+1),\r
+                                                  null,\r
+                                                  ' ', 0);\r
+                      }\r
+                      AlignmentAnnotation aa = new AlignmentAnnotation(\r
+                          seq.getName().substring(2), seq.getDescription(),\r
+                          anots);\r
+\r
+                      annotations.addElement(aa);\r
                     }\r
+                  }\r
+                  else\r
+                    annotation = false;\r
 \r
-                    if (id.indexOf("/") > 0) {\r
-                        StringTokenizer st = new StringTokenizer(id, "/");\r
-\r
-                        if (st.countTokens() == 2) {\r
-                            id = st.nextToken();\r
+                    if (!firstLine)\r
+                    {\r
+                       seq.setSequence(sb.toString());\r
 \r
-                            String tmp = st.nextToken();\r
+                       if (!annotation)\r
+                         seqs.addElement(seq);\r
+                    }\r
 \r
-                            st = new StringTokenizer(tmp, "-");\r
+                    seq = parseId(line.substring(1));\r
+                    firstLine = false;\r
 \r
-                            if (st.countTokens() == 2) {\r
-                                sstart = Integer.valueOf(st.nextToken())\r
-                                                .intValue();\r
-                                send = Integer.valueOf(st.nextToken()).intValue();\r
-                            }\r
-                        }\r
-                    }\r
+                    sb = new StringBuffer();\r
 \r
-                    seq = new StringBuffer();\r
-                } else {\r
-                    seq = seq.append(line);\r
+                    if (line.startsWith(">#_"))\r
+                      annotation = true;\r
+                }\r
+                else\r
+                {\r
+                    sb.append(line);\r
                 }\r
             }\r
         }\r
 \r
-        if (count > 0) {\r
-            if (!isValidProteinSequence(seq.toString().toUpperCase())) {\r
-                throw new IOException("Invalid protein sequence");\r
-            }\r
-\r
-            if (sstart != 0) {\r
-                seqs.addElement(new Sequence(id, seq.toString().toUpperCase(),\r
-                        sstart, send));\r
-            } else {\r
-                seqs.addElement(new Sequence(id, seq.toString().toUpperCase(),\r
-                        1, seq.length()));\r
-            }\r
+        if (annotation)\r
+        {\r
+          Annotation[] anots = new Annotation[sb.length()];\r
+          String anotString = sb.toString();\r
+          for (int i = 0; i < sb.length(); i++)\r
+          {\r
+            anots[i] = new Annotation(anotString.substring(i, i + 1),\r
+                                      null,\r
+                                      ' ', 0);\r
+          }\r
+          AlignmentAnnotation aa = new AlignmentAnnotation(\r
+              seq.getName().substring(2), seq.getDescription(),\r
+              anots);\r
+\r
+          annotations.addElement(aa);\r
         }\r
-    }\r
 \r
-    public static String print(SequenceI[] s) {\r
-        return print(s, 72);\r
-    }\r
-\r
-    public static String print(SequenceI[] s, int len) {\r
-        return print(s, len, true);\r
-    }\r
-\r
-    public static String print(SequenceI[] s, int len, boolean gaps) {\r
-        return print(s, len, gaps, true);\r
+        else if (!firstLine)\r
+        {\r
+            seq.setSequence(sb.toString());\r
+            seqs.addElement(seq);\r
+        }\r
     }\r
 \r
-    public static String print(SequenceI[] s, int len, boolean gaps,\r
-        boolean displayId) {\r
-        StringBuffer out = new StringBuffer();\r
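+    /**\r
+     * Formats the given sequences as FASTA text.\r
+     * Each entry gets a '>' line holding printId(s[i]) and, when non-null,\r
+     * its description, then the sequence in lines of up to 'len' characters.\r
+     * The text is built in the 'out' field, which is reset on every call.\r
+     *\r
+     * @param s sequences to format; output stops at the first null entry\r
+     *\r
+     * @return the FASTA text built for the entries written\r
+     */\r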
+    public String print(SequenceI[] s)\r
+    {\r
+        out = new StringBuffer();\r
         int i = 0;\r
 \r
-        while ((i < s.length) && (s[i] != null)) {\r
-            String seq = "";\r
+        while ((i < s.length) && (s[i] != null))\r
+        {\r
+            out.append(">" + printId(s[i]));\r
+            if(s[i].getDescription()!=null)\r
+              out.append(" "+s[i].getDescription());\r
 \r
-            if (gaps) {\r
-                seq = s[i].getSequence();\r
-            } else {\r
-                seq = AlignSeq.extractGaps("-. ", s[i].getSequence());\r
-            }\r
+            out.append("\n");\r
 \r
-            // used to always put this here: + "/" + s[i].getStart() + "-" + s[i].getEnd() +\r
-            out.append(">" +\r
-                ((displayId) ? s[i].getDisplayId() : s[i].getName()) + "\n");\r
+            int nochunks = (s[i].getLength() / len) + 1;\r
 \r
-            int nochunks = (seq.length() / len) + 1;\r
-\r
-            for (int j = 0; j < nochunks; j++) {\r
+            for (int j = 0; j < nochunks; j++)\r
+            {\r
                 int start = j * len;\r
                 int end = start + len;\r
 \r
-                if (end < seq.length()) {\r
-                    out.append(seq.substring(start, end) + "\n");\r
-                } else if (start < seq.length()) {\r
-                    out.append(seq.substring(start) + "\n");\r
+                if (end < s[i].getLength())\r
+                {\r
+                    out.append(s[i].getSequenceAsString(start, end) + "\n");\r
+                }\r
+                else if (start < s[i].getLength())\r
+                {\r
+                    out.append(s[i].getSequenceAsString(start, s[i].getLength()) + "\n");\r
                 }\r
             }\r
 \r
@@ -180,7 +197,13 @@ public class FastaFile extends AlignFile {
         return out.toString();\r
     }\r
 \r
-    public String print() {\r
+    /**\r
+     * Formats the sequences returned by getSeqsAsArray() with print(SequenceI[]).\r
+     *\r
+     * @return the text produced by print(SequenceI[])\r
+     */\r
+    public String print()\r
+    {\r
         return print(getSeqsAsArray());\r
     }\r
 }\r