- if (annType.equals("GF"))\r
- {\r
- /* Generic per-File annotation, free text\r
- * Magic features:\r
- * #=GF NH <tree in New Hampshire eXtended format>\r
- * #=GF TN <Unique identifier for the next tree>\r
- * Pfam descriptions:\r
- 7. DESCRIPTION OF FIELDS\r
-\r
- Compulsory fields:\r
- ------------------\r
-\r
- AC Accession number: Accession number in form PFxxxxx.version or PBxxxxxx.\r
- ID Identification: One word name for family.\r
- DE Definition: Short description of family.\r
- AU Author: Authors of the entry.\r
- SE Source of seed: The source suggesting the seed members belong to one family.\r
- GA Gathering method: Search threshold to build the full alignment.\r
- TC Trusted Cutoff: Lowest sequence score and domain score of match in the full alignment.\r
- NC Noise Cutoff: Highest sequence score and domain score of match not in full alignment.\r
- TP Type: Type of family -- presently Family, Domain, Motif or Repeat.\r
- SQ Sequence: Number of sequences in alignment.\r
- AM Alignment Method The order ls and fs hits are aligned to the model to build the full align.\r
- // End of alignment.\r
-\r
- Optional fields:\r
- ----------------\r
-\r
- DC Database Comment: Comment about database reference.\r
- DR Database Reference: Reference to external database.\r
- RC Reference Comment: Comment about literature reference.\r
- RN Reference Number: Reference Number.\r
- RM Reference Medline: Eight digit medline UI number.\r
- RT Reference Title: Reference Title.\r
- RA Reference Author: Reference Author\r
- RL Reference Location: Journal location.\r
- PI Previous identifier: Record of all previous ID lines.\r
- KW Keywords: Keywords.\r
- CC Comment: Comments.\r
- NE Pfam accession: Indicates a nested domain.\r
- NL Location: Location of nested domains - sequence ID, start and end of insert.\r
-\r
- Obsolete fields:\r
- -----------\r
- AL Alignment method of seed: The method used to align the seed members.\r
- */\r
- // Let's save the annotations, maybe we'll be able to do something with them later...\r
- Regex an = new Regex("(\\w+)\\s*(.*)");\r
- if (an.search(annContent)) alAnn.put(an.stringMatched(1), an.stringMatched(2));\r
- }\r
- else if(annType.equals("GS"))\r
- {\r
- // Generic per-Sequence annotation, free text\r
- /* Pfam uses these features:\r
- Feature Description\r
- --------------------- -----------\r
- AC <accession> ACcession number\r
- DE <freetext> DEscription\r
- DR <db>; <accession>; Database Reference\r
- OS <organism> OrganiSm (species)\r
- OC <clade> Organism Classification (clade, etc.)\r
- LO <look> Look (Color, etc.)\r
- */\r
- if (s.search(annContent))\r
- {\r
- String acc = s.stringMatched(1);\r
- String type = s.stringMatched(2);\r
- String content = s.stringMatched(3);\r
-\r
- Hashtable ann;\r
- if (seqAnn.containsKey(acc))\r
- {\r
- ann = (Hashtable) seqAnn.get(acc);\r
- }\r
- else\r
- {\r
- ann = new Hashtable();\r
- }\r
- ann.put(type, content);\r
- seqAnn.put(acc, ann);\r
- }\r
- else\r
- {\r
- throw new IOException("Error parsing " + line);\r
- }\r
- }\r
- else if(annType.equals("GC"))\r
- {\r
- // Generic per-Column annotation, exactly 1 char per column\r
- }\r
- else if(annType.equals("GR"))\r
- {\r
- // Generic per-Sequence AND per-Column markup, exactly 1 char per column\r
- /*\r
- Feature Description Markup letters\r
- ------- ----------- --------------\r
- SS Secondary Structure [HGIEBTSCX]\r
- SA Surface Accessibility [0-9X]\r
- (0=0%-10%; ...; 9=90%-100%)\r
- TM TransMembrane [Mio]\r
- PP Posterior Probability [0-9*]\r
- (0=0.00-0.05; 1=0.05-0.15; *=0.95-1.00)\r
- LI LIgand binding [*]\r
- AS Active Site [*]\r
- IN INtron (in or after) [0-2]\r
- */\r
- if (s.search(annContent))\r
- {\r
- String acc = s.stringMatched(1);\r
- String type = s.stringMatched(2);\r
- String seq = s.stringMatched(3);\r
- String description = new String();\r
-\r
- // Check for additional information about the current annotation\r
- if (x.search(seq))\r
- {\r
- description = x.stringMatched(1);\r
- seq = x.stringMatched(2);\r
- }\r
- // sequence id with from-to fields\r
-\r
- Hashtable ann;\r
- // Get an object with all the annotations for this sequence\r
- if (seqAnn.containsKey(acc))\r
- {\r
- //logger.debug("Found annotations for " + acc);\r
- ann = (Hashtable) seqAnn.get(acc);\r
- }\r
- else\r
- {\r
- //logger.debug("Creating new annotations holder for " + acc);\r
- ann = new Hashtable();\r
- seqAnn.put(acc, ann);\r
- }\r
-\r
- Hashtable features;\r
- // Get an object with all the content for an annotation\r
- if (ann.containsKey("features"))\r
- {\r
- //logger.debug("Found features for " + acc);\r
- features = (Hashtable) ann.get("features");\r
- }\r
- else\r
- {\r
- //logger.debug("Creating new features holder for " + acc);\r
- features = new Hashtable();\r
- ann.put("features", features);\r
- }\r
-\r
- Hashtable content;\r
- if (features.containsKey(this.id2type(type)))\r
- {\r
- //logger.debug("Found content for " + this.id2type(type));\r
- content = (Hashtable) features.get(this.id2type(type));\r
- }\r
- else\r
- {\r
- //logger.debug("Creating new content holder for " + this.id2type(type));\r
- content = new Hashtable();\r
- features.put(this.id2type(type), content);\r
- }\r
- String ns = (String) content.get(description);\r
- if (ns == null) ns = "";\r
- ns += seq;\r
- content.put(description, seq);\r
- }\r
- else\r
- {\r
- throw new IOException("Error parsing " + line);\r
- }\r
- }\r
- else\r
+ // logger.debug("Adding seq " + acc + " from " + start + " to " + end\r
+ // + ": " + seq);\r
+ this.seqs.addElement(seqO);\r
+ }\r
+ return; // finished parsing this segment of source\r
+ }\r
+ else if (!r.search(line))\r
+ {\r
+ // System.err.println("Found sequence line: " + line);\r
+\r
+ // Split the line into its accession and sequence parts\r
+ if (!x.search(line))\r
+ {\r
+ // logger.error("Could not parse sequence line: " + line);\r
+ throw new IOException("Could not parse sequence line: " + line);\r
+ }\r
+ String ns = (String) seqs.get(x.stringMatched(1));\r
+ if (ns == null)\r
+ {\r
+ ns = "";\r
+ }\r
+ ns += x.stringMatched(2);\r
+\r
+ seqs.put(x.stringMatched(1), ns);\r
+ }\r
+ else\r
+ {\r
+ String annType = r.stringMatched(1);\r
+ String annContent = r.stringMatched(2);\r
+\r
+ // System.err.println("type:" + annType + " content: " + annContent);\r
+\r
+ if (annType.equals("GF"))\r
+ {\r
+ /*\r
+ * Generic per-File annotation, free text\r
+ * Magic features:\r
+ * #=GF NH <tree in New Hampshire eXtended format>\r
+ * #=GF TN <Unique identifier for the next tree>\r
+ * Pfam descriptions:\r
+ * 7. DESCRIPTION OF FIELDS\r
+ *\r
+ * Compulsory fields:\r
+ * ------------------\r
+ *\r
+ * AC Accession number: Accession number in form PFxxxxx.version or PBxxxxxx.\r
+ * ID Identification: One word name for family.\r
+ * DE Definition: Short description of family.\r
+ * AU Author: Authors of the entry.\r
+ * SE Source of seed: The source suggesting the seed members belong to one family.\r
+ * GA Gathering method: Search threshold to build the full alignment.\r
+ * TC Trusted Cutoff: Lowest sequence score and domain score of match in the full alignment.\r
+ * NC Noise Cutoff: Highest sequence score and domain score of match not in full alignment.\r
+ * TP Type: Type of family -- presently Family, Domain, Motif or Repeat.\r
+ * SQ Sequence: Number of sequences in alignment.\r
+ * AM Alignment Method: The order ls and fs hits are aligned to the model to build the full align.\r
+ * // End of alignment.\r
+ *\r
+ * Optional fields:\r
+ * ----------------\r
+ *\r
+ * DC Database Comment: Comment about database reference.\r
+ * DR Database Reference: Reference to external database.\r
+ * RC Reference Comment: Comment about literature reference.\r
+ * RN Reference Number: Reference Number.\r
+ * RM Reference Medline: Eight digit medline UI number.\r
+ * RT Reference Title: Reference Title.\r
+ * RA Reference Author: Reference Author.\r
+ * RL Reference Location: Journal location.\r
+ * PI Previous identifier: Record of all previous ID lines.\r
+ * KW Keywords: Keywords.\r
+ * CC Comment: Comments.\r
+ * NE Pfam accession: Indicates a nested domain.\r
+ * NL Location: Location of nested domains - sequence ID, start and end of insert.\r
+ *\r
+ * Obsolete fields:\r
+ * -----------\r
+ * AL Alignment method of seed: The method used to align the seed members.\r
+ */\r
+ // Let's save the annotations, maybe we'll be able to do something\r
+ // with them later...\r
+ Regex an = new Regex("(\\w+)\\s*(.*)");\r
+ if (an.search(annContent))\r
+ {\r
+ if (an.stringMatched(1).equals("NH"))\r
+ {\r
+ treeString.append(an.stringMatched(2));\r
+ }\r
+ else if (an.stringMatched(1).equals("TN"))\r
+ {\r
+ if (treeString.length() > 0)\r
+ {\r
+ if (treeName == null)\r