- String annType = r.stringMatched(1);\r
- String annContent = r.stringMatched(2);\r
-\r
- //System.err.println("type:" + annType + " content: " + annContent);\r
-\r
- if (annType.equals("GF"))\r
- {\r
- /* Generic per-File annotation, free text\r
- * Magic features:\r
- * #=GF NH <tree in New Hampshire eXtended format>\r
- * #=GF TN <Unique identifier for the next tree>\r
- * Pfam descriptions:\r
- 7. DESCRIPTION OF FIELDS\r
-\r
- Compulsory fields:\r
- ------------------\r
-\r
- AC Accession number: Accession number in form PFxxxxx.version or PBxxxxxx.\r
- ID Identification: One word name for family.\r
- DE Definition: Short description of family.\r
- AU Author: Authors of the entry.\r
- SE Source of seed: The source suggesting the seed members belong to one family.\r
- GA Gathering method: Search threshold to build the full alignment.\r
- TC Trusted Cutoff: Lowest sequence score and domain score of match in the full alignment.\r
- NC Noise Cutoff: Highest sequence score and domain score of match not in full alignment.\r
- TP Type: Type of family -- presently Family, Domain, Motif or Repeat.\r
- SQ Sequence: Number of sequences in alignment.\r
- AM Alignment Method The order ls and fs hits are aligned to the model to build the full align.\r
- // End of alignment.\r
-\r
- Optional fields:\r
- ----------------\r
-\r
- DC Database Comment: Comment about database reference.\r
- DR Database Reference: Reference to external database.\r
- RC Reference Comment: Comment about literature reference.\r
- RN Reference Number: Reference Number.\r
- RM Reference Medline: Eight digit medline UI number.\r
- RT Reference Title: Reference Title.\r
- RA Reference Author: Reference Author\r
- RL Reference Location: Journal location.\r
- PI Previous identifier: Record of all previous ID lines.\r
- KW Keywords: Keywords.\r
- CC Comment: Comments.\r
- NE Pfam accession: Indicates a nested domain.\r
- NL Location: Location of nested domains - sequence ID, start and end of insert.\r
-\r
- Obsolete fields:\r
- -----------\r
- AL Alignment method of seed: The method used to align the seed members.\r
- */\r
- // Let's save the annotations, maybe we'll be able to do something with them later...\r
- Regex an = new Regex("(\\w+)\\s*(.*)");\r
- if (an.search(annContent)) alAnn.put(an.stringMatched(1), an.stringMatched(2));\r
- }\r
- else if(annType.equals("GS"))\r
- {\r
- // Generic per-Sequence annotation, free text\r
- /* Pfam uses these features:\r
- Feature Description\r
- --------------------- -----------\r
- AC <accession> ACcession number\r
- DE <freetext> DEscription\r
- DR <db>; <accession>; Database Reference\r
- OS <organism> OrganiSm (species)\r
- OC <clade> Organism Classification (clade, etc.)\r
- LO <look> Look (Color, etc.)\r
- */\r
- if (s.search(annContent))\r
- {\r
- String acc = s.stringMatched(1);\r
- String type = s.stringMatched(2);\r
- String content = s.stringMatched(3);\r
-\r
- Hashtable ann;\r
- if (seqAnn.containsKey(acc))\r
- {\r
- ann = (Hashtable) seqAnn.get(acc);\r
- }\r
- else\r
- {\r
- ann = new Hashtable();\r
- }\r
- ann.put(type, content);\r
- seqAnn.put(acc, ann);\r
- }\r
- else\r
- {\r
- throw new IOException("Error parsing " + line);\r
- }\r
- }\r
- else if(annType.equals("GC"))\r
- {\r
- // Generic per-Column annotation, exactly 1 char per column\r
- }\r
- else if(annType.equals("GR"))\r
- {\r
- // Generic per-Sequence AND per-Column markup, exactly 1 char per column\r
- /*\r
- Feature Description Markup letters\r
- ------- ----------- --------------\r
- SS Secondary Structure [HGIEBTSCX]\r
- SA Surface Accessibility [0-9X]\r
- (0=0%-10%; ...; 9=90%-100%)\r
- TM TransMembrane [Mio]\r
- PP Posterior Probability [0-9*]\r
- (0=0.00-0.05; 1=0.05-0.15; *=0.95-1.00)\r
- LI LIgand binding [*]\r
- AS Active Site [*]\r
- IN INtron (in or after) [0-2]\r
- */\r
- if (s.search(annContent))\r
- {\r
- String acc = s.stringMatched(1);\r
- String type = s.stringMatched(2);\r
- String seq = s.stringMatched(3);\r
- String description = new String();\r
-\r
- // Check for additional information about the current annotation\r
- if (x.search(seq))\r
- {\r
- description = x.stringMatched(1);\r
- seq = x.stringMatched(2);\r
- }\r
- // sequence id with from-to fields\r
+ ann = new Hashtable();\r
+ }\r
+ ann.put(type, content);\r
+ seqAnn.put(acc, ann);\r
+ }\r
+ else\r
+ {\r
+ throw new IOException("Error parsing " + line);\r
+ }\r
+ }\r
+ else if (annType.equals("GC"))\r
+ {\r
+ // Generic per-Column annotation, exactly 1 char per column\r
+ // always need a label.\r
+ if (x.search(annContent))\r
+ {\r
+ // parse out and create alignment annotation directly.\r
+ parseAnnotationRow(annotations, x.stringMatched(1), x\r
+ .stringMatched(2));\r
+ }\r
+ }\r
+ else if (annType.equals("GR"))\r
+ {\r
+ // Generic per-Sequence AND per-Column markup, exactly 1 char per\r
+ // column\r
+ /* Feature, Description, [Markup letters]:\r
+ * SS Secondary Structure [HGIEBTSCX]\r
+ * SA Surface Accessibility [0-9X] (0=0%-10%; ...; 9=90%-100%)\r
+ * TM TransMembrane [Mio]\r
+ * PP Posterior Probability [0-9*] (0=0.00-0.05; 1=0.05-0.15; *=0.95-1.00)\r
+ * LI LIgand binding [*]; AS Active Site [*]\r
+ * IN INtron (in or after) [0-2]\r
+ */\r
+ if (s.search(annContent))\r
+ {\r
+ String acc = s.stringMatched(1);\r
+ String type = s.stringMatched(2);\r
+ String seq = new String(s.stringMatched(3));\r
+ String description = null;\r
+ // Check for additional information about the current annotation\r
+ // We use a simple string tokenizer here for speed\r
+ StringTokenizer sep = new StringTokenizer(seq," \t");\r
+ description = sep.nextToken();\r
+ if (sep.hasMoreTokens())\r
+ {\r
+ seq = sep.nextToken();\r
+ } else {\r
+ seq = description;\r
+ description = new String();\r
+ }\r
+ // sequence id with from-to fields\r