+ Enumeration j = content.keys();\r
+ while (j.hasMoreElements())\r
+ {\r
+ String desc = j.nextElement().toString();\r
+ String ns = content.get(desc).toString();\r
+ char[] byChar = ns.toCharArray();\r
+ for (int k = 0; k < byChar.length; k++)\r
+ {\r
+ char c = byChar[k];\r
+ if (!(c == ' ' || c == '_' || c == '-' || c == '.')) // PFAM\r
+ // uses\r
+ // '.'\r
+ // for\r
+ // feature\r
+ // background\r
+ {\r
+ int new_pos = posmap[k]; // look up nearest sequence\r
+ // position to this column\r
+ SequenceFeature feat = new SequenceFeature(type, desc,\r
+ new_pos, new_pos, 0f, null);\r
+ \r
+ seqO.addSequenceFeature(feat);\r
+ }\r
+ }\r
+ }\r
+ \r
+ }\r
+ \r
+ }\r
+ // garbage collect\r
+ \r
+ // logger.debug("Adding seq " + acc + " from " + start + " to " + end\r
+ // + ": " + seq);\r
+ this.seqs.addElement(seqO);\r
+ }\r
+ return; // finished parsing this segment of source\r
+ }\r
+ else if (!r.search(line))\r
+ {\r
+ // System.err.println("Found sequence line: " + line);\r
+ \r
+ // Split sequence in sequence and accession parts\r
+ if (!x.search(line))\r
+ {\r
+ // logger.error("Could not parse sequence line: " + line);\r
+ throw new IOException("Could not parse sequence line: " + line);\r
+ }\r
+ String ns = (String) seqs.get(x.stringMatched(1));\r
+ if (ns == null)\r
+ {\r
+ ns = "";\r
+ }\r
+ ns += x.stringMatched(2);\r
+ \r
+ seqs.put(x.stringMatched(1), ns);\r
+ }\r
+ else\r
+ {\r
+ String annType = r.stringMatched(1);\r
+ String annContent = r.stringMatched(2);\r
+ \r
+ // System.err.println("type:" + annType + " content: " + annContent);\r
+ \r
+ if (annType.equals("GF"))\r
+ {\r
+ /*\r
+ * Generic per-File annotation, free text\r
+ *\r
+ * Magic features:\r
+ *   #=GF NH <tree in New Hampshire eXtended format>\r
+ *   #=GF TN <Unique identifier for the next tree>\r
+ *\r
+ * Pfam descriptions: 7. DESCRIPTION OF FIELDS\r
+ *\r
+ * Compulsory fields:\r
+ * ------------------\r
+ *\r
+ *   AC  Accession number: Accession number in form PFxxxxx.version\r
+ *       or PBxxxxxx.\r
+ *   ID  Identification: One word name for family.\r
+ *   DE  Definition: Short description of family.\r
+ *   AU  Author: Authors of the entry.\r
+ *   SE  Source of seed: The source suggesting the seed members\r
+ *       belong to one family.\r
+ *   GA  Gathering method: Search threshold to build the full\r
+ *       alignment.\r
+ *   TC  Trusted Cutoff: Lowest sequence score and domain score of\r
+ *       match in the full alignment.\r
+ *   NC  Noise Cutoff: Highest sequence score and domain score of\r
+ *       match not in full alignment.\r
+ *   TP  Type: Type of family -- presently Family, Domain, Motif or\r
+ *       Repeat.\r
+ *   SQ  Sequence: Number of sequences in alignment.\r
+ *   AM  Alignment Method: The order ls and fs hits are aligned to\r
+ *       the model to build the full align.\r
+ *   //  End of alignment.\r
+ *\r
+ * Optional fields:\r
+ * ----------------\r
+ *\r
+ *   DC  Database Comment: Comment about database reference.\r
+ *   DR  Database Reference: Reference to external database.\r
+ *   RC  Reference Comment: Comment about literature reference.\r
+ *   RN  Reference Number: Reference Number.\r
+ *   RM  Reference Medline: Eight digit medline UI number.\r
+ *   RT  Reference Title: Reference Title.\r
+ *   RA  Reference Author: Reference Author\r
+ *   RL  Reference Location: Journal location.\r
+ *   PI  Previous identifier: Record of all previous ID lines.\r
+ *   KW  Keywords: Keywords.\r
+ *   CC  Comment: Comments.\r
+ *   NE  Pfam accession: Indicates a nested domain.\r
+ *   NL  Location: Location of nested domains - sequence ID, start\r
+ *       and end of insert.\r
+ *\r
+ * Obsolete fields:\r
+ * ----------------\r
+ *\r
+ *   AL  Alignment method of seed: The method used to align the\r
+ *       seed members.\r
+ */\r
+ // Let's save the annotations, maybe we'll be able to do something\r
+ // with them later...\r
+ Regex an = new Regex("(\\w+)\\s*(.*)");\r
+ if (an.search(annContent))\r
+ {\r
+ if (an.stringMatched(1).equals("NH"))\r
+ {\r
+ treeString.append(an.stringMatched(2));\r
+ }\r
+ else if (an.stringMatched(1).equals("TN"))\r
+ {\r
+ if (treeString.length() > 0)\r
+ {\r
+ if (treeName == null)\r
+ {\r
+ treeName = "Tree " + (getTreeCount() + 1);\r
+ }\r
+ addNewickTree(treeName, treeString.toString());\r
+ }\r
+ treeName = an.stringMatched(2);\r
+ treeString = new StringBuffer();\r
+ }\r
+ setAlignmentProperty(an.stringMatched(1), an.stringMatched(2));\r
+ }\r
+ }\r
+ else if (annType.equals("GS"))\r
+ {\r
+ // Generic per-Sequence annotation, free text\r
+ /*\r
+ * Pfam uses these features:\r
+ *\r
+ *   Feature                 Description\r
+ *   ---------------------   -----------\r
+ *   AC <accession>          ACcession number\r
+ *   DE <freetext>           DEscription\r
+ *   DR <db>; <accession>;   Database Reference\r
+ *   OS <organism>           OrganiSm (species)\r
+ *   OC <clade>              Organism Classification (clade, etc.)\r
+ *   LO <look>               Look (Color, etc.)\r
+ */\r
+ if (s.search(annContent))\r
+ {\r
+ String acc = s.stringMatched(1);\r
+ String type = s.stringMatched(2);\r
+ String content = s.stringMatched(3);\r
+ // TODO: store DR in a vector.\r
+ // TODO: store AC according to generic file db annotation.\r
+ Hashtable ann;\r
+ if (seqAnn.containsKey(acc))\r
+ {\r
+ ann = (Hashtable) seqAnn.get(acc);\r
+ }\r
+ else\r
+ {\r
+ ann = new Hashtable();\r
+ }\r
+ ann.put(type, content);\r
+ seqAnn.put(acc, ann);\r
+ }\r
+ else\r
+ {\r
+ throw new IOException("Error parsing " + line);\r
+ }\r
+ }\r
+ else if (annType.equals("GC"))\r
+ {\r
+ // Generic per-Column annotation, exactly 1 char per column\r
+ // always need a label.\r
+ if (x.search(annContent))\r
+ {\r
+ // parse out and create alignment annotation directly.\r
+ parseAnnotationRow(annotations, x.stringMatched(1),\r
+ x.stringMatched(2));\r
+ }\r
+ }\r
+ else if (annType.equals("GR"))\r
+ {\r
+ // Generic per-Sequence AND per-Column markup, exactly 1 char per\r
+ // column\r
+ /*\r
+ *   Feature  Description            Markup letters\r
+ *   -------  -----------            --------------\r
+ *   SS       Secondary Structure    [HGIEBTSCX]\r
+ *   SA       Surface Accessibility  [0-9X] (0=0%-10%; ...; 9=90%-100%)\r
+ *   TM       TransMembrane          [Mio]\r
+ *   PP       Posterior Probability  [0-9*] (0=0.00-0.05; 1=0.05-0.15;\r
+ *                                   *=0.95-1.00)\r
+ *   LI       LIgand binding         [*]\r
+ *   AS       Active Site            [*]\r
+ *   IN       INtron (in or after)   [0-2]\r
+ */\r
+ if (s.search(annContent))\r
+ {\r
+ String acc = s.stringMatched(1);\r
+ String type = s.stringMatched(2);\r
+ String seq = new String(s.stringMatched(3));\r
+ String description = null;\r
+ // Check for additional information about the current annotation\r
+ // We use a simple string tokenizer here for speed\r
+ StringTokenizer sep = new StringTokenizer(seq, " \t");\r
+ description = sep.nextToken();\r
+ if (sep.hasMoreTokens())\r
+ {\r
+ seq = sep.nextToken();\r
+ }\r
+ else\r
+ {\r
+ seq = description;\r
+ description = new String();\r
+ }\r
+ // sequence id with from-to fields\r
+ \r
+ Hashtable ann;\r
+ // Get an object with all the annotations for this sequence\r
+ if (seqAnn.containsKey(acc))\r
+ {\r
+ // logger.debug("Found annotations for " + acc);\r
+ ann = (Hashtable) seqAnn.get(acc);\r
+ }\r
+ else\r
+ {\r
+ // logger.debug("Creating new annotations holder for " + acc);\r
+ ann = new Hashtable();\r
+ seqAnn.put(acc, ann);\r
+ }\r
+ // TODO test structure, call parseAnnotationRow with vector from\r
+ // hashtable for specific sequence\r
+ Hashtable features;\r
+ // Get an object with all the content for an annotation\r
+ if (ann.containsKey("features"))\r
+ {\r
+ // logger.debug("Found features for " + acc);\r
+ features = (Hashtable) ann.get("features");\r
+ }\r
+ else\r
+ {\r
+ // logger.debug("Creating new features holder for " + acc);\r
+ features = new Hashtable();\r
+ ann.put("features", features);\r
+ }\r
+ \r
+ Hashtable content;\r
+ if (features.containsKey(this.id2type(type)))\r
+ {\r
+ // logger.debug("Found content for " + this.id2type(type));\r
+ content = (Hashtable) features.get(this.id2type(type));\r
+ }\r
+ else\r
+ {\r
+ // logger.debug("Creating new content holder for " +\r
+ // this.id2type(type));\r
+ content = new Hashtable();\r
+ features.put(this.id2type(type), content);\r
+ }\r
+ String ns = (String) content.get(description);\r
+ if (ns == null)\r
+ {\r
+ ns = "";\r
+ }\r
+ ns += seq;\r
+ content.put(description, ns);\r
+ \r
+// if(type.equals("SS")){\r
+ Hashtable strucAnn;\r
+ if (seqAnn.containsKey(acc))\r
+ {\r
+ strucAnn = (Hashtable) seqAnn.get(acc);\r
+ }\r
+ else\r
+ {\r
+ strucAnn = new Hashtable();\r
+ }\r
+ \r
+ Vector newStruc=new Vector();\r
+ parseAnnotationRow(newStruc, type,ns);\r
+ \r
+ strucAnn.put(type, newStruc);\r
+ seqAnn.put(acc, strucAnn);\r
+ }\r
+// }\r
+ else\r
+ {\r
+ System.err\r
+ .println("Warning - couldn't parse sequence annotation row line:\n"\r
+ + line);\r
+ // throw new IOException("Error parsing " + line);\r
+ }\r
+ }\r
+ else\r
+ {\r
+ throw new IOException("Unknown annotation detected: " + annType\r
+ + " " + annContent);\r
+ }\r
+ }\r
+ }\r
+ if (treeString.length() > 0)\r
+ {\r
+ if (treeName == null)\r
+ {\r
+ treeName = "Tree " + (1 + getTreeCount());\r
+ }\r
+ addNewickTree(treeName, treeString.toString());\r
+ }\r
+ }\r