2 * Jalview - A Sequence Alignment Editor and Viewer ($$Version-Rel$$)
3 * Copyright (C) $$Year-Rel$$ The Jalview Authors
5 * This file is part of Jalview.
7 * Jalview is free software: you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License
9 * as published by the Free Software Foundation, either version 3
10 * of the License, or (at your option) any later version.
12 * Jalview is distributed in the hope that it will be useful, but
13 * WITHOUT ANY WARRANTY; without even the implied warranty
14 * of MERCHANTABILITY or FITNESS FOR A PARTICULAR
15 * PURPOSE. See the GNU General Public License for more details.
17 * You should have received a copy of the GNU General Public License
18 * along with Jalview. If not, see <http://www.gnu.org/licenses/>.
19 * The Jalview Authors are detailed in the 'AUTHORS' file.
23 import jalview.analysis.SequenceIdMatcher;
24 import jalview.api.AlignViewportI;
25 import jalview.datamodel.AlignedCodonFrame;
26 import jalview.datamodel.Alignment;
27 import jalview.datamodel.AlignmentI;
28 import jalview.datamodel.SequenceDummy;
29 import jalview.datamodel.SequenceFeature;
30 import jalview.datamodel.SequenceI;
31 import jalview.schemes.AnnotationColourGradient;
32 import jalview.schemes.GraduatedColor;
33 import jalview.schemes.UserColourScheme;
34 import jalview.util.Format;
35 import jalview.util.MapList;
36 import jalview.util.ParseHtmlBodyAndLinks;
37 import jalview.util.StringUtils;
39 import java.awt.Color;
40 import java.io.IOException;
41 import java.util.ArrayList;
42 import java.util.Arrays;
43 import java.util.HashMap;
44 import java.util.Iterator;
45 import java.util.List;
47 import java.util.Map.Entry;
48 import java.util.StringTokenizer;
51 * Parses and writes features files, which may be in Jalview, GFF2 or GFF3
52 * format. These are tab-delimited formats but with differences in the use of
55 * A Jalview feature file may define feature colours and then declare that the
56 * remainder of the file is in GFF format with the line 'GFF'.
58 * GFF3 files may include alignment mappings for features, which Jalview will
59 * attempt to model, and may include sequence data following a ##FASTA line.
66 public class FeaturesFile extends AlignFile
68 protected static final String STRAND = "STRAND";
70 protected static final String FRAME = "FRAME";
72 protected static final String ATTRIBUTES = "ATTRIBUTES";
74 protected static final String TAB = "\t";
76 protected static final String GFF_VERSION = "##gff-version";
78 private AlignmentI lastmatchedAl = null;
80 private SequenceIdMatcher matcher = null;
82 protected AlignmentI dataset;
84 protected int gffVersion;
87 * Creates a new FeaturesFile object.
94 * Constructor which does not parse the file immediately
100 public FeaturesFile(String inFile, String type) throws IOException
102 super(false, inFile, type);
107 * @throws IOException
109 public FeaturesFile(FileParse source) throws IOException
115 * Constructor that optionally parses the file immediately
117 * @param parseImmediately
120 * @throws IOException
122 public FeaturesFile(boolean parseImmediately, String inFile, String type)
125 super(parseImmediately, inFile, type);
129 * Parse GFF or sequence features file using case-independent matching,
133 * - alignment/dataset containing sequences that are to be annotated
135 * - hashtable to store feature colour definitions
137 * - process html strings into plain text
138 * @return true if features were added
140 public boolean parse(AlignmentI align, Map<String, Object> colours,
143 return parse(align, colours, removeHTML, false);
147 * Extends the default addProperties by also adding peptide-to-cDNA mappings
148 * (if any) derived while parsing a GFF file
151 public void addProperties(AlignmentI al)
153 super.addProperties(al);
154 if (dataset != null && dataset.getCodonFrames() != null)
156 AlignmentI ds = (al.getDataset() == null) ? al : al.getDataset();
157 for (AlignedCodonFrame codons : dataset.getCodonFrames())
159 ds.addCodonFrame(codons);
165 * Parse GFF or Jalview format sequence features file
168 * - alignment/dataset containing sequences that are to be annotated
170 * - hashtable to store feature colour definitions
172 * - process html strings into plain text
173 * @param relaxedIdmatching
174 * - when true, ID matches to compound sequence IDs are allowed
175 * @return true if features were added
177 public boolean parse(AlignmentI align, Map<String, Object> colours,
178 boolean removeHTML, boolean relaxedIdmatching)
180 Map<String, String> gffProps = new HashMap<String, String>();
182 * keep track of any sequences we try to create from the data
184 List<SequenceI> newseqs = new ArrayList<SequenceI>();
190 String featureGroup = null;
192 while ((line = nextLine()) != null)
194 // skip comments/process pragmas
195 if (line.length() == 0 || line.startsWith("#"))
197 if (line.toLowerCase().startsWith("##"))
199 processGffPragma(line, gffProps, align, newseqs);
204 st = new StringTokenizer(line, TAB);
205 if (st.countTokens() == 1)
207 if (line.trim().equalsIgnoreCase("GFF"))
210 * Jalview features file with appended GFF
211 * assume GFF2 (though it may declare gff-version 3)
218 if (st.countTokens() > 1 && st.countTokens() < 4)
221 * if 2 or 3 tokens, we anticipate either 'startgroup', 'endgroup' or
222 * a feature type colour specification; not GFF format
224 String ft = st.nextToken();
225 if (ft.equalsIgnoreCase("startgroup"))
227 featureGroup = st.nextToken();
229 else if (ft.equalsIgnoreCase("endgroup"))
231 // We should check whether this is the current group,
232 // but at present there's no way of showing more than 1 group
238 parseFeatureColour(line, ft, st, colours);
244 * if not a comment, GFF pragma, startgroup, endgroup or feature
245 * colour specification, that just leaves a feature details line
246 * in either Jalview or GFF format
250 parseJalviewFeature(line, st, align, colours, removeHTML,
251 relaxedIdmatching, featureGroup);
255 parseGffFeature(st, align, relaxedIdmatching, newseqs);
259 } catch (Exception ex)
261 // should report somewhere useful for UI if necessary
262 warningMessage = ((warningMessage == null) ? "" : warningMessage)
263 + "Parsing error at\n" + line;
264 System.out.println("Error parsing feature file: " + ex + "\n" + line);
265 ex.printStackTrace(System.err);
274 * Try to parse a Jalview format feature specification. Returns true if
275 * successful or false if not.
280 * @param featureColours
282 * @param relaxedIdmatching
283 * @param featureGroup
285 protected boolean parseJalviewFeature(String line, StringTokenizer st,
286 AlignmentI alignment, Map<String, Object> featureColours,
287 boolean removeHTML, boolean relaxedIdmatching, String featureGroup)
290 * Jalview: description seqid seqIndex start end type [score]
292 String desc = st.nextToken();
293 String seqId = st.nextToken();
294 SequenceI seq = findName(alignment, seqId, relaxedIdmatching, null);
295 if (!st.hasMoreTokens())
298 .println("DEBUG: Run out of tokens when trying to identify the destination for the feature.. giving up.");
299 // in all probability, this isn't a file we understand, so bail
304 if (!seqId.equals("ID_NOT_SPECIFIED"))
306 seq = findName(alignment, seqId, relaxedIdmatching, null);
315 int idx = Integer.parseInt(st.nextToken());
316 seq = alignment.getSequenceAt(idx);
317 } catch (NumberFormatException ex)
325 System.out.println("Sequence not found: " + line);
329 int startPos = Integer.parseInt(st.nextToken());
330 int endPos = Integer.parseInt(st.nextToken());
332 String ft = st.nextToken();
334 if (!featureColours.containsKey(ft))
337 * Perhaps an old style groups file with no colours -
338 * synthesize a colour from the feature type
340 UserColourScheme ucs = new UserColourScheme(ft);
341 featureColours.put(ft, ucs.findColour('A'));
343 SequenceFeature sf = new SequenceFeature(ft, desc, "",
344 startPos, endPos, featureGroup);
345 if (st.hasMoreTokens())
350 score = new Float(st.nextToken()).floatValue();
351 // update colourgradient bounds if allowed to
352 } catch (NumberFormatException ex)
359 parseDescriptionHTML(sf, removeHTML);
361 seq.addSequenceFeature(sf);
364 && (seq = alignment.findName(seq, seqId, false)) != null)
366 seq.addSequenceFeature(new SequenceFeature(sf));
372 * Process a feature type colour specification
375 * the current input line (for error messages only)
377 * the first token on the line
379 * holds remaining tokens on the line
381 * map to which to add derived colour specification
383 protected void parseFeatureColour(String line, String featureType,
384 StringTokenizer st, Map<String, Object> colours)
386 Object colour = null;
387 String colscheme = st.nextToken();
388 if (colscheme.indexOf("|") > -1
389 || colscheme.trim().equalsIgnoreCase("label"))
391 colour = parseGraduatedColourScheme(line, colscheme);
395 UserColourScheme ucs = new UserColourScheme(colscheme);
396 colour = ucs.findColour('A');
400 colours.put(featureType, colour);
405 * Parse a Jalview graduated colour descriptor
408 * @param colourDescriptor
411 protected GraduatedColor parseGraduatedColourScheme(String line,
412 String colourDescriptor)
414 // Parse '|' separated graduated colourscheme fields:
415 // [label|][mincolour|maxcolour|[absolute|]minvalue|maxvalue|thresholdtype|thresholdvalue]
416 // can either provide 'label' only, first is optional, next two
417 // colors are required (but may be
418 // left blank), next is optional, next two min/max are required.
419 // first is either 'label'
420 // first/second and third are both hexadecimal or word equivalent
422 // next two are values parsed as floats.
423 // fifth is either 'above','below', or 'none'.
424 // sixth is a float value and only required when fifth is either
425 // 'above' or 'below'.
426 StringTokenizer gcol = new StringTokenizer(colourDescriptor, "|", true);
428 float min = Float.MIN_VALUE, max = Float.MAX_VALUE;
429 boolean labelCol = false;
431 String mincol = gcol.nextToken();
435 .println("Expected either 'label' or a colour specification in the line: "
439 String maxcol = null;
440 if (mincol.toLowerCase().indexOf("label") == 0)
443 mincol = (gcol.hasMoreTokens() ? gcol.nextToken() : null); // skip '|'
444 mincol = (gcol.hasMoreTokens() ? gcol.nextToken() : null);
446 String abso = null, minval, maxval;
449 // at least four more tokens
450 if (mincol.equals("|"))
456 gcol.nextToken(); // skip next '|'
458 // continue parsing rest of line
459 maxcol = gcol.nextToken();
460 if (maxcol.equals("|"))
466 gcol.nextToken(); // skip next '|'
468 abso = gcol.nextToken();
469 gcol.nextToken(); // skip next '|'
470 if (abso.toLowerCase().indexOf("abso") != 0)
477 minval = gcol.nextToken();
478 gcol.nextToken(); // skip next '|'
480 maxval = gcol.nextToken();
481 if (gcol.hasMoreTokens())
483 gcol.nextToken(); // skip next '|'
487 if (minval.length() > 0)
489 min = Float.valueOf(minval);
491 } catch (Exception e)
494 .println("Couldn't parse the minimum value for graduated colour for type ("
496 + ") - did you misspell 'auto' for the optional automatic colour switch ?");
501 if (maxval.length() > 0)
503 max = Float.valueOf(maxval);
505 } catch (Exception e)
508 .println("Couldn't parse the maximum value for graduated colour for type ("
509 + colourDescriptor + ")");
515 // add in some dummy min/max colours for the label-only
521 GraduatedColor colour = null;
524 colour = new GraduatedColor(
525 new UserColourScheme(mincol).findColour('A'),
526 new UserColourScheme(maxcol).findColour('A'), min, max);
527 } catch (Exception e)
529 System.err.println("Couldn't parse the graduated colour scheme ("
530 + colourDescriptor + ")");
535 colour.setColourByLabel(labelCol);
536 colour.setAutoScaled(abso == null);
537 // add in any additional parameters
538 String ttype = null, tval = null;
539 if (gcol.hasMoreTokens())
541 // threshold type and possibly a threshold value
542 ttype = gcol.nextToken();
543 if (ttype.toLowerCase().startsWith("below"))
545 colour.setThreshType(AnnotationColourGradient.BELOW_THRESHOLD);
547 else if (ttype.toLowerCase().startsWith("above"))
549 colour.setThreshType(AnnotationColourGradient.ABOVE_THRESHOLD);
553 colour.setThreshType(AnnotationColourGradient.NO_THRESHOLD);
554 if (!ttype.toLowerCase().startsWith("no"))
556 System.err.println("Ignoring unrecognised threshold type : "
561 if (colour.getThreshType() != AnnotationColourGradient.NO_THRESHOLD)
566 tval = gcol.nextToken();
567 colour.setThresh(new Float(tval).floatValue());
568 } catch (Exception e)
570 System.err.println("Couldn't parse threshold value as a float: ("
575 // parse the thresh-is-min token ?
576 if (gcol.hasMoreTokens())
579 .println("Ignoring additional tokens in parameters in graduated colour specification\n");
580 while (gcol.hasMoreTokens())
582 System.err.println("|" + gcol.nextToken());
584 System.err.println("\n");
591 * clear any temporary handles used to speed up ID matching
593 protected void resetMatcher()
595 lastmatchedAl = null;
600 * Returns a sequence matching the given id, as follows
602 * <li>matching is on exact sequence name, or on a token within the sequence
603 * name, or a dbxref, if relaxed matching is selected</li>
604 * <li>first tries to find a match in the alignment sequences</li>
605 * <li>else tries to find a match in the new sequences already generated
606 * parsing the features file</li>
607 * <li>else creates a new placeholder sequence, adds it to the new sequences
608 * list, and returns it</li>
613 * @param relaxedIdMatching
617 protected SequenceI findName(AlignmentI align, String seqId,
618 boolean relaxedIdMatching, List<SequenceI> newseqs)
620 SequenceI match = null;
621 if (relaxedIdMatching)
623 if (lastmatchedAl != align)
625 lastmatchedAl = align;
626 matcher = new SequenceIdMatcher(align.getSequencesArray());
629 matcher.addAll(newseqs);
632 match = matcher.findIdMatch(seqId);
636 match = align.findName(seqId, true);
637 if (match == null && newseqs != null)
639 for (SequenceI m : newseqs)
641 if (seqId.equals(m.getName()))
649 if (match == null && newseqs != null)
651 match = new SequenceDummy(seqId);
652 if (relaxedIdMatching)
654 matcher.addAll(Arrays.asList(new SequenceI[] { match }));
656 // add dummy sequence to the newseqs list
662 public void parseDescriptionHTML(SequenceFeature sf, boolean removeHTML)
664 if (sf.getDescription() == null)
668 ParseHtmlBodyAndLinks parsed = new ParseHtmlBodyAndLinks(
669 sf.getDescription(), removeHTML, newline);
671 sf.description = (removeHTML) ? parsed.getNonHtmlContent()
673 for (String link : parsed.getLinks())
681 * Generates a features file for seqs; includes non-positional features by default.
684 * source of sequence features
686 * hash of feature types and colours
687 * @return features file contents
689 public String printJalviewFormat(SequenceI[] sequences,
690 Map<String, Object> visible)
692 return printJalviewFormat(sequences, visible, true, true);
696 * generate a features file for seqs with colours from visible (if any)
701 * hash of Colours for each feature type
703 * when true only feature types in 'visible' will be output
705 * indicates if non-positional features should be output (regardless
707 * @return features file contents
709 public String printJalviewFormat(SequenceI[] sequences,
710 Map<String, Object> visible, boolean visOnly, boolean nonpos)
712 StringBuilder out = new StringBuilder(256);
713 boolean featuresGen = false;
714 if (visOnly && !nonpos && (visible == null || visible.size() < 1))
716 // no point continuing.
717 return "No Features Visible";
720 if (visible != null && visOnly)
722 // write feature colours only if we're given them and we are generating
724 // TODO: decide if feature links should also be written here ?
725 Iterator<String> en = visible.keySet().iterator();
726 String featureType, color;
729 featureType = en.next().toString();
731 if (visible.get(featureType) instanceof GraduatedColor)
733 GraduatedColor gc = (GraduatedColor) visible.get(featureType);
734 color = (gc.isColourByLabel() ? "label|" : "")
735 + Format.getHexString(gc.getMinColor()) + "|"
736 + Format.getHexString(gc.getMaxColor())
737 + (gc.isAutoScale() ? "|" : "|abso|") + gc.getMin() + "|"
739 if (gc.getThreshType() != AnnotationColourGradient.NO_THRESHOLD)
741 if (gc.getThreshType() == AnnotationColourGradient.BELOW_THRESHOLD)
747 if (gc.getThreshType() != AnnotationColourGradient.ABOVE_THRESHOLD)
749 System.err.println("WARNING: Unsupported threshold type ("
750 + gc.getThreshType() + ") : Assuming 'above'");
755 color += "|" + gc.getThresh();
762 else if (visible.get(featureType) instanceof Color)
764 color = Format.getHexString((Color) visible.get(featureType));
768 // legacy support for integer objects containing colour triplet values
769 color = Format.getHexString(new Color(Integer.parseInt(visible
770 .get(featureType).toString())));
772 out.append(featureType);
778 // Work out which groups are both present and visible
779 List<String> groups = new ArrayList<String>();
781 boolean isnonpos = false;
783 SequenceFeature[] features;
784 for (int i = 0; i < sequences.length; i++)
786 features = sequences[i].getSequenceFeatures();
787 if (features != null)
789 for (int j = 0; j < features.length; j++)
791 isnonpos = features[j].begin == 0 && features[j].end == 0;
792 if ((!nonpos && isnonpos)
793 || (!isnonpos && visOnly && !visible
794 .containsKey(features[j].type)))
799 if (features[j].featureGroup != null
800 && !groups.contains(features[j].featureGroup))
802 groups.add(features[j].featureGroup);
811 if (groups.size() > 0 && groupIndex < groups.size())
813 group = groups.get(groupIndex);
815 out.append("STARTGROUP").append(TAB);
824 for (int i = 0; i < sequences.length; i++)
826 features = sequences[i].getSequenceFeatures();
827 if (features != null)
829 for (int j = 0; j < features.length; j++)
831 isnonpos = features[j].begin == 0 && features[j].end == 0;
832 if ((!nonpos && isnonpos)
833 || (!isnonpos && visOnly && !visible
834 .containsKey(features[j].type)))
836 // skip if feature is nonpos and we ignore them or if we only
837 // output visible and it isn't non-pos and it's not visible
842 && (features[j].featureGroup == null || !features[j].featureGroup
848 if (group == null && features[j].featureGroup != null)
852 // we have features to output
854 if (features[j].description == null
855 || features[j].description.equals(""))
857 out.append(features[j].type).append(TAB);
861 if (features[j].links != null
862 && features[j].getDescription().indexOf("<html>") == -1)
864 out.append("<html>");
867 out.append(features[j].description + " ");
868 if (features[j].links != null)
870 for (int l = 0; l < features[j].links.size(); l++)
872 String label = features[j].links.elementAt(l).toString();
873 String href = label.substring(label.indexOf("|") + 1);
874 label = label.substring(0, label.indexOf("|"));
876 if (features[j].description.indexOf(href) == -1)
878 out.append("<a href=\"" + href + "\">" + label + "</a>");
882 if (features[j].getDescription().indexOf("</html>") == -1)
884 out.append("</html>");
890 out.append(sequences[i].getName());
891 out.append("\t-1\t");
892 out.append(features[j].begin);
894 out.append(features[j].end);
896 out.append(features[j].type);
897 if (!Float.isNaN(features[j].score))
900 out.append(features[j].score);
909 out.append("ENDGROUP").append(TAB);
919 } while (groupIndex < groups.size() + 1);
923 return "No Features Visible";
926 return out.toString();
930 * Parse method that is called when a GFF file is dragged to the desktop
935 AlignViewportI av = getViewport();
938 if (av.getAlignment() != null)
940 dataset = av.getAlignment().getDataset();
944 // working in the applet context ?
945 dataset = av.getAlignment();
950 dataset = new Alignment(new SequenceI[] {});
953 boolean parseResult = parse(dataset, null, false, true);
956 // pass error up somehow
960 // update viewport with the dataset data ?
964 setSeqs(dataset.getSequencesArray());
969 * Implementation of unused abstract method
971 * @return error message
974 public String print()
976 return "Use printGffFormat() or printJalviewFormat()";
980 * Returns features output in GFF2 format, including hidden and non-positional
984 * the sequences whose features are to be output
986 * a map whose keys are the type names of visible features
989 public String printGffFormat(SequenceI[] sequences, Map<String, Object> visible)
991 return printGffFormat(sequences, visible, true, true);
995 * Returns features output in GFF2 format
998 * the sequences whose features are to be output
1000 * a map whose keys are the type names of visible features
1001 * @param outputVisibleOnly
1002 * @param includeNonPositionalFeatures
1005 public String printGffFormat(SequenceI[] sequences, Map<String, Object> visible, boolean outputVisibleOnly,
1006 boolean includeNonPositionalFeatures)
1008 StringBuilder out = new StringBuilder(256);
1009 out.append(String.format("%s %d\n", GFF_VERSION, gffVersion));
1012 for (SequenceI seq : sequences)
1014 SequenceFeature[] features = seq.getSequenceFeatures();
1015 if (features != null)
1017 for (SequenceFeature sf : features)
1019 isnonpos = sf.begin == 0 && sf.end == 0;
1020 if (!includeNonPositionalFeatures && isnonpos)
1023 * ignore non-positional features if not wanted
1027 // TODO why the test !isnonpos here?
1028 // what about not visible non-positional features?
1029 if (!isnonpos && outputVisibleOnly
1030 && !visible.containsKey(sf.type))
1033 * ignore not visible features if not wanted
1038 source = sf.featureGroup;
1041 source = sf.getDescription();
1044 out.append(seq.getName());
1048 out.append(sf.type);
1050 out.append(sf.begin);
1054 out.append(sf.score);
1057 out.append(sf.getValue(STRAND, "."));
1060 out.append(sf.getValue(FRAME, "."));
1062 // miscellaneous key-values (GFF column 9)
1063 String attributes = (String) sf.getValue(ATTRIBUTES);
1064 if (attributes != null)
1066 out.append(TAB).append(attributes);
1069 out.append(newline);
1074 return out.toString();
1078 * Helper method to make a mapping given a set of attributes for a GFF feature
1083 * either 1 (forward) or -1 (reverse)
1085 * @throws InvalidGFF3FieldException
1087 protected MapList constructCodonMappingFromAlign(
1088 Map<String, List<String>> set, String attr,
1089 int strand) throws InvalidGFF3FieldException
1093 throw new InvalidGFF3FieldException(attr, set,
1094 "Invalid strand for a codon mapping (cannot be 0)");
1096 List<Integer> fromrange = new ArrayList<Integer>();
1097 List<Integer> torange = new ArrayList<Integer>();
1098 int lastppos = 0, lastpframe = 0;
1099 for (String range : set.get(attr))
1101 List<Integer> ints = new ArrayList<Integer>();
1102 StringTokenizer st = new StringTokenizer(range, " ");
1103 while (st.hasMoreTokens())
1105 String num = st.nextToken();
1108 ints.add(new Integer(num));
1109 } catch (NumberFormatException nfe)
1111 throw new InvalidGFF3FieldException(attr, set,
1112 "Invalid number in field " + num);
1116 * Align positionInRef positionInQuery LengthInRef
1117 * contig_1146 exonerate:p2g:local similarity 8534 11269 3652 - .
1118 * alignment_id 0 ; Query DDB_G0269124 Align 11270 143 120
1120 * 120 bases align at pos 143 in protein to 11270 on dna (-ve strand)
1121 * and so on for additional ' ; Align x y z' groups
1123 if (ints.size() != 3)
1125 throw new InvalidGFF3FieldException(attr, set,
1126 "Invalid number of fields for this attribute ("
1127 + ints.size() + ")");
1129 fromrange.add(ints.get(0));
1130 fromrange.add(ints.get(0) + strand * ints.get(2));
1131 // how are intron/exon boundaries that do not align in codons
1133 if (ints.get(1).intValue() == lastppos && lastpframe > 0)
1135 // extend existing to map
1136 lastppos += ints.get(2) / 3;
1137 lastpframe = ints.get(2) % 3;
1138 torange.set(torange.size() - 1, new Integer(lastppos));
1143 torange.add(ints.get(1));
1144 lastppos = ints.get(1) + ints.get(2) / 3;
1145 lastpframe = ints.get(2) % 3;
1146 torange.add(new Integer(lastppos));
1149 // from and to ranges must end up being a series of start/end intervals
1150 if (fromrange.size() % 2 == 1)
1152 throw new InvalidGFF3FieldException(attr, set,
1153 "Couldn't parse the DNA alignment range correctly");
1155 if (torange.size() % 2 == 1)
1157 throw new InvalidGFF3FieldException(attr, set,
1158 "Couldn't parse the protein alignment range correctly");
1160 // finally, build the map
1161 int[] frommap = new int[fromrange.size()], tomap = new int[torange
1164 for (Integer ip : fromrange)
1166 frommap[p++] = ip.intValue();
1169 for (Integer ip : torange)
1171 tomap[p++] = ip.intValue();
1174 return new MapList(frommap, tomap, 3, 1);
1177 private List<SequenceI> findNames(AlignmentI align, List<SequenceI> newseqs, boolean relaxedIdMatching,
1180 List<SequenceI> found = new ArrayList<SequenceI>();
1181 for (String seqId : list)
1183 SequenceI seq = findName(align, seqId, relaxedIdMatching, newseqs);
1193 * Parse a GFF format feature. This may include creating a 'dummy' sequence
1194 * for the feature or its mapped sequence
1198 * @param relaxedIdmatching
1202 protected SequenceI parseGffFeature(StringTokenizer st, AlignmentI alignment, boolean relaxedIdmatching,
1203 List<SequenceI> newseqs)
1207 * GFF: seqid source type start end score strand phase [attributes]
1209 String seqId = st.nextToken();
1212 * locate referenced sequence in alignment _or_
1213 * as a forward reference (SequenceDummy)
1215 seq = findName(alignment, seqId, relaxedIdmatching, newseqs);
1217 String desc = st.nextToken();
1218 String group = null;
1219 if (desc.indexOf(' ') == -1)
1221 // could also be a source term rather than description line
1224 String ft = st.nextToken();
1225 int startPos = StringUtils.parseInt(st.nextToken());
1226 int endPos = StringUtils.parseInt(st.nextToken());
1227 // TODO: decide if non positional feature assertion for input data
1228 // where end==0 is generally valid
1231 // treat as non-positional feature, regardless.
1237 score = new Float(st.nextToken()).floatValue();
1238 } catch (NumberFormatException ex)
1243 SequenceFeature sf = new SequenceFeature(ft, desc, startPos,
1244 endPos, score, group);
1245 if (st.hasMoreTokens())
1247 sf.setValue(STRAND, st.nextToken());
1249 if (st.hasMoreTokens())
1251 sf.setValue(FRAME, st.nextToken());
1254 if (st.hasMoreTokens())
1256 String attributes = st.nextToken();
1257 sf.setValue(ATTRIBUTES, attributes);
1260 * parse semi-structured attributes in column 9 and add them to the
1261 * sequence feature's 'otherData' table; use Note as a best proxy for
1264 Map<String, List<String>> nameValues = StringUtils.parseNameValuePairs(attributes, ";",
1265 new char[] { ' ', '=' });
1266 for (Entry<String, List<String>> attr : nameValues.entrySet())
1268 String values = StringUtils.listToDelimitedString(attr.getValue(),
1270 sf.setValue(attr.getKey(), values);
1271 if ("Note".equals(attr.getKey()))
1273 sf.setDescription(values);
1278 if (processOrAddSeqFeature(alignment, newseqs, seq, sf,
1281 // check whether we should add the sequence feature to any other
1282 // sequences in the alignment with the same or similar
1283 while ((seq = alignment.findName(seq, seqId, true)) != null)
1285 seq.addSequenceFeature(new SequenceFeature(sf));
1292 * After encountering ##fasta in a GFF3 file, process the remainder of the
1293 * file as FASTA sequence data. Any placeholder sequences created during
1294 * feature parsing are updated with the actual sequences.
1298 * @throws IOException
1300 protected void processAsFasta(AlignmentI align, List<SequenceI> newseqs)
1306 } catch (IOException q)
1309 FastaFile parser = new FastaFile(this);
1310 List<SequenceI> includedseqs = parser.getSeqs();
1311 SequenceIdMatcher smatcher = new SequenceIdMatcher(newseqs);
1312 // iterate over includedseqs, and replacing matching ones with newseqs
1313 // sequences. Generic iterator not used here because we modify includedseqs
1315 for (int p = 0, pSize = includedseqs.size(); p < pSize; p++)
1317 // search for any dummy seqs that this sequence can be used to update
1318 SequenceI dummyseq = smatcher.findIdMatch(includedseqs.get(p));
1319 if (dummyseq != null)
1321 // dummyseq was created so it could be annotated and referred to in
1322 // alignments/codon mappings
1324 SequenceI mseq = includedseqs.get(p);
1325 // mseq is the 'template' imported from the FASTA file which we'll use
1326 // to complete dummyseq
1327 if (dummyseq instanceof SequenceDummy)
1329 // probably have the pattern wrong
1330 // idea is that a flyweight proxy for a sequence ID can be created for
1331 // 1. stable reference creation
1332 // 2. addition of annotation
1333 // 3. future replacement by a real sequence
1334 // current pattern is to create SequenceDummy objects - a convenience
1335 // constructor for a Sequence.
1336 // problem is that when promoted to a real sequence, all references
1338 // to be updated somehow.
1339 ((SequenceDummy) dummyseq).become(mseq);
1340 includedseqs.set(p, dummyseq); // template is no longer needed
1344 // finally add sequences to the dataset
1345 for (SequenceI seq : includedseqs)
1347 align.addSequence(seq);
1352 * Process a ## directive
1358 * @throws IOException
1360 protected void processGffPragma(String line, Map<String, String> gffProps, AlignmentI align,
1361 List<SequenceI> newseqs) throws IOException
1364 if ("###".equals(line))
1366 // close off any open 'forward references'
1370 String[] tokens = line.substring(2).split(" ");
1371 String pragma = tokens[0];
1372 String value = tokens.length == 1 ? null : tokens[1];
1374 if ("gff-version".equalsIgnoreCase(pragma))
1380 // value may be e.g. "3.1.2"
1381 gffVersion = Integer.parseInt(value.split("\\.")[0]);
1382 } catch (NumberFormatException e)
1388 else if ("feature-ontology".equalsIgnoreCase(pragma))
1390 // should resolve against the specified feature ontology URI
1392 else if ("attribute-ontology".equalsIgnoreCase(pragma))
1394 // URI of attribute ontology - not currently used in GFF3
1396 else if ("source-ontology".equalsIgnoreCase(pragma))
1398 // URI of source ontology - not currently used in GFF3
1400 else if ("species-build".equalsIgnoreCase(pragma))
1402 // save URI of specific NCBI taxon version of annotations
1403 gffProps.put("species-build", value);
1405 else if ("fasta".equalsIgnoreCase(pragma))
1407 // process the rest of the file as a fasta file and replace any dummy
1409 processAsFasta(align, newseqs);
1413 System.err.println("Ignoring unknown pragma: " + line);
1418 * Processes the 'Query' and 'Align' properties associated with a GFF
1419 * similarity feature; these properties define the mapping of the annotated
1420 * feature to another from which it has transferred annotation
1427 public void processGffSimilarity(Map<String, List<String>> set, SequenceI seq,
1428 SequenceFeature sf, AlignmentI align, List<SequenceI> newseqs, boolean relaxedIdMatching)
1429 throws InvalidGFF3FieldException
1431 int strand = sf.getStrand();
1432 // exonerate cdna/protein map
1434 List<SequenceI> querySeq = findNames(align, newseqs, relaxedIdMatching,
1436 if (querySeq == null || querySeq.size() != 1)
1438 throw new InvalidGFF3FieldException("Query", set,
1439 "Expecting exactly one sequence in Query field (got "
1440 + set.get("Query") + ")");
1442 if (set.containsKey("Align"))
1444 // process the align maps and create cdna/protein maps
1445 // ideally, the query sequences are in the alignment, but maybe not...
1447 AlignedCodonFrame alco = new AlignedCodonFrame();
1448 MapList codonmapping = constructCodonMappingFromAlign(set, "Align",
1451 // add codon mapping, and hope!
1452 alco.addMap(seq, querySeq.get(0), codonmapping);
1453 align.addCodonFrame(alco);
1459 * take a sequence feature and examine its attributes to decide how it should
1460 * be added to a sequence
1463 * - the destination sequence constructed or discovered in the
1466 * - the base feature with ATTRIBUTES property containing any
1467 * additional attributes
1469 * - true if we are processing a GFF annotation file
1470 * @return true if sf was actually added to the sequence, false if it was
1471 * processed in another way
1473 public boolean processOrAddSeqFeature(AlignmentI align, List<SequenceI> newseqs,
1474 SequenceI seq, SequenceFeature sf, boolean relaxedIdMatching)
1476 String attr = (String) sf.getValue(ATTRIBUTES);
1477 boolean addFeature = true;
1480 for (String attset : attr.split(TAB))
1482 Map<String, List<String>> set = StringUtils.parseNameValuePairs(
1483 attset, ";", new char[] { ' ', '-' });
1485 if ("similarity".equals(sf.getType()))
1489 processGffSimilarity(set, seq, sf, align, newseqs,
1492 } catch (InvalidGFF3FieldException ivfe)
1494 System.err.println(ivfe);
1501 seq.addSequenceFeature(sf);
1508 class InvalidGFF3FieldException extends Exception
1510 String field, value;
1512 public InvalidGFF3FieldException(String field,
1513 Map<String, List<String>> set, String message)
1515 super(message + " (Field was " + field + " and value was "
1516 + set.get(field).toString());
1518 this.value = set.get(field).toString();