2 * Jalview - A Sequence Alignment Editor and Viewer ($$Version-Rel$$)
3 * Copyright (C) $$Year-Rel$$ The Jalview Authors
5 * This file is part of Jalview.
7 * Jalview is free software: you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License
9 * as published by the Free Software Foundation, either version 3
10 * of the License, or (at your option) any later version.
12 * Jalview is distributed in the hope that it will be useful, but
13 * WITHOUT ANY WARRANTY; without even the implied warranty
14 * of MERCHANTABILITY or FITNESS FOR A PARTICULAR
15 * PURPOSE. See the GNU General Public License for more details.
17 * You should have received a copy of the GNU General Public License
18 * along with Jalview. If not, see <http://www.gnu.org/licenses/>.
19 * The Jalview Authors are detailed in the 'AUTHORS' file.
23 import jalview.analysis.SequenceIdMatcher;
24 import jalview.api.AlignViewportI;
25 import jalview.datamodel.AlignedCodonFrame;
26 import jalview.datamodel.Alignment;
27 import jalview.datamodel.AlignmentI;
28 import jalview.datamodel.SequenceDummy;
29 import jalview.datamodel.SequenceFeature;
30 import jalview.datamodel.SequenceI;
31 import jalview.schemes.AnnotationColourGradient;
32 import jalview.schemes.GraduatedColor;
33 import jalview.schemes.UserColourScheme;
34 import jalview.util.Format;
35 import jalview.util.MapList;
36 import jalview.util.ParseHtmlBodyAndLinks;
37 import jalview.util.StringUtils;
39 import java.awt.Color;
40 import java.io.IOException;
41 import java.util.ArrayList;
42 import java.util.Arrays;
43 import java.util.HashMap;
44 import java.util.Iterator;
45 import java.util.List;
47 import java.util.Map.Entry;
48 import java.util.StringTokenizer;
51 * Parses and writes features files, which may be in Jalview, GFF2 or GFF3
52 * format. These are tab-delimited formats but with differences in the use of
55 * A Jalview feature file may define feature colours and then declare that the
56 * remainder of the file is in GFF format with the line 'GFF'.
58 * GFF3 files may include alignment mappings for features, which Jalview will
59 * attempt to model, and may include sequence data following a ##FASTA line.
66 public class FeaturesFile extends AlignFile
// Attribute name whose value processGffColumnNine copies into the feature description
68 private static final String NOTE = "Note";
// Keys looked up in the parsed attribute map by processGffSimilarity
70 private static final String ALIGN = "Align";
72 private static final String QUERY = "Query";
74 private static final String TARGET = "Target";
// Feature type that processOrAddSeqFeature hands to processGffSimilarity
76 private static final String SIMILARITY = "similarity";
// Keys under which the GFF strand, frame and column 9 text are stored on a SequenceFeature
78 protected static final String STRAND = "STRAND";
80 protected static final String FRAME = "FRAME";
82 protected static final String ATTRIBUTES = "ATTRIBUTES";
// Column delimiter used for both parsing and output
84 protected static final String TAB = "\t";
// Pragma text written as the header line by printGffFormat
86 protected static final String GFF_VERSION = "##gff-version";
// Alignment for which 'matcher' was built; findName rebuilds the matcher when it changes
88 private AlignmentI lastmatchedAl = null;
// Id matcher used by findName when relaxed id matching is requested
90 private SequenceIdMatcher matcher = null;
// Alignment filled by the no-argument parse; its codon frames are copied out by addProperties
92 protected AlignmentI dataset;
// Major version taken from a ##gff-version pragma (see processGffPragma)
94 protected int gffVersion;
97 * Creates a new FeaturesFile object.
104 * Constructor which does not parse the file immediately
108 * @throws IOException
// Builds a reader for the given input without parsing it: 'false' is passed
// as the first (parse-immediately) argument of the superclass constructor.
110 public FeaturesFile(String inFile, String type) throws IOException
112 super(false, inFile, type);
117 * @throws IOException
// Builds a reader over an existing FileParse source.
// NOTE(review): the constructor body is not visible in this view - confirm what it forwards to.
119 public FeaturesFile(FileParse source) throws IOException
125 * Constructor that optionally parses the file immediately
127 * @param parseImmediately
130 * @throws IOException
// Builds a reader for the given input, forwarding the caller's choice of
// whether to parse immediately to the superclass constructor.
132 public FeaturesFile(boolean parseImmediately, String inFile, String type)
135 super(parseImmediately, inFile, type);
139 * Parse GFF or sequence features file using case-independent matching,
143 * - alignment/dataset containing sequences that are to be annotated
145 * - hashtable to store feature colour definitions
147 * - process html strings into plain text
148 * @return true if features were added
150 public boolean parse(AlignmentI align, Map<String, Object> colours,
153 return parse(align, colours, removeHTML, false);
157 * Extends the default addProperties by also adding peptide-to-cDNA mappings
158 * (if any) derived while parsing a GFF file
161 public void addProperties(AlignmentI al)
163 super.addProperties(al);
164 if (dataset != null && dataset.getCodonFrames() != null)
166 AlignmentI ds = (al.getDataset() == null) ? al : al.getDataset();
167 for (AlignedCodonFrame codons : dataset.getCodonFrames())
169 ds.addCodonFrame(codons);
175 * Parse GFF or Jalview format sequence features file
178 * - alignment/dataset containing sequences that are to be annotated
180 * - hashtable to store feature colour definitions
182 * - process html strings into plain text
183 * @param relaxedIdmatching
184 * - when true, ID matches to compound sequence IDs are allowed
185 * @return true if features were added
// Reads the input one line at a time via nextLine() and dispatches each line:
// lines starting with "##" go to processGffPragma, a lone "GFF" token marks
// appended GFF content, lines with 2 or 3 tab-separated tokens are group
// markers or colour definitions, and the remaining lines are parsed as
// Jalview or GFF feature lines. Any exception is recorded in warningMessage
// and reported on stdout/stderr.
// NOTE(review): short lines (braces, else/try, brief statements) are not
// visible in this view, so the exact branch structure should be confirmed.
187 public boolean parse(AlignmentI align, Map<String, Object> colours,
188 boolean removeHTML, boolean relaxedIdmatching)
// values collected from GFF pragmas (handed to processGffPragma)
190 Map<String, String> gffProps = new HashMap<String, String>();
192 * keep track of any sequences we try to create from the data
194 List<SequenceI> newseqs = new ArrayList<SequenceI>();
// group name taken from the most recent 'startgroup' line, passed on to parseJalviewFeature
200 String featureGroup = null;
202 while ((line = nextLine()) != null)
204 // skip comments/process pragmas
205 if (line.length() == 0 || line.startsWith("#"))
207 if (line.toLowerCase().startsWith("##"))
209 processGffPragma(line, gffProps, align, newseqs);
// columns are tab-delimited
214 st = new StringTokenizer(line, TAB);
215 if (st.countTokens() == 1)
217 if (line.trim().equalsIgnoreCase("GFF"))
220 * Jalview features file with appendded GFF
221 * assume GFF2 (though it may declare gff-version 3)
228 if (st.countTokens() > 1 && st.countTokens() < 4)
231 * if 2 or 3 tokens, we anticipate either 'startgroup', 'endgroup' or
232 * a feature type colour specification; not GFF format
234 String ft = st.nextToken();
235 if (ft.equalsIgnoreCase("startgroup"))
237 featureGroup = st.nextToken();
239 else if (ft.equalsIgnoreCase("endgroup"))
241 // We should check whether this is the current group,
242 // but at present there's no way of showing more than 1 group
248 parseFeatureColour(line, ft, st, colours);
254 * if not a comment, GFF pragma, startgroup, endgroup or feature
255 * colour specification, that just leaves a feature details line
256 * in either Jalview or GFF format
260 parseJalviewFeature(line, st, align, colours, removeHTML,
261 relaxedIdmatching, featureGroup);
265 parseGffFeature(st, align, relaxedIdmatching, newseqs);
269 } catch (Exception ex)
271 // should report somewhere useful for UI if necessary
// append to any earlier warning rather than replacing it
272 warningMessage = ((warningMessage == null) ? "" : warningMessage)
273 + "Parsing error at\n" + line;
274 System.out.println("Error parsing feature file: " + ex + "\n" + line);
275 ex.printStackTrace(System.err);
284 * Try to parse a Jalview format feature specification. Returns true if
285 * successful or false if not.
290 * @param featureColours
292 * @param relaxedIdmatching
293 * @param featureGroup
// Parses one Jalview-format feature line (tokens: description, sequence id,
// sequence index, start, end, type and an optional score), builds a
// SequenceFeature in the given group and adds it to the located sequence.
// Lines with fewer than 6 tokens are reported on stderr; a line whose
// sequence cannot be found is reported on stdout.
// NOTE(review): short lines (braces, else/try, returns) are not visible in
// this view; the return values for each path should be confirmed.
295 protected boolean parseJalviewFeature(String line, StringTokenizer st,
296 AlignmentI alignment, Map<String, Object> featureColours,
297 boolean removeHTML, boolean relaxedIdMatching, String featureGroup)
300 * Jalview: description seqid seqIndex start end type [score]
302 if (st.countTokens() < 6)
304 System.err.println("Ignoring feature line '" + line
305 + "' with unexpected number of columns (" + st.countTokens()
309 String desc = st.nextToken();
310 String seqId = st.nextToken();
// NOTE(review): the same lookup is repeated a few lines below when the id is
// specified - this first call looks redundant; confirm before removing.
311 SequenceI seq = findName(alignment, null, relaxedIdMatching, seqId);
// "ID_NOT_SPECIFIED" is the marker for lines that identify the sequence by index instead of by name
313 if (!seqId.equals("ID_NOT_SPECIFIED"))
315 seq = findName(alignment, null, relaxedIdMatching, seqId);
// otherwise the next token is read as an index into the alignment
324 int idx = Integer.parseInt(st.nextToken());
325 seq = alignment.getSequenceAt(idx);
326 } catch (NumberFormatException ex)
334 System.out.println("Sequence not found: " + line);
338 int startPos = Integer.parseInt(st.nextToken());
339 int endPos = Integer.parseInt(st.nextToken());
341 String ft = st.nextToken();
343 if (!featureColours.containsKey(ft))
346 * Perhaps an old style groups file with no colours -
347 * synthesize a colour from the feature type
349 UserColourScheme ucs = new UserColourScheme(ft);
350 featureColours.put(ft, ucs.findColour('A'));
352 SequenceFeature sf = new SequenceFeature(ft, desc, "",
353 startPos, endPos, featureGroup);
// an optional trailing token is read as the score
354 if (st.hasMoreTokens())
359 score = new Float(st.nextToken()).floatValue();
360 // update colourgradient bounds if allowed to
361 } catch (NumberFormatException ex)
368 parseDescriptionHTML(sf, removeHTML);
370 seq.addSequenceFeature(sf);
// further sequences returned by alignment.findName for the same id each receive their own copy of the feature
373 && (seq = alignment.findName(seq, seqId, false)) != null)
375 seq.addSequenceFeature(new SequenceFeature(sf));
381 * Process a feature type colour specification
384 * the current input line (for error messages only)
386 * the first token on the line
388 * holds remaining tokens on the line
390 * map to which to add derived colour specification
// Reads the next token as a colour specification for 'featureType'. A token
// containing '|' or equal to "label" (ignoring case and surrounding
// whitespace) is parsed as a graduated colour scheme; anything else is
// resolved through UserColourScheme. The result is stored in 'colours'.
// NOTE(review): the lines around colours.put are not visible in this view -
// confirm whether a failed (null) colour is filtered out before the put.
392 protected void parseFeatureColour(String line, String featureType,
393 StringTokenizer st, Map<String, Object> colours)
395 Object colour = null;
396 String colscheme = st.nextToken();
397 if (colscheme.indexOf("|") > -1
398 || colscheme.trim().equalsIgnoreCase("label"))
400 colour = parseGraduatedColourScheme(line, colscheme);
404 UserColourScheme ucs = new UserColourScheme(colscheme);
405 colour = ucs.findColour('A');
409 colours.put(featureType, colour);
414 * Parse a Jalview graduated colour descriptor
417 * @param colourDescriptor
// Builds a GraduatedColor from a '|'-separated descriptor. The tokenizer
// returns the '|' delimiters as tokens, which is why delimiter tokens are
// explicitly skipped between fields. Problems are reported on stderr.
// NOTE(review): short lines (braces, else/try, returns, brief assignments)
// are not visible in this view; confirm the failure return values.
420 protected GraduatedColor parseGraduatedColourScheme(String line,
421 String colourDescriptor)
423 // Parse '|' separated graduated colourscheme fields:
424 // [label|][mincolour|maxcolour|[absolute|]minvalue|maxvalue|thresholdtype|thresholdvalue]
425 // can either provide 'label' only, first is optional, next two
426 // colors are required (but may be
427 // left blank), next is optional, next two min/max are required.
428 // first is either 'label'
429 // first/second and third are both hexadecimal or word equivalent
431 // next two are values parsed as floats.
432 // fifth is either 'above','below', or 'none'.
433 // sixth is a float value and only required when fifth is either
434 // 'above' or 'below'.
435 StringTokenizer gcol = new StringTokenizer(colourDescriptor, "|", true);
// NOTE(review): Float.MIN_VALUE is the smallest positive float, not the most
// negative one - confirm this is the intended default lower bound.
437 float min = Float.MIN_VALUE, max = Float.MAX_VALUE;
438 boolean labelCol = false;
440 String mincol = gcol.nextToken();
444 .println("Expected either 'label' or a colour specification in the line: "
448 String maxcol = null;
// a leading "label" field is consumed, then the following delimiter and field are read
449 if (mincol.toLowerCase().indexOf("label") == 0)
452 mincol = (gcol.hasMoreTokens() ? gcol.nextToken() : null); // skip '|'
453 mincol = (gcol.hasMoreTokens() ? gcol.nextToken() : null);
455 String abso = null, minval, maxval;
458 // at least four more tokens
// a "|" read where a colour was expected means that colour field was left blank
459 if (mincol.equals("|"))
465 gcol.nextToken(); // skip next '|'
467 // continue parsing rest of line
468 maxcol = gcol.nextToken();
469 if (maxcol.equals("|"))
475 gcol.nextToken(); // skip next '|'
477 abso = gcol.nextToken();
478 gcol.nextToken(); // skip next '|'
// the token after the colours only counts as the optional flag if it starts with "abso"
479 if (abso.toLowerCase().indexOf("abso") != 0)
486 minval = gcol.nextToken();
487 gcol.nextToken(); // skip next '|'
489 maxval = gcol.nextToken();
490 if (gcol.hasMoreTokens())
492 gcol.nextToken(); // skip next '|'
// empty min/max fields leave the defaults declared above in place
496 if (minval.length() > 0)
498 min = Float.valueOf(minval);
500 } catch (Exception e)
503 .println("Couldn't parse the minimum value for graduated colour for type ("
505 + ") - did you misspell 'auto' for the optional automatic colour switch ?");
510 if (maxval.length() > 0)
512 max = Float.valueOf(maxval);
514 } catch (Exception e)
517 .println("Couldn't parse the maximum value for graduated colour for type ("
518 + colourDescriptor + ")");
524 // add in some dummy min/max colours for the label-only
530 GraduatedColor colour = null;
533 colour = new GraduatedColor(
534 new UserColourScheme(mincol).findColour('A'),
535 new UserColourScheme(maxcol).findColour('A'), min, max);
536 } catch (Exception e)
538 System.err.println("Couldn't parse the graduated colour scheme ("
539 + colourDescriptor + ")");
544 colour.setColourByLabel(labelCol);
// the scheme is auto-scaled exactly when no 'abso' flag was recorded
545 colour.setAutoScaled(abso == null);
546 // add in any additional parameters
547 String ttype = null, tval = null;
548 if (gcol.hasMoreTokens())
550 // threshold type and possibly a threshold value
551 ttype = gcol.nextToken();
552 if (ttype.toLowerCase().startsWith("below"))
554 colour.setThreshType(AnnotationColourGradient.BELOW_THRESHOLD);
556 else if (ttype.toLowerCase().startsWith("above"))
558 colour.setThreshType(AnnotationColourGradient.ABOVE_THRESHOLD);
562 colour.setThreshType(AnnotationColourGradient.NO_THRESHOLD);
563 if (!ttype.toLowerCase().startsWith("no"))
565 System.err.println("Ignoring unrecognised threshold type : "
// a threshold value is only read when a threshold type was set
570 if (colour.getThreshType() != AnnotationColourGradient.NO_THRESHOLD)
575 tval = gcol.nextToken();
576 colour.setThresh(new Float(tval).floatValue());
577 } catch (Exception e)
579 System.err.println("Couldn't parse threshold value as a float: ("
584 // parse the thresh-is-min token ?
// anything still left in the descriptor is reported and discarded
585 if (gcol.hasMoreTokens())
588 .println("Ignoring additional tokens in parameters in graduated colour specification\n");
589 while (gcol.hasMoreTokens())
591 System.err.println("|" + gcol.nextToken());
593 System.err.println("\n");
600 * clear any temporary handles used to speed up ID matching
// Forgets the alignment the cached id matcher was built for, so that
// findName builds a fresh matcher on its next relaxed-matching call.
// NOTE(review): any further statements in this method are not visible in this view.
602 protected void resetMatcher()
604 lastmatchedAl = null;
609 * Returns a sequence matching the given id, as follows
611 * <li>strict matching is on exact sequence name</li>
612 * <li>relaxed matching allows matching on a token within the sequence name,
614 * <li>first tries to find a match in the alignment sequences</li>
615 * <li>else tries to find a match in the new sequences already generated while
616 * parsing the features file</li>
617 * <li>else creates a new placeholder sequence, adds it to the new sequences
618 * list, and returns it</li>
623 * @param relaxedIdMatching
// Looks up the sequence for 'seqId'. With relaxed matching a
// SequenceIdMatcher over the alignment's sequences is used, rebuilt whenever
// a different alignment is passed. With strict matching align.findName is
// tried first, then 'newseqs' is scanned for an exact name match. If nothing
// matches and 'newseqs' is non-null, a SequenceDummy placeholder is created.
// NOTE(review): short lines (braces, else, brief statements, the return) are
// not visible in this view.
627 protected SequenceI findName(AlignmentI align, List<SequenceI> newseqs,
628 boolean relaxedIdMatching, String seqId)
630 SequenceI match = null;
631 if (relaxedIdMatching)
// the matcher is cached per alignment
633 if (lastmatchedAl != align)
635 lastmatchedAl = align;
636 matcher = new SequenceIdMatcher(align.getSequencesArray());
// NOTE(review): callers in this file pass newseqs == null; confirm the hidden lines guard this call
639 matcher.addAll(newseqs);
642 match = matcher.findIdMatch(seqId);
646 match = align.findName(seqId, true);
647 if (match == null && newseqs != null)
649 for (SequenceI m : newseqs)
651 if (seqId.equals(m.getName()))
// no match anywhere: create a placeholder, but only when the caller supplied a list to hold it
659 if (match == null && newseqs != null)
661 match = new SequenceDummy(seqId);
662 if (relaxedIdMatching)
// make the new placeholder visible to later relaxed lookups as well
664 matcher.addAll(Arrays.asList(new SequenceI[] { match }));
666 // add dummy sequence to the newseqs list
// Runs the feature's description through ParseHtmlBodyAndLinks and rewrites
// the description from the parsed result (its non-HTML content when
// removeHTML is true), then iterates over the links that were found.
// Does the null-description check first.
// NOTE(review): the else-branch of the description assignment and the loop
// body are not visible in this view.
672 public void parseDescriptionHTML(SequenceFeature sf, boolean removeHTML)
674 if (sf.getDescription() == null)
678 ParseHtmlBodyAndLinks parsed = new ParseHtmlBodyAndLinks(
679 sf.getDescription(), removeHTML, newline);
681 sf.description = (removeHTML) ? parsed.getNonHtmlContent()
683 for (String link : parsed.getLinks())
691 * generate a features file for seqs; includes non-positional features by default.
694 * source of sequence features
696 * hash of feature types and colours
697 * @return features file contents
699 public String printJalviewFormat(SequenceI[] sequences,
700 Map<String, Object> visible)
702 return printJalviewFormat(sequences, visible, true, true);
706 * generate a features file for seqs with colours from visible (if any)
711 * hash of Colours for each feature type
713 * when true only feature types in 'visible' will be output
715 * indicates if non-positional features should be output (regardless
717 * @return features file contents
// Writes a Jalview features file: first the colour definition of each type
// in 'visible' (only when visOnly is set and a map was supplied), then the
// features themselves, processed one feature group at a time with
// STARTGROUP/ENDGROUP markers. Returns the text "No Features Visible" when
// there is nothing to output.
// NOTE(review): short lines (braces, else/do, continue, brief appends) are
// not visible in this view; comments describe the visible statements only.
719 public String printJalviewFormat(SequenceI[] sequences,
720 Map<String, Object> visible, boolean visOnly, boolean nonpos)
722 StringBuilder out = new StringBuilder(256);
723 boolean featuresGen = false;
724 if (visOnly && !nonpos && (visible == null || visible.size() < 1))
726 // no point continuing.
727 return "No Features Visible";
730 if (visible != null && visOnly)
732 // write feature colours only if we're given them and we are generating
734 // TODO: decide if feature links should also be written here ?
735 Iterator<String> en = visible.keySet().iterator();
736 String featureType, color;
739 featureType = en.next().toString();
// graduated colours are written as a '|'-separated descriptor
741 if (visible.get(featureType) instanceof GraduatedColor)
743 GraduatedColor gc = (GraduatedColor) visible.get(featureType);
744 color = (gc.isColourByLabel() ? "label|" : "")
745 + Format.getHexString(gc.getMinColor()) + "|"
746 + Format.getHexString(gc.getMaxColor())
747 + (gc.isAutoScale() ? "|" : "|abso|") + gc.getMin() + "|"
749 if (gc.getThreshType() != AnnotationColourGradient.NO_THRESHOLD)
751 if (gc.getThreshType() == AnnotationColourGradient.BELOW_THRESHOLD)
757 if (gc.getThreshType() != AnnotationColourGradient.ABOVE_THRESHOLD)
759 System.err.println("WARNING: Unsupported threshold type ("
760 + gc.getThreshType() + ") : Assuming 'above'");
765 color += "|" + gc.getThresh();
// plain colours are written as a hex string
772 else if (visible.get(featureType) instanceof Color)
774 color = Format.getHexString((Color) visible.get(featureType));
778 // legacy support for integer objects containing colour triplet values
779 color = Format.getHexString(new Color(Integer.parseInt(visible
780 .get(featureType).toString())));
782 out.append(featureType);
788 // Work out which groups are both present and visible
789 List<String> groups = new ArrayList<String>();
791 boolean isnonpos = false;
793 SequenceFeature[] features;
794 for (int i = 0; i < sequences.length; i++)
796 features = sequences[i].getSequenceFeatures();
797 if (features != null)
799 for (int j = 0; j < features.length; j++)
// a feature with begin == 0 and end == 0 is treated as non-positional
801 isnonpos = features[j].begin == 0 && features[j].end == 0;
// NOTE(review): 'visible' is dereferenced here without a null check; the
// early return above only covers the !nonpos case - confirm callers never
// pass visible == null together with visOnly == true and nonpos == true.
802 if ((!nonpos && isnonpos)
803 || (!isnonpos && visOnly && !visible
804 .containsKey(features[j].type)))
809 if (features[j].featureGroup != null
810 && !groups.contains(features[j].featureGroup))
812 groups.add(features[j].featureGroup);
821 if (groups.size() > 0 && groupIndex < groups.size())
823 group = groups.get(groupIndex);
825 out.append("STARTGROUP").append(TAB);
834 for (int i = 0; i < sequences.length; i++)
836 features = sequences[i].getSequenceFeatures();
837 if (features != null)
839 for (int j = 0; j < features.length; j++)
841 isnonpos = features[j].begin == 0 && features[j].end == 0;
842 if ((!nonpos && isnonpos)
843 || (!isnonpos && visOnly && !visible
844 .containsKey(features[j].type)))
846 // skip if feature is nonpos and we ignore them or if we only
847 // output visible and it isn't non-pos and it's not visible
852 && (features[j].featureGroup == null || !features[j].featureGroup
858 if (group == null && features[j].featureGroup != null)
862 // we have features to output
// with no description, the feature type is written in the description column
864 if (features[j].description == null
865 || features[j].description.equals(""))
867 out.append(features[j].type).append(TAB);
// a description that carries links but no <html> tag gets wrapped in one
871 if (features[j].links != null
872 && features[j].getDescription().indexOf("<html>") == -1)
874 out.append("<html>");
877 out.append(features[j].description + " ");
878 if (features[j].links != null)
880 for (int l = 0; l < features[j].links.size(); l++)
// each stored link has the form label|href
882 String label = features[j].links.elementAt(l).toString();
883 String href = label.substring(label.indexOf("|") + 1);
884 label = label.substring(0, label.indexOf("|"));
// only add an anchor when the href is not already part of the description
886 if (features[j].description.indexOf(href) == -1)
888 out.append("<a href=\"" + href + "\">" + label + "</a>");
892 if (features[j].getDescription().indexOf("</html>") == -1)
894 out.append("</html>");
900 out.append(sequences[i].getName());
// -1 is written in the sequence index column
901 out.append("\t-1\t");
902 out.append(features[j].begin);
904 out.append(features[j].end);
906 out.append(features[j].type);
// the score column is only written when a score is set (not NaN)
907 if (!Float.isNaN(features[j].score))
910 out.append(features[j].score);
919 out.append("ENDGROUP").append(TAB);
// the "+ 1" gives one additional pass beyond the named groups
929 } while (groupIndex < groups.size() + 1);
933 return "No Features Visible";
936 return out.toString();
940 * Parse method that is called when a GFF file is dragged to the desktop
// Body of the parse method used when a GFF file is dropped on the desktop
// (its declaration line is not visible in this view). Chooses the dataset to
// annotate from the current viewport's alignment, or an empty Alignment,
// then parses into it with relaxed id matching and no colour map or HTML
// removal, and finally publishes the dataset's sequences through setSeqs.
// NOTE(review): the conditions selecting between the dataset assignments
// are only partly visible - confirm against the full file.
945 AlignViewportI av = getViewport();
948 if (av.getAlignment() != null)
950 dataset = av.getAlignment().getDataset();
954 // working in the applet context ?
955 dataset = av.getAlignment();
960 dataset = new Alignment(new SequenceI[] {});
963 boolean parseResult = parse(dataset, null, false, true);
966 // pass error up somehow
970 // update viewport with the dataset data ?
974 setSeqs(dataset.getSequencesArray());
979 * Implementation of unused abstract method
981 * @return error message
// Not used for feature output: returns a fixed hint naming the two methods
// that produce feature text instead.
984 public String print()
986 return "Use printGffFormat() or printJalviewFormat()";
990 * Returns features output in GFF2 format, including hidden and non-positional
994 * the sequences whose features are to be output
996 * a map whose keys are the type names of visible features
999 public String printGffFormat(SequenceI[] sequences, Map<String, Object> visible)
1001 return printGffFormat(sequences, visible, true, true);
1005 * Returns features output in GFF2 format
1008 * the sequences whose features are to be output
1010 * a map whose keys are the type names of visible features
1011 * @param outputVisibleOnly
1012 * @param includeNonPositionalFeatures
1015 public String printGffFormat(SequenceI[] sequences, Map<String, Object> visible, boolean outputVisibleOnly,
1016 boolean includeNonPositionalFeatures)
1018 StringBuilder out = new StringBuilder(256);
1019 out.append(String.format("%s %d\n", GFF_VERSION, gffVersion));
1022 for (SequenceI seq : sequences)
1024 SequenceFeature[] features = seq.getSequenceFeatures();
1025 if (features != null)
1027 for (SequenceFeature sf : features)
1029 isnonpos = sf.begin == 0 && sf.end == 0;
1030 if (!includeNonPositionalFeatures && isnonpos)
1033 * ignore non-positional features if not wanted
1037 // TODO why the test !isnonpos here?
1038 // what about not visible non-positional features?
1039 if (!isnonpos && outputVisibleOnly
1040 && !visible.containsKey(sf.type))
1043 * ignore not visible features if not wanted
1048 source = sf.featureGroup;
1051 source = sf.getDescription();
1054 out.append(seq.getName());
1058 out.append(sf.type);
1060 out.append(sf.begin);
1064 out.append(sf.score);
1067 out.append(sf.getValue(STRAND, "."));
1070 out.append(sf.getValue(FRAME, "."));
1072 // miscellaneous key-values (GFF column 9)
1073 String attributes = (String) sf.getValue(ATTRIBUTES);
1074 if (attributes != null)
1076 out.append(TAB).append(attributes);
1079 out.append(newline);
1084 return out.toString();
1088 * Returns a mapping given list of one or more Align descriptors (exonerate
1091 * @param alignedRegions
1092 * a list of "Align fromStart toStart fromCount"
1093 * @param mapIsFromCdna
1094 * if true, 'from' is dna, else 'from' is protein
1096 * either 1 (forward) or -1 (reverse)
1098 * @throws IOException
1100 protected MapList constructCodonMappingFromAlign(
1101 List<String> alignedRegions, boolean mapIsFromCdna, int strand)
1106 throw new IOException(
1107 "Invalid strand for a codon mapping (cannot be 0)");
1109 int regions = alignedRegions.size();
1110 // arrays to hold [start, end] for each aligned region
1111 int[] fromRanges = new int[regions * 2]; // from dna
1112 int[] toRanges = new int[regions * 2]; // to protein
1113 int fromRangesIndex = 0;
1114 int toRangesIndex = 0;
1116 for (String range : alignedRegions)
1119 * Align mapFromStart mapToStart mapFromCount
1120 * e.g. if mapIsFromCdna
1121 * Align 11270 143 120
1123 * 120 bases from pos 11270 align to pos 143 in peptide
1124 * if !mapIsFromCdna this would instead be
1125 * Align 143 11270 40
1127 String[] tokens = range.split(" ");
1128 if (tokens.length != 3)
1130 throw new IOException("Wrong number of fields for Align");
1137 fromStart = Integer.parseInt(tokens[0]);
1138 toStart = Integer.parseInt(tokens[1]);
1139 fromCount = Integer.parseInt(tokens[2]);
1140 } catch (NumberFormatException nfe)
1142 throw new IOException("Invalid number in Align field: "
1143 + nfe.getMessage());
1147 * Jalview always models from dna to protein, so adjust values if the
1148 * GFF mapping is from protein to dna
1153 int temp = fromStart;
1154 fromStart = toStart;
1157 fromRanges[fromRangesIndex++] = fromStart;
1158 fromRanges[fromRangesIndex++] = fromStart + strand * (fromCount - 1);
1161 * If a codon has an intron gap, there will be contiguous 'toRanges';
1162 * this is handled for us by the MapList constructor.
1163 * (It is not clear that exonerate ever generates this case)
1165 toRanges[toRangesIndex++] = toStart;
1166 toRanges[toRangesIndex++] = toStart + (fromCount - 1) / 3;
1169 return new MapList(fromRanges, toRanges, 3, 1);
1173 * Parse a GFF format feature. This may include creating a 'dummy' sequence
1174 * for the feature or its mapped sequence
1178 * @param relaxedIdMatching
// Parses one GFF feature line (at least 8 tab-delimited columns), locating
// the referenced sequence through findName (which may create a placeholder),
// building a SequenceFeature from the columns and handing it to
// processOrAddSeqFeature. When that returns true the feature is also copied
// to the other sequences returned by alignment.findName for the same id.
// NOTE(review): short lines (braces, else/try, returns, brief assignments)
// are not visible in this view.
1182 protected SequenceI parseGffFeature(StringTokenizer st,
1183 AlignmentI alignment, boolean relaxedIdMatching,
1184 List<SequenceI> newseqs)
1188 * GFF: seqid source type start end score strand phase [attributes]
1190 if (st.countTokens() < 8)
1193 .println("Ignoring GFF feature line with unexpected number of columns ("
1194 + st.countTokens() + ")");
1197 String seqId = st.nextToken();
1200 * locate referenced sequence in alignment _or_
1201 * as a forward reference (SequenceDummy)
1203 seq = findName(alignment, newseqs, relaxedIdMatching, seqId);
// column 2 is read as the description
1205 String desc = st.nextToken();
1206 String group = null;
// a value with no space in it is handled specially (see the comment below)
1207 if (desc.indexOf(' ') == -1)
1209 // could also be a source term rather than description line
1212 String ft = st.nextToken();
1213 int startPos = StringUtils.parseInt(st.nextToken());
1214 int endPos = StringUtils.parseInt(st.nextToken());
1215 // TODO: decide if non positional feature assertion for input data
1216 // where end==0 is generally valid
1219 // treat as non-positional feature, regardless.
1225 score = new Float(st.nextToken()).floatValue();
1226 } catch (NumberFormatException ex)
1231 SequenceFeature sf = new SequenceFeature(ft, desc, startPos,
1232 endPos, score, group);
// the remaining columns are optional: strand, frame, then column 9 attributes
1233 if (st.hasMoreTokens())
1235 sf.setValue(STRAND, st.nextToken());
1237 if (st.hasMoreTokens())
1239 sf.setValue(FRAME, st.nextToken());
1242 if (st.hasMoreTokens())
1244 processGffColumnNine(st.nextToken(), sf);
1247 if (processOrAddSeqFeature(alignment, newseqs, seq, sf,
1250 // check whether we should add the sequence feature to any other
1251 // sequences in the alignment with the same or similar
1252 while ((seq = alignment.findName(seq, seqId, true)) != null)
1254 seq.addSequenceFeature(new SequenceFeature(sf));
1261 * Process the 'column 9' data of the GFF file. This is less formally defined,
1262 * and its interpretation will vary depending on the tool that has generated
// Stores the raw column 9 text on the feature under ATTRIBUTES, then splits
// it into name/value pairs (pairs separated by ';', name and value separated
// by '=' for GFF version 3 and by a space otherwise) and stores each name
// with its joined values on the feature. A "Note" attribute also replaces
// the feature's description.
1268 protected void processGffColumnNine(String attributes, SequenceFeature sf)
1270 sf.setValue(ATTRIBUTES, attributes);
1273 * Parse attributes in column 9 and add them to the sequence feature's
1274 * 'otherData' table; use Note as a best proxy for description
1276 char[] nameValueSeparator = new char[] { gffVersion == 3 ? '=' : ' ' };
1277 Map<String, List<String>> nameValues = StringUtils.parseNameValuePairs(attributes, ";",
1278 nameValueSeparator);
1279 for (Entry<String, List<String>> attr : nameValues.entrySet())
// multiple values for one name are joined into a single delimited string
1281 String values = StringUtils.listToDelimitedString(attr.getValue(),
1283 sf.setValue(attr.getKey(), values);
1284 if (NOTE.equals(attr.getKey()))
1286 sf.setDescription(values);
1292 * After encountering ##fasta in a GFF3 file, process the remainder of the
1293 * file as FASTA sequence data. Any placeholder sequences created during
1294 * feature parsing are updated with the actual sequences.
1298 * @throws IOException
// Reads the rest of this input with a FastaFile parser. Each parsed sequence
// whose id matches one of 'newseqs' and whose match is a SequenceDummy has
// that dummy 'become' the parsed sequence, and the dummy then takes its
// place in the parsed list. All resulting sequences are added to 'align'.
// NOTE(review): the statements before the first catch are not visible in this view.
1300 protected void processAsFasta(AlignmentI align, List<SequenceI> newseqs)
1306 } catch (IOException q)
// the FASTA parser is constructed over this same reader
1309 FastaFile parser = new FastaFile(this);
1310 List<SequenceI> includedseqs = parser.getSeqs();
1311 SequenceIdMatcher smatcher = new SequenceIdMatcher(newseqs);
1312 // iterate over includedseqs, and replacing matching ones with newseqs
1313 // sequences. Generic iterator not used here because we modify includedseqs
1315 for (int p = 0, pSize = includedseqs.size(); p < pSize; p++)
1317 // search for any dummy seqs that this sequence can be used to update
1318 SequenceI dummyseq = smatcher.findIdMatch(includedseqs.get(p));
1319 if (dummyseq != null)
1321 // dummyseq was created so it could be annotated and referred to in
1322 // alignments/codon mappings
1324 SequenceI mseq = includedseqs.get(p);
1325 // mseq is the 'template' imported from the FASTA file which we'll use
1326 // to complete dummyseq
1327 if (dummyseq instanceof SequenceDummy)
1329 // probably have the pattern wrong
1330 // idea is that a flyweight proxy for a sequence ID can be created for
1331 // 1. stable reference creation
1332 // 2. addition of annotation
1333 // 3. future replacement by a real sequence
1334 // current pattern is to create SequenceDummy objects - a convenience
1335 // constructor for a Sequence.
1336 // problem is that when promoted to a real sequence, all references
1338 // to be updated somehow.
1339 ((SequenceDummy) dummyseq).become(mseq);
1340 includedseqs.set(p, dummyseq); // template is no longer needed
1344 // finally add sequences to the dataset
1345 for (SequenceI seq : includedseqs)
1347 align.addSequence(seq);
1352 * Process a ## directive
1358 * @throws IOException
// Handles a "##" directive line. The text after "##" is split on spaces into
// a pragma name and an optional value. "gff-version" sets gffVersion from
// the leading integer of the value, "species-build" is saved in gffProps,
// and "fasta" switches to FASTA processing of the rest of the file. The
// ontology pragmas are recognised but not acted on, and anything else is
// reported on stderr.
// NOTE(review): short lines (braces, else/try, returns, brief statements)
// are not visible in this view.
1360 protected void processGffPragma(String line, Map<String, String> gffProps, AlignmentI align,
1361 List<SequenceI> newseqs) throws IOException
1364 if ("###".equals(line))
1366 // close off any open 'forward references'
1370 String[] tokens = line.substring(2).split(" ");
1371 String pragma = tokens[0];
// value is null when the pragma has no argument
1372 String value = tokens.length == 1 ? null : tokens[1];
1374 if ("gff-version".equalsIgnoreCase(pragma))
1380 // value may be e.g. "3.1.2"
1381 gffVersion = Integer.parseInt(value.split("\\.")[0]);
1382 } catch (NumberFormatException e)
1388 else if ("feature-ontology".equalsIgnoreCase(pragma))
1390 // should resolve against the specified feature ontology URI
1392 else if ("attribute-ontology".equalsIgnoreCase(pragma))
1394 // URI of attribute ontology - not currently used in GFF3
1396 else if ("source-ontology".equalsIgnoreCase(pragma))
1398 // URI of source ontology - not currently used in GFF3
1400 else if ("species-build".equalsIgnoreCase(pragma))
1402 // save URI of specific NCBI taxon version of annotations
1403 gffProps.put("species-build", value);
1405 else if ("fasta".equalsIgnoreCase(pragma))
1407 // process the rest of the file as a fasta file and replace any dummy
1409 processAsFasta(align, newseqs);
1413 System.err.println("Ignoring unknown pragma: " + line);
1418 * Processes the 'Query' (or 'Target') and 'Align' properties associated with
1419 * an exonerate GFF similarity feature; these properties define the mapping of
1420 * the annotated feature (e.g. 'exon') to a related sequence.
1427 * @param relaxedIdMatching
1428 * @throws IOException
// Builds a codon mapping between 'seq' and the sequence named by the Query
// (or, failing that, Target) attribute of an exonerate similarity feature,
// using the Align attributes for the mapped ranges, and adds the resulting
// AlignedCodonFrame to 'align'. Features whose model is rejected by
// validateExonerateModel are not processed.
// NOTE(review): short lines (braces, else, returns, the throws clause) are
// not visible in this view.
1430 public void processGffSimilarity(Map<String, List<String>> set, SequenceI seq,
1431 SequenceFeature sf, AlignmentI align, List<SequenceI> newseqs, boolean relaxedIdMatching)
1434 if (!validateExonerateModel(sf))
1439 int strand = sf.getStrand();
1442 * exonerate (protein2dna or protein2genome) may be run with
1443 * --showquerygff outputs
1444 * Target <dnaseqid> ; Align proteinStartPos dnaStartPos peptideCount
1445 * --showtargetgff outputs
1446 * Query <proteinseqid> ; Align dnaStartPos proteinStartPos nucleotideCount
1447 * where the Align spec may repeat
// assume a Query attribute (mapping from dna) first; Target is the fallback
1449 boolean mapIsFromCdna = true;
1450 List<String> mapTo = set.get(QUERY);
1453 mapTo = set.get(TARGET);
1454 mapIsFromCdna = false;
1456 if (mapTo == null || mapTo.size() != 1)
1458 throw new IOException(
1459 "Expecting exactly one sequence in Query field (got " + mapTo
1464 * locate the mapped sequence in the alignment or 'new' (GFF file) sequences;
1466 SequenceI mappedSequence = findName(align, newseqs, relaxedIdMatching,
1469 * Process the Align maps and create cdna/protein maps;
1470 * ideally, the query sequences are in the alignment, but maybe not...
1472 AlignedCodonFrame alco = new AlignedCodonFrame();
1473 MapList codonmapping = constructCodonMappingFromAlign(set.get(ALIGN),
1474 mapIsFromCdna, strand);
1477 * Jalview always maps from dna to protein
// the argument order of addMap depends on which of the two sequences is the dna
1481 alco.addMap(seq, mappedSequence, codonmapping);
1485 alco.addMap(mappedSequence, seq, codonmapping);
1487 align.addCodonFrame(alco);
1491 * Returns true if the exonerate model (saved from column 2 of the GFF as the
1492 * SequenceFeature's group) is one that we are willing to process, else false
// Checks the feature's group (the exonerate model name) and accepts only
// models containing "protein2dna" or "protein2genome"; other models are
// reported with the message below.
// NOTE(review): the first half of the condition and the return statements
// are not visible in this view.
1496 protected boolean validateExonerateModel(SequenceFeature sf)
1499 * we don't handle protein-to-protein or dna-to-dna alignment here
1501 String source = sf.getFeatureGroup();
1503 || (!source.contains("protein2dna") && !source
1504 .contains("protein2genome")))
1507 .println("I only accept protein2dna or protein2genome but found "
1515 * take a sequence feature and examine its attributes to decide how it should
1516 * be added to a sequence
1519 * - the destination sequence constructed or discovered in the
1522 * - the base feature with ATTRIBUTES property containing any
1523 * additional attributes
1525 * - true if we are processing a GFF annotation file
1526 * @return true if sf was actually added to the sequence, false if it was
1527 * processed in another way
// Decides how a parsed GFF feature is attached to 'seq'. The stored
// ATTRIBUTES text is split on tabs and each part parsed into name/value
// pairs; for features of type "similarity" these pairs are passed to
// processGffSimilarity, whose IOExceptions are printed to stderr rather than
// propagated. Otherwise the feature is added to the sequence.
// NOTE(review): short lines (braces, else/try, the null check on attr, the
// return and any updates to addFeature) are not visible in this view.
1529 public boolean processOrAddSeqFeature(AlignmentI align, List<SequenceI> newseqs,
1530 SequenceI seq, SequenceFeature sf, boolean relaxedIdMatching)
1532 String attr = (String) sf.getValue(ATTRIBUTES);
1533 boolean addFeature = true;
1536 for (String attset : attr.split(TAB))
// pairs are ';'-separated; a name is split from its value on ' ' or '-'
1538 Map<String, List<String>> set = StringUtils.parseNameValuePairs(
1539 attset, ";", new char[] { ' ', '-' });
1541 if (SIMILARITY.equals(sf.getType()))
1546 processGffSimilarity(set, seq, sf, align, newseqs,
1548 } catch (IOException ivfe)
1550 System.err.println(ivfe);
1557 seq.addSequenceFeature(sf);