2 * Jalview - A Sequence Alignment Editor and Viewer ($$Version-Rel$$)
3 * Copyright (C) $$Year-Rel$$ The Jalview Authors
5 * This file is part of Jalview.
7 * Jalview is free software: you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License
9 * as published by the Free Software Foundation, either version 3
10 * of the License, or (at your option) any later version.
12 * Jalview is distributed in the hope that it will be useful, but
13 * WITHOUT ANY WARRANTY; without even the implied warranty
14 * of MERCHANTABILITY or FITNESS FOR A PARTICULAR
15 * PURPOSE. See the GNU General Public License for more details.
17 * You should have received a copy of the GNU General Public License
18 * along with Jalview. If not, see <http://www.gnu.org/licenses/>.
19 * The Jalview Authors are detailed in the 'AUTHORS' file.
23 import jalview.analysis.AlignmentUtils;
24 import jalview.analysis.SequenceIdMatcher;
25 import jalview.api.AlignViewportI;
26 import jalview.api.FeatureColourI;
27 import jalview.api.FeaturesSourceI;
28 import jalview.datamodel.AlignedCodonFrame;
29 import jalview.datamodel.Alignment;
30 import jalview.datamodel.AlignmentI;
31 import jalview.datamodel.SequenceDummy;
32 import jalview.datamodel.SequenceFeature;
33 import jalview.datamodel.SequenceI;
34 import jalview.io.gff.GffHelperBase;
35 import jalview.io.gff.GffHelperFactory;
36 import jalview.io.gff.GffHelperI;
37 import jalview.schemes.FeatureColour;
38 import jalview.schemes.UserColourScheme;
39 import jalview.util.Format;
40 import jalview.util.MapList;
41 import jalview.util.ParseHtmlBodyAndLinks;
42 import jalview.util.StringUtils;
44 import java.io.IOException;
45 import java.util.ArrayList;
46 import java.util.Arrays;
47 import java.util.HashMap;
48 import java.util.Iterator;
49 import java.util.List;
51 import java.util.Map.Entry;
52 import java.util.StringTokenizer;
55 * Parses and writes features files, which may be in Jalview, GFF2 or GFF3
56 * format. These are tab-delimited formats but with differences in the use of
59 * A Jalview feature file may define feature colours and then declare that the
60 * remainder of the file is in GFF format with the line 'GFF'.
62 * GFF3 files may include alignment mappings for features, which Jalview will
63 * attempt to model, and may include sequence data following a ##FASTA line.
70 public class FeaturesFile extends AlignFile implements FeaturesSourceI
72 private static final String ID_NOT_SPECIFIED = "ID_NOT_SPECIFIED"; // compared against the sequence id column in parseJalviewFeature
74 private static final String NOTE = "Note"; // attribute key whose value processGffColumnNine copies into the feature description
76 protected static final String TAB = "\t"; // column separator used when writing features
78 protected static final String GFF_VERSION = "##gff-version"; // pragma text written as the first line by printGffFormat
80 private AlignmentI lastmatchedAl = null; // alignment the cached matcher was last built for (see findSequence)
82 private SequenceIdMatcher matcher = null; // cached id matcher used for relaxed id matching in findSequence
84 protected AlignmentI dataset; // assigned in parse(); its codon frames are copied by addProperties
86 protected int gffVersion; // assigned from the ##gff-version pragma in processGffPragma
89 * Creates a new FeaturesFile object.
96 * Constructor which does not parse the file immediately
100 * @throws IOException
102 public FeaturesFile(String inFile, String type) throws IOException
// passes false for the superclass's parse-immediately flag, so construction does not parse the file
104 super(false, inFile, type);
109 * @throws IOException
111 public FeaturesFile(FileParse source) throws IOException
117 * Constructor that optionally parses the file immediately
119 * @param parseImmediately
122 * @throws IOException
124 public FeaturesFile(boolean parseImmediately, String inFile, String type)
// forwards all three arguments unchanged to the superclass constructor
127 super(parseImmediately, inFile, type);
131 * Parse GFF or sequence features file using case-independent matching,
135 * - alignment/dataset containing sequences that are to be annotated
137 * - hashtable to store feature colour definitions
139 * - process html strings into plain text
140 * @return true if features were added
142 public boolean parse(AlignmentI align,
143 Map<String, FeatureColourI> colours,
// convenience overload: delegates to the four-argument parse with relaxed id matching switched off
146 return parse(align, colours, removeHTML, false);
150 * Extends the default addProperties by also adding peptide-to-cDNA mappings
151 * (if any) derived while parsing a GFF file
154 public void addProperties(AlignmentI al)
156 super.addProperties(al);
// nothing further to do unless the dataset field is set and carries codon-frame mappings
157 if (dataset != null && dataset.getCodonFrames() != null)
// mappings go onto al's dataset when it has one, otherwise onto al itself
159 AlignmentI ds = (al.getDataset() == null) ? al : al.getDataset();
160 for (AlignedCodonFrame codons : dataset.getCodonFrames())
162 ds.addCodonFrame(codons);
168 * Parse GFF or Jalview format sequence features file
171 * - alignment/dataset containing sequences that are to be annotated
173 * - hashtable to store feature colour definitions
175 * - process html strings into plain text
176 * @param relaxedIdmatching
177 * - when true, ID matches to compound sequence IDs are allowed
178 * @return true if features were added
180 public boolean parse(AlignmentI align,
181 Map<String, FeatureColourI> colours,
182 boolean removeHTML, boolean relaxedIdmatching)
// Reads the input one line at a time via nextLine() and dispatches each line to
// processGffPragma, parseFeatureColour, parseJalviewFeature or parseGff.
184 Map<String, String> gffProps = new HashMap<String, String>();
186 * keep track of any sequences we try to create from the data
188 List<SequenceI> newseqs = new ArrayList<SequenceI>();
194 String featureGroup = null;
196 while ((line = nextLine()) != null)
198 // skip comments/process pragmas
199 if (line.length() == 0 || line.startsWith("#"))
// NOTE(review): toLowerCase() cannot change the outcome of a "##" prefix test - looks redundant
201 if (line.toLowerCase().startsWith("##"))
203 processGffPragma(line, gffProps, align, newseqs);
208 gffColumns = line.split("\\t"); // tab as regex
209 if (gffColumns.length == 1)
// a line consisting only of the word GFF (any case, surrounding whitespace ignored)
211 if (line.trim().equalsIgnoreCase("GFF"))
214 * Jalview features file with appended GFF
215 * assume GFF2 (though it may declare ##gff-version 3)
222 if (gffColumns.length > 1 && gffColumns.length < 4)
225 * if 2 or 3 tokens, we anticipate either 'startgroup', 'endgroup' or
226 * a feature type colour specification
228 String ft = gffColumns[0];
229 if (ft.equalsIgnoreCase("startgroup"))
// the group name is the second column; it is passed to parseJalviewFeature for later feature lines
231 featureGroup = gffColumns[1];
233 else if (ft.equalsIgnoreCase("endgroup"))
235 // We should check whether this is the current group,
236 // but at present there's no way of showing more than 1 group
241 parseFeatureColour(line, ft, gffColumns, colours);
247 * if not a comment, GFF pragma, startgroup, endgroup or feature
248 * colour specification, that just leaves a feature details line
249 * in either Jalview or GFF format
253 parseJalviewFeature(line, gffColumns, align, colours, removeHTML,
254 relaxedIdmatching, featureGroup);
258 parseGff(gffColumns, align, relaxedIdmatching, newseqs);
262 } catch (Exception ex)
264 // should report somewhere useful for UI if necessary
// the message is appended to any existing warningMessage rather than replacing it
265 warningMessage = ((warningMessage == null) ? "" : warningMessage)
266 + "Parsing error at\n" + line;
267 System.out.println("Error parsing feature file: " + ex + "\n" + line);
268 ex.printStackTrace(System.err);
274 * experimental - add any dummy sequences with features to the alignment
275 * - we need them for Ensembl feature extraction - though maybe not otherwise
277 for (SequenceI newseq : newseqs)
// placeholder sequences that never received a feature are not added
279 if (newseq.getSequenceFeatures() != null)
281 align.addSequence(newseq);
288 * Try to parse a Jalview format feature specification and add it as a
289 * sequence feature to any matching sequences in the alignment. Returns true
290 * if successful (a feature was added), or false if not.
295 * @param featureColours
297 * @param relaxedIdmatching
298 * @param featureGroup
300 protected boolean parseJalviewFeature(String line, String[] gffColumns,
301 AlignmentI alignment, Map<String, FeatureColourI> featureColours,
302 boolean removeHTML, boolean relaxedIdMatching, String featureGroup)
305 * tokens: description seqid seqIndex start end type [score]
// the six leading columns are mandatory; the score column is optional
307 if (gffColumns.length < 6)
309 System.err.println("Ignoring feature line '" + line
310 + "' with too few columns (" + gffColumns.length + ")");
313 String desc = gffColumns[0];
314 String seqId = gffColumns[1];
// NOTE(review): the same lookup is repeated with identical arguments just below whenever
// seqId is not ID_NOT_SPECIFIED - this first call looks redundant; confirm before removing.
// newseqs is passed as null, so findSequence cannot create a placeholder sequence here.
315 SequenceI seq = findSequence(seqId, alignment, null, relaxedIdMatching);
317 if (!ID_NOT_SPECIFIED.equals(seqId))
319 seq = findSequence(seqId, alignment, null, relaxedIdMatching);
// third column: position of the sequence within the alignment
325 String seqIndex = gffColumns[2];
328 int idx = Integer.parseInt(seqIndex);
329 seq = alignment.getSequenceAt(idx);
330 } catch (NumberFormatException ex)
332 System.err.println("Invalid sequence index: " + seqIndex);
338 System.out.println("Sequence not found: " + line);
342 int startPos = Integer.parseInt(gffColumns[3]);
343 int endPos = Integer.parseInt(gffColumns[4]);
345 String ft = gffColumns[5];
347 if (!featureColours.containsKey(ft))
350 * Perhaps an old style groups file with no colours -
351 * synthesize a colour from the feature type
353 UserColourScheme ucs = new UserColourScheme(ft);
354 featureColours.put(ft, new FeatureColour(ucs.findColour('A')));
// the third constructor argument is passed as an empty string here
356 SequenceFeature sf = new SequenceFeature(ft, desc, "", startPos,
357 endPos, featureGroup);
358 if (gffColumns.length > 6)
// NaN stands for "no score"; the writers in this class test Float.isNaN before emitting one
360 float score = Float.NaN;
// NOTE(review): new Float(String) is deprecated since Java 9; Float.parseFloat yields the same value
363 score = new Float(gffColumns[6]).floatValue();
364 // update colourgradient bounds if allowed to
365 } catch (NumberFormatException ex)
372 parseDescriptionHTML(sf, removeHTML);
374 seq.addSequenceFeature(sf);
// presumably findName(seq, seqId, false) returns the next sequence after seq with this id - confirm;
// each further match receives its own copy of the feature
377 && (seq = alignment.findName(seq, seqId, false)) != null)
379 seq.addSequenceFeature(new SequenceFeature(sf));
385 * Process a feature type colour specification
388 * the current input line (for error messages only)
390 * the first token on the line
392 * holds tokens on the line
394 * map to which to add derived colour specification
396 protected void parseFeatureColour(String line, String featureType,
397 String[] gffColumns, Map<String, FeatureColourI> colours)
399 FeatureColourI colour = null;
// the colour descriptor is the second column of the line
400 String colscheme = gffColumns[1];
// a '|' anywhere in the descriptor, or the bare word "label", selects the graduated-colour parser
401 if (colscheme.indexOf("|") > -1
402 || colscheme.trim().equalsIgnoreCase("label"))
404 colour = parseGraduatedColourScheme(line, colscheme);
// otherwise a simple colour is derived from the descriptor via UserColourScheme
408 UserColourScheme ucs = new UserColourScheme(colscheme);
409 colour = new FeatureColour(ucs.findColour('A'));
413 colours.put(featureType, colour);
418 * Parse a Jalview graduated colour descriptor
421 * @param colourDescriptor
424 protected FeatureColourI parseGraduatedColourScheme(String line,
425 String colourDescriptor)
427 // Parse '|' separated graduated colourscheme fields:
428 // [label|][mincolour|maxcolour|[absolute|]minvalue|maxvalue|thresholdtype|thresholdvalue]
429 // can either provide 'label' only, first is optional, next two
430 // colors are required (but may be
431 // left blank), next is optional, next two min/max are required.
432 // first is either 'label'
433 // first/second and third are both hexadecimal or word equivalent
435 // next two are values parsed as floats.
436 // fifth is either 'above','below', or 'none'.
437 // sixth is a float value and only required when fifth is either
438 // 'above' or 'below'.
// returnDelims = true: each '|' is itself returned as a token, which is why the code
// below tests for "|" (an empty field) and explicitly skips separator tokens
439 StringTokenizer gcol = new StringTokenizer(colourDescriptor, "|", true);
// NOTE(review): Float.MIN_VALUE is the smallest positive float, not the most negative value -
// confirm this is the intended default lower bound
441 float min = Float.MIN_VALUE, max = Float.MAX_VALUE;
442 boolean labelCol = false;
444 String mincol = gcol.nextToken();
448 .println("Expected either 'label' or a colour specification in the line: "
452 String maxcol = null;
// true when the first token starts with "label" (case-insensitive)
453 if (mincol.toLowerCase().indexOf("label") == 0)
456 mincol = (gcol.hasMoreTokens() ? gcol.nextToken() : null); // skip '|'
457 mincol = (gcol.hasMoreTokens() ? gcol.nextToken() : null);
459 String abso = null, minval, maxval;
462 // at least four more tokens
// a "|" token in the colour position means the min colour field was left blank
463 if (mincol.equals("|"))
469 gcol.nextToken(); // skip next '|'
471 // continue parsing rest of line
472 maxcol = gcol.nextToken();
// likewise, a "|" token here means the max colour field was left blank
473 if (maxcol.equals("|"))
479 gcol.nextToken(); // skip next '|'
481 abso = gcol.nextToken();
482 gcol.nextToken(); // skip next '|'
// tests whether the token starts with "abso" (case-insensitive), i.e. is the optional 'absolute' flag
483 if (abso.toLowerCase().indexOf("abso") != 0)
490 minval = gcol.nextToken();
491 gcol.nextToken(); // skip next '|'
493 maxval = gcol.nextToken();
494 if (gcol.hasMoreTokens())
496 gcol.nextToken(); // skip next '|'
// an empty minimum field leaves the default value of min in place
500 if (minval.length() > 0)
502 min = Float.valueOf(minval);
504 } catch (Exception e)
507 .println("Couldn't parse the minimum value for graduated colour for type ("
509 + ") - did you misspell 'auto' for the optional automatic colour switch ?");
// an empty maximum field leaves the default value of max in place
514 if (maxval.length() > 0)
516 max = Float.valueOf(maxval);
518 } catch (Exception e)
521 .println("Couldn't parse the maximum value for graduated colour for type ("
522 + colourDescriptor + ")");
528 // add in some dummy min/max colours for the label-only
534 FeatureColourI colour = null;
// both end colours are resolved through UserColourScheme, as for simple colours
537 colour = new FeatureColour(
538 new UserColourScheme(mincol).findColour('A'),
539 new UserColourScheme(maxcol).findColour('A'), min, max);
540 } catch (Exception e)
542 System.err.println("Couldn't parse the graduated colour scheme ("
543 + colourDescriptor + ")");
548 colour.setColourByLabel(labelCol);
// auto-scaling is switched on exactly when abso is null at this point
549 colour.setAutoScaled(abso == null);
550 // add in any additional parameters
551 String ttype = null, tval = null;
552 boolean hasThreshold = false;
553 if (gcol.hasMoreTokens())
555 // threshold type and possibly a threshold value
556 ttype = gcol.nextToken();
557 if (ttype.toLowerCase().startsWith("below"))
559 colour.setBelowThreshold(true);
562 else if (ttype.toLowerCase().startsWith("above"))
564 colour.setAboveThreshold(true);
// any type starting with "no" (e.g. 'none') is accepted silently; anything else is reported
569 if (!ttype.toLowerCase().startsWith("no"))
571 System.err.println("Ignoring unrecognised threshold type : "
581 tval = gcol.nextToken();
// NOTE(review): new Float(String) is deprecated since Java 9; Float.parseFloat yields the same value
582 colour.setThreshold(new Float(tval).floatValue());
583 } catch (Exception e)
585 System.err.println("Couldn't parse threshold value as a float: ("
590 // parse the thresh-is-min token ?
// any tokens still unread are reported on stderr and otherwise ignored
591 if (gcol.hasMoreTokens())
594 .println("Ignoring additional tokens in parameters in graduated colour specification\n");
595 while (gcol.hasMoreTokens())
597 System.err.println("|" + gcol.nextToken());
599 System.err.println("\n");
606 * clear any temporary handles used to speed up ID matching
608 protected void resetMatcher()
// findSequence rebuilds its SequenceIdMatcher whenever lastmatchedAl differs from the
// alignment it is given, so clearing this forces a rebuild on the next relaxed lookup
610 lastmatchedAl = null;
615 * Returns a sequence matching the given id, as follows
617 * <li>strict matching is on exact sequence name</li>
618 * <li>relaxed matching allows matching on a token within the sequence name,
620 * <li>first tries to find a match in the alignment sequences</li>
621 * <li>else tries to find a match in the new sequences already generated while
622 * parsing the features file</li>
623 * <li>else creates a new placeholder sequence, adds it to the new sequences
624 * list, and returns it</li>
630 * @param relaxedIdMatching
634 protected SequenceI findSequence(String seqId, AlignmentI align,
635 List<SequenceI> newseqs, boolean relaxedIdMatching)
637 // TODO encapsulate in SequenceIdMatcher, share the matcher
638 // with the GffHelper (removing code duplication)
639 SequenceI match = null;
640 if (relaxedIdMatching)
// the matcher is cached in a field and rebuilt only when a different alignment is passed in
642 if (lastmatchedAl != align)
644 lastmatchedAl = align;
645 matcher = new SequenceIdMatcher(align.getSequencesArray());
648 matcher.addAll(newseqs);
651 match = matcher.findIdMatch(seqId);
// strict path: look the id up in the alignment first ...
655 match = align.findName(seqId, true);
// ... then among the sequences already created during this parse, by exact name
656 if (match == null && newseqs != null)
658 for (SequenceI m : newseqs)
660 if (seqId.equals(m.getName()))
// a placeholder is created only when the caller supplied a newseqs list
668 if (match == null && newseqs != null)
670 match = new SequenceDummy(seqId);
671 if (relaxedIdMatching)
// register the placeholder with the cached matcher as well
673 matcher.addAll(Arrays.asList(new SequenceI[] { match }));
675 // add dummy sequence to the newseqs list
681 public void parseDescriptionHTML(SequenceFeature sf, boolean removeHTML)
// Runs the feature's description through ParseHtmlBodyAndLinks and iterates over the links it found.
683 if (sf.getDescription() == null)
687 ParseHtmlBodyAndLinks parsed = new ParseHtmlBodyAndLinks(
688 sf.getDescription(), removeHTML, newline);
// with removeHTML set, the description becomes the parser's non-HTML content
690 sf.description = (removeHTML) ? parsed.getNonHtmlContent()
692 for (String link : parsed.getLinks())
700 * Generates a features file for the given sequences; non-positional features are included by default.
703 * source of sequence features
705 * hash of feature types and colours
706 * @return features file contents
708 public String printJalviewFormat(SequenceI[] sequences,
709 Map<String, FeatureColourI> visible)
// convenience overload: delegates to the four-argument version with both boolean flags set to true
711 return printJalviewFormat(sequences, visible, true, true);
715 * generate a features file for seqs with colours from visible (if any)
719 * @param featureColours
720 * hash of Colours for each feature type
722 * when true only feature types in 'visible' will be output
724 * indicates if non-positional features should be output (regardless
726 * @return features file contents
728 public String printJalviewFormat(SequenceI[] sequences,
729 Map<String, FeatureColourI> featureColours, boolean visOnly,
732 StringBuilder out = new StringBuilder(256);
733 boolean featuresGen = false;
// visible-only output without non-positional features and with no colours has nothing to emit
734 if (visOnly && !nonpos
735 && (featureColours == null || featureColours.size() < 1))
737 // no point continuing.
738 return "No Features Visible";
741 if (featureColours != null && visOnly)
743 // write feature colours only if we're given them and we are generating
745 // TODO: decide if feature links should also be written here ?
746 Iterator<String> en = featureColours.keySet().iterator();
747 String featureType, color;
750 featureType = en.next();
751 FeatureColourI fc = featureColours.get(featureType);
// a simple colour is written as a single hex string
752 if (fc.isSimpleColour())
754 color = Format.getHexString(fc.getColour());
// otherwise a '|'-separated graduated-colour descriptor is assembled
758 color = (fc.isColourByLabel() ? "label|" : "")
759 + Format.getHexString(fc.getMinColour()) + "|"
760 + Format.getHexString(fc.getMaxColour())
761 + (fc.isAutoScaled() ? "|" : "|abso|") + fc.getMin() + "|"
763 if (fc.isBelowThreshold())
767 else if (fc.isAboveThreshold())
772 color += "|" + fc.getThreshold();
780 // // legacy support for integer objects containing colour triplet
782 // color = Format.getHexString(new Color(Integer
783 // .parseInt(fc.toString())));
785 out.append(featureType);
791 // Work out which groups are both present and visible
792 List<String> groups = new ArrayList<String>();
794 boolean isnonpos = false;
796 SequenceFeature[] features;
797 for (int i = 0; i < sequences.length; i++)
799 features = sequences[i].getSequenceFeatures();
800 if (features != null)
802 for (int j = 0; j < features.length; j++)
// a feature with begin == 0 and end == 0 is treated as non-positional
804 isnonpos = features[j].begin == 0 && features[j].end == 0;
805 if ((!nonpos && isnonpos)
806 || (!isnonpos && visOnly && !featureColours
807 .containsKey(features[j].type)))
// record each distinct group name once
812 if (features[j].featureGroup != null
813 && !groups.contains(features[j].featureGroup))
815 groups.add(features[j].featureGroup);
824 if (groups.size() > 0 && groupIndex < groups.size())
826 group = groups.get(groupIndex);
828 out.append("STARTGROUP").append(TAB);
837 for (int i = 0; i < sequences.length; i++)
839 features = sequences[i].getSequenceFeatures();
840 if (features != null)
842 for (int j = 0; j < features.length; j++)
844 isnonpos = features[j].begin == 0 && features[j].end == 0;
845 if ((!nonpos && isnonpos)
846 || (!isnonpos && visOnly && !featureColours
847 .containsKey(features[j].type)))
849 // skip if feature is nonpos and we ignore them or if we only
850 // output visible and it isn't non-pos and it's not visible
855 && (features[j].featureGroup == null || !features[j].featureGroup
861 if (group == null && features[j].featureGroup != null)
865 // we have features to output
// with no description, the feature type is written in the description column
867 if (features[j].description == null
868 || features[j].description.equals(""))
870 out.append(features[j].type).append(TAB);
// open an <html> wrapper when links exist and the description does not already contain one
874 if (features[j].links != null
875 && features[j].getDescription().indexOf("<html>") == -1)
877 out.append("<html>");
880 out.append(features[j].description + " ");
881 if (features[j].links != null)
883 for (int l = 0; l < features[j].links.size(); l++)
// link strings are split at the first '|': text before it is the label, text after it the href
885 String label = features[j].links.elementAt(l).toString();
886 String href = label.substring(label.indexOf("|") + 1);
887 label = label.substring(0, label.indexOf("|"));
// emit an anchor only when the href is not already present in the description
889 if (features[j].description.indexOf(href) == -1)
891 out.append("<a href=\"" + href + "\">" + label + "</a>");
895 if (features[j].getDescription().indexOf("</html>") == -1)
897 out.append("</html>");
903 out.append(sequences[i].getName());
// the sequence index column is always written as -1
904 out.append("\t-1\t");
905 out.append(features[j].begin);
907 out.append(features[j].end);
909 out.append(features[j].type);
// the score is written only when it is not NaN
910 if (!Float.isNaN(features[j].score))
913 out.append(features[j].score);
922 out.append("ENDGROUP").append(TAB);
// the loop condition allows one pass more than the number of groups
932 } while (groupIndex < groups.size() + 1);
936 return "No Features Visible";
939 return out.toString();
943 * Parse method that is called when a GFF file is dragged to the desktop
948 AlignViewportI av = getViewport();
951 if (av.getAlignment() != null)
// use the viewport alignment's dataset
953 dataset = av.getAlignment().getDataset();
957 // working in the applet context ?
958 dataset = av.getAlignment();
// start from an empty alignment
963 dataset = new Alignment(new SequenceI[] {});
// arguments: colours = null, removeHTML = false, relaxedIdmatching = true
// NOTE(review): parseJalviewFeature and parseFeatureColour call containsKey/put on the colours
// map with no null check in the visible code - confirm only GFF lines can reach this path
966 boolean parseResult = parse(dataset, null, false, true);
969 // pass error up somehow
973 // update viewport with the dataset data ?
977 setSeqs(dataset.getSequencesArray());
982 * Implementation of unused abstract method
984 * @return error message
987 public String print()
// returns a fixed hint naming the two real output methods instead of producing any output
989 return "Use printGffFormat() or printJalviewFormat()";
993 * Returns features output in GFF2 format, including hidden and non-positional
997 * the sequences whose features are to be output
999 * a map whose keys are the type names of visible features
1002 public String printGffFormat(SequenceI[] sequences,
1003 Map<String, FeatureColourI> visible)
// convenience overload: outputVisibleOnly = true, includeNonPositionalFeatures = true
1005 return printGffFormat(sequences, visible, true, true);
1009 * Returns features output in GFF2 format
1012 * the sequences whose features are to be output
1013 * @param featureColours
1014 * a map whose keys are the type names of visible features
1015 * @param outputVisibleOnly
1016 * @param includeNonPositionalFeatures
1019 public String printGffFormat(SequenceI[] sequences,
1020 Map<String, FeatureColourI> featureColours,
1021 boolean outputVisibleOnly,
1022 boolean includeNonPositionalFeatures)
1024 StringBuilder out = new StringBuilder(256);
// NOTE(review): this header line ends with a hard-coded "\n" while feature rows end with the
// newline field; also gffVersion has no initialiser, so it is 0 unless parsing assigned it - confirm
1025 out.append(String.format("%s %d\n", GFF_VERSION, gffVersion));
1028 for (SequenceI seq : sequences)
1030 SequenceFeature[] features = seq.getSequenceFeatures();
1031 if (features != null)
1033 for (SequenceFeature sf : features)
// a feature with begin == 0 and end == 0 is treated as non-positional
1035 isnonpos = sf.begin == 0 && sf.end == 0;
1036 if (!includeNonPositionalFeatures && isnonpos)
1039 * ignore non-positional features if not wanted
1043 // TODO why the test !isnonpos here?
1044 // what about not visible non-positional features?
1045 if (!isnonpos && outputVisibleOnly
1046 && !featureColours.containsKey(sf.type))
1049 * ignore not visible features if not wanted
// the GFF 'source' column is taken from the feature group or from the description
1054 source = sf.featureGroup;
1057 source = sf.getDescription();
1060 out.append(seq.getName());
1064 out.append(sf.type);
1066 out.append(sf.begin);
1070 out.append(sf.score);
1073 int strand = sf.getStrand();
// strand 1 is written as "+", -1 as "-", anything else as "."
1074 out.append(strand == 1 ? "+" : (strand == -1 ? "-" : "."));
1077 String phase = sf.getPhase();
// a missing phase is written as "."
1078 out.append(phase == null ? "." : phase);
1080 // miscellaneous key-values (GFF column 9)
1081 String attributes = sf.getAttributes();
// the attributes column is omitted entirely when the feature has none
1082 if (attributes != null)
1084 out.append(TAB).append(attributes);
1087 out.append(newline);
1092 return out.toString();
1096 * Returns a mapping given list of one or more Align descriptors (exonerate
1099 * @param alignedRegions
1100 * a list of "Align fromStart toStart fromCount"
1101 * @param mapIsFromCdna
1102 * if true, 'from' is dna, else 'from' is protein
1104 * either 1 (forward) or -1 (reverse)
1106 * @throws IOException
1108 protected MapList constructCodonMappingFromAlign(
1109 List<String> alignedRegions, boolean mapIsFromCdna, int strand)
1114 throw new IOException(
1115 "Invalid strand for a codon mapping (cannot be 0)");
1117 int regions = alignedRegions.size();
1118 // arrays to hold [start, end] for each aligned region
1119 int[] fromRanges = new int[regions * 2]; // from dna
1120 int[] toRanges = new int[regions * 2]; // to protein
1121 int fromRangesIndex = 0;
1122 int toRangesIndex = 0;
1124 for (String range : alignedRegions)
1127 * Align mapFromStart mapToStart mapFromCount
1128 * e.g. if mapIsFromCdna
1129 * Align 11270 143 120
1131 * 120 bases from pos 11270 align to pos 143 in peptide
1132 * if !mapIsFromCdna this would instead be
1133 * Align 143 11270 40
// NOTE(review): the examples above start with the word "Align", yet exactly three
// space-separated tokens are required and all three are parsed as integers - so each
// entry must arrive with the keyword already removed; confirm against callers
1135 String[] tokens = range.split(" ");
1136 if (tokens.length != 3)
1138 throw new IOException("Wrong number of fields for Align");
1145 fromStart = Integer.parseInt(tokens[0]);
1146 toStart = Integer.parseInt(tokens[1]);
1147 fromCount = Integer.parseInt(tokens[2]);
1148 } catch (NumberFormatException nfe)
1150 throw new IOException("Invalid number in Align field: "
1151 + nfe.getMessage());
1155 * Jalview always models from dna to protein, so adjust values if the
1156 * GFF mapping is from protein to dna
1161 int temp = fromStart;
1162 fromStart = toStart;
// the dna range runs fromCount positions from fromStart, in the direction given by strand
1165 fromRanges[fromRangesIndex++] = fromStart;
1166 fromRanges[fromRangesIndex++] = fromStart + strand * (fromCount - 1);
1169 * If a codon has an intron gap, there will be contiguous 'toRanges';
1170 * this is handled for us by the MapList constructor.
1171 * (It is not clear that exonerate ever generates this case)
// integer division by 3: e.g. a count of 120 gives an end offset of 39, i.e. 40 residues
1173 toRanges[toRangesIndex++] = toStart;
1174 toRanges[toRangesIndex++] = toStart + (fromCount - 1) / 3;
// the trailing 3 and 1 are the from:to ratio arguments of the MapList constructor
1177 return new MapList(fromRanges, toRanges, 3, 1);
1181 * Parse a GFF format feature. This may include creating a 'dummy' sequence to
1182 * hold the feature, or for its mapped sequence, or both, to be resolved
1183 * either later in the GFF file (##FASTA section), or when the user loads
1184 * additional sequences.
1188 * @param relaxedIdMatching
1192 protected SequenceI parseGff(String[] gffColumns, AlignmentI alignment,
1193 boolean relaxedIdMatching, List<SequenceI> newseqs)
1196 * GFF: seqid source type start end score strand phase [attributes]
// NOTE(review): the column list above names eight mandatory columns, but only lines with
// fewer than five are rejected here - confirm shorter rows are handled by the helper
1198 if (gffColumns.length < 5)
1200 System.err.println("Ignoring GFF feature line with too few columns ("
1201 + gffColumns.length + ")");
1206 * locate referenced sequence in alignment _or_
1207 * as a forward or external reference (SequenceDummy)
1209 String seqId = gffColumns[0];
1210 SequenceI seq = findSequence(seqId, alignment, newseqs,
1213 SequenceFeature sf = null;
// the helper implementation is chosen by the factory from the column contents
1214 GffHelperI helper = GffHelperFactory.getHelper(gffColumns);
1219 sf = helper.processGff(seq, gffColumns, alignment, newseqs,
1223 seq.addSequenceFeature(sf);
// presumably findName(seq, seqId, true) returns the next sequence after seq with this id - confirm;
// each further match receives its own copy of the feature
1224 while ((seq = alignment.findName(seq, seqId, true)) != null)
1226 seq.addSequenceFeature(new SequenceFeature(sf));
1229 } catch (IOException e)
1231 System.err.println("GFF parsing failed with: " + e.getMessage());
1240 * Process the 'column 9' data of the GFF file. This is less formally defined,
1241 * and its interpretation will vary depending on the tool that has generated
1247 protected void processGffColumnNine(String attributes, SequenceFeature sf)
// keep the raw column text on the feature; printGffFormat writes it back out unchanged
1249 sf.setAttributes(attributes);
1252 * Parse attributes in column 9 and add them to the sequence feature's
1253 * 'otherData' table; use Note as a best proxy for description
// name and value are separated by '=' when gffVersion is 3, otherwise by a space
1255 char nameValueSeparator = gffVersion == 3 ? '=' : ' ';
1256 // TODO check we don't break GFF2 values which include commas here
1257 Map<String, List<String>> nameValues = GffHelperBase
1258 .parseNameValuePairs(attributes, ";", nameValueSeparator, ",");
1259 for (Entry<String, List<String>> attr : nameValues.entrySet())
// the values of one attribute are joined back into a single delimited string
1261 String values = StringUtils.listToDelimitedString(attr.getValue(),
1263 sf.setValue(attr.getKey(), values);
// the match on "Note" is case-sensitive
1264 if (NOTE.equals(attr.getKey()))
1266 sf.setDescription(values);
1272 * After encountering ##fasta in a GFF3 file, process the remainder of the
1273 * file as FASTA sequence data. Any placeholder sequences created during
1274 * feature parsing are updated with the actual sequences.
1278 * @throws IOException
1280 protected void processAsFasta(AlignmentI align, List<SequenceI> newseqs)
1286 } catch (IOException q)
// presumably FastaFile(this) continues reading this parser's input from its current position - confirm
1289 FastaFile parser = new FastaFile(this);
1290 List<SequenceI> includedseqs = parser.getSeqs();
// matcher over the placeholder sequences created while parsing feature lines
1292 SequenceIdMatcher smatcher = new SequenceIdMatcher(newseqs);
1295 * iterate over includedseqs, and replacing matching ones with newseqs
1296 * sequences. Generic iterator not used here because we modify
1297 * includedseqs as we go
// the size is read once; the loop only replaces elements in place via set()
1299 for (int p = 0, pSize = includedseqs.size(); p < pSize; p++)
1301 // search for any dummy seqs that this sequence can be used to update
1302 SequenceI includedSeq = includedseqs.get(p);
1303 SequenceI dummyseq = smatcher.findIdMatch(includedSeq);
1304 if (dummyseq != null && dummyseq instanceof SequenceDummy)
1306 // probably have the pattern wrong
1307 // idea is that a flyweight proxy for a sequence ID can be created for
1308 // 1. stable reference creation
1309 // 2. addition of annotation
1310 // 3. future replacement by a real sequence
1311 // current pattern is to create SequenceDummy objects - a convenience
1312 // constructor for a Sequence.
1313 // problem is that when promoted to a real sequence, all references
1314 // need to be updated somehow. We avoid that by keeping the same object.
1315 ((SequenceDummy) dummyseq).become(includedSeq);
1316 dummyseq.createDatasetSequence();
1319 * Update mappings so they are now to the dataset sequence
1321 for (AlignedCodonFrame mapping : align.getCodonFrames())
1323 mapping.updateToDataset(dummyseq);
1327 * replace parsed sequence with the realised forward reference
1329 includedseqs.set(p, dummyseq);
1332 * and remove from the newseqs list
1334 newseqs.remove(dummyseq);
1339 * finally add sequences to the dataset
1341 for (SequenceI seq : includedseqs)
1343 // experimental: mapping-based 'alignment' to query sequence
1344 AlignmentUtils.alignSequenceAs(seq, align,
1345 String.valueOf(align.getGapCharacter()), false, true);
1347 // rename sequences if GFF handler requested this
1348 // TODO a more elegant way e.g. gffHelper.postProcess(newseqs) ?
1349 SequenceFeature[] sfs = seq.getSequenceFeatures();
// only the first feature is consulted for a rename request
1352 String newName = (String) sfs[0].getValue(GffHelperI.RENAME_TOKEN);
1353 if (newName != null)
1355 seq.setName(newName);
1358 align.addSequence(seq);
1363 * Process a ## directive
1369 * @throws IOException
1371 protected void processGffPragma(String line,
1372 Map<String, String> gffProps, AlignmentI align,
1373 List<SequenceI> newseqs) throws IOException
// a line that is exactly "###" is handled separately from named pragmas
1376 if ("###".equals(line))
1378 // close off any open 'forward references'
// drop the leading "##" and split on single spaces: first token is the pragma name
1382 String[] tokens = line.substring(2).split(" ");
1383 String pragma = tokens[0];
// value is null when nothing follows the pragma name; tokens after the second are ignored
1384 String value = tokens.length == 1 ? null : tokens[1];
1386 if ("gff-version".equalsIgnoreCase(pragma))
1392 // value may be e.g. "3.1.2"
// only the part before the first '.' is used as the version
1393 gffVersion = Integer.parseInt(value.split("\\.")[0]);
1394 } catch (NumberFormatException e)
1400 else if ("sequence-region".equalsIgnoreCase(pragma))
1402 // could capture <seqid start end> if wanted here
1404 else if ("feature-ontology".equalsIgnoreCase(pragma))
1406 // should resolve against the specified feature ontology URI
1408 else if ("attribute-ontology".equalsIgnoreCase(pragma))
1410 // URI of attribute ontology - not currently used in GFF3
1412 else if ("source-ontology".equalsIgnoreCase(pragma))
1414 // URI of source ontology - not currently used in GFF3
1416 else if ("species-build".equalsIgnoreCase(pragma))
1418 // save URI of specific NCBI taxon version of annotations
1419 gffProps.put("species-build", value);
1421 else if ("fasta".equalsIgnoreCase(pragma))
1423 // process the rest of the file as a fasta file and replace any dummy
1425 processAsFasta(align, newseqs);
// any other pragma is reported on stderr and skipped
1429 System.err.println("Ignoring unknown pragma: " + line);