2 * Jalview - A Sequence Alignment Editor and Viewer ($$Version-Rel$$)
3 * Copyright (C) $$Year-Rel$$ The Jalview Authors
5 * This file is part of Jalview.
7 * Jalview is free software: you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License
9 * as published by the Free Software Foundation, either version 3
10 * of the License, or (at your option) any later version.
12 * Jalview is distributed in the hope that it will be useful, but
13 * WITHOUT ANY WARRANTY; without even the implied warranty
14 * of MERCHANTABILITY or FITNESS FOR A PARTICULAR
15 * PURPOSE. See the GNU General Public License for more details.
17 * You should have received a copy of the GNU General Public License
18 * along with Jalview. If not, see <http://www.gnu.org/licenses/>.
19 * The Jalview Authors are detailed in the 'AUTHORS' file.
23 import jalview.analysis.AlignmentUtils;
24 import jalview.analysis.Finder;
25 import jalview.analysis.SequenceIdMatcher;
26 import jalview.api.AlignViewportI;
27 import jalview.api.FeatureColourI;
28 import jalview.api.FeatureRenderer;
29 import jalview.api.FeaturesSourceI;
30 import jalview.api.FinderI;
31 import jalview.datamodel.AlignedCodonFrame;
32 import jalview.datamodel.Alignment;
33 import jalview.datamodel.AlignmentI;
34 import jalview.datamodel.MappedFeatures;
35 import jalview.datamodel.SearchResultMatchI;
36 import jalview.datamodel.SequenceDummy;
37 import jalview.datamodel.SequenceFeature;
38 import jalview.datamodel.SequenceI;
39 import jalview.datamodel.features.FeatureMatcherSet;
40 import jalview.datamodel.features.FeatureMatcherSetI;
41 import jalview.gui.Desktop;
42 import jalview.io.gff.GffHelperBase;
43 import jalview.io.gff.GffHelperFactory;
44 import jalview.io.gff.GffHelperI;
45 import jalview.schemes.FeatureColour;
46 import jalview.util.ColorUtils;
47 import jalview.util.MapList;
48 import jalview.util.ParseHtmlBodyAndLinks;
49 import jalview.util.StringUtils;
51 import java.awt.Color;
52 import java.io.IOException;
53 import java.util.ArrayList;
54 import java.util.Arrays;
55 import java.util.Collections;
56 import java.util.HashMap;
57 import java.util.LinkedHashMap;
58 import java.util.List;
60 import java.util.Map.Entry;
61 import java.util.TreeMap;
64 * Parses and writes features files, which may be in Jalview, GFF2 or GFF3
65 * format. These are tab-delimited formats but with differences in the use of
68 * A Jalview feature file may define feature colours and then declare that the
69 * remainder of the file is in GFF format with the line 'GFF'.
71 * GFF3 files may include alignment mappings for features, which Jalview will
72 * attempt to model, and may include sequence data following a ##FASTA line.
79 public class FeaturesFile extends AlignFile implements FeaturesSourceI
81 private static final String TAB_REGEX = "\\t";
83 private static final String STARTGROUP = "STARTGROUP";
85 private static final String ENDGROUP = "ENDGROUP";
87 private static final String STARTFILTERS = "STARTFILTERS";
89 private static final String ENDFILTERS = "ENDFILTERS";
91 private static final String STARTMOTIFS = "STARTMOTIFS";
93 private static final String ENDMOTIFS = "ENDMOTIFS";
95 private static final String ID_NOT_SPECIFIED = "ID_NOT_SPECIFIED";
97 private static final String NOTE = "Note";
99 protected static final String GFF_VERSION = "##gff-version";
101 private AlignmentI lastmatchedAl = null;
103 private SequenceIdMatcher matcher = null;
105 protected AlignmentI dataset;
107 protected int gffVersion;
110 * Creates a new FeaturesFile object.
112 public FeaturesFile()
117 * Constructor which does not parse the file immediately
121 * @throws IOException
123 public FeaturesFile(String file, DataSourceType paste)
126 super(false, file, paste);
131 * @throws IOException
133 public FeaturesFile(FileParse source) throws IOException
139 * Constructor that optionally parses the file immediately
141 * @param parseImmediately
144 * @throws IOException
146 public FeaturesFile(boolean parseImmediately, String file,
147 DataSourceType type) throws IOException
149 super(parseImmediately, file, type);
153 * Parse GFF or sequence features file using case-independent matching,
157 * - alignment/dataset containing sequences that are to be annotated
159 * - hashtable to store feature colour definitions
161 * - process html strings into plain text
162 * @return true if features were added
164 public boolean parse(AlignmentI align,
165 Map<String, FeatureColourI> colours, boolean removeHTML)
167 return parse(align, colours, removeHTML, false);
171 * Extends the default addProperties by also adding peptide-to-cDNA mappings
172 * (if any) derived while parsing a GFF file
175 public void addProperties(AlignmentI al)
177 super.addProperties(al);
178 if (dataset != null && dataset.getCodonFrames() != null)
180 AlignmentI ds = (al.getDataset() == null) ? al : al.getDataset();
181 for (AlignedCodonFrame codons : dataset.getCodonFrames())
183 ds.addCodonFrame(codons);
189 * Parse GFF or Jalview format sequence features file
192 * - alignment/dataset containing sequences that are to be annotated
194 * - map to store feature colour definitions
196 * - process html strings into plain text
197 * @param relaxedIdmatching
198 * - when true, ID matches to compound sequence IDs are allowed
199 * @return true if features were added
201 public boolean parse(AlignmentI align,
202 Map<String, FeatureColourI> colours, boolean removeHTML,
203 boolean relaxedIdmatching)
205 return parse(align, colours, null, removeHTML, relaxedIdmatching);
209 * Parse GFF or Jalview format sequence features file
212 * - alignment/dataset containing sequences that are to be annotated
214 * - map to store feature colour definitions
216 * - map to store feature filter definitions
218 * - process html strings into plain text
219 * @param relaxedIdmatching
220 * - when true, ID matches to compound sequence IDs are allowed
221 * @return true if features were added
223 public boolean parse(AlignmentI align,
224 Map<String, FeatureColourI> colours,
225 Map<String, FeatureMatcherSetI> filters, boolean removeHTML,
226 boolean relaxedIdmatching)
228 Map<String, String> gffProps = new HashMap<>();
230 * keep track of any sequences we try to create from the data
232 List<SequenceI> newseqs = new ArrayList<>();
238 String featureGroup = null;
240 while ((line = nextLine()) != null)
243 // skip comments/process pragmas
244 if (line.length() == 0 || line.startsWith("#"))
246 if (line.toLowerCase().startsWith("##"))
248 processGffPragma(line, gffProps, align, newseqs);
253 gffColumns = line.split(TAB_REGEX);
254 if (gffColumns.length == 1)
256 if (line.trim().equalsIgnoreCase("GFF"))
259 * Jalview features file with appended GFF
260 * assume GFF2 (though it may declare ##gff-version 3)
267 if (gffColumns.length > 0 && gffColumns.length < 4)
270 * if 2 or 3 tokens, we anticipate one of either
271 * STARTGROUP ENDGROUP STARTFILTERS STARTMOTIFS
272 * or a feature type and colour specification
274 String ft = gffColumns[0];
275 if (ft.equalsIgnoreCase(STARTFILTERS))
277 parseFilters(filters);
280 if (ft.equalsIgnoreCase(STARTMOTIFS))
282 parseMotifs(align, featureGroup);
285 if (ft.equalsIgnoreCase(STARTGROUP))
287 featureGroup = gffColumns[1];
289 else if (ft.equalsIgnoreCase(ENDGROUP))
295 String colscheme = gffColumns[1];
296 FeatureColourI colour = FeatureColour
297 .parseJalviewFeatureColour(colscheme);
300 colours.put(ft, colour);
307 * if not handled above, that just leaves a feature details line
308 * in either Jalview or GFF format
312 parseJalviewFeature(line, gffColumns, align, colours, removeHTML,
313 relaxedIdmatching, featureGroup);
317 parseGff(gffColumns, align, relaxedIdmatching, newseqs);
321 } catch (Exception ex)
323 // should report somewhere useful for UI if necessary
324 warningMessage = ((warningMessage == null) ? "" : warningMessage)
325 + "Parsing error at\n" + line;
326 System.out.println("Error parsing feature file: " + ex + "\n" + line);
327 ex.printStackTrace(System.err);
333 * experimental - add any dummy sequences with features to the alignment
334 * - we need them for Ensembl feature extraction - though maybe not otherwise
336 for (SequenceI newseq : newseqs)
338 if (newseq.getFeatures().hasFeatures())
340 align.addSequence(newseq);
347 * Reads lines up to and including the next ENDMOTIFS, and processes each one
350 * <li>searching the alignment for the supplied motif (first column)</li>
351 * <li>creating features for matches, with feature type, description and
352 * (optionally) score given in the remaining columns</li>
356 * @param featureGroup
357 * @throws IOException
359 protected void parseMotifs(AlignmentI alignment, String featureGroup)
362 FinderI finder = new Finder(alignment);
364 while ((line = nextLine()) != null)
366 if (line.toUpperCase().startsWith(ENDMOTIFS))
370 String[] tokens = line.split(TAB_REGEX);
371 if (tokens.length != 3 && tokens.length != 4)
373 System.err.println(String.format("Invalid token count %d for %s",
374 tokens.length, line));
376 String motif = tokens[0];
377 String featureType = tokens[1];
378 String description = tokens[2];
380 if (tokens.length > 3)
384 score = Float.valueOf(tokens[3]);
385 } catch (NumberFormatException e)
387 System.err.println("Invalid score in " + line);
390 finder.findAll(motif, true, false);
391 List<SearchResultMatchI> matches = finder.getSearchResults()
393 for (SearchResultMatchI match : matches)
395 SequenceFeature sf = new SequenceFeature(featureType, description,
396 match.getStart(), match.getEnd(), score, featureGroup);
397 match.getSequence().addSequenceFeature(sf);
403 * Reads input lines from STARTFILTERS to ENDFILTERS and adds a feature type
404 * filter to the map for each line parsed. After exit from this method,
405 * nextLine() should return the line after ENDFILTERS (or we are already at
406 * end of file if ENDFILTERS was missing).
409 * @throws IOException
411 protected void parseFilters(Map<String, FeatureMatcherSetI> filters)
415 while ((line = nextLine()) != null)
417 if (line.toUpperCase().startsWith(ENDFILTERS))
421 String[] tokens = line.split(TAB_REGEX);
422 if (tokens.length != 2)
424 System.err.println(String.format("Invalid token count %d for %s",
425 tokens.length, line));
429 String featureType = tokens[0];
430 FeatureMatcherSetI fm = FeatureMatcherSet.fromString(tokens[1]);
431 if (fm != null && filters != null)
433 filters.put(featureType, fm);
440 * Try to parse a Jalview format feature specification and add it as a
441 * sequence feature to any matching sequences in the alignment. Returns true
442 * if successful (a feature was added), or false if not.
447 * @param featureColours
449 * @param relaxedIdmatching
450 * @param featureGroup
452 protected boolean parseJalviewFeature(String line, String[] gffColumns,
453 AlignmentI alignment, Map<String, FeatureColourI> featureColours,
454 boolean removeHTML, boolean relaxedIdMatching,
458 * tokens: description seqid seqIndex start end type [score]
460 if (gffColumns.length < 6)
462 System.err.println("Ignoring feature line '" + line
463 + "' with too few columns (" + gffColumns.length + ")");
466 String desc = gffColumns[0];
467 String seqId = gffColumns[1];
468 SequenceI seq = findSequence(seqId, alignment, null, relaxedIdMatching);
470 if (!ID_NOT_SPECIFIED.equals(seqId))
472 seq = findSequence(seqId, alignment, null, relaxedIdMatching);
478 String seqIndex = gffColumns[2];
481 int idx = Integer.parseInt(seqIndex);
482 seq = alignment.getSequenceAt(idx);
483 } catch (NumberFormatException ex)
485 System.err.println("Invalid sequence index: " + seqIndex);
491 System.out.println("Sequence not found: " + line);
495 int startPos = Integer.parseInt(gffColumns[3]);
496 int endPos = Integer.parseInt(gffColumns[4]);
498 String ft = gffColumns[5];
500 if (!featureColours.containsKey(ft))
503 * Perhaps an old style groups file with no colours -
504 * synthesize a colour from the feature type
506 Color colour = ColorUtils.createColourFromName(ft);
507 featureColours.put(ft, new FeatureColour(colour));
509 SequenceFeature sf = null;
510 if (gffColumns.length > 6)
512 float score = Float.NaN;
515 score = Float.valueOf(gffColumns[6]).floatValue();
516 } catch (NumberFormatException ex)
518 sf = new SequenceFeature(ft, desc, startPos, endPos, featureGroup);
520 sf = new SequenceFeature(ft, desc, startPos, endPos, score,
525 sf = new SequenceFeature(ft, desc, startPos, endPos, featureGroup);
528 parseDescriptionHTML(sf, removeHTML);
530 seq.addSequenceFeature(sf);
533 && (seq = alignment.findName(seq, seqId, false)) != null)
535 seq.addSequenceFeature(new SequenceFeature(sf));
541 * clear any temporary handles used to speed up ID matching
543 protected void resetMatcher()
545 lastmatchedAl = null;
550 * Returns a sequence matching the given id, as follows
552 * <li>strict matching is on exact sequence name</li>
553 * <li>relaxed matching allows matching on a token within the sequence name,
555 * <li>first tries to find a match in the alignment sequences</li>
556 * <li>else tries to find a match in the new sequences already generated while
557 * parsing the features file</li>
558 * <li>else creates a new placeholder sequence, adds it to the new sequences
559 * list, and returns it</li>
565 * @param relaxedIdMatching
569 protected SequenceI findSequence(String seqId, AlignmentI align,
570 List<SequenceI> newseqs, boolean relaxedIdMatching)
572 // TODO encapsulate in SequenceIdMatcher, share the matcher
573 // with the GffHelper (removing code duplication)
574 SequenceI match = null;
575 if (relaxedIdMatching)
577 if (lastmatchedAl != align)
579 lastmatchedAl = align;
580 matcher = new SequenceIdMatcher(align.getSequencesArray());
583 matcher.addAll(newseqs);
586 match = matcher.findIdMatch(seqId);
590 match = align.findName(seqId, true);
591 if (match == null && newseqs != null)
593 for (SequenceI m : newseqs)
595 if (seqId.equals(m.getName()))
603 if (match == null && newseqs != null)
605 match = new SequenceDummy(seqId);
606 if (relaxedIdMatching)
608 matcher.addAll(Arrays.asList(new SequenceI[] { match }));
610 // add dummy sequence to the newseqs list
616 public void parseDescriptionHTML(SequenceFeature sf, boolean removeHTML)
618 if (sf.getDescription() == null)
622 ParseHtmlBodyAndLinks parsed = new ParseHtmlBodyAndLinks(
623 sf.getDescription(), removeHTML, newline);
627 sf.setDescription(parsed.getNonHtmlContent());
630 for (String link : parsed.getLinks())
637 * Returns contents of a Jalview format features file, for visible features, as
638 * filtered by type and group. Features with a null group are displayed if their
639 * feature type is visible. Non-positional features may optionally be included
640 * (with no check on type or group).
644 * @param includeNonPositional
645 * if true, include non-positional features
646 * (regardless of group or type)
647 * @param includeComplement
648 * if true, include visible complementary
649 * (CDS/protein) positional features, with
650 * locations converted to local sequence
654 public String printJalviewFormat(SequenceI[] sequences,
655 FeatureRenderer fr, boolean includeNonPositional,
656 boolean includeComplement)
658 Map<String, FeatureColourI> visibleColours = fr
659 .getDisplayedFeatureCols();
660 Map<String, FeatureMatcherSetI> featureFilters = fr.getFeatureFilters();
663 * write out feature colours (if we know them)
665 // TODO: decide if feature links should also be written here ?
666 StringBuilder out = new StringBuilder(256);
667 if (visibleColours != null)
669 for (Entry<String, FeatureColourI> featureColour : visibleColours
672 FeatureColourI colour = featureColour.getValue();
673 out.append(colour.toJalviewFormat(featureColour.getKey())).append(
678 String[] types = visibleColours == null ? new String[0]
679 : visibleColours.keySet()
680 .toArray(new String[visibleColours.keySet().size()]);
683 * feature filters if any
685 outputFeatureFilters(out, visibleColours, featureFilters);
688 * output features within groups
690 int count = outputFeaturesByGroup(out, fr, types, sequences,
691 includeNonPositional);
693 if (includeComplement)
695 count += outputComplementFeatures(out, fr, sequences);
698 return count > 0 ? out.toString() : "No Features Visible";
702 * Outputs any visible complementary (CDS/peptide) positional features as
703 * Jalview format, within feature group. The coordinates of the linked features
704 * are converted to the corresponding positions of the local sequences.
711 private int outputComplementFeatures(StringBuilder out,
712 FeatureRenderer fr, SequenceI[] sequences)
714 AlignViewportI comp = fr.getViewport().getCodingComplement();
715 FeatureRenderer fr2 = Desktop.getAlignFrameFor(comp)
716 .getFeatureRenderer();
719 * bin features by feature group and sequence
721 Map<String, Map<String, List<SequenceFeature>>> map = new TreeMap<>(
722 String.CASE_INSENSITIVE_ORDER);
725 for (SequenceI seq : sequences)
728 * find complementary features
730 List<SequenceFeature> complementary = findComplementaryFeatures(seq,
732 String seqName = seq.getName();
734 for (SequenceFeature sf : complementary)
736 String group = sf.getFeatureGroup();
737 if (!map.containsKey(group))
739 map.put(group, new LinkedHashMap<>()); // preserves sequence order
741 Map<String, List<SequenceFeature>> groupFeatures = map.get(group);
742 if (!groupFeatures.containsKey(seqName))
744 groupFeatures.put(seqName, new ArrayList<>());
746 List<SequenceFeature> foundFeatures = groupFeatures.get(seqName);
747 foundFeatures.add(sf);
753 * output features by group
755 for (Entry<String, Map<String, List<SequenceFeature>>> groupFeatures : map.entrySet())
758 String group = groupFeatures.getKey();
759 if (!"".equals(group))
761 out.append(STARTGROUP).append(TAB).append(group).append(newline);
763 Map<String, List<SequenceFeature>> seqFeaturesMap = groupFeatures
765 for (Entry<String, List<SequenceFeature>> seqFeatures : seqFeaturesMap
768 String sequenceName = seqFeatures.getKey();
769 for (SequenceFeature sf : seqFeatures.getValue())
771 formatJalviewFeature(out, sequenceName, sf);
774 if (!"".equals(group))
776 out.append(ENDGROUP).append(TAB).append(group).append(newline);
784 * Answers a list of mapped features visible in the (CDS/protein) complement,
785 * with feature positions translated to local sequence coordinates
791 protected List<SequenceFeature> findComplementaryFeatures(SequenceI seq,
795 * avoid duplication of features (e.g. peptide feature
796 * at all 3 mapped codon positions)
798 List<SequenceFeature> found = new ArrayList<>();
799 List<SequenceFeature> complementary = new ArrayList<>();
801 for (int pos = seq.getStart(); pos <= seq.getEnd(); pos++)
803 MappedFeatures mf = fr2.findComplementFeaturesAtResidue(seq, pos);
807 MapList mapping = mf.mapping.getMap();
808 for (SequenceFeature sf : mf.features)
811 * make a virtual feature with local coordinates
813 if (!found.contains(sf))
815 String group = sf.getFeatureGroup();
821 int begin = sf.getBegin();
822 int end = sf.getEnd();
823 int[] range = mf.mapping.getTo() == seq.getDatasetSequence()
824 ? mapping.locateInTo(begin, end)
825 : mapping.locateInFrom(begin, end);
826 SequenceFeature sf2 = new SequenceFeature(sf, range[0],
827 range[1], group, sf.getScore());
828 complementary.add(sf2);
834 return complementary;
838 * Outputs any feature filters defined for visible feature types, sandwiched by
839 * STARTFILTERS and ENDFILTERS lines
843 * @param featureFilters
845 void outputFeatureFilters(StringBuilder out,
846 Map<String, FeatureColourI> visible,
847 Map<String, FeatureMatcherSetI> featureFilters)
849 if (visible == null || featureFilters == null
850 || featureFilters.isEmpty())
855 boolean first = true;
856 for (String featureType : visible.keySet())
858 FeatureMatcherSetI filter = featureFilters.get(featureType);
864 out.append(newline).append(STARTFILTERS).append(newline);
866 out.append(featureType).append(TAB).append(filter.toStableString())
872 out.append(ENDFILTERS).append(newline);
878 * Appends output of visible sequence features within feature groups to the
879 * output buffer. Groups other than the null or empty group are sandwiched by
880 * STARTGROUP and ENDGROUP lines. Answers the number of features written.
884 * @param featureTypes
886 * @param includeNonPositional
889 private int outputFeaturesByGroup(StringBuilder out,
890 FeatureRenderer fr, String[] featureTypes,
891 SequenceI[] sequences, boolean includeNonPositional)
893 List<String> featureGroups = fr.getFeatureGroups();
896 * sort groups alphabetically, and ensure that features with a
897 * null or empty group are output after those in named groups
899 List<String> sortedGroups = new ArrayList<>(featureGroups);
900 sortedGroups.remove(null);
901 sortedGroups.remove("");
902 Collections.sort(sortedGroups);
903 sortedGroups.add(null);
904 sortedGroups.add("");
907 List<String> visibleGroups = fr.getDisplayedFeatureGroups();
910 * loop over all groups (may be visible or not);
911 * non-positional features are output even if group is not visible
913 for (String group : sortedGroups)
915 boolean firstInGroup = true;
916 boolean isNullGroup = group == null || "".equals(group);
918 for (int i = 0; i < sequences.length; i++)
920 String sequenceName = sequences[i].getName();
921 List<SequenceFeature> features = new ArrayList<>();
924 * get any non-positional features in this group, if wanted
925 * (for any feature type, whether visible or not)
927 if (includeNonPositional)
929 features.addAll(sequences[i].getFeatures()
930 .getFeaturesForGroup(false, group));
934 * add positional features for visible feature types, but
935 * (for named groups) only if feature group is visible
937 if (featureTypes.length > 0
938 && (isNullGroup || visibleGroups.contains(group)))
940 features.addAll(sequences[i].getFeatures().getFeaturesForGroup(
941 true, group, featureTypes));
944 for (SequenceFeature sf : features)
946 if (sf.isNonPositional() || fr.isVisible(sf))
954 out.append(STARTGROUP).append(TAB).append(group)
958 firstInGroup = false;
959 formatJalviewFeature(out, sequenceName, sf);
964 if (!isNullGroup && !firstInGroup)
966 out.append(ENDGROUP).append(TAB).append(group).append(newline);
973 * Formats one feature in Jalview format and appends to the string buffer
976 * @param sequenceName
977 * @param sequenceFeature
979 protected void formatJalviewFeature(
980 StringBuilder out, String sequenceName,
981 SequenceFeature sequenceFeature)
983 if (sequenceFeature.description == null
984 || sequenceFeature.description.equals(""))
986 out.append(sequenceFeature.type).append(TAB);
990 if (sequenceFeature.links != null
991 && sequenceFeature.getDescription().indexOf("<html>") == -1)
993 out.append("<html>");
996 out.append(sequenceFeature.description);
997 if (sequenceFeature.links != null)
999 for (int l = 0; l < sequenceFeature.links.size(); l++)
1001 String label = sequenceFeature.links.elementAt(l);
1002 String href = label.substring(label.indexOf("|") + 1);
1003 label = label.substring(0, label.indexOf("|"));
1005 if (sequenceFeature.description.indexOf(href) == -1)
1007 out.append(" <a href=\"").append(href).append("\">")
1008 .append(label).append("</a>");
1012 if (sequenceFeature.getDescription().indexOf("</html>") == -1)
1014 out.append("</html>");
1020 out.append(sequenceName);
1021 out.append("\t-1\t");
1022 out.append(sequenceFeature.begin);
1024 out.append(sequenceFeature.end);
1026 out.append(sequenceFeature.type);
1027 if (!Float.isNaN(sequenceFeature.score))
1030 out.append(sequenceFeature.score);
1032 out.append(newline);
1036 * Parse method that is called when a GFF file is dragged to the desktop
1041 AlignViewportI av = getViewport();
1044 if (av.getAlignment() != null)
1046 dataset = av.getAlignment().getDataset();
1048 if (dataset == null)
1050 // working in the applet context ?
1051 dataset = av.getAlignment();
1056 dataset = new Alignment(new SequenceI[] {});
1059 Map<String, FeatureColourI> featureColours = new HashMap<>();
1060 boolean parseResult = parse(dataset, featureColours, false, true);
1063 // pass error up somehow
1067 // update viewport with the dataset data ?
1071 setSeqs(dataset.getSequencesArray());
1076 * Implementation of unused abstract method
1078 * @return error message
1081 public String print(SequenceI[] sqs, boolean jvsuffix)
1083 System.out.println("Use printGffFormat() or printJalviewFormat()");
1088 * Returns features output in GFF2 format
1091 * the sequences whose features are to be
1094 * a map whose keys are the type names of
1096 * @param visibleFeatureGroups
1097 * @param includeNonPositionalFeatures
1098 * @param includeComplement
1101 public String printGffFormat(SequenceI[] sequences,
1102 FeatureRenderer fr, boolean includeNonPositionalFeatures,
1103 boolean includeComplement)
1105 FeatureRenderer fr2 = null;
1106 if (includeComplement)
1108 AlignViewportI comp = fr.getViewport().getCodingComplement();
1109 fr2 = Desktop.getAlignFrameFor(comp).getFeatureRenderer();
1112 Map<String, FeatureColourI> visibleColours = fr.getDisplayedFeatureCols();
1114 StringBuilder out = new StringBuilder(256);
1116 out.append(String.format("%s %d\n", GFF_VERSION, gffVersion == 0 ? 2 : gffVersion));
1118 String[] types = visibleColours == null ? new String[0]
1119 : visibleColours.keySet()
1120 .toArray(new String[visibleColours.keySet().size()]);
1122 for (SequenceI seq : sequences)
1124 List<SequenceFeature> seqFeatures = new ArrayList<>();
1125 List<SequenceFeature> features = new ArrayList<>();
1126 if (includeNonPositionalFeatures)
1128 features.addAll(seq.getFeatures().getNonPositionalFeatures());
1130 if (visibleColours != null && !visibleColours.isEmpty())
1132 features.addAll(seq.getFeatures().getPositionalFeatures(types));
1134 for (SequenceFeature sf : features)
1136 if (sf.isNonPositional() || fr.isVisible(sf))
1139 * drop features hidden by group visibility, colour threshold,
1140 * or feature filter condition
1142 seqFeatures.add(sf);
1146 if (includeComplement)
1148 seqFeatures.addAll(findComplementaryFeatures(seq, fr2));
1152 * sort features here if wanted
1154 for (SequenceFeature sf : seqFeatures)
1156 formatGffFeature(out, seq, sf);
1157 out.append(newline);
1161 return out.toString();
1165 * Formats one feature as GFF and appends to the string buffer
1167 private void formatGffFeature(StringBuilder out, SequenceI seq,
1170 String source = sf.featureGroup;
1173 source = sf.getDescription();
1176 out.append(seq.getName());
1180 out.append(sf.type);
1182 out.append(sf.begin);
1186 out.append(sf.score);
1189 int strand = sf.getStrand();
1190 out.append(strand == 1 ? "+" : (strand == -1 ? "-" : "."));
1193 String phase = sf.getPhase();
1194 out.append(phase == null ? "." : phase);
1196 // miscellaneous key-values (GFF column 9)
1197 String attributes = sf.getAttributes();
1198 if (attributes != null)
1200 out.append(TAB).append(attributes);
1205 * Returns a mapping given list of one or more Align descriptors (exonerate
1208 * @param alignedRegions
1209 * a list of "Align fromStart toStart fromCount"
1210 * @param mapIsFromCdna
1211 * if true, 'from' is dna, else 'from' is protein
1213 * either 1 (forward) or -1 (reverse)
1215 * @throws IOException
1217 protected MapList constructCodonMappingFromAlign(
1218 List<String> alignedRegions, boolean mapIsFromCdna, int strand)
1223 throw new IOException(
1224 "Invalid strand for a codon mapping (cannot be 0)");
1226 int regions = alignedRegions.size();
1227 // arrays to hold [start, end] for each aligned region
1228 int[] fromRanges = new int[regions * 2]; // from dna
1229 int[] toRanges = new int[regions * 2]; // to protein
1230 int fromRangesIndex = 0;
1231 int toRangesIndex = 0;
1233 for (String range : alignedRegions)
1236 * Align mapFromStart mapToStart mapFromCount
1237 * e.g. if mapIsFromCdna
1238 * Align 11270 143 120
1240 * 120 bases from pos 11270 align to pos 143 in peptide
1241 * if !mapIsFromCdna this would instead be
1242 * Align 143 11270 40
1244 String[] tokens = range.split(" ");
1245 if (tokens.length != 3)
1247 throw new IOException("Wrong number of fields for Align");
1254 fromStart = Integer.parseInt(tokens[0]);
1255 toStart = Integer.parseInt(tokens[1]);
1256 fromCount = Integer.parseInt(tokens[2]);
1257 } catch (NumberFormatException nfe)
1259 throw new IOException(
1260 "Invalid number in Align field: " + nfe.getMessage());
1264 * Jalview always models from dna to protein, so adjust values if the
1265 * GFF mapping is from protein to dna
1270 int temp = fromStart;
1271 fromStart = toStart;
1274 fromRanges[fromRangesIndex++] = fromStart;
1275 fromRanges[fromRangesIndex++] = fromStart + strand * (fromCount - 1);
1278 * If a codon has an intron gap, there will be contiguous 'toRanges';
1279 * this is handled for us by the MapList constructor.
1280 * (It is not clear that exonerate ever generates this case)
1282 toRanges[toRangesIndex++] = toStart;
1283 toRanges[toRangesIndex++] = toStart + (fromCount - 1) / 3;
1286 return new MapList(fromRanges, toRanges, 3, 1);
1290 * Parse a GFF format feature. This may include creating a 'dummy' sequence to
1291 * hold the feature, or for its mapped sequence, or both, to be resolved
1292 * either later in the GFF file (##FASTA section), or when the user loads
1293 * additional sequences.
1297 * @param relaxedIdMatching
1301 protected SequenceI parseGff(String[] gffColumns, AlignmentI alignment,
1302 boolean relaxedIdMatching, List<SequenceI> newseqs)
1305 * GFF: seqid source type start end score strand phase [attributes]
1307 if (gffColumns.length < 5)
1309 System.err.println("Ignoring GFF feature line with too few columns ("
1310 + gffColumns.length + ")");
1315 * locate referenced sequence in alignment _or_
1316 * as a forward or external reference (SequenceDummy)
1318 String seqId = gffColumns[0];
1319 SequenceI seq = findSequence(seqId, alignment, newseqs,
1322 SequenceFeature sf = null;
1323 GffHelperI helper = GffHelperFactory.getHelper(gffColumns);
1328 sf = helper.processGff(seq, gffColumns, alignment, newseqs,
1332 seq.addSequenceFeature(sf);
1333 while ((seq = alignment.findName(seq, seqId, true)) != null)
1335 seq.addSequenceFeature(new SequenceFeature(sf));
1338 } catch (IOException e)
1340 System.err.println("GFF parsing failed with: " + e.getMessage());
1349 * Process the 'column 9' data of the GFF file. This is less formally defined,
1350 * and its interpretation will vary depending on the tool that has generated
1356 protected void processGffColumnNine(String attributes, SequenceFeature sf)
1358 sf.setAttributes(attributes);
1361 * Parse attributes in column 9 and add them to the sequence feature's
1362 * 'otherData' table; use Note as a best proxy for description
1364 char nameValueSeparator = gffVersion == 3 ? '=' : ' ';
1365 // TODO check we don't break GFF2 values which include commas here
1366 Map<String, List<String>> nameValues = GffHelperBase
1367 .parseNameValuePairs(attributes, ";", nameValueSeparator, ",");
1368 for (Entry<String, List<String>> attr : nameValues.entrySet())
1370 String values = StringUtils.listToDelimitedString(attr.getValue(),
1372 sf.setValue(attr.getKey(), values);
1373 if (NOTE.equals(attr.getKey()))
1375 sf.setDescription(values);
1381 * After encountering ##fasta in a GFF3 file, process the remainder of the
1382 * file as FAST sequence data. Any placeholder sequences created during
1383 * feature parsing are updated with the actual sequences.
1387 * @throws IOException
1389 protected void processAsFasta(AlignmentI align, List<SequenceI> newseqs)
1395 } catch (IOException q)
1398 FastaFile parser = new FastaFile(this);
1399 List<SequenceI> includedseqs = parser.getSeqs();
1401 SequenceIdMatcher smatcher = new SequenceIdMatcher(newseqs);
1404 * iterate over includedseqs, and replacing matching ones with newseqs
1405 * sequences. Generic iterator not used here because we modify
1406 * includedseqs as we go
1408 for (int p = 0, pSize = includedseqs.size(); p < pSize; p++)
1410 // search for any dummy seqs that this sequence can be used to update
1411 SequenceI includedSeq = includedseqs.get(p);
1412 SequenceI dummyseq = smatcher.findIdMatch(includedSeq);
1413 if (dummyseq != null && dummyseq instanceof SequenceDummy)
1415 // probably have the pattern wrong
1416 // idea is that a flyweight proxy for a sequence ID can be created for
1417 // 1. stable reference creation
1418 // 2. addition of annotation
1419 // 3. future replacement by a real sequence
1420 // current pattern is to create SequenceDummy objects - a convenience
1421 // constructor for a Sequence.
1422 // problem is that when promoted to a real sequence, all references
1423 // need to be updated somehow. We avoid that by keeping the same object.
1424 ((SequenceDummy) dummyseq).become(includedSeq);
1425 dummyseq.createDatasetSequence();
1428 * Update mappings so they are now to the dataset sequence
1430 for (AlignedCodonFrame mapping : align.getCodonFrames())
1432 mapping.updateToDataset(dummyseq);
1436 * replace parsed sequence with the realised forward reference
1438 includedseqs.set(p, dummyseq);
1441 * and remove from the newseqs list
1443 newseqs.remove(dummyseq);
1448 * finally add sequences to the dataset
1450 for (SequenceI seq : includedseqs)
1452 // experimental: mapping-based 'alignment' to query sequence
1453 AlignmentUtils.alignSequenceAs(seq, align,
1454 String.valueOf(align.getGapCharacter()), false, true);
1456 // rename sequences if GFF handler requested this
1457 // TODO a more elegant way e.g. gffHelper.postProcess(newseqs) ?
1458 List<SequenceFeature> sfs = seq.getFeatures().getPositionalFeatures();
1461 String newName = (String) sfs.get(0).getValue(
1462 GffHelperI.RENAME_TOKEN);
1463 if (newName != null)
1465 seq.setName(newName);
1468 align.addSequence(seq);
1473 * Process a ## directive
1479 * @throws IOException
1481 protected void processGffPragma(String line, Map<String, String> gffProps,
1482 AlignmentI align, List<SequenceI> newseqs) throws IOException
1485 if ("###".equals(line))
1487 // close off any open 'forward references'
1491 String[] tokens = line.substring(2).split(" ");
1492 String pragma = tokens[0];
1493 String value = tokens.length == 1 ? null : tokens[1];
1495 if ("gff-version".equalsIgnoreCase(pragma))
1501 // value may be e.g. "3.1.2"
1502 gffVersion = Integer.parseInt(value.split("\\.")[0]);
1503 } catch (NumberFormatException e)
1509 else if ("sequence-region".equalsIgnoreCase(pragma))
1511 // could capture <seqid start end> if wanted here
1513 else if ("feature-ontology".equalsIgnoreCase(pragma))
1515 // should resolve against the specified feature ontology URI
1517 else if ("attribute-ontology".equalsIgnoreCase(pragma))
1519 // URI of attribute ontology - not currently used in GFF3
1521 else if ("source-ontology".equalsIgnoreCase(pragma))
1523 // URI of source ontology - not currently used in GFF3
1525 else if ("species-build".equalsIgnoreCase(pragma))
1527 // save URI of specific NCBI taxon version of annotations
1528 gffProps.put("species-build", value);
1530 else if ("fasta".equalsIgnoreCase(pragma))
1532 // process the rest of the file as a fasta file and replace any dummy
1534 processAsFasta(align, newseqs);
1538 System.err.println("Ignoring unknown pragma: " + line);