2 * Jalview - A Sequence Alignment Editor and Viewer ($$Version-Rel$$)
3 * Copyright (C) $$Year-Rel$$ The Jalview Authors
5 * This file is part of Jalview.
7 * Jalview is free software: you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License
9 * as published by the Free Software Foundation, either version 3
10 * of the License, or (at your option) any later version.
12 * Jalview is distributed in the hope that it will be useful, but
13 * WITHOUT ANY WARRANTY; without even the implied warranty
14 * of MERCHANTABILITY or FITNESS FOR A PARTICULAR
15 * PURPOSE. See the GNU General Public License for more details.
17 * You should have received a copy of the GNU General Public License
18 * along with Jalview. If not, see <http://www.gnu.org/licenses/>.
19 * The Jalview Authors are detailed in the 'AUTHORS' file.
23 import jalview.analysis.AlignmentUtils;
24 import jalview.analysis.SequenceIdMatcher;
25 import jalview.api.AlignViewportI;
26 import jalview.api.FeatureColourI;
27 import jalview.api.FeaturesSourceI;
28 import jalview.datamodel.AlignedCodonFrame;
29 import jalview.datamodel.Alignment;
30 import jalview.datamodel.AlignmentI;
31 import jalview.datamodel.SequenceDummy;
32 import jalview.datamodel.SequenceFeature;
33 import jalview.datamodel.SequenceI;
34 import jalview.datamodel.features.FeatureMatcherSet;
35 import jalview.datamodel.features.FeatureMatcherSetI;
36 import jalview.io.gff.GffHelperBase;
37 import jalview.io.gff.GffHelperFactory;
38 import jalview.io.gff.GffHelperI;
39 import jalview.schemes.FeatureColour;
40 import jalview.util.ColorUtils;
41 import jalview.util.MapList;
42 import jalview.util.ParseHtmlBodyAndLinks;
43 import jalview.util.StringUtils;
45 import java.awt.Color;
46 import java.io.IOException;
47 import java.util.ArrayList;
48 import java.util.Arrays;
49 import java.util.Collections;
50 import java.util.HashMap;
51 import java.util.List;
53 import java.util.Map.Entry;
56 * Parses and writes features files, which may be in Jalview, GFF2 or GFF3
57 * format. These are tab-delimited formats but with differences in the use of
60 * A Jalview feature file may define feature colours and then declare that the
61 * remainder of the file is in GFF format with the line 'GFF'.
63 * GFF3 files may include alignment mappings for features, which Jalview will
64 * attempt to model, and may include sequence data following a ##FASTA line.
71 public class FeaturesFile extends AlignFile implements FeaturesSourceI
// Regular expression used to split a line into its tab-separated columns
73 private static final String TAB_REGEX = "\\t";
// Keyword that starts a named feature group in a Jalview features file
75 private static final String STARTGROUP = "STARTGROUP";
// Keyword that ends a feature group
77 private static final String ENDGROUP = "ENDGROUP";
// Keyword that starts a block of feature filter definitions
79 private static final String STARTFILTERS = "STARTFILTERS";
// Keyword that ends a block of feature filter definitions
81 private static final String ENDFILTERS = "ENDFILTERS";
// Value compared against the sequence id column of a Jalview feature line;
// presumably signals that the sequence index column is to be used instead
// (TODO confirm)
83 private static final String ID_NOT_SPECIFIED = "ID_NOT_SPECIFIED";
// GFF attribute name whose value is also used as the feature description
85 private static final String NOTE = "Note";
// Pragma text written at the start of GFF output, followed by the version
87 protected static final String GFF_VERSION = "##gff-version";
// The alignment for which the cached matcher was most recently built
89 private AlignmentI lastmatchedAl = null;
// Cached id matcher used for relaxed sequence id matching
91 private SequenceIdMatcher matcher = null;
// Alignment that the no-argument parse passes to the feature parser; its
// codon frames are copied to the target alignment by addProperties
93 protected AlignmentI dataset;
// GFF version taken from a gff-version pragma; 0 is written out as version 2
95 protected int gffVersion;
98 * Creates a new FeaturesFile object.
100 public FeaturesFile()
/**
 * Constructor that defers parsing: the superclass is told not to parse the
 * file immediately.
 *
 * @param file
 *          File or String filename
 * @param paste
 *          the type of the data source
 * @throws IOException
 */
public FeaturesFile(Object file, DataSourceType paste) throws IOException
{
  super(false, file, paste);
}
/**
 * Constructor taking an existing FileParse as the data source.
 *
 * @param source
 * @throws IOException
 */
public FeaturesFile(FileParse source) throws IOException
{
  super(source);
}
/**
 * Constructor which lets the caller choose whether the file is parsed
 * straight away.
 *
 * @param parseImmediately
 *          if true, parse the file as part of construction
 * @param file
 *          File or String filename
 * @param type
 *          the type of the data source
 * @throws IOException
 */
public FeaturesFile(boolean parseImmediately, Object file,
        DataSourceType type) throws IOException
{
  super(parseImmediately, file, type);
}
141 * Parse GFF or sequence features file using case-independent matching,
145 * - alignment/dataset containing sequences that are to be annotated
147 * - hashtable to store feature colour definitions
149 * - process html strings into plain text
150 * @return true if features were added
152 public boolean parse(AlignmentI align,
153 Map<String, FeatureColourI> colours, boolean removeHTML)
155 return parse(align, colours, removeHTML, false);
159 * Extends the default addProperties by also adding peptide-to-cDNA mappings
160 * (if any) derived while parsing a GFF file
163 public void addProperties(AlignmentI al)
165 super.addProperties(al);
166 if (dataset != null && dataset.getCodonFrames() != null)
168 AlignmentI ds = (al.getDataset() == null) ? al : al.getDataset();
169 for (AlignedCodonFrame codons : dataset.getCodonFrames())
171 ds.addCodonFrame(codons);
177 * Parse GFF or Jalview format sequence features file
180 * - alignment/dataset containing sequences that are to be annotated
182 * - map to store feature colour definitions
184 * - process html strings into plain text
185 * @param relaxedIdmatching
186 * - when true, ID matches to compound sequence IDs are allowed
187 * @return true if features were added
189 public boolean parse(AlignmentI align,
190 Map<String, FeatureColourI> colours, boolean removeHTML,
191 boolean relaxedIdmatching)
193 return parse(align, colours, null, removeHTML, relaxedIdmatching);
197 * Parse GFF or Jalview format sequence features file
200 * - alignment/dataset containing sequences that are to be annotated
202 * - map to store feature colour definitions
204 * - map to store feature filter definitions
206 * - process html strings into plain text
207 * @param relaxedIdmatching
208 * - when true, ID matches to compound sequence IDs are allowed
209 * @return true if features were added
// Reads the source one line at a time with nextLine() and dispatches on the
// content of each line: pragmas, group/filter/colour declarations, or feature
// details in Jalview or GFF format.
// NOTE(review): this listing appears to have lost short lines (braces, else,
// continue, return and some simple statements), so the comments below describe
// only the statements that are visible.
211 public boolean parse(AlignmentI align,
212 Map<String, FeatureColourI> colours,
213 Map<String, FeatureMatcherSetI> filters, boolean removeHTML,
214 boolean relaxedIdmatching)
// properties collected from GFF pragmas (passed to processGffPragma)
216 Map<String, String> gffProps = new HashMap<>();
218 * keep track of any sequences we try to create from the data
220 List<SequenceI> newseqs = new ArrayList<>();
// name of the feature group currently open, if any
226 String featureGroup = null;
228 while ((line = nextLine()) != null)
230 // skip comments/process pragmas
231 if (line.length() == 0 || line.startsWith("#"))
// NOTE(review): toLowerCase() has no effect on a test for a ## prefix
233 if (line.toLowerCase().startsWith("##"))
235 processGffPragma(line, gffProps, align, newseqs);
// every other line is split into tab-separated columns
240 gffColumns = line.split(TAB_REGEX);
241 if (gffColumns.length == 1)
243 if (line.trim().equalsIgnoreCase("GFF"))
246 * Jalview features file with appended GFF
247 * assume GFF2 (though it may declare ##gff-version 3)
254 if (gffColumns.length > 0 && gffColumns.length < 4)
257 * if 2 or 3 tokens, we anticipate either 'startgroup', 'endgroup' or
258 * a feature type colour specification
260 String ft = gffColumns[0];
261 if (ft.equalsIgnoreCase(STARTFILTERS))
// filter lines up to ENDFILTERS are consumed by parseFilters
263 parseFilters(filters);
266 if (ft.equalsIgnoreCase(STARTGROUP))
// NOTE(review): the enclosing test admits a single-column line, for which
// gffColumns[1] does not exist; any resulting exception reaches the catch below
268 featureGroup = gffColumns[1];
270 else if (ft.equalsIgnoreCase(ENDGROUP))
272 // We should check whether this is the current group,
273 // but at present there's no way of showing more than 1 group
// anything else is taken to be a feature colour: type in column 1, colour
// scheme in column 2
278 String colscheme = gffColumns[1];
279 FeatureColourI colour = FeatureColour
280 .parseJalviewFeatureColour(colscheme);
283 colours.put(ft, colour);
290 * if not a comment, GFF pragma, startgroup, endgroup or feature
291 * colour specification, that just leaves a feature details line
292 * in either Jalview or GFF format
296 parseJalviewFeature(line, gffColumns, align, colours, removeHTML,
297 relaxedIdmatching, featureGroup);
301 parseGff(gffColumns, align, relaxedIdmatching, newseqs);
// any exception is recorded in warningMessage together with the offending
// line, and also reported on stdout and stderr
305 } catch (Exception ex)
307 // should report somewhere useful for UI if necessary
308 warningMessage = ((warningMessage == null) ? "" : warningMessage)
309 + "Parsing error at\n" + line;
310 System.out.println("Error parsing feature file: " + ex + "\n" + line);
311 ex.printStackTrace(System.err);
317 * experimental - add any dummy sequences with features to the alignment
318 * - we need them for Ensembl feature extraction - though maybe not otherwise
320 for (SequenceI newseq : newseqs)
// placeholder sequences that acquired no features are not added
322 if (newseq.getFeatures().hasFeatures())
324 align.addSequence(newseq);
331 * Reads input lines from STARTFILTERS to ENDFILTERS and adds a feature type
332 * filter to the map for each line parsed. After exit from this method,
333 * nextLine() should return the line after ENDFILTERS (or we are already at
334 * end of file if ENDFILTERS was missing).
337 * @throws IOException
339 protected void parseFilters(Map<String, FeatureMatcherSetI> filters)
343 while ((line = nextLine()) != null)
345 if (line.toUpperCase().startsWith(ENDFILTERS))
349 String[] tokens = line.split(TAB_REGEX);
350 if (tokens.length != 2)
352 System.err.println(String.format("Invalid token count %d for %d",
353 tokens.length, line));
357 String featureType = tokens[0];
358 FeatureMatcherSetI fm = FeatureMatcherSet.fromString(tokens[1]);
359 if (fm != null && filters != null)
361 filters.put(featureType, fm);
368 * Try to parse a Jalview format feature specification and add it as a
369 * sequence feature to any matching sequences in the alignment. Returns true
370 * if successful (a feature was added), or false if not.
375 * @param featureColours
377 * @param relaxedIdmatching
378 * @param featureGroup
380 protected boolean parseJalviewFeature(String line, String[] gffColumns,
381 AlignmentI alignment, Map<String, FeatureColourI> featureColours,
382 boolean removeHTML, boolean relaxedIdMatching,
386 * tokens: description seqid seqIndex start end type [score]
388 if (gffColumns.length < 6)
390 System.err.println("Ignoring feature line '" + line
391 + "' with too few columns (" + gffColumns.length + ")");
394 String desc = gffColumns[0];
395 String seqId = gffColumns[1];
396 SequenceI seq = findSequence(seqId, alignment, null, relaxedIdMatching);
398 if (!ID_NOT_SPECIFIED.equals(seqId))
400 seq = findSequence(seqId, alignment, null, relaxedIdMatching);
406 String seqIndex = gffColumns[2];
409 int idx = Integer.parseInt(seqIndex);
410 seq = alignment.getSequenceAt(idx);
411 } catch (NumberFormatException ex)
413 System.err.println("Invalid sequence index: " + seqIndex);
419 System.out.println("Sequence not found: " + line);
423 int startPos = Integer.parseInt(gffColumns[3]);
424 int endPos = Integer.parseInt(gffColumns[4]);
426 String ft = gffColumns[5];
428 if (!featureColours.containsKey(ft))
431 * Perhaps an old style groups file with no colours -
432 * synthesize a colour from the feature type
434 Color colour = ColorUtils.createColourFromName(ft);
435 featureColours.put(ft, new FeatureColour(colour));
437 SequenceFeature sf = null;
438 if (gffColumns.length > 6)
440 float score = Float.NaN;
443 score = new Float(gffColumns[6]).floatValue();
444 } catch (NumberFormatException ex)
446 sf = new SequenceFeature(ft, desc, startPos, endPos, featureGroup);
448 sf = new SequenceFeature(ft, desc, startPos, endPos, score,
453 sf = new SequenceFeature(ft, desc, startPos, endPos, featureGroup);
456 parseDescriptionHTML(sf, removeHTML);
458 seq.addSequenceFeature(sf);
461 && (seq = alignment.findName(seq, seqId, false)) != null)
463 seq.addSequenceFeature(new SequenceFeature(sf));
469 * clear any temporary handles used to speed up ID matching
471 protected void resetMatcher()
473 lastmatchedAl = null;
478 * Returns a sequence matching the given id, as follows
480 * <li>strict matching is on exact sequence name</li>
481 * <li>relaxed matching allows matching on a token within the sequence name,
483 * <li>first tries to find a match in the alignment sequences</li>
484 * <li>else tries to find a match in the new sequences already generated while
485 * parsing the features file</li>
486 * <li>else creates a new placeholder sequence, adds it to the new sequences
487 * list, and returns it</li>
493 * @param relaxedIdMatching
497 protected SequenceI findSequence(String seqId, AlignmentI align,
498 List<SequenceI> newseqs, boolean relaxedIdMatching)
500 // TODO encapsulate in SequenceIdMatcher, share the matcher
501 // with the GffHelper (removing code duplication)
502 SequenceI match = null;
503 if (relaxedIdMatching)
505 if (lastmatchedAl != align)
507 lastmatchedAl = align;
508 matcher = new SequenceIdMatcher(align.getSequencesArray());
511 matcher.addAll(newseqs);
514 match = matcher.findIdMatch(seqId);
518 match = align.findName(seqId, true);
519 if (match == null && newseqs != null)
521 for (SequenceI m : newseqs)
523 if (seqId.equals(m.getName()))
531 if (match == null && newseqs != null)
533 match = new SequenceDummy(seqId);
534 if (relaxedIdMatching)
536 matcher.addAll(Arrays.asList(new SequenceI[] { match }));
538 // add dummy sequence to the newseqs list
// Runs the feature description through ParseHtmlBodyAndLinks, using the
// non-HTML content it produces and the links it finds.
// NOTE(review): short lines are missing from this listing (braces, a return,
// and apparently the condition and loop body); comments cover visible code only.
544 public void parseDescriptionHTML(SequenceFeature sf, boolean removeHTML)
// a feature with no description is tested for first
546 if (sf.getDescription() == null)
// the parser receives the description, the removeHTML flag and the newline
// string of this file
550 ParseHtmlBodyAndLinks parsed = new ParseHtmlBodyAndLinks(
551 sf.getDescription(), removeHTML, newline);
// replaces the description with the non-HTML content; presumably only when
// removeHTML is true (the guarding line is not visible - TODO confirm)
555 sf.setDescription(parsed.getNonHtmlContent());
// visits each link found in the description
558 for (String link : parsed.getLinks())
565 * Returns contents of a Jalview format features file, for visible features, as
566 * filtered by type and group. Features with a null group are displayed if their
567 * feature type is visible. Non-positional features may optionally be included
568 * (with no check on type or group).
573 * map of colour for each visible feature type
574 * @param featureFilters
575 * @param visibleFeatureGroups
576 * @param includeNonPositional
577 * if true, include non-positional features (regardless of group or
581 public String printJalviewFormat(SequenceI[] sequences,
582 Map<String, FeatureColourI> visible,
583 Map<String, FeatureMatcherSetI> featureFilters,
584 List<String> visibleFeatureGroups, boolean includeNonPositional)
586 if (!includeNonPositional && (visible == null || visible.isEmpty()))
588 // no point continuing.
589 return "No Features Visible";
593 * write out feature colours (if we know them)
595 // TODO: decide if feature links should also be written here ?
596 StringBuilder out = new StringBuilder(256);
599 for (Entry<String, FeatureColourI> featureColour : visible.entrySet())
601 FeatureColourI colour = featureColour.getValue();
602 out.append(colour.toJalviewFormat(featureColour.getKey())).append(
607 String[] types = visible == null ? new String[0] : visible.keySet()
608 .toArray(new String[visible.keySet().size()]);
611 * feature filters if any
613 outputFeatureFilters(out, visible, featureFilters);
616 * sort groups alphabetically, and ensure that features with a
617 * null or empty group are output after those in named groups
619 List<String> sortedGroups = new ArrayList<>(visibleFeatureGroups);
620 sortedGroups.remove(null);
621 sortedGroups.remove("");
622 Collections.sort(sortedGroups);
623 sortedGroups.add(null);
624 sortedGroups.add("");
626 boolean foundSome = false;
629 * first output any non-positional features
631 if (includeNonPositional)
633 for (int i = 0; i < sequences.length; i++)
635 String sequenceName = sequences[i].getName();
636 for (SequenceFeature feature : sequences[i].getFeatures()
637 .getNonPositionalFeatures())
640 out.append(formatJalviewFeature(sequenceName, feature));
646 * positional features within groups
648 foundSome |= outputFeaturesByGroup(out, sortedGroups, types, sequences);
650 return foundSome ? out.toString() : "No Features Visible";
654 * Outputs any feature filters defined for visible feature types, sandwiched by
655 * STARTFILTERS and ENDFILTERS lines
659 * @param featureFilters
661 void outputFeatureFilters(StringBuilder out,
662 Map<String, FeatureColourI> visible,
663 Map<String, FeatureMatcherSetI> featureFilters)
665 if (visible == null || featureFilters == null
666 || featureFilters.isEmpty())
671 boolean first = true;
672 for (String featureType : visible.keySet())
674 FeatureMatcherSetI filter = featureFilters.get(featureType);
680 out.append(newline).append(STARTFILTERS).append(newline);
682 out.append(featureType).append(TAB).append(filter.toStableString())
688 out.append(ENDFILTERS).append(newline).append(newline);
694 * Appends output of sequence features within feature groups to the output
695 * buffer. Groups other than the null or empty group are sandwiched by
696 * STARTGROUP and ENDGROUP lines.
700 * @param featureTypes
// Writes the positional features of each sequence to the buffer, one feature
// group at a time, in the order of the supplied group list.
// NOTE(review): this listing has lost short lines (braces, conditions, some
// appends and the return); comments describe the visible statements only.
704 private boolean outputFeaturesByGroup(StringBuilder out,
705 List<String> groups, String[] featureTypes, SequenceI[] sequences)
// tracks whether any feature has been written
707 boolean foundSome = false;
708 for (String group : groups)
// a named group is one that is neither null nor empty
710 boolean isNamedGroup = (group != null && !"".equals(group));
// start-of-group marker followed by a tab
714 out.append(STARTGROUP).append(TAB);
720 * output positional features within groups
722 for (int i = 0; i < sequences.length; i++)
724 String sequenceName = sequences[i].getName();
725 List<SequenceFeature> features = new ArrayList<>();
// no features are fetched at all when there are no feature types
726 if (featureTypes.length > 0)
728 features.addAll(sequences[i].getFeatures().getFeaturesForGroup(
729 true, group, featureTypes));
732 for (SequenceFeature sequenceFeature : features)
// each feature is formatted as one Jalview feature line
735 out.append(formatJalviewFeature(sequenceName, sequenceFeature));
// end-of-group marker followed by a tab
741 out.append(ENDGROUP).append(TAB);
751 * @param sequenceName
752 * @param sequenceFeature
754 protected String formatJalviewFeature(
755 String sequenceName, SequenceFeature sequenceFeature)
757 StringBuilder out = new StringBuilder(64);
758 if (sequenceFeature.description == null
759 || sequenceFeature.description.equals(""))
761 out.append(sequenceFeature.type).append(TAB);
765 if (sequenceFeature.links != null
766 && sequenceFeature.getDescription().indexOf("<html>") == -1)
768 out.append("<html>");
771 out.append(sequenceFeature.description);
772 if (sequenceFeature.links != null)
774 for (int l = 0; l < sequenceFeature.links.size(); l++)
776 String label = sequenceFeature.links.elementAt(l);
777 String href = label.substring(label.indexOf("|") + 1);
778 label = label.substring(0, label.indexOf("|"));
780 if (sequenceFeature.description.indexOf(href) == -1)
782 out.append(" <a href=\"" + href + "\">" + label + "</a>");
786 if (sequenceFeature.getDescription().indexOf("</html>") == -1)
788 out.append("</html>");
794 out.append(sequenceName);
795 out.append("\t-1\t");
796 out.append(sequenceFeature.begin);
798 out.append(sequenceFeature.end);
800 out.append(sequenceFeature.type);
801 if (!Float.isNaN(sequenceFeature.score))
804 out.append(sequenceFeature.score);
808 return out.toString();
812 * Parse method that is called when a GFF file is dragged to the desktop
// Body of the parse method described by the comment line above.
// NOTE(review): the method signature and several short lines (braces and
// conditions) are not visible in this listing; comments cover visible code.
// Chooses the alignment to annotate from the viewport, if there is one
817 AlignViewportI av = getViewport();
820 if (av.getAlignment() != null)
// the dataset of the viewport alignment is used
822 dataset = av.getAlignment().getDataset();
826 // working in the applet context ?
// the viewport alignment itself is used instead
827 dataset = av.getAlignment();
// a new alignment with no sequences
832 dataset = new Alignment(new SequenceI[] {});
// colours found while parsing are collected in a local map
835 Map<String, FeatureColourI> featureColours = new HashMap<>();
// parses with removeHTML false and relaxed id matching true
836 boolean parseResult = parse(dataset, featureColours, false, true);
839 // pass error up somehow
843 // update viewport with the dataset data ?
// the sequences of the dataset become the sequences of this file
847 setSeqs(dataset.getSequencesArray());
852 * Implementation of unused abstract method
854 * @return error message
// Not a supported way of writing features: the visible body only prints a
// hint naming the two real output methods.
// NOTE(review): the return statement is not visible in this listing.
857 public String print(SequenceI[] sqs, boolean jvsuffix)
859 System.out.println("Use printGffFormat() or printJalviewFormat()");
864 * Returns features output in GFF2 format
867 * the sequences whose features are to be output
869 * a map whose keys are the type names of visible features
870 * @param visibleFeatureGroups
871 * @param includeNonPositionalFeatures
// Builds GFF output for the features of the given sequences: a version
// header line, then tab-separated feature lines.
// NOTE(review): short lines (braces, continue, several appends) are missing
// from this listing; comments describe the visible statements only.
874 public String printGffFormat(SequenceI[] sequences,
875 Map<String, FeatureColourI> visible,
876 List<String> visibleFeatureGroups,
877 boolean includeNonPositionalFeatures)
879 StringBuilder out = new StringBuilder(256);
// header line: the version pragma and the version number, where an unset
// version (0) is written as 2
881 out.append(String.format("%s %d\n", GFF_VERSION, gffVersion == 0 ? 2 : gffVersion));
// with no visible types and no non-positional features wanted, only the
// header is returned
883 if (!includeNonPositionalFeatures
884 && (visible == null || visible.isEmpty()))
886 return out.toString();
// the visible feature type names as an array (empty if visible is null)
889 String[] types = visible == null ? new String[0] : visible.keySet()
891 new String[visible.keySet().size()]);
893 for (SequenceI seq : sequences)
// non-positional features (if wanted) come before positional ones
895 List<SequenceFeature> features = new ArrayList<>();
896 if (includeNonPositionalFeatures)
898 features.addAll(seq.getFeatures().getNonPositionalFeatures());
900 if (visible != null && !visible.isEmpty())
902 features.addAll(seq.getFeatures().getPositionalFeatures(types));
905 for (SequenceFeature sf : features)
// the feature group is the starting value for the source column
907 String source = sf.featureGroup;
// positional features in a group that is not listed as visible are tested
// for here; NOTE(review): visibleFeatureGroups is used without a null check
908 if (!sf.isNonPositional() && source != null
909 && !visibleFeatureGroups.contains(source))
911 // group is not visible
// the description is an alternative value for source; presumably used when
// the group is null (the guarding line is not visible - TODO confirm)
917 source = sf.getDescription();
920 out.append(seq.getName());
926 out.append(sf.begin);
930 out.append(sf.score);
// strand is written as + for 1, - for -1, otherwise a dot
933 int strand = sf.getStrand();
934 out.append(strand == 1 ? "+" : (strand == -1 ? "-" : "."));
// a missing phase is written as a dot
937 String phase = sf.getPhase();
938 out.append(phase == null ? "." : phase);
940 // miscellaneous key-values (GFF column 9)
941 String attributes = sf.getAttributes();
942 if (attributes != null)
944 out.append(TAB).append(attributes);
951 return out.toString();
955 * Returns a mapping given list of one or more Align descriptors (exonerate
958 * @param alignedRegions
959 * a list of "Align fromStart toStart fromCount"
960 * @param mapIsFromCdna
961 * if true, 'from' is dna, else 'from' is protein
963 * either 1 (forward) or -1 (reverse)
965 * @throws IOException
967 protected MapList constructCodonMappingFromAlign(
968 List<String> alignedRegions, boolean mapIsFromCdna, int strand)
973 throw new IOException(
974 "Invalid strand for a codon mapping (cannot be 0)");
976 int regions = alignedRegions.size();
977 // arrays to hold [start, end] for each aligned region
978 int[] fromRanges = new int[regions * 2]; // from dna
979 int[] toRanges = new int[regions * 2]; // to protein
980 int fromRangesIndex = 0;
981 int toRangesIndex = 0;
983 for (String range : alignedRegions)
986 * Align mapFromStart mapToStart mapFromCount
987 * e.g. if mapIsFromCdna
988 * Align 11270 143 120
990 * 120 bases from pos 11270 align to pos 143 in peptide
991 * if !mapIsFromCdna this would instead be
994 String[] tokens = range.split(" ");
995 if (tokens.length != 3)
997 throw new IOException("Wrong number of fields for Align");
1004 fromStart = Integer.parseInt(tokens[0]);
1005 toStart = Integer.parseInt(tokens[1]);
1006 fromCount = Integer.parseInt(tokens[2]);
1007 } catch (NumberFormatException nfe)
1009 throw new IOException(
1010 "Invalid number in Align field: " + nfe.getMessage());
1014 * Jalview always models from dna to protein, so adjust values if the
1015 * GFF mapping is from protein to dna
1020 int temp = fromStart;
1021 fromStart = toStart;
1024 fromRanges[fromRangesIndex++] = fromStart;
1025 fromRanges[fromRangesIndex++] = fromStart + strand * (fromCount - 1);
1028 * If a codon has an intron gap, there will be contiguous 'toRanges';
1029 * this is handled for us by the MapList constructor.
1030 * (It is not clear that exonerate ever generates this case)
1032 toRanges[toRangesIndex++] = toStart;
1033 toRanges[toRangesIndex++] = toStart + (fromCount - 1) / 3;
1036 return new MapList(fromRanges, toRanges, 3, 1);
1040 * Parse a GFF format feature. This may include creating a 'dummy' sequence to
1041 * hold the feature, or for its mapped sequence, or both, to be resolved
1042 * either later in the GFF file (##FASTA section), or when the user loads
1043 * additional sequences.
1047 * @param relaxedIdMatching
1051 protected SequenceI parseGff(String[] gffColumns, AlignmentI alignment,
1052 boolean relaxedIdMatching, List<SequenceI> newseqs)
1055 * GFF: seqid source type start end score strand phase [attributes]
1057 if (gffColumns.length < 5)
1059 System.err.println("Ignoring GFF feature line with too few columns ("
1060 + gffColumns.length + ")");
1065 * locate referenced sequence in alignment _or_
1066 * as a forward or external reference (SequenceDummy)
1068 String seqId = gffColumns[0];
1069 SequenceI seq = findSequence(seqId, alignment, newseqs,
1072 SequenceFeature sf = null;
1073 GffHelperI helper = GffHelperFactory.getHelper(gffColumns);
1078 sf = helper.processGff(seq, gffColumns, alignment, newseqs,
1082 seq.addSequenceFeature(sf);
1083 while ((seq = alignment.findName(seq, seqId, true)) != null)
1085 seq.addSequenceFeature(new SequenceFeature(sf));
1088 } catch (IOException e)
1090 System.err.println("GFF parsing failed with: " + e.getMessage());
1099 * Process the 'column 9' data of the GFF file. This is less formally defined,
1100 * and its interpretation will vary depending on the tool that has generated
// Stores the raw attributes text on the feature, then parses it into
// name/value pairs which are also stored on the feature.
// NOTE(review): short lines are missing from this listing, including the
// delimiter argument of listToDelimitedString; comments cover visible code.
1106 protected void processGffColumnNine(String attributes, SequenceFeature sf)
1108 sf.setAttributes(attributes);
1111 * Parse attributes in column 9 and add them to the sequence feature's
1112 * 'otherData' table; use Note as a best proxy for description
// the separator between a name and its value is = for GFF version 3,
// otherwise a space
1114 char nameValueSeparator = gffVersion == 3 ? '=' : ' ';
1115 // TODO check we don't break GFF2 values which include commas here
// pairs are separated by semicolons and multiple values by commas
1116 Map<String, List<String>> nameValues = GffHelperBase
1117 .parseNameValuePairs(attributes, ";", nameValueSeparator, ",");
1118 for (Entry<String, List<String>> attr : nameValues.entrySet())
// the values of one attribute are joined into a single string
1120 String values = StringUtils.listToDelimitedString(attr.getValue(),
1122 sf.setValue(attr.getKey(), values);
// the Note attribute also supplies the feature description
1123 if (NOTE.equals(attr.getKey()))
1125 sf.setDescription(values);
1131 * After encountering ##fasta in a GFF3 file, process the remainder of the
1132 * file as FASTA sequence data. Any placeholder sequences created during
1133 * feature parsing are updated with the actual sequences.
1137 * @throws IOException
// Parses the rest of the input with a FastaFile parser, lets any matching
// placeholder (SequenceDummy) take on the parsed sequence, and adds the
// resulting sequences to the alignment.
// NOTE(review): short lines are missing from this listing (braces, the body
// of the try block, and apparently a guard before sfs.get(0)); comments
// describe the visible statements only.
1139 protected void processAsFasta(AlignmentI align, List<SequenceI> newseqs)
// NOTE(review): no handling of this IOException is visible
1145 } catch (IOException q)
// the FASTA parser reads from this same file object
1148 FastaFile parser = new FastaFile(this);
1149 List<SequenceI> includedseqs = parser.getSeqs();
// matcher over the placeholder sequences created while parsing features
1151 SequenceIdMatcher smatcher = new SequenceIdMatcher(newseqs);
1154 * iterate over includedseqs, and replacing matching ones with newseqs
1155 * sequences. Generic iterator not used here because we modify
1156 * includedseqs as we go
1158 for (int p = 0, pSize = includedseqs.size(); p < pSize; p++)
1160 // search for any dummy seqs that this sequence can be used to update
1161 SequenceI includedSeq = includedseqs.get(p);
1162 SequenceI dummyseq = smatcher.findIdMatch(includedSeq);
// NOTE(review): instanceof is already false for null, so the null test is
// redundant
1163 if (dummyseq != null && dummyseq instanceof SequenceDummy)
1165 // probably have the pattern wrong
1166 // idea is that a flyweight proxy for a sequence ID can be created for
1167 // 1. stable reference creation
1168 // 2. addition of annotation
1169 // 3. future replacement by a real sequence
1170 // current pattern is to create SequenceDummy objects - a convenience
1171 // constructor for a Sequence.
1172 // problem is that when promoted to a real sequence, all references
1173 // need to be updated somehow. We avoid that by keeping the same object.
1174 ((SequenceDummy) dummyseq).become(includedSeq);
1175 dummyseq.createDatasetSequence();
1178 * Update mappings so they are now to the dataset sequence
1180 for (AlignedCodonFrame mapping : align.getCodonFrames())
1182 mapping.updateToDataset(dummyseq);
1186 * replace parsed sequence with the realised forward reference
1188 includedseqs.set(p, dummyseq);
1191 * and remove from the newseqs list
1193 newseqs.remove(dummyseq);
1198 * finally add sequences to the dataset
1200 for (SequenceI seq : includedseqs)
1202 // experimental: mapping-based 'alignment' to query sequence
1203 AlignmentUtils.alignSequenceAs(seq, align,
1204 String.valueOf(align.getGapCharacter()), false, true);
1206 // rename sequences if GFF handler requested this
1207 // TODO a more elegant way e.g. gffHelper.postProcess(newseqs) ?
1208 List<SequenceFeature> sfs = seq.getFeatures().getPositionalFeatures();
// the new name, if any, is read from the first positional feature only
1211 String newName = (String) sfs.get(0).getValue(
1212 GffHelperI.RENAME_TOKEN);
1213 if (newName != null)
1215 seq.setName(newName);
1218 align.addSequence(seq);
1223 * Process a ## directive
1229 * @throws IOException
1231 protected void processGffPragma(String line, Map<String, String> gffProps,
1232 AlignmentI align, List<SequenceI> newseqs) throws IOException
1235 if ("###".equals(line))
1237 // close off any open 'forward references'
1241 String[] tokens = line.substring(2).split(" ");
1242 String pragma = tokens[0];
1243 String value = tokens.length == 1 ? null : tokens[1];
1245 if ("gff-version".equalsIgnoreCase(pragma))
1251 // value may be e.g. "3.1.2"
1252 gffVersion = Integer.parseInt(value.split("\\.")[0]);
1253 } catch (NumberFormatException e)
1259 else if ("sequence-region".equalsIgnoreCase(pragma))
1261 // could capture <seqid start end> if wanted here
1263 else if ("feature-ontology".equalsIgnoreCase(pragma))
1265 // should resolve against the specified feature ontology URI
1267 else if ("attribute-ontology".equalsIgnoreCase(pragma))
1269 // URI of attribute ontology - not currently used in GFF3
1271 else if ("source-ontology".equalsIgnoreCase(pragma))
1273 // URI of source ontology - not currently used in GFF3
1275 else if ("species-build".equalsIgnoreCase(pragma))
1277 // save URI of specific NCBI taxon version of annotations
1278 gffProps.put("species-build", value);
1280 else if ("fasta".equalsIgnoreCase(pragma))
1282 // process the rest of the file as a fasta file and replace any dummy
1284 processAsFasta(align, newseqs);
1288 System.err.println("Ignoring unknown pragma: " + line);