X-Git-Url: http://source.jalview.org/gitweb/?a=blobdiff_plain;f=src%2Fjalview%2Fio%2FFeaturesFile.java;h=3f1cdd11bd506ccb0a1e5307aed3db3cc93a386b;hb=552acd59bb94adcb4b8f6012a0ecb64d05c799cd;hp=fbe871b67a7af8130062988385817d11b3d15850;hpb=0873fd1ab406be7ef121f466c7b53e4b6315bbca;p=jalview.git diff --git a/src/jalview/io/FeaturesFile.java b/src/jalview/io/FeaturesFile.java index fbe871b..3f1cdd1 100755 --- a/src/jalview/io/FeaturesFile.java +++ b/src/jalview/io/FeaturesFile.java @@ -20,47 +20,84 @@ */ package jalview.io; +import java.awt.Color; +import java.io.IOException; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collections; +import java.util.HashMap; +import java.util.LinkedHashMap; +import java.util.List; +import java.util.Locale; +import java.util.Map; +import java.util.Map.Entry; +import java.util.TreeMap; + +import jalview.analysis.AlignmentUtils; import jalview.analysis.SequenceIdMatcher; +import jalview.api.AlignViewportI; +import jalview.api.FeatureColourI; +import jalview.api.FeatureRenderer; +import jalview.api.FeaturesSourceI; import jalview.datamodel.AlignedCodonFrame; +import jalview.datamodel.Alignment; import jalview.datamodel.AlignmentI; +import jalview.datamodel.MappedFeatures; import jalview.datamodel.SequenceDummy; import jalview.datamodel.SequenceFeature; import jalview.datamodel.SequenceI; -import jalview.schemes.AnnotationColourGradient; -import jalview.schemes.GraduatedColor; -import jalview.schemes.UserColourScheme; -import jalview.util.Format; +import jalview.datamodel.features.FeatureMatcherSet; +import jalview.datamodel.features.FeatureMatcherSetI; +import jalview.gui.Desktop; +import jalview.io.gff.GffHelperFactory; +import jalview.io.gff.GffHelperI; +import jalview.schemes.FeatureColour; +import jalview.util.ColorUtils; import jalview.util.MapList; - -import java.io.IOException; -import java.util.ArrayList; -import java.util.Arrays; -import java.util.HashMap; -import java.util.Hashtable; -import 
java.util.Iterator; -import java.util.List; -import java.util.Map; -import java.util.StringTokenizer; -import java.util.Vector; +import jalview.util.ParseHtmlBodyAndLinks; +import jalview.util.StringUtils; /** - * Parse and create Jalview Features files Detects GFF format features files and - * parses. Does not implement standard print() - call specific printFeatures or - * printGFF. Uses AlignmentI.findSequence(String id) to find the sequence object - * for the features annotation - this normally works on an exact match. + * Parses and writes features files, which may be in Jalview, GFF2 or GFF3 + * format. These are tab-delimited formats but with differences in the use of + * columns. + * + * A Jalview feature file may define feature colours and then declare that the + * remainder of the file is in GFF format with the line 'GFF'. + * + * GFF3 files may include alignment mappings for features, which Jalview will + * attempt to model, and may include sequence data following a ##FASTA line. 
+ * * * @author AMW - * @version $Revision$ + * @author jbprocter + * @author gmcarstairs */ -public class FeaturesFile extends AlignFile +public class FeaturesFile extends AlignFile implements FeaturesSourceI { - /** - * work around for GFF interpretation bug where source string becomes - * description rather than a group - */ - private boolean doGffSource = true; + private static final String EQUALS = "="; + + private static final String TAB_REGEX = "\\t"; + + private static final String STARTGROUP = "STARTGROUP"; + + private static final String ENDGROUP = "ENDGROUP"; + + private static final String STARTFILTERS = "STARTFILTERS"; + + private static final String ENDFILTERS = "ENDFILTERS"; - private int gffversion; + private static final String ID_NOT_SPECIFIED = "ID_NOT_SPECIFIED"; + + protected static final String GFF_VERSION = "##gff-version"; + + private AlignmentI lastmatchedAl = null; + + private SequenceIdMatcher matcher = null; + + protected AlignmentI dataset; + + protected int gffVersion; /** * Creates a new FeaturesFile object. 
@@ -70,13 +107,16 @@ public class FeaturesFile extends AlignFile } /** - * @param inFile - * @param type + * Constructor which does not parse the file immediately + * + * @param file + * @param paste * @throws IOException */ - public FeaturesFile(String inFile, String type) throws IOException + public FeaturesFile(String file, DataSourceType paste) + throws IOException { - super(inFile, type); + super(false, file, paste); } /** @@ -89,26 +129,17 @@ public class FeaturesFile extends AlignFile } /** + * Constructor that optionally parses the file immediately + * * @param parseImmediately - * @param source - * @throws IOException - */ - public FeaturesFile(boolean parseImmediately, FileParse source) - throws IOException - { - super(parseImmediately, source); - } - - /** - * @param parseImmediately - * @param inFile + * @param file * @param type * @throws IOException */ - public FeaturesFile(boolean parseImmediately, String inFile, String type) - throws IOException + public FeaturesFile(boolean parseImmediately, String file, + DataSourceType type) throws IOException { - super(parseImmediately, inFile, type); + super(parseImmediately, file, type); } /** @@ -123,565 +154,156 @@ public class FeaturesFile extends AlignFile * - process html strings into plain text * @return true if features were added */ - public boolean parse(AlignmentI align, Hashtable colours, - boolean removeHTML) + public boolean parse(AlignmentI align, + Map colours, boolean removeHTML) { - return parse(align, colours, null, removeHTML, false); + return parse(align, colours, removeHTML, false); } /** - * Parse GFF or sequence features file optionally using case-independent - * matching, discarding URLs - * - * @param align - * - alignment/dataset containing sequences that are to be annotated - * @param colours - * - hashtable to store feature colour definitions - * @param removeHTML - * - process html strings into plain text - * @param relaxedIdmatching - * - when true, ID matches to compound sequence 
IDs are allowed - * @return true if features were added + * Extends the default addProperties by also adding peptide-to-cDNA mappings + * (if any) derived while parsing a GFF file */ - public boolean parse(AlignmentI align, Map colours, boolean removeHTML, - boolean relaxedIdMatching) + @Override + public void addProperties(AlignmentI al) { - return parse(align, colours, null, removeHTML, relaxedIdMatching); + super.addProperties(al); + if (dataset != null && dataset.getCodonFrames() != null) + { + AlignmentI ds = (al.getDataset() == null) ? al : al.getDataset(); + for (AlignedCodonFrame codons : dataset.getCodonFrames()) + { + ds.addCodonFrame(codons); + } + } } /** - * Parse GFF or sequence features file optionally using case-independent - * matching + * Parse GFF or Jalview format sequence features file * * @param align * - alignment/dataset containing sequences that are to be annotated * @param colours - * - hashtable to store feature colour definitions - * @param featureLink - * - hashtable to store associated URLs + * - map to store feature colour definitions * @param removeHTML * - process html strings into plain text + * @param relaxedIdmatching + * - when true, ID matches to compound sequence IDs are allowed * @return true if features were added */ - public boolean parse(AlignmentI align, Map colours, Map featureLink, - boolean removeHTML) - { - return parse(align, colours, featureLink, removeHTML, false); - } - - @Override - public void addAnnotations(AlignmentI al) - { - // TODO Auto-generated method stub - super.addAnnotations(al); - } - - @Override - public void addProperties(AlignmentI al) - { - // TODO Auto-generated method stub - super.addProperties(al); - } - - @Override - public void addSeqGroups(AlignmentI al) + public boolean parse(AlignmentI align, + Map colours, boolean removeHTML, + boolean relaxedIdmatching) { - // TODO Auto-generated method stub - super.addSeqGroups(al); + return parse(align, colours, null, removeHTML, relaxedIdmatching); } 
/** - * Parse GFF or sequence features file + * Parse GFF or Jalview format sequence features file * * @param align * - alignment/dataset containing sequences that are to be annotated * @param colours - * - hashtable to store feature colour definitions - * @param featureLink - * - hashtable to store associated URLs + * - map to store feature colour definitions + * @param filters + * - map to store feature filter definitions * @param removeHTML * - process html strings into plain text * @param relaxedIdmatching * - when true, ID matches to compound sequence IDs are allowed * @return true if features were added */ - public boolean parse(AlignmentI align, Map colours, Map featureLink, - boolean removeHTML, boolean relaxedIdmatching) + public boolean parse(AlignmentI align, + Map colours, + Map filters, boolean removeHTML, + boolean relaxedIdmatching) { + Map gffProps = new HashMap<>(); + /* + * keep track of any sequences we try to create from the data + */ + List newseqs = new ArrayList<>(); String line = null; try { - SequenceI seq = null; - /** - * keep track of any sequences we try to create from the data if it is a GFF3 file - */ - ArrayList newseqs = new ArrayList(); - String type, desc, token = null; - - int index, start, end; - float score; - StringTokenizer st; - SequenceFeature sf; - String featureGroup = null, groupLink = null; - Map typeLink = new Hashtable(); - /** - * when true, assume GFF style features rather than Jalview style. 
- */ - boolean GFFFile = true; - Map gffProps = new HashMap(); + String[] gffColumns; + String featureGroup = null; + while ((line = nextLine()) != null) { // skip comments/process pragmas - if (line.startsWith("#")) + if (line.length() == 0 || line.startsWith("#")) { - if (line.startsWith("##")) + if (line.toLowerCase().startsWith("##")) { - // possibly GFF2/3 version and metadata header processGffPragma(line, gffProps, align, newseqs); - line = ""; } continue; } - st = new StringTokenizer(line, "\t"); - if (st.countTokens() == 1) + gffColumns = line.split(TAB_REGEX); + if (gffColumns.length == 1) { if (line.trim().equalsIgnoreCase("GFF")) { - // Start parsing file as if it might be GFF again. - GFFFile = true; + /* + * Jalview features file with appended GFF + * assume GFF2 (though it may declare ##gff-version 3) + */ + gffVersion = 2; continue; } } - if (st.countTokens() > 1 && st.countTokens() < 4) + + if (gffColumns.length > 0 && gffColumns.length < 4) { - GFFFile = false; - type = st.nextToken(); - if (type.equalsIgnoreCase("startgroup")) + /* + * if 2 or 3 tokens, we anticipate either 'startgroup', 'endgroup' or + * a feature type colour specification + */ + String ft = gffColumns[0]; + if (ft.equalsIgnoreCase(STARTFILTERS)) { - featureGroup = st.nextToken(); - if (st.hasMoreElements()) - { - groupLink = st.nextToken(); - featureLink.put(featureGroup, groupLink); - } + parseFilters(filters); + continue; + } + if (ft.equalsIgnoreCase(STARTGROUP)) + { + featureGroup = gffColumns[1]; } - else if (type.equalsIgnoreCase("endgroup")) + else if (ft.equalsIgnoreCase(ENDGROUP)) { // We should check whether this is the current group, - // but at present theres no way of showing more than 1 group - st.nextToken(); + // but at present there's no way of showing more than 1 group featureGroup = null; - groupLink = null; } else { - Object colour = null; - String colscheme = st.nextToken(); - if (colscheme.indexOf("|") > -1 - || colscheme.trim().equalsIgnoreCase("label")) - 
{ - // Parse '|' separated graduated colourscheme fields: - // [label|][mincolour|maxcolour|[absolute|]minvalue|maxvalue|thresholdtype|thresholdvalue] - // can either provide 'label' only, first is optional, next two - // colors are required (but may be - // left blank), next is optional, nxt two min/max are required. - // first is either 'label' - // first/second and third are both hexadecimal or word equivalent - // colour. - // next two are values parsed as floats. - // fifth is either 'above','below', or 'none'. - // sixth is a float value and only required when fifth is either - // 'above' or 'below'. - StringTokenizer gcol = new StringTokenizer(colscheme, "|", - true); - // set defaults - int threshtype = AnnotationColourGradient.NO_THRESHOLD; - float min = Float.MIN_VALUE, max = Float.MAX_VALUE, threshval = Float.NaN; - boolean labelCol = false; - // Parse spec line - String mincol = gcol.nextToken(); - if (mincol == "|") - { - System.err - .println("Expected either 'label' or a colour specification in the line: " - + line); - continue; - } - String maxcol = null; - if (mincol.toLowerCase().indexOf("label") == 0) - { - labelCol = true; - mincol = (gcol.hasMoreTokens() ? gcol.nextToken() : null); // skip - // '|' - mincol = (gcol.hasMoreTokens() ? 
gcol.nextToken() : null); - } - String abso = null, minval, maxval; - if (mincol != null) - { - // at least four more tokens - if (mincol.equals("|")) - { - mincol = ""; - } - else - { - gcol.nextToken(); // skip next '|' - } - // continue parsing rest of line - maxcol = gcol.nextToken(); - if (maxcol.equals("|")) - { - maxcol = ""; - } - else - { - gcol.nextToken(); // skip next '|' - } - abso = gcol.nextToken(); - gcol.nextToken(); // skip next '|' - if (abso.toLowerCase().indexOf("abso") != 0) - { - minval = abso; - abso = null; - } - else - { - minval = gcol.nextToken(); - gcol.nextToken(); // skip next '|' - } - maxval = gcol.nextToken(); - if (gcol.hasMoreTokens()) - { - gcol.nextToken(); // skip next '|' - } - try - { - if (minval.length() > 0) - { - min = new Float(minval).floatValue(); - } - } catch (Exception e) - { - System.err - .println("Couldn't parse the minimum value for graduated colour for type (" - + colscheme - + ") - did you misspell 'auto' for the optional automatic colour switch ?"); - e.printStackTrace(); - } - try - { - if (maxval.length() > 0) - { - max = new Float(maxval).floatValue(); - } - } catch (Exception e) - { - System.err - .println("Couldn't parse the maximum value for graduated colour for type (" - + colscheme + ")"); - e.printStackTrace(); - } - } - else - { - // add in some dummy min/max colours for the label-only - // colourscheme. 
- mincol = "FFFFFF"; - maxcol = "000000"; - } - try - { - colour = new jalview.schemes.GraduatedColor( - new UserColourScheme(mincol).findColour('A'), - new UserColourScheme(maxcol).findColour('A'), min, - max); - } catch (Exception e) - { - System.err - .println("Couldn't parse the graduated colour scheme (" - + colscheme + ")"); - e.printStackTrace(); - } - if (colour != null) - { - ((jalview.schemes.GraduatedColor) colour) - .setColourByLabel(labelCol); - ((jalview.schemes.GraduatedColor) colour) - .setAutoScaled(abso == null); - // add in any additional parameters - String ttype = null, tval = null; - if (gcol.hasMoreTokens()) - { - // threshold type and possibly a threshold value - ttype = gcol.nextToken(); - if (ttype.toLowerCase().startsWith("below")) - { - ((jalview.schemes.GraduatedColor) colour) - .setThreshType(AnnotationColourGradient.BELOW_THRESHOLD); - } - else if (ttype.toLowerCase().startsWith("above")) - { - ((jalview.schemes.GraduatedColor) colour) - .setThreshType(AnnotationColourGradient.ABOVE_THRESHOLD); - } - else - { - ((jalview.schemes.GraduatedColor) colour) - .setThreshType(AnnotationColourGradient.NO_THRESHOLD); - if (!ttype.toLowerCase().startsWith("no")) - { - System.err - .println("Ignoring unrecognised threshold type : " - + ttype); - } - } - } - if (((GraduatedColor) colour).getThreshType() != AnnotationColourGradient.NO_THRESHOLD) - { - try - { - gcol.nextToken(); - tval = gcol.nextToken(); - ((jalview.schemes.GraduatedColor) colour) - .setThresh(new Float(tval).floatValue()); - } catch (Exception e) - { - System.err - .println("Couldn't parse threshold value as a float: (" - + tval + ")"); - e.printStackTrace(); - } - } - // parse the thresh-is-min token ? 
- if (gcol.hasMoreTokens()) - { - System.err - .println("Ignoring additional tokens in parameters in graduated colour specification\n"); - while (gcol.hasMoreTokens()) - { - System.err.println("|" + gcol.nextToken()); - } - System.err.println("\n"); - } - } - } - else - { - UserColourScheme ucs = new UserColourScheme(colscheme); - colour = ucs.findColour('A'); - } + String colscheme = gffColumns[1]; + FeatureColourI colour = FeatureColour + .parseJalviewFeatureColour(colscheme); if (colour != null) { - colours.put(type, colour); - } - if (st.hasMoreElements()) - { - String link = st.nextToken(); - typeLink.put(type, link); - if (featureLink == null) - { - featureLink = new Hashtable(); - } - featureLink.put(type, link); + colours.put(ft, colour); } } continue; } - String seqId = ""; - while (st.hasMoreElements()) - { - - if (GFFFile) - { - // Still possible this is an old Jalview file, - // which does not have type colours at the beginning - seqId = token = st.nextToken(); - seq = findName(align, seqId, relaxedIdmatching, newseqs); - if (seq != null) - { - desc = st.nextToken(); - String group = null; - if (doGffSource && desc.indexOf(' ') == -1) - { - // could also be a source term rather than description line - group = new String(desc); - } - type = st.nextToken(); - try - { - String stt = st.nextToken(); - if (stt.length() == 0 || stt.equals("-")) - { - start = 0; - } - else - { - start = Integer.parseInt(stt); - } - } catch (NumberFormatException ex) - { - start = 0; - } - try - { - String stt = st.nextToken(); - if (stt.length() == 0 || stt.equals("-")) - { - end = 0; - } - else - { - end = Integer.parseInt(stt); - } - } catch (NumberFormatException ex) - { - end = 0; - } - // TODO: decide if non positional feature assertion for input data - // where end==0 is generally valid - if (end == 0) - { - // treat as non-positional feature, regardless. 
- start = 0; - } - try - { - score = new Float(st.nextToken()).floatValue(); - } catch (NumberFormatException ex) - { - score = 0; - } - - sf = new SequenceFeature(type, desc, start, end, score, group); - try - { - sf.setValue("STRAND", st.nextToken()); - sf.setValue("FRAME", st.nextToken()); - } catch (Exception ex) - { - } - - if (st.hasMoreTokens()) - { - StringBuffer attributes = new StringBuffer(); - boolean sep = false; - while (st.hasMoreTokens()) - { - attributes.append((sep ? "\t" : "") + st.nextElement()); - sep = true; - } - // TODO validate and split GFF2 attributes field ? parse out - // ([A-Za-z][A-Za-z0-9_]*) ; and add as - // sf.setValue(attrib, val); - sf.setValue("ATTRIBUTES", attributes.toString()); - } - - if (processOrAddSeqFeature(align, newseqs, seq, sf, GFFFile, - relaxedIdmatching)) - { - // check whether we should add the sequence feature to any other - // sequences in the alignment with the same or similar - while ((seq = align.findName(seq, seqId, true)) != null) - { - seq.addSequenceFeature(new SequenceFeature(sf)); - } - } - break; - } - } - - if (GFFFile && seq == null) - { - desc = token; - } - else - { - desc = st.nextToken(); - } - if (!st.hasMoreTokens()) - { - System.err - .println("DEBUG: Run out of tokens when trying to identify the destination for the feature.. giving up."); - // in all probability, this isn't a file we understand, so bail - // quietly. 
- return false; - } - - token = st.nextToken(); - - if (!token.equals("ID_NOT_SPECIFIED")) - { - seq = findName(align, seqId = token, relaxedIdmatching, null); - st.nextToken(); - } - else - { - seqId = null; - try - { - index = Integer.parseInt(st.nextToken()); - seq = align.getSequenceAt(index); - } catch (NumberFormatException ex) - { - seq = null; - } - } - - if (seq == null) - { - System.out.println("Sequence not found: " + line); - break; - } - - start = Integer.parseInt(st.nextToken()); - end = Integer.parseInt(st.nextToken()); - - type = st.nextToken(); - - if (!colours.containsKey(type)) - { - // Probably the old style groups file - UserColourScheme ucs = new UserColourScheme(type); - colours.put(type, ucs.findColour('A')); - } - sf = new SequenceFeature(type, desc, "", start, end, featureGroup); - if (st.hasMoreTokens()) - { - try - { - score = new Float(st.nextToken()).floatValue(); - // update colourgradient bounds if allowed to - } catch (NumberFormatException ex) - { - score = 0; - } - sf.setScore(score); - } - if (groupLink != null && removeHTML) - { - sf.addLink(groupLink); - sf.description += "%LINK%"; - } - if (typeLink.containsKey(type) && removeHTML) - { - sf.addLink(typeLink.get(type).toString()); - sf.description += "%LINK%"; - } - - parseDescriptionHTML(sf, removeHTML); - - seq.addSequenceFeature(sf); - - while (seqId != null - && (seq = align.findName(seq, seqId, false)) != null) - { - seq.addSequenceFeature(new SequenceFeature(sf)); - } - // If we got here, its not a GFFFile - GFFFile = false; + /* + * if not a comment, GFF pragma, startgroup, endgroup or feature + * colour specification, that just leaves a feature details line + * in either Jalview or GFF format + */ + if (gffVersion == 0) + { + parseJalviewFeature(line, gffColumns, align, colours, removeHTML, + relaxedIdmatching, featureGroup); + } + else + { + parseGff(gffColumns, align, relaxedIdmatching, newseqs); } } resetMatcher(); @@ -696,831 +318,1224 @@ public class FeaturesFile 
extends AlignFile return false; } + /* + * experimental - add any dummy sequences with features to the alignment + * - we need them for Ensembl feature extraction - though maybe not otherwise + */ + for (SequenceI newseq : newseqs) + { + if (newseq.getFeatures().hasFeatures()) + { + align.addSequence(newseq); + } + } return true; } - private enum GffPragmas - { - gff_version, sequence_region, feature_ontology, attribute_ontology, source_ontology, species_build, fasta, hash - }; - - private static Map GFFPRAGMA; - static + /** + * Reads input lines from STARTFILTERS to ENDFILTERS and adds a feature type + * filter to the map for each line parsed. After exit from this method, + * nextLine() should return the line after ENDFILTERS (or we are already at + * end of file if ENDFILTERS was missing). + * + * @param filters + * @throws IOException + */ + protected void parseFilters(Map filters) + throws IOException { - GFFPRAGMA = new HashMap(); - GFFPRAGMA.put("sequence-region", GffPragmas.sequence_region); - GFFPRAGMA.put("feature-ontology", GffPragmas.feature_ontology); - GFFPRAGMA.put("#", GffPragmas.hash); - GFFPRAGMA.put("fasta", GffPragmas.fasta); - GFFPRAGMA.put("species-build", GffPragmas.species_build); - GFFPRAGMA.put("source-ontology", GffPragmas.source_ontology); - GFFPRAGMA.put("attribute-ontology", GffPragmas.attribute_ontology); + String line; + while ((line = nextLine()) != null) + { + // TODO: use .trim().equalsIgnoreCase here instead ? 
+ if (line.toUpperCase(Locale.ROOT).startsWith(ENDFILTERS)) + { + return; + } + String[] tokens = line.split(TAB_REGEX); + if (tokens.length != 2) + { + System.err.println(String.format("Invalid token count %d for %d", + tokens.length, line)); + } + else + { + String featureType = tokens[0]; + FeatureMatcherSetI fm = FeatureMatcherSet.fromString(tokens[1]); + if (fm != null && filters != null) + { + filters.put(featureType, fm); + } + } + } } - private void processGffPragma(String line, Map gffProps, - AlignmentI align, ArrayList newseqs) - throws IOException + /** + * Try to parse a Jalview format feature specification and add it as a + * sequence feature to any matching sequences in the alignment. Returns true + * if successful (a feature was added), or false if not. + * + * @param line + * @param gffColumns + * @param alignment + * @param featureColours + * @param removeHTML + * @param relaxedIdmatching + * @param featureGroup + */ + protected boolean parseJalviewFeature(String line, String[] gffColumns, + AlignmentI alignment, Map featureColours, + boolean removeHTML, boolean relaxedIdMatching, + String featureGroup) { - // line starts with ## - int spacepos = line.indexOf(' '); - String pragma = spacepos == -1 ? 
line.substring(2).trim() : line - .substring(2, spacepos); - GffPragmas gffpragma = GFFPRAGMA.get(pragma.toLowerCase()); - if (gffpragma == null) + /* + * tokens: description seqid seqIndex start end type [score] + */ + if (gffColumns.length < 6) { - return; + System.err.println("Ignoring feature line '" + line + + "' with too few columns (" + gffColumns.length + ")"); + return false; + } + String desc = gffColumns[0]; + String seqId = gffColumns[1]; + SequenceI seq = findSequence(seqId, alignment, null, relaxedIdMatching); + + if (!ID_NOT_SPECIFIED.equals(seqId)) + { + seq = findSequence(seqId, alignment, null, relaxedIdMatching); } - switch (gffpragma) + else { - case gff_version: + seqId = null; + seq = null; + String seqIndex = gffColumns[2]; try { - gffversion = Integer.parseInt(line.substring(spacepos + 1)); - } finally + int idx = Integer.parseInt(seqIndex); + seq = alignment.getSequenceAt(idx); + } catch (NumberFormatException ex) { - + System.err.println("Invalid sequence index: " + seqIndex); } - break; - case feature_ontology: - // resolve against specific feature ontology - break; - case attribute_ontology: - // resolve against specific attribute ontology - break; - case source_ontology: - // resolve against specific source ontology - break; - case species_build: - // resolve against specific NCBI taxon version - break; - case hash: - // close off any open feature hierarchies - break; - case fasta: - // process the rest of the file as a fasta file and replace any dummy - // sequence IDs - process_as_fasta(align, newseqs); - break; - default: - // we do nothing ? 
- System.err.println("Ignoring unknown pragma:\n" + line); } - } - private void process_as_fasta(AlignmentI align, List newseqs) - throws IOException - { - try + if (seq == null) { - mark(); - } catch (IOException q) + System.out.println("Sequence not found: " + line); + return false; + } + + int startPos = Integer.parseInt(gffColumns[3]); + int endPos = Integer.parseInt(gffColumns[4]); + + String ft = gffColumns[5]; + + if (!featureColours.containsKey(ft)) { + /* + * Perhaps an old style groups file with no colours - + * synthesize a colour from the feature type + */ + Color colour = ColorUtils.createColourFromName(ft); + featureColours.put(ft, new FeatureColour(colour)); } - FastaFile parser = new FastaFile(this); - List includedseqs = parser.getSeqs(); - SequenceIdMatcher smatcher = new SequenceIdMatcher(newseqs); - // iterate over includedseqs, and replacing matching ones with newseqs - // sequences. Generic iterator not used here because we modify includedseqs - // as we go - for (int p = 0, pSize = includedseqs.size(); p < pSize; p++) + SequenceFeature sf = null; + if (gffColumns.length > 6) { - // search for any dummy seqs that this sequence can be used to update - SequenceI dummyseq = smatcher.findIdMatch(includedseqs.get(p)); - if (dummyseq != null) + float score = Float.NaN; + try { - // dummyseq was created so it could be annotated and referred to in - // alignments/codon mappings - - SequenceI mseq = includedseqs.get(p); - // mseq is the 'template' imported from the FASTA file which we'll use - // to coomplete dummyseq - if (dummyseq instanceof SequenceDummy) - { - // probably have the pattern wrong - // idea is that a flyweight proxy for a sequence ID can be created for - // 1. stable reference creation - // 2. addition of annotation - // 3. future replacement by a real sequence - // current pattern is to create SequenceDummy objects - a convenience - // constructor for a Sequence. 
- // problem is that when promoted to a real sequence, all references - // need - // to be updated somehow. - ((SequenceDummy) dummyseq).become(mseq); - includedseqs.set(p, dummyseq); // template is no longer needed - } + score = Float.valueOf(gffColumns[6]).floatValue(); + } catch (NumberFormatException ex) + { + sf = new SequenceFeature(ft, desc, startPos, endPos, featureGroup); } + sf = new SequenceFeature(ft, desc, startPos, endPos, score, + featureGroup); } - // finally add sequences to the dataset - for (SequenceI seq : includedseqs) + else { - align.addSequence(seq); + sf = new SequenceFeature(ft, desc, startPos, endPos, featureGroup); + } + + parseDescriptionHTML(sf, removeHTML); + + seq.addSequenceFeature(sf); + + while (seqId != null + && (seq = alignment.findName(seq, seqId, false)) != null) + { + seq.addSequenceFeature(new SequenceFeature(sf)); } + return true; } /** - * take a sequence feature and examine its attributes to decide how it should - * be added to a sequence - * - * @param seq - * - the destination sequence constructed or discovered in the - * current context - * @param sf - * - the base feature with ATTRIBUTES property containing any - * additional attributes - * @param gFFFile - * - true if we are processing a GFF annotation file - * @return true if sf was actually added to the sequence, false if it was - * processed in another way + * clear any temporary handles used to speed up ID matching */ - public boolean processOrAddSeqFeature(AlignmentI align, List newseqs, SequenceI seq, SequenceFeature sf, - boolean gFFFile, boolean relaxedIdMatching) + protected void resetMatcher() { - String attr = (String) sf.getValue("ATTRIBUTES"); - boolean add = true; - if (gFFFile && attr != null) - { - int nattr=8; + lastmatchedAl = null; + matcher = null; + } - for (String attset : attr.split("\t")) + /** + * Returns a sequence matching the given id, as follows + *
    + *
  • strict matching is on exact sequence name
  • + *
  • relaxed matching allows matching on a token within the sequence name, + * or a dbxref
  • + *
  • first tries to find a match in the alignment sequences
  • + *
  • else tries to find a match in the new sequences already generated while + * parsing the features file
  • + *
  • else creates a new placeholder sequence, adds it to the new sequences + * list, and returns it
  • + *
+ * + * @param seqId + * @param align + * @param newseqs + * @param relaxedIdMatching + * + * @return + */ + protected SequenceI findSequence(String seqId, AlignmentI align, + List newseqs, boolean relaxedIdMatching) + { + // TODO encapsulate in SequenceIdMatcher, share the matcher + // with the GffHelper (removing code duplication) + SequenceI match = null; + if (relaxedIdMatching) + { + if (lastmatchedAl != align) { - if (attset==null || attset.trim().length()==0) + lastmatchedAl = align; + matcher = new SequenceIdMatcher(align.getSequencesArray()); + if (newseqs != null) { - continue; + matcher.addAll(newseqs); } - nattr++; - Map> set = new HashMap>(); - // normally, only expect one column - 9 - in this field - // the attributes (Gff3) or groups (gff2) field - for (String pair : attset.trim().split(";")) + } + match = matcher.findIdMatch(seqId); + } + else + { + match = align.findName(seqId, true); + if (match == null && newseqs != null) + { + for (SequenceI m : newseqs) { - pair = pair.trim(); - if (pair.length() == 0) - { - continue; - } - - // expect either space seperated (gff2) or '=' separated (gff3) - // key/value pairs here - - int eqpos = pair.indexOf('='),sppos = pair.indexOf(' '); - String key = null, value = null; - - if (sppos > -1 && (eqpos == -1 || sppos < eqpos)) - { - key = pair.substring(0, sppos); - value = pair.substring(sppos + 1); - } else { - if (eqpos > -1 && (sppos == -1 || eqpos < sppos)) - { - key = pair.substring(0, eqpos); - value = pair.substring(eqpos + 1); - } else - { - key = pair; - } - } - if (key != null) + if (seqId.equals(m.getName())) { - List vals = set.get(key); - if (vals == null) - { - vals = new ArrayList(); - set.put(key, vals); - } - if (value != null) - { - vals.add(value.trim()); - } + return m; } } - try - { - add &= processGffKey(set, nattr, seq, sf, align, newseqs, - relaxedIdMatching); // process decides if - // feature is actually - // added - } catch (InvalidGFF3FieldException ivfe) - { - 
System.err.println(ivfe); - } } + } - if (add) + if (match == null && newseqs != null) { - seq.addSequenceFeature(sf); + match = new SequenceDummy(seqId); + if (relaxedIdMatching) + { + matcher.addAll(Arrays.asList(new SequenceI[] { match })); + } + // add dummy sequence to the newseqs list + newseqs.add(match); } - return add; + return match; } - public class InvalidGFF3FieldException extends Exception + public void parseDescriptionHTML(SequenceFeature sf, boolean removeHTML) { - String field, value; + if (sf.getDescription() == null) + { + return; + } + ParseHtmlBodyAndLinks parsed = new ParseHtmlBodyAndLinks( + sf.getDescription(), removeHTML, newline); - public InvalidGFF3FieldException(String field, - Map> set, String message) + if (removeHTML) { - super(message + " (Field was " + field + " and value was " - + set.get(field).toString()); - this.field = field; - this.value = set.get(field).toString(); + sf.setDescription(parsed.getNonHtmlContent()); } + for (String link : parsed.getLinks()) + { + sf.addLink(link); + } } /** - * take a set of keys for a feature and interpret them + * Returns contents of a Jalview format features file, for visible features, as + * filtered by type and group. Features with a null group are displayed if their + * feature type is visible. Non-positional features may optionally be included + * (with no check on type or group). 
* - * @param set - * @param nattr - * @param seq - * @param sf + * @param sequences + * @param fr + * @param includeNonPositional + * if true, include non-positional features + * (regardless of group or type) + * @param includeComplement + * if true, include visible complementary + * (CDS/protein) positional features, with + * locations converted to local sequence + * coordinates * @return */ - public boolean processGffKey(Map> set, int nattr, - SequenceI seq, SequenceFeature sf, AlignmentI align, - List newseqs, boolean relaxedIdMatching) - throws InvalidGFF3FieldException + public String printJalviewFormat(SequenceI[] sequences, + FeatureRenderer fr, boolean includeNonPositional, + boolean includeComplement) { - String attr; - // decide how to interpret according to type - if (sf.getType().equals("similarity")) + Map visibleColours = fr + .getDisplayedFeatureCols(); + Map featureFilters = fr.getFeatureFilters(); + + /* + * write out feature colours (if we know them) + */ + // TODO: decide if feature links should also be written here ? + StringBuilder out = new StringBuilder(256); + if (visibleColours != null) { - int strand = sf.getStrand(); - // exonerate cdna/protein map - // look for fields - List querySeq = findNames(align, newseqs, - relaxedIdMatching, set.get(attr="Query")); - if (querySeq==null || querySeq.size()!=1) - { - throw new InvalidGFF3FieldException( attr, set, - "Expecting exactly one sequence in Query field (got " - + set.get(attr) + ")"); - } - if (set.containsKey(attr="Align")) + for (Entry featureColour : visibleColours + .entrySet()) { - // process the align maps and create cdna/protein maps - // ideally, the query sequences are in the alignment, but maybe not... - - AlignedCodonFrame alco = new AlignedCodonFrame(); - MapList codonmapping = constructCodonMappingFromAlign(set, attr, - strand); - - // add codon mapping, and hope! 
- alco.addMap(seq, querySeq.get(0), codonmapping); - align.addCodonFrame(alco); - // everything that's needed to be done is done - // no features to create here ! - return false; + FeatureColourI colour = featureColour.getValue(); + out.append(colour.toJalviewFormat(featureColour.getKey())).append( + newline); } + } + + String[] types = visibleColours == null ? new String[0] + : visibleColours.keySet() + .toArray(new String[visibleColours.keySet().size()]); + + /* + * feature filters if any + */ + outputFeatureFilters(out, visibleColours, featureFilters); + + /* + * output features within groups + */ + int count = outputFeaturesByGroup(out, fr, types, sequences, + includeNonPositional); + if (includeComplement) + { + count += outputComplementFeatures(out, fr, sequences); } - return true; + + return count > 0 ? out.toString() : "No Features Visible"; } - private MapList constructCodonMappingFromAlign( - Map> set, - String attr, int strand) throws InvalidGFF3FieldException + /** + * Outputs any visible complementary (CDS/peptide) positional features as + * Jalview format, within feature group. The coordinates of the linked features + * are converted to the corresponding positions of the local sequences. 
+ * + * @param out + * @param fr + * @param sequences + * @return + */ + private int outputComplementFeatures(StringBuilder out, + FeatureRenderer fr, SequenceI[] sequences) { - if (strand == 0) - { - throw new InvalidGFF3FieldException(attr, set, - "Invalid strand for a codon mapping (cannot be 0)"); - } - List fromrange = new ArrayList(), torange = new ArrayList(); - int lastppos = 0, lastpframe = 0; - for (String range : set.get(attr)) + AlignViewportI comp = fr.getViewport().getCodingComplement(); + FeatureRenderer fr2 = Desktop.getAlignFrameFor(comp) + .getFeatureRenderer(); + + /* + * bin features by feature group and sequence + */ + Map>> map = new TreeMap<>( + String.CASE_INSENSITIVE_ORDER); + int count = 0; + + for (SequenceI seq : sequences) { - List ints = new ArrayList(); - StringTokenizer st = new StringTokenizer(range, " "); - while (st.hasMoreTokens()) + /* + * find complementary features + */ + List complementary = findComplementaryFeatures(seq, + fr2); + String seqName = seq.getName(); + + for (SequenceFeature sf : complementary) { - String num = st.nextToken(); - try + String group = sf.getFeatureGroup(); + if (!map.containsKey(group)) { - ints.add(new Integer(num)); - } catch (NumberFormatException nfe) + map.put(group, new LinkedHashMap<>()); // preserves sequence order + } + Map> groupFeatures = map.get(group); + if (!groupFeatures.containsKey(seqName)) { - throw new InvalidGFF3FieldException(attr, set, - "Invalid number in field " + num); + groupFeatures.put(seqName, new ArrayList<>()); } + List foundFeatures = groupFeatures.get(seqName); + foundFeatures.add(sf); + count++; } - // Align positionInRef positionInQuery LengthInRef - // contig_1146 exonerate:protein2genome:local similarity 8534 11269 - // 3652 - . 
alignment_id 0 ; - // Query DDB_G0269124 - // Align 11270 143 120 - // corresponds to : 120 bases align at pos 143 in protein to 11270 on - // dna in strand direction - // Align 11150 187 282 - // corresponds to : 282 bases align at pos 187 in protein to 11150 on - // dna in strand direction - // - // Align 10865 281 888 - // Align 9977 578 1068 - // Align 8909 935 375 - // - if (ints.size() != 3) + } + + /* + * output features by group + */ + for (Entry>> groupFeatures : map.entrySet()) + { + out.append(newline); + String group = groupFeatures.getKey(); + if (!"".equals(group)) { - throw new InvalidGFF3FieldException(attr, set, - "Invalid number of fields for this attribute (" - + ints.size() + ")"); + out.append(STARTGROUP).append(TAB).append(group).append(newline); } - fromrange.add(new Integer(ints.get(0).intValue())); - fromrange.add(new Integer(ints.get(0).intValue() + strand - * ints.get(2).intValue())); - // how are intron/exon boundaries that do not align in codons - // represented - if (ints.get(1).equals(lastppos) && lastpframe > 0) + Map> seqFeaturesMap = groupFeatures + .getValue(); + for (Entry> seqFeatures : seqFeaturesMap + .entrySet()) { - // extend existing to map - lastppos += ints.get(2) / 3; - lastpframe = ints.get(2) % 3; - torange.set(torange.size() - 1, new Integer(lastppos)); + String sequenceName = seqFeatures.getKey(); + for (SequenceFeature sf : seqFeatures.getValue()) + { + formatJalviewFeature(out, sequenceName, sf); + } } - else + if (!"".equals(group)) { - // new to map range - torange.add(ints.get(1)); - lastppos = ints.get(1) + ints.get(2) / 3; - lastpframe = ints.get(2) % 3; - torange.add(new Integer(lastppos)); + out.append(ENDGROUP).append(TAB).append(group).append(newline); } } - // from and to ranges must end up being a series of start/end intervals - if (fromrange.size() % 2 == 1) + + return count; + } + + /** + * Answers a list of mapped features visible in the (CDS/protein) complement, + * with feature positions translated 
to local sequence coordinates + * + * @param seq + * @param fr2 + * @return + */ + protected List findComplementaryFeatures(SequenceI seq, + FeatureRenderer fr2) + { + /* + * avoid duplication of features (e.g. peptide feature + * at all 3 mapped codon positions) + */ + List found = new ArrayList<>(); + List complementary = new ArrayList<>(); + + for (int pos = seq.getStart(); pos <= seq.getEnd(); pos++) { - throw new InvalidGFF3FieldException(attr, set, - "Couldn't parse the DNA alignment range correctly"); + MappedFeatures mf = fr2.findComplementFeaturesAtResidue(seq, pos); + + if (mf != null) + { + for (SequenceFeature sf : mf.features) + { + /* + * make a virtual feature with local coordinates + */ + if (!found.contains(sf)) + { + String group = sf.getFeatureGroup(); + if (group == null) + { + group = ""; + } + found.add(sf); + int begin = sf.getBegin(); + int end = sf.getEnd(); + int[] range = mf.getMappedPositions(begin, end); + SequenceFeature sf2 = new SequenceFeature(sf, range[0], + range[1], group, sf.getScore()); + complementary.add(sf2); + } + } + } } - if (torange.size() % 2 == 1) + + return complementary; + } + + /** + * Outputs any feature filters defined for visible feature types, sandwiched by + * STARTFILTERS and ENDFILTERS lines + * + * @param out + * @param visible + * @param featureFilters + */ + void outputFeatureFilters(StringBuilder out, + Map visible, + Map featureFilters) + { + if (visible == null || featureFilters == null + || featureFilters.isEmpty()) { - throw new InvalidGFF3FieldException(attr, set, - "Couldn't parse the protein alignment range correctly"); + return; } - // finally, build the map - int[] frommap = new int[fromrange.size()], tomap = new int[torange - .size()]; - int p = 0; - for (Integer ip : fromrange) + + boolean first = true; + for (String featureType : visible.keySet()) { - frommap[p++] = ip.intValue(); + FeatureMatcherSetI filter = featureFilters.get(featureType); + if (filter != null) + { + if (first) + { + first 
= false; + out.append(newline).append(STARTFILTERS).append(newline); + } + out.append(featureType).append(TAB).append(filter.toStableString()) + .append(newline); + } } - p = 0; - for (Integer ip : torange) + if (!first) { - tomap[p++] = ip.intValue(); + out.append(ENDFILTERS).append(newline); } - return new MapList(frommap, tomap, 3, 1); } - private List findNames(AlignmentI align, - List newseqs, boolean relaxedIdMatching, - List list) + /** + * Appends output of visible sequence features within feature groups to the + * output buffer. Groups other than the null or empty group are sandwiched by + * STARTGROUP and ENDGROUP lines. Answers the number of features written. + * + * @param out + * @param fr + * @param featureTypes + * @param sequences + * @param includeNonPositional + * @return + */ + private int outputFeaturesByGroup(StringBuilder out, + FeatureRenderer fr, String[] featureTypes, + SequenceI[] sequences, boolean includeNonPositional) { - List found = new ArrayList(); - for (String seqId : list) + List featureGroups = fr.getFeatureGroups(); + + /* + * sort groups alphabetically, and ensure that features with a + * null or empty group are output after those in named groups + */ + List sortedGroups = new ArrayList<>(featureGroups); + sortedGroups.remove(null); + sortedGroups.remove(""); + Collections.sort(sortedGroups); + sortedGroups.add(null); + sortedGroups.add(""); + + int count = 0; + List visibleGroups = fr.getDisplayedFeatureGroups(); + + /* + * loop over all groups (may be visible or not); + * non-positional features are output even if group is not visible + */ + for (String group : sortedGroups) { - SequenceI seq = findName(align, seqId, relaxedIdMatching, newseqs); - if (seq != null) + boolean firstInGroup = true; + boolean isNullGroup = group == null || "".equals(group); + + for (int i = 0; i < sequences.length; i++) + { + String sequenceName = sequences[i].getName(); + List features = new ArrayList<>(); + + /* + * get any non-positional 
features in this group, if wanted + * (for any feature type, whether visible or not) + */ + if (includeNonPositional) + { + features.addAll(sequences[i].getFeatures() + .getFeaturesForGroup(false, group)); + } + + /* + * add positional features for visible feature types, but + * (for named groups) only if feature group is visible + */ + if (featureTypes.length > 0 + && (isNullGroup || visibleGroups.contains(group))) + { + features.addAll(sequences[i].getFeatures().getFeaturesForGroup( + true, group, featureTypes)); + } + + for (SequenceFeature sf : features) + { + if (sf.isNonPositional() || fr.isVisible(sf)) + { + count++; + if (firstInGroup) + { + out.append(newline); + if (!isNullGroup) + { + out.append(STARTGROUP).append(TAB).append(group) + .append(newline); + } + } + firstInGroup = false; + formatJalviewFeature(out, sequenceName, sf); + } + } + } + + if (!isNullGroup && !firstInGroup) { - found.add(seq); + out.append(ENDGROUP).append(TAB).append(group).append(newline); } } - return found; + return count; } - private AlignmentI lastmatchedAl = null; - - private SequenceIdMatcher matcher = null; - /** - * clear any temporary handles used to speed up ID matching + * Formats one feature in Jalview format and appends to the string buffer + * + * @param out + * @param sequenceName + * @param sequenceFeature */ - private void resetMatcher() - { - lastmatchedAl = null; - matcher = null; - } - - private SequenceI findName(AlignmentI align, String seqId, - boolean relaxedIdMatching, List newseqs) + protected void formatJalviewFeature( + StringBuilder out, String sequenceName, + SequenceFeature sequenceFeature) { - SequenceI match = null; - if (relaxedIdMatching) + if (sequenceFeature.description == null + || sequenceFeature.description.equals("")) { - if (lastmatchedAl != align) - { - matcher = new SequenceIdMatcher( - (lastmatchedAl = align).getSequencesArray()); - if (newseqs != null) - { - matcher.addAll(newseqs); - } - } - match = matcher.findIdMatch(seqId); + 
out.append(sequenceFeature.type).append(TAB); } else { - match = align.findName(seqId, true); - if (match == null && newseqs != null) + if (sequenceFeature.links != null + && sequenceFeature.getDescription().indexOf("") == -1) { - for (SequenceI m : newseqs) + out.append(""); + } + + out.append(sequenceFeature.description); + if (sequenceFeature.links != null) + { + for (int l = 0; l < sequenceFeature.links.size(); l++) { - if (seqId.equals(m.getName())) + String label = sequenceFeature.links.elementAt(l); + String href = label.substring(label.indexOf("|") + 1); + label = label.substring(0, label.indexOf("|")); + + if (sequenceFeature.description.indexOf(href) == -1) { - return m; + out.append(" ") + .append(label).append(""); } } + + if (sequenceFeature.getDescription().indexOf("") == -1) + { + out.append(""); + } } - + + out.append(TAB); } - if (match==null && newseqs!=null) + out.append(sequenceName); + out.append("\t-1\t"); + out.append(sequenceFeature.begin); + out.append(TAB); + out.append(sequenceFeature.end); + out.append(TAB); + out.append(sequenceFeature.type); + if (!Float.isNaN(sequenceFeature.score)) { - match = new SequenceDummy(seqId); - if (relaxedIdMatching) - { - matcher.addAll(Arrays.asList(new SequenceI[] - { match })); - } - // add dummy sequence to the newseqs list - newseqs.add(match); + out.append(TAB); + out.append(sequenceFeature.score); } - return match; + out.append(newline); } - public void parseDescriptionHTML(SequenceFeature sf, boolean removeHTML) + + /** + * Parse method that is called when a GFF file is dragged to the desktop + */ + @Override + public void parse() { - if (sf.getDescription() == null) + AlignViewportI av = getViewport(); + if (av != null) { - return; + if (av.getAlignment() != null) + { + dataset = av.getAlignment().getDataset(); + } + if (dataset == null) + { + // working in the applet context ? 
+ dataset = av.getAlignment(); + } } - jalview.util.ParseHtmlBodyAndLinks parsed = new jalview.util.ParseHtmlBodyAndLinks( - sf.getDescription(), removeHTML, newline); - - sf.description = (removeHTML) ? parsed.getNonHtmlContent() - : sf.description; - for (String link : parsed.getLinks()) + else { - sf.addLink(link); + dataset = new Alignment(new SequenceI[] {}); } + Map featureColours = new HashMap<>(); + boolean parseResult = parse(dataset, featureColours, false, true); + if (!parseResult) + { + // pass error up somehow + } + if (av != null) + { + // update viewport with the dataset data ? + } + else + { + setSeqs(dataset.getSequencesArray()); + } } /** - * generate a features file for seqs includes non-pos features by default. + * Implementation of unused abstract method * - * @param seqs - * source of sequence features - * @param visible - * hash of feature types and colours - * @return features file contents + * @return error message */ - public String printJalviewFormat(SequenceI[] seqs, Map visible) + @Override + public String print(SequenceI[] sqs, boolean jvsuffix) { - return printJalviewFormat(seqs, visible, true, true); + System.out.println("Use printGffFormat() or printJalviewFormat()"); + return null; } /** - * generate a features file for seqs with colours from visible (if any) + * Returns features output in GFF2 format * - * @param seqs - * source of features + * @param sequences + * the sequences whose features are to be + * output * @param visible - * hash of Colours for each feature type - * @param visOnly - * when true only feature types in 'visible' will be output - * @param nonpos - * indicates if non-positional features should be output (regardless - * of group or type) - * @return features file contents + * a map whose keys are the type names of + * visible features + * @param visibleFeatureGroups + * @param includeNonPositionalFeatures + * @param includeComplement + * @return */ - public String printJalviewFormat(SequenceI[] seqs, Map 
visible, - boolean visOnly, boolean nonpos) + public String printGffFormat(SequenceI[] sequences, + FeatureRenderer fr, boolean includeNonPositionalFeatures, + boolean includeComplement) { - StringBuffer out = new StringBuffer(); - SequenceFeature[] next; - boolean featuresGen = false; - if (visOnly && !nonpos && (visible == null || visible.size() < 1)) + FeatureRenderer fr2 = null; + if (includeComplement) { - // no point continuing. - return "No Features Visible"; + AlignViewportI comp = fr.getViewport().getCodingComplement(); + fr2 = Desktop.getAlignFrameFor(comp).getFeatureRenderer(); } - if (visible != null && visOnly) + Map visibleColours = fr.getDisplayedFeatureCols(); + + StringBuilder out = new StringBuilder(256); + + out.append(String.format("%s %d\n", GFF_VERSION, gffVersion == 0 ? 2 : gffVersion)); + + String[] types = visibleColours == null ? new String[0] + : visibleColours.keySet() + .toArray(new String[visibleColours.keySet().size()]); + + for (SequenceI seq : sequences) { - // write feature colours only if we're given them and we are generating - // viewed features - // TODO: decide if feature links should also be written here ? - Iterator en = visible.keySet().iterator(); - String type, color; - while (en.hasNext()) + List seqFeatures = new ArrayList<>(); + List features = new ArrayList<>(); + if (includeNonPositionalFeatures) { - type = en.next().toString(); - - if (visible.get(type) instanceof GraduatedColor) - { - GraduatedColor gc = (GraduatedColor) visible.get(type); - color = (gc.isColourByLabel() ? "label|" : "") - + Format.getHexString(gc.getMinColor()) + "|" - + Format.getHexString(gc.getMaxColor()) - + (gc.isAutoScale() ? 
"|" : "|abso|") + gc.getMin() + "|" - + gc.getMax() + "|"; - if (gc.getThreshType() != AnnotationColourGradient.NO_THRESHOLD) - { - if (gc.getThreshType() == AnnotationColourGradient.BELOW_THRESHOLD) - { - color += "below"; - } - else - { - if (gc.getThreshType() != AnnotationColourGradient.ABOVE_THRESHOLD) - { - System.err.println("WARNING: Unsupported threshold type (" - + gc.getThreshType() + ") : Assuming 'above'"); - } - color += "above"; - } - // add the value - color += "|" + gc.getThresh(); - } - else - { - color += "none"; - } - } - else if (visible.get(type) instanceof java.awt.Color) - { - color = Format.getHexString((java.awt.Color) visible.get(type)); - } - else + features.addAll(seq.getFeatures().getNonPositionalFeatures()); + } + if (visibleColours != null && !visibleColours.isEmpty()) + { + features.addAll(seq.getFeatures().getPositionalFeatures(types)); + } + for (SequenceFeature sf : features) + { + if (sf.isNonPositional() || fr.isVisible(sf)) { - // legacy support for integer objects containing colour triplet values - color = Format.getHexString(new java.awt.Color(Integer - .parseInt(visible.get(type).toString()))); + /* + * drop features hidden by group visibility, colour threshold, + * or feature filter condition + */ + seqFeatures.add(sf); } - out.append(type); - out.append("\t"); - out.append(color); - out.append(newline); } - } - // Work out which groups are both present and visible - Vector groups = new Vector(); - int groupIndex = 0; - boolean isnonpos = false; - for (int i = 0; i < seqs.length; i++) - { - next = seqs[i].getSequenceFeatures(); - if (next != null) + if (includeComplement) { - for (int j = 0; j < next.length; j++) - { - isnonpos = next[j].begin == 0 && next[j].end == 0; - if ((!nonpos && isnonpos) - || (!isnonpos && visOnly && !visible - .containsKey(next[j].type))) - { - continue; - } + seqFeatures.addAll(findComplementaryFeatures(seq, fr2)); + } - if (next[j].featureGroup != null - && 
!groups.contains(next[j].featureGroup)) - { - groups.addElement(next[j].featureGroup); - } - } + /* + * sort features here if wanted + */ + for (SequenceFeature sf : seqFeatures) + { + formatGffFeature(out, seq, sf); + out.append(newline); } } - String group = null; - do + return out.toString(); + } + + /** + * Formats one feature as GFF and appends to the string buffer + */ + private void formatGffFeature(StringBuilder out, SequenceI seq, + SequenceFeature sf) + { + String source = sf.featureGroup; + if (source == null) { + source = sf.getDescription(); + } - if (groups.size() > 0 && groupIndex < groups.size()) - { - group = groups.elementAt(groupIndex).toString(); - out.append(newline); - out.append("STARTGROUP\t"); - out.append(group); - out.append(newline); - } - else + out.append(seq.getName()); + out.append(TAB); + out.append(source); + out.append(TAB); + out.append(sf.type); + out.append(TAB); + out.append(sf.begin); + out.append(TAB); + out.append(sf.end); + out.append(TAB); + out.append(sf.score); + out.append(TAB); + + int strand = sf.getStrand(); + out.append(strand == 1 ? "+" : (strand == -1 ? "-" : ".")); + out.append(TAB); + + String phase = sf.getPhase(); + out.append(phase == null ? "." : phase); + + if (sf.otherDetails != null && !sf.otherDetails.isEmpty()) + { + Map map = sf.otherDetails; + formatAttributes(out, map); + } + } + + /** + * A helper method that outputs attributes stored in the map as + * semicolon-delimited values e.g. + * + *
+   * AC_Male=0;AF_NFE=0.00000e 00;Hom_FIN=0;GQ_MEDIAN=9
+   * 
+ * + * A map-valued attribute is formatted as a comma-delimited list within braces, + * for example + * + *
+   * jvmap_CSQ={ALLELE_NUM=1,UNIPARC=UPI0002841053,Feature=ENST00000585561}
+   * 
+ * + * The {@code jvmap_} prefix designates a values map and is removed if the value + * is parsed when read in. (The GFF3 specification allows 'semi-structured data' + * to be represented provided the attribute name begins with a lower case + * letter.) + * + * @param sb + * @param map + * @see http://gmod.org/wiki/GFF3#GFF3_Format + */ + void formatAttributes(StringBuilder sb, Map map) + { + sb.append(TAB); + boolean first = true; + for (String key : map.keySet()) + { + if (SequenceFeature.STRAND.equals(key) + || SequenceFeature.PHASE.equals(key)) { - group = null; + /* + * values stashed in map but output to their own columns + */ + continue; } - - for (int i = 0; i < seqs.length; i++) { - next = seqs[i].getSequenceFeatures(); - if (next != null) + if (!first) { - for (int j = 0; j < next.length; j++) - { - isnonpos = next[j].begin == 0 && next[j].end == 0; - if ((!nonpos && isnonpos) - || (!isnonpos && visOnly && !visible - .containsKey(next[j].type))) - { - // skip if feature is nonpos and we ignore them or if we only - // output visible and it isn't non-pos and it's not visible - continue; - } - - if (group != null - && (next[j].featureGroup == null || !next[j].featureGroup - .equals(group))) - { - continue; - } - - if (group == null && next[j].featureGroup != null) - { - continue; - } - // we have features to output - featuresGen = true; - if (next[j].description == null - || next[j].description.equals("")) - { - out.append(next[j].type + "\t"); - } - else - { - if (next[j].links != null - && next[j].getDescription().indexOf("") == -1) - { - out.append(""); - } - - out.append(next[j].description + " "); - if (next[j].links != null) - { - for (int l = 0; l < next[j].links.size(); l++) - { - String label = next[j].links.elementAt(l).toString(); - String href = label.substring(label.indexOf("|") + 1); - label = label.substring(0, label.indexOf("|")); - - if (next[j].description.indexOf(href) == -1) - { - out.append("" + label + ""); - } - } - - if 
(next[j].getDescription().indexOf("") == -1) - { - out.append(""); - } - } - - out.append("\t"); - } - out.append(seqs[i].getName()); - out.append("\t-1\t"); - out.append(next[j].begin); - out.append("\t"); - out.append(next[j].end); - out.append("\t"); - out.append(next[j].type); - if (!Float.isNaN(next[j].score)) - { - out.append("\t"); - out.append(next[j].score); - } - out.append(newline); - } + sb.append(";"); } } - - if (group != null) + first = false; + Object value = map.get(key); + if (value instanceof Map) { - out.append("ENDGROUP\t"); - out.append(group); - out.append(newline); - groupIndex++; + formatMapAttribute(sb, key, (Map) value); } else { - break; + String formatted = StringUtils.urlEncode(value.toString(), + GffHelperI.GFF_ENCODABLE); + sb.append(key).append(EQUALS).append(formatted); } + } + } - } while (groupIndex < groups.size() + 1); - - if (!featuresGen) + /** + * Formats the map entries as + * + *
+   * key=key1=value1,key2=value2,...
+   * 
+ * + * and appends this to the string buffer + * + * @param sb + * @param key + * @param map + */ + private void formatMapAttribute(StringBuilder sb, String key, + Map map) + { + if (map == null || map.isEmpty()) { - return "No Features Visible"; + return; } - return out.toString(); + /* + * AbstractMap.toString would be a shortcut here, but more reliable + * to code the required format in case toString changes in future + */ + sb.append(key).append(EQUALS); + boolean first = true; + for (Entry entry : map.entrySet()) + { + if (!first) + { + sb.append(","); + } + first = false; + sb.append(entry.getKey().toString()).append(EQUALS); + String formatted = StringUtils.urlEncode(entry.getValue().toString(), + GffHelperI.GFF_ENCODABLE); + sb.append(formatted); + } } /** - * generate a gff file for sequence features includes non-pos features by - * default. + * Returns a mapping given list of one or more Align descriptors (exonerate + * format) * - * @param seqs - * @param visible + * @param alignedRegions + * a list of "Align fromStart toStart fromCount" + * @param mapIsFromCdna + * if true, 'from' is dna, else 'from' is protein + * @param strand + * either 1 (forward) or -1 (reverse) * @return + * @throws IOException */ - public String printGFFFormat(SequenceI[] seqs, Map visible) - { - return printGFFFormat(seqs, visible, true, true); - } - - public String printGFFFormat(SequenceI[] seqs, Map visible, - boolean visOnly, boolean nonpos) + protected MapList constructCodonMappingFromAlign( + List alignedRegions, boolean mapIsFromCdna, int strand) + throws IOException { - StringBuffer out = new StringBuffer(); - SequenceFeature[] next; - String source; - boolean isnonpos; - for (int i = 0; i < seqs.length; i++) + if (strand == 0) + { + throw new IOException( + "Invalid strand for a codon mapping (cannot be 0)"); + } + int regions = alignedRegions.size(); + // arrays to hold [start, end] for each aligned region + int[] fromRanges = new int[regions * 2]; // from dna + int[] 
toRanges = new int[regions * 2]; // to protein + int fromRangesIndex = 0; + int toRangesIndex = 0; + + for (String range : alignedRegions) { - if (seqs[i].getSequenceFeatures() != null) + /* + * Align mapFromStart mapToStart mapFromCount + * e.g. if mapIsFromCdna + * Align 11270 143 120 + * means: + * 120 bases from pos 11270 align to pos 143 in peptide + * if !mapIsFromCdna this would instead be + * Align 143 11270 40 + */ + String[] tokens = range.split(" "); + if (tokens.length != 3) { - next = seqs[i].getSequenceFeatures(); - for (int j = 0; j < next.length; j++) - { - isnonpos = next[j].begin == 0 && next[j].end == 0; - if ((!nonpos && isnonpos) - || (!isnonpos && visOnly && !visible - .containsKey(next[j].type))) - { - continue; - } + throw new IOException("Wrong number of fields for Align"); + } + int fromStart = 0; + int toStart = 0; + int fromCount = 0; + try + { + fromStart = Integer.parseInt(tokens[0]); + toStart = Integer.parseInt(tokens[1]); + fromCount = Integer.parseInt(tokens[2]); + } catch (NumberFormatException nfe) + { + throw new IOException( + "Invalid number in Align field: " + nfe.getMessage()); + } - source = next[j].featureGroup; - if (source == null) - { - source = next[j].getDescription(); - } + /* + * Jalview always models from dna to protein, so adjust values if the + * GFF mapping is from protein to dna + */ + if (!mapIsFromCdna) + { + fromCount *= 3; + int temp = fromStart; + fromStart = toStart; + toStart = temp; + } + fromRanges[fromRangesIndex++] = fromStart; + fromRanges[fromRangesIndex++] = fromStart + strand * (fromCount - 1); - out.append(seqs[i].getName()); - out.append("\t"); - out.append(source); - out.append("\t"); - out.append(next[j].type); - out.append("\t"); - out.append(next[j].begin); - out.append("\t"); - out.append(next[j].end); - out.append("\t"); - out.append(next[j].score); - out.append("\t"); - - if (next[j].getValue("STRAND") != null) - { - out.append(next[j].getValue("STRAND")); - out.append("\t"); - } - else 
- { - out.append(".\t"); - } + /* + * If a codon has an intron gap, there will be contiguous 'toRanges'; + * this is handled for us by the MapList constructor. + * (It is not clear that exonerate ever generates this case) + */ + toRanges[toRangesIndex++] = toStart; + toRanges[toRangesIndex++] = toStart + (fromCount - 1) / 3; + } - if (next[j].getValue("FRAME") != null) - { - out.append(next[j].getValue("FRAME")); - } - else - { - out.append("."); - } - // TODO: verify/check GFF - should there be a /t here before attribute - // output ? + return new MapList(fromRanges, toRanges, 3, 1); + } + + /** + * Parse a GFF format feature. This may include creating a 'dummy' sequence to + * hold the feature, or for its mapped sequence, or both, to be resolved + * either later in the GFF file (##FASTA section), or when the user loads + * additional sequences. + * + * @param gffColumns the fields of one GFF line; a line with fewer than 5 fields is reported on stderr and skipped + * @param alignment the alignment in which the sequence named by the first field is looked up + * @param relaxedIdMatching passed on to findSequence and to the GFF helper + * @param newseqs passed on to findSequence and to the GFF helper (presumably the placeholder sequences created so far - confirm) + * @return null if the line has too few fields or the GFF helper throws an IOException; the sequence from findSequence if there is no helper or no feature results. NOTE(review): once a feature has been added, the findName loop only ends when seq is null, so null is returned in that case as well - confirm callers do not rely on the return value + */ + protected SequenceI parseGff(String[] gffColumns, AlignmentI alignment, + boolean relaxedIdMatching, List newseqs) + { + /* + * GFF: seqid source type start end score strand phase [attributes] + */ + if (gffColumns.length < 5) + { + System.err.println("Ignoring GFF feature line with too few columns (" + + gffColumns.length + ")"); + return null; + } - if (next[j].getValue("ATTRIBUTES") != null) + /* + * locate referenced sequence in alignment _or_ + * as a forward or external reference (SequenceDummy) + */ + String seqId = gffColumns[0]; + SequenceI seq = findSequence(seqId, alignment, newseqs, + relaxedIdMatching); + + SequenceFeature sf = null; + GffHelperI helper = GffHelperFactory.getHelper(gffColumns); + if (helper != null) + { + try + { + sf = helper.processGff(seq, gffColumns, alignment, newseqs, + relaxedIdMatching); + if (sf != null) + { + seq.addSequenceFeature(sf); + while ((seq = alignment.findName(seq, seqId, true)) != null) { - out.append(next[j].getValue("ATTRIBUTES")); + 
seq.addSequenceFeature(new SequenceFeature(sf)); } - - out.append(newline); - } + } catch (IOException e) + { + System.err.println("GFF parsing failed with: " + e.getMessage()); + return null; } } - return out.toString(); + return seq; } /** - * this is only for the benefit of object polymorphism - method does nothing. + * After encountering ##fasta in a GFF3 file, process the remainder of the + * file as FASTA sequence data. Any placeholder sequences created during + * feature parsing are updated with the actual sequences. + * + * @param align the alignment whose codon frame mappings are updated and to which the parsed sequences are added + * @param newseqs the sequences matched by id against the parsed FASTA sequences; a matching SequenceDummy is filled in from the parsed sequence and removed from this list + * @throws IOException NOTE(review): presumably raised while FastaFile reads the rest of the input - confirm; an IOException from mark() is caught and ignored */ - public void parse() + protected void processAsFasta(AlignmentI align, List newseqs) + throws IOException { - // IGNORED + try + { + mark(); + } catch (IOException q) + { + // NOTE(review): a failure of mark() is silently ignored here - confirm this is intended + } + // Opening a FastaFile object with the remainder of this object's dataIn. + // Tell the constructor to NOT close the dataIn when finished. + FastaFile parser = new FastaFile(this, false); + List includedseqs = parser.getSeqs(); + + SequenceIdMatcher smatcher = new SequenceIdMatcher(newseqs); + + /* + * iterate over includedseqs, replacing matching ones with newseqs + * sequences. Generic iterator not used here because we modify + * includedseqs as we go + */ + for (int p = 0, pSize = includedseqs.size(); p < pSize; p++) + { + // search for any dummy seqs that this sequence can be used to update + SequenceI includedSeq = includedseqs.get(p); + SequenceI dummyseq = smatcher.findIdMatch(includedSeq); + if (dummyseq != null && dummyseq instanceof SequenceDummy) + { + // probably have the pattern wrong + // idea is that a flyweight proxy for a sequence ID can be created for + // 1. stable reference creation + // 2. addition of annotation + // 3. future replacement by a real sequence + // current pattern is to create SequenceDummy objects - a convenience + // constructor for a Sequence. + // problem is that when promoted to a real sequence, all references + // need to be updated somehow. 
We avoid that by keeping the same object. + ((SequenceDummy) dummyseq).become(includedSeq); + dummyseq.createDatasetSequence(); + + /* + * Update mappings so they are now to the dataset sequence + */ + for (AlignedCodonFrame mapping : align.getCodonFrames()) + { + mapping.updateToDataset(dummyseq); + } + + /* + * replace parsed sequence with the realised forward reference + */ + includedseqs.set(p, dummyseq); + + /* + * and remove from the newseqs list + */ + newseqs.remove(dummyseq); + } + } + + /* + * finally add sequences to the dataset + */ + for (SequenceI seq : includedseqs) + { + // experimental: mapping-based 'alignment' to query sequence + AlignmentUtils.alignSequenceAs(seq, align, + String.valueOf(align.getGapCharacter()), false, true); + + // rename sequences if GFF handler requested this + // TODO a more elegant way e.g. gffHelper.postProcess(newseqs) ? + List sfs = seq.getFeatures().getPositionalFeatures(); + if (!sfs.isEmpty()) + { + String newName = (String) sfs.get(0).getValue( + GffHelperI.RENAME_TOKEN); + if (newName != null) + { + seq.setName(newName); + } + } + align.addSequence(seq); + } } /** - * this is only for the benefit of object polymorphism - method does nothing. + * Process a ## directive * - * @return error message + * @param line the directive line; it is trimmed, and its first two characters are dropped before it is split on single spaces + * @param gffProps receives the value of a 'species-build' directive under the key "species-build" + * @param align passed to processAsFasta when a 'fasta' directive is seen + * @param newseqs passed to processAsFasta when a 'fasta' directive is seen + * @throws IOException propagated from processAsFasta when a 'fasta' directive is seen */ - public String print() + protected void processGffPragma(String line, Map gffProps, + AlignmentI align, List newseqs) throws IOException { - return "USE printGFFFormat() or printJalviewFormat()"; - } + line = line.trim(); + if ("###".equals(line)) + { + // close off any open 'forward references' + return; + } + + String[] tokens = line.substring(2).split(" "); + String pragma = tokens[0]; + String value = tokens.length == 1 ? null : tokens[1]; + if ("gff-version".equalsIgnoreCase(pragma)) + { + if (value != null) + { + try + { + // value may be e.g. 
"3.1.2" + gffVersion = Integer.parseInt(value.split("\\.")[0]); + } catch (NumberFormatException e) + { + // ignore: gffVersion is left unchanged when the leading number cannot be parsed + } + } + } + else if ("sequence-region".equalsIgnoreCase(pragma)) + { + // could capture if wanted here + } + else if ("feature-ontology".equalsIgnoreCase(pragma)) + { + // should resolve against the specified feature ontology URI + } + else if ("attribute-ontology".equalsIgnoreCase(pragma)) + { + // URI of attribute ontology - not currently used in GFF3 + } + else if ("source-ontology".equalsIgnoreCase(pragma)) + { + // URI of source ontology - not currently used in GFF3 + } + else if ("species-build".equalsIgnoreCase(pragma)) + { + // save URI of specific NCBI taxon version of annotations + gffProps.put("species-build", value); + } + else if ("fasta".equalsIgnoreCase(pragma)) + { + // process the rest of the file as a fasta file and replace any dummy + // sequence IDs + processAsFasta(align, newseqs); + } + else + { + System.err.println("Ignoring unknown pragma: " + line); + } + } }