X-Git-Url: http://source.jalview.org/gitweb/?a=blobdiff_plain;f=src%2Fjalview%2Fio%2FFeaturesFile.java;h=e0722c05cc832abc81dcbc134da6954470966206;hb=72ccf8af9a93159507587c9c0c7a39f77212ab0b;hp=bd7127ff225c4f72767ddd7b69d4fcce6f069024;hpb=0900cfda915f917ce29ced5401b7118ff2a5372a;p=jalview.git diff --git a/src/jalview/io/FeaturesFile.java b/src/jalview/io/FeaturesFile.java index bd7127f..e0722c0 100755 --- a/src/jalview/io/FeaturesFile.java +++ b/src/jalview/io/FeaturesFile.java @@ -20,18 +20,22 @@ */ package jalview.io; +import jalview.analysis.AlignmentUtils; import jalview.analysis.SequenceIdMatcher; import jalview.api.AlignViewportI; +import jalview.api.FeatureColourI; +import jalview.api.FeaturesSourceI; import jalview.datamodel.AlignedCodonFrame; import jalview.datamodel.Alignment; import jalview.datamodel.AlignmentI; import jalview.datamodel.SequenceDummy; import jalview.datamodel.SequenceFeature; import jalview.datamodel.SequenceI; -import jalview.schemes.AnnotationColourGradient; -import jalview.schemes.GraduatedColor; -import jalview.schemes.UserColourScheme; -import jalview.util.Format; +import jalview.io.gff.GffHelperBase; +import jalview.io.gff.GffHelperFactory; +import jalview.io.gff.GffHelperI; +import jalview.schemes.FeatureColour; +import jalview.util.ColorUtils; import jalview.util.MapList; import jalview.util.ParseHtmlBodyAndLinks; import jalview.util.StringUtils; @@ -40,12 +44,11 @@ import java.awt.Color; import java.io.IOException; import java.util.ArrayList; import java.util.Arrays; +import java.util.Collections; import java.util.HashMap; -import java.util.Iterator; import java.util.List; import java.util.Map; import java.util.Map.Entry; -import java.util.StringTokenizer; /** * Parses and writes features files, which may be in Jalview, GFF2 or GFF3 @@ -63,25 +66,11 @@ import java.util.StringTokenizer; * @author jbprocter * @author gmcarstairs */ -public class FeaturesFile extends AlignFile +public class FeaturesFile extends AlignFile implements FeaturesSourceI { - private static final String NOTE = "Note"; - - private static final String ALIGN = "Align"; - - private static final String QUERY = "Query"; - - private static final String TARGET = "Target"; - - private static final String SIMILARITY = "similarity"; - - protected static final String STRAND = "STRAND"; + private static final String ID_NOT_SPECIFIED = "ID_NOT_SPECIFIED"; - protected static final String FRAME = "FRAME"; - - protected static final String ATTRIBUTES = "ATTRIBUTES"; - - protected static final String TAB = "\t"; + private static final String NOTE = "Note"; protected static final String GFF_VERSION = "##gff-version"; @@ -103,13 +92,14 @@ public class FeaturesFile extends AlignFile /** * Constructor which does not parse the file immediately * - * @param inFile - * @param type + * @param file + * @param paste * @throws IOException */ - public FeaturesFile(String inFile, String type) throws IOException + public FeaturesFile(String file, DataSourceType paste) + throws IOException { - super(false, inFile, type); + super(false, file, paste); } /** @@ -125,14 +115,14 @@ public class FeaturesFile extends AlignFile * Constructor that optionally parses the file immediately * * @param parseImmediately - * @param inFile + * @param file * @param type * @throws IOException */ - public FeaturesFile(boolean parseImmediately, String inFile, String type) - throws IOException + public FeaturesFile(boolean parseImmediately, String file, + DataSourceType type) throws IOException { - super(parseImmediately, inFile, type); + super(parseImmediately, file, type); } /** @@ -147,8 +137,8 @@ public class FeaturesFile extends AlignFile * - process html strings into plain text * @return true if features were added */ - public boolean parse(AlignmentI align, Map colours, - boolean removeHTML) + public boolean parse(AlignmentI align, + Map colours, boolean removeHTML) { return parse(align, colours, removeHTML, false); } @@ -184,19 +174,20 @@ public class FeaturesFile extends AlignFile * - when true, ID matches to compound sequence IDs are allowed * @return true if features were added */ - public boolean parse(AlignmentI align, Map colours, - boolean removeHTML, boolean relaxedIdmatching) + public boolean parse(AlignmentI align, + Map colours, boolean removeHTML, + boolean relaxedIdmatching) { - Map gffProps = new HashMap(); + Map gffProps = new HashMap<>(); /* * keep track of any sequences we try to create from the data */ - List newseqs = new ArrayList(); + List newseqs = new ArrayList<>(); String line = null; try { - StringTokenizer st; + String[] gffColumns; String featureGroup = null; while ((line = nextLine()) != null) @@ -211,41 +202,46 @@ public class FeaturesFile extends AlignFile continue; } - st = new StringTokenizer(line, TAB); - if (st.countTokens() == 1) + gffColumns = line.split("\\t"); // tab as regex + if (gffColumns.length == 1) { if (line.trim().equalsIgnoreCase("GFF")) { /* - * Jalview features file with appendded GFF - * assume GFF2 (though it may declare gff-version 3) + * Jalview features file with appended GFF + * assume GFF2 (though it may declare ##gff-version 3) */ gffVersion = 2; continue; } } - if (st.countTokens() > 1 && st.countTokens() < 4) + if (gffColumns.length > 1 && gffColumns.length < 4) { /* * if 2 or 3 tokens, we anticipate either 'startgroup', 'endgroup' or - * a feature type colour specification; not GFF format + * a feature type colour specification */ - String ft = st.nextToken(); + String ft = gffColumns[0]; if (ft.equalsIgnoreCase("startgroup")) { - featureGroup = st.nextToken(); + featureGroup = gffColumns[1]; } else if (ft.equalsIgnoreCase("endgroup")) { // We should check whether this is the current group, - // but at present theres no way of showing more than 1 group - st.nextToken(); + // but at present there's no way of showing more than 1 group featureGroup = null; } else { - parseFeatureColour(line, ft, st, colours); + String colscheme = gffColumns[1]; + FeatureColourI colour = FeatureColour + .parseJalviewFeatureColour(colscheme); + if (colour != null) + { + colours.put(ft, colour); + } } continue; } @@ -257,12 +253,12 @@ public class FeaturesFile extends AlignFile */ if (gffVersion == 0) { - parseJalviewFeature(line, st, align, colours, removeHTML, + parseJalviewFeature(line, gffColumns, align, colours, removeHTML, relaxedIdmatching, featureGroup); } else { - parseGffFeature(st, align, relaxedIdmatching, newseqs); + parseGff(gffColumns, align, relaxedIdmatching, newseqs); } } resetMatcher(); @@ -277,55 +273,67 @@ public class FeaturesFile extends AlignFile return false; } + /* + * experimental - add any dummy sequences with features to the alignment + * - we need them for Ensembl feature extraction - though maybe not otherwise + */ + for (SequenceI newseq : newseqs) + { + if (newseq.getFeatures().hasFeatures()) + { + align.addSequence(newseq); + } + } return true; } /** - * Try to parse a Jalview format feature specification. Returns true if - * successful or false if not. + * Try to parse a Jalview format feature specification and add it as a + * sequence feature to any matching sequences in the alignment. Returns true + * if successful (a feature was added), or false if not. * * @param line - * @param st + * @param gffColumns * @param alignment * @param featureColours * @param removeHTML * @param relaxedIdmatching * @param featureGroup */ - protected boolean parseJalviewFeature(String line, StringTokenizer st, - AlignmentI alignment, Map featureColours, - boolean removeHTML, boolean relaxedIdMatching, String featureGroup) + protected boolean parseJalviewFeature(String line, String[] gffColumns, + AlignmentI alignment, Map featureColours, + boolean removeHTML, boolean relaxedIdMatching, + String featureGroup) { /* - * Jalview: description seqid seqIndex start end type [score] + * tokens: description seqid seqIndex start end type [score] */ - if (st.countTokens() < 6) + if (gffColumns.length < 6) { System.err.println("Ignoring feature line '" + line - + "' with unexpected number of columns (" + st.countTokens() - + ")"); + + "' with too few columns (" + gffColumns.length + ")"); return false; } - String desc = st.nextToken(); - String seqId = st.nextToken(); - SequenceI seq = findName(alignment, null, relaxedIdMatching, seqId); + String desc = gffColumns[0]; + String seqId = gffColumns[1]; + SequenceI seq = findSequence(seqId, alignment, null, relaxedIdMatching); - if (!seqId.equals("ID_NOT_SPECIFIED")) + if (!ID_NOT_SPECIFIED.equals(seqId)) { - seq = findName(alignment, null, relaxedIdMatching, seqId); - st.nextToken(); + seq = findSequence(seqId, alignment, null, relaxedIdMatching); } else { seqId = null; seq = null; + String seqIndex = gffColumns[2]; try { - int idx = Integer.parseInt(st.nextToken()); + int idx = Integer.parseInt(seqIndex); seq = alignment.getSequenceAt(idx); } catch (NumberFormatException ex) { - // continue + System.err.println("Invalid sequence index: " + seqIndex); } } @@ -335,10 +343,10 @@ public class FeaturesFile extends AlignFile return false; } - int startPos = Integer.parseInt(st.nextToken()); - int endPos = Integer.parseInt(st.nextToken()); + int startPos = Integer.parseInt(gffColumns[3]); + int endPos = Integer.parseInt(gffColumns[4]); - String ft = st.nextToken(); + String ft = gffColumns[5]; if (!featureColours.containsKey(ft)) { @@ -346,23 +354,26 @@ public class FeaturesFile extends AlignFile * Perhaps an old style groups file with no colours - * synthesize a colour from the feature type */ - UserColourScheme ucs = new UserColourScheme(ft); - featureColours.put(ft, ucs.findColour('A')); + Color colour = ColorUtils.createColourFromName(ft); + featureColours.put(ft, new FeatureColour(colour)); } - SequenceFeature sf = new SequenceFeature(ft, desc, "", - startPos, endPos, featureGroup); - if (st.hasMoreTokens()) + SequenceFeature sf = null; + if (gffColumns.length > 6) { - float score = 0f; + float score = Float.NaN; try { - score = new Float(st.nextToken()).floatValue(); - // update colourgradient bounds if allowed to + score = new Float(gffColumns[6]).floatValue(); } catch (NumberFormatException ex) { - // leave as 0 + sf = new SequenceFeature(ft, desc, startPos, endPos, featureGroup); } - sf.setScore(score); + sf = new SequenceFeature(ft, desc, startPos, endPos, score, + featureGroup); + } + else + { + sf = new SequenceFeature(ft, desc, startPos, endPos, featureGroup); } parseDescriptionHTML(sf, removeHTML); @@ -378,225 +389,6 @@ public class FeaturesFile extends AlignFile } /** - * Process a feature type colour specification - * - * @param line - * the current input line (for error messages only) - * @param featureType - * the first token on the line - * @param st - * holds remaining tokens on the line - * @param colours - * map to which to add derived colour specification - */ - protected void parseFeatureColour(String line, String featureType, - StringTokenizer st, Map colours) - { - Object colour = null; - String colscheme = st.nextToken(); - if (colscheme.indexOf("|") > -1 - || colscheme.trim().equalsIgnoreCase("label")) - { - colour = parseGraduatedColourScheme(line, colscheme); - } - else - { - UserColourScheme ucs = new UserColourScheme(colscheme); - colour = ucs.findColour('A'); - } - if (colour != null) - { - colours.put(featureType, colour); - } - } - - /** - * Parse a Jalview graduated colour descriptor - * - * @param line - * @param colourDescriptor - * @return - */ - protected GraduatedColor parseGraduatedColourScheme(String line, - String colourDescriptor) - { - // Parse '|' separated graduated colourscheme fields: - // [label|][mincolour|maxcolour|[absolute|]minvalue|maxvalue|thresholdtype|thresholdvalue] - // can either provide 'label' only, first is optional, next two - // colors are required (but may be - // left blank), next is optional, nxt two min/max are required. - // first is either 'label' - // first/second and third are both hexadecimal or word equivalent - // colour. - // next two are values parsed as floats. - // fifth is either 'above','below', or 'none'. - // sixth is a float value and only required when fifth is either - // 'above' or 'below'. - StringTokenizer gcol = new StringTokenizer(colourDescriptor, "|", true); - // set defaults - float min = Float.MIN_VALUE, max = Float.MAX_VALUE; - boolean labelCol = false; - // Parse spec line - String mincol = gcol.nextToken(); - if (mincol == "|") - { - System.err - .println("Expected either 'label' or a colour specification in the line: " - + line); - return null; - } - String maxcol = null; - if (mincol.toLowerCase().indexOf("label") == 0) - { - labelCol = true; - mincol = (gcol.hasMoreTokens() ? gcol.nextToken() : null); // skip '|' - mincol = (gcol.hasMoreTokens() ? gcol.nextToken() : null); - } - String abso = null, minval, maxval; - if (mincol != null) - { - // at least four more tokens - if (mincol.equals("|")) - { - mincol = ""; - } - else - { - gcol.nextToken(); // skip next '|' - } - // continue parsing rest of line - maxcol = gcol.nextToken(); - if (maxcol.equals("|")) - { - maxcol = ""; - } - else - { - gcol.nextToken(); // skip next '|' - } - abso = gcol.nextToken(); - gcol.nextToken(); // skip next '|' - if (abso.toLowerCase().indexOf("abso") != 0) - { - minval = abso; - abso = null; - } - else - { - minval = gcol.nextToken(); - gcol.nextToken(); // skip next '|' - } - maxval = gcol.nextToken(); - if (gcol.hasMoreTokens()) - { - gcol.nextToken(); // skip next '|' - } - try - { - if (minval.length() > 0) - { - min = Float.valueOf(minval); - } - } catch (Exception e) - { - System.err - .println("Couldn't parse the minimum value for graduated colour for type (" - + colourDescriptor - + ") - did you misspell 'auto' for the optional automatic colour switch ?"); - e.printStackTrace(); - } - try - { - if (maxval.length() > 0) - { - max = Float.valueOf(maxval); - } - } catch (Exception e) - { - System.err - .println("Couldn't parse the maximum value for graduated colour for type (" - + colourDescriptor + ")"); - e.printStackTrace(); - } - } - else - { - // add in some dummy min/max colours for the label-only - // colourscheme. - mincol = "FFFFFF"; - maxcol = "000000"; - } - - GraduatedColor colour = null; - try - { - colour = new GraduatedColor( - new UserColourScheme(mincol).findColour('A'), - new UserColourScheme(maxcol).findColour('A'), min, max); - } catch (Exception e) - { - System.err.println("Couldn't parse the graduated colour scheme (" - + colourDescriptor + ")"); - e.printStackTrace(); - } - if (colour != null) - { - colour.setColourByLabel(labelCol); - colour.setAutoScaled(abso == null); - // add in any additional parameters - String ttype = null, tval = null; - if (gcol.hasMoreTokens()) - { - // threshold type and possibly a threshold value - ttype = gcol.nextToken(); - if (ttype.toLowerCase().startsWith("below")) - { - colour.setThreshType(AnnotationColourGradient.BELOW_THRESHOLD); - } - else if (ttype.toLowerCase().startsWith("above")) - { - colour.setThreshType(AnnotationColourGradient.ABOVE_THRESHOLD); - } - else - { - colour.setThreshType(AnnotationColourGradient.NO_THRESHOLD); - if (!ttype.toLowerCase().startsWith("no")) - { - System.err.println("Ignoring unrecognised threshold type : " - + ttype); - } - } - } - if (colour.getThreshType() != AnnotationColourGradient.NO_THRESHOLD) - { - try - { - gcol.nextToken(); - tval = gcol.nextToken(); - colour.setThresh(new Float(tval).floatValue()); - } catch (Exception e) - { - System.err.println("Couldn't parse threshold value as a float: (" - + tval + ")"); - e.printStackTrace(); - } - } - // parse the thresh-is-min token ? - if (gcol.hasMoreTokens()) - { - System.err - .println("Ignoring additional tokens in parameters in graduated colour specification\n"); - while (gcol.hasMoreTokens()) - { - System.err.println("|" + gcol.nextToken()); - } - System.err.println("\n"); - } - } - return colour; - } - - /** * clear any temporary handles used to speed up ID matching */ protected void resetMatcher() @@ -618,15 +410,18 @@ public class FeaturesFile extends AlignFile * list, and returns it * * + * @param seqId * @param align * @param newseqs * @param relaxedIdMatching - * @param seqId + * * @return */ - protected SequenceI findName(AlignmentI align, List newseqs, - boolean relaxedIdMatching, String seqId) + protected SequenceI findSequence(String seqId, AlignmentI align, + List newseqs, boolean relaxedIdMatching) { + // TODO encapsulate in SequenceIdMatcher, share the matcher + // with the GffHelper (removing code duplication) SequenceI match = null; if (relaxedIdMatching) { @@ -678,260 +473,191 @@ public class FeaturesFile extends AlignFile ParseHtmlBodyAndLinks parsed = new ParseHtmlBodyAndLinks( sf.getDescription(), removeHTML, newline); - sf.description = (removeHTML) ? parsed.getNonHtmlContent() - : sf.description; + if (removeHTML) + { + sf.setDescription(parsed.getNonHtmlContent()); + } + for (String link : parsed.getLinks()) { sf.addLink(link); } - - } - - /** - * generate a features file for seqs includes non-pos features by default. - * - * @param sequences - * source of sequence features - * @param visible - * hash of feature types and colours - * @return features file contents - */ - public String printJalviewFormat(SequenceI[] sequences, - Map visible) - { - return printJalviewFormat(sequences, visible, true, true); } /** - * generate a features file for seqs with colours from visible (if any) + * Returns contents of a Jalview format features file, for visible features, + * as filtered by type and group. Features with a null group are displayed if + * their feature type is visible. Non-positional features may optionally be + * included (with no check on type or group). * * @param sequences * source of features * @param visible - * hash of Colours for each feature type - * @param visOnly - * when true only feature types in 'visible' will be output - * @param nonpos - * indicates if non-positional features should be output (regardless - * of group or type) - * @return features file contents + * map of colour for each visible feature type + * @param visibleFeatureGroups + * @param includeNonPositional + * if true, include non-positional features (regardless of group or + * type) + * @return */ public String printJalviewFormat(SequenceI[] sequences, - Map visible, boolean visOnly, boolean nonpos) + Map visible, + List visibleFeatureGroups, boolean includeNonPositional) { - StringBuilder out = new StringBuilder(256); - boolean featuresGen = false; - if (visOnly && !nonpos && (visible == null || visible.size() < 1)) + if (!includeNonPositional && (visible == null || visible.isEmpty())) { // no point continuing. return "No Features Visible"; } - if (visible != null && visOnly) + /* + * write out feature colours (if we know them) + */ + // TODO: decide if feature links should also be written here ? + StringBuilder out = new StringBuilder(256); + if (visible != null) { - // write feature colours only if we're given them and we are generating - // viewed features - // TODO: decide if feature links should also be written here ? - Iterator en = visible.keySet().iterator(); - String featureType, color; - while (en.hasNext()) + for (Entry featureColour : visible.entrySet()) { - featureType = en.next().toString(); - - if (visible.get(featureType) instanceof GraduatedColor) - { - GraduatedColor gc = (GraduatedColor) visible.get(featureType); - color = (gc.isColourByLabel() ? "label|" : "") - + Format.getHexString(gc.getMinColor()) + "|" - + Format.getHexString(gc.getMaxColor()) - + (gc.isAutoScale() ? "|" : "|abso|") + gc.getMin() + "|" - + gc.getMax() + "|"; - if (gc.getThreshType() != AnnotationColourGradient.NO_THRESHOLD) - { - if (gc.getThreshType() == AnnotationColourGradient.BELOW_THRESHOLD) - { - color += "below"; - } - else - { - if (gc.getThreshType() != AnnotationColourGradient.ABOVE_THRESHOLD) - { - System.err.println("WARNING: Unsupported threshold type (" - + gc.getThreshType() + ") : Assuming 'above'"); - } - color += "above"; - } - // add the value - color += "|" + gc.getThresh(); - } - else - { - color += "none"; - } - } - else if (visible.get(featureType) instanceof Color) - { - color = Format.getHexString((Color) visible.get(featureType)); - } - else - { - // legacy support for integer objects containing colour triplet values - color = Format.getHexString(new Color(Integer.parseInt(visible - .get(featureType).toString()))); - } - out.append(featureType); - out.append(TAB); - out.append(color); - out.append(newline); + FeatureColourI colour = featureColour.getValue(); + out.append(colour.toJalviewFormat(featureColour.getKey())).append( + newline); } } - // Work out which groups are both present and visible - List groups = new ArrayList(); - int groupIndex = 0; - boolean isnonpos = false; - SequenceFeature[] features; - for (int i = 0; i < sequences.length; i++) + String[] types = visible == null ? new String[0] : visible.keySet() + .toArray(new String[visible.keySet().size()]); + + /* + * sort groups alphabetically, and ensure that features with a + * null or empty group are output after those in named groups + */ + List sortedGroups = new ArrayList<>(visibleFeatureGroups); + sortedGroups.remove(null); + sortedGroups.remove(""); + Collections.sort(sortedGroups); + sortedGroups.add(null); + sortedGroups.add(""); + + boolean foundSome = false; + + /* + * first output any non-positional features + */ + if (includeNonPositional) { - features = sequences[i].getSequenceFeatures(); - if (features != null) + for (int i = 0; i < sequences.length; i++) { - for (int j = 0; j < features.length; j++) + String sequenceName = sequences[i].getName(); + for (SequenceFeature feature : sequences[i].getFeatures() + .getNonPositionalFeatures()) { - isnonpos = features[j].begin == 0 && features[j].end == 0; - if ((!nonpos && isnonpos) - || (!isnonpos && visOnly && !visible - .containsKey(features[j].type))) - { - continue; - } - - if (features[j].featureGroup != null - && !groups.contains(features[j].featureGroup)) - { - groups.add(features[j].featureGroup); - } + foundSome = true; + out.append(formatJalviewFeature(sequenceName, feature)); } } } - String group = null; - do + for (String group : sortedGroups) { - if (groups.size() > 0 && groupIndex < groups.size()) + boolean isNamedGroup = (group != null && !"".equals(group)); + if (isNamedGroup) { - group = groups.get(groupIndex); out.append(newline); out.append("STARTGROUP").append(TAB); out.append(group); out.append(newline); } - else - { - group = null; - } + /* + * output positional features within groups + */ for (int i = 0; i < sequences.length; i++) { - features = sequences[i].getSequenceFeatures(); - if (features != null) + String sequenceName = sequences[i].getName(); + List features = new ArrayList<>(); + if (types.length > 0) { - for (int j = 0; j < features.length; j++) - { - isnonpos = features[j].begin == 0 && features[j].end == 0; - if ((!nonpos && isnonpos) - || (!isnonpos && visOnly && !visible - .containsKey(features[j].type))) - { - // skip if feature is nonpos and we ignore them or if we only - // output visible and it isn't non-pos and it's not visible - continue; - } - - if (group != null - && (features[j].featureGroup == null || !features[j].featureGroup - .equals(group))) - { - continue; - } + features.addAll(sequences[i].getFeatures().getFeaturesForGroup( + true, group, types)); + } - if (group == null && features[j].featureGroup != null) - { - continue; - } - // we have features to output - featuresGen = true; - if (features[j].description == null - || features[j].description.equals("")) - { - out.append(features[j].type).append(TAB); - } - else - { - if (features[j].links != null - && features[j].getDescription().indexOf("") == -1) - { - out.append(""); - } - - out.append(features[j].description + " "); - if (features[j].links != null) - { - for (int l = 0; l < features[j].links.size(); l++) - { - String label = features[j].links.elementAt(l).toString(); - String href = label.substring(label.indexOf("|") + 1); - label = label.substring(0, label.indexOf("|")); - - if (features[j].description.indexOf(href) == -1) - { - out.append("" + label + ""); - } - } - - if (features[j].getDescription().indexOf("") == -1) - { - out.append(""); - } - } - - out.append(TAB); - } - out.append(sequences[i].getName()); - out.append("\t-1\t"); - out.append(features[j].begin); - out.append(TAB); - out.append(features[j].end); - out.append(TAB); - out.append(features[j].type); - if (!Float.isNaN(features[j].score)) - { - out.append(TAB); - out.append(features[j].score); - } - out.append(newline); - } + for (SequenceFeature sequenceFeature : features) + { + foundSome = true; + out.append(formatJalviewFeature(sequenceName, sequenceFeature)); } } - if (group != null) + if (isNamedGroup) { out.append("ENDGROUP").append(TAB); out.append(group); out.append(newline); - groupIndex++; } - else + } + + return foundSome ? out.toString() : "No Features Visible"; + } + + /** + * @param out + * @param sequenceName + * @param sequenceFeature + */ + protected String formatJalviewFeature( + String sequenceName, SequenceFeature sequenceFeature) + { + StringBuilder out = new StringBuilder(64); + if (sequenceFeature.description == null + || sequenceFeature.description.equals("")) + { + out.append(sequenceFeature.type).append(TAB); + } + else + { + if (sequenceFeature.links != null + && sequenceFeature.getDescription().indexOf("") == -1) { - break; + out.append(""); } - } while (groupIndex < groups.size() + 1); + out.append(sequenceFeature.description); + if (sequenceFeature.links != null) + { + for (int l = 0; l < sequenceFeature.links.size(); l++) + { + String label = sequenceFeature.links.elementAt(l); + String href = label.substring(label.indexOf("|") + 1); + label = label.substring(0, label.indexOf("|")); + + if (sequenceFeature.description.indexOf(href) == -1) + { + out.append(" " + label + ""); + } + } - if (!featuresGen) - { - return "No Features Visible"; + if (sequenceFeature.getDescription().indexOf("") == -1) + { + out.append(""); + } + } + + out.append(TAB); + } + out.append(sequenceName); + out.append("\t-1\t"); + out.append(sequenceFeature.begin); + out.append(TAB); + out.append(sequenceFeature.end); + out.append(TAB); + out.append(sequenceFeature.type); + if (!Float.isNaN(sequenceFeature.score)) + { + out.append(TAB); + out.append(sequenceFeature.score); } + out.append(newline); return out.toString(); } @@ -960,7 +686,8 @@ public class FeaturesFile extends AlignFile dataset = new Alignment(new SequenceI[] {}); } - boolean parseResult = parse(dataset, null, false, true); + Map featureColours = new HashMap<>(); + boolean parseResult = parse(dataset, featureColours, false, true); if (!parseResult) { // pass error up somehow @@ -981,24 +708,10 @@ public class FeaturesFile extends AlignFile * @return error message */ @Override - public String print() - { - return "Use printGffFormat() or printJalviewFormat()"; - } - - /** - * Returns features output in GFF2 format, including hidden and non-positional - * features - * - * @param sequences - * the sequences whose features are to be output - * @param visible - * a map whose keys are the type names of visible features - * @return - */ - public String printGffFormat(SequenceI[] sequences, Map visible) + public String print(SequenceI[] sqs, boolean jvsuffix) { - return printGffFormat(sequences, visible, true, true); + System.out.println("Use printGffFormat() or printJalviewFormat()"); + return null; } /** @@ -1008,79 +721,87 @@ public class FeaturesFile extends AlignFile * the sequences whose features are to be output * @param visible * a map whose keys are the type names of visible features - * @param outputVisibleOnly + * @param visibleFeatureGroups * @param includeNonPositionalFeatures * @return */ - public String printGffFormat(SequenceI[] sequences, Map visible, boolean outputVisibleOnly, + public String printGffFormat(SequenceI[] sequences, + Map visible, + List visibleFeatureGroups, boolean includeNonPositionalFeatures) { StringBuilder out = new StringBuilder(256); - out.append(String.format("%s %d\n", GFF_VERSION, gffVersion)); - String source; - boolean isnonpos; + + out.append(String.format("%s %d\n", GFF_VERSION, gffVersion == 0 ? 2 : gffVersion)); + + if (!includeNonPositionalFeatures + && (visible == null || visible.isEmpty())) + { + return out.toString(); + } + + String[] types = visible == null ? new String[0] : visible.keySet() + .toArray( + new String[visible.keySet().size()]); + for (SequenceI seq : sequences) { - SequenceFeature[] features = seq.getSequenceFeatures(); - if (features != null) + List features = new ArrayList<>(); + if (includeNonPositionalFeatures) + { + features.addAll(seq.getFeatures().getNonPositionalFeatures()); + } + if (visible != null && !visible.isEmpty()) { - for (SequenceFeature sf : features) + features.addAll(seq.getFeatures().getPositionalFeatures(types)); + } + + for (SequenceFeature sf : features) + { + String source = sf.featureGroup; + if (!sf.isNonPositional() && source != null + && !visibleFeatureGroups.contains(source)) { - isnonpos = sf.begin == 0 && sf.end == 0; - if (!includeNonPositionalFeatures && isnonpos) - { - /* - * ignore non-positional features if not wanted - */ - continue; - } - // TODO why the test !isnonpos here? - // what about not visible non-positional features? - if (!isnonpos && outputVisibleOnly - && !visible.containsKey(sf.type)) - { - /* - * ignore not visible features if not wanted - */ - continue; - } - - source = sf.featureGroup; - if (source == null) - { - source = sf.getDescription(); - } - - out.append(seq.getName()); - out.append(TAB); - out.append(source); - out.append(TAB); - out.append(sf.type); - out.append(TAB); - out.append(sf.begin); - out.append(TAB); - out.append(sf.end); - out.append(TAB); - out.append(sf.score); - out.append(TAB); - - out.append(sf.getValue(STRAND, ".")); - out.append(TAB); - - out.append(sf.getValue(FRAME, ".")); - - // miscellaneous key-values (GFF column 9) - String attributes = (String) sf.getValue(ATTRIBUTES); - if (attributes != null) - { - out.append(TAB).append(attributes); - } - - out.append(newline); + // group is not visible + continue; + } + + if (source == null) + { + source = sf.getDescription(); + } + + out.append(seq.getName()); + out.append(TAB); + out.append(source); + out.append(TAB); + out.append(sf.type); + out.append(TAB); + out.append(sf.begin); + out.append(TAB); + out.append(sf.end); + out.append(TAB); + out.append(sf.score); + out.append(TAB); + + int strand = sf.getStrand(); + out.append(strand == 1 ? "+" : (strand == -1 ? "-" : ".")); + out.append(TAB); + + String phase = sf.getPhase(); + out.append(phase == null ? "." : phase); + + // miscellaneous key-values (GFF column 9) + String attributes = sf.getAttributes(); + if (attributes != null) + { + out.append(TAB).append(attributes); } + + out.append(newline); } } - + return out.toString(); } @@ -1139,8 +860,8 @@ public class FeaturesFile extends AlignFile fromCount = Integer.parseInt(tokens[2]); } catch (NumberFormatException nfe) { - throw new IOException("Invalid number in Align field: " - + nfe.getMessage()); + throw new IOException( + "Invalid number in Align field: " + nfe.getMessage()); } /* @@ -1165,95 +886,66 @@ public class FeaturesFile extends AlignFile toRanges[toRangesIndex++] = toStart; toRanges[toRangesIndex++] = toStart + (fromCount - 1) / 3; } - + return new MapList(fromRanges, toRanges, 3, 1); } /** - * Parse a GFF format feature. This may include creating a 'dummy' sequence - * for the feature or its mapped sequence + * Parse a GFF format feature. This may include creating a 'dummy' sequence to + * hold the feature, or for its mapped sequence, or both, to be resolved + * either later in the GFF file (##FASTA section), or when the user loads + * additional sequences. * - * @param st + * @param gffColumns * @param alignment * @param relaxedIdMatching * @param newseqs * @return */ - protected SequenceI parseGffFeature(StringTokenizer st, - AlignmentI alignment, boolean relaxedIdMatching, - List newseqs) + protected SequenceI parseGff(String[] gffColumns, AlignmentI alignment, + boolean relaxedIdMatching, List newseqs) { - SequenceI seq; /* * GFF: seqid source type start end score strand phase [attributes] */ - if (st.countTokens() < 8) + if (gffColumns.length < 5) { - System.err - .println("Ignoring GFF feature line with unexpected number of columns (" - + st.countTokens() + ")"); + System.err.println("Ignoring GFF feature line with too few columns (" + + gffColumns.length + ")"); return null; } - String seqId = st.nextToken(); - + /* * locate referenced sequence in alignment _or_ - * as a forward reference (SequenceDummy) + * as a forward or external reference (SequenceDummy) */ - seq = findName(alignment, newseqs, relaxedIdMatching, seqId); - - String desc = st.nextToken(); - String group = null; - if (desc.indexOf(' ') == -1) - { - // could also be a source term rather than description line - group = desc; - } - String ft = st.nextToken(); - int startPos = StringUtils.parseInt(st.nextToken()); - int endPos = StringUtils.parseInt(st.nextToken()); - // TODO: decide if non positional feature assertion for input data - // where end==0 is generally valid - if (endPos == 0) - { - // treat as non-positional feature, regardless. - startPos = 0; - } - float score = 0f; - try - { - score = new Float(st.nextToken()).floatValue(); - } catch (NumberFormatException ex) - { - // leave at 0 - } - - SequenceFeature sf = new SequenceFeature(ft, desc, startPos, - endPos, score, group); - if (st.hasMoreTokens()) - { - sf.setValue(STRAND, st.nextToken()); - } - if (st.hasMoreTokens()) - { - sf.setValue(FRAME, st.nextToken()); - } - - if (st.hasMoreTokens()) - { - processGffColumnNine(st.nextToken(), sf); - } - - if (processOrAddSeqFeature(alignment, newseqs, seq, sf, - relaxedIdMatching)) + String seqId = gffColumns[0]; + SequenceI seq = findSequence(seqId, alignment, newseqs, + relaxedIdMatching); + + SequenceFeature sf = null; + GffHelperI helper = GffHelperFactory.getHelper(gffColumns); + if (helper != null) { - // check whether we should add the sequence feature to any other - // sequences in the alignment with the same or similar - while ((seq = alignment.findName(seq, seqId, true)) != null) + try + { + sf = helper.processGff(seq, gffColumns, alignment, newseqs, + relaxedIdMatching); + if (sf != null) + { + seq.addSequenceFeature(sf); + while ((seq = alignment.findName(seq, seqId, true)) != null) + { + seq.addSequenceFeature(new SequenceFeature(sf)); + } + } + } catch (IOException e) { - seq.addSequenceFeature(new SequenceFeature(sf)); + System.err.println("GFF parsing failed with: " + e.getMessage()); + return null; } } + return seq; } @@ -1267,15 +959,16 @@ public class FeaturesFile extends AlignFile */ protected void processGffColumnNine(String attributes, SequenceFeature sf) { - sf.setValue(ATTRIBUTES, attributes); - + sf.setAttributes(attributes); + /* * Parse attributes in column 9 and add them to the sequence feature's * 'otherData' table; use Note as a best proxy for description */ - char[] nameValueSeparator = new char[] { gffVersion == 3 ? '=' : ' ' }; - Map> nameValues = StringUtils.parseNameValuePairs(attributes, ";", - nameValueSeparator); + char nameValueSeparator = gffVersion == 3 ? '=' : ' '; + // TODO check we don't break GFF2 values which include commas here + Map> nameValues = GffHelperBase + .parseNameValuePairs(attributes, ";", nameValueSeparator, ","); for (Entry> attr : nameValues.entrySet()) { String values = StringUtils.listToDelimitedString(attr.getValue(), @@ -1308,42 +1001,74 @@ public class FeaturesFile extends AlignFile } FastaFile parser = new FastaFile(this); List includedseqs = parser.getSeqs(); + SequenceIdMatcher smatcher = new SequenceIdMatcher(newseqs); - // iterate over includedseqs, and replacing matching ones with newseqs - // sequences. Generic iterator not used here because we modify includedseqs - // as we go + + /* + * iterate over includedseqs, and replacing matching ones with newseqs + * sequences. Generic iterator not used here because we modify + * includedseqs as we go + */ for (int p = 0, pSize = includedseqs.size(); p < pSize; p++) { // search for any dummy seqs that this sequence can be used to update - SequenceI dummyseq = smatcher.findIdMatch(includedseqs.get(p)); - if (dummyseq != null) + SequenceI includedSeq = includedseqs.get(p); + SequenceI dummyseq = smatcher.findIdMatch(includedSeq); + if (dummyseq != null && dummyseq instanceof SequenceDummy) { - // dummyseq was created so it could be annotated and referred to in - // alignments/codon mappings - - SequenceI mseq = includedseqs.get(p); - // mseq is the 'template' imported from the FASTA file which we'll use - // to coomplete dummyseq - if (dummyseq instanceof SequenceDummy) + // probably have the pattern wrong + // idea is that a flyweight proxy for a sequence ID can be created for + // 1. stable reference creation + // 2. addition of annotation + // 3. future replacement by a real sequence + // current pattern is to create SequenceDummy objects - a convenience + // constructor for a Sequence. + // problem is that when promoted to a real sequence, all references + // need to be updated somehow. We avoid that by keeping the same object. + ((SequenceDummy) dummyseq).become(includedSeq); + dummyseq.createDatasetSequence(); + + /* + * Update mappings so they are now to the dataset sequence + */ + for (AlignedCodonFrame mapping : align.getCodonFrames()) { - // probably have the pattern wrong - // idea is that a flyweight proxy for a sequence ID can be created for - // 1. stable reference creation - // 2. addition of annotation - // 3. future replacement by a real sequence - // current pattern is to create SequenceDummy objects - a convenience - // constructor for a Sequence. - // problem is that when promoted to a real sequence, all references - // need - // to be updated somehow. - ((SequenceDummy) dummyseq).become(mseq); - includedseqs.set(p, dummyseq); // template is no longer needed + mapping.updateToDataset(dummyseq); } + + /* + * replace parsed sequence with the realised forward reference + */ + includedseqs.set(p, dummyseq); + + /* + * and remove from the newseqs list + */ + newseqs.remove(dummyseq); } } - // finally add sequences to the dataset + + /* + * finally add sequences to the dataset + */ for (SequenceI seq : includedseqs) { + // experimental: mapping-based 'alignment' to query sequence + AlignmentUtils.alignSequenceAs(seq, align, + String.valueOf(align.getGapCharacter()), false, true); + + // rename sequences if GFF handler requested this + // TODO a more elegant way e.g. gffHelper.postProcess(newseqs) ? + List sfs = seq.getFeatures().getPositionalFeatures(); + if (!sfs.isEmpty()) + { + String newName = (String) sfs.get(0).getValue( + GffHelperI.RENAME_TOKEN); + if (newName != null) + { + seq.setName(newName); + } + } align.addSequence(seq); } } @@ -1357,8 +1082,8 @@ public class FeaturesFile extends AlignFile * @param newseqs * @throws IOException */ - protected void processGffPragma(String line, Map gffProps, AlignmentI align, - List newseqs) throws IOException + protected void processGffPragma(String line, Map gffProps, + AlignmentI align, List newseqs) throws IOException { line = line.trim(); if ("###".equals(line)) @@ -1366,11 +1091,11 @@ public class FeaturesFile extends AlignFile // close off any open 'forward references' return; } - + String[] tokens = line.substring(2).split(" "); String pragma = tokens[0]; String value = tokens.length == 1 ? null : tokens[1]; - + if ("gff-version".equalsIgnoreCase(pragma)) { if (value != null) @@ -1385,6 +1110,10 @@ public class FeaturesFile extends AlignFile } } } + else if ("sequence-region".equalsIgnoreCase(pragma)) + { + // could capture if wanted here + } else if ("feature-ontology".equalsIgnoreCase(pragma)) { // should resolve against the specified feature ontology URI @@ -1413,150 +1142,4 @@ public class FeaturesFile extends AlignFile System.err.println("Ignoring unknown pragma: " + line); } } - - /** - * Processes the 'Query' (or 'Target') and 'Align' properties associated with - * an exonerate GFF similarity feature; these properties define the mapping of - * the annotated feature (e.g. 'exon') to a related sequence. - * - * @param set - * @param seq - * @param sf - * @param align - * @param newseqs - * @param relaxedIdMatching - * @throws IOException - */ - public void processGffSimilarity(Map> set, SequenceI seq, - SequenceFeature sf, AlignmentI align, List newseqs, boolean relaxedIdMatching) - throws IOException - { - if (!validateExonerateModel(sf)) - { - return; - } - - int strand = sf.getStrand(); - - /* - * exonerate (protein2dna or protein2genome) may be run with - * --showquerygff outputs - * Target ; Align proteinStartPos dnaStartPos peptideCount - * --showtargetgff outputs - * Query ; Align dnaStartPos proteinStartPos nucleotideCount - * where the Align spec may repeat - */ - boolean mapIsFromCdna = true; - List mapTo = set.get(QUERY); - if (mapTo == null) - { - mapTo = set.get(TARGET); - mapIsFromCdna = false; - } - if (mapTo == null || mapTo.size() != 1) - { - throw new IOException( - "Expecting exactly one sequence in Query field (got " + mapTo - + ")"); - } - - /* - * locate the mapped sequence in the alignment or 'new' (GFF file) sequences; - */ - SequenceI mappedSequence = findName(align, newseqs, relaxedIdMatching, - mapTo.get(0)); - /* - * Process the Align maps and create cdna/protein maps; - * ideally, the query sequences are in the alignment, but maybe not... - */ - AlignedCodonFrame alco = new AlignedCodonFrame(); - MapList codonmapping = constructCodonMappingFromAlign(set.get(ALIGN), - mapIsFromCdna, strand); - - /* - * Jalview always maps from dna to protein - */ - if (mapIsFromCdna) - { - alco.addMap(seq, mappedSequence, codonmapping); - } - else - { - alco.addMap(mappedSequence, seq, codonmapping); - } - align.addCodonFrame(alco); - } - - /** - * Returns true if the exonerate model (saved from column 2 of the GFF as the - * SequenceFeature's group) is one that we are willing to process, else false - * - * @param sf - */ - protected boolean validateExonerateModel(SequenceFeature sf) - { - /* - * we don't handle protein-to-protein or dna-to-dna alignment here - */ - String source = sf.getFeatureGroup(); - if (source == null - || (!source.contains("protein2dna") && !source - .contains("protein2genome"))) - { - System.err - .println("I only accept protein2dna or protein2genome but found " - + source); - return false; - } - return true; - } - - /** - * take a sequence feature and examine its attributes to decide how it should - * be added to a sequence - * - * @param seq - * - the destination sequence constructed or discovered in the - * current context - * @param sf - * - the base feature with ATTRIBUTES property containing any - * additional attributes - * @param gFFFile - * - true if we are processing a GFF annotation file - * @return true if sf was actually added to the sequence, false if it was - * processed in another way - */ - public boolean processOrAddSeqFeature(AlignmentI align, List newseqs, - SequenceI seq, SequenceFeature sf, boolean relaxedIdMatching) - { - String attr = (String) sf.getValue(ATTRIBUTES); - boolean addFeature = true; - if (attr != null) - { - for (String attset : attr.split(TAB)) - { - Map> set = StringUtils.parseNameValuePairs( - attset, ";", new char[] { ' ', '-' }); - - if (SIMILARITY.equals(sf.getType())) - { - try - { - addFeature = false; - processGffSimilarity(set, seq, sf, align, newseqs, - relaxedIdMatching); - } catch (IOException ivfe) - { - System.err.println(ivfe); - } - } - } - } - if (addFeature) - { - seq.addSequenceFeature(sf); - } - return addFeature; - } - }