/*
 * Jalview - A Sequence Alignment Editor and Viewer ($$Version-Rel$$)
 * Copyright (C) $$Year-Rel$$ The Jalview Authors
 * 
 * This file is part of Jalview.
 * 
 * Jalview is free software: you can redistribute it and/or
 * modify it under the terms of the GNU General Public License 
 * as published by the Free Software Foundation, either version 3
 * of the License, or (at your option) any later version.
 *  
 * Jalview is distributed in the hope that it will be useful, but 
 * WITHOUT ANY WARRANTY; without even the implied warranty 
 * of MERCHANTABILITY or FITNESS FOR A PARTICULAR 
 * PURPOSE.  See the GNU General Public License for more details.
 * 
 * You should have received a copy of the GNU General Public License
 * along with Jalview.  If not, see <http://www.gnu.org/licenses/>.
 * The Jalview Authors are detailed in the 'AUTHORS' file.
 */
package jalview.io;

import jalview.analysis.SequenceIdMatcher;
import jalview.api.AlignViewportI;
import jalview.datamodel.AlignedCodonFrame;
import jalview.datamodel.Alignment;
import jalview.datamodel.AlignmentI;
import jalview.datamodel.SequenceDummy;
import jalview.datamodel.SequenceFeature;
import jalview.datamodel.SequenceI;
import jalview.schemes.AnnotationColourGradient;
import jalview.schemes.GraduatedColor;
import jalview.schemes.UserColourScheme;
import jalview.util.Format;
import jalview.util.MapList;
import jalview.util.ParseHtmlBodyAndLinks;
import jalview.util.StringUtils;

import java.awt.Color;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.StringTokenizer;

/**
 * Parses and writes features files, which may be in Jalview, GFF2 or GFF3
 * format. These are tab-delimited formats but with differences in the use of
 * columns.
 * 
 * A Jalview feature file may define feature colours and then declare that the
 * remainder of the file is in GFF format with the line 'GFF'.
 * 
 * GFF3 files may include alignment mappings for features, which Jalview will
 * attempt to model, and may include sequence data following a ##FASTA line.
 * 
 * 
 * @author AMW
 * @author jbprocter
 * @author gmcarstairs
 */
public class FeaturesFile extends AlignFile {
  /** key under which the GFF strand column is stored on a feature */
  protected static final String STRAND = "STRAND";

  /** key under which the GFF frame/phase column is stored on a feature */
  protected static final String FRAME = "FRAME";

  /** key under which the raw GFF attributes column is stored on a feature */
  protected static final String ATTRIBUTES = "ATTRIBUTES";

  protected static final String TAB = "\t";

  protected static final String GFF_VERSION = "##gff-version";

  /** alignment for which {@link #matcher} was last built */
  private AlignmentI lastmatchedAl = null;

  /** cached matcher used for relaxed sequence id matching */
  private SequenceIdMatcher matcher = null;

  protected AlignmentI dataset;

  /**
   * GFF version of the data being parsed; 0 means 'not GFF' (Jalview features
   * format)
   */
  protected int gffVersion;

  /**
   * Creates a new FeaturesFile object.
   */
  public FeaturesFile() {
  }

  /**
   * Constructor which does not parse the file immediately
   * 
   * @param inFile
   * @param type
   * @throws IOException
   */
  public FeaturesFile(String inFile, String type) throws IOException {
    super(false, inFile, type);
  }

  /**
   * @param source
   * @throws IOException
   */
  public FeaturesFile(FileParse source) throws IOException {
    super(source);
  }

  /**
   * Constructor that optionally parses the file immediately
   * 
   * @param parseImmediately
   * @param inFile
   * @param type
   * @throws IOException
   */
  public FeaturesFile(boolean parseImmediately, String inFile, String type)
          throws IOException {
    super(parseImmediately, inFile, type);
  }

  /**
   * Parse GFF or sequence features file without relaxed id matching
   * (delegates to the four argument parse with relaxedIdmatching = false)
   * 
   * @param align
   *          - alignment/dataset containing sequences that are to be annotated
   * @param colours
   *          - map to store feature colour definitions
   * @param removeHTML
   *          - process html strings into plain text
   * @return true if the file was read without a parsing error
   */
  public boolean parse(AlignmentI align, Map<String, Object> colours,
          boolean removeHTML) {
    return parse(align, colours, removeHTML, false);
  }

  /**
   * Extends the default addProperties by also adding peptide-to-cDNA mappings
   * (if any) derived while parsing a GFF file
   */
  @Override
  public void addProperties(AlignmentI al) {
    super.addProperties(al);
    if (dataset != null && dataset.getCodonFrames() != null) {
      AlignmentI ds = (al.getDataset() == null) ? al : al.getDataset();
      for (AlignedCodonFrame codons : dataset.getCodonFrames()) {
        ds.addCodonFrame(codons);
      }
    }
  }

  /**
   * Parse GFF or Jalview format sequence features file
   * 
   * @param align
   *          - alignment/dataset containing sequences that are to be annotated
   * @param colours
   *          - map to store feature colour definitions; may be null, in which
   *          case colour definitions are parsed but discarded
   * @param removeHTML
   *          - process html strings into plain text
   * @param relaxedIdmatching
   *          - when true, ID matches to compound sequence IDs are allowed
   * @return true if the file was read without a parsing error, false if an
   *         exception was raised (details are added to warningMessage)
   */
  public boolean parse(AlignmentI align, Map<String, Object> colours,
          boolean removeHTML, boolean relaxedIdmatching) {
    /*
     * callers such as parse() supply no colour map; use a scratch map so that
     * Jalview format lines do not fail with a NullPointerException
     */
    Map<String, Object> featureColours = (colours != null) ? colours
            : new HashMap<String, Object>();
    Map<String, String> gffProps = new HashMap<String, String>();

    /*
     * keep track of any sequences we try to create from the data
     */
    List<SequenceI> newseqs = new ArrayList<SequenceI>();

    String line = null;
    try {
      StringTokenizer st;
      String featureGroup = null;

      while ((line = nextLine()) != null) {
        // skip comments/process pragmas
        if (line.length() == 0 || line.startsWith("#")) {
          if (line.startsWith("##")) {
            processGffPragma(line, gffProps, align, newseqs);
          }
          continue;
        }

        st = new StringTokenizer(line, TAB);
        if (st.countTokens() == 1) {
          if (line.trim().equalsIgnoreCase("GFF")) {
            /*
             * Jalview features file with appended GFF
             * assume GFF2 (though it may declare gff-version 3)
             */
            gffVersion = 2;
            continue;
          }
        }

        if (st.countTokens() > 1 && st.countTokens() < 4) {
          /*
           * if 2 or 3 tokens, we anticipate either 'startgroup', 'endgroup' or
           * a feature type colour specification; not GFF format
           */
          String ft = st.nextToken();
          if (ft.equalsIgnoreCase("startgroup")) {
            featureGroup = st.nextToken();
          } else if (ft.equalsIgnoreCase("endgroup")) {
            // We should check whether this is the current group,
            // but at present there's no way of showing more than 1 group
            st.nextToken();
            featureGroup = null;
          } else {
            parseFeatureColour(line, ft, st, featureColours);
          }
          continue;
        }

        /*
         * if not a comment, GFF pragma, startgroup, endgroup or feature
         * colour specification, that just leaves a feature details line
         * in either Jalview or GFF format
         */
        if (gffVersion == 0) {
          parseJalviewFeature(line, st, align, featureColours, removeHTML,
                  relaxedIdmatching, featureGroup);
        } else {
          parseGffFeature(st, align, relaxedIdmatching, newseqs);
        }
      }
      resetMatcher();
    } catch (Exception ex) {
      // should report somewhere useful for UI if necessary
      warningMessage = ((warningMessage == null) ? "" : warningMessage)
              + "Parsing error at\n" + line;
      System.out.println("Error parsing feature file: " + ex + "\n" + line);
      ex.printStackTrace(System.err);
      resetMatcher();
      return false;
    }

    return true;
  }

  /**
   * Try to parse a Jalview format feature specification. Returns true if
   * successful or false if not.
   * 
   * @param line
   *          the current input line (for messages only)
   * @param st
   *          tokenizer over the tab-separated fields of the line
   * @param alignment
   *          alignment in which to look up the target sequence
   * @param featureColours
   *          map of feature type to colour; a colour is added for a type not
   *          yet present
   * @param removeHTML
   *          passed on to parseDescriptionHTML
   * @param relaxedIdmatching
   *          passed on to findName
   * @param featureGroup
   *          the group (if any) to assign to the feature
   * @return true if a feature was added, false if the line ran out of fields
   *         or the sequence was not found
   */
  protected boolean parseJalviewFeature(String line, StringTokenizer st,
          AlignmentI alignment, Map<String, Object> featureColours,
          boolean removeHTML, boolean relaxedIdmatching, String featureGroup) {
    /*
     * Jalview: description seqid seqIndex start end type [score]
     */
    String desc = st.nextToken();
    String seqId = st.nextToken();

    if (!st.hasMoreTokens()) {
      System.err
              .println("DEBUG: Run out of tokens when trying to identify the destination for the feature.. giving up.");
      // in all probability, this isn't a file we understand, so bail
      // quietly.
      return false;
    }

    SequenceI seq = null;
    if (!seqId.equals("ID_NOT_SPECIFIED")) {
      seq = findName(alignment, seqId, relaxedIdmatching, null);
      st.nextToken(); // sequence index column is ignored when an id is given
    } else {
      seqId = null;
      try {
        int idx = Integer.parseInt(st.nextToken());
        seq = alignment.getSequenceAt(idx);
      } catch (NumberFormatException ex) {
        // not a usable index: seq stays null and is reported below
      }
    }

    if (seq == null) {
      System.out.println("Sequence not found: " + line);
      return false;
    }

    int startPos = Integer.parseInt(st.nextToken());
    int endPos = Integer.parseInt(st.nextToken());
    String ft = st.nextToken();

    if (!featureColours.containsKey(ft)) {
      /* 
       * Perhaps an old style groups file with no colours -
       * synthesize a colour from the feature type
       */
      UserColourScheme ucs = new UserColourScheme(ft);
      featureColours.put(ft, ucs.findColour('A'));
    }

    SequenceFeature sf = new SequenceFeature(ft, desc, "", startPos, endPos,
            featureGroup);
    if (st.hasMoreTokens()) {
      float score = 0f;
      try {
        score = Float.parseFloat(st.nextToken());
        // update colourgradient bounds if allowed to
      } catch (NumberFormatException ex) {
        // leave as 0
      }
      sf.setScore(score);
    }

    parseDescriptionHTML(sf, removeHTML);

    seq.addSequenceFeature(sf);

    // copy the feature to any further sequences matching the same id
    while (seqId != null
            && (seq = alignment.findName(seq, seqId, false)) != null) {
      seq.addSequenceFeature(new SequenceFeature(sf));
    }
    return true;
  }

  /**
   * Process a feature type colour specification
   * 
   * @param line
   *          the current input line (for error messages only)
   * @param featureType
   *          the first token on the line
   * @param st
   *          holds remaining tokens on the line
   * @param colours
   *          map to which to add derived colour specification
   */
  protected void parseFeatureColour(String line, String featureType,
          StringTokenizer st, Map<String, Object> colours) {
    Object colour = null;
    String colscheme = st.nextToken();
    if (colscheme.indexOf("|") > -1
            || colscheme.trim().equalsIgnoreCase("label")) {
      colour = parseGraduatedColourScheme(line, colscheme);
    } else {
      UserColourScheme ucs = new UserColourScheme(colscheme);
      colour = ucs.findColour('A');
    }
    if (colour != null) {
      colours.put(featureType, colour);
    }
  }

  /**
   * Parse a Jalview graduated colour descriptor
   * 
   * @param line
   *          the current input line (not used in parsing)
   * @param colourDescriptor
   *          the '|' separated colour specification
   * @return the colour scheme, or null if it could not be constructed
   */
  protected GraduatedColor parseGraduatedColourScheme(String line,
          String colourDescriptor) {
    // Parse '|' separated graduated colourscheme fields:
    // [label|][mincolour|maxcolour|[absolute|]minvalue|maxvalue|thresholdtype|thresholdvalue]
    // can either provide 'label' only, first is optional, next two
    // colors are required (but may be
    // left blank), next is optional, nxt two min/max are required.
    // first is either 'label'
    // first/second and third are both hexadecimal or word equivalent
    // colour.
    // next two are values parsed as floats.
    // fifth is either 'above','below', or 'none'.
    // sixth is a float value and only required when fifth is either
    // 'above' or 'below'.
    StringTokenizer gcol = new StringTokenizer(colourDescriptor, "|", true);
    // set defaults
    float min = Float.MIN_VALUE, max = Float.MAX_VALUE;
    boolean labelCol = false;
    // Parse spec line
    String mincol = gcol.nextToken();
    /*
     * NOTE(review): the original tested (mincol == "|") here, an identity
     * comparison that never matches a tokenizer token; it has been removed
     * rather than changed to equals(), because a leading '|' is accepted
     * below as a blank minimum colour.
     */
    String maxcol = null;
    if (mincol.toLowerCase().indexOf("label") == 0) {
      labelCol = true;
      mincol = (gcol.hasMoreTokens() ? gcol.nextToken() : null); // skip '|'
      mincol = (gcol.hasMoreTokens() ? gcol.nextToken() : null);
    }
    String abso = null, minval, maxval;
    if (mincol != null) {
      // at least four more tokens
      if (mincol.equals("|")) {
        mincol = ""; // blank minimum colour; the delimiter is already consumed
      } else {
        gcol.nextToken(); // skip next '|'
      }
      // continue parsing rest of line
      maxcol = gcol.nextToken();
      if (maxcol.equals("|")) {
        maxcol = "";
      } else {
        gcol.nextToken(); // skip next '|'
      }
      abso = gcol.nextToken();
      gcol.nextToken(); // skip next '|'
      if (abso.toLowerCase().indexOf("abso") != 0) {
        // no 'abso' switch present: this token is the minimum value
        minval = abso;
        abso = null;
      } else {
        minval = gcol.nextToken();
        gcol.nextToken(); // skip next '|'
      }
      maxval = gcol.nextToken();
      if (gcol.hasMoreTokens()) {
        gcol.nextToken(); // skip next '|'
      }
      try {
        if (minval.length() > 0) {
          min = Float.valueOf(minval);
        }
      } catch (Exception e) {
        System.err
                .println("Couldn't parse the minimum value for graduated colour for type ("
                        + colourDescriptor
                        + ") - did you misspell 'abso' for the optional absolute scaling switch ?");
        e.printStackTrace();
      }
      try {
        if (maxval.length() > 0) {
          max = Float.valueOf(maxval);
        }
      } catch (Exception e) {
        System.err
                .println("Couldn't parse the maximum value for graduated colour for type ("
                        + colourDescriptor + ")");
        e.printStackTrace();
      }
    } else {
      // add in some dummy min/max colours for the label-only
      // colourscheme.
      mincol = "FFFFFF";
      maxcol = "000000";
    }

    GraduatedColor colour = null;
    try {
      colour = new GraduatedColor(
              new UserColourScheme(mincol).findColour('A'),
              new UserColourScheme(maxcol).findColour('A'), min, max);
    } catch (Exception e) {
      System.err.println("Couldn't parse the graduated colour scheme ("
              + colourDescriptor + ")");
      e.printStackTrace();
    }
    if (colour != null) {
      colour.setColourByLabel(labelCol);
      colour.setAutoScaled(abso == null);
      // add in any additional parameters
      String ttype = null, tval = null;
      if (gcol.hasMoreTokens()) {
        // threshold type and possibly a threshold value
        ttype = gcol.nextToken();
        if (ttype.toLowerCase().startsWith("below")) {
          colour.setThreshType(AnnotationColourGradient.BELOW_THRESHOLD);
        } else if (ttype.toLowerCase().startsWith("above")) {
          colour.setThreshType(AnnotationColourGradient.ABOVE_THRESHOLD);
        } else {
          colour.setThreshType(AnnotationColourGradient.NO_THRESHOLD);
          if (!ttype.toLowerCase().startsWith("no")) {
            System.err.println("Ignoring unrecognised threshold type : "
                    + ttype);
          }
        }
      }
      if (colour.getThreshType() != AnnotationColourGradient.NO_THRESHOLD) {
        try {
          gcol.nextToken(); // skip '|'
          tval = gcol.nextToken();
          colour.setThresh(Float.parseFloat(tval));
        } catch (Exception e) {
          System.err.println("Couldn't parse threshold value as a float: ("
                  + tval + ")");
          e.printStackTrace();
        }
      }
      // parse the thresh-is-min token ?
      if (gcol.hasMoreTokens()) {
        System.err
                .println("Ignoring additional tokens in parameters in graduated colour specification\n");
        while (gcol.hasMoreTokens()) {
          System.err.println("|" + gcol.nextToken());
        }
        System.err.println("\n");
      }
    }
    return colour;
  }

  /**
   * clear any temporary handles used to speed up ID matching
   */
  protected void resetMatcher() {
    lastmatchedAl = null;
    matcher = null;
  }

  /**
   * Returns a sequence matching the given id, as follows
   * <ul>
   * <li>matching is on exact sequence name, or on a token within the sequence
   * name, or a dbxref, if relaxed matching is selected</li>
   * <li>first tries to find a match in the alignment sequences</li>
   * <li>else tries to find a match in the new sequences already generated
   * parsing the features file</li>
   * <li>else creates a new placeholder sequence, adds it to the new sequences
   * list, and returns it</li>
   * </ul>
   * 
   * @param align
   *          alignment to search
   * @param seqId
   *          the sequence id to look for
   * @param relaxedIdMatching
   *          if true, match using a SequenceIdMatcher
   * @param newseqs
   *          sequences created so far while parsing; if null, no placeholder
   *          sequence is created and null is returned when nothing matches
   * @return the matched or newly created sequence, or null
   */
  protected SequenceI findName(AlignmentI align, String seqId,
          boolean relaxedIdMatching, List<SequenceI> newseqs) {
    SequenceI match = null;
    if (relaxedIdMatching) {
      if (lastmatchedAl != align) {
        // (re)build the cached matcher for this alignment
        lastmatchedAl = align;
        matcher = new SequenceIdMatcher(align.getSequencesArray());
        if (newseqs != null) {
          matcher.addAll(newseqs);
        }
      }
      match = matcher.findIdMatch(seqId);
    } else {
      match = align.findName(seqId, true);
      if (match == null && newseqs != null) {
        for (SequenceI m : newseqs) {
          if (seqId.equals(m.getName())) {
            return m;
          }
        }
      }
    }
    if (match == null && newseqs != null) {
      match = new SequenceDummy(seqId);
      if (relaxedIdMatching) {
        matcher.addAll(Arrays.asList(new SequenceI[] { match }));
      }
      // add dummy sequence to the newseqs list
      newseqs.add(match);
    }
    return match;
  }

  /**
   * Extracts any links from the feature's description and adds them to the
   * feature; if removeHTML is true, also replaces the description with its
   * non-HTML content. Does nothing if the feature has no description.
   * 
   * @param sf
   * @param removeHTML
   */
  public void parseDescriptionHTML(SequenceFeature sf, boolean removeHTML) {
    if (sf.getDescription() == null) {
      return;
    }
    ParseHtmlBodyAndLinks parsed = new ParseHtmlBodyAndLinks(
            sf.getDescription(), removeHTML, newline);

    if (removeHTML) {
      sf.description = parsed.getNonHtmlContent();
    }
    for (String link : parsed.getLinks()) {
      sf.addLink(link);
    }
  }

  /**
   * generate a features file for seqs includes non-pos features by default.
   * 
   * @param sequences
   *          source of sequence features
   * @param visible
   *          map of feature types and colours
   * @return features file contents
   */
  public String printJalviewFormat(SequenceI[] sequences,
          Map<String, Object> visible) {
    return printJalviewFormat(sequences, visible, true, true);
  }

  /**
   * generate a features file for seqs with colours from visible (if any)
   * 
   * @param sequences
   *          source of features
   * @param visible
   *          map of Colours for each feature type; may be null, in which case
   *          no positional feature counts as visible
   * @param visOnly
   *          when true only feature types in 'visible' will be output
   * @param nonpos
   *          indicates if non-positional features should be output (regardless
   *          of group or type)
   * @return features file contents, or "No Features Visible" if no feature
   *         was output
   */
  public String printJalviewFormat(SequenceI[] sequences,
          Map<String, Object> visible, boolean visOnly, boolean nonpos) {
    StringBuilder out = new StringBuilder(256);
    boolean featuresGen = false;
    if (visOnly && !nonpos && (visible == null || visible.size() < 1)) {
      // no point continuing.
      return "No Features Visible";
    }

    if (visible != null && visOnly) {
      // write feature colours only if we're given them and we are generating
      // viewed features
      // TODO: decide if feature links should also be written here ?
      Iterator<String> en = visible.keySet().iterator();
      while (en.hasNext()) {
        String featureType = en.next().toString();
        out.append(featureType);
        out.append(TAB);
        out.append(toColourDescriptor(visible.get(featureType)));
        out.append(newline);
      }
    }

    // Work out which groups are both present and visible
    List<String> groups = new ArrayList<String>();
    for (SequenceI sequence : sequences) {
      SequenceFeature[] features = sequence.getSequenceFeatures();
      if (features == null) {
        continue;
      }
      for (SequenceFeature sf : features) {
        if (isFeatureExcluded(sf, visible, visOnly, nonpos)) {
          continue;
        }
        if (sf.featureGroup != null && !groups.contains(sf.featureGroup)) {
          groups.add(sf.featureGroup);
        }
      }
    }

    // first each group in turn, wrapped in STARTGROUP/ENDGROUP ...
    for (String group : groups) {
      out.append(newline);
      out.append("STARTGROUP").append(TAB);
      out.append(group);
      out.append(newline);
      featuresGen |= appendJalviewFeatures(out, sequences, visible, visOnly,
              nonpos, group);
      out.append("ENDGROUP").append(TAB);
      out.append(group);
      out.append(newline);
    }
    // ... then the features that belong to no group
    featuresGen |= appendJalviewFeatures(out, sequences, visible, visOnly,
            nonpos, null);

    if (!featuresGen) {
      return "No Features Visible";
    }

    return out.toString();
  }

  /**
   * Answers true if the feature should be left out of the output: either it
   * is non-positional (begin and end both 0) and those are not wanted, or it
   * is positional, only visible types are wanted, and its type is not a key
   * of the visible map (a null map has no visible types).
   */
  private static boolean isFeatureExcluded(SequenceFeature sf,
          Map<String, Object> visible, boolean visibleOnly,
          boolean includeNonPositional) {
    boolean isnonpos = sf.begin == 0 && sf.end == 0;
    if (isnonpos) {
      return !includeNonPositional;
    }
    return visibleOnly
            && (visible == null || !visible.containsKey(sf.type));
  }

  /**
   * Formats a feature colour (GraduatedColor, Color, or legacy integer RGB
   * value) as the colour field of a Jalview features file.
   */
  private static String toColourDescriptor(Object colour) {
    if (colour instanceof GraduatedColor) {
      GraduatedColor gc = (GraduatedColor) colour;
      StringBuilder sb = new StringBuilder(64);
      sb.append(gc.isColourByLabel() ? "label|" : "");
      sb.append(Format.getHexString(gc.getMinColor())).append("|");
      sb.append(Format.getHexString(gc.getMaxColor()));
      sb.append(gc.isAutoScale() ? "|" : "|abso|");
      sb.append(gc.getMin()).append("|").append(gc.getMax()).append("|");
      if (gc.getThreshType() != AnnotationColourGradient.NO_THRESHOLD) {
        if (gc.getThreshType() == AnnotationColourGradient.BELOW_THRESHOLD) {
          sb.append("below");
        } else {
          if (gc.getThreshType() != AnnotationColourGradient.ABOVE_THRESHOLD) {
            System.err.println("WARNING: Unsupported threshold type ("
                    + gc.getThreshType() + ") : Assuming 'above'");
          }
          sb.append("above");
        }
        // add the value
        sb.append("|").append(gc.getThresh());
      } else {
        sb.append("none");
      }
      return sb.toString();
    }
    if (colour instanceof Color) {
      return Format.getHexString((Color) colour);
    }
    // legacy support for integer objects containing colour triplet values
    return Format.getHexString(new Color(Integer.parseInt(colour.toString())));
  }

  /**
   * Appends one line per included feature belonging to the given group (or to
   * no group, if group is null).
   * 
   * @return true if at least one feature was written
   */
  private boolean appendJalviewFeatures(StringBuilder out,
          SequenceI[] sequences, Map<String, Object> visible,
          boolean visOnly, boolean nonpos, String group) {
    boolean written = false;
    for (SequenceI sequence : sequences) {
      SequenceFeature[] features = sequence.getSequenceFeatures();
      if (features == null) {
        continue;
      }
      for (SequenceFeature sf : features) {
        if (isFeatureExcluded(sf, visible, visOnly, nonpos)) {
          // skip if feature is nonpos and we ignore them or if we only
          // output visible and it isn't non-pos and it's not visible
          continue;
        }
        boolean inGroup = (group == null) ? sf.featureGroup == null : group
                .equals(sf.featureGroup);
        if (!inGroup) {
          continue;
        }

        // we have features to output
        written = true;
        appendJalviewDescription(out, sf);
        out.append(sequence.getName());
        out.append("\t-1\t");
        out.append(sf.begin);
        out.append(TAB);
        out.append(sf.end);
        out.append(TAB);
        out.append(sf.type);
        if (!Float.isNaN(sf.score)) {
          out.append(TAB);
          out.append(sf.score);
        }
        out.append(newline);
      }
    }
    return written;
  }

  /**
   * Appends the description column (followed by a tab) for a feature: the
   * feature type if there is no description, otherwise the description plus
   * an anchor for each link whose URL is not already in the description,
   * wrapped in html tags if the feature has links.
   */
  private static void appendJalviewDescription(StringBuilder out,
          SequenceFeature sf) {
    if (sf.description == null || sf.description.equals("")) {
      out.append(sf.type).append(TAB);
      return;
    }

    if (sf.links != null && sf.getDescription().indexOf("<html>") == -1) {
      out.append("<html>");
    }

    out.append(sf.description + " ");
    if (sf.links != null) {
      for (Object linkEntry : sf.links) {
        // links are held as label|url
        String link = linkEntry.toString();
        int bar = link.indexOf("|");
        String href = link.substring(bar + 1);
        // a link without a '|' is used as both label and url
        String label = (bar < 0) ? link : link.substring(0, bar);

        if (sf.description.indexOf(href) == -1) {
          out.append("<a href=\"" + href + "\">" + label + "</a>");
        }
      }

      if (sf.getDescription().indexOf("</html>") == -1) {
        out.append("</html>");
      }
    }

    out.append(TAB);
  }

  /**
   * Parse method that is called when a GFF file is dragged to the desktop
   */
  @Override
  public void parse() {
    AlignViewportI av = getViewport();
    if (av != null) {
      if (av.getAlignment() != null) {
        dataset = av.getAlignment().getDataset();
      }
      if (dataset == null) {
        // working in the applet context ?
        dataset = av.getAlignment();
      }
    } else {
      dataset = new Alignment(new SequenceI[] {});
    }

    // feature colours are not kept here; parse() tolerates a null colour map
    boolean parseResult = parse(dataset, null, false, true);
    if (!parseResult) {
      // pass error up somehow
    }
    if (av != null) {
      // update viewport with the dataset data ?
    } else {
      setSeqs(dataset.getSequencesArray());
    }
  }

  /**
   * Implementation of unused abstract method
   * 
   * @return error message
   */
  @Override
  public String print() {
    return "Use printGffFormat() or printJalviewFormat()";
  }

  /**
   * Returns features output in GFF2 format; non-positional features are
   * included, and positional features are output only if their type is a key
   * of the visible map
   * 
   * @param sequences
   *          the sequences whose features are to be output
   * @param visible
   *          a map whose keys are the type names of visible features
   * @return features file contents
   */
  public String printGffFormat(SequenceI[] sequences,
          Map<String, Object> visible) {
    return printGffFormat(sequences, visible, true, true);
  }

  /**
   * Returns features output in GFF2 format
   * 
   * @param sequences
   *          the sequences whose features are to be output
   * @param visible
   *          a map whose keys are the type names of visible features; may be
   *          null, in which case no positional feature counts as visible
   * @param outputVisibleOnly
   *          if true, positional features are output only if their type is
   *          visible
   * @param includeNonPositionalFeatures
   *          if true, features with begin and end both 0 are output
   * @return features file contents
   */
  public String printGffFormat(SequenceI[] sequences,
          Map<String, Object> visible, boolean outputVisibleOnly,
          boolean includeNonPositionalFeatures) {
    StringBuilder out = new StringBuilder(256);
    // gffVersion is 0 if no GFF has been parsed; declare GFF2 in that case
    int version = (gffVersion == 0) ? 2 : gffVersion;
    out.append(String.format("%s %d\n", GFF_VERSION, version));
    String source;
    for (SequenceI seq : sequences) {
      SequenceFeature[] features = seq.getSequenceFeatures();
      if (features == null) {
        continue;
      }
      for (SequenceFeature sf : features) {
        // TODO why are non-positional features exempt from the visibility
        // test? what about not visible non-positional features?
        if (isFeatureExcluded(sf, visible, outputVisibleOnly,
                includeNonPositionalFeatures)) {
          /*
           * ignore non-positional or not visible features if not wanted
           */
          continue;
        }

        source = sf.featureGroup;
        if (source == null) {
          source = sf.getDescription();
        }

        out.append(seq.getName());
        out.append(TAB);
        out.append(source);
        out.append(TAB);
        out.append(sf.type);
        out.append(TAB);
        out.append(sf.begin);
        out.append(TAB);
        out.append(sf.end);
        out.append(TAB);
        out.append(sf.score);
        out.append(TAB);

        out.append(sf.getValue(STRAND, "."));
        out.append(TAB);

        out.append(sf.getValue(FRAME, "."));

        // miscellaneous key-values (GFF column 9)
        String attributes = (String) sf.getValue(ATTRIBUTES);
        if (attributes != null) {
          out.append(TAB).append(attributes);
        }

        out.append(newline);
      }
    }

    return out.toString();
  }

  /**
   * Helper method to make a mapping given a set of attributes for a GFF feature
   * 
   * @param set
   *          attribute name to list of values
   * @param attr
   *          name of the attribute holding the alignment ranges
   * @param strand
   *          either 1 (forward) or -1 (reverse)
   * @return a MapList built from the ranges with ratio 3:1
   * @throws InvalidGFF3FieldException
   *           if strand is 0, the attribute is absent, or a value is not
   *           three integers
   */
  protected MapList constructCodonMappingFromAlign(
          Map<String, List<String>> set, String attr, int strand)
          throws InvalidGFF3FieldException {
    if (strand == 0) {
      throw new InvalidGFF3FieldException(attr, set,
              "Invalid strand for a codon mapping (cannot be 0)");
    }
    List<String> ranges = set.get(attr);
    if (ranges == null) {
      throw new InvalidGFF3FieldException(attr, set,
              "Missing attribute for a codon mapping");
    }
    List<Integer> fromrange = new ArrayList<Integer>();
    List<Integer> torange = new ArrayList<Integer>();
    int lastppos = 0, lastpframe = 0;
    for (String range : ranges) {
      List<Integer> ints = new ArrayList<Integer>();
      StringTokenizer st = new StringTokenizer(range, " ");
      while (st.hasMoreTokens()) {
        String num = st.nextToken();
        try {
          ints.add(Integer.valueOf(num));
        } catch (NumberFormatException nfe) {
          throw new InvalidGFF3FieldException(attr, set,
                  "Invalid number in field " + num);
        }
      }
      /*
       * Align positionInRef positionInQuery LengthInRef
       * contig_1146 exonerate:p2g:local similarity 8534 11269 3652 - . 
       *    alignment_id 0 ; Query DDB_G0269124 Align 11270 143 120
       * means:
       * 120 bases align at pos 143 in protein to 11270 on dna (-ve strand)
       * and so on for additional ' ; Align x y z' groups
       */
      if (ints.size() != 3) {
        throw new InvalidGFF3FieldException(attr, set,
                "Invalid number of fields for this attribute ("
                        + ints.size() + ")");
      }
      int refPos = ints.get(0);
      int queryPos = ints.get(1);
      int refLength = ints.get(2);

      fromrange.add(refPos);
      fromrange.add(refPos + strand * refLength);
      // how are intron/exon boundaries that do not align in codons
      // represented
      if (queryPos == lastppos && lastpframe > 0) {
        // extend existing to map
        lastppos += refLength / 3;
        lastpframe = refLength % 3;
        torange.set(torange.size() - 1, Integer.valueOf(lastppos));
      } else {
        // new to map range
        torange.add(queryPos);
        lastppos = queryPos + refLength / 3;
        lastpframe = refLength % 3;
        torange.add(Integer.valueOf(lastppos));
      }
    }
    // from and to ranges must end up being a series of start/end intervals
    if (fromrange.size() % 2 == 1) {
      throw new InvalidGFF3FieldException(attr, set,
              "Couldn't parse the DNA alignment range correctly");
    }
    if (torange.size() % 2 == 1) {
      throw new InvalidGFF3FieldException(attr, set,
              "Couldn't parse the protein alignment range correctly");
    }
    // finally, build the map
    int[] frommap = new int[fromrange.size()];
    int[] tomap = new int[torange.size()];
    int p = 0;
    for (Integer ip : fromrange) {
      frommap[p++] = ip.intValue();
    }
    p = 0;
    for (Integer ip : torange) {
      tomap[p++] = ip.intValue();
    }

    return new MapList(frommap, tomap, 3, 1);
  }

  /**
   * Looks up (or creates placeholders for) each of the given sequence ids
   * using findName.
   * 
   * @return the sequences found; empty if list is null or nothing was found
   */
  private List<SequenceI> findNames(AlignmentI align,
          List<SequenceI> newseqs, boolean relaxedIdMatching,
          List<String> list) {
    List<SequenceI> found = new ArrayList<SequenceI>();
    if (list == null) {
      return found;
    }
    for (String seqId : list) {
      SequenceI seq = findName(align, seqId, relaxedIdMatching, newseqs);
      if (seq != null) {
        found.add(seq);
      }
    }
    return found;
  }

  /**
   * Parse a GFF format feature. This may include creating a 'dummy' sequence
   * for the feature or its mapped sequence
   * 
   * @param st
   *          tokenizer over the tab-separated fields of the line
   * @param alignment
   *          alignment in which to look up the sequence
   * @param relaxedIdmatching
   *          passed on to findName
   * @param newseqs
   *          list to which any placeholder sequence created is added
   * @return the sequence found (or created) for the line's sequence id
   */
  protected SequenceI parseGffFeature(StringTokenizer st,
          AlignmentI alignment, boolean relaxedIdmatching,
          List<SequenceI> newseqs) {
    SequenceI seq;
    /*
     * GFF: seqid source type start end score strand phase [attributes]
     */
    String seqId = st.nextToken();

    /*
     * locate referenced sequence in alignment _or_ 
     * as a forward reference (SequenceDummy)
     */
    seq = findName(alignment, seqId, relaxedIdmatching, newseqs);

    String desc = st.nextToken();
    String group = null;
    if (desc.indexOf(' ') == -1) {
      // could also be a source term rather than description line
      group = desc;
    }
    String ft = st.nextToken();
    int startPos = StringUtils.parseInt(st.nextToken());
    int endPos = StringUtils.parseInt(st.nextToken());
    // TODO: decide if non positional feature assertion for input data
    // where end==0 is generally valid
    if (endPos == 0) {
      // treat as non-positional feature, regardless.
      startPos = 0;
    }
    float score = 0f;
    try {
      score = Float.parseFloat(st.nextToken());
    } catch (NumberFormatException ex) {
      // leave at 0
    }

    SequenceFeature sf = new SequenceFeature(ft, desc, startPos, endPos,
            score, group);
    if (st.hasMoreTokens()) {
      sf.setValue(STRAND, st.nextToken());
    }
    if (st.hasMoreTokens()) {
      sf.setValue(FRAME, st.nextToken());
    }

    if (st.hasMoreTokens()) {
      String attributes = st.nextToken();
      sf.setValue(ATTRIBUTES, attributes);

      /*
       * parse semi-structured attributes in column 9 and add them to the
       * sequence feature's 'otherData' table; use Note as a best proxy for
       * description
       */
      Map<String, List<String>> nameValues = StringUtils
              .parseNameValuePairs(attributes, ";", new char[] { ' ', '=' });
      for (Entry<String, List<String>> attr : nameValues.entrySet()) {
        String values = StringUtils.listToDelimitedString(attr.getValue(),
                "; ");
        sf.setValue(attr.getKey(), values);
        if ("Note".equals(attr.getKey())) {
          sf.setDescription(values);
        }
      }
    }

    if (processOrAddSeqFeature(alignment, newseqs, seq, sf,
            relaxedIdmatching)) {
      // check whether we should add the sequence feature to any other
      // sequences in the alignment with the same or similar id; a separate
      // variable is used so that the sequence to return is not overwritten
      SequenceI other = seq;
      while ((other = alignment.findName(other, seqId, true)) != null) {
        other.addSequenceFeature(new SequenceFeature(sf));
      }
    }
    return seq;
  }

  /**
   * After encountering ##fasta in a GFF3 file, process the remainder of the
   * file as FASTA sequence data. Any placeholder sequences created during
   * feature parsing are updated with the actual sequences.
   * 
   * @param align
   *          alignment to which the sequences are added
   * @param newseqs
   *          placeholder sequences created so far
   * @throws IOException
   */
  protected void processAsFasta(AlignmentI align, List<SequenceI> newseqs)
          throws IOException {
    try {
      mark();
    } catch (IOException q) {
      // failure to mark the stream is ignored; parsing continues regardless
    }
    FastaFile parser = new FastaFile(this);
    List<SequenceI> includedseqs = parser.getSeqs();
    SequenceIdMatcher smatcher = new SequenceIdMatcher(newseqs);
    // iterate over includedseqs, and replacing matching ones with newseqs
    // sequences. Generic iterator not used here because we modify includedseqs
    // as we go
    for (int p = 0, pSize = includedseqs.size(); p < pSize; p++) {
      // mseq is the 'template' imported from the FASTA file which we'll use
      // to complete dummyseq
      SequenceI mseq = includedseqs.get(p);
      // search for any dummy seqs that this sequence can be used to update
      SequenceI dummyseq = smatcher.findIdMatch(mseq);
      // dummyseq was created so it could be annotated and referred to in
      // alignments/codon mappings
      if (dummyseq instanceof SequenceDummy) {
        // probably have the pattern wrong
        // idea is that a flyweight proxy for a sequence ID can be created for
        // 1. stable reference creation
        // 2. addition of annotation
        // 3. future replacement by a real sequence
        // current pattern is to create SequenceDummy objects - a convenience
        // constructor for a Sequence.
        // problem is that when promoted to a real sequence, all references
        // need to be updated somehow.
        ((SequenceDummy) dummyseq).become(mseq);
        includedseqs.set(p, dummyseq); // template is no longer needed
      }
    }
    // finally add sequences to the dataset
    for (SequenceI seq : includedseqs) {
      align.addSequence(seq);
    }
  }

  /**
   * Process a ## directive
   * 
   * @param line
   *          the pragma line, starting with ##
   * @param gffProps
   *          map to which the species-build value (if any) is added
   * @param align
   *          alignment passed to processAsFasta for a ##fasta pragma
   * @param newseqs
   *          placeholder sequences passed to processAsFasta
   * @throws IOException
   */
  protected void processGffPragma(String line,
          Map<String, String> gffProps, AlignmentI align,
          List<SequenceI> newseqs) throws IOException {
    line = line.trim();
    if ("###".equals(line)) {
      // close off any open 'forward references'
      return;
    }

    // split on runs of whitespace so repeated spaces do not hide the value
    String[] tokens = line.substring(2).split("\\s+");
    String pragma = tokens[0];
    String value = tokens.length == 1 ? null : tokens[1];

    if ("gff-version".equalsIgnoreCase(pragma)) {
      if (value != null) {
        try {
          // value may be e.g. "3.1.2"
          gffVersion = Integer.parseInt(value.split("\\.")[0]);
        } catch (NumberFormatException e) {
          // ignore
        }
      }
    } else if ("feature-ontology".equalsIgnoreCase(pragma)) {
      // should resolve against the specified feature ontology URI
    } else if ("attribute-ontology".equalsIgnoreCase(pragma)) {
      // URI of attribute ontology - not currently used in GFF3
    } else if ("source-ontology".equalsIgnoreCase(pragma)) {
      // URI of source ontology - not currently used in GFF3
    } else if ("species-build".equalsIgnoreCase(pragma)) {
      // save URI of specific NCBI taxon version of annotations
      gffProps.put("species-build", value);
    } else if ("fasta".equalsIgnoreCase(pragma)) {
      // process the rest of the file as a fasta file and replace any dummy
      // sequence IDs
      processAsFasta(align, newseqs);
    } else {
      System.err.println("Ignoring unknown pragma: " + line);
    }
  }

  /**
   * Processes the 'Query' and 'Align' properties associated with a GFF
   * similarity feature; these properties define the mapping of the annotated
   * feature to another from which it has transferred annotation
   * 
   * @param set
   *          attribute name to list of values
   * @param seq
   *          the sequence the feature is located on
   * @param sf
   *          the similarity feature (its strand is used for the mapping)
   * @param align
   *          alignment to which any codon mapping is added
   * @param newseqs
   *          placeholder sequences created so far
   * @param relaxedIdMatching
   *          passed on to findName
   * @throws InvalidGFF3FieldException
   *           if Query does not resolve to exactly one sequence, or the Align
   *           values cannot be parsed
   */
  public void processGffSimilarity(Map<String, List<String>> set,
          SequenceI seq, SequenceFeature sf, AlignmentI align,
          List<SequenceI> newseqs, boolean relaxedIdMatching)
          throws InvalidGFF3FieldException {
    int strand = sf.getStrand();
    // exonerate cdna/protein map
    // look for fields
    List<SequenceI> querySeq = findNames(align, newseqs, relaxedIdMatching,
            set.get("Query"));
    if (querySeq == null || querySeq.size() != 1) {
      throw new InvalidGFF3FieldException("Query", set,
              "Expecting exactly one sequence in Query field (got "
                      + set.get("Query") + ")");
    }
    if (set.containsKey("Align")) {
      // process the align maps and create cdna/protein maps
      // ideally, the query sequences are in the alignment, but maybe not...

      AlignedCodonFrame alco = new AlignedCodonFrame();
      MapList codonmapping = constructCodonMappingFromAlign(set, "Align",
              strand);

      // add codon mapping, and hope!
      alco.addMap(seq, querySeq.get(0), codonmapping);
      align.addCodonFrame(alco);
    }
  }

  /**
   * take a sequence feature and examine its attributes to decide how it should
   * be added to a sequence
   * 
   * @param align
   *          - alignment passed on to processGffSimilarity
   * @param newseqs
   *          - placeholder sequences created so far
   * @param seq
   *          - the destination sequence constructed or discovered in the
   *          current context
   * @param sf
   *          - the base feature with ATTRIBUTES property containing any
   *          additional attributes
   * @param relaxedIdMatching
   *          - passed on to processGffSimilarity
   * @return true if sf was actually added to the sequence, false if it was
   *         processed in another way
   */
  public boolean processOrAddSeqFeature(AlignmentI align,
          List<SequenceI> newseqs, SequenceI seq, SequenceFeature sf,
          boolean relaxedIdMatching) {
    String attr = (String) sf.getValue(ATTRIBUTES);
    boolean addFeature = true;
    if (attr != null) {
      for (String attset : attr.split(TAB)) {
        // NOTE(review): parseGffFeature splits name/value pairs on ' ' or
        // '='; here ' ' or '-' is used - confirm which is intended
        Map<String, List<String>> set = StringUtils.parseNameValuePairs(
                attset, ";", new char[] { ' ', '-' });

        if ("similarity".equals(sf.getType())) {
          try {
            processGffSimilarity(set, seq, sf, align, newseqs,
                    relaxedIdMatching);
            addFeature = false;
          } catch (InvalidGFF3FieldException ivfe) {
            // report, and fall back to adding the feature as it stands
            System.err.println(ivfe);
          }
        }
      }
    }
    if (addFeature) {
      seq.addSequenceFeature(sf);
    }
    return addFeature;
  }

}

/**
 * Exception raised when a GFF attribute needed to build a mapping is missing
 * or cannot be interpreted.
 */
class InvalidGFF3FieldException extends Exception {
  String field, value;

  /**
   * @param field
   *          name of the offending attribute
   * @param set
   *          attribute name to list of values; the field need not be present
   * @param message
   *          description of the problem
   */
  public InvalidGFF3FieldException(String field,
          Map<String, List<String>> set, String message) {
    super(message + " (Field was " + field + " and value was "
            + describe(set, field) + ")");
    this.field = field;
    this.value = describe(set, field);
  }

  /**
   * Returns the string form of the field's values, or "null" if the map or
   * the field is absent.
   */
  private static String describe(Map<String, List<String>> set, String field) {
    return (set == null) ? "null" : String.valueOf(set.get(field));
  }
}