X-Git-Url: http://source.jalview.org/gitweb/?a=blobdiff_plain;ds=sidebyside;f=src%2Fjalview%2Fio%2FFeaturesFile.java;h=bd7127ff225c4f72767ddd7b69d4fcce6f069024;hb=5c6b5740b5bc6ac8e89dc04fe0a3542ee76cc22e;hp=ee6ba11a18cdd9615055a351cf8211992330eed1;hpb=26ba864a6c290121fe6cf616794d2d0bea65fb7d;p=jalview.git diff --git a/src/jalview/io/FeaturesFile.java b/src/jalview/io/FeaturesFile.java index ee6ba11..bd7127f 100755 --- a/src/jalview/io/FeaturesFile.java +++ b/src/jalview/io/FeaturesFile.java @@ -65,6 +65,16 @@ import java.util.StringTokenizer; */ public class FeaturesFile extends AlignFile { + private static final String NOTE = "Note"; + + private static final String ALIGN = "Align"; + + private static final String QUERY = "Query"; + + private static final String TARGET = "Target"; + + private static final String SIMILARITY = "similarity"; + protected static final String STRAND = "STRAND"; protected static final String FRAME = "FRAME"; @@ -284,26 +294,25 @@ public class FeaturesFile extends AlignFile */ protected boolean parseJalviewFeature(String line, StringTokenizer st, AlignmentI alignment, Map featureColours, - boolean removeHTML, boolean relaxedIdmatching, String featureGroup) + boolean removeHTML, boolean relaxedIdMatching, String featureGroup) { /* * Jalview: description seqid seqIndex start end type [score] */ - String desc = st.nextToken(); - String seqId = st.nextToken(); - SequenceI seq = findName(alignment, seqId, relaxedIdmatching, null); - if (!st.hasMoreTokens()) + if (st.countTokens() < 6) { - System.err - .println("DEBUG: Run out of tokens when trying to identify the destination for the feature.. giving up."); - // in all probability, this isn't a file we understand, so bail - // quietly. + System.err.println("Ignoring feature line '" + line + + "' with unexpected number of columns (" + st.countTokens() + + ")"); return false; } + String desc = st.nextToken(); + String seqId = st.nextToken(); + SequenceI seq = findName(alignment, null, relaxedIdMatching, seqId); if (!seqId.equals("ID_NOT_SPECIFIED")) { - seq = findName(alignment, seqId, relaxedIdmatching, null); + seq = findName(alignment, null, relaxedIdMatching, seqId); st.nextToken(); } else @@ -599,23 +608,24 @@ public class FeaturesFile extends AlignFile /** * Returns a sequence matching the given id, as follows * * * @param align - * @param seqId - * @param relaxedIdMatching * @param newseqs + * @param relaxedIdMatching + * @param seqId * @return */ - protected SequenceI findName(AlignmentI align, String seqId, - boolean relaxedIdMatching, List newseqs) + protected SequenceI findName(AlignmentI align, List newseqs, + boolean relaxedIdMatching, String seqId) { SequenceI match = null; if (relaxedIdMatching) @@ -1075,118 +1085,88 @@ public class FeaturesFile extends AlignFile } /** - * Helper method to make a mapping given a set of attributes for a GFF feature + * Returns a mapping given list of one or more Align descriptors (exonerate + * format) * - * @param set - * @param attr + * @param alignedRegions + * a list of "Align fromStart toStart fromCount" + * @param mapIsFromCdna + * if true, 'from' is dna, else 'from' is protein * @param strand * either 1 (forward) or -1 (reverse) * @return - * @throws InvalidGFF3FieldException + * @throws IOException */ protected MapList constructCodonMappingFromAlign( - Map> set, String attr, - int strand) throws InvalidGFF3FieldException + List alignedRegions, boolean mapIsFromCdna, int strand) + throws IOException { if (strand == 0) { - throw new InvalidGFF3FieldException(attr, set, + throw new IOException( "Invalid strand for a codon mapping (cannot be 0)"); } - List fromrange = new ArrayList(); - List torange = new ArrayList(); - int lastppos = 0, lastpframe = 0; - for (String range : set.get(attr)) - { - List ints = new ArrayList(); - StringTokenizer st = new StringTokenizer(range, " "); - while (st.hasMoreTokens()) - { - String num = st.nextToken(); - try - { - ints.add(new Integer(num)); - } catch (NumberFormatException nfe) - { - throw new InvalidGFF3FieldException(attr, set, - "Invalid number in field " + num); - } - } + int regions = alignedRegions.size(); + // arrays to hold [start, end] for each aligned region + int[] fromRanges = new int[regions * 2]; // from dna + int[] toRanges = new int[regions * 2]; // to protein + int fromRangesIndex = 0; + int toRangesIndex = 0; + + for (String range : alignedRegions) + { /* - * Align positionInRef positionInQuery LengthInRef - * contig_1146 exonerate:p2g:local similarity 8534 11269 3652 - . - * alignment_id 0 ; Query DDB_G0269124 Align 11270 143 120 + * Align mapFromStart mapToStart mapFromCount + * e.g. if mapIsFromCdna + * Align 11270 143 120 * means: - * 120 bases align at pos 143 in protein to 11270 on dna (-ve strand) - * and so on for additional ' ; Align x y z' groups + * 120 bases from pos 11270 align to pos 143 in peptide + * if !mapIsFromCdna this would instead be + * Align 143 11270 40 */ - if (ints.size() != 3) + String[] tokens = range.split(" "); + if (tokens.length != 3) { - throw new InvalidGFF3FieldException(attr, set, - "Invalid number of fields for this attribute (" - + ints.size() + ")"); + throw new IOException("Wrong number of fields for Align"); } - fromrange.add(ints.get(0)); - fromrange.add(ints.get(0) + strand * ints.get(2)); - // how are intron/exon boundaries that do not align in codons - // represented - if (ints.get(1).intValue() == lastppos && lastpframe > 0) + int fromStart = 0; + int toStart = 0; + int fromCount = 0; + try { - // extend existing to map - lastppos += ints.get(2) / 3; - lastpframe = ints.get(2) % 3; - torange.set(torange.size() - 1, new Integer(lastppos)); - } - else + fromStart = Integer.parseInt(tokens[0]); + toStart = Integer.parseInt(tokens[1]); + fromCount = Integer.parseInt(tokens[2]); + } catch (NumberFormatException nfe) { - // new to map range - torange.add(ints.get(1)); - lastppos = ints.get(1) + ints.get(2) / 3; - lastpframe = ints.get(2) % 3; - torange.add(new Integer(lastppos)); + throw new IOException("Invalid number in Align field: " + + nfe.getMessage()); } - } - // from and to ranges must end up being a series of start/end intervals - if (fromrange.size() % 2 == 1) - { - throw new InvalidGFF3FieldException(attr, set, - "Couldn't parse the DNA alignment range correctly"); - } - if (torange.size() % 2 == 1) - { - throw new InvalidGFF3FieldException(attr, set, - "Couldn't parse the protein alignment range correctly"); - } - // finally, build the map - int[] frommap = new int[fromrange.size()], tomap = new int[torange - .size()]; - int p = 0; - for (Integer ip : fromrange) - { - frommap[p++] = ip.intValue(); - } - p = 0; - for (Integer ip : torange) - { - tomap[p++] = ip.intValue(); - } - - return new MapList(frommap, tomap, 3, 1); - } - private List findNames(AlignmentI align, List newseqs, boolean relaxedIdMatching, - List list) - { - List found = new ArrayList(); - for (String seqId : list) - { - SequenceI seq = findName(align, seqId, relaxedIdMatching, newseqs); - if (seq != null) + /* + * Jalview always models from dna to protein, so adjust values if the + * GFF mapping is from protein to dna + */ + if (!mapIsFromCdna) { - found.add(seq); + fromCount *= 3; + int temp = fromStart; + fromStart = toStart; + toStart = temp; } + fromRanges[fromRangesIndex++] = fromStart; + fromRanges[fromRangesIndex++] = fromStart + strand * (fromCount - 1); + + /* + * If a codon has an intron gap, there will be contiguous 'toRanges'; + * this is handled for us by the MapList constructor. + * (It is not clear that exonerate ever generates this case) + */ + toRanges[toRangesIndex++] = toStart; + toRanges[toRangesIndex++] = toStart + (fromCount - 1) / 3; } - return found; + + return new MapList(fromRanges, toRanges, 3, 1); } /** @@ -1195,24 +1175,32 @@ public class FeaturesFile extends AlignFile * * @param st * @param alignment - * @param relaxedIdmatching + * @param relaxedIdMatching * @param newseqs * @return */ - protected SequenceI parseGffFeature(StringTokenizer st, AlignmentI alignment, boolean relaxedIdmatching, + protected SequenceI parseGffFeature(StringTokenizer st, + AlignmentI alignment, boolean relaxedIdMatching, List newseqs) { SequenceI seq; /* * GFF: seqid source type start end score strand phase [attributes] */ + if (st.countTokens() < 8) + { + System.err + .println("Ignoring GFF feature line with unexpected number of columns (" + + st.countTokens() + ")"); + return null; + } String seqId = st.nextToken(); /* * locate referenced sequence in alignment _or_ * as a forward reference (SequenceDummy) */ - seq = findName(alignment, seqId, relaxedIdmatching, newseqs); + seq = findName(alignment, newseqs, relaxedIdMatching, seqId); String desc = st.nextToken(); String group = null; @@ -1253,30 +1241,11 @@ public class FeaturesFile extends AlignFile if (st.hasMoreTokens()) { - String attributes = st.nextToken(); - sf.setValue(ATTRIBUTES, attributes); - - /* - * parse semi-structured attributes in column 9 and add them to the - * sequence feature's 'otherData' table; use Note as a best proxy for - * description - */ - Map> nameValues = StringUtils.parseNameValuePairs(attributes, ";", - new char[] { ' ', '=' }); - for (Entry> attr : nameValues.entrySet()) - { - String values = StringUtils.listToDelimitedString(attr.getValue(), - "; "); - sf.setValue(attr.getKey(), values); - if ("Note".equals(attr.getKey())) - { - sf.setDescription(values); - } - } + processGffColumnNine(st.nextToken(), sf); } if (processOrAddSeqFeature(alignment, newseqs, seq, sf, - relaxedIdmatching)) + relaxedIdMatching)) { // check whether we should add the sequence feature to any other // sequences in the alignment with the same or similar @@ -1289,6 +1258,37 @@ public class FeaturesFile extends AlignFile } /** + * Process the 'column 9' data of the GFF file. This is less formally defined, + * and its interpretation will vary depending on the tool that has generated + * it. + * + * @param attributes + * @param sf + */ + protected void processGffColumnNine(String attributes, SequenceFeature sf) + { + sf.setValue(ATTRIBUTES, attributes); + + /* + * Parse attributes in column 9 and add them to the sequence feature's + * 'otherData' table; use Note as a best proxy for description + */ + char[] nameValueSeparator = new char[] { gffVersion == 3 ? '=' : ' ' }; + Map> nameValues = StringUtils.parseNameValuePairs(attributes, ";", + nameValueSeparator); + for (Entry> attr : nameValues.entrySet()) + { + String values = StringUtils.listToDelimitedString(attr.getValue(), + "; "); + sf.setValue(attr.getKey(), values); + if (NOTE.equals(attr.getKey())) + { + sf.setDescription(values); + } + } + } + + /** * After encountering ##fasta in a GFF3 file, process the remainder of the * file as FAST sequence data. Any placeholder sequences created during * feature parsing are updated with the actual sequences. @@ -1415,44 +1415,100 @@ public class FeaturesFile extends AlignFile } /** - * Processes the 'Query' and 'Align' properties associated with a GFF - * similarity feature; these properties define the mapping of the annotated - * feature to another from which it has transferred annotation + * Processes the 'Query' (or 'Target') and 'Align' properties associated with + * an exonerate GFF similarity feature; these properties define the mapping of + * the annotated feature (e.g. 'exon') to a related sequence. * * @param set * @param seq * @param sf - * @return + * @param align + * @param newseqs + * @param relaxedIdMatching + * @throws IOException */ public void processGffSimilarity(Map> set, SequenceI seq, SequenceFeature sf, AlignmentI align, List newseqs, boolean relaxedIdMatching) - throws InvalidGFF3FieldException + throws IOException { + if (!validateExonerateModel(sf)) + { + return; + } + int strand = sf.getStrand(); - // exonerate cdna/protein map - // look for fields - List querySeq = findNames(align, newseqs, relaxedIdMatching, - set.get("Query")); - if (querySeq == null || querySeq.size() != 1) - { - throw new InvalidGFF3FieldException("Query", set, - "Expecting exactly one sequence in Query field (got " - + set.get("Query") + ")"); + + /* + * exonerate (protein2dna or protein2genome) may be run with + * --showquerygff outputs + * Target ; Align proteinStartPos dnaStartPos peptideCount + * --showtargetgff outputs + * Query ; Align dnaStartPos proteinStartPos nucleotideCount + * where the Align spec may repeat + */ + boolean mapIsFromCdna = true; + List mapTo = set.get(QUERY); + if (mapTo == null) + { + mapTo = set.get(TARGET); + mapIsFromCdna = false; } - if (set.containsKey("Align")) + if (mapTo == null || mapTo.size() != 1) { - // process the align maps and create cdna/protein maps - // ideally, the query sequences are in the alignment, but maybe not... - - AlignedCodonFrame alco = new AlignedCodonFrame(); - MapList codonmapping = constructCodonMappingFromAlign(set, "Align", - strand); - - // add codon mapping, and hope! - alco.addMap(seq, querySeq.get(0), codonmapping); - align.addCodonFrame(alco); + throw new IOException( + "Expecting exactly one sequence in Query field (got " + mapTo + + ")"); } + + /* + * locate the mapped sequence in the alignment or 'new' (GFF file) sequences; + */ + SequenceI mappedSequence = findName(align, newseqs, relaxedIdMatching, + mapTo.get(0)); + /* + * Process the Align maps and create cdna/protein maps; + * ideally, the query sequences are in the alignment, but maybe not... + */ + AlignedCodonFrame alco = new AlignedCodonFrame(); + MapList codonmapping = constructCodonMappingFromAlign(set.get(ALIGN), + mapIsFromCdna, strand); + /* + * Jalview always maps from dna to protein + */ + if (mapIsFromCdna) + { + alco.addMap(seq, mappedSequence, codonmapping); + } + else + { + alco.addMap(mappedSequence, seq, codonmapping); + } + align.addCodonFrame(alco); + } + + /** + * Returns true if the exonerate model (saved from column 2 of the GFF as the + * SequenceFeature's group) is one that we are willing to process, else false + * + * @param sf + */ + protected boolean validateExonerateModel(SequenceFeature sf) + { + /* + * we don't handle protein-to-protein or dna-to-dna alignment here + */ + String source = sf.getFeatureGroup(); + if (source == null + || (!source.contains("protein2dna") && !source + .contains("protein2genome"))) + { + System.err + .println("I only accept protein2dna or protein2genome but found " + + source); + return false; + } + return true; } /** @@ -1482,14 +1538,14 @@ public class FeaturesFile extends AlignFile Map> set = StringUtils.parseNameValuePairs( attset, ";", new char[] { ' ', '-' }); - if ("similarity".equals(sf.getType())) + if (SIMILARITY.equals(sf.getType())) { try { + addFeature = false; processGffSimilarity(set, seq, sf, align, newseqs, relaxedIdMatching); - addFeature = false; - } catch (InvalidGFF3FieldException ivfe) + } catch (IOException ivfe) { System.err.println(ivfe); } @@ -1504,17 +1560,3 @@ public class FeaturesFile extends AlignFile } } - -class InvalidGFF3FieldException extends Exception -{ - String field, value; - - public InvalidGFF3FieldException(String field, - Map> set, String message) - { - super(message + " (Field was " + field + " and value was " - + set.get(field).toString()); - this.field = field; - this.value = set.get(field).toString(); - } -}