/*
 * Recovered from the jalview.git blob diff for
 * src/jalview/io/gff/GffHelperBase.java (blob feeec1d).
 */
package jalview.io.gff;

import jalview.analysis.SequenceIdMatcher;
import jalview.datamodel.AlignedCodonFrame;
import jalview.datamodel.AlignmentI;
import jalview.datamodel.MappingType;
import jalview.datamodel.SequenceDummy;
import jalview.datamodel.SequenceFeature;
import jalview.datamodel.SequenceI;
import jalview.util.MapList;
import jalview.util.StringUtils;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;

/**
 * Base class with common functionality for flavours of GFF handler (GFF2 or
 * GFF3)
 */
public abstract class GffHelperBase implements GffHelperI
{
  private static final String NOTE = "Note";

  /*
   * GFF columns 1-9 (zero-indexed):
   */
  protected static final int SEQID_COL = 0;

  protected static final int SOURCE_COL = 1;

  protected static final int TYPE_COL = 2;

  protected static final int START_COL = 3;

  protected static final int END_COL = 4;

  protected static final int SCORE_COL = 5;

  protected static final int STRAND_COL = 6;

  protected static final int PHASE_COL = 7;

  protected static final int ATTRIBUTES_COL = 8;

  /*
   * the alignment for which 'matcher' was last built; used by findSequence
   * to avoid rebuilding the matcher on every call
   */
  private AlignmentI lastmatchedAl = null;

  private SequenceIdMatcher matcher = null;

  /**
   * Constructs and returns a mapping, or null if data appear invalid (that
   * is, if the two ranges cannot be trimmed to equivalent lengths by
   * {@link #trimMapping(int[], int[], int, int)}).
   * 
   * @param fromStart
   *          start position of the 'from' range
   * @param fromEnd
   *          end position of the 'from' range
   * @param toStart
   *          start position of the 'to' range
   * @param toEnd
   *          end position of the 'to' range
   * @param mappingType
   *          type of mapping (e.g. protein to nucleotide)
   * @return the mapping, or null if the range lengths could not be reconciled
   */
  protected MapList constructMappingFromAlign(int fromStart, int fromEnd,
          int toStart, int toEnd, MappingType mappingType)
  {
    int[] from = new int[] { fromStart, fromEnd };
    int[] to = new int[] { toStart, toEnd };

    /*
     * Jalview always models from dna to protein, so switch values if the
     * GFF mapping is from protein to dna
     */
    if (mappingType == MappingType.PeptideToNucleotide)
    {
      int[] temp = from;
      from = to;
      to = temp;
      mappingType = mappingType.getInverse();
    }

    int fromRatio = mappingType.getFromRatio();
    int toRatio = mappingType.getToRatio();

    /*
     * sanity check that mapped residue counts match
     * TODO understand why PASA generates such cases...
     */
    if (!trimMapping(from, to, fromRatio, toRatio))
    {
      System.err.println("Ignoring mapping from " + Arrays.toString(from)
              + " to " + Arrays.toString(to) + " as counts don't match!");
      return null;
    }

    /*
     * If a codon has an intron gap, there will be contiguous 'toRanges';
     * this is handled for us by the MapList constructor.
     * (It is not clear that exonerate ever generates this case)
     */

    return new MapList(from, to, fromRatio, toRatio);
  }

  /**
   * Checks that the 'from' and 'to' ranges have equivalent lengths. If not,
   * tries to trim the end of the longer so they do. Returns true if the
   * mappings could be made equivalent, else false. Note the range array values
   * may be modified by this method (only element [1], the range end, is ever
   * changed), and a message is written to stderr when a range is truncated.
   * 
   * @param from
   *          two-element [start, end] range; may be descending
   * @param to
   *          two-element [start, end] range; may be descending
   * @param fromRatio
   *          number of 'from' positions per mapped unit
   * @param toRatio
   *          number of 'to' positions per mapped unit
   * @return true if the ranges are (or have been made) equivalent in length
   */
  protected static boolean trimMapping(int[] from, int[] to, int fromRatio,
          int toRatio)
  {
    int fromLength = Math.abs(from[1] - from[0]) + 1;
    int toLength = Math.abs(to[1] - to[0]) + 1;

    /*
     * excess of the 'from' range over the 'to' range, with both lengths
     * scaled to common units; negative if the 'to' range is the longer
     */
    int fromOverlap = fromLength * toRatio - toLength * fromRatio;
    if (fromOverlap == 0)
    {
      return true;
    }
    if (fromOverlap > 0 && fromOverlap % toRatio == 0)
    {
      /*
       * restrict from range to make them match up
       * it's kind of arbitrary which end we truncate - here it is the end
       */
      System.err.print("Truncating mapping from " + Arrays.toString(from)
              + " to ");
      if (from[1] > from[0])
      {
        from[1] -= fromOverlap / toRatio;
      }
      else
      {
        from[1] += fromOverlap / toRatio;
      }
      System.err.println(Arrays.toString(from));
      return true;
    }
    else if (fromOverlap < 0 && fromOverlap % fromRatio == 0)
    {
      fromOverlap = -fromOverlap; // > 0
      /*
       * restrict to range to make them match up
       */
      System.err.print("Truncating mapping to " + Arrays.toString(to)
              + " to ");
      if (to[1] > to[0])
      {
        to[1] -= fromOverlap / fromRatio;
      }
      else
      {
        to[1] += fromOverlap / fromRatio;
      }
      System.err.println(Arrays.toString(to));
      return true;
    }

    /*
     * Couldn't truncate to an exact match..
     */
    return false;
  }

  /**
   * Returns a sequence matching the given id, as follows
   * <ul>
   * <li>a null id returns null</li>
   * <li>with relaxed matching, the id is looked up using a
   * {@code SequenceIdMatcher} built from the alignment's sequences plus any
   * new sequences; the matcher is rebuilt only when a different alignment is
   * supplied</li>
   * <li>otherwise the alignment is searched by name, and if that fails, the
   * new sequences are searched for one whose name equals the id</li>
   * <li>if no match is found and a new sequences list was supplied, a
   * placeholder ({@code SequenceDummy}) is created, added to that list (and
   * to the matcher when using relaxed matching), and returned</li>
   * <li>if no match is found and no new sequences list was supplied, returns
   * null</li>
   * </ul>
   * 
   * @param seqId
   *          the sequence id to look for
   * @param align
   *          the alignment to search; must not be null if seqId is not null
   * @param newseqs
   *          sequences already created while parsing (may be null); may be
   *          added to by this method
   * @param relaxedIdMatching
   *          if true, match using SequenceIdMatcher rather than by name
   * 
   * @return the matched or newly created sequence, or null
   */
  protected SequenceI findSequence(String seqId, AlignmentI align,
          List<SequenceI> newseqs, boolean relaxedIdMatching)
  {
    if (seqId == null)
    {
      return null;
    }
    SequenceI match = null;
    if (relaxedIdMatching)
    {
      if (lastmatchedAl != align)
      {
        lastmatchedAl = align;
        matcher = new SequenceIdMatcher(align.getSequencesArray());
        if (newseqs != null)
        {
          matcher.addAll(newseqs);
        }
      }
      match = matcher.findIdMatch(seqId);
    }
    else
    {
      match = align.findName(seqId, true);
      if (match == null && newseqs != null)
      {
        for (SequenceI m : newseqs)
        {
          if (seqId.equals(m.getName()))
          {
            return m;
          }
        }
      }

    }
    if (match == null && newseqs != null)
    {
      match = new SequenceDummy(seqId);
      if (relaxedIdMatching)
      {
        matcher.addAll(Arrays.asList(new SequenceI[] { match }));
      }
      // add dummy sequence to the newseqs list
      newseqs.add(match);
    }
    return match;
  }

  /**
   * Parses the input line to a map of name / value(s) pairs. For example the
   * line <br>
   * {@code Notes=Fe-S;Method=manual curation, prediction; source = Pfam; Notes = Metal}
   * <br>
   * if parsed with namesDelimiter=";", nameValueSeparator='=' and
   * valuesDelimiter="," <br>
   * would return a map with { Notes={Fe-S, Metal}, Method={manual curation,
   * prediction}, source={Pfam}} <br>
   * 
   * This method supports parsing of either GFF2 format (which uses space ' ' as
   * the name/value delimiter, and allows multiple occurrences of the same
   * name), or GFF3 format (which uses '=' as the name/value delimiter, and
   * strictly does not allow repeat occurrences of the same name - but does
   * allow a comma-separated list of values).
   * <p>
   * Notes on behaviour:
   * <ul>
   * <li>names, and the value text as a whole, are trimmed of leading and
   * trailing whitespace; individual values within a delimited list are not
   * (so in the example above the second Method value keeps its leading
   * space)</li>
   * <li>an entry with no separator, or with an empty value, is ignored</li>
   * <li>values for a repeated name are appended to the same list</li>
   * <li>both delimiters are passed to {@link String#split(String)} and so are
   * interpreted as regular expressions</li>
   * </ul>
   * 
   * @param text
   *          the text to parse (may be null)
   * @param namesDelimiter
   *          the major delimiter between name-value pairs
   * @param nameValueSeparator
   *          the separator used between name and value (the first occurrence
   *          in each pair is used)
   * @param valuesDelimiter
   *          delimits a list of more than one value
   * @return the name-values map (which may be empty but never null)
   */
  public static Map<String, List<String>> parseNameValuePairs(String text,
          String namesDelimiter, char nameValueSeparator,
          String valuesDelimiter)
  {
    Map<String, List<String>> map = new HashMap<String, List<String>>();
    if (text == null || text.trim().length() == 0)
    {
      return map;
    }

    for (String pair : text.trim().split(namesDelimiter))
    {
      pair = pair.trim();
      if (pair.length() == 0)
      {
        continue;
      }

      int sepPos = pair.indexOf(nameValueSeparator);
      if (sepPos == -1)
      {
        // no name=value present
        continue;
      }

      String key = pair.substring(0, sepPos).trim();
      String values = pair.substring(sepPos + 1).trim();
      if (values.length() > 0)
      {
        List<String> vals = map.get(key);
        if (vals == null)
        {
          vals = new ArrayList<String>();
          map.put(key, vals);
        }
        for (String val : values.split(valuesDelimiter))
        {
          vals.add(val);
        }
      }
    }
    return map;
  }

  /**
   * Constructs a SequenceFeature from the GFF column data. Subclasses may wish
   * to call this method then adjust the SequenceFeature depending on the
   * particular usage of different tools that generate GFF.
   * <p>
   * The start and end columns must parse as integers, otherwise a message is
   * written to stderr and null is returned. A score column that does not parse
   * as a number (for example '.') results in a score of zero.
   * 
   * @param gff
   *          the GFF column values; columns up to and including phase are
   *          always read, and the attributes column is also read if
   *          {@code attributes} is not null
   * @param attributes
   *          parsed name / values pairs from column 9 (may be null)
   * @return the new feature, or null if a number could not be parsed
   */
  protected SequenceFeature buildSequenceFeature(String[] gff,
          Map<String, List<String>> attributes)
  {
    try
    {
      int start = Integer.parseInt(gff[START_COL]);
      int end = Integer.parseInt(gff[END_COL]);

      /*
       * default 'score' is 0 rather than Float.NaN as the latter currently
       * disables the 'graduated colour => colour by label' option
       */
      float score = 0f;
      try
      {
        score = Float.parseFloat(gff[SCORE_COL]);
      } catch (NumberFormatException nfe)
      {
        // e.g. '.' - leave as zero
      }

      SequenceFeature sf = new SequenceFeature(gff[TYPE_COL],
              gff[SOURCE_COL], start, end, score, gff[SOURCE_COL]);

      sf.setStrand(gff[STRAND_COL]);

      sf.setPhase(gff[PHASE_COL]);

      if (attributes != null)
      {
        /*
         * save 'raw' column 9 to allow roundtrip output as input
         */
        sf.setAttributes(gff[ATTRIBUTES_COL]);

        /*
         * Add attributes in column 9 to the sequence feature's 
         * 'otherData' table; use Note as a best proxy for description
         */
        for (Entry<String, List<String>> attr : attributes.entrySet())
        {
          String values = StringUtils.listToDelimitedString(
                  attr.getValue(), ",");
          sf.setValue(attr.getKey(), values);
          if (NOTE.equals(attr.getKey()))
          {
            sf.setDescription(values);
          }
        }
      }

      return sf;
    } catch (NumberFormatException nfe)
    {
      System.err.println("Invalid number in gff: " + nfe.getMessage());
      return null;
    }
  }

  /**
   * Returns the character used to separate attributes names from values in GFF
   * column 9. This is space for GFF2, '=' for GFF3.
   * 
   * @return the name / value separator character
   */
  protected abstract char getNameValueSeparator();

  /**
   * Returns any existing mapping held on the alignment between the given
   * dataset sequences, or a new one if none found. This is a convenience method
   * to facilitate processing multiple GFF lines that make up a single 'spliced'
   * mapping, by extending the first mapping as the others are read.
   * <p>
   * Note that a newly created mapping is not added to the alignment by this
   * method.
   * 
   * @param align
   *          the alignment holding any existing mappings
   * @param fromSeq
   *          the sequence mapped from
   * @param toSeq
   *          the sequence mapped to
   * @return the existing mapping, or a new empty one (never null)
   */
  protected AlignedCodonFrame getMapping(AlignmentI align,
          SequenceI fromSeq, SequenceI toSeq)
  {
    AlignedCodonFrame acf = align.getMapping(fromSeq, toSeq);
    if (acf == null)
    {
      acf = new AlignedCodonFrame();
    }
    return acf;
  }

}