X-Git-Url: http://source.jalview.org/gitweb/?a=blobdiff_plain;f=src%2Fjalview%2Fio%2Fgff%2FGffHelperBase.java;fp=src%2Fjalview%2Fio%2Fgff%2FGffHelperBase.java;h=fbde9d99c5e6887b0cedc62a71cf563bb6ccb151;hb=8f920d337154e092f5f9056ffde3cdf2735eca43;hp=0000000000000000000000000000000000000000;hpb=da768251d307c7ce11283d72e0e522b2c5fac526;p=jalview.git
diff --git a/src/jalview/io/gff/GffHelperBase.java b/src/jalview/io/gff/GffHelperBase.java
new file mode 100644
index 0000000..fbde9d9
--- /dev/null
+++ b/src/jalview/io/gff/GffHelperBase.java
@@ -0,0 +1,396 @@
+package jalview.io.gff;
+
+import jalview.analysis.SequenceIdMatcher;
+import jalview.datamodel.AlignedCodonFrame;
+import jalview.datamodel.AlignmentI;
+import jalview.datamodel.MappingType;
+import jalview.datamodel.SequenceDummy;
+import jalview.datamodel.SequenceFeature;
+import jalview.datamodel.SequenceI;
+import jalview.util.MapList;
+import jalview.util.StringUtils;
+
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.Map.Entry;
+
+/**
+ * Base class with common functionality for flavours of GFF handler (GFF2 or
+ * GFF3)
+ */
+public abstract class GffHelperBase implements GffHelperI
+{
+ private static final String NOTE = "Note"; // attribute whose value becomes the feature description
+
+ /*
+ * Zero-based array indices of GFF columns 1-9:
+ */
+ protected static final int SEQID_COL = 0;
+
+ protected static final int SOURCE_COL = 1;
+
+ protected static final int TYPE_COL = 2;
+
+ protected static final int START_COL = 3;
+
+ protected static final int END_COL = 4;
+
+ protected static final int SCORE_COL = 5;
+
+ protected static final int STRAND_COL = 6;
+
+ protected static final int PHASE_COL = 7;
+
+ protected static final int ATTRIBUTES_COL = 8;
+
+ private AlignmentI lastmatchedAl = null; // alignment that 'matcher' was last built for
+
+ private SequenceIdMatcher matcher = null; // built in findSequence when relaxed id matching is used
+
+ /**
+ * Constructs and returns a mapping, or null if data appear invalid
+ *
+ * @param fromStart start of the 'from' range (as supplied, before any swap)
+ * @param fromEnd end of the 'from' range
+ * @param toStart start of the 'to' range
+ * @param toEnd end of the 'to' range
+ * @param mappingType
+ * type of mapping (e.g. protein to nucleotide)
+ * @return the mapping, or null if trimMapping cannot reconcile the range lengths
+ */
+ protected MapList constructMappingFromAlign(int fromStart, int fromEnd,
+ int toStart, int toEnd, MappingType mappingType)
+ {
+ int[] from = new int[] { fromStart, fromEnd };
+ int[] to = new int[] { toStart, toEnd };
+
+ /*
+ * Jalview always models from dna to protein, so switch values if the
+ * GFF mapping is from protein to dna
+ */
+ if (mappingType == MappingType.PeptideToNucleotide)
+ {
+ int[] temp = from;
+ from = to;
+ to = temp;
+ mappingType = mappingType.getInverse(); // so the ratios read below match the swapped ranges
+ }
+
+ int fromRatio = mappingType.getFromRatio();
+ int toRatio = mappingType.getToRatio();
+
+ /*
+ * sanity check that mapped residue counts match
+ * (trimMapping may shorten 'from' or 'to' in place to make them match)
+ */
+ if (!trimMapping(from, to, fromRatio, toRatio))
+ {
+ System.err.println("Ignoring mapping from " + Arrays.toString(from)
+ + " to " + Arrays.toString(to) + " as counts don't match!");
+ return null;
+ }
+
+ /*
+ * If a codon has an intron gap, there will be contiguous 'toRanges';
+ * this is handled for us by the MapList constructor.
+ * (It is not clear that exonerate ever generates this case)
+ */
+
+ return new MapList(from, to, fromRatio, toRatio);
+ }
+
+ /**
+ * Checks that the 'from' and 'to' ranges have equivalent lengths. If not,
+ * tries to trim the end of the longer so they do. Returns true if the
+ * mappings could be made equivalent, else false. Note the range array values
+ * may be modified by this method.
+ *
+ * @param from {start, end} of the 'from' range; from[1] may be adjusted
+ * @param to {start, end} of the 'to' range; to[1] may be adjusted
+ * @param fromRatio 'from' part of the expected length ratio fromRatio:toRatio
+ * @param toRatio 'to' part of the expected length ratio fromRatio:toRatio
+ * @return true if the lengths match (possibly after trimming), else false
+ */
+ protected static boolean trimMapping(int[] from, int[] to, int fromRatio,
+ int toRatio)
+ {
+ int fromLength = Math.abs(from[1] - from[0]) + 1; // inclusive length, either direction
+ int toLength = Math.abs(to[1] - to[0]) + 1;
+ int fromOverlap = fromLength * toRatio - toLength * fromRatio; // 0 when lengths are in ratio
+ if (fromOverlap == 0)
+ {
+ return true;
+ }
+ if (fromOverlap > 0 && fromOverlap % toRatio == 0)
+ {
+ /*
+ * restrict from range to make them match up
+ * it's kind of arbitrary which end we truncate - here it is the end
+ */
+ System.err.print("Truncating mapping from " + Arrays.toString(from)
+ + " to ");
+ if (from[1] > from[0]) // ascending range: move the end down
+ {
+ from[1] -= fromOverlap / toRatio;
+ }
+ else // descending range: move the end up
+ {
+ from[1] += fromOverlap / toRatio;
+ }
+ System.err.println(Arrays.toString(from));
+ return true;
+ }
+ else if (fromOverlap < 0 && fromOverlap % fromRatio == 0)
+ {
+ fromOverlap = -fromOverlap; // > 0
+ /*
+ * restrict to range to make them match up
+ */
+ System.err.print("Truncating mapping to " + Arrays.toString(to)
+ + " to ");
+ if (to[1] > to[0]) // ascending range: move the end down
+ {
+ to[1] -= fromOverlap / fromRatio;
+ }
+ else // descending range: move the end up
+ {
+ to[1] += fromOverlap / fromRatio;
+ }
+ System.err.println(Arrays.toString(to));
+ return true;
+ }
+
+ /*
+ * Couldn't truncate to an exact match..
+ */
+ return false;
+ }
+
+ /**
+ * Returns a sequence matching the given id, as follows
+ *
+ * - strict matching is on exact sequence name
+ * - relaxed matching allows matching on a token within the sequence name,
+ * or a dbxref
+ * - first tries to find a match in the alignment sequences
+ * - else tries to find a match in the new sequences already generated while
+ * parsing the features file
+ * - else creates a new placeholder sequence, adds it to the new sequences
+ * list, and returns it (only when newseqs is not null)
+ *
+ *
+ * @param seqId the id to look up; a null id gives a null result
+ * @param align the alignment to search; dereferenced without a null check
+ * @param newseqs sequences created so far while parsing; may be null
+ * @param relaxedIdMatching true to match via SequenceIdMatcher, false for exact name
+ *
+ * @return the matched or newly created sequence, or null if there is none
+ */
+ protected SequenceI findSequence(String seqId, AlignmentI align,
+ List<SequenceI> newseqs, boolean relaxedIdMatching)
+ {
+ if (seqId == null)
+ {
+ return null;
+ }
+ SequenceI match = null;
+ if (relaxedIdMatching)
+ {
+ if (lastmatchedAl != align) // matcher is cached per alignment; rebuild on change
+ {
+ lastmatchedAl = align;
+ matcher = new SequenceIdMatcher(align.getSequencesArray());
+ if (newseqs != null)
+ {
+ matcher.addAll(newseqs);
+ }
+ }
+ match = matcher.findIdMatch(seqId);
+ }
+ else
+ {
+ match = align.findName(seqId, true);
+ if (match == null && newseqs != null)
+ {
+ for (SequenceI m : newseqs)
+ {
+ if (seqId.equals(m.getName()))
+ {
+ return m;
+ }
+ }
+ }
+
+ }
+ if (match == null && newseqs != null)
+ {
+ match = new SequenceDummy(seqId);
+ if (relaxedIdMatching)
+ {
+ matcher.addAll(Arrays.asList(new SequenceI[] { match })); // keep the cached matcher in step
+ }
+ // add dummy sequence to the newseqs list
+ newseqs.add(match);
+ }
+ return match;
+ }
+
+ /**
+ * Parses the input line to a map of name / value(s) pairs. For example the
+ * line
+ * Notes=Fe-S;Method=manual curation, prediction; source = Pfam; Notes = Metal
+ * if parsed with namesDelimiter=";", nameValueSeparator='=' and
+ * valuesDelimiter="," would return a map with { Notes={Fe-S, Metal},
+ * Method={manual curation, prediction}, source={Pfam}}
+ *
+ * This method supports parsing of either GFF2 format (which uses space ' ' as
+ * the name/value delimiter, and allows multiple occurrences of the same
+ * name), or GFF3 format (which uses '=' as the name/value delimiter, and
+ * strictly does not allow repeat occurrences of the same name - but does
+ * allow a comma-separated list of values).
+ *
+ * @param text the text to parse; null or blank gives an empty map
+ * @param namesDelimiter
+ * the major delimiter between name-value pairs (a regex for String.split)
+ * @param nameValueSeparator
+ * the character between a name and its value(s); the first occurrence is used
+ * @param valuesDelimiter
+ * delimits a list of more than one value (a regex for String.split)
+ * @return the name-values map (which may be empty but never null)
+ */
+ public static Map<String, List<String>> parseNameValuePairs(String text,
+ String namesDelimiter, char nameValueSeparator,
+ String valuesDelimiter)
+ {
+ Map<String, List<String>> map = new HashMap<String, List<String>>();
+ if (text == null || text.trim().length() == 0)
+ {
+ return map;
+ }
+
+ for (String pair : text.trim().split(namesDelimiter))
+ {
+ pair = pair.trim();
+ if (pair.length() == 0)
+ {
+ continue;
+ }
+
+ int sepPos = pair.indexOf(nameValueSeparator);
+ if (sepPos == -1)
+ {
+ // no name=value present
+ continue;
+ }
+
+ String key = pair.substring(0, sepPos).trim();
+ String values = pair.substring(sepPos + 1).trim();
+ if (values.length() > 0) // a name with an empty value is not added to the map
+ {
+ List<String> vals = map.get(key);
+ if (vals == null)
+ {
+ vals = new ArrayList<String>();
+ map.put(key, vals);
+ }
+ for (String val : values.split(valuesDelimiter))
+ {
+ vals.add(val); // individual list values are stored untrimmed
+ }
+ }
+ }
+ return map;
+ }
+
+ /**
+ * Constructs a SequenceFeature from the GFF column data. Subclasses may wish
+ * to call this method then adjust the SequenceFeature depending on the
+ * particular usage of different tools that generate GFF.
+ *
+ * @param gff the GFF line's columns, indexed by the *_COL constants
+ * @param attributes parsed column 9 name/values; if null, no attributes are set
+ * @return the feature, or null if the start or end column is not an integer
+ */
+ protected SequenceFeature buildSequenceFeature(String[] gff,
+ Map<String, List<String>> attributes)
+ {
+ try
+ {
+ int start = Integer.parseInt(gff[START_COL]);
+ int end = Integer.parseInt(gff[END_COL]);
+ float score = Float.NaN;
+ try
+ {
+ score = Float.parseFloat(gff[SCORE_COL]);
+ } catch (NumberFormatException nfe)
+ {
+ // e.g. '.' - leave as NaN to indicate no score
+ }
+
+ SequenceFeature sf = new SequenceFeature(gff[TYPE_COL],
+ gff[SOURCE_COL], start, end, score, gff[SOURCE_COL]); // NOTE(review): SOURCE_COL passed twice - confirm intended
+
+ if (attributes != null)
+ {
+ /*
+ * save 'raw' column 9 to allow roundtrip output as input
+ */
+ sf.setAttributes(gff[ATTRIBUTES_COL]);
+
+ /*
+ * Add attributes in column 9 to the sequence feature's
+ * 'otherData' table; use Note as a best proxy for description
+ */
+ for (Entry<String, List<String>> attr : attributes.entrySet())
+ {
+ String values = StringUtils.listToDelimitedString(
+ attr.getValue(), "; ");
+ sf.setValue(attr.getKey(), values);
+ if (NOTE.equals(attr.getKey()))
+ {
+ sf.setDescription(values);
+ }
+ }
+ }
+
+ return sf;
+ } catch (NumberFormatException nfe)
+ {
+ System.err.println("Invalid number in gff: " + nfe.getMessage());
+ return null;
+ }
+ }
+
+ /**
+ * Returns the character used to separate attribute names from values in GFF
+ * column 9. This is space for GFF2, '=' for GFF3.
+ *
+ * @return the name/value separator character
+ */
+ protected abstract char getNameValueSeparator();
+
+ /**
+ * Returns any existing mapping held on the alignment between the given
+ * dataset sequences, or a new one if none found. This is a convenience method
+ * to facilitate processing multiple GFF lines that make up a single 'spliced'
+ * mapping, by extending the first mapping as the others are read.
+ *
+ * @param align the alignment whose mappings are queried
+ * @param fromSeq the 'from' sequence, passed through to align.getMapping
+ * @param toSeq the 'to' sequence, passed through to align.getMapping
+ * @return the existing mapping, or a new AlignedCodonFrame (not added to align here)
+ */
+ protected AlignedCodonFrame getMapping(AlignmentI align, SequenceI fromSeq, SequenceI toSeq)
+ {
+ AlignedCodonFrame acf = align.getMapping(fromSeq, toSeq);
+ if (acf == null)
+ {
+ acf = new AlignedCodonFrame(); // caller is responsible for attaching it to the alignment
+ }
+ return acf;
+ }
+
+}