/*
* Jalview - A Sequence Alignment Editor and Viewer ($$Version-Rel$$)
* Copyright (C) $$Year-Rel$$ The Jalview Authors
*
* This file is part of Jalview.
*
* Jalview is free software: you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation, either version 3
* of the License, or (at your option) any later version.
*
* Jalview is distributed in the hope that it will be useful, but
* WITHOUT ANY WARRANTY; without even the implied warranty
* of MERCHANTABILITY or FITNESS FOR A PARTICULAR
* PURPOSE. See the GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with Jalview.  If not, see <http://www.gnu.org/licenses/>.
* The Jalview Authors are detailed in the 'AUTHORS' file.
*/
package jalview.io.gff;
import jalview.analysis.SequenceIdMatcher;
import jalview.datamodel.AlignedCodonFrame;
import jalview.datamodel.AlignmentI;
import jalview.datamodel.MappingType;
import jalview.datamodel.SequenceDummy;
import jalview.datamodel.SequenceFeature;
import jalview.datamodel.SequenceI;
import jalview.util.MapList;
import jalview.util.StringUtils;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
/**
* Base class with common functionality for flavours of GFF handler (GFF2 or
* GFF3)
*/
public abstract class GffHelperBase implements GffHelperI
{
/*
* Name of the column 9 attribute whose value(s) buildSequenceFeature also
* uses as the feature description
*/
private static final String NOTE = "Note";
/*
* GFF columns 1-9, expressed as zero-based indices into the array of
* column values for one GFF line:
*/
protected static final int SEQID_COL = 0;
protected static final int SOURCE_COL = 1;
protected static final int TYPE_COL = 2;
protected static final int START_COL = 3;
protected static final int END_COL = 4;
protected static final int SCORE_COL = 5;
protected static final int STRAND_COL = 6;
protected static final int PHASE_COL = 7;
protected static final int ATTRIBUTES_COL = 8;
/*
* The alignment for which 'matcher' was last built; findSequence rebuilds
* the matcher when it is called with a different alignment
*/
private AlignmentI lastmatchedAl = null;
/*
* Id matcher used by findSequence for relaxed id matching; null until the
* first relaxed lookup
*/
private SequenceIdMatcher matcher = null;
/**
 * Builds a MapList between a single 'from' range and a single 'to' range, or
 * returns null (after logging to stderr) if the two ranges cannot be
 * reconciled by trimMapping.
 *
 * Jalview always models mappings from dna to protein, so when the supplied
 * mapping type is PeptideToNucleotide the two ranges are exchanged and the
 * inverse mapping type is used to obtain the ratios.
 *
 * @param fromStart
 *          start of the range mapped from
 * @param fromEnd
 *          end of the range mapped from
 * @param toStart
 *          start of the range mapped to
 * @param toEnd
 *          end of the range mapped to
 * @param mappingType
 *          type of mapping (e.g. protein to nucleotide)
 * @return the mapping, or null if the mapped lengths could not be made to
 *         match
 */
protected MapList constructMappingFromAlign(int fromStart, int fromEnd,
        int toStart, int toEnd, MappingType mappingType)
{
  final boolean proteinToDna = mappingType == MappingType.PeptideToNucleotide;

  /*
   * orient the ranges (and the mapping type) as dna-to-protein
   */
  final MappingType orientedType = proteinToDna ? mappingType.getInverse()
          : mappingType;
  final int[] from = proteinToDna ? new int[] { toStart, toEnd }
          : new int[] { fromStart, fromEnd };
  final int[] to = proteinToDna ? new int[] { fromStart, fromEnd }
          : new int[] { toStart, toEnd };

  final int fromRatio = orientedType.getFromRatio();
  final int toRatio = orientedType.getToRatio();

  /*
   * sanity check that mapped residue counts match (the longer range may
   * be truncated in place to achieve this)
   * TODO understand why PASA generates such cases...
   */
  if (trimMapping(from, to, fromRatio, toRatio))
  {
    /*
     * If a codon has an intron gap, there will be contiguous 'toRanges';
     * this is handled for us by the MapList constructor.
     * (It is not clear that exonerate ever generates this case)
     */
    return new MapList(from, to, fromRatio, toRatio);
  }

  System.err.println("Ignoring mapping from " + Arrays.toString(from)
          + " to " + Arrays.toString(to) + " as counts don't match!");
  return null;
}
/**
 * Verifies that the 'from' and 'to' ranges cover equivalent lengths once the
 * from:to ratio is taken into account, i.e. that
 * fromLength * toRatio == toLength * fromRatio. If they do not, an attempt is
 * made to shorten the longer range at its end position (index 1) so that they
 * do; this is only possible when the excess is a whole multiple of the
 * relevant ratio. Any truncation is reported on stderr.
 *
 * Note that the supplied range arrays may be modified by this method.
 *
 * @param from
 *          [start, end] of the range mapped from (may be descending)
 * @param to
 *          [start, end] of the range mapped to (may be descending)
 * @param fromRatio
 *          number of 'from' positions per mapped unit
 * @param toRatio
 *          number of 'to' positions per mapped unit
 * @return true if the ranges match, or were truncated so as to match, else
 *         false (in which case neither range has been changed)
 */
protected static boolean trimMapping(int[] from, int[] to, int fromRatio,
        int toRatio)
{
  final int fromSpan = 1 + Math.abs(from[1] - from[0]);
  final int toSpan = 1 + Math.abs(to[1] - to[0]);

  /*
   * positive: 'from' is too long; negative: 'to' is too long
   */
  final int excess = fromSpan * toRatio - toSpan * fromRatio;
  if (excess == 0)
  {
    return true;
  }

  final boolean fromIsLonger = excess > 0;
  final int divisor = fromIsLonger ? toRatio : fromRatio;
  if (excess % divisor != 0)
  {
    /*
     * Couldn't truncate to an exact match..
     */
    return false;
  }

  /*
   * shorten the longer range so the two match up; it's kind of arbitrary
   * which end we truncate - here it is the end
   */
  final int[] range = fromIsLonger ? from : to;
  final int trimBy = (fromIsLonger ? excess : -excess) / divisor;
  final String label = fromIsLonger ? "Truncating mapping from "
          : "Truncating mapping to ";

  System.err.print(label + Arrays.toString(range) + " to ");
  if (range[1] > range[0])
  {
    range[1] -= trimBy;
  }
  else
  {
    // descending range: the end moves up towards the start
    range[1] += trimBy;
  }
  System.err.println(Arrays.toString(range));
  return true;
}
/**
 * Returns a sequence matching the given id, as follows
 * <ul>
 * <li>strict matching looks the id up by name in the alignment
 * (align.findName), then by exact name equality in newseqs</li>
 * <li>relaxed matching delegates to a SequenceIdMatcher (which, per the
 * original documentation, allows matching on a token within the sequence
 * name, or a dbxref); the matcher is built from the alignment sequences plus
 * any newseqs, and is cached until a different alignment is supplied</li>
 * <li>if no match is found and newseqs is not null, a new placeholder
 * (SequenceDummy) is created, added to newseqs (and to the cached matcher
 * when relaxed matching is in use), and returned</li>
 * </ul>
 *
 * @param seqId
 *          the sequence id to look for; if null, null is returned
 * @param align
 *          the alignment to search; must not be null
 * @param newseqs
 *          sequences already generated while parsing the features file; may
 *          be null, in which case no placeholder is created
 * @param relaxedIdMatching
 *          if true use SequenceIdMatcher, else match on name
 *
 * @return the matched or newly created sequence, or null if seqId is null,
 *         or if nothing matched and newseqs is null
 */
protected SequenceI findSequence(String seqId, AlignmentI align,
        List<SequenceI> newseqs, boolean relaxedIdMatching)
{
  if (seqId == null)
  {
    return null;
  }
  SequenceI match = null;
  if (relaxedIdMatching)
  {
    if (lastmatchedAl != align)
    {
      /*
       * first lookup for this alignment: (re)build the cached matcher
       */
      lastmatchedAl = align;
      matcher = new SequenceIdMatcher(align.getSequencesArray());
      if (newseqs != null)
      {
        matcher.addAll(newseqs);
      }
    }
    match = matcher.findIdMatch(seqId);
  }
  else
  {
    match = align.findName(seqId, true);
    if (match == null && newseqs != null)
    {
      for (SequenceI m : newseqs)
      {
        if (seqId.equals(m.getName()))
        {
          return m;
        }
      }
    }
  }
  if (match == null && newseqs != null)
  {
    match = new SequenceDummy(seqId);
    if (relaxedIdMatching)
    {
      // keep the cached matcher in step with newseqs
      matcher.addAll(Arrays.asList(match));
    }
    // add dummy sequence to the newseqs list
    newseqs.add(match);
  }
  return match;
}
/**
* Parses the input line to a map of name / value(s) pairs. For example the
* line
* Notes=Fe-S;Method=manual curation, prediction; source = Pfam; Notes = Metal
*
* if parsed with delimiter=";" and separators {' ', '='}
* would return a map with { Notes={Fe=S, Metal}, Method={manual curation,
* prediction}, source={Pfam}}
*
* This method supports parsing of either GFF2 format (which uses space ' ' as
* the name/value delimiter, and allows multiple occurrences of the same
* name), or GFF3 format (which uses '=' as the name/value delimiter, and
* strictly does not allow repeat occurrences of the same name - but does
* allow a comma-separated list of values).
*
* @param text
* @param namesDelimiter
* the major delimiter between name-value pairs
* @param nameValueSeparator
* one or more separators used between name and value
* @param valuesDelimiter
* delimits a list of more than one value
* @return the name-values map (which may be empty but never null)
*/
public static Map> parseNameValuePairs(String text,
String namesDelimiter, char nameValueSeparator,
String valuesDelimiter)
{
Map> map = new HashMap>();
if (text == null || text.trim().length() == 0)
{
return map;
}
for (String pair : text.trim().split(namesDelimiter))
{
pair = pair.trim();
if (pair.length() == 0)
{
continue;
}
int sepPos = pair.indexOf(nameValueSeparator);
if (sepPos == -1)
{
// no name=value present
continue;
}
String key = pair.substring(0, sepPos).trim();
String values = pair.substring(sepPos + 1).trim();
if (values.length() > 0)
{
List vals = map.get(key);
if (vals == null)
{
vals = new ArrayList();
map.put(key, vals);
}
for (String val : values.split(valuesDelimiter))
{
vals.add(val);
}
}
}
return map;
}
/**
* Constructs a SequenceFeature from the GFF column data. Subclasses may wish
* to call this method then adjust the SequenceFeature depending on the
* particular usage of different tools that generate GFF.
*
* @param gff
* @param attributes
* @return
*/
protected SequenceFeature buildSequenceFeature(String[] gff,
Map> attributes)
{
return buildSequenceFeature(gff, TYPE_COL, gff[SOURCE_COL], attributes);
}
/**
* @param gff
* @param typeColumn
* @param group
* @param attributes
* @return
*/
protected SequenceFeature buildSequenceFeature(String[] gff,
int typeColumn, String group, Map> attributes)
{
try
{
int start = Integer.parseInt(gff[START_COL]);
int end = Integer.parseInt(gff[END_COL]);
/*
* default 'score' is 0 rather than Float.NaN as the latter currently
* disables the 'graduated colour => colour by label' option
*/
float score = 0f;
try
{
score = Float.parseFloat(gff[SCORE_COL]);
} catch (NumberFormatException nfe)
{
// e.g. '.' - leave as zero
}
SequenceFeature sf = new SequenceFeature(gff[typeColumn],
gff[SOURCE_COL], start, end, score, group);
sf.setStrand(gff[STRAND_COL]);
sf.setPhase(gff[PHASE_COL]);
if (attributes != null)
{
/*
* save 'raw' column 9 to allow roundtrip output as input
*/
sf.setAttributes(gff[ATTRIBUTES_COL]);
/*
* Add attributes in column 9 to the sequence feature's
* 'otherData' table; use Note as a best proxy for description
*/
for (Entry> attr : attributes.entrySet())
{
String values = StringUtils.listToDelimitedString(attr.getValue(),
",");
sf.setValue(attr.getKey(), values);
if (NOTE.equals(attr.getKey()))
{
sf.setDescription(values);
}
}
}
return sf;
} catch (NumberFormatException nfe)
{
System.err.println("Invalid number in gff: " + nfe.getMessage());
return null;
}
}
/**
 * Returns the character used to separate attribute names from values in GFF
 * column 9. This is space for GFF2, '=' for GFF3.
 *
 * @return the name/value separator character for the GFF flavour handled by
 *         the implementing class
 */
protected abstract char getNameValueSeparator();
/**
 * Looks up the mapping the alignment holds between the two given sequences
 * and returns it, or returns a newly constructed, empty AlignedCodonFrame if
 * the alignment has none. This is a convenience method to facilitate
 * processing multiple GFF lines that make up a single 'spliced' mapping, by
 * extending the first mapping as the others are read.
 *
 * Note that a newly constructed mapping is not added to the alignment by
 * this method.
 *
 * @param align
 *          the alignment to query for an existing mapping
 * @param fromSeq
 *          the sequence mapped from
 * @param toSeq
 *          the sequence mapped to
 * @return the existing mapping if there is one, else a new one (never null)
 */
protected AlignedCodonFrame getMapping(AlignmentI align,
        SequenceI fromSeq, SequenceI toSeq)
{
  final AlignedCodonFrame existing = align.getMapping(fromSeq, toSeq);
  return existing != null ? existing : new AlignedCodonFrame();
}
}