src/jalview/io/gff/ExonerateHelper.java

   1 package jalview.io.gff;
   2
import jalview.datamodel.AlignedCodonFrame;
import jalview.datamodel.AlignmentI;
import jalview.datamodel.MappingType;
import jalview.datamodel.SequenceFeature;
import jalview.datamodel.SequenceI;
import jalview.util.MapList;

import java.io.IOException;
import java.util.List;
import java.util.Locale;
import java.util.Map;
  13
  14 /**
  15  * A handler to parse GFF in the format generated by the exonerate tool
  16  */
  17 public class ExonerateHelper extends Gff2Helper
  18 {
  19   private static final String SIMILARITY = "similarity";
  20
  21   private static final String GENOME2GENOME = "genome2genome";
  22
  23   private static final String CDNA2GENOME = "cdna2genome";
  24
  25   private static final String CODING2GENOME = "coding2genome";
  26
  27   private static final String CODING2CODING = "coding2coding";
  28
  29   private static final String PROTEIN2GENOME = "protein2genome";
  30
  31   private static final String PROTEIN2DNA = "protein2dna";
  32
  33   private static final String ALIGN = "Align";
  34
  35   private static final String QUERY = "Query";
  36
  37   private static final String TARGET = "Target";
  38
  39   /**
  40    * Process one GFF feature line (as modelled by SequenceFeature)
  41    *
  42    * @param seq
  43    *          the sequence with which this feature is associated
  44    * @param gffColumns
  45    *          the sequence feature with ATTRIBUTES property containing any
  46    *          additional attributes
  47    * @param align
  48    *          the alignment we are adding GFF to
  49    * @param newseqs
  50    *          any new sequences referenced by the GFF
  51    * @param relaxedIdMatching
  52    *          if true, match word tokens in sequence names
  53    * @return true if the sequence feature should be added to the sequence, else
  54    *         false (i.e. it has been processed in another way e.g. to generate a
  55    *         mapping)
  56    */
  57   @Override
  58   public SequenceFeature processGff(SequenceI seq, String[] gffColumns,
  59           AlignmentI align, List<SequenceI> newseqs,
  60           boolean relaxedIdMatching)
  61   {
  62     String attr = gffColumns[ATTRIBUTES_COL];
  63     Map<String, List<String>> set = parseNameValuePairs(attr);
  64
  65     try
  66     {
  67       processGffSimilarity(set, seq, gffColumns, align, newseqs,
  68               relaxedIdMatching);
  69     } catch (IOException ivfe)
  70     {
  71       System.err.println(ivfe);
  72     }
  73
  74     /*
  75      * return null to indicate we don't want to add a sequence feature for
  76      * similarity (only process it to create mappings)
  77      */
  78     return null;
  79   }
  80
  81   /**
  82    * Processes the 'Query' (or 'Target') and 'Align' properties associated with
  83    * an exonerate GFF similarity feature; these properties define the mapping of
  84    * the annotated range to a related sequence.
  85    *
  86    * @param set
  87    *          parsed GFF column 9 key/value(s)
  88    * @param seq
  89    *          the sequence the GFF feature is on
  90    * @param gff
  91    *          the GFF column data
  92    * @param align
  93    *          the alignment the sequence belongs to, where any new mappings
  94    *          should be added
  95    * @param newseqs
  96    *          a list of new 'virtual sequences' generated while parsing GFF
  97    * @param relaxedIdMatching
  98    *          if true allow fuzzy search for a matching target sequence
  99    * @throws IOException
 100    */
 101   protected void processGffSimilarity(Map<String, List<String>> set,
 102           SequenceI seq, String[] gff, AlignmentI align,
 103           List<SequenceI> newseqs, boolean relaxedIdMatching)
 104           throws IOException
 105   {
 106     /*
 107      * exonerate may be run with
 108      * --showquerygff - outputs 'features on the query' e.g. (protein2genome)
 109      *     Target <dnaseqid> ; Align proteinStartPos dnaStartPos proteinCount
 110      * --showtargetgff - outputs 'features on the target' e.g. (protein2genome)
 111      *     Query <proteinseqid> ; Align dnaStartPos proteinStartPos nucleotideCount
 112      * where the Align spec may repeat
 113      */
 114     // TODO handle coding2coding and similar as well
 115     boolean featureIsOnTarget = true;
 116     List<String> mapTo = set.get(QUERY);
 117     if (mapTo == null)
 118     {
 119       mapTo = set.get(TARGET);
 120       featureIsOnTarget = false;
 121     }
 122     MappingType type = getMappingType(gff[SOURCE_COL]);
 123
 124     if (type == null)
 125     {
 126       throw new IOException("Sorry, I don't handle " + gff[SOURCE_COL]);
 127     }
 128
 129     if (mapTo == null || mapTo.size() != 1)
 130     {
 131       throw new IOException(
 132               "Expecting exactly one sequence in Query or Target field (got "
 133                       + mapTo + ")");
 134     }
 135
 136     /*
 137      * locate the mapped sequence in the alignment or 'new' (GFF file) sequences;
 138      */
 139     SequenceI mappedSequence = findSequence(mapTo.get(0), align, newseqs,
 140             relaxedIdMatching);
 141
 142     /*
 143      * If mapping is from protein to dna, we store it as dna to protein instead
 144      */
 145     SequenceI mapFromSequence = seq;
 146     SequenceI mapToSequence = mappedSequence;
 147     if ((type == MappingType.NucleotideToPeptide && featureIsOnTarget)
 148             || (type == MappingType.PeptideToNucleotide && !featureIsOnTarget))
 149     {
 150       mapFromSequence = mappedSequence;
 151       mapToSequence = seq;
 152     }
 153
 154     /*
 155      * Process the Align maps and create mappings.
 156      * These may be cdna-genome, cdna-protein, genome-protein.
 157      * The mapped sequences may or may not be in the alignment
 158      * (they may be included later in the GFF file).
 159      */
 160
 161     /*
 162      * get any existing mapping for these sequences (or start one),
 163      * and add this mapped range
 164      */
 165     AlignedCodonFrame acf = getMapping(align, mapFromSequence,
 166             mapToSequence);
 167
 168     /*
 169      * exonerate GFF has the strand of the target in column 7
 170      * (differs from GFF3 which has it in the Target descriptor)
 171      */
 172     String strand = gff[STRAND_COL];
 173     boolean forwardStrand = true;
 174     if ("-".equals(strand))
 175     {
 176       forwardStrand = false;
 177     }
 178     else if (!"+".equals(strand))
 179     {
 180       System.err.println("Strand must be specified for alignment");
 181       return;
 182     }
 183
 184     List<String> alignedRegions = set.get(ALIGN);
 185     for (String region : alignedRegions)
 186     {
 187       MapList mapping = buildMapping(region, type, forwardStrand,
 188               featureIsOnTarget, gff);
 189
 190       if (mapping == null)
 191       {
 192         continue;
 193       }
 194
 195       acf.addMap(mapFromSequence, mapToSequence, mapping);
 196     }
 197     align.addCodonFrame(acf);
 198   }
 199
 200   /**
 201    * Construct the mapping
 202    *
 203    * @param region
 204    * @param type
 205    * @param forwardStrand
 206    * @param featureIsOnTarget
 207    * @param gff
 208    * @return
 209    */
 210   protected MapList buildMapping(String region, MappingType type,
 211           boolean forwardStrand, boolean featureIsOnTarget, String[] gff)
 212   {
 213     /*
 214      * process one "fromStart toStart fromCount" descriptor
 215      */
 216     String[] tokens = region.split(" ");
 217     if (tokens.length != 3)
 218     {
 219       System.err.println("Malformed Align descriptor: " + region);
 220       return null;
 221     }
 222
 223     /*
 224      * get start/end of from/to mappings
 225      * if feature is on the target sequence we have to invert the sense
 226      */
 227     int alignFromStart;
 228     int alignToStart;
 229     int alignCount;
 230     try
 231     {
 232       alignFromStart = Integer.parseInt(tokens[0]);
 233       alignToStart = Integer.parseInt(tokens[1]);
 234       alignCount = Integer.parseInt(tokens[2]);
 235     } catch (NumberFormatException nfe)
 236     {
 237       System.err.println(nfe.toString());
 238       return null;
 239     }
 240
 241     int fromStart;
 242     int fromEnd;
 243     int toStart;
 244     int toEnd;
 245
 246     if (featureIsOnTarget)
 247     {
 248       fromStart = alignToStart;
 249       toStart = alignFromStart;
 250       toEnd = forwardStrand ? toStart + alignCount - 1 : toStart
 251               - (alignCount - 1);
 252       int toLength = Math.abs(toEnd - toStart) + 1;
 253       int fromLength = toLength * type.getFromRatio() / type.getToRatio();
 254       fromEnd = fromStart + fromLength - 1;
 255     }
 256     else
 257     {
 258       // we use the 'Align' values here not the feature start/end
 259       // not clear why they may differ but it seems they can
 260       fromStart = alignFromStart;
 261       fromEnd = alignFromStart + alignCount - 1;
 262       int fromLength = fromEnd - fromStart + 1;
 263       int toLength = fromLength * type.getToRatio() / type.getFromRatio();
 264       toStart = alignToStart;
 265       if (forwardStrand)
 266       {
 267         toEnd = toStart + toLength - 1;
 268       }
 269       else
 270       {
 271         toEnd = toStart - (toLength - 1);
 272       }
 273     }
 274
 275     MapList codonmapping = constructMappingFromAlign(fromStart, fromEnd,
 276             toStart, toEnd, type);
 277     return codonmapping;
 278   }
 279
 280   /**
 281    * Returns a MappingType depending on the exonerate 'model' value.
 282    *
 283    * @param model
 284    * @return
 285    */
 286   protected static MappingType getMappingType(String model)
 287   {
 288     MappingType result = null;
 289
 290     if (model.contains(PROTEIN2DNA) || model.contains(PROTEIN2GENOME))
 291     {
 292       result = MappingType.PeptideToNucleotide;
 293     }
 294     else if (model.contains(CODING2CODING) || model.contains(CODING2GENOME)
 295             || model.contains(CDNA2GENOME) || model.contains(GENOME2GENOME))
 296     {
 297       result = MappingType.NucleotideToNucleotide;
 298     }
 299     return result;
 300   }
 301
 302   /**
 303    * Tests whether the GFF data looks like it was generated by exonerate, and is
 304    * a format we are willing to handle
 305    *
 306    * @param columns
 307    * @return
 308    */
 309   public static boolean recognises(String[] columns)
 310   {
 311     if (!SIMILARITY.equalsIgnoreCase(columns[TYPE_COL]))
 312     {
 313       return false;
 314     }
 315
 316     /*
 317      * inspect alignment model
 318      */
 319     String model = columns[SOURCE_COL];
 320     // e.g. exonerate:protein2genome:local
 321     if (model != null)
 322     {
 323       String mdl = model.toLowerCase();
 324       if (mdl.contains(PROTEIN2DNA) || mdl.contains(PROTEIN2GENOME)
 325               || mdl.contains(CODING2CODING) || mdl.contains(CODING2GENOME)
 326               || mdl.contains(CDNA2GENOME) || mdl.contains(GENOME2GENOME))
 327       {
 328         return true;
 329       }
 330     }
 331     System.err.println("Sorry, I don't handle exonerate model " + model);
 332     return false;
 333   }
 334
 335   @Override
 336   protected SequenceFeature buildSequenceFeature(String[] gff,
 337           Map<String, List<String>> set)
 338   {
 339     SequenceFeature sf = super.buildSequenceFeature(gff, set);
 340     sf.setFeatureGroup("exonerate");
 341
 342     return sf;
 343   }
 344
 345 }