src/jalview/io/gff/ExonerateHelper.java

   1 package jalview.io.gff;
   2
   3 import jalview.datamodel.AlignedCodonFrame;
   4 import jalview.datamodel.AlignmentI;
   5 import jalview.datamodel.MappingType;
   6 import jalview.datamodel.SequenceFeature;
   7 import jalview.datamodel.SequenceI;
   8 import jalview.util.MapList;
   9
  10 import java.io.IOException;
  11 import java.util.List;
  12 import java.util.Map;
  13
  14 /**
  15  * A handler to parse GFF in the format generated by the exonerate tool
  16  */
  17 public class ExonerateHelper extends Gff2Helper
  18 {
  19   private static final String SIMILARITY = "similarity";
  20
  21   private static final String GENOME2GENOME = "genome2genome";
  22
  23   private static final String CDNA2GENOME = "cdna2genome";
  24
  25   private static final String CODING2GENOME = "coding2genome";
  26
  27   private static final String CODING2CODING = "coding2coding";
  28
  29   private static final String PROTEIN2GENOME = "protein2genome";
  30
  31   private static final String PROTEIN2DNA = "protein2dna";
  32
  33   private static final String ALIGN = "Align";
  34
  35   private static final String QUERY = "Query";
  36
  37   private static final String TARGET = "Target";
  38
  39   /**
  40    * Process one GFF feature line (as modelled by SequenceFeature)
  41    *
  42    * @param seq
  43    *          the sequence with which this feature is associated
  44    * @param gffColumns
  45    *          the sequence feature with ATTRIBUTES property containing any
  46    *          additional attributes
  47    * @param align
  48    *          the alignment we are adding GFF to
  49    * @param newseqs
  50    *          any new sequences referenced by the GFF
  51    * @param relaxedIdMatching
  52    *          if true, match word tokens in sequence names
  53    * @return true if the sequence feature should be added to the sequence, else
  54    *         false (i.e. it has been processed in another way e.g. to generate a
  55    *         mapping)
  56    */
  57   @Override
  58   public SequenceFeature processGff(SequenceI seq, String[] gffColumns,
  59           AlignmentI align, List<SequenceI> newseqs,
  60           boolean relaxedIdMatching)
  61   {
  62     String attr = gffColumns[ATTRIBUTES_COL];
  63     Map<String, List<String>> set = parseNameValuePairs(attr);
  64
  65     try
  66     {
  67       processGffSimilarity(set, seq, gffColumns,
  68               align, newseqs, relaxedIdMatching);
  69     } catch (IOException ivfe)
  70     {
  71       System.err.println(ivfe);
  72     }
  73
  74     /*
  75      * return null to indicate we don't want to add a sequence feature for
  76      * similarity (only process it to create mappings)
  77      */
  78     return null;
  79   }
  80
  81   /**
  82    * Processes the 'Query' (or 'Target') and 'Align' properties associated with
  83    * an exonerate GFF similarity feature; these properties define the mapping of
  84    * the annotated range to a related sequence.
  85    *
  86    * @param set
  87    *          parsed GFF column 9 key/value(s)
  88    * @param seq
  89    *          the sequence the GFF feature is on
  90    * @param gff
  91    *          the GFF column data
  92    * @param align
  93    *          the alignment the sequence belongs to, where any new mappings
  94    *          should be added
  95    * @param newseqs
  96    *          a list of new 'virtual sequences' generated while parsing GFF
  97    * @param relaxedIdMatching
  98    *          if true allow fuzzy search for a matching target sequence
  99    * @throws IOException
 100    */
 101   protected void processGffSimilarity(
 102           Map<String, List<String>> set,
 103           SequenceI seq, String[] gff, AlignmentI align,
 104           List<SequenceI> newseqs, boolean relaxedIdMatching)
 105           throws IOException
 106   {
 107     /*
 108      * exonerate may be run with
 109      * --showquerygff - outputs 'features on the query' e.g. (protein2genome)
 110      *     Target <dnaseqid> ; Align proteinStartPos dnaStartPos proteinCount
 111      * --showtargetgff - outputs 'features on the target' e.g. (protein2genome)
 112      *     Query <proteinseqid> ; Align dnaStartPos proteinStartPos nucleotideCount
 113      * where the Align spec may repeat
 114      */
 115     // TODO handle coding2coding and similar as well
 116     boolean featureIsOnTarget = true;
 117     List<String> mapTo = set.get(QUERY);
 118     if (mapTo == null)
 119     {
 120       mapTo = set.get(TARGET);
 121       featureIsOnTarget = false;
 122     }
 123     MappingType type = getMappingType(gff[SOURCE_COL]);
 124
 125     if (type == null)
 126     {
 127       throw new IOException("Sorry, I don't handle " + gff[SOURCE_COL]);
 128     }
 129
 130     if (mapTo == null || mapTo.size() != 1)
 131     {
 132       throw new IOException(
 133               "Expecting exactly one sequence in Query or Target field (got "
 134                       + mapTo + ")");
 135     }
 136
 137     /*
 138      * locate the mapped sequence in the alignment or 'new' (GFF file) sequences;
 139      */
 140     SequenceI mappedSequence = findSequence(mapTo.get(0), align, newseqs,
 141             relaxedIdMatching);
 142
 143     /*
 144      * If mapping is from protein to dna, we store it as dna to protein instead
 145      */
 146     SequenceI mapFromSequence = seq;
 147     SequenceI mapToSequence = mappedSequence;
 148     if ((type == MappingType.NucleotideToPeptide && featureIsOnTarget)
 149             || (type == MappingType.PeptideToNucleotide && !featureIsOnTarget))
 150     {
 151       mapFromSequence = mappedSequence;
 152       mapToSequence = seq;
 153     }
 154
 155     /*
 156      * Process the Align maps and create mappings.
 157      * These may be cdna-genome, cdna-protein, genome-protein.
 158      * The mapped sequences may or may not be in the alignment
 159      * (they may be included later in the GFF file).
 160      */
 161
 162     /*
 163      * get any existing mapping for these sequences (or start one),
 164      * and add this mapped range
 165      */
 166     AlignedCodonFrame acf = getMapping(align, mapFromSequence,
 167             mapToSequence);
 168
 169     /*
 170      * exonerate GFF has the strand of the target in column 7
 171      * (differs from GFF3 which has it in the Target descriptor)
 172      */
 173     String strand = gff[STRAND_COL];
 174     boolean forwardStrand = true;
 175     if ("-".equals(strand))
 176     {
 177       forwardStrand = false;
 178     }
 179     else if (!"+".equals(strand))
 180     {
 181       System.err.println("Strand must be specified for alignment");
 182       return;
 183     }
 184
 185     List<String> alignedRegions = set.get(ALIGN);
 186     for (String region : alignedRegions)
 187     {
 188       MapList mapping = buildMapping(region, type, forwardStrand,
 189               featureIsOnTarget, gff);
 190
 191       if (mapping == null)
 192       {
 193         continue;
 194       }
 195
 196       acf.addMap(mapFromSequence, mapToSequence, mapping);
 197     }
 198     align.addCodonFrame(acf);
 199   }
 200
 201   /**
 202    * Construct the mapping
 203    *
 204    * @param region
 205    * @param type
 206    * @param forwardStrand
 207    * @param featureIsOnTarget
 208    * @param gff
 209    * @return
 210    */
 211   protected MapList buildMapping(String region, MappingType type,
 212           boolean forwardStrand, boolean featureIsOnTarget, String[] gff)
 213   {
 214     /*
 215      * process one "fromStart toStart fromCount" descriptor
 216      */
 217     String[] tokens = region.split(" ");
 218     if (tokens.length != 3)
 219     {
 220       System.err.println("Malformed Align descriptor: " + region);
 221       return null;
 222     }
 223
 224     /*
 225      * get start/end of from/to mappings
 226      * if feature is on the target sequence we have to invert the sense
 227      */
 228     int alignFromStart;
 229     int alignToStart;
 230     int alignCount;
 231     try {
 232       alignFromStart = Integer.parseInt(tokens[0]);
 233       alignToStart = Integer.parseInt(tokens[1]);
 234       alignCount = Integer.parseInt(tokens[2]);
 235     } catch (NumberFormatException nfe) {
 236       System.err.println(nfe.toString());
 237       return null;
 238     }
 239
 240     int fromStart;
 241     int fromEnd;
 242     int toStart;
 243     int toEnd;
 244
 245     if (featureIsOnTarget)
 246     {
 247       fromStart = alignToStart;
 248       toStart = alignFromStart;
 249       toEnd = forwardStrand ? toStart + alignCount - 1 : toStart
 250               - (alignCount - 1);
 251       int toLength = Math.abs(toEnd - toStart) + 1;
 252       int fromLength = toLength * type.getFromRatio() / type.getToRatio();
 253       fromEnd = fromStart + fromLength - 1;
 254     }
 255     else
 256     {
 257       // we use the 'Align' values here not the feature start/end
 258       // not clear why they may differ but it seems they can
 259       fromStart = alignFromStart;
 260       fromEnd = alignFromStart + alignCount - 1;
 261       int fromLength = fromEnd - fromStart + 1;
 262       int toLength = fromLength * type.getToRatio() / type.getFromRatio();
 263       toStart = alignToStart;
 264       if (forwardStrand)
 265       {
 266         toEnd = toStart + toLength - 1;
 267       }
 268       else
 269       {
 270         toEnd = toStart - (toLength - 1);
 271       }
 272     }
 273
 274     MapList codonmapping = constructMappingFromAlign(fromStart, fromEnd,
 275             toStart, toEnd, type);
 276     return codonmapping;
 277   }
 278
 279   /**
 280    * Returns a MappingType depending on the exonerate 'model' value.
 281    *
 282    * @param model
 283    * @return
 284    */
 285   protected static MappingType getMappingType(String model)
 286   {
 287     MappingType result = null;
 288
 289     if (model.contains(PROTEIN2DNA) || model.contains(PROTEIN2GENOME))
 290     {
 291       result = MappingType.PeptideToNucleotide;
 292     }
 293     else if (model.contains(CODING2CODING)
 294             || model.contains(CODING2GENOME)
 295             || model.contains(CDNA2GENOME)
 296             || model.contains(GENOME2GENOME))
 297     {
 298       result = MappingType.NucleotideToNucleotide;
 299     }
 300     return result;
 301   }
 302
 303   /**
 304    * Tests whether the GFF data looks like it was generated by exonerate, and is
 305    * a format we are willing to handle
 306    *
 307    * @param columns
 308    * @return
 309    */
 310   public static boolean recognises(String[] columns)
 311   {
 312     if (!SIMILARITY.equalsIgnoreCase(columns[TYPE_COL]))
 313     {
 314       return false;
 315     }
 316
 317     /*
 318      * inspect alignment model
 319      */
 320     String model = columns[SOURCE_COL];
 321     // e.g. exonerate:protein2genome:local
 322     if (model != null)
 323     {
 324       String mdl = model.toLowerCase();
 325       if (mdl.contains(PROTEIN2DNA) || mdl.contains(PROTEIN2GENOME)
 326               || mdl.contains(CODING2CODING)
 327               || mdl.contains(CODING2GENOME)
 328               || mdl.contains(CDNA2GENOME)
 329               || mdl.contains(GENOME2GENOME))
 330       {
 331         return true;
 332       }
 333     }
 334     System.err.println("Sorry, I don't handle exonerate model " + model);
 335     return false;
 336   }
 337
 338   @Override
 339   protected SequenceFeature buildSequenceFeature(String[] gff,
 340           Map<String, List<String>> set)
 341   {
 342     SequenceFeature sf = super.buildSequenceFeature(gff, set);
 343     sf.setFeatureGroup("exonerate");
 344
 345     return sf;
 346   }
 347
 348 }