JAL-3121 slightly cleaner parsing of, and unit test for, map attributes
[jalview.git] / src / jalview / io / gff / Gff3Helper.java
index 4a2a50e..1ef8848 100644 (file)
@@ -1,3 +1,23 @@
+/*
+ * Jalview - A Sequence Alignment Editor and Viewer ($$Version-Rel$$)
+ * Copyright (C) $$Year-Rel$$ The Jalview Authors
+ * 
+ * This file is part of Jalview.
+ * 
+ * Jalview is free software: you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License 
+ * as published by the Free Software Foundation, either version 3
+ * of the License, or (at your option) any later version.
+ *  
+ * Jalview is distributed in the hope that it will be useful, but 
+ * WITHOUT ANY WARRANTY; without even the implied warranty 
+ * of MERCHANTABILITY or FITNESS FOR A PARTICULAR 
+ * PURPOSE.  See the GNU General Public License for more details.
+ * 
+ * You should have received a copy of the GNU General Public License
+ * along with Jalview.  If not, see <http://www.gnu.org/licenses/>.
+ * The Jalview Authors are detailed in the 'AUTHORS' file.
+ */
 package jalview.io.gff;
 
 import jalview.datamodel.AlignedCodonFrame;
@@ -5,7 +25,6 @@ import jalview.datamodel.AlignmentI;
 import jalview.datamodel.MappingType;
 import jalview.datamodel.SequenceFeature;
 import jalview.datamodel.SequenceI;
-import jalview.ext.ensembl.EnsemblSeqProxy;
 import jalview.util.MapList;
 import jalview.util.StringUtils;
 
@@ -20,6 +39,8 @@ import java.util.Map;
  */
 public class Gff3Helper extends GffHelperBase
 {
+  public static final String ALLELES = "alleles";
+
   protected static final String TARGET = "Target";
 
   protected static final String ID = "ID";
@@ -70,15 +91,16 @@ public class Gff3Helper extends GffHelperBase
       String atts = gff[ATTRIBUTES_COL];
       Map<String, List<String>> attributes = parseNameValuePairs(atts);
 
-      if (SequenceOntology.getInstance().isProteinMatch(soTerm))
+      SequenceOntologyI so = SequenceOntologyFactory.getInstance();
+      if (so.isA(soTerm, SequenceOntologyI.PROTEIN_MATCH))
       {
-        sf = processProteinMatch(attributes, seq, gff, align,
-                newseqs, relaxedIdMatching);
+        sf = processProteinMatch(attributes, seq, gff, align, newseqs,
+                relaxedIdMatching);
       }
-      else if (SequenceOntology.getInstance().isNucleotideMatch(soTerm))
+      else if (so.isA(soTerm, SequenceOntologyI.NUCLEOTIDE_MATCH))
       {
-        sf = processNucleotideMatch(attributes, seq, gff, align,
-                newseqs, relaxedIdMatching);
+        sf = processNucleotideMatch(attributes, seq, gff, align, newseqs,
+                relaxedIdMatching);
       }
       else
       {
@@ -92,7 +114,7 @@ public class Gff3Helper extends GffHelperBase
        */
       sf = buildSequenceFeature(gff, null);
     }
-  
+
     return sf;
   }
 
@@ -119,8 +141,7 @@ public class Gff3Helper extends GffHelperBase
   protected SequenceFeature processNucleotideMatch(
           Map<String, List<String>> attributes, SequenceI seq,
           String[] gffColumns, AlignmentI align, List<SequenceI> newseqs,
-          boolean relaxedIdMatching)
-          throws IOException
+          boolean relaxedIdMatching) throws IOException
   {
     String strand = gffColumns[STRAND_COL];
 
@@ -133,8 +154,8 @@ public class Gff3Helper extends GffHelperBase
      */
     if ("-".equals(strand))
     {
-      System.err
-              .println("Skipping mapping from reverse complement as not yet supported");
+      System.err.println(
+              "Skipping mapping from reverse complement as not yet supported");
       return null;
     }
 
@@ -166,8 +187,8 @@ public class Gff3Helper extends GffHelperBase
        * (new or existing) virtual sequence in the newseqs list 
        */
       String targetId = findTargetId(tokens[0], attributes);
-      SequenceI mappedSequence1 = findSequence(targetId, align,
-      newseqs, relaxedIdMatching);
+      SequenceI mappedSequence1 = findSequence(targetId, align, newseqs,
+              relaxedIdMatching);
       SequenceI mappedSequence = mappedSequence1;
       if (mappedSequence == null)
       {
@@ -195,8 +216,7 @@ public class Gff3Helper extends GffHelperBase
         int fromStart = Integer.parseInt(gffColumns[START_COL]);
         int fromEnd = Integer.parseInt(gffColumns[END_COL]);
         MapList mapping = constructMappingFromAlign(fromStart, fromEnd,
-                toStart, toEnd,
-                MappingType.NucleotideToNucleotide);
+                toStart, toEnd, MappingType.NucleotideToNucleotide);
 
         if (mapping != null)
         {
@@ -226,7 +246,8 @@ public class Gff3Helper extends GffHelperBase
    * @return
    */
   @SuppressWarnings("unused")
-  protected String findTargetId(String target, Map<String, List<String>> set)
+  protected String findTargetId(String target,
+          Map<String, List<String>> set)
   {
     return target;
   }
@@ -257,8 +278,8 @@ public class Gff3Helper extends GffHelperBase
    * @throws IOException
    */
   protected SequenceFeature processProteinMatch(
-          Map<String, List<String>> set, SequenceI seq,
-          String[] gffColumns, AlignmentI align, List<SequenceI> newseqs,
+          Map<String, List<String>> set, SequenceI seq, String[] gffColumns,
+          AlignmentI align, List<SequenceI> newseqs,
           boolean relaxedIdMatching)
   {
     // This is currently tailored to InterProScan GFF output:
@@ -280,8 +301,8 @@ public class Gff3Helper extends GffHelperBase
       for (String target : targets)
       {
 
-        SequenceI mappedSequence1 = findSequence(findTargetId(target, set), align,
-        newseqs, relaxedIdMatching);
+        SequenceI mappedSequence1 = findSequence(findTargetId(target, set),
+                align, newseqs, relaxedIdMatching);
         SequenceI mappedSequence = mappedSequence1;
         if (mappedSequence == null)
         {
@@ -292,10 +313,9 @@ public class Gff3Helper extends GffHelperBase
          * give the mapped sequence a copy of the sequence feature, with 
          * start/end range adjusted 
          */
-        SequenceFeature sf2 = new SequenceFeature(sf);
-        sf2.setBegin(1);
         int sequenceFeatureLength = 1 + sf.getEnd() - sf.getBegin();
-        sf2.setEnd(sequenceFeatureLength);
+        SequenceFeature sf2 = new SequenceFeature(sf, 1,
+                sequenceFeatureLength, sf.getFeatureGroup(), sf.getScore());
         mappedSequence.addSequenceFeature(sf2);
 
         /*
@@ -303,8 +323,8 @@ public class Gff3Helper extends GffHelperBase
          * renamed with its qualified accession id; renaming has to wait until
          * all sequence reference resolution is complete
          */
-        String accessionId = StringUtils.listToDelimitedString(
-                set.get(NAME), ",");
+        String accessionId = StringUtils
+                .listToDelimitedString(set.get(NAME), ",");
         if (accessionId.length() > 0)
         {
           String database = sf.getType(); // TODO InterProScan only??
@@ -330,23 +350,16 @@ public class Gff3Helper extends GffHelperBase
   }
 
   /**
-   * Return '=' as the name-value separator used in column 9 attributes.
-   */
-  @Override
-  protected char getNameValueSeparator()
-  {
-    return '=';
-  }
-
-  /**
    * Modifies the default SequenceFeature in order to set the Target sequence id
    * as the description
    */
   @Override
   protected SequenceFeature buildSequenceFeature(String[] gff,
+          int typeColumn, String group,
           Map<String, List<String>> attributes)
   {
-    SequenceFeature sf = super.buildSequenceFeature(gff, attributes);
+    SequenceFeature sf = super.buildSequenceFeature(gff, typeColumn, group,
+            attributes);
     String desc = getDescription(sf, attributes);
     if (desc != null)
     {
@@ -372,22 +385,41 @@ public class Gff3Helper extends GffHelperBase
       desc = target.split(" ")[0];
     }
 
-    if (SequenceOntology.getInstance().isSequenceVariant(sf.getType()))
+    SequenceOntologyI so = SequenceOntologyFactory.getInstance();
+    String type = sf.getType();
+    if (so.isA(type, SequenceOntologyI.SEQUENCE_VARIANT))
     {
       /*
        * Ensembl returns dna variants as 'alleles'
        */
-      desc = StringUtils.listToDelimitedString(
-              attributes.get("alleles"), ",");
+      desc = StringUtils.listToDelimitedString(attributes.get(ALLELES),
+              ",");
     }
 
     /*
-     * Ensembl returns gene name as 'Name' for a transcript
+     * extract 'Name' for a transcript (to show the gene name)
+     * or for an exon (so that 'colour by label' shows exon boundaries)
      */
-    if (EnsemblSeqProxy.isTranscript(sf.getType()))
+    if (SequenceOntologyI.NMD_TRANSCRIPT_VARIANT.equals(type)
+            || so.isA(type, SequenceOntologyI.TRANSCRIPT)
+            || so.isA(type, SequenceOntologyI.EXON))
     {
       desc = StringUtils.listToDelimitedString(attributes.get("Name"), ",");
     }
+
+    /*
+     * if the description is still null, fall back to the feature's ID value
+     */
+    if (desc == null)
+    {
+      desc = (String) sf.getValue(ID);
+    }
+
+    /*
+     * and decode encoded comma, equals and semicolon, as required by GFF3 spec
+     */
+    desc = StringUtils.urlDecode(desc, GFF_ENCODABLE);
+
     return desc;
   }
 }