develop merge
[jalview.git] / src / jalview / io / gff / InterProScanHelper.java
diff --git a/src/jalview/io/gff/InterProScanHelper.java b/src/jalview/io/gff/InterProScanHelper.java
new file mode 100644 (file)
index 0000000..68d5d4f
--- /dev/null
@@ -0,0 +1,147 @@
+package jalview.io.gff;
+
+import jalview.datamodel.AlignmentI;
+import jalview.datamodel.SequenceFeature;
+import jalview.datamodel.SequenceI;
+import jalview.util.StringUtils;
+
+import java.io.IOException;
+import java.util.List;
+import java.util.Map;
+
+/**
+ * A handler to parse GFF in the format generated by InterProScan. Features it
+ * builds are placed in the 'InterProScan' feature group, with the GFF source
+ * column as their type.
+ */
+public class InterProScanHelper extends Gff3Helper
+{
+  private static final String INTER_PRO_SCAN = "InterProScan";
+
+  private static final String SIGNATURE_DESC = "signature_desc";
+
+  /**
+   * Process one GFF feature line (as modelled by SequenceFeature). A line whose
+   * source column is "." is ignored (null is returned); any other line is
+   * handed to the superclass.
+   *
+   * @param seq
+   *          the sequence with which this feature is associated
+   * @param gff
+   *          the gff column data
+   * @param align
+   *          the alignment we are adding GFF to
+   * @param newseqs
+   *          any new sequences referenced by the GFF
+   * @param relaxedIdMatching
+   *          if true, match word tokens in sequence names
+   * @return a sequence feature if one should be added to the sequence, else
+   *         null (i.e. it has been processed in another way e.g. to generate a
+   *         mapping)
+   * @throws IOException
+   */
+  @Override
+  public SequenceFeature processGff(SequenceI seq, String[] gff,
+          AlignmentI align, List<SequenceI> newseqs,
+          boolean relaxedIdMatching) throws IOException
+  {
+    /*
+     * ignore the 'polypeptide' match of the whole sequence
+     */
+    if (".".equals(gff[SOURCE_COL]))
+    {
+      return null;
+    }
+
+    return super.processGff(seq, gff, align, newseqs, relaxedIdMatching);
+  }
+
+  /**
+   * Builds the feature via the superclass, then applies InterProScan
+   * conventions: description from 'signature_desc' (when present and
+   * non-empty), type from the GFF source column, feature group 'InterProScan'.
+   *
+   * @param gff
+   *          the gff column data
+   * @param attributes
+   *          the attribute values for this GFF line, keyed by attribute name
+   * @return the feature built by the superclass, after these adjustments
+   */
+  @Override
+  protected SequenceFeature buildSequenceFeature(String[] gff,
+          Map<String, List<String>> attributes)
+  {
+    SequenceFeature sf = super.buildSequenceFeature(gff, attributes);
+
+    /*
+     * signature_desc is a more informative source of description; the
+     * attribute is optional, so only join its values when it is present
+     */
+    List<String> desc = attributes.get(SIGNATURE_DESC);
+    if (desc != null)
+    {
+      String description = StringUtils.listToDelimitedString(desc, ", ");
+      if (!description.isEmpty())
+      {
+        sf.setDescription(description);
+      }
+    }
+
+    /*
+     * Set sequence feature group as 'InterProScan', and type as the source
+     * database for this match (e.g. 'Pfam')
+     */
+    sf.setType(gff[SOURCE_COL]);
+    sf.setFeatureGroup(INTER_PRO_SCAN);
+
+    return sf;
+  }
+
+  /**
+   * Tests whether the GFF data looks like it was generated by InterProScan:
+   * either the type is-a PROTEIN_MATCH, or the source is "." and the type is-a
+   * POLYPEPTIDE. Null or truncated column data is not recognised.
+   *
+   * @param columns
+   *          the gff column data (may be null)
+   * @return true if the data looks like InterProScan output, else false
+   */
+  public static boolean recognises(String[] columns)
+  {
+    /*
+     * guard so malformed input is rejected rather than throwing
+     */
+    if (columns == null || columns.length <= Math.max(SOURCE_COL, TYPE_COL))
+    {
+      return false;
+    }
+
+    SequenceOntologyI so = SequenceOntologyFactory.getInstance();
+    String type = columns[TYPE_COL];
+    return so.isA(type, SequenceOntologyI.PROTEIN_MATCH)
+            || (".".equals(columns[SOURCE_COL])
+                    && so.isA(type, SequenceOntologyI.POLYPEPTIDE));
+  }
+
+  /**
+   * Overridden method, because InterProScan GFF has the target sequence id in
+   * GFF field 'ID' rather than the usual 'Target' :-O
+   *
+   * @param target
+   *          not used by this implementation
+   * @param set
+   *          the attribute values for this GFF line, keyed by attribute name
+   * @return the single 'ID' value, or null if there is none or more than one
+   */
+  @Override
+  protected String findTargetId(String target, Map<String, List<String>> set)
+  {
+    List<String> ids = set.get(ID);
+    if (ids == null || ids.size() != 1)
+    {
+      return null;
+    }
+    return ids.get(0);
+  }
+
+}