JAL-3499 process motifs in features file to create features feature/JAL-3499featuresFileMotifs
authorgmungoc <g.m.carstairs@dundee.ac.uk>
Tue, 21 Jan 2020 19:40:38 +0000 (19:40 +0000)
committergmungoc <g.m.carstairs@dundee.ac.uk>
Tue, 21 Jan 2020 19:40:38 +0000 (19:40 +0000)
src/jalview/analysis/Finder.java
src/jalview/io/FeaturesFile.java
test/jalview/io/FeaturesFileTest.java

index 3cbef6d..61ca663 100644 (file)
@@ -53,11 +53,17 @@ public class Finder implements FinderI
   private Vector<SequenceI> idMatches;
 
   /*
-   * the viewport to search over
+   * the viewport to search over, if known
+   * (may restrict search by selection group, or for hidden columns)
    */
   private AlignViewportI viewport;
 
   /*
+   * the alignment to search over
+   */
+  private AlignmentI alignment;
+
+  /*
    * sequence index in alignment to search from
    */
   private int sequenceIndex;
@@ -76,6 +82,22 @@ public class Finder implements FinderI
    */
   public Finder(AlignViewportI av)
   {
+    this(av, av.getAlignment());
+  }
+
+  /**
+   * Constructor given an alignment to search. No viewport is set, so the
+   * search is not restricted to any selection group.
+   * @param align the alignment to search
+   */
+  public Finder(AlignmentI align)
+  {
+    this(null, align);
+  }
+
+  private Finder(AlignViewportI av, AlignmentI align)
+  {
+    this.alignment = align;
     this.viewport = av;
     this.sequenceIndex = 0;
     this.columnIndex = -1;
@@ -135,13 +157,13 @@ public class Finder implements FinderI
     searchResults = new SearchResults();
     idMatches = new Vector<>();
 
-    SequenceGroup selection = viewport.getSelectionGroup();
+    SequenceGroup selection = viewport == null ? null
+            : viewport.getSelectionGroup();
     if (selection != null && selection.getSize() < 1)
     {
       selection = null; // ? ignore column-only selection
     }
 
-    AlignmentI alignment = viewport.getAlignment();
     int end = alignment.getHeight();
 
     while (sequenceIndex < end)
@@ -179,7 +201,6 @@ public class Finder implements FinderI
      * restrict search to (next) visible column region, 
      * in case there are hidden columns
      */
-    AlignmentI alignment = viewport.getAlignment();
     VisibleContigsIterator visibleRegions = alignment.getHiddenColumns()
             .getVisContigsIterator(column, alignment.getWidth(),
                     false);
@@ -195,7 +216,8 @@ public class Finder implements FinderI
     /*
      * restrict search to selected region if there is one
      */
-    SequenceGroup selection = viewport.getSelectionGroup();
+    SequenceGroup selection = viewport == null ? null
+            : viewport.getSelectionGroup();
     if (selection != null)
     {
       int selectionStart = selection.getStartRes();
@@ -230,7 +252,8 @@ public class Finder implements FinderI
   protected boolean findNextMatch(SequenceI seq, String searchString,
           Regex searchPattern, boolean matchDescription)
   {
-    SequenceGroup selection = viewport.getSelectionGroup();
+    SequenceGroup selection = viewport == null ? null
+            : viewport.getSelectionGroup();
     if (selection != null && !selection.contains(seq))
     {
       /*
index a69788b..df3d352 100755 (executable)
 package jalview.io;
 
 import jalview.analysis.AlignmentUtils;
+import jalview.analysis.Finder;
 import jalview.analysis.SequenceIdMatcher;
 import jalview.api.AlignViewportI;
 import jalview.api.FeatureColourI;
 import jalview.api.FeatureRenderer;
 import jalview.api.FeaturesSourceI;
+import jalview.api.FinderI;
 import jalview.datamodel.AlignedCodonFrame;
 import jalview.datamodel.Alignment;
 import jalview.datamodel.AlignmentI;
 import jalview.datamodel.MappedFeatures;
+import jalview.datamodel.SearchResultMatchI;
 import jalview.datamodel.SequenceDummy;
 import jalview.datamodel.SequenceFeature;
 import jalview.datamodel.SequenceI;
@@ -85,6 +88,10 @@ public class FeaturesFile extends AlignFile implements FeaturesSourceI
 
   private static final String ENDFILTERS = "ENDFILTERS";
 
+  private static final String STARTMOTIFS = "STARTMOTIFS";
+
+  private static final String ENDMOTIFS = "ENDMOTIFS";
+
   private static final String ID_NOT_SPECIFIED = "ID_NOT_SPECIFIED";
 
   private static final String NOTE = "Note";
@@ -232,6 +239,7 @@ public class FeaturesFile extends AlignFile implements FeaturesSourceI
 
       while ((line = nextLine()) != null)
       {
+        line = line.trim();
         // skip comments/process pragmas
         if (line.length() == 0 || line.startsWith("#"))
         {
@@ -259,8 +267,9 @@ public class FeaturesFile extends AlignFile implements FeaturesSourceI
         if (gffColumns.length > 0 && gffColumns.length < 4)
         {
           /*
-           * if 2 or 3 tokens, we anticipate either 'startgroup', 'endgroup' or
-           * a feature type colour specification
+           * if 2 or 3 tokens, we anticipate one of either
+           * STARTGROUP ENDGROUP STARTFILTERS STARTMOTIFS
+           * or a feature type and colour specification
            */
           String ft = gffColumns[0];
           if (ft.equalsIgnoreCase(STARTFILTERS))
@@ -268,14 +277,17 @@ public class FeaturesFile extends AlignFile implements FeaturesSourceI
             parseFilters(filters);
             continue;
           }
+          if (ft.equalsIgnoreCase(STARTMOTIFS))
+          {
+            parseMotifs(align, featureGroup);
+            continue;
+          }
           if (ft.equalsIgnoreCase(STARTGROUP))
           {
             featureGroup = gffColumns[1];
           }
           else if (ft.equalsIgnoreCase(ENDGROUP))
           {
-            // We should check whether this is the current group,
-            // but at present there's no way of showing more than 1 group
             featureGroup = null;
           }
           else
@@ -292,8 +304,7 @@ public class FeaturesFile extends AlignFile implements FeaturesSourceI
         }
 
         /*
-         * if not a comment, GFF pragma, startgroup, endgroup or feature
-         * colour specification, that just leaves a feature details line
+         * if not handled above, that just leaves a feature details line
          * in either Jalview or GFF format
          */
         if (gffVersion == 0)
@@ -333,6 +344,62 @@ public class FeaturesFile extends AlignFile implements FeaturesSourceI
   }
 
   /**
+   * Reads lines up to and including the next ENDMOTIFS, and processes each one
+   * by
+   * <ul>
+   * <li>searching the alignment for the supplied motif (first column)</li>
+   * <li>creating features for matches, with feature type, description and
+   * (optionally) score given in the remaining columns</li>
+   * </ul>
+   * 
+   * @param alignment
+   * @param featureGroup
+   * @throws IOException
+   */
+  protected void parseMotifs(AlignmentI alignment, String featureGroup)
+          throws IOException
+  {
+    FinderI finder = new Finder(alignment);
+    String line;
+    while ((line = nextLine()) != null)
+    {
+      if (line.toUpperCase().startsWith(ENDMOTIFS))
+      {
+        return;
+      }
+      String[] tokens = line.split(TAB_REGEX);
+      if (tokens.length != 3 && tokens.length != 4)
+      {
+        System.err.println(String.format("Invalid token count %d for %s",
+                tokens.length, line));
+      }
+      String motif = tokens[0];
+      String featureType = tokens[1];
+      String description = tokens[2];
+      float score = 0f;
+      if (tokens.length > 3)
+      {
+        try
+        {
+          score = Float.valueOf(tokens[3]);
+        } catch (NumberFormatException e)
+        {
+          System.err.println("Invalid score in " + line);
+        }
+      }
+      finder.findAll(motif, true, false);
+      List<SearchResultMatchI> matches = finder.getSearchResults()
+              .getResults();
+      for (SearchResultMatchI match : matches)
+      {
+        SequenceFeature sf = new SequenceFeature(featureType, description,
+                match.getStart(), match.getEnd(), score, featureGroup);
+        match.getSequence().addSequenceFeature(sf);
+      }
+    }
+  }
+
+  /**
    * Reads input lines from STARTFILTERS to ENDFILTERS and adds a feature type
    * filter to the map for each line parsed. After exit from this method,
    * nextLine() should return the line after ENDFILTERS (or we are already at
@@ -354,7 +421,7 @@ public class FeaturesFile extends AlignFile implements FeaturesSourceI
       String[] tokens = line.split(TAB_REGEX);
       if (tokens.length != 2)
       {
-        System.err.println(String.format("Invalid token count %d for %d",
+        System.err.println(String.format("Invalid token count %d for %s",
                 tokens.length, line));
       }
       else
index 090de6f..5c7bf15 100644 (file)
@@ -23,6 +23,7 @@ package jalview.io;
 import static org.testng.AssertJUnit.assertEquals;
 import static org.testng.AssertJUnit.assertFalse;
 import static org.testng.AssertJUnit.assertNotNull;
+import static org.testng.AssertJUnit.assertNull;
 import static org.testng.AssertJUnit.assertSame;
 import static org.testng.AssertJUnit.assertTrue;
 import static org.testng.internal.junit.ArrayAsserts.assertArrayEquals;
@@ -62,6 +63,7 @@ import org.testng.annotations.Test;
 
 public class FeaturesFileTest
 {
+  private static final String TAB = "\t";
   private static String simpleGffFile = "examples/testdata/simpleGff3.gff";
 
   @AfterClass(alwaysRun = true)
@@ -899,4 +901,145 @@ public class FeaturesFileTest
             + "ENDGROUP\tgrp2\n";
     assertEquals(expected, exported);
   }
+
+  /**
+   * Test parsing a features file with Jalview format features, including
+   * STARTMOTIFS/ENDMOTIFS lines with motifs to be matched to create features
+   * 
+   * @throws IOException
+   */
+  @Test(groups = { "Functional" })
+  public void testParse_jalviewFeaturesWithMotifs() throws IOException
+  {
+    File f = new File("examples/uniref50.fa");
+    AlignmentI al = readAlignmentFile(f);
+    AlignFrame af = new AlignFrame(al, 500, 500);
+    Map<String, FeatureColourI> colours = af.getFeatureRenderer()
+            .getFeatureColours();
+
+    /*
+     * hide columns with YKV motif; these should not get
+     * matched by the Finder
+     */
+    al.getHiddenColumns().hideColumns(62, 64);
+
+    // @formatter:off
+    String featureData = 
+            /*
+             * features in the null group
+             */
+            "HELIX" + TAB + "blue\n" +
+            "MOTIF1" + TAB + "green\n" +
+            "MOTIF2" + TAB + "250,200,150|100,50,0|-3.9|4.5|above|-2.0\n" +
+            "adescription" + TAB + "FER_CAPAN" + TAB + "-1" + TAB + "42" + TAB + "45" + TAB + "HELIX\n" +
+            "STARTMOTIFS\n" +
+            "FLP" + TAB + "MOTIF1" + TAB + "flxMotifP\n" +
+            "F[LR]N" + TAB + "MOTIF1" + TAB + "flxMotifN\n" +
+            "fld" + TAB + "MOTIF1" + TAB + "flxMotifD\n" +
+            "YKV" + TAB + "MOTIF1" + TAB + "ykvMotif\n" +
+            "ENDMOTIFS\n" +
+            /*
+             * features in group uniprot
+             */
+            "STARTGROUP" + TAB + "uniprot\n" +
+            "bdescription" + TAB + "FER_CAPAN" + TAB + "-1" + TAB + "47" + TAB + "48" + TAB + "HELIX\n" +
+            "STARTMOTIFS\n" +
+            "FLG" + TAB + "MOTIF1" + TAB + "flxMotifG\n" +
+            "VTT" + TAB + "MOTIF2" + TAB + "vxtMotifT" + TAB + "-3.21\n" +
+            "VRT" + TAB + "MOTIF2" + TAB + "vxtMotifR\n" +
+            "ENDMOTIFS\n" +
+            "ENDGROUP"; 
+            // @formatter:on
+    FeaturesFile featuresFile = new FeaturesFile(featureData,
+            DataSourceType.PASTE);
+    assertTrue("Failed to parse features file",
+            featuresFile.parse(al, colours, true));
+
+    // verify HELIX features were parsed as normal
+    List<SequenceFeature> sfs = al.getSequenceAt(1).findFeatures(0, 999,
+            "HELIX");
+    assertEquals(2, sfs.size());
+    SequenceFeature sf = sfs.get(0);
+    assertNull(sf.getFeatureGroup());
+    assertEquals(42, sf.getBegin());
+    assertEquals(45, sf.getEnd());
+    assertEquals("adescription", sf.getDescription());
+    sf = sfs.get(1);
+    assertEquals("uniprot", sf.getFeatureGroup());
+    assertEquals(47, sf.getBegin());
+    assertEquals(48, sf.getEnd());
+    assertEquals("bdescription", sf.getDescription());
+
+    /*
+     * feature type MOTIF1
+     * FLP motif should match FER1_SOLLC/13-15 and Q93XJ9_SOLTU/13-15
+     * F[LR]N should match O80429_MAIZE/107-109
+     * fld should match nothing (as case sensitive)
+     * feature group should be null for the above
+     * FLG should match FER1_PEA/36-38, feature group uniprot
+     * YKV should match nothing as entirely within hidden columns
+     */
+    for (SequenceI seq : al.getSequences())
+    {
+      List<SequenceFeature> features = seq.findFeatures(0, 9999, "MOTIF1");
+      String name = seq.getName();
+      if (name.equals("FER1_SOLLC") || name.equals("Q93XJ9_SOLTU"))
+      {
+        assertEquals(1, features.size());
+        sf = features.get(0);
+        assertNull(sf.getFeatureGroup());
+        assertEquals(13, sf.getBegin());
+        assertEquals(15, sf.getEnd());
+        assertEquals("flxMotifP", sf.getDescription());
+      }
+      else if (name.equals("O80429_MAIZE"))
+      {
+        assertEquals(1, features.size());
+        sf = features.get(0);
+        assertNull(sf.getFeatureGroup());
+        assertEquals(107, sf.getBegin());
+        assertEquals(109, sf.getEnd());
+        assertEquals("flxMotifN", sf.getDescription());
+      }
+      else if (name.equals("FER1_PEA"))
+      {
+        assertEquals(1, features.size());
+        sf = features.get(0);
+        assertEquals("uniprot", sf.getFeatureGroup());
+        assertEquals(36, sf.getBegin());
+        assertEquals(38, sf.getEnd());
+        assertEquals("flxMotifG", sf.getDescription());
+      }
+      else
+      {
+        assertTrue("MOTIF1 features found for " + name, features.isEmpty());
+      }
+    }
+
+    /*
+     * feature type MOTIF2
+     * VTT motif should match FER1_PEA/26-28
+     * VRT should match nothing
+     */
+    for (SequenceI seq : al.getSequences())
+    {
+      List<SequenceFeature> features = seq.findFeatures(0, 9999, "MOTIF2");
+      String name = seq.getName();
+      if (name.equals("FER1_PEA"))
+      {
+        assertEquals(1, features.size());
+        sf = features.get(0);
+        assertEquals("uniprot", sf.getFeatureGroup());
+        assertEquals(26, sf.getBegin());
+        assertEquals(28, sf.getEnd());
+        assertEquals("vxtMotifT", sf.getDescription());
+        assertEquals(-3.21f, sf.getScore());
+      }
+      else
+      {
+        assertTrue("MOTIF2 features found for " + name, features.isEmpty());
+        assertTrue(features.isEmpty()); // redundant: same check as above
+      }
+    }
+  }
 }