JAL-653 first stab at parsing GFF3 pragmas to support ##fasta
authorJim Procter <jprocter@issues.jalview.org>
Sat, 6 Jun 2015 16:59:39 +0000 (17:59 +0100)
committerJim Procter <jprocter@issues.jalview.org>
Sat, 6 Jun 2015 16:59:39 +0000 (17:59 +0100)
src/jalview/io/FeaturesFile.java

index 9bdc4e3..4085fc1 100755 (executable)
@@ -187,10 +187,18 @@ public class FeaturesFile extends AlignFile
        * when true, assume GFF style features rather than Jalview style.
        */
       boolean GFFFile = true;
+      Map<String, String> gffProps = new HashMap<String, String>();
       while ((line = nextLine()) != null)
       {
+        // skip comments/process pragmas
         if (line.startsWith("#"))
         {
+          if (line.startsWith("##"))
+          {
+            // possibly GFF2/3 version and metadata header
+            processGffPragma(line, gffProps, align, newseqs);
+            line = "";
+          }
           continue;
         }
 
@@ -649,6 +657,124 @@ public class FeaturesFile extends AlignFile
     return true;
   }
 
+  private enum GffPragmas
+  {
+    gff_version, sequence_region, feature_ontology, attribute_ontology, source_ontology, species_build, fasta, hash
+  };
+
+  private static Map<String, GffPragmas> GFFPRAGMA;
+  static
+  {
+    GFFPRAGMA = new HashMap<String, GffPragmas>();
+    GFFPRAGMA.put("sequence-region", GffPragmas.sequence_region);
+    GFFPRAGMA.put("feature-ontology", GffPragmas.feature_ontology);
+    GFFPRAGMA.put("#", GffPragmas.hash);
+    GFFPRAGMA.put("fasta", GffPragmas.fasta);
+    GFFPRAGMA.put("species-build", GffPragmas.species_build);
+    GFFPRAGMA.put("source-ontology", GffPragmas.source_ontology);
+    GFFPRAGMA.put("attribute-ontology", GffPragmas.attribute_ontology);
+  }
+
+  private void processGffPragma(String line, Map<String, String> gffProps,
+          AlignmentI align, ArrayList<SequenceI> newseqs)
+          throws IOException
+  {
+    // line starts with ##
+    int spacepos = line.indexOf(' ');
+    String pragma = spacepos == -1 ? line.substring(2).trim() : line
+            .substring(2, spacepos);
+    GffPragmas gffpragma = GFFPRAGMA.get(pragma.toLowerCase());
+    if (gffpragma == null)
+    {
+      return;
+    }
+    switch (gffpragma)
+    {
+    case gff_version:
+      try
+      {
+        gffversion = Integer.parseInt(line.substring(spacepos + 1));
+      } finally
+      {
+
+      }
+      break;
+    case feature_ontology:
+      // resolve against specific feature ontology
+      break;
+    case attribute_ontology:
+      // resolve against specific attribute ontology
+      break;
+    case source_ontology:
+      // resolve against specific source ontology
+      break;
+    case species_build:
+      // resolve against specific NCBI taxon version
+      break;
+    case hash:
+      // close off any open feature hierarchies
+      break;
+    case fasta:
+      // process the rest of the file as a fasta file and replace any dummy
+      // sequence IDs
+      process_as_fasta(align, newseqs);
+      break;
+    default:
+      // we do nothing ?
+      System.err.println("Ignoring unknown pragma:\n" + line);
+    }
+  }
+
+  private void process_as_fasta(AlignmentI align, List<SequenceI> newseqs)
+          throws IOException
+  {
+    try
+    {
+      mark();
+    } catch (IOException q)
+    {
+    }
+    FastaFile parser = new FastaFile(this);
+    List<SequenceI> includedseqs = parser.getSeqs();
+    SequenceIdMatcher smatcher = new SequenceIdMatcher(newseqs);
+    // iterate over includedseqs, and replacing matching ones with newseqs
+    // sequences. Generic iterator not used here because we modify includedseqs
+    // as we go
+    for (int p = 0, pSize = includedseqs.size(); p < pSize; p++)
+    {
+      // search for any dummy seqs that this sequence can be used to update
+      SequenceI dummyseq = smatcher.findIdMatch(includedseqs.get(p));
+      if (dummyseq != null)
+      {
+        // dummyseq was created so it could be annotated and referred to in
+        // alignments/codon mappings
+
+        SequenceI mseq = includedseqs.get(p);
+        // mseq is the 'template' imported from the FASTA file which we'll use
+        // to coomplete dummyseq
+        if (dummyseq instanceof SequenceDummy)
+        {
+          // probably have the pattern wrong
+          // idea is that a flyweight proxy for a sequence ID can be created for
+          // 1. stable reference creation
+          // 2. addition of annotation
+          // 3. future replacement by a real sequence
+          // current pattern is to create SequenceDummy objects - a convenience
+          // constructor for a Sequence.
+          // problem is that when promoted to a real sequence, all references
+          // need
+          // to be updated somehow.
+          ((SequenceDummy) dummyseq).become(mseq);
+          includedseqs.set(p, dummyseq); // template is no longer needed
+        }
+      }
+    }
+    // finally add sequences to the dataset
+    for (SequenceI seq : includedseqs)
+    {
+      align.addSequence(seq);
+    }
+  }
 
   /**
    * take a sequence feature and examine its attributes to decide how it should