From: Jim Procter
Date: Sat, 6 Jun 2015 16:59:39 +0000 (+0100)
Subject: JAL-653 first stab at parsing GFF3 pragmas to support ##fasta
X-Git-Tag: Release_2_10_0~296^2~151
X-Git-Url: http://source.jalview.org/gitweb/?p=jalview.git;a=commitdiff_plain;h=e0c1dde5b241a8ac5b8c960f1f55f0940da55788

JAL-653 first stab at parsing GFF3 pragmas to support ##fasta
---
Review notes (this text is not part of the commit message; the commit hash
and blob index in the header still identify the original commit, not this
revision):
 * register "gff-version" in GFFPRAGMA; without it the gff_version case
   could never be reached
 * gff_version: skip a pragma that carries no value, keep only the major
   number of a 3.#.# version, and report a malformed value instead of
   letting NumberFormatException escape through an empty try/finally
 * restore the generic type arguments on GFFPRAGMA and includedseqs that
   the code needs in order to compile
 * document the new members, explain the ignored mark() failure, fix the
   "coomplete" typo

diff --git a/src/jalview/io/FeaturesFile.java b/src/jalview/io/FeaturesFile.java
index 9bdc4e3..4085fc1 100755
--- a/src/jalview/io/FeaturesFile.java
+++ b/src/jalview/io/FeaturesFile.java
@@ -187,10 +187,18 @@ public class FeaturesFile extends AlignFile
        * when true, assume GFF style features rather than Jalview style.
        */
       boolean GFFFile = true;
+      Map gffProps = new HashMap();
       while ((line = nextLine()) != null)
       {
+        // skip comments/process pragmas
         if (line.startsWith("#"))
         {
+          if (line.startsWith("##"))
+          {
+            // possibly GFF2/3 version and metadata header
+            processGffPragma(line, gffProps, align, newseqs);
+            line = "";
+          }
           continue;
         }
 
@@ -649,6 +657,167 @@ public class FeaturesFile extends AlignFile
     return true;
   }
 
+  /**
+   * GFF pragma types (## directives) known to this parser.
+   */
+  private enum GffPragmas
+  {
+    gff_version, sequence_region, feature_ontology, attribute_ontology, source_ontology, species_build, fasta, hash
+  };
+
+  /**
+   * Maps the lower-case pragma name that follows ## to its pragma type.
+   */
+  private static Map<String, GffPragmas> GFFPRAGMA;
+  static
+  {
+    GFFPRAGMA = new HashMap<String, GffPragmas>();
+    GFFPRAGMA.put("gff-version", GffPragmas.gff_version);
+    GFFPRAGMA.put("sequence-region", GffPragmas.sequence_region);
+    GFFPRAGMA.put("feature-ontology", GffPragmas.feature_ontology);
+    GFFPRAGMA.put("#", GffPragmas.hash);
+    GFFPRAGMA.put("fasta", GffPragmas.fasta);
+    GFFPRAGMA.put("species-build", GffPragmas.species_build);
+    GFFPRAGMA.put("source-ontology", GffPragmas.source_ontology);
+    GFFPRAGMA.put("attribute-ontology", GffPragmas.attribute_ontology);
+  }
+
+  /**
+   * Acts on a single GFF pragma line. Pragmas that are not registered in
+   * GFFPRAGMA are silently skipped.
+   *
+   * @param line
+   *          the pragma line, starting with ##
+   * @param gffProps
+   *          properties gathered from the GFF header (not used yet)
+   * @param align
+   *          alignment that receives sequences read after a ##fasta pragma
+   * @param newseqs
+   *          sequences searched for dummy entries that ##fasta data completes
+   * @throws IOException
+   *           if reading the data that follows a ##fasta pragma fails
+   */
+  private void processGffPragma(String line, Map gffProps,
+          AlignmentI align, ArrayList newseqs)
+          throws IOException
+  {
+    // line starts with ##
+    int spacepos = line.indexOf(' ');
+    String pragma = spacepos == -1 ? line.substring(2).trim() : line
+            .substring(2, spacepos);
+    GffPragmas gffpragma = GFFPRAGMA.get(pragma.toLowerCase());
+    if (gffpragma == null)
+    {
+      return;
+    }
+    switch (gffpragma)
+    {
+    case gff_version:
+      if (spacepos != -1)
+      {
+        // the version may be written as 3.#.# - only the major number is kept
+        String version = line.substring(spacepos + 1).trim();
+        int dotpos = version.indexOf('.');
+        try
+        {
+          gffversion = Integer.parseInt(dotpos == -1 ? version : version
+                  .substring(0, dotpos));
+        } catch (NumberFormatException e)
+        {
+          System.err.println("Ignoring malformed GFF version pragma:\n"
+                  + line);
+        }
+      }
+      break;
+    case feature_ontology:
+      // resolve against specific feature ontology
+      break;
+    case attribute_ontology:
+      // resolve against specific attribute ontology
+      break;
+    case source_ontology:
+      // resolve against specific source ontology
+      break;
+    case species_build:
+      // resolve against specific NCBI taxon version
+      break;
+    case hash:
+      // close off any open feature hierarchies
+      break;
+    case fasta:
+      // process the rest of the file as a fasta file and replace any dummy
+      // sequence IDs
+      process_as_fasta(align, newseqs);
+      break;
+    default:
+      // we do nothing ?
+      System.err.println("Ignoring unknown pragma:\n" + line);
+    }
+  }
+
+  /**
+   * Parses the rest of the input as FASTA, lets each parsed sequence complete
+   * any matching SequenceDummy found in newseqs, then adds the resulting
+   * sequences to the alignment.
+   *
+   * @param align
+   *          alignment the sequences are added to
+   * @param newseqs
+   *          sequences searched for dummy entries with a matching ID
+   * @throws IOException
+   *           if reading the FASTA data fails
+   */
+  private void process_as_fasta(AlignmentI align, List newseqs)
+          throws IOException
+  {
+    try
+    {
+      mark();
+    } catch (IOException q)
+    {
+      // best effort: keep parsing even if the stream cannot be marked
+    }
+    FastaFile parser = new FastaFile(this);
+    List<SequenceI> includedseqs = parser.getSeqs();
+    SequenceIdMatcher smatcher = new SequenceIdMatcher(newseqs);
+    // iterate over includedseqs, and replacing matching ones with newseqs
+    // sequences. Generic iterator not used here because we modify includedseqs
+    // as we go
+    for (int p = 0, pSize = includedseqs.size(); p < pSize; p++)
+    {
+      // search for any dummy seqs that this sequence can be used to update
+      SequenceI dummyseq = smatcher.findIdMatch(includedseqs.get(p));
+      if (dummyseq != null)
+      {
+        // dummyseq was created so it could be annotated and referred to in
+        // alignments/codon mappings
+
+        SequenceI mseq = includedseqs.get(p);
+        // mseq is the 'template' imported from the FASTA file which we'll use
+        // to complete dummyseq
+        if (dummyseq instanceof SequenceDummy)
+        {
+          // probably have the pattern wrong
+          // idea is that a flyweight proxy for a sequence ID can be created for
+          // 1. stable reference creation
+          // 2. addition of annotation
+          // 3. future replacement by a real sequence
+          // current pattern is to create SequenceDummy objects - a convenience
+          // constructor for a Sequence.
+          // problem is that when promoted to a real sequence, all references
+          // need
+          // to be updated somehow.
+          ((SequenceDummy) dummyseq).become(mseq);
+          includedseqs.set(p, dummyseq); // template is no longer needed
+        }
+      }
+    }
+    // finally add sequences to the dataset
+    for (SequenceI seq : includedseqs)
+    {
+      align.addSequence(seq);
+    }
+  }
   /**
    * take a sequence feature and examine its attributes to decide how it should