- }
-
- return true;
- }
-
- private enum GffPragmas
- {
- gff_version, sequence_region, feature_ontology, attribute_ontology, source_ontology, species_build, fasta, hash
- };
-
- private static Map<String, GffPragmas> GFFPRAGMA;
- static
- {
- GFFPRAGMA = new HashMap<String, GffPragmas>();
- GFFPRAGMA.put("sequence-region", GffPragmas.sequence_region);
- GFFPRAGMA.put("feature-ontology", GffPragmas.feature_ontology);
- GFFPRAGMA.put("#", GffPragmas.hash);
- GFFPRAGMA.put("fasta", GffPragmas.fasta);
- GFFPRAGMA.put("species-build", GffPragmas.species_build);
- GFFPRAGMA.put("source-ontology", GffPragmas.source_ontology);
- GFFPRAGMA.put("attribute-ontology", GffPragmas.attribute_ontology);
- }
-
- private void processGffPragma(String line, Map<String, String> gffProps,
- AlignmentI align, ArrayList<SequenceI> newseqs)
- throws IOException
- {
- // line starts with ##
- int spacepos = line.indexOf(' ');
- String pragma = spacepos == -1 ? line.substring(2).trim() : line
- .substring(2, spacepos);
- GffPragmas gffpragma = GFFPRAGMA.get(pragma.toLowerCase());
- if (gffpragma == null)
- {
- return;
- }
- switch (gffpragma)
- {
- case gff_version:
- try
- {
- gffversion = Integer.parseInt(line.substring(spacepos + 1));
- } finally
- {
-
- }
- break;
- case feature_ontology:
- // resolve against specific feature ontology
- break;
- case attribute_ontology:
- // resolve against specific attribute ontology
- break;
- case source_ontology:
- // resolve against specific source ontology
- break;
- case species_build:
- // resolve against specific NCBI taxon version
- break;
- case hash:
- // close off any open feature hierarchies
- break;
- case fasta:
- // process the rest of the file as a fasta file and replace any dummy
- // sequence IDs
- process_as_fasta(align, newseqs);
- break;
- default:
- // we do nothing ?
- System.err.println("Ignoring unknown pragma:\n" + line);
- }
- }
-
- private void process_as_fasta(AlignmentI align, List<SequenceI> newseqs)
- throws IOException
- {
- try
- {
- mark();
- } catch (IOException q)
- {
- }
- FastaFile parser = new FastaFile(this);
- List<SequenceI> includedseqs = parser.getSeqs();
- SequenceIdMatcher smatcher = new SequenceIdMatcher(newseqs);
- // iterate over includedseqs, and replacing matching ones with newseqs
- // sequences. Generic iterator not used here because we modify includedseqs
- // as we go
- for (int p = 0, pSize = includedseqs.size(); p < pSize; p++)
- {
- // search for any dummy seqs that this sequence can be used to update
- SequenceI dummyseq = smatcher.findIdMatch(includedseqs.get(p));
- if (dummyseq != null)
- {
- // dummyseq was created so it could be annotated and referred to in
- // alignments/codon mappings
-
- SequenceI mseq = includedseqs.get(p);
- // mseq is the 'template' imported from the FASTA file which we'll use
- // to coomplete dummyseq
- if (dummyseq instanceof SequenceDummy)
- {
- // probably have the pattern wrong
- // idea is that a flyweight proxy for a sequence ID can be created for
- // 1. stable reference creation
- // 2. addition of annotation
- // 3. future replacement by a real sequence
- // current pattern is to create SequenceDummy objects - a convenience
- // constructor for a Sequence.
- // problem is that when promoted to a real sequence, all references
- // need
- // to be updated somehow.
- ((SequenceDummy) dummyseq).become(mseq);
- includedseqs.set(p, dummyseq); // template is no longer needed
- }
- }
- }
- // finally add sequences to the dataset
- for (SequenceI seq : includedseqs)
- {
- align.addSequence(seq);
- }
- }
-
- /**
- * take a sequence feature and examine its attributes to decide how it should
- * be added to a sequence
- *
- * @param seq
- * - the destination sequence constructed or discovered in the
- * current context
- * @param sf
- * - the base feature with ATTRIBUTES property containing any
- * additional attributes
- * @param gFFFile
- * - true if we are processing a GFF annotation file
- * @return true if sf was actually added to the sequence, false if it was
- * processed in another way
- */
- public boolean processOrAddSeqFeature(AlignmentI align, List<SequenceI> newseqs, SequenceI seq, SequenceFeature sf,
- boolean gFFFile, boolean relaxedIdMatching)
- {
- String attr = (String) sf.getValue("ATTRIBUTES");
- boolean add = true;
- if (gFFFile && attr != null)
- {
- int nattr=8;
-
- for (String attset : attr.split("\t"))
- {
- if (attset==null || attset.trim().length()==0)
- {
- continue;
- }
- nattr++;
- Map<String, List<String>> set = new HashMap<String, List<String>>();
- // normally, only expect one column - 9 - in this field
- // the attributes (Gff3) or groups (gff2) field
- for (String pair : attset.trim().split(";"))
- {
- pair = pair.trim();
- if (pair.length() == 0)
- {
- continue;
- }
-
- // expect either space seperated (gff2) or '=' separated (gff3)
- // key/value pairs here
-
- int eqpos = pair.indexOf('='),sppos = pair.indexOf(' ');
- String key = null, value = null;
-
- if (sppos > -1 && (eqpos == -1 || sppos < eqpos))
- {
- key = pair.substring(0, sppos);
- value = pair.substring(sppos + 1);
- } else {
- if (eqpos > -1 && (sppos == -1 || eqpos < sppos))
- {
- key = pair.substring(0, eqpos);
- value = pair.substring(eqpos + 1);
- } else
- {
- key = pair;
- }
- }
- if (key != null)
- {
- List<String> vals = set.get(key);
- if (vals == null)
- {
- vals = new ArrayList<String>();
- set.put(key, vals);
- }
- if (value != null)
- {
- vals.add(value.trim());
- }
- }
- }
- try
- {
- add &= processGffKey(set, nattr, seq, sf, align, newseqs,
- relaxedIdMatching); // process decides if
- // feature is actually
- // added
- } catch (InvalidGFF3FieldException ivfe)
- {
- System.err.println(ivfe);
- }
- }
- }
- if (add)
- {
- seq.addSequenceFeature(sf);
- }
- return add;
- }
-
- public class InvalidGFF3FieldException extends Exception
- {
- String field, value;
-
- public InvalidGFF3FieldException(String field,
- Map<String, List<String>> set, String message)
- {
- super(message + " (Field was " + field + " and value was "
- + set.get(field).toString());
- this.field = field;
- this.value = set.get(field).toString();
- }
-
- }
-
- /**
- * take a set of keys for a feature and interpret them
- *
- * @param set
- * @param nattr
- * @param seq
- * @param sf
- * @return
- */
- public boolean processGffKey(Map<String, List<String>> set, int nattr,
- SequenceI seq, SequenceFeature sf, AlignmentI align,
- List<SequenceI> newseqs, boolean relaxedIdMatching)
- throws InvalidGFF3FieldException
- {
- String attr;
- // decide how to interpret according to type
- if (sf.getType().equals("similarity"))
- {
- int strand = sf.getStrand();
- // exonerate cdna/protein map
- // look for fields
- List<SequenceI> querySeq = findNames(align, newseqs,
- relaxedIdMatching, set.get(attr="Query"));
- if (querySeq==null || querySeq.size()!=1)
- {
- throw new InvalidGFF3FieldException( attr, set,
- "Expecting exactly one sequence in Query field (got "
- + set.get(attr) + ")");
- }
- if (set.containsKey(attr="Align"))
- {
- // process the align maps and create cdna/protein maps
- // ideally, the query sequences are in the alignment, but maybe not...
-
- AlignedCodonFrame alco = new AlignedCodonFrame();
- MapList codonmapping = constructCodonMappingFromAlign(set, attr,
- strand);
-
- // add codon mapping, and hope!
- alco.addMap(seq, querySeq.get(0), codonmapping);
- align.addCodonFrame(alco);
- // everything that's needed to be done is done
- // no features to create here !
- return false;
- }
-
- }
- return true;
- }
-
- private MapList constructCodonMappingFromAlign(
- Map<String, List<String>> set,
- String attr, int strand) throws InvalidGFF3FieldException
- {
- if (strand == 0)
- {
- throw new InvalidGFF3FieldException(attr, set,
- "Invalid strand for a codon mapping (cannot be 0)");
- }
- List<Integer> fromrange = new ArrayList<Integer>(), torange = new ArrayList<Integer>();
- int lastppos = 0, lastpframe = 0;
- for (String range : set.get(attr))
- {
- List<Integer> ints = new ArrayList<Integer>();
- StringTokenizer st = new StringTokenizer(range, " ");
- while (st.hasMoreTokens())
- {
- String num = st.nextToken();
- try
- {
- ints.add(new Integer(num));
- } catch (NumberFormatException nfe)
- {
- throw new InvalidGFF3FieldException(attr, set,
- "Invalid number in field " + num);
- }
- }
- // Align positionInRef positionInQuery LengthInRef
- // contig_1146 exonerate:protein2genome:local similarity 8534 11269
- // 3652 - . alignment_id 0 ;
- // Query DDB_G0269124
- // Align 11270 143 120
- // corresponds to : 120 bases align at pos 143 in protein to 11270 on
- // dna in strand direction
- // Align 11150 187 282
- // corresponds to : 282 bases align at pos 187 in protein to 11150 on
- // dna in strand direction
- //
- // Align 10865 281 888
- // Align 9977 578 1068
- // Align 8909 935 375
- //
- if (ints.size() != 3)
- {
- throw new InvalidGFF3FieldException(attr, set,
- "Invalid number of fields for this attribute ("
- + ints.size() + ")");
- }
- fromrange.add(new Integer(ints.get(0).intValue()));
- fromrange.add(new Integer(ints.get(0).intValue() + strand
- * ints.get(2).intValue()));
- // how are intron/exon boundaries that do not align in codons
- // represented
- if (ints.get(1).equals(lastppos) && lastpframe > 0)
- {
- // extend existing to map
- lastppos += ints.get(2) / 3;
- lastpframe = ints.get(2) % 3;
- torange.set(torange.size() - 1, new Integer(lastppos));
- }
- else