package jalview.datamodel.xdb.embl;

import jalview.datamodel.DBRefEntry;
import jalview.datamodel.Sequence;
import jalview.datamodel.SequenceFeature;
import jalview.datamodel.SequenceI;

import java.util.Enumeration;
import java.util.Hashtable;
import java.util.Iterator;
import java.util.Vector;

public class EmblEntry {
  String accession;
  String version;
  String taxDivision;
  String desc;
  String rCreated;
  String rLastUpdated;
  String lastUpdated;
  Vector keywords;
  Vector refs;
  Vector dbRefs;
  Vector features;
  EmblSequence sequence;
  /**
   * @return the accession
   */
  public String getAccession() {
    return accession;
  }
  /**
   * @param accession the accession to set
   */
  public void setAccession(String accession) {
    this.accession = accession;
  }
  /**
   * @return the dbRefs
   */
  public Vector getDbRefs() {
    return dbRefs;
  }
  /**
   * @param dbRefs the dbRefs to set
   */
  public void setDbRefs(Vector dbRefs) {
    this.dbRefs = dbRefs;
  }
  /**
   * @return the desc
   */
  public String getDesc() {
    return desc;
  }
  /**
   * @param desc the desc to set
   */
  public void setDesc(String desc) {
    this.desc = desc;
  }
  /**
   * @return the features
   */
  public Vector getFeatures() {
    return features;
  }
  /**
   * @param features the features to set
   */
  public void setFeatures(Vector features) {
    this.features = features;
  }
  /**
   * @return the keywords
   */
  public Vector getKeywords() {
    return keywords;
  }
  /**
   * @param keywords the keywords to set
   */
  public void setKeywords(Vector keywords) {
    this.keywords = keywords;
  }
  /**
   * @return the lastUpdated
   */
  public String getLastUpdated() {
    return lastUpdated;
  }
  /**
   * @param lastUpdated the lastUpdated to set
   */
  public void setLastUpdated(String lastUpdated) {
    this.lastUpdated = lastUpdated;
  }
  /**
   * @return the refs
   */
  public Vector getRefs() {
    return refs;
  }
  /**
   * @param refs the refs to set
   */
  public void setRefs(Vector refs) {
    this.refs = refs;
  }
  /**
   * @return the releaseCreated
   */
  public String getRCreated() {
    return rCreated;
  }
  /**
   * @param releaseCreated the releaseCreated to set
   */
  public void setRcreated(String releaseCreated) {
    this.rCreated = releaseCreated;
  }
  /**
   * @return the releaseLastUpdated
   */
  public String getRLastUpdated() {
    return rLastUpdated;
  }
  /**
   * @param releaseLastUpdated the releaseLastUpdated to set
   */
  public void setRLastUpdated(String releaseLastUpdated) {
    this.rLastUpdated = releaseLastUpdated;
  }
  /**
   * @return the sequence
   */
  public EmblSequence getSequence() {
    return sequence;
  }
  /**
   * @param sequence the sequence to set
   */
  public void setSequence(EmblSequence sequence) {
    this.sequence = sequence;
  }
  /**
   * @return the taxDivision
   */
  public String getTaxDivision() {
    return taxDivision;
  }
  /**
   * @param taxDivision the taxDivision to set
   */
  public void setTaxDivision(String taxDivision) {
    this.taxDivision = taxDivision;
  }
  /**
   * @return the version
   */
  public String getVersion() {
    return version;
  }
  /**
   * @param version the version to set
   */
  public void setVersion(String version) {
    this.version = version;
  }
/*
 * EMBL Feature support is limited. The text below is included for the benefit of
 * any developer working on improving EMBL feature import in Jalview.
 * Extract from EMBL feature specification
 * see http://www.embl-ebi.ac.uk/embl/Documentation/FT_definitions/feature_table.html
3.5 Location
3.5.1 Purpose

The location indicates the region of the presented sequence which corresponds 
to a feature. 

3.5.2 Format and conventions
The location contains at least one sequence location descriptor and may 
contain one or more operators with one or more sequence location descriptors. 
Base numbers refer to the numbering in the entry. This numbering designates 
the first base (5' end) of the presented sequence as base 1. 
Base locations beyond the range of the presented sequence may not be used in 
location descriptors, the only exception being location in a remote entry (see 
3.5.2.1, e).  

Location operators and descriptors are discussed in more detail below.  

3.5.2.1 Location descriptors

The location descriptor can be one of the following: 
(a) a single base number
(b) a site between two indicated adjoining bases
(c) a single base chosen from within a specified range of bases (not allowed for new
    entries)
(d) the base numbers delimiting a sequence span
(e) a remote entry identifier followed by a local location descriptor
    (i.e., a-d)

A site between two adjoining nucleotides, such as endonucleolytic cleavage 
site, is indicated by listing the two points separated by a carat (^). The 
permitted formats for this descriptor are n^n+1 (for example 55^56), or, for 
circular molecules, n^1, where "n" is the full length of the molecule, ie 
1000^1 for circular molecule with length 1000.

A single base chosen from a range of bases is indicated by the first base
number and the last base number of the range separated by a single period
(e.g., '12.21' indicates a single base taken from between the indicated
points). From October 2006 the usage of this descriptor is restricted :
it is illegal to use "a single base from a range" (c) either on its own or
in combination with the "sequence span" (d) descriptor for newly created entries.
The existing entries where such descriptors exist are going to be retrofitted.

Sequence spans are indicated by the starting base number and the ending base 
number separated by two periods (e.g., '34..456'). The '<' and '>' symbols may 
be used with the starting and ending base numbers to indicate that an end 
point is beyond the specified base number. The starting and ending base 
positions can be represented as distinct base numbers ('34..456') or a site 
between two indicated adjoining bases. 

A location in a remote entry (not the entry to which the feature table 
belongs) can be specified by giving  the accession-number and sequence version 
of the remote entry, followed by a colon ":", followed by a location 
descriptor which applies to that entry's sequence (i.e. J12345.1:1..15, see 
also examples below) 

3.5.2.2 Operators

The location operator is a prefix that specifies what must be done to the 
indicated sequence to find or construct the location corresponding to the 
feature. A list of operators is given below with their definitions and most 
common format. 

complement(location) 
Find the complement of the presented sequence in the span specified by "
location" (i.e., read the complement of the presented strand in its 5'-to-3' 
direction) 

join(location,location, ... location) 
The indicated elements should be joined (placed end-to-end) to form one 
contiguous sequence 

order(location,location, ... location) 
The elements can be found in the 
specified order (5' to 3' direction), but nothing is implied about the 
reasonableness about joining them 

Note : location operator "complement" can be used in combination with either "
join" or "order" within the same location; combinations of "join" and "order" 
within the same location (nested operators) are illegal.


3.5.3 Location examples 

The following is a list of common location descriptors with their meanings: 

Location                  Description   

467                       Points to a single base in the presented sequence 

340..565                  Points to a continuous range of bases bounded by and
                          including the starting and ending bases

<345..500                 Indicates that the exact lower boundary point of a feature
                          is unknown.  The location begins at some  base previous to
                          the first base specified (which need not be contained in 
                          the presented sequence) and continues to and includes the 
                          ending base 

<1..888                   The feature starts before the first sequenced base and 
                          continues to and includes base 888

1..>888                   The feature starts at the first sequenced base and 
                          continues beyond base 888

102.110                   Indicates that the exact location is unknown but that it is 
                          one of the bases between bases 102 and 110, inclusive

123^124                   Points to a site between bases 123 and 124

join(12..78,134..202)     Regions 12 to 78 and 134 to 202 should be joined to form 
                          one contiguous sequence


complement(34..126)       Start at the base complementary to 126 and finish at the 
                          base complementary to base 34 (the feature is on the strand 
                          complementary to the presented strand)


complement(join(2691..4571,4918..5163))
                          Joins regions 2691 to 4571 and 4918 to 5163, then 
                          complements the joined segments (the feature is on the 
                          strand complementary to the presented strand) 

join(complement(4918..5163),complement(2691..4571))
                       Complements regions 4918 to 5163 and 2691 to 4571, then 
                          joins the complemented segments (the feature is on the 
                          strand complementary to the presented strand)
  
J00194.1:100..202         Points to bases 100 to 202, inclusive, in the entry (in 
                          this database) with primary accession number 'J00194'
 
join(1..100,J00194.1:100..202)
                          Joins region 1..100 of the existing entry with the region
                          100..202 of remote entry J00194

 */
  /**
   * Recover annotated sequences from EMBL file
   * @param noNa don't return nucleic acid sequences 
   * @param sourceDb TODO
   * @param noProtein don't return any translated protein sequences marked in features
   * @return dataset sequences with DBRefs and features - DNA always comes first
   */
  public jalview.datamodel.SequenceI[] getSequences(boolean noNa, boolean noPeptide, String sourceDb) {
    Vector seqs=new Vector();
    Sequence dna=null;
    if (!noNa) {
      dna = new Sequence(sourceDb+"|"+accession, sequence.getSequence());
      dna.setDescription(desc);
      dna.addDBRef(new DBRefEntry(sourceDb, version, accession));
      // TODO: add mapping for parentAccession attribute
      // TODO: transform EMBL Database refs to canonical form
      if (dbRefs!=null)
        for (Iterator i=dbRefs.iterator(); i.hasNext(); dna.addDBRef((DBRefEntry)i.next()));
    }
    for (Iterator i=features.iterator(); i.hasNext(); ) {
      EmblFeature feature = (EmblFeature) i.next();
      if (!noNa) {
        if (feature.dbRefs!=null && feature.dbRefs.size()>0) {
          for (Iterator dbr=feature.dbRefs.iterator(); dbr.hasNext(); dna.addDBRef((DBRefEntry)dbr.next()) )
            ;
        }
      }
      if (feature.getName().equalsIgnoreCase("CDS")) {
        // extract coding region(s)
        jalview.datamodel.Mapping map = null;
        int[] exon=null;
        if (feature.locations!=null && feature.locations.size()>0) {
          for (Iterator locs=feature.locations.iterator();
          locs.hasNext(); ) {
            EmblFeatureLocations loc = (EmblFeatureLocations) locs.next();
            int[] se = loc.getElementRanges();
            if (exon==null) {
              exon=se;
            } else {
              int[] t=new int[exon.length+se.length];
              System.arraycopy(exon, 0, t, 0, exon.length);
              System.arraycopy(se, 0, t, exon.length,se.length);
              exon=t;
            }
          }
        }
        String prseq=null;
        String prname=new String();
        String prid=null;
        Hashtable vals=new Hashtable();
        int prstart=1;
        // get qualifiers
        if (feature.getQualifiers()!=null && feature.getQualifiers().size()>0) {
          for (Iterator quals=feature.getQualifiers().iterator(); quals.hasNext(); ) {
            Qualifier q = (Qualifier) quals.next();
            if (q.getName().equals("translation")) 
            {
              prseq=q.getValue();
            } 
            else
              if (q.getName().equals("protein_id")) 
              {
                prid=q.getValue();
              }
              else
                if (q.getName().equals("codon_start"))
                {
                  prstart = Integer.parseInt(q.getValue());
                }
                else 
                if (q.getName().equals("product")){
                  prname = q.getValue();
                } else {
                  // throw anything else into the additional properties hash
                  vals.put(q.getName(), q.getValue());
                }
          }
        }
        Sequence product=null;
        if (prseq!=null && prname!=null && prid!=null) {
          // extract proteins.
          if (!noPeptide) {
            product = new Sequence(sourceDb+"|"+"EMBLCDS|"+prid+"|"+prname, prseq, prstart, prstart+prseq.length()-1);
            product.setDescription("Protein Product from "+sourceDb);
            seqs.add(product);
          }
          // we have everything - create the mapping and perhaps the protein sequence
          map = new jalview.datamodel.Mapping(product, exon, new int[] { prstart, prstart+prseq.length()-1}, 3, 1);
          // add cds feature to dna seq - this may include the stop codon
          for (int xint=0;xint<exon.length; xint+=2) {
            SequenceFeature sf = new SequenceFeature();
            sf.setBegin(exon[xint]);
            sf.setEnd(exon[xint+1]);
            sf.setType(feature.getName());
            sf.setFeatureGroup(jalview.datamodel.DBRefSource.EMBL);
            sf.setDescription("Exon "+(1+xint)+" for protein '"+prname+"' EMBLCDS:"+prid);
            if (vals!=null && vals.size()>0) {
              Enumeration kv = vals.elements();
              while (kv.hasMoreElements()) {
                Object key=kv.nextElement();
                if (key!=null)
                  sf.setValue(key.toString(), vals.get(key));
              }
            }
            dna.addSequenceFeature(sf);
          }
        }
        // add dbRefs to sequence
        if (feature.dbRefs!=null && feature.dbRefs.size()>0) 
        {
          for (Iterator dbr=feature.dbRefs.iterator(); dbr.hasNext();  ) 
          {
            DBRefEntry ref = (DBRefEntry)dbr.next();
            ref.setSource(jalview.util.DBRefUtils.getCanonicalName(ref.getSource()));
            if (ref.getSource().equals(jalview.datamodel.DBRefSource.UNIPROT)) 
            {
              ref.setMap(map);
            }
            if (product!=null) {
              DBRefEntry pref = new DBRefEntry(ref.getSource(), ref.getVersion(), ref.getAccessionId());
              pref.setMap(null); // reference is direct
            }
            dna.addDBRef(ref);
          }
        }
        
      } else {
        // General feature type.
        if (!noNa) {
          if (feature.dbRefs!=null && feature.dbRefs.size()>0) {
            for (Iterator dbr=feature.dbRefs.iterator(); dbr.hasNext(); dna.addDBRef((DBRefEntry)dbr.next()) )
              ;
          }
        }
      }

    }
    if (!noNa) {
      seqs.add(dna);
    }
    SequenceI[] sqs = new SequenceI[seqs.size()];
    for (int i=0,j=seqs.size();i<j; i++) {
      sqs[i] = (SequenceI) seqs.elementAt(i);
      seqs.set(i, null);
    }
    return sqs;
  }
}