Merge branch 'develop' into features/JAL-2446NCList

[jalview.git] / src / jalview / datamodel / xdb / embl / EmblEntry.java
diff --git a/src/jalview/datamodel/xdb/embl/EmblEntry.java b/src/jalview/datamodel/xdb/embl/EmblEntry.java

index 4d09bdc..8cc81d9 100644 (file)
--- a/src/jalview/datamodel/xdb/embl/EmblEntry.java
+++ b/src/jalview/datamodel/xdb/embl/EmblEntry.java
@@ -191,13 +191,15 @@ public class EmblEntry
        return null;
      }
      dna.setDescription(description);
-    DBRefEntry retrievedref = new DBRefEntry(sourceDb,
-            getSequenceVersion(), accession);
+    DBRefEntry retrievedref = new DBRefEntry(sourceDb, getSequenceVersion(),
+            accession);
      dna.addDBRef(retrievedref);
      // add map to indicate the sequence is a valid coordinate frame for the
      // dbref
-    retrievedref.setMap(new Mapping(null, new int[] { 1, dna.getLength() },
-            new int[] { 1, dna.getLength() }, 1, 1));
+    retrievedref
+            .setMap(new Mapping(null, new int[]
+            { 1, dna.getLength() }, new int[] { 1, dna.getLength() }, 1,
+                    1));
  
      /*
       * transform EMBL Database refs to canonical form
@@ -242,8 +244,8 @@ public class EmblEntry
    {
      if (sequence == null)
      {
-      System.err.println("No sequence was returned for ENA accession "
-              + accession);
+      System.err.println(
+              "No sequence was returned for ENA accession " + accession);
        return null;
      }
      SequenceI dna = new Sequence(sourceDb + "|" + accession,
@@ -267,7 +269,8 @@ public class EmblEntry
     *          helper to match xrefs in already retrieved sequences
     */
    void parseCodingFeature(EmblFeature feature, String sourceDb,
-          SequenceI dna, List<SequenceI> peptides, SequenceIdMatcher matcher)
+          SequenceI dna, List<SequenceI> peptides,
+          SequenceIdMatcher matcher)
    {
      boolean isEmblCdna = sourceDb.equals(DBRefSource.EMBLCDS);
  
@@ -296,8 +299,8 @@ public class EmblEntry
          if (qname.equals("translation"))
          {
            // remove all spaces (precompiled String.replaceAll(" ", ""))
-          translation = SPACE_PATTERN.matcher(q.getValues()[0]).replaceAll(
-                  "");
+          translation = SPACE_PATTERN.matcher(q.getValues()[0])
+                  .replaceAll("");
          }
          else if (qname.equals("protein_id"))
          {
@@ -310,8 +313,8 @@ public class EmblEntry
              codonStart = Integer.parseInt(q.getValues()[0].trim());
            } catch (NumberFormatException e)
            {
-            System.err.println("Invalid codon_start in XML for "
-                    + accession + ": " + e.getMessage());
+            System.err.println("Invalid codon_start in XML for " + accession
+                    + ": " + e.getMessage());
            }
          }
          else if (qname.equals("product"))
@@ -348,9 +351,10 @@ public class EmblEntry
        product = matcher.findIdMatch(proteinId);
        if (product == null)
        {
-        product = new Sequence(proteinId, translation, 1, translationLength);
-        product.setDescription(((proteinName.length() == 0) ? "Protein Product from "
-                + sourceDb
+        product = new Sequence(proteinId, translation, 1,
+                translationLength);
+        product.setDescription(((proteinName.length() == 0)
+                ? "Protein Product from " + sourceDb
                  : proteinName));
          peptides.add(product);
          matcher.add(product);
@@ -364,28 +368,30 @@ public class EmblEntry
           * workaround until we handle dna location for CDS sequence
           * e.g. location="X53828.1:60..1058" correctly
           */
-        System.err
-                .println("Implementation Notice: EMBLCDS records not properly supported yet - Making up the CDNA region of this sequence... may be incorrect ("
+        System.err.println(
+                "Implementation Notice: EMBLCDS records not properly supported yet - Making up the CDNA region of this sequence... may be incorrect ("
                          + sourceDb + ":" + getAccession() + ")");
-        if (translationLength * 3 == (1 - codonStart + dna.getSequence().length))
+        int dnaLength = dna.getLength();
+        if (translationLength * 3 == (1 - codonStart + dnaLength))
          {
-          System.err
-                  .println("Not allowing for additional stop codon at end of cDNA fragment... !");
+          System.err.println(
+                  "Not allowing for additional stop codon at end of cDNA fragment... !");
            // this might occur for CDS sequences where no features are marked
            exons = new int[] { dna.getStart() + (codonStart - 1),
                dna.getEnd() };
-          dnaToProteinMapping = new Mapping(product, exons, new int[] { 1,
-              translationLength }, 3, 1);
+          dnaToProteinMapping = new Mapping(product, exons,
+                  new int[]
+                  { 1, translationLength }, 3, 1);
          }
-        if ((translationLength + 1) * 3 == (1 - codonStart + dna
-                .getSequence().length))
+        if ((translationLength + 1) * 3 == (1 - codonStart + dnaLength))
          {
-          System.err
-                  .println("Allowing for additional stop codon at end of cDNA fragment... will probably cause an error in VAMSAs!");
+          System.err.println(
+                  "Allowing for additional stop codon at end of cDNA fragment... will probably cause an error in VAMSAs!");
            exons = new int[] { dna.getStart() + (codonStart - 1),
                dna.getEnd() - 3 };
-          dnaToProteinMapping = new Mapping(product, exons, new int[] { 1,
-              translationLength }, 3, 1);
+          dnaToProteinMapping = new Mapping(product, exons,
+                  new int[]
+                  { 1, translationLength }, 3, 1);
          }
        }
        else
@@ -404,26 +410,32 @@ public class EmblEntry
          else
          {
            // final product length truncation check
-          int[] cdsRanges = adjustForProteinLength(translationLength, exons);
-          dnaToProteinMapping = new Mapping(product, cdsRanges, new int[] {
-              1, translationLength }, 3, 1);
+          int[] cdsRanges = adjustForProteinLength(translationLength,
+                  exons);
+          dnaToProteinMapping = new Mapping(product, cdsRanges,
+                  new int[]
+                  { 1, translationLength }, 3, 1);
            if (product != null)
            {
              /*
               * make xref with mapping from protein to EMBL dna
               */
              DBRefEntry proteinToEmblRef = new DBRefEntry(DBRefSource.EMBL,
-                    getSequenceVersion(), proteinId, new Mapping(
-                            dnaToProteinMapping.getMap().getInverse()));
+                    getSequenceVersion(), proteinId,
+                    new Mapping(dnaToProteinMapping.getMap().getInverse()));
              product.addDBRef(proteinToEmblRef);
  
              /*
               * make xref from protein to EMBLCDS; we assume here that the 
               * CDS sequence version is same as dna sequence (?!)
               */
-            MapList proteinToCdsMapList = new MapList(new int[] { 1,
-                translationLength }, new int[] { 1 + (codonStart - 1),
-                (codonStart - 1) + 3 * translationLength }, 1, 3);
+            MapList proteinToCdsMapList = new MapList(
+                    new int[]
+                    { 1, translationLength },
+                    new int[]
+                    { 1 + (codonStart - 1),
+                        (codonStart - 1) + 3 * translationLength },
+                    1, 3);
              DBRefEntry proteinToEmblCdsRef = new DBRefEntry(
                      DBRefSource.EMBLCDS, getSequenceVersion(), proteinId,
                      new Mapping(proteinToCdsMapList));
@@ -443,13 +455,27 @@ public class EmblEntry
        /*
         * add cds features to dna sequence
         */
-      for (int xint = 0; exons != null && xint < exons.length; xint += 2)
+      String cds = feature.getName(); // "CDS"
+      for (int xint = 0; exons != null && xint < exons.length - 1; xint += 2)
        {
-        SequenceFeature sf = makeCdsFeature(exons, xint, proteinName,
-                proteinId, vals, codonStart);
-        sf.setType(feature.getName()); // "CDS"
+        int exonStart = exons[xint];
+        int exonEnd = exons[xint + 1];
+        int begin = Math.min(exonStart, exonEnd);
+        int end = Math.max(exonStart, exonEnd);
+        int exonNumber = xint / 2 + 1;
+        String desc = String.format("Exon %d for protein '%s' EMBLCDS:%s",
+                exonNumber, proteinName, proteinId);
+
+        SequenceFeature sf = makeCdsFeature(cds, desc, begin, end,
+                sourceDb, vals);
+
          sf.setEnaLocation(feature.getLocation());
-        sf.setFeatureGroup(sourceDb);
+        boolean forwardStrand = exonStart <= exonEnd;
+        sf.setStrand(forwardStrand ? "+" : "-");
+        sf.setPhase(String.valueOf(codonStart - 1));
+        sf.setValue(FeatureProperties.EXONPOS, exonNumber);
+        sf.setValue(FeatureProperties.EXONPRODUCT, proteinName);
+
          dna.addSequenceFeature(sf);
        }
      }
@@ -518,8 +544,8 @@ public class EmblEntry
            // Add converse mapping reference
            if (dnaToProteinMapping != null)
            {
-            Mapping pmap = new Mapping(dna, dnaToProteinMapping.getMap()
-                    .getInverse());
+            Mapping pmap = new Mapping(dna,
+                    dnaToProteinMapping.getMap().getInverse());
              pref = new DBRefEntry(sourceDb, getSequenceVersion(),
                      this.getAccession());
              pref.setMap(pmap);
@@ -543,8 +569,8 @@ public class EmblEntry
        if (proteinToEmblProteinRef == null)
        {
          // assuming CDSPROTEIN sequence version = dna version (?!)
-        proteinToEmblProteinRef = new DBRefEntry(
-                DBRefSource.EMBLCDSProduct, getSequenceVersion(), proteinId);
+        proteinToEmblProteinRef = new DBRefEntry(DBRefSource.EMBLCDSProduct,
+                getSequenceVersion(), proteinId);
        }
        product.addDBRef(proteinToEmblProteinRef);
  
@@ -552,7 +578,8 @@ public class EmblEntry
                && dnaToProteinMapping.getTo() != null)
        {
          DBRefEntry dnaToEmblProteinRef = new DBRefEntry(
-                DBRefSource.EMBLCDSProduct, getSequenceVersion(), proteinId);
+                DBRefSource.EMBLCDSProduct, getSequenceVersion(),
+                proteinId);
          dnaToEmblProteinRef.setMap(dnaToProteinMapping);
          dnaToProteinMapping.setMappedFromId(proteinId);
          dna.addDBRef(dnaToEmblProteinRef);
@@ -563,33 +590,24 @@ public class EmblEntry
    /**
     * Helper method to construct a SequenceFeature for one cds range
     * 
-   * @param exons
-   *          array of cds [start, end, ...] positions
-   * @param exonStartIndex
-   *          offset into the exons array
-   * @param proteinName
-   * @param proteinAccessionId
+   * @param type
+   *          feature type ("CDS")
+   * @param desc
+   *          description
+   * @param begin
+   *          start position
+   * @param end
+   *          end position
+   * @param group
+   *          feature group
     * @param vals
     *          map of 'miscellaneous values' for feature
-   * @param codonStart
-   *          codon start position for CDS (1/2/3, normally 1)
     * @return
     */
-  protected SequenceFeature makeCdsFeature(int[] exons, int exonStartIndex,
-          String proteinName, String proteinAccessionId,
-          Map<String, String> vals, int codonStart)
-  {
-    int exonNumber = exonStartIndex / 2 + 1;
-    SequenceFeature sf = new SequenceFeature();
-    sf.setBegin(Math.min(exons[exonStartIndex], exons[exonStartIndex + 1]));
-    sf.setEnd(Math.max(exons[exonStartIndex], exons[exonStartIndex + 1]));
-    sf.setDescription(String.format("Exon %d for protein '%s' EMBLCDS:%s",
-            exonNumber, proteinName, proteinAccessionId));
-    sf.setPhase(String.valueOf(codonStart - 1));
-    sf.setStrand(exons[exonStartIndex] <= exons[exonStartIndex + 1] ? "+"
-            : "-");
-    sf.setValue(FeatureProperties.EXONPOS, exonNumber);
-    sf.setValue(FeatureProperties.EXONPRODUCT, proteinName);
+  protected SequenceFeature makeCdsFeature(String type, String desc,
+          int begin, int end, String group, Map<String, String> vals)
+  {
+    SequenceFeature sf = new SequenceFeature(type, desc, begin, end, group);
      if (!vals.isEmpty())
      {
        StringBuilder sb = new StringBuilder();
@@ -629,9 +647,9 @@ public class EmblEntry
        return listToArray(ranges);
      } catch (ParseException e)
      {
-      Cache.log.warn(String.format(
-              "Not parsing inexact CDS location %s in ENA %s",
-              feature.location, this.accession));
+      Cache.log.warn(
+              String.format("Not parsing inexact CDS location %s in ENA %s",
+                      feature.location, this.accession));
        return new int[] {};
      }
    }