src/jalview/io/StockholmFile.java

   1 /*
   2  * Jalview - A Sequence Alignment Editor and Viewer ($$Version-Rel$$)
   3  * Copyright (C) $$Year-Rel$$ The Jalview Authors
   4  *
   5  * This file is part of Jalview.
   6  *
   7  * Jalview is free software: you can redistribute it and/or
   8  * modify it under the terms of the GNU General Public License
   9  * as published by the Free Software Foundation, either version 3
  10  * of the License, or (at your option) any later version.
  11  *
  12  * Jalview is distributed in the hope that it will be useful, but
  13  * WITHOUT ANY WARRANTY; without even the implied warranty
  14  * of MERCHANTABILITY or FITNESS FOR A PARTICULAR
  15  * PURPOSE.  See the GNU General Public License for more details.
  16  *
  17  * You should have received a copy of the GNU General Public License
  18  * along with Jalview.  If not, see <http://www.gnu.org/licenses/>.
  19  * The Jalview Authors are detailed in the 'AUTHORS' file.
  20  */
  21 /*
  22  * This extension was written by Benjamin Schuster-Boeckler at sanger.ac.uk
  23  */
  24 package jalview.io;
  25
  26 import jalview.analysis.Rna;
  27 import jalview.datamodel.AlignmentAnnotation;
  28 import jalview.datamodel.AlignmentI;
  29 import jalview.datamodel.Annotation;
  30 import jalview.datamodel.DBRefEntry;
  31 import jalview.datamodel.DBRefSource;
  32 import jalview.datamodel.Mapping;
  33 import jalview.datamodel.Sequence;
  34 import jalview.datamodel.SequenceFeature;
  35 import jalview.datamodel.SequenceI;
  36 import jalview.schemes.ResidueProperties;
  37 import jalview.util.Comparison;
  38 import jalview.util.DBRefUtils;
  39 import jalview.util.Format;
  40 import jalview.util.MessageManager;
  41
  42 import java.io.BufferedReader;
  43 import java.io.FileReader;
  44 import java.io.IOException;
  45 import java.util.ArrayList;
  46 import java.util.Enumeration;
  47 import java.util.Hashtable;
  48 import java.util.LinkedHashMap;
  49 import java.util.List;
  50 import java.util.Map;
  51 import java.util.Vector;
  52
  53 import com.stevesoft.pat.Regex;
  54
  55 import fr.orsay.lri.varna.exceptions.ExceptionUnmatchedClosingParentheses;
  56 import fr.orsay.lri.varna.factories.RNAFactory;
  57 import fr.orsay.lri.varna.models.rna.RNA;
  58
  59 // import org.apache.log4j.*;
  60
/**
 * This class parses a Stockholm format file into Jalview's data model.
 *
 * Known limitations (TODOs): the database source and version for the file are
 * not known when parsing the #=GS AC tag which associates accessions with
 * sequences. Database references are also not parsed correctly: a separate
 * reference string parser must be added to parse the database reference form
 * into Jalview's local representation.
 *
 * @author bsb at sanger.ac.uk
 * @author Natasha Shersnev (Dundee, UK) (Stockholm file writer)
 * @author Lauren Lui (UCSC, USA) (RNA secondary structure annotation import as
 *         stockholm)
 * @author Anne Menard (Paris, FR) (VARNA parsing of Stockholm file data)
 * @version 0.3 + jalview mods
 *
 */
  77 public class StockholmFile extends AlignFile
  78 {
  79   private static final String ANNOTATION = "annotation";
  80
  81   private static final char UNDERSCORE = '_';
  82
  83   private static final Regex OPEN_PAREN = new Regex("(<|\\[)", "(");
  84
  85   private static final Regex CLOSE_PAREN = new Regex("(>|\\])", ")");
  86
  87   // private static final Regex OPEN_PAREN = new Regex("(<|\\[)", "(");
  88   // private static final Regex CLOSE_PAREN = new Regex("(>|\\])", ")");
  89
  90   public static final Regex DETECT_BRACKETS = new Regex(
  91           "(<|>|\\[|\\]|\\(|\\)|\\{|\\})");
  92
  93   // WUSS extended symbols. Avoid ambiguity with protein SS annotations by using NOT_RNASS first.
  94   public static final String RNASS_BRACKETS = "<>[](){}AaBbCcDdEeFfGgHhIiJjKkLlMmNnOoPpQqRrSsTtUuVvWwXxYyZz";
  95
  96   // use the following regex to decide an annotations (whole) line is NOT an RNA
  97   // SS (it contains only E,H,e,h and other non-brace/non-alpha chars)
  98   private static final Regex NOT_RNASS = new Regex(
  99           "^[^<>[\\](){}A-DF-Za-df-z]*$");
 100
 101   StringBuffer out; // output buffer
 102
 103   private AlignmentI al;
 104
 105   public StockholmFile()
 106   {
 107   }
 108
 109   /**
 110    * Creates a new StockholmFile object for output
 111    */
 112   public StockholmFile(AlignmentI al)
 113   {
 114     this.al = al;
 115   }
 116
 117   public StockholmFile(String inFile, DataSourceType type)
 118           throws IOException
 119   {
 120     super(inFile, type);
 121   }
 122
 123   public StockholmFile(FileParse source) throws IOException
 124   {
 125     super(source);
 126   }
 127
 128   @Override
 129   public void initData()
 130   {
 131     super.initData();
 132   }
 133
 134   /**
 135    * Parse a file in Stockholm format into Jalview's data model using VARNA
 136    *
 137    * @throws IOException
 138    *           If there is an error with the input file
 139    */
 140   public void parse_with_VARNA(java.io.File inFile) throws IOException
 141   {
 142     FileReader fr = null;
 143     fr = new FileReader(inFile);
 144
 145     BufferedReader r = new BufferedReader(fr);
 146     List<RNA> result = null;
 147     try
 148     {
 149       result = RNAFactory.loadSecStrStockholm(r);
 150     } catch (ExceptionUnmatchedClosingParentheses umcp)
 151     {
 152       errormessage = "Unmatched parentheses in annotation. Aborting ("
 153               + umcp.getMessage() + ")";
 154       throw new IOException(umcp);
 155     }
 156     // DEBUG System.out.println("this is the secondary scructure:"
 157     // +result.size());
 158     SequenceI[] seqs = new SequenceI[result.size()];
 159     String id = null;
 160     for (int i = 0; i < result.size(); i++)
 161     {
 162       // DEBUG System.err.println("Processing i'th sequence in Stockholm file")
 163       RNA current = result.get(i);
 164
 165       String seq = current.getSeq();
 166       String rna = current.getStructDBN(true);
 167       // DEBUG System.out.println(seq);
 168       // DEBUG System.err.println(rna);
 169       int begin = 0;
 170       int end = seq.length() - 1;
 171       id = safeName(getDataName());
 172       seqs[i] = new Sequence(id, seq, begin, end);
 173       String[] annot = new String[rna.length()];
 174       Annotation[] ann = new Annotation[rna.length()];
 175       for (int j = 0; j < rna.length(); j++)
 176       {
 177         annot[j] = rna.substring(j, j + 1);
 178
 179       }
 180
 181       for (int k = 0; k < rna.length(); k++)
 182       {
 183         ann[k] = new Annotation(annot[k], "",
 184                 Rna.getRNASecStrucState(annot[k]).charAt(0), 0f);
 185
 186       }
 187       AlignmentAnnotation align = new AlignmentAnnotation("Sec. str.",
 188               current.getID(), ann);
 189
 190       seqs[i].addAlignmentAnnotation(align);
 191       seqs[i].setRNA(result.get(i));
 192       this.annotations.addElement(align);
 193     }
 194     this.setSeqs(seqs);
 195
 196   }
 197
 198   /**
 199    * Parse a file in Stockholm format into Jalview's data model. The file has to
 200    * be passed at construction time
 201    *
 202    * @throws IOException
 203    *           If there is an error with the input file
 204    */
 205   @Override
 206   public void parse() throws IOException
 207   {
 208     StringBuffer treeString = new StringBuffer();
 209     String treeName = null;
 210     // --------------- Variable Definitions -------------------
 211     String line;
 212     String version;
 213     // String id;
 214     Hashtable seqAnn = new Hashtable(); // Sequence related annotations
 215     LinkedHashMap<String, String> seqs = new LinkedHashMap<>();
 216     Regex p, r, rend, s, x;
 217     // Temporary line for processing RNA annotation
 218     // String RNAannot = "";
 219
 220     // ------------------ Parsing File ----------------------
 221     // First, we have to check that this file has STOCKHOLM format, i.e. the
 222     // first line must match
 223
 224     r = new Regex("# STOCKHOLM ([\\d\\.]+)");
 225     if (!r.search(nextLine()))
 226     {
 227       throw new IOException(MessageManager
 228               .getString("exception.stockholm_invalid_format"));
 229     }
 230     else
 231     {
 232       version = r.stringMatched(1);
 233
 234       // logger.debug("Stockholm version: " + version);
 235     }
 236
 237     // We define some Regexes here that will be used regularly later
 238     rend = new Regex("^\\s*\\/\\/"); // Find the end of an alignment
 239     p = new Regex("(\\S+)\\/(\\d+)\\-(\\d+)"); // split sequence id in
 240     // id/from/to
 241     s = new Regex("(\\S+)\\s+(\\S*)\\s+(.*)"); // Parses annotation subtype
 242     r = new Regex("#=(G[FSRC]?)\\s+(.*)"); // Finds any annotation line
 243     x = new Regex("(\\S+)\\s+(\\S+)"); // split id from sequence
 244
 245     // Convert all bracket types to parentheses (necessary for passing to VARNA)
 246     Regex openparen = new Regex("(<|\\[)", "(");
 247     Regex closeparen = new Regex("(>|\\])", ")");
 248
 249     // Detect if file is RNA by looking for bracket types
 250     Regex detectbrackets = new Regex("(<|>|\\[|\\]|\\(|\\))");
 251
 252     rend.optimize();
 253     p.optimize();
 254     s.optimize();
 255     r.optimize();
 256     x.optimize();
 257     openparen.optimize();
 258     closeparen.optimize();
 259
 260     while ((line = nextLine()) != null)
 261     {
 262       if (line.length() == 0)
 263       {
 264         continue;
 265       }
 266       if (rend.search(line))
 267       {
 268         // End of the alignment, pass stuff back
 269         this.noSeqs = seqs.size();
 270
 271         String seqdb, dbsource = null;
 272         Regex pf = new Regex("PF[0-9]{5}(.*)"); // Finds AC for Pfam
 273         Regex rf = new Regex("RF[0-9]{5}(.*)"); // Finds AC for Rfam
 274         if (getAlignmentProperty("AC") != null)
 275         {
 276           String dbType = getAlignmentProperty("AC").toString();
 277           if (pf.search(dbType))
 278           {
 279             // PFAM Alignment - so references are typically from Uniprot
 280             dbsource = "PFAM";
 281           }
 282           else if (rf.search(dbType))
 283           {
 284             dbsource = "RFAM";
 285           }
 286         }
 287         // logger.debug("Number of sequences: " + this.noSeqs);
 288         for (Map.Entry<String, String> skey : seqs.entrySet())
 289         {
 290           // logger.debug("Processing sequence " + acc);
 291           String acc = skey.getKey();
 292           String seq = skey.getValue();
 293           if (maxLength < seq.length())
 294           {
 295             maxLength = seq.length();
 296           }
 297           int start = 1;
 298           int end = -1;
 299           String sid = acc;
 300           /*
 301            * Retrieve hash of annotations for this accession Associate
 302            * Annotation with accession
 303            */
 304           Hashtable accAnnotations = null;
 305
 306           if (seqAnn != null && seqAnn.containsKey(acc))
 307           {
 308             accAnnotations = (Hashtable) seqAnn.remove(acc);
 309             // TODO: add structures to sequence
 310           }
 311
 312           // Split accession in id and from/to
 313           if (p.search(acc))
 314           {
 315             sid = p.stringMatched(1);
 316             start = Integer.parseInt(p.stringMatched(2));
 317             end = Integer.parseInt(p.stringMatched(3));
 318           }
 319           // logger.debug(sid + ", " + start + ", " + end);
 320
 321           Sequence seqO = new Sequence(sid, seq, start, end);
 322           // Add Description (if any)
 323           if (accAnnotations != null && accAnnotations.containsKey("DE"))
 324           {
 325             String desc = (String) accAnnotations.get("DE");
 326             seqO.setDescription((desc == null) ? "" : desc);
 327           }
 328           // Add DB References (if any)
 329           if (accAnnotations != null && accAnnotations.containsKey("DR"))
 330           {
 331             String dbr = (String) accAnnotations.get("DR");
 332             if (dbr != null && dbr.indexOf(";") > -1)
 333             {
 334               String src = dbr.substring(0, dbr.indexOf(";"));
 335               String acn = dbr.substring(dbr.indexOf(";") + 1);
 336               jalview.util.DBRefUtils.parseToDbRef(seqO, src, "0", acn);
 337             }
 338           }
 339
 340           if (accAnnotations != null && accAnnotations.containsKey("AC"))
 341           {
 342             String dbr = (String) accAnnotations.get("AC");
 343             if (dbr != null)
 344             {
 345               // we could get very clever here - but for now - just try to
 346               // guess accession type from type of sequence, source of alignment plus
 347               // structure
 348               // of accession
 349               guessDatabaseFor(seqO, dbr, dbsource);
 350             }
 351             // else - do what ? add the data anyway and prompt the user to
 352             // specify what references these are ?
 353           }
 354
 355           Hashtable features = null;
 356           // We need to adjust the positions of all features to account for gaps
 357           try
 358           {
 359             features = (Hashtable) accAnnotations.remove("features");
 360           } catch (java.lang.NullPointerException e)
 361           {
 362             // loggerwarn("Getting Features for " + acc + ": " +
 363             // e.getMessage());
 364             // continue;
 365           }
 366           // if we have features
 367           if (features != null)
 368           {
 369             int posmap[] = seqO.findPositionMap();
 370             Enumeration i = features.keys();
 371             while (i.hasMoreElements())
 372             {
 373               // TODO: parse out secondary structure annotation as annotation
 374               // row
 375               // TODO: parse out scores as annotation row
 376               // TODO: map coding region to core jalview feature types
 377               String type = i.nextElement().toString();
 378               Hashtable content = (Hashtable) features.remove(type);
 379
 380               // add alignment annotation for this feature
 381               String key = type2id(type);
 382
 383               /*
 384                * have we added annotation rows for this type ?
 385                */
 386               boolean annotsAdded = false;
 387               if (key != null)
 388               {
 389                 if (accAnnotations != null
 390                         && accAnnotations.containsKey(key))
 391                 {
 392                   Vector vv = (Vector) accAnnotations.get(key);
 393                   for (int ii = 0; ii < vv.size(); ii++)
 394                   {
 395                     annotsAdded = true;
 396                     AlignmentAnnotation an = (AlignmentAnnotation) vv
 397                             .elementAt(ii);
 398                     seqO.addAlignmentAnnotation(an);
 399                     annotations.add(an);
 400                   }
 401                 }
 402               }
 403
 404               Enumeration j = content.keys();
 405               while (j.hasMoreElements())
 406               {
 407                 String desc = j.nextElement().toString();
 408                 if (ANNOTATION.equals(desc) && annotsAdded)
 409                 {
 410                   // don't add features if we already added an annotation row
 411                   continue;
 412                 }
 413                 String ns = content.get(desc).toString();
 414                 char[] byChar = ns.toCharArray();
 415                 for (int k = 0; k < byChar.length; k++)
 416                 {
 417                   char c = byChar[k];
 418                   if (!(c == ' ' || c == '_' || c == '-' || c == '.')) // PFAM
 419                   // uses
 420                   // '.'
 421                   // for
 422                   // feature
 423                   // background
 424                   {
 425                     int new_pos = posmap[k]; // look up nearest seqeunce
 426                     // position to this column
 427                     SequenceFeature feat = new SequenceFeature(type, desc,
 428                             new_pos, new_pos, null);
 429
 430                     seqO.addSequenceFeature(feat);
 431                   }
 432                 }
 433               }
 434
 435             }
 436
 437           }
 438           // garbage collect
 439
 440           // logger.debug("Adding seq " + acc + " from " + start + " to " + end
 441           // + ": " + seq);
 442           this.seqs.addElement(seqO);
 443         }
 444         return; // finished parsing this segment of source
 445       }
 446       else if (!r.search(line))
 447       {
 448         // System.err.println("Found sequence line: " + line);
 449
 450         // Split sequence in sequence and accession parts
 451         if (!x.search(line))
 452         {
 453           // logger.error("Could not parse sequence line: " + line);
 454           throw new IOException(MessageManager.formatMessage(
 455                   "exception.couldnt_parse_sequence_line", new String[]
 456                   { line }));
 457         }
 458         String ns = seqs.get(x.stringMatched(1));
 459         if (ns == null)
 460         {
 461           ns = "";
 462         }
 463         ns += x.stringMatched(2);
 464
 465         seqs.put(x.stringMatched(1), ns);
 466       }
 467       else
 468       {
 469         String annType = r.stringMatched(1);
 470         String annContent = r.stringMatched(2);
 471
 472         // System.err.println("type:" + annType + " content: " + annContent);
 473
 474         if (annType.equals("GF"))
 475         {
 476           /*
 477            * Generic per-File annotation, free text Magic features: #=GF NH
 478            * <tree in New Hampshire eXtended format> #=GF TN <Unique identifier
 479            * for the next tree> Pfam descriptions: 7. DESCRIPTION OF FIELDS
 480            *
 481            * Compulsory fields: ------------------
 482            *
 483            * AC Accession number: Accession number in form PFxxxxx.version or
 484            * PBxxxxxx. ID Identification: One word name for family. DE
 485            * Definition: Short description of family. AU Author: Authors of the
 486            * entry. SE Source of seed: The source suggesting the seed members
 487            * belong to one family. GA Gathering method: Search threshold to
 488            * build the full alignment. TC Trusted Cutoff: Lowest sequence score
 489            * and domain score of match in the full alignment. NC Noise Cutoff:
 490            * Highest sequence score and domain score of match not in full
 491            * alignment. TP Type: Type of family -- presently Family, Domain,
 492            * Motif or Repeat. SQ Sequence: Number of sequences in alignment. AM
 493            * Alignment Method The order ls and fs hits are aligned to the model
 494            * to build the full align. // End of alignment.
 495            *
 496            * Optional fields: ----------------
 497            *
 498            * DC Database Comment: Comment about database reference. DR Database
 499            * Reference: Reference to external database. RC Reference Comment:
 500            * Comment about literature reference. RN Reference Number: Reference
 501            * Number. RM Reference Medline: Eight digit medline UI number. RT
 502            * Reference Title: Reference Title. RA Reference Author: Reference
 503            * Author RL Reference Location: Journal location. PI Previous
 504            * identifier: Record of all previous ID lines. KW Keywords: Keywords.
 505            * CC Comment: Comments. NE Pfam accession: Indicates a nested domain.
 506            * NL Location: Location of nested domains - sequence ID, start and
 507            * end of insert.
 508            *
 509            * Obsolete fields: ----------- AL Alignment method of seed: The
 510            * method used to align the seed members.
 511            */
 512           // Let's save the annotations, maybe we'll be able to do something
 513           // with them later...
 514           Regex an = new Regex("(\\w+)\\s*(.*)");
 515           if (an.search(annContent))
 516           {
 517             if (an.stringMatched(1).equals("NH"))
 518             {
 519               treeString.append(an.stringMatched(2));
 520             }
 521             else if (an.stringMatched(1).equals("TN"))
 522             {
 523               if (treeString.length() > 0)
 524               {
 525                 if (treeName == null)
 526                 {
 527                   treeName = "Tree " + (getTreeCount() + 1);
 528                 }
 529                 addNewickTree(treeName, treeString.toString());
 530               }
 531               treeName = an.stringMatched(2);
 532               treeString = new StringBuffer();
 533             }
 534             // TODO: JAL-3532 - this is where GF comments and database references are lost
 535             // suggest overriding this method for Stockholm files to catch and properly
 536             // process CC, DR etc into multivalued properties
 537             setAlignmentProperty(an.stringMatched(1), an.stringMatched(2));
 538           }
 539         }
 540         else if (annType.equals("GS"))
 541         {
 542           // Generic per-Sequence annotation, free text
 543           /*
 544            * Pfam uses these features: Feature Description ---------------------
 545            * ----------- AC <accession> ACcession number DE <freetext>
 546            * DEscription DR <db>; <accession>; Database Reference OS <organism>
 547            * OrganiSm (species) OC <clade> Organism Classification (clade, etc.)
 548            * LO <look> Look (Color, etc.)
 549            */
 550           if (s.search(annContent))
 551           {
 552             String acc = s.stringMatched(1);
 553             String type = s.stringMatched(2);
 554             String content = s.stringMatched(3);
 555             // TODO: store DR in a vector.
 556             // TODO: store AC according to generic file db annotation.
 557             Hashtable ann;
 558             if (seqAnn.containsKey(acc))
 559             {
 560               ann = (Hashtable) seqAnn.get(acc);
 561             }
 562             else
 563             {
 564               ann = new Hashtable();
 565             }
 566             ann.put(type, content);
 567             seqAnn.put(acc, ann);
 568           }
 569           else
 570           {
 571             // throw new IOException("Error parsing " + line);
 572             System.err.println(">> missing annotation: " + line);
 573           }
 574         }
 575         else if (annType.equals("GC"))
 576         {
 577           // Generic per-Column annotation, exactly 1 char per column
 578           // always need a label.
 579           if (x.search(annContent))
 580           {
 581             // parse out and create alignment annotation directly.
 582             parseAnnotationRow(annotations, x.stringMatched(1),
 583                     x.stringMatched(2));
 584           }
 585         }
 586         else if (annType.equals("GR"))
 587         {
 588           // Generic per-Sequence AND per-Column markup, exactly 1 char per
 589           // column
 590           /*
 591            * Feature Description Markup letters ------- -----------
 592            * -------------- SS Secondary Structure [HGIEBTSCX] SA Surface
 593            * Accessibility [0-9X] (0=0%-10%; ...; 9=90%-100%) TM TransMembrane
 594            * [Mio] PP Posterior Probability [0-9*] (0=0.00-0.05; 1=0.05-0.15;
 595            * *=0.95-1.00) LI LIgand binding [*] AS Active Site [*] IN INtron (in
 596            * or after) [0-2]
 597            */
 598           if (s.search(annContent))
 599           {
 600             String acc = s.stringMatched(1);
 601             String type = s.stringMatched(2);
 602             String oseq = s.stringMatched(3);
 603             /*
 604              * copy of annotation field that may be processed into whitespace chunks
 605              */
 606             String seq = new String(oseq);
 607
 608             Hashtable ann;
 609             // Get an object with all the annotations for this sequence
 610             if (seqAnn.containsKey(acc))
 611             {
 612               // logger.debug("Found annotations for " + acc);
 613               ann = (Hashtable) seqAnn.get(acc);
 614             }
 615             else
 616             {
 617               // logger.debug("Creating new annotations holder for " + acc);
 618               ann = new Hashtable();
 619               seqAnn.put(acc, ann);
 620             }
 621
 622             // // start of block for appending annotation lines for wrapped
 623             // stokchholm file
 624             // TODO test structure, call parseAnnotationRow with vector from
 625             // hashtable for specific sequence
 626
 627             Hashtable features;
 628             // Get an object with all the content for an annotation
 629             if (ann.containsKey("features"))
 630             {
 631               // logger.debug("Found features for " + acc);
 632               features = (Hashtable) ann.get("features");
 633             }
 634             else
 635             {
 636               // logger.debug("Creating new features holder for " + acc);
 637               features = new Hashtable();
 638               ann.put("features", features);
 639             }
 640
 641             Hashtable content;
 642             if (features.containsKey(this.id2type(type)))
 643             {
 644               // logger.debug("Found content for " + this.id2type(type));
 645               content = (Hashtable) features
 646                       .get(this.id2type(type));
 647             }
 648             else
 649             {
 650               // logger.debug("Creating new content holder for " +
 651               // this.id2type(type));
 652               content = new Hashtable();
 653               features.put(id2type(type), content);
 654             }
 655             String ns = (String) content.get(ANNOTATION);
 656
 657             if (ns == null)
 658             {
 659               ns = "";
 660             }
 661             // finally, append the annotation line
 662             ns += seq;
 663             content.put(ANNOTATION, ns);
 664             // // end of wrapped annotation block.
 665             // // Now a new row is created with the current set of data
 666
 667             Hashtable strucAnn;
 668             if (seqAnn.containsKey(acc))
 669             {
 670               strucAnn = (Hashtable) seqAnn.get(acc);
 671             }
 672             else
 673             {
 674               strucAnn = new Hashtable();
 675             }
 676
 677             Vector<AlignmentAnnotation> newStruc = new Vector<>();
 678             parseAnnotationRow(newStruc, type, ns);
 679             for (AlignmentAnnotation alan : newStruc)
 680             {
 681               alan.visible = false;
 682             }
 683             // new annotation overwrites any existing annotation...
 684
 685             strucAnn.put(type, newStruc);
 686             seqAnn.put(acc, strucAnn);
 687           }
 688           // }
 689           else
 690           {
 691             System.err.println(
 692                     "Warning - couldn't parse sequence annotation row line:\n"
 693                             + line);
 694             // throw new IOException("Error parsing " + line);
 695           }
 696         }
 697         else
 698         {
 699           throw new IOException(MessageManager.formatMessage(
 700                   "exception.unknown_annotation_detected", new String[]
 701                   { annType, annContent }));
 702         }
 703       }
 704     }
 705     if (treeString.length() > 0)
 706     {
 707       if (treeName == null)
 708       {
 709         treeName = "Tree " + (1 + getTreeCount());
 710       }
 711       addNewickTree(treeName, treeString.toString());
 712     }
 713   }
 714
 715   /**
 716    * Demangle an accession string and guess the originating sequence database
 717    * for a given sequence
 718    *
 719    * @param seqO
 720    *          sequence to be annotated
 721    * @param dbr
 722    *          Accession string for sequence
 723    * @param dbsource
 724    *          source database for alignment (PFAM or RFAM)
 725    */
 726   private void guessDatabaseFor(Sequence seqO, String dbr, String dbsource)
 727   {
 728     DBRefEntry dbrf = null;
 729     List<DBRefEntry> dbrs = new ArrayList<>();
 730     String seqdb = "Unknown", sdbac = "" + dbr;
 731     int st = -1, en = -1, p;
 732     if ((st = sdbac.indexOf("/")) > -1)
 733     {
 734       String num, range = sdbac.substring(st + 1);
 735       sdbac = sdbac.substring(0, st);
 736       if ((p = range.indexOf("-")) > -1)
 737       {
 738         p++;
 739         if (p < range.length())
 740         {
 741           num = range.substring(p).trim();
 742           try
 743           {
 744             en = Integer.parseInt(num);
 745           } catch (NumberFormatException x)
 746           {
 747             // could warn here that index is invalid
 748             en = -1;
 749           }
 750         }
 751       }
 752       else
 753       {
 754         p = range.length();
 755       }
 756       num = range.substring(0, p).trim();
 757       try
 758       {
 759         st = Integer.parseInt(num);
 760       } catch (NumberFormatException x)
 761       {
 762         // could warn here that index is invalid
 763         st = -1;
 764       }
 765     }
 766     if (dbsource == null)
 767     {
 768       // make up an origin based on whether the sequence looks like it is nucleotide
 769       // or protein
 770       dbsource = (seqO.isProtein()) ? "PFAM" : "RFAM";
 771     }
 772     if (dbsource.equals("PFAM"))
 773     {
 774       seqdb = "UNIPROT";
 775       if (sdbac.indexOf(".") > -1)
 776       {
 777         // strip of last subdomain
 778         sdbac = sdbac.substring(0, sdbac.indexOf("."));
 779         dbrf = jalview.util.DBRefUtils.parseToDbRef(seqO, seqdb, dbsource,
 780                 sdbac);
 781         if (dbrf != null)
 782         {
 783           dbrs.add(dbrf);
 784         }
 785       }
 786       dbrf = jalview.util.DBRefUtils.parseToDbRef(seqO, dbsource, dbsource,
 787               dbr);
 788       if (dbr != null)
 789       {
 790         dbrs.add(dbrf);
 791       }
 792     }
 793     else
 794     {
 795       seqdb = "EMBL"; // total guess - could be ENA, or something else these
 796                       // days
 797       if (sdbac.indexOf(".") > -1)
 798       {
 799         // strip off last subdomain
 800         sdbac = sdbac.substring(0, sdbac.indexOf("."));
 801         dbrf = jalview.util.DBRefUtils.parseToDbRef(seqO, seqdb, dbsource,
 802                 sdbac);
 803         if (dbrf != null)
 804         {
 805           dbrs.add(dbrf);
 806         }
 807       }
 808
 809       dbrf = jalview.util.DBRefUtils.parseToDbRef(seqO, dbsource, dbsource,
 810               dbr);
 811       if (dbrf != null)
 812       {
 813         dbrs.add(dbrf);
 814       }
 815     }
 816     if (st != -1 && en != -1)
 817     {
 818       for (DBRefEntry d : dbrs)
 819       {
 820         jalview.util.MapList mp = new jalview.util.MapList(
 821                 new int[]
 822                 { seqO.getStart(), seqO.getEnd() }, new int[] { st, en }, 1,
 823                 1);
 824         jalview.datamodel.Mapping mping = new Mapping(mp);
 825         d.setMap(mping);
 826       }
 827     }
 828   }
 829
 830   protected static AlignmentAnnotation parseAnnotationRow(
 831           Vector<AlignmentAnnotation> annotation, String label,
 832           String annots)
 833   {
 834           String convert1, convert2 = null;
 835     // String convert1 = OPEN_PAREN.replaceAll(annots);
 836     // String convert2 = CLOSE_PAREN.replaceAll(convert1);
 837     // annots = convert2;
 838
 839     String type = label;
 840     if (label.contains("_cons"))
 841     {
 842       type = (label.indexOf("_cons") == label.length() - 5)
 843               ? label.substring(0, label.length() - 5)
 844               : label;
 845     }
 846     boolean ss = false, posterior = false;
 847     type = id2type(type);
 848
 849     boolean isrnass = false;
 850
 851     if (type.equalsIgnoreCase("secondary structure"))
 852     {
 853       ss = true;
 854       isrnass = !NOT_RNASS.search(annots); // sorry about the double negative
 855                                            // here (it's easier for dealing with
 856                                            // other non-alpha-non-brace chars)
 857     }
 858     if (type.equalsIgnoreCase("posterior probability"))
 859     {
 860       posterior = true;
 861     }
 862     // decide on secondary structure or not.
 863     Annotation[] els = new Annotation[annots.length()];
 864     for (int i = 0; i < annots.length(); i++)
 865     {
 866       String pos = annots.substring(i, i + 1);
 867       if (UNDERSCORE == pos.charAt(0))
 868       {
 869         pos = " ";
 870       }
 871       Annotation ann;
 872       ann = new Annotation(pos, "", ' ', 0f); // 0f is 'valid' null - will not
 873       // be written out
 874       if (ss)
 875       {
 876         // if (" .-_".indexOf(pos) == -1)
 877         {
 878           if (isrnass && RNASS_BRACKETS.indexOf(pos) >= 0)
 879           {
 880             ann.secondaryStructure = Rna.getRNASecStrucState(pos).charAt(0);
 881             ann.displayCharacter = "" + pos.charAt(0);
 882           }
 883           else
 884           {
 885             ann.secondaryStructure = ResidueProperties.getDssp3state(pos)
 886                     .charAt(0);
 887
 888             if (ann.secondaryStructure == pos.charAt(0))
 889             {
 890               ann.displayCharacter = ""; // null; // " ";
 891             }
 892             else
 893             {
 894               ann.displayCharacter = " " + ann.displayCharacter;
 895             }
 896           }
 897         }
 898
 899       }
 900       if (posterior && !ann.isWhitespace()
 901               && !Comparison.isGap(pos.charAt(0)))
 902       {
 903         float val = 0;
 904         // symbol encodes values - 0..*==0..10
 905         if (pos.charAt(0) == '*')
 906         {
 907           val = 10;
 908         }
 909         else
 910         {
 911           val = pos.charAt(0) - '0';
 912           if (val > 9)
 913           {
 914             val = 10;
 915           }
 916         }
 917         ann.value = val;
 918       }
 919
 920       els[i] = ann;
 921     }
 922     AlignmentAnnotation annot = null;
 923     Enumeration<AlignmentAnnotation> e = annotation.elements();
 924     while (e.hasMoreElements())
 925     {
 926       annot = e.nextElement();
 927       if (annot.label.equals(type))
 928       {
 929         break;
 930       }
 931       annot = null;
 932     }
 933     if (annot == null)
 934     {
 935       annot = new AlignmentAnnotation(type, type, els);
 936       annotation.addElement(annot);
 937     }
 938     else
 939     {
 940       Annotation[] anns = new Annotation[annot.annotations.length
 941               + els.length];
 942       System.arraycopy(annot.annotations, 0, anns, 0,
 943               annot.annotations.length);
 944       System.arraycopy(els, 0, anns, annot.annotations.length, els.length);
 945       annot.annotations = anns;
 946       // System.out.println("else: ");
 947     }
 948     return annot;
 949   }
 950
 951   private String dbref_to_ac_record(DBRefEntry ref)
 952   {
 953     return ref.getSource().toString() + " ; "
 954             + ref.getAccessionId().toString();
 955   }
 956   @Override
 957   public String print(SequenceI[] s, boolean jvSuffix)
 958   {
 959     out = new StringBuffer();
 960     out.append("# STOCKHOLM 1.0");
 961     out.append(newline);
 962
 963     // find max length of id
 964     int max = 0;
 965     int maxid = 0;
 966     int in = 0;
 967     Hashtable dataRef = null;
 968     boolean isAA = s[in].isProtein();
 969     while ((in < s.length) && (s[in] != null))
 970     {
 971
 972       String tmp = printId(s[in], jvSuffix);
 973       max = Math.max(max, s[in].getLength());
 974
 975       if (tmp.length() > maxid)
 976       {
 977         maxid = tmp.length();
 978       }
 979       if (s[in].getDBRefs() != null)
 980       {
 981         if (dataRef == null)
 982         {
 983           dataRef = new Hashtable();
 984         }
 985         List<DBRefEntry> primrefs = s[in].getPrimaryDBRefs();
 986         if (primrefs.size() >= 1)
 987         {
 988           dataRef.put(tmp, dbref_to_ac_record(primrefs.get(0)));
 989         }
 990         else
 991         {
 992           for (int idb = 0; idb < s[in].getDBRefs().length; idb++)
 993           {
 994             DBRefEntry dbref = s[in].getDBRefs()[idb];
 995             dataRef.put(tmp, dbref_to_ac_record(dbref));
 996             // if we put in a uniprot or EMBL record then we're done:
 997             if (isAA && DBRefSource.UNIPROT
 998                     .equals(DBRefUtils.getCanonicalName(dbref.getSource())))
 999             {
1000               break;
1001             }
1002             if (!isAA && DBRefSource.EMBL
1003                     .equals(DBRefUtils.getCanonicalName(dbref.getSource())))
1004             {
1005               break;
1006             }
1007           }
1008         }
1009       }
1010       in++;
1011     }
1012     maxid += 9;
1013     int i = 0;
1014
1015     // output database type
1016     if (al.getProperties() != null)
1017     {
1018       if (!al.getProperties().isEmpty())
1019       {
1020         Enumeration key = al.getProperties().keys();
1021         Enumeration val = al.getProperties().elements();
1022         while (key.hasMoreElements())
1023         {
1024           out.append("#=GF " + key.nextElement() + " " + val.nextElement());
1025           out.append(newline);
1026         }
1027       }
1028     }
1029
1030     // output database accessions
1031     if (dataRef != null)
1032     {
1033       Enumeration en = dataRef.keys();
1034       while (en.hasMoreElements())
1035       {
1036         Object idd = en.nextElement();
1037         String type = (String) dataRef.remove(idd);
1038         out.append(new Format("%-" + (maxid - 2) + "s")
1039                 .form("#=GS " + idd.toString() + " "));
1040         if (isAA && type.contains("UNIPROT")
1041                 || (!isAA && type.contains("EMBL")))
1042         {
1043
1044           out.append(" AC " + type.substring(type.indexOf(";") + 1));
1045         }
1046         else
1047         {
1048           out.append(" DR " + type + " ");
1049         }
1050         out.append(newline);
1051       }
1052     }
1053
1054     // output annotations
1055     while (i < s.length && s[i] != null)
1056     {
1057       AlignmentAnnotation[] alAnot = s[i].getAnnotation();
1058       if (alAnot != null)
1059       {
1060         Annotation[] ann;
1061
1062         for (int j = 0; j < alAnot.length; j++)
1063         {
1064           if (alAnot[j].annotations != null)
1065           {
1066             String key = type2id(alAnot[j].label);
1067             boolean isrna = alAnot[j].isValidStruc();
1068
1069             if (isrna)
1070             {
1071               // hardwire to secondary structure if there is RNA secondary
1072               // structure on the annotation
1073               key = "SS";
1074             }
1075             if (key == null)
1076             {
1077
1078               continue;
1079             }
1080
1081             // out.append("#=GR ");
1082             out.append(new Format("%-" + maxid + "s").form(
1083                     "#=GR " + printId(s[i], jvSuffix) + " " + key + " "));
1084             ann = alAnot[j].annotations;
1085             String seq = "";
1086             for (int k = 0; k < ann.length; k++)
1087             {
1088               seq += outputCharacter(key, k, isrna, ann, s[i]);
1089             }
1090             out.append(seq);
1091             out.append(newline);
1092           }
1093         }
1094
1095       }
1096
1097       out.append(new Format("%-" + maxid + "s")
1098               .form(printId(s[i], jvSuffix) + " "));
1099       out.append(s[i].getSequenceAsString());
1100       out.append(newline);
1101       i++;
1102     }
1103
1104     // alignment annotation
1105     AlignmentAnnotation aa;
1106     if (al.getAlignmentAnnotation() != null)
1107     {
1108       for (int ia = 0; ia < al.getAlignmentAnnotation().length; ia++)
1109       {
1110         aa = al.getAlignmentAnnotation()[ia];
1111         if (aa.autoCalculated || !aa.visible || aa.sequenceRef != null)
1112         {
1113           continue;
1114         }
1115         String seq = "";
1116         String label;
1117         String key = "";
1118         if (aa.label.equals("seq"))
1119         {
1120           label = "seq_cons";
1121         }
1122         else
1123         {
1124           key = type2id(aa.label.toLowerCase());
1125           if (key == null)
1126           {
1127             label = aa.label;
1128           }
1129           else
1130           {
1131             label = key + "_cons";
1132           }
1133         }
1134         if (label == null)
1135         {
1136           label = aa.label;
1137         }
1138         label = label.replace(" ", "_");
1139
1140         out.append(
1141                 new Format("%-" + maxid + "s").form("#=GC " + label + " "));
1142         boolean isrna = aa.isValidStruc();
1143         for (int j = 0; j < aa.annotations.length; j++)
1144         {
1145           seq += outputCharacter(key, j, isrna, aa.annotations, null);
1146         }
1147         out.append(seq);
1148         out.append(newline);
1149       }
1150     }
1151
1152     out.append("//");
1153     out.append(newline);
1154
1155     return out.toString();
1156   }
1157
1158
1159   /**
1160    * add an annotation character to the output row
1161    *
1162    * @param seq
1163    * @param key
1164    * @param k
1165    * @param isrna
1166    * @param ann
1167    * @param sequenceI
1168    */
1169   private char outputCharacter(String key, int k, boolean isrna,
1170           Annotation[] ann, SequenceI sequenceI)
1171   {
1172     char seq = ' ';
1173     Annotation annot = ann[k];
1174     String ch = (annot == null)
1175             ? ((sequenceI == null) ? "-"
1176                     : Character.toString(sequenceI.getCharAt(k)))
1177             : (annot.displayCharacter == null
1178                     ? String.valueOf(annot.secondaryStructure)
1179                     : annot.displayCharacter);
1180     if (ch == null)
1181     {
1182       ch = " ";
1183     }
1184     if (key != null && key.equals("SS"))
1185     {
1186       char ssannotchar = ' ';
1187       boolean charset = false;
1188       if (annot == null)
1189       {
1190         // sensible gap character
1191         ssannotchar = ' ';
1192         charset = true;
1193       }
1194       else
1195       {
1196         // valid secondary structure AND no alternative label (e.g. ' B')
1197         if (annot.secondaryStructure > ' ' && ch.length() < 2)
1198         {
1199           ssannotchar = annot.secondaryStructure;
1200           charset = true;
1201         }
1202       }
1203       if (charset)
1204       {
1205         return (ssannotchar == ' ' && isrna) ? '.' : ssannotchar;
1206       }
1207     }
1208
1209     if (ch.length() == 0)
1210     {
1211       seq = '.';
1212     }
1213     else if (ch.length() == 1)
1214     {
1215       seq = ch.charAt(0);
1216     }
1217     else if (ch.length() > 1)
1218     {
1219       seq = ch.charAt(1);
1220     }
1221
1222     return (seq == ' ' && key != null && key.equals("SS") && isrna) ? '.'
1223             : seq;
1224   }
1225
1226   /**
1227    * make a friendly ID string.
1228    *
1229    * @param dataName
1230    * @return truncated dataName to after last '/'
1231    */
1232   private String safeName(String dataName)
1233   {
1234     int b = 0;
1235     while ((b = dataName.indexOf("/")) > -1 && b < dataName.length())
1236     {
1237       dataName = dataName.substring(b + 1).trim();
1238
1239     }
1240     int e = (dataName.length() - dataName.indexOf(".")) + 1;
1241     dataName = dataName.substring(1, e).trim();
1242     return dataName;
1243   }
1244
1245
1246   public String print()
1247   {
1248     out = new StringBuffer();
1249     out.append("# STOCKHOLM 1.0");
1250     out.append(newline);
1251     print(getSeqsAsArray(), false);
1252
1253     out.append("//");
1254     out.append(newline);
1255     return out.toString();
1256   }
1257
1258   private static Hashtable typeIds = null;
1259
1260   static
1261   {
1262     if (typeIds == null)
1263     {
1264       typeIds = new Hashtable();
1265       typeIds.put("SS", "Secondary Structure");
1266       typeIds.put("SA", "Surface Accessibility");
1267       typeIds.put("TM", "transmembrane");
1268       typeIds.put("PP", "Posterior Probability");
1269       typeIds.put("LI", "ligand binding");
1270       typeIds.put("AS", "active site");
1271       typeIds.put("IN", "intron");
1272       typeIds.put("IR", "interacting residue");
1273       typeIds.put("AC", "accession");
1274       typeIds.put("OS", "organism");
1275       typeIds.put("CL", "class");
1276       typeIds.put("DE", "description");
1277       typeIds.put("DR", "reference");
1278       typeIds.put("LO", "look");
1279       typeIds.put("RF", "Reference Positions");
1280
1281     }
1282   }
1283
1284   protected static String id2type(String id)
1285   {
1286     if (typeIds.containsKey(id))
1287     {
1288       return (String) typeIds.get(id);
1289     }
1290     System.err.println(
1291             "Warning : Unknown Stockholm annotation type code " + id);
1292     return id;
1293   }
1294
1295   protected static String type2id(String type)
1296   {
1297     String key = null;
1298     Enumeration e = typeIds.keys();
1299     while (e.hasMoreElements())
1300     {
1301       Object ll = e.nextElement();
1302       if (typeIds.get(ll).toString().equalsIgnoreCase(type))
1303       {
1304         key = (String) ll;
1305         break;
1306       }
1307     }
1308     if (key != null)
1309     {
1310       return key;
1311     }
1312     System.err.println(
1313             "Warning : Unknown Stockholm annotation type: " + type);
1314     return key;
1315   }
1316 }