src/jalview/io/StockholmFile.java

   1 /*
   2  * Jalview - A Sequence Alignment Editor and Viewer ($$Version-Rel$$)
   3  * Copyright (C) $$Year-Rel$$ The Jalview Authors
   4  *
   5  * This file is part of Jalview.
   6  *
   7  * Jalview is free software: you can redistribute it and/or
   8  * modify it under the terms of the GNU General Public License
   9  * as published by the Free Software Foundation, either version 3
  10  * of the License, or (at your option) any later version.
  11  *
  12  * Jalview is distributed in the hope that it will be useful, but
  13  * WITHOUT ANY WARRANTY; without even the implied warranty
  14  * of MERCHANTABILITY or FITNESS FOR A PARTICULAR
  15  * PURPOSE.  See the GNU General Public License for more details.
  16  *
  17  * You should have received a copy of the GNU General Public License
  18  * along with Jalview.  If not, see <http://www.gnu.org/licenses/>.
  19  * The Jalview Authors are detailed in the 'AUTHORS' file.
  20  */
  21 /*
  22  * This extension was written by Benjamin Schuster-Boeckler at sanger.ac.uk
  23  */
  24 package jalview.io;
  25
  26 import jalview.analysis.Rna;
  27 import jalview.datamodel.AlignmentAnnotation;
  28 import jalview.datamodel.AlignmentI;
  29 import jalview.datamodel.Annotation;
  30 import jalview.datamodel.DBRefEntry;
  31 import jalview.datamodel.Mapping;
  32 import jalview.datamodel.Sequence;
  33 import jalview.datamodel.SequenceFeature;
  34 import jalview.datamodel.SequenceI;
  35 import jalview.schemes.ResidueProperties;
  36 import jalview.util.Comparison;
  37 import jalview.util.Format;
  38 import jalview.util.MessageManager;
  39
  40 import java.io.BufferedReader;
  41 import java.io.FileReader;
  42 import java.io.IOException;
  43 import java.util.ArrayList;
  44 import java.util.Enumeration;
  45 import java.util.Hashtable;
  46 import java.util.LinkedHashMap;
  47 import java.util.List;
  48 import java.util.Map;
  49 import java.util.Vector;
  50
  51 import com.stevesoft.pat.Regex;
  52
  53 import fr.orsay.lri.varna.exceptions.ExceptionUnmatchedClosingParentheses;
  54 import fr.orsay.lri.varna.factories.RNAFactory;
  55 import fr.orsay.lri.varna.models.rna.RNA;
  56
  57 // import org.apache.log4j.*;
  58
  59 /**
  60  * This class is supposed to parse a Stockholm format file into Jalview There
  61  * are TODOs in this class: we do not know what the database source and version
  62  * is for the file when parsing the #GS= AC tag which associates accessions with
  63  * sequences. Database references are also not parsed correctly: a separate
  64  * reference string parser must be added to parse the database reference form
  65  * into Jalview's local representation.
  66  *
  67  * @author bsb at sanger.ac.uk
  68  * @author Natasha Shersnev (Dundee, UK) (Stockholm file writer)
  69  * @author Lauren Lui (UCSC, USA) (RNA secondary structure annotation import as
  70  *         stockholm)
  71  * @author Anne Menard (Paris, FR) (VARNA parsing of Stockholm file data)
  72  * @version 0.3 + jalview mods
  73  *
  74  */
  75 public class StockholmFile extends AlignFile
  76 {
  77   private static final String ANNOTATION = "annotation";
  78
  79   private static final Regex OPEN_PAREN = new Regex("(<|\\[)", "(");
  80
  81   private static final Regex CLOSE_PAREN = new Regex("(>|\\])", ")");
  82
  83   public static final Regex DETECT_BRACKETS = new Regex(
  84           "(<|>|\\[|\\]|\\(|\\)|\\{|\\})");
  85
  86   // WUSS extended symbols. Avoid ambiguity with protein SS annotations by using NOT_RNASS first.
  87   public static final String RNASS_BRACKETS = "<>[](){}AaBbCcDdEeFfGgHhIiJjKkLlMmNnOoPpQqRrSsTtUuVvWwXxYyZz";
  88
  89   // use the following regex to decide an annotations (whole) line is NOT an RNA
  90   // SS (it contains only E,H,e,h and other non-brace/non-alpha chars)
  91   private static final Regex NOT_RNASS = new Regex(
  92           "^[^<>[\\](){}A-DF-Za-df-z]*$");
  93
  94   StringBuffer out; // output buffer
  95
  96   AlignmentI al;
  97
  98   public StockholmFile()
  99   {
 100   }
 101
 102   /**
 103    * Creates a new StockholmFile object for output.
 104    */
 105   public StockholmFile(AlignmentI al)
 106   {
 107     this.al = al;
 108   }
 109
 110   public StockholmFile(String inFile, DataSourceType type)
 111           throws IOException
 112   {
 113     super(inFile, type);
 114   }
 115
 116   public StockholmFile(FileParse source) throws IOException
 117   {
 118     super(source);
 119   }
 120
 121   @Override
 122   public void initData()
 123   {
 124     super.initData();
 125   }
 126
 127   /**
 128    * Parse a file in Stockholm format into Jalview's data model using VARNA
 129    *
 130    * @throws IOException
 131    *           If there is an error with the input file
 132    */
 133   public void parse_with_VARNA(java.io.File inFile) throws IOException
 134   {
 135     FileReader fr = null;
 136     fr = new FileReader(inFile);
 137
 138     BufferedReader r = new BufferedReader(fr);
 139     List<RNA> result = null;
 140     try
 141     {
 142       result = RNAFactory.loadSecStrStockholm(r);
 143     } catch (ExceptionUnmatchedClosingParentheses umcp)
 144     {
 145       errormessage = "Unmatched parentheses in annotation. Aborting ("
 146               + umcp.getMessage() + ")";
 147       throw new IOException(umcp);
 148     }
 149     // DEBUG System.out.println("this is the secondary scructure:"
 150     // +result.size());
 151     SequenceI[] seqs = new SequenceI[result.size()];
 152     String id = null;
 153     for (int i = 0; i < result.size(); i++)
 154     {
 155       // DEBUG System.err.println("Processing i'th sequence in Stockholm file")
 156       RNA current = result.get(i);
 157
 158       String seq = current.getSeq();
 159       String rna = current.getStructDBN(true);
 160       // DEBUG System.out.println(seq);
 161       // DEBUG System.err.println(rna);
 162       int begin = 0;
 163       int end = seq.length() - 1;
 164       id = safeName(getDataName());
 165       seqs[i] = new Sequence(id, seq, begin, end);
 166       String[] annot = new String[rna.length()];
 167       Annotation[] ann = new Annotation[rna.length()];
 168       for (int j = 0; j < rna.length(); j++)
 169       {
 170         annot[j] = rna.substring(j, j + 1);
 171
 172       }
 173
 174       for (int k = 0; k < rna.length(); k++)
 175       {
 176         ann[k] = new Annotation(annot[k], "",
 177                 Rna.getRNASecStrucState(annot[k]).charAt(0), 0f);
 178
 179       }
 180       AlignmentAnnotation align = new AlignmentAnnotation("Sec. str.",
 181               current.getID(), ann);
 182
 183       seqs[i].addAlignmentAnnotation(align);
 184       seqs[i].setRNA(result.get(i));
 185       this.annotations.addElement(align);
 186     }
 187     this.setSeqs(seqs);
 188
 189   }
 190
 191   /**
 192    * Parse a file in Stockholm format into Jalview's data model. The file has to
 193    * be passed at construction time
 194    *
 195    * @throws IOException
 196    *           If there is an error with the input file
 197    */
 198   @Override
 199   public void parse() throws IOException
 200   {
 201     StringBuffer treeString = new StringBuffer();
 202     String treeName = null;
 203     // --------------- Variable Definitions -------------------
 204     String line;
 205     String version;
 206     // String id;
 207     Hashtable seqAnn = new Hashtable(); // Sequence related annotations
 208     LinkedHashMap<String, String> seqs = new LinkedHashMap<>();
 209     Regex p, r, rend, s, x;
 210     // Temporary line for processing RNA annotation
 211     // String RNAannot = "";
 212
 213     // ------------------ Parsing File ----------------------
 214     // First, we have to check that this file has STOCKHOLM format, i.e. the
 215     // first line must match
 216
 217     r = new Regex("# STOCKHOLM ([\\d\\.]+)");
 218     if (!r.search(nextLine()))
 219     {
 220       throw new IOException(MessageManager
 221               .getString("exception.stockholm_invalid_format"));
 222     }
 223     else
 224     {
 225       version = r.stringMatched(1);
 226
 227       // logger.debug("Stockholm version: " + version);
 228     }
 229
 230     // We define some Regexes here that will be used regularily later
 231     rend = new Regex("^\\s*\\/\\/"); // Find the end of an alignment
 232     p = new Regex("(\\S+)\\/(\\d+)\\-(\\d+)"); // split sequence id in
 233     // id/from/to
 234     s = new Regex("(\\S+)\\s+(\\S*)\\s+(.*)"); // Parses annotation subtype
 235     r = new Regex("#=(G[FSRC]?)\\s+(.*)"); // Finds any annotation line
 236     x = new Regex("(\\S+)\\s+(\\S+)"); // split id from sequence
 237
 238     // Convert all bracket types to parentheses (necessary for passing to VARNA)
 239     Regex openparen = new Regex("(<|\\[)", "(");
 240     Regex closeparen = new Regex("(>|\\])", ")");
 241
 242     // Detect if file is RNA by looking for bracket types
 243     Regex detectbrackets = new Regex("(<|>|\\[|\\]|\\(|\\))");
 244
 245     rend.optimize();
 246     p.optimize();
 247     s.optimize();
 248     r.optimize();
 249     x.optimize();
 250     openparen.optimize();
 251     closeparen.optimize();
 252
 253     while ((line = nextLine()) != null)
 254     {
 255       if (line.length() == 0)
 256       {
 257         continue;
 258       }
 259       if (rend.search(line))
 260       {
 261         // End of the alignment, pass stuff back
 262         this.noSeqs = seqs.size();
 263
 264         String seqdb, dbsource = null;
 265         Regex pf = new Regex("PF[0-9]{5}(.*)"); // Finds AC for Pfam
 266         Regex rf = new Regex("RF[0-9]{5}(.*)"); // Finds AC for Rfam
 267         if (getAlignmentProperty("AC") != null)
 268         {
 269           String dbType = getAlignmentProperty("AC").toString();
 270           if (pf.search(dbType))
 271           {
 272             // PFAM Alignment - so references are typically from Uniprot
 273             dbsource = "PFAM";
 274           }
 275           else if (rf.search(dbType))
 276           {
 277             dbsource = "RFAM";
 278           }
 279         }
 280         // logger.debug("Number of sequences: " + this.noSeqs);
 281         for (Map.Entry<String, String> skey : seqs.entrySet())
 282         {
 283           // logger.debug("Processing sequence " + acc);
 284           String acc = skey.getKey();
 285           String seq = skey.getValue();
 286           if (maxLength < seq.length())
 287           {
 288             maxLength = seq.length();
 289           }
 290           int start = 1;
 291           int end = -1;
 292           String sid = acc;
 293           /*
 294            * Retrieve hash of annotations for this accession Associate
 295            * Annotation with accession
 296            */
 297           Hashtable accAnnotations = null;
 298
 299           if (seqAnn != null && seqAnn.containsKey(acc))
 300           {
 301             accAnnotations = (Hashtable) seqAnn.remove(acc);
 302             // TODO: add structures to sequence
 303           }
 304
 305           // Split accession in id and from/to
 306           if (p.search(acc))
 307           {
 308             sid = p.stringMatched(1);
 309             start = Integer.parseInt(p.stringMatched(2));
 310             end = Integer.parseInt(p.stringMatched(3));
 311           }
 312           // logger.debug(sid + ", " + start + ", " + end);
 313
 314           Sequence seqO = new Sequence(sid, seq, start, end);
 315           // Add Description (if any)
 316           if (accAnnotations != null && accAnnotations.containsKey("DE"))
 317           {
 318             String desc = (String) accAnnotations.get("DE");
 319             seqO.setDescription((desc == null) ? "" : desc);
 320           }
 321           // Add DB References (if any)
 322           if (accAnnotations != null && accAnnotations.containsKey("DR"))
 323           {
 324             String dbr = (String) accAnnotations.get("DR");
 325             if (dbr != null && dbr.indexOf(";") > -1)
 326             {
 327               String src = dbr.substring(0, dbr.indexOf(";"));
 328               String acn = dbr.substring(dbr.indexOf(";") + 1);
 329               jalview.util.DBRefUtils.parseToDbRef(seqO, src, "0", acn);
 330             }
 331           }
 332
 333           if (accAnnotations != null && accAnnotations.containsKey("AC"))
 334           {
 335             if (dbsource != null)
 336             {
 337               String dbr = (String) accAnnotations.get("AC");
 338               if (dbr != null)
 339               {
 340                 // we could get very clever here - but for now - just try to
 341                 // guess accession type from source of alignment plus structure
 342                 // of accession
 343                 guessDatabaseFor(seqO, dbr, dbsource);
 344
 345               }
 346             }
 347             // else - do what ? add the data anyway and prompt the user to
 348             // specify what references these are ?
 349           }
 350
 351           Hashtable features = null;
 352           // We need to adjust the positions of all features to account for gaps
 353           try
 354           {
 355             features = (Hashtable) accAnnotations.remove("features");
 356           } catch (java.lang.NullPointerException e)
 357           {
 358             // loggerwarn("Getting Features for " + acc + ": " +
 359             // e.getMessage());
 360             // continue;
 361           }
 362           // if we have features
 363           if (features != null)
 364           {
 365             int posmap[] = seqO.findPositionMap();
 366             Enumeration i = features.keys();
 367             while (i.hasMoreElements())
 368             {
 369               // TODO: parse out secondary structure annotation as annotation
 370               // row
 371               // TODO: parse out scores as annotation row
 372               // TODO: map coding region to core jalview feature types
 373               String type = i.nextElement().toString();
 374               Hashtable content = (Hashtable) features.remove(type);
 375
 376               // add alignment annotation for this feature
 377               String key = type2id(type);
 378
 379               /*
 380                * have we added annotation rows for this type ?
 381                */
 382               boolean annotsAdded = false;
 383               if (key != null)
 384               {
 385                 if (accAnnotations != null
 386                         && accAnnotations.containsKey(key))
 387                 {
 388                   Vector vv = (Vector) accAnnotations.get(key);
 389                   for (int ii = 0; ii < vv.size(); ii++)
 390                   {
 391                     annotsAdded = true;
 392                     AlignmentAnnotation an = (AlignmentAnnotation) vv
 393                             .elementAt(ii);
 394                     seqO.addAlignmentAnnotation(an);
 395                     annotations.add(an);
 396                   }
 397                 }
 398               }
 399
 400               Enumeration j = content.keys();
 401               while (j.hasMoreElements())
 402               {
 403                 String desc = j.nextElement().toString();
 404                 if (ANNOTATION.equals(desc) && annotsAdded)
 405                 {
 406                   // don't add features if we already added an annotation row
 407                   continue;
 408                 }
 409                 String ns = content.get(desc).toString();
 410                 char[] byChar = ns.toCharArray();
 411                 for (int k = 0; k < byChar.length; k++)
 412                 {
 413                   char c = byChar[k];
 414                   if (!(c == ' ' || c == '_' || c == '-' || c == '.')) // PFAM
 415                   // uses
 416                   // '.'
 417                   // for
 418                   // feature
 419                   // background
 420                   {
 421                     int new_pos = posmap[k]; // look up nearest seqeunce
 422                     // position to this column
 423                     SequenceFeature feat = new SequenceFeature(type, desc,
 424                             new_pos, new_pos, null);
 425
 426                     seqO.addSequenceFeature(feat);
 427                   }
 428                 }
 429               }
 430
 431             }
 432
 433           }
 434           // garbage collect
 435
 436           // logger.debug("Adding seq " + acc + " from " + start + " to " + end
 437           // + ": " + seq);
 438           this.seqs.addElement(seqO);
 439         }
 440         return; // finished parsing this segment of source
 441       }
 442       else if (!r.search(line))
 443       {
 444         // System.err.println("Found sequence line: " + line);
 445
 446         // Split sequence in sequence and accession parts
 447         if (!x.search(line))
 448         {
 449           // logger.error("Could not parse sequence line: " + line);
 450           throw new IOException(MessageManager.formatMessage(
 451                   "exception.couldnt_parse_sequence_line", new String[]
 452                   { line }));
 453         }
 454         String ns = seqs.get(x.stringMatched(1));
 455         if (ns == null)
 456         {
 457           ns = "";
 458         }
 459         ns += x.stringMatched(2);
 460
 461         seqs.put(x.stringMatched(1), ns);
 462       }
 463       else
 464       {
 465         String annType = r.stringMatched(1);
 466         String annContent = r.stringMatched(2);
 467
 468         // System.err.println("type:" + annType + " content: " + annContent);
 469
 470         if (annType.equals("GF"))
 471         {
 472           /*
 473            * Generic per-File annotation, free text Magic features: #=GF NH
 474            * <tree in New Hampshire eXtended format> #=GF TN <Unique identifier
 475            * for the next tree> Pfam descriptions: 7. DESCRIPTION OF FIELDS
 476            *
 477            * Compulsory fields: ------------------
 478            *
 479            * AC Accession number: Accession number in form PFxxxxx.version or
 480            * PBxxxxxx. ID Identification: One word name for family. DE
 481            * Definition: Short description of family. AU Author: Authors of the
 482            * entry. SE Source of seed: The source suggesting the seed members
 483            * belong to one family. GA Gathering method: Search threshold to
 484            * build the full alignment. TC Trusted Cutoff: Lowest sequence score
 485            * and domain score of match in the full alignment. NC Noise Cutoff:
 486            * Highest sequence score and domain score of match not in full
 487            * alignment. TP Type: Type of family -- presently Family, Domain,
 488            * Motif or Repeat. SQ Sequence: Number of sequences in alignment. AM
 489            * Alignment Method The order ls and fs hits are aligned to the model
 490            * to build the full align. // End of alignment.
 491            *
 492            * Optional fields: ----------------
 493            *
 494            * DC Database Comment: Comment about database reference. DR Database
 495            * Reference: Reference to external database. RC Reference Comment:
 496            * Comment about literature reference. RN Reference Number: Reference
 497            * Number. RM Reference Medline: Eight digit medline UI number. RT
 498            * Reference Title: Reference Title. RA Reference Author: Reference
 499            * Author RL Reference Location: Journal location. PI Previous
 500            * identifier: Record of all previous ID lines. KW Keywords: Keywords.
 501            * CC Comment: Comments. NE Pfam accession: Indicates a nested domain.
 502            * NL Location: Location of nested domains - sequence ID, start and
 503            * end of insert.
 504            *
 505            * Obsolete fields: ----------- AL Alignment method of seed: The
 506            * method used to align the seed members.
 507            */
 508           // Let's save the annotations, maybe we'll be able to do something
 509           // with them later...
 510           Regex an = new Regex("(\\w+)\\s*(.*)");
 511           if (an.search(annContent))
 512           {
 513             if (an.stringMatched(1).equals("NH"))
 514             {
 515               treeString.append(an.stringMatched(2));
 516             }
 517             else if (an.stringMatched(1).equals("TN"))
 518             {
 519               if (treeString.length() > 0)
 520               {
 521                 if (treeName == null)
 522                 {
 523                   treeName = "Tree " + (getTreeCount() + 1);
 524                 }
 525                 addNewickTree(treeName, treeString.toString());
 526               }
 527               treeName = an.stringMatched(2);
 528               treeString = new StringBuffer();
 529             }
 530             setAlignmentProperty(an.stringMatched(1), an.stringMatched(2));
 531           }
 532         }
 533         else if (annType.equals("GS"))
 534         {
 535           // Generic per-Sequence annotation, free text
 536           /*
 537            * Pfam uses these features: Feature Description ---------------------
 538            * ----------- AC <accession> ACcession number DE <freetext>
 539            * DEscription DR <db>; <accession>; Database Reference OS <organism>
 540            * OrganiSm (species) OC <clade> Organism Classification (clade, etc.)
 541            * LO <look> Look (Color, etc.)
 542            */
 543           if (s.search(annContent))
 544           {
 545             String acc = s.stringMatched(1);
 546             String type = s.stringMatched(2);
 547             String content = s.stringMatched(3);
 548             // TODO: store DR in a vector.
 549             // TODO: store AC according to generic file db annotation.
 550             Hashtable ann;
 551             if (seqAnn.containsKey(acc))
 552             {
 553               ann = (Hashtable) seqAnn.get(acc);
 554             }
 555             else
 556             {
 557               ann = new Hashtable();
 558             }
 559             ann.put(type, content);
 560             seqAnn.put(acc, ann);
 561           }
 562           else
 563           {
 564             // throw new IOException("Error parsing " + line);
 565             System.err.println(">> missing annotation: " + line);
 566           }
 567         }
 568         else if (annType.equals("GC"))
 569         {
 570           // Generic per-Column annotation, exactly 1 char per column
 571           // always need a label.
 572           if (x.search(annContent))
 573           {
 574             // parse out and create alignment annotation directly.
 575             parseAnnotationRow(annotations, x.stringMatched(1),
 576                     x.stringMatched(2));
 577           }
 578         }
 579         else if (annType.equals("GR"))
 580         {
 581           // Generic per-Sequence AND per-Column markup, exactly 1 char per
 582           // column
 583           /*
 584            * Feature Description Markup letters ------- -----------
 585            * -------------- SS Secondary Structure [HGIEBTSCX] SA Surface
 586            * Accessibility [0-9X] (0=0%-10%; ...; 9=90%-100%) TM TransMembrane
 587            * [Mio] PP Posterior Probability [0-9*] (0=0.00-0.05; 1=0.05-0.15;
 588            * *=0.95-1.00) LI LIgand binding [*] AS Active Site [*] IN INtron (in
 589            * or after) [0-2]
 590            */
 591           if (s.search(annContent))
 592           {
 593             String acc = s.stringMatched(1);
 594             String type = s.stringMatched(2);
 595             String oseq = s.stringMatched(3);
 596             /*
 597              * copy of annotation field that may be processed into whitespace chunks
 598              */
 599             String seq = new String(oseq);
 600
 601             Hashtable ann;
 602             // Get an object with all the annotations for this sequence
 603             if (seqAnn.containsKey(acc))
 604             {
 605               // logger.debug("Found annotations for " + acc);
 606               ann = (Hashtable) seqAnn.get(acc);
 607             }
 608             else
 609             {
 610               // logger.debug("Creating new annotations holder for " + acc);
 611               ann = new Hashtable();
 612               seqAnn.put(acc, ann);
 613             }
 614
 615             // // start of block for appending annotation lines for wrapped
 616             // stokchholm file
 617             // TODO test structure, call parseAnnotationRow with vector from
 618             // hashtable for specific sequence
 619
 620             Hashtable features;
 621             // Get an object with all the content for an annotation
 622             if (ann.containsKey("features"))
 623             {
 624               // logger.debug("Found features for " + acc);
 625               features = (Hashtable) ann.get("features");
 626             }
 627             else
 628             {
 629               // logger.debug("Creating new features holder for " + acc);
 630               features = new Hashtable();
 631               ann.put("features", features);
 632             }
 633
 634             Hashtable content;
 635             if (features.containsKey(this.id2type(type)))
 636             {
 637               // logger.debug("Found content for " + this.id2type(type));
 638               content = (Hashtable) features.get(this.id2type(type));
 639             }
 640             else
 641             {
 642               // logger.debug("Creating new content holder for " +
 643               // this.id2type(type));
 644               content = new Hashtable();
 645               features.put(this.id2type(type), content);
 646             }
 647             String ns = (String) content.get(ANNOTATION);
 648
 649             if (ns == null)
 650             {
 651               ns = "";
 652             }
 653             // finally, append the annotation line
 654             ns += seq;
 655             content.put(ANNOTATION, ns);
 656             // // end of wrapped annotation block.
 657             // // Now a new row is created with the current set of data
 658
 659             Hashtable strucAnn;
 660             if (seqAnn.containsKey(acc))
 661             {
 662               strucAnn = (Hashtable) seqAnn.get(acc);
 663             }
 664             else
 665             {
 666               strucAnn = new Hashtable();
 667             }
 668
 669             Vector<AlignmentAnnotation> newStruc = new Vector<>();
 670             parseAnnotationRow(newStruc, type, ns);
 671             for (AlignmentAnnotation alan : newStruc)
 672             {
 673               alan.visible = false;
 674             }
 675             // new annotation overwrites any existing annotation...
 676
 677             strucAnn.put(type, newStruc);
 678             seqAnn.put(acc, strucAnn);
 679           }
 680           // }
 681           else
 682           {
 683             System.err.println(
 684                     "Warning - couldn't parse sequence annotation row line:\n"
 685                             + line);
 686             // throw new IOException("Error parsing " + line);
 687           }
 688         }
 689         else
 690         {
 691           throw new IOException(MessageManager.formatMessage(
 692                   "exception.unknown_annotation_detected", new String[]
 693                   { annType, annContent }));
 694         }
 695       }
 696     }
 697     if (treeString.length() > 0)
 698     {
 699       if (treeName == null)
 700       {
 701         treeName = "Tree " + (1 + getTreeCount());
 702       }
 703       addNewickTree(treeName, treeString.toString());
 704     }
 705   }
 706
 707   /**
 708    * Demangle an accession string and guess the originating sequence database
 709    * for a given sequence
 710    *
 711    * @param seqO
 712    *          sequence to be annotated
 713    * @param dbr
 714    *          Accession string for sequence
 715    * @param dbsource
 716    *          source database for alignment (PFAM or RFAM)
 717    */
 718   private void guessDatabaseFor(Sequence seqO, String dbr, String dbsource)
 719   {
 720     DBRefEntry dbrf = null;
 721     List<DBRefEntry> dbrs = new ArrayList<>();
 722     String seqdb = "Unknown", sdbac = "" + dbr;
 723     int st = -1, en = -1, p;
 724     if ((st = sdbac.indexOf("/")) > -1)
 725     {
 726       String num, range = sdbac.substring(st + 1);
 727       sdbac = sdbac.substring(0, st);
 728       if ((p = range.indexOf("-")) > -1)
 729       {
 730         p++;
 731         if (p < range.length())
 732         {
 733           num = range.substring(p).trim();
 734           try
 735           {
 736             en = Integer.parseInt(num);
 737           } catch (NumberFormatException x)
 738           {
 739             // could warn here that index is invalid
 740             en = -1;
 741           }
 742         }
 743       }
 744       else
 745       {
 746         p = range.length();
 747       }
 748       num = range.substring(0, p).trim();
 749       try
 750       {
 751         st = Integer.parseInt(num);
 752       } catch (NumberFormatException x)
 753       {
 754         // could warn here that index is invalid
 755         st = -1;
 756       }
 757     }
 758     if (dbsource.equals("PFAM"))
 759     {
 760       seqdb = "UNIPROT";
 761       if (sdbac.indexOf(".") > -1)
 762       {
 763         // strip of last subdomain
 764         sdbac = sdbac.substring(0, sdbac.indexOf("."));
 765         dbrf = jalview.util.DBRefUtils.parseToDbRef(seqO, seqdb, dbsource,
 766                 sdbac);
 767         if (dbrf != null)
 768         {
 769           dbrs.add(dbrf);
 770         }
 771       }
 772       dbrf = jalview.util.DBRefUtils.parseToDbRef(seqO, dbsource, dbsource,
 773               dbr);
 774       if (dbr != null)
 775       {
 776         dbrs.add(dbrf);
 777       }
 778     }
 779     else
 780     {
 781       seqdb = "EMBL"; // total guess - could be ENA, or something else these
 782                       // days
 783       if (sdbac.indexOf(".") > -1)
 784       {
 785         // strip off last subdomain
 786         sdbac = sdbac.substring(0, sdbac.indexOf("."));
 787         dbrf = jalview.util.DBRefUtils.parseToDbRef(seqO, seqdb, dbsource,
 788                 sdbac);
 789         if (dbrf != null)
 790         {
 791           dbrs.add(dbrf);
 792         }
 793       }
 794
 795       dbrf = jalview.util.DBRefUtils.parseToDbRef(seqO, dbsource, dbsource,
 796               dbr);
 797       if (dbrf != null)
 798       {
 799         dbrs.add(dbrf);
 800       }
 801     }
 802     if (st != -1 && en != -1)
 803     {
 804       for (DBRefEntry d : dbrs)
 805       {
 806         jalview.util.MapList mp = new jalview.util.MapList(
 807                 new int[]
 808                 { seqO.getStart(), seqO.getEnd() }, new int[] { st, en }, 1,
 809                 1);
 810         jalview.datamodel.Mapping mping = new Mapping(mp);
 811         d.setMap(mping);
 812       }
 813     }
 814   }
 815
 816   protected static AlignmentAnnotation parseAnnotationRow(
 817           Vector<AlignmentAnnotation> annotation, String label,
 818           String annots)
 819   {
 820     String convert1, convert2 = null;
 821
 822     // convert1 = OPEN_PAREN.replaceAll(annots);
 823     // convert2 = CLOSE_PAREN.replaceAll(convert1);
 824     // annots = convert2;
 825
 826     String type = label;
 827     if (label.contains("_cons"))
 828     {
 829       type = (label.indexOf("_cons") == label.length() - 5)
 830               ? label.substring(0, label.length() - 5)
 831               : label;
 832     }
 833     boolean ss = false, posterior = false;
 834     type = id2type(type);
 835
 836     boolean isrnass = false;
 837     if (type.equalsIgnoreCase("secondary structure"))
 838     {
 839       ss = true;
 840       isrnass = !NOT_RNASS.search(annots); // sorry about the double negative
 841                                            // here (it's easier for dealing with
 842                                            // other non-alpha-non-brace chars)
 843     }
 844     if (type.equalsIgnoreCase("posterior probability"))
 845     {
 846       posterior = true;
 847     }
 848     // decide on secondary structure or not.
 849     Annotation[] els = new Annotation[annots.length()];
 850     for (int i = 0; i < annots.length(); i++)
 851     {
 852       String pos = annots.substring(i, i + 1);
 853       Annotation ann;
 854       ann = new Annotation(pos, "", ' ', 0f); // 0f is 'valid' null - will not
 855       // be written out
 856       if (ss)
 857       {
 858         // if (" .-_".indexOf(pos) == -1)
 859         {
 860           if (isrnass && RNASS_BRACKETS.indexOf(pos) >= 0)
 861           {
 862             ann.secondaryStructure = Rna.getRNASecStrucState(pos).charAt(0);
 863             ann.displayCharacter = "" + pos.charAt(0);
 864           }
 865           else
 866           {
 867             ann.secondaryStructure = ResidueProperties.getDssp3state(pos)
 868                     .charAt(0);
 869
 870             if (ann.secondaryStructure == pos.charAt(0))
 871             {
 872               ann.displayCharacter = ""; // null; // " ";
 873             }
 874             else
 875             {
 876               ann.displayCharacter = " " + ann.displayCharacter;
 877             }
 878           }
 879         }
 880
 881       }
 882       if (posterior && !ann.isWhitespace()
 883               && !Comparison.isGap(pos.charAt(0)))
 884       {
 885         float val = 0;
 886         // symbol encodes values - 0..*==0..10
 887         if (pos.charAt(0) == '*')
 888         {
 889           val = 10;
 890         }
 891         else
 892         {
 893           val = pos.charAt(0) - '0';
 894           if (val > 9)
 895           {
 896             val = 10;
 897           }
 898         }
 899         ann.value = val;
 900       }
 901
 902       els[i] = ann;
 903     }
 904     AlignmentAnnotation annot = null;
 905     Enumeration<AlignmentAnnotation> e = annotation.elements();
 906     while (e.hasMoreElements())
 907     {
 908       annot = e.nextElement();
 909       if (annot.label.equals(type))
 910       {
 911         break;
 912       }
 913       annot = null;
 914     }
 915     if (annot == null)
 916     {
 917       annot = new AlignmentAnnotation(type, type, els);
 918       annotation.addElement(annot);
 919     }
 920     else
 921     {
 922       Annotation[] anns = new Annotation[annot.annotations.length
 923               + els.length];
 924       System.arraycopy(annot.annotations, 0, anns, 0,
 925               annot.annotations.length);
 926       System.arraycopy(els, 0, anns, annot.annotations.length, els.length);
 927       annot.annotations = anns;
 928       // System.out.println("else: ");
 929     }
 930     return annot;
 931   }
 932
 933   @Override
 934   public String print(SequenceI[] s, boolean jvSuffix)
 935   {
 936     out = new StringBuffer();
 937     out.append("# STOCKHOLM 1.0");
 938     out.append(newline);
 939
 940     // find max length of id
 941     int max = 0;
 942     int maxid = 0;
 943     int in = 0;
 944     Hashtable dataRef = null;
 945     while ((in < s.length) && (s[in] != null))
 946     {
 947       String tmp = printId(s[in], jvSuffix);
 948       max = Math.max(max, s[in].getLength());
 949
 950       if (tmp.length() > maxid)
 951       {
 952         maxid = tmp.length();
 953       }
 954       if (s[in].getDBRefs() != null)
 955       {
 956         for (int idb = 0; idb < s[in].getDBRefs().length; idb++)
 957         {
 958           if (dataRef == null)
 959           {
 960             dataRef = new Hashtable();
 961           }
 962
 963           String datAs1 = s[in].getDBRefs()[idb].getSource().toString()
 964                   + " ; "
 965                   + s[in].getDBRefs()[idb].getAccessionId().toString();
 966           dataRef.put(tmp, datAs1);
 967         }
 968       }
 969       in++;
 970     }
 971     maxid += 9;
 972     int i = 0;
 973
 974     // output database type
 975     if (al.getProperties() != null)
 976     {
 977       if (!al.getProperties().isEmpty())
 978       {
 979         Enumeration key = al.getProperties().keys();
 980         Enumeration val = al.getProperties().elements();
 981         while (key.hasMoreElements())
 982         {
 983           out.append("#=GF " + key.nextElement() + " " + val.nextElement());
 984           out.append(newline);
 985         }
 986       }
 987     }
 988
 989     // output database accessions
 990     if (dataRef != null)
 991     {
 992       Enumeration en = dataRef.keys();
 993       while (en.hasMoreElements())
 994       {
 995         Object idd = en.nextElement();
 996         String type = (String) dataRef.remove(idd);
 997         out.append(new Format("%-" + (maxid - 2) + "s")
 998                 .form("#=GS " + idd.toString() + " "));
 999         if (type.contains("PFAM") || type.contains("RFAM"))
1000         {
1001
1002           out.append(" AC " + type.substring(type.indexOf(";") + 1));
1003         }
1004         else
1005         {
1006           out.append(" DR " + type + " ");
1007         }
1008         out.append(newline);
1009       }
1010     }
1011
1012     // output annotations
1013     while (i < s.length && s[i] != null)
1014     {
1015       AlignmentAnnotation[] alAnot = s[i].getAnnotation();
1016       if (alAnot != null)
1017       {
1018         Annotation[] ann;
1019         for (int j = 0; j < alAnot.length; j++)
1020         {
1021
1022           String key = type2id(alAnot[j].label);
1023           boolean isrna = alAnot[j].isValidStruc();
1024
1025           if (isrna)
1026           {
1027             // hardwire to secondary structure if there is RNA secondary
1028             // structure on the annotation
1029             key = "SS";
1030           }
1031           if (key == null)
1032           {
1033
1034             continue;
1035           }
1036
1037           // out.append("#=GR ");
1038           out.append(new Format("%-" + maxid + "s").form(
1039                   "#=GR " + printId(s[i], jvSuffix) + " " + key + " "));
1040           ann = alAnot[j].annotations;
1041           String seq = "";
1042           for (int k = 0; k < ann.length; k++)
1043           {
1044             seq += outputCharacter(key, k, isrna, ann, s[i]);
1045           }
1046           out.append(seq);
1047           out.append(newline);
1048         }
1049       }
1050
1051       out.append(new Format("%-" + maxid + "s")
1052               .form(printId(s[i], jvSuffix) + " "));
1053       out.append(s[i].getSequenceAsString());
1054       out.append(newline);
1055       i++;
1056     }
1057
1058     // alignment annotation
1059     AlignmentAnnotation aa;
1060     if (al.getAlignmentAnnotation() != null)
1061     {
1062       for (int ia = 0; ia < al.getAlignmentAnnotation().length; ia++)
1063       {
1064         aa = al.getAlignmentAnnotation()[ia];
1065         if (aa.autoCalculated || !aa.visible || aa.sequenceRef != null)
1066         {
1067           continue;
1068         }
1069         String seq = "";
1070         String label;
1071         String key = "";
1072         if (aa.label.equals("seq"))
1073         {
1074           label = "seq_cons";
1075         }
1076         else
1077         {
1078           key = type2id(aa.label.toLowerCase());
1079           if (key == null)
1080           {
1081             label = aa.label;
1082           }
1083           else
1084           {
1085             label = key + "_cons";
1086           }
1087         }
1088         if (label == null)
1089         {
1090           label = aa.label;
1091         }
1092         label = label.replace(" ", "_");
1093
1094         out.append(
1095                 new Format("%-" + maxid + "s").form("#=GC " + label + " "));
1096         boolean isrna = aa.isValidStruc();
1097         for (int j = 0; j < aa.annotations.length; j++)
1098         {
1099           seq += outputCharacter(key, j, isrna, aa.annotations, null);
1100         }
1101         out.append(seq);
1102         out.append(newline);
1103       }
1104     }
1105
1106     out.append("//");
1107     out.append(newline);
1108
1109     return out.toString();
1110   }
1111
1112   /**
1113    * add an annotation character to the output row
1114    *
1115    * @param seq
1116    * @param key
1117    * @param k
1118    * @param isrna
1119    * @param ann
1120    * @param sequenceI
1121    */
1122   private char outputCharacter(String key, int k, boolean isrna,
1123           Annotation[] ann, SequenceI sequenceI)
1124   {
1125     char seq = ' ';
1126     Annotation annot = ann[k];
1127     String ch = (annot == null)
1128             ? ((sequenceI == null) ? "-"
1129                     : Character.toString(sequenceI.getCharAt(k)))
1130             : (annot.displayCharacter == null
1131                     ? String.valueOf(annot.secondaryStructure)
1132                     : annot.displayCharacter);
1133     if (ch == null)
1134     {
1135       ch = " ";
1136     }
1137     if (key != null && key.equals("SS"))
1138     {
1139       char ssannotchar = ' ';
1140       boolean charset = false;
1141       if (annot == null)
1142       {
1143         // sensible gap character
1144         ssannotchar = ' ';
1145         charset = true;
1146       }
1147       else
1148       {
1149         // valid secondary structure AND no alternative label (e.g. ' B')
1150         if (annot.secondaryStructure > ' ' && ch.length() < 2)
1151         {
1152           ssannotchar = annot.secondaryStructure;
1153           charset = true;
1154         }
1155       }
1156       if (charset)
1157       {
1158         return (ssannotchar == ' ' && isrna) ? '.' : ssannotchar;
1159       }
1160     }
1161
1162     if (ch.length() == 0)
1163     {
1164       seq = '.';
1165     }
1166     else if (ch.length() == 1)
1167     {
1168       seq = ch.charAt(0);
1169     }
1170     else if (ch.length() > 1)
1171     {
1172       seq = ch.charAt(1);
1173     }
1174
1175     return (seq == ' ' && key != null && key.equals("SS") && isrna) ? '.'
1176             : seq;
1177   }
1178
1179   public String print()
1180   {
1181     out = new StringBuffer();
1182     out.append("# STOCKHOLM 1.0");
1183     out.append(newline);
1184     print(getSeqsAsArray(), false);
1185
1186     out.append("//");
1187     out.append(newline);
1188     return out.toString();
1189   }
1190
1191   private static Hashtable typeIds = null;
1192
1193   static
1194   {
1195     if (typeIds == null)
1196     {
1197       typeIds = new Hashtable();
1198       typeIds.put("SS", "Secondary Structure");
1199       typeIds.put("SA", "Surface Accessibility");
1200       typeIds.put("TM", "transmembrane");
1201       typeIds.put("PP", "Posterior Probability");
1202       typeIds.put("LI", "ligand binding");
1203       typeIds.put("AS", "active site");
1204       typeIds.put("IN", "intron");
1205       typeIds.put("IR", "interacting residue");
1206       typeIds.put("AC", "accession");
1207       typeIds.put("OS", "organism");
1208       typeIds.put("CL", "class");
1209       typeIds.put("DE", "description");
1210       typeIds.put("DR", "reference");
1211       typeIds.put("LO", "look");
1212       typeIds.put("RF", "Reference Positions");
1213
1214     }
1215   }
1216
1217   protected static String id2type(String id)
1218   {
1219     if (typeIds.containsKey(id))
1220     {
1221       return (String) typeIds.get(id);
1222     }
1223     System.err.println(
1224             "Warning : Unknown Stockholm annotation type code " + id);
1225     return id;
1226   }
1227
1228   protected static String type2id(String type)
1229   {
1230     String key = null;
1231     Enumeration e = typeIds.keys();
1232     while (e.hasMoreElements())
1233     {
1234       Object ll = e.nextElement();
1235       if (typeIds.get(ll).toString().equalsIgnoreCase(type))
1236       {
1237         key = (String) ll;
1238         break;
1239       }
1240     }
1241     if (key != null)
1242     {
1243       return key;
1244     }
1245     System.err.println(
1246             "Warning : Unknown Stockholm annotation type: " + type);
1247     return key;
1248   }
1249
1250   /**
1251    * make a friendly ID string.
1252    *
1253    * @param dataName
1254    * @return truncated dataName to after last '/'
1255    */
1256   private String safeName(String dataName)
1257   {
1258     int b = 0;
1259     while ((b = dataName.indexOf("/")) > -1 && b < dataName.length())
1260     {
1261       dataName = dataName.substring(b + 1).trim();
1262
1263     }
1264     int e = (dataName.length() - dataName.indexOf(".")) + 1;
1265     dataName = dataName.substring(1, e).trim();
1266     return dataName;
1267   }
1268 }