src/jalview/io/StockholmFile.java

   1 /*
   2  * Jalview - A Sequence Alignment Editor and Viewer ($$Version-Rel$$)
   3  * Copyright (C) $$Year-Rel$$ The Jalview Authors
   4  *
   5  * This file is part of Jalview.
   6  *
   7  * Jalview is free software: you can redistribute it and/or
   8  * modify it under the terms of the GNU General Public License
   9  * as published by the Free Software Foundation, either version 3
  10  * of the License, or (at your option) any later version.
  11  *
  12  * Jalview is distributed in the hope that it will be useful, but
  13  * WITHOUT ANY WARRANTY; without even the implied warranty
  14  * of MERCHANTABILITY or FITNESS FOR A PARTICULAR
  15  * PURPOSE.  See the GNU General Public License for more details.
  16  *
  17  * You should have received a copy of the GNU General Public License
  18  * along with Jalview.  If not, see <http://www.gnu.org/licenses/>.
  19  * The Jalview Authors are detailed in the 'AUTHORS' file.
  20  */
  21 /*
  22  * This extension was written by Benjamin Schuster-Boeckler at sanger.ac.uk
  23  */
  24 package jalview.io;
  25
  26 import jalview.analysis.Rna;
  27 import jalview.datamodel.AlignmentAnnotation;
  28 import jalview.datamodel.AlignmentI;
  29 import jalview.datamodel.Annotation;
  30 import jalview.datamodel.DBRefEntry;
  31 import jalview.datamodel.Mapping;
  32 import jalview.datamodel.Sequence;
  33 import jalview.datamodel.SequenceFeature;
  34 import jalview.datamodel.SequenceI;
  35 import jalview.schemes.ResidueProperties;
  36 import jalview.util.Comparison;
  37 import jalview.util.Format;
  38 import jalview.util.MessageManager;
  39 import jalview.util.Platform;
  40
  41 import java.io.BufferedReader;
  42 import java.io.FileReader;
  43 import java.io.IOException;
  44 import java.util.ArrayList;
  45 import java.util.Enumeration;
  46 import java.util.Hashtable;
  47 import java.util.LinkedHashMap;
  48 import java.util.List;
  49 import java.util.Map;
  50 import java.util.Vector;
  51
  52 import com.stevesoft.pat.Regex;
  53
  54 import fr.orsay.lri.varna.exceptions.ExceptionUnmatchedClosingParentheses;
  55 import fr.orsay.lri.varna.factories.RNAFactory;
  56 import fr.orsay.lri.varna.models.rna.RNA;
  57
  58 // import org.apache.log4j.*;
  59
  60 /**
   61  * This class parses a Stockholm format file into Jalview. There are TODOs
   62  * in this class: we do not know what the database source and version is for
   63  * the file when parsing the #=GS AC tag which associates accessions with
   64  * sequences. Database references are also not parsed correctly: a separate
   65  * reference string parser must be added to parse the database reference form
   66  * into Jalview's local representation.
  67  *
  68  * @author bsb at sanger.ac.uk
  69  * @author Natasha Shersnev (Dundee, UK) (Stockholm file writer)
  70  * @author Lauren Lui (UCSC, USA) (RNA secondary structure annotation import as
  71  *         stockholm)
  72  * @author Anne Menard (Paris, FR) (VARNA parsing of Stockholm file data)
  73  * @version 0.3 + jalview mods
  74  *
  75  */
  76 public class StockholmFile extends AlignFile
  77 {
  78   private static final String ANNOTATION = "annotation";
  79
  80   // WUSS extended symbols. Avoid ambiguity with protein SS annotations by using
  81   // NOT_RNASS first.
  82
  83   public static final String RNASS_BRACKETS = "<>[](){}AaBbCcDdEeFfGgHhIiJjKkLlMmNnOoPpQqRrSsTtUuVvWwXxYyZz";
  84
  85   public static final int REGEX_STOCKHOLM = 0;
  86
  87   public static final int REGEX_BRACKETS = 1;
  88
  89   // use the following regex to decide an annotations (whole) line is NOT an RNA
  90   // SS (it contains only E,H,e,h and other non-brace/non-alpha chars)
  91   public static final int REGEX_NOT_RNASS = 2;
  92
  93   private static final int REGEX_ANNOTATION = 3;
  94
  95   private static final int REGEX_PFAM = 4;
  96
  97   private static final int REGEX_RFAM = 5;
  98
  99   private static final int REGEX_ALIGN_END = 6;
 100
 101   private static final int REGEX_SPLIT_ID = 7;
 102
 103   private static final int REGEX_SUBTYPE = 8;
 104
 105   private static final int REGEX_ANNOTATION_LINE = 9;
 106
 107   private static final int REGEX_REMOVE_ID = 10;
 108
 109   private static final int REGEX_OPEN_PAREN = 11;
 110
 111   private static final int REGEX_CLOSE_PAREN = 12;
 112
 113   public static final int REGEX_MAX = 13;
 114
 115   private static Regex REGEX[] = new Regex[REGEX_MAX];
 116
 117   /**
 118    * Centralize all actual Regex instantialization in Platform.
 119    *
 120    * @param id
 121    * @return
 122    */
 123   private static Regex getRegex(int id)
 124   {
 125     if (REGEX[id] == null)
 126     {
 127       String pat = null, pat2 = null;
 128       switch (id)
 129       {
 130       case REGEX_STOCKHOLM:
 131         pat = "# STOCKHOLM ([\\d\\.]+)";
 132         break;
 133       case REGEX_BRACKETS:
 134         // for reference; not used
 135         pat = "(<|>|\\[|\\]|\\(|\\)|\\{|\\})";
 136         break;
 137       case REGEX_NOT_RNASS:
 138         pat = "^[^<>[\\](){}A-DF-Za-df-z]*$";
 139         break;
 140       case REGEX_ANNOTATION:
 141         pat = "(\\w+)\\s*(.*)";
 142         break;
 143       case REGEX_PFAM:
 144         pat = "PF[0-9]{5}(.*)";
 145         break;
 146       case REGEX_RFAM:
 147         pat = "RF[0-9]{5}(.*)";
 148         break;
 149       case REGEX_ALIGN_END:
 150         pat = "^\\s*\\/\\/";
 151         break;
 152       case REGEX_SPLIT_ID:
 153         pat = "(\\S+)\\/(\\d+)\\-(\\d+)";
 154         break;
 155       case REGEX_SUBTYPE:
 156         pat = "(\\S+)\\s+(\\S*)\\s+(.*)";
 157         break;
 158       case REGEX_ANNOTATION_LINE:
 159         pat = "#=(G[FSRC]?)\\s+(.*)";
 160         break;
 161       case REGEX_REMOVE_ID:
 162         pat = "(\\S+)\\s+(\\S+)";
 163         break;
 164       case REGEX_OPEN_PAREN:
 165         pat = "(<|\\[)";
 166         pat2 = "(";
 167         break;
 168       case REGEX_CLOSE_PAREN:
 169         pat = "(>|\\])";
 170         pat2 = ")";
 171         break;
 172       default:
 173         return null;
 174       }
 175       REGEX[id] = Platform.newRegex(pat, pat2);
 176     }
 177     return REGEX[id];
 178   }
 179
 180   StringBuffer out; // output buffer
 181
 182   AlignmentI al;
 183
 184   public StockholmFile()
 185   {
 186   }
 187
 188   /**
 189    * Creates a new StockholmFile object for output.
 190    */
 191   public StockholmFile(AlignmentI al)
 192   {
 193     this.al = al;
 194   }
 195
 196   public StockholmFile(String inFile, DataSourceType type)
 197           throws IOException
 198   {
 199     super(inFile, type);
 200   }
 201
 202   public StockholmFile(FileParse source) throws IOException
 203   {
 204     super(source);
 205   }
 206
 207   @Override
 208   public void initData()
 209   {
 210     super.initData();
 211   }
 212
 213   /**
 214    * Parse a file in Stockholm format into Jalview's data model using VARNA
 215    *
 216    * @throws IOException
 217    *           If there is an error with the input file
 218    */
 219   public void parse_with_VARNA(java.io.File inFile) throws IOException
 220   {
 221     FileReader fr = null;
 222     fr = new FileReader(inFile);
 223
 224     BufferedReader r = new BufferedReader(fr);
 225     List<RNA> result = null;
 226     try
 227     {
 228       result = RNAFactory.loadSecStrStockholm(r);
 229     } catch (ExceptionUnmatchedClosingParentheses umcp)
 230     {
 231       errormessage = "Unmatched parentheses in annotation. Aborting ("
 232               + umcp.getMessage() + ")";
 233       throw new IOException(umcp);
 234     }
 235     // DEBUG System.out.println("this is the secondary scructure:"
 236     // +result.size());
 237     SequenceI[] seqs = new SequenceI[result.size()];
 238     String id = null;
 239     for (int i = 0; i < result.size(); i++)
 240     {
 241       // DEBUG System.err.println("Processing i'th sequence in Stockholm file")
 242       RNA current = result.get(i);
 243
 244       String seq = current.getSeq();
 245       String rna = current.getStructDBN(true);
 246       // DEBUG System.out.println(seq);
 247       // DEBUG System.err.println(rna);
 248       int begin = 0;
 249       int end = seq.length() - 1;
 250       id = safeName(getDataName());
 251       seqs[i] = new Sequence(id, seq, begin, end);
 252       String[] annot = new String[rna.length()];
 253       Annotation[] ann = new Annotation[rna.length()];
 254       for (int j = 0; j < rna.length(); j++)
 255       {
 256         annot[j] = rna.substring(j, j + 1);
 257
 258       }
 259
 260       for (int k = 0; k < rna.length(); k++)
 261       {
 262         ann[k] = new Annotation(annot[k], "",
 263                 Rna.getRNASecStrucState(annot[k]).charAt(0), 0f);
 264
 265       }
 266       AlignmentAnnotation align = new AlignmentAnnotation("Sec. str.",
 267               current.getID(), ann);
 268
 269       seqs[i].addAlignmentAnnotation(align);
 270       seqs[i].setRNA(result.get(i));
 271       this.annotations.addElement(align);
 272     }
 273     this.setSeqs(seqs);
 274
 275   }
 276
 277   /**
 278    * Parse a file in Stockholm format into Jalview's data model. The file has to
 279    * be passed at construction time
 280    *
 281    * @throws IOException
 282    *           If there is an error with the input file
 283    */
 284   @Override
 285   public void parse() throws IOException
 286   {
 287     StringBuffer treeString = new StringBuffer();
 288     String treeName = null;
 289     // --------------- Variable Definitions -------------------
 290     String line;
 291     String version;
 292     // String id;
 293     Hashtable seqAnn = new Hashtable(); // Sequence related annotations
 294     LinkedHashMap<String, String> seqs = new LinkedHashMap<>();
 295     Regex p, r, rend, s, x;
 296     // Temporary line for processing RNA annotation
 297     // String RNAannot = "";
 298
 299     // ------------------ Parsing File ----------------------
 300     // First, we have to check that this file has STOCKHOLM format, i.e. the
 301     // first line must match
 302
 303     r = getRegex(REGEX_STOCKHOLM);
 304     if (!r.search(nextLine()))
 305     {
 306       throw new IOException(MessageManager
 307               .getString("exception.stockholm_invalid_format"));
 308     }
 309     else
 310     {
 311       version = r.stringMatched(1);
 312
 313       // logger.debug("Stockholm version: " + version);
 314     }
 315
 316     // We define some Regexes here that will be used regularily later
 317     rend = getRegex(REGEX_ALIGN_END);//"^\\s*\\/\\/"); // Find the end of an alignment
 318     p = getRegex(REGEX_SPLIT_ID);//"(\\S+)\\/(\\d+)\\-(\\d+)"); // split sequence id in
 319     // id/from/to
 320     s = getRegex(REGEX_SUBTYPE);// "(\\S+)\\s+(\\S*)\\s+(.*)"); // Parses
 321                                 // annotation subtype
 322     r = getRegex(REGEX_ANNOTATION_LINE);// "#=(G[FSRC]?)\\s+(.*)"); // Finds any
 323                                         // annotation line
 324     x = getRegex(REGEX_REMOVE_ID);// "(\\S+)\\s+(\\S+)"); // split id from
 325                                   // sequence
 326
 327     // Convert all bracket types to parentheses (necessary for passing to VARNA)
 328     Regex openparen = getRegex(REGEX_OPEN_PAREN);//"(<|\\[)", "(");
 329     Regex closeparen = getRegex(REGEX_CLOSE_PAREN);//"(>|\\])", ")");
 330
 331 //    // Detect if file is RNA by looking for bracket types
 332     // Regex detectbrackets = getRegex("(<|>|\\[|\\]|\\(|\\))");
 333
 334     rend.optimize();
 335     p.optimize();
 336     s.optimize();
 337     r.optimize();
 338     x.optimize();
 339     openparen.optimize();
 340     closeparen.optimize();
 341
 342     while ((line = nextLine()) != null)
 343     {
 344       if (line.length() == 0)
 345       {
 346         continue;
 347       }
 348       if (rend.search(line))
 349       {
 350         // End of the alignment, pass stuff back
 351         this.noSeqs = seqs.size();
 352
 353         String dbsource = null;
 354         Regex pf = getRegex(REGEX_PFAM); // Finds AC for Pfam
 355         Regex rf = getRegex(REGEX_RFAM); // Finds AC for Rfam
 356         if (getAlignmentProperty("AC") != null)
 357         {
 358           String dbType = getAlignmentProperty("AC").toString();
 359           if (pf.search(dbType))
 360           {
 361             // PFAM Alignment - so references are typically from Uniprot
 362             dbsource = "PFAM";
 363           }
 364           else if (rf.search(dbType))
 365           {
 366             dbsource = "RFAM";
 367           }
 368         }
 369         // logger.debug("Number of sequences: " + this.noSeqs);
 370         for (Map.Entry<String, String> skey : seqs.entrySet())
 371         {
 372           // logger.debug("Processing sequence " + acc);
 373           String acc = skey.getKey();
 374           String seq = skey.getValue();
 375           if (maxLength < seq.length())
 376           {
 377             maxLength = seq.length();
 378           }
 379           int start = 1;
 380           int end = -1;
 381           String sid = acc;
 382           /*
 383            * Retrieve hash of annotations for this accession Associate
 384            * Annotation with accession
 385            */
 386           Hashtable accAnnotations = null;
 387
 388           if (seqAnn != null && seqAnn.containsKey(acc))
 389           {
 390             accAnnotations = (Hashtable) seqAnn.remove(acc);
 391             // TODO: add structures to sequence
 392           }
 393
 394           // Split accession in id and from/to
 395           if (p.search(acc))
 396           {
 397             sid = p.stringMatched(1);
 398             start = Integer.parseInt(p.stringMatched(2));
 399             end = Integer.parseInt(p.stringMatched(3));
 400           }
 401           // logger.debug(sid + ", " + start + ", " + end);
 402
 403           Sequence seqO = new Sequence(sid, seq, start, end);
 404           // Add Description (if any)
 405           if (accAnnotations != null && accAnnotations.containsKey("DE"))
 406           {
 407             String desc = (String) accAnnotations.get("DE");
 408             seqO.setDescription((desc == null) ? "" : desc);
 409           }
 410           // Add DB References (if any)
 411           if (accAnnotations != null && accAnnotations.containsKey("DR"))
 412           {
 413             String dbr = (String) accAnnotations.get("DR");
 414             if (dbr != null && dbr.indexOf(";") > -1)
 415             {
 416               String src = dbr.substring(0, dbr.indexOf(";"));
 417               String acn = dbr.substring(dbr.indexOf(";") + 1);
 418               jalview.util.DBRefUtils.parseToDbRef(seqO, src, "0", acn);
 419             }
 420           }
 421
 422           if (accAnnotations != null && accAnnotations.containsKey("AC"))
 423           {
 424             if (dbsource != null)
 425             {
 426               String dbr = (String) accAnnotations.get("AC");
 427               if (dbr != null)
 428               {
 429                 // we could get very clever here - but for now - just try to
 430                 // guess accession type from source of alignment plus structure
 431                 // of accession
 432                 guessDatabaseFor(seqO, dbr, dbsource);
 433
 434               }
 435             }
 436             // else - do what ? add the data anyway and prompt the user to
 437             // specify what references these are ?
 438           }
 439
 440           Hashtable features = null;
 441           // We need to adjust the positions of all features to account for gaps
 442           try
 443           {
 444             features = (Hashtable) accAnnotations.remove("features");
 445           } catch (java.lang.NullPointerException e)
 446           {
 447             // loggerwarn("Getting Features for " + acc + ": " +
 448             // e.getMessage());
 449             // continue;
 450           }
 451           // if we have features
 452           if (features != null)
 453           {
 454             int posmap[] = seqO.findPositionMap();
 455             Enumeration i = features.keys();
 456             while (i.hasMoreElements())
 457             {
 458               // TODO: parse out secondary structure annotation as annotation
 459               // row
 460               // TODO: parse out scores as annotation row
 461               // TODO: map coding region to core jalview feature types
 462               String type = i.nextElement().toString();
 463               Hashtable content = (Hashtable) features.remove(type);
 464
 465               // add alignment annotation for this feature
 466               String key = type2id(type);
 467
 468               /*
 469                * have we added annotation rows for this type ?
 470                */
 471               boolean annotsAdded = false;
 472               if (key != null)
 473               {
 474                 if (accAnnotations != null
 475                         && accAnnotations.containsKey(key))
 476                 {
 477                   Vector vv = (Vector) accAnnotations.get(key);
 478                   for (int ii = 0; ii < vv.size(); ii++)
 479                   {
 480                     annotsAdded = true;
 481                     AlignmentAnnotation an = (AlignmentAnnotation) vv
 482                             .elementAt(ii);
 483                     seqO.addAlignmentAnnotation(an);
 484                     annotations.add(an);
 485                   }
 486                 }
 487               }
 488
 489               Enumeration j = content.keys();
 490               while (j.hasMoreElements())
 491               {
 492                 String desc = j.nextElement().toString();
 493                 if (ANNOTATION.equals(desc) && annotsAdded)
 494                 {
 495                   // don't add features if we already added an annotation row
 496                   continue;
 497                 }
 498                 String ns = content.get(desc).toString();
 499                 char[] byChar = ns.toCharArray();
 500                 for (int k = 0; k < byChar.length; k++)
 501                 {
 502                   char c = byChar[k];
 503                   if (!(c == ' ' || c == '_' || c == '-' || c == '.')) // PFAM
 504                   // uses
 505                   // '.'
 506                   // for
 507                   // feature
 508                   // background
 509                   {
 510                     int new_pos = posmap[k]; // look up nearest seqeunce
 511                     // position to this column
 512                     SequenceFeature feat = new SequenceFeature(type, desc,
 513                             new_pos, new_pos, null);
 514
 515                     seqO.addSequenceFeature(feat);
 516                   }
 517                 }
 518               }
 519
 520             }
 521
 522           }
 523           // garbage collect
 524
 525           // logger.debug("Adding seq " + acc + " from " + start + " to " + end
 526           // + ": " + seq);
 527           this.seqs.addElement(seqO);
 528         }
 529         return; // finished parsing this segment of source
 530       }
 531       else if (!r.search(line))
 532       {
 533         // System.err.println("Found sequence line: " + line);
 534
 535         // Split sequence in sequence and accession parts
 536         if (!x.search(line))
 537         {
 538           // logger.error("Could not parse sequence line: " + line);
 539           throw new IOException(MessageManager.formatMessage(
 540                   "exception.couldnt_parse_sequence_line", new String[]
 541                   { line }));
 542         }
 543         String ns = seqs.get(x.stringMatched(1));
 544         if (ns == null)
 545         {
 546           ns = "";
 547         }
 548         ns += x.stringMatched(2);
 549
 550         seqs.put(x.stringMatched(1), ns);
 551       }
 552       else
 553       {
 554         String annType = r.stringMatched(1);
 555         String annContent = r.stringMatched(2);
 556
 557         // System.err.println("type:" + annType + " content: " + annContent);
 558
 559         if (annType.equals("GF"))
 560         {
 561           /*
 562            * Generic per-File annotation, free text Magic features: #=GF NH
 563            * <tree in New Hampshire eXtended format> #=GF TN <Unique identifier
 564            * for the next tree> Pfam descriptions: 7. DESCRIPTION OF FIELDS
 565            *
 566            * Compulsory fields: ------------------
 567            *
 568            * AC Accession number: Accession number in form PFxxxxx.version or
 569            * PBxxxxxx. ID Identification: One word name for family. DE
 570            * Definition: Short description of family. AU Author: Authors of the
 571            * entry. SE Source of seed: The source suggesting the seed members
 572            * belong to one family. GA Gathering method: Search threshold to
 573            * build the full alignment. TC Trusted Cutoff: Lowest sequence score
 574            * and domain score of match in the full alignment. NC Noise Cutoff:
 575            * Highest sequence score and domain score of match not in full
 576            * alignment. TP Type: Type of family -- presently Family, Domain,
 577            * Motif or Repeat. SQ Sequence: Number of sequences in alignment. AM
 578            * Alignment Method The order ls and fs hits are aligned to the model
 579            * to build the full align. // End of alignment.
 580            *
 581            * Optional fields: ----------------
 582            *
 583            * DC Database Comment: Comment about database reference. DR Database
 584            * Reference: Reference to external database. RC Reference Comment:
 585            * Comment about literature reference. RN Reference Number: Reference
 586            * Number. RM Reference Medline: Eight digit medline UI number. RT
 587            * Reference Title: Reference Title. RA Reference Author: Reference
 588            * Author RL Reference Location: Journal location. PI Previous
 589            * identifier: Record of all previous ID lines. KW Keywords: Keywords.
 590            * CC Comment: Comments. NE Pfam accession: Indicates a nested domain.
 591            * NL Location: Location of nested domains - sequence ID, start and
 592            * end of insert.
 593            *
 594            * Obsolete fields: ----------- AL Alignment method of seed: The
 595            * method used to align the seed members.
 596            */
 597           // Let's save the annotations, maybe we'll be able to do something
 598           // with them later...
 599           Regex an = getRegex(REGEX_ANNOTATION);
 600           if (an.search(annContent))
 601           {
 602             if (an.stringMatched(1).equals("NH"))
 603             {
 604               treeString.append(an.stringMatched(2));
 605             }
 606             else if (an.stringMatched(1).equals("TN"))
 607             {
 608               if (treeString.length() > 0)
 609               {
 610                 if (treeName == null)
 611                 {
 612                   treeName = "Tree " + (getTreeCount() + 1);
 613                 }
 614                 addNewickTree(treeName, treeString.toString());
 615               }
 616               treeName = an.stringMatched(2);
 617               treeString = new StringBuffer();
 618             }
 619             setAlignmentProperty(an.stringMatched(1), an.stringMatched(2));
 620           }
 621         }
 622         else if (annType.equals("GS"))
 623         {
 624           // Generic per-Sequence annotation, free text
 625           /*
 626            * Pfam uses these features: Feature Description ---------------------
 627            * ----------- AC <accession> ACcession number DE <freetext>
 628            * DEscription DR <db>; <accession>; Database Reference OS <organism>
 629            * OrganiSm (species) OC <clade> Organism Classification (clade, etc.)
 630            * LO <look> Look (Color, etc.)
 631            */
 632           if (s.search(annContent))
 633           {
 634             String acc = s.stringMatched(1);
 635             String type = s.stringMatched(2);
 636             String content = s.stringMatched(3);
 637             // TODO: store DR in a vector.
 638             // TODO: store AC according to generic file db annotation.
 639             Hashtable ann;
 640             if (seqAnn.containsKey(acc))
 641             {
 642               ann = (Hashtable) seqAnn.get(acc);
 643             }
 644             else
 645             {
 646               ann = new Hashtable();
 647             }
 648             ann.put(type, content);
 649             seqAnn.put(acc, ann);
 650           }
 651           else
 652           {
 653             // throw new IOException("Error parsing " + line);
 654             System.err.println(">> missing annotation: " + line);
 655           }
 656         }
 657         else if (annType.equals("GC"))
 658         {
 659           // Generic per-Column annotation, exactly 1 char per column
 660           // always need a label.
 661           if (x.search(annContent))
 662           {
 663             // parse out and create alignment annotation directly.
 664             parseAnnotationRow(annotations, x.stringMatched(1),
 665                     x.stringMatched(2));
 666           }
 667         }
 668         else if (annType.equals("GR"))
 669         {
 670           // Generic per-Sequence AND per-Column markup, exactly 1 char per
 671           // column
 672           /*
 673            * Feature Description Markup letters ------- -----------
 674            * -------------- SS Secondary Structure [HGIEBTSCX] SA Surface
 675            * Accessibility [0-9X] (0=0%-10%; ...; 9=90%-100%) TM TransMembrane
 676            * [Mio] PP Posterior Probability [0-9*] (0=0.00-0.05; 1=0.05-0.15;
 677            * *=0.95-1.00) LI LIgand binding [*] AS Active Site [*] IN INtron (in
 678            * or after) [0-2]
 679            */
 680           if (s.search(annContent))
 681           {
 682             String acc = s.stringMatched(1);
 683             String type = s.stringMatched(2);
 684             String oseq = s.stringMatched(3);
 685             /*
 686              * copy of annotation field that may be processed into whitespace chunks
 687              */
 688             String seq = new String(oseq);
 689
 690             Hashtable ann;
 691             // Get an object with all the annotations for this sequence
 692             if (seqAnn.containsKey(acc))
 693             {
 694               // logger.debug("Found annotations for " + acc);
 695               ann = (Hashtable) seqAnn.get(acc);
 696             }
 697             else
 698             {
 699               // logger.debug("Creating new annotations holder for " + acc);
 700               ann = new Hashtable();
 701               seqAnn.put(acc, ann);
 702             }
 703
 704             // // start of block for appending annotation lines for wrapped
 705             // stokchholm file
 706             // TODO test structure, call parseAnnotationRow with vector from
 707             // hashtable for specific sequence
 708
 709             Hashtable features;
 710             // Get an object with all the content for an annotation
 711             if (ann.containsKey("features"))
 712             {
 713               // logger.debug("Found features for " + acc);
 714               features = (Hashtable) ann.get("features");
 715             }
 716             else
 717             {
 718               // logger.debug("Creating new features holder for " + acc);
 719               features = new Hashtable();
 720               ann.put("features", features);
 721             }
 722
 723             Hashtable content;
 724             if (features.containsKey(this.id2type(type)))
 725             {
 726               // logger.debug("Found content for " + this.id2type(type));
 727               content = (Hashtable) features.get(this.id2type(type));
 728             }
 729             else
 730             {
 731               // logger.debug("Creating new content holder for " +
 732               // this.id2type(type));
 733               content = new Hashtable();
 734               features.put(this.id2type(type), content);
 735             }
 736             String ns = (String) content.get(ANNOTATION);
 737
 738             if (ns == null)
 739             {
 740               ns = "";
 741             }
 742             // finally, append the annotation line
 743             ns += seq;
 744             content.put(ANNOTATION, ns);
 745             // // end of wrapped annotation block.
 746             // // Now a new row is created with the current set of data
 747
 748             Hashtable strucAnn;
 749             if (seqAnn.containsKey(acc))
 750             {
 751               strucAnn = (Hashtable) seqAnn.get(acc);
 752             }
 753             else
 754             {
 755               strucAnn = new Hashtable();
 756             }
 757
 758             Vector<AlignmentAnnotation> newStruc = new Vector<>();
 759             parseAnnotationRow(newStruc, type, ns);
 760             for (AlignmentAnnotation alan : newStruc)
 761             {
 762               alan.visible = false;
 763             }
 764             // new annotation overwrites any existing annotation...
 765
 766             strucAnn.put(type, newStruc);
 767             seqAnn.put(acc, strucAnn);
 768           }
 769           // }
 770           else
 771           {
 772             System.err.println(
 773                     "Warning - couldn't parse sequence annotation row line:\n"
 774                             + line);
 775             // throw new IOException("Error parsing " + line);
 776           }
 777         }
 778         else
 779         {
 780           throw new IOException(MessageManager.formatMessage(
 781                   "exception.unknown_annotation_detected", new String[]
 782                   { annType, annContent }));
 783         }
 784       }
 785     }
 786     if (treeString.length() > 0)
 787     {
 788       if (treeName == null)
 789       {
 790         treeName = "Tree " + (1 + getTreeCount());
 791       }
 792       addNewickTree(treeName, treeString.toString());
 793     }
 794   }
 795
 796   /**
 797    * Demangle an accession string and guess the originating sequence database
 798    * for a given sequence
 799    *
 800    * @param seqO
 801    *          sequence to be annotated
 802    * @param dbr
 803    *          Accession string for sequence
 804    * @param dbsource
 805    *          source database for alignment (PFAM or RFAM)
 806    */
 807   private void guessDatabaseFor(Sequence seqO, String dbr, String dbsource)
 808   {
 809     DBRefEntry dbrf = null;
 810     List<DBRefEntry> dbrs = new ArrayList<>();
 811     String seqdb = "Unknown", sdbac = "" + dbr;
 812     int st = -1, en = -1, p;
 813     if ((st = sdbac.indexOf("/")) > -1)
 814     {
 815       String num, range = sdbac.substring(st + 1);
 816       sdbac = sdbac.substring(0, st);
 817       if ((p = range.indexOf("-")) > -1)
 818       {
 819         p++;
 820         if (p < range.length())
 821         {
 822           num = range.substring(p).trim();
 823           try
 824           {
 825             en = Integer.parseInt(num);
 826           } catch (NumberFormatException x)
 827           {
 828             // could warn here that index is invalid
 829             en = -1;
 830           }
 831         }
 832       }
 833       else
 834       {
 835         p = range.length();
 836       }
 837       num = range.substring(0, p).trim();
 838       try
 839       {
 840         st = Integer.parseInt(num);
 841       } catch (NumberFormatException x)
 842       {
 843         // could warn here that index is invalid
 844         st = -1;
 845       }
 846     }
 847     if (dbsource.equals("PFAM"))
 848     {
 849       seqdb = "UNIPROT";
 850       if (sdbac.indexOf(".") > -1)
 851       {
 852         // strip of last subdomain
 853         sdbac = sdbac.substring(0, sdbac.indexOf("."));
 854         dbrf = jalview.util.DBRefUtils.parseToDbRef(seqO, seqdb, dbsource,
 855                 sdbac);
 856         if (dbrf != null)
 857         {
 858           dbrs.add(dbrf);
 859         }
 860       }
 861       dbrf = jalview.util.DBRefUtils.parseToDbRef(seqO, dbsource, dbsource,
 862               dbr);
 863       if (dbr != null)
 864       {
 865         dbrs.add(dbrf);
 866       }
 867     }
 868     else
 869     {
 870       seqdb = "EMBL"; // total guess - could be ENA, or something else these
 871                       // days
 872       if (sdbac.indexOf(".") > -1)
 873       {
 874         // strip off last subdomain
 875         sdbac = sdbac.substring(0, sdbac.indexOf("."));
 876         dbrf = jalview.util.DBRefUtils.parseToDbRef(seqO, seqdb, dbsource,
 877                 sdbac);
 878         if (dbrf != null)
 879         {
 880           dbrs.add(dbrf);
 881         }
 882       }
 883
 884       dbrf = jalview.util.DBRefUtils.parseToDbRef(seqO, dbsource, dbsource,
 885               dbr);
 886       if (dbrf != null)
 887       {
 888         dbrs.add(dbrf);
 889       }
 890     }
 891     if (st != -1 && en != -1)
 892     {
 893       for (DBRefEntry d : dbrs)
 894       {
 895         jalview.util.MapList mp = new jalview.util.MapList(
 896                 new int[]
 897                 { seqO.getStart(), seqO.getEnd() }, new int[] { st, en }, 1,
 898                 1);
 899         jalview.datamodel.Mapping mping = new Mapping(mp);
 900         d.setMap(mping);
 901       }
 902     }
 903   }
 904
 905   protected static AlignmentAnnotation parseAnnotationRow(
 906           Vector<AlignmentAnnotation> annotation, String label,
 907           String annots)
 908   {
 909     String convert1, convert2 = null;
 910
 911     // convert1 = OPEN_PAREN.replaceAll(annots);
 912     // convert2 = CLOSE_PAREN.replaceAll(convert1);
 913     // annots = convert2;
 914
 915     String type = label;
 916     if (label.contains("_cons"))
 917     {
 918       type = (label.indexOf("_cons") == label.length() - 5)
 919               ? label.substring(0, label.length() - 5)
 920               : label;
 921     }
 922     boolean ss = false, posterior = false;
 923     type = id2type(type);
 924
 925     boolean isrnass = false;
 926     if (type.equalsIgnoreCase("secondary structure"))
 927     {
 928       ss = true;
 929       isrnass = !getRegex(REGEX_NOT_RNASS).search(annots); // sorry about the double
 930                                                      // negative
 931                                            // here (it's easier for dealing with
 932                                            // other non-alpha-non-brace chars)
 933     }
 934     if (type.equalsIgnoreCase("posterior probability"))
 935     {
 936       posterior = true;
 937     }
 938     // decide on secondary structure or not.
 939     Annotation[] els = new Annotation[annots.length()];
 940     for (int i = 0; i < annots.length(); i++)
 941     {
 942       String pos = annots.substring(i, i + 1);
 943       Annotation ann;
 944       ann = new Annotation(pos, "", ' ', 0f); // 0f is 'valid' null - will not
 945       // be written out
 946       if (ss)
 947       {
 948         // if (" .-_".indexOf(pos) == -1)
 949         {
 950           if (isrnass && RNASS_BRACKETS.indexOf(pos) >= 0)
 951           {
 952             ann.secondaryStructure = Rna.getRNASecStrucState(pos).charAt(0);
 953             ann.displayCharacter = "" + pos.charAt(0);
 954           }
 955           else
 956           {
 957             ann.secondaryStructure = ResidueProperties.getDssp3state(pos)
 958                     .charAt(0);
 959
 960             if (ann.secondaryStructure == pos.charAt(0))
 961             {
 962               ann.displayCharacter = ""; // null; // " ";
 963             }
 964             else
 965             {
 966               ann.displayCharacter = " " + ann.displayCharacter;
 967             }
 968           }
 969         }
 970
 971       }
 972       if (posterior && !ann.isWhitespace()
 973               && !Comparison.isGap(pos.charAt(0)))
 974       {
 975         float val = 0;
 976         // symbol encodes values - 0..*==0..10
 977         if (pos.charAt(0) == '*')
 978         {
 979           val = 10;
 980         }
 981         else
 982         {
 983           val = pos.charAt(0) - '0';
 984           if (val > 9)
 985           {
 986             val = 10;
 987           }
 988         }
 989         ann.value = val;
 990       }
 991
 992       els[i] = ann;
 993     }
 994     AlignmentAnnotation annot = null;
 995     Enumeration<AlignmentAnnotation> e = annotation.elements();
 996     while (e.hasMoreElements())
 997     {
 998       annot = e.nextElement();
 999       if (annot.label.equals(type))
1000       {
1001         break;
1002       }
1003       annot = null;
1004     }
1005     if (annot == null)
1006     {
1007       annot = new AlignmentAnnotation(type, type, els);
1008       annotation.addElement(annot);
1009     }
1010     else
1011     {
1012       Annotation[] anns = new Annotation[annot.annotations.length
1013               + els.length];
1014       System.arraycopy(annot.annotations, 0, anns, 0,
1015               annot.annotations.length);
1016       System.arraycopy(els, 0, anns, annot.annotations.length, els.length);
1017       annot.annotations = anns;
1018       // System.out.println("else: ");
1019     }
1020     return annot;
1021   }
1022
1023   @Override
1024   public String print(SequenceI[] s, boolean jvSuffix)
1025   {
1026     out = new StringBuffer();
1027     out.append("# STOCKHOLM 1.0");
1028     out.append(newline);
1029
1030     // find max length of id
1031     int max = 0;
1032     int maxid = 0;
1033     int in = 0;
1034     int slen = s.length;
1035     SequenceI seq;
1036     Hashtable<String, String> dataRef = null;
1037     while ((in < slen) && ((seq = s[in]) != null))
1038     {
1039       String tmp = printId(seq, jvSuffix);
1040       max = Math.max(max, seq.getLength());
1041
1042       if (tmp.length() > maxid)
1043       {
1044         maxid = tmp.length();
1045       }
1046       List<DBRefEntry> seqrefs = seq.getDBRefs();
1047       int ndb;
1048       if (seqrefs != null && (ndb = seqrefs.size()) > 0)
1049       {
1050         if (dataRef == null)
1051         {
1052           dataRef = new Hashtable<>();
1053         }
1054         for (int idb = 0; idb < ndb; idb++)
1055         {
1056
1057           DBRefEntry ref = seqrefs.get(idb);
1058           String datAs1 = ref.getSource().toString()
1059                   + " ; "
1060                   + ref.getAccessionId().toString();
1061           dataRef.put(tmp, datAs1);
1062         }
1063       }
1064       in++;
1065     }
1066     maxid += 9;
1067     int i = 0;
1068
1069     // output database type
1070     if (al.getProperties() != null)
1071     {
1072       if (!al.getProperties().isEmpty())
1073       {
1074         Enumeration key = al.getProperties().keys();
1075         Enumeration val = al.getProperties().elements();
1076         while (key.hasMoreElements())
1077         {
1078           out.append("#=GF " + key.nextElement() + " " + val.nextElement());
1079           out.append(newline);
1080         }
1081       }
1082     }
1083
1084     // output database accessions
1085     if (dataRef != null)
1086     {
1087       Enumeration<String> en = dataRef.keys();
1088       while (en.hasMoreElements())
1089       {
1090         Object idd = en.nextElement();
1091         String type = dataRef.remove(idd);
1092         out.append(new Format("%-" + (maxid - 2) + "s")
1093                 .form("#=GS " + idd.toString() + " "));
1094         if (type.contains("PFAM") || type.contains("RFAM"))
1095         {
1096
1097           out.append(" AC " + type.substring(type.indexOf(";") + 1));
1098         }
1099         else
1100         {
1101           out.append(" DR " + type + " ");
1102         }
1103         out.append(newline);
1104       }
1105     }
1106
1107     // output annotations
1108     while (i < slen && (seq = s[i]) != null)
1109     {
1110       AlignmentAnnotation[] alAnot = seq.getAnnotation();
1111       if (alAnot != null)
1112       {
1113         Annotation[] ann;
1114         for (int j = 0, nj = alAnot.length; j < nj; j++)
1115         {
1116
1117           String key = type2id(alAnot[j].label);
1118           boolean isrna = alAnot[j].isValidStruc();
1119
1120           if (isrna)
1121           {
1122             // hardwire to secondary structure if there is RNA secondary
1123             // structure on the annotation
1124             key = "SS";
1125           }
1126           if (key == null)
1127           {
1128
1129             continue;
1130           }
1131
1132           // out.append("#=GR ");
1133           out.append(new Format("%-" + maxid + "s").form(
1134                   "#=GR " + printId(seq, jvSuffix) + " " + key + " "));
1135           ann = alAnot[j].annotations;
1136           String sseq = "";
1137           for (int k = 0, nk = ann.length; k < nk; k++)
1138           {
1139             sseq += outputCharacter(key, k, isrna, ann, seq);
1140           }
1141           out.append(sseq);
1142           out.append(newline);
1143         }
1144       }
1145
1146       out.append(new Format("%-" + maxid + "s")
1147               .form(printId(seq, jvSuffix) + " "));
1148       out.append(seq.getSequenceAsString());
1149       out.append(newline);
1150       i++;
1151     }
1152
1153     // alignment annotation
1154     AlignmentAnnotation aa;
1155     AlignmentAnnotation[] an = al.getAlignmentAnnotation();
1156     if (an != null)
1157     {
1158       for (int ia = 0, na = an.length; ia < na; ia++)
1159       {
1160         aa = an[ia];
1161         if (aa.autoCalculated || !aa.visible || aa.sequenceRef != null)
1162         {
1163           continue;
1164         }
1165         String sseq = "";
1166         String label;
1167         String key = "";
1168         if (aa.label.equals("seq"))
1169         {
1170           label = "seq_cons";
1171         }
1172         else
1173         {
1174           key = type2id(aa.label.toLowerCase());
1175           if (key == null)
1176           {
1177             label = aa.label;
1178           }
1179           else
1180           {
1181             label = key + "_cons";
1182           }
1183         }
1184         if (label == null)
1185         {
1186           label = aa.label;
1187         }
1188         label = label.replace(" ", "_");
1189
1190         out.append(
1191                 new Format("%-" + maxid + "s").form("#=GC " + label + " "));
1192         boolean isrna = aa.isValidStruc();
1193         for (int j = 0, nj = aa.annotations.length; j < nj; j++)
1194         {
1195           sseq += outputCharacter(key, j, isrna, aa.annotations, null);
1196         }
1197         out.append(sseq);
1198         out.append(newline);
1199       }
1200     }
1201
1202     out.append("//");
1203     out.append(newline);
1204
1205     return out.toString();
1206   }
1207
1208   /**
1209    * add an annotation character to the output row
1210    *
1211    * @param seq
1212    * @param key
1213    * @param k
1214    * @param isrna
1215    * @param ann
1216    * @param sequenceI
1217    */
1218   private char outputCharacter(String key, int k, boolean isrna,
1219           Annotation[] ann, SequenceI sequenceI)
1220   {
1221     char seq = ' ';
1222     Annotation annot = ann[k];
1223     String ch = (annot == null)
1224             ? ((sequenceI == null) ? "-"
1225                     : Character.toString(sequenceI.getCharAt(k)))
1226             : (annot.displayCharacter == null
1227                     ? String.valueOf(annot.secondaryStructure)
1228                     : annot.displayCharacter);
1229     if (ch == null)
1230     {
1231       ch = " ";
1232     }
1233     if (key != null && key.equals("SS"))
1234     {
1235       char ssannotchar = ' ';
1236       boolean charset = false;
1237       if (annot == null)
1238       {
1239         // sensible gap character
1240         ssannotchar = ' ';
1241         charset = true;
1242       }
1243       else
1244       {
1245         // valid secondary structure AND no alternative label (e.g. ' B')
1246         if (annot.secondaryStructure > ' ' && ch.length() < 2)
1247         {
1248           ssannotchar = annot.secondaryStructure;
1249           charset = true;
1250         }
1251       }
1252       if (charset)
1253       {
1254         return (ssannotchar == ' ' && isrna) ? '.' : ssannotchar;
1255       }
1256     }
1257
1258     if (ch.length() == 0)
1259     {
1260       seq = '.';
1261     }
1262     else if (ch.length() == 1)
1263     {
1264       seq = ch.charAt(0);
1265     }
1266     else if (ch.length() > 1)
1267     {
1268       seq = ch.charAt(1);
1269     }
1270
1271     return (seq == ' ' && key != null && key.equals("SS") && isrna) ? '.'
1272             : seq;
1273   }
1274
1275   public String print()
1276   {
1277     out = new StringBuffer();
1278     out.append("# STOCKHOLM 1.0");
1279     out.append(newline);
1280     print(getSeqsAsArray(), false);
1281
1282     out.append("//");
1283     out.append(newline);
1284     return out.toString();
1285   }
1286
1287   private static Hashtable typeIds = null;
1288
1289   static
1290   {
1291     if (typeIds == null)
1292     {
1293       typeIds = new Hashtable();
1294       typeIds.put("SS", "Secondary Structure");
1295       typeIds.put("SA", "Surface Accessibility");
1296       typeIds.put("TM", "transmembrane");
1297       typeIds.put("PP", "Posterior Probability");
1298       typeIds.put("LI", "ligand binding");
1299       typeIds.put("AS", "active site");
1300       typeIds.put("IN", "intron");
1301       typeIds.put("IR", "interacting residue");
1302       typeIds.put("AC", "accession");
1303       typeIds.put("OS", "organism");
1304       typeIds.put("CL", "class");
1305       typeIds.put("DE", "description");
1306       typeIds.put("DR", "reference");
1307       typeIds.put("LO", "look");
1308       typeIds.put("RF", "Reference Positions");
1309
1310     }
1311   }
1312
1313   protected static String id2type(String id)
1314   {
1315     if (typeIds.containsKey(id))
1316     {
1317       return (String) typeIds.get(id);
1318     }
1319     System.err.println(
1320             "Warning : Unknown Stockholm annotation type code " + id);
1321     return id;
1322   }
1323
1324   protected static String type2id(String type)
1325   {
1326     String key = null;
1327     Enumeration e = typeIds.keys();
1328     while (e.hasMoreElements())
1329     {
1330       Object ll = e.nextElement();
1331       if (typeIds.get(ll).toString().equalsIgnoreCase(type))
1332       {
1333         key = (String) ll;
1334         break;
1335       }
1336     }
1337     if (key != null)
1338     {
1339       return key;
1340     }
1341     System.err.println(
1342             "Warning : Unknown Stockholm annotation type: " + type);
1343     return key;
1344   }
1345
1346   /**
1347    * make a friendly ID string.
1348    *
1349    * @param dataName
1350    * @return truncated dataName to after last '/'
1351    */
1352   private String safeName(String dataName)
1353   {
1354     int b = 0;
1355     while ((b = dataName.indexOf("/")) > -1 && b < dataName.length())
1356     {
1357       dataName = dataName.substring(b + 1).trim();
1358
1359     }
1360     int e = (dataName.length() - dataName.indexOf(".")) + 1;
1361     dataName = dataName.substring(1, e).trim();
1362     return dataName;
1363   }
1364 }