src/jalview/io/TCoffeeScoreFile.java

   1 /*
   2  * Jalview - A Sequence Alignment Editor and Viewer (Version 2.8)
   3  * Copyright (C) 2012 J Procter, AM Waterhouse, LM Lui, J Engelhardt, G Barton, M Clamp, S Searle
   4  *
   5  * This file is part of Jalview.
   6  *
   7  * Jalview is free software: you can redistribute it and/or
   8  * modify it under the terms of the GNU General Public License
   9  * as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version.
  10  *
  11  * Jalview is distributed in the hope that it will be useful, but
  12  * WITHOUT ANY WARRANTY; without even the implied warranty
  13  * of MERCHANTABILITY or FITNESS FOR A PARTICULAR
  14  * PURPOSE.  See the GNU General Public License for more details.
  15  *
  16  * You should have received a copy of the GNU General Public License along with Jalview.  If not, see <http://www.gnu.org/licenses/>.
  17  */
  18 package jalview.io;
  19
  20 import jalview.analysis.SequenceIdMatcher;
  21 import jalview.datamodel.AlignmentAnnotation;
  22 import jalview.datamodel.AlignmentI;
  23 import jalview.datamodel.Annotation;
  24 import jalview.datamodel.SequenceI;
  25
  26 import java.awt.Color;
  27 import java.io.IOException;
  28 import java.util.ArrayList;
  29 import java.util.HashMap;
  30 import java.util.LinkedHashMap;
  31 import java.util.List;
  32 import java.util.Map;
  33
  34 import javax.xml.parsers.ParserConfigurationException;
  35
  36 import org.xml.sax.SAXException;
  37
  38 import fr.orsay.lri.varna.exceptions.ExceptionFileFormatOrSyntax;
  39 import fr.orsay.lri.varna.exceptions.ExceptionLoadingFailed;
  40 import fr.orsay.lri.varna.exceptions.ExceptionPermissionDenied;
  41 import fr.orsay.lri.varna.exceptions.ExceptionUnmatchedClosingParentheses;
  42
  43 /**
  44  * A file parser for the T-Coffee score ascii format. This file contains the
  45  * alignment consensus for each residue in any sequence.
  46  * <p>
  47  * This file is produced by <code>t_coffee</code> when the option
  48  * <code>-output=score_ascii</code> is given on the program command line.
  49  *
  50  * An example file is the following
  51  *
  52  * <pre>
  53  * T-COFFEE, Version_9.02.r1228 (2012-02-16 18:15:12 - Revision 1228 - Build 336)
  54  * Cedric Notredame
  55  * CPU TIME:0 sec.
  56  * SCORE=90
  57  * *
  58  *  BAD AVG GOOD
  59  * *
  60  * 1PHT   :  89
  61  * 1BB9   :  90
  62  * 1UHC   :  94
  63  * 1YCS   :  94
  64  * 1OOT   :  93
  65  * 1ABO   :  94
  66  * 1FYN   :  94
  67  * 1QCF   :  94
  68  * cons   :  90
  69  *
  70  * 1PHT   999999999999999999999999998762112222543211112134
  71  * 1BB9   99999999999999999999999999987-------4322----2234
  72  * 1UHC   99999999999999999999999999987-------5321----2246
  73  * 1YCS   99999999999999999999999999986-------4321----1-35
  74  * 1OOT   999999999999999999999999999861-------3------1135
  75  * 1ABO   99999999999999999999999999986-------422-------34
  76  * 1FYN   99999999999999999999999999985-------32--------35
  77  * 1QCF   99999999999999999999999999974-------2---------24
  78  * cons   999999999999999999999999999851000110321100001134
  79  *
  80  *
  81  * 1PHT   ----------5666642367889999999999889
  82  * 1BB9   1111111111676653-355679999999999889
  83  * 1UHC   ----------788774--66789999999999889
  84  * 1YCS   ----------78777--356789999999999889
  85  * 1OOT   ----------78877--356789999999997-67
  86  * 1ABO   ----------687774--56779999999999889
  87  * 1FYN   ----------6888842356789999999999889
  88  * 1QCF   ----------6878742356789999999999889
  89  * cons   00100000006877641356789999999999889
  90  * </pre>
  91  *
  92  *
  93  * @author Paolo Di Tommaso
  94  *
  95  */
  96 public class TCoffeeScoreFile extends AlignFile {
  97
  /**
   * Builds a T-Coffee score file reader by handing both arguments straight to
   * the {@link AlignFile} constructor; no additional work is done here.
   *
   * @param inFile
   *          forwarded unchanged to the superclass constructor
   * @param type
   *          forwarded unchanged to the superclass constructor
   */
  public TCoffeeScoreFile(String inFile, String type)
          throws IOException, ExceptionFileFormatOrSyntax,
          ParserConfigurationException, SAXException,
          ExceptionPermissionDenied, ExceptionLoadingFailed,
          InterruptedException, ExceptionUnmatchedClosingParentheses
  {
    super(inFile, type);
  }
 103
 104   public TCoffeeScoreFile(FileParse source) throws IOException, ExceptionFileFormatOrSyntax, ParserConfigurationException, SAXException, ExceptionPermissionDenied, ExceptionLoadingFailed, InterruptedException, ExceptionUnmatchedClosingParentheses
 105   {
 106     super(source);
 107   }
 108
  /**
   * The {@link Header} structure holder. Assigned by {@link #parse()}; stays
   * null when the header could not be read.
   */
  Header header;

  /**
   * Holds the consensus values for each sequence, keyed by sequence ID. It
   * uses a LinkedHashMap to maintain the insertion order. Created by
   * {@link #parse()}; null before that.
   */
  LinkedHashMap<String, StringBuilder> scores;

  /**
   * Length of the score rows, recorded by {@link #parse()} from the first row
   * and then compared against every other row. Null until it has been set.
   */
  Integer fWidth;
 119
 120   /**
 121    * Parse the provided reader for the T-Coffee scores file format
 122    *
 123    * @param reader
 124    *          public static TCoffeeScoreFile load(Reader reader) {
 125    *
 126    *          try { BufferedReader in = (BufferedReader) (reader instanceof
 127    *          BufferedReader ? reader : new BufferedReader(reader));
 128    *          TCoffeeScoreFile result = new TCoffeeScoreFile();
 129    *          result.doParsing(in); return result.header != null &&
 130    *          result.scores != null ? result : null; } catch( Exception e) {
 131    *          throw new RuntimeException(e); } }
 132    */
 133
 134   /**
 135    * @return The 'height' of the score matrix i.e. the numbers of score rows
 136    *         that should matches the number of sequences in the alignment
 137    */
 138   public int getHeight()
 139   {
 140     // the last entry will always be the 'global' alingment consensus scores, so
 141     // it is removed
 142     // from the 'height' count to make this value compatible with the number of
 143     // sequences in the MSA
 144     return scores != null && scores.size() > 0 ? scores.size() - 1 : 0;
 145   }
 146
 147   /**
 148    * @return The 'width' of the score matrix i.e. the number of columns. Since
 149    *         teh score value are supposd to be calculated for an 'aligned' MSA,
 150    *         all the entries have to have the same width.
 151    */
 152   public int getWidth()
 153   {
 154     return fWidth != null ? fWidth : 0;
 155   }
 156
 157   /**
 158    * Get the string of score values for the specified seqeunce ID.
 159    *
 160    * @param id
 161    *          The sequence ID
 162    * @return The scores as a string of values e.g. {@code 99999987-------432}.
 163    *         It return an empty string when the specified ID is missing.
 164    */
 165   public String getScoresFor(String id)
 166   {
 167     return scores != null && scores.containsKey(id) ? scores.get(id)
 168             .toString() : "";
 169   }
 170
 171   /**
 172    * @return The list of score string as a {@link List} object, in the same
 173    *         ordeer of the insertion i.e. in the MSA
 174    */
 175   public List<String> getScoresList()
 176   {
 177     if (scores == null)
 178     {
 179       return null;
 180     }
 181     List<String> result = new ArrayList<String>(scores.size());
 182     for (Map.Entry<String, StringBuilder> it : scores.entrySet())
 183     {
 184       result.add(it.getValue().toString());
 185     }
 186
 187     return result;
 188   }
 189
 190   /**
 191    * @return The parsed score values a matrix of bytes
 192    */
 193   public byte[][] getScoresArray()
 194   {
 195     if (scores == null)
 196     {
 197       return null;
 198     }
 199     byte[][] result = new byte[scores.size()][];
 200
 201     int rowCount = 0;
 202     for (Map.Entry<String, StringBuilder> it : scores.entrySet())
 203     {
 204       String line = it.getValue().toString();
 205       byte[] seqValues = new byte[line.length()];
 206       for (int j = 0, c = line.length(); j < c; j++)
 207       {
 208
 209         byte val = (byte) (line.charAt(j) - '0');
 210
 211         seqValues[j] = (val >= 0 && val <= 9) ? val : -1;
 212       }
 213
 214       result[rowCount++] = seqValues;
 215     }
 216
 217     return result;
 218   }
 219
 220   public void parse() throws IOException
 221   {
 222     /*
 223      * read the header
 224      */
 225     header = readHeader(this);
 226
 227     if (header == null)
 228     {
 229       error = true;
 230       return;
 231     }
 232     scores = new LinkedHashMap<String, StringBuilder>();
 233
 234     /*
 235      * initilize the structure
 236      */
 237     for (Map.Entry<String, Integer> entry : header.scores.entrySet())
 238     {
 239       scores.put(entry.getKey(), new StringBuilder());
 240     }
 241
 242     /*
 243      * go with the reading
 244      */
 245     Block block;
 246     while ((block = readBlock(this, header.scores.size())) != null)
 247     {
 248
 249       /*
 250        * append sequences read in the block
 251        */
 252       for (Map.Entry<String, String> entry : block.items.entrySet())
 253       {
 254         StringBuilder scoreStringBuilder = scores.get(entry.getKey());
 255         if (scoreStringBuilder == null)
 256         {
 257           error = true;
 258           errormessage = String
 259                   .format("Invalid T-Coffee score file: Sequence ID '%s' is not declared in header section",
 260                           entry.getKey());
 261           return;
 262         }
 263
 264         scoreStringBuilder.append(entry.getValue());
 265       }
 266     }
 267
 268     /*
 269      * verify that all rows have the same width
 270      */
 271     for (StringBuilder str : scores.values())
 272     {
 273       if (fWidth == null)
 274       {
 275         fWidth = str.length();
 276       }
 277       else if (fWidth != str.length())
 278       {
 279         error = true;
 280         errormessage = "Invalid T-Coffee score file: All the score sequences must have the same length";
 281         return;
 282       }
 283     }
 284
 285     return;
 286   }
 287
 288   static int parseInt(String str)
 289   {
 290     try
 291     {
 292       return Integer.parseInt(str);
 293     } catch (NumberFormatException e)
 294     {
 295       // TODO report a warning ?
 296       return 0;
 297     }
 298   }
 299
 300   /**
 301    * Reaad the header section in the T-Coffee score file format
 302    *
 303    * @param reader
 304    *          The scores reader
 305    * @return The parser {@link Header} instance
 306    * @throws RuntimeException
 307    *           when the header is not in the expected format
 308    */
 309   static Header readHeader(FileParse reader) throws IOException
 310   {
 311
 312     Header result = null;
 313     try
 314     {
 315       result = new Header();
 316       result.head = reader.nextLine();
 317
 318       String line;
 319
 320       while ((line = reader.nextLine()) != null)
 321       {
 322         if (line.startsWith("SCORE="))
 323         {
 324           result.score = parseInt(line.substring(6).trim());
 325           break;
 326         }
 327       }
 328
 329       if ((line = reader.nextLine()) == null || !"*".equals(line.trim()))
 330       {
 331         error(reader,
 332                 "Invalid T-COFFEE score format (NO BAD/AVG/GOOD header)");
 333         return null;
 334       }
 335       if ((line = reader.nextLine()) == null
 336               || !"BAD AVG GOOD".equals(line.trim()))
 337       {
 338         error(reader,
 339                 "Invalid T-COFFEE score format (NO BAD/AVG/GOOD header)");
 340         return null;
 341       }
 342       if ((line = reader.nextLine()) == null || !"*".equals(line.trim()))
 343       {
 344         error(reader,
 345                 "Invalid T-COFFEE score format (NO BAD/AVG/GOOD header)");
 346         return null;
 347       }
 348
 349       /*
 350        * now are expected a list if sequences ID up to the first blank line
 351        */
 352       while ((line = reader.nextLine()) != null)
 353       {
 354         if ("".equals(line))
 355         {
 356           break;
 357         }
 358
 359         int p = line.indexOf(":");
 360         if (p == -1)
 361         {
 362           // TODO report a warning
 363           continue;
 364         }
 365
 366         String id = line.substring(0, p).trim();
 367         int val = parseInt(line.substring(p + 1).trim());
 368         if ("".equals(id))
 369         {
 370           // TODO report warning
 371           continue;
 372         }
 373
 374         result.scores.put(id, val);
 375       }
 376
 377       if (result == null)
 378       {
 379         error(reader, "T-COFFEE score file had no per-sequence scores");
 380       }
 381
 382     } catch (IOException e)
 383     {
 384       error(reader, "Unexpected problem parsing T-Coffee score ascii file");
 385       throw e;
 386     }
 387
 388     return result;
 389   }
 390
 391   private static void error(FileParse reader, String errm)
 392   {
 393     reader.error = true;
 394     if (reader.errormessage == null)
 395     {
 396       reader.errormessage = errm;
 397     }
 398     else
 399     {
 400       reader.errormessage += "\n" + errm;
 401     }
 402   }
 403
 404   /**
 405    * Read a scores block ihe provided stream.
 406    *
 407    * @param reader
 408    *          The stream to parse
 409    * @param size
 410    *          The expected number of the sequence to be read
 411    * @return The {@link Block} instance read or {link null} null if the end of
 412    *         file has reached.
 413    * @throws IOException
 414    *           Something went wrong on the 'wire'
 415    */
 416   static Block readBlock(FileParse reader, int size) throws IOException
 417   {
 418     Block result = new Block(size);
 419     String line;
 420
 421     /*
 422      * read blank lines (eventually)
 423      */
 424     while ((line = reader.nextLine()) != null && "".equals(line.trim()))
 425     {
 426       // consume blank lines
 427     }
 428
 429     if (line == null)
 430     {
 431       return null;
 432     }
 433
 434     /*
 435      * read the scores block
 436      */
 437     do
 438     {
 439       if ("".equals(line.trim()))
 440       {
 441         // terminated
 442         break;
 443       }
 444
 445       // split the line on the first blank
 446       // the first part have to contain the sequence id
 447       // the remaining part are the scores values
 448       int p = line.indexOf(" ");
 449       if (p == -1)
 450       {
 451         if (reader.warningMessage == null)
 452         {
 453           reader.warningMessage = "";
 454         }
 455         reader.warningMessage += "Possible parsing error - expected to find a space in line: '"
 456                 + line + "'\n";
 457         continue;
 458       }
 459
 460       String id = line.substring(0, p).trim();
 461       String val = line.substring(p + 1).trim();
 462
 463       result.items.put(id, val);
 464
 465     } while ((line = reader.nextLine()) != null);
 466
 467     return result;
 468   }
 469
 470   /*
 471    * The score file header
 472    */
 473   static class Header
 474   {
 475     String head;
 476
 477     int score;
 478
 479     LinkedHashMap<String, Integer> scores = new LinkedHashMap<String, Integer>();
 480
 481     public int getScoreAvg()
 482     {
 483       return score;
 484     }
 485
 486     public int getScoreFor(String ID)
 487     {
 488
 489       return scores.containsKey(ID) ? scores.get(ID) : -1;
 490
 491     }
 492   }
 493
 494   /*
 495    * Hold a single block values block in the score file
 496    */
 497   static class Block
 498   {
 499     int size;
 500
 501     Map<String, String> items;
 502
 503     public Block(int size)
 504     {
 505       this.size = size;
 506       this.items = new HashMap<String, String>(size);
 507     }
 508
 509     String getScoresFor(String id)
 510     {
 511       return items.get(id);
 512     }
 513
 514     String getConsensus()
 515     {
 516       return items.get("cons");
 517     }
 518   }
 519
  /**
   * TCOFFEE score colourscheme: one colour per score value 0..9, looked up by
   * using the score as the array index.
   */
  static final Color[] colors =
  { new Color(102, 102, 255), // #6666FF
      new Color(0, 255, 0), // #00FF00
      new Color(102, 255, 0), // #66FF00
      new Color(204, 255, 0), // #CCFF00
      new Color(255, 255, 0), // #FFFF00
      new Color(255, 204, 0), // #FFCC00
      new Color(255, 153, 0), // #FF9900
      new Color(255, 102, 0), // #FF6600
      new Color(255, 51, 0), // #FF3300
      // NOTE(review): this comment used to read #FF2000, but green 34 is 0x22;
      // confirm which of the two values is the intended one
      new Color(255, 34, 0) // #FF2200
  };

  /**
   * Name under which the T-Coffee score annotation is looked up or created
   * on an alignment.
   */
  public final static String TCOFFEE_SCORE = "TCoffeeScore";
 537
 538   /**
 539    * generate annotation for this TCoffee score set on the given alignment
 540    *
 541    * @param al
 542    *          alignment to annotate
 543    * @param matchids
 544    *          if true, annotate sequences based on matching sequence names
 545    * @return true if alignment annotation was modified, false otherwise.
 546    */
 547   public boolean annotateAlignment(AlignmentI al, boolean matchids)
 548   {
 549     if (al.getHeight() != getHeight() || al.getWidth() != getWidth())
 550     {
 551       warningMessage = "Alignment shape does not match T-Coffee score file shape.";
 552       return false;
 553     }
 554     boolean added = false;
 555     int i = 0;
 556     SequenceIdMatcher sidmatcher = new SequenceIdMatcher(
 557             al.getSequencesArray());
 558     byte[][] scoreMatrix = getScoresArray();
 559     // for 2.8 - we locate any existing TCoffee annotation and remove it first
 560     // before adding this.
 561     for (Map.Entry<String, StringBuilder> id : scores.entrySet())
 562     {
 563       byte[] srow = scoreMatrix[i];
 564       SequenceI s;
 565       if (matchids)
 566       {
 567         s = sidmatcher.findIdMatch(id.getKey());
 568       }
 569       else
 570       {
 571         s = al.getSequenceAt(i);
 572       }
 573       i++;
 574       if (s == null && i != scores.size() && !id.getKey().equals("cons"))
 575       {
 576         System.err.println("No "
 577                 + (matchids ? "match " : " sequences left ")
 578                 + " for TCoffee score set : " + id.getKey());
 579         continue;
 580       }
 581       int jSize = al.getWidth() < srow.length ? al.getWidth() : srow.length;
 582       Annotation[] annotations = new Annotation[al.getWidth()];
 583       for (int j = 0; j < jSize; j++)
 584       {
 585         byte val = srow[j];
 586         if (s != null && jalview.util.Comparison.isGap(s.getCharAt(j)))
 587         {
 588           annotations[j] = null;
 589           if (val > 0)
 590           {
 591             System.err
 592                     .println("Warning: non-zero value for positional T-COFFEE score for gap at "
 593                             + j + " in sequence " + s.getName());
 594           }
 595         }
 596         else
 597         {
 598           annotations[j] = new Annotation(s == null ? "" + val : null,
 599                   s == null ? "" + val : null, '\0', val * 1f, val >= 0
 600                           && val < colors.length ? colors[val]
 601                           : Color.white);
 602         }
 603       }
 604       // this will overwrite any existing t-coffee scores for the alignment
 605       AlignmentAnnotation aa = al.findOrCreateAnnotation(TCOFFEE_SCORE,
 606               TCOFFEE_SCORE, false, s, null);
 607       if (s != null)
 608       {
 609         aa.label = "T-COFFEE";
 610         aa.description = "" + id.getKey();
 611         aa.annotations = annotations;
 612         aa.visible = false;
 613         aa.belowAlignment = false;
 614         aa.setScore(header.getScoreFor(id.getKey()));
 615         aa.createSequenceMapping(s, s.getStart(), true);
 616         s.addAlignmentAnnotation(aa);
 617         aa.adjustForAlignment();
 618       }
 619       else
 620       {
 621         aa.graph = AlignmentAnnotation.NO_GRAPH;
 622         aa.label = "T-COFFEE";
 623         aa.description = "TCoffee column reliability score";
 624         aa.annotations = annotations;
 625         aa.belowAlignment = true;
 626         aa.visible = true;
 627         aa.setScore(header.getScoreAvg());
 628       }
 629       aa.showAllColLabels = true;
 630       aa.validateRangeAndDisplay();
 631       added = true;
 632     }
 633
 634     return added;
 635   }
 636
 637   @Override
 638   public String print()
 639   {
 640     // TODO Auto-generated method stub
 641     return "Not valid.";
 642   }
 643 }