src/jalview/io/TCoffeeScoreFile.java

   1 /*
   2  * Jalview - A Sequence Alignment Editor and Viewer (Version 2.8.2)
   3  * Copyright (C) 2014 The Jalview Authors
   4  *
   5  * This file is part of Jalview.
   6  *
   7  * Jalview is free software: you can redistribute it and/or
   8  * modify it under the terms of the GNU General Public License
   9  * as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version.
  10  *
  11  * Jalview is distributed in the hope that it will be useful, but
  12  * WITHOUT ANY WARRANTY; without even the implied warranty
  13  * of MERCHANTABILITY or FITNESS FOR A PARTICULAR
  14  * PURPOSE.  See the GNU General Public License for more details.
  15  *
  16  * You should have received a copy of the GNU General Public License along with Jalview.  If not, see <http://www.gnu.org/licenses/>.
  17  * The Jalview Authors are detailed in the 'AUTHORS' file.
  18  */
  19 package jalview.io;
  20
  21 import jalview.analysis.SequenceIdMatcher;
  22 import jalview.datamodel.AlignmentAnnotation;
  23 import jalview.datamodel.AlignmentI;
  24 import jalview.datamodel.Annotation;
  25 import jalview.datamodel.SequenceI;
  26
  27 import java.awt.Color;
  28 import java.io.IOException;
  29 import java.util.ArrayList;
  30 import java.util.HashMap;
  31 import java.util.LinkedHashMap;
  32 import java.util.List;
  33 import java.util.Map;
  34 import java.util.regex.Matcher;
  35 import java.util.regex.Pattern;
  36
  37 /**
  38  * A file parse for T-Coffee score ascii format. This file contains the
  39  * alignment consensus for each resude in any sequence.
  40  * <p>
  41  * This file is procuded by <code>t_coffee</code> providing the option
  42  * <code>-output=score_ascii </code> to the program command line
  43  *
  44  * An example file is the following
  45  *
  46  * <pre>
  47  * T-COFFEE, Version_9.02.r1228 (2012-02-16 18:15:12 - Revision 1228 - Build 336)
  48  * Cedric Notredame
  49  * CPU TIME:0 sec.
  50  * SCORE=90
  51  * *
  52  *  BAD AVG GOOD
  53  * *
  54  * 1PHT   :  89
  55  * 1BB9   :  90
  56  * 1UHC   :  94
  57  * 1YCS   :  94
  58  * 1OOT   :  93
  59  * 1ABO   :  94
  60  * 1FYN   :  94
  61  * 1QCF   :  94
  62  * cons   :  90
  63  *
  64  * 1PHT   999999999999999999999999998762112222543211112134
  65  * 1BB9   99999999999999999999999999987-------4322----2234
  66  * 1UHC   99999999999999999999999999987-------5321----2246
  67  * 1YCS   99999999999999999999999999986-------4321----1-35
  68  * 1OOT   999999999999999999999999999861-------3------1135
  69  * 1ABO   99999999999999999999999999986-------422-------34
  70  * 1FYN   99999999999999999999999999985-------32--------35
  71  * 1QCF   99999999999999999999999999974-------2---------24
  72  * cons   999999999999999999999999999851000110321100001134
  73  *
  74  *
  75  * 1PHT   ----------5666642367889999999999889
  76  * 1BB9   1111111111676653-355679999999999889
  77  * 1UHC   ----------788774--66789999999999889
  78  * 1YCS   ----------78777--356789999999999889
  79  * 1OOT   ----------78877--356789999999997-67
  80  * 1ABO   ----------687774--56779999999999889
  81  * 1FYN   ----------6888842356789999999999889
  82  * 1QCF   ----------6878742356789999999999889
  83  * cons   00100000006877641356789999999999889
  84  * </pre>
  85  *
  86  *
  87  * @author Paolo Di Tommaso
  88  *
  89  */
  90 public class TCoffeeScoreFile extends AlignFile {
  91   public TCoffeeScoreFile(String inFile, String type) throws IOException
  92   {
  93     super(inFile, type);
  94
  95   }
  96
  97   public TCoffeeScoreFile(FileParse source) throws IOException
  98   {
  99     super(source);
 100   }
 101
 102   /** The {@link Header} structure holder */
 103   Header header;
 104
 105   /**
 106    * Holds the consensues values for each sequences. It uses a LinkedHashMap to
 107    * maintaint the insertion order.
 108    */
 109   LinkedHashMap<String, StringBuilder> scores;
 110
 111   Integer fWidth;
 112
 113   /**
 114    * Parse the provided reader for the T-Coffee scores file format
 115    *
 116    * @param reader
 117    *          public static TCoffeeScoreFile load(Reader reader) {
 118    *
 119    *          try { BufferedReader in = (BufferedReader) (reader instanceof
 120    *          BufferedReader ? reader : new BufferedReader(reader));
 121    *          TCoffeeScoreFile result = new TCoffeeScoreFile();
 122    *          result.doParsing(in); return result.header != null &&
 123    *          result.scores != null ? result : null; } catch( Exception e) {
 124    *          throw new RuntimeException(e); } }
 125    */
 126
 127   /**
 128    * @return The 'height' of the score matrix i.e. the numbers of score rows
 129    *         that should matches the number of sequences in the alignment
 130    */
 131   public int getHeight()
 132   {
 133     // the last entry will always be the 'global' alingment consensus scores, so
 134     // it is removed
 135     // from the 'height' count to make this value compatible with the number of
 136     // sequences in the MSA
 137     return scores != null && scores.size() > 0 ? scores.size() - 1 : 0;
 138   }
 139
 140   /**
 141    * @return The 'width' of the score matrix i.e. the number of columns. Since
 142    *         the score value are supposed to be calculated for an 'aligned' MSA,
 143    *         all the entries have to have the same width.
 144    */
 145   public int getWidth()
 146   {
 147     return fWidth != null ? fWidth : 0;
 148   }
 149
 150   /**
 151    * Get the string of score values for the specified seqeunce ID.
 152    *
 153    * @param id
 154    *          The sequence ID
 155    * @return The scores as a string of values e.g. {@code 99999987-------432}.
 156    *         It return an empty string when the specified ID is missing.
 157    */
 158   public String getScoresFor(String id)
 159   {
 160     return scores != null && scores.containsKey(id) ? scores.get(id)
 161             .toString() : "";
 162   }
 163
 164   /**
 165    * @return The list of score string as a {@link List} object, in the same
 166    *         ordeer of the insertion i.e. in the MSA
 167    */
 168   public List<String> getScoresList()
 169   {
 170     if (scores == null)
 171     {
 172       return null;
 173     }
 174     List<String> result = new ArrayList<String>(scores.size());
 175     for (Map.Entry<String, StringBuilder> it : scores.entrySet())
 176     {
 177       result.add(it.getValue().toString());
 178     }
 179
 180     return result;
 181   }
 182
 183   /**
 184    * @return The parsed score values a matrix of bytes
 185    */
 186   public byte[][] getScoresArray()
 187   {
 188     if (scores == null)
 189     {
 190       return null;
 191     }
 192     byte[][] result = new byte[scores.size()][];
 193
 194     int rowCount = 0;
 195     for (Map.Entry<String, StringBuilder> it : scores.entrySet())
 196     {
 197       String line = it.getValue().toString();
 198       byte[] seqValues = new byte[line.length()];
 199       for (int j = 0, c = line.length(); j < c; j++)
 200       {
 201
 202         byte val = (byte) (line.charAt(j) - '0');
 203
 204         seqValues[j] = (val >= 0 && val <= 9) ? val : -1;
 205       }
 206
 207       result[rowCount++] = seqValues;
 208     }
 209
 210     return result;
 211   }
 212
 213   public void parse() throws IOException
 214   {
 215     /*
 216      * read the header
 217      */
 218     header = readHeader(this);
 219
 220     if (header == null)
 221     {
 222       error = true;
 223       return;
 224     }
 225     scores = new LinkedHashMap<String, StringBuilder>();
 226
 227     /*
 228      * initilize the structure
 229      */
 230     for (Map.Entry<String, Integer> entry : header.scores.entrySet())
 231     {
 232       scores.put(entry.getKey(), new StringBuilder());
 233     }
 234
 235     /*
 236      * go with the reading
 237      */
 238     Block block;
 239     while ((block = readBlock(this, header.scores.size())) != null)
 240     {
 241
 242       /*
 243        * append sequences read in the block
 244        */
 245       for (Map.Entry<String, String> entry : block.items.entrySet())
 246       {
 247         StringBuilder scoreStringBuilder = scores.get(entry.getKey());
 248         if (scoreStringBuilder == null)
 249         {
 250           error = true;
 251           errormessage = String
 252                   .format("Invalid T-Coffee score file: Sequence ID '%s' is not declared in header section",
 253                           entry.getKey());
 254           return;
 255         }
 256
 257         scoreStringBuilder.append(entry.getValue());
 258       }
 259     }
 260
 261     /*
 262      * verify that all rows have the same width
 263      */
 264     for (StringBuilder str : scores.values())
 265     {
 266       if (fWidth == null)
 267       {
 268         fWidth = str.length();
 269       }
 270       else if (fWidth != str.length())
 271       {
 272         error = true;
 273         errormessage = "Invalid T-Coffee score file: All the score sequences must have the same length";
 274         return;
 275       }
 276     }
 277
 278     return;
 279   }
 280
 281   static int parseInt(String str)
 282   {
 283     try
 284     {
 285       return Integer.parseInt(str);
 286     } catch (NumberFormatException e)
 287     {
 288       // TODO report a warning ?
 289       return 0;
 290     }
 291   }
 292
 293   /**
 294    * Reaad the header section in the T-Coffee score file format
 295    *
 296    * @param reader
 297    *          The scores reader
 298    * @return The parser {@link Header} instance
 299    * @throws RuntimeException
 300    *           when the header is not in the expected format
 301    */
 302   static Header readHeader(FileParse reader) throws IOException
 303   {
 304
 305     Header result = null;
 306     try
 307     {
 308       result = new Header();
 309       result.head = reader.nextLine();
 310
 311       String line;
 312
 313       while ((line = reader.nextLine()) != null)
 314       {
 315         if (line.startsWith("SCORE="))
 316         {
 317           result.score = parseInt(line.substring(6).trim());
 318           break;
 319         }
 320       }
 321
 322       if ((line = reader.nextLine()) == null || !"*".equals(line.trim()))
 323       {
 324         error(reader,
 325                 "Invalid T-COFFEE score format (NO BAD/AVG/GOOD header)");
 326         return null;
 327       }
 328       if ((line = reader.nextLine()) == null
 329               || !"BAD AVG GOOD".equals(line.trim()))
 330       {
 331         error(reader,
 332                 "Invalid T-COFFEE score format (NO BAD/AVG/GOOD header)");
 333         return null;
 334       }
 335       if ((line = reader.nextLine()) == null || !"*".equals(line.trim()))
 336       {
 337         error(reader,
 338                 "Invalid T-COFFEE score format (NO BAD/AVG/GOOD header)");
 339         return null;
 340       }
 341
 342       /*
 343        * now are expected a list if sequences ID up to the first blank line
 344        */
 345       while ((line = reader.nextLine()) != null)
 346       {
 347         if ("".equals(line))
 348         {
 349           break;
 350         }
 351
 352         int p = line.indexOf(":");
 353         if (p == -1)
 354         {
 355           // TODO report a warning
 356           continue;
 357         }
 358
 359         String id = line.substring(0, p).trim();
 360         int val = parseInt(line.substring(p + 1).trim());
 361         if ("".equals(id))
 362         {
 363           // TODO report warning
 364           continue;
 365         }
 366
 367         result.scores.put(id, val);
 368       }
 369
 370       if (result == null)
 371       {
 372         error(reader, "T-COFFEE score file had no per-sequence scores");
 373       }
 374
 375     } catch (IOException e)
 376     {
 377       error(reader, "Unexpected problem parsing T-Coffee score ascii file");
 378       throw e;
 379     }
 380
 381     return result;
 382   }
 383
 384   private static void error(FileParse reader, String errm)
 385   {
 386     reader.error = true;
 387     if (reader.errormessage == null)
 388     {
 389       reader.errormessage = errm;
 390     }
 391     else
 392     {
 393       reader.errormessage += "\n" + errm;
 394     }
 395   }
 396
 397   static Pattern SCORES_WITH_RESIDUE_NUMS = Pattern.compile("^\\d+\\s([^\\s]+)\\s+\\d+$");
 398
 399   /**
 400    * Read a scores block ihe provided stream.
 401    *
 402    * @param reader
 403    *          The stream to parse
 404    * @param size
 405    *          The expected number of the sequence to be read
 406    * @return The {@link Block} instance read or {link null} null if the end of
 407    *         file has reached.
 408    * @throws IOException
 409    *           Something went wrong on the 'wire'
 410    */
 411   static Block readBlock(FileParse reader, int size) throws IOException
 412   {
 413     Block result = new Block(size);
 414     String line;
 415
 416     /*
 417      * read blank lines (eventually)
 418      */
 419     while ((line = reader.nextLine()) != null && "".equals(line.trim()))
 420     {
 421       // consume blank lines
 422     }
 423
 424     if (line == null)
 425     {
 426       return null;
 427     }
 428
 429     /*
 430      * read the scores block
 431      */
 432     do
 433     {
 434       if ("".equals(line.trim()))
 435       {
 436         // terminated
 437         break;
 438       }
 439
 440       // split the line on the first blank
 441       // the first part have to contain the sequence id
 442       // the remaining part are the scores values
 443       int p = line.indexOf(" ");
 444       if (p == -1)
 445       {
 446         if (reader.warningMessage == null)
 447         {
 448           reader.warningMessage = "";
 449         }
 450         reader.warningMessage += "Possible parsing error - expected to find a space in line: '"
 451                 + line + "'\n";
 452         continue;
 453       }
 454
 455       String id = line.substring(0, p).trim();
 456       String val = line.substring(p + 1).trim();
 457
 458       Matcher m = SCORES_WITH_RESIDUE_NUMS.matcher(val);
 459       if( m.matches() ) {
 460           val = m.group(1);
 461       }
 462
 463       result.items.put(id, val);
 464
 465     } while ((line = reader.nextLine()) != null);
 466
 467     return result;
 468   }
 469
 470   /*
 471    * The score file header
 472    */
 473   static class Header
 474   {
 475     String head;
 476
 477     int score;
 478
 479     LinkedHashMap<String, Integer> scores = new LinkedHashMap<String, Integer>();
 480
 481     public int getScoreAvg()
 482     {
 483       return score;
 484     }
 485
 486     public int getScoreFor(String ID)
 487     {
 488
 489       return scores.containsKey(ID) ? scores.get(ID) : -1;
 490
 491     }
 492   }
 493
 494   /*
 495    * Hold a single block values block in the score file
 496    */
 497   static class Block
 498   {
 499     int size;
 500
 501     Map<String, String> items;
 502
 503     public Block(int size)
 504     {
 505       this.size = size;
 506       this.items = new HashMap<String, String>(size);
 507     }
 508
 509     String getScoresFor(String id)
 510     {
 511       return items.get(id);
 512     }
 513
 514     String getConsensus()
 515     {
 516       return items.get("cons");
 517     }
 518   }
 519
 520   /**
 521    * TCOFFEE score colourscheme
 522    */
 523   static final Color[] colors =
 524   { new Color(102, 102, 255), // #6666FF
 525       new Color(0, 255, 0), // #00FF00
 526       new Color(102, 255, 0), // #66FF00
 527       new Color(204, 255, 0), // #CCFF00
 528       new Color(255, 255, 0), // #FFFF00
 529       new Color(255, 204, 0), // #FFCC00
 530       new Color(255, 153, 0), // #FF9900
 531       new Color(255, 102, 0), // #FF6600
 532       new Color(255, 51, 0), // #FF3300
 533       new Color(255, 34, 0) // #FF2000
 534   };
 535
 536   public final static String TCOFFEE_SCORE = "TCoffeeScore";
 537
 538   /**
 539    * generate annotation for this TCoffee score set on the given alignment
 540    *
 541    * @param al
 542    *          alignment to annotate
 543    * @param matchids
 544    *          if true, annotate sequences based on matching sequence names
 545    * @return true if alignment annotation was modified, false otherwise.
 546    */
 547   public boolean annotateAlignment(AlignmentI al, boolean matchids)
 548   {
 549     if (al.getHeight() != getHeight() || al.getWidth() != getWidth())
 550     {
 551       String info = String.format("align w: %s, h: %s; score: w: %s; h: %s ", al.getWidth(), al.getHeight(), getWidth(), getHeight() );
 552       warningMessage = "Alignment shape does not match T-Coffee score file shape -- " + info;
 553       return false;
 554     }
 555     boolean added = false;
 556     int i = 0;
 557     SequenceIdMatcher sidmatcher = new SequenceIdMatcher(
 558             al.getSequencesArray());
 559     byte[][] scoreMatrix = getScoresArray();
 560     // for 2.8 - we locate any existing TCoffee annotation and remove it first
 561     // before adding this.
 562     for (Map.Entry<String, StringBuilder> id : scores.entrySet())
 563     {
 564       byte[] srow = scoreMatrix[i];
 565       SequenceI s;
 566       if (matchids)
 567       {
 568         s = sidmatcher.findIdMatch(id.getKey());
 569       }
 570       else
 571       {
 572         s = al.getSequenceAt(i);
 573       }
 574       i++;
 575       if (s == null && i != scores.size() && !id.getKey().equals("cons"))
 576       {
 577         System.err.println("No "
 578                 + (matchids ? "match " : " sequences left ")
 579                 + " for TCoffee score set : " + id.getKey());
 580         continue;
 581       }
 582       int jSize = al.getWidth() < srow.length ? al.getWidth() : srow.length;
 583       Annotation[] annotations = new Annotation[al.getWidth()];
 584       for (int j = 0; j < jSize; j++)
 585       {
 586         byte val = srow[j];
 587         if (s != null && jalview.util.Comparison.isGap(s.getCharAt(j)))
 588         {
 589           annotations[j] = null;
 590           if (val > 0)
 591           {
 592             System.err
 593                     .println("Warning: non-zero value for positional T-COFFEE score for gap at "
 594                             + j + " in sequence " + s.getName());
 595           }
 596         }
 597         else
 598         {
 599           annotations[j] = new Annotation(s == null ? "" + val : null,
 600                   s == null ? "" + val : null, '\0', val * 1f, val >= 0
 601                           && val < colors.length ? colors[val]
 602                           : Color.white);
 603         }
 604       }
 605       // this will overwrite any existing t-coffee scores for the alignment
 606       AlignmentAnnotation aa = al.findOrCreateAnnotation(TCOFFEE_SCORE,
 607               TCOFFEE_SCORE, false, s, null);
 608       if (s != null)
 609       {
 610         aa.label = "T-COFFEE";
 611         aa.description = "" + id.getKey();
 612         aa.annotations = annotations;
 613         aa.visible = false;
 614         aa.belowAlignment = false;
 615         aa.setScore(header.getScoreFor(id.getKey()));
 616         aa.createSequenceMapping(s, s.getStart(), true);
 617         s.addAlignmentAnnotation(aa);
 618         aa.adjustForAlignment();
 619       }
 620       else
 621       {
 622         aa.graph = AlignmentAnnotation.NO_GRAPH;
 623         aa.label = "T-COFFEE";
 624         aa.description = "TCoffee column reliability score";
 625         aa.annotations = annotations;
 626         aa.belowAlignment = true;
 627         aa.visible = true;
 628         aa.setScore(header.getScoreAvg());
 629       }
 630       aa.showAllColLabels = true;
 631       aa.validateRangeAndDisplay();
 632       added = true;
 633     }
 634
 635     return added;
 636   }
 637
 638   @Override
 639   public String print()
 640   {
 641     // TODO Auto-generated method stub
 642     return "Not valid.";
 643   }
 644 }