src/jalview/io/TCoffeeScoreFile.java

   1 /*
   2  * Jalview - A Sequence Alignment Editor and Viewer ($$Version-Rel$$)
   3  * Copyright (C) $$Year-Rel$$ The Jalview Authors
   4  *
   5  * This file is part of Jalview.
   6  *
   7  * Jalview is free software: you can redistribute it and/or
   8  * modify it under the terms of the GNU General Public License
   9  * as published by the Free Software Foundation, either version 3
  10  * of the License, or (at your option) any later version.
  11  *
  12  * Jalview is distributed in the hope that it will be useful, but
  13  * WITHOUT ANY WARRANTY; without even the implied warranty
  14  * of MERCHANTABILITY or FITNESS FOR A PARTICULAR
  15  * PURPOSE.  See the GNU General Public License for more details.
  16  *
  17  * You should have received a copy of the GNU General Public License
  18  * along with Jalview.  If not, see <http://www.gnu.org/licenses/>.
  19  * The Jalview Authors are detailed in the 'AUTHORS' file.
  20  */
  21 package jalview.io;
  22
  23 import jalview.analysis.SequenceIdMatcher;
  24 import jalview.datamodel.AlignmentAnnotation;
  25 import jalview.datamodel.AlignmentI;
  26 import jalview.datamodel.Annotation;
  27 import jalview.datamodel.SequenceI;
  28
  29 import java.awt.Color;
  30 import java.io.IOException;
  31 import java.util.ArrayList;
  32 import java.util.HashMap;
  33 import java.util.LinkedHashMap;
  34 import java.util.List;
  35 import java.util.Map;
  36 import java.util.regex.Matcher;
  37 import java.util.regex.Pattern;
  38
  39 /**
 * A file parser for the T-Coffee score ascii format. This file contains the
 * alignment consensus for each residue in any sequence.
 * <p>
 * This file is produced by <code>t_coffee</code> when the option
 * <code>-output=score_ascii</code> is given on the program command line.
  45  *
  46  * An example file is the following
  47  *
  48  * <pre>
  49  * T-COFFEE, Version_9.02.r1228 (2012-02-16 18:15:12 - Revision 1228 - Build 336)
  50  * Cedric Notredame
  51  * CPU TIME:0 sec.
  52  * SCORE=90
  53  * *
  54  *  BAD AVG GOOD
  55  * *
  56  * 1PHT   :  89
  57  * 1BB9   :  90
  58  * 1UHC   :  94
  59  * 1YCS   :  94
  60  * 1OOT   :  93
  61  * 1ABO   :  94
  62  * 1FYN   :  94
  63  * 1QCF   :  94
  64  * cons   :  90
  65  *
  66  * 1PHT   999999999999999999999999998762112222543211112134
  67  * 1BB9   99999999999999999999999999987-------4322----2234
  68  * 1UHC   99999999999999999999999999987-------5321----2246
  69  * 1YCS   99999999999999999999999999986-------4321----1-35
  70  * 1OOT   999999999999999999999999999861-------3------1135
  71  * 1ABO   99999999999999999999999999986-------422-------34
  72  * 1FYN   99999999999999999999999999985-------32--------35
  73  * 1QCF   99999999999999999999999999974-------2---------24
  74  * cons   999999999999999999999999999851000110321100001134
  75  *
  76  *
  77  * 1PHT   ----------5666642367889999999999889
  78  * 1BB9   1111111111676653-355679999999999889
  79  * 1UHC   ----------788774--66789999999999889
  80  * 1YCS   ----------78777--356789999999999889
  81  * 1OOT   ----------78877--356789999999997-67
  82  * 1ABO   ----------687774--56779999999999889
  83  * 1FYN   ----------6888842356789999999999889
  84  * 1QCF   ----------6878742356789999999999889
  85  * cons   00100000006877641356789999999999889
  86  * </pre>
  87  *
  88  *
  89  * @author Paolo Di Tommaso
  90  *
  91  */
  92 public class TCoffeeScoreFile extends AlignFile
  93 {
  /**
   * Creates a T-Coffee score file reader; both arguments are passed straight
   * to the superclass constructor.
   *
   * @param inFile
   *          the input to read, as understood by the superclass
   * @param type
   *          the input type, as understood by the superclass
   * @throws IOException
   *           if the superclass constructor fails
   */
  public TCoffeeScoreFile(String inFile, String type) throws IOException
  {
    super(inFile, type);
  }

  /**
   * Creates a T-Coffee score file reader on top of an existing
   * {@link FileParse} source, which is passed straight to the superclass
   * constructor.
   *
   * @param source
   *          the source to read from
   * @throws IOException
   *           if the superclass constructor fails
   */
  public TCoffeeScoreFile(FileParse source) throws IOException
  {
    super(source);
  }
 104
 105   /** The {@link Header} structure holder */
 106   Header header;
 107
 108   /**
 109    * Holds the consensues values for each sequences. It uses a LinkedHashMap to
 110    * maintaint the insertion order.
 111    */
 112   LinkedHashMap<String, StringBuilder> scores;
 113
 114   Integer fWidth;
 115
  /*
   * Commented-out static loader (not compiled). Its original description was:
   * "Parse the provided reader for the T-Coffee scores file format".
   *
   * public static TCoffeeScoreFile load(Reader reader) {
   *   try {
   *     BufferedReader in = (BufferedReader) (reader instanceof BufferedReader
   *             ? reader : new BufferedReader(reader));
   *     TCoffeeScoreFile result = new TCoffeeScoreFile();
   *     result.doParsing(in);
   *     return result.header != null && result.scores != null ? result : null;
   *   } catch (Exception e) {
   *     throw new RuntimeException(e);
   *   }
   * }
   */
 129
 130   /**
 131    * @return The 'height' of the score matrix i.e. the numbers of score rows
 132    *         that should matches the number of sequences in the alignment
 133    */
 134   public int getHeight()
 135   {
 136     // the last entry will always be the 'global' alingment consensus scores, so
 137     // it is removed
 138     // from the 'height' count to make this value compatible with the number of
 139     // sequences in the MSA
 140     return scores != null && scores.size() > 0 ? scores.size() - 1 : 0;
 141   }
 142
 143   /**
 144    * @return The 'width' of the score matrix i.e. the number of columns. Since
 145    *         the score value are supposed to be calculated for an 'aligned' MSA,
 146    *         all the entries have to have the same width.
 147    */
 148   public int getWidth()
 149   {
 150     return fWidth != null ? fWidth : 0;
 151   }
 152
 153   /**
 154    * Get the string of score values for the specified seqeunce ID.
 155    *
 156    * @param id
 157    *          The sequence ID
 158    * @return The scores as a string of values e.g. {@code 99999987-------432}.
 159    *         It return an empty string when the specified ID is missing.
 160    */
 161   public String getScoresFor(String id)
 162   {
 163     return scores != null && scores.containsKey(id) ? scores.get(id)
 164             .toString() : "";
 165   }
 166
 167   /**
 168    * @return The list of score string as a {@link List} object, in the same
 169    *         ordeer of the insertion i.e. in the MSA
 170    */
 171   public List<String> getScoresList()
 172   {
 173     if (scores == null)
 174     {
 175       return null;
 176     }
 177     List<String> result = new ArrayList<String>(scores.size());
 178     for (Map.Entry<String, StringBuilder> it : scores.entrySet())
 179     {
 180       result.add(it.getValue().toString());
 181     }
 182
 183     return result;
 184   }
 185
 186   /**
 187    * @return The parsed score values a matrix of bytes
 188    */
 189   public byte[][] getScoresArray()
 190   {
 191     if (scores == null)
 192     {
 193       return null;
 194     }
 195     byte[][] result = new byte[scores.size()][];
 196
 197     int rowCount = 0;
 198     for (Map.Entry<String, StringBuilder> it : scores.entrySet())
 199     {
 200       String line = it.getValue().toString();
 201       byte[] seqValues = new byte[line.length()];
 202       for (int j = 0, c = line.length(); j < c; j++)
 203       {
 204
 205         byte val = (byte) (line.charAt(j) - '0');
 206
 207         seqValues[j] = (val >= 0 && val <= 9) ? val : -1;
 208       }
 209
 210       result[rowCount++] = seqValues;
 211     }
 212
 213     return result;
 214   }
 215
 216   public void parse() throws IOException
 217   {
 218     /*
 219      * read the header
 220      */
 221     header = readHeader(this);
 222
 223     if (header == null)
 224     {
 225       error = true;
 226       return;
 227     }
 228     scores = new LinkedHashMap<String, StringBuilder>();
 229
 230     /*
 231      * initilize the structure
 232      */
 233     for (Map.Entry<String, Integer> entry : header.scores.entrySet())
 234     {
 235       scores.put(entry.getKey(), new StringBuilder());
 236     }
 237
 238     /*
 239      * go with the reading
 240      */
 241     Block block;
 242     while ((block = readBlock(this, header.scores.size())) != null)
 243     {
 244
 245       /*
 246        * append sequences read in the block
 247        */
 248       for (Map.Entry<String, String> entry : block.items.entrySet())
 249       {
 250         StringBuilder scoreStringBuilder = scores.get(entry.getKey());
 251         if (scoreStringBuilder == null)
 252         {
 253           error = true;
 254           errormessage = String
 255                   .format("Invalid T-Coffee score file: Sequence ID '%s' is not declared in header section",
 256                           entry.getKey());
 257           return;
 258         }
 259
 260         scoreStringBuilder.append(entry.getValue());
 261       }
 262     }
 263
 264     /*
 265      * verify that all rows have the same width
 266      */
 267     for (StringBuilder str : scores.values())
 268     {
 269       if (fWidth == null)
 270       {
 271         fWidth = str.length();
 272       }
 273       else if (fWidth != str.length())
 274       {
 275         error = true;
 276         errormessage = "Invalid T-Coffee score file: All the score sequences must have the same length";
 277         return;
 278       }
 279     }
 280
 281     return;
 282   }
 283
 284   static int parseInt(String str)
 285   {
 286     try
 287     {
 288       return Integer.parseInt(str);
 289     } catch (NumberFormatException e)
 290     {
 291       // TODO report a warning ?
 292       return 0;
 293     }
 294   }
 295
 296   /**
 297    * Reaad the header section in the T-Coffee score file format
 298    *
 299    * @param reader
 300    *          The scores reader
 301    * @return The parser {@link Header} instance
 302    * @throws RuntimeException
 303    *           when the header is not in the expected format
 304    */
 305   static Header readHeader(FileParse reader) throws IOException
 306   {
 307
 308     Header result = null;
 309     try
 310     {
 311       result = new Header();
 312       result.head = reader.nextLine();
 313
 314       String line;
 315
 316       while ((line = reader.nextLine()) != null)
 317       {
 318         if (line.startsWith("SCORE="))
 319         {
 320           result.score = parseInt(line.substring(6).trim());
 321           break;
 322         }
 323       }
 324
 325       if ((line = reader.nextLine()) == null || !"*".equals(line.trim()))
 326       {
 327         error(reader,
 328                 "Invalid T-COFFEE score format (NO BAD/AVG/GOOD header)");
 329         return null;
 330       }
 331       if ((line = reader.nextLine()) == null
 332               || !"BAD AVG GOOD".equals(line.trim()))
 333       {
 334         error(reader,
 335                 "Invalid T-COFFEE score format (NO BAD/AVG/GOOD header)");
 336         return null;
 337       }
 338       if ((line = reader.nextLine()) == null || !"*".equals(line.trim()))
 339       {
 340         error(reader,
 341                 "Invalid T-COFFEE score format (NO BAD/AVG/GOOD header)");
 342         return null;
 343       }
 344
 345       /*
 346        * now are expected a list if sequences ID up to the first blank line
 347        */
 348       while ((line = reader.nextLine()) != null)
 349       {
 350         if ("".equals(line))
 351         {
 352           break;
 353         }
 354
 355         int p = line.indexOf(":");
 356         if (p == -1)
 357         {
 358           // TODO report a warning
 359           continue;
 360         }
 361
 362         String id = line.substring(0, p).trim();
 363         int val = parseInt(line.substring(p + 1).trim());
 364         if ("".equals(id))
 365         {
 366           // TODO report warning
 367           continue;
 368         }
 369
 370         result.scores.put(id, val);
 371       }
 372
 373       if (result == null)
 374       {
 375         error(reader, "T-COFFEE score file had no per-sequence scores");
 376       }
 377
 378     } catch (IOException e)
 379     {
 380       error(reader, "Unexpected problem parsing T-Coffee score ascii file");
 381       throw e;
 382     }
 383
 384     return result;
 385   }
 386
 387   private static void error(FileParse reader, String errm)
 388   {
 389     reader.error = true;
 390     if (reader.errormessage == null)
 391     {
 392       reader.errormessage = errm;
 393     }
 394     else
 395     {
 396       reader.errormessage += "\n" + errm;
 397     }
 398   }
 399
 400   static Pattern SCORES_WITH_RESIDUE_NUMS = Pattern
 401           .compile("^\\d+\\s([^\\s]+)\\s+\\d+$");
 402
 403   /**
 404    * Read a scores block ihe provided stream.
 405    *
 406    * @param reader
 407    *          The stream to parse
 408    * @param size
 409    *          The expected number of the sequence to be read
 410    * @return The {@link Block} instance read or {link null} null if the end of
 411    *         file has reached.
 412    * @throws IOException
 413    *           Something went wrong on the 'wire'
 414    */
 415   static Block readBlock(FileParse reader, int size) throws IOException
 416   {
 417     Block result = new Block(size);
 418     String line;
 419
 420     /*
 421      * read blank lines (eventually)
 422      */
 423     while ((line = reader.nextLine()) != null && "".equals(line.trim()))
 424     {
 425       // consume blank lines
 426     }
 427
 428     if (line == null)
 429     {
 430       return null;
 431     }
 432
 433     /*
 434      * read the scores block
 435      */
 436     do
 437     {
 438       if ("".equals(line.trim()))
 439       {
 440         // terminated
 441         break;
 442       }
 443
 444       // split the line on the first blank
 445       // the first part have to contain the sequence id
 446       // the remaining part are the scores values
 447       int p = line.indexOf(" ");
 448       if (p == -1)
 449       {
 450         if (reader.warningMessage == null)
 451         {
 452           reader.warningMessage = "";
 453         }
 454         reader.warningMessage += "Possible parsing error - expected to find a space in line: '"
 455                 + line + "'\n";
 456         continue;
 457       }
 458
 459       String id = line.substring(0, p).trim();
 460       String val = line.substring(p + 1).trim();
 461
 462       Matcher m = SCORES_WITH_RESIDUE_NUMS.matcher(val);
 463       if (m.matches())
 464       {
 465         val = m.group(1);
 466       }
 467
 468       result.items.put(id, val);
 469
 470     } while ((line = reader.nextLine()) != null);
 471
 472     return result;
 473   }
 474
 475   /*
 476    * The score file header
 477    */
 478   static class Header
 479   {
 480     String head;
 481
 482     int score;
 483
 484     LinkedHashMap<String, Integer> scores = new LinkedHashMap<String, Integer>();
 485
 486     public int getScoreAvg()
 487     {
 488       return score;
 489     }
 490
 491     public int getScoreFor(String ID)
 492     {
 493
 494       return scores.containsKey(ID) ? scores.get(ID) : -1;
 495
 496     }
 497   }
 498
 499   /*
 500    * Hold a single block values block in the score file
 501    */
 502   static class Block
 503   {
 504     int size;
 505
 506     Map<String, String> items;
 507
 508     public Block(int size)
 509     {
 510       this.size = size;
 511       this.items = new HashMap<String, String>(size);
 512     }
 513
 514     String getScoresFor(String id)
 515     {
 516       return items.get(id);
 517     }
 518
 519     String getConsensus()
 520     {
 521       return items.get("cons");
 522     }
 523   }
 524
 525   /**
 526    * TCOFFEE score colourscheme
 527    */
 528   static final Color[] colors = { new Color(102, 102, 255), // #6666FF
 529       new Color(0, 255, 0), // #00FF00
 530       new Color(102, 255, 0), // #66FF00
 531       new Color(204, 255, 0), // #CCFF00
 532       new Color(255, 255, 0), // #FFFF00
 533       new Color(255, 204, 0), // #FFCC00
 534       new Color(255, 153, 0), // #FF9900
 535       new Color(255, 102, 0), // #FF6600
 536       new Color(255, 51, 0), // #FF3300
 537       new Color(255, 34, 0) // #FF2000
 538   };
 539
 540   public final static String TCOFFEE_SCORE = "TCoffeeScore";
 541
 542   /**
 543    * generate annotation for this TCoffee score set on the given alignment
 544    *
 545    * @param al
 546    *          alignment to annotate
 547    * @param matchids
 548    *          if true, annotate sequences based on matching sequence names
 549    * @return true if alignment annotation was modified, false otherwise.
 550    */
 551   public boolean annotateAlignment(AlignmentI al, boolean matchids)
 552   {
 553     if (al.getHeight() != getHeight() || al.getWidth() != getWidth())
 554     {
 555       String info = String.format(
 556               "align w: %s, h: %s; score: w: %s; h: %s ", al.getWidth(),
 557               al.getHeight(), getWidth(), getHeight());
 558       warningMessage = "Alignment shape does not match T-Coffee score file shape -- "
 559               + info;
 560       return false;
 561     }
 562     boolean added = false;
 563     int i = 0;
 564     SequenceIdMatcher sidmatcher = new SequenceIdMatcher(
 565             al.getSequencesArray());
 566     byte[][] scoreMatrix = getScoresArray();
 567     // for 2.8 - we locate any existing TCoffee annotation and remove it first
 568     // before adding this.
 569     for (Map.Entry<String, StringBuilder> id : scores.entrySet())
 570     {
 571       byte[] srow = scoreMatrix[i];
 572       SequenceI s;
 573       if (matchids)
 574       {
 575         s = sidmatcher.findIdMatch(id.getKey());
 576       }
 577       else
 578       {
 579         s = al.getSequenceAt(i);
 580       }
 581       i++;
 582       if (s == null && i != scores.size() && !id.getKey().equals("cons"))
 583       {
 584         System.err.println("No "
 585                 + (matchids ? "match " : " sequences left ")
 586                 + " for TCoffee score set : " + id.getKey());
 587         continue;
 588       }
 589       int jSize = al.getWidth() < srow.length ? al.getWidth() : srow.length;
 590       Annotation[] annotations = new Annotation[al.getWidth()];
 591       for (int j = 0; j < jSize; j++)
 592       {
 593         byte val = srow[j];
 594         if (s != null && jalview.util.Comparison.isGap(s.getCharAt(j)))
 595         {
 596           annotations[j] = null;
 597           if (val > 0)
 598           {
 599             System.err
 600                     .println("Warning: non-zero value for positional T-COFFEE score for gap at "
 601                             + j + " in sequence " + s.getName());
 602           }
 603         }
 604         else
 605         {
 606           annotations[j] = new Annotation(s == null ? "" + val : null,
 607                   s == null ? "" + val : null, '\0', val * 1f, val >= 0
 608                           && val < colors.length ? colors[val]
 609                           : Color.white);
 610         }
 611       }
 612       // this will overwrite any existing t-coffee scores for the alignment
 613       AlignmentAnnotation aa = al.findOrCreateAnnotation(TCOFFEE_SCORE,
 614               TCOFFEE_SCORE, false, s, null);
 615       if (s != null)
 616       {
 617         aa.label = "T-COFFEE";
 618         aa.description = "" + id.getKey();
 619         aa.annotations = annotations;
 620         aa.visible = false;
 621         aa.belowAlignment = false;
 622         aa.setScore(header.getScoreFor(id.getKey()));
 623         aa.createSequenceMapping(s, s.getStart(), true);
 624         s.addAlignmentAnnotation(aa);
 625         aa.adjustForAlignment();
 626       }
 627       else
 628       {
 629         aa.graph = AlignmentAnnotation.NO_GRAPH;
 630         aa.label = "T-COFFEE";
 631         aa.description = "TCoffee column reliability score";
 632         aa.annotations = annotations;
 633         aa.belowAlignment = true;
 634         aa.visible = true;
 635         aa.setScore(header.getScoreAvg());
 636       }
 637       aa.showAllColLabels = true;
 638       aa.validateRangeAndDisplay();
 639       added = true;
 640     }
 641
 642     return added;
 643   }
 644
 645   @Override
 646   public String print()
 647   {
 648     // TODO Auto-generated method stub
 649     return "Not valid.";
 650   }
 651 }