src/jalview/io/TCoffeeScoreFile.java

   1 package jalview.io;
   2
   3 import jalview.analysis.SequenceIdMatcher;
   4 import jalview.datamodel.AlignmentAnnotation;
   5 import jalview.datamodel.AlignmentI;
   6 import jalview.datamodel.Annotation;
   7 import jalview.datamodel.SequenceI;
   8
   9 import java.awt.Color;
  10 import java.io.BufferedReader;
  11 import java.io.File;
  12 import java.io.FileNotFoundException;
  13 import java.io.FileReader;
  14 import java.io.IOException;
  15 import java.io.Reader;
  16 import java.util.ArrayList;
  17 import java.util.HashMap;
  18 import java.util.LinkedHashMap;
  19 import java.util.List;
  20 import java.util.Map;
  21
  22 /**
  23  * A file parse for T-Coffee score ascii format. This file contains the
  24  * alignment consensus for each resude in any sequence.
  25  * <p>
  26  * This file is procuded by <code>t_coffee</code> providing the option
  27  * <code>-output=score_ascii </code> to the program command line
  28  *
  29  * An example file is the following
  30  *
  31  * <pre>
  32  * T-COFFEE, Version_9.02.r1228 (2012-02-16 18:15:12 - Revision 1228 - Build 336)
  33  * Cedric Notredame
  34  * CPU TIME:0 sec.
  35  * SCORE=90
  36  * *
  37  *  BAD AVG GOOD
  38  * *
  39  * 1PHT   :  89
  40  * 1BB9   :  90
  41  * 1UHC   :  94
  42  * 1YCS   :  94
  43  * 1OOT   :  93
  44  * 1ABO   :  94
  45  * 1FYN   :  94
  46  * 1QCF   :  94
  47  * cons   :  90
  48  *
  49  * 1PHT   999999999999999999999999998762112222543211112134
  50  * 1BB9   99999999999999999999999999987-------4322----2234
  51  * 1UHC   99999999999999999999999999987-------5321----2246
  52  * 1YCS   99999999999999999999999999986-------4321----1-35
  53  * 1OOT   999999999999999999999999999861-------3------1135
  54  * 1ABO   99999999999999999999999999986-------422-------34
  55  * 1FYN   99999999999999999999999999985-------32--------35
  56  * 1QCF   99999999999999999999999999974-------2---------24
  57  * cons   999999999999999999999999999851000110321100001134
  58  *
  59  *
  60  * 1PHT   ----------5666642367889999999999889
  61  * 1BB9   1111111111676653-355679999999999889
  62  * 1UHC   ----------788774--66789999999999889
  63  * 1YCS   ----------78777--356789999999999889
  64  * 1OOT   ----------78877--356789999999997-67
  65  * 1ABO   ----------687774--56779999999999889
  66  * 1FYN   ----------6888842356789999999999889
  67  * 1QCF   ----------6878742356789999999999889
  68  * cons   00100000006877641356789999999999889
  69  * </pre>
  70  *
  71  *
  72  * @author Paolo Di Tommaso
  73  *
  74  */
  75 public class TCoffeeScoreFile extends AlignFile
  76 {
  77
  78   public TCoffeeScoreFile(String inFile, String type) throws IOException
  79   {
  80     super(inFile, type);
  81
  82   }
  83
  84   public TCoffeeScoreFile(FileParse source) throws IOException
  85   {
  86     super(source);
  87   }
  88
  89   /** The {@link Header} structure holder */
  90   Header header;
  91
  92   /**
  93    * Holds the consensues values for each sequences. It uses a LinkedHashMap to
  94    * maintaint the insertion order.
  95    */
  96   LinkedHashMap<String, StringBuilder> scores;
  97
  98   Integer fWidth;
  99
 100   /**
 101    * Parse the provided reader for the T-Coffee scores file format
 102    *
 103    * @param reader
 104    *          public static TCoffeeScoreFile load(Reader reader) {
 105    *
 106    *          try { BufferedReader in = (BufferedReader) (reader instanceof
 107    *          BufferedReader ? reader : new BufferedReader(reader));
 108    *          TCoffeeScoreFile result = new TCoffeeScoreFile();
 109    *          result.doParsing(in); return result.header != null &&
 110    *          result.scores != null ? result : null; } catch( Exception e) {
 111    *          throw new RuntimeException(e); } }
 112    */
 113
 114   /**
 115    * @return The 'height' of the score matrix i.e. the numbers of score rows
 116    *         that should matches the number of sequences in the alignment
 117    */
 118   public int getHeight()
 119   {
 120     // the last entry will always be the 'global' alingment consensus scores, so
 121     // it is removed
 122     // from the 'height' count to make this value compatible with the number of
 123     // sequences in the MSA
 124     return scores != null && scores.size() > 0 ? scores.size() - 1 : 0;
 125   }
 126
 127   /**
 128    * @return The 'width' of the score matrix i.e. the number of columns. Since
 129    *         teh score value are supposd to be calculated for an 'aligned' MSA,
 130    *         all the entries have to have the same width.
 131    */
 132   public int getWidth()
 133   {
 134     return fWidth != null ? fWidth : 0;
 135   }
 136
 137   /**
 138    * Get the string of score values for the specified seqeunce ID.
 139    *
 140    * @param id
 141    *          The sequence ID
 142    * @return The scores as a string of values e.g. {@code 99999987-------432}.
 143    *         It return an empty string when the specified ID is missing.
 144    */
 145   public String getScoresFor(String id)
 146   {
 147     return scores != null && scores.containsKey(id) ? scores.get(id)
 148             .toString() : "";
 149   }
 150
 151   /**
 152    * @return The list of score string as a {@link List} object, in the same
 153    *         ordeer of the insertion i.e. in the MSA
 154    */
 155   public List<String> getScoresList()
 156   {
 157     if (scores == null)
 158     {
 159       return null;
 160     }
 161     List<String> result = new ArrayList<String>(scores.size());
 162     for (Map.Entry<String, StringBuilder> it : scores.entrySet())
 163     {
 164       result.add(it.getValue().toString());
 165     }
 166
 167     return result;
 168   }
 169
 170   /**
 171    * @return The parsed score values a matrix of bytes
 172    */
 173   public byte[][] getScoresArray()
 174   {
 175     if (scores == null)
 176     {
 177       return null;
 178     }
 179     byte[][] result = new byte[scores.size()][];
 180
 181     int rowCount = 0;
 182     for (Map.Entry<String, StringBuilder> it : scores.entrySet())
 183     {
 184       String line = it.getValue().toString();
 185       byte[] seqValues = new byte[line.length()];
 186       for (int j = 0, c = line.length(); j < c; j++)
 187       {
 188
 189         byte val = (byte) (line.charAt(j) - '0');
 190
 191         seqValues[j] = (val >= 0 && val <= 9) ? val : -1;
 192       }
 193
 194       result[rowCount++] = seqValues;
 195     }
 196
 197     return result;
 198   }
 199
 200   public void parse() throws IOException
 201   {
 202     /*
 203      * read the header
 204      */
 205     header = readHeader(this);
 206
 207     if (header == null)
 208     {
 209       error = true;
 210       return;
 211     }
 212     scores = new LinkedHashMap<String, StringBuilder>();
 213
 214     /*
 215      * initilize the structure
 216      */
 217     for (Map.Entry<String, Integer> entry : header.scores.entrySet())
 218     {
 219       scores.put(entry.getKey(), new StringBuilder());
 220     }
 221
 222     /*
 223      * go with the reading
 224      */
 225     Block block;
 226     while ((block = readBlock(this, header.scores.size())) != null)
 227     {
 228
 229       /*
 230        * append sequences read in the block
 231        */
 232       for (Map.Entry<String, String> entry : block.items.entrySet())
 233       {
 234         StringBuilder scoreStringBuilder = scores.get(entry.getKey());
 235         if (scoreStringBuilder == null)
 236         {
 237           error = true;
 238           errormessage = String
 239                   .format("Invalid T-Coffee score file: Sequence ID '%s' is not declared in header section",
 240                           entry.getKey());
 241           return;
 242         }
 243
 244         scoreStringBuilder.append(entry.getValue());
 245       }
 246     }
 247
 248     /*
 249      * verify that all rows have the same width
 250      */
 251     for (StringBuilder str : scores.values())
 252     {
 253       if (fWidth == null)
 254       {
 255         fWidth = str.length();
 256       }
 257       else if (fWidth != str.length())
 258       {
 259         error = true;
 260         errormessage = "Invalid T-Coffee score file: All the score sequences must have the same length";
 261         return;
 262       }
 263     }
 264
 265     return;
 266   }
 267
 268   static int parseInt(String str)
 269   {
 270     try
 271     {
 272       return Integer.parseInt(str);
 273     } catch (NumberFormatException e)
 274     {
 275       // TODO report a warning ?
 276       return 0;
 277     }
 278   }
 279
 280   /**
 281    * Reaad the header section in the T-Coffee score file format
 282    *
 283    * @param reader
 284    *          The scores reader
 285    * @return The parser {@link Header} instance
 286    * @throws RuntimeException
 287    *           when the header is not in the expected format
 288    */
 289   static Header readHeader(FileParse reader) throws IOException
 290   {
 291
 292     Header result = null;
 293     try
 294     {
 295       result = new Header();
 296       result.head = reader.nextLine();
 297
 298       String line;
 299
 300       while ((line = reader.nextLine()) != null)
 301       {
 302         if (line.startsWith("SCORE="))
 303         {
 304           result.score = parseInt(line.substring(6).trim());
 305           break;
 306         }
 307       }
 308
 309       if ((line = reader.nextLine()) == null || !"*".equals(line.trim()))
 310       {
 311         error(reader,
 312                 "Invalid T-COFFEE score format (NO BAD/AVG/GOOD header)");
 313         return null;
 314       }
 315       if ((line = reader.nextLine()) == null
 316               || !"BAD AVG GOOD".equals(line.trim()))
 317       {
 318         error(reader,
 319                 "Invalid T-COFFEE score format (NO BAD/AVG/GOOD header)");
 320         return null;
 321       }
 322       if ((line = reader.nextLine()) == null || !"*".equals(line.trim()))
 323       {
 324         error(reader,
 325                 "Invalid T-COFFEE score format (NO BAD/AVG/GOOD header)");
 326         return null;
 327       }
 328
 329       /*
 330        * now are expected a list if sequences ID up to the first blank line
 331        */
 332       while ((line = reader.nextLine()) != null)
 333       {
 334         if ("".equals(line))
 335         {
 336           break;
 337         }
 338
 339         int p = line.indexOf(":");
 340         if (p == -1)
 341         {
 342           // TODO report a warning
 343           continue;
 344         }
 345
 346         String id = line.substring(0, p).trim();
 347         int val = parseInt(line.substring(p + 1).trim());
 348         if ("".equals(id))
 349         {
 350           // TODO report warning
 351           continue;
 352         }
 353
 354         result.scores.put(id, val);
 355       }
 356
 357       if (result == null)
 358       {
 359         error(reader, "T-COFFEE score file had no per-sequence scores");
 360       }
 361
 362     } catch (IOException e)
 363     {
 364       error(reader, "Unexpected problem parsing T-Coffee score ascii file");
 365       throw e;
 366     }
 367
 368     return result;
 369   }
 370
 371   private static void error(FileParse reader, String errm)
 372   {
 373     reader.error = true;
 374     if (reader.errormessage == null)
 375     {
 376       reader.errormessage = errm;
 377     }
 378     else
 379     {
 380       reader.errormessage += "\n" + errm;
 381     }
 382   }
 383
 384   /**
 385    * Read a scores block ihe provided stream.
 386    *
 387    * @param reader
 388    *          The stream to parse
 389    * @param size
 390    *          The expected number of the sequence to be read
 391    * @return The {@link Block} instance read or {link null} null if the end of
 392    *         file has reached.
 393    * @throws IOException
 394    *           Something went wrong on the 'wire'
 395    */
 396   static Block readBlock(FileParse reader, int size) throws IOException
 397   {
 398     Block result = new Block(size);
 399     String line;
 400
 401     /*
 402      * read blank lines (eventually)
 403      */
 404     while ((line = reader.nextLine()) != null && "".equals(line.trim()))
 405     {
 406       // consume blank lines
 407     }
 408
 409     if (line == null)
 410     {
 411       return null;
 412     }
 413
 414     /*
 415      * read the scores block
 416      */
 417     do
 418     {
 419       if ("".equals(line.trim()))
 420       {
 421         // terminated
 422         break;
 423       }
 424
 425       // split the line on the first blank
 426       // the first part have to contain the sequence id
 427       // the remaining part are the scores values
 428       int p = line.indexOf(" ");
 429       if (p == -1)
 430       {
 431         if (reader.warningMessage == null)
 432         {
 433           reader.warningMessage = "";
 434         }
 435         reader.warningMessage += "Possible parsing error - expected to find a space in line: '"
 436                 + line + "'\n";
 437         continue;
 438       }
 439
 440       String id = line.substring(0, p).trim();
 441       String val = line.substring(p + 1).trim();
 442
 443       result.items.put(id, val);
 444
 445     } while ((line = reader.nextLine()) != null);
 446
 447     return result;
 448   }
 449
 450   /*
 451    * The score file header
 452    */
 453   static class Header
 454   {
 455     String head;
 456
 457     int score;
 458
 459     LinkedHashMap<String, Integer> scores = new LinkedHashMap<String, Integer>();
 460
 461     public int getScoreAvg()
 462     {
 463       return score;
 464     }
 465
 466     public int getScoreFor(String ID)
 467     {
 468
 469       return scores.containsKey(ID) ? scores.get(ID) : -1;
 470
 471     }
 472   }
 473
 474   /*
 475    * Hold a single block values block in the score file
 476    */
 477   static class Block
 478   {
 479     int size;
 480
 481     Map<String, String> items;
 482
 483     public Block(int size)
 484     {
 485       this.size = size;
 486       this.items = new HashMap<String, String>(size);
 487     }
 488
 489     String getScoresFor(String id)
 490     {
 491       return items.get(id);
 492     }
 493
 494     String getConsensus()
 495     {
 496       return items.get("cons");
 497     }
 498   }
 499
 500   /**
 501    * TCOFFEE score colourscheme
 502    */
 503   static final Color[] colors =
 504   { new Color(102, 102, 255), // #6666FF
 505       new Color(0, 255, 0), // #00FF00
 506       new Color(102, 255, 0), // #66FF00
 507       new Color(204, 255, 0), // #CCFF00
 508       new Color(255, 255, 0), // #FFFF00
 509       new Color(255, 204, 0), // #FFCC00
 510       new Color(255, 153, 0), // #FF9900
 511       new Color(255, 102, 0), // #FF6600
 512       new Color(255, 51, 0), // #FF3300
 513       new Color(255, 34, 0) // #FF2000
 514   };
 515
 516   public final static String TCOFFEE_SCORE = "TCoffeeScore";
 517
 518   /**
 519    * generate annotation for this TCoffee score set on the given alignment
 520    *
 521    * @param al
 522    *          alignment to annotate
 523    * @param matchids
 524    *          if true, annotate sequences based on matching sequence names
 525    * @return true if alignment annotation was modified, false otherwise.
 526    */
 527   public boolean annotateAlignment(AlignmentI al, boolean matchids)
 528   {
 529     if (al.getHeight() != getHeight() || al.getWidth() != getWidth())
 530     {
 531       warningMessage = "Alignment shape does not match T-Coffee score file shape.";
 532       return false;
 533     }
 534     boolean added = false;
 535     int i = 0;
 536     SequenceIdMatcher sidmatcher = new SequenceIdMatcher(
 537             al.getSequencesArray());
 538     byte[][] scoreMatrix = getScoresArray();
 539     // for 2.8 - we locate any existing TCoffee annotation and remove it first
 540     // before adding this.
 541     for (Map.Entry<String, StringBuilder> id : scores.entrySet())
 542     {
 543       byte[] srow = scoreMatrix[i];
 544       SequenceI s;
 545       if (matchids)
 546       {
 547         s = sidmatcher.findIdMatch(id.getKey());
 548       }
 549       else
 550       {
 551         s = al.getSequenceAt(i);
 552       }
 553       i++;
 554       if (s == null && i != scores.size() && !id.getKey().equals("cons"))
 555       {
 556         System.err.println("No "
 557                 + (matchids ? "match " : " sequences left ")
 558                 + " for TCoffee score set : " + id.getKey());
 559         continue;
 560       }
 561       int jSize = al.getWidth() < srow.length ? al.getWidth() : srow.length;
 562       Annotation[] annotations = new Annotation[al.getWidth()];
 563       for (int j = 0; j < jSize; j++)
 564       {
 565         byte val = srow[j];
 566         if (s != null && jalview.util.Comparison.isGap(s.getCharAt(j)))
 567         {
 568           annotations[j] = null;
 569           if (val > 0)
 570           {
 571             System.err
 572                     .println("Warning: non-zero value for positional T-COFFEE score for gap at "
 573                             + j + " in sequence " + s.getName());
 574           }
 575         }
 576         else
 577         {
 578           annotations[j] = new Annotation(s == null ? "" + val : null,
 579                   s == null ? "" + val : null, '\0', val * 1f, val >= 0
 580                           && val < colors.length ? colors[val]
 581                           : Color.white);
 582         }
 583       }
 584       // this will overwrite any existing t-coffee scores for the alignment
 585       AlignmentAnnotation aa = al.findOrCreateAnnotation(TCOFFEE_SCORE,
 586               TCOFFEE_SCORE, false, s, null);
 587       if (s != null)
 588       {
 589         aa.label = "T-COFFEE";
 590         aa.description = "" + id.getKey();
 591         aa.annotations = annotations;
 592         aa.visible = false;
 593         aa.belowAlignment = false;
 594         aa.setScore(header.getScoreFor(id.getKey()));
 595         aa.createSequenceMapping(s, s.getStart(), true);
 596         s.addAlignmentAnnotation(aa);
 597         aa.adjustForAlignment();
 598       }
 599       else
 600       {
 601         aa.graph = AlignmentAnnotation.NO_GRAPH;
 602         aa.label = "T-COFFEE";
 603         aa.description = "TCoffee column reliability score";
 604         aa.annotations = annotations;
 605         aa.belowAlignment = true;
 606         aa.visible = true;
 607         aa.setScore(header.getScoreAvg());
 608       }
 609       aa.showAllColLabels = true;
 610       aa.validateRangeAndDisplay();
 611       added = true;
 612     }
 613
 614     return added;
 615   }
 616
 617   @Override
 618   public String print()
 619   {
 620     // TODO Auto-generated method stub
 621     return "Not valid.";
 622   }
 623 }