src/jalview/io/TCoffeeScoreFile.java

   1 /*
   2  * Jalview - A Sequence Alignment Editor and Viewer ($$Version-Rel$$)
   3  * Copyright (C) $$Year-Rel$$ The Jalview Authors
   4  *
   5  * This file is part of Jalview.
   6  *
   7  * Jalview is free software: you can redistribute it and/or
   8  * modify it under the terms of the GNU General Public License
   9  * as published by the Free Software Foundation, either version 3
  10  * of the License, or (at your option) any later version.
  11  *
  12  * Jalview is distributed in the hope that it will be useful, but
  13  * WITHOUT ANY WARRANTY; without even the implied warranty
  14  * of MERCHANTABILITY or FITNESS FOR A PARTICULAR
  15  * PURPOSE.  See the GNU General Public License for more details.
  16  *
  17  * You should have received a copy of the GNU General Public License
  18  * along with Jalview.  If not, see <http://www.gnu.org/licenses/>.
  19  * The Jalview Authors are detailed in the 'AUTHORS' file.
  20  */
  21 package jalview.io;
  22
  23 import jalview.analysis.SequenceIdMatcher;
  24 import jalview.datamodel.AlignmentAnnotation;
  25 import jalview.datamodel.AlignmentI;
  26 import jalview.datamodel.Annotation;
  27 import jalview.datamodel.SequenceI;
  28
  29 import java.awt.Color;
  30 import java.io.IOException;
  31 import java.util.ArrayList;
  32 import java.util.HashMap;
  33 import java.util.LinkedHashMap;
  34 import java.util.List;
  35 import java.util.Map;
  36 import java.util.regex.Matcher;
  37 import java.util.regex.Pattern;
  38
  39 /**
 * A file parser for the T-Coffee score ASCII format. This file contains the
 * alignment consensus for each residue in any sequence.
 * <p>
 * This file is produced by <code>t_coffee</code> when the option
 * <code>-output=score_ascii </code> is given on the program command line
  45  *
  46  * An example file is the following
  47  *
  48  * <pre>
  49  * T-COFFEE, Version_9.02.r1228 (2012-02-16 18:15:12 - Revision 1228 - Build 336)
  50  * Cedric Notredame
  51  * CPU TIME:0 sec.
  52  * SCORE=90
  53  * *
  54  *  BAD AVG GOOD
  55  * *
  56  * 1PHT   :  89
  57  * 1BB9   :  90
  58  * 1UHC   :  94
  59  * 1YCS   :  94
  60  * 1OOT   :  93
  61  * 1ABO   :  94
  62  * 1FYN   :  94
  63  * 1QCF   :  94
  64  * cons   :  90
  65  *
  66  * 1PHT   999999999999999999999999998762112222543211112134
  67  * 1BB9   99999999999999999999999999987-------4322----2234
  68  * 1UHC   99999999999999999999999999987-------5321----2246
  69  * 1YCS   99999999999999999999999999986-------4321----1-35
  70  * 1OOT   999999999999999999999999999861-------3------1135
  71  * 1ABO   99999999999999999999999999986-------422-------34
  72  * 1FYN   99999999999999999999999999985-------32--------35
  73  * 1QCF   99999999999999999999999999974-------2---------24
  74  * cons   999999999999999999999999999851000110321100001134
  75  *
  76  *
  77  * 1PHT   ----------5666642367889999999999889
  78  * 1BB9   1111111111676653-355679999999999889
  79  * 1UHC   ----------788774--66789999999999889
  80  * 1YCS   ----------78777--356789999999999889
  81  * 1OOT   ----------78877--356789999999997-67
  82  * 1ABO   ----------687774--56779999999999889
  83  * 1FYN   ----------6888842356789999999999889
  84  * 1QCF   ----------6878742356789999999999889
  85  * cons   00100000006877641356789999999999889
  86  * </pre>
  87  *
  88  *
  89  * @author Paolo Di Tommaso
  90  *
  91  */
  92 public class TCoffeeScoreFile extends AlignFile
  93 {
  /**
   * Creates a T-Coffee score file parser; both arguments are passed unchanged
   * to the {@link AlignFile} constructor.
   *
   * @param inFile
   *          the input handed to the superclass constructor
   * @param fileSourceType
   *          the source type handed to the superclass constructor
   * @throws IOException
   *           propagated from the superclass constructor
   */
  public TCoffeeScoreFile(String inFile, DataSourceType fileSourceType)
          throws IOException
  {
    super(inFile, fileSourceType);
  }
 100
 101   public TCoffeeScoreFile(FileParse source) throws IOException
 102   {
 103     super(source);
 104   }
 105
  /** The {@link Header} structure holder */
  Header header;

  /**
   * Holds the consensus values for each sequence. It uses a LinkedHashMap to
   * maintain the insertion order.
   */
  LinkedHashMap<String, StringBuilder> scores;

  /**
   * Common length of the score rows; stays <code>null</code> until
   * {@link #parse()} has stored a row length.
   */
  Integer fWidth;
 116
 117   /**
 118    * Parse the provided reader for the T-Coffee scores file format
 119    *
 120    * @param reader
 121    *          public static TCoffeeScoreFile load(Reader reader) {
 122    *
 123    *          try { BufferedReader in = (BufferedReader) (reader instanceof
 124    *          BufferedReader ? reader : new BufferedReader(reader));
 125    *          TCoffeeScoreFile result = new TCoffeeScoreFile();
 126    *          result.doParsing(in); return result.header != null &&
 127    *          result.scores != null ? result : null; } catch( Exception e) {
 128    *          throw new RuntimeException(e); } }
 129    */
 130
 131   /**
 132    * @return The 'height' of the score matrix i.e. the numbers of score rows
 133    *         that should matches the number of sequences in the alignment
 134    */
 135   public int getHeight()
 136   {
 137     // the last entry will always be the 'global' alingment consensus scores, so
 138     // it is removed
 139     // from the 'height' count to make this value compatible with the number of
 140     // sequences in the MSA
 141     return scores != null && scores.size() > 0 ? scores.size() - 1 : 0;
 142   }
 143
 144   /**
 145    * @return The 'width' of the score matrix i.e. the number of columns. Since
 146    *         the score value are supposed to be calculated for an 'aligned' MSA,
 147    *         all the entries have to have the same width.
 148    */
 149   public int getWidth()
 150   {
 151     return fWidth != null ? fWidth : 0;
 152   }
 153
 154   /**
 155    * Get the string of score values for the specified seqeunce ID.
 156    *
 157    * @param id
 158    *          The sequence ID
 159    * @return The scores as a string of values e.g. {@code 99999987-------432}.
 160    *         It return an empty string when the specified ID is missing.
 161    */
 162   public String getScoresFor(String id)
 163   {
 164     return scores != null && scores.containsKey(id) ? scores.get(id)
 165             .toString() : "";
 166   }
 167
 168   /**
 169    * @return The list of score string as a {@link List} object, in the same
 170    *         ordeer of the insertion i.e. in the MSA
 171    */
 172   public List<String> getScoresList()
 173   {
 174     if (scores == null)
 175     {
 176       return null;
 177     }
 178     List<String> result = new ArrayList<String>(scores.size());
 179     for (Map.Entry<String, StringBuilder> it : scores.entrySet())
 180     {
 181       result.add(it.getValue().toString());
 182     }
 183
 184     return result;
 185   }
 186
 187   /**
 188    * @return The parsed score values a matrix of bytes
 189    */
 190   public byte[][] getScoresArray()
 191   {
 192     if (scores == null)
 193     {
 194       return null;
 195     }
 196     byte[][] result = new byte[scores.size()][];
 197
 198     int rowCount = 0;
 199     for (Map.Entry<String, StringBuilder> it : scores.entrySet())
 200     {
 201       String line = it.getValue().toString();
 202       byte[] seqValues = new byte[line.length()];
 203       for (int j = 0, c = line.length(); j < c; j++)
 204       {
 205
 206         byte val = (byte) (line.charAt(j) - '0');
 207
 208         seqValues[j] = (val >= 0 && val <= 9) ? val : -1;
 209       }
 210
 211       result[rowCount++] = seqValues;
 212     }
 213
 214     return result;
 215   }
 216
 217   @Override
 218   public void parse() throws IOException
 219   {
 220     /*
 221      * read the header
 222      */
 223     header = readHeader(this);
 224
 225     if (header == null)
 226     {
 227       error = true;
 228       return;
 229     }
 230     scores = new LinkedHashMap<String, StringBuilder>();
 231
 232     /*
 233      * initilize the structure
 234      */
 235     for (Map.Entry<String, Integer> entry : header.scores.entrySet())
 236     {
 237       scores.put(entry.getKey(), new StringBuilder());
 238     }
 239
 240     /*
 241      * go with the reading
 242      */
 243     Block block;
 244     while ((block = readBlock(this, header.scores.size())) != null)
 245     {
 246
 247       /*
 248        * append sequences read in the block
 249        */
 250       for (Map.Entry<String, String> entry : block.items.entrySet())
 251       {
 252         StringBuilder scoreStringBuilder = scores.get(entry.getKey());
 253         if (scoreStringBuilder == null)
 254         {
 255           error = true;
 256           errormessage = String
 257                   .format("Invalid T-Coffee score file: Sequence ID '%s' is not declared in header section",
 258                           entry.getKey());
 259           return;
 260         }
 261
 262         scoreStringBuilder.append(entry.getValue());
 263       }
 264     }
 265
 266     /*
 267      * verify that all rows have the same width
 268      */
 269     for (StringBuilder str : scores.values())
 270     {
 271       if (fWidth == null)
 272       {
 273         fWidth = str.length();
 274       }
 275       else if (fWidth != str.length())
 276       {
 277         error = true;
 278         errormessage = "Invalid T-Coffee score file: All the score sequences must have the same length";
 279         return;
 280       }
 281     }
 282
 283     return;
 284   }
 285
 286   static int parseInt(String str)
 287   {
 288     try
 289     {
 290       return Integer.parseInt(str);
 291     } catch (NumberFormatException e)
 292     {
 293       // TODO report a warning ?
 294       return 0;
 295     }
 296   }
 297
 298   /**
 299    * Reaad the header section in the T-Coffee score file format
 300    *
 301    * @param reader
 302    *          The scores reader
 303    * @return The parser {@link Header} instance
 304    * @throws RuntimeException
 305    *           when the header is not in the expected format
 306    */
 307   static Header readHeader(FileParse reader) throws IOException
 308   {
 309
 310     Header result = null;
 311     try
 312     {
 313       result = new Header();
 314       result.head = reader.nextLine();
 315
 316       String line;
 317
 318       while ((line = reader.nextLine()) != null)
 319       {
 320         if (line.startsWith("SCORE="))
 321         {
 322           result.score = parseInt(line.substring(6).trim());
 323           break;
 324         }
 325       }
 326
 327       if ((line = reader.nextLine()) == null || !"*".equals(line.trim()))
 328       {
 329         error(reader,
 330                 "Invalid T-COFFEE score format (NO BAD/AVG/GOOD header)");
 331         return null;
 332       }
 333       if ((line = reader.nextLine()) == null
 334               || !"BAD AVG GOOD".equals(line.trim()))
 335       {
 336         error(reader,
 337                 "Invalid T-COFFEE score format (NO BAD/AVG/GOOD header)");
 338         return null;
 339       }
 340       if ((line = reader.nextLine()) == null || !"*".equals(line.trim()))
 341       {
 342         error(reader,
 343                 "Invalid T-COFFEE score format (NO BAD/AVG/GOOD header)");
 344         return null;
 345       }
 346
 347       /*
 348        * now are expected a list if sequences ID up to the first blank line
 349        */
 350       while ((line = reader.nextLine()) != null)
 351       {
 352         if ("".equals(line))
 353         {
 354           break;
 355         }
 356
 357         int p = line.indexOf(":");
 358         if (p == -1)
 359         {
 360           // TODO report a warning
 361           continue;
 362         }
 363
 364         String id = line.substring(0, p).trim();
 365         int val = parseInt(line.substring(p + 1).trim());
 366         if ("".equals(id))
 367         {
 368           // TODO report warning
 369           continue;
 370         }
 371
 372         result.scores.put(id, val);
 373       }
 374
 375       if (result == null)
 376       {
 377         error(reader, "T-COFFEE score file had no per-sequence scores");
 378       }
 379
 380     } catch (IOException e)
 381     {
 382       error(reader, "Unexpected problem parsing T-Coffee score ascii file");
 383       throw e;
 384     }
 385
 386     return result;
 387   }
 388
 389   private static void error(FileParse reader, String errm)
 390   {
 391     reader.error = true;
 392     if (reader.errormessage == null)
 393     {
 394       reader.errormessage = errm;
 395     }
 396     else
 397     {
 398       reader.errormessage += "\n" + errm;
 399     }
 400   }
 401
 402   static Pattern SCORES_WITH_RESIDUE_NUMS = Pattern
 403           .compile("^\\d+\\s([^\\s]+)\\s+\\d+$");
 404
 405   /**
 406    * Read a scores block ihe provided stream.
 407    *
 408    * @param reader
 409    *          The stream to parse
 410    * @param size
 411    *          The expected number of the sequence to be read
 412    * @return The {@link Block} instance read or {link null} null if the end of
 413    *         file has reached.
 414    * @throws IOException
 415    *           Something went wrong on the 'wire'
 416    */
 417   static Block readBlock(FileParse reader, int size) throws IOException
 418   {
 419     Block result = new Block(size);
 420     String line;
 421
 422     /*
 423      * read blank lines (eventually)
 424      */
 425     while ((line = reader.nextLine()) != null && "".equals(line.trim()))
 426     {
 427       // consume blank lines
 428     }
 429
 430     if (line == null)
 431     {
 432       return null;
 433     }
 434
 435     /*
 436      * read the scores block
 437      */
 438     do
 439     {
 440       if ("".equals(line.trim()))
 441       {
 442         // terminated
 443         break;
 444       }
 445
 446       // split the line on the first blank
 447       // the first part have to contain the sequence id
 448       // the remaining part are the scores values
 449       int p = line.indexOf(" ");
 450       if (p == -1)
 451       {
 452         if (reader.warningMessage == null)
 453         {
 454           reader.warningMessage = "";
 455         }
 456         reader.warningMessage += "Possible parsing error - expected to find a space in line: '"
 457                 + line + "'\n";
 458         continue;
 459       }
 460
 461       String id = line.substring(0, p).trim();
 462       String val = line.substring(p + 1).trim();
 463
 464       Matcher m = SCORES_WITH_RESIDUE_NUMS.matcher(val);
 465       if (m.matches())
 466       {
 467         val = m.group(1);
 468       }
 469
 470       result.items.put(id, val);
 471
 472     } while ((line = reader.nextLine()) != null);
 473
 474     return result;
 475   }
 476
 477   /*
 478    * The score file header
 479    */
 480   static class Header
 481   {
 482     String head;
 483
 484     int score;
 485
 486     LinkedHashMap<String, Integer> scores = new LinkedHashMap<String, Integer>();
 487
 488     public int getScoreAvg()
 489     {
 490       return score;
 491     }
 492
 493     public int getScoreFor(String ID)
 494     {
 495
 496       return scores.containsKey(ID) ? scores.get(ID) : -1;
 497
 498     }
 499   }
 500
 501   /*
 502    * Hold a single block values block in the score file
 503    */
 504   static class Block
 505   {
 506     int size;
 507
 508     Map<String, String> items;
 509
 510     public Block(int size)
 511     {
 512       this.size = size;
 513       this.items = new HashMap<String, String>(size);
 514     }
 515
 516     String getScoresFor(String id)
 517     {
 518       return items.get(id);
 519     }
 520
 521     String getConsensus()
 522     {
 523       return items.get("cons");
 524     }
 525   }
 526
 527   /**
 528    * TCOFFEE score colourscheme
 529    */
 530   static final Color[] colors = { new Color(102, 102, 255), // #6666FF
 531       new Color(0, 255, 0), // #00FF00
 532       new Color(102, 255, 0), // #66FF00
 533       new Color(204, 255, 0), // #CCFF00
 534       new Color(255, 255, 0), // #FFFF00
 535       new Color(255, 204, 0), // #FFCC00
 536       new Color(255, 153, 0), // #FF9900
 537       new Color(255, 102, 0), // #FF6600
 538       new Color(255, 51, 0), // #FF3300
 539       new Color(255, 34, 0) // #FF2000
 540   };
 541
 542   public final static String TCOFFEE_SCORE = "TCoffeeScore";
 543
 544   /**
 545    * generate annotation for this TCoffee score set on the given alignment
 546    *
 547    * @param al
 548    *          alignment to annotate
 549    * @param matchids
 550    *          if true, annotate sequences based on matching sequence names
 551    * @return true if alignment annotation was modified, false otherwise.
 552    */
 553   public boolean annotateAlignment(AlignmentI al, boolean matchids)
 554   {
 555     if (al.getHeight() != getHeight() || al.getWidth() != getWidth())
 556     {
 557       String info = String.format(
 558               "align w: %s, h: %s; score: w: %s; h: %s ", al.getWidth(),
 559               al.getHeight(), getWidth(), getHeight());
 560       warningMessage = "Alignment shape does not match T-Coffee score file shape -- "
 561               + info;
 562       return false;
 563     }
 564     boolean added = false;
 565     int i = 0;
 566     SequenceIdMatcher sidmatcher = new SequenceIdMatcher(
 567             al.getSequencesArray());
 568     byte[][] scoreMatrix = getScoresArray();
 569     // for 2.8 - we locate any existing TCoffee annotation and remove it first
 570     // before adding this.
 571     for (Map.Entry<String, StringBuilder> id : scores.entrySet())
 572     {
 573       byte[] srow = scoreMatrix[i];
 574       SequenceI s;
 575       if (matchids)
 576       {
 577         s = sidmatcher.findIdMatch(id.getKey());
 578       }
 579       else
 580       {
 581         s = al.getSequenceAt(i);
 582       }
 583       i++;
 584       if (s == null && i != scores.size() && !id.getKey().equals("cons"))
 585       {
 586         System.err.println("No "
 587                 + (matchids ? "match " : " sequences left ")
 588                 + " for TCoffee score set : " + id.getKey());
 589         continue;
 590       }
 591       int jSize = al.getWidth() < srow.length ? al.getWidth() : srow.length;
 592       Annotation[] annotations = new Annotation[al.getWidth()];
 593       for (int j = 0; j < jSize; j++)
 594       {
 595         byte val = srow[j];
 596         if (s != null && jalview.util.Comparison.isGap(s.getCharAt(j)))
 597         {
 598           annotations[j] = null;
 599           if (val > 0)
 600           {
 601             System.err
 602                     .println("Warning: non-zero value for positional T-COFFEE score for gap at "
 603                             + j + " in sequence " + s.getName());
 604           }
 605         }
 606         else
 607         {
 608           annotations[j] = new Annotation(s == null ? "" + val : null,
 609                   s == null ? "" + val : null, '\0', val * 1f, val >= 0
 610                           && val < colors.length ? colors[val]
 611                           : Color.white);
 612         }
 613       }
 614       // this will overwrite any existing t-coffee scores for the alignment
 615       AlignmentAnnotation aa = al.findOrCreateAnnotation(TCOFFEE_SCORE,
 616               TCOFFEE_SCORE, false, s, null);
 617       if (s != null)
 618       {
 619         aa.label = "T-COFFEE";
 620         aa.description = "" + id.getKey();
 621         aa.annotations = annotations;
 622         aa.visible = false;
 623         aa.belowAlignment = false;
 624         aa.setScore(header.getScoreFor(id.getKey()));
 625         aa.createSequenceMapping(s, s.getStart(), true);
 626         s.addAlignmentAnnotation(aa);
 627         aa.adjustForAlignment();
 628       }
 629       else
 630       {
 631         aa.graph = AlignmentAnnotation.NO_GRAPH;
 632         aa.label = "T-COFFEE";
 633         aa.description = "TCoffee column reliability score";
 634         aa.annotations = annotations;
 635         aa.belowAlignment = true;
 636         aa.visible = true;
 637         aa.setScore(header.getScoreAvg());
 638       }
 639       aa.showAllColLabels = true;
 640       aa.validateRangeAndDisplay();
 641       added = true;
 642     }
 643
 644     return added;
 645   }
 646
 647   @Override
 648   public String print(SequenceI[] sqs, boolean jvsuffix)
 649   {
 650     // TODO Auto-generated method stub
 651     return "Not valid.";
 652   }
 653 }