src/jalview/io/TCoffeeScoreFile.java

   1 /*
   2  * Jalview - A Sequence Alignment Editor and Viewer (Version 2.8)
   3  * Copyright (C) 2012 J Procter, AM Waterhouse, LM Lui, J Engelhardt, G Barton, M Clamp, S Searle
   4  *
   5  * This file is part of Jalview.
   6  *
   7  * Jalview is free software: you can redistribute it and/or
   8  * modify it under the terms of the GNU General Public License
   9  * as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version.
  10  *
  11  * Jalview is distributed in the hope that it will be useful, but
  12  * WITHOUT ANY WARRANTY; without even the implied warranty
  13  * of MERCHANTABILITY or FITNESS FOR A PARTICULAR
  14  * PURPOSE.  See the GNU General Public License for more details.
  15  *
  16  * You should have received a copy of the GNU General Public License along with Jalview.  If not, see <http://www.gnu.org/licenses/>.
  17  */
  18 package jalview.io;
  19
  20 import jalview.analysis.SequenceIdMatcher;
  21 import jalview.datamodel.AlignmentAnnotation;
  22 import jalview.datamodel.AlignmentI;
  23 import jalview.datamodel.Annotation;
  24 import jalview.datamodel.SequenceI;
  25
  26 import java.awt.Color;
  27 import java.io.IOException;
  28 import java.util.ArrayList;
  29 import java.util.HashMap;
  30 import java.util.LinkedHashMap;
  31 import java.util.List;
  32 import java.util.Map;
  33 import java.util.regex.Matcher;
  34 import java.util.regex.Pattern;
  35
  36 /**
  37  * A file parser for the T-Coffee score ascii format. This file contains the
  38  * alignment consensus for each residue in any sequence.
  39  * <p>
  40  * This file is produced by <code>t_coffee</code> when the option
  41  * <code>-output=score_ascii</code> is given on the program command line.
  42  *
  43  * An example file is the following:
  44  *
  45  * <pre>
  46  * T-COFFEE, Version_9.02.r1228 (2012-02-16 18:15:12 - Revision 1228 - Build 336)
  47  * Cedric Notredame
  48  * CPU TIME:0 sec.
  49  * SCORE=90
  50  * *
  51  *  BAD AVG GOOD
  52  * *
  53  * 1PHT   :  89
  54  * 1BB9   :  90
  55  * 1UHC   :  94
  56  * 1YCS   :  94
  57  * 1OOT   :  93
  58  * 1ABO   :  94
  59  * 1FYN   :  94
  60  * 1QCF   :  94
  61  * cons   :  90
  62  *
  63  * 1PHT   999999999999999999999999998762112222543211112134
  64  * 1BB9   99999999999999999999999999987-------4322----2234
  65  * 1UHC   99999999999999999999999999987-------5321----2246
  66  * 1YCS   99999999999999999999999999986-------4321----1-35
  67  * 1OOT   999999999999999999999999999861-------3------1135
  68  * 1ABO   99999999999999999999999999986-------422-------34
  69  * 1FYN   99999999999999999999999999985-------32--------35
  70  * 1QCF   99999999999999999999999999974-------2---------24
  71  * cons   999999999999999999999999999851000110321100001134
  72  *
  73  *
  74  * 1PHT   ----------5666642367889999999999889
  75  * 1BB9   1111111111676653-355679999999999889
  76  * 1UHC   ----------788774--66789999999999889
  77  * 1YCS   ----------78777--356789999999999889
  78  * 1OOT   ----------78877--356789999999997-67
  79  * 1ABO   ----------687774--56779999999999889
  80  * 1FYN   ----------6888842356789999999999889
  81  * 1QCF   ----------6878742356789999999999889
  82  * cons   00100000006877641356789999999999889
  83  * </pre>
  84  *
  85  *
  86  * @author Paolo Di Tommaso
  87  *
  88  */
  89 public class TCoffeeScoreFile extends AlignFile
  90 {
  91
  /**
   * Creates a score file parser for the given input, delegating entirely to
   * the superclass constructor.
   *
   * @param inFile
   *          the input passed through to the superclass
   * @param type
   *          the input type passed through to the superclass
   * @throws IOException
   *           propagated from the superclass constructor
   */
  public TCoffeeScoreFile(String inFile, String type) throws IOException
  {
    super(inFile, type);
  }
  97
  /**
   * Creates a score file parser on top of an existing {@link FileParse}
   * source, delegating entirely to the superclass constructor.
   *
   * @param source
   *          the source passed through to the superclass
   * @throws IOException
   *           propagated from the superclass constructor
   */
  public TCoffeeScoreFile(FileParse source) throws IOException
  {
    super(source);
  }
 102
  /** The {@link Header} structure holder; assigned by {@link #parse()}. */
  Header header;

  /**
   * Holds the consensus values for each sequence, keyed by sequence ID. It
   * uses a LinkedHashMap to maintain the insertion order. Created by
   * {@link #parse()}; null before that, or when no header could be read.
   */
  LinkedHashMap<String, StringBuilder> scores;

  /**
   * Width of the score rows, taken from the first row during the width check
   * at the end of {@link #parse()}; null until then.
   */
  // NOTE(review): these fields deliberately have no initialisers - presumably
  // the superclass constructor may invoke parse() before field initialisers
  // of this class would run; confirm before adding any.
  Integer fWidth;
 113
 114   /**
 115    * Parse the provided reader for the T-Coffee scores file format
 116    *
 117    * @param reader
 118    *          public static TCoffeeScoreFile load(Reader reader) {
 119    *
 120    *          try { BufferedReader in = (BufferedReader) (reader instanceof
 121    *          BufferedReader ? reader : new BufferedReader(reader));
 122    *          TCoffeeScoreFile result = new TCoffeeScoreFile();
 123    *          result.doParsing(in); return result.header != null &&
 124    *          result.scores != null ? result : null; } catch( Exception e) {
 125    *          throw new RuntimeException(e); } }
 126    */
 127
 128   /**
 129    * @return The 'height' of the score matrix i.e. the numbers of score rows
 130    *         that should matches the number of sequences in the alignment
 131    */
 132   public int getHeight()
 133   {
 134     // the last entry will always be the 'global' alingment consensus scores, so
 135     // it is removed
 136     // from the 'height' count to make this value compatible with the number of
 137     // sequences in the MSA
 138     return scores != null && scores.size() > 0 ? scores.size() - 1 : 0;
 139   }
 140
 141   /**
 142    * @return The 'width' of the score matrix i.e. the number of columns. Since
 143    *         the score value are supposed to be calculated for an 'aligned' MSA,
 144    *         all the entries have to have the same width.
 145    */
 146   public int getWidth()
 147   {
 148     return fWidth != null ? fWidth : 0;
 149   }
 150
 151   /**
 152    * Get the string of score values for the specified seqeunce ID.
 153    *
 154    * @param id
 155    *          The sequence ID
 156    * @return The scores as a string of values e.g. {@code 99999987-------432}.
 157    *         It return an empty string when the specified ID is missing.
 158    */
 159   public String getScoresFor(String id)
 160   {
 161     return scores != null && scores.containsKey(id) ? scores.get(id)
 162             .toString() : "";
 163   }
 164
 165   /**
 166    * @return The list of score string as a {@link List} object, in the same
 167    *         ordeer of the insertion i.e. in the MSA
 168    */
 169   public List<String> getScoresList()
 170   {
 171     if (scores == null)
 172     {
 173       return null;
 174     }
 175     List<String> result = new ArrayList<String>(scores.size());
 176     for (Map.Entry<String, StringBuilder> it : scores.entrySet())
 177     {
 178       result.add(it.getValue().toString());
 179     }
 180
 181     return result;
 182   }
 183
 184   /**
 185    * @return The parsed score values a matrix of bytes
 186    */
 187   public byte[][] getScoresArray()
 188   {
 189     if (scores == null)
 190     {
 191       return null;
 192     }
 193     byte[][] result = new byte[scores.size()][];
 194
 195     int rowCount = 0;
 196     for (Map.Entry<String, StringBuilder> it : scores.entrySet())
 197     {
 198       String line = it.getValue().toString();
 199       byte[] seqValues = new byte[line.length()];
 200       for (int j = 0, c = line.length(); j < c; j++)
 201       {
 202
 203         byte val = (byte) (line.charAt(j) - '0');
 204
 205         seqValues[j] = (val >= 0 && val <= 9) ? val : -1;
 206       }
 207
 208       result[rowCount++] = seqValues;
 209     }
 210
 211     return result;
 212   }
 213
 214   public void parse() throws IOException
 215   {
 216     /*
 217      * read the header
 218      */
 219     header = readHeader(this);
 220
 221     if (header == null)
 222     {
 223       error = true;
 224       return;
 225     }
 226     scores = new LinkedHashMap<String, StringBuilder>();
 227
 228     /*
 229      * initilize the structure
 230      */
 231     for (Map.Entry<String, Integer> entry : header.scores.entrySet())
 232     {
 233       scores.put(entry.getKey(), new StringBuilder());
 234     }
 235
 236     /*
 237      * go with the reading
 238      */
 239     Block block;
 240     while ((block = readBlock(this, header.scores.size())) != null)
 241     {
 242
 243       /*
 244        * append sequences read in the block
 245        */
 246       for (Map.Entry<String, String> entry : block.items.entrySet())
 247       {
 248         StringBuilder scoreStringBuilder = scores.get(entry.getKey());
 249         if (scoreStringBuilder == null)
 250         {
 251           error = true;
 252           errormessage = String
 253                   .format("Invalid T-Coffee score file: Sequence ID '%s' is not declared in header section",
 254                           entry.getKey());
 255           return;
 256         }
 257
 258         scoreStringBuilder.append(entry.getValue());
 259       }
 260     }
 261
 262     /*
 263      * verify that all rows have the same width
 264      */
 265     for (StringBuilder str : scores.values())
 266     {
 267       if (fWidth == null)
 268       {
 269         fWidth = str.length();
 270       }
 271       else if (fWidth != str.length())
 272       {
 273         error = true;
 274         errormessage = "Invalid T-Coffee score file: All the score sequences must have the same length";
 275         return;
 276       }
 277     }
 278
 279     return;
 280   }
 281
 282   static int parseInt(String str)
 283   {
 284     try
 285     {
 286       return Integer.parseInt(str);
 287     } catch (NumberFormatException e)
 288     {
 289       // TODO report a warning ?
 290       return 0;
 291     }
 292   }
 293
 294   /**
 295    * Reaad the header section in the T-Coffee score file format
 296    *
 297    * @param reader
 298    *          The scores reader
 299    * @return The parser {@link Header} instance
 300    * @throws RuntimeException
 301    *           when the header is not in the expected format
 302    */
 303   static Header readHeader(FileParse reader) throws IOException
 304   {
 305
 306     Header result = null;
 307     try
 308     {
 309       result = new Header();
 310       result.head = reader.nextLine();
 311
 312       String line;
 313
 314       while ((line = reader.nextLine()) != null)
 315       {
 316         if (line.startsWith("SCORE="))
 317         {
 318           result.score = parseInt(line.substring(6).trim());
 319           break;
 320         }
 321       }
 322
 323       if ((line = reader.nextLine()) == null || !"*".equals(line.trim()))
 324       {
 325         error(reader,
 326                 "Invalid T-COFFEE score format (NO BAD/AVG/GOOD header)");
 327         return null;
 328       }
 329       if ((line = reader.nextLine()) == null
 330               || !"BAD AVG GOOD".equals(line.trim()))
 331       {
 332         error(reader,
 333                 "Invalid T-COFFEE score format (NO BAD/AVG/GOOD header)");
 334         return null;
 335       }
 336       if ((line = reader.nextLine()) == null || !"*".equals(line.trim()))
 337       {
 338         error(reader,
 339                 "Invalid T-COFFEE score format (NO BAD/AVG/GOOD header)");
 340         return null;
 341       }
 342
 343       /*
 344        * now are expected a list if sequences ID up to the first blank line
 345        */
 346       while ((line = reader.nextLine()) != null)
 347       {
 348         if ("".equals(line))
 349         {
 350           break;
 351         }
 352
 353         int p = line.indexOf(":");
 354         if (p == -1)
 355         {
 356           // TODO report a warning
 357           continue;
 358         }
 359
 360         String id = line.substring(0, p).trim();
 361         int val = parseInt(line.substring(p + 1).trim());
 362         if ("".equals(id))
 363         {
 364           // TODO report warning
 365           continue;
 366         }
 367
 368         result.scores.put(id, val);
 369       }
 370
 371       if (result == null)
 372       {
 373         error(reader, "T-COFFEE score file had no per-sequence scores");
 374       }
 375
 376     } catch (IOException e)
 377     {
 378       error(reader, "Unexpected problem parsing T-Coffee score ascii file");
 379       throw e;
 380     }
 381
 382     return result;
 383   }
 384
 385   private static void error(FileParse reader, String errm)
 386   {
 387     reader.error = true;
 388     if (reader.errormessage == null)
 389     {
 390       reader.errormessage = errm;
 391     }
 392     else
 393     {
 394       reader.errormessage += "\n" + errm;
 395     }
 396   }
 397
 398   static Pattern SCORES_WITH_RESIDUE_NUMS = Pattern.compile("^\\d+\\s([^\\s]+)\\s+\\d+$");
 399
 400   /**
 401    * Read a scores block ihe provided stream.
 402    *
 403    * @param reader
 404    *          The stream to parse
 405    * @param size
 406    *          The expected number of the sequence to be read
 407    * @return The {@link Block} instance read or {link null} null if the end of
 408    *         file has reached.
 409    * @throws IOException
 410    *           Something went wrong on the 'wire'
 411    */
 412   static Block readBlock(FileParse reader, int size) throws IOException
 413   {
 414     Block result = new Block(size);
 415     String line;
 416
 417     /*
 418      * read blank lines (eventually)
 419      */
 420     while ((line = reader.nextLine()) != null && "".equals(line.trim()))
 421     {
 422       // consume blank lines
 423     }
 424
 425     if (line == null)
 426     {
 427       return null;
 428     }
 429
 430     /*
 431      * read the scores block
 432      */
 433     do
 434     {
 435       if ("".equals(line.trim()))
 436       {
 437         // terminated
 438         break;
 439       }
 440
 441       // split the line on the first blank
 442       // the first part have to contain the sequence id
 443       // the remaining part are the scores values
 444       int p = line.indexOf(" ");
 445       if (p == -1)
 446       {
 447         if (reader.warningMessage == null)
 448         {
 449           reader.warningMessage = "";
 450         }
 451         reader.warningMessage += "Possible parsing error - expected to find a space in line: '"
 452                 + line + "'\n";
 453         continue;
 454       }
 455
 456       String id = line.substring(0, p).trim();
 457       String val = line.substring(p + 1).trim();
 458
 459       Matcher m = SCORES_WITH_RESIDUE_NUMS.matcher(val);
 460       if( m.matches() ) {
 461           val = m.group(1);
 462       }
 463
 464       result.items.put(id, val);
 465
 466     } while ((line = reader.nextLine()) != null);
 467
 468     return result;
 469   }
 470
 471   /*
 472    * The score file header
 473    */
 474   static class Header
 475   {
 476     String head;
 477
 478     int score;
 479
 480     LinkedHashMap<String, Integer> scores = new LinkedHashMap<String, Integer>();
 481
 482     public int getScoreAvg()
 483     {
 484       return score;
 485     }
 486
 487     public int getScoreFor(String ID)
 488     {
 489
 490       return scores.containsKey(ID) ? scores.get(ID) : -1;
 491
 492     }
 493   }
 494
 495   /*
 496    * Hold a single block values block in the score file
 497    */
 498   static class Block
 499   {
 500     int size;
 501
 502     Map<String, String> items;
 503
 504     public Block(int size)
 505     {
 506       this.size = size;
 507       this.items = new HashMap<String, String>(size);
 508     }
 509
 510     String getScoresFor(String id)
 511     {
 512       return items.get(id);
 513     }
 514
 515     String getConsensus()
 516     {
 517       return items.get("cons");
 518     }
 519   }
 520
 521   /**
 522    * TCOFFEE score colourscheme
 523    */
 524   static final Color[] colors =
 525   { new Color(102, 102, 255), // #6666FF
 526       new Color(0, 255, 0), // #00FF00
 527       new Color(102, 255, 0), // #66FF00
 528       new Color(204, 255, 0), // #CCFF00
 529       new Color(255, 255, 0), // #FFFF00
 530       new Color(255, 204, 0), // #FFCC00
 531       new Color(255, 153, 0), // #FF9900
 532       new Color(255, 102, 0), // #FF6600
 533       new Color(255, 51, 0), // #FF3300
 534       new Color(255, 34, 0) // #FF2000
 535   };
 536
 537   public final static String TCOFFEE_SCORE = "TCoffeeScore";
 538
 539   /**
 540    * generate annotation for this TCoffee score set on the given alignment
 541    *
 542    * @param al
 543    *          alignment to annotate
 544    * @param matchids
 545    *          if true, annotate sequences based on matching sequence names
 546    * @return true if alignment annotation was modified, false otherwise.
 547    */
 548   public boolean annotateAlignment(AlignmentI al, boolean matchids)
 549   {
 550     if (al.getHeight() != getHeight() || al.getWidth() != getWidth())
 551     {
 552       String info = String.format("align w: %s, h: %s; score: w: %s; h: %s ", al.getWidth(), al.getHeight(), getWidth(), getHeight() );
 553       warningMessage = "Alignment shape does not match T-Coffee score file shape -- " + info;
 554       return false;
 555     }
 556     boolean added = false;
 557     int i = 0;
 558     SequenceIdMatcher sidmatcher = new SequenceIdMatcher(
 559             al.getSequencesArray());
 560     byte[][] scoreMatrix = getScoresArray();
 561     // for 2.8 - we locate any existing TCoffee annotation and remove it first
 562     // before adding this.
 563     for (Map.Entry<String, StringBuilder> id : scores.entrySet())
 564     {
 565       byte[] srow = scoreMatrix[i];
 566       SequenceI s;
 567       if (matchids)
 568       {
 569         s = sidmatcher.findIdMatch(id.getKey());
 570       }
 571       else
 572       {
 573         s = al.getSequenceAt(i);
 574       }
 575       i++;
 576       if (s == null && i != scores.size() && !id.getKey().equals("cons"))
 577       {
 578         System.err.println("No "
 579                 + (matchids ? "match " : " sequences left ")
 580                 + " for TCoffee score set : " + id.getKey());
 581         continue;
 582       }
 583       int jSize = al.getWidth() < srow.length ? al.getWidth() : srow.length;
 584       Annotation[] annotations = new Annotation[al.getWidth()];
 585       for (int j = 0; j < jSize; j++)
 586       {
 587         byte val = srow[j];
 588         if (s != null && jalview.util.Comparison.isGap(s.getCharAt(j)))
 589         {
 590           annotations[j] = null;
 591           if (val > 0)
 592           {
 593             System.err
 594                     .println("Warning: non-zero value for positional T-COFFEE score for gap at "
 595                             + j + " in sequence " + s.getName());
 596           }
 597         }
 598         else
 599         {
 600           annotations[j] = new Annotation(s == null ? "" + val : null,
 601                   s == null ? "" + val : null, '\0', val * 1f, val >= 0
 602                           && val < colors.length ? colors[val]
 603                           : Color.white);
 604         }
 605       }
 606       // this will overwrite any existing t-coffee scores for the alignment
 607       AlignmentAnnotation aa = al.findOrCreateAnnotation(TCOFFEE_SCORE,
 608               TCOFFEE_SCORE, false, s, null);
 609       if (s != null)
 610       {
 611         aa.label = "T-COFFEE";
 612         aa.description = "" + id.getKey();
 613         aa.annotations = annotations;
 614         aa.visible = false;
 615         aa.belowAlignment = false;
 616         aa.setScore(header.getScoreFor(id.getKey()));
 617         aa.createSequenceMapping(s, s.getStart(), true);
 618         s.addAlignmentAnnotation(aa);
 619         aa.adjustForAlignment();
 620       }
 621       else
 622       {
 623         aa.graph = AlignmentAnnotation.NO_GRAPH;
 624         aa.label = "T-COFFEE";
 625         aa.description = "TCoffee column reliability score";
 626         aa.annotations = annotations;
 627         aa.belowAlignment = true;
 628         aa.visible = true;
 629         aa.setScore(header.getScoreAvg());
 630       }
 631       aa.showAllColLabels = true;
 632       aa.validateRangeAndDisplay();
 633       added = true;
 634     }
 635
 636     return added;
 637   }
 638
 639   @Override
 640   public String print()
 641   {
 642     // TODO Auto-generated method stub
 643     return "Not valid.";
 644   }
 645 }