src/jalview/io/TCoffeeScoreFile.java

   1 /*
   2  * Jalview - A Sequence Alignment Editor and Viewer (Version 2.8.0b1)
   3  * Copyright (C) 2014 The Jalview Authors
   4  *
   5  * This file is part of Jalview.
   6  *
   7  * Jalview is free software: you can redistribute it and/or
   8  * modify it under the terms of the GNU General Public License
   9  * as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version.
  10  *
  11  * Jalview is distributed in the hope that it will be useful, but
  12  * WITHOUT ANY WARRANTY; without even the implied warranty
  13  * of MERCHANTABILITY or FITNESS FOR A PARTICULAR
  14  * PURPOSE.  See the GNU General Public License for more details.
  15  *
  16  * You should have received a copy of the GNU General Public License along with Jalview.  If not, see <http://www.gnu.org/licenses/>.
  17  * The Jalview Authors are detailed in the 'AUTHORS' file.
  18  */
  19 package jalview.io;
  20
  21 import jalview.analysis.SequenceIdMatcher;
  22 import jalview.datamodel.AlignmentAnnotation;
  23 import jalview.datamodel.AlignmentI;
  24 import jalview.datamodel.Annotation;
  25 import jalview.datamodel.SequenceI;
  26
  27 import java.awt.Color;
  28 import java.io.IOException;
  29 import java.util.ArrayList;
  30 import java.util.HashMap;
  31 import java.util.LinkedHashMap;
  32 import java.util.List;
  33 import java.util.Map;
  34 import java.util.regex.Matcher;
  35 import java.util.regex.Pattern;
  36
  37 /**
  38  * A file parse for T-Coffee score ascii format. This file contains the
  39  * alignment consensus for each resude in any sequence.
  40  * <p>
  41  * This file is procuded by <code>t_coffee</code> providing the option
  42  * <code>-output=score_ascii </code> to the program command line
  43  *
  44  * An example file is the following
  45  *
  46  * <pre>
  47  * T-COFFEE, Version_9.02.r1228 (2012-02-16 18:15:12 - Revision 1228 - Build 336)
  48  * Cedric Notredame
  49  * CPU TIME:0 sec.
  50  * SCORE=90
  51  * *
  52  *  BAD AVG GOOD
  53  * *
  54  * 1PHT   :  89
  55  * 1BB9   :  90
  56  * 1UHC   :  94
  57  * 1YCS   :  94
  58  * 1OOT   :  93
  59  * 1ABO   :  94
  60  * 1FYN   :  94
  61  * 1QCF   :  94
  62  * cons   :  90
  63  *
  64  * 1PHT   999999999999999999999999998762112222543211112134
  65  * 1BB9   99999999999999999999999999987-------4322----2234
  66  * 1UHC   99999999999999999999999999987-------5321----2246
  67  * 1YCS   99999999999999999999999999986-------4321----1-35
  68  * 1OOT   999999999999999999999999999861-------3------1135
  69  * 1ABO   99999999999999999999999999986-------422-------34
  70  * 1FYN   99999999999999999999999999985-------32--------35
  71  * 1QCF   99999999999999999999999999974-------2---------24
  72  * cons   999999999999999999999999999851000110321100001134
  73  *
  74  *
  75  * 1PHT   ----------5666642367889999999999889
  76  * 1BB9   1111111111676653-355679999999999889
  77  * 1UHC   ----------788774--66789999999999889
  78  * 1YCS   ----------78777--356789999999999889
  79  * 1OOT   ----------78877--356789999999997-67
  80  * 1ABO   ----------687774--56779999999999889
  81  * 1FYN   ----------6888842356789999999999889
  82  * 1QCF   ----------6878742356789999999999889
  83  * cons   00100000006877641356789999999999889
  84  * </pre>
  85  *
  86  *
  87  * @author Paolo Di Tommaso
  88  *
  89  */
  90 public class TCoffeeScoreFile extends AlignFile
  91 {
  92
  93   public TCoffeeScoreFile(String inFile, String type) throws IOException
  94   {
  95     super(inFile, type);
  96
  97   }
  98
  99   public TCoffeeScoreFile(FileParse source) throws IOException
 100   {
 101     super(source);
 102   }
 103
 104   /** The {@link Header} structure holder */
 105   Header header;
 106
 107   /**
 108    * Holds the consensues values for each sequences. It uses a LinkedHashMap to
 109    * maintaint the insertion order.
 110    */
 111   LinkedHashMap<String, StringBuilder> scores;
 112
 113   Integer fWidth;
 114
 115   /**
 116    * Parse the provided reader for the T-Coffee scores file format
 117    *
 118    * @param reader
 119    *          public static TCoffeeScoreFile load(Reader reader) {
 120    *
 121    *          try { BufferedReader in = (BufferedReader) (reader instanceof
 122    *          BufferedReader ? reader : new BufferedReader(reader));
 123    *          TCoffeeScoreFile result = new TCoffeeScoreFile();
 124    *          result.doParsing(in); return result.header != null &&
 125    *          result.scores != null ? result : null; } catch( Exception e) {
 126    *          throw new RuntimeException(e); } }
 127    */
 128
 129   /**
 130    * @return The 'height' of the score matrix i.e. the numbers of score rows
 131    *         that should matches the number of sequences in the alignment
 132    */
 133   public int getHeight()
 134   {
 135     // the last entry will always be the 'global' alingment consensus scores, so
 136     // it is removed
 137     // from the 'height' count to make this value compatible with the number of
 138     // sequences in the MSA
 139     return scores != null && scores.size() > 0 ? scores.size() - 1 : 0;
 140   }
 141
 142   /**
 143    * @return The 'width' of the score matrix i.e. the number of columns. Since
 144    *         the score value are supposed to be calculated for an 'aligned' MSA,
 145    *         all the entries have to have the same width.
 146    */
 147   public int getWidth()
 148   {
 149     return fWidth != null ? fWidth : 0;
 150   }
 151
 152   /**
 153    * Get the string of score values for the specified seqeunce ID.
 154    *
 155    * @param id
 156    *          The sequence ID
 157    * @return The scores as a string of values e.g. {@code 99999987-------432}.
 158    *         It return an empty string when the specified ID is missing.
 159    */
 160   public String getScoresFor(String id)
 161   {
 162     return scores != null && scores.containsKey(id) ? scores.get(id)
 163             .toString() : "";
 164   }
 165
 166   /**
 167    * @return The list of score string as a {@link List} object, in the same
 168    *         ordeer of the insertion i.e. in the MSA
 169    */
 170   public List<String> getScoresList()
 171   {
 172     if (scores == null)
 173     {
 174       return null;
 175     }
 176     List<String> result = new ArrayList<String>(scores.size());
 177     for (Map.Entry<String, StringBuilder> it : scores.entrySet())
 178     {
 179       result.add(it.getValue().toString());
 180     }
 181
 182     return result;
 183   }
 184
 185   /**
 186    * @return The parsed score values a matrix of bytes
 187    */
 188   public byte[][] getScoresArray()
 189   {
 190     if (scores == null)
 191     {
 192       return null;
 193     }
 194     byte[][] result = new byte[scores.size()][];
 195
 196     int rowCount = 0;
 197     for (Map.Entry<String, StringBuilder> it : scores.entrySet())
 198     {
 199       String line = it.getValue().toString();
 200       byte[] seqValues = new byte[line.length()];
 201       for (int j = 0, c = line.length(); j < c; j++)
 202       {
 203
 204         byte val = (byte) (line.charAt(j) - '0');
 205
 206         seqValues[j] = (val >= 0 && val <= 9) ? val : -1;
 207       }
 208
 209       result[rowCount++] = seqValues;
 210     }
 211
 212     return result;
 213   }
 214
 215   public void parse() throws IOException
 216   {
 217     /*
 218      * read the header
 219      */
 220     header = readHeader(this);
 221
 222     if (header == null)
 223     {
 224       error = true;
 225       return;
 226     }
 227     scores = new LinkedHashMap<String, StringBuilder>();
 228
 229     /*
 230      * initilize the structure
 231      */
 232     for (Map.Entry<String, Integer> entry : header.scores.entrySet())
 233     {
 234       scores.put(entry.getKey(), new StringBuilder());
 235     }
 236
 237     /*
 238      * go with the reading
 239      */
 240     Block block;
 241     while ((block = readBlock(this, header.scores.size())) != null)
 242     {
 243
 244       /*
 245        * append sequences read in the block
 246        */
 247       for (Map.Entry<String, String> entry : block.items.entrySet())
 248       {
 249         StringBuilder scoreStringBuilder = scores.get(entry.getKey());
 250         if (scoreStringBuilder == null)
 251         {
 252           error = true;
 253           errormessage = String
 254                   .format("Invalid T-Coffee score file: Sequence ID '%s' is not declared in header section",
 255                           entry.getKey());
 256           return;
 257         }
 258
 259         scoreStringBuilder.append(entry.getValue());
 260       }
 261     }
 262
 263     /*
 264      * verify that all rows have the same width
 265      */
 266     for (StringBuilder str : scores.values())
 267     {
 268       if (fWidth == null)
 269       {
 270         fWidth = str.length();
 271       }
 272       else if (fWidth != str.length())
 273       {
 274         error = true;
 275         errormessage = "Invalid T-Coffee score file: All the score sequences must have the same length";
 276         return;
 277       }
 278     }
 279
 280     return;
 281   }
 282
 283   static int parseInt(String str)
 284   {
 285     try
 286     {
 287       return Integer.parseInt(str);
 288     } catch (NumberFormatException e)
 289     {
 290       // TODO report a warning ?
 291       return 0;
 292     }
 293   }
 294
 295   /**
 296    * Reaad the header section in the T-Coffee score file format
 297    *
 298    * @param reader
 299    *          The scores reader
 300    * @return The parser {@link Header} instance
 301    * @throws RuntimeException
 302    *           when the header is not in the expected format
 303    */
 304   static Header readHeader(FileParse reader) throws IOException
 305   {
 306
 307     Header result = null;
 308     try
 309     {
 310       result = new Header();
 311       result.head = reader.nextLine();
 312
 313       String line;
 314
 315       while ((line = reader.nextLine()) != null)
 316       {
 317         if (line.startsWith("SCORE="))
 318         {
 319           result.score = parseInt(line.substring(6).trim());
 320           break;
 321         }
 322       }
 323
 324       if ((line = reader.nextLine()) == null || !"*".equals(line.trim()))
 325       {
 326         error(reader,
 327                 "Invalid T-COFFEE score format (NO BAD/AVG/GOOD header)");
 328         return null;
 329       }
 330       if ((line = reader.nextLine()) == null
 331               || !"BAD AVG GOOD".equals(line.trim()))
 332       {
 333         error(reader,
 334                 "Invalid T-COFFEE score format (NO BAD/AVG/GOOD header)");
 335         return null;
 336       }
 337       if ((line = reader.nextLine()) == null || !"*".equals(line.trim()))
 338       {
 339         error(reader,
 340                 "Invalid T-COFFEE score format (NO BAD/AVG/GOOD header)");
 341         return null;
 342       }
 343
 344       /*
 345        * now are expected a list if sequences ID up to the first blank line
 346        */
 347       while ((line = reader.nextLine()) != null)
 348       {
 349         if ("".equals(line))
 350         {
 351           break;
 352         }
 353
 354         int p = line.indexOf(":");
 355         if (p == -1)
 356         {
 357           // TODO report a warning
 358           continue;
 359         }
 360
 361         String id = line.substring(0, p).trim();
 362         int val = parseInt(line.substring(p + 1).trim());
 363         if ("".equals(id))
 364         {
 365           // TODO report warning
 366           continue;
 367         }
 368
 369         result.scores.put(id, val);
 370       }
 371
 372       if (result == null)
 373       {
 374         error(reader, "T-COFFEE score file had no per-sequence scores");
 375       }
 376
 377     } catch (IOException e)
 378     {
 379       error(reader, "Unexpected problem parsing T-Coffee score ascii file");
 380       throw e;
 381     }
 382
 383     return result;
 384   }
 385
 386   private static void error(FileParse reader, String errm)
 387   {
 388     reader.error = true;
 389     if (reader.errormessage == null)
 390     {
 391       reader.errormessage = errm;
 392     }
 393     else
 394     {
 395       reader.errormessage += "\n" + errm;
 396     }
 397   }
 398
 399   static Pattern SCORES_WITH_RESIDUE_NUMS = Pattern.compile("^\\d+\\s([^\\s]+)\\s+\\d+$");
 400
 401   /**
 402    * Read a scores block ihe provided stream.
 403    *
 404    * @param reader
 405    *          The stream to parse
 406    * @param size
 407    *          The expected number of the sequence to be read
 408    * @return The {@link Block} instance read or {link null} null if the end of
 409    *         file has reached.
 410    * @throws IOException
 411    *           Something went wrong on the 'wire'
 412    */
 413   static Block readBlock(FileParse reader, int size) throws IOException
 414   {
 415     Block result = new Block(size);
 416     String line;
 417
 418     /*
 419      * read blank lines (eventually)
 420      */
 421     while ((line = reader.nextLine()) != null && "".equals(line.trim()))
 422     {
 423       // consume blank lines
 424     }
 425
 426     if (line == null)
 427     {
 428       return null;
 429     }
 430
 431     /*
 432      * read the scores block
 433      */
 434     do
 435     {
 436       if ("".equals(line.trim()))
 437       {
 438         // terminated
 439         break;
 440       }
 441
 442       // split the line on the first blank
 443       // the first part have to contain the sequence id
 444       // the remaining part are the scores values
 445       int p = line.indexOf(" ");
 446       if (p == -1)
 447       {
 448         if (reader.warningMessage == null)
 449         {
 450           reader.warningMessage = "";
 451         }
 452         reader.warningMessage += "Possible parsing error - expected to find a space in line: '"
 453                 + line + "'\n";
 454         continue;
 455       }
 456
 457       String id = line.substring(0, p).trim();
 458       String val = line.substring(p + 1).trim();
 459
 460       Matcher m = SCORES_WITH_RESIDUE_NUMS.matcher(val);
 461       if( m.matches() ) {
 462           val = m.group(1);
 463       }
 464
 465       result.items.put(id, val);
 466
 467     } while ((line = reader.nextLine()) != null);
 468
 469     return result;
 470   }
 471
 472   /*
 473    * The score file header
 474    */
 475   static class Header
 476   {
 477     String head;
 478
 479     int score;
 480
 481     LinkedHashMap<String, Integer> scores = new LinkedHashMap<String, Integer>();
 482
 483     public int getScoreAvg()
 484     {
 485       return score;
 486     }
 487
 488     public int getScoreFor(String ID)
 489     {
 490
 491       return scores.containsKey(ID) ? scores.get(ID) : -1;
 492
 493     }
 494   }
 495
 496   /*
 497    * Hold a single block values block in the score file
 498    */
 499   static class Block
 500   {
 501     int size;
 502
 503     Map<String, String> items;
 504
 505     public Block(int size)
 506     {
 507       this.size = size;
 508       this.items = new HashMap<String, String>(size);
 509     }
 510
 511     String getScoresFor(String id)
 512     {
 513       return items.get(id);
 514     }
 515
 516     String getConsensus()
 517     {
 518       return items.get("cons");
 519     }
 520   }
 521
 522   /**
 523    * TCOFFEE score colourscheme
 524    */
 525   static final Color[] colors =
 526   { new Color(102, 102, 255), // #6666FF
 527       new Color(0, 255, 0), // #00FF00
 528       new Color(102, 255, 0), // #66FF00
 529       new Color(204, 255, 0), // #CCFF00
 530       new Color(255, 255, 0), // #FFFF00
 531       new Color(255, 204, 0), // #FFCC00
 532       new Color(255, 153, 0), // #FF9900
 533       new Color(255, 102, 0), // #FF6600
 534       new Color(255, 51, 0), // #FF3300
 535       new Color(255, 34, 0) // #FF2000
 536   };
 537
 538   public final static String TCOFFEE_SCORE = "TCoffeeScore";
 539
 540   /**
 541    * generate annotation for this TCoffee score set on the given alignment
 542    *
 543    * @param al
 544    *          alignment to annotate
 545    * @param matchids
 546    *          if true, annotate sequences based on matching sequence names
 547    * @return true if alignment annotation was modified, false otherwise.
 548    */
 549   public boolean annotateAlignment(AlignmentI al, boolean matchids)
 550   {
 551     if (al.getHeight() != getHeight() || al.getWidth() != getWidth())
 552     {
 553       String info = String.format("align w: %s, h: %s; score: w: %s; h: %s ", al.getWidth(), al.getHeight(), getWidth(), getHeight() );
 554       warningMessage = "Alignment shape does not match T-Coffee score file shape -- " + info;
 555       return false;
 556     }
 557     boolean added = false;
 558     int i = 0;
 559     SequenceIdMatcher sidmatcher = new SequenceIdMatcher(
 560             al.getSequencesArray());
 561     byte[][] scoreMatrix = getScoresArray();
 562     // for 2.8 - we locate any existing TCoffee annotation and remove it first
 563     // before adding this.
 564     for (Map.Entry<String, StringBuilder> id : scores.entrySet())
 565     {
 566       byte[] srow = scoreMatrix[i];
 567       SequenceI s;
 568       if (matchids)
 569       {
 570         s = sidmatcher.findIdMatch(id.getKey());
 571       }
 572       else
 573       {
 574         s = al.getSequenceAt(i);
 575       }
 576       i++;
 577       if (s == null && i != scores.size() && !id.getKey().equals("cons"))
 578       {
 579         System.err.println("No "
 580                 + (matchids ? "match " : " sequences left ")
 581                 + " for TCoffee score set : " + id.getKey());
 582         continue;
 583       }
 584       int jSize = al.getWidth() < srow.length ? al.getWidth() : srow.length;
 585       Annotation[] annotations = new Annotation[al.getWidth()];
 586       for (int j = 0; j < jSize; j++)
 587       {
 588         byte val = srow[j];
 589         if (s != null && jalview.util.Comparison.isGap(s.getCharAt(j)))
 590         {
 591           annotations[j] = null;
 592           if (val > 0)
 593           {
 594             System.err
 595                     .println("Warning: non-zero value for positional T-COFFEE score for gap at "
 596                             + j + " in sequence " + s.getName());
 597           }
 598         }
 599         else
 600         {
 601           annotations[j] = new Annotation(s == null ? "" + val : null,
 602                   s == null ? "" + val : null, '\0', val * 1f, val >= 0
 603                           && val < colors.length ? colors[val]
 604                           : Color.white);
 605         }
 606       }
 607       // this will overwrite any existing t-coffee scores for the alignment
 608       AlignmentAnnotation aa = al.findOrCreateAnnotation(TCOFFEE_SCORE,
 609               TCOFFEE_SCORE, false, s, null);
 610       if (s != null)
 611       {
 612         aa.label = "T-COFFEE";
 613         aa.description = "" + id.getKey();
 614         aa.annotations = annotations;
 615         aa.visible = false;
 616         aa.belowAlignment = false;
 617         aa.setScore(header.getScoreFor(id.getKey()));
 618         aa.createSequenceMapping(s, s.getStart(), true);
 619         s.addAlignmentAnnotation(aa);
 620         aa.adjustForAlignment();
 621       }
 622       else
 623       {
 624         aa.graph = AlignmentAnnotation.NO_GRAPH;
 625         aa.label = "T-COFFEE";
 626         aa.description = "TCoffee column reliability score";
 627         aa.annotations = annotations;
 628         aa.belowAlignment = true;
 629         aa.visible = true;
 630         aa.setScore(header.getScoreAvg());
 631       }
 632       aa.showAllColLabels = true;
 633       aa.validateRangeAndDisplay();
 634       added = true;
 635     }
 636
 637     return added;
 638   }
 639
 640   @Override
 641   public String print()
 642   {
 643     // TODO Auto-generated method stub
 644     return "Not valid.";
 645   }
 646 }