src/jalview/io/TCoffeeScoreFile.java

   1 /*
   2  * Jalview - A Sequence Alignment Editor and Viewer ($$Version-Rel$$)
   3  * Copyright (C) $$Year-Rel$$ The Jalview Authors
   4  *
   5  * This file is part of Jalview.
   6  *
   7  * Jalview is free software: you can redistribute it and/or
   8  * modify it under the terms of the GNU General Public License
   9  * as published by the Free Software Foundation, either version 3
  10  * of the License, or (at your option) any later version.
  11  *
  12  * Jalview is distributed in the hope that it will be useful, but
  13  * WITHOUT ANY WARRANTY; without even the implied warranty
  14  * of MERCHANTABILITY or FITNESS FOR A PARTICULAR
  15  * PURPOSE.  See the GNU General Public License for more details.
  16  *
  17  * You should have received a copy of the GNU General Public License
  18  * along with Jalview.  If not, see <http://www.gnu.org/licenses/>.
  19  * The Jalview Authors are detailed in the 'AUTHORS' file.
  20  */
  21 package jalview.io;
  22
  23 import jalview.analysis.SequenceIdMatcher;
  24 import jalview.datamodel.AlignmentAnnotation;
  25 import jalview.datamodel.AlignmentI;
  26 import jalview.datamodel.Annotation;
  27 import jalview.datamodel.SequenceI;
  28
  29 import java.awt.Color;
  30 import java.io.IOException;
  31 import java.util.ArrayList;
  32 import java.util.HashMap;
  33 import java.util.LinkedHashMap;
  34 import java.util.List;
  35 import java.util.Map;
  36 import java.util.regex.Matcher;
  37 import java.util.regex.Pattern;
  38
  39 /**
  40  * A file parser for T-Coffee score ascii format. This file contains the
  41  * alignment consensus for each residue in any sequence.
  42  * <p>
  43  * This file is produced by <code>t_coffee</code> providing the option
  44  * <code>-output=score_ascii </code> to the program command line
  45  *
  46  * An example file is the following
  47  *
  48  * <pre>
  49  * T-COFFEE, Version_9.02.r1228 (2012-02-16 18:15:12 - Revision 1228 - Build 336)
  50  * Cedric Notredame
  51  * CPU TIME:0 sec.
  52  * SCORE=90
  53  * *
  54  *  BAD AVG GOOD
  55  * *
  56  * 1PHT   :  89
  57  * 1BB9   :  90
  58  * 1UHC   :  94
  59  * 1YCS   :  94
  60  * 1OOT   :  93
  61  * 1ABO   :  94
  62  * 1FYN   :  94
  63  * 1QCF   :  94
  64  * cons   :  90
  65  *
  66  * 1PHT   999999999999999999999999998762112222543211112134
  67  * 1BB9   99999999999999999999999999987-------4322----2234
  68  * 1UHC   99999999999999999999999999987-------5321----2246
  69  * 1YCS   99999999999999999999999999986-------4321----1-35
  70  * 1OOT   999999999999999999999999999861-------3------1135
  71  * 1ABO   99999999999999999999999999986-------422-------34
  72  * 1FYN   99999999999999999999999999985-------32--------35
  73  * 1QCF   99999999999999999999999999974-------2---------24
  74  * cons   999999999999999999999999999851000110321100001134
  75  *
  76  *
  77  * 1PHT   ----------5666642367889999999999889
  78  * 1BB9   1111111111676653-355679999999999889
  79  * 1UHC   ----------788774--66789999999999889
  80  * 1YCS   ----------78777--356789999999999889
  81  * 1OOT   ----------78877--356789999999997-67
  82  * 1ABO   ----------687774--56779999999999889
  83  * 1FYN   ----------6888842356789999999999889
  84  * 1QCF   ----------6878742356789999999999889
  85  * cons   00100000006877641356789999999999889
  86  * </pre>
  87  *
  88  *
  89  * @author Paolo Di Tommaso
  90  *
  91  */
  92 public class TCoffeeScoreFile extends AlignFile
  93 {
  94
  95   /**
  96    * TCOFFEE score colourscheme
  97    */
  98   static final Color[] colors = { new Color(102, 102, 255), // 0: lilac #6666FF
  99       new Color(0, 255, 0), // 1: green #00FF00
 100       new Color(102, 255, 0), // 2: lime green #66FF00
 101       new Color(204, 255, 0), // 3: greeny yellow #CCFF00
 102       new Color(255, 255, 0), // 4: yellow #FFFF00
 103       new Color(255, 204, 0), // 5: orange #FFCC00
 104       new Color(255, 153, 0), // 6: deep orange #FF9900
 105       new Color(255, 102, 0), // 7: ochre #FF6600
 106       new Color(255, 51, 0), // 8: red #FF3300
 107       new Color(255, 34, 0) // 9: redder #FF2000
 108   };
 109
 110   public final static String TCOFFEE_SCORE = "TCoffeeScore";
 111
 112   static Pattern SCORES_WITH_RESIDUE_NUMS = Pattern
 113           .compile("^\\d+\\s([^\\s]+)\\s+\\d+$");
 114
 115   /** The {@link Header} structure holder */
 116   Header header;
 117
 118   /**
 119    * Holds the consensues values for each sequences. It uses a LinkedHashMap to
 120    * maintaint the insertion order.
 121    */
 122   LinkedHashMap<String, StringBuilder> scores;
 123
 124   Integer fWidth;
 125
 126   public TCoffeeScoreFile(Object inFile, DataSourceType fileSourceType)
 127           throws IOException
 128   {
 129     // BH 2018 allows File or String
 130     super(inFile, fileSourceType);
 131
 132   }
 133
 134   public TCoffeeScoreFile(FileParse source) throws IOException
 135   {
 136     super(source);
 137   }
 138
 139   /**
 140    * Parse the provided reader for the T-Coffee scores file format
 141    *
 142    * @param reader
 143    *          public static TCoffeeScoreFile load(Reader reader) {
 144    *
 145    *          try { BufferedReader in = (BufferedReader) (reader instanceof
 146    *          BufferedReader ? reader : new BufferedReader(reader));
 147    *          TCoffeeScoreFile result = new TCoffeeScoreFile();
 148    *          result.doParsing(in); return result.header != null &&
 149    *          result.scores != null ? result : null; } catch( Exception e) {
 150    *          throw new RuntimeException(e); } }
 151    */
 152
 153   /**
 154    * @return The 'height' of the score matrix i.e. the numbers of score rows
 155    *         that should matches the number of sequences in the alignment
 156    */
 157   public int getHeight()
 158   {
 159     // the last entry will always be the 'global' alingment consensus scores, so
 160     // it is removed
 161     // from the 'height' count to make this value compatible with the number of
 162     // sequences in the MSA
 163     return scores != null && scores.size() > 0 ? scores.size() - 1 : 0;
 164   }
 165
 166   /**
 167    * @return The 'width' of the score matrix i.e. the number of columns. Since
 168    *         the score value are supposed to be calculated for an 'aligned' MSA,
 169    *         all the entries have to have the same width.
 170    */
 171   public int getWidth()
 172   {
 173     return fWidth != null ? fWidth : 0;
 174   }
 175
 176   /**
 177    * Get the string of score values for the specified seqeunce ID.
 178    *
 179    * @param id
 180    *          The sequence ID
 181    * @return The scores as a string of values e.g. {@code 99999987-------432}.
 182    *         It return an empty string when the specified ID is missing.
 183    */
 184   public String getScoresFor(String id)
 185   {
 186     return scores != null && scores.containsKey(id)
 187             ? scores.get(id).toString()
 188             : "";
 189   }
 190
 191   /**
 192    * @return The list of score string as a {@link List} object, in the same
 193    *         ordeer of the insertion i.e. in the MSA
 194    */
 195   public List<String> getScoresList()
 196   {
 197     if (scores == null)
 198     {
 199       return null;
 200     }
 201     List<String> result = new ArrayList<String>(scores.size());
 202     for (Map.Entry<String, StringBuilder> it : scores.entrySet())
 203     {
 204       result.add(it.getValue().toString());
 205     }
 206
 207     return result;
 208   }
 209
 210   /**
 211    * @return The parsed score values a matrix of bytes
 212    */
 213   public byte[][] getScoresArray()
 214   {
 215     if (scores == null)
 216     {
 217       return null;
 218     }
 219     byte[][] result = new byte[scores.size()][];
 220
 221     int rowCount = 0;
 222     for (Map.Entry<String, StringBuilder> it : scores.entrySet())
 223     {
 224       String line = it.getValue().toString();
 225       byte[] seqValues = new byte[line.length()];
 226       for (int j = 0, c = line.length(); j < c; j++)
 227       {
 228
 229         byte val = (byte) (line.charAt(j) - '0');
 230
 231         seqValues[j] = (val >= 0 && val <= 9) ? val : -1;
 232       }
 233
 234       result[rowCount++] = seqValues;
 235     }
 236
 237     return result;
 238   }
 239
 240   @Override
 241   public void parse() throws IOException
 242   {
 243     /*
 244      * read the header
 245      */
 246     header = readHeader(this);
 247
 248     if (header == null)
 249     {
 250       error = true;
 251       return;
 252     }
 253     scores = new LinkedHashMap<String, StringBuilder>();
 254
 255     /*
 256      * initilize the structure
 257      */
 258     for (Map.Entry<String, Integer> entry : header.scores.entrySet())
 259     {
 260       scores.put(entry.getKey(), new StringBuilder());
 261     }
 262
 263     /*
 264      * go with the reading
 265      */
 266     Block block;
 267     while ((block = readBlock(this, header.scores.size())) != null)
 268     {
 269
 270       /*
 271        * append sequences read in the block
 272        */
 273       for (Map.Entry<String, String> entry : block.items.entrySet())
 274       {
 275         StringBuilder scoreStringBuilder = scores.get(entry.getKey());
 276         if (scoreStringBuilder == null)
 277         {
 278           error = true;
 279           errormessage = String.format(
 280                   "Invalid T-Coffee score file: Sequence ID '%s' is not declared in header section",
 281                   entry.getKey());
 282           return;
 283         }
 284
 285         scoreStringBuilder.append(entry.getValue());
 286       }
 287     }
 288
 289     /*
 290      * verify that all rows have the same width
 291      */
 292     for (StringBuilder str : scores.values())
 293     {
 294       if (fWidth == null)
 295       {
 296         fWidth = str.length();
 297       }
 298       else if (fWidth != str.length())
 299       {
 300         error = true;
 301         errormessage = "Invalid T-Coffee score file: All the score sequences must have the same length";
 302         return;
 303       }
 304     }
 305
 306     return;
 307   }
 308
 309   static int parseInt(String str)
 310   {
 311     try
 312     {
 313       return Integer.parseInt(str);
 314     } catch (NumberFormatException e)
 315     {
 316       // TODO report a warning ?
 317       return 0;
 318     }
 319   }
 320
 321   /**
 322    * Reaad the header section in the T-Coffee score file format
 323    *
 324    * @param reader
 325    *          The scores reader
 326    * @return The parser {@link Header} instance
 327    * @throws RuntimeException
 328    *           when the header is not in the expected format
 329    */
 330   static Header readHeader(FileParse reader) throws IOException
 331   {
 332
 333     Header result = null;
 334     try
 335     {
 336       result = new Header();
 337       result.head = reader.nextLine();
 338
 339       String line;
 340
 341       while ((line = reader.nextLine()) != null)
 342       {
 343         if (line.startsWith("SCORE="))
 344         {
 345           result.score = parseInt(line.substring(6).trim());
 346           break;
 347         }
 348       }
 349
 350       if ((line = reader.nextLine()) == null || !"*".equals(line.trim()))
 351       {
 352         error(reader,
 353                 "Invalid T-COFFEE score format (NO BAD/AVG/GOOD header)");
 354         return null;
 355       }
 356       if ((line = reader.nextLine()) == null
 357               || !"BAD AVG GOOD".equals(line.trim()))
 358       {
 359         error(reader,
 360                 "Invalid T-COFFEE score format (NO BAD/AVG/GOOD header)");
 361         return null;
 362       }
 363       if ((line = reader.nextLine()) == null || !"*".equals(line.trim()))
 364       {
 365         error(reader,
 366                 "Invalid T-COFFEE score format (NO BAD/AVG/GOOD header)");
 367         return null;
 368       }
 369
 370       /*
 371        * now are expected a list if sequences ID up to the first blank line
 372        */
 373       while ((line = reader.nextLine()) != null)
 374       {
 375         if ("".equals(line))
 376         {
 377           break;
 378         }
 379
 380         int p = line.indexOf(":");
 381         if (p == -1)
 382         {
 383           // TODO report a warning
 384           continue;
 385         }
 386
 387         String id = line.substring(0, p).trim();
 388         int val = parseInt(line.substring(p + 1).trim());
 389         if ("".equals(id))
 390         {
 391           // TODO report warning
 392           continue;
 393         }
 394
 395         result.scores.put(id, val);
 396       }
 397
 398       if (result == null)
 399       {
 400         error(reader, "T-COFFEE score file had no per-sequence scores");
 401       }
 402
 403     } catch (IOException e)
 404     {
 405       error(reader, "Unexpected problem parsing T-Coffee score ascii file");
 406       throw e;
 407     }
 408
 409     return result;
 410   }
 411
 412   private static void error(FileParse reader, String errm)
 413   {
 414     reader.error = true;
 415     if (reader.errormessage == null)
 416     {
 417       reader.errormessage = errm;
 418     }
 419     else
 420     {
 421       reader.errormessage += "\n" + errm;
 422     }
 423   }
 424
 425   /**
 426    * Read a scores block ihe provided stream.
 427    *
 428    * @param reader
 429    *          The stream to parse
 430    * @param size
 431    *          The expected number of the sequence to be read
 432    * @return The {@link Block} instance read or {link null} null if the end of
 433    *         file has reached.
 434    * @throws IOException
 435    *           Something went wrong on the 'wire'
 436    */
 437   static Block readBlock(FileParse reader, int size) throws IOException
 438   {
 439     Block result = new Block(size);
 440     String line;
 441
 442     /*
 443      * read blank lines (eventually)
 444      */
 445     while ((line = reader.nextLine()) != null && "".equals(line.trim()))
 446     {
 447       // consume blank lines
 448     }
 449
 450     if (line == null)
 451     {
 452       return null;
 453     }
 454
 455     /*
 456      * read the scores block
 457      */
 458     do
 459     {
 460       if ("".equals(line.trim()))
 461       {
 462         // terminated
 463         break;
 464       }
 465
 466       // split the line on the first blank
 467       // the first part have to contain the sequence id
 468       // the remaining part are the scores values
 469       int p = line.indexOf(" ");
 470       if (p == -1)
 471       {
 472         if (reader.warningMessage == null)
 473         {
 474           reader.warningMessage = "";
 475         }
 476         reader.warningMessage += "Possible parsing error - expected to find a space in line: '"
 477                 + line + "'\n";
 478         continue;
 479       }
 480
 481       String id = line.substring(0, p).trim();
 482       String val = line.substring(p + 1).trim();
 483
 484       Matcher m = SCORES_WITH_RESIDUE_NUMS.matcher(val);
 485       if (m.matches())
 486       {
 487         val = m.group(1);
 488       }
 489
 490       result.items.put(id, val);
 491
 492     } while ((line = reader.nextLine()) != null);
 493
 494     return result;
 495   }
 496
 497   /*
 498    * The score file header
 499    */
 500   static class Header
 501   {
 502     String head;
 503
 504     int score;
 505
 506     LinkedHashMap<String, Integer> scores = new LinkedHashMap<String, Integer>();
 507
 508     public int getScoreAvg()
 509     {
 510       return score;
 511     }
 512
 513     public int getScoreFor(String ID)
 514     {
 515
 516       return scores.containsKey(ID) ? scores.get(ID) : -1;
 517
 518     }
 519   }
 520
 521   /*
 522    * Hold a single block values block in the score file
 523    */
 524   static class Block
 525   {
 526     int size;
 527
 528     Map<String, String> items;
 529
 530     public Block(int size)
 531     {
 532       this.size = size;
 533       this.items = new HashMap<String, String>(size);
 534     }
 535
 536     String getScoresFor(String id)
 537     {
 538       return items.get(id);
 539     }
 540
 541     String getConsensus()
 542     {
 543       return items.get("cons");
 544     }
 545   }
 546
 547   /**
 548    * generate annotation for this TCoffee score set on the given alignment
 549    *
 550    * @param al
 551    *          alignment to annotate
 552    * @param matchids
 553    *          if true, annotate sequences based on matching sequence names
 554    * @return true if alignment annotation was modified, false otherwise.
 555    */
 556   public boolean annotateAlignment(AlignmentI al, boolean matchids)
 557   {
 558     if (al.getHeight() != getHeight() || al.getWidth() != getWidth())
 559     {
 560       String info = String.format(
 561               "align w: %s, h: %s; score: w: %s; h: %s ", al.getWidth(),
 562               al.getHeight(), getWidth(), getHeight());
 563       warningMessage = "Alignment shape does not match T-Coffee score file shape -- "
 564               + info;
 565       return false;
 566     }
 567     boolean added = false;
 568     int i = 0;
 569     SequenceIdMatcher sidmatcher = new SequenceIdMatcher(
 570             al.getSequencesArray());
 571     byte[][] scoreMatrix = getScoresArray();
 572     // for 2.8 - we locate any existing TCoffee annotation and remove it first
 573     // before adding this.
 574     for (Map.Entry<String, StringBuilder> id : scores.entrySet())
 575     {
 576       byte[] srow = scoreMatrix[i];
 577       SequenceI s;
 578       if (matchids)
 579       {
 580         s = sidmatcher.findIdMatch(id.getKey());
 581       }
 582       else
 583       {
 584         s = al.getSequenceAt(i);
 585       }
 586       i++;
 587       if (s == null && i != scores.size() && !id.getKey().equals("cons"))
 588       {
 589         System.err
 590                 .println("No " + (matchids ? "match " : " sequences left ")
 591                         + " for TCoffee score set : " + id.getKey());
 592         continue;
 593       }
 594       int jSize = al.getWidth() < srow.length ? al.getWidth() : srow.length;
 595       Annotation[] annotations = new Annotation[al.getWidth()];
 596       for (int j = 0; j < jSize; j++)
 597       {
 598         byte val = srow[j];
 599         if (s != null && jalview.util.Comparison.isGap(s.getCharAt(j)))
 600         {
 601           annotations[j] = null;
 602           if (val > 0)
 603           {
 604             System.err.println(
 605                     "Warning: non-zero value for positional T-COFFEE score for gap at "
 606                             + j + " in sequence " + s.getName());
 607           }
 608         }
 609         else
 610         {
 611           annotations[j] = new Annotation(s == null ? "" + val : null,
 612                   s == null ? "" + val : null, '\0', val * 1f,
 613                   val >= 0 && val < colors.length ? colors[val]
 614                           : Color.white);
 615         }
 616       }
 617       // this will overwrite any existing t-coffee scores for the alignment
 618       AlignmentAnnotation aa = al.findOrCreateAnnotation(TCOFFEE_SCORE,
 619               TCOFFEE_SCORE, false, s, null);
 620       if (s != null)
 621       {
 622         aa.label = "T-COFFEE";
 623         aa.description = "" + id.getKey();
 624         aa.annotations = annotations;
 625         aa.visible = false;
 626         aa.belowAlignment = false;
 627         aa.setScore(header.getScoreFor(id.getKey()));
 628         aa.createSequenceMapping(s, s.getStart(), true);
 629         s.addAlignmentAnnotation(aa);
 630         aa.adjustForAlignment();
 631       }
 632       else
 633       {
 634         aa.graph = AlignmentAnnotation.NO_GRAPH;
 635         aa.label = "T-COFFEE";
 636         aa.description = "TCoffee column reliability score";
 637         aa.annotations = annotations;
 638         aa.belowAlignment = true;
 639         aa.visible = true;
 640         aa.setScore(header.getScoreAvg());
 641       }
 642       aa.showAllColLabels = true;
 643       aa.validateRangeAndDisplay();
 644       added = true;
 645     }
 646
 647     return added;
 648   }
 649
 650   @Override
 651   public String print(SequenceI[] sqs, boolean jvsuffix)
 652   {
 653     // TODO Auto-generated method stub
 654     return "Not valid.";
 655   }
 656 }