src/jalview/io/TCoffeeScoreFile.java

   1 /*
   2  * Jalview - A Sequence Alignment Editor and Viewer (Version 2.8)
   3  * Copyright (C) 2012 J Procter, AM Waterhouse, LM Lui, J Engelhardt, G Barton, M Clamp, S Searle
   4  *
   5  * This file is part of Jalview.
   6  *
   7  * Jalview is free software: you can redistribute it and/or
   8  * modify it under the terms of the GNU General Public License
   9  * as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version.
  10  *
  11  * Jalview is distributed in the hope that it will be useful, but
  12  * WITHOUT ANY WARRANTY; without even the implied warranty
  13  * of MERCHANTABILITY or FITNESS FOR A PARTICULAR
  14  * PURPOSE.  See the GNU General Public License for more details.
  15  *
  16  * You should have received a copy of the GNU General Public License along with Jalview.  If not, see <http://www.gnu.org/licenses/>.
  17  */
  18 package jalview.io;
  19
  20 import jalview.analysis.SequenceIdMatcher;
  21 import jalview.datamodel.AlignmentAnnotation;
  22 import jalview.datamodel.AlignmentI;
  23 import jalview.datamodel.Annotation;
  24 import jalview.datamodel.SequenceI;
  25
  26 import java.awt.Color;
  27 import java.io.IOException;
  28 import java.util.ArrayList;
  29 import java.util.HashMap;
  30 import java.util.LinkedHashMap;
  31 import java.util.List;
  32 import java.util.Map;
  33 import java.util.regex.Matcher;
  34 import java.util.regex.Pattern;
  35
  36 import javax.xml.parsers.ParserConfigurationException;
  37
  38 import org.xml.sax.SAXException;
  39
  40 import fr.orsay.lri.varna.exceptions.ExceptionFileFormatOrSyntax;
  41 import fr.orsay.lri.varna.exceptions.ExceptionLoadingFailed;
  42 import fr.orsay.lri.varna.exceptions.ExceptionPermissionDenied;
  43 import fr.orsay.lri.varna.exceptions.ExceptionUnmatchedClosingParentheses;
  44
  45 /**
 * A file parser for the T-Coffee score ascii format. This file contains the
 * alignment consensus for each residue in any sequence.
 * <p>
 * This file is produced by <code>t_coffee</code> when the option
 * <code>-output=score_ascii </code> is given on the program command line
  51  *
  52  * An example file is the following
  53  *
  54  * <pre>
  55  * T-COFFEE, Version_9.02.r1228 (2012-02-16 18:15:12 - Revision 1228 - Build 336)
  56  * Cedric Notredame
  57  * CPU TIME:0 sec.
  58  * SCORE=90
  59  * *
  60  *  BAD AVG GOOD
  61  * *
  62  * 1PHT   :  89
  63  * 1BB9   :  90
  64  * 1UHC   :  94
  65  * 1YCS   :  94
  66  * 1OOT   :  93
  67  * 1ABO   :  94
  68  * 1FYN   :  94
  69  * 1QCF   :  94
  70  * cons   :  90
  71  *
  72  * 1PHT   999999999999999999999999998762112222543211112134
  73  * 1BB9   99999999999999999999999999987-------4322----2234
  74  * 1UHC   99999999999999999999999999987-------5321----2246
  75  * 1YCS   99999999999999999999999999986-------4321----1-35
  76  * 1OOT   999999999999999999999999999861-------3------1135
  77  * 1ABO   99999999999999999999999999986-------422-------34
  78  * 1FYN   99999999999999999999999999985-------32--------35
  79  * 1QCF   99999999999999999999999999974-------2---------24
  80  * cons   999999999999999999999999999851000110321100001134
  81  *
  82  *
  83  * 1PHT   ----------5666642367889999999999889
  84  * 1BB9   1111111111676653-355679999999999889
  85  * 1UHC   ----------788774--66789999999999889
  86  * 1YCS   ----------78777--356789999999999889
  87  * 1OOT   ----------78877--356789999999997-67
  88  * 1ABO   ----------687774--56779999999999889
  89  * 1FYN   ----------6888842356789999999999889
  90  * 1QCF   ----------6878742356789999999999889
  91  * cons   00100000006877641356789999999999889
  92  * </pre>
  93  *
  94  *
  95  * @author Paolo Di Tommaso
  96  *
  97  */
  98 public class TCoffeeScoreFile extends AlignFile {
  99
 100   public TCoffeeScoreFile(String inFile, String type) throws Exception
 101   {
 102     super(inFile, type);
 103
 104   }
 105
 106   public TCoffeeScoreFile(FileParse source) throws Exception
 107   {
 108     super(source);
 109   }
 110
  /** The {@link Header} structure holder, assigned by {@link #parse()}. */
  Header header;

  /**
   * Holds the consensus values for each sequence, keyed by sequence ID. It
   * uses a LinkedHashMap to maintain the insertion order.
   */
  LinkedHashMap<String, StringBuilder> scores;

  /**
   * Length of the first score row, recorded by {@link #parse()} and used there
   * to check that every other row has the same length; {@code null} until a
   * row has been measured.
   */
  Integer fWidth;
 121
  /*
   * Commented-out loader that parsed the provided reader for the T-Coffee
   * scores file format:
   * 
   *   public static TCoffeeScoreFile load(Reader reader) {
   *     try {
   *       BufferedReader in = (BufferedReader) (reader instanceof BufferedReader
   *               ? reader : new BufferedReader(reader));
   *       TCoffeeScoreFile result = new TCoffeeScoreFile();
   *       result.doParsing(in);
   *       return result.header != null && result.scores != null ? result : null;
   *     } catch (Exception e) {
   *       throw new RuntimeException(e);
   *     }
   *   }
   */
 135
 136   /**
 137    * @return The 'height' of the score matrix i.e. the numbers of score rows
 138    *         that should matches the number of sequences in the alignment
 139    */
 140   public int getHeight()
 141   {
 142     // the last entry will always be the 'global' alingment consensus scores, so
 143     // it is removed
 144     // from the 'height' count to make this value compatible with the number of
 145     // sequences in the MSA
 146     return scores != null && scores.size() > 0 ? scores.size() - 1 : 0;
 147   }
 148
 149   /**
 150    * @return The 'width' of the score matrix i.e. the number of columns. Since
 151    *         the score value are supposed to be calculated for an 'aligned' MSA,
 152    *         all the entries have to have the same width.
 153    */
 154   public int getWidth()
 155   {
 156     return fWidth != null ? fWidth : 0;
 157   }
 158
 159   /**
 160    * Get the string of score values for the specified seqeunce ID.
 161    *
 162    * @param id
 163    *          The sequence ID
 164    * @return The scores as a string of values e.g. {@code 99999987-------432}.
 165    *         It return an empty string when the specified ID is missing.
 166    */
 167   public String getScoresFor(String id)
 168   {
 169     return scores != null && scores.containsKey(id) ? scores.get(id)
 170             .toString() : "";
 171   }
 172
 173   /**
 174    * @return The list of score string as a {@link List} object, in the same
 175    *         ordeer of the insertion i.e. in the MSA
 176    */
 177   public List<String> getScoresList()
 178   {
 179     if (scores == null)
 180     {
 181       return null;
 182     }
 183     List<String> result = new ArrayList<String>(scores.size());
 184     for (Map.Entry<String, StringBuilder> it : scores.entrySet())
 185     {
 186       result.add(it.getValue().toString());
 187     }
 188
 189     return result;
 190   }
 191
 192   /**
 193    * @return The parsed score values a matrix of bytes
 194    */
 195   public byte[][] getScoresArray()
 196   {
 197     if (scores == null)
 198     {
 199       return null;
 200     }
 201     byte[][] result = new byte[scores.size()][];
 202
 203     int rowCount = 0;
 204     for (Map.Entry<String, StringBuilder> it : scores.entrySet())
 205     {
 206       String line = it.getValue().toString();
 207       byte[] seqValues = new byte[line.length()];
 208       for (int j = 0, c = line.length(); j < c; j++)
 209       {
 210
 211         byte val = (byte) (line.charAt(j) - '0');
 212
 213         seqValues[j] = (val >= 0 && val <= 9) ? val : -1;
 214       }
 215
 216       result[rowCount++] = seqValues;
 217     }
 218
 219     return result;
 220   }
 221
 222   public void parse() throws IOException
 223   {
 224     /*
 225      * read the header
 226      */
 227     header = readHeader(this);
 228
 229     if (header == null)
 230     {
 231       error = true;
 232       return;
 233     }
 234     scores = new LinkedHashMap<String, StringBuilder>();
 235
 236     /*
 237      * initilize the structure
 238      */
 239     for (Map.Entry<String, Integer> entry : header.scores.entrySet())
 240     {
 241       scores.put(entry.getKey(), new StringBuilder());
 242     }
 243
 244     /*
 245      * go with the reading
 246      */
 247     Block block;
 248     while ((block = readBlock(this, header.scores.size())) != null)
 249     {
 250
 251       /*
 252        * append sequences read in the block
 253        */
 254       for (Map.Entry<String, String> entry : block.items.entrySet())
 255       {
 256         StringBuilder scoreStringBuilder = scores.get(entry.getKey());
 257         if (scoreStringBuilder == null)
 258         {
 259           error = true;
 260           errormessage = String
 261                   .format("Invalid T-Coffee score file: Sequence ID '%s' is not declared in header section",
 262                           entry.getKey());
 263           return;
 264         }
 265
 266         scoreStringBuilder.append(entry.getValue());
 267       }
 268     }
 269
 270     /*
 271      * verify that all rows have the same width
 272      */
 273     for (StringBuilder str : scores.values())
 274     {
 275       if (fWidth == null)
 276       {
 277         fWidth = str.length();
 278       }
 279       else if (fWidth != str.length())
 280       {
 281         error = true;
 282         errormessage = "Invalid T-Coffee score file: All the score sequences must have the same length";
 283         return;
 284       }
 285     }
 286
 287     return;
 288   }
 289
 290   static int parseInt(String str)
 291   {
 292     try
 293     {
 294       return Integer.parseInt(str);
 295     } catch (NumberFormatException e)
 296     {
 297       // TODO report a warning ?
 298       return 0;
 299     }
 300   }
 301
 302   /**
 303    * Reaad the header section in the T-Coffee score file format
 304    *
 305    * @param reader
 306    *          The scores reader
 307    * @return The parser {@link Header} instance
 308    * @throws RuntimeException
 309    *           when the header is not in the expected format
 310    */
 311   static Header readHeader(FileParse reader) throws IOException
 312   {
 313
 314     Header result = null;
 315     try
 316     {
 317       result = new Header();
 318       result.head = reader.nextLine();
 319
 320       String line;
 321
 322       while ((line = reader.nextLine()) != null)
 323       {
 324         if (line.startsWith("SCORE="))
 325         {
 326           result.score = parseInt(line.substring(6).trim());
 327           break;
 328         }
 329       }
 330
 331       if ((line = reader.nextLine()) == null || !"*".equals(line.trim()))
 332       {
 333         error(reader,
 334                 "Invalid T-COFFEE score format (NO BAD/AVG/GOOD header)");
 335         return null;
 336       }
 337       if ((line = reader.nextLine()) == null
 338               || !"BAD AVG GOOD".equals(line.trim()))
 339       {
 340         error(reader,
 341                 "Invalid T-COFFEE score format (NO BAD/AVG/GOOD header)");
 342         return null;
 343       }
 344       if ((line = reader.nextLine()) == null || !"*".equals(line.trim()))
 345       {
 346         error(reader,
 347                 "Invalid T-COFFEE score format (NO BAD/AVG/GOOD header)");
 348         return null;
 349       }
 350
 351       /*
 352        * now are expected a list if sequences ID up to the first blank line
 353        */
 354       while ((line = reader.nextLine()) != null)
 355       {
 356         if ("".equals(line))
 357         {
 358           break;
 359         }
 360
 361         int p = line.indexOf(":");
 362         if (p == -1)
 363         {
 364           // TODO report a warning
 365           continue;
 366         }
 367
 368         String id = line.substring(0, p).trim();
 369         int val = parseInt(line.substring(p + 1).trim());
 370         if ("".equals(id))
 371         {
 372           // TODO report warning
 373           continue;
 374         }
 375
 376         result.scores.put(id, val);
 377       }
 378
 379       if (result == null)
 380       {
 381         error(reader, "T-COFFEE score file had no per-sequence scores");
 382       }
 383
 384     } catch (IOException e)
 385     {
 386       error(reader, "Unexpected problem parsing T-Coffee score ascii file");
 387       throw e;
 388     }
 389
 390     return result;
 391   }
 392
 393   private static void error(FileParse reader, String errm)
 394   {
 395     reader.error = true;
 396     if (reader.errormessage == null)
 397     {
 398       reader.errormessage = errm;
 399     }
 400     else
 401     {
 402       reader.errormessage += "\n" + errm;
 403     }
 404   }
 405
 406   static Pattern SCORES_WITH_RESIDUE_NUMS = Pattern.compile("^\\d+\\s([^\\s]+)\\s+\\d+$");
 407
 408   /**
 409    * Read a scores block ihe provided stream.
 410    *
 411    * @param reader
 412    *          The stream to parse
 413    * @param size
 414    *          The expected number of the sequence to be read
 415    * @return The {@link Block} instance read or {link null} null if the end of
 416    *         file has reached.
 417    * @throws IOException
 418    *           Something went wrong on the 'wire'
 419    */
 420   static Block readBlock(FileParse reader, int size) throws IOException
 421   {
 422     Block result = new Block(size);
 423     String line;
 424
 425     /*
 426      * read blank lines (eventually)
 427      */
 428     while ((line = reader.nextLine()) != null && "".equals(line.trim()))
 429     {
 430       // consume blank lines
 431     }
 432
 433     if (line == null)
 434     {
 435       return null;
 436     }
 437
 438     /*
 439      * read the scores block
 440      */
 441     do
 442     {
 443       if ("".equals(line.trim()))
 444       {
 445         // terminated
 446         break;
 447       }
 448
 449       // split the line on the first blank
 450       // the first part have to contain the sequence id
 451       // the remaining part are the scores values
 452       int p = line.indexOf(" ");
 453       if (p == -1)
 454       {
 455         if (reader.warningMessage == null)
 456         {
 457           reader.warningMessage = "";
 458         }
 459         reader.warningMessage += "Possible parsing error - expected to find a space in line: '"
 460                 + line + "'\n";
 461         continue;
 462       }
 463
 464       String id = line.substring(0, p).trim();
 465       String val = line.substring(p + 1).trim();
 466
 467       Matcher m = SCORES_WITH_RESIDUE_NUMS.matcher(val);
 468       if( m.matches() ) {
 469           val = m.group(1);
 470       }
 471
 472       result.items.put(id, val);
 473
 474     } while ((line = reader.nextLine()) != null);
 475
 476     return result;
 477   }
 478
 479   /*
 480    * The score file header
 481    */
 482   static class Header
 483   {
 484     String head;
 485
 486     int score;
 487
 488     LinkedHashMap<String, Integer> scores = new LinkedHashMap<String, Integer>();
 489
 490     public int getScoreAvg()
 491     {
 492       return score;
 493     }
 494
 495     public int getScoreFor(String ID)
 496     {
 497
 498       return scores.containsKey(ID) ? scores.get(ID) : -1;
 499
 500     }
 501   }
 502
 503   /*
 504    * Hold a single block values block in the score file
 505    */
 506   static class Block
 507   {
 508     int size;
 509
 510     Map<String, String> items;
 511
 512     public Block(int size)
 513     {
 514       this.size = size;
 515       this.items = new HashMap<String, String>(size);
 516     }
 517
 518     String getScoresFor(String id)
 519     {
 520       return items.get(id);
 521     }
 522
 523     String getConsensus()
 524     {
 525       return items.get("cons");
 526     }
 527   }
 528
  /**
   * TCOFFEE score colourscheme: one colour per score value, looked up by
   * using the score (0 to 9) as the array index.
   */
  static final Color[] colors =
  { new Color(102, 102, 255), // #6666FF
      new Color(0, 255, 0), // #00FF00
      new Color(102, 255, 0), // #66FF00
      new Color(204, 255, 0), // #CCFF00
      new Color(255, 255, 0), // #FFFF00
      new Color(255, 204, 0), // #FFCC00
      new Color(255, 153, 0), // #FF9900
      new Color(255, 102, 0), // #FF6600
      new Color(255, 51, 0), // #FF3300
      // NOTE(review): a green component of 34 is 0x22, i.e. #FF2200, while the
      // trailing comment says #FF2000 (green 32) - confirm which is intended
      new Color(255, 34, 0) // #FF2000
  };

  /**
   * Identifier passed to {@code findOrCreateAnnotation} when the T-Coffee
   * score annotation rows are looked up or created.
   */
  public final static String TCOFFEE_SCORE = "TCoffeeScore";
 546
 547   /**
 548    * generate annotation for this TCoffee score set on the given alignment
 549    *
 550    * @param al
 551    *          alignment to annotate
 552    * @param matchids
 553    *          if true, annotate sequences based on matching sequence names
 554    * @return true if alignment annotation was modified, false otherwise.
 555    */
 556   public boolean annotateAlignment(AlignmentI al, boolean matchids)
 557   {
 558     if (al.getHeight() != getHeight() || al.getWidth() != getWidth())
 559     {
 560       String info = String.format("align w: %s, h: %s; score: w: %s; h: %s ", al.getWidth(), al.getHeight(), getWidth(), getHeight() );
 561       warningMessage = "Alignment shape does not match T-Coffee score file shape -- " + info;
 562       return false;
 563     }
 564     boolean added = false;
 565     int i = 0;
 566     SequenceIdMatcher sidmatcher = new SequenceIdMatcher(
 567             al.getSequencesArray());
 568     byte[][] scoreMatrix = getScoresArray();
 569     // for 2.8 - we locate any existing TCoffee annotation and remove it first
 570     // before adding this.
 571     for (Map.Entry<String, StringBuilder> id : scores.entrySet())
 572     {
 573       byte[] srow = scoreMatrix[i];
 574       SequenceI s;
 575       if (matchids)
 576       {
 577         s = sidmatcher.findIdMatch(id.getKey());
 578       }
 579       else
 580       {
 581         s = al.getSequenceAt(i);
 582       }
 583       i++;
 584       if (s == null && i != scores.size() && !id.getKey().equals("cons"))
 585       {
 586         System.err.println("No "
 587                 + (matchids ? "match " : " sequences left ")
 588                 + " for TCoffee score set : " + id.getKey());
 589         continue;
 590       }
 591       int jSize = al.getWidth() < srow.length ? al.getWidth() : srow.length;
 592       Annotation[] annotations = new Annotation[al.getWidth()];
 593       for (int j = 0; j < jSize; j++)
 594       {
 595         byte val = srow[j];
 596         if (s != null && jalview.util.Comparison.isGap(s.getCharAt(j)))
 597         {
 598           annotations[j] = null;
 599           if (val > 0)
 600           {
 601             System.err
 602                     .println("Warning: non-zero value for positional T-COFFEE score for gap at "
 603                             + j + " in sequence " + s.getName());
 604           }
 605         }
 606         else
 607         {
 608           annotations[j] = new Annotation(s == null ? "" + val : null,
 609                   s == null ? "" + val : null, '\0', val * 1f, val >= 0
 610                           && val < colors.length ? colors[val]
 611                           : Color.white);
 612         }
 613       }
 614       // this will overwrite any existing t-coffee scores for the alignment
 615       AlignmentAnnotation aa = al.findOrCreateAnnotation(TCOFFEE_SCORE,
 616               TCOFFEE_SCORE, false, s, null);
 617       if (s != null)
 618       {
 619         aa.label = "T-COFFEE";
 620         aa.description = "" + id.getKey();
 621         aa.annotations = annotations;
 622         aa.visible = false;
 623         aa.belowAlignment = false;
 624         aa.setScore(header.getScoreFor(id.getKey()));
 625         aa.createSequenceMapping(s, s.getStart(), true);
 626         s.addAlignmentAnnotation(aa);
 627         aa.adjustForAlignment();
 628       }
 629       else
 630       {
 631         aa.graph = AlignmentAnnotation.NO_GRAPH;
 632         aa.label = "T-COFFEE";
 633         aa.description = "TCoffee column reliability score";
 634         aa.annotations = annotations;
 635         aa.belowAlignment = true;
 636         aa.visible = true;
 637         aa.setScore(header.getScoreAvg());
 638       }
 639       aa.showAllColLabels = true;
 640       aa.validateRangeAndDisplay();
 641       added = true;
 642     }
 643
 644     return added;
 645   }
 646
 647   @Override
 648   public String print()
 649   {
 650     // TODO Auto-generated method stub
 651     return "Not valid.";
 652   }
 653 }