src/jalview/io/TCoffeeScoreFile.java

   1 package jalview.io;
   2
   3 import jalview.analysis.SequenceIdMatcher;
   4 import jalview.datamodel.AlignmentAnnotation;
   5 import jalview.datamodel.AlignmentI;
   6 import jalview.datamodel.Annotation;
   7 import jalview.datamodel.SequenceI;
   8
   9 import java.awt.Color;
  10 import java.io.BufferedReader;
  11 import java.io.File;
  12 import java.io.FileNotFoundException;
  13 import java.io.FileReader;
  14 import java.io.IOException;
  15 import java.io.Reader;
  16 import java.util.ArrayList;
  17 import java.util.HashMap;
  18 import java.util.LinkedHashMap;
  19 import java.util.List;
  20 import java.util.Map;
  21
  22 /**
   23  * A file parser for the T-Coffee score ASCII format. This file contains the alignment consensus
   24  * for each residue in any sequence.
   25  * <p>
   26  * This file is produced by <code>t_coffee</code> providing the option
   27  * <code>-output=score_ascii </code> to the program command line
  28  *
  29  * An example file is the following
  30  *
  31  * <pre>
  32  * T-COFFEE, Version_9.02.r1228 (2012-02-16 18:15:12 - Revision 1228 - Build 336)
  33  * Cedric Notredame
  34  * CPU TIME:0 sec.
  35  * SCORE=90
  36  * *
  37  *  BAD AVG GOOD
  38  * *
  39  * 1PHT   :  89
  40  * 1BB9   :  90
  41  * 1UHC   :  94
  42  * 1YCS   :  94
  43  * 1OOT   :  93
  44  * 1ABO   :  94
  45  * 1FYN   :  94
  46  * 1QCF   :  94
  47  * cons   :  90
  48  *
  49  * 1PHT   999999999999999999999999998762112222543211112134
  50  * 1BB9   99999999999999999999999999987-------4322----2234
  51  * 1UHC   99999999999999999999999999987-------5321----2246
  52  * 1YCS   99999999999999999999999999986-------4321----1-35
  53  * 1OOT   999999999999999999999999999861-------3------1135
  54  * 1ABO   99999999999999999999999999986-------422-------34
  55  * 1FYN   99999999999999999999999999985-------32--------35
  56  * 1QCF   99999999999999999999999999974-------2---------24
  57  * cons   999999999999999999999999999851000110321100001134
  58  *
  59  *
  60  * 1PHT   ----------5666642367889999999999889
  61  * 1BB9   1111111111676653-355679999999999889
  62  * 1UHC   ----------788774--66789999999999889
  63  * 1YCS   ----------78777--356789999999999889
  64  * 1OOT   ----------78877--356789999999997-67
  65  * 1ABO   ----------687774--56779999999999889
  66  * 1FYN   ----------6888842356789999999999889
  67  * 1QCF   ----------6878742356789999999999889
  68  * cons   00100000006877641356789999999999889
  69  * </pre>
  70  *
  71  *
  72  * @author Paolo Di Tommaso
  73  *
  74  */
  75 public class TCoffeeScoreFile extends AlignFile {
  76
  77   public TCoffeeScoreFile(String inFile, String type) throws IOException
  78   {
  79     super(inFile, type);
  80
  81   }
  82
  83   public TCoffeeScoreFile(FileParse source) throws IOException
  84   {
  85     super(source);
  86   }
  87
  88         /** The {@link Header} structure holder */
  89         Header header;
  90
  91         /**
  92          * Holds the consensues values for each sequences. It uses a LinkedHashMap to maintaint the
  93          * insertion order.
  94          */
  95         LinkedHashMap<String,StringBuilder> scores;
  96
  97         Integer fWidth;
  98
  99         /**
 100          * Parse the provided reader for the T-Coffee scores file format
 101          *
 102          * @param reader
 103         public static TCoffeeScoreFile load(Reader reader) {
 104
 105                 try {
 106                         BufferedReader in = (BufferedReader) (reader instanceof BufferedReader ? reader : new BufferedReader(reader));
 107                         TCoffeeScoreFile result = new TCoffeeScoreFile();
 108                         result.doParsing(in);
 109                         return result.header != null && result.scores != null ? result : null;
 110                 }
 111                 catch( Exception e) {
 112                         throw new RuntimeException(e);
 113                 }
 114         }
 115          */
 116
 117         /**
 118          * @return The 'height' of the score matrix i.e. the numbers of score rows that should matches
 119          * the number of sequences in the alignment
 120          */
 121         public int getHeight() {
 122                 // the last entry will always be the 'global' alingment consensus scores, so it is removed
 123                 // from the 'height' count to make this value compatible with the number of sequences in the MSA
 124                 return scores != null && scores.size() > 0 ? scores.size()-1 : 0;
 125         }
 126
 127         /**
 128          * @return The 'width' of the score matrix i.e. the number of columns.
 129          * Since teh score value are supposd to be calculated for an 'aligned' MSA, all the entries
 130          * have to have the same width.
 131          */
 132         public int getWidth() {
 133                 return fWidth != null ? fWidth : 0;
 134         }
 135
 136
 137         /**
 138          * Get the string of score values for the specified seqeunce ID.
 139          * @param id The sequence ID
 140          * @return The scores as a string of values e.g. {@code 99999987-------432}.
 141          *      It return an empty string when the specified ID is missing.
 142          */
 143         public String getScoresFor( String id ) {
 144                 return scores!=null && scores.containsKey(id) ? scores.get(id).toString() : "";
 145         }
 146
 147         /**
 148          * @return The list of score string as a {@link List} object, in the same ordeer of the insertion i.e. in the MSA
 149          */
 150         public List<String> getScoresList() {
 151           if (scores==null)
 152           {
 153             return null;
 154           }
 155                 List<String> result = new ArrayList<String>( scores.size() );
 156                 for( Map.Entry<String,StringBuilder> it : scores.entrySet() ) {
 157                         result.add(it.getValue().toString());
 158                 }
 159
 160                 return result;
 161         }
 162
 163         /**
 164          * @return The parsed score values a matrix of bytes
 165          */
 166         public byte[][] getScoresArray() {
 167           if (scores==null)
 168           {
 169             return null;
 170           }
 171                 byte[][] result = new byte[ scores.size() ][];
 172
 173                 int rowCount = 0;
 174                 for( Map.Entry<String,StringBuilder> it : scores.entrySet() ) {
 175                         String line = it.getValue().toString();
 176                         byte[] seqValues = new byte[ line.length() ];
 177                         for( int j=0, c=line.length(); j<c; j++ ) {
 178
 179                                 byte val = (byte)(line.charAt(j) - '0');
 180
 181                                 seqValues[j] = ( val >= 0 && val <= 9 ) ? val : -1;
 182                         }
 183
 184                         result[rowCount++] = seqValues;
 185                 }
 186
 187                 return result;
 188         }
 189
 190
 191         public void parse() throws IOException
 192         {
 193                 /*
 194                  * read the header
 195                  */
 196                 header = readHeader(this);
 197
 198                 if( header == null ) { error=true; return;}
 199                 scores = new LinkedHashMap<String,StringBuilder>();
 200
 201                 /*
 202                  * initilize the structure
 203                  */
 204                 for( Map.Entry<String,Integer> entry : header.scores.entrySet() ) {
 205                         scores.put( entry.getKey(), new StringBuilder());
 206                 }
 207
 208                 /*
 209                  * go with the reading
 210                  */
 211                 Block block;
 212                 while( (block = readBlock(this,header.scores.size())) != null  ) {
 213
 214                         /*
 215                          * append sequences read in the block
 216                          */
 217                         for( Map.Entry<String,String> entry : block.items.entrySet() ) {
 218                                 StringBuilder scoreStringBuilder = scores.get(entry.getKey());
 219                                 if( scoreStringBuilder == null ) {
 220                                         error=true;
 221                                         errormessage=String.format("Invalid T-Coffee score file: Sequence ID '%s' is not declared in header section", entry.getKey());
 222                                         return ;
 223                                 }
 224
 225                                 scoreStringBuilder.append( entry.getValue() );
 226                         }
 227                 }
 228
 229                 /*
 230                  * verify that all rows have the same width
 231                  */
 232                 for( StringBuilder str : scores.values() ) {
 233                         if( fWidth == null ) {
 234                                 fWidth = str.length();
 235                         }
 236                         else if( fWidth != str.length() ) {
 237                           error=true;
 238                           errormessage="Invalid T-Coffee score file: All the score sequences must have the same length";
 239                           return ;
 240                         }
 241                 }
 242
 243
 244                 return;
 245         }
 246
 247
 248         static int parseInt( String str ) {
 249                 try {
 250                         return Integer.parseInt(str);
 251                 }
 252                 catch( NumberFormatException e ) {
 253                         // TODO report a warning ?
 254                         return 0;
 255                 }
 256         }
 257
 258         /**
 259          * Reaad the header section in the T-Coffee score file format
 260          *
 261          * @param reader The scores reader
 262          * @return The parser {@link Header} instance
 263          * @throws RuntimeException when the header is not in the expected format
 264          */
 265         static Header readHeader(FileParse reader) throws IOException {
 266
 267                 Header result = null;
 268                 try {
 269                         result = new Header();
 270                         result.head = reader.nextLine();
 271
 272                         String line;
 273
 274                         while( (line = reader.nextLine()) != null ) {
 275                                 if( line.startsWith("SCORE=")) {
 276                                         result.score = parseInt( line.substring(6).trim() );
 277                                         break;
 278                                 }
 279                         }
 280
 281                         if( (line=reader.nextLine())==null || !"*".equals(line.trim())) { error(reader,"Invalid T-COFFEE score format (NO BAD/AVG/GOOD header)"); return null;}
 282                         if( (line=reader.nextLine())==null || !"BAD AVG GOOD".equals(line.trim())) { error(reader,"Invalid T-COFFEE score format (NO BAD/AVG/GOOD header)"); return null;}
 283                         if( (line=reader.nextLine())==null || !"*".equals(line.trim())) {error(reader,"Invalid T-COFFEE score format (NO BAD/AVG/GOOD header)"); return null;}
 284
 285                         /*
 286                          * now are expected a list if sequences ID up to the first blank line
 287                          */
 288                         while( (line=reader.nextLine()) != null ) {
 289                                 if( "".equals(line) ) {
 290                                         break;
 291                                 }
 292
 293                                 int p = line.indexOf(":");
 294                                 if( p == -1 ) {
 295                                         // TODO report a warning
 296                                         continue;
 297                                 }
 298
 299                                 String id = line.substring(0,p).trim();
 300                                 int val = parseInt(line.substring(p+1).trim());
 301                                 if( "".equals(id) ) {
 302                                         // TODO report warning
 303                                         continue;
 304                                 }
 305
 306                                 result.scores.put(id,val);
 307                         }
 308
 309                         if (result==null) {
 310                           error(reader, "T-COFFEE score file had no per-sequence scores");
 311                         }
 312
 313                 }
 314                 catch( IOException e ) {
 315                   error(reader,"Unexpected problem parsing T-Coffee score ascii file");
 316                   throw e;
 317                 }
 318
 319                 return result;
 320         }
 321         private static void error(FileParse reader, String errm)
 322         {
 323           reader.error=true;
 324           if (reader.errormessage==null)
 325           { reader.errormessage=errm;
 326           } else {
 327             reader.errormessage+="\n"+errm;
 328           }
 329         }
 330         /**
 331          * Read a scores block ihe provided stream.
 332          *
 333          * @param reader The stream to parse
 334          * @param size The expected number of the sequence to be read
 335          * @return The {@link Block} instance read or {link null} null if the end of file has reached.
 336          * @throws IOException Something went wrong on the 'wire'
 337          */
 338         static Block readBlock( FileParse reader, int size ) throws IOException {
 339                 Block result = new Block(size);
 340                 String line;
 341
 342                 /*
 343                  * read blank lines (eventually)
 344                  */
 345                 while( (line=reader.nextLine()) != null && "".equals(line.trim())) {
 346                         // consume blank lines
 347                 }
 348
 349                 if( line == null ) { return null; }
 350
 351                 /*
 352                  * read the scores block
 353                  */
 354                 do {
 355                         if( "".equals(line.trim()) ) {
 356                                 // terminated
 357                                 break;
 358                         }
 359
 360                         // split the line on the first blank
 361                         // the first part have to contain the sequence id
 362                         // the remaining part are the scores values
 363                         int p = line.indexOf(" ");
 364                         if( p == -1 ) {
 365                           if (reader.warningMessage==null) { reader.warningMessage=""; }
 366                           reader.warningMessage+="Possible parsing error - expected to find a space in line: '"+line+"'\n";
 367                                 continue;
 368                         }
 369
 370                         String id = line.substring(0,p).trim();
 371                         String val = line.substring(p+1).trim();
 372
 373                         result.items.put(id, val);
 374
 375                 } while( (line = reader.nextLine()) != null );
 376
 377
 378                 return result;
 379         }
 380
 381         /*
 382          * The score file header
 383          */
 384         static class Header {
 385                 String head;
 386                 int score;
 387
 388                 LinkedHashMap<String,Integer> scores = new LinkedHashMap<String,Integer>();
 389
 390                 public int getScoreAvg() { return score; }
 391
 392                 public int getScoreFor( String ID ) {
 393
 394                         return scores.containsKey(ID) ? scores.get(ID) : -1;
 395
 396                 }
 397         }
 398
 399         /*
 400          * Hold a single block values block in the score file
 401          */
 402         static class Block {
 403                 int size;
 404                 Map<String,String> items;
 405
 406                 public Block( int size ) {
 407                         this.size = size;
 408                         this.items = new HashMap<String,String>(size);
 409                 }
 410
 411                 String getScoresFor( String id ) {
 412                         return items.get(id);
 413                 }
 414
 415                 String getConsensus() {
 416                         return items.get("cons");
 417                 }
 418         }
 419         /**
 420          * TCOFFEE score colourscheme
 421          */
 422         static final Color[] colors = {
 423                         new Color( 102, 102, 255 ),     // #6666FF
 424                         new Color( 0, 255, 0),          // #00FF00
 425                         new Color( 102, 255, 0),        // #66FF00
 426                         new Color( 204, 255, 0),        // #CCFF00
 427                         new Color( 255, 255, 0),        // #FFFF00
 428                         new Color( 255, 204, 0),        // #FFCC00
 429                         new Color( 255, 153, 0),        // #FF9900
 430                         new Color( 255, 102, 0),        // #FF6600
 431                         new Color( 255, 51, 0),         // #FF3300
 432                         new Color( 255, 34, 0)          // #FF2000
 433                 };
 434         public final static String TCOFFEE_SCORE="TCoffeeScore";
 435         /**
 436          * generate annotation for this TCoffee score set on the given alignment
 437          * @param al alignment to annotate
 438          * @param matchids if true, annotate sequences based on matching sequence names
 439          * @return true if alignment annotation was modified, false otherwise.
 440          */
 441         public boolean annotateAlignment(AlignmentI al, boolean matchids)
 442         {
 443           if (al.getHeight()!=getHeight() || al.getWidth()!=getWidth())
 444           {
 445             warningMessage="Alignment shape does not match T-Coffee score file shape.";
 446             return false;
 447           }
 448           boolean added=false;
 449           int i=0;
 450           SequenceIdMatcher sidmatcher = new SequenceIdMatcher(al.getSequencesArray());
 451           byte[][] scoreMatrix=getScoresArray();
 452           // for 2.8 - we locate any existing TCoffee annotation and remove it first before adding this.
 453           for (Map.Entry<String,StringBuilder> id:scores.entrySet())
 454           {
 455             byte[] srow=scoreMatrix[i];
 456             SequenceI s;
 457             if (matchids)
 458             {
 459               s=sidmatcher.findIdMatch(id.getKey());
 460             } else {
 461               s=al.getSequenceAt(i);
 462             }
 463             i++;
 464             if (s==null && i!=scores.size() && !id.getKey().equals("cons"))
 465             {
 466               System.err.println("No "+(matchids ? "match ":" sequences left ")+" for TCoffee score set : "+id.getKey());
 467               continue;
 468             }
 469             int jSize=al.getWidth()< srow.length ? al.getWidth() : srow.length;
 470             Annotation[] annotations=new Annotation[al.getWidth()];
 471             for (int j=0;j<jSize;j++) {
 472               byte val = srow[j];
 473               if (s!=null && jalview.util.Comparison.isGap(s.getCharAt(j)))
 474               {
 475                 annotations[j]=null;
 476                 if (val>0)
 477                 {
 478                   System.err.println("Warning: non-zero value for positional T-COFFEE score for gap at "+j+" in sequence "+s.getName());
 479                 }
 480               } else {
 481               annotations[j]=new Annotation(s==null ? ""+val:null,s==null ? ""+val:null,'\0',val*1f,val >= 0 && val < colors.length ? colors[val] : Color.white);
 482               }
 483             }
 484             // this will overwrite any existing t-coffee scores for the alignment
 485             AlignmentAnnotation aa=al.findOrCreateAnnotation(TCOFFEE_SCORE,false,s,null);
 486             if (s!=null)
 487             {
 488               aa.label="T-COFFEE";
 489               aa.description=""+id.getKey();
 490               aa.annotations=annotations;
 491               aa.visible=false;
 492               aa.belowAlignment=false;
 493               aa.setScore(header.getScoreFor(id.getKey()));
 494               aa.createSequenceMapping(s, s.getStart(),true);
 495               s.addAlignmentAnnotation(aa);
 496               aa.adjustForAlignment();
 497             } else {
 498               aa.graph=AlignmentAnnotation.NO_GRAPH;
 499               aa.label="T-COFFEE";
 500               aa.description="TCoffee column reliability score";
 501               aa.annotations=annotations;
 502               aa.belowAlignment=true;
 503               aa.visible=true;
 504               aa.setScore(header.getScoreAvg());
 505             }
 506             aa.showAllColLabels=true;
 507             aa.validateRangeAndDisplay();
 508             added=true;
 509           }
 510
 511           return added;
 512         }
 513
 514   @Override
 515   public String print()
 516   {
 517     // TODO Auto-generated method stub
 518     return "Not valid.";
 519   }
 520 }