JAL-1065 per sequence and per-alignment scores added to annotation
[jalview.git] / src / jalview / io / TCoffeeScoreFile.java
1 package jalview.io;
2
3 import jalview.analysis.SequenceIdMatcher;
4 import jalview.datamodel.AlignmentAnnotation;
5 import jalview.datamodel.AlignmentI;
6 import jalview.datamodel.Annotation;
7 import jalview.datamodel.SequenceI;
8
9 import java.awt.Color;
10 import java.io.BufferedReader;
11 import java.io.File;
12 import java.io.FileNotFoundException;
13 import java.io.FileReader;
14 import java.io.IOException;
15 import java.io.Reader;
16 import java.util.ArrayList;
17 import java.util.HashMap;
18 import java.util.LinkedHashMap;
19 import java.util.List;
20 import java.util.Map;
21
22 /**
23  * A file parser for the T-Coffee score ascii format. Such a file contains the alignment consensus 
24  * score for each residue in every sequence.
25  * <p>
26  * The file is produced by <code>t_coffee</code> when the option 
27  * <code>-output=score_ascii</code> is given on the program command line.
28  * 
29  * An example file is the following 
30  * 
31  * <pre>
32  * T-COFFEE, Version_9.02.r1228 (2012-02-16 18:15:12 - Revision 1228 - Build 336)
33  * Cedric Notredame 
34  * CPU TIME:0 sec.
35  * SCORE=90
36  * *
37  *  BAD AVG GOOD
38  * *
39  * 1PHT   :  89
40  * 1BB9   :  90
41  * 1UHC   :  94
42  * 1YCS   :  94
43  * 1OOT   :  93
44  * 1ABO   :  94
45  * 1FYN   :  94
46  * 1QCF   :  94
47  * cons   :  90
48  * 
49  * 1PHT   999999999999999999999999998762112222543211112134
50  * 1BB9   99999999999999999999999999987-------4322----2234
51  * 1UHC   99999999999999999999999999987-------5321----2246
52  * 1YCS   99999999999999999999999999986-------4321----1-35
53  * 1OOT   999999999999999999999999999861-------3------1135
54  * 1ABO   99999999999999999999999999986-------422-------34
55  * 1FYN   99999999999999999999999999985-------32--------35
56  * 1QCF   99999999999999999999999999974-------2---------24
57  * cons   999999999999999999999999999851000110321100001134
58  * 
59  * 
60  * 1PHT   ----------5666642367889999999999889
61  * 1BB9   1111111111676653-355679999999999889
62  * 1UHC   ----------788774--66789999999999889
63  * 1YCS   ----------78777--356789999999999889
64  * 1OOT   ----------78877--356789999999997-67
65  * 1ABO   ----------687774--56779999999999889
66  * 1FYN   ----------6888842356789999999999889
67  * 1QCF   ----------6878742356789999999999889
68  * cons   00100000006877641356789999999999889
69  * </pre>
70  * 
71  * 
72  * @author Paolo Di Tommaso
73  *
74  */
75 public class TCoffeeScoreFile extends AlignFile {
76         
77   public TCoffeeScoreFile(String inFile, String type) throws IOException
78   {
79     super(inFile, type);
80     
81   }
82
83   public TCoffeeScoreFile(FileParse source) throws IOException
84   {
85     super(source);
86   }
87
        /** The {@link Header} structure holder; assigned by {@link #parse()} from the result of readHeader. */
        Header header;
        
        /** 
         * Holds the score string of each sequence, keyed by sequence ID. It uses a 
         * LinkedHashMap to maintain the insertion order. Remains null until 
         * {@link #parse()} has read a header successfully.
         */
        LinkedHashMap<String,StringBuilder> scores;

        /**
         * Common length of the score strings, assigned by {@link #parse()} while it
         * checks that all rows have the same length. Null until then.
         */
        Integer fWidth;
98         
99         /**
100          * Parse the provided reader for the T-Coffee scores file format
101          * 
102          * @param reader 
103         public static TCoffeeScoreFile load(Reader reader) {
104
105                 try {
106                         BufferedReader in = (BufferedReader) (reader instanceof BufferedReader ? reader : new BufferedReader(reader));
107                         TCoffeeScoreFile result = new TCoffeeScoreFile();
108                         result.doParsing(in);
109                         return result.header != null && result.scores != null ? result : null;
110                 }
111                 catch( Exception e) {
112                         throw new RuntimeException(e);
113                 }
114         }
115          */
116                 
117         /**
118          * @return The 'height' of the score matrix i.e. the numbers of score rows that should matches 
119          * the number of sequences in the alignment
120          */
121         public int getHeight() {
122                 // the last entry will always be the 'global' alingment consensus scores, so it is removed 
123                 // from the 'height' count to make this value compatible with the number of sequences in the MSA
124                 return scores != null && scores.size() > 0 ? scores.size()-1 : 0;
125         }
126         
127         /**
128          * @return The 'width' of the score matrix i.e. the number of columns. 
129          * Since teh score value are supposd to be calculated for an 'aligned' MSA, all the entries 
130          * have to have the same width.  
131          */
132         public int getWidth() {
133                 return fWidth != null ? fWidth : 0;
134         }
135         
136         
137         /**
138          * Get the string of score values for the specified seqeunce ID. 
139          * @param id The sequence ID 
140          * @return The scores as a string of values e.g. {@code 99999987-------432}. 
141          *      It return an empty string when the specified ID is missing. 
142          */
143         public String getScoresFor( String id ) {
144                 return scores!=null && scores.containsKey(id) ? scores.get(id).toString() : "";
145         }
146         
147         /**
148          * @return The list of score string as a {@link List} object, in the same ordeer of the insertion i.e. in the MSA
149          */
150         public List<String> getScoresList() {
151           if (scores==null)
152           {
153             return null;
154           }
155                 List<String> result = new ArrayList<String>( scores.size() );
156                 for( Map.Entry<String,StringBuilder> it : scores.entrySet() ) {
157                         result.add(it.getValue().toString());
158                 }
159                 
160                 return result;
161         }
162         
163         /**
164          * @return The parsed score values a matrix of bytes
165          */
166         public byte[][] getScoresArray() { 
167           if (scores==null)
168           {
169             return null;
170           }
171                 byte[][] result = new byte[ scores.size() ][];
172                 
173                 int rowCount = 0;
174                 for( Map.Entry<String,StringBuilder> it : scores.entrySet() ) {
175                         String line = it.getValue().toString();
176                         byte[] seqValues = new byte[ line.length() ];
177                         for( int j=0, c=line.length(); j<c; j++ ) {
178                                 
179                                 byte val = (byte)(line.charAt(j) - '0');
180
181                                 seqValues[j] = ( val >= 0 && val <= 9 ) ? val : -1; 
182                         }
183
184                         result[rowCount++] = seqValues;
185                 }
186                 
187                 return result;
188         }
189         
190
191         public void parse() throws IOException
192         {
193                 /*
194                  * read the header
195                  */
196                 header = readHeader(this);
197
198                 if( header == null ) { error=true; return;}
199                 scores = new LinkedHashMap<String,StringBuilder>();
200         
201                 /*
202                  * initilize the structure
203                  */
204                 for( Map.Entry<String,Integer> entry : header.scores.entrySet() ) {
205                         scores.put( entry.getKey(), new StringBuilder());
206                 }
207                 
208                 /*
209                  * go with the reading
210                  */
211                 Block block;
212                 while( (block = readBlock(this,header.scores.size())) != null  ) {
213                         
214                         /*
215                          * append sequences read in the block
216                          */
217                         for( Map.Entry<String,String> entry : block.items.entrySet() ) {
218                                 StringBuilder scoreStringBuilder = scores.get(entry.getKey());
219                                 if( scoreStringBuilder == null ) {
220                                         error=true;
221                                         errormessage=String.format("Invalid T-Coffee score file: Sequence ID '%s' is not declared in header section", entry.getKey());
222                                         return ;
223                                 }
224                                 
225                                 scoreStringBuilder.append( entry.getValue() );
226                         }
227                 }
228                 
229                 /*
230                  * verify that all rows have the same width
231                  */
232                 for( StringBuilder str : scores.values() ) {
233                         if( fWidth == null ) {
234                                 fWidth = str.length();
235                         }
236                         else if( fWidth != str.length() ) {
237                           error=true;
238                           errormessage="Invalid T-Coffee score file: All the score sequences must have the same length";
239                           return ;
240                         }
241                 }
242                 
243                 
244                 return;
245         }
246
247
248         static int parseInt( String str ) {
249                 try {
250                         return Integer.parseInt(str);
251                 }
252                 catch( NumberFormatException e ) {
253                         // TODO report a warning ?
254                         return 0;
255                 }               
256         }
257         
258         /**
259          * Reaad the header section in the T-Coffee score file format 
260          * 
261          * @param reader The scores reader 
262          * @return The parser {@link Header} instance 
263          * @throws RuntimeException when the header is not in the expected format
264          */
265         static Header readHeader(FileParse reader) throws IOException {
266                 
267                 Header result = null;
268                 try {
269                         result = new Header();
270                         result.head = reader.nextLine();
271                         
272                         String line;
273
274                         while( (line = reader.nextLine()) != null ) {
275                                 if( line.startsWith("SCORE=")) {
276                                         result.score = parseInt( line.substring(6).trim() );
277                                         break;
278                                 }
279                         }
280
281                         if( (line=reader.nextLine())==null || !"*".equals(line.trim())) { error(reader,"Invalid T-COFFEE score format (NO BAD/AVG/GOOD header)"); return null;}
282                         if( (line=reader.nextLine())==null || !"BAD AVG GOOD".equals(line.trim())) { error(reader,"Invalid T-COFFEE score format (NO BAD/AVG/GOOD header)"); return null;}
283                         if( (line=reader.nextLine())==null || !"*".equals(line.trim())) {error(reader,"Invalid T-COFFEE score format (NO BAD/AVG/GOOD header)"); return null;}
284                         
285                         /*
286                          * now are expected a list if sequences ID up to the first blank line
287                          */
288                         while( (line=reader.nextLine()) != null ) {
289                                 if( "".equals(line) ) {
290                                         break;
291                                 }
292                                 
293                                 int p = line.indexOf(":");
294                                 if( p == -1 ) {
295                                         // TODO report a warning
296                                         continue;
297                                 }
298                                 
299                                 String id = line.substring(0,p).trim();
300                                 int val = parseInt(line.substring(p+1).trim());
301                                 if( "".equals(id) ) {
302                                         // TODO report warning
303                                         continue;
304                                 }
305                                 
306                                 result.scores.put(id,val);
307                         }
308                         
309                         if (result==null) {
310                           error(reader, "T-COFFEE score file had no per-sequence scores");
311                         }
312                         
313                 }
314                 catch( IOException e ) {
315                   error(reader,"Unexpected problem parsing T-Coffee score ascii file");
316                   throw e;
317                 }
318                 
319                 return result;
320         } 
321         private static void error(FileParse reader, String errm)
322         {
323           reader.error=true;
324           if (reader.errormessage==null)
325           { reader.errormessage=errm;
326           } else {
327             reader.errormessage+="\n"+errm;
328           }
329         }
330         /**
331          * Read a scores block ihe provided stream. 
332          * 
333          * @param reader The stream to parse
334          * @param size The expected number of the sequence to be read 
335          * @return The {@link Block} instance read or {link null} null if the end of file has reached.
336          * @throws IOException Something went wrong on the 'wire' 
337          */
338         static Block readBlock( FileParse reader, int size ) throws IOException {
339                 Block result = new Block(size);
340                 String line;
341                 
342                 /*
343                  * read blank lines (eventually)
344                  */
345                 while( (line=reader.nextLine()) != null && "".equals(line.trim())) {
346                         // consume blank lines 
347                 }
348                 
349                 if( line == null ) { return null; }
350                 
351                 /*
352                  * read the scores block
353                  */
354                 do {
355                         if( "".equals(line.trim()) ) {
356                                 // terminated
357                                 break;
358                         }
359                         
360                         // split the line on the first blank 
361                         // the first part have to contain the sequence id
362                         // the remaining part are the scores values
363                         int p = line.indexOf(" ");
364                         if( p == -1 ) {
365                           if (reader.warningMessage==null) { reader.warningMessage=""; }
366                           reader.warningMessage+="Possible parsing error - expected to find a space in line: '"+line+"'\n";
367                                 continue;
368                         } 
369                         
370                         String id = line.substring(0,p).trim();
371                         String val = line.substring(p+1).trim();
372                         
373                         result.items.put(id, val);
374                         
375                 } while( (line = reader.nextLine()) != null ); 
376                 
377
378                 return result;
379         }
380
381         /*
382          * The score file header 
383          */
384         static class Header {
385                 String head;
386                 int score;
387
388                 LinkedHashMap<String,Integer> scores = new LinkedHashMap<String,Integer>();
389                 
390                 public int getScoreAvg() { return score; }
391                 
392                 public int getScoreFor( String ID ) { 
393
394                         return scores.containsKey(ID) ? scores.get(ID) : -1;
395         
396                 }
397         }
398         
399         /*
400          * Hold a single block values block in the score file
401          */
402         static class Block {
403                 int size;
404                 Map<String,String> items;
405                 
406                 public Block( int size ) {
407                         this.size = size;
408                         this.items = new HashMap<String,String>(size);
409                 } 
410         
411                 String getScoresFor( String id ) {
412                         return items.get(id);
413                 }
414                 
415                 String getConsensus() {
416                         return items.get("cons");
417                 }
418         }
        /**
         * TCOFFEE score colourscheme: one colour per score value, indexed by the
         * score itself (0 to 9). annotateAlignment falls back to white for values
         * outside this range.
         */
        static final Color[] colors = {
                        new Color( 102, 102, 255 ),     // #6666FF
                        new Color( 0, 255, 0),          // #00FF00
                        new Color( 102, 255, 0),        // #66FF00
                        new Color( 204, 255, 0),        // #CCFF00
                        new Color( 255, 255, 0),        // #FFFF00
                        new Color( 255, 204, 0),        // #FFCC00
                        new Color( 255, 153, 0),        // #FF9900
                        new Color( 255, 102, 0),        // #FF6600
                        new Color( 255, 51, 0),         // #FF3300
                        // NOTE(review): green is 34 (0x22), i.e. #FF2200, but the comment
                        // says #FF2000 (green 32) -- confirm which value is intended
                        new Color( 255, 34, 0)          // #FF2000
                };
        /** Name under which annotateAlignment looks up or creates its annotation rows. */
        public final static String TCOFFEE_SCORE="TCoffeeScore";
435         /**
436          * generate annotation for this TCoffee score set on the given alignment
437          * @param al alignment to annotate
438          * @param matchids if true, annotate sequences based on matching sequence names
439          * @return true if alignment annotation was modified, false otherwise.
440          */
441         public boolean annotateAlignment(AlignmentI al, boolean matchids)
442         {
443           if (al.getHeight()!=getHeight() || al.getWidth()!=getWidth())
444           {
445             warningMessage="Alignment shape does not match T-Coffee score file shape.";
446             return false;
447           }
448           boolean added=false;
449           int i=0;
450           SequenceIdMatcher sidmatcher = new SequenceIdMatcher(al.getSequencesArray());
451           byte[][] scoreMatrix=getScoresArray();
452           // for 2.8 - we locate any existing TCoffee annotation and remove it first before adding this.
453           for (Map.Entry<String,StringBuilder> id:scores.entrySet())
454           {
455             byte[] srow=scoreMatrix[i];
456             SequenceI s;
457             if (matchids)
458             {
459               s=sidmatcher.findIdMatch(id.getKey());
460             } else {
461               s=al.getSequenceAt(i);
462             }
463             i++;
464             if (s==null && i!=scores.size() && !id.getKey().equals("cons"))
465             {
466               System.err.println("No "+(matchids ? "match ":" sequences left ")+" for TCoffee score set : "+id.getKey());
467               continue;
468             }
469             int jSize=al.getWidth()< srow.length ? al.getWidth() : srow.length;
470             Annotation[] annotations=new Annotation[al.getWidth()];
471             for (int j=0;j<jSize;j++) {
472               byte val = srow[j];
473               if (s!=null && jalview.util.Comparison.isGap(s.getCharAt(j)))
474               {
475                 annotations[j]=null;
476                 if (val>0)
477                 {
478                   System.err.println("Warning: non-zero value for positional T-COFFEE score for gap at "+j+" in sequence "+s.getName());
479                 }
480               } else {
481               annotations[j]=new Annotation(s==null ? ""+val:null,s==null ? ""+val:null,'\0',val*1f,val >= 0 && val < colors.length ? colors[val] : Color.white);
482               }
483             }
484             // this will overwrite any existing t-coffee scores for the alignment
485             AlignmentAnnotation aa=al.findOrCreateAnnotation(TCOFFEE_SCORE,false,s,null);
486             if (s!=null)
487             {
488               aa.label="T-COFFEE";
489               aa.description=""+id.getKey();
490               aa.annotations=annotations;
491               aa.visible=false;
492               aa.belowAlignment=false;
493               aa.setScore(header.getScoreFor(id.getKey()));
494               aa.createSequenceMapping(s, s.getStart(),true);
495               s.addAlignmentAnnotation(aa);
496               aa.adjustForAlignment();
497             } else {
498               aa.graph=AlignmentAnnotation.NO_GRAPH;
499               aa.label="T-COFFEE";
500               aa.description="TCoffee column reliability score";
501               aa.annotations=annotations;
502               aa.belowAlignment=true;
503               aa.visible=true;
504               aa.setScore(header.getScoreAvg());
505             }
506             aa.showAllColLabels=true;
507             aa.validateRangeAndDisplay();
508             added=true;
509           }
510           
511           return added;
512         }
513
514   @Override
515   public String print()
516   {
517     // TODO Auto-generated method stub
518     return "Not valid.";
519   }
520 }