Merge remote-tracking branch 'origin/Tcoffee_JAL-1065' into develop
[jalview.git] / src / jalview / io / TCoffeeScoreFile.java
1 package jalview.io;
2
3 import jalview.analysis.SequenceIdMatcher;
4 import jalview.datamodel.AlignmentAnnotation;
5 import jalview.datamodel.AlignmentI;
6 import jalview.datamodel.Annotation;
7 import jalview.datamodel.SequenceI;
8
9 import java.awt.Color;
10 import java.io.BufferedReader;
11 import java.io.File;
12 import java.io.FileNotFoundException;
13 import java.io.FileReader;
14 import java.io.IOException;
15 import java.io.Reader;
16 import java.util.ArrayList;
17 import java.util.HashMap;
18 import java.util.LinkedHashMap;
19 import java.util.List;
20 import java.util.Map;
21
22 /**
23  * A file parser for the T-Coffee score ascii format. This file contains the alignment consensus 
24  * for each residue in any sequence.
25  * <p>
26  * This file is produced by <code>t_coffee</code> when the option 
27  * <code>-output=score_ascii </code> is passed on the program command line.
28  * 
29  * An example file is the following 
30  * 
31  * <pre>
32  * T-COFFEE, Version_9.02.r1228 (2012-02-16 18:15:12 - Revision 1228 - Build 336)
33  * Cedric Notredame 
34  * CPU TIME:0 sec.
35  * SCORE=90
36  * *
37  *  BAD AVG GOOD
38  * *
39  * 1PHT   :  89
40  * 1BB9   :  90
41  * 1UHC   :  94
42  * 1YCS   :  94
43  * 1OOT   :  93
44  * 1ABO   :  94
45  * 1FYN   :  94
46  * 1QCF   :  94
47  * cons   :  90
48  * 
49  * 1PHT   999999999999999999999999998762112222543211112134
50  * 1BB9   99999999999999999999999999987-------4322----2234
51  * 1UHC   99999999999999999999999999987-------5321----2246
52  * 1YCS   99999999999999999999999999986-------4321----1-35
53  * 1OOT   999999999999999999999999999861-------3------1135
54  * 1ABO   99999999999999999999999999986-------422-------34
55  * 1FYN   99999999999999999999999999985-------32--------35
56  * 1QCF   99999999999999999999999999974-------2---------24
57  * cons   999999999999999999999999999851000110321100001134
58  * 
59  * 
60  * 1PHT   ----------5666642367889999999999889
61  * 1BB9   1111111111676653-355679999999999889
62  * 1UHC   ----------788774--66789999999999889
63  * 1YCS   ----------78777--356789999999999889
64  * 1OOT   ----------78877--356789999999997-67
65  * 1ABO   ----------687774--56779999999999889
66  * 1FYN   ----------6888842356789999999999889
67  * 1QCF   ----------6878742356789999999999889
68  * cons   00100000006877641356789999999999889
69  * </pre>
70  * 
71  * 
72  * @author Paolo Di Tommaso
73  *
74  */
75 public class TCoffeeScoreFile {
76         
77         /** The {@link Header} structure holder */
78         Header header;
79         
80         /** 
81          * Holds the consensus values for each sequence. It uses a LinkedHashMap to maintain the 
82          * insertion order. 
83          */
84         LinkedHashMap<String,StringBuilder> scores = new LinkedHashMap<String,StringBuilder>();
85
86         Integer fWidth;
87
88         /**
89          * Parse the specified file.
90          * 
91          * @param file The file to be parsed 
92          */
93         public static TCoffeeScoreFile load(File file) {
94                 try {
95                         return load(new FileReader(file));
96                 } 
97                 catch (FileNotFoundException e) {
98                         throw new RuntimeException(e);
99                 }
100         }
101         
102         /**
103          * Parse the provided reader for the T-Coffee scores file format
104          * 
105          * @param reader 
106          */
107         public static TCoffeeScoreFile load(Reader reader) {
108
109                 try {
110                         BufferedReader in = (BufferedReader) (reader instanceof BufferedReader ? reader : new BufferedReader(reader));
111                         TCoffeeScoreFile result = new TCoffeeScoreFile();
112                         result.doParsing(in);
113                         return result.header != null && result.scores != null ? result : null;
114                 }
115                 catch( Exception e) {
116                         throw new RuntimeException(e);
117                 }
118         }
119                 
120         /**
121          * @return The 'height' of the score matrix i.e. the numbers of score rows that should matches 
122          * the number of sequences in the alignment
123          */
124         public int getHeight() {
125                 // the last entry will always be the 'global' alingment consensus scores, so it is removed 
126                 // from the 'height' count to make this value compatible with the number of sequences in the MSA
127                 return scores != null && scores.size() > 0 ? scores.size()-1 : 0;
128         }
129         
130         /**
131          * @return The 'width' of the score matrix i.e. the number of columns. 
132          * Since teh score value are supposd to be calculated for an 'aligned' MSA, all the entries 
133          * have to have the same width.  
134          */
135         public int getWidth() {
136                 return fWidth != null ? fWidth : 0;
137         }
138         
139         /**
140          * The default constructor is marked as {@code protected} since this class is meant to created 
141          * through the {@link #load(File)} or {@link #load(Reader)} factory methods
142          */
143         protected TCoffeeScoreFile() { } 
144         
145         /**
146          * Get the string of score values for the specified seqeunce ID. 
147          * @param id The sequence ID 
148          * @return The scores as a string of values e.g. {@code 99999987-------432}. 
149          *      It return an empty string when the specified ID is missing. 
150          */
151         public String getScoresFor( String id ) {
152                 return scores.containsKey(id) ? scores.get(id).toString() : "";
153         }
154         
155         /**
156          * @return The list of score string as a {@link List} object, in the same ordeer of the insertion i.e. in the MSA
157          */
158         public List<String> getScoresList() {
159                 List<String> result = new ArrayList<String>( scores.size() );
160                 for( Map.Entry<String,StringBuilder> it : scores.entrySet() ) {
161                         result.add(it.getValue().toString());
162                 }
163                 
164                 return result;
165         }
166         
167         /**
168          * @return The parsed score values a matrix of bytes
169          */
170         public byte[][] getScoresArray() { 
171                 byte[][] result = new byte[ scores.size() ][];
172                 
173                 int rowCount = 0;
174                 for( Map.Entry<String,StringBuilder> it : scores.entrySet() ) {
175                         String line = it.getValue().toString();
176                         byte[] seqValues = new byte[ line.length() ];
177                         for( int j=0, c=line.length(); j<c; j++ ) {
178                                 
179                                 byte val = (byte)(line.charAt(j) - '0');
180
181                                 seqValues[j] = ( val >= 0 && val <= 9 ) ? val : -1; 
182                         }
183
184                         result[rowCount++] = seqValues;
185                 }
186                 
187                 return result;
188         }
189         
190
191         private void doParsing(BufferedReader in) throws IOException {
192
193                 /*
194                  * read the header
195                  */
196                 header = readHeader(in);
197
198                 if( header == null ) { return; }
199                 
200         
201                 /*
202                  * initilize the structure
203                  */
204                 for( Map.Entry<String,Integer> entry : header.scores.entrySet() ) {
205                         scores.put( entry.getKey(), new StringBuilder());
206                 }
207                 
208                 /*
209                  * go with the reading
210                  */
211                 Block block;
212                 while( (block = readBlock(in, header.scores.size())) != null  ) {
213                         
214                         /*
215                          * append sequences read in the block
216                          */
217                         for( Map.Entry<String,String> entry : block.items.entrySet() ) {
218                                 StringBuilder scoreStringBuilder = scores.get(entry.getKey());
219                                 if( scoreStringBuilder == null ) {
220                                         throw new RuntimeException(String.format("Invalid T-Coffee score file: Sequence ID '%s' is not declared in header section", entry.getKey()));
221                                 }
222                                 
223                                 scoreStringBuilder.append( entry.getValue() );
224                         }
225                 }
226                 
227                 /*
228                  * verify that all rows have the same width
229                  */
230                 for( StringBuilder str : scores.values() ) {
231                         if( fWidth == null ) {
232                                 fWidth = str.length();
233                         }
234                         else if( fWidth != str.length() ) {
235                                 throw new RuntimeException("Invalid T-Coffee score file: All the score sequences must have the same length");
236                         }
237                 }
238                 
239                 
240                 
241         }
242
243
244         static int parseInt( String str ) {
245                 try {
246                         return Integer.parseInt(str);
247                 }
248                 catch( NumberFormatException e ) {
249                         // TODO report a warning ?
250                         return 0;
251                 }               
252         }
253         
254         /**
255          * Reaad the header section in the T-Coffee score file format 
256          * 
257          * @param reader The scores reader 
258          * @return The parser {@link Header} instance 
259          * @throws RuntimeException when the header is not in the expected format
260          */
261         static Header readHeader(BufferedReader reader) {
262                 
263                 Header result = null;
264                 try {
265                         result = new Header();
266                         result.head = reader.readLine();
267                         
268                         String line;
269
270                         while( (line = reader.readLine()) != null ) {
271                                 if( line.startsWith("SCORE=")) {
272                                         result.score = parseInt( line.substring(6).trim() );
273                                         break;
274                                 }
275                         }
276
277                         if( (line=reader.readLine())==null || !"*".equals(line.trim())) return null;
278                         if( (line=reader.readLine())==null || !"BAD AVG GOOD".equals(line.trim())) return null;
279                         if( (line=reader.readLine())==null || !"*".equals(line.trim())) return null;
280                         
281                         /*
282                          * now are expected a list if sequences ID up to the first blank line
283                          */
284                         while( (line=reader.readLine()) != null ) {
285                                 if( "".equals(line) ) {
286                                         break;
287                                 }
288                                 
289                                 int p = line.indexOf(":");
290                                 if( p == -1 ) {
291                                         // TODO report a warning
292                                         continue;
293                                 }
294                                 
295                                 String id = line.substring(0,p).trim();
296                                 int val = parseInt(line.substring(p+1).trim());
297                                 if( "".equals(id) ) {
298                                         // TODO report warning
299                                         continue;
300                                 }
301                                 
302                                 result.scores.put(id,val);
303                         }
304                         
305                 }
306                 catch( IOException e ) {
307                         throw new RuntimeException("Cannot parse T-Coffee score ascii file", e);
308                 }
309                 
310                 return result;
311         } 
312         
313         /**
314          * Read a scores block ihe provided stream. 
315          * 
316          * @param reader The stream to parse
317          * @param size The expected number of the sequence to be read 
318          * @return The {@link Block} instance read or {link null} null if the end of file has reached.
319          * @throws IOException Something went wrong on the 'wire' 
320          */
321         static Block readBlock( BufferedReader reader, int size ) throws IOException {
322                 Block result = new Block(size);
323                 String line;
324                 
325                 /*
326                  * read blank lines (eventually)
327                  */
328                 while( (line=reader.readLine()) != null && "".equals(line.trim())) {
329                         // consume blank lines 
330                 }
331                 
332                 if( line == null ) return null;
333                 
334                 /*
335                  * read the scores block
336                  */
337                 do {
338                         if( "".equals(line.trim()) ) {
339                                 // terminated
340                                 break;
341                         }
342                         
343                         // split the line on the first blank 
344                         // the first part have to contain the sequence id
345                         // theramining part are the scores values
346                         int p = line.indexOf(" ");
347                         if( p == -1 ) {
348                                 //TODO This is an unexpected condition, log a warning or throw an exception ? 
349                                 continue;
350                         } 
351                         
352                         String id = line.substring(0,p).trim();
353                         String val = line.substring(p+1).trim();
354                         
355                         result.items.put(id, val);
356                         
357                 } while( (line = reader.readLine()) != null ); 
358                 
359
360                 return result;
361         }
362
363         /*
364          * The score file header 
365          */
366         static class Header {
367                 String head;
368                 int score;
369
370                 LinkedHashMap<String,Integer> scores = new LinkedHashMap<String,Integer>();
371                 
372                 public int getScoreAvg() { return score; }
373                 
374                 public int getScoreFor( String ID ) { 
375
376                         return scores.containsKey(ID) ? scores.get(ID) : -1;
377         
378                 }
379         }
380         
381         /*
382          * Hold a single block values block in the score file
383          */
384         static class Block {
385                 int size;
386                 Map<String,String> items;
387                 
388                 public Block( int size ) {
389                         this.size = size;
390                         this.items = new HashMap<String,String>(size);
391                 } 
392         
393                 String getScoresFor( String id ) {
394                         return items.get(id);
395                 }
396                 
397                 String getConsensus() {
398                         return items.get("cons");
399                 }
400         }
401         /**
402          * TCOFFEE score colourscheme
403          */
404         static final Color[] colors = {
405                         new Color( 102, 102, 255 ),     // #6666FF
406                         new Color( 0, 255, 0),          // #00FF00
407                         new Color( 102, 255, 0),        // #66FF00
408                         new Color( 204, 255, 0),        // #CCFF00
409                         new Color( 255, 255, 0),        // #FFFF00
410                         new Color( 255, 204, 0),        // #FFCC00
411                         new Color( 255, 153, 0),        // #FF9900
412                         new Color( 255, 102, 0),        // #FF6600
413                         new Color( 255, 51, 0),         // #FF3300
414                         new Color( 255, 34, 0)          // #FF2000
415                 };
416         public final static String TCOFFEE_SCORE="TCoffeeScore";
417         /**
418          * generate annotation for this TCoffee score set on the given alignment
419          * @param al alignment to annotate
420          * @param matchids if true, annotate sequences based on matching sequence names
421          * @return true if alignment annotation was modified, false otherwise.
422          */
423         public boolean annotateAlignment(AlignmentI al, boolean matchids)
424         {
425           boolean added=false;
426           int i=0;
427           SequenceIdMatcher sidmatcher = new SequenceIdMatcher(al.getSequencesArray());
428           byte[][] scoreMatrix=getScoresArray();
429           // for 2.8 - we locate any existing TCoffee annotation and remove it first before adding this.
430           for (Map.Entry<String,StringBuilder> id:scores.entrySet())
431           {
432             byte[] srow=scoreMatrix[i];
433             SequenceI s;
434             if (matchids)
435             {
436               s=sidmatcher.findIdMatch(id.getKey());
437             } else {
438               s=al.getSequenceAt(i);
439             }
440             i++;
441             if (s==null && i!=scores.size() && !id.getKey().equals("cons"))
442             {
443               System.err.println("No "+(matchids ? "match ":" sequences left ")+" for TCoffee score set : "+id.getKey());
444               continue;
445             }
446             int jSize=al.getWidth()< srow.length ? al.getWidth() : srow.length;
447             Annotation[] annotations=new Annotation[al.getWidth()];
448             for (int j=0;j<jSize;j++) {
449               byte val = srow[j];
450               annotations[j]=new Annotation(s==null ? ""+val:null,s==null ? ""+val:null,(char) val,val*1f,val >= 0 && val < colors.length ? colors[val] : Color.white);
451             }
452             AlignmentAnnotation aa=null;
453             if (s!=null)
454             {
455               // TODO - set per sequence score
456               aa=new AlignmentAnnotation(TCOFFEE_SCORE, "Score for "+id.getKey(), annotations);
457               
458               aa.setSequenceRef(s);
459               aa.visible=false;
460               aa.belowAlignment=false;
461             } else {
462               aa=new AlignmentAnnotation("T-COFFEE", "TCoffee column reliability score", annotations);
463               aa.belowAlignment=true;
464               aa.visible=true;
465               
466             }
467             al.addAnnotation(aa);
468             added=true;
469           }
470           return added;
471         }
472           
473
474 }