JAL-1065 JAL-1066 - refactored tcoffee score-> jalview vis object code to a create...
[jalview.git] / src / jalview / io / TCoffeeScoreFile.java
1 package jalview.io;
2
3 import jalview.analysis.SequenceIdMatcher;
4 import jalview.datamodel.AlignmentAnnotation;
5 import jalview.datamodel.AlignmentI;
6 import jalview.datamodel.Annotation;
7 import jalview.datamodel.SequenceI;
8
9 import java.awt.Color;
10 import java.io.BufferedReader;
11 import java.io.File;
12 import java.io.FileNotFoundException;
13 import java.io.FileReader;
14 import java.io.IOException;
15 import java.io.Reader;
16 import java.util.ArrayList;
17 import java.util.HashMap;
18 import java.util.LinkedHashMap;
19 import java.util.List;
20 import java.util.Map;
21
22 /**
23  * A file parse for T-Coffee score ascii format. This file contains the alignment consensus 
24  * for each resude in any sequence.
25  * <p>
26  * This file is procuded by <code>t_coffee</code> providing the option 
27  * <code>-output=score_ascii </code> to the program command line
28  * 
29  * An example file is the following 
30  * 
31  * <pre>
32  * T-COFFEE, Version_9.02.r1228 (2012-02-16 18:15:12 - Revision 1228 - Build 336)
33  * Cedric Notredame 
34  * CPU TIME:0 sec.
35  * SCORE=90
36  * *
37  *  BAD AVG GOOD
38  * *
39  * 1PHT   :  89
40  * 1BB9   :  90
41  * 1UHC   :  94
42  * 1YCS   :  94
43  * 1OOT   :  93
44  * 1ABO   :  94
45  * 1FYN   :  94
46  * 1QCF   :  94
47  * cons   :  90
48  * 
49  * 1PHT   999999999999999999999999998762112222543211112134
50  * 1BB9   99999999999999999999999999987-------4322----2234
51  * 1UHC   99999999999999999999999999987-------5321----2246
52  * 1YCS   99999999999999999999999999986-------4321----1-35
53  * 1OOT   999999999999999999999999999861-------3------1135
54  * 1ABO   99999999999999999999999999986-------422-------34
55  * 1FYN   99999999999999999999999999985-------32--------35
56  * 1QCF   99999999999999999999999999974-------2---------24
57  * cons   999999999999999999999999999851000110321100001134
58  * 
59  * 
60  * 1PHT   ----------5666642367889999999999889
61  * 1BB9   1111111111676653-355679999999999889
62  * 1UHC   ----------788774--66789999999999889
63  * 1YCS   ----------78777--356789999999999889
64  * 1OOT   ----------78877--356789999999997-67
65  * 1ABO   ----------687774--56779999999999889
66  * 1FYN   ----------6888842356789999999999889
67  * 1QCF   ----------6878742356789999999999889
68  * cons   00100000006877641356789999999999889
69  * </pre>
70  * 
71  * 
72  * @author Paolo Di Tommaso
73  *
74  */
75 public class TCoffeeScoreFile {
76         
77         /** The {@link Header} structure holder */
78         Header header;
79         
80         /** 
81          * Holds the consensues values for each sequences. It uses a LinkedHashMap to maintaint the 
82          * insertion order. 
83          */
84         LinkedHashMap<String,StringBuilder> scores = new LinkedHashMap<String,StringBuilder>();
85         
86
87         /**
88          * Parse the specified file.
89          * 
90          * @param file The file to be parsed 
91          */
92         public static TCoffeeScoreFile load(File file) {
93                 try {
94                         return load(new FileReader(file));
95                 } 
96                 catch (FileNotFoundException e) {
97                         throw new RuntimeException(e);
98                 }
99         }
100         
101         /**
102          * Parse the provided reader for the T-Coffee scores file format
103          * 
104          * @param reader 
105          */
106         public static TCoffeeScoreFile load(Reader reader) {
107
108                 try {
109                         BufferedReader in = (BufferedReader) (reader instanceof BufferedReader ? reader : new BufferedReader(reader));
110                         TCoffeeScoreFile result = new TCoffeeScoreFile();
111                         result.doParsing(in);
112                         return result.header != null && result.scores != null ? result : null;
113                 }
114                 catch( Exception e) {
115                         throw new RuntimeException(e);
116                 }
117         }
118                 
119         /**
120          * The default constructor is marked as {@code protected} since this class is meant to created 
121          * through the {@link #load(File)} or {@link #load(Reader)} factory methods
122          */
123         protected TCoffeeScoreFile() { } 
124         
125         /**
126          * Get the string of score values for the specified seqeunce ID. 
127          * @param id The sequence ID 
128          * @return The scores as a string of values e.g. {@code 99999987-------432}. 
129          *      It return an empty string when the specified ID is missing. 
130          */
131         public String getScoresFor( String id ) {
132                 return scores.containsKey(id) ? scores.get(id).toString() : "";
133         }
134         
135         /**
136          * @return The list of score string as a {@link List} object, in the same ordeer of the insertion i.e. in the MSA
137          */
138         public List<String> getScoresList() {
139                 List<String> result = new ArrayList<String>( scores.size() );
140                 for( Map.Entry<String,StringBuilder> it : scores.entrySet() ) {
141                         result.add(it.getValue().toString());
142                 }
143                 
144                 return result;
145         }
146         
147         /**
148          * @return The parsed score values a matrix of bytes
149          */
150         public byte[][] getScoresArray() { 
151                 byte[][] result = new byte[ scores.size() ][];
152                 
153                 int rowCount = 0;
154                 for( Map.Entry<String,StringBuilder> it : scores.entrySet() ) {
155                         String line = it.getValue().toString();
156                         byte[] seqValues = new byte[ line.length() ];
157                         for( int j=0, c=line.length(); j<c; j++ ) {
158                                 
159                                 byte val = (byte)(line.charAt(j) - '0');
160
161                                 seqValues[j] = ( val >= 0 && val <= 9 ) ? val : -1; 
162                         }
163
164                         result[rowCount++] = seqValues;
165                 }
166                 
167                 return result;
168         }
169         
170
171         private void doParsing(BufferedReader in) throws IOException {
172
173                 /*
174                  * read the header
175                  */
176                 header = readHeader(in);
177
178                 if( header == null ) { return; }
179                 
180         
181                 /*
182                  * initilize the structure
183                  */
184                 for( Map.Entry<String,Integer> entry : header.scores.entrySet() ) {
185                         scores.put( entry.getKey(), new StringBuilder());
186                 }
187                 
188                 /*
189                  * go with the reading
190                  */
191                 Block block;
192                 while( (block = readBlock(in, header.scores.size())) != null  ) {
193                         
194                         /*
195                          * append sequences read in the block
196                          */
197                         for( Map.Entry<String,String> entry : block.items.entrySet() ) {
198                                 StringBuilder scoreStringBuilder = scores.get(entry.getKey());
199                                 if( scoreStringBuilder == null ) {
200                                         throw new RuntimeException(String.format("Invalid T-Coffee score file. Sequence ID '%s' is not declared in header section", entry.getKey()));
201                                 }
202                                 
203                                 scoreStringBuilder.append( entry.getValue() );
204                         }
205                         
206                 }
207                 
208         }
209
210
211         static int parseInt( String str ) {
212                 try {
213                         return Integer.parseInt(str);
214                 }
215                 catch( NumberFormatException e ) {
216                         // TODO report a warning ?
217                         return 0;
218                 }               
219         }
220         
221         /**
222          * Reaad the header section in the T-Coffee score file format 
223          * 
224          * @param reader The scores reader 
225          * @return The parser {@link Header} instance 
226          * @throws RuntimeException when the header is not in the expected format
227          */
228         static Header readHeader(BufferedReader reader) {
229                 
230                 Header result = null;
231                 try {
232                         result = new Header();
233                         result.head = reader.readLine();
234                         
235                         String line;
236
237                         while( (line = reader.readLine()) != null ) {
238                                 if( line.startsWith("SCORE=")) {
239                                         result.score = parseInt( line.substring(6).trim() );
240                                         break;
241                                 }
242                         }
243
244                         if( (line=reader.readLine())==null || !"*".equals(line.trim())) return null;
245                         if( (line=reader.readLine())==null || !"BAD AVG GOOD".equals(line.trim())) return null;
246                         if( (line=reader.readLine())==null || !"*".equals(line.trim())) return null;
247                         
248                         /*
249                          * now are expected a list if sequences ID up to the first blank line
250                          */
251                         while( (line=reader.readLine()) != null ) {
252                                 if( "".equals(line) ) {
253                                         break;
254                                 }
255                                 
256                                 int p = line.indexOf(":");
257                                 if( p == -1 ) {
258                                         // TODO report a warning
259                                         continue;
260                                 }
261                                 
262                                 String id = line.substring(0,p).trim();
263                                 int val = parseInt(line.substring(p+1).trim());
264                                 if( "".equals(id) ) {
265                                         // TODO report warning
266                                         continue;
267                                 }
268                                 
269                                 result.scores.put(id,val);
270                         }
271                         
272                 }
273                 catch( IOException e ) {
274                         throw new RuntimeException("Cannot parse T-Coffee score ascii file", e);
275                 }
276                 
277                 return result;
278         } 
279         
280         /**
281          * Read a scores block ihe provided stream. 
282          * 
283          * @param reader The stream to parse
284          * @param size The expected number of the sequence to be read 
285          * @return The {@link Block} instance read or {link null} null if the end of file has reached.
286          * @throws IOException Something went wrong on the 'wire' 
287          */
288         static Block readBlock( BufferedReader reader, int size ) throws IOException {
289                 Block result = new Block(size);
290                 String line;
291                 
292                 /*
293                  * read blank lines (eventually)
294                  */
295                 while( (line=reader.readLine()) != null && "".equals(line.trim())) {
296                         // consume blank lines 
297                 }
298                 
299                 if( line == null ) return null;
300                 
301                 /*
302                  * read the scores block
303                  */
304                 do {
305                         if( "".equals(line.trim()) ) {
306                                 // terminated
307                                 break;
308                         }
309                         
310                         // split the line on the first blank 
311                         // the first part have to contain the sequence id
312                         // theramining part are the scores values
313                         int p = line.indexOf(" ");
314                         if( p == -1 ) {
315                                 //TODO This is an unexpected condition, log a warning or throw an exception ? 
316                                 continue;
317                         } 
318                         
319                         String id = line.substring(0,p).trim();
320                         String val = line.substring(p+1).trim();
321                         
322                         result.items.put(id, val);
323                         
324                 } while( (line = reader.readLine()) != null ); 
325                 
326
327                 return result;
328         }
329
330         /*
331          * The score file header 
332          */
333         static class Header {
334                 String head;
335                 int score;
336
337                 LinkedHashMap<String,Integer> scores = new LinkedHashMap<String,Integer>();
338                 
339                 public int getScoreAvg() { return score; }
340                 
341                 public int getScoreFor( String ID ) { 
342
343                         return scores.containsKey(ID) ? scores.get(ID) : -1;
344         
345                 }
346         }
347         
348         /*
349          * Hold a single block values block in the score file
350          */
351         static class Block {
352                 int size;
353                 Map<String,String> items;
354                 
355                 public Block( int size ) {
356                         this.size = size;
357                         this.items = new HashMap<String,String>(size);
358                 } 
359         
360                 String getScoresFor( String id ) {
361                         return items.get(id);
362                 }
363                 
364                 String getConsensus() {
365                         return items.get("cons");
366                 }
367         }
368         /**
369          * TCOFFEE score colourscheme
370          */
371         static final Color[] colors = {
372                         new Color( 102, 102, 255 ),     // #6666FF
373                         new Color( 0, 255, 0),          // #00FF00
374                         new Color( 102, 255, 0),        // #66FF00
375                         new Color( 204, 255, 0),        // #CCFF00
376                         new Color( 255, 255, 0),        // #FFFF00
377                         new Color( 255, 204, 0),        // #FFCC00
378                         new Color( 255, 153, 0),        // #FF9900
379                         new Color( 255, 102, 0),        // #FF6600
380                         new Color( 255, 51, 0),         // #FF3300
381                         new Color( 255, 34, 0)          // #FF2000
382                 };
383         public final static String TCOFFEE_SCORE="TCoffeeScore";
384         /**
385          * generate annotation for this TCoffee score set on the given alignment
386          * @param al alignment to annotate
387          * @param matchids if true, annotate sequences based on matching sequence names
388          * @return true if alignment annotation was modified, false otherwise.
389          */
390         public boolean annotateAlignment(AlignmentI al, boolean matchids)
391         {
392           boolean added=false;
393           int i=0;
394           SequenceIdMatcher sidmatcher = new SequenceIdMatcher(al.getSequencesArray());
395           byte[][] scoreMatrix=getScoresArray();
396           // for 2.8 - we locate any existing TCoffee annotation and remove it first before adding this.
397           for (Map.Entry<String,StringBuilder> id:scores.entrySet())
398           {
399             byte[] srow=scoreMatrix[i];
400             SequenceI s;
401             if (matchids)
402             {
403               s=sidmatcher.findIdMatch(id.getKey());
404             } else {
405               s=al.getSequenceAt(i);
406             }
407             i++;
408             if (s==null && i!=scores.size() && !id.getKey().equals("cons"))
409             {
410               System.err.println("No "+(matchids ? "match ":" sequences left ")+" for TCoffee score set : "+id.getKey());
411               continue;
412             }
413             int jSize=al.getWidth()< srow.length ? al.getWidth() : srow.length;
414             Annotation[] annotations=new Annotation[al.getWidth()];
415             for (int j=0;j<jSize;j++) {
416               byte val = srow[j];
417               annotations[j]=new Annotation(s==null ? ""+val:null,s==null ? ""+val:null,(char) val,val*1f,val >= 0 && val < colors.length ? colors[val] : Color.white);
418             }
419             AlignmentAnnotation aa=null;
420             if (s!=null)
421             {
422               // TODO - set per sequence score
423               aa=new AlignmentAnnotation(TCOFFEE_SCORE, "Score for "+id.getKey(), annotations);
424               
425               aa.setSequenceRef(s);
426               aa.visible=false;
427               aa.belowAlignment=false;
428             } else {
429               aa=new AlignmentAnnotation("T-COFFEE", "TCoffee column reliability score", annotations);
430               aa.belowAlignment=true;
431               aa.visible=true;
432               
433             }
434             al.addAnnotation(aa);
435             added=true;
436           }
437           return added;
438         }
439           
440
441 }