7a3f2d9deaae376cd55d360b1f3eb8a4e7437095
[jalview.git] / src / jalview / io / TCoffeeScoreFile.java
1 package jalview.io;
2
3 import java.io.BufferedReader;
4 import java.io.File;
5 import java.io.FileNotFoundException;
6 import java.io.FileReader;
7 import java.io.IOException;
8 import java.io.Reader;
9 import java.util.ArrayList;
10 import java.util.HashMap;
11 import java.util.LinkedHashMap;
12 import java.util.List;
13 import java.util.Map;
14
/**
 * A file parser for the T-Coffee score ASCII format. This file contains the alignment consensus
 * for each residue in any sequence.
 * <p>
 * This file is produced by <code>t_coffee</code> providing the option
 * <code>-output=score_ascii </code> to the program command line
 *
 * An example file is the following
 *
 * <pre>
 * T-COFFEE, Version_9.02.r1228 (2012-02-16 18:15:12 - Revision 1228 - Build 336)
 * Cedric Notredame
 * CPU TIME:0 sec.
 * SCORE=90
 * *
 *  BAD AVG GOOD
 * *
 * 1PHT   :  89
 * 1BB9   :  90
 * 1UHC   :  94
 * 1YCS   :  94
 * 1OOT   :  93
 * 1ABO   :  94
 * 1FYN   :  94
 * 1QCF   :  94
 * cons   :  90
 *
 * 1PHT   999999999999999999999999998762112222543211112134
 * 1BB9   99999999999999999999999999987-------4322----2234
 * 1UHC   99999999999999999999999999987-------5321----2246
 * 1YCS   99999999999999999999999999986-------4321----1-35
 * 1OOT   999999999999999999999999999861-------3------1135
 * 1ABO   99999999999999999999999999986-------422-------34
 * 1FYN   99999999999999999999999999985-------32--------35
 * 1QCF   99999999999999999999999999974-------2---------24
 * cons   999999999999999999999999999851000110321100001134
 *
 *
 * 1PHT   ----------5666642367889999999999889
 * 1BB9   1111111111676653-355679999999999889
 * 1UHC   ----------788774--66789999999999889
 * 1YCS   ----------78777--356789999999999889
 * 1OOT   ----------78877--356789999999997-67
 * 1ABO   ----------687774--56779999999999889
 * 1FYN   ----------6888842356789999999999889
 * 1QCF   ----------6878742356789999999999889
 * cons   00100000006877641356789999999999889
 * </pre>
 *
 *
 * @author Paolo Di Tommaso
 *
 */
public class TCoffeeScoreFile {

    /** The {@link Header} structure holder; stays {@code null} when the header could not be recognised. */
    Header header;

    /**
     * Holds the score string of each sequence, keyed by sequence ID. A LinkedHashMap is used to
     * maintain the insertion order (the order in which IDs are declared in the header).
     */
    LinkedHashMap<String,StringBuilder> scores = new LinkedHashMap<String,StringBuilder>();


    /**
     * Parse the specified file. The file is opened with the platform default charset and is
     * closed before this method returns.
     *
     * @param file The file to be parsed
     * @return The parsed scores, or {@code null} when the header section is not recognised
     * @throws RuntimeException when the file cannot be found or cannot be parsed
     */
    public static TCoffeeScoreFile load(File file) {
        FileReader reader;
        try {
            reader = new FileReader(file);
        }
        catch (FileNotFoundException e) {
            throw new RuntimeException(e);
        }

        try {
            return load(reader);
        }
        finally {
            // The reader was opened here, so it has to be released here as well.
            try {
                reader.close();
            }
            catch (IOException ignored) {
                // The stream was only read from; a failure on close cannot affect the result.
            }
        }
    }

    /**
     * Parse the provided reader for the T-Coffee scores file format. The reader is not closed
     * by this method.
     *
     * @param reader The source to read; it is wrapped in a {@link BufferedReader} unless it already is one
     * @return The parsed scores, or {@code null} when the header section is not recognised
     * @throws RuntimeException when reading fails or a score block uses an ID missing from the header
     */
    public static TCoffeeScoreFile load(Reader reader) {
        BufferedReader in = reader instanceof BufferedReader
                ? (BufferedReader) reader
                : new BufferedReader(reader);

        TCoffeeScoreFile result = new TCoffeeScoreFile();
        try {
            result.doParsing(in);
        }
        catch (IOException e) {
            // Only checked failures need wrapping; unchecked ones propagate untouched.
            throw new RuntimeException(e);
        }

        return result.header != null ? result : null;
    }

    /**
     * The default constructor is marked as {@code protected} since this class is meant to be created
     * through the {@link #load(File)} or {@link #load(Reader)} factory methods
     */
    protected TCoffeeScoreFile() { }

    /**
     * Get the string of score values for the specified sequence ID.
     * @param id The sequence ID
     * @return The scores as a string of values e.g. {@code 99999987-------432}.
     *      It returns an empty string when the specified ID is missing.
     */
    public String getScoresFor( String id ) {
        StringBuilder value = scores.get(id);
        return value != null ? value.toString() : "";
    }

    /**
     * @return The list of score strings as a {@link List} object, in the same order as they
     *      were declared in the header section
     */
    public List<String> getScoresList() {
        List<String> result = new ArrayList<String>( scores.size() );
        for( StringBuilder value : scores.values() ) {
            result.add(value.toString());
        }

        return result;
    }

    /**
     * @return The parsed score values as a matrix of bytes, one row per sequence in declaration
     *      order. Characters {@code '0'}..{@code '9'} map to 0..9, any other character
     *      (e.g. the gap symbol {@code '-'}) maps to -1.
     */
    public byte[][] getScoresArray() {
        byte[][] result = new byte[ scores.size() ][];

        int rowCount = 0;
        for( StringBuilder value : scores.values() ) {
            int length = value.length();
            byte[] seqValues = new byte[ length ];
            for( int j=0; j<length; j++ ) {
                // Test the character itself: narrowing (ch - '0') to byte first would let
                // characters such as U+0130 wrap around into the 0..9 range.
                char ch = value.charAt(j);
                seqValues[j] = ( ch >= '0' && ch <= '9' ) ? (byte)(ch - '0') : -1;
            }

            result[rowCount++] = seqValues;
        }

        return result;
    }


    /**
     * Read the header and then every score block, appending each block's values to the
     * score string of the matching sequence ID. Leaves {@link #header} {@code null} and
     * {@link #scores} empty when the header is not recognised.
     *
     * @param in The source to read
     * @throws IOException when reading a score block fails
     * @throws RuntimeException when a block contains an ID that the header did not declare
     */
    private void doParsing(BufferedReader in) throws IOException {

        /*
         * read the header
         */
        header = readHeader(in);

        if( header == null ) { return; }

        /*
         * initialize the structure: one empty score string per declared ID
         */
        for( String id : header.scores.keySet() ) {
            scores.put( id, new StringBuilder());
        }

        /*
         * go with the reading
         */
        Block block;
        while( (block = readBlock(in, header.scores.size())) != null  ) {

            /*
             * append sequences read in the block
             */
            for( Map.Entry<String,String> entry : block.items.entrySet() ) {
                StringBuilder scoreStringBuilder = scores.get(entry.getKey());
                if( scoreStringBuilder == null ) {
                    throw new RuntimeException(String.format("Invalid T-Coffee score file. Sequence ID '%s' is not declared in header section", entry.getKey()));
                }

                scoreStringBuilder.append( entry.getValue() );
            }

        }

    }


    /**
     * Lenient integer parsing.
     *
     * @param str The text to convert
     * @return The parsed value, or 0 when {@code str} is not a valid integer
     */
    static int parseInt( String str ) {
        try {
            return Integer.parseInt(str);
        }
        catch( NumberFormatException e ) {
            // TODO report a warning ?
            return 0;
        }
    }

    /**
     * Read the header section in the T-Coffee score file format
     *
     * @param reader The scores reader
     * @return The parsed {@link Header} instance, or {@code null} when the
     *      {@code SCORE=} line or the {@code * / BAD AVG GOOD / *} marker lines are missing
     * @throws RuntimeException when the underlying reader fails
     */
    static Header readHeader(BufferedReader reader) {

        Header result = new Header();
        try {
            result.head = reader.readLine();

            String line;

            // skip everything up to the overall score line
            while( (line = reader.readLine()) != null ) {
                if( line.startsWith("SCORE=")) {
                    result.score = parseInt( line.substring(6).trim() );
                    break;
                }
            }

            // the three marker lines must follow immediately
            if( (line=reader.readLine())==null || !"*".equals(line.trim())) return null;
            if( (line=reader.readLine())==null || !"BAD AVG GOOD".equals(line.trim())) return null;
            if( (line=reader.readLine())==null || !"*".equals(line.trim())) return null;

            /*
             * now a list of sequence IDs is expected, up to the first blank line
             */
            while( (line=reader.readLine()) != null ) {
                // A whitespace-only line counts as blank, as it does when reading score blocks.
                if( "".equals(line.trim()) ) {
                    break;
                }

                int p = line.indexOf(":");
                if( p == -1 ) {
                    // TODO report a warning
                    continue;
                }

                String id = line.substring(0,p).trim();
                if( "".equals(id) ) {
                    // TODO report warning
                    continue;
                }

                result.scores.put(id, parseInt(line.substring(p+1).trim()));
            }

        }
        catch( IOException e ) {
            throw new RuntimeException("Cannot parse T-Coffee score ascii file", e);
        }

        return result;
    }

    /**
     * Read a scores block from the provided stream.
     *
     * @param reader The stream to parse
     * @param size The expected number of sequences to be read; only used to size the block
     * @return The {@link Block} instance read or {@code null} if the end of file has been reached
     *      before any non-blank line
     * @throws IOException Something went wrong on the 'wire'
     */
    static Block readBlock( BufferedReader reader, int size ) throws IOException {
        Block result = new Block(size);
        String line;

        /*
         * read blank lines (eventually)
         */
        while( (line=reader.readLine()) != null && "".equals(line.trim())) {
            // consume blank lines
        }

        if( line == null ) return null;

        /*
         * read the scores block
         */
        do {
            if( "".equals(line.trim()) ) {
                // terminated
                break;
            }

            // Skip leading whitespace (same notion of whitespace as String.trim) so that an
            // indented line does not produce an empty sequence ID.
            int start = 0;
            while( start < line.length() && line.charAt(start) <= ' ' ) {
                start++;
            }

            // split the line on the first blank following the ID:
            // the first part has to contain the sequence id,
            // the remaining part holds the score values
            int p = line.indexOf(' ', start);
            if( p == -1 ) {
                //TODO This is an unexpected condition, log a warning or throw an exception ?
                continue;
            }

            String id = line.substring(start,p).trim();
            String val = line.substring(p+1).trim();

            result.items.put(id, val);

        } while( (line = reader.readLine()) != null );


        return result;
    }

    /*
     * The score file header
     */
    static class Header {
        /** The first line of the file */
        String head;
        /** The value of the SCORE= line */
        int score;

        /** The per-sequence scores, in declaration order */
        LinkedHashMap<String,Integer> scores = new LinkedHashMap<String,Integer>();

        public int getScoreAvg() { return score; }

        /**
         * @param ID The sequence ID
         * @return The score declared for the ID in the header, or -1 when the ID is unknown
         */
        public int getScoreFor( String ID ) {
            Integer value = scores.get(ID);
            return value != null ? value : -1;
        }
    }

    /*
     * Holds the values of a single block in the score file
     */
    static class Block {
        int size;
        Map<String,String> items;

        public Block( int size ) {
            this.size = size;
            this.items = new HashMap<String,String>(size);
        }

        String getScoresFor( String id ) {
            return items.get(id);
        }

        String getConsensus() {
            return items.get("cons");
        }
    }

}