df009865cd405e68a66060ea5bb4eb88ea0b1afc
[jalview.git] / src / jalview / io / TCoffeeScoreFile.java
1 package jalview.io;
2
3 import java.io.BufferedReader;
4 import java.io.File;
5 import java.io.FileNotFoundException;
6 import java.io.FileReader;
7 import java.io.IOException;
8 import java.io.Reader;
9 import java.util.ArrayList;
10 import java.util.HashMap;
11 import java.util.LinkedHashMap;
12 import java.util.List;
13 import java.util.Map;
14
/**
 * A file parser for the T-Coffee score ascii format. This file contains the alignment consensus
 * for each residue in any sequence.
 * <p>
 * This file is produced by <code>t_coffee</code> providing the option
 * <code>-output=score_ascii </code> to the program command line
 *
 * An example file is the following
 *
 * <pre>
 * T-COFFEE, Version_9.02.r1228 (2012-02-16 18:15:12 - Revision 1228 - Build 336)
 * Cedric Notredame 
 * CPU TIME:0 sec.
 * SCORE=90
 * *
 *  BAD AVG GOOD
 * *
 * 1PHT   :  89
 * 1BB9   :  90
 * 1UHC   :  94
 * 1YCS   :  94
 * 1OOT   :  93
 * 1ABO   :  94
 * 1FYN   :  94
 * 1QCF   :  94
 * cons   :  90
 * 
 * 1PHT   999999999999999999999999998762112222543211112134
 * 1BB9   99999999999999999999999999987-------4322----2234
 * 1UHC   99999999999999999999999999987-------5321----2246
 * 1YCS   99999999999999999999999999986-------4321----1-35
 * 1OOT   999999999999999999999999999861-------3------1135
 * 1ABO   99999999999999999999999999986-------422-------34
 * 1FYN   99999999999999999999999999985-------32--------35
 * 1QCF   99999999999999999999999999974-------2---------24
 * cons   999999999999999999999999999851000110321100001134
 * 
 * 
 * 1PHT   ----------5666642367889999999999889
 * 1BB9   1111111111676653-355679999999999889
 * 1UHC   ----------788774--66789999999999889
 * 1YCS   ----------78777--356789999999999889
 * 1OOT   ----------78877--356789999999997-67
 * 1ABO   ----------687774--56779999999999889
 * 1FYN   ----------6888842356789999999999889
 * 1QCF   ----------6878742356789999999999889
 * cons   00100000006877641356789999999999889
 * </pre>
 *
 *
 * @author Paolo Di Tommaso
 *
 */
public class TCoffeeScoreFile {

    /** The {@link Header} structure holder; {@code null} until a parse has been attempted. */
    Header header;

    /**
     * Holds the consensus values for each sequence. It uses a LinkedHashMap to maintain the
     * insertion order.
     */
    LinkedHashMap<String,StringBuilder> scores = new LinkedHashMap<String,StringBuilder>();


    /**
     * Get the string of score values for the specified sequence ID.
     *
     * @param id The sequence ID
     * @return The scores as a string of values e.g. {@code 99999987-------432}.
     *      It returns an empty string when the specified ID is missing.
     */
    public String getScoresFor( String id ) {
        StringBuilder found = scores.get(id);
        return found != null ? found.toString() : "";
    }

    /**
     * @return The list of score strings as a {@link List} object, in the same order of the
     *      insertion i.e. the order in which the sequence IDs are declared in the file header
     */
    public List<String> getScoresList() {
        List<String> result = new ArrayList<String>( scores.size() );
        for( StringBuilder value : scores.values() ) {
            result.add(value.toString());
        }

        return result;
    }

    /**
     * @return The parsed score values as a matrix of bytes, one row per sequence (in insertion
     *      order). Each cell holds the digit value {@code 0..9}, or {@code -1} for any character
     *      that is not an ASCII digit (e.g. the gap marker {@code '-'}).
     */
    public byte[][] getScoresArray() {
        byte[][] result = new byte[ scores.size() ][];

        int rowCount = 0;
        for( StringBuilder value : scores.values() ) {
            int len = value.length();
            byte[] seqValues = new byte[ len ];
            for( int j=0; j<len; j++ ) {
                char ch = value.charAt(j);
                // Range-check the character itself: narrowing (ch - '0') to a byte first
                // would wrap some non-digit characters into the 0..9 range.
                seqValues[j] = ( ch >= '0' && ch <= '9' ) ? (byte)(ch - '0') : -1;
            }

            result[rowCount++] = seqValues;
        }

        return result;
    }

    /**
     * Parse the specified file. The file is opened and closed by this method.
     *
     * @param file The file to be parsed
     * @throws RuntimeException when the file cannot be found, cannot be read or is not a valid
     *      T-Coffee score file
     */
    public void parse(File file) {
        Reader reader;
        try {
            reader = new FileReader(file);
        }
        catch (FileNotFoundException e) {
            throw new RuntimeException(e);
        }

        try {
            parse(reader);
        }
        finally {
            // this method opened the reader, so it is responsible for releasing it
            try {
                reader.close();
            }
            catch (IOException ignored) {
                // nothing useful can be done here, and a failure on close must not
                // mask an exception raised while parsing
            }
        }
    }

    /**
     * Parse the provided reader for the T-Coffee scores file format. The reader is NOT closed by
     * this method.
     *
     * @param reader The source of the score file content
     * @throws RuntimeException when the content cannot be read or is not a valid T-Coffee score
     *      file
     */
    public void parse(Reader reader) {

        try {
            BufferedReader in = reader instanceof BufferedReader
                    ? (BufferedReader) reader
                    : new BufferedReader(reader);
            doParsing(in);
        }
        catch( IOException e) {
            // only I/O failures need wrapping; format errors are already RuntimeExceptions
            // and propagate unchanged so that their message stays visible to the caller
            throw new RuntimeException(e);
        }
    }

    /**
     * Read the header and then every score block, appending the block rows to {@link #scores}.
     *
     * @param in The buffered source of the score file content
     * @throws IOException when a score block cannot be read
     * @throws RuntimeException when the header is missing or malformed, or when a block refers
     *      to a sequence ID that is not declared in the header
     */
    private void doParsing(BufferedReader in) throws IOException {

        /*
         * read the header
         */
        header = readHeader(in);

        /*
         * initialize the structure, dropping whatever a previous parse left behind
         */
        scores.clear();
        if( header == null ) {
            throw new RuntimeException("Invalid T-Coffee score file. The header section is missing or malformed");
        }

        for( String id : header.scores.keySet() ) {
            scores.put( id, new StringBuilder());
        }

        /*
         * go with the reading
         */
        Block block;
        while( (block = readBlock(in, header.scores.size())) != null  ) {

            /*
             * append sequences read in the block
             */
            for( Map.Entry<String,String> entry : block.items.entrySet() ) {
                StringBuilder scoreStringBuilder = scores.get(entry.getKey());
                if( scoreStringBuilder == null ) {
                    throw new RuntimeException(String.format("Invalid T-Coffee score file. Sequence ID '%s' is not declared in header section", entry.getKey()));
                }

                scoreStringBuilder.append( entry.getValue() );
            }

        }

    }


    /**
     * Lenient integer parsing.
     *
     * @param str The string to convert
     * @return The parsed value, or {@code 0} when the string is not a valid integer
     */
    static int parseInt( String str ) {
        try {
            return Integer.parseInt(str);
        }
        catch( NumberFormatException e ) {
            // TODO report a warning ?
            return 0;
        }
    }

    /**
     * Read the header section in the T-Coffee score file format
     *
     * @param reader The scores reader
     * @return The parsed {@link Header} instance, or {@code null} when the header is not in the
     *      expected format (the <code>SCORE=</code> line has to be followed by the lines
     *      <code>*</code>, <code>BAD AVG GOOD</code> and <code>*</code>)
     * @throws RuntimeException when the underlying reader fails
     */
    static Header readHeader(BufferedReader reader) {

        Header result = null;
        try {
            result = new Header();
            result.head = reader.readLine();

            String line;

            // skip everything up to the overall score line
            while( (line = reader.readLine()) != null ) {
                if( line.startsWith("SCORE=")) {
                    result.score = parseInt( line.substring(6).trim() );
                    break;
                }
            }

            if( (line=reader.readLine())==null || !"*".equals(line.trim())) return null;
            if( (line=reader.readLine())==null || !"BAD AVG GOOD".equals(line.trim())) return null;
            if( (line=reader.readLine())==null || !"*".equals(line.trim())) return null;

            /*
             * now a list of sequence IDs is expected, up to the first blank line
             */
            while( (line=reader.readLine()) != null ) {
                // a whitespace-only line is a blank line too; without the trim it would be
                // skipped and all the following score blocks consumed as header lines
                if( "".equals(line.trim()) ) {
                    break;
                }

                int p = line.indexOf(":");
                if( p == -1 ) {
                    // TODO report a warning
                    continue;
                }

                String id = line.substring(0,p).trim();
                int val = parseInt(line.substring(p+1).trim());
                if( "".equals(id) ) {
                    // TODO report warning
                    continue;
                }

                result.scores.put(id,val);
            }

        }
        catch( IOException e ) {
            throw new RuntimeException("Cannot parse T-Coffee score ascii file", e);
        }

        return result;
    }

    /**
     * Read a scores block from the provided stream.
     *
     * @param reader The stream to parse
     * @param size The expected number of sequences to be read
     * @return The {@link Block} instance read or {@code null} if the end of file has been reached.
     * @throws IOException Something went wrong on the 'wire'
     */
    static Block readBlock( BufferedReader reader, int size ) throws IOException {
        Block result = new Block(size);
        String line;

        /*
         * read blank lines (eventually)
         */
        while( (line=reader.readLine()) != null && "".equals(line.trim())) {
            // consume blank lines
        }

        if( line == null ) return null;

        /*
         * read the scores block
         */
        do {
            String row = line.trim();
            if( "".equals(row) ) {
                // terminated
                break;
            }

            // split the row on the first whitespace character (blank or tab):
            // the first part has to contain the sequence id,
            // the remaining part holds the score values
            int p = -1;
            for( int i=0, c=row.length(); i<c; i++ ) {
                if( Character.isWhitespace(row.charAt(i)) ) {
                    p = i;
                    break;
                }
            }

            if( p == -1 ) {
                //TODO This is an unexpected condition, log a warning or throw an exception ?
                continue;
            }

            String id = row.substring(0,p);
            String val = row.substring(p+1).trim();

            result.items.put(id, val);

        } while( (line = reader.readLine()) != null );


        return result;
    }

    /*
     * The score file header
     */
    static class Header {
        /** The very first line of the file */
        String head;

        /** The value read from the <code>SCORE=</code> line */
        int score;

        /** The score declared in the header for each sequence ID, in declaration order */
        LinkedHashMap<String,Integer> scores = new LinkedHashMap<String,Integer>();

        public int getScoreAvg() { return score; }

        /**
         * @param ID The sequence ID
         * @return The score declared in the header for that ID, or {@code -1} when it is missing
         */
        public int getScoreFor( String ID ) {
            Integer value = scores.get(ID);
            return value != null ? value.intValue() : -1;
        }
    }

    /*
     * Hold the values of a single block in the score file
     */
    static class Block {
        /** The expected number of rows, as given to the constructor */
        int size;

        /** The score string read for each sequence ID */
        Map<String,String> items;

        public Block( int size ) {
            this.size = size;
            this.items = new HashMap<String,String>(size);
        }

        String getScoresFor( String id ) {
            return items.get(id);
        }

        String getConsensus() {
            return items.get("cons");
        }
    }



}