3 import java.io.BufferedReader;
5 import java.io.FileNotFoundException;
6 import java.io.FileReader;
7 import java.io.IOException;
9 import java.util.ArrayList;
10 import java.util.HashMap;
11 import java.util.LinkedHashMap;
12 import java.util.List;
16 * A file parser for the T-Coffee score ascii format. This file contains the alignment consensus
17 * for each residue in any sequence.
19 * This file is produced by <code>t_coffee</code> providing the option
20 * <code>-output=score_ascii </code> to the program command line
22 * An example file is the following
25 * T-COFFEE, Version_9.02.r1228 (2012-02-16 18:15:12 - Revision 1228 - Build 336)
42 * 1PHT 999999999999999999999999998762112222543211112134
43 * 1BB9 99999999999999999999999999987-------4322----2234
44 * 1UHC 99999999999999999999999999987-------5321----2246
45 * 1YCS 99999999999999999999999999986-------4321----1-35
46 * 1OOT 999999999999999999999999999861-------3------1135
47 * 1ABO 99999999999999999999999999986-------422-------34
48 * 1FYN 99999999999999999999999999985-------32--------35
49 * 1QCF 99999999999999999999999999974-------2---------24
50 * cons 999999999999999999999999999851000110321100001134
53 * 1PHT ----------5666642367889999999999889
54 * 1BB9 1111111111676653-355679999999999889
55 * 1UHC ----------788774--66789999999999889
56 * 1YCS ----------78777--356789999999999889
57 * 1OOT ----------78877--356789999999997-67
58 * 1ABO ----------687774--56779999999999889
59 * 1FYN ----------6888842356789999999999889
60 * 1QCF ----------6878742356789999999999889
61 * cons 00100000006877641356789999999999889
65 * @author Paolo Di Tommaso
public class TCoffeeScoreFile {

    /** The {@link Header} structure holder; stays {@code null} when no valid header was parsed. */
    Header header;

    /**
     * Holds the consensus values for each sequence. It uses a LinkedHashMap to maintain the
     * insertion order, i.e. the order in which the sequence IDs are declared in the header.
     */
    LinkedHashMap<String,StringBuilder> scores = new LinkedHashMap<String,StringBuilder>();

    /** The common length of all the score rows; {@code null} when no row has been parsed. */
    Integer fWidth;

    /**
     * Parse the specified file. The file is opened by this method and is always closed
     * before returning, whether the parsing succeeds or not.
     *
     * @param file The file to be parsed
     * @return The parsed scores, or {@code null} when the file has no valid header section
     * @throws RuntimeException when the file cannot be found or its content is invalid
     */
    public static TCoffeeScoreFile load(File file) {
        FileReader reader;
        try {
            reader = new FileReader(file);
        }
        catch (FileNotFoundException e) {
            throw new RuntimeException(e);
        }

        try {
            return load(reader);
        }
        finally {
            // the reader is owned by this method, so it must not leak the file handle
            try {
                reader.close();
            }
            catch (IOException ignored) {
                // nothing useful can be done if closing fails; the parsing outcome is what matters
            }
        }
    }

    /**
     * Parse the provided reader for the T-Coffee scores file format. The reader is NOT closed
     * by this method; that is left to the caller.
     *
     * @param reader The source of the score file content
     * @return The parsed scores, or {@code null} when the content has no valid header section
     * @throws RuntimeException wrapping any exception raised while reading or validating the content
     */
    public static TCoffeeScoreFile load(Reader reader) {
        try {
            BufferedReader in = reader instanceof BufferedReader
                    ? (BufferedReader) reader
                    : new BufferedReader(reader);
            TCoffeeScoreFile result = new TCoffeeScoreFile();
            result.doParsing(in);
            return result.header != null && result.scores != null ? result : null;
        }
        catch (Exception e) {
            // every failure (I/O as well as format errors) is reported as the cause of a RuntimeException
            throw new RuntimeException(e);
        }
    }

    /**
     * @return The 'height' of the score matrix i.e. the number of score rows, which should match
     * the number of sequences in the alignment
     */
    public int getHeight() {
        // the last entry will always be the 'global' alignment consensus scores, so it is removed
        // from the 'height' count to make this value compatible with the number of sequences in the MSA
        return scores != null && scores.size() > 0 ? scores.size() - 1 : 0;
    }

    /**
     * @return The 'width' of the score matrix i.e. the number of columns.
     * Since the score values are supposed to be calculated for an 'aligned' MSA, all the entries
     * have to have the same width. It is {@code 0} when no row has been parsed.
     */
    public int getWidth() {
        return fWidth != null ? fWidth.intValue() : 0;
    }

    /**
     * The default constructor is marked as {@code protected} since this class is meant to be created
     * through the {@link #load(File)} or {@link #load(Reader)} factory methods
     */
    protected TCoffeeScoreFile() { }

    /**
     * Get the string of score values for the specified sequence ID.
     *
     * @param id The sequence ID
     * @return The scores as a string of values e.g. {@code 99999987-------432}.
     * It returns an empty string when the specified ID is missing.
     */
    public String getScoresFor( String id ) {
        StringBuilder row = scores.get(id);
        return row != null ? row.toString() : "";
    }

    /**
     * @return The list of score strings as a {@link List} object, in the same order of the
     * insertion i.e. in the MSA
     */
    public List<String> getScoresList() {
        List<String> result = new ArrayList<String>( scores.size() );
        for( StringBuilder row : scores.values() ) {
            result.add(row.toString());
        }
        return result;
    }

    /**
     * @return The parsed score values as a matrix of bytes, one row per entry of {@link #scores}
     * (in insertion order). Each digit {@code '0'..'9'} is mapped to its numeric value, any other
     * character (e.g. the gap symbol {@code '-'}) is mapped to {@code -1}.
     */
    public byte[][] getScoresArray() {
        byte[][] result = new byte[ scores.size() ][];

        int rowCount = 0;
        for( StringBuilder row : scores.values() ) {
            int len = row.length();
            byte[] seqValues = new byte[len];
            for( int j = 0; j < len; j++ ) {
                // check the range on the char itself: narrowing (ch - '0') to a byte first
                // would make some non-digit characters wrap around into the 0..9 range
                char ch = row.charAt(j);
                seqValues[j] = ( ch >= '0' && ch <= '9' ) ? (byte) (ch - '0') : (byte) -1;
            }

            result[rowCount++] = seqValues;
        }

        return result;
    }

    /**
     * Read the header and then all the score blocks, appending the values of each block to the
     * row of the matching sequence ID. Nothing is read after the header when it is not valid.
     *
     * @param in The content to parse
     * @throws IOException when reading a score block fails
     * @throws RuntimeException when a block uses an ID that is not declared in the header, or
     * when the resulting rows do not all have the same length
     */
    private void doParsing(BufferedReader in) throws IOException {

        /*
         * read the header
         */
        header = readHeader(in);

        if( header == null ) { return; }

        /*
         * initialize the structure: one (empty) row for each ID declared in the header
         */
        for( String id : header.scores.keySet() ) {
            scores.put( id, new StringBuilder() );
        }

        /*
         * go with the reading
         */
        Block block;
        while( (block = readBlock(in, header.scores.size())) != null ) {

            /*
             * append sequences read in the block
             */
            for( Map.Entry<String,String> entry : block.items.entrySet() ) {
                StringBuilder scoreStringBuilder = scores.get(entry.getKey());
                if( scoreStringBuilder == null ) {
                    throw new RuntimeException(String.format("Invalid T-Coffee score file: Sequence ID '%s' is not declared in header section", entry.getKey()));
                }

                scoreStringBuilder.append( entry.getValue() );
            }
        }

        /*
         * verify that all rows have the same width
         */
        for( StringBuilder str : scores.values() ) {
            if( fWidth == null ) {
                fWidth = Integer.valueOf(str.length());
            }
            else if( fWidth.intValue() != str.length() ) {
                throw new RuntimeException("Invalid T-Coffee score file: All the score sequences must have the same length");
            }
        }
    }

    /**
     * Lenient integer parsing.
     *
     * @param str The text to convert
     * @return The parsed value, or {@code 0} when the text is not a valid integer
     */
    static int parseInt( String str ) {
        try {
            return Integer.parseInt(str);
        }
        catch( NumberFormatException e ) {
            // TODO report a warning ?
            return 0;
        }
    }

    /**
     * Read the header section in the T-Coffee score file format. The section is made of a
     * first title line, a {@code SCORE=} line, the three lines {@code *}, {@code BAD AVG GOOD},
     * {@code *} and a list of {@code ID : value} lines terminated by a blank line (or the end
     * of the content).
     *
     * @param reader The scores reader
     * @return The parsed {@link Header} instance, or {@code null} when the lines following the
     * {@code SCORE=} entry are not the expected ones
     * @throws RuntimeException when the underlying reader fails
     */
    static Header readHeader(BufferedReader reader) {

        Header result = null;
        try {
            result = new Header();
            result.head = reader.readLine();

            String line;

            // skip everything up to the global score entry
            while( (line = reader.readLine()) != null ) {
                if( line.startsWith("SCORE=")) {
                    result.score = parseInt( line.substring(6).trim() );
                    break;
                }
            }

            if( (line=reader.readLine())==null || !"*".equals(line.trim())) return null;
            if( (line=reader.readLine())==null || !"BAD AVG GOOD".equals(line.trim())) return null;
            if( (line=reader.readLine())==null || !"*".equals(line.trim())) return null;

            /*
             * now a list of sequence IDs is expected, up to the first blank line
             */
            while( (line=reader.readLine()) != null ) {
                // a whitespace-only line is a blank line too (same rule used by readBlock);
                // otherwise it would be skipped and the loop would consume all the score blocks
                if( "".equals(line.trim()) ) {
                    break;
                }

                int p = line.indexOf(":");
                if( p == -1 ) {
                    // TODO report a warning
                    continue;
                }

                String id = line.substring(0,p).trim();
                int val = parseInt(line.substring(p+1).trim());
                if( "".equals(id) ) {
                    // TODO report warning
                    continue;
                }

                result.scores.put(id,val);
            }

        }
        catch( IOException e ) {
            throw new RuntimeException("Cannot parse T-Coffee score ascii file", e);
        }

        return result;
    }

    /**
     * Read a scores block from the provided stream. A block is a run of consecutive non-blank
     * lines, each one made of a sequence ID followed by its score values.
     *
     * @param reader The stream to parse
     * @param size The expected number of sequences to be read
     * @return The {@link Block} instance read or {@code null} if the end of file has been reached.
     * @throws IOException Something went wrong on the 'wire'
     */
    static Block readBlock( BufferedReader reader, int size ) throws IOException {
        Block result = new Block(size);
        String line;

        /*
         * read blank lines (eventually)
         */
        while( (line=reader.readLine()) != null && "".equals(line.trim())) {
            // consume blank lines
        }

        if( line == null ) return null;

        /*
         * read the scores block
         */
        do {
            // leading blanks are dropped so that an indented row does not yield an empty ID
            String row = line.trim();
            if( row.length() == 0 ) {
                // terminated
                break;
            }

            // split the line on the first blank:
            // the first part has to contain the sequence id,
            // the remaining part holds the score values
            int p = row.indexOf(' ');
            if( p != -1 ) {
                String id = row.substring(0,p).trim();
                String val = row.substring(p+1).trim();

                result.items.put(id, val);
            }
            //TODO a row without any blank is an unexpected condition (currently skipped), log a warning or throw an exception ?

        } while( (line = reader.readLine()) != null );

        return result;
    }

    /**
     * The score file header
     */
    static class Header {
        /** The first line of the file */
        String head;

        /** The value of the {@code SCORE=} entry */
        int score;

        /** The score declared for each sequence ID, in declaration order */
        LinkedHashMap<String,Integer> scores = new LinkedHashMap<String,Integer>();

        /** @return The value of the {@code SCORE=} entry */
        public int getScoreAvg() { return score; }

        /**
         * @param ID The sequence ID
         * @return The score declared in the header for the specified ID, or {@code -1} when
         * the ID is missing
         */
        public int getScoreFor( String ID ) {
            Integer value = scores.get(ID);
            return value != null ? value.intValue() : -1;
        }
    }

    /**
     * Holds the values of a single block in the score file
     */
    static class Block {
        /** The expected number of rows in the block, as provided to the constructor */
        int size;

        /** The score values read for each sequence ID */
        Map<String,String> items;

        /**
         * @param size The expected number of rows, used as initial capacity of the items map
         */
        public Block( int size ) {
            this.size = size;
            this.items = new HashMap<String,String>(size);
        }

        /**
         * @param id The sequence ID
         * @return The score values of the specified ID in this block, {@code null} if missing
         */
        String getScoresFor( String id ) {
            return items.get(id);
        }

        /**
         * @return The values of the {@code cons} row in this block, {@code null} if missing
         */
        String getConsensus() {
            return items.get("cons");
        }
    }
}