3 import jalview.analysis.SequenceIdMatcher;
4 import jalview.datamodel.AlignmentAnnotation;
5 import jalview.datamodel.AlignmentI;
6 import jalview.datamodel.Annotation;
7 import jalview.datamodel.SequenceI;
10 import java.io.BufferedReader;
12 import java.io.FileNotFoundException;
13 import java.io.FileReader;
14 import java.io.IOException;
15 import java.io.Reader;
16 import java.util.ArrayList;
17 import java.util.HashMap;
18 import java.util.LinkedHashMap;
19 import java.util.List;
23 * A file parser for the T-Coffee score ASCII format. This file contains the alignment consensus
24 * for each residue in any sequence.
26 * This file is produced by <code>t_coffee</code> providing the option
27 * <code>-output=score_ascii </code> to the program command line
29 * An example file is the following
32 * T-COFFEE, Version_9.02.r1228 (2012-02-16 18:15:12 - Revision 1228 - Build 336)
49 * 1PHT 999999999999999999999999998762112222543211112134
50 * 1BB9 99999999999999999999999999987-------4322----2234
51 * 1UHC 99999999999999999999999999987-------5321----2246
52 * 1YCS 99999999999999999999999999986-------4321----1-35
53 * 1OOT 999999999999999999999999999861-------3------1135
54 * 1ABO 99999999999999999999999999986-------422-------34
55 * 1FYN 99999999999999999999999999985-------32--------35
56 * 1QCF 99999999999999999999999999974-------2---------24
57 * cons 999999999999999999999999999851000110321100001134
60 * 1PHT ----------5666642367889999999999889
61 * 1BB9 1111111111676653-355679999999999889
62 * 1UHC ----------788774--66789999999999889
63 * 1YCS ----------78777--356789999999999889
64 * 1OOT ----------78877--356789999999997-67
65 * 1ABO ----------687774--56779999999999889
66 * 1FYN ----------6888842356789999999999889
67 * 1QCF ----------6878742356789999999999889
68 * cons 00100000006877641356789999999999889
72 * @author Paolo Di Tommaso
75 public class TCoffeeScoreFile extends AlignFile {
77 public TCoffeeScoreFile(String inFile, String type) throws IOException
83 public TCoffeeScoreFile(FileParse source) throws IOException
88 /** The {@link Header} structure holder */
92 * Holds the consensus values for each sequence. It uses a LinkedHashMap to maintain the
95 LinkedHashMap<String,StringBuilder> scores;
100 * Parse the provided reader for the T-Coffee scores file format
// Builds a TCoffeeScoreFile from an arbitrary Reader and returns it only when parsing
// produced both a header and a score table.
// NOTE(review): neither a no-argument constructor nor doParsing(...) is visible in this
// part of the file - confirm that this method is still live code.
103 public static TCoffeeScoreFile load(Reader reader) {
// Reuse the reader as-is when it is already a BufferedReader, otherwise wrap it.
106 BufferedReader in = (BufferedReader) (reader instanceof BufferedReader ? reader : new BufferedReader(reader));
107 TCoffeeScoreFile result = new TCoffeeScoreFile();
108 result.doParsing(in);
// null is returned when either the header or the scores were not populated.
109 return result.header != null && result.scores != null ? result : null;
// Any exception is rethrown unchecked, keeping the original exception as the cause.
111 catch( Exception e) {
112 throw new RuntimeException(e);
118 * @return The 'height' of the score matrix i.e. the numbers of score rows that should matches
119 * the number of sequences in the alignment
121 public int getHeight() {
122 // the last entry will always be the 'global' alingment consensus scores, so it is removed
123 // from the 'height' count to make this value compatible with the number of sequences in the MSA
124 return scores != null && scores.size() > 0 ? scores.size()-1 : 0;
128 * @return The 'width' of the score matrix i.e. the number of columns.
129 * Since teh score value are supposd to be calculated for an 'aligned' MSA, all the entries
130 * have to have the same width.
132 public int getWidth() {
133 return fWidth != null ? fWidth : 0;
138 * Get the string of score values for the specified seqeunce ID.
139 * @param id The sequence ID
140 * @return The scores as a string of values e.g. {@code 99999987-------432}.
141 * It return an empty string when the specified ID is missing.
143 public String getScoresFor( String id ) {
144 return scores!=null && scores.containsKey(id) ? scores.get(id).toString() : "";
148 * @return The list of score string as a {@link List} object, in the same ordeer of the insertion i.e. in the MSA
150 public List<String> getScoresList() {
155 List<String> result = new ArrayList<String>( scores.size() );
156 for( Map.Entry<String,StringBuilder> it : scores.entrySet() ) {
157 result.add(it.getValue().toString());
164 * @return The parsed score values a matrix of bytes
166 public byte[][] getScoresArray() {
171 byte[][] result = new byte[ scores.size() ][];
174 for( Map.Entry<String,StringBuilder> it : scores.entrySet() ) {
175 String line = it.getValue().toString();
176 byte[] seqValues = new byte[ line.length() ];
177 for( int j=0, c=line.length(); j<c; j++ ) {
179 byte val = (byte)(line.charAt(j) - '0');
181 seqValues[j] = ( val >= 0 && val <= 9 ) ? val : -1;
184 result[rowCount++] = seqValues;
191 public void parse() throws IOException
196 header = readHeader(this);
198 if( header == null ) { error=true; return;}
199 scores = new LinkedHashMap<String,StringBuilder>();
202 * initilize the structure
204 for( Map.Entry<String,Integer> entry : header.scores.entrySet() ) {
205 scores.put( entry.getKey(), new StringBuilder());
209 * go with the reading
212 while( (block = readBlock(this,header.scores.size())) != null ) {
215 * append sequences read in the block
217 for( Map.Entry<String,String> entry : block.items.entrySet() ) {
218 StringBuilder scoreStringBuilder = scores.get(entry.getKey());
219 if( scoreStringBuilder == null ) {
221 errormessage=String.format("Invalid T-Coffee score file: Sequence ID '%s' is not declared in header section", entry.getKey());
225 scoreStringBuilder.append( entry.getValue() );
230 * verify that all rows have the same width
232 for( StringBuilder str : scores.values() ) {
233 if( fWidth == null ) {
234 fWidth = str.length();
236 else if( fWidth != str.length() ) {
238 errormessage="Invalid T-Coffee score file: All the score sequences must have the same length";
248 static int parseInt( String str ) {
250 return Integer.parseInt(str);
252 catch( NumberFormatException e ) {
253 // TODO report a warning ?
259 * Reaad the header section in the T-Coffee score file format
261 * @param reader The scores reader
262 * @return The parser {@link Header} instance
263 * @throws RuntimeException when the header is not in the expected format
265 static Header readHeader(FileParse reader) throws IOException {
267 Header result = null;
269 result = new Header();
270 result.head = reader.nextLine();
274 while( (line = reader.nextLine()) != null ) {
275 if( line.startsWith("SCORE=")) {
276 result.score = parseInt( line.substring(6).trim() );
281 if( (line=reader.nextLine())==null || !"*".equals(line.trim())) { error(reader,"Invalid T-COFFEE score format (NO BAD/AVG/GOOD header)"); return null;}
282 if( (line=reader.nextLine())==null || !"BAD AVG GOOD".equals(line.trim())) { error(reader,"Invalid T-COFFEE score format (NO BAD/AVG/GOOD header)"); return null;}
283 if( (line=reader.nextLine())==null || !"*".equals(line.trim())) {error(reader,"Invalid T-COFFEE score format (NO BAD/AVG/GOOD header)"); return null;}
286 * now are expected a list if sequences ID up to the first blank line
288 while( (line=reader.nextLine()) != null ) {
289 if( "".equals(line) ) {
293 int p = line.indexOf(":");
295 // TODO report a warning
299 String id = line.substring(0,p).trim();
300 int val = parseInt(line.substring(p+1).trim());
301 if( "".equals(id) ) {
302 // TODO report warning
306 result.scores.put(id,val);
310 error(reader, "T-COFFEE score file had no per-sequence scores");
314 catch( IOException e ) {
315 error(reader,"Unexpected problem parsing T-Coffee score ascii file");
321 private static void error(FileParse reader, String errm)
324 if (reader.errormessage==null)
325 { reader.errormessage=errm;
327 reader.errormessage+="\n"+errm;
331 * Read a scores block ihe provided stream.
333 * @param reader The stream to parse
334 * @param size The expected number of the sequence to be read
335 * @return The {@link Block} instance read or {link null} null if the end of file has reached.
336 * @throws IOException Something went wrong on the 'wire'
338 static Block readBlock( FileParse reader, int size ) throws IOException {
339 Block result = new Block(size);
343 * read blank lines (eventually)
345 while( (line=reader.nextLine()) != null && "".equals(line.trim())) {
346 // consume blank lines
349 if( line == null ) { return null; }
352 * read the scores block
355 if( "".equals(line.trim()) ) {
360 // split the line on the first blank
361 // the first part have to contain the sequence id
362 // the remaining part are the scores values
363 int p = line.indexOf(" ");
365 if (reader.warningMessage==null) { reader.warningMessage=""; }
366 reader.warningMessage+="Possible parsing error - expected to find a space in line: '"+line+"'\n";
370 String id = line.substring(0,p).trim();
371 String val = line.substring(p+1).trim();
373 result.items.put(id, val);
375 } while( (line = reader.nextLine()) != null );
382 * The score file header
384 static class Header {
388 LinkedHashMap<String,Integer> scores = new LinkedHashMap<String,Integer>();
390 public int getScoreAvg() { return score; }
392 public int getScoreFor( String ID ) {
394 return scores.containsKey(ID) ? scores.get(ID) : -1;
400 * Hold a single block values block in the score file
404 Map<String,String> items;
406 public Block( int size ) {
408 this.items = new HashMap<String,String>(size);
411 String getScoresFor( String id ) {
412 return items.get(id);
415 String getConsensus() {
416 return items.get("cons");
420 * TCOFFEE score colourscheme
422 static final Color[] colors = {
423 new Color( 102, 102, 255 ), // #6666FF
424 new Color( 0, 255, 0), // #00FF00
425 new Color( 102, 255, 0), // #66FF00
426 new Color( 204, 255, 0), // #CCFF00
427 new Color( 255, 255, 0), // #FFFF00
428 new Color( 255, 204, 0), // #FFCC00
429 new Color( 255, 153, 0), // #FF9900
430 new Color( 255, 102, 0), // #FF6600
431 new Color( 255, 51, 0), // #FF3300
432 new Color( 255, 34, 0) // #FF2000
434 public final static String TCOFFEE_SCORE="TCoffeeScore";
436 * generate annotation for this TCoffee score set on the given alignment
437 * @param al alignment to annotate
438 * @param matchids if true, annotate sequences based on matching sequence names
439 * @return true if alignment annotation was modified, false otherwise.
// Attaches the parsed T-Coffee scores to the alignment as annotation rows obtained through
// findOrCreateAnnotation(TCOFFEE_SCORE, ...): one row per score entry.
// NOTE(review): several short statements of this method (declarations of i, s and val,
// branch heads, returns) are not visible in this view; the notes below describe only the
// statements that are shown.
441 public boolean annotateAlignment(AlignmentI al, boolean matchids)
// The alignment must have exactly the shape of the score matrix; otherwise only a
// warning message is recorded.
443 if (al.getHeight()!=getHeight() || al.getWidth()!=getWidth())
445 warningMessage="Alignment shape does not match T-Coffee score file shape.";
450 SequenceIdMatcher sidmatcher = new SequenceIdMatcher(al.getSequencesArray());
451 byte[][] scoreMatrix=getScoresArray();
452 // for 2.8 - we locate any existing TCoffee annotation and remove it first before adding this.
// Walk the score rows in insertion order; i selects the matching row of scoreMatrix.
453 for (Map.Entry<String,StringBuilder> id:scores.entrySet())
455 byte[] srow=scoreMatrix[i];
// The target sequence is found either by ID or by position - presumably selected by
// matchids; confirm against the full source.
459 s=sidmatcher.findIdMatch(id.getKey());
461 s=al.getSequenceAt(i);
// A row without a sequence is reported unless i equals the number of score rows or the
// row is named "cons".
464 if (s==null && i!=scores.size() && !id.getKey().equals("cons"))
466 System.err.println("No "+(matchids ? "match ":" sequences left ")+" for TCoffee score set : "+id.getKey());
// Only the columns present in both the alignment and the score row are filled in.
469 int jSize=al.getWidth()< srow.length ? al.getWidth() : srow.length;
470 Annotation[] annotations=new Annotation[al.getWidth()];
471 for (int j=0;j<jSize;j++) {
// Gap positions of a real sequence are tested separately from residue positions.
473 if (s!=null && jalview.util.Comparison.isGap(s.getCharAt(j)))
478 System.err.println("Warning: non-zero value for positional T-COFFEE score for gap at "+j+" in sequence "+s.getName());
// The score is shown as text only when there is no sequence (s == null); values 0..9 take
// their colour from the palette, anything else is white.
481 annotations[j]=new Annotation(s==null ? ""+val:null,s==null ? ""+val:null,'\0',val*1f,val >= 0 && val < colors.length ? colors[val] : Color.white);
484 // this will overwrite any existing t-coffee scores for the alignment
485 AlignmentAnnotation aa=al.findOrCreateAnnotation(TCOFFEE_SCORE,TCOFFEE_SCORE,false,s, null);
// Settings for a row tied to a sequence (s is dereferenced below): described by its ID,
// not shown below the alignment, scored with the per-sequence header value and mapped
// onto the sequence.
489 aa.description=""+id.getKey();
490 aa.annotations=annotations;
492 aa.belowAlignment=false;
493 aa.setScore(header.getScoreFor(id.getKey()));
494 aa.createSequenceMapping(s, s.getStart(),true);
495 s.addAlignmentAnnotation(aa);
496 aa.adjustForAlignment();
// Settings for the column reliability row - presumably the s == null case: no graph,
// shown below the alignment, scored with the overall header value.
498 aa.graph=AlignmentAnnotation.NO_GRAPH;
500 aa.description="TCoffee column reliability score";
501 aa.annotations=annotations;
502 aa.belowAlignment=true;
504 aa.setScore(header.getScoreAvg());
506 aa.showAllColLabels=true;
507 aa.validateRangeAndDisplay();
515 public String print()
517 // TODO Auto-generated method stub