3 import jalview.analysis.SequenceIdMatcher;
4 import jalview.datamodel.AlignmentAnnotation;
5 import jalview.datamodel.AlignmentI;
6 import jalview.datamodel.Annotation;
7 import jalview.datamodel.SequenceI;
10 import java.io.BufferedReader;
12 import java.io.FileNotFoundException;
13 import java.io.FileReader;
14 import java.io.IOException;
15 import java.io.Reader;
16 import java.util.ArrayList;
17 import java.util.HashMap;
18 import java.util.LinkedHashMap;
19 import java.util.List;
22 import javax.xml.parsers.ParserConfigurationException;
24 import org.xml.sax.SAXException;
26 import fr.orsay.lri.varna.exceptions.ExceptionFileFormatOrSyntax;
27 import fr.orsay.lri.varna.exceptions.ExceptionLoadingFailed;
28 import fr.orsay.lri.varna.exceptions.ExceptionPermissionDenied;
31 * A file parser for the T-Coffee score ascii format. This file contains the alignment consensus
32 * for each residue in any sequence.
34 * This file is produced by <code>t_coffee</code> providing the option
35 * <code>-output=score_ascii </code> to the program command line
37 * An example file is the following
40 * T-COFFEE, Version_9.02.r1228 (2012-02-16 18:15:12 - Revision 1228 - Build 336)
57 * 1PHT 999999999999999999999999998762112222543211112134
58 * 1BB9 99999999999999999999999999987-------4322----2234
59 * 1UHC 99999999999999999999999999987-------5321----2246
60 * 1YCS 99999999999999999999999999986-------4321----1-35
61 * 1OOT 999999999999999999999999999861-------3------1135
62 * 1ABO 99999999999999999999999999986-------422-------34
63 * 1FYN 99999999999999999999999999985-------32--------35
64 * 1QCF 99999999999999999999999999974-------2---------24
65 * cons 999999999999999999999999999851000110321100001134
68 * 1PHT ----------5666642367889999999999889
69 * 1BB9 1111111111676653-355679999999999889
70 * 1UHC ----------788774--66789999999999889
71 * 1YCS ----------78777--356789999999999889
72 * 1OOT ----------78877--356789999999997-67
73 * 1ABO ----------687774--56779999999999889
74 * 1FYN ----------6888842356789999999999889
75 * 1QCF ----------6878742356789999999999889
76 * cons 00100000006877641356789999999999889
80 * @author Paolo Di Tommaso
83 public class TCoffeeScoreFile extends AlignFile {
// Constructor taking (inFile, type). Its body is not visible in this view.
// NOTE(review): presumably forwards both arguments to the AlignFile superclass
// constructor, which would explain the long checked-exception list - confirm.
85 public TCoffeeScoreFile(String inFile, String type) throws IOException, ExceptionFileFormatOrSyntax, ParserConfigurationException, SAXException, ExceptionPermissionDenied, ExceptionLoadingFailed, InterruptedException
// Constructor taking an already-opened FileParse source. Its body is not visible
// in this view.
// NOTE(review): presumably forwards the source to the AlignFile superclass
// constructor - confirm.
91 public TCoffeeScoreFile(FileParse source) throws IOException, ExceptionFileFormatOrSyntax, ParserConfigurationException, SAXException, ExceptionPermissionDenied, ExceptionLoadingFailed, InterruptedException
96 /** The {@link Header} structure holder */
100 * Holds the consensus values for each sequence. It uses a LinkedHashMap to maintain the
103 LinkedHashMap<String,StringBuilder> scores;
108 * Parse the provided reader for the T-Coffee scores file format
// Static loader: reuses the reader when it already is a BufferedReader (wrapping it
// otherwise), parses it through doParsing(), and hands back the parsed object only
// when both the header and the scores were populated; otherwise it returns null.
// NOTE(review): this relies on a no-argument constructor and on doParsing(), neither
// of which is visible here - confirm that this method is still live code.
111 public static TCoffeeScoreFile load(Reader reader) {
114 BufferedReader in = (BufferedReader) (reader instanceof BufferedReader ? reader : new BufferedReader(reader));
115 TCoffeeScoreFile result = new TCoffeeScoreFile();
116 result.doParsing(in);
117 return result.header != null && result.scores != null ? result : null;
// Any failure is rethrown unchecked with the original exception kept as the cause.
119 catch( Exception e) {
120 throw new RuntimeException(e);
126 * @return The 'height' of the score matrix i.e. the number of score rows, which should match
127 * the number of sequences in the alignment
129 public int getHeight() {
130 // the last entry will always be the 'global' alingment consensus scores, so it is removed
131 // from the 'height' count to make this value compatible with the number of sequences in the MSA
132 return scores != null && scores.size() > 0 ? scores.size()-1 : 0;
136 * @return The 'width' of the score matrix i.e. the number of columns.
137 * Since the score values are supposed to be calculated for an 'aligned' MSA, all the entries
138 * have to have the same width.
140 public int getWidth() {
141 return fWidth != null ? fWidth : 0;
146 * Get the string of score values for the specified sequence ID.
147 * @param id The sequence ID
148 * @return The scores as a string of values e.g. {@code 99999987-------432}.
149 * It returns an empty string when the specified ID is missing.
151 public String getScoresFor( String id ) {
152 return scores!=null && scores.containsKey(id) ? scores.get(id).toString() : "";
156 * @return The list of score strings as a {@link List} object, in the same order as the insertion i.e. as in the MSA
158 public List<String> getScoresList() {
163 List<String> result = new ArrayList<String>( scores.size() );
164 for( Map.Entry<String,StringBuilder> it : scores.entrySet() ) {
165 result.add(it.getValue().toString());
172 * @return The parsed score values as a matrix of bytes
174 public byte[][] getScoresArray() {
179 byte[][] result = new byte[ scores.size() ][];
182 for( Map.Entry<String,StringBuilder> it : scores.entrySet() ) {
183 String line = it.getValue().toString();
184 byte[] seqValues = new byte[ line.length() ];
185 for( int j=0, c=line.length(); j<c; j++ ) {
187 byte val = (byte)(line.charAt(j) - '0');
189 seqValues[j] = ( val >= 0 && val <= 9 ) ? val : -1;
192 result[rowCount++] = seqValues;
// Parses the whole score file from this object: header first, then score blocks
// until readBlock() returns null, then a consistency check on the row widths.
// NOTE(review): several lines of this method (among them the declaration of
// 'block' and the exits of the error branches) are elided in this view.
199 public void parse() throws IOException
204 header = readHeader(this);
// No usable header: flag the error and stop; 'scores' is left unassigned.
206 if( header == null ) { error=true; return;}
207 scores = new LinkedHashMap<String,StringBuilder>();
210 * initilize the structure
// One empty builder per sequence ID declared in the header, in header order.
212 for( Map.Entry<String,Integer> entry : header.scores.entrySet() ) {
213 scores.put( entry.getKey(), new StringBuilder());
217 * go with the reading
220 while( (block = readBlock(this,header.scores.size())) != null ) {
223 * append sequences read in the block
225 for( Map.Entry<String,String> entry : block.items.entrySet() ) {
226 StringBuilder scoreStringBuilder = scores.get(entry.getKey());
// A null builder means the block names an ID that the header did not declare.
227 if( scoreStringBuilder == null ) {
229 errormessage=String.format("Invalid T-Coffee score file: Sequence ID '%s' is not declared in header section", entry.getKey());
// Each block contributes the next stretch of columns for its sequence.
233 scoreStringBuilder.append( entry.getValue() );
238 * verify that all rows have the same width
// The first row fixes fWidth; every later row is compared against it.
240 for( StringBuilder str : scores.values() ) {
241 if( fWidth == null ) {
242 fWidth = str.length();
244 else if( fWidth != str.length() ) {
246 errormessage="Invalid T-Coffee score file: All the score sequences must have the same length";
// Lenient integer parsing: delegates to Integer.parseInt and catches the
// NumberFormatException instead of letting it propagate.
// NOTE(review): the value produced by the catch branch is elided in this view -
// confirm which fallback callers receive for malformed input.
256 static int parseInt( String str ) {
258 return Integer.parseInt(str);
260 catch( NumberFormatException e ) {
261 // TODO report a warning ?
267 * Read the header section in the T-Coffee score file format
269 * @param reader The scores reader
270 * @return The parser {@link Header} instance
271 * @throws RuntimeException when the header is not in the expected format
// Reads the header section: the first line, the SCORE= value, the three-line
// BAD/AVG/GOOD banner, and then the per-sequence "id : value" entries.
// NOTE(review): several guard and exit lines of this method are elided in this
// view, so the comments below cover only what the visible statements do.
273 static Header readHeader(FileParse reader) throws IOException {
275 Header result = null;
277 result = new Header();
// The very first line of the file is stored in the header's 'head' field.
278 result.head = reader.nextLine();
// Scan forward for the "SCORE=" line; the text after the six-character prefix is
// trimmed and parsed with the lenient parseInt helper.
282 while( (line = reader.nextLine()) != null ) {
283 if( line.startsWith("SCORE=")) {
284 result.score = parseInt( line.substring(6).trim() );
// The next three lines must be "*", "BAD AVG GOOD" and "*" once trimmed; otherwise
// an error is recorded on the reader and null is returned.
289 if( (line=reader.nextLine())==null || !"*".equals(line.trim())) { error(reader,"Invalid T-COFFEE score format (NO BAD/AVG/GOOD header)"); return null;}
290 if( (line=reader.nextLine())==null || !"BAD AVG GOOD".equals(line.trim())) { error(reader,"Invalid T-COFFEE score format (NO BAD/AVG/GOOD header)"); return null;}
291 if( (line=reader.nextLine())==null || !"*".equals(line.trim())) {error(reader,"Invalid T-COFFEE score format (NO BAD/AVG/GOOD header)"); return null;}
294 * now are expected a list if sequences ID up to the first blank line
296 while( (line=reader.nextLine()) != null ) {
// The test is against the exact empty string: the line is not trimmed first.
297 if( "".equals(line) ) {
// Entries are split at the first ':' - ID on the left, integer score on the right.
301 int p = line.indexOf(":");
303 // TODO report a warning
307 String id = line.substring(0,p).trim();
308 int val = parseInt(line.substring(p+1).trim());
309 if( "".equals(id) ) {
310 // TODO report warning
314 result.scores.put(id,val);
// NOTE(review): the condition guarding this error call is elided in this view.
318 error(reader, "T-COFFEE score file had no per-sequence scores");
// An I/O failure is recorded on the reader as well.
322 catch( IOException e ) {
323 error(reader,"Unexpected problem parsing T-Coffee score ascii file");
// Records a parse error message on the reader: a first message is stored as it is,
// and a message is appended after a newline when one is already present.
// NOTE(review): lines are elided in this view (including the join between the two
// branches) - confirm whether anything besides errormessage is updated.
329 private static void error(FileParse reader, String errm)
332 if (reader.errormessage==null)
333 { reader.errormessage=errm;
335 reader.errormessage+="\n"+errm;
339 * Read a scores block from the provided stream.
341 * @param reader The stream to parse
342 * @param size The expected number of the sequence to be read
343 * @return The {@link Block} instance read or {@code null} if the end of file has been reached.
344 * @throws IOException Something went wrong on the 'wire'
// Reads one block of "id scores" lines: leading blank lines are skipped, then lines
// are collected into the block until a blank line or the end of the input.
// NOTE(review): the declaration of 'line', the opening of the do-loop and the
// guards and exits around the visible statements are elided in this view.
346 static Block readBlock( FileParse reader, int size ) throws IOException {
347 Block result = new Block(size);
351 * read blank lines (eventually)
353 while( (line=reader.nextLine()) != null && "".equals(line.trim())) {
354 // consume blank lines
// Nothing but blank lines remained: signal the end of the input with null.
357 if( line == null ) { return null; }
360 * read the scores block
// A line that is blank once trimmed marks the end of the block.
363 if( "".equals(line.trim()) ) {
368 // split the line on the first blank
369 // the first part have to contain the sequence id
370 // the remaining part are the scores values
371 int p = line.indexOf(" ");
// A warning is accumulated on the reader for a line that lacks the separator.
373 if (reader.warningMessage==null) { reader.warningMessage=""; }
374 reader.warningMessage+="Possible parsing error - expected to find a space in line: '"+line+"'\n";
378 String id = line.substring(0,p).trim();
379 String val = line.substring(p+1).trim();
// A later line with the same ID replaces the earlier value within this block.
381 result.items.put(id, val);
383 } while( (line = reader.nextLine()) != null );
390 * The score file header
392 static class Header {
396 LinkedHashMap<String,Integer> scores = new LinkedHashMap<String,Integer>();
398 public int getScoreAvg() { return score; }
400 public int getScoreFor( String ID ) {
402 return scores.containsKey(ID) ? scores.get(ID) : -1;
408 * Holds the values of a single block in the score file
412 Map<String,String> items;
// Creates an empty block; the item map is allocated with 'size' as its initial
// capacity.
// NOTE(review): a line between the signature and the map allocation is elided in
// this view.
414 public Block( int size ) {
416 this.items = new HashMap<String,String>(size);
419 String getScoresFor( String id ) {
420 return items.get(id);
423 String getConsensus() {
424 return items.get("cons");
428 * TCOFFEE score colourscheme
// Colour table indexed by score value: annotateAlignment picks colors[val] for a
// value inside the table and falls back to white otherwise.
430 static final Color[] colors = {
431 new Color( 102, 102, 255 ), // #6666FF
432 new Color( 0, 255, 0), // #00FF00
433 new Color( 102, 255, 0), // #66FF00
434 new Color( 204, 255, 0), // #CCFF00
435 new Color( 255, 255, 0), // #FFFF00
436 new Color( 255, 204, 0), // #FFCC00
437 new Color( 255, 153, 0), // #FF9900
438 new Color( 255, 102, 0), // #FF6600
439 new Color( 255, 51, 0), // #FF3300
// NOTE(review): 34 is 0x22, so this RGB is #FF2200 rather than the #FF2000 written
// alongside it - confirm which of the two was intended.
440 new Color( 255, 34, 0) // #FF2000
// Label under which annotateAlignment finds or creates the score annotation rows.
442 public final static String TCOFFEE_SCORE="TCoffeeScore";
444 * generate annotation for this TCoffee score set on the given alignment
445 * @param al alignment to annotate
446 * @param matchids if true, annotate sequences based on matching sequence names
447 * @return true if alignment annotation was modified, false otherwise.
// Turns the parsed scores into alignment annotation, one annotation row per entry
// of the score map.
// NOTE(review): many control-flow lines of this method (branch heads, the
// declarations of 'i', 's' and 'val', the return statements) are elided in this
// view; the comments below cover only what the visible statements do.
449 public boolean annotateAlignment(AlignmentI al, boolean matchids)
// The score matrix must have the same height and width as the alignment; when it
// does not, a warning message is recorded.
451 if (al.getHeight()!=getHeight() || al.getWidth()!=getWidth())
453 warningMessage="Alignment shape does not match T-Coffee score file shape.";
458 SequenceIdMatcher sidmatcher = new SequenceIdMatcher(al.getSequencesArray());
459 byte[][] scoreMatrix=getScoresArray();
460 // for 2.8 - we locate any existing TCoffee annotation and remove it first before adding this.
// Entries are visited in insertion order; row i of the matrix belongs to entry i.
461 for (Map.Entry<String,StringBuilder> id:scores.entrySet())
463 byte[] srow=scoreMatrix[i];
// The target sequence comes either from the ID matcher or from the position i.
467 s=sidmatcher.findIdMatch(id.getKey());
469 s=al.getSequenceAt(i);
// A missing sequence is reported unless the entry is the "cons" row.
472 if (s==null && i!=scores.size() && !id.getKey().equals("cons"))
474 System.err.println("No "+(matchids ? "match ":" sequences left ")+" for TCoffee score set : "+id.getKey());
// Only the columns present in both the alignment and the score row are filled.
477 int jSize=al.getWidth()< srow.length ? al.getWidth() : srow.length;
478 Annotation[] annotations=new Annotation[al.getWidth()];
479 for (int j=0;j<jSize;j++) {
// Gap positions of a matched sequence are handled separately from residues.
481 if (s!=null && jalview.util.Comparison.isGap(s.getCharAt(j)))
486 System.err.println("Warning: non-zero value for positional T-COFFEE score for gap at "+j+" in sequence "+s.getName());
// The value is shown as text only when there is no sequence (s == null); the colour
// comes from the table when the value is a valid index and is white otherwise.
489 annotations[j]=new Annotation(s==null ? ""+val:null,s==null ? ""+val:null,'\0',val*1f,val >= 0 && val < colors.length ? colors[val] : Color.white);
492 // this will overwrite any existing t-coffee scores for the alignment
493 AlignmentAnnotation aa=al.findOrCreateAnnotation(TCOFFEE_SCORE,false,s,null);
// These statements dereference s, so they apply to a row tied to a sequence: the
// row is described by its ID, scored from the header and mapped onto the sequence.
497 aa.description=""+id.getKey();
498 aa.annotations=annotations;
500 aa.belowAlignment=false;
501 aa.setScore(header.getScoreFor(id.getKey()));
502 aa.createSequenceMapping(s, s.getStart(),true);
503 s.addAlignmentAnnotation(aa);
504 aa.adjustForAlignment();
// These statements set up the column reliability row, which is placed below the
// alignment and scored with the header's overall value.
506 aa.graph=AlignmentAnnotation.NO_GRAPH;
508 aa.description="TCoffee column reliability score";
509 aa.annotations=annotations;
510 aa.belowAlignment=true;
512 aa.setScore(header.getScoreAvg());
514 aa.showAllColLabels=true;
515 aa.validateRangeAndDisplay();
// Output of this format is not implemented: the body is an auto-generated stub.
// NOTE(review): the statement that produces the return value is elided in this
// view - confirm what callers receive.
523 public String print()
525 // TODO Auto-generated method stub