3 import jalview.analysis.SequenceIdMatcher;
4 import jalview.datamodel.AlignmentAnnotation;
5 import jalview.datamodel.AlignmentI;
6 import jalview.datamodel.Annotation;
7 import jalview.datamodel.SequenceI;
10 import java.io.BufferedReader;
12 import java.io.FileNotFoundException;
13 import java.io.FileReader;
14 import java.io.IOException;
15 import java.io.Reader;
16 import java.util.ArrayList;
17 import java.util.HashMap;
18 import java.util.LinkedHashMap;
19 import java.util.List;
22 import javax.xml.parsers.ParserConfigurationException;
24 import org.xml.sax.SAXException;
26 import fr.orsay.lri.varna.exceptions.ExceptionFileFormatOrSyntax;
27 import fr.orsay.lri.varna.exceptions.ExceptionLoadingFailed;
28 import fr.orsay.lri.varna.exceptions.ExceptionPermissionDenied;
29 import fr.orsay.lri.varna.exceptions.ExceptionUnmatchedClosingParentheses;
32 * A file parser for the T-Coffee score ASCII format. This file contains the alignment consensus
33 * for each residue in any sequence.
35 * This file is produced by <code>t_coffee</code> when the option
36 * <code>-output=score_ascii</code> is passed on the program command line
38 * An example file is the following
41 * T-COFFEE, Version_9.02.r1228 (2012-02-16 18:15:12 - Revision 1228 - Build 336)
58 * 1PHT 999999999999999999999999998762112222543211112134
59 * 1BB9 99999999999999999999999999987-------4322----2234
60 * 1UHC 99999999999999999999999999987-------5321----2246
61 * 1YCS 99999999999999999999999999986-------4321----1-35
62 * 1OOT 999999999999999999999999999861-------3------1135
63 * 1ABO 99999999999999999999999999986-------422-------34
64 * 1FYN 99999999999999999999999999985-------32--------35
65 * 1QCF 99999999999999999999999999974-------2---------24
66 * cons 999999999999999999999999999851000110321100001134
69 * 1PHT ----------5666642367889999999999889
70 * 1BB9 1111111111676653-355679999999999889
71 * 1UHC ----------788774--66789999999999889
72 * 1YCS ----------78777--356789999999999889
73 * 1OOT ----------78877--356789999999997-67
74 * 1ABO ----------687774--56779999999999889
75 * 1FYN ----------6888842356789999999999889
76 * 1QCF ----------6878742356789999999999889
77 * cons 00100000006877641356789999999999889
81 * @author Paolo Di Tommaso
84 public class TCoffeeScoreFile extends AlignFile {
86 public TCoffeeScoreFile(String inFile, String type) throws IOException, ExceptionFileFormatOrSyntax, ParserConfigurationException, SAXException, ExceptionPermissionDenied, ExceptionLoadingFailed, InterruptedException, ExceptionUnmatchedClosingParentheses
92 public TCoffeeScoreFile(FileParse source) throws IOException, ExceptionFileFormatOrSyntax, ParserConfigurationException, SAXException, ExceptionPermissionDenied, ExceptionLoadingFailed, InterruptedException, ExceptionUnmatchedClosingParentheses
97 /** The {@link Header} structure holder */
101 * Holds the consensus values for each sequence. It uses a LinkedHashMap to maintain the
104 LinkedHashMap<String,StringBuilder> scores;
109 * Parse the provided reader for the T-Coffee scores file format
112 public static TCoffeeScoreFile load(Reader reader) {
    // Reuse the caller's reader when it is already buffered; otherwise wrap it.
115 BufferedReader in = (BufferedReader) (reader instanceof BufferedReader ? reader : new BufferedReader(reader));
116 TCoffeeScoreFile result = new TCoffeeScoreFile();
117 result.doParsing(in);
    // The parsed file is handed back only when both the header and the scores were populated; otherwise null.
118 return result.header != null && result.scores != null ? result : null;
    // Any failure is rethrown unchecked, with the original exception kept as the cause.
120 catch( Exception e) {
121 throw new RuntimeException(e);
127 * @return The 'height' of the score matrix i.e. the number of score rows, which should match
128 * the number of sequences in the alignment
130 public int getHeight() {
131 // the last entry is always the 'global' alignment consensus scores, so it is excluded
132 // from the 'height' count to keep this value comparable with the number of sequences in the MSA
    // a missing or empty score table yields 0
133 return scores != null && scores.size() > 0 ? scores.size()-1 : 0;
137 * @return The 'width' of the score matrix i.e. the number of columns.
138 * Since the score values are supposed to be calculated for an 'aligned' MSA, all the entries
139 * have to have the same width.
141 public int getWidth() {
    // fWidth may still be null (parse() assigns it from the first score row); report 0 in that case.
142 return fWidth != null ? fWidth : 0;
147 * Get the string of score values for the specified sequence ID.
148 * @param id The sequence ID
149 * @return The scores as a string of values e.g. {@code 99999987-------432}.
150 * It returns an empty string when the specified ID is missing.
152 public String getScoresFor( String id ) {
    // An unknown ID, or a score table that was never built (scores == null), gives "" rather than null.
153 return scores!=null && scores.containsKey(id) ? scores.get(id).toString() : "";
157 * @return The list of score strings as a {@link List} object, in the same order as the insertion i.e. as in the MSA
159 public List<String> getScoresList() {
    // One string per entry of the score table, copied in the table's iteration order.
164 List<String> result = new ArrayList<String>( scores.size() );
165 for( Map.Entry<String,StringBuilder> it : scores.entrySet() ) {
166 result.add(it.getValue().toString());
173 * @return The parsed score values as a matrix of bytes
175 public byte[][] getScoresArray() {
    // One (initially unallocated) row per entry of the score table.
180 byte[][] result = new byte[ scores.size() ][];
183 for( Map.Entry<String,StringBuilder> it : scores.entrySet() ) {
184 String line = it.getValue().toString();
185 byte[] seqValues = new byte[ line.length() ];
186 for( int j=0, c=line.length(); j<c; j++ ) {
    // Convert the character to its numeric value relative to '0'.
188 byte val = (byte)(line.charAt(j) - '0');
    // Anything that is not a digit 0..9 (e.g. the '-' placeholder) is stored as -1.
190 seqValues[j] = ( val >= 0 && val <= 9 ) ? val : -1;
193 result[rowCount++] = seqValues;
200 public void parse() throws IOException
205 header = readHeader(this);
    // Without a usable header nothing else can be parsed: flag the error and stop.
207 if( header == null ) { error=true; return;}
208 scores = new LinkedHashMap<String,StringBuilder>();
211 * initilize the structure
    // One empty accumulator per sequence ID declared in the header, in the header's order.
213 for( Map.Entry<String,Integer> entry : header.scores.entrySet() ) {
214 scores.put( entry.getKey(), new StringBuilder());
218 * go with the reading
    // readBlock() returns null once the input is exhausted, which ends the loop.
221 while( (block = readBlock(this,header.scores.size())) != null ) {
224 * append sequences read in the block
226 for( Map.Entry<String,String> entry : block.items.entrySet() ) {
    // Accumulators exist only for IDs declared in the header; a miss means an undeclared ID.
227 StringBuilder scoreStringBuilder = scores.get(entry.getKey());
228 if( scoreStringBuilder == null ) {
230 errormessage=String.format("Invalid T-Coffee score file: Sequence ID '%s' is not declared in header section", entry.getKey());
    // Each block contributes the next slice of columns for this sequence.
234 scoreStringBuilder.append( entry.getValue() );
239 * verify that all rows have the same width
241 for( StringBuilder str : scores.values() ) {
    // The first row fixes the expected width; every later row is compared against it.
242 if( fWidth == null ) {
243 fWidth = str.length();
245 else if( fWidth != str.length() ) {
247 errormessage="Invalid T-Coffee score file: All the score sequences must have the same length";
257 static int parseInt( String str ) {
    // Thin wrapper over Integer.parseInt: a malformed number is caught below instead of propagating.
259 return Integer.parseInt(str);
261 catch( NumberFormatException e ) {
262 // TODO report a warning ?
268 * Read the header section in the T-Coffee score file format
270 * @param reader The scores reader
271 * @return The parsed {@link Header} instance
272 * @throws RuntimeException when the header is not in the expected format
274 static Header readHeader(FileParse reader) throws IOException {
276 Header result = null;
278 result = new Header();
    // The first line of the file is stored as-is in the header.
279 result.head = reader.nextLine();
    // Scan forward for the "SCORE=" line; the text after that 6-character prefix is the overall score.
283 while( (line = reader.nextLine()) != null ) {
284 if( line.startsWith("SCORE=")) {
285 result.score = parseInt( line.substring(6).trim() );
    // The next three lines must be "*", "BAD AVG GOOD" and "*" (surrounding whitespace ignored);
    // otherwise an error is recorded on the reader and null is returned.
290 if( (line=reader.nextLine())==null || !"*".equals(line.trim())) { error(reader,"Invalid T-COFFEE score format (NO BAD/AVG/GOOD header)"); return null;}
291 if( (line=reader.nextLine())==null || !"BAD AVG GOOD".equals(line.trim())) { error(reader,"Invalid T-COFFEE score format (NO BAD/AVG/GOOD header)"); return null;}
292 if( (line=reader.nextLine())==null || !"*".equals(line.trim())) {error(reader,"Invalid T-COFFEE score format (NO BAD/AVG/GOOD header)"); return null;}
295 * now are expected a list if sequences ID up to the first blank line
297 while( (line=reader.nextLine()) != null ) {
298 if( "".equals(line) ) {
    // Each entry has the form <id>:<value>; the line is split at the first ':'.
302 int p = line.indexOf(":");
304 // TODO report a warning
308 String id = line.substring(0,p).trim();
309 int val = parseInt(line.substring(p+1).trim());
310 if( "".equals(id) ) {
311 // TODO report warning
    // Per-sequence scores are kept in the order in which they appear in the file.
315 result.scores.put(id,val);
319 error(reader, "T-COFFEE score file had no per-sequence scores");
    // I/O failures are recorded on the reader through error().
323 catch( IOException e ) {
324 error(reader,"Unexpected problem parsing T-Coffee score ascii file");
330 private static void error(FileParse reader, String errm)
    // First message for this reader: store it as-is.
333 if (reader.errormessage==null)
334 { reader.errormessage=errm;
    // Later messages are appended on a new line, so earlier ones are preserved.
336 reader.errormessage+="\n"+errm;
340 * Read a scores block from the provided stream.
342 * @param reader The stream to parse
343 * @param size The expected number of the sequences to be read
344 * @return The {@link Block} instance read, or {@code null} if the end of file has been reached.
345 * @throws IOException Something went wrong on the 'wire'
347 static Block readBlock( FileParse reader, int size ) throws IOException {
348 Block result = new Block(size);
352 * read blank lines (eventually)
354 while( (line=reader.nextLine()) != null && "".equals(line.trim())) {
355 // consume blank lines
    // The input ended while skipping blank lines: there is no further block.
358 if( line == null ) { return null; }
361 * read the scores block
364 if( "".equals(line.trim()) ) {
369 // split the line on the first blank
370 // the first part has to contain the sequence id
371 // the remaining part is the score values
372 int p = line.indexOf(" ");
    // Warnings accumulate on the reader, one per line of text.
374 if (reader.warningMessage==null) { reader.warningMessage=""; }
375 reader.warningMessage+="Possible parsing error - expected to find a space in line: '"+line+"'\n";
379 String id = line.substring(0,p).trim();
380 String val = line.substring(p+1).trim();
    // Rows are stored by ID; a repeated ID within one block replaces the earlier value (Map.put).
382 result.items.put(id, val);
384 } while( (line = reader.nextLine()) != null );
391 * The score file header
393 static class Header {
    // Per-sequence scores keyed by sequence ID; LinkedHashMap keeps the order in which they were inserted.
397 LinkedHashMap<String,Integer> scores = new LinkedHashMap<String,Integer>();
    // Returns the value stored in the score field.
399 public int getScoreAvg() { return score; }
401 public int getScoreFor( String ID ) {
    // -1 signals an ID that has no entry in the header's score table.
403 return scores.containsKey(ID) ? scores.get(ID) : -1;
409 * Holds the values of a single block in the score file
    // Score string of each row in this block, keyed by the row's ID.
413 Map<String,String> items;
415 public Block( int size ) {
    // size is passed to HashMap as its initial capacity.
417 this.items = new HashMap<String,String>(size);
420 String getScoresFor( String id ) {
    // null when this block holds no row for the given id.
421 return items.get(id);
424 String getConsensus() {
    // The consensus row is the one stored under the ID "cons".
425 return items.get("cons");
429 * TCOFFEE score colourscheme
431 static final Color[] colors = {
    // Indexed by score value (colors[val] in annotateAlignment); each entry's hex value is in its trailing comment.
432 new Color( 102, 102, 255 ), // #6666FF
433 new Color( 0, 255, 0), // #00FF00
434 new Color( 102, 255, 0), // #66FF00
435 new Color( 204, 255, 0), // #CCFF00
436 new Color( 255, 255, 0), // #FFFF00
437 new Color( 255, 204, 0), // #FFCC00
438 new Color( 255, 153, 0), // #FF9900
439 new Color( 255, 102, 0), // #FF6600
440 new Color( 255, 51, 0), // #FF3300
    // Green component is 0x20 = 32 to match the #FF2000 label (it previously read 34 = 0x22).
441 new Color( 255, 32, 0) // #FF2000
443 public final static String TCOFFEE_SCORE="TCoffeeScore";
445 * generate annotation for this TCoffee score set on the given alignment
446 * @param al alignment to annotate
447 * @param matchids if true, annotate sequences based on matching sequence names
448 * @return true if alignment annotation was modified, false otherwise.
450 public boolean annotateAlignment(AlignmentI al, boolean matchids)
    // getHeight() excludes the trailing consensus row, so this compares sequence rows and columns only.
452 if (al.getHeight()!=getHeight() || al.getWidth()!=getWidth())
454 warningMessage="Alignment shape does not match T-Coffee score file shape.";
459 SequenceIdMatcher sidmatcher = new SequenceIdMatcher(al.getSequencesArray());
460 byte[][] scoreMatrix=getScoresArray();
461 // for 2.8 - we locate any existing TCoffee annotation and remove it first before adding this.
462 for (Map.Entry<String,StringBuilder> id:scores.entrySet())
464 byte[] srow=scoreMatrix[i];
    // Two ways of choosing the target sequence are visible: by ID match, or by row position.
468 s=sidmatcher.findIdMatch(id.getKey());
470 s=al.getSequenceAt(i);
    // NOTE(review): i indexes scoreMatrix rows, so i!=scores.size() looks always true here;
    // confirm whether scores.size()-1 was intended. The "cons" check already exempts the consensus row.
473 if (s==null && i!=scores.size() && !id.getKey().equals("cons"))
475 System.err.println("No "+(matchids ? "match ":" sequences left ")+" for TCoffee score set : "+id.getKey());
    // Only as many columns as both the alignment and the score row provide are filled in.
478 int jSize=al.getWidth()< srow.length ? al.getWidth() : srow.length;
479 Annotation[] annotations=new Annotation[al.getWidth()];
480 for (int j=0;j<jSize;j++) {
482 if (s!=null && jalview.util.Comparison.isGap(s.getCharAt(j)))
487 System.err.println("Warning: non-zero value for positional T-COFFEE score for gap at "+j+" in sequence "+s.getName());
    // With no sequence (s == null) the value itself is used as the label text;
    // values outside the colour table fall back to white.
490 annotations[j]=new Annotation(s==null ? ""+val:null,s==null ? ""+val:null,'\0',val*1f,val >= 0 && val < colors.length ? colors[val] : Color.white);
493 // this will overwrite any existing t-coffee scores for the alignment
494 AlignmentAnnotation aa=al.findOrCreateAnnotation(TCOFFEE_SCORE,TCOFFEE_SCORE,false,s, null);
    // Settings for a score row attached to a sequence: described by its ID, scored with the header's per-sequence value.
498 aa.description=""+id.getKey();
499 aa.annotations=annotations;
501 aa.belowAlignment=false;
502 aa.setScore(header.getScoreFor(id.getKey()));
503 aa.createSequenceMapping(s, s.getStart(),true);
504 s.addAlignmentAnnotation(aa);
505 aa.adjustForAlignment();
507 aa.graph=AlignmentAnnotation.NO_GRAPH;
    // Settings for the column reliability annotation: placed below the alignment, scored with the header's overall value.
509 aa.description="TCoffee column reliability score";
510 aa.annotations=annotations;
511 aa.belowAlignment=true;
513 aa.setScore(header.getScoreAvg());
515 aa.showAllColLabels=true;
516 aa.validateRangeAndDisplay();
524 public String print()
526 // TODO Auto-generated method stub