package jalview.io; import jalview.analysis.SequenceIdMatcher; import jalview.datamodel.AlignmentAnnotation; import jalview.datamodel.AlignmentI; import jalview.datamodel.Annotation; import jalview.datamodel.SequenceI; import java.awt.Color; import java.io.BufferedReader; import java.io.File; import java.io.FileNotFoundException; import java.io.FileReader; import java.io.IOException; import java.io.Reader; import java.util.ArrayList; import java.util.HashMap; import java.util.LinkedHashMap; import java.util.List; import java.util.Map; /** * A file parser for T-Coffee score ascii format. This file contains the alignment consensus * for each residue in any sequence. *

* This file is produced by t_coffee providing the option * -output=score_ascii to the program command line * * An example file is the following * *

 * T-COFFEE, Version_9.02.r1228 (2012-02-16 18:15:12 - Revision 1228 - Build 336)
 * Cedric Notredame 
 * CPU TIME:0 sec.
 * SCORE=90
 * *
 *  BAD AVG GOOD
 * *
 * 1PHT   :  89
 * 1BB9   :  90
 * 1UHC   :  94
 * 1YCS   :  94
 * 1OOT   :  93
 * 1ABO   :  94
 * 1FYN   :  94
 * 1QCF   :  94
 * cons   :  90
 * 
 * 1PHT   999999999999999999999999998762112222543211112134
 * 1BB9   99999999999999999999999999987-------4322----2234
 * 1UHC   99999999999999999999999999987-------5321----2246
 * 1YCS   99999999999999999999999999986-------4321----1-35
 * 1OOT   999999999999999999999999999861-------3------1135
 * 1ABO   99999999999999999999999999986-------422-------34
 * 1FYN   99999999999999999999999999985-------32--------35
 * 1QCF   99999999999999999999999999974-------2---------24
 * cons   999999999999999999999999999851000110321100001134
 * 
 * 
 * 1PHT   ----------5666642367889999999999889
 * 1BB9   1111111111676653-355679999999999889
 * 1UHC   ----------788774--66789999999999889
 * 1YCS   ----------78777--356789999999999889
 * 1OOT   ----------78877--356789999999997-67
 * 1ABO   ----------687774--56779999999999889
 * 1FYN   ----------6888842356789999999999889
 * 1QCF   ----------6878742356789999999999889
 * cons   00100000006877641356789999999999889
 *

*
*
* @author Paolo Di Tommaso
*
*/
public class TCoffeeScoreFile {

  /** The {@link Header} structure holder; {@code null} when the header could not be parsed. */
  Header header;

  /**
   * Holds the score string for each sequence ID. A LinkedHashMap is used to
   * maintain the insertion order, i.e. the order in which IDs are declared in
   * the file header.
   */
  LinkedHashMap<String, StringBuilder> scores = new LinkedHashMap<String, StringBuilder>();

  /** Common length of all score strings; {@code null} until a file has been parsed. */
  Integer fWidth;

  /**
   * Parse the specified file.
   *
   * @param file The file to be parsed
   * @return the parsed score file, or {@code null} when the header is not in
   *         the expected format
   * @throws RuntimeException when the file cannot be opened or parsed
   */
  public static TCoffeeScoreFile load(File file) {
    FileReader reader;
    try {
      reader = new FileReader(file);
    } catch (FileNotFoundException e) {
      throw new RuntimeException(e);
    }

    try {
      return load(reader);
    } finally {
      // this method opened the reader, so it is responsible for releasing it
      try {
        reader.close();
      } catch (IOException ignored) {
        // best effort: a failure on close must not hide the parsing outcome
      }
    }
  }

  /**
   * Parse the provided reader for the T-Coffee scores file format. The reader
   * is not closed by this method.
   *
   * @param reader the source of the score file content
   * @return the parsed score file, or {@code null} when the header is not in
   *         the expected format
   * @throws RuntimeException wrapping any exception raised while parsing
   */
  public static TCoffeeScoreFile load(Reader reader) {
    try {
      BufferedReader in = (BufferedReader) (reader instanceof BufferedReader
              ? reader
              : new BufferedReader(reader));
      TCoffeeScoreFile result = new TCoffeeScoreFile();
      result.doParsing(in);
      return result.header != null && result.scores != null ? result : null;
    } catch (Exception e) {
      throw new RuntimeException(e);
    }
  }

  /**
   * @return The 'height' of the score matrix i.e. the number of score rows,
   *         which should match the number of sequences in the alignment
   */
  public int getHeight() {
    // the last entry will always be the 'global' alignment consensus scores, so it is removed
    // from the 'height' count to make this value compatible with the number of sequences in the MSA
    return scores != null && scores.size() > 0 ? scores.size() - 1 : 0;
  }

  /**
   * @return The 'width' of the score matrix i.e. the number of columns. Since
   *         the score values are supposed to be calculated for an 'aligned'
   *         MSA, all the entries have to have the same width. Returns 0 when
   *         nothing has been parsed.
   */
  public int getWidth() {
    return fWidth != null ? fWidth.intValue() : 0;
  }

  /**
   * The default constructor is marked as {@code protected} since this class is
   * meant to be created through the {@link #load(File)} or
   * {@link #load(Reader)} factory methods.
   */
  protected TCoffeeScoreFile() {
  }

  /**
   * Get the string of score values for the specified sequence ID.
   *
   * @param id The sequence ID
   * @return The scores as a string of values e.g. {@code 99999987-------432}.
   *         It returns an empty string when the specified ID is missing.
   */
  public String getScoresFor(String id) {
    StringBuilder row = scores.get(id);
    return row != null ? row.toString() : "";
  }

  /**
   * @return The list of score strings as a {@link List} object, in the same
   *         order of insertion i.e. the order of declaration in the file
   */
  public List<String> getScoresList() {
    List<String> result = new ArrayList<String>(scores.size());
    for (Map.Entry<String, StringBuilder> it : scores.entrySet()) {
      result.add(it.getValue().toString());
    }
    return result;
  }

  /**
   * @return The parsed score values as a matrix of bytes, one row per entry of
   *         {@link #scores}. Each digit character maps to its value (0-9); any
   *         other character (e.g. the gap symbol {@code -}) maps to -1.
   */
  public byte[][] getScoresArray() {
    byte[][] result = new byte[scores.size()][];

    int rowCount = 0;
    for (Map.Entry<String, StringBuilder> it : scores.entrySet()) {
      String line = it.getValue().toString();
      byte[] seqValues = new byte[line.length()];
      for (int j = 0, c = line.length(); j < c; j++) {
        // compare as int: narrowing to byte first could wrap a non-ASCII
        // character into the 0..9 range
        int digit = line.charAt(j) - '0';
        seqValues[j] = (digit >= 0 && digit <= 9) ? (byte) digit : -1;
      }
      result[rowCount++] = seqValues;
    }

    return result;
  }

  /**
   * Read the header and then every score block from the reader, filling
   * {@link #header}, {@link #scores} and {@link #fWidth}.
   *
   * @param in the reader positioned at the beginning of the score file
   * @throws IOException when reading a score block fails
   * @throws RuntimeException when a block uses an ID not declared in the header
   *           or when the score rows do not all have the same length
   */
  private void doParsing(BufferedReader in) throws IOException {
    /*
     * read the header
     */
    header = readHeader(in);
    if (header == null) {
      return;
    }

    /*
     * initialize the structure: one (empty) score row per declared ID
     */
    for (Map.Entry<String, Integer> entry : header.scores.entrySet()) {
      scores.put(entry.getKey(), new StringBuilder());
    }

    /*
     * go with the reading
     */
    Block block;
    while ((block = readBlock(in, header.scores.size())) != null) {
      /*
       * append sequences read in the block
       */
      for (Map.Entry<String, String> entry : block.items.entrySet()) {
        StringBuilder scoreStringBuilder = scores.get(entry.getKey());
        if (scoreStringBuilder == null) {
          throw new RuntimeException(String.format(
                  "Invalid T-Coffee score file: Sequence ID '%s' is not declared in header section",
                  entry.getKey()));
        }
        scoreStringBuilder.append(entry.getValue());
      }
    }

    /*
     * verify that all rows have the same width
     */
    for (StringBuilder str : scores.values()) {
      if (fWidth == null) {
        fWidth = str.length();
      } else if (fWidth.intValue() != str.length()) {
        throw new RuntimeException(
                "Invalid T-Coffee score file: All the score sequences must have the same length");
      }
    }
  }

  /**
   * Lenient integer parsing.
   *
   * @param str the text to parse
   * @return the parsed value, or 0 when {@code str} is not a valid integer
   */
  static int parseInt(String str) {
    try {
      return Integer.parseInt(str);
    } catch (NumberFormatException e) {
      // TODO report a warning ?
      return 0;
    }
  }

  /**
   * Read the header section in the T-Coffee score file format.
   *
   * @param reader The scores reader
   * @return The parsed {@link Header} instance, or {@code null} when the
   *         {@code SCORE=} line or the {@code BAD AVG GOOD} legend is missing
   * @throws RuntimeException when the reader raises an I/O error
   */
  static Header readHeader(BufferedReader reader) {
    Header result = null;
    try {
      result = new Header();
      result.head = reader.readLine();

      String line;

      // skip everything up to (and including) the global score line
      while ((line = reader.readLine()) != null) {
        if (line.startsWith("SCORE=")) {
          result.score = parseInt(line.substring(6).trim());
          break;
        }
      }

      // the legend: a '*' line, the 'BAD AVG GOOD' line and another '*' line
      if ((line = reader.readLine()) == null || !"*".equals(line.trim())) {
        return null;
      }
      if ((line = reader.readLine()) == null || !"BAD AVG GOOD".equals(line.trim())) {
        return null;
      }
      if ((line = reader.readLine()) == null || !"*".equals(line.trim())) {
        return null;
      }

      /*
       * now a list of sequence IDs is expected, up to the first blank line
       */
      while ((line = reader.readLine()) != null) {
        // trim before testing, as readBlock does: a whitespace-only separator
        // must end the list, otherwise the score blocks would be swallowed here
        if ("".equals(line.trim())) {
          break;
        }

        int p = line.indexOf(":");
        if (p == -1) {
          // TODO report a warning
          continue;
        }

        String id = line.substring(0, p).trim();
        int val = parseInt(line.substring(p + 1).trim());
        if ("".equals(id)) {
          // TODO report warning
          continue;
        }

        result.scores.put(id, val);
      }
    } catch (IOException e) {
      throw new RuntimeException("Cannot parse T-Coffee score ascii file", e);
    }

    return result;
  }

  /**
   * Read a scores block from the provided stream.
   *
   * @param reader The stream to parse
   * @param size The expected number of the sequences to be read
   * @return The {@link Block} instance read or {@code null} if the end of file
   *         has been reached.
   * @throws IOException Something went wrong on the 'wire'
   */
  static Block readBlock(BufferedReader reader, int size) throws IOException {
    Block result = new Block(size);
    String line;

    /*
     * read blank lines (eventually)
     */
    while ((line = reader.readLine()) != null && "".equals(line.trim())) {
      // consume blank lines
    }

    if (line == null) {
      return null;
    }

    /*
     * read the scores block
     */
    do {
      if ("".equals(line.trim())) {
        // terminated
        break;
      }

      // split the line on the first blank:
      // the first part has to contain the sequence id,
      // the remaining part holds the score values
      int p = line.indexOf(" ");
      if (p == -1) {
        // TODO This is an unexpected condition, log a warning or throw an exception ?
        continue;
      }

      String id = line.substring(0, p).trim();
      String val = line.substring(p + 1).trim();

      result.items.put(id, val);

    } while ((line = reader.readLine()) != null);

    return result;
  }

  /*
   * The score file header
   */
  static class Header {
    /** First line of the file. */
    String head;

    /** Value read from the {@code SCORE=} line. */
    int score;

    /** Per-ID score read from the {@code ID : value} lines, in file order. */
    LinkedHashMap<String, Integer> scores = new LinkedHashMap<String, Integer>();

    /** @return the value read from the {@code SCORE=} line */
    public int getScoreAvg() {
      return score;
    }

    /**
     * @param ID the sequence ID
     * @return the score declared in the header for the ID, or -1 when the ID
     *         is not declared
     */
    public int getScoreFor(String ID) {
      return scores.containsKey(ID) ? scores.get(ID) : -1;
    }
  }

  /*
   * Hold a single block of values in the score file
   */
  static class Block {
    /** Expected number of rows, as passed to the constructor. */
    int size;

    /** Score string of each row of the block, keyed by sequence ID. */
    Map<String, String> items;

    public Block(int size) {
      this.size = size;
      this.items = new HashMap<String, String>(size);
    }

    /** @return the score string for the ID in this block, or {@code null} when absent */
    String getScoresFor(String id) {
      return items.get(id);
    }

    /** @return the score string of the {@code cons} row, or {@code null} when absent */
    String getConsensus() {
      return items.get("cons");
    }
  }

  /**
   * TCOFFEE score colourscheme, indexed by score value (0-9)
   */
  static final Color[] colors = {
      new Color(102, 102, 255), // #6666FF
      new Color(0, 255, 0), // #00FF00
      new Color(102, 255, 0), // #66FF00
      new Color(204, 255, 0), // #CCFF00
      new Color(255, 255, 0), // #FFFF00
      new Color(255, 204, 0), // #FFCC00
      new Color(255, 153, 0), // #FF9900
      new Color(255, 102, 0), // #FF6600
      new Color(255, 51, 0), // #FF3300
      new Color(255, 34, 0) // #FF2200
  };

  /** Label given to the per-sequence annotation rows created by this class. */
  public final static String TCOFFEE_SCORE = "TCoffeeScore";

  /**
   * generate annotation for this TCoffee score set on the given alignment
   *
   * @param al alignment to annotate
   * @param matchids if true, annotate sequences based on matching sequence
   *          names; otherwise score rows are paired with sequences by position
   * @return true if alignment annotation was modified, false otherwise.
   */
  public boolean annotateAlignment(AlignmentI al, boolean matchids) {
    boolean added = false;
    int i = 0;
    SequenceIdMatcher sidmatcher = new SequenceIdMatcher(al.getSequencesArray());
    byte[][] scoreMatrix = getScoresArray();
    // for 2.8 - we locate any existing TCoffee annotation and remove it first before adding this.
    for (Map.Entry<String, StringBuilder> id : scores.entrySet()) {
      byte[] srow = scoreMatrix[i];
      SequenceI s;
      if (matchids) {
        s = sidmatcher.findIdMatch(id.getKey());
      } else {
        s = al.getSequenceAt(i);
      }
      i++;
      // a row without a sequence is only acceptable for the trailing/'cons' row
      if (s == null && i != scores.size() && !id.getKey().equals("cons")) {
        System.err.println("No " + (matchids ? "match " : " sequences left ")
                + " for TCoffee score set : " + id.getKey());
        continue;
      }

      // columns beyond the shorter of alignment/score row are left null
      int jSize = Math.min(al.getWidth(), srow.length);
      Annotation[] annotations = new Annotation[al.getWidth()];
      for (int j = 0; j < jSize; j++) {
        byte val = srow[j];
        // NOTE(review): this loop body was truncated in the reviewed source and has
        // been reconstructed. Only the trailing colour argument survived intact; the
        // display-character/description arguments are assumed - confirm against the
        // upstream file before merging.
        String label = (s == null) ? "" + val : null;
        Color colour = (val >= 0 && val < colors.length) ? colors[val] : Color.white;
        annotations[j] = new Annotation(label, label, '\0', val * 1f, colour);
      }

      AlignmentAnnotation aa = null;
      if (s != null) {
        // TODO - set per sequence score
        aa = new AlignmentAnnotation(TCOFFEE_SCORE, "Score for " + id.getKey(), annotations);
        aa.setSequenceRef(s);
        aa.visible = false;
        aa.belowAlignment = false;
      } else {
        aa = new AlignmentAnnotation("T-COFFEE", "TCoffee column reliability score", annotations);
        aa.belowAlignment = true;
        aa.visible = true;
      }
      al.addAnnotation(aa);
      added = true;
    }
    return added;
  }
}