From: Paolo Di Tommaso Date: Mon, 9 Apr 2012 11:28:48 +0000 (+0200) Subject: JAL-1067 - T-Coffee score file parser X-Git-Tag: Archived_Release_2_7~4^2^2~3 X-Git-Url: http://source.jalview.org/gitweb/?p=jalview.git;a=commitdiff_plain;h=9d63632f20b8c8c2961601b0ab22ecea92059240 JAL-1067 - T-Coffee score file parser --- diff --git a/src/jalview/io/TCoffeeScoreFile.java b/src/jalview/io/TCoffeeScoreFile.java new file mode 100644 index 0000000..df00986 --- /dev/null +++ b/src/jalview/io/TCoffeeScoreFile.java @@ -0,0 +1,352 @@ +package jalview.io; + +import java.io.BufferedReader; +import java.io.File; +import java.io.FileNotFoundException; +import java.io.FileReader; +import java.io.IOException; +import java.io.Reader; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.LinkedHashMap; +import java.util.List; +import java.util.Map; + +/** + * A file parser for T-Coffee score ascii format. This file contains the alignment consensus + * for each residue in any sequence. + *

+ * This file is produced by t_coffee when the option + * -output=score_ascii is given on the program command line + * + * An example file is the following: + * + *

+ * T-COFFEE, Version_9.02.r1228 (2012-02-16 18:15:12 - Revision 1228 - Build 336)
+ * Cedric Notredame 
+ * CPU TIME:0 sec.
+ * SCORE=90
+ * *
+ *  BAD AVG GOOD
+ * *
+ * 1PHT   :  89
+ * 1BB9   :  90
+ * 1UHC   :  94
+ * 1YCS   :  94
+ * 1OOT   :  93
+ * 1ABO   :  94
+ * 1FYN   :  94
+ * 1QCF   :  94
+ * cons   :  90
+ * 
+ * 1PHT   999999999999999999999999998762112222543211112134
+ * 1BB9   99999999999999999999999999987-------4322----2234
+ * 1UHC   99999999999999999999999999987-------5321----2246
+ * 1YCS   99999999999999999999999999986-------4321----1-35
+ * 1OOT   999999999999999999999999999861-------3------1135
+ * 1ABO   99999999999999999999999999986-------422-------34
+ * 1FYN   99999999999999999999999999985-------32--------35
+ * 1QCF   99999999999999999999999999974-------2---------24
+ * cons   999999999999999999999999999851000110321100001134
+ * 
+ * 
+ * 1PHT   ----------5666642367889999999999889
+ * 1BB9   1111111111676653-355679999999999889
+ * 1UHC   ----------788774--66789999999999889
+ * 1YCS   ----------78777--356789999999999889
+ * 1OOT   ----------78877--356789999999997-67
+ * 1ABO   ----------687774--56779999999999889
+ * 1FYN   ----------6888842356789999999999889
+ * 1QCF   ----------6878742356789999999999889
+ * cons   00100000006877641356789999999999889
+ * 
+ * + * + * @author Paolo Di Tommaso + * + */ +public class TCoffeeScoreFile { + + /** The {@link Header} structure holder */ + Header header; + + /** + * Holds the consensus values for each sequence. It uses a LinkedHashMap to maintain the + * insertion order. + */ + LinkedHashMap scores = new LinkedHashMap(); + + + /** + * Get the string of score values for the specified sequence ID. + * @param id The sequence ID + * @return The scores as a string of values e.g. {@code 99999987-------432}. + * It returns an empty string when the specified ID is missing. + */ + public String getScoresFor( String id ) { + return scores.containsKey(id) ? scores.get(id).toString() : ""; + } + + /** + * @return The list of score strings as a {@link List} object, in the same order as the insertion i.e. in the MSA + */ + public List getScoresList() { + List result = new ArrayList( scores.size() ); + for( Map.Entry it : scores.entrySet() ) { + result.add(it.getValue().toString()); + } + + return result; + } + + /** + * @return The parsed score values as a matrix of bytes, one row per sequence + */ + public byte[][] getScoresArray() { + byte[][] result = new byte[ scores.size() ][]; + + int rowCount = 0; + for( Map.Entry it : scores.entrySet() ) { + String line = it.getValue().toString(); + byte[] seqValues = new byte[ line.length() ]; + for( int j=0, c=line.length(); j= 0 && val <= 9 ) ? val : -1; + } + + result[rowCount++] = seqValues; + } + + return result; + } + + /** + * Parse the specified file. + * + * @param file The file to be parsed + */ + public void parse(File file) { + try { + parse(new FileReader(file)); + } + catch (FileNotFoundException e) { + throw new RuntimeException(e); + } + } + + /** + * Parse the provided reader for the T-Coffee scores file format + * + * @param reader The reader providing the score file content + */ + public void parse(Reader reader) { + + try { + BufferedReader in = (BufferedReader) (reader instanceof BufferedReader ?
reader : new BufferedReader(reader)); + doParsing(in); + } + catch( Exception e) { + throw new RuntimeException(e); + } + } + + private void doParsing(BufferedReader in) throws IOException { + + /* + * read the header; NOTE(review): readHeader returns null for a malformed header and the loop below dereferences it without a check - confirm + */ + header = readHeader(in); + + /* + * initialize one empty score buffer per sequence ID declared in the header + */ + for( Map.Entry entry : header.scores.entrySet() ) { + scores.put( entry.getKey(), new StringBuilder()); + } + + /* + * read the score blocks until readBlock reports the end of the input + */ + Block block; + while( (block = readBlock(in, header.scores.size())) != null ) { + + /* + * append the scores read in the block to the buffer of the matching sequence ID + */ + for( Map.Entry entry : block.items.entrySet() ) { + StringBuilder scoreStringBuilder = scores.get(entry.getKey()); + if( scoreStringBuilder == null ) { + throw new RuntimeException(String.format("Invalid T-Coffee score file. Sequence ID '%s' is not declared in header section", entry.getKey())); + } + + scoreStringBuilder.append( entry.getValue() ); + } + + } + + } + + + static int parseInt( String str ) { + try { + return Integer.parseInt(str); + } + catch( NumberFormatException e ) { + // TODO report a warning ?
+ return 0; + } + } + + /** + * Read the header section in the T-Coffee score file format + * + * @param reader The scores reader + * @return The parsed {@link Header} instance, or {@code null} when the lines following the SCORE= line are not the expected {@code *} / {@code BAD AVG GOOD} / {@code *} markers + * @throws RuntimeException when an {@link IOException} occurs while reading the header + */ + static Header readHeader(BufferedReader reader) { + + Header result = null; + try { + result = new Header(); + result.head = reader.readLine(); + + String line; + + while( (line = reader.readLine()) != null ) { + if( line.startsWith("SCORE=")) { + result.score = parseInt( line.substring(6).trim() ); + break; + } + } + + if( (line=reader.readLine())==null || !"*".equals(line.trim())) return null; + if( (line=reader.readLine())==null || !"BAD AVG GOOD".equals(line.trim())) return null; + if( (line=reader.readLine())==null || !"*".equals(line.trim())) return null; + + /* + * a list of "sequence ID : score" lines is now expected, up to the first empty line + */ + while( (line=reader.readLine()) != null ) { + if( "".equals(line) ) { + break; + } + + int p = line.indexOf(":"); + if( p == -1 ) { + // TODO report a warning + continue; + } + + String id = line.substring(0,p).trim(); + int val = parseInt(line.substring(p+1).trim()); + if( "".equals(id) ) { + // TODO report warning + continue; + } + + result.scores.put(id,val); + } + + } + catch( IOException e ) { + throw new RuntimeException("Cannot parse T-Coffee score ascii file", e); + } + + return result; + } + + /** + * Read a scores block from the provided stream. + * + * @param reader The stream to parse + * @param size The expected number of sequences to be read + * @return The {@link Block} instance read or {@code null} if the end of file has been reached.
+ * @throws IOException Something went wrong on the 'wire' + */ + static Block readBlock( BufferedReader reader, int size ) throws IOException { + Block result = new Block(size); + String line; + + /* + * skip the leading blank lines, if any + */ + while( (line=reader.readLine()) != null && "".equals(line.trim())) { + // consume blank lines + } + + if( line == null ) return null; + + /* + * read the scores block + */ + do { + if( "".equals(line.trim()) ) { + // a blank line terminates the block + break; + } + + // split the line on the first blank + // the first part has to contain the sequence id + // the remaining part is the score values + int p = line.indexOf(" "); + if( p == -1 ) { + //TODO This is an unexpected condition, log a warning or throw an exception ? + continue; + } + + String id = line.substring(0,p).trim(); + String val = line.substring(p+1).trim(); + + result.items.put(id, val); + + } while( (line = reader.readLine()) != null ); + + + return result; + } + + /* + * The score file header + */ + static class Header { + String head; + int score; + + LinkedHashMap scores = new LinkedHashMap(); + + public int getScoreAvg() { return score; } + + public int getScoreFor( String ID ) { + + return scores.containsKey(ID) ?
scores.get(ID) : -1; + + } + } + + /* + * Holds the values of a single block in the score file, keyed by sequence ID + */ + static class Block { + int size; + Map items; + + public Block( int size ) { + this.size = size; + this.items = new HashMap(size); + } + + String getScoresFor( String id ) { + return items.get(id); + } + + String getConsensus() { + return items.get("cons"); + } + } + + + +} diff --git a/test/jalview/io/TCoffeeScoreFileTest.java b/test/jalview/io/TCoffeeScoreFileTest.java new file mode 100644 index 0000000..f654ba1 --- /dev/null +++ b/test/jalview/io/TCoffeeScoreFileTest.java @@ -0,0 +1,140 @@ +package jalview.io; + +import static org.junit.Assert.*; +import jalview.io.TCoffeeScoreFile.Block; +import jalview.io.TCoffeeScoreFile.Header; + +import java.io.BufferedReader; +import java.io.File; +import java.io.FileNotFoundException; +import java.io.FileReader; +import java.io.IOException; +import java.io.StringReader; +import java.util.List; + +import org.junit.Test; + +public class TCoffeeScoreFileTest { + + final static File SCORE_FILE = new File("./test/jalview/io/tcoffee.score_ascii"); + + @Test + public void testReadHeader() throws FileNotFoundException { + + Header header = TCoffeeScoreFile.readHeader( new BufferedReader(new FileReader(SCORE_FILE)) ); + assertNotNull(header); + assertEquals( "T-COFFEE, Version_9.02.r1228 (2012-02-16 18:15:12 - Revision 1228 - Build 336)", header.head ); + assertEquals( 90, header.score ); + assertEquals( 89, header.getScoreFor("1PHT") ); + assertEquals( 90, header.getScoreFor("1BB9") ); + assertEquals( 94, header.getScoreFor("1UHC") ); + assertEquals( 94, header.getScoreFor("1YCS") ); + assertEquals( 93, header.getScoreFor("1OOT") ); + assertEquals( 94, header.getScoreFor("1ABO") ); + assertEquals( 94, header.getScoreFor("1FYN") ); + assertEquals( 94, header.getScoreFor("1QCF") ); + assertEquals( 90, header.getScoreFor("cons") ); + } + + @Test + public void testReadBlock( ) throws IOException { + + String BLOCK = "\n" + + "\n" + +
"\n" + + "1PHT 999999999999999999999999998762112222543211112134\n" + + "1BB9 99999999999999999999999999987-------4322----2234 \n" + + "1UHC 99999999999999999999999999987-------5321----2246\n" + + "1YCS 99999999999999999999999999986-------4321----1-35\n" + + "1OOT 999999999999999999999999999861-------3------1135 \n" + + "1ABO 99999999999999999999999999986-------422-------34\n" + + "1FYN 99999999999999999999999999985-------32--------35\n" + + "1QCF 99999999999999999999999999974-------2---------24\n" + + "cons 999999999999999999999999999851000110321100001134\n" + + "\n" + + "\n"; + + Block block = TCoffeeScoreFile.readBlock(new BufferedReader(new StringReader(BLOCK)), 0); + assertNotNull(block); + assertEquals( "999999999999999999999999998762112222543211112134", block.getScoresFor("1PHT") ); + assertEquals( "99999999999999999999999999987-------4322----2234", block.getScoresFor("1BB9") ); + assertEquals( "99999999999999999999999999987-------5321----2246", block.getScoresFor("1UHC") ); + assertEquals( "99999999999999999999999999986-------4321----1-35", block.getScoresFor("1YCS") ); + assertEquals( "999999999999999999999999999861-------3------1135", block.getScoresFor("1OOT") ); + assertEquals( "99999999999999999999999999986-------422-------34", block.getScoresFor("1ABO") ); + assertEquals( "99999999999999999999999999985-------32--------35", block.getScoresFor("1FYN") ); + assertEquals( "99999999999999999999999999974-------2---------24", block.getScoresFor("1QCF") ); + assertEquals( "999999999999999999999999999851000110321100001134", block.getConsensus() ); + } + + @Test + public void testParse() throws FileNotFoundException { + + TCoffeeScoreFile parser = new TCoffeeScoreFile(); + parser.parse(new BufferedReader(new FileReader(SCORE_FILE)) ); + + assertEquals( "999999999999999999999999998762112222543211112134----------5666642367889999999999889", parser.getScoresFor("1PHT") ); + assertEquals( 
"99999999999999999999999999987-------4322----22341111111111676653-355679999999999889", parser.getScoresFor("1BB9") ); + assertEquals( "99999999999999999999999999987-------5321----2246----------788774--66789999999999889", parser.getScoresFor("1UHC") ); + assertEquals( "99999999999999999999999999986-------4321----1-35----------78777--356789999999999889", parser.getScoresFor("1YCS") ); + assertEquals( "999999999999999999999999999861-------3------1135----------78877--356789999999997-67", parser.getScoresFor("1OOT") ); + assertEquals( "99999999999999999999999999986-------422-------34----------687774--56779999999999889", parser.getScoresFor("1ABO") ); + assertEquals( "99999999999999999999999999985-------32--------35----------6888842356789999999999889", parser.getScoresFor("1FYN") ); + assertEquals( "99999999999999999999999999974-------2---------24----------6878742356789999999999889", parser.getScoresFor("1QCF") ); + assertEquals( "99999999999999999999999999985100011032110000113400100000006877641356789999999999889", parser.getScoresFor("cons") ); + } + + + @Test + public void testGetAsList() throws FileNotFoundException { + + TCoffeeScoreFile parser = new TCoffeeScoreFile(); + parser.parse(new BufferedReader(new FileReader(SCORE_FILE)) ); + + List scores = parser.getScoresList(); + assertEquals( "999999999999999999999999998762112222543211112134----------5666642367889999999999889", scores.get(0) ); + assertEquals( "99999999999999999999999999987-------4322----22341111111111676653-355679999999999889", scores.get(1) ); + assertEquals( "99999999999999999999999999987-------5321----2246----------788774--66789999999999889", scores.get(2) ); + assertEquals( "99999999999999999999999999986-------4321----1-35----------78777--356789999999999889", scores.get(3) ); + assertEquals( "999999999999999999999999999861-------3------1135----------78877--356789999999997-67", scores.get(4) ); + assertEquals( "99999999999999999999999999986-------422-------34----------687774--56779999999999889", 
scores.get(5) ); + assertEquals( "99999999999999999999999999985-------32--------35----------6888842356789999999999889", scores.get(6) ); + assertEquals( "99999999999999999999999999974-------2---------24----------6878742356789999999999889", scores.get(7) ); + assertEquals( "99999999999999999999999999985100011032110000113400100000006877641356789999999999889", scores.get(8) ); + + } + + + @Test + public void testGetAsArray() throws FileNotFoundException { + + TCoffeeScoreFile parser = new TCoffeeScoreFile(); + parser.parse(new BufferedReader(new FileReader(SCORE_FILE)) ); + + byte[][] scores = parser.getScoresArray(); + + assertEquals( 9, scores[0][0] ); + assertEquals( 9, scores[1][0] ); + assertEquals( 9, scores[2][0] ); + assertEquals( 9, scores[3][0] ); + assertEquals( 9, scores[4][0] ); + assertEquals( 9, scores[5][0] ); + assertEquals( 9, scores[6][0] ); + assertEquals( 9, scores[7][0] ); + assertEquals( 9, scores[8][0] ); + + assertEquals( 5, scores[0][36] ); + assertEquals( 4, scores[1][36] ); + assertEquals( 5, scores[2][36] ); + assertEquals( 4, scores[3][36] ); + assertEquals( -1, scores[4][36] ); + assertEquals( 4, scores[5][36] ); + assertEquals( 3, scores[6][36] ); + assertEquals( 2, scores[7][36] ); + assertEquals( 3, scores[8][36] ); + + } + + + +} diff --git a/test/jalview/io/tcoffee.fasta_aln b/test/jalview/io/tcoffee.fasta_aln new file mode 100644 index 0000000..63b12b1 --- /dev/null +++ b/test/jalview/io/tcoffee.fasta_aln @@ -0,0 +1,24 @@ +>1PHT +YQYRALYDYKKEREEDIDLHLGDILTVNKGSLVALGFSDGQEARPEEI-- +--------GWLNGYNETTGERGDFPGTYVEYIG +>1BB9 +FKVQAQHDYTATDTDELQLKAGDVVLVIP-------FQNP----EEQDEG +WLMGVKESDWNQHK-ELEKCRGVFPENFTERVQ +>1UHC +QVYFAVYTFKARNPNELSVSANQKLKILE-------FKDV----TGNT-- +--------EWWLAE--VNGKKGYVPSNYIRKTE +>1YCS +GVIYALWDYEPQNDDELPMKEGDCMTIIH-------REDE----D-EI-- +--------EWWWA--RLNDKEGYVPRNLLGLYP +>1OOT +PKAVALYSFAGEESGDLPFRKGDVITILKK-------S------DSQN-- +--------DWWTG--RVNGREGIFPANYVE-LV +>1ABO 
+NLFVALYDFVASGDNTLSITKGEKLRVLG-------YNH-------NG-- +--------EWCEAQ--TKNGQGWVPSNYITPVN +>1FYN +TLFVALYDYEARTEDDLSFHKGEKFQILN-------SS--------EG-- +--------DWWEARSLTTGETGYIPSNYVAPVD +>1QCF +IIVVALYDYEAIHHEDLSFQKGDQMVVLE-------E---------SG-- +--------EWWKARSLATRKEGYIPSNYVARVD \ No newline at end of file diff --git a/test/jalview/io/tcoffee.score_ascii b/test/jalview/io/tcoffee.score_ascii new file mode 100644 index 0000000..2b533a3 --- /dev/null +++ b/test/jalview/io/tcoffee.score_ascii @@ -0,0 +1,37 @@ +T-COFFEE, Version_9.02.r1228 (2012-02-16 18:15:12 - Revision 1228 - Build 336) +Cedric Notredame +CPU TIME:0 sec. +SCORE=90 +* + BAD AVG GOOD +* +1PHT : 89 +1BB9 : 90 +1UHC : 94 +1YCS : 94 +1OOT : 93 +1ABO : 94 +1FYN : 94 +1QCF : 94 +cons : 90 + +1PHT 999999999999999999999999998762112222543211112134 +1BB9 99999999999999999999999999987-------4322----2234 +1UHC 99999999999999999999999999987-------5321----2246 +1YCS 99999999999999999999999999986-------4321----1-35 +1OOT 999999999999999999999999999861-------3------1135 +1ABO 99999999999999999999999999986-------422-------34 +1FYN 99999999999999999999999999985-------32--------35 +1QCF 99999999999999999999999999974-------2---------24 +cons 999999999999999999999999999851000110321100001134 + + +1PHT ----------5666642367889999999999889 +1BB9 1111111111676653-355679999999999889 +1UHC ----------788774--66789999999999889 +1YCS ----------78777--356789999999999889 +1OOT ----------78877--356789999999997-67 +1ABO ----------687774--56779999999999889 +1FYN ----------6888842356789999999999889 +1QCF ----------6878742356789999999999889 +cons 00100000006877641356789999999999889