JAL-1067 - T-Coffee score file parser
author Paolo Di Tommaso <paolo.ditommaso@gmail.com>
Mon, 9 Apr 2012 11:28:48 +0000 (13:28 +0200)
committer Paolo Di Tommaso <paolo.ditommaso@gmail.com>
Mon, 9 Apr 2012 11:28:48 +0000 (13:28 +0200)
src/jalview/io/TCoffeeScoreFile.java [new file with mode: 0644]
test/jalview/io/TCoffeeScoreFileTest.java [new file with mode: 0644]
test/jalview/io/tcoffee.fasta_aln [new file with mode: 0644]
test/jalview/io/tcoffee.score_ascii [new file with mode: 0644]

diff --git a/src/jalview/io/TCoffeeScoreFile.java b/src/jalview/io/TCoffeeScoreFile.java
new file mode 100644 (file)
index 0000000..df00986
--- /dev/null
@@ -0,0 +1,352 @@
+package jalview.io;
+
+import java.io.BufferedReader;
+import java.io.File;
+import java.io.FileNotFoundException;
+import java.io.FileReader;
+import java.io.IOException;
+import java.io.Reader;
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.LinkedHashMap;
+import java.util.List;
+import java.util.Map;
+
+/**
+ * A file parser for the T-Coffee score ascii format. This file contains the alignment consensus 
+ * for each residue in any sequence.
+ * <p>
+ * This file is produced by <code>t_coffee</code> providing the option 
+ * <code>-output=score_ascii</code> on the program command line.
+ * 
+ * An example file is the following 
+ * 
+ * <pre>
+ * T-COFFEE, Version_9.02.r1228 (2012-02-16 18:15:12 - Revision 1228 - Build 336)
+ * Cedric Notredame 
+ * CPU TIME:0 sec.
+ * SCORE=90
+ * *
+ *  BAD AVG GOOD
+ * *
+ * 1PHT   :  89
+ * 1BB9   :  90
+ * 1UHC   :  94
+ * 1YCS   :  94
+ * 1OOT   :  93
+ * 1ABO   :  94
+ * 1FYN   :  94
+ * 1QCF   :  94
+ * cons   :  90
+ * 
+ * 1PHT   999999999999999999999999998762112222543211112134
+ * 1BB9   99999999999999999999999999987-------4322----2234
+ * 1UHC   99999999999999999999999999987-------5321----2246
+ * 1YCS   99999999999999999999999999986-------4321----1-35
+ * 1OOT   999999999999999999999999999861-------3------1135
+ * 1ABO   99999999999999999999999999986-------422-------34
+ * 1FYN   99999999999999999999999999985-------32--------35
+ * 1QCF   99999999999999999999999999974-------2---------24
+ * cons   999999999999999999999999999851000110321100001134
+ * 
+ * 
+ * 1PHT   ----------5666642367889999999999889
+ * 1BB9   1111111111676653-355679999999999889
+ * 1UHC   ----------788774--66789999999999889
+ * 1YCS   ----------78777--356789999999999889
+ * 1OOT   ----------78877--356789999999997-67
+ * 1ABO   ----------687774--56779999999999889
+ * 1FYN   ----------6888842356789999999999889
+ * 1QCF   ----------6878742356789999999999889
+ * cons   00100000006877641356789999999999889
+ * </pre>
+ * 
+ * 
+ * @author Paolo Di Tommaso
+ *
+ */
+public class TCoffeeScoreFile {
+       
+       /** The {@link Header} structure holder */
+       Header header;
+       
+       /** 
+        * Holds the consensus values for each sequence. It uses a LinkedHashMap to maintain the 
+        * insertion order. 
+        */
+       LinkedHashMap<String,StringBuilder> scores = new LinkedHashMap<String,StringBuilder>();
+       
+
+       /**
+        * Look up the concatenated score string of a single sequence. 
+        * 
+        * @param id The sequence ID 
+        * @return The scores as a string of values e.g. {@code 99999987-------432}, 
+        *      or an empty string when no sequence with the given ID is known. 
+        */
+       public String getScoresFor( String id ) {
+               if( !scores.containsKey(id) ) {
+                       return "";
+               }
+
+               StringBuilder values = scores.get(id);
+               return values.toString();
+       }
+       
+       /**
+        * @return The score strings of all the sequences as a {@link List} object, in insertion order 
+        *      i.e. the order in which the sequence IDs were added to the scores map
+        */
+       public List<String> getScoresList() {
+               List<String> rows = new ArrayList<String>( scores.size() );
+               for( StringBuilder values : scores.values() ) {
+                       rows.add( values.toString() );
+               }
+               
+               return rows;
+       }
+       
+       /**
+        * Convert the parsed score strings into numeric form.
+        * <p>
+        * Each row holds the values of one sequence, in the iteration order of the scores map. 
+        * The digits {@code '0'} to {@code '9'} are mapped to the values 0 to 9; any other 
+        * character (e.g. the gap symbol {@code '-'}) is mapped to {@code -1}.
+        * 
+        * @return The parsed score values as a matrix of bytes
+        */
+       public byte[][] getScoresArray() { 
+               byte[][] result = new byte[ scores.size() ][];
+               
+               int rowCount = 0;
+               for( StringBuilder row : scores.values() ) {
+                       byte[] seqValues = new byte[ row.length() ];
+                       for( int j=0, c=row.length(); j<c; j++ ) {
+                               
+                               // compare as an int: narrowing the difference to a byte first would wrap 
+                               // characters such as U+0130 (0x130 - 0x30 == 0x100) into the 0-9 range
+                               int val = row.charAt(j) - '0';
+
+                               seqValues[j] = ( val >= 0 && val <= 9 ) ? (byte) val : -1; 
+                       }
+
+                       result[rowCount++] = seqValues;
+               }
+               
+               return result;
+       }
+       
+       /**
+        * Parse the specified file. The file is closed before this method returns, 
+        * whether or not the parsing succeeds.
+        * 
+        * @param file The file to be parsed 
+        * @throws RuntimeException when the file cannot be opened or its content cannot be parsed
+        */
+       public void parse(File file) {
+               FileReader reader;
+               try {
+                       reader = new FileReader(file);
+               } 
+               catch (FileNotFoundException e) {
+                       throw new RuntimeException(e);
+               }
+
+               try {
+                       parse(reader);
+               }
+               finally {
+                       try {
+                               reader.close();
+                       }
+                       catch (IOException ignored) {
+                               // the file was only read: a failure on close cannot lose data 
+                               // and must not hide an exception raised by the parsing
+                       }
+               }
+       }
+       
+       /**
+        * Parse T-Coffee score data from a character stream. 
+        * <p>
+        * The reader is wrapped into a {@link BufferedReader} unless it already is one. 
+        * 
+        * @param reader The stream providing the content in the T-Coffee scores file format
+        * @throws RuntimeException wrapping any exception raised while parsing
+        */
+       public void parse(Reader reader) {
+
+               try {
+                       BufferedReader buffered;
+                       if( reader instanceof BufferedReader ) {
+                               buffered = (BufferedReader) reader;
+                       }
+                       else {
+                               buffered = new BufferedReader(reader);
+                       }
+
+                       doParsing(buffered);
+               }
+               catch( Exception e) {
+                       throw new RuntimeException(e);
+               }
+       }
+       
+       /**
+        * Read the header and then all the score blocks, appending the values of each block 
+        * to the score string of the matching sequence.
+        * 
+        * @param in The stream to read from
+        * @throws IOException when a score block cannot be read 
+        * @throws RuntimeException when the header is not in the expected format or a block 
+        *      contains a sequence ID that is not declared in the header section
+        */
+       private void doParsing(BufferedReader in) throws IOException {
+
+               /*
+                * read the header
+                */
+               header = readHeader(in);
+               if( header == null ) {
+                       // a null header means that the header section is malformed: fail with an 
+                       // explicit message instead of a NullPointerException a few lines below
+                       throw new RuntimeException("Invalid T-Coffee score file. The header section is not in the expected format");
+               }
+               
+               /*
+                * initialise the structure: one empty score string for each declared sequence
+                */
+               for( String id : header.scores.keySet() ) {
+                       scores.put( id, new StringBuilder());
+               }
+               
+               /*
+                * go with the reading
+                */
+               Block block;
+               while( (block = readBlock(in, header.scores.size())) != null  ) {
+                       
+                       /*
+                        * append sequences read in the block
+                        */
+                       for( Map.Entry<String,String> entry : block.items.entrySet() ) {
+                               StringBuilder scoreStringBuilder = scores.get(entry.getKey());
+                               if( scoreStringBuilder == null ) {
+                                       throw new RuntimeException(String.format("Invalid T-Coffee score file. Sequence ID '%s' is not declared in header section", entry.getKey()));
+                               }
+                               
+                               scoreStringBuilder.append( entry.getValue() );
+                       }
+                       
+               }
+               
+       }
+
+
+       /*
+        * Lenient integer parsing: a string that is not a valid integer yields 0
+        */
+       static int parseInt( String str ) {
+               int value = 0;
+               try {
+                       value = Integer.parseInt(str);
+               }
+               catch( NumberFormatException e ) {
+                       // TODO report a warning ?
+               }
+
+               return value;
+       }
+       
+       /**
+        * Read the header section in the T-Coffee score file format. 
+        * <p>
+        * The first line is stored as the header title, then every line is skipped up to the 
+        * {@code SCORE=} entry. This must be followed by the three lines {@code *}, 
+        * {@code BAD AVG GOOD} and {@code *}, and then by a list of {@code ID : score} entries 
+        * terminated by a blank line (or by the end of the stream).
+        * 
+        * @param reader The scores reader 
+        * @return The parsed {@link Header} instance, or {@code null} when the lines following 
+        *      the {@code SCORE=} entry are not the expected ones
+        * @throws RuntimeException when the stream cannot be read
+        */
+       static Header readHeader(BufferedReader reader) {
+               
+               Header result = null;
+               try {
+                       result = new Header();
+                       result.head = reader.readLine();
+                       
+                       String line;
+
+                       // skip everything up to the overall score entry
+                       while( (line = reader.readLine()) != null ) {
+                               if( line.startsWith("SCORE=")) {
+                                       result.score = parseInt( line.substring(6).trim() );
+                                       break;
+                               }
+                       }
+
+                       // the score entry has to be followed by exactly these three lines
+                       if( (line=reader.readLine())==null || !"*".equals(line.trim())) return null;
+                       if( (line=reader.readLine())==null || !"BAD AVG GOOD".equals(line.trim())) return null;
+                       if( (line=reader.readLine())==null || !"*".equals(line.trim())) return null;
+                       
+                       /*
+                        * now a list of sequence IDs is expected, up to the first blank line
+                        */
+                       while( (line=reader.readLine()) != null ) {
+                               // trim before testing: a line made only of blanks terminates the list too, 
+                               // otherwise the score blocks that follow would be consumed by this loop
+                               if( "".equals(line.trim()) ) {
+                                       break;
+                               }
+                               
+                               int p = line.indexOf(":");
+                               if( p == -1 ) {
+                                       // TODO report a warning
+                                       continue;
+                               }
+                               
+                               String id = line.substring(0,p).trim();
+                               int val = parseInt(line.substring(p+1).trim());
+                               if( "".equals(id) ) {
+                                       // TODO report warning
+                                       continue;
+                               }
+                               
+                               result.scores.put(id,val);
+                       }
+                       
+               }
+               catch( IOException e ) {
+                       throw new RuntimeException("Cannot parse T-Coffee score ascii file", e);
+               }
+               
+               return result;
+       } 
+       
+       /**
+        * Read a scores block from the provided stream. 
+        * <p>
+        * Leading blank lines are skipped; the block ends at the first blank line that follows 
+        * it or at the end of the stream. Each line is split on its first whitespace character 
+        * into the sequence ID and the score values. 
+        * 
+        * @param reader The stream to parse
+        * @param size The expected number of the sequences to be read 
+        * @return The {@link Block} instance read or {@code null} if the end of file has been reached.
+        * @throws IOException Something went wrong on the 'wire' 
+        */
+       static Block readBlock( BufferedReader reader, int size ) throws IOException {
+               Block result = new Block(size);
+               String line;
+               
+               /*
+                * skip the blank lines preceding the block (if any)
+                */
+               while( (line=reader.readLine()) != null && "".equals(line.trim())) {
+                       // consume blank lines 
+               }
+               
+               if( line == null ) return null;
+               
+               /*
+                * read the scores block
+                */
+               do {
+                       // drop the surrounding blanks so that an indented line does not yield an empty ID
+                       String entry = line.trim();
+                       if( "".equals(entry) ) {
+                               // terminated
+                               break;
+                       }
+                       
+                       // split the line on the first whitespace character: 
+                       // the first part has to contain the sequence ID, 
+                       // the remaining part holds the score values
+                       int p = 0;
+                       while( p < entry.length() && !Character.isWhitespace(entry.charAt(p)) ) {
+                               p++;
+                       }
+                       if( p == entry.length() ) {
+                               //TODO This is an unexpected condition, log a warning or throw an exception ? 
+                               continue;
+                       } 
+                       
+                       String id = entry.substring(0,p);
+                       String val = entry.substring(p+1).trim();
+                       
+                       result.items.put(id, val);
+                       
+               } while( (line = reader.readLine()) != null ); 
+               
+
+               return result;
+       }
+
+       /*
+        * The score file header: the title line, the overall score 
+        * and the score declared for each sequence ID
+        */
+       static class Header {
+               String head;
+               int score;
+
+               LinkedHashMap<String,Integer> scores = new LinkedHashMap<String,Integer>();
+               
+               public int getScoreAvg() { 
+                       return score; 
+               }
+               
+               public int getScoreFor( String ID ) { 
+                       // -1 marks a sequence ID that is not declared in the header
+                       if( !scores.containsKey(ID) ) {
+                               return -1;
+                       }
+
+                       return scores.get(ID);
+               }
+       }
+       
+       /*
+        * Holds the values of a single block of the score file, indexed by sequence ID
+        */
+       static class Block {
+               int size;
+               Map<String,String> items;
+               
+               public Block( int size ) {
+                       this.size = size;
+                       // the expected number of sequences is used as the initial capacity
+                       items = new HashMap<String,String>(size);
+               } 
+       
+               String getScoresFor( String id ) {
+                       String values = items.get(id);
+                       return values;
+               }
+               
+               String getConsensus() {
+                       // the consensus row is stored under the "cons" ID
+                       return getScoresFor("cons");
+               }
+       }
+       
+       
+
+}
diff --git a/test/jalview/io/TCoffeeScoreFileTest.java b/test/jalview/io/TCoffeeScoreFileTest.java
new file mode 100644 (file)
index 0000000..f654ba1
--- /dev/null
@@ -0,0 +1,140 @@
+package jalview.io;
+
+import static org.junit.Assert.*;
+import jalview.io.TCoffeeScoreFile.Block;
+import jalview.io.TCoffeeScoreFile.Header;
+
+import java.io.BufferedReader;
+import java.io.File;
+import java.io.FileNotFoundException;
+import java.io.FileReader;
+import java.io.IOException;
+import java.io.StringReader;
+import java.util.List;
+
+import org.junit.Test;
+
+/**
+ * Unit tests for {@link TCoffeeScoreFile}. Except for {@link #testReadBlock()}, the tests 
+ * read the sample file referenced by {@link #SCORE_FILE}.
+ */
+public class TCoffeeScoreFileTest {
+
+       /** The sample score file, resolved relative to the working directory */
+       final static File SCORE_FILE = new File("./test/jalview/io/tcoffee.score_ascii");
+       
+       /**
+        * The header of the sample file must provide the title line, the overall score 
+        * and the score of each sequence.
+        */
+       @Test
+       public void testReadHeader() throws FileNotFoundException {
+
+               Header header = TCoffeeScoreFile.readHeader( new BufferedReader(new FileReader(SCORE_FILE)) );
+               assertNotNull(header);
+               assertEquals( "T-COFFEE, Version_9.02.r1228 (2012-02-16 18:15:12 - Revision 1228 - Build 336)", header.head );
+               assertEquals( 90, header.score );
+               assertEquals( 89, header.getScoreFor("1PHT") );
+               assertEquals( 90, header.getScoreFor("1BB9") );
+               assertEquals( 94, header.getScoreFor("1UHC") );
+               assertEquals( 94, header.getScoreFor("1YCS") );
+               assertEquals( 93, header.getScoreFor("1OOT") );
+               assertEquals( 94, header.getScoreFor("1ABO") );
+               assertEquals( 94, header.getScoreFor("1FYN") );
+               assertEquals( 94, header.getScoreFor("1QCF") );
+               assertEquals( 90, header.getScoreFor("cons") );
+       }
+       
+       /**
+        * A single block must be read skipping the leading blank lines and 
+        * ignoring the blanks at the end of a line.
+        */
+       @Test
+       public void testReadBlock( ) throws IOException {
+               
+               String BLOCK = "\n" +
+                               "\n" +
+                               "\n" +
+                               "1PHT   999999999999999999999999998762112222543211112134\n" +
+                               "1BB9   99999999999999999999999999987-------4322----2234  \n" +
+                               "1UHC   99999999999999999999999999987-------5321----2246\n" +
+                               "1YCS   99999999999999999999999999986-------4321----1-35\n" +
+                               "1OOT   999999999999999999999999999861-------3------1135  \n" +
+                               "1ABO   99999999999999999999999999986-------422-------34\n" +
+                               "1FYN   99999999999999999999999999985-------32--------35\n" +
+                               "1QCF   99999999999999999999999999974-------2---------24\n" +
+                               "cons   999999999999999999999999999851000110321100001134\n" +
+                               "\n" +
+                               "\n";
+               
+               Block block = TCoffeeScoreFile.readBlock(new BufferedReader(new StringReader(BLOCK)), 0);
+               assertNotNull(block);
+               assertEquals( "999999999999999999999999998762112222543211112134", block.getScoresFor("1PHT") );
+               assertEquals( "99999999999999999999999999987-------4322----2234", block.getScoresFor("1BB9") );
+               assertEquals( "99999999999999999999999999987-------5321----2246", block.getScoresFor("1UHC") );
+               assertEquals( "99999999999999999999999999986-------4321----1-35", block.getScoresFor("1YCS") );
+               assertEquals( "999999999999999999999999999861-------3------1135", block.getScoresFor("1OOT") );
+               assertEquals( "99999999999999999999999999986-------422-------34", block.getScoresFor("1ABO") );
+               assertEquals( "99999999999999999999999999985-------32--------35", block.getScoresFor("1FYN") );
+               assertEquals( "99999999999999999999999999974-------2---------24", block.getScoresFor("1QCF") );
+               assertEquals( "999999999999999999999999999851000110321100001134", block.getConsensus() );
+       }
+
+       /**
+        * Parsing the whole sample file must concatenate, for each sequence, 
+        * the values of the two blocks.
+        */
+       @Test
+       public void testParse() throws FileNotFoundException {
+
+               TCoffeeScoreFile parser = new TCoffeeScoreFile();
+               parser.parse(new BufferedReader(new FileReader(SCORE_FILE)) );
+
+               assertEquals( "999999999999999999999999998762112222543211112134----------5666642367889999999999889", parser.getScoresFor("1PHT") );
+               assertEquals( "99999999999999999999999999987-------4322----22341111111111676653-355679999999999889", parser.getScoresFor("1BB9") );
+               assertEquals( "99999999999999999999999999987-------5321----2246----------788774--66789999999999889", parser.getScoresFor("1UHC") );
+               assertEquals( "99999999999999999999999999986-------4321----1-35----------78777--356789999999999889", parser.getScoresFor("1YCS") );
+               assertEquals( "999999999999999999999999999861-------3------1135----------78877--356789999999997-67", parser.getScoresFor("1OOT") );
+               assertEquals( "99999999999999999999999999986-------422-------34----------687774--56779999999999889", parser.getScoresFor("1ABO") );
+               assertEquals( "99999999999999999999999999985-------32--------35----------6888842356789999999999889", parser.getScoresFor("1FYN") );
+               assertEquals( "99999999999999999999999999974-------2---------24----------6878742356789999999999889", parser.getScoresFor("1QCF") );
+               assertEquals( "99999999999999999999999999985100011032110000113400100000006877641356789999999999889", parser.getScoresFor("cons") );             
+       }
+
+       
+       /**
+        * The list of scores must follow the order of the sequences in the sample file.
+        */
+       @Test
+       public void testGetAsList() throws FileNotFoundException {
+               
+               TCoffeeScoreFile parser = new TCoffeeScoreFile();
+               parser.parse(new BufferedReader(new FileReader(SCORE_FILE)) );
+
+               List<String> scores = parser.getScoresList();
+               assertEquals( "999999999999999999999999998762112222543211112134----------5666642367889999999999889", scores.get(0) );
+               assertEquals( "99999999999999999999999999987-------4322----22341111111111676653-355679999999999889", scores.get(1) );
+               assertEquals( "99999999999999999999999999987-------5321----2246----------788774--66789999999999889", scores.get(2) );
+               assertEquals( "99999999999999999999999999986-------4321----1-35----------78777--356789999999999889", scores.get(3) );
+               assertEquals( "999999999999999999999999999861-------3------1135----------78877--356789999999997-67", scores.get(4) );
+               assertEquals( "99999999999999999999999999986-------422-------34----------687774--56779999999999889", scores.get(5) );
+               assertEquals( "99999999999999999999999999985-------32--------35----------6888842356789999999999889", scores.get(6) );
+               assertEquals( "99999999999999999999999999974-------2---------24----------6878742356789999999999889", scores.get(7) );
+               assertEquals( "99999999999999999999999999985100011032110000113400100000006877641356789999999999889", scores.get(8) );           
+               
+       } 
+       
+       
+       /**
+        * The matrix of scores must hold the digit values, with -1 in place of a gap 
+        * (see row 4, column 36).
+        */
+       @Test
+       public void testGetAsArray() throws FileNotFoundException {
+               
+               TCoffeeScoreFile parser = new TCoffeeScoreFile();
+               parser.parse(new BufferedReader(new FileReader(SCORE_FILE)) );
+
+               byte[][] scores = parser.getScoresArray();
+       
+               assertEquals( 9, scores[0][0] );
+               assertEquals( 9, scores[1][0] );
+               assertEquals( 9, scores[2][0] );
+               assertEquals( 9, scores[3][0] );
+               assertEquals( 9, scores[4][0] );
+               assertEquals( 9, scores[5][0] );
+               assertEquals( 9, scores[6][0] );
+               assertEquals( 9, scores[7][0] );
+               assertEquals( 9, scores[8][0] );
+               
+               assertEquals( 5, scores[0][36] );
+               assertEquals( 4, scores[1][36] );
+               assertEquals( 5, scores[2][36] );
+               assertEquals( 4, scores[3][36] );
+               assertEquals( -1, scores[4][36] );
+               assertEquals( 4, scores[5][36] );
+               assertEquals( 3, scores[6][36] );
+               assertEquals( 2, scores[7][36] );
+               assertEquals( 3, scores[8][36] );
+               
+       } 
+       
+       
+       
+}
diff --git a/test/jalview/io/tcoffee.fasta_aln b/test/jalview/io/tcoffee.fasta_aln
new file mode 100644 (file)
index 0000000..63b12b1
--- /dev/null
@@ -0,0 +1,24 @@
+>1PHT
+YQYRALYDYKKEREEDIDLHLGDILTVNKGSLVALGFSDGQEARPEEI--
+--------GWLNGYNETTGERGDFPGTYVEYIG
+>1BB9
+FKVQAQHDYTATDTDELQLKAGDVVLVIP-------FQNP----EEQDEG
+WLMGVKESDWNQHK-ELEKCRGVFPENFTERVQ
+>1UHC
+QVYFAVYTFKARNPNELSVSANQKLKILE-------FKDV----TGNT--
+--------EWWLAE--VNGKKGYVPSNYIRKTE
+>1YCS
+GVIYALWDYEPQNDDELPMKEGDCMTIIH-------REDE----D-EI--
+--------EWWWA--RLNDKEGYVPRNLLGLYP
+>1OOT
+PKAVALYSFAGEESGDLPFRKGDVITILKK-------S------DSQN--
+--------DWWTG--RVNGREGIFPANYVE-LV
+>1ABO
+NLFVALYDFVASGDNTLSITKGEKLRVLG-------YNH-------NG--
+--------EWCEAQ--TKNGQGWVPSNYITPVN
+>1FYN
+TLFVALYDYEARTEDDLSFHKGEKFQILN-------SS--------EG--
+--------DWWEARSLTTGETGYIPSNYVAPVD
+>1QCF
+IIVVALYDYEAIHHEDLSFQKGDQMVVLE-------E---------SG--
+--------EWWKARSLATRKEGYIPSNYVARVD
\ No newline at end of file
diff --git a/test/jalview/io/tcoffee.score_ascii b/test/jalview/io/tcoffee.score_ascii
new file mode 100644 (file)
index 0000000..2b533a3
--- /dev/null
@@ -0,0 +1,37 @@
+T-COFFEE, Version_9.02.r1228 (2012-02-16 18:15:12 - Revision 1228 - Build 336)
+Cedric Notredame 
+CPU TIME:0 sec.
+SCORE=90
+*
+ BAD AVG GOOD
+*
+1PHT   :  89
+1BB9   :  90
+1UHC   :  94
+1YCS   :  94
+1OOT   :  93
+1ABO   :  94
+1FYN   :  94
+1QCF   :  94
+cons   :  90
+
+1PHT   999999999999999999999999998762112222543211112134
+1BB9   99999999999999999999999999987-------4322----2234
+1UHC   99999999999999999999999999987-------5321----2246
+1YCS   99999999999999999999999999986-------4321----1-35
+1OOT   999999999999999999999999999861-------3------1135
+1ABO   99999999999999999999999999986-------422-------34
+1FYN   99999999999999999999999999985-------32--------35
+1QCF   99999999999999999999999999974-------2---------24
+cons   999999999999999999999999999851000110321100001134
+
+
+1PHT   ----------5666642367889999999999889
+1BB9   1111111111676653-355679999999999889
+1UHC   ----------788774--66789999999999889
+1YCS   ----------78777--356789999999999889
+1OOT   ----------78877--356789999999997-67
+1ABO   ----------687774--56779999999999889
+1FYN   ----------6888842356789999999999889
+1QCF   ----------6878742356789999999999889
+cons   00100000006877641356789999999999889