X-Git-Url: http://source.jalview.org/gitweb/?a=blobdiff_plain;ds=sidebyside;f=src%2Fjalview%2Fio%2FTCoffeeScoreFile.java;h=fc0c9131294edcd2bc5d93492724b05984cb3d55;hb=3d0101179759ef157b088ea135423cd909512d9f;hp=6cab3aeeea735cbb1f77d68202d8b25894005f01;hpb=a79f9e113c51c032070c670e45ce3eb464691166;p=jalview.git
diff --git a/src/jalview/io/TCoffeeScoreFile.java b/src/jalview/io/TCoffeeScoreFile.java
index 6cab3ae..fc0c913 100644
--- a/src/jalview/io/TCoffeeScoreFile.java
+++ b/src/jalview/io/TCoffeeScoreFile.java
@@ -1,3 +1,23 @@
+/*
+ * Jalview - A Sequence Alignment Editor and Viewer ($$Version-Rel$$)
+ * Copyright (C) $$Year-Rel$$ The Jalview Authors
+ *
+ * This file is part of Jalview.
+ *
+ * Jalview is free software: you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation, either version 3
+ * of the License, or (at your option) any later version.
+ *
+ * Jalview is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty
+ * of MERCHANTABILITY or FITNESS FOR A PARTICULAR
+ * PURPOSE. See the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Jalview. If not, see
- * This file is procuded by t_coffee
providing the option
+ * This file is produced by t_coffee
providing the option
* -output=score_ascii
to the program command line
*
- * An example file is the following
+ * An example file is the following
*
*
* T-COFFEE, Version_9.02.r1228 (2012-02-16 18:15:12 - Revision 1228 - Build 336) @@ -70,405 +87,569 @@ import java.util.Map; * * * @author Paolo Di Tommaso - * + * */ -public class TCoffeeScoreFile { - - /** The {@link Header} structure holder */ - Header header; - - /** - * Holds the consensues values for each sequences. It uses a LinkedHashMap to maintaint the - * insertion order. - */ - LinkedHashMapscores = new LinkedHashMap (); - - Integer fWidth; - - /** - * Parse the specified file. - * - * @param file The file to be parsed - */ - public static TCoffeeScoreFile load(File file) { - try { - return load(new FileReader(file)); - } - catch (FileNotFoundException e) { - throw new RuntimeException(e); - } - } - - /** - * Parse the provided reader for the T-Coffee scores file format - * - * @param reader - */ - public static TCoffeeScoreFile load(Reader reader) { - - try { - BufferedReader in = (BufferedReader) (reader instanceof BufferedReader ? reader : new BufferedReader(reader)); - TCoffeeScoreFile result = new TCoffeeScoreFile(); - result.doParsing(in); - return result.header != null && result.scores != null ? result : null; - } - catch( Exception e) { - throw new RuntimeException(e); - } - } - - /** - * @return The 'height' of the score matrix i.e. the numbers of score rows that should matches - * the number of sequences in the alignment - */ - public int getHeight() { - // the last entry will always be the 'global' alingment consensus scores, so it is removed - // from the 'height' count to make this value compatible with the number of sequences in the MSA - return scores != null && scores.size() > 0 ? scores.size()-1 : 0; - } - - /** - * @return The 'width' of the score matrix i.e. the number of columns. - * Since teh score value are supposd to be calculated for an 'aligned' MSA, all the entries - * have to have the same width. - */ - public int getWidth() { - return fWidth != null ? fWidth : 0; - } - - /** - * The default constructor is marked as {@code protected} since this class is meant to created - * through the {@link #load(File)} or {@link #load(Reader)} factory methods - */ - protected TCoffeeScoreFile() { } - - /** - * Get the string of score values for the specified seqeunce ID. - * @param id The sequence ID - * @return The scores as a string of values e.g. {@code 99999987-------432}. - * It return an empty string when the specified ID is missing. - */ - public String getScoresFor( String id ) { - return scores.containsKey(id) ? scores.get(id).toString() : ""; - } - - /** - * @return The list of score string as a {@link List} object, in the same ordeer of the insertion i.e. in the MSA - */ - public List getScoresList() { - List result = new ArrayList ( scores.size() ); - for( Map.Entry it : scores.entrySet() ) { - result.add(it.getValue().toString()); - } - - return result; - } - - /** - * @return The parsed score values a matrix of bytes - */ - public byte[][] getScoresArray() { - byte[][] result = new byte[ scores.size() ][]; - - int rowCount = 0; - for( Map.Entry it : scores.entrySet() ) { - String line = it.getValue().toString(); - byte[] seqValues = new byte[ line.length() ]; - for( int j=0, c=line.length(); j = 0 && val <= 9 ) ? val : -1; - } - - result[rowCount++] = seqValues; - } - - return result; - } - - - private void doParsing(BufferedReader in) throws IOException { - - /* - * read the header - */ - header = readHeader(in); - - if( header == null ) { return; } - - - /* - * initilize the structure - */ - for( Map.Entry entry : header.scores.entrySet() ) { - scores.put( entry.getKey(), new StringBuilder()); - } - - /* - * go with the reading - */ - Block block; - while( (block = readBlock(in, header.scores.size())) != null ) { - - /* - * append sequences read in the block - */ - for( Map.Entry entry : block.items.entrySet() ) { - StringBuilder scoreStringBuilder = scores.get(entry.getKey()); - if( scoreStringBuilder == null ) { - throw new RuntimeException(String.format("Invalid T-Coffee score file: Sequence ID '%s' is not declared in header section", entry.getKey())); - } - - scoreStringBuilder.append( entry.getValue() ); - } - } - - /* - * verify that all rows have the same width - */ - for( StringBuilder str : scores.values() ) { - if( fWidth == null ) { - fWidth = str.length(); - } - else if( fWidth != str.length() ) { - throw new RuntimeException("Invalid T-Coffee score file: All the score sequences must have the same length"); - } - } - - - - } - - - static int parseInt( String str ) { - try { - return Integer.parseInt(str); - } - catch( NumberFormatException e ) { - // TODO report a warning ? - return 0; - } - } - - /** - * Reaad the header section in the T-Coffee score file format - * - * @param reader The scores reader - * @return The parser {@link Header} instance - * @throws RuntimeException when the header is not in the expected format - */ - static Header readHeader(BufferedReader reader) { - - Header result = null; - try { - result = new Header(); - result.head = reader.readLine(); - - String line; - - while( (line = reader.readLine()) != null ) { - if( line.startsWith("SCORE=")) { - result.score = parseInt( line.substring(6).trim() ); - break; - } - } - - if( (line=reader.readLine())==null || !"*".equals(line.trim())) return null; - if( (line=reader.readLine())==null || !"BAD AVG GOOD".equals(line.trim())) return null; - if( (line=reader.readLine())==null || !"*".equals(line.trim())) return null; - - /* - * now are expected a list if sequences ID up to the first blank line - */ - while( (line=reader.readLine()) != null ) { - if( "".equals(line) ) { - break; - } - - int p = line.indexOf(":"); - if( p == -1 ) { - // TODO report a warning - continue; - } - - String id = line.substring(0,p).trim(); - int val = parseInt(line.substring(p+1).trim()); - if( "".equals(id) ) { - // TODO report warning - continue; - } - - result.scores.put(id,val); - } - - } - catch( IOException e ) { - throw new RuntimeException("Cannot parse T-Coffee score ascii file", e); - } - - return result; - } - - /** - * Read a scores block ihe provided stream. - * - * @param reader The stream to parse - * @param size The expected number of the sequence to be read - * @return The {@link Block} instance read or {link null} null if the end of file has reached. - * @throws IOException Something went wrong on the 'wire' - */ - static Block readBlock( BufferedReader reader, int size ) throws IOException { - Block result = new Block(size); - String line; - - /* - * read blank lines (eventually) - */ - while( (line=reader.readLine()) != null && "".equals(line.trim())) { - // consume blank lines - } - - if( line == null ) return null; - - /* - * read the scores block - */ - do { - if( "".equals(line.trim()) ) { - // terminated - break; - } - - // split the line on the first blank - // the first part have to contain the sequence id - // theramining part are the scores values - int p = line.indexOf(" "); - if( p == -1 ) { - //TODO This is an unexpected condition, log a warning or throw an exception ? - continue; - } - - String id = line.substring(0,p).trim(); - String val = line.substring(p+1).trim(); - - result.items.put(id, val); - - } while( (line = reader.readLine()) != null ); - - - return result; - } - - /* - * The score file header - */ - static class Header { - String head; - int score; - - LinkedHashMap scores = new LinkedHashMap (); - - public int getScoreAvg() { return score; } - - public int getScoreFor( String ID ) { - - return scores.containsKey(ID) ? scores.get(ID) : -1; - - } - } - - /* - * Hold a single block values block in the score file - */ - static class Block { - int size; - Map items; - - public Block( int size ) { - this.size = size; - this.items = new HashMap (size); - } - - String getScoresFor( String id ) { - return items.get(id); - } - - String getConsensus() { - return items.get("cons"); - } - } - /** - * TCOFFEE score colourscheme - */ - static final Color[] colors = { - new Color( 102, 102, 255 ), // #6666FF - new Color( 0, 255, 0), // #00FF00 - new Color( 102, 255, 0), // #66FF00 - new Color( 204, 255, 0), // #CCFF00 - new Color( 255, 255, 0), // #FFFF00 - new Color( 255, 204, 0), // #FFCC00 - new Color( 255, 153, 0), // #FF9900 - new Color( 255, 102, 0), // #FF6600 - new Color( 255, 51, 0), // #FF3300 - new Color( 255, 34, 0) // #FF2000 - }; - public final static String TCOFFEE_SCORE="TCoffeeScore"; - /** - * generate annotation for this TCoffee score set on the given alignment - * @param al alignment to annotate - * @param matchids if true, annotate sequences based on matching sequence names - * @return true if alignment annotation was modified, false otherwise. - */ - public boolean annotateAlignment(AlignmentI al, boolean matchids) - { - boolean added=false; - int i=0; - SequenceIdMatcher sidmatcher = new SequenceIdMatcher(al.getSequencesArray()); - byte[][] scoreMatrix=getScoresArray(); - // for 2.8 - we locate any existing TCoffee annotation and remove it first before adding this. - for (Map.Entry id:scores.entrySet()) - { - byte[] srow=scoreMatrix[i]; - SequenceI s; - if (matchids) - { - s=sidmatcher.findIdMatch(id.getKey()); - } else { - s=al.getSequenceAt(i); - } - i++; - if (s==null && i!=scores.size() && !id.getKey().equals("cons")) - { - System.err.println("No "+(matchids ? "match ":" sequences left ")+" for TCoffee score set : "+id.getKey()); - continue; - } - int jSize=al.getWidth()< srow.length ? al.getWidth() : srow.length; - Annotation[] annotations=new Annotation[al.getWidth()]; - for (int j=0;j = 0 && val < colors.length ? colors[val] : Color.white); - } - AlignmentAnnotation aa=null; - if (s!=null) - { - // TODO - set per sequence score - aa=new AlignmentAnnotation(TCOFFEE_SCORE, "Score for "+id.getKey(), annotations); - - aa.setSequenceRef(s); - aa.visible=false; - aa.belowAlignment=false; - } else { - aa=new AlignmentAnnotation("T-COFFEE", "TCoffee column reliability score", annotations); - aa.belowAlignment=true; - aa.visible=true; - - } - al.addAnnotation(aa); - added=true; - } - return added; - } - +public class TCoffeeScoreFile extends AlignFile +{ + + /** + * TCOFFEE score colourscheme + */ + static final Color[] colors = { new Color(102, 102, 255), // 0: lilac #6666FF + new Color(0, 255, 0), // 1: green #00FF00 + new Color(102, 255, 0), // 2: lime green #66FF00 + new Color(204, 255, 0), // 3: greeny yellow #CCFF00 + new Color(255, 255, 0), // 4: yellow #FFFF00 + new Color(255, 204, 0), // 5: orange #FFCC00 + new Color(255, 153, 0), // 6: deep orange #FF9900 + new Color(255, 102, 0), // 7: ochre #FF6600 + new Color(255, 51, 0), // 8: red #FF3300 + new Color(255, 34, 0) // 9: redder #FF2000 + }; + + public final static String TCOFFEE_SCORE = "TCoffeeScore"; + + static Pattern SCORES_WITH_RESIDUE_NUMS = Pattern + .compile("^\\d+\\s([^\\s]+)\\s+\\d+$"); + + /** The {@link Header} structure holder */ + Header header; + + /** + * Holds the consensues values for each sequences. It uses a LinkedHashMap to + * maintaint the insertion order. + */ + LinkedHashMap scores; + + Integer fWidth; + + public TCoffeeScoreFile(String inFile, DataSourceType fileSourceType) + throws IOException + { + super(inFile, fileSourceType); + + } + + public TCoffeeScoreFile(FileParse source) throws IOException + { + super(source); + } + + /** + * Parse the provided reader for the T-Coffee scores file format + * + * @param reader + * public static TCoffeeScoreFile load(Reader reader) { + * + * try { BufferedReader in = (BufferedReader) (reader instanceof + * BufferedReader ? reader : new BufferedReader(reader)); + * TCoffeeScoreFile result = new TCoffeeScoreFile(); + * result.doParsing(in); return result.header != null && + * result.scores != null ? result : null; } catch( Exception e) { + * throw new RuntimeException(e); } } + */ + + /** + * @return The 'height' of the score matrix i.e. the numbers of score rows + * that should matches the number of sequences in the alignment + */ + public int getHeight() + { + // the last entry will always be the 'global' alingment consensus scores, so + // it is removed + // from the 'height' count to make this value compatible with the number of + // sequences in the MSA + return scores != null && scores.size() > 0 ? scores.size() - 1 : 0; + } + + /** + * @return The 'width' of the score matrix i.e. the number of columns. Since + * the score value are supposed to be calculated for an 'aligned' MSA, + * all the entries have to have the same width. + */ + public int getWidth() + { + return fWidth != null ? fWidth : 0; + } + + /** + * Get the string of score values for the specified seqeunce ID. + * + * @param id + * The sequence ID + * @return The scores as a string of values e.g. {@code 99999987-------432}. + * It return an empty string when the specified ID is missing. + */ + public String getScoresFor(String id) + { + return scores != null && scores.containsKey(id) + ? scores.get(id).toString() + : ""; + } + + /** + * @return The list of score string as a {@link List} object, in the same + * ordeer of the insertion i.e. in the MSA + */ + public List getScoresList() + { + if (scores == null) + { + return null; + } + List result = new ArrayList (scores.size()); + for (Map.Entry it : scores.entrySet()) + { + result.add(it.getValue().toString()); + } + + return result; + } + + /** + * @return The parsed score values a matrix of bytes + */ + public byte[][] getScoresArray() + { + if (scores == null) + { + return null; + } + byte[][] result = new byte[scores.size()][]; + + int rowCount = 0; + for (Map.Entry it : scores.entrySet()) + { + String line = it.getValue().toString(); + byte[] seqValues = new byte[line.length()]; + for (int j = 0, c = line.length(); j < c; j++) + { + + byte val = (byte) (line.charAt(j) - '0'); + + seqValues[j] = (val >= 0 && val <= 9) ? val : -1; + } + + result[rowCount++] = seqValues; + } + + return result; + } + + @Override + public void parse() throws IOException + { + /* + * read the header + */ + header = readHeader(this); + + if (header == null) + { + error = true; + return; + } + scores = new LinkedHashMap (); + + /* + * initilize the structure + */ + for (Map.Entry entry : header.scores.entrySet()) + { + scores.put(entry.getKey(), new StringBuilder()); + } + + /* + * go with the reading + */ + Block block; + while ((block = readBlock(this, header.scores.size())) != null) + { + + /* + * append sequences read in the block + */ + for (Map.Entry entry : block.items.entrySet()) + { + StringBuilder scoreStringBuilder = scores.get(entry.getKey()); + if (scoreStringBuilder == null) + { + error = true; + errormessage = String.format( + "Invalid T-Coffee score file: Sequence ID '%s' is not declared in header section", + entry.getKey()); + return; + } + + scoreStringBuilder.append(entry.getValue()); + } + } + + /* + * verify that all rows have the same width + */ + for (StringBuilder str : scores.values()) + { + if (fWidth == null) + { + fWidth = str.length(); + } + else if (fWidth != str.length()) + { + error = true; + errormessage = "Invalid T-Coffee score file: All the score sequences must have the same length"; + return; + } + } + + return; + } + + static int parseInt(String str) + { + try + { + return Integer.parseInt(str); + } catch (NumberFormatException e) + { + // TODO report a warning ? + return 0; + } + } + + /** + * Reaad the header section in the T-Coffee score file format + * + * @param reader + * The scores reader + * @return The parser {@link Header} instance + * @throws RuntimeException + * when the header is not in the expected format + */ + static Header readHeader(FileParse reader) throws IOException + { + + Header result = null; + try + { + result = new Header(); + result.head = reader.nextLine(); + + String line; + + while ((line = reader.nextLine()) != null) + { + if (line.startsWith("SCORE=")) + { + result.score = parseInt(line.substring(6).trim()); + break; + } + } + + if ((line = reader.nextLine()) == null || !"*".equals(line.trim())) + { + error(reader, + "Invalid T-COFFEE score format (NO BAD/AVG/GOOD header)"); + return null; + } + if ((line = reader.nextLine()) == null + || !"BAD AVG GOOD".equals(line.trim())) + { + error(reader, + "Invalid T-COFFEE score format (NO BAD/AVG/GOOD header)"); + return null; + } + if ((line = reader.nextLine()) == null || !"*".equals(line.trim())) + { + error(reader, + "Invalid T-COFFEE score format (NO BAD/AVG/GOOD header)"); + return null; + } + + /* + * now are expected a list if sequences ID up to the first blank line + */ + while ((line = reader.nextLine()) != null) + { + if ("".equals(line)) + { + break; + } + + int p = line.indexOf(":"); + if (p == -1) + { + // TODO report a warning + continue; + } + + String id = line.substring(0, p).trim(); + int val = parseInt(line.substring(p + 1).trim()); + if ("".equals(id)) + { + // TODO report warning + continue; + } + + result.scores.put(id, val); + } + + if (result == null) + { + error(reader, "T-COFFEE score file had no per-sequence scores"); + } + + } catch (IOException e) + { + error(reader, "Unexpected problem parsing T-Coffee score ascii file"); + throw e; + } + + return result; + } + + private static void error(FileParse reader, String errm) + { + reader.error = true; + if (reader.errormessage == null) + { + reader.errormessage = errm; + } + else + { + reader.errormessage += "\n" + errm; + } + } + + /** + * Read a scores block ihe provided stream. + * + * @param reader + * The stream to parse + * @param size + * The expected number of the sequence to be read + * @return The {@link Block} instance read or {link null} null if the end of + * file has reached. + * @throws IOException + * Something went wrong on the 'wire' + */ + static Block readBlock(FileParse reader, int size) throws IOException + { + Block result = new Block(size); + String line; + + /* + * read blank lines (eventually) + */ + while ((line = reader.nextLine()) != null && "".equals(line.trim())) + { + // consume blank lines + } + + if (line == null) + { + return null; + } + + /* + * read the scores block + */ + do + { + if ("".equals(line.trim())) + { + // terminated + break; + } + + // split the line on the first blank + // the first part have to contain the sequence id + // the remaining part are the scores values + int p = line.indexOf(" "); + if (p == -1) + { + if (reader.warningMessage == null) + { + reader.warningMessage = ""; + } + reader.warningMessage += "Possible parsing error - expected to find a space in line: '" + + line + "'\n"; + continue; + } + + String id = line.substring(0, p).trim(); + String val = line.substring(p + 1).trim(); + + Matcher m = SCORES_WITH_RESIDUE_NUMS.matcher(val); + if (m.matches()) + { + val = m.group(1); + } + + result.items.put(id, val); + + } while ((line = reader.nextLine()) != null); + + return result; + } + + /* + * The score file header + */ + static class Header + { + String head; + + int score; + + LinkedHashMap scores = new LinkedHashMap (); + + public int getScoreAvg() + { + return score; + } + + public int getScoreFor(String ID) + { + + return scores.containsKey(ID) ? scores.get(ID) : -1; + + } + } + + /* + * Hold a single block values block in the score file + */ + static class Block + { + int size; + + Map items; + + public Block(int size) + { + this.size = size; + this.items = new HashMap (size); + } + + String getScoresFor(String id) + { + return items.get(id); + } + + String getConsensus() + { + return items.get("cons"); + } + } + + /** + * generate annotation for this TCoffee score set on the given alignment + * + * @param al + * alignment to annotate + * @param matchids + * if true, annotate sequences based on matching sequence names + * @return true if alignment annotation was modified, false otherwise. + */ + public boolean annotateAlignment(AlignmentI al, boolean matchids) + { + if (al.getHeight() != getHeight() || al.getWidth() != getWidth()) + { + String info = String.format( + "align w: %s, h: %s; score: w: %s; h: %s ", al.getWidth(), + al.getHeight(), getWidth(), getHeight()); + warningMessage = "Alignment shape does not match T-Coffee score file shape -- " + + info; + return false; + } + boolean added = false; + int i = 0; + SequenceIdMatcher sidmatcher = new SequenceIdMatcher( + al.getSequencesArray()); + byte[][] scoreMatrix = getScoresArray(); + // for 2.8 - we locate any existing TCoffee annotation and remove it first + // before adding this. + for (Map.Entry id : scores.entrySet()) + { + byte[] srow = scoreMatrix[i]; + SequenceI s; + if (matchids) + { + s = sidmatcher.findIdMatch(id.getKey()); + } + else + { + s = al.getSequenceAt(i); + } + i++; + if (s == null && i != scores.size() && !id.getKey().equals("cons")) + { + System.err + .println("No " + (matchids ? "match " : " sequences left ") + + " for TCoffee score set : " + id.getKey()); + continue; + } + int jSize = al.getWidth() < srow.length ? al.getWidth() : srow.length; + Annotation[] annotations = new Annotation[al.getWidth()]; + for (int j = 0; j < jSize; j++) + { + byte val = srow[j]; + if (s != null && jalview.util.Comparison.isGap(s.getCharAt(j))) + { + annotations[j] = null; + if (val > 0) + { + System.err.println( + "Warning: non-zero value for positional T-COFFEE score for gap at " + + j + " in sequence " + s.getName()); + } + } + else + { + annotations[j] = new Annotation(s == null ? "" + val : null, + s == null ? "" + val : null, '\0', val * 1f, + val >= 0 && val < colors.length ? colors[val] + : Color.white); + } + } + // this will overwrite any existing t-coffee scores for the alignment + AlignmentAnnotation aa = al.findOrCreateAnnotation(TCOFFEE_SCORE, + TCOFFEE_SCORE, false, s, null); + if (s != null) + { + aa.label = "T-COFFEE"; + aa.description = "" + id.getKey(); + aa.annotations = annotations; + aa.visible = false; + aa.belowAlignment = false; + aa.setScore(header.getScoreFor(id.getKey())); + aa.createSequenceMapping(s, s.getStart(), true); + s.addAlignmentAnnotation(aa); + aa.adjustForAlignment(); + } + else + { + aa.graph = AlignmentAnnotation.NO_GRAPH; + aa.label = "T-COFFEE"; + aa.description = "TCoffee column reliability score"; + aa.annotations = annotations; + aa.belowAlignment = true; + aa.visible = true; + aa.setScore(header.getScoreAvg()); + } + aa.showAllColLabels = true; + aa.validateRangeAndDisplay(); + added = true; + } + + return added; + } + @Override + public String print(SequenceI[] sqs, boolean jvsuffix) + { + // TODO Auto-generated method stub + return "Not valid."; + } }