/* * Jalview - A Sequence Alignment Editor and Viewer ($$Version-Rel$$) * Copyright (C) $$Year-Rel$$ The Jalview Authors * * This file is part of Jalview. * * Jalview is free software: you can redistribute it and/or * modify it under the terms of the GNU General Public License * as published by the Free Software Foundation, either version 3 * of the License, or (at your option) any later version. * * Jalview is distributed in the hope that it will be useful, but * WITHOUT ANY WARRANTY; without even the implied warranty * of MERCHANTABILITY or FITNESS FOR A PARTICULAR * PURPOSE. See the GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with Jalview. If not, see . * The Jalview Authors are detailed in the 'AUTHORS' file. */ package jalview.io; import jalview.analysis.SequenceIdMatcher; import jalview.datamodel.AlignmentAnnotation; import jalview.datamodel.AlignmentI; import jalview.datamodel.Annotation; import jalview.datamodel.SequenceI; import java.awt.Color; import java.io.IOException; import java.util.ArrayList; import java.util.HashMap; import java.util.LinkedHashMap; import java.util.List; import java.util.Map; import java.util.regex.Matcher; import java.util.regex.Pattern; /** * A file parser for T-Coffee score ascii format. This file contains the * alignment consensus for each residue in any sequence. *

* This file is produced by t_coffee providing the option * -output=score_ascii to the program command line * * An example file is the following * *

 * T-COFFEE, Version_9.02.r1228 (2012-02-16 18:15:12 - Revision 1228 - Build 336)
 * Cedric Notredame 
 * CPU TIME:0 sec.
 * SCORE=90
 * *
 *  BAD AVG GOOD
 * *
 * 1PHT   :  89
 * 1BB9   :  90
 * 1UHC   :  94
 * 1YCS   :  94
 * 1OOT   :  93
 * 1ABO   :  94
 * 1FYN   :  94
 * 1QCF   :  94
 * cons   :  90
 * 
 * 1PHT   999999999999999999999999998762112222543211112134
 * 1BB9   99999999999999999999999999987-------4322----2234
 * 1UHC   99999999999999999999999999987-------5321----2246
 * 1YCS   99999999999999999999999999986-------4321----1-35
 * 1OOT   999999999999999999999999999861-------3------1135
 * 1ABO   99999999999999999999999999986-------422-------34
 * 1FYN   99999999999999999999999999985-------32--------35
 * 1QCF   99999999999999999999999999974-------2---------24
 * cons   999999999999999999999999999851000110321100001134
 * 
 * 
 * 1PHT   ----------5666642367889999999999889
 * 1BB9   1111111111676653-355679999999999889
 * 1UHC   ----------788774--66789999999999889
 * 1YCS   ----------78777--356789999999999889
 * 1OOT   ----------78877--356789999999997-67
 * 1ABO   ----------687774--56779999999999889
 * 1FYN   ----------6888842356789999999999889
 * 1QCF   ----------6878742356789999999999889
 * cons   00100000006877641356789999999999889
 * 
* * * @author Paolo Di Tommaso * */ public class TCoffeeScoreFile extends AlignFile { /** * TCOFFEE score colourscheme */ static final Color[] colors = { new Color(102, 102, 255), // 0: lilac #6666FF new Color(0, 255, 0), // 1: green #00FF00 new Color(102, 255, 0), // 2: lime green #66FF00 new Color(204, 255, 0), // 3: greeny yellow #CCFF00 new Color(255, 255, 0), // 4: yellow #FFFF00 new Color(255, 204, 0), // 5: orange #FFCC00 new Color(255, 153, 0), // 6: deep orange #FF9900 new Color(255, 102, 0), // 7: ochre #FF6600 new Color(255, 51, 0), // 8: red #FF3300 new Color(255, 34, 0) // 9: redder #FF2000 }; public final static String TCOFFEE_SCORE = "TCoffeeScore"; static Pattern SCORES_WITH_RESIDUE_NUMS = Pattern .compile("^\\d+\\s([^\\s]+)\\s+\\d+$"); /** The {@link Header} structure holder */ Header header; /** * Holds the consensues values for each sequences. It uses a LinkedHashMap to * maintaint the insertion order. */ LinkedHashMap scores; Integer fWidth; public TCoffeeScoreFile(String inFile, DataSourceType fileSourceType) throws IOException { super(inFile, fileSourceType); } public TCoffeeScoreFile(FileParse source) throws IOException { super(source); } /** * Parse the provided reader for the T-Coffee scores file format * * @param reader * public static TCoffeeScoreFile load(Reader reader) { * * try { BufferedReader in = (BufferedReader) (reader instanceof * BufferedReader ? reader : new BufferedReader(reader)); * TCoffeeScoreFile result = new TCoffeeScoreFile(); * result.doParsing(in); return result.header != null && * result.scores != null ? result : null; } catch( Exception e) { * throw new RuntimeException(e); } } */ /** * @return The 'height' of the score matrix i.e. the numbers of score rows * that should matches the number of sequences in the alignment */ public int getHeight() { // the last entry will always be the 'global' alingment consensus scores, so // it is removed // from the 'height' count to make this value compatible with the number of // sequences in the MSA return scores != null && scores.size() > 0 ? scores.size() - 1 : 0; } /** * @return The 'width' of the score matrix i.e. the number of columns. Since * the score value are supposed to be calculated for an 'aligned' MSA, * all the entries have to have the same width. */ public int getWidth() { return fWidth != null ? fWidth : 0; } /** * Get the string of score values for the specified seqeunce ID. * * @param id * The sequence ID * @return The scores as a string of values e.g. {@code 99999987-------432}. * It return an empty string when the specified ID is missing. */ public String getScoresFor(String id) { return scores != null && scores.containsKey(id) ? scores.get(id).toString() : ""; } /** * @return The list of score string as a {@link List} object, in the same * ordeer of the insertion i.e. in the MSA */ public List getScoresList() { if (scores == null) { return null; } List result = new ArrayList(scores.size()); for (Map.Entry it : scores.entrySet()) { result.add(it.getValue().toString()); } return result; } /** * @return The parsed score values a matrix of bytes */ public byte[][] getScoresArray() { if (scores == null) { return null; } byte[][] result = new byte[scores.size()][]; int rowCount = 0; for (Map.Entry it : scores.entrySet()) { String line = it.getValue().toString(); byte[] seqValues = new byte[line.length()]; for (int j = 0, c = line.length(); j < c; j++) { byte val = (byte) (line.charAt(j) - '0'); seqValues[j] = (val >= 0 && val <= 9) ? val : -1; } result[rowCount++] = seqValues; } return result; } @Override public void parse() throws IOException { /* * read the header */ header = readHeader(this); if (header == null) { error = true; return; } scores = new LinkedHashMap(); /* * initilize the structure */ for (Map.Entry entry : header.scores.entrySet()) { scores.put(entry.getKey(), new StringBuilder()); } /* * go with the reading */ Block block; while ((block = readBlock(this, header.scores.size())) != null) { /* * append sequences read in the block */ for (Map.Entry entry : block.items.entrySet()) { StringBuilder scoreStringBuilder = scores.get(entry.getKey()); if (scoreStringBuilder == null) { error = true; errormessage = String.format( "Invalid T-Coffee score file: Sequence ID '%s' is not declared in header section", entry.getKey()); return; } scoreStringBuilder.append(entry.getValue()); } } /* * verify that all rows have the same width */ for (StringBuilder str : scores.values()) { if (fWidth == null) { fWidth = str.length(); } else if (fWidth != str.length()) { error = true; errormessage = "Invalid T-Coffee score file: All the score sequences must have the same length"; return; } } return; } static int parseInt(String str) { try { return Integer.parseInt(str); } catch (NumberFormatException e) { // TODO report a warning ? return 0; } } /** * Reaad the header section in the T-Coffee score file format * * @param reader * The scores reader * @return The parser {@link Header} instance * @throws RuntimeException * when the header is not in the expected format */ static Header readHeader(FileParse reader) throws IOException { Header result = null; try { result = new Header(); result.head = reader.nextLine(); String line; while ((line = reader.nextLine()) != null) { if (line.startsWith("SCORE=")) { result.score = parseInt(line.substring(6).trim()); break; } } if ((line = reader.nextLine()) == null || !"*".equals(line.trim())) { error(reader, "Invalid T-COFFEE score format (NO BAD/AVG/GOOD header)"); return null; } if ((line = reader.nextLine()) == null || !"BAD AVG GOOD".equals(line.trim())) { error(reader, "Invalid T-COFFEE score format (NO BAD/AVG/GOOD header)"); return null; } if ((line = reader.nextLine()) == null || !"*".equals(line.trim())) { error(reader, "Invalid T-COFFEE score format (NO BAD/AVG/GOOD header)"); return null; } /* * now are expected a list if sequences ID up to the first blank line */ while ((line = reader.nextLine()) != null) { if ("".equals(line)) { break; } int p = line.indexOf(":"); if (p == -1) { // TODO report a warning continue; } String id = line.substring(0, p).trim(); int val = parseInt(line.substring(p + 1).trim()); if ("".equals(id)) { // TODO report warning continue; } result.scores.put(id, val); } if (result == null) { error(reader, "T-COFFEE score file had no per-sequence scores"); } } catch (IOException e) { error(reader, "Unexpected problem parsing T-Coffee score ascii file"); throw e; } return result; } private static void error(FileParse reader, String errm) { reader.error = true; if (reader.errormessage == null) { reader.errormessage = errm; } else { reader.errormessage += "\n" + errm; } } /** * Read a scores block ihe provided stream. * * @param reader * The stream to parse * @param size * The expected number of the sequence to be read * @return The {@link Block} instance read or {link null} null if the end of * file has reached. * @throws IOException * Something went wrong on the 'wire' */ static Block readBlock(FileParse reader, int size) throws IOException { Block result = new Block(size); String line; /* * read blank lines (eventually) */ while ((line = reader.nextLine()) != null && "".equals(line.trim())) { // consume blank lines } if (line == null) { return null; } /* * read the scores block */ do { if ("".equals(line.trim())) { // terminated break; } // split the line on the first blank // the first part have to contain the sequence id // the remaining part are the scores values int p = line.indexOf(" "); if (p == -1) { if (reader.warningMessage == null) { reader.warningMessage = ""; } reader.warningMessage += "Possible parsing error - expected to find a space in line: '" + line + "'\n"; continue; } String id = line.substring(0, p).trim(); String val = line.substring(p + 1).trim(); Matcher m = SCORES_WITH_RESIDUE_NUMS.matcher(val); if (m.matches()) { val = m.group(1); } result.items.put(id, val); } while ((line = reader.nextLine()) != null); return result; } /* * The score file header */ static class Header { String head; int score; LinkedHashMap scores = new LinkedHashMap(); public int getScoreAvg() { return score; } public int getScoreFor(String ID) { return scores.containsKey(ID) ? scores.get(ID) : -1; } } /* * Hold a single block values block in the score file */ static class Block { int size; Map items; public Block(int size) { this.size = size; this.items = new HashMap(size); } String getScoresFor(String id) { return items.get(id); } String getConsensus() { return items.get("cons"); } } /** * generate annotation for this TCoffee score set on the given alignment * * @param al * alignment to annotate * @param matchids * if true, annotate sequences based on matching sequence names * @return true if alignment annotation was modified, false otherwise. */ public boolean annotateAlignment(AlignmentI al, boolean matchids) { if (al.getHeight() != getHeight() || al.getWidth() != getWidth()) { String info = String.format( "align w: %s, h: %s; score: w: %s; h: %s ", al.getWidth(), al.getHeight(), getWidth(), getHeight()); warningMessage = "Alignment shape does not match T-Coffee score file shape -- " + info; return false; } boolean added = false; int i = 0; SequenceIdMatcher sidmatcher = new SequenceIdMatcher( al.getSequencesArray()); byte[][] scoreMatrix = getScoresArray(); // for 2.8 - we locate any existing TCoffee annotation and remove it first // before adding this. for (Map.Entry id : scores.entrySet()) { byte[] srow = scoreMatrix[i]; SequenceI s; if (matchids) { s = sidmatcher.findIdMatch(id.getKey()); } else { s = al.getSequenceAt(i); } i++; if (s == null && i != scores.size() && !id.getKey().equals("cons")) { System.err .println("No " + (matchids ? "match " : " sequences left ") + " for TCoffee score set : " + id.getKey()); continue; } int jSize = al.getWidth() < srow.length ? al.getWidth() : srow.length; Annotation[] annotations = new Annotation[al.getWidth()]; for (int j = 0; j < jSize; j++) { byte val = srow[j]; if (s != null && jalview.util.Comparison.isGap(s.getCharAt(j))) { annotations[j] = null; if (val > 0) { System.err.println( "Warning: non-zero value for positional T-COFFEE score for gap at " + j + " in sequence " + s.getName()); } } else { annotations[j] = new Annotation(s == null ? "" + val : null, s == null ? "" + val : null, '\0', val * 1f, val >= 0 && val < colors.length ? colors[val] : Color.white); } } // this will overwrite any existing t-coffee scores for the alignment AlignmentAnnotation aa = al.findOrCreateAnnotation(TCOFFEE_SCORE, TCOFFEE_SCORE, false, s, null); if (s != null) { aa.label = "T-COFFEE"; aa.description = "" + id.getKey(); aa.annotations = annotations; aa.visible = false; aa.belowAlignment = false; aa.setScore(header.getScoreFor(id.getKey())); aa.createSequenceMapping(s, s.getStart(), true); s.addAlignmentAnnotation(aa); aa.adjustForAlignment(); } else { aa.graph = AlignmentAnnotation.NO_GRAPH; aa.label = "T-COFFEE"; aa.description = "TCoffee column reliability score"; aa.annotations = annotations; aa.belowAlignment = true; aa.visible = true; aa.setScore(header.getScoreAvg()); } aa.showAllColLabels = true; aa.validateRangeAndDisplay(); added = true; } return added; } @Override public String print(SequenceI[] sqs, boolean jvsuffix) { // TODO Auto-generated method stub return "Not valid."; } }