2 * Jalview - A Sequence Alignment Editor and Viewer (Version 2.8)
3 * Copyright (C) 2012 J Procter, AM Waterhouse, LM Lui, J Engelhardt, G Barton, M Clamp, S Searle
5 * This file is part of Jalview.
7 * Jalview is free software: you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License
9 * as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version.
11 * Jalview is distributed in the hope that it will be useful, but
12 * WITHOUT ANY WARRANTY; without even the implied warranty
13 * of MERCHANTABILITY or FITNESS FOR A PARTICULAR
14 * PURPOSE. See the GNU General Public License for more details.
16 * You should have received a copy of the GNU General Public License along with Jalview. If not, see <http://www.gnu.org/licenses/>.
20 import jalview.analysis.SequenceIdMatcher;
21 import jalview.datamodel.AlignmentAnnotation;
22 import jalview.datamodel.AlignmentI;
23 import jalview.datamodel.Annotation;
24 import jalview.datamodel.SequenceI;
26 import java.awt.Color;
27 import java.io.IOException;
28 import java.util.ArrayList;
29 import java.util.HashMap;
30 import java.util.LinkedHashMap;
31 import java.util.List;
33 import java.util.regex.Matcher;
34 import java.util.regex.Pattern;
36 import javax.xml.parsers.ParserConfigurationException;
38 import org.xml.sax.SAXException;
40 import fr.orsay.lri.varna.exceptions.ExceptionFileFormatOrSyntax;
41 import fr.orsay.lri.varna.exceptions.ExceptionLoadingFailed;
42 import fr.orsay.lri.varna.exceptions.ExceptionPermissionDenied;
43 import fr.orsay.lri.varna.exceptions.ExceptionUnmatchedClosingParentheses;
46 * A file parser for the T-Coffee score ascii format. This file contains the
47 * alignment consensus for each residue in any sequence.
49 * This file is produced by <code>t_coffee</code> when the option
50 * <code>-output=score_ascii </code> is given on the program command line
52 * An example file is the following
55 * T-COFFEE, Version_9.02.r1228 (2012-02-16 18:15:12 - Revision 1228 - Build 336)
72 * 1PHT 999999999999999999999999998762112222543211112134
73 * 1BB9 99999999999999999999999999987-------4322----2234
74 * 1UHC 99999999999999999999999999987-------5321----2246
75 * 1YCS 99999999999999999999999999986-------4321----1-35
76 * 1OOT 999999999999999999999999999861-------3------1135
77 * 1ABO 99999999999999999999999999986-------422-------34
78 * 1FYN 99999999999999999999999999985-------32--------35
79 * 1QCF 99999999999999999999999999974-------2---------24
80 * cons 999999999999999999999999999851000110321100001134
83 * 1PHT ----------5666642367889999999999889
84 * 1BB9 1111111111676653-355679999999999889
85 * 1UHC ----------788774--66789999999999889
86 * 1YCS ----------78777--356789999999999889
87 * 1OOT ----------78877--356789999999997-67
88 * 1ABO ----------687774--56779999999999889
89 * 1FYN ----------6888842356789999999999889
90 * 1QCF ----------6878742356789999999999889
91 * cons 00100000006877641356789999999999889
95 * @author Paolo Di Tommaso
98 public class TCoffeeScoreFile extends AlignFile {
100 public TCoffeeScoreFile(String inFile, String type) throws Exception
106 public TCoffeeScoreFile(FileParse source) throws Exception
111 /** The {@link Header} structure holder */
115 * Holds the consensus values for each sequence. It uses a LinkedHashMap to
116 * maintain the insertion order.
118 LinkedHashMap<String, StringBuilder> scores;
123 * Parse the provided reader for the T-Coffee scores file format
126 * public static TCoffeeScoreFile load(Reader reader) {
128 * try { BufferedReader in = (BufferedReader) (reader instanceof
129 * BufferedReader ? reader : new BufferedReader(reader));
130 * TCoffeeScoreFile result = new TCoffeeScoreFile();
131 * result.doParsing(in); return result.header != null &&
132 * result.scores != null ? result : null; } catch( Exception e) {
133 * throw new RuntimeException(e); } }
137 * @return The 'height' of the score matrix i.e. the numbers of score rows
138 * that should matches the number of sequences in the alignment
140 public int getHeight()
142 // the last entry will always be the 'global' alingment consensus scores, so
144 // from the 'height' count to make this value compatible with the number of
145 // sequences in the MSA
146 return scores != null && scores.size() > 0 ? scores.size() - 1 : 0;
150 * @return The 'width' of the score matrix i.e. the number of columns. Since
151 * the score value are supposed to be calculated for an 'aligned' MSA,
152 * all the entries have to have the same width.
154 public int getWidth()
156 return fWidth != null ? fWidth : 0;
160 * Get the string of score values for the specified seqeunce ID.
164 * @return The scores as a string of values e.g. {@code 99999987-------432}.
165 * It return an empty string when the specified ID is missing.
167 public String getScoresFor(String id)
169 return scores != null && scores.containsKey(id) ? scores.get(id)
174 * @return The list of score string as a {@link List} object, in the same
175 * ordeer of the insertion i.e. in the MSA
177 public List<String> getScoresList()
183 List<String> result = new ArrayList<String>(scores.size());
184 for (Map.Entry<String, StringBuilder> it : scores.entrySet())
186 result.add(it.getValue().toString());
193 * @return The parsed score values a matrix of bytes
195 public byte[][] getScoresArray()
201 byte[][] result = new byte[scores.size()][];
204 for (Map.Entry<String, StringBuilder> it : scores.entrySet())
206 String line = it.getValue().toString();
207 byte[] seqValues = new byte[line.length()];
208 for (int j = 0, c = line.length(); j < c; j++)
211 byte val = (byte) (line.charAt(j) - '0');
213 seqValues[j] = (val >= 0 && val <= 9) ? val : -1;
216 result[rowCount++] = seqValues;
// Parses the score file in three steps: read the header, append every score
// block to the per-sequence builders, then check that all rows are equally wide.
// NOTE(review): several statements of this method (declarations, error
// flagging, early returns) are not visible in this excerpt.
222 public void parse() throws IOException
227 header = readHeader(this);
// keyed by sequence ID; LinkedHashMap keeps the insertion order
234 scores = new LinkedHashMap<String, StringBuilder>();
237 * initilize the structure
// one empty builder per sequence ID declared in the header
239 for (Map.Entry<String, Integer> entry : header.scores.entrySet())
241 scores.put(entry.getKey(), new StringBuilder());
245 * go with the reading
// readBlock returning null ends the loop
248 while ((block = readBlock(this, header.scores.size())) != null)
252 * append sequences read in the block
254 for (Map.Entry<String, String> entry : block.items.entrySet())
256 StringBuilder scoreStringBuilder = scores.get(entry.getKey());
// an ID found in a block but never declared in the header is reported as an error
257 if (scoreStringBuilder == null)
260 errormessage = String
261 .format("Invalid T-Coffee score file: Sequence ID '%s' is not declared in header section",
266 scoreStringBuilder.append(entry.getValue());
271 * verify that all rows have the same width
273 for (StringBuilder str : scores.values())
// fWidth takes the length of a row and every row is compared against it
277 fWidth = str.length();
279 else if (fWidth != str.length())
282 errormessage = "Invalid T-Coffee score file: All the score sequences must have the same length";
// Lenient integer parsing: delegates to Integer.parseInt and catches
// NumberFormatException instead of propagating it.
// NOTE(review): the value returned after a failed parse is not visible in
// this excerpt - confirm before relying on it.
290 static int parseInt(String str)
294 return Integer.parseInt(str);
295 } catch (NumberFormatException e)
297 // TODO report a warning ?
303 * Read the header section in the T-Coffee score file format
307 * @return The parser {@link Header} instance
308 * @throws RuntimeException
309 * when the header is not in the expected format
// Reads the header section from the reader: the first line, the SCORE= value,
// the three-line BAD/AVG/GOOD banner and then the per-sequence "id : value" lines.
// NOTE(review): the return statements and some branch bodies are not visible
// in this excerpt.
311 static Header readHeader(FileParse reader) throws IOException
314 Header result = null;
317 result = new Header();
// the very first line of the file is kept verbatim
318 result.head = reader.nextLine();
// scan for the line carrying the overall score
322 while ((line = reader.nextLine()) != null)
324 if (line.startsWith("SCORE="))
// substring(6) drops the six characters of the "SCORE=" prefix
326 result.score = parseInt(line.substring(6).trim());
// the banner is expected as three lines which, once trimmed, read "*", "BAD AVG GOOD", "*"
331 if ((line = reader.nextLine()) == null || !"*".equals(line.trim()))
334 "Invalid T-COFFEE score format (NO BAD/AVG/GOOD header)");
337 if ((line = reader.nextLine()) == null
338 || !"BAD AVG GOOD".equals(line.trim()))
341 "Invalid T-COFFEE score format (NO BAD/AVG/GOOD header)");
344 if ((line = reader.nextLine()) == null || !"*".equals(line.trim()))
347 "Invalid T-COFFEE score format (NO BAD/AVG/GOOD header)");
352 * now are expected a list if sequences ID up to the first blank line
354 while ((line = reader.nextLine()) != null)
// each entry is split at the first ':' into a sequence ID and its score
361 int p = line.indexOf(":");
364 // TODO report a warning
368 String id = line.substring(0, p).trim();
369 int val = parseInt(line.substring(p + 1).trim());
372 // TODO report warning
376 result.scores.put(id, val);
381 error(reader, "T-COFFEE score file had no per-sequence scores");
// I/O failures are recorded on the reader through error(...)
384 } catch (IOException e)
386 error(reader, "Unexpected problem parsing T-Coffee score ascii file");
// Records errm on the reader: it becomes the error message when none is set,
// and it is appended to the existing message after a newline in the other
// visible assignment.
// NOTE(review): not every statement of this method is visible in this excerpt.
393 private static void error(FileParse reader, String errm)
396 if (reader.errormessage == null)
398 reader.errormessage = errm;
402 reader.errormessage += "\n" + errm;
406 static Pattern SCORES_WITH_RESIDUE_NUMS = Pattern.compile("^\\d+\\s([^\\s]+)\\s+\\d+$");
409 * Read a scores block from the provided stream.
412 * The stream to parse
414 * The expected number of the sequence to be read
415 * @return The {@link Block} instance read, or {@code null} if the end of
417 * @throws IOException
418 * Something went wrong on the 'wire'
// Reads one block of "id scores" lines from the reader into a new Block.
// NOTE(review): the return statements and several branch bodies are not
// visible in this excerpt.
420 static Block readBlock(FileParse reader, int size) throws IOException
422 Block result = new Block(size);
426 * read blank lines (eventually)
// skip lines that are empty after trimming; stops at the first non-blank line or at end of input
428 while ((line = reader.nextLine()) != null && "".equals(line.trim()))
430 // consume blank lines
439 * read the scores block
// a blank line is tested for on each pass of the do/while loop
443 if ("".equals(line.trim()))
449 // split the line on the first blank
450 // the first part have to contain the sequence id
451 // the remaining part are the scores values
452 int p = line.indexOf(" ");
// warnings are accumulated on the reader's warningMessage, which is initialised to "" on first use
455 if (reader.warningMessage == null)
457 reader.warningMessage = "";
459 reader.warningMessage += "Possible parsing error - expected to find a space in line: '"
464 String id = line.substring(0, p).trim();
465 String val = line.substring(p + 1).trim();
// NOTE(review): what is done when this matcher matches is not visible here;
// presumably val is reduced to the captured score string - confirm
467 Matcher m = SCORES_WITH_RESIDUE_NUMS.matcher(val);
472 result.items.put(id, val);
474 } while ((line = reader.nextLine()) != null);
480 * The score file header
488 LinkedHashMap<String, Integer> scores = new LinkedHashMap<String, Integer>();
490 public int getScoreAvg()
495 public int getScoreFor(String ID)
498 return scores.containsKey(ID) ? scores.get(ID) : -1;
504 * Holds the values of a single block in the score file
510 Map<String, String> items;
512 public Block(int size)
515 this.items = new HashMap<String, String>(size);
518 String getScoresFor(String id)
520 return items.get(id);
523 String getConsensus()
525 return items.get("cons");
530 * TCOFFEE score colourscheme
// Colour palette indexed by score value: annotateAlignment uses colors[val]
// when val is within 0 .. colors.length - 1.
532 static final Color[] colors =
533 { new Color(102, 102, 255), // #6666FF
534 new Color(0, 255, 0), // #00FF00
535 new Color(102, 255, 0), // #66FF00
536 new Color(204, 255, 0), // #CCFF00
537 new Color(255, 255, 0), // #FFFF00
538 new Color(255, 204, 0), // #FFCC00
539 new Color(255, 153, 0), // #FF9900
540 new Color(255, 102, 0), // #FF6600
541 new Color(255, 51, 0), // #FF3300
542 new Color(255, 34, 0) // #FF2200 - NOTE(review): this was annotated #FF2000, which would be (255, 32, 0); confirm the intended value
// Identifier passed twice to findOrCreateAnnotation in annotateAlignment to
// locate or create the T-Coffee score annotation rows.
545 public final static String TCOFFEE_SCORE = "TCoffeeScore";
548 * generate annotation for this TCoffee score set on the given alignment
551 * alignment to annotate
553 * if true, annotate sequences based on matching sequence names
554 * @return true if alignment annotation was modified, false otherwise.
// Builds one annotation row per entry of scores and attaches it to the alignment.
// NOTE(review): many control-flow lines of this method (branch conditions,
// index handling, returns) are not visible in this excerpt; the comments
// below describe only what the visible statements do.
556 public boolean annotateAlignment(AlignmentI al, boolean matchids)
// the alignment and the score matrix are expected to have the same height and width
558 if (al.getHeight() != getHeight() || al.getWidth() != getWidth())
560 String info = String.format("align w: %s, h: %s; score: w: %s; h: %s ", al.getWidth(), al.getHeight(), getWidth(), getHeight() );
561 warningMessage = "Alignment shape does not match T-Coffee score file shape -- " + info;
564 boolean added = false;
// matcher built over the alignment's sequences, used for the lookup by score-row ID
566 SequenceIdMatcher sidmatcher = new SequenceIdMatcher(
567 al.getSequencesArray());
568 byte[][] scoreMatrix = getScoresArray();
569 // for 2.8 - we locate any existing TCoffee annotation and remove it first
570 // before adding this.
// scores is a LinkedHashMap, so rows are visited in insertion order
571 for (Map.Entry<String, StringBuilder> id : scores.entrySet())
573 byte[] srow = scoreMatrix[i];
// the sequence is found either by ID match or by row position
// (presumably selected by matchids - the condition is not visible here)
577 s = sidmatcher.findIdMatch(id.getKey());
581 s = al.getSequenceAt(i);
// a row other than "cons" that has no sequence is reported on stderr
584 if (s == null && i != scores.size() && !id.getKey().equals("cons"))
586 System.err.println("No "
587 + (matchids ? "match " : " sequences left ")
588 + " for TCoffee score set : " + id.getKey());
// the annotation array is as wide as the alignment, but only the first
// min(alignment width, row length) columns are visited
591 int jSize = al.getWidth() < srow.length ? al.getWidth() : srow.length;
592 Annotation[] annotations = new Annotation[al.getWidth()];
593 for (int j = 0; j < jSize; j++)
// columns where the sequence has a gap get no annotation
596 if (s != null && jalview.util.Comparison.isGap(s.getCharAt(j)))
598 annotations[j] = null;
602 .println("Warning: non-zero value for positional T-COFFEE score for gap at "
603 + j + " in sequence " + s.getName());
// rows without a sequence (s == null) pass the value as text for the first
// two arguments; the colour is colors[val] when val is a valid palette index
608 annotations[j] = new Annotation(s == null ? "" + val : null,
609 s == null ? "" + val : null, '\0', val * 1f, val >= 0
610 && val < colors.length ? colors[val]
614 // this will overwrite any existing t-coffee scores for the alignment
615 AlignmentAnnotation aa = al.findOrCreateAnnotation(TCOFFEE_SCORE,
616 TCOFFEE_SCORE, false, s, null);
// first group of settings: the row is described by its ID, scored from the
// header, mapped to sequence s and added to it
// (presumably the s != null case - condition not visible)
619 aa.label = "T-COFFEE";
620 aa.description = "" + id.getKey();
621 aa.annotations = annotations;
623 aa.belowAlignment = false;
624 aa.setScore(header.getScoreFor(id.getKey()));
625 aa.createSequenceMapping(s, s.getStart(), true);
626 s.addAlignmentAnnotation(aa);
627 aa.adjustForAlignment();
// second group of settings: a NO_GRAPH row shown below the alignment,
// carrying the header's average score
631 aa.graph = AlignmentAnnotation.NO_GRAPH;
632 aa.label = "T-COFFEE";
633 aa.description = "TCoffee column reliability score";
634 aa.annotations = annotations;
635 aa.belowAlignment = true;
637 aa.setScore(header.getScoreAvg());
639 aa.showAllColLabels = true;
640 aa.validateRangeAndDisplay();
648 public String print()
650 // TODO Auto-generated method stub