3 import jalview.analysis.SequenceIdMatcher;
4 import jalview.datamodel.AlignmentAnnotation;
5 import jalview.datamodel.AlignmentI;
6 import jalview.datamodel.Annotation;
7 import jalview.datamodel.SequenceI;
10 import java.io.BufferedReader;
12 import java.io.FileNotFoundException;
13 import java.io.FileReader;
14 import java.io.IOException;
15 import java.io.Reader;
16 import java.util.ArrayList;
17 import java.util.HashMap;
18 import java.util.LinkedHashMap;
19 import java.util.List;
23 * A file parser for the T-Coffee score ASCII format. This file contains the
24 * alignment consensus for each residue in any sequence.
26 * This file is produced by <code>t_coffee</code> when the option
27 * <code>-output=score_ascii</code> is given on the program command line.
29 * An example file is the following
32 * T-COFFEE, Version_9.02.r1228 (2012-02-16 18:15:12 - Revision 1228 - Build 336)
49 * 1PHT 999999999999999999999999998762112222543211112134
50 * 1BB9 99999999999999999999999999987-------4322----2234
51 * 1UHC 99999999999999999999999999987-------5321----2246
52 * 1YCS 99999999999999999999999999986-------4321----1-35
53 * 1OOT 999999999999999999999999999861-------3------1135
54 * 1ABO 99999999999999999999999999986-------422-------34
55 * 1FYN 99999999999999999999999999985-------32--------35
56 * 1QCF 99999999999999999999999999974-------2---------24
57 * cons 999999999999999999999999999851000110321100001134
60 * 1PHT ----------5666642367889999999999889
61 * 1BB9 1111111111676653-355679999999999889
62 * 1UHC ----------788774--66789999999999889
63 * 1YCS ----------78777--356789999999999889
64 * 1OOT ----------78877--356789999999997-67
65 * 1ABO ----------687774--56779999999999889
66 * 1FYN ----------6888842356789999999999889
67 * 1QCF ----------6878742356789999999999889
68 * cons 00100000006877641356789999999999889
72 * @author Paolo Di Tommaso
75 public class TCoffeeScoreFile extends AlignFile
78 public TCoffeeScoreFile(String inFile, String type) throws IOException
84 public TCoffeeScoreFile(FileParse source) throws IOException
89 /** The {@link Header} structure holder */
93 * Holds the consensus values for each sequence. It uses a LinkedHashMap to
94 * maintain the insertion order.
96 LinkedHashMap<String, StringBuilder> scores;
101 * Parse the provided reader for the T-Coffee scores file format
104 * public static TCoffeeScoreFile load(Reader reader) {
106 * try { BufferedReader in = (BufferedReader) (reader instanceof
107 * BufferedReader ? reader : new BufferedReader(reader));
108 * TCoffeeScoreFile result = new TCoffeeScoreFile();
109 * result.doParsing(in); return result.header != null &&
110 * result.scores != null ? result : null; } catch( Exception e) {
111 * throw new RuntimeException(e); } }
115 * @return The 'height' of the score matrix i.e. the numbers of score rows
116 * that should matches the number of sequences in the alignment
118 public int getHeight()
120 // the last entry will always be the 'global' alingment consensus scores, so
122 // from the 'height' count to make this value compatible with the number of
123 // sequences in the MSA
124 return scores != null && scores.size() > 0 ? scores.size() - 1 : 0;
128 * @return The 'width' of the score matrix i.e. the number of columns. Since
129 * teh score value are supposd to be calculated for an 'aligned' MSA,
130 * all the entries have to have the same width.
132 public int getWidth()
134 return fWidth != null ? fWidth : 0;
138 * Get the string of score values for the specified seqeunce ID.
142 * @return The scores as a string of values e.g. {@code 99999987-------432}.
143 * It return an empty string when the specified ID is missing.
145 public String getScoresFor(String id)
147 return scores != null && scores.containsKey(id) ? scores.get(id)
152 * @return The list of score string as a {@link List} object, in the same
153 * ordeer of the insertion i.e. in the MSA
155 public List<String> getScoresList()
161 List<String> result = new ArrayList<String>(scores.size());
162 for (Map.Entry<String, StringBuilder> it : scores.entrySet())
164 result.add(it.getValue().toString());
171 * @return The parsed score values a matrix of bytes
173 public byte[][] getScoresArray()
179 byte[][] result = new byte[scores.size()][];
182 for (Map.Entry<String, StringBuilder> it : scores.entrySet())
184 String line = it.getValue().toString();
185 byte[] seqValues = new byte[line.length()];
186 for (int j = 0, c = line.length(); j < c; j++)
189 byte val = (byte) (line.charAt(j) - '0');
191 seqValues[j] = (val >= 0 && val <= 9) ? val : -1;
194 result[rowCount++] = seqValues;
200 public void parse() throws IOException
205 header = readHeader(this);
212 scores = new LinkedHashMap<String, StringBuilder>();
215 * initilize the structure
217 for (Map.Entry<String, Integer> entry : header.scores.entrySet())
219 scores.put(entry.getKey(), new StringBuilder());
223 * go with the reading
226 while ((block = readBlock(this, header.scores.size())) != null)
230 * append sequences read in the block
232 for (Map.Entry<String, String> entry : block.items.entrySet())
234 StringBuilder scoreStringBuilder = scores.get(entry.getKey());
235 if (scoreStringBuilder == null)
238 errormessage = String
239 .format("Invalid T-Coffee score file: Sequence ID '%s' is not declared in header section",
244 scoreStringBuilder.append(entry.getValue());
249 * verify that all rows have the same width
251 for (StringBuilder str : scores.values())
255 fWidth = str.length();
257 else if (fWidth != str.length())
260 errormessage = "Invalid T-Coffee score file: All the score sequences must have the same length";
268 static int parseInt(String str)
272 return Integer.parseInt(str);
273 } catch (NumberFormatException e)
275 // TODO report a warning ?
281 * Reaad the header section in the T-Coffee score file format
285 * @return The parser {@link Header} instance
286 * @throws RuntimeException
287 * when the header is not in the expected format
289 static Header readHeader(FileParse reader) throws IOException
292 Header result = null;
295 result = new Header();
296 result.head = reader.nextLine();
300 while ((line = reader.nextLine()) != null)
302 if (line.startsWith("SCORE="))
304 result.score = parseInt(line.substring(6).trim());
309 if ((line = reader.nextLine()) == null || !"*".equals(line.trim()))
312 "Invalid T-COFFEE score format (NO BAD/AVG/GOOD header)");
315 if ((line = reader.nextLine()) == null
316 || !"BAD AVG GOOD".equals(line.trim()))
319 "Invalid T-COFFEE score format (NO BAD/AVG/GOOD header)");
322 if ((line = reader.nextLine()) == null || !"*".equals(line.trim()))
325 "Invalid T-COFFEE score format (NO BAD/AVG/GOOD header)");
330 * now are expected a list if sequences ID up to the first blank line
332 while ((line = reader.nextLine()) != null)
339 int p = line.indexOf(":");
342 // TODO report a warning
346 String id = line.substring(0, p).trim();
347 int val = parseInt(line.substring(p + 1).trim());
350 // TODO report warning
354 result.scores.put(id, val);
359 error(reader, "T-COFFEE score file had no per-sequence scores");
362 } catch (IOException e)
364 error(reader, "Unexpected problem parsing T-Coffee score ascii file");
371 private static void error(FileParse reader, String errm)
374 if (reader.errormessage == null)
376 reader.errormessage = errm;
380 reader.errormessage += "\n" + errm;
385 * Read a scores block ihe provided stream.
388 * The stream to parse
390 * The expected number of the sequence to be read
391 * @return The {@link Block} instance read or {link null} null if the end of
393 * @throws IOException
394 * Something went wrong on the 'wire'
396 static Block readBlock(FileParse reader, int size) throws IOException
398 Block result = new Block(size);
402 * read blank lines (eventually)
404 while ((line = reader.nextLine()) != null && "".equals(line.trim()))
406 // consume blank lines
415 * read the scores block
419 if ("".equals(line.trim()))
425 // split the line on the first blank
426 // the first part have to contain the sequence id
427 // the remaining part are the scores values
428 int p = line.indexOf(" ");
431 if (reader.warningMessage == null)
433 reader.warningMessage = "";
435 reader.warningMessage += "Possible parsing error - expected to find a space in line: '"
440 String id = line.substring(0, p).trim();
441 String val = line.substring(p + 1).trim();
443 result.items.put(id, val);
445 } while ((line = reader.nextLine()) != null);
451 * The score file header
459 LinkedHashMap<String, Integer> scores = new LinkedHashMap<String, Integer>();
461 public int getScoreAvg()
466 public int getScoreFor(String ID)
469 return scores.containsKey(ID) ? scores.get(ID) : -1;
475 * Hold a single block values block in the score file
481 Map<String, String> items;
483 public Block(int size)
486 this.items = new HashMap<String, String>(size);
489 String getScoresFor(String id)
491 return items.get(id);
494 String getConsensus()
496 return items.get("cons");
501 * TCOFFEE score colourscheme
503 static final Color[] colors =
504 { new Color(102, 102, 255), // #6666FF
505 new Color(0, 255, 0), // #00FF00
506 new Color(102, 255, 0), // #66FF00
507 new Color(204, 255, 0), // #CCFF00
508 new Color(255, 255, 0), // #FFFF00
509 new Color(255, 204, 0), // #FFCC00
510 new Color(255, 153, 0), // #FF9900
511 new Color(255, 102, 0), // #FF6600
512 new Color(255, 51, 0), // #FF3300
513 new Color(255, 34, 0) // #FF2000
516 public final static String TCOFFEE_SCORE = "TCoffeeScore";
519 * generate annotation for this TCoffee score set on the given alignment
522 * alignment to annotate
524 * if true, annotate sequences based on matching sequence names
525 * @return true if alignment annotation was modified, false otherwise.
527 public boolean annotateAlignment(AlignmentI al, boolean matchids)
529 if (al.getHeight() != getHeight() || al.getWidth() != getWidth())
531 warningMessage = "Alignment shape does not match T-Coffee score file shape.";
534 boolean added = false;
536 SequenceIdMatcher sidmatcher = new SequenceIdMatcher(
537 al.getSequencesArray());
538 byte[][] scoreMatrix = getScoresArray();
539 // for 2.8 - we locate any existing TCoffee annotation and remove it first
540 // before adding this.
541 for (Map.Entry<String, StringBuilder> id : scores.entrySet())
543 byte[] srow = scoreMatrix[i];
547 s = sidmatcher.findIdMatch(id.getKey());
551 s = al.getSequenceAt(i);
554 if (s == null && i != scores.size() && !id.getKey().equals("cons"))
556 System.err.println("No "
557 + (matchids ? "match " : " sequences left ")
558 + " for TCoffee score set : " + id.getKey());
561 int jSize = al.getWidth() < srow.length ? al.getWidth() : srow.length;
562 Annotation[] annotations = new Annotation[al.getWidth()];
563 for (int j = 0; j < jSize; j++)
566 if (s != null && jalview.util.Comparison.isGap(s.getCharAt(j)))
568 annotations[j] = null;
572 .println("Warning: non-zero value for positional T-COFFEE score for gap at "
573 + j + " in sequence " + s.getName());
578 annotations[j] = new Annotation(s == null ? "" + val : null,
579 s == null ? "" + val : null, '\0', val * 1f, val >= 0
580 && val < colors.length ? colors[val]
584 // this will overwrite any existing t-coffee scores for the alignment
585 AlignmentAnnotation aa = al.findOrCreateAnnotation(TCOFFEE_SCORE,
586 TCOFFEE_SCORE, false, s, null);
589 aa.label = "T-COFFEE";
590 aa.description = "" + id.getKey();
591 aa.annotations = annotations;
593 aa.belowAlignment = false;
594 aa.setScore(header.getScoreFor(id.getKey()));
595 aa.createSequenceMapping(s, s.getStart(), true);
596 s.addAlignmentAnnotation(aa);
597 aa.adjustForAlignment();
601 aa.graph = AlignmentAnnotation.NO_GRAPH;
602 aa.label = "T-COFFEE";
603 aa.description = "TCoffee column reliability score";
604 aa.annotations = annotations;
605 aa.belowAlignment = true;
607 aa.setScore(header.getScoreAvg());
609 aa.showAllColLabels = true;
610 aa.validateRangeAndDisplay();
618 public String print()
620 // TODO Auto-generated method stub