JAL-1159 patch for building menard branch on cruisecontrol
[jalview.git] / src / jalview / io / TCoffeeScoreFile.java
1 package jalview.io;
2
3 import jalview.analysis.SequenceIdMatcher;
4 import jalview.datamodel.AlignmentAnnotation;
5 import jalview.datamodel.AlignmentI;
6 import jalview.datamodel.Annotation;
7 import jalview.datamodel.SequenceI;
8
9 import java.awt.Color;
10 import java.io.BufferedReader;
11 import java.io.File;
12 import java.io.FileNotFoundException;
13 import java.io.FileReader;
14 import java.io.IOException;
15 import java.io.Reader;
16 import java.util.ArrayList;
17 import java.util.HashMap;
18 import java.util.LinkedHashMap;
19 import java.util.List;
20 import java.util.Map;
21
22 import javax.xml.parsers.ParserConfigurationException;
23
24 import org.xml.sax.SAXException;
25
26 import fr.orsay.lri.varna.exceptions.ExceptionFileFormatOrSyntax;
27 import fr.orsay.lri.varna.exceptions.ExceptionLoadingFailed;
28 import fr.orsay.lri.varna.exceptions.ExceptionPermissionDenied;
29 import fr.orsay.lri.varna.exceptions.ExceptionUnmatchedClosingParentheses;
30
31 /**
 * A file parser for the T-Coffee score ascii format. Such a file contains the alignment consensus 
 * score for each residue in every sequence.
 * <p>
 * The file is produced by <code>t_coffee</code> when the option 
 * <code>-output=score_ascii</code> is given on the program command line.
37  * 
38  * An example file is the following 
39  * 
40  * <pre>
41  * T-COFFEE, Version_9.02.r1228 (2012-02-16 18:15:12 - Revision 1228 - Build 336)
42  * Cedric Notredame 
43  * CPU TIME:0 sec.
44  * SCORE=90
45  * *
46  *  BAD AVG GOOD
47  * *
48  * 1PHT   :  89
49  * 1BB9   :  90
50  * 1UHC   :  94
51  * 1YCS   :  94
52  * 1OOT   :  93
53  * 1ABO   :  94
54  * 1FYN   :  94
55  * 1QCF   :  94
56  * cons   :  90
57  * 
58  * 1PHT   999999999999999999999999998762112222543211112134
59  * 1BB9   99999999999999999999999999987-------4322----2234
60  * 1UHC   99999999999999999999999999987-------5321----2246
61  * 1YCS   99999999999999999999999999986-------4321----1-35
62  * 1OOT   999999999999999999999999999861-------3------1135
63  * 1ABO   99999999999999999999999999986-------422-------34
64  * 1FYN   99999999999999999999999999985-------32--------35
65  * 1QCF   99999999999999999999999999974-------2---------24
66  * cons   999999999999999999999999999851000110321100001134
67  * 
68  * 
69  * 1PHT   ----------5666642367889999999999889
70  * 1BB9   1111111111676653-355679999999999889
71  * 1UHC   ----------788774--66789999999999889
72  * 1YCS   ----------78777--356789999999999889
73  * 1OOT   ----------78877--356789999999997-67
74  * 1ABO   ----------687774--56779999999999889
75  * 1FYN   ----------6888842356789999999999889
76  * 1QCF   ----------6878742356789999999999889
77  * cons   00100000006877641356789999999999889
78  * </pre>
79  * 
80  * 
81  * @author Paolo Di Tommaso
82  *
83  */
84 public class TCoffeeScoreFile extends AlignFile {
85         
86   public TCoffeeScoreFile(String inFile, String type) throws IOException, ExceptionFileFormatOrSyntax, ParserConfigurationException, SAXException, ExceptionPermissionDenied, ExceptionLoadingFailed, InterruptedException, ExceptionUnmatchedClosingParentheses
87   {
88     super(inFile, type);
89     
90   }
91
92   public TCoffeeScoreFile(FileParse source) throws IOException, ExceptionFileFormatOrSyntax, ParserConfigurationException, SAXException, ExceptionPermissionDenied, ExceptionLoadingFailed, InterruptedException, ExceptionUnmatchedClosingParentheses
93   {
94     super(source);
95   }
96
        /**
         * The {@link Header} structure holder; assigned by {@link #parse()} and
         * {@code null} when the header could not be read.
         */
        Header header;
        
        /** 
         * Holds the consensus values for each sequence, keyed by sequence ID. It uses a
         * LinkedHashMap to maintain the insertion order. 
         */
        LinkedHashMap<String,StringBuilder> scores;

        /**
         * The common length of all score rows; set by the width check at the end of
         * {@link #parse()} and {@code null} until then.
         */
        Integer fWidth;
107         
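        /**
         * DISABLED CODE: the static factory method below is commented out and is kept
         * for reference only.
         * It parsed the provided reader for the T-Coffee scores file format.
         * 
         * @param reader the reader to parse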
112         public static TCoffeeScoreFile load(Reader reader) {
113
114                 try {
115                         BufferedReader in = (BufferedReader) (reader instanceof BufferedReader ? reader : new BufferedReader(reader));
116                         TCoffeeScoreFile result = new TCoffeeScoreFile();
117                         result.doParsing(in);
118                         return result.header != null && result.scores != null ? result : null;
119                 }
120                 catch( Exception e) {
121                         throw new RuntimeException(e);
122                 }
123         }
124          */
125                 
126         /**
127          * @return The 'height' of the score matrix i.e. the numbers of score rows that should matches 
128          * the number of sequences in the alignment
129          */
130         public int getHeight() {
131                 // the last entry will always be the 'global' alingment consensus scores, so it is removed 
132                 // from the 'height' count to make this value compatible with the number of sequences in the MSA
133                 return scores != null && scores.size() > 0 ? scores.size()-1 : 0;
134         }
135         
136         /**
137          * @return The 'width' of the score matrix i.e. the number of columns. 
138          * Since teh score value are supposd to be calculated for an 'aligned' MSA, all the entries 
139          * have to have the same width.  
140          */
141         public int getWidth() {
142                 return fWidth != null ? fWidth : 0;
143         }
144         
145         
146         /**
147          * Get the string of score values for the specified seqeunce ID. 
148          * @param id The sequence ID 
149          * @return The scores as a string of values e.g. {@code 99999987-------432}. 
150          *      It return an empty string when the specified ID is missing. 
151          */
152         public String getScoresFor( String id ) {
153                 return scores!=null && scores.containsKey(id) ? scores.get(id).toString() : "";
154         }
155         
156         /**
157          * @return The list of score string as a {@link List} object, in the same ordeer of the insertion i.e. in the MSA
158          */
159         public List<String> getScoresList() {
160           if (scores==null)
161           {
162             return null;
163           }
164                 List<String> result = new ArrayList<String>( scores.size() );
165                 for( Map.Entry<String,StringBuilder> it : scores.entrySet() ) {
166                         result.add(it.getValue().toString());
167                 }
168                 
169                 return result;
170         }
171         
172         /**
173          * @return The parsed score values a matrix of bytes
174          */
175         public byte[][] getScoresArray() { 
176           if (scores==null)
177           {
178             return null;
179           }
180                 byte[][] result = new byte[ scores.size() ][];
181                 
182                 int rowCount = 0;
183                 for( Map.Entry<String,StringBuilder> it : scores.entrySet() ) {
184                         String line = it.getValue().toString();
185                         byte[] seqValues = new byte[ line.length() ];
186                         for( int j=0, c=line.length(); j<c; j++ ) {
187                                 
188                                 byte val = (byte)(line.charAt(j) - '0');
189
190                                 seqValues[j] = ( val >= 0 && val <= 9 ) ? val : -1; 
191                         }
192
193                         result[rowCount++] = seqValues;
194                 }
195                 
196                 return result;
197         }
198         
199
200         public void parse() throws IOException
201         {
202                 /*
203                  * read the header
204                  */
205                 header = readHeader(this);
206
207                 if( header == null ) { error=true; return;}
208                 scores = new LinkedHashMap<String,StringBuilder>();
209         
210                 /*
211                  * initilize the structure
212                  */
213                 for( Map.Entry<String,Integer> entry : header.scores.entrySet() ) {
214                         scores.put( entry.getKey(), new StringBuilder());
215                 }
216                 
217                 /*
218                  * go with the reading
219                  */
220                 Block block;
221                 while( (block = readBlock(this,header.scores.size())) != null  ) {
222                         
223                         /*
224                          * append sequences read in the block
225                          */
226                         for( Map.Entry<String,String> entry : block.items.entrySet() ) {
227                                 StringBuilder scoreStringBuilder = scores.get(entry.getKey());
228                                 if( scoreStringBuilder == null ) {
229                                         error=true;
230                                         errormessage=String.format("Invalid T-Coffee score file: Sequence ID '%s' is not declared in header section", entry.getKey());
231                                         return ;
232                                 }
233                                 
234                                 scoreStringBuilder.append( entry.getValue() );
235                         }
236                 }
237                 
238                 /*
239                  * verify that all rows have the same width
240                  */
241                 for( StringBuilder str : scores.values() ) {
242                         if( fWidth == null ) {
243                                 fWidth = str.length();
244                         }
245                         else if( fWidth != str.length() ) {
246                           error=true;
247                           errormessage="Invalid T-Coffee score file: All the score sequences must have the same length";
248                           return ;
249                         }
250                 }
251                 
252                 
253                 return;
254         }
255
256
257         static int parseInt( String str ) {
258                 try {
259                         return Integer.parseInt(str);
260                 }
261                 catch( NumberFormatException e ) {
262                         // TODO report a warning ?
263                         return 0;
264                 }               
265         }
266         
267         /**
268          * Reaad the header section in the T-Coffee score file format 
269          * 
270          * @param reader The scores reader 
271          * @return The parser {@link Header} instance 
272          * @throws RuntimeException when the header is not in the expected format
273          */
274         static Header readHeader(FileParse reader) throws IOException {
275                 
276                 Header result = null;
277                 try {
278                         result = new Header();
279                         result.head = reader.nextLine();
280                         
281                         String line;
282
283                         while( (line = reader.nextLine()) != null ) {
284                                 if( line.startsWith("SCORE=")) {
285                                         result.score = parseInt( line.substring(6).trim() );
286                                         break;
287                                 }
288                         }
289
290                         if( (line=reader.nextLine())==null || !"*".equals(line.trim())) { error(reader,"Invalid T-COFFEE score format (NO BAD/AVG/GOOD header)"); return null;}
291                         if( (line=reader.nextLine())==null || !"BAD AVG GOOD".equals(line.trim())) { error(reader,"Invalid T-COFFEE score format (NO BAD/AVG/GOOD header)"); return null;}
292                         if( (line=reader.nextLine())==null || !"*".equals(line.trim())) {error(reader,"Invalid T-COFFEE score format (NO BAD/AVG/GOOD header)"); return null;}
293                         
294                         /*
295                          * now are expected a list if sequences ID up to the first blank line
296                          */
297                         while( (line=reader.nextLine()) != null ) {
298                                 if( "".equals(line) ) {
299                                         break;
300                                 }
301                                 
302                                 int p = line.indexOf(":");
303                                 if( p == -1 ) {
304                                         // TODO report a warning
305                                         continue;
306                                 }
307                                 
308                                 String id = line.substring(0,p).trim();
309                                 int val = parseInt(line.substring(p+1).trim());
310                                 if( "".equals(id) ) {
311                                         // TODO report warning
312                                         continue;
313                                 }
314                                 
315                                 result.scores.put(id,val);
316                         }
317                         
318                         if (result==null) {
319                           error(reader, "T-COFFEE score file had no per-sequence scores");
320                         }
321                         
322                 }
323                 catch( IOException e ) {
324                   error(reader,"Unexpected problem parsing T-Coffee score ascii file");
325                   throw e;
326                 }
327                 
328                 return result;
329         } 
330         private static void error(FileParse reader, String errm)
331         {
332           reader.error=true;
333           if (reader.errormessage==null)
334           { reader.errormessage=errm;
335           } else {
336             reader.errormessage+="\n"+errm;
337           }
338         }
339         /**
340          * Read a scores block ihe provided stream. 
341          * 
342          * @param reader The stream to parse
343          * @param size The expected number of the sequence to be read 
344          * @return The {@link Block} instance read or {link null} null if the end of file has reached.
345          * @throws IOException Something went wrong on the 'wire' 
346          */
347         static Block readBlock( FileParse reader, int size ) throws IOException {
348                 Block result = new Block(size);
349                 String line;
350                 
351                 /*
352                  * read blank lines (eventually)
353                  */
354                 while( (line=reader.nextLine()) != null && "".equals(line.trim())) {
355                         // consume blank lines 
356                 }
357                 
358                 if( line == null ) { return null; }
359                 
360                 /*
361                  * read the scores block
362                  */
363                 do {
364                         if( "".equals(line.trim()) ) {
365                                 // terminated
366                                 break;
367                         }
368                         
369                         // split the line on the first blank 
370                         // the first part have to contain the sequence id
371                         // the remaining part are the scores values
372                         int p = line.indexOf(" ");
373                         if( p == -1 ) {
374                           if (reader.warningMessage==null) { reader.warningMessage=""; }
375                           reader.warningMessage+="Possible parsing error - expected to find a space in line: '"+line+"'\n";
376                                 continue;
377                         } 
378                         
379                         String id = line.substring(0,p).trim();
380                         String val = line.substring(p+1).trim();
381                         
382                         result.items.put(id, val);
383                         
384                 } while( (line = reader.nextLine()) != null ); 
385                 
386
387                 return result;
388         }
389
390         /*
391          * The score file header 
392          */
393         static class Header {
394                 String head;
395                 int score;
396
397                 LinkedHashMap<String,Integer> scores = new LinkedHashMap<String,Integer>();
398                 
399                 public int getScoreAvg() { return score; }
400                 
401                 public int getScoreFor( String ID ) { 
402
403                         return scores.containsKey(ID) ? scores.get(ID) : -1;
404         
405                 }
406         }
407         
408         /*
409          * Hold a single block values block in the score file
410          */
411         static class Block {
412                 int size;
413                 Map<String,String> items;
414                 
415                 public Block( int size ) {
416                         this.size = size;
417                         this.items = new HashMap<String,String>(size);
418                 } 
419         
420                 String getScoresFor( String id ) {
421                         return items.get(id);
422                 }
423                 
424                 String getConsensus() {
425                         return items.get("cons");
426                 }
427         }
        /**
         * TCOFFEE score colourscheme: one colour per score value, indexed by the
         * score (0 to 9).
         */
        static final Color[] colors = {
                        new Color( 102, 102, 255 ),     // #6666FF
                        new Color( 0, 255, 0),          // #00FF00
                        new Color( 102, 255, 0),        // #66FF00
                        new Color( 204, 255, 0),        // #CCFF00
                        new Color( 255, 255, 0),        // #FFFF00
                        new Color( 255, 204, 0),        // #FFCC00
                        new Color( 255, 153, 0),        // #FF9900
                        new Color( 255, 102, 0),        // #FF6600
                        new Color( 255, 51, 0),         // #FF3300
                        new Color( 255, 34, 0)          // #FF2200 (34 == 0x22; NOTE(review): comment previously read #FF2000 - confirm which value is intended)
                };
        /** Name passed twice to findOrCreateAnnotation when annotating an alignment with these scores. */
        public final static String TCOFFEE_SCORE="TCoffeeScore";
        /**
         * Generate annotation for this TCoffee score set on the given alignment.
         * <p>
         * One annotation row is produced per score row. A row for which a sequence is found
         * is attached to that sequence; a row without a sequence that is either the last row
         * or named "cons" becomes an alignment-level annotation. Any other row without a
         * sequence is reported on System.err and skipped.
         *
         * @param al alignment to annotate
         * @param matchids if true, annotate sequences based on matching sequence names;
         *                 otherwise score rows are paired with sequences by position
         * @return true if alignment annotation was modified, false otherwise (including when
         *         the alignment's height or width differs from that of the score matrix, in
         *         which case warningMessage is set).
         */
        public boolean annotateAlignment(AlignmentI al, boolean matchids)
        {
          // the alignment must have exactly the shape of the score matrix (consensus row excluded)
          if (al.getHeight()!=getHeight() || al.getWidth()!=getWidth())
          {
            warningMessage="Alignment shape does not match T-Coffee score file shape.";
            return false;
          }
          boolean added=false;
          // index of the current score row; also the sequence position when matchids is false
          int i=0;
          SequenceIdMatcher sidmatcher = new SequenceIdMatcher(al.getSequencesArray());
          byte[][] scoreMatrix=getScoresArray();
          // for 2.8 - we locate any existing TCoffee annotation and remove it first before adding this.
          // NOTE(review): no explicit removal happens in this method - presumably
          // findOrCreateAnnotation below reuses an existing row; confirm.
          // NOTE(review): scores is dereferenced without a null check; reachable only if the
          // shape test above passes with no parsed scores (a 0x0 alignment) - confirm.
          for (Map.Entry<String,StringBuilder> id:scores.entrySet())
          {
            byte[] srow=scoreMatrix[i];
            SequenceI s;
            if (matchids)
            {
              s=sidmatcher.findIdMatch(id.getKey());
            } else {
              s=al.getSequenceAt(i);
            }
            // incremented before the test below, so i==scores.size() identifies the last row
            i++;
            // no sequence for a row that is neither the last one nor named "cons": report and skip
            if (s==null && i!=scores.size() && !id.getKey().equals("cons"))
            {
              System.err.println("No "+(matchids ? "match ":" sequences left ")+" for TCoffee score set : "+id.getKey());
              continue;
            }
            // only columns present in both the alignment and the score row are filled in
            int jSize=al.getWidth()< srow.length ? al.getWidth() : srow.length;
            Annotation[] annotations=new Annotation[al.getWidth()];
            for (int j=0;j<jSize;j++) {
              byte val = srow[j];
              if (s!=null && jalview.util.Comparison.isGap(s.getCharAt(j)))
              {
                // gap positions of a sequence carry no annotation; a positive score there is only reported
                annotations[j]=null;
                if (val>0)
                {
                  System.err.println("Warning: non-zero value for positional T-COFFEE score for gap at "+j+" in sequence "+s.getName());
                }
              } else {
              // the score is passed as text in the first two arguments only for the sequence-less
              // row; the colour comes from the colors table, or white when val is outside it
              annotations[j]=new Annotation(s==null ? ""+val:null,s==null ? ""+val:null,'\0',val*1f,val >= 0 && val < colors.length ? colors[val] : Color.white);
              }
            }
            // this will overwrite any existing t-coffee scores for the alignment
            AlignmentAnnotation aa=al.findOrCreateAnnotation(TCOFFEE_SCORE,TCOFFEE_SCORE,false,s, null);
            if (s!=null)
            {
              // per-sequence row: hidden, scored with this ID's header value, mapped onto the sequence
              aa.label="T-COFFEE";
              aa.description=""+id.getKey();
              aa.annotations=annotations;
              aa.visible=false;
              aa.belowAlignment=false;
              aa.setScore(header.getScoreFor(id.getKey()));
              aa.createSequenceMapping(s, s.getStart(),true);
              s.addAlignmentAnnotation(aa);
              aa.adjustForAlignment();
            } else {
              // sequence-less row: visible, shown below the alignment, scored with the overall header score
              aa.graph=AlignmentAnnotation.NO_GRAPH;
              aa.label="T-COFFEE";
              aa.description="TCoffee column reliability score";
              aa.annotations=annotations;
              aa.belowAlignment=true;
              aa.visible=true;
              aa.setScore(header.getScoreAvg());
            }
            aa.showAllColLabels=true;
            aa.validateRangeAndDisplay();
            added=true;
          }
          
          return added;
        }
522
523   @Override
524   public String print()
525   {
526     // TODO Auto-generated method stub
527     return "Not valid.";
528   }
529 }