Add support RNAML format
[jalview.git] / src / jalview / io / TCoffeeScoreFile.java
1 package jalview.io;
2
3 import jalview.analysis.SequenceIdMatcher;
4 import jalview.datamodel.AlignmentAnnotation;
5 import jalview.datamodel.AlignmentI;
6 import jalview.datamodel.Annotation;
7 import jalview.datamodel.SequenceI;
8
9 import java.awt.Color;
10 import java.io.BufferedReader;
11 import java.io.File;
12 import java.io.FileNotFoundException;
13 import java.io.FileReader;
14 import java.io.IOException;
15 import java.io.Reader;
16 import java.util.ArrayList;
17 import java.util.HashMap;
18 import java.util.LinkedHashMap;
19 import java.util.List;
20 import java.util.Map;
21
22 import javax.xml.parsers.ParserConfigurationException;
23
24 import org.xml.sax.SAXException;
25
26 import fr.orsay.lri.varna.exceptions.ExceptionFileFormatOrSyntax;
27 import fr.orsay.lri.varna.exceptions.ExceptionLoadingFailed;
28 import fr.orsay.lri.varna.exceptions.ExceptionPermissionDenied;
29
30 /**
31  * A file parser for the T-Coffee score ASCII format. Such a file contains the alignment consensus 
32  * score for each residue of every sequence.
33  * <p>
34  * This file is produced by <code>t_coffee</code> when the option 
35  * <code>-output=score_ascii </code> is given on the program command line.
36  * 
37  * An example file is the following 
38  * 
39  * <pre>
40  * T-COFFEE, Version_9.02.r1228 (2012-02-16 18:15:12 - Revision 1228 - Build 336)
41  * Cedric Notredame 
42  * CPU TIME:0 sec.
43  * SCORE=90
44  * *
45  *  BAD AVG GOOD
46  * *
47  * 1PHT   :  89
48  * 1BB9   :  90
49  * 1UHC   :  94
50  * 1YCS   :  94
51  * 1OOT   :  93
52  * 1ABO   :  94
53  * 1FYN   :  94
54  * 1QCF   :  94
55  * cons   :  90
56  * 
57  * 1PHT   999999999999999999999999998762112222543211112134
58  * 1BB9   99999999999999999999999999987-------4322----2234
59  * 1UHC   99999999999999999999999999987-------5321----2246
60  * 1YCS   99999999999999999999999999986-------4321----1-35
61  * 1OOT   999999999999999999999999999861-------3------1135
62  * 1ABO   99999999999999999999999999986-------422-------34
63  * 1FYN   99999999999999999999999999985-------32--------35
64  * 1QCF   99999999999999999999999999974-------2---------24
65  * cons   999999999999999999999999999851000110321100001134
66  * 
67  * 
68  * 1PHT   ----------5666642367889999999999889
69  * 1BB9   1111111111676653-355679999999999889
70  * 1UHC   ----------788774--66789999999999889
71  * 1YCS   ----------78777--356789999999999889
72  * 1OOT   ----------78877--356789999999997-67
73  * 1ABO   ----------687774--56779999999999889
74  * 1FYN   ----------6888842356789999999999889
75  * 1QCF   ----------6878742356789999999999889
76  * cons   00100000006877641356789999999999889
77  * </pre>
78  * 
79  * 
80  * @author Paolo Di Tommaso
81  *
82  */
83 public class TCoffeeScoreFile extends AlignFile {
84         
85   public TCoffeeScoreFile(String inFile, String type) throws IOException, ExceptionFileFormatOrSyntax, ParserConfigurationException, SAXException, ExceptionPermissionDenied, ExceptionLoadingFailed
86   {
87     super(inFile, type);
88     
89   }
90
91   public TCoffeeScoreFile(FileParse source) throws IOException, ExceptionFileFormatOrSyntax, ParserConfigurationException, SAXException, ExceptionPermissionDenied, ExceptionLoadingFailed
92   {
93     super(source);
94   }
95
96         /** The {@link Header} structure holder */
97         Header header;
98         
99         /** 
100          * Holds the consensus values for each sequence. It uses a LinkedHashMap to maintain the 
101          * insertion order. 
102          */
103         LinkedHashMap<String,StringBuilder> scores;
104
105         Integer fWidth;
106         
107         /**
108          * Parse the provided reader for the T-Coffee scores file format
109          * 
110          * @param reader 
111         public static TCoffeeScoreFile load(Reader reader) {
112
113                 try {
114                         BufferedReader in = (BufferedReader) (reader instanceof BufferedReader ? reader : new BufferedReader(reader));
115                         TCoffeeScoreFile result = new TCoffeeScoreFile();
116                         result.doParsing(in);
117                         return result.header != null && result.scores != null ? result : null;
118                 }
119                 catch( Exception e) {
120                         throw new RuntimeException(e);
121                 }
122         }
123          */
124                 
125         /**
126          * @return The 'height' of the score matrix i.e. the numbers of score rows that should matches 
127          * the number of sequences in the alignment
128          */
129         public int getHeight() {
130                 // the last entry will always be the 'global' alingment consensus scores, so it is removed 
131                 // from the 'height' count to make this value compatible with the number of sequences in the MSA
132                 return scores != null && scores.size() > 0 ? scores.size()-1 : 0;
133         }
134         
135         /**
136          * @return The 'width' of the score matrix i.e. the number of columns. 
137          * Since teh score value are supposd to be calculated for an 'aligned' MSA, all the entries 
138          * have to have the same width.  
139          */
140         public int getWidth() {
141                 return fWidth != null ? fWidth : 0;
142         }
143         
144         
145         /**
146          * Get the string of score values for the specified seqeunce ID. 
147          * @param id The sequence ID 
148          * @return The scores as a string of values e.g. {@code 99999987-------432}. 
149          *      It return an empty string when the specified ID is missing. 
150          */
151         public String getScoresFor( String id ) {
152                 return scores!=null && scores.containsKey(id) ? scores.get(id).toString() : "";
153         }
154         
155         /**
156          * @return The list of score string as a {@link List} object, in the same ordeer of the insertion i.e. in the MSA
157          */
158         public List<String> getScoresList() {
159           if (scores==null)
160           {
161             return null;
162           }
163                 List<String> result = new ArrayList<String>( scores.size() );
164                 for( Map.Entry<String,StringBuilder> it : scores.entrySet() ) {
165                         result.add(it.getValue().toString());
166                 }
167                 
168                 return result;
169         }
170         
171         /**
172          * @return The parsed score values a matrix of bytes
173          */
174         public byte[][] getScoresArray() { 
175           if (scores==null)
176           {
177             return null;
178           }
179                 byte[][] result = new byte[ scores.size() ][];
180                 
181                 int rowCount = 0;
182                 for( Map.Entry<String,StringBuilder> it : scores.entrySet() ) {
183                         String line = it.getValue().toString();
184                         byte[] seqValues = new byte[ line.length() ];
185                         for( int j=0, c=line.length(); j<c; j++ ) {
186                                 
187                                 byte val = (byte)(line.charAt(j) - '0');
188
189                                 seqValues[j] = ( val >= 0 && val <= 9 ) ? val : -1; 
190                         }
191
192                         result[rowCount++] = seqValues;
193                 }
194                 
195                 return result;
196         }
197         
198
199         public void parse() throws IOException
200         {
201                 /*
202                  * read the header
203                  */
204                 header = readHeader(this);
205
206                 if( header == null ) { error=true; return;}
207                 scores = new LinkedHashMap<String,StringBuilder>();
208         
209                 /*
210                  * initilize the structure
211                  */
212                 for( Map.Entry<String,Integer> entry : header.scores.entrySet() ) {
213                         scores.put( entry.getKey(), new StringBuilder());
214                 }
215                 
216                 /*
217                  * go with the reading
218                  */
219                 Block block;
220                 while( (block = readBlock(this,header.scores.size())) != null  ) {
221                         
222                         /*
223                          * append sequences read in the block
224                          */
225                         for( Map.Entry<String,String> entry : block.items.entrySet() ) {
226                                 StringBuilder scoreStringBuilder = scores.get(entry.getKey());
227                                 if( scoreStringBuilder == null ) {
228                                         error=true;
229                                         errormessage=String.format("Invalid T-Coffee score file: Sequence ID '%s' is not declared in header section", entry.getKey());
230                                         return ;
231                                 }
232                                 
233                                 scoreStringBuilder.append( entry.getValue() );
234                         }
235                 }
236                 
237                 /*
238                  * verify that all rows have the same width
239                  */
240                 for( StringBuilder str : scores.values() ) {
241                         if( fWidth == null ) {
242                                 fWidth = str.length();
243                         }
244                         else if( fWidth != str.length() ) {
245                           error=true;
246                           errormessage="Invalid T-Coffee score file: All the score sequences must have the same length";
247                           return ;
248                         }
249                 }
250                 
251                 
252                 return;
253         }
254
255
256         static int parseInt( String str ) {
257                 try {
258                         return Integer.parseInt(str);
259                 }
260                 catch( NumberFormatException e ) {
261                         // TODO report a warning ?
262                         return 0;
263                 }               
264         }
265         
266         /**
267          * Reaad the header section in the T-Coffee score file format 
268          * 
269          * @param reader The scores reader 
270          * @return The parser {@link Header} instance 
271          * @throws RuntimeException when the header is not in the expected format
272          */
273         static Header readHeader(FileParse reader) throws IOException {
274                 
275                 Header result = null;
276                 try {
277                         result = new Header();
278                         result.head = reader.nextLine();
279                         
280                         String line;
281
282                         while( (line = reader.nextLine()) != null ) {
283                                 if( line.startsWith("SCORE=")) {
284                                         result.score = parseInt( line.substring(6).trim() );
285                                         break;
286                                 }
287                         }
288
289                         if( (line=reader.nextLine())==null || !"*".equals(line.trim())) { error(reader,"Invalid T-COFFEE score format (NO BAD/AVG/GOOD header)"); return null;}
290                         if( (line=reader.nextLine())==null || !"BAD AVG GOOD".equals(line.trim())) { error(reader,"Invalid T-COFFEE score format (NO BAD/AVG/GOOD header)"); return null;}
291                         if( (line=reader.nextLine())==null || !"*".equals(line.trim())) {error(reader,"Invalid T-COFFEE score format (NO BAD/AVG/GOOD header)"); return null;}
292                         
293                         /*
294                          * now are expected a list if sequences ID up to the first blank line
295                          */
296                         while( (line=reader.nextLine()) != null ) {
297                                 if( "".equals(line) ) {
298                                         break;
299                                 }
300                                 
301                                 int p = line.indexOf(":");
302                                 if( p == -1 ) {
303                                         // TODO report a warning
304                                         continue;
305                                 }
306                                 
307                                 String id = line.substring(0,p).trim();
308                                 int val = parseInt(line.substring(p+1).trim());
309                                 if( "".equals(id) ) {
310                                         // TODO report warning
311                                         continue;
312                                 }
313                                 
314                                 result.scores.put(id,val);
315                         }
316                         
317                         if (result==null) {
318                           error(reader, "T-COFFEE score file had no per-sequence scores");
319                         }
320                         
321                 }
322                 catch( IOException e ) {
323                   error(reader,"Unexpected problem parsing T-Coffee score ascii file");
324                   throw e;
325                 }
326                 
327                 return result;
328         } 
329         private static void error(FileParse reader, String errm)
330         {
331           reader.error=true;
332           if (reader.errormessage==null)
333           { reader.errormessage=errm;
334           } else {
335             reader.errormessage+="\n"+errm;
336           }
337         }
338         /**
339          * Read a scores block ihe provided stream. 
340          * 
341          * @param reader The stream to parse
342          * @param size The expected number of the sequence to be read 
343          * @return The {@link Block} instance read or {link null} null if the end of file has reached.
344          * @throws IOException Something went wrong on the 'wire' 
345          */
346         static Block readBlock( FileParse reader, int size ) throws IOException {
347                 Block result = new Block(size);
348                 String line;
349                 
350                 /*
351                  * read blank lines (eventually)
352                  */
353                 while( (line=reader.nextLine()) != null && "".equals(line.trim())) {
354                         // consume blank lines 
355                 }
356                 
357                 if( line == null ) { return null; }
358                 
359                 /*
360                  * read the scores block
361                  */
362                 do {
363                         if( "".equals(line.trim()) ) {
364                                 // terminated
365                                 break;
366                         }
367                         
368                         // split the line on the first blank 
369                         // the first part have to contain the sequence id
370                         // the remaining part are the scores values
371                         int p = line.indexOf(" ");
372                         if( p == -1 ) {
373                           if (reader.warningMessage==null) { reader.warningMessage=""; }
374                           reader.warningMessage+="Possible parsing error - expected to find a space in line: '"+line+"'\n";
375                                 continue;
376                         } 
377                         
378                         String id = line.substring(0,p).trim();
379                         String val = line.substring(p+1).trim();
380                         
381                         result.items.put(id, val);
382                         
383                 } while( (line = reader.nextLine()) != null ); 
384                 
385
386                 return result;
387         }
388
389         /*
390          * The score file header 
391          */
392         static class Header {
393                 String head;
394                 int score;
395
396                 LinkedHashMap<String,Integer> scores = new LinkedHashMap<String,Integer>();
397                 
398                 public int getScoreAvg() { return score; }
399                 
400                 public int getScoreFor( String ID ) { 
401
402                         return scores.containsKey(ID) ? scores.get(ID) : -1;
403         
404                 }
405         }
406         
407         /*
408          * Hold a single block values block in the score file
409          */
410         static class Block {
411                 int size;
412                 Map<String,String> items;
413                 
414                 public Block( int size ) {
415                         this.size = size;
416                         this.items = new HashMap<String,String>(size);
417                 } 
418         
419                 String getScoresFor( String id ) {
420                         return items.get(id);
421                 }
422                 
423                 String getConsensus() {
424                         return items.get("cons");
425                 }
426         }
427         /**
428          * TCOFFEE score colourscheme
429          */
430         static final Color[] colors = {
431                         new Color( 102, 102, 255 ),     // #6666FF
432                         new Color( 0, 255, 0),          // #00FF00
433                         new Color( 102, 255, 0),        // #66FF00
434                         new Color( 204, 255, 0),        // #CCFF00
435                         new Color( 255, 255, 0),        // #FFFF00
436                         new Color( 255, 204, 0),        // #FFCC00
437                         new Color( 255, 153, 0),        // #FF9900
438                         new Color( 255, 102, 0),        // #FF6600
439                         new Color( 255, 51, 0),         // #FF3300
440                         new Color( 255, 34, 0)          // #FF2000
441                 };
442         public final static String TCOFFEE_SCORE="TCoffeeScore";
443         /**
444          * generate annotation for this TCoffee score set on the given alignment
445          * @param al alignment to annotate
446          * @param matchids if true, annotate sequences based on matching sequence names
447          * @return true if alignment annotation was modified, false otherwise.
448          */
449         public boolean annotateAlignment(AlignmentI al, boolean matchids)
450         {
451           if (al.getHeight()!=getHeight() || al.getWidth()!=getWidth())
452           {
453             warningMessage="Alignment shape does not match T-Coffee score file shape.";
454             return false;
455           }
456           boolean added=false;
457           int i=0;
458           SequenceIdMatcher sidmatcher = new SequenceIdMatcher(al.getSequencesArray());
459           byte[][] scoreMatrix=getScoresArray();
460           // for 2.8 - we locate any existing TCoffee annotation and remove it first before adding this.
461           for (Map.Entry<String,StringBuilder> id:scores.entrySet())
462           {
463             byte[] srow=scoreMatrix[i];
464             SequenceI s;
465             if (matchids)
466             {
467               s=sidmatcher.findIdMatch(id.getKey());
468             } else {
469               s=al.getSequenceAt(i);
470             }
471             i++;
472             if (s==null && i!=scores.size() && !id.getKey().equals("cons"))
473             {
474               System.err.println("No "+(matchids ? "match ":" sequences left ")+" for TCoffee score set : "+id.getKey());
475               continue;
476             }
477             int jSize=al.getWidth()< srow.length ? al.getWidth() : srow.length;
478             Annotation[] annotations=new Annotation[al.getWidth()];
479             for (int j=0;j<jSize;j++) {
480               byte val = srow[j];
481               if (s!=null && jalview.util.Comparison.isGap(s.getCharAt(j)))
482               {
483                 annotations[j]=null;
484                 if (val>0)
485                 {
486                   System.err.println("Warning: non-zero value for positional T-COFFEE score for gap at "+j+" in sequence "+s.getName());
487                 }
488               } else {
489               annotations[j]=new Annotation(s==null ? ""+val:null,s==null ? ""+val:null,'\0',val*1f,val >= 0 && val < colors.length ? colors[val] : Color.white);
490               }
491             }
492             // this will overwrite any existing t-coffee scores for the alignment
493             AlignmentAnnotation aa=al.findOrCreateAnnotation(TCOFFEE_SCORE,false,s,null);
494             if (s!=null)
495             {
496               aa.label="T-COFFEE";
497               aa.description=""+id.getKey();
498               aa.annotations=annotations;
499               aa.visible=false;
500               aa.belowAlignment=false;
501               aa.setScore(header.getScoreFor(id.getKey()));
502               aa.createSequenceMapping(s, s.getStart(),true);
503               s.addAlignmentAnnotation(aa);
504               aa.adjustForAlignment();
505             } else {
506               aa.graph=AlignmentAnnotation.NO_GRAPH;
507               aa.label="T-COFFEE";
508               aa.description="TCoffee column reliability score";
509               aa.annotations=annotations;
510               aa.belowAlignment=true;
511               aa.visible=true;
512               aa.setScore(header.getScoreAvg());
513             }
514             aa.showAllColLabels=true;
515             aa.validateRangeAndDisplay();
516             added=true;
517           }
518           
519           return added;
520         }
521
522   @Override
523   public String print()
524   {
525     // TODO Auto-generated method stub
526     return "Not valid.";
527   }
528 }