in progress
[jalview.git] / forester / java / src / org / forester / archaeopteryx / tools / PhylogeneticInferrer.java
1 // $Id:
2 // forester -- software libraries and applications
3 // for genomics and evolutionary biology research.
4 //
5 // Copyright (C) 2010 Christian M Zmasek
6 // Copyright (C) 2010 Sanford-Burnham Medical Research Institute
7 // All rights reserved
8 //
9 // This library is free software; you can redistribute it and/or
10 // modify it under the terms of the GNU Lesser General Public
11 // License as published by the Free Software Foundation; either
12 // version 2.1 of the License, or (at your option) any later version.
13 //
14 // This library is distributed in the hope that it will be useful,
15 // but WITHOUT ANY WARRANTY; without even the implied warranty of
16 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 // Lesser General Public License for more details.
18 //
19 // You should have received a copy of the GNU Lesser General Public
20 // License along with this library; if not, write to the Free Software
21 // Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA
22 //
23 // Contact: phylosoft @ gmail . com
24 // WWW: www.phylosoft.org/forester
25
26 package org.forester.archaeopteryx.tools;
27
28 import java.io.BufferedWriter;
29 import java.io.File;
30 import java.io.FileWriter;
31 import java.io.IOException;
32 import java.util.ArrayList;
33 import java.util.List;
34 import java.util.regex.Matcher;
35
36 import javax.swing.JOptionPane;
37
38 import org.forester.archaeopteryx.AptxUtil;
39 import org.forester.archaeopteryx.MainFrameApplication;
40 import org.forester.evoinference.distance.NeighborJoining;
41 import org.forester.evoinference.distance.PairwiseDistanceCalculator;
42 import org.forester.evoinference.matrix.distance.BasicSymmetricalDistanceMatrix;
43 import org.forester.evoinference.tools.BootstrapResampler;
44 import org.forester.io.parsers.FastaParser;
45 import org.forester.msa.BasicMsa;
46 import org.forester.msa.ClustalOmega;
47 import org.forester.msa.Mafft;
48 import org.forester.msa.Msa;
49 import org.forester.msa.Msa.MSA_FORMAT;
50 import org.forester.msa.MsaInferrer;
51 import org.forester.msa.MsaMethods;
52 import org.forester.msa.ResampleableMsa;
53 import org.forester.phylogeny.Phylogeny;
54 import org.forester.phylogeny.PhylogenyNode;
55 import org.forester.phylogeny.data.Accession;
56 import org.forester.phylogeny.iterators.PhylogenyNodeIterator;
57 import org.forester.sequence.Sequence;
58 import org.forester.tools.ConfidenceAssessor;
59 import org.forester.util.ForesterUtil;
60
61 public class PhylogeneticInferrer extends RunnableProcess {
62
63     private Msa                                _msa;
64     private final MainFrameApplication         _mf;
65     private final PhylogeneticInferenceOptions _options;
66     private final List<Sequence>               _seqs;
67     private final boolean                      DEBUG           = true;
68     public final static String                 MSA_FILE_SUFFIX = ".aln";
69     public final static String                 PWD_FILE_SUFFIX = ".pwd";
70
71     public PhylogeneticInferrer( final List<Sequence> seqs,
72                                  final PhylogeneticInferenceOptions options,
73                                  final MainFrameApplication mf ) {
74         _msa = null;
75         _seqs = seqs;
76         _mf = mf;
77         _options = options;
78     }
79
80     public PhylogeneticInferrer( final Msa msa,
81                                  final PhylogeneticInferenceOptions options,
82                                  final MainFrameApplication mf ) {
83         _msa = msa;
84         _seqs = null;
85         _mf = mf;
86         _options = options;
87     }
88
89     private Msa inferMsa( final MSA_PRG msa_prg ) throws IOException, InterruptedException {
90         //        final File temp_seqs_file = File.createTempFile( "__msa__temp__", ".fasta" );
91         //        if ( DEBUG ) {
92         //            System.out.println();
93         //            System.out.println( "temp file: " + temp_seqs_file );
94         //            System.out.println();
95         //        }
96         //        //final File temp_seqs_file = new File( _options.getTempDir() + ForesterUtil.FILE_SEPARATOR + "s.fasta" );
97         //        final BufferedWriter writer = new BufferedWriter( new FileWriter( temp_seqs_file ) );
98         //        SequenceWriter.writeSeqs( _seqs, writer, SEQ_FORMAT.FASTA, 100 );
99         //        writer.close();
100         switch ( msa_prg ) {
101             case MAFFT: 
102                 return runMAFFT( _seqs, processMafftOptions() );
103                 
104             case CLUSTAL_O:
105                 return runClustalOmega( _seqs, processMafftOptions() );
106             default:
107                 return null;
108         }
109         
110        
111     }
112
113     private List<String> processMafftOptions() {
114         final String opts_str = _options.getMsaPrgParameters().trim().toLowerCase();
115         final String[] opts_ary = opts_str.split( " " );
116         final List<String> opts = new ArrayList<String>();
117         boolean saw_quiet = false;
118         for( final String opt : opts_ary ) {
119             opts.add( opt );
120             if ( opt.equals( "--quiet" ) ) {
121                 saw_quiet = true;
122             }
123         }
124         if ( !saw_quiet ) {
125             opts.add( "--quiet" );
126         }
127         return opts;
128     }
129
130     private Phylogeny inferPhylogeny( final Msa msa ) {
131         BasicSymmetricalDistanceMatrix m = null;
132         switch ( _options.getPwdDistanceMethod() ) {
133             case KIMURA_DISTANCE:
134                 m = PairwiseDistanceCalculator.calcKimuraDistances( msa );
135                 break;
136             case POISSON_DISTANCE:
137                 m = PairwiseDistanceCalculator.calcPoissonDistances( msa );
138                 break;
139             case FRACTIONAL_DISSIMILARITY:
140                 m = PairwiseDistanceCalculator.calcFractionalDissimilarities( msa );
141                 break;
142             default:
143                 throw new RuntimeException( "invalid pwd method" );
144         }
145         if ( !ForesterUtil.isEmpty( _options.getIntermediateFilesBase() ) ) {
146             BufferedWriter pwd_writer;
147             try {
148                 pwd_writer = new BufferedWriter( new FileWriter( _options.getIntermediateFilesBase() + PWD_FILE_SUFFIX ) );
149                 m.write( pwd_writer );
150                 pwd_writer.close();
151             }
152             catch ( final IOException e ) {
153                 // TODO Auto-generated catch block
154                 e.printStackTrace();
155             }
156         }
157         final NeighborJoining nj = NeighborJoining.createInstance();
158         final Phylogeny phy = nj.execute( m );
159         PhylogeneticInferrer.extractFastaInformation( phy );
160         return phy;
161     }
162
163     private void infer() throws InterruptedException {
164         //_mf.getMainPanel().getCurrentTreePanel().setWaitCursor();
165         if ( ( _msa == null ) && ( _seqs == null ) ) {
166             throw new IllegalArgumentException( "cannot run phylogenetic analysis with null msa and seq array" );
167         }
168         start( _mf, "phylogenetic inference" );
169         if ( _msa == null ) {
170             Msa msa = null;
171             try {
172                 msa = inferMsa( MSA_PRG.MAFFT );
173             }
174             catch ( final IOException e ) {
175                 end( _mf );
176                 JOptionPane.showMessageDialog( _mf,
177                                                "Could not create multiple sequence alignment with \""
178                                                        + _options.getMsaPrg() + "\" and the following parameters:\n\""
179                                                        + _options.getMsaPrgParameters() + "\"\nError: "
180                                                        + e.getLocalizedMessage(),
181                                                "Failed to Calculate MSA",
182                                                JOptionPane.ERROR_MESSAGE );
183                 if ( DEBUG ) {
184                     e.printStackTrace();
185                 }
186                 return;
187             }
188             catch ( final Exception e ) {
189                 end( _mf );
190                 JOptionPane.showMessageDialog( _mf,
191                                                "Could not create multiple sequence alignment with \""
192                                                        + _options.getMsaPrg() + "\" and the following parameters:\n\""
193                                                        + _options.getMsaPrgParameters() + "\"\nError: "
194                                                        + e.getLocalizedMessage(),
195                                                "Unexpected Exception During MSA Calculation",
196                                                JOptionPane.ERROR_MESSAGE );
197                 if ( DEBUG ) {
198                     e.printStackTrace();
199                 }
200                 return;
201             }
202             if ( msa == null ) {
203                 end( _mf );
204                 JOptionPane.showMessageDialog( _mf,
205                                                "Could not create multiple sequence alignment with "
206                                                        + _options.getMsaPrg() + "\nand the following parameters:\n\""
207                                                        + _options.getMsaPrgParameters() + "\"",
208                                                "Failed to Calculate MSA",
209                                                JOptionPane.ERROR_MESSAGE );
210                 return;
211             }
212             if ( DEBUG ) {
213                 System.out.println( msa.toString() );
214                 System.out.println( MsaMethods.calcGapRatio( msa ) );
215             }
216             final MsaMethods msa_tools = MsaMethods.createInstance();
217             if ( _options.isExecuteMsaProcessing() ) {
218                 msa = msa_tools.removeGapColumns( _options.getMsaProcessingMaxAllowedGapRatio(),
219                                                   _options.getMsaProcessingMinAllowedLength(),
220                                                   msa );
221                 if ( msa == null ) {
222                     end( _mf );
223                     JOptionPane.showMessageDialog( _mf,
224                                                    "Less than two sequences longer than "
225                                                            + _options.getMsaProcessingMinAllowedLength()
226                                                            + " residues left after MSA processing",
227                                                    "MSA Processing Settings Too Stringent",
228                                                    JOptionPane.ERROR_MESSAGE );
229                     return;
230                 }
231             }
232             if ( DEBUG ) {
233                 System.out.println( msa_tools.getIgnoredSequenceIds() );
234                 System.out.println( msa.toString() );
235                 System.out.println( MsaMethods.calcGapRatio( msa ) );
236             }
237             _msa = msa;
238         }
239         final int n = _options.getBootstrapSamples();
240         final long seed = _options.getRandomNumberGeneratorSeed();
241         final Phylogeny master_phy = inferPhylogeny( _msa );
242         if ( _options.isPerformBootstrapResampling() && ( n > 0 ) ) {
243             final ResampleableMsa resampleable_msa = new ResampleableMsa( ( BasicMsa ) _msa );
244             final int[][] resampled_column_positions = BootstrapResampler.createResampledColumnPositions( _msa
245                     .getLength(), n, seed );
246             final Phylogeny[] eval_phys = new Phylogeny[ n ];
247             for( int i = 0; i < n; ++i ) {
248                 resampleable_msa.resample( resampled_column_positions[ i ] );
249                 eval_phys[ i ] = inferPhylogeny( resampleable_msa );
250             }
251             ConfidenceAssessor.evaluate( "bootstrap", eval_phys, master_phy, true, 1 );
252         }
253         _mf.getMainPanel().addPhylogenyInNewTab( master_phy, _mf.getConfiguration(), "nj", "njpath" );
254         //  _mf.getMainPanel().getCurrentTreePanel().setArrowCursor();
255         end( _mf );
256         JOptionPane.showMessageDialog( _mf,
257                                        "Inference successfully completed",
258                                        "Inference Completed",
259                                        JOptionPane.INFORMATION_MESSAGE );
260     }
261
262     @Override
263     public void run() {
264         try {
265             infer();
266         }
267         catch ( final InterruptedException e ) {
268             // TODO need to handle this exception SOMEHOW!
269             // TODO Auto-generated catch block
270             e.printStackTrace();
271         }
272     }
273
274     private Msa runMAFFT( final List<Sequence> seqs, final List<String> opts ) throws IOException, InterruptedException {
275         Msa msa = null;
276         final MsaInferrer mafft = Mafft.createInstance( _mf.getInferenceManager().getPathToLocalMafft()
277                 .getCanonicalPath() );
278         try {
279             msa = mafft.infer( seqs, opts );
280         }
281         catch ( final IOException e ) {
282             System.out.println( mafft.getErrorDescription() );
283         }
284         return msa;
285     }
286
287     private Msa runClustalOmega( final List<Sequence> seqs, final List<String> opts ) throws IOException,
288             InterruptedException {
289         Msa msa = null;
290         final MsaInferrer clustalo = ClustalOmega.createInstance( _mf.getInferenceManager().getPathToLocalClustalo()
291                 .getCanonicalPath() );
292         try {
293             msa = clustalo.infer( seqs, opts );
294         }
295         catch ( final IOException e ) {
296             System.out.println( clustalo.getErrorDescription() );
297         }
298         return msa;
299     }
300
301     private void writeToFiles( final BasicSymmetricalDistanceMatrix m ) {
302         if ( !ForesterUtil.isEmpty( _options.getIntermediateFilesBase() ) ) {
303             try {
304                 final BufferedWriter msa_writer = new BufferedWriter( new FileWriter( _options.getIntermediateFilesBase()
305                         + MSA_FILE_SUFFIX ) );
306                 _msa.write( msa_writer, MSA_FORMAT.PHYLIP );
307                 msa_writer.close();
308                 final BufferedWriter pwd_writer = new BufferedWriter( new FileWriter( _options.getIntermediateFilesBase()
309                         + PWD_FILE_SUFFIX ) );
310                 m.write( pwd_writer );
311                 pwd_writer.close();
312             }
313             catch ( final Exception e ) {
314                 System.out.println( "Error: " + e.getMessage() );
315             }
316         }
317     }
318
319     public static void extractFastaInformation( final Phylogeny phy ) {
320         for( final PhylogenyNodeIterator iter = phy.iteratorExternalForward(); iter.hasNext(); ) {
321             final PhylogenyNode node = iter.next();
322             if ( !ForesterUtil.isEmpty( node.getName() ) ) {
323                 final Matcher name_m = FastaParser.FASTA_DESC_LINE.matcher( node.getName() );
324                 if ( name_m.lookingAt() ) {
325                     System.out.println();
326                     // System.out.println( name_m.group( 1 ) );
327                     // System.out.println( name_m.group( 2 ) );
328                     // System.out.println( name_m.group( 3 ) );
329                     // System.out.println( name_m.group( 4 ) );
330                     final String acc_source = name_m.group( 1 );
331                     final String acc = name_m.group( 2 );
332                     final String seq_name = name_m.group( 3 );
333                     final String tax_sn = name_m.group( 4 );
334                     if ( !ForesterUtil.isEmpty( acc_source ) && !ForesterUtil.isEmpty( acc ) ) {
335                         AptxUtil.ensurePresenceOfSequence( node );
336                         node.getNodeData().getSequence( 0 ).setAccession( new Accession( acc, acc_source ) );
337                     }
338                     if ( !ForesterUtil.isEmpty( seq_name ) ) {
339                         AptxUtil.ensurePresenceOfSequence( node );
340                         node.getNodeData().getSequence( 0 ).setName( seq_name );
341                     }
342                     if ( !ForesterUtil.isEmpty( tax_sn ) ) {
343                         AptxUtil.ensurePresenceOfTaxonomy( node );
344                         node.getNodeData().getTaxonomy( 0 ).setScientificName( tax_sn );
345                     }
346                 }
347             }
348         }
349     }
350     
351     public enum MSA_PRG {
352         MAFFT, CLUSTAL_O;
353     }
354 }