moved to: https://sites.google.com/site/cmzmasek/home/software/forester
[jalview.git] / forester / java / src / org / forester / archaeopteryx / tools / PhylogeneticInferrer.java
1 // $Id:
2 // forester -- software libraries and applications
3 // for genomics and evolutionary biology research.
4 //
5 // Copyright (C) 2010 Christian M Zmasek
6 // Copyright (C) 2010 Sanford-Burnham Medical Research Institute
7 // All rights reserved
8 //
9 // This library is free software; you can redistribute it and/or
10 // modify it under the terms of the GNU Lesser General Public
11 // License as published by the Free Software Foundation; either
12 // version 2.1 of the License, or (at your option) any later version.
13 //
14 // This library is distributed in the hope that it will be useful,
15 // but WITHOUT ANY WARRANTY; without even the implied warranty of
16 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 // Lesser General Public License for more details.
18 //
19 // You should have received a copy of the GNU Lesser General Public
20 // License along with this library; if not, write to the Free Software
21 // Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA
22 //
23 // Contact: phylosoft @ gmail . com
24 // WWW: https://sites.google.com/site/cmzmasek/home/software/forester
25
26 package org.forester.archaeopteryx.tools;
27
28 import java.io.BufferedWriter;
29 import java.io.FileWriter;
30 import java.io.IOException;
31 import java.util.ArrayList;
32 import java.util.List;
33 import java.util.regex.Matcher;
34
35 import javax.swing.JOptionPane;
36
37 import org.forester.archaeopteryx.MainFrameApplication;
38 import org.forester.evoinference.distance.NeighborJoining;
39 import org.forester.evoinference.distance.PairwiseDistanceCalculator;
40 import org.forester.evoinference.matrix.distance.BasicSymmetricalDistanceMatrix;
41 import org.forester.evoinference.tools.BootstrapResampler;
42 import org.forester.io.parsers.FastaParser;
43 import org.forester.msa.BasicMsa;
44 import org.forester.msa.ClustalOmega;
45 import org.forester.msa.Mafft;
46 import org.forester.msa.Msa;
47 import org.forester.msa.Msa.MSA_FORMAT;
48 import org.forester.msa.MsaInferrer;
49 import org.forester.msa.MsaMethods;
50 import org.forester.msa.ResampleableMsa;
51 import org.forester.phylogeny.Phylogeny;
52 import org.forester.phylogeny.PhylogenyNode;
53 import org.forester.phylogeny.data.Accession;
54 import org.forester.phylogeny.iterators.PhylogenyNodeIterator;
55 import org.forester.sequence.Sequence;
56 import org.forester.tools.ConfidenceAssessor;
57 import org.forester.util.ForesterUtil;
58
59 public class PhylogeneticInferrer extends RunnableProcess {
60
61     private Msa                                _msa;
62     private final MainFrameApplication         _mf;
63     private final PhylogeneticInferenceOptions _options;
64     private final List<Sequence>               _seqs;
65     private final boolean                      DEBUG           = true;
66     public final static String                 MSA_FILE_SUFFIX = ".aln";
67     public final static String                 PWD_FILE_SUFFIX = ".pwd";
68
69     public PhylogeneticInferrer( final List<Sequence> seqs,
70                                  final PhylogeneticInferenceOptions options,
71                                  final MainFrameApplication mf ) {
72         _msa = null;
73         _seqs = seqs;
74         _mf = mf;
75         _options = options;
76     }
77
78     public PhylogeneticInferrer( final Msa msa,
79                                  final PhylogeneticInferenceOptions options,
80                                  final MainFrameApplication mf ) {
81         _msa = msa;
82         _seqs = null;
83         _mf = mf;
84         _options = options;
85     }
86
87     private Msa inferMsa( final MSA_PRG msa_prg ) throws IOException, InterruptedException {
88         //        final File temp_seqs_file = File.createTempFile( "__msa__temp__", ".fasta" );
89         //        if ( DEBUG ) {
90         //            System.out.println();
91         //            System.out.println( "temp file: " + temp_seqs_file );
92         //            System.out.println();
93         //        }
94         //        //final File temp_seqs_file = new File( _options.getTempDir() + ForesterUtil.FILE_SEPARATOR + "s.fasta" );
95         //        final BufferedWriter writer = new BufferedWriter( new FileWriter( temp_seqs_file ) );
96         //        SequenceWriter.writeSeqs( _seqs, writer, SEQ_FORMAT.FASTA, 100 );
97         //        writer.close();
98         switch ( msa_prg ) {
99             case MAFFT:
100                 return runMAFFT( _seqs, processMafftOptions() );
101             case CLUSTAL_O:
102                 return runClustalOmega( _seqs, processMafftOptions() );
103             default:
104                 return null;
105         }
106     }
107
108     private List<String> processMafftOptions() {
109         final String opts_str = _options.getMsaPrgParameters().trim().toLowerCase();
110         final String[] opts_ary = opts_str.split( " " );
111         final List<String> opts = new ArrayList<String>();
112         boolean saw_quiet = false;
113         for( final String opt : opts_ary ) {
114             opts.add( opt );
115             if ( opt.equals( "--quiet" ) ) {
116                 saw_quiet = true;
117             }
118         }
119         if ( !saw_quiet ) {
120             opts.add( "--quiet" );
121         }
122         return opts;
123     }
124
125     private Phylogeny inferPhylogeny( final Msa msa ) {
126         BasicSymmetricalDistanceMatrix m = null;
127         switch ( _options.getPwdDistanceMethod() ) {
128             case KIMURA_DISTANCE:
129                 m = PairwiseDistanceCalculator.calcKimuraDistances( msa );
130                 break;
131             case POISSON_DISTANCE:
132                 m = PairwiseDistanceCalculator.calcPoissonDistances( msa );
133                 break;
134             case FRACTIONAL_DISSIMILARITY:
135                 m = PairwiseDistanceCalculator.calcFractionalDissimilarities( msa );
136                 break;
137             default:
138                 throw new RuntimeException( "invalid pwd method" );
139         }
140         if ( !ForesterUtil.isEmpty( _options.getIntermediateFilesBase() ) ) {
141             BufferedWriter pwd_writer;
142             try {
143                 pwd_writer = new BufferedWriter( new FileWriter( _options.getIntermediateFilesBase() + PWD_FILE_SUFFIX ) );
144                 m.write( pwd_writer );
145                 pwd_writer.close();
146             }
147             catch ( final IOException e ) {
148                 // TODO Auto-generated catch block
149                 e.printStackTrace();
150             }
151         }
152         final NeighborJoining nj = NeighborJoining.createInstance();
153         final Phylogeny phy = nj.execute( m );
154         PhylogeneticInferrer.extractFastaInformation( phy );
155         return phy;
156     }
157
158     private void infer() throws InterruptedException {
159         //_mf.getMainPanel().getCurrentTreePanel().setWaitCursor();
160         if ( ( _msa == null ) && ( _seqs == null ) ) {
161             throw new IllegalArgumentException( "cannot run phylogenetic analysis with null msa and seq array" );
162         }
163         start( _mf, "phylogenetic inference" );
164         if ( _msa == null ) {
165             Msa msa = null;
166             try {
167                 msa = inferMsa( MSA_PRG.MAFFT );
168             }
169             catch ( final IOException e ) {
170                 end( _mf );
171                 JOptionPane.showMessageDialog( _mf,
172                                                "Could not create multiple sequence alignment with \""
173                                                        + _options.getMsaPrg() + "\" and the following parameters:\n\""
174                                                        + _options.getMsaPrgParameters() + "\"\nError: "
175                                                        + e.getLocalizedMessage(),
176                                                "Failed to Calculate MSA",
177                                                JOptionPane.ERROR_MESSAGE );
178                 if ( DEBUG ) {
179                     e.printStackTrace();
180                 }
181                 return;
182             }
183             catch ( final Exception e ) {
184                 end( _mf );
185                 JOptionPane.showMessageDialog( _mf,
186                                                "Could not create multiple sequence alignment with \""
187                                                        + _options.getMsaPrg() + "\" and the following parameters:\n\""
188                                                        + _options.getMsaPrgParameters() + "\"\nError: "
189                                                        + e.getLocalizedMessage(),
190                                                "Unexpected Exception During MSA Calculation",
191                                                JOptionPane.ERROR_MESSAGE );
192                 if ( DEBUG ) {
193                     e.printStackTrace();
194                 }
195                 return;
196             }
197             if ( msa == null ) {
198                 end( _mf );
199                 JOptionPane.showMessageDialog( _mf,
200                                                "Could not create multiple sequence alignment with "
201                                                        + _options.getMsaPrg() + "\nand the following parameters:\n\""
202                                                        + _options.getMsaPrgParameters() + "\"",
203                                                "Failed to Calculate MSA",
204                                                JOptionPane.ERROR_MESSAGE );
205                 return;
206             }
207             if ( DEBUG ) {
208                 System.out.println( msa.toString() );
209                 System.out.println( MsaMethods.calcGapRatio( msa ) );
210             }
211             final MsaMethods msa_tools = MsaMethods.createInstance();
212             if ( _options.isExecuteMsaProcessing() ) {
213                 msa = msa_tools.removeGapColumns( _options.getMsaProcessingMaxAllowedGapRatio(),
214                                                   _options.getMsaProcessingMinAllowedLength(),
215                                                   msa );
216                 if ( msa == null ) {
217                     end( _mf );
218                     JOptionPane.showMessageDialog( _mf,
219                                                    "Less than two sequences longer than "
220                                                            + _options.getMsaProcessingMinAllowedLength()
221                                                            + " residues left after MSA processing",
222                                                    "MSA Processing Settings Too Stringent",
223                                                    JOptionPane.ERROR_MESSAGE );
224                     return;
225                 }
226             }
227             if ( DEBUG ) {
228                 System.out.println( msa_tools.getIgnoredSequenceIds() );
229                 System.out.println( msa.toString() );
230                 System.out.println( MsaMethods.calcGapRatio( msa ) );
231             }
232             _msa = msa;
233         }
234         final int n = _options.getBootstrapSamples();
235         final long seed = _options.getRandomNumberGeneratorSeed();
236         final Phylogeny master_phy = inferPhylogeny( _msa );
237         if ( _options.isPerformBootstrapResampling() && ( n > 0 ) ) {
238             final ResampleableMsa resampleable_msa = new ResampleableMsa( ( BasicMsa ) _msa );
239             final int[][] resampled_column_positions = BootstrapResampler.createResampledColumnPositions( _msa
240                     .getLength(), n, seed );
241             final Phylogeny[] eval_phys = new Phylogeny[ n ];
242             for( int i = 0; i < n; ++i ) {
243                 resampleable_msa.resample( resampled_column_positions[ i ] );
244                 eval_phys[ i ] = inferPhylogeny( resampleable_msa );
245             }
246             ConfidenceAssessor.evaluate( "bootstrap", eval_phys, master_phy, true, 1 );
247         }
248         _mf.getMainPanel().addPhylogenyInNewTab( master_phy, _mf.getConfiguration(), "nj", "njpath" );
249         //  _mf.getMainPanel().getCurrentTreePanel().setArrowCursor();
250         end( _mf );
251         JOptionPane.showMessageDialog( _mf,
252                                        "Inference successfully completed",
253                                        "Inference Completed",
254                                        JOptionPane.INFORMATION_MESSAGE );
255     }
256
257     @Override
258     public void run() {
259         try {
260             infer();
261         }
262         catch ( final InterruptedException e ) {
263             // TODO need to handle this exception SOMEHOW!
264             // TODO Auto-generated catch block
265             e.printStackTrace();
266         }
267     }
268
269     private Msa runMAFFT( final List<Sequence> seqs, final List<String> opts ) throws IOException, InterruptedException {
270         Msa msa = null;
271         final MsaInferrer mafft = Mafft.createInstance( _mf.getInferenceManager().getPathToLocalMafft()
272                 .getCanonicalPath() );
273         try {
274             msa = mafft.infer( seqs, opts );
275         }
276         catch ( final IOException e ) {
277             System.out.println( mafft.getErrorDescription() );
278         }
279         return msa;
280     }
281
282     private Msa runClustalOmega( final List<Sequence> seqs, final List<String> opts ) throws IOException,
283             InterruptedException {
284         Msa msa = null;
285         final MsaInferrer clustalo = ClustalOmega.createInstance( _mf.getInferenceManager().getPathToLocalClustalo()
286                 .getCanonicalPath() );
287         try {
288             msa = clustalo.infer( seqs, opts );
289         }
290         catch ( final IOException e ) {
291             System.out.println( clustalo.getErrorDescription() );
292         }
293         return msa;
294     }
295
296     private void writeToFiles( final BasicSymmetricalDistanceMatrix m ) {
297         if ( !ForesterUtil.isEmpty( _options.getIntermediateFilesBase() ) ) {
298             try {
299                 final BufferedWriter msa_writer = new BufferedWriter( new FileWriter( _options.getIntermediateFilesBase()
300                         + MSA_FILE_SUFFIX ) );
301                 _msa.write( msa_writer, MSA_FORMAT.PHYLIP );
302                 msa_writer.close();
303                 final BufferedWriter pwd_writer = new BufferedWriter( new FileWriter( _options.getIntermediateFilesBase()
304                         + PWD_FILE_SUFFIX ) );
305                 m.write( pwd_writer );
306                 pwd_writer.close();
307             }
308             catch ( final Exception e ) {
309                 System.out.println( "Error: " + e.getMessage() );
310             }
311         }
312     }
313
314     public static void extractFastaInformation( final Phylogeny phy ) {
315         for( final PhylogenyNodeIterator iter = phy.iteratorExternalForward(); iter.hasNext(); ) {
316             final PhylogenyNode node = iter.next();
317             if ( !ForesterUtil.isEmpty( node.getName() ) ) {
318                 final Matcher name_m = FastaParser.FASTA_DESC_LINE.matcher( node.getName() );
319                 if ( name_m.lookingAt() ) {
320                     System.out.println();
321                     // System.out.println( name_m.group( 1 ) );
322                     // System.out.println( name_m.group( 2 ) );
323                     // System.out.println( name_m.group( 3 ) );
324                     // System.out.println( name_m.group( 4 ) );
325                     final String acc_source = name_m.group( 1 );
326                     final String acc = name_m.group( 2 );
327                     final String seq_name = name_m.group( 3 );
328                     final String tax_sn = name_m.group( 4 );
329                     if ( !ForesterUtil.isEmpty( acc_source ) && !ForesterUtil.isEmpty( acc ) ) {
330                         ForesterUtil.ensurePresenceOfSequence( node );
331                         node.getNodeData().getSequence( 0 ).setAccession( new Accession( acc, acc_source ) );
332                     }
333                     if ( !ForesterUtil.isEmpty( seq_name ) ) {
334                         ForesterUtil.ensurePresenceOfSequence( node );
335                         node.getNodeData().getSequence( 0 ).setName( seq_name );
336                     }
337                     if ( !ForesterUtil.isEmpty( tax_sn ) ) {
338                         ForesterUtil.ensurePresenceOfTaxonomy( node );
339                         node.getNodeData().getTaxonomy( 0 ).setScientificName( tax_sn );
340                     }
341                 }
342             }
343         }
344     }
345
346     public enum MSA_PRG {
347         MAFFT, CLUSTAL_O;
348     }
349 }