inference
[jalview.git] / forester / java / src / org / forester / archaeopteryx / tools / PhylogeneticInferrer.java
1 // $Id:
2 // forester -- software libraries and applications
3 // for genomics and evolutionary biology research.
4 //
5 // Copyright (C) 2010 Christian M Zmasek
6 // Copyright (C) 2010 Sanford-Burnham Medical Research Institute
7 // All rights reserved
8 //
9 // This library is free software; you can redistribute it and/or
10 // modify it under the terms of the GNU Lesser General Public
11 // License as published by the Free Software Foundation; either
12 // version 2.1 of the License, or (at your option) any later version.
13 //
14 // This library is distributed in the hope that it will be useful,
15 // but WITHOUT ANY WARRANTY; without even the implied warranty of
16 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 // Lesser General Public License for more details.
18 //
19 // You should have received a copy of the GNU Lesser General Public
20 // License along with this library; if not, write to the Free Software
21 // Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA
22 //
23 // Contact: phylosoft @ gmail . com
24 // WWW: www.phylosoft.org/forester
25
26 package org.forester.archaeopteryx.tools;
27
28 import java.io.BufferedWriter;
29 import java.io.File;
30 import java.io.FileWriter;
31 import java.io.IOException;
32 import java.util.ArrayList;
33 import java.util.List;
34 import java.util.regex.Matcher;
35
36 import javax.swing.JOptionPane;
37
38 import org.forester.archaeopteryx.AptxUtil;
39 import org.forester.archaeopteryx.MainFrameApplication;
40 import org.forester.evoinference.distance.NeighborJoining;
41 import org.forester.evoinference.distance.PairwiseDistanceCalculator;
42 import org.forester.evoinference.matrix.distance.BasicSymmetricalDistanceMatrix;
43 import org.forester.evoinference.tools.BootstrapResampler;
44 import org.forester.io.parsers.FastaParser;
45 import org.forester.io.writers.SequenceWriter;
46 import org.forester.io.writers.SequenceWriter.SEQ_FORMAT;
47 import org.forester.msa.BasicMsa;
48 import org.forester.msa.ClustalOmega;
49 import org.forester.msa.Mafft;
50 import org.forester.msa.Msa;
51 import org.forester.msa.MsaInferrer;
52 import org.forester.msa.MsaMethods;
53 import org.forester.msa.ResampleableMsa;
54 import org.forester.phylogeny.Phylogeny;
55 import org.forester.phylogeny.PhylogenyNode;
56 import org.forester.phylogeny.data.Accession;
57 import org.forester.phylogeny.iterators.PhylogenyNodeIterator;
58 import org.forester.sequence.Sequence;
59 import org.forester.tools.ConfidenceAssessor;
60 import org.forester.util.ForesterUtil;
61
62 public class PhylogeneticInferrer extends RunnableProcess {
63
64     private Msa                                _msa;
65     private final MainFrameApplication         _mf;
66     private final PhylogeneticInferenceOptions _options;
67     private final List<Sequence>               _seqs;
68     private final boolean                      DEBUG           = true;
69     public final static String                 MSA_FILE_SUFFIX = ".aln";
70     public final static String                 PWD_FILE_SUFFIX = ".pwd";
71
72     public PhylogeneticInferrer( final List<Sequence> seqs,
73                                  final PhylogeneticInferenceOptions options,
74                                  final MainFrameApplication mf ) {
75         _msa = null;
76         _seqs = seqs;
77         _mf = mf;
78         _options = options;
79     }
80
81     public PhylogeneticInferrer( final Msa msa,
82                                  final PhylogeneticInferenceOptions options,
83                                  final MainFrameApplication mf ) {
84         _msa = msa;
85         _seqs = null;
86         _mf = mf;
87         _options = options;
88     }
89
90     private Msa inferMsa() throws IOException, InterruptedException {
91         final File temp_seqs_file = File.createTempFile( "__msa__temp__", ".fasta" );
92         if ( DEBUG ) {
93             System.out.println();
94             System.out.println( "temp file: " + temp_seqs_file );
95             System.out.println();
96         }
97         //final File temp_seqs_file = new File( _options.getTempDir() + ForesterUtil.FILE_SEPARATOR + "s.fasta" );
98         final BufferedWriter writer = new BufferedWriter( new FileWriter( temp_seqs_file ) );
99         SequenceWriter.writeSeqs( _seqs, writer, SEQ_FORMAT.FASTA, 100 );
100         writer.close();
101         final List<String> opts = processMafftOptions();
102         return runMAFFT( temp_seqs_file, opts );
103     }
104
105     private List<String> processMafftOptions() {
106         final String opts_str = _options.getMsaPrgParameters().trim().toLowerCase();
107         final String[] opts_ary = opts_str.split( " " );
108         final List<String> opts = new ArrayList<String>();
109         boolean saw_quiet = false;
110         for( final String opt : opts_ary ) {
111             opts.add( opt );
112             if ( opt.equals( "--quiet" ) ) {
113                 saw_quiet = true;
114             }
115         }
116         if ( !saw_quiet ) {
117             opts.add( "--quiet" );
118         }
119         return opts;
120     }
121
122     private Phylogeny inferPhylogeny( final Msa msa ) {
123         BasicSymmetricalDistanceMatrix m = null;
124         switch ( _options.getPwdDistanceMethod() ) {
125             case KIMURA_DISTANCE:
126                 m = PairwiseDistanceCalculator.calcKimuraDistances( msa );
127                 break;
128             case POISSON_DISTANCE:
129                 m = PairwiseDistanceCalculator.calcPoissonDistances( msa );
130                 break;
131             case FRACTIONAL_DISSIMILARITY:
132                 m = PairwiseDistanceCalculator.calcFractionalDissimilarities( msa );
133                 break;
134             default:
135                 throw new RuntimeException( "invalid pwd method" );
136         }
137         if ( !ForesterUtil.isEmpty( _options.getIntermediateFilesBase() ) ) {
138             BufferedWriter pwd_writer;
139             try {
140                 pwd_writer = new BufferedWriter( new FileWriter( _options.getIntermediateFilesBase() + PWD_FILE_SUFFIX ) );
141                 m.write( pwd_writer );
142                 pwd_writer.close();
143             }
144             catch ( final IOException e ) {
145                 // TODO Auto-generated catch block
146                 e.printStackTrace();
147             }
148         }
149         final NeighborJoining nj = NeighborJoining.createInstance();
150         final Phylogeny phy = nj.execute( m );
151         PhylogeneticInferrer.extractFastaInformation( phy );
152         return phy;
153     }
154
155     private void infer() throws InterruptedException {
156         //_mf.getMainPanel().getCurrentTreePanel().setWaitCursor();
157         if ( ( _msa == null ) && ( _seqs == null ) ) {
158             throw new IllegalArgumentException( "cannot run phylogenetic analysis with null msa and seq array" );
159         }
160         start( _mf, "phylogenetic inference" );
161         if ( _msa == null ) {
162             Msa msa = null;
163             try {
164                 msa = inferMsa();
165             }
166             catch ( final IOException e ) {
167                 end( _mf );
168                 JOptionPane.showMessageDialog( _mf,
169                                                "Could not create multiple sequence alignment with \""
170                                                        + _options.getMsaPrg() + "\" and the following parameters:\n\""
171                                                        + _options.getMsaPrgParameters() + "\"\nError: "
172                                                        + e.getLocalizedMessage(),
173                                                "Failed to Calculate MSA",
174                                                JOptionPane.ERROR_MESSAGE );
175                 if ( DEBUG ) {
176                     e.printStackTrace();
177                 }
178                 return;
179             }
180             catch ( final Exception e ) {
181                 end( _mf );
182                 JOptionPane.showMessageDialog( _mf,
183                                                "Could not create multiple sequence alignment with \""
184                                                        + _options.getMsaPrg() + "\" and the following parameters:\n\""
185                                                        + _options.getMsaPrgParameters() + "\"\nError: "
186                                                        + e.getLocalizedMessage(),
187                                                "Unexpected Exception During MSA Calculation",
188                                                JOptionPane.ERROR_MESSAGE );
189                 if ( DEBUG ) {
190                     e.printStackTrace();
191                 }
192                 return;
193             }
194             if ( msa == null ) {
195                 end( _mf );
196                 JOptionPane.showMessageDialog( _mf,
197                                                "Could not create multiple sequence alignment with "
198                                                        + _options.getMsaPrg() + "\nand the following parameters:\n\""
199                                                        + _options.getMsaPrgParameters() + "\"",
200                                                "Failed to Calculate MSA",
201                                                JOptionPane.ERROR_MESSAGE );
202                 return;
203             }
204             if ( DEBUG ) {
205                 System.out.println( msa.toString() );
206                 System.out.println( MsaMethods.calcBasicGapinessStatistics( msa ).toString() );
207             }
208             final MsaMethods msa_tools = MsaMethods.createInstance();
209             if ( _options.isExecuteMsaProcessing() ) {
210                 msa = msa_tools.removeGapColumns( _options.getMsaProcessingMaxAllowedGapRatio(),
211                                                   _options.getMsaProcessingMinAllowedLength(),
212                                                   msa );
213                 if ( msa == null ) {
214                     end( _mf );
215                     JOptionPane.showMessageDialog( _mf,
216                                                    "Less than two sequences longer than "
217                                                            + _options.getMsaProcessingMinAllowedLength()
218                                                            + " residues left after MSA processing",
219                                                    "MSA Processing Settings Too Stringent",
220                                                    JOptionPane.ERROR_MESSAGE );
221                     return;
222                 }
223             }
224             if ( DEBUG ) {
225                 System.out.println( msa_tools.getIgnoredSequenceIds() );
226                 System.out.println( msa.toString() );
227                 System.out.println( MsaMethods.calcBasicGapinessStatistics( msa ).toString() );
228             }
229             _msa = msa;
230         }
231         final int n = _options.getBootstrapSamples();
232         final long seed = _options.getRandomNumberGeneratorSeed();
233         final Phylogeny master_phy = inferPhylogeny( _msa );
234         if ( _options.isPerformBootstrapResampling() && ( n > 0 ) ) {
235             final ResampleableMsa resampleable_msa = new ResampleableMsa( ( BasicMsa ) _msa );
236             final int[][] resampled_column_positions = BootstrapResampler.createResampledColumnPositions( _msa
237                     .getLength(), n, seed );
238             final Phylogeny[] eval_phys = new Phylogeny[ n ];
239             for( int i = 0; i < n; ++i ) {
240                 resampleable_msa.resample( resampled_column_positions[ i ] );
241                 eval_phys[ i ] = inferPhylogeny( resampleable_msa );
242             }
243             ConfidenceAssessor.evaluate( "bootstrap", eval_phys, master_phy, true, 1 );
244         }
245         _mf.getMainPanel().addPhylogenyInNewTab( master_phy, _mf.getConfiguration(), "nj", "njpath" );
246         //  _mf.getMainPanel().getCurrentTreePanel().setArrowCursor();
247         end( _mf );
248         JOptionPane.showMessageDialog( _mf,
249                                        "Inference successfully completed",
250                                        "Inference Completed",
251                                        JOptionPane.INFORMATION_MESSAGE );
252     }
253
254     @Override
255     public void run() {
256         try {
257             infer();
258         }
259         catch ( final InterruptedException e ) {
260             // TODO need to handle this exception SOMEHOW!
261             // TODO Auto-generated catch block
262             e.printStackTrace();
263         }
264     }
265
266     private Msa runMAFFT( final File input_seqs, final List<String> opts ) throws IOException, InterruptedException {
267         Msa msa = null;
268         final MsaInferrer mafft = Mafft.createInstance( _mf.getInferenceManager().getPathToLocalMafft().getCanonicalPath());
269         try {
270             msa = mafft.infer( input_seqs, opts );
271         }
272         catch ( final IOException e ) {
273             System.out.println( mafft.getErrorDescription() );
274         }
275         return msa;
276     }
277     
278     private Msa runClustalOmega( final File input_seqs, final List<String> opts ) throws IOException, InterruptedException {
279         Msa msa = null;
280         final MsaInferrer clustalo = ClustalOmega.createInstance(_mf.getInferenceManager().getPathToLocalClustalo().getCanonicalPath());
281         try {
282             msa = clustalo.infer( input_seqs, opts );
283         }
284         catch ( final IOException e ) {
285             System.out.println( clustalo.getErrorDescription() );
286         }
287         return msa;
288     }
289
290     private void writeToFiles( final BasicSymmetricalDistanceMatrix m ) {
291         if ( !ForesterUtil.isEmpty( _options.getIntermediateFilesBase() ) ) {
292             try {
293                 final BufferedWriter msa_writer = new BufferedWriter( new FileWriter( _options.getIntermediateFilesBase()
294                         + MSA_FILE_SUFFIX ) );
295                 _msa.write( msa_writer );
296                 msa_writer.close();
297                 final BufferedWriter pwd_writer = new BufferedWriter( new FileWriter( _options.getIntermediateFilesBase()
298                         + PWD_FILE_SUFFIX ) );
299                 m.write( pwd_writer );
300                 pwd_writer.close();
301             }
302             catch ( final Exception e ) {
303                 System.out.println( "Error: " + e.getMessage() );
304             }
305         }
306     }
307
308     public static void extractFastaInformation( final Phylogeny phy ) {
309         for( final PhylogenyNodeIterator iter = phy.iteratorExternalForward(); iter.hasNext(); ) {
310             final PhylogenyNode node = iter.next();
311             if ( !ForesterUtil.isEmpty( node.getName() ) ) {
312                 final Matcher name_m = FastaParser.FASTA_DESC_LINE.matcher( node.getName() );
313                 if ( name_m.lookingAt() ) {
314                     System.out.println();
315                     // System.out.println( name_m.group( 1 ) );
316                     // System.out.println( name_m.group( 2 ) );
317                     // System.out.println( name_m.group( 3 ) );
318                     // System.out.println( name_m.group( 4 ) );
319                     final String acc_source = name_m.group( 1 );
320                     final String acc = name_m.group( 2 );
321                     final String seq_name = name_m.group( 3 );
322                     final String tax_sn = name_m.group( 4 );
323                     if ( !ForesterUtil.isEmpty( acc_source ) && !ForesterUtil.isEmpty( acc ) ) {
324                         AptxUtil.ensurePresenceOfSequence( node );
325                         node.getNodeData().getSequence( 0 ).setAccession( new Accession( acc, acc_source ) );
326                     }
327                     if ( !ForesterUtil.isEmpty( seq_name ) ) {
328                         AptxUtil.ensurePresenceOfSequence( node );
329                         node.getNodeData().getSequence( 0 ).setName( seq_name );
330                     }
331                     if ( !ForesterUtil.isEmpty( tax_sn ) ) {
332                         AptxUtil.ensurePresenceOfTaxonomy( node );
333                         node.getNodeData().getTaxonomy( 0 ).setScientificName( tax_sn );
334                     }
335                 }
336             }
337         }
338     }
339 }