in progress
[jalview.git] / forester / java / src / org / forester / archaeopteryx / tools / PhylogeneticInferrer.java
1 // $Id:
2 // forester -- software libraries and applications
3 // for genomics and evolutionary biology research.
4 //
5 // Copyright (C) 2010 Christian M Zmasek
6 // Copyright (C) 2010 Sanford-Burnham Medical Research Institute
7 // All rights reserved
8 //
9 // This library is free software; you can redistribute it and/or
10 // modify it under the terms of the GNU Lesser General Public
11 // License as published by the Free Software Foundation; either
12 // version 2.1 of the License, or (at your option) any later version.
13 //
14 // This library is distributed in the hope that it will be useful,
15 // but WITHOUT ANY WARRANTY; without even the implied warranty of
16 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 // Lesser General Public License for more details.
18 //
19 // You should have received a copy of the GNU Lesser General Public
20 // License along with this library; if not, write to the Free Software
21 // Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA
22 //
23 // Contact: phylosoft @ gmail . com
24 // WWW: www.phylosoft.org/forester
25
26 package org.forester.archaeopteryx.tools;
27
28 import java.io.BufferedWriter;
29 import java.io.File;
30 import java.io.FileWriter;
31 import java.io.IOException;
32 import java.util.ArrayList;
33 import java.util.List;
34 import java.util.regex.Matcher;
35
36 import javax.swing.JOptionPane;
37
38 import org.forester.archaeopteryx.AptxUtil;
39 import org.forester.archaeopteryx.MainFrameApplication;
40 import org.forester.evoinference.distance.NeighborJoining;
41 import org.forester.evoinference.distance.PairwiseDistanceCalculator;
42 import org.forester.evoinference.matrix.distance.BasicSymmetricalDistanceMatrix;
43 import org.forester.evoinference.tools.BootstrapResampler;
44 import org.forester.io.parsers.FastaParser;
45 import org.forester.io.writers.SequenceWriter;
46 import org.forester.io.writers.SequenceWriter.SEQ_FORMAT;
47 import org.forester.msa.BasicMsa;
48 import org.forester.msa.Mafft;
49 import org.forester.msa.Msa;
50 import org.forester.msa.MsaInferrer;
51 import org.forester.msa.MsaTools;
52 import org.forester.msa.ResampleableMsa;
53 import org.forester.phylogeny.Phylogeny;
54 import org.forester.phylogeny.PhylogenyNode;
55 import org.forester.phylogeny.data.Accession;
56 import org.forester.phylogeny.iterators.PhylogenyNodeIterator;
57 import org.forester.sequence.Sequence;
58 import org.forester.tools.ConfidenceAssessor;
59 import org.forester.util.ForesterUtil;
60
61 public class PhylogeneticInferrer implements Runnable {
62
63     private Msa                                _msa;
64     private final MainFrameApplication         _mf;
65     private final PhylogeneticInferenceOptions _options;
66     private final List<Sequence>               _seqs;
67     public final static String                 MSA_FILE_SUFFIX = ".aln";
68     public final static String                 PWD_FILE_SUFFIX = ".pwd";
69
70     public PhylogeneticInferrer( final List<Sequence> seqs,
71                                  final PhylogeneticInferenceOptions options,
72                                  final MainFrameApplication mf ) {
73         _msa = null;
74         _seqs = seqs;
75         _mf = mf;
76         _options = options;
77     }
78
79     public PhylogeneticInferrer( final Msa msa,
80                                  final PhylogeneticInferenceOptions options,
81                                  final MainFrameApplication mf ) {
82         _msa = msa;
83         _seqs = null;
84         _mf = mf;
85         _options = options;
86     }
87
88     private Msa inferMsa() throws IOException {
89         final File temp_seqs_file = File.createTempFile( "aptx", ".fasta" );
90         System.out.println( "temp file: " + temp_seqs_file );
91         //final File temp_seqs_file = new File( _options.getTempDir() + ForesterUtil.FILE_SEPARATOR + "s.fasta" );
92         final BufferedWriter writer = new BufferedWriter( new FileWriter( temp_seqs_file ) );
93         SequenceWriter.writeSeqs( _seqs, writer, SEQ_FORMAT.FASTA, 100 );
94         writer.close();
95         final List<String> opts = processMafftOptions();
96         Msa msa = null;
97         try {
98             msa = runMAFFT( temp_seqs_file, opts );
99         }
100         catch ( final InterruptedException e ) {
101             // TODO Auto-generated catch block
102             e.printStackTrace();
103         }
104         // copy aln file to intermediate dir file
105         // delete temp seqs file
106         return msa;
107     }
108
109     private List<String> processMafftOptions() {
110         final String opts_str = _options.getMsaPrgParameters().trim().toLowerCase();
111         final String[] opts_ary = opts_str.split( " " );
112         final List<String> opts = new ArrayList<String>();
113         boolean saw_quiet = false;
114         for( final String opt : opts_ary ) {
115             opts.add( opt );
116             if ( opt.equals( "--quiet" ) ) {
117                 saw_quiet = true;
118             }
119         }
120         if ( !saw_quiet ) {
121             opts.add( "--quiet" );
122         }
123         return opts;
124     }
125
126     private Phylogeny inferPhylogeny( final Msa msa ) {
127         BasicSymmetricalDistanceMatrix m = null;
128         switch ( _options.getPwdDistanceMethod() ) {
129             case KIMURA_DISTANCE:
130                 m = PairwiseDistanceCalculator.calcKimuraDistances( msa );
131                 break;
132             case POISSON_DISTANCE:
133                 m = PairwiseDistanceCalculator.calcPoissonDistances( msa );
134                 break;
135             case FRACTIONAL_DISSIMILARITY:
136                 m = PairwiseDistanceCalculator.calcFractionalDissimilarities( msa );
137                 break;
138             default:
139                 throw new RuntimeException( "invalid pwd method" );
140         }
141         if ( !ForesterUtil.isEmpty( _options.getIntermediateFilesBase() ) ) {
142             BufferedWriter pwd_writer;
143             try {
144                 pwd_writer = new BufferedWriter( new FileWriter( _options.getIntermediateFilesBase() + PWD_FILE_SUFFIX ) );
145                 m.write( pwd_writer );
146                 pwd_writer.close();
147             }
148             catch ( final IOException e ) {
149                 // TODO Auto-generated catch block
150                 e.printStackTrace();
151             }
152         }
153         final NeighborJoining nj = new NeighborJoining();
154         final Phylogeny phy = nj.execute( m );
155         PhylogeneticInferrer.extractFastaInformation( phy );
156         return phy;
157     }
158
159     private void infer() {
160         //_mf.getMainPanel().getCurrentTreePanel().setWaitCursor();
161         if ( ( _msa == null ) && ( _seqs == null ) ) {
162             throw new IllegalArgumentException( "cannot run phylogenetic analysis with null msa and seq array" );
163         }
164         if ( _msa == null ) {
165             Msa msa = null;
166             try {
167                 msa = inferMsa();
168             }
169             catch ( final IOException e ) {
170                 JOptionPane.showMessageDialog( _mf,
171                                                "Could not create multiple sequence alignment with "
172                                                        + _options.getMsaPrg() + "\nand the following parameters:\n\""
173                                                        + _options.getMsaPrgParameters() + "\"\nError:"
174                                                        + e.getLocalizedMessage(),
175                                                "Failed to Calculate MSA",
176                                                JOptionPane.ERROR_MESSAGE );
177                 return;
178             }
179             if ( msa == null ) {
180                 JOptionPane.showMessageDialog( _mf,
181                                                "Could not create multiple sequence alignment with "
182                                                        + _options.getMsaPrg() + "\nand the following parameters:\n\""
183                                                        + _options.getMsaPrgParameters() + "\"",
184                                                "Failed to Calculate MSA",
185                                                JOptionPane.ERROR_MESSAGE );
186                 return;
187             }
188             System.out.println( msa.toString() );
189             System.out.println( MsaTools.calcBasicGapinessStatistics( msa ).toString() );
190             final MsaTools msa_tools = MsaTools.createInstance();
191             if ( _options.isExecuteMsaProcessing() ) {
192                 msa = msa_tools.removeGapColumns( _options.getMsaProcessingMaxAllowedGapRatio(),
193                                                   _options.getMsaProcessingMinAllowedLength(),
194                                                   msa );
195                 if ( msa == null ) {
196                     JOptionPane.showMessageDialog( _mf,
197                                                    "Less than two sequences longer than "
198                                                            + _options.getMsaProcessingMinAllowedLength()
199                                                            + " residues left after MSA processing",
200                                                    "MSA Processing Settings Too Stringent",
201                                                    JOptionPane.ERROR_MESSAGE );
202                     return;
203                 }
204             }
205             System.out.println( msa_tools.getIgnoredSequenceIds() );
206             System.out.println( msa.toString() );
207             System.out.println( MsaTools.calcBasicGapinessStatistics( msa ).toString() );
208             _msa = msa;
209         }
210         final int n = _options.getBootstrapSamples();
211         final long seed = _options.getRandomNumberGeneratorSeed();
212         final Phylogeny master_phy = inferPhylogeny( _msa );
213         if ( _options.isPerformBootstrapResampling() && ( n > 0 ) ) {
214             final ResampleableMsa resampleable_msa = new ResampleableMsa( ( BasicMsa ) _msa );
215             final int[][] resampled_column_positions = BootstrapResampler.createResampledColumnPositions( _msa
216                     .getLength(), n, seed );
217             final Phylogeny[] eval_phys = new Phylogeny[ n ];
218             for( int i = 0; i < n; ++i ) {
219                 resampleable_msa.resample( resampled_column_positions[ i ] );
220                 eval_phys[ i ] = inferPhylogeny( resampleable_msa );
221             }
222             ConfidenceAssessor.evaluate( "bootstrap", eval_phys, master_phy, true, 1 );
223         }
224         _mf.getMainPanel().addPhylogenyInNewTab( master_phy, _mf.getConfiguration(), "nj", "njpath" );
225         _mf.getMainPanel().getCurrentTreePanel().setArrowCursor();
226         JOptionPane.showMessageDialog( _mf,
227                                        "Inference successfully completed",
228                                        "Inference Completed",
229                                        JOptionPane.INFORMATION_MESSAGE );
230     }
231
232     @Override
233     public void run() {
234         infer();
235     }
236
237     private Msa runMAFFT( final File input_seqs, final List<String> opts ) throws IOException, InterruptedException {
238         Msa msa = null;
239         final MsaInferrer mafft = Mafft.createInstance();
240         try {
241             msa = mafft.infer( input_seqs, opts );
242         }
243         catch ( final IOException e ) {
244             System.out.println( mafft.getErrorDescription() );
245         }
246         return msa;
247     }
248
249     private void writeToFiles( final BasicSymmetricalDistanceMatrix m ) {
250         if ( !ForesterUtil.isEmpty( _options.getIntermediateFilesBase() ) ) {
251             try {
252                 final BufferedWriter msa_writer = new BufferedWriter( new FileWriter( _options.getIntermediateFilesBase()
253                         + MSA_FILE_SUFFIX ) );
254                 _msa.write( msa_writer );
255                 msa_writer.close();
256                 final BufferedWriter pwd_writer = new BufferedWriter( new FileWriter( _options.getIntermediateFilesBase()
257                         + PWD_FILE_SUFFIX ) );
258                 m.write( pwd_writer );
259                 pwd_writer.close();
260             }
261             catch ( final Exception e ) {
262                 System.out.println( "Error: " + e.getMessage() );
263             }
264         }
265     }
266
267     public static void extractFastaInformation( final Phylogeny phy ) {
268         for( final PhylogenyNodeIterator iter = phy.iteratorExternalForward(); iter.hasNext(); ) {
269             final PhylogenyNode node = iter.next();
270             if ( !ForesterUtil.isEmpty( node.getName() ) ) {
271                 final Matcher name_m = FastaParser.FASTA_DESC_LINE.matcher( node.getName() );
272                 if ( name_m.lookingAt() ) {
273                     System.out.println();
274                     // System.out.println( name_m.group( 1 ) );
275                     // System.out.println( name_m.group( 2 ) );
276                     // System.out.println( name_m.group( 3 ) );
277                     // System.out.println( name_m.group( 4 ) );
278                     final String acc_source = name_m.group( 1 );
279                     final String acc = name_m.group( 2 );
280                     final String seq_name = name_m.group( 3 );
281                     final String tax_sn = name_m.group( 4 );
282                     if ( !ForesterUtil.isEmpty( acc_source ) && !ForesterUtil.isEmpty( acc ) ) {
283                         AptxUtil.ensurePresenceOfSequence( node );
284                         node.getNodeData().getSequence( 0 ).setAccession( new Accession( acc, acc_source ) );
285                     }
286                     if ( !ForesterUtil.isEmpty( seq_name ) ) {
287                         AptxUtil.ensurePresenceOfSequence( node );
288                         node.getNodeData().getSequence( 0 ).setName( seq_name );
289                     }
290                     if ( !ForesterUtil.isEmpty( tax_sn ) ) {
291                         AptxUtil.ensurePresenceOfTaxonomy( node );
292                         node.getNodeData().getTaxonomy( 0 ).setScientificName( tax_sn );
293                     }
294                 }
295             }
296         }
297     }
298 }