2 // forester -- software libraries and applications
3 // for genomics and evolutionary biology research.
5 // Copyright (C) 2010 Christian M Zmasek
6 // Copyright (C) 2010 Sanford-Burnham Medical Research Institute
9 // This library is free software; you can redistribute it and/or
10 // modify it under the terms of the GNU Lesser General Public
11 // License as published by the Free Software Foundation; either
12 // version 2.1 of the License, or (at your option) any later version.
14 // This library is distributed in the hope that it will be useful,
15 // but WITHOUT ANY WARRANTY; without even the implied warranty of
16 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 // Lesser General Public License for more details.
19 // You should have received a copy of the GNU Lesser General Public
20 // License along with this library; if not, write to the Free Software
21 // Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA
23 // Contact: phylosoft @ gmail . com
24 // WWW: www.phylosoft.org/forester
26 package org.forester.archaeopteryx.tools;
28 import java.io.BufferedWriter;
29 import java.io.FileWriter;
30 import java.io.IOException;
31 import java.util.ArrayList;
32 import java.util.List;
33 import java.util.regex.Matcher;
35 import javax.swing.JOptionPane;
37 import org.forester.archaeopteryx.AptxUtil;
38 import org.forester.archaeopteryx.MainFrameApplication;
39 import org.forester.evoinference.distance.NeighborJoining;
40 import org.forester.evoinference.distance.PairwiseDistanceCalculator;
41 import org.forester.evoinference.matrix.distance.BasicSymmetricalDistanceMatrix;
42 import org.forester.evoinference.tools.BootstrapResampler;
43 import org.forester.io.parsers.FastaParser;
44 import org.forester.msa.BasicMsa;
45 import org.forester.msa.ClustalOmega;
46 import org.forester.msa.Mafft;
47 import org.forester.msa.Msa;
48 import org.forester.msa.Msa.MSA_FORMAT;
49 import org.forester.msa.MsaInferrer;
50 import org.forester.msa.MsaMethods;
51 import org.forester.msa.ResampleableMsa;
52 import org.forester.phylogeny.Phylogeny;
53 import org.forester.phylogeny.PhylogenyNode;
54 import org.forester.phylogeny.data.Accession;
55 import org.forester.phylogeny.iterators.PhylogenyNodeIterator;
56 import org.forester.sequence.Sequence;
57 import org.forester.tools.ConfidenceAssessor;
58 import org.forester.util.ForesterUtil;
60 public class PhylogeneticInferrer extends RunnableProcess {
63 private final MainFrameApplication _mf;
64 private final PhylogeneticInferenceOptions _options;
65 private final List<Sequence> _seqs;
66 private final boolean DEBUG = true;
67 public final static String MSA_FILE_SUFFIX = ".aln";
68 public final static String PWD_FILE_SUFFIX = ".pwd";
70 public PhylogeneticInferrer( final List<Sequence> seqs,
71 final PhylogeneticInferenceOptions options,
72 final MainFrameApplication mf ) {
79 public PhylogeneticInferrer( final Msa msa,
80 final PhylogeneticInferenceOptions options,
81 final MainFrameApplication mf ) {
88 private Msa inferMsa( final MSA_PRG msa_prg ) throws IOException, InterruptedException {
89 // final File temp_seqs_file = File.createTempFile( "__msa__temp__", ".fasta" );
91 // System.out.println();
92 // System.out.println( "temp file: " + temp_seqs_file );
93 // System.out.println();
95 // //final File temp_seqs_file = new File( _options.getTempDir() + ForesterUtil.FILE_SEPARATOR + "s.fasta" );
96 // final BufferedWriter writer = new BufferedWriter( new FileWriter( temp_seqs_file ) );
97 // SequenceWriter.writeSeqs( _seqs, writer, SEQ_FORMAT.FASTA, 100 );
101 return runMAFFT( _seqs, processMafftOptions() );
103 return runClustalOmega( _seqs, processMafftOptions() );
109 private List<String> processMafftOptions() {
110 final String opts_str = _options.getMsaPrgParameters().trim().toLowerCase();
111 final String[] opts_ary = opts_str.split( " " );
112 final List<String> opts = new ArrayList<String>();
113 boolean saw_quiet = false;
114 for( final String opt : opts_ary ) {
116 if ( opt.equals( "--quiet" ) ) {
121 opts.add( "--quiet" );
126 private Phylogeny inferPhylogeny( final Msa msa ) {
127 BasicSymmetricalDistanceMatrix m = null;
128 switch ( _options.getPwdDistanceMethod() ) {
129 case KIMURA_DISTANCE:
130 m = PairwiseDistanceCalculator.calcKimuraDistances( msa );
132 case POISSON_DISTANCE:
133 m = PairwiseDistanceCalculator.calcPoissonDistances( msa );
135 case FRACTIONAL_DISSIMILARITY:
136 m = PairwiseDistanceCalculator.calcFractionalDissimilarities( msa );
139 throw new RuntimeException( "invalid pwd method" );
141 if ( !ForesterUtil.isEmpty( _options.getIntermediateFilesBase() ) ) {
142 BufferedWriter pwd_writer;
144 pwd_writer = new BufferedWriter( new FileWriter( _options.getIntermediateFilesBase() + PWD_FILE_SUFFIX ) );
145 m.write( pwd_writer );
148 catch ( final IOException e ) {
149 // TODO Auto-generated catch block
153 final NeighborJoining nj = NeighborJoining.createInstance();
154 final Phylogeny phy = nj.execute( m );
155 PhylogeneticInferrer.extractFastaInformation( phy );
159 private void infer() throws InterruptedException {
160 //_mf.getMainPanel().getCurrentTreePanel().setWaitCursor();
161 if ( ( _msa == null ) && ( _seqs == null ) ) {
162 throw new IllegalArgumentException( "cannot run phylogenetic analysis with null msa and seq array" );
164 start( _mf, "phylogenetic inference" );
165 if ( _msa == null ) {
168 msa = inferMsa( MSA_PRG.MAFFT );
170 catch ( final IOException e ) {
172 JOptionPane.showMessageDialog( _mf,
173 "Could not create multiple sequence alignment with \""
174 + _options.getMsaPrg() + "\" and the following parameters:\n\""
175 + _options.getMsaPrgParameters() + "\"\nError: "
176 + e.getLocalizedMessage(),
177 "Failed to Calculate MSA",
178 JOptionPane.ERROR_MESSAGE );
184 catch ( final Exception e ) {
186 JOptionPane.showMessageDialog( _mf,
187 "Could not create multiple sequence alignment with \""
188 + _options.getMsaPrg() + "\" and the following parameters:\n\""
189 + _options.getMsaPrgParameters() + "\"\nError: "
190 + e.getLocalizedMessage(),
191 "Unexpected Exception During MSA Calculation",
192 JOptionPane.ERROR_MESSAGE );
200 JOptionPane.showMessageDialog( _mf,
201 "Could not create multiple sequence alignment with "
202 + _options.getMsaPrg() + "\nand the following parameters:\n\""
203 + _options.getMsaPrgParameters() + "\"",
204 "Failed to Calculate MSA",
205 JOptionPane.ERROR_MESSAGE );
209 System.out.println( msa.toString() );
210 System.out.println( MsaMethods.calcGapRatio( msa ) );
212 final MsaMethods msa_tools = MsaMethods.createInstance();
213 if ( _options.isExecuteMsaProcessing() ) {
214 msa = msa_tools.removeGapColumns( _options.getMsaProcessingMaxAllowedGapRatio(),
215 _options.getMsaProcessingMinAllowedLength(),
219 JOptionPane.showMessageDialog( _mf,
220 "Less than two sequences longer than "
221 + _options.getMsaProcessingMinAllowedLength()
222 + " residues left after MSA processing",
223 "MSA Processing Settings Too Stringent",
224 JOptionPane.ERROR_MESSAGE );
229 System.out.println( msa_tools.getIgnoredSequenceIds() );
230 System.out.println( msa.toString() );
231 System.out.println( MsaMethods.calcGapRatio( msa ) );
235 final int n = _options.getBootstrapSamples();
236 final long seed = _options.getRandomNumberGeneratorSeed();
237 final Phylogeny master_phy = inferPhylogeny( _msa );
238 if ( _options.isPerformBootstrapResampling() && ( n > 0 ) ) {
239 final ResampleableMsa resampleable_msa = new ResampleableMsa( ( BasicMsa ) _msa );
240 final int[][] resampled_column_positions = BootstrapResampler.createResampledColumnPositions( _msa
241 .getLength(), n, seed );
242 final Phylogeny[] eval_phys = new Phylogeny[ n ];
243 for( int i = 0; i < n; ++i ) {
244 resampleable_msa.resample( resampled_column_positions[ i ] );
245 eval_phys[ i ] = inferPhylogeny( resampleable_msa );
247 ConfidenceAssessor.evaluate( "bootstrap", eval_phys, master_phy, true, 1 );
249 _mf.getMainPanel().addPhylogenyInNewTab( master_phy, _mf.getConfiguration(), "nj", "njpath" );
250 // _mf.getMainPanel().getCurrentTreePanel().setArrowCursor();
252 JOptionPane.showMessageDialog( _mf,
253 "Inference successfully completed",
254 "Inference Completed",
255 JOptionPane.INFORMATION_MESSAGE );
263 catch ( final InterruptedException e ) {
264 // TODO need to handle this exception SOMEHOW!
265 // TODO Auto-generated catch block
270 private Msa runMAFFT( final List<Sequence> seqs, final List<String> opts ) throws IOException, InterruptedException {
272 final MsaInferrer mafft = Mafft.createInstance( _mf.getInferenceManager().getPathToLocalMafft()
273 .getCanonicalPath() );
275 msa = mafft.infer( seqs, opts );
277 catch ( final IOException e ) {
278 System.out.println( mafft.getErrorDescription() );
283 private Msa runClustalOmega( final List<Sequence> seqs, final List<String> opts ) throws IOException,
284 InterruptedException {
286 final MsaInferrer clustalo = ClustalOmega.createInstance( _mf.getInferenceManager().getPathToLocalClustalo()
287 .getCanonicalPath() );
289 msa = clustalo.infer( seqs, opts );
291 catch ( final IOException e ) {
292 System.out.println( clustalo.getErrorDescription() );
297 private void writeToFiles( final BasicSymmetricalDistanceMatrix m ) {
298 if ( !ForesterUtil.isEmpty( _options.getIntermediateFilesBase() ) ) {
300 final BufferedWriter msa_writer = new BufferedWriter( new FileWriter( _options.getIntermediateFilesBase()
301 + MSA_FILE_SUFFIX ) );
302 _msa.write( msa_writer, MSA_FORMAT.PHYLIP );
304 final BufferedWriter pwd_writer = new BufferedWriter( new FileWriter( _options.getIntermediateFilesBase()
305 + PWD_FILE_SUFFIX ) );
306 m.write( pwd_writer );
309 catch ( final Exception e ) {
310 System.out.println( "Error: " + e.getMessage() );
315 public static void extractFastaInformation( final Phylogeny phy ) {
316 for( final PhylogenyNodeIterator iter = phy.iteratorExternalForward(); iter.hasNext(); ) {
317 final PhylogenyNode node = iter.next();
318 if ( !ForesterUtil.isEmpty( node.getName() ) ) {
319 final Matcher name_m = FastaParser.FASTA_DESC_LINE.matcher( node.getName() );
320 if ( name_m.lookingAt() ) {
321 System.out.println();
322 // System.out.println( name_m.group( 1 ) );
323 // System.out.println( name_m.group( 2 ) );
324 // System.out.println( name_m.group( 3 ) );
325 // System.out.println( name_m.group( 4 ) );
326 final String acc_source = name_m.group( 1 );
327 final String acc = name_m.group( 2 );
328 final String seq_name = name_m.group( 3 );
329 final String tax_sn = name_m.group( 4 );
330 if ( !ForesterUtil.isEmpty( acc_source ) && !ForesterUtil.isEmpty( acc ) ) {
331 AptxUtil.ensurePresenceOfSequence( node );
332 node.getNodeData().getSequence( 0 ).setAccession( new Accession( acc, acc_source ) );
334 if ( !ForesterUtil.isEmpty( seq_name ) ) {
335 AptxUtil.ensurePresenceOfSequence( node );
336 node.getNodeData().getSequence( 0 ).setName( seq_name );
338 if ( !ForesterUtil.isEmpty( tax_sn ) ) {
339 AptxUtil.ensurePresenceOfTaxonomy( node );
340 node.getNodeData().getTaxonomy( 0 ).setScientificName( tax_sn );
347 public enum MSA_PRG {