From bd30826b30945fec52ffd33f65f9456b6ef57e77 Mon Sep 17 00:00:00 2001
From: "cmzmasek@gmail.com"
Date: Thu, 7 Jun 2012 16:37:45 +0000
Subject: [PATCH] phylotastic hackathon at NESCENT 120607

---
 .../forester/application/gene_tree_preprocess.java |  156 ++++++++++++++++++++
 .../archaeopteryx/tools/SequenceDataRetriver.java  |   14 +-
 .../org/forester/io/writers/PhylogenyWriter.java   |   22 +++
 3 files changed, 188 insertions(+), 4 deletions(-)
 create mode 100644 forester/java/src/org/forester/application/gene_tree_preprocess.java

diff --git a/forester/java/src/org/forester/application/gene_tree_preprocess.java b/forester/java/src/org/forester/application/gene_tree_preprocess.java
new file mode 100644
index 0000000..46e5bee
--- /dev/null
+++ b/forester/java/src/org/forester/application/gene_tree_preprocess.java
@@ -0,0 +1,156 @@
+// $Id:
+// FORESTER -- software libraries and applications
+// for evolutionary biology research and applications.
+//
+// Copyright (C) 2008-2012 Christian M. Zmasek
+// Copyright (C) 2008-2012 Burnham Institute for Medical Research
+// All rights reserved
+//
+// This library is free software; you can redistribute it and/or
+// modify it under the terms of the GNU Lesser General Public
+// License as published by the Free Software Foundation; either
+// version 2.1 of the License, or (at your option) any later version.
+//
+// This library is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+// Lesser General Public License for more details.
+//
+// You should have received a copy of the GNU Lesser General Public
+// License along with this library; if not, write to the Free Software
+// Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA
+//
+// Contact: phylosoft @ gmail . com
+// WWW: www.phylosoft.org/forester
+
+package org.forester.application;
+
+import java.io.File;
+import java.io.IOException;
+import java.util.SortedSet;
+
+import org.forester.archaeopteryx.tools.SequenceDataRetriver;
+import org.forester.io.parsers.util.ParserUtils;
+import org.forester.io.writers.PhylogenyWriter;
+import org.forester.phylogeny.Phylogeny;
+import org.forester.phylogeny.PhylogenyMethods;
+import org.forester.phylogeny.factories.ParserBasedPhylogenyFactory;
+import org.forester.phylogeny.factories.PhylogenyFactory;
+import org.forester.util.CommandLineArguments;
+import org.forester.util.ForesterUtil;
+
+/**
+ * Command line application that preprocesses a gene tree for SDI analysis.
+ * <p>
+ * It reads the first phylogeny from the input file, passes it to
+ * {@code SequenceDataRetriver.obtainSeqInformation}, removes every node whose
+ * name is in the set returned by that call, and writes the remaining tree as
+ * phyloXML to a file named after the input file.
+ */
+public class gene_tree_preprocess {
+
+    final static private String HELP_OPTION_1 = "help";
+    final static private String HELP_OPTION_2 = "h";
+    final static private String PRG_NAME      = "gene_tree_preprocess";
+    final static private String PRG_DESC      = "gene tree preprocessing for SDI analysis";
+    final static private String PRG_VERSION   = "1.00";
+    final static private String PRG_DATE      = "2012.06.07";
+    final static private String E_MAIL        = "phylosoft@gmail.com";
+    final static private String WWW           = "www.phylosoft.org/forester/";
+
+    /**
+     * Runs the preprocessing for the single gene tree file named on the command line.
+     * <p>
+     * Prints the help text and exits if a help option is set or if the number of
+     * arguments is not exactly one. Read and write failures, and trees with fewer
+     * than two external nodes (before or after node removal), are reported through
+     * {@code ForesterUtil.fatalError}.
+     *
+     * @param args the command line arguments; exactly one is expected, the gene tree file
+     */
+    public static void main( final String[] args ) {
+        try {
+            final CommandLineArguments cla = new CommandLineArguments( args );
+            if ( cla.isOptionSet( HELP_OPTION_1 ) || cla.isOptionSet( HELP_OPTION_2 ) || ( args.length != 1 ) ) {
+                printHelp();
+                System.exit( 0 );
+            }
+            final File in = cla.getFile( 0 );
+            Phylogeny phy = null;
+            final PhylogenyFactory factory = ParserBasedPhylogenyFactory.getInstance();
+            try {
+                phy = factory.create( in, ParserUtils.createParserDependingOnFileType( in, true ) )[ 0 ];
+            }
+            catch ( final IOException e ) {
+                ForesterUtil.fatalError( PRG_NAME,
+                                         "failed to read target phylogenies from [" + in + "]: "
+                                                 + e.getLocalizedMessage() );
+            }
+            final File outtree = new File( ForesterUtil.removeSuffix( in.toString() )
+                    + "_preprocessed_gene_tree.phylo.xml" );
+            // NOTE(review): this file is only checked for writeability below; nothing
+            // in this class writes to it yet.
+            final File removed_nodes = new File( ForesterUtil.removeSuffix( in.toString() ) + "_removed_nodes.txt" );
+            checkForOutputFileWriteability( outtree );
+            checkForOutputFileWriteability( removed_nodes );
+            if ( phy.getNumberOfExternalNodes() < 2 ) {
+                ForesterUtil.fatalError( PRG_NAME, "phylogeny has " + phy.getNumberOfExternalNodes()
+                        + " external node(s), aborting" );
+            }
+            final SortedSet<String> not_found = SequenceDataRetriver.obtainSeqInformation( phy );
+            for( final String remove_me : not_found ) {
+                // Report the node being removed in this iteration, not the whole set.
+                System.out.println( " not found: " + remove_me );
+                PhylogenyMethods.removeNode( phy.getNode( remove_me ), phy );
+            }
+            if ( phy.getNumberOfExternalNodes() < 2 ) {
+                ForesterUtil.fatalError( PRG_NAME,
+                                         "after removal of unresolvable external nodes, phylogeny has "
+                                                 + phy.getNumberOfExternalNodes() + " external node(s), aborting" );
+            }
+            try {
+                final PhylogenyWriter writer = new PhylogenyWriter();
+                writer.toPhyloXML( phy, 0, outtree );
+            }
+            catch ( final IOException e ) {
+                ForesterUtil.fatalError( PRG_NAME, "failed to write to [" + outtree + "]: " + e.getLocalizedMessage() );
+            }
+            ForesterUtil.programMessage( PRG_NAME, "wrote output to: [" + outtree + "]" );
+            ForesterUtil.programMessage( PRG_NAME, "OK" );
+            System.out.println();
+        }
+        catch ( final Exception e ) {
+            ForesterUtil.fatalError( PRG_NAME, e.getMessage() );
+        }
+    }
+
+    /**
+     * Reports a fatal error through {@code ForesterUtil.fatalError} if
+     * {@code ForesterUtil.isWritableFile} returns a non-empty message for the file.
+     *
+     * @param outfile the output file to check
+     */
+    public static void checkForOutputFileWriteability( final File outfile ) {
+        final String error = ForesterUtil.isWritableFile( outfile );
+        if ( !ForesterUtil.isEmpty( error ) ) {
+            ForesterUtil.fatalError( PRG_NAME, error );
+        }
+    }
+
+    /**
+     * Prints the program information followed by the usage line.
+     */
+    private static void printHelp() {
+        ForesterUtil.printProgramInformation( PRG_NAME,
+                                              PRG_DESC,
+                                              PRG_VERSION,
+                                              PRG_DATE,
+                                              E_MAIL,
+                                              WWW,
+                                              ForesterUtil.getForesterLibraryInformation() );
+        System.out.println( "Usage:" );
+        System.out.println();
+        System.out.println( PRG_NAME + " <gene tree file>" );
+        System.out.println();
+    }
+}
diff --git a/forester/java/src/org/forester/archaeopteryx/tools/SequenceDataRetriver.java b/forester/java/src/org/forester/archaeopteryx/tools/SequenceDataRetriver.java
index 508d60f..b5a8968 100644
--- a/forester/java/src/org/forester/archaeopteryx/tools/SequenceDataRetriver.java
+++ b/forester/java/src/org/forester/archaeopteryx/tools/SequenceDataRetriver.java
@@ -47,6 +47,12 @@ import org.forester.ws.seqdb.SequenceDatabaseEntry;
 import org.forester.ws.seqdb.SequenceDbWsTools;
 
 public final class SequenceDataRetriver extends RunnableProcess {
+
+    /**
+     * Passed as the second argument to the SequenceDbWsTools lookup calls in this class.
+     * NOTE(review): those calls used the literal 200 before; confirm that 50 is intended.
+     */
+    public final static int DEFAULT_LINES_TO_RETURN = 50;
 
     private final Phylogeny _phy;
     private final MainFrameApplication _mf;
@@ -189,23 +195,23 @@ public final class SequenceDataRetriver extends RunnableProcess {
                     if ( DEBUG ) {
                         System.out.println( "uniprot: " + query );
                     }
-                    db_entry = SequenceDbWsTools.obtainUniProtEntry( query, 200 );
+                    db_entry = SequenceDbWsTools.obtainUniProtEntry( query, DEFAULT_LINES_TO_RETURN );
                 }
                 if ( ( db == Db.EMBL ) || ( ( db == Db.UNIPROT ) && ( db_entry == null ) ) ) {
                     if ( DEBUG ) {
                         System.out.println( "embl: " + query );
                     }
-                    db_entry = SequenceDbWsTools.obtainEmblEntry( new Identifier( query ), 200 );
+                    db_entry = SequenceDbWsTools.obtainEmblEntry( new Identifier( query ), DEFAULT_LINES_TO_RETURN );
                     if ( ( db == Db.UNIPROT ) && ( db_entry != null ) ) {
                         db = Db.EMBL;
                     }
                 }
             }
             else if ( ( db == Db.REFSEQ ) && ( id != null ) ) {
-                db_entry = SequenceDbWsTools.obtainRefSeqEntryFromEmbl( id, 200 );
+                db_entry = SequenceDbWsTools.obtainRefSeqEntryFromEmbl( id, DEFAULT_LINES_TO_RETURN );
             }
             else if ( ( db == Db.NCBI ) && ( id != null ) ) {
-                db_entry = SequenceDbWsTools.obtainEmblEntry( id, 200 );
+                db_entry = SequenceDbWsTools.obtainEmblEntry( id, DEFAULT_LINES_TO_RETURN );
             }
             if ( ( db_entry != null ) && !db_entry.isEmpty() ) {
                 if ( !ForesterUtil.isEmpty( db_entry.getAccession() ) ) {
diff --git a/forester/java/src/org/forester/io/writers/PhylogenyWriter.java b/forester/java/src/org/forester/io/writers/PhylogenyWriter.java
index 36dfda4..8ee4522 100644
--- a/forester/java/src/org/forester/io/writers/PhylogenyWriter.java
+++ b/forester/java/src/org/forester/io/writers/PhylogenyWriter.java
@@ -481,6 +481,28 @@ public final class PhylogenyWriter {
         writer.flush();
         writer.close();
     }
+
+    /**
+     * Writes a single phylogeny as phyloXML to the given file.
+     *
+     * @param phy the phylogeny to write
+     * @param phyloxml_level passed through to {@code toPhyloXML( Writer, Phylogeny, int )}
+     * @param out_file the file to write to
+     * @throws IOException if the file cannot be opened or writing fails
+     */
+    public void toPhyloXML( final Phylogeny phy,
+                            final int phyloxml_level,
+                            final File out_file ) throws IOException {
+        final Writer writer = new BufferedWriter( new PrintWriter( out_file ) );
+        try {
+            toPhyloXML( writer, phy, phyloxml_level );
+            writer.flush();
+        }
+        finally {
+            // Close the file even if writing fails, so the handle is not leaked.
+            writer.close();
+        }
+    }
 
     public void toPhyloXML( final Writer writer,
                             final List trees,
-- 
1.7.10.2