From 5ad44ec9bac766f57d8c62a687389a63fe8bb339 Mon Sep 17 00:00:00 2001 From: "cmzmasek@gmail.com" Date: Thu, 7 Jun 2012 18:32:29 +0000 Subject: [PATCH] phylotastic hackathon at NESCENT 120607 --- .../forester/application/gene_tree_preprocess.java | 41 ++++++++++---------- .../archaeopteryx/tools/SequenceDataRetriver.java | 28 +++++++------ 2 files changed, 36 insertions(+), 33 deletions(-) diff --git a/forester/java/src/org/forester/application/gene_tree_preprocess.java b/forester/java/src/org/forester/application/gene_tree_preprocess.java index 46e5bee..e9804a3 100644 --- a/forester/java/src/org/forester/application/gene_tree_preprocess.java +++ b/forester/java/src/org/forester/application/gene_tree_preprocess.java @@ -25,7 +25,9 @@ package org.forester.application; +import java.io.BufferedWriter; import java.io.File; +import java.io.FileWriter; import java.io.IOException; import java.util.SortedSet; @@ -43,10 +45,6 @@ public class gene_tree_preprocess { final static private String HELP_OPTION_1 = "help"; final static private String HELP_OPTION_2 = "h"; - final static private String FROM_OPTION = "f"; - final static private String TO_OPTION = "t"; - final static private String STEP_OPTION = "s"; - final static private String WINDOW_OPTION = "w"; final static private String PRG_NAME = "gene_tree_preprocess"; final static private String PRG_DESC = "gene tree preprocessing for SDI analysis"; final static private String PRG_VERSION = "1.00"; @@ -81,9 +79,9 @@ public class gene_tree_preprocess { ForesterUtil.fatalError( PRG_NAME, "phylogeny has " + phy.getNumberOfExternalNodes() + " external node(s), aborting" ); } - final SortedSet not_found = SequenceDataRetriver.obtainSeqInformation( phy ); + final SortedSet not_found = SequenceDataRetriver.obtainSeqInformation( phy, true ); for( final String remove_me : not_found ) { - System.out.println( " not found: " + not_found ); + System.out.println( " not found: " + remove_me ); PhylogenyMethods.removeNode( phy.getNode( remove_me ), phy ); } if ( phy.getNumberOfExternalNodes() < 2 ) { @@ -98,9 +96,21 @@ public class gene_tree_preprocess { catch ( final IOException e ) { ForesterUtil.fatalError( PRG_NAME, "failed to write to [" + outtree + "]: " + e.getLocalizedMessage() ); } - // ForesterUtil.programMessage( PRG_NAME, "wrote output to: [" + outfile + "]" ); + ForesterUtil.programMessage( PRG_NAME, "wrote output phylogeny to: " + outtree ); + try { + final BufferedWriter out = new BufferedWriter( new FileWriter( removed_nodes ) ); + for( final String remove_me : not_found ) { + out.write( remove_me ); + out.newLine(); + } + out.close(); + } + catch ( final IOException e ) { + ForesterUtil.fatalError( PRG_NAME, + "failed to write to [" + removed_nodes + "]: " + e.getLocalizedMessage() ); + } + ForesterUtil.programMessage( PRG_NAME, "wrote removed external nodes labels to: " + removed_nodes ); ForesterUtil.programMessage( PRG_NAME, "OK" ); - System.out.println(); } catch ( final Exception e ) { ForesterUtil.fatalError( PRG_NAME, e.getMessage() ); @@ -122,19 +132,8 @@ public class gene_tree_preprocess { E_MAIL, WWW, ForesterUtil.getForesterLibraryInformation() ); - System.out.println( "Usage:" ); - System.out.println(); - System.out.println( PRG_NAME + " " ); - System.out.println(); - System.out.println( " options: " ); - System.out.println(); - System.out.println( " -" + FROM_OPTION + "=: from (msa column)" ); - System.out.println( " -" + TO_OPTION + "=: to (msa column)" ); - System.out.println( " or" ); - System.out.println( " -" + WINDOW_OPTION + "=: window size (msa columns)" ); - System.out.println( " -" + STEP_OPTION + "=: step size (msa columns)" ); - System.out.println(); - System.out.println(); + System.out.print( "Usage: " ); + System.out.println( PRG_NAME + " " ); System.out.println(); } } diff --git a/forester/java/src/org/forester/archaeopteryx/tools/SequenceDataRetriver.java b/forester/java/src/org/forester/archaeopteryx/tools/SequenceDataRetriver.java index b5a8968..ccef986 100644 --- a/forester/java/src/org/forester/archaeopteryx/tools/SequenceDataRetriver.java +++ b/forester/java/src/org/forester/archaeopteryx/tools/SequenceDataRetriver.java @@ -47,13 +47,12 @@ import org.forester.ws.seqdb.SequenceDatabaseEntry; import org.forester.ws.seqdb.SequenceDbWsTools; public final class SequenceDataRetriver extends RunnableProcess { - - public final static int DEFAULT_LINES_TO_RETURN = 50; + public final static int DEFAULT_LINES_TO_RETURN = 50; private final Phylogeny _phy; private final MainFrameApplication _mf; private final TreePanel _treepanel; - private final static boolean DEBUG = true; + private final static boolean DEBUG = false; private enum Db { UNIPROT, EMBL, NCBI, NONE, REFSEQ; @@ -69,7 +68,7 @@ public final class SequenceDataRetriver extends RunnableProcess { start( _mf, "sequence data" ); SortedSet not_found = null; try { - not_found = obtainSeqInformation( _phy ); + not_found = obtainSeqInformation( _phy, false ); } catch ( final UnknownHostException e ) { final String what = "_"; //TODO FIXME @@ -143,14 +142,15 @@ public final class SequenceDataRetriver extends RunnableProcess { } } - public static SortedSet obtainSeqInformation( final Phylogeny phy ) throws IOException { + public static SortedSet obtainSeqInformation( final Phylogeny phy, final boolean ext_nodes_only ) throws IOException { final SortedSet not_found = new TreeSet(); for( final PhylogenyNodeIterator iter = phy.iteratorPostorder(); iter.hasNext(); ) { final PhylogenyNode node = iter.next(); - final Sequence seq = node.getNodeData().isHasSequence() ? node.getNodeData().getSequence() : new Sequence() ; - final Taxonomy tax = node.getNodeData().isHasTaxonomy() ? node.getNodeData().getTaxonomy() : new Taxonomy() ; - - + if ( ext_nodes_only && node.isInternal() ) { + continue; + } + final Sequence seq = node.getNodeData().isHasSequence() ? node.getNodeData().getSequence() : new Sequence(); + final Taxonomy tax = node.getNodeData().isHasTaxonomy() ? node.getNodeData().getTaxonomy() : new Taxonomy(); String query = null; Identifier id = null; Db db = Db.NONE; @@ -175,16 +175,19 @@ public final class SequenceDataRetriver extends RunnableProcess { db = Db.UNIPROT; } else if ( ( id = SequenceIdParser.parse( node.getName() ) ) != null ) { - if ( id.getProvider().equalsIgnoreCase( Identifier.NCBI ) ) { db = Db.NCBI; } else if ( id.getProvider().equalsIgnoreCase( Identifier.REFSEQ ) ) { db = Db.REFSEQ; } - } } + + if ( db == Db.NONE ) { + not_found.add( node.getName() ); + } + SequenceDatabaseEntry db_entry = null; if ( !ForesterUtil.isEmpty( query ) ) { if ( db == Db.UNIPROT ) { @@ -197,7 +200,7 @@ public final class SequenceDataRetriver extends RunnableProcess { if ( DEBUG ) { System.out.println( "embl: " + query ); } - db_entry = SequenceDbWsTools.obtainEmblEntry( new Identifier( query ), DEFAULT_LINES_TO_RETURN ); + db_entry = SequenceDbWsTools.obtainEmblEntry( new Identifier( query ), DEFAULT_LINES_TO_RETURN ); if ( ( db == Db.UNIPROT ) && ( db_entry != null ) ) { db = Db.EMBL; } @@ -209,6 +212,7 @@ public final class SequenceDataRetriver extends RunnableProcess { else if ( ( db == Db.NCBI ) && ( id != null ) ) { db_entry = SequenceDbWsTools.obtainEmblEntry( id, DEFAULT_LINES_TO_RETURN ); } + if ( ( db_entry != null ) && !db_entry.isEmpty() ) { if ( !ForesterUtil.isEmpty( db_entry.getAccession() ) ) { String type = null; -- 1.7.10.2