From: cmzmasek@gmail.com Date: Wed, 13 Mar 2013 00:24:20 +0000 (+0000) Subject: inprogress X-Git-Url: http://source.jalview.org/gitweb/?a=commitdiff_plain;h=fe5e0935a5341bbaeb83dd228ea97c154488f7f9;p=jalview.git inprogress --- diff --git a/forester/java/src/org/forester/application/dom_dup.java b/forester/java/src/org/forester/application/dom_dup.java new file mode 100644 index 0000000..f1a0a8d --- /dev/null +++ b/forester/java/src/org/forester/application/dom_dup.java @@ -0,0 +1,199 @@ + +package org.forester.application; + +import java.io.BufferedReader; +import java.io.File; +import java.io.IOException; +import java.util.ArrayList; +import java.util.Iterator; +import java.util.List; +import java.util.Map; +import java.util.Map.Entry; +import java.util.Set; +import java.util.SortedMap; +import java.util.SortedSet; +import java.util.TreeMap; +import java.util.TreeSet; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +import org.forester.io.parsers.phyloxml.PhyloXmlParser; +import org.forester.phylogeny.Phylogeny; +import org.forester.phylogeny.PhylogenyMethods; +import org.forester.phylogeny.PhylogenyNode; +import org.forester.phylogeny.factories.ParserBasedPhylogenyFactory; +import org.forester.phylogeny.factories.PhylogenyFactory; +import org.forester.phylogeny.iterators.PhylogenyNodeIterator; +import org.forester.util.CommandLineArguments; +import org.forester.util.ForesterUtil; + +public class dom_dup { + + // HUMAN MOUSE + // ARATH SOYBN VOLCA CYAME PARTE THAPS EMIHU NAEGR + final static private String HELP_OPTION_1 = "help"; + final static private String HELP_OPTION_2 = "h"; + final static private String PRG_NAME = "dom_dup"; + final static private String PRG_DESC = ""; + final static private String PRG_VERSION = "0.90"; + final static private String PRG_DATE = "2013.03.12"; + final static private String E_MAIL = "phylosoft@gmail.com"; + final static private String WWW = "sites.google.com/site/cmzmasek/home/software/forester"; + + public static void main( final String args[] ) { + try { + final CommandLineArguments cla = new CommandLineArguments( args ); + if ( cla.isOptionSet( HELP_OPTION_1 ) || cla.isOptionSet( HELP_OPTION_2 ) || ( cla.getNumberOfNames() != 3 ) ) { + printHelp(); + System.exit( 0 ); + } + final String pattern_str = cla.getName( 0 ); + final File intree_file = cla.getFile( 2 ); + final File species_groups_file = cla.getFile( 1 ); + final PhylogenyFactory factory = ParserBasedPhylogenyFactory.getInstance(); + final Phylogeny phy = factory.create( intree_file, new PhyloXmlParser() )[ 0 ]; + ForesterUtil.programMessage( PRG_NAME, "Pattern string: " + pattern_str ); + final Pattern pattern = Pattern.compile( pattern_str ); + ForesterUtil.programMessage( PRG_NAME, "Pattern is: " + pattern ); + final SortedSet set_a = new TreeSet(); + final SortedSet set_b = new TreeSet(); + read( species_groups_file, set_a, set_b ); + print_set( set_a, "Set a:" ); + print_set( set_b, "Set b:" ); + final SortedSet matching_names = obtainMatchingNames( phy, pattern ); + ForesterUtil.programMessage( PRG_NAME, "Found names: " ); + final SortedMap> pairs = obtainPairs( matching_names ); + int lca_counter = 0; + int non_lca_counter = 0; + int missing_counter = 0; + int total_counter = 0; + final Iterator>> it = pairs.entrySet().iterator(); + while ( it.hasNext() ) { + final Map.Entry> x = it.next(); + total_counter++; + if ( x.getValue().size() == 2 ) { + final String a = x.getValue().get( 0 ); + final String b = x.getValue().get( 1 ); + System.out.print( a + " - " + b ); + final PhylogenyNode lca = PhylogenyMethods.calculateLCA( phy.getNode( a ), phy.getNode( b ) ); + final List external_descs = lca.getAllExternalDescendants(); + boolean in_a = false; + boolean in_b = false; + for( final PhylogenyNode external_desc : external_descs ) { + final String tc = external_desc.getNodeData().getTaxonomy().getTaxonomyCode(); + if ( set_a.contains( tc ) ) { + in_a = true; + } + if ( set_b.contains( tc ) ) { + in_b = true; + } + } + if ( in_a && in_b ) { + System.out.print( " => LCA " ); + lca_counter++; + } + else { + non_lca_counter++; + } + System.out.println(); + } + else if ( x.getValue().size() == 1 ) { + System.out.println( x.getValue().get( 0 ) + " => no partner in current tree!" ); + missing_counter++; + } + else { + System.out.println( "error" ); + System.exit( -1 ); + } + } + System.out.println( "Total : " + total_counter ); + System.out.println( "LCA : " + lca_counter ); + System.out.println( "Non-LCA : " + non_lca_counter ); + System.out.println( "With missing: " + missing_counter ); + } + catch ( final Exception e ) { + e.printStackTrace(); + ForesterUtil.fatalError( PRG_NAME, e.getMessage() ); + } + } + + private static SortedMap> obtainPairs( final SortedSet matching_names ) { + final SortedMap> pairs = new TreeMap>(); + for( final String m : matching_names ) { + final String short_m = m.substring( 0, m.indexOf( '~' ) ); + if ( !pairs.containsKey( short_m ) ) { + final List p = new ArrayList(); + p.add( m ); + pairs.put( short_m, p ); + } + else { + pairs.get( short_m ).add( m ); + } + } + return pairs; + } + + private static SortedSet obtainMatchingNames( final Phylogeny phy, final Pattern pattern ) { + final SortedSet matching_names = new TreeSet(); + for( final PhylogenyNodeIterator it = phy.iteratorExternalForward(); it.hasNext(); ) { + final PhylogenyNode n = it.next(); + final Matcher m = pattern.matcher( n.getName() ); + if ( m.find() ) { + matching_names.add( n.getName() ); + } + } + return matching_names; + } + + private static void print_set( final Set set_a, final String l ) { + ForesterUtil.programMessage( PRG_NAME, l ); + for( final String s : set_a ) { + System.out.print( s + " " ); + } + System.out.println(); + } + + private static void read( final File species_groups_file, final Set set_a, final Set set_b ) + throws IOException { + final BufferedReader reader = ForesterUtil.obtainReader( species_groups_file ); + String line; + boolean first_line = true; + while ( ( line = reader.readLine() ) != null ) { + line = line.trim(); + if ( !ForesterUtil.isEmpty( line ) ) { + final String s[] = line.split( " " ); + for( final String name : s ) { + if ( first_line ) { + set_a.add( name ); + } + else { + set_b.add( name ); + } + } + if ( first_line ) { + first_line = false; + } + } + } + } + + private static void printHelp() { + ForesterUtil.printProgramInformation( PRG_NAME, + PRG_DESC, + PRG_VERSION, + PRG_DATE, + E_MAIL, + WWW, + ForesterUtil.getForesterLibraryInformation() ); + System.out.println( "Usage:" ); + System.out.println(); + System.out.println( PRG_NAME + "" ); + System.out.println(); + System.out.println( " exmple: " ); + System.out.println(); + System.out + .println( "dom_dup \"HUMAN~[12]-2\" groups.txt RRMa_ALL_plus_RRMa_ee3_50_hmmalign_05_40_fme_gsdi.phylo.xml" ); + System.out.println(); + System.out.println(); + } +}