inprogress
[jalview.git] / forester / java / src / org / forester / application / dom_dup.java
1
2 package org.forester.application;
3
4 import java.io.BufferedReader;
5 import java.io.File;
6 import java.io.IOException;
7 import java.util.ArrayList;
8 import java.util.Iterator;
9 import java.util.List;
10 import java.util.Map;
11 import java.util.Map.Entry;
12 import java.util.Set;
13 import java.util.SortedMap;
14 import java.util.SortedSet;
15 import java.util.TreeMap;
16 import java.util.TreeSet;
17 import java.util.regex.Matcher;
18 import java.util.regex.Pattern;
19
20 import org.forester.io.parsers.phyloxml.PhyloXmlParser;
21 import org.forester.phylogeny.Phylogeny;
22 import org.forester.phylogeny.PhylogenyMethods;
23 import org.forester.phylogeny.PhylogenyNode;
24 import org.forester.phylogeny.factories.ParserBasedPhylogenyFactory;
25 import org.forester.phylogeny.factories.PhylogenyFactory;
26 import org.forester.phylogeny.iterators.PhylogenyNodeIterator;
27 import org.forester.util.CommandLineArguments;
28 import org.forester.util.ForesterUtil;
29
30 public class dom_dup {
31
32     // HUMAN MOUSE
33     // ARATH SOYBN VOLCA CYAME PARTE THAPS EMIHU NAEGR 
34     final static private String HELP_OPTION_1 = "help";
35     final static private String HELP_OPTION_2 = "h";
36     final static private String PRG_NAME      = "dom_dup";
37     final static private String PRG_DESC      = "";
38     final static private String PRG_VERSION   = "0.90";
39     final static private String PRG_DATE      = "2013.03.12";
40     final static private String E_MAIL        = "phylosoft@gmail.com";
41     final static private String WWW           = "sites.google.com/site/cmzmasek/home/software/forester";
42
43     public static void main( final String args[] ) {
44         try {
45             final CommandLineArguments cla = new CommandLineArguments( args );
46             if ( cla.isOptionSet( HELP_OPTION_1 ) || cla.isOptionSet( HELP_OPTION_2 ) || ( cla.getNumberOfNames() != 3 ) ) {
47                 printHelp();
48                 System.exit( 0 );
49             }
50             final String pattern_str = cla.getName( 0 );
51             final File intree_file = cla.getFile( 2 );
52             final File species_groups_file = cla.getFile( 1 );
53             final PhylogenyFactory factory = ParserBasedPhylogenyFactory.getInstance();
54             final Phylogeny phy = factory.create( intree_file, new PhyloXmlParser() )[ 0 ];
55             ForesterUtil.programMessage( PRG_NAME, "Pattern string: " + pattern_str );
56             final Pattern pattern = Pattern.compile( pattern_str );
57             ForesterUtil.programMessage( PRG_NAME, "Pattern is: " + pattern );
58             final SortedSet<String> set_a = new TreeSet<String>();
59             final SortedSet<String> set_b = new TreeSet<String>();
60             read( species_groups_file, set_a, set_b );
61             print_set( set_a, "Set a:" );
62             print_set( set_b, "Set b:" );
63             final SortedSet<String> matching_names = obtainMatchingNames( phy, pattern );
64             ForesterUtil.programMessage( PRG_NAME, "Found names: " );
65             final SortedMap<String, List<String>> pairs = obtainPairs( matching_names );
66             int lca_counter = 0;
67             int non_lca_counter = 0;
68             int missing_counter = 0;
69             int total_counter = 0;
70             final Iterator<Entry<String, List<String>>> it = pairs.entrySet().iterator();
71             while ( it.hasNext() ) {
72                 final Map.Entry<String, List<String>> x = it.next();
73                 total_counter++;
74                 if ( x.getValue().size() == 2 ) {
75                     final String a = x.getValue().get( 0 );
76                     final String b = x.getValue().get( 1 );
77                     System.out.print( a + " - " + b );
78                     final PhylogenyNode lca = PhylogenyMethods.calculateLCA( phy.getNode( a ), phy.getNode( b ) );
79                     final List<PhylogenyNode> external_descs = lca.getAllExternalDescendants();
80                     boolean in_a = false;
81                     boolean in_b = false;
82                     for( final PhylogenyNode external_desc : external_descs ) {
83                         final String tc = external_desc.getNodeData().getTaxonomy().getTaxonomyCode();
84                         if ( set_a.contains( tc ) ) {
85                             in_a = true;
86                         }
87                         if ( set_b.contains( tc ) ) {
88                             in_b = true;
89                         }
90                     }
91                     if ( in_a && in_b ) {
92                         System.out.print( " => LCA " );
93                         lca_counter++;
94                     }
95                     else {
96                         non_lca_counter++;
97                     }
98                     System.out.println();
99                 }
100                 else if ( x.getValue().size() == 1 ) {
101                     System.out.println( x.getValue().get( 0 ) + " => no partner in current tree!" );
102                     missing_counter++;
103                 }
104                 else {
105                     System.out.println( "error" );
106                     System.exit( -1 );
107                 }
108             }
109             System.out.println( "Total       : " + total_counter );
110             System.out.println( "LCA         : " + lca_counter );
111             System.out.println( "Non-LCA     : " + non_lca_counter );
112             System.out.println( "With missing: " + missing_counter );
113         }
114         catch ( final Exception e ) {
115             e.printStackTrace();
116             ForesterUtil.fatalError( PRG_NAME, e.getMessage() );
117         }
118     }
119
120     private static SortedMap<String, List<String>> obtainPairs( final SortedSet<String> matching_names ) {
121         final SortedMap<String, List<String>> pairs = new TreeMap<String, List<String>>();
122         for( final String m : matching_names ) {
123             final String short_m = m.substring( 0, m.indexOf( '~' ) );
124             if ( !pairs.containsKey( short_m ) ) {
125                 final List<String> p = new ArrayList<String>();
126                 p.add( m );
127                 pairs.put( short_m, p );
128             }
129             else {
130                 pairs.get( short_m ).add( m );
131             }
132         }
133         return pairs;
134     }
135
136     private static SortedSet<String> obtainMatchingNames( final Phylogeny phy, final Pattern pattern ) {
137         final SortedSet<String> matching_names = new TreeSet<String>();
138         for( final PhylogenyNodeIterator it = phy.iteratorExternalForward(); it.hasNext(); ) {
139             final PhylogenyNode n = it.next();
140             final Matcher m = pattern.matcher( n.getName() );
141             if ( m.find() ) {
142                 matching_names.add( n.getName() );
143             }
144         }
145         return matching_names;
146     }
147
148     private static void print_set( final Set<String> set_a, final String l ) {
149         ForesterUtil.programMessage( PRG_NAME, l );
150         for( final String s : set_a ) {
151             System.out.print( s + " " );
152         }
153         System.out.println();
154     }
155
156     private static void read( final File species_groups_file, final Set<String> set_a, final Set<String> set_b )
157             throws IOException {
158         final BufferedReader reader = ForesterUtil.obtainReader( species_groups_file );
159         String line;
160         boolean first_line = true;
161         while ( ( line = reader.readLine() ) != null ) {
162             line = line.trim();
163             if ( !ForesterUtil.isEmpty( line ) ) {
164                 final String s[] = line.split( " " );
165                 for( final String name : s ) {
166                     if ( first_line ) {
167                         set_a.add( name );
168                     }
169                     else {
170                         set_b.add( name );
171                     }
172                 }
173                 if ( first_line ) {
174                     first_line = false;
175                 }
176             }
177         }
178     }
179
180     private static void printHelp() {
181         ForesterUtil.printProgramInformation( PRG_NAME,
182                                               PRG_DESC,
183                                               PRG_VERSION,
184                                               PRG_DATE,
185                                               E_MAIL,
186                                               WWW,
187                                               ForesterUtil.getForesterLibraryInformation() );
188         System.out.println( "Usage:" );
189         System.out.println();
190         System.out.println( PRG_NAME + "" );
191         System.out.println();
192         System.out.println( " example: " );
193         System.out.println();
194         System.out
195                 .println( "dom_dup \"HUMAN~[12]-2\" groups.txt RRMa_ALL_plus_RRMa_ee3_50_hmmalign_05_40_fme_gsdi.phylo.xml" );
196         System.out.println();
197         System.out.println();
198     }
199 }