2 // FORESTER -- software libraries and applications
3 // for evolutionary biology research and applications.
5 // Copyright (C) 2008-2009 Christian M. Zmasek
6 // Copyright (C) 2008-2009 Burnham Institute for Medical Research
7 // Copyright (C) 2000-2001 Washington University School of Medicine
8 // and Howard Hughes Medical Institute
11 // This library is free software; you can redistribute it and/or
12 // modify it under the terms of the GNU Lesser General Public
13 // License as published by the Free Software Foundation; either
14 // version 2.1 of the License, or (at your option) any later version.
16 // This library is distributed in the hope that it will be useful,
17 // but WITHOUT ANY WARRANTY; without even the implied warranty of
18 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
19 // Lesser General Public License for more details.
21 // You should have received a copy of the GNU Lesser General Public
22 // License along with this library; if not, write to the Free Software
23 // Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA
25 // Contact: phylosoft @ gmail . com
26 // WWW: www.phylosoft.org/forester
28 package org.forester.sdi;
31 import java.io.FileWriter;
32 import java.io.IOException;
33 import java.io.PrintWriter;
34 import java.util.ArrayList;
35 import java.util.Arrays;
36 import java.util.Collections;
37 import java.util.HashMap;
38 import java.util.List;
40 import org.forester.evoinference.matrix.distance.DistanceMatrix;
41 import org.forester.io.parsers.PhylogenyParser;
42 import org.forester.io.parsers.SymmetricalDistanceMatrixParser;
43 import org.forester.io.parsers.nhx.NHXParser;
44 import org.forester.io.parsers.util.ParserUtils;
45 import org.forester.phylogeny.Phylogeny;
46 import org.forester.phylogeny.PhylogenyMethods;
47 import org.forester.phylogeny.PhylogenyNode;
48 import org.forester.phylogeny.factories.ParserBasedPhylogenyFactory;
49 import org.forester.phylogeny.factories.PhylogenyFactory;
50 import org.forester.phylogeny.iterators.PhylogenyNodeIterator;
51 import org.forester.util.ForesterUtil;
54 * @author Christian M. Zmasek
56 public final class RIO {
58 private final static boolean ROOT_BY_MINIMIZING_MAPPING_COST = false;
59 private final static boolean ROOT_BY_MINIMIZING_SUM_OF_DUPS = true;
60 private final static boolean ROOT_BY_MINIMIZING_TREE_HEIGHT = true;
61 private final static boolean TIME = false;
62 private HashMap<String, HashMap<String, Integer>> _o_hash_maps;
63 private HashMap<String, HashMap<String, Integer>> _so_hash_maps;
64 private HashMap<String, HashMap<String, Integer>> _up_hash_maps;
65 private HashMap<String, HashMap<String, Integer>> _sn_hash_maps; // HashMap of HashMaps
66 private DistanceMatrix _m;
67 private HashMap<String, Double> _l;
68 private List<String> _seq_names;
69 private int _bootstraps;
70 private int _ext_nodes_;
74 * Default constructor.
81 * Returns the numbers of trees analyzed.
83 * @return the numbers of trees analyzed
85 public final int getBootstraps() {
89 // Helper method for inferredOrthologsToString.
90 // inferredOrthologsToArrayList,
91 // and inferredUltraParalogsToString.
92 private final double getBootstrapValueFromHash( final HashMap<String, Integer> h, final String name ) {
93 if ( !h.containsKey( name ) ) {
96 final int i = h.get( name );
97 return ( i * 100.0 / getBootstraps() );
101 * Returns the distance to a sequences/taxa after a distance list file has
102 * been read in with readDistanceList(File). Throws an exception if name is
103 * not found or if no list has been read in.
108 public final double getDistance( String name ) {
109 double distance = 0.0;
112 throw new RuntimeException( "Distance list has probably not been read in (successfully)." );
114 if ( _l.get( name ) == null ) {
115 throw new IllegalArgumentException( name + " not found." );
117 distance = ( _l.get( name ) ).doubleValue();
121 public final double getDistance( final String name1, final String name2 ) {
123 return _m.getValue( _m.getIndex( name1 ), _m.getIndex( name2 ) );
125 catch ( final Exception e ) {
131 * Returns the numbers of number of ext nodes in gene trees analyzed (after
134 * @return number of ext nodes in gene trees analyzed (after stripping)
136 public final int getExtNodesOfAnalyzedGeneTrees() {
141 * Returns a HashMap containing the inferred orthologs of the external gene
142 * tree node with the sequence name seq_name. Sequence names are the keys
143 * (String), numbers of observations are the values (Int). Orthologs are to
144 * be inferred by method "inferOrthologs". Throws an exception if seq_name
148 * sequence name of a external node of the gene trees
149 * @return HashMap containing the inferred orthologs
150 * (name(String)->value(Int))
152 public final HashMap<String, Integer> getInferredOrthologs( final String seq_name ) {
153 if ( _o_hash_maps == null ) {
156 for( String key : _o_hash_maps.keySet() ) {
157 System.out.println( key + " -> " + _o_hash_maps.get( key ) );
159 System.out.println( "seq_name: " + seq_name );
160 return _o_hash_maps.get( seq_name );
163 private final HashMap<String, Integer> getInferredSubtreeNeighbors( final String seq_name ) {
164 if ( _sn_hash_maps == null ) {
167 return _sn_hash_maps.get( seq_name );
171 * Returns a HashMap containing the inferred "super orthologs" of the
172 * external gene tree node with the sequence name seq_name. Sequence names
173 * are the keys (String), numbers of observations are the values (Int).
174 * Super orthologs are to be inferred by method "inferOrthologs". Throws an
175 * exception if seq_name is not found.
178 * sequence name of a external node of the gene trees
179 * @return HashMap containing the inferred super orthologs
180 * (name(String)->value(Int))
182 public final HashMap<String, Integer> getInferredSuperOrthologs( final String seq_name ) {
183 if ( _so_hash_maps == null ) {
186 return _so_hash_maps.get( seq_name );
190 * Returns a HashMap containing the inferred "ultra paralogs" of the
191 * external gene tree node with the sequence name seq_name. Sequence names
192 * are the keys (String), numbers of observations are the values (Int).
193 * "ultra paralogs" are to be inferred by method "inferOrthologs". Throws an
194 * exception if seq_name is not found.
197 * sequence name of a external node of the gene trees
198 * @return HashMap containing the inferred ultra paralogs
199 * (name(String)->value(Int))
201 public final HashMap<String, Integer> getInferredUltraParalogs( final String seq_name ) {
202 if ( _up_hash_maps == null ) {
205 return _up_hash_maps.get( seq_name );
209 * Returns the time (in ms) needed to run "inferOrthologs". Final variable
210 * TIME needs to be set to true.
212 * @return time (in ms) needed to run method "inferOrthologs"
214 public long getTime() {
219 * Infers the orthologs (as well the "super orthologs", the "subtree
220 * neighbors", and the "ultra paralogs") for each external node of the gene
221 * Trees in multiple tree File gene_trees_file (=output of PHYLIP NEIGHBOR,
222 * for example). Tallies how many times each sequence is (super-)
223 * orthologous towards the query. Tallies how many times each sequence is
224 * ultra paralogous towards the query. Tallies how many times each sequence
225 * is a subtree neighbor of the query. Gene duplications are inferred using
226 * SDI. Modifies its argument species_tree. Is a little faster than
227 * "inferOrthologs(File,Phylogeny)" since orthologs are only inferred for
230 * To obtain the results use the methods listed below.
232 * @param gene_trees_file
233 * a File containing gene Trees in NH format, which is the result
234 * of performing a bootstrap analysis in PHYLIP
235 * @param species_tree
236 * a species Phylogeny, which has species names in its species
239 * the sequence name of the squence whose orthologs are to be
241 * @throws SDIException
243 public void inferOrthologs( final File gene_trees_file, final Phylogeny species_tree, final String query )
244 throws IOException, SDIException {
247 _time = System.currentTimeMillis();
249 // Read in first tree to get its sequence names
250 // and strip species_tree.
251 final PhylogenyFactory factory = ParserBasedPhylogenyFactory.getInstance();
252 final PhylogenyParser p = ParserUtils.createParserDependingOnFileType( gene_trees_file, true );
253 if ( p instanceof NHXParser ) {
254 final NHXParser nhx = ( NHXParser ) p;
255 nhx.setReplaceUnderscores( false );
256 nhx.setIgnoreQuotes( true );
257 nhx.setTaxonomyExtraction( PhylogenyMethods.TAXONOMY_EXTRACTION.YES );
259 final Phylogeny gene_tree = factory.create( gene_trees_file, p )[ 0 ];
260 System.out.println( "species " + species_tree.toString() );
261 // Removes from species_tree all species not found in gene_tree.
262 PhylogenyMethods.taxonomyBasedDeletionOfExternalNodes( gene_tree, species_tree );
263 // System.out.println( "gene " + gene_tree.toString() );
264 // System.out.println( "species " + species_tree.toString() );
265 // Removes from gene_tree all species not found in species_tree.
266 // Archaeopteryx.createApplication( gene_tree );
267 // Archaeopteryx.createApplication( species_tree );
269 // Thread.sleep( 40000 );
271 // catch ( InterruptedException e ) {
272 // // TODO Auto-generated catch block
273 // e.printStackTrace();
275 PhylogenyMethods.taxonomyBasedDeletionOfExternalNodes( species_tree, gene_tree );
276 _seq_names = getAllExternalSequenceNames( gene_tree );
277 if ( ( _seq_names == null ) || ( _seq_names.size() < 1 ) ) {
278 throw new IOException( "could not get sequence names" );
280 _o_hash_maps = new HashMap<String, HashMap<String, Integer>>();
281 _so_hash_maps = new HashMap<String, HashMap<String, Integer>>();
282 _up_hash_maps = new HashMap<String, HashMap<String, Integer>>();
283 _sn_hash_maps = new HashMap<String, HashMap<String, Integer>>();
284 // Go through all gene trees in the file.
285 final Phylogeny[] gene_trees = factory.create( gene_trees_file, p );
286 for( final Phylogeny gt : gene_trees ) {
288 // Removes from gene_tree all species not found in species_tree.
289 PhylogenyMethods.taxonomyBasedDeletionOfExternalNodes( species_tree, gt );
290 inferOrthologsHelper( gt, species_tree, _seq_names );
291 // System.out.println( bs );
295 _time = ( System.currentTimeMillis() - _time );
299 public List<PhylogenyNode> getNodesViaSequenceName( final Phylogeny phy, final String seq_name ) {
300 final List<PhylogenyNode> nodes = new ArrayList<PhylogenyNode>();
301 for( final PhylogenyNodeIterator iter = phy.iteratorPreorder(); iter.hasNext(); ) {
302 final PhylogenyNode n = iter.next();
303 if ( n.getNodeData().isHasSequence() && n.getNodeData().getSequence().getName().equals( seq_name ) ) {
306 if ( !n.getNodeData().isHasSequence() && n.getName().equals( seq_name ) ) {
313 // Helper method which performs the actual ortholog inference for
314 // the external node with seqname query.
315 private void inferOrthologsHelper( final Phylogeny gene_tree,
316 final Phylogeny species_tree,
317 final List<String> seq_names ) throws SDIException {
318 Phylogeny assigned_tree = null;
319 List<PhylogenyNode> nodes = null;
320 final SDIR sdiunrooted = new SDIR();
321 List<PhylogenyNode> orthologs = null;
322 List<PhylogenyNode> super_orthologs = null;
323 List<PhylogenyNode> ultra_paralogs = null;
324 List<PhylogenyNode> subtree_neighbors = null;
325 assigned_tree = sdiunrooted.infer( gene_tree,
327 RIO.ROOT_BY_MINIMIZING_MAPPING_COST,
328 RIO.ROOT_BY_MINIMIZING_SUM_OF_DUPS,
329 RIO.ROOT_BY_MINIMIZING_TREE_HEIGHT,
332 setExtNodesOfAnalyzedGeneTrees( assigned_tree.getNumberOfExternalNodes() );
333 for( String seq_name : seq_names ) {
334 _o_hash_maps.put( seq_name, new HashMap<String, Integer>() );
335 _so_hash_maps.put( seq_name, new HashMap<String, Integer>() );
336 _up_hash_maps.put( seq_name, new HashMap<String, Integer>() );
337 _sn_hash_maps.put( seq_name, new HashMap<String, Integer>() );
338 nodes = getNodesViaSequenceName( assigned_tree, seq_name );
339 if ( nodes.size() > 1 ) {
340 throw new IllegalArgumentException( "node named [" + seq_name + "] not unique" );
342 else if ( nodes.isEmpty() ) {
343 throw new IllegalArgumentException( "no node containing a sequence named [" + seq_name + "] found" );
345 final PhylogenyNode query_node = nodes.get( 0 );
346 final PhylogenyMethods methods = PhylogenyMethods.getInstance();
347 orthologs = methods.getOrthologousNodes( assigned_tree, query_node );
348 updateHash( _o_hash_maps, seq_name, orthologs );
349 super_orthologs = PhylogenyMethods.getSuperOrthologousNodes( query_node );
350 updateHash( _so_hash_maps, seq_name, super_orthologs );
351 subtree_neighbors = getSubtreeNeighbors( query_node, 2 );
352 updateHash( _sn_hash_maps, seq_name, subtree_neighbors );
353 ultra_paralogs = PhylogenyMethods.getUltraParalogousNodes( query_node );
354 updateHash( _up_hash_maps, seq_name, ultra_paralogs );
359 * Returns an ArrayList containg the names of orthologs of the PhylogenyNode
360 * with seq name seq_name.
363 * sequence name of a external node of the gene trees
364 * @param threshold_orthologs
365 * the minimal number of observations for a a sequence to be
366 * reported as orthologous as percentage (0.0-100.0%)
367 * @return ArrayList containg the names of orthologs of the PhylogenyNode
368 * with seq name seq_name
370 public ArrayList<String> inferredOrthologsToArrayList( final String seq_name, double threshold_orthologs ) {
371 HashMap<String, Integer> o_hashmap = null;
374 final ArrayList<String> arraylist = new ArrayList<String>();
375 if ( _o_hash_maps == null ) {
376 throw new RuntimeException( "Orthologs have not been calculated (successfully)." );
378 if ( threshold_orthologs < 0.0 ) {
379 threshold_orthologs = 0.0;
381 else if ( threshold_orthologs > 100.0 ) {
382 threshold_orthologs = 100.0;
384 o_hashmap = getInferredOrthologs( seq_name );
385 if ( o_hashmap == null ) {
386 throw new RuntimeException( "Orthologs for " + seq_name + " were not established." );
388 if ( _seq_names.size() > 0 ) {
389 I: for( int i = 0; i < _seq_names.size(); ++i ) {
390 name = _seq_names.get( i );
391 if ( name.equals( seq_name ) ) {
394 o = getBootstrapValueFromHash( o_hashmap, name );
395 if ( o < threshold_orthologs ) {
398 arraylist.add( name );
405 * Returns a String containg the names of orthologs of the PhylogenyNode
406 * with seq name query_name. The String also contains how many times a
407 * particular ortholog has been observed.
410 * The output order is (per line): Name, Ortholog, Subtree neighbor, Super
414 * The sort priority of this is determined by sort in the following manner:
417 * <li>1 : Ortholog, Super ortholog
418 * <li>2 : Super ortholog, Ortholog
419 * <li>3 : Ortholog, Distance
420 * <li>4 : Distance, Ortholog
421 * <li>5 : Ortholog, Super ortholog, Distance
422 * <li>6 : Ortholog, Distance, Super ortholog
423 * <li>7 : Super ortholog, Ortholog, Distance
424 * <li>8 : Super ortholog, Distance, Ortholog
425 * <li>9 : Distance, Ortholog, Super ortholog
426 * <li>10 : Distance, Super ortholog, Ortholog
427 * <li>11 : Ortholog, Subtree neighbor, Distance
428 * <li>12 : Ortholog, Subtree neighbor, Super ortholog, Distance (default)
429 * <li>13 : Ortholog, Super ortholog, Subtree neighbor, Distance
430 * <li>14 : Subtree neighbor, Ortholog, Super ortholog, Distance
431 * <li>15 : Subtree neighbor, Distance, Ortholog, Super ortholog
432 * <li>16 : Ortholog, Distance, Subtree neighbor, Super ortholog
433 * <li>17 : Ortholog, Subtree neighbor, Distance, Super ortholog
436 * Returns "-" if no putative orthologs have been found (given
437 * threshold_orthologs).
439 * Orthologs are to be inferred by method "inferOrthologs".
441 * (Last modified: 05/08/01)
444 * sequence name of a external node of the gene trees
446 * order and sort priority
447 * @param threshold_orthologs
448 * the minimal number of observations for a a sequence to be
449 * reported as orthologous, in percents (0.0-100.0%)
450 * @param threshold_subtreeneighborings
451 * the minimal number of observations for a a sequence to be
452 * reported as orthologous, in percents (0.0-100.0%)
453 * @return String containing the inferred orthologs, String containing "-"
454 * if no orthologs have been found null in case of error
455 * @see #inferOrthologs(File,Phylogeny,String)
456 * @see #inferOrthologs(Phylogeny[],Phylogeny)
457 * @see #inferOrthologs(File,Phylogeny)
458 * @see #getOrder(int)
460 public StringBuffer inferredOrthologsToString( final String query_name,
462 double threshold_orthologs,
463 double threshold_subtreeneighborings ) {
464 HashMap<String, Integer> o_hashmap = null;
465 HashMap<String, Integer> s_hashmap = null;
466 HashMap<String, Integer> n_hashmap = null;
468 double o = 0.0, // Orthologs.
469 s = 0.0, // Super orthologs.
470 sn = 0.0, // Subtree neighbors.
471 value1 = 0.0, value2 = 0.0, value3 = 0.0, value4 = 0.0, d = 0.0;
472 final ArrayList<Tuplet> nv = new ArrayList<Tuplet>();
473 if ( ( _o_hash_maps == null ) || ( _so_hash_maps == null ) || ( _sn_hash_maps == null ) ) {
474 //TODO ~~~ throw new RuntimeException( "Orthologs have not been calculated (successfully)" );
476 if ( ( sort < 0 ) || ( sort > 17 ) ) {
479 if ( ( sort > 2 ) && ( _m == null ) && ( _l == null ) ) {
480 throw new RuntimeException( "Distance list or matrix have not been read in (successfully)" );
482 if ( threshold_orthologs < 0.0 ) {
483 threshold_orthologs = 0.0;
485 else if ( threshold_orthologs > 100.0 ) {
486 threshold_orthologs = 100.0;
488 if ( threshold_subtreeneighborings < 0.0 ) {
489 threshold_subtreeneighborings = 0.0;
491 else if ( threshold_subtreeneighborings > 100.0 ) {
492 threshold_subtreeneighborings = 100.0;
494 o_hashmap = getInferredOrthologs( query_name );
495 s_hashmap = getInferredSuperOrthologs( query_name );
496 n_hashmap = getInferredSubtreeNeighbors( query_name );
497 if ( ( o_hashmap == null ) || ( s_hashmap == null ) || ( n_hashmap == null ) ) {
498 //TODO ~~~ throw new RuntimeException( "Orthologs for " + query_name + " were not established" );
500 final StringBuffer orthologs = new StringBuffer();
501 if ( _seq_names.size() > 0 ) {
502 I: for( int i = 0; i < _seq_names.size(); ++i ) {
503 name = _seq_names.get( i );
504 if ( name.equals( query_name ) ) {
507 o = getBootstrapValueFromHash( o_hashmap, name );
508 if ( o < threshold_orthologs ) {
511 sn = getBootstrapValueFromHash( n_hashmap, name );
512 if ( sn < threshold_subtreeneighborings ) {
515 s = getBootstrapValueFromHash( s_hashmap, name );
518 d = getDistance( query_name, name );
521 d = getDistance( name );
526 nv.add( new Tuplet( name, o, 5 ) );
529 nv.add( new Tuplet( name, o, s, 5 ) );
532 nv.add( new Tuplet( name, s, o, 5 ) );
535 nv.add( new Tuplet( name, o, d, 1 ) );
538 nv.add( new Tuplet( name, d, o, 0 ) );
541 nv.add( new Tuplet( name, o, s, d, 2 ) );
544 nv.add( new Tuplet( name, o, d, s, 1 ) );
547 nv.add( new Tuplet( name, s, o, d, 2 ) );
550 nv.add( new Tuplet( name, s, d, o, 1 ) );
553 nv.add( new Tuplet( name, d, o, s, 0 ) );
556 nv.add( new Tuplet( name, d, s, o, 0 ) );
559 nv.add( new Tuplet( name, o, sn, d, 2 ) );
562 nv.add( new Tuplet( name, o, sn, s, d, 3 ) );
565 nv.add( new Tuplet( name, o, s, sn, d, 3 ) );
568 nv.add( new Tuplet( name, sn, o, s, d, 3 ) );
571 nv.add( new Tuplet( name, sn, d, o, s, 1 ) );
574 nv.add( new Tuplet( name, o, d, sn, s, 1 ) );
577 nv.add( new Tuplet( name, o, sn, d, s, 2 ) );
580 nv.add( new Tuplet( name, o, 5 ) );
582 } // End of I for loop.
583 if ( ( nv != null ) && ( nv.size() > 0 ) ) {
584 orthologs.append( "[seq name]\t\t[ortho]\t[st-n]\t[sup-o]\t[dist]" + ForesterUtil.LINE_SEPARATOR );
585 final Tuplet[] nv_array = new Tuplet[ nv.size() ];
586 for( int j = 0; j < nv.size(); ++j ) {
587 nv_array[ j ] = nv.get( j );
589 Arrays.sort( nv_array );
590 for( int i = 0; i < nv_array.length; ++i ) {
591 name = nv_array[ i ].getKey();
592 value1 = nv_array[ i ].getValue1();
593 value2 = nv_array[ i ].getValue2();
594 value3 = nv_array[ i ].getValue3();
595 value4 = nv_array[ i ].getValue4();
596 orthologs.append( addNameAndValues( name, value1, value2, value3, value4, sort ) );
600 // No orthologs found.
601 if ( ( orthologs == null ) || ( orthologs.length() < 1 ) ) {
602 orthologs.append( "-" );
605 } // inferredOrthologsToString( String, int, double )
607 // Helper method for inferredOrthologTableToFile.
608 // Returns individual rows for the table as String.
609 private String inferredOrthologsToTableHelper( final String name2,
610 final List<String> names,
612 final boolean super_orthologs ) {
613 HashMap<String, Integer> hashmap = null;
614 String name = null, orthologs = new String( "" );
616 if ( super_orthologs ) {
617 hashmap = getInferredSuperOrthologs( name2 );
620 hashmap = getInferredOrthologs( name2 );
622 if ( hashmap == null ) {
623 throw new RuntimeException( "Unexpected failure in method inferredOrthologsToTableHelper" );
625 for( int i = 0; i < names.size(); ++i ) {
626 name = names.get( i );
627 if ( !hashmap.containsKey( name ) ) {
631 value = hashmap.get( name );
636 throw new RuntimeException( "Failed sanity check in method inferredOrthologsToTableHelper: value not 0." );
638 orthologs += ( " " + "\t" );
641 orthologs += ( value + "\t" );
648 * Writes the orthologs for each external node of the gene trees to outfile
649 * in the form of a table. Orthologs are to be inferred by method
650 * "inferOrthologs". Overwrites without asking! (Last modified: 12/07/00)
653 * the File to write to
655 public void inferredOrthologTableToFile( final File outfile ) throws IOException {
656 if ( _o_hash_maps == null ) {
659 inferredOrthologTableToFile( outfile, false );
662 private void inferredOrthologTableToFile( final File outfile, final boolean super_orthologs ) throws IOException {
663 String name = "", line = "";
664 PrintWriter out = null;
665 if ( _seq_names == null ) {
666 throw new RuntimeException( "inferredOrthologTableToFile: _seq_names is null" );
668 Collections.sort( _seq_names );
669 out = new PrintWriter( new FileWriter( outfile ), true );
671 for( int i = 0; i < _seq_names.size(); ++i ) {
672 line += ( i + ")\t" );
676 for( int i = 0; i < _seq_names.size(); ++i ) {
677 name = _seq_names.get( i );
678 if ( name.length() < 8 ) {
679 line = i + ")\t" + name + "\t\t\t";
681 else if ( name.length() < 16 ) {
682 line = i + ")\t" + name + "\t\t";
685 line = i + ")\t" + name + "\t";
687 line += inferredOrthologsToTableHelper( name, _seq_names, i, super_orthologs );
694 * Writes the "super orthologs" for each external nodes of the gene trees to
695 * outfile in the form of a table. Super orthologs are to be inferred by
696 * method "inferOrthologs". Overwrites without asking!
699 * the File to write to
701 public void inferredSuperOrthologTableToFile( final File outfile ) throws IOException {
702 if ( _so_hash_maps == null ) {
705 inferredOrthologTableToFile( outfile, true );
709 * Returns a String containg the names of orthologs of the PhylogenyNode
710 * with seq name query_name. The String also contains how many times a
711 * particular ortholog has been observed. Returns "-" if no putative
712 * orthologs have been found (given threshold_orthologs).
714 * Orthologs are to be inferred by method "inferOrthologs".
717 * sequence name of a external node of the gene trees
718 * @param return_dists
719 * @param threshold_ultra_paralogs
721 * @return String containing the inferred orthologs, String containing "-"
722 * if no orthologs have been found null in case of error
724 public String inferredUltraParalogsToString( final String query_name,
725 final boolean return_dists,
726 double threshold_ultra_paralogs ) {
727 HashMap<String, Integer> sp_hashmap = null;
728 String name = "", ultra_paralogs = "";
730 double sp = 0.0, value1 = 0.0, value2 = 0.0, d = 0.0;
731 final List<Tuplet> nv = new ArrayList<Tuplet>();
732 if ( threshold_ultra_paralogs < 1.0 ) {
733 threshold_ultra_paralogs = 1.0;
735 else if ( threshold_ultra_paralogs > 100.0 ) {
736 threshold_ultra_paralogs = 100.0;
738 if ( _up_hash_maps == null ) {
739 throw new RuntimeException( "Ultra paralogs have not been calculated (successfully)." );
741 if ( return_dists && ( _m == null ) && ( _l == null ) ) {
742 throw new RuntimeException( "Distance list or matrix have not been read in (successfully)." );
744 sp_hashmap = getInferredUltraParalogs( query_name );
745 if ( sp_hashmap == null ) {
746 throw new RuntimeException( "Ultra paralogs for " + query_name + " were not established" );
748 if ( _seq_names.size() > 0 ) {
749 I: for( int i = 0; i < _seq_names.size(); ++i ) {
750 name = _seq_names.get( i );
751 if ( name.equals( query_name ) ) {
754 sp = getBootstrapValueFromHash( sp_hashmap, name );
755 if ( sp < threshold_ultra_paralogs ) {
758 if ( return_dists ) {
760 d = getDistance( query_name, name );
763 d = getDistance( name );
765 nv.add( new Tuplet( name, sp, d, 1 ) );
768 nv.add( new Tuplet( name, sp, 5 ) );
770 } // End of I for loop.
771 if ( ( nv != null ) && ( nv.size() > 0 ) ) {
772 final Tuplet[] nv_array = new Tuplet[ nv.size() ];
773 for( int j = 0; j < nv.size(); ++j ) {
774 nv_array[ j ] = nv.get( j );
776 Arrays.sort( nv_array );
777 if ( return_dists ) {
783 for( int i = 0; i < nv_array.length; ++i ) {
784 name = nv_array[ i ].getKey();
785 value1 = nv_array[ i ].getValue1();
786 value2 = nv_array[ i ].getValue2();
787 ultra_paralogs += addNameAndValues( name, value1, value2, 0.0, 0.0, sort );
791 // No ultra paralogs found.
792 if ( ( ultra_paralogs == null ) || ( ultra_paralogs.length() < 1 ) ) {
793 ultra_paralogs = "-";
795 return ultra_paralogs;
798 public final void readDistanceMatrix( final File matrix_file ) throws IOException {
799 DistanceMatrix[] matrices = null;
800 final SymmetricalDistanceMatrixParser parser = SymmetricalDistanceMatrixParser.createInstance();
801 matrices = parser.parse( matrix_file );
802 if ( ( matrices == null ) || ( matrices.length == 0 ) ) {
803 throw new IOException( "failed to parse distance matrix from [" + matrix_file + "]" );
805 if ( matrices.length > 1 ) {
806 throw new IOException( "[" + matrix_file + "] contains more than once distance matrix" );
812 * Brings this into the same state as immediately after construction.
814 private final void reset() {
816 _so_hash_maps = null;
817 _up_hash_maps = null;
827 * Sets the numbers of trees analyzed.
829 * numbers of trees analyzed
831 private void setBootstraps( int i ) {
839 * Sets number of ext nodes in gene trees analyzed (after stripping).
841 * number of ext nodes in gene trees analyzed (after stripping)
843 private void setExtNodesOfAnalyzedGeneTrees( int i ) {
850 // Helper for doInferOrthologs( Phylogeny, Phylogeny, String )
851 // and doInferOrthologs( Phylogeny, Phylogeny ).
852 private void updateHash( final HashMap<String, HashMap<String, Integer>> counter_map,
853 final String query_seq_name,
854 final List<PhylogenyNode> nodes ) {
855 final HashMap<String, Integer> hash_map = counter_map.get( query_seq_name );
856 if ( hash_map == null ) {
857 throw new RuntimeException( "Unexpected failure in method updateHash." );
859 for( int j = 0; j < nodes.size(); ++j ) {
861 if ( ( nodes.get( j ) ).getNodeData().isHasSequence()
862 && !ForesterUtil.isEmpty( ( nodes.get( j ) ).getNodeData().getSequence().getName() ) ) {
863 seq_name = ( nodes.get( j ) ).getNodeData().getSequence().getName();
866 seq_name = ( nodes.get( j ) ).getName();
868 if ( hash_map.containsKey( seq_name ) ) {
869 hash_map.put( seq_name, hash_map.get( seq_name ) + 1 );
872 hash_map.put( seq_name, 1 );
877 // Helper method for inferredOrthologsToString
878 // and inferredUltraParalogsToString.
879 private final static String addNameAndValues( final String name,
885 final java.text.DecimalFormat df = new java.text.DecimalFormat( "0.#####" );
886 df.setDecimalSeparatorAlwaysShown( false );
888 if ( name.length() < 8 ) {
889 line += ( name + "\t\t\t" );
891 else if ( name.length() < 16 ) {
892 line += ( name + "\t\t" );
895 line += ( name + "\t" );
899 line += addToLine( value1, df );
905 line += addToLine( value1, df );
907 line += addToLine( value2, df );
911 line += addToLine( value2, df );
913 line += addToLine( value1, df );
917 line += addToLine( value1, df );
920 line += addToLine( value2, df );
923 line += addToLine( value2, df );
926 line += addToLine( value1, df );
929 line += addToLine( value1, df );
931 line += addToLine( value2, df );
932 line += addToLine( value3, df );
935 line += addToLine( value1, df );
937 line += addToLine( value3, df );
938 line += addToLine( value2, df );
941 line += addToLine( value2, df );
943 line += addToLine( value1, df );
944 line += addToLine( value3, df );
947 line += addToLine( value3, df );
949 line += addToLine( value1, df );
950 line += addToLine( value2, df );
953 line += addToLine( value2, df );
955 line += addToLine( value3, df );
956 line += addToLine( value1, df );
959 line += addToLine( value3, df );
961 line += addToLine( value2, df );
962 line += addToLine( value1, df );
965 line += addToLine( value1, df );
966 line += addToLine( value2, df );
968 line += addToLine( value3, df );
971 line += addToLine( value1, df );
972 line += addToLine( value2, df );
973 line += addToLine( value3, df );
974 line += addToLine( value4, df );
977 line += addToLine( value1, df );
978 line += addToLine( value3, df );
979 line += addToLine( value2, df );
980 line += addToLine( value4, df );
983 line += addToLine( value2, df );
984 line += addToLine( value1, df );
985 line += addToLine( value3, df );
986 line += addToLine( value4, df );
989 line += addToLine( value3, df );
990 line += addToLine( value1, df );
991 line += addToLine( value4, df );
992 line += addToLine( value2, df );
995 line += addToLine( value1, df );
996 line += addToLine( value3, df );
997 line += addToLine( value4, df );
998 line += addToLine( value2, df );
1001 line += addToLine( value1, df );
1002 line += addToLine( value2, df );
1003 line += addToLine( value4, df );
1004 line += addToLine( value3, df );
1007 line += addToLine( value1, df );
1011 line += addToLine( value1, df );
1012 line += addToLine( value2, df );
1015 line += ForesterUtil.LINE_SEPARATOR;
1019 // Helper for addNameAndValues.
1020 private final static String addToLine( final double value, final java.text.DecimalFormat df ) {
1022 if ( value != Tuplet.DEFAULT ) {
1023 s = df.format( value ) + "\t";
1031 private static List<String> getAllExternalSequenceNames( final Phylogeny phy ) {
1032 final List<String> names = new ArrayList<String>();
1033 for( final PhylogenyNodeIterator iter = phy.iteratorExternalForward(); iter.hasNext(); ) {
1034 final PhylogenyNode n = iter.next();
1035 if ( n.getNodeData().isHasSequence() && !ForesterUtil.isEmpty( n.getNodeData().getSequence().getName() ) ) {
1036 names.add( n.getNodeData().getSequence().getName() );
1038 else if ( !ForesterUtil.isEmpty( n.getName() ) ) {
1039 names.add( n.getName() );
1042 throw new IllegalArgumentException( "node has no (sequence) name: " + n );
1049 * Returns the order in which ortholog (o), "super ortholog" (s) and
1050 * distance (d) are returned and sorted (priority of sort always goes from
1051 * left to right), given sort. For the meaning of sort
1053 * @see #inferredOrthologsToString(String,int,double,double)
1056 * determines order and sort priority
1057 * @return String indicating the order
1059 public final static String getOrder( final int sort ) {
1063 order = "orthologies";
1066 order = "orthologies > super orthologies";
1069 order = "super orthologies > orthologies";
1072 order = "orthologies > distance to query";
1075 order = "distance to query > orthologies";
1078 order = "orthologies > super orthologies > distance to query";
1081 order = "orthologies > distance to query > super orthologies";
1084 order = "super orthologies > orthologies > distance to query";
1087 order = "super orthologies > distance to query > orthologies";
1090 order = "distance to query > orthologies > super orthologies";
1093 order = "distance to query > super orthologies > orthologies";
1096 order = "orthologies > subtree neighbors > distance to query";
1099 order = "orthologies > subtree neighbors > super orthologies > distance to query";
1102 order = "orthologies > super orthologies > subtree neighbors > distance to query";
1105 order = "subtree neighbors > orthologies > super orthologies > distance to query";
1108 order = "subtree neighbors > distance to query > orthologies > super orthologies";
1111 order = "orthologies > distance to query > subtree neighbors > super orthologies";
1114 order = "orthologies > subtree neighbors > distance to query > super orthologies";
1117 order = "orthologies";
1123 public final static StringBuffer getOrderHelp() {
1124 final StringBuffer sb = new StringBuffer();
1125 sb.append( " 0: orthologies" + ForesterUtil.LINE_SEPARATOR );
1126 sb.append( " 1: orthologies > super orthologies" + ForesterUtil.LINE_SEPARATOR );
1127 sb.append( " 2: super orthologies > orthologies" + ForesterUtil.LINE_SEPARATOR );
1128 sb.append( " 3: orthologies > distance to query" + ForesterUtil.LINE_SEPARATOR );
1129 sb.append( " 4: distance to query > orthologies" + ForesterUtil.LINE_SEPARATOR );
1130 sb.append( " 5: orthologies > super orthologies > distance to query" + ForesterUtil.LINE_SEPARATOR );
1131 sb.append( " 6: orthologies > distance to query > super orthologies" + ForesterUtil.LINE_SEPARATOR );
1132 sb.append( " 7: super orthologies > orthologies > distance to query" + ForesterUtil.LINE_SEPARATOR );
1133 sb.append( " 8: super orthologies > distance to query > orthologies" + ForesterUtil.LINE_SEPARATOR );
1134 sb.append( " 9: distance to query > orthologies > super orthologies" + ForesterUtil.LINE_SEPARATOR );
1135 sb.append( " 10: distance to query > super orthologies > orthologies" + ForesterUtil.LINE_SEPARATOR );
1136 sb.append( " 11: orthologies > subtree neighbors > distance to query" + ForesterUtil.LINE_SEPARATOR );
1137 sb.append( " 12: orthologies > subtree neighbors > super orthologies > distance to query"
1138 + ForesterUtil.LINE_SEPARATOR );
1139 sb.append( " 13: orthologies > super orthologies > subtree neighbors > distance to query"
1140 + ForesterUtil.LINE_SEPARATOR );
1141 sb.append( " 14: subtree neighbors > orthologies > super orthologies > distance to query"
1142 + ForesterUtil.LINE_SEPARATOR );
1143 sb.append( " 15: subtree neighbors > distance to query > orthologies > super orthologies"
1144 + ForesterUtil.LINE_SEPARATOR );
1145 sb.append( " 16: orthologies > distance to query > subtree neighbors > super orthologies"
1146 + ForesterUtil.LINE_SEPARATOR );
1147 sb.append( " 17: orthologies > subtree neighbors > distance to query > super orthologies"
1148 + ForesterUtil.LINE_SEPARATOR );
1152 private final static List<PhylogenyNode> getSubtreeNeighbors( final PhylogenyNode query, final int level ) {
1153 PhylogenyNode node = query;
1154 if ( !node.isExternal() ) {
1157 if ( !node.isRoot() ) {
1158 node = node.getParent();
1161 if ( !node.isRoot() ) {
1162 node = node.getParent();
1166 throw new IllegalArgumentException( "currently only supporting level 2 subtree neighbors " );
1168 final List<PhylogenyNode> sn = node.getAllExternalDescendants();