import org.forester.phylogeny.data.NodeVisualData.NodeFill;
import org.forester.phylogeny.data.NodeVisualData.NodeShape;
import org.forester.phylogeny.iterators.PhylogenyNodeIterator;
-import org.forester.sequence.Sequence;
+import org.forester.sequence.MolecularSequence;
import org.forester.tools.ConfidenceAssessor;
import org.forester.util.BasicDescriptiveStatistics;
import org.forester.util.DescriptiveStatistics;
public class MsaCompactor {
- final private static NumberFormat NF_1 = new DecimalFormat( "0.#" );
- final private static NumberFormat NF_3 = new DecimalFormat( "0.###" );
- final private static NumberFormat NF_4 = new DecimalFormat( "0.####" );
- private boolean _calculate_shannon_entropy = false;
+ final private static NumberFormat NF_1 = new DecimalFormat( "0.#" );
+ final private static NumberFormat NF_3 = new DecimalFormat( "0.###" );
+ final private static NumberFormat NF_4 = new DecimalFormat( "0.####" );
+ private boolean _calculate_shannon_entropy = false;
//
- private String _infile_name = null;
- private final short _longest_id_length;
+ private String _infile_name = null;
+ private final short _longest_id_length;
//
- private String _maffts_opts = "--auto";
- private DeleteableMsa _msa = null;
- private boolean _norm = true;
- private File _out_file_base = null;
- private MSA_FORMAT _output_format = MSA_FORMAT.FASTA;
- private String _path_to_mafft = null;
- private boolean _phylogentic_inference = false;
+ private String _maffts_opts = "--auto";
+ private DeleteableMsa _msa = null;
+ private boolean _normalize_for_effective_seq_length = true;
+ private File _out_file_base = null;
+ private MSA_FORMAT _output_format = MSA_FORMAT.FASTA;
+ private String _path_to_mafft = null;
+ private boolean _phylogentic_inference = false;
//
- private boolean _realign = false;
- private final SortedSet<String> _removed_seq_ids;
- private final ArrayList<Sequence> _removed_seqs;
- private File _removed_seqs_out_base = null;
- private int _step = -1;
- private int _step_for_diagnostics = -1;
+ private boolean _realign = false;
+ private final SortedSet<String> _removed_seq_ids;
+ private final ArrayList<MolecularSequence> _removed_seqs;
+ private File _removed_seqs_out_base = null;
+ private int _step = -1;
+ private int _step_for_diagnostics = -1;
static {
NF_1.setRoundingMode( RoundingMode.HALF_UP );
NF_4.setRoundingMode( RoundingMode.HALF_UP );
_msa = msa;
_removed_seq_ids = new TreeSet<String>();
_longest_id_length = _msa.determineMaxIdLength();
- _removed_seqs = new ArrayList<Sequence>();
+ _removed_seqs = new ArrayList<MolecularSequence>();
}
public final Phylogeny calcTree() {
return phy;
}
- public final List<MsaProperties> chart( final int step, final boolean realign, final boolean norm )
+ public final List<MsaProperties> chart( final int step, final boolean realign, final boolean normalize_for_effective_seq_length )
throws IOException, InterruptedException {
- final GapContribution stats[] = calcGapContribtionsStats( norm );
+ final GapContribution stats[] = calcGapContribtionsStats( normalize_for_effective_seq_length );
final List<String> to_remove_ids = new ArrayList<String>();
final List<MsaProperties> msa_props = new ArrayList<MsaProperties>();
for( final GapContribution gap_gontribution : stats ) {
System.out.println( "calculating phylogentic tree..." );
System.out.println();
phy = calcTree();
+ addSeqs2Tree( _msa, phy );
}
if ( !_realign ) {
_step = -1;
}
++i;
}
+
if ( _phylogentic_inference ) {
decorateTree( phy, msa_props, true );
displayTree( phy );
return msa_props;
}
- public final void decorateTree( final Phylogeny phy, final List<MsaProperties> msa_props, final boolean chart_only ) {
+ private final static void addSeqs2Tree( final Msa msa, final Phylogeny phy ) {
+ for( int i = 0; i < msa.getNumberOfSequences(); ++i ) {
+ final MolecularSequence seq = msa.getSequence( i );
+ final String seq_name = seq.getIdentifier();
+ final PhylogenyNode n = phy.getNode( seq_name );
+ if ( !n.getNodeData().isHasSequence() ) {
+ n.getNodeData().addSequence( new org.forester.phylogeny.data.Sequence() );
+ }
+ else {
+ throw new IllegalArgumentException( "this should not have happened" );
+ }
+ n.getNodeData().getSequence().setMolecularSequence( seq.getMolecularSequenceAsString() );
+ n.getNodeData().getSequence().setMolecularSequenceAligned( true );
+ n.getNodeData().getSequence().setName( seq_name );
+ }
+ }
+
+ private final static void decorateTree( final Phylogeny phy,
+ final List<MsaProperties> msa_props,
+ final boolean chart_only ) {
final BasicDescriptiveStatistics length_stats = new BasicDescriptiveStatistics();
for( int i = 0; i < msa_props.size(); ++i ) {
final MsaProperties msa_prop = msa_props.get( i );
}
else {
n.getNodeData()
- .getNodeVisualData()
- .setNodeColor( ForesterUtil.calcColor( msa_prop.getLength(),
- min,
- max,
- mean,
- min_color,
- max_color,
- mean_color ) );
+ .getNodeVisualData()
+ .setNodeColor( ForesterUtil.calcColor( msa_prop.getLength(),
+ min,
+ max,
+ mean,
+ min_color,
+ max_color,
+ mean_color ) );
}
}
}
config.setDisplaySequenceNames( false );
config.setDisplaySequenceSymbols( false );
config.setDisplayGeneNames( false );
+ config.setDisplayMultipleSequenceAlignment( true );
config.setShowScale( true );
config.setAddTaxonomyImagesCB( false );
config.setBaseFontSize( 9 );
}
public final List<MsaProperties> removeViaGapAverage( final double mean_gapiness ) throws IOException,
- InterruptedException {
- final GapContribution stats[] = calcGapContribtionsStats( _norm );
+ InterruptedException {
+ final GapContribution stats[] = calcGapContribtionsStats( _normalize_for_effective_seq_length );
final List<String> to_remove_ids = new ArrayList<String>();
final List<MsaProperties> msa_props = new ArrayList<MsaProperties>();
for( final GapContribution gap_gontribution : stats ) {
System.out.println( "calculating phylogentic tree..." );
System.out.println();
phy = calcTree();
+ addSeqs2Tree( _msa, phy );
}
printTableHeader();
MsaProperties msa_prop = new MsaProperties( _msa, "", _calculate_shannon_entropy );
while ( MsaMethods.calcGapRatio( _msa ) > mean_gapiness ) {
final String id = to_remove_ids.get( i );
_removed_seq_ids.add( id );
- final Sequence deleted = _msa.deleteRow( id, true );
+ final MolecularSequence deleted = _msa.deleteRow( id, true );
_removed_seqs.add( deleted );
removeGapColumns();
if ( isPrintMsaStatsWriteOutfileAndRealign( i ) || ( MsaMethods.calcGapRatio( _msa ) <= mean_gapiness ) ) {
if ( _phylogentic_inference ) {
decorateTree( phy, msa_props, false );
displayTree( phy );
- }
+ System.out.println( "calculating phylogentic tree..." );
+ System.out.println();
+ final Phylogeny phy2 = calcTree();
+ addSeqs2Tree( _msa, phy2 );
+ displayTree( phy2 );
+ }
+
+
return msa_props;
}
public List<MsaProperties> removeViaLength( final int length ) throws IOException, InterruptedException {
- final GapContribution stats[] = calcGapContribtionsStats( _norm );
+ final GapContribution stats[] = calcGapContribtionsStats( _normalize_for_effective_seq_length );
final List<String> to_remove_ids = new ArrayList<String>();
final List<MsaProperties> msa_props = new ArrayList<MsaProperties>();
for( final GapContribution gap_gontribution : stats ) {
System.out.println( "calculating phylogentic tree..." );
System.out.println();
phy = calcTree();
+ addSeqs2Tree( _msa, phy );
}
printTableHeader();
MsaProperties msa_prop = new MsaProperties( _msa, "", _calculate_shannon_entropy );
while ( _msa.getLength() > length ) {
final String id = to_remove_ids.get( i );
_removed_seq_ids.add( id );
- final Sequence deleted = _msa.deleteRow( id, true );
+ final MolecularSequence deleted = _msa.deleteRow( id, true );
_removed_seqs.add( deleted );
removeGapColumns();
if ( isPrintMsaStatsWriteOutfileAndRealign( i ) || ( _msa.getLength() <= length ) ) {
if ( _phylogentic_inference ) {
decorateTree( phy, msa_props, false );
displayTree( phy );
- }
+ System.out.println( "calculating phylogentic tree..." );
+ System.out.println();
+ final Phylogeny phy2 = calcTree();
+ addSeqs2Tree( _msa, phy2 );
+ displayTree( phy2 );
+ }
+
return msa_props;
}
public final List<MsaProperties> removeWorstOffenders( final int to_remove ) throws IOException,
- InterruptedException {
- final GapContribution stats[] = calcGapContribtionsStats( _norm );
+ InterruptedException {
+ final GapContribution stats[] = calcGapContribtionsStats( _normalize_for_effective_seq_length );
final List<String> to_remove_ids = new ArrayList<String>();
final List<MsaProperties> msa_props = new ArrayList<MsaProperties>();
for( int j = 0; j < to_remove; ++j ) {
System.out.println( "calculating phylogentic tree..." );
System.out.println();
phy = calcTree();
+ addSeqs2Tree( _msa, phy );
}
printTableHeader();
MsaProperties msa_prop = new MsaProperties( _msa, "", _calculate_shannon_entropy );
for( int i = 0; i < to_remove_ids.size(); ++i ) {
final String id = to_remove_ids.get( i );
_removed_seq_ids.add( id );
- final Sequence deleted = _msa.deleteRow( id, true );
+ final MolecularSequence deleted = _msa.deleteRow( id, true );
_removed_seqs.add( deleted );
removeGapColumns();
if ( isPrintMsaStatsWriteOutfileAndRealign( i ) || ( i == ( to_remove_ids.size() - 1 ) ) ) {
if ( _phylogentic_inference ) {
decorateTree( phy, msa_props, false );
displayTree( phy );
- }
+ System.out.println( "calculating phylogentic tree..." );
+ System.out.println();
+ final Phylogeny phy2 = calcTree();
+ addSeqs2Tree( _msa, phy2 );
+ displayTree( phy2 );
+ }
+
return msa_props;
}
_maffts_opts = maffts_opts;
}
- public final void setNorm( final boolean norm ) {
- _norm = norm;
+ public final void setNorm( final boolean normalize_for_effective_seq_length ) { // Setter keeps its public name setNorm; only the parameter and the backing field are renamed by this change.
+ _normalize_for_effective_seq_length = normalize_for_effective_seq_length; // This flag is what removeViaGapAverage, removeViaLength and removeWorstOffenders pass to calcGapContribtionsStats.
}
final public void setOutFileBase( final File out_file_base ) {
return s;
}
- final int calcNonGapResidues( final Sequence seq ) {
+ final int calcNonGapResidues( final MolecularSequence seq ) {
int ng = 0;
for( int i = 0; i < seq.getLength(); ++i ) {
if ( !seq.isGapAt( i ) ) {
return stats;
}
- final private GapContribution[] calcGapContribtionsStats( final boolean norm ) {
- final GapContribution stats[] = calcGapContribtions( norm );
+ final private GapContribution[] calcGapContribtionsStats( final boolean normalize_for_effective_seq_length ) { // Returns the gap contributions as a sorted array; the flag is forwarded unchanged.
+ final GapContribution stats[] = calcGapContribtions( normalize_for_effective_seq_length ); // Delegates the actual computation to calcGapContribtions.
Arrays.sort( stats ); // In-place sort by the elements' natural ordering; the direction is defined by GapContribution itself (not visible here).
return stats;
}
sb.append( msa_properties.getLength() );
sb.append( "\t" );
sb.append( NF_4.format( msa_properties.getGapRatio() ) );
+ sb.append( "\t" );
+ sb.append( NF_1.format( msa_properties.getAvgNumberOfGaps() ) );
if ( _calculate_shannon_entropy ) {
sb.append( "\t" );
sb.append( NF_4.format( msa_properties.getEntropy7() ) );
System.out.print( "\t" );
System.out.print( "Length" );
System.out.print( "\t" );
+ System.out.print( "Gap R" );
+ System.out.print( "\t" );
System.out.print( "Gaps" );
System.out.print( "\t" );
if ( _calculate_shannon_entropy ) {