import org.forester.phylogeny.data.NodeVisualData.NodeFill;
import org.forester.phylogeny.data.NodeVisualData.NodeShape;
import org.forester.phylogeny.iterators.PhylogenyNodeIterator;
-import org.forester.sequence.Sequence;
+import org.forester.sequence.MolecularSequence;
import org.forester.tools.ConfidenceAssessor;
import org.forester.util.BasicDescriptiveStatistics;
+import org.forester.util.DescriptiveStatistics;
import org.forester.util.ForesterUtil;
public class MsaCompactor {
- final private static NumberFormat NF_3 = new DecimalFormat( "#.###" );
- final private static NumberFormat NF_4 = new DecimalFormat( "#.####" );
- private double _gap_ratio = -1;
+ final private static NumberFormat NF_1 = new DecimalFormat( "0.#" );
+ final private static NumberFormat NF_3 = new DecimalFormat( "0.###" );
+ final private static NumberFormat NF_4 = new DecimalFormat( "0.####" );
+ private boolean _calculate_shannon_entropy = false;
//
- private String _infile_name = null;
- private final short _longest_id_length;
+ private String _infile_name = null;
+ private final short _longest_id_length;
//
- private String _maffts_opts = "--auto";
- private int _min_length = -1;
- private DeleteableMsa _msa = null;
- private boolean _norm = true;
- private File _out_file_base = null;
- private MSA_FORMAT _output_format = MSA_FORMAT.FASTA;
- private String _path_to_mafft = null;
- private boolean _phylogentic_inference = false;
+ private String _maffts_opts = "--auto";
+ private DeleteableMsa _msa = null;
+ private boolean _norm = true;
+ private File _out_file_base = null;
+ private MSA_FORMAT _output_format = MSA_FORMAT.FASTA;
+ private String _path_to_mafft = null;
+ private boolean _phylogentic_inference = false;
//
- private boolean _realign = false;
- private final SortedSet<String> _removed_seq_ids;
- private final ArrayList<Sequence> _removed_seqs;
- private File _removed_seqs_out_base = null;
- private boolean _report_aln_mean_identity = false;
- private int _step = -1;
- private int _step_for_diagnostics = -1;
+ private boolean _realign = false;
+ private final SortedSet<String> _removed_seq_ids;
+ private final ArrayList<MolecularSequence> _removed_seqs;
+ private File _removed_seqs_out_base = null;
+ private int _step = -1;
+ private int _step_for_diagnostics = -1;
// All shared number formatters round half-up instead of DecimalFormat's default (HALF_EVEN).
static {
+ NF_1.setRoundingMode( RoundingMode.HALF_UP );
NF_4.setRoundingMode( RoundingMode.HALF_UP );
NF_3.setRoundingMode( RoundingMode.HALF_UP );
}
_msa = msa;
_removed_seq_ids = new TreeSet<String>();
_longest_id_length = _msa.determineMaxIdLength();
- _removed_seqs = new ArrayList<Sequence>();
+ _removed_seqs = new ArrayList<MolecularSequence>();
}
public final Phylogeny calcTree() {
if ( !_realign ) {
_step = -1;
}
- int x = ForesterUtil.roundToInt( _msa.getNumberOfSequences() / 20.0 );
- if ( x < 1 ) {
- x = 1;
+ int x = ForesterUtil.roundToInt( _msa.getNumberOfSequences() / 10.0 );
+ if ( x < 2 ) {
+ x = 2;
}
- MsaProperties msa_prop = new MsaProperties( _msa, "", _report_aln_mean_identity );
+ MsaProperties msa_prop = new MsaProperties( _msa, "", _calculate_shannon_entropy );
msa_props.add( msa_prop );
printTableHeader();
printMsaProperties( msa_prop );
if ( realign && isPrintMsaStatsWriteOutfileAndRealign( i ) ) {
removeGapColumns();
realignWithMafft();
- msa_prop = new MsaProperties( _msa, id, _report_aln_mean_identity );
+ msa_prop = new MsaProperties( _msa, id, _calculate_shannon_entropy );
msa_props.add( msa_prop );
printMsaProperties( msa_prop );
System.out.print( "(realigned)" );
}
else if ( isPrintMsaStats( i ) ) {
removeGapColumns();
- msa_prop = new MsaProperties( _msa, id, _report_aln_mean_identity );
+ msa_prop = new MsaProperties( _msa, id, _calculate_shannon_entropy );
msa_props.add( msa_prop );
printMsaProperties( msa_prop );
System.out.println();
return _msa;
}
- public final void removeSequencesByMinimalLength( final int min_effective_length ) {
- printMsaProperties( new MsaProperties( _msa, "", _report_aln_mean_identity ) );
- System.out.println();
/**
 * Replaces the current alignment with the result of
 * MsaMethods.removeSequencesByMinimalLength for the given minimum, calls removeGapColumns(),
 * writes the result with writeOutfile(), and prints a summary to standard output: the output
 * name returned by writeOutfile(), MSA length, number of sequences, the median/mean/max/min
 * taken from MsaMethods.calculateEffectiveLengthStatistics, the gap ratio, and the normalized
 * Shannon entropy.
 *
 * @param min_effective_length threshold forwarded unchanged to
 *            MsaMethods.removeSequencesByMinimalLength
 * @throws IOException declared by the new signature; presumably raised by writeOutfile() -
 *             TODO confirm
 */
+ public final void removeSequencesByMinimalLength( final int min_effective_length ) throws IOException {
_msa = DeleteableMsa.createInstance( MsaMethods.removeSequencesByMinimalLength( _msa, min_effective_length ) );
removeGapColumns();
- printMsaProperties( new MsaProperties( _msa, "", _report_aln_mean_identity ) );
+ final String s = writeOutfile();
+ final DescriptiveStatistics msa_stats = MsaMethods.calculateEffectiveLengthStatistics( _msa );
+ System.out.println( "Output MSA : " + s );
+ System.out.println( " MSA length : " + _msa.getLength() );
+ System.out.println( " Number of sequences : " + _msa.getNumberOfSequences() );
+ System.out.println( " Median sequence length : " + NF_1.format( msa_stats.median() ) );
+ System.out.println( " Mean sequence length : " + NF_1.format( msa_stats.arithmeticMean() ) );
+ System.out.println( " Max sequence length : " + ( ( int ) msa_stats.getMax() ) );
+ System.out.println( " Min sequence length : " + ( ( int ) msa_stats.getMin() ) );
+ System.out.println( " Gap ratio : " + NF_4.format( MsaMethods.calcGapRatio( _msa ) ) );
+ System.out.println( " Normalized Shannon Entropy (entn21): "
+ + NF_4.format( MsaMethods.calcNormalizedShannonsEntropy( 21, _msa ) ) ); // NOTE(review): 21 matches the "entn21" label; presumably the alphabet size - confirm
System.out.println();
}
phy = calcTree();
}
printTableHeader();
- MsaProperties msa_prop = new MsaProperties( _msa, "", _report_aln_mean_identity );
+ MsaProperties msa_prop = new MsaProperties( _msa, "", _calculate_shannon_entropy );
msa_props.add( msa_prop );
printMsaProperties( msa_prop );
System.out.println();
while ( MsaMethods.calcGapRatio( _msa ) > mean_gapiness ) {
final String id = to_remove_ids.get( i );
_removed_seq_ids.add( id );
- final Sequence deleted = _msa.deleteRow( id, true );
+ final MolecularSequence deleted = _msa.deleteRow( id, true );
_removed_seqs.add( deleted );
removeGapColumns();
if ( isPrintMsaStatsWriteOutfileAndRealign( i ) || ( MsaMethods.calcGapRatio( _msa ) <= mean_gapiness ) ) {
System.out.println();
}
else if ( isPrintMsaStats( i ) ) {
- msa_prop = new MsaProperties( _msa, id, _report_aln_mean_identity );
+ msa_prop = new MsaProperties( _msa, id, _calculate_shannon_entropy );
msa_props.add( msa_prop );
printMsaProperties( msa_prop );
System.out.println();
phy = calcTree();
}
printTableHeader();
- MsaProperties msa_prop = new MsaProperties( _msa, "", _report_aln_mean_identity );
+ MsaProperties msa_prop = new MsaProperties( _msa, "", _calculate_shannon_entropy );
msa_props.add( msa_prop );
printMsaProperties( msa_prop );
System.out.println();
while ( _msa.getLength() > length ) {
final String id = to_remove_ids.get( i );
_removed_seq_ids.add( id );
- final Sequence deleted = _msa.deleteRow( id, true );
+ final MolecularSequence deleted = _msa.deleteRow( id, true );
_removed_seqs.add( deleted );
removeGapColumns();
if ( isPrintMsaStatsWriteOutfileAndRealign( i ) || ( _msa.getLength() <= length ) ) {
System.out.println();
}
else if ( isPrintMsaStats( i ) ) {
- msa_prop = new MsaProperties( _msa, id, _report_aln_mean_identity );
+ msa_prop = new MsaProperties( _msa, id, _calculate_shannon_entropy );
printMsaProperties( msa_prop );
msa_props.add( msa_prop );
System.out.println();
phy = calcTree();
}
printTableHeader();
- MsaProperties msa_prop = new MsaProperties( _msa, "", _report_aln_mean_identity );
+ MsaProperties msa_prop = new MsaProperties( _msa, "", _calculate_shannon_entropy );
msa_props.add( msa_prop );
printMsaProperties( msa_prop );
System.out.println();
for( int i = 0; i < to_remove_ids.size(); ++i ) {
final String id = to_remove_ids.get( i );
_removed_seq_ids.add( id );
- final Sequence deleted = _msa.deleteRow( id, true );
+ final MolecularSequence deleted = _msa.deleteRow( id, true );
_removed_seqs.add( deleted );
removeGapColumns();
if ( isPrintMsaStatsWriteOutfileAndRealign( i ) || ( i == ( to_remove_ids.size() - 1 ) ) ) {
System.out.println();
}
else if ( isPrintMsaStats( i ) ) {
- msa_prop = new MsaProperties( _msa, id, _report_aln_mean_identity );
+ msa_prop = new MsaProperties( _msa, id, _calculate_shannon_entropy );
msa_props.add( msa_prop );
printMsaProperties( msa_prop );
System.out.println();
return msa_props;
}
- public final void setGapRatio( final double gap_ratio ) {
- _gap_ratio = gap_ratio;
// Sets the flag that this class passes to the MsaProperties constructor and that decides
// whether the entn7/entn21 columns are added to the printed table.
+ public final void setCalculateNormalizedShannonEntropy( final boolean calculate_shannon_entropy ) {
+ _calculate_shannon_entropy = calculate_shannon_entropy;
}
public void setInfileName( final String infile_name ) {
_maffts_opts = maffts_opts;
}
- public final void setMinLength( final int min_length ) {
- _min_length = min_length;
- }
-
/**
 * Stores the supplied flag in the {@code _norm} field.
 *
 * @param norm the new value for {@code _norm}
 */
public final void setNorm( final boolean norm ) {
    this._norm = norm;
}
_removed_seqs_out_base = removed_seqs_out_base;
}
- public final void setReportAlnMeanIdentity( final boolean report_aln_mean_identity ) {
- _report_aln_mean_identity = report_aln_mean_identity;
- }
-
/**
 * Stores the supplied value in the {@code _step} field.
 *
 * @param step the new value for {@code _step}
 */
public final void setStep( final int step ) {
    this._step = step;
}
return s;
}
- final int calcNonGapResidues( final Sequence seq ) {
+ final int calcNonGapResidues( final MolecularSequence seq ) {
int ng = 0;
for( int i = 0; i < seq.getLength(); ++i ) {
if ( !seq.isGapAt( i ) ) {
return gappiness;
}
+ private final Phylogeny collapse( final Msa msa, final int threshold ) {
+ final BasicSymmetricalDistanceMatrix m = PairwiseDistanceCalculator.calcFractionalDissimilarities( msa );
+ //TODO not implemented yet: the distance matrix m and threshold are not used, and null is always returned.
+ return null;
+ }
+
private final Phylogeny inferNJphylogeny( final PWD_DISTANCE_METHOD pwd_distance_method,
final Msa msa,
final boolean write_matrix,
sb.append( msa_properties.getLength() );
sb.append( "\t" );
sb.append( NF_4.format( msa_properties.getGapRatio() ) );
- if ( _report_aln_mean_identity ) {
+ if ( _calculate_shannon_entropy ) {
sb.append( "\t" );
- sb.append( NF_4.format( msa_properties.getAverageIdentityRatio() ) );
+ sb.append( NF_4.format( msa_properties.getEntropy7() ) );
+ sb.append( "\t" );
+ sb.append( NF_4.format( msa_properties.getEntropy21() ) );
}
return sb;
}
if ( realign ) {
realignWithMafft();
}
- final MsaProperties msa_prop = new MsaProperties( _msa, id, _report_aln_mean_identity );
+ final MsaProperties msa_prop = new MsaProperties( _msa, id, _calculate_shannon_entropy );
printMsaProperties( msa_prop );
final String s = writeOutfile();
System.out.print( "-> " + s + ( realign ? "\t(realigned)" : "" ) );
System.out.print( "\t" );
System.out.print( "Gaps" );
System.out.print( "\t" );
- if ( _report_aln_mean_identity ) {
- System.out.print( "MSA qual" );
+ if ( _calculate_shannon_entropy ) {
+ System.out.print( "entn7" );
+ System.out.print( "\t" );
+ System.out.print( "entn21" );
System.out.print( "\t" );
}
System.out.println();