+ /**
+  * Replaces the current MSA with the result of
+  * {@code MsaMethods.removeSequencesByMinimalLength}, removes gap columns,
+  * writes the outfile and prints a summary of the resulting MSA to standard
+  * out (length, number of sequences, effective-length statistics, gap ratio
+  * and the normalized Shannon entropy computed with the argument 21).
+  *
+  * @param min_effective_length
+  *            threshold passed unchanged to
+  *            {@code MsaMethods.removeSequencesByMinimalLength}
+  * @throws IOException
+  *             declared by this method; presumably raised while the outfile
+  *             is written
+  */
+ public final void removeSequencesByMinimalLength( final int min_effective_length ) throws IOException {
+     _msa = DeleteableMsa.createInstance(
+             MsaMethods.removeSequencesByMinimalLength( _msa, min_effective_length ) );
+     removeGapColumns();
+     // The outfile is written first; statistics are gathered and printed afterwards.
+     final String outfile = writeOutfile();
+     final DescriptiveStatistics length_stats = MsaMethods.calculateEffectiveLengthStatistics( _msa );
+     System.out.println( "Output MSA : " + outfile );
+     System.out.println( " MSA length : " + _msa.getLength() );
+     System.out.println( " Number of sequences : " + _msa.getNumberOfSequences() );
+     final String median = NF_1.format( length_stats.median() );
+     System.out.println( " Median sequence length : " + median );
+     final String mean = NF_1.format( length_stats.arithmeticMean() );
+     System.out.println( " Mean sequence length : " + mean );
+     // Max and min are truncated to int for display, as in the table output.
+     final int longest = ( int ) length_stats.getMax();
+     System.out.println( " Max sequence length : " + longest );
+     final int shortest = ( int ) length_stats.getMin();
+     System.out.println( " Min sequence length : " + shortest );
+     final String gap_ratio = NF_4.format( MsaMethods.calcGapRatio( _msa ) );
+     System.out.println( " Gap ratio : " + gap_ratio );
+     final String entropy = NF_4.format( MsaMethods.calcNormalizedShannonsEntropy( 21, _msa ) );
+     System.out.println( " Normalized Shannon Entropy (entn21): " + entropy );
+     System.out.println();
+ }
+
+ /**
+  * Deletes sequences from the MSA one at a time, in the order returned by
+  * {@code calcGapContribtionsStats}, until the gap ratio reported by
+  * {@code MsaMethods.calcGapRatio} is no longer greater than
+  * {@code mean_gapiness}. Gap columns are removed after every deletion.
+  * <p>
+  * Each deleted sequence and its id are appended to {@code _removed_seqs}
+  * and {@code _removed_seq_ids}. If {@code _removed_seqs_out_base} is not
+  * null, {@code writeAndAlignRemovedSeqs()} is called afterwards and its
+  * message printed. If {@code _phylogentic_inference} is set, a tree is
+  * calculated before the removal, decorated and displayed afterwards, and a
+  * second tree is calculated from the reduced MSA and displayed.
+  * <p>
+  * The loop also stops once every candidate id has been used, so an
+  * unreachable threshold no longer indexes past the end of the candidate
+  * list.
+  *
+  * @param mean_gapiness
+  *            gap ratio at or below which removal stops
+  * @return the properties recorded for the initial MSA followed by those
+  *         recorded during the removal iterations that printed statistics
+  * @throws IOException
+  *             propagated from the helpers called here
+  * @throws InterruptedException
+  *             propagated from the helpers called here
+  */
+ public final List<MsaProperties> removeViaGapAverage( final double mean_gapiness ) throws IOException,
+         InterruptedException {
+     final GapContribution stats[] = calcGapContribtionsStats( _normalize_for_effective_seq_length );
+     final List<String> to_remove_ids = new ArrayList<String>();
+     final List<MsaProperties> msa_props = new ArrayList<MsaProperties>();
+     for( final GapContribution gap_contribution : stats ) {
+         to_remove_ids.add( gap_contribution.getId() );
+     }
+     Phylogeny phy = null;
+     if ( _phylogentic_inference ) {
+         System.out.println( "calculating phylogentic tree..." );
+         System.out.println();
+         phy = calcTree();
+         addSeqs2Tree( _msa, phy );
+     }
+     printTableHeader();
+     // Record and print the state of the MSA before anything is removed.
+     MsaProperties msa_prop = new MsaProperties( _msa, "", _calculate_shannon_entropy );
+     msa_props.add( msa_prop );
+     printMsaProperties( msa_prop );
+     System.out.println();
+     int i = 0;
+     // The index check comes first so that get( i ) can never run past the candidate list.
+     while ( ( i < to_remove_ids.size() ) && ( MsaMethods.calcGapRatio( _msa ) > mean_gapiness ) ) {
+         final String id = to_remove_ids.get( i );
+         _removed_seq_ids.add( id );
+         final MolecularSequence deleted = _msa.deleteRow( id, true );
+         _removed_seqs.add( deleted );
+         removeGapColumns();
+         // Always take the write/realign branch on the iteration that reaches the target.
+         if ( isPrintMsaStatsWriteOutfileAndRealign( i ) || ( MsaMethods.calcGapRatio( _msa ) <= mean_gapiness ) ) {
+             msa_prop = printMsaStatsWriteOutfileAndRealign( _realign, id );
+             msa_props.add( msa_prop );
+             System.out.println();
+         }
+         else if ( isPrintMsaStats( i ) ) {
+             msa_prop = new MsaProperties( _msa, id, _calculate_shannon_entropy );
+             msa_props.add( msa_prop );
+             printMsaProperties( msa_prop );
+             System.out.println();
+         }
+         ++i;
+     }
+     if ( _removed_seqs_out_base != null ) {
+         final String msg = writeAndAlignRemovedSeqs();
+         System.out.println();
+         System.out.println( msg );
+     }
+     if ( _phylogentic_inference ) {
+         decorateTree( phy, msa_props, false );
+         displayTree( phy );
+         System.out.println( "calculating phylogentic tree..." );
+         System.out.println();
+         final Phylogeny phy2 = calcTree();
+         addSeqs2Tree( _msa, phy2 );
+         displayTree( phy2 );
+     }
+     return msa_props;
+ }
+
+ /**
+  * Deletes sequences from the MSA one at a time, in the order returned by
+  * {@code calcGapContribtionsStats}, until {@code _msa.getLength()} is no
+  * longer greater than {@code length}. Gap columns are removed after every
+  * deletion.
+  * <p>
+  * Each deleted sequence and its id are appended to {@code _removed_seqs}
+  * and {@code _removed_seq_ids}. If {@code _removed_seqs_out_base} is not
+  * null, {@code writeAndAlignRemovedSeqs()} is called afterwards and its
+  * message printed. If {@code _phylogentic_inference} is set, a tree is
+  * calculated before the removal, decorated and displayed afterwards, and a
+  * second tree is calculated from the reduced MSA and displayed.
+  * <p>
+  * The loop also stops once every candidate id has been used, so an
+  * unreachable target length no longer indexes past the end of the
+  * candidate list.
+  *
+  * @param length
+  *            MSA length at or below which removal stops
+  * @return the properties recorded for the initial MSA followed by those
+  *         recorded during the removal iterations that printed statistics
+  * @throws IOException
+  *             propagated from the helpers called here
+  * @throws InterruptedException
+  *             propagated from the helpers called here
+  */
+ public List<MsaProperties> removeViaLength( final int length ) throws IOException, InterruptedException {
+     final GapContribution stats[] = calcGapContribtionsStats( _normalize_for_effective_seq_length );
+     final List<String> to_remove_ids = new ArrayList<String>();
+     final List<MsaProperties> msa_props = new ArrayList<MsaProperties>();
+     for( final GapContribution gap_contribution : stats ) {
+         to_remove_ids.add( gap_contribution.getId() );
+     }
+     Phylogeny phy = null;
+     if ( _phylogentic_inference ) {
+         System.out.println( "calculating phylogentic tree..." );
+         System.out.println();
+         phy = calcTree();
+         addSeqs2Tree( _msa, phy );
+     }
+     printTableHeader();
+     // Record and print the state of the MSA before anything is removed.
+     MsaProperties msa_prop = new MsaProperties( _msa, "", _calculate_shannon_entropy );
+     msa_props.add( msa_prop );
+     printMsaProperties( msa_prop );
+     System.out.println();
+     int i = 0;
+     // The index check comes first so that get( i ) can never run past the candidate list.
+     while ( ( i < to_remove_ids.size() ) && ( _msa.getLength() > length ) ) {
+         final String id = to_remove_ids.get( i );
+         _removed_seq_ids.add( id );
+         final MolecularSequence deleted = _msa.deleteRow( id, true );
+         _removed_seqs.add( deleted );
+         removeGapColumns();
+         // Always take the write/realign branch on the iteration that reaches the target.
+         if ( isPrintMsaStatsWriteOutfileAndRealign( i ) || ( _msa.getLength() <= length ) ) {
+             msa_prop = printMsaStatsWriteOutfileAndRealign( _realign, id );
+             msa_props.add( msa_prop );
+             System.out.println();
+         }
+         else if ( isPrintMsaStats( i ) ) {
+             msa_prop = new MsaProperties( _msa, id, _calculate_shannon_entropy );
+             printMsaProperties( msa_prop );
+             msa_props.add( msa_prop );
+             System.out.println();
+         }
+         ++i;
+     }
+     if ( _removed_seqs_out_base != null ) {
+         final String msg = writeAndAlignRemovedSeqs();
+         System.out.println();
+         System.out.println( msg );
+     }
+     if ( _phylogentic_inference ) {
+         decorateTree( phy, msa_props, false );
+         displayTree( phy );
+         System.out.println( "calculating phylogentic tree..." );
+         System.out.println();
+         final Phylogeny phy2 = calcTree();
+         addSeqs2Tree( _msa, phy2 );
+         displayTree( phy2 );
+     }
+     return msa_props;
+ }
+
+ /**
+  * Deletes the first {@code to_remove} sequences, in the order returned by
+  * {@code calcGapContribtionsStats}, from the MSA. Gap columns are removed
+  * after every deletion, and the write/realign branch is always taken for
+  * the last deletion.
+  * <p>
+  * Each deleted sequence and its id are appended to {@code _removed_seqs}
+  * and {@code _removed_seq_ids}. If {@code _removed_seqs_out_base} is not
+  * null, {@code writeAndAlignRemovedSeqs()} is called afterwards and its
+  * message printed. If {@code _phylogentic_inference} is set, a tree is
+  * calculated before the removal, decorated and displayed afterwards, and a
+  * second tree is calculated from the reduced MSA and displayed.
+  *
+  * @param to_remove
+  *            number of sequences to delete; a value of zero or less
+  *            deletes nothing
+  * @return the properties recorded for the initial MSA followed by those
+  *         recorded during the removal iterations that printed statistics
+  * @throws IllegalArgumentException
+  *             if {@code to_remove} exceeds the number of entries returned
+  *             by {@code calcGapContribtionsStats}
+  * @throws IOException
+  *             propagated from the helpers called here
+  * @throws InterruptedException
+  *             propagated from the helpers called here
+  */
+ public final List<MsaProperties> removeWorstOffenders( final int to_remove ) throws IOException,
+         InterruptedException {
+     final GapContribution stats[] = calcGapContribtionsStats( _normalize_for_effective_seq_length );
+     // Fail with a clear message before anything is modified, instead of an
+     // ArrayIndexOutOfBoundsException from stats[ j ] below.
+     if ( to_remove > stats.length ) {
+         throw new IllegalArgumentException( "cannot remove " + to_remove + " sequences: only " + stats.length
+                 + " candidates available" );
+     }
+     final List<String> to_remove_ids = new ArrayList<String>();
+     final List<MsaProperties> msa_props = new ArrayList<MsaProperties>();
+     for( int j = 0; j < to_remove; ++j ) {
+         to_remove_ids.add( stats[ j ].getId() );
+     }
+     Phylogeny phy = null;
+     if ( _phylogentic_inference ) {
+         System.out.println( "calculating phylogentic tree..." );
+         System.out.println();
+         phy = calcTree();
+         addSeqs2Tree( _msa, phy );
+     }
+     printTableHeader();
+     // Record and print the state of the MSA before anything is removed.
+     MsaProperties msa_prop = new MsaProperties( _msa, "", _calculate_shannon_entropy );
+     msa_props.add( msa_prop );
+     printMsaProperties( msa_prop );
+     System.out.println();
+     final int last = to_remove_ids.size() - 1;
+     for( int i = 0; i <= last; ++i ) {
+         final String id = to_remove_ids.get( i );
+         _removed_seq_ids.add( id );
+         final MolecularSequence deleted = _msa.deleteRow( id, true );
+         _removed_seqs.add( deleted );
+         removeGapColumns();
+         // Always take the write/realign branch for the final deletion.
+         if ( isPrintMsaStatsWriteOutfileAndRealign( i ) || ( i == last ) ) {
+             msa_prop = printMsaStatsWriteOutfileAndRealign( _realign, id );
+             msa_props.add( msa_prop );
+             System.out.println();
+         }
+         else if ( isPrintMsaStats( i ) ) {
+             msa_prop = new MsaProperties( _msa, id, _calculate_shannon_entropy );
+             msa_props.add( msa_prop );
+             printMsaProperties( msa_prop );
+             System.out.println();
+         }
+     }
+     if ( _removed_seqs_out_base != null ) {
+         final String msg = writeAndAlignRemovedSeqs();
+         System.out.println();
+         System.out.println( msg );
+     }
+     if ( _phylogentic_inference ) {
+         decorateTree( phy, msa_props, false );
+         displayTree( phy );
+         System.out.println( "calculating phylogentic tree..." );
+         System.out.println();
+         final Phylogeny phy2 = calcTree();
+         addSeqs2Tree( _msa, phy2 );
+         displayTree( phy2 );
+     }
+     return msa_props;
+ }
+
+ /**
+  * Stores the flag that the removal methods pass to the
+  * {@code MsaProperties} constructor.
+  *
+  * @param calculate_shannon_entropy
+  *            new value of {@code _calculate_shannon_entropy}
+  */
+ public final void setCalculateNormalizedShannonEntropy( final boolean calculate_shannon_entropy ) {
+     this._calculate_shannon_entropy = calculate_shannon_entropy;
+ }
+
+ /**
+  * Stores the input file name in {@code _infile_name}.
+  *
+  * @param infile_name
+  *            new value of {@code _infile_name}
+  */
+ public void setInfileName( final String infile_name ) {
+     this._infile_name = infile_name;
+ }
+
+ /**
+  * Stores the MAFFT option string. {@code writeAndAlignRemovedSeqs()} splits
+  * it on single whitespace characters and passes the pieces to the MAFFT
+  * inferrer.
+  *
+  * @param maffts_opts
+  *            new value of {@code _maffts_opts}
+  */
+ public final void setMafftOptions( final String maffts_opts ) {
+     this._maffts_opts = maffts_opts;
+ }
+
+ /**
+  * Stores the flag that the removal methods pass to
+  * {@code calcGapContribtionsStats}.
+  *
+  * @param normalize_for_effective_seq_length
+  *            new value of {@code _normalize_for_effective_seq_length}
+  */
+ public final void setNorm( final boolean normalize_for_effective_seq_length ) {
+     this._normalize_for_effective_seq_length = normalize_for_effective_seq_length;
+ }
+
+ /**
+  * Stores the output file base in {@code _out_file_base}.
+  *
+  * @param out_file_base
+  *            new value of {@code _out_file_base}
+  */
+ final public void setOutFileBase( final File out_file_base ) {
+     this._out_file_base = out_file_base;
+ }
+
+ /**
+  * Stores the MSA output format, which {@code writeAndAlignRemovedSeqs()}
+  * passes to {@code writeMsa}.
+  *
+  * @param output_format
+  *            new value of {@code _output_format}
+  */
+ public final void setOutputFormat( final MSA_FORMAT output_format ) {
+     this._output_format = output_format;
+ }
+
+ /**
+  * Stores the path that {@code writeAndAlignRemovedSeqs()} passes to
+  * {@code Mafft.createInstance}.
+  *
+  * @param path_to_mafft
+  *            new value of {@code _path_to_mafft}
+  */
+ public void setPathToMafft( final String path_to_mafft ) {
+     this._path_to_mafft = path_to_mafft;
+ }
+
+ /**
+  * Stores the flag that decides whether the removal methods calculate,
+  * decorate and display phylogenetic trees.
+  *
+  * @param phylogentic_inference
+  *            new value of {@code _phylogentic_inference}
+  */
+ public void setPeformPhylogenticInference( final boolean phylogentic_inference ) {
+     this._phylogentic_inference = phylogentic_inference;
+ }
+
+ /**
+  * Stores the realign flag. The removal methods pass it to
+  * {@code printMsaStatsWriteOutfileAndRealign}, and
+  * {@code writeAndAlignRemovedSeqs()} aligns the removed sequences with
+  * MAFFT only when it is set.
+  *
+  * @param realign
+  *            new value of {@code _realign}
+  */
+ public final void setRealign( final boolean realign ) {
+     this._realign = realign;
+ }
+
+ /**
+  * Stores the base used as the file name prefix for the removed sequences.
+  * The removal methods call {@code writeAndAlignRemovedSeqs()} only when this
+  * value is not null.
+  *
+  * @param removed_seqs_out_base
+  *            new value of {@code _removed_seqs_out_base}; may be null
+  */
+ public final void setRemovedSeqsOutBase( final File removed_seqs_out_base ) {
+     this._removed_seqs_out_base = removed_seqs_out_base;
+ }
+
+ /**
+  * Stores the step value in {@code _step}.
+  * NOTE(review): presumably read by
+  * {@code isPrintMsaStatsWriteOutfileAndRealign} - confirm.
+  *
+  * @param step
+  *            new value of {@code _step}
+  */
+ public final void setStep( final int step ) {
+     this._step = step;
+ }
+
+ /**
+  * Stores the diagnostics step value in {@code _step_for_diagnostics}.
+  * NOTE(review): presumably read by {@code isPrintMsaStats} - confirm.
+  *
+  * @param step_for_diagnostics
+  *            new value of {@code _step_for_diagnostics}
+  */
+ public final void setStepForDiagnostics( final int step_for_diagnostics ) {
+     this._step_for_diagnostics = step_for_diagnostics;
+ }
+
+ final public String writeAndAlignRemovedSeqs() throws IOException, InterruptedException {
+ final StringBuilder msg = new StringBuilder();
+ final String n = _removed_seqs_out_base + "_" + _removed_seqs.size() + ".fasta";
+ SequenceWriter.writeSeqs( _removed_seqs, new File( n ), SEQ_FORMAT.FASTA, 100 );
+ msg.append( "wrote " + _removed_seqs.size() + " removed sequences to " + "\"" + n + "\"" );
+ if ( _realign ) {
+ final MsaInferrer mafft = Mafft.createInstance( _path_to_mafft );
+ final List<String> opts = new ArrayList<String>();
+ for( final String o : _maffts_opts.split( "\\s" ) ) {
+ opts.add( o );
+ }
+ final Msa removed_msa = mafft.infer( _removed_seqs, opts );
+ final Double gr = MsaMethods.calcGapRatio( removed_msa );
+ String s = _removed_seqs_out_base + "_" + removed_msa.getNumberOfSequences() + "_"
+ + removed_msa.getLength() + "_" + ForesterUtil.roundToInt( gr * 100 );
+ final String suffix = obtainSuffix();
+ s += suffix;
+ writeMsa( removed_msa, s, _output_format );
+ msg.append( ", and as MSA of length " + removed_msa.getLength() + " to \"" + s + "\"" );
+ }
+ return msg.toString();