From eb38568d7687f10934bef10881474845946777dc Mon Sep 17 00:00:00 2001 From: cmzmasek Date: Sun, 27 Apr 2014 22:46:36 +0000 Subject: [PATCH] in progress --- .../org/forester/application/msa_compactor.java | 9 +- forester/java/src/org/forester/msa/BasicMsa.java | 6 +- .../java/src/org/forester/msa/DeleteableMsa.java | 15 ++- .../org/forester/msa_compactor/MsaCompactor.java | 109 +++++++++++++------- 4 files changed, 95 insertions(+), 44 deletions(-) diff --git a/forester/java/src/org/forester/application/msa_compactor.java b/forester/java/src/org/forester/application/msa_compactor.java index 2b3bad0..c0b0663 100644 --- a/forester/java/src/org/forester/application/msa_compactor.java +++ b/forester/java/src/org/forester/application/msa_compactor.java @@ -253,7 +253,8 @@ public class msa_compactor { System.out.println( "Step for output and re-aligning) : " + ( step > 1 ? step : 1 ) ); System.out.println( "Step for diagnostics reports : " + ( step_for_diagnostics > 1 ? step_for_diagnostics : 1 ) ); - System.out.println( "Report mean identity (\"MSA quality\") : " + report_aln_mean_identity ); + System.out.println( "Calculate mean identity : " + report_aln_mean_identity ); + if ( !norm ) { System.out.println( "Normalize : " + norm ); } @@ -295,7 +296,7 @@ public class msa_compactor { mc.setNorm( norm ); mc.setOutFileBase( out ); mc.setStep( step ); - mc.removeViaGapAverage( av_gap, true ); + mc.removeViaGapAverage( av_gap ); } else if ( length > 0 ) { // TODO if < shortest seq -> error @@ -307,7 +308,7 @@ public class msa_compactor { mc.setNorm( norm ); mc.setOutFileBase( out ); mc.setStep( step ); - mc.removeViaLength( length, true ); + mc.removeViaLength( length ); } else { //MsaCompactor.chart( msa, step, realign, norm, path_to_mafft ); @@ -394,7 +395,7 @@ public class msa_compactor { System.out.println( " -" + GAP_RATIO_LENGTH_OPTION + "= maximal allowed gap ratio per column (for deleting of columms) (0.0-1.0)" ); System.out.println( " -" + REPORT_ALN_MEAN_IDENTITY - + " to report mean identity diagnostic (not recommended for very large alignments)" ); + + " to calculate mean MSA column identity (\"MSA quality\") (not recommended for very large alignments)" ); System.out.println( " -" + OUTPUT_FORMAT_PHYLIP_OPTION + " to write output alignments in phylip format instead of fasta" ); System.out.println( " -" + OUTPUT_REMOVED_SEQS_OPTION + "= to output the removed sequences" ); diff --git a/forester/java/src/org/forester/msa/BasicMsa.java b/forester/java/src/org/forester/msa/BasicMsa.java index 395a16d..d913531 100644 --- a/forester/java/src/org/forester/msa/BasicMsa.java +++ b/forester/java/src/org/forester/msa/BasicMsa.java @@ -180,10 +180,10 @@ public class BasicMsa implements Msa { } } - private int determineMaxIdLength() { - int max = 0; + private short determineMaxIdLength() { + short max = 0; for( int row = 0; row < getNumberOfSequences(); ++row ) { - final int l = getIdentifier( row ).length(); + final short l = ( short ) getIdentifier( row ).length(); if ( l > max ) { max = l; } diff --git a/forester/java/src/org/forester/msa/DeleteableMsa.java b/forester/java/src/org/forester/msa/DeleteableMsa.java index 9f18924..05bf074 100644 --- a/forester/java/src/org/forester/msa/DeleteableMsa.java +++ b/forester/java/src/org/forester/msa/DeleteableMsa.java @@ -50,6 +50,17 @@ public final class DeleteableMsa extends BasicMsa { _seqs = msa.getNumberOfSequences(); } + public short determineMaxIdLength() { + short max = 0; + for( int row = 0; row < getNumberOfSequences(); ++row ) { + final short l = ( short ) getIdentifier( row ).length(); + if ( l > max ) { + max = l; + } + } + return max; + } + final public void deleteGapColumns( final double max_allowed_gap_ratio ) { if ( ( max_allowed_gap_ratio < 0 ) || ( max_allowed_gap_ratio > 1 ) ) { throw new IllegalArgumentException( "max allowed gap ration is out of range: " + max_allowed_gap_ratio ); @@ -70,7 +81,7 @@ public final class DeleteableMsa extends BasicMsa { } } - final public void deleteRow( final String id ) { + final public Sequence deleteRow( final String id ) { int row = -1; for( int r = 0; r < getNumberOfSequences(); ++r ) { if ( getIdentifier( r ).equals( id ) ) { @@ -81,7 +92,9 @@ public final class DeleteableMsa extends BasicMsa { if ( row < 0 ) { throw new IllegalArgumentException( "id [" + id + "] not found" ); } + final Sequence s = getSequence( row ); deleteRow( row ); + return s; } @Override diff --git a/forester/java/src/org/forester/msa_compactor/MsaCompactor.java b/forester/java/src/org/forester/msa_compactor/MsaCompactor.java index 8923aa7..30b66d9 100644 --- a/forester/java/src/org/forester/msa_compactor/MsaCompactor.java +++ b/forester/java/src/org/forester/msa_compactor/MsaCompactor.java @@ -41,6 +41,7 @@ import org.forester.evoinference.distance.PairwiseDistanceCalculator; import org.forester.evoinference.distance.PairwiseDistanceCalculator.PWD_DISTANCE_METHOD; import org.forester.evoinference.matrix.distance.BasicSymmetricalDistanceMatrix; import org.forester.evoinference.tools.BootstrapResampler; +import org.forester.msa.BasicMsa; import org.forester.msa.DeleteableMsa; import org.forester.msa.Mafft; import org.forester.msa.Msa; @@ -75,6 +76,8 @@ public class MsaCompactor { private boolean _report_aln_mean_identity = false; private int _step = -1; private int _step_for_diagnostics = -1; + private final short _longest_id_length; + private final ArrayList _removed_seqs; static { NF_4.setRoundingMode( RoundingMode.HALF_UP ); NF_3.setRoundingMode( RoundingMode.HALF_UP ); @@ -83,6 +86,8 @@ public class MsaCompactor { public MsaCompactor( final DeleteableMsa msa ) { _msa = msa; _removed_seq_ids = new TreeSet(); + _longest_id_length = _msa.determineMaxIdLength(); + _removed_seqs = new ArrayList(); } final public Msa getMsa() { @@ -93,62 +98,59 @@ public class MsaCompactor { return _removed_seq_ids; } - public final void removeViaGapAverage( final double mean_gapiness, final boolean verbose ) throws IOException, - InterruptedException { + public final void removeViaGapAverage( final double mean_gapiness ) throws IOException, InterruptedException { final GapContribution stats[] = calcGapContribtionsStats( _norm ); final List to_remove_ids = new ArrayList(); for( final GapContribution gap_gontribution : stats ) { to_remove_ids.add( gap_gontribution.getId() ); } - if ( verbose ) { - printTableHeader(); - } + printTableHeader(); int i = 0; while ( MsaMethods.calcGapRatio( _msa ) > mean_gapiness ) { final String id = to_remove_ids.get( i ); - _msa.deleteRow( id ); + _removed_seq_ids.add( id ); + final Sequence deleted = _msa.deleteRow( id ); + _removed_seqs.add( deleted ); removeGapColumns(); if ( ( ( _step > 0 ) && ( ( ( i + 1 ) % _step ) == 0 ) ) || ( MsaMethods.calcGapRatio( _msa ) <= mean_gapiness ) ) { printMsaStatsWriteOutfileAndRealign( _realign, id ); } - else if ( verbose ) { + else { final MsaProperties msa_prop = new MsaProperties( _msa, _report_aln_mean_identity ); printMsaProperties( id, msa_prop ); } - if ( verbose ) { - System.out.println(); - } + System.out.println(); ++i; } + final String msg = writeAndAlignRemovedSeqs(); + System.out.println( msg ); } - public void removeViaLength( final int length, final boolean verbose ) throws IOException, InterruptedException { + public void removeViaLength( final int length ) throws IOException, InterruptedException { final GapContribution stats[] = calcGapContribtionsStats( _norm ); final List to_remove_ids = new ArrayList(); for( final GapContribution gap_gontribution : stats ) { to_remove_ids.add( gap_gontribution.getId() ); } - if ( verbose ) { - printTableHeader(); - } + printTableHeader(); int i = 0; while ( _msa.getLength() > length ) { final String id = to_remove_ids.get( i ); - _msa.deleteRow( id ); + _removed_seq_ids.add( id ); + final Sequence deleted = _msa.deleteRow( id ); + _removed_seqs.add( deleted ); removeGapColumns(); if ( ( ( _step > 0 ) && ( ( ( i + 1 ) % _step ) == 0 ) ) || ( _msa.getLength() <= length ) ) { printMsaStatsWriteOutfileAndRealign( _realign, id ); } - else if ( verbose ) { - final MsaProperties msa_prop = new MsaProperties( _msa, _report_aln_mean_identity ); - printMsaProperties( id, msa_prop ); - } - if ( verbose ) { - System.out.println(); - } + final MsaProperties msa_prop = new MsaProperties( _msa, _report_aln_mean_identity ); + printMsaProperties( id, msa_prop ); + System.out.println(); ++i; } + final String msg = writeAndAlignRemovedSeqs(); + System.out.println( msg ); } public final void removeWorstOffenders( final int to_remove ) throws IOException, InterruptedException { @@ -161,7 +163,9 @@ public class MsaCompactor { printTableHeader(); for( int i = 0; i < to_remove_ids.size(); ++i ) { final String id = to_remove_ids.get( i ); - _msa.deleteRow( id ); + _removed_seq_ids.add( id ); + final Sequence deleted = _msa.deleteRow( id ); + _removed_seqs.add( deleted ); removeGapColumns(); if ( isPrintMsaStatsWriteOutfileAndRealign( i ) || ( i == ( to_remove_ids.size() - 1 ) ) ) { printMsaStatsWriteOutfileAndRealign( _realign, id ); @@ -173,6 +177,8 @@ public class MsaCompactor { System.out.println(); } } + final String msg = writeAndAlignRemovedSeqs(); + System.out.println( msg ); } public final List chart( final int step, final boolean realign, final boolean norm ) @@ -266,10 +272,40 @@ public class MsaCompactor { final Double gr = MsaMethods.calcGapRatio( _msa ); final String s = outfile + "_" + _msa.getNumberOfSequences() + "_" + _msa.getLength() + "_" + ForesterUtil.roundToInt( gr * 100 ); - writeMsa( s + suffix, format ); + writeMsa( _msa, s + suffix, format ); return s; } + final public String writeAndAlignRemovedSeqs() throws IOException, InterruptedException { + final StringBuilder msg = new StringBuilder(); + final Msa removed = BasicMsa.createInstance( _removed_seqs ); + final String n = _removed_seqs_out_base + "_" + removed.getNumberOfSequences() + ".fasta"; + writeMsa( removed, n, MSA_FORMAT.FASTA ); + msg.append( "wrote " + removed.getNumberOfSequences() + " removed sequences to " + n ); + if ( _realign ) { + final MsaInferrer mafft = Mafft.createInstance( _path_to_mafft ); + final List opts = new ArrayList(); + for( final String o : _maffts_opts.split( "\\s" ) ) { + opts.add( o ); + } + final Msa removed_msa = mafft.infer( _removed_seqs, opts ); + final Double gr = MsaMethods.calcGapRatio( removed_msa ); + String s = _removed_seqs_out_base + "_" + removed_msa.getNumberOfSequences() + "_" + + removed_msa.getLength() + "_" + ForesterUtil.roundToInt( gr * 100 ); + String suffix = ""; + if ( _output_format == MSA_FORMAT.FASTA ) { + suffix = ".fasta"; + } + else if ( _output_format == MSA_FORMAT.PHYLIP ) { + suffix = ".aln"; + } + s += suffix; + writeMsa( removed_msa, s, _output_format ); + msg.append( ", and as MSA of length " + removed_msa.getLength() + " to " + s ); + } + return msg.toString(); + } + final int calcNonGapResidues( final Sequence seq ) { int ng = 0; for( int i = 0; i < seq.getLength(); ++i ) { @@ -365,22 +401,22 @@ public class MsaCompactor { return master_phy; } - private final static void printMsaProperties( final String id, final MsaProperties msa_properties ) { - System.out.print( ForesterUtil.pad( id, 20, ' ', false ) ); + private final void printMsaProperties( final String id, final MsaProperties msa_properties ) { + System.out.print( ForesterUtil.pad( _longest_id_length + 1, 20, ' ', false ) ); System.out.print( "\t" ); final StringBuilder sb = msaPropertiesAsSB( msa_properties ); System.out.print( sb ); System.out.print( "\t" ); } - private final static StringBuilder msaPropertiesAsSB( final MsaProperties msa_properties ) { + private final StringBuilder msaPropertiesAsSB( final MsaProperties msa_properties ) { final StringBuilder sb = new StringBuilder(); sb.append( msa_properties.getNumberOfSequences() ); sb.append( "\t" ); sb.append( msa_properties.getLength() ); sb.append( "\t" ); sb.append( NF_4.format( msa_properties.getGapRatio() ) ); - if ( msa_properties.getAverageIdentityRatio() >= 0 ) { + if ( _report_aln_mean_identity /*msa_properties.getAverageIdentityRatio() >= 0*/) { sb.append( "\t" ); sb.append( NF_4.format( msa_properties.getAverageIdentityRatio() ) ); } @@ -399,8 +435,6 @@ public class MsaCompactor { } final private void realignWithMafft() throws IOException, InterruptedException { - // final MsaInferrer mafft = Mafft - // .createInstance( "/home/czmasek/SOFTWARE/MSA/MAFFT/mafft-7.130-without-extensions/scripts/mafft" ); final MsaInferrer mafft = Mafft.createInstance( _path_to_mafft ); final List opts = new ArrayList(); for( final String o : _maffts_opts.split( "\\s" ) ) { @@ -413,9 +447,10 @@ public class MsaCompactor { _msa.deleteGapOnlyColumns(); } - final private void writeMsa( final String outfile, final MSA_FORMAT format ) throws IOException { + final private static void writeMsa( final Msa msa, final String outfile, final MSA_FORMAT format ) + throws IOException { final Writer w = ForesterUtil.createBufferedWriter( outfile ); - _msa.write( w, format ); + msa.write( w, format ); w.close(); } @@ -457,8 +492,8 @@ public class MsaCompactor { return null; } - private final static void printTableHeader() { - System.out.print( ForesterUtil.pad( "Id", 20, ' ', false ) ); + private final void printTableHeader() { + System.out.print( ForesterUtil.pad( "Id", _longest_id_length + 1, ' ', false ) ); System.out.print( "\t" ); System.out.print( "Seqs" ); System.out.print( "\t" ); @@ -466,8 +501,10 @@ public class MsaCompactor { System.out.print( "\t" ); System.out.print( "Gaps" ); System.out.print( "\t" ); - System.out.print( "MSA qual" ); - System.out.print( "\t" ); + if ( _report_aln_mean_identity ) { + System.out.print( "MSA qual" ); + System.out.print( "\t" ); + } System.out.println(); } } -- 1.7.10.2