import org.forester.io.parsers.FastaParser;
import org.forester.io.parsers.GeneralMsaParser;
-import org.forester.msa.BasicMsa;
import org.forester.msa.DeleteableMsa;
import org.forester.msa.MsaInferrer;
import org.forester.msa_compactor.MsaCompactor;
DeleteableMsa msa = null;
final FileInputStream is = new FileInputStream( in );
if ( FastaParser.isLikelyFasta( in ) ) {
- msa = new DeleteableMsa( ( BasicMsa ) FastaParser.parseMsa( is ) );
+ msa = DeleteableMsa.createInstance( FastaParser.parseMsa( is ) );
}
else {
- msa = new DeleteableMsa( ( BasicMsa ) GeneralMsaParser.parse( is ) );
+ msa = DeleteableMsa.createInstance( GeneralMsaParser.parse( is ) );
}
if ( cla.isOptionSet( REMOVE_WORST_OFFENDERS_OPTION ) ) {
worst_remove = cla.getOptionValueAsInt( REMOVE_WORST_OFFENDERS_OPTION );
}
final MsaMethods msa_tools = MsaMethods.createInstance();
if ( _options.isExecuteMsaProcessing() ) {
- msa = msa_tools.removeGapColumns( _options.getMsaProcessingMaxAllowedGapRatio(),
+ msa = msa_tools.deleteGapColumns( _options.getMsaProcessingMaxAllowedGapRatio(),
_options.getMsaProcessingMinAllowedLength(),
msa );
if ( msa == null ) {
public static void toFasta( final Sequence seq, final Writer w, final int width ) throws IOException {
w.write( ">" );
- w.write( seq.getIdentifier().toString() );
+ w.write( seq.getIdentifier() );
w.write( ForesterUtil.LINE_SEPARATOR );
if ( ( width < 1 ) || ( width >= seq.getLength() ) ) {
w.write( seq.getMolecularSequence() );
package org.forester.msa;
import java.io.IOException;
+import java.io.StringWriter;
import java.io.Writer;
import java.util.ArrayList;
import java.util.HashSet;
private int determineMaxIdLength() {
int max = 0;
for( int row = 0; row < getNumberOfSequences(); ++row ) {
- final int l = getIdentifier(row).length();
+ final int l = getIdentifier( row ).length();
if ( l > max ) {
max = l;
}
@Override
public Sequence getSequence( final int row ) {
- return new BasicSequence( getIdentifier( row ), getSequenceAsArray( row ), getType() );
+ return new BasicSequence( getIdentifier( row ), _data[ row ], getType() );
}
@Override
public StringBuffer getSequenceAsString( final int row ) {
- final StringBuffer sb = new StringBuffer(getLength() );
+ final StringBuffer sb = new StringBuffer( getLength() );
for( int col = 0; col < getLength(); ++col ) {
sb.append( getResidueAt( row, col ) );
}
return sb;
}
-
- @Override
- public char[] getSequenceAsArray( final int row ) {
- return _data[ row ];
- }
@Override
public TYPE getType() {
@Override
public String toString() {
- final int max = determineMaxIdLength() + 1;
- final StringBuffer sb = new StringBuffer();
- for( int row = 0; row < getNumberOfSequences(); ++row ) {
- sb.append( ForesterUtil.pad( getIdentifier( row ).toString(), max, ' ', false ) );
- for( int col = 0; col < getLength(); ++col ) {
- sb.append( getResidueAt( row, col ) );
- }
- sb.append( ForesterUtil.LINE_SEPARATOR );
+ final Writer w = new StringWriter();
+ try {
+ write( w, MSA_FORMAT.PHYLIP );
+ }
+ catch ( final IOException e ) {
+ e.printStackTrace();
}
- return sb.toString();
+ return w.toString();
}
@Override
final BasicMsa msa = new BasicMsa( seqs.size(), length, seqs.get( 0 ).getType() );
for( int row = 0; row < seqs.size(); ++row ) {
final Sequence seq = seqs.get( row );
- //
- // int x = length - seq.getLength();
- // if ( x > 0 ) {
- // String a = "";
- // for( int i = 0; i < x; i++ ) {
- // a += "-";
- // }
- // seq = BasicSequence.createAaSequence( seq.getIdentifier(), seq.getMolecularSequenceAsString() + a );
- // }
- // else {
- // seq = BasicSequence.createAaSequence( seq.getIdentifier(), seq.getMolecularSequenceAsString()
- // .substring( 0, length ) );
- // }
- //
if ( seq.getLength() != length ) {
throw new IllegalArgumentException( "illegal attempt to build msa from sequences of unequal length ["
+ seq.getIdentifier() + "]" );
package org.forester.msa;
import java.util.HashMap;
+import java.util.List;
-import org.forester.util.ForesterUtil;
+import org.forester.sequence.BasicSequence;
+import org.forester.sequence.Sequence;
public final class DeleteableMsa extends BasicMsa {
private int _length = 0;
private int _mapped_col_positions[] = null;
private int _mapped_row_positions[] = null;
- private int _seqs = 0;
private HashMap<String, Integer> _seq_id_to_row_map = null;
+ private int _seqs = 0;
- public DeleteableMsa( final BasicMsa msa ) {
+ private DeleteableMsa( final BasicMsa msa ) {
super( msa );
_mapped_col_positions = new int[ msa.getLength() ];
_mapped_row_positions = new int[ msa.getNumberOfSequences() ];
_length = msa.getLength();
_seqs = msa.getNumberOfSequences();
}
-
-
- @Override
- public char[] getSequenceAsArray( final int row ) {
- return super.getSequenceAsArray( _mapped_row_positions[ row ] );
- }
- public void deleteColumn( final int col ) {
- if ( col >= _length || col < 0 ) {
- throw new IllegalArgumentException( "column " + col + " is out of range" );
+ /**
+ * Deletes every column for which the value of
+ * MsaMethods.calcGapSumPerColumn( this, col ), divided by the current
+ * number of sequences, is strictly greater than max_allowed_gap_ratio.
+ * Columns are visited from the last to the first, so deleting a column
+ * never changes the index of a column that is still to be visited.
+ *
+ * @param max_allowed_gap_ratio threshold above which a column is deleted; must lie in [0, 1]
+ * @throws IllegalArgumentException if max_allowed_gap_ratio is less than 0 or greater than 1
+ */
+ final public void deleteGapColumns( final double max_allowed_gap_ratio ) {
+ if ( ( max_allowed_gap_ratio < 0 ) || ( max_allowed_gap_ratio > 1 ) ) {
+ throw new IllegalArgumentException( "max allowed gap ratio is out of range: " + max_allowed_gap_ratio );
}
- for( int c = col; c < _length - 1; ++c ) {
- _mapped_col_positions[ c ] = _mapped_col_positions[ c + 1 ];
+ for( int col = getLength() - 1; col >= 0; --col ) {
+ final boolean delete = ( ( double ) MsaMethods.calcGapSumPerColumn( this, col ) / getNumberOfSequences() ) > max_allowed_gap_ratio;
+ if ( delete ) {
+ deleteColumn( col );
+ }
}
- --_length;
}
-
-
- private void deleteRow( final int row ) {
- if ( row >= _seqs || row < 0 ) {
- throw new IllegalArgumentException( "row " + row + " is out of range" );
- }
- for( int r = row; r < _seqs - 1; ++r ) {
- _mapped_row_positions[ r ] = _mapped_row_positions[ r + 1 ];
+ /**
+ * Deletes every column for which MsaMethods.calcGapSumPerColumn( this, col )
+ * equals the current number of sequences. Columns are walked from the
+ * last index down to 0, so a deletion does not affect the indices of the
+ * columns that remain to be checked.
+ */
+ final public void deleteGapOnlyColumns() {
+ int c = getLength() - 1;
+ while ( c >= 0 ) {
+ final boolean gaps_only = MsaMethods.calcGapSumPerColumn( this, c ) == getNumberOfSequences();
+ if ( gaps_only ) {
+ deleteColumn( c );
+ }
+ --c;
}
- --_seqs;
}
- public void deleteRow( final String id ) {
+ final public void deleteRow( final String id ) {
int row = -1;
for( int r = 0; r < getNumberOfSequences(); ++r ) {
if ( getIdentifier( r ).equals( id ) ) {
}
@Override
- public String getIdentifier( final int row ) {
+ final public String getIdentifier( final int row ) {
return super.getIdentifier( _mapped_row_positions[ row ] );
}
@Override
- public int getLength() {
+ final public int getLength() {
return _length;
}
@Override
- public int getNumberOfSequences() {
+ final public int getNumberOfSequences() {
return _seqs;
}
-
@Override
- public char getResidueAt( final int row, final int col ) {
- return super.getResidueAt( _mapped_row_positions[ row ], _mapped_col_positions[ col ] );
+ final public char getResidueAt( final int row, final int col ) {
+ return super.getResidueAt( _mapped_row_positions[ row ], _mapped_col_positions[ col ] );
}
@Override
- public void setIdentifier( final int row, final String id ) {
+ public Sequence getSequence( final int row ) {
+ // Identifier and residues are both read through this class's own accessors,
+ // so the returned sequence reflects the mapped row (and column) positions.
+ final String id = getIdentifier( row );
+ final String residues = getSequenceAsString( row ).toString();
+ return new BasicSequence( id, residues, getType() );
+ }
+
+ @Override
+ final public void setIdentifier( final int row, final String id ) {
super.setIdentifier( _mapped_row_positions[ row ], id );
}
@Override
- public void setResidueAt( final int row, final int col, final char residue ) {
+ final public void setResidueAt( final int row, final int col, final char residue ) {
super.setResidueAt( _mapped_row_positions[ row ], _mapped_col_positions[ col ], residue );
}
+
+ /**
+ * Removes column col from the visible alignment: the mapped positions of
+ * all following columns are moved one slot to the left and the visible
+ * length is decremented.
+ *
+ * @param col index of the column to remove
+ * @throws IllegalArgumentException if col is negative or not less than the current length
+ */
+ final private void deleteColumn( final int col ) {
+ if ( ( col < 0 ) || ( col >= _length ) ) {
+ throw new IllegalArgumentException( "column " + col + " is out of range" );
+ }
+ // Number of mapped entries located after col that have to move left by one.
+ final int following = _length - 1 - col;
+ System.arraycopy( _mapped_col_positions, col + 1, _mapped_col_positions, col, following );
+ --_length;
+ }
+
+ /**
+ * Removes row row from the visible alignment: the mapped positions of
+ * all following rows are moved one slot up and the visible number of
+ * sequences is decremented.
+ *
+ * @param row index of the row to remove
+ * @throws IllegalArgumentException if row is negative or not less than the current number of sequences
+ */
+ final private void deleteRow( final int row ) {
+ if ( ( row < 0 ) || ( row >= _seqs ) ) {
+ throw new IllegalArgumentException( "row " + row + " is out of range" );
+ }
+ // Number of mapped entries located after row that have to move up by one.
+ final int following = _seqs - 1 - row;
+ System.arraycopy( _mapped_row_positions, row + 1, _mapped_row_positions, row, following );
+ --_seqs;
+ }
+
+ /**
+ * Creates a deleteable msa from a list of sequences: the list is first
+ * turned into an msa via BasicMsa.createInstance( seqs ), and the result
+ * is then used to construct the DeleteableMsa.
+ *
+ * @param seqs the sequences to build the alignment from
+ * @return a new DeleteableMsa constructed from the sequences
+ */
+ public final static DeleteableMsa createInstance( final List<Sequence> seqs ) {
+ final Msa built = BasicMsa.createInstance( seqs );
+ return new DeleteableMsa( ( BasicMsa ) built );
+ }
+
+ /**
+ * Creates a deleteable msa from an existing msa.
+ *
+ * @param msa the alignment to build from; it is cast to BasicMsa, so an
+ *            Msa implementation that is not a BasicMsa fails with a ClassCastException
+ * @return a new DeleteableMsa constructed from msa
+ */
+ public final static DeleteableMsa createInstance( final Msa msa ) {
+ final BasicMsa basic = ( BasicMsa ) msa;
+ return new DeleteableMsa( basic );
+ }
}
public void setResidueAt( final int row, final int col, final char residue );
public void write( Writer w, MSA_FORMAT format ) throws IOException;
-
- char[] getSequenceAsArray( int row );
}
return BasicMsa.createInstance( seqs );
}
- synchronized final public Msa removeGapColumns( final double max_allowed_gap_ratio,
+ synchronized final public Msa deleteGapColumns( final double max_allowed_gap_ratio,
final int min_allowed_length,
final Msa msa ) {
init();
final boolean[] delete_cols = new boolean[ msa.getLength() ];
int new_length = 0;
for( int col = 0; col < msa.getLength(); ++col ) {
- delete_cols[ col ] = ( ( double ) calcGapSumPerColumn( msa, col ) / msa.getNumberOfSequences() ) >= max_allowed_gap_ratio;
+ delete_cols[ col ] = ( ( double ) calcGapSumPerColumn( msa, col ) / msa.getNumberOfSequences() ) > max_allowed_gap_ratio;
if ( !delete_cols[ col ] ) {
++new_length;
}
return BasicMsa.createInstance( seqs );
}
- synchronized final public static void removeGapColumns( final double max_allowed_gap_ratio, final DeleteableMsa msa ) {
- if ( ( max_allowed_gap_ratio < 0 ) || ( max_allowed_gap_ratio > 1 ) ) {
- throw new IllegalArgumentException( "max allowed gap ration is out of range: " + max_allowed_gap_ratio );
- }
- // final boolean ignore_too_short_seqs = min_allowed_length > 0;
- for( int col = msa.getLength() - 1; col >= 0 ; --col ) {
- final boolean delete = ( ( double ) calcGapSumPerColumn( msa, col ) / msa.getNumberOfSequences() ) >= max_allowed_gap_ratio;
- if ( delete ) {
- msa.deleteColumn( col );
- }
- }
- }
-
public static DescriptiveStatistics calculateIdentityRatio( final int from, final int to, final Msa msa ) {
final DescriptiveStatistics stats = new BasicDescriptiveStatistics();
for( int c = from; c <= to; ++c ) {
package org.forester.msa;
+import org.forester.sequence.BasicSequence;
+import org.forester.sequence.Sequence;
+
public final class ResampleableMsa extends BasicMsa {
private int[] _resampled_column_positions = null;
super( msa );
}
- public void resample( final int[] resampled_column_positions ) {
+ /**
+ * Returns the residue at the given row and column. If
+ * _resampled_column_positions is set, col is first translated through it;
+ * otherwise col is used as is.
+ *
+ * @param row row index
+ * @param col column index (translated through the resampled positions, if any)
+ * @return the residue found at the resulting position
+ */
+ @Override
+ final public char getResidueAt( final int row, final int col ) {
+ final int source_col = ( _resampled_column_positions == null ) ? col : _resampled_column_positions[ col ];
+ return super.getResidueAt( row, source_col );
+ }
+
+ final public void resample( final int[] resampled_column_positions ) {
if ( resampled_column_positions.length != getLength() ) {
throw new IllegalArgumentException( "illegal attempt to use " + resampled_column_positions.length
+ " resampled column positions on msa of length " + getLength() );
}
@Override
- public char getResidueAt( final int row, final int col ) {
- if ( _resampled_column_positions != null ) {
- return super.getResidueAt( row, _resampled_column_positions[ col ] );
- }
- return super.getResidueAt( row, col );
+ final public void setResidueAt( final int row, final int col, final char residue ) {
+ throw new NoSuchMethodError( "illegal attempt to set residue in resampleable msa" );
}
@Override
- public void setResidueAt( final int row, final int col, final char residue ) {
- throw new NoSuchMethodError( "illegal attempt to set residue in resampleable msa" );
+ public Sequence getSequence( final int row ) {
+ // The residues are taken from getSequenceAsString( row ) rather than from
+ // the backing array, then combined with the row's identifier and the msa type.
+ final String id = getIdentifier( row );
+ final String residues = getSequenceAsString( row ).toString();
+ return new BasicSequence( id, residues, getType() );
+ }
}
import org.forester.evoinference.distance.PairwiseDistanceCalculator.PWD_DISTANCE_METHOD;
import org.forester.evoinference.matrix.distance.BasicSymmetricalDistanceMatrix;
import org.forester.evoinference.tools.BootstrapResampler;
-import org.forester.msa.BasicMsa;
import org.forester.msa.DeleteableMsa;
import org.forester.msa.Mafft;
import org.forester.msa.Msa;
//opts.add( "1000" );
//opts.add( "--localpair" );
//opts.add( "--quiet" );
- _msa = new DeleteableMsa( ( BasicMsa ) mafft.infer( _msa.asSequenceList(), opts ) );
+ _msa = DeleteableMsa.createInstance( mafft.infer( _msa.asSequenceList(), opts ) );
}
final private void removeGapColumns() {
//~ _msa = MsaMethods.createInstance().removeGapColumns( 1, 0, _msa );
- MsaMethods.removeGapColumns( 1, _msa );
+ _msa.deleteGapOnlyColumns();
}
final private void removeViaGapAverage( final double mean_gapiness,
private String _identifier;
private final TYPE _type;
- private BasicSequence( final String identifier, final String mol_sequence, final TYPE type ) {
+ /**
+ * Only use if you know what you are doing!
+ *
+ */
+ public BasicSequence( final String identifier, final String mol_sequence, final TYPE type ) {
if ( ForesterUtil.isEmpty( identifier ) ) {
throw new IllegalArgumentException( "identifier of sequence cannot be empty" );
}
_type = type;
}
- // Only use if you know what you are doing!
+ /**
+ * Only use if you know what you are doing!
+ *
+ */
public BasicSequence( final String identifier, final char[] mol_sequence, final TYPE type ) {
if ( ForesterUtil.isEmpty( identifier ) ) {
throw new IllegalArgumentException( "identifier of sequence cannot be empty" );
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
+import java.io.StringWriter;
+import java.io.Writer;
import java.net.URL;
import java.util.ArrayList;
import java.util.Date;
import org.forester.msa.DeleteableMsa;
import org.forester.msa.Mafft;
import org.forester.msa.Msa;
+import org.forester.msa.Msa.MSA_FORMAT;
import org.forester.msa.MsaInferrer;
import org.forester.msa.MsaMethods;
import org.forester.pccx.TestPccx;
System.out.println( "failed." );
failed++;
}
- System.exit( 0 );
+ System.exit( 0 );
if ( PERFORM_DB_TESTS ) {
System.out.print( "Uniprot Entry Retrieval: " );
if ( Test.testUniprotEntryRetrieval() ) {
}
return true;
}
-
+
private static boolean testDeleteableMsa() {
try {
final Sequence s0 = BasicSequence.createAaSequence( "a", "AAAA" );
l0.add( s3 );
l0.add( s4 );
l0.add( s5 );
- final Msa msa0 = BasicMsa.createInstance( l0 );
- final DeleteableMsa dmsa0 = new DeleteableMsa( ( BasicMsa ) msa0 );
+ final DeleteableMsa dmsa0 = DeleteableMsa.createInstance( l0 );
dmsa0.deleteRow( "b" );
if ( !dmsa0.getIdentifier( 1 ).equals( "c" ) ) {
return false;
}
System.out.println();
- System.out.println( dmsa0.toString() );
+ System.out.println( dmsa0.toString() );
dmsa0.deleteRow( "e" );
System.out.println();
- System.out.println( dmsa0.toString() );
+ System.out.println( dmsa0.toString() );
dmsa0.deleteRow( "a" );
System.out.println();
- System.out.println( dmsa0.toString() );
+ System.out.println( dmsa0.toString() );
dmsa0.deleteRow( "f" );
System.out.println();
- System.out.println( dmsa0.toString() );
-
+ System.out.println( dmsa0.toString() );
if ( dmsa0.getLength() != 4 ) {
return false;
}
if ( dmsa0.getNumberOfSequences() != 2 ) {
return false;
}
-
if ( !dmsa0.getIdentifier( 0 ).equals( "c" ) ) {
return false;
}
if ( !dmsa0.getIdentifier( 1 ).equals( "d" ) ) {
return false;
}
- if ( dmsa0.getResidueAt( 0, 0 ) != 'C') {
+ if ( dmsa0.getResidueAt( 0, 0 ) != 'C' ) {
return false;
}
- if ( !dmsa0.getSequenceAsString( 0 ).toString().equals( "CAAA" )) {
+ if ( !dmsa0.getSequenceAsString( 0 ).toString().equals( "CAAA" ) ) {
return false;
}
- if ( dmsa0.getColumnAt( 0 ).size() !=2 ) {
+ if ( dmsa0.getColumnAt( 0 ).size() != 2 ) {
return false;
}
dmsa0.deleteRow( "c" );
return false;
}
//
- final Sequence s_0 = BasicSequence.createAaSequence( "a", "--A---B-C---" );
- final Sequence s_1 = BasicSequence.createAaSequence( "b", "--B-----C---" );
- final Sequence s_2 = BasicSequence.createAaSequence( "c", "--C--AB-C---" );
- final Sequence s_3 = BasicSequence.createAaSequence( "d", "--D--AA-C---" );
- final Sequence s_4 = BasicSequence.createAaSequence( "e", "--E--AA-C---" );
- final Sequence s_5 = BasicSequence.createAaSequence( "f", "--F--AB-CD--" );
+ final Sequence s_0 = BasicSequence.createAaSequence( "a", "--A---B-C--X----" );
+ final Sequence s_1 = BasicSequence.createAaSequence( "b", "--B-----C-------" );
+ final Sequence s_2 = BasicSequence.createAaSequence( "c", "--C--AB-C------Z" );
+ final Sequence s_3 = BasicSequence.createAaSequence( "d", "--D--AA-C-------" );
+ final Sequence s_4 = BasicSequence.createAaSequence( "e", "--E--AA-C-------" );
+ final Sequence s_5 = BasicSequence.createAaSequence( "f", "--F--AB-CD--Y---" );
final List<Sequence> l1 = new ArrayList<Sequence>();
l1.add( s_0 );
l1.add( s_1 );
l1.add( s_3 );
l1.add( s_4 );
l1.add( s_5 );
- final Msa msa1 = BasicMsa.createInstance( l1 );
- final DeleteableMsa dmsa1 = new DeleteableMsa( ( BasicMsa ) msa1 );
- System.out.println( dmsa1.toString() );
- MsaMethods.removeGapColumns( 1, dmsa1 );
- System.out.println( dmsa1.toString() );
-
+ final DeleteableMsa dmsa1 = DeleteableMsa.createInstance( l1 );
+ System.out.println( dmsa1.toString() );
+ dmsa1.deleteGapOnlyColumns();
+ System.out.println( dmsa1.toString() );
+ dmsa1.deleteRow( "a" );
+ dmsa1.deleteRow( "f" );
+ dmsa1.deleteRow( "d" );
+ System.out.println( dmsa1.toString() );
+ dmsa1.deleteGapOnlyColumns();
+ System.out.println( dmsa1.toString() );
+ if ( !dmsa1.getSequenceAsString( 0 ).toString().equals( "B--C-" ) ) {
+ return false;
+ }
+ if ( !dmsa1.getSequenceAsString( 1 ).toString().equals( "CABCZ" ) ) {
+ return false;
+ }
+ if ( !dmsa1.getSequenceAsString( 2 ).toString().equals( "EAAC-" ) ) {
+ return false;
+ }
+ dmsa1.deleteRow( "c" );
+ dmsa1.deleteGapOnlyColumns();
+ final Writer w0 = new StringWriter();
+ dmsa1.write( w0, MSA_FORMAT.FASTA );
+ System.out.println( w0.toString() );
+ final Writer w1 = new StringWriter();
+ dmsa1.write( w1, MSA_FORMAT.PHYLIP );
+ System.out.println( w1.toString() );
+ if ( !dmsa1.getSequenceAsString( 0 ).toString().equals( "B--C" ) ) {
+ return false;
+ }
+ if ( !dmsa1.getSequenceAsString( 1 ).toString().equals( "EAAC" ) ) {
+ return false;
+ }
+ //
+ final Sequence s__0 = BasicSequence.createAaSequence( "a", "A------" );
+ final Sequence s__1 = BasicSequence.createAaSequence( "b", "BB-----" );
+ final Sequence s__2 = BasicSequence.createAaSequence( "c", "CCC----" );
+ final Sequence s__3 = BasicSequence.createAaSequence( "d", "DDDD---" );
+ final Sequence s__4 = BasicSequence.createAaSequence( "e", "EEEEE--" );
+ final Sequence s__5 = BasicSequence.createAaSequence( "f", "FFFFFF-" );
+ final List<Sequence> l2 = new ArrayList<Sequence>();
+ l2.add( s__0 );
+ l2.add( s__1 );
+ l2.add( s__2 );
+ l2.add( s__3 );
+ l2.add( s__4 );
+ l2.add( s__5 );
+ final DeleteableMsa dmsa2 = DeleteableMsa.createInstance( l2 );
+ System.out.println( dmsa2.toString() );
+ dmsa2.deleteGapColumns( 0.5 );
+ System.out.println( dmsa2.toString() );
}
catch ( final Exception e ) {
e.printStackTrace( System.out );