import java.util.TreeMap;
import org.forester.sequence.BasicSequence;
-import org.forester.sequence.Sequence;
+import org.forester.sequence.MolecularSequence;
import org.forester.util.BasicDescriptiveStatistics;
import org.forester.util.DescriptiveStatistics;
++new_length;
}
}
- final List<Sequence> seqs = new ArrayList<Sequence>( msa.getNumberOfSequences() );
+ final List<MolecularSequence> seqs = new ArrayList<MolecularSequence>( msa.getNumberOfSequences() );
for( int row = 0; row < msa.getNumberOfSequences(); ++row ) {
final char[] mol_seq = new char[ new_length ];
int new_col = 0;
if ( !delete_cols[ col ] ) {
final char residue = msa.getResidueAt( row, col );
mol_seq[ new_col++ ] = ( residue );
- if ( residue != Sequence.GAP ) {
+ if ( residue != MolecularSequence.GAP ) {
++non_gap_cols_sum;
}
}
_ignored_seqs_ids = new ArrayList<String>();
}
+    /**
+     * Collects the per-sequence gap counts of an alignment into a
+     * descriptive-statistics object.
+     * <p>
+     * The counts are obtained from {@link #calcNumberOfGapsInMsa(Msa)} and each
+     * one is added as a single value to a new {@link BasicDescriptiveStatistics}.
+     *
+     * @param msa the alignment to analyze
+     * @return statistics holding one value per sequence of {@code msa}
+     */
+    public static final DescriptiveStatistics calcNumberOfGapsStats( final Msa msa ) {
+        final int[] gaps_per_seq = calcNumberOfGapsInMsa( msa );
+        final DescriptiveStatistics gap_stats = new BasicDescriptiveStatistics();
+        for( int row = 0; row < gaps_per_seq.length; ++row ) {
+            gap_stats.addValue( gaps_per_seq[ row ] );
+        }
+        return gap_stats;
+    }
+
+    /**
+     * Computes, for every sequence of an alignment, the value returned by
+     * {@link #calcNumberOfGaps(MolecularSequence)}.
+     *
+     * @param msa the alignment to analyze
+     * @return an array with one entry per sequence, indexed by row
+     */
+    public static final int[] calcNumberOfGapsInMsa( final Msa msa ) {
+        final int[] gaps_per_row = new int[ msa.getNumberOfSequences() ];
+        int row = 0;
+        while ( row < gaps_per_row.length ) {
+            gaps_per_row[ row ] = calcNumberOfGaps( msa.getSequence( row ) );
+            ++row;
+        }
+        return gaps_per_row;
+    }
+
+
+
+    /**
+     * Counts the gap stretches in a sequence.
+     * <p>
+     * A stretch is a maximal run of consecutive positions for which
+     * {@code seq.isGapAt( position )} is true; a run of any length counts
+     * once, so this is not the number of gap residues.
+     *
+     * @param seq the sequence to scan
+     * @return the number of gap stretches found in {@code seq}
+     */
+    public final static int calcNumberOfGaps( final MolecularSequence seq ) {
+        int gap_runs = 0;
+        boolean prev_is_gap = false;
+        for( int pos = 0; pos < seq.getLength(); ++pos ) {
+            final boolean is_gap = seq.isGapAt( pos );
+            // A new stretch starts only on a non-gap -> gap transition.
+            if ( is_gap && !prev_is_gap ) {
+                ++gap_runs;
+            }
+            prev_is_gap = is_gap;
+        }
+        return gap_runs;
+    }
+
public static DescriptiveStatistics calcBasicGapinessStatistics( final Msa msa ) {
final DescriptiveStatistics stats = new BasicDescriptiveStatistics();
for( int i = 0; i < msa.getLength(); ++i ) {
int gaps = 0;
for( int seq = 0; seq < msa.getNumberOfSequences(); ++seq ) {
for( int i = 0; i < msa.getLength(); ++i ) {
- if ( msa.getResidueAt( seq, i ) == Sequence.GAP ) {
+ if ( msa.getResidueAt( seq, i ) == MolecularSequence.GAP ) {
gaps++;
}
}
final public static DescriptiveStatistics calculateEffectiveLengthStatistics( final Msa msa ) {
final DescriptiveStatistics stats = new BasicDescriptiveStatistics();
for( int row = 0; row < msa.getNumberOfSequences(); ++row ) {
- final Sequence s = msa.getSequence( row );
+ final MolecularSequence s = msa.getSequence( row );
stats.addValue( s.getLength() - s.getNumberOfGapResidues() );
}
return stats;
public static SortedMap<Character, Integer> calculateResidueDestributionPerColumn( final Msa msa, final int column ) {
final SortedMap<Character, Integer> map = new TreeMap<Character, Integer>();
for( final Character r : msa.getColumnAt( column ) ) {
- if ( r != Sequence.GAP ) {
+ if ( r != MolecularSequence.GAP ) {
if ( !map.containsKey( r ) ) {
map.put( r, 1 );
}
}
final public static Msa removeSequence( final Msa msa, final String to_remove_id ) {
- final List<Sequence> seqs = new ArrayList<Sequence>();
+ final List<MolecularSequence> seqs = new ArrayList<MolecularSequence>();
for( int row = 0; row < msa.getNumberOfSequences(); ++row ) {
if ( !to_remove_id.equals( msa.getIdentifier( row ) ) ) {
seqs.add( msa.getSequence( row ) );
}
final public static Msa removeSequences( final Msa msa, final List<String> to_remove_ids ) {
- final List<Sequence> seqs = new ArrayList<Sequence>();
+ final List<MolecularSequence> seqs = new ArrayList<MolecularSequence>();
for( int row = 0; row < msa.getNumberOfSequences(); ++row ) {
if ( !to_remove_ids.contains( msa.getIdentifier( row ) ) ) {
seqs.add( msa.getSequence( row ) );
for( int seq = 0; seq < msa.getNumberOfSequences(); ++seq ) {
int eff_length = 0;
for( int i = 0; i < msa.getLength(); ++i ) {
- if ( msa.getResidueAt( seq, i ) != Sequence.GAP ) {
+ if ( msa.getResidueAt( seq, i ) != MolecularSequence.GAP ) {
eff_length++;
}
}
}
final public static Msa removeSequencesByRow( final Msa msa, final List<Integer> to_remove_rows ) {
- final List<Sequence> seqs = new ArrayList<Sequence>();
+ final List<MolecularSequence> seqs = new ArrayList<MolecularSequence>();
for( int row = 0; row < msa.getNumberOfSequences(); ++row ) {
if ( !to_remove_rows.contains( row ) ) {
seqs.add( msa.getSequence( row ) );
final HashMap<Character, Integer> counts = new HashMap<Character, Integer>();
for( int row = 0; row < msa.getNumberOfSequences(); ++row ) {
final char c = msa.getResidueAt( row, col );
- if ( c != Sequence.GAP ) {
+ if ( c != MolecularSequence.GAP ) {
if ( !counts.containsKey( c ) ) {
counts.put( c, 1 );
}
final private static HashMap<Character, Integer> calcResidueDistribution6( final Msa msa, final int col ) {
// Residues are classified into one of tex2html_wrap199 types:
- // aliphatic [AVLIMC], aromatic [FWYH], polar [STNQ], positive [KR], negative [DE],
- // special conformations [GP] and gaps. This convention follows that
+ // aliphatic [AVLIMC], aromatic [FWYH], polar [STNQ], positive [KR], negative [DE],
+ // special conformations [GP] and gaps. This convention follows that
// of Mirny & Shakhnovich (1999, J Mol Biol 291:177-196).
final HashMap<Character, Integer> counts = new HashMap<Character, Integer>();
for( int row = 0; row < msa.getNumberOfSequences(); ++row ) {
final private static HashMap<Character, Integer> calcResidueDistribution7( final Msa msa, final int col ) {
// Residues are classified into one of tex2html_wrap199 types:
- // aliphatic [AVLIMC], aromatic [FWYH], polar [STNQ], positive [KR], negative [DE],
- // special conformations [GP] and gaps. This convention follows that
+ // aliphatic [AVLIMC], aromatic [FWYH], polar [STNQ], positive [KR], negative [DE],
+ // special conformations [GP] and gaps. This convention follows that
// of Mirny & Shakhnovich (1999, J Mol Biol 291:177-196).
final HashMap<Character, Integer> counts = new HashMap<Character, Integer>();
for( int row = 0; row < msa.getNumberOfSequences(); ++row ) {