X-Git-Url: http://source.jalview.org/gitweb/?a=blobdiff_plain;f=forester%2Fjava%2Fsrc%2Forg%2Fforester%2Fmsa%2FMsaMethods.java;h=2b470203d611294f1de19114e5226e0102d17208;hb=88718609970e490e94727d12ebbca1270ba2c0a7;hp=c94869e37d18822fde7536b869a23b39e65dad49;hpb=cea9261d689486e31e8abd13a25ffbff3c826e2e;p=jalview.git diff --git a/forester/java/src/org/forester/msa/MsaMethods.java b/forester/java/src/org/forester/msa/MsaMethods.java index c94869e..2b47020 100644 --- a/forester/java/src/org/forester/msa/MsaMethods.java +++ b/forester/java/src/org/forester/msa/MsaMethods.java @@ -34,7 +34,7 @@ import java.util.SortedMap; import java.util.TreeMap; import org.forester.sequence.BasicSequence; -import org.forester.sequence.Sequence; +import org.forester.sequence.MolecularSequence; import org.forester.util.BasicDescriptiveStatistics; import org.forester.util.DescriptiveStatistics; @@ -67,7 +67,7 @@ public final class MsaMethods { ++new_length; } } - final List seqs = new ArrayList( msa.getNumberOfSequences() ); + final List seqs = new ArrayList( msa.getNumberOfSequences() ); for( int row = 0; row < msa.getNumberOfSequences(); ++row ) { final char[] mol_seq = new char[ new_length ]; int new_col = 0; @@ -76,7 +76,7 @@ public final class MsaMethods { if ( !delete_cols[ col ] ) { final char residue = msa.getResidueAt( row, col ); mol_seq[ new_col++ ] = ( residue ); - if ( residue != Sequence.GAP ) { + if ( residue != MolecularSequence.GAP ) { ++non_gap_cols_sum; } } @@ -107,6 +107,43 @@ public final class MsaMethods { _ignored_seqs_ids = new ArrayList(); } + public static final DescriptiveStatistics calcNumberOfGapsStats( final Msa msa ) { + final int[] gaps = calcNumberOfGapsInMsa( msa ); + final DescriptiveStatistics stats = new BasicDescriptiveStatistics(); + for( final int gap : gaps ) { + stats.addValue( gap ); + } + return stats; + } + + public static final int[] calcNumberOfGapsInMsa( final Msa msa ) { + final int seqs = msa.getNumberOfSequences(); + final int[] gaps= new int[ seqs ]; + for( int i = 0; i < seqs; ++i ) { + gaps[ i ] = calcNumberOfGaps( msa.getSequence( i ) ); + } + return gaps; + } + + + + public final static int calcNumberOfGaps( final MolecularSequence seq ) { + int gaps = 0; + boolean was_gap = false; + for( int i = 0; i < seq.getLength(); ++i ) { + if ( seq.isGapAt( i ) ) { + if ( !was_gap ) { + ++gaps; + was_gap = true; + } + } + else { + was_gap = false; + } + } + return gaps; + } + public static DescriptiveStatistics calcBasicGapinessStatistics( final Msa msa ) { final DescriptiveStatistics stats = new BasicDescriptiveStatistics(); for( int i = 0; i < msa.getLength(); ++i ) { @@ -119,7 +156,7 @@ public final class MsaMethods { int gaps = 0; for( int seq = 0; seq < msa.getNumberOfSequences(); ++seq ) { for( int i = 0; i < msa.getLength(); ++i ) { - if ( msa.getResidueAt( seq, i ) == Sequence.GAP ) { + if ( msa.getResidueAt( seq, i ) == MolecularSequence.GAP ) { gaps++; } } @@ -191,7 +228,7 @@ public final class MsaMethods { final public static DescriptiveStatistics calculateEffectiveLengthStatistics( final Msa msa ) { final DescriptiveStatistics stats = new BasicDescriptiveStatistics(); for( int row = 0; row < msa.getNumberOfSequences(); ++row ) { - final Sequence s = msa.getSequence( row ); + final MolecularSequence s = msa.getSequence( row ); stats.addValue( s.getLength() - s.getNumberOfGapResidues() ); } return stats; @@ -221,7 +258,7 @@ public final class MsaMethods { public static SortedMap calculateResidueDestributionPerColumn( final Msa msa, final int column ) { final SortedMap map = new TreeMap(); for( final Character r : msa.getColumnAt( column ) ) { - if ( r != Sequence.GAP ) { + if ( r != MolecularSequence.GAP ) { if ( !map.containsKey( r ) ) { map.put( r, 1 ); } @@ -238,7 +275,7 @@ public final class MsaMethods { } final public static Msa removeSequence( final Msa msa, final String to_remove_id ) { - final List seqs = new ArrayList(); + final List seqs = new ArrayList(); for( int row = 0; row < msa.getNumberOfSequences(); ++row ) { if ( !to_remove_id.equals( msa.getIdentifier( row ) ) ) { seqs.add( msa.getSequence( row ) ); @@ -251,7 +288,7 @@ public final class MsaMethods { } final public static Msa removeSequences( final Msa msa, final List to_remove_ids ) { - final List seqs = new ArrayList(); + final List seqs = new ArrayList(); for( int row = 0; row < msa.getNumberOfSequences(); ++row ) { if ( !to_remove_ids.contains( msa.getIdentifier( row ) ) ) { seqs.add( msa.getSequence( row ) ); @@ -268,7 +305,7 @@ public final class MsaMethods { for( int seq = 0; seq < msa.getNumberOfSequences(); ++seq ) { int eff_length = 0; for( int i = 0; i < msa.getLength(); ++i ) { - if ( msa.getResidueAt( seq, i ) != Sequence.GAP ) { + if ( msa.getResidueAt( seq, i ) != MolecularSequence.GAP ) { eff_length++; } } @@ -280,7 +317,7 @@ public final class MsaMethods { } final public static Msa removeSequencesByRow( final Msa msa, final List to_remove_rows ) { - final List seqs = new ArrayList(); + final List seqs = new ArrayList(); for( int row = 0; row < msa.getNumberOfSequences(); ++row ) { if ( !to_remove_rows.contains( row ) ) { seqs.add( msa.getSequence( row ) ); @@ -296,7 +333,7 @@ public final class MsaMethods { final HashMap counts = new HashMap(); for( int row = 0; row < msa.getNumberOfSequences(); ++row ) { final char c = msa.getResidueAt( row, col ); - if ( c != Sequence.GAP ) { + if ( c != MolecularSequence.GAP ) { if ( !counts.containsKey( c ) ) { counts.put( c, 1 ); } @@ -324,8 +361,8 @@ public final class MsaMethods { final private static HashMap calcResidueDistribution6( final Msa msa, final int col ) { // Residues are classified into one of tex2html_wrap199 types: - // aliphatic [AVLIMC], aromatic [FWYH], polar [STNQ], positive [KR], negative [DE], - // special conformations [GP] and gaps. This convention follows that + // aliphatic [AVLIMC], aromatic [FWYH], polar [STNQ], positive [KR], negative [DE], + // special conformations [GP] and gaps. This convention follows that // of Mirny & Shakhnovich (1999, J Mol Biol 291:177-196). final HashMap counts = new HashMap(); for( int row = 0; row < msa.getNumberOfSequences(); ++row ) { @@ -370,8 +407,8 @@ public final class MsaMethods { final private static HashMap calcResidueDistribution7( final Msa msa, final int col ) { // Residues are classified into one of tex2html_wrap199 types: - // aliphatic [AVLIMC], aromatic [FWYH], polar [STNQ], positive [KR], negative [DE], - // special conformations [GP] and gaps. This convention follows that + // aliphatic [AVLIMC], aromatic [FWYH], polar [STNQ], positive [KR], negative [DE], + // special conformations [GP] and gaps. This convention follows that // of Mirny & Shakhnovich (1999, J Mol Biol 291:177-196). final HashMap counts = new HashMap(); for( int row = 0; row < msa.getNumberOfSequences(); ++row ) {