X-Git-Url: http://source.jalview.org/gitweb/?a=blobdiff_plain;f=forester%2Fjava%2Fsrc%2Forg%2Fforester%2Fmsa%2FBasicMsa.java;h=02da257664a0817e2a58a9a5a094e847a0dd8a99;hb=10297bd8b8a4b4ab198a17a42fc6ff24ae2ed49b;hp=bda642466b2b6aaecd05b3a487f2def2e46697b0;hpb=d0bb37d418d945966304afe70431185c3873635a;p=jalview.git diff --git a/forester/java/src/org/forester/msa/BasicMsa.java b/forester/java/src/org/forester/msa/BasicMsa.java index bda6424..02da257 100644 --- a/forester/java/src/org/forester/msa/BasicMsa.java +++ b/forester/java/src/org/forester/msa/BasicMsa.java @@ -26,6 +26,7 @@ package org.forester.msa; import java.io.IOException; +import java.io.StringWriter; import java.io.Writer; import java.util.ArrayList; import java.util.HashSet; @@ -35,15 +36,16 @@ import java.util.Set; import org.forester.io.writers.SequenceWriter; import org.forester.io.writers.SequenceWriter.SEQ_FORMAT; import org.forester.sequence.BasicSequence; -import org.forester.sequence.Sequence; -import org.forester.sequence.Sequence.TYPE; +import org.forester.sequence.MolecularSequence; +import org.forester.sequence.MolecularSequence.TYPE; import org.forester.util.ForesterUtil; public class BasicMsa implements Msa { - private final char[][] _data; - private final String[] _identifiers; - private final TYPE _type; + private final char[][] _data; + private final String[] _identifiers; + private final Set _identifiers_set; + private final TYPE _type; public BasicMsa( final int rows, final int columns, final TYPE type ) { if ( ( rows < 1 ) || ( columns < 1 ) ) { @@ -51,6 +53,7 @@ public class BasicMsa implements Msa { } _data = new char[ rows ][ columns ]; _identifiers = new String[ rows ]; + _identifiers_set = new HashSet(); _type = type; } @@ -58,26 +61,25 @@ public class BasicMsa implements Msa { _data = msa._data; _identifiers = msa._identifiers; _type = msa._type; + _identifiers_set = msa._identifiers_set; } @Override - public List asSequenceList() { - final List seqs = new ArrayList(); + public List asSequenceList() { + final List seqs = new ArrayList(); for( int i = 0; i < getNumberOfSequences(); ++i ) { seqs.add( getSequence( i ) ); } return seqs; } - private int determineMaxIdLength() { - int max = 0; - for( int row = 0; row < _data.length; ++row ) { - final int l = _identifiers[ row ].toString().length(); - if ( l > max ) { - max = l; - } + @Override + public List getColumnAt( final int col ) { + final List column = new ArrayList(); + for( int row = 0; row < getNumberOfSequences(); ++row ) { + column.add( getResidueAt( row, col ) ); } - return max; + return column; } @Override @@ -101,7 +103,12 @@ public class BasicMsa implements Msa { } @Override - public Sequence getSequence( final String id ) { + public MolecularSequence getSequence( final int row ) { + return new BasicSequence( getIdentifier( row ), _data[ row ], getType() ); + } + + @Override + public MolecularSequence getSequence( final String id ) { for( int i = 0; i < getNumberOfSequences(); ++i ) { if ( getIdentifier( i ).equals( id ) ) { return getSequence( i ); @@ -111,14 +118,9 @@ public class BasicMsa implements Msa { } @Override - public Sequence getSequence( final int row ) { - return new BasicSequence( getIdentifier( row ), _data[ row ], getType() ); - } - - @Override public StringBuffer getSequenceAsString( final int row ) { - final StringBuffer sb = new StringBuffer( _data[ 0 ].length ); - for( int col = 0; col < _data[ 0 ].length; ++col ) { + final StringBuffer sb = new StringBuffer( getLength() ); + for( int col = 0; col < getLength(); ++col ) { sb.append( getResidueAt( row, col ) ); } return sb; @@ -130,7 +132,20 @@ public class BasicMsa implements Msa { } @Override + public boolean isGapAt( final int row, final int col ) { + return getResidueAt( row, col ) == MolecularSequence.GAP; + } + + @Override public void setIdentifier( final int row, final String id ) { + if ( ForesterUtil.isEmpty( id ) ) { + throw new IllegalArgumentException( "illegal attempt to create msa with empty identifier" ); + } + if ( _identifiers_set.contains( id ) ) { + throw new IllegalArgumentException( "illegal attempt to create msa with non-unique identifiers [" + id + + "]" ); + } + _identifiers_set.add( id ); _identifiers[ row ] = id; } @@ -141,16 +156,14 @@ public class BasicMsa implements Msa { @Override public String toString() { - final int max = determineMaxIdLength() + 1; - final StringBuffer sb = new StringBuffer(); - for( int row = 0; row < _data.length; ++row ) { - sb.append( ForesterUtil.pad( _identifiers[ row ].toString(), max, ' ', false ) ); - for( int col = 0; col < _data[ 0 ].length; ++col ) { - sb.append( getResidueAt( row, col ) ); - } - sb.append( ForesterUtil.LINE_SEPARATOR ); + final Writer w = new StringWriter(); + try { + write( w, MSA_FORMAT.PHYLIP ); } - return sb.toString(); + catch ( final IOException e ) { + e.printStackTrace(); + } + return w.toString(); } @Override @@ -162,35 +175,77 @@ public class BasicMsa implements Msa { case FASTA: writeToFasta( w ); break; + case NEXUS: + writeToNexus( w ); + break; default: throw new RuntimeException( "unknown format " + format ); } } + private short determineMaxIdLength() { + short max = 0; + for( int row = 0; row < getNumberOfSequences(); ++row ) { + final short l = ( short ) getIdentifier( row ).length(); + if ( l > max ) { + max = l; + } + } + return max; + } + private void writeToFasta( final Writer w ) throws IOException { SequenceWriter.writeSeqs( asSequenceList(), w, SEQ_FORMAT.FASTA, 100 ); } + private void writeToNexus( final Writer w ) throws IOException { + final int max = determineMaxIdLength() + 1; + w.write( "Begin Data;" ); + w.write( ForesterUtil.LINE_SEPARATOR ); + w.write( " Dimensions NTax=" + getNumberOfSequences() ); + w.write( " NChar=" + getLength() ); + w.write( ";" ); + w.write( ForesterUtil.LINE_SEPARATOR ); + w.write( " Format DataType=Protein Interleave=No gap=-;" ); + w.write( ForesterUtil.LINE_SEPARATOR ); + w.write( " Matrix" ); + w.write( ForesterUtil.LINE_SEPARATOR ); + for( int row = 0; row < getNumberOfSequences(); ++row ) { + final MolecularSequence seq = getSequence( row ); + final String s = seq.getMolecularSequenceAsString(); + w.write( " " ); + w.write( ForesterUtil.pad( getIdentifier( row ).replace( ' ', '_' ), max, ' ', false ).toString() ); + w.write( " " ); + w.write( s ); + w.write( ForesterUtil.LINE_SEPARATOR ); + } + w.write( " ;" ); + w.write( ForesterUtil.LINE_SEPARATOR ); + w.write( "End;" ); + w.write( ForesterUtil.LINE_SEPARATOR ); + } + private void writeToPhylip( final Writer w ) throws IOException { final int max = determineMaxIdLength() + 1; - for( int row = 0; row < _data.length; ++row ) { - w.write( ForesterUtil.pad( _identifiers[ row ].toString(), max, ' ', false ).toString() ); - for( int col = 0; col < _data[ 0 ].length; ++col ) { + w.write( getNumberOfSequences() + " " + getLength() ); + w.write( ForesterUtil.LINE_SEPARATOR ); + for( int row = 0; row < getNumberOfSequences(); ++row ) { + w.write( ForesterUtil.pad( getIdentifier( row ).replace( ' ', '_' ), max, ' ', false ).toString() ); + for( int col = 0; col < getLength(); ++col ) { w.write( getResidueAt( row, col ) ); } w.write( ForesterUtil.LINE_SEPARATOR ); } } - public static Msa createInstance( final List seqs ) { + public static Msa createInstance( final List seqs ) { if ( seqs.size() < 1 ) { - throw new IllegalArgumentException( "cannot create basic msa from less than one sequence" ); + throw new IllegalArgumentException( "cannot create msa from less than one sequence" ); } - final Set ids = new HashSet(); final int length = seqs.get( 0 ).getLength(); final BasicMsa msa = new BasicMsa( seqs.size(), length, seqs.get( 0 ).getType() ); for( int row = 0; row < seqs.size(); ++row ) { - final Sequence seq = seqs.get( row ); + final MolecularSequence seq = seqs.get( row ); if ( seq.getLength() != length ) { throw new IllegalArgumentException( "illegal attempt to build msa from sequences of unequal length [" + seq.getIdentifier() + "]" ); @@ -199,11 +254,6 @@ public class BasicMsa implements Msa { throw new IllegalArgumentException( "illegal attempt to build msa from sequences of different type [" + seq.getIdentifier() + "]" ); } - if ( ids.contains( seq.getIdentifier() ) ) { - throw new IllegalArgumentException( "illegal attempt to create msa with non-unique identifiers [" - + seq.getIdentifier() + "]" ); - } - ids.add( seq.getIdentifier() ); msa.setIdentifier( row, seq.getIdentifier() ); for( int col = 0; col < length; ++col ) { msa._data[ row ][ col ] = seq.getResidueAt( col ); @@ -211,18 +261,4 @@ public class BasicMsa implements Msa { } return msa; } - - @Override - public List getColumnAt( final int col ) { - final List column = new ArrayList(); - for( int row = 0; row < getNumberOfSequences(); ++row ) { - column.add( getResidueAt( row, col ) ); - } - return column; - } - - @Override - public boolean isGapAt( final int row, final int col ) { - return getResidueAt( row, col ) == Sequence.GAP; - } }