X-Git-Url: http://source.jalview.org/gitweb/?a=blobdiff_plain;ds=sidebyside;f=forester%2Fjava%2Fsrc%2Forg%2Fforester%2Fmsa%2FBasicMsa.java;h=02da257664a0817e2a58a9a5a094e847a0dd8a99;hb=10297bd8b8a4b4ab198a17a42fc6ff24ae2ed49b;hp=6e407f83070b7ca48b076374f41b6aff05cd07fc;hpb=48f7a89be9d34f1930a1f863e608235cc27184c5;p=jalview.git diff --git a/forester/java/src/org/forester/msa/BasicMsa.java b/forester/java/src/org/forester/msa/BasicMsa.java index 6e407f8..02da257 100644 --- a/forester/java/src/org/forester/msa/BasicMsa.java +++ b/forester/java/src/org/forester/msa/BasicMsa.java @@ -5,7 +5,7 @@ // Copyright (C) 2010 Christian M Zmasek // Copyright (C) 2010 Sanford-Burnham Medical Research Institute // All rights reserved -// +// // This library is free software; you can redistribute it and/or // modify it under the terms of the GNU Lesser General Public // License as published by the Free Software Foundation; either @@ -15,36 +15,45 @@ // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU // Lesser General Public License for more details. -// +// // You should have received a copy of the GNU Lesser General Public // License along with this library; if not, write to the Free Software // Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA // // Contact: phylosoft @ gmail . com -// WWW: www.phylosoft.org/forester +// WWW: https://sites.google.com/site/cmzmasek/home/software/forester package org.forester.msa; import java.io.IOException; +import java.io.StringWriter; import java.io.Writer; +import java.util.ArrayList; +import java.util.HashSet; import java.util.List; +import java.util.Set; -import org.forester.sequence.Sequence; -import org.forester.sequence.Sequence.TYPE; +import org.forester.io.writers.SequenceWriter; +import org.forester.io.writers.SequenceWriter.SEQ_FORMAT; +import org.forester.sequence.BasicSequence; +import org.forester.sequence.MolecularSequence; +import org.forester.sequence.MolecularSequence.TYPE; import org.forester.util.ForesterUtil; public class BasicMsa implements Msa { - private final char[][] _data; - private final Object[] _identifiers; - private final TYPE _type; + private final char[][] _data; + private final String[] _identifiers; + private final Set _identifiers_set; + private final TYPE _type; public BasicMsa( final int rows, final int columns, final TYPE type ) { if ( ( rows < 1 ) || ( columns < 1 ) ) { throw new IllegalArgumentException( "basic msa of size zero are illegal" ); } _data = new char[ rows ][ columns ]; - _identifiers = new Object[ rows ]; + _identifiers = new String[ rows ]; + _identifiers_set = new HashSet(); _type = type; } @@ -52,21 +61,29 @@ public class BasicMsa implements Msa { _data = msa._data; _identifiers = msa._identifiers; _type = msa._type; + _identifiers_set = msa._identifiers_set; } - private int determineMaxIdLength() { - int max = 0; - for( int row = 0; row < _data.length; ++row ) { - final int l = _identifiers[ row ].toString().length(); - if ( l > max ) { - max = l; - } + @Override + public List asSequenceList() { + final List seqs = new ArrayList(); + for( int i = 0; i < getNumberOfSequences(); ++i ) { + seqs.add( getSequence( i ) ); } - return max; + return seqs; + } + + @Override + public List getColumnAt( final int col ) { + final List column = new ArrayList(); + for( int row = 0; row < getNumberOfSequences(); ++row ) { + column.add( getResidueAt( row, col ) ); + } + return column; } @Override - public Object getIdentifier( final int row ) { + public String getIdentifier( final int row ) { return _identifiers[ row ]; } @@ -86,9 +103,24 @@ public class BasicMsa implements Msa { } @Override + public MolecularSequence getSequence( final int row ) { + return new BasicSequence( getIdentifier( row ), _data[ row ], getType() ); + } + + @Override + public MolecularSequence getSequence( final String id ) { + for( int i = 0; i < getNumberOfSequences(); ++i ) { + if ( getIdentifier( i ).equals( id ) ) { + return getSequence( i ); + } + } + return null; + } + + @Override public StringBuffer getSequenceAsString( final int row ) { - final StringBuffer sb = new StringBuffer( _data[ 0 ].length ); - for( int col = 0; col < _data[ 0 ].length; ++col ) { + final StringBuffer sb = new StringBuffer( getLength() ); + for( int col = 0; col < getLength(); ++col ) { sb.append( getResidueAt( row, col ) ); } return sb; @@ -99,52 +131,128 @@ public class BasicMsa implements Msa { return _type; } - public void setIdentifier( final int row, final Object id ) { + @Override + public boolean isGapAt( final int row, final int col ) { + return getResidueAt( row, col ) == MolecularSequence.GAP; + } + + @Override + public void setIdentifier( final int row, final String id ) { + if ( ForesterUtil.isEmpty( id ) ) { + throw new IllegalArgumentException( "illegal attempt to create msa with empty identifier" ); + } + if ( _identifiers_set.contains( id ) ) { + throw new IllegalArgumentException( "illegal attempt to create msa with non-unique identifiers [" + id + + "]" ); + } + _identifiers_set.add( id ); _identifiers[ row ] = id; } + @Override public void setResidueAt( final int row, final int col, final char residue ) { _data[ row ][ col ] = residue; } @Override public String toString() { - final int max = determineMaxIdLength() + 1; - final StringBuffer sb = new StringBuffer(); - for( int row = 0; row < _data.length; ++row ) { - sb.append( ForesterUtil.pad( _identifiers[ row ].toString(), max, ' ', false ) ); - for( int col = 0; col < _data[ 0 ].length; ++col ) { - sb.append( getResidueAt( row, col ) ); + final Writer w = new StringWriter(); + try { + write( w, MSA_FORMAT.PHYLIP ); + } + catch ( final IOException e ) { + e.printStackTrace(); + } + return w.toString(); + } + + @Override + public void write( final Writer w, final MSA_FORMAT format ) throws IOException { + switch ( format ) { + case PHYLIP: + writeToPhylip( w ); + break; + case FASTA: + writeToFasta( w ); + break; + case NEXUS: + writeToNexus( w ); + break; + default: + throw new RuntimeException( "unknown format " + format ); + } + } + + private short determineMaxIdLength() { + short max = 0; + for( int row = 0; row < getNumberOfSequences(); ++row ) { + final short l = ( short ) getIdentifier( row ).length(); + if ( l > max ) { + max = l; } - sb.append( ForesterUtil.LINE_SEPARATOR ); } - return sb.toString(); + return max; + } + + private void writeToFasta( final Writer w ) throws IOException { + SequenceWriter.writeSeqs( asSequenceList(), w, SEQ_FORMAT.FASTA, 100 ); + } + + private void writeToNexus( final Writer w ) throws IOException { + final int max = determineMaxIdLength() + 1; + w.write( "Begin Data;" ); + w.write( ForesterUtil.LINE_SEPARATOR ); + w.write( " Dimensions NTax=" + getNumberOfSequences() ); + w.write( " NChar=" + getLength() ); + w.write( ";" ); + w.write( ForesterUtil.LINE_SEPARATOR ); + w.write( " Format DataType=Protein Interleave=No gap=-;" ); + w.write( ForesterUtil.LINE_SEPARATOR ); + w.write( " Matrix" ); + w.write( ForesterUtil.LINE_SEPARATOR ); + for( int row = 0; row < getNumberOfSequences(); ++row ) { + final MolecularSequence seq = getSequence( row ); + final String s = seq.getMolecularSequenceAsString(); + w.write( " " ); + w.write( ForesterUtil.pad( getIdentifier( row ).replace( ' ', '_' ), max, ' ', false ).toString() ); + w.write( " " ); + w.write( s ); + w.write( ForesterUtil.LINE_SEPARATOR ); + } + w.write( " ;" ); + w.write( ForesterUtil.LINE_SEPARATOR ); + w.write( "End;" ); + w.write( ForesterUtil.LINE_SEPARATOR ); } - public void write( final Writer w ) throws IOException { + private void writeToPhylip( final Writer w ) throws IOException { final int max = determineMaxIdLength() + 1; - for( int row = 0; row < _data.length; ++row ) { - w.write( ForesterUtil.pad( _identifiers[ row ].toString(), max, ' ', false ).toString() ); - for( int col = 0; col < _data[ 0 ].length; ++col ) { + w.write( getNumberOfSequences() + " " + getLength() ); + w.write( ForesterUtil.LINE_SEPARATOR ); + for( int row = 0; row < getNumberOfSequences(); ++row ) { + w.write( ForesterUtil.pad( getIdentifier( row ).replace( ' ', '_' ), max, ' ', false ).toString() ); + for( int col = 0; col < getLength(); ++col ) { w.write( getResidueAt( row, col ) ); } w.write( ForesterUtil.LINE_SEPARATOR ); } } - public static Msa createInstance( final List seqs ) { + public static Msa createInstance( final List seqs ) { if ( seqs.size() < 1 ) { - throw new IllegalArgumentException( "cannot create basic msa from less than one sequence" ); + throw new IllegalArgumentException( "cannot create msa from less than one sequence" ); } final int length = seqs.get( 0 ).getLength(); final BasicMsa msa = new BasicMsa( seqs.size(), length, seqs.get( 0 ).getType() ); for( int row = 0; row < seqs.size(); ++row ) { - final Sequence seq = seqs.get( row ); + final MolecularSequence seq = seqs.get( row ); if ( seq.getLength() != length ) { - throw new IllegalArgumentException( "illegal attempt to build msa from sequences of unequal length" ); + throw new IllegalArgumentException( "illegal attempt to build msa from sequences of unequal length [" + + seq.getIdentifier() + "]" ); } if ( seq.getType() != msa.getType() ) { - throw new IllegalArgumentException( "illegal attempt to build msa from sequences of different type" ); + throw new IllegalArgumentException( "illegal attempt to build msa from sequences of different type [" + + seq.getIdentifier() + "]" ); } msa.setIdentifier( row, seq.getIdentifier() ); for( int col = 0; col < length; ++col ) {