in progress
author cmzmasek@gmail.com <cmzmasek@gmail.com@ca865154-3058-d1c3-3e42-d8f55a55bdbd>
Wed, 27 Aug 2014 17:50:01 +0000 (17:50 +0000)
committer cmzmasek@gmail.com <cmzmasek@gmail.com@ca865154-3058-d1c3-3e42-d8f55a55bdbd>
Wed, 27 Aug 2014 17:50:01 +0000 (17:50 +0000)
26 files changed:
forester/java/src/org/forester/application/check_fasta.java
forester/java/src/org/forester/application/decorator.java
forester/java/src/org/forester/archaeopteryx/MainFrameApplication.java
forester/java/src/org/forester/archaeopteryx/TreePanel.java
forester/java/src/org/forester/archaeopteryx/phylogeny/data/RenderableMsaSequence.java
forester/java/src/org/forester/archaeopteryx/tools/PhyloInferenceDialog.java
forester/java/src/org/forester/archaeopteryx/tools/PhylogeneticInferrer.java
forester/java/src/org/forester/io/parsers/FastaParser.java
forester/java/src/org/forester/io/parsers/GeneralMsaParser.java
forester/java/src/org/forester/io/parsers/nexus/NexusConstants.java
forester/java/src/org/forester/io/parsers/nexus/NexusPhylogeniesParser.java
forester/java/src/org/forester/io/writers/SequenceWriter.java
forester/java/src/org/forester/msa/BasicMsa.java
forester/java/src/org/forester/msa/ClustalOmega.java
forester/java/src/org/forester/msa/DeleteableMsa.java
forester/java/src/org/forester/msa/Mafft.java
forester/java/src/org/forester/msa/Msa.java
forester/java/src/org/forester/msa/MsaInferrer.java
forester/java/src/org/forester/msa/MsaMethods.java
forester/java/src/org/forester/msa/ResampleableMsa.java
forester/java/src/org/forester/msa_compactor/MsaCompactor.java
forester/java/src/org/forester/phylogeny/PhylogenyMethods.java
forester/java/src/org/forester/phylogeny/data/Sequence.java
forester/java/src/org/forester/sequence/BasicSequence.java
forester/java/src/org/forester/sequence/MolecularSequence.java [moved from forester/java/src/org/forester/sequence/Sequence.java with 98% similarity]
forester/java/src/org/forester/test/Test.java

index dd312e8..cba390e 100644 (file)
@@ -38,7 +38,7 @@ import org.forester.io.parsers.FastaParser;
 import org.forester.io.writers.SequenceWriter;
 import org.forester.io.writers.SequenceWriter.SEQ_FORMAT;
 import org.forester.sequence.BasicSequence;
-import org.forester.sequence.Sequence;
+import org.forester.sequence.MolecularSequence;
 import org.forester.util.CommandLineArguments;
 import org.forester.util.ForesterUtil;
 
@@ -94,10 +94,10 @@ public final class check_fasta {
         }
         else {
             try {
-                final List<Sequence> seqs = FastaParser.parse( new FileInputStream( infile ) );
+                final List<MolecularSequence> seqs = FastaParser.parse( new FileInputStream( infile ) );
                 final Map<String, Short> names = new HashMap<String, Short>();
                 int duplicates = 0;
-                for( final Sequence seq : seqs ) {
+                for( final MolecularSequence seq : seqs ) {
                     if ( procSeq( infile.toString(), names, seq ) ) {
                         ++duplicates;
                     }
@@ -112,7 +112,7 @@ public final class check_fasta {
         }
     }
 
-    private static boolean procSeq( final String infile, final Map<String, Short> names, final Sequence seq ) {
+    private static boolean procSeq( final String infile, final Map<String, Short> names, final MolecularSequence seq ) {
         boolean duplicate = false;
         final String name = seq.getIdentifier();
         if ( !names.containsKey( name ) ) {
index 54458a3..7f4c304 100644 (file)
@@ -45,7 +45,7 @@ import org.forester.phylogeny.PhylogenyMethods.DESCENDANT_SORT_PRIORITY;
 import org.forester.phylogeny.data.Identifier;
 import org.forester.phylogeny.factories.ParserBasedPhylogenyFactory;
 import org.forester.phylogeny.factories.PhylogenyFactory;
-import org.forester.sequence.Sequence;
+import org.forester.sequence.MolecularSequence;
 import org.forester.tools.PhylogenyDecorator;
 import org.forester.tools.PhylogenyDecorator.FIELD;
 import org.forester.util.BasicTable;
@@ -399,7 +399,7 @@ public final class decorator {
     }
 
     private static Map<String, String> readFastaFileIntoMap( final File mapping_infile, final boolean verbose ) {
-        List<Sequence> seqs = null;
+        List<MolecularSequence> seqs = null;
         try {
             seqs = FastaParser.parse( new FileInputStream( mapping_infile ) );
         }
@@ -412,7 +412,7 @@ public final class decorator {
                     + "] is devoid of fasta-formatted sequences" );
         }
         final Map<String, String> map = new HashMap<String, String>();
-        for( final Sequence seq : seqs ) {
+        for( final MolecularSequence seq : seqs ) {
             if ( ForesterUtil.isEmpty( seq.getIdentifier() ) ) {
                 ForesterUtil.fatalError( decorator.PRG_NAME, "fasta-file [" + mapping_infile
                         + "] contains sequence with empty identifier" );
index 40d9581..947ed2a 100644 (file)
@@ -97,7 +97,7 @@ import org.forester.phylogeny.data.Taxonomy;
 import org.forester.phylogeny.factories.ParserBasedPhylogenyFactory;\r
 import org.forester.phylogeny.factories.PhylogenyFactory;\r
 import org.forester.phylogeny.iterators.PhylogenyNodeIterator;\r
-import org.forester.sequence.Sequence;\r
+import org.forester.sequence.MolecularSequence;\r
 import org.forester.util.BasicDescriptiveStatistics;\r
 import org.forester.util.BasicTable;\r
 import org.forester.util.BasicTableParser;\r
@@ -153,7 +153,7 @@ public final class MainFrameApplication extends MainFrame {
     private PhylogeneticInferenceOptions     _phylogenetic_inference_options       = null;\r
     private Msa                              _msa                                  = null;\r
     private File                             _msa_file                             = null;\r
-    private List<Sequence>                   _seqs                                 = null;\r
+    private List<MolecularSequence>          _seqs                                 = null;\r
     private File                             _seqs_file                            = null;\r
     JMenuItem                                _read_values_jmi;\r
     JMenuItem                                _read_seqs_jmi;\r
@@ -589,7 +589,7 @@ public final class MainFrameApplication extends MainFrame {
         return _msa_file;\r
     }\r
 \r
-    public List<Sequence> getSeqs() {\r
+    public List<MolecularSequence> getSeqs() {\r
         return _seqs;\r
     }\r
 \r
@@ -697,11 +697,11 @@ public final class MainFrameApplication extends MainFrame {
         if ( ( file != null ) && !file.isDirectory() && ( result == JFileChooser.APPROVE_OPTION ) ) {\r
             setSeqsFile( null );\r
             setSeqs( null );\r
-            List<Sequence> seqs = null;\r
+            List<MolecularSequence> seqs = null;\r
             try {\r
                 if ( FastaParser.isLikelyFasta( new FileInputStream( file ) ) ) {\r
                     seqs = FastaParser.parse( new FileInputStream( file ) );\r
-                    for( final Sequence seq : seqs ) {\r
+                    for( final MolecularSequence seq : seqs ) {\r
                         System.out.println( SequenceWriter.toFasta( seq, 60 ) );\r
                     }\r
                 }\r
@@ -1260,7 +1260,7 @@ public final class MainFrameApplication extends MainFrame {
         _msa_file = msa_file;\r
     }\r
 \r
-    void setSeqs( final List<Sequence> seqs ) {\r
+    void setSeqs( final List<MolecularSequence> seqs ) {\r
         _seqs = seqs;\r
     }\r
 \r
@@ -1431,7 +1431,7 @@ public final class MainFrameApplication extends MainFrame {
         }\r
         final int result = _sequences_filechooser.showOpenDialog( _contentpane );\r
         final File file = _sequences_filechooser.getSelectedFile();\r
-        List<Sequence> seqs = null;\r
+        List<MolecularSequence> seqs = null;\r
         if ( ( file != null ) && !file.isDirectory() && ( result == JFileChooser.APPROVE_OPTION ) ) {\r
             try {\r
                 if ( FastaParser.isLikelyFasta( new FileInputStream( file ) ) ) {\r
@@ -1480,13 +1480,13 @@ public final class MainFrameApplication extends MainFrame {
             }\r
         }\r
         if ( seqs != null ) {\r
-            for( final Sequence seq : seqs ) {\r
+            for( final MolecularSequence seq : seqs ) {\r
                 System.out.println( seq.getIdentifier() );\r
             }\r
             final Phylogeny phy = getCurrentTreePanel().getPhylogeny();\r
             int total_counter = 0;\r
             int attached_counter = 0;\r
-            for( final Sequence seq : seqs ) {\r
+            for( final MolecularSequence seq : seqs ) {\r
                 ++total_counter;\r
                 final String seq_name = seq.getIdentifier();\r
                 if ( !ForesterUtil.isEmpty( seq_name ) ) {\r
index e96c197..314ee51 100644 (file)
@@ -4812,18 +4812,18 @@ public final class TreePanel extends JPanel implements ActionListener, MouseWhee
                         if ( getOptions().isRightLineUpDomains() ) {
                             rds.render( ( float ) ( ( getMaxDistanceToRoot() * getXcorrectionFactor() )
                                     + _length_of_longest_text + ( ( _longest_domain - rds.getTotalLength() ) * rds
-                                    .getRenderingFactorWidth() ) ), node.getYcoord() - ( h / 2 ), g, this, to_pdf );
+                                    .getRenderingFactorWidth() ) ), node.getYcoord() - ( h / 2.0f ), g, this, to_pdf );
                         }
                         else {
                             rds.render( ( float ) ( ( getMaxDistanceToRoot() * getXcorrectionFactor() ) + _length_of_longest_text ),
-                                        node.getYcoord() - ( h / 2 ),
+                                        node.getYcoord() - ( h / 2.0f ),
                                         g,
                                         this,
                                         to_pdf );
                         }
                     }
                     else {
-                        rds.render( node.getXcoord() + x, node.getYcoord() - ( h / 2 ), g, this, to_pdf );
+                        rds.render( node.getXcoord() + x, node.getYcoord() - ( h / 2.0f ), g, this, to_pdf );
                     }
                 }
                 else {
@@ -4831,14 +4831,14 @@ public final class TreePanel extends JPanel implements ActionListener, MouseWhee
                         rds.render( ( ( getPhylogeny().getFirstExternalNode().getXcoord() + _length_of_longest_text ) - 20 )
                                             + ( ( _longest_domain - rds.getTotalLength() ) * rds
                                                     .getRenderingFactorWidth() ),
-                                    node.getYcoord() - ( h / 2 ),
+                                    node.getYcoord() - ( h / 2.0f ),
                                     g,
                                     this,
                                     to_pdf );
                     }
                     else {
                         rds.render( getPhylogeny().getFirstExternalNode().getXcoord() + _length_of_longest_text,
-                                    node.getYcoord() - ( h / 2 ),
+                                    node.getYcoord() - ( h / 2.0f ),
                                     g,
                                     this,
                                     to_pdf );
@@ -4873,7 +4873,7 @@ public final class TreePanel extends JPanel implements ActionListener, MouseWhee
                 && ( node.getNodeData().getSequence().isMolecularSequenceAligned() )
                 && ( !ForesterUtil.isEmpty( node.getNodeData().getSequence().getMolecularSequence() ) ) ) {
             final RenderableMsaSequence rs = RenderableMsaSequence.createInstance( node.getNodeData().getSequence()
-                    .getMolecularSequence(), getConfiguration() );
+                    .getMolecularSequence(), node.getNodeData().getSequence().getType(), getConfiguration() );
             if ( rs != null ) {
                 final int default_height = 7;
                 float y = getYdistance();
@@ -4884,14 +4884,14 @@ public final class TreePanel extends JPanel implements ActionListener, MouseWhee
                 rs.setRenderingHeight( h > 1 ? h : 2 );
                 if ( getControlPanel().isDrawPhylogram() ) {
                     rs.render( ( float ) ( ( getMaxDistanceToRoot() * getXcorrectionFactor() ) + _length_of_longest_text ),
-                               node.getYcoord() - ( h / 2 ),
+                               node.getYcoord() - ( h / 2.0f ),
                                g,
                                this,
                                to_pdf );
                 }
                 else {
                     rs.render( getPhylogeny().getFirstExternalNode().getXcoord() + _length_of_longest_text,
-                               node.getYcoord() - ( h / 2 ),
+                               node.getYcoord() - ( h / 2.0f ),
                                g,
                                this,
                                to_pdf );
index c83a5da..2f09ccf 100644 (file)
@@ -36,6 +36,8 @@ import java.io.Writer;
 import org.forester.archaeopteryx.Configuration;
 import org.forester.archaeopteryx.TreePanel;
 import org.forester.phylogeny.data.PhylogenyData;
+import org.forester.sequence.MolecularSequence;
+import org.forester.sequence.MolecularSequence.TYPE;
 
 public final class RenderableMsaSequence implements RenderablePhylogenyData {
 
@@ -46,6 +48,7 @@ public final class RenderableMsaSequence implements RenderablePhylogenyData {
     private final Rectangle2D            _rectangle              = new Rectangle2D.Float();
     private double                       _height                 = DEFAULT_HEIGHT;
     private final float                  _width                  = DEFAULT_WIDTH;
+    private MolecularSequence.TYPE       _type;
     private static RenderableMsaSequence _instance               = null;
 
     private RenderableMsaSequence() {
@@ -150,11 +153,39 @@ public final class RenderableMsaSequence implements RenderablePhylogenyData {
     }
 
     private Color calculateColor( final char c ) {
-        if ( ( c == 'G' ) || ( c == 'A' ) || ( c == 'S' ) || ( c == 'T' ) ) {
+        if ( _type == TYPE.AA ) {
+            return calculateAAColor( c );
+        }
+        return calculateNucleotideColor( c );
+    }
+
+    private Color calculateNucleotideColor( final char c ) { // Maps one nucleotide character to its display color; only upper-case 'A', 'T', 'U', 'G', 'C' are recognized, everything else is gray.
+        if ( c == 'A' ) {
+            return Color.YELLOW;
+        }
+        if ( ( c == 'T' ) || ( c == 'U' ) ) { // 'T' and 'U' share one color
             return Color.ORANGE;
         }
+        if ( c == 'G' ) {
+            return Color.BLUE;
+        }
+        if ( c == 'C' ) {
+            return Color.CYAN;
+        }
+        else if ( c == '-' ) { // '-' yields the same gray as the fallback branch below
+            return Color.GRAY;
+        }
+        else { // any other character (including lower-case letters)
+            return Color.GRAY;
+        }
+    }
+
+    private Color calculateAAColor( final char c ) {
+        if ( ( c == 'G' ) || ( c == 'A' ) || ( c == 'S' ) || ( c == 'T' ) ) {
+            return Color.YELLOW;
+        }
         else if ( ( c == 'N' ) || ( c == 'Q' ) || ( c == 'H' ) ) {
-            return Color.MAGENTA;
+            return Color.PINK;
         }
         else if ( ( c == 'D' ) || ( c == 'E' ) ) {
             return Color.RED;
@@ -165,6 +196,9 @@ public final class RenderableMsaSequence implements RenderablePhylogenyData {
         else if ( c == '-' ) {
             return Color.GRAY;
         }
+        else if ( c == 'X' ) {
+            return Color.GRAY;
+        }
         else {
             return Color.GREEN;
         }
@@ -174,10 +208,21 @@ public final class RenderableMsaSequence implements RenderablePhylogenyData {
         return _height;
     }
 
-    public static RenderableMsaSequence createInstance( final String seq, final Configuration configuration ) {
+    public static RenderableMsaSequence createInstance( final String seq,
+                                                        final String type,
+                                                        final Configuration configuration ) {
         if ( _instance == null ) {
             _instance = new RenderableMsaSequence();
         }
+        if ( type.equals( "protein" ) ) {
+            _instance._type = TYPE.AA;
+        }
+        else if ( type.equals( "dna" ) ) {
+            _instance._type = TYPE.DNA;
+        }
+        else {
+            _instance._type = TYPE.RNA;
+        }
         _instance._seq = seq.toCharArray();
         if ( configuration != null ) {
         }
index 9eab024..8850d2b 100644 (file)
@@ -48,7 +48,7 @@ import javax.swing.border.LineBorder;
 import org.forester.archaeopteryx.AptxUtil;
 import org.forester.archaeopteryx.MainFrameApplication;
 import org.forester.evoinference.distance.PairwiseDistanceCalculator.PWD_DISTANCE_METHOD;
-import org.forester.sequence.Sequence;
+import org.forester.sequence.MolecularSequence;
 import org.forester.util.BasicDescriptiveStatistics;
 import org.forester.util.DescriptiveStatistics;
 
@@ -477,9 +477,9 @@ public class PhyloInferenceDialog extends JDialog implements ActionListener {
         }
     }
 
-    DescriptiveStatistics calcSequenceStats( final List<Sequence> seqs ) {
+    DescriptiveStatistics calcSequenceStats( final List<MolecularSequence> seqs ) {
         final DescriptiveStatistics stats = new BasicDescriptiveStatistics();
-        for( final Sequence s : seqs ) {
+        for( final MolecularSequence s : seqs ) {
             stats.addValue( s.getLength() );
         }
         return stats;
index 8b89646..0f29e93 100644 (file)
@@ -47,7 +47,7 @@ import org.forester.msa.MsaMethods;
 import org.forester.msa.ResampleableMsa;
 import org.forester.phylogeny.Phylogeny;
 import org.forester.phylogeny.PhylogenyMethods;
-import org.forester.sequence.Sequence;
+import org.forester.sequence.MolecularSequence;
 import org.forester.tools.ConfidenceAssessor;
 import org.forester.util.ForesterUtil;
 
@@ -56,12 +56,12 @@ public class PhylogeneticInferrer extends RunnableProcess {
     private Msa                                _msa;
     private final MainFrameApplication         _mf;
     private final PhylogeneticInferenceOptions _options;
-    private final List<Sequence>               _seqs;
+    private final List<MolecularSequence>      _seqs;
     private final boolean                      DEBUG           = true;
     public final static String                 MSA_FILE_SUFFIX = ".aln";
     public final static String                 PWD_FILE_SUFFIX = ".pwd";
 
-    public PhylogeneticInferrer( final List<Sequence> seqs,
+    public PhylogeneticInferrer( final List<MolecularSequence> seqs,
                                  final PhylogeneticInferenceOptions options,
                                  final MainFrameApplication mf ) {
         _msa = null;
@@ -260,7 +260,8 @@ public class PhylogeneticInferrer extends RunnableProcess {
         }
     }
 
-    private Msa runMAFFT( final List<Sequence> seqs, final List<String> opts ) throws IOException, InterruptedException {
+    private Msa runMAFFT( final List<MolecularSequence> seqs, final List<String> opts ) throws IOException,
+            InterruptedException {
         Msa msa = null;
         final MsaInferrer mafft = Mafft.createInstance( _mf.getInferenceManager().getPathToLocalMafft()
                 .getCanonicalPath() );
index 348bec4..0b863e0 100644 (file)
@@ -42,7 +42,7 @@ import org.forester.msa.BasicMsa;
 import org.forester.msa.Msa;
 import org.forester.msa.MsaFormatException;
 import org.forester.sequence.BasicSequence;
-import org.forester.sequence.Sequence;
+import org.forester.sequence.MolecularSequence;
 
 public class FastaParser {
 
@@ -109,11 +109,11 @@ public class FastaParser {
         return parseMsa( new ByteArrayInputStream( bytes ) );
     }
 
-    static public List<Sequence> parse( final File f ) throws IOException {
+    static public List<MolecularSequence> parse( final File f ) throws IOException {
         return parse( new FileInputStream( f ) );
     }
 
-    static public List<Sequence> parse( final InputStream is ) throws IOException {
+    static public List<MolecularSequence> parse( final InputStream is ) throws IOException {
         final BufferedReader reader = new BufferedReader( new InputStreamReader( is, "UTF-8" ) );
         String line = null;
         int line_counter = 0;
@@ -151,7 +151,7 @@ public class FastaParser {
         }
         addSeq( name, current_seq, temp_msa );
         reader.close();
-        final List<Sequence> seqs = new ArrayList<Sequence>();
+        final List<MolecularSequence> seqs = new ArrayList<MolecularSequence>();
         for( int i = 0; i < temp_msa.size(); ++i ) {
             seqs.add( BasicSequence.createAaSequence( temp_msa.get( i )[ 0 ].toString(),
                                                       temp_msa.get( i )[ 1 ].toString() ) );
index 507ae1f..4b6ddf5 100644 (file)
@@ -41,7 +41,7 @@ import org.forester.msa.BasicMsa;
 import org.forester.msa.Msa;
 import org.forester.msa.MsaFormatException;
 import org.forester.sequence.BasicSequence;
-import org.forester.sequence.Sequence;
+import org.forester.sequence.MolecularSequence;
 
 public final class GeneralMsaParser {
 
@@ -168,7 +168,7 @@ public final class GeneralMsaParser {
                 }
             }
         } // while ( ( line = reader.readLine() ) != null )
-        final List<Sequence> seqs = new ArrayList<Sequence>();
+        final List<MolecularSequence> seqs = new ArrayList<MolecularSequence>();
         for( int i = 0; i < names_in_order.size(); ++i ) {
             seqs.add( BasicSequence.createAaSequence( names_in_order.get( i ), temp_msa.get( names_in_order.get( i ) )
                     .toString() ) );
index 250cfb4..23cb0ea 100644 (file)
@@ -39,6 +39,7 @@ public final class NexusConstants {
     public final static String END              = "End;";
     public final static String MATRIX           = "Matrix";
     public final static String BEGIN_CHARACTERS = "Begin Characters;";
+    public final static String BEGIN_DATA       = "Begin Data;";
     public final static String FORMAT           = "Format";
     public final static String DATATYPE         = "DataType";
     public final static String STANDARD         = "Standard";
index 86ffaef..4617a8f 100644 (file)
@@ -45,42 +45,55 @@ import org.forester.io.parsers.util.ParserUtils;
 import org.forester.io.parsers.util.PhylogenyParserException;
 import org.forester.phylogeny.Phylogeny;
 import org.forester.phylogeny.PhylogenyNode;
+import org.forester.phylogeny.data.Sequence;
 import org.forester.phylogeny.iterators.PhylogenyNodeIterator;
+import org.forester.sequence.BasicSequence;
+import org.forester.sequence.MolecularSequence;
 import org.forester.util.ForesterUtil;
 
 public final class NexusPhylogeniesParser implements IteratingPhylogenyParser, PhylogenyParser {
 
-    final private static String  begin_trees               = NexusConstants.BEGIN_TREES.toLowerCase();
-    final private static String  end                       = NexusConstants.END.toLowerCase();
-    final private static String  endblock                  = "endblock";
-    final private static Pattern ROOTEDNESS_PATTERN        = Pattern.compile( ".+=\\s*\\[&([R|U])\\].*" );
-    final private static String  taxlabels                 = NexusConstants.TAXLABELS.toLowerCase();
-    final private static Pattern TITLE_PATTERN             = Pattern.compile( "TITLE.?\\s+([^;]+)",
-                                                                              Pattern.CASE_INSENSITIVE );
-    final private static String  translate                 = NexusConstants.TRANSLATE.toLowerCase();
-    final private static String  tree                      = NexusConstants.TREE.toLowerCase();
-    final private static Pattern TREE_NAME_PATTERN         = Pattern.compile( "\\s*.?Tree\\s+(.+?)\\s*=.+",
-                                                                              Pattern.CASE_INSENSITIVE );
-    final private static Pattern TRANSLATE_PATTERN         = Pattern.compile( "([0-9A-Za-z]+)\\s+(.+)" );
-    final private static String  utree                     = NexusConstants.UTREE.toLowerCase();
-    private BufferedReader       _br;
-    private boolean              _ignore_quotes_in_nh_data = Constants.NH_PARSING_IGNORE_QUOTES_DEFAULT;
-    private boolean              _in_taxalabels;
-    private boolean              _in_translate;
-    private boolean              _in_tree;
-    private boolean              _in_trees_block;
-    private boolean              _is_rooted;
-    private String               _name;
-    private Phylogeny            _next;
-    private Object               _nexus_source;
-    private StringBuilder        _nh;
-    private boolean              _replace_underscores      = NHXParser.REPLACE_UNDERSCORES_DEFAULT;
-    private boolean              _rooted_info_present;
-    private List<String>         _taxlabels;
-    private TAXONOMY_EXTRACTION  _taxonomy_extraction      = TAXONOMY_EXTRACTION.NO;
-    private String               _title;
-    private Map<String, String>  _translate_map;
-    private StringBuilder        _translate_sb;
+    final private static String            begin_trees               = NexusConstants.BEGIN_TREES.toLowerCase();
+    final private static String            end                       = NexusConstants.END.toLowerCase();
+    final private static String            endblock                  = "endblock";
+    final private static Pattern           ROOTEDNESS_PATTERN        = Pattern.compile( ".+=\\s*\\[&([R|U])\\].*" );
+    final private static String            taxlabels                 = NexusConstants.TAXLABELS.toLowerCase();
+    final private static Pattern           TITLE_PATTERN             = Pattern.compile( "TITLE.?\\s+([^;]+)",
+                                                                                        Pattern.CASE_INSENSITIVE );
+    final private static String            translate                 = NexusConstants.TRANSLATE.toLowerCase();
+    final private static String            data                      = NexusConstants.BEGIN_CHARACTERS.toLowerCase();
+    final private static String            characters                = NexusConstants.BEGIN_DATA.toLowerCase();
+    final private static String            tree                      = NexusConstants.TREE.toLowerCase();
+    final private static Pattern           TREE_NAME_PATTERN         = Pattern.compile( "\\s*.?Tree\\s+(.+?)\\s*=.+",
+                                                                                        Pattern.CASE_INSENSITIVE );
+    final private static Pattern           TRANSLATE_PATTERN         = Pattern.compile( "([0-9A-Za-z]+)\\s+(.+)" );
+    final private static Pattern           ALN_PATTERN               = Pattern.compile( "(.+)\\s+([A-Za-z-_\\*\\?]+)" );
+    final private static Pattern           DATATYPE_PATTERN          = Pattern.compile( "datatype\\s?.\\s?([a-z]+)" );
+    final private static Pattern           LINK_TAXA_PATTERN         = Pattern.compile( "link\\s+taxa\\s?.\\s?([^;]+)",
+                                                                                        Pattern.CASE_INSENSITIVE );
+    final private static String            utree                     = NexusConstants.UTREE.toLowerCase();
+    private BufferedReader                 _br;
+    private boolean                        _ignore_quotes_in_nh_data = Constants.NH_PARSING_IGNORE_QUOTES_DEFAULT;
+    private boolean                        _in_taxalabels;
+    private boolean                        _in_translate;
+    private boolean                        _in_tree;
+    private boolean                        _in_trees_block;
+    private boolean                        _in_data_block;
+    private boolean                        _is_rooted;
+    private String                         _datatype;
+    private String                         _name;
+    private Phylogeny                      _next;
+    private Object                         _nexus_source;
+    private StringBuilder                  _nh;
+    private boolean                        _replace_underscores      = NHXParser.REPLACE_UNDERSCORES_DEFAULT;
+    private boolean                        _rooted_info_present;
+    private List<String>                   _taxlabels;
+    private TAXONOMY_EXTRACTION            _taxonomy_extraction      = TAXONOMY_EXTRACTION.NO;
+    private String                         _title;
+    private Map<String, String>            _translate_map;
+    private StringBuilder                  _translate_sb;
+    private Map<String, MolecularSequence> _seqs;
+    private final boolean                  _add_sequences            = true;
 
     @Override
     public String getName() {
@@ -128,6 +141,7 @@ public final class NexusPhylogeniesParser implements IteratingPhylogenyParser, P
         _in_tree = false;
         _rooted_info_present = false;
         _is_rooted = false;
+        _seqs = new HashMap<String, MolecularSequence>();
         _br = ParserUtils.createReader( _nexus_source );
         getNext();
     }
@@ -214,6 +228,15 @@ public final class NexusPhylogeniesParser implements IteratingPhylogenyParser, P
                         node.setName( node.getName().replace( '_', ' ' ).trim() );
                     }
                 }
+                if ( _add_sequences ) {
+                    if ( _seqs.containsKey( node.getName() ) ) {
+                        final MolecularSequence s = _seqs.get( node.getName() );
+                        //TODO need to check for uniqueness when adding seqs....
+                        final Sequence ns = new Sequence( s );
+                        ns.setMolecularSequenceAligned( true ); //TODO need to check if all same length
+                        node.getNodeData().addSequence( ns );
+                    }
+                }
             }
         }
         _next = p;
@@ -232,17 +255,31 @@ public final class NexusPhylogeniesParser implements IteratingPhylogenyParser, P
                     _in_trees_block = true;
                     _in_taxalabels = false;
                     _in_translate = false;
+                    _in_data_block = false;
+                    _datatype = null;
                     _title = "";
                 }
                 else if ( line_lc.startsWith( taxlabels ) ) {
+                    //TODO need to be taxa block instead
                     _in_trees_block = false;
                     _in_taxalabels = true;
                     _in_translate = false;
+                    _in_data_block = false;
+                    _datatype = null;
                 }
                 else if ( line_lc.startsWith( translate ) ) {
                     _translate_sb = new StringBuilder();
                     _in_taxalabels = false;
                     _in_translate = true;
+                    _in_data_block = false;
+                    _datatype = null;
+                }
+                else if ( line_lc.startsWith( characters ) || line_lc.startsWith( data ) ) {
+                    _in_taxalabels = false;
+                    _in_trees_block = false;
+                    _in_translate = false;
+                    _in_data_block = true;
+                    _datatype = null;
                 }
                 else if ( _in_trees_block ) {
                     if ( line_lc.startsWith( "title" ) ) {
@@ -252,6 +289,11 @@ public final class NexusPhylogeniesParser implements IteratingPhylogenyParser, P
                         }
                     }
                     else if ( line_lc.startsWith( "link" ) ) {
+                        final Matcher link_m = LINK_TAXA_PATTERN.matcher( line );
+                        if ( link_m.lookingAt() ) {
+                            final String link = link_m.group( 1 );
+                            System.out.println( "link taxa:" + link );
+                        }
                     }
                     else if ( line_lc.startsWith( end ) || line_lc.startsWith( endblock ) ) {
                         _in_trees_block = false;
@@ -347,6 +389,53 @@ public final class NexusPhylogeniesParser implements IteratingPhylogenyParser, P
                         }
                     }
                 }
+                if ( _in_data_block ) {
+                    if ( line_lc.startsWith( end ) || line_lc.startsWith( endblock ) ) {
+                        _in_data_block = false;
+                        _datatype = null;
+                    }
+                    else if ( line_lc.startsWith( "link" ) ) {
+                        final Matcher link_m = LINK_TAXA_PATTERN.matcher( line );
+                        if ( link_m.lookingAt() ) {
+                            final String link = link_m.group( 1 );
+                            System.out.println( "link taxa:" + link );
+                        }
+                    }
+                    else {
+                        final Matcher datatype_matcher = DATATYPE_PATTERN.matcher( line_lc );
+                        if ( datatype_matcher.find() ) {
+                            _datatype = datatype_matcher.group( 1 );
+                            System.out.println( _datatype );
+                        }
+                        else {
+                            if ( ( _datatype != null )
+                                    && ( _datatype.equals( "protein" ) || _datatype.equals( "dna" ) || _datatype
+                                            .equals( "rna" ) ) ) {
+                                if ( line.endsWith( ";" ) ) {
+                                    _in_data_block = false;
+                                    line = line.substring( 0, line.length() - 1 );
+                                }
+                                final Matcher aln_matcher = ALN_PATTERN.matcher( line );
+                                if ( aln_matcher.matches() ) {
+                                    final String id = aln_matcher.group( 1 );
+                                    final String seq = aln_matcher.group( 2 );
+                                    MolecularSequence s = null;
+                                    if ( _datatype.equals( "protein" ) ) {
+                                        s = BasicSequence.createAaSequence( id, seq );
+                                    }
+                                    else if ( _datatype.equals( "dna" ) ) {
+                                        s = BasicSequence.createDnaSequence( id, seq );
+                                    }
+                                    else {
+                                        s = BasicSequence.createRnaSequence( id, seq );
+                                    }
+                                    _seqs.put( id, s );
+                                    System.out.println( s );
+                                }
+                            }
+                        }
+                    }
+                }
             }
         }
         if ( _nh.length() > 0 ) {
index 0829eb7..bd5a5ae 100644 (file)
@@ -6,7 +6,7 @@ import java.io.IOException;
 import java.io.Writer;
 import java.util.List;
 
-import org.forester.sequence.Sequence;
+import org.forester.sequence.MolecularSequence;
 import org.forester.util.ForesterUtil;
 
 public class SequenceWriter {
@@ -15,7 +15,7 @@ public class SequenceWriter {
         FASTA;
     }
 
-    public static StringBuilder toFasta( final Sequence seq, final int width ) {
+    public static StringBuilder toFasta( final MolecularSequence seq, final int width ) {
         return toFasta( seq.getIdentifier(), seq.getMolecularSequenceAsString(), width );
     }
 
@@ -44,7 +44,7 @@ public class SequenceWriter {
         return sb;
     }
 
-    public static void toFasta( final Sequence seq, final Writer w, final int width ) throws IOException {
+    public static void toFasta( final MolecularSequence seq, final Writer w, final int width ) throws IOException {
         w.write( ">" );
         w.write( seq.getIdentifier() );
         w.write( ForesterUtil.LINE_SEPARATOR );
@@ -67,20 +67,22 @@ public class SequenceWriter {
         }
     }
 
-    public static void writeSeqs( final List<Sequence> seqs, final File file, final SEQ_FORMAT format, final int width )
-            throws IOException {
+    public static void writeSeqs( final List<MolecularSequence> seqs,
+                                  final File file,
+                                  final SEQ_FORMAT format,
+                                  final int width ) throws IOException {
         final Writer w = ForesterUtil.createBufferedWriter( file );
         SequenceWriter.writeSeqs( seqs, w, format, width );
         w.close();
     }
 
-    public static void writeSeqs( final List<Sequence> seqs,
+    public static void writeSeqs( final List<MolecularSequence> seqs,
                                   final Writer writer,
                                   final SEQ_FORMAT format,
                                   final int width ) throws IOException {
         switch ( format ) {
             case FASTA:
-                for( final Sequence s : seqs ) {
+                for( final MolecularSequence s : seqs ) {
                     toFasta( s, writer, width );
                     writer.write( ForesterUtil.LINE_SEPARATOR );
                 }
index dc383c9..25f0db7 100644 (file)
@@ -36,8 +36,8 @@ import java.util.Set;
 import org.forester.io.writers.SequenceWriter;
 import org.forester.io.writers.SequenceWriter.SEQ_FORMAT;
 import org.forester.sequence.BasicSequence;
-import org.forester.sequence.Sequence;
-import org.forester.sequence.Sequence.TYPE;
+import org.forester.sequence.MolecularSequence;
+import org.forester.sequence.MolecularSequence.TYPE;
 import org.forester.util.ForesterUtil;
 
 public class BasicMsa implements Msa {
@@ -65,8 +65,8 @@ public class BasicMsa implements Msa {
     }
 
     @Override
-    public List<Sequence> asSequenceList() {
-        final List<Sequence> seqs = new ArrayList<Sequence>();
+    public List<MolecularSequence> asSequenceList() {
+        final List<MolecularSequence> seqs = new ArrayList<MolecularSequence>();
         for( int i = 0; i < getNumberOfSequences(); ++i ) {
             seqs.add( getSequence( i ) );
         }
@@ -103,12 +103,12 @@ public class BasicMsa implements Msa {
     }
 
     @Override
-    public Sequence getSequence( final int row ) {
+    public MolecularSequence getSequence( final int row ) {
         return new BasicSequence( getIdentifier( row ), _data[ row ], getType() );
     }
 
     @Override
-    public Sequence getSequence( final String id ) {
+    public MolecularSequence getSequence( final String id ) {
         for( int i = 0; i < getNumberOfSequences(); ++i ) {
             if ( getIdentifier( i ).equals( id ) ) {
                 return getSequence( i );
@@ -133,7 +133,7 @@ public class BasicMsa implements Msa {
 
     @Override
     public boolean isGapAt( final int row, final int col ) {
-        return getResidueAt( row, col ) == Sequence.GAP;
+        return getResidueAt( row, col ) == MolecularSequence.GAP;
     }
 
     @Override
@@ -211,7 +211,7 @@ public class BasicMsa implements Msa {
         w.write( "   Matrix" );
         w.write( ForesterUtil.LINE_SEPARATOR );
         for( int row = 0; row < getNumberOfSequences(); ++row ) {
-            final Sequence seq = getSequence( row );
+            final MolecularSequence seq = getSequence( row );
             final String s = seq.getMolecularSequenceAsString();
             w.write( "      " );
             w.write( ForesterUtil.pad( getIdentifier( row ).replace( ' ', '_' ), max, ' ', false ).toString() );
@@ -238,14 +238,14 @@ public class BasicMsa implements Msa {
         }
     }
 
-    public static Msa createInstance( final List<Sequence> seqs ) {
+    public static Msa createInstance( final List<MolecularSequence> seqs ) {
         if ( seqs.size() < 1 ) {
             throw new IllegalArgumentException( "cannot create msa from less than one sequence" );
         }
         final int length = seqs.get( 0 ).getLength();
         final BasicMsa msa = new BasicMsa( seqs.size(), length, seqs.get( 0 ).getType() );
         for( int row = 0; row < seqs.size(); ++row ) {
-            final Sequence seq = seqs.get( row );
+            final MolecularSequence seq = seqs.get( row );
             if ( seq.getLength() != length ) {
                 throw new IllegalArgumentException( "illegal attempt to build msa from sequences of unequal length ["
                         + seq.getIdentifier() + "]" );
index 417e889..de57797 100644 (file)
@@ -35,7 +35,7 @@ import java.util.List;
 import org.forester.io.parsers.FastaParser;
 import org.forester.io.writers.SequenceWriter;
 import org.forester.io.writers.SequenceWriter.SEQ_FORMAT;
-import org.forester.sequence.Sequence;
+import org.forester.sequence.MolecularSequence;
 import org.forester.util.SystemCommandExecutor;
 
 public final class ClustalOmega extends MsaInferrer {
@@ -72,7 +72,8 @@ public final class ClustalOmega extends MsaInferrer {
     }
 
     @Override
-    public Msa infer( final List<Sequence> seqs, final List<String> opts ) throws IOException, InterruptedException {
+    public Msa infer( final List<MolecularSequence> seqs, final List<String> opts ) throws IOException,
+            InterruptedException {
         final File file = File.createTempFile( "__clustalo_input_", ".fasta" );
         file.deleteOnExit();
         final BufferedWriter writer = new BufferedWriter( new FileWriter( file ) );
index fee7bd3..5126e47 100644 (file)
@@ -27,7 +27,7 @@ package org.forester.msa;
 import java.util.List;
 
 import org.forester.sequence.BasicSequence;
-import org.forester.sequence.Sequence;
+import org.forester.sequence.MolecularSequence;
 
 public final class DeleteableMsa extends BasicMsa {
 
@@ -102,7 +102,7 @@ public final class DeleteableMsa extends BasicMsa {
         }
     }
 
-    final public Sequence deleteRow( final String id, final boolean return_removed_seq ) {
+    final public MolecularSequence deleteRow( final String id, final boolean return_removed_seq ) {
         int row = -1;
         for( int r = 0; r < getNumberOfSequences(); ++r ) {
             if ( getIdentifier( r ).equals( id ) ) {
@@ -113,14 +113,14 @@ public final class DeleteableMsa extends BasicMsa {
         if ( row < 0 ) {
             throw new IllegalArgumentException( "id [" + id + "] not found" );
         }
-        Sequence s = null;
+        MolecularSequence s = null;
         StringBuilder sb = null;
         if ( return_removed_seq ) {
             s = getSequence( row );
             final char[] x = s.getMolecularSequence();
             sb = new StringBuilder( x.length );
             for( final char element : x ) {
-                if ( element != Sequence.GAP ) {
+                if ( element != MolecularSequence.GAP ) {
                     sb.append( element );
                 }
             }
@@ -158,7 +158,7 @@ public final class DeleteableMsa extends BasicMsa {
     }
 
     @Override
-    public Sequence getSequence( final int row ) {
+    public MolecularSequence getSequence( final int row ) {
         checkRow( row );
         return new BasicSequence( getIdentifier( row ), getSequenceAsString( row ).toString(), getType() );
     }
@@ -166,7 +166,7 @@ public final class DeleteableMsa extends BasicMsa {
     final public boolean isAllGap( final int col ) {
         final int m_col = _mapped_col_positions[ col ];
         for( int j = 0; j < getNumberOfSequences(); ++j ) {
-            if ( super.getResidueAt( _mapped_row_positions[ j ], m_col ) != Sequence.GAP ) {
+            if ( super.getResidueAt( _mapped_row_positions[ j ], m_col ) != MolecularSequence.GAP ) {
                 return false;
             }
         }
@@ -214,7 +214,7 @@ public final class DeleteableMsa extends BasicMsa {
         --_seqs;
     }
 
-    public final static DeleteableMsa createInstance( final List<Sequence> seqs ) {
+    public final static DeleteableMsa createInstance( final List<MolecularSequence> seqs ) {
         return new DeleteableMsa( ( BasicMsa ) BasicMsa.createInstance( seqs ) );
     }
 
index ae9fdf1..b1daad5 100644 (file)
@@ -35,7 +35,7 @@ import java.util.List;
 import org.forester.io.parsers.FastaParser;
 import org.forester.io.writers.SequenceWriter;
 import org.forester.io.writers.SequenceWriter.SEQ_FORMAT;
-import org.forester.sequence.Sequence;
+import org.forester.sequence.MolecularSequence;
 import org.forester.util.SystemCommandExecutor;
 
 public final class Mafft extends MsaInferrer {
@@ -72,7 +72,8 @@ public final class Mafft extends MsaInferrer {
     }
 
     @Override
-    public Msa infer( final List<Sequence> seqs, final List<String> opts ) throws IOException, InterruptedException {
+    public Msa infer( final List<MolecularSequence> seqs, final List<String> opts ) throws IOException,
+            InterruptedException {
         final File file = File.createTempFile( "__mafft_input_", ".fasta" );
         file.deleteOnExit();
         final BufferedWriter writer = new BufferedWriter( new FileWriter( file ) );
index ed73cd4..066b10f 100644 (file)
@@ -29,8 +29,8 @@ import java.io.IOException;
 import java.io.Writer;
 import java.util.List;
 
-import org.forester.sequence.Sequence;
-import org.forester.sequence.Sequence.TYPE;
+import org.forester.sequence.MolecularSequence;
+import org.forester.sequence.MolecularSequence.TYPE;
 
 public interface Msa {
 
@@ -52,11 +52,11 @@ public interface Msa {
 
     public List<Character> getColumnAt( int col );
 
-    public Sequence getSequence( final String id );
+    public MolecularSequence getSequence( final String id );
 
-    public Sequence getSequence( final int row );
+    public MolecularSequence getSequence( final int row );
 
-    public List<Sequence> asSequenceList();
+    public List<MolecularSequence> asSequenceList();
 
     public StringBuffer getSequenceAsString( int row );
 
index 55aa7f7..f26b329 100644 (file)
@@ -29,7 +29,7 @@ import java.io.File;
 import java.io.IOException;
 import java.util.List;
 
-import org.forester.sequence.Sequence;
+import org.forester.sequence.MolecularSequence;
 import org.forester.util.SystemCommandExecutor;
 
 public abstract class MsaInferrer {
@@ -49,6 +49,6 @@ public abstract class MsaInferrer {
 
     public abstract Msa infer( File path_to_input_seqs, List<String> opts ) throws IOException, InterruptedException;
 
-    public abstract Msa infer( final List<Sequence> seqs, final List<String> opts ) throws IOException,
+    public abstract Msa infer( final List<MolecularSequence> seqs, final List<String> opts ) throws IOException,
             InterruptedException;
 }
index c94869e..edca9b7 100644 (file)
@@ -34,7 +34,7 @@ import java.util.SortedMap;
 import java.util.TreeMap;
 
 import org.forester.sequence.BasicSequence;
-import org.forester.sequence.Sequence;
+import org.forester.sequence.MolecularSequence;
 import org.forester.util.BasicDescriptiveStatistics;
 import org.forester.util.DescriptiveStatistics;
 
@@ -67,7 +67,7 @@ public final class MsaMethods {
                 ++new_length;
             }
         }
-        final List<Sequence> seqs = new ArrayList<Sequence>( msa.getNumberOfSequences() );
+        final List<MolecularSequence> seqs = new ArrayList<MolecularSequence>( msa.getNumberOfSequences() );
         for( int row = 0; row < msa.getNumberOfSequences(); ++row ) {
             final char[] mol_seq = new char[ new_length ];
             int new_col = 0;
@@ -76,7 +76,7 @@ public final class MsaMethods {
                 if ( !delete_cols[ col ] ) {
                     final char residue = msa.getResidueAt( row, col );
                     mol_seq[ new_col++ ] = ( residue );
-                    if ( residue != Sequence.GAP ) {
+                    if ( residue != MolecularSequence.GAP ) {
                         ++non_gap_cols_sum;
                     }
                 }
@@ -119,7 +119,7 @@ public final class MsaMethods {
         int gaps = 0;
         for( int seq = 0; seq < msa.getNumberOfSequences(); ++seq ) {
             for( int i = 0; i < msa.getLength(); ++i ) {
-                if ( msa.getResidueAt( seq, i ) == Sequence.GAP ) {
+                if ( msa.getResidueAt( seq, i ) == MolecularSequence.GAP ) {
                     gaps++;
                 }
             }
@@ -191,7 +191,7 @@ public final class MsaMethods {
     final public static DescriptiveStatistics calculateEffectiveLengthStatistics( final Msa msa ) {
         final DescriptiveStatistics stats = new BasicDescriptiveStatistics();
         for( int row = 0; row < msa.getNumberOfSequences(); ++row ) {
-            final Sequence s = msa.getSequence( row );
+            final MolecularSequence s = msa.getSequence( row );
             stats.addValue( s.getLength() - s.getNumberOfGapResidues() );
         }
         return stats;
@@ -221,7 +221,7 @@ public final class MsaMethods {
     public static SortedMap<Character, Integer> calculateResidueDestributionPerColumn( final Msa msa, final int column ) {
         final SortedMap<Character, Integer> map = new TreeMap<Character, Integer>();
         for( final Character r : msa.getColumnAt( column ) ) {
-            if ( r != Sequence.GAP ) {
+            if ( r != MolecularSequence.GAP ) {
                 if ( !map.containsKey( r ) ) {
                     map.put( r, 1 );
                 }
@@ -238,7 +238,7 @@ public final class MsaMethods {
     }
 
     final public static Msa removeSequence( final Msa msa, final String to_remove_id ) {
-        final List<Sequence> seqs = new ArrayList<Sequence>();
+        final List<MolecularSequence> seqs = new ArrayList<MolecularSequence>();
         for( int row = 0; row < msa.getNumberOfSequences(); ++row ) {
             if ( !to_remove_id.equals( msa.getIdentifier( row ) ) ) {
                 seqs.add( msa.getSequence( row ) );
@@ -251,7 +251,7 @@ public final class MsaMethods {
     }
 
     final public static Msa removeSequences( final Msa msa, final List<String> to_remove_ids ) {
-        final List<Sequence> seqs = new ArrayList<Sequence>();
+        final List<MolecularSequence> seqs = new ArrayList<MolecularSequence>();
         for( int row = 0; row < msa.getNumberOfSequences(); ++row ) {
             if ( !to_remove_ids.contains( msa.getIdentifier( row ) ) ) {
                 seqs.add( msa.getSequence( row ) );
@@ -268,7 +268,7 @@ public final class MsaMethods {
         for( int seq = 0; seq < msa.getNumberOfSequences(); ++seq ) {
             int eff_length = 0;
             for( int i = 0; i < msa.getLength(); ++i ) {
-                if ( msa.getResidueAt( seq, i ) != Sequence.GAP ) {
+                if ( msa.getResidueAt( seq, i ) != MolecularSequence.GAP ) {
                     eff_length++;
                 }
             }
@@ -280,7 +280,7 @@ public final class MsaMethods {
     }
 
     final public static Msa removeSequencesByRow( final Msa msa, final List<Integer> to_remove_rows ) {
-        final List<Sequence> seqs = new ArrayList<Sequence>();
+        final List<MolecularSequence> seqs = new ArrayList<MolecularSequence>();
         for( int row = 0; row < msa.getNumberOfSequences(); ++row ) {
             if ( !to_remove_rows.contains( row ) ) {
                 seqs.add( msa.getSequence( row ) );
@@ -296,7 +296,7 @@ public final class MsaMethods {
         final HashMap<Character, Integer> counts = new HashMap<Character, Integer>();
         for( int row = 0; row < msa.getNumberOfSequences(); ++row ) {
             final char c = msa.getResidueAt( row, col );
-            if ( c != Sequence.GAP ) {
+            if ( c != MolecularSequence.GAP ) {
                 if ( !counts.containsKey( c ) ) {
                     counts.put( c, 1 );
                 }
index 4427ea7..1ab7ac7 100644 (file)
@@ -26,7 +26,7 @@
 package org.forester.msa;
 
 import org.forester.sequence.BasicSequence;
-import org.forester.sequence.Sequence;
+import org.forester.sequence.MolecularSequence;
 
 public final class ResampleableMsa extends BasicMsa {
 
@@ -58,7 +58,7 @@ public final class ResampleableMsa extends BasicMsa {
     }
 
     @Override
-    public Sequence getSequence( final int row ) {
+    public MolecularSequence getSequence( final int row ) {
         return new BasicSequence( getIdentifier( row ), getSequenceAsString( row ).toString(), getType() );
     }
 }
index 8f0ea1b..cea2460 100644 (file)
@@ -64,7 +64,7 @@ import org.forester.phylogeny.data.NodeVisualData;
 import org.forester.phylogeny.data.NodeVisualData.NodeFill;
 import org.forester.phylogeny.data.NodeVisualData.NodeShape;
 import org.forester.phylogeny.iterators.PhylogenyNodeIterator;
-import org.forester.sequence.Sequence;
+import org.forester.sequence.MolecularSequence;
 import org.forester.tools.ConfidenceAssessor;
 import org.forester.util.BasicDescriptiveStatistics;
 import org.forester.util.DescriptiveStatistics;
@@ -72,28 +72,28 @@ import org.forester.util.ForesterUtil;
 
 public class MsaCompactor {
 
-    final private static NumberFormat NF_1                       = new DecimalFormat( "0.#" );
-    final private static NumberFormat NF_3                       = new DecimalFormat( "0.###" );
-    final private static NumberFormat NF_4                       = new DecimalFormat( "0.####" );
-    private boolean                   _calculate_shannon_entropy = false;
+    final private static NumberFormat          NF_1                       = new DecimalFormat( "0.#" );
+    final private static NumberFormat          NF_3                       = new DecimalFormat( "0.###" );
+    final private static NumberFormat          NF_4                       = new DecimalFormat( "0.####" );
+    private boolean                            _calculate_shannon_entropy = false;
     //
-    private String                    _infile_name               = null;
-    private final short               _longest_id_length;
+    private String                             _infile_name               = null;
+    private final short                        _longest_id_length;
     //
-    private String                    _maffts_opts               = "--auto";
-    private DeleteableMsa             _msa                       = null;
-    private boolean                   _norm                      = true;
-    private File                      _out_file_base             = null;
-    private MSA_FORMAT                _output_format             = MSA_FORMAT.FASTA;
-    private String                    _path_to_mafft             = null;
-    private boolean                   _phylogentic_inference     = false;
+    private String                             _maffts_opts               = "--auto";
+    private DeleteableMsa                      _msa                       = null;
+    private boolean                            _norm                      = true;
+    private File                               _out_file_base             = null;
+    private MSA_FORMAT                         _output_format             = MSA_FORMAT.FASTA;
+    private String                             _path_to_mafft             = null;
+    private boolean                            _phylogentic_inference     = false;
     //
-    private boolean                   _realign                   = false;
-    private final SortedSet<String>   _removed_seq_ids;
-    private final ArrayList<Sequence> _removed_seqs;
-    private File                      _removed_seqs_out_base     = null;
-    private int                       _step                      = -1;
-    private int                       _step_for_diagnostics      = -1;
+    private boolean                            _realign                   = false;
+    private final SortedSet<String>            _removed_seq_ids;
+    private final ArrayList<MolecularSequence> _removed_seqs;
+    private File                               _removed_seqs_out_base     = null;
+    private int                                _step                      = -1;
+    private int                                _step_for_diagnostics      = -1;
     static {
         NF_1.setRoundingMode( RoundingMode.HALF_UP );
         NF_4.setRoundingMode( RoundingMode.HALF_UP );
@@ -104,7 +104,7 @@ public class MsaCompactor {
         _msa = msa;
         _removed_seq_ids = new TreeSet<String>();
         _longest_id_length = _msa.determineMaxIdLength();
-        _removed_seqs = new ArrayList<Sequence>();
+        _removed_seqs = new ArrayList<MolecularSequence>();
     }
 
     public final Phylogeny calcTree() {
@@ -304,7 +304,7 @@ public class MsaCompactor {
         while ( MsaMethods.calcGapRatio( _msa ) > mean_gapiness ) {
             final String id = to_remove_ids.get( i );
             _removed_seq_ids.add( id );
-            final Sequence deleted = _msa.deleteRow( id, true );
+            final MolecularSequence deleted = _msa.deleteRow( id, true );
             _removed_seqs.add( deleted );
             removeGapColumns();
             if ( isPrintMsaStatsWriteOutfileAndRealign( i ) || ( MsaMethods.calcGapRatio( _msa ) <= mean_gapiness ) ) {
@@ -354,7 +354,7 @@ public class MsaCompactor {
         while ( _msa.getLength() > length ) {
             final String id = to_remove_ids.get( i );
             _removed_seq_ids.add( id );
-            final Sequence deleted = _msa.deleteRow( id, true );
+            final MolecularSequence deleted = _msa.deleteRow( id, true );
             _removed_seqs.add( deleted );
             removeGapColumns();
             if ( isPrintMsaStatsWriteOutfileAndRealign( i ) || ( _msa.getLength() <= length ) ) {
@@ -404,7 +404,7 @@ public class MsaCompactor {
         for( int i = 0; i < to_remove_ids.size(); ++i ) {
             final String id = to_remove_ids.get( i );
             _removed_seq_ids.add( id );
-            final Sequence deleted = _msa.deleteRow( id, true );
+            final MolecularSequence deleted = _msa.deleteRow( id, true );
             _removed_seqs.add( deleted );
             removeGapColumns();
             if ( isPrintMsaStatsWriteOutfileAndRealign( i ) || ( i == ( to_remove_ids.size() - 1 ) ) ) {
@@ -510,7 +510,7 @@ public class MsaCompactor {
         return s;
     }
 
-    final int calcNonGapResidues( final Sequence seq ) {
+    final int calcNonGapResidues( final MolecularSequence seq ) {
         int ng = 0;
         for( int i = 0; i < seq.getLength(); ++i ) {
             if ( !seq.isGapAt( i ) ) {
index 9426a96..a5302b7 100644 (file)
@@ -1814,7 +1814,7 @@ public class PhylogenyMethods {
 \r
     public static void addMolecularSeqsToTree( final Phylogeny phy, final Msa msa ) {\r
         for( int s = 0; s < msa.getNumberOfSequences(); ++s ) {\r
-            final org.forester.sequence.Sequence seq = msa.getSequence( s );\r
+            final org.forester.sequence.MolecularSequence seq = msa.getSequence( s );\r
             final PhylogenyNode node = phy.getNode( seq.getIdentifier() );\r
             final org.forester.phylogeny.data.Sequence new_seq = new Sequence();\r
             new_seq.setMolecularSequenceAligned( true );\r
index 525bc1b..bedb112 100644 (file)
@@ -37,6 +37,8 @@ import org.forester.io.parsers.phyloxml.PhyloXmlDataFormatException;
 import org.forester.io.parsers.phyloxml.PhyloXmlMapping;
 import org.forester.io.parsers.phyloxml.PhyloXmlUtil;
 import org.forester.io.writers.PhylogenyWriter;
+import org.forester.sequence.MolecularSequence;
+import org.forester.sequence.MolecularSequence.TYPE;
 import org.forester.util.ForesterUtil;
 
 public class Sequence implements PhylogenyData, MultipleUris, Comparable<Sequence> {
@@ -60,6 +62,31 @@ public class Sequence implements PhylogenyData, MultipleUris, Comparable<Sequenc
         init();
     }
 
+    public Sequence( final MolecularSequence mol_seq ) {
+        init();
+        setMolecularSequence( mol_seq.getMolecularSequenceAsString() );
+        setName( mol_seq.getIdentifier() );
+        String type;
+        if ( mol_seq.getType() == TYPE.AA ) {
+            type = "protein";
+        }
+        else if ( mol_seq.getType() == TYPE.DNA ) {
+            type = "dna";
+        }
+        else if ( mol_seq.getType() == TYPE.RNA ) {
+            type = "rna";
+        }
+        else {
+            throw new IllegalArgumentException( "unknown sequence type " + mol_seq.getType() );
+        }
+        try {
+            setType( type );
+        }
+        catch ( final PhyloXmlDataFormatException e ) {
+            throw new IllegalArgumentException( "don't know how to handle type " + mol_seq.getType() );
+        }
+    }
+
     public void addAnnotation( final Annotation annotation ) {
         getAnnotations().add( annotation );
     }
index dccc191..f7a8917 100644 (file)
@@ -28,7 +28,7 @@ package org.forester.sequence;
 
 import org.forester.util.ForesterUtil;
 
-public class BasicSequence implements Sequence {
+public class BasicSequence implements MolecularSequence {
 
     private final char[] _mol_sequence;
     private String       _identifier;
@@ -114,7 +114,7 @@ public class BasicSequence implements Sequence {
         if ( obj.getClass() != getClass() ) {
             return false;
         }
-        final Sequence other = ( Sequence ) obj;
+        final MolecularSequence other = ( MolecularSequence ) obj;
         if ( getMolecularSequenceAsString().equals( other.getMolecularSequenceAsString() ) ) {
             return true;
         }
@@ -135,7 +135,7 @@ public class BasicSequence implements Sequence {
         return sb.toString();
     }
 
-    public static Sequence copySequence( final Sequence seq ) {
+    public static MolecularSequence copySequence( final MolecularSequence seq ) {
         final char[] s = new char[ seq.getMolecularSequence().length ];
         for( int i = 0; i < seq.getMolecularSequence().length; i++ ) {
             s[ i ] = seq.getMolecularSequence()[ i ];
@@ -143,17 +143,17 @@ public class BasicSequence implements Sequence {
         return new BasicSequence( new String( seq.getIdentifier() ), s, seq.getType() );
     }
 
-    public static Sequence createAaSequence( final String identifier, final String mol_sequence ) {
+    public static MolecularSequence createAaSequence( final String identifier, final String mol_sequence ) {
         return new BasicSequence( identifier, mol_sequence.toUpperCase().replaceAll( "\\.", GAP_STR )
                 .replaceAll( AA_REGEXP, Character.toString( UNSPECIFIED_AA ) ), TYPE.AA );
     }
 
-    public static Sequence createDnaSequence( final String identifier, final String mol_sequence ) {
+    public static MolecularSequence createDnaSequence( final String identifier, final String mol_sequence ) {
         return new BasicSequence( identifier, mol_sequence.toUpperCase().replaceAll( "\\.", GAP_STR )
                 .replaceAll( DNA_REGEXP, Character.toString( UNSPECIFIED_NUC ) ), TYPE.DNA );
     }
 
-    public static Sequence createRnaSequence( final String identifier, final String mol_sequence ) {
+    public static MolecularSequence createRnaSequence( final String identifier, final String mol_sequence ) {
         return new BasicSequence( identifier, mol_sequence.toUpperCase().replaceAll( "\\.", GAP_STR )
                 .replaceAll( RNA_REGEXP, Character.toString( UNSPECIFIED_NUC ) ), TYPE.RNA );
     }
@@ -26,7 +26,7 @@
 
 package org.forester.sequence;
 
-public interface Sequence {
+public interface MolecularSequence {
 
     public static final char   UNSPECIFIED_AA  = 'X';
     public static final char   UNSPECIFIED_NUC = 'N';
index 22253df..ba18142 100644 (file)
@@ -106,7 +106,7 @@ import org.forester.sdi.SDI;
 import org.forester.sdi.SDIR;
 import org.forester.sdi.TestGSDI;
 import org.forester.sequence.BasicSequence;
-import org.forester.sequence.Sequence;
+import org.forester.sequence.MolecularSequence;
 import org.forester.species.BasicSpecies;
 import org.forester.species.Species;
 import org.forester.surfacing.TestSurfacing;
@@ -1552,7 +1552,7 @@ public final class Test {
 
     private static boolean testAminoAcidSequence() {
         try {
-            final Sequence aa1 = BasicSequence.createAaSequence( "aa1", "aAklm-?xX*z$#" );
+            final MolecularSequence aa1 = BasicSequence.createAaSequence( "aa1", "aAklm-?xX*z$#" );
             if ( aa1.getLength() != 13 ) {
                 return false;
             }
@@ -1565,15 +1565,15 @@ public final class Test {
             if ( !new String( aa1.getMolecularSequence() ).equals( "AAKLM-XXX*ZXX" ) ) {
                 return false;
             }
-            final Sequence aa2 = BasicSequence.createAaSequence( "aa3", "ARNDCQEGHILKMFPSTWYVX*-BZOJU" );
+            final MolecularSequence aa2 = BasicSequence.createAaSequence( "aa3", "ARNDCQEGHILKMFPSTWYVX*-BZOJU" );
             if ( !new String( aa2.getMolecularSequence() ).equals( "ARNDCQEGHILKMFPSTWYVX*-BZXXU" ) ) {
                 return false;
             }
-            final Sequence dna1 = BasicSequence.createDnaSequence( "dna1", "ACGTUX*-?RYMKWSN" );
+            final MolecularSequence dna1 = BasicSequence.createDnaSequence( "dna1", "ACGTUX*-?RYMKWSN" );
             if ( !new String( dna1.getMolecularSequence() ).equals( "ACGTNN*-NRYMKWSN" ) ) {
                 return false;
             }
-            final Sequence rna1 = BasicSequence.createRnaSequence( "rna1", "..ACGUTX*-?RYMKWSN" );
+            final MolecularSequence rna1 = BasicSequence.createRnaSequence( "rna1", "..ACGUTX*-?RYMKWSN" );
             if ( !new String( rna1.getMolecularSequence() ).equals( "--ACGUNN*-NRYMKWSN" ) ) {
                 return false;
             }
@@ -6134,11 +6134,11 @@ public final class Test {
 
     private static boolean testMsaQualityMethod() {
         try {
-            final Sequence s0 = BasicSequence.createAaSequence( "a", "ABAXEFGHIJJE-" );
-            final Sequence s1 = BasicSequence.createAaSequence( "b", "ABBXEFGHIJJBB" );
-            final Sequence s2 = BasicSequence.createAaSequence( "c", "AXCXEFGHIJJ--" );
-            final Sequence s3 = BasicSequence.createAaSequence( "d", "AXDDEFGHIJ---" );
-            final List<Sequence> l = new ArrayList<Sequence>();
+            final MolecularSequence s0 = BasicSequence.createAaSequence( "a", "ABAXEFGHIJJE-" );
+            final MolecularSequence s1 = BasicSequence.createAaSequence( "b", "ABBXEFGHIJJBB" );
+            final MolecularSequence s2 = BasicSequence.createAaSequence( "c", "AXCXEFGHIJJ--" );
+            final MolecularSequence s3 = BasicSequence.createAaSequence( "d", "AXDDEFGHIJ---" );
+            final List<MolecularSequence> l = new ArrayList<MolecularSequence>();
             l.add( s0 );
             l.add( s1 );
             l.add( s2 );
@@ -6175,11 +6175,11 @@ public final class Test {
 
     private static boolean testMsaEntropy() {
         try {
-            final Sequence s0 = BasicSequence.createAaSequence( "a", "AAAAAAA" );
-            final Sequence s1 = BasicSequence.createAaSequence( "b", "AAAIACC" );
-            final Sequence s2 = BasicSequence.createAaSequence( "c", "AAIIIIF" );
-            final Sequence s3 = BasicSequence.createAaSequence( "d", "AIIIVVW" );
-            final List<Sequence> l = new ArrayList<Sequence>();
+            final MolecularSequence s0 = BasicSequence.createAaSequence( "a", "AAAAAAA" );
+            final MolecularSequence s1 = BasicSequence.createAaSequence( "b", "AAAIACC" );
+            final MolecularSequence s2 = BasicSequence.createAaSequence( "c", "AAIIIIF" );
+            final MolecularSequence s3 = BasicSequence.createAaSequence( "d", "AIIIVVW" );
+            final List<MolecularSequence> l = new ArrayList<MolecularSequence>();
             l.add( s0 );
             l.add( s1 );
             l.add( s2 );
@@ -6200,7 +6200,7 @@ public final class Test {
             System.out.println( MsaMethods.calcNormalizedShannonsEntropy( 6, msa, 4 ) );
             System.out.println( MsaMethods.calcNormalizedShannonsEntropy( 6, msa, 5 ) );
             System.out.println( MsaMethods.calcNormalizedShannonsEntropy( 6, msa, 6 ) );
-            final List<Sequence> l2 = new ArrayList<Sequence>();
+            final List<MolecularSequence> l2 = new ArrayList<MolecularSequence>();
             l2.add( BasicSequence.createAaSequence( "1", "AAAAAAA" ) );
             l2.add( BasicSequence.createAaSequence( "2", "AAAIACC" ) );
             l2.add( BasicSequence.createAaSequence( "3", "AAIIIIF" ) );
@@ -6238,13 +6238,13 @@ public final class Test {
 
     private static boolean testDeleteableMsa() {
         try {
-            final Sequence s0 = BasicSequence.createAaSequence( "a", "AAAA" );
-            final Sequence s1 = BasicSequence.createAaSequence( "b", "BAAA" );
-            final Sequence s2 = BasicSequence.createAaSequence( "c", "CAAA" );
-            final Sequence s3 = BasicSequence.createAaSequence( "d", "DAAA" );
-            final Sequence s4 = BasicSequence.createAaSequence( "e", "EAAA" );
-            final Sequence s5 = BasicSequence.createAaSequence( "f", "FAAA" );
-            final List<Sequence> l0 = new ArrayList<Sequence>();
+            final MolecularSequence s0 = BasicSequence.createAaSequence( "a", "AAAA" );
+            final MolecularSequence s1 = BasicSequence.createAaSequence( "b", "BAAA" );
+            final MolecularSequence s2 = BasicSequence.createAaSequence( "c", "CAAA" );
+            final MolecularSequence s3 = BasicSequence.createAaSequence( "d", "DAAA" );
+            final MolecularSequence s4 = BasicSequence.createAaSequence( "e", "EAAA" );
+            final MolecularSequence s5 = BasicSequence.createAaSequence( "f", "FAAA" );
+            final List<MolecularSequence> l0 = new ArrayList<MolecularSequence>();
             l0.add( s0 );
             l0.add( s1 );
             l0.add( s2 );
@@ -6286,13 +6286,13 @@ public final class Test {
                 return false;
             }
             //
-            final Sequence s_0 = BasicSequence.createAaSequence( "a", "--A---B-C--X----" );
-            final Sequence s_1 = BasicSequence.createAaSequence( "b", "--B-----C-------" );
-            final Sequence s_2 = BasicSequence.createAaSequence( "c", "--C--AB-C------Z" );
-            final Sequence s_3 = BasicSequence.createAaSequence( "d", "--D--AA-C-------" );
-            final Sequence s_4 = BasicSequence.createAaSequence( "e", "--E--AA-C-------" );
-            final Sequence s_5 = BasicSequence.createAaSequence( "f", "--F--AB-CD--Y---" );
-            final List<Sequence> l1 = new ArrayList<Sequence>();
+            final MolecularSequence s_0 = BasicSequence.createAaSequence( "a", "--A---B-C--X----" );
+            final MolecularSequence s_1 = BasicSequence.createAaSequence( "b", "--B-----C-------" );
+            final MolecularSequence s_2 = BasicSequence.createAaSequence( "c", "--C--AB-C------Z" );
+            final MolecularSequence s_3 = BasicSequence.createAaSequence( "d", "--D--AA-C-------" );
+            final MolecularSequence s_4 = BasicSequence.createAaSequence( "e", "--E--AA-C-------" );
+            final MolecularSequence s_5 = BasicSequence.createAaSequence( "f", "--F--AB-CD--Y---" );
+            final List<MolecularSequence> l1 = new ArrayList<MolecularSequence>();
             l1.add( s_0 );
             l1.add( s_1 );
             l1.add( s_2 );
@@ -6327,13 +6327,13 @@ public final class Test {
                 return false;
             }
             //
-            final Sequence s__0 = BasicSequence.createAaSequence( "a", "A------" );
-            final Sequence s__1 = BasicSequence.createAaSequence( "b", "BB-----" );
-            final Sequence s__2 = BasicSequence.createAaSequence( "c", "CCC----" );
-            final Sequence s__3 = BasicSequence.createAaSequence( "d", "DDDD---" );
-            final Sequence s__4 = BasicSequence.createAaSequence( "e", "EEEEE--" );
-            final Sequence s__5 = BasicSequence.createAaSequence( "f", "FFFFFF-" );
-            final List<Sequence> l2 = new ArrayList<Sequence>();
+            final MolecularSequence s__0 = BasicSequence.createAaSequence( "a", "A------" );
+            final MolecularSequence s__1 = BasicSequence.createAaSequence( "b", "BB-----" );
+            final MolecularSequence s__2 = BasicSequence.createAaSequence( "c", "CCC----" );
+            final MolecularSequence s__3 = BasicSequence.createAaSequence( "d", "DDDD---" );
+            final MolecularSequence s__4 = BasicSequence.createAaSequence( "e", "EEEEE--" );
+            final MolecularSequence s__5 = BasicSequence.createAaSequence( "f", "FFFFFF-" );
+            final List<MolecularSequence> l2 = new ArrayList<MolecularSequence>();
             l2.add( s__0 );
             l2.add( s__1 );
             l2.add( s__2 );
@@ -6369,7 +6369,7 @@ public final class Test {
             dmsa2.setIdentifier( 0, "new_c" );
             dmsa2.setIdentifier( 1, "new_d" );
             dmsa2.setResidueAt( 0, 0, 'x' );
-            final Sequence s = dmsa2.deleteRow( "new_d", true );
+            final MolecularSequence s = dmsa2.deleteRow( "new_d", true );
             if ( !s.getMolecularSequenceAsString().equals( "D" ) ) {
                 return false;
             }