import java.util.regex.Matcher;
import java.util.regex.Pattern;
-import org.forester.archaeopteryx.Constants;
+import org.forester.archaeopteryx.AptxConstants;
import org.forester.io.parsers.IteratingPhylogenyParser;
import org.forester.io.parsers.PhylogenyParser;
import org.forester.io.parsers.nhx.NHXFormatException;
import org.forester.io.parsers.util.PhylogenyParserException;
import org.forester.phylogeny.Phylogeny;
import org.forester.phylogeny.PhylogenyNode;
+import org.forester.phylogeny.data.Sequence;
import org.forester.phylogeny.iterators.PhylogenyNodeIterator;
+import org.forester.sequence.BasicSequence;
+import org.forester.sequence.MolecularSequence;
+import org.forester.util.ForesterConstants;
import org.forester.util.ForesterUtil;
public final class NexusPhylogeniesParser implements IteratingPhylogenyParser, PhylogenyParser {
- final private static String begin_trees = NexusConstants.BEGIN_TREES.toLowerCase();
- final private static String taxlabels = NexusConstants.TAXLABELS.toLowerCase();
- final private static String translate = NexusConstants.TRANSLATE.toLowerCase();
- final private static String tree = NexusConstants.TREE.toLowerCase();
- final private static String utree = NexusConstants.UTREE.toLowerCase();
- final private static String end = NexusConstants.END.toLowerCase();
- final private static String endblock = "endblock";
- final private static Pattern TREE_NAME_PATTERN = Pattern.compile( "\\s*.?Tree\\s+(.+?)\\s*=.+",
- Pattern.CASE_INSENSITIVE );
- final private static Pattern ROOTEDNESS_PATTERN = Pattern.compile( ".+=\\s*\\[&([R|U])\\].*" );
- private Object _nexus_source;
- private List<String> _taxlabels;
- private Map<String, String> _translate_map;
- private boolean _replace_underscores = NHXParser.REPLACE_UNDERSCORES_DEFAULT;
- private boolean _ignore_quotes_in_nh_data = Constants.NH_PARSING_IGNORE_QUOTES_DEFAULT;
- private TAXONOMY_EXTRACTION _taxonomy_extraction = NHXParser.TAXONOMY_EXTRACTION_DEFAULT;
- private Phylogeny _next;
- private BufferedReader _br;
- private boolean _in_trees_block;
- private StringBuilder _nh;
- private String _name;
- private StringBuilder _translate_sb;
- private boolean _in_taxalabels;
- private boolean _in_translate;
- private boolean _is_rooted;
- private boolean _rooted_info_present;
- private boolean _in_tree;
+
+ // When true, every raw line read from the source is echoed to System.out while scanning.
+ final private static boolean DEBUG = false;
+
+ // Lower-cased Nexus keywords and the patterns used to recognise lines while scanning the source.
+ final private static String begin_trees = NexusConstants.BEGIN_TREES.toLowerCase();
+ final private static String end = NexusConstants.END.toLowerCase();
+ final private static String endblock = "endblock";
+ // Captures the rootedness flag of a "= [&R]" / "= [&U]" annotation. The class is [RU] rather than
+ // [R|U]: inside a character class '|' is a literal, so the previous form also accepted "[&|]".
+ final private static Pattern ROOTEDNESS_PATTERN = Pattern.compile( ".+=\\s*\\[&([RU])\\].*" );
+ final private static String taxlabels = NexusConstants.TAXLABELS.toLowerCase();
+ // Captures the text following a TITLE keyword, up to but not including the next ';'.
+ final private static Pattern TITLE_PATTERN = Pattern.compile( "TITLE.?\\s+([^;]+)",
+ Pattern.CASE_INSENSITIVE );
+ final private static String translate = NexusConstants.TRANSLATE.toLowerCase();
+ // Block openers that switch the scanner into sequence-data mode. Both are tested together, so
+ // each name is bound to the NexusConstants keyword it is named after.
+ final private static String data = NexusConstants.BEGIN_DATA.toLowerCase();
+ final private static String characters = NexusConstants.BEGIN_CHARACTERS.toLowerCase();
+ final private static String tree = NexusConstants.TREE.toLowerCase();
+ // Captures the tree name between the "Tree" keyword (optionally preceded by one character) and '='.
+ final private static Pattern TREE_NAME_PATTERN = Pattern.compile( "\\s*.?Tree\\s+(.+?)\\s*=.+",
+ Pattern.CASE_INSENSITIVE );
+ // Group 1: alphanumeric key; group 2: the label that follows the separating whitespace.
+ final private static Pattern TRANSLATE_PATTERN = Pattern.compile( "([0-9A-Za-z]+)\\s+(.+)" );
+ // Group 1: sequence id; group 2: a run of letters, '-', '_', '*' or '?' (the aligned residues).
+ final private static Pattern ALN_PATTERN = Pattern.compile( "(.+)\\s+([A-Za-z-_\\*\\?]+)" );
+ // Captures the lower-case word after "datatype" and one separator character; applied to the lower-cased line.
+ final private static Pattern DATATYPE_PATTERN = Pattern.compile( "datatype\\s?.\\s?([a-z]+)" );
+ //final private static Pattern LINK_TAXA_PATTERN = Pattern.compile( "link\\s+taxa\\s?.\\s?([^;]+)",
+ // Pattern.CASE_INSENSITIVE );
+ final private static String utree = NexusConstants.UTREE.toLowerCase();
+ // Reader over _nexus_source; created with UTF-8 when the parser state is re-initialised.
+ private BufferedReader _br;
+ // Forwarded to the NHXParser that parses each tree string.
+ private boolean _ignore_quotes_in_nh_data = AptxConstants.NH_PARSING_IGNORE_QUOTES_DEFAULT;
+ // Scanner state: which Nexus section the line currently being read belongs to.
+ private boolean _in_taxalabels;
+ private boolean _in_translate;
+ private boolean _in_tree;
+ private boolean _in_trees_block;
+ private boolean _in_data_block;
+ // Rootedness of the tree being read; only applied to the phylogeny when _rooted_info_present is true.
+ private boolean _is_rooted;
+ // Data type declared in the current data block; sequences are read only for "protein", "dna" or "rna".
+ private String _datatype;
+ // Name of the tree being read; combined with _title to form the phylogeny name.
+ private String _name;
+ private Phylogeny _next;
+ private Object _nexus_source;
+ // Holds the tree text that is handed to the NHXParser.
+ private StringBuilder _nh;
+ // Forwarded to the NHXParser; also controls underscore replacement in node names.
+ private boolean _replace_underscores = NHXParser.REPLACE_UNDERSCORES_DEFAULT;
+ private boolean _rooted_info_present;
+ private List<String> _taxlabels;
+ // Forwarded to the NHXParser; defaults to no taxonomy extraction.
+ private TAXONOMY_EXTRACTION _taxonomy_extraction = TAXONOMY_EXTRACTION.NO;
+ // Title captured from a TITLE line inside a trees block.
+ private String _title;
+ private Map<String, String> _translate_map;
+ private StringBuilder _translate_sb;
+ // Sequences read from a data block, keyed by sequence id; attached to nodes with a matching name.
+ private Map<String, MolecularSequence> _seqs;
+ // Always true in this version: gates the attachment of _seqs entries to tree nodes.
+ private final boolean _add_sequences = true;
+ // Forwarded to the NHXParser; changed through setParseBeastStyleExtendedTags().
+ private boolean _parse_beast_style_extended_tags = false;
+
+
+ @Override // NOTE(review): the declaring interface is not visible here; presumably one of the parser interfaces this class implements.
+ public String getName() { // Returns the fixed display name that identifies this parser.
+ return "Nexus Phylogenies Parser";
+ }
@Override
public final boolean hasNext() {
@Override
public final Phylogeny[] parse() throws IOException {
- reset();
final List<Phylogeny> l = new ArrayList<Phylogeny>();
while ( hasNext() ) {
l.add( next() );
for( int i = 0; i < l.size(); ++i ) {
p[ i ] = l.get( i );
}
+ reset();
return p;
}
_translate_map = new HashMap<String, String>();
_nh = new StringBuilder();
_name = "";
- _translate_sb = new StringBuilder();
+ _title = "";
+ _translate_sb = null;
_next = null;
_in_trees_block = false;
_in_taxalabels = false;
_in_tree = false;
_rooted_info_present = false;
_is_rooted = false;
- _br = ParserUtils.createReader( _nexus_source );
+ _seqs = new HashMap<String, MolecularSequence>();
+ _br = ParserUtils.createReader( _nexus_source, ForesterConstants.UTF_8 );
getNext();
}
_taxonomy_extraction = taxonomy_extraction;
}
- private final void createPhylogeny( final String name,
+ private final void createPhylogeny( final String title,
+ final String name,
final StringBuilder nhx,
final boolean rooted_info_present,
final boolean is_rooted ) throws IOException {
_next = null;
final NHXParser pars = new NHXParser();
- if ( ( _taxlabels.size() < 1 ) && ( _translate_map.size() < 1 ) ) {
- pars.setTaxonomyExtraction( _taxonomy_extraction );
- pars.setReplaceUnderscores( _replace_underscores );
- pars.setIgnoreQuotes( _ignore_quotes_in_nh_data );
- }
- else {
- pars.setTaxonomyExtraction( TAXONOMY_EXTRACTION.NO );
- pars.setReplaceUnderscores( false );
- pars.setIgnoreQuotes( false );
- }
+ pars.setTaxonomyExtraction( _taxonomy_extraction );
+ pars.setReplaceUnderscores( _replace_underscores );
+ pars.setIgnoreQuotes( _ignore_quotes_in_nh_data );
+ pars.setParseBeastStyleExtendedTags( _parse_beast_style_extended_tags );
if ( rooted_info_present ) {
pars.setGuessRootedness( false );
}
- pars.setSource( nhx );
+ pars.setSource( nhx.toString() );
final Phylogeny p = pars.next();
if ( p == null ) {
throw new PhylogenyParserException( "failed to create phylogeny" );
}
- p.setName( name );
+ String myname = null;
+ if ( !ForesterUtil.isEmpty( title ) && !ForesterUtil.isEmpty( name ) ) {
+ myname = title.replace( '_', ' ' ).trim() + " (" + name.trim() + ")";
+ }
+ else if ( !ForesterUtil.isEmpty( title ) ) {
+ myname = title.replace( '_', ' ' ).trim();
+ }
+ else if ( !ForesterUtil.isEmpty( name ) ) {
+ myname = name.trim();
+ }
+ if ( !ForesterUtil.isEmpty( myname ) ) {
+ p.setName( myname );
+ }
if ( rooted_info_present ) {
p.setRooted( is_rooted );
}
}
if ( !_replace_underscores && ( ( _taxonomy_extraction != TAXONOMY_EXTRACTION.NO ) ) ) {
ParserUtils.extractTaxonomyDataFromNodeName( node, _taxonomy_extraction );
- // final String tax = ParserUtils.extractTaxonomyCodeFromNodeName( node.getName(),
- // getTaxonomyExtraction() );
- // if ( !ForesterUtil.isEmpty( tax ) ) {
- // if ( !node.getNodeData().isHasTaxonomy() ) {
- // node.getNodeData().setTaxonomy( new Taxonomy() );
- // }
- // node.getNodeData().getTaxonomy().setTaxonomyCode( tax );
- // }
+ }
+ else if ( _replace_underscores ) {
+ if ( !ForesterUtil.isEmpty( node.getName() ) ) {
+ node.setName( node.getName().replace( '_', ' ' ).trim() );
+ }
+ }
+ if ( _add_sequences ) {
+ if ( _seqs.containsKey( node.getName() ) ) {
+ final MolecularSequence s = _seqs.get( node.getName() );
+ //TODO need to check for uniqueness when adding seqs....
+ final Sequence ns = new Sequence( s );
+ ns.setMolecularSequenceAligned( true ); //TODO need to check if all same length
+ node.getNodeData().addSequence( ns );
+ }
}
}
}
_next = null;
String line;
while ( ( line = _br.readLine() ) != null ) {
+ if ( DEBUG ) {
+ System.out.println( line );
+ }
line = line.trim();
if ( ( line.length() > 0 ) && !line.startsWith( "#" ) && !line.startsWith( ">" ) ) {
line = ForesterUtil.collapseWhiteSpace( line );
_in_trees_block = true;
_in_taxalabels = false;
_in_translate = false;
+ _in_data_block = false;
+ _datatype = null;
+ _title = "";
}
else if ( line_lc.startsWith( taxlabels ) ) {
+ //TODO need to be taxa block instead
_in_trees_block = false;
_in_taxalabels = true;
_in_translate = false;
+ _in_data_block = false;
+ _datatype = null;
}
else if ( line_lc.startsWith( translate ) ) {
+ _translate_sb = new StringBuilder();
_in_taxalabels = false;
_in_translate = true;
+ _in_data_block = false;
+ _datatype = null;
+ }
+ else if ( line_lc.startsWith( characters ) || line_lc.startsWith( data ) ) {
+ _in_taxalabels = false;
+ _in_trees_block = false;
+ _in_translate = false;
+ _in_data_block = true;
+ _datatype = null;
}
else if ( _in_trees_block ) {
- //FIXME TODO need to work on this "title" and "link"
- if ( line_lc.startsWith( "title" ) || line_lc.startsWith( "link" ) ) {
- // Do nothing.
+ if ( line_lc.startsWith( "title" ) ) {
+ final Matcher title_m = TITLE_PATTERN.matcher( line );
+ if ( title_m.lookingAt() ) {
+ _title = title_m.group( 1 );
+ }
+ }
+ else if ( line_lc.startsWith( "link" ) ) {
+ //final Matcher link_m = LINK_TAXA_PATTERN.matcher( line );
+ //if ( link_m.lookingAt() ) {
+ //final String link = link_m.group( 1 ); //TODO why?
+ // }
}
else if ( line_lc.startsWith( end ) || line_lc.startsWith( endblock ) ) {
_in_trees_block = false;
_in_tree = false;
_in_translate = false;
if ( _nh.length() > 0 ) {
- createPhylogeny( _name, _nh, _rooted_info_present, _is_rooted );
+ createPhylogeny( _title, _name, _nh, _rooted_info_present, _is_rooted );
_nh = new StringBuilder();
_name = "";
_rooted_info_present = false;
boolean might = false;
if ( _nh.length() > 0 ) {
might = true;
- createPhylogeny( _name, _nh, _rooted_info_present, _is_rooted );
+ createPhylogeny( _title, _name, _nh, _rooted_info_present, _is_rooted );
_nh = new StringBuilder();
_name = "";
_rooted_info_present = false;
&& !line_lc.startsWith( end ) && !line_lc.startsWith( endblock ) && line_lc.endsWith( ";" ) ) {
_in_tree = false;
_in_translate = false;
- createPhylogeny( _name, _nh, _rooted_info_present, _is_rooted );
+ createPhylogeny( _title, _name, _nh, _rooted_info_present, _is_rooted );
_nh = new StringBuilder();
_name = "";
_rooted_info_present = false;
}
}
}
+ if ( _in_data_block ) {
+ if ( line_lc.startsWith( end ) || line_lc.startsWith( endblock ) ) {
+ _in_data_block = false;
+ _datatype = null;
+ }
+ else if ( line_lc.startsWith( "link" ) ) {
+ // final Matcher link_m = LINK_TAXA_PATTERN.matcher( line );
+ // if ( link_m.lookingAt() ) {
+ // final String link = link_m.group( 1 );
+ // }
+ }
+ else {
+ final Matcher datatype_matcher = DATATYPE_PATTERN.matcher( line_lc );
+ if ( datatype_matcher.find() ) {
+ _datatype = datatype_matcher.group( 1 );
+ }
+ else {
+ if ( ( _datatype != null )
+ && ( _datatype.equals( "protein" ) || _datatype.equals( "dna" ) || _datatype
+ .equals( "rna" ) ) ) {
+ if ( line.endsWith( ";" ) ) {
+ _in_data_block = false;
+ line = line.substring( 0, line.length() - 1 );
+ }
+ final Matcher aln_matcher = ALN_PATTERN.matcher( line );
+ if ( aln_matcher.matches() ) {
+ final String id = aln_matcher.group( 1 );
+ final String seq = aln_matcher.group( 2 );
+ MolecularSequence s = null;
+ if ( _datatype.equals( "protein" ) ) {
+ s = BasicSequence.createAaSequence( id, seq );
+ }
+ else if ( _datatype.equals( "dna" ) ) {
+ s = BasicSequence.createDnaSequence( id, seq );
+ }
+ else {
+ s = BasicSequence.createRnaSequence( id, seq );
+ }
+ _seqs.put( id, s );
+ }
+ }
+ }
+ }
+ }
}
}
if ( _nh.length() > 0 ) {
- createPhylogeny( _name, _nh, _rooted_info_present, _is_rooted );
+ createPhylogeny( _title, _name, _nh, _rooted_info_present, _is_rooted );
if ( _next != null ) {
return;
}
if ( s.endsWith( ";" ) ) {
s = s.substring( 0, s.length() - 1 ).trim();
}
- for( final String pair : s.split( "," ) ) {
- final String[] kv = pair.trim().split( "\\s+" );
- if ( ( kv.length < 2 ) || ( kv.length > 3 ) ) {
- throw new IOException( "ill-formatted translate values: " + translate_sb );
- }
- if ( ( kv.length == 3 ) && !kv[ 0 ].toLowerCase().trim().equals( translate ) ) {
- throw new IOException( "ill-formatted translate values: " + translate_sb );
- }
+ for( String pair : s.split( "," ) ) {
String key = "";
String value = "";
- if ( kv.length == 3 ) {
- key = kv[ 1 ];
- value = kv[ 2 ];
+ final int ti = pair.toLowerCase().indexOf( "translate" );
+ if ( ti > -1 ) {
+ pair = pair.substring( ti + 9 );
+ }
+ final Matcher m = TRANSLATE_PATTERN.matcher( pair );
+ if ( m.find() ) {
+ key = m.group( 1 );
+ value = m.group( 2 ).replaceAll( "\'", "" ).replaceAll( "\"", "" ).trim();
}
else {
- key = kv[ 0 ];
- value = kv[ 1 ];
+ throw new IOException( "ill-formatted translate values: " + pair );
}
if ( value.endsWith( ";" ) ) {
value = value.substring( 0, value.length() - 1 );
_translate_map.put( key, value );
}
}
-
+
+ public final void setParseBeastStyleExtendedTags( final boolean parse_beast_style_extended_tags ) { // Stores the flag (default false); createPhylogeny() forwards it to the NHXParser it builds for each tree.
+ _parse_beast_style_extended_tags = parse_beast_style_extended_tags;
+ }
+
/** Matches a run of one or more whitespace characters immediately followed by a semicolon. */
private final static Pattern WHITESPACE_BEFORE_SEMICOLON_PATTERN = Pattern.compile( "\\s+;" );

/**
 * Removes any whitespace that directly precedes a semicolon, leaving the semicolon in place.
 * Whitespace elsewhere in the string is left untouched.
 *
 * @param s the text to clean up; must not be null
 * @return {@code s} with every "whitespace + ';'" occurrence collapsed to ";"
 * @throws NullPointerException if {@code s} is null
 */
private final static String removeWhiteSpaceBeforeSemicolon( final String s ) {
    // Uses the precompiled pattern instead of String.replaceAll, which would recompile the
    // regular expression on every call.
    return WHITESPACE_BEFORE_SEMICOLON_PATTERN.matcher( s ).replaceAll( ";" );
}