final PhylogenyNode n = it.next();
final String name = n.getName().trim();
if ( !ForesterUtil.isEmpty( name ) ) {
- final String code = ParserUtils
- .extractTaxonomyCodeFromNodeName( name, NHXParser.TAXONOMY_EXTRACTION.YES );
- if ( !ForesterUtil.isEmpty( code ) ) {
- PhylogenyMethods.setTaxonomyCode( n, code );
- }
+
+ ParserUtils.extractTaxonomyDataFromNodeName( n, TAXONOMY_EXTRACTION.YES );
+
+ // final String code = ParserUtils
+ // .extractTaxonomyCodeFromNodeName( name, NHXParser.TAXONOMY_EXTRACTION.YES );
+ // if ( !ForesterUtil.isEmpty( code ) ) {
+ // PhylogenyMethods.setTaxonomyCode( n, code );
+ // }
}
}
}
final PhylogenyNode n = it.next();
if ( n.isExternal() && n.getNodeData().isHasTaxonomy() ) {
final String name = n.getNodeData().getTaxonomy().getScientificName();
- if ( !ForesterUtil.isEmpty( name ) && PhyloXmlUtil.TAXOMONY_CODE_PATTERN.matcher( name ).matches() ) {
+ if ( !ForesterUtil.isEmpty( name ) && PhyloXmlUtil.TAXOMONY_CODE_PATTERN_STRICT.matcher( name ).matches() ) {
n.getNodeData().getTaxonomy().setScientificName( "" );
n.getNodeData().getTaxonomy().setTaxonomyCode( name );
}
}
}
if ( !isReplaceUnderscores() && ( ( getTaxonomyExtraction() != TAXONOMY_EXTRACTION.NO ) ) ) {
- final String tax = ParserUtils.extractTaxonomyCodeFromNodeName( node.getName(),
- getTaxonomyExtraction() );
- if ( !ForesterUtil.isEmpty( tax ) ) {
- if ( !node.getNodeData().isHasTaxonomy() ) {
- node.getNodeData().setTaxonomy( new Taxonomy() );
- }
- node.getNodeData().getTaxonomy().setTaxonomyCode( tax );
- }
+
+ ParserUtils.extractTaxonomyDataFromNodeName( node, getTaxonomyExtraction() );
+
+// final String tax = ParserUtils.extractTaxonomyCodeFromNodeName( node.getName(),
+// getTaxonomyExtraction() );
+// if ( !ForesterUtil.isEmpty( tax ) ) {
+// if ( !node.getNodeData().isHasTaxonomy() ) {
+// node.getNodeData().setTaxonomy( new Taxonomy() );
+// }
+// node.getNodeData().getTaxonomy().setTaxonomyCode( tax );
+// }
}
}
}
if ( !s.startsWith( ":" ) ) {
node_to_annotate.setName( t.nextToken() );
if ( !replace_underscores && ( !is_nhx && ( taxonomy_extraction != TAXONOMY_EXTRACTION.NO ) ) ) {
- final String tax = ParserUtils.extractTaxonomyCodeFromNodeName( node_to_annotate.getName(),
- taxonomy_extraction );
- if ( !ForesterUtil.isEmpty( tax ) ) {
- if ( !node_to_annotate.getNodeData().isHasTaxonomy() ) {
- node_to_annotate.getNodeData().setTaxonomy( new Taxonomy() );
- }
- node_to_annotate.getNodeData().getTaxonomy().setTaxonomyCode( tax );
- }
+ ParserUtils.extractTaxonomyDataFromNodeName( node_to_annotate, taxonomy_extraction );
}
}
while ( t.hasMoreTokens() ) {
import java.util.Set;
import java.util.regex.Pattern;
+import org.forester.io.parsers.util.ParserUtils;
+
public final class PhyloXmlUtil {
public static final String OTHER = "other";
public static final String UNKNOWN = "unknown";
public final static Pattern SEQUENCE_SYMBOL_PATTERN = Pattern.compile( "\\S{1,20}" );
- public final static Pattern TAXOMONY_CODE_PATTERN = Pattern.compile( "[A-Z0-9]{3,5}" );
+ public final static Pattern TAXOMONY_CODE_PATTERN_STRICT = ParserUtils.TAXOMONY_CODE_PATTERN_1;
+ public final static Pattern TAXOMONY_CODE_PATTERN_LAX = Pattern.compile( "[A-Z0-9]{3,6}" );
public final static Pattern LIT_REF_DOI_PATTERN = Pattern
.compile( "[a-zA-Z0-9_\\.]+\\S+" );
public final static Set<String> SEQUENCE_TYPES = new HashSet<String>();
import org.forester.io.parsers.nexus.NexusPhylogeniesParser;
import org.forester.io.parsers.nhx.NHXParser;
import org.forester.io.parsers.nhx.NHXParser.TAXONOMY_EXTRACTION;
+import org.forester.io.parsers.phyloxml.PhyloXmlDataFormatException;
import org.forester.io.parsers.phyloxml.PhyloXmlParser;
import org.forester.io.parsers.tol.TolParser;
import org.forester.phylogeny.Phylogeny;
import org.forester.phylogeny.PhylogenyMethods;
+import org.forester.phylogeny.PhylogenyNode;
+import org.forester.phylogeny.data.Identifier;
+import org.forester.phylogeny.data.Taxonomy;
import org.forester.util.ForesterConstants;
import org.forester.util.ForesterUtil;
public final class ParserUtils {
- final private static Pattern TAXOMONY_CODE_PATTERN_1 = Pattern.compile( "[A-Z0-9]{5}|RAT|PIG|PEA|CAP" );
+ final public static Pattern TAXOMONY_CODE_PATTERN_1 = Pattern.compile( "[A-Z0-9]{5}|RAT|PIG|PEA|CAP" );
final private static Pattern TAXOMONY_CODE_PATTERN_2 = Pattern
.compile( "([A-Z0-9]{5}|RAT|PIG|PEA|CAP)[^A-Za-z].*" );
final private static Pattern TAXOMONY_CODE_PATTERN_PF = Pattern.compile( "([A-Z0-9]{5}|RAT|PIG|PEA|CAP)/\\d+-\\d+" );
+
+ // A bare numeric taxonomy id: between one and seven decimal digits, nothing else.
+ final private static Pattern TAXOMONY_UNIPROT_ID_PATTERN_1 = Pattern.compile( "\\d{1,7}" );
+ // A numeric id followed by arbitrary trailing text. The character right after
+ // the id must be neither a letter nor a digit; if a digit were allowed there,
+ // a run of eight or more digits would match and be silently truncated to its
+ // first seven digits instead of being rejected.
+ final private static Pattern TAXOMONY_UNIPROT_ID_PATTERN_2 = Pattern
+ .compile( "(\\d{1,7})[^0-9A-Za-z].*" );
+ // A numeric id followed by a "/<digits>-<digits>" range suffix.
+ final private static Pattern TAXOMONY_UNIPROT_ID_PATTERN_PF = Pattern.compile( "(\\d{1,7})/\\d+-\\d+" );
+
+
final public static PhylogenyParser createParserDependingFileContents( final File file,
final boolean phyloxml_validate_against_xsd )
throws FileNotFoundException, IOException {
}
return null;
}
+
+ /**
+  * Looks for a numeric taxonomy identifier inside a node name.
+  * <p>
+  * When the name contains an underscore after its first character (and, in
+  * PFAM_STYLE_ONLY mode, a slash beyond index 4), the name is split on
+  * underscores and whitespace and only the second token is examined.
+  * Otherwise, in YES mode, the entire name is tested against the plain
+  * identifier pattern.
+  *
+  * @param name the node name to inspect; it is dereferenced without a null check
+  * @param taxonomy_extraction the extraction mode
+  * @return the identifier digits, or null when nothing matched
+  */
+ public final static String extractUniprotTaxonomyIdFromNodeName( final String name,
+         final TAXONOMY_EXTRACTION taxonomy_extraction ) {
+     final boolean underscore_inside = name.indexOf( "_" ) > 0;
+     final boolean pfam_only = taxonomy_extraction == TAXONOMY_EXTRACTION.PFAM_STYLE_ONLY;
+     if ( !underscore_inside || ( pfam_only && ( name.indexOf( "/" ) <= 4 ) ) ) {
+         // No usable "prefix_token" layout: only YES mode accepts a name that is itself an id.
+         if ( ( taxonomy_extraction == TAXONOMY_EXTRACTION.YES )
+                 && TAXOMONY_UNIPROT_ID_PATTERN_1.matcher( name ).matches() ) {
+             return name;
+         }
+         return null;
+     }
+     final String[] tokens = name.split( "[_\\s]" );
+     if ( tokens.length < 2 ) {
+         return null;
+     }
+     final String candidate = tokens[ 1 ];
+     if ( ForesterUtil.isEmpty( candidate ) ) {
+         return null;
+     }
+     if ( pfam_only ) {
+         final Matcher ranged = TAXOMONY_UNIPROT_ID_PATTERN_PF.matcher( candidate );
+         return ranged.matches() ? ranged.group( 1 ) : null;
+     }
+     final Matcher bare = TAXOMONY_UNIPROT_ID_PATTERN_1.matcher( candidate );
+     if ( bare.matches() ) {
+         return bare.group();
+     }
+     final Matcher with_suffix = TAXOMONY_UNIPROT_ID_PATTERN_2.matcher( candidate );
+     return with_suffix.matches() ? with_suffix.group( 1 ) : null;
+ }
public final static Phylogeny[] readPhylogenies( final File file ) throws FileNotFoundException, IOException {
return PhylogenyMethods.readPhylogenies( ParserUtils.createParserDependingOnFileType( file, true ), file );
public final static Phylogeny[] readPhylogenies( final String file_name ) throws FileNotFoundException, IOException {
return readPhylogenies( new File( file_name ) );
}
+
+ /**
+  * Annotates a node with taxonomy data parsed from its name.
+  * <p>
+  * A numeric identifier is tried first and, if found, stored as an
+  * Identifier with provider "uniprot". Only when no identifier is found is a
+  * taxonomy code looked for and stored. A Taxonomy object is created on the
+  * node when it has none and something was found; otherwise the node is
+  * left untouched.
+  *
+  * @param node the node whose name is parsed and whose data is updated
+  * @param taxonomy_extraction the extraction mode handed to both extractors
+  * @throws PhyloXmlDataFormatException declared by this method; presumably
+  *         propagated from the Taxonomy setters
+  */
+ public final static void extractTaxonomyDataFromNodeName( final PhylogenyNode node,
+         final NHXParser.TAXONOMY_EXTRACTION taxonomy_extraction )
+         throws PhyloXmlDataFormatException {
+     final String id = extractUniprotTaxonomyIdFromNodeName( node.getName(), taxonomy_extraction );
+     final boolean id_found = !ForesterUtil.isEmpty( id );
+     // The code extractor is consulted only when no numeric id was found.
+     final String code = id_found ? null : extractTaxonomyCodeFromNodeName( node.getName(),
+                                                                              taxonomy_extraction );
+     if ( !id_found && ForesterUtil.isEmpty( code ) ) {
+         return;
+     }
+     if ( !node.getNodeData().isHasTaxonomy() ) {
+         node.getNodeData().setTaxonomy( new Taxonomy() );
+     }
+     if ( id_found ) {
+         node.getNodeData().getTaxonomy().setIdentifier( new Identifier( id, "uniprot" ) );
+     }
+     else {
+         node.getNodeData().getTaxonomy().setTaxonomyCode( code );
+     }
+ }
}
final ArrayList<PhylogenyNode> to_delete = new ArrayList<PhylogenyNode>();
for( final PhylogenyNodeIterator iter = phy.iteratorPostorder(); iter.hasNext(); ) {
final PhylogenyNode n = iter.next();
- if ( ( !n.isExternal() ) && ( !n.isRoot() ) && ( n.getNumberOfDescendants() == 1 ) ) {
+ if ( ( !n.isExternal() ) && ( n.getNumberOfDescendants() == 1 ) ) {
to_delete.add( n );
}
}
import org.forester.io.parsers.phyloxml.PhyloXmlDataFormatException;
import org.forester.io.parsers.phyloxml.PhyloXmlMapping;
import org.forester.io.parsers.phyloxml.PhyloXmlUtil;
+import org.forester.util.ForesterConstants;
import org.forester.util.ForesterUtil;
public class Taxonomy implements PhylogenyData, MultipleUris, Comparable<Taxonomy> {
+
+
private String _scientific_name;
private String _common_name;
private List<String> _synonyms;
}
public void setTaxonomyCode( final String taxonomy_code ) throws PhyloXmlDataFormatException {
- if ( !ForesterUtil.isEmpty( taxonomy_code )
- && !PhyloXmlUtil.TAXOMONY_CODE_PATTERN.matcher( taxonomy_code ).matches() ) {
- throw new PhyloXmlDataFormatException( "illegal taxonomy code: [" + taxonomy_code + "]" );
+ if ( ForesterConstants.TAXONOMY_CODE_STRICT ) {
+ if ( !ForesterUtil.isEmpty( taxonomy_code )
+ && !PhyloXmlUtil.TAXOMONY_CODE_PATTERN_STRICT.matcher( taxonomy_code ).matches() ) {
+ throw new PhyloXmlDataFormatException( "illegal taxonomy code: [" + taxonomy_code + "]" );
+ }
+ }
+ else {
+ if ( !ForesterUtil.isEmpty( taxonomy_code )
+ && !PhyloXmlUtil.TAXOMONY_CODE_PATTERN_LAX.matcher( taxonomy_code ).matches() ) {
+ throw new PhyloXmlDataFormatException( "illegal taxonomy code: [" + taxonomy_code + "]" );
+ }
}
_taxonomy_code = taxonomy_code;
}
if ( !n13.getName().equals( "blah_12345/1-2" ) ) {
return false;
}
- if ( !PhylogenyMethods.getSpecies( n13 ).equals( "12345" ) ) {
+ if ( PhylogenyMethods.getSpecies( n13 ).equals( "12345" ) ) {
+ return false;
+ }
+ if ( !n13.getNodeData().getTaxonomy().getIdentifier().getValue().equals( "12345" ) ) {
+ return false;
+ }
+ if ( !n13.getNodeData().getTaxonomy().getIdentifier().getProvider().equals( "uniprot" ) ) {
return false;
}
final PhylogenyNode n14 = PhylogenyNode
if ( !isEqual( n18.getBranchData().getConfidence( 0 ).getValue(), 91 ) ) {
return false;
}
+
+
+ //
+ final PhylogenyNode n19 = PhylogenyNode
+ .createInstanceFromNhxString( "blah_1-roejojoej", NHXParser.TAXONOMY_EXTRACTION.YES );
+
+
+ if ( !n19.getNodeData().getTaxonomy().getIdentifier().getValue().equals( "1" ) ) {
+ return false;
+ }
+ if ( !n19.getNodeData().getTaxonomy().getIdentifier().getProvider().equals( "uniprot" ) ) {
+ return false;
+ }
+ final PhylogenyNode n30 = PhylogenyNode
+ .createInstanceFromNhxString( "blah_1234567-roejojoej", NHXParser.TAXONOMY_EXTRACTION.YES );
+
+
+ if ( !n30.getNodeData().getTaxonomy().getIdentifier().getValue().equals( "1234567" ) ) {
+ return false;
+ }
+ if ( !n30.getNodeData().getTaxonomy().getIdentifier().getProvider().equals( "uniprot" ) ) {
+ return false;
+ }
+ final PhylogenyNode n31 = PhylogenyNode
+ .createInstanceFromNhxString( "blah_12345678-roejojoej", NHXParser.TAXONOMY_EXTRACTION.YES );
+
+
+ if ( n31.getNodeData().isHasTaxonomy() ) {
+ return false;
+ }
+ // if ( !n31.getNodeData().getTaxonomy().getIdentifier().getProvider().equals( "uniprot" ) ) {
+ // return false;
+ // }
}
catch ( final Exception e ) {
e.printStackTrace( System.out );
public final static String UTF8 = "UTF-8";
public final static String PHYLO_XML_REFERENCE = "Han MV and Zmasek CM (2009): \"phyloXML: XML for evolutionary biology and comparative genomics\", BMC Bioinformatics 10:356";
public final static boolean RELEASE = false;
-
+ public final static boolean TAXONOMY_CODE_STRICT = true;
+
+
/**
 * Tree format identifiers, declared in the order NH, NHX, NEXUS, PHYLOXML.
 */
public enum PhylogeneticTreeFormats {
    NH,
    NHX,
    NEXUS,
    PHYLOXML
}