X-Git-Url: http://source.jalview.org/gitweb/?a=blobdiff_plain;ds=sidebyside;f=forester%2Fjava%2Fsrc%2Forg%2Fforester%2Fio%2Fparsers%2Futil%2FParserUtils.java;h=51f7f892b03b4545e6961933e092d0d54143f027;hb=1051703a697d29be2dc59ed1ed26b7ce5665ad66;hp=b8dd42e7dd97cfe539bf8aa34416e80198d20226;hpb=038c34792757a86f24296de5683e722fab3f9307;p=jalview.git diff --git a/forester/java/src/org/forester/io/parsers/util/ParserUtils.java b/forester/java/src/org/forester/io/parsers/util/ParserUtils.java index b8dd42e..51f7f89 100644 --- a/forester/java/src/org/forester/io/parsers/util/ParserUtils.java +++ b/forester/java/src/org/forester/io/parsers/util/ParserUtils.java @@ -36,73 +36,42 @@ import java.io.InputStreamReader; import java.io.StringReader; import java.net.URL; import java.util.regex.Matcher; +import java.util.regex.Pattern; import org.forester.io.parsers.PhylogenyParser; import org.forester.io.parsers.nexus.NexusPhylogeniesParser; import org.forester.io.parsers.nhx.NHXParser; +import org.forester.io.parsers.nhx.NHXParser.TAXONOMY_EXTRACTION; +import org.forester.io.parsers.phyloxml.PhyloXmlDataFormatException; import org.forester.io.parsers.phyloxml.PhyloXmlParser; import org.forester.io.parsers.tol.TolParser; +import org.forester.phylogeny.Phylogeny; import org.forester.phylogeny.PhylogenyMethods; +import org.forester.phylogeny.PhylogenyNode; +import org.forester.phylogeny.data.Identifier; +import org.forester.phylogeny.data.Taxonomy; import org.forester.util.ForesterConstants; import org.forester.util.ForesterUtil; public final class ParserUtils { - final public static PhylogenyParser createParserDependingOnUrlContents( final URL url, - final boolean phyloxml_validate_against_xsd ) - throws FileNotFoundException, IOException { - final String lc_filename = url.getFile().toString().toLowerCase(); - PhylogenyParser parser = createParserDependingOnSuffix( lc_filename, phyloxml_validate_against_xsd ); - if ( ( parser != null ) && lc_filename.endsWith( ".zip" ) ) { - if ( parser instanceof PhyloXmlParser ) { - ( ( PhyloXmlParser ) parser ).setZippedInputstream( true ); - } - else if ( parser instanceof TolParser ) { - ( ( TolParser ) parser ).setZippedInputstream( true ); - } - } - if ( parser == null ) { - final String first_line = ForesterUtil.getFirstLine( url ).trim().toLowerCase(); - if ( first_line.startsWith( "<" ) ) { - parser = new PhyloXmlParser(); - if ( phyloxml_validate_against_xsd ) { - final ClassLoader cl = PhyloXmlParser.class.getClassLoader(); - final URL xsd_url = cl.getResource( ForesterConstants.LOCAL_PHYLOXML_XSD_RESOURCE ); - if ( xsd_url != null ) { - ( ( PhyloXmlParser ) parser ).setValidateAgainstSchema( xsd_url.toString() ); - } - else { - throw new RuntimeException( "failed to get URL for phyloXML XSD from jar file from [" - + ForesterConstants.LOCAL_PHYLOXML_XSD_RESOURCE + "]" ); - } - } - } - else if ( ( first_line.startsWith( "nexus" ) ) || ( first_line.startsWith( "#nexus" ) ) - || ( first_line.startsWith( "# nexus" ) ) || ( first_line.startsWith( "begin" ) ) ) { - parser = new NexusPhylogeniesParser(); - } - else { - parser = new NHXParser(); - } - } - return parser; - } + final public static Pattern TAXOMONY_SN_PATTERN = Pattern + .compile( "[^_]{2,}_([A-Z][a-z]+_[a-z]{2,}(_[A-Za-z]\\w+|))\\b" ); + final public static Pattern TAXOMONY_CODE_PATTERN_1 = Pattern.compile( "\\b[A-Z0-9]{5}|RAT|PIG|PEA|CAP\\b" ); + final private static Pattern TAXOMONY_CODE_PATTERN_2 = Pattern + .compile( "([A-Z0-9]{5}|RAT|PIG|PEA|CAP)[^0-9A-Za-z].*" ); + final private static Pattern TAXOMONY_CODE_PATTERN_PF = Pattern + .compile( "([A-Z0-9]{5}|RAT|PIG|PEA|CAP)/\\d+-\\d+" ); + final private static Pattern TAXOMONY_UNIPROT_ID_PATTERN_1 = Pattern.compile( "\\b\\d{1,7}\\b" ); + final private static Pattern TAXOMONY_UNIPROT_ID_PATTERN_2 = Pattern.compile( "(\\d{1,7})[^0-9A-Za-z].*" ); + final private static Pattern TAXOMONY_UNIPROT_ID_PATTERN_PF = Pattern.compile( "(\\d{1,7})/\\d+-\\d+" ); - /** - * Return null if it can not guess the parser to use based on name suffix. - * - * @param filename - * @return - */ - final public static PhylogenyParser createParserDependingOnSuffix( final String filename, - final boolean phyloxml_validate_against_xsd ) { + final public static PhylogenyParser createParserDependingFileContents( final File file, + final boolean phyloxml_validate_against_xsd ) + throws FileNotFoundException, IOException { PhylogenyParser parser = null; - final String filename_lc = filename.toLowerCase(); - if ( filename_lc.endsWith( ".tol" ) || filename_lc.endsWith( ".tolxml" ) || filename_lc.endsWith( ".tol.zip" ) ) { - parser = new TolParser(); - } - else if ( filename_lc.endsWith( ".xml" ) || filename_lc.endsWith( ".px" ) || filename_lc.endsWith( "phyloxml" ) - || filename_lc.endsWith( ".zip" ) ) { + final String first_line = ForesterUtil.getFirstLine( file ).trim().toLowerCase(); + if ( first_line.startsWith( "<" ) ) { parser = new PhyloXmlParser(); if ( phyloxml_validate_against_xsd ) { final ClassLoader cl = PhyloXmlParser.class.getClassLoader(); @@ -118,10 +87,11 @@ public final class ParserUtils { } } } - else if ( filename_lc.endsWith( ".nexus" ) || filename_lc.endsWith( ".nex" ) || filename_lc.endsWith( ".nx" ) ) { + else if ( ( first_line.startsWith( "nexus" ) ) || ( first_line.startsWith( "#nexus" ) ) + || ( first_line.startsWith( "# nexus" ) ) || ( first_line.startsWith( "begin" ) ) ) { parser = new NexusPhylogeniesParser(); } - else if ( filename_lc.endsWith( ".nhx" ) || filename_lc.endsWith( ".nh" ) || filename_lc.endsWith( ".newick" ) ) { + else { parser = new NHXParser(); } return parser; @@ -138,12 +108,21 @@ public final class ParserUtils { return parser; } - final public static PhylogenyParser createParserDependingFileContents( final File file, - final boolean phyloxml_validate_against_xsd ) - throws FileNotFoundException, IOException { + /** + * Return null if it can not guess the parser to use based on name suffix. + * + * @param filename + * @return + */ + final public static PhylogenyParser createParserDependingOnSuffix( final String filename, + final boolean phyloxml_validate_against_xsd ) { PhylogenyParser parser = null; - final String first_line = ForesterUtil.getFirstLine( file ).trim().toLowerCase(); - if ( first_line.startsWith( "<" ) ) { + final String filename_lc = filename.toLowerCase(); + if ( filename_lc.endsWith( ".tol" ) || filename_lc.endsWith( ".tolxml" ) || filename_lc.endsWith( ".tol.zip" ) ) { + parser = new TolParser(); + } + else if ( filename_lc.endsWith( ".xml" ) || filename_lc.endsWith( ".px" ) || filename_lc.endsWith( "phyloxml" ) + || filename_lc.endsWith( ".zip" ) ) { parser = new PhyloXmlParser(); if ( phyloxml_validate_against_xsd ) { final ClassLoader cl = PhyloXmlParser.class.getClassLoader(); @@ -159,66 +138,54 @@ public final class ParserUtils { } } } - else if ( ( first_line.startsWith( "nexus" ) ) || ( first_line.startsWith( "#nexus" ) ) - || ( first_line.startsWith( "# nexus" ) ) || ( first_line.startsWith( "begin" ) ) ) { + else if ( filename_lc.endsWith( ".nexus" ) || filename_lc.endsWith( ".nex" ) || filename_lc.endsWith( ".nx" ) ) { parser = new NexusPhylogeniesParser(); } - else { + else if ( filename_lc.endsWith( ".nhx" ) || filename_lc.endsWith( ".nh" ) || filename_lc.endsWith( ".newick" ) + || filename_lc.endsWith( ".nwk" ) ) { parser = new NHXParser(); } return parser; } - /** - * Extracts a code if and only if: - * one and only one _, - * shorter than 25, - * no |, - * no ., - * if / present it has to be after the _, - * if PFAM_STYLE_ONLY: / must be present, - * tax code can only contain uppercase letters and numbers, - * and must contain at least one uppercase letter. - * Return null if no code extractable. - * - * @param name - * @param limit_to_five - * @return - */ - public static String extractTaxonomyCodeFromNodeName( final String name, - final boolean limit_to_five, - final PhylogenyMethods.TAXONOMY_EXTRACTION taxonomy_extraction ) { - if ( ( name.indexOf( "_" ) > 0 ) - && ( name.length() < 25 ) - && ( name.lastIndexOf( "_" ) == name.indexOf( "_" ) ) - && ( name.indexOf( "|" ) < 0 ) - && ( name.indexOf( "." ) < 0 ) - && ( ( taxonomy_extraction != PhylogenyMethods.TAXONOMY_EXTRACTION.PFAM_STYLE_ONLY ) || ( name - .indexOf( "/" ) >= 0 ) ) - && ( ( ( name.indexOf( "/" ) ) < 0 ) || ( name.indexOf( "/" ) > name.indexOf( "_" ) ) ) ) { - final String[] s = name.split( "[_/]" ); - if ( s.length > 1 ) { - String str = s[ 1 ]; - if ( limit_to_five ) { - if ( str.length() > 5 ) { - str = str.substring( 0, 5 ); + final public static PhylogenyParser createParserDependingOnUrlContents( final URL url, + final boolean phyloxml_validate_against_xsd ) + throws FileNotFoundException, IOException { + final String lc_filename = url.getFile().toString().toLowerCase(); + PhylogenyParser parser = createParserDependingOnSuffix( lc_filename, phyloxml_validate_against_xsd ); + if ( ( parser != null ) && lc_filename.endsWith( ".zip" ) ) { + if ( parser instanceof PhyloXmlParser ) { + ( ( PhyloXmlParser ) parser ).setZippedInputstream( true ); + } + else if ( parser instanceof TolParser ) { + ( ( TolParser ) parser ).setZippedInputstream( true ); + } + } + if ( parser == null ) { + final String first_line = ForesterUtil.getFirstLine( url ).trim().toLowerCase(); + if ( first_line.startsWith( "<" ) ) { + parser = new PhyloXmlParser(); + if ( phyloxml_validate_against_xsd ) { + final ClassLoader cl = PhyloXmlParser.class.getClassLoader(); + final URL xsd_url = cl.getResource( ForesterConstants.LOCAL_PHYLOXML_XSD_RESOURCE ); + if ( xsd_url != null ) { + ( ( PhyloXmlParser ) parser ).setValidateAgainstSchema( xsd_url.toString() ); } - else if ( ( str.length() < 5 ) && ( str.startsWith( "RAT" ) || str.startsWith( "PIG" ) ) ) { - str = str.substring( 0, 3 ); + else { + throw new RuntimeException( "failed to get URL for phyloXML XSD from jar file from [" + + ForesterConstants.LOCAL_PHYLOXML_XSD_RESOURCE + "]" ); } } - final Matcher letters_and_numbers = NHXParser.UC_LETTERS_NUMBERS_PATTERN.matcher( str ); - if ( !letters_and_numbers.matches() ) { - return null; - } - final Matcher numbers_only = NHXParser.NUMBERS_ONLY_PATTERN.matcher( str ); - if ( numbers_only.matches() ) { - return null; - } - return str; + } + else if ( ( first_line.startsWith( "nexus" ) ) || ( first_line.startsWith( "#nexus" ) ) + || ( first_line.startsWith( "# nexus" ) ) || ( first_line.startsWith( "begin" ) ) ) { + parser = new NexusPhylogeniesParser(); + } + else { + parser = new NHXParser(); } } - return null; + return parser; } public static BufferedReader createReader( final Object source ) throws IOException, FileNotFoundException { @@ -254,4 +221,133 @@ public final class ParserUtils { } return reader; } + + public final static String extractTaxonomyCodeFromNodeName( final String name, + final TAXONOMY_EXTRACTION taxonomy_extraction ) { + if ( ( name.indexOf( "_" ) > 0 ) + && ( ( taxonomy_extraction != TAXONOMY_EXTRACTION.PFAM_STYLE_ONLY ) || ( name.indexOf( "/" ) > 4 ) ) ) { + final String[] s = name.split( "[_\\s]" ); + if ( s.length > 1 ) { + final String str = s[ 1 ]; + if ( !ForesterUtil.isEmpty( str ) ) { + if ( taxonomy_extraction == TAXONOMY_EXTRACTION.PFAM_STYLE_ONLY ) { + final Matcher m = TAXOMONY_CODE_PATTERN_PF.matcher( str ); + if ( m.matches() ) { + return m.group( 1 ); + } + } + else { + final Matcher m1 = TAXOMONY_CODE_PATTERN_1.matcher( str ); + if ( m1.matches() ) { + return m1.group(); + } + final Matcher m2 = TAXOMONY_CODE_PATTERN_2.matcher( str ); + if ( m2.matches() ) { + return m2.group( 1 ); + } + } + } + } + } + else if ( taxonomy_extraction == TAXONOMY_EXTRACTION.YES ) { + final Matcher m1 = TAXOMONY_CODE_PATTERN_1.matcher( name ); + if ( m1.matches() ) { + return name; + } + } + return null; + } + + public final static String extractScientificNameFromNodeName( final String name ) { + final Matcher m1 = TAXOMONY_SN_PATTERN.matcher( name ); + if ( m1.matches() ) { + return m1.group( 1 ).replace( '_', ' ' ); + } + return null; + } + + public final static String extractTaxonomyDataFromNodeName( final PhylogenyNode node, + final NHXParser.TAXONOMY_EXTRACTION taxonomy_extraction ) + throws PhyloXmlDataFormatException { + final String id = extractUniprotTaxonomyIdFromNodeName( node.getName(), taxonomy_extraction ); + if ( !ForesterUtil.isEmpty( id ) ) { + if ( !node.getNodeData().isHasTaxonomy() ) { + node.getNodeData().setTaxonomy( new Taxonomy() ); + } + if ( ( node.getNodeData().getTaxonomy().getIdentifier() == null ) + || ForesterUtil.isEmpty( node.getNodeData().getTaxonomy().getIdentifier().getValue() ) ) { + node.getNodeData().getTaxonomy().setIdentifier( new Identifier( id, "uniprot" ) ); + return id; + } + } + else { + final String code = extractTaxonomyCodeFromNodeName( node.getName(), taxonomy_extraction ); + if ( !ForesterUtil.isEmpty( code ) ) { + if ( !node.getNodeData().isHasTaxonomy() ) { + node.getNodeData().setTaxonomy( new Taxonomy() ); + } + if ( ForesterUtil.isEmpty( node.getNodeData().getTaxonomy().getTaxonomyCode() ) ) { + node.getNodeData().getTaxonomy().setTaxonomyCode( code ); + return code; + } + } + else if ( taxonomy_extraction == TAXONOMY_EXTRACTION.YES ) { + final String sn = extractScientificNameFromNodeName( node.getName() ); + if ( !ForesterUtil.isEmpty( sn ) ) { + if ( !node.getNodeData().isHasTaxonomy() ) { + node.getNodeData().setTaxonomy( new Taxonomy() ); + } + if ( ForesterUtil.isEmpty( node.getNodeData().getTaxonomy().getScientificName() ) ) { + node.getNodeData().getTaxonomy().setScientificName( sn ); + return sn; + } + } + } + } + return null; + } + + public final static String extractUniprotTaxonomyIdFromNodeName( final String name, + final TAXONOMY_EXTRACTION taxonomy_extraction ) { + if ( ( name.indexOf( "_" ) > 0 ) + && ( ( taxonomy_extraction != TAXONOMY_EXTRACTION.PFAM_STYLE_ONLY ) || ( name.indexOf( "/" ) > 4 ) ) ) { + final String[] s = name.split( "[_\\s]" ); + if ( s.length > 1 ) { + final String str = s[ 1 ]; + if ( !ForesterUtil.isEmpty( str ) ) { + if ( taxonomy_extraction == TAXONOMY_EXTRACTION.PFAM_STYLE_ONLY ) { + final Matcher m = TAXOMONY_UNIPROT_ID_PATTERN_PF.matcher( str ); + if ( m.matches() ) { + return m.group( 1 ); + } + } + else { + final Matcher m1 = TAXOMONY_UNIPROT_ID_PATTERN_1.matcher( str ); + if ( m1.matches() ) { + return m1.group(); + } + final Matcher m2 = TAXOMONY_UNIPROT_ID_PATTERN_2.matcher( str ); + if ( m2.matches() ) { + return m2.group( 1 ); + } + } + } + } + } + else if ( taxonomy_extraction == TAXONOMY_EXTRACTION.YES ) { + final Matcher m1 = TAXOMONY_UNIPROT_ID_PATTERN_1.matcher( name ); + if ( m1.matches() ) { + return name; + } + } + return null; + } + + public final static Phylogeny[] readPhylogenies( final File file ) throws FileNotFoundException, IOException { + return PhylogenyMethods.readPhylogenies( ParserUtils.createParserDependingOnFileType( file, true ), file ); + } + + public final static Phylogeny[] readPhylogenies( final String file_name ) throws FileNotFoundException, IOException { + return readPhylogenies( new File( file_name ) ); + } }