X-Git-Url: http://source.jalview.org/gitweb/?a=blobdiff_plain;f=forester%2Fjava%2Fsrc%2Forg%2Fforester%2Fio%2Fparsers%2Futil%2FParserUtils.java;h=a1f842ee0df8353be6a37a3606bda2296cfd42d9;hb=44fddb76faa8975295b8b0ad38609256b5011ced;hp=09d5b247ebc1cb4d64c37d5b91c1722fa2041422;hpb=48f7a89be9d34f1930a1f863e608235cc27184c5;p=jalview.git diff --git a/forester/java/src/org/forester/io/parsers/util/ParserUtils.java b/forester/java/src/org/forester/io/parsers/util/ParserUtils.java index 09d5b24..a1f842e 100644 --- a/forester/java/src/org/forester/io/parsers/util/ParserUtils.java +++ b/forester/java/src/org/forester/io/parsers/util/ParserUtils.java @@ -6,7 +6,7 @@ // Copyright (C) 2008-2009 Christian M. Zmasek // Copyright (C) 2008-2009 Burnham Institute for Medical Research // All rights reserved -// +// // This library is free software; you can redistribute it and/or // modify it under the terms of the GNU Lesser General Public // License as published by the Free Software Foundation; either @@ -16,7 +16,7 @@ // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU // Lesser General Public License for more details. -// +// // You should have received a copy of the GNU Lesser General Public // License along with this library; if not, write to the Free Software // Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA @@ -34,9 +34,142 @@ import java.io.IOException; import java.io.InputStream; import java.io.InputStreamReader; import java.io.StringReader; +import java.net.URL; +import java.util.regex.Matcher; + +import org.forester.io.parsers.PhylogenyParser; +import org.forester.io.parsers.nexus.NexusPhylogeniesParser; +import org.forester.io.parsers.nhx.NHXParser; +import org.forester.io.parsers.phyloxml.PhyloXmlParser; +import org.forester.io.parsers.tol.TolParser; +import org.forester.phylogeny.Phylogeny; +import org.forester.phylogeny.PhylogenyMethods; +import org.forester.util.ForesterConstants; +import org.forester.util.ForesterUtil; public final class ParserUtils { + final public static PhylogenyParser createParserDependingFileContents( final File file, + final boolean phyloxml_validate_against_xsd ) + throws FileNotFoundException, IOException { + PhylogenyParser parser = null; + final String first_line = ForesterUtil.getFirstLine( file ).trim().toLowerCase(); + if ( first_line.startsWith( "<" ) ) { + parser = new PhyloXmlParser(); + if ( phyloxml_validate_against_xsd ) { + final ClassLoader cl = PhyloXmlParser.class.getClassLoader(); + final URL xsd_url = cl.getResource( ForesterConstants.LOCAL_PHYLOXML_XSD_RESOURCE ); + if ( xsd_url != null ) { + ( ( PhyloXmlParser ) parser ).setValidateAgainstSchema( xsd_url.toString() ); + } + else { + if ( ForesterConstants.RELEASE ) { + throw new RuntimeException( "failed to get URL for phyloXML XSD from jar file from [" + + ForesterConstants.LOCAL_PHYLOXML_XSD_RESOURCE + "]" ); + } + } + } + } + else if ( ( first_line.startsWith( "nexus" ) ) || ( first_line.startsWith( "#nexus" ) ) + || ( first_line.startsWith( "# nexus" ) ) || ( first_line.startsWith( "begin" ) ) ) { + parser = new NexusPhylogeniesParser(); + } + else { + parser = new NHXParser(); + } + return parser; + } + + final public static PhylogenyParser createParserDependingOnFileType( final File file, + final boolean phyloxml_validate_against_xsd ) + throws FileNotFoundException, IOException { + PhylogenyParser parser = null; + parser = ParserUtils.createParserDependingOnSuffix( file.getName(), phyloxml_validate_against_xsd ); + if ( parser == null ) { + parser = createParserDependingFileContents( file, phyloxml_validate_against_xsd ); + } + return parser; + } + + /** + * Return null if it can not guess the parser to use based on name suffix. + * + * @param filename + * @return + */ + final public static PhylogenyParser createParserDependingOnSuffix( final String filename, + final boolean phyloxml_validate_against_xsd ) { + PhylogenyParser parser = null; + final String filename_lc = filename.toLowerCase(); + if ( filename_lc.endsWith( ".tol" ) || filename_lc.endsWith( ".tolxml" ) || filename_lc.endsWith( ".tol.zip" ) ) { + parser = new TolParser(); + } + else if ( filename_lc.endsWith( ".xml" ) || filename_lc.endsWith( ".px" ) || filename_lc.endsWith( "phyloxml" ) + || filename_lc.endsWith( ".zip" ) ) { + parser = new PhyloXmlParser(); + if ( phyloxml_validate_against_xsd ) { + final ClassLoader cl = PhyloXmlParser.class.getClassLoader(); + final URL xsd_url = cl.getResource( ForesterConstants.LOCAL_PHYLOXML_XSD_RESOURCE ); + if ( xsd_url != null ) { + ( ( PhyloXmlParser ) parser ).setValidateAgainstSchema( xsd_url.toString() ); + } + else { + if ( ForesterConstants.RELEASE ) { + throw new RuntimeException( "failed to get URL for phyloXML XSD from jar file from [" + + ForesterConstants.LOCAL_PHYLOXML_XSD_RESOURCE + "]" ); + } + } + } + } + else if ( filename_lc.endsWith( ".nexus" ) || filename_lc.endsWith( ".nex" ) || filename_lc.endsWith( ".nx" ) ) { + parser = new NexusPhylogeniesParser(); + } + else if ( filename_lc.endsWith( ".nhx" ) || filename_lc.endsWith( ".nh" ) || filename_lc.endsWith( ".newick" ) ) { + parser = new NHXParser(); + } + return parser; + } + + final public static PhylogenyParser createParserDependingOnUrlContents( final URL url, + final boolean phyloxml_validate_against_xsd ) + throws FileNotFoundException, IOException { + final String lc_filename = url.getFile().toString().toLowerCase(); + PhylogenyParser parser = createParserDependingOnSuffix( lc_filename, phyloxml_validate_against_xsd ); + if ( ( parser != null ) && lc_filename.endsWith( ".zip" ) ) { + if ( parser instanceof PhyloXmlParser ) { + ( ( PhyloXmlParser ) parser ).setZippedInputstream( true ); + } + else if ( parser instanceof TolParser ) { + ( ( TolParser ) parser ).setZippedInputstream( true ); + } + } + if ( parser == null ) { + final String first_line = ForesterUtil.getFirstLine( url ).trim().toLowerCase(); + if ( first_line.startsWith( "<" ) ) { + parser = new PhyloXmlParser(); + if ( phyloxml_validate_against_xsd ) { + final ClassLoader cl = PhyloXmlParser.class.getClassLoader(); + final URL xsd_url = cl.getResource( ForesterConstants.LOCAL_PHYLOXML_XSD_RESOURCE ); + if ( xsd_url != null ) { + ( ( PhyloXmlParser ) parser ).setValidateAgainstSchema( xsd_url.toString() ); + } + else { + throw new RuntimeException( "failed to get URL for phyloXML XSD from jar file from [" + + ForesterConstants.LOCAL_PHYLOXML_XSD_RESOURCE + "]" ); + } + } + } + else if ( ( first_line.startsWith( "nexus" ) ) || ( first_line.startsWith( "#nexus" ) ) + || ( first_line.startsWith( "# nexus" ) ) || ( first_line.startsWith( "begin" ) ) ) { + parser = new NexusPhylogeniesParser(); + } + else { + parser = new NHXParser(); + } + } + return parser; + } + public static BufferedReader createReader( final Object source ) throws IOException, FileNotFoundException { BufferedReader reader = null; if ( ( source instanceof File ) || ( source instanceof String ) ) { @@ -70,4 +203,61 @@ public final class ParserUtils { } return reader; } + + /** + * Extracts a code if and only if: + * one and only one _, + * shorter than 25, + * no |, + * no ., + * if / present it has to be after the _, + * if PFAM_STYLE_ONLY: / must be present, + * tax code can only contain uppercase letters and numbers, + * and must contain at least one uppercase letter. + * Return null if no code extractable. + * + * @param name + * @param limit_to_five + * @return + */ + public static String extractTaxonomyCodeFromNodeName( final String name, + final boolean limit_to_five, + final PhylogenyMethods.TAXONOMY_EXTRACTION taxonomy_extraction ) { + if ( ( name.indexOf( "_" ) > 0 ) + && ( name.length() < 31 ) + // && ( name.lastIndexOf( "_" ) == name.indexOf( "_" ) ) + && ( name.indexOf( "|" ) < 0 ) + && ( name.indexOf( "." ) < 0 ) + && ( ( taxonomy_extraction != PhylogenyMethods.TAXONOMY_EXTRACTION.PFAM_STYLE_ONLY ) || ( name + .indexOf( "/" ) >= 0 ) ) + && ( ( ( name.indexOf( "/" ) ) < 0 ) || ( name.indexOf( "/" ) > name.indexOf( "_" ) ) ) ) { + final String[] s = name.split( "[_/]" ); + if ( s.length > 1 ) { + String str = s[ 1 ]; + if ( ( str.length() < 6 ) || ( !limit_to_five && ( str.length() < 7 ) ) ) { + if ( ( str.length() < 5 ) && ( str.startsWith( "RAT" ) || str.startsWith( "PIG" ) ) ) { + str = str.substring( 0, 3 ); + } + final Matcher uc_letters_and_numbers = NHXParser.UC_LETTERS_NUMBERS_PATTERN.matcher( str ); + if ( !uc_letters_and_numbers.matches() ) { + return null; + } + final Matcher numbers_only = NHXParser.NUMBERS_ONLY_PATTERN.matcher( str ); + if ( numbers_only.matches() ) { + return null; + } + return str; + } + } + } + return null; + } + + public final static Phylogeny[] readPhylogenies( final File file ) throws FileNotFoundException, IOException { + return PhylogenyMethods.readPhylogenies( ParserUtils.createParserDependingOnFileType( file, true ), file ); + } + + public final static Phylogeny[] readPhylogenies( final String file_name ) throws FileNotFoundException, IOException { + return readPhylogenies( new File( file_name ) ); + } }