X-Git-Url: http://source.jalview.org/gitweb/?a=blobdiff_plain;f=forester%2Fjava%2Fsrc%2Forg%2Fforester%2Fio%2Fparsers%2Futil%2FParserUtils.java;h=ef37f98d0aa8969d0696c804d121111c8efdc602;hb=e9776288e623af2543d3a5e396c60ff1411d01c3;hp=26e35b353483f80e90763d248494b4c91f309202;hpb=6062dfb954cafb6af22e01af89222888d9d5ba66;p=jalview.git diff --git a/forester/java/src/org/forester/io/parsers/util/ParserUtils.java b/forester/java/src/org/forester/io/parsers/util/ParserUtils.java index 26e35b3..ef37f98 100644 --- a/forester/java/src/org/forester/io/parsers/util/ParserUtils.java +++ b/forester/java/src/org/forester/io/parsers/util/ParserUtils.java @@ -55,22 +55,29 @@ import org.forester.util.ForesterUtil; public final class ParserUtils { - final public static String TAX_CODE = "(?:[A-Z9][A-Z]{2}[A-Z0-9]{2})|RAT|PIG|PEA|CAP"; + final public static String TAX_CODE = "(?:[A-Z9][A-Z]{2}[A-Z0-9]{2})|RAT|PIG|PEA"; + final public static Pattern TAXOMONY_CODE_PATTERN_A = Pattern.compile( "(?:\\b|_)(" + TAX_CODE + ")\\b" ); + final public static Pattern TAXOMONY_CODE_PATTERN_BRACKETED = Pattern.compile( "\\[(" + TAX_CODE + ")\\]" ); + final public static Pattern TAXOMONY_CODE_PATTERN_PFR = Pattern.compile( "(?:\\b|_)[a-zA-Z0-9]{3,}_(" + + TAX_CODE + ")\\b" ); final public static Pattern TAXOMONY_SN_PATTERN = Pattern - .compile( "(?:\\b|_)[a-zA-Z0-9]{3,}_([A-Z][a-z]+_[a-z]{2,}(?:_[a-z][a-z0-9_]+)?)\\b" ); + .compile( "(?:\\b|_)[a-zA-Z0-9]{3,}_([A-Z][a-z]{2,30}_[a-z]{3,30}(?:_[a-z][a-z0-9_]+)?)\\b" ); + final public static Pattern TAXOMONY_SN_PATTERN_SN = Pattern + .compile( "\\b([A-Z][a-z]{2,30}[_ ][a-z]{3,30}(?:[_ ][a-z]{3,30})?)(?:\\b|_)?" ); + final public static Pattern TAXOMONY_SN_PATTERN_STRAIN_1 = Pattern + .compile( "\\b([A-Z][a-z]{2,30}[_ ][a-z]{3,30}[_ ](?:str|subsp|var)[a-z]{0,5}\\.?[_ ]\\S{1,60})(?:\\b|_)" ); + final public static Pattern TAXOMONY_SN_PATTERN_STRAIN_2 = Pattern + .compile( "\\b([A-Z][a-z]{2,30}[_ ][a-z]{3,30}[_ ]\\((?:str|subsp|var)[a-z]{0,5}\\.?[_ ]\\S{1,60}\\))(?:\\b|_)?" ); + final public static Pattern TAXOMONY_SN_PATTERN_SP = Pattern + .compile( "\\b([A-Z][a-z]{2,30}[_ ]sp\\.)(?:\\b|_)?" ); + + final public static Pattern TAXOMONY_SN_PATTERN_GENUS = Pattern.compile( "([A-Z][a-z]{2,30})" ); final private static Pattern TAXOMONY_CODE_PATTERN_PFS = Pattern.compile( "(?:\\b|_)[A-Z0-9]{4,}_(" + TAX_CODE + ")/\\d+-\\d+\\b" ); - final public static Pattern TAXOMONY_CODE_PATTERN_PFR = Pattern.compile( "(?:\\b|_)[a-zA-Z0-9]{3,}_(" - + TAX_CODE + ")\\b" ); - final public static Pattern TAXOMONY_CODE_PATTERN_A = Pattern.compile( "(?:\\b|_)(" + TAX_CODE + ")\\b" ); - final public static Pattern TAXOMONY_CODE_PATTERN_4 = Pattern.compile( "\\[(" + TAX_CODE + ")\\]" ); - final public static Pattern TAXOMONY_CODE_PATTERN_6 = Pattern - .compile( "\\[([A-Z9][A-Z]{2}[A-Z0-9]{3})\\]" ); - final private static Pattern TAXOMONY_UNIPROT_ID_PATTERN_A = Pattern.compile( "(?:\\b|_)(\\d{1,7})\\b" ); + final private static Pattern TAXOMONY_UNIPROT_ID_PATTERN_PFR = Pattern + .compile( "(?:\\b|_)[A-Z0-9]{1,}_(\\d{1,7})\\b" ); final private static Pattern TAXOMONY_UNIPROT_ID_PATTERN_PFS = Pattern .compile( "(?:\\b|_)[A-Z0-9]{4,}_(\\d{1,7})/\\d+-\\d+\\b" ); - final private static Pattern TAXOMONY_UNIPROT_ID_PATTERN_PFR = Pattern - .compile( "(?:\\b|_)[a-zA-Z0-9]{3,}_(\\d{1,7})\\b" ); final public static PhylogenyParser createParserDependingFileContents( final File file, final boolean phyloxml_validate_against_xsd ) @@ -78,7 +85,7 @@ public final class ParserUtils { PhylogenyParser parser = null; final String first_line = ForesterUtil.getFirstLine( file ).trim().toLowerCase(); if ( first_line.startsWith( "<" ) ) { - parser = new PhyloXmlParser(); + parser = PhyloXmlParser.createPhyloXmlParser(); if ( phyloxml_validate_against_xsd ) { final ClassLoader cl = PhyloXmlParser.class.getClassLoader(); final URL xsd_url = cl.getResource( ForesterConstants.LOCAL_PHYLOXML_XSD_RESOURCE ); @@ -111,45 +118,13 @@ public final class ParserUtils { if ( parser == null ) { parser = createParserDependingFileContents( file, phyloxml_validate_against_xsd ); } - return parser; - } - - /** - * Return null if it can not guess the parser to use based on name suffix. - * - * @param filename - * @return - */ - final public static PhylogenyParser createParserDependingOnSuffix( final String filename, - final boolean phyloxml_validate_against_xsd ) { - PhylogenyParser parser = null; - final String filename_lc = filename.toLowerCase(); - if ( filename_lc.endsWith( ".tol" ) || filename_lc.endsWith( ".tolxml" ) || filename_lc.endsWith( ".tol.zip" ) ) { - parser = new TolParser(); - } - else if ( filename_lc.endsWith( ".xml" ) || filename_lc.endsWith( ".px" ) || filename_lc.endsWith( "phyloxml" ) - || filename_lc.endsWith( ".zip" ) ) { - parser = new PhyloXmlParser(); - if ( phyloxml_validate_against_xsd ) { - final ClassLoader cl = PhyloXmlParser.class.getClassLoader(); - final URL xsd_url = cl.getResource( ForesterConstants.LOCAL_PHYLOXML_XSD_RESOURCE ); - if ( xsd_url != null ) { - ( ( PhyloXmlParser ) parser ).setValidateAgainstSchema( xsd_url.toString() ); - } - else { - if ( ForesterConstants.RELEASE ) { - throw new RuntimeException( "failed to get URL for phyloXML XSD from jar file from [" - + ForesterConstants.LOCAL_PHYLOXML_XSD_RESOURCE + "]" ); - } - } + if ( ( parser != null ) && file.toString().toLowerCase().endsWith( ".zip" ) ) { + if ( parser instanceof PhyloXmlParser ) { + ( ( PhyloXmlParser ) parser ).setZippedInputstream( true ); + } + else if ( parser instanceof TolParser ) { + ( ( TolParser ) parser ).setZippedInputstream( true ); } - } - else if ( filename_lc.endsWith( ".nexus" ) || filename_lc.endsWith( ".nex" ) || filename_lc.endsWith( ".nx" ) ) { - parser = new NexusPhylogeniesParser(); - } - else if ( filename_lc.endsWith( ".nhx" ) || filename_lc.endsWith( ".nh" ) || filename_lc.endsWith( ".newick" ) - || filename_lc.endsWith( ".nwk" ) ) { - parser = new NHXParser(); } return parser; } @@ -159,18 +134,10 @@ public final class ParserUtils { throws FileNotFoundException, IOException { final String lc_filename = url.getFile().toString().toLowerCase(); PhylogenyParser parser = createParserDependingOnSuffix( lc_filename, phyloxml_validate_against_xsd ); - if ( ( parser != null ) && lc_filename.endsWith( ".zip" ) ) { - if ( parser instanceof PhyloXmlParser ) { - ( ( PhyloXmlParser ) parser ).setZippedInputstream( true ); - } - else if ( parser instanceof TolParser ) { - ( ( TolParser ) parser ).setZippedInputstream( true ); - } - } if ( parser == null ) { final String first_line = ForesterUtil.getFirstLine( url ).trim().toLowerCase(); if ( first_line.startsWith( "<" ) ) { - parser = new PhyloXmlParser(); + parser = PhyloXmlParser.createPhyloXmlParser(); if ( phyloxml_validate_against_xsd ) { final ClassLoader cl = PhyloXmlParser.class.getClassLoader(); final URL xsd_url = cl.getResource( ForesterConstants.LOCAL_PHYLOXML_XSD_RESOURCE ); @@ -191,6 +158,14 @@ public final class ParserUtils { parser = new NHXParser(); } } + if ( ( parser != null ) && lc_filename.endsWith( ".zip" ) ) { + if ( parser instanceof PhyloXmlParser ) { + ( ( PhyloXmlParser ) parser ).setZippedInputstream( true ); + } + else if ( parser instanceof TolParser ) { + ( ( TolParser ) parser ).setZippedInputstream( true ); + } + } return parser; } @@ -233,6 +208,25 @@ public final class ParserUtils { if ( m.find() ) { return m.group( 1 ).replace( '_', ' ' ); } + final Matcher m_str1 = TAXOMONY_SN_PATTERN_STRAIN_1.matcher( name ); + if ( m_str1.find() ) { + return m_str1.group( 1 ).replace( '_', ' ' ); + } + final Matcher m_str2 = TAXOMONY_SN_PATTERN_STRAIN_2.matcher( name ); + if ( m_str2.find() ) { + return m_str2.group( 1 ).replace( '_', ' ' ); + } + final Matcher m_sn = TAXOMONY_SN_PATTERN_SN.matcher( name ); + + if ( m_sn.find() ) { + return m_sn.group( 1 ).replace( '_', ' ' ); + } + + final Matcher m_sp = TAXOMONY_SN_PATTERN_SP.matcher( name ); + + if ( m_sp.find() ) { + return m_sp.group( 1 ).replace( '_', ' ' ); + } return null; } @@ -307,12 +301,12 @@ public final class ParserUtils { if ( m.find() ) { return m.group( 1 ); } - else if ( taxonomy_extraction == TAXONOMY_EXTRACTION.AGGRESSIVE ) { - m = TAXOMONY_UNIPROT_ID_PATTERN_A.matcher( name ); - if ( m.find() ) { - return m.group( 1 ); - } - } + //else if ( taxonomy_extraction == TAXONOMY_EXTRACTION.AGGRESSIVE ) { + // m = TAXOMONY_UNIPROT_ID_PATTERN_A.matcher( name ); + // if ( m.find() ) { + // return m.group( 1 ); + // } + //} } return null; } @@ -324,4 +318,43 @@ public final class ParserUtils { public final static Phylogeny[] readPhylogenies( final String file_name ) throws FileNotFoundException, IOException { return readPhylogenies( new File( file_name ) ); } + + /** + * Return null if it can not guess the parser to use based on name suffix. + * + * @param filename + * @return + */ + final private static PhylogenyParser createParserDependingOnSuffix( final String filename, + final boolean phyloxml_validate_against_xsd ) { + PhylogenyParser parser = null; + final String filename_lc = filename.toLowerCase(); + if ( filename_lc.endsWith( ".tol" ) || filename_lc.endsWith( ".tolxml" ) || filename_lc.endsWith( ".tol.zip" ) ) { + parser = new TolParser(); + } + else if ( filename_lc.endsWith( ".xml" ) || filename_lc.endsWith( "phyloxml" ) || filename_lc.endsWith( ".zip" ) ) { + parser = PhyloXmlParser.createPhyloXmlParser(); + if ( phyloxml_validate_against_xsd ) { + final ClassLoader cl = PhyloXmlParser.class.getClassLoader(); + final URL xsd_url = cl.getResource( ForesterConstants.LOCAL_PHYLOXML_XSD_RESOURCE ); + if ( xsd_url != null ) { + ( ( PhyloXmlParser ) parser ).setValidateAgainstSchema( xsd_url.toString() ); + } + else { + if ( ForesterConstants.RELEASE ) { + throw new RuntimeException( "failed to get URL for phyloXML XSD from jar file from [" + + ForesterConstants.LOCAL_PHYLOXML_XSD_RESOURCE + "]" ); + } + } + } + } + else if ( filename_lc.endsWith( ".nexus" ) || filename_lc.endsWith( ".nex" ) || filename_lc.endsWith( ".nx" ) ) { + parser = new NexusPhylogeniesParser(); + } + else if ( filename_lc.endsWith( ".nhx" ) || filename_lc.endsWith( ".nh" ) || filename_lc.endsWith( ".newick" ) + || filename_lc.endsWith( ".nwk" ) ) { + parser = new NHXParser(); + } + return parser; + } }