forester/java/src/org/forester/io/parsers/util/ParserUtils.java

   1 // $Id:
   2 //
   3 // FORESTER -- software libraries and applications
   4 // for evolutionary biology research and applications.
   5 //
   6 // Copyright (C) 2008-2009 Christian M. Zmasek
   7 // Copyright (C) 2008-2009 Burnham Institute for Medical Research
   8 // All rights reserved
   9 //
  10 // This library is free software; you can redistribute it and/or
  11 // modify it under the terms of the GNU Lesser General Public
  12 // License as published by the Free Software Foundation; either
  13 // version 2.1 of the License, or (at your option) any later version.
  14 //
  15 // This library is distributed in the hope that it will be useful,
  16 // but WITHOUT ANY WARRANTY; without even the implied warranty of
  17 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  18 // Lesser General Public License for more details.
  19 //
  20 // You should have received a copy of the GNU Lesser General Public
  21 // License along with this library; if not, write to the Free Software
  22 // Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA
  23 //
  24 // Contact: phylosoft @ gmail . com
  25 // WWW: www.phylosoft.org/
  26
  27 package org.forester.io.parsers.util;
  28
import java.io.BufferedReader;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.StringReader;
import java.net.URL;
import java.util.Locale;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
  40
  41 import org.forester.io.parsers.PhylogenyParser;
  42 import org.forester.io.parsers.nexus.NexusPhylogeniesParser;
  43 import org.forester.io.parsers.nhx.NHXParser;
  44 import org.forester.io.parsers.phyloxml.PhyloXmlParser;
  45 import org.forester.io.parsers.tol.TolParser;
  46 import org.forester.phylogeny.Phylogeny;
  47 import org.forester.phylogeny.PhylogenyMethods;
  48 import org.forester.util.ForesterConstants;
  49 import org.forester.util.ForesterUtil;
  50
  51 public final class ParserUtils {
  52
  53     final private static Pattern TAXOMONY_CODE_PATTERN_1 = Pattern.compile( "[A-Z0-9]{5}" );
  54     final private static Pattern TAXOMONY_CODE_PATTERN_2 = Pattern.compile( "([A-Z0-9]{5})[^A-Z].*" );
  55
  56     final public static PhylogenyParser createParserDependingFileContents( final File file,
  57                                                                            final boolean phyloxml_validate_against_xsd )
  58             throws FileNotFoundException, IOException {
  59         PhylogenyParser parser = null;
  60         final String first_line = ForesterUtil.getFirstLine( file ).trim().toLowerCase();
  61         if ( first_line.startsWith( "<" ) ) {
  62             parser = new PhyloXmlParser();
  63             if ( phyloxml_validate_against_xsd ) {
  64                 final ClassLoader cl = PhyloXmlParser.class.getClassLoader();
  65                 final URL xsd_url = cl.getResource( ForesterConstants.LOCAL_PHYLOXML_XSD_RESOURCE );
  66                 if ( xsd_url != null ) {
  67                     ( ( PhyloXmlParser ) parser ).setValidateAgainstSchema( xsd_url.toString() );
  68                 }
  69                 else {
  70                     if ( ForesterConstants.RELEASE ) {
  71                         throw new RuntimeException( "failed to get URL for phyloXML XSD from jar file from ["
  72                                 + ForesterConstants.LOCAL_PHYLOXML_XSD_RESOURCE + "]" );
  73                     }
  74                 }
  75             }
  76         }
  77         else if ( ( first_line.startsWith( "nexus" ) ) || ( first_line.startsWith( "#nexus" ) )
  78                 || ( first_line.startsWith( "# nexus" ) ) || ( first_line.startsWith( "begin" ) ) ) {
  79             parser = new NexusPhylogeniesParser();
  80         }
  81         else {
  82             parser = new NHXParser();
  83         }
  84         return parser;
  85     }
  86
  87     final public static PhylogenyParser createParserDependingOnFileType( final File file,
  88                                                                          final boolean phyloxml_validate_against_xsd )
  89             throws FileNotFoundException, IOException {
  90         PhylogenyParser parser = null;
  91         parser = ParserUtils.createParserDependingOnSuffix( file.getName(), phyloxml_validate_against_xsd );
  92         if ( parser == null ) {
  93             parser = createParserDependingFileContents( file, phyloxml_validate_against_xsd );
  94         }
  95         return parser;
  96     }
  97
  98     /**
  99      * Return null if it can not guess the parser to use based on name suffix.
 100      *
 101      * @param filename
 102      * @return
 103      */
 104     final public static PhylogenyParser createParserDependingOnSuffix( final String filename,
 105                                                                        final boolean phyloxml_validate_against_xsd ) {
 106         PhylogenyParser parser = null;
 107         final String filename_lc = filename.toLowerCase();
 108         if ( filename_lc.endsWith( ".tol" ) || filename_lc.endsWith( ".tolxml" ) || filename_lc.endsWith( ".tol.zip" ) ) {
 109             parser = new TolParser();
 110         }
 111         else if ( filename_lc.endsWith( ".xml" ) || filename_lc.endsWith( ".px" ) || filename_lc.endsWith( "phyloxml" )
 112                 || filename_lc.endsWith( ".zip" ) ) {
 113             parser = new PhyloXmlParser();
 114             if ( phyloxml_validate_against_xsd ) {
 115                 final ClassLoader cl = PhyloXmlParser.class.getClassLoader();
 116                 final URL xsd_url = cl.getResource( ForesterConstants.LOCAL_PHYLOXML_XSD_RESOURCE );
 117                 if ( xsd_url != null ) {
 118                     ( ( PhyloXmlParser ) parser ).setValidateAgainstSchema( xsd_url.toString() );
 119                 }
 120                 else {
 121                     if ( ForesterConstants.RELEASE ) {
 122                         throw new RuntimeException( "failed to get URL for phyloXML XSD from jar file from ["
 123                                 + ForesterConstants.LOCAL_PHYLOXML_XSD_RESOURCE + "]" );
 124                     }
 125                 }
 126             }
 127         }
 128         else if ( filename_lc.endsWith( ".nexus" ) || filename_lc.endsWith( ".nex" ) || filename_lc.endsWith( ".nx" ) ) {
 129             parser = new NexusPhylogeniesParser();
 130         }
 131         else if ( filename_lc.endsWith( ".nhx" ) || filename_lc.endsWith( ".nh" ) || filename_lc.endsWith( ".newick" )
 132                 || filename_lc.endsWith( ".nwk" ) ) {
 133             parser = new NHXParser();
 134         }
 135         return parser;
 136     }
 137
 138     final public static PhylogenyParser createParserDependingOnUrlContents( final URL url,
 139                                                                             final boolean phyloxml_validate_against_xsd )
 140             throws FileNotFoundException, IOException {
 141         final String lc_filename = url.getFile().toString().toLowerCase();
 142         PhylogenyParser parser = createParserDependingOnSuffix( lc_filename, phyloxml_validate_against_xsd );
 143         if ( ( parser != null ) && lc_filename.endsWith( ".zip" ) ) {
 144             if ( parser instanceof PhyloXmlParser ) {
 145                 ( ( PhyloXmlParser ) parser ).setZippedInputstream( true );
 146             }
 147             else if ( parser instanceof TolParser ) {
 148                 ( ( TolParser ) parser ).setZippedInputstream( true );
 149             }
 150         }
 151         if ( parser == null ) {
 152             final String first_line = ForesterUtil.getFirstLine( url ).trim().toLowerCase();
 153             if ( first_line.startsWith( "<" ) ) {
 154                 parser = new PhyloXmlParser();
 155                 if ( phyloxml_validate_against_xsd ) {
 156                     final ClassLoader cl = PhyloXmlParser.class.getClassLoader();
 157                     final URL xsd_url = cl.getResource( ForesterConstants.LOCAL_PHYLOXML_XSD_RESOURCE );
 158                     if ( xsd_url != null ) {
 159                         ( ( PhyloXmlParser ) parser ).setValidateAgainstSchema( xsd_url.toString() );
 160                     }
 161                     else {
 162                         throw new RuntimeException( "failed to get URL for phyloXML XSD from jar file from ["
 163                                 + ForesterConstants.LOCAL_PHYLOXML_XSD_RESOURCE + "]" );
 164                     }
 165                 }
 166             }
 167             else if ( ( first_line.startsWith( "nexus" ) ) || ( first_line.startsWith( "#nexus" ) )
 168                     || ( first_line.startsWith( "# nexus" ) ) || ( first_line.startsWith( "begin" ) ) ) {
 169                 parser = new NexusPhylogeniesParser();
 170             }
 171             else {
 172                 parser = new NHXParser();
 173             }
 174         }
 175         return parser;
 176     }
 177
 178     public static BufferedReader createReader( final Object source ) throws IOException, FileNotFoundException {
 179         BufferedReader reader = null;
 180         if ( ( source instanceof File ) || ( source instanceof String ) ) {
 181             File f = null;
 182             if ( source instanceof File ) {
 183                 f = ( File ) source;
 184             }
 185             else {
 186                 f = new File( ( String ) source );
 187             }
 188             if ( !f.exists() ) {
 189                 throw new IOException( "[" + f.getAbsolutePath() + "] does not exist" );
 190             }
 191             else if ( !f.isFile() ) {
 192                 throw new IOException( "[" + f.getAbsolutePath() + "] is not a file" );
 193             }
 194             else if ( !f.canRead() ) {
 195                 throw new IOException( "[" + f.getAbsolutePath() + "] is not a readable" );
 196             }
 197             reader = new BufferedReader( new FileReader( f ) );
 198         }
 199         else if ( source instanceof InputStream ) {
 200             reader = new BufferedReader( new InputStreamReader( ( InputStream ) source ) );
 201         }
 202         else if ( ( source instanceof StringBuffer ) || ( source instanceof StringBuilder ) ) {
 203             reader = new BufferedReader( new StringReader( source.toString() ) );
 204         }
 205         else {
 206             throw new IllegalArgumentException( "attempt to parse object of type [" + source.getClass()
 207                     + "] (can only parse objects of type File/String, InputStream, StringBuffer, or StringBuilder)" );
 208         }
 209         return reader;
 210     }
 211
 212     /**
 213      * Extracts a code if and only if:
 214      * one and only one _,
 215      * shorter than 25,
 216      * no |,
 217      * no .,
 218      * if / present it has to be after the _,
 219      * if PFAM_STYLE_ONLY: / must be present,
 220      * tax code can only contain uppercase letters and numbers,
 221      * and must contain at least one uppercase letter.
 222      * Return null if no code extractable.
 223      *
 224      * @param name
 225      * @return
 226      */
 227     public static String extractTaxonomyCodeFromNodeName( final String name,
 228                                                           final PhylogenyMethods.TAXONOMY_EXTRACTION taxonomy_extraction ) {
 229         if ( ( name.indexOf( "_" ) > 0 )
 230                 && ( name.length() < 31 )
 231                 //  && ( name.lastIndexOf( "_" ) == name.indexOf( "_" ) )
 232                 && ( name.indexOf( "|" ) < 0 )
 233                 && ( name.indexOf( "." ) < 0 )
 234                 && ( ( taxonomy_extraction != PhylogenyMethods.TAXONOMY_EXTRACTION.PFAM_STYLE_ONLY ) || ( name
 235                         .indexOf( "/" ) >= 0 ) )
 236                 && ( ( ( name.indexOf( "/" ) ) < 0 ) || ( name.indexOf( "/" ) > name.indexOf( "_" ) ) ) ) {
 237             final String[] s = name.split( "[_/]" );
 238             if ( s.length > 1 ) {
 239                 final String str = s[ 1 ];
 240                 //   if (  str.length() < 6  ) {
 241                 if ( ( str.length() < 5 ) && ( str.startsWith( "RAT" ) || str.startsWith( "PIG" ) ) ) {
 242                     return str.substring( 0, 3 );
 243                 }
 244                 final Matcher m1 = TAXOMONY_CODE_PATTERN_1.matcher( str );
 245                 if ( m1.matches() ) {
 246                     return m1.group();
 247                 }
 248                 final Matcher m2 = TAXOMONY_CODE_PATTERN_2.matcher( str );
 249                 if ( m2.matches() ) {
 250                     return m2.group( 1 );
 251                 }
 252                 // return null;
 253                 //                final Matcher uc_letters_and_numbers = NHXParser.UC_LETTERS_NUMBERS_PATTERN.matcher( str );
 254                 //                if ( !uc_letters_and_numbers.matches() ) {
 255                 //                    return null;
 256                 //                }
 257                 //                final Matcher numbers_only = NHXParser.NUMBERS_ONLY_PATTERN.matcher( str );
 258                 //                if ( numbers_only.matches() ) {
 259                 //                    return null;
 260                 //                }
 261                 //                return str;
 262                 //  }
 263             }
 264         }
 265         return null;
 266     }
 267
 268     public final static Phylogeny[] readPhylogenies( final File file ) throws FileNotFoundException, IOException {
 269         return PhylogenyMethods.readPhylogenies( ParserUtils.createParserDependingOnFileType( file, true ), file );
 270     }
 271
 272     public final static Phylogeny[] readPhylogenies( final String file_name ) throws FileNotFoundException, IOException {
 273         return readPhylogenies( new File( file_name ) );
 274     }
 275 }