src/jalview/io/IdentifyFile.java

   1 /*
   2  * Jalview - A Sequence Alignment Editor and Viewer ($$Version-Rel$$)
   3  * Copyright (C) $$Year-Rel$$ The Jalview Authors
   4  *
   5  * This file is part of Jalview.
   6  *
   7  * Jalview is free software: you can redistribute it and/or
   8  * modify it under the terms of the GNU General Public License
   9  * as published by the Free Software Foundation, either version 3
  10  * of the License, or (at your option) any later version.
  11  *
  12  * Jalview is distributed in the hope that it will be useful, but
  13  * WITHOUT ANY WARRANTY; without even the implied warranty
  14  * of MERCHANTABILITY or FITNESS FOR A PARTICULAR
  15  * PURPOSE.  See the GNU General Public License for more details.
  16  *
  17  * You should have received a copy of the GNU General Public License
  18  * along with Jalview.  If not, see <http://www.gnu.org/licenses/>.
  19  * The Jalview Authors are detailed in the 'AUTHORS' file.
  20  */
  21 package jalview.io;
  22
  23 import java.util.Locale;
  24 import java.io.File;
  25 import java.io.IOException;
  26
  27 /**
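 * Identifies the format of a file or other data source by reading its content
 * and looking for format-specific signatures (for example FASTA, PIR, BLC,
 * Stockholm, MSF, Clustal, PDB, mmCIF, GFF/features, JSON or HTML).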
  29  *
  30  * @author $author$
  31  * @version $Revision$
  32  */
  33 public class IdentifyFile
  34 {
  35
  36   public FileFormatI identify(Object file, DataSourceType protocol)
  37           throws FileFormatException
  38   {
  39     // BH 2018
  40     return (file instanceof File ? identify((File) file, protocol)
  41             : identify((String) file, protocol));
  42
  43   }
  44
  45   public FileFormatI identify(File file, DataSourceType sourceType)
  46           throws FileFormatException
  47   {
  48     // BH 2018
  49     String emessage = "UNIDENTIFIED FILE PARSING ERROR";
  50     FileParse parser = null;
  51     try
  52     {
  53       parser = new FileParse(file, sourceType);
  54       if (parser.isValid())
  55       {
  56         return identify(parser);
  57       }
  58     } catch (Exception e)
  59     {
  60       System.err.println("Error whilst identifying " + file);
  61       e.printStackTrace(System.err);
  62       emessage = e.getMessage();
  63     }
  64     if (parser != null)
  65     {
  66       throw new FileFormatException(parser.errormessage);
  67     }
  68     throw new FileFormatException(emessage);
  69   }
  70
  71   /**
  72    * Identify a datasource's file content.
  73    *
  74    * @note Do not use this method for stream sources - create a FileParse object
  75    *       instead.
  76    *
  77    * @param file
  78    * @param sourceType
  79    * @return
  80    * @throws FileFormatException
  81    */
  82   public FileFormatI identify(String file, DataSourceType sourceType)
  83           throws FileFormatException
  84   {
  85     String emessage = "UNIDENTIFIED FILE PARSING ERROR";
  86     FileParse parser = null;
  87     try
  88     {
  89       parser = new FileParse(file, sourceType);
  90       if (parser.isValid())
  91       {
  92         return identify(parser);
  93       }
  94     } catch (Exception e)
  95     {
  96       System.err.println("Error whilst identifying " + file);
  97       e.printStackTrace(System.err);
  98       emessage = e.getMessage();
  99     }
 100     if (parser != null)
 101     {
 102       throw new FileFormatException(parser.errormessage);
 103     }
 104     throw new FileFormatException(emessage);
 105   }
 106
 107   public FileFormatI identify(FileParse source) throws FileFormatException
 108   {
 109     return identify(source, true);
 110     // preserves original behaviour prior to version 2.3
 111   }
 112
 113   public FileFormatI identify(AlignmentFileReaderI file,
 114           boolean closeSource) throws IOException
 115   {
 116     FileParse fp = new FileParse(file.getInFile(),
 117             file.getDataSourceType());
 118     return identify(fp, closeSource);
 119   }
 120
 121   /**
 122    * Identify contents of source, closing it or resetting source to start
 123    * afterwards.
 124    *
 125    * @param source
 126    * @param closeSource
 127    * @return (best guess at) file format
 128    * @throws FileFormatException
 129    */
 130   public FileFormatI identify(FileParse source, boolean closeSource)
 131           throws FileFormatException
 132   {
 133     FileFormatI reply = FileFormat.Pfam;
 134     String data;
 135     int bytesRead = 0;
 136     int trimmedLength = 0;
 137     boolean lineswereskipped = false;
 138     boolean isBinary = false; // true if length is non-zero and non-printable
 139     // characters are encountered
 140
 141     try
 142     {
 143       if (!closeSource)
 144       {
 145         source.mark();
 146       }
 147       boolean aaIndexHeaderRead = false;
 148
 149       while ((data = source.nextLine()) != null)
 150       {
 151         bytesRead += data.length();
 152         trimmedLength += data.trim().length();
 153         if (!lineswereskipped)
 154         {
 155           for (int i = 0; !isBinary && i < data.length(); i++)
 156           {
 157             char c = data.charAt(i);
 158             isBinary = (c < 32 && c != '\t' && c != '\n' && c != '\r'
 159                     && c != 5 && c != 27); // nominal binary character filter
 160             // excluding CR, LF, tab,DEL and ^E
 161             // for certain blast ids
 162           }
 163         }
 164         if (isBinary)
 165         {
 166           // jar files are special - since they contain all sorts of random
 167           // characters.
 168           if (source.inFile != null)
 169           {
 170             String fileStr = source.inFile.getName();
 171             if (fileStr.contains(".jar") || fileStr.contains(".zip")
 172                     || fileStr.contains(".jvp"))
 173             {
 174               // possibly a Jalview archive (but check further)
 175               reply = FileFormat.Jalview;
 176             }
 177           }
 178           if (!lineswereskipped && data.startsWith("PK"))
 179           {
 180             reply = FileFormat.Jalview; // archive
 181             break;
 182           }
 183         }
 184         data = data.toUpperCase(Locale.ROOT);
 185
 186         if (data.startsWith(ScoreMatrixFile.SCOREMATRIX))
 187         {
 188           reply = FileFormat.ScoreMatrix;
 189           break;
 190         }
 191         if (data.startsWith("HMMER3"))
 192         {
 193           reply = FileFormat.HMMER3;
 194           break;
 195         }
 196         if (data.startsWith("LOCUS"))
 197         {
 198           reply = FileFormat.GenBank;
 199           break;
 200         }
 201         if (data.startsWith("ID "))
 202         {
 203           if (data.substring(2).trim().split(";").length == 7)
 204           {
 205             reply = FileFormat.Embl;
 206             break;
 207           }
 208         }
 209         if (data.startsWith("H ") && !aaIndexHeaderRead)
 210         {
 211           aaIndexHeaderRead = true;
 212         }
 213         if (data.startsWith("D ") && aaIndexHeaderRead)
 214         {
 215           reply = FileFormat.ScoreMatrix;
 216           break;
 217         }
 218         if (data.startsWith("##GFF-VERSION"))
 219         {
 220           // GFF - possibly embedded in a Jalview features file!
 221           reply = FileFormat.Features;
 222           break;
 223         }
 224         if (looksLikeFeatureData(data))
 225         {
 226           reply = FileFormat.Features;
 227           break;
 228         }
 229         if (data.indexOf("# STOCKHOLM") > -1)
 230         {
 231           reply = FileFormat.Stockholm;
 232           break;
 233         }
 234         if (data.indexOf("_ENTRY.ID") > -1
 235                 || data.indexOf("_AUDIT_AUTHOR.NAME") > -1
 236                 || data.indexOf("_ATOM_SITE.") > -1)
 237         {
 238           reply = FileFormat.MMCif;
 239           break;
 240         }
 241         // if (data.indexOf(">") > -1)
 242         if (data.startsWith(">"))
 243         {
 244           // FASTA, PIR file or BLC file
 245           boolean checkPIR = false, starterm = false;
 246           if ((data.indexOf(">P1;") > -1) || (data.indexOf(">DL;") > -1))
 247           {
 248             // watch for PIR file attributes
 249             checkPIR = true;
 250             reply = FileFormat.PIR;
 251           }
 252           // could also be BLC file, read next line to confirm
 253           data = source.nextLine();
 254
 255           if (data.indexOf(">") > -1)
 256           {
 257             reply = FileFormat.BLC;
 258           }
 259           else
 260           {
 261             // Is this a single line BLC file?
 262             String data1 = source.nextLine();
 263             String data2 = source.nextLine();
 264             int c1;
 265             if (checkPIR)
 266             {
 267               starterm = (data1 != null && data1.indexOf("*") > -1)
 268                       || (data2 != null && data2.indexOf("*") > -1);
 269             }
 270             if (data2 != null && (c1 = data.indexOf("*")) > -1)
 271             {
 272               if (c1 == 0 && c1 == data2.indexOf("*"))
 273               {
 274                 reply = FileFormat.BLC;
 275               }
 276               else
 277               {
 278                 reply = FileFormat.Fasta; // possibly a bad choice - may be
 279                                           // recognised as
 280                 // PIR
 281               }
 282               // otherwise can still possibly be a PIR file
 283             }
 284             else
 285             {
 286               reply = FileFormat.Fasta;
 287               // TODO : AMSA File is indicated if there is annotation in the
 288               // FASTA file - but FASTA will automatically generate this at the
 289               // mo.
 290               if (!checkPIR)
 291               {
 292                 break;
 293               }
 294             }
 295           }
 296           // final check for PIR content. require
 297           // >P1;title\n<blah>\nterminated sequence to occur at least once.
 298
 299           // TODO the PIR/fasta ambiguity may be the use case that is needed to
 300           // have
 301           // a 'Parse as type XXX' parameter for the applet/application.
 302           if (checkPIR)
 303           {
 304             String dta = null;
 305             if (!starterm)
 306             {
 307               do
 308               {
 309                 try
 310                 {
 311                   dta = source.nextLine();
 312                 } catch (IOException ex)
 313                 {
 314                 }
 315                 if (dta != null && dta.indexOf("*") > -1)
 316                 {
 317                   starterm = true;
 318                 }
 319               } while (dta != null && !starterm);
 320             }
 321             if (starterm)
 322             {
 323               reply = FileFormat.PIR;
 324               break;
 325             }
 326             else
 327             {
 328               reply = FileFormat.Fasta; // probably a bad choice!
 329             }
 330           }
 331           // read as a FASTA (probably)
 332           break;
 333         }
 334         if (data.indexOf("{\"") > -1)
 335         {
 336           reply = FileFormat.Json;
 337           break;
 338         }
 339         int lessThan = data.indexOf("<");
 340         if ((lessThan > -1)) // possible Markup Language data i.e HTML,
 341                              // RNAML, XML
 342         {
 343           String upper = data.toUpperCase(Locale.ROOT);
 344           if (upper.substring(lessThan).startsWith("<HTML"))
 345           {
 346             reply = FileFormat.Html;
 347             break;
 348           }
 349           if (upper.substring(lessThan).startsWith("<RNAML"))
 350           {
 351             reply = FileFormat.Rnaml;
 352             break;
 353           }
 354           if (upper.substring(lessThan).startsWith("<BSML"))
 355           {
 356             reply = FileFormat.BSML;
 357             break;
 358           }
 359         }
 360
 361         if ((data.length() < 1) || (data.indexOf("#") == 0))
 362         {
 363           lineswereskipped = true;
 364           continue;
 365         }
 366
 367         if (data.indexOf("PILEUP") > -1)
 368         {
 369           reply = FileFormat.Pileup;
 370
 371           break;
 372         }
 373
 374         if ((data.indexOf("//") == 0) || ((data.indexOf("!!") > -1) && (data
 375                 .indexOf("!!") < data.indexOf("_MULTIPLE_ALIGNMENT "))))
 376         {
 377           reply = FileFormat.MSF;
 378
 379           break;
 380         }
 381         else if (data.indexOf("CLUSTAL") > -1)
 382         {
 383           reply = FileFormat.Clustal;
 384
 385           break;
 386         }
 387
 388         else if (data.indexOf("HEADER") == 0 || data.indexOf("ATOM") == 0)
 389         {
 390           reply = FileFormat.PDB;
 391           break;
 392         }
 393         else if (data.matches("\\s*\\d+\\s+\\d+\\s*"))
 394         {
 395           reply = FileFormat.Phylip;
 396           break;
 397         }
 398         else
 399         {
 400           if (!lineswereskipped && looksLikeJnetData(data))
 401           {
 402             reply = FileFormat.Jnet;
 403             break;
 404           }
 405         }
 406
 407         lineswereskipped = true; // this means there was some junk before any
 408         // key file signature
 409       }
 410       if (closeSource)
 411       {
 412         source.close();
 413       }
 414       else
 415       {
 416         source.reset(bytesRead); // so the file can be parsed from the mark
 417       }
 418     } catch (Exception ex)
 419     {
 420       System.err.println("File Identification failed!\n" + ex);
 421       throw new FileFormatException(source.errormessage);
 422     }
 423     if (trimmedLength == 0)
 424     {
 425       System.err.println(
 426               "File Identification failed! - Empty file was read.");
 427       throw new FileFormatException("EMPTY DATA FILE");
 428     }
 429     System.out.println("File format identified as " + reply.toString());
 430     return reply;
 431   }
 432
 433   /**
 434    * Returns true if the data appears to be Jnet concise annotation format
 435    *
 436    * @param data
 437    * @return
 438    */
 439   protected boolean looksLikeJnetData(String data)
 440   {
 441     char firstChar = data.charAt(0);
 442     int colonPos = data.indexOf(":");
 443     int commaPos = data.indexOf(",");
 444     boolean isJnet = firstChar != '*' && firstChar != ' ' && colonPos > -1
 445             && commaPos > -1 && colonPos < commaPos;
 446     // && data.indexOf(",")<data.indexOf(",", data.indexOf(","))) / ??
 447     return isJnet;
 448   }
 449
 450   /**
 451    * Returns true if the data has at least 6 tab-delimited fields _and_ fields 4
 452    * and 5 are integer (start/end)
 453    *
 454    * @param data
 455    * @return
 456    */
 457   protected boolean looksLikeFeatureData(String data)
 458   {
 459     if (data == null)
 460     {
 461       return false;
 462     }
 463     String[] columns = data.split("\t");
 464     if (columns.length < 6)
 465     {
 466       return false;
 467     }
 468     for (int col = 3; col < 5; col++)
 469     {
 470       try
 471       {
 472         Integer.parseInt(columns[col]);
 473       } catch (NumberFormatException e)
 474       {
 475         return false;
 476       }
 477     }
 478     return true;
 479   }
 480
 481   /**
 482    *
 483    * @param args
 484    * @j2sIgnore
 485    */
 486   public static void main(String[] args)
 487   {
 488     for (int i = 0; args != null && i < args.length; i++)
 489     {
 490       IdentifyFile ider = new IdentifyFile();
 491       FileFormatI type = null;
 492       try
 493       {
 494         type = ider.identify(args[i], DataSourceType.FILE);
 495       } catch (FileFormatException e)
 496       {
 497         System.err.println(
 498                 String.format("Error '%s' identifying file type for %s",
 499                         args[i], e.getMessage()));
 500       }
 501       System.out.println("Type of " + args[i] + " is " + type);
 502     }
 503     if (args == null || args.length == 0)
 504     {
 505       System.err.println("Usage: <Filename> [<Filename> ...]");
 506     }
 507   }
 508
 509 }