src/jalview/io/IdentifyFile.java

   1 /*
   2  * Jalview - A Sequence Alignment Editor and Viewer ($$Version-Rel$$)
   3  * Copyright (C) $$Year-Rel$$ The Jalview Authors
   4  *
   5  * This file is part of Jalview.
   6  *
   7  * Jalview is free software: you can redistribute it and/or
   8  * modify it under the terms of the GNU General Public License
   9  * as published by the Free Software Foundation, either version 3
  10  * of the License, or (at your option) any later version.
  11  *
  12  * Jalview is distributed in the hope that it will be useful, but
  13  * WITHOUT ANY WARRANTY; without even the implied warranty
  14  * of MERCHANTABILITY or FITNESS FOR A PARTICULAR
  15  * PURPOSE.  See the GNU General Public License for more details.
  16  *
  17  * You should have received a copy of the GNU General Public License
  18  * along with Jalview.  If not, see <http://www.gnu.org/licenses/>.
  19  * The Jalview Authors are detailed in the 'AUTHORS' file.
  20  */
  21 package jalview.io;
  22
  23 import java.io.File;
  24 import java.io.IOException;
  25 import java.util.Locale;
  26
  27 import jalview.bin.Console;
  28
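/**
 * Guesses the format of a data source by reading its lines and testing them
 * against known format signatures. The source may be supplied as a File, a
 * String, an AlignmentFileReaderI or an already opened FileParse.
 *
 * @author $author$
 * @version $Revision$
 */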
  35 public class IdentifyFile
  36 {
  37
  38   public FileFormatI identify(Object file, DataSourceType protocol)
  39           throws FileFormatException
  40   {
  41     // BH 2018
  42     return (file instanceof File ? identify((File) file, protocol)
  43             : identify((String) file, protocol));
  44
  45   }
  46
  47   public FileFormatI identify(File file, DataSourceType sourceType)
  48           throws FileFormatException
  49   {
  50     // BH 2018
  51     String emessage = "UNIDENTIFIED FILE PARSING ERROR";
  52     FileParse parser = null;
  53     try
  54     {
  55       parser = new FileParse(file, sourceType);
  56       if (parser.isValid())
  57       {
  58         return identify(parser);
  59       }
  60     } catch (Exception e)
  61     {
  62       Console.error("Error whilst identifying " + file, e);
  63       emessage = e.getMessage();
  64     }
  65     if (parser != null)
  66     {
  67       throw new FileFormatException(parser.errormessage);
  68     }
  69     throw new FileFormatException(emessage);
  70   }
  71
  72   /**
  73    * Identify a datasource's file content.
  74    *
  75    * @note Do not use this method for stream sources - create a FileParse object
  76    *       instead.
  77    *
  78    * @param file
  79    * @param sourceType
  80    * @return
  81    * @throws FileFormatException
  82    */
  83   public FileFormatI identify(String file, DataSourceType sourceType)
  84           throws FileFormatException
  85   {
  86     String emessage = "UNIDENTIFIED FILE PARSING ERROR";
  87     FileParse parser = null;
  88     try
  89     {
  90       parser = new FileParse(file, sourceType);
  91       if (parser.isValid())
  92       {
  93         return identify(parser);
  94       }
  95     } catch (Exception e)
  96     {
  97       Console.error("Error whilst identifying " + file, e);
  98       emessage = e.getMessage();
  99     }
 100     if (parser != null)
 101     {
 102       throw new FileFormatException(parser.errormessage);
 103     }
 104     throw new FileFormatException(emessage);
 105   }
 106
 107   public FileFormatI identify(FileParse source) throws FileFormatException
 108   {
 109     return identify(source, true);
 110     // preserves original behaviour prior to version 2.3
 111   }
 112
 113   public FileFormatI identify(AlignmentFileReaderI file,
 114           boolean closeSource) throws IOException
 115   {
 116     FileParse fp = new FileParse(file.getInFile(),
 117             file.getDataSourceType());
 118     return identify(fp, closeSource);
 119   }
 120
 121   /**
 122    * Identify contents of source, closing it or resetting source to start
 123    * afterwards.
 124    *
 125    * @param source
 126    * @param closeSource
 127    * @return (best guess at) file format
 128    * @throws FileFormatException
 129    */
 130   public FileFormatI identify(FileParse source, boolean closeSource)
 131           throws FileFormatException
 132   {
 133     FileFormatI reply = FileFormat.Pfam;
 134     String data;
 135     int bytesRead = 0;
 136     int trimmedLength = 0;
 137     boolean lineswereskipped = false;
 138     boolean isBinary = false; // true if length is non-zero and non-printable
 139     // characters are encountered
 140
 141     try
 142     {
 143       if (!closeSource)
 144       {
 145         source.mark();
 146       }
 147       boolean aaIndexHeaderRead = false;
 148
 149       while ((data = source.nextLine()) != null)
 150       {
 151         bytesRead += data.length();
 152         trimmedLength += data.trim().length();
 153         if (!lineswereskipped)
 154         {
 155           for (int i = 0; !isBinary && i < data.length(); i++)
 156           {
 157             char c = data.charAt(i);
 158             isBinary = (c < 32 && c != '\t' && c != '\n' && c != '\r'
 159                     && c != 5 && c != 27); // nominal binary character filter
 160             // excluding CR, LF, tab,DEL and ^E
 161             // for certain blast ids
 162           }
 163         }
 164         if (isBinary)
 165         {
 166           // jar files are special - since they contain all sorts of random
 167           // characters.
 168           if (source.inFile != null)
 169           {
 170             String fileStr = source.inFile.getName();
 171             if (fileStr.contains(".jar") || fileStr.contains(".zip")
 172                     || fileStr.contains(".jvp"))
 173             {
 174               // possibly a Jalview archive (but check further)
 175               reply = FileFormat.Jalview;
 176             }
 177           }
 178           if (!lineswereskipped && data.startsWith("PK"))
 179           {
 180             reply = FileFormat.Jalview; // archive
 181             break;
 182           }
 183         }
 184         data = data.toUpperCase(Locale.ROOT);
 185
 186         if (data.startsWith(ScoreMatrixFile.SCOREMATRIX))
 187         {
 188           reply = FileFormat.ScoreMatrix;
 189           break;
 190         }
 191         if (data.startsWith("LOCUS"))
 192         {
 193           reply = FileFormat.GenBank;
 194           break;
 195         }
 196         if (data.startsWith("ID "))
 197         {
 198           if (data.substring(2).trim().split(";").length == 7)
 199           {
 200             reply = FileFormat.Embl;
 201             break;
 202           }
 203         }
 204         if (data.startsWith("H ") && !aaIndexHeaderRead)
 205         {
 206           aaIndexHeaderRead = true;
 207         }
 208         if (data.startsWith("D ") && aaIndexHeaderRead)
 209         {
 210           reply = FileFormat.ScoreMatrix;
 211           break;
 212         }
 213         if (data.startsWith("##GFF-VERSION"))
 214         {
 215           // GFF - possibly embedded in a Jalview features file!
 216           reply = FileFormat.Features;
 217           break;
 218         }
 219         if (looksLikeFeatureData(data))
 220         {
 221           reply = FileFormat.Features;
 222           break;
 223         }
 224         if (data.indexOf("# STOCKHOLM") > -1)
 225         {
 226           reply = FileFormat.Stockholm;
 227           break;
 228         }
 229         if (data.indexOf("_ENTRY.ID") > -1
 230                 || data.indexOf("_AUDIT_AUTHOR.NAME") > -1
 231                 || data.indexOf("_ATOM_SITE.") > -1)
 232         {
 233           reply = FileFormat.MMCif;
 234           break;
 235         }
 236         // if (data.indexOf(">") > -1)
 237         if (data.startsWith(">"))
 238         {
 239           // FASTA, PIR file or BLC file
 240           boolean checkPIR = false, starterm = false;
 241           if ((data.indexOf(">P1;") > -1) || (data.indexOf(">DL;") > -1))
 242           {
 243             // watch for PIR file attributes
 244             checkPIR = true;
 245             reply = FileFormat.PIR;
 246           }
 247           // could also be BLC file, read next line to confirm
 248           data = source.nextLine();
 249
 250           if (data.indexOf(">") > -1)
 251           {
 252             reply = FileFormat.BLC;
 253           }
 254           else
 255           {
 256             // Is this a single line BLC file?
 257             String data1 = source.nextLine();
 258             String data2 = source.nextLine();
 259             int c1;
 260             if (checkPIR)
 261             {
 262               starterm = (data1 != null && data1.indexOf("*") > -1)
 263                       || (data2 != null && data2.indexOf("*") > -1);
 264             }
 265             if (data2 != null && (c1 = data.indexOf("*")) > -1)
 266             {
 267               if (c1 == 0 && c1 == data2.indexOf("*"))
 268               {
 269                 reply = FileFormat.BLC;
 270               }
 271               else
 272               {
 273                 reply = FileFormat.Fasta; // possibly a bad choice - may be
 274                                           // recognised as
 275                 // PIR
 276               }
 277               // otherwise can still possibly be a PIR file
 278             }
 279             else
 280             {
 281               reply = FileFormat.Fasta;
 282               // TODO : AMSA File is indicated if there is annotation in the
 283               // FASTA file - but FASTA will automatically generate this at the
 284               // mo.
 285               if (!checkPIR)
 286               {
 287                 break;
 288               }
 289             }
 290           }
 291           // final check for PIR content. require
 292           // >P1;title\n<blah>\nterminated sequence to occur at least once.
 293
 294           // TODO the PIR/fasta ambiguity may be the use case that is needed to
 295           // have
 296           // a 'Parse as type XXX' parameter for the applet/application.
 297           if (checkPIR)
 298           {
 299             String dta = null;
 300             if (!starterm)
 301             {
 302               do
 303               {
 304                 try
 305                 {
 306                   dta = source.nextLine();
 307                 } catch (IOException ex)
 308                 {
 309                 }
 310                 if (dta != null && dta.indexOf("*") > -1)
 311                 {
 312                   starterm = true;
 313                 }
 314               } while (dta != null && !starterm);
 315             }
 316             if (starterm)
 317             {
 318               reply = FileFormat.PIR;
 319               break;
 320             }
 321             else
 322             {
 323               reply = FileFormat.Fasta; // probably a bad choice!
 324             }
 325           }
 326           // read as a FASTA (probably)
 327           break;
 328         }
 329         if (data.indexOf("{\"") > -1)
 330         {
 331           reply = FileFormat.Json;
 332           break;
 333         }
 334         int lessThan = data.indexOf("<");
 335         if ((lessThan > -1)) // possible Markup Language data i.e HTML,
 336                              // RNAML, XML
 337         {
 338           String upper = data.toUpperCase(Locale.ROOT);
 339           if (upper.substring(lessThan).startsWith("<HTML"))
 340           {
 341             reply = FileFormat.Html;
 342             break;
 343           }
 344           if (upper.substring(lessThan).startsWith("<RNAML"))
 345           {
 346             reply = FileFormat.Rnaml;
 347             break;
 348           }
 349         }
 350
 351         if ((data.length() < 1) || (data.indexOf("#") == 0))
 352         {
 353           lineswereskipped = true;
 354           continue;
 355         }
 356
 357         if (data.indexOf("PILEUP") > -1)
 358         {
 359           reply = FileFormat.Pileup;
 360
 361           break;
 362         }
 363
 364         if ((data.indexOf("//") == 0) || ((data.indexOf("!!") > -1) && (data
 365                 .indexOf("!!") < data.indexOf("_MULTIPLE_ALIGNMENT "))))
 366         {
 367           reply = FileFormat.MSF;
 368
 369           break;
 370         }
 371         else if (data.indexOf("CLUSTAL") > -1)
 372         {
 373           reply = FileFormat.Clustal;
 374
 375           break;
 376         }
 377
 378         else if (data.indexOf("HEADER") == 0 || data.indexOf("ATOM") == 0)
 379         {
 380           reply = FileFormat.PDB;
 381           break;
 382         }
 383         else if (data.matches("\\s*\\d+\\s+\\d+\\s*"))
 384         {
 385           reply = FileFormat.Phylip;
 386           break;
 387         }
 388         else
 389         {
 390           if (!lineswereskipped && looksLikeJnetData(data))
 391           {
 392             reply = FileFormat.Jnet;
 393             break;
 394           }
 395         }
 396
 397         lineswereskipped = true; // this means there was some junk before any
 398         // key file signature
 399       }
 400       if (closeSource)
 401       {
 402         source.close();
 403       }
 404       else
 405       {
 406         source.reset(bytesRead); // so the file can be parsed from the mark
 407       }
 408     } catch (Exception ex)
 409     {
 410       Console.error("File Identification failed!\n" + ex);
 411       throw new FileFormatException(source.errormessage);
 412     }
 413     if (trimmedLength == 0)
 414     {
 415       Console.error("File Identification failed! - Empty file was read.");
 416       throw new FileFormatException("EMPTY DATA FILE");
 417     }
 418     Console.debug("File format identified as " + reply.toString());
 419     return reply;
 420   }
 421
 422   /**
 423    * Returns true if the data appears to be Jnet concise annotation format
 424    *
 425    * @param data
 426    * @return
 427    */
 428   protected boolean looksLikeJnetData(String data)
 429   {
 430     char firstChar = data.charAt(0);
 431     int colonPos = data.indexOf(":");
 432     int commaPos = data.indexOf(",");
 433     boolean isJnet = firstChar != '*' && firstChar != ' ' && colonPos > -1
 434             && commaPos > -1 && colonPos < commaPos;
 435     // && data.indexOf(",")<data.indexOf(",", data.indexOf(","))) / ??
 436     return isJnet;
 437   }
 438
 439   /**
 440    * Returns true if the data has at least 6 tab-delimited fields _and_ fields 4
 441    * and 5 are integer (start/end)
 442    *
 443    * @param data
 444    * @return
 445    */
 446   protected boolean looksLikeFeatureData(String data)
 447   {
 448     if (data == null)
 449     {
 450       return false;
 451     }
 452     String[] columns = data.split("\t");
 453     if (columns.length < 6)
 454     {
 455       return false;
 456     }
 457     for (int col = 3; col < 5; col++)
 458     {
 459       try
 460       {
 461         Integer.parseInt(columns[col]);
 462       } catch (NumberFormatException e)
 463       {
 464         return false;
 465       }
 466     }
 467     return true;
 468   }
 469
 470   /**
 471    *
 472    * @param args
 473    * @j2sIgnore
 474    */
 475   public static void main(String[] args)
 476   {
 477     for (int i = 0; args != null && i < args.length; i++)
 478     {
 479       IdentifyFile ider = new IdentifyFile();
 480       FileFormatI type = null;
 481       try
 482       {
 483         type = ider.identify(args[i], DataSourceType.FILE);
 484       } catch (FileFormatException e)
 485       {
 486         Console.error(
 487                 String.format("Error '%s' identifying file type for %s",
 488                         args[i], e.getMessage()));
 489       }
 490       Console.debug("Type of " + args[i] + " is " + type);
 491     }
 492     if (args == null || args.length == 0)
 493     {
 494       Console.error("Usage: <Filename> [<Filename> ...]");
 495     }
 496   }
 497
 498 }