src/jalview/io/IdentifyFile.java

   1 /*
   2  * Jalview - A Sequence Alignment Editor and Viewer ($$Version-Rel$$)
   3  * Copyright (C) $$Year-Rel$$ The Jalview Authors
   4  *
   5  * This file is part of Jalview.
   6  *
   7  * Jalview is free software: you can redistribute it and/or
   8  * modify it under the terms of the GNU General Public License
   9  * as published by the Free Software Foundation, either version 3
  10  * of the License, or (at your option) any later version.
  11  *
  12  * Jalview is distributed in the hope that it will be useful, but
  13  * WITHOUT ANY WARRANTY; without even the implied warranty
  14  * of MERCHANTABILITY or FITNESS FOR A PARTICULAR
  15  * PURPOSE.  See the GNU General Public License for more details.
  16  *
  17  * You should have received a copy of the GNU General Public License
  18  * along with Jalview.  If not, see <http://www.gnu.org/licenses/>.
  19  * The Jalview Authors are detailed in the 'AUTHORS' file.
  20  */
  21 package jalview.io;
  22
  23 import java.io.File;
  24 import java.io.IOException;
  25
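/**
 * Identifies the format of a data source (a file, or any other source wrapped
 * in a FileParse) by reading its lines and testing them against a series of
 * format-specific signatures.
 *
 * @author $author$
 * @version $Revision$
 */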
  32 public class IdentifyFile
  33 {
  34
  35   public FileFormatI identify(Object file, DataSourceType protocol) throws FileFormatException
  36   {
  37     // BH 2018
  38     return (file instanceof File ? identify((File) file, protocol) : identify((String) file, protocol));
  39
  40   }
  41
  42   public FileFormatI identify(File file, DataSourceType sourceType)
  43           throws FileFormatException
  44   {
  45     // BH 2018
  46     String emessage = "UNIDENTIFIED FILE PARSING ERROR";
  47     FileParse parser = null;
  48     try
  49     {
  50       parser = new FileParse(file, sourceType);
  51       if (parser.isValid())
  52       {
  53         return identify(parser);
  54       }
  55     } catch (Exception e)
  56     {
  57       System.err.println("Error whilst identifying " + file);
  58       e.printStackTrace(System.err);
  59       emessage = e.getMessage();
  60     }
  61     if (parser != null)
  62     {
  63       throw new FileFormatException(parser.errormessage);
  64     }
  65     throw new FileFormatException(emessage);
  66   }
  67
  68   /**
  69    * Identify a datasource's file content.
  70    *
  71    * @note Do not use this method for stream sources - create a FileParse object
  72    *       instead.
  73    *
  74    * @param file
  75    * @param sourceType
  76    * @return
  77    * @throws FileFormatException
  78    */
  79   public FileFormatI identify(String file, DataSourceType sourceType)
  80           throws FileFormatException
  81   {
  82     String emessage = "UNIDENTIFIED FILE PARSING ERROR";
  83     FileParse parser = null;
  84     try
  85     {
  86       parser = new FileParse(file, sourceType);
  87       if (parser.isValid())
  88       {
  89         return identify(parser);
  90       }
  91     } catch (Exception e)
  92     {
  93       System.err.println("Error whilst identifying " + file);
  94       e.printStackTrace(System.err);
  95       emessage = e.getMessage();
  96     }
  97     if (parser != null)
  98     {
  99       throw new FileFormatException(parser.errormessage);
 100     }
 101     throw new FileFormatException(emessage);
 102   }
 103
 104   public FileFormatI identify(FileParse source) throws FileFormatException
 105   {
 106     return identify(source, true);
 107     // preserves original behaviour prior to version 2.3
 108   }
 109
 110   public FileFormatI identify(AlignmentFileReaderI file,
 111           boolean closeSource) throws IOException
 112   {
 113     FileParse fp = new FileParse(file.getInFile(),
 114             file.getDataSourceType());
 115     return identify(fp, closeSource);
 116   }
 117
 118   /**
 119    * Identify contents of source, closing it or resetting source to start
 120    * afterwards.
 121    *
 122    * @param source
 123    * @param closeSource
 124    * @return (best guess at) file format
 125    * @throws FileFormatException
 126    */
 127   public FileFormatI identify(FileParse source, boolean closeSource)
 128           throws FileFormatException
 129   {
 130     FileFormatI reply = FileFormat.Pfam;
 131     String data;
 132     int bytesRead = 0;
 133     int trimmedLength = 0;
 134     boolean lineswereskipped = false;
 135     boolean isBinary = false; // true if length is non-zero and non-printable
 136     // characters are encountered
 137
 138     try
 139     {
 140       if (!closeSource)
 141       {
 142         source.mark();
 143       }
 144       boolean aaIndexHeaderRead = false;
 145
 146       while ((data = source.nextLine()) != null)
 147       {
 148         bytesRead += data.length();
 149         trimmedLength += data.trim().length();
 150         if (!lineswereskipped)
 151         {
 152           for (int i = 0; !isBinary && i < data.length(); i++)
 153           {
 154             char c = data.charAt(i);
 155             isBinary = (c < 32 && c != '\t' && c != '\n' && c != '\r'
 156                     && c != 5 && c != 27); // nominal binary character filter
 157             // excluding CR, LF, tab,DEL and ^E
 158             // for certain blast ids
 159           }
 160         }
 161         if (isBinary)
 162         {
 163           // jar files are special - since they contain all sorts of random
 164           // characters.
 165           if (source.inFile != null)
 166           {
 167             String fileStr = source.inFile.getName();
 168             if (fileStr.contains(".jar")
 169                     || fileStr.contains(".zip") || fileStr.contains(".jvp"))
 170             {
 171               // possibly a Jalview archive (but check further)
 172               reply = FileFormat.Jalview;
 173             }
 174           }
 175           if (!lineswereskipped && data.startsWith("PK"))
 176           {
 177             reply = FileFormat.Jalview; // archive
 178             break;
 179           }
 180         }
 181         data = data.toUpperCase();
 182
 183         if (data.startsWith(ScoreMatrixFile.SCOREMATRIX))
 184         {
 185           reply = FileFormat.ScoreMatrix;
 186           break;
 187         }
 188         if (data.startsWith("HMMER3"))
 189         {
 190           reply = FileFormat.HMMER3;
 191           break;
 192         }
 193         if (data.startsWith("H ") && !aaIndexHeaderRead)
 194         {
 195           aaIndexHeaderRead = true;
 196         }
 197         if (data.startsWith("D ") && aaIndexHeaderRead)
 198         {
 199           reply = FileFormat.ScoreMatrix;
 200           break;
 201         }
 202         if (data.startsWith("##GFF-VERSION"))
 203         {
 204           // GFF - possibly embedded in a Jalview features file!
 205           reply = FileFormat.Features;
 206           break;
 207         }
 208         if (looksLikeFeatureData(data))
 209         {
 210           reply = FileFormat.Features;
 211           break;
 212         }
 213         if (data.indexOf("# STOCKHOLM") > -1)
 214         {
 215           reply = FileFormat.Stockholm;
 216           break;
 217         }
 218         if (data.indexOf("_ENTRY.ID") > -1
 219                 || data.indexOf("_AUDIT_AUTHOR.NAME") > -1
 220                 || data.indexOf("_ATOM_SITE.") > -1)
 221         {
 222           reply = FileFormat.MMCif;
 223           break;
 224         }
 225         // if (data.indexOf(">") > -1)
 226         if (data.startsWith(">"))
 227         {
 228           // FASTA, PIR file or BLC file
 229           boolean checkPIR = false, starterm = false;
 230           if ((data.indexOf(">P1;") > -1) || (data.indexOf(">DL;") > -1))
 231           {
 232             // watch for PIR file attributes
 233             checkPIR = true;
 234             reply = FileFormat.PIR;
 235           }
 236           // could also be BLC file, read next line to confirm
 237           data = source.nextLine();
 238
 239           if (data.indexOf(">") > -1)
 240           {
 241             reply = FileFormat.BLC;
 242           }
 243           else
 244           {
 245             // Is this a single line BLC file?
 246             String data1 = source.nextLine();
 247             String data2 = source.nextLine();
 248             int c1;
 249             if (checkPIR)
 250             {
 251               starterm = (data1 != null && data1.indexOf("*") > -1)
 252                       || (data2 != null && data2.indexOf("*") > -1);
 253             }
 254             if (data2 != null && (c1 = data.indexOf("*")) > -1)
 255             {
 256               if (c1 == 0 && c1 == data2.indexOf("*"))
 257               {
 258                 reply = FileFormat.BLC;
 259               }
 260               else
 261               {
 262                 reply = FileFormat.Fasta; // possibly a bad choice - may be
 263                                           // recognised as
 264                 // PIR
 265               }
 266               // otherwise can still possibly be a PIR file
 267             }
 268             else
 269             {
 270               reply = FileFormat.Fasta;
 271               // TODO : AMSA File is indicated if there is annotation in the
 272               // FASTA file - but FASTA will automatically generate this at the
 273               // mo.
 274               if (!checkPIR)
 275               {
 276                 break;
 277               }
 278             }
 279           }
 280           // final check for PIR content. require
 281           // >P1;title\n<blah>\nterminated sequence to occur at least once.
 282
 283           // TODO the PIR/fasta ambiguity may be the use case that is needed to
 284           // have
 285           // a 'Parse as type XXX' parameter for the applet/application.
 286           if (checkPIR)
 287           {
 288             String dta = null;
 289             if (!starterm)
 290             {
 291               do
 292               {
 293                 try
 294                 {
 295                   dta = source.nextLine();
 296                 } catch (IOException ex)
 297                 {
 298                 }
 299                 if (dta != null && dta.indexOf("*") > -1)
 300                 {
 301                   starterm = true;
 302                 }
 303               } while (dta != null && !starterm);
 304             }
 305             if (starterm)
 306             {
 307               reply = FileFormat.PIR;
 308               break;
 309             }
 310             else
 311             {
 312               reply = FileFormat.Fasta; // probably a bad choice!
 313             }
 314           }
 315           // read as a FASTA (probably)
 316           break;
 317         }
 318         if (data.indexOf("{\"") > -1)
 319         {
 320           reply = FileFormat.Json;
 321           break;
 322         }
 323         int lessThan = data.indexOf("<");
 324         if ((lessThan > -1)) // possible Markup Language data i.e HTML,
 325                              // RNAML, XML
 326         {
 327           String upper = data.toUpperCase();
 328           if (upper.substring(lessThan).startsWith("<HTML"))
 329           {
 330             reply = FileFormat.Html;
 331             break;
 332           }
 333           if (upper.substring(lessThan).startsWith("<RNAML"))
 334           {
 335             reply = FileFormat.Rnaml;
 336             break;
 337           }
 338           if (upper.substring(lessThan).startsWith("<BSML"))
 339           {
 340             reply = FileFormat.BSML;
 341             break;
 342           }
 343         }
 344
 345         if ((data.length() < 1) || (data.indexOf("#") == 0))
 346         {
 347           lineswereskipped = true;
 348           continue;
 349         }
 350
 351         if (data.indexOf("PILEUP") > -1)
 352         {
 353           reply = FileFormat.Pileup;
 354
 355           break;
 356         }
 357
 358         if ((data.indexOf("//") == 0) || ((data.indexOf("!!") > -1) && (data
 359                 .indexOf("!!") < data.indexOf("_MULTIPLE_ALIGNMENT "))))
 360         {
 361           reply = FileFormat.MSF;
 362
 363           break;
 364         }
 365         else if (data.indexOf("CLUSTAL") > -1)
 366         {
 367           reply = FileFormat.Clustal;
 368
 369           break;
 370         }
 371
 372         else if (data.indexOf("HEADER") == 0 || data.indexOf("ATOM") == 0)
 373         {
 374           reply = FileFormat.PDB;
 375           break;
 376         }
 377         else if (data.matches("\\s*\\d+\\s+\\d+\\s*"))
 378         {
 379           reply = FileFormat.Phylip;
 380           break;
 381         }
 382         else
 383         {
 384           if (!lineswereskipped && looksLikeJnetData(data))
 385           {
 386             reply = FileFormat.Jnet;
 387             break;
 388           }
 389         }
 390
 391         lineswereskipped = true; // this means there was some junk before any
 392         // key file signature
 393       }
 394       if (closeSource)
 395       {
 396         source.close();
 397       }
 398       else
 399       {
 400         source.reset(bytesRead); // so the file can be parsed from the mark
 401       }
 402     } catch (Exception ex)
 403     {
 404       System.err.println("File Identification failed!\n" + ex);
 405       throw new FileFormatException(source.errormessage);
 406     }
 407     if (trimmedLength == 0)
 408     {
 409       System.err.println(
 410               "File Identification failed! - Empty file was read.");
 411       throw new FileFormatException("EMPTY DATA FILE");
 412     }
 413     System.out.println("File format identified as " + reply.toString());
 414     return reply;
 415   }
 416
 417   /**
 418    * Returns true if the data appears to be Jnet concise annotation format
 419    *
 420    * @param data
 421    * @return
 422    */
 423   protected boolean looksLikeJnetData(String data)
 424   {
 425     char firstChar = data.charAt(0);
 426     int colonPos = data.indexOf(":");
 427     int commaPos = data.indexOf(",");
 428     boolean isJnet = firstChar != '*' && firstChar != ' ' && colonPos > -1
 429             && commaPos > -1 && colonPos < commaPos;
 430     // && data.indexOf(",")<data.indexOf(",", data.indexOf(","))) / ??
 431     return isJnet;
 432   }
 433
 434   /**
 435    * Returns true if the data has at least 6 tab-delimited fields _and_ fields 4
 436    * and 5 are integer (start/end)
 437    *
 438    * @param data
 439    * @return
 440    */
 441   protected boolean looksLikeFeatureData(String data)
 442   {
 443     if (data == null)
 444     {
 445       return false;
 446     }
 447     String[] columns = data.split("\t");
 448     if (columns.length < 6)
 449     {
 450       return false;
 451     }
 452     for (int col = 3; col < 5; col++)
 453     {
 454       try
 455       {
 456         Integer.parseInt(columns[col]);
 457       } catch (NumberFormatException e)
 458       {
 459         return false;
 460       }
 461     }
 462     return true;
 463   }
 464
 465   /**
 466    *
 467    * @param args
 468    * @j2sIgnore
 469    */
 470   public static void main(String[] args)
 471   {
 472     for (int i = 0; args != null && i < args.length; i++)
 473     {
 474       IdentifyFile ider = new IdentifyFile();
 475       FileFormatI type = null;
 476       try
 477       {
 478         type = ider.identify(args[i], DataSourceType.FILE);
 479       } catch (FileFormatException e)
 480       {
 481         System.err.println(
 482                 String.format("Error '%s' identifying file type for %s",
 483                         args[i], e.getMessage()));
 484       }
 485       System.out.println("Type of " + args[i] + " is " + type);
 486     }
 487     if (args == null || args.length == 0)
 488     {
 489       System.err.println("Usage: <Filename> [<Filename> ...]");
 490     }
 491   }
 492
 493
 494 }