src/jalview/io/IdentifyFile.java

   1 /*
   2  * Jalview - A Sequence Alignment Editor and Viewer ($$Version-Rel$$)
   3  * Copyright (C) $$Year-Rel$$ The Jalview Authors
   4  *
   5  * This file is part of Jalview.
   6  *
   7  * Jalview is free software: you can redistribute it and/or
   8  * modify it under the terms of the GNU General Public License
   9  * as published by the Free Software Foundation, either version 3
  10  * of the License, or (at your option) any later version.
  11  *
  12  * Jalview is distributed in the hope that it will be useful, but
  13  * WITHOUT ANY WARRANTY; without even the implied warranty
  14  * of MERCHANTABILITY or FITNESS FOR A PARTICULAR
  15  * PURPOSE.  See the GNU General Public License for more details.
  16  *
  17  * You should have received a copy of the GNU General Public License
  18  * along with Jalview.  If not, see <http://www.gnu.org/licenses/>.
  19  * The Jalview Authors are detailed in the 'AUTHORS' file.
  20  */
  21 package jalview.io;
  22
  23 import java.io.File;
  24 import java.io.IOException;
  25
  26 /**
  27  * DOCUMENT ME!
  28  *
  29  * @author $author$
  30  * @version $Revision$
  31  */
  32 public class IdentifyFile
  33 {
  34
  35   public FileFormatI identify(Object file, DataSourceType protocol) throws FileFormatException
  36   {
  37     // BH 2018
  38     return (file instanceof File ? identify((File) file, protocol) : identify((String) file, protocol));
  39
  40   }
  41
  42   public FileFormatI identify(File file, DataSourceType sourceType)
  43           throws FileFormatException
  44   {
  45     // BH 2018
  46     String emessage = "UNIDENTIFIED FILE PARSING ERROR";
  47     FileParse parser = null;
  48     try
  49     {
  50       parser = new FileParse(file, sourceType);
  51       if (parser.isValid())
  52       {
  53         return identify(parser);
  54       }
  55     } catch (Exception e)
  56     {
  57       System.err.println("Error whilst identifying " + file);
  58       e.printStackTrace(System.err);
  59       emessage = e.getMessage();
  60     }
  61     if (parser != null)
  62     {
  63       throw new FileFormatException(parser.errormessage);
  64     }
  65     throw new FileFormatException(emessage);
  66   }
  67
  68   /**
  69    * Identify a datasource's file content.
  70    *
  71    * @note Do not use this method for stream sources - create a FileParse object
  72    *       instead.
  73    *
  74    * @param file
  75    * @param sourceType
  76    * @return
  77    * @throws FileFormatException
  78    */
  79   public FileFormatI identify(String file, DataSourceType sourceType)
  80           throws FileFormatException
  81   {
  82     String emessage = "UNIDENTIFIED FILE PARSING ERROR";
  83     FileParse parser = null;
  84     try
  85     {
  86       parser = new FileParse(file, sourceType);
  87       if (parser.isValid())
  88       {
  89         return identify(parser);
  90       }
  91     } catch (Exception e)
  92     {
  93       System.err.println("Error whilst identifying " + file);
  94       e.printStackTrace(System.err);
  95       emessage = e.getMessage();
  96     }
  97     if (parser != null)
  98     {
  99       throw new FileFormatException(parser.errormessage);
 100     }
 101     throw new FileFormatException(emessage);
 102   }
 103
 104   public FileFormatI identify(FileParse source) throws FileFormatException
 105   {
 106     return identify(source, true);
 107     // preserves original behaviour prior to version 2.3
 108   }
 109
 110   public FileFormatI identify(AlignmentFileReaderI file,
 111           boolean closeSource) throws IOException
 112   {
 113     FileParse fp = new FileParse(file.getInFile(),
 114             file.getDataSourceType());
 115     return identify(fp, closeSource);
 116   }
 117
 118   /**
 119    * Identify contents of source, closing it or resetting source to start
 120    * afterwards.
 121    *
 122    * @param source
 123    * @param closeSource
 124    * @return (best guess at) file format
 125    * @throws FileFormatException
 126    */
 127   public FileFormatI identify(FileParse source, boolean closeSource)
 128           throws FileFormatException
 129   {
 130     FileFormatI reply = FileFormat.Pfam;
 131     String data;
 132     int bytesRead = 0;
 133     int trimmedLength = 0;
 134     boolean lineswereskipped = false;
 135     boolean isBinary = false; // true if length is non-zero and non-printable
 136     // characters are encountered
 137
 138     try
 139     {
 140       if (!closeSource)
 141       {
 142         source.mark();
 143       }
 144       boolean aaIndexHeaderRead = false;
 145
 146       while ((data = source.nextLine()) != null)
 147       {
 148         bytesRead += data.length();
 149         trimmedLength += data.trim().length();
 150         if (!lineswereskipped)
 151         {
 152           for (int i = 0; !isBinary && i < data.length(); i++)
 153           {
 154             char c = data.charAt(i);
 155             isBinary = (c < 32 && c != '\t' && c != '\n' && c != '\r'
 156                     && c != 5 && c != 27); // nominal binary character filter
 157             // excluding CR, LF, tab,DEL and ^E
 158             // for certain blast ids
 159           }
 160         }
 161         if (isBinary)
 162         {
 163           // jar files are special - since they contain all sorts of random
 164           // characters.
 165           if (source.inFile != null)
 166           {
 167             String fileStr = source.inFile.getName();
 168             if (fileStr.contains(".jar")
 169                     || fileStr.contains(".zip") || fileStr.contains(".jvp"))
 170             {
 171               // possibly a Jalview archive (but check further)
 172               reply = FileFormat.Jalview;
 173             }
 174           }
 175           if (!lineswereskipped && data.startsWith("PK"))
 176           {
 177             reply = FileFormat.Jalview; // archive
 178             break;
 179           }
 180         }
 181         data = data.toUpperCase();
 182
 183         if (data.startsWith(ScoreMatrixFile.SCOREMATRIX))
 184         {
 185           reply = FileFormat.ScoreMatrix;
 186           break;
 187         }
 188         if (data.startsWith("H ") && !aaIndexHeaderRead)
 189         {
 190           aaIndexHeaderRead = true;
 191         }
 192         if (data.startsWith("D ") && aaIndexHeaderRead)
 193         {
 194           reply = FileFormat.ScoreMatrix;
 195           break;
 196         }
 197         if (data.startsWith("##GFF-VERSION"))
 198         {
 199           // GFF - possibly embedded in a Jalview features file!
 200           reply = FileFormat.Features;
 201           break;
 202         }
 203         if (looksLikeFeatureData(data))
 204         {
 205           reply = FileFormat.Features;
 206           break;
 207         }
 208         if (data.indexOf("# STOCKHOLM") > -1)
 209         {
 210           reply = FileFormat.Stockholm;
 211           break;
 212         }
 213         if (data.indexOf("_ENTRY.ID") > -1
 214                 || data.indexOf("_AUDIT_AUTHOR.NAME") > -1
 215                 || data.indexOf("_ATOM_SITE.") > -1)
 216         {
 217           reply = FileFormat.MMCif;
 218           break;
 219         }
 220         // if (data.indexOf(">") > -1)
 221         if (data.startsWith(">"))
 222         {
 223           // FASTA, PIR file or BLC file
 224           boolean checkPIR = false, starterm = false;
 225           if ((data.indexOf(">P1;") > -1) || (data.indexOf(">DL;") > -1))
 226           {
 227             // watch for PIR file attributes
 228             checkPIR = true;
 229             reply = FileFormat.PIR;
 230           }
 231           // could also be BLC file, read next line to confirm
 232           data = source.nextLine();
 233
 234           if (data.indexOf(">") > -1)
 235           {
 236             reply = FileFormat.BLC;
 237           }
 238           else
 239           {
 240             // Is this a single line BLC file?
 241             String data1 = source.nextLine();
 242             String data2 = source.nextLine();
 243             int c1;
 244             if (checkPIR)
 245             {
 246               starterm = (data1 != null && data1.indexOf("*") > -1)
 247                       || (data2 != null && data2.indexOf("*") > -1);
 248             }
 249             if (data2 != null && (c1 = data.indexOf("*")) > -1)
 250             {
 251               if (c1 == 0 && c1 == data2.indexOf("*"))
 252               {
 253                 reply = FileFormat.BLC;
 254               }
 255               else
 256               {
 257                 reply = FileFormat.Fasta; // possibly a bad choice - may be
 258                                           // recognised as
 259                 // PIR
 260               }
 261               // otherwise can still possibly be a PIR file
 262             }
 263             else
 264             {
 265               reply = FileFormat.Fasta;
 266               // TODO : AMSA File is indicated if there is annotation in the
 267               // FASTA file - but FASTA will automatically generate this at the
 268               // mo.
 269               if (!checkPIR)
 270               {
 271                 break;
 272               }
 273             }
 274           }
 275           // final check for PIR content. require
 276           // >P1;title\n<blah>\nterminated sequence to occur at least once.
 277
 278           // TODO the PIR/fasta ambiguity may be the use case that is needed to
 279           // have
 280           // a 'Parse as type XXX' parameter for the applet/application.
 281           if (checkPIR)
 282           {
 283             String dta = null;
 284             if (!starterm)
 285             {
 286               do
 287               {
 288                 try
 289                 {
 290                   dta = source.nextLine();
 291                 } catch (IOException ex)
 292                 {
 293                 }
 294                 if (dta != null && dta.indexOf("*") > -1)
 295                 {
 296                   starterm = true;
 297                 }
 298               } while (dta != null && !starterm);
 299             }
 300             if (starterm)
 301             {
 302               reply = FileFormat.PIR;
 303               break;
 304             }
 305             else
 306             {
 307               reply = FileFormat.Fasta; // probably a bad choice!
 308             }
 309           }
 310           // read as a FASTA (probably)
 311           break;
 312         }
 313         if (data.indexOf("{\"") > -1)
 314         {
 315           reply = FileFormat.Json;
 316           break;
 317         }
 318         int lessThan = data.indexOf("<");
 319         if ((lessThan > -1)) // possible Markup Language data i.e HTML,
 320                              // RNAML, XML
 321         {
 322           String upper = data.toUpperCase();
 323           if (upper.substring(lessThan).startsWith("<HTML"))
 324           {
 325             reply = FileFormat.Html;
 326             break;
 327           }
 328           if (upper.substring(lessThan).startsWith("<RNAML"))
 329           {
 330             reply = FileFormat.Rnaml;
 331             break;
 332           }
 333         }
 334
 335         if ((data.length() < 1) || (data.indexOf("#") == 0))
 336         {
 337           lineswereskipped = true;
 338           continue;
 339         }
 340
 341         if (data.indexOf("PILEUP") > -1)
 342         {
 343           reply = FileFormat.Pileup;
 344
 345           break;
 346         }
 347
 348         if ((data.indexOf("//") == 0) || ((data.indexOf("!!") > -1) && (data
 349                 .indexOf("!!") < data.indexOf("_MULTIPLE_ALIGNMENT "))))
 350         {
 351           reply = FileFormat.MSF;
 352
 353           break;
 354         }
 355         else if (data.indexOf("CLUSTAL") > -1)
 356         {
 357           reply = FileFormat.Clustal;
 358
 359           break;
 360         }
 361
 362         else if (data.indexOf("HEADER") == 0 || data.indexOf("ATOM") == 0)
 363         {
 364           reply = FileFormat.PDB;
 365           break;
 366         }
 367         else if (data.matches("\\s*\\d+\\s+\\d+\\s*"))
 368         {
 369           reply = FileFormat.Phylip;
 370           break;
 371         }
 372         else
 373         {
 374           if (!lineswereskipped && looksLikeJnetData(data))
 375           {
 376             reply = FileFormat.Jnet;
 377             break;
 378           }
 379         }
 380
 381         lineswereskipped = true; // this means there was some junk before any
 382         // key file signature
 383       }
 384       if (closeSource)
 385       {
 386         source.close();
 387       }
 388       else
 389       {
 390         source.reset(bytesRead); // so the file can be parsed from the mark
 391       }
 392     } catch (Exception ex)
 393     {
 394       System.err.println("File Identification failed!\n" + ex);
 395       throw new FileFormatException(source.errormessage);
 396     }
 397     if (trimmedLength == 0)
 398     {
 399       System.err.println(
 400               "File Identification failed! - Empty file was read.");
 401       throw new FileFormatException("EMPTY DATA FILE");
 402     }
 403     System.out.println("File format identified as " + reply.toString());
 404     return reply;
 405   }
 406
 407   /**
 408    * Returns true if the data appears to be Jnet concise annotation format
 409    *
 410    * @param data
 411    * @return
 412    */
 413   protected boolean looksLikeJnetData(String data)
 414   {
 415     char firstChar = data.charAt(0);
 416     int colonPos = data.indexOf(":");
 417     int commaPos = data.indexOf(",");
 418     boolean isJnet = firstChar != '*' && firstChar != ' ' && colonPos > -1
 419             && commaPos > -1 && colonPos < commaPos;
 420     // && data.indexOf(",")<data.indexOf(",", data.indexOf(","))) / ??
 421     return isJnet;
 422   }
 423
 424   /**
 425    * Returns true if the data has at least 6 tab-delimited fields _and_ fields 4
 426    * and 5 are integer (start/end)
 427    *
 428    * @param data
 429    * @return
 430    */
 431   protected boolean looksLikeFeatureData(String data)
 432   {
 433     if (data == null)
 434     {
 435       return false;
 436     }
 437     String[] columns = data.split("\t");
 438     if (columns.length < 6)
 439     {
 440       return false;
 441     }
 442     for (int col = 3; col < 5; col++)
 443     {
 444       try
 445       {
 446         Integer.parseInt(columns[col]);
 447       } catch (NumberFormatException e)
 448       {
 449         return false;
 450       }
 451     }
 452     return true;
 453   }
 454
 455   /**
 456    *
 457    * @param args
 458    * @j2sIgnore
 459    */
 460   public static void main(String[] args)
 461   {
 462     for (int i = 0; args != null && i < args.length; i++)
 463     {
 464       IdentifyFile ider = new IdentifyFile();
 465       FileFormatI type = null;
 466       try
 467       {
 468         type = ider.identify(args[i], DataSourceType.FILE);
 469       } catch (FileFormatException e)
 470       {
 471         System.err.println(
 472                 String.format("Error '%s' identifying file type for %s",
 473                         args[i], e.getMessage()));
 474       }
 475       System.out.println("Type of " + args[i] + " is " + type);
 476     }
 477     if (args == null || args.length == 0)
 478     {
 479       System.err.println("Usage: <Filename> [<Filename> ...]");
 480     }
 481   }
 482
 483
 484 }