src/jalview/io/IdentifyFile.java

   1 /*
   2  * Jalview - A Sequence Alignment Editor and Viewer ($$Version-Rel$$)
   3  * Copyright (C) $$Year-Rel$$ The Jalview Authors
   4  *
   5  * This file is part of Jalview.
   6  *
   7  * Jalview is free software: you can redistribute it and/or
   8  * modify it under the terms of the GNU General Public License
   9  * as published by the Free Software Foundation, either version 3
  10  * of the License, or (at your option) any later version.
  11  *
  12  * Jalview is distributed in the hope that it will be useful, but
  13  * WITHOUT ANY WARRANTY; without even the implied warranty
  14  * of MERCHANTABILITY or FITNESS FOR A PARTICULAR
  15  * PURPOSE.  See the GNU General Public License for more details.
  16  *
  17  * You should have received a copy of the GNU General Public License
  18  * along with Jalview.  If not, see <http://www.gnu.org/licenses/>.
  19  * The Jalview Authors are detailed in the 'AUTHORS' file.
  20  */
  21 package jalview.io;
  22
  23 import java.io.IOException;
  24 import java.util.Locale;
  25
  26 /**
  27  * DOCUMENT ME!
  28  *
  29  * @author $author$
  30  * @version $Revision$
  31  */
  32 public class IdentifyFile
  33 {
  34   /**
  35    * Identify a datasource's file content.
  36    *
  37    * @note Do not use this method for stream sources - create a FileParse object
  38    *       instead.
  39    *
  40    * @param file
  41    * @param sourceType
  42    * @return
  43    * @throws FileFormatException
  44    */
  45   public FileFormatI identify(String file, DataSourceType sourceType)
  46           throws FileFormatException
  47   {
  48     String emessage = "UNIDENTIFIED FILE PARSING ERROR";
  49     FileParse parser = null;
  50     try
  51     {
  52       parser = new FileParse(file, sourceType);
  53       if (parser.isValid())
  54       {
  55         return identify(parser);
  56       }
  57     } catch (Exception e)
  58     {
  59       System.err.println("Error whilst identifying");
  60       e.printStackTrace(System.err);
  61       emessage = e.getMessage();
  62     }
  63     if (parser != null)
  64     {
  65       throw new FileFormatException(parser.errormessage);
  66     }
  67     throw new FileFormatException(emessage);
  68   }
  69
  70   public FileFormatI identify(FileParse source) throws FileFormatException
  71   {
  72     return identify(source, true);
  73     // preserves original behaviour prior to version 2.3
  74   }
  75
  76   public FileFormatI identify(AlignmentFileReaderI file,
  77           boolean closeSource) throws IOException
  78   {
  79     FileParse fp = new FileParse(file.getInFile(),
  80             file.getDataSourceType());
  81     return identify(fp, closeSource);
  82   }
  83
  84   /**
  85    * Identify contents of source, closing it or resetting source to start
  86    * afterwards.
  87    *
  88    * @param source
  89    * @param closeSource
  90    * @return (best guess at) file format
  91    * @throws FileFormatException
  92    */
  93   public FileFormatI identify(FileParse source, boolean closeSource)
  94           throws FileFormatException
  95   {
  96     FileFormatI reply = FileFormat.Pfam;
  97     String data;
  98     int bytesRead = 0;
  99     int trimmedLength = 0;
 100     boolean lineswereskipped = false;
 101     boolean isBinary = false; // true if length is non-zero and non-printable
 102     // characters are encountered
 103
 104     try
 105     {
 106       if (!closeSource)
 107       {
 108         source.mark();
 109       }
 110       boolean aaIndexHeaderRead = false;
 111
 112       while ((data = source.nextLine()) != null)
 113       {
 114         bytesRead += data.length();
 115         trimmedLength += data.trim().length();
 116         if (!lineswereskipped)
 117         {
 118           for (int i = 0; !isBinary && i < data.length(); i++)
 119           {
 120             char c = data.charAt(i);
 121             isBinary = (c < 32 && c != '\t' && c != '\n' && c != '\r'
 122                     && c != 5 && c != 27); // nominal binary character filter
 123             // excluding CR, LF, tab,DEL and ^E
 124             // for certain blast ids
 125           }
 126         }
 127         if (isBinary)
 128         {
 129           // jar files are special - since they contain all sorts of random
 130           // characters.
 131           if (source.inFile != null)
 132           {
 133             String fileStr = source.inFile.getName();
 134             // possibly a Jalview archive.
 135             if (fileStr.lastIndexOf(".jar") > -1
 136                     || fileStr.lastIndexOf(".zip") > -1)
 137             {
 138               reply = FileFormat.Jalview;
 139             }
 140           }
 141           if (!lineswereskipped && data.startsWith("PK"))
 142           {
 143             reply = FileFormat.Jalview; // archive.
 144             break;
 145           }
 146         }
 147         data = data.toUpperCase(Locale.ROOT);
 148
 149         if (data.startsWith(ScoreMatrixFile.SCOREMATRIX))
 150         {
 151           reply = FileFormat.ScoreMatrix;
 152           break;
 153         }
 154         if (data.startsWith("H ") && !aaIndexHeaderRead)
 155         {
 156           aaIndexHeaderRead = true;
 157         }
 158         if (data.startsWith("D ") && aaIndexHeaderRead)
 159         {
 160           reply = FileFormat.ScoreMatrix;
 161           break;
 162         }
 163         if (data.startsWith("##GFF-VERSION"))
 164         {
 165           // GFF - possibly embedded in a Jalview features file!
 166           reply = FileFormat.Features;
 167           break;
 168         }
 169         if (looksLikeFeatureData(data))
 170         {
 171           reply = FileFormat.Features;
 172           break;
 173         }
 174         if (data.indexOf("# STOCKHOLM") > -1)
 175         {
 176           reply = FileFormat.Stockholm;
 177           break;
 178         }
 179         if (data.indexOf("_ENTRY.ID") > -1
 180                 || data.indexOf("_AUDIT_AUTHOR.NAME") > -1
 181                 || data.indexOf("_ATOM_SITE.") > -1)
 182         {
 183           reply = FileFormat.MMCif;
 184           break;
 185         }
 186         // if (data.indexOf(">") > -1)
 187         if (data.startsWith(">"))
 188         {
 189           // FASTA, PIR file or BLC file
 190           boolean checkPIR = false, starterm = false;
 191           if ((data.indexOf(">P1;") > -1) || (data.indexOf(">DL;") > -1))
 192           {
 193             // watch for PIR file attributes
 194             checkPIR = true;
 195             reply = FileFormat.PIR;
 196           }
 197           // could also be BLC file, read next line to confirm
 198           data = source.nextLine();
 199
 200           if (data.indexOf(">") > -1)
 201           {
 202             reply = FileFormat.BLC;
 203           }
 204           else
 205           {
 206             // Is this a single line BLC file?
 207             String data1 = source.nextLine();
 208             String data2 = source.nextLine();
 209             int c1;
 210             if (checkPIR)
 211             {
 212               starterm = (data1 != null && data1.indexOf("*") > -1)
 213                       || (data2 != null && data2.indexOf("*") > -1);
 214             }
 215             if (data2 != null && (c1 = data.indexOf("*")) > -1)
 216             {
 217               if (c1 == 0 && c1 == data2.indexOf("*"))
 218               {
 219                 reply = FileFormat.BLC;
 220               }
 221               else
 222               {
 223                 reply = FileFormat.Fasta; // possibly a bad choice - may be
 224                                           // recognised as
 225                 // PIR
 226               }
 227               // otherwise can still possibly be a PIR file
 228             }
 229             else
 230             {
 231               reply = FileFormat.Fasta;
 232               // TODO : AMSA File is indicated if there is annotation in the
 233               // FASTA file - but FASTA will automatically generate this at the
 234               // mo.
 235               if (!checkPIR)
 236               {
 237                 break;
 238               }
 239             }
 240           }
 241           // final check for PIR content. require
 242           // >P1;title\n<blah>\nterminated sequence to occur at least once.
 243
 244           // TODO the PIR/fasta ambiguity may be the use case that is needed to
 245           // have
 246           // a 'Parse as type XXX' parameter for the applet/application.
 247           if (checkPIR)
 248           {
 249             String dta = null;
 250             if (!starterm)
 251             {
 252               do
 253               {
 254                 try
 255                 {
 256                   dta = source.nextLine();
 257                 } catch (IOException ex)
 258                 {
 259                 }
 260                 if (dta != null && dta.indexOf("*") > -1)
 261                 {
 262                   starterm = true;
 263                 }
 264               } while (dta != null && !starterm);
 265             }
 266             if (starterm)
 267             {
 268               reply = FileFormat.PIR;
 269               break;
 270             }
 271             else
 272             {
 273               reply = FileFormat.Fasta; // probably a bad choice!
 274             }
 275           }
 276           // read as a FASTA (probably)
 277           break;
 278         }
 279         if (data.indexOf("{\"") > -1)
 280         {
 281           reply = FileFormat.Json;
 282           break;
 283         }
 284         int lessThan = data.indexOf("<");
 285         if ((lessThan > -1)) // possible Markup Language data i.e HTML,
 286                              // RNAML, XML
 287         {
 288           String upper = data.toUpperCase(Locale.ROOT);
 289           if (upper.substring(lessThan).startsWith("<HTML"))
 290           {
 291             reply = FileFormat.Html;
 292             break;
 293           }
 294           if (upper.substring(lessThan).startsWith("<RNAML"))
 295           {
 296             reply = FileFormat.Rnaml;
 297             break;
 298           }
 299         }
 300
 301         if ((data.length() < 1) || (data.indexOf("#") == 0))
 302         {
 303           lineswereskipped = true;
 304           continue;
 305         }
 306
 307         if (data.indexOf("PILEUP") > -1)
 308         {
 309           reply = FileFormat.Pileup;
 310
 311           break;
 312         }
 313
 314         if ((data.indexOf("//") == 0) || ((data.indexOf("!!") > -1) && (data
 315                 .indexOf("!!") < data.indexOf("_MULTIPLE_ALIGNMENT "))))
 316         {
 317           reply = FileFormat.MSF;
 318
 319           break;
 320         }
 321         else if (data.indexOf("CLUSTAL") > -1)
 322         {
 323           reply = FileFormat.Clustal;
 324
 325           break;
 326         }
 327
 328         else if (data.indexOf("HEADER") == 0 || data.indexOf("ATOM") == 0)
 329         {
 330           reply = FileFormat.PDB;
 331           break;
 332         }
 333         else if (data.matches("\\s*\\d+\\s+\\d+\\s*"))
 334         {
 335           reply = FileFormat.Phylip;
 336           break;
 337         }
 338         else
 339         {
 340           if (!lineswereskipped && looksLikeJnetData(data))
 341           {
 342             reply = FileFormat.Jnet;
 343             break;
 344           }
 345         }
 346
 347         lineswereskipped = true; // this means there was some junk before any
 348         // key file signature
 349       }
 350       if (closeSource)
 351       {
 352         source.close();
 353       }
 354       else
 355       {
 356         source.reset(bytesRead); // so the file can be parsed from the mark
 357       }
 358     } catch (Exception ex)
 359     {
 360       System.err.println("File Identification failed!\n" + ex);
 361       throw new FileFormatException(source.errormessage);
 362     }
 363     if (trimmedLength == 0)
 364     {
 365       System.err.println(
 366               "File Identification failed! - Empty file was read.");
 367       throw new FileFormatException("EMPTY DATA FILE");
 368     }
 369     System.out.println("File format identified as " + reply.toString());
 370     return reply;
 371   }
 372
 373   /**
 374    * Returns true if the data appears to be Jnet concise annotation format
 375    *
 376    * @param data
 377    * @return
 378    */
 379   protected boolean looksLikeJnetData(String data)
 380   {
 381     char firstChar = data.charAt(0);
 382     int colonPos = data.indexOf(":");
 383     int commaPos = data.indexOf(",");
 384     boolean isJnet = firstChar != '*' && firstChar != ' ' && colonPos > -1
 385             && commaPos > -1 && colonPos < commaPos;
 386     // && data.indexOf(",")<data.indexOf(",", data.indexOf(","))) / ??
 387     return isJnet;
 388   }
 389
 390   /**
 391    * Returns true if the data has at least 6 tab-delimited fields _and_ fields 4
 392    * and 5 are integer (start/end)
 393    *
 394    * @param data
 395    * @return
 396    */
 397   protected boolean looksLikeFeatureData(String data)
 398   {
 399     if (data == null)
 400     {
 401       return false;
 402     }
 403     String[] columns = data.split("\t");
 404     if (columns.length < 6)
 405     {
 406       return false;
 407     }
 408     for (int col = 3; col < 5; col++)
 409     {
 410       try
 411       {
 412         Integer.parseInt(columns[col]);
 413       } catch (NumberFormatException e)
 414       {
 415         return false;
 416       }
 417     }
 418     return true;
 419   }
 420
 421   public static void main(String[] args)
 422   {
 423     for (int i = 0; args != null && i < args.length; i++)
 424     {
 425       IdentifyFile ider = new IdentifyFile();
 426       FileFormatI type = null;
 427       try
 428       {
 429         type = ider.identify(args[i], DataSourceType.FILE);
 430       } catch (FileFormatException e)
 431       {
 432         System.err.println(
 433                 String.format("Error '%s' identifying file type for %s",
 434                         args[i], e.getMessage()));
 435       }
 436       System.out.println("Type of " + args[i] + " is " + type);
 437     }
 438     if (args == null || args.length == 0)
 439     {
 440       System.err.println("Usage: <Filename> [<Filename> ...]");
 441     }
 442   }
 443 }