src/jalview/io/IdentifyFile.java

   1 /*
   2  * Jalview - A Sequence Alignment Editor and Viewer ($$Version-Rel$$)
   3  * Copyright (C) $$Year-Rel$$ The Jalview Authors
   4  *
   5  * This file is part of Jalview.
   6  *
   7  * Jalview is free software: you can redistribute it and/or
   8  * modify it under the terms of the GNU General Public License
   9  * as published by the Free Software Foundation, either version 3
  10  * of the License, or (at your option) any later version.
  11  *
  12  * Jalview is distributed in the hope that it will be useful, but
  13  * WITHOUT ANY WARRANTY; without even the implied warranty
  14  * of MERCHANTABILITY or FITNESS FOR A PARTICULAR
  15  * PURPOSE.  See the GNU General Public License for more details.
  16  *
  17  * You should have received a copy of the GNU General Public License
  18  * along with Jalview.  If not, see <http://www.gnu.org/licenses/>.
  19  * The Jalview Authors are detailed in the 'AUTHORS' file.
  20  */
  21 package jalview.io;
  22
import java.io.IOException;
import java.util.Locale;
  24
  25 /**
  26  * DOCUMENT ME!
  27  *
  28  * @author $author$
  29  * @version $Revision$
  30  */
  31 public class IdentifyFile
  32 {
  33   public static final String FeaturesFile = "GFF or Jalview features";
  34
  35   /**
  36    * Identify a datasource's file content.
  37    *
  38    * @note Do not use this method for stream sources - create a FileParse object
  39    *       instead.
  40    *
  41    * @param file
  42    * @param sourceType
  43    * @return
  44    * @throws FileFormatException
  45    */
  46   public FileFormatI identify(String file, DataSourceType sourceType)
  47           throws FileFormatException
  48   {
  49     String emessage = "UNIDENTIFIED FILE PARSING ERROR";
  50     FileParse parser = null;
  51     try
  52     {
  53       parser = new FileParse(file, sourceType);
  54       if (parser.isValid())
  55       {
  56         return identify(parser);
  57       }
  58     } catch (Exception e)
  59     {
  60       System.err.println("Error whilst identifying");
  61       e.printStackTrace(System.err);
  62       emessage = e.getMessage();
  63     }
  64     if (parser != null)
  65     {
  66       throw new FileFormatException(parser.errormessage);
  67     }
  68     throw new FileFormatException(emessage);
  69   }
  70
  71   public FileFormatI identify(FileParse source) throws FileFormatException
  72   {
  73     return identify(source, true);
  74     // preserves original behaviour prior to version 2.3
  75   }
  76
  77   /**
  78    * Identify contents of source, closing it or resetting source to start
  79    * afterwards.
  80    *
  81    * @param source
  82    * @param closeSource
  83    * @return (best guess at) file format
  84    * @throws FileFormatException
  85    */
  86   public FileFormatI identify(FileParse source, boolean closeSource)
  87           throws FileFormatException
  88   {
  89     FileFormatI reply = FileFormat.Pfam;
  90     String data;
  91     int bytesRead = 0;
  92     int trimmedLength = 0;
  93     boolean lineswereskipped = false;
  94     boolean isBinary = false; // true if length is non-zero and non-printable
  95     // characters are encountered
  96     try
  97     {
  98       if (!closeSource)
  99       {
 100         source.mark();
 101       }
 102       while ((data = source.nextLine()) != null)
 103       {
 104         bytesRead += data.length();
 105         trimmedLength += data.trim().length();
 106         if (!lineswereskipped)
 107         {
 108           for (int i = 0; !isBinary && i < data.length(); i++)
 109           {
 110             char c = data.charAt(i);
 111             isBinary = (c < 32 && c != '\t' && c != '\n' && c != '\r'
 112                     && c != 5 && c != 27); // nominal binary character filter
 113             // excluding CR, LF, tab,DEL and ^E
 114             // for certain blast ids
 115           }
 116         }
 117         if (isBinary)
 118         {
 119           // jar files are special - since they contain all sorts of random
 120           // characters.
 121           if (source.inFile != null)
 122           {
 123             String fileStr = source.inFile.getName();
 124             // possibly a Jalview archive.
 125             if (fileStr.lastIndexOf(".jar") > -1
 126                     || fileStr.lastIndexOf(".zip") > -1)
 127             {
 128               reply = FileFormat.Jalview;
 129             }
 130           }
 131           if (!lineswereskipped && data.startsWith("PK"))
 132           {
 133             reply = FileFormat.Jalview; // archive.
 134             break;
 135           }
 136         }
 137         data = data.toUpperCase();
 138
 139         if (data.startsWith("##GFF-VERSION"))
 140         {
 141           // GFF - possibly embedded in a Jalview features file!
 142           reply = FileFormat.Features;
 143           break;
 144         }
 145         if (looksLikeFeatureData(data))
 146         {
 147           reply = FileFormat.Features;
 148           break;
 149         }
 150         if (data.indexOf("# STOCKHOLM") > -1)
 151         {
 152           reply = FileFormat.Stockholm;
 153           break;
 154         }
 155         if (data.indexOf("_ENTRY.ID") > -1
 156                 || data.indexOf("_AUDIT_AUTHOR.NAME") > -1
 157                 || data.indexOf("_ATOM_SITE.") > -1)
 158         {
 159           reply = FileFormat.MMCif;
 160           break;
 161         }
 162         // if (data.indexOf(">") > -1)
 163         if (data.startsWith(">"))
 164         {
 165           // FASTA, PIR file or BLC file
 166           boolean checkPIR = false, starterm = false;
 167           if ((data.indexOf(">P1;") > -1) || (data.indexOf(">DL;") > -1))
 168           {
 169             // watch for PIR file attributes
 170             checkPIR = true;
 171             reply = FileFormat.PIR;
 172           }
 173           // could also be BLC file, read next line to confirm
 174           data = source.nextLine();
 175
 176           if (data.indexOf(">") > -1)
 177           {
 178             reply = FileFormat.BLC;
 179           }
 180           else
 181           {
 182             // Is this a single line BLC file?
 183             String data1 = source.nextLine();
 184             String data2 = source.nextLine();
 185             int c1;
 186             if (checkPIR)
 187             {
 188               starterm = (data1 != null && data1.indexOf("*") > -1)
 189                       || (data2 != null && data2.indexOf("*") > -1);
 190             }
 191             if (data2 != null && (c1 = data.indexOf("*")) > -1)
 192             {
 193               if (c1 == 0 && c1 == data2.indexOf("*"))
 194               {
 195                 reply = FileFormat.BLC;
 196               }
 197               else
 198               {
 199                 reply = FileFormat.Fasta; // possibly a bad choice - may be
 200                                           // recognised as
 201                 // PIR
 202               }
 203               // otherwise can still possibly be a PIR file
 204             }
 205             else
 206             {
 207               reply = FileFormat.Fasta;
 208               // TODO : AMSA File is indicated if there is annotation in the
 209               // FASTA file - but FASTA will automatically generate this at the
 210               // mo.
 211               if (!checkPIR)
 212               {
 213                 break;
 214               }
 215             }
 216           }
 217           // final check for PIR content. require
 218           // >P1;title\n<blah>\nterminated sequence to occur at least once.
 219
 220           // TODO the PIR/fasta ambiguity may be the use case that is needed to
 221           // have
 222           // a 'Parse as type XXX' parameter for the applet/application.
 223           if (checkPIR)
 224           {
 225             String dta = null;
 226             if (!starterm)
 227             {
 228               do
 229               {
 230                 try
 231                 {
 232                   dta = source.nextLine();
 233                 } catch (IOException ex)
 234                 {
 235                 }
 236                 if (dta != null && dta.indexOf("*") > -1)
 237                 {
 238                   starterm = true;
 239                 }
 240               } while (dta != null && !starterm);
 241             }
 242             if (starterm)
 243             {
 244               reply = FileFormat.PIR;
 245               break;
 246             }
 247             else
 248             {
 249               reply = FileFormat.Fasta; // probably a bad choice!
 250             }
 251           }
 252           // read as a FASTA (probably)
 253           break;
 254         }
 255         int lessThan = data.indexOf("<");
 256         if ((lessThan > -1)) // possible Markup Language data i.e HTML,
 257                                       // RNAML, XML
 258         {
 259           String upper = data.toUpperCase();
 260           if (upper.substring(lessThan).startsWith("<HTML"))
 261           {
 262             reply = FileFormat.Html;
 263             break;
 264           }
 265           if (upper.substring(lessThan).startsWith("<RNAML"))
 266           {
 267             reply = FileFormat.Rnaml;
 268             break;
 269           }
 270         }
 271
 272         if (data.indexOf("{\"") > -1)
 273         {
 274           reply = FileFormat.Json;
 275           break;
 276         }
 277         if ((data.length() < 1) || (data.indexOf("#") == 0))
 278         {
 279           lineswereskipped = true;
 280           continue;
 281         }
 282
 283         if (data.indexOf("PILEUP") > -1)
 284         {
 285           reply = FileFormat.Pileup;
 286
 287           break;
 288         }
 289
 290         if ((data.indexOf("//") == 0)
 291                 || ((data.indexOf("!!") > -1) && (data.indexOf("!!") < data
 292                         .indexOf("_MULTIPLE_ALIGNMENT "))))
 293         {
 294           reply = FileFormat.MSF;
 295
 296           break;
 297         }
 298         else if (data.indexOf("CLUSTAL") > -1)
 299         {
 300           reply = FileFormat.Clustal;
 301
 302           break;
 303         }
 304
 305         else if (data.indexOf("HEADER") == 0 || data.indexOf("ATOM") == 0)
 306         {
 307           reply = FileFormat.PDB;
 308           break;
 309         }
 310         else if (data.matches("\\s*\\d+\\s+\\d+\\s*"))
 311         {
 312           reply = FileFormat.Phylip;
 313           break;
 314         }
 315         else
 316         {
 317           if (!lineswereskipped && looksLikeJnetData(data))
 318           {
 319             reply = FileFormat.Jnet;
 320             break;
 321           }
 322         }
 323
 324         lineswereskipped = true; // this means there was some junk before any
 325         // key file signature
 326       }
 327       if (closeSource)
 328       {
 329         source.close();
 330       }
 331       else
 332       {
 333         source.reset(bytesRead); // so the file can be parsed from the mark
 334       }
 335     } catch (Exception ex)
 336     {
 337       System.err.println("File Identification failed!\n" + ex);
 338       throw new FileFormatException(source.errormessage);
 339     }
 340     if (trimmedLength == 0)
 341     {
 342       System.err
 343               .println("File Identification failed! - Empty file was read.");
 344       throw new FileFormatException("EMPTY DATA FILE");
 345     }
 346     return reply;
 347   }
 348
 349   /**
 350    * Returns true if the data appears to be Jnet concise annotation format
 351    *
 352    * @param data
 353    * @return
 354    */
 355   protected boolean looksLikeJnetData(String data)
 356   {
 357     char firstChar = data.charAt(0);
 358     int colonPos = data.indexOf(":");
 359     int commaPos = data.indexOf(",");
 360     boolean isJnet = firstChar != '*' && firstChar != ' ' && colonPos > -1
 361             && commaPos > -1 && colonPos < commaPos;
 362     // && data.indexOf(",")<data.indexOf(",", data.indexOf(","))) / ??
 363     return isJnet;
 364   }
 365
 366   /**
 367    * Returns true if the data has at least 6 tab-delimited fields _and_
 368    * fields 4 and 5 are integer (start/end)
 369    * @param data
 370    * @return
 371    */
 372   protected boolean looksLikeFeatureData(String data)
 373   {
 374     if (data == null)
 375     {
 376       return false;
 377     }
 378     String[] columns = data.split("\t");
 379     if (columns.length < 6) {
 380       return false;
 381     }
 382     for (int col = 3; col < 5; col++)
 383     {
 384       try {
 385         Integer.parseInt(columns[col]);
 386       } catch (NumberFormatException e) {
 387         return false;
 388       }
 389     }
 390     return true;
 391   }
 392
 393   public static void main(String[] args)
 394   {
 395     for (int i = 0; args != null && i < args.length; i++)
 396     {
 397       IdentifyFile ider = new IdentifyFile();
 398       FileFormatI type = null;
 399       try
 400       {
 401         type = ider.identify(args[i], DataSourceType.FILE);
 402       } catch (FileFormatException e)
 403       {
 404         System.err.println(String.format(
 405                 "Error '%s' identifying file type for %s", args[i],
 406                 e.getMessage()));
 407       }
 408       System.out.println("Type of " + args[i] + " is " + type);
 409     }
 410     if (args == null || args.length == 0)
 411     {
 412       System.err.println("Usage: <Filename> [<Filename> ...]");
 413     }
 414   }
 415 }