src/jalview/io/FeaturesFile.java

   1 /*
   2  * Jalview - A Sequence Alignment Editor and Viewer ($$Version-Rel$$)
   3  * Copyright (C) $$Year-Rel$$ The Jalview Authors
   4  *
   5  * This file is part of Jalview.
   6  *
   7  * Jalview is free software: you can redistribute it and/or
   8  * modify it under the terms of the GNU General Public License
   9  * as published by the Free Software Foundation, either version 3
  10  * of the License, or (at your option) any later version.
  11  *
  12  * Jalview is distributed in the hope that it will be useful, but
  13  * WITHOUT ANY WARRANTY; without even the implied warranty
  14  * of MERCHANTABILITY or FITNESS FOR A PARTICULAR
  15  * PURPOSE.  See the GNU General Public License for more details.
  16  *
  17  * You should have received a copy of the GNU General Public License
  18  * along with Jalview.  If not, see <http://www.gnu.org/licenses/>.
  19  * The Jalview Authors are detailed in the 'AUTHORS' file.
  20  */
  21 package jalview.io;
  22
  23 import jalview.analysis.SequenceIdMatcher;
  24 import jalview.datamodel.AlignedCodonFrame;
  25 import jalview.datamodel.AlignmentI;
  26 import jalview.datamodel.SequenceDummy;
  27 import jalview.datamodel.SequenceFeature;
  28 import jalview.datamodel.SequenceI;
  29 import jalview.schemes.AnnotationColourGradient;
  30 import jalview.schemes.GraduatedColor;
  31 import jalview.schemes.UserColourScheme;
  32 import jalview.util.Format;
  33 import jalview.util.MapList;
  34
  35 import java.io.IOException;
  36 import java.util.ArrayList;
  37 import java.util.Arrays;
  38 import java.util.HashMap;
  39 import java.util.Hashtable;
  40 import java.util.Iterator;
  41 import java.util.List;
  42 import java.util.Map;
  43 import java.util.StringTokenizer;
  44 import java.util.Vector;
  45
  46 /**
 * Parses and creates Jalview features files. Detects GFF format features files
 * and parses them. Does not implement the standard print() - call the specific
 * printFeatures or printGFF methods instead. Uses
 * AlignmentI.findSequence(String id) to find the sequence object for the
 * features annotation - this normally works on an exact match.
  51  *
  52  * @author AMW
  53  * @version $Revision$
  54  */
  55 public class FeaturesFile extends AlignFile
  56 {
  57   /**
  58    * work around for GFF interpretation bug where source string becomes
  59    * description rather than a group
  60    */
  61   private boolean doGffSource = true;
  62
  63   private int gffversion;
  64
  /**
   * Constructs a FeaturesFile that is not bound to any input; only the
   * superclass default constructor is run.
   */
  public FeaturesFile()
  {
    super();
  }
  71
  /**
   * Constructs a FeaturesFile by handing both arguments, unchanged, to the
   * two-argument superclass constructor.
   * 
   * @param inFile
   *          - forwarded to the superclass (presumably the file name, URL or
   *          pasted text to read - confirm against AlignFile)
   * @param type
   *          - forwarded to the superclass (presumably the protocol
   *          describing how inFile is accessed - confirm against AlignFile)
   * @throws IOException
   *           if the superclass constructor throws it
   */
  public FeaturesFile(String inFile, String type) throws IOException
  {
    super(inFile, type);
  }
  81
  /**
   * Constructs a FeaturesFile on top of an existing FileParse by passing it,
   * unchanged, to the matching superclass constructor.
   * 
   * @param source
   *          - the FileParse object handed to the superclass
   * @throws IOException
   *           if the superclass constructor throws it
   */
  public FeaturesFile(FileParse source) throws IOException
  {
    super(source);
  }
  90
  /**
   * Constructs a FeaturesFile on top of an existing FileParse, forwarding
   * both arguments unchanged to the matching superclass constructor.
   * 
   * @param parseImmediately
   *          - forwarded to the superclass (presumably controls whether
   *          parsing happens during construction - confirm against AlignFile)
   * @param source
   *          - the FileParse object handed to the superclass
   * @throws IOException
   *           if the superclass constructor throws it
   */
  public FeaturesFile(boolean parseImmediately, FileParse source)
          throws IOException
  {
    super(parseImmediately, source);
  }
 101
 102   /**
 103    * @param parseImmediately
 104    * @param inFile
 105    * @param type
 106    * @throws IOException
 107    */
 108   public FeaturesFile(boolean parseImmediately, String inFile, String type)
 109           throws IOException
 110   {
 111     super(parseImmediately, inFile, type);
 112   }
 113
 114   /**
 115    * Parse GFF or sequence features file using case-independent matching,
 116    * discarding URLs
 117    *
 118    * @param align
 119    *          - alignment/dataset containing sequences that are to be annotated
 120    * @param colours
 121    *          - hashtable to store feature colour definitions
 122    * @param removeHTML
 123    *          - process html strings into plain text
 124    * @return true if features were added
 125    */
 126   public boolean parse(AlignmentI align, Hashtable colours,
 127           boolean removeHTML)
 128   {
 129     return parse(align, colours, null, removeHTML, false);
 130   }
 131
 132   /**
 133    * Parse GFF or sequence features file optionally using case-independent
 134    * matching, discarding URLs
 135    *
 136    * @param align
 137    *          - alignment/dataset containing sequences that are to be annotated
 138    * @param colours
 139    *          - hashtable to store feature colour definitions
 140    * @param removeHTML
 141    *          - process html strings into plain text
 142    * @param relaxedIdmatching
 143    *          - when true, ID matches to compound sequence IDs are allowed
 144    * @return true if features were added
 145    */
 146   public boolean parse(AlignmentI align, Map colours, boolean removeHTML,
 147           boolean relaxedIdMatching)
 148   {
 149     return parse(align, colours, null, removeHTML, relaxedIdMatching);
 150   }
 151
 152   /**
 153    * Parse GFF or sequence features file optionally using case-independent
 154    * matching
 155    *
 156    * @param align
 157    *          - alignment/dataset containing sequences that are to be annotated
 158    * @param colours
 159    *          - hashtable to store feature colour definitions
 160    * @param featureLink
 161    *          - hashtable to store associated URLs
 162    * @param removeHTML
 163    *          - process html strings into plain text
 164    * @return true if features were added
 165    */
 166   public boolean parse(AlignmentI align, Map colours, Map featureLink,
 167           boolean removeHTML)
 168   {
 169     return parse(align, colours, featureLink, removeHTML, false);
 170   }
 171
 172   @Override
 173   public void addAnnotations(AlignmentI al)
 174   {
 175     // TODO Auto-generated method stub
 176     super.addAnnotations(al);
 177   }
 178
 179   @Override
 180   public void addProperties(AlignmentI al)
 181   {
 182     // TODO Auto-generated method stub
 183     super.addProperties(al);
 184   }
 185
 186   @Override
 187   public void addSeqGroups(AlignmentI al)
 188   {
 189     // TODO Auto-generated method stub
 190     super.addSeqGroups(al);
 191   }
 192
 193   /**
 194    * Parse GFF or sequence features file
 195    *
 196    * @param align
 197    *          - alignment/dataset containing sequences that are to be annotated
 198    * @param colours
 199    *          - hashtable to store feature colour definitions
 200    * @param featureLink
 201    *          - hashtable to store associated URLs
 202    * @param removeHTML
 203    *          - process html strings into plain text
 204    * @param relaxedIdmatching
 205    *          - when true, ID matches to compound sequence IDs are allowed
 206    * @return true if features were added
 207    */
 208   public boolean parse(AlignmentI align, Map colours, Map featureLink,
 209           boolean removeHTML, boolean relaxedIdmatching)
 210   {
 211
 212     String line = null;
 213     try
 214     {
 215       SequenceI seq = null;
 216       /**
 217        * keep track of any sequences we try to create from the data if it is a
 218        * GFF3 file
 219        */
 220       ArrayList<SequenceI> newseqs = new ArrayList<SequenceI>();
 221       String type, desc, token = null;
 222
 223       int index, start, end;
 224       float score;
 225       StringTokenizer st;
 226       SequenceFeature sf;
 227       String featureGroup = null, groupLink = null;
 228       Map typeLink = new Hashtable();
 229       /**
 230        * when true, assume GFF style features rather than Jalview style.
 231        */
 232       boolean GFFFile = true;
 233       Map<String, String> gffProps = new HashMap<String, String>();
 234       while ((line = nextLine()) != null)
 235       {
 236         // skip comments/process pragmas
 237         if (line.startsWith("#"))
 238         {
 239           if (line.startsWith("##"))
 240           {
 241             // possibly GFF2/3 version and metadata header
 242             processGffPragma(line, gffProps, align, newseqs);
 243             line = "";
 244           }
 245           continue;
 246         }
 247
 248         st = new StringTokenizer(line, "\t");
 249         if (st.countTokens() == 1)
 250         {
 251           if (line.trim().equalsIgnoreCase("GFF"))
 252           {
 253             // Start parsing file as if it might be GFF again.
 254             GFFFile = true;
 255             continue;
 256           }
 257         }
 258         if (st.countTokens() > 1 && st.countTokens() < 4)
 259         {
 260           GFFFile = false;
 261           type = st.nextToken();
 262           if (type.equalsIgnoreCase("startgroup"))
 263           {
 264             featureGroup = st.nextToken();
 265             if (st.hasMoreElements())
 266             {
 267               groupLink = st.nextToken();
 268               featureLink.put(featureGroup, groupLink);
 269             }
 270           }
 271           else if (type.equalsIgnoreCase("endgroup"))
 272           {
 273             // We should check whether this is the current group,
 274             // but at present theres no way of showing more than 1 group
 275             st.nextToken();
 276             featureGroup = null;
 277             groupLink = null;
 278           }
 279           else
 280           {
 281             Object colour = null;
 282             String colscheme = st.nextToken();
 283             if (colscheme.indexOf("|") > -1
 284                     || colscheme.trim().equalsIgnoreCase("label"))
 285             {
 286               // Parse '|' separated graduated colourscheme fields:
 287               // [label|][mincolour|maxcolour|[absolute|]minvalue|maxvalue|thresholdtype|thresholdvalue]
 288               // can either provide 'label' only, first is optional, next two
 289               // colors are required (but may be
 290               // left blank), next is optional, nxt two min/max are required.
 291               // first is either 'label'
 292               // first/second and third are both hexadecimal or word equivalent
 293               // colour.
 294               // next two are values parsed as floats.
 295               // fifth is either 'above','below', or 'none'.
 296               // sixth is a float value and only required when fifth is either
 297               // 'above' or 'below'.
 298               StringTokenizer gcol = new StringTokenizer(colscheme, "|",
 299                       true);
 300               // set defaults
 301               int threshtype = AnnotationColourGradient.NO_THRESHOLD;
 302               float min = Float.MIN_VALUE, max = Float.MAX_VALUE, threshval = Float.NaN;
 303               boolean labelCol = false;
 304               // Parse spec line
 305               String mincol = gcol.nextToken();
 306               if (mincol == "|")
 307               {
 308                 System.err
 309                         .println("Expected either 'label' or a colour specification in the line: "
 310                                 + line);
 311                 continue;
 312               }
 313               String maxcol = null;
 314               if (mincol.toLowerCase().indexOf("label") == 0)
 315               {
 316                 labelCol = true;
 317                 mincol = (gcol.hasMoreTokens() ? gcol.nextToken() : null); // skip
 318                                                                            // '|'
 319                 mincol = (gcol.hasMoreTokens() ? gcol.nextToken() : null);
 320               }
 321               String abso = null, minval, maxval;
 322               if (mincol != null)
 323               {
 324                 // at least four more tokens
 325                 if (mincol.equals("|"))
 326                 {
 327                   mincol = "";
 328                 }
 329                 else
 330                 {
 331                   gcol.nextToken(); // skip next '|'
 332                 }
 333                 // continue parsing rest of line
 334                 maxcol = gcol.nextToken();
 335                 if (maxcol.equals("|"))
 336                 {
 337                   maxcol = "";
 338                 }
 339                 else
 340                 {
 341                   gcol.nextToken(); // skip next '|'
 342                 }
 343                 abso = gcol.nextToken();
 344                 gcol.nextToken(); // skip next '|'
 345                 if (abso.toLowerCase().indexOf("abso") != 0)
 346                 {
 347                   minval = abso;
 348                   abso = null;
 349                 }
 350                 else
 351                 {
 352                   minval = gcol.nextToken();
 353                   gcol.nextToken(); // skip next '|'
 354                 }
 355                 maxval = gcol.nextToken();
 356                 if (gcol.hasMoreTokens())
 357                 {
 358                   gcol.nextToken(); // skip next '|'
 359                 }
 360                 try
 361                 {
 362                   if (minval.length() > 0)
 363                   {
 364                     min = new Float(minval).floatValue();
 365                   }
 366                 } catch (Exception e)
 367                 {
 368                   System.err
 369                           .println("Couldn't parse the minimum value for graduated colour for type ("
 370                                   + colscheme
 371                                   + ") - did you misspell 'auto' for the optional automatic colour switch ?");
 372                   e.printStackTrace();
 373                 }
 374                 try
 375                 {
 376                   if (maxval.length() > 0)
 377                   {
 378                     max = new Float(maxval).floatValue();
 379                   }
 380                 } catch (Exception e)
 381                 {
 382                   System.err
 383                           .println("Couldn't parse the maximum value for graduated colour for type ("
 384                                   + colscheme + ")");
 385                   e.printStackTrace();
 386                 }
 387               }
 388               else
 389               {
 390                 // add in some dummy min/max colours for the label-only
 391                 // colourscheme.
 392                 mincol = "FFFFFF";
 393                 maxcol = "000000";
 394               }
 395               try
 396               {
 397                 colour = new jalview.schemes.GraduatedColor(
 398                         new UserColourScheme(mincol).findColour('A'),
 399                         new UserColourScheme(maxcol).findColour('A'), min,
 400                         max);
 401               } catch (Exception e)
 402               {
 403                 System.err
 404                         .println("Couldn't parse the graduated colour scheme ("
 405                                 + colscheme + ")");
 406                 e.printStackTrace();
 407               }
 408               if (colour != null)
 409               {
 410                 ((jalview.schemes.GraduatedColor) colour)
 411                         .setColourByLabel(labelCol);
 412                 ((jalview.schemes.GraduatedColor) colour)
 413                         .setAutoScaled(abso == null);
 414                 // add in any additional parameters
 415                 String ttype = null, tval = null;
 416                 if (gcol.hasMoreTokens())
 417                 {
 418                   // threshold type and possibly a threshold value
 419                   ttype = gcol.nextToken();
 420                   if (ttype.toLowerCase().startsWith("below"))
 421                   {
 422                     ((jalview.schemes.GraduatedColor) colour)
 423                             .setThreshType(AnnotationColourGradient.BELOW_THRESHOLD);
 424                   }
 425                   else if (ttype.toLowerCase().startsWith("above"))
 426                   {
 427                     ((jalview.schemes.GraduatedColor) colour)
 428                             .setThreshType(AnnotationColourGradient.ABOVE_THRESHOLD);
 429                   }
 430                   else
 431                   {
 432                     ((jalview.schemes.GraduatedColor) colour)
 433                             .setThreshType(AnnotationColourGradient.NO_THRESHOLD);
 434                     if (!ttype.toLowerCase().startsWith("no"))
 435                     {
 436                       System.err
 437                               .println("Ignoring unrecognised threshold type : "
 438                                       + ttype);
 439                     }
 440                   }
 441                 }
 442                 if (((GraduatedColor) colour).getThreshType() != AnnotationColourGradient.NO_THRESHOLD)
 443                 {
 444                   try
 445                   {
 446                     gcol.nextToken();
 447                     tval = gcol.nextToken();
 448                     ((jalview.schemes.GraduatedColor) colour)
 449                             .setThresh(new Float(tval).floatValue());
 450                   } catch (Exception e)
 451                   {
 452                     System.err
 453                             .println("Couldn't parse threshold value as a float: ("
 454                                     + tval + ")");
 455                     e.printStackTrace();
 456                   }
 457                 }
 458                 // parse the thresh-is-min token ?
 459                 if (gcol.hasMoreTokens())
 460                 {
 461                   System.err
 462                           .println("Ignoring additional tokens in parameters in graduated colour specification\n");
 463                   while (gcol.hasMoreTokens())
 464                   {
 465                     System.err.println("|" + gcol.nextToken());
 466                   }
 467                   System.err.println("\n");
 468                 }
 469               }
 470             }
 471             else
 472             {
 473               UserColourScheme ucs = new UserColourScheme(colscheme);
 474               colour = ucs.findColour('A');
 475             }
 476             if (colour != null)
 477             {
 478               colours.put(type, colour);
 479             }
 480             if (st.hasMoreElements())
 481             {
 482               String link = st.nextToken();
 483               typeLink.put(type, link);
 484               if (featureLink == null)
 485               {
 486                 featureLink = new Hashtable();
 487               }
 488               featureLink.put(type, link);
 489             }
 490           }
 491           continue;
 492         }
 493         String seqId = "";
 494         while (st.hasMoreElements())
 495         {
 496
 497           if (GFFFile)
 498           {
 499             // Still possible this is an old Jalview file,
 500             // which does not have type colours at the beginning
 501             seqId = token = st.nextToken();
 502             seq = findName(align, seqId, relaxedIdmatching, newseqs);
 503             if (seq != null)
 504             {
 505               desc = st.nextToken();
 506               String group = null;
 507               if (doGffSource && desc.indexOf(' ') == -1)
 508               {
 509                 // could also be a source term rather than description line
 510                 group = new String(desc);
 511               }
 512               type = st.nextToken();
 513               try
 514               {
 515                 String stt = st.nextToken();
 516                 if (stt.length() == 0 || stt.equals("-"))
 517                 {
 518                   start = 0;
 519                 }
 520                 else
 521                 {
 522                   start = Integer.parseInt(stt);
 523                 }
 524               } catch (NumberFormatException ex)
 525               {
 526                 start = 0;
 527               }
 528               try
 529               {
 530                 String stt = st.nextToken();
 531                 if (stt.length() == 0 || stt.equals("-"))
 532                 {
 533                   end = 0;
 534                 }
 535                 else
 536                 {
 537                   end = Integer.parseInt(stt);
 538                 }
 539               } catch (NumberFormatException ex)
 540               {
 541                 end = 0;
 542               }
 543               // TODO: decide if non positional feature assertion for input data
 544               // where end==0 is generally valid
 545               if (end == 0)
 546               {
 547                 // treat as non-positional feature, regardless.
 548                 start = 0;
 549               }
 550               try
 551               {
 552                 score = new Float(st.nextToken()).floatValue();
 553               } catch (NumberFormatException ex)
 554               {
 555                 score = 0;
 556               }
 557
 558               sf = new SequenceFeature(type, desc, start, end, score, group);
 559
 560               try
 561               {
 562                 sf.setValue("STRAND", st.nextToken());
 563                 sf.setValue("FRAME", st.nextToken());
 564               } catch (Exception ex)
 565               {
 566               }
 567
 568               if (st.hasMoreTokens())
 569               {
 570                 StringBuffer attributes = new StringBuffer();
 571                 boolean sep = false;
 572                 while (st.hasMoreTokens())
 573                 {
 574                   attributes.append((sep ? "\t" : "") + st.nextElement());
 575                   sep = true;
 576                 }
 577                 // TODO validate and split GFF2 attributes field ? parse out
 578                 // ([A-Za-z][A-Za-z0-9_]*) <value> ; and add as
 579                 // sf.setValue(attrib, val);
 580                 sf.setValue("ATTRIBUTES", attributes.toString());
 581               }
 582
 583               if (processOrAddSeqFeature(align, newseqs, seq, sf, GFFFile,
 584                       relaxedIdmatching))
 585               {
 586                 // check whether we should add the sequence feature to any other
 587                 // sequences in the alignment with the same or similar
 588                 while ((seq = align.findName(seq, seqId, true)) != null)
 589                 {
 590                   seq.addSequenceFeature(new SequenceFeature(sf));
 591                 }
 592               }
 593               break;
 594             }
 595           }
 596
 597           if (GFFFile && seq == null)
 598           {
 599             desc = token;
 600           }
 601           else
 602           {
 603             desc = st.nextToken();
 604           }
 605           if (!st.hasMoreTokens())
 606           {
 607             System.err
 608                     .println("DEBUG: Run out of tokens when trying to identify the destination for the feature.. giving up.");
 609             // in all probability, this isn't a file we understand, so bail
 610             // quietly.
 611             return false;
 612           }
 613
 614           token = st.nextToken();
 615
 616           if (!token.equals("ID_NOT_SPECIFIED"))
 617           {
 618             seq = findName(align, seqId = token, relaxedIdmatching, null);
 619             st.nextToken();
 620           }
 621           else
 622           {
 623             seqId = null;
 624             try
 625             {
 626               index = Integer.parseInt(st.nextToken());
 627               seq = align.getSequenceAt(index);
 628             } catch (NumberFormatException ex)
 629             {
 630               seq = null;
 631             }
 632           }
 633
 634           if (seq == null)
 635           {
 636             System.out.println("Sequence not found: " + line);
 637             break;
 638           }
 639
 640           start = Integer.parseInt(st.nextToken());
 641           end = Integer.parseInt(st.nextToken());
 642
 643           type = st.nextToken();
 644
 645           if (!colours.containsKey(type))
 646           {
 647             // Probably the old style groups file
 648             UserColourScheme ucs = new UserColourScheme(type);
 649             colours.put(type, ucs.findColour('A'));
 650           }
 651           sf = new SequenceFeature(type, desc, "", start, end, featureGroup);
 652           if (st.hasMoreTokens())
 653           {
 654             try
 655             {
 656               score = new Float(st.nextToken()).floatValue();
 657               // update colourgradient bounds if allowed to
 658             } catch (NumberFormatException ex)
 659             {
 660               score = 0;
 661             }
 662             sf.setScore(score);
 663           }
 664           if (groupLink != null && removeHTML)
 665           {
 666             sf.addLink(groupLink);
 667             sf.description += "%LINK%";
 668           }
 669           if (typeLink.containsKey(type) && removeHTML)
 670           {
 671             sf.addLink(typeLink.get(type).toString());
 672             sf.description += "%LINK%";
 673           }
 674
 675           parseDescriptionHTML(sf, removeHTML);
 676
 677           seq.addSequenceFeature(sf);
 678
 679           while (seqId != null
 680                   && (seq = align.findName(seq, seqId, false)) != null)
 681           {
 682             seq.addSequenceFeature(new SequenceFeature(sf));
 683           }
 684           // If we got here, its not a GFFFile
 685           GFFFile = false;
 686         }
 687       }
 688       resetMatcher();
 689     } catch (Exception ex)
 690     {
 691       // should report somewhere useful for UI if necessary
 692       warningMessage = ((warningMessage == null) ? "" : warningMessage)
 693               + "Parsing error at\n" + line;
 694       System.out.println("Error parsing feature file: " + ex + "\n" + line);
 695       ex.printStackTrace(System.err);
 696       resetMatcher();
 697       return false;
 698     }
 699
 700     return true;
 701   }
 702
 703   private enum GffPragmas
 704   {
 705     gff_version, sequence_region, feature_ontology, attribute_ontology, source_ontology, species_build, fasta, hash
 706   };
 707
 708   private static Map<String, GffPragmas> GFFPRAGMA;
 709   static
 710   {
 711     GFFPRAGMA = new HashMap<String, GffPragmas>();
 712     GFFPRAGMA.put("sequence-region", GffPragmas.sequence_region);
 713     GFFPRAGMA.put("feature-ontology", GffPragmas.feature_ontology);
 714     GFFPRAGMA.put("#", GffPragmas.hash);
 715     GFFPRAGMA.put("fasta", GffPragmas.fasta);
 716     GFFPRAGMA.put("species-build", GffPragmas.species_build);
 717     GFFPRAGMA.put("source-ontology", GffPragmas.source_ontology);
 718     GFFPRAGMA.put("attribute-ontology", GffPragmas.attribute_ontology);
 719   }
 720
 721   private void processGffPragma(String line, Map<String, String> gffProps,
 722           AlignmentI align, ArrayList<SequenceI> newseqs)
 723           throws IOException
 724   {
 725     // line starts with ##
 726     int spacepos = line.indexOf(' ');
 727     String pragma = spacepos == -1 ? line.substring(2).trim() : line
 728             .substring(2, spacepos);
 729     GffPragmas gffpragma = GFFPRAGMA.get(pragma.toLowerCase());
 730     if (gffpragma == null)
 731     {
 732       return;
 733     }
 734     switch (gffpragma)
 735     {
 736     case gff_version:
 737       try
 738       {
 739         gffversion = Integer.parseInt(line.substring(spacepos + 1));
 740       } finally
 741       {
 742
 743       }
 744       break;
 745     case feature_ontology:
 746       // resolve against specific feature ontology
 747       break;
 748     case attribute_ontology:
 749       // resolve against specific attribute ontology
 750       break;
 751     case source_ontology:
 752       // resolve against specific source ontology
 753       break;
 754     case species_build:
 755       // resolve against specific NCBI taxon version
 756       break;
 757     case hash:
 758       // close off any open feature hierarchies
 759       break;
 760     case fasta:
 761       // process the rest of the file as a fasta file and replace any dummy
 762       // sequence IDs
 763       process_as_fasta(align, newseqs);
 764       break;
 765     default:
 766       // we do nothing ?
 767       System.err.println("Ignoring unknown pragma:\n" + line);
 768     }
 769   }
 770
 771   private void process_as_fasta(AlignmentI align, List<SequenceI> newseqs)
 772           throws IOException
 773   {
 774     try
 775     {
 776       mark();
 777     } catch (IOException q)
 778     {
 779     }
 780     FastaFile parser = new FastaFile(this);
 781     List<SequenceI> includedseqs = parser.getSeqs();
 782     SequenceIdMatcher smatcher = new SequenceIdMatcher(newseqs);
 783     // iterate over includedseqs, and replacing matching ones with newseqs
 784     // sequences. Generic iterator not used here because we modify includedseqs
 785     // as we go
 786     for (int p = 0, pSize = includedseqs.size(); p < pSize; p++)
 787     {
 788       // search for any dummy seqs that this sequence can be used to update
 789       SequenceI dummyseq = smatcher.findIdMatch(includedseqs.get(p));
 790       if (dummyseq != null)
 791       {
 792         // dummyseq was created so it could be annotated and referred to in
 793         // alignments/codon mappings
 794
 795         SequenceI mseq = includedseqs.get(p);
 796         // mseq is the 'template' imported from the FASTA file which we'll use
 797         // to coomplete dummyseq
 798         if (dummyseq instanceof SequenceDummy)
 799         {
 800           // probably have the pattern wrong
 801           // idea is that a flyweight proxy for a sequence ID can be created for
 802           // 1. stable reference creation
 803           // 2. addition of annotation
 804           // 3. future replacement by a real sequence
 805           // current pattern is to create SequenceDummy objects - a convenience
 806           // constructor for a Sequence.
 807           // problem is that when promoted to a real sequence, all references
 808           // need
 809           // to be updated somehow.
 810           ((SequenceDummy) dummyseq).become(mseq);
 811           includedseqs.set(p, dummyseq); // template is no longer needed
 812         }
 813       }
 814     }
 815     // finally add sequences to the dataset
 816     for (SequenceI seq : includedseqs)
 817     {
 818       align.addSequence(seq);
 819     }
 820   }
 821
 822   /**
 823    * take a sequence feature and examine its attributes to decide how it should
 824    * be added to a sequence
 825    *
 826    * @param seq
 827    *          - the destination sequence constructed or discovered in the
 828    *          current context
 829    * @param sf
 830    *          - the base feature with ATTRIBUTES property containing any
 831    *          additional attributes
 832    * @param gFFFile
 833    *          - true if we are processing a GFF annotation file
 834    * @return true if sf was actually added to the sequence, false if it was
 835    *         processed in another way
 836    */
 837   public boolean processOrAddSeqFeature(AlignmentI align,
 838           List<SequenceI> newseqs, SequenceI seq, SequenceFeature sf,
 839           boolean gFFFile, boolean relaxedIdMatching)
 840   {
 841     String attr = (String) sf.getValue("ATTRIBUTES");
 842     boolean add = true;
 843     if (gFFFile && attr != null)
 844     {
 845       int nattr = 8;
 846
 847       for (String attset : attr.split("\t"))
 848       {
 849         if (attset == null || attset.trim().length() == 0)
 850         {
 851           continue;
 852         }
 853         nattr++;
 854         Map<String, List<String>> set = new HashMap<String, List<String>>();
 855         // normally, only expect one column - 9 - in this field
 856         // the attributes (Gff3) or groups (gff2) field
 857         for (String pair : attset.trim().split(";"))
 858         {
 859           pair = pair.trim();
 860           if (pair.length() == 0)
 861           {
 862             continue;
 863           }
 864
 865           // expect either space seperated (gff2) or '=' separated (gff3)
 866           // key/value pairs here
 867
 868           int eqpos = pair.indexOf('='), sppos = pair.indexOf(' ');
 869           String key = null, value = null;
 870
 871           if (sppos > -1 && (eqpos == -1 || sppos < eqpos))
 872           {
 873             key = pair.substring(0, sppos);
 874             value = pair.substring(sppos + 1);
 875           }
 876           else
 877           {
 878             if (eqpos > -1 && (sppos == -1 || eqpos < sppos))
 879             {
 880               key = pair.substring(0, eqpos);
 881               value = pair.substring(eqpos + 1);
 882             }
 883             else
 884             {
 885               key = pair;
 886             }
 887           }
 888           if (key != null)
 889           {
 890             List<String> vals = set.get(key);
 891             if (vals == null)
 892             {
 893               vals = new ArrayList<String>();
 894               set.put(key, vals);
 895             }
 896             if (value != null)
 897             {
 898               vals.add(value.trim());
 899             }
 900           }
 901         }
 902         try
 903         {
 904           add &= processGffKey(set, nattr, seq, sf, align, newseqs,
 905                   relaxedIdMatching); // process decides if
 906                                       // feature is actually
 907                                       // added
 908         } catch (InvalidGFF3FieldException ivfe)
 909         {
 910           System.err.println(ivfe);
 911         }
 912       }
 913     }
 914     if (add)
 915     {
 916       seq.addSequenceFeature(sf);
 917     }
 918     return add;
 919   }
 920
 921   public class InvalidGFF3FieldException extends Exception
 922   {
 923     String field, value;
 924
 925     public InvalidGFF3FieldException(String field,
 926             Map<String, List<String>> set, String message)
 927     {
 928       super(message + " (Field was " + field + " and value was "
 929               + set.get(field).toString());
 930       this.field = field;
 931       this.value = set.get(field).toString();
 932     }
 933
 934   }
 935
 936   /**
 937    * take a set of keys for a feature and interpret them
 938    *
 939    * @param set
 940    * @param nattr
 941    * @param seq
 942    * @param sf
 943    * @return
 944    */
 945   public boolean processGffKey(Map<String, List<String>> set, int nattr,
 946           SequenceI seq, SequenceFeature sf, AlignmentI align,
 947           List<SequenceI> newseqs, boolean relaxedIdMatching)
 948           throws InvalidGFF3FieldException
 949   {
 950     String attr;
 951     // decide how to interpret according to type
 952     if (sf.getType().equals("similarity"))
 953     {
 954       int strand = sf.getStrand();
 955       // exonerate cdna/protein map
 956       // look for fields
 957       List<SequenceI> querySeq = findNames(align, newseqs,
 958               relaxedIdMatching, set.get(attr = "Query"));
 959       if (querySeq == null || querySeq.size() != 1)
 960       {
 961         throw new InvalidGFF3FieldException(attr, set,
 962                 "Expecting exactly one sequence in Query field (got "
 963                         + set.get(attr) + ")");
 964       }
 965       if (set.containsKey(attr = "Align"))
 966       {
 967         // process the align maps and create cdna/protein maps
 968         // ideally, the query sequences are in the alignment, but maybe not...
 969
 970         AlignedCodonFrame alco = new AlignedCodonFrame();
 971         MapList codonmapping = constructCodonMappingFromAlign(set, attr,
 972                 strand);
 973
 974         // add codon mapping, and hope!
 975         alco.addMap(seq, querySeq.get(0), codonmapping);
 976         align.addCodonFrame(alco);
 977         // everything that's needed to be done is done
 978         // no features to create here !
 979         return false;
 980       }
 981
 982     }
 983     return true;
 984   }
 985
 986   private MapList constructCodonMappingFromAlign(
 987           Map<String, List<String>> set, String attr, int strand)
 988           throws InvalidGFF3FieldException
 989   {
 990     if (strand == 0)
 991     {
 992       throw new InvalidGFF3FieldException(attr, set,
 993               "Invalid strand for a codon mapping (cannot be 0)");
 994     }
 995     List<Integer> fromrange = new ArrayList<Integer>(), torange = new ArrayList<Integer>();
 996     int lastppos = 0, lastpframe = 0;
 997     for (String range : set.get(attr))
 998     {
 999       List<Integer> ints = new ArrayList<Integer>();
1000       StringTokenizer st = new StringTokenizer(range, " ");
1001       while (st.hasMoreTokens())
1002       {
1003         String num = st.nextToken();
1004         try
1005         {
1006           ints.add(new Integer(num));
1007         } catch (NumberFormatException nfe)
1008         {
1009           throw new InvalidGFF3FieldException(attr, set,
1010                   "Invalid number in field " + num);
1011         }
1012       }
1013       // Align positionInRef positionInQuery LengthInRef
1014       // contig_1146 exonerate:protein2genome:local similarity 8534 11269
1015       // 3652 - . alignment_id 0 ;
1016       // Query DDB_G0269124
1017       // Align 11270 143 120
1018       // corresponds to : 120 bases align at pos 143 in protein to 11270 on
1019       // dna in strand direction
1020       // Align 11150 187 282
1021       // corresponds to : 282 bases align at pos 187 in protein to 11150 on
1022       // dna in strand direction
1023       //
1024       // Align 10865 281 888
1025       // Align 9977 578 1068
1026       // Align 8909 935 375
1027       //
1028       if (ints.size() != 3)
1029       {
1030         throw new InvalidGFF3FieldException(attr, set,
1031                 "Invalid number of fields for this attribute ("
1032                         + ints.size() + ")");
1033       }
1034       fromrange.add(new Integer(ints.get(0).intValue()));
1035       fromrange.add(new Integer(ints.get(0).intValue() + strand
1036               * ints.get(2).intValue()));
1037       // how are intron/exon boundaries that do not align in codons
1038       // represented
1039       if (ints.get(1).equals(lastppos) && lastpframe > 0)
1040       {
1041         // extend existing to map
1042         lastppos += ints.get(2) / 3;
1043         lastpframe = ints.get(2) % 3;
1044         torange.set(torange.size() - 1, new Integer(lastppos));
1045       }
1046       else
1047       {
1048         // new to map range
1049         torange.add(ints.get(1));
1050         lastppos = ints.get(1) + ints.get(2) / 3;
1051         lastpframe = ints.get(2) % 3;
1052         torange.add(new Integer(lastppos));
1053       }
1054     }
1055     // from and to ranges must end up being a series of start/end intervals
1056     if (fromrange.size() % 2 == 1)
1057     {
1058       throw new InvalidGFF3FieldException(attr, set,
1059               "Couldn't parse the DNA alignment range correctly");
1060     }
1061     if (torange.size() % 2 == 1)
1062     {
1063       throw new InvalidGFF3FieldException(attr, set,
1064               "Couldn't parse the protein alignment range correctly");
1065     }
1066     // finally, build the map
1067     int[] frommap = new int[fromrange.size()], tomap = new int[torange
1068             .size()];
1069     int p = 0;
1070     for (Integer ip : fromrange)
1071     {
1072       frommap[p++] = ip.intValue();
1073     }
1074     p = 0;
1075     for (Integer ip : torange)
1076     {
1077       tomap[p++] = ip.intValue();
1078     }
1079
1080     return new MapList(frommap, tomap, 3, 1);
1081   }
1082
1083   private List<SequenceI> findNames(AlignmentI align,
1084           List<SequenceI> newseqs, boolean relaxedIdMatching,
1085           List<String> list)
1086   {
1087     List<SequenceI> found = new ArrayList<SequenceI>();
1088     for (String seqId : list)
1089     {
1090       SequenceI seq = findName(align, seqId, relaxedIdMatching, newseqs);
1091       if (seq != null)
1092       {
1093         found.add(seq);
1094       }
1095     }
1096     return found;
1097   }
1098
  /**
   * the alignment that {@link #matcher} was last built for; findName rebuilds
   * the matcher whenever it is called with a different alignment
   */
  private AlignmentI lastmatchedAl = null;

  /**
   * ID matcher used by findName when relaxed ID matching is requested; null
   * until first needed and after resetMatcher()
   */
  private SequenceIdMatcher matcher = null;
1102
1103   /**
1104    * clear any temporary handles used to speed up ID matching
1105    */
1106   private void resetMatcher()
1107   {
1108     lastmatchedAl = null;
1109     matcher = null;
1110   }
1111
1112   private SequenceI findName(AlignmentI align, String seqId,
1113           boolean relaxedIdMatching, List<SequenceI> newseqs)
1114   {
1115     SequenceI match = null;
1116     if (relaxedIdMatching)
1117     {
1118       if (lastmatchedAl != align)
1119       {
1120         matcher = new SequenceIdMatcher(
1121                 (lastmatchedAl = align).getSequencesArray());
1122         if (newseqs != null)
1123         {
1124           matcher.addAll(newseqs);
1125         }
1126       }
1127       match = matcher.findIdMatch(seqId);
1128     }
1129     else
1130     {
1131       match = align.findName(seqId, true);
1132       if (match == null && newseqs != null)
1133       {
1134         for (SequenceI m : newseqs)
1135         {
1136           if (seqId.equals(m.getName()))
1137           {
1138             return m;
1139           }
1140         }
1141       }
1142
1143     }
1144     if (match == null && newseqs != null)
1145     {
1146       match = new SequenceDummy(seqId);
1147       if (relaxedIdMatching)
1148       {
1149         matcher.addAll(Arrays.asList(new SequenceI[] { match }));
1150       }
1151       // add dummy sequence to the newseqs list
1152       newseqs.add(match);
1153     }
1154     return match;
1155   }
1156
1157   public void parseDescriptionHTML(SequenceFeature sf, boolean removeHTML)
1158   {
1159     if (sf.getDescription() == null)
1160     {
1161       return;
1162     }
1163     jalview.util.ParseHtmlBodyAndLinks parsed = new jalview.util.ParseHtmlBodyAndLinks(
1164             sf.getDescription(), removeHTML, newline);
1165
1166     sf.description = (removeHTML) ? parsed.getNonHtmlContent()
1167             : sf.description;
1168     for (String link : parsed.getLinks())
1169     {
1170       sf.addLink(link);
1171     }
1172
1173   }
1174
1175   /**
1176    * generate a features file for seqs includes non-pos features by default.
1177    *
1178    * @param seqs
1179    *          source of sequence features
1180    * @param visible
1181    *          hash of feature types and colours
1182    * @return features file contents
1183    */
1184   public String printJalviewFormat(SequenceI[] seqs,
1185           Map<String, Object> visible)
1186   {
1187     return printJalviewFormat(seqs, visible, true, true);
1188   }
1189
1190   /**
1191    * generate a features file for seqs with colours from visible (if any)
1192    *
1193    * @param seqs
1194    *          source of features
1195    * @param visible
1196    *          hash of Colours for each feature type
1197    * @param visOnly
1198    *          when true only feature types in 'visible' will be output
1199    * @param nonpos
1200    *          indicates if non-positional features should be output (regardless
1201    *          of group or type)
1202    * @return features file contents
1203    */
1204   public String printJalviewFormat(SequenceI[] seqs, Map visible,
1205           boolean visOnly, boolean nonpos)
1206   {
1207     StringBuffer out = new StringBuffer();
1208     SequenceFeature[] next;
1209     boolean featuresGen = false;
1210     if (visOnly && !nonpos && (visible == null || visible.size() < 1))
1211     {
1212       // no point continuing.
1213       return "No Features Visible";
1214     }
1215
1216     if (visible != null && visOnly)
1217     {
1218       // write feature colours only if we're given them and we are generating
1219       // viewed features
1220       // TODO: decide if feature links should also be written here ?
1221       Iterator en = visible.keySet().iterator();
1222       String type, color;
1223       while (en.hasNext())
1224       {
1225         type = en.next().toString();
1226
1227         if (visible.get(type) instanceof GraduatedColor)
1228         {
1229           GraduatedColor gc = (GraduatedColor) visible.get(type);
1230           color = (gc.isColourByLabel() ? "label|" : "")
1231                   + Format.getHexString(gc.getMinColor()) + "|"
1232                   + Format.getHexString(gc.getMaxColor())
1233                   + (gc.isAutoScale() ? "|" : "|abso|") + gc.getMin() + "|"
1234                   + gc.getMax() + "|";
1235           if (gc.getThreshType() != AnnotationColourGradient.NO_THRESHOLD)
1236           {
1237             if (gc.getThreshType() == AnnotationColourGradient.BELOW_THRESHOLD)
1238             {
1239               color += "below";
1240             }
1241             else
1242             {
1243               if (gc.getThreshType() != AnnotationColourGradient.ABOVE_THRESHOLD)
1244               {
1245                 System.err.println("WARNING: Unsupported threshold type ("
1246                         + gc.getThreshType() + ") : Assuming 'above'");
1247               }
1248               color += "above";
1249             }
1250             // add the value
1251             color += "|" + gc.getThresh();
1252           }
1253           else
1254           {
1255             color += "none";
1256           }
1257         }
1258         else if (visible.get(type) instanceof java.awt.Color)
1259         {
1260           color = Format.getHexString((java.awt.Color) visible.get(type));
1261         }
1262         else
1263         {
1264           // legacy support for integer objects containing colour triplet values
1265           color = Format.getHexString(new java.awt.Color(Integer
1266                   .parseInt(visible.get(type).toString())));
1267         }
1268         out.append(type);
1269         out.append("\t");
1270         out.append(color);
1271         out.append(newline);
1272       }
1273     }
1274     // Work out which groups are both present and visible
1275     Vector groups = new Vector();
1276     int groupIndex = 0;
1277     boolean isnonpos = false;
1278
1279     for (int i = 0; i < seqs.length; i++)
1280     {
1281       next = seqs[i].getSequenceFeatures();
1282       if (next != null)
1283       {
1284         for (int j = 0; j < next.length; j++)
1285         {
1286           isnonpos = next[j].begin == 0 && next[j].end == 0;
1287           if ((!nonpos && isnonpos)
1288                   || (!isnonpos && visOnly && !visible
1289                           .containsKey(next[j].type)))
1290           {
1291             continue;
1292           }
1293
1294           if (next[j].featureGroup != null
1295                   && !groups.contains(next[j].featureGroup))
1296           {
1297             groups.addElement(next[j].featureGroup);
1298           }
1299         }
1300       }
1301     }
1302
1303     String group = null;
1304     do
1305     {
1306
1307       if (groups.size() > 0 && groupIndex < groups.size())
1308       {
1309         group = groups.elementAt(groupIndex).toString();
1310         out.append(newline);
1311         out.append("STARTGROUP\t");
1312         out.append(group);
1313         out.append(newline);
1314       }
1315       else
1316       {
1317         group = null;
1318       }
1319
1320       for (int i = 0; i < seqs.length; i++)
1321       {
1322         next = seqs[i].getSequenceFeatures();
1323         if (next != null)
1324         {
1325           for (int j = 0; j < next.length; j++)
1326           {
1327             isnonpos = next[j].begin == 0 && next[j].end == 0;
1328             if ((!nonpos && isnonpos)
1329                     || (!isnonpos && visOnly && !visible
1330                             .containsKey(next[j].type)))
1331             {
1332               // skip if feature is nonpos and we ignore them or if we only
1333               // output visible and it isn't non-pos and it's not visible
1334               continue;
1335             }
1336
1337             if (group != null
1338                     && (next[j].featureGroup == null || !next[j].featureGroup
1339                             .equals(group)))
1340             {
1341               continue;
1342             }
1343
1344             if (group == null && next[j].featureGroup != null)
1345             {
1346               continue;
1347             }
1348             // we have features to output
1349             featuresGen = true;
1350             if (next[j].description == null
1351                     || next[j].description.equals(""))
1352             {
1353               out.append(next[j].type + "\t");
1354             }
1355             else
1356             {
1357               if (next[j].links != null
1358                       && next[j].getDescription().indexOf("<html>") == -1)
1359               {
1360                 out.append("<html>");
1361               }
1362
1363               out.append(next[j].description + " ");
1364               if (next[j].links != null)
1365               {
1366                 for (int l = 0; l < next[j].links.size(); l++)
1367                 {
1368                   String label = next[j].links.elementAt(l).toString();
1369                   String href = label.substring(label.indexOf("|") + 1);
1370                   label = label.substring(0, label.indexOf("|"));
1371
1372                   if (next[j].description.indexOf(href) == -1)
1373                   {
1374                     out.append("<a href=\"" + href + "\">" + label + "</a>");
1375                   }
1376                 }
1377
1378                 if (next[j].getDescription().indexOf("</html>") == -1)
1379                 {
1380                   out.append("</html>");
1381                 }
1382               }
1383
1384               out.append("\t");
1385             }
1386             out.append(seqs[i].getName());
1387             out.append("\t-1\t");
1388             out.append(next[j].begin);
1389             out.append("\t");
1390             out.append(next[j].end);
1391             out.append("\t");
1392             out.append(next[j].type);
1393             if (!Float.isNaN(next[j].score))
1394             {
1395               out.append("\t");
1396               out.append(next[j].score);
1397             }
1398             out.append(newline);
1399           }
1400         }
1401       }
1402
1403       if (group != null)
1404       {
1405         out.append("ENDGROUP\t");
1406         out.append(group);
1407         out.append(newline);
1408         groupIndex++;
1409       }
1410       else
1411       {
1412         break;
1413       }
1414
1415     } while (groupIndex < groups.size() + 1);
1416
1417     if (!featuresGen)
1418     {
1419       return "No Features Visible";
1420     }
1421
1422     return out.toString();
1423   }
1424
1425   /**
1426    * generate a gff file for sequence features includes non-pos features by
1427    * default.
1428    *
1429    * @param seqs
1430    * @param visible
1431    * @return
1432    */
1433   public String printGFFFormat(SequenceI[] seqs, Map<String, Object> visible)
1434   {
1435     return printGFFFormat(seqs, visible, true, true);
1436   }
1437
1438   public String printGFFFormat(SequenceI[] seqs,
1439           Map<String, Object> visible, boolean visOnly, boolean nonpos)
1440   {
1441     StringBuffer out = new StringBuffer();
1442     SequenceFeature[] next;
1443     String source;
1444     boolean isnonpos;
1445     for (int i = 0; i < seqs.length; i++)
1446     {
1447       if (seqs[i].getSequenceFeatures() != null)
1448       {
1449         next = seqs[i].getSequenceFeatures();
1450         for (int j = 0; j < next.length; j++)
1451         {
1452           isnonpos = next[j].begin == 0 && next[j].end == 0;
1453           if ((!nonpos && isnonpos)
1454                   || (!isnonpos && visOnly && !visible
1455                           .containsKey(next[j].type)))
1456           {
1457             continue;
1458           }
1459
1460           source = next[j].featureGroup;
1461           if (source == null)
1462           {
1463             source = next[j].getDescription();
1464           }
1465
1466           out.append(seqs[i].getName());
1467           out.append("\t");
1468           out.append(source);
1469           out.append("\t");
1470           out.append(next[j].type);
1471           out.append("\t");
1472           out.append(next[j].begin);
1473           out.append("\t");
1474           out.append(next[j].end);
1475           out.append("\t");
1476           out.append(next[j].score);
1477           out.append("\t");
1478
1479           if (next[j].getValue("STRAND") != null)
1480           {
1481             out.append(next[j].getValue("STRAND"));
1482             out.append("\t");
1483           }
1484           else
1485           {
1486             out.append(".\t");
1487           }
1488
1489           if (next[j].getValue("FRAME") != null)
1490           {
1491             out.append(next[j].getValue("FRAME"));
1492           }
1493           else
1494           {
1495             out.append(".");
1496           }
1497           // TODO: verify/check GFF - should there be a /t here before attribute
1498           // output ?
1499
1500           if (next[j].getValue("ATTRIBUTES") != null)
1501           {
1502             out.append(next[j].getValue("ATTRIBUTES"));
1503           }
1504
1505           out.append(newline);
1506
1507         }
1508       }
1509     }
1510
1511     return out.toString();
1512   }
1513
  /**
   * Deliberately empty. This method exists only for the benefit of object
   * polymorphism; calling it has no effect.
   */
  public void parse()
  {
    // IGNORED
  }
1521
1522   /**
1523    * this is only for the benefit of object polymorphism - method does nothing.
1524    *
1525    * @return error message
1526    */
1527   public String print()
1528   {
1529     return "USE printGFFFormat() or printJalviewFormat()";
1530   }
1531
1532 }