src/jalview/io/FeaturesFile.java

   1 /*
   2  * Jalview - A Sequence Alignment Editor and Viewer ($$Version-Rel$$)
   3  * Copyright (C) $$Year-Rel$$ The Jalview Authors
   4  *
   5  * This file is part of Jalview.
   6  *
   7  * Jalview is free software: you can redistribute it and/or
   8  * modify it under the terms of the GNU General Public License
   9  * as published by the Free Software Foundation, either version 3
  10  * of the License, or (at your option) any later version.
  11  *
  12  * Jalview is distributed in the hope that it will be useful, but
  13  * WITHOUT ANY WARRANTY; without even the implied warranty
  14  * of MERCHANTABILITY or FITNESS FOR A PARTICULAR
  15  * PURPOSE.  See the GNU General Public License for more details.
  16  *
  17  * You should have received a copy of the GNU General Public License
  18  * along with Jalview.  If not, see <http://www.gnu.org/licenses/>.
  19  * The Jalview Authors are detailed in the 'AUTHORS' file.
  20  */
  21 package jalview.io;
  22
  23 import jalview.analysis.SequenceIdMatcher;
  24 import jalview.datamodel.AlignedCodonFrame;
  25 import jalview.datamodel.Alignment;
  26 import jalview.datamodel.AlignmentI;
  27 import jalview.datamodel.SequenceDummy;
  28 import jalview.datamodel.SequenceFeature;
  29 import jalview.datamodel.SequenceI;
  30 import jalview.schemes.AnnotationColourGradient;
  31 import jalview.schemes.GraduatedColor;
  32 import jalview.schemes.UserColourScheme;
  33 import jalview.util.Format;
  34 import jalview.util.MapList;
  35
  36 import java.io.IOException;
  37 import java.util.ArrayList;
  38 import java.util.Arrays;
  39 import java.util.HashMap;
  40 import java.util.Hashtable;
  41 import java.util.Iterator;
  42 import java.util.List;
  43 import java.util.Map;
  44 import java.util.StringTokenizer;
  45 import java.util.Vector;
  46
  47 /**
  48  * Parse and create Jalview Features files Detects GFF format features files and
  49  * parses. Does not implement standard print() - call specific printFeatures or
  50  * printGFF. Uses AlignmentI.findSequence(String id) to find the sequence object
  51  * for the features annotation - this normally works on an exact match.
  52  *
  53  * @author AMW
  54  * @version $Revision$
  55  */
  56 public class FeaturesFile extends AlignFile
  57 {
  /**
   * work around for GFF interpretation bug where source string becomes
   * description rather than a group. When true, parse() also uses the second
   * column of a GFF line as the feature group, provided it contains no space.
   */
  private boolean doGffSource = true;

  /**
   * GFF version number assigned by processGffPragma when it handles a
   * gff_version pragma; keeps the int default of 0 until then.
   */
  private int gffversion;
  65
  /**
   * Creates a new FeaturesFile object with no data source attached; nothing is
   * read or parsed by this constructor.
   */
  public FeaturesFile()
  {
  }
  72
  /**
   * Creates a FeaturesFile by handing both arguments straight to the
   * superclass (AlignFile) constructor of the same shape.
   * 
   * @param inFile
   *          - passed unchanged to the superclass (presumably the file name
   *          or data to read - confirm against AlignFile)
   * @param type
   *          - passed unchanged to the superclass (presumably the protocol
   *          used to access inFile - confirm against AlignFile)
   * @throws IOException
   *           if the superclass constructor fails
   */
  public FeaturesFile(String inFile, String type) throws IOException
  {
    super(inFile, type);
  }
  82
  /**
   * Creates a FeaturesFile from an existing FileParse by handing it straight
   * to the superclass (AlignFile) constructor.
   * 
   * @param source
   *          - the data source passed unchanged to the superclass
   * @throws IOException
   *           if the superclass constructor fails
   */
  public FeaturesFile(FileParse source) throws IOException
  {
    super(source);
  }
  91
  /**
   * Creates a FeaturesFile from an existing FileParse, forwarding both
   * arguments to the superclass (AlignFile) constructor.
   * 
   * @param parseImmediately
   *          - passed unchanged to the superclass (presumably controls
   *          whether parsing happens during construction - confirm against
   *          AlignFile)
   * @param source
   *          - the data source passed unchanged to the superclass
   * @throws IOException
   *           if the superclass constructor fails
   */
  public FeaturesFile(boolean parseImmediately, FileParse source)
          throws IOException
  {
    super(parseImmediately, source);
  }
 102
  /**
   * Creates a FeaturesFile by forwarding all three arguments to the
   * superclass (AlignFile) constructor.
   * 
   * @param parseImmediately
   *          - passed unchanged to the superclass (presumably controls
   *          whether parsing happens during construction - confirm against
   *          AlignFile)
   * @param inFile
   *          - passed unchanged to the superclass (presumably the file name
   *          or data to read - confirm against AlignFile)
   * @param type
   *          - passed unchanged to the superclass (presumably the protocol
   *          used to access inFile - confirm against AlignFile)
   * @throws IOException
   *           if the superclass constructor fails
   */
  public FeaturesFile(boolean parseImmediately, String inFile, String type)
          throws IOException
  {
    super(parseImmediately, inFile, type);
  }
 114
 115   /**
 116    * Parse GFF or sequence features file using case-independent matching,
 117    * discarding URLs
 118    *
 119    * @param align
 120    *          - alignment/dataset containing sequences that are to be annotated
 121    * @param colours
 122    *          - hashtable to store feature colour definitions
 123    * @param removeHTML
 124    *          - process html strings into plain text
 125    * @return true if features were added
 126    */
 127   public boolean parse(AlignmentI align, Hashtable colours,
 128           boolean removeHTML)
 129   {
 130     return parse(align, colours, null, removeHTML, false);
 131   }
 132
 133   /**
 134    * Parse GFF or sequence features file optionally using case-independent
 135    * matching, discarding URLs
 136    *
 137    * @param align
 138    *          - alignment/dataset containing sequences that are to be annotated
 139    * @param colours
 140    *          - hashtable to store feature colour definitions
 141    * @param removeHTML
 142    *          - process html strings into plain text
 143    * @param relaxedIdmatching
 144    *          - when true, ID matches to compound sequence IDs are allowed
 145    * @return true if features were added
 146    */
 147   public boolean parse(AlignmentI align, Map colours, boolean removeHTML,
 148           boolean relaxedIdMatching)
 149   {
 150     return parse(align, colours, null, removeHTML, relaxedIdMatching);
 151   }
 152
 153   /**
 154    * Parse GFF or sequence features file optionally using case-independent
 155    * matching
 156    *
 157    * @param align
 158    *          - alignment/dataset containing sequences that are to be annotated
 159    * @param colours
 160    *          - hashtable to store feature colour definitions
 161    * @param featureLink
 162    *          - hashtable to store associated URLs
 163    * @param removeHTML
 164    *          - process html strings into plain text
 165    * @return true if features were added
 166    */
 167   public boolean parse(AlignmentI align, Map colours, Map featureLink,
 168           boolean removeHTML)
 169   {
 170     return parse(align, colours, featureLink, removeHTML, false);
 171   }
 172
  // Delegates directly to the superclass implementation; no
  // FeaturesFile-specific behaviour is added here.
  @Override
  public void addAnnotations(Alignment al)
  {
    super.addAnnotations(al);
  }
 178
  // Delegates directly to the superclass implementation; no
  // FeaturesFile-specific behaviour is added here.
  @Override
  public void addProperties(Alignment al)
  {
    super.addProperties(al);
  }
 184
  // Delegates directly to the superclass implementation; no
  // FeaturesFile-specific behaviour is added here.
  @Override
  public void addSeqGroups(AlignmentI al)
  {
    super.addSeqGroups(al);
  }
 190
 191   /**
 192    * Parse GFF or sequence features file
 193    *
 194    * @param align
 195    *          - alignment/dataset containing sequences that are to be annotated
 196    * @param colours
 197    *          - hashtable to store feature colour definitions
 198    * @param featureLink
 199    *          - hashtable to store associated URLs
 200    * @param removeHTML
 201    *          - process html strings into plain text
 202    * @param relaxedIdmatching
 203    *          - when true, ID matches to compound sequence IDs are allowed
 204    * @return true if features were added
 205    */
 206   public boolean parse(AlignmentI align, Map colours, Map featureLink,
 207           boolean removeHTML, boolean relaxedIdmatching)
 208   {
 209
 210     String line = null;
 211     try
 212     {
 213       SequenceI seq = null;
 214       /**
 215        * keep track of any sequences we try to create from the data if it is a GFF3 file
 216        */
 217       ArrayList<SequenceI> newseqs = new ArrayList<SequenceI>();
 218       String type, desc, token = null;
 219
 220       int index, start, end;
 221       float score;
 222       StringTokenizer st;
 223       SequenceFeature sf;
 224       String featureGroup = null, groupLink = null;
 225       Map typeLink = new Hashtable();
 226       /**
 227        * when true, assume GFF style features rather than Jalview style.
 228        */
 229       boolean GFFFile = true;
 230       Map<String, String> gffProps = new HashMap<String, String>();
 231       while ((line = nextLine()) != null)
 232       {
 233         // skip comments/process pragmas
 234         if (line.startsWith("#"))
 235         {
 236           if (line.startsWith("##"))
 237           {
 238             // possibly GFF2/3 version and metadata header
 239             processGffPragma(line, gffProps, align, newseqs);
 240             line = "";
 241           }
 242           continue;
 243         }
 244
 245         st = new StringTokenizer(line, "\t");
 246         if (st.countTokens() == 1)
 247         {
 248           if (line.trim().equalsIgnoreCase("GFF"))
 249           {
 250             // Start parsing file as if it might be GFF again.
 251             GFFFile = true;
 252             continue;
 253           }
 254         }
 255         if (st.countTokens() > 1 && st.countTokens() < 4)
 256         {
 257           GFFFile = false;
 258           type = st.nextToken();
 259           if (type.equalsIgnoreCase("startgroup"))
 260           {
 261             featureGroup = st.nextToken();
 262             if (st.hasMoreElements())
 263             {
 264               groupLink = st.nextToken();
 265               featureLink.put(featureGroup, groupLink);
 266             }
 267           }
 268           else if (type.equalsIgnoreCase("endgroup"))
 269           {
 270             // We should check whether this is the current group,
 271             // but at present theres no way of showing more than 1 group
 272             st.nextToken();
 273             featureGroup = null;
 274             groupLink = null;
 275           }
 276           else
 277           {
 278             Object colour = null;
 279             String colscheme = st.nextToken();
 280             if (colscheme.indexOf("|") > -1
 281                     || colscheme.trim().equalsIgnoreCase("label"))
 282             {
 283               // Parse '|' separated graduated colourscheme fields:
 284               // [label|][mincolour|maxcolour|[absolute|]minvalue|maxvalue|thresholdtype|thresholdvalue]
 285               // can either provide 'label' only, first is optional, next two
 286               // colors are required (but may be
 287               // left blank), next is optional, nxt two min/max are required.
 288               // first is either 'label'
 289               // first/second and third are both hexadecimal or word equivalent
 290               // colour.
 291               // next two are values parsed as floats.
 292               // fifth is either 'above','below', or 'none'.
 293               // sixth is a float value and only required when fifth is either
 294               // 'above' or 'below'.
 295               StringTokenizer gcol = new StringTokenizer(colscheme, "|",
 296                       true);
 297               // set defaults
 298               int threshtype = AnnotationColourGradient.NO_THRESHOLD;
 299               float min = Float.MIN_VALUE, max = Float.MAX_VALUE, threshval = Float.NaN;
 300               boolean labelCol = false;
 301               // Parse spec line
 302               String mincol = gcol.nextToken();
 303               if (mincol == "|")
 304               {
 305                 System.err
 306                         .println("Expected either 'label' or a colour specification in the line: "
 307                                 + line);
 308                 continue;
 309               }
 310               String maxcol = null;
 311               if (mincol.toLowerCase().indexOf("label") == 0)
 312               {
 313                 labelCol = true;
 314                 mincol = (gcol.hasMoreTokens() ? gcol.nextToken() : null); // skip
 315                                                                            // '|'
 316                 mincol = (gcol.hasMoreTokens() ? gcol.nextToken() : null);
 317               }
 318               String abso = null, minval, maxval;
 319               if (mincol != null)
 320               {
 321                 // at least four more tokens
 322                 if (mincol.equals("|"))
 323                 {
 324                   mincol = "";
 325                 }
 326                 else
 327                 {
 328                   gcol.nextToken(); // skip next '|'
 329                 }
 330                 // continue parsing rest of line
 331                 maxcol = gcol.nextToken();
 332                 if (maxcol.equals("|"))
 333                 {
 334                   maxcol = "";
 335                 }
 336                 else
 337                 {
 338                   gcol.nextToken(); // skip next '|'
 339                 }
 340                 abso = gcol.nextToken();
 341                 gcol.nextToken(); // skip next '|'
 342                 if (abso.toLowerCase().indexOf("abso") != 0)
 343                 {
 344                   minval = abso;
 345                   abso = null;
 346                 }
 347                 else
 348                 {
 349                   minval = gcol.nextToken();
 350                   gcol.nextToken(); // skip next '|'
 351                 }
 352                 maxval = gcol.nextToken();
 353                 if (gcol.hasMoreTokens())
 354                 {
 355                   gcol.nextToken(); // skip next '|'
 356                 }
 357                 try
 358                 {
 359                   if (minval.length() > 0)
 360                   {
 361                     min = new Float(minval).floatValue();
 362                   }
 363                 } catch (Exception e)
 364                 {
 365                   System.err
 366                           .println("Couldn't parse the minimum value for graduated colour for type ("
 367                                   + colscheme
 368                                   + ") - did you misspell 'auto' for the optional automatic colour switch ?");
 369                   e.printStackTrace();
 370                 }
 371                 try
 372                 {
 373                   if (maxval.length() > 0)
 374                   {
 375                     max = new Float(maxval).floatValue();
 376                   }
 377                 } catch (Exception e)
 378                 {
 379                   System.err
 380                           .println("Couldn't parse the maximum value for graduated colour for type ("
 381                                   + colscheme + ")");
 382                   e.printStackTrace();
 383                 }
 384               }
 385               else
 386               {
 387                 // add in some dummy min/max colours for the label-only
 388                 // colourscheme.
 389                 mincol = "FFFFFF";
 390                 maxcol = "000000";
 391               }
 392               try
 393               {
 394                 colour = new jalview.schemes.GraduatedColor(
 395                         new UserColourScheme(mincol).findColour('A'),
 396                         new UserColourScheme(maxcol).findColour('A'), min,
 397                         max);
 398               } catch (Exception e)
 399               {
 400                 System.err
 401                         .println("Couldn't parse the graduated colour scheme ("
 402                                 + colscheme + ")");
 403                 e.printStackTrace();
 404               }
 405               if (colour != null)
 406               {
 407                 ((jalview.schemes.GraduatedColor) colour)
 408                         .setColourByLabel(labelCol);
 409                 ((jalview.schemes.GraduatedColor) colour)
 410                         .setAutoScaled(abso == null);
 411                 // add in any additional parameters
 412                 String ttype = null, tval = null;
 413                 if (gcol.hasMoreTokens())
 414                 {
 415                   // threshold type and possibly a threshold value
 416                   ttype = gcol.nextToken();
 417                   if (ttype.toLowerCase().startsWith("below"))
 418                   {
 419                     ((jalview.schemes.GraduatedColor) colour)
 420                             .setThreshType(AnnotationColourGradient.BELOW_THRESHOLD);
 421                   }
 422                   else if (ttype.toLowerCase().startsWith("above"))
 423                   {
 424                     ((jalview.schemes.GraduatedColor) colour)
 425                             .setThreshType(AnnotationColourGradient.ABOVE_THRESHOLD);
 426                   }
 427                   else
 428                   {
 429                     ((jalview.schemes.GraduatedColor) colour)
 430                             .setThreshType(AnnotationColourGradient.NO_THRESHOLD);
 431                     if (!ttype.toLowerCase().startsWith("no"))
 432                     {
 433                       System.err
 434                               .println("Ignoring unrecognised threshold type : "
 435                                       + ttype);
 436                     }
 437                   }
 438                 }
 439                 if (((GraduatedColor) colour).getThreshType() != AnnotationColourGradient.NO_THRESHOLD)
 440                 {
 441                   try
 442                   {
 443                     gcol.nextToken();
 444                     tval = gcol.nextToken();
 445                     ((jalview.schemes.GraduatedColor) colour)
 446                             .setThresh(new Float(tval).floatValue());
 447                   } catch (Exception e)
 448                   {
 449                     System.err
 450                             .println("Couldn't parse threshold value as a float: ("
 451                                     + tval + ")");
 452                     e.printStackTrace();
 453                   }
 454                 }
 455                 // parse the thresh-is-min token ?
 456                 if (gcol.hasMoreTokens())
 457                 {
 458                   System.err
 459                           .println("Ignoring additional tokens in parameters in graduated colour specification\n");
 460                   while (gcol.hasMoreTokens())
 461                   {
 462                     System.err.println("|" + gcol.nextToken());
 463                   }
 464                   System.err.println("\n");
 465                 }
 466               }
 467             }
 468             else
 469             {
 470               UserColourScheme ucs = new UserColourScheme(colscheme);
 471               colour = ucs.findColour('A');
 472             }
 473             if (colour != null)
 474             {
 475               colours.put(type, colour);
 476             }
 477             if (st.hasMoreElements())
 478             {
 479               String link = st.nextToken();
 480               typeLink.put(type, link);
 481               if (featureLink == null)
 482               {
 483                 featureLink = new Hashtable();
 484               }
 485               featureLink.put(type, link);
 486             }
 487           }
 488           continue;
 489         }
 490         String seqId = "";
 491         while (st.hasMoreElements())
 492         {
 493
 494           if (GFFFile)
 495           {
 496             // Still possible this is an old Jalview file,
 497             // which does not have type colours at the beginning
 498             seqId = token = st.nextToken();
 499             seq = findName(align, seqId, relaxedIdmatching, newseqs);
 500             if (seq != null)
 501             {
 502               desc = st.nextToken();
 503               String group = null;
 504               if (doGffSource && desc.indexOf(' ') == -1)
 505               {
 506                 // could also be a source term rather than description line
 507                 group = new String(desc);
 508               }
 509               type = st.nextToken();
 510               try
 511               {
 512                 String stt = st.nextToken();
 513                 if (stt.length() == 0 || stt.equals("-"))
 514                 {
 515                   start = 0;
 516                 }
 517                 else
 518                 {
 519                   start = Integer.parseInt(stt);
 520                 }
 521               } catch (NumberFormatException ex)
 522               {
 523                 start = 0;
 524               }
 525               try
 526               {
 527                 String stt = st.nextToken();
 528                 if (stt.length() == 0 || stt.equals("-"))
 529                 {
 530                   end = 0;
 531                 }
 532                 else
 533                 {
 534                   end = Integer.parseInt(stt);
 535                 }
 536               } catch (NumberFormatException ex)
 537               {
 538                 end = 0;
 539               }
 540               // TODO: decide if non positional feature assertion for input data
 541               // where end==0 is generally valid
 542               if (end == 0)
 543               {
 544                 // treat as non-positional feature, regardless.
 545                 start = 0;
 546               }
 547               try
 548               {
 549                 score = new Float(st.nextToken()).floatValue();
 550               } catch (NumberFormatException ex)
 551               {
 552                 score = 0;
 553               }
 554
 555               sf = new SequenceFeature(type, desc, start, end, score, group);
 556
 557               try
 558               {
 559                 sf.setValue("STRAND", st.nextToken());
 560                 sf.setValue("FRAME", st.nextToken());
 561               } catch (Exception ex)
 562               {
 563               }
 564
 565               if (st.hasMoreTokens())
 566               {
 567                 StringBuffer attributes = new StringBuffer();
 568                 boolean sep = false;
 569                 while (st.hasMoreTokens())
 570                 {
 571                   attributes.append((sep ? "\t" : "") + st.nextElement());
 572                   sep = true;
 573                 }
 574                 // TODO validate and split GFF2 attributes field ? parse out
 575                 // ([A-Za-z][A-Za-z0-9_]*) <value> ; and add as
 576                 // sf.setValue(attrib, val);
 577                 sf.setValue("ATTRIBUTES", attributes.toString());
 578               }
 579
 580               if (processOrAddSeqFeature(align, newseqs, seq, sf, GFFFile,
 581                       relaxedIdmatching))
 582               {
 583                 // check whether we should add the sequence feature to any other
 584                 // sequences in the alignment with the same or similar
 585                 while ((seq = align.findName(seq, seqId, true)) != null)
 586                 {
 587                   seq.addSequenceFeature(new SequenceFeature(sf));
 588                 }
 589               }
 590               break;
 591             }
 592           }
 593
 594           if (GFFFile && seq == null)
 595           {
 596             desc = token;
 597           }
 598           else
 599           {
 600             desc = st.nextToken();
 601           }
 602           if (!st.hasMoreTokens())
 603           {
 604             System.err
 605                     .println("DEBUG: Run out of tokens when trying to identify the destination for the feature.. giving up.");
 606             // in all probability, this isn't a file we understand, so bail
 607             // quietly.
 608             return false;
 609           }
 610
 611           token = st.nextToken();
 612
 613           if (!token.equals("ID_NOT_SPECIFIED"))
 614           {
 615             seq = findName(align, seqId = token, relaxedIdmatching, null);
 616             st.nextToken();
 617           }
 618           else
 619           {
 620             seqId = null;
 621             try
 622             {
 623               index = Integer.parseInt(st.nextToken());
 624               seq = align.getSequenceAt(index);
 625             } catch (NumberFormatException ex)
 626             {
 627               seq = null;
 628             }
 629           }
 630
 631           if (seq == null)
 632           {
 633             System.out.println("Sequence not found: " + line);
 634             break;
 635           }
 636
 637           start = Integer.parseInt(st.nextToken());
 638           end = Integer.parseInt(st.nextToken());
 639
 640           type = st.nextToken();
 641
 642           if (!colours.containsKey(type))
 643           {
 644             // Probably the old style groups file
 645             UserColourScheme ucs = new UserColourScheme(type);
 646             colours.put(type, ucs.findColour('A'));
 647           }
 648           sf = new SequenceFeature(type, desc, "", start, end, featureGroup);
 649           if (st.hasMoreTokens())
 650           {
 651             try
 652             {
 653               score = new Float(st.nextToken()).floatValue();
 654               // update colourgradient bounds if allowed to
 655             } catch (NumberFormatException ex)
 656             {
 657               score = 0;
 658             }
 659             sf.setScore(score);
 660           }
 661           if (groupLink != null && removeHTML)
 662           {
 663             sf.addLink(groupLink);
 664             sf.description += "%LINK%";
 665           }
 666           if (typeLink.containsKey(type) && removeHTML)
 667           {
 668             sf.addLink(typeLink.get(type).toString());
 669             sf.description += "%LINK%";
 670           }
 671
 672           parseDescriptionHTML(sf, removeHTML);
 673
 674           seq.addSequenceFeature(sf);
 675
 676           while (seqId != null
 677                   && (seq = align.findName(seq, seqId, false)) != null)
 678           {
 679             seq.addSequenceFeature(new SequenceFeature(sf));
 680           }
 681           // If we got here, its not a GFFFile
 682           GFFFile = false;
 683         }
 684       }
 685       resetMatcher();
 686     } catch (Exception ex)
 687     {
 688       // should report somewhere useful for UI if necessary
 689       warningMessage = ((warningMessage == null) ? "" : warningMessage)
 690               + "Parsing error at\n" + line;
 691       System.out.println("Error parsing feature file: " + ex + "\n" + line);
 692       ex.printStackTrace(System.err);
 693       resetMatcher();
 694       return false;
 695     }
 696
 697     return true;
 698   }
 699
 700   private enum GffPragmas
 701   {
 702     gff_version, sequence_region, feature_ontology, attribute_ontology, source_ontology, species_build, fasta, hash
 703   };
 704
 705   private static Map<String, GffPragmas> GFFPRAGMA;
 706   static
 707   {
 708     GFFPRAGMA = new HashMap<String, GffPragmas>();
 709     GFFPRAGMA.put("sequence-region", GffPragmas.sequence_region);
 710     GFFPRAGMA.put("feature-ontology", GffPragmas.feature_ontology);
 711     GFFPRAGMA.put("#", GffPragmas.hash);
 712     GFFPRAGMA.put("fasta", GffPragmas.fasta);
 713     GFFPRAGMA.put("species-build", GffPragmas.species_build);
 714     GFFPRAGMA.put("source-ontology", GffPragmas.source_ontology);
 715     GFFPRAGMA.put("attribute-ontology", GffPragmas.attribute_ontology);
 716   }
 717
 718   private void processGffPragma(String line, Map<String, String> gffProps,
 719           AlignmentI align, ArrayList<SequenceI> newseqs)
 720           throws IOException
 721   {
 722     // line starts with ##
 723     int spacepos = line.indexOf(' ');
 724     String pragma = spacepos == -1 ? line.substring(2).trim() : line
 725             .substring(2, spacepos);
 726     GffPragmas gffpragma = GFFPRAGMA.get(pragma.toLowerCase());
 727     if (gffpragma == null)
 728     {
 729       return;
 730     }
 731     switch (gffpragma)
 732     {
 733     case gff_version:
 734       try
 735       {
 736         gffversion = Integer.parseInt(line.substring(spacepos + 1));
 737       } finally
 738       {
 739
 740       }
 741       break;
 742     case feature_ontology:
 743       // resolve against specific feature ontology
 744       break;
 745     case attribute_ontology:
 746       // resolve against specific attribute ontology
 747       break;
 748     case source_ontology:
 749       // resolve against specific source ontology
 750       break;
 751     case species_build:
 752       // resolve against specific NCBI taxon version
 753       break;
 754     case hash:
 755       // close off any open feature hierarchies
 756       break;
 757     case fasta:
 758       // process the rest of the file as a fasta file and replace any dummy
 759       // sequence IDs
 760       process_as_fasta(align, newseqs);
 761       break;
 762     default:
 763       // we do nothing ?
 764       System.err.println("Ignoring unknown pragma:\n" + line);
 765     }
 766   }
 767
 768   private void process_as_fasta(AlignmentI align, List<SequenceI> newseqs)
 769           throws IOException
 770   {
 771     try
 772     {
 773       mark();
 774     } catch (IOException q)
 775     {
 776     }
 777     FastaFile parser = new FastaFile(this);
 778     List<SequenceI> includedseqs = parser.getSeqs();
 779     SequenceIdMatcher smatcher = new SequenceIdMatcher(newseqs);
 780     // iterate over includedseqs, and replacing matching ones with newseqs
 781     // sequences. Generic iterator not used here because we modify includedseqs
 782     // as we go
 783     for (int p = 0, pSize = includedseqs.size(); p < pSize; p++)
 784     {
 785       // search for any dummy seqs that this sequence can be used to update
 786       SequenceI dummyseq = smatcher.findIdMatch(includedseqs.get(p));
 787       if (dummyseq != null)
 788       {
 789         // dummyseq was created so it could be annotated and referred to in
 790         // alignments/codon mappings
 791
 792         SequenceI mseq = includedseqs.get(p);
 793         // mseq is the 'template' imported from the FASTA file which we'll use
 794         // to coomplete dummyseq
 795         if (dummyseq instanceof SequenceDummy)
 796         {
 797           // probably have the pattern wrong
 798           // idea is that a flyweight proxy for a sequence ID can be created for
 799           // 1. stable reference creation
 800           // 2. addition of annotation
 801           // 3. future replacement by a real sequence
 802           // current pattern is to create SequenceDummy objects - a convenience
 803           // constructor for a Sequence.
 804           // problem is that when promoted to a real sequence, all references
 805           // need
 806           // to be updated somehow.
 807           ((SequenceDummy) dummyseq).become(mseq);
 808           includedseqs.set(p, dummyseq); // template is no longer needed
 809         }
 810       }
 811     }
 812     // finally add sequences to the dataset
 813     for (SequenceI seq : includedseqs)
 814     {
 815       align.addSequence(seq);
 816     }
 817   }
 818
 819   /**
 820    * take a sequence feature and examine its attributes to decide how it should
 821    * be added to a sequence
 822    *
 823    * @param seq
 824    *          - the destination sequence constructed or discovered in the
 825    *          current context
 826    * @param sf
 827    *          - the base feature with ATTRIBUTES property containing any
 828    *          additional attributes
 829    * @param gFFFile
 830    *          - true if we are processing a GFF annotation file
 831    * @return true if sf was actually added to the sequence, false if it was
 832    *         processed in another way
 833    */
 834   public boolean processOrAddSeqFeature(AlignmentI align, List<SequenceI> newseqs, SequenceI seq, SequenceFeature sf,
 835           boolean gFFFile, boolean relaxedIdMatching)
 836   {
 837     String attr = (String) sf.getValue("ATTRIBUTES");
 838     boolean add = true;
 839     if (gFFFile && attr != null)
 840     {
 841       int nattr=8;
 842
 843       for (String attset : attr.split("\t"))
 844       {
 845         if (attset==null || attset.trim().length()==0)
 846         {
 847           continue;
 848         }
 849         nattr++;
 850         Map<String, List<String>> set = new HashMap<String, List<String>>();
 851         // normally, only expect one column - 9 - in this field
 852         // the attributes (Gff3) or groups (gff2) field
 853         for (String pair : attset.trim().split(";"))
 854         {
 855           pair = pair.trim();
 856           if (pair.length() == 0)
 857           {
 858             continue;
 859           }
 860
 861           // expect either space seperated (gff2) or '=' separated (gff3)
 862           // key/value pairs here
 863
 864           int eqpos = pair.indexOf('='),sppos = pair.indexOf(' ');
 865           String key = null, value = null;
 866
 867           if (sppos > -1 && (eqpos == -1 || sppos < eqpos))
 868           {
 869             key = pair.substring(0, sppos);
 870             value = pair.substring(sppos + 1);
 871           } else {
 872             if (eqpos > -1 && (sppos == -1 || eqpos < sppos))
 873             {
 874               key = pair.substring(0, eqpos);
 875               value = pair.substring(eqpos + 1);
 876             } else
 877             {
 878               key = pair;
 879             }
 880           }
 881           if (key != null)
 882           {
 883             List<String> vals = set.get(key);
 884             if (vals == null)
 885             {
 886               vals = new ArrayList<String>();
 887               set.put(key, vals);
 888             }
 889             if (value != null)
 890             {
 891               vals.add(value.trim());
 892             }
 893           }
 894         }
 895         try
 896         {
 897           add &= processGffKey(set, nattr, seq, sf, align, newseqs,
 898                   relaxedIdMatching); // process decides if
 899                                                      // feature is actually
 900                                                      // added
 901         } catch (InvalidGFF3FieldException ivfe)
 902         {
 903           System.err.println(ivfe);
 904         }
 905       }
 906     }
 907     if (add)
 908     {
 909       seq.addSequenceFeature(sf);
 910     }
 911     return add;
 912   }
 913
 914   public class InvalidGFF3FieldException extends Exception
 915   {
 916     String field, value;
 917
 918     public InvalidGFF3FieldException(String field,
 919             Map<String, List<String>> set, String message)
 920     {
 921       super(message + " (Field was " + field + " and value was "
 922               + set.get(field).toString());
 923       this.field = field;
 924       this.value = set.get(field).toString();
 925     }
 926
 927   }
 928
 929   /**
 930    * take a set of keys for a feature and interpret them
 931    *
 932    * @param set
 933    * @param nattr
 934    * @param seq
 935    * @param sf
 936    * @return
 937    */
 938   public boolean processGffKey(Map<String, List<String>> set, int nattr,
 939           SequenceI seq, SequenceFeature sf, AlignmentI align,
 940           List<SequenceI> newseqs, boolean relaxedIdMatching)
 941           throws InvalidGFF3FieldException
 942   {
 943     String attr;
 944     // decide how to interpret according to type
 945     if (sf.getType().equals("similarity"))
 946     {
 947       int strand = sf.getStrand();
 948       // exonerate cdna/protein map
 949       // look for fields
 950       List<SequenceI> querySeq = findNames(align, newseqs,
 951               relaxedIdMatching, set.get(attr="Query"));
 952       if (querySeq==null || querySeq.size()!=1)
 953       {
 954         throw new InvalidGFF3FieldException( attr, set,
 955                 "Expecting exactly one sequence in Query field (got "
 956                         + set.get(attr) + ")");
 957       }
 958       if (set.containsKey(attr="Align"))
 959       {
 960         // process the align maps and create cdna/protein maps
 961         // ideally, the query sequences are in the alignment, but maybe not...
 962
 963         AlignedCodonFrame alco = new AlignedCodonFrame();
 964         MapList codonmapping = constructCodonMappingFromAlign(set, attr,
 965                 strand);
 966
 967         // add codon mapping, and hope!
 968         alco.addMap(seq, querySeq.get(0), codonmapping);
 969         align.addCodonFrame(alco);
 970         // everything that's needed to be done is done
 971         // no features to create here !
 972         return false;
 973       }
 974
 975     }
 976     return true;
 977   }
 978
 979   private MapList constructCodonMappingFromAlign(
 980           Map<String, List<String>> set,
 981           String attr, int strand) throws InvalidGFF3FieldException
 982   {
 983     if (strand == 0)
 984     {
 985       throw new InvalidGFF3FieldException(attr, set,
 986               "Invalid strand for a codon mapping (cannot be 0)");
 987     }
 988     List<Integer> fromrange = new ArrayList<Integer>(), torange = new ArrayList<Integer>();
 989     int lastppos = 0, lastpframe = 0;
 990     for (String range : set.get(attr))
 991     {
 992       List<Integer> ints = new ArrayList<Integer>();
 993       StringTokenizer st = new StringTokenizer(range, " ");
 994       while (st.hasMoreTokens())
 995       {
 996         String num = st.nextToken();
 997         try
 998         {
 999           ints.add(new Integer(num));
1000         } catch (NumberFormatException nfe)
1001         {
1002           throw new InvalidGFF3FieldException(attr, set,
1003                   "Invalid number in field " + num);
1004         }
1005       }
1006       // Align positionInRef positionInQuery LengthInRef
1007       // contig_1146 exonerate:protein2genome:local similarity 8534 11269
1008       // 3652 - . alignment_id 0 ;
1009       // Query DDB_G0269124
1010       // Align 11270 143 120
1011       // corresponds to : 120 bases align at pos 143 in protein to 11270 on
1012       // dna in strand direction
1013       // Align 11150 187 282
1014       // corresponds to : 282 bases align at pos 187 in protein to 11150 on
1015       // dna in strand direction
1016       //
1017       // Align 10865 281 888
1018       // Align 9977 578 1068
1019       // Align 8909 935 375
1020       //
1021       if (ints.size() != 3)
1022       {
1023         throw new InvalidGFF3FieldException(attr, set,
1024                 "Invalid number of fields for this attribute ("
1025                         + ints.size() + ")");
1026       }
1027       fromrange.add(new Integer(ints.get(0).intValue()));
1028       fromrange.add(new Integer(ints.get(0).intValue() + strand
1029               * ints.get(2).intValue()));
1030       // how are intron/exon boundaries that do not align in codons
1031       // represented
1032       if (ints.get(1).equals(lastppos) && lastpframe > 0)
1033       {
1034         // extend existing to map
1035         lastppos += ints.get(2) / 3;
1036         lastpframe = ints.get(2) % 3;
1037         torange.set(torange.size() - 1, new Integer(lastppos));
1038       }
1039       else
1040       {
1041         // new to map range
1042         torange.add(ints.get(1));
1043         lastppos = ints.get(1) + ints.get(2) / 3;
1044         lastpframe = ints.get(2) % 3;
1045         torange.add(new Integer(lastppos));
1046       }
1047     }
1048     // from and to ranges must end up being a series of start/end intervals
1049     if (fromrange.size() % 2 == 1)
1050     {
1051       throw new InvalidGFF3FieldException(attr, set,
1052               "Couldn't parse the DNA alignment range correctly");
1053     }
1054     if (torange.size() % 2 == 1)
1055     {
1056       throw new InvalidGFF3FieldException(attr, set,
1057               "Couldn't parse the protein alignment range correctly");
1058     }
1059     // finally, build the map
1060     int[] frommap = new int[fromrange.size()], tomap = new int[torange
1061             .size()];
1062     int p = 0;
1063     for (Integer ip : fromrange)
1064     {
1065       frommap[p++] = ip.intValue();
1066     }
1067     p = 0;
1068     for (Integer ip : torange)
1069     {
1070       tomap[p++] = ip.intValue();
1071     }
1072
1073     return new MapList(frommap, tomap, 3, 1);
1074   }
1075
1076   private List<SequenceI> findNames(AlignmentI align,
1077           List<SequenceI> newseqs, boolean relaxedIdMatching,
1078           List<String> list)
1079   {
1080     List<SequenceI> found = new ArrayList<SequenceI>();
1081     for (String seqId : list)
1082     {
1083       SequenceI seq = findName(align, seqId, relaxedIdMatching, newseqs);
1084       if (seq != null)
1085       {
1086         found.add(seq);
1087       }
1088     }
1089     return found;
1090   }
1091
1092   private AlignmentI lastmatchedAl = null;
1093
1094   private SequenceIdMatcher matcher = null;
1095
1096   /**
1097    * clear any temporary handles used to speed up ID matching
1098    */
1099   private void resetMatcher()
1100   {
1101     lastmatchedAl = null;
1102     matcher = null;
1103   }
1104
1105   private SequenceI findName(AlignmentI align, String seqId,
1106           boolean relaxedIdMatching, List<SequenceI> newseqs)
1107   {
1108     SequenceI match = null;
1109     if (relaxedIdMatching)
1110     {
1111       if (lastmatchedAl != align)
1112       {
1113         matcher = new SequenceIdMatcher(
1114                 (lastmatchedAl = align).getSequencesArray());
1115         if (newseqs != null)
1116         {
1117           matcher.addAll(newseqs);
1118         }
1119       }
1120       match = matcher.findIdMatch(seqId);
1121     }
1122     else
1123     {
1124       match = align.findName(seqId, true);
1125       if (match == null && newseqs != null)
1126       {
1127         for (SequenceI m : newseqs)
1128         {
1129           if (seqId.equals(m.getName()))
1130           {
1131             return m;
1132           }
1133         }
1134       }
1135
1136     }
1137     if (match==null && newseqs!=null)
1138     {
1139       match = new SequenceDummy(seqId);
1140       if (relaxedIdMatching)
1141       {
1142         matcher.addAll(Arrays.asList(new SequenceI[]
1143         { match }));
1144       }
1145       // add dummy sequence to the newseqs list
1146       newseqs.add(match);
1147     }
1148     return match;
1149   }
1150   public void parseDescriptionHTML(SequenceFeature sf, boolean removeHTML)
1151   {
1152     if (sf.getDescription() == null)
1153     {
1154       return;
1155     }
1156     jalview.util.ParseHtmlBodyAndLinks parsed = new jalview.util.ParseHtmlBodyAndLinks(
1157             sf.getDescription(), removeHTML, newline);
1158
1159     sf.description = (removeHTML) ? parsed.getNonHtmlContent()
1160             : sf.description;
1161     for (String link : parsed.getLinks())
1162     {
1163       sf.addLink(link);
1164     }
1165
1166   }
1167
1168   /**
1169    * generate a features file for seqs includes non-pos features by default.
1170    *
1171    * @param seqs
1172    *          source of sequence features
1173    * @param visible
1174    *          hash of feature types and colours
1175    * @return features file contents
1176    */
1177   public String printJalviewFormat(SequenceI[] seqs, Map<String,Object> visible)
1178   {
1179     return printJalviewFormat(seqs, visible, true, true);
1180   }
1181
1182   /**
1183    * generate a features file for seqs with colours from visible (if any)
1184    *
1185    * @param seqs
1186    *          source of features
1187    * @param visible
1188    *          hash of Colours for each feature type
1189    * @param visOnly
1190    *          when true only feature types in 'visible' will be output
1191    * @param nonpos
1192    *          indicates if non-positional features should be output (regardless
1193    *          of group or type)
1194    * @return features file contents
1195    */
1196   public String printJalviewFormat(SequenceI[] seqs, Map visible,
1197           boolean visOnly, boolean nonpos)
1198   {
1199     StringBuffer out = new StringBuffer();
1200     SequenceFeature[] next;
1201     boolean featuresGen = false;
1202     if (visOnly && !nonpos && (visible == null || visible.size() < 1))
1203     {
1204       // no point continuing.
1205       return "No Features Visible";
1206     }
1207
1208     if (visible != null && visOnly)
1209     {
1210       // write feature colours only if we're given them and we are generating
1211       // viewed features
1212       // TODO: decide if feature links should also be written here ?
1213       Iterator en = visible.keySet().iterator();
1214       String type, color;
1215       while (en.hasNext())
1216       {
1217         type = en.next().toString();
1218
1219         if (visible.get(type) instanceof GraduatedColor)
1220         {
1221           GraduatedColor gc = (GraduatedColor) visible.get(type);
1222           color = (gc.isColourByLabel() ? "label|" : "")
1223                   + Format.getHexString(gc.getMinColor()) + "|"
1224                   + Format.getHexString(gc.getMaxColor())
1225                   + (gc.isAutoScale() ? "|" : "|abso|") + gc.getMin() + "|"
1226                   + gc.getMax() + "|";
1227           if (gc.getThreshType() != AnnotationColourGradient.NO_THRESHOLD)
1228           {
1229             if (gc.getThreshType() == AnnotationColourGradient.BELOW_THRESHOLD)
1230             {
1231               color += "below";
1232             }
1233             else
1234             {
1235               if (gc.getThreshType() != AnnotationColourGradient.ABOVE_THRESHOLD)
1236               {
1237                 System.err.println("WARNING: Unsupported threshold type ("
1238                         + gc.getThreshType() + ") : Assuming 'above'");
1239               }
1240               color += "above";
1241             }
1242             // add the value
1243             color += "|" + gc.getThresh();
1244           }
1245           else
1246           {
1247             color += "none";
1248           }
1249         }
1250         else if (visible.get(type) instanceof java.awt.Color)
1251         {
1252           color = Format.getHexString((java.awt.Color) visible.get(type));
1253         }
1254         else
1255         {
1256           // legacy support for integer objects containing colour triplet values
1257           color = Format.getHexString(new java.awt.Color(Integer
1258                   .parseInt(visible.get(type).toString())));
1259         }
1260         out.append(type);
1261         out.append("\t");
1262         out.append(color);
1263         out.append(newline);
1264       }
1265     }
1266     // Work out which groups are both present and visible
1267     Vector groups = new Vector();
1268     int groupIndex = 0;
1269     boolean isnonpos = false;
1270
1271     for (int i = 0; i < seqs.length; i++)
1272     {
1273       next = seqs[i].getSequenceFeatures();
1274       if (next != null)
1275       {
1276         for (int j = 0; j < next.length; j++)
1277         {
1278           isnonpos = next[j].begin == 0 && next[j].end == 0;
1279           if ((!nonpos && isnonpos)
1280                   || (!isnonpos && visOnly && !visible
1281                           .containsKey(next[j].type)))
1282           {
1283             continue;
1284           }
1285
1286           if (next[j].featureGroup != null
1287                   && !groups.contains(next[j].featureGroup))
1288           {
1289             groups.addElement(next[j].featureGroup);
1290           }
1291         }
1292       }
1293     }
1294
1295     String group = null;
1296     do
1297     {
1298
1299       if (groups.size() > 0 && groupIndex < groups.size())
1300       {
1301         group = groups.elementAt(groupIndex).toString();
1302         out.append(newline);
1303         out.append("STARTGROUP\t");
1304         out.append(group);
1305         out.append(newline);
1306       }
1307       else
1308       {
1309         group = null;
1310       }
1311
1312       for (int i = 0; i < seqs.length; i++)
1313       {
1314         next = seqs[i].getSequenceFeatures();
1315         if (next != null)
1316         {
1317           for (int j = 0; j < next.length; j++)
1318           {
1319             isnonpos = next[j].begin == 0 && next[j].end == 0;
1320             if ((!nonpos && isnonpos)
1321                     || (!isnonpos && visOnly && !visible
1322                             .containsKey(next[j].type)))
1323             {
1324               // skip if feature is nonpos and we ignore them or if we only
1325               // output visible and it isn't non-pos and it's not visible
1326               continue;
1327             }
1328
1329             if (group != null
1330                     && (next[j].featureGroup == null || !next[j].featureGroup
1331                             .equals(group)))
1332             {
1333               continue;
1334             }
1335
1336             if (group == null && next[j].featureGroup != null)
1337             {
1338               continue;
1339             }
1340             // we have features to output
1341             featuresGen = true;
1342             if (next[j].description == null
1343                     || next[j].description.equals(""))
1344             {
1345               out.append(next[j].type + "\t");
1346             }
1347             else
1348             {
1349               if (next[j].links != null
1350                       && next[j].getDescription().indexOf("<html>") == -1)
1351               {
1352                 out.append("<html>");
1353               }
1354
1355               out.append(next[j].description + " ");
1356               if (next[j].links != null)
1357               {
1358                 for (int l = 0; l < next[j].links.size(); l++)
1359                 {
1360                   String label = next[j].links.elementAt(l).toString();
1361                   String href = label.substring(label.indexOf("|") + 1);
1362                   label = label.substring(0, label.indexOf("|"));
1363
1364                   if (next[j].description.indexOf(href) == -1)
1365                   {
1366                     out.append("<a href=\"" + href + "\">" + label + "</a>");
1367                   }
1368                 }
1369
1370                 if (next[j].getDescription().indexOf("</html>") == -1)
1371                 {
1372                   out.append("</html>");
1373                 }
1374               }
1375
1376               out.append("\t");
1377             }
1378             out.append(seqs[i].getName());
1379             out.append("\t-1\t");
1380             out.append(next[j].begin);
1381             out.append("\t");
1382             out.append(next[j].end);
1383             out.append("\t");
1384             out.append(next[j].type);
1385             if (next[j].score != Float.NaN)
1386             {
1387               out.append("\t");
1388               out.append(next[j].score);
1389             }
1390             out.append(newline);
1391           }
1392         }
1393       }
1394
1395       if (group != null)
1396       {
1397         out.append("ENDGROUP\t");
1398         out.append(group);
1399         out.append(newline);
1400         groupIndex++;
1401       }
1402       else
1403       {
1404         break;
1405       }
1406
1407     } while (groupIndex < groups.size() + 1);
1408
1409     if (!featuresGen)
1410     {
1411       return "No Features Visible";
1412     }
1413
1414     return out.toString();
1415   }
1416
1417   /**
1418    * generate a gff file for sequence features includes non-pos features by
1419    * default.
1420    *
1421    * @param seqs
1422    * @param visible
1423    * @return
1424    */
1425   public String printGFFFormat(SequenceI[] seqs, Map<String,Object> visible)
1426   {
1427     return printGFFFormat(seqs, visible, true, true);
1428   }
1429
1430   public String printGFFFormat(SequenceI[] seqs, Map<String,Object> visible,
1431           boolean visOnly, boolean nonpos)
1432   {
1433     StringBuffer out = new StringBuffer();
1434     SequenceFeature[] next;
1435     String source;
1436     boolean isnonpos;
1437     for (int i = 0; i < seqs.length; i++)
1438     {
1439       if (seqs[i].getSequenceFeatures() != null)
1440       {
1441         next = seqs[i].getSequenceFeatures();
1442         for (int j = 0; j < next.length; j++)
1443         {
1444           isnonpos = next[j].begin == 0 && next[j].end == 0;
1445           if ((!nonpos && isnonpos)
1446                   || (!isnonpos && visOnly && !visible
1447                           .containsKey(next[j].type)))
1448           {
1449             continue;
1450           }
1451
1452           source = next[j].featureGroup;
1453           if (source == null)
1454           {
1455             source = next[j].getDescription();
1456           }
1457
1458           out.append(seqs[i].getName());
1459           out.append("\t");
1460           out.append(source);
1461           out.append("\t");
1462           out.append(next[j].type);
1463           out.append("\t");
1464           out.append(next[j].begin);
1465           out.append("\t");
1466           out.append(next[j].end);
1467           out.append("\t");
1468           out.append(next[j].score);
1469           out.append("\t");
1470
1471           if (next[j].getValue("STRAND") != null)
1472           {
1473             out.append(next[j].getValue("STRAND"));
1474             out.append("\t");
1475           }
1476           else
1477           {
1478             out.append(".\t");
1479           }
1480
1481           if (next[j].getValue("FRAME") != null)
1482           {
1483             out.append(next[j].getValue("FRAME"));
1484           }
1485           else
1486           {
1487             out.append(".");
1488           }
1489           // TODO: verify/check GFF - should there be a /t here before attribute
1490           // output ?
1491
1492           if (next[j].getValue("ATTRIBUTES") != null)
1493           {
1494             out.append(next[j].getValue("ATTRIBUTES"));
1495           }
1496
1497           out.append(newline);
1498
1499         }
1500       }
1501     }
1502
1503     return out.toString();
1504   }
1505
1506   /**
1507    * this is only for the benefit of object polymorphism - method does nothing.
1508    */
1509   public void parse()
1510   {
1511     // IGNORED
1512   }
1513
1514   /**
1515    * this is only for the benefit of object polymorphism - method does nothing.
1516    *
1517    * @return error message
1518    */
1519   public String print()
1520   {
1521     return "USE printGFFFormat() or printJalviewFormat()";
1522   }
1523
1524 }