src/jalview/io/FeaturesFile.java

   1 /*
   2  * Jalview - A Sequence Alignment Editor and Viewer ($$Version-Rel$$)
   3  * Copyright (C) $$Year-Rel$$ The Jalview Authors
   4  *
   5  * This file is part of Jalview.
   6  *
   7  * Jalview is free software: you can redistribute it and/or
   8  * modify it under the terms of the GNU General Public License
   9  * as published by the Free Software Foundation, either version 3
  10  * of the License, or (at your option) any later version.
  11  *
  12  * Jalview is distributed in the hope that it will be useful, but
  13  * WITHOUT ANY WARRANTY; without even the implied warranty
  14  * of MERCHANTABILITY or FITNESS FOR A PARTICULAR
  15  * PURPOSE.  See the GNU General Public License for more details.
  16  *
  17  * You should have received a copy of the GNU General Public License
  18  * along with Jalview.  If not, see <http://www.gnu.org/licenses/>.
  19  * The Jalview Authors are detailed in the 'AUTHORS' file.
  20  */
  21 package jalview.io;
  22
  23 import jalview.analysis.SequenceIdMatcher;
  24 import jalview.api.FeatureColourI;
  25 import jalview.datamodel.AlignedCodonFrame;
  26 import jalview.datamodel.AlignmentI;
  27 import jalview.datamodel.SequenceDummy;
  28 import jalview.datamodel.SequenceFeature;
  29 import jalview.datamodel.SequenceI;
  30 import jalview.schemes.AnnotationColourGradient;
  31 import jalview.schemes.FeatureColour;
  32 import jalview.schemes.UserColourScheme;
  33 import jalview.util.Format;
  34 import jalview.util.MapList;
  35
  36 import java.io.IOException;
  37 import java.util.ArrayList;
  38 import java.util.Arrays;
  39 import java.util.HashMap;
  40 import java.util.Hashtable;
  41 import java.util.Iterator;
  42 import java.util.List;
  43 import java.util.Map;
  44 import java.util.StringTokenizer;
  45 import java.util.Vector;
  46
  47 /**
 * Parses and creates Jalview features files. Detects GFF format features
 * files and parses them. Does not implement standard print() - call specific
 * printFeatures or printGFF. Uses AlignmentI.findSequence(String id) to find
 * the sequence object for the features annotation - this normally works on an
 * exact match.
  52  *
  53  * @author AMW
  54  * @version $Revision$
  55  */
  56 public class FeaturesFile extends AlignFile
  57 {
  /**
   * work around for GFF interpretation bug where source string becomes
   * description rather than a group
   */
  private boolean doGffSource = true;

  /**
   * GFF version number; assigned by processGffPragma when it handles a
   * gff_version pragma
   */
  private int gffversion;
  65
  /**
   * Creates a new FeaturesFile object with no data source supplied.
   */
  public FeaturesFile()
  {
  }
  72
  /**
   * Creates a FeaturesFile by delegating to the AlignFile(String, String)
   * constructor.
   * 
   * @param inFile
   *          passed unchanged to the superclass constructor
   * @param type
   *          passed unchanged to the superclass constructor
   * @throws IOException
   *           if thrown by the superclass constructor
   */
  public FeaturesFile(String inFile, String type) throws IOException
  {
    super(inFile, type);
  }
  82
  /**
   * Creates a FeaturesFile by delegating to the AlignFile(FileParse)
   * constructor.
   * 
   * @param source
   *          passed unchanged to the superclass constructor
   * @throws IOException
   *           if thrown by the superclass constructor
   */
  public FeaturesFile(FileParse source) throws IOException
  {
    super(source);
  }
  91
  /**
   * Creates a FeaturesFile by delegating to the AlignFile(boolean, FileParse)
   * constructor.
   * 
   * @param parseImmediately
   *          passed unchanged to the superclass constructor
   * @param source
   *          passed unchanged to the superclass constructor
   * @throws IOException
   *           if thrown by the superclass constructor
   */
  public FeaturesFile(boolean parseImmediately, FileParse source)
          throws IOException
  {
    super(parseImmediately, source);
  }
 102
  /**
   * Creates a FeaturesFile by delegating to the AlignFile(boolean, String,
   * String) constructor.
   * 
   * @param parseImmediately
   *          passed unchanged to the superclass constructor
   * @param inFile
   *          passed unchanged to the superclass constructor
   * @param type
   *          passed unchanged to the superclass constructor
   * @throws IOException
   *           if thrown by the superclass constructor
   */
  public FeaturesFile(boolean parseImmediately, String inFile, String type)
          throws IOException
  {
    super(parseImmediately, inFile, type);
  }
 114
 115   /**
 116    * Parse GFF or sequence features file using case-independent matching,
 117    * discarding URLs
 118    *
 119    * @param align
 120    *          - alignment/dataset containing sequences that are to be annotated
 121    * @param colours
 122    *          - hashtable to store feature colour definitions
 123    * @param removeHTML
 124    *          - process html strings into plain text
 125    * @return true if features were added
 126    */
 127   public boolean parse(AlignmentI align, Map colours, boolean removeHTML)
 128   {
 129     return parse(align, colours, null, removeHTML, false);
 130   }
 131
 132   /**
 133    * Parse GFF or sequence features file optionally using case-independent
 134    * matching, discarding URLs
 135    *
 136    * @param align
 137    *          - alignment/dataset containing sequences that are to be annotated
 138    * @param colours
 139    *          - hashtable to store feature colour definitions
 140    * @param removeHTML
 141    *          - process html strings into plain text
 142    * @param relaxedIdmatching
 143    *          - when true, ID matches to compound sequence IDs are allowed
 144    * @return true if features were added
 145    */
 146   public boolean parse(AlignmentI align, Map colours, boolean removeHTML,
 147           boolean relaxedIdMatching)
 148   {
 149     return parse(align, colours, null, removeHTML, relaxedIdMatching);
 150   }
 151
 152   /**
 153    * Parse GFF or sequence features file optionally using case-independent
 154    * matching
 155    *
 156    * @param align
 157    *          - alignment/dataset containing sequences that are to be annotated
 158    * @param colours
 159    *          - hashtable to store feature colour definitions
 160    * @param featureLink
 161    *          - hashtable to store associated URLs
 162    * @param removeHTML
 163    *          - process html strings into plain text
 164    * @return true if features were added
 165    */
 166   public boolean parse(AlignmentI align, Map colours, Map featureLink,
 167           boolean removeHTML)
 168   {
 169     return parse(align, colours, featureLink, removeHTML, false);
 170   }
 171
  /**
   * Delegates to the superclass implementation without adding any behaviour.
   */
  @Override
  public void addAnnotations(AlignmentI al)
  {
    super.addAnnotations(al);
  }
 178
  /**
   * Delegates to the superclass implementation without adding any behaviour.
   */
  @Override
  public void addProperties(AlignmentI al)
  {
    super.addProperties(al);
  }
 185
  /**
   * Delegates to the superclass implementation without adding any behaviour.
   */
  @Override
  public void addSeqGroups(AlignmentI al)
  {
    super.addSeqGroups(al);
  }
 192
 193   /**
 194    * Parse GFF or sequence features file
 195    *
 196    * @param align
 197    *          - alignment/dataset containing sequences that are to be annotated
 198    * @param colours
 199    *          - hashtable to store feature colour definitions
 200    * @param featureLink
 201    *          - hashtable to store associated URLs
 202    * @param removeHTML
 203    *          - process html strings into plain text
 204    * @param relaxedIdmatching
 205    *          - when true, ID matches to compound sequence IDs are allowed
 206    * @return true if features were added
 207    */
 208   public boolean parse(AlignmentI align, Map colours, Map featureLink,
 209           boolean removeHTML, boolean relaxedIdmatching)
 210   {
 211
 212     String line = null;
 213     try
 214     {
 215       SequenceI seq = null;
 216       /**
 217        * keep track of any sequences we try to create from the data if it is a
 218        * GFF3 file
 219        */
 220       ArrayList<SequenceI> newseqs = new ArrayList<SequenceI>();
 221       String type, desc, token = null;
 222
 223       int index, start, end;
 224       float score;
 225       StringTokenizer st;
 226       SequenceFeature sf;
 227       String featureGroup = null, groupLink = null;
 228       Map typeLink = new Hashtable();
 229       /**
 230        * when true, assume GFF style features rather than Jalview style.
 231        */
 232       boolean GFFFile = true;
 233       Map<String, String> gffProps = new HashMap<String, String>();
 234       while ((line = nextLine()) != null)
 235       {
 236         // skip comments/process pragmas
 237         if (line.startsWith("#"))
 238         {
 239           if (line.startsWith("##"))
 240           {
 241             // possibly GFF2/3 version and metadata header
 242             processGffPragma(line, gffProps, align, newseqs);
 243             line = "";
 244           }
 245           continue;
 246         }
 247
 248         st = new StringTokenizer(line, "\t");
 249         if (st.countTokens() == 1)
 250         {
 251           if (line.trim().equalsIgnoreCase("GFF"))
 252           {
 253             // Start parsing file as if it might be GFF again.
 254             GFFFile = true;
 255             continue;
 256           }
 257         }
 258         if (st.countTokens() > 1 && st.countTokens() < 4)
 259         {
 260           GFFFile = false;
 261           type = st.nextToken();
 262           if (type.equalsIgnoreCase("startgroup"))
 263           {
 264             featureGroup = st.nextToken();
 265             if (st.hasMoreElements())
 266             {
 267               groupLink = st.nextToken();
 268               featureLink.put(featureGroup, groupLink);
 269             }
 270           }
 271           else if (type.equalsIgnoreCase("endgroup"))
 272           {
 273             // We should check whether this is the current group,
 274             // but at present theres no way of showing more than 1 group
 275             st.nextToken();
 276             featureGroup = null;
 277             groupLink = null;
 278           }
 279           else
 280           {
 281             FeatureColourI colour = null;
 282             String colscheme = st.nextToken();
 283             if (colscheme.indexOf("|") > -1
 284                     || colscheme.trim().equalsIgnoreCase("label"))
 285             {
 286               // Parse '|' separated graduated colourscheme fields:
 287               // [label|][mincolour|maxcolour|[absolute|]minvalue|maxvalue|thresholdtype|thresholdvalue]
 288               // can either provide 'label' only, first is optional, next two
 289               // colors are required (but may be
 290               // left blank), next is optional, nxt two min/max are required.
 291               // first is either 'label'
 292               // first/second and third are both hexadecimal or word equivalent
 293               // colour.
 294               // next two are values parsed as floats.
 295               // fifth is either 'above','below', or 'none'.
 296               // sixth is a float value and only required when fifth is either
 297               // 'above' or 'below'.
 298               StringTokenizer gcol = new StringTokenizer(colscheme, "|",
 299                       true);
 300               // set defaults
 301               int threshtype = AnnotationColourGradient.NO_THRESHOLD;
 302               float min = Float.MIN_VALUE, max = Float.MAX_VALUE, threshval = Float.NaN;
 303               boolean labelCol = false;
 304               // Parse spec line
 305               String mincol = gcol.nextToken();
 306               if (mincol == "|")
 307               {
 308                 System.err
 309                         .println("Expected either 'label' or a colour specification in the line: "
 310                                 + line);
 311                 continue;
 312               }
 313               String maxcol = null;
 314               if (mincol.toLowerCase().indexOf("label") == 0)
 315               {
 316                 labelCol = true;
 317                 mincol = (gcol.hasMoreTokens() ? gcol.nextToken() : null); // skip
 318                                                                            // '|'
 319                 mincol = (gcol.hasMoreTokens() ? gcol.nextToken() : null);
 320               }
 321               String abso = null, minval, maxval;
 322               if (mincol != null)
 323               {
 324                 // at least four more tokens
 325                 if (mincol.equals("|"))
 326                 {
 327                   mincol = "";
 328                 }
 329                 else
 330                 {
 331                   gcol.nextToken(); // skip next '|'
 332                 }
 333                 // continue parsing rest of line
 334                 maxcol = gcol.nextToken();
 335                 if (maxcol.equals("|"))
 336                 {
 337                   maxcol = "";
 338                 }
 339                 else
 340                 {
 341                   gcol.nextToken(); // skip next '|'
 342                 }
 343                 abso = gcol.nextToken();
 344                 gcol.nextToken(); // skip next '|'
 345                 if (abso.toLowerCase().indexOf("abso") != 0)
 346                 {
 347                   minval = abso;
 348                   abso = null;
 349                 }
 350                 else
 351                 {
 352                   minval = gcol.nextToken();
 353                   gcol.nextToken(); // skip next '|'
 354                 }
 355                 maxval = gcol.nextToken();
 356                 if (gcol.hasMoreTokens())
 357                 {
 358                   gcol.nextToken(); // skip next '|'
 359                 }
 360                 try
 361                 {
 362                   if (minval.length() > 0)
 363                   {
 364                     min = new Float(minval).floatValue();
 365                   }
 366                 } catch (Exception e)
 367                 {
 368                   System.err
 369                           .println("Couldn't parse the minimum value for graduated colour for type ("
 370                                   + colscheme
 371                                   + ") - did you misspell 'auto' for the optional automatic colour switch ?");
 372                   e.printStackTrace();
 373                 }
 374                 try
 375                 {
 376                   if (maxval.length() > 0)
 377                   {
 378                     max = new Float(maxval).floatValue();
 379                   }
 380                 } catch (Exception e)
 381                 {
 382                   System.err
 383                           .println("Couldn't parse the maximum value for graduated colour for type ("
 384                                   + colscheme + ")");
 385                   e.printStackTrace();
 386                 }
 387               }
 388               else
 389               {
 390                 // add in some dummy min/max colours for the label-only
 391                 // colourscheme.
 392                 mincol = "FFFFFF";
 393                 maxcol = "000000";
 394               }
 395               try
 396               {
 397                 colour = new FeatureColour(
 398                         new UserColourScheme(mincol).findColour('A'),
 399                         new UserColourScheme(maxcol).findColour('A'), min,
 400                         max);
 401               } catch (Exception e)
 402               {
 403                 System.err
 404                         .println("Couldn't parse the graduated colour scheme ("
 405                                 + colscheme + ")");
 406                 e.printStackTrace();
 407               }
 408               if (colour != null)
 409               {
 410                 colour.setColourByLabel(labelCol);
 411                 colour.setAutoScaled(abso == null);
 412                 // add in any additional parameters
 413                 String ttype = null, tval = null;
 414                 if (gcol.hasMoreTokens())
 415                 {
 416                   // threshold type and possibly a threshold value
 417                   ttype = gcol.nextToken();
 418                   if (ttype.toLowerCase().startsWith("below"))
 419                   {
 420                     colour.setBelowThreshold(true);
 421                   }
 422                   else if (ttype.toLowerCase().startsWith("above"))
 423                   {
 424                     colour.setAboveThreshold(true);
 425                   }
 426                   else
 427                   {
 428                     if (!ttype.toLowerCase().startsWith("no"))
 429                     {
 430                       System.err
 431                               .println("Ignoring unrecognised threshold type : "
 432                                       + ttype);
 433                     }
 434                   }
 435                 }
 436                 if (colour.hasThreshold())
 437                 {
 438                   try
 439                   {
 440                     gcol.nextToken();
 441                     tval = gcol.nextToken();
 442                     colour.setThreshold(new Float(tval).floatValue());
 443                   } catch (Exception e)
 444                   {
 445                     System.err
 446                             .println("Couldn't parse threshold value as a float: ("
 447                                     + tval + ")");
 448                     e.printStackTrace();
 449                   }
 450                 }
 451                 // parse the thresh-is-min token ?
 452                 if (gcol.hasMoreTokens())
 453                 {
 454                   System.err
 455                           .println("Ignoring additional tokens in parameters in graduated colour specification\n");
 456                   while (gcol.hasMoreTokens())
 457                   {
 458                     System.err.println("|" + gcol.nextToken());
 459                   }
 460                   System.err.println("\n");
 461                 }
 462               }
 463             }
 464             else
 465             {
 466               UserColourScheme ucs = new UserColourScheme(colscheme);
 467               colour = new FeatureColour(ucs.findColour('A'));
 468             }
 469             if (colour != null)
 470             {
 471               colours.put(type, colour);
 472             }
 473             if (st.hasMoreElements())
 474             {
 475               String link = st.nextToken();
 476               typeLink.put(type, link);
 477               if (featureLink == null)
 478               {
 479                 featureLink = new Hashtable();
 480               }
 481               featureLink.put(type, link);
 482             }
 483           }
 484           continue;
 485         }
 486         String seqId = "";
 487         while (st.hasMoreElements())
 488         {
 489
 490           if (GFFFile)
 491           {
 492             // Still possible this is an old Jalview file,
 493             // which does not have type colours at the beginning
 494             seqId = token = st.nextToken();
 495             seq = findName(align, seqId, relaxedIdmatching, newseqs);
 496             if (seq != null)
 497             {
 498               desc = st.nextToken();
 499               String group = null;
 500               if (doGffSource && desc.indexOf(' ') == -1)
 501               {
 502                 // could also be a source term rather than description line
 503                 group = new String(desc);
 504               }
 505               type = st.nextToken();
 506               try
 507               {
 508                 String stt = st.nextToken();
 509                 if (stt.length() == 0 || stt.equals("-"))
 510                 {
 511                   start = 0;
 512                 }
 513                 else
 514                 {
 515                   start = Integer.parseInt(stt);
 516                 }
 517               } catch (NumberFormatException ex)
 518               {
 519                 start = 0;
 520               }
 521               try
 522               {
 523                 String stt = st.nextToken();
 524                 if (stt.length() == 0 || stt.equals("-"))
 525                 {
 526                   end = 0;
 527                 }
 528                 else
 529                 {
 530                   end = Integer.parseInt(stt);
 531                 }
 532               } catch (NumberFormatException ex)
 533               {
 534                 end = 0;
 535               }
 536               // TODO: decide if non positional feature assertion for input data
 537               // where end==0 is generally valid
 538               if (end == 0)
 539               {
 540                 // treat as non-positional feature, regardless.
 541                 start = 0;
 542               }
 543               try
 544               {
 545                 score = new Float(st.nextToken()).floatValue();
 546               } catch (NumberFormatException ex)
 547               {
 548                 score = 0;
 549               }
 550
 551               sf = new SequenceFeature(type, desc, start, end, score, group);
 552
 553               try
 554               {
 555                 sf.setValue("STRAND", st.nextToken());
 556                 sf.setValue("FRAME", st.nextToken());
 557               } catch (Exception ex)
 558               {
 559               }
 560
 561               if (st.hasMoreTokens())
 562               {
 563                 StringBuffer attributes = new StringBuffer();
 564                 boolean sep = false;
 565                 while (st.hasMoreTokens())
 566                 {
 567                   attributes.append((sep ? "\t" : "") + st.nextElement());
 568                   sep = true;
 569                 }
 570                 // TODO validate and split GFF2 attributes field ? parse out
 571                 // ([A-Za-z][A-Za-z0-9_]*) <value> ; and add as
 572                 // sf.setValue(attrib, val);
 573                 sf.setValue("ATTRIBUTES", attributes.toString());
 574               }
 575
 576               if (processOrAddSeqFeature(align, newseqs, seq, sf, GFFFile,
 577                       relaxedIdmatching))
 578               {
 579                 // check whether we should add the sequence feature to any other
 580                 // sequences in the alignment with the same or similar
 581                 while ((seq = align.findName(seq, seqId, true)) != null)
 582                 {
 583                   seq.addSequenceFeature(new SequenceFeature(sf));
 584                 }
 585               }
 586               break;
 587             }
 588           }
 589
 590           if (GFFFile && seq == null)
 591           {
 592             desc = token;
 593           }
 594           else
 595           {
 596             desc = st.nextToken();
 597           }
 598           if (!st.hasMoreTokens())
 599           {
 600             System.err
 601                     .println("DEBUG: Run out of tokens when trying to identify the destination for the feature.. giving up.");
 602             // in all probability, this isn't a file we understand, so bail
 603             // quietly.
 604             return false;
 605           }
 606
 607           token = st.nextToken();
 608
 609           if (!token.equals("ID_NOT_SPECIFIED"))
 610           {
 611             seq = findName(align, seqId = token, relaxedIdmatching, null);
 612             st.nextToken();
 613           }
 614           else
 615           {
 616             seqId = null;
 617             try
 618             {
 619               index = Integer.parseInt(st.nextToken());
 620               seq = align.getSequenceAt(index);
 621             } catch (NumberFormatException ex)
 622             {
 623               seq = null;
 624             }
 625           }
 626
 627           if (seq == null)
 628           {
 629             System.out.println("Sequence not found: " + line);
 630             break;
 631           }
 632
 633           start = Integer.parseInt(st.nextToken());
 634           end = Integer.parseInt(st.nextToken());
 635
 636           type = st.nextToken();
 637
 638           if (!colours.containsKey(type))
 639           {
 640             // Probably the old style groups file
 641             UserColourScheme ucs = new UserColourScheme(type);
 642             colours.put(type, ucs.findColour('A'));
 643           }
 644           sf = new SequenceFeature(type, desc, "", start, end, featureGroup);
 645           if (st.hasMoreTokens())
 646           {
 647             try
 648             {
 649               score = new Float(st.nextToken()).floatValue();
 650               // update colourgradient bounds if allowed to
 651             } catch (NumberFormatException ex)
 652             {
 653               score = 0;
 654             }
 655             sf.setScore(score);
 656           }
 657           if (groupLink != null && removeHTML)
 658           {
 659             sf.addLink(groupLink);
 660             sf.description += "%LINK%";
 661           }
 662           if (typeLink.containsKey(type) && removeHTML)
 663           {
 664             sf.addLink(typeLink.get(type).toString());
 665             sf.description += "%LINK%";
 666           }
 667
 668           parseDescriptionHTML(sf, removeHTML);
 669
 670           seq.addSequenceFeature(sf);
 671
 672           while (seqId != null
 673                   && (seq = align.findName(seq, seqId, false)) != null)
 674           {
 675             seq.addSequenceFeature(new SequenceFeature(sf));
 676           }
 677           // If we got here, its not a GFFFile
 678           GFFFile = false;
 679         }
 680       }
 681       resetMatcher();
 682     } catch (Exception ex)
 683     {
 684       // should report somewhere useful for UI if necessary
 685       warningMessage = ((warningMessage == null) ? "" : warningMessage)
 686               + "Parsing error at\n" + line;
 687       System.out.println("Error parsing feature file: " + ex + "\n" + line);
 688       ex.printStackTrace(System.err);
 689       resetMatcher();
 690       return false;
 691     }
 692
 693     return true;
 694   }
 695
 696   private enum GffPragmas
 697   {
 698     gff_version, sequence_region, feature_ontology, attribute_ontology, source_ontology, species_build, fasta, hash
 699   };
 700
 701   private static Map<String, GffPragmas> GFFPRAGMA;
 702   static
 703   {
 704     GFFPRAGMA = new HashMap<String, GffPragmas>();
 705     GFFPRAGMA.put("sequence-region", GffPragmas.sequence_region);
 706     GFFPRAGMA.put("feature-ontology", GffPragmas.feature_ontology);
 707     GFFPRAGMA.put("#", GffPragmas.hash);
 708     GFFPRAGMA.put("fasta", GffPragmas.fasta);
 709     GFFPRAGMA.put("species-build", GffPragmas.species_build);
 710     GFFPRAGMA.put("source-ontology", GffPragmas.source_ontology);
 711     GFFPRAGMA.put("attribute-ontology", GffPragmas.attribute_ontology);
 712   }
 713
 714   private void processGffPragma(String line, Map<String, String> gffProps,
 715           AlignmentI align, ArrayList<SequenceI> newseqs)
 716           throws IOException
 717   {
 718     // line starts with ##
 719     int spacepos = line.indexOf(' ');
 720     String pragma = spacepos == -1 ? line.substring(2).trim() : line
 721             .substring(2, spacepos);
 722     GffPragmas gffpragma = GFFPRAGMA.get(pragma.toLowerCase());
 723     if (gffpragma == null)
 724     {
 725       return;
 726     }
 727     switch (gffpragma)
 728     {
 729     case gff_version:
 730       try
 731       {
 732         gffversion = Integer.parseInt(line.substring(spacepos + 1));
 733       } finally
 734       {
 735
 736       }
 737       break;
 738     case feature_ontology:
 739       // resolve against specific feature ontology
 740       break;
 741     case attribute_ontology:
 742       // resolve against specific attribute ontology
 743       break;
 744     case source_ontology:
 745       // resolve against specific source ontology
 746       break;
 747     case species_build:
 748       // resolve against specific NCBI taxon version
 749       break;
 750     case hash:
 751       // close off any open feature hierarchies
 752       break;
 753     case fasta:
 754       // process the rest of the file as a fasta file and replace any dummy
 755       // sequence IDs
 756       process_as_fasta(align, newseqs);
 757       break;
 758     default:
 759       // we do nothing ?
 760       System.err.println("Ignoring unknown pragma:\n" + line);
 761     }
 762   }
 763
 764   private void process_as_fasta(AlignmentI align, List<SequenceI> newseqs)
 765           throws IOException
 766   {
 767     try
 768     {
 769       mark();
 770     } catch (IOException q)
 771     {
 772     }
 773     FastaFile parser = new FastaFile(this);
 774     List<SequenceI> includedseqs = parser.getSeqs();
 775     SequenceIdMatcher smatcher = new SequenceIdMatcher(newseqs);
 776     // iterate over includedseqs, and replacing matching ones with newseqs
 777     // sequences. Generic iterator not used here because we modify includedseqs
 778     // as we go
 779     for (int p = 0, pSize = includedseqs.size(); p < pSize; p++)
 780     {
 781       // search for any dummy seqs that this sequence can be used to update
 782       SequenceI dummyseq = smatcher.findIdMatch(includedseqs.get(p));
 783       if (dummyseq != null)
 784       {
 785         // dummyseq was created so it could be annotated and referred to in
 786         // alignments/codon mappings
 787
 788         SequenceI mseq = includedseqs.get(p);
 789         // mseq is the 'template' imported from the FASTA file which we'll use
 790         // to coomplete dummyseq
 791         if (dummyseq instanceof SequenceDummy)
 792         {
 793           // probably have the pattern wrong
 794           // idea is that a flyweight proxy for a sequence ID can be created for
 795           // 1. stable reference creation
 796           // 2. addition of annotation
 797           // 3. future replacement by a real sequence
 798           // current pattern is to create SequenceDummy objects - a convenience
 799           // constructor for a Sequence.
 800           // problem is that when promoted to a real sequence, all references
 801           // need
 802           // to be updated somehow.
 803           ((SequenceDummy) dummyseq).become(mseq);
 804           includedseqs.set(p, dummyseq); // template is no longer needed
 805         }
 806       }
 807     }
 808     // finally add sequences to the dataset
 809     for (SequenceI seq : includedseqs)
 810     {
 811       align.addSequence(seq);
 812     }
 813   }
 814
 815   /**
 816    * take a sequence feature and examine its attributes to decide how it should
 817    * be added to a sequence
 818    *
 819    * @param seq
 820    *          - the destination sequence constructed or discovered in the
 821    *          current context
 822    * @param sf
 823    *          - the base feature with ATTRIBUTES property containing any
 824    *          additional attributes
 825    * @param gFFFile
 826    *          - true if we are processing a GFF annotation file
 827    * @return true if sf was actually added to the sequence, false if it was
 828    *         processed in another way
 829    */
 830   public boolean processOrAddSeqFeature(AlignmentI align,
 831           List<SequenceI> newseqs, SequenceI seq, SequenceFeature sf,
 832           boolean gFFFile, boolean relaxedIdMatching)
 833   {
 834     String attr = (String) sf.getValue("ATTRIBUTES");
 835     boolean add = true;
 836     if (gFFFile && attr != null)
 837     {
 838       int nattr = 8;
 839
 840       for (String attset : attr.split("\t"))
 841       {
 842         if (attset == null || attset.trim().length() == 0)
 843         {
 844           continue;
 845         }
 846         nattr++;
 847         Map<String, List<String>> set = new HashMap<String, List<String>>();
 848         // normally, only expect one column - 9 - in this field
 849         // the attributes (Gff3) or groups (gff2) field
 850         for (String pair : attset.trim().split(";"))
 851         {
 852           pair = pair.trim();
 853           if (pair.length() == 0)
 854           {
 855             continue;
 856           }
 857
 858           // expect either space seperated (gff2) or '=' separated (gff3)
 859           // key/value pairs here
 860
 861           int eqpos = pair.indexOf('='), sppos = pair.indexOf(' ');
 862           String key = null, value = null;
 863
 864           if (sppos > -1 && (eqpos == -1 || sppos < eqpos))
 865           {
 866             key = pair.substring(0, sppos);
 867             value = pair.substring(sppos + 1);
 868           }
 869           else
 870           {
 871             if (eqpos > -1 && (sppos == -1 || eqpos < sppos))
 872             {
 873               key = pair.substring(0, eqpos);
 874               value = pair.substring(eqpos + 1);
 875             }
 876             else
 877             {
 878               key = pair;
 879             }
 880           }
 881           if (key != null)
 882           {
 883             List<String> vals = set.get(key);
 884             if (vals == null)
 885             {
 886               vals = new ArrayList<String>();
 887               set.put(key, vals);
 888             }
 889             if (value != null)
 890             {
 891               vals.add(value.trim());
 892             }
 893           }
 894         }
 895         try
 896         {
 897           add &= processGffKey(set, nattr, seq, sf, align, newseqs,
 898                   relaxedIdMatching); // process decides if
 899                                       // feature is actually
 900                                       // added
 901         } catch (InvalidGFF3FieldException ivfe)
 902         {
 903           System.err.println(ivfe);
 904         }
 905       }
 906     }
 907     if (add)
 908     {
 909       seq.addSequenceFeature(sf);
 910     }
 911     return add;
 912   }
 913
 914   public class InvalidGFF3FieldException extends Exception
 915   {
 916     String field, value;
 917
 918     public InvalidGFF3FieldException(String field,
 919             Map<String, List<String>> set, String message)
 920     {
 921       super(message + " (Field was " + field + " and value was "
 922               + set.get(field).toString());
 923       this.field = field;
 924       this.value = set.get(field).toString();
 925     }
 926
 927   }
 928
 929   /**
 930    * take a set of keys for a feature and interpret them
 931    *
 932    * @param set
 933    * @param nattr
 934    * @param seq
 935    * @param sf
 936    * @return
 937    */
 938   public boolean processGffKey(Map<String, List<String>> set, int nattr,
 939           SequenceI seq, SequenceFeature sf, AlignmentI align,
 940           List<SequenceI> newseqs, boolean relaxedIdMatching)
 941           throws InvalidGFF3FieldException
 942   {
 943     String attr;
 944     // decide how to interpret according to type
 945     if (sf.getType().equals("similarity"))
 946     {
 947       int strand = sf.getStrand();
 948       // exonerate cdna/protein map
 949       // look for fields
 950       List<SequenceI> querySeq = findNames(align, newseqs,
 951               relaxedIdMatching, set.get(attr = "Query"));
 952       if (querySeq == null || querySeq.size() != 1)
 953       {
 954         throw new InvalidGFF3FieldException(attr, set,
 955                 "Expecting exactly one sequence in Query field (got "
 956                         + set.get(attr) + ")");
 957       }
 958       if (set.containsKey(attr = "Align"))
 959       {
 960         // process the align maps and create cdna/protein maps
 961         // ideally, the query sequences are in the alignment, but maybe not...
 962
 963         AlignedCodonFrame alco = new AlignedCodonFrame();
 964         MapList codonmapping = constructCodonMappingFromAlign(set, attr,
 965                 strand);
 966
 967         // add codon mapping, and hope!
 968         alco.addMap(seq, querySeq.get(0), codonmapping);
 969         align.addCodonFrame(alco);
 970         // everything that's needed to be done is done
 971         // no features to create here !
 972         return false;
 973       }
 974
 975     }
 976     return true;
 977   }
 978
 979   private MapList constructCodonMappingFromAlign(
 980           Map<String, List<String>> set, String attr, int strand)
 981           throws InvalidGFF3FieldException
 982   {
 983     if (strand == 0)
 984     {
 985       throw new InvalidGFF3FieldException(attr, set,
 986               "Invalid strand for a codon mapping (cannot be 0)");
 987     }
 988     List<Integer> fromrange = new ArrayList<Integer>(), torange = new ArrayList<Integer>();
 989     int lastppos = 0, lastpframe = 0;
 990     for (String range : set.get(attr))
 991     {
 992       List<Integer> ints = new ArrayList<Integer>();
 993       StringTokenizer st = new StringTokenizer(range, " ");
 994       while (st.hasMoreTokens())
 995       {
 996         String num = st.nextToken();
 997         try
 998         {
 999           ints.add(new Integer(num));
1000         } catch (NumberFormatException nfe)
1001         {
1002           throw new InvalidGFF3FieldException(attr, set,
1003                   "Invalid number in field " + num);
1004         }
1005       }
1006       // Align positionInRef positionInQuery LengthInRef
1007       // contig_1146 exonerate:protein2genome:local similarity 8534 11269
1008       // 3652 - . alignment_id 0 ;
1009       // Query DDB_G0269124
1010       // Align 11270 143 120
1011       // corresponds to : 120 bases align at pos 143 in protein to 11270 on
1012       // dna in strand direction
1013       // Align 11150 187 282
1014       // corresponds to : 282 bases align at pos 187 in protein to 11150 on
1015       // dna in strand direction
1016       //
1017       // Align 10865 281 888
1018       // Align 9977 578 1068
1019       // Align 8909 935 375
1020       //
1021       if (ints.size() != 3)
1022       {
1023         throw new InvalidGFF3FieldException(attr, set,
1024                 "Invalid number of fields for this attribute ("
1025                         + ints.size() + ")");
1026       }
1027       fromrange.add(new Integer(ints.get(0).intValue()));
1028       fromrange.add(new Integer(ints.get(0).intValue() + strand
1029               * ints.get(2).intValue()));
1030       // how are intron/exon boundaries that do not align in codons
1031       // represented
1032       if (ints.get(1).equals(lastppos) && lastpframe > 0)
1033       {
1034         // extend existing to map
1035         lastppos += ints.get(2) / 3;
1036         lastpframe = ints.get(2) % 3;
1037         torange.set(torange.size() - 1, new Integer(lastppos));
1038       }
1039       else
1040       {
1041         // new to map range
1042         torange.add(ints.get(1));
1043         lastppos = ints.get(1) + ints.get(2) / 3;
1044         lastpframe = ints.get(2) % 3;
1045         torange.add(new Integer(lastppos));
1046       }
1047     }
1048     // from and to ranges must end up being a series of start/end intervals
1049     if (fromrange.size() % 2 == 1)
1050     {
1051       throw new InvalidGFF3FieldException(attr, set,
1052               "Couldn't parse the DNA alignment range correctly");
1053     }
1054     if (torange.size() % 2 == 1)
1055     {
1056       throw new InvalidGFF3FieldException(attr, set,
1057               "Couldn't parse the protein alignment range correctly");
1058     }
1059     // finally, build the map
1060     int[] frommap = new int[fromrange.size()], tomap = new int[torange
1061             .size()];
1062     int p = 0;
1063     for (Integer ip : fromrange)
1064     {
1065       frommap[p++] = ip.intValue();
1066     }
1067     p = 0;
1068     for (Integer ip : torange)
1069     {
1070       tomap[p++] = ip.intValue();
1071     }
1072
1073     return new MapList(frommap, tomap, 3, 1);
1074   }
1075
1076   private List<SequenceI> findNames(AlignmentI align,
1077           List<SequenceI> newseqs, boolean relaxedIdMatching,
1078           List<String> list)
1079   {
1080     List<SequenceI> found = new ArrayList<SequenceI>();
1081     for (String seqId : list)
1082     {
1083       SequenceI seq = findName(align, seqId, relaxedIdMatching, newseqs);
1084       if (seq != null)
1085       {
1086         found.add(seq);
1087       }
1088     }
1089     return found;
1090   }
1091
1092   private AlignmentI lastmatchedAl = null;
1093
1094   private SequenceIdMatcher matcher = null;
1095
1096   /**
1097    * clear any temporary handles used to speed up ID matching
1098    */
1099   private void resetMatcher()
1100   {
1101     lastmatchedAl = null;
1102     matcher = null;
1103   }
1104
1105   private SequenceI findName(AlignmentI align, String seqId,
1106           boolean relaxedIdMatching, List<SequenceI> newseqs)
1107   {
1108     SequenceI match = null;
1109     if (relaxedIdMatching)
1110     {
1111       if (lastmatchedAl != align)
1112       {
1113         matcher = new SequenceIdMatcher(
1114                 (lastmatchedAl = align).getSequencesArray());
1115         if (newseqs != null)
1116         {
1117           matcher.addAll(newseqs);
1118         }
1119       }
1120       match = matcher.findIdMatch(seqId);
1121     }
1122     else
1123     {
1124       match = align.findName(seqId, true);
1125       if (match == null && newseqs != null)
1126       {
1127         for (SequenceI m : newseqs)
1128         {
1129           if (seqId.equals(m.getName()))
1130           {
1131             return m;
1132           }
1133         }
1134       }
1135
1136     }
1137     if (match == null && newseqs != null)
1138     {
1139       match = new SequenceDummy(seqId);
1140       if (relaxedIdMatching)
1141       {
1142         matcher.addAll(Arrays.asList(new SequenceI[] { match }));
1143       }
1144       // add dummy sequence to the newseqs list
1145       newseqs.add(match);
1146     }
1147     return match;
1148   }
1149
1150   public void parseDescriptionHTML(SequenceFeature sf, boolean removeHTML)
1151   {
1152     if (sf.getDescription() == null)
1153     {
1154       return;
1155     }
1156     jalview.util.ParseHtmlBodyAndLinks parsed = new jalview.util.ParseHtmlBodyAndLinks(
1157             sf.getDescription(), removeHTML, newline);
1158
1159     sf.description = (removeHTML) ? parsed.getNonHtmlContent()
1160             : sf.description;
1161     for (String link : parsed.getLinks())
1162     {
1163       sf.addLink(link);
1164     }
1165
1166   }
1167
1168   /**
1169    * generate a features file for seqs includes non-pos features by default.
1170    *
1171    * @param seqs
1172    *          source of sequence features
1173    * @param map
1174    *          hash of feature types and colours
1175    * @return features file contents
1176    */
1177   public String printJalviewFormat(SequenceI[] seqs,
1178           Map<String, FeatureColourI> map)
1179   {
1180     return printJalviewFormat(seqs, map, true, true);
1181   }
1182
1183   /**
1184    * generate a features file for seqs with colours from visible (if any)
1185    *
1186    * @param sequences
1187    *          source of features
1188    * @param visible
1189    *          hash of Colours for each feature type
1190    * @param visOnly
1191    *          when true only feature types in 'visible' will be output
1192    * @param nonpos
1193    *          indicates if non-positional features should be output (regardless
1194    *          of group or type)
1195    * @return features file contents
1196    */
1197   public String printJalviewFormat(SequenceI[] sequences,
1198           Map<String, FeatureColourI> visible,
1199           boolean visOnly, boolean nonpos)
1200   {
1201     StringBuffer out = new StringBuffer();
1202     SequenceFeature[] next;
1203     boolean featuresGen = false;
1204     if (visOnly && !nonpos && (visible == null || visible.size() < 1))
1205     {
1206       // no point continuing.
1207       return "No Features Visible";
1208     }
1209
1210     if (visible != null && visOnly)
1211     {
1212       // write feature colours only if we're given them and we are generating
1213       // viewed features
1214       // TODO: decide if feature links should also be written here ?
1215       Iterator<String> en = visible.keySet().iterator();
1216       String feature, color;
1217       while (en.hasNext())
1218       {
1219         feature = en.next();
1220
1221         FeatureColourI gc = visible.get(feature);
1222         if (!gc.isSimpleColour())
1223         {
1224           color = (gc.isColourByLabel() ? "label|" : "")
1225                   + Format.getHexString(gc.getMinColour()) + "|"
1226                   + Format.getHexString(gc.getMaxColour())
1227                   + (gc.isAutoScaled() ? "|" : "|abso|") + gc.getMin()
1228                   + "|"
1229                   + gc.getMax() + "|";
1230           if (gc.isBelowThreshold())
1231           {
1232             color += "below|" + gc.getThreshold();
1233           }
1234           else if (gc.isAboveThreshold())
1235           {
1236             color += "above|" + gc.getThreshold();
1237           }
1238           else
1239           {
1240             color += "none";
1241           }
1242         }
1243         else
1244         {
1245           color = Format.getHexString(gc.getColour());
1246         }
1247         // else
1248         // {
1249         // // legacy support for integer objects containing colour triplet
1250         // values
1251         // color = Format.getHexString(new java.awt.Color(Integer
1252         // .parseInt(visible.get(type).toString())));
1253         // }
1254         out.append(feature);
1255         out.append("\t");
1256         out.append(color);
1257         out.append(newline);
1258       }
1259     }
1260     // Work out which groups are both present and visible
1261     Vector groups = new Vector();
1262     int groupIndex = 0;
1263     boolean isnonpos = false;
1264
1265     for (int i = 0; i < sequences.length; i++)
1266     {
1267       next = sequences[i].getSequenceFeatures();
1268       if (next != null)
1269       {
1270         for (int j = 0; j < next.length; j++)
1271         {
1272           isnonpos = next[j].begin == 0 && next[j].end == 0;
1273           if ((!nonpos && isnonpos)
1274                   || (!isnonpos && visOnly && !visible
1275                           .containsKey(next[j].type)))
1276           {
1277             continue;
1278           }
1279
1280           if (next[j].featureGroup != null
1281                   && !groups.contains(next[j].featureGroup))
1282           {
1283             groups.addElement(next[j].featureGroup);
1284           }
1285         }
1286       }
1287     }
1288
1289     String group = null;
1290     do
1291     {
1292
1293       if (groups.size() > 0 && groupIndex < groups.size())
1294       {
1295         group = groups.elementAt(groupIndex).toString();
1296         out.append(newline);
1297         out.append("STARTGROUP\t");
1298         out.append(group);
1299         out.append(newline);
1300       }
1301       else
1302       {
1303         group = null;
1304       }
1305
1306       for (int i = 0; i < sequences.length; i++)
1307       {
1308         next = sequences[i].getSequenceFeatures();
1309         if (next != null)
1310         {
1311           for (int j = 0; j < next.length; j++)
1312           {
1313             isnonpos = next[j].begin == 0 && next[j].end == 0;
1314             if ((!nonpos && isnonpos)
1315                     || (!isnonpos && visOnly && !visible
1316                             .containsKey(next[j].type)))
1317             {
1318               // skip if feature is nonpos and we ignore them or if we only
1319               // output visible and it isn't non-pos and it's not visible
1320               continue;
1321             }
1322
1323             if (group != null
1324                     && (next[j].featureGroup == null || !next[j].featureGroup
1325                             .equals(group)))
1326             {
1327               continue;
1328             }
1329
1330             if (group == null && next[j].featureGroup != null)
1331             {
1332               continue;
1333             }
1334             // we have features to output
1335             featuresGen = true;
1336             if (next[j].description == null
1337                     || next[j].description.equals(""))
1338             {
1339               out.append(next[j].type + "\t");
1340             }
1341             else
1342             {
1343               if (next[j].links != null
1344                       && next[j].getDescription().indexOf("<html>") == -1)
1345               {
1346                 out.append("<html>");
1347               }
1348
1349               out.append(next[j].description + " ");
1350               if (next[j].links != null)
1351               {
1352                 for (int l = 0; l < next[j].links.size(); l++)
1353                 {
1354                   String label = next[j].links.elementAt(l).toString();
1355                   String href = label.substring(label.indexOf("|") + 1);
1356                   label = label.substring(0, label.indexOf("|"));
1357
1358                   if (next[j].description.indexOf(href) == -1)
1359                   {
1360                     out.append("<a href=\"" + href + "\">" + label + "</a>");
1361                   }
1362                 }
1363
1364                 if (next[j].getDescription().indexOf("</html>") == -1)
1365                 {
1366                   out.append("</html>");
1367                 }
1368               }
1369
1370               out.append("\t");
1371             }
1372             out.append(sequences[i].getName());
1373             out.append("\t-1\t");
1374             out.append(next[j].begin);
1375             out.append("\t");
1376             out.append(next[j].end);
1377             out.append("\t");
1378             out.append(next[j].type);
1379             if (!Float.isNaN(next[j].score))
1380             {
1381               out.append("\t");
1382               out.append(next[j].score);
1383             }
1384             out.append(newline);
1385           }
1386         }
1387       }
1388
1389       if (group != null)
1390       {
1391         out.append("ENDGROUP\t");
1392         out.append(group);
1393         out.append(newline);
1394         groupIndex++;
1395       }
1396       else
1397       {
1398         break;
1399       }
1400
1401     } while (groupIndex < groups.size() + 1);
1402
1403     if (!featuresGen)
1404     {
1405       return "No Features Visible";
1406     }
1407
1408     return out.toString();
1409   }
1410
1411   /**
1412    * generate a gff file for sequence features includes non-pos features by
1413    * default.
1414    *
1415    * @param seqs
1416    * @param map
1417    * @return
1418    */
1419   public String printGFFFormat(SequenceI[] seqs,
1420           Map<String, FeatureColourI> map)
1421   {
1422     return printGFFFormat(seqs, map, true, true);
1423   }
1424
1425   public String printGFFFormat(SequenceI[] seqs,
1426           Map<String, FeatureColourI> map, boolean visOnly, boolean nonpos)
1427   {
1428     StringBuffer out = new StringBuffer();
1429     SequenceFeature[] next;
1430     String source;
1431     boolean isnonpos;
1432     for (int i = 0; i < seqs.length; i++)
1433     {
1434       if (seqs[i].getSequenceFeatures() != null)
1435       {
1436         next = seqs[i].getSequenceFeatures();
1437         for (int j = 0; j < next.length; j++)
1438         {
1439           isnonpos = next[j].begin == 0 && next[j].end == 0;
1440           if ((!nonpos && isnonpos)
1441                   || (!isnonpos && visOnly && !map
1442                           .containsKey(next[j].type)))
1443           {
1444             continue;
1445           }
1446
1447           source = next[j].featureGroup;
1448           if (source == null)
1449           {
1450             source = next[j].getDescription();
1451           }
1452
1453           out.append(seqs[i].getName());
1454           out.append("\t");
1455           out.append(source);
1456           out.append("\t");
1457           out.append(next[j].type);
1458           out.append("\t");
1459           out.append(next[j].begin);
1460           out.append("\t");
1461           out.append(next[j].end);
1462           out.append("\t");
1463           out.append(next[j].score);
1464           out.append("\t");
1465
1466           if (next[j].getValue("STRAND") != null)
1467           {
1468             out.append(next[j].getValue("STRAND"));
1469             out.append("\t");
1470           }
1471           else
1472           {
1473             out.append(".\t");
1474           }
1475
1476           if (next[j].getValue("FRAME") != null)
1477           {
1478             out.append(next[j].getValue("FRAME"));
1479           }
1480           else
1481           {
1482             out.append(".");
1483           }
1484           // TODO: verify/check GFF - should there be a /t here before attribute
1485           // output ?
1486
1487           if (next[j].getValue("ATTRIBUTES") != null)
1488           {
1489             out.append(next[j].getValue("ATTRIBUTES"));
1490           }
1491
1492           out.append(newline);
1493
1494         }
1495       }
1496     }
1497
1498     return out.toString();
1499   }
1500
1501   /**
1502    * this is only for the benefit of object polymorphism - method does nothing.
1503    */
1504   @Override
1505   public void parse()
1506   {
1507     // IGNORED
1508   }
1509
1510   /**
1511    * this is only for the benefit of object polymorphism - method does nothing.
1512    *
1513    * @return error message
1514    */
1515   @Override
1516   public String print()
1517   {
1518     return "USE printGFFFormat() or printJalviewFormat()";
1519   }
1520
1521 }