src/jalview/io/FeaturesFile.java

   1 /*
   2  * Jalview - A Sequence Alignment Editor and Viewer ($$Version-Rel$$)
   3  * Copyright (C) $$Year-Rel$$ The Jalview Authors
   4  *
   5  * This file is part of Jalview.
   6  *
   7  * Jalview is free software: you can redistribute it and/or
   8  * modify it under the terms of the GNU General Public License
   9  * as published by the Free Software Foundation, either version 3
  10  * of the License, or (at your option) any later version.
  11  *
  12  * Jalview is distributed in the hope that it will be useful, but
  13  * WITHOUT ANY WARRANTY; without even the implied warranty
  14  * of MERCHANTABILITY or FITNESS FOR A PARTICULAR
  15  * PURPOSE.  See the GNU General Public License for more details.
  16  *
  17  * You should have received a copy of the GNU General Public License
  18  * along with Jalview.  If not, see <http://www.gnu.org/licenses/>.
  19  * The Jalview Authors are detailed in the 'AUTHORS' file.
  20  */
  21 package jalview.io;
  22
  23 import jalview.analysis.SequenceIdMatcher;
  24 import jalview.api.AlignViewportI;
  25 import jalview.datamodel.AlignedCodonFrame;
  26 import jalview.datamodel.Alignment;
  27 import jalview.datamodel.AlignmentI;
  28 import jalview.datamodel.SequenceDummy;
  29 import jalview.datamodel.SequenceFeature;
  30 import jalview.datamodel.SequenceI;
  31 import jalview.schemes.AnnotationColourGradient;
  32 import jalview.schemes.GraduatedColor;
  33 import jalview.schemes.UserColourScheme;
  34 import jalview.util.Format;
  35 import jalview.util.MapList;
  36 import jalview.util.ParseHtmlBodyAndLinks;
  37 import jalview.util.StringUtils;
  38
  39 import java.awt.Color;
  40 import java.io.IOException;
  41 import java.util.ArrayList;
  42 import java.util.Arrays;
  43 import java.util.HashMap;
  44 import java.util.Iterator;
  45 import java.util.List;
  46 import java.util.Map;
  47 import java.util.Map.Entry;
  48 import java.util.StringTokenizer;
  49
  50 /**
  51  * Parses and writes features files, which may be in Jalview, GFF2 or GFF3
  52  * format. These are tab-delimited formats but with differences in the use of
  53  * columns.
  54  *
  55  * A Jalview feature file may define feature colours and then declare that the
  56  * remainder of the file is in GFF format with the line 'GFF'.
  57  *
  58  * GFF3 files may include alignment mappings for features, which Jalview will
  59  * attempt to model, and may include sequence data following a ##FASTA line.
  60  *
  61  *
  62  * @author AMW
  63  * @author jbprocter
  64  * @author gmcarstairs
  65  */
  66 public class FeaturesFile extends AlignFile
  67 {
  68   private static final String NOTE = "Note";
  69
  70   private static final String ALIGN = "Align";
  71
  72   private static final String QUERY = "Query";
  73
  74   private static final String TARGET = "Target";
  75
  76   private static final String SIMILARITY = "similarity";
  77
  78   protected static final String STRAND = "STRAND";
  79
  80   protected static final String FRAME = "FRAME";
  81
  82   protected static final String ATTRIBUTES = "ATTRIBUTES";
  83
  84   protected static final String TAB = "\t";
  85
  86   protected static final String GFF_VERSION = "##gff-version";
  87
  88   private AlignmentI lastmatchedAl = null;
  89
  90   private SequenceIdMatcher matcher = null;
  91
  92   protected AlignmentI dataset;
  93
  94   protected int gffVersion;
  95
  /**
   * Default constructor: creates a FeaturesFile with no input source attached.
   */
  public FeaturesFile()
  {
    super();
  }
 102
 103   /**
 104    * Constructor which does not parse the file immediately
 105    *
 106    * @param inFile
 107    * @param type
 108    * @throws IOException
 109    */
 110   public FeaturesFile(String inFile, String type) throws IOException
 111   {
 112     super(false, inFile, type);
 113   }
 114
 115   /**
 116    * @param source
 117    * @throws IOException
 118    */
 119   public FeaturesFile(FileParse source) throws IOException
 120   {
 121     super(source);
 122   }
 123
 124   /**
 125    * Constructor that optionally parses the file immediately
 126    *
 127    * @param parseImmediately
 128    * @param inFile
 129    * @param type
 130    * @throws IOException
 131    */
 132   public FeaturesFile(boolean parseImmediately, String inFile, String type)
 133           throws IOException
 134   {
 135     super(parseImmediately, inFile, type);
 136   }
 137
 138   /**
 139    * Parse GFF or sequence features file using case-independent matching,
 140    * discarding URLs
 141    *
 142    * @param align
 143    *          - alignment/dataset containing sequences that are to be annotated
 144    * @param colours
 145    *          - hashtable to store feature colour definitions
 146    * @param removeHTML
 147    *          - process html strings into plain text
 148    * @return true if features were added
 149    */
 150   public boolean parse(AlignmentI align, Map<String, Object> colours,
 151           boolean removeHTML)
 152   {
 153     return parse(align, colours, removeHTML, false);
 154   }
 155
 156   /**
 157    * Extends the default addProperties by also adding peptide-to-cDNA mappings
 158    * (if any) derived while parsing a GFF file
 159    */
 160   @Override
 161   public void addProperties(AlignmentI al)
 162   {
 163     super.addProperties(al);
 164     if (dataset != null && dataset.getCodonFrames() != null)
 165     {
 166       AlignmentI ds = (al.getDataset() == null) ? al : al.getDataset();
 167       for (AlignedCodonFrame codons : dataset.getCodonFrames())
 168       {
 169         ds.addCodonFrame(codons);
 170       }
 171     }
 172   }
 173
 174   /**
 175    * Parse GFF or Jalview format sequence features file
 176    *
 177    * @param align
 178    *          - alignment/dataset containing sequences that are to be annotated
 179    * @param colours
 180    *          - hashtable to store feature colour definitions
 181    * @param removeHTML
 182    *          - process html strings into plain text
 183    * @param relaxedIdmatching
 184    *          - when true, ID matches to compound sequence IDs are allowed
 185    * @return true if features were added
 186    */
 187   public boolean parse(AlignmentI align, Map<String, Object> colours,
 188           boolean removeHTML, boolean relaxedIdmatching)
 189   {
 190     Map<String, String> gffProps = new HashMap<String, String>();
 191     /*
 192      * keep track of any sequences we try to create from the data
 193      */
 194     List<SequenceI> newseqs = new ArrayList<SequenceI>();
 195
 196     String line = null;
 197     try
 198     {
 199       StringTokenizer st;
 200       String featureGroup = null;
 201
 202       while ((line = nextLine()) != null)
 203       {
 204         // skip comments/process pragmas
 205         if (line.length() == 0 || line.startsWith("#"))
 206         {
 207           if (line.toLowerCase().startsWith("##"))
 208           {
 209             processGffPragma(line, gffProps, align, newseqs);
 210           }
 211           continue;
 212         }
 213
 214         st = new StringTokenizer(line, TAB);
 215         if (st.countTokens() == 1)
 216         {
 217           if (line.trim().equalsIgnoreCase("GFF"))
 218           {
 219             /*
 220              * Jalview features file with appendded GFF
 221              * assume GFF2 (though it may declare gff-version 3)
 222              */
 223             gffVersion = 2;
 224             continue;
 225           }
 226         }
 227
 228         if (st.countTokens() > 1 && st.countTokens() < 4)
 229         {
 230           /*
 231            * if 2 or 3 tokens, we anticipate either 'startgroup', 'endgroup' or
 232            * a feature type colour specification; not GFF format
 233            */
 234           String ft = st.nextToken();
 235           if (ft.equalsIgnoreCase("startgroup"))
 236           {
 237             featureGroup = st.nextToken();
 238           }
 239           else if (ft.equalsIgnoreCase("endgroup"))
 240           {
 241             // We should check whether this is the current group,
 242             // but at present theres no way of showing more than 1 group
 243             st.nextToken();
 244             featureGroup = null;
 245           }
 246           else
 247           {
 248             parseFeatureColour(line, ft, st, colours);
 249           }
 250           continue;
 251         }
 252
 253         /*
 254          * if not a comment, GFF pragma, startgroup, endgroup or feature
 255          * colour specification, that just leaves a feature details line
 256          * in either Jalview or GFF format
 257          */
 258         if (gffVersion == 0)
 259         {
 260           parseJalviewFeature(line, st, align, colours, removeHTML,
 261                   relaxedIdmatching, featureGroup);
 262         }
 263         else
 264         {
 265           parseGffFeature(st, align, relaxedIdmatching, newseqs);
 266         }
 267       }
 268       resetMatcher();
 269     } catch (Exception ex)
 270     {
 271       // should report somewhere useful for UI if necessary
 272       warningMessage = ((warningMessage == null) ? "" : warningMessage)
 273               + "Parsing error at\n" + line;
 274       System.out.println("Error parsing feature file: " + ex + "\n" + line);
 275       ex.printStackTrace(System.err);
 276       resetMatcher();
 277       return false;
 278     }
 279
 280     return true;
 281   }
 282
 283   /**
 284    * Try to parse a Jalview format feature specification. Returns true if
 285    * successful or false if not.
 286    *
 287    * @param line
 288    * @param st
 289    * @param alignment
 290    * @param featureColours
 291    * @param removeHTML
 292    * @param relaxedIdmatching
 293    * @param featureGroup
 294    */
 295   protected boolean parseJalviewFeature(String line, StringTokenizer st,
 296           AlignmentI alignment, Map<String, Object> featureColours,
 297           boolean removeHTML, boolean relaxedIdMatching, String featureGroup)
 298   {
 299     /*
 300      * Jalview: description seqid  seqIndex start end type [score]
 301      */
 302     if (st.countTokens() < 6)
 303     {
 304       System.err.println("Ignoring feature line '" + line
 305               + "' with unexpected number of columns (" + st.countTokens()
 306               + ")");
 307       return false;
 308     }
 309     String desc = st.nextToken();
 310     String seqId = st.nextToken();
 311     SequenceI seq = findName(alignment, null, relaxedIdMatching, seqId);
 312
 313     if (!seqId.equals("ID_NOT_SPECIFIED"))
 314     {
 315       seq = findName(alignment, null, relaxedIdMatching, seqId);
 316       st.nextToken();
 317     }
 318     else
 319     {
 320       seqId = null;
 321       seq = null;
 322       try
 323       {
 324         int idx = Integer.parseInt(st.nextToken());
 325         seq = alignment.getSequenceAt(idx);
 326       } catch (NumberFormatException ex)
 327       {
 328         // continue
 329       }
 330     }
 331
 332     if (seq == null)
 333     {
 334       System.out.println("Sequence not found: " + line);
 335       return false;
 336     }
 337
 338     int startPos = Integer.parseInt(st.nextToken());
 339     int endPos = Integer.parseInt(st.nextToken());
 340
 341     String ft = st.nextToken();
 342
 343     if (!featureColours.containsKey(ft))
 344     {
 345       /*
 346        * Perhaps an old style groups file with no colours -
 347        * synthesize a colour from the feature type
 348        */
 349       UserColourScheme ucs = new UserColourScheme(ft);
 350       featureColours.put(ft, ucs.findColour('A'));
 351     }
 352     SequenceFeature sf = new SequenceFeature(ft, desc, "",
 353             startPos, endPos, featureGroup);
 354     if (st.hasMoreTokens())
 355     {
 356       float score = 0f;
 357       try
 358       {
 359         score = new Float(st.nextToken()).floatValue();
 360         // update colourgradient bounds if allowed to
 361       } catch (NumberFormatException ex)
 362       {
 363         // leave as 0
 364       }
 365       sf.setScore(score);
 366     }
 367
 368     parseDescriptionHTML(sf, removeHTML);
 369
 370     seq.addSequenceFeature(sf);
 371
 372     while (seqId != null
 373             && (seq = alignment.findName(seq, seqId, false)) != null)
 374     {
 375       seq.addSequenceFeature(new SequenceFeature(sf));
 376     }
 377     return true;
 378   }
 379
 380   /**
 381    * Process a feature type colour specification
 382    *
 383    * @param line
 384    *          the current input line (for error messages only)
 385    * @param featureType
 386    *          the first token on the line
 387    * @param st
 388    *          holds remaining tokens on the line
 389    * @param colours
 390    *          map to which to add derived colour specification
 391    */
 392   protected void parseFeatureColour(String line, String featureType,
 393           StringTokenizer st, Map<String, Object> colours)
 394   {
 395     Object colour = null;
 396     String colscheme = st.nextToken();
 397     if (colscheme.indexOf("|") > -1
 398             || colscheme.trim().equalsIgnoreCase("label"))
 399     {
 400       colour = parseGraduatedColourScheme(line, colscheme);
 401     }
 402     else
 403     {
 404       UserColourScheme ucs = new UserColourScheme(colscheme);
 405       colour = ucs.findColour('A');
 406     }
 407     if (colour != null)
 408     {
 409       colours.put(featureType, colour);
 410     }
 411   }
 412
 413   /**
 414    * Parse a Jalview graduated colour descriptor, a '|' separated list of the
        * form
        *
        * <pre>
        * [label|][mincolour|maxcolour|[absolute|]minvalue|maxvalue|thresholdtype|thresholdvalue]
        * </pre>
        *
        * or just 'label'. The presence of the mandatory fields is not checked:
        * the unguarded gcol.nextToken() calls throw NoSuchElementException if the
        * descriptor ends early.
 415    *
 416    * @param line
        *          the input line, used only in the first error message
 417    * @param colourDescriptor
        *          the descriptor text to parse
 418    * @return the parsed colour, or null if the first-token check fires or the
        *         GraduatedColor could not be constructed
 419    */
 420   protected GraduatedColor parseGraduatedColourScheme(String line,
 421           String colourDescriptor)
 422   {
 423     // Parse '|' separated graduated colourscheme fields:
 424     // [label|][mincolour|maxcolour|[absolute|]minvalue|maxvalue|thresholdtype|thresholdvalue]
 425     // can either provide 'label' only, first is optional, next two
 426     // colors are required (but may be
 427     // left blank), next is optional, nxt two min/max are required.
 428     // first is either 'label'
 429     // first/second and third are both hexadecimal or word equivalent
 430     // colour.
 431     // next two are values parsed as floats.
 432     // fifth is either 'above','below', or 'none'.
 433     // sixth is a float value and only required when fifth is either
 434     // 'above' or 'below'.
         // The tokenizer returns the '|' delimiters as tokens (third argument is
         // true), which is why delimiters are skipped explicitly below and how a
         // blank field is recognised (the token read is itself "|").
 435     StringTokenizer gcol = new StringTokenizer(colourDescriptor, "|", true);
 436     // set defaults
         // NOTE(review): Float.MIN_VALUE is the smallest positive float, not the
         // most negative one - confirm this is the intended default lower bound
 437     float min = Float.MIN_VALUE, max = Float.MAX_VALUE;
 438     boolean labelCol = false;
 439     // Parse spec line
 440     String mincol = gcol.nextToken();
         // NOTE(review): '==' compares object identity, not content, so this
         // guard is unlikely ever to match a token produced by the tokenizer; a
         // leading '|' then falls through to the blank-colour handling below.
         // Switching to equals() would start rejecting such descriptors -
         // confirm which behaviour is wanted before changing it.
 441     if (mincol == "|")
 442     {
 443       System.err
 444               .println("Expected either 'label' or a colour specification in the line: "
 445                       + line);
 446       return null;
 447     }
 448     String maxcol = null;
         // first token starts with 'label' (any case): colour by label; the
         // minimum colour, if present, is then the token after the next '|'
 449     if (mincol.toLowerCase().indexOf("label") == 0)
 450     {
 451       labelCol = true;
 452       mincol = (gcol.hasMoreTokens() ? gcol.nextToken() : null); // skip '|'
 453       mincol = (gcol.hasMoreTokens() ? gcol.nextToken() : null);
 454     }
 455     String abso = null, minval, maxval;
 456     if (mincol != null)
 457     {
 458       // at least four more tokens
           // a "|" read where a colour was expected means the colour was left
           // blank; the delimiter has then already been consumed
 459       if (mincol.equals("|"))
 460       {
 461         mincol = "";
 462       }
 463       else
 464       {
 465         gcol.nextToken(); // skip next '|'
 466       }
 467       // continue parsing rest of line
 468       maxcol = gcol.nextToken();
 469       if (maxcol.equals("|"))
 470       {
 471         maxcol = "";
 472       }
 473       else
 474       {
 475         gcol.nextToken(); // skip next '|'
 476       }
           // the next field is either a marker starting 'abso' (any case) or,
           // if not, the minimum value itself
 477       abso = gcol.nextToken();
 478       gcol.nextToken(); // skip next '|'
 479       if (abso.toLowerCase().indexOf("abso") != 0)
 480       {
 481         minval = abso;
 482         abso = null;
 483       }
 484       else
 485       {
 486         minval = gcol.nextToken();
 487         gcol.nextToken(); // skip next '|'
 488       }
 489       maxval = gcol.nextToken();
 490       if (gcol.hasMoreTokens())
 491       {
 492         gcol.nextToken(); // skip next '|'
 493       }
           // an empty or unparseable min/max leaves the default set above
 494       try
 495       {
 496         if (minval.length() > 0)
 497         {
 498           min = Float.valueOf(minval);
 499         }
 500       } catch (Exception e)
 501       {
             // NOTE(review): the message refers to 'auto' but the optional
             // marker tested for above is the prefix 'abso' - confirm wording
 502         System.err
 503                 .println("Couldn't parse the minimum value for graduated colour for type ("
 504                         + colourDescriptor
 505                         + ") - did you misspell 'auto' for the optional automatic colour switch ?");
 506         e.printStackTrace();
 507       }
 508       try
 509       {
 510         if (maxval.length() > 0)
 511         {
 512           max = Float.valueOf(maxval);
 513         }
 514       } catch (Exception e)
 515       {
 516         System.err
 517                 .println("Couldn't parse the maximum value for graduated colour for type ("
 518                         + colourDescriptor + ")");
 519         e.printStackTrace();
 520       }
 521     }
 522     else
 523     {
 524       // add in some dummy min/max colours for the label-only
 525       // colourscheme.
 526       mincol = "FFFFFF";
 527       maxcol = "000000";
 528     }
 529
         // any failure constructing the colour is reported and null is returned
 530     GraduatedColor colour = null;
 531     try
 532     {
 533       colour = new GraduatedColor(
 534               new UserColourScheme(mincol).findColour('A'),
 535               new UserColourScheme(maxcol).findColour('A'), min, max);
 536     } catch (Exception e)
 537     {
 538       System.err.println("Couldn't parse the graduated colour scheme ("
 539               + colourDescriptor + ")");
 540       e.printStackTrace();
 541     }
 542     if (colour != null)
 543     {
 544       colour.setColourByLabel(labelCol);
           // auto-scaled unless the 'abso...' marker was present
 545       colour.setAutoScaled(abso == null);
 546       // add in any additional parameters
 547       String ttype = null, tval = null;
 548       if (gcol.hasMoreTokens())
 549       {
 550         // threshold type and possibly a threshold value
 551         ttype = gcol.nextToken();
 552         if (ttype.toLowerCase().startsWith("below"))
 553         {
 554           colour.setThreshType(AnnotationColourGradient.BELOW_THRESHOLD);
 555         }
 556         else if (ttype.toLowerCase().startsWith("above"))
 557         {
 558           colour.setThreshType(AnnotationColourGradient.ABOVE_THRESHOLD);
 559         }
 560         else
 561         {
 562           colour.setThreshType(AnnotationColourGradient.NO_THRESHOLD);
 563           if (!ttype.toLowerCase().startsWith("no"))
 564           {
 565             System.err.println("Ignoring unrecognised threshold type : "
 566                     + ttype);
 567           }
 568         }
 569       }
 570       if (colour.getThreshType() != AnnotationColourGradient.NO_THRESHOLD)
 571       {
 572         try
 573         {
               // skip the '|' then read the threshold value; a missing or
               // non-numeric value is reported and the threshold is not set
 574           gcol.nextToken();
 575           tval = gcol.nextToken();
 576           colour.setThresh(new Float(tval).floatValue());
 577         } catch (Exception e)
 578         {
 579           System.err.println("Couldn't parse threshold value as a float: ("
 580                   + tval + ")");
 581           e.printStackTrace();
 582         }
 583       }
 584       // parse the thresh-is-min token ?
 585       if (gcol.hasMoreTokens())
 586       {
 587         System.err
 588                 .println("Ignoring additional tokens in parameters in graduated colour specification\n");
 589         while (gcol.hasMoreTokens())
 590         {
 591           System.err.println("|" + gcol.nextToken());
 592         }
 593         System.err.println("\n");
 594       }
 595     }
 596     return colour;
 597   }
 598
 599   /**
 600    * clear any temporary handles used to speed up ID matching
 601    */
 602   protected void resetMatcher()
 603   {
 604     lastmatchedAl = null;
 605     matcher = null;
 606   }
 607
 608   /**
 609    * Returns a sequence matching the given id, as follows
 610    * <ul>
 611    * <li>strict matching is on exact sequence name</li>
 612    * <li>relaxed matching allows matching on a token within the sequence name,
 613    * or a dbxref</li>
 614    * <li>first tries to find a match in the alignment sequences</li>
 615    * <li>else tries to find a match in the new sequences already generated while
 616    * parsing the features file</li>
 617    * <li>else creates a new placeholder sequence, adds it to the new sequences
 618    * list, and returns it</li>
 619    * </ul>
 620    *
 621    * @param align
 622    * @param newseqs
 623    * @param relaxedIdMatching
 624    * @param seqId
 625    * @return
 626    */
 627   protected SequenceI findName(AlignmentI align, List<SequenceI> newseqs,
 628           boolean relaxedIdMatching, String seqId)
 629   {
 630     SequenceI match = null;
 631     if (relaxedIdMatching)
 632     {
 633       if (lastmatchedAl != align)
 634       {
 635         lastmatchedAl = align;
 636         matcher = new SequenceIdMatcher(align.getSequencesArray());
 637         if (newseqs != null)
 638         {
 639           matcher.addAll(newseqs);
 640         }
 641       }
 642       match = matcher.findIdMatch(seqId);
 643     }
 644     else
 645     {
 646       match = align.findName(seqId, true);
 647       if (match == null && newseqs != null)
 648       {
 649         for (SequenceI m : newseqs)
 650         {
 651           if (seqId.equals(m.getName()))
 652           {
 653             return m;
 654           }
 655         }
 656       }
 657
 658     }
 659     if (match == null && newseqs != null)
 660     {
 661       match = new SequenceDummy(seqId);
 662       if (relaxedIdMatching)
 663       {
 664         matcher.addAll(Arrays.asList(new SequenceI[] { match }));
 665       }
 666       // add dummy sequence to the newseqs list
 667       newseqs.add(match);
 668     }
 669     return match;
 670   }
 671
 672   public void parseDescriptionHTML(SequenceFeature sf, boolean removeHTML)
 673   {
 674     if (sf.getDescription() == null)
 675     {
 676       return;
 677     }
 678     ParseHtmlBodyAndLinks parsed = new ParseHtmlBodyAndLinks(
 679             sf.getDescription(), removeHTML, newline);
 680
 681     sf.description = (removeHTML) ? parsed.getNonHtmlContent()
 682             : sf.description;
 683     for (String link : parsed.getLinks())
 684     {
 685       sf.addLink(link);
 686     }
 687
 688   }
 689
 690   /**
 691    * generate a features file for seqs includes non-pos features by default.
 692    *
 693    * @param sequences
 694    *          source of sequence features
 695    * @param visible
 696    *          hash of feature types and colours
 697    * @return features file contents
 698    */
 699   public String printJalviewFormat(SequenceI[] sequences,
 700           Map<String, Object> visible)
 701   {
 702     return printJalviewFormat(sequences, visible, true, true);
 703   }
 704
 705   /**
 706    * generate a features file for seqs with colours from visible (if any)
 707    *
 708    * @param sequences
 709    *          source of features
 710    * @param visible
 711    *          hash of Colours for each feature type
 712    * @param visOnly
 713    *          when true only feature types in 'visible' will be output
 714    * @param nonpos
 715    *          indicates if non-positional features should be output (regardless
 716    *          of group or type)
 717    * @return features file contents
 718    */
 719   public String printJalviewFormat(SequenceI[] sequences,
 720           Map<String, Object> visible, boolean visOnly, boolean nonpos)
 721   {
 722     StringBuilder out = new StringBuilder(256);
 723     boolean featuresGen = false;
 724     if (visOnly && !nonpos && (visible == null || visible.size() < 1))
 725     {
 726       // no point continuing.
 727       return "No Features Visible";
 728     }
 729
 730     if (visible != null && visOnly)
 731     {
 732       // write feature colours only if we're given them and we are generating
 733       // viewed features
 734       // TODO: decide if feature links should also be written here ?
 735       Iterator<String> en = visible.keySet().iterator();
 736       String featureType, color;
 737       while (en.hasNext())
 738       {
 739         featureType = en.next().toString();
 740
 741         if (visible.get(featureType) instanceof GraduatedColor)
 742         {
 743           GraduatedColor gc = (GraduatedColor) visible.get(featureType);
 744           color = (gc.isColourByLabel() ? "label|" : "")
 745                   + Format.getHexString(gc.getMinColor()) + "|"
 746                   + Format.getHexString(gc.getMaxColor())
 747                   + (gc.isAutoScale() ? "|" : "|abso|") + gc.getMin() + "|"
 748                   + gc.getMax() + "|";
 749           if (gc.getThreshType() != AnnotationColourGradient.NO_THRESHOLD)
 750           {
 751             if (gc.getThreshType() == AnnotationColourGradient.BELOW_THRESHOLD)
 752             {
 753               color += "below";
 754             }
 755             else
 756             {
 757               if (gc.getThreshType() != AnnotationColourGradient.ABOVE_THRESHOLD)
 758               {
 759                 System.err.println("WARNING: Unsupported threshold type ("
 760                         + gc.getThreshType() + ") : Assuming 'above'");
 761               }
 762               color += "above";
 763             }
 764             // add the value
 765             color += "|" + gc.getThresh();
 766           }
 767           else
 768           {
 769             color += "none";
 770           }
 771         }
 772         else if (visible.get(featureType) instanceof Color)
 773         {
 774           color = Format.getHexString((Color) visible.get(featureType));
 775         }
 776         else
 777         {
 778           // legacy support for integer objects containing colour triplet values
 779           color = Format.getHexString(new Color(Integer.parseInt(visible
 780                   .get(featureType).toString())));
 781         }
 782         out.append(featureType);
 783         out.append(TAB);
 784         out.append(color);
 785         out.append(newline);
 786       }
 787     }
 788     // Work out which groups are both present and visible
 789     List<String> groups = new ArrayList<String>();
 790     int groupIndex = 0;
 791     boolean isnonpos = false;
 792
 793     SequenceFeature[] features;
 794     for (int i = 0; i < sequences.length; i++)
 795     {
 796       features = sequences[i].getSequenceFeatures();
 797       if (features != null)
 798       {
 799         for (int j = 0; j < features.length; j++)
 800         {
 801           isnonpos = features[j].begin == 0 && features[j].end == 0;
 802           if ((!nonpos && isnonpos)
 803                   || (!isnonpos && visOnly && !visible
 804                           .containsKey(features[j].type)))
 805           {
 806             continue;
 807           }
 808
 809           if (features[j].featureGroup != null
 810                   && !groups.contains(features[j].featureGroup))
 811           {
 812             groups.add(features[j].featureGroup);
 813           }
 814         }
 815       }
 816     }
 817
 818     String group = null;
 819     do
 820     {
 821       if (groups.size() > 0 && groupIndex < groups.size())
 822       {
 823         group = groups.get(groupIndex);
 824         out.append(newline);
 825         out.append("STARTGROUP").append(TAB);
 826         out.append(group);
 827         out.append(newline);
 828       }
 829       else
 830       {
 831         group = null;
 832       }
 833
 834       for (int i = 0; i < sequences.length; i++)
 835       {
 836         features = sequences[i].getSequenceFeatures();
 837         if (features != null)
 838         {
 839           for (int j = 0; j < features.length; j++)
 840           {
 841             isnonpos = features[j].begin == 0 && features[j].end == 0;
 842             if ((!nonpos && isnonpos)
 843                     || (!isnonpos && visOnly && !visible
 844                             .containsKey(features[j].type)))
 845             {
 846               // skip if feature is nonpos and we ignore them or if we only
 847               // output visible and it isn't non-pos and it's not visible
 848               continue;
 849             }
 850
 851             if (group != null
 852                     && (features[j].featureGroup == null || !features[j].featureGroup
 853                             .equals(group)))
 854             {
 855               continue;
 856             }
 857
 858             if (group == null && features[j].featureGroup != null)
 859             {
 860               continue;
 861             }
 862             // we have features to output
 863             featuresGen = true;
 864             if (features[j].description == null
 865                     || features[j].description.equals(""))
 866             {
 867               out.append(features[j].type).append(TAB);
 868             }
 869             else
 870             {
 871               if (features[j].links != null
 872                       && features[j].getDescription().indexOf("<html>") == -1)
 873               {
 874                 out.append("<html>");
 875               }
 876
 877               out.append(features[j].description + " ");
 878               if (features[j].links != null)
 879               {
 880                 for (int l = 0; l < features[j].links.size(); l++)
 881                 {
 882                   String label = features[j].links.elementAt(l).toString();
 883                   String href = label.substring(label.indexOf("|") + 1);
 884                   label = label.substring(0, label.indexOf("|"));
 885
 886                   if (features[j].description.indexOf(href) == -1)
 887                   {
 888                     out.append("<a href=\"" + href + "\">" + label + "</a>");
 889                   }
 890                 }
 891
 892                 if (features[j].getDescription().indexOf("</html>") == -1)
 893                 {
 894                   out.append("</html>");
 895                 }
 896               }
 897
 898               out.append(TAB);
 899             }
 900             out.append(sequences[i].getName());
 901             out.append("\t-1\t");
 902             out.append(features[j].begin);
 903             out.append(TAB);
 904             out.append(features[j].end);
 905             out.append(TAB);
 906             out.append(features[j].type);
 907             if (!Float.isNaN(features[j].score))
 908             {
 909               out.append(TAB);
 910               out.append(features[j].score);
 911             }
 912             out.append(newline);
 913           }
 914         }
 915       }
 916
 917       if (group != null)
 918       {
 919         out.append("ENDGROUP").append(TAB);
 920         out.append(group);
 921         out.append(newline);
 922         groupIndex++;
 923       }
 924       else
 925       {
 926         break;
 927       }
 928
 929     } while (groupIndex < groups.size() + 1);
 930
 931     if (!featuresGen)
 932     {
 933       return "No Features Visible";
 934     }
 935
 936     return out.toString();
 937   }
 938
 939   /**
 940    * Parse method that is called when a GFF file is dragged to the desktop
 941    */
 942   @Override
 943   public void parse()
 944   {
 945     AlignViewportI av = getViewport();
 946     if (av != null)
 947     {
 948       if (av.getAlignment() != null)
 949       {
 950         dataset = av.getAlignment().getDataset();
 951       }
 952       if (dataset == null)
 953       {
 954         // working in the applet context ?
 955         dataset = av.getAlignment();
 956       }
 957     }
 958     else
 959     {
 960       dataset = new Alignment(new SequenceI[] {});
 961     }
 962
 963     boolean parseResult = parse(dataset, null, false, true);
 964     if (!parseResult)
 965     {
 966       // pass error up somehow
 967     }
 968     if (av != null)
 969     {
 970       // update viewport with the dataset data ?
 971     }
 972     else
 973     {
 974       setSeqs(dataset.getSequencesArray());
 975     }
 976   }
 977
 978   /**
 979    * Implementation of unused abstract method
 980    *
 981    * @return error message
 982    */
 983   @Override
 984   public String print()
 985   {
 986     return "Use printGffFormat() or printJalviewFormat()";
 987   }
 988
 989   /**
 990    * Returns features output in GFF2 format, including hidden and non-positional
 991    * features
 992    *
 993    * @param sequences
 994    *          the sequences whose features are to be output
 995    * @param visible
 996    *          a map whose keys are the type names of visible features
 997    * @return
 998    */
 999   public String printGffFormat(SequenceI[] sequences, Map<String, Object> visible)
1000   {
1001     return printGffFormat(sequences, visible, true, true);
1002   }
1003
1004   /**
1005    * Returns features output in GFF2 format
1006    *
1007    * @param sequences
1008    *          the sequences whose features are to be output
1009    * @param visible
1010    *          a map whose keys are the type names of visible features
1011    * @param outputVisibleOnly
1012    * @param includeNonPositionalFeatures
1013    * @return
1014    */
1015   public String printGffFormat(SequenceI[] sequences, Map<String, Object> visible, boolean outputVisibleOnly,
1016           boolean includeNonPositionalFeatures)
1017   {
1018     StringBuilder out = new StringBuilder(256);
1019     out.append(String.format("%s %d\n", GFF_VERSION, gffVersion));
1020     String source;
1021     boolean isnonpos;
1022     for (SequenceI seq : sequences)
1023     {
1024       SequenceFeature[] features = seq.getSequenceFeatures();
1025       if (features != null)
1026       {
1027         for (SequenceFeature sf : features)
1028         {
1029           isnonpos = sf.begin == 0 && sf.end == 0;
1030           if (!includeNonPositionalFeatures && isnonpos)
1031           {
1032             /*
1033              * ignore non-positional features if not wanted
1034              */
1035             continue;
1036           }
1037           // TODO why the test !isnonpos here?
1038           // what about not visible non-positional features?
1039           if (!isnonpos && outputVisibleOnly
1040                   && !visible.containsKey(sf.type))
1041           {
1042             /*
1043              * ignore not visible features if not wanted
1044              */
1045             continue;
1046           }
1047
1048           source = sf.featureGroup;
1049           if (source == null)
1050           {
1051             source = sf.getDescription();
1052           }
1053
1054           out.append(seq.getName());
1055           out.append(TAB);
1056           out.append(source);
1057           out.append(TAB);
1058           out.append(sf.type);
1059           out.append(TAB);
1060           out.append(sf.begin);
1061           out.append(TAB);
1062           out.append(sf.end);
1063           out.append(TAB);
1064           out.append(sf.score);
1065           out.append(TAB);
1066
1067           out.append(sf.getValue(STRAND, "."));
1068           out.append(TAB);
1069
1070           out.append(sf.getValue(FRAME, "."));
1071
1072           // miscellaneous key-values (GFF column 9)
1073           String attributes = (String) sf.getValue(ATTRIBUTES);
1074           if (attributes != null)
1075           {
1076             out.append(TAB).append(attributes);
1077           }
1078
1079           out.append(newline);
1080         }
1081       }
1082     }
1083
1084     return out.toString();
1085   }
1086
1087   /**
1088    * Returns a mapping given list of one or more Align descriptors (exonerate
1089    * format)
1090    *
1091    * @param alignedRegions
1092    *          a list of "Align fromStart toStart fromCount"
1093    * @param mapIsFromCdna
1094    *          if true, 'from' is dna, else 'from' is protein
1095    * @param strand
1096    *          either 1 (forward) or -1 (reverse)
1097    * @return
1098    * @throws IOException
1099    */
1100   protected MapList constructCodonMappingFromAlign(
1101           List<String> alignedRegions, boolean mapIsFromCdna, int strand)
1102           throws IOException
1103   {
1104     if (strand == 0)
1105     {
1106       throw new IOException(
1107               "Invalid strand for a codon mapping (cannot be 0)");
1108     }
1109     int regions = alignedRegions.size();
1110     // arrays to hold [start, end] for each aligned region
1111     int[] fromRanges = new int[regions * 2]; // from dna
1112     int[] toRanges = new int[regions * 2]; // to protein
1113     int fromRangesIndex = 0;
1114     int toRangesIndex = 0;
1115
1116     for (String range : alignedRegions)
1117     {
1118       /*
1119        * Align mapFromStart mapToStart mapFromCount
1120        * e.g. if mapIsFromCdna
1121        *     Align 11270 143 120
1122        * means:
1123        *     120 bases from pos 11270 align to pos 143 in peptide
1124        * if !mapIsFromCdna this would instead be
1125        *     Align 143 11270 40
1126        */
1127       String[] tokens = range.split(" ");
1128       if (tokens.length != 3)
1129       {
1130         throw new IOException("Wrong number of fields for Align");
1131       }
1132       int fromStart = 0;
1133       int toStart = 0;
1134       int fromCount = 0;
1135       try
1136       {
1137         fromStart = Integer.parseInt(tokens[0]);
1138         toStart = Integer.parseInt(tokens[1]);
1139         fromCount = Integer.parseInt(tokens[2]);
1140       } catch (NumberFormatException nfe)
1141       {
1142         throw new IOException("Invalid number in Align field: "
1143                 + nfe.getMessage());
1144       }
1145
1146       /*
1147        * Jalview always models from dna to protein, so adjust values if the
1148        * GFF mapping is from protein to dna
1149        */
1150       if (!mapIsFromCdna)
1151       {
1152         fromCount *= 3;
1153         int temp = fromStart;
1154         fromStart = toStart;
1155         toStart = temp;
1156       }
1157       fromRanges[fromRangesIndex++] = fromStart;
1158       fromRanges[fromRangesIndex++] = fromStart + strand * (fromCount - 1);
1159
1160       /*
1161        * If a codon has an intron gap, there will be contiguous 'toRanges';
1162        * this is handled for us by the MapList constructor.
1163        * (It is not clear that exonerate ever generates this case)
1164        */
1165       toRanges[toRangesIndex++] = toStart;
1166       toRanges[toRangesIndex++] = toStart + (fromCount - 1) / 3;
1167     }
1168
1169     return new MapList(fromRanges, toRanges, 3, 1);
1170   }
1171
1172   /**
1173    * Parse a GFF format feature. This may include creating a 'dummy' sequence
1174    * for the feature or its mapped sequence
1175    *
1176    * @param st
1177    * @param alignment
1178    * @param relaxedIdMatching
1179    * @param newseqs
1180    * @return
1181    */
1182   protected SequenceI parseGffFeature(StringTokenizer st,
1183           AlignmentI alignment, boolean relaxedIdMatching,
1184           List<SequenceI> newseqs)
1185   {
1186     SequenceI seq;
1187     /*
1188      * GFF: seqid source type start end score strand phase [attributes]
1189      */
1190     if (st.countTokens() < 8)
1191     {
1192       System.err
1193               .println("Ignoring GFF feature line with unexpected number of columns ("
1194                       + st.countTokens() + ")");
1195       return null;
1196     }
1197     String seqId = st.nextToken();
1198
1199     /*
1200      * locate referenced sequence in alignment _or_
1201      * as a forward reference (SequenceDummy)
1202      */
1203     seq = findName(alignment, newseqs, relaxedIdMatching, seqId);
1204
1205     String desc = st.nextToken();
1206     String group = null;
1207     if (desc.indexOf(' ') == -1)
1208     {
1209       // could also be a source term rather than description line
1210       group = desc;
1211     }
1212     String ft = st.nextToken();
1213     int startPos = StringUtils.parseInt(st.nextToken());
1214     int endPos = StringUtils.parseInt(st.nextToken());
1215     // TODO: decide if non positional feature assertion for input data
1216     // where end==0 is generally valid
1217     if (endPos == 0)
1218     {
1219       // treat as non-positional feature, regardless.
1220       startPos = 0;
1221     }
1222     float score = 0f;
1223     try
1224     {
1225       score = new Float(st.nextToken()).floatValue();
1226     } catch (NumberFormatException ex)
1227     {
1228       // leave at 0
1229     }
1230
1231     SequenceFeature sf = new SequenceFeature(ft, desc, startPos,
1232             endPos, score, group);
1233     if (st.hasMoreTokens())
1234     {
1235       sf.setValue(STRAND, st.nextToken());
1236     }
1237     if (st.hasMoreTokens())
1238     {
1239       sf.setValue(FRAME, st.nextToken());
1240     }
1241
1242     if (st.hasMoreTokens())
1243     {
1244       processGffColumnNine(st.nextToken(), sf);
1245     }
1246
1247     if (processOrAddSeqFeature(alignment, newseqs, seq, sf,
1248             relaxedIdMatching))
1249     {
1250       // check whether we should add the sequence feature to any other
1251       // sequences in the alignment with the same or similar
1252       while ((seq = alignment.findName(seq, seqId, true)) != null)
1253       {
1254         seq.addSequenceFeature(new SequenceFeature(sf));
1255       }
1256     }
1257     return seq;
1258   }
1259
1260   /**
1261    * Process the 'column 9' data of the GFF file. This is less formally defined,
1262    * and its interpretation will vary depending on the tool that has generated
1263    * it.
1264    *
1265    * @param attributes
1266    * @param sf
1267    */
1268   protected void processGffColumnNine(String attributes, SequenceFeature sf)
1269   {
1270     sf.setValue(ATTRIBUTES, attributes);
1271
1272     /*
1273      * Parse attributes in column 9 and add them to the sequence feature's
1274      * 'otherData' table; use Note as a best proxy for description
1275      */
1276     char[] nameValueSeparator = new char[] { gffVersion == 3 ? '=' : ' ' };
1277     Map<String, List<String>> nameValues = StringUtils.parseNameValuePairs(attributes, ";",
1278             nameValueSeparator);
1279     for (Entry<String, List<String>> attr : nameValues.entrySet())
1280     {
1281       String values = StringUtils.listToDelimitedString(attr.getValue(),
1282               "; ");
1283       sf.setValue(attr.getKey(), values);
1284       if (NOTE.equals(attr.getKey()))
1285       {
1286         sf.setDescription(values);
1287       }
1288     }
1289   }
1290
1291   /**
1292    * After encountering ##fasta in a GFF3 file, process the remainder of the
1293    * file as FAST sequence data. Any placeholder sequences created during
1294    * feature parsing are updated with the actual sequences.
1295    *
1296    * @param align
1297    * @param newseqs
1298    * @throws IOException
1299    */
1300   protected void processAsFasta(AlignmentI align, List<SequenceI> newseqs)
1301           throws IOException
1302   {
1303     try
1304     {
1305       mark();
1306     } catch (IOException q)
1307     {
1308     }
1309     FastaFile parser = new FastaFile(this);
1310     List<SequenceI> includedseqs = parser.getSeqs();
1311     SequenceIdMatcher smatcher = new SequenceIdMatcher(newseqs);
1312     // iterate over includedseqs, and replacing matching ones with newseqs
1313     // sequences. Generic iterator not used here because we modify includedseqs
1314     // as we go
1315     for (int p = 0, pSize = includedseqs.size(); p < pSize; p++)
1316     {
1317       // search for any dummy seqs that this sequence can be used to update
1318       SequenceI dummyseq = smatcher.findIdMatch(includedseqs.get(p));
1319       if (dummyseq != null)
1320       {
1321         // dummyseq was created so it could be annotated and referred to in
1322         // alignments/codon mappings
1323
1324         SequenceI mseq = includedseqs.get(p);
1325         // mseq is the 'template' imported from the FASTA file which we'll use
1326         // to coomplete dummyseq
1327         if (dummyseq instanceof SequenceDummy)
1328         {
1329           // probably have the pattern wrong
1330           // idea is that a flyweight proxy for a sequence ID can be created for
1331           // 1. stable reference creation
1332           // 2. addition of annotation
1333           // 3. future replacement by a real sequence
1334           // current pattern is to create SequenceDummy objects - a convenience
1335           // constructor for a Sequence.
1336           // problem is that when promoted to a real sequence, all references
1337           // need
1338           // to be updated somehow.
1339           ((SequenceDummy) dummyseq).become(mseq);
1340           includedseqs.set(p, dummyseq); // template is no longer needed
1341         }
1342       }
1343     }
1344     // finally add sequences to the dataset
1345     for (SequenceI seq : includedseqs)
1346     {
1347       align.addSequence(seq);
1348     }
1349   }
1350
1351   /**
1352    * Process a ## directive
1353    *
1354    * @param line
1355    * @param gffProps
1356    * @param align
1357    * @param newseqs
1358    * @throws IOException
1359    */
1360   protected void processGffPragma(String line, Map<String, String> gffProps, AlignmentI align,
1361           List<SequenceI> newseqs) throws IOException
1362   {
1363     line = line.trim();
1364     if ("###".equals(line))
1365     {
1366       // close off any open 'forward references'
1367       return;
1368     }
1369
1370     String[] tokens = line.substring(2).split(" ");
1371     String pragma = tokens[0];
1372     String value = tokens.length == 1 ? null : tokens[1];
1373
1374     if ("gff-version".equalsIgnoreCase(pragma))
1375     {
1376       if (value != null)
1377       {
1378         try
1379         {
1380           // value may be e.g. "3.1.2"
1381           gffVersion = Integer.parseInt(value.split("\\.")[0]);
1382         } catch (NumberFormatException e)
1383         {
1384           // ignore
1385         }
1386       }
1387     }
1388     else if ("feature-ontology".equalsIgnoreCase(pragma))
1389     {
1390       // should resolve against the specified feature ontology URI
1391     }
1392     else if ("attribute-ontology".equalsIgnoreCase(pragma))
1393     {
1394       // URI of attribute ontology - not currently used in GFF3
1395     }
1396     else if ("source-ontology".equalsIgnoreCase(pragma))
1397     {
1398       // URI of source ontology - not currently used in GFF3
1399     }
1400     else if ("species-build".equalsIgnoreCase(pragma))
1401     {
1402       // save URI of specific NCBI taxon version of annotations
1403       gffProps.put("species-build", value);
1404     }
1405     else if ("fasta".equalsIgnoreCase(pragma))
1406     {
1407       // process the rest of the file as a fasta file and replace any dummy
1408       // sequence IDs
1409       processAsFasta(align, newseqs);
1410     }
1411     else
1412     {
1413       System.err.println("Ignoring unknown pragma: " + line);
1414     }
1415   }
1416
1417   /**
1418    * Processes the 'Query' (or 'Target') and 'Align' properties associated with
1419    * an exonerate GFF similarity feature; these properties define the mapping of
1420    * the annotated feature (e.g. 'exon') to a related sequence.
1421    *
1422    * @param set
1423    * @param seq
1424    * @param sf
1425    * @param align
1426    * @param newseqs
1427    * @param relaxedIdMatching
1428    * @throws IOException
1429    */
1430   public void processGffSimilarity(Map<String, List<String>> set, SequenceI seq,
1431           SequenceFeature sf, AlignmentI align, List<SequenceI> newseqs, boolean relaxedIdMatching)
1432           throws IOException
1433   {
1434     if (!validateExonerateModel(sf))
1435     {
1436       return;
1437     }
1438
1439     int strand = sf.getStrand();
1440
1441     /*
1442      * exonerate (protein2dna or protein2genome) may be run with
1443      * --showquerygff  outputs
1444      *     Target <dnaseqid> ; Align proteinStartPos dnaStartPos peptideCount
1445      * --showtargetgff outputs
1446      *     Query <proteinseqid> ; Align dnaStartPos proteinStartPos nucleotideCount
1447      * where the Align spec may repeat
1448      */
1449     boolean mapIsFromCdna = true;
1450     List<String> mapTo = set.get(QUERY);
1451     if (mapTo == null)
1452     {
1453       mapTo = set.get(TARGET);
1454       mapIsFromCdna = false;
1455     }
1456     if (mapTo == null || mapTo.size() != 1)
1457     {
1458       throw new IOException(
1459               "Expecting exactly one sequence in Query field (got " + mapTo
1460                       + ")");
1461     }
1462
1463     /*
1464      * locate the mapped sequence in the alignment or 'new' (GFF file) sequences;
1465      */
1466     SequenceI mappedSequence = findName(align, newseqs, relaxedIdMatching,
1467             mapTo.get(0));
1468       /*
1469        * Process the Align maps and create cdna/protein maps;
1470        * ideally, the query sequences are in the alignment, but maybe not...
1471        */
1472     AlignedCodonFrame alco = new AlignedCodonFrame();
1473     MapList codonmapping = constructCodonMappingFromAlign(set.get(ALIGN),
1474             mapIsFromCdna, strand);
1475
1476     /*
1477      * Jalview always maps from dna to protein
1478      */
1479     if (mapIsFromCdna)
1480     {
1481       alco.addMap(seq, mappedSequence, codonmapping);
1482     }
1483     else
1484     {
1485       alco.addMap(mappedSequence, seq, codonmapping);
1486     }
1487     align.addCodonFrame(alco);
1488   }
1489
1490   /**
1491    * Returns true if the exonerate model (saved from column 2 of the GFF as the
1492    * SequenceFeature's group) is one that we are willing to process, else false
1493    *
1494    * @param sf
1495    */
1496   protected boolean validateExonerateModel(SequenceFeature sf)
1497   {
1498     /*
1499      * we don't handle protein-to-protein or dna-to-dna alignment here
1500      */
1501     String source = sf.getFeatureGroup();
1502     if (source == null
1503             || (!source.contains("protein2dna") && !source
1504                     .contains("protein2genome")))
1505     {
1506       System.err
1507               .println("I only accept protein2dna or protein2genome but found "
1508                       + source);
1509       return false;
1510     }
1511     return true;
1512   }
1513
1514   /**
1515    * take a sequence feature and examine its attributes to decide how it should
1516    * be added to a sequence
1517    *
1518    * @param seq
1519    *          - the destination sequence constructed or discovered in the
1520    *          current context
1521    * @param sf
1522    *          - the base feature with ATTRIBUTES property containing any
1523    *          additional attributes
1524    * @param gFFFile
1525    *          - true if we are processing a GFF annotation file
1526    * @return true if sf was actually added to the sequence, false if it was
1527    *         processed in another way
1528    */
1529   public boolean processOrAddSeqFeature(AlignmentI align, List<SequenceI> newseqs,
1530           SequenceI seq, SequenceFeature sf, boolean relaxedIdMatching)
1531   {
1532     String attr = (String) sf.getValue(ATTRIBUTES);
1533     boolean addFeature = true;
1534     if (attr != null)
1535     {
1536       for (String attset : attr.split(TAB))
1537       {
1538         Map<String, List<String>> set = StringUtils.parseNameValuePairs(
1539                 attset, ";", new char[] { ' ', '-' });
1540
1541         if (SIMILARITY.equals(sf.getType()))
1542         {
1543           try
1544           {
1545             addFeature = false;
1546             processGffSimilarity(set, seq, sf, align, newseqs,
1547                     relaxedIdMatching);
1548           } catch (IOException ivfe)
1549           {
1550             System.err.println(ivfe);
1551           }
1552         }
1553       }
1554     }
1555     if (addFeature)
1556     {
1557       seq.addSequenceFeature(sf);
1558     }
1559     return addFeature;
1560   }
1561
1562 }