src/jalview/io/FeaturesFile.java

   1 /*
   2  * Jalview - A Sequence Alignment Editor and Viewer ($$Version-Rel$$)
   3  * Copyright (C) $$Year-Rel$$ The Jalview Authors
   4  *
   5  * This file is part of Jalview.
   6  *
   7  * Jalview is free software: you can redistribute it and/or
   8  * modify it under the terms of the GNU General Public License
   9  * as published by the Free Software Foundation, either version 3
  10  * of the License, or (at your option) any later version.
  11  *
  12  * Jalview is distributed in the hope that it will be useful, but
  13  * WITHOUT ANY WARRANTY; without even the implied warranty
  14  * of MERCHANTABILITY or FITNESS FOR A PARTICULAR
  15  * PURPOSE.  See the GNU General Public License for more details.
  16  *
  17  * You should have received a copy of the GNU General Public License
  18  * along with Jalview.  If not, see <http://www.gnu.org/licenses/>.
  19  * The Jalview Authors are detailed in the 'AUTHORS' file.
  20  */
  21 package jalview.io;
  22
  23 import jalview.analysis.AlignmentUtils;
  24 import jalview.analysis.SequenceIdMatcher;
  25 import jalview.api.AlignViewportI;
  26 import jalview.api.FeatureColourI;
  27 import jalview.api.FeatureRenderer;
  28 import jalview.api.FeaturesSourceI;
  29 import jalview.datamodel.AlignedCodonFrame;
  30 import jalview.datamodel.Alignment;
  31 import jalview.datamodel.AlignmentI;
  32 import jalview.datamodel.MappedFeatures;
  33 import jalview.datamodel.SequenceDummy;
  34 import jalview.datamodel.SequenceFeature;
  35 import jalview.datamodel.SequenceI;
  36 import jalview.datamodel.features.FeatureMatcherSet;
  37 import jalview.datamodel.features.FeatureMatcherSetI;
  38 import jalview.gui.Desktop;
  39 import jalview.io.gff.GffHelperBase;
  40 import jalview.io.gff.GffHelperFactory;
  41 import jalview.io.gff.GffHelperI;
  42 import jalview.schemes.FeatureColour;
  43 import jalview.util.ColorUtils;
  44 import jalview.util.MapList;
  45 import jalview.util.ParseHtmlBodyAndLinks;
  46 import jalview.util.StringUtils;
  47
  48 import java.awt.Color;
  49 import java.io.IOException;
  50 import java.util.ArrayList;
  51 import java.util.Arrays;
  52 import java.util.Collections;
  53 import java.util.HashMap;
  54 import java.util.LinkedHashMap;
  55 import java.util.List;
  56 import java.util.Map;
  57 import java.util.Map.Entry;
  58 import java.util.TreeMap;
  59
  60 /**
  61  * Parses and writes features files, which may be in Jalview, GFF2 or GFF3
  62  * format. These are tab-delimited formats but with differences in the use of
  63  * columns.
  64  *
  65  * A Jalview feature file may define feature colours and then declare that the
  66  * remainder of the file is in GFF format with the line 'GFF'.
  67  *
  68  * GFF3 files may include alignment mappings for features, which Jalview will
  69  * attempt to model, and may include sequence data following a ##FASTA line.
  70  *
  71  *
  72  * @author AMW
  73  * @author jbprocter
  74  * @author gmcarstairs
  75  */
  76 public class FeaturesFile extends AlignFile implements FeaturesSourceI
  77 {
  78   private static final String TAB_REGEX = "\\t";
  79
  80   private static final String STARTGROUP = "STARTGROUP";
  81
  82   private static final String ENDGROUP = "ENDGROUP";
  83
  84   private static final String STARTFILTERS = "STARTFILTERS";
  85
  86   private static final String ENDFILTERS = "ENDFILTERS";
  87
  88   private static final String ID_NOT_SPECIFIED = "ID_NOT_SPECIFIED";
  89
  90   private static final String NOTE = "Note";
  91
  92   protected static final String GFF_VERSION = "##gff-version";
  93
  94   private AlignmentI lastmatchedAl = null;
  95
  96   private SequenceIdMatcher matcher = null;
  97
  98   protected AlignmentI dataset;
  99
 100   protected int gffVersion;
 101
 102   /**
 103    * Creates a new FeaturesFile object.
 104    */
 105   public FeaturesFile()
 106   {
 107   }
 108
 109   /**
 110    * Constructor which does not parse the file immediately
 111    *
 112    * @param file
 113    * @param paste
 114    * @throws IOException
 115    */
 116   public FeaturesFile(String file, DataSourceType paste)
 117           throws IOException
 118   {
 119     super(false, file, paste);
 120   }
 121
 122   /**
 123    * @param source
 124    * @throws IOException
 125    */
 126   public FeaturesFile(FileParse source) throws IOException
 127   {
 128     super(source);
 129   }
 130
 131   /**
 132    * Constructor that optionally parses the file immediately
 133    *
 134    * @param parseImmediately
 135    * @param file
 136    * @param type
 137    * @throws IOException
 138    */
 139   public FeaturesFile(boolean parseImmediately, String file,
 140           DataSourceType type) throws IOException
 141   {
 142     super(parseImmediately, file, type);
 143   }
 144
 145   /**
 146    * Parse GFF or sequence features file using case-independent matching,
 147    * discarding URLs
 148    *
 149    * @param align
 150    *          - alignment/dataset containing sequences that are to be annotated
 151    * @param colours
 152    *          - hashtable to store feature colour definitions
 153    * @param removeHTML
 154    *          - process html strings into plain text
 155    * @return true if features were added
 156    */
 157   public boolean parse(AlignmentI align,
 158           Map<String, FeatureColourI> colours, boolean removeHTML)
 159   {
 160     return parse(align, colours, removeHTML, false);
 161   }
 162
 163   /**
 164    * Extends the default addProperties by also adding peptide-to-cDNA mappings
 165    * (if any) derived while parsing a GFF file
 166    */
 167   @Override
 168   public void addProperties(AlignmentI al)
 169   {
 170     super.addProperties(al);
 171     if (dataset != null && dataset.getCodonFrames() != null)
 172     {
 173       AlignmentI ds = (al.getDataset() == null) ? al : al.getDataset();
 174       for (AlignedCodonFrame codons : dataset.getCodonFrames())
 175       {
 176         ds.addCodonFrame(codons);
 177       }
 178     }
 179   }
 180
 181   /**
 182    * Parse GFF or Jalview format sequence features file
 183    *
 184    * @param align
 185    *          - alignment/dataset containing sequences that are to be annotated
 186    * @param colours
 187    *          - map to store feature colour definitions
 188    * @param removeHTML
 189    *          - process html strings into plain text
 190    * @param relaxedIdmatching
 191    *          - when true, ID matches to compound sequence IDs are allowed
 192    * @return true if features were added
 193    */
 194   public boolean parse(AlignmentI align,
 195           Map<String, FeatureColourI> colours, boolean removeHTML,
 196           boolean relaxedIdmatching)
 197   {
 198     return parse(align, colours, null, removeHTML, relaxedIdmatching);
 199   }
 200
 201   /**
 202    * Parse GFF or Jalview format sequence features file
 203    *
 204    * @param align
 205    *          - alignment/dataset containing sequences that are to be annotated
 206    * @param colours
 207    *          - map to store feature colour definitions
 208    * @param filters
 209    *          - map to store feature filter definitions
 210    * @param removeHTML
 211    *          - process html strings into plain text
 212    * @param relaxedIdmatching
 213    *          - when true, ID matches to compound sequence IDs are allowed
 214    * @return true if features were added
 215    */
 216   public boolean parse(AlignmentI align,
 217           Map<String, FeatureColourI> colours,
 218           Map<String, FeatureMatcherSetI> filters, boolean removeHTML,
 219           boolean relaxedIdmatching)
 220   {
 221     Map<String, String> gffProps = new HashMap<>();
 222     /*
 223      * keep track of any sequences we try to create from the data
 224      */
 225     List<SequenceI> newseqs = new ArrayList<>();
 226
 227     String line = null;
 228     try
 229     {
 230       String[] gffColumns;
 231       String featureGroup = null;
 232
 233       while ((line = nextLine()) != null)
 234       {
 235         // skip comments/process pragmas
 236         if (line.length() == 0 || line.startsWith("#"))
 237         {
 238           if (line.toLowerCase().startsWith("##"))
 239           {
 240             processGffPragma(line, gffProps, align, newseqs);
 241           }
 242           continue;
 243         }
 244
 245         gffColumns = line.split(TAB_REGEX);
 246         if (gffColumns.length == 1)
 247         {
 248           if (line.trim().equalsIgnoreCase("GFF"))
 249           {
 250             /*
 251              * Jalview features file with appended GFF
 252              * assume GFF2 (though it may declare ##gff-version 3)
 253              */
 254             gffVersion = 2;
 255             continue;
 256           }
 257         }
 258
 259         if (gffColumns.length > 0 && gffColumns.length < 4)
 260         {
 261           /*
 262            * if 2 or 3 tokens, we anticipate either 'startgroup', 'endgroup' or
 263            * a feature type colour specification
 264            */
 265           String ft = gffColumns[0];
 266           if (ft.equalsIgnoreCase(STARTFILTERS))
 267           {
 268             parseFilters(filters);
 269             continue;
 270           }
 271           if (ft.equalsIgnoreCase(STARTGROUP))
 272           {
 273             featureGroup = gffColumns[1];
 274           }
 275           else if (ft.equalsIgnoreCase(ENDGROUP))
 276           {
 277             // We should check whether this is the current group,
 278             // but at present there's no way of showing more than 1 group
 279             featureGroup = null;
 280           }
 281           else
 282           {
 283             String colscheme = gffColumns[1];
 284             FeatureColourI colour = FeatureColour
 285                     .parseJalviewFeatureColour(colscheme);
 286             if (colour != null)
 287             {
 288               colours.put(ft, colour);
 289             }
 290           }
 291           continue;
 292         }
 293
 294         /*
 295          * if not a comment, GFF pragma, startgroup, endgroup or feature
 296          * colour specification, that just leaves a feature details line
 297          * in either Jalview or GFF format
 298          */
 299         if (gffVersion == 0)
 300         {
 301           parseJalviewFeature(line, gffColumns, align, colours, removeHTML,
 302                   relaxedIdmatching, featureGroup);
 303         }
 304         else
 305         {
 306           parseGff(gffColumns, align, relaxedIdmatching, newseqs);
 307         }
 308       }
 309       resetMatcher();
 310     } catch (Exception ex)
 311     {
 312       // should report somewhere useful for UI if necessary
 313       warningMessage = ((warningMessage == null) ? "" : warningMessage)
 314               + "Parsing error at\n" + line;
 315       System.out.println("Error parsing feature file: " + ex + "\n" + line);
 316       ex.printStackTrace(System.err);
 317       resetMatcher();
 318       return false;
 319     }
 320
 321     /*
 322      * experimental - add any dummy sequences with features to the alignment
 323      * - we need them for Ensembl feature extraction - though maybe not otherwise
 324      */
 325     for (SequenceI newseq : newseqs)
 326     {
 327       if (newseq.getFeatures().hasFeatures())
 328       {
 329         align.addSequence(newseq);
 330       }
 331     }
 332     return true;
 333   }
 334
 335   /**
 336    * Reads input lines from STARTFILTERS to ENDFILTERS and adds a feature type
 337    * filter to the map for each line parsed. After exit from this method,
 338    * nextLine() should return the line after ENDFILTERS (or we are already at
 339    * end of file if ENDFILTERS was missing).
 340    *
 341    * @param filters
 342    * @throws IOException
 343    */
 344   protected void parseFilters(Map<String, FeatureMatcherSetI> filters)
 345           throws IOException
 346   {
 347     String line;
 348     while ((line = nextLine()) != null)
 349     {
 350       if (line.toUpperCase().startsWith(ENDFILTERS))
 351       {
 352         return;
 353       }
 354       String[] tokens = line.split(TAB_REGEX);
 355       if (tokens.length != 2)
 356       {
 357         System.err.println(String.format("Invalid token count %d for %d",
 358                 tokens.length, line));
 359       }
 360       else
 361       {
 362         String featureType = tokens[0];
 363         FeatureMatcherSetI fm = FeatureMatcherSet.fromString(tokens[1]);
 364         if (fm != null && filters != null)
 365         {
 366           filters.put(featureType, fm);
 367         }
 368       }
 369     }
 370   }
 371
 372   /**
 373    * Try to parse a Jalview format feature specification and add it as a
 374    * sequence feature to any matching sequences in the alignment. Returns true
 375    * if successful (a feature was added), or false if not.
 376    *
 377    * @param line
 378    * @param gffColumns
 379    * @param alignment
 380    * @param featureColours
 381    * @param removeHTML
 382    * @param relaxedIdmatching
 383    * @param featureGroup
 384    */
 385   protected boolean parseJalviewFeature(String line, String[] gffColumns,
 386           AlignmentI alignment, Map<String, FeatureColourI> featureColours,
 387           boolean removeHTML, boolean relaxedIdMatching,
 388           String featureGroup)
 389   {
 390     /*
 391      * tokens: description seqid seqIndex start end type [score]
 392      */
 393     if (gffColumns.length < 6)
 394     {
 395       System.err.println("Ignoring feature line '" + line
 396               + "' with too few columns (" + gffColumns.length + ")");
 397       return false;
 398     }
 399     String desc = gffColumns[0];
 400     String seqId = gffColumns[1];
 401     SequenceI seq = findSequence(seqId, alignment, null, relaxedIdMatching);
 402
 403     if (!ID_NOT_SPECIFIED.equals(seqId))
 404     {
 405       seq = findSequence(seqId, alignment, null, relaxedIdMatching);
 406     }
 407     else
 408     {
 409       seqId = null;
 410       seq = null;
 411       String seqIndex = gffColumns[2];
 412       try
 413       {
 414         int idx = Integer.parseInt(seqIndex);
 415         seq = alignment.getSequenceAt(idx);
 416       } catch (NumberFormatException ex)
 417       {
 418         System.err.println("Invalid sequence index: " + seqIndex);
 419       }
 420     }
 421
 422     if (seq == null)
 423     {
 424       System.out.println("Sequence not found: " + line);
 425       return false;
 426     }
 427
 428     int startPos = Integer.parseInt(gffColumns[3]);
 429     int endPos = Integer.parseInt(gffColumns[4]);
 430
 431     String ft = gffColumns[5];
 432
 433     if (!featureColours.containsKey(ft))
 434     {
 435       /*
 436        * Perhaps an old style groups file with no colours -
 437        * synthesize a colour from the feature type
 438        */
 439       Color colour = ColorUtils.createColourFromName(ft);
 440       featureColours.put(ft, new FeatureColour(colour));
 441     }
 442     SequenceFeature sf = null;
 443     if (gffColumns.length > 6)
 444     {
 445       float score = Float.NaN;
 446       try
 447       {
 448         score = Float.valueOf(gffColumns[6]).floatValue();
 449       } catch (NumberFormatException ex)
 450       {
 451         sf = new SequenceFeature(ft, desc, startPos, endPos, featureGroup);
 452       }
 453       sf = new SequenceFeature(ft, desc, startPos, endPos, score,
 454               featureGroup);
 455     }
 456     else
 457     {
 458       sf = new SequenceFeature(ft, desc, startPos, endPos, featureGroup);
 459     }
 460
 461     parseDescriptionHTML(sf, removeHTML);
 462
 463     seq.addSequenceFeature(sf);
 464
 465     while (seqId != null
 466             && (seq = alignment.findName(seq, seqId, false)) != null)
 467     {
 468       seq.addSequenceFeature(new SequenceFeature(sf));
 469     }
 470     return true;
 471   }
 472
 473   /**
 474    * clear any temporary handles used to speed up ID matching
 475    */
 476   protected void resetMatcher()
 477   {
 478     lastmatchedAl = null;
 479     matcher = null;
 480   }
 481
 482   /**
 483    * Returns a sequence matching the given id, as follows
 484    * <ul>
 485    * <li>strict matching is on exact sequence name</li>
 486    * <li>relaxed matching allows matching on a token within the sequence name,
 487    * or a dbxref</li>
 488    * <li>first tries to find a match in the alignment sequences</li>
 489    * <li>else tries to find a match in the new sequences already generated while
 490    * parsing the features file</li>
 491    * <li>else creates a new placeholder sequence, adds it to the new sequences
 492    * list, and returns it</li>
 493    * </ul>
 494    *
 495    * @param seqId
 496    * @param align
 497    * @param newseqs
 498    * @param relaxedIdMatching
 499    *
 500    * @return
 501    */
 502   protected SequenceI findSequence(String seqId, AlignmentI align,
 503           List<SequenceI> newseqs, boolean relaxedIdMatching)
 504   {
 505     // TODO encapsulate in SequenceIdMatcher, share the matcher
 506     // with the GffHelper (removing code duplication)
 507     SequenceI match = null;
 508     if (relaxedIdMatching)
 509     {
 510       if (lastmatchedAl != align)
 511       {
 512         lastmatchedAl = align;
 513         matcher = new SequenceIdMatcher(align.getSequencesArray());
 514         if (newseqs != null)
 515         {
 516           matcher.addAll(newseqs);
 517         }
 518       }
 519       match = matcher.findIdMatch(seqId);
 520     }
 521     else
 522     {
 523       match = align.findName(seqId, true);
 524       if (match == null && newseqs != null)
 525       {
 526         for (SequenceI m : newseqs)
 527         {
 528           if (seqId.equals(m.getName()))
 529           {
 530             return m;
 531           }
 532         }
 533       }
 534
 535     }
 536     if (match == null && newseqs != null)
 537     {
 538       match = new SequenceDummy(seqId);
 539       if (relaxedIdMatching)
 540       {
 541         matcher.addAll(Arrays.asList(new SequenceI[] { match }));
 542       }
 543       // add dummy sequence to the newseqs list
 544       newseqs.add(match);
 545     }
 546     return match;
 547   }
 548
 549   public void parseDescriptionHTML(SequenceFeature sf, boolean removeHTML)
 550   {
 551     if (sf.getDescription() == null)
 552     {
 553       return;
 554     }
 555     ParseHtmlBodyAndLinks parsed = new ParseHtmlBodyAndLinks(
 556             sf.getDescription(), removeHTML, newline);
 557
 558     if (removeHTML)
 559     {
 560       sf.setDescription(parsed.getNonHtmlContent());
 561     }
 562
 563     for (String link : parsed.getLinks())
 564     {
 565       sf.addLink(link);
 566     }
 567   }
 568
 569   /**
 570    * Returns contents of a Jalview format features file, for visible features, as
 571    * filtered by type and group. Features with a null group are displayed if their
 572    * feature type is visible. Non-positional features may optionally be included
 573    * (with no check on type or group).
 574    *
 575    * @param sequences
 576    * @param fr
 577    * @param includeNonPositional
 578    *                               if true, include non-positional features
 579    *                               (regardless of group or type)
 580    * @param includeComplement
 581    *                               if true, include visible complementary
 582    *                               (CDS/protein) positional features, with
 583    *                               locations converted to local sequence
 584    *                               coordinates
 585    * @return
 586    */
 587   public String printJalviewFormat(SequenceI[] sequences,
 588           FeatureRenderer fr, boolean includeNonPositional,
 589           boolean includeComplement)
 590   {
 591     Map<String, FeatureColourI> visibleColours = fr
 592             .getDisplayedFeatureCols();
 593     Map<String, FeatureMatcherSetI> featureFilters = fr.getFeatureFilters();
 594
 595     /*
 596      * write out feature colours (if we know them)
 597      */
 598     // TODO: decide if feature links should also be written here ?
 599     StringBuilder out = new StringBuilder(256);
 600     if (visibleColours != null)
 601     {
 602       for (Entry<String, FeatureColourI> featureColour : visibleColours
 603               .entrySet())
 604       {
 605         FeatureColourI colour = featureColour.getValue();
 606         out.append(colour.toJalviewFormat(featureColour.getKey())).append(
 607                 newline);
 608       }
 609     }
 610
 611     String[] types = visibleColours == null ? new String[0]
 612             : visibleColours.keySet()
 613                     .toArray(new String[visibleColours.keySet().size()]);
 614
 615     /*
 616      * feature filters if any
 617      */
 618     outputFeatureFilters(out, visibleColours, featureFilters);
 619
 620     /*
 621      * output features within groups
 622      */
 623     int count = outputFeaturesByGroup(out, fr, types, sequences,
 624             includeNonPositional);
 625
 626     if (includeComplement)
 627     {
 628       count += outputComplementFeatures(out, fr, sequences);
 629     }
 630
 631     return count > 0 ? out.toString() : "No Features Visible";
 632   }
 633
 634   /**
 635    * Outputs any visible complementary positional features, within feature group
 636    *
 637    * @param out
 638    * @param fr
 639    * @param sequences
 640    * @return
 641    */
 642   private int outputComplementFeatures(StringBuilder out,
 643           FeatureRenderer fr, SequenceI[] sequences)
 644   {
 645     AlignViewportI comp = fr.getViewport().getCodingComplement();
 646     FeatureRenderer fr2 = Desktop.getAlignFrameFor(comp)
 647             .getFeatureRenderer();
 648
 649     /*
 650      * build a map of {group, {seqName, List<SequenceFeature>}}
 651      */
 652     Map<String, Map<String, List<SequenceFeature>>> map = new TreeMap<>();
 653     int count = 0;
 654
 655     for (SequenceI seq : sequences)
 656     {
 657       /*
 658        * avoid duplication of features (e.g. peptide feature
 659        * at all 3 mapped codon positions)
 660        */
 661       List<SequenceFeature> found = new ArrayList<>();
 662       String seqName = seq.getName();
 663
 664       for (int pos = seq.getStart(); pos <= seq.getEnd(); pos++)
 665       {
 666         MappedFeatures mf = fr2.findComplementFeaturesAtResidue(seq, pos);
 667
 668         if (mf != null)
 669         {
 670           MapList mapping = mf.mapping.getMap();
 671           for (SequenceFeature sf : mf.features)
 672           {
 673             String group = sf.getFeatureGroup();
 674             if (group == null)
 675             {
 676               group = "";
 677             }
 678             if (!map.containsKey(group))
 679             {
 680               map.put(group, new LinkedHashMap<>());
 681             }
 682             Map<String, List<SequenceFeature>> groupFeatures = map
 683                     .get(group);
 684             if (!groupFeatures.containsKey(seqName))
 685             {
 686               groupFeatures.put(seqName, new ArrayList<>());
 687             }
 688             List<SequenceFeature> foundFeatures = groupFeatures
 689                     .get(seqName);
 690
 691             /*
 692              * make a virtual feature with local coordinates
 693              */
 694             if (!found.contains(sf))
 695             {
 696               found.add(sf);
 697               int begin = sf.getBegin();
 698               int end = sf.getEnd();
 699               int[] range = mf.mapping.getTo() == seq.getDatasetSequence()
 700                       ? mapping.locateInTo(begin, end)
 701                       : mapping.locateInFrom(begin, end);
 702               SequenceFeature sf2 = new SequenceFeature(sf, range[0],
 703                       range[1], group,
 704                       sf.getScore());
 705               foundFeatures.add(sf2);
 706               count++;
 707             }
 708           }
 709         }
 710       }
 711     }
 712
 713     /*
 714      * output features by group
 715      */
 716     for (Entry<String, Map<String, List<SequenceFeature>>> groupFeatures : map.entrySet())
 717     {
 718       out.append(newline);
 719       String group = groupFeatures.getKey();
 720       if (!"".equals(group))
 721       {
 722         out.append(STARTGROUP).append(TAB).append(group).append(newline);
 723       }
 724       Map<String, List<SequenceFeature>> seqFeaturesMap = groupFeatures
 725               .getValue();
 726       for (Entry<String, List<SequenceFeature>> seqFeatures : seqFeaturesMap
 727               .entrySet())
 728       {
 729         String sequenceName = seqFeatures.getKey();
 730         for (SequenceFeature sf : seqFeatures.getValue())
 731         {
 732           out.append(formatJalviewFeature(sequenceName, sf));
 733         }
 734       }
 735       if (!"".equals(group))
 736       {
 737         out.append(ENDGROUP).append(TAB).append(group).append(newline);
 738       }
 739     }
 740
 741     return count;
 742   }
 743
 744   /**
 745    * Outputs any feature filters defined for visible feature types, sandwiched by
 746    * STARTFILTERS and ENDFILTERS lines
 747    *
 748    * @param out
 749    * @param visible
 750    * @param featureFilters
 751    */
 752   void outputFeatureFilters(StringBuilder out,
 753           Map<String, FeatureColourI> visible,
 754           Map<String, FeatureMatcherSetI> featureFilters)
 755   {
 756     if (visible == null || featureFilters == null
 757             || featureFilters.isEmpty())
 758     {
 759       return;
 760     }
 761
 762     boolean first = true;
 763     for (String featureType : visible.keySet())
 764     {
 765       FeatureMatcherSetI filter = featureFilters.get(featureType);
 766       if (filter != null)
 767       {
 768         if (first)
 769         {
 770           first = false;
 771           out.append(newline).append(STARTFILTERS).append(newline);
 772         }
 773         out.append(featureType).append(TAB).append(filter.toStableString())
 774                 .append(newline);
 775       }
 776     }
 777     if (!first)
 778     {
 779       out.append(ENDFILTERS).append(newline);
 780     }
 781
 782   }
 783
 784   /**
 785    * Appends output of visible sequence features within feature groups to the
 786    * output buffer. Groups other than the null or empty group are sandwiched by
 787    * STARTGROUP and ENDGROUP lines. Answers the number of features written.
 788    *
 789    * @param out
 790    * @param fr
 791    * @param featureTypes
 792    * @param sequences
 793    * @param includeNonPositional
 794    * @return
 795    */
 796   private int outputFeaturesByGroup(StringBuilder out,
 797           FeatureRenderer fr, String[] featureTypes,
 798           SequenceI[] sequences, boolean includeNonPositional)
 799   {
 800     List<String> featureGroups = fr.getFeatureGroups();
 801
 802     /*
 803      * sort groups alphabetically, and ensure that features with a
 804      * null or empty group are output after those in named groups
 805      */
 806     List<String> sortedGroups = new ArrayList<>(featureGroups);
 807     sortedGroups.remove(null);
 808     sortedGroups.remove("");
 809     Collections.sort(sortedGroups);
 810     sortedGroups.add(null);
 811     sortedGroups.add("");
 812
 813     int count = 0;
 814     List<String> visibleGroups = fr.getDisplayedFeatureGroups();
 815
 816     /*
 817      * loop over all groups (may be visible or not);
 818      * non-positional features are output even if group is not visible
 819      */
 820     for (String group : sortedGroups)
 821     {
 822       boolean firstInGroup = true;
 823       boolean isNullGroup = group == null || "".equals(group);
 824
 825       for (int i = 0; i < sequences.length; i++)
 826       {
 827         String sequenceName = sequences[i].getName();
 828         List<SequenceFeature> features = new ArrayList<>();
 829
 830         /*
 831          * get any non-positional features in this group, if wanted
 832          * (for any feature type, whether visible or not)
 833          */
 834         if (includeNonPositional)
 835         {
 836           features.addAll(sequences[i].getFeatures()
 837                   .getFeaturesForGroup(false, group));
 838         }
 839
 840         /*
 841          * add positional features for visible feature types, but
 842          * (for named groups) only if feature group is visible
 843          */
 844         if (featureTypes.length > 0
 845                 && (isNullGroup || visibleGroups.contains(group)))
 846         {
 847           features.addAll(sequences[i].getFeatures().getFeaturesForGroup(
 848                   true, group, featureTypes));
 849         }
 850
 851         for (SequenceFeature sf : features)
 852         {
 853           if (sf.isNonPositional() || fr.isVisible(sf))
 854           {
 855             count++;
 856             if (firstInGroup)
 857             {
 858               out.append(newline);
 859               if (!isNullGroup)
 860               {
 861                 out.append(STARTGROUP).append(TAB).append(group)
 862                         .append(newline);
 863               }
 864             }
 865             firstInGroup = false;
 866             out.append(formatJalviewFeature(sequenceName, sf));
 867           }
 868         }
 869       }
 870
 871       if (!isNullGroup && !firstInGroup)
 872       {
 873         out.append(ENDGROUP).append(TAB).append(group).append(newline);
 874       }
 875     }
 876     return count;
 877   }
 878
 879   /**
 880    * @param out
 881    * @param sequenceName
 882    * @param sequenceFeature
 883    */
 884   protected String formatJalviewFeature(
 885           String sequenceName, SequenceFeature sequenceFeature)
 886   {
 887     StringBuilder out = new StringBuilder(64);
 888     if (sequenceFeature.description == null
 889             || sequenceFeature.description.equals(""))
 890     {
 891       out.append(sequenceFeature.type).append(TAB);
 892     }
 893     else
 894     {
 895       if (sequenceFeature.links != null
 896               && sequenceFeature.getDescription().indexOf("<html>") == -1)
 897       {
 898         out.append("<html>");
 899       }
 900
 901       out.append(sequenceFeature.description);
 902       if (sequenceFeature.links != null)
 903       {
 904         for (int l = 0; l < sequenceFeature.links.size(); l++)
 905         {
 906           String label = sequenceFeature.links.elementAt(l);
 907           String href = label.substring(label.indexOf("|") + 1);
 908           label = label.substring(0, label.indexOf("|"));
 909
 910           if (sequenceFeature.description.indexOf(href) == -1)
 911           {
 912             out.append(" <a href=\"" + href + "\">" + label + "</a>");
 913           }
 914         }
 915
 916         if (sequenceFeature.getDescription().indexOf("</html>") == -1)
 917         {
 918           out.append("</html>");
 919         }
 920       }
 921
 922       out.append(TAB);
 923     }
 924     out.append(sequenceName);
 925     out.append("\t-1\t");
 926     out.append(sequenceFeature.begin);
 927     out.append(TAB);
 928     out.append(sequenceFeature.end);
 929     out.append(TAB);
 930     out.append(sequenceFeature.type);
 931     if (!Float.isNaN(sequenceFeature.score))
 932     {
 933       out.append(TAB);
 934       out.append(sequenceFeature.score);
 935     }
 936     out.append(newline);
 937
 938     return out.toString();
 939   }
 940
 941   /**
 942    * Parse method that is called when a GFF file is dragged to the desktop
 943    */
 944   @Override
 945   public void parse()
 946   {
 947     AlignViewportI av = getViewport();
 948     if (av != null)
 949     {
 950       if (av.getAlignment() != null)
 951       {
 952         dataset = av.getAlignment().getDataset();
 953       }
 954       if (dataset == null)
 955       {
 956         // working in the applet context ?
 957         dataset = av.getAlignment();
 958       }
 959     }
 960     else
 961     {
 962       dataset = new Alignment(new SequenceI[] {});
 963     }
 964
 965     Map<String, FeatureColourI> featureColours = new HashMap<>();
 966     boolean parseResult = parse(dataset, featureColours, false, true);
 967     if (!parseResult)
 968     {
 969       // pass error up somehow
 970     }
 971     if (av != null)
 972     {
 973       // update viewport with the dataset data ?
 974     }
 975     else
 976     {
 977       setSeqs(dataset.getSequencesArray());
 978     }
 979   }
 980
 981   /**
 982    * Implementation of unused abstract method
 983    *
 984    * @return error message
 985    */
 986   @Override
 987   public String print(SequenceI[] sqs, boolean jvsuffix)
 988   {
 989     System.out.println("Use printGffFormat() or printJalviewFormat()");
 990     return null;
 991   }
 992
 993   /**
 994    * Returns features output in GFF2 format
 995    *
 996    * @param sequences
 997    *                                       the sequences whose features are to be
 998    *                                       output
 999    * @param visible
1000    *                                       a map whose keys are the type names of
1001    *                                       visible features
1002    * @param visibleFeatureGroups
1003    * @param includeNonPositionalFeatures
1004    * @param includeComplement
1005    * @return
1006    */
1007   public String printGffFormat(SequenceI[] sequences,
1008           FeatureRenderer fr, boolean includeNonPositionalFeatures,
1009           boolean includeComplement)
1010   {
1011     Map<String, FeatureColourI> visibleColours = fr.getDisplayedFeatureCols();
1012
1013     StringBuilder out = new StringBuilder(256);
1014
1015     out.append(String.format("%s %d\n", GFF_VERSION, gffVersion == 0 ? 2 : gffVersion));
1016
1017     if (!includeNonPositionalFeatures
1018             && (visibleColours == null || visibleColours.isEmpty()))
1019     {
1020       return out.toString();
1021     }
1022
1023     String[] types = visibleColours == null ? new String[0]
1024             : visibleColours.keySet()
1025                     .toArray(new String[visibleColours.keySet().size()]);
1026
1027     for (SequenceI seq : sequences)
1028     {
1029       List<SequenceFeature> features = new ArrayList<>();
1030       if (includeNonPositionalFeatures)
1031       {
1032         features.addAll(seq.getFeatures().getNonPositionalFeatures());
1033       }
1034       if (visibleColours != null && !visibleColours.isEmpty())
1035       {
1036         features.addAll(seq.getFeatures().getPositionalFeatures(types));
1037       }
1038
1039       for (SequenceFeature sf : features)
1040       {
1041         if (!sf.isNonPositional() && !fr.isVisible(sf))
1042         {
1043           /*
1044            * feature hidden by group visibility, colour threshold,
1045            * or feature filter condition
1046            */
1047           continue;
1048         }
1049
1050         String source = sf.featureGroup;
1051         if (source == null)
1052         {
1053           source = sf.getDescription();
1054         }
1055
1056         out.append(seq.getName());
1057         out.append(TAB);
1058         out.append(source);
1059         out.append(TAB);
1060         out.append(sf.type);
1061         out.append(TAB);
1062         out.append(sf.begin);
1063         out.append(TAB);
1064         out.append(sf.end);
1065         out.append(TAB);
1066         out.append(sf.score);
1067         out.append(TAB);
1068
1069         int strand = sf.getStrand();
1070         out.append(strand == 1 ? "+" : (strand == -1 ? "-" : "."));
1071         out.append(TAB);
1072
1073         String phase = sf.getPhase();
1074         out.append(phase == null ? "." : phase);
1075
1076         // miscellaneous key-values (GFF column 9)
1077         String attributes = sf.getAttributes();
1078         if (attributes != null)
1079         {
1080           out.append(TAB).append(attributes);
1081         }
1082
1083         out.append(newline);
1084       }
1085     }
1086
1087     return out.toString();
1088   }
1089
1090   /**
1091    * Returns a mapping given list of one or more Align descriptors (exonerate
1092    * format)
1093    *
1094    * @param alignedRegions
1095    *          a list of "Align fromStart toStart fromCount"
1096    * @param mapIsFromCdna
1097    *          if true, 'from' is dna, else 'from' is protein
1098    * @param strand
1099    *          either 1 (forward) or -1 (reverse)
1100    * @return
1101    * @throws IOException
1102    */
1103   protected MapList constructCodonMappingFromAlign(
1104           List<String> alignedRegions, boolean mapIsFromCdna, int strand)
1105           throws IOException
1106   {
1107     if (strand == 0)
1108     {
1109       throw new IOException(
1110               "Invalid strand for a codon mapping (cannot be 0)");
1111     }
1112     int regions = alignedRegions.size();
1113     // arrays to hold [start, end] for each aligned region
1114     int[] fromRanges = new int[regions * 2]; // from dna
1115     int[] toRanges = new int[regions * 2]; // to protein
1116     int fromRangesIndex = 0;
1117     int toRangesIndex = 0;
1118
1119     for (String range : alignedRegions)
1120     {
1121       /*
1122        * Align mapFromStart mapToStart mapFromCount
1123        * e.g. if mapIsFromCdna
1124        *     Align 11270 143 120
1125        * means:
1126        *     120 bases from pos 11270 align to pos 143 in peptide
1127        * if !mapIsFromCdna this would instead be
1128        *     Align 143 11270 40
1129        */
1130       String[] tokens = range.split(" ");
1131       if (tokens.length != 3)
1132       {
1133         throw new IOException("Wrong number of fields for Align");
1134       }
1135       int fromStart = 0;
1136       int toStart = 0;
1137       int fromCount = 0;
1138       try
1139       {
1140         fromStart = Integer.parseInt(tokens[0]);
1141         toStart = Integer.parseInt(tokens[1]);
1142         fromCount = Integer.parseInt(tokens[2]);
1143       } catch (NumberFormatException nfe)
1144       {
1145         throw new IOException(
1146                 "Invalid number in Align field: " + nfe.getMessage());
1147       }
1148
1149       /*
1150        * Jalview always models from dna to protein, so adjust values if the
1151        * GFF mapping is from protein to dna
1152        */
1153       if (!mapIsFromCdna)
1154       {
1155         fromCount *= 3;
1156         int temp = fromStart;
1157         fromStart = toStart;
1158         toStart = temp;
1159       }
1160       fromRanges[fromRangesIndex++] = fromStart;
1161       fromRanges[fromRangesIndex++] = fromStart + strand * (fromCount - 1);
1162
1163       /*
1164        * If a codon has an intron gap, there will be contiguous 'toRanges';
1165        * this is handled for us by the MapList constructor.
1166        * (It is not clear that exonerate ever generates this case)
1167        */
1168       toRanges[toRangesIndex++] = toStart;
1169       toRanges[toRangesIndex++] = toStart + (fromCount - 1) / 3;
1170     }
1171
1172     return new MapList(fromRanges, toRanges, 3, 1);
1173   }
1174
1175   /**
1176    * Parse a GFF format feature. This may include creating a 'dummy' sequence to
1177    * hold the feature, or for its mapped sequence, or both, to be resolved
1178    * either later in the GFF file (##FASTA section), or when the user loads
1179    * additional sequences.
1180    *
1181    * @param gffColumns
1182    * @param alignment
1183    * @param relaxedIdMatching
1184    * @param newseqs
1185    * @return
1186    */
1187   protected SequenceI parseGff(String[] gffColumns, AlignmentI alignment,
1188           boolean relaxedIdMatching, List<SequenceI> newseqs)
1189   {
1190     /*
1191      * GFF: seqid source type start end score strand phase [attributes]
1192      */
1193     if (gffColumns.length < 5)
1194     {
1195       System.err.println("Ignoring GFF feature line with too few columns ("
1196               + gffColumns.length + ")");
1197       return null;
1198     }
1199
1200     /*
1201      * locate referenced sequence in alignment _or_
1202      * as a forward or external reference (SequenceDummy)
1203      */
1204     String seqId = gffColumns[0];
1205     SequenceI seq = findSequence(seqId, alignment, newseqs,
1206             relaxedIdMatching);
1207
1208     SequenceFeature sf = null;
1209     GffHelperI helper = GffHelperFactory.getHelper(gffColumns);
1210     if (helper != null)
1211     {
1212       try
1213       {
1214         sf = helper.processGff(seq, gffColumns, alignment, newseqs,
1215                 relaxedIdMatching);
1216         if (sf != null)
1217         {
1218           seq.addSequenceFeature(sf);
1219           while ((seq = alignment.findName(seq, seqId, true)) != null)
1220           {
1221             seq.addSequenceFeature(new SequenceFeature(sf));
1222           }
1223         }
1224       } catch (IOException e)
1225       {
1226         System.err.println("GFF parsing failed with: " + e.getMessage());
1227         return null;
1228       }
1229     }
1230
1231     return seq;
1232   }
1233
1234   /**
1235    * Process the 'column 9' data of the GFF file. This is less formally defined,
1236    * and its interpretation will vary depending on the tool that has generated
1237    * it.
1238    *
1239    * @param attributes
1240    * @param sf
1241    */
1242   protected void processGffColumnNine(String attributes, SequenceFeature sf)
1243   {
1244     sf.setAttributes(attributes);
1245
1246     /*
1247      * Parse attributes in column 9 and add them to the sequence feature's
1248      * 'otherData' table; use Note as a best proxy for description
1249      */
1250     char nameValueSeparator = gffVersion == 3 ? '=' : ' ';
1251     // TODO check we don't break GFF2 values which include commas here
1252     Map<String, List<String>> nameValues = GffHelperBase
1253             .parseNameValuePairs(attributes, ";", nameValueSeparator, ",");
1254     for (Entry<String, List<String>> attr : nameValues.entrySet())
1255     {
1256       String values = StringUtils.listToDelimitedString(attr.getValue(),
1257               "; ");
1258       sf.setValue(attr.getKey(), values);
1259       if (NOTE.equals(attr.getKey()))
1260       {
1261         sf.setDescription(values);
1262       }
1263     }
1264   }
1265
1266   /**
1267    * After encountering ##fasta in a GFF3 file, process the remainder of the
1268    * file as FAST sequence data. Any placeholder sequences created during
1269    * feature parsing are updated with the actual sequences.
1270    *
1271    * @param align
1272    * @param newseqs
1273    * @throws IOException
1274    */
1275   protected void processAsFasta(AlignmentI align, List<SequenceI> newseqs)
1276           throws IOException
1277   {
1278     try
1279     {
1280       mark();
1281     } catch (IOException q)
1282     {
1283     }
1284     FastaFile parser = new FastaFile(this);
1285     List<SequenceI> includedseqs = parser.getSeqs();
1286
1287     SequenceIdMatcher smatcher = new SequenceIdMatcher(newseqs);
1288
1289     /*
1290      * iterate over includedseqs, and replacing matching ones with newseqs
1291      * sequences. Generic iterator not used here because we modify
1292      * includedseqs as we go
1293      */
1294     for (int p = 0, pSize = includedseqs.size(); p < pSize; p++)
1295     {
1296       // search for any dummy seqs that this sequence can be used to update
1297       SequenceI includedSeq = includedseqs.get(p);
1298       SequenceI dummyseq = smatcher.findIdMatch(includedSeq);
1299       if (dummyseq != null && dummyseq instanceof SequenceDummy)
1300       {
1301         // probably have the pattern wrong
1302         // idea is that a flyweight proxy for a sequence ID can be created for
1303         // 1. stable reference creation
1304         // 2. addition of annotation
1305         // 3. future replacement by a real sequence
1306         // current pattern is to create SequenceDummy objects - a convenience
1307         // constructor for a Sequence.
1308         // problem is that when promoted to a real sequence, all references
1309         // need to be updated somehow. We avoid that by keeping the same object.
1310         ((SequenceDummy) dummyseq).become(includedSeq);
1311         dummyseq.createDatasetSequence();
1312
1313         /*
1314          * Update mappings so they are now to the dataset sequence
1315          */
1316         for (AlignedCodonFrame mapping : align.getCodonFrames())
1317         {
1318           mapping.updateToDataset(dummyseq);
1319         }
1320
1321         /*
1322          * replace parsed sequence with the realised forward reference
1323          */
1324         includedseqs.set(p, dummyseq);
1325
1326         /*
1327          * and remove from the newseqs list
1328          */
1329         newseqs.remove(dummyseq);
1330       }
1331     }
1332
1333     /*
1334      * finally add sequences to the dataset
1335      */
1336     for (SequenceI seq : includedseqs)
1337     {
1338       // experimental: mapping-based 'alignment' to query sequence
1339       AlignmentUtils.alignSequenceAs(seq, align,
1340               String.valueOf(align.getGapCharacter()), false, true);
1341
1342       // rename sequences if GFF handler requested this
1343       // TODO a more elegant way e.g. gffHelper.postProcess(newseqs) ?
1344       List<SequenceFeature> sfs = seq.getFeatures().getPositionalFeatures();
1345       if (!sfs.isEmpty())
1346       {
1347         String newName = (String) sfs.get(0).getValue(
1348                 GffHelperI.RENAME_TOKEN);
1349         if (newName != null)
1350         {
1351           seq.setName(newName);
1352         }
1353       }
1354       align.addSequence(seq);
1355     }
1356   }
1357
1358   /**
1359    * Process a ## directive
1360    *
1361    * @param line
1362    * @param gffProps
1363    * @param align
1364    * @param newseqs
1365    * @throws IOException
1366    */
1367   protected void processGffPragma(String line, Map<String, String> gffProps,
1368           AlignmentI align, List<SequenceI> newseqs) throws IOException
1369   {
1370     line = line.trim();
1371     if ("###".equals(line))
1372     {
1373       // close off any open 'forward references'
1374       return;
1375     }
1376
1377     String[] tokens = line.substring(2).split(" ");
1378     String pragma = tokens[0];
1379     String value = tokens.length == 1 ? null : tokens[1];
1380
1381     if ("gff-version".equalsIgnoreCase(pragma))
1382     {
1383       if (value != null)
1384       {
1385         try
1386         {
1387           // value may be e.g. "3.1.2"
1388           gffVersion = Integer.parseInt(value.split("\\.")[0]);
1389         } catch (NumberFormatException e)
1390         {
1391           // ignore
1392         }
1393       }
1394     }
1395     else if ("sequence-region".equalsIgnoreCase(pragma))
1396     {
1397       // could capture <seqid start end> if wanted here
1398     }
1399     else if ("feature-ontology".equalsIgnoreCase(pragma))
1400     {
1401       // should resolve against the specified feature ontology URI
1402     }
1403     else if ("attribute-ontology".equalsIgnoreCase(pragma))
1404     {
1405       // URI of attribute ontology - not currently used in GFF3
1406     }
1407     else if ("source-ontology".equalsIgnoreCase(pragma))
1408     {
1409       // URI of source ontology - not currently used in GFF3
1410     }
1411     else if ("species-build".equalsIgnoreCase(pragma))
1412     {
1413       // save URI of specific NCBI taxon version of annotations
1414       gffProps.put("species-build", value);
1415     }
1416     else if ("fasta".equalsIgnoreCase(pragma))
1417     {
1418       // process the rest of the file as a fasta file and replace any dummy
1419       // sequence IDs
1420       processAsFasta(align, newseqs);
1421     }
1422     else
1423     {
1424       System.err.println("Ignoring unknown pragma: " + line);
1425     }
1426   }
1427 }