src/jalview/io/FeaturesFile.java

   1 /*
   2  * Jalview - A Sequence Alignment Editor and Viewer ($$Version-Rel$$)
   3  * Copyright (C) $$Year-Rel$$ The Jalview Authors
   4  *
   5  * This file is part of Jalview.
   6  *
   7  * Jalview is free software: you can redistribute it and/or
   8  * modify it under the terms of the GNU General Public License
   9  * as published by the Free Software Foundation, either version 3
  10  * of the License, or (at your option) any later version.
  11  *
  12  * Jalview is distributed in the hope that it will be useful, but
  13  * WITHOUT ANY WARRANTY; without even the implied warranty
  14  * of MERCHANTABILITY or FITNESS FOR A PARTICULAR
  15  * PURPOSE.  See the GNU General Public License for more details.
  16  *
  17  * You should have received a copy of the GNU General Public License
  18  * along with Jalview.  If not, see <http://www.gnu.org/licenses/>.
  19  * The Jalview Authors are detailed in the 'AUTHORS' file.
  20  */
  21 package jalview.io;
  22
  23 import jalview.analysis.SequenceIdMatcher;
  24 import jalview.datamodel.AlignedCodonFrame;
  25 import jalview.datamodel.AlignmentI;
  26 import jalview.datamodel.SequenceDummy;
  27 import jalview.datamodel.SequenceFeature;
  28 import jalview.datamodel.SequenceI;
  29 import jalview.schemes.AnnotationColourGradient;
  30 import jalview.schemes.GraduatedColor;
  31 import jalview.schemes.UserColourScheme;
  32 import jalview.util.Format;
  33 import jalview.util.MapList;
  34
  35 import java.io.IOException;
  36 import java.util.ArrayList;
  37 import java.util.Arrays;
  38 import java.util.HashMap;
  39 import java.util.Hashtable;
  40 import java.util.Iterator;
  41 import java.util.List;
  42 import java.util.Map;
  43 import java.util.StringTokenizer;
  44 import java.util.Vector;
  45
  46 /**
 * Parse and create Jalview Features files. Detects GFF format features files
 * and parses them. Does not implement standard print() - call specific
 * printFeatures or printGFF. Uses AlignmentI.findSequence(String id) to find
 * the sequence object for the features annotation - this normally works on an
 * exact match.
  51  *
  52  * @author AMW
  53  * @version $Revision$
  54  */
  55 public class FeaturesFile extends AlignFile
  56 {
  /**
   * work around for GFF interpretation bug where source string becomes
   * description rather than a group. When true, a GFF source column that
   * contains no spaces is also used as the feature's group.
   */
  private boolean doGffSource = true;

  /**
   * GFF version number; assigned by processGffPragma when it handles the
   * gff_version pragma.
   */
  private int gffversion;
  64
  /**
   * Creates a new FeaturesFile object that is not bound to any data source.
   */
  public FeaturesFile()
  {
  }
  71
  /**
   * Creates a new FeaturesFile object, handing both arguments unchanged to the
   * AlignFile constructor.
   * 
   * @param inFile
   *          passed to the superclass constructor (presumably the location or
   *          content of the features data - confirm against AlignFile)
   * @param type
   *          passed to the superclass constructor (presumably the way inFile
   *          should be accessed - confirm against AlignFile)
   * 
   * @throws IOException
   *           propagated from the superclass constructor
   */
  public FeaturesFile(String inFile, String type) throws IOException
  {
    super(inFile, type);
  }
  87
  /**
   * Creates a new FeaturesFile object that reads from an existing FileParse
   * source, which is handed unchanged to the AlignFile constructor.
   * 
   * @param source
   *          the data source to read features from
   * @throws IOException
   *           propagated from the superclass constructor
   */
  public FeaturesFile(FileParse source) throws IOException
  {
    super(source);
  }
  92
  93   /**
  94    * Parse GFF or sequence features file using case-independent matching,
  95    * discarding URLs
  96    *
  97    * @param align
  98    *          - alignment/dataset containing sequences that are to be annotated
  99    * @param colours
 100    *          - hashtable to store feature colour definitions
 101    * @param removeHTML
 102    *          - process html strings into plain text
 103    * @return true if features were added
 104    */
 105   public boolean parse(AlignmentI align, Hashtable colours,
 106           boolean removeHTML)
 107   {
 108     return parse(align, colours, null, removeHTML, false);
 109   }
 110
 111   /**
 112    * Parse GFF or sequence features file optionally using case-independent
 113    * matching, discarding URLs
 114    *
 115    * @param align
 116    *          - alignment/dataset containing sequences that are to be annotated
 117    * @param colours
 118    *          - hashtable to store feature colour definitions
 119    * @param removeHTML
 120    *          - process html strings into plain text
 121    * @param relaxedIdmatching
 122    *          - when true, ID matches to compound sequence IDs are allowed
 123    * @return true if features were added
 124    */
 125   public boolean parse(AlignmentI align, Map colours, boolean removeHTML,
 126           boolean relaxedIdMatching)
 127   {
 128     return parse(align, colours, null, removeHTML, relaxedIdMatching);
 129   }
 130
 131   /**
 132    * Parse GFF or sequence features file optionally using case-independent
 133    * matching
 134    *
 135    * @param align
 136    *          - alignment/dataset containing sequences that are to be annotated
 137    * @param colours
 138    *          - hashtable to store feature colour definitions
 139    * @param featureLink
 140    *          - hashtable to store associated URLs
 141    * @param removeHTML
 142    *          - process html strings into plain text
 143    * @return true if features were added
 144    */
 145   public boolean parse(AlignmentI align, Map colours, Map featureLink,
 146           boolean removeHTML)
 147   {
 148     return parse(align, colours, featureLink, removeHTML, false);
 149   }
 150
 151   /**
 152    * Parse GFF or sequence features file
 153    *
 154    * @param align
 155    *          - alignment/dataset containing sequences that are to be annotated
 156    * @param colours
 157    *          - hashtable to store feature colour definitions
 158    * @param featureLink
 159    *          - hashtable to store associated URLs
 160    * @param removeHTML
 161    *          - process html strings into plain text
 162    * @param relaxedIdmatching
 163    *          - when true, ID matches to compound sequence IDs are allowed
 164    * @return true if features were added
 165    */
 166   public boolean parse(AlignmentI align, Map colours, Map featureLink,
 167           boolean removeHTML, boolean relaxedIdmatching)
 168   {
 169
 170     String line = null;
 171     try
 172     {
 173       SequenceI seq = null;
 174       /**
 175        * keep track of any sequences we try to create from the data if it is a GFF3 file
 176        */
 177       ArrayList<SequenceI> newseqs = new ArrayList<SequenceI>();
 178       String type, desc, token = null;
 179
 180       int index, start, end;
 181       float score;
 182       StringTokenizer st;
 183       SequenceFeature sf;
 184       String featureGroup = null, groupLink = null;
 185       Map typeLink = new Hashtable();
 186       /**
 187        * when true, assume GFF style features rather than Jalview style.
 188        */
 189       boolean GFFFile = true;
 190       Map<String, String> gffProps = new HashMap<String, String>();
 191       while ((line = nextLine()) != null)
 192       {
 193         // skip comments/process pragmas
 194         if (line.startsWith("#"))
 195         {
 196           if (line.startsWith("##"))
 197           {
 198             // possibly GFF2/3 version and metadata header
 199             processGffPragma(line, gffProps, align, newseqs);
 200             line = "";
 201           }
 202           continue;
 203         }
 204
 205         st = new StringTokenizer(line, "\t");
 206         if (st.countTokens() == 1)
 207         {
 208           if (line.trim().equalsIgnoreCase("GFF"))
 209           {
 210             // Start parsing file as if it might be GFF again.
 211             GFFFile = true;
 212             continue;
 213           }
 214         }
 215         if (st.countTokens() > 1 && st.countTokens() < 4)
 216         {
 217           GFFFile = false;
 218           type = st.nextToken();
 219           if (type.equalsIgnoreCase("startgroup"))
 220           {
 221             featureGroup = st.nextToken();
 222             if (st.hasMoreElements())
 223             {
 224               groupLink = st.nextToken();
 225               featureLink.put(featureGroup, groupLink);
 226             }
 227           }
 228           else if (type.equalsIgnoreCase("endgroup"))
 229           {
 230             // We should check whether this is the current group,
 231             // but at present theres no way of showing more than 1 group
 232             st.nextToken();
 233             featureGroup = null;
 234             groupLink = null;
 235           }
 236           else
 237           {
 238             Object colour = null;
 239             String colscheme = st.nextToken();
 240             if (colscheme.indexOf("|") > -1
 241                     || colscheme.trim().equalsIgnoreCase("label"))
 242             {
 243               // Parse '|' separated graduated colourscheme fields:
 244               // [label|][mincolour|maxcolour|[absolute|]minvalue|maxvalue|thresholdtype|thresholdvalue]
 245               // can either provide 'label' only, first is optional, next two
 246               // colors are required (but may be
 247               // left blank), next is optional, nxt two min/max are required.
 248               // first is either 'label'
 249               // first/second and third are both hexadecimal or word equivalent
 250               // colour.
 251               // next two are values parsed as floats.
 252               // fifth is either 'above','below', or 'none'.
 253               // sixth is a float value and only required when fifth is either
 254               // 'above' or 'below'.
 255               StringTokenizer gcol = new StringTokenizer(colscheme, "|",
 256                       true);
 257               // set defaults
 258               int threshtype = AnnotationColourGradient.NO_THRESHOLD;
 259               float min = Float.MIN_VALUE, max = Float.MAX_VALUE, threshval = Float.NaN;
 260               boolean labelCol = false;
 261               // Parse spec line
 262               String mincol = gcol.nextToken();
 263               if (mincol == "|")
 264               {
 265                 System.err
 266                         .println("Expected either 'label' or a colour specification in the line: "
 267                                 + line);
 268                 continue;
 269               }
 270               String maxcol = null;
 271               if (mincol.toLowerCase().indexOf("label") == 0)
 272               {
 273                 labelCol = true;
 274                 mincol = (gcol.hasMoreTokens() ? gcol.nextToken() : null); // skip
 275                                                                            // '|'
 276                 mincol = (gcol.hasMoreTokens() ? gcol.nextToken() : null);
 277               }
 278               String abso = null, minval, maxval;
 279               if (mincol != null)
 280               {
 281                 // at least four more tokens
 282                 if (mincol.equals("|"))
 283                 {
 284                   mincol = "";
 285                 }
 286                 else
 287                 {
 288                   gcol.nextToken(); // skip next '|'
 289                 }
 290                 // continue parsing rest of line
 291                 maxcol = gcol.nextToken();
 292                 if (maxcol.equals("|"))
 293                 {
 294                   maxcol = "";
 295                 }
 296                 else
 297                 {
 298                   gcol.nextToken(); // skip next '|'
 299                 }
 300                 abso = gcol.nextToken();
 301                 gcol.nextToken(); // skip next '|'
 302                 if (abso.toLowerCase().indexOf("abso") != 0)
 303                 {
 304                   minval = abso;
 305                   abso = null;
 306                 }
 307                 else
 308                 {
 309                   minval = gcol.nextToken();
 310                   gcol.nextToken(); // skip next '|'
 311                 }
 312                 maxval = gcol.nextToken();
 313                 if (gcol.hasMoreTokens())
 314                 {
 315                   gcol.nextToken(); // skip next '|'
 316                 }
 317                 try
 318                 {
 319                   if (minval.length() > 0)
 320                   {
 321                     min = new Float(minval).floatValue();
 322                   }
 323                 } catch (Exception e)
 324                 {
 325                   System.err
 326                           .println("Couldn't parse the minimum value for graduated colour for type ("
 327                                   + colscheme
 328                                   + ") - did you misspell 'auto' for the optional automatic colour switch ?");
 329                   e.printStackTrace();
 330                 }
 331                 try
 332                 {
 333                   if (maxval.length() > 0)
 334                   {
 335                     max = new Float(maxval).floatValue();
 336                   }
 337                 } catch (Exception e)
 338                 {
 339                   System.err
 340                           .println("Couldn't parse the maximum value for graduated colour for type ("
 341                                   + colscheme + ")");
 342                   e.printStackTrace();
 343                 }
 344               }
 345               else
 346               {
 347                 // add in some dummy min/max colours for the label-only
 348                 // colourscheme.
 349                 mincol = "FFFFFF";
 350                 maxcol = "000000";
 351               }
 352               try
 353               {
 354                 colour = new jalview.schemes.GraduatedColor(
 355                         new UserColourScheme(mincol).findColour('A'),
 356                         new UserColourScheme(maxcol).findColour('A'), min,
 357                         max);
 358               } catch (Exception e)
 359               {
 360                 System.err
 361                         .println("Couldn't parse the graduated colour scheme ("
 362                                 + colscheme + ")");
 363                 e.printStackTrace();
 364               }
 365               if (colour != null)
 366               {
 367                 ((jalview.schemes.GraduatedColor) colour)
 368                         .setColourByLabel(labelCol);
 369                 ((jalview.schemes.GraduatedColor) colour)
 370                         .setAutoScaled(abso == null);
 371                 // add in any additional parameters
 372                 String ttype = null, tval = null;
 373                 if (gcol.hasMoreTokens())
 374                 {
 375                   // threshold type and possibly a threshold value
 376                   ttype = gcol.nextToken();
 377                   if (ttype.toLowerCase().startsWith("below"))
 378                   {
 379                     ((jalview.schemes.GraduatedColor) colour)
 380                             .setThreshType(AnnotationColourGradient.BELOW_THRESHOLD);
 381                   }
 382                   else if (ttype.toLowerCase().startsWith("above"))
 383                   {
 384                     ((jalview.schemes.GraduatedColor) colour)
 385                             .setThreshType(AnnotationColourGradient.ABOVE_THRESHOLD);
 386                   }
 387                   else
 388                   {
 389                     ((jalview.schemes.GraduatedColor) colour)
 390                             .setThreshType(AnnotationColourGradient.NO_THRESHOLD);
 391                     if (!ttype.toLowerCase().startsWith("no"))
 392                     {
 393                       System.err
 394                               .println("Ignoring unrecognised threshold type : "
 395                                       + ttype);
 396                     }
 397                   }
 398                 }
 399                 if (((GraduatedColor) colour).getThreshType() != AnnotationColourGradient.NO_THRESHOLD)
 400                 {
 401                   try
 402                   {
 403                     gcol.nextToken();
 404                     tval = gcol.nextToken();
 405                     ((jalview.schemes.GraduatedColor) colour)
 406                             .setThresh(new Float(tval).floatValue());
 407                   } catch (Exception e)
 408                   {
 409                     System.err
 410                             .println("Couldn't parse threshold value as a float: ("
 411                                     + tval + ")");
 412                     e.printStackTrace();
 413                   }
 414                 }
 415                 // parse the thresh-is-min token ?
 416                 if (gcol.hasMoreTokens())
 417                 {
 418                   System.err
 419                           .println("Ignoring additional tokens in parameters in graduated colour specification\n");
 420                   while (gcol.hasMoreTokens())
 421                   {
 422                     System.err.println("|" + gcol.nextToken());
 423                   }
 424                   System.err.println("\n");
 425                 }
 426               }
 427             }
 428             else
 429             {
 430               UserColourScheme ucs = new UserColourScheme(colscheme);
 431               colour = ucs.findColour('A');
 432             }
 433             if (colour != null)
 434             {
 435               colours.put(type, colour);
 436             }
 437             if (st.hasMoreElements())
 438             {
 439               String link = st.nextToken();
 440               typeLink.put(type, link);
 441               if (featureLink == null)
 442               {
 443                 featureLink = new Hashtable();
 444               }
 445               featureLink.put(type, link);
 446             }
 447           }
 448           continue;
 449         }
 450         String seqId = "";
 451         while (st.hasMoreElements())
 452         {
 453
 454           if (GFFFile)
 455           {
 456             // Still possible this is an old Jalview file,
 457             // which does not have type colours at the beginning
 458             seqId = token = st.nextToken();
 459             seq = findName(align, seqId, relaxedIdmatching, newseqs);
 460             if (seq != null)
 461             {
 462               desc = st.nextToken();
 463               String group = null;
 464               if (doGffSource && desc.indexOf(' ') == -1)
 465               {
 466                 // could also be a source term rather than description line
 467                 group = new String(desc);
 468               }
 469               type = st.nextToken();
 470               try
 471               {
 472                 String stt = st.nextToken();
 473                 if (stt.length() == 0 || stt.equals("-"))
 474                 {
 475                   start = 0;
 476                 }
 477                 else
 478                 {
 479                   start = Integer.parseInt(stt);
 480                 }
 481               } catch (NumberFormatException ex)
 482               {
 483                 start = 0;
 484               }
 485               try
 486               {
 487                 String stt = st.nextToken();
 488                 if (stt.length() == 0 || stt.equals("-"))
 489                 {
 490                   end = 0;
 491                 }
 492                 else
 493                 {
 494                   end = Integer.parseInt(stt);
 495                 }
 496               } catch (NumberFormatException ex)
 497               {
 498                 end = 0;
 499               }
 500               // TODO: decide if non positional feature assertion for input data
 501               // where end==0 is generally valid
 502               if (end == 0)
 503               {
 504                 // treat as non-positional feature, regardless.
 505                 start = 0;
 506               }
 507               try
 508               {
 509                 score = new Float(st.nextToken()).floatValue();
 510               } catch (NumberFormatException ex)
 511               {
 512                 score = 0;
 513               }
 514
 515               sf = new SequenceFeature(type, desc, start, end, score, group);
 516
 517               try
 518               {
 519                 sf.setValue("STRAND", st.nextToken());
 520                 sf.setValue("FRAME", st.nextToken());
 521               } catch (Exception ex)
 522               {
 523               }
 524
 525               if (st.hasMoreTokens())
 526               {
 527                 StringBuffer attributes = new StringBuffer();
 528                 boolean sep = false;
 529                 while (st.hasMoreTokens())
 530                 {
 531                   attributes.append((sep ? "\t" : "") + st.nextElement());
 532                   sep = true;
 533                 }
 534                 // TODO validate and split GFF2 attributes field ? parse out
 535                 // ([A-Za-z][A-Za-z0-9_]*) <value> ; and add as
 536                 // sf.setValue(attrib, val);
 537                 sf.setValue("ATTRIBUTES", attributes.toString());
 538               }
 539
 540               if (processOrAddSeqFeature(align, newseqs, seq, sf, GFFFile,
 541                       relaxedIdmatching))
 542               {
 543                 // check whether we should add the sequence feature to any other
 544                 // sequences in the alignment with the same or similar
 545                 while ((seq = align.findName(seq, seqId, true)) != null)
 546                 {
 547                   seq.addSequenceFeature(new SequenceFeature(sf));
 548                 }
 549               }
 550               break;
 551             }
 552           }
 553
 554           if (GFFFile && seq == null)
 555           {
 556             desc = token;
 557           }
 558           else
 559           {
 560             desc = st.nextToken();
 561           }
 562           if (!st.hasMoreTokens())
 563           {
 564             System.err
 565                     .println("DEBUG: Run out of tokens when trying to identify the destination for the feature.. giving up.");
 566             // in all probability, this isn't a file we understand, so bail
 567             // quietly.
 568             return false;
 569           }
 570
 571           token = st.nextToken();
 572
 573           if (!token.equals("ID_NOT_SPECIFIED"))
 574           {
 575             seq = findName(align, seqId = token, relaxedIdmatching, null);
 576             st.nextToken();
 577           }
 578           else
 579           {
 580             seqId = null;
 581             try
 582             {
 583               index = Integer.parseInt(st.nextToken());
 584               seq = align.getSequenceAt(index);
 585             } catch (NumberFormatException ex)
 586             {
 587               seq = null;
 588             }
 589           }
 590
 591           if (seq == null)
 592           {
 593             System.out.println("Sequence not found: " + line);
 594             break;
 595           }
 596
 597           start = Integer.parseInt(st.nextToken());
 598           end = Integer.parseInt(st.nextToken());
 599
 600           type = st.nextToken();
 601
 602           if (!colours.containsKey(type))
 603           {
 604             // Probably the old style groups file
 605             UserColourScheme ucs = new UserColourScheme(type);
 606             colours.put(type, ucs.findColour('A'));
 607           }
 608           sf = new SequenceFeature(type, desc, "", start, end, featureGroup);
 609           if (st.hasMoreTokens())
 610           {
 611             try
 612             {
 613               score = new Float(st.nextToken()).floatValue();
 614               // update colourgradient bounds if allowed to
 615             } catch (NumberFormatException ex)
 616             {
 617               score = 0;
 618             }
 619             sf.setScore(score);
 620           }
 621           if (groupLink != null && removeHTML)
 622           {
 623             sf.addLink(groupLink);
 624             sf.description += "%LINK%";
 625           }
 626           if (typeLink.containsKey(type) && removeHTML)
 627           {
 628             sf.addLink(typeLink.get(type).toString());
 629             sf.description += "%LINK%";
 630           }
 631
 632           parseDescriptionHTML(sf, removeHTML);
 633
 634           seq.addSequenceFeature(sf);
 635
 636           while (seqId != null
 637                   && (seq = align.findName(seq, seqId, false)) != null)
 638           {
 639             seq.addSequenceFeature(new SequenceFeature(sf));
 640           }
 641           // If we got here, its not a GFFFile
 642           GFFFile = false;
 643         }
 644       }
 645       resetMatcher();
 646     } catch (Exception ex)
 647     {
 648       // should report somewhere useful for UI if necessary
 649       warningMessage = ((warningMessage == null) ? "" : warningMessage)
 650               + "Parsing error at\n" + line;
 651       System.out.println("Error parsing feature file: " + ex + "\n" + line);
 652       ex.printStackTrace(System.err);
 653       resetMatcher();
 654       return false;
 655     }
 656
 657     return true;
 658   }
 659
 660   private enum GffPragmas
 661   {
 662     gff_version, sequence_region, feature_ontology, attribute_ontology, source_ontology, species_build, fasta, hash
 663   };
 664
 665   private static Map<String, GffPragmas> GFFPRAGMA;
 666   static
 667   {
 668     GFFPRAGMA = new HashMap<String, GffPragmas>();
 669     GFFPRAGMA.put("sequence-region", GffPragmas.sequence_region);
 670     GFFPRAGMA.put("feature-ontology", GffPragmas.feature_ontology);
 671     GFFPRAGMA.put("#", GffPragmas.hash);
 672     GFFPRAGMA.put("fasta", GffPragmas.fasta);
 673     GFFPRAGMA.put("species-build", GffPragmas.species_build);
 674     GFFPRAGMA.put("source-ontology", GffPragmas.source_ontology);
 675     GFFPRAGMA.put("attribute-ontology", GffPragmas.attribute_ontology);
 676   }
 677
 678   private void processGffPragma(String line, Map<String, String> gffProps,
 679           AlignmentI align, ArrayList<SequenceI> newseqs)
 680           throws IOException
 681   {
 682     // line starts with ##
 683     int spacepos = line.indexOf(' ');
 684     String pragma = spacepos == -1 ? line.substring(2).trim() : line
 685             .substring(2, spacepos);
 686     GffPragmas gffpragma = GFFPRAGMA.get(pragma.toLowerCase());
 687     if (gffpragma == null)
 688     {
 689       return;
 690     }
 691     switch (gffpragma)
 692     {
 693     case gff_version:
 694       try
 695       {
 696         gffversion = Integer.parseInt(line.substring(spacepos + 1));
 697       } finally
 698       {
 699
 700       }
 701       break;
 702     case feature_ontology:
 703       // resolve against specific feature ontology
 704       break;
 705     case attribute_ontology:
 706       // resolve against specific attribute ontology
 707       break;
 708     case source_ontology:
 709       // resolve against specific source ontology
 710       break;
 711     case species_build:
 712       // resolve against specific NCBI taxon version
 713       break;
 714     case hash:
 715       // close off any open feature hierarchies
 716       break;
 717     case fasta:
 718       // process the rest of the file as a fasta file and replace any dummy
 719       // sequence IDs
 720       process_as_fasta(align, newseqs);
 721       break;
 722     default:
 723       // we do nothing ?
 724       System.err.println("Ignoring unknown pragma:\n" + line);
 725     }
 726   }
 727
 728   private void process_as_fasta(AlignmentI align, List<SequenceI> newseqs)
 729           throws IOException
 730   {
 731     try
 732     {
 733       mark();
 734     } catch (IOException q)
 735     {
 736     }
 737     FastaFile parser = new FastaFile(this);
 738     List<SequenceI> includedseqs = parser.getSeqs();
 739     SequenceIdMatcher smatcher = new SequenceIdMatcher(newseqs);
 740     // iterate over includedseqs, and replacing matching ones with newseqs
 741     // sequences. Generic iterator not used here because we modify includedseqs
 742     // as we go
 743     for (int p = 0, pSize = includedseqs.size(); p < pSize; p++)
 744     {
 745       // search for any dummy seqs that this sequence can be used to update
 746       SequenceI dummyseq = smatcher.findIdMatch(includedseqs.get(p));
 747       if (dummyseq != null)
 748       {
 749         // dummyseq was created so it could be annotated and referred to in
 750         // alignments/codon mappings
 751
 752         SequenceI mseq = includedseqs.get(p);
 753         // mseq is the 'template' imported from the FASTA file which we'll use
 754         // to coomplete dummyseq
 755         if (dummyseq instanceof SequenceDummy)
 756         {
 757           // probably have the pattern wrong
 758           // idea is that a flyweight proxy for a sequence ID can be created for
 759           // 1. stable reference creation
 760           // 2. addition of annotation
 761           // 3. future replacement by a real sequence
 762           // current pattern is to create SequenceDummy objects - a convenience
 763           // constructor for a Sequence.
 764           // problem is that when promoted to a real sequence, all references
 765           // need
 766           // to be updated somehow.
 767           ((SequenceDummy) dummyseq).become(mseq);
 768           includedseqs.set(p, dummyseq); // template is no longer needed
 769         }
 770       }
 771     }
 772     // finally add sequences to the dataset
 773     for (SequenceI seq : includedseqs)
 774     {
 775       align.addSequence(seq);
 776     }
 777   }
 778
 779   /**
 780    * take a sequence feature and examine its attributes to decide how it should
 781    * be added to a sequence
 782    *
 783    * @param seq
 784    *          - the destination sequence constructed or discovered in the
 785    *          current context
 786    * @param sf
 787    *          - the base feature with ATTRIBUTES property containing any
 788    *          additional attributes
 789    * @param gFFFile
 790    *          - true if we are processing a GFF annotation file
 791    * @return true if sf was actually added to the sequence, false if it was
 792    *         processed in another way
 793    */
 794   public boolean processOrAddSeqFeature(AlignmentI align, List<SequenceI> newseqs, SequenceI seq, SequenceFeature sf,
 795           boolean gFFFile, boolean relaxedIdMatching)
 796   {
 797     String attr = (String) sf.getValue("ATTRIBUTES");
 798     boolean add = true;
 799     if (gFFFile && attr != null)
 800     {
 801       int nattr=8;
 802
 803       for (String attset : attr.split("\t"))
 804       {
 805         if (attset==null || attset.trim().length()==0)
 806         {
 807           continue;
 808         }
 809         nattr++;
 810         Map<String, List<String>> set = new HashMap<String, List<String>>();
 811         // normally, only expect one column - 9 - in this field
 812         // the attributes (Gff3) or groups (gff2) field
 813         for (String pair : attset.trim().split(";"))
 814         {
 815           pair = pair.trim();
 816           if (pair.length() == 0)
 817           {
 818             continue;
 819           }
 820
 821           // expect either space seperated (gff2) or '=' separated (gff3)
 822           // key/value pairs here
 823
 824           int eqpos = pair.indexOf('='),sppos = pair.indexOf(' ');
 825           String key = null, value = null;
 826
 827           if (sppos > -1 && (eqpos == -1 || sppos < eqpos))
 828           {
 829             key = pair.substring(0, sppos);
 830             value = pair.substring(sppos + 1);
 831           } else {
 832             if (eqpos > -1 && (sppos == -1 || eqpos < sppos))
 833             {
 834               key = pair.substring(0, eqpos);
 835               value = pair.substring(eqpos + 1);
 836             } else
 837             {
 838               key = pair;
 839             }
 840           }
 841           if (key != null)
 842           {
 843             List<String> vals = set.get(key);
 844             if (vals == null)
 845             {
 846               vals = new ArrayList<String>();
 847               set.put(key, vals);
 848             }
 849             if (value != null)
 850             {
 851               vals.add(value.trim());
 852             }
 853           }
 854         }
 855         try
 856         {
 857           add &= processGffKey(set, nattr, seq, sf, align, newseqs,
 858                   relaxedIdMatching); // process decides if
 859                                                      // feature is actually
 860                                                      // added
 861         } catch (InvalidGFF3FieldException ivfe)
 862         {
 863           System.err.println(ivfe);
 864         }
 865       }
 866     }
 867     if (add)
 868     {
 869       seq.addSequenceFeature(sf);
 870     }
 871     return add;
 872   }
 873
 874   public class InvalidGFF3FieldException extends Exception
 875   {
 876     String field, value;
 877
 878     public InvalidGFF3FieldException(String field,
 879             Map<String, List<String>> set, String message)
 880     {
 881       super(message + " (Field was " + field + " and value was "
 882               + set.get(field).toString());
 883       this.field = field;
 884       this.value = set.get(field).toString();
 885     }
 886
 887   }
 888
 889   /**
 890    * take a set of keys for a feature and interpret them
 891    *
 892    * @param set
 893    * @param nattr
 894    * @param seq
 895    * @param sf
 896    * @return
 897    */
 898   public boolean processGffKey(Map<String, List<String>> set, int nattr,
 899           SequenceI seq, SequenceFeature sf, AlignmentI align,
 900           List<SequenceI> newseqs, boolean relaxedIdMatching)
 901           throws InvalidGFF3FieldException
 902   {
 903     String attr;
 904     // decide how to interpret according to type
 905     if (sf.getType().equals("similarity"))
 906     {
 907       int strand = sf.getStrand();
 908       // exonerate cdna/protein map
 909       // look for fields
 910       List<SequenceI> querySeq = findNames(align, newseqs,
 911               relaxedIdMatching, set.get(attr="Query"));
 912       if (querySeq==null || querySeq.size()!=1)
 913       {
 914         throw new InvalidGFF3FieldException( attr, set,
 915                 "Expecting exactly one sequence in Query field (got "
 916                         + set.get(attr) + ")");
 917       }
 918       if (set.containsKey(attr="Align"))
 919       {
 920         // process the align maps and create cdna/protein maps
 921         // ideally, the query sequences are in the alignment, but maybe not...
 922
 923         AlignedCodonFrame alco = new AlignedCodonFrame();
 924         MapList codonmapping = constructCodonMappingFromAlign(set, attr,
 925                 strand);
 926
 927         // add codon mapping, and hope!
 928         alco.addMap(seq, querySeq.get(0), codonmapping);
 929         align.addCodonFrame(alco);
 930         // everything that's needed to be done is done
 931         // no features to create here !
 932         return false;
 933       }
 934
 935     }
 936     return true;
 937   }
 938
 939   private MapList constructCodonMappingFromAlign(
 940           Map<String, List<String>> set,
 941           String attr, int strand) throws InvalidGFF3FieldException
 942   {
 943     if (strand == 0)
 944     {
 945       throw new InvalidGFF3FieldException(attr, set,
 946               "Invalid strand for a codon mapping (cannot be 0)");
 947     }
 948     List<Integer> fromrange = new ArrayList<Integer>(), torange = new ArrayList<Integer>();
 949     int lastppos = 0, lastpframe = 0;
 950     for (String range : set.get(attr))
 951     {
 952       List<Integer> ints = new ArrayList<Integer>();
 953       StringTokenizer st = new StringTokenizer(range, " ");
 954       while (st.hasMoreTokens())
 955       {
 956         String num = st.nextToken();
 957         try
 958         {
 959           ints.add(new Integer(num));
 960         } catch (NumberFormatException nfe)
 961         {
 962           throw new InvalidGFF3FieldException(attr, set,
 963                   "Invalid number in field " + num);
 964         }
 965       }
 966       // Align positionInRef positionInQuery LengthInRef
 967       // contig_1146 exonerate:protein2genome:local similarity 8534 11269
 968       // 3652 - . alignment_id 0 ;
 969       // Query DDB_G0269124
 970       // Align 11270 143 120
 971       // corresponds to : 120 bases align at pos 143 in protein to 11270 on
 972       // dna in strand direction
 973       // Align 11150 187 282
 974       // corresponds to : 282 bases align at pos 187 in protein to 11150 on
 975       // dna in strand direction
 976       //
 977       // Align 10865 281 888
 978       // Align 9977 578 1068
 979       // Align 8909 935 375
 980       //
 981       if (ints.size() != 3)
 982       {
 983         throw new InvalidGFF3FieldException(attr, set,
 984                 "Invalid number of fields for this attribute ("
 985                         + ints.size() + ")");
 986       }
 987       fromrange.add(new Integer(ints.get(0).intValue()));
 988       fromrange.add(new Integer(ints.get(0).intValue() + strand
 989               * ints.get(2).intValue()));
 990       // how are intron/exon boundaries that do not align in codons
 991       // represented
 992       if (ints.get(1).equals(lastppos) && lastpframe > 0)
 993       {
 994         // extend existing to map
 995         lastppos += ints.get(2) / 3;
 996         lastpframe = ints.get(2) % 3;
 997         torange.set(torange.size() - 1, new Integer(lastppos));
 998       }
 999       else
1000       {
1001         // new to map range
1002         torange.add(ints.get(1));
1003         lastppos = ints.get(1) + ints.get(2) / 3;
1004         lastpframe = ints.get(2) % 3;
1005         torange.add(new Integer(lastppos));
1006       }
1007     }
1008     // from and to ranges must end up being a series of start/end intervals
1009     if (fromrange.size() % 2 == 1)
1010     {
1011       throw new InvalidGFF3FieldException(attr, set,
1012               "Couldn't parse the DNA alignment range correctly");
1013     }
1014     if (torange.size() % 2 == 1)
1015     {
1016       throw new InvalidGFF3FieldException(attr, set,
1017               "Couldn't parse the protein alignment range correctly");
1018     }
1019     // finally, build the map
1020     int[] frommap = new int[fromrange.size()], tomap = new int[torange
1021             .size()];
1022     int p = 0;
1023     for (Integer ip : fromrange)
1024     {
1025       frommap[p++] = ip.intValue();
1026     }
1027     p = 0;
1028     for (Integer ip : torange)
1029     {
1030       tomap[p++] = ip.intValue();
1031     }
1032
1033     return new MapList(frommap, tomap, 3, 1);
1034   }
1035
1036   private List<SequenceI> findNames(AlignmentI align,
1037           List<SequenceI> newseqs, boolean relaxedIdMatching,
1038           List<String> list)
1039   {
1040     List<SequenceI> found = new ArrayList<SequenceI>();
1041     for (String seqId : list)
1042     {
1043       SequenceI seq = findName(align, seqId, relaxedIdMatching, newseqs);
1044       if (seq != null)
1045       {
1046         found.add(seq);
1047       }
1048     }
1049     return found;
1050   }
1051
1052   private AlignmentI lastmatchedAl = null;
1053
1054   private SequenceIdMatcher matcher = null;
1055
1056   /**
1057    * clear any temporary handles used to speed up ID matching
1058    */
1059   private void resetMatcher()
1060   {
1061     lastmatchedAl = null;
1062     matcher = null;
1063   }
1064
1065   private SequenceI findName(AlignmentI align, String seqId,
1066           boolean relaxedIdMatching, List<SequenceI> newseqs)
1067   {
1068     SequenceI match = null;
1069     if (relaxedIdMatching)
1070     {
1071       if (lastmatchedAl != align)
1072       {
1073         matcher = new SequenceIdMatcher(
1074                 (lastmatchedAl = align).getSequencesArray());
1075         if (newseqs != null)
1076         {
1077           matcher.addAll(newseqs);
1078         }
1079       }
1080       match = matcher.findIdMatch(seqId);
1081     }
1082     else
1083     {
1084       match = align.findName(seqId, true);
1085       if (match == null && newseqs != null)
1086       {
1087         for (SequenceI m : newseqs)
1088         {
1089           if (seqId.equals(m.getName()))
1090           {
1091             return m;
1092           }
1093         }
1094       }
1095
1096     }
1097     if (match==null && newseqs!=null)
1098     {
1099       match = new SequenceDummy(seqId);
1100       if (relaxedIdMatching)
1101       {
1102         matcher.addAll(Arrays.asList(new SequenceI[]
1103         { match }));
1104       }
1105       // add dummy sequence to the newseqs list
1106       newseqs.add(match);
1107     }
1108     return match;
1109   }
1110   public void parseDescriptionHTML(SequenceFeature sf, boolean removeHTML)
1111   {
1112     if (sf.getDescription() == null)
1113     {
1114       return;
1115     }
1116     jalview.util.ParseHtmlBodyAndLinks parsed = new jalview.util.ParseHtmlBodyAndLinks(
1117             sf.getDescription(), removeHTML, newline);
1118
1119     sf.description = (removeHTML) ? parsed.getNonHtmlContent()
1120             : sf.description;
1121     for (String link : parsed.getLinks())
1122     {
1123       sf.addLink(link);
1124     }
1125
1126   }
1127
1128   /**
1129    * generate a features file for seqs includes non-pos features by default.
1130    *
1131    * @param seqs
1132    *          source of sequence features
1133    * @param visible
1134    *          hash of feature types and colours
1135    * @return features file contents
1136    */
1137   public String printJalviewFormat(SequenceI[] seqs, Map<String,Object> visible)
1138   {
1139     return printJalviewFormat(seqs, visible, true, true);
1140   }
1141
1142   /**
1143    * generate a features file for seqs with colours from visible (if any)
1144    *
1145    * @param seqs
1146    *          source of features
1147    * @param visible
1148    *          hash of Colours for each feature type
1149    * @param visOnly
1150    *          when true only feature types in 'visible' will be output
1151    * @param nonpos
1152    *          indicates if non-positional features should be output (regardless
1153    *          of group or type)
1154    * @return features file contents
1155    */
1156   public String printJalviewFormat(SequenceI[] seqs, Map visible,
1157           boolean visOnly, boolean nonpos)
1158   {
1159     StringBuffer out = new StringBuffer();
1160     SequenceFeature[] next;
1161     boolean featuresGen = false;
1162     if (visOnly && !nonpos && (visible == null || visible.size() < 1))
1163     {
1164       // no point continuing.
1165       return "No Features Visible";
1166     }
1167
1168     if (visible != null && visOnly)
1169     {
1170       // write feature colours only if we're given them and we are generating
1171       // viewed features
1172       // TODO: decide if feature links should also be written here ?
1173       Iterator en = visible.keySet().iterator();
1174       String type, color;
1175       while (en.hasNext())
1176       {
1177         type = en.next().toString();
1178
1179         if (visible.get(type) instanceof GraduatedColor)
1180         {
1181           GraduatedColor gc = (GraduatedColor) visible.get(type);
1182           color = (gc.isColourByLabel() ? "label|" : "")
1183                   + Format.getHexString(gc.getMinColor()) + "|"
1184                   + Format.getHexString(gc.getMaxColor())
1185                   + (gc.isAutoScale() ? "|" : "|abso|") + gc.getMin() + "|"
1186                   + gc.getMax() + "|";
1187           if (gc.getThreshType() != AnnotationColourGradient.NO_THRESHOLD)
1188           {
1189             if (gc.getThreshType() == AnnotationColourGradient.BELOW_THRESHOLD)
1190             {
1191               color += "below";
1192             }
1193             else
1194             {
1195               if (gc.getThreshType() != AnnotationColourGradient.ABOVE_THRESHOLD)
1196               {
1197                 System.err.println("WARNING: Unsupported threshold type ("
1198                         + gc.getThreshType() + ") : Assuming 'above'");
1199               }
1200               color += "above";
1201             }
1202             // add the value
1203             color += "|" + gc.getThresh();
1204           }
1205           else
1206           {
1207             color += "none";
1208           }
1209         }
1210         else if (visible.get(type) instanceof java.awt.Color)
1211         {
1212           color = Format.getHexString((java.awt.Color) visible.get(type));
1213         }
1214         else
1215         {
1216           // legacy support for integer objects containing colour triplet values
1217           color = Format.getHexString(new java.awt.Color(Integer
1218                   .parseInt(visible.get(type).toString())));
1219         }
1220         out.append(type);
1221         out.append("\t");
1222         out.append(color);
1223         out.append(newline);
1224       }
1225     }
1226     // Work out which groups are both present and visible
1227     Vector groups = new Vector();
1228     int groupIndex = 0;
1229     boolean isnonpos = false;
1230
1231     for (int i = 0; i < seqs.length; i++)
1232     {
1233       next = seqs[i].getSequenceFeatures();
1234       if (next != null)
1235       {
1236         for (int j = 0; j < next.length; j++)
1237         {
1238           isnonpos = next[j].begin == 0 && next[j].end == 0;
1239           if ((!nonpos && isnonpos)
1240                   || (!isnonpos && visOnly && !visible
1241                           .containsKey(next[j].type)))
1242           {
1243             continue;
1244           }
1245
1246           if (next[j].featureGroup != null
1247                   && !groups.contains(next[j].featureGroup))
1248           {
1249             groups.addElement(next[j].featureGroup);
1250           }
1251         }
1252       }
1253     }
1254
1255     String group = null;
1256     do
1257     {
1258
1259       if (groups.size() > 0 && groupIndex < groups.size())
1260       {
1261         group = groups.elementAt(groupIndex).toString();
1262         out.append(newline);
1263         out.append("STARTGROUP\t");
1264         out.append(group);
1265         out.append(newline);
1266       }
1267       else
1268       {
1269         group = null;
1270       }
1271
1272       for (int i = 0; i < seqs.length; i++)
1273       {
1274         next = seqs[i].getSequenceFeatures();
1275         if (next != null)
1276         {
1277           for (int j = 0; j < next.length; j++)
1278           {
1279             isnonpos = next[j].begin == 0 && next[j].end == 0;
1280             if ((!nonpos && isnonpos)
1281                     || (!isnonpos && visOnly && !visible
1282                             .containsKey(next[j].type)))
1283             {
1284               // skip if feature is nonpos and we ignore them or if we only
1285               // output visible and it isn't non-pos and it's not visible
1286               continue;
1287             }
1288
1289             if (group != null
1290                     && (next[j].featureGroup == null || !next[j].featureGroup
1291                             .equals(group)))
1292             {
1293               continue;
1294             }
1295
1296             if (group == null && next[j].featureGroup != null)
1297             {
1298               continue;
1299             }
1300             // we have features to output
1301             featuresGen = true;
1302             if (next[j].description == null
1303                     || next[j].description.equals(""))
1304             {
1305               out.append(next[j].type + "\t");
1306             }
1307             else
1308             {
1309               if (next[j].links != null
1310                       && next[j].getDescription().indexOf("<html>") == -1)
1311               {
1312                 out.append("<html>");
1313               }
1314
1315               out.append(next[j].description + " ");
1316               if (next[j].links != null)
1317               {
1318                 for (int l = 0; l < next[j].links.size(); l++)
1319                 {
1320                   String label = next[j].links.elementAt(l).toString();
1321                   String href = label.substring(label.indexOf("|") + 1);
1322                   label = label.substring(0, label.indexOf("|"));
1323
1324                   if (next[j].description.indexOf(href) == -1)
1325                   {
1326                     out.append("<a href=\"" + href + "\">" + label + "</a>");
1327                   }
1328                 }
1329
1330                 if (next[j].getDescription().indexOf("</html>") == -1)
1331                 {
1332                   out.append("</html>");
1333                 }
1334               }
1335
1336               out.append("\t");
1337             }
1338             out.append(seqs[i].getName());
1339             out.append("\t-1\t");
1340             out.append(next[j].begin);
1341             out.append("\t");
1342             out.append(next[j].end);
1343             out.append("\t");
1344             out.append(next[j].type);
1345             if (next[j].score != Float.NaN)
1346             {
1347               out.append("\t");
1348               out.append(next[j].score);
1349             }
1350             out.append(newline);
1351           }
1352         }
1353       }
1354
1355       if (group != null)
1356       {
1357         out.append("ENDGROUP\t");
1358         out.append(group);
1359         out.append(newline);
1360         groupIndex++;
1361       }
1362       else
1363       {
1364         break;
1365       }
1366
1367     } while (groupIndex < groups.size() + 1);
1368
1369     if (!featuresGen)
1370     {
1371       return "No Features Visible";
1372     }
1373
1374     return out.toString();
1375   }
1376
1377   /**
1378    * generate a gff file for sequence features includes non-pos features by
1379    * default.
1380    *
1381    * @param seqs
1382    * @param visible
1383    * @return
1384    */
1385   public String printGFFFormat(SequenceI[] seqs, Map<String,Object> visible)
1386   {
1387     return printGFFFormat(seqs, visible, true, true);
1388   }
1389
1390   public String printGFFFormat(SequenceI[] seqs, Map<String,Object> visible,
1391           boolean visOnly, boolean nonpos)
1392   {
1393     StringBuffer out = new StringBuffer();
1394     SequenceFeature[] next;
1395     String source;
1396     boolean isnonpos;
1397     for (int i = 0; i < seqs.length; i++)
1398     {
1399       if (seqs[i].getSequenceFeatures() != null)
1400       {
1401         next = seqs[i].getSequenceFeatures();
1402         for (int j = 0; j < next.length; j++)
1403         {
1404           isnonpos = next[j].begin == 0 && next[j].end == 0;
1405           if ((!nonpos && isnonpos)
1406                   || (!isnonpos && visOnly && !visible
1407                           .containsKey(next[j].type)))
1408           {
1409             continue;
1410           }
1411
1412           source = next[j].featureGroup;
1413           if (source == null)
1414           {
1415             source = next[j].getDescription();
1416           }
1417
1418           out.append(seqs[i].getName());
1419           out.append("\t");
1420           out.append(source);
1421           out.append("\t");
1422           out.append(next[j].type);
1423           out.append("\t");
1424           out.append(next[j].begin);
1425           out.append("\t");
1426           out.append(next[j].end);
1427           out.append("\t");
1428           out.append(next[j].score);
1429           out.append("\t");
1430
1431           if (next[j].getValue("STRAND") != null)
1432           {
1433             out.append(next[j].getValue("STRAND"));
1434             out.append("\t");
1435           }
1436           else
1437           {
1438             out.append(".\t");
1439           }
1440
1441           if (next[j].getValue("FRAME") != null)
1442           {
1443             out.append(next[j].getValue("FRAME"));
1444           }
1445           else
1446           {
1447             out.append(".");
1448           }
1449           // TODO: verify/check GFF - should there be a /t here before attribute
1450           // output ?
1451
1452           if (next[j].getValue("ATTRIBUTES") != null)
1453           {
1454             out.append(next[j].getValue("ATTRIBUTES"));
1455           }
1456
1457           out.append(newline);
1458
1459         }
1460       }
1461     }
1462
1463     return out.toString();
1464   }
1465
1466   /**
1467    * this is only for the benefit of object polymorphism - method does nothing.
1468    */
1469   public void parse()
1470   {
1471     // IGNORED
1472   }
1473
1474   /**
1475    * this is only for the benefit of object polymorphism - method does nothing.
1476    *
1477    * @return error message
1478    */
1479   public String print()
1480   {
1481     return "USE printGFFFormat() or printJalviewFormat()";
1482   }
1483
1484 }