src/jalview/io/FeaturesFile.java

   1 /*
   2  * Jalview - A Sequence Alignment Editor and Viewer ($$Version-Rel$$)
   3  * Copyright (C) $$Year-Rel$$ The Jalview Authors
   4  *
   5  * This file is part of Jalview.
   6  *
   7  * Jalview is free software: you can redistribute it and/or
   8  * modify it under the terms of the GNU General Public License
   9  * as published by the Free Software Foundation, either version 3
  10  * of the License, or (at your option) any later version.
  11  *
  12  * Jalview is distributed in the hope that it will be useful, but
  13  * WITHOUT ANY WARRANTY; without even the implied warranty
  14  * of MERCHANTABILITY or FITNESS FOR A PARTICULAR
  15  * PURPOSE.  See the GNU General Public License for more details.
  16  *
  17  * You should have received a copy of the GNU General Public License
  18  * along with Jalview.  If not, see <http://www.gnu.org/licenses/>.
  19  * The Jalview Authors are detailed in the 'AUTHORS' file.
  20  */
  21 package jalview.io;
  22
  23 import jalview.analysis.SequenceIdMatcher;
  24 import jalview.datamodel.AlignedCodonFrame;
  25 import jalview.datamodel.Alignment;
  26 import jalview.datamodel.AlignmentI;
  27 import jalview.datamodel.SequenceDummy;
  28 import jalview.datamodel.SequenceFeature;
  29 import jalview.datamodel.SequenceI;
  30 import jalview.schemes.AnnotationColourGradient;
  31 import jalview.schemes.GraduatedColor;
  32 import jalview.schemes.UserColourScheme;
  33 import jalview.util.Format;
  34 import jalview.util.MapList;
  35
  36 import java.io.IOException;
  37 import java.util.ArrayList;
  38 import java.util.Arrays;
  39 import java.util.HashMap;
  40 import java.util.Hashtable;
  41 import java.util.Iterator;
  42 import java.util.List;
  43 import java.util.Map;
  44 import java.util.StringTokenizer;
  45 import java.util.Vector;
  46
/**
 * Parses and creates Jalview features files. Detects GFF format features
 * files and parses them. Does not implement the standard print() method -
 * call printFeatures or printGFF instead. Uses
 * AlignmentI.findSequence(String id) to find the sequence object for the
 * features annotation - this normally works on an exact match.
 * 
 * @author AMW
 * @version $Revision$
 */
  56 public class FeaturesFile extends AlignFile
  57 {
  /**
   * Workaround for a GFF interpretation bug where the source string becomes
   * the description rather than a group: when true, a GFF source value that
   * contains no spaces is also used as the feature's group.
   */
  62   private boolean doGffSource = true;
  63
  64   private int gffversion;
  65
  /**
   * Creates a new FeaturesFile using the superclass no-argument constructor.
   */
  public FeaturesFile()
  {
    super();
  }
  72
  /**
   * Creates a FeaturesFile by handing both arguments, unchanged, to the
   * corresponding AlignFile constructor.
   * 
   * @param inFile
   *          - passed through to the superclass (presumably the file name or
   *          data to read - confirm against AlignFile)
   * @param type
   *          - passed through to the superclass (presumably the kind of
   *          source that inFile names - confirm against AlignFile)
   * @throws IOException
   *           if the superclass constructor fails
   */
  public FeaturesFile(String inFile, String type) throws IOException
  {
    super(inFile, type);
  }
  82
  /**
   * Creates a FeaturesFile that reads from an existing FileParse, by
   * delegating to the corresponding AlignFile constructor.
   * 
   * @param source
   *          - the FileParse handed to the superclass
   * @throws IOException
   *           if the superclass constructor fails
   */
  public FeaturesFile(FileParse source) throws IOException
  {
    super(source);
  }
  91
  /**
   * Creates a FeaturesFile that reads from an existing FileParse, by
   * delegating to the corresponding AlignFile constructor.
   * 
   * @param parseImmediately
   *          - passed through to the superclass (presumably controls whether
   *          parsing happens during construction - confirm against AlignFile)
   * @param source
   *          - the FileParse handed to the superclass
   * @throws IOException
   *           if the superclass constructor fails
   */
  public FeaturesFile(boolean parseImmediately, FileParse source)
          throws IOException
  {
    super(parseImmediately, source);
  }
 102
 103   /**
 104    * @param parseImmediately
 105    * @param inFile
 106    * @param type
 107    * @throws IOException
 108    */
 109   public FeaturesFile(boolean parseImmediately, String inFile, String type)
 110           throws IOException
 111   {
 112     super(parseImmediately, inFile, type);
 113   }
 114
 115   /**
 116    * Parse GFF or sequence features file using case-independent matching,
 117    * discarding URLs
 118    *
 119    * @param align
 120    *          - alignment/dataset containing sequences that are to be annotated
 121    * @param colours
 122    *          - hashtable to store feature colour definitions
 123    * @param removeHTML
 124    *          - process html strings into plain text
 125    * @return true if features were added
 126    */
 127   public boolean parse(AlignmentI align, Hashtable colours,
 128           boolean removeHTML)
 129   {
 130     return parse(align, colours, null, removeHTML, false);
 131   }
 132
 133   /**
 134    * Parse GFF or sequence features file optionally using case-independent
 135    * matching, discarding URLs
 136    *
 137    * @param align
 138    *          - alignment/dataset containing sequences that are to be annotated
 139    * @param colours
 140    *          - hashtable to store feature colour definitions
 141    * @param removeHTML
 142    *          - process html strings into plain text
 143    * @param relaxedIdmatching
 144    *          - when true, ID matches to compound sequence IDs are allowed
 145    * @return true if features were added
 146    */
 147   public boolean parse(AlignmentI align, Map colours, boolean removeHTML,
 148           boolean relaxedIdMatching)
 149   {
 150     return parse(align, colours, null, removeHTML, relaxedIdMatching);
 151   }
 152
 153   /**
 154    * Parse GFF or sequence features file optionally using case-independent
 155    * matching
 156    *
 157    * @param align
 158    *          - alignment/dataset containing sequences that are to be annotated
 159    * @param colours
 160    *          - hashtable to store feature colour definitions
 161    * @param featureLink
 162    *          - hashtable to store associated URLs
 163    * @param removeHTML
 164    *          - process html strings into plain text
 165    * @return true if features were added
 166    */
 167   public boolean parse(AlignmentI align, Map colours, Map featureLink,
 168           boolean removeHTML)
 169   {
 170     return parse(align, colours, featureLink, removeHTML, false);
 171   }
 172
 173   @Override
 174   public void addAnnotations(Alignment al)
 175   {
 176     // TODO Auto-generated method stub
 177     super.addAnnotations(al);
 178   }
 179
 180   @Override
 181   public void addProperties(Alignment al)
 182   {
 183     // TODO Auto-generated method stub
 184     super.addProperties(al);
 185   }
 186
 187   @Override
 188   public void addSeqGroups(AlignmentI al)
 189   {
 190     // TODO Auto-generated method stub
 191     super.addSeqGroups(al);
 192   }
 193
 194   /**
 195    * Parse GFF or sequence features file
 196    *
 197    * @param align
 198    *          - alignment/dataset containing sequences that are to be annotated
 199    * @param colours
 200    *          - hashtable to store feature colour definitions
 201    * @param featureLink
 202    *          - hashtable to store associated URLs
 203    * @param removeHTML
 204    *          - process html strings into plain text
 205    * @param relaxedIdmatching
 206    *          - when true, ID matches to compound sequence IDs are allowed
 207    * @return true if features were added
 208    */
 209   public boolean parse(AlignmentI align, Map colours, Map featureLink,
 210           boolean removeHTML, boolean relaxedIdmatching)
 211   {
 212
 213     String line = null;
 214     try
 215     {
 216       SequenceI seq = null;
 217       /**
 218        * keep track of any sequences we try to create from the data if it is a GFF3 file
 219        */
 220       ArrayList<SequenceI> newseqs = new ArrayList<SequenceI>();
 221       String type, desc, token = null;
 222
 223       int index, start, end;
 224       float score;
 225       StringTokenizer st;
 226       SequenceFeature sf;
 227       String featureGroup = null, groupLink = null;
 228       Map typeLink = new Hashtable();
 229       /**
 230        * when true, assume GFF style features rather than Jalview style.
 231        */
 232       boolean GFFFile = true;
 233       Map<String, String> gffProps = new HashMap<String, String>();
 234       while ((line = nextLine()) != null)
 235       {
 236         // skip comments/process pragmas
 237         if (line.startsWith("#"))
 238         {
 239           if (line.startsWith("##"))
 240           {
 241             // possibly GFF2/3 version and metadata header
 242             processGffPragma(line, gffProps, align, newseqs);
 243             line = "";
 244           }
 245           continue;
 246         }
 247
 248         st = new StringTokenizer(line, "\t");
 249         if (st.countTokens() == 1)
 250         {
 251           if (line.trim().equalsIgnoreCase("GFF"))
 252           {
 253             // Start parsing file as if it might be GFF again.
 254             GFFFile = true;
 255             continue;
 256           }
 257         }
 258         if (st.countTokens() > 1 && st.countTokens() < 4)
 259         {
 260           GFFFile = false;
 261           type = st.nextToken();
 262           if (type.equalsIgnoreCase("startgroup"))
 263           {
 264             featureGroup = st.nextToken();
 265             if (st.hasMoreElements())
 266             {
 267               groupLink = st.nextToken();
 268               featureLink.put(featureGroup, groupLink);
 269             }
 270           }
 271           else if (type.equalsIgnoreCase("endgroup"))
 272           {
 273             // We should check whether this is the current group,
 274             // but at present theres no way of showing more than 1 group
 275             st.nextToken();
 276             featureGroup = null;
 277             groupLink = null;
 278           }
 279           else
 280           {
 281             Object colour = null;
 282             String colscheme = st.nextToken();
 283             if (colscheme.indexOf("|") > -1
 284                     || colscheme.trim().equalsIgnoreCase("label"))
 285             {
 286               // Parse '|' separated graduated colourscheme fields:
 287               // [label|][mincolour|maxcolour|[absolute|]minvalue|maxvalue|thresholdtype|thresholdvalue]
 288               // can either provide 'label' only, first is optional, next two
 289               // colors are required (but may be
 290               // left blank), next is optional, nxt two min/max are required.
 291               // first is either 'label'
 292               // first/second and third are both hexadecimal or word equivalent
 293               // colour.
 294               // next two are values parsed as floats.
 295               // fifth is either 'above','below', or 'none'.
 296               // sixth is a float value and only required when fifth is either
 297               // 'above' or 'below'.
 298               StringTokenizer gcol = new StringTokenizer(colscheme, "|",
 299                       true);
 300               // set defaults
 301               int threshtype = AnnotationColourGradient.NO_THRESHOLD;
 302               float min = Float.MIN_VALUE, max = Float.MAX_VALUE, threshval = Float.NaN;
 303               boolean labelCol = false;
 304               // Parse spec line
 305               String mincol = gcol.nextToken();
 306               if (mincol == "|")
 307               {
 308                 System.err
 309                         .println("Expected either 'label' or a colour specification in the line: "
 310                                 + line);
 311                 continue;
 312               }
 313               String maxcol = null;
 314               if (mincol.toLowerCase().indexOf("label") == 0)
 315               {
 316                 labelCol = true;
 317                 mincol = (gcol.hasMoreTokens() ? gcol.nextToken() : null); // skip
 318                                                                            // '|'
 319                 mincol = (gcol.hasMoreTokens() ? gcol.nextToken() : null);
 320               }
 321               String abso = null, minval, maxval;
 322               if (mincol != null)
 323               {
 324                 // at least four more tokens
 325                 if (mincol.equals("|"))
 326                 {
 327                   mincol = "";
 328                 }
 329                 else
 330                 {
 331                   gcol.nextToken(); // skip next '|'
 332                 }
 333                 // continue parsing rest of line
 334                 maxcol = gcol.nextToken();
 335                 if (maxcol.equals("|"))
 336                 {
 337                   maxcol = "";
 338                 }
 339                 else
 340                 {
 341                   gcol.nextToken(); // skip next '|'
 342                 }
 343                 abso = gcol.nextToken();
 344                 gcol.nextToken(); // skip next '|'
 345                 if (abso.toLowerCase().indexOf("abso") != 0)
 346                 {
 347                   minval = abso;
 348                   abso = null;
 349                 }
 350                 else
 351                 {
 352                   minval = gcol.nextToken();
 353                   gcol.nextToken(); // skip next '|'
 354                 }
 355                 maxval = gcol.nextToken();
 356                 if (gcol.hasMoreTokens())
 357                 {
 358                   gcol.nextToken(); // skip next '|'
 359                 }
 360                 try
 361                 {
 362                   if (minval.length() > 0)
 363                   {
 364                     min = new Float(minval).floatValue();
 365                   }
 366                 } catch (Exception e)
 367                 {
 368                   System.err
 369                           .println("Couldn't parse the minimum value for graduated colour for type ("
 370                                   + colscheme
 371                                   + ") - did you misspell 'auto' for the optional automatic colour switch ?");
 372                   e.printStackTrace();
 373                 }
 374                 try
 375                 {
 376                   if (maxval.length() > 0)
 377                   {
 378                     max = new Float(maxval).floatValue();
 379                   }
 380                 } catch (Exception e)
 381                 {
 382                   System.err
 383                           .println("Couldn't parse the maximum value for graduated colour for type ("
 384                                   + colscheme + ")");
 385                   e.printStackTrace();
 386                 }
 387               }
 388               else
 389               {
 390                 // add in some dummy min/max colours for the label-only
 391                 // colourscheme.
 392                 mincol = "FFFFFF";
 393                 maxcol = "000000";
 394               }
 395               try
 396               {
 397                 colour = new jalview.schemes.GraduatedColor(
 398                         new UserColourScheme(mincol).findColour('A'),
 399                         new UserColourScheme(maxcol).findColour('A'), min,
 400                         max);
 401               } catch (Exception e)
 402               {
 403                 System.err
 404                         .println("Couldn't parse the graduated colour scheme ("
 405                                 + colscheme + ")");
 406                 e.printStackTrace();
 407               }
 408               if (colour != null)
 409               {
 410                 ((jalview.schemes.GraduatedColor) colour)
 411                         .setColourByLabel(labelCol);
 412                 ((jalview.schemes.GraduatedColor) colour)
 413                         .setAutoScaled(abso == null);
 414                 // add in any additional parameters
 415                 String ttype = null, tval = null;
 416                 if (gcol.hasMoreTokens())
 417                 {
 418                   // threshold type and possibly a threshold value
 419                   ttype = gcol.nextToken();
 420                   if (ttype.toLowerCase().startsWith("below"))
 421                   {
 422                     ((jalview.schemes.GraduatedColor) colour)
 423                             .setThreshType(AnnotationColourGradient.BELOW_THRESHOLD);
 424                   }
 425                   else if (ttype.toLowerCase().startsWith("above"))
 426                   {
 427                     ((jalview.schemes.GraduatedColor) colour)
 428                             .setThreshType(AnnotationColourGradient.ABOVE_THRESHOLD);
 429                   }
 430                   else
 431                   {
 432                     ((jalview.schemes.GraduatedColor) colour)
 433                             .setThreshType(AnnotationColourGradient.NO_THRESHOLD);
 434                     if (!ttype.toLowerCase().startsWith("no"))
 435                     {
 436                       System.err
 437                               .println("Ignoring unrecognised threshold type : "
 438                                       + ttype);
 439                     }
 440                   }
 441                 }
 442                 if (((GraduatedColor) colour).getThreshType() != AnnotationColourGradient.NO_THRESHOLD)
 443                 {
 444                   try
 445                   {
 446                     gcol.nextToken();
 447                     tval = gcol.nextToken();
 448                     ((jalview.schemes.GraduatedColor) colour)
 449                             .setThresh(new Float(tval).floatValue());
 450                   } catch (Exception e)
 451                   {
 452                     System.err
 453                             .println("Couldn't parse threshold value as a float: ("
 454                                     + tval + ")");
 455                     e.printStackTrace();
 456                   }
 457                 }
 458                 // parse the thresh-is-min token ?
 459                 if (gcol.hasMoreTokens())
 460                 {
 461                   System.err
 462                           .println("Ignoring additional tokens in parameters in graduated colour specification\n");
 463                   while (gcol.hasMoreTokens())
 464                   {
 465                     System.err.println("|" + gcol.nextToken());
 466                   }
 467                   System.err.println("\n");
 468                 }
 469               }
 470             }
 471             else
 472             {
 473               UserColourScheme ucs = new UserColourScheme(colscheme);
 474               colour = ucs.findColour('A');
 475             }
 476             if (colour != null)
 477             {
 478               colours.put(type, colour);
 479             }
 480             if (st.hasMoreElements())
 481             {
 482               String link = st.nextToken();
 483               typeLink.put(type, link);
 484               if (featureLink == null)
 485               {
 486                 featureLink = new Hashtable();
 487               }
 488               featureLink.put(type, link);
 489             }
 490           }
 491           continue;
 492         }
 493         String seqId = "";
 494         while (st.hasMoreElements())
 495         {
 496
 497           if (GFFFile)
 498           {
 499             // Still possible this is an old Jalview file,
 500             // which does not have type colours at the beginning
 501             seqId = token = st.nextToken();
 502             seq = findName(align, seqId, relaxedIdmatching, newseqs);
 503             if (seq != null)
 504             {
 505               desc = st.nextToken();
 506               String group = null;
 507               if (doGffSource && desc.indexOf(' ') == -1)
 508               {
 509                 // could also be a source term rather than description line
 510                 group = new String(desc);
 511               }
 512               type = st.nextToken();
 513               try
 514               {
 515                 String stt = st.nextToken();
 516                 if (stt.length() == 0 || stt.equals("-"))
 517                 {
 518                   start = 0;
 519                 }
 520                 else
 521                 {
 522                   start = Integer.parseInt(stt);
 523                 }
 524               } catch (NumberFormatException ex)
 525               {
 526                 start = 0;
 527               }
 528               try
 529               {
 530                 String stt = st.nextToken();
 531                 if (stt.length() == 0 || stt.equals("-"))
 532                 {
 533                   end = 0;
 534                 }
 535                 else
 536                 {
 537                   end = Integer.parseInt(stt);
 538                 }
 539               } catch (NumberFormatException ex)
 540               {
 541                 end = 0;
 542               }
 543               // TODO: decide if non positional feature assertion for input data
 544               // where end==0 is generally valid
 545               if (end == 0)
 546               {
 547                 // treat as non-positional feature, regardless.
 548                 start = 0;
 549               }
 550               try
 551               {
 552                 score = new Float(st.nextToken()).floatValue();
 553               } catch (NumberFormatException ex)
 554               {
 555                 score = 0;
 556               }
 557
 558               sf = new SequenceFeature(type, desc, start, end, score, group);
 559
 560               try
 561               {
 562                 sf.setValue("STRAND", st.nextToken());
 563                 sf.setValue("FRAME", st.nextToken());
 564               } catch (Exception ex)
 565               {
 566               }
 567
 568               if (st.hasMoreTokens())
 569               {
 570                 StringBuffer attributes = new StringBuffer();
 571                 boolean sep = false;
 572                 while (st.hasMoreTokens())
 573                 {
 574                   attributes.append((sep ? "\t" : "") + st.nextElement());
 575                   sep = true;
 576                 }
 577                 // TODO validate and split GFF2 attributes field ? parse out
 578                 // ([A-Za-z][A-Za-z0-9_]*) <value> ; and add as
 579                 // sf.setValue(attrib, val);
 580                 sf.setValue("ATTRIBUTES", attributes.toString());
 581               }
 582
 583               if (processOrAddSeqFeature(align, newseqs, seq, sf, GFFFile,
 584                       relaxedIdmatching))
 585               {
 586                 // check whether we should add the sequence feature to any other
 587                 // sequences in the alignment with the same or similar
 588                 while ((seq = align.findName(seq, seqId, true)) != null)
 589                 {
 590                   seq.addSequenceFeature(new SequenceFeature(sf));
 591                 }
 592               }
 593               break;
 594             }
 595           }
 596
 597           if (GFFFile && seq == null)
 598           {
 599             desc = token;
 600           }
 601           else
 602           {
 603             desc = st.nextToken();
 604           }
 605           if (!st.hasMoreTokens())
 606           {
 607             System.err
 608                     .println("DEBUG: Run out of tokens when trying to identify the destination for the feature.. giving up.");
 609             // in all probability, this isn't a file we understand, so bail
 610             // quietly.
 611             return false;
 612           }
 613
 614           token = st.nextToken();
 615
 616           if (!token.equals("ID_NOT_SPECIFIED"))
 617           {
 618             seq = findName(align, seqId = token, relaxedIdmatching, null);
 619             st.nextToken();
 620           }
 621           else
 622           {
 623             seqId = null;
 624             try
 625             {
 626               index = Integer.parseInt(st.nextToken());
 627               seq = align.getSequenceAt(index);
 628             } catch (NumberFormatException ex)
 629             {
 630               seq = null;
 631             }
 632           }
 633
 634           if (seq == null)
 635           {
 636             System.out.println("Sequence not found: " + line);
 637             break;
 638           }
 639
 640           start = Integer.parseInt(st.nextToken());
 641           end = Integer.parseInt(st.nextToken());
 642
 643           type = st.nextToken();
 644
 645           if (!colours.containsKey(type))
 646           {
 647             // Probably the old style groups file
 648             UserColourScheme ucs = new UserColourScheme(type);
 649             colours.put(type, ucs.findColour('A'));
 650           }
 651           sf = new SequenceFeature(type, desc, "", start, end, featureGroup);
 652           if (st.hasMoreTokens())
 653           {
 654             try
 655             {
 656               score = new Float(st.nextToken()).floatValue();
 657               // update colourgradient bounds if allowed to
 658             } catch (NumberFormatException ex)
 659             {
 660               score = 0;
 661             }
 662             sf.setScore(score);
 663           }
 664           if (groupLink != null && removeHTML)
 665           {
 666             sf.addLink(groupLink);
 667             sf.description += "%LINK%";
 668           }
 669           if (typeLink.containsKey(type) && removeHTML)
 670           {
 671             sf.addLink(typeLink.get(type).toString());
 672             sf.description += "%LINK%";
 673           }
 674
 675           parseDescriptionHTML(sf, removeHTML);
 676
 677           seq.addSequenceFeature(sf);
 678
 679           while (seqId != null
 680                   && (seq = align.findName(seq, seqId, false)) != null)
 681           {
 682             seq.addSequenceFeature(new SequenceFeature(sf));
 683           }
 684           // If we got here, its not a GFFFile
 685           GFFFile = false;
 686         }
 687       }
 688       resetMatcher();
 689     } catch (Exception ex)
 690     {
 691       // should report somewhere useful for UI if necessary
 692       warningMessage = ((warningMessage == null) ? "" : warningMessage)
 693               + "Parsing error at\n" + line;
 694       System.out.println("Error parsing feature file: " + ex + "\n" + line);
 695       ex.printStackTrace(System.err);
 696       resetMatcher();
 697       return false;
 698     }
 699
 700     return true;
 701   }
 702
 703   private enum GffPragmas
 704   {
 705     gff_version, sequence_region, feature_ontology, attribute_ontology, source_ontology, species_build, fasta, hash
 706   };
 707
 708   private static Map<String, GffPragmas> GFFPRAGMA;
 709   static
 710   {
 711     GFFPRAGMA = new HashMap<String, GffPragmas>();
 712     GFFPRAGMA.put("sequence-region", GffPragmas.sequence_region);
 713     GFFPRAGMA.put("feature-ontology", GffPragmas.feature_ontology);
 714     GFFPRAGMA.put("#", GffPragmas.hash);
 715     GFFPRAGMA.put("fasta", GffPragmas.fasta);
 716     GFFPRAGMA.put("species-build", GffPragmas.species_build);
 717     GFFPRAGMA.put("source-ontology", GffPragmas.source_ontology);
 718     GFFPRAGMA.put("attribute-ontology", GffPragmas.attribute_ontology);
 719   }
 720
 721   private void processGffPragma(String line, Map<String, String> gffProps,
 722           AlignmentI align, ArrayList<SequenceI> newseqs)
 723           throws IOException
 724   {
 725     // line starts with ##
 726     int spacepos = line.indexOf(' ');
 727     String pragma = spacepos == -1 ? line.substring(2).trim() : line
 728             .substring(2, spacepos);
 729     GffPragmas gffpragma = GFFPRAGMA.get(pragma.toLowerCase());
 730     if (gffpragma == null)
 731     {
 732       return;
 733     }
 734     switch (gffpragma)
 735     {
 736     case gff_version:
 737       try
 738       {
 739         gffversion = Integer.parseInt(line.substring(spacepos + 1));
 740       } finally
 741       {
 742
 743       }
 744       break;
 745     case feature_ontology:
 746       // resolve against specific feature ontology
 747       break;
 748     case attribute_ontology:
 749       // resolve against specific attribute ontology
 750       break;
 751     case source_ontology:
 752       // resolve against specific source ontology
 753       break;
 754     case species_build:
 755       // resolve against specific NCBI taxon version
 756       break;
 757     case hash:
 758       // close off any open feature hierarchies
 759       break;
 760     case fasta:
 761       // process the rest of the file as a fasta file and replace any dummy
 762       // sequence IDs
 763       process_as_fasta(align, newseqs);
 764       break;
 765     default:
 766       // we do nothing ?
 767       System.err.println("Ignoring unknown pragma:\n" + line);
 768     }
 769   }
 770
 771   private void process_as_fasta(AlignmentI align, List<SequenceI> newseqs)
 772           throws IOException
 773   {
 774     try
 775     {
 776       mark();
 777     } catch (IOException q)
 778     {
 779     }
 780     FastaFile parser = new FastaFile(this);
 781     List<SequenceI> includedseqs = parser.getSeqs();
 782     SequenceIdMatcher smatcher = new SequenceIdMatcher(newseqs);
 783     // iterate over includedseqs, and replacing matching ones with newseqs
 784     // sequences. Generic iterator not used here because we modify includedseqs
 785     // as we go
 786     for (int p = 0, pSize = includedseqs.size(); p < pSize; p++)
 787     {
 788       // search for any dummy seqs that this sequence can be used to update
 789       SequenceI dummyseq = smatcher.findIdMatch(includedseqs.get(p));
 790       if (dummyseq != null)
 791       {
 792         // dummyseq was created so it could be annotated and referred to in
 793         // alignments/codon mappings
 794
 795         SequenceI mseq = includedseqs.get(p);
 796         // mseq is the 'template' imported from the FASTA file which we'll use
 797         // to coomplete dummyseq
 798         if (dummyseq instanceof SequenceDummy)
 799         {
 800           // probably have the pattern wrong
 801           // idea is that a flyweight proxy for a sequence ID can be created for
 802           // 1. stable reference creation
 803           // 2. addition of annotation
 804           // 3. future replacement by a real sequence
 805           // current pattern is to create SequenceDummy objects - a convenience
 806           // constructor for a Sequence.
 807           // problem is that when promoted to a real sequence, all references
 808           // need
 809           // to be updated somehow.
 810           ((SequenceDummy) dummyseq).become(mseq);
 811           includedseqs.set(p, dummyseq); // template is no longer needed
 812         }
 813       }
 814     }
 815     // finally add sequences to the dataset
 816     for (SequenceI seq : includedseqs)
 817     {
 818       align.addSequence(seq);
 819     }
 820   }
 821
 822   /**
 823    * take a sequence feature and examine its attributes to decide how it should
 824    * be added to a sequence
 825    *
 826    * @param seq
 827    *          - the destination sequence constructed or discovered in the
 828    *          current context
 829    * @param sf
 830    *          - the base feature with ATTRIBUTES property containing any
 831    *          additional attributes
 832    * @param gFFFile
 833    *          - true if we are processing a GFF annotation file
 834    * @return true if sf was actually added to the sequence, false if it was
 835    *         processed in another way
 836    */
 837   public boolean processOrAddSeqFeature(AlignmentI align, List<SequenceI> newseqs, SequenceI seq, SequenceFeature sf,
 838           boolean gFFFile, boolean relaxedIdMatching)
 839   {
 840     String attr = (String) sf.getValue("ATTRIBUTES");
 841     boolean add = true;
 842     if (gFFFile && attr != null)
 843     {
 844       int nattr=8;
 845
 846       for (String attset : attr.split("\t"))
 847       {
 848         if (attset==null || attset.trim().length()==0)
 849         {
 850           continue;
 851         }
 852         nattr++;
 853         Map<String, List<String>> set = new HashMap<String, List<String>>();
 854         // normally, only expect one column - 9 - in this field
 855         // the attributes (Gff3) or groups (gff2) field
 856         for (String pair : attset.trim().split(";"))
 857         {
 858           pair = pair.trim();
 859           if (pair.length() == 0)
 860           {
 861             continue;
 862           }
 863
 864           // expect either space seperated (gff2) or '=' separated (gff3)
 865           // key/value pairs here
 866
 867           int eqpos = pair.indexOf('='),sppos = pair.indexOf(' ');
 868           String key = null, value = null;
 869
 870           if (sppos > -1 && (eqpos == -1 || sppos < eqpos))
 871           {
 872             key = pair.substring(0, sppos);
 873             value = pair.substring(sppos + 1);
 874           } else {
 875             if (eqpos > -1 && (sppos == -1 || eqpos < sppos))
 876             {
 877               key = pair.substring(0, eqpos);
 878               value = pair.substring(eqpos + 1);
 879             } else
 880             {
 881               key = pair;
 882             }
 883           }
 884           if (key != null)
 885           {
 886             List<String> vals = set.get(key);
 887             if (vals == null)
 888             {
 889               vals = new ArrayList<String>();
 890               set.put(key, vals);
 891             }
 892             if (value != null)
 893             {
 894               vals.add(value.trim());
 895             }
 896           }
 897         }
 898         try
 899         {
 900           add &= processGffKey(set, nattr, seq, sf, align, newseqs,
 901                   relaxedIdMatching); // process decides if
 902                                                      // feature is actually
 903                                                      // added
 904         } catch (InvalidGFF3FieldException ivfe)
 905         {
 906           System.err.println(ivfe);
 907         }
 908       }
 909     }
 910     if (add)
 911     {
 912       seq.addSequenceFeature(sf);
 913     }
 914     return add;
 915   }
 916
 917   public class InvalidGFF3FieldException extends Exception
 918   {
 919     String field, value;
 920
 921     public InvalidGFF3FieldException(String field,
 922             Map<String, List<String>> set, String message)
 923     {
 924       super(message + " (Field was " + field + " and value was "
 925               + set.get(field).toString());
 926       this.field = field;
 927       this.value = set.get(field).toString();
 928     }
 929
 930   }
 931
 932   /**
 933    * take a set of keys for a feature and interpret them
 934    *
 935    * @param set
 936    * @param nattr
 937    * @param seq
 938    * @param sf
 939    * @return
 940    */
 941   public boolean processGffKey(Map<String, List<String>> set, int nattr,
 942           SequenceI seq, SequenceFeature sf, AlignmentI align,
 943           List<SequenceI> newseqs, boolean relaxedIdMatching)
 944           throws InvalidGFF3FieldException
 945   {
 946     String attr;
 947     // decide how to interpret according to type
 948     if (sf.getType().equals("similarity"))
 949     {
 950       int strand = sf.getStrand();
 951       // exonerate cdna/protein map
 952       // look for fields
 953       List<SequenceI> querySeq = findNames(align, newseqs,
 954               relaxedIdMatching, set.get(attr="Query"));
 955       if (querySeq==null || querySeq.size()!=1)
 956       {
 957         throw new InvalidGFF3FieldException( attr, set,
 958                 "Expecting exactly one sequence in Query field (got "
 959                         + set.get(attr) + ")");
 960       }
 961       if (set.containsKey(attr="Align"))
 962       {
 963         // process the align maps and create cdna/protein maps
 964         // ideally, the query sequences are in the alignment, but maybe not...
 965
 966         AlignedCodonFrame alco = new AlignedCodonFrame();
 967         MapList codonmapping = constructCodonMappingFromAlign(set, attr,
 968                 strand);
 969
 970         // add codon mapping, and hope!
 971         alco.addMap(seq, querySeq.get(0), codonmapping);
 972         align.addCodonFrame(alco);
 973         // everything that's needed to be done is done
 974         // no features to create here !
 975         return false;
 976       }
 977
 978     }
 979     return true;
 980   }
 981
 982   private MapList constructCodonMappingFromAlign(
 983           Map<String, List<String>> set,
 984           String attr, int strand) throws InvalidGFF3FieldException
 985   {
 986     if (strand == 0)
 987     {
 988       throw new InvalidGFF3FieldException(attr, set,
 989               "Invalid strand for a codon mapping (cannot be 0)");
 990     }
 991     List<Integer> fromrange = new ArrayList<Integer>(), torange = new ArrayList<Integer>();
 992     int lastppos = 0, lastpframe = 0;
 993     for (String range : set.get(attr))
 994     {
 995       List<Integer> ints = new ArrayList<Integer>();
 996       StringTokenizer st = new StringTokenizer(range, " ");
 997       while (st.hasMoreTokens())
 998       {
 999         String num = st.nextToken();
1000         try
1001         {
1002           ints.add(new Integer(num));
1003         } catch (NumberFormatException nfe)
1004         {
1005           throw new InvalidGFF3FieldException(attr, set,
1006                   "Invalid number in field " + num);
1007         }
1008       }
1009       // Align positionInRef positionInQuery LengthInRef
1010       // contig_1146 exonerate:protein2genome:local similarity 8534 11269
1011       // 3652 - . alignment_id 0 ;
1012       // Query DDB_G0269124
1013       // Align 11270 143 120
1014       // corresponds to : 120 bases align at pos 143 in protein to 11270 on
1015       // dna in strand direction
1016       // Align 11150 187 282
1017       // corresponds to : 282 bases align at pos 187 in protein to 11150 on
1018       // dna in strand direction
1019       //
1020       // Align 10865 281 888
1021       // Align 9977 578 1068
1022       // Align 8909 935 375
1023       //
1024       if (ints.size() != 3)
1025       {
1026         throw new InvalidGFF3FieldException(attr, set,
1027                 "Invalid number of fields for this attribute ("
1028                         + ints.size() + ")");
1029       }
1030       fromrange.add(new Integer(ints.get(0).intValue()));
1031       fromrange.add(new Integer(ints.get(0).intValue() + strand
1032               * ints.get(2).intValue()));
1033       // how are intron/exon boundaries that do not align in codons
1034       // represented
1035       if (ints.get(1).equals(lastppos) && lastpframe > 0)
1036       {
1037         // extend existing to map
1038         lastppos += ints.get(2) / 3;
1039         lastpframe = ints.get(2) % 3;
1040         torange.set(torange.size() - 1, new Integer(lastppos));
1041       }
1042       else
1043       {
1044         // new to map range
1045         torange.add(ints.get(1));
1046         lastppos = ints.get(1) + ints.get(2) / 3;
1047         lastpframe = ints.get(2) % 3;
1048         torange.add(new Integer(lastppos));
1049       }
1050     }
1051     // from and to ranges must end up being a series of start/end intervals
1052     if (fromrange.size() % 2 == 1)
1053     {
1054       throw new InvalidGFF3FieldException(attr, set,
1055               "Couldn't parse the DNA alignment range correctly");
1056     }
1057     if (torange.size() % 2 == 1)
1058     {
1059       throw new InvalidGFF3FieldException(attr, set,
1060               "Couldn't parse the protein alignment range correctly");
1061     }
1062     // finally, build the map
1063     int[] frommap = new int[fromrange.size()], tomap = new int[torange
1064             .size()];
1065     int p = 0;
1066     for (Integer ip : fromrange)
1067     {
1068       frommap[p++] = ip.intValue();
1069     }
1070     p = 0;
1071     for (Integer ip : torange)
1072     {
1073       tomap[p++] = ip.intValue();
1074     }
1075
1076     return new MapList(frommap, tomap, 3, 1);
1077   }
1078
1079   private List<SequenceI> findNames(AlignmentI align,
1080           List<SequenceI> newseqs, boolean relaxedIdMatching,
1081           List<String> list)
1082   {
1083     List<SequenceI> found = new ArrayList<SequenceI>();
1084     for (String seqId : list)
1085     {
1086       SequenceI seq = findName(align, seqId, relaxedIdMatching, newseqs);
1087       if (seq != null)
1088       {
1089         found.add(seq);
1090       }
1091     }
1092     return found;
1093   }
1094
1095   private AlignmentI lastmatchedAl = null;
1096
1097   private SequenceIdMatcher matcher = null;
1098
1099   /**
1100    * clear any temporary handles used to speed up ID matching
1101    */
1102   private void resetMatcher()
1103   {
1104     lastmatchedAl = null;
1105     matcher = null;
1106   }
1107
1108   private SequenceI findName(AlignmentI align, String seqId,
1109           boolean relaxedIdMatching, List<SequenceI> newseqs)
1110   {
1111     SequenceI match = null;
1112     if (relaxedIdMatching)
1113     {
1114       if (lastmatchedAl != align)
1115       {
1116         matcher = new SequenceIdMatcher(
1117                 (lastmatchedAl = align).getSequencesArray());
1118         if (newseqs != null)
1119         {
1120           matcher.addAll(newseqs);
1121         }
1122       }
1123       match = matcher.findIdMatch(seqId);
1124     }
1125     else
1126     {
1127       match = align.findName(seqId, true);
1128       if (match == null && newseqs != null)
1129       {
1130         for (SequenceI m : newseqs)
1131         {
1132           if (seqId.equals(m.getName()))
1133           {
1134             return m;
1135           }
1136         }
1137       }
1138
1139     }
1140     if (match==null && newseqs!=null)
1141     {
1142       match = new SequenceDummy(seqId);
1143       if (relaxedIdMatching)
1144       {
1145         matcher.addAll(Arrays.asList(new SequenceI[]
1146         { match }));
1147       }
1148       // add dummy sequence to the newseqs list
1149       newseqs.add(match);
1150     }
1151     return match;
1152   }
1153   public void parseDescriptionHTML(SequenceFeature sf, boolean removeHTML)
1154   {
1155     if (sf.getDescription() == null)
1156     {
1157       return;
1158     }
1159     jalview.util.ParseHtmlBodyAndLinks parsed = new jalview.util.ParseHtmlBodyAndLinks(
1160             sf.getDescription(), removeHTML, newline);
1161
1162     sf.description = (removeHTML) ? parsed.getNonHtmlContent()
1163             : sf.description;
1164     for (String link : parsed.getLinks())
1165     {
1166       sf.addLink(link);
1167     }
1168
1169   }
1170
1171   /**
1172    * generate a features file for seqs includes non-pos features by default.
1173    *
1174    * @param seqs
1175    *          source of sequence features
1176    * @param visible
1177    *          hash of feature types and colours
1178    * @return features file contents
1179    */
1180   public String printJalviewFormat(SequenceI[] seqs, Map<String,Object> visible)
1181   {
1182     return printJalviewFormat(seqs, visible, true, true);
1183   }
1184
1185   /**
1186    * generate a features file for seqs with colours from visible (if any)
1187    *
1188    * @param seqs
1189    *          source of features
1190    * @param visible
1191    *          hash of Colours for each feature type
1192    * @param visOnly
1193    *          when true only feature types in 'visible' will be output
1194    * @param nonpos
1195    *          indicates if non-positional features should be output (regardless
1196    *          of group or type)
1197    * @return features file contents
1198    */
1199   public String printJalviewFormat(SequenceI[] seqs, Map visible,
1200           boolean visOnly, boolean nonpos)
1201   {
1202     StringBuffer out = new StringBuffer();
1203     SequenceFeature[] next;
1204     boolean featuresGen = false;
1205     if (visOnly && !nonpos && (visible == null || visible.size() < 1))
1206     {
1207       // no point continuing.
1208       return "No Features Visible";
1209     }
1210
1211     if (visible != null && visOnly)
1212     {
1213       // write feature colours only if we're given them and we are generating
1214       // viewed features
1215       // TODO: decide if feature links should also be written here ?
1216       Iterator en = visible.keySet().iterator();
1217       String type, color;
1218       while (en.hasNext())
1219       {
1220         type = en.next().toString();
1221
1222         if (visible.get(type) instanceof GraduatedColor)
1223         {
1224           GraduatedColor gc = (GraduatedColor) visible.get(type);
1225           color = (gc.isColourByLabel() ? "label|" : "")
1226                   + Format.getHexString(gc.getMinColor()) + "|"
1227                   + Format.getHexString(gc.getMaxColor())
1228                   + (gc.isAutoScale() ? "|" : "|abso|") + gc.getMin() + "|"
1229                   + gc.getMax() + "|";
1230           if (gc.getThreshType() != AnnotationColourGradient.NO_THRESHOLD)
1231           {
1232             if (gc.getThreshType() == AnnotationColourGradient.BELOW_THRESHOLD)
1233             {
1234               color += "below";
1235             }
1236             else
1237             {
1238               if (gc.getThreshType() != AnnotationColourGradient.ABOVE_THRESHOLD)
1239               {
1240                 System.err.println("WARNING: Unsupported threshold type ("
1241                         + gc.getThreshType() + ") : Assuming 'above'");
1242               }
1243               color += "above";
1244             }
1245             // add the value
1246             color += "|" + gc.getThresh();
1247           }
1248           else
1249           {
1250             color += "none";
1251           }
1252         }
1253         else if (visible.get(type) instanceof java.awt.Color)
1254         {
1255           color = Format.getHexString((java.awt.Color) visible.get(type));
1256         }
1257         else
1258         {
1259           // legacy support for integer objects containing colour triplet values
1260           color = Format.getHexString(new java.awt.Color(Integer
1261                   .parseInt(visible.get(type).toString())));
1262         }
1263         out.append(type);
1264         out.append("\t");
1265         out.append(color);
1266         out.append(newline);
1267       }
1268     }
1269     // Work out which groups are both present and visible
1270     Vector groups = new Vector();
1271     int groupIndex = 0;
1272     boolean isnonpos = false;
1273
1274     for (int i = 0; i < seqs.length; i++)
1275     {
1276       next = seqs[i].getSequenceFeatures();
1277       if (next != null)
1278       {
1279         for (int j = 0; j < next.length; j++)
1280         {
1281           isnonpos = next[j].begin == 0 && next[j].end == 0;
1282           if ((!nonpos && isnonpos)
1283                   || (!isnonpos && visOnly && !visible
1284                           .containsKey(next[j].type)))
1285           {
1286             continue;
1287           }
1288
1289           if (next[j].featureGroup != null
1290                   && !groups.contains(next[j].featureGroup))
1291           {
1292             groups.addElement(next[j].featureGroup);
1293           }
1294         }
1295       }
1296     }
1297
1298     String group = null;
1299     do
1300     {
1301
1302       if (groups.size() > 0 && groupIndex < groups.size())
1303       {
1304         group = groups.elementAt(groupIndex).toString();
1305         out.append(newline);
1306         out.append("STARTGROUP\t");
1307         out.append(group);
1308         out.append(newline);
1309       }
1310       else
1311       {
1312         group = null;
1313       }
1314
1315       for (int i = 0; i < seqs.length; i++)
1316       {
1317         next = seqs[i].getSequenceFeatures();
1318         if (next != null)
1319         {
1320           for (int j = 0; j < next.length; j++)
1321           {
1322             isnonpos = next[j].begin == 0 && next[j].end == 0;
1323             if ((!nonpos && isnonpos)
1324                     || (!isnonpos && visOnly && !visible
1325                             .containsKey(next[j].type)))
1326             {
1327               // skip if feature is nonpos and we ignore them or if we only
1328               // output visible and it isn't non-pos and it's not visible
1329               continue;
1330             }
1331
1332             if (group != null
1333                     && (next[j].featureGroup == null || !next[j].featureGroup
1334                             .equals(group)))
1335             {
1336               continue;
1337             }
1338
1339             if (group == null && next[j].featureGroup != null)
1340             {
1341               continue;
1342             }
1343             // we have features to output
1344             featuresGen = true;
1345             if (next[j].description == null
1346                     || next[j].description.equals(""))
1347             {
1348               out.append(next[j].type + "\t");
1349             }
1350             else
1351             {
1352               if (next[j].links != null
1353                       && next[j].getDescription().indexOf("<html>") == -1)
1354               {
1355                 out.append("<html>");
1356               }
1357
1358               out.append(next[j].description + " ");
1359               if (next[j].links != null)
1360               {
1361                 for (int l = 0; l < next[j].links.size(); l++)
1362                 {
1363                   String label = next[j].links.elementAt(l).toString();
1364                   String href = label.substring(label.indexOf("|") + 1);
1365                   label = label.substring(0, label.indexOf("|"));
1366
1367                   if (next[j].description.indexOf(href) == -1)
1368                   {
1369                     out.append("<a href=\"" + href + "\">" + label + "</a>");
1370                   }
1371                 }
1372
1373                 if (next[j].getDescription().indexOf("</html>") == -1)
1374                 {
1375                   out.append("</html>");
1376                 }
1377               }
1378
1379               out.append("\t");
1380             }
1381             out.append(seqs[i].getName());
1382             out.append("\t-1\t");
1383             out.append(next[j].begin);
1384             out.append("\t");
1385             out.append(next[j].end);
1386             out.append("\t");
1387             out.append(next[j].type);
1388             if (next[j].score != Float.NaN)
1389             {
1390               out.append("\t");
1391               out.append(next[j].score);
1392             }
1393             out.append(newline);
1394           }
1395         }
1396       }
1397
1398       if (group != null)
1399       {
1400         out.append("ENDGROUP\t");
1401         out.append(group);
1402         out.append(newline);
1403         groupIndex++;
1404       }
1405       else
1406       {
1407         break;
1408       }
1409
1410     } while (groupIndex < groups.size() + 1);
1411
1412     if (!featuresGen)
1413     {
1414       return "No Features Visible";
1415     }
1416
1417     return out.toString();
1418   }
1419
1420   /**
1421    * generate a gff file for sequence features includes non-pos features by
1422    * default.
1423    *
1424    * @param seqs
1425    * @param visible
1426    * @return
1427    */
1428   public String printGFFFormat(SequenceI[] seqs, Map<String,Object> visible)
1429   {
1430     return printGFFFormat(seqs, visible, true, true);
1431   }
1432
1433   public String printGFFFormat(SequenceI[] seqs, Map<String,Object> visible,
1434           boolean visOnly, boolean nonpos)
1435   {
1436     StringBuffer out = new StringBuffer();
1437     SequenceFeature[] next;
1438     String source;
1439     boolean isnonpos;
1440     for (int i = 0; i < seqs.length; i++)
1441     {
1442       if (seqs[i].getSequenceFeatures() != null)
1443       {
1444         next = seqs[i].getSequenceFeatures();
1445         for (int j = 0; j < next.length; j++)
1446         {
1447           isnonpos = next[j].begin == 0 && next[j].end == 0;
1448           if ((!nonpos && isnonpos)
1449                   || (!isnonpos && visOnly && !visible
1450                           .containsKey(next[j].type)))
1451           {
1452             continue;
1453           }
1454
1455           source = next[j].featureGroup;
1456           if (source == null)
1457           {
1458             source = next[j].getDescription();
1459           }
1460
1461           out.append(seqs[i].getName());
1462           out.append("\t");
1463           out.append(source);
1464           out.append("\t");
1465           out.append(next[j].type);
1466           out.append("\t");
1467           out.append(next[j].begin);
1468           out.append("\t");
1469           out.append(next[j].end);
1470           out.append("\t");
1471           out.append(next[j].score);
1472           out.append("\t");
1473
1474           if (next[j].getValue("STRAND") != null)
1475           {
1476             out.append(next[j].getValue("STRAND"));
1477             out.append("\t");
1478           }
1479           else
1480           {
1481             out.append(".\t");
1482           }
1483
1484           if (next[j].getValue("FRAME") != null)
1485           {
1486             out.append(next[j].getValue("FRAME"));
1487           }
1488           else
1489           {
1490             out.append(".");
1491           }
1492           // TODO: verify/check GFF - should there be a /t here before attribute
1493           // output ?
1494
1495           if (next[j].getValue("ATTRIBUTES") != null)
1496           {
1497             out.append(next[j].getValue("ATTRIBUTES"));
1498           }
1499
1500           out.append(newline);
1501
1502         }
1503       }
1504     }
1505
1506     return out.toString();
1507   }
1508
1509   /**
1510    * this is only for the benefit of object polymorphism - method does nothing.
1511    */
1512   public void parse()
1513   {
1514     // IGNORED
1515   }
1516
1517   /**
1518    * this is only for the benefit of object polymorphism - method does nothing.
1519    *
1520    * @return error message
1521    */
1522   public String print()
1523   {
1524     return "USE printGFFFormat() or printJalviewFormat()";
1525   }
1526
1527 }