src/jalview/io/FeaturesFile.java

   1 /*
   2  * Jalview - A Sequence Alignment Editor and Viewer ($$Version-Rel$$)
   3  * Copyright (C) $$Year-Rel$$ The Jalview Authors
   4  *
   5  * This file is part of Jalview.
   6  *
   7  * Jalview is free software: you can redistribute it and/or
   8  * modify it under the terms of the GNU General Public License
   9  * as published by the Free Software Foundation, either version 3
  10  * of the License, or (at your option) any later version.
  11  *
  12  * Jalview is distributed in the hope that it will be useful, but
  13  * WITHOUT ANY WARRANTY; without even the implied warranty
  14  * of MERCHANTABILITY or FITNESS FOR A PARTICULAR
  15  * PURPOSE.  See the GNU General Public License for more details.
  16  *
  17  * You should have received a copy of the GNU General Public License
  18  * along with Jalview.  If not, see <http://www.gnu.org/licenses/>.
  19  * The Jalview Authors are detailed in the 'AUTHORS' file.
  20  */
  21 package jalview.io;
  22
  23 import jalview.analysis.SequenceIdMatcher;
  24 import jalview.datamodel.AlignedCodonFrame;
  25 import jalview.datamodel.AlignmentI;
  26 import jalview.datamodel.SequenceDummy;
  27 import jalview.datamodel.SequenceFeature;
  28 import jalview.datamodel.SequenceI;
  29 import jalview.jsdev.GenericFileAdapter;
  30 import jalview.schemes.AnnotationColourGradient;
  31 import jalview.schemes.GraduatedColor;
  32 import jalview.schemes.UserColourScheme;
  33 import jalview.util.Format;
  34 import jalview.util.MapList;
  35 import jalview.util.ParseHtmlBodyAndLinks;
  36
  37 import java.io.IOException;
  38 import java.util.ArrayList;
  39 import java.util.Arrays;
  40 import java.util.HashMap;
  41 import java.util.Hashtable;
  42 import java.util.Iterator;
  43 import java.util.List;
  44 import java.util.Map;
  45 import java.util.StringTokenizer;
  46 import java.util.Vector;
  47
  48 /**
 * Parses and creates Jalview features files. Detects GFF format features files
 * and parses them. Does not implement the standard print() - call the specific
 * printFeatures or printGFF methods instead. Uses
 * AlignmentI.findSequence(String id) to find the sequence object for the
 * features annotation - this normally works on an exact match.
  53  *
  54  * @author AMW
  55  * @version $Revision$
  56  */
  57 public class FeaturesFile extends AlignFile
  58 {
  /**
   * work around for GFF interpretation bug where source string becomes
   * description rather than a group. When true, parse() also uses a GFF source
   * column that contains no space character as the feature's group.
   */
  private boolean doGffSource = true;

  /**
   * GFF version number; assigned by processGffPragma when it handles the
   * gff_version pragma, otherwise left at the int default of 0.
   */
  private int gffversion;
  66
  /**
   * Creates a new FeaturesFile object. The constructor body is empty, so the
   * only initialisation performed is whatever the implicit no-argument
   * AlignFile constructor does.
   */
  public FeaturesFile()
  {
  }
  73
  /**
   * Creates a FeaturesFile by handing both arguments, unchanged, to the
   * matching AlignFile constructor.
   * 
   * @param inFile
   *          passed straight to AlignFile (presumably the file location or the
   *          pasted data - confirm against AlignFile)
   * @param type
   *          passed straight to AlignFile (presumably the source/protocol type
   *          - confirm against AlignFile)
   * @throws IOException
   *           if the superclass constructor throws it
   */
  public FeaturesFile(String inFile, String type) throws IOException
  {
    super(inFile, type);
  }
  83
  /**
   * Creates a FeaturesFile by handing an existing FileParse to the matching
   * AlignFile constructor.
   * 
   * @param source
   *          passed straight to AlignFile
   * @throws IOException
   *           if the superclass constructor throws it
   */
  public FeaturesFile(FileParse source) throws IOException
  {
    super(source);
  }
  92
  /**
   * Creates a FeaturesFile by handing both arguments, unchanged, to the
   * matching AlignFile constructor.
   * 
   * @param parseImmediately
   *          passed straight to AlignFile (by its name, controls whether the
   *          source is parsed during construction - confirm against AlignFile)
   * @param source
   *          passed straight to AlignFile
   * @throws IOException
   *           if the superclass constructor throws it
   */
  public FeaturesFile(boolean parseImmediately, FileParse source)
          throws IOException
  {
    super(parseImmediately, source);
  }
 103
  /**
   * Creates a FeaturesFile by handing all three arguments, unchanged, to the
   * matching AlignFile constructor.
   * 
   * @param parseImmediately
   *          passed straight to AlignFile (by its name, controls whether the
   *          source is parsed during construction - confirm against AlignFile)
   * @param inFile
   *          passed straight to AlignFile (presumably the file location or the
   *          pasted data - confirm against AlignFile)
   * @param type
   *          passed straight to AlignFile (presumably the source/protocol type
   *          - confirm against AlignFile)
   * @throws IOException
   *           if the superclass constructor throws it
   */
  public FeaturesFile(boolean parseImmediately, String inFile, String type)
          throws IOException
  {
    super(parseImmediately, inFile, type);
  }
 115
 116   /**
 117    * Parse GFF or sequence features file using case-independent matching,
 118    * discarding URLs
 119    *
 120    * @param align
 121    *          - alignment/dataset containing sequences that are to be annotated
 122    * @param colours
 123    *          - hashtable to store feature colour definitions
 124    * @param removeHTML
 125    *          - process html strings into plain text
 126    * @return true if features were added
 127    */
 128   public boolean parse(AlignmentI align, Hashtable colours,
 129           boolean removeHTML)
 130   {
 131     return parse(align, colours, null, removeHTML, false);
 132   }
 133
 134   /**
 135    * Parse GFF or sequence features file optionally using case-independent
 136    * matching, discarding URLs
 137    *
 138    * @param align
 139    *          - alignment/dataset containing sequences that are to be annotated
 140    * @param colours
 141    *          - hashtable to store feature colour definitions
 142    * @param removeHTML
 143    *          - process html strings into plain text
 144    * @param relaxedIdmatching
 145    *          - when true, ID matches to compound sequence IDs are allowed
 146    * @return true if features were added
 147    */
 148   public boolean parse(AlignmentI align, Map colours, boolean removeHTML,
 149           boolean relaxedIdMatching)
 150   {
 151     return parse(align, colours, null, removeHTML, relaxedIdMatching);
 152   }
 153
 154   /**
 155    * Parse GFF or sequence features file optionally using case-independent
 156    * matching
 157    *
 158    * @param align
 159    *          - alignment/dataset containing sequences that are to be annotated
 160    * @param colours
 161    *          - hashtable to store feature colour definitions
 162    * @param featureLink
 163    *          - hashtable to store associated URLs
 164    * @param removeHTML
 165    *          - process html strings into plain text
 166    * @return true if features were added
 167    */
 168   public boolean parse(AlignmentI align, Map colours, Map featureLink,
 169           boolean removeHTML)
 170   {
 171     return parse(align, colours, featureLink, removeHTML, false);
 172   }
 173
  /**
   * Delegates unchanged to the superclass implementation; this class adds no
   * behaviour of its own here.
   * 
   * @param al
   *          passed straight to super.addAnnotations
   */
  @Override
  public void addAnnotations(AlignmentI al)
  {
    // nothing specific to features files - defer to the superclass
    super.addAnnotations(al);
  }
 180
  /**
   * Delegates unchanged to the superclass implementation; this class adds no
   * behaviour of its own here.
   * 
   * @param al
   *          passed straight to super.addProperties
   */
  @Override
  public void addProperties(AlignmentI al)
  {
    // nothing specific to features files - defer to the superclass
    super.addProperties(al);
  }
 187
  /**
   * Delegates unchanged to the superclass implementation; this class adds no
   * behaviour of its own here.
   * 
   * @param al
   *          passed straight to super.addSeqGroups
   */
  @Override
  public void addSeqGroups(AlignmentI al)
  {
    // nothing specific to features files - defer to the superclass
    super.addSeqGroups(al);
  }
 194
 195   /**
 196    * Parse GFF or sequence features file
 197    *
 198    * @param align
 199    *          - alignment/dataset containing sequences that are to be annotated
 200    * @param colours
 201    *          - hashtable to store feature colour definitions
 202    * @param featureLink
 203    *          - hashtable to store associated URLs
 204    * @param removeHTML
 205    *          - process html strings into plain text
 206    * @param relaxedIdmatching
 207    *          - when true, ID matches to compound sequence IDs are allowed
 208    * @return true if features were added
 209    */
 210   public boolean parse(AlignmentI align, Map colours, Map featureLink,
 211           boolean removeHTML, boolean relaxedIdmatching)
 212   {
 213
 214     String line = null;
 215     try
 216     {
 217       SequenceI seq = null;
 218       /**
 219        * keep track of any sequences we try to create from the data if it is a GFF3 file
 220        */
 221       ArrayList<SequenceI> newseqs = new ArrayList<SequenceI>();
 222       String type, desc, token = null;
 223
 224       int index, start, end;
 225       float score;
 226       StringTokenizer st;
 227       SequenceFeature sf;
 228       String featureGroup = null, groupLink = null;
 229       Map typeLink = new Hashtable();
 230       /**
 231        * when true, assume GFF style features rather than Jalview style.
 232        */
 233       boolean GFFFile = true;
 234       Map<String, String> gffProps = new HashMap<String, String>();
 235       while ((line = nextLine()) != null)
 236       {
 237         // skip comments/process pragmas
 238         if (line.startsWith("#"))
 239         {
 240           if (line.startsWith("##"))
 241           {
 242             // possibly GFF2/3 version and metadata header
 243             processGffPragma(line, gffProps, align, newseqs);
 244             line = "";
 245           }
 246           continue;
 247         }
 248
 249         st = new StringTokenizer(line, "\t");
 250         if (st.countTokens() == 1)
 251         {
 252           if (line.trim().equalsIgnoreCase("GFF"))
 253           {
 254             // Start parsing file as if it might be GFF again.
 255             GFFFile = true;
 256             continue;
 257           }
 258         }
 259         if (st.countTokens() > 1 && st.countTokens() < 4)
 260         {
 261           GFFFile = false;
 262           type = st.nextToken();
 263           if (type.equalsIgnoreCase("startgroup"))
 264           {
 265             featureGroup = st.nextToken();
 266             if (st.hasMoreElements())
 267             {
 268               groupLink = st.nextToken();
 269               featureLink.put(featureGroup, groupLink);
 270             }
 271           }
 272           else if (type.equalsIgnoreCase("endgroup"))
 273           {
 274             // We should check whether this is the current group,
 275             // but at present theres no way of showing more than 1 group
 276             st.nextToken();
 277             featureGroup = null;
 278             groupLink = null;
 279           }
 280           else
 281           {
 282             Object colour = null;
 283             String colscheme = st.nextToken();
 284             if (colscheme.indexOf("|") > -1
 285                     || colscheme.trim().equalsIgnoreCase("label"))
 286             {
 287               // Parse '|' separated graduated colourscheme fields:
 288               // [label|][mincolour|maxcolour|[absolute|]minvalue|maxvalue|thresholdtype|thresholdvalue]
 289               // can either provide 'label' only, first is optional, next two
 290               // colors are required (but may be
 291               // left blank), next is optional, nxt two min/max are required.
 292               // first is either 'label'
 293               // first/second and third are both hexadecimal or word equivalent
 294               // colour.
 295               // next two are values parsed as floats.
 296               // fifth is either 'above','below', or 'none'.
 297               // sixth is a float value and only required when fifth is either
 298               // 'above' or 'below'.
 299               StringTokenizer gcol = new StringTokenizer(colscheme, "|",
 300                       true);
 301               // set defaults
 302               int threshtype = AnnotationColourGradient.NO_THRESHOLD;
 303               float min = Float.MIN_VALUE, max = Float.MAX_VALUE, threshval = Float.NaN;
 304               boolean labelCol = false;
 305               // Parse spec line
 306               String mincol = gcol.nextToken();
 307               if (mincol == "|")
 308               {
 309                 System.err
 310                         .println("Expected either 'label' or a colour specification in the line: "
 311                                 + line);
 312                 continue;
 313               }
 314               String maxcol = null;
 315               if (mincol.toLowerCase().indexOf("label") == 0)
 316               {
 317                 labelCol = true;
 318                 mincol = (gcol.hasMoreTokens() ? gcol.nextToken() : null); // skip
 319                                                                            // '|'
 320                 mincol = (gcol.hasMoreTokens() ? gcol.nextToken() : null);
 321               }
 322               String abso = null, minval, maxval;
 323               if (mincol != null)
 324               {
 325                 // at least four more tokens
 326                 if (mincol.equals("|"))
 327                 {
 328                   mincol = "";
 329                 }
 330                 else
 331                 {
 332                   gcol.nextToken(); // skip next '|'
 333                 }
 334                 // continue parsing rest of line
 335                 maxcol = gcol.nextToken();
 336                 if (maxcol.equals("|"))
 337                 {
 338                   maxcol = "";
 339                 }
 340                 else
 341                 {
 342                   gcol.nextToken(); // skip next '|'
 343                 }
 344                 abso = gcol.nextToken();
 345                 gcol.nextToken(); // skip next '|'
 346                 if (abso.toLowerCase().indexOf("abso") != 0)
 347                 {
 348                   minval = abso;
 349                   abso = null;
 350                 }
 351                 else
 352                 {
 353                   minval = gcol.nextToken();
 354                   gcol.nextToken(); // skip next '|'
 355                 }
 356                 maxval = gcol.nextToken();
 357                 if (gcol.hasMoreTokens())
 358                 {
 359                   gcol.nextToken(); // skip next '|'
 360                 }
 361                 try
 362                 {
 363                   if (minval.length() > 0)
 364                   {
 365                     min = new Float(minval).floatValue();
 366                   }
 367                 } catch (Exception e)
 368                 {
 369                   System.err
 370                           .println("Couldn't parse the minimum value for graduated colour for type ("
 371                                   + colscheme
 372                                   + ") - did you misspell 'auto' for the optional automatic colour switch ?");
 373                   e.printStackTrace();
 374                 }
 375                 try
 376                 {
 377                   if (maxval.length() > 0)
 378                   {
 379                     max = new Float(maxval).floatValue();
 380                   }
 381                 } catch (Exception e)
 382                 {
 383                   System.err
 384                           .println("Couldn't parse the maximum value for graduated colour for type ("
 385                                   + colscheme + ")");
 386                   e.printStackTrace();
 387                 }
 388               }
 389               else
 390               {
 391                 // add in some dummy min/max colours for the label-only
 392                 // colourscheme.
 393                 mincol = "FFFFFF";
 394                 maxcol = "000000";
 395               }
 396               try
 397               {
 398                 colour = new GraduatedColor(
 399                         new UserColourScheme(mincol).findColour('A'),
 400                         new UserColourScheme(maxcol).findColour('A'), min,
 401                         max);
 402               } catch (Exception e)
 403               {
 404                 System.err
 405                         .println("Couldn't parse the graduated colour scheme ("
 406                                 + colscheme + ")");
 407                 e.printStackTrace();
 408               }
 409               if (colour != null)
 410               {
 411                 ((GraduatedColor) colour)
 412                         .setColourByLabel(labelCol);
 413                 ((GraduatedColor) colour)
 414                         .setAutoScaled(abso == null);
 415                 // add in any additional parameters
 416                 String ttype = null, tval = null;
 417                 if (gcol.hasMoreTokens())
 418                 {
 419                   // threshold type and possibly a threshold value
 420                   ttype = gcol.nextToken();
 421                   if (ttype.toLowerCase().startsWith("below"))
 422                   {
 423                     ((GraduatedColor) colour)
 424                             .setThreshType(AnnotationColourGradient.BELOW_THRESHOLD);
 425                   }
 426                   else if (ttype.toLowerCase().startsWith("above"))
 427                   {
 428                     ((GraduatedColor) colour)
 429                             .setThreshType(AnnotationColourGradient.ABOVE_THRESHOLD);
 430                   }
 431                   else
 432                   {
 433                     ((GraduatedColor) colour)
 434                             .setThreshType(AnnotationColourGradient.NO_THRESHOLD);
 435                     if (!ttype.toLowerCase().startsWith("no"))
 436                     {
 437                       System.err
 438                               .println("Ignoring unrecognised threshold type : "
 439                                       + ttype);
 440                     }
 441                   }
 442                 }
 443                 if (((GraduatedColor) colour).getThreshType() != AnnotationColourGradient.NO_THRESHOLD)
 444                 {
 445                   try
 446                   {
 447                     gcol.nextToken();
 448                     tval = gcol.nextToken();
 449                     ((GraduatedColor) colour)
 450                             .setThresh(new Float(tval).floatValue());
 451                   } catch (Exception e)
 452                   {
 453                     System.err
 454                             .println("Couldn't parse threshold value as a float: ("
 455                                     + tval + ")");
 456                     e.printStackTrace();
 457                   }
 458                 }
 459                 // parse the thresh-is-min token ?
 460                 if (gcol.hasMoreTokens())
 461                 {
 462                   System.err
 463                           .println("Ignoring additional tokens in parameters in graduated colour specification\n");
 464                   while (gcol.hasMoreTokens())
 465                   {
 466                     System.err.println("|" + gcol.nextToken());
 467                   }
 468                   System.err.println("\n");
 469                 }
 470               }
 471             }
 472             else
 473             {
 474               UserColourScheme ucs = new UserColourScheme(colscheme);
 475               colour = ucs.findColour('A');
 476             }
 477             if (colour != null)
 478             {
 479               colours.put(type, colour);
 480             }
 481             if (st.hasMoreElements())
 482             {
 483               String link = st.nextToken();
 484               typeLink.put(type, link);
 485               if (featureLink == null)
 486               {
 487                 featureLink = new Hashtable();
 488               }
 489               featureLink.put(type, link);
 490             }
 491           }
 492           continue;
 493         }
 494         String seqId = "";
 495         while (st.hasMoreElements())
 496         {
 497
 498           if (GFFFile)
 499           {
 500             // Still possible this is an old Jalview file,
 501             // which does not have type colours at the beginning
 502             seqId = token = st.nextToken();
 503             seq = findName(align, seqId, relaxedIdmatching, newseqs);
 504             if (seq != null)
 505             {
 506               desc = st.nextToken();
 507               String group = null;
 508               if (doGffSource && desc.indexOf(' ') == -1)
 509               {
 510                 // could also be a source term rather than description line
 511                 group = new String(desc);
 512               }
 513               type = st.nextToken();
 514               try
 515               {
 516                 String stt = st.nextToken();
 517                 if (stt.length() == 0 || stt.equals("-"))
 518                 {
 519                   start = 0;
 520                 }
 521                 else
 522                 {
 523                   start = Integer.parseInt(stt);
 524                 }
 525               } catch (NumberFormatException ex)
 526               {
 527                 start = 0;
 528               }
 529               try
 530               {
 531                 String stt = st.nextToken();
 532                 if (stt.length() == 0 || stt.equals("-"))
 533                 {
 534                   end = 0;
 535                 }
 536                 else
 537                 {
 538                   end = Integer.parseInt(stt);
 539                 }
 540               } catch (NumberFormatException ex)
 541               {
 542                 end = 0;
 543               }
 544               // TODO: decide if non positional feature assertion for input data
 545               // where end==0 is generally valid
 546               if (end == 0)
 547               {
 548                 // treat as non-positional feature, regardless.
 549                 start = 0;
 550               }
 551               try
 552               {
 553                 score = new Float(st.nextToken()).floatValue();
 554               } catch (NumberFormatException ex)
 555               {
 556                 score = 0;
 557               }
 558
 559               sf = new SequenceFeature(type, desc, start, end, score, group);
 560
 561               try
 562               {
 563                 sf.setValue("STRAND", st.nextToken());
 564                 sf.setValue("FRAME", st.nextToken());
 565               } catch (Exception ex)
 566               {
 567               }
 568
 569               if (st.hasMoreTokens())
 570               {
 571                 StringBuffer attributes = new StringBuffer();
 572                 boolean sep = false;
 573                 while (st.hasMoreTokens())
 574                 {
 575                   attributes.append((sep ? "\t" : "") + st.nextElement());
 576                   sep = true;
 577                 }
 578                 // TODO validate and split GFF2 attributes field ? parse out
 579                 // ([A-Za-z][A-Za-z0-9_]*) <value> ; and add as
 580                 // sf.setValue(attrib, val);
 581                 sf.setValue("ATTRIBUTES", attributes.toString());
 582               }
 583
 584               if (processOrAddSeqFeature(align, newseqs, seq, sf, GFFFile,
 585                       relaxedIdmatching))
 586               {
 587                 // check whether we should add the sequence feature to any other
 588                 // sequences in the alignment with the same or similar
 589                 while ((seq = align.findName(seq, seqId, true)) != null)
 590                 {
 591                   seq.addSequenceFeature(new SequenceFeature(sf));
 592                 }
 593               }
 594               break;
 595             }
 596           }
 597
 598           if (GFFFile && seq == null)
 599           {
 600             desc = token;
 601           }
 602           else
 603           {
 604             desc = st.nextToken();
 605           }
 606           if (!st.hasMoreTokens())
 607           {
 608             System.err
 609                     .println("DEBUG: Run out of tokens when trying to identify the destination for the feature.. giving up.");
 610             // in all probability, this isn't a file we understand, so bail
 611             // quietly.
 612             return false;
 613           }
 614
 615           token = st.nextToken();
 616
 617           if (!token.equals("ID_NOT_SPECIFIED"))
 618           {
 619             seq = findName(align, seqId = token, relaxedIdmatching, null);
 620             st.nextToken();
 621           }
 622           else
 623           {
 624             seqId = null;
 625             try
 626             {
 627               index = Integer.parseInt(st.nextToken());
 628               seq = align.getSequenceAt(index);
 629             } catch (NumberFormatException ex)
 630             {
 631               seq = null;
 632             }
 633           }
 634
 635           if (seq == null)
 636           {
 637             System.out.println("Sequence not found: " + line);
 638             break;
 639           }
 640
 641           start = Integer.parseInt(st.nextToken());
 642           end = Integer.parseInt(st.nextToken());
 643
 644           type = st.nextToken();
 645
 646           if (!colours.containsKey(type))
 647           {
 648             // Probably the old style groups file
 649             UserColourScheme ucs = new UserColourScheme(type);
 650             colours.put(type, ucs.findColour('A'));
 651           }
 652           sf = new SequenceFeature(type, desc, "", start, end, featureGroup);
 653           if (st.hasMoreTokens())
 654           {
 655             try
 656             {
 657               score = new Float(st.nextToken()).floatValue();
 658               // update colourgradient bounds if allowed to
 659             } catch (NumberFormatException ex)
 660             {
 661               score = 0;
 662             }
 663             sf.setScore(score);
 664           }
 665           if (groupLink != null && removeHTML)
 666           {
 667             sf.addLink(groupLink);
 668             sf.description += "%LINK%";
 669           }
 670           if (typeLink.containsKey(type) && removeHTML)
 671           {
 672             sf.addLink(typeLink.get(type).toString());
 673             sf.description += "%LINK%";
 674           }
 675
 676           parseDescriptionHTML(sf, removeHTML);
 677
 678           seq.addSequenceFeature(sf);
 679
 680           while (seqId != null
 681                   && (seq = align.findName(seq, seqId, false)) != null)
 682           {
 683             seq.addSequenceFeature(new SequenceFeature(sf));
 684           }
 685           // If we got here, its not a GFFFile
 686           GFFFile = false;
 687         }
 688       }
 689       resetMatcher();
 690     } catch (Exception ex)
 691     {
 692       // should report somewhere useful for UI if necessary
 693       warningMessage = ((warningMessage == null) ? "" : warningMessage)
 694               + "Parsing error at\n" + line;
 695       System.out.println("Error parsing feature file: " + ex + "\n" + line);
 696       ex.printStackTrace(System.err);
 697       resetMatcher();
 698       return false;
 699     }
 700
 701     return true;
 702   }
 703
 704   private enum GffPragmas
 705   {
 706     gff_version, sequence_region, feature_ontology, attribute_ontology, source_ontology, species_build, fasta, hash
 707   };
 708
 709   private static Map<String, GffPragmas> GFFPRAGMA;
 710   static
 711   {
 712     GFFPRAGMA = new HashMap<String, GffPragmas>();
 713     GFFPRAGMA.put("sequence-region", GffPragmas.sequence_region);
 714     GFFPRAGMA.put("feature-ontology", GffPragmas.feature_ontology);
 715     GFFPRAGMA.put("#", GffPragmas.hash);
 716     GFFPRAGMA.put("fasta", GffPragmas.fasta);
 717     GFFPRAGMA.put("species-build", GffPragmas.species_build);
 718     GFFPRAGMA.put("source-ontology", GffPragmas.source_ontology);
 719     GFFPRAGMA.put("attribute-ontology", GffPragmas.attribute_ontology);
 720   }
 721
 722   private void processGffPragma(String line, Map<String, String> gffProps,
 723           AlignmentI align, ArrayList<SequenceI> newseqs)
 724           throws IOException
 725   {
 726     // line starts with ##
 727     int spacepos = line.indexOf(' ');
 728     String pragma = spacepos == -1 ? line.substring(2).trim() : line
 729             .substring(2, spacepos);
 730     GffPragmas gffpragma = GFFPRAGMA.get(pragma.toLowerCase());
 731     if (gffpragma == null)
 732     {
 733       return;
 734     }
 735     switch (gffpragma)
 736     {
 737     case gff_version:
 738       try
 739       {
 740         gffversion = Integer.parseInt(line.substring(spacepos + 1));
 741       } finally
 742       {
 743
 744       }
 745       break;
 746     case feature_ontology:
 747       // resolve against specific feature ontology
 748       break;
 749     case attribute_ontology:
 750       // resolve against specific attribute ontology
 751       break;
 752     case source_ontology:
 753       // resolve against specific source ontology
 754       break;
 755     case species_build:
 756       // resolve against specific NCBI taxon version
 757       break;
 758     case hash:
 759       // close off any open feature hierarchies
 760       break;
 761     case fasta:
 762       // process the rest of the file as a fasta file and replace any dummy
 763       // sequence IDs
 764       process_as_fasta(align, newseqs);
 765       break;
 766     default:
 767       // we do nothing ?
 768       System.err.println("Ignoring unknown pragma:\n" + line);
 769     }
 770   }
 771
 772   private void process_as_fasta(AlignmentI align, List<SequenceI> newseqs)
 773           throws IOException
 774   {
 775     try
 776     {
 777       mark();
 778     } catch (IOException q)
 779     {
 780     }
 781     AlignFile parser = GenericFileAdapter.getFile("FastaFile");
 782     List<SequenceI> includedseqs = parser.getSeqs();
 783     SequenceIdMatcher smatcher = new SequenceIdMatcher(newseqs);
 784     // iterate over includedseqs, and replacing matching ones with newseqs
 785     // sequences. Generic iterator not used here because we modify includedseqs
 786     // as we go
 787     for (int p = 0, pSize = includedseqs.size(); p < pSize; p++)
 788     {
 789       // search for any dummy seqs that this sequence can be used to update
 790       SequenceI dummyseq = smatcher.findIdMatch(includedseqs.get(p));
 791       if (dummyseq != null)
 792       {
 793         // dummyseq was created so it could be annotated and referred to in
 794         // alignments/codon mappings
 795
 796         SequenceI mseq = includedseqs.get(p);
 797         // mseq is the 'template' imported from the FASTA file which we'll use
 798         // to coomplete dummyseq
 799         if (dummyseq instanceof SequenceDummy)
 800         {
 801           // probably have the pattern wrong
 802           // idea is that a flyweight proxy for a sequence ID can be created for
 803           // 1. stable reference creation
 804           // 2. addition of annotation
 805           // 3. future replacement by a real sequence
 806           // current pattern is to create SequenceDummy objects - a convenience
 807           // constructor for a Sequence.
 808           // problem is that when promoted to a real sequence, all references
 809           // need
 810           // to be updated somehow.
 811           ((SequenceDummy) dummyseq).become(mseq);
 812           includedseqs.set(p, dummyseq); // template is no longer needed
 813         }
 814       }
 815     }
 816     // finally add sequences to the dataset
 817     for (SequenceI seq : includedseqs)
 818     {
 819       align.addSequence(seq);
 820     }
 821   }
 822
 823   /**
 824    * take a sequence feature and examine its attributes to decide how it should
 825    * be added to a sequence
 826    *
 827    * @param seq
 828    *          - the destination sequence constructed or discovered in the
 829    *          current context
 830    * @param sf
 831    *          - the base feature with ATTRIBUTES property containing any
 832    *          additional attributes
 833    * @param gFFFile
 834    *          - true if we are processing a GFF annotation file
 835    * @return true if sf was actually added to the sequence, false if it was
 836    *         processed in another way
 837    */
 838   public boolean processOrAddSeqFeature(AlignmentI align, List<SequenceI> newseqs, SequenceI seq, SequenceFeature sf,
 839           boolean gFFFile, boolean relaxedIdMatching)
 840   {
 841     String attr = (String) sf.getValue("ATTRIBUTES");
 842     boolean add = true;
 843     if (gFFFile && attr != null)
 844     {
 845       int nattr=8;
 846
 847       for (String attset : attr.split("\t"))
 848       {
 849         if (attset==null || attset.trim().length()==0)
 850         {
 851           continue;
 852         }
 853         nattr++;
 854         Map<String, List<String>> set = new HashMap<String, List<String>>();
 855         // normally, only expect one column - 9 - in this field
 856         // the attributes (Gff3) or groups (gff2) field
 857         for (String pair : attset.trim().split(";"))
 858         {
 859           pair = pair.trim();
 860           if (pair.length() == 0)
 861           {
 862             continue;
 863           }
 864
 865           // expect either space seperated (gff2) or '=' separated (gff3)
 866           // key/value pairs here
 867
 868           int eqpos = pair.indexOf('='),sppos = pair.indexOf(' ');
 869           String key = null, value = null;
 870
 871           if (sppos > -1 && (eqpos == -1 || sppos < eqpos))
 872           {
 873             key = pair.substring(0, sppos);
 874             value = pair.substring(sppos + 1);
 875           } else {
 876             if (eqpos > -1 && (sppos == -1 || eqpos < sppos))
 877             {
 878               key = pair.substring(0, eqpos);
 879               value = pair.substring(eqpos + 1);
 880             } else
 881             {
 882               key = pair;
 883             }
 884           }
 885           if (key != null)
 886           {
 887             List<String> vals = set.get(key);
 888             if (vals == null)
 889             {
 890               vals = new ArrayList<String>();
 891               set.put(key, vals);
 892             }
 893             if (value != null)
 894             {
 895               vals.add(value.trim());
 896             }
 897           }
 898         }
 899         try
 900         {
 901           add &= processGffKey(set, nattr, seq, sf, align, newseqs,
 902                   relaxedIdMatching); // process decides if
 903                                                      // feature is actually
 904                                                      // added
 905         } catch (InvalidGFF3FieldException ivfe)
 906         {
 907           System.err.println(ivfe);
 908         }
 909       }
 910     }
 911     if (add)
 912     {
 913       seq.addSequenceFeature(sf);
 914     }
 915     return add;
 916   }
 917
 /**
  * Reports a GFF attribute field that could not be interpreted. The message
  * is the caller's text followed by the field name and the values held for
  * that field in the attribute set ("null" when the field is absent).
  */
 public class InvalidGFF3FieldException extends Exception
 {
   String field, value;

   /**
    * @param field
    *          name of the offending attribute
    * @param set
    *          attribute set the field was looked up in (may lack the field)
    * @param message
    *          description of the problem
    */
   public InvalidGFF3FieldException(String field,
           Map<String, List<String>> set, String message)
   {
     // String.valueOf copes with a missing field (or set), which is one of
     // the conditions this exception is raised for
     super(message + " (Field was " + field + " and value was "
             + String.valueOf(set == null ? null : set.get(field)) + ")");
     this.field = field;
     this.value = String.valueOf(set == null ? null : set.get(field));
   }

 }
 932
 933   /**
 934    * take a set of keys for a feature and interpret them
 935    *
 936    * @param set
 937    * @param nattr
 938    * @param seq
 939    * @param sf
 940    * @return
 941    */
 942   public boolean processGffKey(Map<String, List<String>> set, int nattr,
 943           SequenceI seq, SequenceFeature sf, AlignmentI align,
 944           List<SequenceI> newseqs, boolean relaxedIdMatching)
 945           throws InvalidGFF3FieldException
 946   {
 947     String attr;
 948     // decide how to interpret according to type
 949     if (sf.getType().equals("similarity"))
 950     {
 951       int strand = sf.getStrand();
 952       // exonerate cdna/protein map
 953       // look for fields
 954       List<SequenceI> querySeq = findNames(align, newseqs,
 955               relaxedIdMatching, set.get(attr="Query"));
 956       if (querySeq==null || querySeq.size()!=1)
 957       {
 958         throw new InvalidGFF3FieldException( attr, set,
 959                 "Expecting exactly one sequence in Query field (got "
 960                         + set.get(attr) + ")");
 961       }
 962       if (set.containsKey(attr="Align"))
 963       {
 964         // process the align maps and create cdna/protein maps
 965         // ideally, the query sequences are in the alignment, but maybe not...
 966
 967         AlignedCodonFrame alco = new AlignedCodonFrame();
 968         MapList codonmapping = constructCodonMappingFromAlign(set, attr,
 969                 strand);
 970
 971         // add codon mapping, and hope!
 972         alco.addMap(seq, querySeq.get(0), codonmapping);
 973         align.addCodonFrame(alco);
 974         // everything that's needed to be done is done
 975         // no features to create here !
 976         return false;
 977       }
 978
 979     }
 980     return true;
 981   }
 982
 983   private MapList constructCodonMappingFromAlign(
 984           Map<String, List<String>> set,
 985           String attr, int strand) throws InvalidGFF3FieldException
 986   {
 987     if (strand == 0)
 988     {
 989       throw new InvalidGFF3FieldException(attr, set,
 990               "Invalid strand for a codon mapping (cannot be 0)");
 991     }
 992     List<Integer> fromrange = new ArrayList<Integer>(), torange = new ArrayList<Integer>();
 993     int lastppos = 0, lastpframe = 0;
 994     for (String range : set.get(attr))
 995     {
 996       List<Integer> ints = new ArrayList<Integer>();
 997       StringTokenizer st = new StringTokenizer(range, " ");
 998       while (st.hasMoreTokens())
 999       {
1000         String num = st.nextToken();
1001         try
1002         {
1003           ints.add(new Integer(num));
1004         } catch (NumberFormatException nfe)
1005         {
1006           throw new InvalidGFF3FieldException(attr, set,
1007                   "Invalid number in field " + num);
1008         }
1009       }
1010       // Align positionInRef positionInQuery LengthInRef
1011       // contig_1146 exonerate:protein2genome:local similarity 8534 11269
1012       // 3652 - . alignment_id 0 ;
1013       // Query DDB_G0269124
1014       // Align 11270 143 120
1015       // corresponds to : 120 bases align at pos 143 in protein to 11270 on
1016       // dna in strand direction
1017       // Align 11150 187 282
1018       // corresponds to : 282 bases align at pos 187 in protein to 11150 on
1019       // dna in strand direction
1020       //
1021       // Align 10865 281 888
1022       // Align 9977 578 1068
1023       // Align 8909 935 375
1024       //
1025       if (ints.size() != 3)
1026       {
1027         throw new InvalidGFF3FieldException(attr, set,
1028                 "Invalid number of fields for this attribute ("
1029                         + ints.size() + ")");
1030       }
1031       fromrange.add(new Integer(ints.get(0).intValue()));
1032       fromrange.add(new Integer(ints.get(0).intValue() + strand
1033               * ints.get(2).intValue()));
1034       // how are intron/exon boundaries that do not align in codons
1035       // represented
1036       if (ints.get(1).equals(lastppos) && lastpframe > 0)
1037       {
1038         // extend existing to map
1039         lastppos += ints.get(2) / 3;
1040         lastpframe = ints.get(2) % 3;
1041         torange.set(torange.size() - 1, new Integer(lastppos));
1042       }
1043       else
1044       {
1045         // new to map range
1046         torange.add(ints.get(1));
1047         lastppos = ints.get(1) + ints.get(2) / 3;
1048         lastpframe = ints.get(2) % 3;
1049         torange.add(new Integer(lastppos));
1050       }
1051     }
1052     // from and to ranges must end up being a series of start/end intervals
1053     if (fromrange.size() % 2 == 1)
1054     {
1055       throw new InvalidGFF3FieldException(attr, set,
1056               "Couldn't parse the DNA alignment range correctly");
1057     }
1058     if (torange.size() % 2 == 1)
1059     {
1060       throw new InvalidGFF3FieldException(attr, set,
1061               "Couldn't parse the protein alignment range correctly");
1062     }
1063     // finally, build the map
1064     int[] frommap = new int[fromrange.size()], tomap = new int[torange
1065             .size()];
1066     int p = 0;
1067     for (Integer ip : fromrange)
1068     {
1069       frommap[p++] = ip.intValue();
1070     }
1071     p = 0;
1072     for (Integer ip : torange)
1073     {
1074       tomap[p++] = ip.intValue();
1075     }
1076
1077     return new MapList(frommap, tomap, 3, 1);
1078   }
1079
1080   private List<SequenceI> findNames(AlignmentI align,
1081           List<SequenceI> newseqs, boolean relaxedIdMatching,
1082           List<String> list)
1083   {
1084     List<SequenceI> found = new ArrayList<SequenceI>();
1085     for (String seqId : list)
1086     {
1087       SequenceI seq = findName(align, seqId, relaxedIdMatching, newseqs);
1088       if (seq != null)
1089       {
1090         found.add(seq);
1091       }
1092     }
1093     return found;
1094   }
1095
1096   private AlignmentI lastmatchedAl = null;
1097
1098   private SequenceIdMatcher matcher = null;
1099
1100   /**
1101    * clear any temporary handles used to speed up ID matching
1102    */
1103   private void resetMatcher()
1104   {
1105     lastmatchedAl = null;
1106     matcher = null;
1107   }
1108
1109   private SequenceI findName(AlignmentI align, String seqId,
1110           boolean relaxedIdMatching, List<SequenceI> newseqs)
1111   {
1112     SequenceI match = null;
1113     if (relaxedIdMatching)
1114     {
1115       if (lastmatchedAl != align)
1116       {
1117         matcher = new SequenceIdMatcher(
1118                 (lastmatchedAl = align).getSequencesArray());
1119         if (newseqs != null)
1120         {
1121           matcher.addAll(newseqs);
1122         }
1123       }
1124       match = matcher.findIdMatch(seqId);
1125     }
1126     else
1127     {
1128       match = align.findName(seqId, true);
1129       if (match == null && newseqs != null)
1130       {
1131         for (SequenceI m : newseqs)
1132         {
1133           if (seqId.equals(m.getName()))
1134           {
1135             return m;
1136           }
1137         }
1138       }
1139
1140     }
1141     if (match==null && newseqs!=null)
1142     {
1143       match = new SequenceDummy(seqId);
1144       if (relaxedIdMatching)
1145       {
1146         matcher.addAll(Arrays.asList(new SequenceI[]
1147         { match }));
1148       }
1149       // add dummy sequence to the newseqs list
1150       newseqs.add(match);
1151     }
1152     return match;
1153   }
1154   public void parseDescriptionHTML(SequenceFeature sf, boolean removeHTML)
1155   {
1156     if (sf.getDescription() == null)
1157     {
1158       return;
1159     }
1160     ParseHtmlBodyAndLinks parsed = new ParseHtmlBodyAndLinks(
1161             sf.getDescription(), removeHTML, newline);
1162
1163     sf.description = (removeHTML) ? parsed.getNonHtmlContent()
1164             : sf.description;
1165     for (String link : parsed.getLinks())
1166     {
1167       sf.addLink(link);
1168     }
1169
1170   }
1171
1172   /**
1173    * generate a features file for seqs includes non-pos features by default.
1174    *
1175    * @param seqs
1176    *          source of sequence features
1177    * @param visible
1178    *          hash of feature types and colours
1179    * @return features file contents
1180    */
1181   public String printJalviewFormat(SequenceI[] seqs, Map<String,Object> visible)
1182   {
1183     return printJalviewFormat(seqs, visible, true, true);
1184   }
1185
1186   /**
1187    * generate a features file for seqs with colours from visible (if any)
1188    *
1189    * @param seqs
1190    *          source of features
1191    * @param visible
1192    *          hash of Colours for each feature type
1193    * @param visOnly
1194    *          when true only feature types in 'visible' will be output
1195    * @param nonpos
1196    *          indicates if non-positional features should be output (regardless
1197    *          of group or type)
1198    * @return features file contents
1199    */
1200   public String printJalviewFormat(SequenceI[] seqs, Map visible,
1201           boolean visOnly, boolean nonpos)
1202   {
1203     StringBuffer out = new StringBuffer();
1204     SequenceFeature[] next;
1205     boolean featuresGen = false;
1206     if (visOnly && !nonpos && (visible == null || visible.size() < 1))
1207     {
1208       // no point continuing.
1209       return "No Features Visible";
1210     }
1211
1212     if (visible != null && visOnly)
1213     {
1214       // write feature colours only if we're given them and we are generating
1215       // viewed features
1216       // TODO: decide if feature links should also be written here ?
1217       Iterator en = visible.keySet().iterator();
1218       String type, color;
1219       while (en.hasNext())
1220       {
1221         type = en.next().toString();
1222
1223         if (visible.get(type) instanceof GraduatedColor)
1224         {
1225           GraduatedColor gc = (GraduatedColor) visible.get(type);
1226           color = (gc.isColourByLabel() ? "label|" : "")
1227                   + Format.getHexString(gc.getMinColor()) + "|"
1228                   + Format.getHexString(gc.getMaxColor())
1229                   + (gc.isAutoScale() ? "|" : "|abso|") + gc.getMin() + "|"
1230                   + gc.getMax() + "|";
1231           if (gc.getThreshType() != AnnotationColourGradient.NO_THRESHOLD)
1232           {
1233             if (gc.getThreshType() == AnnotationColourGradient.BELOW_THRESHOLD)
1234             {
1235               color += "below";
1236             }
1237             else
1238             {
1239               if (gc.getThreshType() != AnnotationColourGradient.ABOVE_THRESHOLD)
1240               {
1241                 System.err.println("WARNING: Unsupported threshold type ("
1242                         + gc.getThreshType() + ") : Assuming 'above'");
1243               }
1244               color += "above";
1245             }
1246             // add the value
1247             color += "|" + gc.getThresh();
1248           }
1249           else
1250           {
1251             color += "none";
1252           }
1253         }
1254         else if (visible.get(type) instanceof java.awt.Color)
1255         {
1256           color = Format.getHexString((java.awt.Color) visible.get(type));
1257         }
1258         else
1259         {
1260           // legacy support for integer objects containing colour triplet values
1261           color = Format.getHexString(new java.awt.Color(Integer
1262                   .parseInt(visible.get(type).toString())));
1263         }
1264         out.append(type);
1265         out.append("\t");
1266         out.append(color);
1267         out.append(newline);
1268       }
1269     }
1270     // Work out which groups are both present and visible
1271     Vector groups = new Vector();
1272     int groupIndex = 0;
1273     boolean isnonpos = false;
1274
1275     for (int i = 0; i < seqs.length; i++)
1276     {
1277       next = seqs[i].getSequenceFeatures();
1278       if (next != null)
1279       {
1280         for (int j = 0; j < next.length; j++)
1281         {
1282           isnonpos = next[j].begin == 0 && next[j].end == 0;
1283           if ((!nonpos && isnonpos)
1284                   || (!isnonpos && visOnly && !visible
1285                           .containsKey(next[j].type)))
1286           {
1287             continue;
1288           }
1289
1290           if (next[j].featureGroup != null
1291                   && !groups.contains(next[j].featureGroup))
1292           {
1293             groups.addElement(next[j].featureGroup);
1294           }
1295         }
1296       }
1297     }
1298
1299     String group = null;
1300     do
1301     {
1302
1303       if (groups.size() > 0 && groupIndex < groups.size())
1304       {
1305         group = groups.elementAt(groupIndex).toString();
1306         out.append(newline);
1307         out.append("STARTGROUP\t");
1308         out.append(group);
1309         out.append(newline);
1310       }
1311       else
1312       {
1313         group = null;
1314       }
1315
1316       for (int i = 0; i < seqs.length; i++)
1317       {
1318         next = seqs[i].getSequenceFeatures();
1319         if (next != null)
1320         {
1321           for (int j = 0; j < next.length; j++)
1322           {
1323             isnonpos = next[j].begin == 0 && next[j].end == 0;
1324             if ((!nonpos && isnonpos)
1325                     || (!isnonpos && visOnly && !visible
1326                             .containsKey(next[j].type)))
1327             {
1328               // skip if feature is nonpos and we ignore them or if we only
1329               // output visible and it isn't non-pos and it's not visible
1330               continue;
1331             }
1332
1333             if (group != null
1334                     && (next[j].featureGroup == null || !next[j].featureGroup
1335                             .equals(group)))
1336             {
1337               continue;
1338             }
1339
1340             if (group == null && next[j].featureGroup != null)
1341             {
1342               continue;
1343             }
1344             // we have features to output
1345             featuresGen = true;
1346             if (next[j].description == null
1347                     || next[j].description.equals(""))
1348             {
1349               out.append(next[j].type + "\t");
1350             }
1351             else
1352             {
1353               if (next[j].links != null
1354                       && next[j].getDescription().indexOf("<html>") == -1)
1355               {
1356                 out.append("<html>");
1357               }
1358
1359               out.append(next[j].description + " ");
1360               if (next[j].links != null)
1361               {
1362                 for (int l = 0; l < next[j].links.size(); l++)
1363                 {
1364                   String label = next[j].links.elementAt(l).toString();
1365                   String href = label.substring(label.indexOf("|") + 1);
1366                   label = label.substring(0, label.indexOf("|"));
1367
1368                   if (next[j].description.indexOf(href) == -1)
1369                   {
1370                     out.append("<a href=\"" + href + "\">" + label + "</a>");
1371                   }
1372                 }
1373
1374                 if (next[j].getDescription().indexOf("</html>") == -1)
1375                 {
1376                   out.append("</html>");
1377                 }
1378               }
1379
1380               out.append("\t");
1381             }
1382             out.append(seqs[i].getName());
1383             out.append("\t-1\t");
1384             out.append("" + next[j].begin);
1385             out.append("\t");
1386             out.append("" + next[j].end);
1387             out.append("\t");
1388             out.append(next[j].type);
1389             if (!Float.isNaN(next[j].score))
1390             {
1391               out.append("\t");
1392               out.append(next[j].score);
1393             }
1394             out.append(newline);
1395           }
1396         }
1397       }
1398
1399       if (group != null)
1400       {
1401         out.append("ENDGROUP\t");
1402         out.append(group);
1403         out.append(newline);
1404         groupIndex++;
1405       }
1406       else
1407       {
1408         break;
1409       }
1410
1411     } while (groupIndex < groups.size() + 1);
1412
1413     if (!featuresGen)
1414     {
1415       return "No Features Visible";
1416     }
1417
1418     return out.toString();
1419   }
1420
1421   /**
1422    * generate a gff file for sequence features includes non-pos features by
1423    * default.
1424    *
1425    * @param seqs
1426    * @param visible
1427    * @return
1428    */
1429   public String printGFFFormat(SequenceI[] seqs, Map<String,Object> visible)
1430   {
1431     return printGFFFormat(seqs, visible, true, true);
1432   }
1433
1434   public String printGFFFormat(SequenceI[] seqs, Map<String,Object> visible,
1435           boolean visOnly, boolean nonpos)
1436   {
1437     StringBuffer out = new StringBuffer();
1438     SequenceFeature[] next;
1439     String source;
1440     boolean isnonpos;
1441     for (int i = 0; i < seqs.length; i++)
1442     {
1443       if (seqs[i].getSequenceFeatures() != null)
1444       {
1445         next = seqs[i].getSequenceFeatures();
1446         for (int j = 0; j < next.length; j++)
1447         {
1448           isnonpos = next[j].begin == 0 && next[j].end == 0;
1449           if ((!nonpos && isnonpos)
1450                   || (!isnonpos && visOnly && !visible
1451                           .containsKey(next[j].type)))
1452           {
1453             continue;
1454           }
1455
1456           source = next[j].featureGroup;
1457           if (source == null)
1458           {
1459             source = next[j].getDescription();
1460           }
1461
1462           out.append(seqs[i].getName());
1463           out.append("\t");
1464           out.append(source);
1465           out.append("\t");
1466           out.append(next[j].type);
1467           out.append("\t");
1468           out.append("" + next[j].begin);
1469           out.append("\t");
1470           out.append("" + next[j].end);
1471           out.append("\t");
1472           out.append(next[j].score);
1473           out.append("\t");
1474
1475           if (next[j].getValue("STRAND") != null)
1476           {
1477             out.append(next[j].getValue("STRAND"));
1478             out.append("\t");
1479           }
1480           else
1481           {
1482             out.append(".\t");
1483           }
1484
1485           if (next[j].getValue("FRAME") != null)
1486           {
1487             out.append(next[j].getValue("FRAME"));
1488           }
1489           else
1490           {
1491             out.append(".");
1492           }
1493           // TODO: verify/check GFF - should there be a /t here before attribute
1494           // output ?
1495
1496           if (next[j].getValue("ATTRIBUTES") != null)
1497           {
1498             out.append(next[j].getValue("ATTRIBUTES"));
1499           }
1500
1501           out.append(newline);
1502
1503         }
1504       }
1505     }
1506
1507     return out.toString();
1508   }
1509
1510   /**
1511    * No-op kept only for the benefit of object polymorphism - method does nothing.
1512    */
1513   public void parse()
1514   {
1515     // intentionally empty
1516   }
1517
1518   /**
1519    * this is only for the benefit of object polymorphism - method does nothing.
1520    *
1521    * @return error message
1522    */
1523   public String print()
1524   {
1525     return "USE printGFFFormat() or printJalviewFormat()";
1526   }
1527
1528 }