src/jalview/io/FeaturesFile.java

   1 /*
   2  * Jalview - A Sequence Alignment Editor and Viewer ($$Version-Rel$$)
   3  * Copyright (C) $$Year-Rel$$ The Jalview Authors
   4  *
   5  * This file is part of Jalview.
   6  *
   7  * Jalview is free software: you can redistribute it and/or
   8  * modify it under the terms of the GNU General Public License
   9  * as published by the Free Software Foundation, either version 3
  10  * of the License, or (at your option) any later version.
  11  *
  12  * Jalview is distributed in the hope that it will be useful, but
  13  * WITHOUT ANY WARRANTY; without even the implied warranty
  14  * of MERCHANTABILITY or FITNESS FOR A PARTICULAR
  15  * PURPOSE.  See the GNU General Public License for more details.
  16  *
  17  * You should have received a copy of the GNU General Public License
  18  * along with Jalview.  If not, see <http://www.gnu.org/licenses/>.
  19  * The Jalview Authors are detailed in the 'AUTHORS' file.
  20  */
  21 package jalview.io;
  22
  23 import jalview.analysis.SequenceIdMatcher;
  24 import jalview.datamodel.AlignedCodonFrame;
  25 import jalview.datamodel.AlignmentI;
  26 import jalview.datamodel.SequenceDummy;
  27 import jalview.datamodel.SequenceFeature;
  28 import jalview.datamodel.SequenceI;
  29 import jalview.schemes.AnnotationColourGradient;
  30 import jalview.schemes.GraduatedColor;
  31 import jalview.schemes.UserColourScheme;
  32 import jalview.util.Format;
  33 import jalview.util.MapList;
  34
  35 import java.io.IOException;
  36 import java.util.ArrayList;
  37 import java.util.Arrays;
  38 import java.util.HashMap;
  39 import java.util.Hashtable;
  40 import java.util.Iterator;
  41 import java.util.List;
  42 import java.util.Map;
  43 import java.util.StringTokenizer;
  44 import java.util.Vector;
  45
/**
 * Parse and create Jalview Features files. Detects GFF format features files
 * and parses them. Does not implement standard print() - call specific
 * printFeatures or printGFF. Uses AlignmentI.findSequence(String id) to find
 * the sequence object for the features annotation - this normally works on an
 * exact match.
 *
 * @author AMW
 * @version $Revision$
 */
  55 public class FeaturesFile extends AlignFile
  56 {
  57   /**
  58    * work around for GFF interpretation bug where source string becomes
  59    * description rather than a group
  60    */
  61   private boolean doGffSource = true;
  62
  /**
   * Creates a new FeaturesFile object without supplying any input source. The
   * body is empty, so only the implicit no-argument superclass constructor
   * runs.
   */
  public FeaturesFile()
  {
  }
  69
  /**
   * Creates a new FeaturesFile object, passing both arguments straight to the
   * two-argument AlignFile constructor.
   * 
   * @param inFile
   *          the input handed to the superclass constructor (presumably a file
   *          name, URL or pasted text, depending on type - TODO confirm against
   *          AlignFile)
   * @param type
   *          the kind of source that inFile refers to, as understood by the
   *          superclass constructor - TODO confirm against AlignFile
   * 
   * @throws IOException
   *           if the superclass constructor throws it
   */
  public FeaturesFile(String inFile, String type) throws IOException
  {
    super(inFile, type);
  }
  85
  /**
   * Creates a new FeaturesFile object from an existing FileParse, which is
   * passed unchanged to the matching AlignFile constructor.
   * 
   * @param source
   *          the FileParse object handed to the superclass constructor
   * @throws IOException
   *           if the superclass constructor throws it
   */
  public FeaturesFile(FileParse source) throws IOException
  {
    super(source);
  }
  90
  91   /**
  92    * Parse GFF or sequence features file using case-independent matching,
  93    * discarding URLs
  94    *
  95    * @param align
  96    *          - alignment/dataset containing sequences that are to be annotated
  97    * @param colours
  98    *          - hashtable to store feature colour definitions
  99    * @param removeHTML
 100    *          - process html strings into plain text
 101    * @return true if features were added
 102    */
 103   public boolean parse(AlignmentI align, Hashtable colours,
 104           boolean removeHTML)
 105   {
 106     return parse(align, colours, null, removeHTML, false);
 107   }
 108
 109   /**
 110    * Parse GFF or sequence features file optionally using case-independent
 111    * matching, discarding URLs
 112    *
 113    * @param align
 114    *          - alignment/dataset containing sequences that are to be annotated
 115    * @param colours
 116    *          - hashtable to store feature colour definitions
 117    * @param removeHTML
 118    *          - process html strings into plain text
 119    * @param relaxedIdmatching
 120    *          - when true, ID matches to compound sequence IDs are allowed
 121    * @return true if features were added
 122    */
 123   public boolean parse(AlignmentI align, Map colours, boolean removeHTML,
 124           boolean relaxedIdMatching)
 125   {
 126     return parse(align, colours, null, removeHTML, relaxedIdMatching);
 127   }
 128
 129   /**
 130    * Parse GFF or sequence features file optionally using case-independent
 131    * matching
 132    *
 133    * @param align
 134    *          - alignment/dataset containing sequences that are to be annotated
 135    * @param colours
 136    *          - hashtable to store feature colour definitions
 137    * @param featureLink
 138    *          - hashtable to store associated URLs
 139    * @param removeHTML
 140    *          - process html strings into plain text
 141    * @return true if features were added
 142    */
 143   public boolean parse(AlignmentI align, Map colours, Map featureLink,
 144           boolean removeHTML)
 145   {
 146     return parse(align, colours, featureLink, removeHTML, false);
 147   }
 148
 149   /**
 150    * Parse GFF or sequence features file
 151    *
 152    * @param align
 153    *          - alignment/dataset containing sequences that are to be annotated
 154    * @param colours
 155    *          - hashtable to store feature colour definitions
 156    * @param featureLink
 157    *          - hashtable to store associated URLs
 158    * @param removeHTML
 159    *          - process html strings into plain text
 160    * @param relaxedIdmatching
 161    *          - when true, ID matches to compound sequence IDs are allowed
 162    * @return true if features were added
 163    */
 164   public boolean parse(AlignmentI align, Map colours, Map featureLink,
 165           boolean removeHTML, boolean relaxedIdmatching)
 166   {
 167
 168     String line = null;
 169     try
 170     {
 171       SequenceI seq = null;
 172       /**
 173        * keep track of any sequences we try to create from the data if it is a GFF3 file
 174        */
 175       ArrayList<SequenceI> newseqs = new ArrayList<SequenceI>();
 176       String type, desc, token = null;
 177
 178       int index, start, end;
 179       float score;
 180       StringTokenizer st;
 181       SequenceFeature sf;
 182       String featureGroup = null, groupLink = null;
 183       Map typeLink = new Hashtable();
 184       /**
 185        * when true, assume GFF style features rather than Jalview style.
 186        */
 187       boolean GFFFile = true;
 188       while ((line = nextLine()) != null)
 189       {
 190         if (line.startsWith("#"))
 191         {
 192           continue;
 193         }
 194
 195         st = new StringTokenizer(line, "\t");
 196         if (st.countTokens() == 1)
 197         {
 198           if (line.trim().equalsIgnoreCase("GFF"))
 199           {
 200             // Start parsing file as if it might be GFF again.
 201             GFFFile = true;
 202             continue;
 203           }
 204         }
 205         if (st.countTokens() > 1 && st.countTokens() < 4)
 206         {
 207           GFFFile = false;
 208           type = st.nextToken();
 209           if (type.equalsIgnoreCase("startgroup"))
 210           {
 211             featureGroup = st.nextToken();
 212             if (st.hasMoreElements())
 213             {
 214               groupLink = st.nextToken();
 215               featureLink.put(featureGroup, groupLink);
 216             }
 217           }
 218           else if (type.equalsIgnoreCase("endgroup"))
 219           {
 220             // We should check whether this is the current group,
 221             // but at present theres no way of showing more than 1 group
 222             st.nextToken();
 223             featureGroup = null;
 224             groupLink = null;
 225           }
 226           else
 227           {
 228             Object colour = null;
 229             String colscheme = st.nextToken();
 230             if (colscheme.indexOf("|") > -1
 231                     || colscheme.trim().equalsIgnoreCase("label"))
 232             {
 233               // Parse '|' separated graduated colourscheme fields:
 234               // [label|][mincolour|maxcolour|[absolute|]minvalue|maxvalue|thresholdtype|thresholdvalue]
 235               // can either provide 'label' only, first is optional, next two
 236               // colors are required (but may be
 237               // left blank), next is optional, nxt two min/max are required.
 238               // first is either 'label'
 239               // first/second and third are both hexadecimal or word equivalent
 240               // colour.
 241               // next two are values parsed as floats.
 242               // fifth is either 'above','below', or 'none'.
 243               // sixth is a float value and only required when fifth is either
 244               // 'above' or 'below'.
 245               StringTokenizer gcol = new StringTokenizer(colscheme, "|",
 246                       true);
 247               // set defaults
 248               int threshtype = AnnotationColourGradient.NO_THRESHOLD;
 249               float min = Float.MIN_VALUE, max = Float.MAX_VALUE, threshval = Float.NaN;
 250               boolean labelCol = false;
 251               // Parse spec line
 252               String mincol = gcol.nextToken();
 253               if (mincol == "|")
 254               {
 255                 System.err
 256                         .println("Expected either 'label' or a colour specification in the line: "
 257                                 + line);
 258                 continue;
 259               }
 260               String maxcol = null;
 261               if (mincol.toLowerCase().indexOf("label") == 0)
 262               {
 263                 labelCol = true;
 264                 mincol = (gcol.hasMoreTokens() ? gcol.nextToken() : null); // skip
 265                                                                            // '|'
 266                 mincol = (gcol.hasMoreTokens() ? gcol.nextToken() : null);
 267               }
 268               String abso = null, minval, maxval;
 269               if (mincol != null)
 270               {
 271                 // at least four more tokens
 272                 if (mincol.equals("|"))
 273                 {
 274                   mincol = "";
 275                 }
 276                 else
 277                 {
 278                   gcol.nextToken(); // skip next '|'
 279                 }
 280                 // continue parsing rest of line
 281                 maxcol = gcol.nextToken();
 282                 if (maxcol.equals("|"))
 283                 {
 284                   maxcol = "";
 285                 }
 286                 else
 287                 {
 288                   gcol.nextToken(); // skip next '|'
 289                 }
 290                 abso = gcol.nextToken();
 291                 gcol.nextToken(); // skip next '|'
 292                 if (abso.toLowerCase().indexOf("abso") != 0)
 293                 {
 294                   minval = abso;
 295                   abso = null;
 296                 }
 297                 else
 298                 {
 299                   minval = gcol.nextToken();
 300                   gcol.nextToken(); // skip next '|'
 301                 }
 302                 maxval = gcol.nextToken();
 303                 if (gcol.hasMoreTokens())
 304                 {
 305                   gcol.nextToken(); // skip next '|'
 306                 }
 307                 try
 308                 {
 309                   if (minval.length() > 0)
 310                   {
 311                     min = new Float(minval).floatValue();
 312                   }
 313                 } catch (Exception e)
 314                 {
 315                   System.err
 316                           .println("Couldn't parse the minimum value for graduated colour for type ("
 317                                   + colscheme
 318                                   + ") - did you misspell 'auto' for the optional automatic colour switch ?");
 319                   e.printStackTrace();
 320                 }
 321                 try
 322                 {
 323                   if (maxval.length() > 0)
 324                   {
 325                     max = new Float(maxval).floatValue();
 326                   }
 327                 } catch (Exception e)
 328                 {
 329                   System.err
 330                           .println("Couldn't parse the maximum value for graduated colour for type ("
 331                                   + colscheme + ")");
 332                   e.printStackTrace();
 333                 }
 334               }
 335               else
 336               {
 337                 // add in some dummy min/max colours for the label-only
 338                 // colourscheme.
 339                 mincol = "FFFFFF";
 340                 maxcol = "000000";
 341               }
 342               try
 343               {
 344                 colour = new jalview.schemes.GraduatedColor(
 345                         new UserColourScheme(mincol).findColour('A'),
 346                         new UserColourScheme(maxcol).findColour('A'), min,
 347                         max);
 348               } catch (Exception e)
 349               {
 350                 System.err
 351                         .println("Couldn't parse the graduated colour scheme ("
 352                                 + colscheme + ")");
 353                 e.printStackTrace();
 354               }
 355               if (colour != null)
 356               {
 357                 ((jalview.schemes.GraduatedColor) colour)
 358                         .setColourByLabel(labelCol);
 359                 ((jalview.schemes.GraduatedColor) colour)
 360                         .setAutoScaled(abso == null);
 361                 // add in any additional parameters
 362                 String ttype = null, tval = null;
 363                 if (gcol.hasMoreTokens())
 364                 {
 365                   // threshold type and possibly a threshold value
 366                   ttype = gcol.nextToken();
 367                   if (ttype.toLowerCase().startsWith("below"))
 368                   {
 369                     ((jalview.schemes.GraduatedColor) colour)
 370                             .setThreshType(AnnotationColourGradient.BELOW_THRESHOLD);
 371                   }
 372                   else if (ttype.toLowerCase().startsWith("above"))
 373                   {
 374                     ((jalview.schemes.GraduatedColor) colour)
 375                             .setThreshType(AnnotationColourGradient.ABOVE_THRESHOLD);
 376                   }
 377                   else
 378                   {
 379                     ((jalview.schemes.GraduatedColor) colour)
 380                             .setThreshType(AnnotationColourGradient.NO_THRESHOLD);
 381                     if (!ttype.toLowerCase().startsWith("no"))
 382                     {
 383                       System.err
 384                               .println("Ignoring unrecognised threshold type : "
 385                                       + ttype);
 386                     }
 387                   }
 388                 }
 389                 if (((GraduatedColor) colour).getThreshType() != AnnotationColourGradient.NO_THRESHOLD)
 390                 {
 391                   try
 392                   {
 393                     gcol.nextToken();
 394                     tval = gcol.nextToken();
 395                     ((jalview.schemes.GraduatedColor) colour)
 396                             .setThresh(new Float(tval).floatValue());
 397                   } catch (Exception e)
 398                   {
 399                     System.err
 400                             .println("Couldn't parse threshold value as a float: ("
 401                                     + tval + ")");
 402                     e.printStackTrace();
 403                   }
 404                 }
 405                 // parse the thresh-is-min token ?
 406                 if (gcol.hasMoreTokens())
 407                 {
 408                   System.err
 409                           .println("Ignoring additional tokens in parameters in graduated colour specification\n");
 410                   while (gcol.hasMoreTokens())
 411                   {
 412                     System.err.println("|" + gcol.nextToken());
 413                   }
 414                   System.err.println("\n");
 415                 }
 416               }
 417             }
 418             else
 419             {
 420               UserColourScheme ucs = new UserColourScheme(colscheme);
 421               colour = ucs.findColour('A');
 422             }
 423             if (colour != null)
 424             {
 425               colours.put(type, colour);
 426             }
 427             if (st.hasMoreElements())
 428             {
 429               String link = st.nextToken();
 430               typeLink.put(type, link);
 431               if (featureLink == null)
 432               {
 433                 featureLink = new Hashtable();
 434               }
 435               featureLink.put(type, link);
 436             }
 437           }
 438           continue;
 439         }
 440         String seqId = "";
 441         while (st.hasMoreElements())
 442         {
 443
 444           if (GFFFile)
 445           {
 446             // Still possible this is an old Jalview file,
 447             // which does not have type colours at the beginning
 448             seqId = token = st.nextToken();
 449             seq = findName(align, seqId, relaxedIdmatching, newseqs);
 450             if (seq != null)
 451             {
 452               desc = st.nextToken();
 453               String group = null;
 454               if (doGffSource && desc.indexOf(' ') == -1)
 455               {
 456                 // could also be a source term rather than description line
 457                 group = new String(desc);
 458               }
 459               type = st.nextToken();
 460               try
 461               {
 462                 String stt = st.nextToken();
 463                 if (stt.length() == 0 || stt.equals("-"))
 464                 {
 465                   start = 0;
 466                 }
 467                 else
 468                 {
 469                   start = Integer.parseInt(stt);
 470                 }
 471               } catch (NumberFormatException ex)
 472               {
 473                 start = 0;
 474               }
 475               try
 476               {
 477                 String stt = st.nextToken();
 478                 if (stt.length() == 0 || stt.equals("-"))
 479                 {
 480                   end = 0;
 481                 }
 482                 else
 483                 {
 484                   end = Integer.parseInt(stt);
 485                 }
 486               } catch (NumberFormatException ex)
 487               {
 488                 end = 0;
 489               }
 490               // TODO: decide if non positional feature assertion for input data
 491               // where end==0 is generally valid
 492               if (end == 0)
 493               {
 494                 // treat as non-positional feature, regardless.
 495                 start = 0;
 496               }
 497               try
 498               {
 499                 score = new Float(st.nextToken()).floatValue();
 500               } catch (NumberFormatException ex)
 501               {
 502                 score = 0;
 503               }
 504
 505               sf = new SequenceFeature(type, desc, start, end, score, group);
 506
 507               try
 508               {
 509                 sf.setValue("STRAND", st.nextToken());
 510                 sf.setValue("FRAME", st.nextToken());
 511               } catch (Exception ex)
 512               {
 513               }
 514
 515               if (st.hasMoreTokens())
 516               {
 517                 StringBuffer attributes = new StringBuffer();
 518                 boolean sep = false;
 519                 while (st.hasMoreTokens())
 520                 {
 521                   attributes.append((sep ? "\t" : "") + st.nextElement());
 522                   sep = true;
 523                 }
 524                 // TODO validate and split GFF2 attributes field ? parse out
 525                 // ([A-Za-z][A-Za-z0-9_]*) <value> ; and add as
 526                 // sf.setValue(attrib, val);
 527                 sf.setValue("ATTRIBUTES", attributes.toString());
 528               }
 529
 530               if (processOrAddSeqFeature(align, newseqs, seq, sf, GFFFile,
 531                       relaxedIdmatching))
 532               {
 533                 // check whether we should add the sequence feature to any other
 534                 // sequences in the alignment with the same or similar
 535                 while ((seq = align.findName(seq, seqId, true)) != null)
 536                 {
 537                   seq.addSequenceFeature(new SequenceFeature(sf));
 538                 }
 539               }
 540               break;
 541             }
 542           }
 543
 544           if (GFFFile && seq == null)
 545           {
 546             desc = token;
 547           }
 548           else
 549           {
 550             desc = st.nextToken();
 551           }
 552           if (!st.hasMoreTokens())
 553           {
 554             System.err
 555                     .println("DEBUG: Run out of tokens when trying to identify the destination for the feature.. giving up.");
 556             // in all probability, this isn't a file we understand, so bail
 557             // quietly.
 558             return false;
 559           }
 560
 561           token = st.nextToken();
 562
 563           if (!token.equals("ID_NOT_SPECIFIED"))
 564           {
 565             seq = findName(align, seqId = token, relaxedIdmatching, null);
 566             st.nextToken();
 567           }
 568           else
 569           {
 570             seqId = null;
 571             try
 572             {
 573               index = Integer.parseInt(st.nextToken());
 574               seq = align.getSequenceAt(index);
 575             } catch (NumberFormatException ex)
 576             {
 577               seq = null;
 578             }
 579           }
 580
 581           if (seq == null)
 582           {
 583             System.out.println("Sequence not found: " + line);
 584             break;
 585           }
 586
 587           start = Integer.parseInt(st.nextToken());
 588           end = Integer.parseInt(st.nextToken());
 589
 590           type = st.nextToken();
 591
 592           if (!colours.containsKey(type))
 593           {
 594             // Probably the old style groups file
 595             UserColourScheme ucs = new UserColourScheme(type);
 596             colours.put(type, ucs.findColour('A'));
 597           }
 598           sf = new SequenceFeature(type, desc, "", start, end, featureGroup);
 599           if (st.hasMoreTokens())
 600           {
 601             try
 602             {
 603               score = new Float(st.nextToken()).floatValue();
 604               // update colourgradient bounds if allowed to
 605             } catch (NumberFormatException ex)
 606             {
 607               score = 0;
 608             }
 609             sf.setScore(score);
 610           }
 611           if (groupLink != null && removeHTML)
 612           {
 613             sf.addLink(groupLink);
 614             sf.description += "%LINK%";
 615           }
 616           if (typeLink.containsKey(type) && removeHTML)
 617           {
 618             sf.addLink(typeLink.get(type).toString());
 619             sf.description += "%LINK%";
 620           }
 621
 622           parseDescriptionHTML(sf, removeHTML);
 623
 624           seq.addSequenceFeature(sf);
 625
 626           while (seqId != null
 627                   && (seq = align.findName(seq, seqId, false)) != null)
 628           {
 629             seq.addSequenceFeature(new SequenceFeature(sf));
 630           }
 631           // If we got here, its not a GFFFile
 632           GFFFile = false;
 633         }
 634       }
 635       resetMatcher();
 636     } catch (Exception ex)
 637     {
 638       System.out.println("Error parsing feature file: " + ex + "\n" + line);
 639       ex.printStackTrace(System.err);
 640       resetMatcher();
 641       return false;
 642     }
 643
 644     return true;
 645   }
 646
 647
 648   /**
 649    * take a sequence feature and examine its attributes to decide how it should
 650    * be added to a sequence
 651    *
 652    * @param seq
 653    *          - the destination sequence constructed or discovered in the
 654    *          current context
 655    * @param sf
 656    *          - the base feature with ATTRIBUTES property containing any
 657    *          additional attributes
 658    * @param gFFFile
 659    *          - true if we are processing a GFF annotation file
 660    * @return true if sf was actually added to the sequence, false if it was
 661    *         processed in another way
 662    */
 663   public boolean processOrAddSeqFeature(AlignmentI align, List<SequenceI> newseqs, SequenceI seq, SequenceFeature sf,
 664           boolean gFFFile, boolean relaxedIdMatching)
 665   {
 666     String attr = (String) sf.getValue("ATTRIBUTES");
 667     boolean add = true;
 668     if (gFFFile && attr != null)
 669     {
 670       int nattr=8;
 671
 672       for (String attset : attr.split("\t"))
 673       {
 674         if (attset==null || attset.trim().length()==0)
 675         {
 676           continue;
 677         }
 678         nattr++;
 679         Map<String, List<String>> set = new HashMap<String, List<String>>();
 680         // normally, only expect one column - 9 - in this field
 681         // the attributes (Gff3) or groups (gff2) field
 682         for (String pair : attset.trim().split(";"))
 683         {
 684           pair = pair.trim();
 685           if (pair.length() == 0)
 686           {
 687             continue;
 688           }
 689
 690           // expect either space seperated (gff2) or '=' separated (gff3)
 691           // key/value pairs here
 692
 693           int eqpos = pair.indexOf('='),sppos = pair.indexOf(' ');
 694           String key = null, value = null;
 695
 696           if (sppos > -1 && (eqpos == -1 || sppos < eqpos))
 697           {
 698             key = pair.substring(0, sppos);
 699             value = pair.substring(sppos + 1);
 700           } else {
 701             if (eqpos > -1 && (sppos == -1 || eqpos < sppos))
 702             {
 703               key = pair.substring(0, eqpos);
 704               value = pair.substring(eqpos + 1);
 705             } else
 706             {
 707               key = pair;
 708             }
 709           }
 710           if (key != null)
 711           {
 712             List<String> vals = set.get(key);
 713             if (vals == null)
 714             {
 715               vals = new ArrayList<String>();
 716               set.put(key, vals);
 717             }
 718             if (value != null)
 719             {
 720               vals.add(value.trim());
 721             }
 722           }
 723         }
 724         try
 725         {
 726           add &= processGffKey(set, nattr, seq, sf, align, newseqs,
 727                   relaxedIdMatching); // process decides if
 728                                                      // feature is actually
 729                                                      // added
 730         } catch (InvalidGFF3FieldException ivfe)
 731         {
 732           System.err.println(ivfe);
 733         }
 734       }
 735     }
 736     if (add)
 737     {
 738       seq.addSequenceFeature(sf);
 739     }
 740     return add;
 741   }
 742
 743   public class InvalidGFF3FieldException extends Exception
 744   {
 745     String field, value;
 746
 747     public InvalidGFF3FieldException(String field,
 748             Map<String, List<String>> set, String message)
 749     {
 750       super(message + " (Field was " + field + " and value was "
 751               + set.get(field).toString());
 752       this.field = field;
 753       this.value = set.get(field).toString();
 754     }
 755
 756   }
 757
 758   /**
 759    * take a set of keys for a feature and interpret them
 760    *
 761    * @param set
 762    * @param nattr
 763    * @param seq
 764    * @param sf
 765    * @return
 766    */
 767   public boolean processGffKey(Map<String, List<String>> set, int nattr,
 768           SequenceI seq, SequenceFeature sf, AlignmentI align,
 769           List<SequenceI> newseqs, boolean relaxedIdMatching)
 770           throws InvalidGFF3FieldException
 771   {
 772     String attr;
 773     // decide how to interpret according to type
 774     if (sf.getType().equals("similarity"))
 775     {
 776       int strand = sf.getStrand();
 777       // exonerate cdna/protein map
 778       // look for fields
 779       List<SequenceI> querySeq = findNames(align, newseqs,
 780               relaxedIdMatching, set.get(attr="Query"));
 781       if (querySeq==null || querySeq.size()!=1)
 782       {
 783         throw new InvalidGFF3FieldException( attr, set,
 784                 "Expecting exactly one sequence in Query field (got "
 785                         + set.get(attr) + ")");
 786       }
 787       if (set.containsKey(attr="Align"))
 788       {
 789         // process the align maps and create cdna/protein maps
 790         // ideally, the query sequences are in the alignment, but maybe not...
 791
 792         AlignedCodonFrame alco = new AlignedCodonFrame();
 793         MapList codonmapping = constructCodonMappingFromAlign(set, attr,
 794                 strand);
 795
 796         // add codon mapping, and hope!
 797         alco.addMap(seq, querySeq.get(0), codonmapping);
 798         align.addCodonFrame(alco);
 799         // everything that's needed to be done is done
 800         // no features to create here !
 801         return false;
 802       }
 803
 804     }
 805     return true;
 806   }
 807
 808   private MapList constructCodonMappingFromAlign(
 809           Map<String, List<String>> set,
 810           String attr, int strand) throws InvalidGFF3FieldException
 811   {
 812     if (strand == 0)
 813     {
 814       throw new InvalidGFF3FieldException(attr, set,
 815               "Invalid strand for a codon mapping (cannot be 0)");
 816     }
 817     List<Integer> fromrange = new ArrayList<Integer>(), torange = new ArrayList<Integer>();
 818     int lastppos = 0, lastpframe = 0;
 819     for (String range : set.get(attr))
 820     {
 821       List<Integer> ints = new ArrayList<Integer>();
 822       StringTokenizer st = new StringTokenizer(range, " ");
 823       while (st.hasMoreTokens())
 824       {
 825         String num = st.nextToken();
 826         try
 827         {
 828           ints.add(new Integer(num));
 829         } catch (NumberFormatException nfe)
 830         {
 831           throw new InvalidGFF3FieldException(attr, set,
 832                   "Invalid number in field " + num);
 833         }
 834       }
 835       // Align positionInRef positionInQuery LengthInRef
 836       // contig_1146 exonerate:protein2genome:local similarity 8534 11269
 837       // 3652 - . alignment_id 0 ;
 838       // Query DDB_G0269124
 839       // Align 11270 143 120
 840       // corresponds to : 120 bases align at pos 143 in protein to 11270 on
 841       // dna in strand direction
 842       // Align 11150 187 282
 843       // corresponds to : 282 bases align at pos 187 in protein to 11150 on
 844       // dna in strand direction
 845       //
 846       // Align 10865 281 888
 847       // Align 9977 578 1068
 848       // Align 8909 935 375
 849       //
 850       if (ints.size() != 3)
 851       {
 852         throw new InvalidGFF3FieldException(attr, set,
 853                 "Invalid number of fields for this attribute ("
 854                         + ints.size() + ")");
 855       }
 856       fromrange.add(new Integer(ints.get(0).intValue()));
 857       fromrange.add(new Integer(ints.get(0).intValue() + strand
 858               * ints.get(2).intValue()));
 859       // how are intron/exon boundaries that do not align in codons
 860       // represented
 861       if (ints.get(1).equals(lastppos) && lastpframe > 0)
 862       {
 863         // extend existing to map
 864         lastppos += ints.get(2) / 3;
 865         lastpframe = ints.get(2) % 3;
 866         torange.set(torange.size() - 1, new Integer(lastppos));
 867       }
 868       else
 869       {
 870         // new to map range
 871         torange.add(ints.get(1));
 872         lastppos = ints.get(1) + ints.get(2) / 3;
 873         lastpframe = ints.get(2) % 3;
 874         torange.add(new Integer(lastppos));
 875       }
 876     }
 877     // from and to ranges must end up being a series of start/end intervals
 878     if (fromrange.size() % 2 == 1)
 879     {
 880       throw new InvalidGFF3FieldException(attr, set,
 881               "Couldn't parse the DNA alignment range correctly");
 882     }
 883     if (torange.size() % 2 == 1)
 884     {
 885       throw new InvalidGFF3FieldException(attr, set,
 886               "Couldn't parse the protein alignment range correctly");
 887     }
 888     // finally, build the map
 889     int[] frommap = new int[fromrange.size()], tomap = new int[torange
 890             .size()];
 891     int p = 0;
 892     for (Integer ip : fromrange)
 893     {
 894       frommap[p++] = ip.intValue();
 895     }
 896     p = 0;
 897     for (Integer ip : torange)
 898     {
 899       tomap[p++] = ip.intValue();
 900     }
 901
 902     return new MapList(frommap, tomap, 3, 1);
 903   }
 904
 905   private List<SequenceI> findNames(AlignmentI align,
 906           List<SequenceI> newseqs, boolean relaxedIdMatching,
 907           List<String> list)
 908   {
 909     List<SequenceI> found = new ArrayList<SequenceI>();
 910     for (String seqId : list)
 911     {
 912       SequenceI seq = findName(align, seqId, relaxedIdMatching, newseqs);
 913       if (seq != null)
 914       {
 915         found.add(seq);
 916       }
 917     }
 918     return found;
 919   }
 920
 921   private AlignmentI lastmatchedAl = null;
 922
 923   private SequenceIdMatcher matcher = null;
 924
 925   /**
 926    * clear any temporary handles used to speed up ID matching
 927    */
 928   private void resetMatcher()
 929   {
 930     lastmatchedAl = null;
 931     matcher = null;
 932   }
 933
 934   private SequenceI findName(AlignmentI align, String seqId,
 935           boolean relaxedIdMatching, List<SequenceI> newseqs)
 936   {
 937     SequenceI match = null;
 938     if (relaxedIdMatching)
 939     {
 940       if (lastmatchedAl != align)
 941       {
 942         matcher = new SequenceIdMatcher(
 943                 (lastmatchedAl = align).getSequencesArray());
 944         if (newseqs != null)
 945         {
 946           matcher.addAll(newseqs);
 947         }
 948       }
 949       match = matcher.findIdMatch(seqId);
 950     }
 951     else
 952     {
 953       match = align.findName(seqId, true);
 954
 955     }
 956     if (match==null && newseqs!=null)
 957     {
 958       match = new SequenceDummy(seqId);
 959       if (relaxedIdMatching)
 960       {
 961         matcher.addAll(Arrays.asList(new SequenceI[]
 962         { match }));
 963       }
 964     }
 965     return match;
 966   }
 967   public void parseDescriptionHTML(SequenceFeature sf, boolean removeHTML)
 968   {
 969     if (sf.getDescription() == null)
 970     {
 971       return;
 972     }
 973     jalview.util.ParseHtmlBodyAndLinks parsed = new jalview.util.ParseHtmlBodyAndLinks(
 974             sf.getDescription(), removeHTML, newline);
 975
 976     sf.description = (removeHTML) ? parsed.getNonHtmlContent()
 977             : sf.description;
 978     for (String link : parsed.getLinks())
 979     {
 980       sf.addLink(link);
 981     }
 982
 983   }
 984
 985   /**
 986    * generate a features file for seqs includes non-pos features by default.
 987    *
 988    * @param seqs
 989    *          source of sequence features
 990    * @param visible
 991    *          hash of feature types and colours
 992    * @return features file contents
 993    */
 994   public String printJalviewFormat(SequenceI[] seqs, Map<String,Object> visible)
 995   {
 996     return printJalviewFormat(seqs, visible, true, true);
 997   }
 998
 999   /**
1000    * generate a features file for seqs with colours from visible (if any)
1001    *
1002    * @param seqs
1003    *          source of features
1004    * @param visible
1005    *          hash of Colours for each feature type
1006    * @param visOnly
1007    *          when true only feature types in 'visible' will be output
1008    * @param nonpos
1009    *          indicates if non-positional features should be output (regardless
1010    *          of group or type)
1011    * @return features file contents
1012    */
1013   public String printJalviewFormat(SequenceI[] seqs, Map visible,
1014           boolean visOnly, boolean nonpos)
1015   {
1016     StringBuffer out = new StringBuffer();
1017     SequenceFeature[] next;
1018     boolean featuresGen = false;
1019     if (visOnly && !nonpos && (visible == null || visible.size() < 1))
1020     {
1021       // no point continuing.
1022       return "No Features Visible";
1023     }
1024
1025     if (visible != null && visOnly)
1026     {
1027       // write feature colours only if we're given them and we are generating
1028       // viewed features
1029       // TODO: decide if feature links should also be written here ?
1030       Iterator en = visible.keySet().iterator();
1031       String type, color;
1032       while (en.hasNext())
1033       {
1034         type = en.next().toString();
1035
1036         if (visible.get(type) instanceof GraduatedColor)
1037         {
1038           GraduatedColor gc = (GraduatedColor) visible.get(type);
1039           color = (gc.isColourByLabel() ? "label|" : "")
1040                   + Format.getHexString(gc.getMinColor()) + "|"
1041                   + Format.getHexString(gc.getMaxColor())
1042                   + (gc.isAutoScale() ? "|" : "|abso|") + gc.getMin() + "|"
1043                   + gc.getMax() + "|";
1044           if (gc.getThreshType() != AnnotationColourGradient.NO_THRESHOLD)
1045           {
1046             if (gc.getThreshType() == AnnotationColourGradient.BELOW_THRESHOLD)
1047             {
1048               color += "below";
1049             }
1050             else
1051             {
1052               if (gc.getThreshType() != AnnotationColourGradient.ABOVE_THRESHOLD)
1053               {
1054                 System.err.println("WARNING: Unsupported threshold type ("
1055                         + gc.getThreshType() + ") : Assuming 'above'");
1056               }
1057               color += "above";
1058             }
1059             // add the value
1060             color += "|" + gc.getThresh();
1061           }
1062           else
1063           {
1064             color += "none";
1065           }
1066         }
1067         else if (visible.get(type) instanceof java.awt.Color)
1068         {
1069           color = Format.getHexString((java.awt.Color) visible.get(type));
1070         }
1071         else
1072         {
1073           // legacy support for integer objects containing colour triplet values
1074           color = Format.getHexString(new java.awt.Color(Integer
1075                   .parseInt(visible.get(type).toString())));
1076         }
1077         out.append(type);
1078         out.append("\t");
1079         out.append(color);
1080         out.append(newline);
1081       }
1082     }
1083     // Work out which groups are both present and visible
1084     Vector groups = new Vector();
1085     int groupIndex = 0;
1086     boolean isnonpos = false;
1087
1088     for (int i = 0; i < seqs.length; i++)
1089     {
1090       next = seqs[i].getSequenceFeatures();
1091       if (next != null)
1092       {
1093         for (int j = 0; j < next.length; j++)
1094         {
1095           isnonpos = next[j].begin == 0 && next[j].end == 0;
1096           if ((!nonpos && isnonpos)
1097                   || (!isnonpos && visOnly && !visible
1098                           .containsKey(next[j].type)))
1099           {
1100             continue;
1101           }
1102
1103           if (next[j].featureGroup != null
1104                   && !groups.contains(next[j].featureGroup))
1105           {
1106             groups.addElement(next[j].featureGroup);
1107           }
1108         }
1109       }
1110     }
1111
1112     String group = null;
1113     do
1114     {
1115
1116       if (groups.size() > 0 && groupIndex < groups.size())
1117       {
1118         group = groups.elementAt(groupIndex).toString();
1119         out.append(newline);
1120         out.append("STARTGROUP\t");
1121         out.append(group);
1122         out.append(newline);
1123       }
1124       else
1125       {
1126         group = null;
1127       }
1128
1129       for (int i = 0; i < seqs.length; i++)
1130       {
1131         next = seqs[i].getSequenceFeatures();
1132         if (next != null)
1133         {
1134           for (int j = 0; j < next.length; j++)
1135           {
1136             isnonpos = next[j].begin == 0 && next[j].end == 0;
1137             if ((!nonpos && isnonpos)
1138                     || (!isnonpos && visOnly && !visible
1139                             .containsKey(next[j].type)))
1140             {
1141               // skip if feature is nonpos and we ignore them or if we only
1142               // output visible and it isn't non-pos and it's not visible
1143               continue;
1144             }
1145
1146             if (group != null
1147                     && (next[j].featureGroup == null || !next[j].featureGroup
1148                             .equals(group)))
1149             {
1150               continue;
1151             }
1152
1153             if (group == null && next[j].featureGroup != null)
1154             {
1155               continue;
1156             }
1157             // we have features to output
1158             featuresGen = true;
1159             if (next[j].description == null
1160                     || next[j].description.equals(""))
1161             {
1162               out.append(next[j].type + "\t");
1163             }
1164             else
1165             {
1166               if (next[j].links != null
1167                       && next[j].getDescription().indexOf("<html>") == -1)
1168               {
1169                 out.append("<html>");
1170               }
1171
1172               out.append(next[j].description + " ");
1173               if (next[j].links != null)
1174               {
1175                 for (int l = 0; l < next[j].links.size(); l++)
1176                 {
1177                   String label = next[j].links.elementAt(l).toString();
1178                   String href = label.substring(label.indexOf("|") + 1);
1179                   label = label.substring(0, label.indexOf("|"));
1180
1181                   if (next[j].description.indexOf(href) == -1)
1182                   {
1183                     out.append("<a href=\"" + href + "\">" + label + "</a>");
1184                   }
1185                 }
1186
1187                 if (next[j].getDescription().indexOf("</html>") == -1)
1188                 {
1189                   out.append("</html>");
1190                 }
1191               }
1192
1193               out.append("\t");
1194             }
1195             out.append(seqs[i].getName());
1196             out.append("\t-1\t");
1197             out.append(next[j].begin);
1198             out.append("\t");
1199             out.append(next[j].end);
1200             out.append("\t");
1201             out.append(next[j].type);
1202             if (next[j].score != Float.NaN)
1203             {
1204               out.append("\t");
1205               out.append(next[j].score);
1206             }
1207             out.append(newline);
1208           }
1209         }
1210       }
1211
1212       if (group != null)
1213       {
1214         out.append("ENDGROUP\t");
1215         out.append(group);
1216         out.append(newline);
1217         groupIndex++;
1218       }
1219       else
1220       {
1221         break;
1222       }
1223
1224     } while (groupIndex < groups.size() + 1);
1225
1226     if (!featuresGen)
1227     {
1228       return "No Features Visible";
1229     }
1230
1231     return out.toString();
1232   }
1233
1234   /**
1235    * generate a gff file for sequence features includes non-pos features by
1236    * default.
1237    *
1238    * @param seqs
1239    * @param visible
1240    * @return
1241    */
1242   public String printGFFFormat(SequenceI[] seqs, Map<String,Object> visible)
1243   {
1244     return printGFFFormat(seqs, visible, true, true);
1245   }
1246
1247   public String printGFFFormat(SequenceI[] seqs, Map<String,Object> visible,
1248           boolean visOnly, boolean nonpos)
1249   {
1250     StringBuffer out = new StringBuffer();
1251     SequenceFeature[] next;
1252     String source;
1253     boolean isnonpos;
1254     for (int i = 0; i < seqs.length; i++)
1255     {
1256       if (seqs[i].getSequenceFeatures() != null)
1257       {
1258         next = seqs[i].getSequenceFeatures();
1259         for (int j = 0; j < next.length; j++)
1260         {
1261           isnonpos = next[j].begin == 0 && next[j].end == 0;
1262           if ((!nonpos && isnonpos)
1263                   || (!isnonpos && visOnly && !visible
1264                           .containsKey(next[j].type)))
1265           {
1266             continue;
1267           }
1268
1269           source = next[j].featureGroup;
1270           if (source == null)
1271           {
1272             source = next[j].getDescription();
1273           }
1274
1275           out.append(seqs[i].getName());
1276           out.append("\t");
1277           out.append(source);
1278           out.append("\t");
1279           out.append(next[j].type);
1280           out.append("\t");
1281           out.append(next[j].begin);
1282           out.append("\t");
1283           out.append(next[j].end);
1284           out.append("\t");
1285           out.append(next[j].score);
1286           out.append("\t");
1287
1288           if (next[j].getValue("STRAND") != null)
1289           {
1290             out.append(next[j].getValue("STRAND"));
1291             out.append("\t");
1292           }
1293           else
1294           {
1295             out.append(".\t");
1296           }
1297
1298           if (next[j].getValue("FRAME") != null)
1299           {
1300             out.append(next[j].getValue("FRAME"));
1301           }
1302           else
1303           {
1304             out.append(".");
1305           }
1306           // TODO: verify/check GFF - should there be a /t here before attribute
1307           // output ?
1308
1309           if (next[j].getValue("ATTRIBUTES") != null)
1310           {
1311             out.append(next[j].getValue("ATTRIBUTES"));
1312           }
1313
1314           out.append(newline);
1315
1316         }
1317       }
1318     }
1319
1320     return out.toString();
1321   }
1322
  /**
   * Intentionally empty: this method exists only for the benefit of object
   * polymorphism. Calling it has no effect.
   */
  public void parse()
  {
    // IGNORED
  }
1330
  /**
   * Present only for the benefit of object polymorphism - no features are
   * printed by this method. It returns a fixed message pointing callers at
   * printGFFFormat() or printJalviewFormat() instead.
   * 
   * @return error message
   */
  public String print()
  {
    // constant hint; never contains any feature data
    return "USE printGFFFormat() or printJalviewFormat()";
  }
1340
1341 }