src/jalview/io/FeaturesFile.java

   1 /*
   2  * Jalview - A Sequence Alignment Editor and Viewer ($$Version-Rel$$)
   3  * Copyright (C) $$Year-Rel$$ The Jalview Authors
   4  *
   5  * This file is part of Jalview.
   6  *
   7  * Jalview is free software: you can redistribute it and/or
   8  * modify it under the terms of the GNU General Public License
   9  * as published by the Free Software Foundation, either version 3
  10  * of the License, or (at your option) any later version.
  11  *
  12  * Jalview is distributed in the hope that it will be useful, but
  13  * WITHOUT ANY WARRANTY; without even the implied warranty
  14  * of MERCHANTABILITY or FITNESS FOR A PARTICULAR
  15  * PURPOSE.  See the GNU General Public License for more details.
  16  *
  17  * You should have received a copy of the GNU General Public License
  18  * along with Jalview.  If not, see <http://www.gnu.org/licenses/>.
  19  * The Jalview Authors are detailed in the 'AUTHORS' file.
  20  */
  21 package jalview.io;
  22
  23 import jalview.analysis.SequenceIdMatcher;
  24 import jalview.api.FeatureColourI;
  25 import jalview.datamodel.AlignedCodonFrame;
  26 import jalview.datamodel.AlignmentI;
  27 import jalview.datamodel.SequenceDummy;
  28 import jalview.datamodel.SequenceFeature;
  29 import jalview.datamodel.SequenceI;
  30 import jalview.schemes.FeatureColour;
  31 import jalview.schemes.UserColourScheme;
  32 import jalview.util.MapList;
  33
  34 import java.io.IOException;
  35 import java.util.ArrayList;
  36 import java.util.Arrays;
  37 import java.util.HashMap;
  38 import java.util.Hashtable;
  39 import java.util.Iterator;
  40 import java.util.List;
  41 import java.util.Map;
  42 import java.util.StringTokenizer;
  43 import java.util.Vector;
  44
  45 /**
  46  * Parse and create Jalview Features files Detects GFF format features files and
  47  * parses. Does not implement standard print() - call specific printFeatures or
  48  * printGFF. Uses AlignmentI.findSequence(String id) to find the sequence object
  49  * for the features annotation - this normally works on an exact match.
  50  *
  51  * @author AMW
  52  * @version $Revision$
  53  */
  54 public class FeaturesFile extends AlignFile
  55 {
  /**
   * work around for GFF interpretation bug where source string becomes
   * description rather than a group. While true, parse() uses the second
   * column of a GFF line as the feature group whenever that column contains no
   * space character.
   */
  private boolean doGffSource = true;

  /**
   * GFF version number; assigned by processGffPragma when it handles the
   * gff_version pragma, and left at the int default otherwise.
   */
  private int gffversion;
  63
  /**
   * Creates a new FeaturesFile object without supplying any input. The body is
   * empty, so only the implicit superclass constructor runs.
   */
  public FeaturesFile()
  {
  }
  70
  /**
   * Creates a FeaturesFile by handing both arguments straight to the
   * superclass constructor.
   *
   * @param inFile
   *          the input passed to the superclass (presumably a file name, URL
   *          or pasted data, interpreted according to type - confirm against
   *          AlignFile)
   * @param type
   *          the input type passed to the superclass
   * @throws IOException
   *           if the superclass constructor throws it
   */
  public FeaturesFile(String inFile, String type) throws IOException
  {
    super(inFile, type);
  }
  80
  /**
   * Creates a FeaturesFile on top of an existing FileParse, which is handed
   * straight to the superclass constructor.
   *
   * @param source
   *          the FileParse passed to the superclass
   * @throws IOException
   *           if the superclass constructor throws it
   */
  public FeaturesFile(FileParse source) throws IOException
  {
    super(source);
  }
  89
  /**
   * Creates a FeaturesFile on top of an existing FileParse; both arguments are
   * handed straight to the superclass constructor.
   *
   * @param parseImmediately
   *          flag passed to the superclass (presumably controls whether the
   *          superclass parses during construction - confirm against
   *          AlignFile)
   * @param source
   *          the FileParse passed to the superclass
   * @throws IOException
   *           if the superclass constructor throws it
   */
  public FeaturesFile(boolean parseImmediately, FileParse source)
          throws IOException
  {
    super(parseImmediately, source);
  }
 100
  /**
   * Creates a FeaturesFile by handing all three arguments straight to the
   * superclass constructor.
   *
   * @param parseImmediately
   *          flag passed to the superclass (presumably controls whether the
   *          superclass parses during construction - confirm against
   *          AlignFile)
   * @param inFile
   *          the input passed to the superclass
   * @param type
   *          the input type passed to the superclass
   * @throws IOException
   *           if the superclass constructor throws it
   */
  public FeaturesFile(boolean parseImmediately, String inFile, String type)
          throws IOException
  {
    super(parseImmediately, inFile, type);
  }
 112
 113   /**
 114    * Parse GFF or sequence features file using case-independent matching,
 115    * discarding URLs
 116    *
 117    * @param align
 118    *          - alignment/dataset containing sequences that are to be annotated
 119    * @param colours
 120    *          - hashtable to store feature colour definitions
 121    * @param removeHTML
 122    *          - process html strings into plain text
 123    * @return true if features were added
 124    */
 125   public boolean parse(AlignmentI align, Map colours, boolean removeHTML)
 126   {
 127     return parse(align, colours, null, removeHTML, false);
 128   }
 129
 130   /**
 131    * Parse GFF or sequence features file optionally using case-independent
 132    * matching, discarding URLs
 133    *
 134    * @param align
 135    *          - alignment/dataset containing sequences that are to be annotated
 136    * @param colours
 137    *          - hashtable to store feature colour definitions
 138    * @param removeHTML
 139    *          - process html strings into plain text
 140    * @param relaxedIdmatching
 141    *          - when true, ID matches to compound sequence IDs are allowed
 142    * @return true if features were added
 143    */
 144   public boolean parse(AlignmentI align, Map colours, boolean removeHTML,
 145           boolean relaxedIdMatching)
 146   {
 147     return parse(align, colours, null, removeHTML, relaxedIdMatching);
 148   }
 149
 150   /**
 151    * Parse GFF or sequence features file optionally using case-independent
 152    * matching
 153    *
 154    * @param align
 155    *          - alignment/dataset containing sequences that are to be annotated
 156    * @param colours
 157    *          - hashtable to store feature colour definitions
 158    * @param featureLink
 159    *          - hashtable to store associated URLs
 160    * @param removeHTML
 161    *          - process html strings into plain text
 162    * @return true if features were added
 163    */
 164   public boolean parse(AlignmentI align, Map colours, Map featureLink,
 165           boolean removeHTML)
 166   {
 167     return parse(align, colours, featureLink, removeHTML, false);
 168   }
 169
  /**
   * Delegates to the superclass implementation; nothing specific to features
   * files is done here.
   *
   * @param al
   *          the alignment passed through to the superclass
   */
  @Override
  public void addAnnotations(AlignmentI al)
  {
    // no FeaturesFile-specific handling: defer entirely to the superclass
    super.addAnnotations(al);
  }
 176
  /**
   * Delegates to the superclass implementation; nothing specific to features
   * files is done here.
   *
   * @param al
   *          the alignment passed through to the superclass
   */
  @Override
  public void addProperties(AlignmentI al)
  {
    // no FeaturesFile-specific handling: defer entirely to the superclass
    super.addProperties(al);
  }
 183
  /**
   * Delegates to the superclass implementation; nothing specific to features
   * files is done here.
   *
   * @param al
   *          the alignment passed through to the superclass
   */
  @Override
  public void addSeqGroups(AlignmentI al)
  {
    // no FeaturesFile-specific handling: defer entirely to the superclass
    super.addSeqGroups(al);
  }
 190
 191   /**
 192    * Parse GFF or sequence features file
 193    *
 194    * @param align
 195    *          - alignment/dataset containing sequences that are to be annotated
 196    * @param colours
 197    *          - hashtable to store feature colour definitions
 198    * @param featureLink
 199    *          - hashtable to store associated URLs
 200    * @param removeHTML
 201    *          - process html strings into plain text
 202    * @param relaxedIdmatching
 203    *          - when true, ID matches to compound sequence IDs are allowed
 204    * @return true if features were added
 205    */
 206   public boolean parse(AlignmentI align, Map colours, Map featureLink,
 207           boolean removeHTML, boolean relaxedIdmatching)
 208   {
 209
 210     String line = null;
 211     try
 212     {
 213       SequenceI seq = null;
 214       /**
 215        * keep track of any sequences we try to create from the data if it is a
 216        * GFF3 file
 217        */
 218       ArrayList<SequenceI> newseqs = new ArrayList<SequenceI>();
 219       String theType, desc, token = null;
 220
 221       StringTokenizer st;
 222       SequenceFeature sf;
 223       String featureGroup = null, groupLink = null;
 224       Map<String, String> typeLink = new Hashtable<String, String>();
 225       /**
 226        * when true, assume GFF style features rather than Jalview style.
 227        */
 228       boolean GFFFile = true;
 229       Map<String, String> gffProps = new HashMap<String, String>();
 230       while ((line = nextLine()) != null)
 231       {
 232         int featureStart, featureEnd;
 233         // skip comments/process pragmas
 234         if (line.startsWith("#"))
 235         {
 236           if (line.startsWith("##"))
 237           {
 238             // possibly GFF2/3 version and metadata header
 239             processGffPragma(line, gffProps, align, newseqs);
 240             line = "";
 241           }
 242           continue;
 243         }
 244
 245         st = new StringTokenizer(line, "\t");
 246         if (st.countTokens() == 1)
 247         {
 248           if (line.trim().equalsIgnoreCase("GFF"))
 249           {
 250             // Start parsing file as if it might be GFF again.
 251             GFFFile = true;
 252             continue;
 253           }
 254         }
 255         if (st.countTokens() > 1 && st.countTokens() < 4)
 256         {
 257           GFFFile = false;
 258           theType = st.nextToken();
 259           if (theType.equalsIgnoreCase("startgroup"))
 260           {
 261             featureGroup = st.nextToken();
 262             if (st.hasMoreElements())
 263             {
 264               groupLink = st.nextToken();
 265               featureLink.put(featureGroup, groupLink);
 266             }
 267           }
 268           else if (theType.equalsIgnoreCase("endgroup"))
 269           {
 270             // We should check whether this is the current group,
 271             // but at present theres no way of showing more than 1 group
 272             st.nextToken();
 273             featureGroup = null;
 274             groupLink = null;
 275           }
 276           else
 277           {
 278             String colscheme = st.nextToken();
 279             try
 280             {
 281               FeatureColourI colour = FeatureColour
 282                       .parseJalviewFeatureColour(colscheme);
 283               if (colour != null)
 284               {
 285                 colours.put(theType, colour);
 286               }
 287               if (st.hasMoreElements())
 288               {
 289                 String link = st.nextToken();
 290                 typeLink.put(theType, link);
 291                 if (featureLink == null)
 292                 {
 293                   featureLink = new Hashtable();
 294                 }
 295                 featureLink.put(theType, link);
 296               }
 297             } catch (IllegalArgumentException e)
 298             {
 299               System.err.println("Error parsing feature colour scheme "
 300                       + colscheme + " : " + e.getMessage());
 301             }
 302           }
 303           continue;
 304         }
 305         String seqId = "";
 306         while (st.hasMoreElements())
 307         {
 308
 309           if (GFFFile)
 310           {
 311             // Still possible this is an old Jalview file,
 312             // which does not have type colours at the beginning
 313             seqId = token = st.nextToken();
 314             seq = findName(align, seqId, relaxedIdmatching, newseqs);
 315             if (seq != null)
 316             {
 317               desc = st.nextToken();
 318               String group = null;
 319               if (doGffSource && desc.indexOf(' ') == -1)
 320               {
 321                 // could also be a source term rather than description line
 322                 group = new String(desc);
 323               }
 324               theType = st.nextToken();
 325               try
 326               {
 327                 String stt = st.nextToken();
 328                 if (stt.length() == 0 || stt.equals("-"))
 329                 {
 330                   featureStart = 0;
 331                 }
 332                 else
 333                 {
 334                   featureStart = Integer.parseInt(stt);
 335                 }
 336               } catch (NumberFormatException ex)
 337               {
 338                 featureStart = 0;
 339               }
 340               try
 341               {
 342                 String stt = st.nextToken();
 343                 if (stt.length() == 0 || stt.equals("-"))
 344                 {
 345                   featureEnd = 0;
 346                 }
 347                 else
 348                 {
 349                   featureEnd = Integer.parseInt(stt);
 350                 }
 351               } catch (NumberFormatException ex)
 352               {
 353                 featureEnd = 0;
 354               }
 355               // TODO: decide if non positional feature assertion for input data
 356               // where end==0 is generally valid
 357               if (featureEnd == 0)
 358               {
 359                 // treat as non-positional feature, regardless.
 360                 featureStart = 0;
 361               }
 362               float score = 0f;
 363               try
 364               {
 365                 score = new Float(st.nextToken()).floatValue();
 366               } catch (NumberFormatException ex)
 367               {
 368                 // ignore
 369               }
 370
 371               sf = new SequenceFeature(theType, desc, featureStart, featureEnd, score, group);
 372
 373               try
 374               {
 375                 sf.setValue("STRAND", st.nextToken());
 376                 sf.setValue("FRAME", st.nextToken());
 377               } catch (Exception ex)
 378               {
 379               }
 380
 381               if (st.hasMoreTokens())
 382               {
 383                 StringBuilder attributes = new StringBuilder();
 384                 boolean sep = false;
 385                 while (st.hasMoreTokens())
 386                 {
 387                   attributes.append(sep ? "\t" : "").append(
 388                           st.nextElement());
 389                   sep = true;
 390                 }
 391                 // TODO validate and split GFF2 attributes field ? parse out
 392                 // ([A-Za-z][A-Za-z0-9_]*) <value> ; and add as
 393                 // sf.setValue(attrib, val);
 394                 sf.setValue("ATTRIBUTES", attributes.toString());
 395               }
 396
 397               if (processOrAddSeqFeature(align, newseqs, seq, sf, GFFFile,
 398                       relaxedIdmatching))
 399               {
 400                 // check whether we should add the sequence feature to any other
 401                 // sequences in the alignment with the same or similar
 402                 while ((seq = align.findName(seq, seqId, true)) != null)
 403                 {
 404                   seq.addSequenceFeature(new SequenceFeature(sf));
 405                 }
 406               }
 407               break;
 408             }
 409           }
 410
 411           if (GFFFile && seq == null)
 412           {
 413             desc = token;
 414           }
 415           else
 416           {
 417             desc = st.nextToken();
 418           }
 419           if (!st.hasMoreTokens())
 420           {
 421             System.err
 422                     .println("DEBUG: Run out of tokens when trying to identify the destination for the feature.. giving up.");
 423             // in all probability, this isn't a file we understand, so bail
 424             // quietly.
 425             return false;
 426           }
 427
 428           token = st.nextToken();
 429
 430           if (!token.equals("ID_NOT_SPECIFIED"))
 431           {
 432             seq = findName(align, seqId = token, relaxedIdmatching, null);
 433             st.nextToken();
 434           }
 435           else
 436           {
 437             seqId = null;
 438             try
 439             {
 440               int idx = Integer.parseInt(st.nextToken());
 441               seq = align.getSequenceAt(idx);
 442             } catch (NumberFormatException ex)
 443             {
 444               seq = null;
 445             }
 446           }
 447
 448           if (seq == null)
 449           {
 450             System.out.println("Sequence not found: " + line);
 451             break;
 452           }
 453
 454           featureStart = Integer.parseInt(st.nextToken());
 455           featureEnd = Integer.parseInt(st.nextToken());
 456
 457           theType = st.nextToken();
 458
 459           if (!colours.containsKey(theType))
 460           {
 461             // Probably the old style groups file
 462             colours.put(
 463                     theType,
 464                     new FeatureColour(UserColourScheme
 465                             .getColourFromString(theType)));
 466           }
 467           sf = new SequenceFeature(theType, desc, "", featureStart, featureEnd, featureGroup);
 468           if (st.hasMoreTokens())
 469           {
 470             float score = 0f;
 471             try
 472             {
 473               score = new Float(st.nextToken()).floatValue();
 474               // update colourgradient bounds if allowed to
 475             } catch (NumberFormatException ex)
 476             {
 477               // ignore
 478             }
 479             sf.setScore(score);
 480           }
 481           if (groupLink != null && removeHTML)
 482           {
 483             sf.addLink(groupLink);
 484             sf.description += "%LINK%";
 485           }
 486           if (typeLink.containsKey(theType) && removeHTML)
 487           {
 488             sf.addLink(typeLink.get(theType));
 489             sf.description += "%LINK%";
 490           }
 491
 492           parseDescriptionHTML(sf, removeHTML);
 493
 494           seq.addSequenceFeature(sf);
 495
 496           while (seqId != null
 497                   && (seq = align.findName(seq, seqId, false)) != null)
 498           {
 499             seq.addSequenceFeature(new SequenceFeature(sf));
 500           }
 501           // If we got here, its not a GFFFile
 502           GFFFile = false;
 503         }
 504       }
 505       resetMatcher();
 506     } catch (Exception ex)
 507     {
 508       // should report somewhere useful for UI if necessary
 509       warningMessage = ((warningMessage == null) ? "" : warningMessage)
 510               + "Parsing error at\n" + line;
 511       System.out.println("Error parsing feature file: " + ex + "\n" + line);
 512       ex.printStackTrace(System.err);
 513       resetMatcher();
 514       return false;
 515     }
 516
 517     return true;
 518   }
 519
 520   private enum GffPragmas
 521   {
 522     gff_version, sequence_region, feature_ontology, attribute_ontology, source_ontology, species_build, fasta, hash
 523   };
 524
 525   private static Map<String, GffPragmas> GFFPRAGMA;
 526   static
 527   {
 528     GFFPRAGMA = new HashMap<String, GffPragmas>();
 529     GFFPRAGMA.put("sequence-region", GffPragmas.sequence_region);
 530     GFFPRAGMA.put("feature-ontology", GffPragmas.feature_ontology);
 531     GFFPRAGMA.put("#", GffPragmas.hash);
 532     GFFPRAGMA.put("fasta", GffPragmas.fasta);
 533     GFFPRAGMA.put("species-build", GffPragmas.species_build);
 534     GFFPRAGMA.put("source-ontology", GffPragmas.source_ontology);
 535     GFFPRAGMA.put("attribute-ontology", GffPragmas.attribute_ontology);
 536   }
 537
 538   private void processGffPragma(String line, Map<String, String> gffProps,
 539           AlignmentI align, ArrayList<SequenceI> newseqs)
 540           throws IOException
 541   {
 542     // line starts with ##
 543     int spacepos = line.indexOf(' ');
 544     String pragma = spacepos == -1 ? line.substring(2).trim() : line
 545             .substring(2, spacepos);
 546     GffPragmas gffpragma = GFFPRAGMA.get(pragma.toLowerCase());
 547     if (gffpragma == null)
 548     {
 549       return;
 550     }
 551     switch (gffpragma)
 552     {
 553     case gff_version:
 554       try
 555       {
 556         gffversion = Integer.parseInt(line.substring(spacepos + 1));
 557       } finally
 558       {
 559
 560       }
 561       break;
 562     case feature_ontology:
 563       // resolve against specific feature ontology
 564       break;
 565     case attribute_ontology:
 566       // resolve against specific attribute ontology
 567       break;
 568     case source_ontology:
 569       // resolve against specific source ontology
 570       break;
 571     case species_build:
 572       // resolve against specific NCBI taxon version
 573       break;
 574     case hash:
 575       // close off any open feature hierarchies
 576       break;
 577     case fasta:
 578       // process the rest of the file as a fasta file and replace any dummy
 579       // sequence IDs
 580       process_as_fasta(align, newseqs);
 581       break;
 582     default:
 583       // we do nothing ?
 584       System.err.println("Ignoring unknown pragma:\n" + line);
 585     }
 586   }
 587
 588   private void process_as_fasta(AlignmentI align, List<SequenceI> newseqs)
 589           throws IOException
 590   {
 591     try
 592     {
 593       mark();
 594     } catch (IOException q)
 595     {
 596     }
 597     FastaFile parser = new FastaFile(this);
 598     List<SequenceI> includedseqs = parser.getSeqs();
 599     SequenceIdMatcher smatcher = new SequenceIdMatcher(newseqs);
 600     // iterate over includedseqs, and replacing matching ones with newseqs
 601     // sequences. Generic iterator not used here because we modify includedseqs
 602     // as we go
 603     for (int p = 0, pSize = includedseqs.size(); p < pSize; p++)
 604     {
 605       // search for any dummy seqs that this sequence can be used to update
 606       SequenceI dummyseq = smatcher.findIdMatch(includedseqs.get(p));
 607       if (dummyseq != null)
 608       {
 609         // dummyseq was created so it could be annotated and referred to in
 610         // alignments/codon mappings
 611
 612         SequenceI mseq = includedseqs.get(p);
 613         // mseq is the 'template' imported from the FASTA file which we'll use
 614         // to coomplete dummyseq
 615         if (dummyseq instanceof SequenceDummy)
 616         {
 617           // probably have the pattern wrong
 618           // idea is that a flyweight proxy for a sequence ID can be created for
 619           // 1. stable reference creation
 620           // 2. addition of annotation
 621           // 3. future replacement by a real sequence
 622           // current pattern is to create SequenceDummy objects - a convenience
 623           // constructor for a Sequence.
 624           // problem is that when promoted to a real sequence, all references
 625           // need
 626           // to be updated somehow.
 627           ((SequenceDummy) dummyseq).become(mseq);
 628           includedseqs.set(p, dummyseq); // template is no longer needed
 629         }
 630       }
 631     }
 632     // finally add sequences to the dataset
 633     for (SequenceI seq : includedseqs)
 634     {
 635       align.addSequence(seq);
 636     }
 637   }
 638
 639   /**
 640    * take a sequence feature and examine its attributes to decide how it should
 641    * be added to a sequence
 642    *
 643    * @param seq
 644    *          - the destination sequence constructed or discovered in the
 645    *          current context
 646    * @param sf
 647    *          - the base feature with ATTRIBUTES property containing any
 648    *          additional attributes
 649    * @param gFFFile
 650    *          - true if we are processing a GFF annotation file
 651    * @return true if sf was actually added to the sequence, false if it was
 652    *         processed in another way
 653    */
 654   public boolean processOrAddSeqFeature(AlignmentI align,
 655           List<SequenceI> newseqs, SequenceI seq, SequenceFeature sf,
 656           boolean gFFFile, boolean relaxedIdMatching)
 657   {
 658     String attr = (String) sf.getValue("ATTRIBUTES");
 659     boolean add = true;
 660     if (gFFFile && attr != null)
 661     {
 662       int nattr = 8;
 663
 664       for (String attset : attr.split("\t"))
 665       {
 666         if (attset == null || attset.trim().length() == 0)
 667         {
 668           continue;
 669         }
 670         nattr++;
 671         Map<String, List<String>> set = new HashMap<String, List<String>>();
 672         // normally, only expect one column - 9 - in this field
 673         // the attributes (Gff3) or groups (gff2) field
 674         for (String pair : attset.trim().split(";"))
 675         {
 676           pair = pair.trim();
 677           if (pair.length() == 0)
 678           {
 679             continue;
 680           }
 681
 682           // expect either space seperated (gff2) or '=' separated (gff3)
 683           // key/value pairs here
 684
 685           int eqpos = pair.indexOf('='), sppos = pair.indexOf(' ');
 686           String key = null, value = null;
 687
 688           if (sppos > -1 && (eqpos == -1 || sppos < eqpos))
 689           {
 690             key = pair.substring(0, sppos);
 691             value = pair.substring(sppos + 1);
 692           }
 693           else
 694           {
 695             if (eqpos > -1 && (sppos == -1 || eqpos < sppos))
 696             {
 697               key = pair.substring(0, eqpos);
 698               value = pair.substring(eqpos + 1);
 699             }
 700             else
 701             {
 702               key = pair;
 703             }
 704           }
 705           if (key != null)
 706           {
 707             List<String> vals = set.get(key);
 708             if (vals == null)
 709             {
 710               vals = new ArrayList<String>();
 711               set.put(key, vals);
 712             }
 713             if (value != null)
 714             {
 715               vals.add(value.trim());
 716             }
 717           }
 718         }
 719         try
 720         {
 721           add &= processGffKey(set, nattr, seq, sf, align, newseqs,
 722                   relaxedIdMatching); // process decides if
 723                                       // feature is actually
 724                                       // added
 725         } catch (InvalidGFF3FieldException ivfe)
 726         {
 727           System.err.println(ivfe);
 728         }
 729       }
 730     }
 731     if (add)
 732     {
 733       seq.addSequenceFeature(sf);
 734     }
 735     return add;
 736   }
 737
 738   public class InvalidGFF3FieldException extends Exception
 739   {
 740     String field, value;
 741
 742     public InvalidGFF3FieldException(String field,
 743             Map<String, List<String>> set, String message)
 744     {
 745       super(message + " (Field was " + field + " and value was "
 746               + set.get(field).toString());
 747       this.field = field;
 748       this.value = set.get(field).toString();
 749     }
 750
 751   }
 752
 753   /**
 754    * take a set of keys for a feature and interpret them
 755    *
 756    * @param set
 757    * @param nattr
 758    * @param seq
 759    * @param sf
 760    * @return
 761    */
 762   public boolean processGffKey(Map<String, List<String>> set, int nattr,
 763           SequenceI seq, SequenceFeature sf, AlignmentI align,
 764           List<SequenceI> newseqs, boolean relaxedIdMatching)
 765           throws InvalidGFF3FieldException
 766   {
 767     String attr;
 768     // decide how to interpret according to type
 769     if (sf.getType().equals("similarity"))
 770     {
 771       int strand = sf.getStrand();
 772       // exonerate cdna/protein map
 773       // look for fields
 774       List<SequenceI> querySeq = findNames(align, newseqs,
 775               relaxedIdMatching, set.get(attr = "Query"));
 776       if (querySeq == null || querySeq.size() != 1)
 777       {
 778         throw new InvalidGFF3FieldException(attr, set,
 779                 "Expecting exactly one sequence in Query field (got "
 780                         + set.get(attr) + ")");
 781       }
 782       if (set.containsKey(attr = "Align"))
 783       {
 784         // process the align maps and create cdna/protein maps
 785         // ideally, the query sequences are in the alignment, but maybe not...
 786
 787         AlignedCodonFrame alco = new AlignedCodonFrame();
 788         MapList codonmapping = constructCodonMappingFromAlign(set, attr,
 789                 strand);
 790
 791         // add codon mapping, and hope!
 792         alco.addMap(seq, querySeq.get(0), codonmapping);
 793         align.addCodonFrame(alco);
 794         // everything that's needed to be done is done
 795         // no features to create here !
 796         return false;
 797       }
 798
 799     }
 800     return true;
 801   }
 802
 803   private MapList constructCodonMappingFromAlign(
 804           Map<String, List<String>> set, String attr, int strand)
 805           throws InvalidGFF3FieldException
 806   {
 807     if (strand == 0)
 808     {
 809       throw new InvalidGFF3FieldException(attr, set,
 810               "Invalid strand for a codon mapping (cannot be 0)");
 811     }
 812     List<Integer> fromrange = new ArrayList<Integer>(), torange = new ArrayList<Integer>();
 813     int lastppos = 0, lastpframe = 0;
 814     for (String range : set.get(attr))
 815     {
 816       List<Integer> ints = new ArrayList<Integer>();
 817       StringTokenizer st = new StringTokenizer(range, " ");
 818       while (st.hasMoreTokens())
 819       {
 820         String num = st.nextToken();
 821         try
 822         {
 823           ints.add(new Integer(num));
 824         } catch (NumberFormatException nfe)
 825         {
 826           throw new InvalidGFF3FieldException(attr, set,
 827                   "Invalid number in field " + num);
 828         }
 829       }
 830       // Align positionInRef positionInQuery LengthInRef
 831       // contig_1146 exonerate:protein2genome:local similarity 8534 11269
 832       // 3652 - . alignment_id 0 ;
 833       // Query DDB_G0269124
 834       // Align 11270 143 120
 835       // corresponds to : 120 bases align at pos 143 in protein to 11270 on
 836       // dna in strand direction
 837       // Align 11150 187 282
 838       // corresponds to : 282 bases align at pos 187 in protein to 11150 on
 839       // dna in strand direction
 840       //
 841       // Align 10865 281 888
 842       // Align 9977 578 1068
 843       // Align 8909 935 375
 844       //
 845       if (ints.size() != 3)
 846       {
 847         throw new InvalidGFF3FieldException(attr, set,
 848                 "Invalid number of fields for this attribute ("
 849                         + ints.size() + ")");
 850       }
 851       fromrange.add(new Integer(ints.get(0).intValue()));
 852       fromrange.add(new Integer(ints.get(0).intValue() + strand
 853               * ints.get(2).intValue()));
 854       // how are intron/exon boundaries that do not align in codons
 855       // represented
 856       if (ints.get(1).equals(lastppos) && lastpframe > 0)
 857       {
 858         // extend existing to map
 859         lastppos += ints.get(2) / 3;
 860         lastpframe = ints.get(2) % 3;
 861         torange.set(torange.size() - 1, new Integer(lastppos));
 862       }
 863       else
 864       {
 865         // new to map range
 866         torange.add(ints.get(1));
 867         lastppos = ints.get(1) + ints.get(2) / 3;
 868         lastpframe = ints.get(2) % 3;
 869         torange.add(new Integer(lastppos));
 870       }
 871     }
 872     // from and to ranges must end up being a series of start/end intervals
 873     if (fromrange.size() % 2 == 1)
 874     {
 875       throw new InvalidGFF3FieldException(attr, set,
 876               "Couldn't parse the DNA alignment range correctly");
 877     }
 878     if (torange.size() % 2 == 1)
 879     {
 880       throw new InvalidGFF3FieldException(attr, set,
 881               "Couldn't parse the protein alignment range correctly");
 882     }
 883     // finally, build the map
 884     int[] frommap = new int[fromrange.size()], tomap = new int[torange
 885             .size()];
 886     int p = 0;
 887     for (Integer ip : fromrange)
 888     {
 889       frommap[p++] = ip.intValue();
 890     }
 891     p = 0;
 892     for (Integer ip : torange)
 893     {
 894       tomap[p++] = ip.intValue();
 895     }
 896
 897     return new MapList(frommap, tomap, 3, 1);
 898   }
 899
 900   private List<SequenceI> findNames(AlignmentI align,
 901           List<SequenceI> newseqs, boolean relaxedIdMatching,
 902           List<String> list)
 903   {
 904     List<SequenceI> found = new ArrayList<SequenceI>();
 905     for (String seqId : list)
 906     {
 907       SequenceI seq = findName(align, seqId, relaxedIdMatching, newseqs);
 908       if (seq != null)
 909       {
 910         found.add(seq);
 911       }
 912     }
 913     return found;
 914   }
 915
 916   private AlignmentI lastmatchedAl = null;
 917
 918   private SequenceIdMatcher matcher = null;
 919
 920   /**
 921    * clear any temporary handles used to speed up ID matching
 922    */
 923   private void resetMatcher()
 924   {
 925     lastmatchedAl = null;
 926     matcher = null;
 927   }
 928
 929   private SequenceI findName(AlignmentI align, String seqId,
 930           boolean relaxedIdMatching, List<SequenceI> newseqs)
 931   {
 932     SequenceI match = null;
 933     if (relaxedIdMatching)
 934     {
 935       if (lastmatchedAl != align)
 936       {
 937         matcher = new SequenceIdMatcher(
 938                 (lastmatchedAl = align).getSequencesArray());
 939         if (newseqs != null)
 940         {
 941           matcher.addAll(newseqs);
 942         }
 943       }
 944       match = matcher.findIdMatch(seqId);
 945     }
 946     else
 947     {
 948       match = align.findName(seqId, true);
 949       if (match == null && newseqs != null)
 950       {
 951         for (SequenceI m : newseqs)
 952         {
 953           if (seqId.equals(m.getName()))
 954           {
 955             return m;
 956           }
 957         }
 958       }
 959
 960     }
 961     if (match == null && newseqs != null)
 962     {
 963       match = new SequenceDummy(seqId);
 964       if (relaxedIdMatching)
 965       {
 966         matcher.addAll(Arrays.asList(new SequenceI[] { match }));
 967       }
 968       // add dummy sequence to the newseqs list
 969       newseqs.add(match);
 970     }
 971     return match;
 972   }
 973
 974   public void parseDescriptionHTML(SequenceFeature sf, boolean removeHTML)
 975   {
 976     if (sf.getDescription() == null)
 977     {
 978       return;
 979     }
 980     jalview.util.ParseHtmlBodyAndLinks parsed = new jalview.util.ParseHtmlBodyAndLinks(
 981             sf.getDescription(), removeHTML, newline);
 982
 983     sf.description = (removeHTML) ? parsed.getNonHtmlContent()
 984             : sf.description;
 985     for (String link : parsed.getLinks())
 986     {
 987       sf.addLink(link);
 988     }
 989
 990   }
 991
 992   /**
 993    * generate a features file for seqs includes non-pos features by default.
 994    *
 995    * @param seqs
 996    *          source of sequence features
 997    * @param map
 998    *          hash of feature types and colours
 999    * @return features file contents
1000    */
1001   public String printJalviewFormat(SequenceI[] seqs,
1002           Map<String, FeatureColourI> map)
1003   {
1004     return printJalviewFormat(seqs, map, true, true);
1005   }
1006
1007   /**
1008    * generate a features file for seqs with colours from visible (if any)
1009    *
1010    * @param sequences
1011    *          source of features
1012    * @param visible
1013    *          hash of Colours for each feature type
1014    * @param visOnly
1015    *          when true only feature types in 'visible' will be output
1016    * @param nonpos
1017    *          indicates if non-positional features should be output (regardless
1018    *          of group or type)
1019    * @return features file contents
1020    */
1021   public String printJalviewFormat(SequenceI[] sequences,
1022           Map<String, FeatureColourI> visible,
1023           boolean visOnly, boolean nonpos)
1024   {
1025     if (visOnly && !nonpos && (visible == null || visible.size() < 1))
1026     {
1027       // no point continuing.
1028       return "No Features Visible";
1029     }
1030     StringBuilder out = new StringBuilder(128);
1031     SequenceFeature[] next;
1032     boolean featuresGen = false;
1033
1034     if (visible != null && visOnly)
1035     {
1036       // write feature colours only if we're given them and we are generating
1037       // viewed features
1038       // TODO: decide if feature links should also be written here ?
1039       Iterator<String> en = visible.keySet().iterator();
1040       while (en.hasNext())
1041       {
1042         String featureType = en.next();
1043         FeatureColourI colour = visible.get(featureType);
1044         out.append(colour.toJalviewFormat(featureType)).append(newline);
1045       }
1046     }
1047     // Work out which groups are both present and visible
1048     Vector groups = new Vector();
1049     int groupIndex = 0;
1050     boolean isnonpos = false;
1051
1052     for (int i = 0; i < sequences.length; i++)
1053     {
1054       next = sequences[i].getSequenceFeatures();
1055       if (next != null)
1056       {
1057         for (int j = 0; j < next.length; j++)
1058         {
1059           isnonpos = next[j].begin == 0 && next[j].end == 0;
1060           if ((!nonpos && isnonpos)
1061                   || (!isnonpos && visOnly && !visible
1062                           .containsKey(next[j].type)))
1063           {
1064             continue;
1065           }
1066
1067           if (next[j].featureGroup != null
1068                   && !groups.contains(next[j].featureGroup))
1069           {
1070             groups.addElement(next[j].featureGroup);
1071           }
1072         }
1073       }
1074     }
1075
1076     String group = null;
1077     do
1078     {
1079
1080       if (groups.size() > 0 && groupIndex < groups.size())
1081       {
1082         group = groups.elementAt(groupIndex).toString();
1083         out.append(newline);
1084         out.append("STARTGROUP\t");
1085         out.append(group);
1086         out.append(newline);
1087       }
1088       else
1089       {
1090         group = null;
1091       }
1092
1093       for (int i = 0; i < sequences.length; i++)
1094       {
1095         next = sequences[i].getSequenceFeatures();
1096         if (next != null)
1097         {
1098           for (int j = 0; j < next.length; j++)
1099           {
1100             isnonpos = next[j].begin == 0 && next[j].end == 0;
1101             if ((!nonpos && isnonpos)
1102                     || (!isnonpos && visOnly && !visible
1103                             .containsKey(next[j].type)))
1104             {
1105               // skip if feature is nonpos and we ignore them or if we only
1106               // output visible and it isn't non-pos and it's not visible
1107               continue;
1108             }
1109
1110             if (group != null
1111                     && (next[j].featureGroup == null || !next[j].featureGroup
1112                             .equals(group)))
1113             {
1114               continue;
1115             }
1116
1117             if (group == null && next[j].featureGroup != null)
1118             {
1119               continue;
1120             }
1121             // we have features to output
1122             featuresGen = true;
1123             if (next[j].description == null
1124                     || next[j].description.equals(""))
1125             {
1126               out.append(next[j].type + "\t");
1127             }
1128             else
1129             {
1130               if (next[j].links != null
1131                       && next[j].getDescription().indexOf("<html>") == -1)
1132               {
1133                 out.append("<html>");
1134               }
1135
1136               out.append(next[j].description + " ");
1137               if (next[j].links != null)
1138               {
1139                 for (int l = 0; l < next[j].links.size(); l++)
1140                 {
1141                   String label = next[j].links.elementAt(l).toString();
1142                   String href = label.substring(label.indexOf("|") + 1);
1143                   label = label.substring(0, label.indexOf("|"));
1144
1145                   if (next[j].description.indexOf(href) == -1)
1146                   {
1147                     out.append("<a href=\"" + href + "\">" + label + "</a>");
1148                   }
1149                 }
1150
1151                 if (next[j].getDescription().indexOf("</html>") == -1)
1152                 {
1153                   out.append("</html>");
1154                 }
1155               }
1156
1157               out.append("\t");
1158             }
1159             out.append(sequences[i].getName());
1160             out.append("\t-1\t");
1161             out.append(next[j].begin);
1162             out.append("\t");
1163             out.append(next[j].end);
1164             out.append("\t");
1165             out.append(next[j].type);
1166             if (!Float.isNaN(next[j].score))
1167             {
1168               out.append("\t");
1169               out.append(next[j].score);
1170             }
1171             out.append(newline);
1172           }
1173         }
1174       }
1175
1176       if (group != null)
1177       {
1178         out.append("ENDGROUP\t");
1179         out.append(group);
1180         out.append(newline);
1181         groupIndex++;
1182       }
1183       else
1184       {
1185         break;
1186       }
1187
1188     } while (groupIndex < groups.size() + 1);
1189
1190     if (!featuresGen)
1191     {
1192       return "No Features Visible";
1193     }
1194
1195     return out.toString();
1196   }
1197
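  /**
   * Generates GFF output for the features of the given sequences.
   * Non-positional features are included, and positional features are output
   * only if their type is a key of map.
   * 
   * @param seqs
   *          source of sequence features
   * @param map
   *          feature types to output, keyed by feature type
   * @return GFF formatted features, one feature per line
   */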
1206   public String printGFFFormat(SequenceI[] seqs,
1207           Map<String, FeatureColourI> map)
1208   {
1209     return printGFFFormat(seqs, map, true, true);
1210   }
1211
1212   public String printGFFFormat(SequenceI[] seqs,
1213           Map<String, FeatureColourI> map, boolean visOnly, boolean nonpos)
1214   {
1215     StringBuffer out = new StringBuffer();
1216     SequenceFeature[] next;
1217     String source;
1218     boolean isnonpos;
1219     for (int i = 0; i < seqs.length; i++)
1220     {
1221       if (seqs[i].getSequenceFeatures() != null)
1222       {
1223         next = seqs[i].getSequenceFeatures();
1224         for (int j = 0; j < next.length; j++)
1225         {
1226           isnonpos = next[j].begin == 0 && next[j].end == 0;
1227           if ((!nonpos && isnonpos)
1228                   || (!isnonpos && visOnly && !map
1229                           .containsKey(next[j].type)))
1230           {
1231             continue;
1232           }
1233
1234           source = next[j].featureGroup;
1235           if (source == null)
1236           {
1237             source = next[j].getDescription();
1238           }
1239
1240           out.append(seqs[i].getName());
1241           out.append("\t");
1242           out.append(source);
1243           out.append("\t");
1244           out.append(next[j].type);
1245           out.append("\t");
1246           out.append(next[j].begin);
1247           out.append("\t");
1248           out.append(next[j].end);
1249           out.append("\t");
1250           out.append(next[j].score);
1251           out.append("\t");
1252
1253           if (next[j].getValue("STRAND") != null)
1254           {
1255             out.append(next[j].getValue("STRAND"));
1256             out.append("\t");
1257           }
1258           else
1259           {
1260             out.append(".\t");
1261           }
1262
1263           if (next[j].getValue("FRAME") != null)
1264           {
1265             out.append(next[j].getValue("FRAME"));
1266           }
1267           else
1268           {
1269             out.append(".");
1270           }
1271           // TODO: verify/check GFF - should there be a /t here before attribute
1272           // output ?
1273
1274           if (next[j].getValue("ATTRIBUTES") != null)
1275           {
1276             out.append(next[j].getValue("ATTRIBUTES"));
1277           }
1278
1279           out.append(newline);
1280
1281         }
1282       }
1283     }
1284
1285     return out.toString();
1286   }
1287
1288   /**
1289    * this is only for the benefit of object polymorphism - method does nothing.
1290    */
1291   @Override
1292   public void parse()
1293   {
1294     // IGNORED
1295   }
1296
1297   /**
1298    * this is only for the benefit of object polymorphism - method does nothing.
1299    *
1300    * @return error message
1301    */
1302   @Override
1303   public String print()
1304   {
1305     return "USE printGFFFormat() or printJalviewFormat()";
1306   }
1307
1308 }