2 * Jalview - A Sequence Alignment Editor and Viewer (Version 2.8)
3 * Copyright (C) 2012 J Procter, AM Waterhouse, LM Lui, J Engelhardt, G Barton, M Clamp, S Searle
5 * This file is part of Jalview.
7 * Jalview is free software: you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License
9 * as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version.
11 * Jalview is distributed in the hope that it will be useful, but
12 * WITHOUT ANY WARRANTY; without even the implied warranty
13 * of MERCHANTABILITY or FITNESS FOR A PARTICULAR
14 * PURPOSE. See the GNU General Public License for more details.
16 * You should have received a copy of the GNU General Public License along with Jalview. If not, see <http://www.gnu.org/licenses/>.
23 import javax.xml.parsers.ParserConfigurationException;
25 import org.xml.sax.SAXException;
27 import fr.orsay.lri.varna.exceptions.ExceptionFileFormatOrSyntax;
28 import fr.orsay.lri.varna.exceptions.ExceptionLoadingFailed;
29 import fr.orsay.lri.varna.exceptions.ExceptionPermissionDenied;
30 import fr.orsay.lri.varna.exceptions.ExceptionUnmatchedClosingParentheses;
32 import jalview.analysis.SequenceIdMatcher;
33 import jalview.datamodel.*;
34 import jalview.schemes.*;
35 import jalview.util.Format;
38 * Parses and creates Jalview features files. Detects GFF format features files
39 * and parses them. Does not implement the standard print() - call
40 * printJalviewFormat or printGFFFormat instead. Sequences are located by ID with AlignmentI.findName
41 * (or a SequenceIdMatcher when relaxed ID matching is requested) - this normally works on an exact match.
// FeaturesFile is declared as a subclass of AlignFile.
46 public class FeaturesFile extends AlignFile
49 * work around for GFF interpretation bug where source string becomes
50 * description rather than a group
// Consulted while parsing GFF-style lines: when true and the second column
// contains no space character, that column is also passed as the group
// argument of the SequenceFeature that is created.
52 private boolean doGffSource = true;
55 * Creates a new FeaturesFile object.
// Constructor taking an input file string and a type string; declared to
// throw Exception.
// NOTE(review): the constructor body is not visible in this extract -
// presumably both arguments are handed to the AlignFile superclass; confirm
// against the full file.
62 * Creates a new FeaturesFile object.
70 public FeaturesFile(String inFile, String type) throws Exception
// Constructor taking a FileParse source; declared to throw Exception.
// NOTE(review): the constructor body is not visible in this extract -
// presumably the source is handed to the AlignFile superclass; confirm.
75 public FeaturesFile(FileParse source) throws Exception
81 * Parse GFF or sequence features file using case-independent matching,
85 * - alignment/dataset containing sequences that are to be annotated
87 * - hashtable to store feature colour definitions
89 * - process html strings into plain text
90 * @return true if features were added
92 public boolean parse(AlignmentI align, Hashtable colours,
95 return parse(align, colours, null, removeHTML, false);
99 * Parse GFF or sequence features file optionally using case-independent
100 * matching, discarding URLs
103 * - alignment/dataset containing sequences that are to be annotated
105 * - hashtable to store feature colour definitions
107 * - process html strings into plain text
108 * @param relaxedIdmatching
109 * - when true, ID matches to compound sequence IDs are allowed
110 * @return true if features were added
112 public boolean parse(AlignmentI align, Map colours, boolean removeHTML,
113 boolean relaxedIdMatching)
115 return parse(align, colours, null, removeHTML, relaxedIdMatching);
119 * Parse GFF or sequence features file optionally using case-independent
123 * - alignment/dataset containing sequences that are to be annotated
125 * - hashtable to store feature colour definitions
127 * - hashtable to store associated URLs
129 * - process html strings into plain text
130 * @return true if features were added
132 public boolean parse(AlignmentI align, Map colours, Map featureLink,
135 return parse(align, colours, featureLink, removeHTML, false);
139 * Parse GFF or sequence features file
142 * - alignment/dataset containing sequences that are to be annotated
144 * - hashtable to store feature colour definitions
146 * - hashtable to store associated URLs
148 * - process html strings into plain text
149 * @param relaxedIdmatching
150 * - when true, ID matches to compound sequence IDs are allowed
151 * @return true if features were added
// Main parser. Reads the source one line at a time through nextLine() and
// treats each tab-delimited line as a group marker, a feature type colour
// definition, a GFF-style feature or a Jalview-style feature. Colour
// definitions are stored in 'colours', links in 'featureLink', and features
// are added to sequences found in 'align'.
// NOTE(review): a number of statements of this method (braces, try keywords,
// several assignments, continue/break/return statements and parts of some
// messages) are not visible in this extract; the comments below describe only
// the lines that are shown.
153 public boolean parse(AlignmentI align, Map colours, Map featureLink,
154 boolean removeHTML, boolean relaxedIdmatching)
160 SequenceI seq = null;
161 String type, desc, token = null;
163 int index, start, end;
// Current group name and its optional link, set by a 'startgroup' line.
167 String featureGroup = null, groupLink = null;
// Links given on feature type colour lines, keyed by feature type.
168 Map typeLink = new Hashtable();
170 * when true, assume GFF style features rather than Jalview style.
172 boolean GFFFile = true;
173 while ((line = nextLine()) != null)
// Lines starting with '#' are tested for here (presumably skipped - the
// branch body is not shown).
175 if (line.startsWith("#"))
// Fields are separated by tab characters.
180 st = new StringTokenizer(line, "\t");
// A single-field line reading "GFF" (any case) is a mode marker.
181 if (st.countTokens() == 1)
183 if (line.trim().equalsIgnoreCase("GFF"))
185 // Start parsing file as if it might be GFF again.
// Lines with two or three fields: startgroup / endgroup markers, or a colour
// definition for a feature type.
190 if (st.countTokens() > 1 && st.countTokens() < 4)
193 type = st.nextToken();
194 if (type.equalsIgnoreCase("startgroup"))
196 featureGroup = st.nextToken();
// An optional third field on a startgroup line is the group's link.
197 if (st.hasMoreElements())
199 groupLink = st.nextToken();
// NOTE(review): featureLink is dereferenced here without the null check that
// precedes featureLink.put(type, link) further down, and two overloads in
// this file pass null for featureLink - looks like a possible
// NullPointerException; confirm.
200 featureLink.put(featureGroup, groupLink);
203 else if (type.equalsIgnoreCase("endgroup"))
205 // We should check whether this is the current group,
206 // but at present theres no way of showing more than 1 group
// Otherwise the line defines the colour for feature type 'type'; the second
// field is the colour specification.
213 Object colour = null;
214 String colscheme = st.nextToken();
// A specification containing '|' or equal to "label" is a graduated colour.
215 if (colscheme.indexOf("|") > -1
216 || colscheme.trim().equalsIgnoreCase("label"))
218 // Parse '|' separated graduated colourscheme fields:
219 // [label|][mincolour|maxcolour|[absolute|]minvalue|maxvalue|thresholdtype|thresholdvalue]
220 // can either provide 'label' only, first is optional, next two
221 // colors are required (but may be
222 // left blank), next is optional, nxt two min/max are required.
223 // first is either 'label'
224 // first/second and third are both hexadecimal or word equivalent
226 // next two are values parsed as floats.
227 // fifth is either 'above','below', or 'none'.
228 // sixth is a float value and only required when fifth is either
229 // 'above' or 'below'.
// The specification is tokenised on '|'. The comparisons against "|" below
// suggest that delimiters are returned as tokens (the final constructor
// argument is on a line not shown here).
230 StringTokenizer gcol = new StringTokenizer(colscheme, "|",
// NOTE(review): threshtype and threshval are not referenced again in the
// visible lines.
233 int threshtype = AnnotationColourGradient.NO_THRESHOLD;
234 float min = Float.MIN_VALUE, max = Float.MAX_VALUE, threshval = Float.NaN;
235 boolean labelCol = false;
237 String mincol = gcol.nextToken();
241 .println("Expected either 'label' or a colour specification in the line: "
245 String maxcol = null;
// A leading token beginning with "label" is consumed; the next two tokens
// (if present) are then read, the second one becoming the minimum colour.
246 if (mincol.toLowerCase().indexOf("label") == 0)
249 mincol = (gcol.hasMoreTokens() ? gcol.nextToken() : null); // skip
251 mincol = (gcol.hasMoreTokens() ? gcol.nextToken() : null);
253 String abso = null, minval, maxval;
256 // at least four more tokens
// A "|" read in place of a colour means that colour field was left blank.
257 if (mincol.equals("|"))
263 gcol.nextToken(); // skip next '|'
265 // continue parsing rest of line
266 maxcol = gcol.nextToken();
267 if (maxcol.equals("|"))
273 gcol.nextToken(); // skip next '|'
275 abso = gcol.nextToken();
276 gcol.nextToken(); // skip next '|'
// Tests whether the token just read begins with "abso"; the statements run
// when it does not are on lines that are not shown.
277 if (abso.toLowerCase().indexOf("abso") != 0)
284 minval = gcol.nextToken();
285 gcol.nextToken(); // skip next '|'
287 maxval = gcol.nextToken();
288 if (gcol.hasMoreTokens())
290 gcol.nextToken(); // skip next '|'
// Empty min/max fields leave the defaults declared above in place.
294 if (minval.length() > 0)
296 min = new Float(minval).floatValue();
298 } catch (Exception e)
301 .println("Couldn't parse the minimum value for graduated colour for type ("
303 + ") - did you misspell 'auto' for the optional automatic colour switch ?");
308 if (maxval.length() > 0)
310 max = new Float(maxval).floatValue();
312 } catch (Exception e)
315 .println("Couldn't parse the maximum value for graduated colour for type ("
322 // add in some dummy min/max colours for the label-only
// Both end colours are resolved by constructing a UserColourScheme from the
// colour string and asking it for the colour of 'A'.
329 colour = new jalview.schemes.GraduatedColor(
330 new UserColourScheme(mincol).findColour('A'),
331 new UserColourScheme(maxcol).findColour('A'), min,
333 } catch (Exception e)
336 .println("Couldn't parse the graduated colour scheme ("
342 ((jalview.schemes.GraduatedColor) colour)
343 .setColourByLabel(labelCol);
// Auto-scaling is switched on when no "abso" token was retained.
344 ((jalview.schemes.GraduatedColor) colour)
345 .setAutoScaled(abso == null);
346 // add in any additional parameters
347 String ttype = null, tval = null;
348 if (gcol.hasMoreTokens())
350 // threshold type and possibly a threshold value
// Tokens starting with "below" / "above" select the matching threshold type;
// anything else selects NO_THRESHOLD, with a warning unless the token starts
// with "no".
351 ttype = gcol.nextToken();
352 if (ttype.toLowerCase().startsWith("below"))
354 ((jalview.schemes.GraduatedColor) colour)
355 .setThreshType(AnnotationColourGradient.BELOW_THRESHOLD);
357 else if (ttype.toLowerCase().startsWith("above"))
359 ((jalview.schemes.GraduatedColor) colour)
360 .setThreshType(AnnotationColourGradient.ABOVE_THRESHOLD);
364 ((jalview.schemes.GraduatedColor) colour)
365 .setThreshType(AnnotationColourGradient.NO_THRESHOLD);
366 if (!ttype.toLowerCase().startsWith("no"))
369 .println("Ignoring unrecognised threshold type : "
// A threshold value is only read when a threshold type was selected.
374 if (((GraduatedColor) colour).getThreshType() != AnnotationColourGradient.NO_THRESHOLD)
379 tval = gcol.nextToken();
380 ((jalview.schemes.GraduatedColor) colour)
381 .setThresh(new Float(tval).floatValue());
382 } catch (Exception e)
385 .println("Couldn't parse threshold value as a float: ("
390 // parse the thresh-is-min token ?
// Any tokens left over are reported on System.err and otherwise ignored.
391 if (gcol.hasMoreTokens())
394 .println("Ignoring additional tokens in parameters in graduated colour specification\n");
395 while (gcol.hasMoreTokens())
397 System.err.println("|" + gcol.nextToken());
399 System.err.println("\n");
// Plain colour specification, resolved through UserColourScheme.
405 UserColourScheme ucs = new UserColourScheme(colscheme);
406 colour = ucs.findColour('A');
410 colours.put(type, colour);
// An optional third field is a link for this feature type; it is recorded in
// typeLink and in featureLink (which is created here if null).
412 if (st.hasMoreElements())
414 String link = st.nextToken();
415 typeLink.put(type, link);
416 if (featureLink == null)
418 featureLink = new Hashtable();
420 featureLink.put(type, link);
// Feature lines: the remaining tokens of the line are consumed below.
426 while (st.hasMoreElements())
431 // Still possible this is an old Jalview file,
432 // which does not have type colours at the beginning
// GFF-style attempt: the first field is taken as the sequence ID and looked
// up with findName().
433 seqId = token = st.nextToken();
434 seq = findName(align, seqId, relaxedIdmatching);
437 desc = st.nextToken();
439 if (doGffSource && desc.indexOf(' ') == -1)
441 // could also be a source term rather than description line
442 group = new String(desc);
444 type = st.nextToken();
// Start and end columns: an empty field or "-" is tested for separately from
// integer parsing (the values assigned in those cases and in the
// NumberFormatException handlers are on lines not shown).
447 String stt = st.nextToken();
448 if (stt.length() == 0 || stt.equals("-"))
454 start = Integer.parseInt(stt);
456 } catch (NumberFormatException ex)
462 String stt = st.nextToken();
463 if (stt.length() == 0 || stt.equals("-"))
469 end = Integer.parseInt(stt);
471 } catch (NumberFormatException ex)
475 // TODO: decide if non positional feature assertion for input data
476 // where end==0 is generally valid
479 // treat as non-positional feature, regardless.
484 score = new Float(st.nextToken()).floatValue();
485 } catch (NumberFormatException ex)
490 sf = new SequenceFeature(type, desc, start, end, score, group);
// The next two fields, when present, are stored as STRAND and FRAME; any
// exception raised while reading them is caught (handler body not shown).
494 sf.setValue("STRAND", st.nextToken());
495 sf.setValue("FRAME", st.nextToken());
496 } catch (Exception ex)
// Everything left on the line is joined, each token prefixed by a tab, and
// stored under ATTRIBUTES.
500 if (st.hasMoreTokens())
502 StringBuffer attributes = new StringBuffer();
503 while (st.hasMoreTokens())
505 attributes.append("\t" + st.nextElement());
507 // TODO validate and split GFF2 attributes field ? parse out
508 // ([A-Za-z][A-Za-z0-9_]*) <value> ; and add as
509 // sf.setValue(attrib, val);
510 sf.setValue("ATTRIBUTES", attributes.toString());
// The feature is added to the matched sequence, and a copy to every further
// sequence returned by align.findName(seq, seqId, true).
513 seq.addSequenceFeature(sf);
514 while ((seq = align.findName(seq, seqId, true)) != null)
516 seq.addSequenceFeature(new SequenceFeature(sf));
// Jalview-style feature line handling starts here.
522 if (GFFFile && seq == null)
528 desc = st.nextToken();
530 if (!st.hasMoreTokens())
533 .println("DEBUG: Run out of tokens when trying to identify the destination for the feature.. giving up.");
534 // in all probability, this isn't a file we understand, so bail
539 token = st.nextToken();
// The sequence is identified by ID unless the ID field reads
// "ID_NOT_SPECIFIED", in which case an integer field is parsed and used with
// align.getSequenceAt().
541 if (!token.equals("ID_NOT_SPECIFIED"))
543 seq = findName(align, seqId = token, relaxedIdmatching);
551 index = Integer.parseInt(st.nextToken());
552 seq = align.getSequenceAt(index);
553 } catch (NumberFormatException ex)
561 System.out.println("Sequence not found: " + line);
565 start = Integer.parseInt(st.nextToken());
566 end = Integer.parseInt(st.nextToken());
568 type = st.nextToken();
// A feature type with no colour yet gets one derived from the type name.
570 if (!colours.containsKey(type))
572 // Probably the old style groups file
573 UserColourScheme ucs = new UserColourScheme(type);
574 colours.put(type, ucs.findColour('A'));
576 sf = new SequenceFeature(type, desc, "", start, end, featureGroup);
// An optional trailing field is parsed as the score.
577 if (st.hasMoreTokens())
581 score = new Float(st.nextToken()).floatValue();
582 // update colourgradient bounds if allowed to
583 } catch (NumberFormatException ex)
// Group and type links are attached, and "%LINK%" appended to the
// description, only when removeHTML is true.
589 if (groupLink != null && removeHTML)
591 sf.addLink(groupLink);
592 sf.description += "%LINK%";
594 if (typeLink.containsKey(type) && removeHTML)
596 sf.addLink(typeLink.get(type).toString());
597 sf.description += "%LINK%";
600 parseDescriptionHTML(sf, removeHTML);
602 seq.addSequenceFeature(sf);
// Copies of the feature go to every further sequence returned by
// align.findName(seq, seqId, false).
605 && (seq = align.findName(seq, seqId, false)) != null)
607 seq.addSequenceFeature(new SequenceFeature(sf));
609 // If we got here, its not a GFFFile
// Any exception raised while parsing is reported on System.out together with
// the offending line, and its stack trace is printed to System.err.
614 } catch (Exception ex)
616 System.out.println("Error parsing feature file: " + ex + "\n" + line);
617 ex.printStackTrace(System.err);
// Alignment for which 'matcher' was last built; compared by reference in
// findName() to decide whether a new matcher is needed.
625 private AlignmentI lastmatchedAl = null;
// ID matcher built in findName() from the alignment's sequence array when
// relaxed ID matching is requested.
627 private SequenceIdMatcher matcher = null;
630 * clear any temporary handles used to speed up ID matching
// Clears the cached alignment reference that findName() compares against
// before reusing its SequenceIdMatcher.
// NOTE(review): one statement of this method is not visible in this extract
// (presumably the matcher field is cleared as well) - confirm against the
// full file.
632 private void resetMatcher()
634 lastmatchedAl = null;
638 private SequenceI findName(AlignmentI align, String seqId,
639 boolean relaxedIdMatching)
641 SequenceI match = null;
642 if (relaxedIdMatching)
644 if (lastmatchedAl != align)
646 matcher = new SequenceIdMatcher(
647 (lastmatchedAl = align).getSequencesArray());
649 match = matcher.findIdMatch(seqId);
653 match = align.findName(seqId, true);
// Runs the feature's description through jalview.util.ParseHtmlBodyAndLinks
// and, when removeHTML is true, replaces the description with the parser's
// non-HTML content.
// NOTE(review): the statement executed for a null description, the
// alternative operand of the conditional expression and the body of the link
// loop are not visible in this extract.
658 public void parseDescriptionHTML(SequenceFeature sf, boolean removeHTML)
// A feature without a description is tested for before any parsing.
660 if (sf.getDescription() == null)
664 jalview.util.ParseHtmlBodyAndLinks parsed = new jalview.util.ParseHtmlBodyAndLinks(
665 sf.getDescription(), removeHTML, newline);
667 sf.description = (removeHTML) ? parsed.getNonHtmlContent()
// Iterates over the links reported by the parser.
669 for (String link : parsed.getLinks())
677 * generate a features file for seqs includes non-pos features by default.
680 * source of sequence features
682 * hash of feature types and colours
683 * @return features file contents
685 public String printJalviewFormat(SequenceI[] seqs, Hashtable visible)
687 return printJalviewFormat(seqs, visible, true, true);
691 * generate a features file for seqs with colours from visible (if any)
696 * hash of Colours for each feature type
698 * when true only feature types in 'visible' will be output
700 * indicates if non-positional features should be output (regardless
702 * @return features file contents
704 public String printJalviewFormat(SequenceI[] seqs, Hashtable visible,
705 boolean visOnly, boolean nonpos)
707 StringBuffer out = new StringBuffer();
708 SequenceFeature[] next;
709 boolean featuresGen = false;
710 if (visOnly && !nonpos && (visible == null || visible.size() < 1))
712 // no point continuing.
713 return "No Features Visible";
716 if (visible != null && visOnly)
718 // write feature colours only if we're given them and we are generating
720 // TODO: decide if feature links should also be written here ?
721 Enumeration en = visible.keys();
723 while (en.hasMoreElements())
725 type = en.nextElement().toString();
727 if (visible.get(type) instanceof GraduatedColor)
729 GraduatedColor gc = (GraduatedColor) visible.get(type);
730 color = (gc.isColourByLabel() ? "label|" : "")
731 + Format.getHexString(gc.getMinColor()) + "|"
732 + Format.getHexString(gc.getMaxColor())
733 + (gc.isAutoScale() ? "|" : "|abso|") + gc.getMin() + "|"
735 if (gc.getThreshType() != AnnotationColourGradient.NO_THRESHOLD)
737 if (gc.getThreshType() == AnnotationColourGradient.BELOW_THRESHOLD)
743 if (gc.getThreshType() != AnnotationColourGradient.ABOVE_THRESHOLD)
745 System.err.println("WARNING: Unsupported threshold type ("
746 + gc.getThreshType() + ") : Assuming 'above'");
751 color += "|" + gc.getThresh();
758 else if (visible.get(type) instanceof java.awt.Color)
760 color = Format.getHexString((java.awt.Color) visible.get(type));
764 // legacy support for integer objects containing colour triplet values
765 color = Format.getHexString(new java.awt.Color(Integer
766 .parseInt(visible.get(type).toString())));
774 // Work out which groups are both present and visible
775 Vector groups = new Vector();
777 boolean isnonpos = false;
779 for (int i = 0; i < seqs.length; i++)
781 next = seqs[i].getSequenceFeatures();
784 for (int j = 0; j < next.length; j++)
786 isnonpos = next[j].begin == 0 && next[j].end == 0;
787 if ((!nonpos && isnonpos)
788 || (!isnonpos && visOnly && !visible
789 .containsKey(next[j].type)))
794 if (next[j].featureGroup != null
795 && !groups.contains(next[j].featureGroup))
797 groups.addElement(next[j].featureGroup);
807 if (groups.size() > 0 && groupIndex < groups.size())
809 group = groups.elementAt(groupIndex).toString();
811 out.append("STARTGROUP\t");
820 for (int i = 0; i < seqs.length; i++)
822 next = seqs[i].getSequenceFeatures();
825 for (int j = 0; j < next.length; j++)
827 isnonpos = next[j].begin == 0 && next[j].end == 0;
828 if ((!nonpos && isnonpos)
829 || (!isnonpos && visOnly && !visible
830 .containsKey(next[j].type)))
832 // skip if feature is nonpos and we ignore them or if we only
833 // output visible and it isn't non-pos and it's not visible
838 && (next[j].featureGroup == null || !next[j].featureGroup
844 if (group == null && next[j].featureGroup != null)
848 // we have features to output
850 if (next[j].description == null
851 || next[j].description.equals(""))
853 out.append(next[j].type + "\t");
857 if (next[j].links != null
858 && next[j].getDescription().indexOf("<html>") == -1)
860 out.append("<html>");
863 out.append(next[j].description + " ");
864 if (next[j].links != null)
866 for (int l = 0; l < next[j].links.size(); l++)
868 String label = next[j].links.elementAt(l).toString();
869 String href = label.substring(label.indexOf("|") + 1);
870 label = label.substring(0, label.indexOf("|"));
872 if (next[j].description.indexOf(href) == -1)
874 out.append("<a href=\"" + href + "\">" + label + "</a>");
878 if (next[j].getDescription().indexOf("</html>") == -1)
880 out.append("</html>");
886 out.append(seqs[i].getName());
887 out.append("\t-1\t");
888 out.append(next[j].begin);
890 out.append(next[j].end);
892 out.append(next[j].type);
893 if (next[j].score != Float.NaN)
896 out.append(next[j].score);
905 out.append("ENDGROUP\t");
915 } while (groupIndex < groups.size() + 1);
919 return "No Features Visible";
922 return out.toString();
926 * generate a gff file for sequence features includes non-pos features by
933 public String printGFFFormat(SequenceI[] seqs, Hashtable visible)
935 return printGFFFormat(seqs, visible, true, true);
938 public String printGFFFormat(SequenceI[] seqs, Hashtable visible,
939 boolean visOnly, boolean nonpos)
941 StringBuffer out = new StringBuffer();
942 SequenceFeature[] next;
945 for (int i = 0; i < seqs.length; i++)
947 if (seqs[i].getSequenceFeatures() != null)
949 next = seqs[i].getSequenceFeatures();
950 for (int j = 0; j < next.length; j++)
952 isnonpos = next[j].begin == 0 && next[j].end == 0;
953 if ((!nonpos && isnonpos)
954 || (!isnonpos && visOnly && !visible
955 .containsKey(next[j].type)))
960 source = next[j].featureGroup;
963 source = next[j].getDescription();
966 out.append(seqs[i].getName());
970 out.append(next[j].type);
972 out.append(next[j].begin);
974 out.append(next[j].end);
976 out.append(next[j].score);
979 if (next[j].getValue("STRAND") != null)
981 out.append(next[j].getValue("STRAND"));
989 if (next[j].getValue("FRAME") != null)
991 out.append(next[j].getValue("FRAME"));
997 // TODO: verify/check GFF - should there be a /t here before attribute
1000 if (next[j].getValue("ATTRIBUTES") != null)
1002 out.append(next[j].getValue("ATTRIBUTES"));
1005 out.append(newline);
1011 return out.toString();
1015 * this is only for the benefit of object polymorphism - method does nothing.
1023 * this is only for the benefit of object polymorphism - method does nothing.
1025 * @return error message
1027 public String print()
1029 return "USE printGFFFormat() or printJalviewFormat()";