2 * Jalview - A Sequence Alignment Editor and Viewer (Version 2.8)
3 * Copyright (C) 2012 J Procter, AM Waterhouse, LM Lui, J Engelhardt, G Barton, M Clamp, S Searle
5 * This file is part of Jalview.
7 * Jalview is free software: you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License
9 * as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version.
11 * Jalview is distributed in the hope that it will be useful, but
12 * WITHOUT ANY WARRANTY; without even the implied warranty
13 * of MERCHANTABILITY or FITNESS FOR A PARTICULAR
14 * PURPOSE. See the GNU General Public License for more details.
16 * You should have received a copy of the GNU General Public License along with Jalview. If not, see <http://www.gnu.org/licenses/>.
23 import javax.xml.parsers.ParserConfigurationException;
25 import org.xml.sax.SAXException;
27 import fr.orsay.lri.varna.exceptions.ExceptionFileFormatOrSyntax;
28 import fr.orsay.lri.varna.exceptions.ExceptionLoadingFailed;
29 import fr.orsay.lri.varna.exceptions.ExceptionPermissionDenied;
30 import fr.orsay.lri.varna.exceptions.ExceptionUnmatchedClosingParentheses;
32 import jalview.analysis.SequenceIdMatcher;
33 import jalview.datamodel.*;
34 import jalview.schemes.*;
35 import jalview.util.Format;
38 * Parse and create Jalview Features files. Detects GFF format features files and
39 * parses them. Does not implement standard print() - call printJalviewFormat or
40 * printGFFFormat instead. Uses AlignmentI.findName to find the sequence object
41 * for the features annotation - this normally works on an exact match.
46 public class FeaturesFile extends AlignFile
49 * work around for GFF interpretation bug where source string becomes
50 * description rather than a group
// Consulted by the five-argument parse(): when true, a GFF description token
// that contains no space character is also used as the feature's group.
52 private boolean doGffSource = true;
55 * Creates a new FeaturesFile object.
62 * Creates a new FeaturesFile object.
71 * @throws SAXException
72 * @throws ParserConfigurationException
73 * @throws ExceptionFileFormatOrSyntax
74 * @throws ExceptionLoadingFailed
75 * @throws ExceptionPermissionDenied
76 * @throws InterruptedException
77 * @throws ExceptionUnmatchedClosingParentheses
// Constructor taking an input location and a type string; it declares every
// checked exception listed in the throws clause below.
// NOTE(review): presumably forwards both arguments to the AlignFile
// superclass constructor - confirm.
79 public FeaturesFile(String inFile, String type) throws IOException,
80 ExceptionFileFormatOrSyntax, ParserConfigurationException,
81 SAXException, ExceptionPermissionDenied, ExceptionLoadingFailed,
82 InterruptedException, ExceptionUnmatchedClosingParentheses
// Constructor taking an already opened FileParse source; it declares the same
// checked exceptions as the (String, String) constructor.
// NOTE(review): presumably forwards the source to the AlignFile superclass
// constructor - confirm.
87 public FeaturesFile(FileParse source) throws IOException,
88 ExceptionFileFormatOrSyntax, ParserConfigurationException,
89 SAXException, ExceptionPermissionDenied, ExceptionLoadingFailed,
90 InterruptedException, ExceptionUnmatchedClosingParentheses
96 * Parse GFF or sequence features file using case-independent matching,
100 * - alignment/dataset containing sequences that are to be annotated
102 * - hashtable to store feature colour definitions
104 * - process html strings into plain text
105 * @return true if features were added
// Convenience overload: delegates to the five-argument parse() with no
// feature-link map (null) and relaxed ID matching switched off (false).
107 public boolean parse(AlignmentI align, Hashtable colours,
110 return parse(align, colours, null, removeHTML, false);
114 * Parse GFF or sequence features file optionally using case-independent
115 * matching, discarding URLs
118 * - alignment/dataset containing sequences that are to be annotated
120 * - hashtable to store feature colour definitions
122 * - process html strings into plain text
123 * @param relaxedIdMatching
124 * - when true, ID matches to compound sequence IDs are allowed
125 * @return true if features were added
// Convenience overload: delegates to the five-argument parse() with no
// feature-link map (null), passing relaxedIdMatching through unchanged.
127 public boolean parse(AlignmentI align, Map colours, boolean removeHTML,
128 boolean relaxedIdMatching)
130 return parse(align, colours, null, removeHTML, relaxedIdMatching);
134 * Parse GFF or sequence features file optionally using case-independent
138 * - alignment/dataset containing sequences that are to be annotated
140 * - hashtable to store feature colour definitions
142 * - hashtable to store associated URLs
144 * - process html strings into plain text
145 * @return true if features were added
// Convenience overload: delegates to the five-argument parse() with relaxed
// ID matching switched off (false).
147 public boolean parse(AlignmentI align, Map colours, Map featureLink,
150 return parse(align, colours, featureLink, removeHTML, false);
154 * Parse GFF or sequence features file
157 * - alignment/dataset containing sequences that are to be annotated
159 * - hashtable to store feature colour definitions
161 * - hashtable to store associated URLs
163 * - process html strings into plain text
164 * @param relaxedIdmatching
165 * - when true, ID matches to compound sequence IDs are allowed
166 * @return true if features were added
168 public boolean parse(AlignmentI align, Map colours, Map featureLink,
169 boolean removeHTML, boolean relaxedIdmatching)
175 SequenceI seq = null;
176 String type, desc, token = null;
178 int index, start, end;
182 String featureGroup = null, groupLink = null;
183 Map typeLink = new Hashtable();
185 * when true, assume GFF style features rather than Jalview style.
187 boolean GFFFile = true;
188 while ((line = nextLine()) != null)
190 if (line.startsWith("#"))
195 st = new StringTokenizer(line, "\t");
196 if (st.countTokens() == 1)
198 if (line.trim().equalsIgnoreCase("GFF"))
200 // Start parsing file as if it might be GFF again.
205 if (st.countTokens() > 1 && st.countTokens() < 4)
208 type = st.nextToken();
209 if (type.equalsIgnoreCase("startgroup"))
211 featureGroup = st.nextToken();
212 if (st.hasMoreElements())
214 groupLink = st.nextToken();
215 featureLink.put(featureGroup, groupLink);
218 else if (type.equalsIgnoreCase("endgroup"))
220 // We should check whether this is the current group,
221 // but at present theres no way of showing more than 1 group
228 Object colour = null;
229 String colscheme = st.nextToken();
230 if (colscheme.indexOf("|") > -1
231 || colscheme.trim().equalsIgnoreCase("label"))
233 // Parse '|' separated graduated colourscheme fields:
234 // [label|][mincolour|maxcolour|[absolute|]minvalue|maxvalue|thresholdtype|thresholdvalue]
235 // can either provide 'label' only, first is optional, next two
236 // colors are required (but may be
237 // left blank), next is optional, nxt two min/max are required.
238 // first is either 'label'
239 // first/second and third are both hexadecimal or word equivalent
241 // next two are values parsed as floats.
242 // fifth is either 'above','below', or 'none'.
243 // sixth is a float value and only required when fifth is either
244 // 'above' or 'below'.
245 StringTokenizer gcol = new StringTokenizer(colscheme, "|",
248 int threshtype = AnnotationColourGradient.NO_THRESHOLD;
249 float min = Float.MIN_VALUE, max = Float.MAX_VALUE, threshval = Float.NaN;
250 boolean labelCol = false;
252 String mincol = gcol.nextToken();
256 .println("Expected either 'label' or a colour specification in the line: "
260 String maxcol = null;
261 if (mincol.toLowerCase().indexOf("label") == 0)
264 mincol = (gcol.hasMoreTokens() ? gcol.nextToken() : null); // skip
266 mincol = (gcol.hasMoreTokens() ? gcol.nextToken() : null);
268 String abso = null, minval, maxval;
271 // at least four more tokens
272 if (mincol.equals("|"))
278 gcol.nextToken(); // skip next '|'
280 // continue parsing rest of line
281 maxcol = gcol.nextToken();
282 if (maxcol.equals("|"))
288 gcol.nextToken(); // skip next '|'
290 abso = gcol.nextToken();
291 gcol.nextToken(); // skip next '|'
292 if (abso.toLowerCase().indexOf("abso") != 0)
299 minval = gcol.nextToken();
300 gcol.nextToken(); // skip next '|'
302 maxval = gcol.nextToken();
303 if (gcol.hasMoreTokens())
305 gcol.nextToken(); // skip next '|'
309 if (minval.length() > 0)
311 min = new Float(minval).floatValue();
313 } catch (Exception e)
316 .println("Couldn't parse the minimum value for graduated colour for type ("
318 + ") - did you misspell 'auto' for the optional automatic colour switch ?");
323 if (maxval.length() > 0)
325 max = new Float(maxval).floatValue();
327 } catch (Exception e)
330 .println("Couldn't parse the maximum value for graduated colour for type ("
337 // add in some dummy min/max colours for the label-only
344 colour = new jalview.schemes.GraduatedColor(
345 new UserColourScheme(mincol).findColour('A'),
346 new UserColourScheme(maxcol).findColour('A'), min,
348 } catch (Exception e)
351 .println("Couldn't parse the graduated colour scheme ("
357 ((jalview.schemes.GraduatedColor) colour)
358 .setColourByLabel(labelCol);
359 ((jalview.schemes.GraduatedColor) colour)
360 .setAutoScaled(abso == null);
361 // add in any additional parameters
362 String ttype = null, tval = null;
363 if (gcol.hasMoreTokens())
365 // threshold type and possibly a threshold value
366 ttype = gcol.nextToken();
367 if (ttype.toLowerCase().startsWith("below"))
369 ((jalview.schemes.GraduatedColor) colour)
370 .setThreshType(AnnotationColourGradient.BELOW_THRESHOLD);
372 else if (ttype.toLowerCase().startsWith("above"))
374 ((jalview.schemes.GraduatedColor) colour)
375 .setThreshType(AnnotationColourGradient.ABOVE_THRESHOLD);
379 ((jalview.schemes.GraduatedColor) colour)
380 .setThreshType(AnnotationColourGradient.NO_THRESHOLD);
381 if (!ttype.toLowerCase().startsWith("no"))
384 .println("Ignoring unrecognised threshold type : "
389 if (((GraduatedColor) colour).getThreshType() != AnnotationColourGradient.NO_THRESHOLD)
394 tval = gcol.nextToken();
395 ((jalview.schemes.GraduatedColor) colour)
396 .setThresh(new Float(tval).floatValue());
397 } catch (Exception e)
400 .println("Couldn't parse threshold value as a float: ("
405 // parse the thresh-is-min token ?
406 if (gcol.hasMoreTokens())
409 .println("Ignoring additional tokens in parameters in graduated colour specification\n");
410 while (gcol.hasMoreTokens())
412 System.err.println("|" + gcol.nextToken());
414 System.err.println("\n");
420 UserColourScheme ucs = new UserColourScheme(colscheme);
421 colour = ucs.findColour('A');
425 colours.put(type, colour);
427 if (st.hasMoreElements())
429 String link = st.nextToken();
430 typeLink.put(type, link);
431 if (featureLink == null)
433 featureLink = new Hashtable();
435 featureLink.put(type, link);
441 while (st.hasMoreElements())
446 // Still possible this is an old Jalview file,
447 // which does not have type colours at the beginning
448 seqId = token = st.nextToken();
449 seq = findName(align, seqId, relaxedIdmatching);
452 desc = st.nextToken();
454 if (doGffSource && desc.indexOf(' ') == -1)
456 // could also be a source term rather than description line
457 group = new String(desc);
459 type = st.nextToken();
462 String stt = st.nextToken();
463 if (stt.length() == 0 || stt.equals("-"))
469 start = Integer.parseInt(stt);
471 } catch (NumberFormatException ex)
477 String stt = st.nextToken();
478 if (stt.length() == 0 || stt.equals("-"))
484 end = Integer.parseInt(stt);
486 } catch (NumberFormatException ex)
490 // TODO: decide if non positional feature assertion for input data
491 // where end==0 is generally valid
494 // treat as non-positional feature, regardless.
499 score = new Float(st.nextToken()).floatValue();
500 } catch (NumberFormatException ex)
505 sf = new SequenceFeature(type, desc, start, end, score, group);
509 sf.setValue("STRAND", st.nextToken());
510 sf.setValue("FRAME", st.nextToken());
511 } catch (Exception ex)
515 if (st.hasMoreTokens())
517 StringBuffer attributes = new StringBuffer();
518 while (st.hasMoreTokens())
520 attributes.append("\t" + st.nextElement());
522 // TODO validate and split GFF2 attributes field ? parse out
523 // ([A-Za-z][A-Za-z0-9_]*) <value> ; and add as
524 // sf.setValue(attrib, val);
525 sf.setValue("ATTRIBUTES", attributes.toString());
528 seq.addSequenceFeature(sf);
529 while ((seq = align.findName(seq, seqId, true)) != null)
531 seq.addSequenceFeature(new SequenceFeature(sf));
537 if (GFFFile && seq == null)
543 desc = st.nextToken();
545 if (!st.hasMoreTokens())
548 .println("DEBUG: Run out of tokens when trying to identify the destination for the feature.. giving up.");
549 // in all probability, this isn't a file we understand, so bail
554 token = st.nextToken();
556 if (!token.equals("ID_NOT_SPECIFIED"))
558 seq = findName(align, seqId = token, relaxedIdmatching);
566 index = Integer.parseInt(st.nextToken());
567 seq = align.getSequenceAt(index);
568 } catch (NumberFormatException ex)
576 System.out.println("Sequence not found: " + line);
580 start = Integer.parseInt(st.nextToken());
581 end = Integer.parseInt(st.nextToken());
583 type = st.nextToken();
585 if (!colours.containsKey(type))
587 // Probably the old style groups file
588 UserColourScheme ucs = new UserColourScheme(type);
589 colours.put(type, ucs.findColour('A'));
591 sf = new SequenceFeature(type, desc, "", start, end, featureGroup);
592 if (st.hasMoreTokens())
596 score = new Float(st.nextToken()).floatValue();
597 // update colourgradient bounds if allowed to
598 } catch (NumberFormatException ex)
604 if (groupLink != null && removeHTML)
606 sf.addLink(groupLink);
607 sf.description += "%LINK%";
609 if (typeLink.containsKey(type) && removeHTML)
611 sf.addLink(typeLink.get(type).toString());
612 sf.description += "%LINK%";
615 parseDescriptionHTML(sf, removeHTML);
617 seq.addSequenceFeature(sf);
620 && (seq = align.findName(seq, seqId, false)) != null)
622 seq.addSequenceFeature(new SequenceFeature(sf));
624 // If we got here, its not a GFFFile
629 } catch (Exception ex)
631 System.out.println(line);
632 System.out.println("Error parsing feature file: " + ex + "\n" + line);
633 ex.printStackTrace(System.err);
// Alignment the current matcher was built from; findName() rebuilds the
// matcher whenever it is handed a different alignment object.
641 private AlignmentI lastmatchedAl = null;
// Matcher used by findName() when relaxed ID matching is requested.
643 private SequenceIdMatcher matcher = null;
646 * clear any temporary handles used to speed up ID matching
// Forgets the cached alignment reference, so the next relaxed lookup in
// findName() builds a fresh SequenceIdMatcher.
648 private void resetMatcher()
650 lastmatchedAl = null;
// Looks up the sequence for seqId in align.
// With relaxedIdMatching the lookup goes through a SequenceIdMatcher built
// over align.getSequencesArray(); the matcher is cached and only rebuilt when
// a different alignment object is passed. Otherwise align.findName(seqId,
// true) is used.
// NOTE(review): presumably returns match (null when nothing was found) -
// confirm.
654 private SequenceI findName(AlignmentI align, String seqId,
655 boolean relaxedIdMatching)
657 SequenceI match = null;
658 if (relaxedIdMatching)
660 if (lastmatchedAl != align)
662 matcher = new SequenceIdMatcher(
663 (lastmatchedAl = align).getSequencesArray());
665 match = matcher.findIdMatch(seqId);
669 match = align.findName(seqId, true);
// Runs the feature's description through jalview.util.ParseHtmlBodyAndLinks
// and writes the result back to sf.description; when removeHTML is true the
// non-HTML content is used. The links found by the parser are then iterated.
// NOTE(review): a null description presumably returns early, and each parsed
// link is presumably added to sf - confirm.
674 public void parseDescriptionHTML(SequenceFeature sf, boolean removeHTML)
676 if (sf.getDescription() == null)
680 jalview.util.ParseHtmlBodyAndLinks parsed = new jalview.util.ParseHtmlBodyAndLinks(
681 sf.getDescription(), removeHTML, newline);
683 sf.description = (removeHTML) ? parsed.getNonHtmlContent()
685 for (String link : parsed.getLinks())
693 * generate a features file for seqs; includes non-positional features by default.
696 * source of sequence features
698 * hash of feature types and colours
699 * @return features file contents
// Convenience overload: delegates to the four-argument printJalviewFormat()
// with visOnly = true and nonpos = true.
701 public String printJalviewFormat(SequenceI[] seqs, Hashtable visible)
703 return printJalviewFormat(seqs, visible, true, true);
707 * generate a features file for seqs with colours from visible (if any)
712 * hash of Colours for each feature type
714 * when true only feature types in 'visible' will be output
716 * indicates if non-positional features should be output (regardless
718 * @return features file contents
// Builds the Jalview features-file text for seqs.
// When visible is given and visOnly is set, a colour definition is written
// for each feature type in visible first: graduated colours as '|'-separated
// fields, java.awt.Color values as hex strings, anything else as a legacy
// integer colour value. The feature groups present in the output are then
// collected, and features are written group by group between STARTGROUP and
// ENDGROUP markers.
//
// seqs: sequences whose features are written.
// visible: feature type -> colour table; may be null.
// visOnly: when true, positional features whose type is not a key of visible
// are skipped.
// nonpos: when false, features with begin == 0 and end == 0 are skipped.
//
// Returns the generated text, or "No Features Visible" when visOnly is set,
// nonpos is false and visible is null or empty.
// NOTE(review): the final "No Features Visible" return presumably fires when
// no feature was written (featuresGen) - confirm.
720 public String printJalviewFormat(SequenceI[] seqs, Hashtable visible,
721 boolean visOnly, boolean nonpos)
723 StringBuffer out = new StringBuffer();
724 SequenceFeature[] next;
725 boolean featuresGen = false;
726 if (visOnly && !nonpos && (visible == null || visible.size() < 1))
728 // no point continuing.
729 return "No Features Visible";
732 if (visible != null && visOnly)
734 // write feature colours only if we're given them and we are generating
736 // TODO: decide if feature links should also be written here ?
737 Enumeration en = visible.keys();
739 while (en.hasMoreElements())
741 type = en.nextElement().toString();
743 if (visible.get(type) instanceof GraduatedColor)
745 GraduatedColor gc = (GraduatedColor) visible.get(type);
746 color = (gc.isColourByLabel() ? "label|" : "")
747 + Format.getHexString(gc.getMinColor()) + "|"
748 + Format.getHexString(gc.getMaxColor())
749 + (gc.isAutoScale() ? "|" : "|abso|") + gc.getMin() + "|"
751 if (gc.getThreshType() != AnnotationColourGradient.NO_THRESHOLD)
753 if (gc.getThreshType() == AnnotationColourGradient.BELOW_THRESHOLD)
759 if (gc.getThreshType() != AnnotationColourGradient.ABOVE_THRESHOLD)
761 System.err.println("WARNING: Unsupported threshold type ("
762 + gc.getThreshType() + ") : Assuming 'above'");
767 color += "|" + gc.getThresh();
774 else if (visible.get(type) instanceof java.awt.Color)
776 color = Format.getHexString((java.awt.Color) visible.get(type));
780 // legacy support for integer objects containing colour triplet values
781 color = Format.getHexString(new java.awt.Color(Integer
782 .parseInt(visible.get(type).toString())));
790 // Work out which groups are both present and visible
791 Vector groups = new Vector();
793 boolean isnonpos = false;
795 for (int i = 0; i < seqs.length; i++)
797 next = seqs[i].getSequenceFeatures();
800 for (int j = 0; j < next.length; j++)
// a feature with begin == 0 and end == 0 is treated as non-positional
802 isnonpos = next[j].begin == 0 && next[j].end == 0;
803 if ((!nonpos && isnonpos)
804 || (!isnonpos && visOnly && !visible
805 .containsKey(next[j].type)))
810 if (next[j].featureGroup != null
811 && !groups.contains(next[j].featureGroup))
813 groups.addElement(next[j].featureGroup);
823 if (groups.size() > 0 && groupIndex < groups.size())
825 group = groups.elementAt(groupIndex).toString();
827 out.append("STARTGROUP\t");
836 for (int i = 0; i < seqs.length; i++)
838 next = seqs[i].getSequenceFeatures();
841 for (int j = 0; j < next.length; j++)
843 isnonpos = next[j].begin == 0 && next[j].end == 0;
844 if ((!nonpos && isnonpos)
845 || (!isnonpos && visOnly && !visible
846 .containsKey(next[j].type)))
848 // skip if feature is nonpos and we ignore them or if we only
849 // output visible and it isn't non-pos and it's not visible
854 && (next[j].featureGroup == null || !next[j].featureGroup
860 if (group == null && next[j].featureGroup != null)
864 // we have features to output
// an empty or missing description is replaced by the feature type
866 if (next[j].description == null
867 || next[j].description.equals(""))
869 out.append(next[j].type + "\t");
873 if (next[j].links != null
874 && next[j].getDescription().indexOf("<html>") == -1)
876 out.append("<html>");
879 out.append(next[j].description + " ");
880 if (next[j].links != null)
882 for (int l = 0; l < next[j].links.size(); l++)
// each stored link has the form "label|href"
884 String label = next[j].links.elementAt(l).toString();
885 String href = label.substring(label.indexOf("|") + 1);
886 label = label.substring(0, label.indexOf("|"));
// only add an anchor when the description does not already contain the href
888 if (next[j].description.indexOf(href) == -1)
890 out.append("<a href=\"" + href + "\">" + label + "</a>");
894 if (next[j].getDescription().indexOf("</html>") == -1)
896 out.append("</html>");
902 out.append(seqs[i].getName());
903 out.append("\t-1\t");
904 out.append(next[j].begin);
906 out.append(next[j].end);
908 out.append(next[j].type);
// Float.NaN never compares equal to anything (not even itself), so the
// original "!= Float.NaN" test was always true; Float.isNaN is the only
// reliable check and keeps undefined scores out of the output.
909 if (!Float.isNaN(next[j].score))
912 out.append(next[j].score);
921 out.append("ENDGROUP\t");
931 } while (groupIndex < groups.size() + 1);
935 return "No Features Visible";
938 return out.toString();
942 * generate a gff file for sequence features; includes non-pos features by
// Convenience overload: delegates to the four-argument printGFFFormat() with
// visOnly = true and nonpos = true.
949 public String printGFFFormat(SequenceI[] seqs, Hashtable visible)
951 return printGFFFormat(seqs, visible, true, true);
// Builds GFF text for the features of seqs, one newline-terminated record per
// feature: sequence name, source, type, begin, end and score, followed by the
// STRAND, FRAME and ATTRIBUTES values when the feature carries them.
// Features are filtered as in printJalviewFormat(): non-positional ones
// (begin == 0 and end == 0) are skipped when nonpos is false, and positional
// ones whose type is not a key of visible are skipped when visOnly is true.
// NOTE(review): visible is dereferenced without a null check when visOnly is
// true - confirm callers never pass null in that case.
954 public String printGFFFormat(SequenceI[] seqs, Hashtable visible,
955 boolean visOnly, boolean nonpos)
957 StringBuffer out = new StringBuffer();
958 SequenceFeature[] next;
961 for (int i = 0; i < seqs.length; i++)
963 if (seqs[i].getSequenceFeatures() != null)
965 next = seqs[i].getSequenceFeatures();
966 for (int j = 0; j < next.length; j++)
968 isnonpos = next[j].begin == 0 && next[j].end == 0;
969 if ((!nonpos && isnonpos)
970 || (!isnonpos && visOnly && !visible
971 .containsKey(next[j].type)))
// source comes from the feature group
// NOTE(review): presumably falls back to the description only when the group
// is null - confirm
976 source = next[j].featureGroup;
979 source = next[j].getDescription();
982 out.append(seqs[i].getName());
986 out.append(next[j].type);
988 out.append(next[j].begin);
990 out.append(next[j].end);
992 out.append(next[j].score);
995 if (next[j].getValue("STRAND") != null)
997 out.append(next[j].getValue("STRAND"));
1005 if (next[j].getValue("FRAME") != null)
1007 out.append(next[j].getValue("FRAME"));
1013 // TODO: verify/check GFF - should there be a /t here before attribute
1016 if (next[j].getValue("ATTRIBUTES") != null)
1018 out.append(next[j].getValue("ATTRIBUTES"));
1021 out.append(newline);
1027 return out.toString();
1031 * this is only for the benefit of object polymorphism - method does nothing.
1039 * this is only for the benefit of object polymorphism - method only returns a usage message.
1041 * @return error message
// Does not produce any features output; always returns a fixed message that
// points callers to printGFFFormat() or printJalviewFormat().
1043 public String print()
1045 return "USE printGFFFormat() or printJalviewFormat()";