From da62703518a88707b9144bc51e50d6af7093a7c7 Mon Sep 17 00:00:00 2001 From: gmungoc Date: Tue, 18 Aug 2020 13:05:24 +0100 Subject: [PATCH] JAL-1260 refactored GenBank (and EMBL) flat file parser --- src/jalview/io/EmblFlatFile.java | 770 +-------------------------------- src/jalview/io/FlatFile.java | 766 ++++++++++++++++++++++++++++++++ src/jalview/io/GenBankFile.java | 207 +++++++++ src/jalview/util/DnaUtils.java | 1 + test/jalview/io/EmblFlatFileTest.java | 10 +- test/jalview/io/GenBankFileTest.java | 204 +++++++++ test/jalview/io/J03321.embl.txt | 4 +- test/jalview/io/J03321.gb | 258 +++++++++++ 8 files changed, 1456 insertions(+), 764 deletions(-) create mode 100644 src/jalview/io/FlatFile.java create mode 100644 src/jalview/io/GenBankFile.java create mode 100644 test/jalview/io/GenBankFileTest.java create mode 100644 test/jalview/io/J03321.gb diff --git a/src/jalview/io/EmblFlatFile.java b/src/jalview/io/EmblFlatFile.java index 92af0df..19496ef 100644 --- a/src/jalview/io/EmblFlatFile.java +++ b/src/jalview/io/EmblFlatFile.java @@ -1,28 +1,10 @@ package jalview.io; import java.io.IOException; -import java.text.ParseException; -import java.util.ArrayList; -import java.util.Arrays; -import java.util.HashMap; -import java.util.Hashtable; -import java.util.List; -import java.util.Map; -import java.util.Map.Entry; -import java.util.TreeMap; import jalview.bin.Cache; import jalview.datamodel.DBRefEntry; -import jalview.datamodel.DBRefSource; -import jalview.datamodel.FeatureProperties; -import jalview.datamodel.Mapping; -import jalview.datamodel.Sequence; -import jalview.datamodel.SequenceFeature; -import jalview.datamodel.SequenceI; import jalview.util.DBRefUtils; -import jalview.util.DnaUtils; -import jalview.util.MapList; -import jalview.util.MappingUtils; /** * A class that provides selective parsing of the EMBL flatfile format. @@ -43,66 +25,10 @@ import jalview.util.MappingUtils; * @see ftp://ftp.ebi.ac.uk/pub/databases/ena/sequence/release/doc/usrman.txt * @see ftp://ftp.ebi.ac.uk/pub/databases/embl/doc/FT_current.html */ -public class EmblFlatFile extends AlignFile // FileParse +public class EmblFlatFile extends FlatFile { - private static final String QUOTE = "\""; - - private static final String DOUBLED_QUOTE = QUOTE + QUOTE; - - /** - * when true, interpret the mol_type 'source' feature attribute and generate - * an RNA sequence from the DNA record - */ - private boolean produceRna = true; - /** - * A data bean class to hold values parsed from one CDS Feature (FT) - */ - class CdsData - { - String translation; // from CDS feature /translation - - String cdsLocation; // CDS /location raw value - - int codonStart = 1; // from CDS /codon_start - - String proteinName; // from CDS /product; used for protein description - - String proteinId; // from CDS /protein_id - - List xrefs = new ArrayList<>(); // from CDS /db_xref qualifiers - - Map cdsProps = new Hashtable<>(); // CDS other qualifiers - } - - private static final String WHITESPACE = "\\s+"; - - private String sourceDb; - - /* - * values parsed from the EMBL flatfile record - */ - private String accession; // from ID (first token) - - private String version; // from ID (second token) - - private String description; // from (first) DE line - - private int length = 128; // from ID (7th token), with usable default - - private List dbrefs; // from DR - - private boolean sequenceStringIsRNA = false; - - private String sequenceString; // from SQ lines - - /* - * parsed CDS data fields, keyed by protein_id - */ - private Map cds; - - /** - * Constructor + * Constructor given a data source and the id of the source database * * @param fp * @param sourceId @@ -110,14 +36,7 @@ public class EmblFlatFile extends AlignFile // FileParse */ public EmblFlatFile(FileParse fp, String sourceId) throws IOException { - super(false, fp); // don't parse immediately - this.sourceDb = sourceId; - dbrefs = new ArrayList<>(); - - /* - * using TreeMap gives CDS sequences in alphabetical, so readable, order - */ - cds = new TreeMap<>(String.CASE_INSENSITIVE_ORDER); + super(fp, sourceId); } /** @@ -126,6 +45,7 @@ public class EmblFlatFile extends AlignFile // FileParse * * @throws IOException */ + @Override public void parse() throws IOException { String line = nextLine(); @@ -145,11 +65,11 @@ public class EmblFlatFile extends AlignFile // FileParse } else if (line.startsWith("SQ")) { - line = parseSQ(); + line = parseSequence(); } else if (line.startsWith("FT")) { - line = parseFT(line); + line = parseFeature(line.substring(2)); } else { @@ -281,683 +201,9 @@ public class EmblFlatFile extends AlignFile // FileParse return nextLine(); } - /** - * Reads and saves the sequence, read from the lines following the SQ line. - * Whitespace and position counters are discarded. Returns the next line - * following the sequence data (the next line that doesn't start with - * whitespace). - * - * @throws IOException - */ - String parseSQ() throws IOException - { - StringBuilder sb = new StringBuilder(this.length); - String line = nextLine(); - while (line != null && line.startsWith(" ")) - { - line = line.trim(); - String[] blocks = line.split(WHITESPACE); - - /* - * omit the last block (position counter) on each line - */ - for (int i = 0; i < blocks.length - 1; i++) - { - sb.append(blocks[i]); - } - line = nextLine(); - } - this.sequenceString = sb.toString(); - - return line; - } - - /** - * Processes an FT line. If it declares a feature type of interest (currently, - * only CDS is processed), processes all of the associated lines (feature - * qualifiers), and returns the next line after that, otherwise simply returns - * the next line. - * - * @param line - * @return - * @throws IOException - */ - String parseFT(String line) throws IOException - { - String[] tokens = line.split(WHITESPACE); - if (tokens.length < 3 - || (!"CDS".equals(tokens[1]) && !"source".equals(tokens[1]))) - { - return nextLine(); - } - - if (tokens[1].equals("source")) - { - return parseSourceQualifiers(tokens); - } - - /* - * parse location - which may be over more than one line e.g. EAW51554 - */ - CdsData data = new CdsData(); - data.cdsLocation = tokens[2]; - // TODO location can be over >1 line e.g. EAW51554 - - line = nextLine(); - while (line != null) - { - if (!line.startsWith("FT ")) // 4 spaces - { - // e.g. start of next feature "FT source..." - break; - } - - /* - * extract qualifier, e.g. FT /protein_id="CAA37824.1" - * - the value may extend over more than one line - * - if the value has enclosing quotes, these are removed - * - escaped double quotes ("") are reduced to a single character - */ - int slashPos = line.indexOf('/'); - if (slashPos == -1) - { - Cache.log.error("Unexpected EMBL line ignored: " + line); - line = nextLine(); - continue; - } - int eqPos = line.indexOf('=', slashPos + 1); - if (eqPos == -1) - { - // can happen, e.g. /ribosomal_slippage - // Cache.log.error("Unexpected EMBL line ignored: " + line); - line = nextLine(); - continue; - } - String qualifier = line.substring(slashPos + 1, eqPos); - String value = line.substring(eqPos + 1); - value = removeQuotes(value); - StringBuilder sb = new StringBuilder().append(value); - line = parseFeatureQualifier(sb, qualifier); - String featureValue = sb.toString(); - - if ("protein_id".equals(qualifier)) - { - data.proteinId = featureValue; - } - else if ("codon_start".equals(qualifier)) - { - try - { - data.codonStart = Integer.parseInt(featureValue.trim()); - } catch (NumberFormatException e) - { - Cache.log.error("Invalid codon_start in XML for " + this.accession - + ": " + e.getMessage()); - } - } - else if ("db_xref".equals(qualifier)) - { - String[] parts = featureValue.split(":"); - if (parts.length == 2) - { - String db = parts[0].trim(); - db = DBRefUtils.getCanonicalName(db); - DBRefEntry dbref = new DBRefEntry(db, "0", parts[1].trim()); - data.xrefs.add(dbref); - } - } - else if ("product".equals(qualifier)) - { - data.proteinName = featureValue; - } - else if ("translation".equals(qualifier)) - { - data.translation = featureValue; - } - else if (!"".equals(featureValue)) - { - // throw anything else into the additional properties hash - data.cdsProps.put(qualifier, featureValue); - } - } - - if (data.proteinId != null) - { - this.cds.put(data.proteinId, data); - } - else - { - Cache.log.error("Ignoring CDS feature with no protein_id for " - + sourceDb + ":" + accession); - } - - return line; - } - - /** - * process attributes for 'source' until the next FT feature entry only - * interested in 'mol_type' - * - * @param tokens - * @return - * @throws IOException - */ - private String parseSourceQualifiers(String[] tokens) throws IOException - { - if (!"source".equals(tokens[1])) - { - throw (new RuntimeException("Not given a source qualifier")); - } - // search for mol_type attribute - - StringBuilder sb = new StringBuilder().append(tokens[2]); // extent of - // sequence - - String line = parseFeatureQualifier(sb, "source"); - while (line != null) - { - if (!line.startsWith("FT ")) // four spaces, end of this feature table - // entry - { - return line; - } - - int p = line.indexOf("\\mol_type"); - int qs = line.indexOf("\"", p); - int qe = line.indexOf("\"", qs + 1); - String qualifier = line.substring(qs, qe).toLowerCase(); - if (qualifier.indexOf("rna") > -1) - { - sequenceStringIsRNA = true; - } - if (qualifier.indexOf("dna") > -1) - { - sequenceStringIsRNA = false; - } - line = parseFeatureQualifier(sb, "source"); - } - return line; - } - - /** - * Removes leading or trailing double quotes (") unless doubled, and changes - * any 'escaped' (doubled) double quotes to single characters. As per the - * Feature Table specification for Qualifiers, Free Text. - * - * @param value - * @return - */ - static String removeQuotes(String value) - { - if (value == null) - { - return null; - } - if (value.startsWith(QUOTE) && !value.startsWith(DOUBLED_QUOTE)) - { - value = value.substring(1); - } - if (value.endsWith(QUOTE) && !value.endsWith(DOUBLED_QUOTE)) - { - value = value.substring(0, value.length() - 1); - } - value = value.replace(DOUBLED_QUOTE, QUOTE); - return value; - } - - /** - * Reads the value of a feature (FT) qualifier from one or more lines of the - * file, and returns the next line after that. Values are appended to the - * string buffer, which should be already primed with the value read from the - * first line for the qualifier (with any leading double quote removed). - * Enclosing double quotes are removed, and escaped (repeated) double quotes - * reduced to one only. For example for - * - *
-   * FT      /note="gene_id=hCG28070.3 
-   * FT      ""foobar"" isoform=CRA_b"
-   * the returned value is
-   * gene_id=hCG28070.3 "foobar" isoform=CRA_b
-   * 
- * - * Note the side-effect of this method, to advance data reading to the next - * line after the feature qualifier. - * - * @param sb - * a string buffer primed with the first line of the value - * @param qualifierName - * @return - * @throws IOException - */ - String parseFeatureQualifier(StringBuilder sb, String qualifierName) - throws IOException - { - String line; - while ((line = nextLine()) != null) - { - if (!line.startsWith("FT ")) - { - break; // reached next feature or other input line - } - String[] tokens = line.split(WHITESPACE); - if (tokens.length < 2) - { - Cache.log.error("Ignoring bad EMBL line for " + this.accession - + ": " + line); - break; - } - if (tokens[1].startsWith("/")) - { - break; // next feature qualifier - } - - /* - * heuristic rule: most multi-line value (e.g. /product) are text, - * so add a space for word boundary at a new line; not for translation - */ - if (!"translation".equals(qualifierName)) - { - sb.append(" "); - } - - /* - * remove trailing " and unescape doubled "" - */ - String data = removeQuotes(tokens[1]); - sb.append(data); - } - - return line; - } - - /** - * Constructs and saves the sequence from parsed components - */ - void buildSequence() - { - if (this.accession == null || this.sequenceString == null) - { - Cache.log.error("Failed to parse data from EMBL"); - return; - } - - String name = this.accession; - if (this.sourceDb != null) - { - name = this.sourceDb + "|" + name; - } - - if (produceRna && sequenceStringIsRNA) - { - sequenceString = sequenceString.replace('T', 'U').replace('t', 'u'); - } - - SequenceI seq = new Sequence(name, this.sequenceString); - seq.setDescription(this.description); - - /* - * add a DBRef to itself - */ - DBRefEntry selfRef = new DBRefEntry(sourceDb, version, accession); - int[] startEnd = new int[] { 1, seq.getLength() }; - selfRef.setMap(new Mapping(null, startEnd, startEnd, 1, 1)); - seq.addDBRef(selfRef); - - for (DBRefEntry dbref : this.dbrefs) - { - seq.addDBRef(dbref); - } - - processCDSFeatures(seq); - - seq.deriveSequence(); - - addSequence(seq); - } - - /** - * Process the CDS features, including generation of cross-references and - * mappings to the protein products (translation) - * - * @param seq - */ - protected void processCDSFeatures(SequenceI seq) - { - /* - * record protein products found to avoid duplication i.e. >1 CDS with - * the same /protein_id [though not sure I can find an example of this] - */ - Map proteins = new HashMap<>(); - for (CdsData data : cds.values()) - { - processCDSFeature(seq, data, proteins); - } - } - - /** - * Processes data for one parsed CDS feature to - *
    - *
  • create a protein product sequence for the translation
  • - *
  • create a cross-reference to protein with mapping from dna
  • - *
  • add a CDS feature to the sequence for each CDS start-end range
  • - *
  • add any CDS dbrefs to the sequence and to the protein product
  • - *
- * - * @param SequenceI - * dna - * @param proteins - * map of protein products so far derived from CDS data - */ - void processCDSFeature(SequenceI dna, CdsData data, - Map proteins) - { - /* - * parse location into a list of [start, end, start, end] positions - */ - int[] exons = getCdsRanges(this.accession, data.cdsLocation); - - MapList maplist = buildMappingToProtein(dna, exons, data); - - int exonNumber = 0; - - for (int xint = 0; exons != null && xint < exons.length - 1; xint += 2) - { - int exonStart = exons[xint]; - int exonEnd = exons[xint + 1]; - int begin = Math.min(exonStart, exonEnd); - int end = Math.max(exonStart, exonEnd); - exonNumber++; - String desc = String.format("Exon %d for protein EMBLCDS:%s", - exonNumber, data.proteinId); - - SequenceFeature sf = new SequenceFeature("CDS", desc, begin, end, - this.sourceDb); - for (Entry val : data.cdsProps.entrySet()) - { - sf.setValue(val.getKey(), val.getValue()); - } - - sf.setEnaLocation(data.cdsLocation); - boolean forwardStrand = exonStart <= exonEnd; - sf.setStrand(forwardStrand ? "+" : "-"); - sf.setPhase(String.valueOf(data.codonStart - 1)); - sf.setValue(FeatureProperties.EXONPOS, exonNumber); - sf.setValue(FeatureProperties.EXONPRODUCT, data.proteinName); - - dna.addSequenceFeature(sf); - } - - boolean hasUniprotDbref = false; - for (DBRefEntry xref : data.xrefs) - { - dna.addDBRef(xref); - if (xref.getSource().equals(DBRefSource.UNIPROT)) - { - /* - * construct (or find) the sequence for (data.protein_id, data.translation) - */ - SequenceI protein = buildProteinProduct(dna, xref, data, proteins); - Mapping map = new Mapping(protein, maplist); - map.setMappedFromId(data.proteinId); - xref.setMap(map); - - /* - * add DBRefs with mappings from dna to protein and the inverse - */ - DBRefEntry db1 = new DBRefEntry(sourceDb, version, accession); - db1.setMap(new Mapping(dna, maplist.getInverse())); - protein.addDBRef(db1); - - hasUniprotDbref = true; - } - } - - /* - * if we have a product (translation) but no explicit Uniprot dbref - * (example: EMBL M19487 protein_id AAB02592.1) - * then construct mappings to an assumed EMBLCDSPROTEIN accession - */ - if (!hasUniprotDbref) - { - SequenceI protein = proteins.get(data.proteinId); - if (protein == null) - { - protein = new Sequence(data.proteinId, data.translation); - protein.setDescription(data.proteinName); - proteins.put(data.proteinId, protein); - } - // assuming CDSPROTEIN sequence version = dna version (?!) - DBRefEntry db1 = new DBRefEntry(DBRefSource.EMBLCDSProduct, - this.version, data.proteinId); - protein.addDBRef(db1); - - DBRefEntry dnaToEmblProteinRef = new DBRefEntry( - DBRefSource.EMBLCDSProduct, this.version, data.proteinId); - Mapping map = new Mapping(protein, maplist); - map.setMappedFromId(data.proteinId); - dnaToEmblProteinRef.setMap(map); - dna.addDBRef(dnaToEmblProteinRef); - } - - /* - * comment brought forward from EmblXmlSource, lines 447-451: - * TODO: if retrieved from EMBLCDS, add a DBRef back to the parent EMBL - * sequence with the exon map; if given a dataset reference, search - * dataset for parent EMBL sequence if it exists and set its map; - * make a new feature annotating the coding contig - */ - } - - /** - * Computes a mapping from CDS positions in DNA sequence to protein product - * positions, with allowance for stop codon or incomplete start codon - * - * @param dna - * @param exons - * @param data - * @return - */ - MapList buildMappingToProtein(final SequenceI dna, final int[] exons, - final CdsData data) - { - MapList dnaToProteinMapping = null; - int peptideLength = data.translation.length(); - - int[] proteinRange = new int[] { 1, peptideLength }; - if (exons != null && exons.length > 0) - { - /* - * We were able to parse 'location'; do a final - * product length truncation check - */ - int[] cdsRanges = adjustForProteinLength(peptideLength, exons); - dnaToProteinMapping = new MapList(cdsRanges, proteinRange, 3, 1); - } - else - { - /* - * workaround until we handle all 'location' formats fully - * e.g. X53828.1:60..1058 or <123..>289 - */ - Cache.log.error(String.format( - "Implementation Notice: EMBLCDS location '%s'not properly supported yet" - + " - Making up the CDNA region of (%s:%s)... may be incorrect", - data.cdsLocation, sourceDb, this.accession)); - - int completeCodonsLength = 1 - data.codonStart + dna.getLength(); - int mappedDnaEnd = dna.getEnd(); - if (peptideLength * 3 == completeCodonsLength) - { - // this might occur for CDS sequences where no features are marked - Cache.log.warn("Assuming no stop codon at end of cDNA fragment"); - mappedDnaEnd = dna.getEnd(); - } - else if ((peptideLength + 1) * 3 == completeCodonsLength) - { - Cache.log.warn("Assuming stop codon at end of cDNA fragment"); - mappedDnaEnd = dna.getEnd() - 3; - } - - if (mappedDnaEnd != -1) - { - int[] cdsRanges = new int[] { - dna.getStart() + (data.codonStart - 1), mappedDnaEnd }; - dnaToProteinMapping = new MapList(cdsRanges, proteinRange, 3, 1); - } - } - - return dnaToProteinMapping; - } - - /** - * Constructs a sequence for the protein product for the CDS data (if there is - * one), and dbrefs with mappings from CDS to protein and the reverse - * - * @param dna - * @param xref - * @param data - * @param proteins - * @return - */ - SequenceI buildProteinProduct(SequenceI dna, DBRefEntry xref, - CdsData data, Map proteins) - { - /* - * check we have some data to work with - */ - if (data.proteinId == null || data.translation == null) - { - return null; - } - - /* - * Construct the protein sequence (if not already seen) - */ - String proteinSeqName = xref.getSource() + "|" + xref.getAccessionId(); - SequenceI protein = proteins.get(proteinSeqName); - if (protein == null) - { - protein = new Sequence(proteinSeqName, data.translation, 1, - data.translation.length()); - protein.setDescription(data.proteinName != null ? data.proteinName - : "Protein Product from " + sourceDb); - proteins.put(proteinSeqName, protein); - } - - return protein; - } - - /** - * Returns the CDS location as a single array of [start, end, start, end...] - * positions. If on the reverse strand, these will be in descending order. - * - * @param accession - * @param location - * @return - */ - protected int[] getCdsRanges(String accession, String location) - { - if (location == null) - { - return new int[] {}; - } - - try - { - List ranges = DnaUtils.parseLocation(location); - return MappingUtils.rangeListToArray(ranges); - } catch (ParseException e) - { - Cache.log.warn( - String.format("Not parsing inexact CDS location %s in ENA %s", - location, accession)); - return new int[] {}; - } - } - - /** - * Output (print) is not implemented for EMBL flat file format - */ @Override - public String print(SequenceI[] seqs, boolean jvsuffix) - { - return null; - } - - /** - * Truncates (if necessary) the exon intervals to match 3 times the length of - * the protein; also accepts 3 bases longer (for stop codon not included in - * protein) - * - * @param proteinLength - * @param exon - * an array of [start, end, start, end...] intervals - * @return the same array (if unchanged) or a truncated copy - */ - static int[] adjustForProteinLength(int proteinLength, int[] exon) + protected boolean isFeatureContinuationLine(String line) { - if (proteinLength <= 0 || exon == null) - { - return exon; - } - int expectedCdsLength = proteinLength * 3; - int exonLength = MappingUtils.getLength(Arrays.asList(exon)); - - /* - * if exon length matches protein, or is shorter, or longer by the - * length of a stop codon (3 bases), then leave it unchanged - */ - if (expectedCdsLength >= exonLength - || expectedCdsLength == exonLength - 3) - { - return exon; - } - - int origxon[]; - int sxpos = -1; - int endxon = 0; - origxon = new int[exon.length]; - System.arraycopy(exon, 0, origxon, 0, exon.length); - int cdspos = 0; - for (int x = 0; x < exon.length; x += 2) - { - cdspos += Math.abs(exon[x + 1] - exon[x]) + 1; - if (expectedCdsLength <= cdspos) - { - // advanced beyond last codon. - sxpos = x; - if (expectedCdsLength != cdspos) - { - // System.err - // .println("Truncating final exon interval on region by " - // + (cdspos - cdslength)); - } - - /* - * shrink the final exon - reduce end position if forward - * strand, increase it if reverse - */ - if (exon[x + 1] >= exon[x]) - { - endxon = exon[x + 1] - cdspos + expectedCdsLength; - } - else - { - endxon = exon[x + 1] + cdspos - expectedCdsLength; - } - break; - } - } - - if (sxpos != -1) - { - // and trim the exon interval set if necessary - int[] nxon = new int[sxpos + 2]; - System.arraycopy(exon, 0, nxon, 0, sxpos + 2); - nxon[sxpos + 1] = endxon; // update the end boundary for the new exon - // set - exon = nxon; - } - return exon; + return line.startsWith("FT "); // 4 spaces } } diff --git a/src/jalview/io/FlatFile.java b/src/jalview/io/FlatFile.java new file mode 100644 index 0000000..737f1d8 --- /dev/null +++ b/src/jalview/io/FlatFile.java @@ -0,0 +1,766 @@ +package jalview.io; + +import java.io.IOException; +import java.text.ParseException; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.HashMap; +import java.util.Hashtable; +import java.util.List; +import java.util.Map; +import java.util.Map.Entry; +import java.util.TreeMap; + +import jalview.bin.Cache; +import jalview.datamodel.DBRefEntry; +import jalview.datamodel.DBRefSource; +import jalview.datamodel.FeatureProperties; +import jalview.datamodel.Mapping; +import jalview.datamodel.Sequence; +import jalview.datamodel.SequenceFeature; +import jalview.datamodel.SequenceI; +import jalview.util.DBRefUtils; +import jalview.util.DnaUtils; +import jalview.util.MapList; +import jalview.util.MappingUtils; + +/** + * A base class to support parsing of GenBank, EMBL or DDBJ flat file format + * data. Example files (rather than formal specifications) are provided at + * + *
+ * https://ena-docs.readthedocs.io/en/latest/submit/fileprep/flat-file-example.html
+ * https://www.ncbi.nlm.nih.gov/Sitemap/samplerecord.html
+ * 
+ * + * or to compare the same entry, see + * + *
+ * https://www.ebi.ac.uk/ena/browser/api/embl/X81322.1
+ * https://www.ncbi.nlm.nih.gov/nuccore/X81322.1
+ * 
+ * + * The feature table part of the file has a common definition, only the start of + * each line is formatted differently in GenBank and EMBL. See + * http://www.insdc.org/files/feature_table.html#7.1. + */ +public abstract class FlatFile extends AlignFile +{ + protected static final String LOCATION = "location"; + + protected static final String QUOTE = "\""; + + protected static final String DOUBLED_QUOTE = QUOTE + QUOTE; + + protected static final String WHITESPACE = "\\s+"; + + /** + * Removes leading or trailing double quotes (") unless doubled, and changes + * any 'escaped' (doubled) double quotes to single characters. As per the + * Feature Table specification for Qualifiers, Free Text. + * + * @param value + * @return + */ + protected static String removeQuotes(String value) + { + if (value == null) + { + return null; + } + if (value.startsWith(QUOTE) && !value.startsWith(DOUBLED_QUOTE)) + { + value = value.substring(1); + } + if (value.endsWith(QUOTE) && !value.endsWith(DOUBLED_QUOTE)) + { + value = value.substring(0, value.length() - 1); + } + value = value.replace(DOUBLED_QUOTE, QUOTE); + return value; + } + + /** + * Truncates (if necessary) the exon intervals to match 3 times the length of + * the protein; also accepts 3 bases longer (for stop codon not included in + * protein) + * + * @param proteinLength + * @param exon + * an array of [start, end, start, end...] intervals + * @return the same array (if unchanged) or a truncated copy + */ + protected static int[] adjustForProteinLength(int proteinLength, + int[] exon) + { + if (proteinLength <= 0 || exon == null) + { + return exon; + } + int expectedCdsLength = proteinLength * 3; + int exonLength = MappingUtils.getLength(Arrays.asList(exon)); + + /* + * if exon length matches protein, or is shorter, or longer by the + * length of a stop codon (3 bases), then leave it unchanged + */ + if (expectedCdsLength >= exonLength + || expectedCdsLength == exonLength - 3) + { + return exon; + } + + int origxon[]; + int sxpos = -1; + int endxon = 0; + origxon = new int[exon.length]; + System.arraycopy(exon, 0, origxon, 0, exon.length); + int cdspos = 0; + for (int x = 0; x < exon.length; x += 2) + { + cdspos += Math.abs(exon[x + 1] - exon[x]) + 1; + if (expectedCdsLength <= cdspos) + { + // advanced beyond last codon. + sxpos = x; + if (expectedCdsLength != cdspos) + { + // System.err + // .println("Truncating final exon interval on region by " + // + (cdspos - cdslength)); + } + + /* + * shrink the final exon - reduce end position if forward + * strand, increase it if reverse + */ + if (exon[x + 1] >= exon[x]) + { + endxon = exon[x + 1] - cdspos + expectedCdsLength; + } + else + { + endxon = exon[x + 1] + cdspos - expectedCdsLength; + } + break; + } + } + + if (sxpos != -1) + { + // and trim the exon interval set if necessary + int[] nxon = new int[sxpos + 2]; + System.arraycopy(exon, 0, nxon, 0, sxpos + 2); + nxon[sxpos + 1] = endxon; // update the end boundary for the new exon + // set + exon = nxon; + } + return exon; + } + + /* + * values parsed from the data file + */ + protected String sourceDb; + + protected String accession; + + protected String version; + + protected String description; + + protected int length = 128; + + protected List dbrefs; + + protected String sequenceString; + + protected Map cds; + + /** + * Constructor + * + * @param fp + * @param sourceId + * @throws IOException + */ + public FlatFile(FileParse fp, String sourceId) throws IOException + { + super(false, fp); // don't parse immediately + this.sourceDb = sourceId; + dbrefs = new ArrayList<>(); + + /* + * using TreeMap gives CDS sequences in alphabetical, so readable, order + */ + cds = new TreeMap<>(String.CASE_INSENSITIVE_ORDER); + } + + /** + * Parses one (GenBank or EMBL format) CDS feature, saves the parsed data, and + * returns the next line + * + * @param location + * @return + * @throws IOException + */ + protected String parseCDSFeature(String location) throws IOException + { + String line; + + /* + * parse location, which can be over >1 line e.g. EAW51554 + */ + CdsData data = new CdsData(); + StringBuilder sb = new StringBuilder().append(location); + line = parseFeatureQualifier(sb, LOCATION); + data.cdsLocation = sb.toString(); + + while (line != null) + { + if (!isFeatureContinuationLine(line)) + { + // e.g. start of next feature "FT source..." + break; + } + + /* + * extract qualifier, e.g. FT /protein_id="CAA37824.1" + * - the value may extend over more than one line + * - if the value has enclosing quotes, these are removed + * - escaped double quotes ("") are reduced to a single character + */ + int slashPos = line.indexOf('/'); + if (slashPos == -1) + { + Cache.log.error("Unexpected EMBL line ignored: " + line); + line = nextLine(); + continue; + } + int eqPos = line.indexOf('=', slashPos + 1); + if (eqPos == -1) + { + // can happen, e.g. /ribosomal_slippage + line = nextLine(); + continue; + } + String qualifier = line.substring(slashPos + 1, eqPos); + String value = line.substring(eqPos + 1); + value = removeQuotes(value); + sb = new StringBuilder().append(value); + line = parseFeatureQualifier(sb, qualifier); + String featureValue = sb.toString(); + + if ("protein_id".equals(qualifier)) + { + data.proteinId = featureValue; + } + else if ("codon_start".equals(qualifier)) + { + try + { + data.codonStart = Integer.parseInt(featureValue.trim()); + } catch (NumberFormatException e) + { + Cache.log.error("Invalid codon_start in XML for " + this.accession + + ": " + e.getMessage()); + } + } + else if ("db_xref".equals(qualifier)) + { + String[] parts = featureValue.split(":"); + if (parts.length == 2) + { + String db = parts[0].trim(); + db = DBRefUtils.getCanonicalName(db); + DBRefEntry dbref = new DBRefEntry(db, "0", parts[1].trim()); + data.xrefs.add(dbref); + } + } + else if ("product".equals(qualifier)) + { + data.proteinName = featureValue; + } + else if ("translation".equals(qualifier)) + { + data.translation = featureValue; + } + else if (!"".equals(featureValue)) + { + // throw anything else into the additional properties hash + data.cdsProps.put(qualifier, featureValue); + } + } + + if (data.proteinId != null) + { + this.cds.put(data.proteinId, data); + } + else + { + Cache.log.error("Ignoring CDS feature with no protein_id for " + + sourceDb + ":" + accession); + } + + return line; + } + + protected abstract boolean isFeatureContinuationLine(String line); + + /** + * Output (print) is not (yet) implemented for flat file format + */ + @Override + public String print(SequenceI[] seqs, boolean jvsuffix) + { + return null; + } + + /** + * Constructs and saves the sequence from parsed components + */ + protected void buildSequence() + { + if (this.accession == null || this.sequenceString == null) + { + Cache.log.error("Failed to parse data from EMBL"); + return; + } + + String name = this.accession; + if (this.sourceDb != null) + { + name = this.sourceDb + "|" + name; + } + SequenceI seq = new Sequence(name, this.sequenceString); + seq.setDescription(this.description); + + /* + * add a DBRef to itself + */ + DBRefEntry selfRef = new DBRefEntry(sourceDb, version, accession); + int[] startEnd = new int[] { 1, seq.getLength() }; + selfRef.setMap(new Mapping(null, startEnd, startEnd, 1, 1)); + seq.addDBRef(selfRef); + + for (DBRefEntry dbref : this.dbrefs) + { + seq.addDBRef(dbref); + } + + processCDSFeatures(seq); + + seq.deriveSequence(); + + addSequence(seq); + } + + /** + * Process the CDS features, including generation of cross-references and + * mappings to the protein products (translation) + * + * @param seq + */ + protected void processCDSFeatures(SequenceI seq) + { + /* + * record protein products found to avoid duplication i.e. >1 CDS with + * the same /protein_id [though not sure I can find an example of this] + */ + Map proteins = new HashMap<>(); + for (CdsData data : cds.values()) + { + processCDSFeature(seq, data, proteins); + } + } + + /** + * Processes data for one parsed CDS feature to + *
    + *
  • create a protein product sequence for the translation
  • + *
  • create a cross-reference to protein with mapping from dna
  • + *
  • add a CDS feature to the sequence for each CDS start-end range
  • + *
  • add any CDS dbrefs to the sequence and to the protein product
  • + *
+ * + * @param SequenceI + * dna + * @param proteins + * map of protein products so far derived from CDS data + */ + void processCDSFeature(SequenceI dna, CdsData data, + Map proteins) + { + /* + * parse location into a list of [start, end, start, end] positions + */ + int[] exons = getCdsRanges(this.accession, data.cdsLocation); + + MapList maplist = buildMappingToProtein(dna, exons, data); + + int exonNumber = 0; + + for (int xint = 0; exons != null && xint < exons.length - 1; xint += 2) + { + int exonStart = exons[xint]; + int exonEnd = exons[xint + 1]; + int begin = Math.min(exonStart, exonEnd); + int end = Math.max(exonStart, exonEnd); + exonNumber++; + String desc = String.format("Exon %d for protein EMBLCDS:%s", + exonNumber, data.proteinId); + + SequenceFeature sf = new SequenceFeature("CDS", desc, begin, end, + this.sourceDb); + for (Entry val : data.cdsProps.entrySet()) + { + sf.setValue(val.getKey(), val.getValue()); + } + + sf.setEnaLocation(data.cdsLocation); + boolean forwardStrand = exonStart <= exonEnd; + sf.setStrand(forwardStrand ? "+" : "-"); + sf.setPhase(String.valueOf(data.codonStart - 1)); + sf.setValue(FeatureProperties.EXONPOS, exonNumber); + sf.setValue(FeatureProperties.EXONPRODUCT, data.proteinName); + + dna.addSequenceFeature(sf); + } + + boolean hasUniprotDbref = false; + for (DBRefEntry xref : data.xrefs) + { + dna.addDBRef(xref); + if (xref.getSource().equals(DBRefSource.UNIPROT)) + { + /* + * construct (or find) the sequence for (data.protein_id, data.translation) + */ + SequenceI protein = buildProteinProduct(dna, xref, data, proteins); + Mapping map = new Mapping(protein, maplist); + map.setMappedFromId(data.proteinId); + xref.setMap(map); + + /* + * add DBRefs with mappings from dna to protein and the inverse + */ + DBRefEntry db1 = new DBRefEntry(sourceDb, version, accession); + db1.setMap(new Mapping(dna, maplist.getInverse())); + protein.addDBRef(db1); + + hasUniprotDbref = true; + } + } + + /* + * if we have a product (translation) but no explicit Uniprot dbref + * (example: EMBL M19487 protein_id AAB02592.1) + * then construct mappings to an assumed EMBLCDSPROTEIN accession + */ + if (!hasUniprotDbref) + { + SequenceI protein = proteins.get(data.proteinId); + if (protein == null) + { + protein = new Sequence(data.proteinId, data.translation); + protein.setDescription(data.proteinName); + proteins.put(data.proteinId, protein); + } + // assuming CDSPROTEIN sequence version = dna version (?!) + DBRefEntry db1 = new DBRefEntry(DBRefSource.EMBLCDSProduct, + this.version, data.proteinId); + protein.addDBRef(db1); + + DBRefEntry dnaToEmblProteinRef = new DBRefEntry( + DBRefSource.EMBLCDSProduct, this.version, data.proteinId); + Mapping map = new Mapping(protein, maplist); + map.setMappedFromId(data.proteinId); + dnaToEmblProteinRef.setMap(map); + dna.addDBRef(dnaToEmblProteinRef); + } + + /* + * comment brought forward from EmblXmlSource, lines 447-451: + * TODO: if retrieved from EMBLCDS, add a DBRef back to the parent EMBL + * sequence with the exon map; if given a dataset reference, search + * dataset for parent EMBL sequence if it exists and set its map; + * make a new feature annotating the coding contig + */ + } + + /** + * Computes a mapping from CDS positions in DNA sequence to protein product + * positions, with allowance for stop codon or incomplete start codon + * + * @param dna + * @param exons + * @param data + * @return + */ + MapList buildMappingToProtein(final SequenceI dna, final int[] exons, + final CdsData data) + { + MapList dnaToProteinMapping = null; + int peptideLength = data.translation.length(); + + int[] proteinRange = new int[] { 1, peptideLength }; + if (exons != null && exons.length > 0) + { + /* + * We were able to parse 'location'; do a final + * product length truncation check + */ + int[] cdsRanges = adjustForProteinLength(peptideLength, exons); + dnaToProteinMapping = new MapList(cdsRanges, proteinRange, 3, 1); + } + else + { + /* + * workaround until we handle all 'location' formats fully + * e.g. X53828.1:60..1058 or <123..>289 + */ + Cache.log.error(String.format( + "Implementation Notice: EMBLCDS location '%s'not properly supported yet" + + " - Making up the CDNA region of (%s:%s)... may be incorrect", + data.cdsLocation, sourceDb, this.accession)); + + int completeCodonsLength = 1 - data.codonStart + dna.getLength(); + int mappedDnaEnd = dna.getEnd(); + if (peptideLength * 3 == completeCodonsLength) + { + // this might occur for CDS sequences where no features are marked + Cache.log.warn("Assuming no stop codon at end of cDNA fragment"); + mappedDnaEnd = dna.getEnd(); + } + else if ((peptideLength + 1) * 3 == completeCodonsLength) + { + Cache.log.warn("Assuming stop codon at end of cDNA fragment"); + mappedDnaEnd = dna.getEnd() - 3; + } + + if (mappedDnaEnd != -1) + { + int[] cdsRanges = new int[] { + dna.getStart() + (data.codonStart - 1), mappedDnaEnd }; + dnaToProteinMapping = new MapList(cdsRanges, proteinRange, 3, 1); + } + } + + return dnaToProteinMapping; + } + + /** + * Constructs a sequence for the protein product for the CDS data (if there is + * one), and dbrefs with mappings from CDS to protein and the reverse + * + * @param dna + * @param xref + * @param data + * @param proteins + * @return + */ + SequenceI buildProteinProduct(SequenceI dna, DBRefEntry xref, + CdsData data, Map proteins) + { + /* + * check we have some data to work with + */ + if (data.proteinId == null || data.translation == null) + { + return null; + } + + /* + * Construct the protein sequence (if not already seen) + */ + String proteinSeqName = xref.getSource() + "|" + xref.getAccessionId(); + SequenceI protein = proteins.get(proteinSeqName); + if (protein == null) + { + protein = new Sequence(proteinSeqName, data.translation, 1, + data.translation.length()); + protein.setDescription(data.proteinName != null ? data.proteinName + : "Protein Product from " + sourceDb); + proteins.put(proteinSeqName, protein); + } + + return protein; + } + + /** + * Returns the CDS location as a single array of [start, end, start, end...] + * positions. If on the reverse strand, these will be in descending order. + * + * @param accession + * @param location + * @return + */ + protected int[] getCdsRanges(String accession, String location) + { + if (location == null) + { + return new int[] {}; + } + + try + { + List ranges = DnaUtils.parseLocation(location); + return MappingUtils.rangeListToArray(ranges); + } catch (ParseException e) + { + Cache.log.warn( + String.format("Not parsing inexact CDS location %s in ENA %s", + location, accession)); + return new int[] {}; + } + } + + /** + * Reads the value of a feature (FT) qualifier from one or more lines of the + * file, and returns the next line after that. Values are appended to the + * string buffer, which should be already primed with the value read from the + * first line for the qualifier (with any leading double quote removed). + * Enclosing double quotes are removed, and escaped (repeated) double quotes + * reduced to one only. For example for + * + *
+   * FT      /note="gene_id=hCG28070.3 
+   * FT      ""foobar"" isoform=CRA_b"
+   * the returned value is
+   * gene_id=hCG28070.3 "foobar" isoform=CRA_b
+   * 
+ * + * Note the side-effect of this method, to advance data reading to the next + * line after the feature qualifier (which could be another qualifier, a + * different feature, a non-feature line, or null at end of file). + * + * @param sb + * a string buffer primed with the first line of the value + * @param qualifierName + * @return + * @throws IOException + */ + String parseFeatureQualifier(StringBuilder sb, String qualifierName) + throws IOException + { + String line; + while ((line = nextLine()) != null) + { + if (!isFeatureContinuationLine(line)) + { + break; // reached next feature or other input line + } + String[] tokens = line.split(WHITESPACE); + if (tokens.length < 2) + { + Cache.log.error("Ignoring bad EMBL line for " + this.accession + + ": " + line); + break; + } + if (tokens[1].startsWith("/")) + { + break; // next feature qualifier + } + + /* + * heuristic rule: most multi-line value (e.g. /product) are text, + * so add a space for word boundary at a new line; not for translation + */ + if (!"translation".equals(qualifierName) + && !LOCATION.equals(qualifierName)) + { + sb.append(" "); + } + + /* + * remove trailing " and unescape doubled "" + */ + String data = removeQuotes(tokens[1]); + sb.append(data); + } + + return line; + } + + /** + * Reads and saves the sequence, read from the lines following the ORIGIN + * (GenBank) or SQ (EMBL) line. Whitespace and position counters are + * discarded. Returns the next line following the sequence data (the next line + * that doesn't start with whitespace). + * + * @throws IOException + */ + protected String parseSequence() throws IOException + { + StringBuilder sb = new StringBuilder(this.length); + String line = nextLine(); + while (line != null && line.startsWith(" ")) + { + line = line.trim(); + String[] blocks = line.split(WHITESPACE); + + /* + * the first or last block on each line might be a position count - omit + */ + for (int i = 0; i < blocks.length; i++) + { + try + { + Long.parseLong(blocks[i]); + // position counter - ignore it + } catch (NumberFormatException e) + { + // sequence data - append it + sb.append(blocks[i]); + } + } + line = nextLine(); + } + this.sequenceString = sb.toString(); + + return line; + } + + /** + * Processes a feature line. If it declares a feature type of interest + * (currently, only CDS is processed), processes all of the associated lines + * (feature qualifiers), and returns the next line after that, otherwise + * simply returns the next line. + * + * @param line + * the first line for the feature (with initial FT omitted for EMBL + * format) + * @return + * @throws IOException + */ + protected String parseFeature(String line) throws IOException + { + String[] tokens = line.trim().split(WHITESPACE); + if (tokens.length < 2 || !"CDS".equals(tokens[0])) + { + return nextLine(); + } + + return parseCDSFeature(tokens[1]); + } +} + +/** + * A data bean class to hold values parsed from one CDS Feature + */ +class CdsData +{ + String translation; // from /translation qualifier + + String cdsLocation; // the raw value e.g. join(1..1234,2012..2837) + + int codonStart = 1; // from /codon_start qualifier + + String proteinName; // from /product qualifier; used for protein description + + String proteinId; // from /protein_id qualifier + + List xrefs = new ArrayList<>(); // from /db_xref qualifiers + + Map cdsProps = new Hashtable<>(); // other qualifiers +} diff --git a/src/jalview/io/GenBankFile.java b/src/jalview/io/GenBankFile.java new file mode 100644 index 0000000..7988764 --- /dev/null +++ b/src/jalview/io/GenBankFile.java @@ -0,0 +1,207 @@ +package jalview.io; + +import java.io.IOException; + +import jalview.bin.Cache; + +/** + * A class that provides selective parsing of the GenBank flatfile format. + *

+ * The initial implementation is limited to extracting fields used by Jalview + * after fetching an EMBL or EMBLCDS entry: + * + *

+ * accession, version, sequence, xref
+ * and (for CDS feature) location, protein_id, product, codon_start, translation
+ * 
+ * + * @author gmcarstairs + * @see https://www.ncbi.nlm.nih.gov/Sitemap/samplerecord.html + */ +public class GenBankFile extends FlatFile +{ + private static final String DEFINITION = "DEFINITION"; + + /** + * Constructor given a data source and the id of the source database + * + * @param fp + * @param sourceId + * @throws IOException + */ + public GenBankFile(FileParse fp, String sourceId) throws IOException + { + super(fp, sourceId); + } + + /** + * Parses the flatfile, and if successful, saves as an annotated sequence + * which may be retrieved by calling {@code getSequence()} + * + * @throws IOException + * @see https://www.ncbi.nlm.nih.gov/Sitemap/samplerecord.html + */ + @Override + public void parse() throws IOException + { + String line = nextLine(); + while (line != null) + { + if (line.startsWith(DEFINITION)) + { + line = parseDefinition(line); + } + else if (line.startsWith("ACCESSION")) + { + this.accession = line.split(WHITESPACE)[1]; + line = nextLine(); + } + else if (line.startsWith("VERSION")) + { + line = parseVersion(line); + } + else if (line.startsWith("ORIGIN")) + { + line = parseSequence(); + } + else if (line.startsWith("FEATURES")) + { + line = nextLine(); + while (line.startsWith(" ")) + { + line = parseFeature(line); + } + } + else + { + line = nextLine(); + } + } + buildSequence(); + } + + /** + * Extracts and saves the primary accession and version (SV value) from an ID + * line, or null if not found. Returns the next line after the one processed. + * + * @param line + * @throws IOException + */ + String parseLocus(String line) throws IOException + { + String[] tokens = line.substring(2).split(";"); + + /* + * first is primary accession + */ + String token = tokens[0].trim(); + if (!token.isEmpty()) + { + this.accession = token; + } + + /* + * second token is 'SV versionNo' + */ + if (tokens.length > 1) + { + token = tokens[1].trim(); + if (token.startsWith("SV")) + { + String[] bits = token.trim().split(WHITESPACE); + this.version = bits[bits.length - 1]; + } + } + + /* + * seventh token is 'length BP' + */ + if (tokens.length > 6) + { + token = tokens[6].trim(); + String[] bits = token.trim().split(WHITESPACE); + try + { + this.length = Integer.valueOf(bits[0]); + } catch (NumberFormatException e) + { + Cache.log.error("bad length read in flatfile, line: " + line); + } + } + + return nextLine(); + } + + /** + * Reads sequence description from DEFINITION lines. Any trailing period is + * discarded. Returns the next line after the definition line(s). + * + * @param line + * @return + * @throws IOException + */ + String parseDefinition(String line) throws IOException + { + String desc = line.substring(DEFINITION.length()).trim(); + if (desc.endsWith(".")) + { + desc = desc.substring(0, desc.length() - 1); + } + + /* + * pass over any additional DE lines + */ + while ((line = nextLine()) != null) + { + if (line.startsWith(" ")) + { + // definition continuation line + desc += line.trim(); + } + else + { + break; + } + } + this.description = desc; + + return line; + } + + /** + * Parses the VERSION line e.g. + * + *
+   * VERSION     X81322.1
+   * 
+ * + * and returns the next line + * + * @param line + * @throws IOException + */ + String parseVersion(String line) throws IOException + { + /* + * extract version part of . + * https://www.ncbi.nlm.nih.gov/Sitemap/samplerecord.html#VersionB + */ + String[] tokens = line.split(WHITESPACE); + if (tokens.length > 1) + { + tokens = tokens[1].split("\\."); + if (tokens.length > 1) + { + this.version = tokens[1]; + } + } + + return nextLine(); + } + + @Override + protected boolean isFeatureContinuationLine(String line) + { + return line.startsWith(" "); // 6 spaces + } +} diff --git a/src/jalview/util/DnaUtils.java b/src/jalview/util/DnaUtils.java index 284ec10..654b03a 100644 --- a/src/jalview/util/DnaUtils.java +++ b/src/jalview/util/DnaUtils.java @@ -47,6 +47,7 @@ public class DnaUtils public static List parseLocation(String location) throws ParseException { + location = location.trim(); // failsafe for untidy input data if (location.startsWith("join(")) { return parseJoin(location); diff --git a/test/jalview/io/EmblFlatFileTest.java b/test/jalview/io/EmblFlatFileTest.java index b04cddd..c893c09 100644 --- a/test/jalview/io/EmblFlatFileTest.java +++ b/test/jalview/io/EmblFlatFileTest.java @@ -3,9 +3,9 @@ package jalview.io; import static org.testng.Assert.assertEquals; import static org.testng.Assert.assertTrue; import static org.testng.AssertJUnit.assertNotNull; +import static org.testng.AssertJUnit.assertNull; import static org.testng.AssertJUnit.assertSame; import static org.testng.AssertJUnit.fail; -import static org.testng.AssertJUnit.assertNull; import java.io.File; import java.io.IOException; @@ -14,8 +14,10 @@ import java.util.Arrays; import java.util.List; import java.util.Set; +import org.testng.annotations.BeforeClass; import org.testng.annotations.Test; +import jalview.bin.Cache; import jalview.datamodel.DBRefEntry; import jalview.datamodel.Mapping; import jalview.datamodel.Sequence.DBModList; @@ -26,6 +28,12 @@ import jalview.util.MapList; public class EmblFlatFileTest { + @BeforeClass(alwaysRun = true) + public void setUp() + { + Cache.initLogger(); + } + /** * A fairly tough test, using J03321 (circular DNA), which has 8 CDS features, * one of them reverse strand diff --git a/test/jalview/io/GenBankFileTest.java b/test/jalview/io/GenBankFileTest.java new file mode 100644 index 0000000..d800b1d --- /dev/null +++ b/test/jalview/io/GenBankFileTest.java @@ -0,0 +1,204 @@ +package jalview.io; + +import static org.testng.Assert.assertEquals; +import static org.testng.Assert.assertTrue; +import static org.testng.AssertJUnit.assertNull; + +import java.io.File; +import java.io.IOException; +import java.net.MalformedURLException; +import java.util.Arrays; +import java.util.List; +import java.util.Set; + +import org.testng.annotations.BeforeClass; +import org.testng.annotations.Test; + +import jalview.bin.Cache; +import jalview.datamodel.DBRefEntry; +import jalview.datamodel.Mapping; +import jalview.datamodel.SequenceFeature; +import jalview.datamodel.SequenceI; +import jalview.datamodel.features.SequenceFeatures; +import jalview.util.MapList; + +public class GenBankFileTest +{ + @BeforeClass(alwaysRun = true) + public void setUp() + { + Cache.initLogger(); + } + + /** + * A fairly tough test, using J03321 (circular DNA), which has 8 CDS features, + * one of them reverse strand + * + * @throws MalformedURLException + * @throws IOException + */ + @Test(groups = "Functional") + public void testParse() throws MalformedURLException, IOException + { + File dataFile = new File("test/jalview/io/J03321.gb"); + FileParse fp = new FileParse(dataFile.getAbsolutePath(), + DataSourceType.FILE); + FlatFile parser = new GenBankFile(fp, "GenBankTest"); + parser.parse(); + List seqs = parser.getSeqs(); + + assertEquals(seqs.size(), 1); + SequenceI seq = seqs.get(0); + assertEquals(seq.getName(), "GenBankTest|J03321"); + assertEquals(seq.getLength(), 7502); + assertEquals(seq.getDescription(), + "Chlamydia trachomatis plasmid pCHL1, complete sequence"); + + /* + * should be 9 CDS features (one is a 'join' of two exons) + */ + Set featureTypes = seq.getFeatures().getFeatureTypes(); + assertEquals(featureTypes.size(), 1); + assertTrue(featureTypes.contains("CDS")); + + /* + * inspect some features (sorted just for convenience of test assertions) + */ + List features = seq.getFeatures() + .getAllFeatures("CDS"); + SequenceFeatures.sortFeatures(features, true); + assertEquals(features.size(), 9); + + SequenceFeature sf = features.get(0); + assertEquals(sf.getBegin(), 1); + assertEquals(sf.getEnd(), 437); + assertEquals(sf.getDescription(), + "Exon 2 for protein EMBLCDS:AAA91567.1"); + assertEquals(sf.getFeatureGroup(), "GenBankTest"); + assertEquals(sf.getEnaLocation(), "join(7022..7502,1..437)"); + assertEquals(sf.getPhase(), "0"); + assertEquals(sf.getStrand(), 1); + assertEquals(sf.getValue("note"), "pGP7-D"); + // this is the second exon of circular CDS! + assertEquals(sf.getValue("exon number"), 2); + assertEquals(sf.getValue("product"), "hypothetical protein"); + assertEquals(sf.getValue("transl_table"), "11"); + + sf = features.get(1); + assertEquals(sf.getBegin(), 488); + assertEquals(sf.getEnd(), 1480); + assertEquals(sf.getDescription(), + "Exon 1 for protein EMBLCDS:AAA91568.1"); + assertEquals(sf.getFeatureGroup(), "GenBankTest"); + assertEquals(sf.getEnaLocation(), "complement(488..1480)"); + assertEquals(sf.getPhase(), "0"); + assertEquals(sf.getStrand(), -1); // reverse strand! + assertEquals(sf.getValue("note"), "pGP8-D"); + assertEquals(sf.getValue("exon number"), 1); + assertEquals(sf.getValue("product"), "hypothetical protein"); + + sf = features.get(7); + assertEquals(sf.getBegin(), 6045); + assertEquals(sf.getEnd(), 6788); + assertEquals(sf.getDescription(), + "Exon 1 for protein EMBLCDS:AAA91574.1"); + assertEquals(sf.getFeatureGroup(), "GenBankTest"); + assertEquals(sf.getEnaLocation(), "6045..6788"); + assertEquals(sf.getPhase(), "0"); + assertEquals(sf.getStrand(), 1); + assertEquals(sf.getValue("note"), "pGP6-D (gtg start codon)"); + assertEquals(sf.getValue("exon number"), 1); + assertEquals(sf.getValue("product"), "hypothetical protein"); + + /* + * CDS at 7022-7502 is the first exon of the circular CDS + */ + sf = features.get(8); + assertEquals(sf.getBegin(), 7022); + assertEquals(sf.getEnd(), 7502); + assertEquals(sf.getDescription(), + "Exon 1 for protein EMBLCDS:AAA91567.1"); + assertEquals(sf.getFeatureGroup(), "GenBankTest"); + assertEquals(sf.getEnaLocation(), "join(7022..7502,1..437)"); + assertEquals(sf.getPhase(), "0"); + assertEquals(sf.getStrand(), 1); + assertEquals(sf.getValue("note"), "pGP7-D"); + assertEquals(sf.getValue("exon number"), 1); + assertEquals(sf.getValue("product"), "hypothetical protein"); + + /* + * GenBank doesn't declare accession or CDS xrefs; + * dbrefs are added by Jalview for + * xref to self : 1 + * protein products: 8 + */ + List dbrefs = Arrays.asList(seq.getDBRefs()); + + assertEquals(dbrefs.size(), 9); + // xref to 'self': + DBRefEntry selfRef = new DBRefEntry("GENBANKTEST", "1", "J03321"); + int[] range = new int[] { 1, seq.getLength() }; + selfRef.setMap(new Mapping(null, range, range, 1, 1)); + assertTrue(dbrefs.contains(selfRef)); + + /* + * dna should have dbref to itself, and to EMBLCDSPROTEIN + * for each /protein_id (synthesized as no UNIPROT xref) + */ + // TODO check if we should synthesize EMBLCDSPROTEIN dbrefs + DBRefEntry dbref = dbrefs.get(0); + assertEquals(dbref.getSource(), "GENBANKTEST"); + assertEquals(dbref.getAccessionId(), "J03321"); + Mapping mapping = dbref.getMap(); + assertNull(mapping.getTo()); + MapList map = mapping.getMap(); + assertEquals(map.getFromLowest(), 1); + assertEquals(map.getFromHighest(), 7502); + assertEquals(map.getToLowest(), 1); + assertEquals(map.getToHighest(), 7502); + assertEquals(map.getFromRatio(), 1); + assertEquals(map.getToRatio(), 1); + + // dbref to inferred EMBLCDSPROTEIN for first CDS + dbref = dbrefs.get(1); + assertEquals(dbref.getSource(), "EMBLCDSPROTEIN"); + assertEquals(dbref.getAccessionId(), "AAA91567.1"); + mapping = dbref.getMap(); + SequenceI mapTo = mapping.getTo(); + assertEquals(mapTo.getName(), "AAA91567.1"); + // the /product qualifier transfers to protein product description + assertEquals(mapTo.getDescription(), "hypothetical protein"); + String seqString = mapTo.getSequenceAsString(); + assertEquals(seqString.length(), 305); + assertTrue(seqString.startsWith("MGSMAF")); + assertTrue(seqString.endsWith("QTPTIL")); + map = mapping.getMap(); + assertEquals(map.getFromLowest(), 1); + assertEquals(map.getFromHighest(), 7502); + assertEquals(map.getToLowest(), 1); + assertEquals(map.getToHighest(), 305); + assertEquals(map.getFromRatio(), 3); + assertEquals(map.getToRatio(), 1); + + // dbref to inferred EMBLCDSPROTEIN for last CDS + dbref = dbrefs.get(8); + assertEquals(dbref.getSource(), "EMBLCDSPROTEIN"); + assertEquals(dbref.getAccessionId(), "AAA91574.1"); + mapping = dbref.getMap(); + mapTo = mapping.getTo(); + assertEquals(mapTo.getName(), "AAA91574.1"); + // the /product qualifier transfers to protein product description + assertEquals(mapTo.getDescription(), "hypothetical protein"); + seqString = mapTo.getSequenceAsString(); + assertEquals(seqString.length(), 247); + assertTrue(seqString.startsWith("MNKLK")); + assertTrue(seqString.endsWith("FKQKS")); + map = mapping.getMap(); + assertEquals(map.getFromLowest(), 6045); + assertEquals(map.getFromHighest(), 6788); + assertEquals(map.getToLowest(), 1); + assertEquals(map.getToHighest(), 247); + assertEquals(map.getFromRatio(), 3); + assertEquals(map.getToRatio(), 1); + } +} diff --git a/test/jalview/io/J03321.embl.txt b/test/jalview/io/J03321.embl.txt index 92065b9..555c2b5 100644 --- a/test/jalview/io/J03321.embl.txt +++ b/test/jalview/io/J03321.embl.txt @@ -48,7 +48,9 @@ FT /serotype="D" FT /mol_type="genomic DNA" FT /isolation_source="trachoma" FT /db_xref="taxon:813" -FT CDS join(7022..7502,1..437) +XX next line artificially split for test purposes! +FT CDS join(7022..7502, +FT 1..437) FT /codon_start=1 FT /transl_table=11 FT /product="hypothetical protein" diff --git a/test/jalview/io/J03321.gb b/test/jalview/io/J03321.gb new file mode 100644 index 0000000..99729e4 --- /dev/null +++ b/test/jalview/io/J03321.gb @@ -0,0 +1,258 @@ +LOCUS CH1L1CG 7502 bp DNA circular BCT 06-APR-2020 +DEFINITION Chlamydia trachomatis plasmid pCHL1, complete sequence. +ACCESSION J03321 +VERSION J03321.1 +DBLINK BioSample: SAMN14225621 +KEYWORDS . +SOURCE Chlamydia trachomatis + ORGANISM Chlamydia trachomatis + Bacteria; Chlamydiae; Chlamydiales; Chlamydiaceae; + Chlamydia/Chlamydophila group; Chlamydia. +REFERENCE 1 (bases 1 to 7502) + AUTHORS Comanducci,M., Ricci,S., Cevenini,R. and Ratti,G. + TITLE Diversity of the Chlamydia trachomatis common plasmid in biovars + with different pathogenicity + JOURNAL Plasmid 23 (2), 149-154 (1990) + PUBMED 2194229 +REFERENCE 2 (bases 1 to 7502) + AUTHORS Comanducci,M., Ricci,S., Cevenini,R. and Ratti,G. + TITLE Direct Submission + JOURNAL Submitted (23-JUN-2010) Sclavo Research Centre, Siena, Italy +COMMENT Draft entry and computer-readable sequence kindly submitted by + G.Ratti, 28-MAR-1990. + ! CDS location split below (and this line added), for Jalview test purposes ! +FEATURES Location/Qualifiers + source 1..7502 + /organism="Chlamydia trachomatis" + /mol_type="genomic DNA" + /serotype="D" + /isolate="G0/86" + /isolation_source="trachoma" + /db_xref="taxon:813" + /plasmid="pCHL1" + CDS join(7022..7502, + 1..437) + /note="pGP7-D" + /codon_start=1 + /transl_table=11 + /product="hypothetical protein" + /protein_id="AAA91567.1" + /translation="MGSMAFHKSRLFLTFGDASEIWLSTLSYLTRKNYASGINFLVSL + EILDLSETLIKAISLDHSESLFKIKSLDVFNGKVVSEASKQARAACYISFTKFLYRLT + KGYIKPAIPLKDFGNTTFFKIRDKIKTESISKQEWTVFFEALRIVNYRDYLIGKLIVQ + GIRKLDEILSLRTDDLFFASNQISFRIKKRQNKETKILITFPISLMEELQKYTCGRNG + RVFVSKIGIPVTTSQVAHNFRLAEFHSAMKIKITPRVLRASALIHLKQIGLKDEEIMR + ISCLSSRQSVCSYCSGEEVIPLVQTPTIL" + CDS complement(488..1480) + /note="pGP8-D" + /codon_start=1 + /transl_table=11 + /product="hypothetical protein" + /protein_id="AAA91568.1" + /translation="MGKGILSLQQEMSLEYSEKSYQEVLKIRQESYWKRMKSFSLFEV + IMHWTASLNKHTCRSYRGSFLSLEKIGLLSLDMNLQEFSLLNHNLILDAIKKVSSAKT + SWTEGTKQVRAASYISLTRFLNRMTQGIVAIAQPSKQENSRTFFKTREIVKTDAMNSL + QTASFLKELKKINARDWLIAQTMLQGGKRSSEVLSLEISQICFQQATISFSQLKNRQT + EKRIIITYPQKFMHFLQEYIGQRRGFVFVTRSGKMVGLRQIARTFSQAGLQAAIPFKI + TPHVLRATAVTEYKRLGCSDSDIMKVTGHATAKMIFAYDKSSREDNASKKMALI" + CDS 1579..2934 + /note="pGP1-D" + /codon_start=1 + /transl_table=11 + /product="hypothetical protein" + /protein_id="AAA91569.1" + /translation="MKTRSEIENRMQDIEYALLGKALIFEDSTEYILRQLANYEFKCS + HHKNIFIVFKHLKDNGLPITVDSAWEELLRRRIKDMDKSYLGLMLHDALSNDKLRSVS + HTVFLDDLSVCSAEENLSNFIFRSFNEYNENPLRRSPFLLLERIKGRLDSAIAKTFSI + RSARGRSIYDIFSQSEIGVLARIKKRRVAFSENQNSFFDGFPTGYKDIDDKGVILAKG + NFVIIAARPSIGKTALAIDMAINLAVTQQRRVGFLSLEMSAGQIVERIIANLTGISGE + KLQRGDLSKEELFRVEEAGETVRESHFYICSDSQYKLNLIANQIRLLRKEDRVDVIFI + DYLQLINSSVGENRQNEIADISRTLRGLASELNIPIVCLSQLSRKVEDRANKVPMLSD + LRDSGQIEQDADVILFINRKESSSNCEITVGKNRHGSVFSSVLHFDPKISKFSAIKKV + W" + CDS 2928..3992 + /note="pGP2-D" + /codon_start=1 + /transl_table=11 + /product="hypothetical protein" + /protein_id="AAA91570.1" + /translation="MVNYSNCHFIKSPIHLENQKFGRRPGQSIKISPKLAQNGMVEVI + GLDFLSSHYHALAAIQRLLTATNYKGNTKGVVLSRESNSFQFEGWIPRIRFTKTEFLE + AYGVKRYKTSRNKYEFSGKEAETALEALYHLGHQPFLIVATRTRWTNGTQIVDRYQTL + SPIIRIYEGWEGLTDEENIDIDLTPFNSPPTRKHKGFVVEPCPILVDQIESYFVIKPA + NVYQEIKMRFPNASKYAYTFIDWVITAAAKKRRKLTKDNSWPENLLLNVNVKSLAYIL + RMNRYICTRNWKKIELAIDKCIEIAIQLGWLSRRKRIEFLDSSKLSKKEILYLNKERF + EEITKKSKEQMEQLEQESIN" + CDS 4054..4848 + /note="pGP3-D" + /codon_start=1 + /transl_table=11 + /product="hypothetical protein" + /protein_id="AAA91571.1" + /translation="MGNSGFYLYNTENCVFADNIKVGQMTEPLKDQQIILGTTSTPVA + AKMTASDGISLTVSNNSSTNASITIGLDAEKAYQLILEKLGDQILDGIADTIVDSTVQ + DILDKIKTDPSLGLLKAFNNFPITNKIQCNGLFTPSNIETLLGGTEIGKFTVTPKSSG + SMFLVSADIIASRMEGGVVLALVREGDSKPCAISYGYSSGIPNLCSLRTSITNTGLTP + TTYSLRVGGLESGVVWVNALSNGNDILGITNTSNVSFLEVIPQTNA" + CDS 4918..5226 + /note="pGP4-D" + /codon_start=1 + /transl_table=11 + /product="hypothetical protein" + /protein_id="AAA91572.1" + /translation="MQNKRKVRDDFIKIVKDVKKDFPELDLKIRVNKEKVTFLNSPLE + LYHKSVSLILGLLQQIENSLGLFPDSPVLEKLEDNSLKLKKALIMLILSRKDMFSKAE + " + CDS 5317..6048 + /note="pGP5-D (gtg start codon)" + /codon_start=1 + /transl_table=11 + /product="hypothetical protein" + /protein_id="AAA91573.1" + /translation="MGCNLAQFLGKKVLLADLDPQSNLSSGLGASVRSDQKGLHDIVY + TSNDLKSIICETKKDSVDLIPASFSSEQFRELDIHRGPSNNLKLFLNEYCAPFYDICI + IDTPPSLGGLTKEAFVAGDKLIACLTPEPFSILGLQKIREFLSSVGKPEEEHILGIAL + SFWDDRNSTNQMYIDIIESIYKNKLFSTKIRRDISLSRSLLKEDSVANVYPNSRAAED + ILKLTHEIANILHIEYERDYSQRTT" + CDS 6045..6788 + /note="pGP6-D (gtg start codon)" + /codon_start=1 + /transl_table=11 + /product="hypothetical protein" + /protein_id="AAA91574.1" + /translation="MNKLKKEADVFFKKNQTAASLDFKKTLPSIELFSATLNSEESQS + LDRLFLSESQNYSDEEFYQEDILAVKLLTGQIKSIQKQHVLLLGEKIYNARKILSKDH + FSSTTFSSWIELVFRTKSSAYNALAYYELFINLPNQTLQKEFQSIPYKSAYILAARKG + DLKTKVDVIGKVCGMSNSSAIRVLDQFLPSSRNKDVRETIDKSDSEKNRQLSDFLIEI + LRIMCSGVSLSSYNENLLQQLFELFKQKS" + repeat_region 6857..6945 + /note="four tandem 22bp repeats" +ORIGIN + 1 ggatccgtaa gttagacgaa attttgtctt tgcgcacaga cgatctattt tttgcatcca + 61 atcagatttc ctttcgcatt aaaaaaagac agaataaaga aaccaaaatt ctaatcacat + 121 ttcctatcag cttaatggaa gagttgcaaa aatacacttg tgggagaaat gggagagtat + 181 ttgtttctaa aatagggatt cctgtaacaa caagtcaggt tgcgcataat tttaggcttg + 241 cagagttcca tagtgctatg aaaataaaaa ttactcccag agtacttcgt gcaagcgctt + 301 tgattcattt aaagcaaata ggattaaaag atgaggaaat catgcgtatt tcctgtcttt + 361 catcgagaca aagtgtgtgt tcttattgtt ctggggaaga ggtaattcct ctagtacaaa + 421 cacccacaat attgtgatat aattaaaatt atattcatat tctgttgcca gaaaaaacac + 481 ctttaggcta tattagagcc atcttctttg aagcgttgtc ttctcgagaa gatttatcgt + 541 acgcaaatat catctttgcg gttgcgtgtc ctgtgacctt cattatgtcg gagtctgagc + 601 accctaggcg tttgtactcc gtcacagcgg ttgctcgaag cacgtgcggg gttattttaa + 661 aagggattgc agcttgtagt cctgcttgag agaacgtgcg ggcgatttgc cttaacccca + 721 ccatttttcc ggagcgagtt acgaagacaa aacctcttcg ttgaccgatg tactcttgta + 781 gaaagtgcat aaacttctga ggataagtta taataatcct cttttctgtc tgacggttct + 841 taagctggga gaaagaaatg gtagcttgtt ggaaacaaat ctgactaatc tccaagctta + 901 agacttcaga ggagcgttta cctccttgga gcattgtctg ggcgatcaac caatcccggg + 961 cattgatttt ttttagctct tttaggaagg atgctgtttg caaactgttc atcgcatccg + 1021 tttttactat ttccctggtt ttaaaaaatg ttcgactatt ttcttgttta gaaggttgcg + 1081 ctatagcgac tattccttga gtcatcctgt ttaggaatct tgttaaggaa atatagcttg + 1141 ctgctcgaac ttgtttagta ccttcggtcc aagaagtctt ggcagaggaa acttttttaa + 1201 tcgcatctag gattagatta tgatttaaaa gggaaaactc ttgcagattc atatccaagg + 1261 acaatagacc aatcttttct aaagacaaaa aagatcctcg atatgatcta caagtatgtt + 1321 tgttgagtga tgcggtccaa tgcataataa cttcgaataa ggagaagctt ttcatgcgtt + 1381 tccaatagga ttcttggcga atttttaaaa cttcctgata agacttttca ctatattcta + 1441 acgacatttc ttgctgcaaa gataaaatcc ctttacccat gaaatccctc gtgatataac + 1501 ctatccgtaa aatgtcctga ttagtgaaat aatcaggttg ttaacaggat agcacgctcg + 1561 gtattttttt atataaacat gaaaactcgt tccgaaatag aaaatcgcat gcaagatatc + 1621 gagtatgcgt tgttaggtaa agctctgata tttgaagact ctactgagta tattctgagg + 1681 cagcttgcta attatgagtt taagtgttct catcataaaa acatattcat agtatttaaa + 1741 cacttaaaag acaatggatt acctataact gtagactcgg cttgggaaga gcttttgcgg + 1801 cgtcgtatca aagatatgga caaatcgtat ctcgggttaa tgttgcatga tgctttatca + 1861 aatgacaagc ttagatccgt ttctcatacg gttttcctcg atgatttgag cgtgtgtagc + 1921 gctgaagaaa atttgagtaa tttcattttc cgctcgttta atgagtacaa tgaaaatcca + 1981 ttgcgtagat ctccgtttct attgcttgag cgtataaagg gaaggcttga tagtgctata + 2041 gcaaagactt tttctattcg cagcgctaga ggccggtcta tttatgatat attctcacag + 2101 tcagaaattg gagtgctggc tcgtataaaa aaaagacgag tagcgttctc tgagaatcaa + 2161 aattctttct ttgatggctt cccaacagga tacaaggata ttgatgataa aggagttatc + 2221 ttagctaaag gtaatttcgt gattatagca gctagaccat ctatagggaa aacagcttta + 2281 gctatagaca tggcgataaa tcttgcggtt actcaacagc gtagagttgg tttcctatct + 2341 ctagaaatga gcgcaggtca aattgttgag cggattattg ctaatttaac aggaatatct + 2401 ggtgaaaaat tacaaagagg ggatctctct aaagaagaat tattccgagt agaagaagct + 2461 ggagaaacgg ttagagaatc acatttttat atctgcagtg atagtcagta taagcttaac + 2521 ttaatcgcga atcagatccg gttgctgaga aaagaagatc gagtagacgt aatatttatc + 2581 gattacttgc agttgatcaa ctcatcggtt ggagaaaatc gtcaaaatga aatagcagat + 2641 atatctagaa ccttaagagg tttagcctca gagctaaaca ttcctatagt ttgtttatcc + 2701 caactatcta gaaaagttga ggatagagca aataaagttc ccatgctttc agatttgcga + 2761 gacagcggtc aaatagagca agacgcagat gtgattttgt ttatcaatag gaaggaatcg + 2821 tcttctaatt gtgagataac tgttgggaaa aatagacatg gatcggtttt ctcttcggta + 2881 ttacatttcg atccaaaaat tagtaaattc tccgctatta aaaaagtatg gtaaattata + 2941 gtaactgcca cttcatcaaa agtcctatcc accttgaaaa tcagaagttt ggaagaagac + 3001 ctggtcaatc tattaagata tctcccaaat tggctcaaaa tgggatggta gaagttatag + 3061 gtcttgattt tctttcatct cattaccatg cattagcagc tatccaaaga ttactgaccg + 3121 caacgaatta caaggggaac acaaaagggg ttgttttatc cagagaatca aatagttttc + 3181 aatttgaagg atggatacca agaatccgtt ttacaaaaac tgaattctta gaggcttatg + 3241 gagttaagcg gtataaaaca tccagaaata agtatgagtt tagtggaaaa gaagctgaaa + 3301 ctgctttaga agccttatac catttaggac atcaaccgtt tttaatagtg gcaactagaa + 3361 ctcgatggac taatggaaca caaatagtag accgttacca aactctttct ccgatcatta + 3421 ggatttacga aggatgggaa ggtttaactg acgaagaaaa tatagatata gacttaacac + 3481 cttttaattc accacctaca cggaaacata aagggttcgt tgtagagcca tgtcctatct + 3541 tggtagatca aatagaatcc tactttgtaa tcaagcctgc aaatgtatac caagaaataa + 3601 aaatgcgttt cccaaatgca tcaaagtatg cttacacatt tatcgactgg gtgattacag + 3661 cagctgcgaa aaagagacga aaattaacta aggataattc ttggccagaa aacttgttat + 3721 taaacgttaa cgttaaaagt cttgcatata ttttaaggat gaatcggtac atctgtacaa + 3781 ggaactggaa aaaaatcgag ttagctatcg ataaatgtat agaaatcgcc attcagcttg + 3841 gctggttatc tagaagaaaa cgcattgaat ttctggattc ttctaaactc tctaaaaaag + 3901 aaattctata tctaaataaa gagcgctttg aagaaataac taagaaatct aaagaacaaa + 3961 tggaacaatt agaacaagaa tctattaatt aatagcaagc ttgaaactaa aaacctaatt + 4021 tatttaaagc tcaaaataaa aaagagtttt aaaatgggaa attctggttt ttatttgtat + 4081 aacactgaaa actgcgtctt tgctgataat atcaaagttg ggcaaatgac agagccgctc + 4141 aaggaccagc aaataatcct tgggacaaca tcaacacctg tcgcagccaa aatgacagct + 4201 tctgatggaa tatctttaac agtctccaat aattcatcaa ccaatgcttc tattacaatt + 4261 ggtttggatg cggaaaaagc ttaccagctt attctagaaa agttgggaga tcaaattctt + 4321 gatggaattg ctgatactat tgttgatagt acagtccaag atattttaga caaaatcaaa + 4381 acagaccctt ctctaggttt gttgaaagct tttaacaact ttccaatcac taataaaatt + 4441 caatgcaacg ggttattcac tcccagtaac attgaaactt tattaggagg aactgaaata + 4501 ggaaaattca cagtcacacc caaaagctct gggagcatgt tcttagtctc agcagatatt + 4561 attgcatcaa gaatggaagg cggcgttgtt ctagctttgg tacgagaagg tgattctaag + 4621 ccctgcgcga ttagttatgg atactcatca ggcattccta atttatgtag tctaagaacc + 4681 agtattacta atacaggatt gactccgaca acgtattcat tacgtgtagg cggtttagaa + 4741 agcggtgtgg tatgggttaa tgccctttct aatggcaatg atattttagg aataacaaat + 4801 acttctaatg tatctttttt agaggtaata cctcaaacaa acgcttaaac aatttttatt + 4861 ggatttttct tataggtttt atatttagag aaaacagttc gaattacggg gtttgttatg + 4921 caaaataaaa gaaaagtgag ggacgatttt attaaaattg ttaaagatgt gaaaaaagat + 4981 ttccccgaat tagacctaaa aatacgagta aacaaggaaa aagtaacttt cttaaattct + 5041 cccttagaac tctaccataa aagtgtctca ctaattctag gactgcttca acaaatagaa + 5101 aactctttag gattattccc agactctcct gttcttgaaa aattagagga taacagttta + 5161 aagctaaaaa aggctttgat tatgcttatc ttgtctagaa aagacatgtt ttccaaggct + 5221 gaatagacaa cttactctaa cgttggagtt gatttgcaca ccttagtttt ttgctctttt + 5281 aagggaggaa ctggaaaaac aacactttct ctaaacgtgg gatgcaactt ggcccaattt + 5341 ttagggaaaa aagtgttact tgctgaccta gacccgcaat ccaatttatc ttctggattg + 5401 ggggctagtg tcagaagtga ccaaaaaggc ttgcacgaca tagtatacac atcaaacgat + 5461 ttaaaatcaa tcatttgcga aacaaaaaaa gatagtgtgg acctaattcc tgcatcattt + 5521 tcatccgaac agtttagaga attggatatt catagaggac ctagtaacaa cttaaagtta + 5581 tttctgaatg agtactgcgc tcctttttat gacatctgca taatagacac tccacctagc + 5641 ctaggagggt taacgaaaga agcttttgtt gcaggagaca aattaattgc ttgtttaact + 5701 ccagaacctt tttctattct agggttacaa aagatacgtg aattcttaag ttcggtcgga + 5761 aaacctgaag aagaacacat tcttggaata gctttgtctt tttgggatga tcgtaactcg + 5821 actaaccaaa tgtatataga cattatcgag tctatttaca aaaacaagct tttttcaaca + 5881 aaaattcgtc gagatatttc tctcagccgt tctcttctta aagaagattc tgtagctaat + 5941 gtctatccaa attctagggc cgcagaagat attctgaagt taacgcatga aatagcaaat + 6001 attttgcata tcgaatatga acgagattac tctcagagga caacgtgaac aaactaaaaa + 6061 aagaagcgga tgtctttttt aaaaaaaatc aaactgccgc ttctctagat tttaagaaga + 6121 cgcttccctc cattgaacta ttctcagcaa ctttgaattc tgaggaaagt cagagtttgg + 6181 atcgattatt tttatcagag tcccaaaact attcggatga agaattttat caagaagaca + 6241 tcctagcggt aaaactgctt actggtcaga taaaatccat acagaagcaa cacgtacttc + 6301 ttttaggaga aaaaatctat aatgctagaa aaatcctgag taaggatcac ttctcctcaa + 6361 caactttttc atcttggata gagttagttt ttagaactaa gtcttctgct tacaatgctc + 6421 ttgcatatta cgagcttttt ataaacctcc ccaaccaaac tctacaaaaa gagtttcaat + 6481 cgatccccta taaatccgca tatattttgg ccgctagaaa aggcgattta aaaaccaagg + 6541 tcgatgtgat agggaaagta tgtggaatgt cgaactcatc ggcgataagg gtgttggatc + 6601 aatttcttcc ttcatctaga aacaaagacg ttagagaaac gatagataag tctgattcag + 6661 agaagaatcg ccaattatct gatttcttaa tagagatact tcgcatcatg tgttccggag + 6721 tttctttgtc ctcctataac gaaaatcttc tacaacagct ttttgaactt tttaagcaaa + 6781 agagctgatc ctccgtcagc tcatatatat atatctatta tatatatata tttagggatt + 6841 tgatttcacg agagagattt gcaactcttg gtggtagact ttgcaactct tggtggtaga + 6901 ctttgcaact cttggtggta gactttgcaa ctcttggtgg tagacttggt cataatggac + 6961 ttttgttaaa aaatttatta aaatcttaga gctccgattt tgaatagctt tggttaagaa + 7021 aatgggctcg atggctttcc ataaaagtag attgttttta acttttgggg acgcgtcgga + 7081 aatttggtta tctactttat cttatctaac tagaaaaaat tatgcgtctg ggattaactt + 7141 tcttgtttct ttagagattc tggatttatc ggaaaccttg ataaaggcta tttctcttga + 7201 ccacagcgaa tctttgttta aaatcaagtc tctagatgtt tttaatggaa aagttgtttc + 7261 agaggcatct aaacaggcta gagcggcatg ctacatatct ttcacaaagt ttttgtatag + 7321 attgaccaag ggatatatta aacccgctat tccattgaaa gattttggaa acactacatt + 7381 ttttaaaatc cgagacaaaa tcaaaacaga atcgatttct aagcaggaat ggacagtttt + 7441 ttttgaagcg ctccggatag tgaattatag agactattta atcggtaaat tgattgtaca + 7501 ag +// + -- 1.7.10.2