From 2d63cfb4d8f84de5f40670bb301ee8a22db321ff Mon Sep 17 00:00:00 2001 From: Jim Procter Date: Mon, 9 Jun 2014 10:19:48 +0100 Subject: [PATCH] =?utf8?q?JAL-1260=20v2=20patch=20from=20David=20Rold=C3=A1n?= =?utf8?q?-Mart=C3=ADnez?= MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit --- src/jalview/datamodel/DBRefSource.java | 5 +- src/jalview/io/AppletFormatAdapter.java | 22 +- src/jalview/io/DnaUtils.java | 61 ++ src/jalview/io/GenBankFile.java | 873 +++++++++++++++++ src/jalview/io/IdentifyFile.java | 7 + src/jalview/io/xdb/genbank/GenBankFeature.java | 68 ++ src/jalview/io/xdb/genbank/GenBankLocation.java | 150 +++ .../io/xdb/genbank/GenBankLocationPoint.java | 140 +++ .../io/xdb/genbank/GenBankLocationRange.java | 86 ++ src/jalview/io/xdb/genbank/GenBankLocations.java | 98 ++ src/jalview/io/xdb/genbank/GenBankLocus.java | 138 +++ src/jalview/io/xdb/genbank/GenBankReference.java | 135 +++ src/jalview/io/xdb/genbank/GenBankSequence.java | 57 ++ src/jalview/io/xdb/genbank/GenBankSource.java | 85 ++ src/jalview/io/xdb/genbank/GenBankVersion.java | 89 ++ test/jalview/io/GU324925.1.gb | 1011 ++++++++++++++++++++ test/jalview/io/GenBankTest.java | 282 ++++++ test/jalview/io/M92650.1.gb | 92 ++ test/jalview/io/NC_000011.10.gb | 173 ++++ test/jalview/io/V00505.gb | 83 ++ 20 files changed, 3648 insertions(+), 7 deletions(-) mode change 100755 => 100644 src/jalview/datamodel/DBRefSource.java mode change 100755 => 100644 src/jalview/io/AppletFormatAdapter.java create mode 100644 src/jalview/io/DnaUtils.java create mode 100644 src/jalview/io/GenBankFile.java mode change 100755 => 100644 src/jalview/io/IdentifyFile.java create mode 100644 src/jalview/io/xdb/genbank/GenBankFeature.java create mode 100644 src/jalview/io/xdb/genbank/GenBankLocation.java create mode 100644 src/jalview/io/xdb/genbank/GenBankLocationPoint.java create mode 100644 src/jalview/io/xdb/genbank/GenBankLocationRange.java create mode 100644 
src/jalview/io/xdb/genbank/GenBankLocations.java create mode 100644 src/jalview/io/xdb/genbank/GenBankLocus.java create mode 100644 src/jalview/io/xdb/genbank/GenBankReference.java create mode 100644 src/jalview/io/xdb/genbank/GenBankSequence.java create mode 100644 src/jalview/io/xdb/genbank/GenBankSource.java create mode 100644 src/jalview/io/xdb/genbank/GenBankVersion.java create mode 100644 test/jalview/io/GU324925.1.gb create mode 100644 test/jalview/io/GenBankTest.java create mode 100644 test/jalview/io/M92650.1.gb create mode 100644 test/jalview/io/NC_000011.10.gb create mode 100644 test/jalview/io/V00505.gb diff --git a/src/jalview/datamodel/DBRefSource.java b/src/jalview/datamodel/DBRefSource.java old mode 100755 new mode 100644 index 1af18b6..b3b3b64 --- a/src/jalview/datamodel/DBRefSource.java +++ b/src/jalview/datamodel/DBRefSource.java @@ -74,7 +74,10 @@ public class DBRefSource * GeneDB ID */ public static final String GENEDB = "GeneDB"; - + /** + * GeneBank + */ + public static final String GENBANK = "GenBank"; /** * List of databases whose sequences might have coding regions annotated */ diff --git a/src/jalview/io/AppletFormatAdapter.java b/src/jalview/io/AppletFormatAdapter.java old mode 100755 new mode 100644 index d7da302..ead63c8 --- a/src/jalview/io/AppletFormatAdapter.java +++ b/src/jalview/io/AppletFormatAdapter.java @@ -41,7 +41,7 @@ public class AppletFormatAdapter */ public static final String[] READABLE_FORMATS = new String[] { "BLC", "CLUSTAL", "FASTA", "MSF", "PileUp", "PIR", "PFAM", "STH", - "PDB", "JnetFile", "RNAML" }; // , "SimpleBLAST" }; + "PDB", "JnetFile", "RNAML", "GENBANK" }; /** * List of valid format strings for use by callers of the formatSequences @@ -71,8 +71,8 @@ public class AppletFormatAdapter * corresponding to READABLE_FNAMES */ public static final String[] READABLE_EXTENSIONS = new String[] - { "fa, fasta, mfa, fastq", "aln", "pfam", "msf", "pir", "blc", "amsa", - "jar,jvp", "sto,stk", "xml,rnaml" }; // ".blast" + 
{ "fa,faa,fasta,mfa,fastq", "aln", "pfam", "msf", "pir", "blc", "amsa", + "jar,jvp", "sto,stk", "xml,rnaml", "gb" }; // ".blast" /** * List of readable formats by application in order corresponding to @@ -80,7 +80,7 @@ public class AppletFormatAdapter */ public static final String[] READABLE_FNAMES = new String[] { "Fasta", "Clustal", "PFAM", "MSF", "PIR", "BLC", "AMSA", "Jalview", - "Stockholm", "RNAML" };// , + "Stockholm", "RNAML", "GenBank" }; // "SimpleBLAST" // }; @@ -245,7 +245,10 @@ public class AppletFormatAdapter { afile = new RnamlFile(inFile, type); } - + else if (format.equals("GENBANK")) + { + afile = new GenBankFile(inFile, type); + } Alignment al = new Alignment(afile.getSeqsAsArray()); afile.addAnnotations(al); @@ -360,6 +363,10 @@ public class AppletFormatAdapter { afile = new SimpleBlastFile(source); } + else if (format.equals("GENBANK")) + { + afile = new GenBankFile(source); + } Alignment al = new Alignment(afile.getSeqsAsArray()); @@ -467,7 +474,10 @@ public class AppletFormatAdapter { afile = new RnamlFile(); } - + else if (format.equalsIgnoreCase("GENBANK")) + { + afile = new GenBankFile(); + } else { throw new Exception( diff --git a/src/jalview/io/DnaUtils.java b/src/jalview/io/DnaUtils.java new file mode 100644 index 0000000..acd0bb9 --- /dev/null +++ b/src/jalview/io/DnaUtils.java @@ -0,0 +1,61 @@ +package jalview.io; + +import jalview.io.xdb.genbank.GenBankFeature; +import jalview.io.xdb.genbank.GenBankSequence; + +import java.util.ArrayList; +import java.util.List; +import java.util.Vector; + +public class DnaUtils { + + /** + * @param gbf CDS feature data + * @param sequences ORIGIN data + * @return Nucleotid String (sequence) of CDS + */ + public static String getSequence(GenBankFeature gbf, Vector sequences){ + if (!gbf.getType().equals(GenBankFeature.CDS)){ + //If the feature is not a CDS, no sequence is returned + return null; + }else{ + String range = gbf.getQualifier("range"); + if (range.startsWith("join")){ + //TODO + //It's a 
composed sequence + }else{ + //It's a simple range + String[] positions = range.split(".."); + int initRange = Integer.parseInt(positions[0]); + int endRange = Integer.parseInt(positions[1]); + String sourceSequence = getNucleotidesFromSequenceVector(sequences); + return sourceSequence.substring(initRange, endRange); + } + } + return null; + + } + private static boolean isSequenceInRange(int initRange, int endRange, GenBankSequence gbs){ + return ((initRange>=gbs.getId()) && (endRange>=gbs.getId())); + } + private static String getNucleotidesInRangeFromSequence(int initRange, int endRange, GenBankSequence gbs){ + return ""; + } + public static String getNucleotidesFromSequenceVector(Vector v){ + StringBuffer sb = new StringBuffer(); + for (GenBankSequence gbs:v){ + Vector seqs = gbs.getSequences(); + for (String s:seqs) + sb.append(s); + } + return sb.toString(); + } + /** + * @param args + */ + public static void main(String[] args) { + // TODO Auto-generated method stub + + } + +} diff --git a/src/jalview/io/GenBankFile.java b/src/jalview/io/GenBankFile.java new file mode 100644 index 0000000..4715095 --- /dev/null +++ b/src/jalview/io/GenBankFile.java @@ -0,0 +1,873 @@ +package jalview.io; + +import jalview.datamodel.Alignment; +import jalview.datamodel.AlignmentI; +import jalview.datamodel.DBRefEntry; +import jalview.datamodel.DBRefSource; +import jalview.datamodel.Mapping; +import jalview.datamodel.Sequence; +import jalview.datamodel.SequenceFeature; +import jalview.datamodel.SequenceI; +import jalview.io.xdb.genbank.GenBankFeature; +import jalview.io.xdb.genbank.GenBankLocation; +import jalview.io.xdb.genbank.GenBankLocationPoint; +import jalview.io.xdb.genbank.GenBankLocationRange; +import jalview.io.xdb.genbank.GenBankLocations; +import jalview.io.xdb.genbank.GenBankLocus; +import jalview.io.xdb.genbank.GenBankReference; +import jalview.io.xdb.genbank.GenBankSequence; +import jalview.io.xdb.genbank.GenBankSource; +import 
jalview.io.xdb.genbank.GenBankVersion; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.Enumeration; +import java.util.Hashtable; +import java.util.List; +import java.util.Vector; +import java.util.logging.Level; +import java.util.logging.Logger; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +import org.apache.james.mime4j.field.ParsedField; + +public class GenBankFile extends AlignFile { + private static final Logger log = Logger.getLogger(GenBankFile.class.getName()); + private GenBankVersion version = new GenBankVersion(); + private GenBankLocus locus = new GenBankLocus(); + private GenBankSource source = new GenBankSource(); + private static final Pattern patLocation = Pattern.compile("(\\d+)\\.\\.(\\d+)"); + private static final Pattern patLocationComp = Pattern.compile("(complement)\\((\\d+)\\.\\.(\\d+)\\)"); + private static final Pattern patLocus = Pattern.compile("^LOCUS +([a-z|A-Z|0-9|_]+) +([0-9]+) bp ( {3}|ss\\-|ds\\-|ms\\-)([a-z|A-Z|-|\\s]+) ([a-z| ]{8}) ([A-Z| ]{3}) ([0-9]+-[A-Z]+-[0-9]+)"); + private static final Pattern patQualifierKey = Pattern.compile("/(.*?)="); + private static final Pattern patFeatureKey = Pattern.compile("^\\s{5}([A-Za-z0-9\\_\\']+)\\s+"); + + private String definition; + private String accession; + private String keywords; + private String dblink; + private String baseCount; + + private Vector features; + private Vector comments; + //Items under origin + private Vector sequences; + private Vector references; + + private SequenceI genBankSequence; + + public GenBankFile() { + } + + public GenBankFile(String inFile, String type) throws IOException { + super(inFile, type); + } + + public GenBankFile(FileParse source) throws IOException { + super(source); + } + + public void initData() { + super.initData(); + features = new Vector(); + comments = new Vector(); + sequences = new Vector(); + references = new Vector(); + } + + public void parse() throws IOException { + String line; 
+ boolean featureMode = false; //FEATURES found + boolean seqMode = false; //Parsing Sequences from SOURCE + boolean referenceMode = false; //REFERENCE found + boolean sourceMode = false; //SOURCE found + boolean commentMode = false; //COMMENT found + boolean parsingAuthors = false; //Parsing authors (multiline) + boolean parsingDefinition = false; //Parsing definition (multiline) + boolean parsingKeywords = false; //Parsing keywords (multiline) + boolean parsingDbLink = false; //Parsing DBLINK (multiline) + boolean parsingTitle = false; //Parsing title (multiline) + boolean parsingQualifier = false; //Parsing feature qualifier (multine) + String currentQualifierName = ""; + GenBankReference reference = null; + GenBankFeature feature = null; + List sourceLines = new ArrayList(); + + if (this.isValid()){ + + while ((line = nextLine()) != null) { + // We only process lines if they have contents within + if (line.length() == 0) + continue; + + if (line.startsWith("FEATURES")){ + featureMode = true; + seqMode = false; + referenceMode = false; + sourceMode = false; + commentMode = false; + feature = new GenBankFeature(); + source = parseSource(sourceLines); + } + + + if (seqMode) { + if (!line.startsWith("//")){ + GenBankSequence seq = processSequenceLine(line); + sequences.add(seq); + } + featureMode = false; + referenceMode = false; + sourceMode = false; + } + + if (line.startsWith("ORIGIN")){ + if (feature.getType()!=null) + features.add(feature); + featureMode = false; + referenceMode = false; + sourceMode = false; + seqMode = true; + } + + if (featureMode){ + // Process feature line + if (!line.startsWith("FEATURES") && !line.startsWith("BASE COUNT")){ + //Parse type + if (!line.trim().startsWith("/")){ + Matcher featuresMatch = patFeatureKey.matcher(line); + if (featuresMatch.find()){ + if (feature.getType()!=null) + features.add(feature); //Hay que añadirlo sólo si no se está a mitad de un qualif o una feature + //It's a feature + String type = 
featuresMatch.group(0); + feature = new GenBankFeature(); + feature.setType(type); + GenBankLocation loc = parserFeatureLocation(feature, line.replace(type,"")); + feature.setLocation(loc); + parsingQualifier = false; + continue; + }else if (parsingQualifier) { //If not a feature, it's another part of a qualifier + String qValue = feature.getQualifier(currentQualifierName); + StringBuffer sb = new StringBuffer().append(qValue).append(ltrim(line)); + feature.updateQualifier(currentQualifierName, sb.toString()); + continue; + } + }else{ + //It's the begining of a qualifier line + Matcher matcher = patQualifierKey.matcher(line); + if (matcher.find()){ + String qName = matcher.group(1); + currentQualifierName = qName.replace("/",""); + line = line.replace(qName,"").replace("/", "").replace("=",""); + feature.addQualifier(currentQualifierName, ltrim(line)); + parsingQualifier = true; + continue; + } + } + } + } + // Process REFERENCE line + if (line.startsWith("REFERENCE")) { + if (!referenceMode){ + //This is line is the REFERENCE line + referenceMode = true; + featureMode = false; + sourceMode = false; + seqMode = false; + }else{ + //We were at referenceMode, then add current reference to the list and create a new one + references.add(reference); + } + reference = new GenBankReference(); + String desc = processReferenceLine(line,"REFERENCE"); + int[] ranges = parseReferenceDescriptor(desc); + reference.setDescriptor(desc); + reference.setOrder(ranges[0]); + reference.setBegin(ranges[1]); + reference.setEnd(ranges[2]); + parsingAuthors = false; + parsingTitle = false; + continue; + } + + if (line.startsWith(" AUTHORS")){ + if (referenceMode){ + reference.setAuthors(processReferenceLine(line,"AUTHORS")); + parsingAuthors = true; + parsingTitle = false; + } + continue; + } + if (line.startsWith(" TITLE")){ + if (referenceMode){ + reference.setTitle(processReferenceLine(line,"TITLE")); + parsingAuthors = false; + parsingTitle = true; + } + continue; + } + if 
(line.startsWith(" JOURNAL")){ + if (referenceMode){ + reference.setJournal(processReferenceLine(line,"JOURNAL")); + parsingTitle = false; + parsingAuthors = false; + } + continue; + } + if (line.startsWith(" PUBMED")){ + if (referenceMode){ + reference.setPubmed(processReferenceLine(line,"PUBMED")); + parsingTitle = false; + parsingAuthors = false; + } + continue; + } + + if (line.startsWith(" MEDLINE")){ + if (referenceMode){ + reference.setMedline(processReferenceLine(line,"MEDLINE")); + parsingTitle = false; + parsingAuthors = false; + } + continue; + } + if (line.startsWith(" REMARK")){ + if (referenceMode){ + reference.setRemark(processReferenceLine(line,"REMARK")); + parsingTitle = false; + parsingAuthors = false; + } + continue; + } + if (line.startsWith(" CONSRTM")){ + if (referenceMode){ + reference.setConsortia(processReferenceLine(line,"CONSRTM")); + parsingTitle = false; + parsingAuthors = false; + } + continue; + } + + + if (line.startsWith("SOURCE")) { + parsingKeywords = false; + sourceMode = true; + commentMode = false; + if (sourceMode){ + sourceLines.add(line); + } + continue; + } + if (line.indexOf("ORGANISM")!=-1) { + if (sourceMode){ + sourceLines.add(line); + continue; + } + } + + if (line.startsWith("COMMENT")){ + if (reference!=null) + references.add(reference); + commentMode = true; + sourceMode = false; + referenceMode = false; + sourceMode = false; + seqMode = false; + comments.add(processCommentLine(line)); + continue; + } + // Process LOCUS line + if (line.startsWith("LOCUS")) { + locus = parseLocus(line); + continue; + } + // Process BASE COUNT line + if (line.startsWith("BASE COUNT")) { + baseCount = processHeaderLine(line,"BASE COUNT"); + featureMode = false; + continue; + } + // Process DEFINITION line + if (line.startsWith("DEFINITION")) { + definition = processHeaderLine(line,"DEFINITION"); + parsingDefinition = true; + continue; + } + // Process ACCESSION line + if (line.startsWith("ACCESSION")) { + accession = 
processHeaderLine(line,"ACCESSION"); + parsingDefinition = false; + continue; + } + // Process VERSION line + if (line.startsWith("VERSION")) { + version = parseVersion(line); + //headers.put("VERSION", processHeaderLine(line,"VERSION")); + continue; + } + // Process DBLINK line + if (line.startsWith("DBLINK")) { + dblink = processHeaderLine(line,"DBLINK"); + parsingDbLink = true; + continue; + } + // Process KEYWORDS line + if (line.startsWith("KEYWORDS")) { + keywords = processHeaderLine(line,"KEYWORDS"); + parsingKeywords = true; + parsingDbLink = false; + continue; + } + if (sourceMode){ + sourceLines.add(line); + continue; + } + if (parsingDefinition){ + StringBuffer sb = new StringBuffer().append(definition).append(line); + definition = sb.toString(); + continue; + } + if (referenceMode && parsingAuthors){ + if (reference!=null){ + StringBuffer authors = new StringBuffer().append(reference.getAuthors()).append(line); + reference.setAuthors(authors.toString()); + } + continue; + } + if (referenceMode && parsingTitle){ + if (reference!=null){ + StringBuffer title = new StringBuffer().append(reference.getTitle()).append(line); + reference.setTitle(title.toString()); + } + continue; + } + if (parsingKeywords){ + StringBuffer sb = new StringBuffer().append(keywords).append(line); + keywords = sb.toString(); + continue; + } + if (parsingDbLink){ + StringBuffer sb = new StringBuffer().append(dblink).append(line); + dblink = sb.toString(); + continue; + } + if (commentMode){ + comments.add(line); + } + } + setEntries(); + }else{ + //File is not valid + throw new IOException("GenBankFile is not valid."); + } + } + + protected void setEntries(){ + StringBuffer result = new StringBuffer(); + //Mapping GenBank info into Jalview data model + genBankSequence = new Sequence(accession,DnaUtils.getNucleotidesFromSequenceVector(sequences)); + //Mapping DBRefEntry + DBRefEntry dbRef = new DBRefEntry(); + dbRef.setSource(DBRefSource.GENBANK); + dbRef.setVersion(version == null ? 
"" : version.toString()); + dbRef.setAccessionId(accession); + // add map to indicate the sequence is a valid coordinate frame for the dbref + dbRef.setMap(new Mapping(null, new int[] + { 1, genBankSequence.getLength() }, new int[] + { 1, genBankSequence.getLength() }, 1, 1)); + genBankSequence.addDBRef(dbRef); + + //add header info as non-positional features + //add LOCUS + SequenceFeature locusF = new SequenceFeature("LOCUS", (locus == null ? "" : locus.toString()), null, 1, genBankSequence.getLength(), DBRefSource.GENBANK); + genBankSequence.addSequenceFeature(locusF); + //add DEFNITION + SequenceFeature defF = new SequenceFeature("DEFINITION", definition, null, 1, genBankSequence.getLength(), DBRefSource.GENBANK); + genBankSequence.addSequenceFeature(defF); + //add ACCESSION + SequenceFeature accessionF = new SequenceFeature("ACCESSION", accession, null, 1, genBankSequence.getLength(), DBRefSource.GENBANK); + genBankSequence.addSequenceFeature(accessionF); + //add VERSION + SequenceFeature versionF = new SequenceFeature("VERSION", (version == null ? "" : version.toString()), null, 1, genBankSequence.getLength(), DBRefSource.GENBANK); + genBankSequence.addSequenceFeature(versionF); + //add DBLINK + SequenceFeature dblinkF = new SequenceFeature("DBLINK", (dblink == null ? "" : dblink.toString()), null, 1, genBankSequence.getLength(), DBRefSource.GENBANK); + genBankSequence.addSequenceFeature(dblinkF); + //add KEYWORDS + SequenceFeature keywordsF = new SequenceFeature("KEYWORDS", keywords, null, 1, genBankSequence.getLength(), DBRefSource.GENBANK); + genBankSequence.addSequenceFeature(keywordsF); + //add SOURCE + SequenceFeature sourceF = new SequenceFeature("SOURCE", (source == null ? "" : source.toString()), null, 1, genBankSequence.getLength(), DBRefSource.GENBANK); + genBankSequence.addSequenceFeature(sourceF); + //add BASE COUNT + SequenceFeature baseCountF = new SequenceFeature("BASE COUNT", (baseCount == null ? 
"" : baseCount.toString()), null, 1, genBankSequence.getLength(), DBRefSource.GENBANK); + genBankSequence.addSequenceFeature(baseCountF); + + // add literature and database cross references in the file + for (GenBankReference gbRef:references){ + //They are non-positional features + SequenceFeature refFeature = new SequenceFeature("REFERENCE", gbRef.toString(),null,gbRef.getBegin(),gbRef.getEnd(),DBRefSource.GENBANK); + genBankSequence.addSequenceFeature(refFeature); + } + //add COMMENTS + if (comments.size()>0){ + StringBuffer sb = new StringBuffer(); + for (String comment: comments){ + sb.append(comment).append(newline); + } + SequenceFeature commentF = new SequenceFeature("COMMENT", sb.toString(), null, 1, genBankSequence.getLength(), DBRefSource.GENBANK); + genBankSequence.addSequenceFeature(commentF); + } + //Mapping FEATURES + for (GenBankFeature feature:features){ + if (feature.getType()!=null){ + SequenceFeature sf = new SequenceFeature(); + sf.setType(feature.getType()); + sf.setDescription(feature.getType()); + + sf.setBegin(feature.getLocation()==null ? 0 : feature.getLocation().getMinor()); + sf.setEnd(feature.getLocation()==null ? 
0 : feature.getLocation().getMajor()); + Enumeration names = feature.getQualifiersNames(); + while (names.hasMoreElements()){ + String qName = names.nextElement(); + String qValue = feature.getQualifier(qName); + sf.setValue(qName, qValue); + } + genBankSequence.addSequenceFeature(sf); + } + } + SequenceI[] parsedSeqs = new SequenceI[1]; + parsedSeqs[0] = genBankSequence; + this.setSeqs(parsedSeqs); + } + private GenBankVersion parseVersion(String line) { + //VERSION U00096.2 GI:48994873 + if (line.trim().equalsIgnoreCase("VERSION")){ + return null; + }else{ + GenBankVersion ver = new GenBankVersion(); + String v = line.substring(11, line.indexOf(" ", 12)).trim(); + ver.setVersion(v); + int posGI = line.indexOf("GI:", 11 + v.length()); + if (posGI > -1) { + ver.setGI(line.substring(posGI)); + } + return ver; + } + } + + private GenBankLocus parseLocus(String line){ + GenBankLocus loc = new GenBankLocus(); + Matcher mat = patLocus.matcher(line); + if (mat.find()) { + String name = mat.group(1); + String len = mat.group(2); + String strand = mat.group(3); + String mtype = mat.group(4); + String linear = mat.group(5); + String division = mat.group(6); + String date = mat.group(7); + + loc.setName(name == null ? "" : name.trim()); + loc.setSequenceLength(len == null ? 0 : Integer.parseInt(len)); + loc.setStrand(strand == null ? "" : strand); + loc.setMoleculeType(mtype == null ? "" : mtype); + loc.setLinearSequence("linear".equals(linear)); + loc.setDivision(division == null ? "" : division); + loc.setModificationDate(date == null ? 
"" :date); + } + return loc; + } + private GenBankSource parseSource(List lines){ + StringBuffer sb = new StringBuffer(); + for(String line:lines){ + sb.append(line).append(newline); + } + // Source section + GenBankSource sou = new GenBankSource(); + String aux = sb.toString().substring(11); + int fim1 = aux.indexOf("\n"); + if (fim1 > -1) { + sou.setSource(aux.substring(0, fim1)); + int ini2 = aux.indexOf("ORGANISM"); + if (ini2 > -1) { + fim1 = aux.indexOf("\n", ini2 + 10); + if (fim1 > -1) { + sou.setOrganism(aux.substring(ini2 + 10, fim1)); + sou.setTaxonomic(aux.substring(fim1).replaceAll(" ", "").replaceAll("\\s+", "")); + } else { + sou.setOrganism(aux); + } + } + } else { + sou.setSource(aux); + } + return sou; + } + + /** + * Possible situations: + * + * 467 Points to a single base in the presented sequence 340..565 Points to + * a continuous range of bases bounded by and including the starting and + * ending bases <345..500 Indicates that the exact lower boundary point + * of a feature is unknown. 
The location begins at some base previous to the + * first base specified (which need not be contained in the presented + * sequence) and continues to and includes the ending base <1..888 The + * feature starts before the first sequenced base and continues to and + * includes base 888 1..>888 The feature starts at the first sequenced + * base and continues beyond base 888 102.110 Indicates that the exact + * location is unknown but that it is one of the bases between bases 102 and + * 110, inclusive 123^124 Points to a site between bases 123 and 124 + * join(12..78,134..202) Regions 12 to 78 and 134 to 202 should be joined to + * form one contiguous sequence complement(34..126) Start at the base + * complementary to 126 and finish at the base complementary to base 34 (the + * feature is on the strand complementary to the presented strand) + * complement(join(2691..4571,4918..5163)) Joins regions 2691 to 4571 and + * 4918 to 5163, then complements the joined segments (the feature is on the + * strand complementary to the presented strand) + * join(complement(4918..5163),complement(2691..4571)) Complements regions + * 4918 to 5163 and 2691 to 4571, then joins the complemented segments (the + * feature is on the strand complementary to the presented strand) + * J00194.1:100..202 Points to bases 100 to 202, inclusive, in the entry (in + * this database) with primary accession number 'J00194' + * join(1..100,J00194.1:100..202) Joins region 1..100 of the existing entry + * with the region 100..202 of remote entry J00194 + * + * @param fea + * @param localiza + */ + private GenBankLocation parserFeatureLocation(GenBankFeature fea, String localiza) { + // remove os espaços, quebra de linhas etc + String buf = localiza.replaceAll("\\s", ""); + + // checks if there is a comma present between ranges + // complement(100..110),complement(90..100) + char[] buf2 = buf.toCharArray(); + int abertos = 0; + java.util.List lista = new java.util.ArrayList(); + int pinicial = 0; + for 
(int i = 0; i < buf2.length; i++) { + if (buf2[i] == '(') { + abertos++; + } else if (buf2[i] == ')') { + abertos--; + } else if (buf2[i] == ',' && abertos == 0) { + lista.add(buf.substring(pinicial, i)); + pinicial = i + 1; + } + } + if (lista.size() > 0) { + lista.add(buf.substring(pinicial)); + GenBankLocations um = new GenBankLocations(); + um.setOperator(GenBankLocations.NONE); + for (String s : lista) { + um.getUnits().add(parserFeatureLocation(fea, s)); + } + fea.setLocation(um); + return um; + } + + // trata as funcoes: complement(location,location...), + // join(location,location...), order(location,location...) + if (buf.contains("(")) { + GenBankLocations um = new GenBankLocations(); + int ini = buf.indexOf("("); + int fim = buf.lastIndexOf(")"); + String token = buf.substring(0, ini); + if ("complement".equalsIgnoreCase(token)) { + String inter = buf.substring(ini + 1, fim); + GenBankLocation interno = parserFeatureLocation(fea, inter); + interno.setComplement(true); + um.setOperator(GenBankLocations.COMPLEMENT); + um.getUnits().add(interno); + fea.setLocation(um); + } else if ("join".equalsIgnoreCase(token)) { + String inter = buf.substring(ini + 1, fim); + GenBankLocation interno = parserFeatureLocation(fea, inter); + um.setOperator(GenBankLocations.JOIN); + um.getUnits().add(interno); + fea.setLocation(um); + } else if ("order".equalsIgnoreCase(token)) { + String inter = buf.substring(ini + 1, fim); + GenBankLocation interno = parserFeatureLocation(fea, inter); + um.setOperator(GenBankLocations.ORDER); + um.getUnits().add(interno); + fea.setLocation(um); + } else { + log.log(Level.WARNING, "Token desconhecido em location/features - {0}", token); + String inter = buf.substring(ini + 1, fim); + fea.setLocation(parserFeatureLocation(fea, inter)); + } + return fea.getLocation(); + } else { + // trata quando tiver uma lista de location + if (buf.contains(",")) { + String[] partes = buf.split(","); + GenBankLocations um = new GenBankLocations(); + for 
(String p : partes) { + um.getUnits().add( + parserFeatureLocation(fea, p)); + } + fea.setLocation(um); + return um; + } else { + // trata quando tiver range + if (buf.contains("..")) { + String[] partes = buf.split("\\.\\."); + GenBankLocationRange range = new GenBankLocationRange(); + if (buf.contains(":")) { + for (int i = 0; i < partes.length; i++) { + int pos = partes[i].indexOf(":"); + if (pos > 0) { + String entry = partes[i].substring(0, pos); + partes[i] = partes[i].substring(pos + 1); + range.setEntry(entry); + } + } + } + GenBankLocationPoint gp0 = (GenBankLocationPoint) parserFeatureLocation(fea, partes[0]); + range.setStart(gp0); + GenBankLocationPoint gp1 = (GenBankLocationPoint) parserFeatureLocation(fea, partes[1]); + range.setEnd(gp1); + fea.setLocation(range); + return range; + } else { + // trata um ponto + // possibilidades consideradas: + // 467 + // 102.110 + // 123^124 + // <345 + // >400 + // 345> + // 400< + // ou uma combinacao dessas + GenBankLocationPoint gp = new GenBankLocationPoint(); + if (buf.contains(":")) { + int pos = buf.indexOf(":"); + if (pos > 0) { + String entry = buf.substring(0, pos); + buf = buf.substring(pos + 1); + gp.setEntry(entry); + } + } + int pos = 0; + // verifica os simb < e > antes do primeiro numero + if (buf.charAt(pos) == '<' || buf.charAt(pos) == '>') { + gp.setPrefix(buf.charAt(pos)); + pos++; + } + // pega o primeiro numero + int ini = pos; + while (pos < buf.length() && buf.charAt(pos) >= '0' + && buf.charAt(pos) <= '9') { + pos++; + } + if (buf.subSequence(ini, pos).length() < 1) { + System.out.println(localiza); + } + int num = Integer.parseInt(buf.substring(ini, pos)); + int num2 = num; + // o primeiro numero pode ser o unico numero + if (pos < buf.length()) { + // verifica se tem os sinais < e > apos o primeiro numero + if (buf.charAt(pos) == '<' || buf.charAt(pos) == '>') { + if (buf.contains(".") || buf.contains("^")) { + gp.setPrefix(buf.charAt(pos)); + } else { + gp.setSufix(buf.charAt(pos)); + } 
+ pos++; + } + + // verifica a separacao dos numeros . ou ^ + if (pos < buf.length() + && (buf.charAt(pos) == '.' || buf.charAt(pos) == '^')) { + // separação localizada, possibilidade de mais numero + gp.setSymbol(buf.charAt(pos)); + pos++; + + // verifica os simb < e > antes do segundo numero + if (buf.charAt(pos) == '<' || buf.charAt(pos) == '>') { + gp.setSufix(buf.charAt(pos)); + pos++; + } + + // pega o segundo numero + ini = pos; + while (pos < buf.length() && buf.charAt(pos) >= '0' + && buf.charAt(pos) <= '9') { + pos++; + } + num2 = Integer.parseInt(buf.substring(ini, pos)); + + // verifica os simb < e > após o segundo numero + if (pos < buf.length() && (buf.charAt(pos) == '<' || buf.charAt(pos) == '>')) { + gp.setSufix(buf.charAt(pos)); + pos++; + } + } + } + gp.setMin(num); + gp.setMax(num2); + fea.setLocation(gp); + return gp; + } + } + } + } + + private int[] parseReferenceDescriptor(String descriptor){ + // 1 (bases 1 to 1609) + int[] resultado = new int[3]; + descriptor = descriptor.replace("(bases", ",").replace("to", ",").replace(")", ""); + String[] args = descriptor.split(","); + resultado[0] = Integer.parseInt(args[0].trim()); + resultado[1] = Integer.parseInt(args[1].trim()); + resultado[2] = Integer.parseInt(args[2].trim()); + return resultado; + } + private String processReferenceLine(String line, String component){ + int init = line.indexOf(component); + if (init!=-1){ + line = line.replace(component,""); + } + return line; + } + private String processHeaderLine(String line, String header){ + int init = line.indexOf(header); + if (init!=-1){ + line = line.replace(header,""); + } + return line; + } + + private GenBankSequence processSequenceLine(String line) { + GenBankSequence gbs = new GenBankSequence(); + line = ltrim(line); + String[] args = line.split(" "); + gbs.setId(Integer.parseInt(args[0])); + int len = args.length-1; + Vector seqs = new Vector(); + for (int i=0;i= 0 && Character.isWhitespace(s.charAt(i))) { + i--; + } + return 
s.substring(0,i+1); + } + + public String ltrim(String s) { + int i = 0; + while (i < s.length() && Character.isWhitespace(s.charAt(i))) { + i++; + } + return s.substring(i); + } + + public String print(){ + StringBuffer out = new StringBuffer(); + for (SequenceI seq: this.getSeqs()){ + SequenceFeature[] seqFeatures = seq.getSequenceFeatures(); + boolean featureLinePrinted = false; + for(SequenceFeature sf:seqFeatures){ + if(sf.getType().equals("LOCUS")){ + out.append(sf.getDescription()).append(newline); + }else if (sf.getType().equals("DEFINITION")){ + out.append("DEFINITION ").append(sf.getDescription()).append(newline); + }else if (sf.getType().equals("VERSION")){ + out.append("VERSION ").append(sf.getDescription()).append(newline); + }else if (sf.getType().equals("ACCESSION")){ + out.append("ACCESSION ").append(sf.getDescription()).append(newline); + }else if (sf.getType().equals("DBLINK")){ + out.append("DBLINK ").append(sf.getDescription()).append(newline); + }else if (sf.getType().equals("KEYWORDS")){ + out.append("KEYWORDS ").append(sf.getDescription()).append(newline); + }else if (sf.getType().equals("SOURCE")){ + out.append("SOURCE ").append(sf.getDescription()).append(newline); + }else if (sf.getType().equals("REFERENCE")){ + out.append(sf.getDescription()).append(newline); + }else if (sf.getType().equals("COMMENT")){ + out.append("COMMENT ").append(sf.getDescription()).append(newline); + }else if (sf.getType().equals("BASE COUNT")){ + out.append("BASE COUNT ").append(sf.getDescription()).append(newline); + }else{ + if (!featureLinePrinted){ + out.append("FEATURES Location/Qualifiers").append(newline); + featureLinePrinted = true; + } + out.append(" ").append(sf.getType()).append(" ").append(sf.getBegin()).append("..").append(sf.getEnd()).append(newline); + Hashtable qualifiers = sf.otherDetails; + if (qualifiers!=null){ + Enumeration keys = qualifiers.keys(); + while (keys.hasMoreElements()){ + String key = keys.nextElement(); + String value = 
qualifiers.get(key); + if (value!=null){ + out.append(" /").append(key).append("=").append(value).append(newline); + } + } + } + } + } + out.append("ORIGIN").append(newline); + //We have to divide sequence in groups of 6x10 chars + String sequenceString = seq.getSequenceAsString(); + int howManyGroups = (int) Math.floor(sequenceString.length()/60); + for (int i=0;i<=howManyGroups;i++){ + String sequenceSegment = sequenceString.substring(i*60,Math.min((i+1)*60, sequenceString.length())); + if ((!"".equals(sequenceSegment) && (sequenceSegment!=null) && (sequenceSegment.length()>0))){ + out.append(" ").append(60*i+1).append(" "); + } + int segmentLength = sequenceSegment.length(); + if (segmentLength>=10){ + out.append(sequenceSegment.substring(0,10)).append(" "); + if (segmentLength>=20){ + out.append(sequenceSegment.substring(10,20)).append(" "); + if (segmentLength>=30){ + out.append(sequenceSegment.substring(20,30)).append(" "); + if (segmentLength>=40){ + out.append(sequenceSegment.substring(30,40)).append(" "); + if (segmentLength>=50){ + out.append(sequenceSegment.substring(40,50)).append(" "); + if (segmentLength<=60){ + out.append(sequenceSegment.substring(50,sequenceSegment.length())); + } + }else{ + out.append(sequenceSegment.substring(40,sequenceSegment.length())); + } + }else{ + out.append(sequenceSegment.substring(30,sequenceSegment.length())); + } + }else{ + out.append(sequenceSegment.substring(20,sequenceSegment.length())); + } + }else{ + out.append(sequenceSegment.substring(10,sequenceSegment.length())); + } + } else if ((!"".equals(sequenceSegment) && (sequenceSegment!=null) && (sequenceSegment.length()>0))){ + out.append(sequenceSegment); + } + out.append(newline); + } + out.append("//"); + } + return out.toString(); + } +} diff --git a/src/jalview/io/IdentifyFile.java b/src/jalview/io/IdentifyFile.java old mode 100755 new mode 100644 index 08d4dca..8ce2af5 --- a/src/jalview/io/IdentifyFile.java +++ b/src/jalview/io/IdentifyFile.java @@ -128,6 
+128,13 @@ public class IdentifyFile } data = data.toUpperCase(); + if ((data.indexOf("LOCUS") > -1)) + { + reply = "GENBANK"; + + break; + } + if ((data.indexOf("# STOCKHOLM") > -1)) { reply = "STH"; diff --git a/src/jalview/io/xdb/genbank/GenBankFeature.java b/src/jalview/io/xdb/genbank/GenBankFeature.java new file mode 100644 index 0000000..0de2e65 --- /dev/null +++ b/src/jalview/io/xdb/genbank/GenBankFeature.java @@ -0,0 +1,68 @@ +package jalview.io.xdb.genbank; + +import java.util.Enumeration; +import java.util.Hashtable; + +public class GenBankFeature { + public static final String MISC_TYPE = "misc_feature"; + public static final String SOURCE = "source"; + public static final String CDS = "CDS"; + public static final String GENE = "gene"; + public static final String EXON = "exon"; + public static final String INTRON = "intron"; + public static final String PRIM_TRANSCRIPT = "prim_transcript"; + public static final String mRNA = "mRNA"; + public static final String MOBILE_ELEMENT = "mobile_element"; + public static final String VARIATION = "variation"; + + private String type; + private Hashtable qualifiers = new Hashtable(); + private GenBankLocation location = null; + + public GenBankFeature() { + super(); + } + + public GenBankFeature(String type) { + super(); + this.type = type; + } + + public void addQualifier(String key, String value){ + this.qualifiers.put(key, value); + } + public void updateQualifier(String key, String newValue){ + this.qualifiers.remove(key); + this.qualifiers.put(key, newValue); + } + + public String getQualifier(String key){ + return this.qualifiers.get(key); + } + + public String getType() { + return type; + } + + public void setType(String type) { + this.type = type; + } + public Enumeration getQualifiersNames(){ + return this.qualifiers.keys(); + } + public int getQualifiersSize(){ + return this.qualifiers.size(); + } + + public Hashtable getFields() { + return qualifiers; + } + + public GenBankLocation getLocation() { + 
return location; + } + + public void setLocation(GenBankLocation location) { + this.location = location; + } +} diff --git a/src/jalview/io/xdb/genbank/GenBankLocation.java b/src/jalview/io/xdb/genbank/GenBankLocation.java new file mode 100644 index 0000000..5d0db6c --- /dev/null +++ b/src/jalview/io/xdb/genbank/GenBankLocation.java @@ -0,0 +1,150 @@ +package jalview.io.xdb.genbank; + +/** + * The location contains at least one sequence location descriptor and may + * contain one or more operators with one or more sequence location descriptors. + * Base numbers refer to the numbering in the entry. This numbering designates + * the first base (5' end) of the presented sequence as base 1. + * Base locations beyond the range of the presented sequence may not be used in + * location descriptors, the only exception being location in a remote entry (see + * 3.5.2.1, e). + * + * Location operators and descriptors are discussed in more detail below. + * + * 3.5.2.1 Location descriptors + * The location descriptor can be one of the following: + * (a) a single base number + * (b) a site between two indicated adjoining bases + * (c) a single base chosen from within a specified range of bases (not allowed for new + * entries) + * (d) the base numbers delimiting a sequence span + * (e) a remote entry identifier followed by a local location descriptor + * (i.e., a-d) + * + * A site between two adjoining nucleotides, such as endonucleolytic cleavage + * site, is indicated by listing the two points separated by a carat (^). The + * permitted formats for this descriptor are n^n+1 (for example 55^56), or, for + * circular molecules, n^1, where "n" is the full length of the molecule, ie + * 1000^1 for circular molecule with length 1000. + * + * A single base chosen from a range of bases is indicated by the first base + * number and the last base number of the range separated by a single period + * (e.g., '12.21' indicates a single base taken from between the indicated + * points). 
From October 2006 the usage of this descriptor is restricted : + * it is illegal to use "a single base from a range" (c) either on its own or + * in combination with the "sequence span" (d) descriptor for newly created entries. + * The existing entries where such descriptors exist are going to be retrofitted. + * + * Sequence spans are indicated by the starting base number and the ending base + * number separated by two periods (e.g., '34..456'). The '<' and '>' symbols may + * be used with the starting and ending base numbers to indicate that an end + * point is beyond the specified base number. The starting and ending base + * positions can be represented as distinct base numbers ('34..456') or a site + * between two indicated adjoining bases. + * + * A location in a remote entry (not the entry to which the feature table + * belongs) can be specified by giving the accession-number and sequence version + * of the remote entry, followed by a colon ":", followed by a location + * descriptor which applies to that entry's sequence (i.e. J12345.1:1..15, see + * also examples below) + * + * 3.5.2.2 Operators + * + * The location operator is a prefix that specifies what must be done to the + * indicated sequence to find or construct the location corresponding to the + * feature. A list of operators is given below with their definitions and most + * common format. + * + * complement(location) + * Find the complement of the presented sequence in the span specified by " + * location" (i.e., read the complement of the presented strand in its 5'-to-3' + * direction) + * + * join(location,location, ... location) + * The indicated elements should be joined (placed end-to-end) to form one + * contiguous sequence + * + * order(location,location, ... 
location) + * The elements can be found in the + * specified order (5' to 3' direction), but nothing is implied about the + * reasonableness about joining them + * + * Note : location operator "complement" can be used in combination with either " + * join" or "order" within the same location; combinations of "join" and "order" + * within the same location (nested operators) are illegal. + * + * 3.5.3 Location examples + * + * The following is a list of common location descriptors with their meanings: + * Location Description + * 467 Points to a single base in the presented sequence + * 340..565 Points to a continuous range of bases bounded by and + * including the starting and ending bases + * <345..500 Indicates that the exact lower boundary point of a feature + * is unknown. The location begins at some base previous to + * the first base specified (which need not be contained in + * the presented sequence) and continues to and includes the + * ending base + * <1..888 The feature starts before the first sequenced base and + * continues to and includes base 888 + * 1..>888 The feature starts at the first sequenced base and + * continues beyond base 888 + * 102.110 Indicates that the exact location is unknown but that it is + * one of the bases between bases 102 and 110, inclusive + * 123^124 Points to a site between bases 123 and 124 + * join(12..78,134..202) Regions 12 to 78 and 134 to 202 should be joined to form + * one contiguous sequence + * complement(34..126) Start at the base complementary to 126 and finish at the + * base complementary to base 34 (the feature is on the strand + * complementary to the presented strand) + * complement(join(2691..4571,4918..5163)) + * Joins regions 2691 to 4571 and 4918 to 5163, then + * complements the joined segments (the feature is on the + * strand complementary to the presented strand) + * join(complement(4918..5163),complement(2691..4571)) + * Complements regions 4918 to 5163 and 2691 to 4571, then + * joins the 
complemented segments (the feature is on the + * strand complementary to the presented strand) + * J00194.1:100..202 Points to bases 100 to 202, inclusive, in the entry (in + * this database) with primary accession number 'J00194' + * join(1..100,J00194.1:100..202) + * Joins region 1..100 of the existing entry with the region + * 100..202 of remote entry J00194 + * + * + */ +public abstract class GenBankLocation { + // the location is complement strand? + private boolean complement = false; + + public GenBankLocation() { + } + + /** + * The minor location in genome sequence + * + * @return position + */ + public abstract int getMinor(); + + /** + * The major location in genome sequence + * + * @return position + */ + public abstract int getMajor(); + + /** + * @return the complement + */ + public boolean isComplement() { + return complement; + } + + /** + * @param complement the complement to set + */ + public void setComplement(boolean complement) { + this.complement = complement; + } +} \ No newline at end of file diff --git a/src/jalview/io/xdb/genbank/GenBankLocationPoint.java b/src/jalview/io/xdb/genbank/GenBankLocationPoint.java new file mode 100644 index 0000000..6d3a475 --- /dev/null +++ b/src/jalview/io/xdb/genbank/GenBankLocationPoint.java @@ -0,0 +1,140 @@ +package jalview.io.xdb.genbank; + +/** + * + */ +public class GenBankLocationPoint extends GenBankLocation { + private String entry; + private char prefix = 0; + private int min = 0; + private char symbol = 0; + private int max = 0; + private char sufix = 0; + + public GenBankLocationPoint() { + } + + public GenBankLocationPoint(int point) { + this.min = point; + this.max = point; + } + + public GenBankLocationPoint(int min, int max) { + this.min = min; + this.max = max; + } + + public int getMinor() { + return this.min; + } + + public int getMajor() { + return this.max; + } + + @Override + public String toString() { + StringBuilder sb = new StringBuilder(); + if( prefix != 0 && prefix != ' ' ) { + 
sb.append(prefix); + } + if( symbol == '.' || symbol == '^' ) { + sb.append( String.format("%d%c%d",min,symbol,max) ); + } else { + if( min != max ) { + sb.append( String.format("%d.%d",min,max) ); + } else { + sb.append( min ); + } + } + if( sufix != 0 && sufix != ' ' ) { + sb.append(sufix); + } + return sb.toString(); + } + + /** + * @return the prefix + */ + public char getPrefix() { + return prefix; + } + + /** + * @param prefix the prefix to set + */ + public void setPrefix(char prefix) { + this.prefix = prefix; + } + + /** + * @return the min + */ + public int getMin() { + return min; + } + + /** + * @param min the min to set + */ + public void setMin(int min) { + this.min = min; + } + + /** + * @return the symbol + */ + public char getSymbol() { + return symbol; + } + + /** + * @param symbol the symbol to set + */ + public void setSymbol(char symbol) { + this.symbol = symbol; + } + + /** + * @return the max + */ + public int getMax() { + return max; + } + + /** + * @param max the max to set + */ + public void setMax(int max) { + this.max = max; + } + + /** + * @return the sufix + */ + public char getSufix() { + return sufix; + } + + /** + * @param sufix the sufix to set + */ + public void setSufix(char sufix) { + this.sufix = sufix; + } + + /** + * @return the entry + */ + public String getEntry() { + return entry; + } + + /** + * @param entry the entry to set + */ + public void setEntry(String entry) { + this.entry = entry; + } + +} diff --git a/src/jalview/io/xdb/genbank/GenBankLocationRange.java b/src/jalview/io/xdb/genbank/GenBankLocationRange.java new file mode 100644 index 0000000..552d1f9 --- /dev/null +++ b/src/jalview/io/xdb/genbank/GenBankLocationRange.java @@ -0,0 +1,86 @@ +package jalview.io.xdb.genbank; + +/** + * + */ +public class GenBankLocationRange extends GenBankLocation { + private String entry = null; + private GenBankLocationPoint start = null; + private GenBankLocationPoint end = null; + + public GenBankLocationRange() { + } + + 
@Override + public int getMinor() { + return start == null ? 0 : start.getMinor(); + } + + @Override + public int getMajor() { + return end == null ? 0 : end.getMajor(); + } + + @Override + public String toString() { + StringBuilder sb = new StringBuilder(); +// if( getDirecao() != '5' ) { +// sb.append("complement("); +// } + if( entry != null ) { + sb.append(entry); + sb.append(":"); + } + if( getStart() != null ) { + sb.append( getStart().toString() ); + } + if( getEnd() != null && getStart() != getEnd() && !start.equals(end) ) { + sb.append(".."); + sb.append( getEnd().toString() ); + } + return sb.toString(); + } + + /** + * @return the entry + */ + public String getEntry() { + return entry; + } + + /** + * @param entry the entry to set + */ + public void setEntry(String entry) { + this.entry = entry; + } + + /** + * @return the start + */ + public GenBankLocationPoint getStart() { + return start; + } + + /** + * @param start the start to set + */ + public void setStart(GenBankLocationPoint start) { + this.start = start; + } + + /** + * @return the end + */ + public GenBankLocationPoint getEnd() { + return end; + } + + /** + * @param end the end to set + */ + public void setEnd(GenBankLocationPoint end) { + this.end = end; + } + +} diff --git a/src/jalview/io/xdb/genbank/GenBankLocations.java b/src/jalview/io/xdb/genbank/GenBankLocations.java new file mode 100644 index 0000000..ae3e47b --- /dev/null +++ b/src/jalview/io/xdb/genbank/GenBankLocations.java @@ -0,0 +1,98 @@ +package jalview.io.xdb.genbank; + +/** + * + * @author Dieval Guizelini + */ +public class GenBankLocations extends GenBankLocation { + public static final int NONE = 1; // default + public static final int COMPLEMENT = 2; + public static final int JOIN = 3; + public static final int ORDER = 4; // conj com ordem desconhecida + private int operator = NONE; + private java.util.List units; + + public GenBankLocations() { + units = new java.util.ArrayList(); + } + + @Override + public void 
setComplement(boolean complement){ + super.setComplement(complement); + this.operator = COMPLEMENT; + if (units != null) { + for (GenBankLocation o : units) { + o.setComplement(complement); + } + } + } + + @Override + public String toString() { + StringBuilder sb = new StringBuilder(); + if (getOperator() == COMPLEMENT) { + sb.append("complement("); + } else if (getOperator() == JOIN) { + sb.append("join("); + } else if (getOperator() == ORDER) { + sb.append("order("); + } + if (units.size() > 0) { + sb.append(units.get(0).toString()); + for (int i = 1; i < units.size(); i++) { + sb.append(","); + sb.append(units.get(i).toString()); + } + } + if (getOperator() != NONE) { + sb.append(")"); + } + return sb.toString(); + } + + /** + * @return the units + */ + public java.util.List getUnits() { + return units; + } + + /** + * @param units the units to set + */ + public void setUnits(java.util.List units) { + this.units = units; + } + + @Override + public int getMinor() { + if( units.size() > 0 ) { + return units.get(0).getMinor(); + } + return 0; + } + + @Override + public int getMajor() { + int ind = units.size(); + if( ind > 0 ) { + return units.get(ind-1).getMajor(); + } + return 0; + } + + /** + * @return the operator + */ + public int getOperator() { + return operator; + } + + /** + * @param operator the operator to set + */ + public void setOperator(int operator) { + this.operator = operator; + } + +} diff --git a/src/jalview/io/xdb/genbank/GenBankLocus.java b/src/jalview/io/xdb/genbank/GenBankLocus.java new file mode 100644 index 0000000..cf6289e --- /dev/null +++ b/src/jalview/io/xdb/genbank/GenBankLocus.java @@ -0,0 +1,138 @@ +package jalview.io.xdb.genbank; + +/** + * A short mnemonic name for the entry, chosen to suggest the + * sequence's definition. Mandatory keyword/exactly one record. + * + *

The LOCUS field contains a number of different data elements, including locus name, + * sequence length, molecule type, GenBank division, and modification date. Each element + * is described below.

+ * + */ +public class GenBankLocus { + private String name; + private int sequenceLength; + private String strand; + private String moleculeType; + private boolean linearSequence; + private String division; + private String modificationDate; + + public GenBankLocus() { + } + + public GenBankLocus(String name, int sequenceLength) { + this.name = name; + this.sequenceLength = sequenceLength; + } + + + /** + * @return the name + */ + public String getName() { + return name; + } + + /** + * @param name the name to set + */ + public void setName(String name) { + this.name = name; + } + + /** + * @return the sequenceLength + */ + public int getSequenceLength() { + return sequenceLength; + } + + /** + * @param sequenceLength the sequenceLength to set + */ + public void setSequenceLength(int sequenceLength) { + this.sequenceLength = sequenceLength; + } + + /** + * @return the strand + */ + public String getStrand() { + return strand; + } + + /** + * @param strand the strand to set + */ + public void setStrand(String strand) { + this.strand = strand; + } + + /** + * @return the moleculeType + */ + public String getMoleculeType() { + return moleculeType; + } + + /** + * @param moleculeType the moleculeType to set + */ + public void setMoleculeType(String moleculeType) { + this.moleculeType = moleculeType; + } + + /** + * @return the linearSequence + */ + public boolean isLinearSequence() { + return linearSequence; + } + + /** + * @param linearSequence the linearSequence to set + */ + public void setLinearSequence(boolean linearSequence) { + this.linearSequence = linearSequence; + } + + /** + * @return the division + */ + public String getDivision() { + return division; + } + + /** + * @param division the division to set + */ + public void setDivision(String division) { + this.division = division; + } + + /** + * @return the modificationDate + */ + public String getModificationDate() { + return modificationDate; + } + + /** + * @param modificationDate the modificationDate to 
set + */ + public void setModificationDate(String modificationDate) { + this.modificationDate = modificationDate; + } + + @Override + public String toString() { + + return String.format("LOCUS %-16s %11d bp %3s %6s %-8s %3s %s", + this.name, this.sequenceLength, this.strand, + this.moleculeType, linearSequence?"linear ":"circular", + this.division, ((modificationDate == null) || (modificationDate.equals("")) ? "" : modificationDate.toUpperCase()) + ); + } + +} diff --git a/src/jalview/io/xdb/genbank/GenBankReference.java b/src/jalview/io/xdb/genbank/GenBankReference.java new file mode 100644 index 0000000..74f0080 --- /dev/null +++ b/src/jalview/io/xdb/genbank/GenBankReference.java @@ -0,0 +1,135 @@ +package jalview.io.xdb.genbank; + +public class GenBankReference { + private int order; + private int begin; + private int end; + private String descriptor; + private String authors; + private String title; + private String journal; + private String pubmed; + private String medline; + private String consortia; + private String remark; + + public GenBankReference() { + super(); + } + + public String getDescriptor() { + return descriptor; + } + + public void setDescriptor(String descriptor) { + this.descriptor = descriptor; + } + + public String getAuthors() { + return authors; + } + + public void setAuthors(String authors) { + this.authors = authors; + } + + public String getTitle() { + return title; + } + + public void setTitle(String title) { + this.title = title; + } + + public String getJournal() { + return journal; + } + + public void setJournal(String journal) { + this.journal = journal; + } + + public String getPubmed() { + return pubmed; + } + + public void setPubmed(String pubmed) { + this.pubmed = pubmed; + } + + public int getOrder() { + return order; + } + + public void setOrder(int order) { + this.order = order; + } + public int getBegin() { + return begin; + } + + public void setBegin(int begin) { + this.begin = begin; + } + + public int getEnd() { + 
return end; + } + + public void setEnd(int end) { + this.end = end; + } + + + public String getMedline() { + return medline; + } + + public void setMedline(String medline) { + this.medline = medline; + } + + public String getConsortia() { + return consortia; + } + + public void setConsortia(String consortia) { + this.consortia = consortia; + } + + public String getRemark() { + return remark; + } + + public void setRemark(String remark) { + this.remark = remark; + } + + public String toString(){ +// References has the following format +// REFERENCE 1 (bases 1 to 1976) +// AUTHORS Spritz,R.A., DeRiel,J.K., Forget,B.G. and Weissman,S.M. +// TITLE Complete nucleotide sequence of the human delta-globin gene +// JOURNAL Cell 21 (3), 639-646 (1980) +// PUBMED 7438204 + + StringBuffer buf = new StringBuffer(); + buf.append("REFERENCE ").append(this.getOrder()).append(" (bases ").append(this.getBegin()).append(" to ").append(this.getEnd()).append(")\n"); + if (this.getAuthors()!=null) + buf.append(" AUTHORS ").append(this.getAuthors()).append("\n"); + if (this.getTitle()!=null) + buf.append(" TITLE ").append(this.getTitle()).append("\n"); + if (this.getJournal()!=null) + buf.append(" JOURNAL ").append(this.getJournal()).append("\n"); + if (this.getPubmed()!=null) + buf.append(" PUBMED ").append(this.getPubmed()).append("\n");; + if (this.getMedline()!=null) + buf.append(" MEDLINE ").append(this.getMedline()).append("\n");; + if (this.getRemark()!=null) + buf.append(" REMARK ").append(this.getRemark()).append("\n");; + if (this.getConsortia()!=null) + buf.append(" CONSRTM ").append(this.getConsortia()).append("\n");; + return buf.toString(); + } + +} diff --git a/src/jalview/io/xdb/genbank/GenBankSequence.java b/src/jalview/io/xdb/genbank/GenBankSequence.java new file mode 100644 index 0000000..279601c --- /dev/null +++ b/src/jalview/io/xdb/genbank/GenBankSequence.java @@ -0,0 +1,57 @@ +package jalview.io.xdb.genbank; + +import java.util.Vector; +/** + * A line like the 
following: + * 1 aatgaaggtt catttttcat tctcacaaac taatgaaacc ctgcttatct taaaccaacc + * will be mapped as: + * id: 1 + * sequences: {"aatgaaggtt", "catttttcat", "tctcacaaac", "taatgaaacc", "ctgcttatct", "taaaccaacc"} + * Each sequence has 8 nucleotides long + * @author darolmar + * + */ +public class GenBankSequence { + //Initial position + private int id; + //Sequences in that line + private Vector sequences; + + public GenBankSequence() { + super(); + sequences = new Vector(); + } + + public int getId() { + return id; + } + + public void setId(int id) { + this.id = id; + } + + public Vector getSequences() { + return sequences; + } + + public void setSequences(Vector sequences) { + this.sequences = sequences; + } + + public String getSequencesAsString(){ + StringBuffer sb = new StringBuffer(); + for (String seq:sequences) + sb.append(seq).append(" "); + return sb.toString(); + } + + public String toString(){ + StringBuffer sb = new StringBuffer() + .append(" ").append(this.id); + for (String seq:sequences) + sb.append(" ").append(seq); + sb.append("\n"); + return sb.toString(); + } + +} diff --git a/src/jalview/io/xdb/genbank/GenBankSource.java b/src/jalview/io/xdb/genbank/GenBankSource.java new file mode 100644 index 0000000..c5ef3c2 --- /dev/null +++ b/src/jalview/io/xdb/genbank/GenBankSource.java @@ -0,0 +1,85 @@ +package jalview.io.xdb.genbank; + +/** + *

Free-format information including an abbreviated form of the organism + * name, sometimes followed by a molecule type. (See section 3.4.10 of the + * GenBank release notes for more info.)

+ *

Entrez Search Field: Organism [ORGN]

+ *

Search Tip: For some organisms that have well-established common names, + * such as baker's yeast, mouse, and human, a search for the common name will + * yield the same results as a search for the scientific name, e.g., a search + * for "baker's yeast" in the organism field retrieves the same number of + * documents as "Saccharomyces cerevisiae". This is true because the Organism + * field is connected to the NCBI Taxonomy Database, which contains + * cross-references between common names, scientific names, and synonyms for + * organisms represented in the Sequence databases.

+ *

Organism

+ *

The formal scientific name for the source organism (genus and species, + * where appropriate) and its lineage, based on the phylogenetic classification + * scheme used in the NCBI Taxonomy Database. If the complete lineage of an + * organism is very long, an abbreviated lineage will be shown in the GenBank + * record and the complete lineage will be available in the Taxonomy Database. + * (See also the /db_xref=taxon:nnnn Feature qualifier, below.)

+ *

Entrez Search Field: Organism [ORGN]

+ *

Search Tip: You can search the Organism field by any node in the taxonomic + * hierarchy, e.g., you can search for the term "Saccharomyces cerevisiae", + * "Saccharomycetales", "Ascomycota", etc. to retrieve all the sequences from + * organisms in a particular taxon.

+ * + */ +public class GenBankSource { + private String source=""; + private String organism=""; + private String taxonomic=""; + + public GenBankSource() { + } + + @Override + public String toString() { + return String.format("%s\n\t%s\n\t%s", getSource(), getOrganism(), getTaxonomic()); + } + + /** + * @return the source + */ + public String getSource() { + return source; + } + + /** + * @param source the source to set + */ + public void setSource(String source) { + this.source = source; + } + + /** + * @return the organism + */ + public String getOrganism() { + return organism; + } + + /** + * @param organism the organism to set + */ + public void setOrganism(String organism) { + this.organism = organism; + } + + /** + * @return the taxonomic + */ + public String getTaxonomic() { + return taxonomic; + } + + /** + * @param taxonomic the taxonomic to set + */ + public void setTaxonomic(String taxonomic) { + this.taxonomic = taxonomic; + } + +} diff --git a/src/jalview/io/xdb/genbank/GenBankVersion.java b/src/jalview/io/xdb/genbank/GenBankVersion.java new file mode 100644 index 0000000..85a2fd1 --- /dev/null +++ b/src/jalview/io/xdb/genbank/GenBankVersion.java @@ -0,0 +1,89 @@ +package jalview.io.xdb.genbank; + +/** + *

A nucleotide sequence identification number that represents a single, + * specific sequence in the GenBank database. This identification number uses + * the accession.version format implemented by GenBank/EMBL/DDBJ in + * February 1999.

+ *

If there is any change to the sequence data (even a single base), the + * version number will be increased, e.g., U12345.1 → U12345.2, but the + * accession portion will remain stable.

+ *

The accession.version system of sequence identifiers runs parallel to + * the GI number system, i.e., when any change is made to a sequence, it + * receives a new GI number AND an increase to its version number.

+ *

For more information, see section 1.3.2 of the GenBank 111.0 release + * notes, and section 3.4.7 of the current GenBank release notes.

+ *

A Sequence Revision History tool is available to track the various GI + * numbers, version numbers, and update dates for sequences that appeared in + * a specific GenBank record (more information and example).

+ *

More details about sequence identification numbers and the difference + * between GI number and version are provided in Sequence Identifiers: + * A Historical Note.

+ *

Entrez Search Field: use the default setting of "All Fields"

+ *

GI

+ *

"GenInfo Identifier" sequence identification number, in this case, for + * the nucleotide sequence. If a sequence changes in any way, a new GI number + * will be assigned.

+ *

A separate GI number is also assigned to each protein translation within + * a nucleotide sequence record, and a new GI is assigned if the protein + * translation changes in any way (see below).

+ *

GI sequence identifiers run parallel to the new accession.version system + * of sequence identifiers. For more information, see the description of Version, + * above, and section 3.4.7 of the current GenBank release notes.

+ *

A Sequence Revision History tool is available to track the various GI + * numbers, version numbers, and update dates for sequences that appeared in a + * specific GenBank record (more information and example).

+ *

More details about sequence identification numbers and the difference + * between GI number and version are provided in Sequence Identifiers: A + * Historical Note.

+ *

Entrez Search Field: use the default setting of "All Fields"

+ * @author Dieval Guizelini + * @see Entry + */ +public class GenBankVersion { + private String version = ""; + private String gi = ""; + + public GenBankVersion() { + } + + + /** + * @return the version + */ + public String getVersion() { + return version; + } + + /** + * @param version the version to set + */ + public void setVersion(String version) { + this.version = version; + } + + /** + * @return the gi + */ + public String getGI() { + return gi; + } + + /** + * @param gi the gi to set + */ + public void setGI(String gi) { + this.gi = gi; + } + + + /** + * Version section in GenBank File Format is text with two fields (version and GI). + * + * @return version+" "+gi + */ + @Override + public String toString() { + return String.format("%s %s",version,gi); + } +} diff --git a/test/jalview/io/GU324925.1.gb b/test/jalview/io/GU324925.1.gb new file mode 100644 index 0000000..feacdfd --- /dev/null +++ b/test/jalview/io/GU324925.1.gb @@ -0,0 +1,1011 @@ +LOCUS GU324925 15440 bp DNA linear PRI 10-AUG-2010 +DEFINITION Homo sapiens hemoglobin, gamma A (HBG1) gene, complete cds. +ACCESSION GU324925 +VERSION GU324925.1 GI:302313142 +KEYWORDS . +SOURCE Homo sapiens (human) + ORGANISM Homo sapiens + Eukaryota; Metazoa; Chordata; Craniata; Vertebrata; Euteleostomi; + Mammalia; Eutheria; Euarchontoglires; Primates; Haplorrhini; + Catarrhini; Hominidae; Homo. +REFERENCE 1 (bases 1 to 15440) + AUTHORS Rieder,M.J., Bertucci,C., Stanaway,I.B., Johnson,E.J., + Swanson,J.E., Siegel,D.L., da Ponte,S.H., Igartua,C., Patterson,K. + and Nickerson,D.A. + TITLE Direct Submission + JOURNAL Submitted (25-NOV-2009) Genome Sciences, University of Washington, + 1705 NE Pacific, Seattle, WA 98195, USA +COMMENT To cite this work please use: NHLBI Resequencing and Genotyping + Service (RSG),UW HV48194, Department of Genome Sciences, Seattle, + WA 98195-7730. 
+FEATURES Location/Qualifiers + source 1..15440 + /organism="Homo sapiens" + /mol_type="genomic DNA" + /db_xref="taxon:9606" + mobile_element 179..289 + /mobile_element_type="LINE:L2" + variation 293 + /frequency="0.0328" + /replace="t" + variation 337 + /frequency="0.0027" + /replace="c" + mobile_element 345..530 + /mobile_element_type="other:LTR/ERV1" + variation 406 + /frequency="0.3873" + /replace="a" + variation 534 + /frequency="0.6279" + /replace="" + mobile_element 544..619 + /mobile_element_type="LINE:L1" + variation 568 + /frequency="0.3088" + /replace="t" + variation 692 + /frequency="0.3038" + /replace="g" + variation 757 + /frequency="0.0053" + /replace="t" + variation 935 + /frequency="0.1888" + /replace="g" + variation 1017 + /frequency="0.0026" + /replace="t" + variation 1202 + /frequency="0.0133" + /replace="a" + variation 1350 + /frequency="0.3617" + /replace="t" + variation 1418 + /frequency="0.1818" + /replace="a" + variation 1507 + /frequency="0.2527" + /replace="a" + variation 1522 + /frequency="0.0027" + /replace="g" + variation 1608 + /frequency="0.0211" + /replace="a" + variation 1637 + /frequency="0.0395" + /replace="c" + variation 1650 + /frequency="0.0211" + /replace="g" + variation 1682 + /frequency="0.0211" + /replace="t" + variation 1689 + /frequency="0.0211" + /replace="g" + variation 1697 + /frequency="0.0211" + /replace="g" + variation 1699 + /frequency="0.0211" + /replace="a" + variation 1735 + /frequency="0.0816" + /replace="t" + variation 1990 + /frequency="0.0027" + /replace="g" + gene 2006..3591 + /gene="HBG1" + mRNA join(2006..2150,2273..2495,3376..3591) + /gene="HBG1" + /product="hemoglobin, gamma A" + variation 2030 + /gene="HBG1" + /frequency="0.1657" + /replace="a" + CDS join(2059..2150,2273..2495,3376..3504) + /gene="HBG1" + /codon_start=1 + /product="hemoglobin, gamma A" + /protein_id="ADL14496.1" + /db_xref="GI:302313143" + /translation="MGHFTEEDKATITSLWGKVNVEDAGGETLGRLLVVYPWTQRFFD + 
SFGNLSSASAIMGNPKVKAHGKKVLTSLGDATKHLDDLKGTFAQLSELHCDKLHVDPE + NFKLLGNVLVTVLAIHFGKEFTPEVQASWQKMVTAVASALSSRYH" + variation 2190 + /gene="HBG1" + /frequency="0.3059" + /replace="a" + variation 2191 + /gene="HBG1" + /frequency="0.3032" + /replace="a" + variation 2215 + /gene="HBG1" + /frequency="0.1862" + /replace="t" + variation 2407 + /gene="HBG1" + /frequency="0.1342" + /replace="t" + variation 2518 + /gene="HBG1" + /frequency="0.0026" + /replace="t" + variation 2519 + /gene="HBG1" + /frequency="0.3342" + /replace="a" + variation 2554 + /gene="HBG1" + /frequency="0.4763" + /replace="t" + variation 2610 + /gene="HBG1" + /frequency="0.3128" + /replace="a" + variation 2643 + /gene="HBG1" + /frequency="0.0289" + /replace="c" + variation 2653 + /gene="HBG1" + /frequency="0.3105" + /replace="c" + variation 2675 + /gene="HBG1" + /frequency="0.1895" + /replace="c" + variation 2682 + /gene="HBG1" + /frequency="0.3105" + /replace="g" + variation 2700 + /gene="HBG1" + /frequency="0.3842" + /replace="c" + variation 2746..2749 + /gene="HBG1" + /frequency="0.2226" + /replace="" + variation 2758 + /gene="HBG1" + /frequency="0.3281" + /replace="t" + variation 2760 + /gene="HBG1" + /frequency="0.3219" + /replace="g" + variation 2777 + /gene="HBG1" + /frequency="0.2959" + /replace="c" + variation 2939 + /gene="HBG1" + /frequency="0.0026" + /replace="c" + variation 3023 + /gene="HBG1" + /frequency="0.0026" + /replace="c" + variation 3037 + /gene="HBG1" + /frequency="0.0026" + /replace="t" + variation 3064 + /gene="HBG1" + /frequency="0.0079" + /replace="c" + variation 3073 + /gene="HBG1" + /frequency="0.0026" + /replace="c" + variation 3074 + /gene="HBG1" + /frequency="0.0553" + /replace="t" + variation 3142 + /gene="HBG1" + /frequency="0.0056" + /replace="g" + variation 3158 + /gene="HBG1" + /frequency="0.0028" + /replace="a" + variation 3162 + /gene="HBG1" + /frequency="0.2781" + /replace="g" + variation 3205 + /gene="HBG1" + /frequency="0.3580" + /replace="g" + variation 3206 + 
/gene="HBG1" + /frequency="0.3571" + /replace="c" + variation 3210 + /gene="HBG1" + /frequency="0.3621" + /replace="a" + variation 3211 + /gene="HBG1" + /frequency="0.3103" + /replace="a" + variation 3238 + /gene="HBG1" + /frequency="0.2672" + /replace="a" + variation 3287 + /gene="HBG1" + /frequency="0.1784" + /replace="a" + variation 3287 + /gene="HBG1" + /frequency="0.3351" + /replace="t" + variation 3291 + /gene="HBG1" + /frequency="0.0081" + /replace="t" + variation 3294 + /gene="HBG1" + /frequency="0.1459" + /replace="g" + variation 3303 + /gene="HBG1" + /frequency="0.0081" + /replace="a" + variation 3507 + /gene="HBG1" + /frequency="0.1349" + /replace="c" + variation 3508 + /gene="HBG1" + /frequency="0.1402" + /replace="t" + variation 3509 + /gene="HBG1" + /frequency="0.1349" + /replace="c" + variation 3510 + /gene="HBG1" + /frequency="0.1508" + /replace="t" + variation 3519 + /gene="HBG1" + /frequency="0.0026" + /replace="c" + variation 3538 + /gene="HBG1" + /frequency="0.0026" + /replace="c" + variation 3556 + /gene="HBG1" + /frequency="0.0464" + /replace="" + variation 3620 + /frequency="0.0053" + /replace="t" + variation 3628 + /frequency="0.2751" + /replace="a" + variation 3644 + /frequency="0.0026" + /replace="g" + variation 3750 + /frequency="0.1852" + /replace="t" + variation 3763 + /frequency="0.3651" + /replace="t" + variation 3953 + /frequency="0.1349" + /replace="t" + variation 4296 + /frequency="0.1947" + /replace="t" + variation 4324 + /frequency="0.0026" + /replace="a" + variation 4333 + /frequency="0.0053" + /replace="a" + variation 4341 + /frequency="0.1342" + /replace="g" + mobile_element 4365..4701 + /mobile_element_type="other:LTR/MaLR" + variation 4471 + /frequency="0.1958" + /replace="c" + variation 4472 + /frequency="0.0291" + /replace="c" + variation 4595 + /frequency="0.0054" + /replace="a" + variation 4609 + /frequency="0.0108" + /replace="t" + variation 4687 + /frequency="0.1958" + /replace="a" + variation 4938 + 
/frequency="0.0185" + /replace="a" + mobile_element 4976..5095 + /mobile_element_type="LINE:L2" + variation 5070 + /frequency="0.0026" + /replace="g" + variation 5106 + /frequency="0.0132" + /replace="a" + mobile_element 5134..5321 + /mobile_element_type="other:LTR/ERV1" + variation 5179 + /frequency="0.0079" + /replace="t" + variation 5307 + /frequency="0.0053" + /replace="c" + mobile_element 5322..5414 + /mobile_element_type="LINE:L1" + mobile_element 5415..5887 + /mobile_element_type="LINE:L1" + variation 5423 + /frequency="0.0053" + /replace="t" + variation 5532 + /frequency="0.0026" + /replace="t" + variation 5671 + /frequency="0.0027" + /replace="c" + variation 5754 + /frequency="0.0143" + /replace="a" + variation 5871 + /frequency="0.1057" + /replace="t" + mobile_element 5898..6061 + /mobile_element_type="LINE:L2" + variation 6086 + /frequency="0.0158" + /replace="c" + variation 6132 + /frequency="0.0816" + /replace="g" + variation 6135 + /frequency="0.0158" + /replace="a" + variation 6165 + /frequency="0.0079" + /replace="c" + variation 6170 + /frequency="0.0026" + /replace="a" + variation 6200 + /frequency="0.0026" + /replace="g" + variation 6200 + /frequency="0.0053" + /replace="t" + variation 6286 + /frequency="0.0026" + /replace="a" + variation 6296 + /frequency="0.0447" + /replace="t" + variation 6365 + /frequency="0.1921" + /replace="a" + variation 6379 + /frequency="0.0026" + /replace="t" + variation 6467 + /frequency="0.0132" + /replace="a" + variation 6638 + /frequency="0.0159" + /replace="t" + variation 6860 + /frequency="0.0238" + /replace="c" + variation 6955 + /frequency="0.0053" + /replace="a" + variation 7107 + /frequency="0.0026" + /replace="t" + variation 7315 + /frequency="0.0026" + /replace="t" + mobile_element 7396..7708 + /mobile_element_type="SINE:Alu" + variation 7413 + /frequency="0.3128" + /replace="t" + variation 7535 + /frequency="0.0026" + /replace="a" + variation 7618 + /frequency="0.5000" + /replace="a" + variation 7727 + 
/frequency="0.3677" + /replace="g" + variation 7761 + /frequency="0.0106" + /replace="a" + variation 7872 + /frequency="0.0080" + /replace="t" + variation 7973 + /frequency="0.0132" + /replace="g" + variation 7987 + /frequency="0.2895" + /replace="t" + variation 8164 + /frequency="0.0737" + /replace="c" + variation 8171 + /frequency="0.0526" + /replace="c" + variation 8384 + /frequency="0.0026" + /replace="a" + variation 8410 + /frequency="0.0026" + /replace="a" + variation 8814 + /frequency="0.0079" + /replace="a" + variation 8830 + /frequency="0.0053" + /replace="g" + variation 8947 + /frequency="0.1816" + /replace="c" + variation 8962 + /frequency="0.0026" + /replace="t" + variation 9102 + /frequency="0.0079" + /replace="g" + variation 9240 + /frequency="0.1000" + /replace="c" + variation 9256 + /frequency="0.0026" + /replace="c" + variation 9281..9284 + /frequency="0.9484" + /replace="" + variation 9322 + /frequency="0.0053" + /replace="g" + variation 9338 + /frequency="0.0133" + /replace="c" + variation 9374 + /frequency="0.9658" + /replace="" + variation 9411 + /frequency="0.1842" + /replace="a" + variation 9517 + /frequency="0.0737" + /replace="c" + variation 9558 + /frequency="0.0079" + /replace="c" + variation 9645 + /frequency="0.0133" + /replace="c" + variation 9752..9773 + /frequency="0.0080" + /replace="" + variation 9759 + /frequency="0.0027" + /replace="g" + variation 9791 + /frequency="0.0426" + /replace="g" + variation 10103 + /frequency="0.0968" + /replace="t" + variation 10104 + /frequency="0.0054" + /replace="a" + variation 10244 + /frequency="0.1271" + /replace="" + variation 10251 + /frequency="0.0169" + /replace="t" + variation 10312 + /frequency="0.3511" + /replace="g" + mobile_element 10527..10608 + /mobile_element_type="SINE:MIR" + variation 10565 + /frequency="0.0027" + /replace="c" + variation 10705..10706 + /frequency="0.8806" + /replace="" + variation 10821 + /frequency="0.0160" + /replace="a" + variation 10864 + /frequency="0.0642" + 
/replace="a" + variation 10944 + /frequency="0.0027" + /replace="a" + variation 11154 + /frequency="0.0163" + /replace="t" + variation 11259 + /frequency="0.0136" + /replace="g" + variation 11475 + /frequency="0.0027" + /replace="t" + variation 11626 + /frequency="0.1190" + /replace="a" + variation 11706 + /frequency="0.0106" + /replace="a" + variation 11708 + /frequency="0.0026" + /replace="g" + variation 11722 + /frequency="0.1190" + /replace="t" + variation 11818 + /frequency="0.0079" + /replace="g" + variation 11857 + /frequency="0.4418" + /replace="c" + variation 11910 + /frequency="0.0079" + /replace="c" + variation 12024 + /frequency="0.0158" + /replace="a" + mobile_element 12140..12250 + /mobile_element_type="LINE:L2" + variation 12160 + /frequency="0.0789" + /replace="c" + variation 12253 + /frequency="0.9947" + /replace="" + variation 12267 + /frequency="0.0079" + /replace="g" + variation 12317 + /frequency="0.0026" + /replace="c" + variation 12350 + /frequency="0.0079" + /replace="a" + variation 12521 + /frequency="0.3042" + /replace="g" + variation 12551 + /frequency="0.0105" + /replace="a" + variation 12639 + /frequency="0.2857" + /replace="g" + variation 12697 + /frequency="0.0106" + /replace="g" + mobile_element 12718..12948 + /mobile_element_type="other:LTR/ERVL" + variation 12731 + /frequency="0.0026" + /replace="t" + variation 12740 + /frequency="0.0053" + /replace="a" + variation 12787 + /frequency="0.0026" + /replace="t" + variation 12814 + /frequency="0.2196" + /replace="a" + variation 12975 + /frequency="0.1164" + /replace="g" + variation 12987 + /frequency="0.1190" + /replace="g" + variation 13030 + /frequency="0.1170" + /replace="a" + variation 13042 + /frequency="0.0718" + /replace="g" + mobile_element 13120..13246 + /mobile_element_type="other:LTR/ERVL" + variation 13138 + /frequency="0.1156" + /replace="c" + variation 13286 + /frequency="0.0161" + /replace="t" + variation 13329 + /frequency="0.1216" + /replace="c" + variation 13370 + 
/frequency="0.4081" + /replace="g" + mobile_element 13541..13842 + /mobile_element_type="SINE:Alu" + variation 13563 + /frequency="0.1243" + /replace="g" + variation 13678 + /frequency="0.4021" + /replace="t" + variation 13749 + /frequency="0.0027" + /replace="t" + variation 13794 + /frequency="0.0316" + /replace="t" + variation 13805 + /frequency="0.3829" + /replace="t" + variation 13808 + /frequency="0.3818" + /replace="a" + variation 13992..13993 + /frequency="0.5895" + /replace="" + variation 14110 + /frequency="0.4105" + /replace="t" + variation 14158 + /frequency="0.0079" + /replace="g" + mobile_element 14206..14493 + /mobile_element_type="LINE:L2" + variation 14239 + /frequency="0.0079" + /replace="g" + variation 14243 + /frequency="0.4105" + /replace="t" + variation 14247 + /frequency="0.4105" + /replace="t" + variation 14264 + /frequency="0.0026" + /replace="t" + variation 14271 + /frequency="0.0184" + /replace="g" + variation 14272 + /frequency="0.4132" + /replace="g" + variation 14358 + /frequency="0.0158" + /replace="c" + variation 14371 + /frequency="0.0079" + /replace="t" + variation 14406 + /frequency="0.0553" + /replace="a" + variation 14503 + /frequency="0.0421" + /replace="c" + variation 14507 + /frequency="0.4105" + /replace="a" + variation 14609 + /frequency="0.4681" + /replace="a" + mobile_element 14622..14921 + /mobile_element_type="SINE:Alu" + variation 14646 + /frequency="0.0080" + /replace="c" + variation 14670 + /frequency="0.4309" + /replace="g" + variation 14767 + /frequency="0.0027" + /replace="c" + variation 14834 + /frequency="0.4574" + /replace="g" + variation 14861 + /frequency="0.0878" + /replace="t" + variation 14937 + /frequency="0.2872" + /replace="c" + variation 14991 + /frequency="0.0081" + /replace="g" + variation 15061 + /frequency="0.3758" + /replace="t" + misc_feature 15105..15439 + /note="Region not scanned for variation" + mobile_element 15305..15396 + /mobile_element_type="LINE:L2" +ORIGIN + 1 gtgtttcaga ataaaatacc 
aactctacta ctctcatctg taagatgcaa atagtaagcc + 61 tgagcccttc tgtctaactt tgaattctat tttttcttca acgtacttta ggcttgtaat + 121 gtgtttatat acagtgaaat gtcaagttct ttctttatat ttctttcttt cttttttttc + 181 ctcagcctca gagttttcca catgcccttc ctactttcag gaacttcttt ctccaaacgt + 241 cttctgcctg gctccatcaa atcataaagg acccacttca aatgccatca ctcactacca + 301 tttcacaatt cgcactttct ttctttgtcc tttttttttt tagtaaaaca agtttataaa + 361 aaattgaagg aataaatgaa tggctacttc ataggcagag tagacgcaag ggctactggt + 421 tgccgatttt tattgttatt tttcaatagt atgctaaaca aggggtagat tatttatgct + 481 gcccattttt agaccataaa agataacttc ctgatgttgc catggcattt tttttccttt + 541 taattttatt tcatttcatt ttaatttcga aggtacatgt gcaggatgtg caggcttgtt + 601 acatgggtaa atgtgtgtct ttctggcctt ttagccatct gtatcaatga gcagatataa + 661 gctttacaca ggatcatgaa ggatgaaaga atttcaccaa tattataata atttcaatca + 721 acctgatagc ttaggggata aactaatttg aagatacagc ttgcctccga taagccagaa + 781 ttccagagct tctggcatta taatctagca aggttagaga tcatggatca ctttcagaga + 841 aaaacaaaaa caaactaacc aaaagcaaaa cagaaccaaa aaaccaccat aaatacttcc + 901 taccctgtta atggtccaat atgtcagaaa cagcactgtg ttagaaataa agctgtctaa + 961 agtacactaa tattcgagtt ataatagtgt gtggactatt agtcaataaa aacaaccctt + 1021 gcctctttag agttgttttc catgtacacg cacatcttat gtcttagagt aagattccct + 1081 gagaagtgaa cctagcattt atacaagata attaattcta atccacagta cctgccaaag + 1141 aacattctac catcatcttt actgagcata gaagagctac gccaaaaccc tgggtcatca + 1201 gccagcacac acacttatcc agtggtaaat acacatcatc tggtgtatac atacatacct + 1261 gaatatggaa tcaaatattt ttctaagatg aaacagtcat gatttatttc aaataggtac + 1321 ggataagtag atattgaggt aagcattagg tcttatatta tgtaacacta atctattact + 1381 gcgctgaaac tgtggcttta tagaaattgt tttcactgca ctattgagaa attaagagat + 1441 aatggcaaaa gtcacaaaga gtatattcaa aaagaagtat agcacttttt ccttagaaac + 1501 cactgctaac tgaaagagac taagatttgt cccgtcaaaa atcctggacc tatgcctaaa + 1561 acacatttca caatccctga acttttcaaa aattggtaca tgctttagct ttaaactaca + 1621 ggcctcactg gagctagaga caagaaggta aaaaacggct gacaaaagaa gtcctggtat + 
1681 cctctatgat gggagaagga aactagctaa agggaagaat aaattagaga aaaactggaa + 1741 tgactgaatc ggaacaaggc aaaggctata aaaaaaatta agcagcagta tcctcttggg + 1801 ggccccttcc ccacactatc tcaatgcaaa tatctgtctg aaacggtccc tggctaaact + 1861 ccacccatgg gttggccagc cttgccttga ccaatagcct tgacaaggca aacttgacca + 1921 atagtcttag agtatccagt gaggccaggg gccggcggct ggctagggat gaagaataaa + 1981 aggaagcacc cttcagcagt tccacacact cgcttctgga acgtctgagg ttatcaataa + 2041 gctcctagtc cagacgccat gggtcatttc acagaggagg acaaggctac tatcacaagc + 2101 ctgtggggca aggtgaatgt ggaagatgct ggaggagaaa ccctgggaag gtaggctctg + 2161 gtgaccagga caagggaggg aaggaaggac cctgtgcctg gcaaaagtcc aggtcgcttc + 2221 tcaggatttg tggcaccttc tgactgtcaa actgttcttg tcaatctcac aggctcctgg + 2281 ttgtctaccc atggacccag aggttctttg acagctttgg caacctgtcc tctgcctctg + 2341 ccatcatggg caaccccaaa gtcaaggcac atggcaagaa ggtgctgact tccttgggag + 2401 atgccacaaa gcacctggat gatctcaagg gcacctttgc ccagctgagt gaactgcact + 2461 gtgacaagct gcatgtggat cctgagaact tcaaggtgag tccaggagat gtttcagccc + 2521 tgttgccttt agtctcgagg caacttagac aacggagtat tgatctgagc acagcagggt + 2581 gtgagctgtt tgaagatact ggggttgggg gtgaagaaac tgcagaggac taactgggct + 2641 gagacccagt ggtaatgttt tagggcctaa ggagtgcctc taaaaatcta gatggacaat + 2701 tttgactttg agaaaagaga ggtggaaatg aggaaaatga cttttcttta ttagattcca + 2761 gtagaaagaa ctttcatctt tccctcattt ttgttgtttt aaaacatcta tctggaggca + 2821 ggacaagtat ggtcgttaaa aagatgcagg cagaaggcat atattggctc agtcaaagtg + 2881 gggaactttg gtggccaaac atacattgct aaggctattc ctatatcagc tggacacata + 2941 taaaatgctg ctaatgcttc attacaaact tatatccttt aattccagat gggggcaaag + 3001 tatgtccagg ggtgaggaac aattgaaaca tttgggctgg agtagatttt gaaagtcagc + 3061 tctgtgtgtg tgtgtgtgtg tgcgcgcgcg cgtgtgtgtg tgtgtgtcag cgtgtgtttc + 3121 ttttaacgtc ttcagcctac aacatacagg gttcatggtg gcaagaagat agcaagattt + 3181 aaattatggc cagtgactag tgcttgaagg ggaacaacta cctgcattta atgggaaggc + 3241 aaaatctcag gctttgaggg aagttaacat aggcttgatt ctgggtggaa gcttggtgtg + 3301 tagttatctg gaggccaggc 
tggagctctc agctcactat gggttcatct ttattgtctc + 3361 ctttcatctc aacagctcct gggaaatgtg ctggtgaccg ttttggcaat ccatttcggc + 3421 aaagaattca cccctgaggt gcaggcttcc tggcagaaga tggtgactgc agtggccagt + 3481 gccctgtcct ccagatacca ctgagctcac tgcccatgat tcagagcttt caaggatagg + 3541 ctttattctg caagcaatac aaataataaa tctattctgc tgagagatca cacatgattt + 3601 tcttcagctc ttttttttac atctttttaa atatatgagc cacaaagggt ttatattgag + 3661 ggaagtgtgt atgtgtattt ctgcatgcct gtttgtgttt gtggtgtgtg catgctcctc + 3721 atttattttt atatgagatg tgcattttga tgagcaaata aaagcagtaa agacacttgt + 3781 acacgggagt tctgcaagtg ggagtaaatg gtgtaggaga aatccggtgg gaagaaagac + 3841 ctctatagga caggacttct cagaaacaga tgttttggaa gagatgggaa aaggttcagt + 3901 gaagacctgg gggctggatt gattgcagct gagtagcaag gatggttctt aaggaaggga + 3961 aagtgttcca agctttagga attcaaggtt tagtcaggtg tagcaattct attttattag + 4021 gaggaatact atttctaatg gcacttagct tttcacagcc cttgtggatg cctaagaaag + 4081 tgaaattaat cccatgccct caagtgtgca gattggtcac agcatttcaa gggagagacc + 4141 tcattgtaag actctggggg aggtggggac ttaggtgtaa gaaatgaatc agcagaggct + 4201 cacaagtcag catgagcatg ttatgtctga gaaacagacc agcactgtga gatcaaaatg + 4261 tagtgggaag aatttgtaca acattaattg gaaggcttac ttaatggaat ttttgtatag + 4321 ttggatgtta gtgcatctct ataagtaaga gtttaatatg atggtgttac ggacctaatg + 4381 tttgtgtctc ctcaaaattc acatgctgaa tccccaactc ccaactgacc ttatctgtgg + 4441 gggaggcttt tgaaaagtaa ttaggtttag atgagctcat aagagcagat ccccatcata + 4501 aaattatttt ccttatcaga agcagagaga caagccattt ctctttcctc ccggtgagga + 4561 cacagtgaga agtccgccat ctgcaatcca ggaagagaac cctgaccacg agtcagcctt + 4621 cagaaatgtg agaaaaaact ctgttgttga agccacccag tcttttgtat tttgttatag + 4681 caccttgcac tgagtaaggc agatgaagaa ggagaaaaaa ataagcttgg gttttgagtg + 4741 gactacagac catgtttatc tcaggtttgc aaagctcccc tcgtccccta tgtttcagta + 4801 taaaatacct actctactac tctcatctat aagacccaaa taataagcct gcgcccttct + 4861 ctctaacttt gatttctcct atttttactt caacatgctt tactctagcc ttgtaatgtc + 4921 tttacataca gtgaaatgta aagttcttta ttcttttttt 
ctttctttct tttttctcct + 4981 cagcctcaga atttggcaca tgcccttcct tctttcagga acttctccaa catctctgcc + 5041 tggctccatc atatcataaa ggtcccactt caaatgcagt cactaccgtt tcagaatatg + 5101 cactttcttt cttttttgtt ttttgttttt tttaagtcaa agcaaatttc ttgagagagt + 5161 aaagaaataa acgaatgact actgcatagg cagagcagcc ccgagggccg ctggttgttc + 5221 cttttatggt tatttcttga tgatatgtta aacaagtttt ggattattta tgccttctct + 5281 ttttaggcca tatagggtaa ctttctgaca ttgccatggc atttttcttt taatttaatt + 5341 tactgttacc ttaaattcag gggtacacgt acaggatatg caggtttgtt ttataggtaa + 5401 aagtgtgcca tggttttaat gggttttttt tttcttgtaa agttgtttaa gtttcttgtt + 5461 tactctggat attaggcctt tgtcagaaga atagattgga aaatcttttt cccattctgt + 5521 agattgtctt tcgctctgat ggtagtttct tttgctgagc aggagctctt tagtttaatt + 5581 agattccatt ggtcaatttt tgcttttgct gcaattgctt ttcacgcttt catcatgaaa + 5641 tctgtgcccg tgtttatatc atgaatagta ttgccttgat ttttttctag gctttttata + 5701 gtttggggtt tttcatttaa gtctctaatc catctggagt taattttgga taaggtataa + 5761 ggaaggagtc cagtttcatt tttcagcata tggctagcca gttctccccc atcatttatt + 5821 aaattgaaaa tcctttcccc attgcttgct tttgtcaggt ttctaaaaga ccagatggtt + 5881 gtaggtacaa tatgcagttt cttcaagtca tataatacca tctgaaatct cttattaatt + 5941 catttctttt agtatgtatg ctggtctcct ctgctcacta tagtgagggc accattagcc + 6001 agagaatctg tctgtctagt tcatgtaaga ttctcagaat taagaaaaat ggatggcata + 6061 tgaatgaaac ttcatggatg acatatggaa tctaatatgt atttgttgaa ttaatgcata + 6121 agatgcaaca gagagaagtt gacaactgca atgataacct ggtattgatg atataagagt + 6181 ctatagatca cagtagaagc aataatcatg gaaaacaatt ggaaatgggg aacagccaca + 6241 aacaagaaag aatcaatact tccaggaaag tgactgcagg tcacttttcc tggagcgggt + 6301 gagagaaaag tggaagttag cagtaactgc tgaattcctg gttggctgat ggaaagatgg + 6361 ggcagctgtt cactggtacg cagggtttta gatgtatgta cctaaggata tgaggtatgg + 6421 caatgaacag aaattctttt gggaatgagt tttagggcca ttaaaggaca tgacctgaag + 6481 tttcctctga ggccagtccc cacaactcaa tataaatgtg tttcctgcat atagtcaaag + 6541 ttgccacttc tttttcttca tatcatcgat ctctgctctt aaagataatc ttggttttgc + 6601 
ctcaaactgt ttgtcactac aaactttccc catgttccta agtaaaacag gtaactgcct + 6661 ctcaactata tcaagtagac taaaatattg tgtctctaat atcagaaatt cagctttaat + 6721 atattgggtt taactctttg aaatttagag tctccttgaa atacacatgg gggtgatttc + 6781 ctaaacttta tttcttgtaa ggatttatct caggggtaac acacaaacca gcatcctgaa + 6841 cctctaagta tgaggacagt aagccttaag aatataaaat aaactgttct tctctctgcc + 6901 ggtggaagtg tgccctgtct attcctgaaa ttgcttgttt gagacgcatg agacgtgcag + 6961 cacatgagac acgtgcagca gcctgtggaa tattgtcagt gaagaatgtc tttgcctgat + 7021 tagatataaa gacaagttaa acacagcatt agactataga tcaagcctgt gccagacaca + 7081 aatgacctaa tgcccagcac gggccacgga atctcctatc ctcttgcttg aacagagcag + 7141 cacacttctc ccccaacact attagatgtt ctggcataat tttgtagata tgtaggattt + 7201 gacatggact attgttcaat gattcagagg aaatctcctt tgttcagata agtacactga + 7261 ctactaaatg gattaaaaaa cacagtaata aaacccagtt ttccccttac ttccctagtt + 7321 tgtttcttat tctgctttct tccaagttga tgctggatag aggtgtttat ttctattcta + 7381 aaaagtgatg aaattggccg ggcgcggtgg ctcacacctg taatcccagc actttgggag + 7441 gctgaggtgg gcggatcacg aggtcaggag atcaagacca tcctggctaa catggtgaaa + 7501 ccccatctct actaaaaata caaaaaatta gccagagaca gtggcgggtg cctgtagtcc + 7561 cagctactcg ggaggctgag gcaggagaat ggcgtgaacc tgggaggcag agcttgcggt + 7621 gagcagagat cgcgccactg cacactccag cctgggtgac aaagcgagac tccatctcaa + 7681 aaaaaaaaaa aaaaaaaaga aaaagaaaga aagaaagaaa aaaaaactga tgaaattgtg + 7741 tattcaatgt agtctcaaga gaattgaaaa ccaagaaagg ctgtggcttc ttccacataa + 7801 agcctggatg aataacagga taacacgttg ttacattgtc acaactcctg atccaggaat + 7861 tgatggctaa gatattcgta attcttatcc ttttcagttg taacttattc ctatttgtca + 7921 gcattcaggt tattagcggc tgctggcgaa gtccttgaga aataaactgc acactggatg + 7981 gtgggggtag tgtaggaaaa tggaggggaa ggaagtaaag tttcaaatta agcctgaaca + 8041 gcaaagttcc cctgagaagg ccacctggat tctatcagaa actcgaatgt ccatcttgca + 8101 aaacttcctt gcccaaaccc cacccctgga gtcacaaccc acccttgacc aatagattca + 8161 ttttactgag ggaggcaaag ggctggtcaa tagattcatt tcactgggag aggcaaaggg + 8221 ctgggggcca gagaggagaa 
gtaaaaagcc acacatgaag cagcaatgca ggcatgcttc + 8281 tggctcatct gtgatcacca ggaaactccc agatctgaca ctgtagtgca tttcactgct + 8341 gacaagaagg ctgctgccac cagcctgtga agcaaggtta aggtgagaag gctggaggtg + 8401 agattctggg caggtaggta ctggaagccg ggacaaggtg cagaaaggca gaaagtgttt + 8461 ctgaaagagg gattagcccg ttgtcttaca tagtctgact ttgcacctgc tctgtgatta + 8521 tgactatccc acagtctcct ggttgtctac ccatggacct agaggtactt tgaaagtttt + 8581 ggatatctgg gctctgactg tgcaataatg ggcaacccca aagtcaaggc acatggcaag + 8641 aaggtgctga tctccttcgg aaaagctgtt atgctcacgg atgacctcaa aggcaccttt + 8701 gctacactga gtgacctgca ctgtaacaag ctgcacgtgg accctgagaa cttcctggtg + 8761 agtagtaagt acactcacgc tttcttcttt acccttagat atttgcacta tgggtacttt + 8821 tgaaagcaga ggtggctttc tcttgtgtta tgagtcagct atgggatatg atatttcagc + 8881 agtgggattt tgagagttat gttgctgtaa ataacataac taaaatttgg tagagcaagg + 8941 actatgaata atggaaggcc acttaccatt tgatagctct gaaaaacaca tcttataaaa + 9001 aattctggcc aaaatcaaac tgagtgtttt tggatgaggg aacagaagtt gagatagaga + 9061 aaataacatc tttcctttgg tcagcgaaat tttctataaa aattaatagt cacttttctg + 9121 catagtcctg gaggttagaa aaagatcaac tgaacaaagt agtgggaagc tgttaaaaag + 9181 aggattgttt ccctccgaat gatgatggta tacttttgta cgcatggtac aggattcttt + 9241 gttatgagtg tttgggaaaa ttgtatgtat gtatgtatgt atgtatgtga tgactgggga + 9301 cttatcctat ccattactgt tccttgaagt actattatcc tactttttaa aaggacgaag + 9361 tctctaaaaa aaaaaatgaa acaatcacaa tatgttgggg tagtgagttg gcatagcaag + 9421 taagagaagg ataggacaca atgggaggtg cagggctgcc agtcatattg aagctgatat + 9481 ctagcccata atggtgagag ttgctcaaac tctggtgaaa aaggatgtaa gtgttatatc + 9541 tatttactgc aagtccagct tgaggccttc tattcactat gtaccatttt cttttttatc + 9601 ttcactccct ccccagctct taggcaacgt gatattgatt gttttggcaa cccacttcag + 9661 cgaggatttt accctacaga tacaggcttc ttggcagtaa ctaacaaatg ctgtggttaa + 9721 tgctgtagcc cacaagacca ctgagttccc tgtccactat gtttgtacct atggtccact + 9781 atgtttgtac ctatgtccca aaatctcatc tcctttagat gggggaggtt ggggagaaga + 9841 gcagtatcct gcctgctgat tcagttcctg catgataaaa 
atagaataaa gaaatatgct + 9901 ctctaagaaa tatcattgta ctctttttct gtctttatat tttaccctga ttcagccaaa + 9961 aggacgcact atttctgatg gaaatgagaa tgttggagaa tgggagttta aggacagaga + 10021 agatactttc ttgcaatcct gcaagaaaag agagaactcg tgggtggatt tagtggggta + 10081 gttactccta ggaaggggaa atcgtctcta gaataagaca atgtttttac agaaagggag + 10141 gtcaatggag gtactctttg gaggtgtaag aggattgttg gtagtgtgta gaggtatgtt + 10201 aggactcaaa ttagaagttc tgtataggct attatttgta tgaaactcag gatatagctc + 10261 atttggtgac tgcagttcac ttctacttat tttaaacaac atatttttta ttatttataa + 10321 tgaagtgggg atggggcttc ctagagacca atcaagggcc aaaccttgaa ctttctctta + 10381 acgtcttcaa tggtattaat agagaattat ctctaaggca tgtgaactgg ctgtcttggt + 10441 tttcatctgt acttcatctg ctacctctgt gacctgaaac atatttataa ttccattaag + 10501 ctgtgcatat gatagattta tcatatgtat tttccttaaa ggatttttgt aagaactaat + 10561 tgaattgata cctgtaaagt ctttatcaca ctacccaata aataataaat ctctttgttc + 10621 agctctctgt ttctataaat atgtacaagt tttattgttt ttagtggtag tgattttatt + 10681 ctctttctat atatatacac acacatatgt gtgcattcat aaatatatac aatttttatg + 10741 aataaaaaat tattagcaat caatattgaa aaccactgat ttttgtttat gtgagcaaac + 10801 agcagattaa aaggctgaga tttaggaaac agcacgttaa gtcaagttga tagaggagaa + 10861 tatggacatt taaaagaggc aggatgatat aaaattaggg aaactggatg cagagaccag + 10921 atgaagtaag aaaaatagct atcgttttga gcaaaaatca ctgaagtttc ttgcatatga + 10981 gagtgacata ataaataggg aaacgtagaa aattgattca catgtatata tatatataga + 11041 actgattaga caaagtctaa cttgggtata gtcagaggag cttgctgtaa ttatattgag + 11101 gtgatggata aagaactgaa gttgatggaa acaatgaagt taagaaaaaa aatcgagtaa + 11161 gagaccattg tggcagtgat tgcacagaac tggaaaacat tgtgaaacag agagtcagag + 11221 atgacagcta aaatccctgt ctgtgaatga aaagaaggaa atttattgac agaacagcaa + 11281 atgcctacaa gccccctgtt tggatctggc aatgaacgta gccattctgt ggcaatcact + 11341 tcaaactcct gtacccaaga cccttaggaa gtatgtagca ccctcaaacc taaaacctca + 11401 aagaaagagg ttttagaaga tataataccc tttcttctcc agtttcatta atcccaaaac + 11461 ctctttctca aagtatttcc tctatgtgtc caccccaaag 
agctcacctc accatatctc + 11521 ttgagtggga gcacatagat aggcggtgct accatctaac agcttctgaa attcctttgt + 11581 catatttttg agtccccact aataacccac aaagcagaat aaataccagt tgctcatgta + 11641 caataatcac tcaactgctg tcttgtagca tacattaatt aagcacattc tttgaataat + 11701 tactgtgtcc aaacaatcac actttaaaat ctcacacttg tgctatccct tgcccttctg + 11761 aatgtcactc tgtattttaa atgaagagat gagggttgaa tttcctgtgt tacttattgt + 11821 tcatttctcg atgaggagtt ttcacattca cctttagtgg aaaacacata agtacacatc + 11881 ttacaggaaa aatataccaa actgacatgt agcatgaatg cttgtgcatg tagtcatata + 11941 aaatcttgta gcaatgtaaa cattctctga tatacacata cagatgtgtc tatatgtcta + 12001 cacaatttct tatgctccat gaacaaacat tccatgcaca cataagaaca cacactgtta + 12061 cagatgcata cttgagtgca ttgacaaaat taccccagtc aatctagaga atttggattt + 12121 ctgcatttga ctctgttagc tttgtacatg ctgttcattt actctgggtg atgtctttcc + 12181 ctcattttgc cttgtctatc ttgtactcat actttaagtc ctaacttata tgttatctca + 12241 actaagaagc tatttttttt ttaattttaa ctgggcttaa agccctgtct ataaactctg + 12301 ctacaattat gggctctttc ttataatatt tagtgttttt cctactaatg tacttaatct + 12361 gctcattgta tattcctacc actaaatttt aacctctttt atggtagaga cattgtcttg + 12421 taaactctta tttccctagt atttggagat gaaaaaaaag attaaattat ccaaaattag + 12481 atctctcttt tctacattat gagtattaca ctatccatag agaagtttgt ttgagaccta + 12541 aactgaggaa cctttggttc taaaatgact atgtgatatc ttagtattta taggtcatga + 12601 ggttccttcc tctgcctctg ctatagtttg attagtcaac aagcatgtgt catgcattta + 12661 ttcacatcag aatttcatac actaataaga catagtatca gaagtcagtt tattagttat + 12721 atcagttagg gtccatcaag gaaaggacaa accattatca gttactcaac ctagaattaa + 12781 atacagctct taatagttaa ttatccttgt attggaagag ctaaaatatc aaataaagga + 12841 cagtgcagaa atctagatgt tagtaacatc agaaaacctc ttccgccatt aggcctagaa + 12901 gggcagaagg agaaaatgtt tataccacca gagtccagaa ccagagccca taaccagagg + 12961 tccactggat tcagtgagct agtgggtgct ccttggagag agccagaact gtctaatggg + 13021 ggcatcaaag tatcagccat aaaaaaccat aaaaaagact gtctgctgta ggagatccgt + 13081 tcagagagag agagagacca gaaataatct tgcttatgct 
ttccctcagc cagtgtttac + 13141 cattgcagaa tgtacatgcg actgaaaggg tgaggaaacc tgggaaatgt cagttcctca + 13201 aatacagaga acactgaggg aaggatgaga aataaatgtg aaagcagaca tgaatggtaa + 13261 ttgacagaag gaaactagga tgtgtccagt aaatgaataa ttacagtgtg cagtgattat + 13321 tgcaatgatt aatgtattga taagataata tgaaaacaca gaattcaaac agcagtgaac + 13381 tgagattaga attgtggaga gcactggcat ttaagaatgt cacacttaga atgtgtctct + 13441 aggcattgtt ctgtgcatat atcatctcaa tattcattat ctgaaaatta tgaattaggt + 13501 acaaagctca aataatttat tttttcaggt tagcaagaac tttttttttt tttttctgag + 13561 atagagcatt gctatggttg cccaggctgg agtgcaatgg catgatccag gctcactgca + 13621 acatctgcct cccaggttca agcgattctc ctgcctcagc ctcccaagta gctggcacta + 13681 caggcatgtg ccaccaccat gcctggctaa ttttctattt ttagtagata gggggtttca + 13741 ccatgttggt caggctgatc tcgaactcct aacatcaggt gatccaccct cctcggcctc + 13801 tgaaagtgct gggatcacag gcgtgagcca ccacacccag ccaagaatgt gaattttgta + 13861 gaaggatata acccatattt ctctgaccct agagtcctta gtatacctcc cataccatgt + 13921 ggctcatcct ccttacatac atttcccatc tttcacccta ccttttcctt tttgtttcag + 13981 cttttcactg tgtgtcaaaa tctagaacct tatctcctac ctgctctgaa accaacagca + 14041 agttgacttc cattctaacc cacattggca ttacactaat taaaatcgat actgagttct + 14101 aaaatcatcg gggattttgg ggactatgtc ttacttcata cttccttgag atttcacatt + 14161 aaatgttggt gttcattaaa ggtccttcat ttaactttgt attcatcaca ctcttggatt + 14221 cacagttata tctaaactct taaatacagc ctgtataatc ccaattccca actctgattt + 14281 ctaacctctg acctccaacc tcagtgccaa acccatatat caaacaatgt actgggctta + 14341 tttatataga tgtcctatag gcacctcaga ctcagcatgg gtatttcact tgttatacta + 14401 aaactgtttc tcttccagtg ttttccattt tagtcattag atagctactt gcccattcac + 14461 caaggtcaca gattaaaatc atttccctac ctctaatcaa cagttcgatt ctgcttcaat + 14521 ttgtccctat ctattaatca ccactcttac tgcccagtca ggtcctcatt gtttcctgaa + 14581 caagagtaga tgctattctt tccactttta gaccttatcc tggctggatg cggtggctca + 14641 ggcttgtaaa cccagcactt tgggaggcca aggcaggcag atcacttgag gtcaggagtt + 14701 caagaccagc ctgaccaaca tggtgaaacc ccatctctac 
taaaaataca aaatcagccg + 14761 ggcgtgtggt gcatgcctgc agtcccagct attcaggtgg ctgaggcagg agaattgctt + 14821 gaacccagga ggcagaggtt gcggtgagcc tagattgcac cattgcactc tagcttgggc + 14881 aatagggatg aaactccatc tcagaagaga aaagaaaaaa agaccttatt ctgttataca + 14941 aatcctctca atgcaatcca tatagaataa acatgtaacc agatctccca atgtgtaaaa + 15001 tcatttcagg tagaacagaa ttaaagtgaa aagccaagtc tttggaatta acagacaaag + 15061 atcaaataac agtcctcatg gccttaagaa tttacctaac atttttttta gaatcaattt + 15121 tcttatatat gaattggaaa cataattcct ccctcacaaa cacattctaa gattttaagg + 15181 agatattgat gaagtacatc atctgtcatt tttaacaggt agtggtagtg attcacacag + 15241 cacattatga tctgttcttg tatgttctgt tccattctgt attcttgacc tggttgtatt + 15301 ctttctgagc tccagatcca catatctaag tacatctttt tgcattttac aagagtgcat + 15361 acaatacaat gtatccaaga ctgtatttct gattttatcg taccactaaa ctcacaaatg + 15421 tggccctatt cttgtgttca +// \ No newline at end of file diff --git a/test/jalview/io/GenBankTest.java b/test/jalview/io/GenBankTest.java new file mode 100644 index 0000000..d3c41da --- /dev/null +++ b/test/jalview/io/GenBankTest.java @@ -0,0 +1,282 @@ +package jalview.io; + +import static org.junit.Assert.assertNotNull; +import static org.junit.Assert.assertTrue; +import jalview.datamodel.Alignment; +import jalview.datamodel.AlignmentAnnotation; +import jalview.datamodel.AlignmentI; +import jalview.datamodel.Annotation; +import jalview.datamodel.SequenceFeature; +import jalview.datamodel.SequenceI; + +import java.io.File; +import java.io.IOException; +import java.util.BitSet; +import java.util.HashMap; +import java.util.Map; + +import org.junit.Test; + +public class GenBankTest { +// private final static File GENBANK_FILE = new File("test/jalview/io/V00505.gb"); +// private final static File GENBANK_FILE = new File("test/jalview/io/NC_000011.10.gb"); + private final static File GENBANK_FILE = new File("test/jalview/io/M92650.1.gb"); + + @Test + public void testParsing(){ + testFileIOwithFormat(GENBANK_FILE, "GENBANK"); + } + /** + * test 
alignment data in given file can be imported, exported and reimported + * with no dataloss + * + * @param f + * - source datafile (IdentifyFile.identify() should work with it) + * @param ioformat + * - label for IO class used to write and read back in the data from + * f + */ + public static void testFileIOwithFormat(File f, String ioformat) + { + System.out.println("Reading file: " + f); + String ff = f.getPath(); + try + { + AppletFormatAdapter rf = new AppletFormatAdapter(); + + Alignment al = rf.readFile(ff, AppletFormatAdapter.FILE, + new IdentifyFile().Identify(ff, AppletFormatAdapter.FILE)); + + assertNotNull("Couldn't read supplied alignment data.", al); + + // make sure dataset is initialised ? not sure about this + for (int i = 0; i < al.getSequencesArray().length; ++i) + { + al.getSequenceAt(i).setDatasetSequence(al.getSequenceAt(i)); + } + String outputfile = rf.formatSequences(ioformat, al, true); + System.out.println("Output file in '"+ioformat+"':\n"+outputfile+"\n< orig_groups=new HashMap(),new_groups=new HashMap(); + + if (aa_new != null && aa_original != null) + { + for (int i = 0; i < aa_original.length; i++) + { + if (aa_new.length>i) { + assertTrue("Different alignment annotation at position "+i, + equalss(aa_original[i], aa_new[i])); + // compare graphGroup or graph properties - needed to verify JAL-1299 + assertTrue("Graph type not identical.",aa_original[i].graph==aa_new[i].graph); + assertTrue("Visibility not identical.", aa_original[i].visible==aa_new[i].visible); + assertTrue( + "Threshold line not identical.", + aa_original[i].threshold == null ? 
aa_new[i].threshold == null + : aa_original[i].threshold + .equals(aa_new[i].threshold)); + // graphGroup may differ, but pattern should be the same + Integer o_ggrp=new Integer(aa_original[i].graphGroup+2),n_ggrp=new Integer(aa_new[i].graphGroup+2); + BitSet orig_g=orig_groups.get(o_ggrp),new_g=new_groups.get(n_ggrp); + if (orig_g==null) { + orig_groups.put(o_ggrp,orig_g= new BitSet()); + } + if (new_g==null) { + new_groups.put(n_ggrp, new_g=new BitSet()); + } + assertTrue("Graph Group pattern differs at annotation "+i, orig_g.equals(new_g)); + orig_g.set(i); new_g.set(i); + } else { + System.err.println("No matching annotation row for "+aa_original[i].toString()); + } + } + } + assertTrue( + "Generated and imported alignment have different annotation sets (" + + aa_new_size + " != " + aa_original_size + ")", + aa_new_size == aa_original_size); + + // check sequences, annotation and features + SequenceI[] seq_original = new SequenceI[al.getSequencesArray().length]; + seq_original = al.getSequencesArray(); + SequenceI[] seq_new = new SequenceI[al_input.getSequencesArray().length]; + seq_new = al_input.getSequencesArray(); + SequenceFeature[] sequenceFeatures_original, sequenceFeatures_new; + AlignmentAnnotation annot_original, annot_new; + // + for (int i = 0; i < al.getSequencesArray().length; i++) + { + String name = seq_original[i].getName(); + int start = seq_original[i].getStart(); + int end = seq_original[i].getEnd(); + System.out.println("Check sequence: " + name + "/" + start + "-" + + end); + + // search equal sequence + for (int in = 0; in < al_input.getSequencesArray().length; in++) + { + if (name.equals(seq_new[in].getName()) + && start == seq_new[in].getStart() + && end == seq_new[in].getEnd()) + { + String ss_original = seq_original[i].getSequenceAsString(); + String ss_new = seq_new[in].getSequenceAsString(); + assertTrue("The sequences " + name + "/" + start + "-" + end + + " are not equal", ss_original.equals(ss_new)); + + assertTrue( + "Sequence 
Features were not equivalent", + (seq_original[i].getSequenceFeatures() == null && seq_new[in] + .getSequenceFeatures() == null) + || (seq_original[i].getSequenceFeatures() != null && seq_new[in] + .getSequenceFeatures() != null)); + // compare sequence features + if (seq_original[i].getSequenceFeatures() != null + && seq_new[in].getSequenceFeatures() != null) + { + System.out.println("There are feature!!!"); + sequenceFeatures_original = new SequenceFeature[seq_original[i] + .getSequenceFeatures().length]; + sequenceFeatures_original = seq_original[i] + .getSequenceFeatures(); + sequenceFeatures_new = new SequenceFeature[seq_new[in] + .getSequenceFeatures().length]; + sequenceFeatures_new = seq_new[in].getSequenceFeatures(); + + assertTrue("different number of features", seq_original[i] + .getSequenceFeatures().length == seq_new[in] + .getSequenceFeatures().length); + + for (int feat = 0; feat < seq_original[i].getSequenceFeatures().length; feat++) + { + assertTrue("Different features", + sequenceFeatures_original[feat] + .equals(sequenceFeatures_new[feat])); + } + } + // compare alignment annotation + if (al.getSequenceAt(i).getAnnotation() != null + && al_input.getSequenceAt(in).getAnnotation() != null) + { + for (int j = 0; j < al.getSequenceAt(i).getAnnotation().length; j++) + { + if (al.getSequenceAt(i).getAnnotation()[j] != null + && al_input.getSequenceAt(in).getAnnotation()[j] != null) + { + annot_original = al.getSequenceAt(i).getAnnotation()[j]; + annot_new = al_input.getSequenceAt(in).getAnnotation()[j]; + assertTrue("Different annotation elements", + equalss(annot_original, annot_new)); + } + } + } + else if (al.getSequenceAt(i).getAnnotation() == null + && al_input.getSequenceAt(in).getAnnotation() == null) + { + System.out.println("No annotations"); + } + else if (al.getSequenceAt(i).getAnnotation() != null + && al_input.getSequenceAt(in).getAnnotation() == null) + { + assertTrue("Annotations differed between sequences (" + + 
al.getSequenceAt(i).getName() + ") and (" + + al_input.getSequenceAt(i).getName() + ")", false); + } + break; + } + } + } + } + /* + * compare annotations + */ + private static boolean equalss(AlignmentAnnotation annot_or, + AlignmentAnnotation annot_new) + { + if (annot_or.annotations.length != annot_new.annotations.length) + { + System.err.println("Different lengths for annotation row elements: "+annot_or.annotations.length +"!="+ annot_new.annotations.length); + return false; + } + for (int i = 0; i < annot_or.annotations.length; i++) + { + Annotation an_or=annot_or.annotations[i],an_new=annot_new.annotations[i]; + if (an_or != null + && an_new!= null) + { + if (!an_or.displayCharacter.trim() + .equals(an_new.displayCharacter.trim()) + || !(""+an_or.secondaryStructure).trim().equals((""+an_new.secondaryStructure).trim()) + || ((!an_or.description.equals(an_new.description)) && (an_or.description == null + || an_new.description == null || !an_or.description + .equals(an_new.description)))) + { + System.err.println("Annotation Element Mismatch\nElement "+i+" in original: "+annot_or.annotations[i].toString()+"\nElement "+i+" in new: "+annot_new.annotations[i].toString()); + return false; + } + } + else if (annot_or.annotations[i] == null + && annot_new.annotations[i] == null) + { + continue; + } + else + { + System.err.println("Annotation Element Mismatch\nElement "+i+" in original: "+(annot_or.annotations[i]==null ? "is null" : annot_or.annotations[i].toString())+"\nElement "+i+" in new: "+(annot_new.annotations[i] == null ? "is null" : annot_new.annotations[i].toString())); + return false; + } + } + return true; + } +} diff --git a/test/jalview/io/M92650.1.gb b/test/jalview/io/M92650.1.gb new file mode 100644 index 0000000..8abbba9 --- /dev/null +++ b/test/jalview/io/M92650.1.gb @@ -0,0 +1,92 @@ +LOCUS HUMDMDXX 2110 bp mRNA linear PRI 07-NOV-1994 +DEFINITION Human Duchenne muscular dystrophy (DMD) mRNA, +complete cds. 
+ACCESSION M92650 +VERSION M92650.1 GI:181598 +KEYWORDS Duchenne muscular +dystrophy protein. +SOURCE Homo sapiens (human) + ORGANISM Homo sapiens + Eukaryota; Metazoa; Chordata; Craniata; Vertebrata; Euteleostomi; + Mammalia; Eutheria; Euarchontoglires; Primates; Haplorrhini; + Catarrhini; Hominidae; Homo. +REFERENCE 1 (bases 1 to 2110) + AUTHORS Lederfein,D., Levy,Z., Augier,N., Mornet,D., +Morris,G., Fuchs,O., + Yaffe,D. and Nudel,U. + TITLE A 71-kilodalton protein is a major product of the Duchenne muscular + dystrophy gene in brain and other nonmuscle tissues + JOURNAL Proc. Natl. Acad. Sci. U.S.A. 89 (12), 5346-5350 (1992) + PUBMED 1319059 +COMMENT Original source text: Homo sapiens brain cDNA to mRNA. +FEATURES Location/Qualifiers + source 1..2110 + /organism="Homo sapiens" + /mol_type="mRNA" + /db_xref="taxon:9606" + /map="Xp21.3-p21.1" + /cell_type="amniotic fluid" + /tissue_type="brain" + gene 1..2110 + /gene="DMD" + 5'UTR 1..52 + /gene="DMD" + /note="G00-119-850" + CDS 53..1921 + /gene="DMD" + /codon_start=1 + /protein_id="AAA52316.1" + /db_xref="GI:181599" + /db_xref="GDB:G00-119-850" + /translation="MREQLKGHETQTTCWDHPKMTELYQSLADLNNVRFSAYRTAMKL + RRLQKALCLDLLSLSAACDALDQHNLKQNDQPMDILQIINCLTTIYDRLEQEHNNLVN + VPLCVDMCLNWLLNVYDTGRTGRIRVLSFKTGIISLCKAHLEDKYRYLFKQVASSTGF + CDQRRLGLLLHDSIQIPRQLGEVASFGGSNIEPSVRSCFQFANNKPEIEAALFLDWMR + LEPQSMVWLPVLHRVAAAETAKHQAKCNICKECPIIGFRYRSLKHFNYDICQSCFFSG + RVAKGHKMHYPMVEYCTPTTSGEDVRDFAKVLKNKFRTKRYFAKHPRMGYLPVQTVLE + GDNMETPASSPQLSHDDTHSRIEHYASRLAEMENSNGSYLNDSISPNESIDDEHLLIQ + HYCQSLNQDSPLSQPRSPAQILISLESEERGELERILADLEEENRNLQAEYDRLKQQH + EHKGLSPLPSPPEMMPTSPQSPRDAELIAEAKLLRQHKGRLEARMQILEDHNKQLESQ + LHRLRQLLEQPQAEAKVNGTTVSSPSTSLQRSDSSQPMLLRVVGSQTSDSMGEEDLLS + PPQDTSTGLEEVMEQLNNSFPSSRGHNVGSLFHMADDLGRAMESLVSVMTDEEGAE" + 3'UTR 1922..2110 + /gene="DMD" + /note="G00-119-850" +ORIGIN + 1 gaagctcact cctccactcg tacccacact cgaccgcgga gcccttgcag ccatgaggga + 61 acagctcaaa ggccacgaga ctcaaacaac ttgctgggac catcccaaaa tgacagagct + 121 
ctaccagtct ttagctgacc tgaataatgt cagattctca gcttatagga ctgccatgaa + 181 actccgaaga ctgcagaagg ccctttgctt ggatctcttg agcctgtcag ctgcatgtga + 241 tgccttggac cagcacaacc tcaagcaaaa tgaccagccc atggatatcc tgcagattat + 301 taattgtttg accactattt atgaccgcct ggagcaagag cacaacaatt tggtcaacgt + 361 ccctctctgc gtggatatgt gtctgaactg gctgctgaat gtttatgata cgggacgaac + 421 agggaggatc cgtgtcctgt cttttaaaac tggcatcatt tccctgtgta aagcacattt + 481 ggaagacaag tacagatacc ttttcaagca agtggcaagt tcaacaggat tttgtgacca + 541 gcgcaggctg ggcctccttc tgcatgattc tatccaaatt ccaagacagt tgggtgaagt + 601 tgcatccttt gggggcagta acattgagcc aagtgtccgg agctgcttcc aatttgctaa + 661 taataagcca gagatcgaag cggccctctt cctagactgg atgagactgg aaccccagtc + 721 catggtgtgg ctgcccgtcc tgcacagagt ggctgctgca gaaactgcca agcatcaggc + 781 caaatgtaac atctgcaaag agtgtccaat cattggattc aggtacagga gtctaaagca + 841 ctttaattat gacatctgcc aaagctgctt tttttctggt cgagttgcaa aaggccataa + 901 aatgcactat cccatggtgg aatattgcac tccgactaca tcaggagaag atgttcgaga + 961 ctttgccaag gtactaaaaa acaaatttcg aaccaaaagg tattttgcga agcatccccg + 1021 aatgggctac ctgccagtgc agactgtctt agagggggac aacatggaaa cgcctgcctc + 1081 gtcccctcag ctttcacacg atgatactca ttcacgcatt gaacattatg ctagcaggct + 1141 agcagaaatg gaaaacagca atggatctta tctaaatgat agcatctctc ctaatgagag + 1201 catagatgat gaacatttgt taatccagca ttactgccaa agtttgaacc aggactcccc + 1261 cctgagccag cctcgtagtc ctgcccagat cttgatttcc ttagagagtg aggaaagagg + 1321 ggagctagag agaatcctag cagatcttga ggaagaaaac aggaatctgc aagcagaata + 1381 tgaccgtcta aagcagcagc acgaacataa aggcctgtcc ccactgccgt cccctcctga + 1441 aatgatgccc acctctcccc agagtccccg ggatgctgag ctcattgctg aggccaagct + 1501 actgcgtcaa cacaaaggcc gcctggaagc caggatgcaa atcctggaag accacaataa + 1561 acagctggag tcacagttac acaggctaag gcagctgctg gagcaacccc aggcagaggc + 1621 caaagtgaat ggcacaacgg tgtcctctcc ttctacctct ctacagaggt ccgacagcag + 1681 tcagcctatg ctgctccgag tggttggcag tcaaacttcg gactccatgg gtgaggaaga + 1741 tcttctcagt cctccccagg acacaagcac 
agggttagag gaggtgatgg agcaactcaa + 1801 caactccttc cctagttcaa gaggacacaa tgtaggaagt cttttccaca tggcagatga + 1861 tttgggcaga gcgatggagt ccttagtatc agtcatgaca gatgaagaag gagcagaata + 1921 aatgttttac aactcctgat tcccgcatgg tttttataat attcatacaa caaagaggat + 1981 tagacagtaa gagtttacaa gaaataaatc tatatttttg tgaagggtag tggtattata + 2041 ctgtagattt cagtagtttc taagtctgtt attgttttgt taacaatggc aggttttaca + 2101 cgtctatgca +// \ No newline at end of file diff --git a/test/jalview/io/NC_000011.10.gb b/test/jalview/io/NC_000011.10.gb new file mode 100644 index 0000000..850c004 --- /dev/null +++ b/test/jalview/io/NC_000011.10.gb @@ -0,0 +1,173 @@ +LOCUS NC_000011 1800 bp DNA linear CON 03-FEB-2014 +DEFINITION Homo sapiens chromosome 11, GRCh38 Primary Assembly. +ACCESSION NC_000011 REGION: complement(5232829..5234628) GPC_000001303 +VERSION NC_000011.10 GI:568815587 +DBLINK BioProject: PRJNA168 + Assembly: GCF_000001405.26 +KEYWORDS RefSeq. +SOURCE Homo sapiens (human) + ORGANISM Homo sapiens + Eukaryota; Metazoa; Chordata; Craniata; Vertebrata; Euteleostomi; + Mammalia; Eutheria; Euarchontoglires; Primates; Haplorrhini; + Catarrhini; Hominidae; Homo. +REFERENCE 1 (bases 1 to 1800) + AUTHORS Taylor,T.D., Noguchi,H., Totoki,Y., Toyoda,A., Kuroki,Y., Dewar,K., + Lloyd,C., Itoh,T., Takeda,T., Kim,D.W., She,X., Barlow,K.F., + Bloom,T., Bruford,E., Chang,J.L., Cuomo,C.A., Eichler,E., + FitzGerald,M.G., Jaffe,D.B., LaButti,K., Nicol,R., Park,H.S., + Seaman,C., Sougnez,C., Yang,X., Zimmer,A.R., Zody,M.C., + Birren,B.W., Nusbaum,C., Fujiyama,A., Hattori,M., Rogers,J., + Lander,E.S. and Sakaki,Y. 
+ TITLE Human chromosome 11 DNA sequence and analysis including novel gene identification + JOURNAL Nature 440 (7083), 497-500 (2006) + PUBMED 16554811 +REFERENCE 2 (bases 1 to 1800) + CONSRTM International Human Genome Sequencing Consortium + TITLE Finishing the euchromatic sequence of the human genome + JOURNAL Nature 431 (7011), 931-945 (2004) + PUBMED 15496913 +REFERENCE 3 (bases 1 to 1800) + AUTHORS Lander,E.S., Linton,L.M., Birren,B., Nusbaum,C., Zody,M.C., + Baldwin,J., Devon,K., Dewar,K., Doyle,M., FitzHugh,W., Funke,R., + Gage,D., Harris,K., Heaford,A., Howland,J., Kann,L., Lehoczky,J., + LeVine,R., McEwan,P., McKernan,K., Meldrim,J., Mesirov,J.P., + Miranda,C., Morris,W., Naylor,J., Raymond,C., Rosetti,M., + Santos,R., Sheridan,A., Sougnez,C., Stange-Thomann,N., + Stojanovic,N., Subramanian,A., Wyman,D., Rogers,J., Sulston,J., + Ainscough,R., Beck,S., Bentley,D., Burton,J., Clee,C., Carter,N., + Coulson,A., Deadman,R., Deloukas,P., Dunham,A., Dunham,I., + Durbin,R., French,L., Grafham,D., Gregory,S., Hubbard,T., + Humphray,S., Hunt,A., Jones,M., Lloyd,C., McMurray,A., Matthews,L., + Mercer,S., Milne,S., Mullikin,J.C., Mungall,A., Plumb,R., Ross,M., + Shownkeen,R., Sims,S., Waterston,R.H., Wilson,R.K., Hillier,L.W., + McPherson,J.D., Marra,M.A., Mardis,E.R., Fulton,L.A., + Chinwalla,A.T., Pepin,K.H., Gish,W.R., Chissoe,S.L., Wendl,M.C., + Delehaunty,K.D., Miner,T.L., Delehaunty,A., Kramer,J.B., Cook,L.L., + Fulton,R.S., Johnson,D.L., Minx,P.J., Clifton,S.W., Hawkins,T., + Branscomb,E., Predki,P., Richardson,P., Wenning,S., Slezak,T., + Doggett,N., Cheng,J.F., Olsen,A., Lucas,S., Elkin,C., + Uberbacher,E., Frazier,M., Gibbs,R.A., Muzny,D.M., Scherer,S.E., + Bouck,J.B., Sodergren,E.J., Worley,K.C., Rives,C.M., Gorrell,J.H., + Metzker,M.L., Naylor,S.L., Kucherlapati,R.S., Nelson,D.L., + Weinstock,G.M., Sakaki,Y., Fujiyama,A., Hattori,M., Yada,T., + Toyoda,A., Itoh,T., Kawagoe,C., Watanabe,H., Totoki,Y., Taylor,T., + Weissenbach,J., Heilig,R., Saurin,W., 
Artiguenave,F., Brottier,P., + Bruls,T., Pelletier,E., Robert,C., Wincker,P., Smith,D.R., + Doucette-Stamm,L., Rubenfield,M., Weinstock,K., Lee,H.M., + Dubois,J., Rosenthal,A., Platzer,M., Nyakatura,G., Taudien,S., + Rump,A., Yang,H., Yu,J., Wang,J., Huang,G., Gu,J., Hood,L., + Rowen,L., Madan,A., Qin,S., Davis,R.W., Federspiel,N.A., + Abola,A.P., Proctor,M.J., Myers,R.M., Schmutz,J., Dickson,M., + Grimwood,J., Cox,D.R., Olson,M.V., Kaul,R., Raymond,C., Shimizu,N., + Kawasaki,K., Minoshima,S., Evans,G.A., Athanasiou,M., Schultz,R., + Roe,B.A., Chen,F., Pan,H., Ramser,J., Lehrach,H., Reinhardt,R., + McCombie,W.R., de la Bastide,M., Dedhia,N., Blocker,H., + Hornischer,K., Nordsiek,G., Agarwala,R., Aravind,L., Bailey,J.A., + Bateman,A., Batzoglou,S., Birney,E., Bork,P., Brown,D.G., + Burge,C.B., Cerutti,L., Chen,H.C., Church,D., Clamp,M., + Copley,R.R., Doerks,T., Eddy,S.R., Eichler,E.E., Furey,T.S., + Galagan,J., Gilbert,J.G., Harmon,C., Hayashizaki,Y., Haussler,D., + Hermjakob,H., Hokamp,K., Jang,W., Johnson,L.S., Jones,T.A., + Kasif,S., Kaspryzk,A., Kennedy,S., Kent,W.J., Kitts,P., + Koonin,E.V., Korf,I., Kulp,D., Lancet,D., Lowe,T.M., McLysaght,A., + Mikkelsen,T., Moran,J.V., Mulder,N., Pollara,V.J., Ponting,C.P., + Schuler,G., Schultz,J., Slater,G., Smit,A.F., Stupka,E., + Szustakowski,J., Thierry-Mieg,D., Thierry-Mieg,J., Wagner,L., + Wallis,J., Wheeler,R., Williams,A., Wolf,Y.I., Wolfe,K.H., + Yang,S.P., Yeh,R.F., Collins,F., Guyer,M.S., Peterson,J., + Felsenfeld,A., Wetterstrand,K.A., Patrinos,A., Morgan,M.J., de + Jong,P., Catanese,J.J., Osoegawa,K., Shizuya,H., Choi,S. and + Chen,Y.J. + CONSRTM International Human Genome Sequencing Consortium + TITLE Initial sequencing and analysis of the human genome + JOURNAL Nature 409 (6822), 860-921 (2001) + PUBMED 11237011 + REMARK Erratum:[Nature 2001 Aug 2;412(6846):565] +COMMENT REFSEQ INFORMATION: The reference sequence is identical to + CM000673.2. + On Feb 3, 2014 this sequence version replaced gi:224589802. 
+ Assembly Name: GRCh38 Primary Assembly + The DNA sequence is composed of genomic sequence, primarily + finished clones that were sequenced as part of the Human Genome + Project. PCR products and WGS shotgun sequence have been added + where necessary to fill gaps or correct errors. All such additions + are manually curated by GRC staff. For more information see: + http://genomereference.org. + + ##Genome-Annotation-Data-START## + Annotation Provider :: NCBI + Annotation Status :: Full annotation + Annotation Version :: Homo sapiens Annotation Release 106 + Annotation Pipeline :: NCBI eukaryotic genome annotation + pipeline + Annotation Software Version :: 5.2 + Annotation Method :: Best-placed RefSeq; Gnomon + Features Annotated :: Gene; mRNA; CDS; ncRNA + ##Genome-Annotation-Data-END## +FEATURES Location/Qualifiers + source 1..1800 + /organism="Homo sapiens" + /mol_type="genomic DNA" + /db_xref="taxon:9606" + /chromosome="11" + gene 1..1800 + /gene="HBD" + /note="hemoglobin, delta; Derived by automated computational analysis using gene prediction method: Curated Genomic." + /db_xref="GeneID:3045" + /db_xref="HGNC:4829" + /db_xref="MIM:142000" + mRNA join(1..287,416..638,1537..1800) + /gene="HBD" + /product="hemoglobin, delta" + /note="Derived by automated computational analysis using gene prediction method: Curated Genomic." + /transcript_id="NM_000519.3" + /db_xref="GI:62865863" + /db_xref="GeneID:3045" + /db_xref="HGNC:4829" + /db_xref="MIM:142000" + CDS join(196..287,416..638,1537..1665) + /gene="HBD" + /note="delta globin; delta-globin chain; hemoglobin delta chain; Derived by automated computational analysis using gene prediction method: Curated Genomic." 
+ /codon_start=1 + /product="hemoglobin subunit delta" + /protein_id="NP_000510.1" + /db_xref="GI:4504351" + /db_xref="CCDS:CCDS31376.1" + /db_xref="GeneID:3045" + /db_xref="HGNC:4829" + /db_xref="MIM:142000" + /translation="MVHLTPEEKTAVNALWGKVNVDAVGGEALGRLLVVYPWTQRFFE + SFGDLSSPDAVMGNPKVKAHGKKVLGAFSDGLAHLDNLKGTFSQLSELHCDKLHVDPE + NFRLLGNVLVCVLARNFGKEFTPQMQAAYQKVVAGVANALAHKYH" +ORIGIN + 1 agggcaagtt aagggaatag tggaatgaag gttcattttt cattctcaca aactaatgaa + 61 accctgctta tcttaaacca acctgctcac tggagcaggg aggacaggac cagcataaaa + 121 ggcagggcag agtcgactgt tgcttacact ttcttctgac ataacagtgt tcactagcaa + 181 cctcaaacag acaccatggt gcatctgact cctgaggaga agactgctgt caatgccctg + 241 tggggcaaag tgaacgtgga tgcagttggt ggtgaggccc tgggcaggtt ggtatcaagg + 301 ttataagaga ggctcaagga ggcaaatgga aactgggcat gtgtagacag agaagactct + 361 tgggtttctg ataggcactg actctctgtc ccttgggctg ttttcctacc ctcagattac + 421 tggtggtcta cccttggacc cagaggttct ttgagtcctt tggggatctg tcctctcctg + 481 atgctgttat gggcaaccct aaggtgaagg ctcatggcaa gaaggtgcta ggtgccttta + 541 gtgatggcct ggctcacctg gacaacctca agggcacttt ttctcagctg agtgagctgc + 601 actgtgacaa gctgcacgtg gatcctgaga acttcagggt gagtccagga gatgcttcac + 661 ttttctcttt ttactttcta atcttacatt ttggttcttt tacctacctg ctcttctccc + 721 acatttttgt cattttacta tattttatca tttaatgctt ctaaaatttt gttaattttt + 781 tatttaaata ttctgcattt tttccttcct cacaatcttg ctattttaaa ttatttaata + 841 tcctgtcttt ctctcccaac cccctccctt catttttcct tctctaacaa caactcaaat + 901 tatgcatacc agctctcacc tgctaattct gcacttagaa taatcctttt gtctctccac + 961 atgggtatgg gagaggctcc aactcaaaga tgagaggcat agaatactgt tttagaggct + 1021 ataaatcatt ttacaataag gaataattgg aattttataa attctgtagt aaatggaatg + 1081 gaaaggaaag tgaatatttg attatgaaag actaggcagt tacactggag gtggggcaga + 1141 agtcgttgct aggagacagc ccatcatcac actgattaat caattaattt gtatctatta + 1201 atctgtttat agtaattaat ttgtatatgc tatatacaca tacaaaatta aaactaattt + 1261 ggaattaatt tgtatatagt attatacagc atatatagca tatatgtaca tatatagact + 1321 acatgctagt 
taagtacata gaggatgtgt gtgtatagat atatgttata tgtatgcatt + 1381 catatatgta cttatttatg ctgatgggaa taacctgggg atcagttttg tctaagattt + 1441 gggcagaaaa aaatgggtgt tggctcagtt tctcagaagc cagtctttat ttctctgtta + 1501 accatatgca tgtatctgcc tacctcttct ccgcagctct tgggcaatgt gctggtgtgt + 1561 gtgctggccc gcaactttgg caaggaattc accccacaaa tgcaggctgc ctatcagaag + 1621 gtggtggctg gtgtggctaa tgccctggct cacaagtacc attgagatcc tggactgttt + 1681 cctgataacc ataagaagac cctatttccc tagattctat tttctgaact tgggaacaca + 1741 atgcctactt caagggtatg gcttctgcct aataaagaat gttcagctca acttcctgat +// \ No newline at end of file diff --git a/test/jalview/io/V00505.gb b/test/jalview/io/V00505.gb new file mode 100644 index 0000000..73c5cf4 --- /dev/null +++ b/test/jalview/io/V00505.gb @@ -0,0 +1,83 @@ +LOCUS V00505 1976 bp DNA linear PRI 14-NOV-2006 +DEFINITION Human gene for delta-globin. +ACCESSION V00505 +VERSION V00505.1 GI:30510 +KEYWORDS delta globin; germ line; globin. +SOURCE Homo sapiens (human) + ORGANISM Homo sapiens + Eukaryota; Metazoa; Chordata; Craniata; Vertebrata; Euteleostomi; + Mammalia; Eutheria; Euarchontoglires; Primates; Haplorrhini; + Catarrhini; Hominidae; Homo. +REFERENCE 1 (bases 1 to 1976) + AUTHORS Spritz,R.A., DeRiel,J.K., Forget,B.G. and Weissman,S.M. + TITLE Complete nucleotide sequence of the human delta-globin gene + JOURNAL Cell 21 (3), 639-646 (1980) + PUBMED 7438204 +COMMENT KST HSA.DELGLOBIN. 
+FEATURES Location/Qualifiers + source 1..1976 + /organism="Homo sapiens" + /mol_type="genomic DNA" + /db_xref="taxon:9606" + prim_transcript 123..1763 + exon 123..265 + /number=1 + CDS join(173..265,394..615,1505..1633) + /codon_start=1 + /product="delta globin" + /protein_id="CAA23763.1" + /db_xref="GI:30511" + /db_xref="GDB:119298" + /db_xref="GOA:P02042" + /db_xref="HGNC:4829" + /db_xref="InterPro:IPR000971" + /db_xref="InterPro:IPR002337" + /db_xref="InterPro:IPR009050" + /db_xref="InterPro:IPR012292" + /db_xref="PDB:1SHR" + /db_xref="PDB:1SI4" + /db_xref="UniProtKB/Swiss-Prot:P02042" + /translation="MVHLTPEEKTAVNALWGKVNVDAVGGEALGRLLVVYPWTQRFFESFGDLSSPDAVMGNPKVKAHGKKVLGAFSDGLAHLDNLKGTFSQLSELHCDKLHVDPENFRLLGNVLVCVLARNFGKEFTPQMQAAYQKVVAGVANALAHKYH" + intron 266..393 + /number=1 + exon 394..615 + /number=2 + intron 616..1504 + /number=3 + exon 1505..1763 + /number=3 +ORIGIN + 1 aatgaaggtt catttttcat tctcacaaac taatgaaacc ctgcttatct taaaccaacc + 61 tgctcactgg agcagggagg acaggaccag cataaaaggc agggcagagt cgactgttgc + 121 ttacactttc ttctgacata acagtgttca ctagcaacct caaacagaca ccatggtgca + 181 tctgactcct gaggagaaga ctgctgtcaa tgccctgtgg ggcaaagtga acgtggatgc + 241 agttggtggt gaggccctgg gcaggttggt atcaaggtta taagagaggc tcaaggaggc + 301 aaatggaaac tgggcatgtg tagacagaga agactcttgg gtttctgata ggcactgact + 361 ctctgtccct tgggctgttt tcctaccctc agattactgg tggtctaccc ttggacccag + 421 aggttctttg agtcctttgg ggatctgtcc tctcctgatg ctgttatggg caaccctaag + 481 gtgaaggctc atggcaagaa ggtgctaggt gcctttagtg atggcctggc tcacctggac + 541 aacctcaagg gcactttttc tcagctgagt gagctgcact gtgacaagct gcacgtggat + 601 cctgagaact tcagggtgag tccaggagat gcttcacttt tctcttttta ctttctaatc + 661 ttacattttg gttcttttac ctacctgctc ttctcccaca tttttgtcat tttactatat + 721 tttatcattt aatgcttcta aaattttgtt atttttttat ttaaaaattc tgcatttttt + 781 ccttcctcac aatcttgcta ctctaaatta tttaatatcc tgtctttctc tcccaacccc + 841 ctcccttcat ttttccttct ctaacaacaa ctcaaattat gcataccagc tctcacctgc + 901 taatttcgca cttagaataa 
tccttttgtc tctccacatg ggtatgggag aggctccaac + 961 tcaaagatga gaggcataga atactgtttt agaggctata aatcatttta caataaggaa + 1021 taattggaat tttataaatt ctgtagtaaa tggaatggaa aggaaagtga atatttgatt + 1081 atgaaagact aggcagttac actggaggtg gggcagaagt cgttgctagg agacagccca + 1141 tcatcacact gatttatcaa ttcaatttgt atctattaat ctgtttatag taattaattt + 1201 gtatatgcta tatacacata caaaattaaa actaatttgg aattaatttg tatatagtat + 1261 tatacagcat atatgtacat atatagacta catgctagtt aagtacatag aggatgtgtg + 1321 tgtatagata tatgttatat gtatgcattc atatatgtac ttatttatgc tgatgggaat + 1381 aacctgggga tcagttttgt ctaagatttg ggcagaaaaa aatgggtgtt ggctcagttc + 1441 tcagaagcca gtctttattt ctctgttaac catatgcatg tatctgccta cctcttctcc + 1501 gcagctcttg ggcaatgtgc tggtgtgtgt gctggcccgc aactttggca aggaattcac + 1561 cccacaaatg caggctgcct atcagaaggt ggtggctggt gtggctaatg ccttggctca + 1621 caagtaccat tgagatcctg gactgtttcc tgataaccat aagaagaccc tatttcccta + 1681 gattctattt tctgaacttg ggaacacaat gcctacttca agggtatggc ttctgcctaa + 1741 taaagaatgt tcagctcaac ttcctgatta atttcactta tttcattttt ttgtccaggt + 1801 gtgtaagaag gttcctgagg ctctacagat agggagcact tctttatttt acaaagagta + 1861 catgggaaaa gagaaaagca agggaaccgt acaaggcatt aatgggtgac acttctacct + 1921 ccaaagagca gaaattatca agaactcttg atacaaagat aatactggca ctgcag +// \ No newline at end of file -- 1.7.10.2