* GeneDB ID
*/
public static final String GENEDB = "GeneDB";
-
+ /**
+ * GenBank
+ */
+ public static final String GENBANK = "GenBank";
/**
* List of databases whose sequences might have coding regions annotated
*/
*/
public static final String[] READABLE_FORMATS = new String[]
{ "BLC", "CLUSTAL", "FASTA", "MSF", "PileUp", "PIR", "PFAM", "STH",
- "PDB", "JnetFile", "RNAML" }; // , "SimpleBLAST" };
+ "PDB", "JnetFile", "RNAML", "GENBANK" };
/**
* List of valid format strings for use by callers of the formatSequences
* corresponding to READABLE_FNAMES
*/
public static final String[] READABLE_EXTENSIONS = new String[]
- { "fa, fasta, mfa, fastq", "aln", "pfam", "msf", "pir", "blc", "amsa",
- "jar,jvp", "sto,stk", "xml,rnaml" }; // ".blast"
+ { "fa,faa,fasta,mfa,fastq", "aln", "pfam", "msf", "pir", "blc", "amsa",
+ "jar,jvp", "sto,stk", "xml,rnaml", "gb" }; // ".blast"
/**
* List of readable formats by application in order corresponding to
*/
public static final String[] READABLE_FNAMES = new String[]
{ "Fasta", "Clustal", "PFAM", "MSF", "PIR", "BLC", "AMSA", "Jalview",
- "Stockholm", "RNAML" };// ,
+ "Stockholm", "RNAML", "GenBank" };
// "SimpleBLAST"
// };
{
afile = new RnamlFile(inFile, type);
}
-
+ else if (format.equals("GENBANK"))
+ {
+ afile = new GenBankFile(inFile, type);
+ }
Alignment al = new Alignment(afile.getSeqsAsArray());
afile.addAnnotations(al);
{
afile = new SimpleBlastFile(source);
}
+ else if (format.equals("GENBANK"))
+ {
+ afile = new GenBankFile(source);
+ }
Alignment al = new Alignment(afile.getSeqsAsArray());
{
afile = new RnamlFile();
}
-
+ else if (format.equalsIgnoreCase("GENBANK"))
+ {
+ afile = new GenBankFile();
+ }
else
{
throw new Exception(
--- /dev/null
+package jalview.io;
+
+import jalview.io.xdb.genbank.GenBankFeature;
+import jalview.io.xdb.genbank.GenBankSequence;
+
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Vector;
+
+/**
+ * Static helpers that turn parsed GenBank ORIGIN data into nucleotide strings.
+ */
+public class DnaUtils {
+
+ /**
+ * Returns the nucleotides covered by a CDS feature.
+ *
+ * The range is read from the feature's "range" qualifier and is interpreted
+ * as GenBank coordinates, i.e. 1-based with both ends included.
+ *
+ * @param gbf CDS feature data
+ * @param sequences ORIGIN data
+ * @return nucleotide string of the CDS, or null when the feature is not a
+ *         CDS, has no usable "start..end" range, uses a join(...) location
+ *         (not supported yet) or the range lies outside the sequence
+ */
+ public static String getSequence(GenBankFeature gbf, Vector<GenBankSequence> sequences){
+   if (gbf == null || !GenBankFeature.CDS.equals(gbf.getType())){
+     //If the feature is not a CDS, no sequence is returned
+     return null;
+   }
+   String range = gbf.getQualifier("range");
+   if (range == null){
+     return null;
+   }
+   range = range.trim();
+   if (range.startsWith("join")){
+     //TODO
+     //It's a composed sequence
+     return null;
+   }
+   //It's a simple range. The dots must be escaped: split() takes a regular
+   //expression, and an unescaped ".." matches any two characters.
+   String[] positions = range.split("\\.\\.");
+   if (positions.length != 2){
+     return null;
+   }
+   int initRange;
+   int endRange;
+   try {
+     initRange = Integer.parseInt(positions[0].trim());
+     endRange = Integer.parseInt(positions[1].trim());
+   } catch (NumberFormatException e){
+     //Partial (<, >) or remote positions are not handled here
+     return null;
+   }
+   String sourceSequence = getNucleotidesFromSequenceVector(sequences);
+   if (initRange < 1 || endRange < initRange || endRange > sourceSequence.length()){
+     return null;
+   }
+   //1-based inclusive coordinates -> 0-based, end-exclusive substring
+   return sourceSequence.substring(initRange - 1, endRange);
+ }
+
+ /**
+ * Compares both range ends with the id of an ORIGIN line.
+ * NOTE(review): currently unused; the second comparison looks like it was
+ * meant to test the end of the line rather than its start - confirm.
+ */
+ private static boolean isSequenceInRange(int initRange, int endRange, GenBankSequence gbs){
+   return ((initRange>=gbs.getId()) && (endRange>=gbs.getId()));
+ }
+
+ /**
+ * Placeholder: always returns the empty string.
+ */
+ private static String getNucleotidesInRangeFromSequence(int initRange, int endRange, GenBankSequence gbs){
+   return "";
+ }
+
+ /**
+ * Concatenates every chunk of every ORIGIN line, in order.
+ *
+ * @param v parsed ORIGIN lines
+ * @return all nucleotides as one string (empty when v has no elements)
+ */
+ public static String getNucleotidesFromSequenceVector(Vector<GenBankSequence> v){
+   StringBuilder sb = new StringBuilder();
+   for (GenBankSequence gbs:v){
+     for (String s:gbs.getSequences()){
+       sb.append(s);
+     }
+   }
+   return sb.toString();
+ }
+
+ /**
+ * Does nothing; left over from the IDE template.
+ *
+ * @param args ignored
+ */
+ public static void main(String[] args) {
+ }
+
+}
--- /dev/null
+package jalview.io;
+
+import jalview.datamodel.Alignment;
+import jalview.datamodel.AlignmentI;
+import jalview.datamodel.DBRefEntry;
+import jalview.datamodel.DBRefSource;
+import jalview.datamodel.Mapping;
+import jalview.datamodel.Sequence;
+import jalview.datamodel.SequenceFeature;
+import jalview.datamodel.SequenceI;
+import jalview.io.xdb.genbank.GenBankFeature;
+import jalview.io.xdb.genbank.GenBankLocation;
+import jalview.io.xdb.genbank.GenBankLocationPoint;
+import jalview.io.xdb.genbank.GenBankLocationRange;
+import jalview.io.xdb.genbank.GenBankLocations;
+import jalview.io.xdb.genbank.GenBankLocus;
+import jalview.io.xdb.genbank.GenBankReference;
+import jalview.io.xdb.genbank.GenBankSequence;
+import jalview.io.xdb.genbank.GenBankSource;
+import jalview.io.xdb.genbank.GenBankVersion;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Enumeration;
+import java.util.Hashtable;
+import java.util.List;
+import java.util.Vector;
+import java.util.logging.Level;
+import java.util.logging.Logger;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+import org.apache.james.mime4j.field.ParsedField;
+
+public class GenBankFile extends AlignFile {
+ // java.util.logging logger; used when an unknown location operator is met.
+ private static final Logger log = Logger.getLogger(GenBankFile.class.getName());
+ // Parsed VERSION, LOCUS and SOURCE sections.
+ // NOTE(review): these initialisers run after the superclass constructor
+ // returns. If that constructor already calls parse() (not visible here) the
+ // parsed values are replaced by these empty objects afterwards - confirm.
+ private GenBankVersion version = new GenBankVersion();
+ private GenBankLocus locus = new GenBankLocus();
+ private GenBankSource source = new GenBankSource();
+ // "start..end" and "complement(start..end)"; neither pattern is referenced
+ // by the methods of this class that are visible here.
+ private static final Pattern patLocation = Pattern.compile("(\\d+)\\.\\.(\\d+)");
+ private static final Pattern patLocationComp = Pattern.compile("(complement)\\((\\d+)\\.\\.(\\d+)\\)");
+ // LOCUS line: name, length, strand prefix, molecule type, topology, division, date.
+ // NOTE(review): inside [...] the '|' is a literal character, not an
+ // alternation, so these classes also accept a pipe.
+ private static final Pattern patLocus = Pattern.compile("^LOCUS +([a-z|A-Z|0-9|_]+) +([0-9]+) bp ( {3}|ss\\-|ds\\-|ms\\-)([a-z|A-Z|-|\\s]+) ([a-z| ]{8}) ([A-Z| ]{3}) ([0-9]+-[A-Z]+-[0-9]+)");
+ // Captures the qualifier name between the first '/' and the following '='.
+ private static final Pattern patQualifierKey = Pattern.compile("/(.*?)=");
+ // A feature key: exactly five whitespace characters, the key, then whitespace.
+ private static final Pattern patFeatureKey = Pattern.compile("^\\s{5}([A-Za-z0-9\\_\\']+)\\s+");
+
+ // Header values; each holds its line with only the keyword removed.
+ private String definition;
+ private String accession;
+ private String keywords;
+ private String dblink;
+ private String baseCount;
+
+ // Entries of the FEATURES table and the COMMENT lines, in file order.
+ private Vector<GenBankFeature> features;
+ private Vector<String> comments;
+ //Items under origin
+ private Vector<GenBankSequence> sequences;
+ // REFERENCE blocks, in file order.
+ private Vector<GenBankReference> references;
+
+ // The single sequence built by setEntries() from everything parsed above.
+ private SequenceI genBankSequence;
+
+ /**
+ * No-argument constructor; its body is empty.
+ */
+ public GenBankFile() {
+ }
+
+ /**
+ * Delegates to the superclass constructor with the same arguments.
+ *
+ * @param inFile passed through unchanged
+ * @param type passed through unchanged
+ * @throws IOException propagated from the superclass constructor
+ */
+ public GenBankFile(String inFile, String type) throws IOException {
+ super(inFile, type);
+ }
+
+ /**
+ * Delegates to the superclass constructor with the given source.
+ *
+ * @param source passed through unchanged
+ * @throws IOException propagated from the superclass constructor
+ */
+ public GenBankFile(FileParse source) throws IOException {
+ super(source);
+ }
+
+ /**
+ * Runs the superclass initialisation and then gives every per-record
+ * collection (references, ORIGIN lines, comments, features) a fresh, empty
+ * Vector.
+ */
+ @Override
+ public void initData() {
+   super.initData();
+   references = new Vector<GenBankReference>();
+   sequences = new Vector<GenBankSequence>();
+   comments = new Vector<String>();
+   features = new Vector<GenBankFeature>();
+ }
+
+ /**
+ * Reads the GenBank flat file line by line, fills the header fields,
+ * references, comments, features and ORIGIN lines of this object and finally
+ * calls setEntries() to build the sequence.
+ *
+ * The method is a small state machine: a keyword at the start of a line
+ * switches the current section, and lines without a keyword are treated as a
+ * continuation of whatever is being collected.
+ *
+ * @throws IOException if isValid() reports that the input is not usable, or
+ *           when reading a line fails
+ */
+ public void parse() throws IOException {
+   if (!this.isValid()){
+     //File is not valid
+     throw new IOException("GenBankFile is not valid.");
+   }
+
+   String line;
+   boolean featureMode = false; //FEATURES found
+   boolean seqMode = false; //Parsing Sequences from ORIGIN
+   boolean referenceMode = false; //REFERENCE found
+   boolean sourceMode = false; //SOURCE found
+   boolean commentMode = false; //COMMENT found
+   boolean parsingAuthors = false; //Parsing authors (multiline)
+   boolean parsingDefinition = false; //Parsing definition (multiline)
+   boolean parsingKeywords = false; //Parsing keywords (multiline)
+   boolean parsingDbLink = false; //Parsing DBLINK (multiline)
+   boolean parsingTitle = false; //Parsing title (multiline)
+   boolean parsingQualifier = false; //Parsing feature qualifier (multiline)
+   String currentQualifierName = "";
+   GenBankReference reference = null;
+   GenBankFeature feature = null;
+   //Location text of the feature being read; may span several lines
+   StringBuilder locationText = null;
+   List<String> sourceLines = new ArrayList<String>();
+
+   while ((line = nextLine()) != null) {
+     // We only process lines if they have contents within
+     if (line.length() == 0)
+       continue;
+
+     if (line.startsWith("FEATURES")){
+       featureMode = true;
+       seqMode = false;
+       referenceMode = false;
+       sourceMode = false;
+       commentMode = false;
+       //Without a COMMENT section the last reference is still pending
+       if (reference != null){
+         references.add(reference);
+         reference = null;
+       }
+       feature = new GenBankFeature();
+       locationText = null;
+       if (!sourceLines.isEmpty()){
+         source = parseSource(sourceLines);
+       }
+       continue;
+     }
+
+     if (seqMode) {
+       //Everything after ORIGIN is sequence data except the terminator
+       if (!line.startsWith("//")){
+         sequences.add(processSequenceLine(line));
+       }
+       continue;
+     }
+
+     if (line.startsWith("ORIGIN")){
+       closeFeature(feature, locationText);
+       feature = null;
+       locationText = null;
+       featureMode = false;
+       referenceMode = false;
+       sourceMode = false;
+       seqMode = true;
+       continue;
+     }
+
+     if (featureMode && !line.startsWith("BASE COUNT")){
+       if (!line.trim().startsWith("/")){
+         Matcher featuresMatch = patFeatureKey.matcher(line);
+         if (featuresMatch.find()){
+           //A new feature key: store the previous feature first
+           closeFeature(feature, locationText);
+           feature = new GenBankFeature();
+           //group(1) is the bare key; group(0) would include the padding
+           feature.setType(featuresMatch.group(1));
+           locationText = new StringBuilder(line.substring(featuresMatch.end()).trim());
+           parsingQualifier = false;
+           continue;
+         }else if (parsingQualifier) { //If not a feature, it's another part of a qualifier
+           String qValue = feature.getQualifier(currentQualifierName);
+           feature.updateQualifier(currentQualifierName, (qValue == null ? "" : qValue) + ltrim(line));
+           continue;
+         }else if (locationText != null && Character.isWhitespace(line.charAt(0))){
+           //Indented line before the first qualifier: the location continues
+           locationText.append(line.trim());
+           continue;
+         }
+       }else{
+         //It's the beginning of a qualifier line
+         Matcher matcher = patQualifierKey.matcher(line);
+         if (matcher.find()){
+           currentQualifierName = matcher.group(1).replace("/","");
+           //Take the text after "/name=" as it is, so that '/', '=' or the
+           //qualifier name occurring inside the value are kept
+           feature.addQualifier(currentQualifierName, ltrim(line.substring(matcher.end())));
+           parsingQualifier = true;
+           continue;
+         }
+       }
+     }
+
+     // Process REFERENCE line
+     if (line.startsWith("REFERENCE")) {
+       if (!referenceMode){
+         //This line is the first REFERENCE line
+         referenceMode = true;
+         featureMode = false;
+         sourceMode = false;
+         seqMode = false;
+       }else{
+         //We were at referenceMode, then add current reference to the list and create a new one
+         references.add(reference);
+       }
+       reference = new GenBankReference();
+       String desc = processReferenceLine(line,"REFERENCE");
+       int[] ranges = parseReferenceDescriptor(desc);
+       reference.setDescriptor(desc);
+       reference.setOrder(ranges[0]);
+       reference.setBegin(ranges[1]);
+       reference.setEnd(ranges[2]);
+       parsingAuthors = false;
+       parsingTitle = false;
+       continue;
+     }
+
+     if (line.startsWith(" AUTHORS")){
+       if (referenceMode){
+         reference.setAuthors(processReferenceLine(line,"AUTHORS"));
+         parsingAuthors = true;
+         parsingTitle = false;
+       }
+       continue;
+     }
+     if (line.startsWith(" TITLE")){
+       if (referenceMode){
+         reference.setTitle(processReferenceLine(line,"TITLE"));
+         parsingAuthors = false;
+         parsingTitle = true;
+       }
+       continue;
+     }
+     if (line.startsWith(" JOURNAL")){
+       if (referenceMode){
+         reference.setJournal(processReferenceLine(line,"JOURNAL"));
+         parsingTitle = false;
+         parsingAuthors = false;
+       }
+       continue;
+     }
+     if (line.startsWith(" PUBMED")){
+       if (referenceMode){
+         reference.setPubmed(processReferenceLine(line,"PUBMED"));
+         parsingTitle = false;
+         parsingAuthors = false;
+       }
+       continue;
+     }
+     if (line.startsWith(" MEDLINE")){
+       if (referenceMode){
+         reference.setMedline(processReferenceLine(line,"MEDLINE"));
+         parsingTitle = false;
+         parsingAuthors = false;
+       }
+       continue;
+     }
+     if (line.startsWith(" REMARK")){
+       if (referenceMode){
+         reference.setRemark(processReferenceLine(line,"REMARK"));
+         parsingTitle = false;
+         parsingAuthors = false;
+       }
+       continue;
+     }
+     if (line.startsWith(" CONSRTM")){
+       if (referenceMode){
+         reference.setConsortia(processReferenceLine(line,"CONSRTM"));
+         parsingTitle = false;
+         parsingAuthors = false;
+       }
+       continue;
+     }
+
+     if (line.startsWith("SOURCE")) {
+       parsingKeywords = false;
+       sourceMode = true;
+       commentMode = false;
+       sourceLines.add(line);
+       continue;
+     }
+     if (sourceMode && line.indexOf("ORGANISM")!=-1) {
+       sourceLines.add(line);
+       continue;
+     }
+
+     if (line.startsWith("COMMENT")){
+       if (reference!=null){
+         references.add(reference);
+         //Cleared so the same reference is not stored a second time later
+         reference = null;
+       }
+       commentMode = true;
+       sourceMode = false;
+       referenceMode = false;
+       seqMode = false;
+       comments.add(processCommentLine(line));
+       continue;
+     }
+     // Process LOCUS line
+     if (line.startsWith("LOCUS")) {
+       locus = parseLocus(line);
+       continue;
+     }
+     // Process BASE COUNT line
+     if (line.startsWith("BASE COUNT")) {
+       baseCount = processHeaderLine(line,"BASE COUNT");
+       featureMode = false;
+       continue;
+     }
+     // Process DEFINITION line
+     if (line.startsWith("DEFINITION")) {
+       definition = processHeaderLine(line,"DEFINITION");
+       parsingDefinition = true;
+       continue;
+     }
+     // Process ACCESSION line
+     if (line.startsWith("ACCESSION")) {
+       accession = processHeaderLine(line,"ACCESSION");
+       parsingDefinition = false;
+       continue;
+     }
+     // Process VERSION line
+     if (line.startsWith("VERSION")) {
+       version = parseVersion(line);
+       continue;
+     }
+     // Process DBLINK line
+     if (line.startsWith("DBLINK")) {
+       dblink = processHeaderLine(line,"DBLINK");
+       parsingDbLink = true;
+       continue;
+     }
+     // Process KEYWORDS line
+     if (line.startsWith("KEYWORDS")) {
+       keywords = processHeaderLine(line,"KEYWORDS");
+       parsingKeywords = true;
+       parsingDbLink = false;
+       continue;
+     }
+
+     // No keyword matched: the line continues the section being collected
+     if (sourceMode){
+       sourceLines.add(line);
+       continue;
+     }
+     if (parsingDefinition){
+       definition = definition + line;
+       continue;
+     }
+     if (referenceMode && parsingAuthors){
+       if (reference!=null){
+         reference.setAuthors(reference.getAuthors() + line);
+       }
+       continue;
+     }
+     if (referenceMode && parsingTitle){
+       if (reference!=null){
+         reference.setTitle(reference.getTitle() + line);
+       }
+       continue;
+     }
+     if (parsingKeywords){
+       keywords = keywords + line;
+       continue;
+     }
+     if (parsingDbLink){
+       dblink = dblink + line;
+       continue;
+     }
+     if (commentMode){
+       comments.add(line);
+     }
+   }
+
+   //Input that ends without FEATURES/COMMENT or without ORIGIN still has a
+   //pending reference or feature
+   if (reference != null){
+     references.add(reference);
+   }
+   closeFeature(feature, locationText);
+   setEntries();
+ }
+
+ /**
+ * Finishes the feature currently being read: parses its collected location
+ * text and appends the feature to the feature list. Does nothing for a null
+ * feature or a feature without a type (the placeholder created at FEATURES).
+ *
+ * @param feature feature to store, may be null
+ * @param locationText location expression collected for it, may be null
+ */
+ private void closeFeature(GenBankFeature feature, StringBuilder locationText){
+   if (feature == null || feature.getType() == null){
+     return;
+   }
+   if (locationText != null && locationText.length() > 0){
+     feature.setLocation(parserFeatureLocation(feature, locationText.toString()));
+   }
+   features.add(feature);
+ }
+
+ /**
+ * Maps everything collected by parse() onto the Jalview data model: builds
+ * one Sequence from the ORIGIN data, attaches a GenBank DBRefEntry, stores
+ * the header sections, references and comments as sequence features, converts
+ * the FEATURES table and registers the sequence with setSeqs().
+ */
+ protected void setEntries(){
+   //The stored accession still carries the padding that followed the
+   //keyword, so strip it before using it as an identifier
+   String accessionId = accession == null ? null : accession.trim();
+   genBankSequence = new Sequence(accessionId, DnaUtils.getNucleotidesFromSequenceVector(sequences));
+
+   //Mapping DBRefEntry
+   DBRefEntry dbRef = new DBRefEntry();
+   dbRef.setSource(DBRefSource.GENBANK);
+   dbRef.setVersion(version == null ? "" : version.toString());
+   dbRef.setAccessionId(accessionId);
+   // add map to indicate the sequence is a valid coordinate frame for the dbref
+   dbRef.setMap(new Mapping(null, new int[]
+   { 1, genBankSequence.getLength() }, new int[]
+   { 1, genBankSequence.getLength() }, 1, 1));
+   genBankSequence.addDBRef(dbRef);
+
+   //Header sections become features spanning the whole sequence; the order
+   //of these calls is the order print() writes them in
+   addHeaderFeature("LOCUS", locus == null ? null : locus.toString());
+   addHeaderFeature("DEFINITION", definition);
+   addHeaderFeature("ACCESSION", accession);
+   addHeaderFeature("VERSION", version == null ? null : version.toString());
+   addHeaderFeature("DBLINK", dblink);
+   addHeaderFeature("KEYWORDS", keywords);
+   addHeaderFeature("SOURCE", source == null ? null : source.toString());
+   addHeaderFeature("BASE COUNT", baseCount);
+
+   // add literature and database cross references in the file
+   for (GenBankReference gbRef:references){
+     SequenceFeature refFeature = new SequenceFeature("REFERENCE", gbRef.toString(),null,gbRef.getBegin(),gbRef.getEnd(),DBRefSource.GENBANK);
+     genBankSequence.addSequenceFeature(refFeature);
+   }
+
+   //add COMMENTS as one feature, one comment line per output line
+   if (comments.size()>0){
+     StringBuilder sb = new StringBuilder();
+     for (String comment: comments){
+       sb.append(comment).append(newline);
+     }
+     addHeaderFeature("COMMENT", sb.toString());
+   }
+
+   //Mapping FEATURES
+   for (GenBankFeature feature:features){
+     if (feature.getType()==null){
+       continue;
+     }
+     SequenceFeature sf = new SequenceFeature();
+     sf.setType(feature.getType());
+     sf.setDescription(feature.getType());
+     //A feature without a parsed location is placed at 0..0
+     GenBankLocation location = feature.getLocation();
+     sf.setBegin(location==null ? 0 : location.getMinor());
+     sf.setEnd(location==null ? 0 : location.getMajor());
+     Enumeration<String> names = feature.getQualifiersNames();
+     while (names.hasMoreElements()){
+       String qName = names.nextElement();
+       sf.setValue(qName, feature.getQualifier(qName));
+     }
+     genBankSequence.addSequenceFeature(sf);
+   }
+
+   SequenceI[] parsedSeqs = new SequenceI[1];
+   parsedSeqs[0] = genBankSequence;
+   this.setSeqs(parsedSeqs);
+ }
+
+ /**
+ * Adds a feature of the given type that covers positions 1 to the sequence
+ * length and belongs to the GenBank feature group.
+ *
+ * @param type feature type, e.g. "LOCUS"
+ * @param description feature description; null is stored as the empty string
+ */
+ private void addHeaderFeature(String type, String description){
+   SequenceFeature sf = new SequenceFeature(type, (description == null ? "" : description), null, 1, genBankSequence.getLength(), DBRefSource.GENBANK);
+   genBankSequence.addSequenceFeature(sf);
+ }
+ /**
+ * Parses a VERSION line such as "VERSION U00096.2 GI:48994873".
+ *
+ * The first token after the keyword becomes the version; when a "GI:" entry
+ * follows, the text from "GI:" onwards is stored as the GI. A line without a
+ * GI entry is accepted.
+ *
+ * @param line the complete VERSION line
+ * @return the parsed version, or null when the line holds only the keyword
+ */
+ private GenBankVersion parseVersion(String line) {
+   String rest = line.trim();
+   if (rest.equalsIgnoreCase("VERSION")){
+     return null;
+   }
+   if (rest.regionMatches(true, 0, "VERSION", 0, 7)){
+     rest = rest.substring(7).trim();
+   }
+   GenBankVersion ver = new GenBankVersion();
+   //Split into the first token and whatever follows it
+   String v = rest.split("\\s+", 2)[0];
+   ver.setVersion(v);
+   int posGI = rest.indexOf("GI:", v.length());
+   if (posGI > -1) {
+     ver.setGI(rest.substring(posGI));
+   }
+   return ver;
+ }
+
+ /**
+ * Matches a LOCUS line against patLocus and copies the seven captured groups
+ * (name, length, strand, molecule type, topology, division, date) into a new
+ * GenBankLocus.
+ *
+ * @param line the complete LOCUS line
+ * @return the filled locus, or a locus with no setter called when the line
+ *         does not match the pattern
+ */
+ private GenBankLocus parseLocus(String line){
+   GenBankLocus parsed = new GenBankLocus();
+   Matcher m = patLocus.matcher(line);
+   if (!m.find()) {
+     return parsed;
+   }
+
+   String name = m.group(1);
+   parsed.setName(name != null ? name.trim() : "");
+
+   String length = m.group(2);
+   parsed.setSequenceLength(length != null ? Integer.parseInt(length) : 0);
+
+   String strand = m.group(3);
+   parsed.setStrand(strand != null ? strand : "");
+
+   String moleculeType = m.group(4);
+   parsed.setMoleculeType(moleculeType != null ? moleculeType : "");
+
+   //Anything other than the exact text "linear" counts as not linear
+   parsed.setLinearSequence("linear".equals(m.group(5)));
+
+   String division = m.group(6);
+   parsed.setDivision(division != null ? division : "");
+
+   String date = m.group(7);
+   parsed.setModificationDate(date != null ? date : "");
+
+   return parsed;
+ }
+ /**
+ * Builds the SOURCE section from the lines collected between SOURCE and the
+ * next section.
+ *
+ * The source name is the first line from offset 11 on. When an ORGANISM
+ * keyword is present, the organism is the rest of that line (10 characters
+ * after the start of the keyword) and the taxonomic lineage is all text
+ * after that line with every whitespace character removed.
+ *
+ * @param lines SOURCE line followed by its continuation lines, may be empty
+ * @return the parsed source; nothing is set on it when lines is null or empty
+ */
+ private GenBankSource parseSource(List<String> lines){
+   GenBankSource sou = new GenBankSource();
+   if (lines == null || lines.isEmpty()){
+     return sou;
+   }
+   //Join with a fixed '\n' so that the searches below look for the same
+   //separator that was written, whatever the inherited newline is
+   StringBuilder sb = new StringBuilder();
+   for(String line:lines){
+     sb.append(line).append('\n');
+   }
+   String text = sb.toString();
+
+   int firstLineEnd = text.indexOf('\n');
+   sou.setSource(firstLineEnd > 11 ? text.substring(11, firstLineEnd) : "");
+
+   //Offset 11 keeps the search behind the SOURCE keyword and its padding
+   int organismPos = text.indexOf("ORGANISM", Math.min(11, text.length()));
+   if (organismPos > -1) {
+     //Every line was terminated above, so the end of this line always exists
+     int organismLineEnd = text.indexOf('\n', organismPos);
+     int valueStart = Math.min(organismPos + 10, organismLineEnd);
+     sou.setOrganism(text.substring(valueStart, organismLineEnd));
+     sou.setTaxonomic(text.substring(organismLineEnd).replaceAll("\\s+", ""));
+   }
+   return sou;
+ }
+
+ /**
+ * Parses a feature location expression, recursing into operators and lists,
+ * stores the resulting location on the feature and returns it.
+ *
+ * Location forms, as described for the GenBank feature table:
+ * <ul>
+ * <li>467 - a single base in the presented sequence</li>
+ * <li>340..565 - a continuous range of bases bounded by and including the
+ * starting and ending bases</li>
+ * <li>&lt;345..500 - the exact lower boundary is unknown; the location begins
+ * at some base previous to the first base specified (which need not be
+ * contained in the presented sequence) and continues to and includes the
+ * ending base</li>
+ * <li>&lt;1..888 - starts before the first sequenced base and continues to
+ * and includes base 888</li>
+ * <li>1..&gt;888 - starts at the first sequenced base and continues beyond
+ * base 888</li>
+ * <li>102.110 - the exact location is unknown but is one of the bases between
+ * 102 and 110, inclusive</li>
+ * <li>123^124 - a site between bases 123 and 124</li>
+ * <li>join(12..78,134..202) - regions 12 to 78 and 134 to 202 joined to form
+ * one contiguous sequence</li>
+ * <li>complement(34..126) - starts at the base complementary to 126 and
+ * finishes at the base complementary to 34 (the feature is on the strand
+ * complementary to the presented strand)</li>
+ * <li>complement(join(2691..4571,4918..5163)) - joins the regions, then
+ * complements the joined segments</li>
+ * <li>join(complement(4918..5163),complement(2691..4571)) - complements the
+ * regions, then joins the complemented segments</li>
+ * <li>J00194.1:100..202 - bases 100 to 202, inclusive, in the entry with
+ * primary accession number 'J00194'</li>
+ * <li>join(1..100,J00194.1:100..202) - region 1..100 of the existing entry
+ * joined with region 100..202 of remote entry J00194</li>
+ * </ul>
+ *
+ * @param fea feature that receives the parsed location through setLocation
+ * @param localiza location text; all whitespace in it is ignored
+ * @return the parsed location (the same object that was set on fea)
+ */
+ private GenBankLocation parserFeatureLocation(GenBankFeature fea, String localiza) {
+ // strip spaces, line breaks and any other whitespace
+ String buf = localiza.replaceAll("\\s", "");
+
+ // checks if there is a comma present between ranges
+ // complement(100..110),complement(90..100)
+ char[] buf2 = buf.toCharArray();
+ // abertos = number of currently open parentheses; only commas at depth 0 split
+ int abertos = 0;
+ java.util.List<String> lista = new java.util.ArrayList<String>();
+ int pinicial = 0;
+ for (int i = 0; i < buf2.length; i++) {
+ if (buf2[i] == '(') {
+ abertos++;
+ } else if (buf2[i] == ')') {
+ abertos--;
+ } else if (buf2[i] == ',' && abertos == 0) {
+ lista.add(buf.substring(pinicial, i));
+ pinicial = i + 1;
+ }
+ }
+ if (lista.size() > 0) {
+ // add the part after the last top-level comma, then parse each part
+ lista.add(buf.substring(pinicial));
+ GenBankLocations um = new GenBankLocations();
+ um.setOperator(GenBankLocations.NONE);
+ for (String s : lista) {
+ um.getUnits().add(parserFeatureLocation(fea, s));
+ }
+ fea.setLocation(um);
+ return um;
+ }
+
+ // handle the operators: complement(location,location...),
+ // join(location,location...), order(location,location...)
+ if (buf.contains("(")) {
+ GenBankLocations um = new GenBankLocations();
+ int ini = buf.indexOf("(");
+ // NOTE(review): fim is -1 when no ')' is present (e.g. truncated text), which
+ // makes the substring calls below throw
+ int fim = buf.lastIndexOf(")");
+ String token = buf.substring(0, ini);
+ if ("complement".equalsIgnoreCase(token)) {
+ String inter = buf.substring(ini + 1, fim);
+ GenBankLocation interno = parserFeatureLocation(fea, inter);
+ interno.setComplement(true);
+ um.setOperator(GenBankLocations.COMPLEMENT);
+ um.getUnits().add(interno);
+ fea.setLocation(um);
+ } else if ("join".equalsIgnoreCase(token)) {
+ String inter = buf.substring(ini + 1, fim);
+ GenBankLocation interno = parserFeatureLocation(fea, inter);
+ um.setOperator(GenBankLocations.JOIN);
+ um.getUnits().add(interno);
+ fea.setLocation(um);
+ } else if ("order".equalsIgnoreCase(token)) {
+ String inter = buf.substring(ini + 1, fim);
+ GenBankLocation interno = parserFeatureLocation(fea, inter);
+ um.setOperator(GenBankLocations.ORDER);
+ um.getUnits().add(interno);
+ fea.setLocation(um);
+ } else {
+ // unknown operator: warn and parse the text between the parentheses as is
+ log.log(Level.WARNING, "Token desconhecido em location/features - {0}", token);
+ String inter = buf.substring(ini + 1, fim);
+ fea.setLocation(parserFeatureLocation(fea, inter));
+ }
+ return fea.getLocation();
+ } else {
+ // handle a comma-separated list of locations
+ if (buf.contains(",")) {
+ String[] partes = buf.split(",");
+ GenBankLocations um = new GenBankLocations();
+ for (String p : partes) {
+ um.getUnits().add(
+ parserFeatureLocation(fea, p));
+ }
+ fea.setLocation(um);
+ return um;
+ } else {
+ // handle a range
+ if (buf.contains("..")) {
+ String[] partes = buf.split("\\.\\.");
+ GenBankLocationRange range = new GenBankLocationRange();
+ if (buf.contains(":")) {
+ // an "entry:" prefix names a remote entry; strip it and remember it on the range
+ for (int i = 0; i < partes.length; i++) {
+ int pos = partes[i].indexOf(":");
+ if (pos > 0) {
+ String entry = partes[i].substring(0, pos);
+ partes[i] = partes[i].substring(pos + 1);
+ range.setEntry(entry);
+ }
+ }
+ }
+ // NOTE(review): partes has fewer than two elements when the text starts or
+ // ends with "..", in which case the accesses below throw
+ GenBankLocationPoint gp0 = (GenBankLocationPoint) parserFeatureLocation(fea, partes[0]);
+ range.setStart(gp0);
+ GenBankLocationPoint gp1 = (GenBankLocationPoint) parserFeatureLocation(fea, partes[1]);
+ range.setEnd(gp1);
+ fea.setLocation(range);
+ return range;
+ } else {
+ // handle a single point
+ // forms considered:
+ // 467
+ // 102.110
+ // 123^124
+ // <345
+ // >400
+ // 345>
+ // 400<
+ // or a combination of these
+ GenBankLocationPoint gp = new GenBankLocationPoint();
+ if (buf.contains(":")) {
+ int pos = buf.indexOf(":");
+ if (pos > 0) {
+ String entry = buf.substring(0, pos);
+ buf = buf.substring(pos + 1);
+ gp.setEntry(entry);
+ }
+ }
+ int pos = 0;
+ // check for the symbols < and > before the first number
+ // NOTE(review): charAt(0) throws when buf is empty
+ if (buf.charAt(pos) == '<' || buf.charAt(pos) == '>') {
+ gp.setPrefix(buf.charAt(pos));
+ pos++;
+ }
+ // read the first number
+ int ini = pos;
+ while (pos < buf.length() && buf.charAt(pos) >= '0'
+ && buf.charAt(pos) <= '9') {
+ pos++;
+ }
+ // no digits found: the original text is printed to standard output and
+ // the parseInt below then fails on the empty string
+ if (buf.subSequence(ini, pos).length() < 1) {
+ System.out.println(localiza);
+ }
+ int num = Integer.parseInt(buf.substring(ini, pos));
+ int num2 = num;
+ // the first number may be the only number
+ if (pos < buf.length()) {
+ // check for the signs < and > after the first number
+ if (buf.charAt(pos) == '<' || buf.charAt(pos) == '>') {
+ if (buf.contains(".") || buf.contains("^")) {
+ gp.setPrefix(buf.charAt(pos));
+ } else {
+ gp.setSufix(buf.charAt(pos));
+ }
+ pos++;
+ }
+
+ // check for the separator between the numbers, . or ^
+ if (pos < buf.length()
+ && (buf.charAt(pos) == '.' || buf.charAt(pos) == '^')) {
+ // separator found, a second number may follow
+ gp.setSymbol(buf.charAt(pos));
+ pos++;
+
+ // check for the symbols < and > before the second number
+ // NOTE(review): pos is not range-checked here; text ending in the
+ // separator makes charAt throw
+ if (buf.charAt(pos) == '<' || buf.charAt(pos) == '>') {
+ gp.setSufix(buf.charAt(pos));
+ pos++;
+ }
+
+ // read the second number
+ ini = pos;
+ while (pos < buf.length() && buf.charAt(pos) >= '0'
+ && buf.charAt(pos) <= '9') {
+ pos++;
+ }
+ num2 = Integer.parseInt(buf.substring(ini, pos));
+
+ // check for the symbols < and > after the second number
+ if (pos < buf.length() && (buf.charAt(pos) == '<' || buf.charAt(pos) == '>')) {
+ gp.setSufix(buf.charAt(pos));
+ pos++;
+ }
+ }
+ }
+ gp.setMin(num);
+ gp.setMax(num2);
+ fea.setLocation(gp);
+ return gp;
+ }
+ }
+ }
+ }
+
+ // Reference number, optionally followed by "(bases A to B" or "(residues A to B".
+ private static final Pattern patReferenceDescriptor = Pattern.compile("^\\s*(\\d+)(?:\\s*\\(\\s*(?:bases|residues)\\s+(\\d+)\\s+to\\s+(\\d+))?");
+
+ /**
+ * Extracts the reference number and the first base range from the text that
+ * follows the REFERENCE keyword, e.g. "1 (bases 1 to 1609)".
+ *
+ * Descriptors without a range (a bare number, or "2 (sites)") are accepted;
+ * when several ranges are listed only the first one is used.
+ *
+ * @param descriptor REFERENCE line without the keyword
+ * @return three values: reference number, range begin, range end; begin and
+ *         end are 0 when no range is given, and all three are 0 when the
+ *         text does not start with a number
+ */
+ private int[] parseReferenceDescriptor(String descriptor){
+   int[] resultado = new int[3];
+   if (descriptor == null){
+     return resultado;
+   }
+   Matcher m = patReferenceDescriptor.matcher(descriptor);
+   if (m.find()){
+     resultado[0] = Integer.parseInt(m.group(1));
+     //Groups 2 and 3 are only set together, when a range is present
+     if (m.group(2) != null){
+       resultado[1] = Integer.parseInt(m.group(2));
+       resultado[2] = Integer.parseInt(m.group(3));
+     }
+   }
+   return resultado;
+ }
+ /**
+ * Removes the first occurrence of a reference keyword (REFERENCE, AUTHORS,
+ * TITLE, ...) from a line. Only the first occurrence is removed so that the
+ * same word appearing later in the text is kept.
+ *
+ * @param line line of a REFERENCE block
+ * @param component keyword to remove
+ * @return the line without the keyword; surrounding whitespace is kept, and
+ *         the line is returned unchanged when the keyword is absent
+ */
+ private String processReferenceLine(String line, String component){
+   int init = line.indexOf(component);
+   if (init == -1){
+     return line;
+   }
+   return line.substring(0, init) + line.substring(init + component.length());
+ }
+ /**
+ * Removes the first occurrence of a header keyword (DEFINITION, ACCESSION,
+ * ...) from a line. Only the first occurrence is removed so that the same
+ * word appearing later in the value is kept.
+ *
+ * @param line header line
+ * @param header keyword to remove
+ * @return the line without the keyword; the padding that followed the
+ *         keyword is kept, and the line is returned unchanged when the
+ *         keyword is absent
+ */
+ private String processHeaderLine(String line, String header){
+   int init = line.indexOf(header);
+   if (init == -1){
+     return line;
+   }
+   return line.substring(0, init) + line.substring(init + header.length());
+ }
+
+ /**
+ * Parses one ORIGIN data line: the first token is the position of the first
+ * base on the line and becomes the id, every following token is one chunk of
+ * nucleotides. Tokens may be separated by any amount of whitespace.
+ *
+ * @param line ORIGIN data line
+ * @return the parsed line; for a line holding only whitespace no id is set
+ *         and the chunk list is empty
+ * @throws NumberFormatException if the first token is not a number
+ */
+ private GenBankSequence processSequenceLine(String line) {
+   GenBankSequence gbs = new GenBankSequence();
+   Vector<String> seqs = new Vector<String>();
+   String[] args = line.trim().split("\\s+");
+   //Splitting an empty string yields one empty token
+   if (args[0].length() > 0){
+     gbs.setId(Integer.parseInt(args[0]));
+     for (int i=1;i<args.length;i++){
+       seqs.add(args[i]);
+     }
+   }
+   gbs.setSequences(seqs);
+   return gbs;
+ }
+
+ /**
+ * Removes the first occurrence of the COMMENT keyword from a line. Only the
+ * first occurrence is removed so that the word appearing later in the
+ * comment text is kept.
+ *
+ * @param line first line of the COMMENT section
+ * @return the line without the keyword; the padding that followed it is
+ *         kept, and the line is returned unchanged when the keyword is absent
+ */
+ private String processCommentLine(String line){
+   int init = line.indexOf("COMMENT");
+   if (init == -1){
+     return line;
+   }
+   return line.substring(0, init) + line.substring(init + "COMMENT".length());
+ }
+ /**
+ * Removes trailing whitespace, as defined by Character.isWhitespace.
+ *
+ * @param s text to trim
+ * @return s without its trailing whitespace; the empty string when s holds
+ *         nothing but whitespace
+ */
+ public String rtrim(String s) {
+   int end = s.length();
+   while (end > 0 && Character.isWhitespace(s.charAt(end - 1))) {
+     end--;
+   }
+   return s.substring(0, end);
+ }
+
+ /**
+ * Removes leading whitespace, as defined by Character.isWhitespace.
+ *
+ * @param s text to trim
+ * @return s without its leading whitespace; the empty string when s holds
+ *         nothing but whitespace
+ */
+ public String ltrim(String s) {
+   final int length = s.length();
+   for (int start = 0; start < length; start++) {
+     if (!Character.isWhitespace(s.charAt(start))) {
+       return s.substring(start);
+     }
+   }
+   return s.substring(length);
+ }
+
+ /**
+ * Writes every sequence as a GenBank-style record: the header sections stored
+ * as features (in the order they are stored), the remaining features as the
+ * FEATURES table with their qualifiers, the ORIGIN block and the "//"
+ * terminator on a line of its own.
+ *
+ * @return the text of all records
+ */
+ public String print(){
+   StringBuffer out = new StringBuffer();
+   for (SequenceI seq: this.getSeqs()){
+     SequenceFeature[] seqFeatures = seq.getSequenceFeatures();
+     boolean featureLinePrinted = false;
+     //Guard in case a sequence carries no feature array at all
+     if (seqFeatures != null){
+       for(SequenceFeature sf:seqFeatures){
+         String type = sf.getType();
+         if (type == null){
+           continue;
+         }
+         if ("LOCUS".equals(type) || "REFERENCE".equals(type)){
+           //These two are written from their description alone
+           out.append(sf.getDescription()).append(newline);
+         }else if ("DEFINITION".equals(type)){
+           out.append("DEFINITION ").append(sf.getDescription()).append(newline);
+         }else if ("VERSION".equals(type)){
+           out.append("VERSION ").append(sf.getDescription()).append(newline);
+         }else if ("ACCESSION".equals(type)){
+           out.append("ACCESSION ").append(sf.getDescription()).append(newline);
+         }else if ("DBLINK".equals(type)){
+           out.append("DBLINK ").append(sf.getDescription()).append(newline);
+         }else if ("KEYWORDS".equals(type)){
+           out.append("KEYWORDS ").append(sf.getDescription()).append(newline);
+         }else if ("SOURCE".equals(type)){
+           out.append("SOURCE ").append(sf.getDescription()).append(newline);
+         }else if ("COMMENT".equals(type)){
+           out.append("COMMENT ").append(sf.getDescription()).append(newline);
+         }else if ("BASE COUNT".equals(type)){
+           out.append("BASE COUNT ").append(sf.getDescription()).append(newline);
+         }else{
+           //Anything else is an entry of the feature table
+           if (!featureLinePrinted){
+             out.append("FEATURES Location/Qualifiers").append(newline);
+             featureLinePrinted = true;
+           }
+           out.append(" ").append(type).append(" ").append(sf.getBegin()).append("..").append(sf.getEnd()).append(newline);
+           Hashtable<String,String> qualifiers = sf.otherDetails;
+           if (qualifiers!=null){
+             Enumeration<String> keys = qualifiers.keys();
+             while (keys.hasMoreElements()){
+               String key = keys.nextElement();
+               String value = qualifiers.get(key);
+               if (value!=null){
+                 out.append(" /").append(key).append("=").append(value).append(newline);
+               }
+             }
+           }
+         }
+       }
+     }
+     out.append("ORIGIN").append(newline);
+     appendOrigin(out, seq.getSequenceAsString());
+     //Terminator on its own line so that a following record starts cleanly
+     out.append("//").append(newline);
+   }
+   return out.toString();
+ }
+
+ /**
+ * Appends the sequence as ORIGIN lines of up to 60 bases: each line starts
+ * with the 1-based position of its first base, followed by groups of up to
+ * 10 bases separated by single spaces. Nothing is written for an empty
+ * sequence.
+ *
+ * @param out buffer to append to
+ * @param sequenceString the bases to write
+ */
+ private void appendOrigin(StringBuffer out, String sequenceString){
+   int total = sequenceString.length();
+   for (int lineStart = 0; lineStart < total; lineStart += 60){
+     int lineEnd = Math.min(lineStart + 60, total);
+     out.append(" ").append(lineStart + 1).append(" ");
+     for (int groupStart = lineStart; groupStart < lineEnd; groupStart += 10){
+       int groupEnd = Math.min(groupStart + 10, lineEnd);
+       out.append(sequenceString, groupStart, groupEnd);
+       if (groupEnd < lineEnd){
+         out.append(" ");
+       }
+     }
+     out.append(newline);
+   }
+ }
+}
}
data = data.toUpperCase();
+ if ((data.indexOf("LOCUS") > -1))
+ {
+ reply = "GENBANK";
+
+ break;
+ }
+
if ((data.indexOf("# STOCKHOLM") > -1))
{
reply = "STH";
--- /dev/null
+package jalview.io.xdb.genbank;
+
+import java.util.Enumeration;
+import java.util.Hashtable;
+
+/**
+ * One feature record: a feature key (its type), a location, and a table of
+ * qualifier name/value pairs.
+ */
+public class GenBankFeature {
+  // Feature key names offered as constants for callers.
+  public static final String MISC_TYPE = "misc_feature";
+  public static final String SOURCE = "source";
+  public static final String CDS = "CDS";
+  public static final String GENE = "gene";
+  public static final String EXON = "exon";
+  public static final String INTRON = "intron";
+  public static final String PRIM_TRANSCRIPT = "prim_transcript";
+  public static final String mRNA = "mRNA";
+  public static final String MOBILE_ELEMENT = "mobile_element";
+  public static final String VARIATION = "variation";
+
+  /** Feature key of this record; stays null until a type is supplied. */
+  private String type;
+  /** Qualifier values indexed by qualifier name. */
+  private Hashtable<String, String> qualifiers = new Hashtable<String, String>();
+  /** Where the feature sits; stays null until a location is supplied. */
+  private GenBankLocation location;
+
+  /** Creates a feature with no type, no location and no qualifiers. */
+  public GenBankFeature() {
+  }
+
+  /**
+   * Creates a feature of the given type with no location and no qualifiers.
+   *
+   * @param type the feature key
+   */
+  public GenBankFeature(String type) {
+    this.type = type;
+  }
+
+  /**
+   * Stores a qualifier, replacing any value already held under the same name.
+   *
+   * @param key qualifier name
+   * @param value qualifier value
+   * @throws NullPointerException if key or value is null (Hashtable rejects both)
+   */
+  public void addQualifier(String key, String value) {
+    qualifiers.put(key, value);
+  }
+
+  /**
+   * Drops the current value of a qualifier and then stores the new one.
+   *
+   * @param key qualifier name
+   * @param newValue replacement value
+   */
+  public void updateQualifier(String key, String newValue) {
+    // The removal deliberately happens before the insertion.
+    qualifiers.remove(key);
+    qualifiers.put(key, newValue);
+  }
+
+  /**
+   * @param key qualifier name
+   * @return the stored value, or null when the qualifier is absent
+   */
+  public String getQualifier(String key) {
+    return qualifiers.get(key);
+  }
+
+  /**
+   * @return the feature key
+   */
+  public String getType() {
+    return this.type;
+  }
+
+  /**
+   * @param type the feature key to set
+   */
+  public void setType(String type) {
+    this.type = type;
+  }
+
+  /**
+   * @return an enumeration over the names of all stored qualifiers
+   */
+  public Enumeration<String> getQualifiersNames() {
+    return qualifiers.keys();
+  }
+
+  /**
+   * @return how many qualifiers are stored
+   */
+  public int getQualifiersSize() {
+    return qualifiers.size();
+  }
+
+  /**
+   * @return the qualifier table itself (not a copy)
+   */
+  public Hashtable<String, String> getFields() {
+    return this.qualifiers;
+  }
+
+  /**
+   * @return the location, or null when none has been set
+   */
+  public GenBankLocation getLocation() {
+    return this.location;
+  }
+
+  /**
+   * @param location the location to set
+   */
+  public void setLocation(GenBankLocation location) {
+    this.location = location;
+  }
+}
--- /dev/null
+package jalview.io.xdb.genbank;
+
+/**
+ * The location contains at least one sequence location descriptor and may
+ * contain one or more operators with one or more sequence location descriptors.
+ * Base numbers refer to the numbering in the entry. This numbering designates
+ * the first base (5' end) of the presented sequence as base 1.
+ * Base locations beyond the range of the presented sequence may not be used in
+ * location descriptors, the only exception being location in a remote entry (see
+ * 3.5.2.1, e).
+ *
+ * Location operators and descriptors are discussed in more detail below.
+ *
+ * 3.5.2.1 Location descriptors
+ * The location descriptor can be one of the following:
+ * (a) a single base number
+ * (b) a site between two indicated adjoining bases
+ * (c) a single base chosen from within a specified range of bases (not allowed for new
+ * entries)
+ * (d) the base numbers delimiting a sequence span
+ * (e) a remote entry identifier followed by a local location descriptor
+ * (i.e., a-d)
+ *
+ * A site between two adjoining nucleotides, such as endonucleolytic cleavage
+ * site, is indicated by listing the two points separated by a caret (^). The
+ * permitted formats for this descriptor are n^n+1 (for example 55^56), or, for
+ * circular molecules, n^1, where "n" is the full length of the molecule, ie
+ * 1000^1 for circular molecule with length 1000.
+ *
+ * A single base chosen from a range of bases is indicated by the first base
+ * number and the last base number of the range separated by a single period
+ * (e.g., '12.21' indicates a single base taken from between the indicated
+ * points). From October 2006 the usage of this descriptor is restricted :
+ * it is illegal to use "a single base from a range" (c) either on its own or
+ * in combination with the "sequence span" (d) descriptor for newly created entries.
+ * The existing entries where such descriptors exist are going to be retrofitted.
+ *
+ * Sequence spans are indicated by the starting base number and the ending base
+ * number separated by two periods (e.g., '34..456'). The '<' and '>' symbols may
+ * be used with the starting and ending base numbers to indicate that an end
+ * point is beyond the specified base number. The starting and ending base
+ * positions can be represented as distinct base numbers ('34..456') or a site
+ * between two indicated adjoining bases.
+ *
+ * A location in a remote entry (not the entry to which the feature table
+ * belongs) can be specified by giving the accession-number and sequence version
+ * of the remote entry, followed by a colon ":", followed by a location
+ * descriptor which applies to that entry's sequence (i.e. J12345.1:1..15, see
+ * also examples below)
+ *
+ * 3.5.2.2 Operators
+ *
+ * The location operator is a prefix that specifies what must be done to the
+ * indicated sequence to find or construct the location corresponding to the
+ * feature. A list of operators is given below with their definitions and most
+ * common format.
+ *
+ * complement(location)
+ * Find the complement of the presented sequence in the span specified by "
+ * location" (i.e., read the complement of the presented strand in its 5'-to-3'
+ * direction)
+ *
+ * join(location,location, ... location)
+ * The indicated elements should be joined (placed end-to-end) to form one
+ * contiguous sequence
+ *
+ * order(location,location, ... location)
+ * The elements can be found in the
+ * specified order (5' to 3' direction), but nothing is implied about the
+ * reasonableness about joining them
+ *
+ * Note : location operator "complement" can be used in combination with either "
+ * join" or "order" within the same location; combinations of "join" and "order"
+ * within the same location (nested operators) are illegal.
+ *
+ * 3.5.3 Location examples
+ *
+ * The following is a list of common location descriptors with their meanings:
+ * Location Description
+ * 467 Points to a single base in the presented sequence
+ * 340..565 Points to a continuous range of bases bounded by and
+ * including the starting and ending bases
+ * <345..500 Indicates that the exact lower boundary point of a feature
+ * is unknown. The location begins at some base previous to
+ * the first base specified (which need not be contained in
+ * the presented sequence) and continues to and includes the
+ * ending base
+ * <1..888 The feature starts before the first sequenced base and
+ * continues to and includes base 888
+ * 1..>888 The feature starts at the first sequenced base and
+ * continues beyond base 888
+ * 102.110 Indicates that the exact location is unknown but that it is
+ * one of the bases between bases 102 and 110, inclusive
+ * 123^124 Points to a site between bases 123 and 124
+ * join(12..78,134..202) Regions 12 to 78 and 134 to 202 should be joined to form
+ * one contiguous sequence
+ * complement(34..126) Start at the base complementary to 126 and finish at the
+ * base complementary to base 34 (the feature is on the strand
+ * complementary to the presented strand)
+ * complement(join(2691..4571,4918..5163))
+ * Joins regions 2691 to 4571 and 4918 to 5163, then
+ * complements the joined segments (the feature is on the
+ * strand complementary to the presented strand)
+ * join(complement(4918..5163),complement(2691..4571))
+ * Complements regions 4918 to 5163 and 2691 to 4571, then
+ * joins the complemented segments (the feature is on the
+ * strand complementary to the presented strand)
+ * J00194.1:100..202 Points to bases 100 to 202, inclusive, in the entry (in
+ * this database) with primary accession number 'J00194'
+ * join(1..100,J00194.1:100..202)
+ * Joins region 1..100 of the existing entry with the region
+ * 100..202 of remote entry J00194
+ *
+ *
+ */
+public abstract class GenBankLocation {
+  /** Whether this location is flagged as lying on the complement strand. */
+  private boolean complement;
+
+  /** Creates a location that is not flagged as complement. */
+  public GenBankLocation() {
+  }
+
+  /**
+   * The minor location in the genome sequence, as defined by the subclass.
+   *
+   * @return position
+   */
+  public abstract int getMinor();
+
+  /**
+   * The major location in the genome sequence, as defined by the subclass.
+   *
+   * @return position
+   */
+  public abstract int getMajor();
+
+  /**
+   * Tells whether the complement flag is set.
+   *
+   * @return true when the location was marked as complement
+   */
+  public boolean isComplement() {
+    return this.complement;
+  }
+
+  /**
+   * Sets or clears the complement flag.
+   *
+   * @param complement the new flag value
+   */
+  public void setComplement(boolean complement) {
+    this.complement = complement;
+  }
+}
\ No newline at end of file
--- /dev/null
+package jalview.io.xdb.genbank;
+
+/**
+ * A location made of one or two base numbers, with an optional separator
+ * symbol between them and optional characters before and after.
+ *
+ * <p>{@link #toString()} renders the descriptor as text. The remote entry
+ * identifier is stored but is not part of that text.</p>
+ */
+public class GenBankLocationPoint extends GenBankLocation {
+  /** Remote entry identifier; kept for callers, not rendered by toString. */
+  private String entry;
+  /** Character written before the numbers; 0 or ' ' means none. Presumably '<'. */
+  private char prefix = 0;
+  /** First (lower) base number. */
+  private int min = 0;
+  /** Separator between min and max; '.' and '^' are written verbatim. */
+  private char symbol = 0;
+  /** Second (upper) base number. */
+  private int max = 0;
+  /** Character written after the numbers; 0 or ' ' means none. Presumably '>'. */
+  private char sufix = 0;
+
+  /** Creates a point at position 0 with no prefix, symbol or suffix. */
+  public GenBankLocationPoint() {
+  }
+
+  /**
+   * Creates a single-base location.
+   *
+   * @param point base number used for both min and max
+   */
+  public GenBankLocationPoint(int point) {
+    this.min = point;
+    this.max = point;
+  }
+
+  /**
+   * Creates a location delimited by two base numbers.
+   *
+   * @param min first base number
+   * @param max second base number
+   */
+  public GenBankLocationPoint(int min, int max) {
+    this.min = min;
+    this.max = max;
+  }
+
+  /**
+   * @return the first base number
+   */
+  @Override
+  public int getMinor() {
+    return this.min;
+  }
+
+  /**
+   * @return the second base number
+   */
+  @Override
+  public int getMajor() {
+    return this.max;
+  }
+
+  /**
+   * Renders this descriptor as text.
+   *
+   * <ul>
+   * <li>symbol '.' or '^': {@code min}, the symbol, {@code max} (e.g. 102.110, 123^124)</li>
+   * <li>no such symbol and min != max: a span, {@code min..max}</li>
+   * <li>otherwise: the single base number</li>
+   * </ul>
+   * A prefix or suffix other than 0 or ' ' is written around the result.
+   *
+   * @return the textual descriptor
+   */
+  @Override
+  public String toString() {
+    StringBuilder sb = new StringBuilder();
+    if (prefix != 0 && prefix != ' ') {
+      sb.append(prefix);
+    }
+    // Numbers are appended directly so the digits never depend on the
+    // default locale, which String.format("%d") would.
+    if (symbol == '.' || symbol == '^') {
+      sb.append(min).append(symbol).append(max);
+    } else if (min != max) {
+      // A single period means "one base from this range", which only applies
+      // when symbol is '.'. An unmarked pair of numbers is a span: two periods.
+      sb.append(min).append("..").append(max);
+    } else {
+      sb.append(min);
+    }
+    if (sufix != 0 && sufix != ' ') {
+      sb.append(sufix);
+    }
+    return sb.toString();
+  }
+
+  /**
+   * @return the prefix
+   */
+  public char getPrefix() {
+    return prefix;
+  }
+
+  /**
+   * @param prefix the prefix to set
+   */
+  public void setPrefix(char prefix) {
+    this.prefix = prefix;
+  }
+
+  /**
+   * @return the min
+   */
+  public int getMin() {
+    return min;
+  }
+
+  /**
+   * @param min the min to set
+   */
+  public void setMin(int min) {
+    this.min = min;
+  }
+
+  /**
+   * @return the symbol
+   */
+  public char getSymbol() {
+    return symbol;
+  }
+
+  /**
+   * @param symbol the symbol to set
+   */
+  public void setSymbol(char symbol) {
+    this.symbol = symbol;
+  }
+
+  /**
+   * @return the max
+   */
+  public int getMax() {
+    return max;
+  }
+
+  /**
+   * @param max the max to set
+   */
+  public void setMax(int max) {
+    this.max = max;
+  }
+
+  /**
+   * @return the sufix
+   */
+  public char getSufix() {
+    return sufix;
+  }
+
+  /**
+   * @param sufix the sufix to set
+   */
+  public void setSufix(char sufix) {
+    this.sufix = sufix;
+  }
+
+  /**
+   * @return the entry
+   */
+  public String getEntry() {
+    return entry;
+  }
+
+  /**
+   * @param entry the entry to set
+   */
+  public void setEntry(String entry) {
+    this.entry = entry;
+  }
+
+}
--- /dev/null
+package jalview.io.xdb.genbank;
+
+/**
+ * A location bounded by a start point and an end point, optionally qualified
+ * by a remote entry identifier that is written as an {@code entry:} prefix.
+ */
+public class GenBankLocationRange extends GenBankLocation {
+  /** Remote entry identifier; null when none is set. */
+  private String entry = null;
+  /** First bound; null until set. */
+  private GenBankLocationPoint start = null;
+  /** Second bound; null until set. */
+  private GenBankLocationPoint end = null;
+
+  /** Creates a range with no entry and no bounds. */
+  public GenBankLocationRange() {
+  }
+
+  /**
+   * @return the minor position of the start point, or 0 when no start is set
+   */
+  @Override
+  public int getMinor() {
+    return start == null ? 0 : start.getMinor();
+  }
+
+  /**
+   * @return the major position of the end point, or 0 when no end is set
+   */
+  @Override
+  public int getMajor() {
+    return end == null ? 0 : end.getMajor();
+  }
+
+  /**
+   * Renders the range as {@code [entry:]start[..end]}.
+   *
+   * <p>The end is left out when it is absent or is the same point as the
+   * start. When only the end is set it is written on its own, with no leading
+   * "..". That case used to throw a NullPointerException.</p>
+   *
+   * @return the textual descriptor; empty when nothing is set
+   */
+  @Override
+  public String toString() {
+    StringBuilder sb = new StringBuilder();
+    if (entry != null) {
+      sb.append(entry);
+      sb.append(":");
+    }
+    GenBankLocationPoint first = getStart();
+    GenBankLocationPoint last = getEnd();
+    if (first != null) {
+      sb.append(first.toString());
+    }
+    // equals() is only consulted when a start exists; calling it on a null
+    // start was the cause of the NullPointerException.
+    boolean distinctEnd = last != null && last != first
+        && (first == null || !first.equals(last));
+    if (distinctEnd) {
+      if (first != null) {
+        sb.append("..");
+      }
+      sb.append(last.toString());
+    }
+    return sb.toString();
+  }
+
+  /**
+   * @return the entry
+   */
+  public String getEntry() {
+    return entry;
+  }
+
+  /**
+   * @param entry the entry to set
+   */
+  public void setEntry(String entry) {
+    this.entry = entry;
+  }
+
+  /**
+   * @return the start
+   */
+  public GenBankLocationPoint getStart() {
+    return start;
+  }
+
+  /**
+   * @param start the start to set
+   */
+  public void setStart(GenBankLocationPoint start) {
+    this.start = start;
+  }
+
+  /**
+   * @return the end
+   */
+  public GenBankLocationPoint getEnd() {
+    return end;
+  }
+
+  /**
+   * @param end the end to set
+   */
+  public void setEnd(GenBankLocationPoint end) {
+    this.end = end;
+  }
+
+}
--- /dev/null
+package jalview.io.xdb.genbank;
+
+/**
+ * A compound location: a list of sub-locations combined by one operator
+ * (none, complement, join or order).
+ *
+ * @author Dieval Guizelini
+ */
+public class GenBankLocations extends GenBankLocation {
+  /** No operator: units are written as a plain comma-separated list (default). */
+  public static final int NONE = 1;
+  /** Units are wrapped in {@code complement(...)}. */
+  public static final int COMPLEMENT = 2;
+  /** Units are wrapped in {@code join(...)}. */
+  public static final int JOIN = 3;
+  /** Units are wrapped in {@code order(...)}: a set whose order is unknown. */
+  public static final int ORDER = 4;
+
+  /** Operator applied to the units; one of the constants above. */
+  private int operator = NONE;
+  /** Sub-locations in the order they are written; may be null after setUnits(null). */
+  private java.util.List<GenBankLocation> units;
+
+  /** Creates an empty compound location with operator {@link #NONE}. */
+  public GenBankLocations() {
+    units = new java.util.ArrayList<GenBankLocation>();
+  }
+
+  /**
+   * Sets the complement flag on this location and on every unit it holds.
+   *
+   * <p>Setting the flag switches the operator to {@link #COMPLEMENT}. Clearing
+   * it resets a COMPLEMENT operator to {@link #NONE} and leaves any other
+   * operator alone. Previously clearing the flag still forced COMPLEMENT.</p>
+   *
+   * @param complement the new flag value
+   */
+  @Override
+  public void setComplement(boolean complement) {
+    super.setComplement(complement);
+    if (complement) {
+      // NOTE(review): this replaces a JOIN/ORDER operator, so a complemented
+      // join is written as complement(a,b) rather than complement(join(a,b)).
+      // Kept as-is because callers may rely on getOperator() == COMPLEMENT.
+      this.operator = COMPLEMENT;
+    } else if (this.operator == COMPLEMENT) {
+      this.operator = NONE;
+    }
+    if (units != null) {
+      for (GenBankLocation o : units) {
+        o.setComplement(complement);
+      }
+    }
+  }
+
+  /**
+   * Renders the units separated by commas, wrapped in the operator keyword
+   * and parentheses when the operator is COMPLEMENT, JOIN or ORDER.
+   *
+   * @return the textual location; a null or empty unit list contributes nothing
+   */
+  @Override
+  public String toString() {
+    StringBuilder sb = new StringBuilder();
+    if (getOperator() == COMPLEMENT) {
+      sb.append("complement(");
+    } else if (getOperator() == JOIN) {
+      sb.append("join(");
+    } else if (getOperator() == ORDER) {
+      sb.append("order(");
+    }
+    // units may have been replaced by null through setUnits
+    if (units != null && units.size() > 0) {
+      sb.append(units.get(0).toString());
+      for (int i = 1; i < units.size(); i++) {
+        sb.append(",");
+        sb.append(units.get(i).toString());
+      }
+    }
+    if (getOperator() != NONE) {
+      sb.append(")");
+    }
+    return sb.toString();
+  }
+
+  /**
+   * @return the units
+   */
+  public java.util.List<GenBankLocation> getUnits() {
+    return units;
+  }
+
+  /**
+   * @param units the units to set
+   */
+  public void setUnits(java.util.List<GenBankLocation> units) {
+    this.units = units;
+  }
+
+  /**
+   * @return the minor position of the first unit, or 0 when there are no units
+   */
+  @Override
+  public int getMinor() {
+    if (units != null && units.size() > 0) {
+      return units.get(0).getMinor();
+    }
+    return 0;
+  }
+
+  /**
+   * @return the major position of the last unit, or 0 when there are no units
+   */
+  @Override
+  public int getMajor() {
+    int ind = units == null ? 0 : units.size();
+    if (ind > 0) {
+      return units.get(ind - 1).getMajor();
+    }
+    return 0;
+  }
+
+  /**
+   * @return the operator
+   */
+  public int getOperator() {
+    return operator;
+  }
+
+  /**
+   * @param operator the operator to set
+   */
+  public void setOperator(int operator) {
+    this.operator = operator;
+  }
+
+}
--- /dev/null
+package jalview.io.xdb.genbank;
+
+/**
+ * A short mnemonic name for the entry, chosen to suggest the
+ * sequence's definition. Mandatory keyword/exactly one record.
+ *
+ * <p>The LOCUS field contains a number of different data elements, including locus name,
+ * sequence length, molecule type, GenBank division, and modification date. This class
+ * holds those elements and formats them as a LOCUS line in {@link #toString()}.</p>
+ *
+ */
+public class GenBankLocus {
+  private String name;
+  private int sequenceLength;
+  private String strand;
+  private String moleculeType;
+  private boolean linearSequence;
+  private String division;
+  private String modificationDate;
+
+  /** Creates a locus with every element unset. */
+  public GenBankLocus() {
+  }
+
+  /**
+   * Creates a locus with a name and a length; the other elements stay unset.
+   *
+   * @param name locus name
+   * @param sequenceLength sequence length
+   */
+  public GenBankLocus(String name, int sequenceLength) {
+    this.name = name;
+    this.sequenceLength = sequenceLength;
+  }
+
+
+  /**
+   * @return the name
+   */
+  public String getName() {
+    return name;
+  }
+
+  /**
+   * @param name the name to set
+   */
+  public void setName(String name) {
+    this.name = name;
+  }
+
+  /**
+   * @return the sequenceLength
+   */
+  public int getSequenceLength() {
+    return sequenceLength;
+  }
+
+  /**
+   * @param sequenceLength the sequenceLength to set
+   */
+  public void setSequenceLength(int sequenceLength) {
+    this.sequenceLength = sequenceLength;
+  }
+
+  /**
+   * @return the strand
+   */
+  public String getStrand() {
+    return strand;
+  }
+
+  /**
+   * @param strand the strand to set
+   */
+  public void setStrand(String strand) {
+    this.strand = strand;
+  }
+
+  /**
+   * @return the moleculeType
+   */
+  public String getMoleculeType() {
+    return moleculeType;
+  }
+
+  /**
+   * @param moleculeType the moleculeType to set
+   */
+  public void setMoleculeType(String moleculeType) {
+    this.moleculeType = moleculeType;
+  }
+
+  /**
+   * @return the linearSequence
+   */
+  public boolean isLinearSequence() {
+    return linearSequence;
+  }
+
+  /**
+   * @param linearSequence the linearSequence to set
+   */
+  public void setLinearSequence(boolean linearSequence) {
+    this.linearSequence = linearSequence;
+  }
+
+  /**
+   * @return the division
+   */
+  public String getDivision() {
+    return division;
+  }
+
+  /**
+   * @param division the division to set
+   */
+  public void setDivision(String division) {
+    this.division = division;
+  }
+
+  /**
+   * @return the modificationDate
+   */
+  public String getModificationDate() {
+    return modificationDate;
+  }
+
+  /**
+   * @param modificationDate the modificationDate to set
+   */
+  public void setModificationDate(String modificationDate) {
+    this.modificationDate = modificationDate;
+  }
+
+  /**
+   * Formats the LOCUS line: name, length in bp, strand, molecule type,
+   * topology ("linear " or "circular"), division and upper-cased date.
+   *
+   * <p>Unset text elements are written as empty, padded columns instead of
+   * the literal "null"; a record without a strand is a normal case. Formatting
+   * and upper-casing use {@code Locale.ROOT} so the output does not change
+   * with the default locale.</p>
+   *
+   * @return the LOCUS line, without a line terminator
+   */
+  @Override
+  public String toString() {
+    return String.format(java.util.Locale.ROOT,
+            "LOCUS %-16s %11d bp %3s %6s %-8s %3s %s",
+            orEmpty(this.name), this.sequenceLength, orEmpty(this.strand),
+            orEmpty(this.moleculeType), linearSequence ? "linear " : "circular",
+            orEmpty(this.division),
+            orEmpty(this.modificationDate).toUpperCase(java.util.Locale.ROOT));
+  }
+
+  /** Maps null to the empty string so %s never prints "null". */
+  private static String orEmpty(String text) {
+    return text == null ? "" : text;
+  }
+
+}
--- /dev/null
+package jalview.io.xdb.genbank;
+
+/**
+ * One REFERENCE block: its ordinal, the base range it covers and the
+ * citation fields. {@link #toString()} writes the block as text.
+ */
+public class GenBankReference {
+  private int order;
+  private int begin;
+  private int end;
+  private String descriptor;
+  private String authors;
+  private String title;
+  private String journal;
+  private String pubmed;
+  private String medline;
+  private String consortia;
+  private String remark;
+
+  /** Creates a reference with every field unset. */
+  public GenBankReference() {
+  }
+
+  /** @return the descriptor (not written by toString) */
+  public String getDescriptor() {
+    return this.descriptor;
+  }
+
+  /** @param descriptor the descriptor to set */
+  public void setDescriptor(String descriptor) {
+    this.descriptor = descriptor;
+  }
+
+  /** @return the authors */
+  public String getAuthors() {
+    return this.authors;
+  }
+
+  /** @param authors the authors to set */
+  public void setAuthors(String authors) {
+    this.authors = authors;
+  }
+
+  /** @return the title */
+  public String getTitle() {
+    return this.title;
+  }
+
+  /** @param title the title to set */
+  public void setTitle(String title) {
+    this.title = title;
+  }
+
+  /** @return the journal */
+  public String getJournal() {
+    return this.journal;
+  }
+
+  /** @param journal the journal to set */
+  public void setJournal(String journal) {
+    this.journal = journal;
+  }
+
+  /** @return the PubMed identifier */
+  public String getPubmed() {
+    return this.pubmed;
+  }
+
+  /** @param pubmed the PubMed identifier to set */
+  public void setPubmed(String pubmed) {
+    this.pubmed = pubmed;
+  }
+
+  /** @return the ordinal of this reference */
+  public int getOrder() {
+    return this.order;
+  }
+
+  /** @param order the ordinal to set */
+  public void setOrder(int order) {
+    this.order = order;
+  }
+
+  /** @return the first base covered */
+  public int getBegin() {
+    return this.begin;
+  }
+
+  /** @param begin the first base covered */
+  public void setBegin(int begin) {
+    this.begin = begin;
+  }
+
+  /** @return the last base covered */
+  public int getEnd() {
+    return this.end;
+  }
+
+  /** @param end the last base covered */
+  public void setEnd(int end) {
+    this.end = end;
+  }
+
+  /** @return the MEDLINE identifier */
+  public String getMedline() {
+    return this.medline;
+  }
+
+  /** @param medline the MEDLINE identifier to set */
+  public void setMedline(String medline) {
+    this.medline = medline;
+  }
+
+  /** @return the consortium */
+  public String getConsortia() {
+    return this.consortia;
+  }
+
+  /** @param consortia the consortium to set */
+  public void setConsortia(String consortia) {
+    this.consortia = consortia;
+  }
+
+  /** @return the remark */
+  public String getRemark() {
+    return this.remark;
+  }
+
+  /** @param remark the remark to set */
+  public void setRemark(String remark) {
+    this.remark = remark;
+  }
+
+  /**
+   * Writes the reference as text, one "\n"-terminated line per field, e.g.
+   *
+   * <pre>
+   * REFERENCE 1 (bases 1 to 1976)
+   * AUTHORS Spritz,R.A., DeRiel,J.K., Forget,B.G. and Weissman,S.M.
+   * TITLE Complete nucleotide sequence of the human delta-globin gene
+   * JOURNAL Cell 21 (3), 639-646 (1980)
+   * PUBMED 7438204
+   * </pre>
+   *
+   * The header line is always written. Each of authors, title, journal,
+   * PubMed, MEDLINE, remark and consortium follows in that order, and only
+   * when it is not null.
+   *
+   * @return the formatted block
+   */
+  @Override
+  public String toString() {
+    StringBuilder text = new StringBuilder("REFERENCE ");
+    text.append(getOrder());
+    text.append(" (bases ").append(getBegin());
+    text.append(" to ").append(getEnd()).append(")\n");
+    appendField(text, " AUTHORS ", getAuthors());
+    appendField(text, " TITLE ", getTitle());
+    appendField(text, " JOURNAL ", getJournal());
+    appendField(text, " PUBMED ", getPubmed());
+    appendField(text, " MEDLINE ", getMedline());
+    appendField(text, " REMARK ", getRemark());
+    appendField(text, " CONSRTM ", getConsortia());
+    return text.toString();
+  }
+
+  /** Appends label, value and a newline when the value is not null. */
+  private static void appendField(StringBuilder text, String label, String value) {
+    if (value == null) {
+      return;
+    }
+    text.append(label).append(value).append("\n");
+  }
+
+}
--- /dev/null
+package jalview.io.xdb.genbank;
+
+import java.util.Vector;
+/**
+ * One line of sequence data. A line like the following:
+ * 1 aatgaaggtt catttttcat tctcacaaac taatgaaacc ctgcttatct taaaccaacc
+ * will be mapped as:
+ * id: 1
+ * sequences: {"aatgaaggtt", "catttttcat", "tctcacaaac", "taatgaaacc", "ctgcttatct", "taaaccaacc"}
+ * Each block in that example is 10 nucleotides long.
+ * @author darolmar
+ *
+ */
+public class GenBankSequence {
+  /** Initial position: the number that starts the line. */
+  private int id;
+  /** The blocks found on the line, in order. */
+  private Vector<String> sequences = new Vector<String>();
+
+  /** Creates a line with id 0 and no blocks. */
+  public GenBankSequence() {
+  }
+
+  /** @return the initial position */
+  public int getId() {
+    return this.id;
+  }
+
+  /** @param id the initial position to set */
+  public void setId(int id) {
+    this.id = id;
+  }
+
+  /** @return the block list itself (not a copy) */
+  public Vector<String> getSequences() {
+    return this.sequences;
+  }
+
+  /** @param sequences the block list to use from now on */
+  public void setSequences(Vector<String> sequences) {
+    this.sequences = sequences;
+  }
+
+  /**
+   * Concatenates the blocks, writing a separator after every block
+   * (including the last one).
+   *
+   * @return the joined blocks; empty when there are none
+   */
+  public String getSequencesAsString() {
+    StringBuilder joined = new StringBuilder();
+    for (int i = 0; i < sequences.size(); i++) {
+      joined.append(sequences.get(i));
+      joined.append(" ");
+    }
+    return joined.toString();
+  }
+
+  /**
+   * Writes the line: the id, then each block preceded by a separator, then "\n".
+   *
+   * @return the formatted line
+   */
+  @Override
+  public String toString() {
+    StringBuilder line = new StringBuilder();
+    line.append(" ");
+    line.append(this.id);
+    for (int i = 0; i < sequences.size(); i++) {
+      line.append(" ");
+      line.append(sequences.get(i));
+    }
+    line.append("\n");
+    return line.toString();
+  }
+
+}
--- /dev/null
+package jalview.io.xdb.genbank;
+
+/**
+ * <p>Free-format information including an abbreviated form of the organism
+ * name, sometimes followed by a molecule type. (See section 3.4.10 of the
+ * GenBank release notes for more info.)</p>
+ * <p>Entrez Search Field: Organism [ORGN] </p>
+ * <p>Search Tip: For some organisms that have well-established common names,
+ * such as baker's yeast, mouse, and human, a search for the common name will
+ * yield the same results as a search for the scientific name, e.g., a search
+ * for "baker's yeast" in the organism field retrieves the same number of
+ * documents as "Saccharomyces cerevisiae". This is true because the Organism
+ * field is connected to the NCBI Taxonomy Database, which contains
+ * cross-references between common names, scientific names, and synonyms for
+ * organisms represented in the Sequence databases.</p>
+ * <h1>Organism</h1>
+ * <p>The formal scientific name for the source organism (genus and species,
+ * where appropriate) and its lineage, based on the phylogenetic classification
+ * scheme used in the NCBI Taxonomy Database. If the complete lineage of an
+ * organism is very long, an abbreviated lineage will be shown in the GenBank
+ * record and the complete lineage will be available in the Taxonomy Database.
+ * (See also the /db_xref=taxon:nnnn Feature qualifier, below.)</p>
+ * <p>Entrez Search Field: Organism [ORGN] </p>
+ * <p>Search Tip: You can search the Organism field by any node in the taxonomic
+ * hierarchy, e.g., you can search for the term "Saccharomyces cerevisiae",
+ * "Saccharomycetales", "Ascomycota", etc. to retrieve all the sequences from
+ * organisms in a particular taxon. </p>
+ *
+ */
+public class GenBankSource {
+  /** Free-format source text; empty by default. */
+  private String source = "";
+  /** Organism name; empty by default. */
+  private String organism = "";
+  /** Taxonomic lineage text; empty by default. */
+  private String taxonomic = "";
+
+  /** Creates a source whose three fields are all empty strings. */
+  public GenBankSource() {
+  }
+
+  /**
+   * Joins source, organism and lineage, putting a newline and a tab before
+   * each of the last two. A null field is written as "null".
+   *
+   * @return the three fields as one text
+   */
+  @Override
+  public String toString() {
+    final String separator = "\n\t";
+    return getSource() + separator + getOrganism() + separator + getTaxonomic();
+  }
+
+  /**
+   * @return the source
+   */
+  public String getSource() {
+    return this.source;
+  }
+
+  /**
+   * @param source the source to set
+   */
+  public void setSource(String source) {
+    this.source = source;
+  }
+
+  /**
+   * @return the organism
+   */
+  public String getOrganism() {
+    return this.organism;
+  }
+
+  /**
+   * @param organism the organism to set
+   */
+  public void setOrganism(String organism) {
+    this.organism = organism;
+  }
+
+  /**
+   * @return the taxonomic lineage
+   */
+  public String getTaxonomic() {
+    return this.taxonomic;
+  }
+
+  /**
+   * @param taxonomic the taxonomic lineage to set
+   */
+  public void setTaxonomic(String taxonomic) {
+    this.taxonomic = taxonomic;
+  }
+
+}
--- /dev/null
+package jalview.io.xdb.genbank;
+
+/**
+ * <p>A nucleotide sequence identification number that represents a single,
+ * specific sequence in the GenBank database. This identification number uses
+ * the accession.version format implemented by GenBank/EMBL/DDBJ in
+ * February 1999.</p>
+ * <p>If there is any change to the sequence data (even a single base), the
+ * version number will be increased, e.g., U12345.1 → U12345.2, but the
+ * accession portion will remain stable.</p>
+ * <p>The accession.version system of sequence identifiers runs parallel to
+ * the GI number system, i.e., when any change is made to a sequence, it
+ * receives a new GI number AND an increase to its version number.</p>
+ * <p>For more information, see section 1.3.2 of the GenBank 111.0 release
+ * notes, and section 3.4.7 of the current GenBank release notes.</p>
+ * <p>A Sequence Revision History tool is available to track the various GI
+ * numbers, version numbers, and update dates for sequences that appeared in
+ * a specific GenBank record (more information and example).</p>
+ * <p>More details about sequence identification numbers and the difference
+ * between GI number and version are provided in Sequence Identifiers:
+ * A Historical Note.</p>
+ * <p>Entrez Search Field: use the default setting of "All Fields"</p>
+ * <h1>GI</h1>
+ * <p>"GenInfo Identifier" sequence identification number, in this case, for
+ * the nucleotide sequence. If a sequence changes in any way, a new GI number
+ * will be assigned.</p>
+ * <p>A separate GI number is also assigned to each protein translation within
+ * a nucleotide sequence record, and a new GI is assigned if the protein
+ * translation changes in any way (see below).</p>
+ * <p>GI sequence identifiers run parallel to the new accession.version system
+ * of sequence identifiers. For more information, see the description of Version,
+ * above, and section 3.4.7 of the current GenBank release notes.</p>
+ * <p>A Sequence Revision History tool is available to track the various GI
+ * numbers, version numbers, and update dates for sequences that appeared in a
+ * specific GenBank record (more information and example).</p>
+ * <p>More details about sequence identification numbers and the difference
+ * between GI number and version are provided in Sequence Identifiers: A
+ * Historical Note.</p>
+ * <p>Entrez Search Field: use the default setting of "All Fields"</p>
+ * @author Dieval Guizelini
+ * @see Entry
+ */
+public class GenBankVersion {
+  /** The accession.version text; empty by default. */
+  private String version = "";
+  /** The GI text; empty by default. */
+  private String gi = "";
+
+  /** Creates a version record whose two fields are empty strings. */
+  public GenBankVersion() {
+  }
+
+  /**
+   * @return the version
+   */
+  public String getVersion() {
+    return this.version;
+  }
+
+  /**
+   * @param version the version to set
+   */
+  public void setVersion(String version) {
+    this.version = version;
+  }
+
+  /**
+   * @return the gi
+   */
+  public String getGI() {
+    return this.gi;
+  }
+
+  /**
+   * @param gi the gi to set
+   */
+  public void setGI(String gi) {
+    this.gi = gi;
+  }
+
+  /**
+   * The VERSION section is text with two fields, version and GI, written
+   * here one after the other with a separator between them.
+   *
+   * @return version+" "+gi
+   */
+  @Override
+  public String toString() {
+    return this.version + " " + this.gi;
+  }
+}
--- /dev/null
+LOCUS GU324925 15440 bp DNA linear PRI 10-AUG-2010
+DEFINITION Homo sapiens hemoglobin, gamma A (HBG1) gene, complete cds.
+ACCESSION GU324925
+VERSION GU324925.1 GI:302313142
+KEYWORDS .
+SOURCE Homo sapiens (human)
+ ORGANISM Homo sapiens
+ Eukaryota; Metazoa; Chordata; Craniata; Vertebrata; Euteleostomi;
+ Mammalia; Eutheria; Euarchontoglires; Primates; Haplorrhini;
+ Catarrhini; Hominidae; Homo.
+REFERENCE 1 (bases 1 to 15440)
+ AUTHORS Rieder,M.J., Bertucci,C., Stanaway,I.B., Johnson,E.J.,
+ Swanson,J.E., Siegel,D.L., da Ponte,S.H., Igartua,C., Patterson,K.
+ and Nickerson,D.A.
+ TITLE Direct Submission
+ JOURNAL Submitted (25-NOV-2009) Genome Sciences, University of Washington,
+ 1705 NE Pacific, Seattle, WA 98195, USA
+COMMENT To cite this work please use: NHLBI Resequencing and Genotyping
+ Service (RSG),UW HV48194, Department of Genome Sciences, Seattle,
+ WA 98195-7730.
+FEATURES Location/Qualifiers
+ source 1..15440
+ /organism="Homo sapiens"
+ /mol_type="genomic DNA"
+ /db_xref="taxon:9606"
+ mobile_element 179..289
+ /mobile_element_type="LINE:L2"
+ variation 293
+ /frequency="0.0328"
+ /replace="t"
+ variation 337
+ /frequency="0.0027"
+ /replace="c"
+ mobile_element 345..530
+ /mobile_element_type="other:LTR/ERV1"
+ variation 406
+ /frequency="0.3873"
+ /replace="a"
+ variation 534
+ /frequency="0.6279"
+ /replace=""
+ mobile_element 544..619
+ /mobile_element_type="LINE:L1"
+ variation 568
+ /frequency="0.3088"
+ /replace="t"
+ variation 692
+ /frequency="0.3038"
+ /replace="g"
+ variation 757
+ /frequency="0.0053"
+ /replace="t"
+ variation 935
+ /frequency="0.1888"
+ /replace="g"
+ variation 1017
+ /frequency="0.0026"
+ /replace="t"
+ variation 1202
+ /frequency="0.0133"
+ /replace="a"
+ variation 1350
+ /frequency="0.3617"
+ /replace="t"
+ variation 1418
+ /frequency="0.1818"
+ /replace="a"
+ variation 1507
+ /frequency="0.2527"
+ /replace="a"
+ variation 1522
+ /frequency="0.0027"
+ /replace="g"
+ variation 1608
+ /frequency="0.0211"
+ /replace="a"
+ variation 1637
+ /frequency="0.0395"
+ /replace="c"
+ variation 1650
+ /frequency="0.0211"
+ /replace="g"
+ variation 1682
+ /frequency="0.0211"
+ /replace="t"
+ variation 1689
+ /frequency="0.0211"
+ /replace="g"
+ variation 1697
+ /frequency="0.0211"
+ /replace="g"
+ variation 1699
+ /frequency="0.0211"
+ /replace="a"
+ variation 1735
+ /frequency="0.0816"
+ /replace="t"
+ variation 1990
+ /frequency="0.0027"
+ /replace="g"
+ gene 2006..3591
+ /gene="HBG1"
+ mRNA join(2006..2150,2273..2495,3376..3591)
+ /gene="HBG1"
+ /product="hemoglobin, gamma A"
+ variation 2030
+ /gene="HBG1"
+ /frequency="0.1657"
+ /replace="a"
+ CDS join(2059..2150,2273..2495,3376..3504)
+ /gene="HBG1"
+ /codon_start=1
+ /product="hemoglobin, gamma A"
+ /protein_id="ADL14496.1"
+ /db_xref="GI:302313143"
+ /translation="MGHFTEEDKATITSLWGKVNVEDAGGETLGRLLVVYPWTQRFFD
+ SFGNLSSASAIMGNPKVKAHGKKVLTSLGDATKHLDDLKGTFAQLSELHCDKLHVDPE
+ NFKLLGNVLVTVLAIHFGKEFTPEVQASWQKMVTAVASALSSRYH"
+ variation 2190
+ /gene="HBG1"
+ /frequency="0.3059"
+ /replace="a"
+ variation 2191
+ /gene="HBG1"
+ /frequency="0.3032"
+ /replace="a"
+ variation 2215
+ /gene="HBG1"
+ /frequency="0.1862"
+ /replace="t"
+ variation 2407
+ /gene="HBG1"
+ /frequency="0.1342"
+ /replace="t"
+ variation 2518
+ /gene="HBG1"
+ /frequency="0.0026"
+ /replace="t"
+ variation 2519
+ /gene="HBG1"
+ /frequency="0.3342"
+ /replace="a"
+ variation 2554
+ /gene="HBG1"
+ /frequency="0.4763"
+ /replace="t"
+ variation 2610
+ /gene="HBG1"
+ /frequency="0.3128"
+ /replace="a"
+ variation 2643
+ /gene="HBG1"
+ /frequency="0.0289"
+ /replace="c"
+ variation 2653
+ /gene="HBG1"
+ /frequency="0.3105"
+ /replace="c"
+ variation 2675
+ /gene="HBG1"
+ /frequency="0.1895"
+ /replace="c"
+ variation 2682
+ /gene="HBG1"
+ /frequency="0.3105"
+ /replace="g"
+ variation 2700
+ /gene="HBG1"
+ /frequency="0.3842"
+ /replace="c"
+ variation 2746..2749
+ /gene="HBG1"
+ /frequency="0.2226"
+ /replace=""
+ variation 2758
+ /gene="HBG1"
+ /frequency="0.3281"
+ /replace="t"
+ variation 2760
+ /gene="HBG1"
+ /frequency="0.3219"
+ /replace="g"
+ variation 2777
+ /gene="HBG1"
+ /frequency="0.2959"
+ /replace="c"
+ variation 2939
+ /gene="HBG1"
+ /frequency="0.0026"
+ /replace="c"
+ variation 3023
+ /gene="HBG1"
+ /frequency="0.0026"
+ /replace="c"
+ variation 3037
+ /gene="HBG1"
+ /frequency="0.0026"
+ /replace="t"
+ variation 3064
+ /gene="HBG1"
+ /frequency="0.0079"
+ /replace="c"
+ variation 3073
+ /gene="HBG1"
+ /frequency="0.0026"
+ /replace="c"
+ variation 3074
+ /gene="HBG1"
+ /frequency="0.0553"
+ /replace="t"
+ variation 3142
+ /gene="HBG1"
+ /frequency="0.0056"
+ /replace="g"
+ variation 3158
+ /gene="HBG1"
+ /frequency="0.0028"
+ /replace="a"
+ variation 3162
+ /gene="HBG1"
+ /frequency="0.2781"
+ /replace="g"
+ variation 3205
+ /gene="HBG1"
+ /frequency="0.3580"
+ /replace="g"
+ variation 3206
+ /gene="HBG1"
+ /frequency="0.3571"
+ /replace="c"
+ variation 3210
+ /gene="HBG1"
+ /frequency="0.3621"
+ /replace="a"
+ variation 3211
+ /gene="HBG1"
+ /frequency="0.3103"
+ /replace="a"
+ variation 3238
+ /gene="HBG1"
+ /frequency="0.2672"
+ /replace="a"
+ variation 3287
+ /gene="HBG1"
+ /frequency="0.1784"
+ /replace="a"
+ variation 3287
+ /gene="HBG1"
+ /frequency="0.3351"
+ /replace="t"
+ variation 3291
+ /gene="HBG1"
+ /frequency="0.0081"
+ /replace="t"
+ variation 3294
+ /gene="HBG1"
+ /frequency="0.1459"
+ /replace="g"
+ variation 3303
+ /gene="HBG1"
+ /frequency="0.0081"
+ /replace="a"
+ variation 3507
+ /gene="HBG1"
+ /frequency="0.1349"
+ /replace="c"
+ variation 3508
+ /gene="HBG1"
+ /frequency="0.1402"
+ /replace="t"
+ variation 3509
+ /gene="HBG1"
+ /frequency="0.1349"
+ /replace="c"
+ variation 3510
+ /gene="HBG1"
+ /frequency="0.1508"
+ /replace="t"
+ variation 3519
+ /gene="HBG1"
+ /frequency="0.0026"
+ /replace="c"
+ variation 3538
+ /gene="HBG1"
+ /frequency="0.0026"
+ /replace="c"
+ variation 3556
+ /gene="HBG1"
+ /frequency="0.0464"
+ /replace=""
+ variation 3620
+ /frequency="0.0053"
+ /replace="t"
+ variation 3628
+ /frequency="0.2751"
+ /replace="a"
+ variation 3644
+ /frequency="0.0026"
+ /replace="g"
+ variation 3750
+ /frequency="0.1852"
+ /replace="t"
+ variation 3763
+ /frequency="0.3651"
+ /replace="t"
+ variation 3953
+ /frequency="0.1349"
+ /replace="t"
+ variation 4296
+ /frequency="0.1947"
+ /replace="t"
+ variation 4324
+ /frequency="0.0026"
+ /replace="a"
+ variation 4333
+ /frequency="0.0053"
+ /replace="a"
+ variation 4341
+ /frequency="0.1342"
+ /replace="g"
+ mobile_element 4365..4701
+ /mobile_element_type="other:LTR/MaLR"
+ variation 4471
+ /frequency="0.1958"
+ /replace="c"
+ variation 4472
+ /frequency="0.0291"
+ /replace="c"
+ variation 4595
+ /frequency="0.0054"
+ /replace="a"
+ variation 4609
+ /frequency="0.0108"
+ /replace="t"
+ variation 4687
+ /frequency="0.1958"
+ /replace="a"
+ variation 4938
+ /frequency="0.0185"
+ /replace="a"
+ mobile_element 4976..5095
+ /mobile_element_type="LINE:L2"
+ variation 5070
+ /frequency="0.0026"
+ /replace="g"
+ variation 5106
+ /frequency="0.0132"
+ /replace="a"
+ mobile_element 5134..5321
+ /mobile_element_type="other:LTR/ERV1"
+ variation 5179
+ /frequency="0.0079"
+ /replace="t"
+ variation 5307
+ /frequency="0.0053"
+ /replace="c"
+ mobile_element 5322..5414
+ /mobile_element_type="LINE:L1"
+ mobile_element 5415..5887
+ /mobile_element_type="LINE:L1"
+ variation 5423
+ /frequency="0.0053"
+ /replace="t"
+ variation 5532
+ /frequency="0.0026"
+ /replace="t"
+ variation 5671
+ /frequency="0.0027"
+ /replace="c"
+ variation 5754
+ /frequency="0.0143"
+ /replace="a"
+ variation 5871
+ /frequency="0.1057"
+ /replace="t"
+ mobile_element 5898..6061
+ /mobile_element_type="LINE:L2"
+ variation 6086
+ /frequency="0.0158"
+ /replace="c"
+ variation 6132
+ /frequency="0.0816"
+ /replace="g"
+ variation 6135
+ /frequency="0.0158"
+ /replace="a"
+ variation 6165
+ /frequency="0.0079"
+ /replace="c"
+ variation 6170
+ /frequency="0.0026"
+ /replace="a"
+ variation 6200
+ /frequency="0.0026"
+ /replace="g"
+ variation 6200
+ /frequency="0.0053"
+ /replace="t"
+ variation 6286
+ /frequency="0.0026"
+ /replace="a"
+ variation 6296
+ /frequency="0.0447"
+ /replace="t"
+ variation 6365
+ /frequency="0.1921"
+ /replace="a"
+ variation 6379
+ /frequency="0.0026"
+ /replace="t"
+ variation 6467
+ /frequency="0.0132"
+ /replace="a"
+ variation 6638
+ /frequency="0.0159"
+ /replace="t"
+ variation 6860
+ /frequency="0.0238"
+ /replace="c"
+ variation 6955
+ /frequency="0.0053"
+ /replace="a"
+ variation 7107
+ /frequency="0.0026"
+ /replace="t"
+ variation 7315
+ /frequency="0.0026"
+ /replace="t"
+ mobile_element 7396..7708
+ /mobile_element_type="SINE:Alu"
+ variation 7413
+ /frequency="0.3128"
+ /replace="t"
+ variation 7535
+ /frequency="0.0026"
+ /replace="a"
+ variation 7618
+ /frequency="0.5000"
+ /replace="a"
+ variation 7727
+ /frequency="0.3677"
+ /replace="g"
+ variation 7761
+ /frequency="0.0106"
+ /replace="a"
+ variation 7872
+ /frequency="0.0080"
+ /replace="t"
+ variation 7973
+ /frequency="0.0132"
+ /replace="g"
+ variation 7987
+ /frequency="0.2895"
+ /replace="t"
+ variation 8164
+ /frequency="0.0737"
+ /replace="c"
+ variation 8171
+ /frequency="0.0526"
+ /replace="c"
+ variation 8384
+ /frequency="0.0026"
+ /replace="a"
+ variation 8410
+ /frequency="0.0026"
+ /replace="a"
+ variation 8814
+ /frequency="0.0079"
+ /replace="a"
+ variation 8830
+ /frequency="0.0053"
+ /replace="g"
+ variation 8947
+ /frequency="0.1816"
+ /replace="c"
+ variation 8962
+ /frequency="0.0026"
+ /replace="t"
+ variation 9102
+ /frequency="0.0079"
+ /replace="g"
+ variation 9240
+ /frequency="0.1000"
+ /replace="c"
+ variation 9256
+ /frequency="0.0026"
+ /replace="c"
+ variation 9281..9284
+ /frequency="0.9484"
+ /replace=""
+ variation 9322
+ /frequency="0.0053"
+ /replace="g"
+ variation 9338
+ /frequency="0.0133"
+ /replace="c"
+ variation 9374
+ /frequency="0.9658"
+ /replace=""
+ variation 9411
+ /frequency="0.1842"
+ /replace="a"
+ variation 9517
+ /frequency="0.0737"
+ /replace="c"
+ variation 9558
+ /frequency="0.0079"
+ /replace="c"
+ variation 9645
+ /frequency="0.0133"
+ /replace="c"
+ variation 9752..9773
+ /frequency="0.0080"
+ /replace=""
+ variation 9759
+ /frequency="0.0027"
+ /replace="g"
+ variation 9791
+ /frequency="0.0426"
+ /replace="g"
+ variation 10103
+ /frequency="0.0968"
+ /replace="t"
+ variation 10104
+ /frequency="0.0054"
+ /replace="a"
+ variation 10244
+ /frequency="0.1271"
+ /replace=""
+ variation 10251
+ /frequency="0.0169"
+ /replace="t"
+ variation 10312
+ /frequency="0.3511"
+ /replace="g"
+ mobile_element 10527..10608
+ /mobile_element_type="SINE:MIR"
+ variation 10565
+ /frequency="0.0027"
+ /replace="c"
+ variation 10705..10706
+ /frequency="0.8806"
+ /replace=""
+ variation 10821
+ /frequency="0.0160"
+ /replace="a"
+ variation 10864
+ /frequency="0.0642"
+ /replace="a"
+ variation 10944
+ /frequency="0.0027"
+ /replace="a"
+ variation 11154
+ /frequency="0.0163"
+ /replace="t"
+ variation 11259
+ /frequency="0.0136"
+ /replace="g"
+ variation 11475
+ /frequency="0.0027"
+ /replace="t"
+ variation 11626
+ /frequency="0.1190"
+ /replace="a"
+ variation 11706
+ /frequency="0.0106"
+ /replace="a"
+ variation 11708
+ /frequency="0.0026"
+ /replace="g"
+ variation 11722
+ /frequency="0.1190"
+ /replace="t"
+ variation 11818
+ /frequency="0.0079"
+ /replace="g"
+ variation 11857
+ /frequency="0.4418"
+ /replace="c"
+ variation 11910
+ /frequency="0.0079"
+ /replace="c"
+ variation 12024
+ /frequency="0.0158"
+ /replace="a"
+ mobile_element 12140..12250
+ /mobile_element_type="LINE:L2"
+ variation 12160
+ /frequency="0.0789"
+ /replace="c"
+ variation 12253
+ /frequency="0.9947"
+ /replace=""
+ variation 12267
+ /frequency="0.0079"
+ /replace="g"
+ variation 12317
+ /frequency="0.0026"
+ /replace="c"
+ variation 12350
+ /frequency="0.0079"
+ /replace="a"
+ variation 12521
+ /frequency="0.3042"
+ /replace="g"
+ variation 12551
+ /frequency="0.0105"
+ /replace="a"
+ variation 12639
+ /frequency="0.2857"
+ /replace="g"
+ variation 12697
+ /frequency="0.0106"
+ /replace="g"
+ mobile_element 12718..12948
+ /mobile_element_type="other:LTR/ERVL"
+ variation 12731
+ /frequency="0.0026"
+ /replace="t"
+ variation 12740
+ /frequency="0.0053"
+ /replace="a"
+ variation 12787
+ /frequency="0.0026"
+ /replace="t"
+ variation 12814
+ /frequency="0.2196"
+ /replace="a"
+ variation 12975
+ /frequency="0.1164"
+ /replace="g"
+ variation 12987
+ /frequency="0.1190"
+ /replace="g"
+ variation 13030
+ /frequency="0.1170"
+ /replace="a"
+ variation 13042
+ /frequency="0.0718"
+ /replace="g"
+ mobile_element 13120..13246
+ /mobile_element_type="other:LTR/ERVL"
+ variation 13138
+ /frequency="0.1156"
+ /replace="c"
+ variation 13286
+ /frequency="0.0161"
+ /replace="t"
+ variation 13329
+ /frequency="0.1216"
+ /replace="c"
+ variation 13370
+ /frequency="0.4081"
+ /replace="g"
+ mobile_element 13541..13842
+ /mobile_element_type="SINE:Alu"
+ variation 13563
+ /frequency="0.1243"
+ /replace="g"
+ variation 13678
+ /frequency="0.4021"
+ /replace="t"
+ variation 13749
+ /frequency="0.0027"
+ /replace="t"
+ variation 13794
+ /frequency="0.0316"
+ /replace="t"
+ variation 13805
+ /frequency="0.3829"
+ /replace="t"
+ variation 13808
+ /frequency="0.3818"
+ /replace="a"
+ variation 13992..13993
+ /frequency="0.5895"
+ /replace=""
+ variation 14110
+ /frequency="0.4105"
+ /replace="t"
+ variation 14158
+ /frequency="0.0079"
+ /replace="g"
+ mobile_element 14206..14493
+ /mobile_element_type="LINE:L2"
+ variation 14239
+ /frequency="0.0079"
+ /replace="g"
+ variation 14243
+ /frequency="0.4105"
+ /replace="t"
+ variation 14247
+ /frequency="0.4105"
+ /replace="t"
+ variation 14264
+ /frequency="0.0026"
+ /replace="t"
+ variation 14271
+ /frequency="0.0184"
+ /replace="g"
+ variation 14272
+ /frequency="0.4132"
+ /replace="g"
+ variation 14358
+ /frequency="0.0158"
+ /replace="c"
+ variation 14371
+ /frequency="0.0079"
+ /replace="t"
+ variation 14406
+ /frequency="0.0553"
+ /replace="a"
+ variation 14503
+ /frequency="0.0421"
+ /replace="c"
+ variation 14507
+ /frequency="0.4105"
+ /replace="a"
+ variation 14609
+ /frequency="0.4681"
+ /replace="a"
+ mobile_element 14622..14921
+ /mobile_element_type="SINE:Alu"
+ variation 14646
+ /frequency="0.0080"
+ /replace="c"
+ variation 14670
+ /frequency="0.4309"
+ /replace="g"
+ variation 14767
+ /frequency="0.0027"
+ /replace="c"
+ variation 14834
+ /frequency="0.4574"
+ /replace="g"
+ variation 14861
+ /frequency="0.0878"
+ /replace="t"
+ variation 14937
+ /frequency="0.2872"
+ /replace="c"
+ variation 14991
+ /frequency="0.0081"
+ /replace="g"
+ variation 15061
+ /frequency="0.3758"
+ /replace="t"
+ misc_feature 15105..15439
+ /note="Region not scanned for variation"
+ mobile_element 15305..15396
+ /mobile_element_type="LINE:L2"
+ORIGIN
+ 1 gtgtttcaga ataaaatacc aactctacta ctctcatctg taagatgcaa atagtaagcc
+ 61 tgagcccttc tgtctaactt tgaattctat tttttcttca acgtacttta ggcttgtaat
+ 121 gtgtttatat acagtgaaat gtcaagttct ttctttatat ttctttcttt cttttttttc
+ 181 ctcagcctca gagttttcca catgcccttc ctactttcag gaacttcttt ctccaaacgt
+ 241 cttctgcctg gctccatcaa atcataaagg acccacttca aatgccatca ctcactacca
+ 301 tttcacaatt cgcactttct ttctttgtcc tttttttttt tagtaaaaca agtttataaa
+ 361 aaattgaagg aataaatgaa tggctacttc ataggcagag tagacgcaag ggctactggt
+ 421 tgccgatttt tattgttatt tttcaatagt atgctaaaca aggggtagat tatttatgct
+ 481 gcccattttt agaccataaa agataacttc ctgatgttgc catggcattt tttttccttt
+ 541 taattttatt tcatttcatt ttaatttcga aggtacatgt gcaggatgtg caggcttgtt
+ 601 acatgggtaa atgtgtgtct ttctggcctt ttagccatct gtatcaatga gcagatataa
+ 661 gctttacaca ggatcatgaa ggatgaaaga atttcaccaa tattataata atttcaatca
+ 721 acctgatagc ttaggggata aactaatttg aagatacagc ttgcctccga taagccagaa
+ 781 ttccagagct tctggcatta taatctagca aggttagaga tcatggatca ctttcagaga
+ 841 aaaacaaaaa caaactaacc aaaagcaaaa cagaaccaaa aaaccaccat aaatacttcc
+ 901 taccctgtta atggtccaat atgtcagaaa cagcactgtg ttagaaataa agctgtctaa
+ 961 agtacactaa tattcgagtt ataatagtgt gtggactatt agtcaataaa aacaaccctt
+ 1021 gcctctttag agttgttttc catgtacacg cacatcttat gtcttagagt aagattccct
+ 1081 gagaagtgaa cctagcattt atacaagata attaattcta atccacagta cctgccaaag
+ 1141 aacattctac catcatcttt actgagcata gaagagctac gccaaaaccc tgggtcatca
+ 1201 gccagcacac acacttatcc agtggtaaat acacatcatc tggtgtatac atacatacct
+ 1261 gaatatggaa tcaaatattt ttctaagatg aaacagtcat gatttatttc aaataggtac
+ 1321 ggataagtag atattgaggt aagcattagg tcttatatta tgtaacacta atctattact
+ 1381 gcgctgaaac tgtggcttta tagaaattgt tttcactgca ctattgagaa attaagagat
+ 1441 aatggcaaaa gtcacaaaga gtatattcaa aaagaagtat agcacttttt ccttagaaac
+ 1501 cactgctaac tgaaagagac taagatttgt cccgtcaaaa atcctggacc tatgcctaaa
+ 1561 acacatttca caatccctga acttttcaaa aattggtaca tgctttagct ttaaactaca
+ 1621 ggcctcactg gagctagaga caagaaggta aaaaacggct gacaaaagaa gtcctggtat
+ 1681 cctctatgat gggagaagga aactagctaa agggaagaat aaattagaga aaaactggaa
+ 1741 tgactgaatc ggaacaaggc aaaggctata aaaaaaatta agcagcagta tcctcttggg
+ 1801 ggccccttcc ccacactatc tcaatgcaaa tatctgtctg aaacggtccc tggctaaact
+ 1861 ccacccatgg gttggccagc cttgccttga ccaatagcct tgacaaggca aacttgacca
+ 1921 atagtcttag agtatccagt gaggccaggg gccggcggct ggctagggat gaagaataaa
+ 1981 aggaagcacc cttcagcagt tccacacact cgcttctgga acgtctgagg ttatcaataa
+ 2041 gctcctagtc cagacgccat gggtcatttc acagaggagg acaaggctac tatcacaagc
+ 2101 ctgtggggca aggtgaatgt ggaagatgct ggaggagaaa ccctgggaag gtaggctctg
+ 2161 gtgaccagga caagggaggg aaggaaggac cctgtgcctg gcaaaagtcc aggtcgcttc
+ 2221 tcaggatttg tggcaccttc tgactgtcaa actgttcttg tcaatctcac aggctcctgg
+ 2281 ttgtctaccc atggacccag aggttctttg acagctttgg caacctgtcc tctgcctctg
+ 2341 ccatcatggg caaccccaaa gtcaaggcac atggcaagaa ggtgctgact tccttgggag
+ 2401 atgccacaaa gcacctggat gatctcaagg gcacctttgc ccagctgagt gaactgcact
+ 2461 gtgacaagct gcatgtggat cctgagaact tcaaggtgag tccaggagat gtttcagccc
+ 2521 tgttgccttt agtctcgagg caacttagac aacggagtat tgatctgagc acagcagggt
+ 2581 gtgagctgtt tgaagatact ggggttgggg gtgaagaaac tgcagaggac taactgggct
+ 2641 gagacccagt ggtaatgttt tagggcctaa ggagtgcctc taaaaatcta gatggacaat
+ 2701 tttgactttg agaaaagaga ggtggaaatg aggaaaatga cttttcttta ttagattcca
+ 2761 gtagaaagaa ctttcatctt tccctcattt ttgttgtttt aaaacatcta tctggaggca
+ 2821 ggacaagtat ggtcgttaaa aagatgcagg cagaaggcat atattggctc agtcaaagtg
+ 2881 gggaactttg gtggccaaac atacattgct aaggctattc ctatatcagc tggacacata
+ 2941 taaaatgctg ctaatgcttc attacaaact tatatccttt aattccagat gggggcaaag
+ 3001 tatgtccagg ggtgaggaac aattgaaaca tttgggctgg agtagatttt gaaagtcagc
+ 3061 tctgtgtgtg tgtgtgtgtg tgcgcgcgcg cgtgtgtgtg tgtgtgtcag cgtgtgtttc
+ 3121 ttttaacgtc ttcagcctac aacatacagg gttcatggtg gcaagaagat agcaagattt
+ 3181 aaattatggc cagtgactag tgcttgaagg ggaacaacta cctgcattta atgggaaggc
+ 3241 aaaatctcag gctttgaggg aagttaacat aggcttgatt ctgggtggaa gcttggtgtg
+ 3301 tagttatctg gaggccaggc tggagctctc agctcactat gggttcatct ttattgtctc
+ 3361 ctttcatctc aacagctcct gggaaatgtg ctggtgaccg ttttggcaat ccatttcggc
+ 3421 aaagaattca cccctgaggt gcaggcttcc tggcagaaga tggtgactgc agtggccagt
+ 3481 gccctgtcct ccagatacca ctgagctcac tgcccatgat tcagagcttt caaggatagg
+ 3541 ctttattctg caagcaatac aaataataaa tctattctgc tgagagatca cacatgattt
+ 3601 tcttcagctc ttttttttac atctttttaa atatatgagc cacaaagggt ttatattgag
+ 3661 ggaagtgtgt atgtgtattt ctgcatgcct gtttgtgttt gtggtgtgtg catgctcctc
+ 3721 atttattttt atatgagatg tgcattttga tgagcaaata aaagcagtaa agacacttgt
+ 3781 acacgggagt tctgcaagtg ggagtaaatg gtgtaggaga aatccggtgg gaagaaagac
+ 3841 ctctatagga caggacttct cagaaacaga tgttttggaa gagatgggaa aaggttcagt
+ 3901 gaagacctgg gggctggatt gattgcagct gagtagcaag gatggttctt aaggaaggga
+ 3961 aagtgttcca agctttagga attcaaggtt tagtcaggtg tagcaattct attttattag
+ 4021 gaggaatact atttctaatg gcacttagct tttcacagcc cttgtggatg cctaagaaag
+ 4081 tgaaattaat cccatgccct caagtgtgca gattggtcac agcatttcaa gggagagacc
+ 4141 tcattgtaag actctggggg aggtggggac ttaggtgtaa gaaatgaatc agcagaggct
+ 4201 cacaagtcag catgagcatg ttatgtctga gaaacagacc agcactgtga gatcaaaatg
+ 4261 tagtgggaag aatttgtaca acattaattg gaaggcttac ttaatggaat ttttgtatag
+ 4321 ttggatgtta gtgcatctct ataagtaaga gtttaatatg atggtgttac ggacctaatg
+ 4381 tttgtgtctc ctcaaaattc acatgctgaa tccccaactc ccaactgacc ttatctgtgg
+ 4441 gggaggcttt tgaaaagtaa ttaggtttag atgagctcat aagagcagat ccccatcata
+ 4501 aaattatttt ccttatcaga agcagagaga caagccattt ctctttcctc ccggtgagga
+ 4561 cacagtgaga agtccgccat ctgcaatcca ggaagagaac cctgaccacg agtcagcctt
+ 4621 cagaaatgtg agaaaaaact ctgttgttga agccacccag tcttttgtat tttgttatag
+ 4681 caccttgcac tgagtaaggc agatgaagaa ggagaaaaaa ataagcttgg gttttgagtg
+ 4741 gactacagac catgtttatc tcaggtttgc aaagctcccc tcgtccccta tgtttcagta
+ 4801 taaaatacct actctactac tctcatctat aagacccaaa taataagcct gcgcccttct
+ 4861 ctctaacttt gatttctcct atttttactt caacatgctt tactctagcc ttgtaatgtc
+ 4921 tttacataca gtgaaatgta aagttcttta ttcttttttt ctttctttct tttttctcct
+ 4981 cagcctcaga atttggcaca tgcccttcct tctttcagga acttctccaa catctctgcc
+ 5041 tggctccatc atatcataaa ggtcccactt caaatgcagt cactaccgtt tcagaatatg
+ 5101 cactttcttt cttttttgtt ttttgttttt tttaagtcaa agcaaatttc ttgagagagt
+ 5161 aaagaaataa acgaatgact actgcatagg cagagcagcc ccgagggccg ctggttgttc
+ 5221 cttttatggt tatttcttga tgatatgtta aacaagtttt ggattattta tgccttctct
+ 5281 ttttaggcca tatagggtaa ctttctgaca ttgccatggc atttttcttt taatttaatt
+ 5341 tactgttacc ttaaattcag gggtacacgt acaggatatg caggtttgtt ttataggtaa
+ 5401 aagtgtgcca tggttttaat gggttttttt tttcttgtaa agttgtttaa gtttcttgtt
+ 5461 tactctggat attaggcctt tgtcagaaga atagattgga aaatcttttt cccattctgt
+ 5521 agattgtctt tcgctctgat ggtagtttct tttgctgagc aggagctctt tagtttaatt
+ 5581 agattccatt ggtcaatttt tgcttttgct gcaattgctt ttcacgcttt catcatgaaa
+ 5641 tctgtgcccg tgtttatatc atgaatagta ttgccttgat ttttttctag gctttttata
+ 5701 gtttggggtt tttcatttaa gtctctaatc catctggagt taattttgga taaggtataa
+ 5761 ggaaggagtc cagtttcatt tttcagcata tggctagcca gttctccccc atcatttatt
+ 5821 aaattgaaaa tcctttcccc attgcttgct tttgtcaggt ttctaaaaga ccagatggtt
+ 5881 gtaggtacaa tatgcagttt cttcaagtca tataatacca tctgaaatct cttattaatt
+ 5941 catttctttt agtatgtatg ctggtctcct ctgctcacta tagtgagggc accattagcc
+ 6001 agagaatctg tctgtctagt tcatgtaaga ttctcagaat taagaaaaat ggatggcata
+ 6061 tgaatgaaac ttcatggatg acatatggaa tctaatatgt atttgttgaa ttaatgcata
+ 6121 agatgcaaca gagagaagtt gacaactgca atgataacct ggtattgatg atataagagt
+ 6181 ctatagatca cagtagaagc aataatcatg gaaaacaatt ggaaatgggg aacagccaca
+ 6241 aacaagaaag aatcaatact tccaggaaag tgactgcagg tcacttttcc tggagcgggt
+ 6301 gagagaaaag tggaagttag cagtaactgc tgaattcctg gttggctgat ggaaagatgg
+ 6361 ggcagctgtt cactggtacg cagggtttta gatgtatgta cctaaggata tgaggtatgg
+ 6421 caatgaacag aaattctttt gggaatgagt tttagggcca ttaaaggaca tgacctgaag
+ 6481 tttcctctga ggccagtccc cacaactcaa tataaatgtg tttcctgcat atagtcaaag
+ 6541 ttgccacttc tttttcttca tatcatcgat ctctgctctt aaagataatc ttggttttgc
+ 6601 ctcaaactgt ttgtcactac aaactttccc catgttccta agtaaaacag gtaactgcct
+ 6661 ctcaactata tcaagtagac taaaatattg tgtctctaat atcagaaatt cagctttaat
+ 6721 atattgggtt taactctttg aaatttagag tctccttgaa atacacatgg gggtgatttc
+ 6781 ctaaacttta tttcttgtaa ggatttatct caggggtaac acacaaacca gcatcctgaa
+ 6841 cctctaagta tgaggacagt aagccttaag aatataaaat aaactgttct tctctctgcc
+ 6901 ggtggaagtg tgccctgtct attcctgaaa ttgcttgttt gagacgcatg agacgtgcag
+ 6961 cacatgagac acgtgcagca gcctgtggaa tattgtcagt gaagaatgtc tttgcctgat
+ 7021 tagatataaa gacaagttaa acacagcatt agactataga tcaagcctgt gccagacaca
+ 7081 aatgacctaa tgcccagcac gggccacgga atctcctatc ctcttgcttg aacagagcag
+ 7141 cacacttctc ccccaacact attagatgtt ctggcataat tttgtagata tgtaggattt
+ 7201 gacatggact attgttcaat gattcagagg aaatctcctt tgttcagata agtacactga
+ 7261 ctactaaatg gattaaaaaa cacagtaata aaacccagtt ttccccttac ttccctagtt
+ 7321 tgtttcttat tctgctttct tccaagttga tgctggatag aggtgtttat ttctattcta
+ 7381 aaaagtgatg aaattggccg ggcgcggtgg ctcacacctg taatcccagc actttgggag
+ 7441 gctgaggtgg gcggatcacg aggtcaggag atcaagacca tcctggctaa catggtgaaa
+ 7501 ccccatctct actaaaaata caaaaaatta gccagagaca gtggcgggtg cctgtagtcc
+ 7561 cagctactcg ggaggctgag gcaggagaat ggcgtgaacc tgggaggcag agcttgcggt
+ 7621 gagcagagat cgcgccactg cacactccag cctgggtgac aaagcgagac tccatctcaa
+ 7681 aaaaaaaaaa aaaaaaaaga aaaagaaaga aagaaagaaa aaaaaactga tgaaattgtg
+ 7741 tattcaatgt agtctcaaga gaattgaaaa ccaagaaagg ctgtggcttc ttccacataa
+ 7801 agcctggatg aataacagga taacacgttg ttacattgtc acaactcctg atccaggaat
+ 7861 tgatggctaa gatattcgta attcttatcc ttttcagttg taacttattc ctatttgtca
+ 7921 gcattcaggt tattagcggc tgctggcgaa gtccttgaga aataaactgc acactggatg
+ 7981 gtgggggtag tgtaggaaaa tggaggggaa ggaagtaaag tttcaaatta agcctgaaca
+ 8041 gcaaagttcc cctgagaagg ccacctggat tctatcagaa actcgaatgt ccatcttgca
+ 8101 aaacttcctt gcccaaaccc cacccctgga gtcacaaccc acccttgacc aatagattca
+ 8161 ttttactgag ggaggcaaag ggctggtcaa tagattcatt tcactgggag aggcaaaggg
+ 8221 ctgggggcca gagaggagaa gtaaaaagcc acacatgaag cagcaatgca ggcatgcttc
+ 8281 tggctcatct gtgatcacca ggaaactccc agatctgaca ctgtagtgca tttcactgct
+ 8341 gacaagaagg ctgctgccac cagcctgtga agcaaggtta aggtgagaag gctggaggtg
+ 8401 agattctggg caggtaggta ctggaagccg ggacaaggtg cagaaaggca gaaagtgttt
+ 8461 ctgaaagagg gattagcccg ttgtcttaca tagtctgact ttgcacctgc tctgtgatta
+ 8521 tgactatccc acagtctcct ggttgtctac ccatggacct agaggtactt tgaaagtttt
+ 8581 ggatatctgg gctctgactg tgcaataatg ggcaacccca aagtcaaggc acatggcaag
+ 8641 aaggtgctga tctccttcgg aaaagctgtt atgctcacgg atgacctcaa aggcaccttt
+ 8701 gctacactga gtgacctgca ctgtaacaag ctgcacgtgg accctgagaa cttcctggtg
+ 8761 agtagtaagt acactcacgc tttcttcttt acccttagat atttgcacta tgggtacttt
+ 8821 tgaaagcaga ggtggctttc tcttgtgtta tgagtcagct atgggatatg atatttcagc
+ 8881 agtgggattt tgagagttat gttgctgtaa ataacataac taaaatttgg tagagcaagg
+ 8941 actatgaata atggaaggcc acttaccatt tgatagctct gaaaaacaca tcttataaaa
+ 9001 aattctggcc aaaatcaaac tgagtgtttt tggatgaggg aacagaagtt gagatagaga
+ 9061 aaataacatc tttcctttgg tcagcgaaat tttctataaa aattaatagt cacttttctg
+ 9121 catagtcctg gaggttagaa aaagatcaac tgaacaaagt agtgggaagc tgttaaaaag
+ 9181 aggattgttt ccctccgaat gatgatggta tacttttgta cgcatggtac aggattcttt
+ 9241 gttatgagtg tttgggaaaa ttgtatgtat gtatgtatgt atgtatgtga tgactgggga
+ 9301 cttatcctat ccattactgt tccttgaagt actattatcc tactttttaa aaggacgaag
+ 9361 tctctaaaaa aaaaaatgaa acaatcacaa tatgttgggg tagtgagttg gcatagcaag
+ 9421 taagagaagg ataggacaca atgggaggtg cagggctgcc agtcatattg aagctgatat
+ 9481 ctagcccata atggtgagag ttgctcaaac tctggtgaaa aaggatgtaa gtgttatatc
+ 9541 tatttactgc aagtccagct tgaggccttc tattcactat gtaccatttt cttttttatc
+ 9601 ttcactccct ccccagctct taggcaacgt gatattgatt gttttggcaa cccacttcag
+ 9661 cgaggatttt accctacaga tacaggcttc ttggcagtaa ctaacaaatg ctgtggttaa
+ 9721 tgctgtagcc cacaagacca ctgagttccc tgtccactat gtttgtacct atggtccact
+ 9781 atgtttgtac ctatgtccca aaatctcatc tcctttagat gggggaggtt ggggagaaga
+ 9841 gcagtatcct gcctgctgat tcagttcctg catgataaaa atagaataaa gaaatatgct
+ 9901 ctctaagaaa tatcattgta ctctttttct gtctttatat tttaccctga ttcagccaaa
+ 9961 aggacgcact atttctgatg gaaatgagaa tgttggagaa tgggagttta aggacagaga
+ 10021 agatactttc ttgcaatcct gcaagaaaag agagaactcg tgggtggatt tagtggggta
+ 10081 gttactccta ggaaggggaa atcgtctcta gaataagaca atgtttttac agaaagggag
+ 10141 gtcaatggag gtactctttg gaggtgtaag aggattgttg gtagtgtgta gaggtatgtt
+ 10201 aggactcaaa ttagaagttc tgtataggct attatttgta tgaaactcag gatatagctc
+ 10261 atttggtgac tgcagttcac ttctacttat tttaaacaac atatttttta ttatttataa
+ 10321 tgaagtgggg atggggcttc ctagagacca atcaagggcc aaaccttgaa ctttctctta
+ 10381 acgtcttcaa tggtattaat agagaattat ctctaaggca tgtgaactgg ctgtcttggt
+ 10441 tttcatctgt acttcatctg ctacctctgt gacctgaaac atatttataa ttccattaag
+ 10501 ctgtgcatat gatagattta tcatatgtat tttccttaaa ggatttttgt aagaactaat
+ 10561 tgaattgata cctgtaaagt ctttatcaca ctacccaata aataataaat ctctttgttc
+ 10621 agctctctgt ttctataaat atgtacaagt tttattgttt ttagtggtag tgattttatt
+ 10681 ctctttctat atatatacac acacatatgt gtgcattcat aaatatatac aatttttatg
+ 10741 aataaaaaat tattagcaat caatattgaa aaccactgat ttttgtttat gtgagcaaac
+ 10801 agcagattaa aaggctgaga tttaggaaac agcacgttaa gtcaagttga tagaggagaa
+ 10861 tatggacatt taaaagaggc aggatgatat aaaattaggg aaactggatg cagagaccag
+ 10921 atgaagtaag aaaaatagct atcgttttga gcaaaaatca ctgaagtttc ttgcatatga
+ 10981 gagtgacata ataaataggg aaacgtagaa aattgattca catgtatata tatatataga
+ 11041 actgattaga caaagtctaa cttgggtata gtcagaggag cttgctgtaa ttatattgag
+ 11101 gtgatggata aagaactgaa gttgatggaa acaatgaagt taagaaaaaa aatcgagtaa
+ 11161 gagaccattg tggcagtgat tgcacagaac tggaaaacat tgtgaaacag agagtcagag
+ 11221 atgacagcta aaatccctgt ctgtgaatga aaagaaggaa atttattgac agaacagcaa
+ 11281 atgcctacaa gccccctgtt tggatctggc aatgaacgta gccattctgt ggcaatcact
+ 11341 tcaaactcct gtacccaaga cccttaggaa gtatgtagca ccctcaaacc taaaacctca
+ 11401 aagaaagagg ttttagaaga tataataccc tttcttctcc agtttcatta atcccaaaac
+ 11461 ctctttctca aagtatttcc tctatgtgtc caccccaaag agctcacctc accatatctc
+ 11521 ttgagtggga gcacatagat aggcggtgct accatctaac agcttctgaa attcctttgt
+ 11581 catatttttg agtccccact aataacccac aaagcagaat aaataccagt tgctcatgta
+ 11641 caataatcac tcaactgctg tcttgtagca tacattaatt aagcacattc tttgaataat
+ 11701 tactgtgtcc aaacaatcac actttaaaat ctcacacttg tgctatccct tgcccttctg
+ 11761 aatgtcactc tgtattttaa atgaagagat gagggttgaa tttcctgtgt tacttattgt
+ 11821 tcatttctcg atgaggagtt ttcacattca cctttagtgg aaaacacata agtacacatc
+ 11881 ttacaggaaa aatataccaa actgacatgt agcatgaatg cttgtgcatg tagtcatata
+ 11941 aaatcttgta gcaatgtaaa cattctctga tatacacata cagatgtgtc tatatgtcta
+ 12001 cacaatttct tatgctccat gaacaaacat tccatgcaca cataagaaca cacactgtta
+ 12061 cagatgcata cttgagtgca ttgacaaaat taccccagtc aatctagaga atttggattt
+ 12121 ctgcatttga ctctgttagc tttgtacatg ctgttcattt actctgggtg atgtctttcc
+ 12181 ctcattttgc cttgtctatc ttgtactcat actttaagtc ctaacttata tgttatctca
+ 12241 actaagaagc tatttttttt ttaattttaa ctgggcttaa agccctgtct ataaactctg
+ 12301 ctacaattat gggctctttc ttataatatt tagtgttttt cctactaatg tacttaatct
+ 12361 gctcattgta tattcctacc actaaatttt aacctctttt atggtagaga cattgtcttg
+ 12421 taaactctta tttccctagt atttggagat gaaaaaaaag attaaattat ccaaaattag
+ 12481 atctctcttt tctacattat gagtattaca ctatccatag agaagtttgt ttgagaccta
+ 12541 aactgaggaa cctttggttc taaaatgact atgtgatatc ttagtattta taggtcatga
+ 12601 ggttccttcc tctgcctctg ctatagtttg attagtcaac aagcatgtgt catgcattta
+ 12661 ttcacatcag aatttcatac actaataaga catagtatca gaagtcagtt tattagttat
+ 12721 atcagttagg gtccatcaag gaaaggacaa accattatca gttactcaac ctagaattaa
+ 12781 atacagctct taatagttaa ttatccttgt attggaagag ctaaaatatc aaataaagga
+ 12841 cagtgcagaa atctagatgt tagtaacatc agaaaacctc ttccgccatt aggcctagaa
+ 12901 gggcagaagg agaaaatgtt tataccacca gagtccagaa ccagagccca taaccagagg
+ 12961 tccactggat tcagtgagct agtgggtgct ccttggagag agccagaact gtctaatggg
+ 13021 ggcatcaaag tatcagccat aaaaaaccat aaaaaagact gtctgctgta ggagatccgt
+ 13081 tcagagagag agagagacca gaaataatct tgcttatgct ttccctcagc cagtgtttac
+ 13141 cattgcagaa tgtacatgcg actgaaaggg tgaggaaacc tgggaaatgt cagttcctca
+ 13201 aatacagaga acactgaggg aaggatgaga aataaatgtg aaagcagaca tgaatggtaa
+ 13261 ttgacagaag gaaactagga tgtgtccagt aaatgaataa ttacagtgtg cagtgattat
+ 13321 tgcaatgatt aatgtattga taagataata tgaaaacaca gaattcaaac agcagtgaac
+ 13381 tgagattaga attgtggaga gcactggcat ttaagaatgt cacacttaga atgtgtctct
+ 13441 aggcattgtt ctgtgcatat atcatctcaa tattcattat ctgaaaatta tgaattaggt
+ 13501 acaaagctca aataatttat tttttcaggt tagcaagaac tttttttttt tttttctgag
+ 13561 atagagcatt gctatggttg cccaggctgg agtgcaatgg catgatccag gctcactgca
+ 13621 acatctgcct cccaggttca agcgattctc ctgcctcagc ctcccaagta gctggcacta
+ 13681 caggcatgtg ccaccaccat gcctggctaa ttttctattt ttagtagata gggggtttca
+ 13741 ccatgttggt caggctgatc tcgaactcct aacatcaggt gatccaccct cctcggcctc
+ 13801 tgaaagtgct gggatcacag gcgtgagcca ccacacccag ccaagaatgt gaattttgta
+ 13861 gaaggatata acccatattt ctctgaccct agagtcctta gtatacctcc cataccatgt
+ 13921 ggctcatcct ccttacatac atttcccatc tttcacccta ccttttcctt tttgtttcag
+ 13981 cttttcactg tgtgtcaaaa tctagaacct tatctcctac ctgctctgaa accaacagca
+ 14041 agttgacttc cattctaacc cacattggca ttacactaat taaaatcgat actgagttct
+ 14101 aaaatcatcg gggattttgg ggactatgtc ttacttcata cttccttgag atttcacatt
+ 14161 aaatgttggt gttcattaaa ggtccttcat ttaactttgt attcatcaca ctcttggatt
+ 14221 cacagttata tctaaactct taaatacagc ctgtataatc ccaattccca actctgattt
+ 14281 ctaacctctg acctccaacc tcagtgccaa acccatatat caaacaatgt actgggctta
+ 14341 tttatataga tgtcctatag gcacctcaga ctcagcatgg gtatttcact tgttatacta
+ 14401 aaactgtttc tcttccagtg ttttccattt tagtcattag atagctactt gcccattcac
+ 14461 caaggtcaca gattaaaatc atttccctac ctctaatcaa cagttcgatt ctgcttcaat
+ 14521 ttgtccctat ctattaatca ccactcttac tgcccagtca ggtcctcatt gtttcctgaa
+ 14581 caagagtaga tgctattctt tccactttta gaccttatcc tggctggatg cggtggctca
+ 14641 ggcttgtaaa cccagcactt tgggaggcca aggcaggcag atcacttgag gtcaggagtt
+ 14701 caagaccagc ctgaccaaca tggtgaaacc ccatctctac taaaaataca aaatcagccg
+ 14761 ggcgtgtggt gcatgcctgc agtcccagct attcaggtgg ctgaggcagg agaattgctt
+ 14821 gaacccagga ggcagaggtt gcggtgagcc tagattgcac cattgcactc tagcttgggc
+ 14881 aatagggatg aaactccatc tcagaagaga aaagaaaaaa agaccttatt ctgttataca
+ 14941 aatcctctca atgcaatcca tatagaataa acatgtaacc agatctccca atgtgtaaaa
+ 15001 tcatttcagg tagaacagaa ttaaagtgaa aagccaagtc tttggaatta acagacaaag
+ 15061 atcaaataac agtcctcatg gccttaagaa tttacctaac atttttttta gaatcaattt
+ 15121 tcttatatat gaattggaaa cataattcct ccctcacaaa cacattctaa gattttaagg
+ 15181 agatattgat gaagtacatc atctgtcatt tttaacaggt agtggtagtg attcacacag
+ 15241 cacattatga tctgttcttg tatgttctgt tccattctgt attcttgacc tggttgtatt
+ 15301 ctttctgagc tccagatcca catatctaag tacatctttt tgcattttac aagagtgcat
+ 15361 acaatacaat gtatccaaga ctgtatttct gattttatcg taccactaaa ctcacaaatg
+ 15421 tggccctatt cttgtgttca
+//
\ No newline at end of file
--- /dev/null
+package jalview.io;
+
+import static org.junit.Assert.assertNotNull;
+import static org.junit.Assert.assertTrue;
+import jalview.datamodel.Alignment;
+import jalview.datamodel.AlignmentAnnotation;
+import jalview.datamodel.AlignmentI;
+import jalview.datamodel.Annotation;
+import jalview.datamodel.SequenceFeature;
+import jalview.datamodel.SequenceI;
+
+import java.io.File;
+import java.io.IOException;
+import java.util.BitSet;
+import java.util.HashMap;
+import java.util.Map;
+
+import org.junit.Test;
+
+public class GenBankTest {
+// private final static File GENBANK_FILE = new File("test/jalview/io/V00505.gb");
+// private final static File GENBANK_FILE = new File("test/jalview/io/NC_000011.10.gb");
+ private final static File GENBANK_FILE = new File("test/jalview/io/M92650.1.gb");
+
+ @Test
+ public void testParsing(){
+ testFileIOwithFormat(GENBANK_FILE, "GENBANK");
+ }
+ /**
+  * Tests that the alignment data in the given file can be imported, exported
+  * and reimported with no data loss.
+  * <p>
+  * The file is read using the format reported by IdentifyFile.Identify(),
+  * written out with the writer selected by <code>ioformat</code>, and the
+  * generated text is read back in as pasted data. The generated text must be
+  * identified as <code>ioformat</code>, and the reimported alignment is
+  * compared with the original via
+  * {@link #testAlignmentEquivalence(AlignmentI, AlignmentI)}.
+  *
+  * @param f
+  *          - source datafile (IdentifyFile.Identify() should work with it)
+  * @param ioformat
+  *          - label for IO class used to write and read back in the data from
+  *          f
+  * @throws AssertionError
+  *           if any stage of the round trip fails; an exception raised while
+  *           reading, writing or identifying is attached as the cause
+  */
+ public static void testFileIOwithFormat(File f, String ioformat)
+ {
+   System.out.println("Reading file: " + f);
+   String ff = f.getPath();
+   try
+   {
+     AppletFormatAdapter rf = new AppletFormatAdapter();
+
+     Alignment al = rf.readFile(ff, AppletFormatAdapter.FILE,
+             new IdentifyFile().Identify(ff, AppletFormatAdapter.FILE));
+
+     assertNotNull("Couldn't read supplied alignment data.", al);
+
+     // make sure dataset is initialised ? not sure about this
+     // (each sequence is installed as its own dataset sequence)
+     for (int i = 0; i < al.getSequencesArray().length; ++i)
+     {
+       al.getSequenceAt(i).setDatasetSequence(al.getSequenceAt(i));
+     }
+     String outputfile = rf.formatSequences(ioformat, al, true);
+     System.out.println("Output file in '"+ioformat+"':\n"+outputfile+"\n<<EOF\n");
+     // test for consistency in io
+     Alignment al_input = new AppletFormatAdapter().readFile(outputfile,
+             AppletFormatAdapter.PASTE, ioformat);
+     assertNotNull("Couldn't parse reimported alignment data.", al_input);
+
+     String identifyoutput = new IdentifyFile().Identify(outputfile,
+             AppletFormatAdapter.PASTE);
+     assertNotNull("Identify routine failed for outputformat " + ioformat,
+             identifyoutput);
+     assertTrue(
+             "Identify routine could not recognise output generated by '"
+                     + ioformat + "' writer",
+             ioformat.equals(identifyoutput));
+     testAlignmentEquivalence(al, al_input);
+   } catch (Exception e)
+   {
+     // Any stage above (read, format, identify, re-read) may have thrown, so
+     // report the failure generically and chain the original exception so the
+     // test report shows where it really came from.
+     AssertionError failure = new AssertionError("Round trip of '" + f
+             + "' using format '" + ioformat + "' failed: " + e);
+     failure.initCause(e);
+     throw failure;
+   }
+ }
+  /**
+   * Assert alignment equivalence. Checks, in order: both alignments are
+   * non-null and have the same height and width; alignment annotation rows at
+   * the same index match (content via equalss, graph type, visibility,
+   * threshold line and graph-group membership pattern); both alignments have
+   * the same number of annotation rows; and, for every original sequence that
+   * has a counterpart with the same name, start and end in the generated
+   * alignment, the residues, sequence features and sequence-associated
+   * annotation match. An original sequence with no such counterpart is not
+   * reported as a failure.
+   * 
+   * @param al
+   *          'original'
+   * @param al_input
+   *          'secondary' or generated alignment from some datapreserving
+   *          transformation
+   */
+  public static void testAlignmentEquivalence(AlignmentI al,
+          AlignmentI al_input)
+  {
+    assertNotNull("Original alignment was null", al);
+    assertNotNull("Generated alignment was null", al_input);
+
+    assertTrue(
+            "Alignment dimension mismatch: original contains "
+                    + al.getHeight() + " and generated has "
+                    + al_input.getHeight() + " sequences; original has "
+                    + al.getWidth() + " and generated has "
+                    + al_input.getWidth() + " columns.",
+            al.getHeight() == al_input.getHeight()
+                    && al.getWidth() == al_input.getWidth());
+
+    // check Alignment annotation
+    AlignmentAnnotation[] aa_new = al_input.getAlignmentAnnotation();
+    AlignmentAnnotation[] aa_original = al.getAlignmentAnnotation();
+
+    // note - at moment we do not distinguish between alignment without any
+    // annotation rows and alignment with no annotation row vector
+    // we might want to revise this in future
+    int aa_new_size = (aa_new == null ? 0 : aa_new.length);
+    int aa_original_size = (aa_original == null ? 0 : aa_original.length);
+
+    // for each graphGroup value seen so far: the set of row indices using it
+    Map<Integer, BitSet> orig_groups = new HashMap<Integer, BitSet>();
+    Map<Integer, BitSet> new_groups = new HashMap<Integer, BitSet>();
+
+    if (aa_new != null && aa_original != null)
+    {
+      for (int i = 0; i < aa_original.length; i++)
+      {
+        if (aa_new.length > i)
+        {
+          assertTrue("Different alignment annotation at position " + i,
+                  equalss(aa_original[i], aa_new[i]));
+          // compare graphGroup or graph properties - needed to verify
+          // JAL-1299
+          assertTrue("Graph type not identical.",
+                  aa_original[i].graph == aa_new[i].graph);
+          assertTrue("Visibility not identical.",
+                  aa_original[i].visible == aa_new[i].visible);
+          assertTrue(
+                  "Threshold line not identical.",
+                  aa_original[i].threshold == null ? aa_new[i].threshold == null
+                          : aa_original[i].threshold
+                                  .equals(aa_new[i].threshold));
+          // graphGroup may differ, but pattern should be the same: the rows
+          // already grouped with this one must be the same on both sides
+          Integer o_ggrp = Integer.valueOf(aa_original[i].graphGroup + 2);
+          Integer n_ggrp = Integer.valueOf(aa_new[i].graphGroup + 2);
+          BitSet orig_g = orig_groups.get(o_ggrp);
+          BitSet new_g = new_groups.get(n_ggrp);
+          if (orig_g == null)
+          {
+            orig_g = new BitSet();
+            orig_groups.put(o_ggrp, orig_g);
+          }
+          if (new_g == null)
+          {
+            new_g = new BitSet();
+            new_groups.put(n_ggrp, new_g);
+          }
+          assertTrue("Graph Group pattern differs at annotation " + i,
+                  orig_g.equals(new_g));
+          orig_g.set(i);
+          new_g.set(i);
+        }
+        else
+        {
+          System.err.println("No matching annotation row for "
+                  + aa_original[i].toString());
+        }
+      }
+    }
+    assertTrue(
+            "Generated and imported alignment have different annotation sets ("
+                    + aa_new_size + " != " + aa_original_size + ")",
+            aa_new_size == aa_original_size);
+
+    // check sequences, annotation and features
+    SequenceI[] seq_original = al.getSequencesArray();
+    SequenceI[] seq_new = al_input.getSequencesArray();
+
+    for (int i = 0; i < seq_original.length; i++)
+    {
+      String name = seq_original[i].getName();
+      int start = seq_original[i].getStart();
+      int end = seq_original[i].getEnd();
+      System.out.println("Check sequence: " + name + "/" + start + "-"
+              + end);
+
+      // search equal sequence (first one with same name, start and end)
+      for (int in = 0; in < seq_new.length; in++)
+      {
+        if (name.equals(seq_new[in].getName())
+                && start == seq_new[in].getStart()
+                && end == seq_new[in].getEnd())
+        {
+          String ss_original = seq_original[i].getSequenceAsString();
+          String ss_new = seq_new[in].getSequenceAsString();
+          assertTrue("The sequences " + name + "/" + start + "-" + end
+                  + " are not equal", ss_original.equals(ss_new));
+
+          SequenceFeature[] sequenceFeatures_original = seq_original[i]
+                  .getSequenceFeatures();
+          SequenceFeature[] sequenceFeatures_new = seq_new[in]
+                  .getSequenceFeatures();
+
+          // either both sequences carry features or neither does
+          assertTrue(
+                  "Sequence Features were not equivalent",
+                  (sequenceFeatures_original == null) == (sequenceFeatures_new == null));
+          // compare sequence features
+          if (sequenceFeatures_original != null
+                  && sequenceFeatures_new != null)
+          {
+            System.out.println("There are feature!!!");
+            assertTrue(
+                    "different number of features",
+                    sequenceFeatures_original.length == sequenceFeatures_new.length);
+
+            for (int feat = 0; feat < sequenceFeatures_original.length; feat++)
+            {
+              assertTrue("Different features",
+                      sequenceFeatures_original[feat]
+                              .equals(sequenceFeatures_new[feat]));
+            }
+          }
+
+          // compare alignment annotation
+          AlignmentAnnotation[] annots_original = al.getSequenceAt(i)
+                  .getAnnotation();
+          AlignmentAnnotation[] annots_new = al_input.getSequenceAt(in)
+                  .getAnnotation();
+          if (annots_original != null && annots_new != null)
+          {
+            for (int j = 0; j < annots_original.length; j++)
+            {
+              if (annots_original[j] == null)
+              {
+                continue;
+              }
+              // fail with a message rather than an index-out-of-bounds error
+              // when the generated sequence has fewer annotation rows
+              assertTrue("Sequence " + name + " has only "
+                      + annots_new.length
+                      + " annotation rows in the generated alignment but "
+                      + annots_original.length + " in the original",
+                      j < annots_new.length);
+              if (annots_new[j] != null)
+              {
+                assertTrue("Different annotation elements",
+                        equalss(annots_original[j], annots_new[j]));
+              }
+            }
+          }
+          else if (annots_original == null && annots_new == null)
+          {
+            System.out.println("No annotations");
+          }
+          else if (annots_original != null && annots_new == null)
+          {
+            // report the matched generated sequence (index in, not i)
+            assertTrue("Annotations differed between sequences ("
+                    + al.getSequenceAt(i).getName() + ") and ("
+                    + al_input.getSequenceAt(in).getName() + ")", false);
+          }
+          // remaining case (annotation only on the generated sequence) is
+          // accepted without complaint
+          break;
+        }
+      }
+    }
+  }
+  /**
+   * Compare two annotation rows element by element. The rows match when they
+   * have the same number of elements and, at every index, either both
+   * elements are null or both are non-null with equal trimmed display
+   * characters, equal trimmed secondary structure symbols and equal
+   * descriptions (two null descriptions count as equal). The first mismatch
+   * is written to System.err.
+   * 
+   * @param annot_or
+   *          annotation row from the original alignment
+   * @param annot_new
+   *          annotation row from the generated alignment
+   * @return true if every element matches, false otherwise
+   */
+  private static boolean equalss(AlignmentAnnotation annot_or,
+          AlignmentAnnotation annot_new)
+  {
+    if (annot_or.annotations.length != annot_new.annotations.length)
+    {
+      System.err.println("Different lengths for annotation row elements: "+annot_or.annotations.length +"!="+ annot_new.annotations.length);
+      return false;
+    }
+    for (int i = 0; i < annot_or.annotations.length; i++)
+    {
+      Annotation an_or = annot_or.annotations[i];
+      Annotation an_new = annot_new.annotations[i];
+      if (an_or == null && an_new == null)
+      {
+        // gap in both rows - nothing to compare
+        continue;
+      }
+      if (an_or == null || an_new == null)
+      {
+        System.err.println("Annotation Element Mismatch\nElement "+i+" in original: "+(an_or==null ? "is null" : an_or.toString())+"\nElement "+i+" in new: "+(an_new == null ? "is null" : an_new.toString()));
+        return false;
+      }
+
+      // null-safe: previously a null description on the original element
+      // threw a NullPointerException before the null checks were reached
+      boolean sameDescription = (an_or.description == null) ? an_new.description == null
+              : an_or.description.equals(an_new.description);
+
+      if (!an_or.displayCharacter.trim().equals(
+              an_new.displayCharacter.trim())
+              || !("" + an_or.secondaryStructure).trim().equals(
+                      ("" + an_new.secondaryStructure).trim())
+              || !sameDescription)
+      {
+        System.err.println("Annotation Element Mismatch\nElement "+i+" in original: "+an_or.toString()+"\nElement "+i+" in new: "+an_new.toString());
+        return false;
+      }
+    }
+    return true;
+  }
+}
--- /dev/null
+LOCUS HUMDMDXX 2110 bp mRNA linear PRI 07-NOV-1994
+DEFINITION Human Duchenne muscular dystrophy (DMD) mRNA,
+complete cds.
+ACCESSION M92650
+VERSION M92650.1 GI:181598
+KEYWORDS Duchenne muscular
+dystrophy protein.
+SOURCE Homo sapiens (human)
+ ORGANISM Homo sapiens
+ Eukaryota; Metazoa; Chordata; Craniata; Vertebrata; Euteleostomi;
+ Mammalia; Eutheria; Euarchontoglires; Primates; Haplorrhini;
+ Catarrhini; Hominidae; Homo.
+REFERENCE 1 (bases 1 to 2110)
+ AUTHORS Lederfein,D., Levy,Z., Augier,N., Mornet,D.,
+Morris,G., Fuchs,O.,
+ Yaffe,D. and Nudel,U.
+ TITLE A 71-kilodalton protein is a major product of the Duchenne muscular
+ dystrophy gene in brain and other nonmuscle tissues
+ JOURNAL Proc. Natl. Acad. Sci. U.S.A. 89 (12), 5346-5350 (1992)
+ PUBMED 1319059
+COMMENT Original source text: Homo sapiens brain cDNA to mRNA.
+FEATURES Location/Qualifiers
+ source 1..2110
+ /organism="Homo sapiens"
+ /mol_type="mRNA"
+ /db_xref="taxon:9606"
+ /map="Xp21.3-p21.1"
+ /cell_type="amniotic fluid"
+ /tissue_type="brain"
+ gene 1..2110
+ /gene="DMD"
+ 5'UTR 1..52
+ /gene="DMD"
+ /note="G00-119-850"
+ CDS 53..1921
+ /gene="DMD"
+ /codon_start=1
+ /protein_id="AAA52316.1"
+ /db_xref="GI:181599"
+ /db_xref="GDB:G00-119-850"
+ /translation="MREQLKGHETQTTCWDHPKMTELYQSLADLNNVRFSAYRTAMKL
+ RRLQKALCLDLLSLSAACDALDQHNLKQNDQPMDILQIINCLTTIYDRLEQEHNNLVN
+ VPLCVDMCLNWLLNVYDTGRTGRIRVLSFKTGIISLCKAHLEDKYRYLFKQVASSTGF
+ CDQRRLGLLLHDSIQIPRQLGEVASFGGSNIEPSVRSCFQFANNKPEIEAALFLDWMR
+ LEPQSMVWLPVLHRVAAAETAKHQAKCNICKECPIIGFRYRSLKHFNYDICQSCFFSG
+ RVAKGHKMHYPMVEYCTPTTSGEDVRDFAKVLKNKFRTKRYFAKHPRMGYLPVQTVLE
+ GDNMETPASSPQLSHDDTHSRIEHYASRLAEMENSNGSYLNDSISPNESIDDEHLLIQ
+ HYCQSLNQDSPLSQPRSPAQILISLESEERGELERILADLEEENRNLQAEYDRLKQQH
+ EHKGLSPLPSPPEMMPTSPQSPRDAELIAEAKLLRQHKGRLEARMQILEDHNKQLESQ
+ LHRLRQLLEQPQAEAKVNGTTVSSPSTSLQRSDSSQPMLLRVVGSQTSDSMGEEDLLS
+ PPQDTSTGLEEVMEQLNNSFPSSRGHNVGSLFHMADDLGRAMESLVSVMTDEEGAE"
+ 3'UTR 1922..2110
+ /gene="DMD"
+ /note="G00-119-850"
+ORIGIN
+ 1 gaagctcact cctccactcg tacccacact cgaccgcgga gcccttgcag ccatgaggga
+ 61 acagctcaaa ggccacgaga ctcaaacaac ttgctgggac catcccaaaa tgacagagct
+ 121 ctaccagtct ttagctgacc tgaataatgt cagattctca gcttatagga ctgccatgaa
+ 181 actccgaaga ctgcagaagg ccctttgctt ggatctcttg agcctgtcag ctgcatgtga
+ 241 tgccttggac cagcacaacc tcaagcaaaa tgaccagccc atggatatcc tgcagattat
+ 301 taattgtttg accactattt atgaccgcct ggagcaagag cacaacaatt tggtcaacgt
+ 361 ccctctctgc gtggatatgt gtctgaactg gctgctgaat gtttatgata cgggacgaac
+ 421 agggaggatc cgtgtcctgt cttttaaaac tggcatcatt tccctgtgta aagcacattt
+ 481 ggaagacaag tacagatacc ttttcaagca agtggcaagt tcaacaggat tttgtgacca
+ 541 gcgcaggctg ggcctccttc tgcatgattc tatccaaatt ccaagacagt tgggtgaagt
+ 601 tgcatccttt gggggcagta acattgagcc aagtgtccgg agctgcttcc aatttgctaa
+ 661 taataagcca gagatcgaag cggccctctt cctagactgg atgagactgg aaccccagtc
+ 721 catggtgtgg ctgcccgtcc tgcacagagt ggctgctgca gaaactgcca agcatcaggc
+ 781 caaatgtaac atctgcaaag agtgtccaat cattggattc aggtacagga gtctaaagca
+ 841 ctttaattat gacatctgcc aaagctgctt tttttctggt cgagttgcaa aaggccataa
+ 901 aatgcactat cccatggtgg aatattgcac tccgactaca tcaggagaag atgttcgaga
+ 961 ctttgccaag gtactaaaaa acaaatttcg aaccaaaagg tattttgcga agcatccccg
+ 1021 aatgggctac ctgccagtgc agactgtctt agagggggac aacatggaaa cgcctgcctc
+ 1081 gtcccctcag ctttcacacg atgatactca ttcacgcatt gaacattatg ctagcaggct
+ 1141 agcagaaatg gaaaacagca atggatctta tctaaatgat agcatctctc ctaatgagag
+ 1201 catagatgat gaacatttgt taatccagca ttactgccaa agtttgaacc aggactcccc
+ 1261 cctgagccag cctcgtagtc ctgcccagat cttgatttcc ttagagagtg aggaaagagg
+ 1321 ggagctagag agaatcctag cagatcttga ggaagaaaac aggaatctgc aagcagaata
+ 1381 tgaccgtcta aagcagcagc acgaacataa aggcctgtcc ccactgccgt cccctcctga
+ 1441 aatgatgccc acctctcccc agagtccccg ggatgctgag ctcattgctg aggccaagct
+ 1501 actgcgtcaa cacaaaggcc gcctggaagc caggatgcaa atcctggaag accacaataa
+ 1561 acagctggag tcacagttac acaggctaag gcagctgctg gagcaacccc aggcagaggc
+ 1621 caaagtgaat ggcacaacgg tgtcctctcc ttctacctct ctacagaggt ccgacagcag
+ 1681 tcagcctatg ctgctccgag tggttggcag tcaaacttcg gactccatgg gtgaggaaga
+ 1741 tcttctcagt cctccccagg acacaagcac agggttagag gaggtgatgg agcaactcaa
+ 1801 caactccttc cctagttcaa gaggacacaa tgtaggaagt cttttccaca tggcagatga
+ 1861 tttgggcaga gcgatggagt ccttagtatc agtcatgaca gatgaagaag gagcagaata
+ 1921 aatgttttac aactcctgat tcccgcatgg tttttataat attcatacaa caaagaggat
+ 1981 tagacagtaa gagtttacaa gaaataaatc tatatttttg tgaagggtag tggtattata
+ 2041 ctgtagattt cagtagtttc taagtctgtt attgttttgt taacaatggc aggttttaca
+ 2101 cgtctatgca
+//
\ No newline at end of file
--- /dev/null
+LOCUS NC_000011 1800 bp DNA linear CON 03-FEB-2014
+DEFINITION Homo sapiens chromosome 11, GRCh38 Primary Assembly.
+ACCESSION NC_000011 REGION: complement(5232829..5234628) GPC_000001303
+VERSION NC_000011.10 GI:568815587
+DBLINK BioProject: PRJNA168
+ Assembly: GCF_000001405.26
+KEYWORDS RefSeq.
+SOURCE Homo sapiens (human)
+ ORGANISM Homo sapiens
+ Eukaryota; Metazoa; Chordata; Craniata; Vertebrata; Euteleostomi;
+ Mammalia; Eutheria; Euarchontoglires; Primates; Haplorrhini;
+ Catarrhini; Hominidae; Homo.
+REFERENCE 1 (bases 1 to 1800)
+ AUTHORS Taylor,T.D., Noguchi,H., Totoki,Y., Toyoda,A., Kuroki,Y., Dewar,K.,
+ Lloyd,C., Itoh,T., Takeda,T., Kim,D.W., She,X., Barlow,K.F.,
+ Bloom,T., Bruford,E., Chang,J.L., Cuomo,C.A., Eichler,E.,
+ FitzGerald,M.G., Jaffe,D.B., LaButti,K., Nicol,R., Park,H.S.,
+ Seaman,C., Sougnez,C., Yang,X., Zimmer,A.R., Zody,M.C.,
+ Birren,B.W., Nusbaum,C., Fujiyama,A., Hattori,M., Rogers,J.,
+ Lander,E.S. and Sakaki,Y.
+ TITLE Human chromosome 11 DNA sequence and analysis including novel gene identification
+ JOURNAL Nature 440 (7083), 497-500 (2006)
+ PUBMED 16554811
+REFERENCE 2 (bases 1 to 1800)
+ CONSRTM International Human Genome Sequencing Consortium
+ TITLE Finishing the euchromatic sequence of the human genome
+ JOURNAL Nature 431 (7011), 931-945 (2004)
+ PUBMED 15496913
+REFERENCE 3 (bases 1 to 1800)
+ AUTHORS Lander,E.S., Linton,L.M., Birren,B., Nusbaum,C., Zody,M.C.,
+ Baldwin,J., Devon,K., Dewar,K., Doyle,M., FitzHugh,W., Funke,R.,
+ Gage,D., Harris,K., Heaford,A., Howland,J., Kann,L., Lehoczky,J.,
+ LeVine,R., McEwan,P., McKernan,K., Meldrim,J., Mesirov,J.P.,
+ Miranda,C., Morris,W., Naylor,J., Raymond,C., Rosetti,M.,
+ Santos,R., Sheridan,A., Sougnez,C., Stange-Thomann,N.,
+ Stojanovic,N., Subramanian,A., Wyman,D., Rogers,J., Sulston,J.,
+ Ainscough,R., Beck,S., Bentley,D., Burton,J., Clee,C., Carter,N.,
+ Coulson,A., Deadman,R., Deloukas,P., Dunham,A., Dunham,I.,
+ Durbin,R., French,L., Grafham,D., Gregory,S., Hubbard,T.,
+ Humphray,S., Hunt,A., Jones,M., Lloyd,C., McMurray,A., Matthews,L.,
+ Mercer,S., Milne,S., Mullikin,J.C., Mungall,A., Plumb,R., Ross,M.,
+ Shownkeen,R., Sims,S., Waterston,R.H., Wilson,R.K., Hillier,L.W.,
+ McPherson,J.D., Marra,M.A., Mardis,E.R., Fulton,L.A.,
+ Chinwalla,A.T., Pepin,K.H., Gish,W.R., Chissoe,S.L., Wendl,M.C.,
+ Delehaunty,K.D., Miner,T.L., Delehaunty,A., Kramer,J.B., Cook,L.L.,
+ Fulton,R.S., Johnson,D.L., Minx,P.J., Clifton,S.W., Hawkins,T.,
+ Branscomb,E., Predki,P., Richardson,P., Wenning,S., Slezak,T.,
+ Doggett,N., Cheng,J.F., Olsen,A., Lucas,S., Elkin,C.,
+ Uberbacher,E., Frazier,M., Gibbs,R.A., Muzny,D.M., Scherer,S.E.,
+ Bouck,J.B., Sodergren,E.J., Worley,K.C., Rives,C.M., Gorrell,J.H.,
+ Metzker,M.L., Naylor,S.L., Kucherlapati,R.S., Nelson,D.L.,
+ Weinstock,G.M., Sakaki,Y., Fujiyama,A., Hattori,M., Yada,T.,
+ Toyoda,A., Itoh,T., Kawagoe,C., Watanabe,H., Totoki,Y., Taylor,T.,
+ Weissenbach,J., Heilig,R., Saurin,W., Artiguenave,F., Brottier,P.,
+ Bruls,T., Pelletier,E., Robert,C., Wincker,P., Smith,D.R.,
+ Doucette-Stamm,L., Rubenfield,M., Weinstock,K., Lee,H.M.,
+ Dubois,J., Rosenthal,A., Platzer,M., Nyakatura,G., Taudien,S.,
+ Rump,A., Yang,H., Yu,J., Wang,J., Huang,G., Gu,J., Hood,L.,
+ Rowen,L., Madan,A., Qin,S., Davis,R.W., Federspiel,N.A.,
+ Abola,A.P., Proctor,M.J., Myers,R.M., Schmutz,J., Dickson,M.,
+ Grimwood,J., Cox,D.R., Olson,M.V., Kaul,R., Raymond,C., Shimizu,N.,
+ Kawasaki,K., Minoshima,S., Evans,G.A., Athanasiou,M., Schultz,R.,
+ Roe,B.A., Chen,F., Pan,H., Ramser,J., Lehrach,H., Reinhardt,R.,
+ McCombie,W.R., de la Bastide,M., Dedhia,N., Blocker,H.,
+ Hornischer,K., Nordsiek,G., Agarwala,R., Aravind,L., Bailey,J.A.,
+ Bateman,A., Batzoglou,S., Birney,E., Bork,P., Brown,D.G.,
+ Burge,C.B., Cerutti,L., Chen,H.C., Church,D., Clamp,M.,
+ Copley,R.R., Doerks,T., Eddy,S.R., Eichler,E.E., Furey,T.S.,
+ Galagan,J., Gilbert,J.G., Harmon,C., Hayashizaki,Y., Haussler,D.,
+ Hermjakob,H., Hokamp,K., Jang,W., Johnson,L.S., Jones,T.A.,
+ Kasif,S., Kaspryzk,A., Kennedy,S., Kent,W.J., Kitts,P.,
+ Koonin,E.V., Korf,I., Kulp,D., Lancet,D., Lowe,T.M., McLysaght,A.,
+ Mikkelsen,T., Moran,J.V., Mulder,N., Pollara,V.J., Ponting,C.P.,
+ Schuler,G., Schultz,J., Slater,G., Smit,A.F., Stupka,E.,
+ Szustakowski,J., Thierry-Mieg,D., Thierry-Mieg,J., Wagner,L.,
+ Wallis,J., Wheeler,R., Williams,A., Wolf,Y.I., Wolfe,K.H.,
+ Yang,S.P., Yeh,R.F., Collins,F., Guyer,M.S., Peterson,J.,
+ Felsenfeld,A., Wetterstrand,K.A., Patrinos,A., Morgan,M.J., de
+ Jong,P., Catanese,J.J., Osoegawa,K., Shizuya,H., Choi,S. and
+ Chen,Y.J.
+ CONSRTM International Human Genome Sequencing Consortium
+ TITLE Initial sequencing and analysis of the human genome
+ JOURNAL Nature 409 (6822), 860-921 (2001)
+ PUBMED 11237011
+ REMARK Erratum:[Nature 2001 Aug 2;412(6846):565]
+COMMENT REFSEQ INFORMATION: The reference sequence is identical to
+ CM000673.2.
+ On Feb 3, 2014 this sequence version replaced gi:224589802.
+ Assembly Name: GRCh38 Primary Assembly
+ The DNA sequence is composed of genomic sequence, primarily
+ finished clones that were sequenced as part of the Human Genome
+ Project. PCR products and WGS shotgun sequence have been added
+ where necessary to fill gaps or correct errors. All such additions
+ are manually curated by GRC staff. For more information see:
+ http://genomereference.org.
+
+ ##Genome-Annotation-Data-START##
+ Annotation Provider :: NCBI
+ Annotation Status :: Full annotation
+ Annotation Version :: Homo sapiens Annotation Release 106
+ Annotation Pipeline :: NCBI eukaryotic genome annotation
+ pipeline
+ Annotation Software Version :: 5.2
+ Annotation Method :: Best-placed RefSeq; Gnomon
+ Features Annotated :: Gene; mRNA; CDS; ncRNA
+ ##Genome-Annotation-Data-END##
+FEATURES Location/Qualifiers
+ source 1..1800
+ /organism="Homo sapiens"
+ /mol_type="genomic DNA"
+ /db_xref="taxon:9606"
+ /chromosome="11"
+ gene 1..1800
+ /gene="HBD"
+ /note="hemoglobin, delta; Derived by automated computational analysis using gene prediction method: Curated Genomic."
+ /db_xref="GeneID:3045"
+ /db_xref="HGNC:4829"
+ /db_xref="MIM:142000"
+ mRNA join(1..287,416..638,1537..1800)
+ /gene="HBD"
+ /product="hemoglobin, delta"
+ /note="Derived by automated computational analysis using gene prediction method: Curated Genomic."
+ /transcript_id="NM_000519.3"
+ /db_xref="GI:62865863"
+ /db_xref="GeneID:3045"
+ /db_xref="HGNC:4829"
+ /db_xref="MIM:142000"
+ CDS join(196..287,416..638,1537..1665)
+ /gene="HBD"
+ /note="delta globin; delta-globin chain; hemoglobin delta chain; Derived by automated computational analysis using gene prediction method: Curated Genomic."
+ /codon_start=1
+ /product="hemoglobin subunit delta"
+ /protein_id="NP_000510.1"
+ /db_xref="GI:4504351"
+ /db_xref="CCDS:CCDS31376.1"
+ /db_xref="GeneID:3045"
+ /db_xref="HGNC:4829"
+ /db_xref="MIM:142000"
+ /translation="MVHLTPEEKTAVNALWGKVNVDAVGGEALGRLLVVYPWTQRFFE
+ SFGDLSSPDAVMGNPKVKAHGKKVLGAFSDGLAHLDNLKGTFSQLSELHCDKLHVDPE
+ NFRLLGNVLVCVLARNFGKEFTPQMQAAYQKVVAGVANALAHKYH"
+ORIGIN
+ 1 agggcaagtt aagggaatag tggaatgaag gttcattttt cattctcaca aactaatgaa
+ 61 accctgctta tcttaaacca acctgctcac tggagcaggg aggacaggac cagcataaaa
+ 121 ggcagggcag agtcgactgt tgcttacact ttcttctgac ataacagtgt tcactagcaa
+ 181 cctcaaacag acaccatggt gcatctgact cctgaggaga agactgctgt caatgccctg
+ 241 tggggcaaag tgaacgtgga tgcagttggt ggtgaggccc tgggcaggtt ggtatcaagg
+ 301 ttataagaga ggctcaagga ggcaaatgga aactgggcat gtgtagacag agaagactct
+ 361 tgggtttctg ataggcactg actctctgtc ccttgggctg ttttcctacc ctcagattac
+ 421 tggtggtcta cccttggacc cagaggttct ttgagtcctt tggggatctg tcctctcctg
+ 481 atgctgttat gggcaaccct aaggtgaagg ctcatggcaa gaaggtgcta ggtgccttta
+ 541 gtgatggcct ggctcacctg gacaacctca agggcacttt ttctcagctg agtgagctgc
+ 601 actgtgacaa gctgcacgtg gatcctgaga acttcagggt gagtccagga gatgcttcac
+ 661 ttttctcttt ttactttcta atcttacatt ttggttcttt tacctacctg ctcttctccc
+ 721 acatttttgt cattttacta tattttatca tttaatgctt ctaaaatttt gttaattttt
+ 781 tatttaaata ttctgcattt tttccttcct cacaatcttg ctattttaaa ttatttaata
+ 841 tcctgtcttt ctctcccaac cccctccctt catttttcct tctctaacaa caactcaaat
+ 901 tatgcatacc agctctcacc tgctaattct gcacttagaa taatcctttt gtctctccac
+ 961 atgggtatgg gagaggctcc aactcaaaga tgagaggcat agaatactgt tttagaggct
+ 1021 ataaatcatt ttacaataag gaataattgg aattttataa attctgtagt aaatggaatg
+ 1081 gaaaggaaag tgaatatttg attatgaaag actaggcagt tacactggag gtggggcaga
+ 1141 agtcgttgct aggagacagc ccatcatcac actgattaat caattaattt gtatctatta
+ 1201 atctgtttat agtaattaat ttgtatatgc tatatacaca tacaaaatta aaactaattt
+ 1261 ggaattaatt tgtatatagt attatacagc atatatagca tatatgtaca tatatagact
+ 1321 acatgctagt taagtacata gaggatgtgt gtgtatagat atatgttata tgtatgcatt
+ 1381 catatatgta cttatttatg ctgatgggaa taacctgggg atcagttttg tctaagattt
+ 1441 gggcagaaaa aaatgggtgt tggctcagtt tctcagaagc cagtctttat ttctctgtta
+ 1501 accatatgca tgtatctgcc tacctcttct ccgcagctct tgggcaatgt gctggtgtgt
+ 1561 gtgctggccc gcaactttgg caaggaattc accccacaaa tgcaggctgc ctatcagaag
+ 1621 gtggtggctg gtgtggctaa tgccctggct cacaagtacc attgagatcc tggactgttt
+ 1681 cctgataacc ataagaagac cctatttccc tagattctat tttctgaact tgggaacaca
+ 1741 atgcctactt caagggtatg gcttctgcct aataaagaat gttcagctca acttcctgat
+//
\ No newline at end of file
--- /dev/null
+LOCUS V00505 1976 bp DNA linear PRI 14-NOV-2006
+DEFINITION Human gene for delta-globin.
+ACCESSION V00505
+VERSION V00505.1 GI:30510
+KEYWORDS delta globin; germ line; globin.
+SOURCE Homo sapiens (human)
+ ORGANISM Homo sapiens
+ Eukaryota; Metazoa; Chordata; Craniata; Vertebrata; Euteleostomi;
+ Mammalia; Eutheria; Euarchontoglires; Primates; Haplorrhini;
+ Catarrhini; Hominidae; Homo.
+REFERENCE 1 (bases 1 to 1976)
+ AUTHORS Spritz,R.A., DeRiel,J.K., Forget,B.G. and Weissman,S.M.
+ TITLE Complete nucleotide sequence of the human delta-globin gene
+ JOURNAL Cell 21 (3), 639-646 (1980)
+ PUBMED 7438204
+COMMENT KST HSA.DELGLOBIN.
+FEATURES Location/Qualifiers
+ source 1..1976
+ /organism="Homo sapiens"
+ /mol_type="genomic DNA"
+ /db_xref="taxon:9606"
+ prim_transcript 123..1763
+ exon 123..265
+ /number=1
+ CDS join(173..265,394..615,1505..1633)
+ /codon_start=1
+ /product="delta globin"
+ /protein_id="CAA23763.1"
+ /db_xref="GI:30511"
+ /db_xref="GDB:119298"
+ /db_xref="GOA:P02042"
+ /db_xref="HGNC:4829"
+ /db_xref="InterPro:IPR000971"
+ /db_xref="InterPro:IPR002337"
+ /db_xref="InterPro:IPR009050"
+ /db_xref="InterPro:IPR012292"
+ /db_xref="PDB:1SHR"
+ /db_xref="PDB:1SI4"
+ /db_xref="UniProtKB/Swiss-Prot:P02042"
+ /translation="MVHLTPEEKTAVNALWGKVNVDAVGGEALGRLLVVYPWTQRFFESFGDLSSPDAVMGNPKVKAHGKKVLGAFSDGLAHLDNLKGTFSQLSELHCDKLHVDPENFRLLGNVLVCVLARNFGKEFTPQMQAAYQKVVAGVANALAHKYH"
+ intron 266..393
+ /number=1
+ exon 394..615
+ /number=2
+ intron 616..1504
+ /number=3
+ exon 1505..1763
+ /number=3
+ORIGIN
+ 1 aatgaaggtt catttttcat tctcacaaac taatgaaacc ctgcttatct taaaccaacc
+ 61 tgctcactgg agcagggagg acaggaccag cataaaaggc agggcagagt cgactgttgc
+ 121 ttacactttc ttctgacata acagtgttca ctagcaacct caaacagaca ccatggtgca
+ 181 tctgactcct gaggagaaga ctgctgtcaa tgccctgtgg ggcaaagtga acgtggatgc
+ 241 agttggtggt gaggccctgg gcaggttggt atcaaggtta taagagaggc tcaaggaggc
+ 301 aaatggaaac tgggcatgtg tagacagaga agactcttgg gtttctgata ggcactgact
+ 361 ctctgtccct tgggctgttt tcctaccctc agattactgg tggtctaccc ttggacccag
+ 421 aggttctttg agtcctttgg ggatctgtcc tctcctgatg ctgttatggg caaccctaag
+ 481 gtgaaggctc atggcaagaa ggtgctaggt gcctttagtg atggcctggc tcacctggac
+ 541 aacctcaagg gcactttttc tcagctgagt gagctgcact gtgacaagct gcacgtggat
+ 601 cctgagaact tcagggtgag tccaggagat gcttcacttt tctcttttta ctttctaatc
+ 661 ttacattttg gttcttttac ctacctgctc ttctcccaca tttttgtcat tttactatat
+ 721 tttatcattt aatgcttcta aaattttgtt atttttttat ttaaaaattc tgcatttttt
+ 781 ccttcctcac aatcttgcta ctctaaatta tttaatatcc tgtctttctc tcccaacccc
+ 841 ctcccttcat ttttccttct ctaacaacaa ctcaaattat gcataccagc tctcacctgc
+ 901 taatttcgca cttagaataa tccttttgtc tctccacatg ggtatgggag aggctccaac
+ 961 tcaaagatga gaggcataga atactgtttt agaggctata aatcatttta caataaggaa
+ 1021 taattggaat tttataaatt ctgtagtaaa tggaatggaa aggaaagtga atatttgatt
+ 1081 atgaaagact aggcagttac actggaggtg gggcagaagt cgttgctagg agacagccca
+ 1141 tcatcacact gatttatcaa ttcaatttgt atctattaat ctgtttatag taattaattt
+ 1201 gtatatgcta tatacacata caaaattaaa actaatttgg aattaatttg tatatagtat
+ 1261 tatacagcat atatgtacat atatagacta catgctagtt aagtacatag aggatgtgtg
+ 1321 tgtatagata tatgttatat gtatgcattc atatatgtac ttatttatgc tgatgggaat
+ 1381 aacctgggga tcagttttgt ctaagatttg ggcagaaaaa aatgggtgtt ggctcagttc
+ 1441 tcagaagcca gtctttattt ctctgttaac catatgcatg tatctgccta cctcttctcc
+ 1501 gcagctcttg ggcaatgtgc tggtgtgtgt gctggcccgc aactttggca aggaattcac
+ 1561 cccacaaatg caggctgcct atcagaaggt ggtggctggt gtggctaatg ccttggctca
+ 1621 caagtaccat tgagatcctg gactgtttcc tgataaccat aagaagaccc tatttcccta
+ 1681 gattctattt tctgaacttg ggaacacaat gcctacttca agggtatggc ttctgcctaa
+ 1741 taaagaatgt tcagctcaac ttcctgatta atttcactta tttcattttt ttgtccaggt
+ 1801 gtgtaagaag gttcctgagg ctctacagat agggagcact tctttatttt acaaagagta
+ 1861 catgggaaaa gagaaaagca agggaaccgt acaaggcatt aatgggtgac acttctacct
+ 1921 ccaaagagca gaaattatca agaactcttg atacaaagat aatactggca ctgcag
+//
\ No newline at end of file